Exploratory Data Analysis

Data organisation

Folder structure:

data/
    iris-dataset/
      dataset.db
    mnist/
      train-test-seed-1.db
      train-test-seed-7.db
      images/
    coco/
      subset-1.db
      coco-full.db
      images/
      annotations/

Each db can have tables like:

idx split class ... path label
0 train cat ... images/cat-1.png 1
1 val dog ... images/dog-1.png 2
...

Advantages of using DataFrame:

  • dataset is 'cached', avoiding surprises with random sampling
  • does not need to be indexed each time, so loads faster
  • stores any extra metadata needed per row (e.g. mean, std per image)
  • easy to define __getitem__(...) to create torch...Dataset
  • allows for interactive exploration

Helpers to write/read sqlite3 db files

import sqlite3
from pathlib import Path
from typing import Literal

import pandas as pd

def save_db(
    df: pd.DataFrame,
    db_path: Path | str,
    table: str,
    if_exists: Literal["fail", "replace", "append"] = "replace",
    index: bool = False,
    verbose: bool = False,
) -> None:
    """Save DataFrame to sqlite3 db as specified table.
    NOTE : default behaviour is to replace the table if it exists."""
    conn = sqlite3.connect(str(db_path))

    if len(df.columns) == 0:
        print(f"Cannot save empty df ->\n{df}")
        return

    df.to_sql(table, conn, if_exists=if_exists, index=index)
    if verbose:
        print(df)
        print(df.columns)
        print(f"Saved to => {table} => {db_path}")


def load_db(
    db_path: Path | str,
    table: str,
    verbose: bool = False,
) -> pd.DataFrame:
    """Read sqlite3 db, return all rows from specified table."""
    conn = sqlite3.connect(str(db_path))
    df = pd.read_sql_query(f"SELECT * from '{table}'", conn)
    if verbose:
        print(df)
        print(df.columns)
        print(f"Loaded from => {table} => {db_path}")
    return df

Interactive Exploration