Add support for combined datasets and analysis

This commit is contained in:
Jai Parera
2025-06-11 20:34:09 -07:00
parent 541538fcfe
commit 24fc2ed6f7
11 changed files with 1082 additions and 5 deletions

74
combine.py Normal file
View File

@@ -0,0 +1,74 @@
#!/usr/bin/env python3
"""combined.py
Concatenate every CSV that matches the pattern
data/processed/<name>/<name>.csv
into a single file:
data/combined/data.csv
The script streams each source CSV in 1Miorow chunks so memory stays low.
Typos in the historic column names (protocl/classfication) are fixed onthefly.
Usage
-----
python combined.py
You can optionally supply a different root directory:
python combined.py --root other/processed_dir --out other/combined/data.csv
"""
from __future__ import annotations
import argparse
from pathlib import Path
import os
import pandas as pd
CHUNK = 1_000_000 # rows per read_csv chunk
def fix_cols(df: pd.DataFrame) -> pd.DataFrame:
"""Rename legacy columns to canonical names."""
return df.rename(
columns={"protocl": "protocol", "classfication": "classification"}
)
def find_source_csvs(proc_root: Path):
"""Yield CSV paths that exactly match processed/<name>/<name>.csv."""
for sub in sorted(proc_root.iterdir()):
if not sub.is_dir():
continue
target = sub / f"{sub.name}.csv"
if target.exists():
yield target
def combine(proc_root: Path, out_path: Path):
out_path.parent.mkdir(parents=True, exist_ok=True)
first_write = True
for csv_path in find_source_csvs(proc_root):
print(f"→ adding {csv_path.relative_to(proc_root.parent)}")
for chunk in pd.read_csv(csv_path, chunksize=CHUNK):
chunk = fix_cols(chunk)
chunk.to_csv(
out_path,
mode="w" if first_write else "a",
header=first_write,
index=False,
)
first_write = False
print(f"✓ combined CSV written to {out_path}")
def main():
p = argparse.ArgumentParser(description="Combine processed CSVs into one.")
p.add_argument("--root", default="data/processed", help="processed dir root")
p.add_argument("--out", default="data/combined/data.csv", help="output CSV")
args = p.parse_args()
combine(Path(args.root).expanduser(), Path(args.out).expanduser())
if __name__ == "__main__":
main()