#!/usr/bin/env python3
"""combined.py

Concatenate every CSV that matches the pattern
data/processed/<name>/<name>.csv into a single file:
data/combined/data.csv

The script streams each source CSV in 1-million-row chunks so memory
stays low. Typos in the historic column names (protocl/classfication)
are fixed on the fly.

Usage
-----
python combined.py

You can optionally supply a different root directory:

python combined.py --root other/processed_dir --out other/combined/data.csv
"""
from __future__ import annotations

import argparse
from pathlib import Path
from typing import Iterator

import pandas as pd

CHUNK = 1_000_000  # rows per read_csv chunk


def fix_cols(df: pd.DataFrame) -> pd.DataFrame:
    """Rename legacy columns to canonical names."""
    return df.rename(
        columns={"protocl": "protocol", "classfication": "classification"}
    )


def find_source_csvs(proc_root: Path) -> Iterator[Path]:
    """Yield CSV paths that exactly match <proc_root>/<name>/<name>.csv."""
    for sub in sorted(proc_root.iterdir()):
        if not sub.is_dir():
            continue
        target = sub / f"{sub.name}.csv"
        if target.exists():
            yield target


def combine(proc_root: Path, out_path: Path) -> None:
    """Stream every source CSV into out_path, writing the header once."""
    out_path.parent.mkdir(parents=True, exist_ok=True)
    first_write = True
    for csv_path in find_source_csvs(proc_root):
        print(f"→ adding {csv_path.relative_to(proc_root.parent)}")
        for chunk in pd.read_csv(csv_path, chunksize=CHUNK):
            chunk = fix_cols(chunk)
            # Overwrite on the very first chunk, then append. The header is
            # emitted only once so the combined file stays a valid CSV.
            chunk.to_csv(
                out_path,
                mode="w" if first_write else "a",
                header=first_write,
                index=False,
            )
            first_write = False
    if first_write:
        print(f"! no source CSVs found under {proc_root}; nothing written")
    else:
        print(f"✓ combined CSV written to {out_path}")


def main() -> None:
    p = argparse.ArgumentParser(description="Combine processed CSVs into one.")
    p.add_argument("--root", default="data/processed", help="processed dir root")
    p.add_argument("--out", default="data/combined/data.csv", help="output CSV")
    args = p.parse_args()
    combine(Path(args.root).expanduser(), Path(args.out).expanduser())


if __name__ == "__main__":
    main()
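
# ---------------------------------------------------------------------------
# Example (a minimal sketch, not part of the CLI): combine() can also be
# driven directly from Python, e.g. in a quick smoke test. The tmp/ paths
# and the "flows" directory name below are hypothetical placeholders.
#
#   from pathlib import Path
#   import pandas as pd
#
#   root = Path("tmp/processed")
#   (root / "flows").mkdir(parents=True, exist_ok=True)
#   pd.DataFrame(
#       {"protocl": ["tcp"], "classfication": ["benign"]}
#   ).to_csv(root / "flows" / "flows.csv", index=False)
#
#   combine(root, Path("tmp/combined/data.csv"))
#   # -> tmp/combined/data.csv now carries the canonical headers
#   #    "protocol" and "classification".
# ---------------------------------------------------------------------------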