mirror of
https://github.com/ltcptgeneral/IdealRMT-DecisionTrees.git
synced 2025-09-04 14:27:23 +00:00
75 lines
2.1 KiB
Python
75 lines
2.1 KiB
Python
#!/usr/bin/env python3
|
||
"""combined.py
|
||
|
||
Concatenate every CSV that matches the pattern
|
||
data/processed/<name>/<name>.csv
|
||
into a single file:
|
||
data/combined/data.csv
|
||
|
||
The script streams each source CSV in 1‑Mio‑row chunks so memory stays low.
|
||
Typos in the historic column names (protocl/classfication) are fixed on‑the‑fly.
|
||
|
||
Usage
|
||
-----
|
||
python combined.py
|
||
|
||
You can optionally supply a different root directory:
|
||
python combined.py --root other/processed_dir --out other/combined/data.csv
|
||
"""
|
||
from __future__ import annotations
|
||
|
||
import argparse
|
||
from pathlib import Path
|
||
import os
|
||
import pandas as pd
|
||
|
||
CHUNK = 1_000_000 # rows per read_csv chunk
|
||
|
||
|
||
def fix_cols(df: pd.DataFrame) -> pd.DataFrame:
|
||
"""Rename legacy columns to canonical names."""
|
||
return df.rename(
|
||
columns={"protocl": "protocol", "classfication": "classification"}
|
||
)
|
||
|
||
|
||
def find_source_csvs(proc_root: Path):
|
||
"""Yield CSV paths that exactly match processed/<name>/<name>.csv."""
|
||
for sub in sorted(proc_root.iterdir()):
|
||
if not sub.is_dir():
|
||
continue
|
||
target = sub / f"{sub.name}.csv"
|
||
if target.exists():
|
||
yield target
|
||
|
||
|
||
def combine(proc_root: Path, out_path: Path):
|
||
out_path.parent.mkdir(parents=True, exist_ok=True)
|
||
|
||
first_write = True
|
||
for csv_path in find_source_csvs(proc_root):
|
||
print(f"→ adding {csv_path.relative_to(proc_root.parent)}")
|
||
for chunk in pd.read_csv(csv_path, chunksize=CHUNK):
|
||
chunk = fix_cols(chunk)
|
||
chunk.to_csv(
|
||
out_path,
|
||
mode="w" if first_write else "a",
|
||
header=first_write,
|
||
index=False,
|
||
)
|
||
first_write = False
|
||
print(f"✓ combined CSV written to {out_path}")
|
||
|
||
|
||
def main():
|
||
p = argparse.ArgumentParser(description="Combine processed CSVs into one.")
|
||
p.add_argument("--root", default="data/processed", help="processed dir root")
|
||
p.add_argument("--out", default="data/combined/data.csv", help="output CSV")
|
||
args = p.parse_args()
|
||
|
||
combine(Path(args.root).expanduser(), Path(args.out).expanduser())
|
||
|
||
|
||
if __name__ == "__main__":
|
||
main()
|