Add support for combined datasets and analysis

This commit is contained in:
Jai Parera
2025-06-11 20:34:09 -07:00
parent 541538fcfe
commit 24fc2ed6f7
11 changed files with 1082 additions and 5 deletions

44
sanity_check/csvdiff.py Normal file
View File

@@ -0,0 +1,44 @@
#!/usr/bin/env python3
"""
csvdiff.py file1.csv file2.csv
Streams both files; prints the first differing line or
No differences found. Uses O(1) memory.
"""
import sys
from itertools import zip_longest
from pathlib import Path
def open_checked(p: str):
print(p)
path = Path(p)
try:
return path.open("r", newline=""), path
except FileNotFoundError:
sys.exit(f"Error: {path} not found")
def human(n: int) -> str:
return f"{n:,}"
def main(a_path: str, b_path: str) -> None:
fa, a = open_checked(a_path)
fb, b = open_checked(b_path)
with fa, fb:
for idx, (ra, rb) in enumerate(zip_longest(fa, fb), 1):
if ra != rb:
print(f"Files differ at line {human(idx)}")
if ra is None:
print(f"{a} ended early")
elif rb is None:
print(f"{b} ended early")
else:
print(f"{a}: {ra.rstrip()}")
print(f"{b}: {rb.rstrip()}")
return
print("No differences found")
if __name__ == "__main__":
if len(sys.argv) != 3:
sys.exit("Usage: csvdiff.py file1.csv file2.csv")
main(sys.argv[1], sys.argv[2])