Add support for combined datasets and analysis

2026-07-16 19:43:06 +00:00 · 2025-06-11 20:34:09 -07:00
parent 541538fcfe
commit 24fc2ed6f7
11 changed files with 1082 additions and 5 deletions
@@ -0,0 +1,2 @@
 # force LF for any shell script
 *.sh text eol=lf
@@ -1,3 +1,5 @@
 data.*
 __pycache__
-*.json
+*.json
 data/*
@@ -2,17 +2,21 @@
 Run `pip install -r requirements.txt`
 Run `setup.sh`
 # Tree Generation
 ## Download Dataset
-Download the *September 22 2016* dataset from: https://iotanalytics.unsw.edu.au/iottraces.html#bib18tmc
+Download the *September 22 2016* dataset (or others) from: https://iotanalytics.unsw.edu.au/iottraces.html#bib18tmc
-Rename the file as data.pcap
+Place these into the `data/tar` folder.
 Run `extract_tars.sh` which will extract and place the `.pcap` files at the corresponding location inside `data/pcap`.
 ## Preprocessing Dataset
-Run `ExtractDataset.ipynb`, this will take a few minutes
+Run `extract_all_datasets.py` which will extract the data from each file in `data/pcap` and turn it into the corresponding `.csv` file inside `data/processed`. This will take a few minutes per file. Combine the data under `data/csv` using `combine_csv.py`. This will overwrite `data/combined/data.csv` which you can use for the decision tree.
 ## Training
@@ -0,0 +1,74 @@
 #!/usr/bin/env python3
 """combined.py
 Concatenate every CSV that matches the pattern
    data/processed/<name>/<name>.csv
 into a single file:
    data/combined/data.csv
 The script streams each source CSV in 1‑Mio‑row chunks so memory stays low.
 Typos in the historic column names (protocl/classfication) are fixed on‑the‑fly.
 Usage
 -----
 python combined.py
 You can optionally supply a different root directory:
 python combined.py --root other/processed_dir --out other/combined/data.csv
 """
 from __future__ import annotations
 import argparse
 from pathlib import Path
 import os
 import pandas as pd
 CHUNK = 1_000_000  # rows per read_csv chunk
 def fix_cols(df: pd.DataFrame) -> pd.DataFrame:
    """Rename legacy columns to canonical names."""
    return df.rename(
        columns={"protocl": "protocol", "classfication": "classification"}
    )
 def find_source_csvs(proc_root: Path):
    """Yield CSV paths that exactly match processed/<name>/<name>.csv."""
    for sub in sorted(proc_root.iterdir()):
        if not sub.is_dir():
            continue
        target = sub / f"{sub.name}.csv"
        if target.exists():
            yield target
 def combine(proc_root: Path, out_path: Path):
    out_path.parent.mkdir(parents=True, exist_ok=True)
    first_write = True
    for csv_path in find_source_csvs(proc_root):
        print(f"→ adding {csv_path.relative_to(proc_root.parent)}")
        for chunk in pd.read_csv(csv_path, chunksize=CHUNK):
            chunk = fix_cols(chunk)
            chunk.to_csv(
                out_path,
                mode="w" if first_write else "a",
                header=first_write,
                index=False,
            )
            first_write = False
    print(f"✓ combined CSV written to {out_path}")
 def main():
    p = argparse.ArgumentParser(description="Combine processed CSVs into one.")
    p.add_argument("--root", default="data/processed", help="processed dir root")
    p.add_argument("--out", default="data/combined/data.csv", help="output CSV")
    args = p.parse_args()
    combine(Path(args.root).expanduser(), Path(args.out).expanduser())
 if __name__ == "__main__":
    main()
@@ -0,0 +1,80 @@
 #!/usr/bin/env python3
 from pathlib import Path
 import numpy as np
 import pandas as pd
 from labels import mac_to_label
 from tqdm import tqdm
 import os
 ROOT       = Path(__file__).resolve().parent
 PCAP_DIR   = ROOT / "data" / "pcap"
 CSV_DIR    = ROOT / "data" / "processed"
 CSV_DIR.mkdir(parents=True, exist_ok=True)
 BATCH = 100_000   # packets per chunk
 from scapy.all import rdpcap
 def process_pcap(pcap_path: str, csv_path: str) -> None:
    all_packets = rdpcap(pcap_path)
    print("rdpcap done", flush=True)
    results = []
    for packet in tqdm(all_packets):
        size = len(packet)
        try:
            proto = packet.proto
        except AttributeError:
            proto = 0
        try:
            sport = packet.sport
            dport = packet.dport
        except AttributeError:
            sport = 0
            dport = 0
        proto = int(proto)
        sport = int(sport)
        dport = int(dport)
        if "Ether" in packet:
            eth_dst = packet["Ether"].dst
            if eth_dst in mac_to_label:
                classification = mac_to_label[eth_dst]
            else:
                classification = "other"
        else:
            classification = "other"
        metric = [proto,sport,dport,classification]
        results.append(metric)
    results = (np.array(results)).T
    # store the features in the dataframe
    dataframe = pd.DataFrame({'protocl':results[0],'src':results[1],'dst':results[2],'classfication':results[3]})
    columns = ['protocl','src','dst','classfication']
    # save the dataframe to the csv file, if not exsit, create one.
    if os.path.exists(csv_path):
        dataframe.to_csv(csv_path,index=False,sep=',',mode='a',columns = columns, header=False)
    else:
        dataframe.to_csv(csv_path,index=False,sep=',',columns = columns)
    print("Done")
 def main() -> None:
    for pcap in sorted(PCAP_DIR.rglob("*.pcap")):
        rel_csv = pcap.relative_to(PCAP_DIR).with_suffix(".csv")
        csv_path = CSV_DIR / rel_csv
        if csv_path.exists():
            print(f"Skip {rel_csv} (CSV exists)")
            continue
        print(f"Processing {rel_csv}")
        csv_path.parent.mkdir(parents=True, exist_ok=True)
        process_pcap(str(pcap), str(csv_path))
 if __name__ == "__main__":
    main()
@@ -0,0 +1,50 @@
 #!/usr/bin/env bash
 # Usage: extract_all.sh SOURCE_DIR TARGET_DIR
 # For every .tar, .tar.gz, .tgz, .tar.bz2, .tar.xz in SOURCE_DIR:
 #   1. Create TARGET_DIR/<name>/
 #   2. If TARGET_DIR/<name>/<name>.pcap already exists, skip the archive.
 #   3. Otherwise, extract the archive into its own folder.
 set -euo pipefail
 if [[ $# -ne 2 ]]; then
  echo "Usage: $0 SOURCE_DIR TARGET_DIR" >&2
  exit 1
 fi
 src_dir="$1"
 dst_dir="$2"
 mkdir -p "$dst_dir"
 # Strip common extensions to recover the base name
 strip_ext() {
  local n="$1"
  n=${n%.tar.gz}; n=${n%.tgz}; n=${n%.tar.bz2}; n=${n%.tar.xz}; n=${n%.tar}
  echo "$n"
 }
 shopt -s nullglob
 for archive in "$src_dir"/*.tar{,.gz,.bz2,.xz} "$src_dir"/*.tgz; do
  base=$(basename "$archive")
  name=$(strip_ext "$base")
  out_dir="$dst_dir/$name"
  key_file="$out_dir/$name.pcap"
  if [[ -f "$key_file" ]]; then
    echo "Skipping $archive  —  $key_file already present"
    continue
  fi
  echo "Extracting $archive into $out_dir"
  mkdir -p "$out_dir"
  case "$archive" in
    *.tar)          tar -xf "$archive" -C "$out_dir" ;;
    *.tar.gz|*.tgz) tar -xzf "$archive" -C "$out_dir" ;;
    *.tar.bz2)      tar -xjf "$archive" -C "$out_dir" ;;
    *.tar.xz)       tar -xJf "$archive" -C "$out_dir" ;;
    *)              echo "Unknown type: $archive" ;;
  esac
 done
 echo "All archives processed."
@@ -3,4 +3,5 @@ numpy
 pandas
 scikit-learn
 pydotplus
-matplotlib
+matplotlib
 scipy
@@ -0,0 +1,44 @@
 #!/usr/bin/env python3
 """
 csvdiff.py file1.csv file2.csv
 Streams both files; prints the first differing line or
 ‘No differences found’. Uses O(1) memory.
 """
 import sys
 from itertools import zip_longest
 from pathlib import Path
 def open_checked(p: str):
    print(p)
    path = Path(p)
    try:
        return path.open("r", newline=""), path
    except FileNotFoundError:
        sys.exit(f"Error: {path} not found")
 def human(n: int) -> str:
    return f"{n:,}"
 def main(a_path: str, b_path: str) -> None:
    fa, a = open_checked(a_path)
    fb, b = open_checked(b_path)
    with fa, fb:
        for idx, (ra, rb) in enumerate(zip_longest(fa, fb), 1):
            if ra != rb:
                print(f"Files differ at line {human(idx)}")
                if ra is None:
                    print(f"{a} ended early")
                elif rb is None:
                    print(f"{b} ended early")
                else:
                    print(f"{a}: {ra.rstrip()}")
                    print(f"{b}: {rb.rstrip()}")
                return
    print("No differences found")
 if __name__ == "__main__":
    if len(sys.argv) != 3:
        sys.exit("Usage: csvdiff.py file1.csv file2.csv")
    main(sys.argv[1], sys.argv[2])
@@ -0,0 +1,206 @@
 #!/usr/bin/env python3
 """diversity_metrics.py (fast version)
 Estimate how much diversity each CSV adds without building a giant in‑memory
 DataFrame.  Designed for IoT packet logs with millions of rows.
 Quick summary printed as a GitHub‑style table (requires *tabulate*; falls back
 to pandas plain text).
 Usage
 -----
 python diversity_metrics.py path/to/processed_dir [-r] [--sample 50000]
 Metrics
 -------
 ΔEntropy  : change in Shannon entropy of *classification* counts
 ΔGini     : change in Gini impurity of the same counts
 χ² p      : Pearson χ² p‑value old vs new classification counts
 Jaccard   : similarity of unique (src,dst) pairs (0 → new pairs, 1 → no new)
 KS src p  : Kolmogorov–Smirnov p‑value, source‑port dist (uses sampling)
 KS dst p  : Kolmogorov–Smirnov p‑value, dest‑port  dist (uses sampling)
 Speed tricks
 ------------
 * No growing DataFrame; we keep Counters / sets / lists.
 * Ports for KS are *sampled* (default 50 k) to bound cost.
 * (src,dst) pairs are hashed to a 32‑bit int to reduce set overhead.
 * pandas reads via **pyarrow** engine when available.
 """
 import argparse
 from pathlib import Path
 from collections import Counter
 from typing import List, Set
 import numpy as np
 import pandas as pd
 from scipy.stats import chi2_contingency, ks_2samp, entropy
 try:
    from tabulate import tabulate
    _USE_TABULATE = True
 except ImportError:
    _USE_TABULATE = False
 # -----------------------------------------------------------------------------
 # Helper metrics
 # -----------------------------------------------------------------------------
 def shannon(counts: Counter) -> float:
    total = sum(counts.values())
    if total == 0:
        return 0.0
    p = np.fromiter(counts.values(), dtype=float)
    p /= total
    return entropy(p, base=2)
 def gini(counts: Counter) -> float:
    total = sum(counts.values())
    if total == 0:
        return 0.0
    return 1.0 - sum((n / total) ** 2 for n in counts.values())
 def jaccard(a: Set[int], b: Set[int]) -> float:
    if not a and not b:
        return 1.0
    return len(a & b) / len(a | b)
 # -----------------------------------------------------------------------------
 # Core analysis
 # -----------------------------------------------------------------------------
 def analyse(csv_files: List[Path], sample_size: int):
    """Return list of dicts with diversity metrics for each added file."""
    # cumulative state (no big DataFrame!)
    class_counter: Counter = Counter()
    pair_hashes: Set[int] = set()
    src_list: List[int] = []
    dst_list: List[int] = []
    rows = []
    for csv_path in csv_files:
        df = pd.read_csv(
            csv_path,
            engine="pyarrow" if pd.__version__ >= "2" else "c",  # fast parse
            usecols=["protocl", "src", "dst", "classfication"],
            dtype={
                "protocl": "uint16",
                "protocol": "uint16",
                "src": "uint16",
                "dst": "uint16",
            },
        )
        # normalise column names
        df.rename(columns={"protocl": "protocol", "classfication": "classification"}, inplace=True)
        # snapshot previous state
        prev_class = class_counter.copy()
        prev_pairs = pair_hashes.copy()
        prev_src = np.asarray(src_list, dtype=np.uint16)
        prev_dst = np.asarray(dst_list, dtype=np.uint16)
        # --- update cumulative structures ------------------------------------
        class_counter.update(df["classification"].value_counts().to_dict())
        # hash (src,dst) into 32‑bit int to save memory
        pair_ids = (df["src"].to_numpy(dtype=np.uint32) << np.uint32(16)) | \
            df["dst"].to_numpy(dtype=np.uint32)
        # extend port lists (keep small ints)
        src_list.extend(df["src"].tolist())
        dst_list.extend(df["dst"].tolist())
        # --- metrics ----------------------------------------------------------
        # χ² classification
        chi_p = np.nan
        if prev_class:
            all_classes = list(set(prev_class) | set(df["classification"].unique()))
            old = [prev_class.get(c, 0) for c in all_classes]
            new = [df["classification"].value_counts().get(c, 0) for c in all_classes]
            _, chi_p, _, _ = chi2_contingency([old, new])
        # entropy & gini deltas
        delta_entropy = shannon(class_counter) - shannon(prev_class)
        delta_gini = gini(class_counter) - gini(prev_class)
        # Jaccard on pair hashes
        jc = jaccard(prev_pairs, pair_hashes)
        # KS tests on sampled ports
        ks_src_p = ks_dst_p = np.nan
        if prev_src.size:
            new_src = df["src"].to_numpy(dtype=np.uint16)
            new_dst = df["dst"].to_numpy(dtype=np.uint16)
            if prev_src.size > sample_size:
                prev_src_sample = np.random.choice(prev_src, sample_size, replace=False)
            else:
                prev_src_sample = prev_src
            if new_src.size > sample_size:
                new_src_sample = np.random.choice(new_src, sample_size, replace=False)
            else:
                new_src_sample = new_src
            if prev_dst.size > sample_size:
                prev_dst_sample = np.random.choice(prev_dst, sample_size, replace=False)
            else:
                prev_dst_sample = prev_dst
            if new_dst.size > sample_size:
                new_dst_sample = np.random.choice(new_dst, sample_size, replace=False)
            else:
                new_dst_sample = new_dst
            ks_src_p = ks_2samp(prev_src_sample, new_src_sample).pvalue
            ks_dst_p = ks_2samp(prev_dst_sample, new_dst_sample).pvalue
        rows.append(
            {
                "File": csv_path.name,
                "Rows": len(df),
                "ΔEntropy": round(delta_entropy, 4),
                "ΔGini": round(delta_gini, 4),
                "χ² p": f"{chi_p:.3g}" if not np.isnan(chi_p) else "NA",
                "Jaccard": round(jc, 3),
                "KS src p": f"{ks_src_p:.3g}" if not np.isnan(ks_src_p) else "NA",
                "KS dst p": f"{ks_dst_p:.3g}" if not np.isnan(ks_dst_p) else "NA",
            }
        )
    return rows
 # -----------------------------------------------------------------------------
 # CLI
 # -----------------------------------------------------------------------------
 def main():
    ap = argparse.ArgumentParser(description="Evaluate diversity contribution of each CSV (fast version).")
    ap.add_argument("csv_dir", help="Directory containing CSV files")
    ap.add_argument("-r", "--recursive", action="store_true", help="Recursively search csv_dir")
    ap.add_argument("--sample", type=int, default=50_000, help="Sample size for KS tests (default 50k)")
    args = ap.parse_args()
    root = Path(args.csv_dir)
    pattern = "**/*.csv" if args.recursive else "*.csv"
    csv_files = sorted(root.glob(pattern))
    if not csv_files:
        print("No CSV files found.")
        return
    table_rows = analyse(csv_files, args.sample)
    if _USE_TABULATE:
        print(tabulate(table_rows, headers="keys", tablefmt="github", floatfmt=".4f"))
    else:
        print(pd.DataFrame(table_rows).to_string(index=False))
    print(
        "\nLegend:\n  • p-values (χ², KS) < 0.05 → new file significantly shifts distribution (GOOD)"
        "\n  • Positive ΔEntropy or ΔGini → richer mix; near 0 → little new info"
        "\n  • Jaccard close to 0 → many unseen (src,dst) pairs; close to 1 → redundant."
    )
 if __name__ == "__main__":
    main()
@@ -0,0 +1,14 @@
 #!/usr/bin/env bash
 # Creates the directory layout:
 #   data/
 #     tar/
 #     pcap/
 #     processed/
 set -euo pipefail
 root="$(cd -- "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
 mkdir -p "$root"/data/{tar,pcap,processed,combined}
 echo "Directory structure ready under $root/data/"
		`@@ -0,0 +1,2 @@`
							`# force LF for any shell script`
							`*.sh text eol=lf`