Add support for combined datasets and analysis

Jai Parera
2025-06-11 20:34:09 -07:00
parent 541538fcfe
commit 24fc2ed6f7
11 changed files with 1082 additions and 5 deletions

2
.gitattributes vendored Normal file

@@ -0,0 +1,2 @@
# force LF for any shell script
*.sh text eol=lf

4
.gitignore vendored

@@ -1,3 +1,5 @@
data.*
__pycache__
*.json
*.json
data/*

README.md

@@ -2,17 +2,21 @@
Run `pip install -r requirements.txt`
Run `setup.sh`
# Tree Generation
## Download Dataset
Download the *September 22 2016* dataset from: https://iotanalytics.unsw.edu.au/iottraces.html#bib18tmc
Download the *September 22 2016* dataset (or others) from: https://iotanalytics.unsw.edu.au/iottraces.html#bib18tmc
Rename the file as data.pcap
Place these into the `data/tar` folder.
Run `extract_tars.sh` which will extract and place the `.pcap` files at the corresponding location inside `data/pcap`.
## Preprocessing Dataset
Run `ExtractDataset.ipynb`, this will take a few minutes
Run `extract_all_datasets.py`, which extracts the data from each file in `data/pcap` and writes the corresponding `.csv` file inside `data/processed`. This takes a few minutes per file. Combine the processed CSVs with `combine.py`; this overwrites `data/combined/data.csv`, which you can use for the decision tree.
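For reference, the whole pipeline above can be driven from a single script; a minimal sketch (the `run_pipeline.py` name and the `data/tar data/pcap` argument order for `extract_tars.sh` are assumptions, not part of this commit):

```python
# run_pipeline.py -- hypothetical convenience wrapper; the manual steps above are equivalent
import subprocess

steps = [
    ["bash", "setup.sh"],                                   # create data/{tar,pcap,processed,combined}
    ["bash", "extract_tars.sh", "data/tar", "data/pcap"],   # unpack archives into per-capture folders
    ["python", "extract_all_datasets.py"],                  # pcap -> per-capture CSVs under data/processed
    ["python", "combine.py", "--root", "data/processed",
     "--out", "data/combined/data.csv"],                    # concatenate into one training CSV
]

for cmd in steps:
    print("$", " ".join(cmd))
    subprocess.run(cmd, check=True)  # stop at the first failing step
```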
## Training

74
combine.py Normal file

@@ -0,0 +1,74 @@
#!/usr/bin/env python3
"""combined.py
Concatenate every CSV that matches the pattern
data/processed/<name>/<name>.csv
into a single file:
data/combined/data.csv
The script streams each source CSV in 1,000,000-row chunks so memory stays low.
Typos in the historic column names (protocl/classfication) are fixed on the fly.
Usage
-----
python combine.py
You can optionally supply a different root directory:
python combine.py --root other/processed_dir --out other/combined/data.csv
"""
from __future__ import annotations
import argparse
from pathlib import Path
import pandas as pd
CHUNK = 1_000_000 # rows per read_csv chunk
def fix_cols(df: pd.DataFrame) -> pd.DataFrame:
"""Rename legacy columns to canonical names."""
return df.rename(
columns={"protocl": "protocol", "classfication": "classification"}
)
def find_source_csvs(proc_root: Path):
"""Yield CSV paths that exactly match processed/<name>/<name>.csv."""
for sub in sorted(proc_root.iterdir()):
if not sub.is_dir():
continue
target = sub / f"{sub.name}.csv"
if target.exists():
yield target
def combine(proc_root: Path, out_path: Path):
out_path.parent.mkdir(parents=True, exist_ok=True)
first_write = True
for csv_path in find_source_csvs(proc_root):
print(f"→ adding {csv_path.relative_to(proc_root.parent)}")
for chunk in pd.read_csv(csv_path, chunksize=CHUNK):
chunk = fix_cols(chunk)
chunk.to_csv(
out_path,
mode="w" if first_write else "a",
header=first_write,
index=False,
)
first_write = False
print(f"✓ combined CSV written to {out_path}")
def main():
p = argparse.ArgumentParser(description="Combine processed CSVs into one.")
p.add_argument("--root", default="data/processed", help="processed dir root")
p.add_argument("--out", default="data/combined/data.csv", help="output CSV")
args = p.parse_args()
combine(Path(args.root).expanduser(), Path(args.out).expanduser())
if __name__ == "__main__":
main()
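A quick way to sanity-check the combined file is to stream it back with the same chunked-read pattern and tally rows per classification; a minimal sketch (the `check_combined.py` name is hypothetical):

```python
# check_combined.py -- hypothetical sanity check for data/combined/data.csv
from collections import Counter
import pandas as pd

counts: Counter = Counter()
total = 0
for chunk in pd.read_csv("data/combined/data.csv", chunksize=1_000_000):
    counts.update(chunk["classification"].value_counts().to_dict())
    total += len(chunk)

print(f"total rows: {total:,}")
for label, n in counts.most_common():
    print(f"{label:20s} {n:,}")
```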

80
extract_all_datasets.py Normal file

@@ -0,0 +1,80 @@
#!/usr/bin/env python3
from pathlib import Path
import numpy as np
import pandas as pd
from labels import mac_to_label
from tqdm import tqdm
import os
ROOT = Path(__file__).resolve().parent
PCAP_DIR = ROOT / "data" / "pcap"
CSV_DIR = ROOT / "data" / "processed"
CSV_DIR.mkdir(parents=True, exist_ok=True)
BATCH = 100_000  # packets per chunk (not used by the rdpcap-based loader below)
from scapy.all import rdpcap
def process_pcap(pcap_path: str, csv_path: str) -> None:
all_packets = rdpcap(pcap_path)
print("rdpcap done", flush=True)
results = []
for packet in tqdm(all_packets):
size = len(packet)
try:
proto = packet.proto
except AttributeError:
proto = 0
try:
sport = packet.sport
dport = packet.dport
except AttributeError:
sport = 0
dport = 0
proto = int(proto)
sport = int(sport)
dport = int(dport)
if "Ether" in packet:
eth_dst = packet["Ether"].dst
if eth_dst in mac_to_label:
classification = mac_to_label[eth_dst]
else:
classification = "other"
else:
classification = "other"
metric = [proto,sport,dport,classification]
results.append(metric)
results = (np.array(results)).T
# store the features in the dataframe (legacy column names; combine.py renames them later)
dataframe = pd.DataFrame({'protocl':results[0],'src':results[1],'dst':results[2],'classfication':results[3]})
columns = ['protocl','src','dst','classfication']
# append to the CSV if it exists; otherwise create it with a header
if os.path.exists(csv_path):
dataframe.to_csv(csv_path,index=False,sep=',',mode='a',columns = columns, header=False)
else:
dataframe.to_csv(csv_path,index=False,sep=',',columns = columns)
print("Done")
def main() -> None:
for pcap in sorted(PCAP_DIR.rglob("*.pcap")):
rel_csv = pcap.relative_to(PCAP_DIR).with_suffix(".csv")
csv_path = CSV_DIR / rel_csv
if csv_path.exists():
print(f"Skip {rel_csv} (CSV exists)")
continue
print(f"Processing {rel_csv}")
csv_path.parent.mkdir(parents=True, exist_ok=True)
process_pcap(str(pcap), str(csv_path))
if __name__ == "__main__":
main()
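`rdpcap` loads the whole capture into memory, which can be slow for multi-gigabyte traces. A possible streaming variant using scapy's `PcapReader`, flushing rows in batches, is sketched below; it mirrors the per-packet logic above but is an alternative sketch, not the script's current behavior:

```python
# Sketch: streaming variant of process_pcap using scapy's PcapReader,
# flushing every `batch` packets to bound memory.
import os
import pandas as pd
from scapy.all import PcapReader
from labels import mac_to_label  # same MAC-to-device mapping used above

def _flush(rows, csv_path) -> None:
    df = pd.DataFrame(rows, columns=["protocl", "src", "dst", "classfication"])
    df.to_csv(csv_path, mode="a", index=False,
              header=not os.path.exists(csv_path))

def process_pcap_streaming(pcap_path: str, csv_path: str, batch: int = 100_000) -> None:
    rows = []
    with PcapReader(pcap_path) as reader:
        for packet in reader:
            proto = int(getattr(packet, "proto", 0))
            sport = int(getattr(packet, "sport", 0))
            dport = int(getattr(packet, "dport", 0))
            eth_dst = packet["Ether"].dst if "Ether" in packet else None
            rows.append((proto, sport, dport, mac_to_label.get(eth_dst, "other")))
            if len(rows) >= batch:
                _flush(rows, csv_path)
                rows.clear()
    if rows:
        _flush(rows, csv_path)
```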

50
extract_tars.sh Normal file

@@ -0,0 +1,50 @@
#!/usr/bin/env bash
# Usage: extract_tars.sh SOURCE_DIR TARGET_DIR
# For every .tar, .tar.gz, .tgz, .tar.bz2, .tar.xz in SOURCE_DIR:
# 1. Create TARGET_DIR/<name>/
# 2. If TARGET_DIR/<name>/<name>.pcap already exists, skip the archive.
# 3. Otherwise, extract the archive into its own folder.
set -euo pipefail
if [[ $# -ne 2 ]]; then
echo "Usage: $0 SOURCE_DIR TARGET_DIR" >&2
exit 1
fi
src_dir="$1"
dst_dir="$2"
mkdir -p "$dst_dir"
# Strip common extensions to recover the base name
strip_ext() {
local n="$1"
n=${n%.tar.gz}; n=${n%.tgz}; n=${n%.tar.bz2}; n=${n%.tar.xz}; n=${n%.tar}
echo "$n"
}
shopt -s nullglob
for archive in "$src_dir"/*.tar{,.gz,.bz2,.xz} "$src_dir"/*.tgz; do
base=$(basename "$archive")
name=$(strip_ext "$base")
out_dir="$dst_dir/$name"
key_file="$out_dir/$name.pcap"
if [[ -f "$key_file" ]]; then
echo "Skipping $archive$key_file already present"
continue
fi
echo "Extracting $archive into $out_dir"
mkdir -p "$out_dir"
case "$archive" in
*.tar) tar -xf "$archive" -C "$out_dir" ;;
*.tar.gz|*.tgz) tar -xzf "$archive" -C "$out_dir" ;;
*.tar.bz2) tar -xjf "$archive" -C "$out_dir" ;;
*.tar.xz) tar -xJf "$archive" -C "$out_dir" ;;
*) echo "Unknown type: $archive" ;;
esac
done
echo "All archives processed."

requirements.txt

@@ -3,4 +3,5 @@ numpy
pandas
scikit-learn
pydotplus
matplotlib
matplotlib
scipy

44
sanity_check/csvdiff.py Normal file

@@ -0,0 +1,44 @@
#!/usr/bin/env python3
"""
csvdiff.py file1.csv file2.csv
Streams both files; prints the first differing line, or "No differences found."
Uses O(1) memory.
"""
import sys
from itertools import zip_longest
from pathlib import Path
def open_checked(p: str):
print(p)
path = Path(p)
try:
return path.open("r", newline=""), path
except FileNotFoundError:
sys.exit(f"Error: {path} not found")
def human(n: int) -> str:
return f"{n:,}"
def main(a_path: str, b_path: str) -> None:
fa, a = open_checked(a_path)
fb, b = open_checked(b_path)
with fa, fb:
for idx, (ra, rb) in enumerate(zip_longest(fa, fb), 1):
if ra != rb:
print(f"Files differ at line {human(idx)}")
if ra is None:
print(f"{a} ended early")
elif rb is None:
print(f"{b} ended early")
else:
print(f"{a}: {ra.rstrip()}")
print(f"{b}: {rb.rstrip()}")
return
print("No differences found")
if __name__ == "__main__":
if len(sys.argv) != 3:
sys.exit("Usage: csvdiff.py file1.csv file2.csv")
main(sys.argv[1], sys.argv[2])

File diff suppressed because one or more lines are too long

diversity_metrics.py Normal file

@@ -0,0 +1,206 @@
#!/usr/bin/env python3
"""diversity_metrics.py (fast version)
Estimate how much diversity each CSV adds without building a giant in-memory
DataFrame. Designed for IoT packet logs with millions of rows.
Quick summary printed as a GitHub-style table (requires *tabulate*; falls back
to pandas plain text).
Usage
-----
python diversity_metrics.py path/to/processed_dir [-r] [--sample 50000]
Metrics
-------
ΔEntropy : change in Shannon entropy of *classification* counts
ΔGini : change in Gini impurity of the same counts
χ² p : Pearson χ² p-value, old vs new classification counts
Jaccard : similarity of unique (src,dst) pairs (0 → new pairs, 1 → no new)
KS src p : Kolmogorov-Smirnov p-value, source-port dist (uses sampling)
KS dst p : Kolmogorov-Smirnov p-value, dest-port dist (uses sampling)
Speed tricks
------------
* No growing DataFrame; we keep Counters / sets / lists.
* Ports for KS are *sampled* (default 50 k) to bound cost.
* (src,dst) pairs are hashed to a 32-bit int to reduce set overhead.
* pandas reads via **pyarrow** engine when available.
"""
import argparse
import importlib.util
from pathlib import Path
from collections import Counter
from typing import List, Set
import numpy as np
import pandas as pd
from scipy.stats import chi2_contingency, ks_2samp, entropy
try:
from tabulate import tabulate
_USE_TABULATE = True
except ImportError:
_USE_TABULATE = False
# -----------------------------------------------------------------------------
# Helper metrics
# -----------------------------------------------------------------------------
def shannon(counts: Counter) -> float:
total = sum(counts.values())
if total == 0:
return 0.0
p = np.fromiter(counts.values(), dtype=float)
p /= total
return entropy(p, base=2)
def gini(counts: Counter) -> float:
total = sum(counts.values())
if total == 0:
return 0.0
return 1.0 - sum((n / total) ** 2 for n in counts.values())
def jaccard(a: Set[int], b: Set[int]) -> float:
if not a and not b:
return 1.0
return len(a & b) / len(a | b)
# -----------------------------------------------------------------------------
# Core analysis
# -----------------------------------------------------------------------------
def analyse(csv_files: List[Path], sample_size: int):
"""Return list of dicts with diversity metrics for each added file."""
# cumulative state (no big DataFrame!)
class_counter: Counter = Counter()
pair_hashes: Set[int] = set()
src_list: List[int] = []
dst_list: List[int] = []
rows = []
for csv_path in csv_files:
df = pd.read_csv(
csv_path,
engine="pyarrow" if pd.__version__ >= "2" else "c", # fast parse
usecols=["protocl", "src", "dst", "classfication"],
dtype={
"protocl": "uint16",
"protocol": "uint16",
"src": "uint16",
"dst": "uint16",
},
)
# normalise column names
df.rename(columns={"protocl": "protocol", "classfication": "classification"}, inplace=True)
# snapshot previous state
prev_class = class_counter.copy()
prev_pairs = pair_hashes.copy()
prev_src = np.asarray(src_list, dtype=np.uint16)
prev_dst = np.asarray(dst_list, dtype=np.uint16)
# --- update cumulative structures ------------------------------------
class_counter.update(df["classification"].value_counts().to_dict())
# hash (src,dst) into a 32-bit int to save memory
pair_ids = (df["src"].to_numpy(dtype=np.uint32) << np.uint32(16)) | \
    df["dst"].to_numpy(dtype=np.uint32)
pair_hashes.update(pair_ids.tolist())
# extend port lists (keep small ints)
src_list.extend(df["src"].tolist())
dst_list.extend(df["dst"].tolist())
# --- metrics ----------------------------------------------------------
# χ² classification
chi_p = np.nan
if prev_class:
all_classes = list(set(prev_class) | set(df["classification"].unique()))
old = [prev_class.get(c, 0) for c in all_classes]
new = [df["classification"].value_counts().get(c, 0) for c in all_classes]
_, chi_p, _, _ = chi2_contingency([old, new])
# entropy & gini deltas
delta_entropy = shannon(class_counter) - shannon(prev_class)
delta_gini = gini(class_counter) - gini(prev_class)
# Jaccard on pair hashes
jc = jaccard(prev_pairs, pair_hashes)
# KS tests on sampled ports
ks_src_p = ks_dst_p = np.nan
if prev_src.size:
new_src = df["src"].to_numpy(dtype=np.uint16)
new_dst = df["dst"].to_numpy(dtype=np.uint16)
if prev_src.size > sample_size:
prev_src_sample = np.random.choice(prev_src, sample_size, replace=False)
else:
prev_src_sample = prev_src
if new_src.size > sample_size:
new_src_sample = np.random.choice(new_src, sample_size, replace=False)
else:
new_src_sample = new_src
if prev_dst.size > sample_size:
prev_dst_sample = np.random.choice(prev_dst, sample_size, replace=False)
else:
prev_dst_sample = prev_dst
if new_dst.size > sample_size:
new_dst_sample = np.random.choice(new_dst, sample_size, replace=False)
else:
new_dst_sample = new_dst
ks_src_p = ks_2samp(prev_src_sample, new_src_sample).pvalue
ks_dst_p = ks_2samp(prev_dst_sample, new_dst_sample).pvalue
rows.append(
{
"File": csv_path.name,
"Rows": len(df),
"ΔEntropy": round(delta_entropy, 4),
"ΔGini": round(delta_gini, 4),
"χ² p": f"{chi_p:.3g}" if not np.isnan(chi_p) else "NA",
"Jaccard": round(jc, 3),
"KS src p": f"{ks_src_p:.3g}" if not np.isnan(ks_src_p) else "NA",
"KS dst p": f"{ks_dst_p:.3g}" if not np.isnan(ks_dst_p) else "NA",
}
)
return rows
# -----------------------------------------------------------------------------
# CLI
# -----------------------------------------------------------------------------
def main():
ap = argparse.ArgumentParser(description="Evaluate diversity contribution of each CSV (fast version).")
ap.add_argument("csv_dir", help="Directory containing CSV files")
ap.add_argument("-r", "--recursive", action="store_true", help="Recursively search csv_dir")
ap.add_argument("--sample", type=int, default=50_000, help="Sample size for KS tests (default 50k)")
args = ap.parse_args()
root = Path(args.csv_dir)
pattern = "**/*.csv" if args.recursive else "*.csv"
csv_files = sorted(root.glob(pattern))
if not csv_files:
print("No CSV files found.")
return
table_rows = analyse(csv_files, args.sample)
if _USE_TABULATE:
print(tabulate(table_rows, headers="keys", tablefmt="github", floatfmt=".4f"))
else:
print(pd.DataFrame(table_rows).to_string(index=False))
print(
"\nLegend:\n • p-values (χ², KS) < 0.05 → new file significantly shifts distribution (GOOD)"
"\n • Positive ΔEntropy or ΔGini → richer mix; near 0 → little new info"
"\n • Jaccard close to 0 → many unseen (src,dst) pairs; close to 1 → redundant."
)
if __name__ == "__main__":
main()
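To make the legend concrete, here is a toy worked example with made-up counts (not real capture data) showing how ΔEntropy, ΔGini, and Jaccard respond when a new file adds an unseen class and new (src,dst) pairs:

```python
# Toy illustration of the diversity metrics on made-up counts.
from collections import Counter
import numpy as np
from scipy.stats import entropy

def shannon(counts: Counter) -> float:
    p = np.fromiter(counts.values(), dtype=float)
    return entropy(p / p.sum(), base=2)

def gini(counts: Counter) -> float:
    total = sum(counts.values())
    return 1.0 - sum((n / total) ** 2 for n in counts.values())

old = Counter({"camera": 900, "plug": 100})
new = old + Counter({"camera": 300, "speaker": 200})       # a new class appears

print(f"ΔEntropy = {shannon(new) - shannon(old):+.4f}")    # positive: richer class mix
print(f"ΔGini    = {gini(new) - gini(old):+.4f}")          # positive: less concentrated

old_pairs = {(53, 443), (53, 80)}
new_pairs = old_pairs | {(123, 443), (5353, 80)}           # two unseen (src,dst) pairs
jaccard = len(old_pairs & new_pairs) / len(old_pairs | new_pairs)
print(f"Jaccard  = {jaccard:.3f}")                         # 0.5: half of all pairs are new
```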

14
setup.sh Normal file

@@ -0,0 +1,14 @@
#!/usr/bin/env bash
# Creates the directory layout:
# data/
#   tar/
#   pcap/
#   processed/
#   combined/
set -euo pipefail
root="$(cd -- "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
mkdir -p "$root"/data/{tar,pcap,processed,combined}
echo "Directory structure ready under $root/data/"