Mirror of https://github.com/ltcptgeneral/IdealRMT-DecisionTrees.git
Synced 2025-09-09 16:57:23 +00:00

Comparing `24fc2ed6f7...decision-t`: 1 commit (fda251f051)
.gitattributes (vendored) — 2 lines changed

@@ -1,2 +0,0 @@
-# force LF for any shell script
-*.sh text eol=lf
.gitignore (vendored) — 4 lines changed

@@ -1,5 +1,5 @@
 data.*
 __pycache__
 *.json
+.DS_Store
-data/*
+.ipynb_checkpoints/
File diff suppressed because one or more lines are too long

@@ -89,7 +89,7 @@
   ],
   "metadata": {
    "kernelspec": {
-    "display_name": "switch",
+    "display_name": "Python 3 (ipykernel)",
     "language": "python",
     "name": "python3"
    },
@@ -103,7 +103,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.12.7"
+   "version": "3.12.9"
   }
  },
 "nbformat": 4,
README.md — 10 lines changed

@@ -2,21 +2,17 @@
 
 Run `pip install -r requirements.txt`
 
-Run `setup.sh`
-
 # Tree Generation
 
 ## Download Dataset
 
-Download the *September 22 2016* dataset (or others) from: https://iotanalytics.unsw.edu.au/iottraces.html#bib18tmc
+Download the *September 22 2016* dataset from: https://iotanalytics.unsw.edu.au/iottraces.html#bib18tmc
 
-Place these into the `data/tar` folder.
+Rename the file as data.pcap
 
-Run `extract_tars.sh` which will extract and place the `.pcap` files at the corresponding location inside `data/pcap`.
-
 ## Preprocessing Dataset
 
-Run `extract_all_datasets.py` which will extract the data from each file in `data/pcap` and turn it into the corresponding `.csv` file inside `data/processed`. This will take a few minutes per file. Combine the data under `data/csv` using `combine_csv.py`. This will overwrite `data/combined/data.csv` which you can use for the decision tree.
+Run `ExtractDataset.ipynb`, this will take a few minutes
 
 ## Training
 
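The `## Training` section itself is unchanged by this commit and its body is not part of the diff. For orientation only, here is a rough, hypothetical sketch of fitting a decision tree on the extracted CSV; the path, split, and tree depth are assumptions, and the column names are taken from the removed extraction script shown further down, not from this commit.

```python
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

# Hypothetical path; column names follow the removed extraction script
# ('protocl', 'src', 'dst', 'classfication'), not anything shown in this diff.
df = pd.read_csv("data.csv")
X = df[["protocl", "src", "dst"]]
y = df["classfication"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
clf = DecisionTreeClassifier(max_depth=5, random_state=0).fit(X_train, y_train)
print(f"held-out accuracy: {clf.score(X_test, y_test):.3f}")
```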
@@ -241,7 +241,7 @@
   ],
   "metadata": {
    "kernelspec": {
-    "display_name": "switch",
+    "display_name": "Python 3 (ipykernel)",
     "language": "python",
     "name": "python3"
    },
@@ -255,7 +255,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.12.7"
+   "version": "3.12.9"
   }
  },
 "nbformat": 4,
@@ -117,8 +117,8 @@
     "[1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]\n",
     "id mapping: \n",
     "[['dst_range', 'dst_meta'], ['src_range', 'src_meta'], ['protocl_range', 'protocl_meta'], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], []]\n",
-    "TCAM bits: 13312\n",
-    "RAM bits: 522\n"
+    "TCAM bits: 13184\n",
+    "RAM bits: 504\n"
    ]
   }
  ],
@@ -263,8 +263,8 @@
     "[1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]\n",
     "id mapping: \n",
     "[['dst_range', 'dst_meta'], ['src_range', 'src_meta'], ['protocl_range', 'protocl_meta'], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], []]\n",
-    "TCAM bits: 3520\n",
-    "RAM bits: 522\n"
+    "TCAM bits: 3320\n",
+    "RAM bits: 504\n"
    ]
   }
  ],
@@ -368,8 +368,8 @@
     "[1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]\n",
     "id mapping: \n",
     "[['dst_range', 'dst_meta'], ['src_range', 'src_meta'], ['protocl_range', 'protocl_meta'], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], []]\n",
-    "TCAM bits: 2120\n",
-    "RAM bits: 522\n"
+    "TCAM bits: 2152\n",
+    "RAM bits: 504\n"
    ]
   }
  ],
@@ -382,7 +382,7 @@
   ],
   "metadata": {
    "kernelspec": {
-    "display_name": "switch",
+    "display_name": "Python 3 (ipykernel)",
     "language": "python",
     "name": "python3"
    },
@@ -396,7 +396,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.12.7"
+   "version": "3.12.9"
   }
  },
 "nbformat": 4,
combine.py — 74 lines removed

@@ -1,74 +0,0 @@ (entire file deleted)

#!/usr/bin/env python3
"""combined.py

Concatenate every CSV that matches the pattern

    data/processed/<name>/<name>.csv

into a single file:

    data/combined/data.csv

The script streams each source CSV in 1-million-row chunks so memory stays low.
Typos in the historic column names (protocl/classfication) are fixed on-the-fly.

Usage
-----
python combined.py

You can optionally supply a different root directory:

python combined.py --root other/processed_dir --out other/combined/data.csv
"""
from __future__ import annotations

import argparse
from pathlib import Path
import os
import pandas as pd

CHUNK = 1_000_000  # rows per read_csv chunk


def fix_cols(df: pd.DataFrame) -> pd.DataFrame:
    """Rename legacy columns to canonical names."""
    return df.rename(
        columns={"protocl": "protocol", "classfication": "classification"}
    )


def find_source_csvs(proc_root: Path):
    """Yield CSV paths that exactly match processed/<name>/<name>.csv."""
    for sub in sorted(proc_root.iterdir()):
        if not sub.is_dir():
            continue
        target = sub / f"{sub.name}.csv"
        if target.exists():
            yield target


def combine(proc_root: Path, out_path: Path):
    out_path.parent.mkdir(parents=True, exist_ok=True)

    first_write = True
    for csv_path in find_source_csvs(proc_root):
        print(f"→ adding {csv_path.relative_to(proc_root.parent)}")
        for chunk in pd.read_csv(csv_path, chunksize=CHUNK):
            chunk = fix_cols(chunk)
            chunk.to_csv(
                out_path,
                mode="w" if first_write else "a",
                header=first_write,
                index=False,
            )
            first_write = False
    print(f"✓ combined CSV written to {out_path}")


def main():
    p = argparse.ArgumentParser(description="Combine processed CSVs into one.")
    p.add_argument("--root", default="data/processed", help="processed dir root")
    p.add_argument("--out", default="data/combined/data.csv", help="output CSV")
    args = p.parse_args()

    combine(Path(args.root).expanduser(), Path(args.out).expanduser())


if __name__ == "__main__":
    main()
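As a standalone illustration of the streaming pattern the removed `combine.py` relies on (read each source in chunks and append to a single output so memory stays flat), here is a minimal sketch; the file names are placeholders, not paths from the repository.

```python
import pandas as pd

sources = ["a.csv", "b.csv"]  # placeholder inputs
first = True
for src in sources:
    for chunk in pd.read_csv(src, chunksize=1_000_000):
        # The first chunk creates the file with a header; later chunks append without one.
        chunk.to_csv("combined.csv", mode="w" if first else "a", header=first, index=False)
        first = False
```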
@@ -1,80 +0,0 @@ (entire file deleted)

#!/usr/bin/env python3
from pathlib import Path
import numpy as np
import pandas as pd
from labels import mac_to_label
from tqdm import tqdm
import os

ROOT = Path(__file__).resolve().parent
PCAP_DIR = ROOT / "data" / "pcap"
CSV_DIR = ROOT / "data" / "processed"
CSV_DIR.mkdir(parents=True, exist_ok=True)

BATCH = 100_000  # packets per chunk

from scapy.all import rdpcap


def process_pcap(pcap_path: str, csv_path: str) -> None:
    all_packets = rdpcap(pcap_path)

    print("rdpcap done", flush=True)
    results = []
    for packet in tqdm(all_packets):
        size = len(packet)
        try:
            proto = packet.proto
        except AttributeError:
            proto = 0
        try:
            sport = packet.sport
            dport = packet.dport
        except AttributeError:
            sport = 0
            dport = 0

        proto = int(proto)
        sport = int(sport)
        dport = int(dport)

        if "Ether" in packet:
            eth_dst = packet["Ether"].dst
            if eth_dst in mac_to_label:
                classification = mac_to_label[eth_dst]
            else:
                classification = "other"
        else:
            classification = "other"

        metric = [proto, sport, dport, classification]
        results.append(metric)
    results = (np.array(results)).T

    # store the features in the dataframe
    dataframe = pd.DataFrame({'protocl': results[0], 'src': results[1], 'dst': results[2], 'classfication': results[3]})
    columns = ['protocl', 'src', 'dst', 'classfication']

    # save the dataframe to the csv file; if it does not exist, create one
    if os.path.exists(csv_path):
        dataframe.to_csv(csv_path, index=False, sep=',', mode='a', columns=columns, header=False)
    else:
        dataframe.to_csv(csv_path, index=False, sep=',', columns=columns)

    print("Done")


def main() -> None:
    for pcap in sorted(PCAP_DIR.rglob("*.pcap")):
        rel_csv = pcap.relative_to(PCAP_DIR).with_suffix(".csv")
        csv_path = CSV_DIR / rel_csv
        if csv_path.exists():
            print(f"Skip {rel_csv} (CSV exists)")
            continue
        print(f"Processing {rel_csv}")
        csv_path.parent.mkdir(parents=True, exist_ok=True)
        process_pcap(str(pcap), str(csv_path))


if __name__ == "__main__":
    main()
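The removed script loads each capture fully into memory with `rdpcap()` before iterating (note that `BATCH` is defined but never used). If streaming were preferred, Scapy's `PcapReader` iterates packets lazily; a minimal sketch, not code from this commit, with a placeholder path:

```python
from scapy.all import PcapReader

with PcapReader("data.pcap") as packets:  # placeholder path
    for packet in packets:
        # Same fallbacks as the removed script: default to 0 when a layer is absent.
        proto = int(getattr(packet, "proto", 0))
        sport = int(getattr(packet, "sport", 0))
        dport = int(getattr(packet, "dport", 0))
```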
@@ -1,50 +0,0 @@ (entire file deleted)

#!/usr/bin/env bash
# Usage: extract_all.sh SOURCE_DIR TARGET_DIR
# For every .tar, .tar.gz, .tgz, .tar.bz2, .tar.xz in SOURCE_DIR:
#   1. Create TARGET_DIR/<name>/
#   2. If TARGET_DIR/<name>/<name>.pcap already exists, skip the archive.
#   3. Otherwise, extract the archive into its own folder.

set -euo pipefail

if [[ $# -ne 2 ]]; then
    echo "Usage: $0 SOURCE_DIR TARGET_DIR" >&2
    exit 1
fi

src_dir="$1"
dst_dir="$2"
mkdir -p "$dst_dir"

# Strip common extensions to recover the base name
strip_ext() {
    local n="$1"
    n=${n%.tar.gz}; n=${n%.tgz}; n=${n%.tar.bz2}; n=${n%.tar.xz}; n=${n%.tar}
    echo "$n"
}

shopt -s nullglob
for archive in "$src_dir"/*.tar{,.gz,.bz2,.xz} "$src_dir"/*.tgz; do
    base=$(basename "$archive")
    name=$(strip_ext "$base")
    out_dir="$dst_dir/$name"
    key_file="$out_dir/$name.pcap"

    if [[ -f "$key_file" ]]; then
        echo "Skipping $archive — $key_file already present"
        continue
    fi

    echo "Extracting $archive into $out_dir"
    mkdir -p "$out_dir"

    case "$archive" in
        *.tar)          tar -xf  "$archive" -C "$out_dir" ;;
        *.tar.gz|*.tgz) tar -xzf "$archive" -C "$out_dir" ;;
        *.tar.bz2)      tar -xjf "$archive" -C "$out_dir" ;;
        *.tar.xz)       tar -xJf "$archive" -C "$out_dir" ;;
        *)              echo "Unknown type: $archive" ;;
    esac
done

echo "All archives processed."
@@ -4,4 +4,3 @@ pandas
 scikit-learn
 pydotplus
 matplotlib
-scipy
@@ -1,44 +0,0 @@ (entire file deleted)

#!/usr/bin/env python3
"""
csvdiff.py file1.csv file2.csv

Streams both files; prints the first differing line or
'No differences found'. Uses O(1) memory.
"""

import sys
from itertools import zip_longest
from pathlib import Path


def open_checked(p: str):
    print(p)
    path = Path(p)
    try:
        return path.open("r", newline=""), path
    except FileNotFoundError:
        sys.exit(f"Error: {path} not found")


def human(n: int) -> str:
    return f"{n:,}"


def main(a_path: str, b_path: str) -> None:
    fa, a = open_checked(a_path)
    fb, b = open_checked(b_path)

    with fa, fb:
        for idx, (ra, rb) in enumerate(zip_longest(fa, fb), 1):
            if ra != rb:
                print(f"Files differ at line {human(idx)}")
                if ra is None:
                    print(f"{a} ended early")
                elif rb is None:
                    print(f"{b} ended early")
                else:
                    print(f"{a}: {ra.rstrip()}")
                    print(f"{b}: {rb.rstrip()}")
                return
        print("No differences found")


if __name__ == "__main__":
    if len(sys.argv) != 3:
        sys.exit("Usage: csvdiff.py file1.csv file2.csv")
    main(sys.argv[1], sys.argv[2])
File diff suppressed because one or more lines are too long

@@ -1,206 +0,0 @@ (entire file deleted)

#!/usr/bin/env python3
"""diversity_metrics.py (fast version)

Estimate how much diversity each CSV adds without building a giant in-memory
DataFrame. Designed for IoT packet logs with millions of rows.

Quick summary printed as a GitHub-style table (requires *tabulate*; falls back
to pandas plain text).

Usage
-----
python diversity_metrics.py path/to/processed_dir [-r] [--sample 50000]

Metrics
-------
ΔEntropy : change in Shannon entropy of *classification* counts
ΔGini    : change in Gini impurity of the same counts
χ² p     : Pearson χ² p-value, old vs new classification counts
Jaccard  : similarity of unique (src,dst) pairs (0 → new pairs, 1 → no new)
KS src p : Kolmogorov–Smirnov p-value, source-port dist (uses sampling)
KS dst p : Kolmogorov–Smirnov p-value, dest-port dist (uses sampling)

Speed tricks
------------
* No growing DataFrame; we keep Counters / sets / lists.
* Ports for KS are *sampled* (default 50 k) to bound cost.
* (src,dst) pairs are hashed to a 32-bit int to reduce set overhead.
* pandas reads via **pyarrow** engine when available.
"""

import argparse
from pathlib import Path
from collections import Counter
from typing import List, Set

import numpy as np
import pandas as pd
from scipy.stats import chi2_contingency, ks_2samp, entropy

try:
    from tabulate import tabulate
    _USE_TABULATE = True
except ImportError:
    _USE_TABULATE = False

# -----------------------------------------------------------------------------
# Helper metrics
# -----------------------------------------------------------------------------

def shannon(counts: Counter) -> float:
    total = sum(counts.values())
    if total == 0:
        return 0.0
    p = np.fromiter(counts.values(), dtype=float)
    p /= total
    return entropy(p, base=2)


def gini(counts: Counter) -> float:
    total = sum(counts.values())
    if total == 0:
        return 0.0
    return 1.0 - sum((n / total) ** 2 for n in counts.values())


def jaccard(a: Set[int], b: Set[int]) -> float:
    if not a and not b:
        return 1.0
    return len(a & b) / len(a | b)

# -----------------------------------------------------------------------------
# Core analysis
# -----------------------------------------------------------------------------

def analyse(csv_files: List[Path], sample_size: int):
    """Return list of dicts with diversity metrics for each added file."""

    # cumulative state (no big DataFrame!)
    class_counter: Counter = Counter()
    pair_hashes: Set[int] = set()
    src_list: List[int] = []
    dst_list: List[int] = []

    rows = []

    for csv_path in csv_files:
        df = pd.read_csv(
            csv_path,
            engine="pyarrow" if pd.__version__ >= "2" else "c",  # fast parse
            usecols=["protocl", "src", "dst", "classfication"],
            dtype={
                "protocl": "uint16",
                "protocol": "uint16",
                "src": "uint16",
                "dst": "uint16",
            },
        )
        # normalise column names
        df.rename(columns={"protocl": "protocol", "classfication": "classification"}, inplace=True)

        # snapshot previous state
        prev_class = class_counter.copy()
        prev_pairs = pair_hashes.copy()
        prev_src = np.asarray(src_list, dtype=np.uint16)
        prev_dst = np.asarray(dst_list, dtype=np.uint16)

        # --- update cumulative structures ------------------------------------
        class_counter.update(df["classification"].value_counts().to_dict())

        # hash (src,dst) into 32-bit int to save memory
        pair_ids = (df["src"].to_numpy(dtype=np.uint32) << np.uint32(16)) | \
                   df["dst"].to_numpy(dtype=np.uint32)

        # extend port lists (keep small ints)
        src_list.extend(df["src"].tolist())
        dst_list.extend(df["dst"].tolist())

        # --- metrics ----------------------------------------------------------
        # χ² classification
        chi_p = np.nan
        if prev_class:
            all_classes = list(set(prev_class) | set(df["classification"].unique()))
            old = [prev_class.get(c, 0) for c in all_classes]
            new = [df["classification"].value_counts().get(c, 0) for c in all_classes]
            _, chi_p, _, _ = chi2_contingency([old, new])

        # entropy & gini deltas
        delta_entropy = shannon(class_counter) - shannon(prev_class)
        delta_gini = gini(class_counter) - gini(prev_class)

        # Jaccard on pair hashes
        jc = jaccard(prev_pairs, pair_hashes)

        # KS tests on sampled ports
        ks_src_p = ks_dst_p = np.nan
        if prev_src.size:
            new_src = df["src"].to_numpy(dtype=np.uint16)
            new_dst = df["dst"].to_numpy(dtype=np.uint16)
            if prev_src.size > sample_size:
                prev_src_sample = np.random.choice(prev_src, sample_size, replace=False)
            else:
                prev_src_sample = prev_src
            if new_src.size > sample_size:
                new_src_sample = np.random.choice(new_src, sample_size, replace=False)
            else:
                new_src_sample = new_src
            if prev_dst.size > sample_size:
                prev_dst_sample = np.random.choice(prev_dst, sample_size, replace=False)
            else:
                prev_dst_sample = prev_dst
            if new_dst.size > sample_size:
                new_dst_sample = np.random.choice(new_dst, sample_size, replace=False)
            else:
                new_dst_sample = new_dst

            ks_src_p = ks_2samp(prev_src_sample, new_src_sample).pvalue
            ks_dst_p = ks_2samp(prev_dst_sample, new_dst_sample).pvalue

        rows.append(
            {
                "File": csv_path.name,
                "Rows": len(df),
                "ΔEntropy": round(delta_entropy, 4),
                "ΔGini": round(delta_gini, 4),
                "χ² p": f"{chi_p:.3g}" if not np.isnan(chi_p) else "NA",
                "Jaccard": round(jc, 3),
                "KS src p": f"{ks_src_p:.3g}" if not np.isnan(ks_src_p) else "NA",
                "KS dst p": f"{ks_dst_p:.3g}" if not np.isnan(ks_dst_p) else "NA",
            }
        )
    return rows

# -----------------------------------------------------------------------------
# CLI
# -----------------------------------------------------------------------------

def main():
    ap = argparse.ArgumentParser(description="Evaluate diversity contribution of each CSV (fast version).")
    ap.add_argument("csv_dir", help="Directory containing CSV files")
    ap.add_argument("-r", "--recursive", action="store_true", help="Recursively search csv_dir")
    ap.add_argument("--sample", type=int, default=50_000, help="Sample size for KS tests (default 50k)")
    args = ap.parse_args()

    root = Path(args.csv_dir)
    pattern = "**/*.csv" if args.recursive else "*.csv"
    csv_files = sorted(root.glob(pattern))
    if not csv_files:
        print("No CSV files found.")
        return

    table_rows = analyse(csv_files, args.sample)

    if _USE_TABULATE:
        print(tabulate(table_rows, headers="keys", tablefmt="github", floatfmt=".4f"))
    else:
        print(pd.DataFrame(table_rows).to_string(index=False))

    print(
        "\nLegend:\n • p-values (χ², KS) < 0.05 → new file significantly shifts distribution (GOOD)"
        "\n • Positive ΔEntropy or ΔGini → richer mix; near 0 → little new info"
        "\n • Jaccard close to 0 → many unseen (src,dst) pairs; close to 1 → redundant."
    )


if __name__ == "__main__":
    main()
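For intuition about the metrics the removed script reports, here is a toy check with made-up class counts and port pairs (not data from the repository). Incidentally, at least as captured here, `pair_ids` is computed but never inserted into `pair_hashes`, so the Jaccard column would always evaluate to 1.0.

```python
from collections import Counter
import numpy as np
from scipy.stats import entropy

def shannon(counts: Counter) -> float:
    p = np.fromiter(counts.values(), dtype=float)
    return entropy(p / p.sum(), base=2)

before = Counter({"camera": 900, "hub": 100})      # made-up class counts
after = before + Counter({"smart_plug": 500})      # counts after adding one more CSV
print(round(shannon(after) - shannon(before), 4))  # ΔEntropy > 0 → richer class mix

old_pairs = {(80, 443), (53, 53)}                  # made-up (src, dst) pairs
new_pairs = old_pairs | {(1883, 8883)}
print(len(old_pairs & new_pairs) / len(old_pairs | new_pairs))  # Jaccard < 1 → unseen pairs added
```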
setup.sh — 14 lines removed

@@ -1,14 +0,0 @@ (entire file deleted)

#!/usr/bin/env bash
# Creates the directory layout:
#   data/
#     tar/
#     pcap/
#     processed/

set -euo pipefail

root="$(cd -- "$(dirname "${BASH_SOURCE[0]}")" && pwd)"

mkdir -p "$root"/data/{tar,pcap,processed,combined}

echo "Directory structure ready under $root/data/"