25 Commits

Author SHA1 Message Date
51f920e2ba upload example tree based on 10-01 data 2025-06-20 03:18:09 +00:00
1136bd93ea update readme 2025-06-14 03:10:48 +00:00
2ad40946d1 update treetormt 2025-06-12 19:53:46 +00:00
50075b1acc Merge remote-tracking branch 'origin/decision-tree-nudge-boundaries' 2025-06-12 19:52:02 +00:00
Jai Parera
1585399c7d Fixed loop ordering and path_to_class in JSON 2025-06-11 23:37:33 -07:00
8301998da3 temp fix for issue with metadata 2025-06-12 06:10:39 +00:00
3b2d6b3186 fix eval bug in parser 2025-06-12 05:37:48 +00:00
Jai Parera
24fc2ed6f7 Add support for combined datasets and analysis 2025-06-11 20:38:37 -07:00
Nathan Huey
fda251f051 First try boundary nudging 2025-06-11 16:55:23 -07:00
541538fcfe update decision tree results 2025-06-11 22:37:38 +00:00
Arthur Lu
afc882a569 Merge pull request #5 from ltcptgeneral/Parser
Fix range bug in TreeCompress, update parser to autofail non-singular…
2025-06-11 12:11:39 -07:00
6de3807fe2 fix range bug in TreeCompress, update parser to autofail non-singular classifications 2025-06-11 19:10:19 +00:00
Arthur Lu
fc16d3c586 Merge pull request #4 from ltcptgeneral/Parser
Eval compressed tree accuracy
2025-06-11 11:28:38 -07:00
7bee40ecf9 restore TreeCompress and TreeToRMT from main 2025-06-11 18:28:18 +00:00
krishpatel
e811171a73 Implemented working compressed tree parser to get classification accuracy 2025-06-11 11:10:49 -07:00
61a451b82d fix counting issue in ram bits 2025-06-11 04:47:35 +00:00
c73de36c70 replace classes with class string instead of index 2025-06-11 04:41:32 +00:00
fadeab8a99 fix incorrect classes in TreeCompress,
closes #1
2025-06-08 18:34:39 +00:00
c208037ae9 implement priority aware algorithm,
add dataset size printout
2025-06-07 01:10:05 +00:00
ae3128f6e8 better output formatting for tcam/ram bits 2025-06-05 16:56:10 +00:00
25e5a86a43 implement correct prefix counting 2025-06-05 03:42:30 +00:00
d3fe6efd47 add worst_case converter to rmt 2025-06-04 22:56:17 +00:00
23867747cd rename RMTConvert to TreeCompress 2025-06-02 21:54:43 +00:00
eeebc17d56 add ideal-rmt-simulator as submodule 2025-06-02 21:52:10 +00:00
0d5e51f582 add list of paths and classes to compressed_tree output 2025-06-02 21:48:10 +00:00
23 changed files with 4643 additions and 54 deletions

2
.gitattributes vendored Normal file

@@ -0,0 +1,2 @@
# force LF for any shell script
*.sh text eol=lf

5
.gitignore vendored

@@ -1,4 +1,5 @@
data.*
__pycache__
tree.json
compressed_tree.json
data/*
.DS_Store
.ipynb_checkpoints/

3
.gitmodules vendored Normal file

@@ -0,0 +1,3 @@
[submodule "ideal-rmt-simulator"]
path = ideal-rmt-simulator
url = https://github.com/rfchang/ideal-rmt-simulator

152
CompressedTreeParser.ipynb Normal file

@@ -0,0 +1,152 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 138,
"id": "938dec51",
"metadata": {},
"outputs": [],
"source": [
"import numpy as np\n",
"import pandas as pd\n",
"import argparse\n",
"from sklearn.tree import DecisionTreeClassifier, plot_tree, _tree\n",
"from sklearn.metrics import accuracy_score\n",
"from sklearn.tree import export_graphviz\n",
"import pydotplus\n",
"from matplotlib import pyplot as plt\n",
"from labels import mac_to_label\n",
"import json\n",
"import math"
]
},
{
"cell_type": "code",
"execution_count": 139,
"id": "442624c7",
"metadata": {},
"outputs": [],
"source": [
"Set1 = pd.read_csv('data/combined/data.csv').values.tolist()\n",
"X = [i[0:3] for i in Set1]\n",
"Y =[i[3] for i in Set1]"
]
},
{
"cell_type": "code",
"execution_count": 142,
"id": "12ad454d",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"{'0': 20, '1': 20, '2': 9, '3': 20, '4': 0, '5': 13, '6': 20, '7': 0, '8': 12, '9': 4, '10': 20, '11': 4, '12': 1, '13': 16, '14': 20, '15': 2, '16': 20, '17': 0, '18': 20, '19': 20, '20': 20, '21': 20, '22': 20, '23': 1, '24': 2, '25': 20, '26': 13, '27': 11, '28': 20, '29': 20}\n"
]
}
],
"source": [
"predict_Yt = []\n",
"index=0\n",
"\n",
"with open('compressed_tree.json', 'r') as file:\n",
" data = json.load(file)\n",
" classes = data[\"classes\"]\n",
" for x in X:\n",
" counter = 0\n",
" class_set = []\n",
" paths_set = []\n",
" features = [\"protocol\", \"src\", \"dst\"]\n",
" for feature in features:\n",
" if feature in data[\"layers\"]:\n",
" for node in data['layers'][feature]:\n",
" if node['min'] is None:\n",
" if x[counter] <= node['max']:\n",
" class_set.append(node['classes'])\n",
" paths_set.append(node[\"paths\"])\n",
" break #is this an issue?\n",
" else:\n",
" continue\n",
" elif node['max'] is None:\n",
" if node['min'] < x[counter]:\n",
" class_set.append(node['classes'])\n",
" paths_set.append(node[\"paths\"])\n",
" break #is this an issue?\n",
" else:\n",
" continue\n",
" elif node['min'] < x[counter] and x[counter] <= node['max']:\n",
" class_set.append(node['classes'])\n",
" paths_set.append(node[\"paths\"])\n",
" break #is this an issue?\n",
"\n",
" counter += 1\n",
" result = set(class_set[0])\n",
" paths = set(paths_set[0])\n",
" for s in class_set[1:]:\n",
" result.intersection_update(s)\n",
" for s in paths_set[1:]:\n",
" paths.intersection_update(s)\n",
"\n",
" #predict_Yt.append(list(result))\n",
" #print(result)\n",
" if len(paths) != 1:\n",
" print(paths)\n",
" print(x)\n",
" print(result)\n",
" assert len(paths) == 1\n",
" path = list(paths)[0]\n",
" pred = data[\"path_to_class\"][str(path)]\n",
" pred_class = classes[pred]\n",
" predict_Yt.append(pred_class)\n",
" \n",
" index += 1"
]
},
{
"cell_type": "code",
"execution_count": 143,
"id": "8b4c56b6",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"0.8410252791654538\n"
]
}
],
"source": [
"correct = 0\n",
"for i in range(len(Y)):\n",
" prediction = predict_Yt[i]\n",
" if prediction != None and Y[i] == prediction:\n",
" correct += 1\n",
"\n",
"print(correct / len(Y))"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.7"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
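For readers skimming this notebook, here is a compact standalone sketch (not part of the repo) of the lookup the parser cell performs against `compressed_tree.json`: each feature value is matched to a (min, max] bucket in that feature's layer, the per-feature path sets are intersected, and the single surviving path id is mapped to a class through `path_to_class`. The function name `classify` and the sample values are illustrative only.

```python
import json

def classify(sample, tree, features=("protocol", "src", "dst")):
    """Sketch of the notebook's lookup: `sample` holds values in `features` order.

    Each value is matched against its layer's (min, max] buckets (None = unbounded),
    the per-feature path sets are intersected, and the lone surviving path id is
    mapped to a class via path_to_class. Note the example tree spells its protocol
    layer "protocl", so that feature is skipped here, exactly as in the notebook.
    """
    surviving = None
    for value, feature in zip(sample, features):
        if feature not in tree["layers"]:
            continue
        for node in tree["layers"][feature]:
            lo, hi = node["min"], node["max"]
            if (lo is None or lo < value) and (hi is None or value <= hi):
                paths = set(node["paths"])
                surviving = paths if surviving is None else surviving & paths
                break
    assert surviving is not None and len(surviving) == 1, "non-singular classification"
    return tree["classes"][tree["path_to_class"][str(surviving.pop())]]

with open("compressed_tree.json") as f:   # file name as used by the notebook
    tree = json.load(f)
print(classify([0, 50, 2200], tree))      # e.g. "Smart Things" with the example tree
```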

File diff suppressed because one or more lines are too long

View File

@@ -89,7 +89,7 @@
],
"metadata": {
"kernelspec": {
"display_name": "switch",
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
@@ -103,7 +103,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.7"
"version": "3.12.9"
}
},
"nbformat": 4,

View File

@@ -2,18 +2,30 @@
Run `pip install -r requirements.txt`
Run `setup.sh`
# Tree Generation
## Download Dataset
Download the *September 22 2016* dataset from: https://iotanalytics.unsw.edu.au/iottraces.html#bib18tmc
Download the *September 22 2016* dataset (or others) from: https://iotanalytics.unsw.edu.au/iottraces.html#bib18tmc
Rename the file as data.pcap
Place these into the `data/tar` folder.
Run `extract_tars.sh` which will extract and place the `.pcap` files at the corresponding location inside `data/pcap`.
## Preprocessing Dataset
Run `ExtractDataset.ipynb`, this will take a few minutes
Run `extract_all_datasets.py` which will extract the data from each file in `data/pcap` and turn it into the corresponding `.csv` file inside `data/processed`. This will take a few minutes per file. Combine the data under `data/csv` using `combine_csv.py`. This will overwrite `data/combined/data.csv` which you can use for the decision tree.
## Training
Run `DecisionTree.ipynb`, the tree should be output in `tree`
Run `DecisionTree.ipynb`, the tree should be output in `tree.json`
## Compression
Run `TreeCompress.ipynb`, the tree should be output in `compressed_tree.json`
## RMT
Run `TreeToRMT.ipynb`, it will report the TCAM and SRAM usage of the compressed tree

View File

@@ -2,7 +2,7 @@
"cells": [
{
"cell_type": "code",
"execution_count": 9,
"execution_count": 73,
"id": "ec310f34",
"metadata": {},
"outputs": [],
@@ -14,7 +14,7 @@
},
{
"cell_type": "code",
"execution_count": 10,
"execution_count": 74,
"id": "5b54797e",
"metadata": {},
"outputs": [],
@@ -28,22 +28,25 @@
},
{
"cell_type": "code",
"execution_count": 11,
"execution_count": 75,
"id": "a38fdb8a",
"metadata": {},
"outputs": [],
"source": [
"# First cleanup the tree by rounding the decision points to integer values\n",
"# We assume all features will use integer values. If this is not the case, then training data should be normalized so that integer values can be accurate enough\n",
"# we also enumerate all the paths for later use\n",
"\n",
"i = 0\n",
"\n",
"path_ids = set()\n",
"path_classes = tree[\"classes\"]\n",
"\n",
"# for each path in the tree\n",
"for path in paths:\n",
"\t# assign a path id \n",
"\tpath[\"id\"] = i\n",
"\ti += 1\n",
"\tpath_ids.add(i)\n",
"\t#path_classes.add(path[\"classification\"])\n",
"\ti += 1\t\n",
"\t# for each condition\n",
"\tconditions = path[\"conditions\"]\n",
"\tfor condition in conditions:\n",
@@ -57,7 +60,7 @@
},
{
"cell_type": "code",
"execution_count": 12,
"execution_count": 76,
"id": "2fd4f738",
"metadata": {},
"outputs": [],
@@ -80,7 +83,7 @@
},
{
"cell_type": "code",
"execution_count": 13,
"execution_count": 77,
"id": "98cde024",
"metadata": {},
"outputs": [],
@@ -106,13 +109,13 @@
"\t\tvalue = condition[\"value\"]\n",
"\n",
"\t\t# move the min/max for the corresponding feature in compressed\n",
"\t\tif operation == \"<=\" and compressed[feature][\"min\"] is None:\n",
"\t\tif operation == \"<=\" and compressed[feature][\"max\"] is None:\n",
"\t\t\tcompressed[feature][\"max\"] = value\n",
"\t\telif operation == \">\" and compressed[feature][\"max\"] is None:\n",
"\t\telif operation == \">\" and compressed[feature][\"min\"] is None:\n",
"\t\t\tcompressed[feature][\"min\"] = value\n",
"\t\telif operation == \"<=\" and value < compressed[feature][\"min\"]:\n",
"\t\telif operation == \"<=\" and value < compressed[feature][\"max\"]:\n",
"\t\t\tcompressed[feature][\"max\"] = value\n",
"\t\telif operation == \">\" and value > compressed[feature][\"max\"]:\n",
"\t\telif operation == \">\" and value > compressed[feature][\"min\"]:\n",
"\t\t\tcompressed[feature][\"min\"] = value\n",
"\n",
"\tpath[\"compressed\"] = compressed"
@@ -120,7 +123,7 @@
},
{
"cell_type": "code",
"execution_count": 14,
"execution_count": 78,
"id": "b6fbadbf",
"metadata": {},
"outputs": [],
@@ -168,49 +171,64 @@
"\t\t# for each bucket which encompases the condition's range, add this path's id to the sets \n",
"\t\ti = 0\n",
"\t\tfor bp in breakpoints[feature_name]:\n",
"\t\t\tin_range = is_in_range(bp, lower, upper)\n",
"\t\t\tif in_range:\n",
"\t\t\tif is_in_range(bp, lower, upper):\n",
"\t\t\t\tbuckets_id[feature_name][i].add(ID)\n",
"\t\t\t\tbuckets_class[feature_name][i].add(Class)\n",
"\t\t\ti += 1"
"\t\t\ti += 1\n",
"\n",
"\t\tif is_in_range(bp+1, lower, upper):\n",
"\t\t\tbuckets_id[feature_name][i].add(ID)\n",
"\t\t\tbuckets_class[feature_name][i].add(Class)"
]
},
{
"cell_type": "code",
"execution_count": 15,
"execution_count": 79,
"id": "0a767971",
"metadata": {},
"outputs": [],
"source": [
"# combine breakpoints and buckets to one representation\n",
"\n",
"compressed_tree = defaultdict(list)\n",
"compressed_layers = defaultdict(list)\n",
"for feature_name in buckets_id:\n",
"\tlower = None\n",
"\tupper = breakpoints[feature_name][0]\n",
"\tpaths = buckets_id[feature_name][0]\n",
"\tclasses = buckets_class[feature_name][0]\n",
"\t#print(f\"{feature_name} = [{lower}, {upper}]: {members}\")\n",
"\tcompressed_tree[feature_name].append({\"min\": lower, \"max\": upper, \"paths\": paths, \"classes\": classes})\n",
"\tcompressed_layers[feature_name].append({\"min\": lower, \"max\": upper, \"paths\": paths, \"classes\": classes})\n",
"\tfor i in range(1, len(buckets_id[feature_name]) - 1):\n",
"\t\tlower = breakpoints[feature_name][i-1]\n",
"\t\tupper = breakpoints[feature_name][i]\n",
"\t\tmembers = buckets_id[feature_name][i]\n",
"\t\tpaths = buckets_id[feature_name][i]\n",
"\t\tclasses = buckets_class[feature_name][i]\n",
"\t\t#print(f\"{feature_name} = [{lower}, {upper}]: {buckets[feature_name][i]}\")\n",
"\t\tcompressed_tree[feature_name].append({\"min\": lower, \"max\": upper, \"paths\": paths, \"classes\": classes})\n",
"\t\tcompressed_layers[feature_name].append({\"min\": lower, \"max\": upper, \"paths\": paths, \"classes\": classes})\n",
"\tlower = breakpoints[feature_name][len(breakpoints[feature_name]) - 1]\n",
"\tupper = None\n",
"\tmembers = buckets_id[feature_name][len(buckets_id[feature_name]) - 1]\n",
"\tpaths = buckets_id[feature_name][len(buckets_id[feature_name]) - 1]\n",
"\tclasses = buckets_class[feature_name][len(buckets_class[feature_name]) - 1]\n",
"\t#print(f\"{feature_name} = [{lower}, {upper}]: {members}\")\n",
"\tcompressed_tree[feature_name].append({\"min\": lower, \"max\": upper, \"paths\": paths, \"classes\": classes})\n",
"\t#print(\"=\"*40)"
"\tcompressed_layers[feature_name].append({\"min\": lower, \"max\": upper, \"paths\": paths, \"classes\": classes})\n",
"\t#print(\"=\"*40)\n",
"\n",
"path_to_class = {}\n",
"for i in range(len(tree[\"paths\"])):\n",
" path = tree[\"paths\"][i]\n",
" path_to_class[path[\"id\"]] = path[\"classification\"]\n",
"\n",
"compressed_tree = {\n",
"\t\"paths\": path_ids,\n",
"\t\"classes\": path_classes,\n",
"\t\"layers\": compressed_layers,\n",
" \"path_to_class\": path_to_class,\n",
"}"
]
},
{
"cell_type": "code",
"execution_count": 16,
"execution_count": 80,
"id": "561b0bc1",
"metadata": {},
"outputs": [],
@@ -229,7 +247,7 @@
],
"metadata": {
"kernelspec": {
"display_name": "switch",
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
@@ -243,7 +261,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.7"
"version": "3.12.9"
}
},
"nbformat": 4,

412
TreeToRMT.ipynb Normal file

@@ -0,0 +1,412 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"id": "58fc6db9",
"metadata": {},
"outputs": [],
"source": [
"import json\n",
"import math"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "e07be4b3",
"metadata": {},
"outputs": [],
"source": [
"f = open(\"compressed_tree.json\")\n",
"tree = json.loads(f.read())\n",
"layers = tree[\"layers\"]\n",
"classes = tree[\"classes\"]\n",
"f.close()"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "1516ff91",
"metadata": {},
"outputs": [],
"source": [
"field_width = {\n",
"\t\"src\": 16,\n",
"\t\"dst\": 16,\n",
"\t\"protocl\": 8,\n",
"}"
]
},
{
"cell_type": "markdown",
"id": "f9193827",
"metadata": {},
"source": [
"# Worst Case RMT"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "5e37cfc5",
"metadata": {},
"outputs": [],
"source": [
"def worst_case_rmt(tree):\n",
"\trmt = []\n",
"\tstep = 0\n",
"\n",
"\ttcam_bits = 0\n",
"\tram_bits = 0\n",
"\n",
"\tfor layer in layers:\n",
"\t\tnum_ranges = len(layers[layer])\n",
"\t\t# assume that each range requires all of 2*k prefixes when performing prefix expansion\n",
"\t\t# therefore there are 2*k * R for R ranges and width k\n",
"\t\tnum_prefixes = 2 * field_width[layer] * num_ranges\n",
"\t\tprefix_width = field_width[layer]\n",
"\n",
"\t\ttcam = {\n",
"\t\t\t\"id\": f\"{layer}_range\",\n",
"\t\t\t\"step\": step,\n",
"\t\t\t\"match\": \"ternary\",\n",
"\t\t\t\"entries\": num_prefixes,\n",
"\t\t\t\"key_size\": prefix_width\n",
"\t\t}\n",
"\t\ttcam_bits += num_prefixes * prefix_width\n",
"\n",
"\t\t# assume basic pointer reuse for metadata storage\n",
"\t\tram = {\n",
"\t\t\t\"id\": f\"{layer}_meta\",\n",
"\t\t\t\"step\": step,\n",
"\t\t\t\"match\": \"exact\",\n",
"\t\t\t\"method\": \"index\",\n",
"\t\t\t\"key_size\": math.ceil(math.log2(num_ranges)),\n",
"\t\t\t\"data_size\": len(classes)\n",
"\t\t}\n",
"\t\tram_bits += num_ranges * len(classes)\n",
"\n",
"\t\trmt.append(tcam)\n",
"\t\trmt.append(ram)\n",
"\n",
"\t\tstep += 1\n",
"\n",
"\treturn rmt, tcam_bits, ram_bits\n",
"\n",
"x, tcam_bits, ram_bits = worst_case_rmt(tree)\n",
"f = open(\"worst_case_rmt.json\", \"w+\")\n",
"f.write(json.dumps(x, indent=4))\n",
"f.close()"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "0dc1d6d4",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"TCAM mapping: \n",
"[1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]\n",
"SRAM mapping: \n",
"[1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]\n",
"id mapping: \n",
"[['dst_range', 'dst_meta'], ['src_range', 'src_meta'], ['protocl_range', 'protocl_meta'], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], []]\n",
"TCAM bits: 13184\n",
"RAM bits: 504\n"
]
}
],
"source": [
"! command python3 ideal-rmt-simulator/sim.py naive_rmt.json\n",
"print(f\"TCAM bits: {tcam_bits}\")\n",
"print(f\"RAM bits: {ram_bits}\")"
]
},
{
"cell_type": "markdown",
"id": "2a628655",
"metadata": {},
"source": [
"# Naive Range Expansion "
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "fb9febe9",
"metadata": {},
"outputs": [],
"source": [
"# shamelessly stolen from: https://github.com/autolyticus/range-to-prefix/blob/master/rangetoprefix.C\n",
"\n",
"def int_to_bin(i, width):\n",
"\treturn bin(i)[2:].zfill(width)\n",
"\n",
"def increment_dc(pfx):\n",
"\tidx = pfx.find(\"*\")\n",
"\tif idx == -1:\n",
"\t\tidx = len(pfx)\n",
"\tidx = idx - 1\n",
"\t#print(pfx, pfx[:idx])\n",
"\treturn pfx[:idx] + \"*\" + pfx[idx+1:]\n",
"\t\n",
"def can_merge(pfx_a, pfx_b):\n",
"\tpfx_a = pfx_a.replace(\"*\", \"\")\n",
"\tpfx_b = pfx_b.replace(\"*\", \"\")\n",
"\treturn pfx_a[:-1] == pfx_b[:-1] and pfx_a[-1] != pfx_b[-1]\n",
"\n",
"def merge(pfx_a, prefixes):\n",
"\tpfx_a = increment_dc(pfx_a)\n",
"\tprefixes[-1] = pfx_a\n",
"\n",
"\tfor i in range(len(prefixes) - 2, -1, -1):\n",
"\t\tif can_merge(prefixes[i], prefixes[i+1]):\n",
"\t\t\tprefixes.pop()\n",
"\t\t\tpfx = increment_dc(prefixes[i])\n",
"\t\t\tprefixes[i] = pfx\n",
"\n",
"def convert_range(lower, upper, width):\n",
"\tprefixes = []\n",
"\tprefix = int_to_bin(lower, width)\n",
"\tprefixes.append(prefix)\n",
"\tnorm_upper = min(upper, 2**width-1)\n",
"\tfor i in range(lower+1, norm_upper+1):\n",
"\t\tprefix = int_to_bin(i, width)\n",
"\t\tif can_merge(prefix, prefixes[-1]):\n",
"\t\t\tmerge(prefix, prefixes)\n",
"\t\telse:\n",
"\t\t\tprefixes.append(prefix)\n",
"\treturn prefixes"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "55167c28",
"metadata": {},
"outputs": [],
"source": [
"def naive_rmt(tree):\n",
"\trmt = []\n",
"\tstep = 0\n",
"\n",
"\ttcam_bits = 0\n",
"\tram_bits = 0\n",
"\n",
"\tfor layer in layers:\n",
"\t\tnum_prefixes = 0\n",
"\t\tprefix_width = field_width[layer]\n",
"\t\t# for each range in the layer, convert the ranges to prefixes using naive range expansion\n",
"\t\tfor r in layers[layer]:\n",
"\t\t\tif r[\"min\"] == None:\n",
"\t\t\t\tr[\"min\"] = 0\n",
"\t\t\telif r[\"max\"] == None:\n",
"\t\t\t\tr[\"max\"] = 2 ** prefix_width\n",
"\t\t\tprefixes = convert_range(r[\"min\"], r[\"max\"], prefix_width)\n",
"\t\t\tr[\"prefixes\"] = prefixes\n",
"\t\t\tnum_prefixes += len(prefixes)\n",
"\t\t\ttcam_bits += len(prefixes) * prefix_width\n",
"\n",
"\t\ttcam = {\n",
"\t\t\t\"id\": f\"{layer}_range\",\n",
"\t\t\t\"step\": step,\n",
"\t\t\t\"match\": \"ternary\",\n",
"\t\t\t\"entries\": num_prefixes,\n",
"\t\t\t\"key_size\": prefix_width,\n",
"\t\t\t\"ranges\": layers[layer]\n",
"\t\t}\n",
"\n",
"\t\tnum_ranges = len(layers[layer])\n",
"\t\t# assume no pointer reuse for metadata storage\n",
"\t\tram = {\n",
"\t\t\t\"id\": f\"{layer}_meta\",\n",
"\t\t\t\"step\": step,\n",
"\t\t\t\"match\": \"exact\",\n",
"\t\t\t\"method\": \"index\",\n",
"\t\t\t\"key_size\": math.ceil(math.log2(num_ranges)),\n",
"\t\t\t\"data_size\": len(classes)\n",
"\t\t}\n",
"\t\tram_bits += num_ranges * len(classes)\n",
"\n",
"\t\trmt.append(tcam)\n",
"\t\trmt.append(ram)\n",
"\n",
"\t\tstep += 1\n",
"\n",
"\treturn rmt, tcam_bits, ram_bits\n",
"\n",
"x, tcam_bits, ram_bits = naive_rmt(tree)\n",
"f = open(\"naive_rmt.json\", \"w+\")\n",
"f.write(json.dumps(x, indent=4))\n",
"f.close()\n"
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "48011528",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"TCAM mapping: \n",
"[1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]\n",
"SRAM mapping: \n",
"[1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]\n",
"id mapping: \n",
"[['dst_range', 'dst_meta'], ['src_range', 'src_meta'], ['protocl_range', 'protocl_meta'], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], []]\n",
"TCAM bits: 3320\n",
"RAM bits: 504\n"
]
}
],
"source": [
"! command python3 ideal-rmt-simulator/sim.py naive_rmt.json\n",
"print(f\"TCAM bits: {tcam_bits}\")\n",
"print(f\"RAM bits: {ram_bits}\")"
]
},
{
"cell_type": "markdown",
"id": "2504b1ba",
"metadata": {},
"source": [
"# Priority Aware Prefix Expansion"
]
},
{
"cell_type": "code",
"execution_count": 9,
"id": "64b7271e",
"metadata": {},
"outputs": [],
"source": [
"# for this technique, we note that given disjoint ranges [0,a][a,b],[b,c] ...\n",
"# then if using a TCAM that selects the first matching prefix, then [0,a],[0,b],[0,c] would be equivalent\n",
"# this is because if for some k<a, even though the range [0,b] could be selected, as long as the prefixes for [0,a] are before [0,b] then the correct prefix will still be selected\n",
"\n",
"def priority_aware(tree):\n",
"\trmt = []\n",
"\tstep = 0\n",
"\n",
"\ttcam_bits = 0\n",
"\tram_bits = 0\n",
"\n",
"\tfor layer in layers:\n",
"\t\tnum_prefixes = 0\n",
"\t\tprefix_width = field_width[layer]\n",
"\t\t# for each range, run the regular prefix expansion, and also the prefix expansion setting the minimum to 0\n",
"\t\t# then check which set of prefixes would be better\n",
"\t\t# we will assume the ranges are already disjoin and in the correct order\n",
"\t\tfor r in layers[layer]:\n",
"\t\t\tif r[\"min\"] == None:\n",
"\t\t\t\tr[\"min\"] = 0\n",
"\t\t\telif r[\"max\"] == None:\n",
"\t\t\t\tr[\"max\"] = 2 ** prefix_width\n",
"\t\t\tregular_prefixes = convert_range(r[\"min\"], r[\"max\"], prefix_width)\n",
"\t\t\tzero_start_prefixes = convert_range(0, r[\"max\"], prefix_width)\n",
"\n",
"\t\t\tif len(regular_prefixes) <= len(zero_start_prefixes):\n",
"\t\t\t\tpfx_type = \"exact\"\n",
"\t\t\t\tprefixes = regular_prefixes\n",
"\t\t\telse:\n",
"\t\t\t\tpfx_type = \"zero\"\n",
"\t\t\t\tprefixes = zero_start_prefixes\n",
"\n",
"\t\t\tr[\"prefixes\"] = prefixes\n",
"\t\t\tr[\"prefix_type\"] = pfx_type\n",
"\t\t\tnum_prefixes += len(prefixes)\n",
"\t\t\ttcam_bits += len(prefixes) * prefix_width\n",
"\n",
"\t\ttcam = {\n",
"\t\t\t\"id\": f\"{layer}_range\",\n",
"\t\t\t\"step\": step,\n",
"\t\t\t\"match\": \"ternary\",\n",
"\t\t\t\"entries\": num_prefixes,\n",
"\t\t\t\"key_size\": prefix_width,\n",
"\t\t\t\"ranges\": layers[layer]\n",
"\t\t}\n",
"\n",
"\t\tnum_ranges = len(layers[layer])\n",
"\t\t# assume no pointer reuse for metadata storage\n",
"\t\tram = {\n",
"\t\t\t\"id\": f\"{layer}_meta\",\n",
"\t\t\t\"step\": step,\n",
"\t\t\t\"match\": \"exact\",\n",
"\t\t\t\"method\": \"index\",\n",
"\t\t\t\"key_size\": math.ceil(math.log2(num_ranges)),\n",
"\t\t\t\"data_size\": len(classes)\n",
"\t\t}\n",
"\t\tram_bits += num_ranges * len(classes)\n",
"\n",
"\t\trmt.append(tcam)\n",
"\t\trmt.append(ram)\n",
"\n",
"\t\tstep += 1\n",
"\n",
"\treturn rmt, tcam_bits, ram_bits\n",
"\n",
"x, tcam_bits, ram_bits = priority_aware(tree)\n",
"f = open(\"priority_aware.json\", \"w+\")\n",
"f.write(json.dumps(x, indent=4))\n",
"f.close()"
]
},
{
"cell_type": "code",
"execution_count": 10,
"id": "cd706e41",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"TCAM mapping: \n",
"[1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]\n",
"SRAM mapping: \n",
"[1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]\n",
"id mapping: \n",
"[['dst_range', 'dst_meta'], ['src_range', 'src_meta'], ['protocl_range', 'protocl_meta'], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], []]\n",
"TCAM bits: 2152\n",
"RAM bits: 504\n"
]
}
],
"source": [
"! command python3 ideal-rmt-simulator/sim.py priority_aware.json\n",
"print(f\"TCAM bits: {tcam_bits}\")\n",
"print(f\"RAM bits: {ram_bits}\")"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.7"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
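TreeToRMT.ipynb sizes the compressed tree three ways: a worst case that budgets 2·k prefixes per range of a k-bit field (R ranges therefore cost 2·k·R TCAM entries of k bits each, e.g. 2·16·9 = 288 entries for nine 16-bit dst ranges), a naive range-to-prefix expansion, and the priority-aware variant, which may rewrite a range's lower bound to 0 because a first-match TCAM has already caught every smaller value in an earlier entry. The sketch below uses a block-aligned range-to-prefix cover rather than the notebook's merge-based converter, but it reproduces the prefixes recorded in the example JSON files for the range shown; it is illustration, not repo code.

```python
def range_to_prefixes(lo, hi, width):
    """Cover the inclusive integer range [lo, hi] with ternary prefixes such as '101111**********'."""
    prefixes = []
    while lo <= hi:
        # largest aligned power-of-two block that starts at lo and still fits in [lo, hi]
        size = lo & -lo if lo else 1 << width
        while size > hi - lo + 1:
            size >>= 1
        k = size.bit_length() - 1                       # number of wildcarded low bits
        prefixes.append(format(lo, f"0{width}b")[: width - k] + "*" * k)
        lo += size
    return prefixes

# dst bucket (47944, 49152] of the example compressed tree, on a 16-bit field
exact = range_to_prefixes(47944, 49152, 16)   # expand the range as stored: 6 prefixes
zero_start = range_to_prefixes(0, 49152, 16)  # priority-aware rewrite [0, max]: 3 prefixes
print(len(exact), len(zero_start))            # the notebook keeps whichever set is smaller
```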

74
combine.py Normal file

@@ -0,0 +1,74 @@
#!/usr/bin/env python3
"""combined.py
Concatenate every CSV that matches the pattern
data/processed/<name>/<name>.csv
into a single file:
data/combined/data.csv
The script streams each source CSV in 1Miorow chunks so memory stays low.
Typos in the historic column names (protocl/classfication) are fixed onthefly.
Usage
-----
python combined.py
You can optionally supply a different root directory:
python combined.py --root other/processed_dir --out other/combined/data.csv
"""
from __future__ import annotations
import argparse
from pathlib import Path
import os
import pandas as pd
CHUNK = 1_000_000 # rows per read_csv chunk
def fix_cols(df: pd.DataFrame) -> pd.DataFrame:
"""Rename legacy columns to canonical names."""
return df.rename(
columns={"protocl": "protocol", "classfication": "classification"}
)
def find_source_csvs(proc_root: Path):
"""Yield CSV paths that exactly match processed/<name>/<name>.csv."""
for sub in sorted(proc_root.iterdir()):
if not sub.is_dir():
continue
target = sub / f"{sub.name}.csv"
if target.exists():
yield target
def combine(proc_root: Path, out_path: Path):
out_path.parent.mkdir(parents=True, exist_ok=True)
first_write = True
for csv_path in find_source_csvs(proc_root):
print(f"→ adding {csv_path.relative_to(proc_root.parent)}")
for chunk in pd.read_csv(csv_path, chunksize=CHUNK):
chunk = fix_cols(chunk)
chunk.to_csv(
out_path,
mode="w" if first_write else "a",
header=first_write,
index=False,
)
first_write = False
print(f"✓ combined CSV written to {out_path}")
def main():
p = argparse.ArgumentParser(description="Combine processed CSVs into one.")
p.add_argument("--root", default="data/processed", help="processed dir root")
p.add_argument("--out", default="data/combined/data.csv", help="output CSV")
args = p.parse_args()
combine(Path(args.root).expanduser(), Path(args.out).expanduser())
if __name__ == "__main__":
main()

View File

@@ -0,0 +1,560 @@
{
"paths": [
0,
1,
2,
3,
4,
5,
6,
7,
8,
9,
10,
11,
12,
13,
14,
15,
16,
17,
18,
19,
20,
21,
22
],
"classes": [
"Amazon Echo",
"Belkin Motion Sensor",
"Belkin Switch",
"Dropcam",
"HP Printer",
"LiFX Bulb",
"NEST Smoke Sensor",
"Netatmo Camera",
"Netatmo Weather station",
"Pixstart photo frame",
"Samsung Smart Cam",
"Smart Things",
"TP-Link Camera",
"TP-Link Plug",
"Triby Speaker",
"Withings",
"Withings Scale",
"Withings sleep sensor",
"iHome PowerPlug",
"other"
],
"layers": {
"dst": [
{
"min": null,
"max": 2136,
"paths": [
0,
1,
2,
3,
4,
5,
6
],
"classes": [
8,
19,
4
]
},
{
"min": 2136,
"max": 2224,
"paths": [
7
],
"classes": [
11
]
},
{
"min": 2224,
"max": 5016,
"paths": [
8,
9
],
"classes": [
1,
19
]
},
{
"min": 5016,
"max": 25848,
"paths": [
10,
11,
12
],
"classes": [
19,
7
]
},
{
"min": 25848,
"max": 47936,
"paths": [
10,
11,
13
],
"classes": [
19,
7
]
},
{
"min": 47936,
"max": 47944,
"paths": [
14
],
"classes": [
3
]
},
{
"min": 47944,
"max": 49152,
"paths": [
16,
15
],
"classes": [
10,
7
]
},
{
"min": 49152,
"max": 49160,
"paths": [
17,
18
],
"classes": [
16,
2
]
},
{
"min": 49160,
"max": null,
"paths": [
19,
20,
21,
22
],
"classes": [
17,
19,
15
]
}
],
"src": [
{
"min": null,
"max": 64,
"paths": [
0,
1,
7,
8,
10,
14,
15,
17,
19
],
"classes": [
3,
7,
11,
15,
16,
19
]
},
{
"min": 64,
"max": 128,
"paths": [
3,
5,
7,
8,
10,
14,
15,
17,
19
],
"classes": [
3,
4,
7,
11,
15,
16,
19
]
},
{
"min": 128,
"max": 280,
"paths": [
3,
6,
7,
8,
10,
14,
15,
17,
19
],
"classes": [
3,
4,
7,
11,
15,
16,
19
]
},
{
"min": 280,
"max": 816,
"paths": [
3,
6,
7,
8,
11,
14,
15,
17,
19
],
"classes": [
3,
4,
7,
11,
15,
16,
19
]
},
{
"min": 816,
"max": 1576,
"paths": [
4,
6,
7,
8,
11,
14,
15,
17,
19
],
"classes": [
3,
7,
11,
15,
16,
19
]
},
{
"min": 1576,
"max": 2488,
"paths": [
4,
6,
7,
8,
11,
14,
15,
18,
19
],
"classes": [
2,
3,
7,
11,
15,
19
]
},
{
"min": 2488,
"max": 4776,
"paths": [
4,
6,
7,
8,
11,
14,
16,
18,
19
],
"classes": [
2,
3,
7,
10,
11,
15,
19
]
},
{
"min": 4776,
"max": 5224,
"paths": [
4,
6,
7,
8,
11,
14,
16,
18,
20
],
"classes": [
2,
3,
7,
10,
11,
17,
19
]
},
{
"min": 5224,
"max": 9048,
"paths": [
4,
6,
7,
8,
12,
13,
14,
16,
18,
20
],
"classes": [
2,
3,
10,
11,
17,
19
]
},
{
"min": 9048,
"max": 43008,
"paths": [
4,
6,
7,
8,
12,
13,
14,
16,
18,
21
],
"classes": [
2,
3,
10,
11,
19
]
},
{
"min": 43008,
"max": 50384,
"paths": [
4,
6,
7,
9,
12,
13,
14,
16,
18,
21
],
"classes": [
1,
2,
3,
10,
11,
19
]
},
{
"min": 50384,
"max": null,
"paths": [
4,
6,
7,
9,
12,
13,
14,
16,
18,
22
],
"classes": [
1,
2,
3,
10,
11,
19
]
}
],
"protocl": [
{
"min": null,
"max": 0,
"paths": [
0,
2,
3,
4,
7,
8,
9,
10,
11,
12,
13,
14,
15,
16,
17,
18,
19,
20,
21,
22
],
"classes": [
1,
2,
3,
4,
7,
8,
10,
11,
15,
16,
17,
19
]
},
{
"min": 0,
"max": null,
"paths": [
1,
2,
5,
6,
7,
8,
9,
10,
11,
12,
13,
14,
15,
16,
17,
18,
19,
20,
21,
22
],
"classes": [
1,
2,
3,
7,
8,
10,
11,
15,
16,
17,
19
]
}
]
},
"path_to_class": {
"0": 19,
"1": 19,
"2": 8,
"3": 4,
"4": 19,
"5": 19,
"6": 19,
"7": 11,
"8": 19,
"9": 1,
"10": 19,
"11": 7,
"12": 19,
"13": 19,
"14": 3,
"15": 7,
"16": 10,
"17": 16,
"18": 2,
"19": 15,
"20": 17,
"21": 19,
"22": 19
}
}

734
example/naive_rmt.json Normal file

@@ -0,0 +1,734 @@
[
{
"id": "dst_range",
"step": 0,
"match": "ternary",
"entries": 68,
"key_size": 16,
"ranges": [
{
"min": 0,
"max": 2136,
"paths": [
0,
1,
2,
3,
4,
5,
6
],
"classes": [
8,
19,
4
],
"prefixes": [
"00000***********",
"0000100000******",
"000010000100****",
"0000100001010***",
"0000100001011000"
]
},
{
"min": 2136,
"max": 2224,
"paths": [
7
],
"classes": [
11
],
"prefixes": [
"0000100001011***",
"00001000011*****",
"00001000100*****",
"000010001010****",
"0000100010110000"
]
},
{
"min": 2224,
"max": 5016,
"paths": [
8,
9
],
"classes": [
1,
19
],
"prefixes": [
"000010001011****",
"0000100011******",
"00001001********",
"0000101*********",
"000011**********",
"0001000*********",
"00010010********",
"000100110*******",
"000100111000****",
"0001001110010***",
"0001001110011000"
]
},
{
"min": 5016,
"max": 25848,
"paths": [
10,
11,
12
],
"classes": [
19,
7
],
"prefixes": [
"0001001110011***",
"00010011101*****",
"0001001111******",
"000101**********",
"00011***********",
"001*************",
"010*************",
"011000**********",
"011001000*******",
"0110010010******",
"01100100110*****",
"011001001110****",
"0110010011110***",
"0110010011111000"
]
},
{
"min": 25848,
"max": 47936,
"paths": [
10,
11,
13
],
"classes": [
19,
7
],
"prefixes": [
"0110010011111***",
"01100101********",
"0110011*********",
"01101***********",
"0111************",
"100*************",
"1010************",
"10110***********",
"1011100*********",
"10111010********",
"1011101100******",
"1011101101000000"
]
},
{
"min": 47936,
"max": 47944,
"paths": [
14
],
"classes": [
3
],
"prefixes": [
"1011101101000***",
"1011101101001000"
]
},
{
"min": 47944,
"max": 49152,
"paths": [
16,
15
],
"classes": [
10,
7
],
"prefixes": [
"1011101101001***",
"101110110101****",
"10111011011*****",
"101110111*******",
"101111**********",
"1100000000000000"
]
},
{
"min": 49152,
"max": 49160,
"paths": [
17,
18
],
"classes": [
16,
2
],
"prefixes": [
"1100000000000***",
"1100000000001000"
]
},
{
"min": 49160,
"max": 65536,
"paths": [
19,
20,
21,
22
],
"classes": [
17,
19,
15
],
"prefixes": [
"1100000000001***",
"110000000001****",
"11000000001*****",
"1100000001******",
"110000001*******",
"11000001********",
"1100001*********",
"110001**********",
"11001***********",
"1101************",
"111*************"
]
}
]
},
{
"id": "dst_meta",
"step": 0,
"match": "exact",
"method": "index",
"key_size": 4,
"data_size": 20
},
{
"id": "src_range",
"step": 1,
"match": "ternary",
"entries": 87,
"key_size": 16,
"ranges": [
{
"min": 0,
"max": 64,
"paths": [
0,
1,
7,
8,
10,
14,
15,
17,
19
],
"classes": [
3,
7,
11,
15,
16,
19
],
"prefixes": [
"0000000000******",
"0000000001000000"
]
},
{
"min": 64,
"max": 128,
"paths": [
3,
5,
7,
8,
10,
14,
15,
17,
19
],
"classes": [
3,
4,
7,
11,
15,
16,
19
],
"prefixes": [
"0000000001******",
"0000000010000000"
]
},
{
"min": 128,
"max": 280,
"paths": [
3,
6,
7,
8,
10,
14,
15,
17,
19
],
"classes": [
3,
4,
7,
11,
15,
16,
19
],
"prefixes": [
"000000001*******",
"000000010000****",
"0000000100010***",
"0000000100011000"
]
},
{
"min": 280,
"max": 816,
"paths": [
3,
6,
7,
8,
11,
14,
15,
17,
19
],
"classes": [
3,
4,
7,
11,
15,
16,
19
],
"prefixes": [
"0000000100011***",
"00000001001*****",
"0000000101******",
"000000011*******",
"00000010********",
"00000011000*****",
"000000110010****",
"0000001100110000"
]
},
{
"min": 816,
"max": 1576,
"paths": [
4,
6,
7,
8,
11,
14,
15,
17,
19
],
"classes": [
3,
7,
11,
15,
16,
19
],
"prefixes": [
"000000110011****",
"0000001101******",
"000000111*******",
"0000010*********",
"00000110000*****",
"0000011000100***",
"0000011000101000"
]
},
{
"min": 1576,
"max": 2488,
"paths": [
4,
6,
7,
8,
11,
14,
15,
18,
19
],
"classes": [
2,
3,
7,
11,
15,
19
],
"prefixes": [
"0000011000101***",
"000001100011****",
"0000011001******",
"000001101*******",
"00000111********",
"00001000********",
"000010010*******",
"00001001100*****",
"000010011010****",
"0000100110110***",
"0000100110111000"
]
},
{
"min": 2488,
"max": 4776,
"paths": [
4,
6,
7,
8,
11,
14,
16,
18,
19
],
"classes": [
2,
3,
7,
10,
11,
15,
19
],
"prefixes": [
"0000100110111***",
"0000100111******",
"0000101*********",
"000011**********",
"0001000*********",
"000100100*******",
"00010010100*****",
"0001001010100***",
"0001001010101000"
]
},
{
"min": 4776,
"max": 5224,
"paths": [
4,
6,
7,
8,
11,
14,
16,
18,
20
],
"classes": [
2,
3,
7,
10,
11,
17,
19
],
"prefixes": [
"0001001010101***",
"000100101011****",
"0001001011******",
"00010011********",
"0001010000******",
"00010100010*****",
"0001010001100***",
"0001010001101000"
]
},
{
"min": 5224,
"max": 9048,
"paths": [
4,
6,
7,
8,
12,
13,
14,
16,
18,
20
],
"classes": [
2,
3,
10,
11,
17,
19
],
"prefixes": [
"0001010001101***",
"000101000111****",
"000101001*******",
"00010101********",
"0001011*********",
"00011***********",
"0010000*********",
"00100010********",
"0010001100******",
"001000110100****",
"0010001101010***",
"0010001101011000"
]
},
{
"min": 9048,
"max": 43008,
"paths": [
4,
6,
7,
8,
12,
13,
14,
16,
18,
21
],
"classes": [
2,
3,
10,
11,
19
],
"prefixes": [
"0010001101011***",
"00100011011*****",
"001000111*******",
"001001**********",
"00101***********",
"0011************",
"01**************",
"100*************",
"10100***********",
"1010100000000000"
]
},
{
"min": 43008,
"max": 50384,
"paths": [
4,
6,
7,
9,
12,
13,
14,
16,
18,
21
],
"classes": [
1,
2,
3,
10,
11,
19
],
"prefixes": [
"10101***********",
"1011************",
"110000**********",
"110001000*******",
"1100010010******",
"110001001100****",
"1100010011010000"
]
},
{
"min": 50384,
"max": 65536,
"paths": [
4,
6,
7,
9,
12,
13,
14,
16,
18,
22
],
"classes": [
1,
2,
3,
10,
11,
19
],
"prefixes": [
"110001001101****",
"11000100111*****",
"11000101********",
"1100011*********",
"11001***********",
"1101************",
"111*************"
]
}
]
},
{
"id": "src_meta",
"step": 1,
"match": "exact",
"method": "index",
"key_size": 4,
"data_size": 20
},
{
"id": "protocl_range",
"step": 2,
"match": "ternary",
"entries": 2,
"key_size": 8,
"ranges": [
{
"min": 0,
"max": 0,
"paths": [
0,
2,
3,
4,
7,
8,
9,
10,
11,
12,
13,
14,
15,
16,
17,
18,
19,
20,
21,
22
],
"classes": [
1,
2,
3,
4,
7,
8,
10,
11,
15,
16,
17,
19
],
"prefixes": [
"00000000"
]
},
{
"min": 0,
"max": 256,
"paths": [
1,
2,
5,
6,
7,
8,
9,
10,
11,
12,
13,
14,
15,
16,
17,
18,
19,
20,
21,
22
],
"classes": [
1,
2,
3,
7,
8,
10,
11,
15,
16,
17,
19
],
"prefixes": [
"********"
]
}
]
},
{
"id": "protocl_meta",
"step": 2,
"match": "exact",
"method": "index",
"key_size": 1,
"data_size": 20
}
]

700
example/priority_aware.json Normal file

@@ -0,0 +1,700 @@
[
{
"id": "dst_range",
"step": 0,
"match": "ternary",
"entries": 42,
"key_size": 16,
"ranges": [
{
"min": 0,
"max": 2136,
"paths": [
0,
1,
2,
3,
4,
5,
6
],
"classes": [
8,
19,
4
],
"prefixes": [
"00000***********",
"0000100000******",
"000010000100****",
"0000100001010***",
"0000100001011000"
],
"prefix_type": "exact"
},
{
"min": 2136,
"max": 2224,
"paths": [
7
],
"classes": [
11
],
"prefixes": [
"0000100001011***",
"00001000011*****",
"00001000100*****",
"000010001010****",
"0000100010110000"
],
"prefix_type": "exact"
},
{
"min": 2224,
"max": 5016,
"paths": [
8,
9
],
"classes": [
1,
19
],
"prefixes": [
"0000************",
"0001000*********",
"00010010********",
"000100110*******",
"000100111000****",
"0001001110010***",
"0001001110011000"
],
"prefix_type": "zero"
},
{
"min": 5016,
"max": 25848,
"paths": [
10,
11,
12
],
"classes": [
19,
7
],
"prefixes": [
"00**************",
"010*************",
"011000**********",
"011001000*******",
"0110010010******",
"01100100110*****",
"011001001110****",
"0110010011110***",
"0110010011111000"
],
"prefix_type": "zero"
},
{
"min": 25848,
"max": 47936,
"paths": [
10,
11,
13
],
"classes": [
19,
7
],
"prefixes": [
"0***************",
"100*************",
"1010************",
"10110***********",
"1011100*********",
"10111010********",
"1011101100******",
"1011101101000000"
],
"prefix_type": "zero"
},
{
"min": 47936,
"max": 47944,
"paths": [
14
],
"classes": [
3
],
"prefixes": [
"1011101101000***",
"1011101101001000"
],
"prefix_type": "exact"
},
{
"min": 47944,
"max": 49152,
"paths": [
16,
15
],
"classes": [
10,
7
],
"prefixes": [
"0***************",
"10**************",
"1100000000000000"
],
"prefix_type": "zero"
},
{
"min": 49152,
"max": 49160,
"paths": [
17,
18
],
"classes": [
16,
2
],
"prefixes": [
"1100000000000***",
"1100000000001000"
],
"prefix_type": "exact"
},
{
"min": 49160,
"max": 65536,
"paths": [
19,
20,
21,
22
],
"classes": [
17,
19,
15
],
"prefixes": [
"****************"
],
"prefix_type": "zero"
}
]
},
{
"id": "dst_meta",
"step": 0,
"match": "exact",
"method": "index",
"key_size": 4,
"data_size": 20
},
{
"id": "src_range",
"step": 1,
"match": "ternary",
"entries": 56,
"key_size": 16,
"ranges": [
{
"min": 0,
"max": 64,
"paths": [
0,
1,
7,
8,
10,
14,
15,
17,
19
],
"classes": [
3,
7,
11,
15,
16,
19
],
"prefixes": [
"0000000000******",
"0000000001000000"
],
"prefix_type": "exact"
},
{
"min": 64,
"max": 128,
"paths": [
3,
5,
7,
8,
10,
14,
15,
17,
19
],
"classes": [
3,
4,
7,
11,
15,
16,
19
],
"prefixes": [
"0000000001******",
"0000000010000000"
],
"prefix_type": "exact"
},
{
"min": 128,
"max": 280,
"paths": [
3,
6,
7,
8,
10,
14,
15,
17,
19
],
"classes": [
3,
4,
7,
11,
15,
16,
19
],
"prefixes": [
"000000001*******",
"000000010000****",
"0000000100010***",
"0000000100011000"
],
"prefix_type": "exact"
},
{
"min": 280,
"max": 816,
"paths": [
3,
6,
7,
8,
11,
14,
15,
17,
19
],
"classes": [
3,
4,
7,
11,
15,
16,
19
],
"prefixes": [
"0000000*********",
"00000010********",
"00000011000*****",
"000000110010****",
"0000001100110000"
],
"prefix_type": "zero"
},
{
"min": 816,
"max": 1576,
"paths": [
4,
6,
7,
8,
11,
14,
15,
17,
19
],
"classes": [
3,
7,
11,
15,
16,
19
],
"prefixes": [
"000000**********",
"0000010*********",
"00000110000*****",
"0000011000100***",
"0000011000101000"
],
"prefix_type": "zero"
},
{
"min": 1576,
"max": 2488,
"paths": [
4,
6,
7,
8,
11,
14,
15,
18,
19
],
"classes": [
2,
3,
7,
11,
15,
19
],
"prefixes": [
"00000***********",
"00001000********",
"000010010*******",
"00001001100*****",
"000010011010****",
"0000100110110***",
"0000100110111000"
],
"prefix_type": "zero"
},
{
"min": 2488,
"max": 4776,
"paths": [
4,
6,
7,
8,
11,
14,
16,
18,
19
],
"classes": [
2,
3,
7,
10,
11,
15,
19
],
"prefixes": [
"0000************",
"0001000*********",
"000100100*******",
"00010010100*****",
"0001001010100***",
"0001001010101000"
],
"prefix_type": "zero"
},
{
"min": 4776,
"max": 5224,
"paths": [
4,
6,
7,
8,
11,
14,
16,
18,
20
],
"classes": [
2,
3,
7,
10,
11,
17,
19
],
"prefixes": [
"0000************",
"000100**********",
"0001010000******",
"00010100010*****",
"0001010001100***",
"0001010001101000"
],
"prefix_type": "zero"
},
{
"min": 5224,
"max": 9048,
"paths": [
4,
6,
7,
8,
12,
13,
14,
16,
18,
20
],
"classes": [
2,
3,
10,
11,
17,
19
],
"prefixes": [
"000*************",
"0010000*********",
"00100010********",
"0010001100******",
"001000110100****",
"0010001101010***",
"0010001101011000"
],
"prefix_type": "zero"
},
{
"min": 9048,
"max": 43008,
"paths": [
4,
6,
7,
8,
12,
13,
14,
16,
18,
21
],
"classes": [
2,
3,
10,
11,
19
],
"prefixes": [
"0***************",
"100*************",
"10100***********",
"1010100000000000"
],
"prefix_type": "zero"
},
{
"min": 43008,
"max": 50384,
"paths": [
4,
6,
7,
9,
12,
13,
14,
16,
18,
21
],
"classes": [
1,
2,
3,
10,
11,
19
],
"prefixes": [
"10101***********",
"1011************",
"110000**********",
"110001000*******",
"1100010010******",
"110001001100****",
"1100010011010000"
],
"prefix_type": "exact"
},
{
"min": 50384,
"max": 65536,
"paths": [
4,
6,
7,
9,
12,
13,
14,
16,
18,
22
],
"classes": [
1,
2,
3,
10,
11,
19
],
"prefixes": [
"****************"
],
"prefix_type": "zero"
}
]
},
{
"id": "src_meta",
"step": 1,
"match": "exact",
"method": "index",
"key_size": 4,
"data_size": 20
},
{
"id": "protocl_range",
"step": 2,
"match": "ternary",
"entries": 2,
"key_size": 8,
"ranges": [
{
"min": 0,
"max": 0,
"paths": [
0,
2,
3,
4,
7,
8,
9,
10,
11,
12,
13,
14,
15,
16,
17,
18,
19,
20,
21,
22
],
"classes": [
1,
2,
3,
4,
7,
8,
10,
11,
15,
16,
17,
19
],
"prefixes": [
"00000000"
],
"prefix_type": "exact"
},
{
"min": 0,
"max": 256,
"paths": [
1,
2,
5,
6,
7,
8,
9,
10,
11,
12,
13,
14,
15,
16,
17,
18,
19,
20,
21,
22
],
"classes": [
1,
2,
3,
7,
8,
10,
11,
15,
16,
17,
19
],
"prefixes": [
"********"
],
"prefix_type": "exact"
}
]
},
{
"id": "protocl_meta",
"step": 2,
"match": "exact",
"method": "index",
"key_size": 1,
"data_size": 20
}
]

744
example/tree.json Normal file

@@ -0,0 +1,744 @@
{
"features": {
"dst": [
47936.0,
2128.0,
5024.0,
2224.0,
25856.0,
47936.0,
49168.0,
49152.0
],
"src": [
64.0,
64.0,
816.0,
128.0,
43008.0,
5232.0,
288.0,
2480.0,
1584.0,
9040.0,
4784.0,
50384.0
],
"protocl": [
0.0,
0.0
]
},
"paths": [
{
"conditions": [
{
"feature": "dst",
"operation": "<=",
"value": 47936.0
},
{
"feature": "dst",
"operation": "<=",
"value": 2128.0
},
{
"feature": "src",
"operation": "<=",
"value": 64.0
},
{
"feature": "src",
"operation": "<=",
"value": 64.0
},
{
"feature": "protocl",
"operation": "<=",
"value": 0.0
}
],
"classification": 19,
"id": 0
},
{
"conditions": [
{
"feature": "dst",
"operation": "<=",
"value": 47936.0
},
{
"feature": "dst",
"operation": "<=",
"value": 2128.0
},
{
"feature": "src",
"operation": "<=",
"value": 64.0
},
{
"feature": "src",
"operation": "<=",
"value": 64.0
},
{
"feature": "protocl",
"operation": ">",
"value": 0.0
}
],
"classification": 19,
"id": 1
},
{
"conditions": [
{
"feature": "dst",
"operation": "<=",
"value": 47936.0
},
{
"feature": "dst",
"operation": "<=",
"value": 2128.0
},
{
"feature": "src",
"operation": "<=",
"value": 64.0
},
{
"feature": "src",
"operation": ">",
"value": 64.0
}
],
"classification": 8,
"id": 2
},
{
"conditions": [
{
"feature": "dst",
"operation": "<=",
"value": 47936.0
},
{
"feature": "dst",
"operation": "<=",
"value": 2128.0
},
{
"feature": "src",
"operation": ">",
"value": 64.0
},
{
"feature": "protocl",
"operation": "<=",
"value": 0.0
},
{
"feature": "src",
"operation": "<=",
"value": 816.0
}
],
"classification": 4,
"id": 3
},
{
"conditions": [
{
"feature": "dst",
"operation": "<=",
"value": 47936.0
},
{
"feature": "dst",
"operation": "<=",
"value": 2128.0
},
{
"feature": "src",
"operation": ">",
"value": 64.0
},
{
"feature": "protocl",
"operation": "<=",
"value": 0.0
},
{
"feature": "src",
"operation": ">",
"value": 816.0
}
],
"classification": 19,
"id": 4
},
{
"conditions": [
{
"feature": "dst",
"operation": "<=",
"value": 47936.0
},
{
"feature": "dst",
"operation": "<=",
"value": 2128.0
},
{
"feature": "src",
"operation": ">",
"value": 64.0
},
{
"feature": "protocl",
"operation": ">",
"value": 0.0
},
{
"feature": "src",
"operation": "<=",
"value": 128.0
}
],
"classification": 19,
"id": 5
},
{
"conditions": [
{
"feature": "dst",
"operation": "<=",
"value": 47936.0
},
{
"feature": "dst",
"operation": "<=",
"value": 2128.0
},
{
"feature": "src",
"operation": ">",
"value": 64.0
},
{
"feature": "protocl",
"operation": ">",
"value": 0.0
},
{
"feature": "src",
"operation": ">",
"value": 128.0
}
],
"classification": 19,
"id": 6
},
{
"conditions": [
{
"feature": "dst",
"operation": "<=",
"value": 47936.0
},
{
"feature": "dst",
"operation": ">",
"value": 2128.0
},
{
"feature": "dst",
"operation": "<=",
"value": 5024.0
},
{
"feature": "dst",
"operation": "<=",
"value": 2224.0
}
],
"classification": 11,
"id": 7
},
{
"conditions": [
{
"feature": "dst",
"operation": "<=",
"value": 47936.0
},
{
"feature": "dst",
"operation": ">",
"value": 2128.0
},
{
"feature": "dst",
"operation": "<=",
"value": 5024.0
},
{
"feature": "dst",
"operation": ">",
"value": 2224.0
},
{
"feature": "src",
"operation": "<=",
"value": 43008.0
}
],
"classification": 19,
"id": 8
},
{
"conditions": [
{
"feature": "dst",
"operation": "<=",
"value": 47936.0
},
{
"feature": "dst",
"operation": ">",
"value": 2128.0
},
{
"feature": "dst",
"operation": "<=",
"value": 5024.0
},
{
"feature": "dst",
"operation": ">",
"value": 2224.0
},
{
"feature": "src",
"operation": ">",
"value": 43008.0
}
],
"classification": 1,
"id": 9
},
{
"conditions": [
{
"feature": "dst",
"operation": "<=",
"value": 47936.0
},
{
"feature": "dst",
"operation": ">",
"value": 2128.0
},
{
"feature": "dst",
"operation": ">",
"value": 5024.0
},
{
"feature": "src",
"operation": "<=",
"value": 5232.0
},
{
"feature": "src",
"operation": "<=",
"value": 288.0
}
],
"classification": 19,
"id": 10
},
{
"conditions": [
{
"feature": "dst",
"operation": "<=",
"value": 47936.0
},
{
"feature": "dst",
"operation": ">",
"value": 2128.0
},
{
"feature": "dst",
"operation": ">",
"value": 5024.0
},
{
"feature": "src",
"operation": "<=",
"value": 5232.0
},
{
"feature": "src",
"operation": ">",
"value": 288.0
}
],
"classification": 7,
"id": 11
},
{
"conditions": [
{
"feature": "dst",
"operation": "<=",
"value": 47936.0
},
{
"feature": "dst",
"operation": ">",
"value": 2128.0
},
{
"feature": "dst",
"operation": ">",
"value": 5024.0
},
{
"feature": "src",
"operation": ">",
"value": 5232.0
},
{
"feature": "dst",
"operation": "<=",
"value": 25856.0
}
],
"classification": 19,
"id": 12
},
{
"conditions": [
{
"feature": "dst",
"operation": "<=",
"value": 47936.0
},
{
"feature": "dst",
"operation": ">",
"value": 2128.0
},
{
"feature": "dst",
"operation": ">",
"value": 5024.0
},
{
"feature": "src",
"operation": ">",
"value": 5232.0
},
{
"feature": "dst",
"operation": ">",
"value": 25856.0
}
],
"classification": 19,
"id": 13
},
{
"conditions": [
{
"feature": "dst",
"operation": ">",
"value": 47936.0
},
{
"feature": "dst",
"operation": "<=",
"value": 47936.0
}
],
"classification": 3,
"id": 14
},
{
"conditions": [
{
"feature": "dst",
"operation": ">",
"value": 47936.0
},
{
"feature": "dst",
"operation": ">",
"value": 47936.0
},
{
"feature": "dst",
"operation": "<=",
"value": 49168.0
},
{
"feature": "dst",
"operation": "<=",
"value": 49152.0
},
{
"feature": "src",
"operation": "<=",
"value": 2480.0
}
],
"classification": 7,
"id": 15
},
{
"conditions": [
{
"feature": "dst",
"operation": ">",
"value": 47936.0
},
{
"feature": "dst",
"operation": ">",
"value": 47936.0
},
{
"feature": "dst",
"operation": "<=",
"value": 49168.0
},
{
"feature": "dst",
"operation": "<=",
"value": 49152.0
},
{
"feature": "src",
"operation": ">",
"value": 2480.0
}
],
"classification": 10,
"id": 16
},
{
"conditions": [
{
"feature": "dst",
"operation": ">",
"value": 47936.0
},
{
"feature": "dst",
"operation": ">",
"value": 47936.0
},
{
"feature": "dst",
"operation": "<=",
"value": 49168.0
},
{
"feature": "dst",
"operation": ">",
"value": 49152.0
},
{
"feature": "src",
"operation": "<=",
"value": 1584.0
}
],
"classification": 16,
"id": 17
},
{
"conditions": [
{
"feature": "dst",
"operation": ">",
"value": 47936.0
},
{
"feature": "dst",
"operation": ">",
"value": 47936.0
},
{
"feature": "dst",
"operation": "<=",
"value": 49168.0
},
{
"feature": "dst",
"operation": ">",
"value": 49152.0
},
{
"feature": "src",
"operation": ">",
"value": 1584.0
}
],
"classification": 2,
"id": 18
},
{
"conditions": [
{
"feature": "dst",
"operation": ">",
"value": 47936.0
},
{
"feature": "dst",
"operation": ">",
"value": 47936.0
},
{
"feature": "dst",
"operation": ">",
"value": 49168.0
},
{
"feature": "src",
"operation": "<=",
"value": 9040.0
},
{
"feature": "src",
"operation": "<=",
"value": 4784.0
}
],
"classification": 15,
"id": 19
},
{
"conditions": [
{
"feature": "dst",
"operation": ">",
"value": 47936.0
},
{
"feature": "dst",
"operation": ">",
"value": 47936.0
},
{
"feature": "dst",
"operation": ">",
"value": 49168.0
},
{
"feature": "src",
"operation": "<=",
"value": 9040.0
},
{
"feature": "src",
"operation": ">",
"value": 4784.0
}
],
"classification": 17,
"id": 20
},
{
"conditions": [
{
"feature": "dst",
"operation": ">",
"value": 47936.0
},
{
"feature": "dst",
"operation": ">",
"value": 47936.0
},
{
"feature": "dst",
"operation": ">",
"value": 49168.0
},
{
"feature": "src",
"operation": ">",
"value": 9040.0
},
{
"feature": "src",
"operation": "<=",
"value": 50384.0
}
],
"classification": 19,
"id": 21
},
{
"conditions": [
{
"feature": "dst",
"operation": ">",
"value": 47936.0
},
{
"feature": "dst",
"operation": ">",
"value": 47936.0
},
{
"feature": "dst",
"operation": ">",
"value": 49168.0
},
{
"feature": "src",
"operation": ">",
"value": 9040.0
},
{
"feature": "src",
"operation": ">",
"value": 50384.0
}
],
"classification": 19,
"id": 22
}
],
"classes": [
"Amazon Echo",
"Belkin Motion Sensor",
"Belkin Switch",
"Dropcam",
"HP Printer",
"LiFX Bulb",
"NEST Smoke Sensor",
"Netatmo Camera",
"Netatmo Weather station",
"Pixstart photo frame",
"Samsung Smart Cam",
"Smart Things",
"TP-Link Camera",
"TP-Link Plug",
"Triby Speaker",
"Withings",
"Withings Scale",
"Withings sleep sensor",
"iHome PowerPlug",
"other"
]
}
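example/tree.json is the uncompressed decision tree: `features` lists the raw split thresholds per feature, and each entry in `paths` is one root-to-leaf path with its ordered `conditions` and a `classification` index into `classes`. A small sketch (not repo code) of classifying a sample straight from this representation; the sample values are made up.

```python
import json

OPS = {"<=": lambda v, t: v <= t, ">": lambda v, t: v > t}

def classify_by_path(sample, tree):
    """Find the unique root-to-leaf path whose conditions the sample satisfies."""
    for path in tree["paths"]:
        if all(OPS[c["operation"]](sample[c["feature"]], c["value"]) for c in path["conditions"]):
            return path["id"], tree["classes"][path["classification"]]
    raise ValueError("no path matched")

with open("example/tree.json") as f:
    tree = json.load(f)

# dst=2200, src=50, protocl=0 satisfies path 7 (dst <= 47936, dst > 2128, dst <= 5024, dst <= 2224)
print(classify_by_path({"dst": 2200, "src": 50, "protocl": 0}, tree))   # (7, 'Smart Things')
```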

View File

@@ -0,0 +1,47 @@
[
{
"id": "dst_range",
"step": 0,
"match": "ternary",
"entries": 288,
"key_size": 16
},
{
"id": "dst_meta",
"step": 0,
"match": "exact",
"method": "index",
"key_size": 4,
"data_size": 20
},
{
"id": "src_range",
"step": 1,
"match": "ternary",
"entries": 384,
"key_size": 16
},
{
"id": "src_meta",
"step": 1,
"match": "exact",
"method": "index",
"key_size": 4,
"data_size": 20
},
{
"id": "protocl_range",
"step": 2,
"match": "ternary",
"entries": 32,
"key_size": 8
},
{
"id": "protocl_meta",
"step": 2,
"match": "exact",
"method": "index",
"key_size": 1,
"data_size": 20
}
]

80
extract_all_datasets.py Normal file

@@ -0,0 +1,80 @@
#!/usr/bin/env python3
from pathlib import Path
import numpy as np
import pandas as pd
from labels import mac_to_label
from tqdm import tqdm
import os
ROOT = Path(__file__).resolve().parent
PCAP_DIR = ROOT / "data" / "pcap"
CSV_DIR = ROOT / "data" / "processed"
CSV_DIR.mkdir(parents=True, exist_ok=True)
BATCH = 100_000 # packets per chunk
from scapy.all import rdpcap
def process_pcap(pcap_path: str, csv_path: str) -> None:
all_packets = rdpcap(pcap_path)
print("rdpcap done", flush=True)
results = []
for packet in tqdm(all_packets):
size = len(packet)
try:
proto = packet.proto
except AttributeError:
proto = 0
try:
sport = packet.sport
dport = packet.dport
except AttributeError:
sport = 0
dport = 0
proto = int(proto)
sport = int(sport)
dport = int(dport)
if "Ether" in packet:
eth_dst = packet["Ether"].dst
if eth_dst in mac_to_label:
classification = mac_to_label[eth_dst]
else:
classification = "other"
else:
classification = "other"
metric = [proto,sport,dport,classification]
results.append(metric)
results = (np.array(results)).T
# store the features in the dataframe
dataframe = pd.DataFrame({'protocl':results[0],'src':results[1],'dst':results[2],'classfication':results[3]})
columns = ['protocl','src','dst','classfication']
# save the dataframe to the csv file; if it does not exist, create one.
if os.path.exists(csv_path):
dataframe.to_csv(csv_path,index=False,sep=',',mode='a',columns = columns, header=False)
else:
dataframe.to_csv(csv_path,index=False,sep=',',columns = columns)
print("Done")
def main() -> None:
for pcap in sorted(PCAP_DIR.rglob("*.pcap")):
rel_csv = pcap.relative_to(PCAP_DIR).with_suffix(".csv")
csv_path = CSV_DIR / rel_csv
if csv_path.exists():
print(f"Skip {rel_csv} (CSV exists)")
continue
print(f"Processing {rel_csv}")
csv_path.parent.mkdir(parents=True, exist_ok=True)
process_pcap(str(pcap), str(csv_path))
if __name__ == "__main__":
main()

50
extract_tars.sh Normal file

@@ -0,0 +1,50 @@
#!/usr/bin/env bash
# Usage: extract_tars.sh SOURCE_DIR TARGET_DIR
# For every .tar, .tar.gz, .tgz, .tar.bz2, .tar.xz in SOURCE_DIR:
# 1. Create TARGET_DIR/<name>/
# 2. If TARGET_DIR/<name>/<name>.pcap already exists, skip the archive.
# 3. Otherwise, extract the archive into its own folder.
set -euo pipefail
if [[ $# -ne 2 ]]; then
echo "Usage: $0 SOURCE_DIR TARGET_DIR" >&2
exit 1
fi
src_dir="$1"
dst_dir="$2"
mkdir -p "$dst_dir"
# Strip common extensions to recover the base name
strip_ext() {
local n="$1"
n=${n%.tar.gz}; n=${n%.tgz}; n=${n%.tar.bz2}; n=${n%.tar.xz}; n=${n%.tar}
echo "$n"
}
shopt -s nullglob
for archive in "$src_dir"/*.tar{,.gz,.bz2,.xz} "$src_dir"/*.tgz; do
base=$(basename "$archive")
name=$(strip_ext "$base")
out_dir="$dst_dir/$name"
key_file="$out_dir/$name.pcap"
if [[ -f "$key_file" ]]; then
echo "Skipping $archive$key_file already present"
continue
fi
echo "Extracting $archive into $out_dir"
mkdir -p "$out_dir"
case "$archive" in
*.tar) tar -xf "$archive" -C "$out_dir" ;;
*.tar.gz|*.tgz) tar -xzf "$archive" -C "$out_dir" ;;
*.tar.bz2) tar -xjf "$archive" -C "$out_dir" ;;
*.tar.xz) tar -xJf "$archive" -C "$out_dir" ;;
*) echo "Unknown type: $archive" ;;
esac
done
echo "All archives processed."

1
ideal-rmt-simulator Submodule

Submodule ideal-rmt-simulator added at 852153f017

View File

@@ -3,4 +3,5 @@ numpy
pandas
scikit-learn
pydotplus
matplotlib
matplotlib
scipy

44
sanity_check/csvdiff.py Normal file

@@ -0,0 +1,44 @@
#!/usr/bin/env python3
"""
csvdiff.py file1.csv file2.csv
Streams both files; prints the first differing line or "No differences found".
Uses O(1) memory.
"""
import sys
from itertools import zip_longest
from pathlib import Path
def open_checked(p: str):
print(p)
path = Path(p)
try:
return path.open("r", newline=""), path
except FileNotFoundError:
sys.exit(f"Error: {path} not found")
def human(n: int) -> str:
return f"{n:,}"
def main(a_path: str, b_path: str) -> None:
fa, a = open_checked(a_path)
fb, b = open_checked(b_path)
with fa, fb:
for idx, (ra, rb) in enumerate(zip_longest(fa, fb), 1):
if ra != rb:
print(f"Files differ at line {human(idx)}")
if ra is None:
print(f"{a} ended early")
elif rb is None:
print(f"{b} ended early")
else:
print(f"{a}: {ra.rstrip()}")
print(f"{b}: {rb.rstrip()}")
return
print("No differences found")
if __name__ == "__main__":
if len(sys.argv) != 3:
sys.exit("Usage: csvdiff.py file1.csv file2.csv")
main(sys.argv[1], sys.argv[2])

File diff suppressed because one or more lines are too long

View File

@@ -0,0 +1,206 @@
#!/usr/bin/env python3
"""diversity_metrics.py (fast version)
Estimate how much diversity each CSV adds without building a giant in-memory
DataFrame. Designed for IoT packet logs with millions of rows.
Quick summary printed as a GitHub-style table (requires *tabulate*; falls back
to pandas plain text).
Usage
-----
python diversity_metrics.py path/to/processed_dir [-r] [--sample 50000]
Metrics
-------
ΔEntropy : change in Shannon entropy of *classification* counts
ΔGini : change in Gini impurity of the same counts
χ² p : Pearson χ² p-value, old vs new classification counts
Jaccard : similarity of unique (src,dst) pairs (0 → new pairs, 1 → no new)
KS src p : Kolmogorov-Smirnov p-value, source-port dist (uses sampling)
KS dst p : Kolmogorov-Smirnov p-value, dest-port dist (uses sampling)
Speed tricks
------------
* No growing DataFrame; we keep Counters / sets / lists.
* Ports for KS are *sampled* (default 50 k) to bound cost.
* (src,dst) pairs are hashed to a 32-bit int to reduce set overhead.
* pandas reads via **pyarrow** engine when available.
"""
import argparse
from pathlib import Path
from collections import Counter
from typing import List, Set
import numpy as np
import pandas as pd
from scipy.stats import chi2_contingency, ks_2samp, entropy
try:
from tabulate import tabulate
_USE_TABULATE = True
except ImportError:
_USE_TABULATE = False
# -----------------------------------------------------------------------------
# Helper metrics
# -----------------------------------------------------------------------------
def shannon(counts: Counter) -> float:
total = sum(counts.values())
if total == 0:
return 0.0
p = np.fromiter(counts.values(), dtype=float)
p /= total
return entropy(p, base=2)
def gini(counts: Counter) -> float:
total = sum(counts.values())
if total == 0:
return 0.0
return 1.0 - sum((n / total) ** 2 for n in counts.values())
def jaccard(a: Set[int], b: Set[int]) -> float:
if not a and not b:
return 1.0
return len(a & b) / len(a | b)
# -----------------------------------------------------------------------------
# Core analysis
# -----------------------------------------------------------------------------
def analyse(csv_files: List[Path], sample_size: int):
"""Return list of dicts with diversity metrics for each added file."""
# cumulative state (no big DataFrame!)
class_counter: Counter = Counter()
pair_hashes: Set[int] = set()
src_list: List[int] = []
dst_list: List[int] = []
rows = []
for csv_path in csv_files:
df = pd.read_csv(
csv_path,
engine="pyarrow" if pd.__version__ >= "2" else "c", # fast parse
usecols=["protocl", "src", "dst", "classfication"],
dtype={
"protocl": "uint16",
"protocol": "uint16",
"src": "uint16",
"dst": "uint16",
},
)
# normalise column names
df.rename(columns={"protocl": "protocol", "classfication": "classification"}, inplace=True)
# snapshot previous state
prev_class = class_counter.copy()
prev_pairs = pair_hashes.copy()
prev_src = np.asarray(src_list, dtype=np.uint16)
prev_dst = np.asarray(dst_list, dtype=np.uint16)
# --- update cumulative structures ------------------------------------
class_counter.update(df["classification"].value_counts().to_dict())
# hash (src,dst) into a 32-bit int to save memory
pair_ids = (df["src"].to_numpy(dtype=np.uint32) << np.uint32(16)) | \
df["dst"].to_numpy(dtype=np.uint32)
pair_hashes.update(pair_ids.tolist())  # record the new pairs; otherwise the Jaccard below always compares identical sets
# extend port lists (keep small ints)
src_list.extend(df["src"].tolist())
dst_list.extend(df["dst"].tolist())
# --- metrics ----------------------------------------------------------
# χ² classification
chi_p = np.nan
if prev_class:
all_classes = list(set(prev_class) | set(df["classification"].unique()))
old = [prev_class.get(c, 0) for c in all_classes]
new = [df["classification"].value_counts().get(c, 0) for c in all_classes]
_, chi_p, _, _ = chi2_contingency([old, new])
# entropy & gini deltas
delta_entropy = shannon(class_counter) - shannon(prev_class)
delta_gini = gini(class_counter) - gini(prev_class)
# Jaccard on pair hashes
jc = jaccard(prev_pairs, pair_hashes)
# KS tests on sampled ports
ks_src_p = ks_dst_p = np.nan
if prev_src.size:
new_src = df["src"].to_numpy(dtype=np.uint16)
new_dst = df["dst"].to_numpy(dtype=np.uint16)
if prev_src.size > sample_size:
prev_src_sample = np.random.choice(prev_src, sample_size, replace=False)
else:
prev_src_sample = prev_src
if new_src.size > sample_size:
new_src_sample = np.random.choice(new_src, sample_size, replace=False)
else:
new_src_sample = new_src
if prev_dst.size > sample_size:
prev_dst_sample = np.random.choice(prev_dst, sample_size, replace=False)
else:
prev_dst_sample = prev_dst
if new_dst.size > sample_size:
new_dst_sample = np.random.choice(new_dst, sample_size, replace=False)
else:
new_dst_sample = new_dst
ks_src_p = ks_2samp(prev_src_sample, new_src_sample).pvalue
ks_dst_p = ks_2samp(prev_dst_sample, new_dst_sample).pvalue
rows.append(
{
"File": csv_path.name,
"Rows": len(df),
"ΔEntropy": round(delta_entropy, 4),
"ΔGini": round(delta_gini, 4),
"χ² p": f"{chi_p:.3g}" if not np.isnan(chi_p) else "NA",
"Jaccard": round(jc, 3),
"KS src p": f"{ks_src_p:.3g}" if not np.isnan(ks_src_p) else "NA",
"KS dst p": f"{ks_dst_p:.3g}" if not np.isnan(ks_dst_p) else "NA",
}
)
return rows
# -----------------------------------------------------------------------------
# CLI
# -----------------------------------------------------------------------------
def main():
ap = argparse.ArgumentParser(description="Evaluate diversity contribution of each CSV (fast version).")
ap.add_argument("csv_dir", help="Directory containing CSV files")
ap.add_argument("-r", "--recursive", action="store_true", help="Recursively search csv_dir")
ap.add_argument("--sample", type=int, default=50_000, help="Sample size for KS tests (default 50k)")
args = ap.parse_args()
root = Path(args.csv_dir)
pattern = "**/*.csv" if args.recursive else "*.csv"
csv_files = sorted(root.glob(pattern))
if not csv_files:
print("No CSV files found.")
return
table_rows = analyse(csv_files, args.sample)
if _USE_TABULATE:
print(tabulate(table_rows, headers="keys", tablefmt="github", floatfmt=".4f"))
else:
print(pd.DataFrame(table_rows).to_string(index=False))
print(
"\nLegend:\n • p-values (χ², KS) < 0.05 → new file significantly shifts distribution (GOOD)"
"\n • Positive ΔEntropy or ΔGini → richer mix; near 0 → little new info"
"\n • Jaccard close to 0 → many unseen (src,dst) pairs; close to 1 → redundant."
)
if __name__ == "__main__":
main()

14
setup.sh Normal file

@@ -0,0 +1,14 @@
#!/usr/bin/env bash
# Creates the directory layout:
# data/
# tar/
# pcap/
# processed/
set -euo pipefail
root="$(cd -- "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
mkdir -p "$root"/data/{tar,pcap,processed,combined}
echo "Directory structure ready under $root/data/"