mirror of
https://github.com/ltcptgeneral/IdealRMT-DecisionTrees.git
synced 2025-10-24 13:09:20 +00:00
Compare commits
24 Commits
9729c6e68c
...
jai_runs
Author | SHA1 | Date | |
---|---|---|---|
|
c8a0b18abf | ||
2ad40946d1 | |||
50075b1acc | |||
|
1585399c7d | ||
8301998da3 | |||
3b2d6b3186 | |||
|
24fc2ed6f7 | ||
|
fda251f051 | ||
541538fcfe | |||
|
afc882a569 | ||
6de3807fe2 | |||
|
fc16d3c586 | ||
7bee40ecf9 | |||
|
e811171a73 | ||
61a451b82d | |||
c73de36c70 | |||
fadeab8a99 | |||
c208037ae9 | |||
ae3128f6e8 | |||
25e5a86a43 | |||
d3fe6efd47 | |||
23867747cd | |||
eeebc17d56 | |||
0d5e51f582 |
2
.gitattributes
vendored
Normal file
2
.gitattributes
vendored
Normal file
@@ -0,0 +1,2 @@
|
|||||||
|
# force LF for any shell script
|
||||||
|
*.sh text eol=lf
|
6
.gitignore
vendored
6
.gitignore
vendored
@@ -1,4 +1,6 @@
|
|||||||
data.*
|
data.*
|
||||||
__pycache__
|
__pycache__
|
||||||
tree.json
|
*.json
|
||||||
compressed_tree.json
|
data/*
|
||||||
|
.DS_Store
|
||||||
|
.ipynb_checkpoints/
|
||||||
|
3
.gitmodules
vendored
Normal file
3
.gitmodules
vendored
Normal file
@@ -0,0 +1,3 @@
|
|||||||
|
[submodule "ideal-rmt-simulator"]
|
||||||
|
path = ideal-rmt-simulator
|
||||||
|
url = https://github.com/rfchang/ideal-rmt-simulator
|
152
CompressedTreeParser.ipynb
Normal file
152
CompressedTreeParser.ipynb
Normal file
@@ -0,0 +1,152 @@
|
|||||||
|
{
|
||||||
|
"cells": [
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 138,
|
||||||
|
"id": "938dec51",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"import numpy as np\n",
|
||||||
|
"import pandas as pd\n",
|
||||||
|
"import argparse\n",
|
||||||
|
"from sklearn.tree import DecisionTreeClassifier, plot_tree, _tree\n",
|
||||||
|
"from sklearn.metrics import accuracy_score\n",
|
||||||
|
"from sklearn.tree import export_graphviz\n",
|
||||||
|
"import pydotplus\n",
|
||||||
|
"from matplotlib import pyplot as plt\n",
|
||||||
|
"from labels import mac_to_label\n",
|
||||||
|
"import json\n",
|
||||||
|
"import math"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 139,
|
||||||
|
"id": "442624c7",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"Set1 = pd.read_csv('data/combined/data.csv').values.tolist()\n",
|
||||||
|
"X = [i[0:3] for i in Set1]\n",
|
||||||
|
"Y =[i[3] for i in Set1]"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 142,
|
||||||
|
"id": "12ad454d",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"{'0': 20, '1': 20, '2': 9, '3': 20, '4': 0, '5': 13, '6': 20, '7': 0, '8': 12, '9': 4, '10': 20, '11': 4, '12': 1, '13': 16, '14': 20, '15': 2, '16': 20, '17': 0, '18': 20, '19': 20, '20': 20, '21': 20, '22': 20, '23': 1, '24': 2, '25': 20, '26': 13, '27': 11, '28': 20, '29': 20}\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"predict_Yt = []\n",
|
||||||
|
"index=0\n",
|
||||||
|
"\n",
|
||||||
|
"with open('compressed_tree.json', 'r') as file:\n",
|
||||||
|
" data = json.load(file)\n",
|
||||||
|
" classes = data[\"classes\"]\n",
|
||||||
|
" for x in X:\n",
|
||||||
|
" counter = 0\n",
|
||||||
|
" class_set = []\n",
|
||||||
|
" paths_set = []\n",
|
||||||
|
" features = [\"protocol\", \"src\", \"dst\"]\n",
|
||||||
|
" for feature in features:\n",
|
||||||
|
" if feature in data[\"layers\"]:\n",
|
||||||
|
" for node in data['layers'][feature]:\n",
|
||||||
|
" if node['min'] is None:\n",
|
||||||
|
" if x[counter] <= node['max']:\n",
|
||||||
|
" class_set.append(node['classes'])\n",
|
||||||
|
" paths_set.append(node[\"paths\"])\n",
|
||||||
|
" break #is this an issue?\n",
|
||||||
|
" else:\n",
|
||||||
|
" continue\n",
|
||||||
|
" elif node['max'] is None:\n",
|
||||||
|
" if node['min'] < x[counter]:\n",
|
||||||
|
" class_set.append(node['classes'])\n",
|
||||||
|
" paths_set.append(node[\"paths\"])\n",
|
||||||
|
" break #is this an issue?\n",
|
||||||
|
" else:\n",
|
||||||
|
" continue\n",
|
||||||
|
" elif node['min'] < x[counter] and x[counter] <= node['max']:\n",
|
||||||
|
" class_set.append(node['classes'])\n",
|
||||||
|
" paths_set.append(node[\"paths\"])\n",
|
||||||
|
" break #is this an issue?\n",
|
||||||
|
"\n",
|
||||||
|
" counter += 1\n",
|
||||||
|
" result = set(class_set[0])\n",
|
||||||
|
" paths = set(paths_set[0])\n",
|
||||||
|
" for s in class_set[1:]:\n",
|
||||||
|
" result.intersection_update(s)\n",
|
||||||
|
" for s in paths_set[1:]:\n",
|
||||||
|
" paths.intersection_update(s)\n",
|
||||||
|
"\n",
|
||||||
|
" #predict_Yt.append(list(result))\n",
|
||||||
|
" #print(result)\n",
|
||||||
|
" if len(paths) != 1:\n",
|
||||||
|
" print(paths)\n",
|
||||||
|
" print(x)\n",
|
||||||
|
" print(result)\n",
|
||||||
|
" assert len(paths) == 1\n",
|
||||||
|
" path = list(paths)[0]\n",
|
||||||
|
" pred = data[\"path_to_class\"][str(path)]\n",
|
||||||
|
" pred_class = classes[pred]\n",
|
||||||
|
" predict_Yt.append(pred_class)\n",
|
||||||
|
" \n",
|
||||||
|
" index += 1"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 143,
|
||||||
|
"id": "8b4c56b6",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"0.8410252791654538\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"correct = 0\n",
|
||||||
|
"for i in range(len(Y)):\n",
|
||||||
|
" prediction = predict_Yt[i]\n",
|
||||||
|
" if prediction != None and Y[i] == prediction:\n",
|
||||||
|
" correct += 1\n",
|
||||||
|
"\n",
|
||||||
|
"print(correct / len(Y))"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"metadata": {
|
||||||
|
"kernelspec": {
|
||||||
|
"display_name": "Python 3",
|
||||||
|
"language": "python",
|
||||||
|
"name": "python3"
|
||||||
|
},
|
||||||
|
"language_info": {
|
||||||
|
"codemirror_mode": {
|
||||||
|
"name": "ipython",
|
||||||
|
"version": 3
|
||||||
|
},
|
||||||
|
"file_extension": ".py",
|
||||||
|
"mimetype": "text/x-python",
|
||||||
|
"name": "python",
|
||||||
|
"nbconvert_exporter": "python",
|
||||||
|
"pygments_lexer": "ipython3",
|
||||||
|
"version": "3.12.7"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"nbformat": 4,
|
||||||
|
"nbformat_minor": 5
|
||||||
|
}
|
File diff suppressed because one or more lines are too long
@@ -89,7 +89,7 @@
|
|||||||
],
|
],
|
||||||
"metadata": {
|
"metadata": {
|
||||||
"kernelspec": {
|
"kernelspec": {
|
||||||
"display_name": "switch",
|
"display_name": "Python 3 (ipykernel)",
|
||||||
"language": "python",
|
"language": "python",
|
||||||
"name": "python3"
|
"name": "python3"
|
||||||
},
|
},
|
||||||
@@ -103,7 +103,7 @@
|
|||||||
"name": "python",
|
"name": "python",
|
||||||
"nbconvert_exporter": "python",
|
"nbconvert_exporter": "python",
|
||||||
"pygments_lexer": "ipython3",
|
"pygments_lexer": "ipython3",
|
||||||
"version": "3.12.7"
|
"version": "3.12.9"
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
"nbformat": 4,
|
"nbformat": 4,
|
||||||
|
10
README.md
10
README.md
@@ -2,17 +2,21 @@
|
|||||||
|
|
||||||
Run `pip install -r requirements.txt`
|
Run `pip install -r requirements.txt`
|
||||||
|
|
||||||
|
Run `setup.sh`
|
||||||
|
|
||||||
# Tree Generation
|
# Tree Generation
|
||||||
|
|
||||||
## Download Dataset
|
## Download Dataset
|
||||||
|
|
||||||
Download the *September 22 2016* dataset from: https://iotanalytics.unsw.edu.au/iottraces.html#bib18tmc
|
Download the *September 22 2016* dataset (or others) from: https://iotanalytics.unsw.edu.au/iottraces.html#bib18tmc
|
||||||
|
|
||||||
Rename the file as data.pcap
|
Place these into the `data/tar` folder.
|
||||||
|
|
||||||
|
Run `extract_tars.sh`, which extracts the archives and places the `.pcap` files at the corresponding locations inside `data/pcap`.
|
||||||
|
|
||||||
## Preprocessing Dataset
|
## Preprocessing Dataset
|
||||||
|
|
||||||
Run `ExtractDataset.ipynb`, this will take a few minutes
|
Run `extract_all_datasets.py`, which extracts the data from each file in `data/pcap` and writes the corresponding `.csv` file to `data/processed`. This will take a few minutes per file. Then combine the CSV files under `data/processed` using `combine.py`. This overwrites `data/combined/data.csv`, which you can use for the decision tree.
|
||||||
|
|
||||||
## Training
|
## Training
|
||||||
|
|
||||||
|
@@ -2,7 +2,7 @@
|
|||||||
"cells": [
|
"cells": [
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 9,
|
"execution_count": 73,
|
||||||
"id": "ec310f34",
|
"id": "ec310f34",
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
@@ -14,7 +14,7 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 10,
|
"execution_count": 74,
|
||||||
"id": "5b54797e",
|
"id": "5b54797e",
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
@@ -28,22 +28,25 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 11,
|
"execution_count": 75,
|
||||||
"id": "a38fdb8a",
|
"id": "a38fdb8a",
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
"# First cleanup the tree by rounding the decision points to integer values\n",
|
"# First cleanup the tree by rounding the decision points to integer values\n",
|
||||||
"# We assume all features will use integer values. If this is not the case, then training data should be normalized so that integer values can be accurate enough\n",
|
"# We assume all features will use integer values. If this is not the case, then training data should be normalized so that integer values can be accurate enough\n",
|
||||||
"# we also enumerate all the paths for later use\n",
|
|
||||||
"\n",
|
|
||||||
"i = 0\n",
|
"i = 0\n",
|
||||||
"\n",
|
"\n",
|
||||||
|
"path_ids = set()\n",
|
||||||
|
"path_classes = tree[\"classes\"]\n",
|
||||||
|
"\n",
|
||||||
"# for each path in the tree\n",
|
"# for each path in the tree\n",
|
||||||
"for path in paths:\n",
|
"for path in paths:\n",
|
||||||
"\t# assign a path id \n",
|
"\t# assign a path id \n",
|
||||||
"\tpath[\"id\"] = i\n",
|
"\tpath[\"id\"] = i\n",
|
||||||
"\ti += 1\n",
|
"\tpath_ids.add(i)\n",
|
||||||
|
"\t#path_classes.add(path[\"classification\"])\n",
|
||||||
|
"\ti += 1\t\n",
|
||||||
"\t# for each condition\n",
|
"\t# for each condition\n",
|
||||||
"\tconditions = path[\"conditions\"]\n",
|
"\tconditions = path[\"conditions\"]\n",
|
||||||
"\tfor condition in conditions:\n",
|
"\tfor condition in conditions:\n",
|
||||||
@@ -57,7 +60,7 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 12,
|
"execution_count": 76,
|
||||||
"id": "2fd4f738",
|
"id": "2fd4f738",
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
@@ -80,7 +83,7 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 13,
|
"execution_count": 77,
|
||||||
"id": "98cde024",
|
"id": "98cde024",
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
@@ -106,13 +109,13 @@
|
|||||||
"\t\tvalue = condition[\"value\"]\n",
|
"\t\tvalue = condition[\"value\"]\n",
|
||||||
"\n",
|
"\n",
|
||||||
"\t\t# move the min/max for the corresponding feature in compressed\n",
|
"\t\t# move the min/max for the corresponding feature in compressed\n",
|
||||||
"\t\tif operation == \"<=\" and compressed[feature][\"min\"] is None:\n",
|
"\t\tif operation == \"<=\" and compressed[feature][\"max\"] is None:\n",
|
||||||
"\t\t\tcompressed[feature][\"max\"] = value\n",
|
"\t\t\tcompressed[feature][\"max\"] = value\n",
|
||||||
"\t\telif operation == \">\" and compressed[feature][\"max\"] is None:\n",
|
"\t\telif operation == \">\" and compressed[feature][\"min\"] is None:\n",
|
||||||
"\t\t\tcompressed[feature][\"min\"] = value\n",
|
"\t\t\tcompressed[feature][\"min\"] = value\n",
|
||||||
"\t\telif operation == \"<=\" and value < compressed[feature][\"min\"]:\n",
|
"\t\telif operation == \"<=\" and value < compressed[feature][\"max\"]:\n",
|
||||||
"\t\t\tcompressed[feature][\"max\"] = value\n",
|
"\t\t\tcompressed[feature][\"max\"] = value\n",
|
||||||
"\t\telif operation == \">\" and value > compressed[feature][\"max\"]:\n",
|
"\t\telif operation == \">\" and value > compressed[feature][\"min\"]:\n",
|
||||||
"\t\t\tcompressed[feature][\"min\"] = value\n",
|
"\t\t\tcompressed[feature][\"min\"] = value\n",
|
||||||
"\n",
|
"\n",
|
||||||
"\tpath[\"compressed\"] = compressed"
|
"\tpath[\"compressed\"] = compressed"
|
||||||
@@ -120,7 +123,7 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 14,
|
"execution_count": 78,
|
||||||
"id": "b6fbadbf",
|
"id": "b6fbadbf",
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
@@ -168,49 +171,64 @@
|
|||||||
"\t\t# for each bucket which encompasses the condition's range, add this path's id to the sets \n",
|
"\t\t# for each bucket which encompasses the condition's range, add this path's id to the sets \n",
|
||||||
"\t\ti = 0\n",
|
"\t\ti = 0\n",
|
||||||
"\t\tfor bp in breakpoints[feature_name]:\n",
|
"\t\tfor bp in breakpoints[feature_name]:\n",
|
||||||
"\t\t\tin_range = is_in_range(bp, lower, upper)\n",
|
"\t\t\tif is_in_range(bp, lower, upper):\n",
|
||||||
"\t\t\tif in_range:\n",
|
|
||||||
"\t\t\t\tbuckets_id[feature_name][i].add(ID)\n",
|
"\t\t\t\tbuckets_id[feature_name][i].add(ID)\n",
|
||||||
"\t\t\t\tbuckets_class[feature_name][i].add(Class)\n",
|
"\t\t\t\tbuckets_class[feature_name][i].add(Class)\n",
|
||||||
"\t\t\ti += 1"
|
"\t\t\ti += 1\n",
|
||||||
|
"\n",
|
||||||
|
"\t\tif is_in_range(bp+1, lower, upper):\n",
|
||||||
|
"\t\t\tbuckets_id[feature_name][i].add(ID)\n",
|
||||||
|
"\t\t\tbuckets_class[feature_name][i].add(Class)"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 15,
|
"execution_count": 79,
|
||||||
"id": "0a767971",
|
"id": "0a767971",
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
"# combine breakpoints and buckets to one representation\n",
|
"# combine breakpoints and buckets to one representation\n",
|
||||||
"\n",
|
"\n",
|
||||||
"compressed_tree = defaultdict(list)\n",
|
"compressed_layers = defaultdict(list)\n",
|
||||||
"for feature_name in buckets_id:\n",
|
"for feature_name in buckets_id:\n",
|
||||||
"\tlower = None\n",
|
"\tlower = None\n",
|
||||||
"\tupper = breakpoints[feature_name][0]\n",
|
"\tupper = breakpoints[feature_name][0]\n",
|
||||||
"\tpaths = buckets_id[feature_name][0]\n",
|
"\tpaths = buckets_id[feature_name][0]\n",
|
||||||
"\tclasses = buckets_class[feature_name][0]\n",
|
"\tclasses = buckets_class[feature_name][0]\n",
|
||||||
"\t#print(f\"{feature_name} = [{lower}, {upper}]: {members}\")\n",
|
"\t#print(f\"{feature_name} = [{lower}, {upper}]: {members}\")\n",
|
||||||
"\tcompressed_tree[feature_name].append({\"min\": lower, \"max\": upper, \"paths\": paths, \"classes\": classes})\n",
|
"\tcompressed_layers[feature_name].append({\"min\": lower, \"max\": upper, \"paths\": paths, \"classes\": classes})\n",
|
||||||
"\tfor i in range(1, len(buckets_id[feature_name]) - 1):\n",
|
"\tfor i in range(1, len(buckets_id[feature_name]) - 1):\n",
|
||||||
"\t\tlower = breakpoints[feature_name][i-1]\n",
|
"\t\tlower = breakpoints[feature_name][i-1]\n",
|
||||||
"\t\tupper = breakpoints[feature_name][i]\n",
|
"\t\tupper = breakpoints[feature_name][i]\n",
|
||||||
"\t\tmembers = buckets_id[feature_name][i]\n",
|
"\t\tpaths = buckets_id[feature_name][i]\n",
|
||||||
"\t\tclasses = buckets_class[feature_name][i]\n",
|
"\t\tclasses = buckets_class[feature_name][i]\n",
|
||||||
"\t\t#print(f\"{feature_name} = [{lower}, {upper}]: {buckets[feature_name][i]}\")\n",
|
"\t\t#print(f\"{feature_name} = [{lower}, {upper}]: {buckets[feature_name][i]}\")\n",
|
||||||
"\t\tcompressed_tree[feature_name].append({\"min\": lower, \"max\": upper, \"paths\": paths, \"classes\": classes})\n",
|
"\t\tcompressed_layers[feature_name].append({\"min\": lower, \"max\": upper, \"paths\": paths, \"classes\": classes})\n",
|
||||||
"\tlower = breakpoints[feature_name][len(breakpoints[feature_name]) - 1]\n",
|
"\tlower = breakpoints[feature_name][len(breakpoints[feature_name]) - 1]\n",
|
||||||
"\tupper = None\n",
|
"\tupper = None\n",
|
||||||
"\tmembers = buckets_id[feature_name][len(buckets_id[feature_name]) - 1]\n",
|
"\tpaths = buckets_id[feature_name][len(buckets_id[feature_name]) - 1]\n",
|
||||||
"\tclasses = buckets_class[feature_name][len(buckets_class[feature_name]) - 1]\n",
|
"\tclasses = buckets_class[feature_name][len(buckets_class[feature_name]) - 1]\n",
|
||||||
"\t#print(f\"{feature_name} = [{lower}, {upper}]: {members}\")\n",
|
"\t#print(f\"{feature_name} = [{lower}, {upper}]: {members}\")\n",
|
||||||
"\tcompressed_tree[feature_name].append({\"min\": lower, \"max\": upper, \"paths\": paths, \"classes\": classes})\n",
|
"\tcompressed_layers[feature_name].append({\"min\": lower, \"max\": upper, \"paths\": paths, \"classes\": classes})\n",
|
||||||
"\t#print(\"=\"*40)"
|
"\t#print(\"=\"*40)\n",
|
||||||
|
"\n",
|
||||||
|
"path_to_class = {}\n",
|
||||||
|
"for i in range(len(tree[\"paths\"])):\n",
|
||||||
|
" path = tree[\"paths\"][i]\n",
|
||||||
|
" path_to_class[path[\"id\"]] = path[\"classification\"]\n",
|
||||||
|
"\n",
|
||||||
|
"compressed_tree = {\n",
|
||||||
|
"\t\"paths\": path_ids,\n",
|
||||||
|
"\t\"classes\": path_classes,\n",
|
||||||
|
"\t\"layers\": compressed_layers,\n",
|
||||||
|
" \"path_to_class\": path_to_class,\n",
|
||||||
|
"}"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 16,
|
"execution_count": 80,
|
||||||
"id": "561b0bc1",
|
"id": "561b0bc1",
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
@@ -229,7 +247,7 @@
|
|||||||
],
|
],
|
||||||
"metadata": {
|
"metadata": {
|
||||||
"kernelspec": {
|
"kernelspec": {
|
||||||
"display_name": "switch",
|
"display_name": "Python 3 (ipykernel)",
|
||||||
"language": "python",
|
"language": "python",
|
||||||
"name": "python3"
|
"name": "python3"
|
||||||
},
|
},
|
||||||
@@ -243,7 +261,7 @@
|
|||||||
"name": "python",
|
"name": "python",
|
||||||
"nbconvert_exporter": "python",
|
"nbconvert_exporter": "python",
|
||||||
"pygments_lexer": "ipython3",
|
"pygments_lexer": "ipython3",
|
||||||
"version": "3.12.7"
|
"version": "3.12.9"
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
"nbformat": 4,
|
"nbformat": 4,
|
412
TreeToRMT.ipynb
Normal file
412
TreeToRMT.ipynb
Normal file
@@ -0,0 +1,412 @@
|
|||||||
|
{
|
||||||
|
"cells": [
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 1,
|
||||||
|
"id": "58fc6db9",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"import json\n",
|
||||||
|
"import math"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 2,
|
||||||
|
"id": "e07be4b3",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"f = open(\"compressed_tree.json\")\n",
|
||||||
|
"tree = json.loads(f.read())\n",
|
||||||
|
"layers = tree[\"layers\"]\n",
|
||||||
|
"classes = tree[\"classes\"]\n",
|
||||||
|
"f.close()"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 3,
|
||||||
|
"id": "1516ff91",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"field_width = {\n",
|
||||||
|
"\t\"src\": 16,\n",
|
||||||
|
"\t\"dst\": 16,\n",
|
||||||
|
"\t\"protocl\": 8,\n",
|
||||||
|
"}"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"id": "f9193827",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"# Worst Case RMT"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 4,
|
||||||
|
"id": "5e37cfc5",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"def worst_case_rmt(tree):\n",
|
||||||
|
"\trmt = []\n",
|
||||||
|
"\tstep = 0\n",
|
||||||
|
"\n",
|
||||||
|
"\ttcam_bits = 0\n",
|
||||||
|
"\tram_bits = 0\n",
|
||||||
|
"\n",
|
||||||
|
"\tfor layer in layers:\n",
|
||||||
|
"\t\tnum_ranges = len(layers[layer])\n",
|
||||||
|
"\t\t# assume that each range requires all of 2*k prefixes when performing prefix expansion\n",
|
||||||
|
"\t\t# therefore there are 2*k * R for R ranges and width k\n",
|
||||||
|
"\t\tnum_prefixes = 2 * field_width[layer] * num_ranges\n",
|
||||||
|
"\t\tprefix_width = field_width[layer]\n",
|
||||||
|
"\n",
|
||||||
|
"\t\ttcam = {\n",
|
||||||
|
"\t\t\t\"id\": f\"{layer}_range\",\n",
|
||||||
|
"\t\t\t\"step\": step,\n",
|
||||||
|
"\t\t\t\"match\": \"ternary\",\n",
|
||||||
|
"\t\t\t\"entries\": num_prefixes,\n",
|
||||||
|
"\t\t\t\"key_size\": prefix_width\n",
|
||||||
|
"\t\t}\n",
|
||||||
|
"\t\ttcam_bits += num_prefixes * prefix_width\n",
|
||||||
|
"\n",
|
||||||
|
"\t\t# assume basic pointer reuse for metadata storage\n",
|
||||||
|
"\t\tram = {\n",
|
||||||
|
"\t\t\t\"id\": f\"{layer}_meta\",\n",
|
||||||
|
"\t\t\t\"step\": step,\n",
|
||||||
|
"\t\t\t\"match\": \"exact\",\n",
|
||||||
|
"\t\t\t\"method\": \"index\",\n",
|
||||||
|
"\t\t\t\"key_size\": math.ceil(math.log2(num_ranges)),\n",
|
||||||
|
"\t\t\t\"data_size\": len(classes)\n",
|
||||||
|
"\t\t}\n",
|
||||||
|
"\t\tram_bits += num_ranges * len(classes)\n",
|
||||||
|
"\n",
|
||||||
|
"\t\trmt.append(tcam)\n",
|
||||||
|
"\t\trmt.append(ram)\n",
|
||||||
|
"\n",
|
||||||
|
"\t\tstep += 1\n",
|
||||||
|
"\n",
|
||||||
|
"\treturn rmt, tcam_bits, ram_bits\n",
|
||||||
|
"\n",
|
||||||
|
"x, tcam_bits, ram_bits = worst_case_rmt(tree)\n",
|
||||||
|
"f = open(\"worst_case_rmt.json\", \"w+\")\n",
|
||||||
|
"f.write(json.dumps(x, indent=4))\n",
|
||||||
|
"f.close()"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 5,
|
||||||
|
"id": "0dc1d6d4",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"TCAM mapping: \n",
|
||||||
|
"[1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]\n",
|
||||||
|
"SRAM mapping: \n",
|
||||||
|
"[1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]\n",
|
||||||
|
"id mapping: \n",
|
||||||
|
"[['dst_range', 'dst_meta'], ['src_range', 'src_meta'], ['protocl_range', 'protocl_meta'], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], []]\n",
|
||||||
|
"TCAM bits: 13184\n",
|
||||||
|
"RAM bits: 504\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"! command python3 ideal-rmt-simulator/sim.py naive_rmt.json\n",
|
||||||
|
"print(f\"TCAM bits: {tcam_bits}\")\n",
|
||||||
|
"print(f\"RAM bits: {ram_bits}\")"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"id": "2a628655",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"# Naive Range Expansion "
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 6,
|
||||||
|
"id": "fb9febe9",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"# shamelessly stolen from: https://github.com/autolyticus/range-to-prefix/blob/master/rangetoprefix.C\n",
|
||||||
|
"\n",
|
||||||
|
"def int_to_bin(i, width):\n",
|
||||||
|
"\treturn bin(i)[2:].zfill(width)\n",
|
||||||
|
"\n",
|
||||||
|
"def increment_dc(pfx):\n",
|
||||||
|
"\tidx = pfx.find(\"*\")\n",
|
||||||
|
"\tif idx == -1:\n",
|
||||||
|
"\t\tidx = len(pfx)\n",
|
||||||
|
"\tidx = idx - 1\n",
|
||||||
|
"\t#print(pfx, pfx[:idx])\n",
|
||||||
|
"\treturn pfx[:idx] + \"*\" + pfx[idx+1:]\n",
|
||||||
|
"\t\n",
|
||||||
|
"def can_merge(pfx_a, pfx_b):\n",
|
||||||
|
"\tpfx_a = pfx_a.replace(\"*\", \"\")\n",
|
||||||
|
"\tpfx_b = pfx_b.replace(\"*\", \"\")\n",
|
||||||
|
"\treturn pfx_a[:-1] == pfx_b[:-1] and pfx_a[-1] != pfx_b[-1]\n",
|
||||||
|
"\n",
|
||||||
|
"def merge(pfx_a, prefixes):\n",
|
||||||
|
"\tpfx_a = increment_dc(pfx_a)\n",
|
||||||
|
"\tprefixes[-1] = pfx_a\n",
|
||||||
|
"\n",
|
||||||
|
"\tfor i in range(len(prefixes) - 2, -1, -1):\n",
|
||||||
|
"\t\tif can_merge(prefixes[i], prefixes[i+1]):\n",
|
||||||
|
"\t\t\tprefixes.pop()\n",
|
||||||
|
"\t\t\tpfx = increment_dc(prefixes[i])\n",
|
||||||
|
"\t\t\tprefixes[i] = pfx\n",
|
||||||
|
"\n",
|
||||||
|
"def convert_range(lower, upper, width):\n",
|
||||||
|
"\tprefixes = []\n",
|
||||||
|
"\tprefix = int_to_bin(lower, width)\n",
|
||||||
|
"\tprefixes.append(prefix)\n",
|
||||||
|
"\tnorm_upper = min(upper, 2**width-1)\n",
|
||||||
|
"\tfor i in range(lower+1, norm_upper+1):\n",
|
||||||
|
"\t\tprefix = int_to_bin(i, width)\n",
|
||||||
|
"\t\tif can_merge(prefix, prefixes[-1]):\n",
|
||||||
|
"\t\t\tmerge(prefix, prefixes)\n",
|
||||||
|
"\t\telse:\n",
|
||||||
|
"\t\t\tprefixes.append(prefix)\n",
|
||||||
|
"\treturn prefixes"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 7,
|
||||||
|
"id": "55167c28",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"def naive_rmt(tree):\n",
|
||||||
|
"\trmt = []\n",
|
||||||
|
"\tstep = 0\n",
|
||||||
|
"\n",
|
||||||
|
"\ttcam_bits = 0\n",
|
||||||
|
"\tram_bits = 0\n",
|
||||||
|
"\n",
|
||||||
|
"\tfor layer in layers:\n",
|
||||||
|
"\t\tnum_prefixes = 0\n",
|
||||||
|
"\t\tprefix_width = field_width[layer]\n",
|
||||||
|
"\t\t# for each range in the layer, convert the ranges to prefixes using naive range expansion\n",
|
||||||
|
"\t\tfor r in layers[layer]:\n",
|
||||||
|
"\t\t\tif r[\"min\"] == None:\n",
|
||||||
|
"\t\t\t\tr[\"min\"] = 0\n",
|
||||||
|
"\t\t\telif r[\"max\"] == None:\n",
|
||||||
|
"\t\t\t\tr[\"max\"] = 2 ** prefix_width\n",
|
||||||
|
"\t\t\tprefixes = convert_range(r[\"min\"], r[\"max\"], prefix_width)\n",
|
||||||
|
"\t\t\tr[\"prefixes\"] = prefixes\n",
|
||||||
|
"\t\t\tnum_prefixes += len(prefixes)\n",
|
||||||
|
"\t\t\ttcam_bits += len(prefixes) * prefix_width\n",
|
||||||
|
"\n",
|
||||||
|
"\t\ttcam = {\n",
|
||||||
|
"\t\t\t\"id\": f\"{layer}_range\",\n",
|
||||||
|
"\t\t\t\"step\": step,\n",
|
||||||
|
"\t\t\t\"match\": \"ternary\",\n",
|
||||||
|
"\t\t\t\"entries\": num_prefixes,\n",
|
||||||
|
"\t\t\t\"key_size\": prefix_width,\n",
|
||||||
|
"\t\t\t\"ranges\": layers[layer]\n",
|
||||||
|
"\t\t}\n",
|
||||||
|
"\n",
|
||||||
|
"\t\tnum_ranges = len(layers[layer])\n",
|
||||||
|
"\t\t# assume no pointer reuse for metadata storage\n",
|
||||||
|
"\t\tram = {\n",
|
||||||
|
"\t\t\t\"id\": f\"{layer}_meta\",\n",
|
||||||
|
"\t\t\t\"step\": step,\n",
|
||||||
|
"\t\t\t\"match\": \"exact\",\n",
|
||||||
|
"\t\t\t\"method\": \"index\",\n",
|
||||||
|
"\t\t\t\"key_size\": math.ceil(math.log2(num_ranges)),\n",
|
||||||
|
"\t\t\t\"data_size\": len(classes)\n",
|
||||||
|
"\t\t}\n",
|
||||||
|
"\t\tram_bits += num_ranges * len(classes)\n",
|
||||||
|
"\n",
|
||||||
|
"\t\trmt.append(tcam)\n",
|
||||||
|
"\t\trmt.append(ram)\n",
|
||||||
|
"\n",
|
||||||
|
"\t\tstep += 1\n",
|
||||||
|
"\n",
|
||||||
|
"\treturn rmt, tcam_bits, ram_bits\n",
|
||||||
|
"\n",
|
||||||
|
"x, tcam_bits, ram_bits = naive_rmt(tree)\n",
|
||||||
|
"f = open(\"naive_rmt.json\", \"w+\")\n",
|
||||||
|
"f.write(json.dumps(x, indent=4))\n",
|
||||||
|
"f.close()\n"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 8,
|
||||||
|
"id": "48011528",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"TCAM mapping: \n",
|
||||||
|
"[1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]\n",
|
||||||
|
"SRAM mapping: \n",
|
||||||
|
"[1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]\n",
|
||||||
|
"id mapping: \n",
|
||||||
|
"[['dst_range', 'dst_meta'], ['src_range', 'src_meta'], ['protocl_range', 'protocl_meta'], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], []]\n",
|
||||||
|
"TCAM bits: 3320\n",
|
||||||
|
"RAM bits: 504\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"! command python3 ideal-rmt-simulator/sim.py naive_rmt.json\n",
|
||||||
|
"print(f\"TCAM bits: {tcam_bits}\")\n",
|
||||||
|
"print(f\"RAM bits: {ram_bits}\")"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"id": "2504b1ba",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"# Priority Aware Prefix Expansion"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 9,
|
||||||
|
"id": "64b7271e",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"# for this technique, we note that given disjoint ranges [0,a],[a,b],[b,c] ...\n",
|
||||||
|
"# then, when using a TCAM that selects the first matching prefix, the ranges [0,a],[0,b],[0,c] would be equivalent\n",
|
||||||
|
"# this is because for some k<a, even though the range [0,b] could also be selected, the correct prefix will still be selected as long as the prefixes for [0,a] come before those for [0,b]\n",
|
||||||
|
"\n",
|
||||||
|
"def priority_aware(tree):\n",
|
||||||
|
"\trmt = []\n",
|
||||||
|
"\tstep = 0\n",
|
||||||
|
"\n",
|
||||||
|
"\ttcam_bits = 0\n",
|
||||||
|
"\tram_bits = 0\n",
|
||||||
|
"\n",
|
||||||
|
"\tfor layer in layers:\n",
|
||||||
|
"\t\tnum_prefixes = 0\n",
|
||||||
|
"\t\tprefix_width = field_width[layer]\n",
|
||||||
|
"\t\t# for each range, run the regular prefix expansion, and also the prefix expansion setting the minimum to 0\n",
|
||||||
|
"\t\t# then check which set of prefixes would be better\n",
|
||||||
|
"\t\t# we will assume the ranges are already disjoint and in the correct order\n",
|
||||||
|
"\t\tfor r in layers[layer]:\n",
|
||||||
|
"\t\t\tif r[\"min\"] == None:\n",
|
||||||
|
"\t\t\t\tr[\"min\"] = 0\n",
|
||||||
|
"\t\t\telif r[\"max\"] == None:\n",
|
||||||
|
"\t\t\t\tr[\"max\"] = 2 ** prefix_width\n",
|
||||||
|
"\t\t\tregular_prefixes = convert_range(r[\"min\"], r[\"max\"], prefix_width)\n",
|
||||||
|
"\t\t\tzero_start_prefixes = convert_range(0, r[\"max\"], prefix_width)\n",
|
||||||
|
"\n",
|
||||||
|
"\t\t\tif len(regular_prefixes) <= len(zero_start_prefixes):\n",
|
||||||
|
"\t\t\t\tpfx_type = \"exact\"\n",
|
||||||
|
"\t\t\t\tprefixes = regular_prefixes\n",
|
||||||
|
"\t\t\telse:\n",
|
||||||
|
"\t\t\t\tpfx_type = \"zero\"\n",
|
||||||
|
"\t\t\t\tprefixes = zero_start_prefixes\n",
|
||||||
|
"\n",
|
||||||
|
"\t\t\tr[\"prefixes\"] = prefixes\n",
|
||||||
|
"\t\t\tr[\"prefix_type\"] = pfx_type\n",
|
||||||
|
"\t\t\tnum_prefixes += len(prefixes)\n",
|
||||||
|
"\t\t\ttcam_bits += len(prefixes) * prefix_width\n",
|
||||||
|
"\n",
|
||||||
|
"\t\ttcam = {\n",
|
||||||
|
"\t\t\t\"id\": f\"{layer}_range\",\n",
|
||||||
|
"\t\t\t\"step\": step,\n",
|
||||||
|
"\t\t\t\"match\": \"ternary\",\n",
|
||||||
|
"\t\t\t\"entries\": num_prefixes,\n",
|
||||||
|
"\t\t\t\"key_size\": prefix_width,\n",
|
||||||
|
"\t\t\t\"ranges\": layers[layer]\n",
|
||||||
|
"\t\t}\n",
|
||||||
|
"\n",
|
||||||
|
"\t\tnum_ranges = len(layers[layer])\n",
|
||||||
|
"\t\t# assume no pointer reuse for metadata storage\n",
|
||||||
|
"\t\tram = {\n",
|
||||||
|
"\t\t\t\"id\": f\"{layer}_meta\",\n",
|
||||||
|
"\t\t\t\"step\": step,\n",
|
||||||
|
"\t\t\t\"match\": \"exact\",\n",
|
||||||
|
"\t\t\t\"method\": \"index\",\n",
|
||||||
|
"\t\t\t\"key_size\": math.ceil(math.log2(num_ranges)),\n",
|
||||||
|
"\t\t\t\"data_size\": len(classes)\n",
|
||||||
|
"\t\t}\n",
|
||||||
|
"\t\tram_bits += num_ranges * len(classes)\n",
|
||||||
|
"\n",
|
||||||
|
"\t\trmt.append(tcam)\n",
|
||||||
|
"\t\trmt.append(ram)\n",
|
||||||
|
"\n",
|
||||||
|
"\t\tstep += 1\n",
|
||||||
|
"\n",
|
||||||
|
"\treturn rmt, tcam_bits, ram_bits\n",
|
||||||
|
"\n",
|
||||||
|
"x, tcam_bits, ram_bits = priority_aware(tree)\n",
|
||||||
|
"f = open(\"priority_aware.json\", \"w+\")\n",
|
||||||
|
"f.write(json.dumps(x, indent=4))\n",
|
||||||
|
"f.close()"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 10,
|
||||||
|
"id": "cd706e41",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"TCAM mapping: \n",
|
||||||
|
"[1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]\n",
|
||||||
|
"SRAM mapping: \n",
|
||||||
|
"[1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]\n",
|
||||||
|
"id mapping: \n",
|
||||||
|
"[['dst_range', 'dst_meta'], ['src_range', 'src_meta'], ['protocl_range', 'protocl_meta'], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], []]\n",
|
||||||
|
"TCAM bits: 2152\n",
|
||||||
|
"RAM bits: 504\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"! command python3 ideal-rmt-simulator/sim.py priority_aware.json\n",
|
||||||
|
"print(f\"TCAM bits: {tcam_bits}\")\n",
|
||||||
|
"print(f\"RAM bits: {ram_bits}\")"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"metadata": {
|
||||||
|
"kernelspec": {
|
||||||
|
"display_name": "Python 3 (ipykernel)",
|
||||||
|
"language": "python",
|
||||||
|
"name": "python3"
|
||||||
|
},
|
||||||
|
"language_info": {
|
||||||
|
"codemirror_mode": {
|
||||||
|
"name": "ipython",
|
||||||
|
"version": 3
|
||||||
|
},
|
||||||
|
"file_extension": ".py",
|
||||||
|
"mimetype": "text/x-python",
|
||||||
|
"name": "python",
|
||||||
|
"nbconvert_exporter": "python",
|
||||||
|
"pygments_lexer": "ipython3",
|
||||||
|
"version": "3.12.7"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"nbformat": 4,
|
||||||
|
"nbformat_minor": 5
|
||||||
|
}
|
74
combine.py
Normal file
74
combine.py
Normal file
@@ -0,0 +1,74 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
"""combined.py
|
||||||
|
|
||||||
|
Concatenate every CSV that matches the pattern
|
||||||
|
data/processed/<name>/<name>.csv
|
||||||
|
into a single file:
|
||||||
|
data/combined/data.csv
|
||||||
|
|
||||||
|
The script streams each source CSV in 1‑Mio‑row chunks so memory stays low.
|
||||||
|
Typos in the historic column names (protocl/classfication) are fixed on‑the‑fly.
|
||||||
|
|
||||||
|
Usage
|
||||||
|
-----
|
||||||
|
python combine.py
|
||||||
|
|
||||||
|
You can optionally supply a different root directory:
|
||||||
|
python combine.py --root other/processed_dir --out other/combined/data.csv
|
||||||
|
"""
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import argparse
|
||||||
|
from pathlib import Path
|
||||||
|
import os
|
||||||
|
import pandas as pd
|
||||||
|
|
||||||
|
CHUNK = 1_000_000 # rows per read_csv chunk
|
||||||
|
|
||||||
|
|
||||||
|
def fix_cols(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of *df* whose misspelled legacy headers are corrected.

    ``protocl`` becomes ``protocol`` and ``classfication`` becomes
    ``classification``; every other column is left untouched.
    """
    legacy_to_canonical = {
        "protocl": "protocol",
        "classfication": "classification",
    }
    return df.rename(columns=legacy_to_canonical)
|
||||||
|
|
||||||
|
|
||||||
|
def find_source_csvs(proc_root: Path):
    """Yield CSV paths that exactly match ``<proc_root>/<name>/<name>.csv``.

    Sub-directories are visited in sorted order so the combined output is
    reproducible. Only regular files are yielded: a directory that happens
    to be called ``<name>.csv`` is skipped instead of being handed to the
    CSV reader, where it would fail.
    """
    for sub in sorted(proc_root.iterdir()):
        if not sub.is_dir():
            continue
        target = sub / f"{sub.name}.csv"
        if target.is_file():
            yield target
|
||||||
|
|
||||||
|
|
||||||
|
def combine(proc_root: Path, out_path: Path):
    """Concatenate every source CSV under *proc_root* into *out_path*.

    Each source file is streamed in ``CHUNK``-row pieces so memory use stays
    bounded. The first chunk creates/truncates *out_path* and writes the
    header; all later chunks are appended without a header. Legacy column
    names are normalised through ``fix_cols`` before writing.

    If no chunk was written at all (no matching source CSV, or only sources
    without rows), *out_path* is left untouched and an explicit message is
    printed instead of a misleading success line.
    """
    out_path.parent.mkdir(parents=True, exist_ok=True)

    first_write = True
    for csv_path in find_source_csvs(proc_root):
        print(f"→ adding {csv_path.relative_to(proc_root.parent)}")
        for chunk in pd.read_csv(csv_path, chunksize=CHUNK):
            chunk = fix_cols(chunk)
            chunk.to_csv(
                out_path,
                mode="w" if first_write else "a",
                header=first_write,
                index=False,
            )
            first_write = False

    if first_write:
        # Nothing was ever written: do not claim that a combined file exists.
        print(f"✗ no rows found under {proc_root}; {out_path} was not written")
        return
    print(f"✓ combined CSV written to {out_path}")
|
||||||
|
|
||||||
|
|
||||||
|
def main():
    """Parse ``--root`` / ``--out`` from the command line and run ``combine``."""
    parser = argparse.ArgumentParser(description="Combine processed CSVs into one.")
    parser.add_argument("--root", default="data/processed", help="processed dir root")
    parser.add_argument("--out", default="data/combined/data.csv", help="output CSV")
    cli = parser.parse_args()

    # Expand a leading "~" in both paths before handing them over.
    source_root = Path(cli.root).expanduser()
    destination = Path(cli.out).expanduser()
    combine(source_root, destination)
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
main()
|
80
extract_all_datasets.py
Normal file
80
extract_all_datasets.py
Normal file
@@ -0,0 +1,80 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
from pathlib import Path
|
||||||
|
import numpy as np
|
||||||
|
import pandas as pd
|
||||||
|
from labels import mac_to_label
|
||||||
|
from tqdm import tqdm
|
||||||
|
import os
|
||||||
|
|
||||||
|
ROOT = Path(__file__).resolve().parent
|
||||||
|
PCAP_DIR = ROOT / "data" / "pcap"
|
||||||
|
CSV_DIR = ROOT / "data" / "processed"
|
||||||
|
CSV_DIR.mkdir(parents=True, exist_ok=True)
|
||||||
|
|
||||||
|
BATCH = 100_000 # packets per chunk
|
||||||
|
|
||||||
|
from scapy.all import rdpcap
|
||||||
|
|
||||||
|
|
||||||
|
def process_pcap(pcap_path: str, csv_path: str) -> None:
    """Extract per-packet features from a pcap file into a CSV.

    For every packet the protocol number and the source/destination ports
    are recorded (0 when the packet has no such attribute), together with a
    label looked up in ``mac_to_label`` by the Ethernet destination address
    ("other" when the packet has no Ether layer or the address is unknown).

    The rows are written to *csv_path*: appended without a header if the
    file already exists, otherwise created with a header. The historic
    misspelled column names are kept so existing CSVs stay compatible.

    An empty capture produces a header-only CSV rather than raising.
    """
    all_packets = rdpcap(pcap_path)

    print("rdpcap done", flush=True)
    columns = ['protocl', 'src', 'dst', 'classfication']
    results = []
    for packet in tqdm(all_packets):
        try:
            proto = packet.proto
        except AttributeError:
            proto = 0
        try:
            # Both ports are taken together: if either is missing, both are 0.
            sport = packet.sport
            dport = packet.dport
        except AttributeError:
            sport = 0
            dport = 0

        classification = "other"
        if "Ether" in packet:
            eth_dst = packet["Ether"].dst
            if eth_dst in mac_to_label:
                classification = mac_to_label[eth_dst]

        results.append([int(proto), int(sport), int(dport), classification])

    # Build the frame straight from the rows: this works for zero packets and
    # avoids converting every value to a string through a NumPy array.
    dataframe = pd.DataFrame(results, columns=columns)

    # Save the dataframe to the CSV file; create the file if it does not exist.
    if os.path.exists(csv_path):
        dataframe.to_csv(csv_path, index=False, sep=',', mode='a', columns=columns, header=False)
    else:
        dataframe.to_csv(csv_path, index=False, sep=',', columns=columns)

    print("Done")
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
def main() -> None:
    """Convert every ``*.pcap`` under ``PCAP_DIR`` to a CSV under ``CSV_DIR``.

    The directory layout below ``PCAP_DIR`` is mirrored below ``CSV_DIR``.
    Captures whose CSV already exists are skipped.
    """
    for capture in sorted(PCAP_DIR.rglob("*.pcap")):
        relative_csv = capture.relative_to(PCAP_DIR).with_suffix(".csv")
        destination = CSV_DIR / relative_csv
        if not destination.exists():
            print(f"Processing {relative_csv}")
            destination.parent.mkdir(parents=True, exist_ok=True)
            process_pcap(str(capture), str(destination))
        else:
            print(f"Skip {relative_csv} (CSV exists)")
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
main()
|
50
extract_tars.sh
Normal file
50
extract_tars.sh
Normal file
@@ -0,0 +1,50 @@
|
|||||||
|
#!/usr/bin/env bash
|
||||||
|
# Usage: extract_tars.sh SOURCE_DIR TARGET_DIR
|
||||||
|
# For every .tar, .tar.gz, .tgz, .tar.bz2, .tar.xz in SOURCE_DIR:
|
||||||
|
# 1. Create TARGET_DIR/<name>/
|
||||||
|
# 2. If TARGET_DIR/<name>/<name>.pcap already exists, skip the archive.
|
||||||
|
# 3. Otherwise, extract the archive into its own folder.
|
||||||
|
|
||||||
|
set -euo pipefail
|
||||||
|
|
||||||
|
if [[ $# -ne 2 ]]; then
|
||||||
|
echo "Usage: $0 SOURCE_DIR TARGET_DIR" >&2
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
|
|
||||||
|
src_dir="$1"
|
||||||
|
dst_dir="$2"
|
||||||
|
mkdir -p "$dst_dir"
|
||||||
|
|
||||||
|
# Strip common extensions to recover the base name
|
||||||
|
strip_ext() {
  # Remove one known archive suffix at a time, in the same order as before,
  # and print whatever is left of the file name.
  local stem="$1" suffix
  for suffix in .tar.gz .tgz .tar.bz2 .tar.xz .tar; do
    stem="${stem%"$suffix"}"
  done
  echo "$stem"
}
|
||||||
|
|
||||||
|
shopt -s nullglob
|
||||||
|
for archive in "$src_dir"/*.tar{,.gz,.bz2,.xz} "$src_dir"/*.tgz; do
|
||||||
|
base=$(basename "$archive")
|
||||||
|
name=$(strip_ext "$base")
|
||||||
|
out_dir="$dst_dir/$name"
|
||||||
|
key_file="$out_dir/$name.pcap"
|
||||||
|
|
||||||
|
if [[ -f "$key_file" ]]; then
|
||||||
|
echo "Skipping $archive — $key_file already present"
|
||||||
|
continue
|
||||||
|
fi
|
||||||
|
|
||||||
|
echo "Extracting $archive into $out_dir"
|
||||||
|
mkdir -p "$out_dir"
|
||||||
|
|
||||||
|
case "$archive" in
|
||||||
|
*.tar) tar -xf "$archive" -C "$out_dir" ;;
|
||||||
|
*.tar.gz|*.tgz) tar -xzf "$archive" -C "$out_dir" ;;
|
||||||
|
*.tar.bz2) tar -xjf "$archive" -C "$out_dir" ;;
|
||||||
|
*.tar.xz) tar -xJf "$archive" -C "$out_dir" ;;
|
||||||
|
*) echo "Unknown type: $archive" ;;
|
||||||
|
esac
|
||||||
|
done
|
||||||
|
|
||||||
|
echo "All archives processed."
|
1
ideal-rmt-simulator
Submodule
1
ideal-rmt-simulator
Submodule
Submodule ideal-rmt-simulator added at 852153f017
@@ -4,3 +4,4 @@ pandas
|
|||||||
scikit-learn
|
scikit-learn
|
||||||
pydotplus
|
pydotplus
|
||||||
matplotlib
|
matplotlib
|
||||||
|
scipy
|
168
run/decision_tree.py
Normal file
168
run/decision_tree.py
Normal file
@@ -0,0 +1,168 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
"""
|
||||||
|
Train a decision tree, optionally “nudge” its split thresholds, and
|
||||||
|
export the result as JSON.
|
||||||
|
|
||||||
|
Usage examples
|
||||||
|
--------------
|
||||||
|
# plain training, no nudging
|
||||||
|
python decision_tree.py --input data/combined/data.csv --output tree.json
|
||||||
|
|
||||||
|
# nudge every internal threshold, rounding away its 2 lowest bits
|
||||||
|
python decision_tree.py --input data/combined/data.csv --output tree.json \
|
||||||
|
--nudge --bits 2
|
||||||
|
"""
|
||||||
|
import argparse
|
||||||
|
import copy
|
||||||
|
import json
|
||||||
|
import math
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
import numpy as np
|
||||||
|
import pandas as pd
|
||||||
|
from sklearn.metrics import accuracy_score
|
||||||
|
from sklearn.tree import DecisionTreeClassifier, _tree
|
||||||
|
|
||||||
|
# ----------------------------------------------------------------------
|
||||||
|
# 1. command-line arguments
|
||||||
|
# ----------------------------------------------------------------------
|
||||||
|
# The options below are parsed once at import time; the rest of the script
# reads them from the module-level ``args`` namespace.
parser = argparse.ArgumentParser()
parser.add_argument("--input", "-i", help="CSV file with protocol,src,dst,label", default="../data/combined/data.csv")
parser.add_argument("--output", "-o", help="Path for the exported JSON tree", default="tree.json")
parser.add_argument("--depth", "-d", type=int, default=5,
                    help="Max depth of the decision tree (default: 5)")
parser.add_argument("--nudge", action="store_true",
                    help="Enable threshold nudging")
# The help text used to say "bits to keep", but the nudging code clears the
# low-order bits; describe what actually happens.
parser.add_argument("--bits", type=int, default=2,
                    help="Number of low-order bits to round away when nudging (default: 2)")
args = parser.parse_args()
|
||||||
|
|
||||||
|
# ----------------------------------------------------------------------
|
||||||
|
# 2. helper functions
|
||||||
|
# ----------------------------------------------------------------------
|
||||||
|
def nudge_threshold_max_n_bits(threshold: float, n_bits: int) -> int:
    """Round a split threshold to the nearest multiple of ``2 ** n_bits``.

    The threshold is floored to an integer first. Its ``n_bits`` lowest bits
    are then cleared, and the result is bumped up by ``2 ** n_bits`` when the
    highest cleared bit was set (round half up). With ``n_bits == 0`` the
    floored threshold is returned unchanged.

    The low bits are cleared with ``~mask`` instead of a fixed 32-bit mask,
    so thresholds wider than 32 bits keep their high bits.
    """
    threshold = math.floor(threshold)
    if n_bits == 0:
        return threshold

    low_bits = (1 << n_bits) - 1
    nudged_value = threshold & ~low_bits
    # The most significant dropped bit decides whether to round up.
    if threshold & (1 << (n_bits - 1)):
        nudged_value += 1 << n_bits

    return nudged_value
|
||||||
|
|
||||||
|
def apply_nudging(tree: _tree.Tree, node_idx: int, n_bits: int) -> None:
    """Nudge the threshold of every internal node below (and including) *node_idx*.

    Children are processed before their parent. A node counts as internal
    when at least one of its child pointers is not ``-1``; leaves are left
    untouched. The tree is modified in place.
    """
    left_child = tree.children_left[node_idx]
    right_child = tree.children_right[node_idx]
    has_left = left_child != -1
    has_right = right_child != -1

    if has_left:
        apply_nudging(tree, left_child, n_bits)
    if has_right:
        apply_nudging(tree, right_child, n_bits)

    if not (has_left or has_right):
        return  # leaf: no threshold to adjust

    current = tree.threshold[node_idx]
    tree.threshold[node_idx] = nudge_threshold_max_n_bits(current, n_bits)
|
||||||
|
|
||||||
|
# output the tree
|
||||||
|
def get_lineage(tree, feature_names):
    """Flatten a fitted decision tree into root-to-leaf rule paths.

    Args:
        tree: Fitted classifier exposing ``classes_`` and ``tree_`` (with the
            ``threshold``, ``feature``, ``children_left``, ``children_right``
            and ``value`` arrays).
        feature_names: Sequence mapping a feature index to its name.

    Returns:
        A dict with three keys:
        ``"classes"``  - list of the classifier's classes;
        ``"paths"``    - one entry per leaf with its ``"id"``, the index of
                         the majority class as ``"classification"`` and the
                         ordered ``"conditions"`` from the root to the leaf;
        ``"features"`` - for each feature name, the thresholds of all
                         internal nodes splitting on it, in node order.
    """
    tree_ = tree.tree_
    thresholds = tree_.threshold
    feature_idx = tree_.feature
    left = tree_.children_left
    right = tree_.children_right
    value = tree_.value

    data = {"features": {}, "paths": [], "classes": list(tree.classes_)}

    # Map every child to (parent, comparison) once, instead of scanning the
    # child arrays again for every step of every path.
    parent_of = {}
    for node, (left_child, right_child) in enumerate(zip(left, right)):
        if left_child != -1:
            parent_of[int(left_child)] = (node, "<=")
        if right_child != -1:
            parent_of[int(right_child)] = (node, ">")

    leaf_ids = np.where(left == -1)[0]  # indices of all leaves
    for path_id, leaf in enumerate(leaf_ids):
        # Climb from the leaf to the root, then flip to root-to-leaf order.
        clause = []
        node = int(leaf)
        while node in parent_of:
            node, operation = parent_of[node]
            clause.append(
                {
                    "feature": feature_names[feature_idx[node]],
                    "operation": operation,
                    "value": thresholds[node],
                }
            )
        clause.reverse()

        class_idx = int(np.argmax(value[leaf][0]))  # use the leaf itself
        data["paths"].append(
            {"conditions": clause, "classification": class_idx, "id": path_id}
        )

    # Collect all thresholds per feature. Leaves carry a negative sentinel as
    # feature index, so they are skipped without looking up a name for them.
    for node, feat in enumerate(feature_idx):
        if feat >= 0:
            data["features"].setdefault(feature_names[feat], []).append(thresholds[node])

    return data
|
||||||
|
|
||||||
|
|
||||||
|
class SetEncoder(json.JSONEncoder):
    """JSON encoder that also accepts sets and NumPy values.

    Sets (and frozensets) are emitted as lists. NumPy scalars are converted
    to the matching Python scalar and NumPy arrays to nested lists, so a
    tree whose class labels are NumPy integers can be exported. Anything
    else falls through to the base class, which raises ``TypeError``.
    """

    def default(self, obj):
        if isinstance(obj, (set, frozenset)):
            return list(obj)
        if isinstance(obj, np.generic):
            return obj.item()
        if isinstance(obj, np.ndarray):
            return obj.tolist()
        return super().default(obj)
|
||||||
|
|
||||||
|
# ----------------------------------------------------------------------
|
||||||
|
# 3. load data
|
||||||
|
# ----------------------------------------------------------------------
|
||||||
|
# The first three columns are the features, the fourth one is the label.
df = pd.read_csv(args.input)
feature_frame = df.iloc[:, :3]
X = feature_frame.to_numpy()
Y = df.iloc[:, 3].to_numpy()

print(f"dataset size: {len(X)}")

# ----------------------------------------------------------------------
# 4. train the tree
# ----------------------------------------------------------------------
dt = DecisionTreeClassifier(max_depth=args.depth)
dt.fit(X, Y)
accuracy_before = accuracy_score(Y, dt.predict(X))
print("train accuracy (before nudging):", accuracy_before)

if args.nudge:
    # Nudge a copy of the fitted tree and swap it in afterwards.
    tree_copy = copy.deepcopy(dt.tree_)
    apply_nudging(tree_copy, 0, args.bits)
    dt.tree_ = tree_copy
    print(f"nudging enabled, removed bottom {args.bits} bit(s) per threshold")

accuracy_after = accuracy_score(Y, dt.predict(X))
print("train accuracy (after nudging):", accuracy_after)

# ----------------------------------------------------------------------
# 5. export
# ----------------------------------------------------------------------
lineage = get_lineage(dt, df.columns[:3])
serialized = json.dumps(lineage, indent=4, cls=SetEncoder)

output_path = Path(args.output)
output_path.write_text(serialized)
print(f"Wrote tree to {output_path.resolve()}")
|
7
run/print.py
Normal file
7
run/print.py
Normal file
@@ -0,0 +1,7 @@
|
|||||||
|
import json
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
# For every compressed tree, print the maximum of its "paths" entry plus one.
for tree_file in Path("results/compressed_tree/").glob("*.json"):
    with open(tree_file, "r") as handle:
        tree = json.load(handle)
    print(max(tree["paths"]) + 1)
|
36
run/rmt.bat
Normal file
36
run/rmt.bat
Normal file
@@ -0,0 +1,36 @@
|
|||||||
|
@echo off
|
||||||
|
REM -------------------------------------------------------------
|
||||||
|
REM Batch-script to evaluate all compressed trees with every mode
|
||||||
|
REM -------------------------------------------------------------
|
||||||
|
setlocal EnableDelayedExpansion
|
||||||
|
|
||||||
|
REM --- where the trees live and where to store results ----------
|
||||||
|
set TREEDIR=results\compressed_tree
|
||||||
|
set OUTDIR=results\rmt
|
||||||
|
|
||||||
|
REM --- python executable (adjust if needed) ---------------------
|
||||||
|
set PY=python
|
||||||
|
|
||||||
|
REM --- which modes to run --------------------------------------
|
||||||
|
set MODELIST=naive priority
|
||||||
|
REM -------------------------------------------------------------
|
||||||
|
|
||||||
|
if not exist "%OUTDIR%" mkdir "%OUTDIR%"
|
||||||
|
|
||||||
|
for %%F in ("%TREEDIR%\*.json") do (
|
||||||
|
REM strip path → get file name without extension
|
||||||
|
set BASE=%%~nF
|
||||||
|
|
||||||
|
for %%M in (%MODELIST%) do (
|
||||||
|
echo Processing %%~nxF with mode %%M
|
||||||
|
|
||||||
|
"%PY%" tree_to_rmt.py ^
|
||||||
|
--mode %%M ^
|
||||||
|
--input "%%F" ^
|
||||||
|
--output "%OUTDIR%\!BASE!_%%M.json"
|
||||||
|
|
||||||
|
)
|
||||||
|
)
|
||||||
|
|
||||||
|
echo All runs complete.
|
||||||
|
pause
|
362
run/rmt.txt
Normal file
362
run/rmt.txt
Normal file
@@ -0,0 +1,362 @@
|
|||||||
|
Processing compressed_tree_d10_b0.json with mode naive
|
||||||
|
Output written to results\rmt\compressed_tree_d10_b0_naive.json
|
||||||
|
TCAM bits: 30336
|
||||||
|
RAM bits: 6888
|
||||||
|
Processing compressed_tree_d10_b0.json with mode priority
|
||||||
|
Output written to results\rmt\compressed_tree_d10_b0_priority.json
|
||||||
|
TCAM bits: 26648
|
||||||
|
RAM bits: 6888
|
||||||
|
Processing compressed_tree_d10_b1.json with mode naive
|
||||||
|
Output written to results\rmt\compressed_tree_d10_b1_naive.json
|
||||||
|
TCAM bits: 29936
|
||||||
|
RAM bits: 6531
|
||||||
|
Processing compressed_tree_d10_b1.json with mode priority
|
||||||
|
Output written to results\rmt\compressed_tree_d10_b1_priority.json
|
||||||
|
TCAM bits: 27120
|
||||||
|
RAM bits: 6531
|
||||||
|
Processing compressed_tree_d10_b3.json with mode naive
|
||||||
|
Output written to results\rmt\compressed_tree_d10_b3_naive.json
|
||||||
|
TCAM bits: 21712
|
||||||
|
RAM bits: 5649
|
||||||
|
Processing compressed_tree_d10_b3.json with mode priority
|
||||||
|
Output written to results\rmt\compressed_tree_d10_b3_priority.json
|
||||||
|
TCAM bits: 20048
|
||||||
|
RAM bits: 5649
|
||||||
|
Processing compressed_tree_d11_b0.json with mode naive
|
||||||
|
Output written to results\rmt\compressed_tree_d11_b0_naive.json
|
||||||
|
TCAM bits: 41248
|
||||||
|
RAM bits: 10332
|
||||||
|
Processing compressed_tree_d11_b0.json with mode priority
|
||||||
|
Output written to results\rmt\compressed_tree_d11_b0_priority.json
|
||||||
|
TCAM bits: 37592
|
||||||
|
RAM bits: 10332
|
||||||
|
Processing compressed_tree_d11_b1.json with mode naive
|
||||||
|
Output written to results\rmt\compressed_tree_d11_b1_naive.json
|
||||||
|
TCAM bits: 41072
|
||||||
|
RAM bits: 9744
|
||||||
|
Processing compressed_tree_d11_b1.json with mode priority
|
||||||
|
Output written to results\rmt\compressed_tree_d11_b1_priority.json
|
||||||
|
TCAM bits: 38256
|
||||||
|
RAM bits: 9744
|
||||||
|
Processing compressed_tree_d11_b3.json with mode naive
|
||||||
|
Output written to results\rmt\compressed_tree_d11_b3_naive.json
|
||||||
|
TCAM bits: 28464
|
||||||
|
RAM bits: 8190
|
||||||
|
Processing compressed_tree_d11_b3.json with mode priority
|
||||||
|
Output written to results\rmt\compressed_tree_d11_b3_priority.json
|
||||||
|
TCAM bits: 26928
|
||||||
|
RAM bits: 8190
|
||||||
|
Processing compressed_tree_d12_b0.json with mode naive
|
||||||
|
Output written to results\rmt\compressed_tree_d12_b0_naive.json
|
||||||
|
TCAM bits: 55680
|
||||||
|
RAM bits: 15393
|
||||||
|
Processing compressed_tree_d12_b0.json with mode priority
|
||||||
|
Output written to results\rmt\compressed_tree_d12_b0_priority.json
|
||||||
|
TCAM bits: 51592
|
||||||
|
RAM bits: 15393
|
||||||
|
Processing compressed_tree_d12_b1.json with mode naive
|
||||||
|
Output written to results\rmt\compressed_tree_d12_b1_naive.json
|
||||||
|
TCAM bits: 54240
|
||||||
|
RAM bits: 14175
|
||||||
|
Processing compressed_tree_d12_b1.json with mode priority
|
||||||
|
Output written to results\rmt\compressed_tree_d12_b1_priority.json
|
||||||
|
TCAM bits: 51200
|
||||||
|
RAM bits: 14175
|
||||||
|
Processing compressed_tree_d12_b3.json with mode naive
|
||||||
|
Output written to results\rmt\compressed_tree_d12_b3_naive.json
|
||||||
|
TCAM bits: 36048
|
||||||
|
RAM bits: 11361
|
||||||
|
Processing compressed_tree_d12_b3.json with mode priority
|
||||||
|
Output written to results\rmt\compressed_tree_d12_b3_priority.json
|
||||||
|
TCAM bits: 34416
|
||||||
|
RAM bits: 11361
|
||||||
|
Processing compressed_tree_d13_b0.json with mode naive
|
||||||
|
Output written to results\rmt\compressed_tree_d13_b0_naive.json
|
||||||
|
TCAM bits: 73152
|
||||||
|
RAM bits: 22680
|
||||||
|
Processing compressed_tree_d13_b0.json with mode priority
|
||||||
|
Output written to results\rmt\compressed_tree_d13_b0_priority.json
|
||||||
|
TCAM bits: 69096
|
||||||
|
RAM bits: 22680
|
||||||
|
Processing compressed_tree_d13_b1.json with mode naive
|
||||||
|
Output written to results\rmt\compressed_tree_d13_b1_naive.json
|
||||||
|
TCAM bits: 71024
|
||||||
|
RAM bits: 20643
|
||||||
|
Processing compressed_tree_d13_b1.json with mode priority
|
||||||
|
Output written to results\rmt\compressed_tree_d13_b1_priority.json
|
||||||
|
TCAM bits: 68160
|
||||||
|
RAM bits: 20643
|
||||||
|
Processing compressed_tree_d13_b3.json with mode naive
|
||||||
|
Output written to results\rmt\compressed_tree_d13_b3_naive.json
|
||||||
|
TCAM bits: 45152
|
||||||
|
RAM bits: 16002
|
||||||
|
Processing compressed_tree_d13_b3.json with mode priority
|
||||||
|
Output written to results\rmt\compressed_tree_d13_b3_priority.json
|
||||||
|
TCAM bits: 43600
|
||||||
|
RAM bits: 16002
|
||||||
|
Processing compressed_tree_d14_b0.json with mode naive
|
||||||
|
Output written to results\rmt\compressed_tree_d14_b0_naive.json
|
||||||
|
TCAM bits: 95760
|
||||||
|
RAM bits: 33012
|
||||||
|
Processing compressed_tree_d14_b0.json with mode priority
|
||||||
|
Output written to results\rmt\compressed_tree_d14_b0_priority.json
|
||||||
|
TCAM bits: 91656
|
||||||
|
RAM bits: 33012
|
||||||
|
Processing compressed_tree_d14_b1.json with mode naive
|
||||||
|
Output written to results\rmt\compressed_tree_d14_b1_naive.json
|
||||||
|
TCAM bits: 93520
|
||||||
|
RAM bits: 29862
|
||||||
|
Processing compressed_tree_d14_b1.json with mode priority
|
||||||
|
Output written to results\rmt\compressed_tree_d14_b1_priority.json
|
||||||
|
TCAM bits: 90544
|
||||||
|
RAM bits: 29862
|
||||||
|
Processing compressed_tree_d14_b3.json with mode naive
|
||||||
|
Output written to results\rmt\compressed_tree_d14_b3_naive.json
|
||||||
|
TCAM bits: 56144
|
||||||
|
RAM bits: 21819
|
||||||
|
Processing compressed_tree_d14_b3.json with mode priority
|
||||||
|
Output written to results\rmt\compressed_tree_d14_b3_priority.json
|
||||||
|
TCAM bits: 54544
|
||||||
|
RAM bits: 21819
|
||||||
|
Processing compressed_tree_d15_b0.json with mode naive
|
||||||
|
Output written to results\rmt\compressed_tree_d15_b0_naive.json
|
||||||
|
TCAM bits: 122496
|
||||||
|
RAM bits: 46662
|
||||||
|
Processing compressed_tree_d15_b0.json with mode priority
|
||||||
|
Output written to results\rmt\compressed_tree_d15_b0_priority.json
|
||||||
|
TCAM bits: 118792
|
||||||
|
RAM bits: 46662
|
||||||
|
Processing compressed_tree_d15_b1.json with mode naive
|
||||||
|
Output written to results\rmt\compressed_tree_d15_b1_naive.json
|
||||||
|
TCAM bits: 118640
|
||||||
|
RAM bits: 41349
|
||||||
|
Processing compressed_tree_d15_b1.json with mode priority
|
||||||
|
Output written to results\rmt\compressed_tree_d15_b1_priority.json
|
||||||
|
TCAM bits: 115984
|
||||||
|
RAM bits: 41349
|
||||||
|
Processing compressed_tree_d15_b3.json with mode naive
|
||||||
|
Output written to results\rmt\compressed_tree_d15_b3_naive.json
|
||||||
|
TCAM bits: 68928
|
||||||
|
RAM bits: 28875
|
||||||
|
Processing compressed_tree_d15_b3.json with mode priority
|
||||||
|
Output written to results\rmt\compressed_tree_d15_b3_priority.json
|
||||||
|
TCAM bits: 67328
|
||||||
|
RAM bits: 28875
|
||||||
|
Processing compressed_tree_d1_b0.json with mode naive
|
||||||
|
Output written to results\rmt\compressed_tree_d1_b0_naive.json
|
||||||
|
TCAM bits: 256
|
||||||
|
RAM bits: 42
|
||||||
|
Processing compressed_tree_d1_b0.json with mode priority
|
||||||
|
Output written to results\rmt\compressed_tree_d1_b0_priority.json
|
||||||
|
TCAM bits: 128
|
||||||
|
RAM bits: 42
|
||||||
|
Processing compressed_tree_d1_b1.json with mode naive
|
||||||
|
Output written to results\rmt\compressed_tree_d1_b1_naive.json
|
||||||
|
TCAM bits: 256
|
||||||
|
RAM bits: 42
|
||||||
|
Processing compressed_tree_d1_b1.json with mode priority
|
||||||
|
Output written to results\rmt\compressed_tree_d1_b1_priority.json
|
||||||
|
TCAM bits: 144
|
||||||
|
RAM bits: 42
|
||||||
|
Processing compressed_tree_d1_b3.json with mode naive
|
||||||
|
Output written to results\rmt\compressed_tree_d1_b3_naive.json
|
||||||
|
TCAM bits: 240
|
||||||
|
RAM bits: 42
|
||||||
|
Processing compressed_tree_d1_b3.json with mode priority
|
||||||
|
Output written to results\rmt\compressed_tree_d1_b3_priority.json
|
||||||
|
TCAM bits: 128
|
||||||
|
RAM bits: 42
|
||||||
|
Processing compressed_tree_d2_b0.json with mode naive
|
||||||
|
Output written to results\rmt\compressed_tree_d2_b0_naive.json
|
||||||
|
TCAM bits: 592
|
||||||
|
RAM bits: 105
|
||||||
|
Processing compressed_tree_d2_b0.json with mode priority
|
||||||
|
Output written to results\rmt\compressed_tree_d2_b0_priority.json
|
||||||
|
TCAM bits: 288
|
||||||
|
RAM bits: 105
|
||||||
|
Processing compressed_tree_d2_b1.json with mode naive
|
||||||
|
Output written to results\rmt\compressed_tree_d2_b1_naive.json
|
||||||
|
TCAM bits: 592
|
||||||
|
RAM bits: 105
|
||||||
|
Processing compressed_tree_d2_b1.json with mode priority
|
||||||
|
Output written to results\rmt\compressed_tree_d2_b1_priority.json
|
||||||
|
TCAM bits: 320
|
||||||
|
RAM bits: 105
|
||||||
|
Processing compressed_tree_d2_b3.json with mode naive
|
||||||
|
Output written to results\rmt\compressed_tree_d2_b3_naive.json
|
||||||
|
TCAM bits: 544
|
||||||
|
RAM bits: 105
|
||||||
|
Processing compressed_tree_d2_b3.json with mode priority
|
||||||
|
Output written to results\rmt\compressed_tree_d2_b3_priority.json
|
||||||
|
TCAM bits: 288
|
||||||
|
RAM bits: 105
|
||||||
|
Processing compressed_tree_d3_b0.json with mode naive
|
||||||
|
Output written to results\rmt\compressed_tree_d3_b0_naive.json
|
||||||
|
TCAM bits: 1120
|
||||||
|
RAM bits: 210
|
||||||
|
Processing compressed_tree_d3_b0.json with mode priority
|
||||||
|
Output written to results\rmt\compressed_tree_d3_b0_priority.json
|
||||||
|
TCAM bits: 640
|
||||||
|
RAM bits: 210
|
||||||
|
Processing compressed_tree_d3_b1.json with mode naive
|
||||||
|
Output written to results\rmt\compressed_tree_d3_b1_naive.json
|
||||||
|
TCAM bits: 1120
|
||||||
|
RAM bits: 210
|
||||||
|
Processing compressed_tree_d3_b1.json with mode priority
|
||||||
|
Output written to results\rmt\compressed_tree_d3_b1_priority.json
|
||||||
|
TCAM bits: 680
|
||||||
|
RAM bits: 210
|
||||||
|
Processing compressed_tree_d3_b3.json with mode naive
|
||||||
|
Output written to results\rmt\compressed_tree_d3_b3_naive.json
|
||||||
|
TCAM bits: 944
|
||||||
|
RAM bits: 210
|
||||||
|
Processing compressed_tree_d3_b3.json with mode priority
|
||||||
|
Output written to results\rmt\compressed_tree_d3_b3_priority.json
|
||||||
|
TCAM bits: 576
|
||||||
|
RAM bits: 210
|
||||||
|
Processing compressed_tree_d4_b0.json with mode naive
|
||||||
|
Output written to results\rmt\compressed_tree_d4_b0_naive.json
|
||||||
|
TCAM bits: 1880
|
||||||
|
RAM bits: 357
|
||||||
|
Processing compressed_tree_d4_b0.json with mode priority
|
||||||
|
Output written to results\rmt\compressed_tree_d4_b0_priority.json
|
||||||
|
TCAM bits: 1128
|
||||||
|
RAM bits: 357
|
||||||
|
Processing compressed_tree_d4_b1.json with mode naive
|
||||||
|
Output written to results\rmt\compressed_tree_d4_b1_naive.json
|
||||||
|
TCAM bits: 1880
|
||||||
|
RAM bits: 357
|
||||||
|
Processing compressed_tree_d4_b1.json with mode priority
|
||||||
|
Output written to results\rmt\compressed_tree_d4_b1_priority.json
|
||||||
|
TCAM bits: 1208
|
||||||
|
RAM bits: 357
|
||||||
|
Processing compressed_tree_d4_b3.json with mode naive
|
||||||
|
Output written to results\rmt\compressed_tree_d4_b3_naive.json
|
||||||
|
TCAM bits: 1632
|
||||||
|
RAM bits: 336
|
||||||
|
Processing compressed_tree_d4_b3.json with mode priority
|
||||||
|
Output written to results\rmt\compressed_tree_d4_b3_priority.json
|
||||||
|
TCAM bits: 1024
|
||||||
|
RAM bits: 336
|
||||||
|
Processing compressed_tree_d5_b0.json with mode naive
|
||||||
|
Output written to results\rmt\compressed_tree_d5_b0_naive.json
|
||||||
|
TCAM bits: 3608
|
||||||
|
RAM bits: 609
|
||||||
|
Processing compressed_tree_d5_b0.json with mode priority
|
||||||
|
Output written to results\rmt\compressed_tree_d5_b0_priority.json
|
||||||
|
TCAM bits: 2200
|
||||||
|
RAM bits: 609
|
||||||
|
Processing compressed_tree_d5_b1.json with mode naive
|
||||||
|
Output written to results\rmt\compressed_tree_d5_b1_naive.json
|
||||||
|
TCAM bits: 3608
|
||||||
|
RAM bits: 609
|
||||||
|
Processing compressed_tree_d5_b1.json with mode priority
|
||||||
|
Output written to results\rmt\compressed_tree_d5_b1_priority.json
|
||||||
|
TCAM bits: 2376
|
||||||
|
RAM bits: 609
|
||||||
|
Processing compressed_tree_d5_b3.json with mode naive
|
||||||
|
Output written to results\rmt\compressed_tree_d5_b3_naive.json
|
||||||
|
TCAM bits: 2704
|
||||||
|
RAM bits: 546
|
||||||
|
Processing compressed_tree_d5_b3.json with mode priority
|
||||||
|
Output written to results\rmt\compressed_tree_d5_b3_priority.json
|
||||||
|
TCAM bits: 1824
|
||||||
|
RAM bits: 546
|
||||||
|
Processing compressed_tree_d6_b0.json with mode naive
|
||||||
|
Output written to results\rmt\compressed_tree_d6_b0_naive.json
|
||||||
|
TCAM bits: 6440
|
||||||
|
RAM bits: 1134
|
||||||
|
Processing compressed_tree_d6_b0.json with mode priority
|
||||||
|
Output written to results\rmt\compressed_tree_d6_b0_priority.json
|
||||||
|
TCAM bits: 4512
|
||||||
|
RAM bits: 1134
|
||||||
|
Processing compressed_tree_d6_b1.json with mode naive
|
||||||
|
Output written to results\rmt\compressed_tree_d6_b1_naive.json
|
||||||
|
TCAM bits: 6440
|
||||||
|
RAM bits: 1134
|
||||||
|
Processing compressed_tree_d6_b1.json with mode priority
|
||||||
|
Output written to results\rmt\compressed_tree_d6_b1_priority.json
|
||||||
|
TCAM bits: 4776
|
||||||
|
RAM bits: 1134
|
||||||
|
Processing compressed_tree_d6_b3.json with mode naive
|
||||||
|
Output written to results\rmt\compressed_tree_d6_b3_naive.json
|
||||||
|
TCAM bits: 4832
|
||||||
|
RAM bits: 1008
|
||||||
|
Processing compressed_tree_d6_b3.json with mode priority
|
||||||
|
Output written to results\rmt\compressed_tree_d6_b3_priority.json
|
||||||
|
TCAM bits: 3648
|
||||||
|
RAM bits: 1008
|
||||||
|
Processing compressed_tree_d7_b0.json with mode naive
|
||||||
|
Output written to results\rmt\compressed_tree_d7_b0_naive.json
|
||||||
|
TCAM bits: 10344
|
||||||
|
RAM bits: 1848
|
||||||
|
Processing compressed_tree_d7_b0.json with mode priority
|
||||||
|
Output written to results\rmt\compressed_tree_d7_b0_priority.json
|
||||||
|
TCAM bits: 7808
|
||||||
|
RAM bits: 1848
|
||||||
|
Processing compressed_tree_d7_b1.json with mode naive
|
||||||
|
Output written to results\rmt\compressed_tree_d7_b1_naive.json
|
||||||
|
TCAM bits: 10312
|
||||||
|
RAM bits: 1806
|
||||||
|
Processing compressed_tree_d7_b1.json with mode priority
|
||||||
|
Output written to results\rmt\compressed_tree_d7_b1_priority.json
|
||||||
|
TCAM bits: 8136
|
||||||
|
RAM bits: 1806
|
||||||
|
Processing compressed_tree_d7_b3.json with mode naive
|
||||||
|
Output written to results\rmt\compressed_tree_d7_b3_naive.json
|
||||||
|
TCAM bits: 7760
|
||||||
|
RAM bits: 1596
|
||||||
|
Processing compressed_tree_d7_b3.json with mode priority
|
||||||
|
Output written to results\rmt\compressed_tree_d7_b3_priority.json
|
||||||
|
TCAM bits: 6352
|
||||||
|
RAM bits: 1596
|
||||||
|
Processing compressed_tree_d8_b0.json with mode naive
|
||||||
|
Output written to results\rmt\compressed_tree_d8_b0_naive.json
|
||||||
|
TCAM bits: 15672
|
||||||
|
RAM bits: 3003
|
||||||
|
Processing compressed_tree_d8_b0.json with mode priority
|
||||||
|
Output written to results\rmt\compressed_tree_d8_b0_priority.json
|
||||||
|
TCAM bits: 12640
|
||||||
|
RAM bits: 3003
|
||||||
|
Processing compressed_tree_d8_b1.json with mode naive
|
||||||
|
Output written to results\rmt\compressed_tree_d8_b1_naive.json
|
||||||
|
TCAM bits: 15576
|
||||||
|
RAM bits: 2919
|
||||||
|
Processing compressed_tree_d8_b1.json with mode priority
|
||||||
|
Output written to results\rmt\compressed_tree_d8_b1_priority.json
|
||||||
|
TCAM bits: 13160
|
||||||
|
RAM bits: 2919
|
||||||
|
Processing compressed_tree_d8_b3.json with mode naive
|
||||||
|
Output written to results\rmt\compressed_tree_d8_b3_naive.json
|
||||||
|
TCAM bits: 11504
|
||||||
|
RAM bits: 2625
|
||||||
|
Processing compressed_tree_d8_b3.json with mode priority
|
||||||
|
Output written to results\rmt\compressed_tree_d8_b3_priority.json
|
||||||
|
TCAM bits: 10016
|
||||||
|
RAM bits: 2625
|
||||||
|
Processing compressed_tree_d9_b0.json with mode naive
|
||||||
|
Output written to results\rmt\compressed_tree_d9_b0_naive.json
|
||||||
|
TCAM bits: 22640
|
||||||
|
RAM bits: 4662
|
||||||
|
Processing compressed_tree_d9_b0.json with mode priority
|
||||||
|
Output written to results\rmt\compressed_tree_d9_b0_priority.json
|
||||||
|
TCAM bits: 18936
|
||||||
|
RAM bits: 4662
|
||||||
|
Processing compressed_tree_d9_b1.json with mode naive
|
||||||
|
Output written to results\rmt\compressed_tree_d9_b1_naive.json
|
||||||
|
TCAM bits: 22784
|
||||||
|
RAM bits: 4557
|
||||||
|
Processing compressed_tree_d9_b1.json with mode priority
|
||||||
|
Output written to results\rmt\compressed_tree_d9_b1_priority.json
|
||||||
|
TCAM bits: 19872
|
||||||
|
RAM bits: 4557
|
||||||
|
Processing compressed_tree_d9_b3.json with mode naive
|
||||||
|
Output written to results\rmt\compressed_tree_d9_b3_naive.json
|
||||||
|
TCAM bits: 16560
|
||||||
|
RAM bits: 3948
|
||||||
|
Processing compressed_tree_d9_b3.json with mode priority
|
||||||
|
Output written to results\rmt\compressed_tree_d9_b3_priority.json
|
||||||
|
TCAM bits: 14880
|
||||||
|
RAM bits: 3948
|
||||||
|
All runs complete.
|
||||||
|
Press any key to continue . . .
|
24
run/run.bat
Normal file
24
run/run.bat
Normal file
@@ -0,0 +1,24 @@
|
|||||||
|
@echo off
REM Sweep decision_tree.py over every depth/bits combination listed below and
REM write one tree JSON per combination into %OUTDIR%.
REM setlocal keeps the settings below from leaking into the calling shell.
setlocal

REM --- settings --------------------------------------------------------
REM Quoted "NAME=value" form prevents stray trailing whitespace from
REM becoming part of the value.
set "INPUT=..\data\combined\data.csv"
set "OUTDIR=results\tree"
set "DEPTH_LIST=1 2 3 4 5 6 7 8 9 10 11 12 13 14 15"
set "BITS_LIST=0 1 3"
set "PY=python"
REM ---------------------------------------------------------------------

REM Create the output directory on first use.
if not exist "%OUTDIR%" mkdir "%OUTDIR%"

for %%D in (%DEPTH_LIST%) do (
    for %%B in (%BITS_LIST%) do (
        echo Running depth=%%D bits=%%B
        %PY% decision_tree.py ^
            --input "%INPUT%" ^
            --output "%OUTDIR%\tree_d%%D_b%%B.json" ^
            --depth %%D ^
            --nudge --bits %%B
    )
)

echo All runs complete
pause
|
272
run/run.txt
Normal file
272
run/run.txt
Normal file
@@ -0,0 +1,272 @@
|
|||||||
|
Running depth=1 bits=0
|
||||||
|
dataset size: 7681108
|
||||||
|
train accuracy (before nudging): 0.6249802762830571
|
||||||
|
nudging enabled, removed bottom 0 bit(s) per threshold
|
||||||
|
train accuracy (after nudging): 0.6249802762830571
|
||||||
|
Wrote tree to C:\Users\jaipa\CS\cs216\IdealRMT-DecisionTrees\run\results\tree\tree_d1_b0.json
|
||||||
|
Running depth=1 bits=1
|
||||||
|
dataset size: 7681108
|
||||||
|
train accuracy (before nudging): 0.6249802762830571
|
||||||
|
nudging enabled, removed bottom 1 bit(s) per threshold
|
||||||
|
train accuracy (after nudging): 0.6249802762830571
|
||||||
|
Wrote tree to C:\Users\jaipa\CS\cs216\IdealRMT-DecisionTrees\run\results\tree\tree_d1_b1.json
|
||||||
|
Running depth=1 bits=3
|
||||||
|
dataset size: 7681108
|
||||||
|
train accuracy (before nudging): 0.6249802762830571
|
||||||
|
nudging enabled, removed bottom 3 bit(s) per threshold
|
||||||
|
train accuracy (after nudging): 0.6249802762830571
|
||||||
|
Wrote tree to C:\Users\jaipa\CS\cs216\IdealRMT-DecisionTrees\run\results\tree\tree_d1_b3.json
|
||||||
|
Running depth=2 bits=0
|
||||||
|
dataset size: 7681108
|
||||||
|
train accuracy (before nudging): 0.6329657127591488
|
||||||
|
nudging enabled, removed bottom 0 bit(s) per threshold
|
||||||
|
train accuracy (after nudging): 0.6329657127591488
|
||||||
|
Wrote tree to C:\Users\jaipa\CS\cs216\IdealRMT-DecisionTrees\run\results\tree\tree_d2_b0.json
|
||||||
|
Running depth=2 bits=1
|
||||||
|
dataset size: 7681108
|
||||||
|
train accuracy (before nudging): 0.6329657127591488
|
||||||
|
nudging enabled, removed bottom 1 bit(s) per threshold
|
||||||
|
train accuracy (after nudging): 0.632965582569598
|
||||||
|
Wrote tree to C:\Users\jaipa\CS\cs216\IdealRMT-DecisionTrees\run\results\tree\tree_d2_b1.json
|
||||||
|
Running depth=2 bits=3
|
||||||
|
dataset size: 7681108
|
||||||
|
train accuracy (before nudging): 0.6329657127591488
|
||||||
|
nudging enabled, removed bottom 3 bit(s) per threshold
|
||||||
|
train accuracy (after nudging): 0.632991490290203
|
||||||
|
Wrote tree to C:\Users\jaipa\CS\cs216\IdealRMT-DecisionTrees\run\results\tree\tree_d2_b3.json
|
||||||
|
Running depth=3 bits=0
|
||||||
|
dataset size: 7681108
|
||||||
|
train accuracy (before nudging): 0.6770542739406867
|
||||||
|
nudging enabled, removed bottom 0 bit(s) per threshold
|
||||||
|
train accuracy (after nudging): 0.6770542739406867
|
||||||
|
Wrote tree to C:\Users\jaipa\CS\cs216\IdealRMT-DecisionTrees\run\results\tree\tree_d3_b0.json
|
||||||
|
Running depth=3 bits=1
|
||||||
|
dataset size: 7681108
|
||||||
|
train accuracy (before nudging): 0.6770542739406867
|
||||||
|
nudging enabled, removed bottom 1 bit(s) per threshold
|
||||||
|
train accuracy (after nudging): 0.6770412549856089
|
||||||
|
Wrote tree to C:\Users\jaipa\CS\cs216\IdealRMT-DecisionTrees\run\results\tree\tree_d3_b1.json
|
||||||
|
Running depth=3 bits=3
|
||||||
|
dataset size: 7681108
|
||||||
|
train accuracy (before nudging): 0.6770542739406867
|
||||||
|
nudging enabled, removed bottom 3 bit(s) per threshold
|
||||||
|
train accuracy (after nudging): 0.6785083610333301
|
||||||
|
Wrote tree to C:\Users\jaipa\CS\cs216\IdealRMT-DecisionTrees\run\results\tree\tree_d3_b3.json
|
||||||
|
Running depth=4 bits=0
|
||||||
|
dataset size: 7681108
|
||||||
|
train accuracy (before nudging): 0.7785798611346175
|
||||||
|
nudging enabled, removed bottom 0 bit(s) per threshold
|
||||||
|
train accuracy (after nudging): 0.7785798611346175
|
||||||
|
Wrote tree to C:\Users\jaipa\CS\cs216\IdealRMT-DecisionTrees\run\results\tree\tree_d4_b0.json
|
||||||
|
Running depth=4 bits=1
|
||||||
|
dataset size: 7681108
|
||||||
|
train accuracy (before nudging): 0.7785798611346175
|
||||||
|
nudging enabled, removed bottom 1 bit(s) per threshold
|
||||||
|
train accuracy (after nudging): 0.7762147075656273
|
||||||
|
Wrote tree to C:\Users\jaipa\CS\cs216\IdealRMT-DecisionTrees\run\results\tree\tree_d4_b1.json
|
||||||
|
Running depth=4 bits=3
|
||||||
|
dataset size: 7681108
|
||||||
|
train accuracy (before nudging): 0.7785798611346175
|
||||||
|
nudging enabled, removed bottom 3 bit(s) per threshold
|
||||||
|
train accuracy (after nudging): 0.7764365505601536
|
||||||
|
Wrote tree to C:\Users\jaipa\CS\cs216\IdealRMT-DecisionTrees\run\results\tree\tree_d4_b3.json
|
||||||
|
Running depth=5 bits=0
|
||||||
|
dataset size: 7681108
|
||||||
|
train accuracy (before nudging): 0.8410252791654538
|
||||||
|
nudging enabled, removed bottom 0 bit(s) per threshold
|
||||||
|
train accuracy (after nudging): 0.8410252791654538
|
||||||
|
Wrote tree to C:\Users\jaipa\CS\cs216\IdealRMT-DecisionTrees\run\results\tree\tree_d5_b0.json
|
||||||
|
Running depth=5 bits=1
|
||||||
|
dataset size: 7681108
|
||||||
|
train accuracy (before nudging): 0.8410252791654538
|
||||||
|
nudging enabled, removed bottom 1 bit(s) per threshold
|
||||||
|
train accuracy (after nudging): 0.834092425207405
|
||||||
|
Wrote tree to C:\Users\jaipa\CS\cs216\IdealRMT-DecisionTrees\run\results\tree\tree_d5_b1.json
|
||||||
|
Running depth=5 bits=3
|
||||||
|
dataset size: 7681108
|
||||||
|
train accuracy (before nudging): 0.8410252791654538
|
||||||
|
nudging enabled, removed bottom 3 bit(s) per threshold
|
||||||
|
train accuracy (after nudging): 0.772544924508287
|
||||||
|
Wrote tree to C:\Users\jaipa\CS\cs216\IdealRMT-DecisionTrees\run\results\tree\tree_d5_b3.json
|
||||||
|
Running depth=6 bits=0
|
||||||
|
dataset size: 7681108
|
||||||
|
train accuracy (before nudging): 0.8646269522574087
|
||||||
|
nudging enabled, removed bottom 0 bit(s) per threshold
|
||||||
|
train accuracy (after nudging): 0.8646269522574087
|
||||||
|
Wrote tree to C:\Users\jaipa\CS\cs216\IdealRMT-DecisionTrees\run\results\tree\tree_d6_b0.json
|
||||||
|
Running depth=6 bits=1
|
||||||
|
dataset size: 7681108
|
||||||
|
train accuracy (before nudging): 0.8646269522574087
|
||||||
|
nudging enabled, removed bottom 1 bit(s) per threshold
|
||||||
|
train accuracy (after nudging): 0.8576925360247506
|
||||||
|
Wrote tree to C:\Users\jaipa\CS\cs216\IdealRMT-DecisionTrees\run\results\tree\tree_d6_b1.json
|
||||||
|
Running depth=6 bits=3
|
||||||
|
dataset size: 7681108
|
||||||
|
train accuracy (before nudging): 0.8646269522574087
|
||||||
|
nudging enabled, removed bottom 3 bit(s) per threshold
|
||||||
|
train accuracy (after nudging): 0.794651761178205
|
||||||
|
Wrote tree to C:\Users\jaipa\CS\cs216\IdealRMT-DecisionTrees\run\results\tree\tree_d6_b3.json
|
||||||
|
Running depth=7 bits=0
|
||||||
|
dataset size: 7681108
|
||||||
|
train accuracy (before nudging): 0.8806056365826389
|
||||||
|
nudging enabled, removed bottom 0 bit(s) per threshold
|
||||||
|
train accuracy (after nudging): 0.8806056365826389
|
||||||
|
Wrote tree to C:\Users\jaipa\CS\cs216\IdealRMT-DecisionTrees\run\results\tree\tree_d7_b0.json
|
||||||
|
Running depth=7 bits=1
|
||||||
|
dataset size: 7681108
|
||||||
|
train accuracy (before nudging): 0.8806056365826389
|
||||||
|
nudging enabled, removed bottom 1 bit(s) per threshold
|
||||||
|
train accuracy (after nudging): 0.8736095105029118
|
||||||
|
Wrote tree to C:\Users\jaipa\CS\cs216\IdealRMT-DecisionTrees\run\results\tree\tree_d7_b1.json
|
||||||
|
Running depth=7 bits=3
|
||||||
|
dataset size: 7681108
|
||||||
|
train accuracy (before nudging): 0.8806056365826389
|
||||||
|
nudging enabled, removed bottom 3 bit(s) per threshold
|
||||||
|
train accuracy (after nudging): 0.7695685309983924
|
||||||
|
Wrote tree to C:\Users\jaipa\CS\cs216\IdealRMT-DecisionTrees\run\results\tree\tree_d7_b3.json
|
||||||
|
Running depth=8 bits=0
|
||||||
|
dataset size: 7681108
|
||||||
|
train accuracy (before nudging): 0.8930218140403702
|
||||||
|
nudging enabled, removed bottom 0 bit(s) per threshold
|
||||||
|
train accuracy (after nudging): 0.8930218140403702
|
||||||
|
Wrote tree to C:\Users\jaipa\CS\cs216\IdealRMT-DecisionTrees\run\results\tree\tree_d8_b0.json
|
||||||
|
Running depth=8 bits=1
|
||||||
|
dataset size: 7681108
|
||||||
|
train accuracy (before nudging): 0.8930218140403702
|
||||||
|
nudging enabled, removed bottom 1 bit(s) per threshold
|
||||||
|
train accuracy (after nudging): 0.8853817704424934
|
||||||
|
Wrote tree to C:\Users\jaipa\CS\cs216\IdealRMT-DecisionTrees\run\results\tree\tree_d8_b1.json
|
||||||
|
Running depth=8 bits=3
|
||||||
|
dataset size: 7681108
|
||||||
|
train accuracy (before nudging): 0.8930218140403702
|
||||||
|
nudging enabled, removed bottom 3 bit(s) per threshold
|
||||||
|
train accuracy (after nudging): 0.7773965683075931
|
||||||
|
Wrote tree to C:\Users\jaipa\CS\cs216\IdealRMT-DecisionTrees\run\results\tree\tree_d8_b3.json
|
||||||
|
Running depth=9 bits=0
|
||||||
|
dataset size: 7681108
|
||||||
|
train accuracy (before nudging): 0.9065990219119429
|
||||||
|
nudging enabled, removed bottom 0 bit(s) per threshold
|
||||||
|
train accuracy (after nudging): 0.9065990219119429
|
||||||
|
Wrote tree to C:\Users\jaipa\CS\cs216\IdealRMT-DecisionTrees\run\results\tree\tree_d9_b0.json
|
||||||
|
Running depth=9 bits=1
|
||||||
|
dataset size: 7681108
|
||||||
|
train accuracy (before nudging): 0.9065990219119429
|
||||||
|
nudging enabled, removed bottom 1 bit(s) per threshold
|
||||||
|
train accuracy (after nudging): 0.8971600191014109
|
||||||
|
Wrote tree to C:\Users\jaipa\CS\cs216\IdealRMT-DecisionTrees\run\results\tree\tree_d9_b1.json
|
||||||
|
Running depth=9 bits=3
|
||||||
|
dataset size: 7681108
|
||||||
|
train accuracy (before nudging): 0.9065990219119429
|
||||||
|
nudging enabled, removed bottom 3 bit(s) per threshold
|
||||||
|
train accuracy (after nudging): 0.7901483744272311
|
||||||
|
Wrote tree to C:\Users\jaipa\CS\cs216\IdealRMT-DecisionTrees\run\results\tree\tree_d9_b3.json
|
||||||
|
Running depth=10 bits=0
|
||||||
|
dataset size: 7681108
|
||||||
|
train accuracy (before nudging): 0.9131070673658019
|
||||||
|
nudging enabled, removed bottom 0 bit(s) per threshold
|
||||||
|
train accuracy (after nudging): 0.9131070673658019
|
||||||
|
Wrote tree to C:\Users\jaipa\CS\cs216\IdealRMT-DecisionTrees\run\results\tree\tree_d10_b0.json
|
||||||
|
Running depth=10 bits=1
|
||||||
|
dataset size: 7681108
|
||||||
|
train accuracy (before nudging): 0.9131070673658019
|
||||||
|
nudging enabled, removed bottom 1 bit(s) per threshold
|
||||||
|
train accuracy (after nudging): 0.9012124292484887
|
||||||
|
Wrote tree to C:\Users\jaipa\CS\cs216\IdealRMT-DecisionTrees\run\results\tree\tree_d10_b1.json
|
||||||
|
Running depth=10 bits=3
|
||||||
|
dataset size: 7681108
|
||||||
|
train accuracy (before nudging): 0.9131070673658019
|
||||||
|
nudging enabled, removed bottom 3 bit(s) per threshold
|
||||||
|
train accuracy (after nudging): 0.7823837394292594
|
||||||
|
Wrote tree to C:\Users\jaipa\CS\cs216\IdealRMT-DecisionTrees\run\results\tree\tree_d10_b3.json
|
||||||
|
Running depth=11 bits=0
|
||||||
|
dataset size: 7681108
|
||||||
|
train accuracy (before nudging): 0.9167131877328115
|
||||||
|
nudging enabled, removed bottom 0 bit(s) per threshold
|
||||||
|
train accuracy (after nudging): 0.9167131877328115
|
||||||
|
Wrote tree to C:\Users\jaipa\CS\cs216\IdealRMT-DecisionTrees\run\results\tree\tree_d11_b0.json
|
||||||
|
Running depth=11 bits=1
|
||||||
|
dataset size: 7681108
|
||||||
|
train accuracy (before nudging): 0.9167131877328115
|
||||||
|
nudging enabled, removed bottom 1 bit(s) per threshold
|
||||||
|
train accuracy (after nudging): 0.9033505322409215
|
||||||
|
Wrote tree to C:\Users\jaipa\CS\cs216\IdealRMT-DecisionTrees\run\results\tree\tree_d11_b1.json
|
||||||
|
Running depth=11 bits=3
|
||||||
|
dataset size: 7681108
|
||||||
|
train accuracy (before nudging): 0.9167131877328115
|
||||||
|
nudging enabled, removed bottom 3 bit(s) per threshold
|
||||||
|
train accuracy (after nudging): 0.7834850128392935
|
||||||
|
Wrote tree to C:\Users\jaipa\CS\cs216\IdealRMT-DecisionTrees\run\results\tree\tree_d11_b3.json
|
||||||
|
Running depth=12 bits=0
|
||||||
|
dataset size: 7681108
|
||||||
|
train accuracy (before nudging): 0.9190772997853955
|
||||||
|
nudging enabled, removed bottom 0 bit(s) per threshold
|
||||||
|
train accuracy (after nudging): 0.9190772997853955
|
||||||
|
Wrote tree to C:\Users\jaipa\CS\cs216\IdealRMT-DecisionTrees\run\results\tree\tree_d12_b0.json
|
||||||
|
Running depth=12 bits=1
|
||||||
|
dataset size: 7681108
|
||||||
|
train accuracy (before nudging): 0.9190772997853955
|
||||||
|
nudging enabled, removed bottom 1 bit(s) per threshold
|
||||||
|
train accuracy (after nudging): 0.9050692946902973
|
||||||
|
Wrote tree to C:\Users\jaipa\CS\cs216\IdealRMT-DecisionTrees\run\results\tree\tree_d12_b1.json
|
||||||
|
Running depth=12 bits=3
|
||||||
|
dataset size: 7681108
|
||||||
|
train accuracy (before nudging): 0.9190772997853955
|
||||||
|
nudging enabled, removed bottom 3 bit(s) per threshold
|
||||||
|
train accuracy (after nudging): 0.7733082258445005
|
||||||
|
Wrote tree to C:\Users\jaipa\CS\cs216\IdealRMT-DecisionTrees\run\results\tree\tree_d12_b3.json
|
||||||
|
Running depth=13 bits=0
|
||||||
|
dataset size: 7681108
|
||||||
|
train accuracy (before nudging): 0.9210431620021486
|
||||||
|
nudging enabled, removed bottom 0 bit(s) per threshold
|
||||||
|
train accuracy (after nudging): 0.9210431620021486
|
||||||
|
Wrote tree to C:\Users\jaipa\CS\cs216\IdealRMT-DecisionTrees\run\results\tree\tree_d13_b0.json
|
||||||
|
Running depth=13 bits=1
|
||||||
|
dataset size: 7681108
|
||||||
|
train accuracy (before nudging): 0.9210431620021486
|
||||||
|
nudging enabled, removed bottom 1 bit(s) per threshold
|
||||||
|
train accuracy (after nudging): 0.9069113466442602
|
||||||
|
Wrote tree to C:\Users\jaipa\CS\cs216\IdealRMT-DecisionTrees\run\results\tree\tree_d13_b1.json
|
||||||
|
Running depth=13 bits=3
|
||||||
|
dataset size: 7681108
|
||||||
|
train accuracy (before nudging): 0.9210431620021486
|
||||||
|
nudging enabled, removed bottom 3 bit(s) per threshold
|
||||||
|
train accuracy (after nudging): 0.7656775558942799
|
||||||
|
Wrote tree to C:\Users\jaipa\CS\cs216\IdealRMT-DecisionTrees\run\results\tree\tree_d13_b3.json
|
||||||
|
Running depth=14 bits=0
|
||||||
|
dataset size: 7681108
|
||||||
|
train accuracy (before nudging): 0.9232170671210456
|
||||||
|
nudging enabled, removed bottom 0 bit(s) per threshold
|
||||||
|
train accuracy (after nudging): 0.9232170671210456
|
||||||
|
Wrote tree to C:\Users\jaipa\CS\cs216\IdealRMT-DecisionTrees\run\results\tree\tree_d14_b0.json
|
||||||
|
Running depth=14 bits=1
|
||||||
|
dataset size: 7681108
|
||||||
|
train accuracy (before nudging): 0.9232169369314948
|
||||||
|
nudging enabled, removed bottom 1 bit(s) per threshold
|
||||||
|
train accuracy (after nudging): 0.9071005120615411
|
||||||
|
Wrote tree to C:\Users\jaipa\CS\cs216\IdealRMT-DecisionTrees\run\results\tree\tree_d14_b1.json
|
||||||
|
Running depth=14 bits=3
|
||||||
|
dataset size: 7681108
|
||||||
|
train accuracy (before nudging): 0.9232170671210456
|
||||||
|
nudging enabled, removed bottom 3 bit(s) per threshold
|
||||||
|
train accuracy (after nudging): 0.7649352150757417
|
||||||
|
Wrote tree to C:\Users\jaipa\CS\cs216\IdealRMT-DecisionTrees\run\results\tree\tree_d14_b3.json
|
||||||
|
Running depth=15 bits=0
|
||||||
|
dataset size: 7681108
|
||||||
|
train accuracy (before nudging): 0.9249752770043072
|
||||||
|
nudging enabled, removed bottom 0 bit(s) per threshold
|
||||||
|
train accuracy (after nudging): 0.9249752770043072
|
||||||
|
Wrote tree to C:\Users\jaipa\CS\cs216\IdealRMT-DecisionTrees\run\results\tree\tree_d15_b0.json
|
||||||
|
Running depth=15 bits=1
|
||||||
|
dataset size: 7681108
|
||||||
|
train accuracy (before nudging): 0.9249752770043072
|
||||||
|
nudging enabled, removed bottom 1 bit(s) per threshold
|
||||||
|
train accuracy (after nudging): 0.908089692268355
|
||||||
|
Wrote tree to C:\Users\jaipa\CS\cs216\IdealRMT-DecisionTrees\run\results\tree\tree_d15_b1.json
|
||||||
|
Running depth=15 bits=3
|
||||||
|
dataset size: 7681108
|
||||||
|
train accuracy (before nudging): 0.9249752770043072
|
||||||
|
nudging enabled, removed bottom 3 bit(s) per threshold
|
||||||
|
train accuracy (after nudging): 0.762985496363285
|
||||||
|
Wrote tree to C:\Users\jaipa\CS\cs216\IdealRMT-DecisionTrees\run\results\tree\tree_d15_b3.json
|
||||||
|
All runs complete
|
||||||
|
Press any key to continue . . .
|
173
run/tree_compress.py
Normal file
173
run/tree_compress.py
Normal file
@@ -0,0 +1,173 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
"""Batch‑compress decision‑tree JSON files.
|
||||||
|
|
||||||
|
This script preserves the original logic but loops over every *.json file
|
||||||
|
in results/tree and drops a corresponding compressed file in
|
||||||
|
results/compressed_tree.
|
||||||
|
|
||||||
|
Example:
|
||||||
|
$ python tree_compress.py
|
||||||
|
"""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import json
|
||||||
|
import math
|
||||||
|
import os
|
||||||
|
from collections import defaultdict
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
# Directory scanned for input tree JSON files, and directory receiving the
# compressed results (created up front, including any missing parents).
_RESULTS_ROOT = Path("results")
INPUT_DIR = _RESULTS_ROOT / "tree"
OUTPUT_DIR = _RESULTS_ROOT / "compressed_tree"
OUTPUT_DIR.mkdir(exist_ok=True, parents=True)
|
||||||
|
|
||||||
|
|
||||||
|
class SetEncoder(json.JSONEncoder):
    """JSON encoder that writes ``set`` objects as JSON arrays."""

    def default(self, obj):  # type: ignore[override]
        """Return a list for sets; defer every other type to the base encoder."""
        if not isinstance(obj, set):
            # The base implementation raises TypeError for unsupported types.
            return super().default(obj)
        return [*obj]
|
||||||
|
|
||||||
|
|
||||||
|
# helper function given a range and value x returns if x is in the range
|
||||||
|
|
||||||
|
def is_in_range(x: int, lower: int | None, upper: int | None) -> bool:  # noqa: N803
    """Return whether ``x`` lies in the range ``(lower, upper]``.

    The lower bound is exclusive and the upper bound is inclusive. A bound of
    ``None`` means that side of the range is unbounded.
    """
    above_lower = lower is None or x > lower
    within_upper = upper is None or x <= upper
    return above_lower and within_upper
|
||||||
|
|
||||||
|
|
||||||
|
# Compress every tree JSON found in INPUT_DIR. Files are visited in sorted
# order so that repeated runs process them in the same sequence.
for tree_path in sorted(INPUT_DIR.glob("*.json")):
    with tree_path.open() as f:
        tree = json.load(f)

    paths = tree["paths"]
    path_classes = tree["classes"]

    # Give every path a numeric id and round each threshold down to an integer.
    # Both "<=" and ">" conditions are floored.
    path_ids: set[int] = set()
    for idx, path in enumerate(paths):
        path["id"] = idx
        path_ids.add(idx)
        for condition in path["conditions"]:
            condition["value"] = math.floor(condition["value"])

    # Collect the distinct thresholds used for each feature, then sort them so
    # consecutive breakpoints delimit disjoint ranges.
    breakpoint_sets = defaultdict(set)
    for path in paths:
        for condition in path["conditions"]:
            breakpoint_sets[condition["feature"]].add(condition["value"])
    breakpoints: dict[str, list[int]] = {
        feature: sorted(values) for feature, values in breakpoint_sets.items()
    }

    # Collapse each path's conditions into one (min, max] interval per feature:
    # the tightest "<=" bound becomes "max", the tightest ">" bound becomes "min".
    for path in paths:
        compressed = {feature: {"min": None, "max": None} for feature in breakpoints}
        for condition in path["conditions"]:
            bounds = compressed[condition["feature"]]
            operation = condition["operation"]
            value = condition["value"]
            if operation == "<=":
                if bounds["max"] is None or value < bounds["max"]:
                    bounds["max"] = value
            elif operation == ">":
                if bounds["min"] is None or value > bounds["min"]:
                    bounds["min"] = value
        path["compressed"] = compressed

    # One bucket per range: len(points) breakpoints delimit len(points) + 1 ranges.
    buckets_id: dict[str, list[set[int]]] = {
        feature: [set() for _ in range(len(points) + 1)]
        for feature, points in breakpoints.items()
    }
    buckets_class: dict[str, list[set[str]]] = {
        feature: [set() for _ in range(len(points) + 1)]
        for feature, points in breakpoints.items()
    }

    # Record, for every range of every feature, which paths (and classes) can
    # still be reached when the feature value falls in that range.
    for path in paths:
        pid = path["id"]
        cls = path["classification"]
        for feature_name, bounds in path["compressed"].items():
            lower = bounds["min"]
            upper = bounds["max"]
            points = breakpoints[feature_name]

            for idx, bp in enumerate(points):
                if is_in_range(bp, lower, upper):
                    buckets_id[feature_name][idx].add(pid)
                    buckets_class[feature_name][idx].add(cls)

            # last bucket (> last breakpoint)
            # NOTE(review): the flattened source lost its indentation; this check
            # is placed after the breakpoint loop, matching its comment - confirm.
            if is_in_range(points[-1] + 1, lower, upper):
                buckets_id[feature_name][-1].add(pid)
                buckets_class[feature_name][-1].add(cls)

    # Combine breakpoints and buckets into one list of ranges per feature.
    # Padding the breakpoints with None on both sides yields the open-ended
    # first and last ranges without special-casing them.
    compressed_layers: dict[str, list[dict[str, object]]] = {}
    for feature_name, id_buckets in buckets_id.items():
        edges = [None, *breakpoints[feature_name], None]
        compressed_layers[feature_name] = [
            {
                "min": edges[i],
                "max": edges[i + 1],
                "paths": id_buckets[i],
                "classes": buckets_class[feature_name][i],
            }
            for i in range(len(id_buckets))
        ]

    path_to_class = {path["id"]: path["classification"] for path in paths}

    compressed_tree = {
        "paths": list(path_ids),
        "classes": path_classes,
        "layers": compressed_layers,
        "path_to_class": path_to_class,
    }

    # Output name mirrors the input name with "tree" replaced by "compressed_tree".
    out_path = OUTPUT_DIR / tree_path.name.replace("tree", "compressed_tree")
    with out_path.open("w") as f_out:
        # SetEncoder is needed because the buckets are sets.
        json.dump(compressed_tree, f_out, indent=4, cls=SetEncoder)
|
||||||
|
|
||||||
|
# print(f"Wrote {out_path.relative_to(Path.cwd())}")
|
279
run/tree_to_rmt.py
Normal file
279
run/tree_to_rmt.py
Normal file
@@ -0,0 +1,279 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
"""Range‑to‑Prefix evaluation tool
|
||||||
|
|
||||||
|
This script keeps the original logic intact while letting you choose
|
||||||
|
which expansion strategy to run via a command‑line flag.
|
||||||
|
|
||||||
|
Example:
|
||||||
|
$ python tree_to_rmt.py --mode naive
|
||||||
|
$ python tree_to_rmt.py --mode priority --input mytree.json --output result.json
|
||||||
|
"""
|
||||||
|
|
||||||
|
import argparse
|
||||||
|
import json
|
||||||
|
import math
|
||||||
|
import sys
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Static configuration
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Key width in bits used for each feature's match field.
field_width = dict(src=16, dst=16, protocol=8)
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Helper routines (unchanged)
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
def int_to_bin(i, width):
    """Return the binary digits of ``i`` left-padded with zeros to ``width`` characters."""
    digits = bin(i)[2:]  # drop the "0b" prefix
    return digits.rjust(width, "0")
|
||||||
|
|
||||||
|
|
||||||
|
def increment_dc(pfx):
    """Return ``pfx`` with the character just before its first ``*`` replaced by ``*``.

    When ``pfx`` contains no ``*``, its last character is replaced instead.
    """
    first_star = pfx.find("*")
    target = (len(pfx) if first_star == -1 else first_star) - 1
    return f"{pfx[:target]}*{pfx[target + 1:]}"
|
||||||
|
|
||||||
|
|
||||||
|
def can_merge(pfx_a, pfx_b):
    """Return whether two prefixes differ only in their last non-wildcard bit.

    Wildcards (``*``) are stripped first; the remaining bits must share every
    character except the final one, and the final characters must differ.
    """
    bits_a, bits_b = (p.replace("*", "") for p in (pfx_a, pfx_b))
    if bits_a[:-1] != bits_b[:-1]:
        return False
    return bits_a[-1] != bits_b[-1]
|
||||||
|
|
||||||
|
|
||||||
|
def merge(pfx_a, prefixes):
    """Fold ``pfx_a`` into the tail of ``prefixes`` and cascade merges backwards.

    The last entry of ``prefixes`` is overwritten with the widened form of
    ``pfx_a``. The list is then walked from the second-to-last entry down to
    the first; whenever an entry can merge with its successor, the final list
    element is popped and the entry is widened by one wildcard. ``prefixes``
    is modified in place and nothing is returned.
    """
    prefixes[-1] = increment_dc(pfx_a)

    # Start index is fixed before any pops, as in a precomputed range().
    pos = len(prefixes) - 2
    while pos >= 0:
        if can_merge(prefixes[pos], prefixes[pos + 1]):
            prefixes.pop()
            prefixes[pos] = increment_dc(prefixes[pos])
        pos -= 1
|
||||||
|
|
||||||
|
|
||||||
|
def convert_range(lower, upper, width):
    """Expand the integer range ``lower..upper`` into a list of ternary prefixes.

    Values are visited one at a time, starting at ``lower`` and ending at
    ``upper`` clamped to ``2 ** width - 1``. Each value is rendered as a
    ``width``-bit string and either merged into the running prefix list or
    appended to it.
    """
    result = [int_to_bin(lower, width)]
    last_value = min(upper, 2 ** width - 1)

    for value in range(lower + 1, last_value + 1):
        candidate = int_to_bin(value, width)
        if not can_merge(candidate, result[-1]):
            result.append(candidate)
            continue
        merge(candidate, result)

    return result
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# RMT construction strategies (logic preserved)
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
def worst_case_rmt(tree):
    """Build the RMT description assuming worst-case prefix expansion.

    Every range is assumed to need ``2 * k`` prefixes for a ``k``-bit field,
    so no actual range-to-prefix conversion is performed.

    Args:
        tree: Compressed tree mapping with a ``"layers"`` entry (feature name
            to list of ranges) and a ``"classes"`` entry.

    Returns:
        A tuple ``(rmt, tcam_bits, ram_bits)``: the list of table descriptions
        (one ternary and one exact table per layer) and the total TCAM and RAM
        bit counts.
    """
    # Read the inputs from the argument instead of relying on module globals
    # that only exist after main() has run.
    layers = tree["layers"]
    classes = tree["classes"]

    rmt = []
    tcam_bits = 0
    ram_bits = 0

    for step, (layer, ranges) in enumerate(layers.items()):
        num_ranges = len(ranges)
        prefix_width = field_width[layer]
        # assume that each range requires all of 2*k prefixes when performing prefix expansion
        # therefore there are 2*k * R for R ranges and width k
        num_prefixes = 2 * prefix_width * num_ranges

        tcam = {
            "id": f"{layer}_range",
            "step": step,
            "match": "ternary",
            "entries": num_prefixes,
            "key_size": prefix_width,
        }
        tcam_bits += num_prefixes * prefix_width

        # assume basic pointer reuse for metadata storage
        ram = {
            "id": f"{layer}_meta",
            "step": step,
            "match": "exact",
            "method": "index",
            "key_size": math.ceil(math.log2(num_ranges)),
            "data_size": len(classes),
        }
        ram_bits += num_ranges * len(classes)

        rmt.append(tcam)
        rmt.append(ram)

    return rmt, tcam_bits, ram_bits
|
||||||
|
|
||||||
|
|
||||||
|
def naive_rmt(tree):
    """Build the RMT description using naive range-to-prefix expansion.

    Each range of each layer is expanded with ``convert_range`` and the
    resulting prefixes are stored back on the range under ``"prefixes"``.
    Open-ended bounds are filled in place: a missing ``"min"`` becomes ``0``
    and a missing ``"max"`` becomes ``2 ** width``.

    Args:
        tree: Compressed tree mapping with a ``"layers"`` entry (feature name
            to list of ranges) and a ``"classes"`` entry. The range
            dictionaries are modified in place.

    Returns:
        A tuple ``(rmt, tcam_bits, ram_bits)``: the list of table descriptions
        (one ternary and one exact table per layer) and the total TCAM and RAM
        bit counts.
    """
    # Read the inputs from the argument instead of relying on module globals
    # that only exist after main() has run.
    layers = tree["layers"]
    classes = tree["classes"]

    rmt = []
    tcam_bits = 0
    ram_bits = 0

    for step, (layer, ranges) in enumerate(layers.items()):
        num_prefixes = 0
        prefix_width = field_width[layer]

        # for each range in the layer, convert the ranges to prefixes using naive range expansion
        for r in ranges:
            # Independent checks so a range open on both sides gets both bounds.
            if r["min"] is None:
                r["min"] = 0
            if r["max"] is None:
                r["max"] = 2 ** prefix_width
            prefixes = convert_range(r["min"], r["max"], prefix_width)
            r["prefixes"] = prefixes
            num_prefixes += len(prefixes)
            tcam_bits += len(prefixes) * prefix_width

        tcam = {
            "id": f"{layer}_range",
            "step": step,
            "match": "ternary",
            "entries": num_prefixes,
            "key_size": prefix_width,
            "ranges": ranges,
        }

        num_ranges = len(ranges)
        # assume no pointer reuse for metadata storage
        ram = {
            "id": f"{layer}_meta",
            "step": step,
            "match": "exact",
            "method": "index",
            "key_size": math.ceil(math.log2(num_ranges)),
            "data_size": len(classes),
        }
        ram_bits += num_ranges * len(classes)

        rmt.append(tcam)
        rmt.append(ram)

    return rmt, tcam_bits, ram_bits
|
||||||
|
|
||||||
|
|
||||||
|
def priority_aware(tree):
    """Build the RMT description using priority-aware prefix expansion.

    For every range two expansions are computed with ``convert_range``: the
    regular one from ``min`` to ``max`` and one starting at ``0``. The shorter
    list is kept (the regular one wins ties) and stored on the range under
    ``"prefixes"``, with ``"prefix_type"`` set to ``"exact"`` or ``"zero"``.
    Open-ended bounds are filled in place: a missing ``"min"`` becomes ``0``
    and a missing ``"max"`` becomes ``2 ** width``.

    Args:
        tree: Compressed tree mapping with a ``"layers"`` entry (feature name
            to list of ranges) and a ``"classes"`` entry. The range
            dictionaries are modified in place.

    Returns:
        A tuple ``(rmt, tcam_bits, ram_bits)``: the list of table descriptions
        (one ternary and one exact table per layer) and the total TCAM and RAM
        bit counts.
    """
    # Read the inputs from the argument instead of relying on module globals
    # that only exist after main() has run.
    layers = tree["layers"]
    classes = tree["classes"]

    rmt = []
    tcam_bits = 0
    ram_bits = 0

    for step, (layer, ranges) in enumerate(layers.items()):
        num_prefixes = 0
        prefix_width = field_width[layer]

        # for each range, run the regular prefix expansion, and also the prefix expansion setting the minimum to 0
        # then check which set of prefixes would be better
        # we will assume the ranges are already disjoint and in the correct order
        for r in ranges:
            # Independent checks so a range open on both sides gets both bounds.
            if r["min"] is None:
                r["min"] = 0
            if r["max"] is None:
                r["max"] = 2 ** prefix_width

            regular_prefixes = convert_range(r["min"], r["max"], prefix_width)
            zero_start_prefixes = convert_range(0, r["max"], prefix_width)

            if len(regular_prefixes) <= len(zero_start_prefixes):
                pfx_type = "exact"
                prefixes = regular_prefixes
            else:
                pfx_type = "zero"
                prefixes = zero_start_prefixes

            r["prefixes"] = prefixes
            r["prefix_type"] = pfx_type
            num_prefixes += len(prefixes)
            tcam_bits += len(prefixes) * prefix_width

        tcam = {
            "id": f"{layer}_range",
            "step": step,
            "match": "ternary",
            "entries": num_prefixes,
            "key_size": prefix_width,
            "ranges": ranges,
        }

        num_ranges = len(ranges)
        # assume no pointer reuse for metadata storage
        ram = {
            "id": f"{layer}_meta",
            "step": step,
            "match": "exact",
            "method": "index",
            "key_size": math.ceil(math.log2(num_ranges)),
            "data_size": len(classes),
        }
        ram_bits += num_ranges * len(classes)

        rmt.append(tcam)
        rmt.append(ram)

    return rmt, tcam_bits, ram_bits
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Main entry point
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
def parse_args() -> argparse.Namespace:
    """Parse the command-line options of the RMT memory evaluation script.

    Returns:
        Namespace with ``mode`` (one of ``worst``, ``naive``, ``priority``),
        ``input`` (path of the tree JSON file) and ``output`` (path of the
        RMT JSON file, or ``None`` to let the caller pick the per-mode
        default name).
    """
    parser = argparse.ArgumentParser(
        description="Evaluate RMT memory usage for different range‑to‑prefix strategies."
    )
    parser.add_argument(
        "--mode",
        choices=["worst", "naive", "priority"],
        default="worst",
        help="Strategy to use",
    )
    parser.add_argument(
        "--input",
        default="compressed_tree.json",
        help="Input tree JSON file",
    )
    # The per-mode defaults are not uniformly "<mode>_rmt.json", so the help
    # text lists the names that are actually used when --output is omitted.
    parser.add_argument(
        "--output",
        default=None,
        help=(
            "Output RMT JSON file (defaults to worst_case_rmt.json, "
            "naive_rmt.json or priority_aware.json depending on --mode)"
        ),
    )
    return parser.parse_args()
|
||||||
|
|
||||||
|
|
||||||
|
def main() -> None:
    """Load the tree, build the RMT layout for the chosen mode and save it.

    Reads the JSON tree named by ``--input``, publishes its ``layers`` and
    ``classes`` entries as module globals, runs the builder selected by
    ``--mode``, writes the resulting RMT description as JSON and prints the
    TCAM / RAM bit totals.  Exits with a message if the input file is missing.
    """
    args = parse_args()

    # The builder functions read these module-level names, so they are
    # (re)bound here rather than passed around.
    global layers, classes

    try:
        with open(args.input) as tree_file:
            tree = json.load(tree_file)
    except FileNotFoundError:
        sys.exit(f"Input file '{args.input}' not found.")

    layers = tree["layers"]
    classes = tree["classes"]

    # Pick the builder and its default output name; only the selected
    # builder is looked up.
    mode = args.mode
    if mode == "worst":
        builder, default_out = worst_case_rmt, "worst_case_rmt.json"
    elif mode == "naive":
        builder, default_out = naive_rmt, "naive_rmt.json"
    else:  # priority
        builder, default_out = priority_aware, "priority_aware.json"

    rmt, tcam_bits, ram_bits = builder(tree)

    out_file = args.output if args.output else default_out

    with open(out_file, "w") as rmt_file:
        json.dump(rmt, rmt_file, indent=4)

    #! command python3 ideal-rmt-simulator/sim.py {out_file}
    for line in (
        f"Output written to {out_file}",
        f"TCAM bits: {tcam_bits}",
        f"RAM bits: {ram_bits}",
    ):
        print(line)
|
||||||
|
|
||||||
|
|
||||||
|
# Run the command-line entry point only when executed as a script,
# not when this module is imported.
if __name__ == "__main__":
    main()
|
44
sanity_check/csvdiff.py
Normal file
44
sanity_check/csvdiff.py
Normal file
@@ -0,0 +1,44 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
"""
|
||||||
|
csvdiff.py file1.csv file2.csv
|
||||||
|
Streams both files; prints the first differing line or
|
||||||
|
‘No differences found’. Uses O(1) memory.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import sys
|
||||||
|
from itertools import zip_longest
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
def open_checked(p: str):
    """Open *p* for text reading and return ``(file_handle, Path)``.

    The path is echoed to stdout first.  The file is opened with
    ``newline=""`` so line endings reach the caller untranslated.  If the
    file does not exist the process exits with an error message.
    """
    print(p)
    target = Path(p)
    try:
        handle = target.open("r", newline="")
    except FileNotFoundError:
        sys.exit(f"Error: {target} not found")
    return handle, target
|
||||||
|
|
||||||
|
def human(n: int) -> str:
    """Format *n* with comma thousands separators (e.g. 1234567 -> '1,234,567')."""
    return format(n, ",")
|
||||||
|
|
||||||
|
def main(a_path: str, b_path: str) -> None:
    """Compare two files line by line and report the first difference.

    Both files are streamed in lockstep.  At the first mismatch the 1-based
    line number is printed, followed either by which file ended early or by
    the two differing lines.  If no mismatch is found,
    'No differences found' is printed.
    """
    fa, a = open_checked(a_path)
    fb, b = open_checked(b_path)

    with fa, fb:
        line_no = 0
        # zip_longest pads the shorter file with None, which marks
        # "this file ended early".
        for ra, rb in zip_longest(fa, fb):
            line_no += 1
            if ra == rb:
                continue

            print(f"Files differ at line {human(line_no)}")
            if ra is None:
                print(f"{a} ended early")
            elif rb is None:
                print(f"{b} ended early")
            else:
                print(f"{a}: {ra.rstrip()}")
                print(f"{b}: {rb.rstrip()}")
            return

    print("No differences found")
|
||||||
|
|
||||||
|
# Script entry point: expects exactly two file paths on the command line.
if __name__ == "__main__":
    cli_paths = sys.argv[1:]
    if len(cli_paths) != 2:
        sys.exit("Usage: csvdiff.py file1.csv file2.csv")
    main(*cli_paths)
|
600
sanity_check/data_visualization.ipynb
Normal file
600
sanity_check/data_visualization.ipynb
Normal file
File diff suppressed because one or more lines are too long
206
sanity_check/diversity_metrics.py
Normal file
206
sanity_check/diversity_metrics.py
Normal file
@@ -0,0 +1,206 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
"""diversity_metrics.py (fast version)
|
||||||
|
|
||||||
|
Estimate how much diversity each CSV adds without building a giant in‑memory
|
||||||
|
DataFrame. Designed for IoT packet logs with millions of rows.
|
||||||
|
|
||||||
|
Quick summary printed as a GitHub‑style table (requires *tabulate*; falls back
|
||||||
|
to pandas plain text).
|
||||||
|
|
||||||
|
Usage
|
||||||
|
-----
|
||||||
|
python diversity_metrics.py path/to/processed_dir [-r] [--sample 50000]
|
||||||
|
|
||||||
|
Metrics
|
||||||
|
-------
|
||||||
|
ΔEntropy : change in Shannon entropy of *classification* counts
|
||||||
|
ΔGini : change in Gini impurity of the same counts
|
||||||
|
χ² p : Pearson χ² p‑value old vs new classification counts
|
||||||
|
Jaccard : similarity of unique (src,dst) pairs (0 → new pairs, 1 → no new)
|
||||||
|
KS src p : Kolmogorov–Smirnov p‑value, source‑port dist (uses sampling)
|
||||||
|
KS dst p : Kolmogorov–Smirnov p‑value, dest‑port dist (uses sampling)
|
||||||
|
|
||||||
|
Speed tricks
|
||||||
|
------------
|
||||||
|
* No growing DataFrame; we keep Counters / sets / lists.
|
||||||
|
* Ports for KS are *sampled* (default 50 k) to bound cost.
|
||||||
|
* (src,dst) pairs are hashed to a 32‑bit int to reduce set overhead.
|
||||||
|
* pandas reads via **pyarrow** engine when available.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import argparse
|
||||||
|
from pathlib import Path
|
||||||
|
from collections import Counter
|
||||||
|
from typing import List, Set
|
||||||
|
|
||||||
|
import numpy as np
|
||||||
|
import pandas as pd
|
||||||
|
from scipy.stats import chi2_contingency, ks_2samp, entropy
|
||||||
|
|
||||||
|
try:
|
||||||
|
from tabulate import tabulate
|
||||||
|
_USE_TABULATE = True
|
||||||
|
except ImportError:
|
||||||
|
_USE_TABULATE = False
|
||||||
|
|
||||||
|
# -----------------------------------------------------------------------------
|
||||||
|
# Helper metrics
|
||||||
|
# -----------------------------------------------------------------------------
|
||||||
|
|
||||||
|
def shannon(counts: Counter) -> float:
    """Return the Shannon entropy, in bits, of the frequencies in *counts*.

    An empty (or all-zero) counter yields 0.0.
    """
    total = sum(counts.values())
    if total == 0:
        return 0.0
    probabilities = np.array(list(counts.values()), dtype=float) / total
    return entropy(probabilities, base=2)
|
||||||
|
|
||||||
|
|
||||||
|
def gini(counts: Counter) -> float:
    """Return the Gini impurity (1 - sum of squared shares) of *counts*.

    An empty (or all-zero) counter yields 0.0.
    """
    total = sum(counts.values())
    if total == 0:
        return 0.0
    squared_shares = ((n / total) ** 2 for n in counts.values())
    return 1.0 - sum(squared_shares)
|
||||||
|
|
||||||
|
|
||||||
|
def jaccard(a: Set[int], b: Set[int]) -> float:
    """Return the Jaccard similarity |a ∩ b| / |a ∪ b| of two sets.

    Two empty sets are treated as identical and give 1.0.
    """
    union = a | b
    if not union:
        return 1.0
    return len(a & b) / len(union)
|
||||||
|
|
||||||
|
# -----------------------------------------------------------------------------
|
||||||
|
# Core analysis
|
||||||
|
# -----------------------------------------------------------------------------
|
||||||
|
|
||||||
|
def analyse(csv_files: List[Path], sample_size: int):
    """Return a list of dicts with diversity metrics for each added file.

    Files are processed in the given order.  Each file is compared against
    the cumulative state built from all files before it and is then folded
    into that state.

    Args:
        csv_files: CSV paths.  Each file must provide the columns
            ``protocl``, ``src``, ``dst`` and ``classfication`` (spelled as
            they appear in the data; they are renamed after loading).
        sample_size: Maximum number of port values drawn, without
            replacement, for each side of a KS test.

    Returns:
        One dict per file with the keys ``File``, ``Rows``, ``ΔEntropy``,
        ``ΔGini``, ``χ² p``, ``Jaccard``, ``KS src p`` and ``KS dst p``.
        p-values are formatted strings, or ``"NA"`` when there was nothing
        to compare (first file, or an empty side).
    """
    import importlib.util

    def _subsample(values: np.ndarray) -> np.ndarray:
        """Cap *values* at ``sample_size`` elements via random sampling."""
        if values.size > sample_size:
            return np.random.choice(values, sample_size, replace=False)
        return values

    # Use the pyarrow parser only when pandas is new enough AND pyarrow is
    # actually importable.  The version is compared numerically: a plain
    # string comparison would mis-order e.g. "10.0" and "2".
    major = pd.__version__.split(".", 1)[0]
    pandas_major = int(major) if major.isdigit() else 0
    has_pyarrow = importlib.util.find_spec("pyarrow") is not None
    engine = "pyarrow" if pandas_major >= 2 and has_pyarrow else "c"

    # cumulative state (no big DataFrame!)
    class_counter: Counter = Counter()
    pair_hashes: Set[int] = set()
    src_chunks: List[np.ndarray] = []
    dst_chunks: List[np.ndarray] = []
    empty_ports = np.empty(0, dtype=np.uint16)

    rows = []

    for csv_path in csv_files:
        df = pd.read_csv(
            csv_path,
            engine=engine,  # fast parse when available
            usecols=["protocl", "src", "dst", "classfication"],
            dtype={
                "protocl": "uint16",
                "protocol": "uint16",
                "src": "uint16",
                "dst": "uint16",
            },
        )
        # normalise column names
        df.rename(columns={"protocl": "protocol", "classfication": "classification"}, inplace=True)

        # Per-file views, computed once and reused by every metric below.
        new_class_counts = df["classification"].value_counts().to_dict()
        new_src = df["src"].to_numpy(dtype=np.uint16)
        new_dst = df["dst"].to_numpy(dtype=np.uint16)

        # snapshot previous state
        prev_class = class_counter.copy()
        prev_pairs = pair_hashes.copy()
        prev_src = np.concatenate(src_chunks) if src_chunks else empty_ports
        prev_dst = np.concatenate(dst_chunks) if dst_chunks else empty_ports

        # --- update cumulative structures ------------------------------------
        class_counter.update(new_class_counts)

        # Pack (src, dst) into one 32-bit int to save memory, and record the
        # pairs in the cumulative set so the Jaccard score reflects them.
        pair_ids = (new_src.astype(np.uint32) << np.uint32(16)) | new_dst.astype(np.uint32)
        pair_hashes.update(np.unique(pair_ids).tolist())

        # keep the ports as compact uint16 arrays
        src_chunks.append(new_src)
        dst_chunks.append(new_dst)

        # --- metrics ----------------------------------------------------------
        # χ² classification; skipped when either side has no observations,
        # because an all-zero row makes the expected frequencies zero.
        chi_p = np.nan
        if prev_class and new_class_counts:
            all_classes = list(set(prev_class) | set(new_class_counts))
            old = [prev_class.get(c, 0) for c in all_classes]
            new = [new_class_counts.get(c, 0) for c in all_classes]
            _, chi_p, _, _ = chi2_contingency([old, new])

        # entropy & gini deltas
        delta_entropy = shannon(class_counter) - shannon(prev_class)
        delta_gini = gini(class_counter) - gini(prev_class)

        # Jaccard between the pair set before and after adding this file
        jc = jaccard(prev_pairs, pair_hashes)

        # KS tests on sampled ports (both sides must be non-empty)
        ks_src_p = ks_dst_p = np.nan
        if prev_src.size and new_src.size:
            ks_src_p = ks_2samp(_subsample(prev_src), _subsample(new_src)).pvalue
            ks_dst_p = ks_2samp(_subsample(prev_dst), _subsample(new_dst)).pvalue

        rows.append(
            {
                "File": csv_path.name,
                "Rows": len(df),
                "ΔEntropy": round(delta_entropy, 4),
                "ΔGini": round(delta_gini, 4),
                "χ² p": f"{chi_p:.3g}" if not np.isnan(chi_p) else "NA",
                "Jaccard": round(jc, 3),
                "KS src p": f"{ks_src_p:.3g}" if not np.isnan(ks_src_p) else "NA",
                "KS dst p": f"{ks_dst_p:.3g}" if not np.isnan(ks_dst_p) else "NA",
            }
        )
    return rows
|
||||||
|
|
||||||
|
# -----------------------------------------------------------------------------
|
||||||
|
# CLI
|
||||||
|
# -----------------------------------------------------------------------------
|
||||||
|
|
||||||
|
def main():
    """Command-line entry point: analyse every CSV in a directory and print a table.

    Collects ``*.csv`` files (recursively with ``-r``), runs ``analyse`` on
    them in sorted order, prints the result with *tabulate* when it is
    installed (plain pandas text otherwise), then prints a short legend.
    """
    parser = argparse.ArgumentParser(description="Evaluate diversity contribution of each CSV (fast version).")
    parser.add_argument("csv_dir", help="Directory containing CSV files")
    parser.add_argument("-r", "--recursive", action="store_true", help="Recursively search csv_dir")
    parser.add_argument("--sample", type=int, default=50_000, help="Sample size for KS tests (default 50k)")
    args = parser.parse_args()

    if args.recursive:
        pattern = "**/*.csv"
    else:
        pattern = "*.csv"

    csv_files = sorted(Path(args.csv_dir).glob(pattern))
    if len(csv_files) == 0:
        print("No CSV files found.")
        return

    table_rows = analyse(csv_files, args.sample)

    if not _USE_TABULATE:
        rendered = pd.DataFrame(table_rows).to_string(index=False)
    else:
        rendered = tabulate(table_rows, headers="keys", tablefmt="github", floatfmt=".4f")
    print(rendered)

    print(
        "\nLegend:\n • p-values (χ², KS) < 0.05 → new file significantly shifts distribution (GOOD)"
        "\n • Positive ΔEntropy or ΔGini → richer mix; near 0 → little new info"
        "\n • Jaccard close to 0 → many unseen (src,dst) pairs; close to 1 → redundant."
    )
|
||||||
|
|
||||||
|
# Run the CLI only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
|
14
setup.sh
Normal file
14
setup.sh
Normal file
@@ -0,0 +1,14 @@
|
|||||||
|
#!/usr/bin/env bash
# Creates the directory layout:
# data/
#   tar/
#   pcap/
#   processed/
#   combined/

# Abort on errors, on unset variables and on failures inside pipelines.
set -euo pipefail

# Directory containing this script, so the layout is created next to it
# regardless of the caller's working directory.
root="$(cd -- "$(dirname "${BASH_SOURCE[0]}")" && pwd)"

# -p: create parents as needed and do not fail if a directory already exists.
mkdir -p "$root"/data/{tar,pcap,processed,combined}

echo "Directory structure ready under $root/data/"
|
Reference in New Issue
Block a user