mirror of https://github.com/ltcptgeneral/IdealRMT-DecisionTrees.git
synced 2025-09-07 15:57:23 +00:00

Compare commits
16 Commits
fadeab8a99 ... jai_runs

c8a0b18abf
2ad40946d1
50075b1acc
1585399c7d
8301998da3
3b2d6b3186
24fc2ed6f7
fda251f051
541538fcfe
afc882a569
6de3807fe2
fc16d3c586
7bee40ecf9
e811171a73
61a451b82d
c73de36c70
2  .gitattributes  vendored  Normal file

@@ -0,0 +1,2 @@
+# force LF for any shell script
+*.sh text eol=lf
5  .gitignore  vendored

@@ -1,3 +1,6 @@
 data.*
 __pycache__
 *.json
+data/*
+.DS_Store
+.ipynb_checkpoints/
152  CompressedTreeParser.ipynb  Normal file

@@ -0,0 +1,152 @@
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 138,
   "id": "938dec51",
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import pandas as pd\n",
    "import argparse\n",
    "from sklearn.tree import DecisionTreeClassifier, plot_tree, _tree\n",
    "from sklearn.metrics import accuracy_score\n",
    "from sklearn.tree import export_graphviz\n",
    "import pydotplus\n",
    "from matplotlib import pyplot as plt\n",
    "from labels import mac_to_label\n",
    "import json\n",
    "import math"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 139,
   "id": "442624c7",
   "metadata": {},
   "outputs": [],
   "source": [
    "Set1 = pd.read_csv('data/combined/data.csv').values.tolist()\n",
    "X = [i[0:3] for i in Set1]\n",
    "Y = [i[3] for i in Set1]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 142,
   "id": "12ad454d",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "{'0': 20, '1': 20, '2': 9, '3': 20, '4': 0, '5': 13, '6': 20, '7': 0, '8': 12, '9': 4, '10': 20, '11': 4, '12': 1, '13': 16, '14': 20, '15': 2, '16': 20, '17': 0, '18': 20, '19': 20, '20': 20, '21': 20, '22': 20, '23': 1, '24': 2, '25': 20, '26': 13, '27': 11, '28': 20, '29': 20}\n"
     ]
    }
   ],
   "source": [
    "predict_Yt = []\n",
    "index = 0\n",
    "\n",
    "with open('compressed_tree.json', 'r') as file:\n",
    "    data = json.load(file)\n",
    "    classes = data[\"classes\"]\n",
    "    for x in X:\n",
    "        counter = 0\n",
    "        class_set = []\n",
    "        paths_set = []\n",
    "        features = [\"protocol\", \"src\", \"dst\"]\n",
    "        for feature in features:\n",
    "            if feature in data[\"layers\"]:\n",
    "                for node in data['layers'][feature]:\n",
    "                    if node['min'] is None:\n",
    "                        if x[counter] <= node['max']:\n",
    "                            class_set.append(node['classes'])\n",
    "                            paths_set.append(node[\"paths\"])\n",
    "                            break #is this an issue?\n",
    "                        else:\n",
    "                            continue\n",
    "                    elif node['max'] is None:\n",
    "                        if node['min'] < x[counter]:\n",
    "                            class_set.append(node['classes'])\n",
    "                            paths_set.append(node[\"paths\"])\n",
    "                            break #is this an issue?\n",
    "                        else:\n",
    "                            continue\n",
    "                    elif node['min'] < x[counter] and x[counter] <= node['max']:\n",
    "                        class_set.append(node['classes'])\n",
    "                        paths_set.append(node[\"paths\"])\n",
    "                        break #is this an issue?\n",
    "\n",
    "            counter += 1\n",
    "        result = set(class_set[0])\n",
    "        paths = set(paths_set[0])\n",
    "        for s in class_set[1:]:\n",
    "            result.intersection_update(s)\n",
    "        for s in paths_set[1:]:\n",
    "            paths.intersection_update(s)\n",
    "\n",
    "        #predict_Yt.append(list(result))\n",
    "        #print(result)\n",
    "        if len(paths) != 1:\n",
    "            print(paths)\n",
    "            print(x)\n",
    "            print(result)\n",
    "        assert len(paths) == 1\n",
    "        path = list(paths)[0]\n",
    "        pred = data[\"path_to_class\"][str(path)]\n",
    "        pred_class = classes[pred]\n",
    "        predict_Yt.append(pred_class)\n",
    "\n",
    "        index += 1"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 143,
   "id": "8b4c56b6",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "0.8410252791654538\n"
     ]
    }
   ],
   "source": [
    "correct = 0\n",
    "for i in range(len(Y)):\n",
    "    prediction = predict_Yt[i]\n",
    "    if prediction != None and Y[i] == prediction:\n",
    "        correct += 1\n",
    "\n",
    "print(correct / len(Y))"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.12.7"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
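
The lookup above walks each feature's list of ranges, collects the matching node's classes and paths sets, and intersects them so that exactly one path id should survive. A minimal standalone sketch of the same logic, assuming the compressed_tree.json layout used here (half-open (min, max] bounds, None meaning unbounded; names are illustrative, not part of the repo):

    def classify(x, tree, features=("protocol", "src", "dst")):
        # Hedged sketch of the notebook's range-intersection lookup.
        surviving = None
        for value, feature in zip(x, features):
            if feature not in tree["layers"]:
                continue
            for node in tree["layers"][feature]:
                lo, hi = node["min"], node["max"]
                if (lo is None or lo < value) and (hi is None or value <= hi):
                    paths = set(node["paths"])
                    surviving = paths if surviving is None else surviving & paths
                    break  # first matching range wins, as in the cell above
        (path,) = surviving  # a well-formed table leaves exactly one path id
        return tree["classes"][tree["path_to_class"][str(path)]]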
File diff suppressed because one or more lines are too long

@@ -89,7 +89,7 @@
 ],
 "metadata": {
  "kernelspec": {
-  "display_name": "switch",
+  "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
@@ -103,7 +103,7 @@
  "name": "python",
  "nbconvert_exporter": "python",
  "pygments_lexer": "ipython3",
- "version": "3.12.7"
+ "version": "3.12.9"
 }
 },
 "nbformat": 4,
10  README.md

@@ -2,17 +2,21 @@
 
 Run `pip install -r requirements.txt`
 
 Run `setup.sh`
 
 # Tree Generation
 
 ## Download Dataset
 
-Download the *September 22 2016* dataset from: https://iotanalytics.unsw.edu.au/iottraces.html#bib18tmc
+Download the *September 22 2016* dataset (or others) from: https://iotanalytics.unsw.edu.au/iottraces.html#bib18tmc
 
-Rename the file as data.pcap
+Place these into the `data/tar` folder.
 
+Run `extract_tars.sh`, which will extract the `.pcap` files and place them at the corresponding locations inside `data/pcap`.
+
 ## Preprocessing Dataset
 
-Run `ExtractDataset.ipynb`; this will take a few minutes
+Run `extract_all_datasets.py`, which will extract the data from each file in `data/pcap` and turn it into the corresponding `.csv` file inside `data/processed`. This will take a few minutes per file. Then combine the data under `data/processed` using `combine.py`; this will overwrite `data/combined/data.csv`, which you can use for the decision tree.
 
 ## Training
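
For reference, after preprocessing and combining, `data/combined/data.csv` holds one row per packet: the three numeric features followed by the device label. A hypothetical first few lines (values are illustrative, not taken from the dataset):

    protocol,src,dst,classification
    6,443,53210,other
    17,67,68,other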
@@ -2,7 +2,7 @@
 "cells": [
 {
 "cell_type": "code",
- "execution_count": 1,
+ "execution_count": 73,
 "id": "ec310f34",
 "metadata": {},
 "outputs": [],
@@ -14,7 +14,7 @@
 },
 {
 "cell_type": "code",
- "execution_count": 2,
+ "execution_count": 74,
 "id": "5b54797e",
 "metadata": {},
 "outputs": [],
@@ -28,7 +28,7 @@
 },
 {
 "cell_type": "code",
- "execution_count": 3,
+ "execution_count": 75,
 "id": "a38fdb8a",
 "metadata": {},
 "outputs": [],
@@ -38,14 +38,14 @@
 "i = 0\n",
 "\n",
 "path_ids = set()\n",
-"path_classes = set()\n",
+"path_classes = tree[\"classes\"]\n",
 "\n",
 "# for each path in the tree\n",
 "for path in paths:\n",
 "\t# assign a path id \n",
 "\tpath[\"id\"] = i\n",
 "\tpath_ids.add(i)\n",
-"\tpath_classes.add(path[\"classification\"])\n",
+"\t#path_classes.add(path[\"classification\"])\n",
 "\ti += 1\t\n",
 "\t# for each condition\n",
 "\tconditions = path[\"conditions\"]\n",
@@ -60,7 +60,7 @@
 },
 {
 "cell_type": "code",
- "execution_count": 4,
+ "execution_count": 76,
 "id": "2fd4f738",
 "metadata": {},
 "outputs": [],
@@ -83,44 +83,10 @@
 },
 {
 "cell_type": "code",
- "execution_count": 5,
+ "execution_count": 77,
 "id": "98cde024",
 "metadata": {},
-"outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "{'dst': {'min': None, 'max': 578}, 'src': {'min': None, 'max': 60}, 'protocl': {'min': None, 'max': 0}}\n",
- "{'dst': {'min': None, 'max': 3031}, 'src': {'min': None, 'max': 60}, 'protocl': {'min': None, 'max': 0}}\n",
- "{'dst': {'min': None, 'max': 3031}, 'src': {'min': None, 'max': 60}, 'protocl': {'min': 0, 'max': None}}\n",
- "{'dst': {'min': None, 'max': 3031}, 'src': {'min': None, 'max': 60}, 'protocl': {'min': 1, 'max': None}}\n",
- "{'dst': {'min': None, 'max': 3031}, 'src': {'min': None, 'max': 67}, 'protocl': {'min': None, 'max': None}}\n",
- "{'dst': {'min': None, 'max': 101}, 'src': {'min': 67, 'max': None}, 'protocl': {'min': None, 'max': None}}\n",
- "{'dst': {'min': None, 'max': 101}, 'src': {'min': 54978, 'max': None}, 'protocl': {'min': None, 'max': None}}\n",
- "{'dst': {'min': None, 'max': 101}, 'src': {'min': 59817, 'max': None}, 'protocl': {'min': None, 'max': None}}\n",
- "{'dst': {'min': None, 'max': 101}, 'src': {'min': 60043, 'max': None}, 'protocl': {'min': None, 'max': None}}\n",
- "{'dst': {'min': None, 'max': 3031}, 'src': {'min': 67, 'max': None}, 'protocl': {'min': None, 'max': None}}\n",
- "{'dst': {'min': None, 'max': 3031}, 'src': {'min': 130, 'max': None}, 'protocl': {'min': None, 'max': None}}\n",
- "{'dst': {'min': None, 'max': 3031}, 'src': {'min': 1223, 'max': None}, 'protocl': {'min': None, 'max': None}}\n",
- "{'dst': {'min': 3031, 'max': None}, 'src': {'min': None, 'max': None}, 'protocl': {'min': None, 'max': None}}\n",
- "{'dst': {'min': 3067, 'max': None}, 'src': {'min': None, 'max': None}, 'protocl': {'min': None, 'max': None}}\n",
- "{'dst': {'min': 5110, 'max': None}, 'src': {'min': None, 'max': None}, 'protocl': {'min': None, 'max': None}}\n",
- "{'dst': {'min': 33925, 'max': None}, 'src': {'min': None, 'max': None}, 'protocl': {'min': None, 'max': None}}\n",
- "{'dst': {'min': 46329, 'max': None}, 'src': {'min': None, 'max': None}, 'protocl': {'min': None, 'max': None}}\n",
- "{'dst': {'min': 46331, 'max': None}, 'src': {'min': None, 'max': None}, 'protocl': {'min': None, 'max': None}}\n",
- "{'dst': {'min': 49152, 'max': None}, 'src': {'min': None, 'max': None}, 'protocl': {'min': None, 'max': None}}\n",
- "{'dst': {'min': 49157, 'max': None}, 'src': {'min': None, 'max': 283}, 'protocl': {'min': None, 'max': 11}}\n",
- "{'dst': {'min': 49157, 'max': None}, 'src': {'min': None, 'max': 283}, 'protocl': {'min': 11, 'max': None}}\n",
- "{'dst': {'min': 49157, 'max': None}, 'src': {'min': None, 'max': 4566}, 'protocl': {'min': None, 'max': None}}\n",
- "{'dst': {'min': 56320, 'max': None}, 'src': {'min': None, 'max': 4566}, 'protocl': {'min': None, 'max': None}}\n",
- "{'dst': {'min': 49157, 'max': None}, 'src': {'min': 4566, 'max': None}, 'protocl': {'min': None, 'max': None}}\n",
- "{'dst': {'min': 51848, 'max': None}, 'src': {'min': 4566, 'max': None}, 'protocl': {'min': None, 'max': None}}\n",
- "{'dst': {'min': 49157, 'max': None}, 'src': {'min': 5225, 'max': None}, 'protocl': {'min': None, 'max': None}}\n",
- "{'dst': {'min': 53283, 'max': None}, 'src': {'min': 5225, 'max': None}, 'protocl': {'min': None, 'max': None}}\n"
- ]
- }
-],
+"outputs": [],
 "source": [
 "# collapse all paths to ranges for each feature\n",
 "# because of how decision trees work, all conditions on a path must be true to reach the leaf node\n",
@@ -143,22 +109,21 @@
 "\t\tvalue = condition[\"value\"]\n",
 "\n",
 "\t\t# move the min/max for the corresponding feature in compressed\n",
-"\t\tif operation == \"<=\" and compressed[feature][\"min\"] is None:\n",
+"\t\tif operation == \"<=\" and compressed[feature][\"max\"] is None:\n",
 "\t\t\tcompressed[feature][\"max\"] = value\n",
-"\t\telif operation == \">\" and compressed[feature][\"max\"] is None:\n",
+"\t\telif operation == \">\" and compressed[feature][\"min\"] is None:\n",
 "\t\t\tcompressed[feature][\"min\"] = value\n",
-"\t\telif operation == \"<=\" and value < compressed[feature][\"min\"]:\n",
+"\t\telif operation == \"<=\" and value < compressed[feature][\"max\"]:\n",
 "\t\t\tcompressed[feature][\"max\"] = value\n",
-"\t\telif operation == \">\" and value > compressed[feature][\"max\"]:\n",
+"\t\telif operation == \">\" and value > compressed[feature][\"min\"]:\n",
 "\t\t\tcompressed[feature][\"min\"] = value\n",
 "\n",
-"\tpath[\"compressed\"] = compressed\n",
-"\tprint(compressed)"
+"\tpath[\"compressed\"] = compressed"
 ]
 },
 {
 "cell_type": "code",
- "execution_count": 6,
+ "execution_count": 78,
 "id": "b6fbadbf",
 "metadata": {},
 "outputs": [],
@@ -218,7 +183,7 @@
 },
 {
 "cell_type": "code",
- "execution_count": 7,
+ "execution_count": 79,
 "id": "0a767971",
 "metadata": {},
 "outputs": [],
@@ -248,16 +213,22 @@
 "\tcompressed_layers[feature_name].append({\"min\": lower, \"max\": upper, \"paths\": paths, \"classes\": classes})\n",
 "\t#print(\"=\"*40)\n",
 "\n",
+"path_to_class = {}\n",
+"for i in range(len(tree[\"paths\"])):\n",
+"    path = tree[\"paths\"][i]\n",
+"    path_to_class[path[\"id\"]] = path[\"classification\"]\n",
+"\n",
 "compressed_tree = {\n",
 "\t\"paths\": path_ids,\n",
 "\t\"classes\": path_classes,\n",
 "\t\"layers\": compressed_layers,\n",
+"    \"path_to_class\": path_to_class,\n",
 "}"
 ]
 },
 {
 "cell_type": "code",
- "execution_count": 8,
+ "execution_count": 80,
 "id": "561b0bc1",
 "metadata": {},
 "outputs": [],
@@ -276,7 +247,7 @@
 ],
 "metadata": {
 "kernelspec": {
- "display_name": "switch",
+ "display_name": "Python 3 (ipykernel)",
 "language": "python",
 "name": "python3"
 },
@@ -290,7 +261,7 @@
 "name": "python",
 "nbconvert_exporter": "python",
 "pygments_lexer": "ipython3",
-"version": "3.12.7"
+"version": "3.12.9"
 }
 },
 "nbformat": 4,
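
The min/max swap fixed in this commit matters because each decision-tree path is a conjunction of threshold tests: every `<=` condition should tighten the feature's upper bound and every `>` condition its lower bound. A hedged sketch of the corrected collapse for a single path (names and values illustrative):

    def collapse(conditions, features=("protocol", "src", "dst")):
        compressed = {f: {"min": None, "max": None} for f in features}
        for c in conditions:
            bound = compressed[c["feature"]]
            if c["operation"] == "<=":
                if bound["max"] is None or c["value"] < bound["max"]:
                    bound["max"] = c["value"]  # tightest upper bound wins
            elif c["operation"] == ">":
                if bound["min"] is None or c["value"] > bound["min"]:
                    bound["min"] = c["value"]  # tightest lower bound wins
        return compressed

    # e.g. conditions dst <= 3031, dst <= 578, src > 60 collapse to
    # {'dst': {'min': None, 'max': 578}, 'src': {'min': 60, 'max': None},
    #  'protocol': {'min': None, 'max': None}}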
@@ -77,7 +77,7 @@
 "\t\t}\n",
 "\t\ttcam_bits += num_prefixes * prefix_width\n",
 "\n",
-"\t\t# assume no pointer reuse for metadata storage\n",
+"\t\t# assume basic pointer reuse for metadata storage\n",
 "\t\tram = {\n",
 "\t\t\t\"id\": f\"{layer}_meta\",\n",
 "\t\t\t\"step\": step,\n",
@@ -86,7 +86,7 @@
 "\t\t\t\"key_size\": math.ceil(math.log2(num_ranges)),\n",
 "\t\t\t\"data_size\": len(classes)\n",
 "\t\t}\n",
-"\t\tram_bits += math.ceil(math.log2(num_ranges)) * len(classes)\n",
+"\t\tram_bits += num_ranges * len(classes)\n",
 "\n",
 "\t\trmt.append(tcam)\n",
 "\t\trmt.append(ram)\n",
@@ -117,8 +117,8 @@
 "[1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]\n",
 "id mapping: \n",
 "[['dst_range', 'dst_meta'], ['src_range', 'src_meta'], ['protocl_range', 'protocl_meta'], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], []]\n",
-"TCAM bits: 13312\n",
-"RAM bits: 110\n"
+"TCAM bits: 13184\n",
+"RAM bits: 504\n"
 ]
 }
 ],
@@ -182,9 +182,7 @@
 "\t\t\tmerge(prefix, prefixes)\n",
 "\t\telse:\n",
 "\t\t\tprefixes.append(prefix)\n",
-"\treturn prefixes\n",
-"\n",
-"#convert_range(81, 1024, 16)"
+"\treturn prefixes"
 ]
 },
 {
@@ -234,7 +232,7 @@
 "\t\t\t\"key_size\": math.ceil(math.log2(num_ranges)),\n",
 "\t\t\t\"data_size\": len(classes)\n",
 "\t\t}\n",
-"\t\tram_bits += math.ceil(math.log2(num_ranges)) * len(classes)\n",
+"\t\tram_bits += num_ranges * len(classes)\n",
 "\n",
 "\t\trmt.append(tcam)\n",
 "\t\trmt.append(ram)\n",
@@ -265,8 +263,8 @@
 "[1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]\n",
 "id mapping: \n",
 "[['dst_range', 'dst_meta'], ['src_range', 'src_meta'], ['protocl_range', 'protocl_meta'], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], []]\n",
-"TCAM bits: 3520\n",
-"RAM bits: 110\n"
+"TCAM bits: 3320\n",
+"RAM bits: 504\n"
 ]
 }
 ],
@@ -276,6 +274,14 @@
 "print(f\"RAM bits: {ram_bits}\")"
 ]
 },
+{
+"cell_type": "markdown",
+"id": "2504b1ba",
+"metadata": {},
+"source": [
+"# Priority Aware Prefix Expansion"
+]
+},
 {
 "cell_type": "code",
 "execution_count": 9,
@@ -339,7 +345,7 @@
 "\t\t\t\"key_size\": math.ceil(math.log2(num_ranges)),\n",
 "\t\t\t\"data_size\": len(classes)\n",
 "\t\t}\n",
-"\t\tram_bits += math.ceil(math.log2(num_ranges)) * len(classes)\n",
+"\t\tram_bits += num_ranges * len(classes)\n",
 "\n",
 "\t\trmt.append(tcam)\n",
 "\t\trmt.append(ram)\n",
@@ -370,8 +376,8 @@
 "[1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]\n",
 "id mapping: \n",
 "[['dst_range', 'dst_meta'], ['src_range', 'src_meta'], ['protocl_range', 'protocl_meta'], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], []]\n",
-"TCAM bits: 2120\n",
-"RAM bits: 110\n"
+"TCAM bits: 2152\n",
+"RAM bits: 504\n"
 ]
 }
 ],
@@ -384,7 +390,7 @@
 ],
 "metadata": {
 "kernelspec": {
- "display_name": "switch",
+ "display_name": "Python 3 (ipykernel)",
 "language": "python",
 "name": "python3"
 },
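
The RAM accounting change in this notebook replaces a pointer-width estimate, ceil(log2(num_ranges)) * len(classes), with one class entry per range, num_ranges * len(classes); that is why the reported RAM bits jump from 110 to 504 while the TCAM bits barely move. A hedged check with made-up sizes (8 ranges, 21 classes; not values from the run):

    import math
    num_ranges, num_classes = 8, 21
    old = math.ceil(math.log2(num_ranges)) * num_classes  # 3 * 21 = 63 bits
    new = num_ranges * num_classes                         # 8 * 21 = 168 bits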
74  combine.py  Normal file

@@ -0,0 +1,74 @@
#!/usr/bin/env python3
"""combine.py

Concatenate every CSV that matches the pattern
    data/processed/<name>/<name>.csv
into a single file:
    data/combined/data.csv

The script streams each source CSV in 1-million-row chunks so memory stays low.
Typos in the historic column names (protocl/classfication) are fixed on the fly.

Usage
-----
    python combine.py

You can optionally supply a different root directory:
    python combine.py --root other/processed_dir --out other/combined/data.csv
"""
from __future__ import annotations

import argparse
from pathlib import Path

import pandas as pd

CHUNK = 1_000_000  # rows per read_csv chunk


def fix_cols(df: pd.DataFrame) -> pd.DataFrame:
    """Rename legacy columns to canonical names."""
    return df.rename(
        columns={"protocl": "protocol", "classfication": "classification"}
    )


def find_source_csvs(proc_root: Path):
    """Yield CSV paths that exactly match processed/<name>/<name>.csv."""
    for sub in sorted(proc_root.iterdir()):
        if not sub.is_dir():
            continue
        target = sub / f"{sub.name}.csv"
        if target.exists():
            yield target


def combine(proc_root: Path, out_path: Path):
    out_path.parent.mkdir(parents=True, exist_ok=True)

    first_write = True
    for csv_path in find_source_csvs(proc_root):
        print(f"→ adding {csv_path.relative_to(proc_root.parent)}")
        for chunk in pd.read_csv(csv_path, chunksize=CHUNK):
            chunk = fix_cols(chunk)
            chunk.to_csv(
                out_path,
                mode="w" if first_write else "a",
                header=first_write,
                index=False,
            )
            first_write = False
    print(f"✓ combined CSV written to {out_path}")


def main():
    p = argparse.ArgumentParser(description="Combine processed CSVs into one.")
    p.add_argument("--root", default="data/processed", help="processed dir root")
    p.add_argument("--out", default="data/combined/data.csv", help="output CSV")
    args = p.parse_args()

    combine(Path(args.root).expanduser(), Path(args.out).expanduser())


if __name__ == "__main__":
    main()
80  extract_all_datasets.py  Normal file

@@ -0,0 +1,80 @@
#!/usr/bin/env python3
import os
from pathlib import Path

import numpy as np
import pandas as pd
from scapy.all import rdpcap
from tqdm import tqdm

from labels import mac_to_label

ROOT = Path(__file__).resolve().parent
PCAP_DIR = ROOT / "data" / "pcap"
CSV_DIR = ROOT / "data" / "processed"
CSV_DIR.mkdir(parents=True, exist_ok=True)

BATCH = 100_000  # packets per chunk


def process_pcap(pcap_path: str, csv_path: str) -> None:
    all_packets = rdpcap(pcap_path)

    print("rdpcap done", flush=True)
    results = []
    for packet in tqdm(all_packets):
        size = len(packet)
        try:
            proto = packet.proto
        except AttributeError:
            proto = 0
        try:
            sport = packet.sport
            dport = packet.dport
        except AttributeError:
            sport = 0
            dport = 0

        proto = int(proto)
        sport = int(sport)
        dport = int(dport)

        if "Ether" in packet:
            eth_dst = packet["Ether"].dst
            if eth_dst in mac_to_label:
                classification = mac_to_label[eth_dst]
            else:
                classification = "other"
        else:
            classification = "other"

        metric = [proto, sport, dport, classification]
        results.append(metric)
    results = (np.array(results)).T

    # store the features in the dataframe (legacy column names; combine.py renames them)
    dataframe = pd.DataFrame({'protocl': results[0], 'src': results[1], 'dst': results[2], 'classfication': results[3]})
    columns = ['protocl', 'src', 'dst', 'classfication']

    # save the dataframe to the csv file; if it does not exist, create it
    if os.path.exists(csv_path):
        dataframe.to_csv(csv_path, index=False, sep=',', mode='a', columns=columns, header=False)
    else:
        dataframe.to_csv(csv_path, index=False, sep=',', columns=columns)

    print("Done")


def main() -> None:
    for pcap in sorted(PCAP_DIR.rglob("*.pcap")):
        rel_csv = pcap.relative_to(PCAP_DIR).with_suffix(".csv")
        csv_path = CSV_DIR / rel_csv
        if csv_path.exists():
            print(f"Skip {rel_csv} (CSV exists)")
            continue
        print(f"Processing {rel_csv}")
        csv_path.parent.mkdir(parents=True, exist_ok=True)
        process_pcap(str(pcap), str(csv_path))


if __name__ == "__main__":
    main()
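
Each pcap thus becomes a CSV under `data/processed/<name>/<name>.csv`, deliberately keeping the legacy column names that `combine.py` later canonicalizes. A hypothetical first few lines (values and labels illustrative, not from the dataset):

    protocl,src,dst,classfication
    6,443,53210,other
    17,68,67,other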
50  extract_tars.sh  Normal file

@@ -0,0 +1,50 @@
#!/usr/bin/env bash
# Usage: extract_tars.sh SOURCE_DIR TARGET_DIR
# For every .tar, .tar.gz, .tgz, .tar.bz2, .tar.xz in SOURCE_DIR:
#   1. Create TARGET_DIR/<name>/
#   2. If TARGET_DIR/<name>/<name>.pcap already exists, skip the archive.
#   3. Otherwise, extract the archive into its own folder.

set -euo pipefail

if [[ $# -ne 2 ]]; then
    echo "Usage: $0 SOURCE_DIR TARGET_DIR" >&2
    exit 1
fi

src_dir="$1"
dst_dir="$2"
mkdir -p "$dst_dir"

# Strip common extensions to recover the base name
strip_ext() {
    local n="$1"
    n=${n%.tar.gz}; n=${n%.tgz}; n=${n%.tar.bz2}; n=${n%.tar.xz}; n=${n%.tar}
    echo "$n"
}

shopt -s nullglob
for archive in "$src_dir"/*.tar{,.gz,.bz2,.xz} "$src_dir"/*.tgz; do
    base=$(basename "$archive")
    name=$(strip_ext "$base")
    out_dir="$dst_dir/$name"
    key_file="$out_dir/$name.pcap"

    if [[ -f "$key_file" ]]; then
        echo "Skipping $archive: $key_file already present"
        continue
    fi

    echo "Extracting $archive into $out_dir"
    mkdir -p "$out_dir"

    case "$archive" in
        *.tar)          tar -xf  "$archive" -C "$out_dir" ;;
        *.tar.gz|*.tgz) tar -xzf "$archive" -C "$out_dir" ;;
        *.tar.bz2)      tar -xjf "$archive" -C "$out_dir" ;;
        *.tar.xz)       tar -xJf "$archive" -C "$out_dir" ;;
        *)              echo "Unknown type: $archive" ;;
    esac
done

echo "All archives processed."
@@ -3,4 +3,5 @@ numpy
 pandas
 scikit-learn
 pydotplus
 matplotlib
+scipy
168  run/decision_tree.py  Normal file

@@ -0,0 +1,168 @@
#!/usr/bin/env python3
"""
Train a decision tree, optionally "nudge" its split thresholds, and
export the result as JSON.

Usage examples
--------------
# plain training, no nudging
python decision_tree.py --input data/combined/data.csv --output tree.json

# nudge every internal threshold, dropping the bottom 2 bits
python decision_tree.py --input data/combined/data.csv --output tree.json \
    --nudge --bits 2
"""
import argparse
import copy
import json
import math
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier, _tree

# ----------------------------------------------------------------------
# 1. command-line arguments
# ----------------------------------------------------------------------
parser = argparse.ArgumentParser()
parser.add_argument("--input", "-i", help="CSV file with protocol,src,dst,label", default="../data/combined/data.csv")
parser.add_argument("--output", "-o", help="Path for the exported JSON tree", default="tree.json")
parser.add_argument("--depth", "-d", type=int, default=5,
                    help="Max depth of the decision tree (default: 5)")
parser.add_argument("--nudge", action="store_true",
                    help="Enable threshold nudging")
parser.add_argument("--bits", type=int, default=2,
                    help="Number of bits to drop when nudging (default: 2)")
args = parser.parse_args()

# ----------------------------------------------------------------------
# 2. helper functions
# ----------------------------------------------------------------------
def nudge_threshold_max_n_bits(threshold: float, n_bits: int) -> int:
    """Clear the lowest n_bits of the floored threshold, rounding to the
    nearest multiple of 2**n_bits."""
    threshold = math.floor(threshold)
    if n_bits == 0:
        return threshold

    mask = pow(2, 32) - 1 ^ ((1 << n_bits) - 1)
    nudged_value = threshold & mask
    if threshold & (1 << (n_bits - 1)):
        nudged_value += (1 << (n_bits))

    return nudged_value


def apply_nudging(tree: _tree.Tree, node_idx: int, n_bits: int) -> None:
    """Post-order traversal that nudges every internal node's threshold."""
    flag = False
    if tree.children_left[node_idx] != -1:
        apply_nudging(tree, tree.children_left[node_idx], n_bits)
        flag = True
    if tree.children_right[node_idx] != -1:
        apply_nudging(tree, tree.children_right[node_idx], n_bits)
        flag = True
    if flag:  # internal node
        tree.threshold[node_idx] = nudge_threshold_max_n_bits(
            tree.threshold[node_idx], n_bits
        )


# output the tree
def get_lineage(tree, feature_names):
    data = {"features": {}, "paths": [], "classes": list(tree.classes_)}

    thresholds = tree.tree_.threshold
    features = [feature_names[i] for i in tree.tree_.feature]
    left = tree.tree_.children_left
    right = tree.tree_.children_right
    value = tree.tree_.value

    # -------- helper to climb up from a leaf to the root -----------
    def recurse(left, right, child, lineage=None):
        if lineage is None:
            lineage = [child]  # leaf marker (an int)
        if child in left:
            parent = np.where(left == child)[0].item()
            split = "l"
        elif child in right:
            parent = np.where(right == child)[0].item()
            split = "r"
        else:  # should never happen
            return lineage

        lineage.append((parent, split, thresholds[parent], features[parent]))
        if parent == 0:
            return list(reversed(lineage))
        return recurse(left, right, parent, lineage)

    leaf_ids = np.where(left == -1)[0]  # indices of all leaves
    for path_id, leaf in enumerate(leaf_ids):
        clause = []

        for node in recurse(left, right, leaf):
            if not isinstance(node, tuple):  # skip the leaf marker
                continue

            direction, threshold, feature = node[1], node[2], node[3]
            if direction == "l":
                clause.append(
                    {"feature": feature, "operation": "<=", "value": threshold}
                )
            else:
                clause.append(
                    {"feature": feature, "operation": ">", "value": threshold}
                )

        class_idx = int(np.argmax(value[leaf][0]))  # use the leaf itself
        data["paths"].append(
            {"conditions": clause, "classification": class_idx, "id": path_id}
        )

    # collect all thresholds per feature
    for i, feat in enumerate(features):
        if tree.tree_.feature[i] != _tree.TREE_UNDEFINED:
            data["features"].setdefault(feat, []).append(thresholds[i])

    return data


class SetEncoder(json.JSONEncoder):
    def default(self, obj):
        if isinstance(obj, set):
            return list(obj)
        return super().default(obj)

# ----------------------------------------------------------------------
# 3. load data
# ----------------------------------------------------------------------
df = pd.read_csv(args.input)
X = df.iloc[:, :3].to_numpy()
Y = df.iloc[:, 3].to_numpy()

print(f"dataset size: {len(X)}")

# ----------------------------------------------------------------------
# 4. train the tree
# ----------------------------------------------------------------------
dt = DecisionTreeClassifier(max_depth=args.depth)
dt.fit(X, Y)
print("train accuracy (before nudging):",
      accuracy_score(Y, dt.predict(X)))

if args.nudge:
    nudged_tree = copy.deepcopy(dt.tree_)
    apply_nudging(nudged_tree, 0, args.bits)
    dt.tree_ = nudged_tree
    print(f"nudging enabled, removed bottom {args.bits} bit(s) per threshold")

print("train accuracy (after nudging):",
      accuracy_score(Y, dt.predict(X)))

# ----------------------------------------------------------------------
# 5. export
# ----------------------------------------------------------------------
lineage = get_lineage(dt, df.columns[:3])

output_path = Path(args.output)
output_path.write_text(json.dumps(lineage, indent=4, cls=SetEncoder))
print(f"Wrote tree to {output_path.resolve()}")
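
`nudge_threshold_max_n_bits` floors the threshold, clears its bottom n_bits, and uses bit n_bits - 1 to decide whether to round up, so it effectively rounds to the nearest multiple of 2**n_bits. A worked example (inputs illustrative):

    nudge_threshold_max_n_bits(13.7, 2)  # floor -> 13 = 0b1101; clear low bits -> 12; bit 1 is 0, keep: returns 12
    nudge_threshold_max_n_bits(14.2, 2)  # floor -> 14 = 0b1110; clear low bits -> 12; bit 1 is 1, round up: returns 16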
7  run/print.py  Normal file

@@ -0,0 +1,7 @@
import json
from pathlib import Path

for file in Path("results/compressed_tree/").glob("*.json"):
    with open(file, "r") as f:
        s = json.load(f)
        # path ids are 0-based, so max(id) + 1 is the number of leaf paths
        print(max(s["paths"]) + 1)
36  run/rmt.bat  Normal file

@@ -0,0 +1,36 @@
@echo off
REM -------------------------------------------------------------
REM Batch script to evaluate all compressed trees with every mode
REM -------------------------------------------------------------
setlocal EnableDelayedExpansion

REM --- where the trees live and where to store results ----------
set TREEDIR=results\compressed_tree
set OUTDIR=results\rmt

REM --- python executable (adjust if needed) ---------------------
set PY=python

REM --- which modes to run --------------------------------------
set MODELIST=naive priority
REM -------------------------------------------------------------

if not exist "%OUTDIR%" mkdir "%OUTDIR%"

for %%F in ("%TREEDIR%\*.json") do (
    REM strip the path to get the file name without extension
    set BASE=%%~nF

    for %%M in (%MODELIST%) do (
        echo Processing %%~nxF with mode %%M

        "%PY%" tree_to_rmt.py ^
            --mode %%M ^
            --input "%%F" ^
            --output "%OUTDIR%\!BASE!_%%M.json"

    )
)

echo All runs complete.
pause
362  run/rmt.txt  Normal file

@@ -0,0 +1,362 @@
Processing compressed_tree_d10_b0.json with mode naive
Output written to results\rmt\compressed_tree_d10_b0_naive.json
TCAM bits: 30336
RAM bits: 6888
Processing compressed_tree_d10_b0.json with mode priority
Output written to results\rmt\compressed_tree_d10_b0_priority.json
TCAM bits: 26648
RAM bits: 6888
Processing compressed_tree_d10_b1.json with mode naive
Output written to results\rmt\compressed_tree_d10_b1_naive.json
TCAM bits: 29936
RAM bits: 6531
Processing compressed_tree_d10_b1.json with mode priority
Output written to results\rmt\compressed_tree_d10_b1_priority.json
TCAM bits: 27120
RAM bits: 6531
Processing compressed_tree_d10_b3.json with mode naive
Output written to results\rmt\compressed_tree_d10_b3_naive.json
TCAM bits: 21712
RAM bits: 5649
Processing compressed_tree_d10_b3.json with mode priority
Output written to results\rmt\compressed_tree_d10_b3_priority.json
TCAM bits: 20048
RAM bits: 5649
Processing compressed_tree_d11_b0.json with mode naive
Output written to results\rmt\compressed_tree_d11_b0_naive.json
TCAM bits: 41248
RAM bits: 10332
Processing compressed_tree_d11_b0.json with mode priority
Output written to results\rmt\compressed_tree_d11_b0_priority.json
TCAM bits: 37592
RAM bits: 10332
Processing compressed_tree_d11_b1.json with mode naive
Output written to results\rmt\compressed_tree_d11_b1_naive.json
TCAM bits: 41072
RAM bits: 9744
Processing compressed_tree_d11_b1.json with mode priority
Output written to results\rmt\compressed_tree_d11_b1_priority.json
TCAM bits: 38256
RAM bits: 9744
Processing compressed_tree_d11_b3.json with mode naive
Output written to results\rmt\compressed_tree_d11_b3_naive.json
TCAM bits: 28464
RAM bits: 8190
Processing compressed_tree_d11_b3.json with mode priority
Output written to results\rmt\compressed_tree_d11_b3_priority.json
TCAM bits: 26928
RAM bits: 8190
Processing compressed_tree_d12_b0.json with mode naive
Output written to results\rmt\compressed_tree_d12_b0_naive.json
TCAM bits: 55680
RAM bits: 15393
Processing compressed_tree_d12_b0.json with mode priority
Output written to results\rmt\compressed_tree_d12_b0_priority.json
TCAM bits: 51592
RAM bits: 15393
Processing compressed_tree_d12_b1.json with mode naive
Output written to results\rmt\compressed_tree_d12_b1_naive.json
TCAM bits: 54240
RAM bits: 14175
Processing compressed_tree_d12_b1.json with mode priority
Output written to results\rmt\compressed_tree_d12_b1_priority.json
TCAM bits: 51200
RAM bits: 14175
Processing compressed_tree_d12_b3.json with mode naive
Output written to results\rmt\compressed_tree_d12_b3_naive.json
TCAM bits: 36048
RAM bits: 11361
Processing compressed_tree_d12_b3.json with mode priority
Output written to results\rmt\compressed_tree_d12_b3_priority.json
TCAM bits: 34416
RAM bits: 11361
Processing compressed_tree_d13_b0.json with mode naive
Output written to results\rmt\compressed_tree_d13_b0_naive.json
TCAM bits: 73152
RAM bits: 22680
Processing compressed_tree_d13_b0.json with mode priority
Output written to results\rmt\compressed_tree_d13_b0_priority.json
TCAM bits: 69096
RAM bits: 22680
Processing compressed_tree_d13_b1.json with mode naive
Output written to results\rmt\compressed_tree_d13_b1_naive.json
TCAM bits: 71024
RAM bits: 20643
Processing compressed_tree_d13_b1.json with mode priority
Output written to results\rmt\compressed_tree_d13_b1_priority.json
TCAM bits: 68160
RAM bits: 20643
Processing compressed_tree_d13_b3.json with mode naive
Output written to results\rmt\compressed_tree_d13_b3_naive.json
TCAM bits: 45152
RAM bits: 16002
Processing compressed_tree_d13_b3.json with mode priority
Output written to results\rmt\compressed_tree_d13_b3_priority.json
TCAM bits: 43600
RAM bits: 16002
Processing compressed_tree_d14_b0.json with mode naive
Output written to results\rmt\compressed_tree_d14_b0_naive.json
TCAM bits: 95760
RAM bits: 33012
Processing compressed_tree_d14_b0.json with mode priority
Output written to results\rmt\compressed_tree_d14_b0_priority.json
TCAM bits: 91656
RAM bits: 33012
Processing compressed_tree_d14_b1.json with mode naive
Output written to results\rmt\compressed_tree_d14_b1_naive.json
TCAM bits: 93520
RAM bits: 29862
Processing compressed_tree_d14_b1.json with mode priority
Output written to results\rmt\compressed_tree_d14_b1_priority.json
TCAM bits: 90544
RAM bits: 29862
Processing compressed_tree_d14_b3.json with mode naive
Output written to results\rmt\compressed_tree_d14_b3_naive.json
TCAM bits: 56144
RAM bits: 21819
Processing compressed_tree_d14_b3.json with mode priority
Output written to results\rmt\compressed_tree_d14_b3_priority.json
TCAM bits: 54544
RAM bits: 21819
Processing compressed_tree_d15_b0.json with mode naive
Output written to results\rmt\compressed_tree_d15_b0_naive.json
TCAM bits: 122496
RAM bits: 46662
Processing compressed_tree_d15_b0.json with mode priority
Output written to results\rmt\compressed_tree_d15_b0_priority.json
TCAM bits: 118792
RAM bits: 46662
Processing compressed_tree_d15_b1.json with mode naive
Output written to results\rmt\compressed_tree_d15_b1_naive.json
TCAM bits: 118640
RAM bits: 41349
Processing compressed_tree_d15_b1.json with mode priority
Output written to results\rmt\compressed_tree_d15_b1_priority.json
TCAM bits: 115984
RAM bits: 41349
Processing compressed_tree_d15_b3.json with mode naive
Output written to results\rmt\compressed_tree_d15_b3_naive.json
TCAM bits: 68928
RAM bits: 28875
Processing compressed_tree_d15_b3.json with mode priority
Output written to results\rmt\compressed_tree_d15_b3_priority.json
TCAM bits: 67328
RAM bits: 28875
Processing compressed_tree_d1_b0.json with mode naive
Output written to results\rmt\compressed_tree_d1_b0_naive.json
TCAM bits: 256
RAM bits: 42
Processing compressed_tree_d1_b0.json with mode priority
Output written to results\rmt\compressed_tree_d1_b0_priority.json
TCAM bits: 128
RAM bits: 42
Processing compressed_tree_d1_b1.json with mode naive
Output written to results\rmt\compressed_tree_d1_b1_naive.json
TCAM bits: 256
RAM bits: 42
Processing compressed_tree_d1_b1.json with mode priority
Output written to results\rmt\compressed_tree_d1_b1_priority.json
TCAM bits: 144
RAM bits: 42
Processing compressed_tree_d1_b3.json with mode naive
Output written to results\rmt\compressed_tree_d1_b3_naive.json
TCAM bits: 240
RAM bits: 42
Processing compressed_tree_d1_b3.json with mode priority
Output written to results\rmt\compressed_tree_d1_b3_priority.json
TCAM bits: 128
RAM bits: 42
Processing compressed_tree_d2_b0.json with mode naive
Output written to results\rmt\compressed_tree_d2_b0_naive.json
TCAM bits: 592
RAM bits: 105
Processing compressed_tree_d2_b0.json with mode priority
Output written to results\rmt\compressed_tree_d2_b0_priority.json
TCAM bits: 288
RAM bits: 105
Processing compressed_tree_d2_b1.json with mode naive
Output written to results\rmt\compressed_tree_d2_b1_naive.json
TCAM bits: 592
RAM bits: 105
Processing compressed_tree_d2_b1.json with mode priority
Output written to results\rmt\compressed_tree_d2_b1_priority.json
TCAM bits: 320
RAM bits: 105
Processing compressed_tree_d2_b3.json with mode naive
Output written to results\rmt\compressed_tree_d2_b3_naive.json
TCAM bits: 544
RAM bits: 105
Processing compressed_tree_d2_b3.json with mode priority
Output written to results\rmt\compressed_tree_d2_b3_priority.json
TCAM bits: 288
RAM bits: 105
Processing compressed_tree_d3_b0.json with mode naive
Output written to results\rmt\compressed_tree_d3_b0_naive.json
TCAM bits: 1120
RAM bits: 210
Processing compressed_tree_d3_b0.json with mode priority
Output written to results\rmt\compressed_tree_d3_b0_priority.json
TCAM bits: 640
RAM bits: 210
Processing compressed_tree_d3_b1.json with mode naive
Output written to results\rmt\compressed_tree_d3_b1_naive.json
TCAM bits: 1120
RAM bits: 210
Processing compressed_tree_d3_b1.json with mode priority
Output written to results\rmt\compressed_tree_d3_b1_priority.json
TCAM bits: 680
RAM bits: 210
Processing compressed_tree_d3_b3.json with mode naive
Output written to results\rmt\compressed_tree_d3_b3_naive.json
TCAM bits: 944
RAM bits: 210
Processing compressed_tree_d3_b3.json with mode priority
Output written to results\rmt\compressed_tree_d3_b3_priority.json
TCAM bits: 576
RAM bits: 210
Processing compressed_tree_d4_b0.json with mode naive
Output written to results\rmt\compressed_tree_d4_b0_naive.json
TCAM bits: 1880
RAM bits: 357
Processing compressed_tree_d4_b0.json with mode priority
Output written to results\rmt\compressed_tree_d4_b0_priority.json
TCAM bits: 1128
RAM bits: 357
Processing compressed_tree_d4_b1.json with mode naive
Output written to results\rmt\compressed_tree_d4_b1_naive.json
TCAM bits: 1880
RAM bits: 357
Processing compressed_tree_d4_b1.json with mode priority
Output written to results\rmt\compressed_tree_d4_b1_priority.json
TCAM bits: 1208
RAM bits: 357
Processing compressed_tree_d4_b3.json with mode naive
Output written to results\rmt\compressed_tree_d4_b3_naive.json
TCAM bits: 1632
RAM bits: 336
Processing compressed_tree_d4_b3.json with mode priority
Output written to results\rmt\compressed_tree_d4_b3_priority.json
TCAM bits: 1024
RAM bits: 336
Processing compressed_tree_d5_b0.json with mode naive
Output written to results\rmt\compressed_tree_d5_b0_naive.json
TCAM bits: 3608
RAM bits: 609
Processing compressed_tree_d5_b0.json with mode priority
Output written to results\rmt\compressed_tree_d5_b0_priority.json
TCAM bits: 2200
RAM bits: 609
Processing compressed_tree_d5_b1.json with mode naive
Output written to results\rmt\compressed_tree_d5_b1_naive.json
TCAM bits: 3608
RAM bits: 609
Processing compressed_tree_d5_b1.json with mode priority
Output written to results\rmt\compressed_tree_d5_b1_priority.json
TCAM bits: 2376
RAM bits: 609
Processing compressed_tree_d5_b3.json with mode naive
Output written to results\rmt\compressed_tree_d5_b3_naive.json
TCAM bits: 2704
RAM bits: 546
Processing compressed_tree_d5_b3.json with mode priority
Output written to results\rmt\compressed_tree_d5_b3_priority.json
TCAM bits: 1824
RAM bits: 546
Processing compressed_tree_d6_b0.json with mode naive
Output written to results\rmt\compressed_tree_d6_b0_naive.json
TCAM bits: 6440
RAM bits: 1134
Processing compressed_tree_d6_b0.json with mode priority
Output written to results\rmt\compressed_tree_d6_b0_priority.json
TCAM bits: 4512
RAM bits: 1134
Processing compressed_tree_d6_b1.json with mode naive
Output written to results\rmt\compressed_tree_d6_b1_naive.json
TCAM bits: 6440
RAM bits: 1134
Processing compressed_tree_d6_b1.json with mode priority
Output written to results\rmt\compressed_tree_d6_b1_priority.json
TCAM bits: 4776
RAM bits: 1134
Processing compressed_tree_d6_b3.json with mode naive
Output written to results\rmt\compressed_tree_d6_b3_naive.json
TCAM bits: 4832
RAM bits: 1008
Processing compressed_tree_d6_b3.json with mode priority
Output written to results\rmt\compressed_tree_d6_b3_priority.json
TCAM bits: 3648
RAM bits: 1008
Processing compressed_tree_d7_b0.json with mode naive
Output written to results\rmt\compressed_tree_d7_b0_naive.json
TCAM bits: 10344
RAM bits: 1848
Processing compressed_tree_d7_b0.json with mode priority
Output written to results\rmt\compressed_tree_d7_b0_priority.json
TCAM bits: 7808
RAM bits: 1848
Processing compressed_tree_d7_b1.json with mode naive
Output written to results\rmt\compressed_tree_d7_b1_naive.json
TCAM bits: 10312
RAM bits: 1806
Processing compressed_tree_d7_b1.json with mode priority
Output written to results\rmt\compressed_tree_d7_b1_priority.json
TCAM bits: 8136
RAM bits: 1806
Processing compressed_tree_d7_b3.json with mode naive
Output written to results\rmt\compressed_tree_d7_b3_naive.json
TCAM bits: 7760
RAM bits: 1596
Processing compressed_tree_d7_b3.json with mode priority
Output written to results\rmt\compressed_tree_d7_b3_priority.json
TCAM bits: 6352
RAM bits: 1596
Processing compressed_tree_d8_b0.json with mode naive
Output written to results\rmt\compressed_tree_d8_b0_naive.json
TCAM bits: 15672
RAM bits: 3003
Processing compressed_tree_d8_b0.json with mode priority
Output written to results\rmt\compressed_tree_d8_b0_priority.json
TCAM bits: 12640
RAM bits: 3003
Processing compressed_tree_d8_b1.json with mode naive
Output written to results\rmt\compressed_tree_d8_b1_naive.json
TCAM bits: 15576
RAM bits: 2919
Processing compressed_tree_d8_b1.json with mode priority
Output written to results\rmt\compressed_tree_d8_b1_priority.json
TCAM bits: 13160
RAM bits: 2919
Processing compressed_tree_d8_b3.json with mode naive
Output written to results\rmt\compressed_tree_d8_b3_naive.json
TCAM bits: 11504
RAM bits: 2625
Processing compressed_tree_d8_b3.json with mode priority
Output written to results\rmt\compressed_tree_d8_b3_priority.json
TCAM bits: 10016
RAM bits: 2625
Processing compressed_tree_d9_b0.json with mode naive
Output written to results\rmt\compressed_tree_d9_b0_naive.json
TCAM bits: 22640
RAM bits: 4662
Processing compressed_tree_d9_b0.json with mode priority
Output written to results\rmt\compressed_tree_d9_b0_priority.json
TCAM bits: 18936
RAM bits: 4662
Processing compressed_tree_d9_b1.json with mode naive
Output written to results\rmt\compressed_tree_d9_b1_naive.json
TCAM bits: 22784
RAM bits: 4557
Processing compressed_tree_d9_b1.json with mode priority
Output written to results\rmt\compressed_tree_d9_b1_priority.json
TCAM bits: 19872
RAM bits: 4557
Processing compressed_tree_d9_b3.json with mode naive
Output written to results\rmt\compressed_tree_d9_b3_naive.json
TCAM bits: 16560
RAM bits: 3948
Processing compressed_tree_d9_b3.json with mode priority
Output written to results\rmt\compressed_tree_d9_b3_priority.json
TCAM bits: 14880
RAM bits: 3948
All runs complete.
Press any key to continue . . .
24  run/run.bat  Normal file

@@ -0,0 +1,24 @@
@echo off
REM --- settings --------------------------------------------------------
set INPUT=..\data\combined\data.csv
set OUTDIR=results\tree
set DEPTH_LIST=1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
set BITS_LIST=0 1 3
set PY=python
REM ---------------------------------------------------------------------

if not exist "%OUTDIR%" mkdir "%OUTDIR%"

for %%D in (%DEPTH_LIST%) do (
    for %%B in (%BITS_LIST%) do (
        echo Running depth=%%D bits=%%B
        %PY% decision_tree.py ^
            --input "%INPUT%" ^
            --output "%OUTDIR%\tree_d%%D_b%%B.json" ^
            --depth %%D ^
            --nudge --bits %%B
    )
)

echo All runs complete
pause
272  run/run.txt  Normal file

@@ -0,0 +1,272 @@
Running depth=1 bits=0
dataset size: 7681108
train accuracy (before nudging): 0.6249802762830571
nudging enabled, removed bottom 0 bit(s) per threshold
train accuracy (after nudging): 0.6249802762830571
Wrote tree to C:\Users\jaipa\CS\cs216\IdealRMT-DecisionTrees\run\results\tree\tree_d1_b0.json
Running depth=1 bits=1
dataset size: 7681108
train accuracy (before nudging): 0.6249802762830571
nudging enabled, removed bottom 1 bit(s) per threshold
train accuracy (after nudging): 0.6249802762830571
Wrote tree to C:\Users\jaipa\CS\cs216\IdealRMT-DecisionTrees\run\results\tree\tree_d1_b1.json
Running depth=1 bits=3
dataset size: 7681108
train accuracy (before nudging): 0.6249802762830571
nudging enabled, removed bottom 3 bit(s) per threshold
train accuracy (after nudging): 0.6249802762830571
Wrote tree to C:\Users\jaipa\CS\cs216\IdealRMT-DecisionTrees\run\results\tree\tree_d1_b3.json
Running depth=2 bits=0
dataset size: 7681108
train accuracy (before nudging): 0.6329657127591488
nudging enabled, removed bottom 0 bit(s) per threshold
train accuracy (after nudging): 0.6329657127591488
Wrote tree to C:\Users\jaipa\CS\cs216\IdealRMT-DecisionTrees\run\results\tree\tree_d2_b0.json
Running depth=2 bits=1
dataset size: 7681108
train accuracy (before nudging): 0.6329657127591488
nudging enabled, removed bottom 1 bit(s) per threshold
train accuracy (after nudging): 0.632965582569598
Wrote tree to C:\Users\jaipa\CS\cs216\IdealRMT-DecisionTrees\run\results\tree\tree_d2_b1.json
Running depth=2 bits=3
dataset size: 7681108
train accuracy (before nudging): 0.6329657127591488
nudging enabled, removed bottom 3 bit(s) per threshold
train accuracy (after nudging): 0.632991490290203
Wrote tree to C:\Users\jaipa\CS\cs216\IdealRMT-DecisionTrees\run\results\tree\tree_d2_b3.json
Running depth=3 bits=0
dataset size: 7681108
train accuracy (before nudging): 0.6770542739406867
nudging enabled, removed bottom 0 bit(s) per threshold
train accuracy (after nudging): 0.6770542739406867
Wrote tree to C:\Users\jaipa\CS\cs216\IdealRMT-DecisionTrees\run\results\tree\tree_d3_b0.json
Running depth=3 bits=1
dataset size: 7681108
train accuracy (before nudging): 0.6770542739406867
nudging enabled, removed bottom 1 bit(s) per threshold
train accuracy (after nudging): 0.6770412549856089
Wrote tree to C:\Users\jaipa\CS\cs216\IdealRMT-DecisionTrees\run\results\tree\tree_d3_b1.json
Running depth=3 bits=3
dataset size: 7681108
train accuracy (before nudging): 0.6770542739406867
nudging enabled, removed bottom 3 bit(s) per threshold
train accuracy (after nudging): 0.6785083610333301
Wrote tree to C:\Users\jaipa\CS\cs216\IdealRMT-DecisionTrees\run\results\tree\tree_d3_b3.json
Running depth=4 bits=0
dataset size: 7681108
train accuracy (before nudging): 0.7785798611346175
nudging enabled, removed bottom 0 bit(s) per threshold
train accuracy (after nudging): 0.7785798611346175
Wrote tree to C:\Users\jaipa\CS\cs216\IdealRMT-DecisionTrees\run\results\tree\tree_d4_b0.json
Running depth=4 bits=1
dataset size: 7681108
train accuracy (before nudging): 0.7785798611346175
nudging enabled, removed bottom 1 bit(s) per threshold
train accuracy (after nudging): 0.7762147075656273
Wrote tree to C:\Users\jaipa\CS\cs216\IdealRMT-DecisionTrees\run\results\tree\tree_d4_b1.json
Running depth=4 bits=3
dataset size: 7681108
train accuracy (before nudging): 0.7785798611346175
nudging enabled, removed bottom 3 bit(s) per threshold
train accuracy (after nudging): 0.7764365505601536
Wrote tree to C:\Users\jaipa\CS\cs216\IdealRMT-DecisionTrees\run\results\tree\tree_d4_b3.json
Running depth=5 bits=0
dataset size: 7681108
train accuracy (before nudging): 0.8410252791654538
nudging enabled, removed bottom 0 bit(s) per threshold
train accuracy (after nudging): 0.8410252791654538
Wrote tree to C:\Users\jaipa\CS\cs216\IdealRMT-DecisionTrees\run\results\tree\tree_d5_b0.json
Running depth=5 bits=1
dataset size: 7681108
train accuracy (before nudging): 0.8410252791654538
nudging enabled, removed bottom 1 bit(s) per threshold
train accuracy (after nudging): 0.834092425207405
Wrote tree to C:\Users\jaipa\CS\cs216\IdealRMT-DecisionTrees\run\results\tree\tree_d5_b1.json
Running depth=5 bits=3
dataset size: 7681108
train accuracy (before nudging): 0.8410252791654538
nudging enabled, removed bottom 3 bit(s) per threshold
train accuracy (after nudging): 0.772544924508287
Wrote tree to C:\Users\jaipa\CS\cs216\IdealRMT-DecisionTrees\run\results\tree\tree_d5_b3.json
Running depth=6 bits=0
dataset size: 7681108
train accuracy (before nudging): 0.8646269522574087
nudging enabled, removed bottom 0 bit(s) per threshold
train accuracy (after nudging): 0.8646269522574087
Wrote tree to C:\Users\jaipa\CS\cs216\IdealRMT-DecisionTrees\run\results\tree\tree_d6_b0.json
Running depth=6 bits=1
dataset size: 7681108
train accuracy (before nudging): 0.8646269522574087
nudging enabled, removed bottom 1 bit(s) per threshold
train accuracy (after nudging): 0.8576925360247506
Wrote tree to C:\Users\jaipa\CS\cs216\IdealRMT-DecisionTrees\run\results\tree\tree_d6_b1.json
Running depth=6 bits=3
dataset size: 7681108
train accuracy (before nudging): 0.8646269522574087
nudging enabled, removed bottom 3 bit(s) per threshold
train accuracy (after nudging): 0.794651761178205
Wrote tree to C:\Users\jaipa\CS\cs216\IdealRMT-DecisionTrees\run\results\tree\tree_d6_b3.json
Running depth=7 bits=0
dataset size: 7681108
train accuracy (before nudging): 0.8806056365826389
nudging enabled, removed bottom 0 bit(s) per threshold
train accuracy (after nudging): 0.8806056365826389
Wrote tree to C:\Users\jaipa\CS\cs216\IdealRMT-DecisionTrees\run\results\tree\tree_d7_b0.json
Running depth=7 bits=1
dataset size: 7681108
train accuracy (before nudging): 0.8806056365826389
nudging enabled, removed bottom 1 bit(s) per threshold
train accuracy (after nudging): 0.8736095105029118
Wrote tree to C:\Users\jaipa\CS\cs216\IdealRMT-DecisionTrees\run\results\tree\tree_d7_b1.json
Running depth=7 bits=3
dataset size: 7681108
train accuracy (before nudging): 0.8806056365826389
nudging enabled, removed bottom 3 bit(s) per threshold
train accuracy (after nudging): 0.7695685309983924
Wrote tree to C:\Users\jaipa\CS\cs216\IdealRMT-DecisionTrees\run\results\tree\tree_d7_b3.json
Running depth=8 bits=0
dataset size: 7681108
train accuracy (before nudging): 0.8930218140403702
nudging enabled, removed bottom 0 bit(s) per threshold
train accuracy (after nudging): 0.8930218140403702
Wrote tree to C:\Users\jaipa\CS\cs216\IdealRMT-DecisionTrees\run\results\tree\tree_d8_b0.json
Running depth=8 bits=1
dataset size: 7681108
train accuracy (before nudging): 0.8930218140403702
nudging enabled, removed bottom 1 bit(s) per threshold
train accuracy (after nudging): 0.8853817704424934
Wrote tree to C:\Users\jaipa\CS\cs216\IdealRMT-DecisionTrees\run\results\tree\tree_d8_b1.json
Running depth=8 bits=3
dataset size: 7681108
train accuracy (before nudging): 0.8930218140403702
nudging enabled, removed bottom 3 bit(s) per threshold
train accuracy (after nudging): 0.7773965683075931
Wrote tree to C:\Users\jaipa\CS\cs216\IdealRMT-DecisionTrees\run\results\tree\tree_d8_b3.json
Running depth=9 bits=0
dataset size: 7681108
train accuracy (before nudging): 0.9065990219119429
nudging enabled, removed bottom 0 bit(s) per threshold
train accuracy (after nudging): 0.9065990219119429
Wrote tree to C:\Users\jaipa\CS\cs216\IdealRMT-DecisionTrees\run\results\tree\tree_d9_b0.json
Running depth=9 bits=1
dataset size: 7681108
train accuracy (before nudging): 0.9065990219119429
nudging enabled, removed bottom 1 bit(s) per threshold
train accuracy (after nudging): 0.8971600191014109
Wrote tree to C:\Users\jaipa\CS\cs216\IdealRMT-DecisionTrees\run\results\tree\tree_d9_b1.json
Running depth=9 bits=3
dataset size: 7681108
train accuracy (before nudging): 0.9065990219119429
nudging enabled, removed bottom 3 bit(s) per threshold
train accuracy (after nudging): 0.7901483744272311
Wrote tree to C:\Users\jaipa\CS\cs216\IdealRMT-DecisionTrees\run\results\tree\tree_d9_b3.json
Running depth=10 bits=0
dataset size: 7681108
train accuracy (before nudging): 0.9131070673658019
nudging enabled, removed bottom 0 bit(s) per threshold
train accuracy (after nudging): 0.9131070673658019
Wrote tree to C:\Users\jaipa\CS\cs216\IdealRMT-DecisionTrees\run\results\tree\tree_d10_b0.json
Running depth=10 bits=1
dataset size: 7681108
||||
train accuracy (before nudging): 0.9131070673658019
|
||||
nudging enabled, removed bottom 1 bit(s) per threshold
|
||||
train accuracy (after nudging): 0.9012124292484887
|
||||
Wrote tree to C:\Users\jaipa\CS\cs216\IdealRMT-DecisionTrees\run\results\tree\tree_d10_b1.json
|
||||
Running depth=10 bits=3
|
||||
dataset size: 7681108
|
||||
train accuracy (before nudging): 0.9131070673658019
|
||||
nudging enabled, removed bottom 3 bit(s) per threshold
|
||||
train accuracy (after nudging): 0.7823837394292594
|
||||
Wrote tree to C:\Users\jaipa\CS\cs216\IdealRMT-DecisionTrees\run\results\tree\tree_d10_b3.json
|
||||
Running depth=11 bits=0
|
||||
dataset size: 7681108
|
||||
train accuracy (before nudging): 0.9167131877328115
|
||||
nudging enabled, removed bottom 0 bit(s) per threshold
|
||||
train accuracy (after nudging): 0.9167131877328115
|
||||
Wrote tree to C:\Users\jaipa\CS\cs216\IdealRMT-DecisionTrees\run\results\tree\tree_d11_b0.json
|
||||
Running depth=11 bits=1
|
||||
dataset size: 7681108
|
||||
train accuracy (before nudging): 0.9167131877328115
|
||||
nudging enabled, removed bottom 1 bit(s) per threshold
|
||||
train accuracy (after nudging): 0.9033505322409215
|
||||
Wrote tree to C:\Users\jaipa\CS\cs216\IdealRMT-DecisionTrees\run\results\tree\tree_d11_b1.json
|
||||
Running depth=11 bits=3
|
||||
dataset size: 7681108
|
||||
train accuracy (before nudging): 0.9167131877328115
|
||||
nudging enabled, removed bottom 3 bit(s) per threshold
|
||||
train accuracy (after nudging): 0.7834850128392935
|
||||
Wrote tree to C:\Users\jaipa\CS\cs216\IdealRMT-DecisionTrees\run\results\tree\tree_d11_b3.json
|
||||
Running depth=12 bits=0
|
||||
dataset size: 7681108
|
||||
train accuracy (before nudging): 0.9190772997853955
|
||||
nudging enabled, removed bottom 0 bit(s) per threshold
|
||||
train accuracy (after nudging): 0.9190772997853955
|
||||
Wrote tree to C:\Users\jaipa\CS\cs216\IdealRMT-DecisionTrees\run\results\tree\tree_d12_b0.json
|
||||
Running depth=12 bits=1
|
||||
dataset size: 7681108
|
||||
train accuracy (before nudging): 0.9190772997853955
|
||||
nudging enabled, removed bottom 1 bit(s) per threshold
|
||||
train accuracy (after nudging): 0.9050692946902973
|
||||
Wrote tree to C:\Users\jaipa\CS\cs216\IdealRMT-DecisionTrees\run\results\tree\tree_d12_b1.json
|
||||
Running depth=12 bits=3
|
||||
dataset size: 7681108
|
||||
train accuracy (before nudging): 0.9190772997853955
|
||||
nudging enabled, removed bottom 3 bit(s) per threshold
|
||||
train accuracy (after nudging): 0.7733082258445005
|
||||
Wrote tree to C:\Users\jaipa\CS\cs216\IdealRMT-DecisionTrees\run\results\tree\tree_d12_b3.json
|
||||
Running depth=13 bits=0
|
||||
dataset size: 7681108
|
||||
train accuracy (before nudging): 0.9210431620021486
|
||||
nudging enabled, removed bottom 0 bit(s) per threshold
|
||||
train accuracy (after nudging): 0.9210431620021486
|
||||
Wrote tree to C:\Users\jaipa\CS\cs216\IdealRMT-DecisionTrees\run\results\tree\tree_d13_b0.json
|
||||
Running depth=13 bits=1
|
||||
dataset size: 7681108
|
||||
train accuracy (before nudging): 0.9210431620021486
|
||||
nudging enabled, removed bottom 1 bit(s) per threshold
|
||||
train accuracy (after nudging): 0.9069113466442602
|
||||
Wrote tree to C:\Users\jaipa\CS\cs216\IdealRMT-DecisionTrees\run\results\tree\tree_d13_b1.json
|
||||
Running depth=13 bits=3
|
||||
dataset size: 7681108
|
||||
train accuracy (before nudging): 0.9210431620021486
|
||||
nudging enabled, removed bottom 3 bit(s) per threshold
|
||||
train accuracy (after nudging): 0.7656775558942799
|
||||
Wrote tree to C:\Users\jaipa\CS\cs216\IdealRMT-DecisionTrees\run\results\tree\tree_d13_b3.json
|
||||
Running depth=14 bits=0
|
||||
dataset size: 7681108
|
||||
train accuracy (before nudging): 0.9232170671210456
|
||||
nudging enabled, removed bottom 0 bit(s) per threshold
|
||||
train accuracy (after nudging): 0.9232170671210456
|
||||
Wrote tree to C:\Users\jaipa\CS\cs216\IdealRMT-DecisionTrees\run\results\tree\tree_d14_b0.json
|
||||
Running depth=14 bits=1
|
||||
dataset size: 7681108
|
||||
train accuracy (before nudging): 0.9232169369314948
|
||||
nudging enabled, removed bottom 1 bit(s) per threshold
|
||||
train accuracy (after nudging): 0.9071005120615411
|
||||
Wrote tree to C:\Users\jaipa\CS\cs216\IdealRMT-DecisionTrees\run\results\tree\tree_d14_b1.json
|
||||
Running depth=14 bits=3
|
||||
dataset size: 7681108
|
||||
train accuracy (before nudging): 0.9232170671210456
|
||||
nudging enabled, removed bottom 3 bit(s) per threshold
|
||||
train accuracy (after nudging): 0.7649352150757417
|
||||
Wrote tree to C:\Users\jaipa\CS\cs216\IdealRMT-DecisionTrees\run\results\tree\tree_d14_b3.json
|
||||
Running depth=15 bits=0
|
||||
dataset size: 7681108
|
||||
train accuracy (before nudging): 0.9249752770043072
|
||||
nudging enabled, removed bottom 0 bit(s) per threshold
|
||||
train accuracy (after nudging): 0.9249752770043072
|
||||
Wrote tree to C:\Users\jaipa\CS\cs216\IdealRMT-DecisionTrees\run\results\tree\tree_d15_b0.json
|
||||
Running depth=15 bits=1
|
||||
dataset size: 7681108
|
||||
train accuracy (before nudging): 0.9249752770043072
|
||||
nudging enabled, removed bottom 1 bit(s) per threshold
|
||||
train accuracy (after nudging): 0.908089692268355
|
||||
Wrote tree to C:\Users\jaipa\CS\cs216\IdealRMT-DecisionTrees\run\results\tree\tree_d15_b1.json
|
||||
Running depth=15 bits=3
|
||||
dataset size: 7681108
|
||||
train accuracy (before nudging): 0.9249752770043072
|
||||
nudging enabled, removed bottom 3 bit(s) per threshold
|
||||
train accuracy (after nudging): 0.762985496363285
|
||||
Wrote tree to C:\Users\jaipa\CS\cs216\IdealRMT-DecisionTrees\run\results\tree\tree_d15_b3.json
|
||||
All runs complete
|
||||
Press any key to continue . . .
|
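
A minimal sketch of the nudging step reported above, assuming it masks off the
bottom bits of each integer split threshold; the actual routine is not part of
this diff, and nudge_threshold is a hypothetical name:

    def nudge_threshold(threshold: float, bits: int) -> int:
        # clear the lowest `bits` bits so the threshold needs less match width
        t = int(threshold)
        return (t >> bits) << bits

    # nudge_threshold(1023.0, 3) == 1016; with bits=0 the threshold is
    # unchanged, which matches the identical before/after accuracies in
    # every b0 run above
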
173
run/tree_compress.py
Normal file
@@ -0,0 +1,173 @@
#!/usr/bin/env python3
"""Batch‑compress decision‑tree JSON files.

This script preserves the original logic but loops over every *.json file
in results/tree and drops a corresponding compressed file in
results/compressed_tree.

Example:
    $ python compress_trees_batch.py
"""

from __future__ import annotations

import json
import math
import os
from collections import defaultdict
from pathlib import Path

INPUT_DIR = Path("results/tree")
OUTPUT_DIR = Path("results/compressed_tree")
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)


class SetEncoder(json.JSONEncoder):
    def default(self, obj):  # type: ignore[override]
        if isinstance(obj, set):
            return list(obj)
        return super().default(obj)


# helper function: given a range and a value x, returns whether x is in the range


def is_in_range(x: int, lower: int | None, upper: int | None) -> bool:  # noqa: N803
    if lower is None and upper is None:
        return True
    if lower is None:
        return x <= upper  # type: ignore[operator]
    if upper is None:
        return x > lower
    return x <= upper and x > lower  # type: ignore[operator]
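
# Example: is_in_range(10, None, 10) -> True and is_in_range(10, 10, None) -> False;
# ranges are lower-exclusive and upper-inclusive, i.e. (lower, upper]
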
for tree_path in INPUT_DIR.glob("*.json"):
    with tree_path.open() as f:
        tree = json.load(f)

    paths = tree["paths"]

    # First cleanup the tree by rounding the decision points to integer values
    path_ids: set[int] = set()
    path_classes = tree["classes"]

    # assign ids and round thresholds
    for idx, path in enumerate(paths):
        path["id"] = idx
        path_ids.add(idx)
        for condition in path["conditions"]:
            # flooring is correct for both comparison directions on integer
            # features: x <= 10.5 iff x <= 10, and x > 10.5 iff x > 10, so
            # the original identical if/else branches collapse to one line
            condition["value"] = math.floor(condition["value"])

    # Find all breakpoints for each feature and create a set of disjoint ranges
    breakpoints: dict[str, list[int]] = defaultdict(set)  # type: ignore[assignment]
    for path in paths:
        for condition in path["conditions"]:
            feature = condition["feature"]
            value = condition["value"]
            breakpoints[feature].add(value)

    # sort breakpoint lists
    for feature in breakpoints:
        points = list(breakpoints[feature])
        points.sort()
        breakpoints[feature] = points  # type: ignore[assignment]

    # collapse all paths to ranges for each feature
    for path in paths:
        compressed: dict[str, dict[str, int | None]] = {}
        for feature in breakpoints:
            compressed[feature] = {"min": None, "max": None}

        for condition in path["conditions"]:
            feature = condition["feature"]
            operation = condition["operation"]
            value = condition["value"]
            if operation == "<=" and compressed[feature]["max"] is None:
                compressed[feature]["max"] = value
            elif operation == ">" and compressed[feature]["min"] is None:
                compressed[feature]["min"] = value
            elif operation == "<=" and value < compressed[feature]["max"]:  # type: ignore[operator]
                compressed[feature]["max"] = value
            elif operation == ">" and value > compressed[feature]["min"]:  # type: ignore[operator]
                compressed[feature]["min"] = value

        path["compressed"] = compressed

    # create buckets for each feature, where each is a list of sets
    buckets_id: dict[str, list[set[int]]] = {}
    buckets_class: dict[str, list[set[str]]] = {}
    for feature in breakpoints:
        num_points = len(breakpoints[feature])
        buckets_id[feature] = [set() for _ in range(num_points + 1)]
        buckets_class[feature] = [set() for _ in range(num_points + 1)]

    # fill buckets
    for path in paths:
        for feature_name, feature in path["compressed"].items():
            lower = feature["min"]
            upper = feature["max"]
            pid = path["id"]
            cls = path["classification"]

            for idx, bp in enumerate(breakpoints[feature_name]):
                if is_in_range(bp, lower, upper):
                    buckets_id[feature_name][idx].add(pid)
                    buckets_class[feature_name][idx].add(cls)
            # last bucket (> last breakpoint)
            if is_in_range(bp + 1, lower, upper):
                buckets_id[feature_name][-1].add(pid)
                buckets_class[feature_name][-1].add(cls)

    # combine breakpoints and buckets to one representation
    compressed_layers: dict[str, list[dict[str, object]]] = defaultdict(list)
    for feature_name in buckets_id:
        lower = None
        upper = breakpoints[feature_name][0]
        compressed_layers[feature_name].append(
            {
                "min": lower,
                "max": upper,
                "paths": buckets_id[feature_name][0],
                "classes": buckets_class[feature_name][0],
            }
        )
        for i in range(1, len(buckets_id[feature_name]) - 1):
            lower = breakpoints[feature_name][i - 1]
            upper = breakpoints[feature_name][i]
            compressed_layers[feature_name].append(
                {
                    "min": lower,
                    "max": upper,
                    "paths": buckets_id[feature_name][i],
                    "classes": buckets_class[feature_name][i],
                }
            )
        lower = breakpoints[feature_name][-1]
        upper = None
        compressed_layers[feature_name].append(
            {
                "min": lower,
                "max": upper,
                "paths": buckets_id[feature_name][-1],
                "classes": buckets_class[feature_name][-1],
            }
        )

    path_to_class = {path["id"]: path["classification"] for path in paths}

    compressed_tree = {
        "paths": list(path_ids),
        "classes": path_classes,
        "layers": compressed_layers,
        "path_to_class": path_to_class,
    }

    out_path = OUTPUT_DIR / tree_path.name.replace("tree", "compressed_tree")
    with out_path.open("w") as f_out:
        json.dump(compressed_tree, f_out, indent=4, cls=SetEncoder)

    # print(f"Wrote {out_path.relative_to(Path.cwd())}")
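
A small worked example of the bucketing logic above, using hypothetical
breakpoints [80, 443] on a single feature (values invented for illustration):

    # disjoint ranges produced: (None, 80], (80, 443], (443, None)
    # a path compressed to {"min": 80, "max": None} is tested per breakpoint:
    #   is_in_range(80, 80, None)      -> False  (80 is not > 80)
    #   is_in_range(443, 80, None)     -> True   -> joins bucket (80, 443]
    #   is_in_range(443 + 1, 80, None) -> True   -> joins last bucket (443, None)
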
279
run/tree_to_rmt.py
Normal file
@@ -0,0 +1,279 @@
#!/usr/bin/env python3
"""Range‑to‑Prefix evaluation tool

This script keeps the original logic intact while letting you choose
which expansion strategy to run via a command‑line flag.

Example:
    $ python rmt_selectable.py --mode naive
    $ python rmt_selectable.py --mode priority --input mytree.json --output result.json
"""

import argparse
import json
import math
import sys
from pathlib import Path

# ---------------------------------------------------------------------------
# Static configuration
# ---------------------------------------------------------------------------
field_width = {
    "src": 16,
    "dst": 16,
    "protocol": 8,
}

# ---------------------------------------------------------------------------
# Helper routines (unchanged)
# ---------------------------------------------------------------------------


def int_to_bin(i, width):
    return bin(i)[2:].zfill(width)


def increment_dc(pfx):
    idx = pfx.find("*")
    if idx == -1:
        idx = len(pfx)
    idx -= 1
    return pfx[:idx] + "*" + pfx[idx + 1 :]


def can_merge(pfx_a, pfx_b):
    pfx_a = pfx_a.replace("*", "")
    pfx_b = pfx_b.replace("*", "")
    return pfx_a[:-1] == pfx_b[:-1] and pfx_a[-1] != pfx_b[-1]


def merge(pfx_a, prefixes):
    pfx_a = increment_dc(pfx_a)
    prefixes[-1] = pfx_a

    for i in range(len(prefixes) - 2, -1, -1):
        if can_merge(prefixes[i], prefixes[i + 1]):
            prefixes.pop()
            pfx = increment_dc(prefixes[i])
            prefixes[i] = pfx


def convert_range(lower, upper, width):
    prefixes = []
    prefix = int_to_bin(lower, width)
    prefixes.append(prefix)
    norm_upper = min(upper, 2 ** width - 1)
    for i in range(lower + 1, norm_upper + 1):
        prefix = int_to_bin(i, width)
        if can_merge(prefix, prefixes[-1]):
            merge(prefix, prefixes)
        else:
            prefixes.append(prefix)
    return prefixes
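
# Worked example of the helpers above: convert_range(5, 8, 4) covers the
# integers 5..8 at width 4 and returns ['0101', '011*', '1000'], after
# '0110' and '0111' merge into '011*'.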

# ---------------------------------------------------------------------------
# RMT construction strategies (logic preserved)
# ---------------------------------------------------------------------------

def worst_case_rmt(tree):
    rmt = []
    step = 0

    tcam_bits = 0
    ram_bits = 0

    for layer in layers:
        num_ranges = len(layers[layer])
        # assume that each range requires all of 2*k prefixes when performing prefix expansion
        # therefore there are 2*k * R for R ranges and width k
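        # (2*k is a deliberately loose form of the classic bound: an arbitrary
        # range on a k-bit field expands to at most 2k-2 prefixes)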
        num_prefixes = 2 * field_width[layer] * num_ranges
        prefix_width = field_width[layer]

        tcam = {
            "id": f"{layer}_range",
            "step": step,
            "match": "ternary",
            "entries": num_prefixes,
            "key_size": prefix_width,
        }
        tcam_bits += num_prefixes * prefix_width

        # assume basic pointer reuse for metadata storage
        ram = {
            "id": f"{layer}_meta",
            "step": step,
            "match": "exact",
            "method": "index",
            "key_size": math.ceil(math.log2(num_ranges)),
            "data_size": len(classes),
        }
        ram_bits += num_ranges * len(classes)

        rmt.append(tcam)
        rmt.append(ram)

        step += 1

    return rmt, tcam_bits, ram_bits


def naive_rmt(tree):
    rmt = []
    step = 0

    tcam_bits = 0
    ram_bits = 0

    for layer in layers:
        num_prefixes = 0
        prefix_width = field_width[layer]
        # for each range in the layer, convert the ranges to prefixes using naive range expansion
        for r in layers[layer]:
            if r["min"] is None:
                r["min"] = 0
            elif r["max"] is None:
                r["max"] = 2 ** prefix_width
            prefixes = convert_range(r["min"], r["max"], prefix_width)
            r["prefixes"] = prefixes
            num_prefixes += len(prefixes)
            tcam_bits += len(prefixes) * prefix_width

        tcam = {
            "id": f"{layer}_range",
            "step": step,
            "match": "ternary",
            "entries": num_prefixes,
            "key_size": prefix_width,
            "ranges": layers[layer],
        }

        num_ranges = len(layers[layer])
        # assume no pointer reuse for metadata storage
        ram = {
            "id": f"{layer}_meta",
            "step": step,
            "match": "exact",
            "method": "index",
            "key_size": math.ceil(math.log2(num_ranges)),
            "data_size": len(classes),
        }
        ram_bits += num_ranges * len(classes)

        rmt.append(tcam)
        rmt.append(ram)

        step += 1

    return rmt, tcam_bits, ram_bits


def priority_aware(tree):
    rmt = []
    step = 0

    tcam_bits = 0
    ram_bits = 0

    for layer in layers:
        num_prefixes = 0
        prefix_width = field_width[layer]
        # for each range, run the regular prefix expansion, and also the prefix expansion setting the minimum to 0
        # then check which set of prefixes would be better
        # we will assume the ranges are already disjoint and in the correct order
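        # e.g. at width 4, convert_range(5, 8, 4) takes 3 prefixes while
        # convert_range(0, 8, 4) takes only 2 (['0***', '1000']); because the
        # ranges are disjoint and ordered, earlier higher-priority TCAM
        # entries mask the extra coverage of a zero-start prefix set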
        for r in layers[layer]:
            if r["min"] is None:
                r["min"] = 0
            elif r["max"] is None:
                r["max"] = 2 ** prefix_width
            regular_prefixes = convert_range(r["min"], r["max"], prefix_width)
            zero_start_prefixes = convert_range(0, r["max"], prefix_width)

            if len(regular_prefixes) <= len(zero_start_prefixes):
                pfx_type = "exact"
                prefixes = regular_prefixes
            else:
                pfx_type = "zero"
                prefixes = zero_start_prefixes

            r["prefixes"] = prefixes
            r["prefix_type"] = pfx_type
            num_prefixes += len(prefixes)
            tcam_bits += len(prefixes) * prefix_width

        tcam = {
            "id": f"{layer}_range",
            "step": step,
            "match": "ternary",
            "entries": num_prefixes,
            "key_size": prefix_width,
            "ranges": layers[layer],
        }

        num_ranges = len(layers[layer])
        # assume no pointer reuse for metadata storage
        ram = {
            "id": f"{layer}_meta",
            "step": step,
            "match": "exact",
            "method": "index",
            "key_size": math.ceil(math.log2(num_ranges)),
            "data_size": len(classes),
        }
        ram_bits += num_ranges * len(classes)

        rmt.append(tcam)
        rmt.append(ram)

        step += 1

    return rmt, tcam_bits, ram_bits


# ---------------------------------------------------------------------------
# Main entry point
# ---------------------------------------------------------------------------


def parse_args() -> argparse.Namespace:
    parser = argparse.ArgumentParser(description="Evaluate RMT memory usage for different range‑to‑prefix strategies.")
    parser.add_argument("--mode", choices=["worst", "naive", "priority"], default="worst", help="Strategy to use")
    parser.add_argument("--input", default="compressed_tree.json", help="Input tree JSON file")
    parser.add_argument("--output", default=None, help="Output RMT JSON file (defaults to <mode>_rmt.json)")
    return parser.parse_args()


def main() -> None:
    args = parse_args()

    # Keep the original variable names so the functions stay unchanged
    global layers, classes

    try:
        with open(args.input) as f:
            tree = json.load(f)
    except FileNotFoundError:
        sys.exit(f"Input file '{args.input}' not found.")

    layers = tree["layers"]
    classes = tree["classes"]

    if args.mode == "worst":
        rmt, tcam_bits, ram_bits = worst_case_rmt(tree)
        default_out = "worst_case_rmt.json"
    elif args.mode == "naive":
        rmt, tcam_bits, ram_bits = naive_rmt(tree)
        default_out = "naive_rmt.json"
    else:  # priority
        rmt, tcam_bits, ram_bits = priority_aware(tree)
        default_out = "priority_aware.json"

    out_file = args.output or default_out

    with open(out_file, "w") as f:
        json.dump(rmt, f, indent=4)

    #! command python3 ideal-rmt-simulator/sim.py {out_file}
    print(f"Output written to {out_file}")
    print(f"TCAM bits: {tcam_bits}")
    print(f"RAM bits: {ram_bits}")


if __name__ == "__main__":
    main()
44
sanity_check/csvdiff.py
Normal file
@@ -0,0 +1,44 @@
#!/usr/bin/env python3
"""
csvdiff.py file1.csv file2.csv
Streams both files; prints the first differing line or
‘No differences found’. Uses O(1) memory.
"""

import sys
from itertools import zip_longest
from pathlib import Path


def open_checked(p: str):
    print(p)
    path = Path(p)
    try:
        return path.open("r", newline=""), path
    except FileNotFoundError:
        sys.exit(f"Error: {path} not found")


def human(n: int) -> str:
    return f"{n:,}"


def main(a_path: str, b_path: str) -> None:
    fa, a = open_checked(a_path)
    fb, b = open_checked(b_path)

    with fa, fb:
        for idx, (ra, rb) in enumerate(zip_longest(fa, fb), 1):
            if ra != rb:
                print(f"Files differ at line {human(idx)}")
                if ra is None:
                    print(f"{a} ended early")
                elif rb is None:
                    print(f"{b} ended early")
                else:
                    print(f"{a}: {ra.rstrip()}")
                    print(f"{b}: {rb.rstrip()}")
                return
        print("No differences found")


if __name__ == "__main__":
    if len(sys.argv) != 3:
        sys.exit("Usage: csvdiff.py file1.csv file2.csv")
    main(sys.argv[1], sys.argv[2])
600
sanity_check/data_visualization.ipynb
Normal file
File diff suppressed because one or more lines are too long
206
sanity_check/diversity_metrics.py
Normal file
@@ -0,0 +1,206 @@
#!/usr/bin/env python3
"""diversity_metrics.py (fast version)

Estimate how much diversity each CSV adds without building a giant in‑memory
DataFrame. Designed for IoT packet logs with millions of rows.

Quick summary printed as a GitHub‑style table (requires *tabulate*; falls back
to pandas plain text).

Usage
-----
python diversity_metrics.py path/to/processed_dir [-r] [--sample 50000]

Metrics
-------
ΔEntropy : change in Shannon entropy of *classification* counts
ΔGini    : change in Gini impurity of the same counts
χ² p     : Pearson χ² p‑value, old vs new classification counts
Jaccard  : similarity of unique (src,dst) pairs (0 → new pairs, 1 → no new)
KS src p : Kolmogorov–Smirnov p‑value, source‑port dist (uses sampling)
KS dst p : Kolmogorov–Smirnov p‑value, dest‑port dist (uses sampling)

Speed tricks
------------
* No growing DataFrame; we keep Counters / sets / lists.
* Ports for KS are *sampled* (default 50 k) to bound cost.
* (src,dst) pairs are hashed to a 32‑bit int to reduce set overhead.
* pandas reads via **pyarrow** engine when available.
"""

import argparse
from pathlib import Path
from collections import Counter
from typing import List, Set

import numpy as np
import pandas as pd
from scipy.stats import chi2_contingency, ks_2samp, entropy

try:
    from tabulate import tabulate
    _USE_TABULATE = True
except ImportError:
    _USE_TABULATE = False

# -----------------------------------------------------------------------------
# Helper metrics
# -----------------------------------------------------------------------------


def shannon(counts: Counter) -> float:
    total = sum(counts.values())
    if total == 0:
        return 0.0
    p = np.fromiter(counts.values(), dtype=float)
    p /= total
    return entropy(p, base=2)


def gini(counts: Counter) -> float:
    total = sum(counts.values())
    if total == 0:
        return 0.0
    return 1.0 - sum((n / total) ** 2 for n in counts.values())


def jaccard(a: Set[int], b: Set[int]) -> float:
    if not a and not b:
        return 1.0
    return len(a & b) / len(a | b)
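
# Quick checks for the helpers above:
#   shannon(Counter({"a": 2, "b": 2})) == 1.0   (one bit of entropy)
#   gini(Counter({"a": 2, "b": 2}))    == 0.5
#   jaccard({1, 2}, {2, 3})            == 1/3
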
# -----------------------------------------------------------------------------
# Core analysis
# -----------------------------------------------------------------------------


def analyse(csv_files: List[Path], sample_size: int):
    """Return list of dicts with diversity metrics for each added file."""

    # cumulative state (no big DataFrame!)
    class_counter: Counter = Counter()
    pair_hashes: Set[int] = set()
    src_list: List[int] = []
    dst_list: List[int] = []

    rows = []

    for csv_path in csv_files:
        df = pd.read_csv(
            csv_path,
            engine="pyarrow" if pd.__version__ >= "2" else "c",  # fast parse
            usecols=["protocl", "src", "dst", "classfication"],
            dtype={
                "protocl": "uint16",
                "protocol": "uint16",
                "src": "uint16",
                "dst": "uint16",
            },
        )
        # normalise column names
        df.rename(columns={"protocl": "protocol", "classfication": "classification"}, inplace=True)

        # snapshot previous state
        prev_class = class_counter.copy()
        prev_pairs = pair_hashes.copy()
        prev_src = np.asarray(src_list, dtype=np.uint16)
        prev_dst = np.asarray(dst_list, dtype=np.uint16)

        # --- update cumulative structures ------------------------------------
        class_counter.update(df["classification"].value_counts().to_dict())

        # hash (src,dst) into 32‑bit int to save memory
        pair_ids = (df["src"].to_numpy(dtype=np.uint32) << np.uint32(16)) | \
                   df["dst"].to_numpy(dtype=np.uint32)
        # fold the new hashes into the cumulative set; without this update
        # pair_hashes stays empty and the Jaccard column is always 1.0
        pair_hashes.update(pair_ids.tolist())

        # extend port lists (keep small ints)
        src_list.extend(df["src"].tolist())
        dst_list.extend(df["dst"].tolist())

        # --- metrics ----------------------------------------------------------
        # χ² classification
        chi_p = np.nan
        if prev_class:
            all_classes = list(set(prev_class) | set(df["classification"].unique()))
            old = [prev_class.get(c, 0) for c in all_classes]
            new = [df["classification"].value_counts().get(c, 0) for c in all_classes]
            _, chi_p, _, _ = chi2_contingency([old, new])

        # entropy & gini deltas
        delta_entropy = shannon(class_counter) - shannon(prev_class)
        delta_gini = gini(class_counter) - gini(prev_class)

        # Jaccard on pair hashes
        jc = jaccard(prev_pairs, pair_hashes)

        # KS tests on sampled ports
        ks_src_p = ks_dst_p = np.nan
        if prev_src.size:
            new_src = df["src"].to_numpy(dtype=np.uint16)
            new_dst = df["dst"].to_numpy(dtype=np.uint16)
            if prev_src.size > sample_size:
                prev_src_sample = np.random.choice(prev_src, sample_size, replace=False)
            else:
                prev_src_sample = prev_src
            if new_src.size > sample_size:
                new_src_sample = np.random.choice(new_src, sample_size, replace=False)
            else:
                new_src_sample = new_src
            if prev_dst.size > sample_size:
                prev_dst_sample = np.random.choice(prev_dst, sample_size, replace=False)
            else:
                prev_dst_sample = prev_dst
            if new_dst.size > sample_size:
                new_dst_sample = np.random.choice(new_dst, sample_size, replace=False)
            else:
                new_dst_sample = new_dst

            ks_src_p = ks_2samp(prev_src_sample, new_src_sample).pvalue
            ks_dst_p = ks_2samp(prev_dst_sample, new_dst_sample).pvalue

        rows.append(
            {
                "File": csv_path.name,
                "Rows": len(df),
                "ΔEntropy": round(delta_entropy, 4),
                "ΔGini": round(delta_gini, 4),
                "χ² p": f"{chi_p:.3g}" if not np.isnan(chi_p) else "NA",
                "Jaccard": round(jc, 3),
                "KS src p": f"{ks_src_p:.3g}" if not np.isnan(ks_src_p) else "NA",
                "KS dst p": f"{ks_dst_p:.3g}" if not np.isnan(ks_dst_p) else "NA",
            }
        )
    return rows


# -----------------------------------------------------------------------------
# CLI
# -----------------------------------------------------------------------------


def main():
    ap = argparse.ArgumentParser(description="Evaluate diversity contribution of each CSV (fast version).")
    ap.add_argument("csv_dir", help="Directory containing CSV files")
    ap.add_argument("-r", "--recursive", action="store_true", help="Recursively search csv_dir")
    ap.add_argument("--sample", type=int, default=50_000, help="Sample size for KS tests (default 50k)")
    args = ap.parse_args()

    root = Path(args.csv_dir)
    pattern = "**/*.csv" if args.recursive else "*.csv"
    csv_files = sorted(root.glob(pattern))
    if not csv_files:
        print("No CSV files found.")
        return

    table_rows = analyse(csv_files, args.sample)

    if _USE_TABULATE:
        print(tabulate(table_rows, headers="keys", tablefmt="github", floatfmt=".4f"))
    else:
        print(pd.DataFrame(table_rows).to_string(index=False))

    print(
        "\nLegend:\n • p-values (χ², KS) < 0.05 → new file significantly shifts distribution (GOOD)"
        "\n • Positive ΔEntropy or ΔGini → richer mix; near 0 → little new info"
        "\n • Jaccard close to 0 → many unseen (src,dst) pairs; close to 1 → redundant."
    )


if __name__ == "__main__":
    main()
14
setup.sh
Normal file
@@ -0,0 +1,14 @@
#!/usr/bin/env bash
# Creates the directory layout:
#   data/
#     tar/
#     pcap/
#     processed/
#     combined/

set -euo pipefail

root="$(cd -- "$(dirname "${BASH_SOURCE[0]}")" && pwd)"

mkdir -p "$root"/data/{tar,pcap,processed,combined}

echo "Directory structure ready under $root/data/"