mirror of
				https://github.com/ltcptgeneral/IdealRMT-DecisionTrees.git
				synced 2025-10-25 21:39:21 +00:00 
			
		
		
		
	Compare commits
	
		
			21 Commits
		
	
	
		
			23867747cd
			...
			jai_runs
		
	
	| Author | SHA1 | Date | |
|---|---|---|---|
|  | c8a0b18abf | ||
| 2ad40946d1 | |||
| 50075b1acc | |||
|  | 1585399c7d | ||
| 8301998da3 | |||
| 3b2d6b3186 | |||
|  | 24fc2ed6f7 | ||
|  | fda251f051 | ||
| 541538fcfe | |||
|  | afc882a569 | ||
| 6de3807fe2 | |||
|  | fc16d3c586 | ||
| 7bee40ecf9 | |||
|  | e811171a73 | ||
| 61a451b82d | |||
| c73de36c70 | |||
| fadeab8a99 | |||
| c208037ae9 | |||
| ae3128f6e8 | |||
| 25e5a86a43 | |||
| d3fe6efd47 | 
							
								
								
									
										2
									
								
								.gitattributes
									
									
									
									
										vendored
									
									
										Normal file
									
								
							
							
						
						
									
										2
									
								
								.gitattributes
									
									
									
									
										vendored
									
									
										Normal file
									
								
							| @@ -0,0 +1,2 @@ | |||||||
|  | # force LF for any shell script | ||||||
|  | *.sh text eol=lf | ||||||
							
								
								
									
										6
									
								
								.gitignore
									
									
									
									
										vendored
									
									
								
							
							
						
						
									
										6
									
								
								.gitignore
									
									
									
									
										vendored
									
									
								
							| @@ -1,4 +1,6 @@ | |||||||
| data.* | data.* | ||||||
| __pycache__ | __pycache__ | ||||||
| tree.json | *.json | ||||||
| compressed_tree.json | data/* | ||||||
|  | .DS_Store | ||||||
|  | .ipynb_checkpoints/ | ||||||
|   | |||||||
							
								
								
									
										152
									
								
								CompressedTreeParser.ipynb
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										152
									
								
								CompressedTreeParser.ipynb
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,152 @@ | |||||||
|  | { | ||||||
|  |  "cells": [ | ||||||
|  |   { | ||||||
|  |    "cell_type": "code", | ||||||
|  |    "execution_count": 138, | ||||||
|  |    "id": "938dec51", | ||||||
|  |    "metadata": {}, | ||||||
|  |    "outputs": [], | ||||||
|  |    "source": [ | ||||||
|  |     "import numpy as np\n", | ||||||
|  |     "import pandas as pd\n", | ||||||
|  |     "import argparse\n", | ||||||
|  |     "from sklearn.tree import DecisionTreeClassifier, plot_tree, _tree\n", | ||||||
|  |     "from sklearn.metrics import accuracy_score\n", | ||||||
|  |     "from sklearn.tree import export_graphviz\n", | ||||||
|  |     "import pydotplus\n", | ||||||
|  |     "from matplotlib import pyplot as plt\n", | ||||||
|  |     "from labels import mac_to_label\n", | ||||||
|  |     "import json\n", | ||||||
|  |     "import math" | ||||||
|  |    ] | ||||||
|  |   }, | ||||||
|  |   { | ||||||
|  |    "cell_type": "code", | ||||||
|  |    "execution_count": 139, | ||||||
|  |    "id": "442624c7", | ||||||
|  |    "metadata": {}, | ||||||
|  |    "outputs": [], | ||||||
|  |    "source": [ | ||||||
|  |     "Set1 = pd.read_csv('data/combined/data.csv').values.tolist()\n", | ||||||
|  |     "X = [i[0:3] for i in Set1]\n", | ||||||
|  |     "Y =[i[3] for i in Set1]" | ||||||
|  |    ] | ||||||
|  |   }, | ||||||
|  |   { | ||||||
|  |    "cell_type": "code", | ||||||
|  |    "execution_count": 142, | ||||||
|  |    "id": "12ad454d", | ||||||
|  |    "metadata": {}, | ||||||
|  |    "outputs": [ | ||||||
|  |     { | ||||||
|  |      "name": "stdout", | ||||||
|  |      "output_type": "stream", | ||||||
|  |      "text": [ | ||||||
|  |       "{'0': 20, '1': 20, '2': 9, '3': 20, '4': 0, '5': 13, '6': 20, '7': 0, '8': 12, '9': 4, '10': 20, '11': 4, '12': 1, '13': 16, '14': 20, '15': 2, '16': 20, '17': 0, '18': 20, '19': 20, '20': 20, '21': 20, '22': 20, '23': 1, '24': 2, '25': 20, '26': 13, '27': 11, '28': 20, '29': 20}\n" | ||||||
|  |      ] | ||||||
|  |     } | ||||||
|  |    ], | ||||||
|  |    "source": [ | ||||||
|  |     "predict_Yt = []\n", | ||||||
|  |     "index=0\n", | ||||||
|  |     "\n", | ||||||
|  |     "with open('compressed_tree.json', 'r') as file:\n", | ||||||
|  |     "    data = json.load(file)\n", | ||||||
|  |     "    classes = data[\"classes\"]\n", | ||||||
|  |     "    for x in X:\n", | ||||||
|  |     "        counter = 0\n", | ||||||
|  |     "        class_set = []\n", | ||||||
|  |     "        paths_set = []\n", | ||||||
|  |     "        features = [\"protocol\", \"src\", \"dst\"]\n", | ||||||
|  |     "        for feature in features:\n", | ||||||
|  |     "            if feature in data[\"layers\"]:\n", | ||||||
|  |     "                for node in data['layers'][feature]:\n", | ||||||
|  |     "                    if node['min'] is None:\n", | ||||||
|  |     "                        if x[counter] <= node['max']:\n", | ||||||
|  |     "                            class_set.append(node['classes'])\n", | ||||||
|  |     "                            paths_set.append(node[\"paths\"])\n", | ||||||
|  |     "                            break #is this an issue?\n", | ||||||
|  |     "                        else:\n", | ||||||
|  |     "                            continue\n", | ||||||
|  |     "                    elif node['max'] is None:\n", | ||||||
|  |     "                        if node['min'] < x[counter]:\n", | ||||||
|  |     "                            class_set.append(node['classes'])\n", | ||||||
|  |     "                            paths_set.append(node[\"paths\"])\n", | ||||||
|  |     "                            break #is this an issue?\n", | ||||||
|  |     "                        else:\n", | ||||||
|  |     "                            continue\n", | ||||||
|  |     "                    elif node['min'] < x[counter] and x[counter] <= node['max']:\n", | ||||||
|  |     "                        class_set.append(node['classes'])\n", | ||||||
|  |     "                        paths_set.append(node[\"paths\"])\n", | ||||||
|  |     "                        break #is this an issue?\n", | ||||||
|  |     "\n", | ||||||
|  |     "            counter += 1\n", | ||||||
|  |     "        result = set(class_set[0])\n", | ||||||
|  |     "        paths = set(paths_set[0])\n", | ||||||
|  |     "        for s in class_set[1:]:\n", | ||||||
|  |     "            result.intersection_update(s)\n", | ||||||
|  |     "        for s in paths_set[1:]:\n", | ||||||
|  |     "            paths.intersection_update(s)\n", | ||||||
|  |     "\n", | ||||||
|  |     "        #predict_Yt.append(list(result))\n", | ||||||
|  |     "        #print(result)\n", | ||||||
|  |     "        if len(paths) != 1:\n", | ||||||
|  |     "            print(paths)\n", | ||||||
|  |     "            print(x)\n", | ||||||
|  |     "            print(result)\n", | ||||||
|  |     "        assert len(paths) == 1\n", | ||||||
|  |     "        path = list(paths)[0]\n", | ||||||
|  |     "        pred = data[\"path_to_class\"][str(path)]\n", | ||||||
|  |     "        pred_class = classes[pred]\n", | ||||||
|  |     "        predict_Yt.append(pred_class)\n", | ||||||
|  |     "        \n", | ||||||
|  |     "        index += 1" | ||||||
|  |    ] | ||||||
|  |   }, | ||||||
|  |   { | ||||||
|  |    "cell_type": "code", | ||||||
|  |    "execution_count": 143, | ||||||
|  |    "id": "8b4c56b6", | ||||||
|  |    "metadata": {}, | ||||||
|  |    "outputs": [ | ||||||
|  |     { | ||||||
|  |      "name": "stdout", | ||||||
|  |      "output_type": "stream", | ||||||
|  |      "text": [ | ||||||
|  |       "0.8410252791654538\n" | ||||||
|  |      ] | ||||||
|  |     } | ||||||
|  |    ], | ||||||
|  |    "source": [ | ||||||
|  |     "correct = 0\n", | ||||||
|  |     "for i in range(len(Y)):\n", | ||||||
|  |     "    prediction = predict_Yt[i]\n", | ||||||
|  |     "    if prediction != None and Y[i] == prediction:\n", | ||||||
|  |     "        correct += 1\n", | ||||||
|  |     "\n", | ||||||
|  |     "print(correct / len(Y))" | ||||||
|  |    ] | ||||||
|  |   } | ||||||
|  |  ], | ||||||
|  |  "metadata": { | ||||||
|  |   "kernelspec": { | ||||||
|  |    "display_name": "Python 3", | ||||||
|  |    "language": "python", | ||||||
|  |    "name": "python3" | ||||||
|  |   }, | ||||||
|  |   "language_info": { | ||||||
|  |    "codemirror_mode": { | ||||||
|  |     "name": "ipython", | ||||||
|  |     "version": 3 | ||||||
|  |    }, | ||||||
|  |    "file_extension": ".py", | ||||||
|  |    "mimetype": "text/x-python", | ||||||
|  |    "name": "python", | ||||||
|  |    "nbconvert_exporter": "python", | ||||||
|  |    "pygments_lexer": "ipython3", | ||||||
|  |    "version": "3.12.7" | ||||||
|  |   } | ||||||
|  |  }, | ||||||
|  |  "nbformat": 4, | ||||||
|  |  "nbformat_minor": 5 | ||||||
|  | } | ||||||
										
											
												File diff suppressed because one or more lines are too long
											
										
									
								
							| @@ -89,7 +89,7 @@ | |||||||
|  ], |  ], | ||||||
|  "metadata": { |  "metadata": { | ||||||
|   "kernelspec": { |   "kernelspec": { | ||||||
|    "display_name": "switch", |    "display_name": "Python 3 (ipykernel)", | ||||||
|    "language": "python", |    "language": "python", | ||||||
|    "name": "python3" |    "name": "python3" | ||||||
|   }, |   }, | ||||||
| @@ -103,7 +103,7 @@ | |||||||
|    "name": "python", |    "name": "python", | ||||||
|    "nbconvert_exporter": "python", |    "nbconvert_exporter": "python", | ||||||
|    "pygments_lexer": "ipython3", |    "pygments_lexer": "ipython3", | ||||||
|    "version": "3.12.7" |    "version": "3.12.9" | ||||||
|   } |   } | ||||||
|  }, |  }, | ||||||
|  "nbformat": 4, |  "nbformat": 4, | ||||||
|   | |||||||
							
								
								
									
										10
									
								
								README.md
									
									
									
									
									
								
							
							
						
						
									
										10
									
								
								README.md
									
									
									
									
									
								
							| @@ -2,17 +2,21 @@ | |||||||
|  |  | ||||||
| Run `pip install -r requirements.txt` | Run `pip install -r requirements.txt` | ||||||
|  |  | ||||||
|  | Run `setup.sh` | ||||||
|  |  | ||||||
| # Tree Generation | # Tree Generation | ||||||
|  |  | ||||||
| ## Download Dataset | ## Download Dataset | ||||||
|  |  | ||||||
| Download the *September 22 2016* dataset from: https://iotanalytics.unsw.edu.au/iottraces.html#bib18tmc | Download the *September 22 2016* dataset (or others) from: https://iotanalytics.unsw.edu.au/iottraces.html#bib18tmc | ||||||
|  |  | ||||||
| Rename the file as data.pcap | Place these into the `data/tar` folder. | ||||||
|  |  | ||||||
|  | Run `extract_tars.sh`, which will extract the `.pcap` files and place them at the corresponding location inside `data/pcap`. | ||||||
|  |  | ||||||
| ## Preprocessing Dataset | ## Preprocessing Dataset | ||||||
|  |  | ||||||
| Run `ExtractDataset.ipynb`, this will take a few minutes | Run `extract_all_datasets.py`, which will extract the data from each file in `data/pcap` and turn it into the corresponding `.csv` file inside `data/processed`. This will take a few minutes per file. Combine the data under `data/csv` using `combine_csv.py`. This will overwrite `data/combined/data.csv`, which you can use for the decision tree. | ||||||
|  |  | ||||||
| ## Training | ## Training | ||||||
|  |  | ||||||
|   | |||||||
| @@ -2,7 +2,7 @@ | |||||||
|  "cells": [ |  "cells": [ | ||||||
|   { |   { | ||||||
|    "cell_type": "code", |    "cell_type": "code", | ||||||
|    "execution_count": 1, |    "execution_count": 73, | ||||||
|    "id": "ec310f34", |    "id": "ec310f34", | ||||||
|    "metadata": {}, |    "metadata": {}, | ||||||
|    "outputs": [], |    "outputs": [], | ||||||
| @@ -14,7 +14,7 @@ | |||||||
|   }, |   }, | ||||||
|   { |   { | ||||||
|    "cell_type": "code", |    "cell_type": "code", | ||||||
|    "execution_count": 2, |    "execution_count": 74, | ||||||
|    "id": "5b54797e", |    "id": "5b54797e", | ||||||
|    "metadata": {}, |    "metadata": {}, | ||||||
|    "outputs": [], |    "outputs": [], | ||||||
| @@ -28,7 +28,7 @@ | |||||||
|   }, |   }, | ||||||
|   { |   { | ||||||
|    "cell_type": "code", |    "cell_type": "code", | ||||||
|    "execution_count": 3, |    "execution_count": 75, | ||||||
|    "id": "a38fdb8a", |    "id": "a38fdb8a", | ||||||
|    "metadata": {}, |    "metadata": {}, | ||||||
|    "outputs": [], |    "outputs": [], | ||||||
| @@ -38,14 +38,14 @@ | |||||||
|     "i = 0\n", |     "i = 0\n", | ||||||
|     "\n", |     "\n", | ||||||
|     "path_ids = set()\n", |     "path_ids = set()\n", | ||||||
|     "path_classes = set()\n", |     "path_classes = tree[\"classes\"]\n", | ||||||
|     "\n", |     "\n", | ||||||
|     "# for each path in the tree\n", |     "# for each path in the tree\n", | ||||||
|     "for path in paths:\n", |     "for path in paths:\n", | ||||||
|     "\t# assign a path id \n", |     "\t# assign a path id \n", | ||||||
|     "\tpath[\"id\"] = i\n", |     "\tpath[\"id\"] = i\n", | ||||||
|     "\tpath_ids.add(i)\n", |     "\tpath_ids.add(i)\n", | ||||||
|     "\tpath_classes.add(path[\"classification\"])\n", |     "\t#path_classes.add(path[\"classification\"])\n", | ||||||
|     "\ti += 1\t\n", |     "\ti += 1\t\n", | ||||||
|     "\t# for each condition\n", |     "\t# for each condition\n", | ||||||
|     "\tconditions = path[\"conditions\"]\n", |     "\tconditions = path[\"conditions\"]\n", | ||||||
| @@ -60,7 +60,7 @@ | |||||||
|   }, |   }, | ||||||
|   { |   { | ||||||
|    "cell_type": "code", |    "cell_type": "code", | ||||||
|    "execution_count": 4, |    "execution_count": 76, | ||||||
|    "id": "2fd4f738", |    "id": "2fd4f738", | ||||||
|    "metadata": {}, |    "metadata": {}, | ||||||
|    "outputs": [], |    "outputs": [], | ||||||
| @@ -83,7 +83,7 @@ | |||||||
|   }, |   }, | ||||||
|   { |   { | ||||||
|    "cell_type": "code", |    "cell_type": "code", | ||||||
|    "execution_count": 5, |    "execution_count": 77, | ||||||
|    "id": "98cde024", |    "id": "98cde024", | ||||||
|    "metadata": {}, |    "metadata": {}, | ||||||
|    "outputs": [], |    "outputs": [], | ||||||
| @@ -109,13 +109,13 @@ | |||||||
|     "\t\tvalue = condition[\"value\"]\n", |     "\t\tvalue = condition[\"value\"]\n", | ||||||
|     "\n", |     "\n", | ||||||
|     "\t\t# move the min/max for the corresponding feature in compressed\n", |     "\t\t# move the min/max for the corresponding feature in compressed\n", | ||||||
|     "\t\tif operation == \"<=\" and compressed[feature][\"min\"] is None:\n", |     "\t\tif operation == \"<=\" and compressed[feature][\"max\"] is None:\n", | ||||||
|     "\t\t\tcompressed[feature][\"max\"] = value\n", |     "\t\t\tcompressed[feature][\"max\"] = value\n", | ||||||
|     "\t\telif operation == \">\" and compressed[feature][\"max\"] is None:\n", |     "\t\telif operation == \">\" and compressed[feature][\"min\"] is None:\n", | ||||||
|     "\t\t\tcompressed[feature][\"min\"] = value\n", |     "\t\t\tcompressed[feature][\"min\"] = value\n", | ||||||
|     "\t\telif operation == \"<=\" and value < compressed[feature][\"min\"]:\n", |     "\t\telif operation == \"<=\" and value < compressed[feature][\"max\"]:\n", | ||||||
|     "\t\t\tcompressed[feature][\"max\"] = value\n", |     "\t\t\tcompressed[feature][\"max\"] = value\n", | ||||||
|     "\t\telif operation == \">\" and value > compressed[feature][\"max\"]:\n", |     "\t\telif operation == \">\" and value > compressed[feature][\"min\"]:\n", | ||||||
|     "\t\t\tcompressed[feature][\"min\"] = value\n", |     "\t\t\tcompressed[feature][\"min\"] = value\n", | ||||||
|     "\n", |     "\n", | ||||||
|     "\tpath[\"compressed\"] = compressed" |     "\tpath[\"compressed\"] = compressed" | ||||||
| @@ -123,7 +123,7 @@ | |||||||
|   }, |   }, | ||||||
|   { |   { | ||||||
|    "cell_type": "code", |    "cell_type": "code", | ||||||
|    "execution_count": 6, |    "execution_count": 78, | ||||||
|    "id": "b6fbadbf", |    "id": "b6fbadbf", | ||||||
|    "metadata": {}, |    "metadata": {}, | ||||||
|    "outputs": [], |    "outputs": [], | ||||||
| @@ -171,16 +171,19 @@ | |||||||
|     "\t\t# for each bucket which encompasses the condition's range, add this path's id to the sets \n", |     "\t\t# for each bucket which encompasses the condition's range, add this path's id to the sets \n", | ||||||
|     "\t\ti = 0\n", |     "\t\ti = 0\n", | ||||||
|     "\t\tfor bp in breakpoints[feature_name]:\n", |     "\t\tfor bp in breakpoints[feature_name]:\n", | ||||||
|     "\t\t\tin_range = is_in_range(bp, lower, upper)\n", |     "\t\t\tif is_in_range(bp, lower, upper):\n", | ||||||
|     "\t\t\tif in_range:\n", |  | ||||||
|     "\t\t\t\tbuckets_id[feature_name][i].add(ID)\n", |     "\t\t\t\tbuckets_id[feature_name][i].add(ID)\n", | ||||||
|     "\t\t\t\tbuckets_class[feature_name][i].add(Class)\n", |     "\t\t\t\tbuckets_class[feature_name][i].add(Class)\n", | ||||||
|     "\t\t\ti += 1" |     "\t\t\ti += 1\n", | ||||||
|  |     "\n", | ||||||
|  |     "\t\tif is_in_range(bp+1, lower, upper):\n", | ||||||
|  |     "\t\t\tbuckets_id[feature_name][i].add(ID)\n", | ||||||
|  |     "\t\t\tbuckets_class[feature_name][i].add(Class)" | ||||||
|    ] |    ] | ||||||
|   }, |   }, | ||||||
|   { |   { | ||||||
|    "cell_type": "code", |    "cell_type": "code", | ||||||
|    "execution_count": 7, |    "execution_count": 79, | ||||||
|    "id": "0a767971", |    "id": "0a767971", | ||||||
|    "metadata": {}, |    "metadata": {}, | ||||||
|    "outputs": [], |    "outputs": [], | ||||||
| @@ -198,28 +201,34 @@ | |||||||
|     "\tfor i in range(1, len(buckets_id[feature_name]) - 1):\n", |     "\tfor i in range(1, len(buckets_id[feature_name]) - 1):\n", | ||||||
|     "\t\tlower = breakpoints[feature_name][i-1]\n", |     "\t\tlower = breakpoints[feature_name][i-1]\n", | ||||||
|     "\t\tupper = breakpoints[feature_name][i]\n", |     "\t\tupper = breakpoints[feature_name][i]\n", | ||||||
|     "\t\tmembers = buckets_id[feature_name][i]\n", |     "\t\tpaths = buckets_id[feature_name][i]\n", | ||||||
|     "\t\tclasses = buckets_class[feature_name][i]\n", |     "\t\tclasses = buckets_class[feature_name][i]\n", | ||||||
|     "\t\t#print(f\"{feature_name} = [{lower}, {upper}]: {buckets[feature_name][i]}\")\n", |     "\t\t#print(f\"{feature_name} = [{lower}, {upper}]: {buckets[feature_name][i]}\")\n", | ||||||
|     "\t\tcompressed_layers[feature_name].append({\"min\": lower, \"max\": upper, \"paths\": paths, \"classes\": classes})\n", |     "\t\tcompressed_layers[feature_name].append({\"min\": lower, \"max\": upper, \"paths\": paths, \"classes\": classes})\n", | ||||||
|     "\tlower = breakpoints[feature_name][len(breakpoints[feature_name]) - 1]\n", |     "\tlower = breakpoints[feature_name][len(breakpoints[feature_name]) - 1]\n", | ||||||
|     "\tupper = None\n", |     "\tupper = None\n", | ||||||
|     "\tmembers = buckets_id[feature_name][len(buckets_id[feature_name]) - 1]\n", |     "\tpaths = buckets_id[feature_name][len(buckets_id[feature_name]) - 1]\n", | ||||||
|     "\tclasses = buckets_class[feature_name][len(buckets_class[feature_name]) - 1]\n", |     "\tclasses = buckets_class[feature_name][len(buckets_class[feature_name]) - 1]\n", | ||||||
|     "\t#print(f\"{feature_name} = [{lower}, {upper}]: {members}\")\n", |     "\t#print(f\"{feature_name} = [{lower}, {upper}]: {members}\")\n", | ||||||
|     "\tcompressed_layers[feature_name].append({\"min\": lower, \"max\": upper, \"paths\": paths, \"classes\": classes})\n", |     "\tcompressed_layers[feature_name].append({\"min\": lower, \"max\": upper, \"paths\": paths, \"classes\": classes})\n", | ||||||
|     "\t#print(\"=\"*40)\n", |     "\t#print(\"=\"*40)\n", | ||||||
|     "\n", |     "\n", | ||||||
|  |     "path_to_class = {}\n", | ||||||
|  |     "for i in range(len(tree[\"paths\"])):\n", | ||||||
|  |     "    path = tree[\"paths\"][i]\n", | ||||||
|  |     "    path_to_class[path[\"id\"]] = path[\"classification\"]\n", | ||||||
|  |     "\n", | ||||||
|     "compressed_tree = {\n", |     "compressed_tree = {\n", | ||||||
|     "\t\"paths\": path_ids,\n", |     "\t\"paths\": path_ids,\n", | ||||||
|     "\t\"classes\": path_classes,\n", |     "\t\"classes\": path_classes,\n", | ||||||
|     "\t\"layers\": compressed_layers,\n", |     "\t\"layers\": compressed_layers,\n", | ||||||
|  |     "    \"path_to_class\": path_to_class,\n", | ||||||
|     "}" |     "}" | ||||||
|    ] |    ] | ||||||
|   }, |   }, | ||||||
|   { |   { | ||||||
|    "cell_type": "code", |    "cell_type": "code", | ||||||
|    "execution_count": 8, |    "execution_count": 80, | ||||||
|    "id": "561b0bc1", |    "id": "561b0bc1", | ||||||
|    "metadata": {}, |    "metadata": {}, | ||||||
|    "outputs": [], |    "outputs": [], | ||||||
| @@ -238,7 +247,7 @@ | |||||||
|  ], |  ], | ||||||
|  "metadata": { |  "metadata": { | ||||||
|   "kernelspec": { |   "kernelspec": { | ||||||
|    "display_name": "switch", |    "display_name": "Python 3 (ipykernel)", | ||||||
|    "language": "python", |    "language": "python", | ||||||
|    "name": "python3" |    "name": "python3" | ||||||
|   }, |   }, | ||||||
| @@ -252,7 +261,7 @@ | |||||||
|    "name": "python", |    "name": "python", | ||||||
|    "nbconvert_exporter": "python", |    "nbconvert_exporter": "python", | ||||||
|    "pygments_lexer": "ipython3", |    "pygments_lexer": "ipython3", | ||||||
|    "version": "3.12.7" |    "version": "3.12.9" | ||||||
|   } |   } | ||||||
|  }, |  }, | ||||||
|  "nbformat": 4, |  "nbformat": 4, | ||||||
|   | |||||||
							
								
								
									
										412
									
								
								TreeToRMT.ipynb
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										412
									
								
								TreeToRMT.ipynb
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,412 @@ | |||||||
|  | { | ||||||
|  |  "cells": [ | ||||||
|  |   { | ||||||
|  |    "cell_type": "code", | ||||||
|  |    "execution_count": 1, | ||||||
|  |    "id": "58fc6db9", | ||||||
|  |    "metadata": {}, | ||||||
|  |    "outputs": [], | ||||||
|  |    "source": [ | ||||||
|  |     "import json\n", | ||||||
|  |     "import math" | ||||||
|  |    ] | ||||||
|  |   }, | ||||||
|  |   { | ||||||
|  |    "cell_type": "code", | ||||||
|  |    "execution_count": 2, | ||||||
|  |    "id": "e07be4b3", | ||||||
|  |    "metadata": {}, | ||||||
|  |    "outputs": [], | ||||||
|  |    "source": [ | ||||||
|  |     "f = open(\"compressed_tree.json\")\n", | ||||||
|  |     "tree = json.loads(f.read())\n", | ||||||
|  |     "layers = tree[\"layers\"]\n", | ||||||
|  |     "classes = tree[\"classes\"]\n", | ||||||
|  |     "f.close()" | ||||||
|  |    ] | ||||||
|  |   }, | ||||||
|  |   { | ||||||
|  |    "cell_type": "code", | ||||||
|  |    "execution_count": 3, | ||||||
|  |    "id": "1516ff91", | ||||||
|  |    "metadata": {}, | ||||||
|  |    "outputs": [], | ||||||
|  |    "source": [ | ||||||
|  |     "field_width = {\n", | ||||||
|  |     "\t\"src\": 16,\n", | ||||||
|  |     "\t\"dst\": 16,\n", | ||||||
|  |     "\t\"protocl\": 8,\n", | ||||||
|  |     "}" | ||||||
|  |    ] | ||||||
|  |   }, | ||||||
|  |   { | ||||||
|  |    "cell_type": "markdown", | ||||||
|  |    "id": "f9193827", | ||||||
|  |    "metadata": {}, | ||||||
|  |    "source": [ | ||||||
|  |     "# Worst Case RMT" | ||||||
|  |    ] | ||||||
|  |   }, | ||||||
|  |   { | ||||||
|  |    "cell_type": "code", | ||||||
|  |    "execution_count": 4, | ||||||
|  |    "id": "5e37cfc5", | ||||||
|  |    "metadata": {}, | ||||||
|  |    "outputs": [], | ||||||
|  |    "source": [ | ||||||
|  |     "def worst_case_rmt(tree):\n", | ||||||
|  |     "\trmt = []\n", | ||||||
|  |     "\tstep = 0\n", | ||||||
|  |     "\n", | ||||||
|  |     "\ttcam_bits = 0\n", | ||||||
|  |     "\tram_bits = 0\n", | ||||||
|  |     "\n", | ||||||
|  |     "\tfor layer in layers:\n", | ||||||
|  |     "\t\tnum_ranges = len(layers[layer])\n", | ||||||
|  |     "\t\t# assume that each range requires all of 2*k prefixes when performing prefix expansion\n", | ||||||
|  |     "\t\t# therefore there are 2*k * R for R ranges and width k\n", | ||||||
|  |     "\t\tnum_prefixes = 2 * field_width[layer] * num_ranges\n", | ||||||
|  |     "\t\tprefix_width = field_width[layer]\n", | ||||||
|  |     "\n", | ||||||
|  |     "\t\ttcam = {\n", | ||||||
|  |     "\t\t\t\"id\": f\"{layer}_range\",\n", | ||||||
|  |     "\t\t\t\"step\": step,\n", | ||||||
|  |     "\t\t\t\"match\": \"ternary\",\n", | ||||||
|  |     "\t\t\t\"entries\": num_prefixes,\n", | ||||||
|  |     "\t\t\t\"key_size\": prefix_width\n", | ||||||
|  |     "\t\t}\n", | ||||||
|  |     "\t\ttcam_bits += num_prefixes * prefix_width\n", | ||||||
|  |     "\n", | ||||||
|  |     "\t\t# assume basic pointer reuse for metadata storage\n", | ||||||
|  |     "\t\tram = {\n", | ||||||
|  |     "\t\t\t\"id\": f\"{layer}_meta\",\n", | ||||||
|  |     "\t\t\t\"step\": step,\n", | ||||||
|  |     "\t\t\t\"match\": \"exact\",\n", | ||||||
|  |     "\t\t\t\"method\": \"index\",\n", | ||||||
|  |     "\t\t\t\"key_size\": math.ceil(math.log2(num_ranges)),\n", | ||||||
|  |     "\t\t\t\"data_size\": len(classes)\n", | ||||||
|  |     "\t\t}\n", | ||||||
|  |     "\t\tram_bits += num_ranges * len(classes)\n", | ||||||
|  |     "\n", | ||||||
|  |     "\t\trmt.append(tcam)\n", | ||||||
|  |     "\t\trmt.append(ram)\n", | ||||||
|  |     "\n", | ||||||
|  |     "\t\tstep += 1\n", | ||||||
|  |     "\n", | ||||||
|  |     "\treturn rmt, tcam_bits, ram_bits\n", | ||||||
|  |     "\n", | ||||||
|  |     "x, tcam_bits, ram_bits = worst_case_rmt(tree)\n", | ||||||
|  |     "f = open(\"worst_case_rmt.json\", \"w+\")\n", | ||||||
|  |     "f.write(json.dumps(x, indent=4))\n", | ||||||
|  |     "f.close()" | ||||||
|  |    ] | ||||||
|  |   }, | ||||||
|  |   { | ||||||
|  |    "cell_type": "code", | ||||||
|  |    "execution_count": 5, | ||||||
|  |    "id": "0dc1d6d4", | ||||||
|  |    "metadata": {}, | ||||||
|  |    "outputs": [ | ||||||
|  |     { | ||||||
|  |      "name": "stdout", | ||||||
|  |      "output_type": "stream", | ||||||
|  |      "text": [ | ||||||
|  |       "TCAM mapping: \n", | ||||||
|  |       "[1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]\n", | ||||||
|  |       "SRAM mapping: \n", | ||||||
|  |       "[1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]\n", | ||||||
|  |       "id mapping: \n", | ||||||
|  |       "[['dst_range', 'dst_meta'], ['src_range', 'src_meta'], ['protocl_range', 'protocl_meta'], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], []]\n", | ||||||
|  |       "TCAM bits: 13184\n", | ||||||
|  |       "RAM bits:  504\n" | ||||||
|  |      ] | ||||||
|  |     } | ||||||
|  |    ], | ||||||
|  |    "source": [ | ||||||
|  |     "! command python3 ideal-rmt-simulator/sim.py naive_rmt.json\n", | ||||||
|  |     "print(f\"TCAM bits: {tcam_bits}\")\n", | ||||||
|  |     "print(f\"RAM bits:  {ram_bits}\")" | ||||||
|  |    ] | ||||||
|  |   }, | ||||||
|  |   { | ||||||
|  |    "cell_type": "markdown", | ||||||
|  |    "id": "2a628655", | ||||||
|  |    "metadata": {}, | ||||||
|  |    "source": [ | ||||||
|  |     "# Naive Range Expansion " | ||||||
|  |    ] | ||||||
|  |   }, | ||||||
|  |   { | ||||||
|  |    "cell_type": "code", | ||||||
|  |    "execution_count": 6, | ||||||
|  |    "id": "fb9febe9", | ||||||
|  |    "metadata": {}, | ||||||
|  |    "outputs": [], | ||||||
|  |    "source": [ | ||||||
|  |     "# shamelessly stolen from: https://github.com/autolyticus/range-to-prefix/blob/master/rangetoprefix.C\n", | ||||||
|  |     "\n", | ||||||
|  |     "def int_to_bin(i, width):\n", | ||||||
|  |     "\treturn bin(i)[2:].zfill(width)\n", | ||||||
|  |     "\n", | ||||||
|  |     "def increment_dc(pfx):\n", | ||||||
|  |     "\tidx = pfx.find(\"*\")\n", | ||||||
|  |     "\tif idx == -1:\n", | ||||||
|  |     "\t\tidx = len(pfx)\n", | ||||||
|  |     "\tidx = idx - 1\n", | ||||||
|  |     "\t#print(pfx, pfx[:idx])\n", | ||||||
|  |     "\treturn pfx[:idx] + \"*\" + pfx[idx+1:]\n", | ||||||
|  |     "\t\n", | ||||||
|  |     "def can_merge(pfx_a, pfx_b):\n", | ||||||
|  |     "\tpfx_a = pfx_a.replace(\"*\", \"\")\n", | ||||||
|  |     "\tpfx_b = pfx_b.replace(\"*\", \"\")\n", | ||||||
|  |     "\treturn pfx_a[:-1] == pfx_b[:-1] and pfx_a[-1] != pfx_b[-1]\n", | ||||||
|  |     "\n", | ||||||
|  |     "def merge(pfx_a, prefixes):\n", | ||||||
|  |     "\tpfx_a = increment_dc(pfx_a)\n", | ||||||
|  |     "\tprefixes[-1] = pfx_a\n", | ||||||
|  |     "\n", | ||||||
|  |     "\tfor i in range(len(prefixes) - 2, -1, -1):\n", | ||||||
|  |     "\t\tif can_merge(prefixes[i], prefixes[i+1]):\n", | ||||||
|  |     "\t\t\tprefixes.pop()\n", | ||||||
|  |     "\t\t\tpfx = increment_dc(prefixes[i])\n", | ||||||
|  |     "\t\t\tprefixes[i] = pfx\n", | ||||||
|  |     "\n", | ||||||
|  |     "def convert_range(lower, upper, width):\n", | ||||||
|  |     "\tprefixes = []\n", | ||||||
|  |     "\tprefix = int_to_bin(lower, width)\n", | ||||||
|  |     "\tprefixes.append(prefix)\n", | ||||||
|  |     "\tnorm_upper = min(upper, 2**width-1)\n", | ||||||
|  |     "\tfor i in range(lower+1, norm_upper+1):\n", | ||||||
|  |     "\t\tprefix = int_to_bin(i, width)\n", | ||||||
|  |     "\t\tif can_merge(prefix, prefixes[-1]):\n", | ||||||
|  |     "\t\t\tmerge(prefix, prefixes)\n", | ||||||
|  |     "\t\telse:\n", | ||||||
|  |     "\t\t\tprefixes.append(prefix)\n", | ||||||
|  |     "\treturn prefixes" | ||||||
|  |    ] | ||||||
|  |   }, | ||||||
|  |   { | ||||||
|  |    "cell_type": "code", | ||||||
|  |    "execution_count": 7, | ||||||
|  |    "id": "55167c28", | ||||||
|  |    "metadata": {}, | ||||||
|  |    "outputs": [], | ||||||
|  |    "source": [ | ||||||
|  |     "def naive_rmt(tree):\n", | ||||||
|  |     "\trmt = []\n", | ||||||
|  |     "\tstep = 0\n", | ||||||
|  |     "\n", | ||||||
|  |     "\ttcam_bits = 0\n", | ||||||
|  |     "\tram_bits = 0\n", | ||||||
|  |     "\n", | ||||||
|  |     "\tfor layer in layers:\n", | ||||||
|  |     "\t\tnum_prefixes = 0\n", | ||||||
|  |     "\t\tprefix_width = field_width[layer]\n", | ||||||
|  |     "\t\t# for each range in the layer, convert the ranges to prefixes using naive range expansion\n", | ||||||
|  |     "\t\tfor r in layers[layer]:\n", | ||||||
|  |     "\t\t\tif r[\"min\"] == None:\n", | ||||||
|  |     "\t\t\t\tr[\"min\"] = 0\n", | ||||||
|  |     "\t\t\telif r[\"max\"] == None:\n", | ||||||
|  |     "\t\t\t\tr[\"max\"] = 2 ** prefix_width\n", | ||||||
|  |     "\t\t\tprefixes = convert_range(r[\"min\"], r[\"max\"], prefix_width)\n", | ||||||
|  |     "\t\t\tr[\"prefixes\"] = prefixes\n", | ||||||
|  |     "\t\t\tnum_prefixes += len(prefixes)\n", | ||||||
|  |     "\t\t\ttcam_bits += len(prefixes) * prefix_width\n", | ||||||
|  |     "\n", | ||||||
|  |     "\t\ttcam = {\n", | ||||||
|  |     "\t\t\t\"id\": f\"{layer}_range\",\n", | ||||||
|  |     "\t\t\t\"step\": step,\n", | ||||||
|  |     "\t\t\t\"match\": \"ternary\",\n", | ||||||
|  |     "\t\t\t\"entries\": num_prefixes,\n", | ||||||
|  |     "\t\t\t\"key_size\": prefix_width,\n", | ||||||
|  |     "\t\t\t\"ranges\": layers[layer]\n", | ||||||
|  |     "\t\t}\n", | ||||||
|  |     "\n", | ||||||
|  |     "\t\tnum_ranges = len(layers[layer])\n", | ||||||
|  |     "\t\t# assume no pointer reuse for metadata storage\n", | ||||||
|  |     "\t\tram = {\n", | ||||||
|  |     "\t\t\t\"id\": f\"{layer}_meta\",\n", | ||||||
|  |     "\t\t\t\"step\": step,\n", | ||||||
|  |     "\t\t\t\"match\": \"exact\",\n", | ||||||
|  |     "\t\t\t\"method\": \"index\",\n", | ||||||
|  |     "\t\t\t\"key_size\": math.ceil(math.log2(num_ranges)),\n", | ||||||
|  |     "\t\t\t\"data_size\": len(classes)\n", | ||||||
|  |     "\t\t}\n", | ||||||
|  |     "\t\tram_bits += num_ranges * len(classes)\n", | ||||||
|  |     "\n", | ||||||
|  |     "\t\trmt.append(tcam)\n", | ||||||
|  |     "\t\trmt.append(ram)\n", | ||||||
|  |     "\n", | ||||||
|  |     "\t\tstep += 1\n", | ||||||
|  |     "\n", | ||||||
|  |     "\treturn rmt, tcam_bits, ram_bits\n", | ||||||
|  |     "\n", | ||||||
|  |     "x, tcam_bits, ram_bits = naive_rmt(tree)\n", | ||||||
|  |     "f = open(\"naive_rmt.json\", \"w+\")\n", | ||||||
|  |     "f.write(json.dumps(x, indent=4))\n", | ||||||
|  |     "f.close()\n" | ||||||
|  |    ] | ||||||
|  |   }, | ||||||
|  |   { | ||||||
|  |    "cell_type": "code", | ||||||
|  |    "execution_count": 8, | ||||||
|  |    "id": "48011528", | ||||||
|  |    "metadata": {}, | ||||||
|  |    "outputs": [ | ||||||
|  |     { | ||||||
|  |      "name": "stdout", | ||||||
|  |      "output_type": "stream", | ||||||
|  |      "text": [ | ||||||
|  |       "TCAM mapping: \n", | ||||||
|  |       "[1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]\n", | ||||||
|  |       "SRAM mapping: \n", | ||||||
|  |       "[1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]\n", | ||||||
|  |       "id mapping: \n", | ||||||
|  |       "[['dst_range', 'dst_meta'], ['src_range', 'src_meta'], ['protocl_range', 'protocl_meta'], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], []]\n", | ||||||
|  |       "TCAM bits: 3320\n", | ||||||
|  |       "RAM bits:  504\n" | ||||||
|  |      ] | ||||||
|  |     } | ||||||
|  |    ], | ||||||
|  |    "source": [ | ||||||
|  |     "! command python3 ideal-rmt-simulator/sim.py naive_rmt.json\n", | ||||||
|  |     "print(f\"TCAM bits: {tcam_bits}\")\n", | ||||||
|  |     "print(f\"RAM bits:  {ram_bits}\")" | ||||||
|  |    ] | ||||||
|  |   }, | ||||||
|  |   { | ||||||
|  |    "cell_type": "markdown", | ||||||
|  |    "id": "2504b1ba", | ||||||
|  |    "metadata": {}, | ||||||
|  |    "source": [ | ||||||
|  |     "# Priority Aware Prefix Expansion" | ||||||
|  |    ] | ||||||
|  |   }, | ||||||
|  |   { | ||||||
|  |    "cell_type": "code", | ||||||
|  |    "execution_count": 9, | ||||||
|  |    "id": "64b7271e", | ||||||
|  |    "metadata": {}, | ||||||
|  |    "outputs": [], | ||||||
|  |    "source": [ | ||||||
|  |     "# for this technique, we note that given disjoint ranges [0,a],[a,b],[b,c] ...\n", | ||||||
|  |     "# then if using a TCAM that selects the first matching prefix, then [0,a],[0,b],[0,c] would be equivalent\n", | ||||||
|  |     "# this is because for any k<a, even though the prefixes for [0,b] would also match, as long as the prefixes for [0,a] are placed before those for [0,b] the correct range will still be selected\n", | ||||||
|  |     "\n", | ||||||
|  |     "def priority_aware(tree):\n", | ||||||
|  |     "\trmt = []\n", | ||||||
|  |     "\tstep = 0\n", | ||||||
|  |     "\n", | ||||||
|  |     "\ttcam_bits = 0\n", | ||||||
|  |     "\tram_bits = 0\n", | ||||||
|  |     "\n", | ||||||
|  |     "\tfor layer in layers:\n", | ||||||
|  |     "\t\tnum_prefixes = 0\n", | ||||||
|  |     "\t\tprefix_width = field_width[layer]\n", | ||||||
|  |     "\t\t# for each range, run the regular prefix expansion, and also the prefix expansion setting the minimum to 0\n", | ||||||
|  |     "\t\t# then check which set of prefixes would be better\n", | ||||||
|  |     "\t\t# we will assume the ranges are already disjoint and in the correct order\n", | ||||||
|  |     "\t\tfor r in layers[layer]:\n", | ||||||
|  |     "\t\t\tif r[\"min\"] == None:\n", | ||||||
|  |     "\t\t\t\tr[\"min\"] = 0\n", | ||||||
|  |     "\t\t\telif r[\"max\"] == None:\n", | ||||||
|  |     "\t\t\t\tr[\"max\"] = 2 ** prefix_width\n", | ||||||
|  |     "\t\t\tregular_prefixes = convert_range(r[\"min\"], r[\"max\"], prefix_width)\n", | ||||||
|  |     "\t\t\tzero_start_prefixes = convert_range(0, r[\"max\"], prefix_width)\n", | ||||||
|  |     "\n", | ||||||
|  |     "\t\t\tif len(regular_prefixes) <= len(zero_start_prefixes):\n", | ||||||
|  |     "\t\t\t\tpfx_type = \"exact\"\n", | ||||||
|  |     "\t\t\t\tprefixes = regular_prefixes\n", | ||||||
|  |     "\t\t\telse:\n", | ||||||
|  |     "\t\t\t\tpfx_type = \"zero\"\n", | ||||||
|  |     "\t\t\t\tprefixes = zero_start_prefixes\n", | ||||||
|  |     "\n", | ||||||
|  |     "\t\t\tr[\"prefixes\"] = prefixes\n", | ||||||
|  |     "\t\t\tr[\"prefix_type\"] = pfx_type\n", | ||||||
|  |     "\t\t\tnum_prefixes += len(prefixes)\n", | ||||||
|  |     "\t\t\ttcam_bits += len(prefixes) * prefix_width\n", | ||||||
|  |     "\n", | ||||||
|  |     "\t\ttcam = {\n", | ||||||
|  |     "\t\t\t\"id\": f\"{layer}_range\",\n", | ||||||
|  |     "\t\t\t\"step\": step,\n", | ||||||
|  |     "\t\t\t\"match\": \"ternary\",\n", | ||||||
|  |     "\t\t\t\"entries\": num_prefixes,\n", | ||||||
|  |     "\t\t\t\"key_size\": prefix_width,\n", | ||||||
|  |     "\t\t\t\"ranges\": layers[layer]\n", | ||||||
|  |     "\t\t}\n", | ||||||
|  |     "\n", | ||||||
|  |     "\t\tnum_ranges = len(layers[layer])\n", | ||||||
|  |     "\t\t# assume no pointer reuse for metadata storage\n", | ||||||
|  |     "\t\tram = {\n", | ||||||
|  |     "\t\t\t\"id\": f\"{layer}_meta\",\n", | ||||||
|  |     "\t\t\t\"step\": step,\n", | ||||||
|  |     "\t\t\t\"match\": \"exact\",\n", | ||||||
|  |     "\t\t\t\"method\": \"index\",\n", | ||||||
|  |     "\t\t\t\"key_size\": math.ceil(math.log2(num_ranges)),\n", | ||||||
|  |     "\t\t\t\"data_size\": len(classes)\n", | ||||||
|  |     "\t\t}\n", | ||||||
|  |     "\t\tram_bits += num_ranges * len(classes)\n", | ||||||
|  |     "\n", | ||||||
|  |     "\t\trmt.append(tcam)\n", | ||||||
|  |     "\t\trmt.append(ram)\n", | ||||||
|  |     "\n", | ||||||
|  |     "\t\tstep += 1\n", | ||||||
|  |     "\n", | ||||||
|  |     "\treturn rmt, tcam_bits, ram_bits\n", | ||||||
|  |     "\n", | ||||||
|  |     "x, tcam_bits, ram_bits = priority_aware(tree)\n", | ||||||
|  |     "f = open(\"priority_aware.json\", \"w+\")\n", | ||||||
|  |     "f.write(json.dumps(x, indent=4))\n", | ||||||
|  |     "f.close()" | ||||||
|  |    ] | ||||||
|  |   }, | ||||||
|  |   { | ||||||
|  |    "cell_type": "code", | ||||||
|  |    "execution_count": 10, | ||||||
|  |    "id": "cd706e41", | ||||||
|  |    "metadata": {}, | ||||||
|  |    "outputs": [ | ||||||
|  |     { | ||||||
|  |      "name": "stdout", | ||||||
|  |      "output_type": "stream", | ||||||
|  |      "text": [ | ||||||
|  |       "TCAM mapping: \n", | ||||||
|  |       "[1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]\n", | ||||||
|  |       "SRAM mapping: \n", | ||||||
|  |       "[1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]\n", | ||||||
|  |       "id mapping: \n", | ||||||
|  |       "[['dst_range', 'dst_meta'], ['src_range', 'src_meta'], ['protocl_range', 'protocl_meta'], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], []]\n", | ||||||
|  |       "TCAM bits: 2152\n", | ||||||
|  |       "RAM bits:  504\n" | ||||||
|  |      ] | ||||||
|  |     } | ||||||
|  |    ], | ||||||
|  |    "source": [ | ||||||
|  |     "! command python3 ideal-rmt-simulator/sim.py priority_aware.json\n", | ||||||
|  |     "print(f\"TCAM bits: {tcam_bits}\")\n", | ||||||
|  |     "print(f\"RAM bits:  {ram_bits}\")" | ||||||
|  |    ] | ||||||
|  |   } | ||||||
|  |  ], | ||||||
|  |  "metadata": { | ||||||
|  |   "kernelspec": { | ||||||
|  |    "display_name": "Python 3 (ipykernel)", | ||||||
|  |    "language": "python", | ||||||
|  |    "name": "python3" | ||||||
|  |   }, | ||||||
|  |   "language_info": { | ||||||
|  |    "codemirror_mode": { | ||||||
|  |     "name": "ipython", | ||||||
|  |     "version": 3 | ||||||
|  |    }, | ||||||
|  |    "file_extension": ".py", | ||||||
|  |    "mimetype": "text/x-python", | ||||||
|  |    "name": "python", | ||||||
|  |    "nbconvert_exporter": "python", | ||||||
|  |    "pygments_lexer": "ipython3", | ||||||
|  |    "version": "3.12.7" | ||||||
|  |   } | ||||||
|  |  }, | ||||||
|  |  "nbformat": 4, | ||||||
|  |  "nbformat_minor": 5 | ||||||
|  | } | ||||||
							
								
								
									
										74
									
								
								combine.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										74
									
								
								combine.py
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,74 @@ | |||||||
|  | #!/usr/bin/env python3 | ||||||
|  | """combine.py | ||||||
|  |  | ||||||
|  | Concatenate every CSV that matches the pattern | ||||||
|  |     data/processed/<name>/<name>.csv | ||||||
|  | into a single file: | ||||||
|  |     data/combined/data.csv | ||||||
|  |  | ||||||
|  | The script streams each source CSV in 1-million-row chunks so memory stays low. | ||||||
|  | Typos in the historic column names (protocl/classfication) are fixed on‑the‑fly. | ||||||
|  |  | ||||||
|  | Usage | ||||||
|  | ----- | ||||||
|  | python combine.py | ||||||
|  |  | ||||||
|  | You can optionally supply a different root directory and output path: | ||||||
|  | python combine.py --root other/processed_dir --out other/combined/data.csv | ||||||
|  | """ | ||||||
|  | from __future__ import annotations | ||||||
|  |  | ||||||
|  | import argparse | ||||||
|  | from pathlib import Path | ||||||
|  | import os | ||||||
|  | import pandas as pd | ||||||
|  |  | ||||||
|  | CHUNK = 1_000_000  # rows per read_csv chunk | ||||||
|  |  | ||||||
|  |  | ||||||
def fix_cols(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of *df* with the misspelled legacy headers corrected.

    ``protocl`` becomes ``protocol`` and ``classfication`` becomes
    ``classification``; any other column is left as it is.
    """
    legacy_to_canonical = {
        "protocl": "protocol",
        "classfication": "classification",
    }
    return df.rename(columns=legacy_to_canonical)
|  |  | ||||||
|  |  | ||||||
def find_source_csvs(proc_root: Path):
    """Yield CSV files that exactly match ``<proc_root>/<name>/<name>.csv``.

    Sub-directories of *proc_root* are visited in sorted order, so the
    sequence of yielded paths is deterministic. Plain files directly under
    *proc_root* and CSVs whose stem differs from their folder name are ignored.

    Raises:
        FileNotFoundError: if *proc_root* does not exist (from ``iterdir``).
    """
    for sub in sorted(proc_root.iterdir()):
        if not sub.is_dir():
            continue
        target = sub / f"{sub.name}.csv"
        # is_file() rather than exists(): a directory that happens to be
        # called <name>.csv is not a readable CSV and must not be yielded.
        if target.is_file():
            yield target
|  |  | ||||||
|  |  | ||||||
def combine(proc_root: Path, out_path: Path):
    """Concatenate every source CSV under *proc_root* into *out_path*.

    Each source file is read in chunks of ``CHUNK`` rows, its legacy column
    names are fixed with ``fix_cols``, and the chunk is written straight to
    *out_path*. The very first chunk truncates the output and writes the
    header; every later chunk is appended without a header.

    Args:
        proc_root: directory searched by ``find_source_csvs``.
        out_path: destination CSV; its parent directory is created if needed.
    """
    out_path.parent.mkdir(parents=True, exist_ok=True)

    first_write = True
    for csv_path in find_source_csvs(proc_root):
        print(f"→ adding {csv_path.relative_to(proc_root.parent)}")
        for chunk in pd.read_csv(csv_path, chunksize=CHUNK):
            fix_cols(chunk).to_csv(
                out_path,
                mode="w" if first_write else "a",
                header=first_write,
                index=False,
            )
            first_write = False

    if first_write:
        # Nothing was ever written, so do not claim success: out_path is
        # either missing or still holds whatever an earlier run left there.
        print(f"✗ no CSV data found under {proc_root}; {out_path} was not written")
    else:
        print(f"✓ combined CSV written to {out_path}")
|  |  | ||||||
|  |  | ||||||
def main():
    """Parse ``--root`` / ``--out`` from the command line and run ``combine``."""
    parser = argparse.ArgumentParser(description="Combine processed CSVs into one.")
    parser.add_argument("--root", default="data/processed", help="processed dir root")
    parser.add_argument("--out", default="data/combined/data.csv", help="output CSV")
    options = parser.parse_args()

    # Expand a leading "~" in either path before handing them over.
    source_root = Path(options.root).expanduser()
    destination = Path(options.out).expanduser()
    combine(source_root, destination)


if __name__ == "__main__":
    main()
							
								
								
									
										80
									
								
								extract_all_datasets.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										80
									
								
								extract_all_datasets.py
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,80 @@ | |||||||
|  | #!/usr/bin/env python3 | ||||||
|  | from pathlib import Path | ||||||
|  | import numpy as np | ||||||
|  | import pandas as pd | ||||||
|  | from labels import mac_to_label | ||||||
|  | from tqdm import tqdm | ||||||
|  | import os | ||||||
|  |  | ||||||
# Directory that contains this script; the data paths below hang off it.
ROOT       = Path(__file__).resolve().parent
# Input tree that main() searches recursively for *.pcap files.
PCAP_DIR   = ROOT / "data" / "pcap"
# Output tree; main() mirrors each pcap's relative path here with a .csv suffix.
CSV_DIR    = ROOT / "data" / "processed"
# Runs at import time, so merely importing this module creates the directory.
CSV_DIR.mkdir(parents=True, exist_ok=True)

# NOTE(review): BATCH is not referenced anywhere else in this script.
BATCH = 100_000   # packets per chunk
|  |  | ||||||
|  | from scapy.all import rdpcap | ||||||
|  |  | ||||||
|  |  | ||||||
def process_pcap(pcap_path: str, csv_path: str) -> None:
    """Extract per-packet features from a pcap file and write them to a CSV.

    For every packet the row ``protocol, source port, destination port,
    label`` is recorded. A packet without a ``proto`` attribute gets
    protocol 0; a packet missing ``sport`` or ``dport`` gets 0 for both
    ports. The label is looked up in ``mac_to_label`` by the Ethernet
    destination address and falls back to ``"other"``.

    The CSV keeps the historic (misspelled) column names ``protocl`` and
    ``classfication``. If *csv_path* already exists the rows are appended
    without a header; otherwise the file is created with one.

    Args:
        pcap_path: capture file to read; it is loaded whole by ``rdpcap``.
        csv_path: destination CSV file.
    """
    all_packets = rdpcap(pcap_path)

    print("rdpcap done", flush=True)
    columns = ['protocl', 'src', 'dst', 'classfication']
    rows = []
    for packet in tqdm(all_packets):
        try:
            proto = packet.proto
        except AttributeError:
            proto = 0
        try:
            sport = packet.sport
            dport = packet.dport
        except AttributeError:
            sport = 0
            dport = 0

        classification = "other"
        if "Ether" in packet:
            eth_dst = packet["Ether"].dst
            if eth_dst in mac_to_label:
                classification = mac_to_label[eth_dst]

        rows.append([int(proto), int(sport), int(dport), classification])

    # Build the frame straight from the rows. Going through a transposed
    # numpy array raised IndexError on an empty capture; an empty list here
    # simply yields an empty frame with the right columns.
    dataframe = pd.DataFrame(rows, columns=columns)

    # save the dataframe to the csv file; create it (with header) if missing
    appending = os.path.exists(csv_path)
    dataframe.to_csv(
        csv_path,
        index=False,
        sep=',',
        mode='a' if appending else 'w',
        columns=columns,
        header=not appending,
    )

    print("Done")
|  |  | ||||||
|  |  | ||||||
|  |  | ||||||
def main() -> None:
    """Convert every ``*.pcap`` below ``PCAP_DIR`` into a CSV below ``CSV_DIR``.

    The CSV keeps the pcap's relative path with the suffix swapped to
    ``.csv``. Captures whose CSV already exists are skipped.
    """
    pcap_files = sorted(PCAP_DIR.rglob("*.pcap"))
    for pcap_file in pcap_files:
        relative_csv = pcap_file.relative_to(PCAP_DIR).with_suffix(".csv")
        destination = CSV_DIR / relative_csv
        if not destination.exists():
            print(f"Processing {relative_csv}")
            destination.parent.mkdir(parents=True, exist_ok=True)
            process_pcap(str(pcap_file), str(destination))
        else:
            print(f"Skip {relative_csv} (CSV exists)")


if __name__ == "__main__":
    main()
							
								
								
									
										50
									
								
								extract_tars.sh
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										50
									
								
								extract_tars.sh
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,50 @@ | |||||||
|  | #!/usr/bin/env bash | ||||||
|  | # Usage: extract_tars.sh SOURCE_DIR TARGET_DIR | ||||||
|  | # For every .tar, .tar.gz, .tgz, .tar.bz2, .tar.xz in SOURCE_DIR: | ||||||
|  | #   1. Create TARGET_DIR/<name>/ | ||||||
|  | #   2. If TARGET_DIR/<name>/<name>.pcap already exists, skip the archive. | ||||||
|  | #   3. Otherwise, extract the archive into its own folder. | ||||||
|  |  | ||||||
set -euo pipefail

if [[ $# -ne 2 ]]; then
  echo "Usage: $0 SOURCE_DIR TARGET_DIR" >&2
  exit 1
fi

src_dir="$1"
dst_dir="$2"

# With nullglob a missing SOURCE_DIR would turn the loop below into a silent
# no-op followed by a success message, so reject it up front.
if [[ ! -d "$src_dir" ]]; then
  echo "Error: SOURCE_DIR '$src_dir' is not a directory" >&2
  exit 1
fi

mkdir -p "$dst_dir"

# Strip common archive extensions to recover the base name.
strip_ext() {
  local n="$1"
  n=${n%.tar.gz}; n=${n%.tgz}; n=${n%.tar.bz2}; n=${n%.tar.xz}; n=${n%.tar}
  # printf rather than echo: echo would treat a name such as "-n" as an option.
  printf '%s\n' "$n"
}

# nullglob: patterns that match nothing expand to nothing instead of themselves.
shopt -s nullglob
for archive in "$src_dir"/*.tar{,.gz,.bz2,.xz} "$src_dir"/*.tgz; do
  base=$(basename "$archive")
  name=$(strip_ext "$base")
  out_dir="$dst_dir/$name"
  # Presence of <name>/<name>.pcap marks the archive as already extracted.
  key_file="$out_dir/$name.pcap"

  if [[ -f "$key_file" ]]; then
    echo "Skipping $archive  —  $key_file already present"
    continue
  fi

  echo "Extracting $archive into $out_dir"
  mkdir -p "$out_dir"

  case "$archive" in
    *.tar)          tar -xf "$archive" -C "$out_dir" ;;
    *.tar.gz|*.tgz) tar -xzf "$archive" -C "$out_dir" ;;
    *.tar.bz2)      tar -xjf "$archive" -C "$out_dir" ;;
    *.tar.xz)       tar -xJf "$archive" -C "$out_dir" ;;
    *)              echo "Unknown type: $archive" >&2 ;;
  esac
done

echo "All archives processed."
| @@ -4,3 +4,4 @@ pandas | |||||||
| scikit-learn | scikit-learn | ||||||
| pydotplus | pydotplus | ||||||
| matplotlib | matplotlib | ||||||
|  | scipy | ||||||
							
								
								
									
										168
									
								
								run/decision_tree.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										168
									
								
								run/decision_tree.py
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,168 @@ | |||||||
|  | #!/usr/bin/env python3 | ||||||
|  | """ | ||||||
|  | Train a decision tree, optionally “nudge” its split thresholds, and | ||||||
|  | export the result as JSON. | ||||||
|  |  | ||||||
|  | Usage examples | ||||||
|  | -------------- | ||||||
|  | # plain training, no nudging | ||||||
|  | python decision_tree.py --input data/combined/data.csv --output tree.json | ||||||
|  |  | ||||||
|  | # nudge every internal threshold, rounding away the bottom 2 bits | ||||||
|  | python decision_tree.py --input data/combined/data.csv --output tree.json \ | ||||||
|  |                         --nudge --bits 2 | ||||||
|  | """ | ||||||
|  | import argparse | ||||||
|  | import copy | ||||||
|  | import json | ||||||
|  | import math | ||||||
|  | from pathlib import Path | ||||||
|  |  | ||||||
|  | import numpy as np | ||||||
|  | import pandas as pd | ||||||
|  | from sklearn.metrics import accuracy_score | ||||||
|  | from sklearn.tree import DecisionTreeClassifier, _tree | ||||||
|  |  | ||||||
|  | # ---------------------------------------------------------------------- | ||||||
|  | # 1. command-line arguments | ||||||
|  | # ---------------------------------------------------------------------- | ||||||
# NOTE: the arguments are parsed at module level, so importing this file
# also parses sys.argv.
parser = argparse.ArgumentParser()
parser.add_argument("--input",  "-i", help="CSV file with protocol,src,dst,label", default="../data/combined/data.csv")
parser.add_argument("--output", "-o", help="Path for the exported JSON tree", default="tree.json")
parser.add_argument("--depth",  "-d", type=int, default=5,
                    help="Max depth of the decision tree (default: 5)")
parser.add_argument("--nudge",  action="store_true",
                    help="Enable threshold nudging")
# --bits counts the low-order bits that nudging rounds away; the old help
# text described it as the number of bits kept, which is the opposite.
parser.add_argument("--bits",   type=int, default=2,
                    help="Number of low-order bits to round away when nudging (default: 2)")
args = parser.parse_args()

# A negative count would otherwise surface later as an opaque
# "negative shift count" error inside the nudging helper.
if args.nudge and args.bits < 0:
    parser.error("--bits must be non-negative")
|  |  | ||||||
|  | # ---------------------------------------------------------------------- | ||||||
|  | # 2. helper functions | ||||||
|  | # ---------------------------------------------------------------------- | ||||||
def nudge_threshold_max_n_bits(threshold: float, n_bits: int) -> int:
    """Round a split threshold to the nearest multiple of ``2 ** n_bits``.

    The threshold is first floored to an integer. Its ``n_bits`` low-order
    bits are then cleared, and if the highest of those cleared bits was set
    the result is bumped up by ``2 ** n_bits`` (ties round up).

    Args:
        threshold: split threshold to nudge.
        n_bits: number of low-order bits to round away; 0 only floors.

    Returns:
        The nudged integer threshold.
    """
    threshold = math.floor(threshold)
    if n_bits == 0:
        return threshold

    low_bits = (1 << n_bits) - 1
    # ~low_bits clears the low bits at any magnitude. The previous fixed
    # 32-bit mask also discarded every bit above 2**32 and turned negative
    # values into large positive ones.
    nudged_value = threshold & ~low_bits
    if threshold & (1 << (n_bits - 1)):
        nudged_value += 1 << n_bits

    return nudged_value
|  |  | ||||||
def apply_nudging(tree: _tree.Tree, node_idx: int, n_bits: int) -> None:
    """Nudge the threshold of every internal node below and including ``node_idx``.

    Children are processed before their parent. A node counts as internal
    when at least one of its child indices is not -1; such a node has its
    threshold replaced, in place, by ``nudge_threshold_max_n_bits`` of the
    old value. Nodes with no children are left untouched.
    """
    left_child = tree.children_left[node_idx]
    right_child = tree.children_right[node_idx]

    # Descend first so the subtree is finished before this node is touched.
    for child in (left_child, right_child):
        if child != -1:
            apply_nudging(tree, child, n_bits)

    if left_child == -1 and right_child == -1:
        return  # leaf: nothing to nudge

    old_threshold = tree.threshold[node_idx]
    tree.threshold[node_idx] = nudge_threshold_max_n_bits(old_threshold, n_bits)
|  |  | ||||||
|  | # output the tree | ||||||
def get_lineage(tree, feature_names):
    """Flatten a fitted tree into a dict of root-to-leaf decision paths.

    Args:
        tree: fitted classifier exposing ``classes_`` and ``tree_`` (with
            ``threshold``, ``feature``, ``children_left``, ``children_right``
            and ``value`` arrays).
        feature_names: sequence mapping a feature index to its name.

    Returns:
        A dict with three keys:
          * ``"classes"``  - ``list(tree.classes_)``.
          * ``"paths"``    - one entry per leaf, holding its ``"id"``, the
            list of ``"conditions"`` (feature, ``<=`` or ``>``, threshold)
            from the root down to the leaf, and ``"classification"``, the
            index of the largest entry in the leaf's value row.
          * ``"features"`` - feature name -> list of thresholds of all nodes
            splitting on that feature.
    """
    data = {"features": {}, "paths": [], "classes": list(tree.classes_)}

    thresholds = tree.tree_.threshold
    # One name per node; nodes whose feature is TREE_UNDEFINED also get an
    # entry here and are filtered out when "features" is filled in below.
    features   = [feature_names[i] for i in tree.tree_.feature]
    left       = tree.tree_.children_left
    right      = tree.tree_.children_right
    value      = tree.tree_.value

    # -------- helper to climb up from a leaf to the root -----------
    def recurse(left, right, child, lineage=None):
        # Returns the path as a list ordered root -> leaf: one
        # (parent index, "l"/"r", threshold, feature name) tuple per
        # ancestor, followed by the bare leaf index.
        if lineage is None:
            lineage = [child]          # leaf marker (an int)
        if child in left:
            parent = np.where(left == child)[0].item()
            split  = "l"
        elif child in right:
            parent = np.where(right == child)[0].item()
            split  = "r"
        else:                          # child has no parent, i.e. it is the root
            return lineage

        lineage.append((parent, split, thresholds[parent], features[parent]))
        if parent == 0:
            # Reached the root; entries were collected leaf-first, so flip them.
            return list(reversed(lineage))
        return recurse(left, right, parent, lineage)

    leaf_ids = np.where(left == -1)[0]             # indices of all leaves
    for path_id, leaf in enumerate(leaf_ids):
        clause = []

        for node in recurse(left, right, leaf):
            if not isinstance(node, tuple):        # skip the leaf marker
                continue

            direction, threshold, feature = node[1], node[2], node[3]
            # Going to the left child means "feature <= threshold",
            # going to the right child means "feature > threshold".
            if direction == "l":
                clause.append(
                    {"feature": feature, "operation": "<=", "value": threshold}
                )
            else:
                clause.append(
                    {"feature": feature, "operation": ">",  "value": threshold}
                )

        # Index into the leaf's value row, not the class label itself.
        class_idx = int(np.argmax(value[leaf][0]))  # use the leaf itself
        data["paths"].append(
            {"conditions": clause, "classification": class_idx, "id": path_id}
        )

    # collect all thresholds per feature
    for i, feat in enumerate(features):
        if tree.tree_.feature[i] != _tree.TREE_UNDEFINED:
            data["features"].setdefault(feat, []).append(thresholds[i])

    return data
|  |  | ||||||
|  |  | ||||||
class SetEncoder(json.JSONEncoder):
    """JSON encoder that serialises ``set`` objects as lists."""

    def default(self, obj):
        """Return ``list(obj)`` for a set; defer to the base class otherwise."""
        if not isinstance(obj, set):
            return super().default(obj)
        return list(obj)
|  |  | ||||||
|  | # ---------------------------------------------------------------------- | ||||||
|  | # 3. load data | ||||||
|  | # ---------------------------------------------------------------------- | ||||||
# The first three columns are used as features and the fourth as the label.
# NOTE(review): presumably protocol, src, dst, classification - confirm
# against the CSV that is actually passed in.
df = pd.read_csv(args.input)
X = df.iloc[:, :3].to_numpy()
Y = df.iloc[:, 3].to_numpy()

print(f"dataset size: {len(X)}")

# ----------------------------------------------------------------------
# 4. train the tree
# ----------------------------------------------------------------------
dt = DecisionTreeClassifier(max_depth=args.depth)
dt.fit(X, Y)
# Accuracy is measured on the same rows the tree was fitted on.
print("train accuracy (before nudging):",
      accuracy_score(Y, dt.predict(X)))

if args.nudge:
    # Nudge a deep copy of the fitted tree, then swap it into the classifier
    # so the accuracy below and the export both use the nudged thresholds.
    nudged_tree = copy.deepcopy(dt.tree_)
    apply_nudging(nudged_tree, 0, args.bits)
    dt.tree_ = nudged_tree
    print(f"nudging enabled, removed bottom {args.bits} bit(s) per threshold")

    print("train accuracy (after  nudging):",
        accuracy_score(Y, dt.predict(X)))

# ----------------------------------------------------------------------
# 5. export
# ----------------------------------------------------------------------
# Feature names come from the headers of the three feature columns.
lineage = get_lineage(dt, df.columns[:3])

output_path = Path(args.output)
output_path.write_text(json.dumps(lineage, indent=4, cls=SetEncoder))
print(f"Wrote tree to {output_path.resolve()}")
							
								
								
									
										7
									
								
								run/print.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										7
									
								
								run/print.py
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,7 @@ | |||||||
|  | import json | ||||||
|  | from pathlib import Path | ||||||
|  |  | ||||||
# For every compressed-tree JSON file, print one more than the largest
# element of its "paths" entry.
for tree_path in Path("results/compressed_tree/").glob("*.json"):
    tree = json.loads(tree_path.read_text())
    largest = max(tree["paths"])
    print(largest + 1)
							
								
								
									
										36
									
								
								run/rmt.bat
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										36
									
								
								run/rmt.bat
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,36 @@ | |||||||
@echo off
REM -------------------------------------------------------------
REM Batch-script to evaluate all compressed trees with every mode
REM -------------------------------------------------------------
REM For each *.json in TREEDIR and each mode in MODELIST this runs
REM tree_to_rmt.py and writes OUTDIR\<tree name>_<mode>.json.
REM All paths are relative, so the script depends on the current directory.
REM Delayed expansion is required because BASE is assigned inside the
REM FOR block and read back there as !BASE!.
setlocal EnableDelayedExpansion

REM --- where the trees live and where to store results ----------
set TREEDIR=results\compressed_tree
set OUTDIR=results\rmt

REM --- python executable (adjust if needed) ---------------------
set PY=python

REM --- which modes to run --------------------------------------
set MODELIST=naive priority
REM -------------------------------------------------------------

if not exist "%OUTDIR%" mkdir "%OUTDIR%"

for %%F in ("%TREEDIR%\*.json") do (
    REM strip path → get file name without extension
    set BASE=%%~nF

    for %%M in (%MODELIST%) do (
        echo Processing %%~nxF with mode %%M

        "%PY%" tree_to_rmt.py ^
            --mode %%M ^
            --input "%%F" ^
            --output "%OUTDIR%\!BASE!_%%M.json"

    )
)

echo All runs complete.
REM Keep the console window open until a key is pressed.
pause
							
								
								
									
										362
									
								
								run/rmt.txt
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										362
									
								
								run/rmt.txt
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,362 @@ | |||||||
|  | Processing compressed_tree_d10_b0.json with mode naive | ||||||
|  | Output written to results\rmt\compressed_tree_d10_b0_naive.json | ||||||
|  | TCAM bits: 30336 | ||||||
|  | RAM bits:  6888 | ||||||
|  | Processing compressed_tree_d10_b0.json with mode priority | ||||||
|  | Output written to results\rmt\compressed_tree_d10_b0_priority.json | ||||||
|  | TCAM bits: 26648 | ||||||
|  | RAM bits:  6888 | ||||||
|  | Processing compressed_tree_d10_b1.json with mode naive | ||||||
|  | Output written to results\rmt\compressed_tree_d10_b1_naive.json | ||||||
|  | TCAM bits: 29936 | ||||||
|  | RAM bits:  6531 | ||||||
|  | Processing compressed_tree_d10_b1.json with mode priority | ||||||
|  | Output written to results\rmt\compressed_tree_d10_b1_priority.json | ||||||
|  | TCAM bits: 27120 | ||||||
|  | RAM bits:  6531 | ||||||
|  | Processing compressed_tree_d10_b3.json with mode naive | ||||||
|  | Output written to results\rmt\compressed_tree_d10_b3_naive.json | ||||||
|  | TCAM bits: 21712 | ||||||
|  | RAM bits:  5649 | ||||||
|  | Processing compressed_tree_d10_b3.json with mode priority | ||||||
|  | Output written to results\rmt\compressed_tree_d10_b3_priority.json | ||||||
|  | TCAM bits: 20048 | ||||||
|  | RAM bits:  5649 | ||||||
|  | Processing compressed_tree_d11_b0.json with mode naive | ||||||
|  | Output written to results\rmt\compressed_tree_d11_b0_naive.json | ||||||
|  | TCAM bits: 41248 | ||||||
|  | RAM bits:  10332 | ||||||
|  | Processing compressed_tree_d11_b0.json with mode priority | ||||||
|  | Output written to results\rmt\compressed_tree_d11_b0_priority.json | ||||||
|  | TCAM bits: 37592 | ||||||
|  | RAM bits:  10332 | ||||||
|  | Processing compressed_tree_d11_b1.json with mode naive | ||||||
|  | Output written to results\rmt\compressed_tree_d11_b1_naive.json | ||||||
|  | TCAM bits: 41072 | ||||||
|  | RAM bits:  9744 | ||||||
|  | Processing compressed_tree_d11_b1.json with mode priority | ||||||
|  | Output written to results\rmt\compressed_tree_d11_b1_priority.json | ||||||
|  | TCAM bits: 38256 | ||||||
|  | RAM bits:  9744 | ||||||
|  | Processing compressed_tree_d11_b3.json with mode naive | ||||||
|  | Output written to results\rmt\compressed_tree_d11_b3_naive.json | ||||||
|  | TCAM bits: 28464 | ||||||
|  | RAM bits:  8190 | ||||||
|  | Processing compressed_tree_d11_b3.json with mode priority | ||||||
|  | Output written to results\rmt\compressed_tree_d11_b3_priority.json | ||||||
|  | TCAM bits: 26928 | ||||||
|  | RAM bits:  8190 | ||||||
|  | Processing compressed_tree_d12_b0.json with mode naive | ||||||
|  | Output written to results\rmt\compressed_tree_d12_b0_naive.json | ||||||
|  | TCAM bits: 55680 | ||||||
|  | RAM bits:  15393 | ||||||
|  | Processing compressed_tree_d12_b0.json with mode priority | ||||||
|  | Output written to results\rmt\compressed_tree_d12_b0_priority.json | ||||||
|  | TCAM bits: 51592 | ||||||
|  | RAM bits:  15393 | ||||||
|  | Processing compressed_tree_d12_b1.json with mode naive | ||||||
|  | Output written to results\rmt\compressed_tree_d12_b1_naive.json | ||||||
|  | TCAM bits: 54240 | ||||||
|  | RAM bits:  14175 | ||||||
|  | Processing compressed_tree_d12_b1.json with mode priority | ||||||
|  | Output written to results\rmt\compressed_tree_d12_b1_priority.json | ||||||
|  | TCAM bits: 51200 | ||||||
|  | RAM bits:  14175 | ||||||
|  | Processing compressed_tree_d12_b3.json with mode naive | ||||||
|  | Output written to results\rmt\compressed_tree_d12_b3_naive.json | ||||||
|  | TCAM bits: 36048 | ||||||
|  | RAM bits:  11361 | ||||||
|  | Processing compressed_tree_d12_b3.json with mode priority | ||||||
|  | Output written to results\rmt\compressed_tree_d12_b3_priority.json | ||||||
|  | TCAM bits: 34416 | ||||||
|  | RAM bits:  11361 | ||||||
|  | Processing compressed_tree_d13_b0.json with mode naive | ||||||
|  | Output written to results\rmt\compressed_tree_d13_b0_naive.json | ||||||
|  | TCAM bits: 73152 | ||||||
|  | RAM bits:  22680 | ||||||
|  | Processing compressed_tree_d13_b0.json with mode priority | ||||||
|  | Output written to results\rmt\compressed_tree_d13_b0_priority.json | ||||||
|  | TCAM bits: 69096 | ||||||
|  | RAM bits:  22680 | ||||||
|  | Processing compressed_tree_d13_b1.json with mode naive | ||||||
|  | Output written to results\rmt\compressed_tree_d13_b1_naive.json | ||||||
|  | TCAM bits: 71024 | ||||||
|  | RAM bits:  20643 | ||||||
|  | Processing compressed_tree_d13_b1.json with mode priority | ||||||
|  | Output written to results\rmt\compressed_tree_d13_b1_priority.json | ||||||
|  | TCAM bits: 68160 | ||||||
|  | RAM bits:  20643 | ||||||
|  | Processing compressed_tree_d13_b3.json with mode naive | ||||||
|  | Output written to results\rmt\compressed_tree_d13_b3_naive.json | ||||||
|  | TCAM bits: 45152 | ||||||
|  | RAM bits:  16002 | ||||||
|  | Processing compressed_tree_d13_b3.json with mode priority | ||||||
|  | Output written to results\rmt\compressed_tree_d13_b3_priority.json | ||||||
|  | TCAM bits: 43600 | ||||||
|  | RAM bits:  16002 | ||||||
|  | Processing compressed_tree_d14_b0.json with mode naive | ||||||
|  | Output written to results\rmt\compressed_tree_d14_b0_naive.json | ||||||
|  | TCAM bits: 95760 | ||||||
|  | RAM bits:  33012 | ||||||
|  | Processing compressed_tree_d14_b0.json with mode priority | ||||||
|  | Output written to results\rmt\compressed_tree_d14_b0_priority.json | ||||||
|  | TCAM bits: 91656 | ||||||
|  | RAM bits:  33012 | ||||||
|  | Processing compressed_tree_d14_b1.json with mode naive | ||||||
|  | Output written to results\rmt\compressed_tree_d14_b1_naive.json | ||||||
|  | TCAM bits: 93520 | ||||||
|  | RAM bits:  29862 | ||||||
|  | Processing compressed_tree_d14_b1.json with mode priority | ||||||
|  | Output written to results\rmt\compressed_tree_d14_b1_priority.json | ||||||
|  | TCAM bits: 90544 | ||||||
|  | RAM bits:  29862 | ||||||
|  | Processing compressed_tree_d14_b3.json with mode naive | ||||||
|  | Output written to results\rmt\compressed_tree_d14_b3_naive.json | ||||||
|  | TCAM bits: 56144 | ||||||
|  | RAM bits:  21819 | ||||||
|  | Processing compressed_tree_d14_b3.json with mode priority | ||||||
|  | Output written to results\rmt\compressed_tree_d14_b3_priority.json | ||||||
|  | TCAM bits: 54544 | ||||||
|  | RAM bits:  21819 | ||||||
|  | Processing compressed_tree_d15_b0.json with mode naive | ||||||
|  | Output written to results\rmt\compressed_tree_d15_b0_naive.json | ||||||
|  | TCAM bits: 122496 | ||||||
|  | RAM bits:  46662 | ||||||
|  | Processing compressed_tree_d15_b0.json with mode priority | ||||||
|  | Output written to results\rmt\compressed_tree_d15_b0_priority.json | ||||||
|  | TCAM bits: 118792 | ||||||
|  | RAM bits:  46662 | ||||||
|  | Processing compressed_tree_d15_b1.json with mode naive | ||||||
|  | Output written to results\rmt\compressed_tree_d15_b1_naive.json | ||||||
|  | TCAM bits: 118640 | ||||||
|  | RAM bits:  41349 | ||||||
|  | Processing compressed_tree_d15_b1.json with mode priority | ||||||
|  | Output written to results\rmt\compressed_tree_d15_b1_priority.json | ||||||
|  | TCAM bits: 115984 | ||||||
|  | RAM bits:  41349 | ||||||
|  | Processing compressed_tree_d15_b3.json with mode naive | ||||||
|  | Output written to results\rmt\compressed_tree_d15_b3_naive.json | ||||||
|  | TCAM bits: 68928 | ||||||
|  | RAM bits:  28875 | ||||||
|  | Processing compressed_tree_d15_b3.json with mode priority | ||||||
|  | Output written to results\rmt\compressed_tree_d15_b3_priority.json | ||||||
|  | TCAM bits: 67328 | ||||||
|  | RAM bits:  28875 | ||||||
|  | Processing compressed_tree_d1_b0.json with mode naive | ||||||
|  | Output written to results\rmt\compressed_tree_d1_b0_naive.json | ||||||
|  | TCAM bits: 256 | ||||||
|  | RAM bits:  42 | ||||||
|  | Processing compressed_tree_d1_b0.json with mode priority | ||||||
|  | Output written to results\rmt\compressed_tree_d1_b0_priority.json | ||||||
|  | TCAM bits: 128 | ||||||
|  | RAM bits:  42 | ||||||
|  | Processing compressed_tree_d1_b1.json with mode naive | ||||||
|  | Output written to results\rmt\compressed_tree_d1_b1_naive.json | ||||||
|  | TCAM bits: 256 | ||||||
|  | RAM bits:  42 | ||||||
|  | Processing compressed_tree_d1_b1.json with mode priority | ||||||
|  | Output written to results\rmt\compressed_tree_d1_b1_priority.json | ||||||
|  | TCAM bits: 144 | ||||||
|  | RAM bits:  42 | ||||||
|  | Processing compressed_tree_d1_b3.json with mode naive | ||||||
|  | Output written to results\rmt\compressed_tree_d1_b3_naive.json | ||||||
|  | TCAM bits: 240 | ||||||
|  | RAM bits:  42 | ||||||
|  | Processing compressed_tree_d1_b3.json with mode priority | ||||||
|  | Output written to results\rmt\compressed_tree_d1_b3_priority.json | ||||||
|  | TCAM bits: 128 | ||||||
|  | RAM bits:  42 | ||||||
|  | Processing compressed_tree_d2_b0.json with mode naive | ||||||
|  | Output written to results\rmt\compressed_tree_d2_b0_naive.json | ||||||
|  | TCAM bits: 592 | ||||||
|  | RAM bits:  105 | ||||||
|  | Processing compressed_tree_d2_b0.json with mode priority | ||||||
|  | Output written to results\rmt\compressed_tree_d2_b0_priority.json | ||||||
|  | TCAM bits: 288 | ||||||
|  | RAM bits:  105 | ||||||
|  | Processing compressed_tree_d2_b1.json with mode naive | ||||||
|  | Output written to results\rmt\compressed_tree_d2_b1_naive.json | ||||||
|  | TCAM bits: 592 | ||||||
|  | RAM bits:  105 | ||||||
|  | Processing compressed_tree_d2_b1.json with mode priority | ||||||
|  | Output written to results\rmt\compressed_tree_d2_b1_priority.json | ||||||
|  | TCAM bits: 320 | ||||||
|  | RAM bits:  105 | ||||||
|  | Processing compressed_tree_d2_b3.json with mode naive | ||||||
|  | Output written to results\rmt\compressed_tree_d2_b3_naive.json | ||||||
|  | TCAM bits: 544 | ||||||
|  | RAM bits:  105 | ||||||
|  | Processing compressed_tree_d2_b3.json with mode priority | ||||||
|  | Output written to results\rmt\compressed_tree_d2_b3_priority.json | ||||||
|  | TCAM bits: 288 | ||||||
|  | RAM bits:  105 | ||||||
|  | Processing compressed_tree_d3_b0.json with mode naive | ||||||
|  | Output written to results\rmt\compressed_tree_d3_b0_naive.json | ||||||
|  | TCAM bits: 1120 | ||||||
|  | RAM bits:  210 | ||||||
|  | Processing compressed_tree_d3_b0.json with mode priority | ||||||
|  | Output written to results\rmt\compressed_tree_d3_b0_priority.json | ||||||
|  | TCAM bits: 640 | ||||||
|  | RAM bits:  210 | ||||||
|  | Processing compressed_tree_d3_b1.json with mode naive | ||||||
|  | Output written to results\rmt\compressed_tree_d3_b1_naive.json | ||||||
|  | TCAM bits: 1120 | ||||||
|  | RAM bits:  210 | ||||||
|  | Processing compressed_tree_d3_b1.json with mode priority | ||||||
|  | Output written to results\rmt\compressed_tree_d3_b1_priority.json | ||||||
|  | TCAM bits: 680 | ||||||
|  | RAM bits:  210 | ||||||
|  | Processing compressed_tree_d3_b3.json with mode naive | ||||||
|  | Output written to results\rmt\compressed_tree_d3_b3_naive.json | ||||||
|  | TCAM bits: 944 | ||||||
|  | RAM bits:  210 | ||||||
|  | Processing compressed_tree_d3_b3.json with mode priority | ||||||
|  | Output written to results\rmt\compressed_tree_d3_b3_priority.json | ||||||
|  | TCAM bits: 576 | ||||||
|  | RAM bits:  210 | ||||||
|  | Processing compressed_tree_d4_b0.json with mode naive | ||||||
|  | Output written to results\rmt\compressed_tree_d4_b0_naive.json | ||||||
|  | TCAM bits: 1880 | ||||||
|  | RAM bits:  357 | ||||||
|  | Processing compressed_tree_d4_b0.json with mode priority | ||||||
|  | Output written to results\rmt\compressed_tree_d4_b0_priority.json | ||||||
|  | TCAM bits: 1128 | ||||||
|  | RAM bits:  357 | ||||||
|  | Processing compressed_tree_d4_b1.json with mode naive | ||||||
|  | Output written to results\rmt\compressed_tree_d4_b1_naive.json | ||||||
|  | TCAM bits: 1880 | ||||||
|  | RAM bits:  357 | ||||||
|  | Processing compressed_tree_d4_b1.json with mode priority | ||||||
|  | Output written to results\rmt\compressed_tree_d4_b1_priority.json | ||||||
|  | TCAM bits: 1208 | ||||||
|  | RAM bits:  357 | ||||||
|  | Processing compressed_tree_d4_b3.json with mode naive | ||||||
|  | Output written to results\rmt\compressed_tree_d4_b3_naive.json | ||||||
|  | TCAM bits: 1632 | ||||||
|  | RAM bits:  336 | ||||||
|  | Processing compressed_tree_d4_b3.json with mode priority | ||||||
|  | Output written to results\rmt\compressed_tree_d4_b3_priority.json | ||||||
|  | TCAM bits: 1024 | ||||||
|  | RAM bits:  336 | ||||||
|  | Processing compressed_tree_d5_b0.json with mode naive | ||||||
|  | Output written to results\rmt\compressed_tree_d5_b0_naive.json | ||||||
|  | TCAM bits: 3608 | ||||||
|  | RAM bits:  609 | ||||||
|  | Processing compressed_tree_d5_b0.json with mode priority | ||||||
|  | Output written to results\rmt\compressed_tree_d5_b0_priority.json | ||||||
|  | TCAM bits: 2200 | ||||||
|  | RAM bits:  609 | ||||||
|  | Processing compressed_tree_d5_b1.json with mode naive | ||||||
|  | Output written to results\rmt\compressed_tree_d5_b1_naive.json | ||||||
|  | TCAM bits: 3608 | ||||||
|  | RAM bits:  609 | ||||||
|  | Processing compressed_tree_d5_b1.json with mode priority | ||||||
|  | Output written to results\rmt\compressed_tree_d5_b1_priority.json | ||||||
|  | TCAM bits: 2376 | ||||||
|  | RAM bits:  609 | ||||||
|  | Processing compressed_tree_d5_b3.json with mode naive | ||||||
|  | Output written to results\rmt\compressed_tree_d5_b3_naive.json | ||||||
|  | TCAM bits: 2704 | ||||||
|  | RAM bits:  546 | ||||||
|  | Processing compressed_tree_d5_b3.json with mode priority | ||||||
|  | Output written to results\rmt\compressed_tree_d5_b3_priority.json | ||||||
|  | TCAM bits: 1824 | ||||||
|  | RAM bits:  546 | ||||||
|  | Processing compressed_tree_d6_b0.json with mode naive | ||||||
|  | Output written to results\rmt\compressed_tree_d6_b0_naive.json | ||||||
|  | TCAM bits: 6440 | ||||||
|  | RAM bits:  1134 | ||||||
|  | Processing compressed_tree_d6_b0.json with mode priority | ||||||
|  | Output written to results\rmt\compressed_tree_d6_b0_priority.json | ||||||
|  | TCAM bits: 4512 | ||||||
|  | RAM bits:  1134 | ||||||
|  | Processing compressed_tree_d6_b1.json with mode naive | ||||||
|  | Output written to results\rmt\compressed_tree_d6_b1_naive.json | ||||||
|  | TCAM bits: 6440 | ||||||
|  | RAM bits:  1134 | ||||||
|  | Processing compressed_tree_d6_b1.json with mode priority | ||||||
|  | Output written to results\rmt\compressed_tree_d6_b1_priority.json | ||||||
|  | TCAM bits: 4776 | ||||||
|  | RAM bits:  1134 | ||||||
|  | Processing compressed_tree_d6_b3.json with mode naive | ||||||
|  | Output written to results\rmt\compressed_tree_d6_b3_naive.json | ||||||
|  | TCAM bits: 4832 | ||||||
|  | RAM bits:  1008 | ||||||
|  | Processing compressed_tree_d6_b3.json with mode priority | ||||||
|  | Output written to results\rmt\compressed_tree_d6_b3_priority.json | ||||||
|  | TCAM bits: 3648 | ||||||
|  | RAM bits:  1008 | ||||||
|  | Processing compressed_tree_d7_b0.json with mode naive | ||||||
|  | Output written to results\rmt\compressed_tree_d7_b0_naive.json | ||||||
|  | TCAM bits: 10344 | ||||||
|  | RAM bits:  1848 | ||||||
|  | Processing compressed_tree_d7_b0.json with mode priority | ||||||
|  | Output written to results\rmt\compressed_tree_d7_b0_priority.json | ||||||
|  | TCAM bits: 7808 | ||||||
|  | RAM bits:  1848 | ||||||
|  | Processing compressed_tree_d7_b1.json with mode naive | ||||||
|  | Output written to results\rmt\compressed_tree_d7_b1_naive.json | ||||||
|  | TCAM bits: 10312 | ||||||
|  | RAM bits:  1806 | ||||||
|  | Processing compressed_tree_d7_b1.json with mode priority | ||||||
|  | Output written to results\rmt\compressed_tree_d7_b1_priority.json | ||||||
|  | TCAM bits: 8136 | ||||||
|  | RAM bits:  1806 | ||||||
|  | Processing compressed_tree_d7_b3.json with mode naive | ||||||
|  | Output written to results\rmt\compressed_tree_d7_b3_naive.json | ||||||
|  | TCAM bits: 7760 | ||||||
|  | RAM bits:  1596 | ||||||
|  | Processing compressed_tree_d7_b3.json with mode priority | ||||||
|  | Output written to results\rmt\compressed_tree_d7_b3_priority.json | ||||||
|  | TCAM bits: 6352 | ||||||
|  | RAM bits:  1596 | ||||||
|  | Processing compressed_tree_d8_b0.json with mode naive | ||||||
|  | Output written to results\rmt\compressed_tree_d8_b0_naive.json | ||||||
|  | TCAM bits: 15672 | ||||||
|  | RAM bits:  3003 | ||||||
|  | Processing compressed_tree_d8_b0.json with mode priority | ||||||
|  | Output written to results\rmt\compressed_tree_d8_b0_priority.json | ||||||
|  | TCAM bits: 12640 | ||||||
|  | RAM bits:  3003 | ||||||
|  | Processing compressed_tree_d8_b1.json with mode naive | ||||||
|  | Output written to results\rmt\compressed_tree_d8_b1_naive.json | ||||||
|  | TCAM bits: 15576 | ||||||
|  | RAM bits:  2919 | ||||||
|  | Processing compressed_tree_d8_b1.json with mode priority | ||||||
|  | Output written to results\rmt\compressed_tree_d8_b1_priority.json | ||||||
|  | TCAM bits: 13160 | ||||||
|  | RAM bits:  2919 | ||||||
|  | Processing compressed_tree_d8_b3.json with mode naive | ||||||
|  | Output written to results\rmt\compressed_tree_d8_b3_naive.json | ||||||
|  | TCAM bits: 11504 | ||||||
|  | RAM bits:  2625 | ||||||
|  | Processing compressed_tree_d8_b3.json with mode priority | ||||||
|  | Output written to results\rmt\compressed_tree_d8_b3_priority.json | ||||||
|  | TCAM bits: 10016 | ||||||
|  | RAM bits:  2625 | ||||||
|  | Processing compressed_tree_d9_b0.json with mode naive | ||||||
|  | Output written to results\rmt\compressed_tree_d9_b0_naive.json | ||||||
|  | TCAM bits: 22640 | ||||||
|  | RAM bits:  4662 | ||||||
|  | Processing compressed_tree_d9_b0.json with mode priority | ||||||
|  | Output written to results\rmt\compressed_tree_d9_b0_priority.json | ||||||
|  | TCAM bits: 18936 | ||||||
|  | RAM bits:  4662 | ||||||
|  | Processing compressed_tree_d9_b1.json with mode naive | ||||||
|  | Output written to results\rmt\compressed_tree_d9_b1_naive.json | ||||||
|  | TCAM bits: 22784 | ||||||
|  | RAM bits:  4557 | ||||||
|  | Processing compressed_tree_d9_b1.json with mode priority | ||||||
|  | Output written to results\rmt\compressed_tree_d9_b1_priority.json | ||||||
|  | TCAM bits: 19872 | ||||||
|  | RAM bits:  4557 | ||||||
|  | Processing compressed_tree_d9_b3.json with mode naive | ||||||
|  | Output written to results\rmt\compressed_tree_d9_b3_naive.json | ||||||
|  | TCAM bits: 16560 | ||||||
|  | RAM bits:  3948 | ||||||
|  | Processing compressed_tree_d9_b3.json with mode priority | ||||||
|  | Output written to results\rmt\compressed_tree_d9_b3_priority.json | ||||||
|  | TCAM bits: 14880 | ||||||
|  | RAM bits:  3948 | ||||||
|  | All runs complete. | ||||||
|  | Press any key to continue . . .  | ||||||
							
								
								
									
										24
									
								
								run/run.bat
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										24
									
								
								run/run.bat
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,24 @@ | |||||||
|  | @echo off | ||||||
|  | REM Sweep driver: runs decision_tree.py once for every depth/bits combination | ||||||
|  | REM listed below and writes one tree JSON file per combination into OUTDIR. | ||||||
|  | REM --- settings -------------------------------------------------------- | ||||||
|  | REM INPUT      : CSV file handed to --input, relative to the current directory | ||||||
|  | REM OUTDIR     : directory that receives the generated tree JSON files | ||||||
|  | REM DEPTH_LIST : values handed to --depth, one run per value | ||||||
|  | REM BITS_LIST  : values handed to --bits alongside --nudge, one run per value | ||||||
|  | REM PY         : command used to launch the Python interpreter | ||||||
|  | set INPUT=..\data\combined\data.csv | ||||||
|  | set OUTDIR=results\tree | ||||||
|  | set DEPTH_LIST=1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 | ||||||
|  | set BITS_LIST=0 1 3 | ||||||
|  | set PY=python | ||||||
|  | REM --------------------------------------------------------------------- | ||||||
|  |  | ||||||
|  | REM Create the output directory on first use so the script can be re-run safely. | ||||||
|  | if not exist "%OUTDIR%" mkdir "%OUTDIR%" | ||||||
|  |  | ||||||
|  | REM Outer loop walks the depths, inner loop walks the bit counts, so every | ||||||
|  | REM depth is run with every bits value. The output file name encodes both | ||||||
|  | REM as tree_dDEPTH_bBITS.json. Lines ending in a caret continue the command. | ||||||
|  | REM NOTE: the exit status of the Python call is not checked, so a failed run | ||||||
|  | REM does not stop the sweep. | ||||||
|  | for %%D in (%DEPTH_LIST%) do ( | ||||||
|  |     for %%B in (%BITS_LIST%) do ( | ||||||
|  |         echo Running depth=%%D bits=%%B | ||||||
|  |         %PY% decision_tree.py ^ | ||||||
|  |             --input "%INPUT%" ^ | ||||||
|  |             --output "%OUTDIR%\tree_d%%D_b%%B.json" ^ | ||||||
|  |             --depth %%D ^ | ||||||
|  |             --nudge --bits %%B | ||||||
|  |     ) | ||||||
|  | ) | ||||||
|  |  | ||||||
|  | echo All runs complete | ||||||
|  | REM Keep the console window open until a key is pressed. | ||||||
|  | pause | ||||||
							
								
								
									
										272
									
								
								run/run.txt
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										272
									
								
								run/run.txt
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,272 @@ | |||||||
|  | Running depth=1 bits=0 | ||||||
|  | dataset size: 7681108 | ||||||
|  | train accuracy (before nudging): 0.6249802762830571 | ||||||
|  | nudging enabled, removed bottom 0 bit(s) per threshold | ||||||
|  | train accuracy (after  nudging): 0.6249802762830571 | ||||||
|  | Wrote tree to C:\Users\jaipa\CS\cs216\IdealRMT-DecisionTrees\run\results\tree\tree_d1_b0.json | ||||||
|  | Running depth=1 bits=1 | ||||||
|  | dataset size: 7681108 | ||||||
|  | train accuracy (before nudging): 0.6249802762830571 | ||||||
|  | nudging enabled, removed bottom 1 bit(s) per threshold | ||||||
|  | train accuracy (after  nudging): 0.6249802762830571 | ||||||
|  | Wrote tree to C:\Users\jaipa\CS\cs216\IdealRMT-DecisionTrees\run\results\tree\tree_d1_b1.json | ||||||
|  | Running depth=1 bits=3 | ||||||
|  | dataset size: 7681108 | ||||||
|  | train accuracy (before nudging): 0.6249802762830571 | ||||||
|  | nudging enabled, removed bottom 3 bit(s) per threshold | ||||||
|  | train accuracy (after  nudging): 0.6249802762830571 | ||||||
|  | Wrote tree to C:\Users\jaipa\CS\cs216\IdealRMT-DecisionTrees\run\results\tree\tree_d1_b3.json | ||||||
|  | Running depth=2 bits=0 | ||||||
|  | dataset size: 7681108 | ||||||
|  | train accuracy (before nudging): 0.6329657127591488 | ||||||
|  | nudging enabled, removed bottom 0 bit(s) per threshold | ||||||
|  | train accuracy (after  nudging): 0.6329657127591488 | ||||||
|  | Wrote tree to C:\Users\jaipa\CS\cs216\IdealRMT-DecisionTrees\run\results\tree\tree_d2_b0.json | ||||||
|  | Running depth=2 bits=1 | ||||||
|  | dataset size: 7681108 | ||||||
|  | train accuracy (before nudging): 0.6329657127591488 | ||||||
|  | nudging enabled, removed bottom 1 bit(s) per threshold | ||||||
|  | train accuracy (after  nudging): 0.632965582569598 | ||||||
|  | Wrote tree to C:\Users\jaipa\CS\cs216\IdealRMT-DecisionTrees\run\results\tree\tree_d2_b1.json | ||||||
|  | Running depth=2 bits=3 | ||||||
|  | dataset size: 7681108 | ||||||
|  | train accuracy (before nudging): 0.6329657127591488 | ||||||
|  | nudging enabled, removed bottom 3 bit(s) per threshold | ||||||
|  | train accuracy (after  nudging): 0.632991490290203 | ||||||
|  | Wrote tree to C:\Users\jaipa\CS\cs216\IdealRMT-DecisionTrees\run\results\tree\tree_d2_b3.json | ||||||
|  | Running depth=3 bits=0 | ||||||
|  | dataset size: 7681108 | ||||||
|  | train accuracy (before nudging): 0.6770542739406867 | ||||||
|  | nudging enabled, removed bottom 0 bit(s) per threshold | ||||||
|  | train accuracy (after  nudging): 0.6770542739406867 | ||||||
|  | Wrote tree to C:\Users\jaipa\CS\cs216\IdealRMT-DecisionTrees\run\results\tree\tree_d3_b0.json | ||||||
|  | Running depth=3 bits=1 | ||||||
|  | dataset size: 7681108 | ||||||
|  | train accuracy (before nudging): 0.6770542739406867 | ||||||
|  | nudging enabled, removed bottom 1 bit(s) per threshold | ||||||
|  | train accuracy (after  nudging): 0.6770412549856089 | ||||||
|  | Wrote tree to C:\Users\jaipa\CS\cs216\IdealRMT-DecisionTrees\run\results\tree\tree_d3_b1.json | ||||||
|  | Running depth=3 bits=3 | ||||||
|  | dataset size: 7681108 | ||||||
|  | train accuracy (before nudging): 0.6770542739406867 | ||||||
|  | nudging enabled, removed bottom 3 bit(s) per threshold | ||||||
|  | train accuracy (after  nudging): 0.6785083610333301 | ||||||
|  | Wrote tree to C:\Users\jaipa\CS\cs216\IdealRMT-DecisionTrees\run\results\tree\tree_d3_b3.json | ||||||
|  | Running depth=4 bits=0 | ||||||
|  | dataset size: 7681108 | ||||||
|  | train accuracy (before nudging): 0.7785798611346175 | ||||||
|  | nudging enabled, removed bottom 0 bit(s) per threshold | ||||||
|  | train accuracy (after  nudging): 0.7785798611346175 | ||||||
|  | Wrote tree to C:\Users\jaipa\CS\cs216\IdealRMT-DecisionTrees\run\results\tree\tree_d4_b0.json | ||||||
|  | Running depth=4 bits=1 | ||||||
|  | dataset size: 7681108 | ||||||
|  | train accuracy (before nudging): 0.7785798611346175 | ||||||
|  | nudging enabled, removed bottom 1 bit(s) per threshold | ||||||
|  | train accuracy (after  nudging): 0.7762147075656273 | ||||||
|  | Wrote tree to C:\Users\jaipa\CS\cs216\IdealRMT-DecisionTrees\run\results\tree\tree_d4_b1.json | ||||||
|  | Running depth=4 bits=3 | ||||||
|  | dataset size: 7681108 | ||||||
|  | train accuracy (before nudging): 0.7785798611346175 | ||||||
|  | nudging enabled, removed bottom 3 bit(s) per threshold | ||||||
|  | train accuracy (after  nudging): 0.7764365505601536 | ||||||
|  | Wrote tree to C:\Users\jaipa\CS\cs216\IdealRMT-DecisionTrees\run\results\tree\tree_d4_b3.json | ||||||
|  | Running depth=5 bits=0 | ||||||
|  | dataset size: 7681108 | ||||||
|  | train accuracy (before nudging): 0.8410252791654538 | ||||||
|  | nudging enabled, removed bottom 0 bit(s) per threshold | ||||||
|  | train accuracy (after  nudging): 0.8410252791654538 | ||||||
|  | Wrote tree to C:\Users\jaipa\CS\cs216\IdealRMT-DecisionTrees\run\results\tree\tree_d5_b0.json | ||||||
|  | Running depth=5 bits=1 | ||||||
|  | dataset size: 7681108 | ||||||
|  | train accuracy (before nudging): 0.8410252791654538 | ||||||
|  | nudging enabled, removed bottom 1 bit(s) per threshold | ||||||
|  | train accuracy (after  nudging): 0.834092425207405 | ||||||
|  | Wrote tree to C:\Users\jaipa\CS\cs216\IdealRMT-DecisionTrees\run\results\tree\tree_d5_b1.json | ||||||
|  | Running depth=5 bits=3 | ||||||
|  | dataset size: 7681108 | ||||||
|  | train accuracy (before nudging): 0.8410252791654538 | ||||||
|  | nudging enabled, removed bottom 3 bit(s) per threshold | ||||||
|  | train accuracy (after  nudging): 0.772544924508287 | ||||||
|  | Wrote tree to C:\Users\jaipa\CS\cs216\IdealRMT-DecisionTrees\run\results\tree\tree_d5_b3.json | ||||||
|  | Running depth=6 bits=0 | ||||||
|  | dataset size: 7681108 | ||||||
|  | train accuracy (before nudging): 0.8646269522574087 | ||||||
|  | nudging enabled, removed bottom 0 bit(s) per threshold | ||||||
|  | train accuracy (after  nudging): 0.8646269522574087 | ||||||
|  | Wrote tree to C:\Users\jaipa\CS\cs216\IdealRMT-DecisionTrees\run\results\tree\tree_d6_b0.json | ||||||
|  | Running depth=6 bits=1 | ||||||
|  | dataset size: 7681108 | ||||||
|  | train accuracy (before nudging): 0.8646269522574087 | ||||||
|  | nudging enabled, removed bottom 1 bit(s) per threshold | ||||||
|  | train accuracy (after  nudging): 0.8576925360247506 | ||||||
|  | Wrote tree to C:\Users\jaipa\CS\cs216\IdealRMT-DecisionTrees\run\results\tree\tree_d6_b1.json | ||||||
|  | Running depth=6 bits=3 | ||||||
|  | dataset size: 7681108 | ||||||
|  | train accuracy (before nudging): 0.8646269522574087 | ||||||
|  | nudging enabled, removed bottom 3 bit(s) per threshold | ||||||
|  | train accuracy (after  nudging): 0.794651761178205 | ||||||
|  | Wrote tree to C:\Users\jaipa\CS\cs216\IdealRMT-DecisionTrees\run\results\tree\tree_d6_b3.json | ||||||
|  | Running depth=7 bits=0 | ||||||
|  | dataset size: 7681108 | ||||||
|  | train accuracy (before nudging): 0.8806056365826389 | ||||||
|  | nudging enabled, removed bottom 0 bit(s) per threshold | ||||||
|  | train accuracy (after  nudging): 0.8806056365826389 | ||||||
|  | Wrote tree to C:\Users\jaipa\CS\cs216\IdealRMT-DecisionTrees\run\results\tree\tree_d7_b0.json | ||||||
|  | Running depth=7 bits=1 | ||||||
|  | dataset size: 7681108 | ||||||
|  | train accuracy (before nudging): 0.8806056365826389 | ||||||
|  | nudging enabled, removed bottom 1 bit(s) per threshold | ||||||
|  | train accuracy (after  nudging): 0.8736095105029118 | ||||||
|  | Wrote tree to C:\Users\jaipa\CS\cs216\IdealRMT-DecisionTrees\run\results\tree\tree_d7_b1.json | ||||||
|  | Running depth=7 bits=3 | ||||||
|  | dataset size: 7681108 | ||||||
|  | train accuracy (before nudging): 0.8806056365826389 | ||||||
|  | nudging enabled, removed bottom 3 bit(s) per threshold | ||||||
|  | train accuracy (after  nudging): 0.7695685309983924 | ||||||
|  | Wrote tree to C:\Users\jaipa\CS\cs216\IdealRMT-DecisionTrees\run\results\tree\tree_d7_b3.json | ||||||
|  | Running depth=8 bits=0 | ||||||
|  | dataset size: 7681108 | ||||||
|  | train accuracy (before nudging): 0.8930218140403702 | ||||||
|  | nudging enabled, removed bottom 0 bit(s) per threshold | ||||||
|  | train accuracy (after  nudging): 0.8930218140403702 | ||||||
|  | Wrote tree to C:\Users\jaipa\CS\cs216\IdealRMT-DecisionTrees\run\results\tree\tree_d8_b0.json | ||||||
|  | Running depth=8 bits=1 | ||||||
|  | dataset size: 7681108 | ||||||
|  | train accuracy (before nudging): 0.8930218140403702 | ||||||
|  | nudging enabled, removed bottom 1 bit(s) per threshold | ||||||
|  | train accuracy (after  nudging): 0.8853817704424934 | ||||||
|  | Wrote tree to C:\Users\jaipa\CS\cs216\IdealRMT-DecisionTrees\run\results\tree\tree_d8_b1.json | ||||||
|  | Running depth=8 bits=3 | ||||||
|  | dataset size: 7681108 | ||||||
|  | train accuracy (before nudging): 0.8930218140403702 | ||||||
|  | nudging enabled, removed bottom 3 bit(s) per threshold | ||||||
|  | train accuracy (after  nudging): 0.7773965683075931 | ||||||
|  | Wrote tree to C:\Users\jaipa\CS\cs216\IdealRMT-DecisionTrees\run\results\tree\tree_d8_b3.json | ||||||
|  | Running depth=9 bits=0 | ||||||
|  | dataset size: 7681108 | ||||||
|  | train accuracy (before nudging): 0.9065990219119429 | ||||||
|  | nudging enabled, removed bottom 0 bit(s) per threshold | ||||||
|  | train accuracy (after  nudging): 0.9065990219119429 | ||||||
|  | Wrote tree to C:\Users\jaipa\CS\cs216\IdealRMT-DecisionTrees\run\results\tree\tree_d9_b0.json | ||||||
|  | Running depth=9 bits=1 | ||||||
|  | dataset size: 7681108 | ||||||
|  | train accuracy (before nudging): 0.9065990219119429 | ||||||
|  | nudging enabled, removed bottom 1 bit(s) per threshold | ||||||
|  | train accuracy (after  nudging): 0.8971600191014109 | ||||||
|  | Wrote tree to C:\Users\jaipa\CS\cs216\IdealRMT-DecisionTrees\run\results\tree\tree_d9_b1.json | ||||||
|  | Running depth=9 bits=3 | ||||||
|  | dataset size: 7681108 | ||||||
|  | train accuracy (before nudging): 0.9065990219119429 | ||||||
|  | nudging enabled, removed bottom 3 bit(s) per threshold | ||||||
|  | train accuracy (after  nudging): 0.7901483744272311 | ||||||
|  | Wrote tree to C:\Users\jaipa\CS\cs216\IdealRMT-DecisionTrees\run\results\tree\tree_d9_b3.json | ||||||
|  | Running depth=10 bits=0 | ||||||
|  | dataset size: 7681108 | ||||||
|  | train accuracy (before nudging): 0.9131070673658019 | ||||||
|  | nudging enabled, removed bottom 0 bit(s) per threshold | ||||||
|  | train accuracy (after  nudging): 0.9131070673658019 | ||||||
|  | Wrote tree to C:\Users\jaipa\CS\cs216\IdealRMT-DecisionTrees\run\results\tree\tree_d10_b0.json | ||||||
|  | Running depth=10 bits=1 | ||||||
|  | dataset size: 7681108 | ||||||
|  | train accuracy (before nudging): 0.9131070673658019 | ||||||
|  | nudging enabled, removed bottom 1 bit(s) per threshold | ||||||
|  | train accuracy (after  nudging): 0.9012124292484887 | ||||||
|  | Wrote tree to C:\Users\jaipa\CS\cs216\IdealRMT-DecisionTrees\run\results\tree\tree_d10_b1.json | ||||||
|  | Running depth=10 bits=3 | ||||||
|  | dataset size: 7681108 | ||||||
|  | train accuracy (before nudging): 0.9131070673658019 | ||||||
|  | nudging enabled, removed bottom 3 bit(s) per threshold | ||||||
|  | train accuracy (after  nudging): 0.7823837394292594 | ||||||
|  | Wrote tree to C:\Users\jaipa\CS\cs216\IdealRMT-DecisionTrees\run\results\tree\tree_d10_b3.json | ||||||
|  | Running depth=11 bits=0 | ||||||
|  | dataset size: 7681108 | ||||||
|  | train accuracy (before nudging): 0.9167131877328115 | ||||||
|  | nudging enabled, removed bottom 0 bit(s) per threshold | ||||||
|  | train accuracy (after  nudging): 0.9167131877328115 | ||||||
|  | Wrote tree to C:\Users\jaipa\CS\cs216\IdealRMT-DecisionTrees\run\results\tree\tree_d11_b0.json | ||||||
|  | Running depth=11 bits=1 | ||||||
|  | dataset size: 7681108 | ||||||
|  | train accuracy (before nudging): 0.9167131877328115 | ||||||
|  | nudging enabled, removed bottom 1 bit(s) per threshold | ||||||
|  | train accuracy (after  nudging): 0.9033505322409215 | ||||||
|  | Wrote tree to C:\Users\jaipa\CS\cs216\IdealRMT-DecisionTrees\run\results\tree\tree_d11_b1.json | ||||||
|  | Running depth=11 bits=3 | ||||||
|  | dataset size: 7681108 | ||||||
|  | train accuracy (before nudging): 0.9167131877328115 | ||||||
|  | nudging enabled, removed bottom 3 bit(s) per threshold | ||||||
|  | train accuracy (after  nudging): 0.7834850128392935 | ||||||
|  | Wrote tree to C:\Users\jaipa\CS\cs216\IdealRMT-DecisionTrees\run\results\tree\tree_d11_b3.json | ||||||
|  | Running depth=12 bits=0 | ||||||
|  | dataset size: 7681108 | ||||||
|  | train accuracy (before nudging): 0.9190772997853955 | ||||||
|  | nudging enabled, removed bottom 0 bit(s) per threshold | ||||||
|  | train accuracy (after  nudging): 0.9190772997853955 | ||||||
|  | Wrote tree to C:\Users\jaipa\CS\cs216\IdealRMT-DecisionTrees\run\results\tree\tree_d12_b0.json | ||||||
|  | Running depth=12 bits=1 | ||||||
|  | dataset size: 7681108 | ||||||
|  | train accuracy (before nudging): 0.9190772997853955 | ||||||
|  | nudging enabled, removed bottom 1 bit(s) per threshold | ||||||
|  | train accuracy (after  nudging): 0.9050692946902973 | ||||||
|  | Wrote tree to C:\Users\jaipa\CS\cs216\IdealRMT-DecisionTrees\run\results\tree\tree_d12_b1.json | ||||||
|  | Running depth=12 bits=3 | ||||||
|  | dataset size: 7681108 | ||||||
|  | train accuracy (before nudging): 0.9190772997853955 | ||||||
|  | nudging enabled, removed bottom 3 bit(s) per threshold | ||||||
|  | train accuracy (after  nudging): 0.7733082258445005 | ||||||
|  | Wrote tree to C:\Users\jaipa\CS\cs216\IdealRMT-DecisionTrees\run\results\tree\tree_d12_b3.json | ||||||
|  | Running depth=13 bits=0 | ||||||
|  | dataset size: 7681108 | ||||||
|  | train accuracy (before nudging): 0.9210431620021486 | ||||||
|  | nudging enabled, removed bottom 0 bit(s) per threshold | ||||||
|  | train accuracy (after  nudging): 0.9210431620021486 | ||||||
|  | Wrote tree to C:\Users\jaipa\CS\cs216\IdealRMT-DecisionTrees\run\results\tree\tree_d13_b0.json | ||||||
|  | Running depth=13 bits=1 | ||||||
|  | dataset size: 7681108 | ||||||
|  | train accuracy (before nudging): 0.9210431620021486 | ||||||
|  | nudging enabled, removed bottom 1 bit(s) per threshold | ||||||
|  | train accuracy (after  nudging): 0.9069113466442602 | ||||||
|  | Wrote tree to C:\Users\jaipa\CS\cs216\IdealRMT-DecisionTrees\run\results\tree\tree_d13_b1.json | ||||||
|  | Running depth=13 bits=3 | ||||||
|  | dataset size: 7681108 | ||||||
|  | train accuracy (before nudging): 0.9210431620021486 | ||||||
|  | nudging enabled, removed bottom 3 bit(s) per threshold | ||||||
|  | train accuracy (after  nudging): 0.7656775558942799 | ||||||
|  | Wrote tree to C:\Users\jaipa\CS\cs216\IdealRMT-DecisionTrees\run\results\tree\tree_d13_b3.json | ||||||
|  | Running depth=14 bits=0 | ||||||
|  | dataset size: 7681108 | ||||||
|  | train accuracy (before nudging): 0.9232170671210456 | ||||||
|  | nudging enabled, removed bottom 0 bit(s) per threshold | ||||||
|  | train accuracy (after  nudging): 0.9232170671210456 | ||||||
|  | Wrote tree to C:\Users\jaipa\CS\cs216\IdealRMT-DecisionTrees\run\results\tree\tree_d14_b0.json | ||||||
|  | Running depth=14 bits=1 | ||||||
|  | dataset size: 7681108 | ||||||
|  | train accuracy (before nudging): 0.9232169369314948 | ||||||
|  | nudging enabled, removed bottom 1 bit(s) per threshold | ||||||
|  | train accuracy (after  nudging): 0.9071005120615411 | ||||||
|  | Wrote tree to C:\Users\jaipa\CS\cs216\IdealRMT-DecisionTrees\run\results\tree\tree_d14_b1.json | ||||||
|  | Running depth=14 bits=3 | ||||||
|  | dataset size: 7681108 | ||||||
|  | train accuracy (before nudging): 0.9232170671210456 | ||||||
|  | nudging enabled, removed bottom 3 bit(s) per threshold | ||||||
|  | train accuracy (after  nudging): 0.7649352150757417 | ||||||
|  | Wrote tree to C:\Users\jaipa\CS\cs216\IdealRMT-DecisionTrees\run\results\tree\tree_d14_b3.json | ||||||
|  | Running depth=15 bits=0 | ||||||
|  | dataset size: 7681108 | ||||||
|  | train accuracy (before nudging): 0.9249752770043072 | ||||||
|  | nudging enabled, removed bottom 0 bit(s) per threshold | ||||||
|  | train accuracy (after  nudging): 0.9249752770043072 | ||||||
|  | Wrote tree to C:\Users\jaipa\CS\cs216\IdealRMT-DecisionTrees\run\results\tree\tree_d15_b0.json | ||||||
|  | Running depth=15 bits=1 | ||||||
|  | dataset size: 7681108 | ||||||
|  | train accuracy (before nudging): 0.9249752770043072 | ||||||
|  | nudging enabled, removed bottom 1 bit(s) per threshold | ||||||
|  | train accuracy (after  nudging): 0.908089692268355 | ||||||
|  | Wrote tree to C:\Users\jaipa\CS\cs216\IdealRMT-DecisionTrees\run\results\tree\tree_d15_b1.json | ||||||
|  | Running depth=15 bits=3 | ||||||
|  | dataset size: 7681108 | ||||||
|  | train accuracy (before nudging): 0.9249752770043072 | ||||||
|  | nudging enabled, removed bottom 3 bit(s) per threshold | ||||||
|  | train accuracy (after  nudging): 0.762985496363285 | ||||||
|  | Wrote tree to C:\Users\jaipa\CS\cs216\IdealRMT-DecisionTrees\run\results\tree\tree_d15_b3.json | ||||||
|  | All runs complete | ||||||
|  | Press any key to continue . . .  | ||||||
							
								
								
									
										173
									
								
								run/tree_compress.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										173
									
								
								run/tree_compress.py
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,173 @@ | |||||||
|  | #!/usr/bin/env python3 | ||||||
|  | """Batch‑compress decision‑tree JSON files. | ||||||
|  |  | ||||||
|  | This script preserves the original logic but loops over every *.json file | ||||||
|  | in results/tree and drops a corresponding compressed file in | ||||||
|  | results/compressed_tree. | ||||||
|  |  | ||||||
|  | Example: | ||||||
|  |     $ python tree_compress.py | ||||||
|  | """ | ||||||
|  |  | ||||||
|  | from __future__ import annotations | ||||||
|  |  | ||||||
|  | import json | ||||||
|  | import math | ||||||
|  | import os | ||||||
|  | from collections import defaultdict | ||||||
|  | from pathlib import Path | ||||||
|  |  | ||||||
# Directory scanned for decision-tree JSON files (relative to the current
# working directory).
INPUT_DIR = Path("results/tree")
# Directory that receives one compressed JSON file per input tree.
OUTPUT_DIR = Path("results/compressed_tree")
# Created at import time so the batch loop below can write without checking.
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
|  |  | ||||||
|  |  | ||||||
class SetEncoder(json.JSONEncoder):
    """JSON encoder that serialises ``set`` objects as JSON arrays.

    Elements are written in sorted order whenever they are mutually
    comparable, so repeated runs produce identical files.  (Iteration order of
    a plain ``set`` depends on hashing, which is randomised per process for
    strings.)  Sets whose elements cannot be ordered fall back to their
    arbitrary iteration order.
    """

    def default(self, obj):  # type: ignore[override]
        """Return a JSON-serialisable stand-in for *obj*.

        Raises:
            TypeError: via the base class, for any unsupported non-set type.
        """
        if isinstance(obj, set):
            try:
                return sorted(obj)
            except TypeError:
                # Mixed / unorderable element types: keep arbitrary order.
                return list(obj)
        return super().default(obj)
|  |  | ||||||
|  |  | ||||||
|  | # helper function given a range and value x returns if x is in the range | ||||||
|  |  | ||||||
def is_in_range(x: int, lower: int | None, upper: int | None) -> bool:  # noqa: N803
    """Return True when *x* lies in the half-open interval ``(lower, upper]``.

    A bound of ``None`` means the interval is unbounded on that side.
    """
    above_lower = lower is None or x > lower
    within_upper = upper is None or x <= upper
    return above_lower and within_upper
|  |  | ||||||
|  |  | ||||||
# Compress every tree found in INPUT_DIR and write the result to OUTPUT_DIR.
for tree_path in INPUT_DIR.glob("*.json"):
    with tree_path.open() as f:
        tree = json.load(f)

    paths = tree["paths"]
    path_classes = tree["classes"]

    # Give every path an integer id and round each threshold down to an
    # integer.  The same rounding is used for "<=" and ">" conditions
    # (presumably because features are integer-valued, where x <= t and
    # x > t are unchanged by flooring t -- TODO confirm).
    path_ids: set[int] = set()
    for idx, path in enumerate(paths):
        path["id"] = idx
        path_ids.add(idx)
        for condition in path["conditions"]:
            condition["value"] = math.floor(condition["value"])

    # Collect the distinct thresholds used for each feature; sorted, they
    # delimit a set of disjoint ranges per feature.
    breakpoint_sets: dict[str, set[int]] = defaultdict(set)
    for path in paths:
        for condition in path["conditions"]:
            breakpoint_sets[condition["feature"]].add(condition["value"])
    breakpoints: dict[str, list[int]] = {
        feature: sorted(values) for feature, values in breakpoint_sets.items()
    }

    # Collapse each path's conditions into one (min, max] interval per
    # feature: the tightest upper bound from "<=" and lower bound from ">".
    for path in paths:
        compressed: dict[str, dict[str, int | None]] = {
            feature: {"min": None, "max": None} for feature in breakpoints
        }
        for condition in path["conditions"]:
            bounds = compressed[condition["feature"]]
            operation = condition["operation"]
            value = condition["value"]
            if operation == "<=":
                if bounds["max"] is None or value < bounds["max"]:
                    bounds["max"] = value
            elif operation == ">":
                if bounds["min"] is None or value > bounds["min"]:
                    bounds["min"] = value
        path["compressed"] = compressed

    # One bucket per disjoint range (number of breakpoints + 1) per feature,
    # holding the ids / classes of every path whose interval covers it.
    buckets_id: dict[str, list[set[int]]] = {}
    buckets_class: dict[str, list[set[str]]] = {}
    for feature, points in breakpoints.items():
        buckets_id[feature] = [set() for _ in range(len(points) + 1)]
        buckets_class[feature] = [set() for _ in range(len(points) + 1)]

    # Fill buckets.  Bucket i is represented by its upper breakpoint; the
    # final, open-ended bucket is probed one past the last breakpoint.
    for path in paths:
        pid = path["id"]
        cls = path["classification"]
        for feature_name, bounds in path["compressed"].items():
            lower = bounds["min"]
            upper = bounds["max"]
            points = breakpoints[feature_name]

            for idx, bp in enumerate(points):
                if is_in_range(bp, lower, upper):
                    buckets_id[feature_name][idx].add(pid)
                    buckets_class[feature_name][idx].add(cls)
            # Every feature in `breakpoints` has at least one breakpoint, so
            # points[-1] always exists.
            if is_in_range(points[-1] + 1, lower, upper):
                buckets_id[feature_name][-1].add(pid)
                buckets_class[feature_name][-1].add(cls)

    # Combine breakpoints and buckets into one list of ranges per feature.
    # The first range has no lower bound and the last has no upper bound.
    compressed_layers: dict[str, list[dict[str, object]]] = {}
    for feature_name, points in breakpoints.items():
        edges = [None, *points, None]
        compressed_layers[feature_name] = [
            {
                "min": edges[i],
                "max": edges[i + 1],
                "paths": buckets_id[feature_name][i],
                "classes": buckets_class[feature_name][i],
            }
            for i in range(len(points) + 1)
        ]

    path_to_class = {path["id"]: path["classification"] for path in paths}

    compressed_tree = {
        "paths": list(path_ids),
        "classes": path_classes,
        "layers": compressed_layers,
        "path_to_class": path_to_class,
    }

    out_path = OUTPUT_DIR / tree_path.name.replace("tree", "compressed_tree")
    with out_path.open("w") as f_out:
        json.dump(compressed_tree, f_out, indent=4, cls=SetEncoder)

    # print(f"Wrote {out_path.relative_to(Path.cwd())}")
							
								
								
									
										279
									
								
								run/tree_to_rmt.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										279
									
								
								run/tree_to_rmt.py
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,279 @@ | |||||||
|  | #!/usr/bin/env python3 | ||||||
|  | """Range‑to‑Prefix evaluation tool | ||||||
|  |  | ||||||
|  | This script keeps the original logic intact while letting you choose | ||||||
|  | which expansion strategy to run via a command‑line flag. | ||||||
|  |  | ||||||
|  | Example: | ||||||
|  |     $ python tree_to_rmt.py --mode naive | ||||||
|  |     $ python tree_to_rmt.py --mode priority --input mytree.json --output result.json | ||||||
|  | """ | ||||||
|  |  | ||||||
|  | import argparse | ||||||
|  | import json | ||||||
|  | import math | ||||||
|  | import sys | ||||||
|  | from pathlib import Path | ||||||
|  |  | ||||||
|  | # --------------------------------------------------------------------------- | ||||||
|  | # Static configuration | ||||||
|  | # --------------------------------------------------------------------------- | ||||||
# Bit width of each match field, keyed by layer (feature) name.  The RMT
# builders below use it as the TCAM key size and as the width passed to the
# range-to-prefix expansion.
field_width = {
    "src": 16,
    "dst": 16,
    "protocol": 8,
}
|  |  | ||||||
|  | # --------------------------------------------------------------------------- | ||||||
|  | # Helper routines (unchanged) | ||||||
|  | # --------------------------------------------------------------------------- | ||||||
|  |  | ||||||
def int_to_bin(i, width):
    """Return *i* as a binary digit string, left-padded with zeros to *width*.

    Values that need more than *width* digits are returned unpadded and
    untruncated.
    """
    digits = bin(i)[2:]  # drop the "0b" marker
    return digits.rjust(width, "0")
|  |  | ||||||
|  |  | ||||||
def increment_dc(pfx):
    """Widen a ternary prefix by turning its last fixed bit into a wildcard.

    *pfx* is a string of fixed bits followed by zero or more ``*`` (don't
    care) characters, e.g. ``"0101"`` -> ``"010*"`` and ``"01**"`` -> ``"0***"``.

    A prefix with no fixed bits left (all wildcards, or empty) is returned
    unchanged; previously such input produced a longer, corrupted string.
    """
    first_dc = pfx.find("*")
    if first_dc == -1:
        first_dc = len(pfx)
    if first_dc == 0:
        # Nothing left to widen.
        return pfx
    return pfx[: first_dc - 1] + "*" + pfx[first_dc:]
|  |  | ||||||
|  |  | ||||||
def can_merge(pfx_a, pfx_b):
    """Return True when two ternary prefixes are siblings.

    With wildcards stripped, siblings share every fixed bit except the last
    one, which must differ; merging them yields a prefix one bit shorter.

    Prefixes with no fixed bits left are never mergeable (previously this
    could raise ``IndexError``).
    """
    bits_a = pfx_a.replace("*", "")
    bits_b = pfx_b.replace("*", "")
    if not bits_a or not bits_b:
        return False
    return bits_a[:-1] == bits_b[:-1] and bits_a[-1] != bits_b[-1]
|  |  | ||||||
|  |  | ||||||
def merge(pfx_a, prefixes):
    """Merge *pfx_a* into the tail of *prefixes*, cascading while possible.

    The caller has already established that *pfx_a* is a sibling of
    ``prefixes[-1]``.  The tail is replaced by the widened prefix, and then,
    as long as the last two entries are siblings, they are collapsed into a
    single wider prefix.  *prefixes* is modified in place; nothing is returned.

    The cascade stops at the first pair that cannot be merged, so the element
    popped is always the one that was just absorbed.
    """
    prefixes[-1] = increment_dc(pfx_a)
    while len(prefixes) >= 2 and can_merge(prefixes[-2], prefixes[-1]):
        prefixes.pop()
        prefixes[-1] = increment_dc(prefixes[-1])
|  |  | ||||||
|  |  | ||||||
def convert_range(lower, upper, width):
    """Expand the integer range ``lower..upper`` into ternary prefixes.

    Both ends are included.  *upper* is clamped to the largest value
    representable in *width* bits.  Values are visited one at a time and
    merged into the preceding prefix whenever the two are siblings.

    Returns the list of prefix strings in ascending order.
    """
    result = [int_to_bin(lower, width)]
    last_value = min(upper, 2 ** width - 1)
    value = lower + 1
    while value <= last_value:
        candidate = int_to_bin(value, width)
        if not can_merge(candidate, result[-1]):
            result.append(candidate)
        else:
            merge(candidate, result)
        value += 1
    return result
|  |  | ||||||
|  | # --------------------------------------------------------------------------- | ||||||
|  | # RMT construction strategies (logic preserved) | ||||||
|  | # --------------------------------------------------------------------------- | ||||||
|  |  | ||||||
def worst_case_rmt(tree, widths=None):
    """Build an RMT description using a worst-case prefix-expansion bound.

    Every range in a layer is assumed to need ``2 * k`` prefixes, where ``k``
    is the layer's field width, so no actual range expansion is performed.

    Args:
        tree: compressed tree with a ``"layers"`` mapping (layer name -> list
            of ranges) and a ``"classes"`` collection.
        widths: optional mapping of layer name -> field width in bits;
            defaults to the module-level ``field_width``.

    Returns:
        ``(rmt, tcam_bits, ram_bits)`` where *rmt* is a list with one TCAM
        table and one RAM table per layer.
    """
    widths = field_width if widths is None else widths
    # Read the inputs from the argument instead of relying on module globals.
    layers = tree["layers"]
    num_classes = len(tree["classes"])

    rmt = []
    tcam_bits = 0
    ram_bits = 0

    for step, (layer, ranges) in enumerate(layers.items()):
        num_ranges = len(ranges)
        prefix_width = widths[layer]
        # assume that each range requires all of 2*k prefixes when performing
        # prefix expansion, therefore there are 2*k * R prefixes for R ranges
        num_prefixes = 2 * prefix_width * num_ranges

        tcam = {
            "id": f"{layer}_range",
            "step": step,
            "match": "ternary",
            "entries": num_prefixes,
            "key_size": prefix_width,
        }
        tcam_bits += num_prefixes * prefix_width

        # assume basic pointer reuse for metadata storage
        ram = {
            "id": f"{layer}_meta",
            "step": step,
            "match": "exact",
            "method": "index",
            # log2 is undefined for an empty layer; 0 or 1 ranges need no key bits
            "key_size": math.ceil(math.log2(num_ranges)) if num_ranges > 1 else 0,
            "data_size": num_classes,
        }
        ram_bits += num_ranges * num_classes

        rmt.append(tcam)
        rmt.append(ram)

    return rmt, tcam_bits, ram_bits
|  |  | ||||||
|  |  | ||||||
def naive_rmt(tree, widths=None):
    """Build an RMT description using naive range-to-prefix expansion.

    Each range of each layer is expanded with ``convert_range`` and the
    resulting prefixes are stored on the range under ``"prefixes"``.  Open
    bounds (``None``) are replaced in place: ``min`` by 0 and ``max`` by
    ``2 ** width`` (which ``convert_range`` clamps to the field maximum).

    NOTE(review): ``convert_range`` includes both bounds; if the incoming
    ranges are meant as ``(min, max]`` the lower bound is counted one value
    too low -- confirm against the producer of the tree file.

    Args:
        tree: compressed tree with ``"layers"`` and ``"classes"`` entries.
            The range dicts inside ``tree["layers"]`` are modified.
        widths: optional mapping of layer name -> field width in bits;
            defaults to the module-level ``field_width``.

    Returns:
        ``(rmt, tcam_bits, ram_bits)``.
    """
    widths = field_width if widths is None else widths
    # Read the inputs from the argument instead of relying on module globals.
    layers = tree["layers"]
    num_classes = len(tree["classes"])

    rmt = []
    tcam_bits = 0
    ram_bits = 0

    for step, (layer, ranges) in enumerate(layers.items()):
        prefix_width = widths[layer]
        num_prefixes = 0
        # for each range in the layer, convert the range to prefixes using
        # naive range expansion
        for r in ranges:
            # Independent checks: a range may be open on both sides.
            if r["min"] is None:
                r["min"] = 0
            if r["max"] is None:
                r["max"] = 2 ** prefix_width
            prefixes = convert_range(r["min"], r["max"], prefix_width)
            r["prefixes"] = prefixes
            num_prefixes += len(prefixes)
        tcam_bits += num_prefixes * prefix_width

        tcam = {
            "id": f"{layer}_range",
            "step": step,
            "match": "ternary",
            "entries": num_prefixes,
            "key_size": prefix_width,
            "ranges": ranges,
        }

        num_ranges = len(ranges)
        # assume no pointer reuse for metadata storage
        ram = {
            "id": f"{layer}_meta",
            "step": step,
            "match": "exact",
            "method": "index",
            # log2 is undefined for an empty layer; 0 or 1 ranges need no key bits
            "key_size": math.ceil(math.log2(num_ranges)) if num_ranges > 1 else 0,
            "data_size": num_classes,
        }
        ram_bits += num_ranges * num_classes

        rmt.append(tcam)
        rmt.append(ram)

    return rmt, tcam_bits, ram_bits
|  |  | ||||||
|  |  | ||||||
def priority_aware(tree, widths=None):
    """Build an RMT description using priority-aware prefix expansion.

    For every range two expansions are computed with ``convert_range``: the
    regular one (``min..max``) and one that starts at 0 (``0..max``).  The
    shorter list is kept (ties go to the regular one) and recorded on the
    range under ``"prefixes"``, with ``"prefix_type"`` set to ``"exact"`` or
    ``"zero"``.  Ranges are assumed to be disjoint and already in order.

    Open bounds (``None``) are replaced in place: ``min`` by 0 and ``max`` by
    ``2 ** width`` (which ``convert_range`` clamps to the field maximum).

    Args:
        tree: compressed tree with ``"layers"`` and ``"classes"`` entries.
            The range dicts inside ``tree["layers"]`` are modified.
        widths: optional mapping of layer name -> field width in bits;
            defaults to the module-level ``field_width``.

    Returns:
        ``(rmt, tcam_bits, ram_bits)``.
    """
    widths = field_width if widths is None else widths
    # Read the inputs from the argument instead of relying on module globals.
    layers = tree["layers"]
    num_classes = len(tree["classes"])

    rmt = []
    tcam_bits = 0
    ram_bits = 0

    for step, (layer, ranges) in enumerate(layers.items()):
        prefix_width = widths[layer]
        num_prefixes = 0
        for r in ranges:
            # Independent checks: a range may be open on both sides.
            if r["min"] is None:
                r["min"] = 0
            if r["max"] is None:
                r["max"] = 2 ** prefix_width
            regular_prefixes = convert_range(r["min"], r["max"], prefix_width)
            zero_start_prefixes = convert_range(0, r["max"], prefix_width)

            if len(regular_prefixes) <= len(zero_start_prefixes):
                pfx_type = "exact"
                prefixes = regular_prefixes
            else:
                pfx_type = "zero"
                prefixes = zero_start_prefixes

            r["prefixes"] = prefixes
            r["prefix_type"] = pfx_type
            num_prefixes += len(prefixes)
        tcam_bits += num_prefixes * prefix_width

        tcam = {
            "id": f"{layer}_range",
            "step": step,
            "match": "ternary",
            "entries": num_prefixes,
            "key_size": prefix_width,
            "ranges": ranges,
        }

        num_ranges = len(ranges)
        # assume no pointer reuse for metadata storage
        ram = {
            "id": f"{layer}_meta",
            "step": step,
            "match": "exact",
            "method": "index",
            # log2 is undefined for an empty layer; 0 or 1 ranges need no key bits
            "key_size": math.ceil(math.log2(num_ranges)) if num_ranges > 1 else 0,
            "data_size": num_classes,
        }
        ram_bits += num_ranges * num_classes

        rmt.append(tcam)
        rmt.append(ram)

    return rmt, tcam_bits, ram_bits
|  |  | ||||||
|  | # --------------------------------------------------------------------------- | ||||||
|  | # Main entry point | ||||||
|  | # --------------------------------------------------------------------------- | ||||||
|  |  | ||||||
def parse_args(argv=None) -> argparse.Namespace:
    """Parse command-line options.

    Args:
        argv: optional list of argument strings; ``None`` (the default) makes
            argparse read ``sys.argv[1:]``.

    Returns:
        Namespace with ``mode`` ("worst", "naive" or "priority"), ``input``
        and ``output`` (``None`` when not given).
    """
    parser = argparse.ArgumentParser(description="Evaluate RMT memory usage for different range‑to‑prefix strategies.")
    parser.add_argument("--mode", choices=["worst", "naive", "priority"], default="worst", help="Strategy to use")
    parser.add_argument("--input", default="compressed_tree.json", help="Input tree JSON file")
    # Help text lists the defaults actually applied by main().
    parser.add_argument(
        "--output",
        default=None,
        help="Output RMT JSON file (defaults to worst_case_rmt.json, naive_rmt.json or priority_aware.json, depending on --mode)",
    )
    return parser.parse_args(argv)
|  |  | ||||||
|  |  | ||||||
def main() -> None:
    """Load the input tree, run the selected strategy and write the RMT JSON.

    Exits with a message when the input file does not exist.  Prints the
    output file name and the TCAM / RAM bit totals.
    """
    # Keep the original variable names so the functions stay unchanged
    global layers, classes

    args = parse_args()

    try:
        with open(args.input) as f:
            tree = json.load(f)
    except FileNotFoundError:
        sys.exit(f"Input file '{args.input}' not found.")

    layers = tree["layers"]
    classes = tree["classes"]

    # mode -> (builder, default output file); anything else is "priority"
    strategies = {
        "worst": (worst_case_rmt, "worst_case_rmt.json"),
        "naive": (naive_rmt, "naive_rmt.json"),
    }
    builder, default_out = strategies.get(args.mode, (priority_aware, "priority_aware.json"))
    rmt, tcam_bits, ram_bits = builder(tree)

    out_file = args.output or default_out

    with open(out_file, "w") as f:
        json.dump(rmt, f, indent=4)

    #! command python3 ideal-rmt-simulator/sim.py {out_file}
    for message in (
        f"Output written to {out_file}",
        f"TCAM bits: {tcam_bits}",
        f"RAM bits:  {ram_bits}",
    ):
        print(message)


if __name__ == "__main__":
    main()
							
								
								
									
										44
									
								
								sanity_check/csvdiff.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										44
									
								
								sanity_check/csvdiff.py
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,44 @@ | |||||||
|  | #!/usr/bin/env python3 | ||||||
|  | """ | ||||||
|  | csvdiff.py file1.csv file2.csv | ||||||
|  | Streams both files; prints the first differing line or | ||||||
|  | ‘No differences found’. Uses O(1) memory. | ||||||
|  | """ | ||||||
|  |  | ||||||
|  | import sys | ||||||
|  | from itertools import zip_longest | ||||||
|  | from pathlib import Path | ||||||
|  |  | ||||||
def open_checked(p: str):
    """Echo *p*, open it for reading and return ``(file_object, Path)``.

    The file is opened with ``newline=""`` so line endings are not translated.
    Exits with an error message when the file does not exist.
    """
    print(p)
    target = Path(p)
    try:
        handle = target.open("r", newline="")
    except FileNotFoundError:
        sys.exit(f"Error: {target} not found")
    return handle, target
|  |  | ||||||
def human(n: int) -> str:
    """Format *n* with comma thousands separators, e.g. 1234567 -> "1,234,567"."""
    return format(n, ",")
|  |  | ||||||
def main(a_path: str, b_path: str) -> None:
    """Compare two files line by line and report the first difference.

    Both files are streamed in lock-step.  At the first mismatch the 1-based
    line number is printed, followed by either which file ended early or the
    two differing lines.  When no mismatch is found, 'No differences found'
    is printed.
    """
    fa, a = open_checked(a_path)
    fb, b = open_checked(b_path)

    with fa, fb:
        line_no = 0
        for ra, rb in zip_longest(fa, fb):
            line_no += 1
            if ra == rb:
                continue

            print(f"Files differ at line {human(line_no)}")
            # zip_longest pads the shorter file with None
            if ra is None:
                print(f"{a} ended early")
            elif rb is None:
                print(f"{b} ended early")
            else:
                print(f"{a}: {ra.rstrip()}")
                print(f"{b}: {rb.rstrip()}")
            return
    print("No differences found")


if __name__ == "__main__":
    if len(sys.argv) != 3:
        sys.exit("Usage: csvdiff.py file1.csv file2.csv")
    main(sys.argv[1], sys.argv[2])
							
								
								
									
										600
									
								
								sanity_check/data_visualization.ipynb
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										600
									
								
								sanity_check/data_visualization.ipynb
									
									
									
									
									
										Normal file
									
								
							
										
											
												File diff suppressed because one or more lines are too long
											
										
									
								
							
							
								
								
									
										206
									
								
								sanity_check/diversity_metrics.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										206
									
								
								sanity_check/diversity_metrics.py
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,206 @@ | |||||||
|  | #!/usr/bin/env python3 | ||||||
|  | """diversity_metrics.py (fast version) | ||||||
|  |  | ||||||
|  | Estimate how much diversity each CSV adds without building a giant in‑memory | ||||||
|  | DataFrame.  Designed for IoT packet logs with millions of rows. | ||||||
|  |  | ||||||
|  | Quick summary printed as a GitHub‑style table (requires *tabulate*; falls back | ||||||
|  | to pandas plain text). | ||||||
|  |  | ||||||
|  | Usage | ||||||
|  | ----- | ||||||
|  | python diversity_metrics.py path/to/processed_dir [-r] [--sample 50000] | ||||||
|  |  | ||||||
|  | Metrics | ||||||
|  | ------- | ||||||
|  | ΔEntropy  : change in Shannon entropy of *classification* counts | ||||||
|  | ΔGini     : change in Gini impurity of the same counts | ||||||
|  | χ² p      : Pearson χ² p‑value old vs new classification counts | ||||||
|  | Jaccard   : similarity of unique (src,dst) pairs (0 → new pairs, 1 → no new) | ||||||
|  | KS src p  : Kolmogorov–Smirnov p‑value, source‑port dist (uses sampling) | ||||||
|  | KS dst p  : Kolmogorov–Smirnov p‑value, dest‑port  dist (uses sampling) | ||||||
|  |  | ||||||
|  | Speed tricks | ||||||
|  | ------------ | ||||||
|  | * No growing DataFrame; we keep Counters / sets / lists. | ||||||
|  | * Ports for KS are *sampled* (default 50 k) to bound cost. | ||||||
|  | * (src,dst) pairs are hashed to a 32‑bit int to reduce set overhead. | ||||||
|  | * pandas reads via **pyarrow** engine when available. | ||||||
|  | """ | ||||||
|  |  | ||||||
|  | import argparse | ||||||
|  | from pathlib import Path | ||||||
|  | from collections import Counter | ||||||
|  | from typing import List, Set | ||||||
|  |  | ||||||
|  | import numpy as np | ||||||
|  | import pandas as pd | ||||||
|  | from scipy.stats import chi2_contingency, ks_2samp, entropy | ||||||
|  |  | ||||||
|  | try: | ||||||
|  |     from tabulate import tabulate | ||||||
|  |     _USE_TABULATE = True | ||||||
|  | except ImportError: | ||||||
|  |     _USE_TABULATE = False | ||||||
|  |  | ||||||
|  | # ----------------------------------------------------------------------------- | ||||||
|  | # Helper metrics | ||||||
|  | # ----------------------------------------------------------------------------- | ||||||
|  |  | ||||||
def shannon(counts: Counter) -> float:
    """Return the Shannon entropy, in bits, of the distribution in *counts*.

    An empty (or all-zero) counter yields 0.0.
    """
    total = sum(counts.values())
    if not total:
        return 0.0
    probabilities = np.array(list(counts.values()), dtype=float) / total
    return entropy(probabilities, base=2)
|  |  | ||||||
|  |  | ||||||
def gini(counts: Counter) -> float:
    """Return the Gini impurity (1 - sum of squared shares) of *counts*.

    An empty (or all-zero) counter yields 0.0.
    """
    total = sum(counts.values())
    if not total:
        return 0.0
    shares = [n / total for n in counts.values()]
    return 1.0 - sum(share ** 2 for share in shares)
|  |  | ||||||
|  |  | ||||||
def jaccard(a: Set[int], b: Set[int]) -> float:
    """Return the Jaccard similarity ``|a ∩ b| / |a ∪ b|`` of two sets.

    Two empty sets are considered identical and give ``1.0``.
    """
    union = a | b
    if not union:
        # Both inputs are empty; avoid 0/0.
        return 1.0
    return len(a & b) / len(union)
|  |  | ||||||
|  | # ----------------------------------------------------------------------------- | ||||||
|  | # Core analysis | ||||||
|  | # ----------------------------------------------------------------------------- | ||||||
|  |  | ||||||
def _pick_csv_engine() -> str:
    """Return ``"pyarrow"`` if pandas >= 2 and pyarrow is importable, else ``"c"``."""
    import importlib.util  # local import so the block stays self-contained

    try:
        # Compare the major version numerically; comparing version strings
        # lexicographically would mis-order e.g. "10.0" and "2".
        major = int(pd.__version__.split(".", 1)[0])
    except ValueError:
        return "c"
    if major >= 2 and importlib.util.find_spec("pyarrow") is not None:
        return "pyarrow"
    return "c"


def _subsample(values: np.ndarray, limit: int) -> np.ndarray:
    """Return *values* as-is when it has at most *limit* items, else a random sample."""
    if values.size > limit:
        return np.random.choice(values, limit, replace=False)
    return values


def _fmt_p(p: float) -> str:
    """Format a p-value with 3 significant digits, or ``"NA"`` when it is NaN."""
    return f"{p:.3g}" if not np.isnan(p) else "NA"


def analyse(csv_files: List[Path], sample_size: int):
    """Return one dict of diversity metrics per CSV, in the order given.

    Files are processed cumulatively: each file is compared against the
    union of all files before it, then merged into that cumulative state.

    Args:
        csv_files: CSV paths; each must contain the columns ``protocl``,
            ``src``, ``dst`` and ``classfication`` (spelled as in the data).
        sample_size: maximum number of port values drawn from each side for
            the KS tests.

    Returns:
        A list of dicts with keys ``File``, ``Rows``, ``ΔEntropy``, ``ΔGini``,
        ``χ² p``, ``Jaccard``, ``KS src p`` and ``KS dst p``.  p-values are
        pre-formatted strings, ``"NA"`` when a test could not be run (first
        file, or an empty side).
    """
    engine = _pick_csv_engine()  # loop-invariant

    # cumulative state (no big DataFrame!)
    class_counter: Counter = Counter()
    pair_hashes: Set[int] = set()
    # Ports are kept as compact uint16 arrays rather than Python int lists.
    seen_src = np.empty(0, dtype=np.uint16)
    seen_dst = np.empty(0, dtype=np.uint16)

    rows = []

    for csv_path in csv_files:
        df = pd.read_csv(
            csv_path,
            engine=engine,
            usecols=["protocl", "src", "dst", "classfication"],
            dtype={
                "protocl": "uint16",
                "protocol": "uint16",
                "src": "uint16",
                "dst": "uint16",
            },
        )
        # normalise column names
        df.rename(columns={"protocl": "protocol", "classfication": "classification"}, inplace=True)

        # snapshot previous state
        prev_class = class_counter.copy()
        prev_pairs = pair_hashes.copy()
        prev_src, prev_dst = seen_src, seen_dst

        # --- update cumulative structures ------------------------------------
        # Computed once and reused for the χ² table below.
        new_counts = df["classification"].value_counts().to_dict()
        class_counter.update(new_counts)

        new_src = df["src"].to_numpy(dtype=np.uint16)
        new_dst = df["dst"].to_numpy(dtype=np.uint16)

        # Pack (src, dst) into one 32-bit int: src in the high 16 bits, dst in
        # the low 16 bits.  The ids must actually be merged into the cumulative
        # set, otherwise the Jaccard column is always 1.0.
        pair_ids = (new_src.astype(np.uint32) << np.uint32(16)) | new_dst.astype(np.uint32)
        pair_hashes.update(np.unique(pair_ids).tolist())

        seen_src = np.concatenate((seen_src, new_src))
        seen_dst = np.concatenate((seen_dst, new_dst))

        # --- metrics ----------------------------------------------------------
        # χ² on old vs new classification counts.  Skipped when either side is
        # empty: an all-zero row makes chi2_contingency raise.
        chi_p = np.nan
        if prev_class and new_counts:
            all_classes = list(set(prev_class) | set(new_counts))
            old = [prev_class.get(c, 0) for c in all_classes]
            new = [new_counts.get(c, 0) for c in all_classes]
            _, chi_p, _, _ = chi2_contingency([old, new])

        # entropy & gini deltas
        delta_entropy = shannon(class_counter) - shannon(prev_class)
        delta_gini = gini(class_counter) - gini(prev_class)

        # Jaccard between the pair set before and after adding this file
        jc = jaccard(prev_pairs, pair_hashes)

        # KS tests on sampled ports; ks_2samp needs both samples non-empty.
        ks_src_p = ks_dst_p = np.nan
        if prev_src.size and new_src.size:
            prev_src_sample = _subsample(prev_src, sample_size)
            new_src_sample = _subsample(new_src, sample_size)
            prev_dst_sample = _subsample(prev_dst, sample_size)
            new_dst_sample = _subsample(new_dst, sample_size)

            ks_src_p = ks_2samp(prev_src_sample, new_src_sample).pvalue
            ks_dst_p = ks_2samp(prev_dst_sample, new_dst_sample).pvalue

        rows.append(
            {
                "File": csv_path.name,
                "Rows": len(df),
                "ΔEntropy": round(delta_entropy, 4),
                "ΔGini": round(delta_gini, 4),
                "χ² p": _fmt_p(chi_p),
                "Jaccard": round(jc, 3),
                "KS src p": _fmt_p(ks_src_p),
                "KS dst p": _fmt_p(ks_dst_p),
            }
        )
    return rows
|  |  | ||||||
|  | # ----------------------------------------------------------------------------- | ||||||
|  | # CLI | ||||||
|  | # ----------------------------------------------------------------------------- | ||||||
|  |  | ||||||
def main():
    """Command-line entry point: analyse every CSV in a directory and print a table.

    Collects ``*.csv`` files under ``csv_dir`` (recursively with ``-r``), in
    sorted order, runs :func:`analyse` on them and prints the per-file metrics
    followed by a short legend.  Prints a notice and returns when no CSV file
    is found.
    """
    ap = argparse.ArgumentParser(description="Evaluate diversity contribution of each CSV (fast version).")
    ap.add_argument("csv_dir", help="Directory containing CSV files")
    ap.add_argument("-r", "--recursive", action="store_true", help="Recursively search csv_dir")
    ap.add_argument("--sample", type=int, default=50_000, help="Sample size for KS tests (default 50k)")
    args = ap.parse_args()

    # Reject unusable sample sizes up front instead of failing with a
    # numpy/scipy traceback in the middle of the analysis.
    if args.sample <= 0:
        ap.error("--sample must be a positive integer")

    root = Path(args.csv_dir)
    pattern = "**/*.csv" if args.recursive else "*.csv"
    csv_files = sorted(root.glob(pattern))
    if not csv_files:
        print("No CSV files found.")
        return

    table_rows = analyse(csv_files, args.sample)

    # Prefer tabulate's GitHub-style table; fall back to pandas when missing.
    if _USE_TABULATE:
        print(tabulate(table_rows, headers="keys", tablefmt="github", floatfmt=".4f"))
    else:
        print(pd.DataFrame(table_rows).to_string(index=False))

    print(
        "\nLegend:\n  • p-values (χ², KS) < 0.05 → new file significantly shifts distribution (GOOD)"
        "\n  • Positive ΔEntropy or ΔGini → richer mix; near 0 → little new info"
        "\n  • Jaccard close to 0 → many unseen (src,dst) pairs; close to 1 → redundant."
    )


if __name__ == "__main__":
    main()
							
								
								
									
										14
									
								
								setup.sh
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										14
									
								
								setup.sh
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,14 @@ | |||||||
#!/usr/bin/env bash
# Prepare the data directory tree next to this script:
#   data/
#     tar/
#     pcap/
#     processed/
#     combined/

set -euo pipefail

# Directory containing this script, independent of the caller's cwd.
root="$(cd -- "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
data_dir="$root/data"

# -p: create missing parents and succeed if the directories already exist.
mkdir -p "$data_dir/tar" "$data_dir/pcap" "$data_dir/processed" "$data_dir/combined"

echo "Directory structure ready under $root/data/"
		Reference in New Issue
	
	Block a user