mirror of
				https://github.com/ltcptgeneral/IdealRMT-DecisionTrees.git
				synced 2025-10-24 21:19:20 +00:00 
			
		
		
		
	Compare commits
	
		
			21 Commits
		
	
	
		
			23867747cd
			...
			jai_runs
		
	
	| Author | SHA1 | Date | |
|---|---|---|---|
|  | c8a0b18abf | ||
| 2ad40946d1 | |||
| 50075b1acc | |||
|  | 1585399c7d | ||
| 8301998da3 | |||
| 3b2d6b3186 | |||
|  | 24fc2ed6f7 | ||
|  | fda251f051 | ||
| 541538fcfe | |||
|  | afc882a569 | ||
| 6de3807fe2 | |||
|  | fc16d3c586 | ||
| 7bee40ecf9 | |||
|  | e811171a73 | ||
| 61a451b82d | |||
| c73de36c70 | |||
| fadeab8a99 | |||
| c208037ae9 | |||
| ae3128f6e8 | |||
| 25e5a86a43 | |||
| d3fe6efd47 | 
							
								
								
									
										2
									
								
								.gitattributes
									
									
									
									
										vendored
									
									
										Normal file
									
								
							
							
						
						
									
										2
									
								
								.gitattributes
									
									
									
									
										vendored
									
									
										Normal file
									
								
							| @@ -0,0 +1,2 @@ | ||||
| # force LF for any shell script | ||||
| *.sh text eol=lf | ||||
							
								
								
									
										6
									
								
								.gitignore
									
									
									
									
										vendored
									
									
								
							
							
						
						
									
										6
									
								
								.gitignore
									
									
									
									
										vendored
									
									
								
							| @@ -1,4 +1,6 @@ | ||||
| data.* | ||||
| __pycache__ | ||||
| tree.json | ||||
| compressed_tree.json | ||||
| *.json | ||||
| data/* | ||||
| .DS_Store | ||||
| .ipynb_checkpoints/ | ||||
|   | ||||
							
								
								
									
										152
									
								
								CompressedTreeParser.ipynb
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										152
									
								
								CompressedTreeParser.ipynb
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,152 @@ | ||||
| { | ||||
|  "cells": [ | ||||
|   { | ||||
|    "cell_type": "code", | ||||
|    "execution_count": 138, | ||||
|    "id": "938dec51", | ||||
|    "metadata": {}, | ||||
|    "outputs": [], | ||||
|    "source": [ | ||||
|     "import numpy as np\n", | ||||
|     "import pandas as pd\n", | ||||
|     "import argparse\n", | ||||
|     "from sklearn.tree import DecisionTreeClassifier, plot_tree, _tree\n", | ||||
|     "from sklearn.metrics import accuracy_score\n", | ||||
|     "from sklearn.tree import export_graphviz\n", | ||||
|     "import pydotplus\n", | ||||
|     "from matplotlib import pyplot as plt\n", | ||||
|     "from labels import mac_to_label\n", | ||||
|     "import json\n", | ||||
|     "import math" | ||||
|    ] | ||||
|   }, | ||||
|   { | ||||
|    "cell_type": "code", | ||||
|    "execution_count": 139, | ||||
|    "id": "442624c7", | ||||
|    "metadata": {}, | ||||
|    "outputs": [], | ||||
|    "source": [ | ||||
|     "Set1 = pd.read_csv('data/combined/data.csv').values.tolist()\n", | ||||
|     "X = [i[0:3] for i in Set1]\n", | ||||
|     "Y =[i[3] for i in Set1]" | ||||
|    ] | ||||
|   }, | ||||
|   { | ||||
|    "cell_type": "code", | ||||
|    "execution_count": 142, | ||||
|    "id": "12ad454d", | ||||
|    "metadata": {}, | ||||
|    "outputs": [ | ||||
|     { | ||||
|      "name": "stdout", | ||||
|      "output_type": "stream", | ||||
|      "text": [ | ||||
|       "{'0': 20, '1': 20, '2': 9, '3': 20, '4': 0, '5': 13, '6': 20, '7': 0, '8': 12, '9': 4, '10': 20, '11': 4, '12': 1, '13': 16, '14': 20, '15': 2, '16': 20, '17': 0, '18': 20, '19': 20, '20': 20, '21': 20, '22': 20, '23': 1, '24': 2, '25': 20, '26': 13, '27': 11, '28': 20, '29': 20}\n" | ||||
|      ] | ||||
|     } | ||||
|    ], | ||||
|    "source": [ | ||||
|     "predict_Yt = []\n", | ||||
|     "index=0\n", | ||||
|     "\n", | ||||
|     "with open('compressed_tree.json', 'r') as file:\n", | ||||
|     "    data = json.load(file)\n", | ||||
|     "    classes = data[\"classes\"]\n", | ||||
|     "    for x in X:\n", | ||||
|     "        counter = 0\n", | ||||
|     "        class_set = []\n", | ||||
|     "        paths_set = []\n", | ||||
|     "        features = [\"protocol\", \"src\", \"dst\"]\n", | ||||
|     "        for feature in features:\n", | ||||
|     "            if feature in data[\"layers\"]:\n", | ||||
|     "                for node in data['layers'][feature]:\n", | ||||
|     "                    if node['min'] is None:\n", | ||||
|     "                        if x[counter] <= node['max']:\n", | ||||
|     "                            class_set.append(node['classes'])\n", | ||||
|     "                            paths_set.append(node[\"paths\"])\n", | ||||
|     "                            break #is this an issue?\n", | ||||
|     "                        else:\n", | ||||
|     "                            continue\n", | ||||
|     "                    elif node['max'] is None:\n", | ||||
|     "                        if node['min'] < x[counter]:\n", | ||||
|     "                            class_set.append(node['classes'])\n", | ||||
|     "                            paths_set.append(node[\"paths\"])\n", | ||||
|     "                            break #is this an issue?\n", | ||||
|     "                        else:\n", | ||||
|     "                            continue\n", | ||||
|     "                    elif node['min'] < x[counter] and x[counter] <= node['max']:\n", | ||||
|     "                        class_set.append(node['classes'])\n", | ||||
|     "                        paths_set.append(node[\"paths\"])\n", | ||||
|     "                        break #is this an issue?\n", | ||||
|     "\n", | ||||
|     "            counter += 1\n", | ||||
|     "        result = set(class_set[0])\n", | ||||
|     "        paths = set(paths_set[0])\n", | ||||
|     "        for s in class_set[1:]:\n", | ||||
|     "            result.intersection_update(s)\n", | ||||
|     "        for s in paths_set[1:]:\n", | ||||
|     "            paths.intersection_update(s)\n", | ||||
|     "\n", | ||||
|     "        #predict_Yt.append(list(result))\n", | ||||
|     "        #print(result)\n", | ||||
|     "        if len(paths) != 1:\n", | ||||
|     "            print(paths)\n", | ||||
|     "            print(x)\n", | ||||
|     "            print(result)\n", | ||||
|     "        assert len(paths) == 1\n", | ||||
|     "        path = list(paths)[0]\n", | ||||
|     "        pred = data[\"path_to_class\"][str(path)]\n", | ||||
|     "        pred_class = classes[pred]\n", | ||||
|     "        predict_Yt.append(pred_class)\n", | ||||
|     "        \n", | ||||
|     "        index += 1" | ||||
|    ] | ||||
|   }, | ||||
|   { | ||||
|    "cell_type": "code", | ||||
|    "execution_count": 143, | ||||
|    "id": "8b4c56b6", | ||||
|    "metadata": {}, | ||||
|    "outputs": [ | ||||
|     { | ||||
|      "name": "stdout", | ||||
|      "output_type": "stream", | ||||
|      "text": [ | ||||
|       "0.8410252791654538\n" | ||||
|      ] | ||||
|     } | ||||
|    ], | ||||
|    "source": [ | ||||
|     "correct = 0\n", | ||||
|     "for i in range(len(Y)):\n", | ||||
|     "    prediction = predict_Yt[i]\n", | ||||
|     "    if prediction != None and Y[i] == prediction:\n", | ||||
|     "        correct += 1\n", | ||||
|     "\n", | ||||
|     "print(correct / len(Y))" | ||||
|    ] | ||||
|   } | ||||
|  ], | ||||
|  "metadata": { | ||||
|   "kernelspec": { | ||||
|    "display_name": "Python 3", | ||||
|    "language": "python", | ||||
|    "name": "python3" | ||||
|   }, | ||||
|   "language_info": { | ||||
|    "codemirror_mode": { | ||||
|     "name": "ipython", | ||||
|     "version": 3 | ||||
|    }, | ||||
|    "file_extension": ".py", | ||||
|    "mimetype": "text/x-python", | ||||
|    "name": "python", | ||||
|    "nbconvert_exporter": "python", | ||||
|    "pygments_lexer": "ipython3", | ||||
|    "version": "3.12.7" | ||||
|   } | ||||
|  }, | ||||
|  "nbformat": 4, | ||||
|  "nbformat_minor": 5 | ||||
| } | ||||
										
											
												File diff suppressed because one or more lines are too long
											
										
									
								
							| @@ -89,7 +89,7 @@ | ||||
|  ], | ||||
|  "metadata": { | ||||
|   "kernelspec": { | ||||
|    "display_name": "switch", | ||||
|    "display_name": "Python 3 (ipykernel)", | ||||
|    "language": "python", | ||||
|    "name": "python3" | ||||
|   }, | ||||
| @@ -103,7 +103,7 @@ | ||||
|    "name": "python", | ||||
|    "nbconvert_exporter": "python", | ||||
|    "pygments_lexer": "ipython3", | ||||
|    "version": "3.12.7" | ||||
|    "version": "3.12.9" | ||||
|   } | ||||
|  }, | ||||
|  "nbformat": 4, | ||||
|   | ||||
							
								
								
									
										10
									
								
								README.md
									
									
									
									
									
								
							
							
						
						
									
										10
									
								
								README.md
									
									
									
									
									
								
							| @@ -2,17 +2,21 @@ | ||||
|  | ||||
| Run `pip install -r requirements.txt` | ||||
|  | ||||
| Run `setup.sh` | ||||
|  | ||||
| # Tree Generation | ||||
|  | ||||
| ## Download Dataset | ||||
|  | ||||
| Download the *September 22 2016* dataset from: https://iotanalytics.unsw.edu.au/iottraces.html#bib18tmc | ||||
| Download the *September 22 2016* dataset (or others) from: https://iotanalytics.unsw.edu.au/iottraces.html#bib18tmc | ||||
|  | ||||
| Rename the file as data.pcap | ||||
| Place these into the `data/tar` folder. | ||||
|  | ||||
| Run `extract_tars.sh` which will extract and place the `.pcap` files at the corresponding location inside `data/pcap`. | ||||
|  | ||||
| ## Preprocessing Dataset | ||||
|  | ||||
| Run `ExtractDataset.ipynb`, this will take a few minutes | ||||
| Run `extract_all_datasets.py` which will extract the data from each file in `data/pcap` and turn it into the corresponding `.csv` file inside `data/processed`. This will take a few minutes per file. Combine the data under `data/csv` using `combine_csv.py`. This will overwrite `data/combined/data.csv` which you can use for the decision tree. | ||||
|  | ||||
| ## Training | ||||
|  | ||||
|   | ||||
| @@ -2,7 +2,7 @@ | ||||
|  "cells": [ | ||||
|   { | ||||
|    "cell_type": "code", | ||||
|    "execution_count": 1, | ||||
|    "execution_count": 73, | ||||
|    "id": "ec310f34", | ||||
|    "metadata": {}, | ||||
|    "outputs": [], | ||||
| @@ -14,7 +14,7 @@ | ||||
|   }, | ||||
|   { | ||||
|    "cell_type": "code", | ||||
|    "execution_count": 2, | ||||
|    "execution_count": 74, | ||||
|    "id": "5b54797e", | ||||
|    "metadata": {}, | ||||
|    "outputs": [], | ||||
| @@ -28,7 +28,7 @@ | ||||
|   }, | ||||
|   { | ||||
|    "cell_type": "code", | ||||
|    "execution_count": 3, | ||||
|    "execution_count": 75, | ||||
|    "id": "a38fdb8a", | ||||
|    "metadata": {}, | ||||
|    "outputs": [], | ||||
| @@ -38,14 +38,14 @@ | ||||
|     "i = 0\n", | ||||
|     "\n", | ||||
|     "path_ids = set()\n", | ||||
|     "path_classes = set()\n", | ||||
|     "path_classes = tree[\"classes\"]\n", | ||||
|     "\n", | ||||
|     "# for each path in the tree\n", | ||||
|     "for path in paths:\n", | ||||
|     "\t# assign a path id \n", | ||||
|     "\tpath[\"id\"] = i\n", | ||||
|     "\tpath_ids.add(i)\n", | ||||
|     "\tpath_classes.add(path[\"classification\"])\n", | ||||
|     "\t#path_classes.add(path[\"classification\"])\n", | ||||
|     "\ti += 1\t\n", | ||||
|     "\t# for each condition\n", | ||||
|     "\tconditions = path[\"conditions\"]\n", | ||||
| @@ -60,7 +60,7 @@ | ||||
|   }, | ||||
|   { | ||||
|    "cell_type": "code", | ||||
|    "execution_count": 4, | ||||
|    "execution_count": 76, | ||||
|    "id": "2fd4f738", | ||||
|    "metadata": {}, | ||||
|    "outputs": [], | ||||
| @@ -83,7 +83,7 @@ | ||||
|   }, | ||||
|   { | ||||
|    "cell_type": "code", | ||||
|    "execution_count": 5, | ||||
|    "execution_count": 77, | ||||
|    "id": "98cde024", | ||||
|    "metadata": {}, | ||||
|    "outputs": [], | ||||
| @@ -109,13 +109,13 @@ | ||||
|     "\t\tvalue = condition[\"value\"]\n", | ||||
|     "\n", | ||||
|     "\t\t# move the min/max for the corresponding feature in compressed\n", | ||||
|     "\t\tif operation == \"<=\" and compressed[feature][\"min\"] is None:\n", | ||||
|     "\t\tif operation == \"<=\" and compressed[feature][\"max\"] is None:\n", | ||||
|     "\t\t\tcompressed[feature][\"max\"] = value\n", | ||||
|     "\t\telif operation == \">\" and compressed[feature][\"max\"] is None:\n", | ||||
|     "\t\telif operation == \">\" and compressed[feature][\"min\"] is None:\n", | ||||
|     "\t\t\tcompressed[feature][\"min\"] = value\n", | ||||
|     "\t\telif operation == \"<=\" and value < compressed[feature][\"min\"]:\n", | ||||
|     "\t\telif operation == \"<=\" and value < compressed[feature][\"max\"]:\n", | ||||
|     "\t\t\tcompressed[feature][\"max\"] = value\n", | ||||
|     "\t\telif operation == \">\" and value > compressed[feature][\"max\"]:\n", | ||||
|     "\t\telif operation == \">\" and value > compressed[feature][\"min\"]:\n", | ||||
|     "\t\t\tcompressed[feature][\"min\"] = value\n", | ||||
|     "\n", | ||||
|     "\tpath[\"compressed\"] = compressed" | ||||
| @@ -123,7 +123,7 @@ | ||||
|   }, | ||||
|   { | ||||
|    "cell_type": "code", | ||||
|    "execution_count": 6, | ||||
|    "execution_count": 78, | ||||
|    "id": "b6fbadbf", | ||||
|    "metadata": {}, | ||||
|    "outputs": [], | ||||
| @@ -171,16 +171,19 @@ | ||||
|     "\t\t# for each bucket which encompasses the condition's range, add this path's id to the sets \n", | ||||
|     "\t\ti = 0\n", | ||||
|     "\t\tfor bp in breakpoints[feature_name]:\n", | ||||
|     "\t\t\tin_range = is_in_range(bp, lower, upper)\n", | ||||
|     "\t\t\tif in_range:\n", | ||||
|     "\t\t\tif is_in_range(bp, lower, upper):\n", | ||||
|     "\t\t\t\tbuckets_id[feature_name][i].add(ID)\n", | ||||
|     "\t\t\t\tbuckets_class[feature_name][i].add(Class)\n", | ||||
|     "\t\t\ti += 1" | ||||
|     "\t\t\ti += 1\n", | ||||
|     "\n", | ||||
|     "\t\tif is_in_range(bp+1, lower, upper):\n", | ||||
|     "\t\t\tbuckets_id[feature_name][i].add(ID)\n", | ||||
|     "\t\t\tbuckets_class[feature_name][i].add(Class)" | ||||
|    ] | ||||
|   }, | ||||
|   { | ||||
|    "cell_type": "code", | ||||
|    "execution_count": 7, | ||||
|    "execution_count": 79, | ||||
|    "id": "0a767971", | ||||
|    "metadata": {}, | ||||
|    "outputs": [], | ||||
| @@ -198,28 +201,34 @@ | ||||
|     "\tfor i in range(1, len(buckets_id[feature_name]) - 1):\n", | ||||
|     "\t\tlower = breakpoints[feature_name][i-1]\n", | ||||
|     "\t\tupper = breakpoints[feature_name][i]\n", | ||||
|     "\t\tmembers = buckets_id[feature_name][i]\n", | ||||
|     "\t\tpaths = buckets_id[feature_name][i]\n", | ||||
|     "\t\tclasses = buckets_class[feature_name][i]\n", | ||||
|     "\t\t#print(f\"{feature_name} = [{lower}, {upper}]: {buckets[feature_name][i]}\")\n", | ||||
|     "\t\tcompressed_layers[feature_name].append({\"min\": lower, \"max\": upper, \"paths\": paths, \"classes\": classes})\n", | ||||
|     "\tlower = breakpoints[feature_name][len(breakpoints[feature_name]) - 1]\n", | ||||
|     "\tupper = None\n", | ||||
|     "\tmembers = buckets_id[feature_name][len(buckets_id[feature_name]) - 1]\n", | ||||
|     "\tpaths = buckets_id[feature_name][len(buckets_id[feature_name]) - 1]\n", | ||||
|     "\tclasses = buckets_class[feature_name][len(buckets_class[feature_name]) - 1]\n", | ||||
|     "\t#print(f\"{feature_name} = [{lower}, {upper}]: {members}\")\n", | ||||
|     "\tcompressed_layers[feature_name].append({\"min\": lower, \"max\": upper, \"paths\": paths, \"classes\": classes})\n", | ||||
|     "\t#print(\"=\"*40)\n", | ||||
|     "\n", | ||||
|     "path_to_class = {}\n", | ||||
|     "for i in range(len(tree[\"paths\"])):\n", | ||||
|     "    path = tree[\"paths\"][i]\n", | ||||
|     "    path_to_class[path[\"id\"]] = path[\"classification\"]\n", | ||||
|     "\n", | ||||
|     "compressed_tree = {\n", | ||||
|     "\t\"paths\": path_ids,\n", | ||||
|     "\t\"classes\": path_classes,\n", | ||||
|     "\t\"layers\": compressed_layers,\n", | ||||
|     "    \"path_to_class\": path_to_class,\n", | ||||
|     "}" | ||||
|    ] | ||||
|   }, | ||||
|   { | ||||
|    "cell_type": "code", | ||||
|    "execution_count": 8, | ||||
|    "execution_count": 80, | ||||
|    "id": "561b0bc1", | ||||
|    "metadata": {}, | ||||
|    "outputs": [], | ||||
| @@ -238,7 +247,7 @@ | ||||
|  ], | ||||
|  "metadata": { | ||||
|   "kernelspec": { | ||||
|    "display_name": "switch", | ||||
|    "display_name": "Python 3 (ipykernel)", | ||||
|    "language": "python", | ||||
|    "name": "python3" | ||||
|   }, | ||||
| @@ -252,7 +261,7 @@ | ||||
|    "name": "python", | ||||
|    "nbconvert_exporter": "python", | ||||
|    "pygments_lexer": "ipython3", | ||||
|    "version": "3.12.7" | ||||
|    "version": "3.12.9" | ||||
|   } | ||||
|  }, | ||||
|  "nbformat": 4, | ||||
|   | ||||
							
								
								
									
										412
									
								
								TreeToRMT.ipynb
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										412
									
								
								TreeToRMT.ipynb
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,412 @@ | ||||
| { | ||||
|  "cells": [ | ||||
|   { | ||||
|    "cell_type": "code", | ||||
|    "execution_count": 1, | ||||
|    "id": "58fc6db9", | ||||
|    "metadata": {}, | ||||
|    "outputs": [], | ||||
|    "source": [ | ||||
|     "import json\n", | ||||
|     "import math" | ||||
|    ] | ||||
|   }, | ||||
|   { | ||||
|    "cell_type": "code", | ||||
|    "execution_count": 2, | ||||
|    "id": "e07be4b3", | ||||
|    "metadata": {}, | ||||
|    "outputs": [], | ||||
|    "source": [ | ||||
|     "f = open(\"compressed_tree.json\")\n", | ||||
|     "tree = json.loads(f.read())\n", | ||||
|     "layers = tree[\"layers\"]\n", | ||||
|     "classes = tree[\"classes\"]\n", | ||||
|     "f.close()" | ||||
|    ] | ||||
|   }, | ||||
|   { | ||||
|    "cell_type": "code", | ||||
|    "execution_count": 3, | ||||
|    "id": "1516ff91", | ||||
|    "metadata": {}, | ||||
|    "outputs": [], | ||||
|    "source": [ | ||||
|     "field_width = {\n", | ||||
|     "\t\"src\": 16,\n", | ||||
|     "\t\"dst\": 16,\n", | ||||
|     "\t\"protocl\": 8,\n", | ||||
|     "}" | ||||
|    ] | ||||
|   }, | ||||
|   { | ||||
|    "cell_type": "markdown", | ||||
|    "id": "f9193827", | ||||
|    "metadata": {}, | ||||
|    "source": [ | ||||
|     "# Worst Case RMT" | ||||
|    ] | ||||
|   }, | ||||
|   { | ||||
|    "cell_type": "code", | ||||
|    "execution_count": 4, | ||||
|    "id": "5e37cfc5", | ||||
|    "metadata": {}, | ||||
|    "outputs": [], | ||||
|    "source": [ | ||||
|     "def worst_case_rmt(tree):\n", | ||||
|     "\trmt = []\n", | ||||
|     "\tstep = 0\n", | ||||
|     "\n", | ||||
|     "\ttcam_bits = 0\n", | ||||
|     "\tram_bits = 0\n", | ||||
|     "\n", | ||||
|     "\tfor layer in layers:\n", | ||||
|     "\t\tnum_ranges = len(layers[layer])\n", | ||||
|     "\t\t# assume that each range requires the full 2*k prefixes when performing prefix expansion\n", | ||||
|     "\t\t# therefore there are 2*k * R prefixes in total for R ranges of width k\n", | ||||
|     "\t\tnum_prefixes = 2 * field_width[layer] * num_ranges\n", | ||||
|     "\t\tprefix_width = field_width[layer]\n", | ||||
|     "\n", | ||||
|     "\t\ttcam = {\n", | ||||
|     "\t\t\t\"id\": f\"{layer}_range\",\n", | ||||
|     "\t\t\t\"step\": step,\n", | ||||
|     "\t\t\t\"match\": \"ternary\",\n", | ||||
|     "\t\t\t\"entries\": num_prefixes,\n", | ||||
|     "\t\t\t\"key_size\": prefix_width\n", | ||||
|     "\t\t}\n", | ||||
|     "\t\ttcam_bits += num_prefixes * prefix_width\n", | ||||
|     "\n", | ||||
|     "\t\t# assume basic pointer reuse for metadata storage\n", | ||||
|     "\t\tram = {\n", | ||||
|     "\t\t\t\"id\": f\"{layer}_meta\",\n", | ||||
|     "\t\t\t\"step\": step,\n", | ||||
|     "\t\t\t\"match\": \"exact\",\n", | ||||
|     "\t\t\t\"method\": \"index\",\n", | ||||
|     "\t\t\t\"key_size\": math.ceil(math.log2(num_ranges)),\n", | ||||
|     "\t\t\t\"data_size\": len(classes)\n", | ||||
|     "\t\t}\n", | ||||
|     "\t\tram_bits += num_ranges * len(classes)\n", | ||||
|     "\n", | ||||
|     "\t\trmt.append(tcam)\n", | ||||
|     "\t\trmt.append(ram)\n", | ||||
|     "\n", | ||||
|     "\t\tstep += 1\n", | ||||
|     "\n", | ||||
|     "\treturn rmt, tcam_bits, ram_bits\n", | ||||
|     "\n", | ||||
|     "x, tcam_bits, ram_bits = worst_case_rmt(tree)\n", | ||||
|     "f = open(\"worst_case_rmt.json\", \"w+\")\n", | ||||
|     "f.write(json.dumps(x, indent=4))\n", | ||||
|     "f.close()" | ||||
|    ] | ||||
|   }, | ||||
|   { | ||||
|    "cell_type": "code", | ||||
|    "execution_count": 5, | ||||
|    "id": "0dc1d6d4", | ||||
|    "metadata": {}, | ||||
|    "outputs": [ | ||||
|     { | ||||
|      "name": "stdout", | ||||
|      "output_type": "stream", | ||||
|      "text": [ | ||||
|       "TCAM mapping: \n", | ||||
|       "[1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]\n", | ||||
|       "SRAM mapping: \n", | ||||
|       "[1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]\n", | ||||
|       "id mapping: \n", | ||||
|       "[['dst_range', 'dst_meta'], ['src_range', 'src_meta'], ['protocl_range', 'protocl_meta'], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], []]\n", | ||||
|       "TCAM bits: 13184\n", | ||||
|       "RAM bits:  504\n" | ||||
|      ] | ||||
|     } | ||||
|    ], | ||||
|    "source": [ | ||||
|     "! command python3 ideal-rmt-simulator/sim.py naive_rmt.json\n", | ||||
|     "print(f\"TCAM bits: {tcam_bits}\")\n", | ||||
|     "print(f\"RAM bits:  {ram_bits}\")" | ||||
|    ] | ||||
|   }, | ||||
|   { | ||||
|    "cell_type": "markdown", | ||||
|    "id": "2a628655", | ||||
|    "metadata": {}, | ||||
|    "source": [ | ||||
|     "# Naive Range Expansion " | ||||
|    ] | ||||
|   }, | ||||
|   { | ||||
|    "cell_type": "code", | ||||
|    "execution_count": 6, | ||||
|    "id": "fb9febe9", | ||||
|    "metadata": {}, | ||||
|    "outputs": [], | ||||
|    "source": [ | ||||
|     "# shamelessly stolen from: https://github.com/autolyticus/range-to-prefix/blob/master/rangetoprefix.C\n", | ||||
|     "\n", | ||||
|     "def int_to_bin(i, width):\n", | ||||
|     "\treturn bin(i)[2:].zfill(width)\n", | ||||
|     "\n", | ||||
|     "def increment_dc(pfx):\n", | ||||
|     "\tidx = pfx.find(\"*\")\n", | ||||
|     "\tif idx == -1:\n", | ||||
|     "\t\tidx = len(pfx)\n", | ||||
|     "\tidx = idx - 1\n", | ||||
|     "\t#print(pfx, pfx[:idx])\n", | ||||
|     "\treturn pfx[:idx] + \"*\" + pfx[idx+1:]\n", | ||||
|     "\t\n", | ||||
|     "def can_merge(pfx_a, pfx_b):\n", | ||||
|     "\tpfx_a = pfx_a.replace(\"*\", \"\")\n", | ||||
|     "\tpfx_b = pfx_b.replace(\"*\", \"\")\n", | ||||
|     "\treturn pfx_a[:-1] == pfx_b[:-1] and pfx_a[-1] != pfx_b[-1]\n", | ||||
|     "\n", | ||||
|     "def merge(pfx_a, prefixes):\n", | ||||
|     "\tpfx_a = increment_dc(pfx_a)\n", | ||||
|     "\tprefixes[-1] = pfx_a\n", | ||||
|     "\n", | ||||
|     "\tfor i in range(len(prefixes) - 2, -1, -1):\n", | ||||
|     "\t\tif can_merge(prefixes[i], prefixes[i+1]):\n", | ||||
|     "\t\t\tprefixes.pop()\n", | ||||
|     "\t\t\tpfx = increment_dc(prefixes[i])\n", | ||||
|     "\t\t\tprefixes[i] = pfx\n", | ||||
|     "\n", | ||||
|     "def convert_range(lower, upper, width):\n", | ||||
|     "\tprefixes = []\n", | ||||
|     "\tprefix = int_to_bin(lower, width)\n", | ||||
|     "\tprefixes.append(prefix)\n", | ||||
|     "\tnorm_upper = min(upper, 2**width-1)\n", | ||||
|     "\tfor i in range(lower+1, norm_upper+1):\n", | ||||
|     "\t\tprefix = int_to_bin(i, width)\n", | ||||
|     "\t\tif can_merge(prefix, prefixes[-1]):\n", | ||||
|     "\t\t\tmerge(prefix, prefixes)\n", | ||||
|     "\t\telse:\n", | ||||
|     "\t\t\tprefixes.append(prefix)\n", | ||||
|     "\treturn prefixes" | ||||
|    ] | ||||
|   }, | ||||
|   { | ||||
|    "cell_type": "code", | ||||
|    "execution_count": 7, | ||||
|    "id": "55167c28", | ||||
|    "metadata": {}, | ||||
|    "outputs": [], | ||||
|    "source": [ | ||||
|     "def naive_rmt(tree):\n", | ||||
|     "\trmt = []\n", | ||||
|     "\tstep = 0\n", | ||||
|     "\n", | ||||
|     "\ttcam_bits = 0\n", | ||||
|     "\tram_bits = 0\n", | ||||
|     "\n", | ||||
|     "\tfor layer in layers:\n", | ||||
|     "\t\tnum_prefixes = 0\n", | ||||
|     "\t\tprefix_width = field_width[layer]\n", | ||||
|     "\t\t# for each range in the layer, convert the ranges to prefixes using naive range expansion\n", | ||||
|     "\t\tfor r in layers[layer]:\n", | ||||
|     "\t\t\tif r[\"min\"] == None:\n", | ||||
|     "\t\t\t\tr[\"min\"] = 0\n", | ||||
|     "\t\t\telif r[\"max\"] == None:\n", | ||||
|     "\t\t\t\tr[\"max\"] = 2 ** prefix_width\n", | ||||
|     "\t\t\tprefixes = convert_range(r[\"min\"], r[\"max\"], prefix_width)\n", | ||||
|     "\t\t\tr[\"prefixes\"] = prefixes\n", | ||||
|     "\t\t\tnum_prefixes += len(prefixes)\n", | ||||
|     "\t\t\ttcam_bits += len(prefixes) * prefix_width\n", | ||||
|     "\n", | ||||
|     "\t\ttcam = {\n", | ||||
|     "\t\t\t\"id\": f\"{layer}_range\",\n", | ||||
|     "\t\t\t\"step\": step,\n", | ||||
|     "\t\t\t\"match\": \"ternary\",\n", | ||||
|     "\t\t\t\"entries\": num_prefixes,\n", | ||||
|     "\t\t\t\"key_size\": prefix_width,\n", | ||||
|     "\t\t\t\"ranges\": layers[layer]\n", | ||||
|     "\t\t}\n", | ||||
|     "\n", | ||||
|     "\t\tnum_ranges = len(layers[layer])\n", | ||||
|     "\t\t# assume no pointer reuse for metadata storage\n", | ||||
|     "\t\tram = {\n", | ||||
|     "\t\t\t\"id\": f\"{layer}_meta\",\n", | ||||
|     "\t\t\t\"step\": step,\n", | ||||
|     "\t\t\t\"match\": \"exact\",\n", | ||||
|     "\t\t\t\"method\": \"index\",\n", | ||||
|     "\t\t\t\"key_size\": math.ceil(math.log2(num_ranges)),\n", | ||||
|     "\t\t\t\"data_size\": len(classes)\n", | ||||
|     "\t\t}\n", | ||||
|     "\t\tram_bits += num_ranges * len(classes)\n", | ||||
|     "\n", | ||||
|     "\t\trmt.append(tcam)\n", | ||||
|     "\t\trmt.append(ram)\n", | ||||
|     "\n", | ||||
|     "\t\tstep += 1\n", | ||||
|     "\n", | ||||
|     "\treturn rmt, tcam_bits, ram_bits\n", | ||||
|     "\n", | ||||
|     "x, tcam_bits, ram_bits = naive_rmt(tree)\n", | ||||
|     "f = open(\"naive_rmt.json\", \"w+\")\n", | ||||
|     "f.write(json.dumps(x, indent=4))\n", | ||||
|     "f.close()\n" | ||||
|    ] | ||||
|   }, | ||||
|   { | ||||
|    "cell_type": "code", | ||||
|    "execution_count": 8, | ||||
|    "id": "48011528", | ||||
|    "metadata": {}, | ||||
|    "outputs": [ | ||||
|     { | ||||
|      "name": "stdout", | ||||
|      "output_type": "stream", | ||||
|      "text": [ | ||||
|       "TCAM mapping: \n", | ||||
|       "[1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]\n", | ||||
|       "SRAM mapping: \n", | ||||
|       "[1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]\n", | ||||
|       "id mapping: \n", | ||||
|       "[['dst_range', 'dst_meta'], ['src_range', 'src_meta'], ['protocl_range', 'protocl_meta'], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], []]\n", | ||||
|       "TCAM bits: 3320\n", | ||||
|       "RAM bits:  504\n" | ||||
|      ] | ||||
|     } | ||||
|    ], | ||||
|    "source": [ | ||||
|     "! command python3 ideal-rmt-simulator/sim.py naive_rmt.json\n", | ||||
|     "print(f\"TCAM bits: {tcam_bits}\")\n", | ||||
|     "print(f\"RAM bits:  {ram_bits}\")" | ||||
|    ] | ||||
|   }, | ||||
|   { | ||||
|    "cell_type": "markdown", | ||||
|    "id": "2504b1ba", | ||||
|    "metadata": {}, | ||||
|    "source": [ | ||||
|     "# Priority Aware Prefix Expansion" | ||||
|    ] | ||||
|   }, | ||||
|   { | ||||
|    "cell_type": "code", | ||||
|    "execution_count": 9, | ||||
|    "id": "64b7271e", | ||||
|    "metadata": {}, | ||||
|    "outputs": [], | ||||
|    "source": [ | ||||
|     "# for this technique, we note that given disjoint, ordered ranges [0,a],[a,b],[b,c] ...\n", | ||||
|     "# a TCAM that selects the first matching prefix can treat them as the equivalent ranges [0,a],[0,b],[0,c]\n", | ||||
|     "# this is because for any k<a, even though the prefixes for [0,b] also match k, the prefixes for [0,a] are placed before those for [0,b], so the correct range is still selected\n", | ||||
|     "\n", | ||||
|     "def priority_aware(tree):\n", | ||||
|     "\trmt = []\n", | ||||
|     "\tstep = 0\n", | ||||
|     "\n", | ||||
|     "\ttcam_bits = 0\n", | ||||
|     "\tram_bits = 0\n", | ||||
|     "\n", | ||||
|     "\tfor layer in layers:\n", | ||||
|     "\t\tnum_prefixes = 0\n", | ||||
|     "\t\tprefix_width = field_width[layer]\n", | ||||
|     "\t\t# for each range, run the regular prefix expansion, and also the prefix expansion setting the minimum to 0\n", | ||||
|     "\t\t# then check which set of prefixes would be better\n", | ||||
|     "\t\t# we will assume the ranges are already disjoint and in the correct order\n", | ||||
|     "\t\tfor r in layers[layer]:\n", | ||||
|     "\t\t\tif r[\"min\"] == None:\n", | ||||
|     "\t\t\t\tr[\"min\"] = 0\n", | ||||
|     "\t\t\telif r[\"max\"] == None:\n", | ||||
|     "\t\t\t\tr[\"max\"] = 2 ** prefix_width\n", | ||||
|     "\t\t\tregular_prefixes = convert_range(r[\"min\"], r[\"max\"], prefix_width)\n", | ||||
|     "\t\t\tzero_start_prefixes = convert_range(0, r[\"max\"], prefix_width)\n", | ||||
|     "\n", | ||||
|     "\t\t\tif len(regular_prefixes) <= len(zero_start_prefixes):\n", | ||||
|     "\t\t\t\tpfx_type = \"exact\"\n", | ||||
|     "\t\t\t\tprefixes = regular_prefixes\n", | ||||
|     "\t\t\telse:\n", | ||||
|     "\t\t\t\tpfx_type = \"zero\"\n", | ||||
|     "\t\t\t\tprefixes = zero_start_prefixes\n", | ||||
|     "\n", | ||||
|     "\t\t\tr[\"prefixes\"] = prefixes\n", | ||||
|     "\t\t\tr[\"prefix_type\"] = pfx_type\n", | ||||
|     "\t\t\tnum_prefixes += len(prefixes)\n", | ||||
|     "\t\t\ttcam_bits += len(prefixes) * prefix_width\n", | ||||
|     "\n", | ||||
|     "\t\ttcam = {\n", | ||||
|     "\t\t\t\"id\": f\"{layer}_range\",\n", | ||||
|     "\t\t\t\"step\": step,\n", | ||||
|     "\t\t\t\"match\": \"ternary\",\n", | ||||
|     "\t\t\t\"entries\": num_prefixes,\n", | ||||
|     "\t\t\t\"key_size\": prefix_width,\n", | ||||
|     "\t\t\t\"ranges\": layers[layer]\n", | ||||
|     "\t\t}\n", | ||||
|     "\n", | ||||
|     "\t\tnum_ranges = len(layers[layer])\n", | ||||
|     "\t\t# assume no pointer reuse for metadata storage\n", | ||||
|     "\t\tram = {\n", | ||||
|     "\t\t\t\"id\": f\"{layer}_meta\",\n", | ||||
|     "\t\t\t\"step\": step,\n", | ||||
|     "\t\t\t\"match\": \"exact\",\n", | ||||
|     "\t\t\t\"method\": \"index\",\n", | ||||
|     "\t\t\t\"key_size\": math.ceil(math.log2(num_ranges)),\n", | ||||
|     "\t\t\t\"data_size\": len(classes)\n", | ||||
|     "\t\t}\n", | ||||
|     "\t\tram_bits += num_ranges * len(classes)\n", | ||||
|     "\n", | ||||
|     "\t\trmt.append(tcam)\n", | ||||
|     "\t\trmt.append(ram)\n", | ||||
|     "\n", | ||||
|     "\t\tstep += 1\n", | ||||
|     "\n", | ||||
|     "\treturn rmt, tcam_bits, ram_bits\n", | ||||
|     "\n", | ||||
|     "x, tcam_bits, ram_bits = priority_aware(tree)\n", | ||||
|     "f = open(\"priority_aware.json\", \"w+\")\n", | ||||
|     "f.write(json.dumps(x, indent=4))\n", | ||||
|     "f.close()" | ||||
|    ] | ||||
|   }, | ||||
|   { | ||||
|    "cell_type": "code", | ||||
|    "execution_count": 10, | ||||
|    "id": "cd706e41", | ||||
|    "metadata": {}, | ||||
|    "outputs": [ | ||||
|     { | ||||
|      "name": "stdout", | ||||
|      "output_type": "stream", | ||||
|      "text": [ | ||||
|       "TCAM mapping: \n", | ||||
|       "[1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]\n", | ||||
|       "SRAM mapping: \n", | ||||
|       "[1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]\n", | ||||
|       "id mapping: \n", | ||||
|       "[['dst_range', 'dst_meta'], ['src_range', 'src_meta'], ['protocl_range', 'protocl_meta'], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], []]\n", | ||||
|       "TCAM bits: 2152\n", | ||||
|       "RAM bits:  504\n" | ||||
|      ] | ||||
|     } | ||||
|    ], | ||||
|    "source": [ | ||||
|     "! command python3 ideal-rmt-simulator/sim.py priority_aware.json\n", | ||||
|     "print(f\"TCAM bits: {tcam_bits}\")\n", | ||||
|     "print(f\"RAM bits:  {ram_bits}\")" | ||||
|    ] | ||||
|   } | ||||
|  ], | ||||
|  "metadata": { | ||||
|   "kernelspec": { | ||||
|    "display_name": "Python 3 (ipykernel)", | ||||
|    "language": "python", | ||||
|    "name": "python3" | ||||
|   }, | ||||
|   "language_info": { | ||||
|    "codemirror_mode": { | ||||
|     "name": "ipython", | ||||
|     "version": 3 | ||||
|    }, | ||||
|    "file_extension": ".py", | ||||
|    "mimetype": "text/x-python", | ||||
|    "name": "python", | ||||
|    "nbconvert_exporter": "python", | ||||
|    "pygments_lexer": "ipython3", | ||||
|    "version": "3.12.7" | ||||
|   } | ||||
|  }, | ||||
|  "nbformat": 4, | ||||
|  "nbformat_minor": 5 | ||||
| } | ||||
							
								
								
									
										74
									
								
								combine.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										74
									
								
								combine.py
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,74 @@ | ||||
| #!/usr/bin/env python3 | ||||
| """combine.py | ||||
|  | ||||
| Concatenate every CSV that matches the pattern | ||||
|     data/processed/<name>/<name>.csv | ||||
| into a single file: | ||||
|     data/combined/data.csv | ||||
|  | ||||
| The script streams each source CSV in chunks of one million rows so memory stays low. | ||||
| Typos in the historic column names (protocl/classfication) are fixed on‑the‑fly. | ||||
|  | ||||
| Usage | ||||
| ----- | ||||
| python combine.py | ||||
|  | ||||
| You can optionally supply a different input root and output file: | ||||
| python combine.py --root other/processed_dir --out other/combined/data.csv | ||||
| """ | ||||
| from __future__ import annotations | ||||
|  | ||||
| import argparse | ||||
| from pathlib import Path | ||||
| import os | ||||
| import pandas as pd | ||||
|  | ||||
| CHUNK = 1_000_000  # rows per read_csv chunk | ||||
|  | ||||
|  | ||||
def fix_cols(df: pd.DataFrame) -> pd.DataFrame:
    """Return *df* with the misspelled legacy headers corrected.

    ``protocl`` becomes ``protocol`` and ``classfication`` becomes
    ``classification``; columns with any other name are left alone.
    """
    legacy_to_canonical = {
        "protocl": "protocol",
        "classfication": "classification",
    }
    renamed = df.rename(columns=legacy_to_canonical)
    return renamed
|  | ||||
|  | ||||
def find_source_csvs(proc_root: Path):
    """Yield ``<proc_root>/<name>/<name>.csv`` for each sub-directory that has one.

    Entries of *proc_root* are visited in sorted order; plain files and
    sub-directories without a same-named CSV are ignored.
    """
    subdirs = (entry for entry in sorted(proc_root.iterdir()) if entry.is_dir())
    for folder in subdirs:
        candidate = folder.joinpath(folder.name + ".csv")
        if candidate.exists():
            yield candidate
|  | ||||
|  | ||||
def combine(proc_root: Path, out_path: Path):
    """Stream every source CSV found under *proc_root* into *out_path*.

    The parent directory of *out_path* is created if needed. Each source
    file is read in chunks of ``CHUNK`` rows; the first chunk written
    truncates *out_path* and emits the header, every later chunk is
    appended without a header.

    If no chunk is read at all, *out_path* is left untouched and a
    warning is printed instead of the success message.
    """
    out_path.parent.mkdir(parents=True, exist_ok=True)

    first_write = True
    for csv_path in find_source_csvs(proc_root):
        print(f"→ adding {csv_path.relative_to(proc_root.parent)}")
        for chunk in pd.read_csv(csv_path, chunksize=CHUNK):
            chunk = fix_cols(chunk)
            chunk.to_csv(
                out_path,
                mode="w" if first_write else "a",
                header=first_write,
                index=False,
            )
            first_write = False

    # first_write is still True only when nothing was ever written; do not
    # claim success in that case.
    if first_write:
        print(f"✗ no source rows found under {proc_root}; {out_path} was not written")
        return
    print(f"✓ combined CSV written to {out_path}")
|  | ||||
|  | ||||
def main():
    """Read ``--root`` and ``--out`` from the command line and run ``combine``."""
    cli = argparse.ArgumentParser(description="Combine processed CSVs into one.")
    cli.add_argument("--root", default="data/processed", help="processed dir root")
    cli.add_argument("--out", default="data/combined/data.csv", help="output CSV")
    opts = cli.parse_args()

    # "~" in either path is expanded before use
    source_root = Path(opts.root).expanduser()
    destination = Path(opts.out).expanduser()
    combine(source_root, destination)
|  | ||||
|  | ||||
| if __name__ == "__main__": | ||||
|     main() | ||||
							
								
								
									
										80
									
								
								extract_all_datasets.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										80
									
								
								extract_all_datasets.py
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,80 @@ | ||||
| #!/usr/bin/env python3 | ||||
| from pathlib import Path | ||||
| import numpy as np | ||||
| import pandas as pd | ||||
| from labels import mac_to_label | ||||
| from tqdm import tqdm | ||||
| import os | ||||
|  | ||||
| ROOT       = Path(__file__).resolve().parent | ||||
| PCAP_DIR   = ROOT / "data" / "pcap" | ||||
| CSV_DIR    = ROOT / "data" / "processed" | ||||
| CSV_DIR.mkdir(parents=True, exist_ok=True) | ||||
|  | ||||
| BATCH = 100_000   # packets per chunk | ||||
|  | ||||
| from scapy.all import rdpcap | ||||
|  | ||||
|  | ||||
def process_pcap(pcap_path: str, csv_path: str) -> None:
    """Extract per-packet features from a pcap file and write them as CSV.

    One row is produced per packet with four columns: the ``proto`` field
    (0 when the packet has none), the source and destination ports (both 0
    when either is missing) and a label looked up from the Ethernet
    destination MAC in ``mac_to_label`` (``"other"`` when the packet has no
    Ethernet layer or the MAC is unknown).

    If *csv_path* already exists the rows are appended without a header;
    otherwise the file is created with a header. The header keeps the
    historic spellings ``protocl`` and ``classfication``. A capture with no
    packets yields a header-only CSV.
    """
    all_packets = rdpcap(pcap_path)

    print("rdpcap done", flush=True)

    columns = ['protocl', 'src', 'dst', 'classfication']
    rows = []
    for packet in tqdm(all_packets):
        try:
            proto = packet.proto
        except AttributeError:
            proto = 0
        try:
            sport = packet.sport
            dport = packet.dport
        except AttributeError:
            sport = 0
            dport = 0

        if "Ether" in packet:
            eth_dst = packet["Ether"].dst
            classification = mac_to_label[eth_dst] if eth_dst in mac_to_label else "other"
        else:
            classification = "other"

        rows.append([int(proto), int(sport), int(dport), classification])

    # Build the frame straight from the row list: this also works for an
    # empty capture and keeps the numeric columns numeric.
    dataframe = pd.DataFrame(rows, columns=columns)

    # Append to an existing CSV (no second header); otherwise create it.
    if os.path.exists(csv_path):
        dataframe.to_csv(csv_path, index=False, sep=',', mode='a', columns=columns, header=False)
    else:
        dataframe.to_csv(csv_path, index=False, sep=',', columns=columns)

    print("Done")
|  | ||||
|  | ||||
|  | ||||
def main() -> None:
    """Convert each ``*.pcap`` under ``PCAP_DIR`` into a CSV under ``CSV_DIR``.

    The CSV path mirrors the capture's path relative to ``PCAP_DIR`` with
    the suffix changed to ``.csv``. Captures whose CSV already exists are
    skipped.
    """
    for capture in sorted(PCAP_DIR.rglob("*.pcap")):
        relative_csv = capture.relative_to(PCAP_DIR).with_suffix(".csv")
        destination = CSV_DIR / relative_csv
        if not destination.exists():
            print(f"Processing {relative_csv}")
            destination.parent.mkdir(parents=True, exist_ok=True)
            process_pcap(str(capture), str(destination))
        else:
            print(f"Skip {relative_csv} (CSV exists)")
|  | ||||
| if __name__ == "__main__": | ||||
|     main() | ||||
							
								
								
									
										50
									
								
								extract_tars.sh
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										50
									
								
								extract_tars.sh
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,50 @@ | ||||
#!/usr/bin/env bash
# Usage: extract_tars.sh SOURCE_DIR TARGET_DIR
#
# For every .tar, .tar.gz, .tgz, .tar.bz2 and .tar.xz archive directly inside
# SOURCE_DIR, with <name> being the file name minus its archive extension:
#   1. If TARGET_DIR/<name>/<name>.pcap already exists, skip the archive.
#   2. Otherwise create TARGET_DIR/<name>/ and extract the archive into it.

set -euo pipefail

if [[ $# -ne 2 ]]; then
  echo "Usage: $0 SOURCE_DIR TARGET_DIR" >&2
  exit 1
fi

src_dir="$1"
dst_dir="$2"
mkdir -p "$dst_dir"

# Print the file name $1 with a trailing archive extension removed.
# printf is used instead of echo so a name such as "-n" or "-e" is printed
# literally rather than being consumed as an echo option.
strip_ext() {
  local n="$1"
  n=${n%.tar.gz}; n=${n%.tgz}; n=${n%.tar.bz2}; n=${n%.tar.xz}; n=${n%.tar}
  printf '%s\n' "$n"
}

# nullglob: a pattern without matches expands to nothing, so the loop never
# sees a literal, unexpanded glob.
shopt -s nullglob
for archive in "$src_dir"/*.tar{,.gz,.bz2,.xz} "$src_dir"/*.tgz; do
  base=$(basename "$archive")
  name=$(strip_ext "$base")
  out_dir="$dst_dir/$name"
  key_file="$out_dir/$name.pcap"

  if [[ -f "$key_file" ]]; then
    echo "Skipping $archive  —  $key_file already present"
    continue
  fi

  echo "Extracting $archive into $out_dir"
  mkdir -p "$out_dir"

  case "$archive" in
    *.tar)          tar -xf "$archive" -C "$out_dir" ;;
    *.tar.gz|*.tgz) tar -xzf "$archive" -C "$out_dir" ;;
    *.tar.bz2)      tar -xjf "$archive" -C "$out_dir" ;;
    *.tar.xz)       tar -xJf "$archive" -C "$out_dir" ;;
    # Diagnostics go to stderr so they are not mixed into the progress log.
    *)              echo "Unknown type: $archive" >&2 ;;
  esac
done

echo "All archives processed."
| @@ -4,3 +4,4 @@ pandas | ||||
| scikit-learn | ||||
| pydotplus | ||||
| matplotlib | ||||
| scipy | ||||
							
								
								
									
										168
									
								
								run/decision_tree.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										168
									
								
								run/decision_tree.py
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,168 @@ | ||||
| #!/usr/bin/env python3 | ||||
| """ | ||||
| Train a decision tree, optionally “nudge” its split thresholds, and | ||||
| export the result as JSON. | ||||
|  | ||||
| Usage examples | ||||
| -------------- | ||||
| # plain training, no nudging | ||||
| python decision_tree.py --input data/combined/data.csv --output tree.json | ||||
|  | ||||
| # nudge every internal threshold by rounding away its lowest 2 bits | ||||
| python decision_tree.py --input data/combined/data.csv --output tree.json \ | ||||
|                      --nudge --bits 2 | ||||
| """ | ||||
| import argparse | ||||
| import copy | ||||
| import json | ||||
| import math | ||||
| from pathlib import Path | ||||
|  | ||||
| import numpy as np | ||||
| import pandas as pd | ||||
| from sklearn.metrics import accuracy_score | ||||
| from sklearn.tree import DecisionTreeClassifier, _tree | ||||
|  | ||||
| # ---------------------------------------------------------------------- | ||||
| # 1. command-line arguments | ||||
| # ---------------------------------------------------------------------- | ||||
# Parsed once at import time; the rest of the script reads `args`.
parser = argparse.ArgumentParser()
parser.add_argument("--input",  "-i", help="CSV file with protocol,src,dst,label", default="../data/combined/data.csv")
parser.add_argument("--output", "-o", help="Path for the exported JSON tree", default="tree.json")
parser.add_argument("--depth",  "-d", type=int, default=5,
                    help="Max depth of the decision tree (default: 5)")
parser.add_argument("--nudge",  action="store_true",
                    help="Enable threshold nudging")
# Nudging clears the low-order bits of each threshold (rounding to the
# nearest multiple of 2**bits), so the help text describes bits removed.
parser.add_argument("--bits",   type=int, default=2,
                    help="Number of low-order bits to round away when nudging (default: 2)")
args = parser.parse_args()
|  | ||||
| # ---------------------------------------------------------------------- | ||||
| # 2. helper functions | ||||
| # ---------------------------------------------------------------------- | ||||
def nudge_threshold_max_n_bits(threshold: float, n_bits: int) -> int:
    """Floor *threshold* and round it to a multiple of ``2 ** n_bits``.

    The lowest *n_bits* bits of the floored value are cleared under a
    32-bit mask. When the highest of those cleared bits was set, the result
    is bumped up by ``2 ** n_bits``. With ``n_bits == 0`` the floored value
    is returned unchanged.
    """
    whole = math.floor(threshold)
    if n_bits == 0:
        return whole

    low_bits = (1 << n_bits) - 1
    keep_mask = (pow(2, 32) - 1) ^ low_bits
    rounded = whole & keep_mask

    # the top discarded bit decides whether to round up
    round_up = bool(whole & (1 << (n_bits - 1)))
    return rounded + (1 << n_bits) if round_up else rounded
|  | ||||
def apply_nudging(tree: _tree.Tree, node_idx: int, n_bits: int) -> None:
    """Nudge the threshold of every internal node below *node_idx*, in place.

    Children are processed before their parent. A node counts as internal
    when at least one of its child indices is not ``-1``; nodes with no
    children are left untouched.
    """
    children = (tree.children_left[node_idx], tree.children_right[node_idx])

    is_internal = False
    for child in children:
        if child != -1:
            apply_nudging(tree, child, n_bits)
            is_internal = True

    if not is_internal:
        return
    current = tree.threshold[node_idx]
    tree.threshold[node_idx] = nudge_threshold_max_n_bits(current, n_bits)
|  | ||||
# output the tree
def get_lineage(tree, feature_names):
    """Describe a fitted decision tree as root-to-leaf paths.

    Parameters
    ----------
    tree : fitted classifier exposing ``tree_`` (with ``threshold``,
        ``feature``, ``children_left``, ``children_right``, ``value``)
        and ``classes_``.
    feature_names : sequence indexed by the feature numbers stored in
        ``tree.tree_.feature``.

    Returns
    -------
    dict with three keys:
        ``"classes"``  - ``list(tree.classes_)``
        ``"paths"``    - one entry per leaf, in leaf-index order, holding the
                         ``conditions`` from the root down to that leaf, the
                         ``classification`` (index of the largest entry in
                         the leaf's value row) and a running ``id``
        ``"features"`` - for each feature name, the thresholds of all
                         internal nodes splitting on it, in node order
    """
    data = {"features": {}, "paths": [], "classes": list(tree.classes_)}

    thresholds  = tree.tree_.threshold
    feature_idx = tree.tree_.feature
    left        = tree.tree_.children_left
    right       = tree.tree_.children_right
    value       = tree.tree_.value

    no_child = -1   # child index that marks a leaf
    internal_nodes = [n for n in range(len(left)) if left[n] != no_child]

    # Feature names are looked up for internal nodes only: leaves carry a
    # negative placeholder feature index that must not be used to index
    # feature_names.
    name_of = {n: feature_names[feature_idx[n]] for n in internal_nodes}

    # child -> (parent, operation); built once so walking up from a leaf is
    # a dict lookup per step instead of a scan over the child arrays.
    parent_of = {}
    for n in internal_nodes:
        parent_of[int(left[n])] = (n, "<=")
        parent_of[int(right[n])] = (n, ">")

    leaf_ids = np.where(left == no_child)[0]        # indices of all leaves
    for path_id, leaf in enumerate(leaf_ids):
        clause = []
        node = int(leaf)
        while node in parent_of:                    # the root has no parent
            node, operation = parent_of[node]
            clause.append(
                {"feature": name_of[node], "operation": operation,
                 "value": thresholds[node]}
            )
        clause.reverse()                            # root first, leaf last

        class_idx = int(np.argmax(value[leaf][0]))  # use the leaf itself
        data["paths"].append(
            {"conditions": clause, "classification": class_idx, "id": path_id}
        )

    # collect all thresholds per feature
    for n in internal_nodes:
        data["features"].setdefault(name_of[n], []).append(thresholds[n])

    return data
|  | ||||
|  | ||||
class SetEncoder(json.JSONEncoder):
    """JSON encoder that serialises ``set`` objects as lists."""

    def default(self, obj):
        """Return a list for a set; defer to the base encoder otherwise."""
        if not isinstance(obj, set):
            return super().default(obj)
        return list(obj)
|  | ||||
| # ---------------------------------------------------------------------- | ||||
| # 3. load data | ||||
| # ---------------------------------------------------------------------- | ||||
# The first three columns are the features, the fourth is the label.
df = pd.read_csv(args.input)
X, Y = df.iloc[:, :3].to_numpy(), df.iloc[:, 3].to_numpy()

print(f"dataset size: {len(X)}")

# ----------------------------------------------------------------------
# 4. train the tree
# ----------------------------------------------------------------------
dt = DecisionTreeClassifier(max_depth=args.depth)
dt.fit(X, Y)
accuracy_before = accuracy_score(Y, dt.predict(X))
print("train accuracy (before nudging):", accuracy_before)

if args.nudge:
    # nudge a copy and swap it into the classifier afterwards
    nudged_tree = copy.deepcopy(dt.tree_)
    apply_nudging(nudged_tree, 0, args.bits)
    dt.tree_ = nudged_tree
    print(f"nudging enabled, removed bottom {args.bits} bit(s) per threshold")

    accuracy_after = accuracy_score(Y, dt.predict(X))
    print("train accuracy (after  nudging):", accuracy_after)

# ----------------------------------------------------------------------
# 5. export
# ----------------------------------------------------------------------
lineage = get_lineage(dt, df.columns[:3])

output_path = Path(args.output)
serialized = json.dumps(lineage, indent=4, cls=SetEncoder)
output_path.write_text(serialized)
print(f"Wrote tree to {output_path.resolve()}")
							
								
								
									
										7
									
								
								run/print.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										7
									
								
								run/print.py
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,7 @@ | ||||
| import json | ||||
| from pathlib import Path | ||||
|  | ||||
# For each compressed-tree JSON file, print max(paths) + 1.
results_dir = Path("results/compressed_tree/")
for tree_path in results_dir.glob("*.json"):
    with tree_path.open("r") as handle:
        tree = json.load(handle)
        largest = max(tree["paths"])
        print(largest + 1)
							
								
								
									
										36
									
								
								run/rmt.bat
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										36
									
								
								run/rmt.bat
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,36 @@ | ||||
@echo off
REM =============================================================
REM Run tree_to_rmt.py on every compressed tree, once per mode
REM =============================================================
setlocal EnableDelayedExpansion

REM Input directory with the tree JSON files, output directory for results
set TREEDIR=results\compressed_tree
set OUTDIR=results\rmt

REM Python interpreter to invoke; change it if yours has another name
set PY=python

REM Space-separated list of values passed to --mode
set MODELIST=naive priority

if not exist "%OUTDIR%" mkdir "%OUTDIR%"

for %%F in ("%TREEDIR%\*.json") do (
    REM file name without directory or extension
    set STEM=%%~nF

    for %%M in (%MODELIST%) do (
        echo Processing %%~nxF with mode %%M
        "%PY%" tree_to_rmt.py --mode %%M --input "%%F" --output "%OUTDIR%\!STEM!_%%M.json"
    )
)

echo All runs complete.
pause
							
								
								
									
										362
									
								
								run/rmt.txt
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										362
									
								
								run/rmt.txt
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,362 @@ | ||||
| Processing compressed_tree_d10_b0.json with mode naive | ||||
| Output written to results\rmt\compressed_tree_d10_b0_naive.json | ||||
| TCAM bits: 30336 | ||||
| RAM bits:  6888 | ||||
| Processing compressed_tree_d10_b0.json with mode priority | ||||
| Output written to results\rmt\compressed_tree_d10_b0_priority.json | ||||
| TCAM bits: 26648 | ||||
| RAM bits:  6888 | ||||
| Processing compressed_tree_d10_b1.json with mode naive | ||||
| Output written to results\rmt\compressed_tree_d10_b1_naive.json | ||||
| TCAM bits: 29936 | ||||
| RAM bits:  6531 | ||||
| Processing compressed_tree_d10_b1.json with mode priority | ||||
| Output written to results\rmt\compressed_tree_d10_b1_priority.json | ||||
| TCAM bits: 27120 | ||||
| RAM bits:  6531 | ||||
| Processing compressed_tree_d10_b3.json with mode naive | ||||
| Output written to results\rmt\compressed_tree_d10_b3_naive.json | ||||
| TCAM bits: 21712 | ||||
| RAM bits:  5649 | ||||
| Processing compressed_tree_d10_b3.json with mode priority | ||||
| Output written to results\rmt\compressed_tree_d10_b3_priority.json | ||||
| TCAM bits: 20048 | ||||
| RAM bits:  5649 | ||||
| Processing compressed_tree_d11_b0.json with mode naive | ||||
| Output written to results\rmt\compressed_tree_d11_b0_naive.json | ||||
| TCAM bits: 41248 | ||||
| RAM bits:  10332 | ||||
| Processing compressed_tree_d11_b0.json with mode priority | ||||
| Output written to results\rmt\compressed_tree_d11_b0_priority.json | ||||
| TCAM bits: 37592 | ||||
| RAM bits:  10332 | ||||
| Processing compressed_tree_d11_b1.json with mode naive | ||||
| Output written to results\rmt\compressed_tree_d11_b1_naive.json | ||||
| TCAM bits: 41072 | ||||
| RAM bits:  9744 | ||||
| Processing compressed_tree_d11_b1.json with mode priority | ||||
| Output written to results\rmt\compressed_tree_d11_b1_priority.json | ||||
| TCAM bits: 38256 | ||||
| RAM bits:  9744 | ||||
| Processing compressed_tree_d11_b3.json with mode naive | ||||
| Output written to results\rmt\compressed_tree_d11_b3_naive.json | ||||
| TCAM bits: 28464 | ||||
| RAM bits:  8190 | ||||
| Processing compressed_tree_d11_b3.json with mode priority | ||||
| Output written to results\rmt\compressed_tree_d11_b3_priority.json | ||||
| TCAM bits: 26928 | ||||
| RAM bits:  8190 | ||||
| Processing compressed_tree_d12_b0.json with mode naive | ||||
| Output written to results\rmt\compressed_tree_d12_b0_naive.json | ||||
| TCAM bits: 55680 | ||||
| RAM bits:  15393 | ||||
| Processing compressed_tree_d12_b0.json with mode priority | ||||
| Output written to results\rmt\compressed_tree_d12_b0_priority.json | ||||
| TCAM bits: 51592 | ||||
| RAM bits:  15393 | ||||
| Processing compressed_tree_d12_b1.json with mode naive | ||||
| Output written to results\rmt\compressed_tree_d12_b1_naive.json | ||||
| TCAM bits: 54240 | ||||
| RAM bits:  14175 | ||||
| Processing compressed_tree_d12_b1.json with mode priority | ||||
| Output written to results\rmt\compressed_tree_d12_b1_priority.json | ||||
| TCAM bits: 51200 | ||||
| RAM bits:  14175 | ||||
| Processing compressed_tree_d12_b3.json with mode naive | ||||
| Output written to results\rmt\compressed_tree_d12_b3_naive.json | ||||
| TCAM bits: 36048 | ||||
| RAM bits:  11361 | ||||
| Processing compressed_tree_d12_b3.json with mode priority | ||||
| Output written to results\rmt\compressed_tree_d12_b3_priority.json | ||||
| TCAM bits: 34416 | ||||
| RAM bits:  11361 | ||||
| Processing compressed_tree_d13_b0.json with mode naive | ||||
| Output written to results\rmt\compressed_tree_d13_b0_naive.json | ||||
| TCAM bits: 73152 | ||||
| RAM bits:  22680 | ||||
| Processing compressed_tree_d13_b0.json with mode priority | ||||
| Output written to results\rmt\compressed_tree_d13_b0_priority.json | ||||
| TCAM bits: 69096 | ||||
| RAM bits:  22680 | ||||
| Processing compressed_tree_d13_b1.json with mode naive | ||||
| Output written to results\rmt\compressed_tree_d13_b1_naive.json | ||||
| TCAM bits: 71024 | ||||
| RAM bits:  20643 | ||||
| Processing compressed_tree_d13_b1.json with mode priority | ||||
| Output written to results\rmt\compressed_tree_d13_b1_priority.json | ||||
| TCAM bits: 68160 | ||||
| RAM bits:  20643 | ||||
| Processing compressed_tree_d13_b3.json with mode naive | ||||
| Output written to results\rmt\compressed_tree_d13_b3_naive.json | ||||
| TCAM bits: 45152 | ||||
| RAM bits:  16002 | ||||
| Processing compressed_tree_d13_b3.json with mode priority | ||||
| Output written to results\rmt\compressed_tree_d13_b3_priority.json | ||||
| TCAM bits: 43600 | ||||
| RAM bits:  16002 | ||||
| Processing compressed_tree_d14_b0.json with mode naive | ||||
| Output written to results\rmt\compressed_tree_d14_b0_naive.json | ||||
| TCAM bits: 95760 | ||||
| RAM bits:  33012 | ||||
| Processing compressed_tree_d14_b0.json with mode priority | ||||
| Output written to results\rmt\compressed_tree_d14_b0_priority.json | ||||
| TCAM bits: 91656 | ||||
| RAM bits:  33012 | ||||
| Processing compressed_tree_d14_b1.json with mode naive | ||||
| Output written to results\rmt\compressed_tree_d14_b1_naive.json | ||||
| TCAM bits: 93520 | ||||
| RAM bits:  29862 | ||||
| Processing compressed_tree_d14_b1.json with mode priority | ||||
| Output written to results\rmt\compressed_tree_d14_b1_priority.json | ||||
| TCAM bits: 90544 | ||||
| RAM bits:  29862 | ||||
| Processing compressed_tree_d14_b3.json with mode naive | ||||
| Output written to results\rmt\compressed_tree_d14_b3_naive.json | ||||
| TCAM bits: 56144 | ||||
| RAM bits:  21819 | ||||
| Processing compressed_tree_d14_b3.json with mode priority | ||||
| Output written to results\rmt\compressed_tree_d14_b3_priority.json | ||||
| TCAM bits: 54544 | ||||
| RAM bits:  21819 | ||||
| Processing compressed_tree_d15_b0.json with mode naive | ||||
| Output written to results\rmt\compressed_tree_d15_b0_naive.json | ||||
| TCAM bits: 122496 | ||||
| RAM bits:  46662 | ||||
| Processing compressed_tree_d15_b0.json with mode priority | ||||
| Output written to results\rmt\compressed_tree_d15_b0_priority.json | ||||
| TCAM bits: 118792 | ||||
| RAM bits:  46662 | ||||
| Processing compressed_tree_d15_b1.json with mode naive | ||||
| Output written to results\rmt\compressed_tree_d15_b1_naive.json | ||||
| TCAM bits: 118640 | ||||
| RAM bits:  41349 | ||||
| Processing compressed_tree_d15_b1.json with mode priority | ||||
| Output written to results\rmt\compressed_tree_d15_b1_priority.json | ||||
| TCAM bits: 115984 | ||||
| RAM bits:  41349 | ||||
| Processing compressed_tree_d15_b3.json with mode naive | ||||
| Output written to results\rmt\compressed_tree_d15_b3_naive.json | ||||
| TCAM bits: 68928 | ||||
| RAM bits:  28875 | ||||
| Processing compressed_tree_d15_b3.json with mode priority | ||||
| Output written to results\rmt\compressed_tree_d15_b3_priority.json | ||||
| TCAM bits: 67328 | ||||
| RAM bits:  28875 | ||||
| Processing compressed_tree_d1_b0.json with mode naive | ||||
| Output written to results\rmt\compressed_tree_d1_b0_naive.json | ||||
| TCAM bits: 256 | ||||
| RAM bits:  42 | ||||
| Processing compressed_tree_d1_b0.json with mode priority | ||||
| Output written to results\rmt\compressed_tree_d1_b0_priority.json | ||||
| TCAM bits: 128 | ||||
| RAM bits:  42 | ||||
| Processing compressed_tree_d1_b1.json with mode naive | ||||
| Output written to results\rmt\compressed_tree_d1_b1_naive.json | ||||
| TCAM bits: 256 | ||||
| RAM bits:  42 | ||||
| Processing compressed_tree_d1_b1.json with mode priority | ||||
| Output written to results\rmt\compressed_tree_d1_b1_priority.json | ||||
| TCAM bits: 144 | ||||
| RAM bits:  42 | ||||
| Processing compressed_tree_d1_b3.json with mode naive | ||||
| Output written to results\rmt\compressed_tree_d1_b3_naive.json | ||||
| TCAM bits: 240 | ||||
| RAM bits:  42 | ||||
| Processing compressed_tree_d1_b3.json with mode priority | ||||
| Output written to results\rmt\compressed_tree_d1_b3_priority.json | ||||
| TCAM bits: 128 | ||||
| RAM bits:  42 | ||||
| Processing compressed_tree_d2_b0.json with mode naive | ||||
| Output written to results\rmt\compressed_tree_d2_b0_naive.json | ||||
| TCAM bits: 592 | ||||
| RAM bits:  105 | ||||
| Processing compressed_tree_d2_b0.json with mode priority | ||||
| Output written to results\rmt\compressed_tree_d2_b0_priority.json | ||||
| TCAM bits: 288 | ||||
| RAM bits:  105 | ||||
| Processing compressed_tree_d2_b1.json with mode naive | ||||
| Output written to results\rmt\compressed_tree_d2_b1_naive.json | ||||
| TCAM bits: 592 | ||||
| RAM bits:  105 | ||||
| Processing compressed_tree_d2_b1.json with mode priority | ||||
| Output written to results\rmt\compressed_tree_d2_b1_priority.json | ||||
| TCAM bits: 320 | ||||
| RAM bits:  105 | ||||
| Processing compressed_tree_d2_b3.json with mode naive | ||||
| Output written to results\rmt\compressed_tree_d2_b3_naive.json | ||||
| TCAM bits: 544 | ||||
| RAM bits:  105 | ||||
| Processing compressed_tree_d2_b3.json with mode priority | ||||
| Output written to results\rmt\compressed_tree_d2_b3_priority.json | ||||
| TCAM bits: 288 | ||||
| RAM bits:  105 | ||||
| Processing compressed_tree_d3_b0.json with mode naive | ||||
| Output written to results\rmt\compressed_tree_d3_b0_naive.json | ||||
| TCAM bits: 1120 | ||||
| RAM bits:  210 | ||||
| Processing compressed_tree_d3_b0.json with mode priority | ||||
| Output written to results\rmt\compressed_tree_d3_b0_priority.json | ||||
| TCAM bits: 640 | ||||
| RAM bits:  210 | ||||
| Processing compressed_tree_d3_b1.json with mode naive | ||||
| Output written to results\rmt\compressed_tree_d3_b1_naive.json | ||||
| TCAM bits: 1120 | ||||
| RAM bits:  210 | ||||
| Processing compressed_tree_d3_b1.json with mode priority | ||||
| Output written to results\rmt\compressed_tree_d3_b1_priority.json | ||||
| TCAM bits: 680 | ||||
| RAM bits:  210 | ||||
| Processing compressed_tree_d3_b3.json with mode naive | ||||
| Output written to results\rmt\compressed_tree_d3_b3_naive.json | ||||
| TCAM bits: 944 | ||||
| RAM bits:  210 | ||||
| Processing compressed_tree_d3_b3.json with mode priority | ||||
| Output written to results\rmt\compressed_tree_d3_b3_priority.json | ||||
| TCAM bits: 576 | ||||
| RAM bits:  210 | ||||
| Processing compressed_tree_d4_b0.json with mode naive | ||||
| Output written to results\rmt\compressed_tree_d4_b0_naive.json | ||||
| TCAM bits: 1880 | ||||
| RAM bits:  357 | ||||
| Processing compressed_tree_d4_b0.json with mode priority | ||||
| Output written to results\rmt\compressed_tree_d4_b0_priority.json | ||||
| TCAM bits: 1128 | ||||
| RAM bits:  357 | ||||
| Processing compressed_tree_d4_b1.json with mode naive | ||||
| Output written to results\rmt\compressed_tree_d4_b1_naive.json | ||||
| TCAM bits: 1880 | ||||
| RAM bits:  357 | ||||
| Processing compressed_tree_d4_b1.json with mode priority | ||||
| Output written to results\rmt\compressed_tree_d4_b1_priority.json | ||||
| TCAM bits: 1208 | ||||
| RAM bits:  357 | ||||
| Processing compressed_tree_d4_b3.json with mode naive | ||||
| Output written to results\rmt\compressed_tree_d4_b3_naive.json | ||||
| TCAM bits: 1632 | ||||
| RAM bits:  336 | ||||
| Processing compressed_tree_d4_b3.json with mode priority | ||||
| Output written to results\rmt\compressed_tree_d4_b3_priority.json | ||||
| TCAM bits: 1024 | ||||
| RAM bits:  336 | ||||
| Processing compressed_tree_d5_b0.json with mode naive | ||||
| Output written to results\rmt\compressed_tree_d5_b0_naive.json | ||||
| TCAM bits: 3608 | ||||
| RAM bits:  609 | ||||
| Processing compressed_tree_d5_b0.json with mode priority | ||||
| Output written to results\rmt\compressed_tree_d5_b0_priority.json | ||||
| TCAM bits: 2200 | ||||
| RAM bits:  609 | ||||
| Processing compressed_tree_d5_b1.json with mode naive | ||||
| Output written to results\rmt\compressed_tree_d5_b1_naive.json | ||||
| TCAM bits: 3608 | ||||
| RAM bits:  609 | ||||
| Processing compressed_tree_d5_b1.json with mode priority | ||||
| Output written to results\rmt\compressed_tree_d5_b1_priority.json | ||||
| TCAM bits: 2376 | ||||
| RAM bits:  609 | ||||
| Processing compressed_tree_d5_b3.json with mode naive | ||||
| Output written to results\rmt\compressed_tree_d5_b3_naive.json | ||||
| TCAM bits: 2704 | ||||
| RAM bits:  546 | ||||
| Processing compressed_tree_d5_b3.json with mode priority | ||||
| Output written to results\rmt\compressed_tree_d5_b3_priority.json | ||||
| TCAM bits: 1824 | ||||
| RAM bits:  546 | ||||
| Processing compressed_tree_d6_b0.json with mode naive | ||||
| Output written to results\rmt\compressed_tree_d6_b0_naive.json | ||||
| TCAM bits: 6440 | ||||
| RAM bits:  1134 | ||||
| Processing compressed_tree_d6_b0.json with mode priority | ||||
| Output written to results\rmt\compressed_tree_d6_b0_priority.json | ||||
| TCAM bits: 4512 | ||||
| RAM bits:  1134 | ||||
| Processing compressed_tree_d6_b1.json with mode naive | ||||
| Output written to results\rmt\compressed_tree_d6_b1_naive.json | ||||
| TCAM bits: 6440 | ||||
| RAM bits:  1134 | ||||
| Processing compressed_tree_d6_b1.json with mode priority | ||||
| Output written to results\rmt\compressed_tree_d6_b1_priority.json | ||||
| TCAM bits: 4776 | ||||
| RAM bits:  1134 | ||||
| Processing compressed_tree_d6_b3.json with mode naive | ||||
| Output written to results\rmt\compressed_tree_d6_b3_naive.json | ||||
| TCAM bits: 4832 | ||||
| RAM bits:  1008 | ||||
| Processing compressed_tree_d6_b3.json with mode priority | ||||
| Output written to results\rmt\compressed_tree_d6_b3_priority.json | ||||
| TCAM bits: 3648 | ||||
| RAM bits:  1008 | ||||
| Processing compressed_tree_d7_b0.json with mode naive | ||||
| Output written to results\rmt\compressed_tree_d7_b0_naive.json | ||||
| TCAM bits: 10344 | ||||
| RAM bits:  1848 | ||||
| Processing compressed_tree_d7_b0.json with mode priority | ||||
| Output written to results\rmt\compressed_tree_d7_b0_priority.json | ||||
| TCAM bits: 7808 | ||||
| RAM bits:  1848 | ||||
| Processing compressed_tree_d7_b1.json with mode naive | ||||
| Output written to results\rmt\compressed_tree_d7_b1_naive.json | ||||
| TCAM bits: 10312 | ||||
| RAM bits:  1806 | ||||
| Processing compressed_tree_d7_b1.json with mode priority | ||||
| Output written to results\rmt\compressed_tree_d7_b1_priority.json | ||||
| TCAM bits: 8136 | ||||
| RAM bits:  1806 | ||||
| Processing compressed_tree_d7_b3.json with mode naive | ||||
| Output written to results\rmt\compressed_tree_d7_b3_naive.json | ||||
| TCAM bits: 7760 | ||||
| RAM bits:  1596 | ||||
| Processing compressed_tree_d7_b3.json with mode priority | ||||
| Output written to results\rmt\compressed_tree_d7_b3_priority.json | ||||
| TCAM bits: 6352 | ||||
| RAM bits:  1596 | ||||
| Processing compressed_tree_d8_b0.json with mode naive | ||||
| Output written to results\rmt\compressed_tree_d8_b0_naive.json | ||||
| TCAM bits: 15672 | ||||
| RAM bits:  3003 | ||||
| Processing compressed_tree_d8_b0.json with mode priority | ||||
| Output written to results\rmt\compressed_tree_d8_b0_priority.json | ||||
| TCAM bits: 12640 | ||||
| RAM bits:  3003 | ||||
| Processing compressed_tree_d8_b1.json with mode naive | ||||
| Output written to results\rmt\compressed_tree_d8_b1_naive.json | ||||
| TCAM bits: 15576 | ||||
| RAM bits:  2919 | ||||
| Processing compressed_tree_d8_b1.json with mode priority | ||||
| Output written to results\rmt\compressed_tree_d8_b1_priority.json | ||||
| TCAM bits: 13160 | ||||
| RAM bits:  2919 | ||||
| Processing compressed_tree_d8_b3.json with mode naive | ||||
| Output written to results\rmt\compressed_tree_d8_b3_naive.json | ||||
| TCAM bits: 11504 | ||||
| RAM bits:  2625 | ||||
| Processing compressed_tree_d8_b3.json with mode priority | ||||
| Output written to results\rmt\compressed_tree_d8_b3_priority.json | ||||
| TCAM bits: 10016 | ||||
| RAM bits:  2625 | ||||
| Processing compressed_tree_d9_b0.json with mode naive | ||||
| Output written to results\rmt\compressed_tree_d9_b0_naive.json | ||||
| TCAM bits: 22640 | ||||
| RAM bits:  4662 | ||||
| Processing compressed_tree_d9_b0.json with mode priority | ||||
| Output written to results\rmt\compressed_tree_d9_b0_priority.json | ||||
| TCAM bits: 18936 | ||||
| RAM bits:  4662 | ||||
| Processing compressed_tree_d9_b1.json with mode naive | ||||
| Output written to results\rmt\compressed_tree_d9_b1_naive.json | ||||
| TCAM bits: 22784 | ||||
| RAM bits:  4557 | ||||
| Processing compressed_tree_d9_b1.json with mode priority | ||||
| Output written to results\rmt\compressed_tree_d9_b1_priority.json | ||||
| TCAM bits: 19872 | ||||
| RAM bits:  4557 | ||||
| Processing compressed_tree_d9_b3.json with mode naive | ||||
| Output written to results\rmt\compressed_tree_d9_b3_naive.json | ||||
| TCAM bits: 16560 | ||||
| RAM bits:  3948 | ||||
| Processing compressed_tree_d9_b3.json with mode priority | ||||
| Output written to results\rmt\compressed_tree_d9_b3_priority.json | ||||
| TCAM bits: 14880 | ||||
| RAM bits:  3948 | ||||
| All runs complete. | ||||
| Press any key to continue . . .  | ||||
							
								
								
									
										24
									
								
								run/run.bat
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										24
									
								
								run/run.bat
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,24 @@ | ||||
@echo off
REM Train one decision tree per (depth, bits) combination by invoking
REM decision_tree.py, writing each tree to %OUTDIR%\tree_d<depth>_b<bits>.json.
REM All relative paths below are resolved against the current directory.

REM Keep the settings local to this script so they do not leak into (or
REM overwrite variables of) the calling cmd session.
setlocal

REM --- settings --------------------------------------------------------
set INPUT=..\data\combined\data.csv
set OUTDIR=results\tree
set DEPTH_LIST=1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
set BITS_LIST=0 1 3
set PY=python
REM ---------------------------------------------------------------------

if not exist "%OUTDIR%" mkdir "%OUTDIR%"

for %%D in (%DEPTH_LIST%) do (
    for %%B in (%BITS_LIST%) do (
        echo Running depth=%%D bits=%%B
        %PY% decision_tree.py ^
            --input "%INPUT%" ^
            --output "%OUTDIR%\tree_d%%D_b%%B.json" ^
            --depth %%D ^
            --nudge --bits %%B
        REM Report a failed run but keep going with the remaining combinations.
        if errorlevel 1 echo WARNING: depth=%%D bits=%%B exited with an error
    )
)

echo All runs complete
pause
endlocal
							
								
								
									
										272
									
								
								run/run.txt
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										272
									
								
								run/run.txt
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,272 @@ | ||||
| Running depth=1 bits=0 | ||||
| dataset size: 7681108 | ||||
| train accuracy (before nudging): 0.6249802762830571 | ||||
| nudging enabled, removed bottom 0 bit(s) per threshold | ||||
| train accuracy (after  nudging): 0.6249802762830571 | ||||
| Wrote tree to C:\Users\jaipa\CS\cs216\IdealRMT-DecisionTrees\run\results\tree\tree_d1_b0.json | ||||
| Running depth=1 bits=1 | ||||
| dataset size: 7681108 | ||||
| train accuracy (before nudging): 0.6249802762830571 | ||||
| nudging enabled, removed bottom 1 bit(s) per threshold | ||||
| train accuracy (after  nudging): 0.6249802762830571 | ||||
| Wrote tree to C:\Users\jaipa\CS\cs216\IdealRMT-DecisionTrees\run\results\tree\tree_d1_b1.json | ||||
| Running depth=1 bits=3 | ||||
| dataset size: 7681108 | ||||
| train accuracy (before nudging): 0.6249802762830571 | ||||
| nudging enabled, removed bottom 3 bit(s) per threshold | ||||
| train accuracy (after  nudging): 0.6249802762830571 | ||||
| Wrote tree to C:\Users\jaipa\CS\cs216\IdealRMT-DecisionTrees\run\results\tree\tree_d1_b3.json | ||||
| Running depth=2 bits=0 | ||||
| dataset size: 7681108 | ||||
| train accuracy (before nudging): 0.6329657127591488 | ||||
| nudging enabled, removed bottom 0 bit(s) per threshold | ||||
| train accuracy (after  nudging): 0.6329657127591488 | ||||
| Wrote tree to C:\Users\jaipa\CS\cs216\IdealRMT-DecisionTrees\run\results\tree\tree_d2_b0.json | ||||
| Running depth=2 bits=1 | ||||
| dataset size: 7681108 | ||||
| train accuracy (before nudging): 0.6329657127591488 | ||||
| nudging enabled, removed bottom 1 bit(s) per threshold | ||||
| train accuracy (after  nudging): 0.632965582569598 | ||||
| Wrote tree to C:\Users\jaipa\CS\cs216\IdealRMT-DecisionTrees\run\results\tree\tree_d2_b1.json | ||||
| Running depth=2 bits=3 | ||||
| dataset size: 7681108 | ||||
| train accuracy (before nudging): 0.6329657127591488 | ||||
| nudging enabled, removed bottom 3 bit(s) per threshold | ||||
| train accuracy (after  nudging): 0.632991490290203 | ||||
| Wrote tree to C:\Users\jaipa\CS\cs216\IdealRMT-DecisionTrees\run\results\tree\tree_d2_b3.json | ||||
| Running depth=3 bits=0 | ||||
| dataset size: 7681108 | ||||
| train accuracy (before nudging): 0.6770542739406867 | ||||
| nudging enabled, removed bottom 0 bit(s) per threshold | ||||
| train accuracy (after  nudging): 0.6770542739406867 | ||||
| Wrote tree to C:\Users\jaipa\CS\cs216\IdealRMT-DecisionTrees\run\results\tree\tree_d3_b0.json | ||||
| Running depth=3 bits=1 | ||||
| dataset size: 7681108 | ||||
| train accuracy (before nudging): 0.6770542739406867 | ||||
| nudging enabled, removed bottom 1 bit(s) per threshold | ||||
| train accuracy (after  nudging): 0.6770412549856089 | ||||
| Wrote tree to C:\Users\jaipa\CS\cs216\IdealRMT-DecisionTrees\run\results\tree\tree_d3_b1.json | ||||
| Running depth=3 bits=3 | ||||
| dataset size: 7681108 | ||||
| train accuracy (before nudging): 0.6770542739406867 | ||||
| nudging enabled, removed bottom 3 bit(s) per threshold | ||||
| train accuracy (after  nudging): 0.6785083610333301 | ||||
| Wrote tree to C:\Users\jaipa\CS\cs216\IdealRMT-DecisionTrees\run\results\tree\tree_d3_b3.json | ||||
| Running depth=4 bits=0 | ||||
| dataset size: 7681108 | ||||
| train accuracy (before nudging): 0.7785798611346175 | ||||
| nudging enabled, removed bottom 0 bit(s) per threshold | ||||
| train accuracy (after  nudging): 0.7785798611346175 | ||||
| Wrote tree to C:\Users\jaipa\CS\cs216\IdealRMT-DecisionTrees\run\results\tree\tree_d4_b0.json | ||||
| Running depth=4 bits=1 | ||||
| dataset size: 7681108 | ||||
| train accuracy (before nudging): 0.7785798611346175 | ||||
| nudging enabled, removed bottom 1 bit(s) per threshold | ||||
| train accuracy (after  nudging): 0.7762147075656273 | ||||
| Wrote tree to C:\Users\jaipa\CS\cs216\IdealRMT-DecisionTrees\run\results\tree\tree_d4_b1.json | ||||
| Running depth=4 bits=3 | ||||
| dataset size: 7681108 | ||||
| train accuracy (before nudging): 0.7785798611346175 | ||||
| nudging enabled, removed bottom 3 bit(s) per threshold | ||||
| train accuracy (after  nudging): 0.7764365505601536 | ||||
| Wrote tree to C:\Users\jaipa\CS\cs216\IdealRMT-DecisionTrees\run\results\tree\tree_d4_b3.json | ||||
| Running depth=5 bits=0 | ||||
| dataset size: 7681108 | ||||
| train accuracy (before nudging): 0.8410252791654538 | ||||
| nudging enabled, removed bottom 0 bit(s) per threshold | ||||
| train accuracy (after  nudging): 0.8410252791654538 | ||||
| Wrote tree to C:\Users\jaipa\CS\cs216\IdealRMT-DecisionTrees\run\results\tree\tree_d5_b0.json | ||||
| Running depth=5 bits=1 | ||||
| dataset size: 7681108 | ||||
| train accuracy (before nudging): 0.8410252791654538 | ||||
| nudging enabled, removed bottom 1 bit(s) per threshold | ||||
| train accuracy (after  nudging): 0.834092425207405 | ||||
| Wrote tree to C:\Users\jaipa\CS\cs216\IdealRMT-DecisionTrees\run\results\tree\tree_d5_b1.json | ||||
| Running depth=5 bits=3 | ||||
| dataset size: 7681108 | ||||
| train accuracy (before nudging): 0.8410252791654538 | ||||
| nudging enabled, removed bottom 3 bit(s) per threshold | ||||
| train accuracy (after  nudging): 0.772544924508287 | ||||
| Wrote tree to C:\Users\jaipa\CS\cs216\IdealRMT-DecisionTrees\run\results\tree\tree_d5_b3.json | ||||
| Running depth=6 bits=0 | ||||
| dataset size: 7681108 | ||||
| train accuracy (before nudging): 0.8646269522574087 | ||||
| nudging enabled, removed bottom 0 bit(s) per threshold | ||||
| train accuracy (after  nudging): 0.8646269522574087 | ||||
| Wrote tree to C:\Users\jaipa\CS\cs216\IdealRMT-DecisionTrees\run\results\tree\tree_d6_b0.json | ||||
| Running depth=6 bits=1 | ||||
| dataset size: 7681108 | ||||
| train accuracy (before nudging): 0.8646269522574087 | ||||
| nudging enabled, removed bottom 1 bit(s) per threshold | ||||
| train accuracy (after  nudging): 0.8576925360247506 | ||||
| Wrote tree to C:\Users\jaipa\CS\cs216\IdealRMT-DecisionTrees\run\results\tree\tree_d6_b1.json | ||||
| Running depth=6 bits=3 | ||||
| dataset size: 7681108 | ||||
| train accuracy (before nudging): 0.8646269522574087 | ||||
| nudging enabled, removed bottom 3 bit(s) per threshold | ||||
| train accuracy (after  nudging): 0.794651761178205 | ||||
| Wrote tree to C:\Users\jaipa\CS\cs216\IdealRMT-DecisionTrees\run\results\tree\tree_d6_b3.json | ||||
| Running depth=7 bits=0 | ||||
| dataset size: 7681108 | ||||
| train accuracy (before nudging): 0.8806056365826389 | ||||
| nudging enabled, removed bottom 0 bit(s) per threshold | ||||
| train accuracy (after  nudging): 0.8806056365826389 | ||||
| Wrote tree to C:\Users\jaipa\CS\cs216\IdealRMT-DecisionTrees\run\results\tree\tree_d7_b0.json | ||||
| Running depth=7 bits=1 | ||||
| dataset size: 7681108 | ||||
| train accuracy (before nudging): 0.8806056365826389 | ||||
| nudging enabled, removed bottom 1 bit(s) per threshold | ||||
| train accuracy (after  nudging): 0.8736095105029118 | ||||
| Wrote tree to C:\Users\jaipa\CS\cs216\IdealRMT-DecisionTrees\run\results\tree\tree_d7_b1.json | ||||
| Running depth=7 bits=3 | ||||
| dataset size: 7681108 | ||||
| train accuracy (before nudging): 0.8806056365826389 | ||||
| nudging enabled, removed bottom 3 bit(s) per threshold | ||||
| train accuracy (after  nudging): 0.7695685309983924 | ||||
| Wrote tree to C:\Users\jaipa\CS\cs216\IdealRMT-DecisionTrees\run\results\tree\tree_d7_b3.json | ||||
| Running depth=8 bits=0 | ||||
| dataset size: 7681108 | ||||
| train accuracy (before nudging): 0.8930218140403702 | ||||
| nudging enabled, removed bottom 0 bit(s) per threshold | ||||
| train accuracy (after  nudging): 0.8930218140403702 | ||||
| Wrote tree to C:\Users\jaipa\CS\cs216\IdealRMT-DecisionTrees\run\results\tree\tree_d8_b0.json | ||||
| Running depth=8 bits=1 | ||||
| dataset size: 7681108 | ||||
| train accuracy (before nudging): 0.8930218140403702 | ||||
| nudging enabled, removed bottom 1 bit(s) per threshold | ||||
| train accuracy (after  nudging): 0.8853817704424934 | ||||
| Wrote tree to C:\Users\jaipa\CS\cs216\IdealRMT-DecisionTrees\run\results\tree\tree_d8_b1.json | ||||
| Running depth=8 bits=3 | ||||
| dataset size: 7681108 | ||||
| train accuracy (before nudging): 0.8930218140403702 | ||||
| nudging enabled, removed bottom 3 bit(s) per threshold | ||||
| train accuracy (after  nudging): 0.7773965683075931 | ||||
| Wrote tree to C:\Users\jaipa\CS\cs216\IdealRMT-DecisionTrees\run\results\tree\tree_d8_b3.json | ||||
| Running depth=9 bits=0 | ||||
| dataset size: 7681108 | ||||
| train accuracy (before nudging): 0.9065990219119429 | ||||
| nudging enabled, removed bottom 0 bit(s) per threshold | ||||
| train accuracy (after  nudging): 0.9065990219119429 | ||||
| Wrote tree to C:\Users\jaipa\CS\cs216\IdealRMT-DecisionTrees\run\results\tree\tree_d9_b0.json | ||||
| Running depth=9 bits=1 | ||||
| dataset size: 7681108 | ||||
| train accuracy (before nudging): 0.9065990219119429 | ||||
| nudging enabled, removed bottom 1 bit(s) per threshold | ||||
| train accuracy (after  nudging): 0.8971600191014109 | ||||
| Wrote tree to C:\Users\jaipa\CS\cs216\IdealRMT-DecisionTrees\run\results\tree\tree_d9_b1.json | ||||
| Running depth=9 bits=3 | ||||
| dataset size: 7681108 | ||||
| train accuracy (before nudging): 0.9065990219119429 | ||||
| nudging enabled, removed bottom 3 bit(s) per threshold | ||||
| train accuracy (after  nudging): 0.7901483744272311 | ||||
| Wrote tree to C:\Users\jaipa\CS\cs216\IdealRMT-DecisionTrees\run\results\tree\tree_d9_b3.json | ||||
| Running depth=10 bits=0 | ||||
| dataset size: 7681108 | ||||
| train accuracy (before nudging): 0.9131070673658019 | ||||
| nudging enabled, removed bottom 0 bit(s) per threshold | ||||
| train accuracy (after  nudging): 0.9131070673658019 | ||||
| Wrote tree to C:\Users\jaipa\CS\cs216\IdealRMT-DecisionTrees\run\results\tree\tree_d10_b0.json | ||||
| Running depth=10 bits=1 | ||||
| dataset size: 7681108 | ||||
| train accuracy (before nudging): 0.9131070673658019 | ||||
| nudging enabled, removed bottom 1 bit(s) per threshold | ||||
| train accuracy (after  nudging): 0.9012124292484887 | ||||
| Wrote tree to C:\Users\jaipa\CS\cs216\IdealRMT-DecisionTrees\run\results\tree\tree_d10_b1.json | ||||
| Running depth=10 bits=3 | ||||
| dataset size: 7681108 | ||||
| train accuracy (before nudging): 0.9131070673658019 | ||||
| nudging enabled, removed bottom 3 bit(s) per threshold | ||||
| train accuracy (after  nudging): 0.7823837394292594 | ||||
| Wrote tree to C:\Users\jaipa\CS\cs216\IdealRMT-DecisionTrees\run\results\tree\tree_d10_b3.json | ||||
| Running depth=11 bits=0 | ||||
| dataset size: 7681108 | ||||
| train accuracy (before nudging): 0.9167131877328115 | ||||
| nudging enabled, removed bottom 0 bit(s) per threshold | ||||
| train accuracy (after  nudging): 0.9167131877328115 | ||||
| Wrote tree to C:\Users\jaipa\CS\cs216\IdealRMT-DecisionTrees\run\results\tree\tree_d11_b0.json | ||||
| Running depth=11 bits=1 | ||||
| dataset size: 7681108 | ||||
| train accuracy (before nudging): 0.9167131877328115 | ||||
| nudging enabled, removed bottom 1 bit(s) per threshold | ||||
| train accuracy (after  nudging): 0.9033505322409215 | ||||
| Wrote tree to C:\Users\jaipa\CS\cs216\IdealRMT-DecisionTrees\run\results\tree\tree_d11_b1.json | ||||
| Running depth=11 bits=3 | ||||
| dataset size: 7681108 | ||||
| train accuracy (before nudging): 0.9167131877328115 | ||||
| nudging enabled, removed bottom 3 bit(s) per threshold | ||||
| train accuracy (after  nudging): 0.7834850128392935 | ||||
| Wrote tree to C:\Users\jaipa\CS\cs216\IdealRMT-DecisionTrees\run\results\tree\tree_d11_b3.json | ||||
| Running depth=12 bits=0 | ||||
| dataset size: 7681108 | ||||
| train accuracy (before nudging): 0.9190772997853955 | ||||
| nudging enabled, removed bottom 0 bit(s) per threshold | ||||
| train accuracy (after  nudging): 0.9190772997853955 | ||||
| Wrote tree to C:\Users\jaipa\CS\cs216\IdealRMT-DecisionTrees\run\results\tree\tree_d12_b0.json | ||||
| Running depth=12 bits=1 | ||||
| dataset size: 7681108 | ||||
| train accuracy (before nudging): 0.9190772997853955 | ||||
| nudging enabled, removed bottom 1 bit(s) per threshold | ||||
| train accuracy (after  nudging): 0.9050692946902973 | ||||
| Wrote tree to C:\Users\jaipa\CS\cs216\IdealRMT-DecisionTrees\run\results\tree\tree_d12_b1.json | ||||
| Running depth=12 bits=3 | ||||
| dataset size: 7681108 | ||||
| train accuracy (before nudging): 0.9190772997853955 | ||||
| nudging enabled, removed bottom 3 bit(s) per threshold | ||||
| train accuracy (after  nudging): 0.7733082258445005 | ||||
| Wrote tree to C:\Users\jaipa\CS\cs216\IdealRMT-DecisionTrees\run\results\tree\tree_d12_b3.json | ||||
| Running depth=13 bits=0 | ||||
| dataset size: 7681108 | ||||
| train accuracy (before nudging): 0.9210431620021486 | ||||
| nudging enabled, removed bottom 0 bit(s) per threshold | ||||
| train accuracy (after  nudging): 0.9210431620021486 | ||||
| Wrote tree to C:\Users\jaipa\CS\cs216\IdealRMT-DecisionTrees\run\results\tree\tree_d13_b0.json | ||||
| Running depth=13 bits=1 | ||||
| dataset size: 7681108 | ||||
| train accuracy (before nudging): 0.9210431620021486 | ||||
| nudging enabled, removed bottom 1 bit(s) per threshold | ||||
| train accuracy (after  nudging): 0.9069113466442602 | ||||
| Wrote tree to C:\Users\jaipa\CS\cs216\IdealRMT-DecisionTrees\run\results\tree\tree_d13_b1.json | ||||
| Running depth=13 bits=3 | ||||
| dataset size: 7681108 | ||||
| train accuracy (before nudging): 0.9210431620021486 | ||||
| nudging enabled, removed bottom 3 bit(s) per threshold | ||||
| train accuracy (after  nudging): 0.7656775558942799 | ||||
| Wrote tree to C:\Users\jaipa\CS\cs216\IdealRMT-DecisionTrees\run\results\tree\tree_d13_b3.json | ||||
| Running depth=14 bits=0 | ||||
| dataset size: 7681108 | ||||
| train accuracy (before nudging): 0.9232170671210456 | ||||
| nudging enabled, removed bottom 0 bit(s) per threshold | ||||
| train accuracy (after  nudging): 0.9232170671210456 | ||||
| Wrote tree to C:\Users\jaipa\CS\cs216\IdealRMT-DecisionTrees\run\results\tree\tree_d14_b0.json | ||||
| Running depth=14 bits=1 | ||||
| dataset size: 7681108 | ||||
| train accuracy (before nudging): 0.9232169369314948 | ||||
| nudging enabled, removed bottom 1 bit(s) per threshold | ||||
| train accuracy (after  nudging): 0.9071005120615411 | ||||
| Wrote tree to C:\Users\jaipa\CS\cs216\IdealRMT-DecisionTrees\run\results\tree\tree_d14_b1.json | ||||
| Running depth=14 bits=3 | ||||
| dataset size: 7681108 | ||||
| train accuracy (before nudging): 0.9232170671210456 | ||||
| nudging enabled, removed bottom 3 bit(s) per threshold | ||||
| train accuracy (after  nudging): 0.7649352150757417 | ||||
| Wrote tree to C:\Users\jaipa\CS\cs216\IdealRMT-DecisionTrees\run\results\tree\tree_d14_b3.json | ||||
| Running depth=15 bits=0 | ||||
| dataset size: 7681108 | ||||
| train accuracy (before nudging): 0.9249752770043072 | ||||
| nudging enabled, removed bottom 0 bit(s) per threshold | ||||
| train accuracy (after  nudging): 0.9249752770043072 | ||||
| Wrote tree to C:\Users\jaipa\CS\cs216\IdealRMT-DecisionTrees\run\results\tree\tree_d15_b0.json | ||||
| Running depth=15 bits=1 | ||||
| dataset size: 7681108 | ||||
| train accuracy (before nudging): 0.9249752770043072 | ||||
| nudging enabled, removed bottom 1 bit(s) per threshold | ||||
| train accuracy (after  nudging): 0.908089692268355 | ||||
| Wrote tree to C:\Users\jaipa\CS\cs216\IdealRMT-DecisionTrees\run\results\tree\tree_d15_b1.json | ||||
| Running depth=15 bits=3 | ||||
| dataset size: 7681108 | ||||
| train accuracy (before nudging): 0.9249752770043072 | ||||
| nudging enabled, removed bottom 3 bit(s) per threshold | ||||
| train accuracy (after  nudging): 0.762985496363285 | ||||
| Wrote tree to C:\Users\jaipa\CS\cs216\IdealRMT-DecisionTrees\run\results\tree\tree_d15_b3.json | ||||
| All runs complete | ||||
| Press any key to continue . . .  | ||||
							
								
								
									
										173
									
								
								run/tree_compress.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										173
									
								
								run/tree_compress.py
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,173 @@ | ||||
| #!/usr/bin/env python3 | ||||
| """Batch‑compress decision‑tree JSON files. | ||||
|  | ||||
| This script preserves the original logic but loops over every *.json file | ||||
| in results/tree and drops a corresponding compressed file in | ||||
| results/compressed_tree. | ||||
|  | ||||
| Example: | ||||
|     $ python tree_compress.py | ||||
| """ | ||||
|  | ||||
| from __future__ import annotations | ||||
|  | ||||
| import json | ||||
| import math | ||||
| import os | ||||
| from collections import defaultdict | ||||
| from pathlib import Path | ||||
|  | ||||
# Trees are read from results/tree and the compressed versions are written to
# results/compressed_tree; both are relative to the working directory.
INPUT_DIR = Path("results", "tree")
OUTPUT_DIR = Path("results", "compressed_tree")
# Create the output directory (with any missing parents) up front; an
# already-existing directory is not an error.
OUTPUT_DIR.mkdir(exist_ok=True, parents=True)
|  | ||||
|  | ||||
class SetEncoder(json.JSONEncoder):
    """JSON encoder that serialises ``set``/``frozenset`` values as arrays.

    Members are emitted in sorted order so that repeated runs over the same
    input produce identical files (set iteration order is arbitrary and, for
    strings, changes between interpreter runs).  If the members cannot be
    ordered against each other, the set's own iteration order is used.
    """

    def default(self, obj):  # type: ignore[override]
        """Return a JSON-serialisable stand-in for ``obj``.

        Sets become lists; every other type is delegated to the base class,
        which raises ``TypeError`` for unsupported objects.
        """
        if isinstance(obj, (set, frozenset)):
            try:
                return sorted(obj)
            except TypeError:
                # Mixed, non-comparable members: fall back to arbitrary order
                # rather than failing the whole dump.
                return list(obj)
        return super().default(obj)
|  | ||||
|  | ||||
def is_in_range(x: int, lower: int | None, upper: int | None) -> bool:  # noqa: N803
    """Report whether ``x`` lies in the half-open interval ``(lower, upper]``.

    A bound of ``None`` leaves that side of the interval unbounded, so
    ``is_in_range(x, None, None)`` is always ``True``.
    """
    above_lower = lower is None or x > lower
    within_upper = upper is None or x <= upper
    return above_lower and within_upper
|  | ||||
|  | ||||
def _compress_tree(tree: dict) -> dict:
    """Collapse a decision tree's paths into per-feature range layers.

    ``tree["paths"]`` is a list of paths, each with a ``"classification"`` and
    a list of ``"conditions"`` (``"feature"``, ``"operation"``, ``"value"``).
    Paths are identified by their index in that list.

    Returns a dict with:
      * ``"paths"``: the list of path ids,
      * ``"classes"``: ``tree["classes"]`` unchanged,
      * ``"layers"``: for every feature, the consecutive ranges
        ``(min, max]`` delimited by that feature's thresholds (``None`` marks
        an open end), each with the set of path ids and the set of classes
        whose constraints on that feature cover the range,
      * ``"path_to_class"``: path id -> classification.
    """
    paths = tree["paths"]

    # Round every threshold down to an integer; the same rounding is applied
    # to "<=" and ">" conditions.
    rounded = [
        [
            (cond["feature"], cond["operation"], math.floor(cond["value"]))
            for cond in path["conditions"]
        ]
        for path in paths
    ]

    # Distinct thresholds per feature, sorted ascending.  Features keep the
    # order in which they are first encountered.
    seen: dict[str, set[int]] = defaultdict(set)
    for conditions in rounded:
        for feature, _, value in conditions:
            seen[feature].add(value)
    breakpoints: dict[str, list[int]] = {
        feature: sorted(values) for feature, values in seen.items()
    }

    # One bucket per range: n thresholds delimit n + 1 ranges.
    buckets_id: dict[str, list[set]] = {
        feature: [set() for _ in range(len(points) + 1)]
        for feature, points in breakpoints.items()
    }
    buckets_class: dict[str, list[set]] = {
        feature: [set() for _ in range(len(points) + 1)]
        for feature, points in breakpoints.items()
    }

    for pid, (path, conditions) in enumerate(zip(paths, rounded)):
        # Tightest (min, max] interval this path imposes on each feature;
        # None means the path does not bound that side.
        bounds: dict[str, list] = {feature: [None, None] for feature in breakpoints}
        for feature, operation, value in conditions:
            lower, upper = bounds[feature]
            if operation == "<=":
                bounds[feature][1] = value if upper is None else min(upper, value)
            elif operation == ">":
                bounds[feature][0] = value if lower is None else max(lower, value)

        cls = path["classification"]
        for feature, (lower, upper) in bounds.items():
            points = breakpoints[feature]
            # Each range is probed at its upper threshold; the final,
            # open-ended range is probed just above the largest threshold.
            probes = points + [points[-1] + 1]
            for idx, probe in enumerate(probes):
                if is_in_range(probe, lower, upper):
                    buckets_id[feature][idx].add(pid)
                    buckets_class[feature][idx].add(cls)

    # Pair every bucket with the thresholds that delimit it.
    layers: dict[str, list[dict[str, object]]] = {}
    for feature, points in breakpoints.items():
        lowers = [None, *points]
        uppers = [*points, None]
        layers[feature] = [
            {"min": lower, "max": upper, "paths": ids, "classes": classes}
            for lower, upper, ids, classes in zip(
                lowers, uppers, buckets_id[feature], buckets_class[feature]
            )
        ]

    return {
        "paths": list(range(len(paths))),
        "classes": tree["classes"],
        "layers": layers,
        "path_to_class": {
            pid: path["classification"] for pid, path in enumerate(paths)
        },
    }


# Compress every tree found in INPUT_DIR and write the result to OUTPUT_DIR
# under the same file name with "tree" replaced by "compressed_tree".
for tree_path in INPUT_DIR.glob("*.json"):
    with tree_path.open() as f:
        tree = json.load(f)

    compressed_tree = _compress_tree(tree)

    out_path = OUTPUT_DIR / tree_path.name.replace("tree", "compressed_tree")
    with out_path.open("w") as f_out:
        json.dump(compressed_tree, f_out, indent=4, cls=SetEncoder)
							
								
								
									
										279
									
								
								run/tree_to_rmt.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										279
									
								
								run/tree_to_rmt.py
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,279 @@ | ||||
| #!/usr/bin/env python3 | ||||
| """Range‑to‑Prefix evaluation tool | ||||
|  | ||||
| This script keeps the original logic intact while letting you choose | ||||
| which expansion strategy to run via a command‑line flag. | ||||
|  | ||||
| Example: | ||||
|     $ python tree_to_rmt.py --mode naive | ||||
|     $ python tree_to_rmt.py --mode priority --input mytree.json --output result.json | ||||
| """ | ||||
|  | ||||
| import argparse | ||||
| import json | ||||
| import math | ||||
| import sys | ||||
| from pathlib import Path | ||||
|  | ||||
| # --------------------------------------------------------------------------- | ||||
| # Static configuration | ||||
| # --------------------------------------------------------------------------- | ||||
| # Bit width of each match field, keyed by layer (feature) name; the strategy | ||||
| # functions below use it as the prefix length and as the TCAM key size. | ||||
| field_width = { | ||||
|     "src": 16, | ||||
|     "dst": 16, | ||||
|     "protocol": 8, | ||||
| } | ||||
|  | ||||
| # --------------------------------------------------------------------------- | ||||
| # Helper routines (unchanged) | ||||
| # --------------------------------------------------------------------------- | ||||
|  | ||||
def int_to_bin(i, width):
    """Return the binary digits of ``i``, left-padded with zeros to ``width``.

    A value that needs more than ``width`` digits is returned in full; it is
    neither padded nor truncated.
    """
    digits = bin(i)[2:]  # strip the leading "0b"
    return digits.rjust(width, "0")
|  | ||||
|  | ||||
def increment_dc(pfx):
    """Widen a ternary prefix by one bit.

    The character immediately before the first ``*`` (or the final character
    when the prefix holds no ``*`` yet) is replaced by ``*``.
    """
    first_star = pfx.find("*")
    boundary = len(pfx) if first_star == -1 else first_star
    target = boundary - 1
    return f"{pfx[:target]}*{pfx[target + 1:]}"
|  | ||||
|  | ||||
def can_merge(pfx_a, pfx_b):
    """Tell whether two ternary prefixes are siblings that can be combined.

    After dropping the ``*`` wildcards, the two prefixes must agree on every
    bit except the last one, and must disagree on that last bit.
    """
    bits_a = pfx_a.replace("*", "")
    bits_b = pfx_b.replace("*", "")
    if bits_a[:-1] != bits_b[:-1]:
        return False
    return bits_a[-1] != bits_b[-1]
|  | ||||
|  | ||||
def merge(pfx_a, prefixes):
    """Fold ``pfx_a`` into the last entry of ``prefixes`` and cascade merges.

    The last entry is overwritten with the widened form of ``pfx_a``. The list
    is then walked from its tail towards its head; whenever two neighbouring
    entries can be merged, the trailing entry is dropped and the remaining one
    is widened by one bit. ``prefixes`` is modified in place.
    """
    prefixes[-1] = increment_dc(pfx_a)

    # The index sequence is fixed before any entry is popped.
    for left in reversed(range(len(prefixes) - 1)):
        if not can_merge(prefixes[left], prefixes[left + 1]):
            continue
        prefixes.pop()
        prefixes[left] = increment_dc(prefixes[left])
|  | ||||
|  | ||||
def convert_range(lower, upper, width):
    """Expand the inclusive range ``lower..upper`` into ternary prefixes.

    ``upper`` is clamped to the largest value representable in ``width`` bits.
    Every value in the range is visited in ascending order and either merged
    into the prefix list built so far or appended as a new exact-match entry.
    """
    prefixes = [int_to_bin(lower, width)]
    last_value = min(upper, 2 ** width - 1)

    for value in range(lower + 1, last_value + 1):
        candidate = int_to_bin(value, width)
        if not can_merge(candidate, prefixes[-1]):
            prefixes.append(candidate)
            continue
        merge(candidate, prefixes)

    return prefixes
|  | ||||
| # --------------------------------------------------------------------------- | ||||
| # RMT construction strategies (logic preserved) | ||||
| # --------------------------------------------------------------------------- | ||||
|  | ||||
def worst_case_rmt(tree):
    """Build an RMT layout assuming worst-case prefix expansion for every range.

    Args:
        tree: compressed tree mapping with a ``"layers"`` entry (layer name ->
            list of ranges) and a ``"classes"`` entry (collection of classes).

    Returns:
        A ``(rmt, tcam_bits, ram_bits)`` tuple: the list of table descriptions
        (one ternary table and one exact-match table per layer) and the total
        TCAM and RAM bit counts.
    """
    # Read the inputs from the argument instead of module-level globals, so the
    # function also works when it is called without going through main().
    layers = tree["layers"]
    classes = tree["classes"]

    rmt = []
    tcam_bits = 0
    ram_bits = 0

    for step, (layer, ranges) in enumerate(layers.items()):
        num_ranges = len(ranges)
        prefix_width = field_width[layer]
        # assume that each range requires all of 2*k prefixes when performing prefix expansion
        # therefore there are 2*k * R for R ranges and width k
        num_prefixes = 2 * prefix_width * num_ranges

        tcam = {
            "id": f"{layer}_range",
            "step": step,
            "match": "ternary",
            "entries": num_prefixes,
            "key_size": prefix_width,
        }
        tcam_bits += num_prefixes * prefix_width

        # assume basic pointer reuse for metadata storage
        ram = {
            "id": f"{layer}_meta",
            "step": step,
            "match": "exact",
            "method": "index",
            # log2(0) is undefined; a layer without ranges needs no index bits
            "key_size": math.ceil(math.log2(num_ranges)) if num_ranges else 0,
            "data_size": len(classes),
        }
        ram_bits += num_ranges * len(classes)

        rmt.append(tcam)
        rmt.append(ram)

    return rmt, tcam_bits, ram_bits
|  | ||||
|  | ||||
def naive_rmt(tree):
    """Build an RMT layout using naive range-to-prefix expansion.

    Every range of every layer is expanded with ``convert_range``. The range
    dictionaries inside ``tree`` are updated in place: open bounds are replaced
    by concrete numbers and a ``"prefixes"`` list is attached.

    Args:
        tree: compressed tree mapping with a ``"layers"`` entry (layer name ->
            list of range dicts with ``"min"``/``"max"``) and a ``"classes"``
            entry.

    Returns:
        A ``(rmt, tcam_bits, ram_bits)`` tuple: the list of table descriptions
        and the total TCAM and RAM bit counts.
    """
    # Read the inputs from the argument instead of module-level globals, so the
    # function also works when it is called without going through main().
    layers = tree["layers"]
    classes = tree["classes"]

    rmt = []
    tcam_bits = 0
    ram_bits = 0

    for step, (layer, ranges) in enumerate(layers.items()):
        num_prefixes = 0
        prefix_width = field_width[layer]
        # for each range in the layer, convert the ranges to prefixes using naive range expansion
        for r in ranges:
            # Two independent checks: a range that is open on both sides must
            # get both bounds filled in, otherwise "max" stays None.
            if r["min"] is None:
                r["min"] = 0
            if r["max"] is None:
                r["max"] = 2 ** prefix_width
            prefixes = convert_range(r["min"], r["max"], prefix_width)
            r["prefixes"] = prefixes
            num_prefixes += len(prefixes)
        tcam_bits += num_prefixes * prefix_width

        tcam = {
            "id": f"{layer}_range",
            "step": step,
            "match": "ternary",
            "entries": num_prefixes,
            "key_size": prefix_width,
            "ranges": ranges,
        }

        num_ranges = len(ranges)
        # assume no pointer reuse for metadata storage
        ram = {
            "id": f"{layer}_meta",
            "step": step,
            "match": "exact",
            "method": "index",
            # log2(0) is undefined; a layer without ranges needs no index bits
            "key_size": math.ceil(math.log2(num_ranges)) if num_ranges else 0,
            "data_size": len(classes),
        }
        ram_bits += num_ranges * len(classes)

        rmt.append(tcam)
        rmt.append(ram)

    return rmt, tcam_bits, ram_bits
|  | ||||
|  | ||||
def priority_aware(tree):
    """Build an RMT layout, choosing the shorter of two expansions per range.

    For each range both the regular expansion (``min..max``) and the expansion
    starting at zero (``0..max``) are computed, and the one with fewer prefixes
    is kept (ties go to the regular one). The choice is recorded on the range
    as ``"prefix_type"`` (``"exact"`` or ``"zero"``). The range dictionaries
    inside ``tree`` are updated in place.

    Args:
        tree: compressed tree mapping with a ``"layers"`` entry (layer name ->
            list of range dicts with ``"min"``/``"max"``) and a ``"classes"``
            entry.

    Returns:
        A ``(rmt, tcam_bits, ram_bits)`` tuple: the list of table descriptions
        and the total TCAM and RAM bit counts.
    """
    # Read the inputs from the argument instead of module-level globals, so the
    # function also works when it is called without going through main().
    layers = tree["layers"]
    classes = tree["classes"]

    rmt = []
    tcam_bits = 0
    ram_bits = 0

    for step, (layer, ranges) in enumerate(layers.items()):
        num_prefixes = 0
        prefix_width = field_width[layer]
        # for each range, run the regular prefix expansion, and also the prefix expansion setting the minimum to 0
        # then check which set of prefixes would be better
        # we will assume the ranges are already disjoint and in the correct order
        for r in ranges:
            # Two independent checks: a range that is open on both sides must
            # get both bounds filled in, otherwise "max" stays None.
            if r["min"] is None:
                r["min"] = 0
            if r["max"] is None:
                r["max"] = 2 ** prefix_width
            regular_prefixes = convert_range(r["min"], r["max"], prefix_width)
            zero_start_prefixes = convert_range(0, r["max"], prefix_width)

            if len(regular_prefixes) <= len(zero_start_prefixes):
                pfx_type = "exact"
                prefixes = regular_prefixes
            else:
                pfx_type = "zero"
                prefixes = zero_start_prefixes

            r["prefixes"] = prefixes
            r["prefix_type"] = pfx_type
            num_prefixes += len(prefixes)
        tcam_bits += num_prefixes * prefix_width

        tcam = {
            "id": f"{layer}_range",
            "step": step,
            "match": "ternary",
            "entries": num_prefixes,
            "key_size": prefix_width,
            "ranges": ranges,
        }

        num_ranges = len(ranges)
        # assume no pointer reuse for metadata storage
        ram = {
            "id": f"{layer}_meta",
            "step": step,
            "match": "exact",
            "method": "index",
            # log2(0) is undefined; a layer without ranges needs no index bits
            "key_size": math.ceil(math.log2(num_ranges)) if num_ranges else 0,
            "data_size": len(classes),
        }
        ram_bits += num_ranges * len(classes)

        rmt.append(tcam)
        rmt.append(ram)

    return rmt, tcam_bits, ram_bits
|  | ||||
| # --------------------------------------------------------------------------- | ||||
| # Main entry point | ||||
| # --------------------------------------------------------------------------- | ||||
|  | ||||
def parse_args() -> argparse.Namespace:
    """Parse the command line: ``--mode``, ``--input`` and ``--output``."""
    cli = argparse.ArgumentParser(
        description="Evaluate RMT memory usage for different range‑to‑prefix strategies.",
    )
    options = (
        ("--mode", {"choices": ["worst", "naive", "priority"], "default": "worst", "help": "Strategy to use"}),
        ("--input", {"default": "compressed_tree.json", "help": "Input tree JSON file"}),
        ("--output", {"default": None, "help": "Output RMT JSON file (defaults to <mode>_rmt.json)"}),
    )
    for flag, spec in options:
        cli.add_argument(flag, **spec)
    return cli.parse_args()
|  | ||||
|  | ||||
def main() -> None:
    """Load the input tree, run the selected strategy and write the RMT JSON.

    Exits with an error message when the input file does not exist. Prints the
    output file name and the TCAM / RAM bit totals when done.
    """
    # The strategy functions refer to these module-level names, so they are
    # published here before any strategy is invoked.
    global layers, classes

    args = parse_args()

    try:
        with open(args.input) as f:
            tree = json.load(f)
    except FileNotFoundError:
        sys.exit(f"Input file '{args.input}' not found.")

    layers = tree["layers"]
    classes = tree["classes"]

    # mode -> (strategy function, default output file); anything else is "priority"
    strategies = {
        "worst": (worst_case_rmt, "worst_case_rmt.json"),
        "naive": (naive_rmt, "naive_rmt.json"),
    }
    build, default_out = strategies.get(args.mode, (priority_aware, "priority_aware.json"))
    rmt, tcam_bits, ram_bits = build(tree)

    out_file = args.output or default_out

    with open(out_file, "w") as f:
        json.dump(rmt, f, indent=4)

    #! command python3 ideal-rmt-simulator/sim.py {out_file}
    for line in (
        f"Output written to {out_file}",
        f"TCAM bits: {tcam_bits}",
        f"RAM bits:  {ram_bits}",
    ):
        print(line)


if __name__ == "__main__":
    main()
							
								
								
									
										44
									
								
								sanity_check/csvdiff.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										44
									
								
								sanity_check/csvdiff.py
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,44 @@ | ||||
| #!/usr/bin/env python3 | ||||
| """ | ||||
| csvdiff.py file1.csv file2.csv | ||||
| Streams both files; prints the first differing line or | ||||
| ‘No differences found’. Uses O(1) memory. | ||||
| """ | ||||
|  | ||||
| import sys | ||||
| from itertools import zip_longest | ||||
| from pathlib import Path | ||||
|  | ||||
def open_checked(p: str):
    """Echo ``p``, open it for reading and return ``(file_object, Path)``.

    The file is opened with ``newline=""``. The process exits with an error
    message when the file does not exist.
    """
    print(p)
    path = Path(p)
    try:
        handle = path.open("r", newline="")
    except FileNotFoundError:
        sys.exit(f"Error: {path} not found")
    return handle, path
|  | ||||
def human(n: int) -> str:
    """Format ``n`` with comma thousands separators."""
    return format(n, ",")
|  | ||||
def main(a_path: str, b_path: str) -> None:
    """Compare two files line by line and report the first difference.

    Both files are streamed in lockstep. On the first mismatch the line number
    is printed, followed either by which file ended early or by the two
    differing lines. If no mismatch is found, 'No differences found' is printed.
    """
    fa, a = open_checked(a_path)
    fb, b = open_checked(b_path)

    with fa, fb:
        line_no = 0
        for ra, rb in zip_longest(fa, fb):
            line_no += 1
            if ra == rb:
                continue

            print(f"Files differ at line {human(line_no)}")
            if ra is None:
                print(f"{a} ended early")
            elif rb is None:
                print(f"{b} ended early")
            else:
                print(f"{a}: {ra.rstrip()}")
                print(f"{b}: {rb.rstrip()}")
            return
    print("No differences found")


if __name__ == "__main__":
    if len(sys.argv) != 3:
        sys.exit("Usage: csvdiff.py file1.csv file2.csv")
    first_path, second_path = sys.argv[1], sys.argv[2]
    main(first_path, second_path)
							
								
								
									
										600
									
								
								sanity_check/data_visualization.ipynb
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										600
									
								
								sanity_check/data_visualization.ipynb
									
									
									
									
									
										Normal file
									
								
							
										
											
												File diff suppressed because one or more lines are too long
											
										
									
								
							
							
								
								
									
										206
									
								
								sanity_check/diversity_metrics.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										206
									
								
								sanity_check/diversity_metrics.py
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,206 @@ | ||||
| #!/usr/bin/env python3 | ||||
| """diversity_metrics.py (fast version) | ||||
|  | ||||
| Estimate how much diversity each CSV adds without building a giant in‑memory | ||||
| DataFrame.  Designed for IoT packet logs with millions of rows. | ||||
|  | ||||
| Quick summary printed as a GitHub‑style table (requires *tabulate*; falls back | ||||
| to pandas plain text). | ||||
|  | ||||
| Usage | ||||
| ----- | ||||
| python diversity_metrics.py path/to/processed_dir [-r] [--sample 50000] | ||||
|  | ||||
| Metrics | ||||
| ------- | ||||
| ΔEntropy  : change in Shannon entropy of *classification* counts | ||||
| ΔGini     : change in Gini impurity of the same counts | ||||
| χ² p      : Pearson χ² p‑value old vs new classification counts | ||||
| Jaccard   : similarity of unique (src,dst) pairs (0 → new pairs, 1 → no new) | ||||
| KS src p  : Kolmogorov–Smirnov p‑value, source‑port dist (uses sampling) | ||||
| KS dst p  : Kolmogorov–Smirnov p‑value, dest‑port  dist (uses sampling) | ||||
|  | ||||
| Speed tricks | ||||
| ------------ | ||||
| * No growing DataFrame; we keep Counters / sets / lists. | ||||
| * Ports for KS are *sampled* (default 50 k) to bound cost. | ||||
| * (src,dst) pairs are hashed to a 32‑bit int to reduce set overhead. | ||||
| * pandas reads via **pyarrow** engine when available. | ||||
| """ | ||||
|  | ||||
| import argparse | ||||
| from pathlib import Path | ||||
| from collections import Counter | ||||
| from typing import List, Set | ||||
|  | ||||
| import numpy as np | ||||
| import pandas as pd | ||||
| from scipy.stats import chi2_contingency, ks_2samp, entropy | ||||
|  | ||||
| try: | ||||
|     from tabulate import tabulate | ||||
|     _USE_TABULATE = True | ||||
| except ImportError: | ||||
|     _USE_TABULATE = False | ||||
|  | ||||
| # ----------------------------------------------------------------------------- | ||||
| # Helper metrics | ||||
| # ----------------------------------------------------------------------------- | ||||
|  | ||||
def shannon(counts: Counter) -> float:
    """Return the base-2 Shannon entropy of the distribution given by ``counts``.

    An empty (or all-zero) counter yields 0.0.
    """
    total = sum(counts.values())
    if total == 0:
        return 0.0
    probabilities = np.array(list(counts.values()), dtype=float) / total
    return entropy(probabilities, base=2)
|  | ||||
|  | ||||
def gini(counts: Counter) -> float:
    """Return the Gini impurity of the distribution given by ``counts``.

    An empty (or all-zero) counter yields 0.0.
    """
    total = sum(counts.values())
    if total == 0:
        return 0.0
    shares = [n / total for n in counts.values()]
    return 1.0 - sum(share ** 2 for share in shares)
|  | ||||
|  | ||||
def jaccard(a: Set[int], b: Set[int]) -> float:
    """Return the Jaccard similarity ``|a & b| / |a | b|`` of two sets.

    Two empty sets are treated as identical and give 1.0.
    """
    union = a | b
    if not union:
        return 1.0
    return len(a & b) / len(union)
|  | ||||
| # ----------------------------------------------------------------------------- | ||||
| # Core analysis | ||||
| # ----------------------------------------------------------------------------- | ||||
|  | ||||
def _sample_ports(values: np.ndarray, sample_size: int) -> np.ndarray:
    """Return at most ``sample_size`` elements of ``values``, drawn without replacement."""
    if values.size > sample_size:
        return np.random.choice(values, sample_size, replace=False)
    return values


def analyse(csv_files: List[Path], sample_size: int):
    """Return list of dicts with diversity metrics for each added file.

    Files are processed in the given order. Each file is compared against the
    cumulative state built from all files before it, then folded into that
    state.

    Args:
        csv_files: CSV paths; each must provide the columns ``protocl``,
            ``src``, ``dst`` and ``classfication`` (spelled as in the data).
        sample_size: maximum number of port values per side used in the KS tests.

    Returns:
        One dict per file with the keys ``File``, ``Rows``, ``ΔEntropy``,
        ``ΔGini``, ``χ² p``, ``Jaccard``, ``KS src p`` and ``KS dst p``.
        p-values that cannot be computed (first file, or a file without rows)
        are reported as ``"NA"``.
    """
    # pyarrow parsing is only a speed-up: use it on pandas >= 2 when pyarrow
    # can actually be imported, otherwise fall back to the default C parser.
    # The major version is compared numerically ("10" must not sort below "2").
    engine = "c"
    try:
        if int(pd.__version__.split(".", 1)[0]) >= 2:
            import pyarrow  # noqa: F401

            engine = "pyarrow"
    except (ImportError, ValueError):
        engine = "c"

    # cumulative state (no big DataFrame!)
    class_counter: Counter = Counter()
    pair_hashes: Set[int] = set()
    src_list: List[int] = []
    dst_list: List[int] = []

    rows = []

    for csv_path in csv_files:
        df = pd.read_csv(
            csv_path,
            engine=engine,  # fast parse
            usecols=["protocl", "src", "dst", "classfication"],
            dtype={
                "protocl": "uint16",
                "protocol": "uint16",
                "src": "uint16",
                "dst": "uint16",
            },
        )
        # normalise column names
        df.rename(columns={"protocl": "protocol", "classfication": "classification"}, inplace=True)
        has_rows = len(df) > 0

        # snapshot previous state
        prev_class = class_counter.copy()
        prev_pairs = pair_hashes.copy()
        prev_src = np.asarray(src_list, dtype=np.uint16)
        prev_dst = np.asarray(dst_list, dtype=np.uint16)

        # --- update cumulative structures ------------------------------------
        # computed once and reused for the χ² table below
        new_counts = df["classification"].value_counts()
        class_counter.update(new_counts.to_dict())

        # hash (src,dst) into 32‑bit int to save memory
        pair_ids = (df["src"].to_numpy(dtype=np.uint32) << np.uint32(16)) | \
            df["dst"].to_numpy(dtype=np.uint32)
        # Record this file's pairs; without this the cumulative set stays empty
        # and the Jaccard column is 1.0 for every file.
        pair_hashes.update(pair_ids.tolist())

        # extend port lists (keep small ints)
        src_list.extend(df["src"].tolist())
        dst_list.extend(df["dst"].tolist())

        # --- metrics ----------------------------------------------------------
        # χ² classification (needs earlier data and at least one new row,
        # otherwise the contingency table has an all-zero row)
        chi_p = np.nan
        if prev_class and has_rows:
            all_classes = list(set(prev_class) | set(new_counts.index))
            old = [prev_class.get(c, 0) for c in all_classes]
            new = [new_counts.get(c, 0) for c in all_classes]
            _, chi_p, _, _ = chi2_contingency([old, new])

        # entropy & gini deltas
        delta_entropy = shannon(class_counter) - shannon(prev_class)
        delta_gini = gini(class_counter) - gini(prev_class)

        # Jaccard on pair hashes: pairs seen before vs pairs seen including this file
        jc = jaccard(prev_pairs, pair_hashes)

        # KS tests on sampled ports (both samples must be non-empty)
        ks_src_p = ks_dst_p = np.nan
        if prev_src.size and has_rows:
            new_src = df["src"].to_numpy(dtype=np.uint16)
            new_dst = df["dst"].to_numpy(dtype=np.uint16)

            ks_src_p = ks_2samp(
                _sample_ports(prev_src, sample_size),
                _sample_ports(new_src, sample_size),
            ).pvalue
            ks_dst_p = ks_2samp(
                _sample_ports(prev_dst, sample_size),
                _sample_ports(new_dst, sample_size),
            ).pvalue

        rows.append(
            {
                "File": csv_path.name,
                "Rows": len(df),
                "ΔEntropy": round(delta_entropy, 4),
                "ΔGini": round(delta_gini, 4),
                "χ² p": f"{chi_p:.3g}" if not np.isnan(chi_p) else "NA",
                "Jaccard": round(jc, 3),
                "KS src p": f"{ks_src_p:.3g}" if not np.isnan(ks_src_p) else "NA",
                "KS dst p": f"{ks_dst_p:.3g}" if not np.isnan(ks_dst_p) else "NA",
            }
        )
    return rows
|  | ||||
| # ----------------------------------------------------------------------------- | ||||
| # CLI | ||||
| # ----------------------------------------------------------------------------- | ||||
|  | ||||
def main():
    """Command-line entry point: collect CSVs, analyse them and print a table.

    Prints 'No CSV files found.' and returns when the directory has no match.
    The table is rendered with tabulate when it is installed, otherwise with
    pandas; a short legend follows.
    """
    cli = argparse.ArgumentParser(description="Evaluate diversity contribution of each CSV (fast version).")
    cli.add_argument("csv_dir", help="Directory containing CSV files")
    cli.add_argument("-r", "--recursive", action="store_true", help="Recursively search csv_dir")
    cli.add_argument("--sample", type=int, default=50_000, help="Sample size for KS tests (default 50k)")
    opts = cli.parse_args()

    pattern = "*.csv"
    if opts.recursive:
        pattern = "**/*.csv"

    csv_files = sorted(Path(opts.csv_dir).glob(pattern))
    if not csv_files:
        print("No CSV files found.")
        return

    table_rows = analyse(csv_files, opts.sample)

    if not _USE_TABULATE:
        print(pd.DataFrame(table_rows).to_string(index=False))
    else:
        print(tabulate(table_rows, headers="keys", tablefmt="github", floatfmt=".4f"))

    print(
        "\nLegend:\n  • p-values (χ², KS) < 0.05 → new file significantly shifts distribution (GOOD)"
        "\n  • Positive ΔEntropy or ΔGini → richer mix; near 0 → little new info"
        "\n  • Jaccard close to 0 → many unseen (src,dst) pairs; close to 1 → redundant."
    )


if __name__ == "__main__":
    main()
							
								
								
									
										14
									
								
								setup.sh
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										14
									
								
								setup.sh
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,14 @@ | ||||
| #!/usr/bin/env bash | ||||
| # Creates the directory layout: | ||||
| #   data/ | ||||
| #     tar/ | ||||
| #     pcap/ | ||||
| #     processed/ | ||||
| #     combined/ | ||||
|  | ||||
| set -euo pipefail | ||||
|  | ||||
| # Resolve the directory that contains this script, so the layout is created | ||||
| # next to the script regardless of the caller's working directory. | ||||
| root="$(cd -- "$(dirname "${BASH_SOURCE[0]}")" && pwd)" | ||||
|  | ||||
| # -p: create missing parents and do not fail when a directory already exists. | ||||
| mkdir -p "$root"/data/{tar,pcap,processed,combined} | ||||
|  | ||||
| echo "Directory structure ready under $root/data/" | ||||
		Reference in New Issue
	
	Block a user