Run data and code

2025-09-09 00:37:23 +00:00 · 2025-06-13 16:11:56 -07:00
parent 2ad40946d1
commit c8a0b18abf
8 changed files with 1321 additions and 0 deletions
--- a/run/tree_compress.py
+++ b/run/tree_compress.py
@@ -0,0 +1,173 @@
+#!/usr/bin/env python3
+"""Batch‑compress decision‑tree JSON files.
+
+This script preserves the original logic but loops over every *.json file
+in results/tree and drops a corresponding compressed file in
+results/compressed_tree.
+
+Example:
+    $ python tree_compress.py
+"""
+
+from __future__ import annotations
+
+import json
+import math
+import os
+from collections import defaultdict
+from pathlib import Path
+
+INPUT_DIR = Path("results/tree")
+OUTPUT_DIR = Path("results/compressed_tree")
+OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
+
+
+class SetEncoder(json.JSONEncoder):
+    def default(self, obj):  # type: ignore[override]
+        if isinstance(obj, set):
+            return list(obj)
+        return super().default(obj)
+
+
+# helper function given a range and value x returns if x is in the range
+
+def is_in_range(x: int, lower: int | None, upper: int | None) -> bool:  # noqa: N803
+    if lower is None and upper is None:
+        return True
+    if lower is None:
+        return x <= upper  # type: ignore[operator]
+    if upper is None:
+        return x > lower
+    return x <= upper and x > lower  # type: ignore[operator]
+
+
+for tree_path in INPUT_DIR.glob("*.json"):
+    with tree_path.open() as f:
+        tree = json.load(f)
+
+    paths = tree["paths"]
+
+    # First cleanup the tree by rounding the decision points to integer values
+    path_ids: set[int] = set()
+    path_classes = tree["classes"]
+
+    # assign ids and round thresholds
+    for idx, path in enumerate(paths):
+        path["id"] = idx
+        path_ids.add(idx)
+        for condition in path["conditions"]:
+            operation = condition["operation"]
+            if operation == "<=":
+                condition["value"] = math.floor(condition["value"])
+            else:
+                condition["value"] = math.floor(condition["value"])
+
+    # Find all breakpoints for each feature and create a set of disjoint ranges
+    breakpoints: dict[str, list[int]] = defaultdict(set)  # type: ignore[assignment]
+    for path in paths:
+        for condition in path["conditions"]:
+            feature = condition["feature"]
+            value = condition["value"]
+            breakpoints[feature].add(value)
+
+    # sort breakpoint lists
+    for feature in breakpoints:
+        points = list(breakpoints[feature])
+        points.sort()
+        breakpoints[feature] = points  # type: ignore[assignment]
+
+    # collapse all paths to ranges for each feature
+    for path in paths:
+        compressed: dict[str, dict[str, int | None]] = {}
+        for feature in breakpoints:
+            compressed[feature] = {"min": None, "max": None}
+
+        for condition in path["conditions"]:
+            feature = condition["feature"]
+            operation = condition["operation"]
+            value = condition["value"]
+            if operation == "<=" and compressed[feature]["max"] is None:
+                compressed[feature]["max"] = value
+            elif operation == ">" and compressed[feature]["min"] is None:
+                compressed[feature]["min"] = value
+            elif operation == "<=" and value < compressed[feature]["max"]:  # type: ignore[operator]
+                compressed[feature]["max"] = value
+            elif operation == ">" and value > compressed[feature]["min"]:  # type: ignore[operator]
+                compressed[feature]["min"] = value
+
+        path["compressed"] = compressed
+
+    # create buckets for each feature, where each is a list of sets
+    buckets_id: dict[str, list[set[int]]] = {}
+    buckets_class: dict[str, list[set[str]]] = {}
+    for feature in breakpoints:
+        num_points = len(breakpoints[feature])
+        buckets_id[feature] = [set() for _ in range(num_points + 1)]
+        buckets_class[feature] = [set() for _ in range(num_points + 1)]
+
+    # fill buckets
+    for path in paths:
+        for feature_name, feature in path["compressed"].items():
+            lower = feature["min"]
+            upper = feature["max"]
+            pid = path["id"]
+            cls = path["classification"]
+
+            for idx, bp in enumerate(breakpoints[feature_name]):
+                if is_in_range(bp, lower, upper):
+                    buckets_id[feature_name][idx].add(pid)
+                    buckets_class[feature_name][idx].add(cls)
+            # last bucket (> last breakpoint)
+            if is_in_range(bp + 1, lower, upper):
+                buckets_id[feature_name][-1].add(pid)
+                buckets_class[feature_name][-1].add(cls)
+
+    # combine breakpoints and buckets to one representation
+    compressed_layers: dict[str, list[dict[str, object]]] = defaultdict(list)
+    for feature_name in buckets_id:
+        lower = None
+        upper = breakpoints[feature_name][0]
+        compressed_layers[feature_name].append(
+            {
+                "min": lower,
+                "max": upper,
+                "paths": buckets_id[feature_name][0],
+                "classes": buckets_class[feature_name][0],
+            }
+        )
+        for i in range(1, len(buckets_id[feature_name]) - 1):
+            lower = breakpoints[feature_name][i - 1]
+            upper = breakpoints[feature_name][i]
+            compressed_layers[feature_name].append(
+                {
+                    "min": lower,
+                    "max": upper,
+                    "paths": buckets_id[feature_name][i],
+                    "classes": buckets_class[feature_name][i],
+                }
+            )
+        lower = breakpoints[feature_name][-1]
+        upper = None
+        compressed_layers[feature_name].append(
+            {
+                "min": lower,
+                "max": upper,
+                "paths": buckets_id[feature_name][-1],
+                "classes": buckets_class[feature_name][-1],
+            }
+        )
+
+    path_to_class = {path["id"]: path["classification"] for path in paths}
+
+    compressed_tree = {
+        "paths": list(path_ids),
+        "classes": path_classes,
+        "layers": compressed_layers,
+        "path_to_class": path_to_class,
+    }
+
+    out_path = OUTPUT_DIR / tree_path.name.replace("tree", "compressed_tree")
+    with out_path.open("w") as f_out:
+        json.dump(compressed_tree, f_out, indent=4, cls=SetEncoder)
+
+    # print(f"Wrote {out_path.relative_to(Path.cwd())}")