IdealRMT-DecisionTrees/extract_all_datasets.py

#!/usr/bin/env python3
from pathlib import Path
import numpy as np
import pandas as pd
from labels import mac_to_label
from tqdm import tqdm
import os

ROOT       = Path(__file__).resolve().parent
PCAP_DIR   = ROOT / "data" / "pcap"
CSV_DIR    = ROOT / "data" / "processed"
CSV_DIR.mkdir(parents=True, exist_ok=True)

BATCH = 100_000   # packets per chunk

from scapy.all import rdpcap


def process_pcap(pcap_path: str, csv_path: str) -> None:
    all_packets = rdpcap(pcap_path)

    print("rdpcap done", flush=True)
    results = []
    for packet in tqdm(all_packets):
        size = len(packet)
        try:
            proto = packet.proto
        except AttributeError:
            proto = 0
        try:
            sport = packet.sport
            dport = packet.dport
        except AttributeError:
            sport = 0
            dport = 0

        proto = int(proto)
        sport = int(sport)
        dport = int(dport)

        if "Ether" in packet:
            eth_dst = packet["Ether"].dst
            if eth_dst in mac_to_label:
                classification = mac_to_label[eth_dst]
            else:
                classification = "other"
        else:
            classification = "other"

        metric = [proto,sport,dport,classification]
        results.append(metric)
    results = (np.array(results)).T

    # store the features in the dataframe
    dataframe = pd.DataFrame({'protocl':results[0],'src':results[1],'dst':results[2],'classfication':results[3]})
    columns = ['protocl','src','dst','classfication']

    # save the dataframe to the csv file, if not exsit, create one.
    if os.path.exists(csv_path):
        dataframe.to_csv(csv_path,index=False,sep=',',mode='a',columns = columns, header=False)
    else:
        dataframe.to_csv(csv_path,index=False,sep=',',columns = columns)

    print("Done")


def main() -> None:
    for pcap in sorted(PCAP_DIR.rglob("*.pcap")):
        rel_csv = pcap.relative_to(PCAP_DIR).with_suffix(".csv")
        csv_path = CSV_DIR / rel_csv
        if csv_path.exists():
            print(f"Skip {rel_csv} (CSV exists)")
            continue
        print(f"Processing {rel_csv}")
        csv_path.parent.mkdir(parents=True, exist_ok=True)
        process_pcap(str(pcap), str(csv_path))

if __name__ == "__main__":
    main()