1
0
mirror of https://github.com/ltcptgeneral/cs239-caching.git synced 2025-04-01 20:33:26 +00:00

Add to db instead of creating new one

This commit is contained in:
HiccupHan 2025-02-22 19:55:26 -08:00
parent b5e6f5eb9f
commit 5dc9a99a3a
4 changed files with 9057 additions and 30 deletions

@ -1,11 +1,29 @@
from .cache import BaselineCache
from .cache import Cache
from database import get_user_profile
from collections import OrderedDict
import math
class PrefetchCache(BaselineCache):
key_relations = None
class PrefetchCache(Cache):
limit = None
cache = None
def __init__(self):
def __init__(self, limit):
super()
self.key_relations = dict()
self.limit = limit
self.cache = OrderedDict()
def __eq__(self, other):
    """Compare the cached entries with ``other``.

    The comparison is delegated to the ``self.cache`` mapping, so ``other``
    is compared against the stored entries rather than against this object.
    """
    same = self.cache == other
    return same
def __len__(self):
    """Return the number of entries currently held in ``self.cache``."""
    count = len(self.cache)
    return count
def get(self, key: str) -> str:
    """Look up ``key`` in the cache.

    On a hit the entry is moved to the end of ``self.cache`` before its value
    is returned; on a miss ``None`` is returned and the cache is unchanged.
    """
    if key not in self.cache:
        return None
    # Entries at the end are the last to be removed by popitem(last=False).
    self.cache.move_to_end(key)
    return self.cache[key]
def put(self, key: str, val: str) -> bool:
# LRU evict
@ -14,16 +32,29 @@ class PrefetchCache(BaselineCache):
self.cache.popitem(last = False)
evict = True
self.cache[key] = val
self.prefetch(key, val)
if self.prefetch(val):
evict = True
return evict
def prefetch(self, key: str, val: str) -> bool:
    """Insert the entry related to ``key`` once the cache has reached its limit.

    ``val`` is accepted but not used. The relation is looked up in
    ``self.key_relations``; its element 0 becomes the cache key and its
    element 1 the cached value. Nothing is evicted here.

    Returns True if a related entry was written, False otherwise.
    """
    cache_is_full = len(self.cache) >= self.limit
    if not (cache_is_full and key in self.key_relations):
        return False
    relation = self.key_relations[key]
    related_val = relation[1]
    self.cache[relation[0]] = related_val
    return True
def prefetch(self, profile) -> bool:
    """Prefetch the profiles of the first few friends listed in ``profile``.

    At most ``ceil(limit * 0.1)`` friends are considered, in list order.
    A friend that is already cached is left untouched: no database read and,
    importantly, no eviction (evicting and then overwriting an existing key
    would shrink the cache below its limit for nothing). Every other friend
    is loaded with ``get_user_profile`` and inserted, removing the entry at
    the front of ``self.cache`` first when the cache is full.

    A ``profile`` that is not a dict, or that has no ``"friends"`` entry,
    results in no prefetching.

    Returns True if at least one entry was evicted, False otherwise.
    """
    friends = profile.get("friends", []) if isinstance(profile, dict) else []
    # Only a fraction of the cache capacity is spent on prefetched entries.
    budget = max(0, math.ceil(self.limit * 0.1))
    evict = False
    for friend_id in friends[:budget]:
        if friend_id in self.cache:
            continue
        data = get_user_profile(friend_id)
        if len(self.cache) >= self.limit:
            self.cache.popitem(last=False)
            evict = True
        self.cache[friend_id] = data
    return evict
def set_relations(self, key: str, related_key: str, related_val: str):
    """Record that ``related_key``/``related_val`` should accompany ``key``.

    The relation is stored as a ``(related_key, related_val)`` tuple, which is
    the shape the key-based ``prefetch`` reads back through index 0 and 1.
    (Combining the two strings with ``|`` raises ``TypeError``.)
    """
    self.key_relations[key] = (related_key, related_val)
    return
def invalidate(self, key: str) -> bool:
    """Drop ``key`` from the cache.

    Returns True if an entry was removed, False if ``key`` was not cached.
    """
    # basic delete invalidation, no (p)refetching
    if key not in self.cache:
        return False
    del self.cache[key]
    return True

@ -28,7 +28,7 @@ def init_db():
# Prepopulate database with some sample users if empty
if len(db) == 0:
db.insert_multiple([
{"user_id": "1", "name": "Alice", "followers": 100, "bio": "Love coding!", "posts": "Hello, world!"},
{"user_id": "2", "name": "Bob", "followers": 200, "bio": "Tech enthusiast", "posts": "AI is amazing!"},
{"user_id": "3", "name": "Charlie", "followers": 50, "bio": "Blogger", "posts": "Check out my latest post!"}
{"user_id": "1", "name": "Alice", "followers": 100, "bio": "Love coding!", "posts": "Hello, world!", "friends": ["2"]},
{"user_id": "2", "name": "Bob", "followers": 200, "bio": "Tech enthusiast", "posts": "AI is amazing!","friends": ["3", "1"]},
{"user_id": "3", "name": "Charlie", "followers": 50, "bio": "Blogger", "posts": "Check out my latest post!", "friends": ["1"]}
])

File diff suppressed because it is too large Load Diff

@ -6,6 +6,10 @@ warnings.filterwarnings('ignore')
import re
import random
import json
from tinydb import TinyDB
from tinydb.storages import JSONStorage
from tinydb.middlewares import CachingMiddleware
import math
HUGGINGFACEHUB_API_TOKEN = None
os.environ["HUGGINGFACEHUB_API_TOKEN"] = HUGGINGFACEHUB_API_TOKEN
@ -21,13 +25,11 @@ def parse_profile(text, user_id, num_users):
match = re.search(r"([A-Za-z ]+)\|([A-Za-z &\-!]+)\|([A-Za-z .',!?&\-]+)", text)
name, bio, posts = match.groups()
# Generate mock followers count (randomized for realism)
followers = random.randint(10, 5000)
# Generate mock friends (a random selection of other user IDs)
friend_ids = [str(fid) for fid in range(num_users) if fid != user_id]
random.shuffle(friend_ids)
friends = friend_ids[:random.randint(1, num_users-1)] # Each user gets 1-5 friends
friends = friend_ids[:random.randint(1, min(100, math.ceil(num_users/3)))]
return {
"user_id": str(user_id),
@ -38,7 +40,7 @@ def parse_profile(text, user_id, num_users):
"friends": friends
}
def generate_data(num_users):
def generate_data(base_id, num_users):
system_message = """You are a data generator creating user profiles for a social media app.
Always provide user profiles in this format: Name | Interest | Recent Activity.
Do not include numbers, IDs, or assistant labels. Only return a properly formatted response.
@ -60,25 +62,30 @@ def generate_data(num_users):
huggingfacehub_api_token=HUGGINGFACEHUB_API_TOKEN,
)
llm_chain = prompt | llm
data = {}
i = 0
data = []
i = base_id
user_id = 0
while user_id < num_users:
raw_text = llm_chain.invoke({"user_id": i})
while not valid_data(raw_text):
i = i + 1
raw_text = llm_chain.invoke({"user_id": i})
user_profile = parse_profile(raw_text, user_id, num_users)
user_profile = parse_profile(raw_text, base_id + user_id, num_users)
user_id = user_id + 1
data[user_id] = user_profile
i = i + 1
data.append(user_profile)
return data
if __name__ == "__main__":
data = generate_data(100)
base_id = input("Enter base id (check db to find the next consecutive user_id): ")
num_users = input("Enter number of users to generate: ")
data = generate_data(int(base_id), int(num_users))
# Create json file
json_object = json.dumps( {"_default": data}, indent=4 )
with open( "datastore/llmData_sns.json", "w" ) as f:
f.write( json_object )
file_path = "datastore/llmData_sns.json"
global db
db = TinyDB(file_path, storage=CachingMiddleware(JSONStorage), indent=4)
db.insert_multiple(data)
db.close()