Add to db instead of creating new one

2025-05-15 06:17:22 +00:00 · 2025-02-22 19:55:26 -08:00 · 2025-02-22 19:55:26 -08:00 · 5dc9a99a3a
commit 5dc9a99a3a
parent b5e6f5eb9f
4 changed files with 9057 additions and 30 deletions
--- a/app/cache/prefetch_cache.py
+++ b/app/cache/prefetch_cache.py
@@ -1,11 +1,29 @@
-from .cache import BaselineCache
+from .cache import Cache
+from database import get_user_profile
+from collections import OrderedDict
+import math

-class PrefetchCache(BaselineCache):
-    key_relations = None
+class PrefetchCache(Cache):
+    limit = None
+    cache = None
    
-    def __init__(self):
+    def __init__(self, limit):
        super()
-        self.key_relations = dict()
+        self.limit = limit
+        self.cache = OrderedDict()
+        
+    def __eq__(self, other):
+        return self.cache == other
+    
+    def __len__(self):
+        return len(self.cache)
+
+    def get(self, key: str) -> str:
+        if key in self.cache:
+            self.cache.move_to_end(key)
+            return self.cache[key]
+        else:
+            return None
        
    def put(self, key: str, val: str) -> bool:
        # LRU evict
@@ -14,16 +32,29 @@ class PrefetchCache(BaselineCache):
            self.cache.popitem(last = False)
            evict = True
        self.cache[key] = val   
-        self.prefetch(key, val)
+        if self.prefetch(val):
+            evict = True
        
        return evict
    
-    def prefetch(self, key: str, val: str) -> bool:
-        if len(self.cache) >= self.limit and key in self.key_relations:
-            self.cache[self.key_relations[key][0]] = self.key_relations[key][1]
-            return True
-        return False
+    def prefetch(self, profile) -> bool:
+        evict = False
+        for i in range(math.ceil(self.limit*0.1)):
+            if i < len(profile["friends"]):
+                data = get_user_profile(profile["friends"][i])
+                if len(self.cache) >= self.limit:
+                    self.cache.popitem(last = False)
+                    evict = True
+                self.cache[profile["friends"][i]] = data
+            else:
+                break
+        return evict

-    def set_relations(self, key: str, related_key: str, related_val: str):
-        self.key_relations[key] = related_key | related_val
-        return
+    def invalidate(self, key: str) -> bool:
+        # basic delete invalidation, no (p)refetching
+        if key in self.cache:
+            del self.cache[key]
+            return True
+        else:
+            return False
+    
--- a/app/database.py
+++ b/app/database.py
@@ -28,7 +28,7 @@ def init_db():
    # Prepopulate database with some sample users if empty
    if len(db) == 0:
        db.insert_multiple([
-            {"user_id": "1", "name": "Alice", "followers": 100, "bio": "Love coding!", "posts": "Hello, world!"},
-            {"user_id": "2", "name": "Bob", "followers": 200, "bio": "Tech enthusiast", "posts": "AI is amazing!"},
-            {"user_id": "3", "name": "Charlie", "followers": 50, "bio": "Blogger", "posts": "Check out my latest post!"}
+            {"user_id": "1", "name": "Alice", "followers": 100, "bio": "Love coding!", "posts": "Hello, world!", "friends": ["2"]},
+            {"user_id": "2", "name": "Bob", "followers": 200, "bio": "Tech enthusiast", "posts": "AI is amazing!","friends": ["3", "1"]},
+            {"user_id": "3", "name": "Charlie", "followers": 50, "bio": "Blogger", "posts": "Check out my latest post!", "friends": ["1"]}
        ])
--- a/app/database/datastore/llmData_sns.json
+++ b/app/database/datastore/llmData_sns.json
--- a/app/database/generate_data.py
+++ b/app/database/generate_data.py
@@ -6,6 +6,10 @@ warnings.filterwarnings('ignore')
 import re
 import random
 import json
+from tinydb import TinyDB
+from tinydb.storages import JSONStorage
+from tinydb.middlewares import CachingMiddleware
+import math

 HUGGINGFACEHUB_API_TOKEN = None
 os.environ["HUGGINGFACEHUB_API_TOKEN"] = HUGGINGFACEHUB_API_TOKEN
@@ -21,13 +25,11 @@ def parse_profile(text, user_id, num_users):
    match = re.search(r"([A-Za-z ]+)\|([A-Za-z &\-!]+)\|([A-Za-z .',!?&\-]+)", text)
    name, bio, posts = match.groups()
    
-    # Generate mock followers count (randomized for realism)
    followers = random.randint(10, 5000)

-    # Generate mock friends (users with nearby IDs)
    friend_ids = [str(fid) for fid in range(num_users) if fid != user_id]
    random.shuffle(friend_ids)
-    friends = friend_ids[:random.randint(1, num_users-1)]  # Each user gets 1-5 friends
+    friends = friend_ids[:random.randint(1, min(100, math.ceil(num_users/3)))] 

    return {
        "user_id": str(user_id),
@@ -38,7 +40,7 @@ def parse_profile(text, user_id, num_users):
        "friends": friends
    }

-def generate_data(num_users):
+def generate_data(base_id, num_users):
    system_message = """You are a data generator creating user profiles for a social media app. 
    Always provide user profiles in this format: Name | Interest | Recent Activity.
    Do not include numbers, IDs, or assistant labels. Only return a properly formatted response.
@@ -60,25 +62,30 @@ def generate_data(num_users):
        huggingfacehub_api_token=HUGGINGFACEHUB_API_TOKEN,
    )
    llm_chain = prompt | llm
-    data = {}
-    i = 0
+    data = []
+    i = base_id
    user_id = 0
    while user_id < num_users: 
        raw_text = llm_chain.invoke({"user_id": i})
        while not valid_data(raw_text):
            i = i + 1
            raw_text = llm_chain.invoke({"user_id": i})
-        user_profile = parse_profile(raw_text, user_id, num_users)
+        user_profile = parse_profile(raw_text, base_id + user_id, num_users)
        user_id = user_id + 1
-        data[user_id] = user_profile
+        i = i + 1
+        data.append(user_profile)
            
    return data

 if __name__ == "__main__":
-    data = generate_data(100)
-
+    base_id = input("Enter base id (check db to find the next consecutive user_id): ")
+    num_users = input("Enter number of users to generate: ")
+    data = generate_data(int(base_id), int(num_users))
+    
    # Create json file
-    json_object = json.dumps( {"_default": data}, indent=4 )
-    with open( "datastore/llmData_sns.json", "w" ) as f:
-        f.write( json_object )
+    file_path = "datastore/llmData_sns.json"
+    global db
+    db = TinyDB(file_path, storage=CachingMiddleware(JSONStorage), indent=4)
+    db.insert_multiple(data)
+    db.close()