Deepseek data generation

This commit is contained in:
HiccupHan 2025-02-20 23:33:57 -08:00
parent 3707a2aae9
commit 5c25a2b099
4 changed files with 76 additions and 11 deletions

@ -24,5 +24,6 @@ class PrefetchCache(BaselineCache):
return True
return False
def set_relations(self, key: str, related_key: str, related_val: str):
    """Record the entry that is related to *key*.

    Args:
        key: Cache key whose relation is being recorded.
        related_key: Key of the related entry.
        related_val: Value of the related entry.

    The relation is stored in ``self.key_relations[key]`` as a
    ``(related_key, related_val)`` tuple.
    """
    # The previous expression, ``related_key | related_val``, raises
    # TypeError for two strings (``str`` does not support ``|``).
    # NOTE(review): a tuple is assumed to be the intended shape -- confirm
    # against whatever code reads ``key_relations``.
    self.key_relations[key] = (related_key, related_val)

@ -1,5 +1,6 @@
from tinydb import TinyDB, Query
from generate_data import generate_data
# Initialize TinyDB as a NoSQL key-value store
DB_FILE = "database.json"
@ -11,9 +12,9 @@ def get_user_profile(user_id):
result = db.search(User.user_id == user_id)
return result[0] if result else None
def update_user_profile(user_id, name, followers, bio, posts, friends):
    """Update user profile in TinyDB.

    Upserts a single document matched on ``user_id``: the document is
    updated when one with the same ``user_id`` exists and inserted
    otherwise.

    Args:
        user_id: Identifier used to match the stored document.
        name: Value stored under ``"name"``.
        followers: Value stored under ``"followers"``.
        bio: Value stored under ``"bio"``.
        posts: Value stored under ``"posts"``.
        friends: Value stored under ``"friends"``.
    """
    profile = {
        "user_id": user_id,
        "name": name,
        "followers": followers,
        "bio": bio,
        "posts": posts,
        "friends": friends,
    }
    db.upsert(profile, User.user_id == user_id)
def init_db():
"""Ensure TinyDB is initialized before FastAPI starts and prepopulate some data"""
@ -22,8 +23,5 @@ def init_db():
# Prepopulate database with some sample users if empty
if len(db) == 0:
db.insert_multiple([
{"user_id": "1", "name": "Alice", "followers": 100, "bio": "Love coding!", "posts": "Hello, world!"},
{"user_id": "2", "name": "Bob", "followers": 200, "bio": "Tech enthusiast", "posts": "AI is amazing!"},
{"user_id": "3", "name": "Charlie", "followers": 50, "bio": "Blogger", "posts": "Check out my latest post!"}
])
data = generate_data(100)
db.insert_multiple(data)

66
app/generate_data.py Normal file

@ -0,0 +1,66 @@
import os
from langchain_huggingface import HuggingFaceEndpoint
from langchain_core.prompts import PromptTemplate, ChatPromptTemplate
import warnings
warnings.filterwarnings('ignore')
import re
import random
# Read the Hugging Face API token from the environment instead of hard-coding
# it. The previous code assigned ``None`` into ``os.environ``, which raises
# ``TypeError: str expected, not NoneType`` as soon as this module is imported.
# The value is ``None`` when the variable is not set.
HUGGINGFACEHUB_API_TOKEN = os.environ.get("HUGGINGFACEHUB_API_TOKEN")
# Matches "Name | Interest | Recent Activity"; compiled once because
# parse_profile is called for every generated user.
_PROFILE_PATTERN = re.compile(
    r"([A-Za-z ]+)\|([A-Za-z &\-!]+)\|([A-Za-z .',!?&\-]+)"
)


def parse_profile(text, user_id, num_users):
    """Turn one raw model response into a user-profile dict.

    Args:
        text: Raw text expected to contain ``Name | Interest | Activity``.
        user_id: Identifier of the user being generated (int or str).
        num_users: Total number of users; friend ids are drawn from
            ``range(num_users)``.

    Returns:
        A dict with the keys ``user_id``, ``name``, ``followers``, ``bio``,
        ``posts`` and ``friends``, or ``None`` when *text* does not contain
        a profile or any of its three fields is blank.
    """
    match = _PROFILE_PATTERN.search(text)
    if not match:
        return None  # Skip invalid responses

    name, bio, posts = (field.strip() for field in match.groups())
    if not (name and bio and posts):
        # The character classes allow spaces, so a field can match and still
        # be empty once stripped; treat that as an invalid response too.
        return None

    # Generate mock followers count (randomized for realism)
    followers = random.randint(10, 5000)

    # Generate mock friends: a random, non-empty subset of the other users.
    # Ids are compared as strings so the user is excluded from their own
    # friend list whether user_id arrives as an int or a str.
    own_id = str(user_id)
    candidates = [str(fid) for fid in range(num_users) if str(fid) != own_id]
    if candidates:
        friends = random.sample(candidates, random.randint(1, len(candidates)))
    else:
        # No other users exist (num_users <= 1); randint(1, 0) would raise.
        friends = []

    return {
        "user_id": own_id,
        "name": name,
        "followers": followers,
        "bio": bio,
        "posts": posts,
        "friends": friends,
    }
# System prompt that pins the model to the "Name | Interest | Recent Activity"
# layout that parse_profile expects.
_SYSTEM_MESSAGE = """You are a data generator creating user profiles for a social media app.
Always provide user profiles in this format: Name | Interest | Recent Activity.
Do not include numbers, IDs, or assistant labels. Only return a properly formatted response.
Example: Alice Wonderland | Exploring the world one frame at a time! | Just captured a stunning sunset."""


def generate_data(num_users):
    """Generate up to *num_users* mock user profiles with a hosted LLM.

    One request is sent per user id in ``range(num_users)``; each response
    is parsed with ``parse_profile`` and responses that cannot be parsed are
    skipped, so the result may contain fewer than *num_users* entries.

    Args:
        num_users: Number of user ids to request profiles for.

    Returns:
        A list of profile dicts as produced by ``parse_profile``. Friend
        lists only reference users that are present in the returned list.
    """
    prompt = ChatPromptTemplate.from_messages([
        ("system", _SYSTEM_MESSAGE),
        ("user", "Generate a user profile for user {user_id}"),
    ])

    llm = HuggingFaceEndpoint(
        task='text-generation',
        model="deepseek-ai/DeepSeek-R1",
        max_new_tokens=150,
        do_sample=True,
        top_k=60,
        temperature=1.0,
        top_p=0.9,
        huggingfacehub_api_token=HUGGINGFACEHUB_API_TOKEN,
    )
    llm_chain = prompt | llm

    data = []
    for i in range(num_users):
        raw_text = llm_chain.invoke({"user_id": i})
        user_profile = parse_profile(raw_text, i, num_users)
        if user_profile:
            data.append(user_profile)

    # Friend ids are drawn from the full id range, but users whose response
    # failed to parse are missing from `data`; drop references to them so no
    # profile points at a user that does not exist.
    known_ids = {profile["user_id"] for profile in data}
    for profile in data:
        profile["friends"] = [
            fid for fid in profile["friends"] if fid in known_ids
        ]
    return data

@ -13,7 +13,7 @@ app = FastAPI()
if CACHE_STRATEGY == "Baseline":
cache = BaselineCache(limit=CACHE_LIMIT)
elif CACHE_STRATEGY == "Prefetch":
cache = PrefetchCache()
cache = PrefetchCache(limit=CACHE_LIMIT)
elif CACHE_STRATEGY == "Tiered":
cache = TieredCache(limit=CACHE_LIMIT, l2_limit=L2_CACHE_LIMIT)
elif CACHE_STRATEGY == "Seive":
@ -37,8 +37,8 @@ def fetch_user_profile(user_id: str):
return {"user_id": user_id, "profile": profile, "source": "database", "time_ms": (time.time() - start) * 1000}
# POST /update_user/ -- writes the submitted profile fields to the database
# and then invalidates the cache entry for that user.
# The docstring below is kept verbatim because FastAPI publishes it as the
# endpoint description.
@app.post("/update_user/")
def modify_user_profile(
    user_id: str,
    name: str,
    followers: int,
    bio: str,
    posts: str,
    friends: list[str],
):
    """Update user profile and refresh cache"""
    update_user_profile(user_id, name, followers, bio, posts, friends)
    cache.invalidate(user_id)  # Invalidate old cache
    return {"message": "User profile updated successfully"}