Abstracted data loading so that data can be loaded simply through the config file. Moved Mike's LLM data generation code and my own dummy data generating code to the database folder.

This commit is contained in:
Derek Wang
2025-02-21 00:39:25 -08:00
parent 5c25a2b099
commit dd5fc9f83c
7 changed files with 284 additions and 9 deletions

View File

@@ -0,0 +1,44 @@
# Quick script to create basic dummy data for the caching system
# Current theme/usage is in a social media microservice setting such as Twitter
# Current database schema:
### user_id (primary key), str
### username, str
### num_followers, int
### posts, str (for now, can be list later on)
### friends, list[user_id] (ids of *other* users; stored as ints, unlike the str user_id field)
import json
import os
import random

# Parameters to change the distribution/random ranges
TOTAL_USERS = 20
MIN_FOLLOWERS, MAX_FOLLOWERS = 0, 5
# Not used yet: reserved for when "posts" becomes a list of MIN_POSTS..MAX_POSTS strings
MIN_POSTS, MAX_POSTS = 5, 10
MIN_FRIENDS, MAX_FRIENDS = 1, 5

# Create the user data
# TODO if we want to vary the user data more, we can inflate the posts into a list of strings and make the strings very long :3
data = {}
for user_id in range( TOTAL_USERS ):
    # Every other user is a potential friend; a user is never their own friend
    candidates = [u_id for u_id in range( TOTAL_USERS ) if u_id != user_id]

    # The friend count comes from the FRIENDS range (not the FOLLOWERS range), clamped to
    # the size of the candidate pool so random.sample cannot raise for a small TOTAL_USERS
    most_friends = min( MAX_FRIENDS, len( candidates ) )
    least_friends = min( MIN_FRIENDS, most_friends )
    num_friends = random.randint( least_friends, most_friends )

    data[user_id] = {
        "user_id": str( user_id ),
        "username": "user" + str( user_id ),
        "num_followers": random.randint( MIN_FOLLOWERS, MAX_FOLLOWERS ),
        # Just make a single post for now, can consider multiple posts later
        "posts": f"This is user {user_id}'s post!",
        "friends": random.sample( candidates, num_friends ),
    }

# Load the data into a json object and write it into the datastore folder
# (json.dumps turns the int keys of `data` into strings: "0", "1", ...)
json_object = json.dumps( {"_default": data}, indent=4 )
# Create the folder first so the script also works from a fresh checkout
os.makedirs( "datastore", exist_ok=True )
with open( "datastore/basicDummy_sns.json", "w" ) as f:
    f.write( json_object )

View File

@@ -0,0 +1,214 @@
{
"_default": {
"0": {
"user_id": "0",
"username": "user0",
"num_followers": 0,
"posts": "This is user 0's post!",
"friends": [
17,
13,
14
]
},
"1": {
"user_id": "1",
"username": "user1",
"num_followers": 1,
"posts": "This is user 1's post!",
"friends": [
8,
17,
15,
3
]
},
"2": {
"user_id": "2",
"username": "user2",
"num_followers": 4,
"posts": "This is user 2's post!",
"friends": [
9,
12
]
},
"3": {
"user_id": "3",
"username": "user3",
"num_followers": 1,
"posts": "This is user 3's post!",
"friends": [
10,
7,
8,
14
]
},
"4": {
"user_id": "4",
"username": "user4",
"num_followers": 0,
"posts": "This is user 4's post!",
"friends": [
13,
0
]
},
"5": {
"user_id": "5",
"username": "user5",
"num_followers": 4,
"posts": "This is user 5's post!",
"friends": []
},
"6": {
"user_id": "6",
"username": "user6",
"num_followers": 3,
"posts": "This is user 6's post!",
"friends": []
},
"7": {
"user_id": "7",
"username": "user7",
"num_followers": 4,
"posts": "This is user 7's post!",
"friends": [
15,
13,
11,
17
]
},
"8": {
"user_id": "8",
"username": "user8",
"num_followers": 2,
"posts": "This is user 8's post!",
"friends": [
4,
19
]
},
"9": {
"user_id": "9",
"username": "user9",
"num_followers": 0,
"posts": "This is user 9's post!",
"friends": [
2,
10
]
},
"10": {
"user_id": "10",
"username": "user10",
"num_followers": 5,
"posts": "This is user 10's post!",
"friends": [
7,
12
]
},
"11": {
"user_id": "11",
"username": "user11",
"num_followers": 1,
"posts": "This is user 11's post!",
"friends": [
12,
8,
18,
4
]
},
"12": {
"user_id": "12",
"username": "user12",
"num_followers": 1,
"posts": "This is user 12's post!",
"friends": [
5,
17,
8,
4
]
},
"13": {
"user_id": "13",
"username": "user13",
"num_followers": 4,
"posts": "This is user 13's post!",
"friends": [
16,
10,
2,
8,
1
]
},
"14": {
"user_id": "14",
"username": "user14",
"num_followers": 3,
"posts": "This is user 14's post!",
"friends": [
11
]
},
"15": {
"user_id": "15",
"username": "user15",
"num_followers": 2,
"posts": "This is user 15's post!",
"friends": [
13,
14
]
},
"16": {
"user_id": "16",
"username": "user16",
"num_followers": 5,
"posts": "This is user 16's post!",
"friends": [
13,
3,
14
]
},
"17": {
"user_id": "17",
"username": "user17",
"num_followers": 3,
"posts": "This is user 17's post!",
"friends": [
0,
19,
2,
16
]
},
"18": {
"user_id": "18",
"username": "user18",
"num_followers": 4,
"posts": "This is user 18's post!",
"friends": [
9
]
},
"19": {
"user_id": "19",
"username": "user19",
"num_followers": 1,
"posts": "This is user 19's post!",
"friends": [
1,
12,
9
]
}
}
}

File diff suppressed because one or more lines are too long

View File

@@ -0,0 +1,76 @@
import os
from langchain_huggingface import HuggingFaceEndpoint
from langchain_core.prompts import PromptTemplate, ChatPromptTemplate
import warnings
warnings.filterwarnings('ignore')
import re
import random
import json
# Read the Hugging Face token from the environment rather than hard-coding it.
# Assigning None into os.environ raises TypeError at import time, and a literal
# token would end up in version control. The value stays None when the
# HUGGINGFACEHUB_API_TOKEN environment variable is not set.
HUGGINGFACEHUB_API_TOKEN = os.environ.get("HUGGINGFACEHUB_API_TOKEN")
# Matches "Name | Interest | Recent Activity", the layout the system prompt in
# generate_data asks the model for. Compiled once instead of on every call.
_PROFILE_PATTERN = re.compile(r"([A-Za-z ]+)\|([A-Za-z &\-!]+)\|([A-Za-z .',!?&\-]+)")


def parse_profile(text, user_id, num_users, *, max_friends=None):
    """Turn one raw LLM response into a user record.

    Args:
        text: Raw model output, searched for a "name|bio|posts" triple.
        user_id: Numeric id of the user being built; stored as a string and
            excluded from the user's own friends list.
        num_users: Total number of users; friend ids are drawn from
            range(num_users).
        max_friends: Optional upper bound on the number of friends. When None
            (the default) a user may get up to num_users - 1 friends.

    Returns:
        A dict with the keys "user_id", "name", "followers", "bio", "posts"
        and "friends", or None when text does not contain the expected
        pattern.
    """
    match = _PROFILE_PATTERN.search(text)
    if not match:
        return None  # Skip invalid responses
    name, bio, posts = match.groups()

    # Mock follower count, picked uniformly at random
    followers = random.randint(10, 5000)

    # Mock friends: a random subset of every other user's id (as strings)
    candidates = [str(fid) for fid in range(num_users) if fid != user_id]
    upper = num_users - 1
    if max_friends is not None:
        upper = min(upper, max_friends)
    # With nobody else to befriend (e.g. num_users <= 1) randint(1, 0) would
    # raise ValueError, so fall back to an empty list instead.
    if upper >= 1:
        friends = random.sample(candidates, random.randint(1, upper))
    else:
        friends = []

    return {
        "user_id": str(user_id),
        "name": name.strip(),
        "followers": followers,
        "bio": bio.strip(),
        "posts": posts.strip(),
        "friends": friends,
    }
def generate_data(num_users):
    """Ask the LLM for one profile per user index and collect the parsed ones.

    Args:
        num_users: How many profiles to request; indices run over
            range(num_users).

    Returns:
        A dict mapping each integer index to the record built by
        parse_profile. Indices whose response could not be parsed are left
        out, so the dict may hold fewer than num_users entries.
    """
    system_message = """You are a data generator creating user profiles for a social media app.
Always provide user profiles in this format: Name | Interest | Recent Activity.
Do not include numbers, IDs, or assistant labels. Only return a properly formatted response.
Example: Alice Wonderland | Exploring the world one frame at a time! | Just captured a stunning sunset."""

    # Two-message chat prompt: fixed instructions plus a per-user request
    chat_prompt = ChatPromptTemplate([
        ("system", system_message),
        ("user", "Generate a user profile for user {user_id}"),
    ])

    # Sampling settings for the hosted text-generation endpoint
    endpoint = HuggingFaceEndpoint(
        task='text-generation',
        model="deepseek-ai/DeepSeek-R1",
        max_new_tokens=150,
        do_sample=True,
        top_k=60,
        temperature=1.0,
        top_p=0.9,
        huggingfacehub_api_token=HUGGINGFACEHUB_API_TOKEN,
    )

    pipeline = chat_prompt | endpoint

    profiles = {}
    for uid in range(num_users):
        response = pipeline.invoke({"user_id": uid})
        profile = parse_profile(response, uid, num_users)
        if not profile:
            # Response did not match the expected format; drop this user
            continue
        profiles[uid] = profile
    return profiles
if __name__ == "__main__":
    # Generate 100 profiles, wrap them under the "_default" key, and serialize
    # before opening the file so a failed run never truncates existing data
    serialized = json.dumps( {"_default": generate_data(100)}, indent=4 )
    with open( "datastore/llmData_sns.json", "w" ) as out_file:
        out_file.write( serialized )