Fixed issue with generate_data (retry the LLM call until it returns a validly formatted profile)

2025-05-04 10:07:23 +00:00 · 2025-02-21 11:55:24 -08:00 · 2025-02-21 11:55:24 -08:00 · 97d0fa21fd
commit 97d0fa21fd
parent dee97f09c2
2 changed files with 6154 additions and 8 deletions
--- a/app/database/datastore/llmData_sns.json
+++ b/app/database/datastore/llmData_sns.json
--- a/app/database/generate_data.py
+++ b/app/database/generate_data.py
@ -10,11 +10,15 @@ import json
 HUGGINGFACEHUB_API_TOKEN = None
 os.environ["HUGGINGFACEHUB_API_TOKEN"] = HUGGINGFACEHUB_API_TOKEN

-def parse_profile(text, user_id, num_users):
+def valid_data(text):
    match = re.search(r"([A-Za-z ]+)\|([A-Za-z &\-!]+)\|([A-Za-z .',!?&\-]+)", text)
    if not match:
-        return None  # Skip invalid responses
+        return False
+    else:
+        return True

+def parse_profile(text, user_id, num_users):
+    match = re.search(r"([A-Za-z ]+)\|([A-Za-z &\-!]+)\|([A-Za-z .',!?&\-]+)", text)
    name, bio, posts = match.groups()
    
    # Generate mock followers count (randomized for realism)
@ -40,7 +44,6 @@ def generate_data(num_users):
    Do not include numbers, IDs, or assistant labels. Only return a properly formatted response.

    Example: Alice Wonderland | Exploring the world one frame at a time! | Just captured a stunning sunset."""
-    # prompt = PromptTemplate.from_template(template)
    prompt = ChatPromptTemplate ([
        ("system", system_message),
        ("user", "Generate a user profile for user {user_id}")
@ -58,11 +61,16 @@ def generate_data(num_users):
    )
    llm_chain = prompt | llm
    data = {}
-    for i in range(num_users): 
+    i = 0
+    user_id = 0
+    while user_id < num_users: 
        raw_text = llm_chain.invoke({"user_id": i})
-        user_profile = parse_profile(raw_text, i, num_users)
-        if user_profile:
-            data[i] = user_profile
+        while not valid_data(raw_text):
+            i = i + 1
+            raw_text = llm_chain.invoke({"user_id": i})
+        user_profile = parse_profile(raw_text, user_id, num_users)
+        user_id = user_id + 1
+        data[user_id] = user_profile
            
    return data