Fixed issue with generate_data

This commit is contained in:
HiccupHan 2025-02-21 11:55:24 -08:00
parent dee97f09c2
commit 97d0fa21fd
2 changed files with 6154 additions and 8 deletions

File diff suppressed because one or more lines are too long

@ -10,11 +10,15 @@ import json
HUGGINGFACEHUB_API_TOKEN = None
os.environ["HUGGINGFACEHUB_API_TOKEN"] = HUGGINGFACEHUB_API_TOKEN
def parse_profile(text, user_id, num_users):
def valid_data(text):
    """Return whether *text* contains a ``name|bio|posts`` profile triple.

    The pattern is searched for anywhere in *text* (it is not anchored), and
    is the same expression ``parse_profile`` uses, so a ``True`` result means
    that search will also find a match.

    Args:
        text: Raw generated text to check.

    Returns:
        bool: ``True`` when the three pipe-separated fields are found,
        ``False`` otherwise. The result is always a real ``bool``, never
        ``None``.
    """
    # Pattern kept byte-for-byte: letters/spaces for the name, then the bio
    # and post fields with their respective allowed punctuation.
    match = re.search(r"([A-Za-z ]+)\|([A-Za-z &\-!]+)\|([A-Za-z .',!?&\-]+)", text)
    return match is not None
def parse_profile(text, user_id, num_users):
match = re.search(r"([A-Za-z ]+)\|([A-Za-z &\-!]+)\|([A-Za-z .',!?&\-]+)", text)
name, bio, posts = match.groups()
# Generate mock followers count (randomized for realism)
@ -40,7 +44,6 @@ def generate_data(num_users):
Do not include numbers, IDs, or assistant labels. Only return a properly formatted response.
Example: Alice Wonderland | Exploring the world one frame at a time! | Just captured a stunning sunset."""
# prompt = PromptTemplate.from_template(template)
prompt = ChatPromptTemplate ([
("system", system_message),
("user", "Generate a user profile for user {user_id}")
@ -58,11 +61,16 @@ def generate_data(num_users):
)
llm_chain = prompt | llm
data = {}
for i in range(num_users):
i = 0
user_id = 0
while user_id < num_users:
raw_text = llm_chain.invoke({"user_id": i})
user_profile = parse_profile(raw_text, i, num_users)
if user_profile:
data[i] = user_profile
while not valid_data(raw_text):
i = i + 1
raw_text = llm_chain.invoke({"user_id": i})
user_profile = parse_profile(raw_text, user_id, num_users)
user_id = user_id + 1
data[user_id] = user_profile
return data