Skip to content

Commit

Permalink
GMD (Generate Mock Data) improvements
Browse files (browse the repository at this point in the history)
  • Loading branch information
Darren Edge committed Nov 4, 2024
1 parent: 23580ce; commit: 8036391
Show file tree
Hide file tree
Showing 5 changed files with 7 additions and 6 deletions.
2 changes: 1 addition & 1 deletion app/workflows/generate_mock_data/variables.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ def create_session(self, prefix):
self.workflow_object = SessionVariable(GenerateMockData(), prefix)
self.schema = SessionVariable(create_boilerplate_schema(), prefix)
self.num_records_overall = SessionVariable(100, prefix)
self.records_per_batch = SessionVariable(20, prefix)
self.records_per_batch = SessionVariable(10, prefix)
self.duplicate_records_per_batch = SessionVariable(0, prefix)
self.related_records_per_batch = SessionVariable(0, prefix)
self.primary_record_array = SessionVariable('', prefix)
Expand Down
1 change: 1 addition & 0 deletions app/workflows/generate_mock_data/workflow.py
Original file line number Diff line number Diff line change
Expand Up @@ -79,6 +79,7 @@ async def create(sv: bds_variables.SessionVariables, workflow: None):
dl_placeholders.append(dl_placeholder)

def on_dfs_update(path_to_df):
print(path_to_df)
for ix, record_array in enumerate(sv.record_arrays.value):
with df_placeholders[ix]:
df = path_to_df[record_array]
Expand Down
2 changes: 1 addition & 1 deletion intelligence_toolkit/generate_mock_data/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@ async def generate_data_records(
temperature: float = 0.5,
df_update_callback=None,
callback_batch=None,
parallel_batches: int = 0,
parallel_batches: int = 5,
):
"""
Generates structured data records according to the JSON schema
Expand Down
6 changes: 3 additions & 3 deletions intelligence_toolkit/generate_mock_data/data_generator.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,10 +22,8 @@ async def generate_data(
temperature,
df_update_callback,
callback_batch,
parallel_batches=0,
parallel_batches=5,
):
if parallel_batches == 0:
parallel_batches = num_records_overall // records_per_batch
num_iterations = num_records_overall // (records_per_batch * parallel_batches)
record_arrays = extract_array_fields(data_schema)
primary_record_array = record_arrays[0]
Expand Down Expand Up @@ -65,6 +63,7 @@ async def generate_data(
)

for new_object in new_objects:
print(new_object)
new_object_json = loads(new_object)
generated_objects.append(new_object_json)
current_object_json, conflicts = merge_json_objects(
Expand All @@ -74,6 +73,7 @@ async def generate_data(
for record_array in record_arrays:
df = extract_df(current_object_json, record_array)
dfs[".".join(record_array)] = df

if df_update_callback is not None:
df_update_callback(dfs)
return current_object_json, dfs
Expand Down
2 changes: 1 addition & 1 deletion intelligence_toolkit/generate_mock_data/prompts.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@
seeded_data_generation_prompt = """
You are a helpful assistant tasked with generating a JSON object following the JSON schema provided. You should generate mock data that is plausible but not linked to any real-world entities (e.g., person, organization).
The JSON object may contain multiple arrays representing collections of data records. For the purposes of this task, only consider the primary record array specified when counting records and generate any other auxiliary records as needed to complete and/or connect these primary records.
The JSON object may contain multiple arrays representing collections of data records. For the purposes of this task, only consider the primary record array specified when counting records. Generate any other auxiliary record arrays as needed to complete and/or connect these primary records.
The seed record provided should be used to generate certain numbers of records in the output object that are either near duplicates or close relations of the seed record, as follows:
Expand Down

0 comments on commit 8036391

Please sign in to comment.