diff --git a/app/workflows/generate_mock_data/variables.py b/app/workflows/generate_mock_data/variables.py index e1d0bbfc..255f7e16 100644 --- a/app/workflows/generate_mock_data/variables.py +++ b/app/workflows/generate_mock_data/variables.py @@ -21,7 +21,7 @@ def create_session(self, prefix): self.workflow_object = SessionVariable(GenerateMockData(), prefix) self.schema = SessionVariable(create_boilerplate_schema(), prefix) self.num_records_overall = SessionVariable(100, prefix) - self.records_per_batch = SessionVariable(20, prefix) + self.records_per_batch = SessionVariable(10, prefix) self.duplicate_records_per_batch = SessionVariable(0, prefix) self.related_records_per_batch = SessionVariable(0, prefix) self.primary_record_array = SessionVariable('', prefix) diff --git a/app/workflows/generate_mock_data/workflow.py b/app/workflows/generate_mock_data/workflow.py index 8bff47f1..fabf5d01 100644 --- a/app/workflows/generate_mock_data/workflow.py +++ b/app/workflows/generate_mock_data/workflow.py @@ -79,6 +79,7 @@ async def create(sv: bds_variables.SessionVariables, workflow: None): dl_placeholders.append(dl_placeholder) def on_dfs_update(path_to_df): + print(path_to_df) for ix, record_array in enumerate(sv.record_arrays.value): with df_placeholders[ix]: df = path_to_df[record_array] diff --git a/intelligence_toolkit/generate_mock_data/api.py b/intelligence_toolkit/generate_mock_data/api.py index 2cc89b4a..6b2ecc72 100644 --- a/intelligence_toolkit/generate_mock_data/api.py +++ b/intelligence_toolkit/generate_mock_data/api.py @@ -34,7 +34,7 @@ async def generate_data_records( temperature: float = 0.5, df_update_callback=None, callback_batch=None, - parallel_batches: int = 0, + parallel_batches: int = 5, ): """ Generates structured data records according to the JSON schema diff --git a/intelligence_toolkit/generate_mock_data/data_generator.py b/intelligence_toolkit/generate_mock_data/data_generator.py index cb1adaf3..cfcbbfca 100644 --- a/intelligence_toolkit/generate_mock_data/data_generator.py +++ b/intelligence_toolkit/generate_mock_data/data_generator.py @@ -22,10 +22,8 @@ async def generate_data( temperature, df_update_callback, callback_batch, - parallel_batches=0, + parallel_batches=5, ): - if parallel_batches == 0: - parallel_batches = num_records_overall // records_per_batch num_iterations = num_records_overall // (records_per_batch * parallel_batches) record_arrays = extract_array_fields(data_schema) primary_record_array = record_arrays[0] @@ -65,6 +63,7 @@ async def generate_data( ) for new_object in new_objects: + print(new_object) new_object_json = loads(new_object) generated_objects.append(new_object_json) current_object_json, conflicts = merge_json_objects( @@ -74,6 +73,7 @@ async def generate_data( for record_array in record_arrays: df = extract_df(current_object_json, record_array) dfs[".".join(record_array)] = df + if df_update_callback is not None: df_update_callback(dfs) return current_object_json, dfs diff --git a/intelligence_toolkit/generate_mock_data/prompts.py b/intelligence_toolkit/generate_mock_data/prompts.py index 0231d30a..8bf3203a 100644 --- a/intelligence_toolkit/generate_mock_data/prompts.py +++ b/intelligence_toolkit/generate_mock_data/prompts.py @@ -19,7 +19,7 @@ seeded_data_generation_prompt = """ You are a helpful assistant tasked with generating a JSON object following the JSON schema provided. You should generate mock data that is plausible but not linked to any real-world entities (e.g., person, organization). -The JSON object may contain multiple arrays representing collections of data records. For the purposes of this task, only consider the primary record array specified when counting records and generate any other auxiliary records as needed to complete and/or connect these primary records. +The JSON object may contain multiple arrays representing collections of data records. For the purposes of this task, only consider the primary record array specified when counting records. Generate any other auxiliary record arrays as needed to complete and/or connect these primary records. The seed record provided should be used to generate certain numbers of records in the output object that are either near duplicates or close relations of the seed record, as follows: