Generate Mock Data (GMD) improvements

microsoft · Nov 4, 2024 · 8036391 · 8036391
1 parent 23580ce
commit 8036391
Show file tree

Hide file tree

Showing 5 changed files with 7 additions and 6 deletions.
diff --git a/app/workflows/generate_mock_data/variables.py b/app/workflows/generate_mock_data/variables.py
@@ -21,7 +21,7 @@ def create_session(self, prefix):
         self.workflow_object = SessionVariable(GenerateMockData(), prefix)
         self.schema = SessionVariable(create_boilerplate_schema(), prefix)
         self.num_records_overall = SessionVariable(100, prefix)
-        self.records_per_batch = SessionVariable(20, prefix)
+        self.records_per_batch = SessionVariable(10, prefix)
         self.duplicate_records_per_batch = SessionVariable(0, prefix)
         self.related_records_per_batch = SessionVariable(0, prefix)
         self.primary_record_array = SessionVariable('', prefix)

diff --git a/app/workflows/generate_mock_data/workflow.py b/app/workflows/generate_mock_data/workflow.py
@@ -79,6 +79,7 @@ async def create(sv: bds_variables.SessionVariables, workflow: None):
                         dl_placeholders.append(dl_placeholder)                
 
                 def on_dfs_update(path_to_df):
+                    print(path_to_df)
                     for ix, record_array in enumerate(sv.record_arrays.value):
                         with df_placeholders[ix]:
                             df = path_to_df[record_array]

diff --git a/intelligence_toolkit/generate_mock_data/api.py b/intelligence_toolkit/generate_mock_data/api.py
@@ -34,7 +34,7 @@ async def generate_data_records(
         temperature: float = 0.5,
         df_update_callback=None,
         callback_batch=None,
-        parallel_batches: int = 0,
+        parallel_batches: int = 5,
     ):
         """
         Generates structured data records according to the JSON schema

diff --git a/intelligence_toolkit/generate_mock_data/data_generator.py b/intelligence_toolkit/generate_mock_data/data_generator.py
@@ -22,10 +22,8 @@ async def generate_data(
     temperature,
     df_update_callback,
     callback_batch,
-    parallel_batches=0,
+    parallel_batches=5,
 ):
-    if parallel_batches == 0:
-        parallel_batches = num_records_overall // records_per_batch
     num_iterations = num_records_overall // (records_per_batch * parallel_batches)
     record_arrays = extract_array_fields(data_schema)
     primary_record_array = record_arrays[0]
@@ -65,6 +63,7 @@ async def generate_data(
         )
 
         for new_object in new_objects:
+            print(new_object)
             new_object_json = loads(new_object)
             generated_objects.append(new_object_json)
             current_object_json, conflicts = merge_json_objects(
@@ -74,6 +73,7 @@ async def generate_data(
         for record_array in record_arrays:
             df = extract_df(current_object_json, record_array)
             dfs[".".join(record_array)] = df
+
         if df_update_callback is not None:
             df_update_callback(dfs)
     return current_object_json, dfs

diff --git a/intelligence_toolkit/generate_mock_data/prompts.py b/intelligence_toolkit/generate_mock_data/prompts.py
@@ -19,7 +19,7 @@
 seeded_data_generation_prompt = """
 You are a helpful assistant tasked with generating a JSON object following the JSON schema provided. You should generate mock data that is plausible but not linked to any real-world entities (e.g., person, organization). 
 
-The JSON object may contain multiple arrays representing collections of data records. For the purposes of this task, only consider the primary record array specified when counting records and generate any other auxiliary records as needed to complete and/or connect these primary records.
+The JSON object may contain multiple arrays representing collections of data records. For the purposes of this task, only consider the primary record array specified when counting records. Generate any other auxiliary record arrays as needed to complete and/or connect these primary records.
 
 The seed record provided should be used to generate certain numbers of records in the output object that are either near duplicates or close relations of the seed record, as follows: