Structured data apis (#68)
* Updated GMD workflow to use API

* Example notebook for GMD

* ERD API and notebook

---------

Co-authored-by: Darren Edge <[email protected]>
Co-authored-by: Dayenne Souza <[email protected]>
3 people authored Oct 23, 2024
1 parent 6aba99b commit e3d37a8
Showing 22 changed files with 771 additions and 138 deletions.
2 changes: 1 addition & 1 deletion app/util/schema_ui.py
@@ -153,7 +153,7 @@ def generate_form_from_json_schema(global_schema, default_schema, field_location
return
else:
if key != 'type':
new_value = st.text_input(f'`{key}` metadata', key=f'{key_with_prefix}_label', value=value)
new_value = st.text_input(f'{key}', key=f'{key_with_prefix}_label', value=value)
if new_value != value:
field_location[key] = new_value
st.rerun()
4 changes: 2 additions & 2 deletions app/workflows/extract_record_data/README.md
@@ -8,12 +8,13 @@ Select the `View example outputs` tab (in app) or navigate to [example_outputs/e

1. [**Input**] An instance or collection of unstructured text and (optionally) an existing JSON file containing the JSON schema with which to generate output records.
2. [**Process**] The user edits the uploaded JSON schema or creates one interactively.
3. [**AI Calls**] The system uses generative AI to extract a JSON object from the text following the JSON schema.
3. [**AI Calls**] The system uses generative AI to extract a JSON object from each text following the JSON schema.
4. [**Output**] A dataset of structured records following the JSON schema and (optionally) a newly-defined JSON schema.
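
The same workflow can be driven programmatically through the `ExtractRecordData` class this commit introduces. The sketch below is inferred from its usage in `app/workflows/extract_record_data/workflow.py`; it is not authoritative API documentation, `openai_config` is a placeholder for an OpenAI/Azure OpenAI configuration object, and only a subset of keyword arguments is shown.

```python
# Minimal sketch of the new ExtractRecordData class, following its usage in
# app/workflows/extract_record_data/workflow.py (not authoritative API docs).
from toolkit.extract_record_data import ExtractRecordData


async def run_extraction(openai_config, schema: dict, input_texts: list[str]):
    erd = ExtractRecordData()
    erd.set_ai_configuration(openai_config)  # OpenAI / Azure OpenAI configuration (placeholder)
    erd.set_schema(schema)                   # JSON schema held as a Python dict

    # Dotted paths of the record arrays discovered in the schema.
    record_arrays = [".".join(path) for path in erd.record_arrays]

    # Extract a JSON object per input text; other keyword arguments used by
    # the Streamlit workflow are omitted from this sketch.
    await erd.extract_record_data(
        input_texts=input_texts,
        generation_guidance="",      # optional natural-language guidance
        df_update_callback=None,     # optional callback for incremental updates
    )
    return erd.json_object, erd.array_dfs, record_arrays
```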

## Input requirements

- The input schema, if provided, should be a JSON file conforming to the [JSON schema standard](https://json-schema.org/) and following the restrictions of the [OpenAI Structured Outputs API](https://platform.openai.com/docs/guides/structured-outputs/supported-schemas).
- The unstructured text is sent to the AI API for record extraction (either OpenAI or Azure OpenAI). Such data use must comply with all applicable laws, regulations, and policies, including those pertaining to privacy and security.
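
As an illustration of those restrictions, here is a hypothetical minimal schema written as the Python dict the app holds in session state: the Structured Outputs API requires every property to be listed under `required` and `additionalProperties` to be `false` on every object.

```python
# Hypothetical minimal schema compatible with the Structured Outputs API:
# every object lists all of its properties as required and forbids extras.
minimal_schema = {
    "type": "object",
    "properties": {
        "name": {"type": "string"},
        "age": {"type": "number"},
        "complaint": {"type": "string"},
    },
    "required": ["name", "age", "complaint"],  # all properties must be required
    "additionalProperties": False,             # required by Structured Outputs
}
```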

## Use with other workflows

@@ -41,7 +42,6 @@ In the top left, you have the option to upload an existing JSON schema. This is
The initial JSON schema contains some boilerplate metadata fields representing best practices for schema design. The metadata fields are as follows:

- `$schema`: Indicates the version of the json-schema standard that the current schema follows. Leave this field as it is.
- `$id`: Provides a global ID for this schema anchored in a web domain, e.g., that of your organization. You may wish to edit this if you expect your schema to be widely used, but it can be left as it is for use inside Intelligence Toolkit.
- `title`: A short title indicating the kind of data that the schema represents.
- `description`: A longer description of the kind of data that the schema represents.

2 changes: 2 additions & 0 deletions app/workflows/extract_record_data/variables.py
@@ -3,6 +3,7 @@
#
import streamlit as st
from app.util.session_variable import SessionVariable
from toolkit.extract_record_data import ExtractRecordData
from toolkit.generate_mock_data.schema_builder import create_boilerplate_schema

class SessionVariables:
@@ -13,6 +14,7 @@ def __init__(self, prefix):
self.create_session(prefix)

def create_session(self, prefix):
self.workflow_object = SessionVariable(ExtractRecordData(), prefix)
self.schema = SessionVariable(create_boilerplate_schema(), prefix)
self.loaded_schema_filename = SessionVariable('', prefix)
self.loaded_data_filename = SessionVariable('', prefix)
20 changes: 9 additions & 11 deletions app/workflows/extract_record_data/workflow.py
@@ -22,7 +22,8 @@ def get_intro():

async def create(sv: variables.SessionVariables, workflow: None):
ui_components.check_ai_configuration()

erd = sv.workflow_object.value
erd.set_ai_configuration(ai_configuration)
intro_tab, schema_tab, generator_tab, mock_tab = st.tabs(['Extract Record Data workflow:', 'Prepare data schema', 'Extract structured records', 'View example outputs'])
with intro_tab:
file_content = get_intro()
@@ -35,8 +36,8 @@ async def create(sv: variables.SessionVariables, workflow: None):
with schema_tab:
sv.loaded_schema_filename.value = schema_ui.build_schema_ui(
sv.schema.value, sv.loaded_schema_filename.value)
array_field_arrays = data_extractor.extract_array_fields(sv.schema.value)
sv.record_arrays.value = ['.'.join(a) for a in array_field_arrays]
erd.set_schema(sv.schema.value)
sv.record_arrays.value = [".".join(x) for x in erd.record_arrays]
with generator_tab:
d1, d2 = st.columns([1, 1])
with d1:
@@ -92,19 +93,16 @@ def on_dfs_update(path_to_df):
for placeholder in df_placeholders:
placeholder.empty()

(
sv.final_object.value,
sv.generated_objects.value,
sv.generated_dfs.value
) = await data_extractor.extract_record_data(
ai_configuration=ai_configuration,


await erd.extract_record_data(
input_texts=input_texts,
generation_guidance=sv.generation_guidance.value,
record_arrays=sv.record_arrays.value,
data_schema=sv.schema.value,
df_update_callback=on_dfs_update,
callback_batch=None
)
sv.final_object.value = erd.json_object
sv.generated_dfs.value = erd.array_dfs

for ix, record_array in enumerate(sv.record_arrays.value):
with df_placeholders[ix]:
16 changes: 5 additions & 11 deletions app/workflows/generate_mock_data/README.md
@@ -34,28 +34,20 @@ In the top left, you have the option to upload an existing JSON schema. This is
The initial JSON schema contains some boilerplate metadata fields representing best practices for schema design. The metadata fields are as follows:

- `$schema`: Indicates the version of the json-schema standard that the current schema follows. Leave this field as it is.
- `$id`: Provides a global ID for this schema anchored in a web domain, e.g., that of your organization. You may wish to edit this if you expect your schema to be widely used, but it can be left as it is for use inside Intelligence Toolkit.
- `title`: A short title indicating the kind of data that the schema represents.
- `description`: A longer description of the kind of data that the schema represents.
- `records`: The collection of records represented by the schema.
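
For orientation only, the boilerplate might look roughly like the Python dict below; this is an illustrative sketch and the exact output of `create_boilerplate_schema` may differ.

```python
# Illustrative sketch of the boilerplate metadata fields described above;
# the real create_boilerplate_schema() output may differ in detail.
boilerplate_schema = {
    "$schema": "https://json-schema.org/draft/2020-12/schema",  # assumed draft version
    "title": "Mock Data",                                       # short title for the data
    "description": "A mock dataset generated from this schema.",
    "type": "object",
    "properties": {
        "records": {  # the collection of records represented by the schema
            "type": "array",
            "items": {
                "type": "object",
                "properties": {},
                "required": [],
                "additionalProperties": False,
            },
        }
    },
    "required": ["records"],
    "additionalProperties": False,
}
```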

Try editing some of these metadata fields now, and see them reflected in the `Preview` of the `JSON schema` to the right. In particular, set the title field to `Customer Complaints`.

The schema in progress is validated after every change, with the message `Schema is valid` confirming that the current schema conforms to the standard specified in the `$schema` field.

Try downloading an edited schema using the download button, uploading it via the `Upload schema` control, then continuing as below.

### Creating the record collection

Now select the `Sample object` tab, and notice how none of these fields are contained in the sample object itself. We can understand this by going back to the `JSON schema` tab and seeing that the schema is of type `object` and that the `properties` of the object are currently empty, indicated by the empty braces `{}`. Whatever we add to the `properties` of the top-level object in the schema gets added to the `Sample object` (and indeed to any objects that conform to the schema).

Let's now add some fields to the object using the buttons under `Add top-level field` in the form to the left.

To create a dataset of records rather than a single object, the schema needs to contain an object array field. Press the `obj[]` button to add an object array field at the top level (i.e., level 0). The new field will be given a generic name by default: `object_array_1`. Rename this to `complaint_records` and see on the right how this creates an array of objects whose properties you can define next.

Note that all new fields have the `Required?` checkbox checked by default, placing all field names in the `required` field of the object. This is a requirement for the [OpenAI Structured Outputs API](https://platform.openai.com/docs/guides/structured-outputs/supported-schemas), which we'll later use to generate mock data that follows the schema. Similarly, all objects must also have `additionalProperties` set to `false`, so the `Additional?` checkbox is left unchecked by default.

### Defining record attributes

Let's now specify the type of records represented by the schema by renaming `records` to `complaint_records` in the form to the left.

Next, we need to add fields to the objects of `complaint_records` for each attribute of the records we want to create.

Using the controls under `Add field to complaint_records`, press the `str` button to add a string (i.e., text) field. This field appears as the level 1 string `string_1` (level 1 because the field is nested one level down from the `complaint_records` array at the top level, i.e., level 0). Edit the text label from `string_1` to `name`.
@@ -67,6 +59,8 @@ As further string fields within `complaint_records`, now add:
- `email` as string field
- `price_issue`, `quality_issue`, `service_issue`, `delivery_issue`, `description_issue` as boolean (`true`/`false`) fields using the `bool` button

Note that all new fields have the `Required?` checkbox checked by default, placing all field names in the `required` field of the object. This is a requirement for the [OpenAI Structured Outputs API](https://platform.openai.com/docs/guides/structured-outputs/supported-schemas), which we'll later use to generate mock data that follows the schema. Similarly, all objects must also have `additionalProperties` set to `false`, so the `Additional?` checkbox is left unchecked by default.

Next, we want to add a `product_code` string field, but limit the possible values of the field to a predefined list called an "enumeration". Do this by checking the `Enum?` checkbox and observing the default values `A`, `B`, and `C` added to the enumeration. These values can be edited, deleted, and expanded as desired. For this tutorial, simply add further enum values alphabetically from `D` to `H`.
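
Putting the steps above together, the item definition inside `complaint_records` might now look roughly as follows; this is a sketch, and the schema the app actually builds may differ in ordering and detail.

```python
# Rough sketch of the complaint_records item schema after the steps above;
# the app-generated schema may phrase or order things differently.
complaint_record_item = {
    "type": "object",
    "properties": {
        "name": {"type": "string"},
        "email": {"type": "string"},
        "price_issue": {"type": "boolean"},
        "quality_issue": {"type": "boolean"},
        "service_issue": {"type": "boolean"},
        "delivery_issue": {"type": "boolean"},
        "description_issue": {"type": "boolean"},
        "product_code": {"type": "string", "enum": ["A", "B", "C", "D", "E", "F", "G", "H"]},
    },
    # All fields required and no extra properties, per the Structured Outputs restrictions.
    "required": [
        "name", "email", "price_issue", "quality_issue", "service_issue",
        "delivery_issue", "description_issue", "product_code",
    ],
    "additionalProperties": False,
}
```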

Note that boolean attributes of the record could also have been created using the `str[]` button to create a string array, checking `Enum?`, and specifying `price_issue`, `quality_issue`, `service_issue`, `delivery_issue`, `description_issue` as possible values. However, by using independent boolean fields we simplify the overall record structure and avoid the challenges of nested arrays in the final data object.
8 changes: 4 additions & 4 deletions app/workflows/generate_mock_data/variables.py
@@ -3,7 +3,7 @@
#
import streamlit as st
from app.util.session_variable import SessionVariable
from toolkit.generate_mock_data.schema_builder import create_boilerplate_schema
from toolkit.generate_mock_data import GenerateMockData, create_boilerplate_schema

class SessionVariables:
prefix = None
@@ -13,10 +13,10 @@ def __init__(self, prefix):
self.create_session(prefix)

def create_session(self, prefix):
self.workflow_object = SessionVariable(GenerateMockData(), prefix)
self.schema = SessionVariable(create_boilerplate_schema(), prefix)
self.num_records_overall = SessionVariable(100, prefix)
self.records_per_batch = SessionVariable(20, prefix)
self.parallel_batches = SessionVariable(5, prefix)
self.duplicate_records_per_batch = SessionVariable(0, prefix)
self.related_records_per_batch = SessionVariable(0, prefix)
self.primary_record_array = SessionVariable('', prefix)
@@ -31,8 +31,8 @@ def create_session(self, prefix):
self.generated_text_df = SessionVariable(None, prefix)
self.uploaded_synthesis_files = SessionVariable([], prefix)
self.synthesis_max_rows_to_process = SessionVariable(0, prefix)
self.text_synthesis_temperature = SessionVariable(0.7, prefix)
self.record_synthesis_temperature = SessionVariable(0.7, prefix)
self.text_synthesis_temperature = SessionVariable(0.5, prefix)
self.record_synthesis_temperature = SessionVariable(0.5, prefix)
self.input_texts = SessionVariable([], prefix)

def reset_workflow(self):
66 changes: 20 additions & 46 deletions app/workflows/generate_mock_data/workflow.py
@@ -8,8 +8,7 @@
import app.util.schema_ui as schema_ui
import app.util.ui_components as ui_components
import app.workflows.generate_mock_data.variables as bds_variables
import toolkit.generate_mock_data.data_generator as data_generator
import toolkit.generate_mock_data.text_generator as text_generator
from toolkit.generate_mock_data import GenerateMockData
from app.util.download_pdf import add_download_pdf
from app.util.openai_wrapper import UIOpenAIConfiguration

@@ -23,7 +22,8 @@ def get_intro():

async def create(sv: bds_variables.SessionVariables, workflow: None):
ui_components.check_ai_configuration()

gmd: GenerateMockData = sv.workflow_object.value
gmd.set_ai_configuration(ai_configuration)
intro_tab, schema_tab, record_generator_tab, text_generator_tab, mock_tab = st.tabs(['Generate Mock Data workflow:', 'Prepare data schema', 'Generate mock records', 'Generate mock texts', 'View example outputs'])
with intro_tab:
file_content = get_intro()
@@ -35,49 +35,29 @@ async def create(sv: bds_variables.SessionVariables, workflow: None):
)
with schema_tab:
sv.loaded_filename.value = schema_ui.build_schema_ui(sv.schema.value, sv.loaded_filename.value)
gmd.set_schema(sv.schema.value)
sv.record_arrays.value = [".".join(x) for x in gmd.record_arrays]
with record_generator_tab:
if len(sv.schema.value['properties']) == 0:
st.warning("Prepare data schema to continue.")
else:
st.markdown("##### Data generation controls")
c1, c2, c3, c4, c5, c6 = st.columns(6)
c1, c2, c3, c4 = st.columns(4)
with c1:
array_field_arrays = data_generator.extract_array_fields(sv.schema.value)
sv.record_arrays.value = ['.'.join(a) for a in array_field_arrays]
st.selectbox("Primary record array", sv.record_arrays.value, key=sv.primary_record_array.key,
help="In the presence of multiple arrays, select the one that represents the primary record type whose records should be counted towards the `Total records to generate` target")
with c2:
st.number_input("Records per batch", min_value=1, value=sv.records_per_batch.value, key=sv.records_per_batch.key,
help="How many records to generate in a single LLM call")
with c3:

def on_change_batches_num() -> None:
sv.num_records_overall.value = (
sv.records_per_batch.value * sv.parallel_batches.value
)

st.number_input(
"Parallel batches",
min_value=0,
step=1,
value=sv.parallel_batches.value,
on_change=on_change_batches_num,
key=sv.parallel_batches.key,
help="In a single iteration, how many batches to generate via parallel LLM calls",
)
with c4:
with c2:
st.number_input(
"Total records to generate",
min_value=sv.records_per_batch.value * sv.parallel_batches.value,
step=sv.records_per_batch.value * sv.parallel_batches.value,
min_value=sv.records_per_batch.value,
value=sv.num_records_overall.value,
key=sv.num_records_overall.key,
help="How many records to generate. Must be a multiple of `Records per batch` x `Parallel batches`",
)
with c5:
with c3:
st.number_input("Duplicate records per batch", min_value=0, value=sv.duplicate_records_per_batch.value, key=sv.duplicate_records_per_batch.key,
help="Within each batch, how many records should be near-duplicates of a seed record randomly selected from existing records")
with c6:
with c4:
st.number_input("Related records per batch", min_value=0, value=sv.related_records_per_batch.value, key=sv.related_records_per_batch.key,
help="Within each batch, how many records should appear closely related to (but not the same as) a seed record randomly selected from existing records")
st.text_area("AI data generation guidance", key=sv.generation_guidance.key, value=sv.generation_guidance.value,
@@ -110,21 +90,18 @@ def on_dfs_update(path_to_df):
for placeholder in df_placeholders:
placeholder.empty()

sv.final_object.value, sv.generated_objects.value, sv.generated_dfs.value = await data_generator.generate_data(
ai_configuration=ai_configuration,
await gmd.generate_data_records(
generation_guidance=sv.generation_guidance.value,
primary_record_array=sv.primary_record_array.value,
record_arrays=sv.record_arrays.value,
num_records_overall=sv.num_records_overall.value,
records_per_batch=sv.records_per_batch.value,
parallel_batches=sv.parallel_batches.value,
duplicate_records_per_batch=sv.duplicate_records_per_batch.value,
related_records_per_batch=sv.related_records_per_batch.value,
data_schema=sv.schema.value,
temperature=sv.record_synthesis_temperature.value,
df_update_callback=on_dfs_update,
callback_batch=None
)
sv.final_object.value = gmd.json_object
sv.generated_dfs.value = gmd.array_dfs

for ix, record_array in enumerate(sv.record_arrays.value):
with df_placeholders[ix]:
@@ -167,7 +144,7 @@ def on_dfs_update(df):
if changed:
if selected_df is not None:
sv.input_texts.value = []
for ix, row in selected_df.iterrows():
for _, row in selected_df.iterrows():
sv.input_texts.value.append(row.to_json())
sv.generated_texts.value = []
st.text_area("AI text generation guidance", key=sv.text_generation_guidance.key, value=sv.text_generation_guidance.value,
@@ -195,18 +172,15 @@ def on_dfs_update(df):
sv.generated_texts.value = pd.DataFrame()
df_placeholder.empty()

(
sv.generated_texts.value,
sv.generated_text_df.value
) = await text_generator.generate_text_data(
ai_configuration=ai_configuration,
input_texts=sv.input_texts.value,
await gmd.generate_text_data(
df=selected_df,
generation_guidance=sv.text_generation_guidance.value,
temperature=sv.text_synthesis_temperature.value,
df_update_callback=on_dfs_update,
parallel_threads=10,
callback_batch=None
df_update_callback=on_dfs_update
)
sv.generated_texts.value = gmd.text_list
sv.generated_text_df.value = gmd.text_df


if sv.generated_text_df.value is not None and selected_file is not None:
with df_placeholder:
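
Taken together, the updated workflow reduces to the following rough sketch of the `GenerateMockData` lifecycle. The keyword arguments mirror the diff above and may be abridged relative to the real signatures; `openai_config` and `records_df` are placeholders, and the default values shown come from `variables.py`.

```python
# Sketch of the GenerateMockData lifecycle as wired up in the updated
# app/workflows/generate_mock_data/workflow.py; not authoritative API docs.
from toolkit.generate_mock_data import GenerateMockData, create_boilerplate_schema


async def generate_mock_records(openai_config, schema=None):
    gmd = GenerateMockData()
    gmd.set_ai_configuration(openai_config)            # OpenAI / Azure OpenAI configuration (placeholder)
    gmd.set_schema(schema or create_boilerplate_schema())
    record_arrays = [".".join(path) for path in gmd.record_arrays]

    # Generate structured records following the schema (defaults from variables.py).
    # The workflow also passes further arguments (e.g. primary_record_array), omitted here.
    await gmd.generate_data_records(
        generation_guidance="",
        record_arrays=record_arrays,
        num_records_overall=100,
        records_per_batch=20,
        duplicate_records_per_batch=0,
        related_records_per_batch=0,
        temperature=0.5,
        df_update_callback=None,
    )
    return gmd.json_object, gmd.array_dfs              # combined object and per-array DataFrames


async def generate_mock_texts(gmd: GenerateMockData, records_df):
    # Generate one mock text per row of a DataFrame of generated records.
    await gmd.generate_text_data(
        df=records_df,
        generation_guidance="",
        temperature=0.5,
        df_update_callback=None,
    )
    return gmd.text_list, gmd.text_df
```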
