Structured data apis (#68)
* Updated GMD workflow to use API

* Example notebook for GMD

* ERD API and notebook

---------

Co-authored-by: Darren Edge <[email protected]>
Co-authored-by: Dayenne Souza <[email protected]>
3 people authored Oct 23, 2024
1 parent 6aba99b commit e3d37a8
Showing 22 changed files with 771 additions and 138 deletions.
2 changes: 1 addition & 1 deletion app/util/schema_ui.py
@@ -153,7 +153,7 @@ def generate_form_from_json_schema(global_schema, default_schema, field_location
return
else:
if key != 'type':
new_value = st.text_input(f'`{key}` metadata', key=f'{key_with_prefix}_label', value=value)
new_value = st.text_input(f'{key}', key=f'{key_with_prefix}_label', value=value)
if new_value != value:
field_location[key] = new_value
st.rerun()
4 changes: 2 additions & 2 deletions app/workflows/extract_record_data/README.md
@@ -8,12 +8,13 @@ Select the `View example outputs` tab (in app) or navigate to [example_outputs/e

1. [**Input**] An instance or collection of unstructured text and (optionally) an existing JSON file containing the JSON schema with which to generate output records.
2. [**Process**] The user edits the uploaded JSON schema or creates one interactively.
3. [**AI Calls**] The system uses generative AI to extract a JSON object from the text following the JSON schema.
3. [**AI Calls**] The system uses generative AI to extract a JSON object from each text following the JSON schema.
4. [**Output**] A dataset of structured records following the JSON schema and (optionally) a newly-defined JSON schema.
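
The same workflow can be driven programmatically through the `ExtractRecordData` class this commit introduces. The sketch below is inferred from its usage in `app/workflows/extract_record_data/workflow.py`; it is not authoritative API documentation, `openai_config` is a placeholder for an OpenAI/Azure OpenAI configuration object, and only a subset of keyword arguments is shown.

```python
# Minimal sketch of the new ExtractRecordData class, following its usage in
# app/workflows/extract_record_data/workflow.py (not authoritative API docs).
from toolkit.extract_record_data import ExtractRecordData


async def run_extraction(openai_config, schema: dict, input_texts: list[str]):
    erd = ExtractRecordData()
    erd.set_ai_configuration(openai_config)  # OpenAI / Azure OpenAI configuration (placeholder)
    erd.set_schema(schema)                   # JSON schema held as a Python dict

    # Dotted paths of the record arrays discovered in the schema.
    record_arrays = [".".join(path) for path in erd.record_arrays]

    # Extract a JSON object per input text; other keyword arguments used by
    # the Streamlit workflow are omitted from this sketch.
    await erd.extract_record_data(
        input_texts=input_texts,
        generation_guidance="",      # optional natural-language guidance
        df_update_callback=None,     # optional callback for incremental updates
    )
    return erd.json_object, erd.array_dfs, record_arrays
```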

## Input requirements

- The input schema, if provided, should be a JSON file conforming to the [JSON schema standard](https://json-schema.org/) and following the restrictions of the [OpenAI Structured Outputs API](https://platform.openai.com/docs/guides/structured-outputs/supported-schemas).
- The unstructured text is sent to the AI API for record extraction (either OpenAI or Azure OpenAI). Such data use must comply with all applicable laws, regulations, and policies, including those pertaining to privacy and security.
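
As an illustration of those restrictions, here is a hypothetical minimal schema written as the Python dict the app holds in session state: the Structured Outputs API requires every property to be listed under `required` and `additionalProperties` to be `false` on every object.

```python
# Hypothetical minimal schema compatible with the Structured Outputs API:
# every object lists all of its properties as required and forbids extras.
minimal_schema = {
    "type": "object",
    "properties": {
        "name": {"type": "string"},
        "age": {"type": "number"},
        "complaint": {"type": "string"},
    },
    "required": ["name", "age", "complaint"],  # all properties must be required
    "additionalProperties": False,             # required by Structured Outputs
}
```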

## Use with other workflows

@@ -41,7 +42,6 @@ In the top left, you have the option to upload an existing JSON schema. This is
The initial JSON schema contains some boilerplate metadata fields representing best practices for schema design. The metadata fields are as follows:

- `$schema`: Indicates the version of the json-schema standard that the current schema follows. Leave this field as it is.
- `$id`: Provides a global ID for this schema anchored in a web domain, e.g., that of your organization. You may wish to edit this if you expect your schema to be widely used, but it can be left as it is for use inside Intelligence Toolkit.
- `title`: A short title indicating the kind of data that the schema represents.
- `description`: A longer description of the kind of data that the schema represents.

2 changes: 2 additions & 0 deletions app/workflows/extract_record_data/variables.py
@@ -3,6 +3,7 @@
#
import streamlit as st
from app.util.session_variable import SessionVariable
from toolkit.extract_record_data import ExtractRecordData
from toolkit.generate_mock_data.schema_builder import create_boilerplate_schema

class SessionVariables:
@@ -13,6 +14,7 @@ def __init__(self, prefix):
self.create_session(prefix)

def create_session(self, prefix):
self.workflow_object = SessionVariable(ExtractRecordData(), prefix)
self.schema = SessionVariable(create_boilerplate_schema(), prefix)
self.loaded_schema_filename = SessionVariable('', prefix)
self.loaded_data_filename = SessionVariable('', prefix)
20 changes: 9 additions & 11 deletions app/workflows/extract_record_data/workflow.py
@@ -22,7 +22,8 @@ def get_intro():

async def create(sv: variables.SessionVariables, workflow: None):
ui_components.check_ai_configuration()

erd = sv.workflow_object.value
erd.set_ai_configuration(ai_configuration)
intro_tab, schema_tab, generator_tab, mock_tab = st.tabs(['Extract Record Data workflow:', 'Prepare data schema', 'Extract structured records', 'View example outputs'])
with intro_tab:
file_content = get_intro()
@@ -35,8 +36,8 @@ async def create(sv: variables.SessionVariables, workflow: None):
with schema_tab:
sv.loaded_schema_filename.value = schema_ui.build_schema_ui(
sv.schema.value, sv.loaded_schema_filename.value)
array_field_arrays = data_extractor.extract_array_fields(sv.schema.value)
sv.record_arrays.value = ['.'.join(a) for a in array_field_arrays]
erd.set_schema(sv.schema.value)
sv.record_arrays.value = [".".join(x) for x in erd.record_arrays]
with generator_tab:
d1, d2 = st.columns([1, 1])
with d1:
@@ -92,19 +93,16 @@ def on_dfs_update(path_to_df):
for placeholder in df_placeholders:
placeholder.empty()

(
sv.final_object.value,
sv.generated_objects.value,
sv.generated_dfs.value
) = await data_extractor.extract_record_data(
ai_configuration=ai_configuration,


await erd.extract_record_data(
input_texts=input_texts,
generation_guidance=sv.generation_guidance.value,
record_arrays=sv.record_arrays.value,
data_schema=sv.schema.value,
df_update_callback=on_dfs_update,
callback_batch=None
)
sv.final_object.value = erd.json_object
sv.generated_dfs.value = erd.array_dfs

for ix, record_array in enumerate(sv.record_arrays.value):
with df_placeholders[ix]:
16 changes: 5 additions & 11 deletions app/workflows/generate_mock_data/README.md
@@ -34,28 +34,20 @@ In the top left, you have the option to upload an existing JSON schema. This is
The initial JSON schema contains some boilerplate metadata fields representing best practices for schema design. The metadata fields are as follows:

- `$schema`: Indicates the version of the json-schema standard that the current schema follows. Leave this field as it is.
- `$id`: Provides a global ID for this schema anchored in a web domain, e.g., that of your organization. You may wish to edit this if you expect your schema to be widely used, but it can be left as it is for use inside Intelligence Toolkit.
- `title`: A short title indicating the kind of data that the schema represents.
- `description`: A longer description of the kind of data that the schema represents.
- `records`: The collection of records represented by the schema.
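
For orientation only, the boilerplate might look roughly like the Python dict below; this is an illustrative sketch and the exact output of `create_boilerplate_schema` may differ.

```python
# Illustrative sketch of the boilerplate metadata fields described above;
# the real create_boilerplate_schema() output may differ in detail.
boilerplate_schema = {
    "$schema": "https://json-schema.org/draft/2020-12/schema",  # assumed draft version
    "title": "Mock Data",                                       # short title for the data
    "description": "A mock dataset generated from this schema.",
    "type": "object",
    "properties": {
        "records": {  # the collection of records represented by the schema
            "type": "array",
            "items": {
                "type": "object",
                "properties": {},
                "required": [],
                "additionalProperties": False,
            },
        }
    },
    "required": ["records"],
    "additionalProperties": False,
}
```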

Try editing some of these metadata fields now, and see them reflected in the `Preview` of the `JSON schema` to the right. In particular, set the title field to `Customer Complaints`.

The schema in progress is validated after every change, with the message `Schema is valid` confirming that the current schema conforms to the standard specified in the `$schema` field.

Try downloading an edited schema using the download button, uploading it via the `Upload schema` control, then continuing as below.

### Creating the record collection

Now select the `Sample object` tab, and notice how none of these fields are contained in the sample object itself. We can understand this by going back to the `JSON schema` tab and seeing that the schema is of type `object` and that the `properties` of the object are currently empty, indicated by the empty braces `{}`. Whatever we add to the `properties` of the top-level object in the schema gets added to the `Sample object` (and indeed to any objects that conform to the schema).

Let's now add some fields to the object using the buttons under `Add top-level field` in the form to the left.

To create a dataset of records rather than a single object, the schema needs to contain an object array field. Press the `obj[]` button to add an object array field at the top level (i.e., level 0). The new field will be given a generic name by default: `object_array_1`. Rename this to `complaint_records` and see on the right how this creates an array of objects whose properties you can define next.

Note that all new fields have the `Required?` checkbox checked by default, placing all field names in the `required` field of the object. This is a requirement for the [OpenAI Structured Outputs API](https://platform.openai.com/docs/guides/structured-outputs/supported-schemas), which we'll later use to generate mock data that follows the schema. Similarly, all objects must also have `additionalProperties` set to `false`, so the `Additional?` checkbox is left unchecked by default.

### Defining record attributes

Let's now specify the type of records represented by the schema by renaming `records` to `complaint_records` in the form to the left.

Next, we need to add fields to the objects of `complaint_records` for each attribute of the records we want to create.

Using the controls under `Add field to complaint_records`, press the `str` button to add a string (i.e., text) field. This field appears as the level 1 string `string_1` (level 1 because the field is nested one level down from the `complaint_records` array at the top level, i.e., level 0). Edit the text label from `string_1` to `name`.
@@ -67,6 +59,8 @@ As further string fields within `complaint_records`, now add:
- `email` as string field
- `price_issue`, `quality_issue`, `service_issue`, `delivery_issue`, `description_issue` as boolean (`true`/`false`) fields using the `bool` button

Note that all new fields have the `Required?` checkbox checked by default, placing all field names in the `required` field of the object. This is a requirement for the [OpenAI Structured Outputs API](https://platform.openai.com/docs/guides/structured-outputs/supported-schemas), which we'll later use to generate mock data that follows the schema. Similarly, all objects must also have `additionalProperties` set to `false`, so the `Additional?` checkbox is left unchecked by default.

Next, we want to add a `product_code` string field, but limit the possible values of the field to a predefined list called an "enumeration". Do this by checking the `Enum?` checkbox and observing the default values `A`, `B`, and `C` added to the enumeration. These values can be edited, deleted, and expanded as desired. For this tutorial, simply add further enum values alphabetically from `D` to `H`.
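
Putting the steps above together, the item definition inside `complaint_records` might now look roughly as follows; this is a sketch, and the schema the app actually builds may differ in ordering and detail.

```python
# Rough sketch of the complaint_records item schema after the steps above;
# the app-generated schema may phrase or order things differently.
complaint_record_item = {
    "type": "object",
    "properties": {
        "name": {"type": "string"},
        "email": {"type": "string"},
        "price_issue": {"type": "boolean"},
        "quality_issue": {"type": "boolean"},
        "service_issue": {"type": "boolean"},
        "delivery_issue": {"type": "boolean"},
        "description_issue": {"type": "boolean"},
        "product_code": {"type": "string", "enum": ["A", "B", "C", "D", "E", "F", "G", "H"]},
    },
    # All fields required and no extra properties, per the Structured Outputs restrictions.
    "required": [
        "name", "email", "price_issue", "quality_issue", "service_issue",
        "delivery_issue", "description_issue", "product_code",
    ],
    "additionalProperties": False,
}
```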

Note that boolean attributes of the record could also have been created using the `str[]` button to create a string array, checking `Enum?`, and specifying `price_issue`, `quality_issue`, `service_issue`, `delivery_issue`, `description_issue` as possible values. However, by using independent boolean fields we simplify the overall record structure and avoid the challenges of nested arrays in the final data object.
8 changes: 4 additions & 4 deletions app/workflows/generate_mock_data/variables.py
@@ -3,7 +3,7 @@
#
import streamlit as st
from app.util.session_variable import SessionVariable
from toolkit.generate_mock_data.schema_builder import create_boilerplate_schema
from toolkit.generate_mock_data import GenerateMockData, create_boilerplate_schema

class SessionVariables:
prefix = None
@@ -13,10 +13,10 @@ def __init__(self, prefix):
self.create_session(prefix)

def create_session(self, prefix):
self.workflow_object = SessionVariable(GenerateMockData(), prefix)
self.schema = SessionVariable(create_boilerplate_schema(), prefix)
self.num_records_overall = SessionVariable(100, prefix)
self.records_per_batch = SessionVariable(20, prefix)
self.parallel_batches = SessionVariable(5, prefix)
self.duplicate_records_per_batch = SessionVariable(0, prefix)
self.related_records_per_batch = SessionVariable(0, prefix)
self.primary_record_array = SessionVariable('', prefix)
@@ -31,8 +31,8 @@ def create_session(self, prefix):
self.generated_text_df = SessionVariable(None, prefix)
self.uploaded_synthesis_files = SessionVariable([], prefix)
self.synthesis_max_rows_to_process = SessionVariable(0, prefix)
self.text_synthesis_temperature = SessionVariable(0.7, prefix)
self.record_synthesis_temperature = SessionVariable(0.7, prefix)
self.text_synthesis_temperature = SessionVariable(0.5, prefix)
self.record_synthesis_temperature = SessionVariable(0.5, prefix)
self.input_texts = SessionVariable([], prefix)

def reset_workflow(self):
66 changes: 20 additions & 46 deletions app/workflows/generate_mock_data/workflow.py
@@ -8,8 +8,7 @@
import app.util.schema_ui as schema_ui
import app.util.ui_components as ui_components
import app.workflows.generate_mock_data.variables as bds_variables
import toolkit.generate_mock_data.data_generator as data_generator
import toolkit.generate_mock_data.text_generator as text_generator
from toolkit.generate_mock_data import GenerateMockData
from app.util.download_pdf import add_download_pdf
from app.util.openai_wrapper import UIOpenAIConfiguration

@@ -23,7 +22,8 @@ def get_intro():

async def create(sv: bds_variables.SessionVariables, workflow: None):
ui_components.check_ai_configuration()

gmd: GenerateMockData = sv.workflow_object.value
gmd.set_ai_configuration(ai_configuration)
intro_tab, schema_tab, record_generator_tab, text_generator_tab, mock_tab = st.tabs(['Generate Mock Data workflow:', 'Prepare data schema', 'Generate mock records', 'Generate mock texts', 'View example outputs'])
with intro_tab:
file_content = get_intro()
@@ -35,49 +35,29 @@ async def create(sv: bds_variables.SessionVariables, workflow: None):
)
with schema_tab:
sv.loaded_filename.value = schema_ui.build_schema_ui(sv.schema.value, sv.loaded_filename.value)
gmd.set_schema(sv.schema.value)
sv.record_arrays.value = [".".join(x) for x in gmd.record_arrays]
with record_generator_tab:
if len(sv.schema.value['properties']) == 0:
st.warning("Prepare data schema to continue.")
else:
st.markdown("##### Data generation controls")
c1, c2, c3, c4, c5, c6 = st.columns(6)
c1, c2, c3, c4 = st.columns(4)
with c1:
array_field_arrays = data_generator.extract_array_fields(sv.schema.value)
sv.record_arrays.value = ['.'.join(a) for a in array_field_arrays]
st.selectbox("Primary record array", sv.record_arrays.value, key=sv.primary_record_array.key,
help="In the presence of multiple arrays, select the one that represents the primary record type whose records should be counted towards the `Total records to generate` target")
with c2:
st.number_input("Records per batch", min_value=1, value=sv.records_per_batch.value, key=sv.records_per_batch.key,
help="How many records to generate in a single LLM call")
with c3:

def on_change_batches_num() -> None:
sv.num_records_overall.value = (
sv.records_per_batch.value * sv.parallel_batches.value
)

st.number_input(
"Parallel batches",
min_value=0,
step=1,
value=sv.parallel_batches.value,
on_change=on_change_batches_num,
key=sv.parallel_batches.key,
help="In a single iteration, how many batches to generate via parallel LLM calls",
)
with c4:
with c2:
st.number_input(
"Total records to generate",
min_value=sv.records_per_batch.value * sv.parallel_batches.value,
step=sv.records_per_batch.value * sv.parallel_batches.value,
min_value=sv.records_per_batch.value,
value=sv.num_records_overall.value,
key=sv.num_records_overall.key,
help="How many records to generate. Must be a multiple of `Records per batch` x `Parallel batches`",
)
with c5:
with c3:
st.number_input("Duplicate records per batch", min_value=0, value=sv.duplicate_records_per_batch.value, key=sv.duplicate_records_per_batch.key,
help="Within each batch, how many records should be near-duplicates of a seed record randomly selected from existing records")
with c6:
with c4:
st.number_input("Related records per batch", min_value=0, value=sv.related_records_per_batch.value, key=sv.related_records_per_batch.key,
help="Within each batch, how many records should appear closely related to (but not the same as) a seed record randomly selected from existing records")
st.text_area("AI data generation guidance", key=sv.generation_guidance.key, value=sv.generation_guidance.value,
@@ -110,21 +90,18 @@ def on_dfs_update(path_to_df):
for placeholder in df_placeholders:
placeholder.empty()

sv.final_object.value, sv.generated_objects.value, sv.generated_dfs.value = await data_generator.generate_data(
ai_configuration=ai_configuration,
await gmd.generate_data_records(
generation_guidance=sv.generation_guidance.value,
primary_record_array=sv.primary_record_array.value,
record_arrays=sv.record_arrays.value,
num_records_overall=sv.num_records_overall.value,
records_per_batch=sv.records_per_batch.value,
parallel_batches=sv.parallel_batches.value,
duplicate_records_per_batch=sv.duplicate_records_per_batch.value,
related_records_per_batch=sv.related_records_per_batch.value,
data_schema=sv.schema.value,
temperature=sv.record_synthesis_temperature.value,
df_update_callback=on_dfs_update,
callback_batch=None
)
sv.final_object.value = gmd.json_object
sv.generated_dfs.value = gmd.array_dfs

for ix, record_array in enumerate(sv.record_arrays.value):
with df_placeholders[ix]:
@@ -167,7 +144,7 @@ def on_dfs_update(df):
if changed:
if selected_df is not None:
sv.input_texts.value = []
for ix, row in selected_df.iterrows():
for _, row in selected_df.iterrows():
sv.input_texts.value.append(row.to_json())
sv.generated_texts.value = []
st.text_area("AI text generation guidance", key=sv.text_generation_guidance.key, value=sv.text_generation_guidance.value,
@@ -195,18 +172,15 @@ def on_dfs_update(df):
sv.generated_texts.value = pd.DataFrame()
df_placeholder.empty()

(
sv.generated_texts.value,
sv.generated_text_df.value
) = await text_generator.generate_text_data(
ai_configuration=ai_configuration,
input_texts=sv.input_texts.value,
await gmd.generate_text_data(
df=selected_df,
generation_guidance=sv.text_generation_guidance.value,
temperature=sv.text_synthesis_temperature.value,
df_update_callback=on_dfs_update,
parallel_threads=10,
callback_batch=None
df_update_callback=on_dfs_update
)
sv.generated_texts.value = gmd.text_list
sv.generated_text_df.value = gmd.text_df


if sv.generated_text_df.value is not None and selected_file is not None:
with df_placeholder:
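
Taken together, the updated workflow reduces to the following rough sketch of the `GenerateMockData` lifecycle. The keyword arguments mirror the diff above and may be abridged relative to the real signatures; `openai_config` and `records_df` are placeholders, and the default values shown come from `variables.py`.

```python
# Sketch of the GenerateMockData lifecycle as wired up in the updated
# app/workflows/generate_mock_data/workflow.py; not authoritative API docs.
from toolkit.generate_mock_data import GenerateMockData, create_boilerplate_schema


async def generate_mock_records(openai_config, schema=None):
    gmd = GenerateMockData()
    gmd.set_ai_configuration(openai_config)            # OpenAI / Azure OpenAI configuration (placeholder)
    gmd.set_schema(schema or create_boilerplate_schema())
    record_arrays = [".".join(path) for path in gmd.record_arrays]

    # Generate structured records following the schema (defaults from variables.py).
    # The workflow also passes further arguments (e.g. primary_record_array), omitted here.
    await gmd.generate_data_records(
        generation_guidance="",
        record_arrays=record_arrays,
        num_records_overall=100,
        records_per_batch=20,
        duplicate_records_per_batch=0,
        related_records_per_batch=0,
        temperature=0.5,
        df_update_callback=None,
    )
    return gmd.json_object, gmd.array_dfs              # combined object and per-array DataFrames


async def generate_mock_texts(gmd: GenerateMockData, records_df):
    # Generate one mock text per row of a DataFrame of generated records.
    await gmd.generate_text_data(
        df=records_df,
        generation_guidance="",
        temperature=0.5,
        df_update_callback=None,
    )
    return gmd.text_list, gmd.text_df
```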
