From e3d37a818ed58bdabfa31156e18768541b2d22c7 Mon Sep 17 00:00:00 2001 From: Darren Edge Date: Wed, 23 Oct 2024 17:03:18 +0100 Subject: [PATCH] Structured data apis (#68) * Updated GMD workflow to use API * Example notebook for GMD * ERD API and notebook --------- Co-authored-by: Darren Edge Co-authored-by: Dayenne Souza --- app/util/schema_ui.py | 2 +- app/workflows/extract_record_data/README.md | 4 +- .../extract_record_data/variables.py | 2 + app/workflows/extract_record_data/workflow.py | 20 +- app/workflows/generate_mock_data/README.md | 16 +- app/workflows/generate_mock_data/variables.py | 8 +- app/workflows/generate_mock_data/workflow.py | 66 ++--- example_notebooks/extract_record_data.ipynb | 232 +++++++++++++++ example_notebooks/generate_mock_data.ipynb | 270 ++++++++++++++++++ .../company_grievances_schema.json | 1 - .../customer_complaints_schema.json | 1 - .../company_grievances_schema.json | 1 - .../customer_complaints_schema.json | 1 - .../news_articles/news_articles_schema.json | 1 - toolkit/extract_record_data/__init__.py | 16 ++ toolkit/extract_record_data/api.py | 51 ++++ toolkit/extract_record_data/data_extractor.py | 8 +- toolkit/generate_mock_data/__init__.py | 12 +- toolkit/generate_mock_data/api.py | 91 ++++++ toolkit/generate_mock_data/data_generator.py | 25 +- toolkit/generate_mock_data/schema_builder.py | 18 +- toolkit/generate_mock_data/text_generator.py | 63 ++-- 22 files changed, 771 insertions(+), 138 deletions(-) create mode 100644 example_notebooks/extract_record_data.ipynb create mode 100644 example_notebooks/generate_mock_data.ipynb create mode 100644 toolkit/extract_record_data/__init__.py create mode 100644 toolkit/extract_record_data/api.py create mode 100644 toolkit/generate_mock_data/api.py diff --git a/app/util/schema_ui.py b/app/util/schema_ui.py index 1383526a..8a830a61 100644 --- a/app/util/schema_ui.py +++ b/app/util/schema_ui.py @@ -153,7 +153,7 @@ def generate_form_from_json_schema(global_schema, default_schema, 
field_location return else: if key != 'type': - new_value = st.text_input(f'`{key}` metadata', key=f'{key_with_prefix}_label', value=value) + new_value = st.text_input(f'{key}', key=f'{key_with_prefix}_label', value=value) if new_value != value: field_location[key] = new_value st.rerun() diff --git a/app/workflows/extract_record_data/README.md b/app/workflows/extract_record_data/README.md index b94781bf..541f89e8 100644 --- a/app/workflows/extract_record_data/README.md +++ b/app/workflows/extract_record_data/README.md @@ -8,12 +8,13 @@ Select the `View example outputs` tab (in app) or navigate to [example_outputs/e 1. [**Input**] An instance or collection of unstructured text and (optionally) an existing JSON file containing the JSON schema with which to generate output records. 2. [**Process**] The user edits the uploaded JSON schema or creates one interactively. -3. [**AI Calls**] The system uses generative AI to extract a JSON object from the text following the JSON schema. +3. [**AI Calls**] The system uses generative AI to extract a JSON object from each text following the JSON schema. 4. [**Output**] A dataset of structured records following the JSON schema and (optionally) a newly-defined JSON schema. ## Input requirements - The input schema, if provided, should be a JSON file conforming to the [JSON schema standard](https://json-schema.org/) and following the restrictions of the [OpenAI Structured Outputs API](https://platform.openai.com/docs/guides/structured-outputs/supported-schemas). +- The unstructured text is sent to the AI API for record extraction (either OpenAI or Azure OpenAI). Such data use must comply with all applicable laws, regulations, and policies, including those pertaining to privacy and security. ## Use with other workflows @@ -41,7 +42,6 @@ In the top left, you have the option to upload an existing JSON schema. This is The initial JSON schema contains some boilerplate metadata fields representing best practices for schema design. 
The metadata fields are as follows: - `$schema`: Indicates the version of the json-schema standard that the current schema follows. Leave this field as it is. -- `$id`: Provides a global ID for this schema anchored in a web domain, e.g., that of your organization. You may wish to edit this if you expect your schema to be widely used, but it can be left as it is for use inside Intelligence Toolkit. - `title`: A short title indicating the kind of data that the schema represents. - `description`: A longer description of the kind of data that the schema represents. diff --git a/app/workflows/extract_record_data/variables.py b/app/workflows/extract_record_data/variables.py index dcedabf0..0d09ac4f 100644 --- a/app/workflows/extract_record_data/variables.py +++ b/app/workflows/extract_record_data/variables.py @@ -3,6 +3,7 @@ # import streamlit as st from app.util.session_variable import SessionVariable +from toolkit.extract_record_data import ExtractRecordData from toolkit.generate_mock_data.schema_builder import create_boilerplate_schema class SessionVariables: @@ -13,6 +14,7 @@ def __init__(self, prefix): self.create_session(prefix) def create_session(self, prefix): + self.workflow_object = SessionVariable(ExtractRecordData(), prefix) self.schema = SessionVariable(create_boilerplate_schema(), prefix) self.loaded_schema_filename = SessionVariable('', prefix) self.loaded_data_filename = SessionVariable('', prefix) diff --git a/app/workflows/extract_record_data/workflow.py b/app/workflows/extract_record_data/workflow.py index cc631d36..50a62fa4 100644 --- a/app/workflows/extract_record_data/workflow.py +++ b/app/workflows/extract_record_data/workflow.py @@ -22,7 +22,8 @@ def get_intro(): async def create(sv: variables.SessionVariables, workflow: None): ui_components.check_ai_configuration() - + erd = sv.workflow_object.value + erd.set_ai_configuration(ai_configuration) intro_tab, schema_tab, generator_tab, mock_tab = st.tabs(['Extract Record Data workflow:', 'Prepare data 
schema', 'Extract structured records', 'View example outputs']) with intro_tab: file_content = get_intro() @@ -35,8 +36,8 @@ async def create(sv: variables.SessionVariables, workflow: None): with schema_tab: sv.loaded_schema_filename.value = schema_ui.build_schema_ui( sv.schema.value, sv.loaded_schema_filename.value) - array_field_arrays = data_extractor.extract_array_fields(sv.schema.value) - sv.record_arrays.value = ['.'.join(a) for a in array_field_arrays] + erd.set_schema(sv.schema.value) + sv.record_arrays.value = [".".join(x) for x in erd.record_arrays] with generator_tab: d1, d2 = st.columns([1, 1]) with d1: @@ -92,19 +93,16 @@ def on_dfs_update(path_to_df): for placeholder in df_placeholders: placeholder.empty() - ( - sv.final_object.value, - sv.generated_objects.value, - sv.generated_dfs.value - ) = await data_extractor.extract_record_data( - ai_configuration=ai_configuration, + + + await erd.extract_record_data( input_texts=input_texts, generation_guidance=sv.generation_guidance.value, - record_arrays=sv.record_arrays.value, - data_schema=sv.schema.value, df_update_callback=on_dfs_update, callback_batch=None ) + sv.final_object.value = erd.json_object, + sv.generated_dfs.value = erd.array_dfs for ix, record_array in enumerate(sv.record_arrays.value): with df_placeholders[ix]: diff --git a/app/workflows/generate_mock_data/README.md b/app/workflows/generate_mock_data/README.md index c16d1a7a..2ede4790 100644 --- a/app/workflows/generate_mock_data/README.md +++ b/app/workflows/generate_mock_data/README.md @@ -34,9 +34,9 @@ In the top left, you have the option to upload an existing JSON schema. This is The initial JSON schema contains some boilerplate metadata fields representing best practices for schema design. The metadata fields are as follows: - `$schema`: Indicates the version of the json-schema standard that the current schema follows. Leave this field as it is. 
-- `$id`: Provides a global ID for this schema anchored in a web domain, e.g., that of your organization. You may wish to edit this if you expect your schema to be widely used, but it can be left as it is for use inside Intelligence Toolkit. - `title`: A short title indicating the kind of data that the schema represents. - `description`: A longer description of the kind of data that the schema represents. +- `records`: The collection of records represented by the schema. Try editing some of these metadata fields now, and see them reflected in the `Preview` of the `JSON schema` to the right. In particular, set the title field to `Customer Complaints`. @@ -44,18 +44,10 @@ The schema in progress is validated after every change, with the message `Schema Try downloading an edited schema using the download button, uploading it via the `Upload schema` control, then continuing as below. -### Creating the record collection - -Now select the `Sample object` tab, and notice how none of these fields are contained in the sample object itself. We can understand this by going back to the `JSON schema` tab and seeing that the schema is of type `object` and that the `properties` of the object are currently empty, indicated by the empty braces `{}`. Whatever we add to the `properties` of the top-level object in the schema gets added to the `Sample object` (and indeed to any objects that conform to the schema). - -Let's now add some fields to the object using the buttons under `Add top-level field` in the form to the left. - -To create a dataset of records rather than a single object, the schema needs to contain an object array field. Press the `obj[]` button to add an object array field at the top level (i.e., level 0). The new field will be given a generic name by default: `object_array_1`. Rename this to `complaint_records` and see on the right how this creates an array of objects whose properties you can define next. 
- -Note that all new fields have the `Required?` checkbox checked by default, placing all field names in the `required` field of the object. This is a requirement for the [OpenAI Structured Outputs API](https://platform.openai.com/docs/guides/structured-outputs/supported-schemas), which we'll later use to generate mock data that follows the schema. Similarly, all objects must also have `additionalProperties` set to `false`, so the `Additional?` checkbox is left unchecked by default. - ### Defining record attributes +Let's now specify the type of records represented by the schema by renaming `records` to `complaint_records` in the form to the left. + Next, we need to add fields to the objects of `complaint_records` for each attribute of the records we want to create. Using the controls under `Add field to complaint_records`, press the `str` button to add a string (i.e., text) field. This field appears as the level 1 string `string_1` (level 1 because the field is nested one level down from the `complaint_records` array at the top level, i.e., level 0). Edit the text label from `string_1` to `name`. @@ -67,6 +59,8 @@ As further string fields within `complaint_records`, now add: - `email` as string field - `price_issue`, `quality_issue`, `service_issue`, `delivery_issue`, `description_issue` as boolean (`true`/`false`) fields using the `bool` button +Note that all new fields have the `Required?` checkbox checked by default, placing all field names in the `required` field of the object. This is a requirement for the [OpenAI Structured Outputs API](https://platform.openai.com/docs/guides/structured-outputs/supported-schemas), which we'll later use to generate mock data that follows the schema. Similarly, all objects must also have `additionalProperties` set to `false`, so the `Additional?` checkbox is left unchecked by default. + Next, we want to add a `product_code` string field, but limit the possible values of the field to a predefined list called an "enumeration". 
Do this by checking the `Enum?` checkbox and observing the default values `A`, `B`, and `C` added to the enumeration. These values can be edited, deleted, and expanded as desired. For this tutorial, simply add further enum values alphabetically from `D` to `H`. Note that boolean attributes of the record could also have been created using the `str[]` button to create a string array, checking `Enum?`, and specifying `price_issue`, `quality_issue`, `service_issue`, `delivery_issue`, `description_issue` as possible values. However, by using independent boolean fields we simplify the overall record structure and avoid the challenges of nested arrays in the final data object. diff --git a/app/workflows/generate_mock_data/variables.py b/app/workflows/generate_mock_data/variables.py index e1b6b320..2a513cba 100644 --- a/app/workflows/generate_mock_data/variables.py +++ b/app/workflows/generate_mock_data/variables.py @@ -3,7 +3,7 @@ # import streamlit as st from app.util.session_variable import SessionVariable -from toolkit.generate_mock_data.schema_builder import create_boilerplate_schema +from toolkit.generate_mock_data import GenerateMockData, create_boilerplate_schema class SessionVariables: prefix = None @@ -13,10 +13,10 @@ def __init__(self, prefix): self.create_session(prefix) def create_session(self, prefix): + self.workflow_object = SessionVariable(GenerateMockData(), prefix) self.schema = SessionVariable(create_boilerplate_schema(), prefix) self.num_records_overall = SessionVariable(100, prefix) self.records_per_batch = SessionVariable(20, prefix) - self.parallel_batches = SessionVariable(5, prefix) self.duplicate_records_per_batch = SessionVariable(0, prefix) self.related_records_per_batch = SessionVariable(0, prefix) self.primary_record_array = SessionVariable('', prefix) @@ -31,8 +31,8 @@ def create_session(self, prefix): self.generated_text_df = SessionVariable(None, prefix) self.uploaded_synthesis_files = SessionVariable([], prefix) 
self.synthesis_max_rows_to_process = SessionVariable(0, prefix) - self.text_synthesis_temperature = SessionVariable(0.7, prefix) - self.record_synthesis_temperature = SessionVariable(0.7, prefix) + self.text_synthesis_temperature = SessionVariable(0.5, prefix) + self.record_synthesis_temperature = SessionVariable(0.5, prefix) self.input_texts = SessionVariable([], prefix) def reset_workflow(self): diff --git a/app/workflows/generate_mock_data/workflow.py b/app/workflows/generate_mock_data/workflow.py index 0a06e705..df600695 100644 --- a/app/workflows/generate_mock_data/workflow.py +++ b/app/workflows/generate_mock_data/workflow.py @@ -8,8 +8,7 @@ import app.util.schema_ui as schema_ui import app.util.ui_components as ui_components import app.workflows.generate_mock_data.variables as bds_variables -import toolkit.generate_mock_data.data_generator as data_generator -import toolkit.generate_mock_data.text_generator as text_generator +from toolkit.generate_mock_data import GenerateMockData from app.util.download_pdf import add_download_pdf from app.util.openai_wrapper import UIOpenAIConfiguration @@ -23,7 +22,8 @@ def get_intro(): async def create(sv: bds_variables.SessionVariables, workflow: None): ui_components.check_ai_configuration() - + gmd: GenerateMockData = sv.workflow_object.value + gmd.set_ai_configuration(ai_configuration) intro_tab, schema_tab, record_generator_tab, text_generator_tab, mock_tab = st.tabs(['Generate Mock Data workflow:', 'Prepare data schema', 'Generate mock records', 'Generate mock texts', 'View example outputs']) with intro_tab: file_content = get_intro() @@ -35,49 +35,29 @@ async def create(sv: bds_variables.SessionVariables, workflow: None): ) with schema_tab: sv.loaded_filename.value = schema_ui.build_schema_ui(sv.schema.value, sv.loaded_filename.value) + gmd.set_schema(sv.schema.value) + sv.record_arrays.value = [".".join(x) for x in gmd.record_arrays] with record_generator_tab: if len(sv.schema.value['properties']) == 0: 
st.warning("Prepare data schema to continue.") else: st.markdown("##### Data generation controls") - c1, c2, c3, c4, c5, c6 = st.columns(6) + c1, c2, c3, c4 = st.columns(4) with c1: - array_field_arrays = data_generator.extract_array_fields(sv.schema.value) - sv.record_arrays.value = ['.'.join(a) for a in array_field_arrays] - st.selectbox("Primary record array", sv.record_arrays.value, key=sv.primary_record_array.key, - help="In the presence of multiple arrays, select the one that represents the primary record type whose records should be counted towards the `Total records to generate` target") - with c2: st.number_input("Records per batch", min_value=1, value=sv.records_per_batch.value, key=sv.records_per_batch.key, help="How many records to generate in a single LLM call") - with c3: - - def on_change_batches_num() -> None: - sv.num_records_overall.value = ( - sv.records_per_batch.value * sv.parallel_batches.value - ) - - st.number_input( - "Parallel batches", - min_value=0, - step=1, - value=sv.parallel_batches.value, - on_change=on_change_batches_num, - key=sv.parallel_batches.key, - help="In a single iteration, how many batches to generate via parallel LLM calls", - ) - with c4: + with c2: st.number_input( "Total records to generate", - min_value=sv.records_per_batch.value * sv.parallel_batches.value, - step=sv.records_per_batch.value * sv.parallel_batches.value, + min_value=sv.records_per_batch.value, value=sv.num_records_overall.value, key=sv.num_records_overall.key, help="How many records to generate. 
Must be a multiple of `Records per batch` x `Parallel batches`", ) - with c5: + with c3: st.number_input("Duplicate records per batch", min_value=0, value=sv.duplicate_records_per_batch.value, key=sv.duplicate_records_per_batch.key, help="Within each batch, how many records should be near-duplicates of a seed record randomly selected from existing records") - with c6: + with c4: st.number_input("Related records per batch", min_value=0, value=sv.related_records_per_batch.value, key=sv.related_records_per_batch.key, help="Within each batch, how many records should appear closely related to (but not the same as) a seed record randomly selected from existing records") st.text_area("AI data generation guidance", key=sv.generation_guidance.key, value=sv.generation_guidance.value, @@ -110,21 +90,18 @@ def on_dfs_update(path_to_df): for placeholder in df_placeholders: placeholder.empty() - sv.final_object.value, sv.generated_objects.value, sv.generated_dfs.value = await data_generator.generate_data( - ai_configuration=ai_configuration, + await gmd.generate_data_records( generation_guidance=sv.generation_guidance.value, - primary_record_array=sv.primary_record_array.value, - record_arrays=sv.record_arrays.value, num_records_overall=sv.num_records_overall.value, records_per_batch=sv.records_per_batch.value, - parallel_batches=sv.parallel_batches.value, duplicate_records_per_batch=sv.duplicate_records_per_batch.value, related_records_per_batch=sv.related_records_per_batch.value, - data_schema=sv.schema.value, temperature=sv.record_synthesis_temperature.value, df_update_callback=on_dfs_update, callback_batch=None ) + sv.final_object.value = gmd.json_object + sv.generated_dfs.value = gmd.array_dfs for ix, record_array in enumerate(sv.record_arrays.value): with df_placeholders[ix]: @@ -167,7 +144,7 @@ def on_dfs_update(path_to_df): if changed: if selected_df is not None: sv.input_texts.value = [] - for ix, row in selected_df.iterrows(): + for _, row in selected_df.iterrows(): 
sv.input_texts.value.append(row.to_json()) sv.generated_texts.value = [] st.text_area("AI text generation guidance", key=sv.text_generation_guidance.key, value=sv.text_generation_guidance.value, @@ -195,18 +172,15 @@ def on_dfs_update(df): sv.generated_texts.value = pd.DataFrame() df_placeholder.empty() - ( - sv.generated_texts.value, - sv.generated_text_df.value - ) = await text_generator.generate_text_data( - ai_configuration=ai_configuration, - input_texts=sv.input_texts.value, + await gmd.generate_text_data( + df=selected_df, generation_guidance=sv.text_generation_guidance.value, temperature=sv.text_synthesis_temperature.value, - df_update_callback=on_dfs_update, - parallel_threads=10, - callback_batch=None + df_update_callback=on_dfs_update ) + sv.generated_texts.value = gmd.text_list + sv.generated_text_df.value = gmd.text_df + if sv.generated_text_df.value is not None and selected_file is not None: with df_placeholder: diff --git a/example_notebooks/extract_record_data.ipynb b/example_notebooks/extract_record_data.ipynb new file mode 100644 index 00000000..6323dadd --- /dev/null +++ b/example_notebooks/extract_record_data.ipynb @@ -0,0 +1,232 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Extract Record Data\n", + "\n", + "Demonstrates use of the Intelligence Toolkit library to extract schema-aligned data records from unstructured texts.\n", + "\n", + "See [readme](https://github.com/microsoft/intelligence-toolkit/blob/main/app/workflows/extract_record_data/README.md) for more details." + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "c:\\Users\\daedge\\AppData\\Local\\pypoetry\\Cache\\virtualenvs\\intelligence-toolkit-lXFNld9n-py3.11\\Lib\\site-packages\\tqdm\\auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. 
See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", + " from .autonotebook import tqdm as notebook_tqdm\n" + ] + } + ], + "source": [ + "import sys\n", + "sys.path.append(\"..\")\n", + "import os\n", + "from toolkit.extract_record_data import ExtractRecordData\n", + "from toolkit.AI.openai_configuration import OpenAIConfiguration\n", + "import pandas as pd\n", + "import json" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Loaded data schema\n", + "{'$schema': 'http://json-schema.org/draft/2020-12/schema', 'title': 'Customer complaints', 'description': 'An example schema storing an array of customer complaints', 'type': 'object', 'properties': {'customer_complaints': {'type': 'array', 'description': 'The list of customers and their complaints', 'items': {'type': 'object', 'description': 'An object list item', 'properties': {'name': {'type': 'string', 'description': 'The name of the customer'}, 'street': {'type': 'string', 'description': 'The street of the customer, including property name/number'}, 'city': {'type': 'string', 'description': 'The city of the customer'}, 'age': {'type': 'number', 'description': 'The age of the customer'}, 'email': {'type': 'string', 'description': 'The email address of the customer'}, 'price_issue': {'type': 'boolean', 'description': 'The complaint is a price issue'}, 'quality_issue': {'type': 'boolean', 'description': 'The complaint is a quality issue'}, 'service_issue': {'type': 'boolean', 'description': 'The complaint is a service issue'}, 'delivery_issue': {'type': 'boolean', 'description': 'The complaint is a delivery issue'}, 'description_issue': {'type': 'boolean', 'description': 'The complaint is a description issue'}, 'product_code': {'type': 'string', 'description': 'The product code targeted by the complaint', 'enum': ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H']}, 'quarter': {'type': 'string', 
'description': 'The quarter in which the complaint was made (since 2020-Q1)'}}, 'required': ['name', 'street', 'city', 'age', 'email', 'price_issue', 'quality_issue', 'service_issue', 'delivery_issue', 'description_issue', 'product_code', 'quarter'], 'additionalProperties': False}}}, 'required': ['customer_complaints'], 'additionalProperties': False}\n" + ] + } + ], + "source": [ + "# Create the workflow object\n", + "erd = ExtractRecordData()\n", + "# Set the AI configuration\n", + "ai_configuration = OpenAIConfiguration(\n", + " {\n", + " \"api_type\": \"OpenAI\",\n", + " \"api_key\": os.environ[\"OPENAI_API_KEY\"],\n", + " \"model\": \"gpt-4o\",\n", + " }\n", + ")\n", + "erd.set_ai_configuration(ai_configuration)\n", + "# Load the data schema\n", + "schema_path = \"../example_outputs/extract_record_data/customer_complaints/customer_complaints_schema.json\"\n", + "json_schema = json.loads(open(schema_path, \"r\").read())\n", + "# Set the schema\n", + "erd.set_schema(json_schema)\n", + "print(\"Loaded data schema\")\n", + "print(json_schema)" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Loaded text data\n", + " mock_text\n", + "0 **Customer Service Representative:** Good afte...\n", + "1 **Customer Service Representative:** Good afte...\n", + "2 **Customer Service Representative:** Good afte...\n", + "3 **Customer Service Representative:** Good afte...\n", + "4 **Customer Service Representative:** Good afte...\n", + "5 **Customer Service Representative:** Good afte...\n", + "6 **Customer Service Representative:** Good afte...\n", + "7 **Customer Service Representative:** Good afte...\n", + "8 **Customer Service Representative:** Good afte...\n", + "9 **Customer Service Representative:** Good afte...\n" + ] + } + ], + "source": [ + "# Load the text data (first 10 texts only)\n", + "text_data_path = 
\"../example_outputs/extract_record_data/customer_complaints/customer_complaints_texts.csv\"\n", + "text_data = pd.read_csv(text_data_path)[:10]\n", + "print(\"Loaded text data\")\n", + "print(text_data)" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|██████████| 10/10 [00:04<00:00, 2.34it/s]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Extracted data records\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\n" + ] + } + ], + "source": [ + "# Extract data records\n", + "await erd.extract_record_data(\n", + " input_texts=text_data['mock_text'].tolist()\n", + ")\n", + "print(\"Extracted data records\")" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{'customer_complaints': [{'name': 'Bob Johnson', 'street': '123 Maple Street', 'city': 'Springfield', 'age': 36, 'email': 'bob.johnson@example.com', 'price_issue': False, 'quality_issue': True, 'service_issue': False, 'delivery_issue': False, 'description_issue': False, 'product_code': 'A', 'quarter': '2023-Q2'}, {'name': 'Alice Johnson', 'street': '456 Oak Avenue', 'city': 'Springfield', 'age': 0, 'email': 'alice.j@example.com', 'price_issue': False, 'quality_issue': True, 'service_issue': False, 'delivery_issue': True, 'description_issue': False, 'product_code': 'A', 'quarter': '2023-Q3'}, {'name': 'Alice Smith', 'street': '789 Pine Road', 'city': 'Springfield', 'age': 0, 'email': 'alice.smith@example.com', 'price_issue': False, 'quality_issue': True, 'service_issue': False, 'delivery_issue': False, 'description_issue': True, 'product_code': 'A', 'quarter': '2023-Q2'}, {'name': 'Alice Johnson', 'street': '123 Maple Street', 'city': 'Shelbyville', 'age': 0, 'email': 'alice.johnson@anothermail.com', 'price_issue': True, 'quality_issue': 
True, 'service_issue': False, 'delivery_issue': False, 'description_issue': True, 'product_code': 'A', 'quarter': '2023-Q2'}, {'name': 'Alice Johnson', 'street': '123 Maple Street', 'city': 'Springfield', 'age': 0, 'email': 'alice.johnson@samplemail.com', 'price_issue': False, 'quality_issue': True, 'service_issue': True, 'delivery_issue': False, 'description_issue': False, 'product_code': 'B', 'quarter': '2023-Q2'}, {'name': 'Charlie Brown', 'street': '321 Elm Street', 'city': 'Shelbyville', 'age': 0, 'email': 'charlie.brown@example.com', 'price_issue': True, 'quality_issue': False, 'service_issue': False, 'delivery_issue': False, 'description_issue': False, 'product_code': 'B', 'quarter': '2023-Q1'}, {'name': 'Diana Prince', 'street': '987 Cedar Lane', 'city': 'Metropolis', 'age': 0, 'email': 'diana.prince@example.com', 'price_issue': False, 'quality_issue': False, 'service_issue': True, 'delivery_issue': False, 'description_issue': False, 'product_code': 'C', 'quarter': '2023-Q2'}, {'name': 'Evan Wright', 'street': '654 Birch Boulevard', 'city': 'Gotham', 'age': 30, 'email': 'evan.wright@example.com', 'price_issue': False, 'quality_issue': False, 'service_issue': False, 'delivery_issue': True, 'description_issue': False, 'product_code': 'D', 'quarter': '2023-Q3'}, {'name': 'Fiona Apple', 'street': '111 Apple Street', 'city': 'Star City', 'age': 29, 'email': 'fiona.apple@example.com', 'price_issue': True, 'quality_issue': False, 'service_issue': False, 'delivery_issue': False, 'description_issue': True, 'product_code': 'E', 'quarter': '2023-Q4'}, {'name': 'George Martin', 'street': '222 Music Avenue', 'city': 'Central City', 'age': 0, 'email': 'george.martin@example.com', 'price_issue': False, 'quality_issue': True, 'service_issue': False, 'delivery_issue': False, 'description_issue': False, 'product_code': 'F', 'quarter': '2023-Q1'}]}\n" + ] + } + ], + "source": [ + "# Inspect the data as JSON\n", + "print(erd.json_object)" + ] + }, + { + "cell_type": "code", + 
"execution_count": 6, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{'customer_complaints': name street city age \\\n", + "0 Bob Johnson 123 Maple Street Springfield 36 \n", + "1 Alice Johnson 456 Oak Avenue Springfield 0 \n", + "2 Alice Smith 789 Pine Road Springfield 0 \n", + "3 Alice Johnson 123 Maple Street Shelbyville 0 \n", + "4 Alice Johnson 123 Maple Street Springfield 0 \n", + "5 Charlie Brown 321 Elm Street Shelbyville 0 \n", + "6 Diana Prince 987 Cedar Lane Metropolis 0 \n", + "7 Evan Wright 654 Birch Boulevard Gotham 30 \n", + "8 Fiona Apple 111 Apple Street Star City 29 \n", + "9 George Martin 222 Music Avenue Central City 0 \n", + "\n", + " email price_issue quality_issue service_issue \\\n", + "0 bob.johnson@example.com False True False \n", + "1 alice.j@example.com False True False \n", + "2 alice.smith@example.com False True False \n", + "3 alice.johnson@anothermail.com True True False \n", + "4 alice.johnson@samplemail.com False True True \n", + "5 charlie.brown@example.com True False False \n", + "6 diana.prince@example.com False False True \n", + "7 evan.wright@example.com False False False \n", + "8 fiona.apple@example.com True False False \n", + "9 george.martin@example.com False True False \n", + "\n", + " delivery_issue description_issue product_code quarter \n", + "0 False False A 2023-Q2 \n", + "1 True False A 2023-Q3 \n", + "2 False True A 2023-Q2 \n", + "3 False True A 2023-Q2 \n", + "4 False False B 2023-Q2 \n", + "5 False False B 2023-Q1 \n", + "6 False False C 2023-Q2 \n", + "7 True False D 2023-Q3 \n", + "8 False True E 2023-Q4 \n", + "9 False False F 2023-Q1 }\n" + ] + } + ], + "source": [ + "# Inspect the data as dataframes (one per array field)\n", + "print(erd.array_dfs)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "intelligence-toolkit-lXFNld9n-py3.11", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": 
"ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.9" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/example_notebooks/generate_mock_data.ipynb b/example_notebooks/generate_mock_data.ipynb new file mode 100644 index 00000000..e5e1a6f8 --- /dev/null +++ b/example_notebooks/generate_mock_data.ipynb @@ -0,0 +1,270 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Generate Mock Data\n", + "\n", + "Demonstrates use of the Intelligence Toolkit library to generate mock data, both structured records and unstructured texts.\n", + "\n", + "See [readme](https://github.com/microsoft/intelligence-toolkit/blob/main/app/workflows/generate_mock_data/README.md) for more details." + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "c:\\Users\\daedge\\AppData\\Local\\pypoetry\\Cache\\virtualenvs\\intelligence-toolkit-lXFNld9n-py3.11\\Lib\\site-packages\\tqdm\\auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. 
See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", + " from .autonotebook import tqdm as notebook_tqdm\n" + ] + } + ], + "source": [ + "import sys\n", + "sys.path.append(\"..\")\n", + "import os\n", + "from toolkit.generate_mock_data import GenerateMockData\n", + "from toolkit.AI.openai_configuration import OpenAIConfiguration\n", + "import pandas as pd\n", + "import json" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Loaded data schema\n", + "{'$schema': 'http://json-schema.org/draft/2020-12/schema', 'title': 'Customer complaints', 'description': 'An example schema storing an array of customer complaints', 'type': 'object', 'properties': {'customer_complaints': {'type': 'array', 'description': 'The list of customers and their complaints', 'items': {'type': 'object', 'description': 'An object list item', 'properties': {'name': {'type': 'string', 'description': 'The name of the customer'}, 'street': {'type': 'string', 'description': 'The street of the customer, including property name/number'}, 'city': {'type': 'string', 'description': 'The city of the customer'}, 'age': {'type': 'number', 'description': 'The age of the customer'}, 'email': {'type': 'string', 'description': 'The email address of the customer'}, 'price_issue': {'type': 'boolean', 'description': 'The complaint is a price issue'}, 'quality_issue': {'type': 'boolean', 'description': 'The complaint is a quality issue'}, 'service_issue': {'type': 'boolean', 'description': 'The complaint is a service issue'}, 'delivery_issue': {'type': 'boolean', 'description': 'The complaint is a delivery issue'}, 'description_issue': {'type': 'boolean', 'description': 'The complaint is a description issue'}, 'product_code': {'type': 'string', 'description': 'The product code targeted by the complaint', 'enum': ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H']}, 'quarter': {'type': 'string', 
'description': 'The quarter in which the complaint was made (since 2020-Q1)'}}, 'required': ['name', 'street', 'city', 'age', 'email', 'price_issue', 'quality_issue', 'service_issue', 'delivery_issue', 'description_issue', 'product_code', 'quarter'], 'additionalProperties': False}}}, 'required': ['customer_complaints'], 'additionalProperties': False}\n" + ] + } + ], + "source": [ + "# Create the workflow object\n", + "gmd = GenerateMockData()\n", + "# Set the AI configuration\n", + "ai_configuration = OpenAIConfiguration(\n", + " {\n", + " \"api_type\": \"OpenAI\",\n", + " \"api_key\": os.environ[\"OPENAI_API_KEY\"],\n", + " \"model\": \"gpt-4o\",\n", + " }\n", + ")\n", + "gmd.set_ai_configuration(ai_configuration)\n", + "# Load the data schema\n", + "schema_path = \"../example_outputs/generate_mock_data/customer_complaints/customer_complaints_schema.json\"\n", + "json_schema = json.loads(open(schema_path, \"r\").read())\n", + "# Set the schema\n", + "gmd.set_schema(json_schema)\n", + "print(\"Loaded data schema\")\n", + "print(json_schema)" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|██████████| 10/10 [00:20<00:00, 2.08s/it]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Generated data records\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\n" + ] + } + ], + "source": [ + "# Generate mock data records\n", + "await gmd.generate_data_records(\n", + " num_records_overall=100,\n", + " records_per_batch=10,\n", + " duplicate_records_per_batch=1,\n", + " related_records_per_batch=1,\n", + ")\n", + "print(\"Generated data records\")" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{'customer_complaints': [{'name': 'Alicia Johnson', 'street': '123 Maple St.', 'city': 'Springfield', 
'age': 34, 'email': 'alicia.johnson@example.com', 'price_issue': True, 'quality_issue': False, 'service_issue': False, 'delivery_issue': False, 'description_issue': False, 'product_code': 'A', 'quarter': '2021-Q3'}, {'name': 'Robert Johnson', 'street': '124 Maple Street', 'city': 'Springfield', 'age': 36, 'email': 'robert.johnson@example.com', 'price_issue': False, 'quality_issue': False, 'service_issue': True, 'delivery_issue': False, 'description_issue': False, 'product_code': 'A', 'quarter': '2021-Q3'}, {'name': 'Emily Clark', 'street': '456 Oak Avenue', 'city': 'Riverside', 'age': 28, 'email': 'emily.clark@example.com', 'price_issue': False, 'quality_issue': True, 'service_issue': False, 'delivery_issue': False, 'description_issue': False, 'product_code': 'B', 'quarter': '2021-Q4'}, {'name': 'Michael Brown', 'street': '789 Pine Road', 'city': 'Greenfield', 'age': 42, 'email': 'michael.brown@example.com', 'price_issue': False, 'quality_issue': False, 'service_issue': False, 'delivery_issue': True, 'description_issue': False, 'product_code': 'C', 'quarter': '2022-Q1'}, {'name': 'Jessica Smith', 'street': '321 Cedar Lane', 'city': 'Lakeside', 'age': 30, 'email': 'jessica.smith@example.com', 'price_issue': True, 'quality_issue': True, 'service_issue': False, 'delivery_issue': False, 'description_issue': False, 'product_code': 'D', 'quarter': '2022-Q2'}, {'name': 'David Wilson', 'street': '654 Birch Street', 'city': 'Hilltown', 'age': 37, 'email': 'david.wilson@example.com', 'price_issue': False, 'quality_issue': False, 'service_issue': False, 'delivery_issue': False, 'description_issue': True, 'product_code': 'E', 'quarter': '2022-Q3'}, {'name': 'Sophia Martinez', 'street': '987 Elm Street', 'city': 'Brookside', 'age': 25, 'email': 'sophia.martinez@example.com', 'price_issue': True, 'quality_issue': False, 'service_issue': False, 'delivery_issue': False, 'description_issue': True, 'product_code': 'F', 'quarter': '2022-Q4'}, {'name': 'Liam Johnson', 'street': '123 
Maple Street', 'city': 'Springfield', 'age': 40, 'email': 'liam.johnson@example.com', 'price_issue': False, 'quality_issue': True, 'service_issue': False, 'delivery_issue': False, 'description_issue': False, 'product_code': 'G', 'quarter': '2023-Q1'}, {'name': 'Olivia Garcia', 'street': '111 Walnut Way', 'city': 'Riverside', 'age': 29, 'email': 'olivia.garcia@example.com', 'price_issue': False, 'quality_issue': False, 'service_issue': True, 'delivery_issue': True, 'description_issue': False, 'product_code': 'H', 'quarter': '2023-Q2'}, {'name': 'James Lee', 'street': '222 Spruce Drive', 'city': 'Hilltown', 'age': 45, 'email': 'james.lee@example.com', 'price_issue': True, 'quality_issue': False, 'service_issue': False, 'delivery_issue': False, 'description_issue': False, 'product_code': 'A', 'quarter': '2023-Q3'}, {'name': 'Robert Smith', 'street': '456 Oak Avenue', 'city': 'Rivertown', 'age': 45, 'email': 'robert.smith@example.com', 'price_issue': False, 'quality_issue': True, 'service_issue': False, 'delivery_issue': False, 'description_issue': False, 'product_code': 'B', 'quarter': '2022-Q1'}, {'name': 'Alice Smith', 'street': '456 Oak Avenue', 'city': 'Rivertown', 'age': 42, 'email': 'alice.smith@example.com', 'price_issue': False, 'quality_issue': False, 'service_issue': True, 'delivery_issue': False, 'description_issue': False, 'product_code': 'B', 'quarter': '2022-Q2'}, {'name': 'John Doe', 'street': '789 Pine Street', 'city': 'Lakeside', 'age': 30, 'email': 'john.doe@example.com', 'price_issue': True, 'quality_issue': False, 'service_issue': False, 'delivery_issue': False, 'description_issue': False, 'product_code': 'A', 'quarter': '2022-Q3'}, {'name': 'Jane Roe', 'street': '123 Elm Street', 'city': 'Mountainview', 'age': 35, 'email': 'jane.roe@example.com', 'price_issue': False, 'quality_issue': False, 'service_issue': True, 'delivery_issue': False, 'description_issue': True, 'product_code': 'C', 'quarter': '2022-Q4'}, {'name': 'Charlie Brown', 'street': 
'321 Maple Avenue', 'city': 'Seaside', 'age': 28, 'email': 'charlie.brown@example.com', 'price_issue': False, 'quality_issue': True, 'service_issue': False, 'delivery_issue': True, 'description_issue': False, 'product_code': 'D', 'quarter': '2022-Q1'}, {'name': 'Emily White', 'street': '654 Oak Avenue', 'city': 'Rivertown', 'age': 50, 'email': 'emily.white@example.com', 'price_issue': True, 'quality_issue': False, 'service_issue': False, 'delivery_issue': True, 'description_issue': False, 'product_code': 'E', 'quarter': '2022-Q2'}, {'name': 'Samuel Green', 'street': '987 Birch Road', 'city': 'Hilltown', 'age': 40, 'email': 'samuel.green@example.com', 'price_issue': False, 'quality_issue': False, 'service_issue': True, 'delivery_issue': False, 'description_issue': False, 'product_code': 'F', 'quarter': '2022-Q3'}, {'name': 'Lisa Black', 'street': '222 Cedar Lane', 'city': 'Riverbend', 'age': 33, 'email': 'lisa.black@example.com', 'price_issue': True, 'quality_issue': False, 'service_issue': False, 'delivery_issue': False, 'description_issue': True, 'product_code': 'G', 'quarter': '2022-Q4'}, {'name': 'Michael Blue', 'street': '555 Spruce Street', 'city': 'Forestville', 'age': 29, 'email': 'michael.blue@example.com', 'price_issue': False, 'quality_issue': True, 'service_issue': False, 'delivery_issue': False, 'description_issue': False, 'product_code': 'H', 'quarter': '2022-Q1'}, {'name': 'Rachel Grey', 'street': '888 Willow Way', 'city': 'Brookside', 'age': 37, 'email': 'rachel.grey@example.com', 'price_issue': False, 'quality_issue': False, 'service_issue': False, 'delivery_issue': True, 'description_issue': False, 'product_code': 'A', 'quarter': '2022-Q2'}, {'name': 'Catherine Brown', 'street': '789 Pine Rd', 'city': 'Lakeside', 'age': 29, 'email': 'catherine.brown@example.com', 'price_issue': False, 'quality_issue': False, 'service_issue': True, 'delivery_issue': False, 'description_issue': False, 'product_code': 'C', 'quarter': '2020-Q4'}, {'name': 'Michael 
Brown', 'street': '789 Pine Road', 'city': 'Lakeside', 'age': 31, 'email': 'michael.brown@example.com', 'price_issue': False, 'quality_issue': True, 'service_issue': False, 'delivery_issue': False, 'description_issue': False, 'product_code': 'C', 'quarter': '2021-Q1'}, {'name': 'Alice Smith', 'street': '123 Maple Avenue', 'city': 'Springfield', 'age': 45, 'email': 'alice.smith@example.com', 'price_issue': True, 'quality_issue': False, 'service_issue': False, 'delivery_issue': False, 'description_issue': False, 'product_code': 'A', 'quarter': '2021-Q2'}, {'name': 'John Doe', 'street': '456 Oak Street', 'city': 'Rivertown', 'age': 38, 'email': 'john.doe@example.com', 'price_issue': False, 'quality_issue': True, 'service_issue': False, 'delivery_issue': True, 'description_issue': False, 'product_code': 'B', 'quarter': '2020-Q3'}, {'name': 'Emma Wilson', 'street': '789 Elm Street', 'city': 'Mountainview', 'age': 52, 'email': 'emma.wilson@example.com', 'price_issue': False, 'quality_issue': False, 'service_issue': True, 'delivery_issue': False, 'description_issue': True, 'product_code': 'D', 'quarter': '2021-Q4'}, {'name': 'Liam Johnson', 'street': '321 Birch Lane', 'city': 'Greenfield', 'age': 27, 'email': 'liam.johnson@example.com', 'price_issue': True, 'quality_issue': False, 'service_issue': False, 'delivery_issue': False, 'description_issue': False, 'product_code': 'E', 'quarter': '2022-Q1'}, {'name': 'Olivia Martinez', 'street': '654 Cedar Court', 'city': 'Lakeside', 'age': 33, 'email': 'olivia.martinez@example.com', 'price_issue': False, 'quality_issue': True, 'service_issue': False, 'delivery_issue': False, 'description_issue': False, 'product_code': 'F', 'quarter': '2022-Q2'}, {'name': 'Noah Davis', 'street': '987 Spruce Street', 'city': 'Hilltown', 'age': 40, 'email': 'noah.davis@example.com', 'price_issue': False, 'quality_issue': False, 'service_issue': False, 'delivery_issue': True, 'description_issue': False, 'product_code': 'G', 'quarter': '2021-Q1'}, 
{'name': 'Sophia Hernandez', 'street': '111 Willow Way', 'city': 'Seaside', 'age': 29, 'email': 'sophia.hernandez@example.com', 'price_issue': False, 'quality_issue': False, 'service_issue': False, 'delivery_issue': False, 'description_issue': True, 'product_code': 'H', 'quarter': '2020-Q4'}, {'name': 'James Lee', 'street': '222 Palm Avenue', 'city': 'Rivertown', 'age': 36, 'email': 'james.lee@example.com', 'price_issue': True, 'quality_issue': False, 'service_issue': False, 'delivery_issue': False, 'description_issue': False, 'product_code': 'A', 'quarter': '2020-Q2'}, {'name': 'David Greene', 'street': '321 Birch Lane', 'city': 'Hilltop', 'age': 52, 'email': 'david.greene@example.com', 'price_issue': False, 'quality_issue': False, 'service_issue': False, 'delivery_issue': True, 'description_issue': False, 'product_code': 'D', 'quarter': '2023-Q2'}, {'name': 'Sarah Green', 'street': '321 Birch Lane', 'city': 'Hilltop', 'age': 49, 'email': 'sarah.green@example.com', 'price_issue': False, 'quality_issue': False, 'service_issue': True, 'delivery_issue': False, 'description_issue': False, 'product_code': 'D', 'quarter': '2023-Q2'}, {'name': 'Emily Johnson', 'street': '456 Oak Street', 'city': 'Riverside', 'age': 34, 'email': 'emily.johnson@example.com', 'price_issue': True, 'quality_issue': False, 'service_issue': False, 'delivery_issue': False, 'description_issue': False, 'product_code': 'A', 'quarter': '2023-Q1'}, {'name': 'Michael Smith', 'street': '789 Pine Avenue', 'city': 'Lakeside', 'age': 45, 'email': 'michael.smith@example.com', 'price_issue': False, 'quality_issue': True, 'service_issue': False, 'delivery_issue': False, 'description_issue': False, 'product_code': 'B', 'quarter': '2023-Q3'}, {'name': 'Jessica Brown', 'street': '101 Maple Drive', 'city': 'Hilltop', 'age': 29, 'email': 'jessica.brown@example.com', 'price_issue': False, 'quality_issue': False, 'service_issue': False, 'delivery_issue': True, 'description_issue': True, 'product_code': 'E', 
'quarter': '2023-Q2'}, {'name': 'Robert Wilson', 'street': '202 Cedar Road', 'city': 'Hilltop', 'age': 38, 'email': 'robert.wilson@example.com', 'price_issue': True, 'quality_issue': False, 'service_issue': False, 'delivery_issue': False, 'description_issue': False, 'product_code': 'C', 'quarter': '2023-Q4'}, {'name': 'Linda Martinez', 'street': '303 Spruce Lane', 'city': 'Riverside', 'age': 56, 'email': 'linda.martinez@example.com', 'price_issue': False, 'quality_issue': True, 'service_issue': False, 'delivery_issue': False, 'description_issue': False, 'product_code': 'F', 'quarter': '2023-Q1'}, {'name': 'James Anderson', 'street': '404 Elm Street', 'city': 'Lakeside', 'age': 61, 'email': 'james.anderson@example.com', 'price_issue': False, 'quality_issue': False, 'service_issue': True, 'delivery_issue': False, 'description_issue': False, 'product_code': 'G', 'quarter': '2023-Q3'}, {'name': 'Patricia Thomas', 'street': '505 Willow Way', 'city': 'Riverside', 'age': 47, 'email': 'patricia.thomas@example.com', 'price_issue': False, 'quality_issue': False, 'service_issue': False, 'delivery_issue': True, 'description_issue': False, 'product_code': 'H', 'quarter': '2023-Q4'}, {'name': 'Christopher Garcia', 'street': '606 Cherry Court', 'city': 'Lakeside', 'age': 53, 'email': 'christopher.garcia@example.com', 'price_issue': True, 'quality_issue': True, 'service_issue': False, 'delivery_issue': False, 'description_issue': False, 'product_code': 'A', 'quarter': '2023-Q2'}, {'name': 'Eva W.', 'street': '654 Cedar Dr.', 'city': 'Meadowville', 'age': 41, 'email': 'eva.w@example.com', 'price_issue': False, 'quality_issue': False, 'service_issue': False, 'delivery_issue': False, 'description_issue': True, 'product_code': 'E', 'quarter': '2021-Q1'}, {'name': 'John White', 'street': '654 Cedar Drive', 'city': 'Meadowville', 'age': 43, 'email': 'john.white@example.com', 'price_issue': False, 'quality_issue': True, 'service_issue': False, 'delivery_issue': False, 
'description_issue': False, 'product_code': 'E', 'quarter': '2021-Q1'}, {'name': 'Alice Johnson', 'street': '123 Maple Street', 'city': 'Lakeview', 'age': 35, 'email': 'alice.j@example.com', 'price_issue': True, 'quality_issue': False, 'service_issue': False, 'delivery_issue': False, 'description_issue': False, 'product_code': 'A', 'quarter': '2021-Q2'}, {'name': 'Mark Smith', 'street': '789 Pine Avenue', 'city': 'Riverdale', 'age': 28, 'email': 'mark.smith@example.com', 'price_issue': False, 'quality_issue': True, 'service_issue': True, 'delivery_issue': False, 'description_issue': False, 'product_code': 'B', 'quarter': '2021-Q3'}, {'name': 'Nancy Drew', 'street': '321 Oak Lane', 'city': 'Hilltown', 'age': 50, 'email': 'nancy.drew@example.com', 'price_issue': False, 'quality_issue': False, 'service_issue': False, 'delivery_issue': True, 'description_issue': False, 'product_code': 'C', 'quarter': '2021-Q4'}, {'name': 'Tom Hardy', 'street': '987 Birch Road', 'city': 'Woodland', 'age': 37, 'email': 'tom.h@example.com', 'price_issue': True, 'quality_issue': False, 'service_issue': False, 'delivery_issue': False, 'description_issue': True, 'product_code': 'D', 'quarter': '2022-Q1'}, {'name': 'Lucy Hale', 'street': '456 Elm Street', 'city': 'Greenville', 'age': 29, 'email': 'lucy.h@example.com', 'price_issue': False, 'quality_issue': True, 'service_issue': False, 'delivery_issue': True, 'description_issue': False, 'product_code': 'F', 'quarter': '2022-Q2'}, {'name': 'George King', 'street': '654 Cedar Drive', 'city': 'Meadowville', 'age': 41, 'email': 'george.king@example.com', 'price_issue': True, 'quality_issue': False, 'service_issue': False, 'delivery_issue': False, 'description_issue': False, 'product_code': 'G', 'quarter': '2022-Q3'}, {'name': 'Sophia Brown', 'street': '222 Willow Way', 'city': 'Springfield', 'age': 45, 'email': 'sophia.b@example.com', 'price_issue': False, 'quality_issue': False, 'service_issue': True, 'delivery_issue': False, 
'description_issue': False, 'product_code': 'H', 'quarter': '2022-Q4'}, {'name': 'James Black', 'street': '111 Cedar Drive', 'city': 'Meadowville', 'age': 39, 'email': 'james.black@example.com', 'price_issue': False, 'quality_issue': False, 'service_issue': False, 'delivery_issue': True, 'description_issue': True, 'product_code': 'E', 'quarter': '2023-Q1'}, {'name': 'Frank B.', 'street': '987 Spruce Ct', 'city': 'Brookside', 'age': 37, 'email': 'frank.b@example.com', 'price_issue': True, 'quality_issue': True, 'service_issue': False, 'delivery_issue': False, 'description_issue': False, 'product_code': 'F', 'quarter': '2022-Q3'}, {'name': 'John Black', 'street': '123 Maple Avenue', 'city': 'Brookside', 'age': 35, 'email': 'john.black@example.com', 'price_issue': False, 'quality_issue': True, 'service_issue': True, 'delivery_issue': False, 'description_issue': False, 'product_code': 'F', 'quarter': '2022-Q3'}, {'name': 'Alice Green', 'street': '456 Oak Street', 'city': 'Springfield', 'age': 29, 'email': 'alice.green@example.com', 'price_issue': False, 'quality_issue': False, 'service_issue': True, 'delivery_issue': True, 'description_issue': False, 'product_code': 'A', 'quarter': '2022-Q1'}, {'name': 'Bob Smith', 'street': '789 Pine Lane', 'city': 'Riverside', 'age': 42, 'email': 'bob.smith@example.com', 'price_issue': True, 'quality_issue': False, 'service_issue': False, 'delivery_issue': True, 'description_issue': True, 'product_code': 'B', 'quarter': '2023-Q2'}, {'name': 'Cathy Jones', 'street': '321 Cedar Road', 'city': 'Lakeside', 'age': 34, 'email': 'cathy.jones@example.com', 'price_issue': False, 'quality_issue': False, 'service_issue': True, 'delivery_issue': False, 'description_issue': True, 'product_code': 'C', 'quarter': '2021-Q4'}, {'name': 'David Brown', 'street': '654 Birch Boulevard', 'city': 'Greenville', 'age': 50, 'email': 'david.brown@example.com', 'price_issue': True, 'quality_issue': True, 'service_issue': True, 'delivery_issue': False, 
'description_issue': False, 'product_code': 'D', 'quarter': '2023-Q1'}, {'name': 'Eva White', 'street': '987 Spruce Court', 'city': 'Brookside', 'age': 28, 'email': 'eva.white@example.com', 'price_issue': False, 'quality_issue': True, 'service_issue': False, 'delivery_issue': True, 'description_issue': True, 'product_code': 'E', 'quarter': '2022-Q4'}, {'name': 'George King', 'street': '159 Elm Street', 'city': 'Hilltop', 'age': 39, 'email': 'george.king@example.com', 'price_issue': True, 'quality_issue': False, 'service_issue': False, 'delivery_issue': False, 'description_issue': True, 'product_code': 'G', 'quarter': '2023-Q3'}, {'name': 'Hannah Lee', 'street': '753 Willow Way', 'city': 'Seaside', 'age': 45, 'email': 'hannah.lee@example.com', 'price_issue': False, 'quality_issue': True, 'service_issue': True, 'delivery_issue': True, 'description_issue': False, 'product_code': 'H', 'quarter': '2023-Q2'}, {'name': 'Ian Clark', 'street': '852 Chestnut Circle', 'city': 'Mountainview', 'age': 31, 'email': 'ian.clark@example.com', 'price_issue': False, 'quality_issue': False, 'service_issue': False, 'delivery_issue': False, 'description_issue': True, 'product_code': 'A', 'quarter': '2020-Q3'}, {'name': 'Grace Li', 'street': '159 Elm St', 'city': 'Riverbend', 'age': 28, 'email': 'grace.li@example.com', 'price_issue': False, 'quality_issue': False, 'service_issue': True, 'delivery_issue': True, 'description_issue': False, 'product_code': 'G', 'quarter': '2023-Q1'}, {'name': 'Ethan Lee', 'street': '160 Elm Street', 'city': 'Riverbend', 'age': 32, 'email': 'ethan.lee@example.com', 'price_issue': False, 'quality_issue': True, 'service_issue': False, 'delivery_issue': True, 'description_issue': False, 'product_code': 'G', 'quarter': '2023-Q2'}, {'name': 'Sophia Johnson', 'street': '45 Pine Avenue', 'city': 'Mapleton', 'age': 45, 'email': 'sophia.j@example.com', 'price_issue': True, 'quality_issue': False, 'service_issue': False, 'delivery_issue': False, 'description_issue': 
True, 'product_code': 'A', 'quarter': '2023-Q3'}, {'name': 'Michael Brown', 'street': '78 Oak Lane', 'city': 'Lakeside', 'age': 37, 'email': 'michael.b@example.com', 'price_issue': False, 'quality_issue': True, 'service_issue': True, 'delivery_issue': False, 'description_issue': False, 'product_code': 'B', 'quarter': '2023-Q1'}, {'name': 'Emily Davis', 'street': '23 Cedar Street', 'city': 'Hillsborough', 'age': 29, 'email': 'emily.d@example.com', 'price_issue': True, 'quality_issue': True, 'service_issue': False, 'delivery_issue': False, 'description_issue': False, 'product_code': 'C', 'quarter': '2023-Q2'}, {'name': 'James Wilson', 'street': '90 Birch Road', 'city': 'Greenfield', 'age': 52, 'email': 'james.w@example.com', 'price_issue': False, 'quality_issue': False, 'service_issue': False, 'delivery_issue': True, 'description_issue': True, 'product_code': 'D', 'quarter': '2023-Q4'}, {'name': 'Olivia Martinez', 'street': '12 Spruce Court', 'city': 'Sunnyvale', 'age': 34, 'email': 'olivia.m@example.com', 'price_issue': True, 'quality_issue': False, 'service_issue': True, 'delivery_issue': False, 'description_issue': False, 'product_code': 'E', 'quarter': '2023-Q1'}, {'name': 'Liam Garcia', 'street': '56 Maple Street', 'city': 'Brookfield', 'age': 41, 'email': 'liam.g@example.com', 'price_issue': False, 'quality_issue': True, 'service_issue': False, 'delivery_issue': True, 'description_issue': False, 'product_code': 'F', 'quarter': '2023-Q2'}, {'name': 'Ava Rodriguez', 'street': '34 Willow Lane', 'city': 'Riverside', 'age': 26, 'email': 'ava.r@example.com', 'price_issue': False, 'quality_issue': False, 'service_issue': True, 'delivery_issue': False, 'description_issue': True, 'product_code': 'H', 'quarter': '2023-Q3'}, {'name': 'Noah Smith', 'street': '89 Chestnut Boulevard', 'city': 'Meadowville', 'age': 39, 'email': 'noah.s@example.com', 'price_issue': True, 'quality_issue': False, 'service_issue': False, 'delivery_issue': False, 'description_issue': True, 
'product_code': 'A', 'quarter': '2023-Q4'}, {'name': 'Henry Adamson', 'street': '753 Willow Lane', 'city': 'Sunnydale', 'age': 51, 'email': 'h.adamson@example.com', 'price_issue': False, 'quality_issue': False, 'service_issue': False, 'delivery_issue': False, 'description_issue': True, 'product_code': 'H', 'quarter': '2020-Q2'}, {'name': 'Helen Adams', 'street': '755 Willow Way', 'city': 'Sunnydale', 'age': 48, 'email': 'helen.adams@example.com', 'price_issue': False, 'quality_issue': True, 'service_issue': False, 'delivery_issue': False, 'description_issue': False, 'product_code': 'H', 'quarter': '2020-Q3'}, {'name': 'John Smith', 'street': '123 Elm Street', 'city': 'Rivertown', 'age': 34, 'email': 'john.smith@example.com', 'price_issue': True, 'quality_issue': False, 'service_issue': False, 'delivery_issue': False, 'description_issue': False, 'product_code': 'A', 'quarter': '2021-Q1'}, {'name': 'Alice Johnson', 'street': '456 Oak Avenue', 'city': 'Lakeview', 'age': 29, 'email': 'alice.j@example.com', 'price_issue': False, 'quality_issue': True, 'service_issue': False, 'delivery_issue': False, 'description_issue': False, 'product_code': 'B', 'quarter': '2021-Q2'}, {'name': 'Bob Brown', 'street': '789 Pine Road', 'city': 'Hillside', 'age': 42, 'email': 'bob.brown@example.com', 'price_issue': False, 'quality_issue': False, 'service_issue': True, 'delivery_issue': False, 'description_issue': False, 'product_code': 'C', 'quarter': '2021-Q3'}, {'name': 'Clara White', 'street': '321 Birch Blvd', 'city': 'Seaview', 'age': 37, 'email': 'clara.white@example.com', 'price_issue': False, 'quality_issue': False, 'service_issue': False, 'delivery_issue': True, 'description_issue': False, 'product_code': 'D', 'quarter': '2021-Q4'}, {'name': 'David Green', 'street': '654 Cedar Street', 'city': 'Mountainville', 'age': 45, 'email': 'david.green@example.com', 'price_issue': True, 'quality_issue': True, 'service_issue': False, 'delivery_issue': False, 'description_issue': False, 
'product_code': 'E', 'quarter': '2022-Q1'}, {'name': 'Eva Black', 'street': '987 Maple Lane', 'city': 'Riverside', 'age': 53, 'email': 'eva.black@example.com', 'price_issue': False, 'quality_issue': False, 'service_issue': True, 'delivery_issue': True, 'description_issue': False, 'product_code': 'F', 'quarter': '2022-Q2'}, {'name': 'Frank Gray', 'street': '159 Spruce Street', 'city': 'Brookfield', 'age': 39, 'email': 'frank.gray@example.com', 'price_issue': True, 'quality_issue': False, 'service_issue': True, 'delivery_issue': False, 'description_issue': False, 'product_code': 'G', 'quarter': '2022-Q3'}, {'name': 'Grace Blue', 'street': '753 Willow Way', 'city': 'Sunnydale', 'age': 50, 'email': 'grace.blue@example.com', 'price_issue': False, 'quality_issue': False, 'service_issue': False, 'delivery_issue': False, 'description_issue': True, 'product_code': 'H', 'quarter': '2022-Q4'}, {'name': 'Isabella Clarke', 'street': '246 Ash Blvd', 'city': 'Greenwood', 'age': 33, 'email': 'isabella.clarke@example.com', 'price_issue': True, 'quality_issue': False, 'service_issue': True, 'delivery_issue': False, 'description_issue': False, 'product_code': 'A', 'quarter': '2023-Q3'}, {'name': 'Michael Clark', 'street': '246 Ash Boulevard', 'city': 'Greenwood', 'age': 35, 'email': 'michael.clark@example.com', 'price_issue': False, 'quality_issue': True, 'service_issue': False, 'delivery_issue': True, 'description_issue': False, 'product_code': 'A', 'quarter': '2023-Q3'}, {'name': 'Emily Johnson', 'street': '789 Oak Street', 'city': 'Springfield', 'age': 29, 'email': 'emily.johnson@example.com', 'price_issue': False, 'quality_issue': True, 'service_issue': False, 'delivery_issue': False, 'description_issue': True, 'product_code': 'B', 'quarter': '2023-Q1'}, {'name': 'David Smith', 'street': '123 Maple Avenue', 'city': 'Riverton', 'age': 41, 'email': 'david.smith@example.com', 'price_issue': True, 'quality_issue': False, 'service_issue': False, 'delivery_issue': True, 
'description_issue': False, 'product_code': 'C', 'quarter': '2023-Q2'}, {'name': 'Sophia Brown', 'street': '456 Pine Road', 'city': 'Lakeside', 'age': 37, 'email': 'sophia.brown@example.com', 'price_issue': False, 'quality_issue': False, 'service_issue': True, 'delivery_issue': True, 'description_issue': False, 'product_code': 'D', 'quarter': '2023-Q1'}, {'name': 'James Wilson', 'street': '321 Birch Lane', 'city': 'Hilltown', 'age': 28, 'email': 'james.wilson@example.com', 'price_issue': True, 'quality_issue': True, 'service_issue': False, 'delivery_issue': False, 'description_issue': False, 'product_code': 'E', 'quarter': '2023-Q3'}, {'name': 'Olivia Martinez', 'street': '789 Elm Street', 'city': 'Riverbend', 'age': 45, 'email': 'olivia.martinez@example.com', 'price_issue': False, 'quality_issue': False, 'service_issue': False, 'delivery_issue': True, 'description_issue': True, 'product_code': 'F', 'quarter': '2023-Q2'}, {'name': 'Liam Taylor', 'street': '654 Cedar Avenue', 'city': 'Mountainview', 'age': 32, 'email': 'liam.taylor@example.com', 'price_issue': True, 'quality_issue': False, 'service_issue': True, 'delivery_issue': False, 'description_issue': False, 'product_code': 'G', 'quarter': '2023-Q2'}, {'name': 'Ava Thomas', 'street': '987 Willow Way', 'city': 'Brookside', 'age': 39, 'email': 'ava.thomas@example.com', 'price_issue': False, 'quality_issue': True, 'service_issue': True, 'delivery_issue': False, 'description_issue': False, 'product_code': 'H', 'quarter': '2023-Q4'}, {'name': 'Ethan White', 'street': '111 Poplar Street', 'city': 'Meadowland', 'age': 50, 'email': 'ethan.white@example.com', 'price_issue': True, 'quality_issue': False, 'service_issue': False, 'delivery_issue': True, 'description_issue': True, 'product_code': 'A', 'quarter': '2023-Q1'}, {'name': 'Jackie Milner', 'street': '135 Poplar St.', 'city': 'Forestville', 'age': 44, 'email': 'jackie.milner@example.com', 'price_issue': False, 'quality_issue': True, 'service_issue': False, 
'delivery_issue': True, 'description_issue': False, 'product_code': 'B', 'quarter': '2022-Q4'}, {'name': 'John Miller', 'street': '137 Poplar Street', 'city': 'Forestville', 'age': 46, 'email': 'john.miller@example.com', 'price_issue': False, 'quality_issue': True, 'service_issue': True, 'delivery_issue': False, 'description_issue': False, 'product_code': 'B', 'quarter': '2022-Q4'}, {'name': 'Emily Johnson', 'street': '742 Evergreen Terrace', 'city': 'Springfield', 'age': 32, 'email': 'emily.j@example.com', 'price_issue': True, 'quality_issue': False, 'service_issue': False, 'delivery_issue': False, 'description_issue': False, 'product_code': 'A', 'quarter': '2023-Q1'}, {'name': 'Michael Smith', 'street': '100 Main Street', 'city': 'Rivertown', 'age': 29, 'email': 'michael.smith@example.com', 'price_issue': False, 'quality_issue': False, 'service_issue': True, 'delivery_issue': True, 'description_issue': False, 'product_code': 'C', 'quarter': '2023-Q2'}, {'name': 'Olivia Brown', 'street': '456 Elm Street', 'city': 'Lakeside', 'age': 36, 'email': 'olivia.b@example.com', 'price_issue': True, 'quality_issue': False, 'service_issue': False, 'delivery_issue': False, 'description_issue': True, 'product_code': 'D', 'quarter': '2023-Q3'}, {'name': 'Liam Davis', 'street': '789 Pine Avenue', 'city': 'Hilltown', 'age': 41, 'email': 'liam.d@example.com', 'price_issue': False, 'quality_issue': True, 'service_issue': False, 'delivery_issue': False, 'description_issue': False, 'product_code': 'E', 'quarter': '2023-Q1'}, {'name': 'Sophia Wilson', 'street': '321 Oak Lane', 'city': 'Greenfield', 'age': 27, 'email': 'sophia.w@example.com', 'price_issue': False, 'quality_issue': False, 'service_issue': True, 'delivery_issue': True, 'description_issue': False, 'product_code': 'F', 'quarter': '2023-Q2'}, {'name': 'James Martinez', 'street': '654 Maple Road', 'city': 'Riverside', 'age': 50, 'email': 'james.m@example.com', 'price_issue': True, 'quality_issue': False, 'service_issue': 
False, 'delivery_issue': False, 'description_issue': False, 'product_code': 'G', 'quarter': '2023-Q3'}, {'name': 'Isabella Garcia', 'street': '987 Birch Boulevard', 'city': 'Sunnydale', 'age': 39, 'email': 'isabella.g@example.com', 'price_issue': False, 'quality_issue': True, 'service_issue': False, 'delivery_issue': False, 'description_issue': True, 'product_code': 'H', 'quarter': '2023-Q1'}, {'name': 'Ethan Martinez', 'street': '123 Cedar Street', 'city': 'Riverbend', 'age': 33, 'email': 'ethan.m@example.com', 'price_issue': False, 'quality_issue': False, 'service_issue': True, 'delivery_issue': False, 'description_issue': True, 'product_code': 'A', 'quarter': '2023-Q4'}]}\n" + ] + } + ], + "source": [ + "# Inspect the data as JSON\n", + "print(gmd.json_object)" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{'customer_complaints': name street city age \\\n", + "0 Alicia Johnson 123 Maple St. Springfield 34 \n", + "1 Robert Johnson 124 Maple Street Springfield 36 \n", + "2 Emily Clark 456 Oak Avenue Riverside 28 \n", + "3 Michael Brown 789 Pine Road Greenfield 42 \n", + "4 Jessica Smith 321 Cedar Lane Lakeside 30 \n", + ".. ... ... ... ... \n", + "95 Liam Davis 789 Pine Avenue Hilltown 41 \n", + "96 Sophia Wilson 321 Oak Lane Greenfield 27 \n", + "97 James Martinez 654 Maple Road Riverside 50 \n", + "98 Isabella Garcia 987 Birch Boulevard Sunnydale 39 \n", + "99 Ethan Martinez 123 Cedar Street Riverbend 33 \n", + "\n", + " email price_issue quality_issue service_issue \\\n", + "0 alicia.johnson@example.com True False False \n", + "1 robert.johnson@example.com False False True \n", + "2 emily.clark@example.com False True False \n", + "3 michael.brown@example.com False False False \n", + "4 jessica.smith@example.com True True False \n", + ".. ... ... ... ... 
\n", + "95 liam.d@example.com False True False \n", + "96 sophia.w@example.com False False True \n", + "97 james.m@example.com True False False \n", + "98 isabella.g@example.com False True False \n", + "99 ethan.m@example.com False False True \n", + "\n", + " delivery_issue description_issue product_code quarter \n", + "0 False False A 2021-Q3 \n", + "1 False False A 2021-Q3 \n", + "2 False False B 2021-Q4 \n", + "3 True False C 2022-Q1 \n", + "4 False False D 2022-Q2 \n", + ".. ... ... ... ... \n", + "95 False False E 2023-Q1 \n", + "96 True False F 2023-Q2 \n", + "97 False False G 2023-Q3 \n", + "98 False True H 2023-Q1 \n", + "99 False True A 2023-Q4 \n", + "\n", + "[100 rows x 12 columns]}\n" + ] + } + ], + "source": [ + "# Inspect the data as dataframes (one per array field)\n", + "print(gmd.array_dfs)" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|██████████| 10/10 [00:28<00:00, 2.89s/it]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Generated text data\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\n" + ] + } + ], + "source": [ + "# Use the customer_complaints dataframe to generate mock text data (first 10 records only)\n", + "df = gmd.array_dfs[\"customer_complaints\"][:10]\n", + "await gmd.generate_text_data(df)\n", + "print(\"Generated text data\")" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " mock_text\n", + "0 **Customer Feedback Report**\\n\\n**Customer Inf...\n", + "1 ---\\n\\n**Customer Service Report**\\n\\n**Custom...\n", + "2 **Customer Feedback Report**\\n\\n**Customer Inf...\n", + "3 **Customer Service Report**\\n\\n**Customer Info...\n", + "4 **Customer Feedback Report**\\n\\n**Customer Inf...\n", + "5 ---\\n\\n**Customer Feedback 
Report**\\n\\n**Custo...\n", + "6 **Customer Feedback Report**\\n\\n**Customer Inf...\n", + "7 **Customer Feedback Report**\\n\\n**Customer Inf...\n", + "8 **Customer Service Report**\\n\\n**Customer Info...\n", + "9 **Customer Complaint Report**\\n\\n**Customer De...\n" + ] + } + ], + "source": [ + "# Inspect texts as dataframe\n", + "print(gmd.text_df)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "intelligence-toolkit-lXFNld9n-py3.11", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.9" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/example_outputs/extract_record_data/company_grievances/company_grievances_schema.json b/example_outputs/extract_record_data/company_grievances/company_grievances_schema.json index 5ee66f7a..7c3379f7 100644 --- a/example_outputs/extract_record_data/company_grievances/company_grievances_schema.json +++ b/example_outputs/extract_record_data/company_grievances/company_grievances_schema.json @@ -1,6 +1,5 @@ { "$schema": "http://json-schema.org/draft/2020-12/schema", - "$id": "https://yourdomain.com/example.schema.json", "title": "Company schema", "description": "A schema for mock company data generation", "type": "object", diff --git a/example_outputs/extract_record_data/customer_complaints/customer_complaints_schema.json b/example_outputs/extract_record_data/customer_complaints/customer_complaints_schema.json index ab0ac360..4fa33a48 100644 --- a/example_outputs/extract_record_data/customer_complaints/customer_complaints_schema.json +++ b/example_outputs/extract_record_data/customer_complaints/customer_complaints_schema.json @@ -1,6 +1,5 @@ { "$schema": "http://json-schema.org/draft/2020-12/schema", - "$id": "https://yourdomain.com/example.schema.json", 
"title": "Customer complaints", "description": "An example schema storing an array of customer complaints", "type": "object", diff --git a/example_outputs/generate_mock_data/company_grievances/company_grievances_schema.json b/example_outputs/generate_mock_data/company_grievances/company_grievances_schema.json index 5ee66f7a..7c3379f7 100644 --- a/example_outputs/generate_mock_data/company_grievances/company_grievances_schema.json +++ b/example_outputs/generate_mock_data/company_grievances/company_grievances_schema.json @@ -1,6 +1,5 @@ { "$schema": "http://json-schema.org/draft/2020-12/schema", - "$id": "https://yourdomain.com/example.schema.json", "title": "Company schema", "description": "A schema for mock company data generation", "type": "object", diff --git a/example_outputs/generate_mock_data/customer_complaints/customer_complaints_schema.json b/example_outputs/generate_mock_data/customer_complaints/customer_complaints_schema.json index ab0ac360..4fa33a48 100644 --- a/example_outputs/generate_mock_data/customer_complaints/customer_complaints_schema.json +++ b/example_outputs/generate_mock_data/customer_complaints/customer_complaints_schema.json @@ -1,6 +1,5 @@ { "$schema": "http://json-schema.org/draft/2020-12/schema", - "$id": "https://yourdomain.com/example.schema.json", "title": "Customer complaints", "description": "An example schema storing an array of customer complaints", "type": "object", diff --git a/example_outputs/generate_mock_data/news_articles/news_articles_schema.json b/example_outputs/generate_mock_data/news_articles/news_articles_schema.json index abba6991..371f7c38 100644 --- a/example_outputs/generate_mock_data/news_articles/news_articles_schema.json +++ b/example_outputs/generate_mock_data/news_articles/news_articles_schema.json @@ -1,6 +1,5 @@ { "$schema": "http://json-schema.org/draft/2020-12/schema", - "$id": "https://yourdomain.com/example.schema.json", "title": "News Articles", "description": "A schema to generate mock news article 
metadata", "type": "object", diff --git a/toolkit/extract_record_data/__init__.py b/toolkit/extract_record_data/__init__.py new file mode 100644 index 00000000..fe37edb5 --- /dev/null +++ b/toolkit/extract_record_data/__init__.py @@ -0,0 +1,16 @@ +# Copyright (c) 2024 Microsoft Corporation. All rights reserved. +# Licensed under the MIT license. See LICENSE file in the project. + +import os +from .api import ExtractRecordData +from toolkit.generate_mock_data.schema_builder import create_boilerplate_schema + +def get_readme(): + file_path = os.path.join(os.path.dirname(__file__), "README.md") + with open(file_path) as file: + return file.read() + +__all__ = [ + "ExtractRecordData", + "create_boilerplate_schema" +] \ No newline at end of file diff --git a/toolkit/extract_record_data/api.py b/toolkit/extract_record_data/api.py new file mode 100644 index 00000000..7429d72a --- /dev/null +++ b/toolkit/extract_record_data/api.py @@ -0,0 +1,51 @@ +import toolkit.extract_record_data.data_extractor as data_extractor +import toolkit.generate_mock_data.data_generator as data_generator +from toolkit.AI.openai_configuration import OpenAIConfiguration +import pandas as pd + +class ExtractRecordData: + def __init__(self): + self.json_schema = {} + self.record_arrays = [] + self.json_object = {} + self.array_dfs = {} + + def set_schema( + self, + json_schema: dict + ): + self.json_schema = json_schema + self.record_arrays: list[list[str]] = data_generator.extract_array_fields(json_schema) + + def set_ai_configuration( + self, + ai_configuration: OpenAIConfiguration + ): + self.ai_configuration = ai_configuration + + async def extract_record_data( + self, + input_texts: list[str], + generation_guidance: str="", + df_update_callback=None, + callback_batch=None + ): + """ + Extracts structured data records from input texts according to the JSON schema + + Args: + input_texts (list[str]): The list of input texts to extract data from + generation_guidance (str): Optional guidance to 
provide to the model + df_update_callback (function): A callback function to update the dataframe + callback_batch (function): A callback function to update the batch + """ + self.json_object, self.array_dfs = await data_extractor.extract_record_data( + ai_configuration=self.ai_configuration, + input_texts=input_texts, + data_schema=self.json_schema, + record_arrays=self.record_arrays, + generation_guidance=generation_guidance, + df_update_callback=df_update_callback, + callback_batch=callback_batch + ) + \ No newline at end of file diff --git a/toolkit/extract_record_data/data_extractor.py b/toolkit/extract_record_data/data_extractor.py index eb7bca3b..dafeab89 100644 --- a/toolkit/extract_record_data/data_extractor.py +++ b/toolkit/extract_record_data/data_extractor.py @@ -29,17 +29,16 @@ async def extract_record_data( ) for new_object in new_objects: - print(new_object) new_object_json = loads(new_object) generated_objects.append(new_object_json) current_object_json, conflicts = merge_json_objects(current_object_json, new_object_json) dfs = {} for record_array in record_arrays: df = extract_df(current_object_json, record_array) - dfs[record_array] = df + dfs[".".join(record_array)] = df if df_update_callback is not None: df_update_callback(dfs) - return current_object_json, generated_objects, dfs + return current_object_json, dfs async def _extract_data_parallel( @@ -73,11 +72,10 @@ async def _extract_data_parallel( ) def extract_df(json_data, record_path): - print(record_path) # Extracts a DataFrame from a JSON object return pd.json_normalize( data=json_data, - record_path=record_path.split('.') + record_path=record_path ) def merge_json_objects(json_obj1, json_obj2): diff --git a/toolkit/generate_mock_data/__init__.py b/toolkit/generate_mock_data/__init__.py index 1616ddc4..2a52a976 100644 --- a/toolkit/generate_mock_data/__init__.py +++ b/toolkit/generate_mock_data/__init__.py @@ -1,10 +1,16 @@ # Copyright (c) 2024 Microsoft Corporation. All rights reserved. 
# Licensed under the MIT license. See LICENSE file in the project. -# -import os +import os +from .api import GenerateMockData +from .schema_builder import create_boilerplate_schema def get_readme(): file_path = os.path.join(os.path.dirname(__file__), "README.md") with open(file_path) as file: - return file.read() \ No newline at end of file + return file.read() + +__all__ = [ + "GenerateMockData", + "create_boilerplate_schema" +] \ No newline at end of file diff --git a/toolkit/generate_mock_data/api.py b/toolkit/generate_mock_data/api.py new file mode 100644 index 00000000..d0a5ce3c --- /dev/null +++ b/toolkit/generate_mock_data/api.py @@ -0,0 +1,91 @@ +import toolkit.generate_mock_data.data_generator as data_generator +import toolkit.generate_mock_data.text_generator as text_generator +from toolkit.AI.openai_configuration import OpenAIConfiguration +import pandas as pd + +class GenerateMockData: + def __init__(self): + self.json_schema = {} + self.record_arrays = [] + self.json_object = {} + self.array_dfs = {} + + def set_schema( + self, + json_schema: dict + ): + self.json_schema = json_schema + self.record_arrays: list[list[str]] = data_generator.extract_array_fields(json_schema) + + def set_ai_configuration( + self, + ai_configuration: OpenAIConfiguration + ): + self.ai_configuration = ai_configuration + + async def generate_data_records( + self, + num_records_overall: int, + records_per_batch: int, + duplicate_records_per_batch: int, + related_records_per_batch: int, + generation_guidance: str="", + temperature: float=0.5, + df_update_callback=None, + callback_batch=None, + parallel_batches: int=0 + ): + """ + Generates structured data records according to the JSON schema + + Args: + num_records_overall (int): The total number of records to generate + records_per_batch (int): The number of records to generate per batch + duplicate_records_per_batch (int): The number of duplicate records to generate per batch + related_records_per_batch (int): The number of 
related records to generate per batch + generation_guidance (str): Optional guidance to provide to the model + temperature (float): The temperature to use when generating data + df_update_callback (function): A callback function to update the dataframe + callback_batch (function): A callback function to update the batch + parallel_batches (int): The number of parallel batches to generate + """ + self.json_object, self.array_dfs = await data_generator.generate_data( + ai_configuration=self.ai_configuration, + generation_guidance=generation_guidance, + data_schema=self.json_schema, + num_records_overall=num_records_overall, + records_per_batch=records_per_batch, + duplicate_records_per_batch=duplicate_records_per_batch, + related_records_per_batch=related_records_per_batch, + temperature=temperature, + df_update_callback=df_update_callback, + callback_batch=callback_batch, + parallel_batches=parallel_batches + ) + + async def generate_text_data( + self, + df: pd.DataFrame, + generation_guidance:str="", + temperature:float =0.5, + df_update_callback=None + ): + """ + Generates text data based on the input dataframe + + Args: + df (pandas.DataFrame): The input dataframe + generation_guidance (str): Optional guidance to provide to the model + temperature (float): The temperature to use when generating data + df_update_callback (function): A callback function to update the dataframe + """ + input_texts = [] + for _, row in df.iterrows(): + input_texts.append(row.to_json()) + self.text_list, self.text_df = await text_generator.generate_text_data( + ai_configuration=self.ai_configuration, + input_texts=input_texts, + generation_guidance=generation_guidance, + temperature=temperature, + df_update_callback=df_update_callback + ) \ No newline at end of file diff --git a/toolkit/generate_mock_data/data_generator.py b/toolkit/generate_mock_data/data_generator.py index 59455220..09d13990 100644 --- a/toolkit/generate_mock_data/data_generator.py +++ 
b/toolkit/generate_mock_data/data_generator.py @@ -14,19 +14,21 @@ async def generate_data( ai_configuration, generation_guidance, - primary_record_array, - record_arrays, data_schema, num_records_overall, records_per_batch, - parallel_batches, duplicate_records_per_batch, related_records_per_batch, temperature, df_update_callback, callback_batch, + parallel_batches=0 ): + if parallel_batches == 0: + parallel_batches = num_records_overall // records_per_batch num_iterations = num_records_overall // (records_per_batch * parallel_batches) + record_arrays = extract_array_fields(data_schema) + primary_record_array = record_arrays[0] generated_objects = [] first_object = generate_unseeded_data( ai_configuration=ai_configuration, @@ -41,7 +43,9 @@ async def generate_data( dfs = {} for i in range(num_iterations): if i == 0: - sample_records = sample_from_record_array(first_object_json, primary_record_array, records_per_batch) + sample_records = sample_from_record_array( + first_object_json, primary_record_array, records_per_batch + ) else: sample_records = sample_from_record_array( current_object_json, primary_record_array, parallel_batches @@ -67,10 +71,10 @@ async def generate_data( for record_array in record_arrays: df = extract_df(current_object_json, record_array) - dfs[record_array] = df + dfs[".".join(record_array)] = df if df_update_callback is not None: df_update_callback(dfs) - return current_object_json, generated_objects, dfs + return current_object_json, dfs def generate_unseeded_data( ai_configuration, @@ -155,11 +159,10 @@ def select_random_records(num_records, category_to_count): return category_to_ids def extract_df(json_data, record_path): - print(record_path) # Extracts a DataFrame from a JSON object return pd.json_normalize( data=json_data, - record_path=record_path.split('.') + record_path=record_path ) def merge_json_objects(json_obj1, json_obj2): @@ -193,7 +196,9 @@ def merge_values(key, value1, value2): return merged_object, conflicts -def 
extract_array_fields(schema): +def extract_array_fields( + schema: dict +) -> list[list[str]]: # Extracts any array fields at any level of nesting, and returns a list of lists of field names navigating down the schema array_fields = [] @@ -214,5 +219,5 @@ def extract_array_fields_recursive(schema, field_path): return array_fields def sample_from_record_array(current_object, record_array, k): - records = schema_builder.get_subobject(current_object, record_array.split(".")) + records = schema_builder.get_subobject(current_object, record_array) return random.sample(records, k) if len(records) > k else records \ No newline at end of file diff --git a/toolkit/generate_mock_data/schema_builder.py b/toolkit/generate_mock_data/schema_builder.py index 6b1d1fc3..599d7275 100644 --- a/toolkit/generate_mock_data/schema_builder.py +++ b/toolkit/generate_mock_data/schema_builder.py @@ -95,18 +95,28 @@ def get_required_list(json_obj, field_labels): def create_boilerplate_schema( schema_field="http://json-schema.org/draft/2020-12/schema", - id_field="https://yourdomain.com/example.schema.json", title_field="Example Schema", description_field="An example schema ready to be edited and populated with fields.", ): schema = { "$schema": schema_field, - "$id": id_field, "title": title_field, "description": description_field, "type": "object", - "properties": {}, - "required": [], + "properties": { + "records": { + "type": "array", + "description": "An array of records", + "items": { + "type": "object", + "description": "A record", + "properties": {}, + "required": [], + "additionalProperties": False + } + } + }, + "required": ["records"], "additionalProperties": False } return schema diff --git a/toolkit/generate_mock_data/text_generator.py b/toolkit/generate_mock_data/text_generator.py index 9fd4af4f..9490e9bb 100644 --- a/toolkit/generate_mock_data/text_generator.py +++ b/toolkit/generate_mock_data/text_generator.py @@ -1,62 +1,53 @@ # Copyright (c) 2024 Microsoft Corporation. 
All rights reserved. import pandas as pd - +import asyncio +from tqdm.asyncio import tqdm_asyncio import toolkit.AI.utils as utils import toolkit.generate_mock_data.prompts as prompts -from toolkit.helpers.progress_batch_callback import ProgressBatchCallback +from toolkit.AI.openai_configuration import OpenAIConfiguration async def generate_text_data( - ai_configuration, - generation_guidance, - input_texts, - temperature, - parallel_threads, - df_update_callback, - callback_batch, + ai_configuration: OpenAIConfiguration, + input_texts: list[str], + generation_guidance: str="", + temperature: float=0.5, + df_update_callback=None, ): - generated_texts = [] - - batched_texts = [input_texts[i:i + parallel_threads] for i in range(0, len(input_texts), parallel_threads)] - df = pd.DataFrame(columns=["mock_text"]) - - for batch in batched_texts: - new_texts = await _generate_text_parallel( + tasks = [] + for text in input_texts: + tasks.append(asyncio.create_task(_generate_text_async( ai_configuration=ai_configuration, - input_texts=batch, + input_text=text, generation_guidance=generation_guidance, - temperature=temperature, - callbacks=[callback_batch] if callback_batch is not None else None, - ) - generated_texts.extend(new_texts) - - df = pd.DataFrame(generated_texts, columns=["mock_text"]) - if df_update_callback is not None: - df_update_callback(df) + temperature=temperature + ))) + generated_texts = await tqdm_asyncio.gather(*tasks) + df = pd.DataFrame(generated_texts, columns=["mock_text"]) + if df_update_callback is not None: + df_update_callback(df) return generated_texts, df -async def _generate_text_parallel( +async def _generate_text_async( ai_configuration, - input_texts, + input_text, generation_guidance, - temperature, - callbacks: list[ProgressBatchCallback] | None = None, + temperature ): - mapped_messages = [utils.prepare_messages( + messages = utils.prepare_messages( prompts.text_generation_prompt, { 'input_text': input_text, 'generation_guidance': 
generation_guidance, - }) for input_text in input_texts - ] + } + ) - return await utils.map_generate_text( + return await utils.generate_text_async( ai_configuration, - mapped_messages, - temperature=temperature, - callbacks=callbacks, + messages, + temperature=temperature )