Commit
Merge branch 'api-refactor' of https://github.com/microsoft/intelligence-toolkit into api-refactor
Darren Edge committed Oct 17, 2024
2 parents 8fcf436 + ccc049b commit 814cb6a
Showing 15 changed files with 83 additions and 37 deletions.
2 changes: 1 addition & 1 deletion DEVELOPING.md
@@ -21,7 +21,7 @@

#### Default values:
```
OPENAI_API_MODEL="gpt-4o-2024-08-06"
OPENAI_API_MODEL="gpt-4o"
OPENAI_TYPE="OpenAI"
AZURE_AUTH_TYPE="Azure Key" # if OPENAI_TYPE==Azure OpenAI
DEFAULT_EMBEDDING_MODEL = "text-embedding-ada-002"
```
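The only change here swaps the pinned snapshot `gpt-4o-2024-08-06` for the rolling `gpt-4o` alias, which tracks OpenAI's latest 4o snapshot. A minimal sketch of resolving these documented defaults from the environment (the variable names come from the block above; the lookup helper is an assumption, not the toolkit's code):

```
import os

# Hypothetical resolution of the documented defaults; names match DEVELOPING.md.
model = os.environ.get("OPENAI_API_MODEL", "gpt-4o")
api_type = os.environ.get("OPENAI_TYPE", "OpenAI")
embedding_model = os.environ.get("DEFAULT_EMBEDDING_MODEL", "text-embedding-ada-002")
```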
4 changes: 3 additions & 1 deletion app/util/example_outputs_ui.py
@@ -21,7 +21,9 @@ def create_example_outputs_ui(container, workflow):
example_json = loads(open(f"{workflow_home}/example_format.json", "r").read())
order = example_json["example_order"]
metadata = example_json["example_metadata"]
selected_data = st.selectbox("Select example", mock_data_folders)
selected_data = st.selectbox(
"Select example", mock_data_folders, disabled=len(mock_data_folders) <= 1
)
if selected_data != None:
headings = [metadata[x]["heading"] for x in order]
tabs = st.tabs(headings)
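The reworked `st.selectbox` call adds a `disabled` flag so the example picker is greyed out when there is at most one dataset to choose from. A standalone sketch of the same pattern (folder contents are hypothetical):

```
import streamlit as st

mock_data_folders = ["example_a"]  # hypothetical single-example case

selected_data = st.selectbox(
    "Select example",
    mock_data_folders,
    disabled=len(mock_data_folders) <= 1,  # nothing to switch between
)
```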
8 changes: 7 additions & 1 deletion app/util/ui_components.py
@@ -16,7 +16,6 @@
from app.util.download_pdf import add_download_pdf
from app.util.enums import Mode
from app.util.openai_wrapper import UIOpenAIConfiguration
from app.util.openai_wrapper import UIOpenAIConfiguration
from toolkit.AI.classes import LLMCallback
from toolkit.AI.client import OpenAIClient
from toolkit.AI.defaults import DEFAULT_MAX_INPUT_TOKENS
@@ -746,3 +745,10 @@ def build_validation_ui(
file_name=f"{file_name}_{get_current_time()}_messages.json",
mime="text/json",
)

def check_ai_configuration():
ai_configuration = UIOpenAIConfiguration().get_configuration()
if ai_configuration.api_key == "":
st.warning("Please set your OpenAI API key in the Settings page.")
if ai_configuration.model == "":
st.warning("Please set your OpenAI model in the Settings page.")
8 changes: 3 additions & 5 deletions app/workflows/anonymize_case_data/README.md
@@ -93,7 +93,7 @@ Under `Quantize datetime attributes`, select the `quarter` attribute and `Half`

#### Quantize numeric attributes

Under `Quantize numeric attributes`, select `age`, set `Target bins` to `5`, and leave `Trim percent` at `0.00`. After pressing `Quantize selected columns`, the `Prepared data` view of the loaded data will show `age` now encoded into five age ranges represented as (exclusive minimum value-inclusive maximum value]:
Under `Quantize numeric attributes`, select `age`, leave `Target bins` at `5`, and leave `Trim percent` at `0.00`. After pressing `Quantize selected columns`, the `Prepared data` view of the loaded data will show `age` now encoded into five age ranges represented as (exclusive minimum value-inclusive maximum value]:

- `(0-20]`
- `(20-40]`
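This (exclusive min, inclusive max] convention matches what pandas produces with right-closed bins; a quick sketch for intuition (bin edges assumed, not the toolkit's implementation):

```
import pandas as pd

ages = pd.Series([15, 25, 37, 58, 79, 93])
# Five equal-width bins over 0-100, right-closed: (0, 20], (20, 40], ...
age_ranges = pd.cut(ages, bins=[0, 20, 40, 60, 80, 100])
print(age_ranges.value_counts().sort_index())
```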
@@ -122,8 +122,6 @@ Rename `age` to `age_range` and `quarter` to `period`.

#### Evaluating synthesizability

Pressing `Generate final dataset` applies all the specified transformations to the input dataset, with the result available for viewing and download under the `Final` tab of the data table panel.

The `Synthesizability summary` gives an initial indication of how easy it will be to generate high-accuracy synthetic data given the number of attribute combinations in the final sensitive dataset. The smaller each of these numbers, the better:

- `Number of selected columns`: The number of columns after all data transformations
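For intuition, the number of possible attribute combinations is bounded by the product of per-column distinct counts, which is why fewer columns and fewer values per column synthesize better. A hypothetical back-of-envelope:

```
import math

# Hypothetical cardinalities after the transformations above.
distinct_values = {"age_range": 5, "period": 8, "product_code": 10}
max_combinations = math.prod(distinct_values.values())
print(max_combinations)  # 400
```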
@@ -175,7 +173,7 @@ Move to the `Query and visualize data` tab to start exploring the anonymous data

Any record counts shown will use the corresponding protected counts from the aggregate data if these counts exist, since they will always be the most accurate; otherwise, the synthetic data will be dynamically filtered to derive the desired count.

Before any filters are applied, the interface will use the `record_count` value from thr aggregate data as the estimate of sensitive records overall.
Before any filters are applied, the interface will use the `record_count` value from the aggregate data as the estimate of sensitive records overall.
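A hedged sketch of that fallback logic, with hypothetical names and structures (the actual query interface is not shown in this diff):

```
import pandas as pd

def estimate_count(query: dict, protected_counts: dict, synthetic_df: pd.DataFrame) -> int:
    """Prefer exact protected counts; otherwise filter synthetic rows."""
    key = tuple(sorted(query.items()))
    if key in protected_counts:  # aggregate count exists: always most accurate
        return protected_counts[key]
    mask = pd.Series(True, index=synthetic_df.index)
    for column, value in query.items():  # dynamically filter synthetic data
        mask &= synthetic_df[column] == value
    return int(mask.sum())
```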

Try adding one or more attribute values to the query to observe the estimated count change. Notice that:

@@ -192,7 +190,7 @@ This chart groups values by attribute and shows these groups in descending count

The chart can be configured in multiple ways:

- Set `Subject label` to `Customer` to indicate what kind of individul is being counted
- Set `Subject label` to `Customer` to indicate what kind of individual is being counted
- Set `Types of top attributes to show` to filter to a particular attribute, e.g., `product_code`
- Set `Number of top attribute values to show` to `5` to show only the top five attribute values

6 changes: 3 additions & 3 deletions app/workflows/anonymize_case_data/workflow.py
@@ -9,12 +9,12 @@
import plotly.io as pio
import streamlit as st

import app.util.example_outputs_ui as example_outputs_ui
import app.util.ui_components as ui_components
import app.workflows.anonymize_case_data.config as config
import toolkit.anonymize_case_data.visuals as visuals
import toolkit.anonymize_case_data.queries as queries
import app.workflows.anonymize_case_data.variables as ds_variables
import app.util.example_outputs_ui as example_outputs_ui
import toolkit.anonymize_case_data.queries as queries
import toolkit.anonymize_case_data.visuals as visuals
from toolkit.anonymize_case_data import AnonymizeCaseData, color_schemes


20 changes: 13 additions & 7 deletions app/workflows/compare_case_groups/workflow.py
@@ -6,16 +6,20 @@
import polars as pl
import streamlit as st

import app.workflows.compare_case_groups.variables as gn_variables
import app.util.example_outputs_ui as example_outputs_ui
import app.workflows.compare_case_groups.variables as gn_variables
from app.util import ui_components
from toolkit.compare_case_groups import prompts
from toolkit.compare_case_groups.build_dataframes import (build_attribute_df,
build_grouped_df,
build_ranked_df,
filter_df)
from toolkit.compare_case_groups.temporal_process import (build_temporal_data,
create_window_df)
from toolkit.compare_case_groups.build_dataframes import (
build_attribute_df,
build_grouped_df,
build_ranked_df,
filter_df,
)
from toolkit.compare_case_groups.temporal_process import (
build_temporal_data,
create_window_df,
)
from toolkit.helpers.df_functions import fix_null_ints


@@ -26,6 +30,8 @@ def get_intro() -> str:


def create(sv: gn_variables.SessionVariables, workflow=None):
ui_components.check_ai_configuration()

intro_tab, prepare_tab, summarize_tab, generate_tab, examples_tab = st.tabs(
[
"Compare case groups workflow:",
4 changes: 3 additions & 1 deletion app/workflows/detect_case_patterns/workflow.py
@@ -17,8 +17,8 @@
GridUpdateMode,
)

import app.workflows.detect_case_patterns.variables as ap_variables
import app.util.example_outputs_ui as example_outputs_ui
import app.workflows.detect_case_patterns.variables as ap_variables
from app.util import ui_components
from toolkit.AI.classes import LLMCallback
from toolkit.detect_case_patterns import prompts
@@ -50,6 +50,8 @@ def get_intro():


def create(sv: ap_variables.SessionVariables, workflow):
ui_components.check_ai_configuration()

intro_tab, uploader_tab, detect_tab, explain_tab, examples_tab = st.tabs(
[
"Detect Case Patterns workflow:",
1 change: 1 addition & 0 deletions app/workflows/detect_entity_networks/workflow.py
@@ -51,6 +51,7 @@ def get_intro():

async def create(sv: rn_variables.SessionVariables, workflow=None):
sv_home = SessionVariables("home")
ui_components.check_ai_configuration()

intro_tab, uploader_tab, process_tab, view_tab, report_tab, examples_tab = st.tabs(
[
9 changes: 5 additions & 4 deletions app/workflows/extract_record_data/workflow.py
@@ -4,13 +4,12 @@
import pandas as pd
import streamlit as st

import app.util.example_outputs_ui as example_outputs_ui
import app.util.schema_ui as schema_ui
import app.util.ui_components as ui_components
import app.workflows.extract_record_data.variables as variables
import toolkit.extract_record_data.prompts as prompts
import toolkit.extract_record_data.data_extractor as data_extractor
import app.util.schema_ui as schema_ui
from app.util.openai_wrapper import UIOpenAIConfiguration
import app.util.ui_components as ui_components
import app.util.example_outputs_ui as example_outputs_ui

ai_configuration = UIOpenAIConfiguration().get_configuration()

@@ -21,6 +20,8 @@ def get_intro():


async def create(sv: variables.SessionVariables, workflow: None):
ui_components.check_ai_configuration()

intro_tab, schema_tab, generator_tab, mock_tab = st.tabs(['Extract Record Data workflow:', 'Prepare data schema', 'Extract structured records', 'View example outputs'])
with intro_tab:
st.markdown(get_intro())
8 changes: 5 additions & 3 deletions app/workflows/generate_mock_data/workflow.py
@@ -4,13 +4,13 @@
import pandas as pd
import streamlit as st

import app.util.example_outputs_ui as example_outputs_ui
import app.util.schema_ui as schema_ui
import app.util.ui_components as ui_components
import app.workflows.generate_mock_data.variables as bds_variables
import toolkit.generate_mock_data.data_generator as data_generator
import toolkit.generate_mock_data.text_generator as text_generator
import app.util.schema_ui as schema_ui
from app.util.openai_wrapper import UIOpenAIConfiguration
import app.util.ui_components as ui_components
import app.util.example_outputs_ui as example_outputs_ui

ai_configuration = UIOpenAIConfiguration().get_configuration()

@@ -21,6 +21,8 @@ def get_intro():


async def create(sv: bds_variables.SessionVariables, workflow: None):
ui_components.check_ai_configuration()

intro_tab, schema_tab, record_generator_tab, text_generator_tab, mock_tab = st.tabs(['Generate Mock Data workflow:', 'Prepare data schema', 'Generate mock records', 'Generate mock texts', 'View example outputs'])
with intro_tab:
st.markdown(get_intro())
1 change: 1 addition & 0 deletions app/workflows/match_entity_records/workflow.py
@@ -41,6 +41,7 @@ def get_intro():

async def create(sv: rm_variables.SessionVariable, workflow=None) -> None:
sv_home = home_vars.SessionVariables("home")
ui_components.check_ai_configuration()

intro_tab, uploader_tab, process_tab, evaluate_tab, examples_tab = st.tabs(
[
6 changes: 5 additions & 1 deletion app/workflows/query_text_data/functions.py
@@ -1,12 +1,15 @@
# Copyright (c) 2024 Microsoft Corporation. All rights reserved.
# Licensed under the MIT license. See LICENSE file in the project.
#
from collections import defaultdict

import streamlit as st
from seaborn import color_palette
from collections import defaultdict
from streamlit_agraph import Config, Edge, Node, agraph

from toolkit.helpers.progress_batch_callback import ProgressBatchCallback


def create_progress_callback(template: str):
pb = st.progress(0, "Preparing...")

@@ -82,6 +85,7 @@ def get_concept_graph(
hierarchical=False,
key=key,
linkLength=100,
timestep=0.1,
)
with placeholder:
return_value = agraph(nodes=nodes, edges=edges, config=config)
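The added `timestep=0.1` lowers the physics simulation step, which should settle the force-directed layout more steadily. A minimal standalone sketch using the same `Config` keywords as the diff (node and edge contents are placeholders):

```
from streamlit_agraph import Config, Edge, Node, agraph

nodes = [Node(id="a", label="concept A"), Node(id="b", label="concept B")]
edges = [Edge(source="a", target="b")]
config = Config(
    hierarchical=False,
    linkLength=100,
    timestep=0.1,  # smaller physics step -> steadier layout
)
return_value = agraph(nodes=nodes, edges=edges, config=config)
```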
17 changes: 11 additions & 6 deletions app/workflows/query_text_data/workflow.py
@@ -7,15 +7,10 @@
import streamlit as st
import string


import app.util.embedder as embedder
import app.util.example_outputs_ui as example_outputs_ui
import app.workflows.query_text_data.functions as functions
import app.util.embedder as embedder
from toolkit.query_text_data.classes import ProcessedChunks, ChunkSearchConfig, AnswerConfig, AnswerObject
from toolkit.query_text_data.input_processor import PeriodOption
from toolkit.query_text_data.api import QueryTextData, QueryTextDataStage
import toolkit.query_text_data.prompts as prompts

from app.util import ui_components
from app.util.download_pdf import add_download_pdf
from app.util.openai_wrapper import UIOpenAIConfiguration
@@ -25,6 +20,14 @@
create_concept_to_community_hierarchy,
generate_graph_fusion_encoder_embedding,
)
from toolkit.query_text_data.api import QueryTextData, QueryTextDataStage
from toolkit.query_text_data.classes import (
AnswerConfig,
AnswerObject,
ChunkSearchConfig,
ProcessedChunks,
)
from toolkit.query_text_data.input_processor import PeriodOption
from toolkit.query_text_data.pattern_detector import (
combine_chunk_text_and_explantion,
detect_converging_pairs,
@@ -41,6 +44,8 @@ def get_intro():

async def create(sv: SessionVariables, workflow=None):
sv_home = SessionVariables("home")
ui_components.check_ai_configuration()

qtd = sv.workflow_object.value
qtd.set_ai_config(ai_configuration, sv_home.save_cache.value)
intro_tab, uploader_tab, graph_tab, search_tab, report_tab, examples_tab = st.tabs(
2 changes: 1 addition & 1 deletion toolkit/AI/defaults.py
@@ -4,7 +4,7 @@

DEFAULT_ENCODING = "cl100k_base"
#
DEFAULT_LLM_MODEL = "gpt-4o-2024-08-06"
DEFAULT_LLM_MODEL = "gpt-4o"
DEFAULT_LLM_MAX_TOKENS = 4000
DEFAULT_AZ_AUTH_TYPE = "Azure Key"
EMBEDDING_BATCHES_NUMBER = 600
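This mirrors the DEVELOPING.md change above: `gpt-4o` is the rolling alias, while dated names such as `gpt-4o-2024-08-06` pin a specific snapshot. A sketch of pinning a snapshot per deployment without touching the default (the environment hook is an assumption, not the toolkit's code):

```
import os

from toolkit.AI.defaults import DEFAULT_LLM_MODEL

# Hypothetical override: pin a dated snapshot in one environment if needed.
model = os.environ.get("OPENAI_API_MODEL", DEFAULT_LLM_MODEL)
```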
24 changes: 21 additions & 3 deletions toolkit/query_text_data/helper_functions.py
@@ -5,6 +5,7 @@
from toolkit.AI.classes import VectorData
from toolkit.AI.utils import hash_text


def get_adjacent_chunks(source, previous_chunk_dict, next_chunk_dict, steps):
prev_chunks = []
current_chunk = source
@@ -99,9 +100,26 @@ async def embed_queries(

embedded_data = await text_embedder.embed_store_many(data, callbacks, cache_data)
for item in embedded_data:
# find item in data
data_item = next((x for x in data if x["hash"] == item["hash"]), None)

if data_item is None:
print(f"No matching data item for {item}")
continue

details = json.loads(item["additional_details"])
if len(details.keys()) == 0:
print(f"No details for {item}")
additional_details = data_item["additional_details"]

if isinstance(additional_details, str):
additional_details = json.loads(additional_details)

qid = additional_details.get("qid")
if qid is None:
print(f"No qid found in additional details for {item}")
continue
qid_to_vector[details["qid"]] = item["vector"]

if details.get("qid") != qid:
details = {"qid": qid}

qid_to_vector[qid] = item["vector"]
return qid_to_vector
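After the change, the loop resolves each embedded item back to its source record by `hash` and reads the query id from that record's `additional_details`, deserializing when the details are stored as a JSON string. Since the rendered diff interleaves removed and added lines, the following is a hedged reconstruction of the updated flow, not a verbatim copy:

```
import json

def map_qids_to_vectors(data: list[dict], embedded_data: list[dict]) -> dict:
    """Sketch of the post-change mapping; names mirror the diff, structure is inferred."""
    qid_to_vector = {}
    for item in embedded_data:
        # Find the source record that produced this embedding.
        data_item = next((x for x in data if x["hash"] == item["hash"]), None)
        if data_item is None:
            print(f"No matching data item for {item}")
            continue

        additional_details = data_item["additional_details"]
        if isinstance(additional_details, str):  # may arrive serialized
            additional_details = json.loads(additional_details)

        qid = additional_details.get("qid")
        if qid is None:
            print(f"No qid found in additional details for {item}")
            continue

        qid_to_vector[qid] = item["vector"]
    return qid_to_vector
```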
