Commit
Merge branch 'api-refactor' of https://github.com/microsoft/intelligence-toolkit into api-refactor
Darren Edge committed Oct 17, 2024
2 parents 8fcf436 + ccc049b commit 814cb6a
Showing 15 changed files with 83 additions and 37 deletions.
2 changes: 1 addition & 1 deletion DEVELOPING.md
@@ -21,7 +21,7 @@

#### Default values:
```
OPENAI_API_MODEL="gpt-4o-2024-08-06"
OPENAI_API_MODEL="gpt-4o"
OPENAI_TYPE="OpenAI"
AZURE_AUTH_TYPE="Azure Key" # if OPENAI_TYPE==Azure OpenAI
DEFAULT_EMBEDDING_MODEL = "text-embedding-ada-002"
```
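The only change here swaps the pinned snapshot `gpt-4o-2024-08-06` for the rolling `gpt-4o` alias, which tracks OpenAI's latest 4o snapshot. A minimal sketch of resolving these documented defaults from the environment (the variable names come from the block above; the lookup helper is an assumption, not the toolkit's code):

```
import os

# Hypothetical resolution of the documented defaults; names match DEVELOPING.md.
model = os.environ.get("OPENAI_API_MODEL", "gpt-4o")
api_type = os.environ.get("OPENAI_TYPE", "OpenAI")
embedding_model = os.environ.get("DEFAULT_EMBEDDING_MODEL", "text-embedding-ada-002")
```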
4 changes: 3 additions & 1 deletion app/util/example_outputs_ui.py
@@ -21,7 +21,9 @@ def create_example_outputs_ui(container, workflow):
example_json = loads(open(f"{workflow_home}/example_format.json", "r").read())
order = example_json["example_order"]
metadata = example_json["example_metadata"]
selected_data = st.selectbox("Select example", mock_data_folders)
selected_data = st.selectbox(
"Select example", mock_data_folders, disabled=len(mock_data_folders) <= 1
)
if selected_data != None:
headings = [metadata[x]["heading"] for x in order]
tabs = st.tabs(headings)
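The reworked `st.selectbox` call adds a `disabled` flag so the example picker is greyed out when there is at most one dataset to choose from. A standalone sketch of the same pattern (folder contents are hypothetical):

```
import streamlit as st

mock_data_folders = ["example_a"]  # hypothetical single-example case

selected_data = st.selectbox(
    "Select example",
    mock_data_folders,
    disabled=len(mock_data_folders) <= 1,  # nothing to switch between
)
```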
8 changes: 7 additions & 1 deletion app/util/ui_components.py
@@ -16,7 +16,6 @@
from app.util.download_pdf import add_download_pdf
from app.util.enums import Mode
from app.util.openai_wrapper import UIOpenAIConfiguration
from app.util.openai_wrapper import UIOpenAIConfiguration
from toolkit.AI.classes import LLMCallback
from toolkit.AI.client import OpenAIClient
from toolkit.AI.defaults import DEFAULT_MAX_INPUT_TOKENS
@@ -746,3 +745,10 @@ def build_validation_ui(
file_name=f"{file_name}_{get_current_time()}_messages.json",
mime="text/json",
)

def check_ai_configuration():
ai_configuration = UIOpenAIConfiguration().get_configuration()
if ai_configuration.api_key == "":
st.warning("Please set your OpenAI API key in the Settings page.")
if ai_configuration.model == "":
st.warning("Please set your OpenAI model in the Settings page.")
8 changes: 3 additions & 5 deletions app/workflows/anonymize_case_data/README.md
@@ -93,7 +93,7 @@ Under `Quantize datetime attributes`, select the `quarter` attribute and `Half`

#### Quantize numeric attributes

Under `Quantize numeric attributes`, select `age`, set `Target bins` to `5`, and leave `Trim percent` at `0.00`. After pressing `Quantize selected columns`, the `Prepared data` view of the loaded data will show `age` now encoded into five age ranges represented as (exclusive minimum value-inclusive maximum value]:
Under `Quantize numeric attributes`, select `age`, leave `Target bins` at `5`, and leave `Trim percent` at `0.00`. After pressing `Quantize selected columns`, the `Prepared data` view of the loaded data will show `age` now encoded into five age ranges represented as (exclusive minimum value-inclusive maximum value]:

- `(0-20]`
- `(20-40]`
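This (exclusive min, inclusive max] convention matches what pandas produces with right-closed bins; a quick sketch for intuition (bin edges assumed, not the toolkit's implementation):

```
import pandas as pd

ages = pd.Series([15, 25, 37, 58, 79, 93])
# Five equal-width bins over 0-100, right-closed: (0, 20], (20, 40], ...
age_ranges = pd.cut(ages, bins=[0, 20, 40, 60, 80, 100])
print(age_ranges.value_counts().sort_index())
```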
@@ -122,8 +122,6 @@ Rename `age` to `age_range` and `quarter` to `period`.

#### Evaluating synthesizability

Pressing `Generate final dataset` applies all the specified transformations to the input dataset, with the result available for viewing and download under the `Final` tab of the data table panel.

The `Synthesizability summary` gives an initial indication of how easy it will be to generate high-accuracy synthetic data given the number of attribute combinations in the final sensitive dataset. The smaller each of these numbers, the better:

- `Number of selected columns`: The number of columns after all data transformations
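For intuition, the number of possible attribute combinations is bounded by the product of per-column distinct counts, which is why fewer columns and fewer values per column synthesize better. A hypothetical back-of-envelope:

```
import math

# Hypothetical cardinalities after the transformations above.
distinct_values = {"age_range": 5, "period": 8, "product_code": 10}
max_combinations = math.prod(distinct_values.values())
print(max_combinations)  # 400
```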
@@ -175,7 +173,7 @@ Move to the `Query and visualize data` tab to start exploring the anonymous data

Any record counts shown will use the corresponding protected counts from the aggregate data if these counts exist, since they will always be the most accurate; otherwise, the synthetic data will be dynamically filtered to derive the desired count.

Before any filters are applied, the interface will use the `record_count` value from thr aggregate data as the estimate of sensitive records overall.
Before any filters are applied, the interface will use the `record_count` value from the aggregate data as the estimate of sensitive records overall.
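A hedged sketch of that fallback logic, with hypothetical names and structures (the actual query interface is not shown in this diff):

```
import pandas as pd

def estimate_count(query: dict, protected_counts: dict, synthetic_df: pd.DataFrame) -> int:
    """Prefer exact protected counts; otherwise filter synthetic rows."""
    key = tuple(sorted(query.items()))
    if key in protected_counts:  # aggregate count exists: always most accurate
        return protected_counts[key]
    mask = pd.Series(True, index=synthetic_df.index)
    for column, value in query.items():  # dynamically filter synthetic data
        mask &= synthetic_df[column] == value
    return int(mask.sum())
```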

Try adding one or more attribute values to the query to observe the estimated count change. Notice that:

@@ -192,7 +190,7 @@ This chart groups values by attribute and shows these groups in descending count

The chart can be configured in multiple ways:

- Set `Subject label` to `Customer` to indicate what kind of individul is being counted
- Set `Subject label` to `Customer` to indicate what kind of individual is being counted
- Set `Types of top attributes to show` to filter to a particular attribute, e.g., `product_code`
- Set `Number of top attribute values to show` to `5` to show only the top five attribute values

6 changes: 3 additions & 3 deletions app/workflows/anonymize_case_data/workflow.py
@@ -9,12 +9,12 @@
import plotly.io as pio
import streamlit as st

import app.util.example_outputs_ui as example_outputs_ui
import app.util.ui_components as ui_components
import app.workflows.anonymize_case_data.config as config
import toolkit.anonymize_case_data.visuals as visuals
import toolkit.anonymize_case_data.queries as queries
import app.workflows.anonymize_case_data.variables as ds_variables
import app.util.example_outputs_ui as example_outputs_ui
import toolkit.anonymize_case_data.queries as queries
import toolkit.anonymize_case_data.visuals as visuals
from toolkit.anonymize_case_data import AnonymizeCaseData, color_schemes


20 changes: 13 additions & 7 deletions app/workflows/compare_case_groups/workflow.py
@@ -6,16 +6,20 @@
import polars as pl
import streamlit as st

import app.workflows.compare_case_groups.variables as gn_variables
import app.util.example_outputs_ui as example_outputs_ui
import app.workflows.compare_case_groups.variables as gn_variables
from app.util import ui_components
from toolkit.compare_case_groups import prompts
from toolkit.compare_case_groups.build_dataframes import (build_attribute_df,
build_grouped_df,
build_ranked_df,
filter_df)
from toolkit.compare_case_groups.temporal_process import (build_temporal_data,
create_window_df)
from toolkit.compare_case_groups.build_dataframes import (
build_attribute_df,
build_grouped_df,
build_ranked_df,
filter_df,
)
from toolkit.compare_case_groups.temporal_process import (
build_temporal_data,
create_window_df,
)
from toolkit.helpers.df_functions import fix_null_ints


@@ -26,6 +30,8 @@ def get_intro() -> str:


def create(sv: gn_variables.SessionVariables, workflow=None):
ui_components.check_ai_configuration()

intro_tab, prepare_tab, summarize_tab, generate_tab, examples_tab = st.tabs(
[
"Compare case groups workflow:",
4 changes: 3 additions & 1 deletion app/workflows/detect_case_patterns/workflow.py
@@ -17,8 +17,8 @@
GridUpdateMode,
)

import app.workflows.detect_case_patterns.variables as ap_variables
import app.util.example_outputs_ui as example_outputs_ui
import app.workflows.detect_case_patterns.variables as ap_variables
from app.util import ui_components
from toolkit.AI.classes import LLMCallback
from toolkit.detect_case_patterns import prompts
@@ -50,6 +50,8 @@ def get_intro():


def create(sv: ap_variables.SessionVariables, workflow):
ui_components.check_ai_configuration()

intro_tab, uploader_tab, detect_tab, explain_tab, examples_tab = st.tabs(
[
"Detect Case Patterns workflow:",
1 change: 1 addition & 0 deletions app/workflows/detect_entity_networks/workflow.py
@@ -51,6 +51,7 @@ def get_intro():

async def create(sv: rn_variables.SessionVariables, workflow=None):
sv_home = SessionVariables("home")
ui_components.check_ai_configuration()

intro_tab, uploader_tab, process_tab, view_tab, report_tab, examples_tab = st.tabs(
[
9 changes: 5 additions & 4 deletions app/workflows/extract_record_data/workflow.py
@@ -4,13 +4,12 @@
import pandas as pd
import streamlit as st

import app.util.example_outputs_ui as example_outputs_ui
import app.util.schema_ui as schema_ui
import app.util.ui_components as ui_components
import app.workflows.extract_record_data.variables as variables
import toolkit.extract_record_data.prompts as prompts
import toolkit.extract_record_data.data_extractor as data_extractor
import app.util.schema_ui as schema_ui
from app.util.openai_wrapper import UIOpenAIConfiguration
import app.util.ui_components as ui_components
import app.util.example_outputs_ui as example_outputs_ui

ai_configuration = UIOpenAIConfiguration().get_configuration()

@@ -21,6 +20,8 @@ def get_intro():


async def create(sv: variables.SessionVariables, workflow: None):
ui_components.check_ai_configuration()

intro_tab, schema_tab, generator_tab, mock_tab = st.tabs(['Extract Record Data workflow:', 'Prepare data schema', 'Extract structured records', 'View example outputs'])
with intro_tab:
st.markdown(get_intro())
8 changes: 5 additions & 3 deletions app/workflows/generate_mock_data/workflow.py
@@ -4,13 +4,13 @@
import pandas as pd
import streamlit as st

import app.util.example_outputs_ui as example_outputs_ui
import app.util.schema_ui as schema_ui
import app.util.ui_components as ui_components
import app.workflows.generate_mock_data.variables as bds_variables
import toolkit.generate_mock_data.data_generator as data_generator
import toolkit.generate_mock_data.text_generator as text_generator
import app.util.schema_ui as schema_ui
from app.util.openai_wrapper import UIOpenAIConfiguration
import app.util.ui_components as ui_components
import app.util.example_outputs_ui as example_outputs_ui

ai_configuration = UIOpenAIConfiguration().get_configuration()

@@ -21,6 +21,8 @@ def get_intro():


async def create(sv: bds_variables.SessionVariables, workflow: None):
ui_components.check_ai_configuration()

intro_tab, schema_tab, record_generator_tab, text_generator_tab, mock_tab = st.tabs(['Generate Mock Data workflow:', 'Prepare data schema', 'Generate mock records', 'Generate mock texts', 'View example outputs'])
with intro_tab:
st.markdown(get_intro())
1 change: 1 addition & 0 deletions app/workflows/match_entity_records/workflow.py
@@ -41,6 +41,7 @@ def get_intro():

async def create(sv: rm_variables.SessionVariable, workflow=None) -> None:
sv_home = home_vars.SessionVariables("home")
ui_components.check_ai_configuration()

intro_tab, uploader_tab, process_tab, evaluate_tab, examples_tab = st.tabs(
[
6 changes: 5 additions & 1 deletion app/workflows/query_text_data/functions.py
@@ -1,12 +1,15 @@
# Copyright (c) 2024 Microsoft Corporation. All rights reserved.
# Licensed under the MIT license. See LICENSE file in the project.
#
from collections import defaultdict

import streamlit as st
from seaborn import color_palette
from collections import defaultdict
from streamlit_agraph import Config, Edge, Node, agraph

from toolkit.helpers.progress_batch_callback import ProgressBatchCallback


def create_progress_callback(template: str):
pb = st.progress(0, "Preparing...")

@@ -82,6 +85,7 @@ def get_concept_graph(
hierarchical=False,
key=key,
linkLength=100,
timestep=0.1,
)
with placeholder:
return_value = agraph(nodes=nodes, edges=edges, config=config)
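The added `timestep=0.1` lowers the physics simulation step, which should settle the force-directed layout more steadily. A minimal standalone sketch using the same `Config` keywords as the diff (node and edge contents are placeholders):

```
from streamlit_agraph import Config, Edge, Node, agraph

nodes = [Node(id="a", label="concept A"), Node(id="b", label="concept B")]
edges = [Edge(source="a", target="b")]
config = Config(
    hierarchical=False,
    linkLength=100,
    timestep=0.1,  # smaller physics step -> steadier layout
)
return_value = agraph(nodes=nodes, edges=edges, config=config)
```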
17 changes: 11 additions & 6 deletions app/workflows/query_text_data/workflow.py
@@ -7,15 +7,10 @@
import streamlit as st
import string


import app.util.embedder as embedder
import app.util.example_outputs_ui as example_outputs_ui
import app.workflows.query_text_data.functions as functions
import app.util.embedder as embedder
from toolkit.query_text_data.classes import ProcessedChunks, ChunkSearchConfig, AnswerConfig, AnswerObject
from toolkit.query_text_data.input_processor import PeriodOption
from toolkit.query_text_data.api import QueryTextData, QueryTextDataStage
import toolkit.query_text_data.prompts as prompts

from app.util import ui_components
from app.util.download_pdf import add_download_pdf
from app.util.openai_wrapper import UIOpenAIConfiguration
@@ -25,6 +20,14 @@
create_concept_to_community_hierarchy,
generate_graph_fusion_encoder_embedding,
)
from toolkit.query_text_data.api import QueryTextData, QueryTextDataStage
from toolkit.query_text_data.classes import (
AnswerConfig,
AnswerObject,
ChunkSearchConfig,
ProcessedChunks,
)
from toolkit.query_text_data.input_processor import PeriodOption
from toolkit.query_text_data.pattern_detector import (
combine_chunk_text_and_explantion,
detect_converging_pairs,
@@ -41,6 +44,8 @@ def get_intro():

async def create(sv: SessionVariables, workflow=None):
sv_home = SessionVariables("home")
ui_components.check_ai_configuration()

qtd = sv.workflow_object.value
qtd.set_ai_config(ai_configuration, sv_home.save_cache.value)
intro_tab, uploader_tab, graph_tab, search_tab, report_tab, examples_tab = st.tabs(
2 changes: 1 addition & 1 deletion toolkit/AI/defaults.py
@@ -4,7 +4,7 @@

DEFAULT_ENCODING = "cl100k_base"
#
DEFAULT_LLM_MODEL = "gpt-4o-2024-08-06"
DEFAULT_LLM_MODEL = "gpt-4o"
DEFAULT_LLM_MAX_TOKENS = 4000
DEFAULT_AZ_AUTH_TYPE = "Azure Key"
EMBEDDING_BATCHES_NUMBER = 600
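This mirrors the DEVELOPING.md change above: `gpt-4o` is the rolling alias, while dated names such as `gpt-4o-2024-08-06` pin a specific snapshot. A sketch of pinning a snapshot per deployment without touching the default (the environment hook is an assumption, not the toolkit's code):

```
import os

from toolkit.AI.defaults import DEFAULT_LLM_MODEL

# Hypothetical override: pin a dated snapshot in one environment if needed.
model = os.environ.get("OPENAI_API_MODEL", DEFAULT_LLM_MODEL)
```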
24 changes: 21 additions & 3 deletions toolkit/query_text_data/helper_functions.py
@@ -5,6 +5,7 @@
from toolkit.AI.classes import VectorData
from toolkit.AI.utils import hash_text


def get_adjacent_chunks(source, previous_chunk_dict, next_chunk_dict, steps):
prev_chunks = []
current_chunk = source
@@ -99,9 +100,26 @@ async def embed_queries(

embedded_data = await text_embedder.embed_store_many(data, callbacks, cache_data)
for item in embedded_data:
# find item in data
data_item = next((x for x in data if x["hash"] == item["hash"]), None)

if data_item is None:
print(f"No matching data item for {item}")
continue

details = json.loads(item["additional_details"])
if len(details.keys()) == 0:
print(f"No details for {item}")
additional_details = data_item["additional_details"]

if isinstance(additional_details, str):
additional_details = json.loads(additional_details)

qid = additional_details.get("qid")
if qid is None:
print(f"No qid found in additional details for {item}")
continue
qid_to_vector[details["qid"]] = item["vector"]

if details.get("qid") != qid:
details = {"qid": qid}

qid_to_vector[qid] = item["vector"]
return qid_to_vector
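After the change, the loop resolves each embedded item back to its source record by `hash` and reads the query id from that record's `additional_details`, deserializing when the details are stored as a JSON string. Since the rendered diff interleaves removed and added lines, the following is a hedged reconstruction of the updated flow, not a verbatim copy:

```
import json

def map_qids_to_vectors(data: list[dict], embedded_data: list[dict]) -> dict:
    """Sketch of the post-change mapping; names mirror the diff, structure is inferred."""
    qid_to_vector = {}
    for item in embedded_data:
        # Find the source record that produced this embedding.
        data_item = next((x for x in data if x["hash"] == item["hash"]), None)
        if data_item is None:
            print(f"No matching data item for {item}")
            continue

        additional_details = data_item["additional_details"]
        if isinstance(additional_details, str):  # may arrive serialized
            additional_details = json.loads(additional_details)

        qid = additional_details.get("qid")
        if qid is None:
            print(f"No qid found in additional details for {item}")
            continue

        qid_to_vector[qid] = item["vector"]
    return qid_to_vector
```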
