separate risk networks logic (#26)
* created risk_networks folder

* separate logic for happy path for Entity-Attribute

* happy path even on UI ok

* risk networks entity attributes create data model notebook

* add node indexer

* add type

* add infer and index nodes

* add logic in rn and tests

* change python folder to toolkit

* change folder name on code to import from toolkit folder

* fix test

* add max cluster size

* noqa on test removed

* fix max network size

* add detect notebook

* add flags rendering, logic and tests

* naming refactoring

* fix loading and connection bar to openai

* code fixes and separation

* add fixed notebooks

* add index infer notebook

* fix tests and inferring

* fixes and separation

* fix local embed

* fixed on tests

* push_before_class

* constant fixes

* import fixes and notebooks

* change main notebook

* delete report

* fixed after PR and build

* fix developing md

* fix installer
dayesouza authored Aug 23, 2024
1 parent b60bb17 commit aef6e9f
Showing 102 changed files with 6,091 additions and 1,956 deletions.
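Most of the files below change only their imports, reflecting the rename of the `python` package to `toolkit` noted in the commit message. As a purely hypothetical sketch (not taken from this commit) of how such a bulk import rewrite could be scripted:

```python
# Hypothetical helper, not part of this commit: rewrite "python." imports to "toolkit.".
import re
from pathlib import Path


def rewrite_imports(root: str) -> None:
    for path in Path(root).rglob("*.py"):
        text = path.read_text(encoding="utf-8")
        # Only touch the leading package name; keep the rest of the import line intact.
        new_text = re.sub(r"\b(from|import)(\s+)python\.", r"\1\2toolkit.", text)
        if new_text != text:
            path.write_text(new_text, encoding="utf-8")


if __name__ == "__main__":
    rewrite_imports(".")
```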
2 changes: 2 additions & 0 deletions .gitignore
@@ -9,6 +9,7 @@ __pycache__/
# Virtual environment
venv/
env/
*.lcov

# Streamlit
.cache/
@@ -52,6 +53,7 @@ app/wkhtmltox/*.exe
*.coverage
**/messages
**/xpia*
*.lcov

**/studio_tests/
.venv
2 changes: 1 addition & 1 deletion .vsts-ci.yml
@@ -63,7 +63,7 @@ stages:
inputs:
workingDirectory: ./
targetType: "inline"
script: pytest
script: pytest -vv
- job: buildAndPush
displayName: BuildAndPushContainer
dependsOn: validate
2 changes: 1 addition & 1 deletion DEVELOPING.md
@@ -58,7 +58,7 @@ Open venv/Scripts/Activate.ps1, add the following lines after line 167:
$env:AZURE_OPENAI_ENDPOINT="https://<ENDPOINT>.openai.azure.com/"
```
## Running code-only
- [Attribute Patterns](./python/attribute_patterns/README.md)
- [Attribute Patterns](./toolkit/attribute_patterns/README.md)

- [Example](./examples/attribute_patterns.ipynb): See an example of how to run the code with your data to obtain results without the need to run the UI.

4 changes: 2 additions & 2 deletions app/pages/Settings.py
@@ -20,8 +20,8 @@
)
from util.secrets_handler import SecretsHandler

from python.AI.vector_store import VectorStore
from python.helpers.constants import CACHE_PATH
from toolkit.AI.vector_store import VectorStore
from toolkit.helpers.constants import CACHE_PATH


def on_change(handler, key=None, value=None):
2 changes: 1 addition & 1 deletion app/util/openai_wrapper.py
@@ -1,7 +1,7 @@
# Copyright (c) 2024 Microsoft Corporation. All rights reserved.
# Licensed under the MIT license. See LICENSE file in the project.
#
from python.AI.openai_configuration import OpenAIConfiguration
from toolkit.AI.openai_configuration import OpenAIConfiguration

from .secrets_handler import SecretsHandler

53 changes: 34 additions & 19 deletions app/util/ui_components.py
@@ -15,10 +15,10 @@
from app.util.enums import Mode
from app.util.openai_wrapper import UIOpenAIConfiguration

import python.AI.utils as utils
from python.AI.classes import LLMCallback
from python.AI.client import OpenAIClient
from python.AI.defaults import DEFAULT_MAX_INPUT_TOKENS
import toolkit.AI.utils as utils
from toolkit.AI.classes import LLMCallback
from toolkit.AI.client import OpenAIClient
from toolkit.AI.defaults import DEFAULT_MAX_INPUT_TOKENS


def return_token_count(text: str) -> int:
@@ -145,11 +145,13 @@ def generative_batch_ai_component(
batch_count = batch_count_raw + 1 if batch_count_remaining != 0 else batch_count_raw
batch_messages = []

full_prompt = " ".join([
system_prompt_var.value["report_prompt"],
instructions_text,
system_prompt_var.value["safety_prompt"],
])
full_prompt = " ".join(
[
system_prompt_var.value["report_prompt"],
instructions_text,
system_prompt_var.value["safety_prompt"],
]
)
for _i in range(batch_count):
batch = batch_val[batch_offset : min(batch_offset + batch_size, len(batch_val))]
batch_offset += batch_size
@@ -685,17 +687,21 @@ def prepare_input_df(
for _i, row in melted.iterrows():
if row["Attribute"] in expanded_atts:
if str(row["Value"]) not in ["", "<NA>"]:
new_rows.append([
row["Subject ID"],
row["Attribute"] + "_" + str(row["Value"]),
"1",
])
new_rows.append(
[
row["Subject ID"],
row["Attribute"] + "_" + str(row["Value"]),
"1",
]
)
else:
new_rows.append([
row["Subject ID"],
row["Attribute"],
str(row["Value"]),
])
new_rows.append(
[
row["Subject ID"],
row["Attribute"],
str(row["Value"]),
]
)
melted = pd.DataFrame(
new_rows, columns=["Subject ID", "Attribute", "Value"]
)
@@ -750,6 +756,15 @@ def on(text):
return on_callback


def remove_connection_bar(fn):
def on(_):
fn(_)

on_callback = LLMCallback()
on_callback.on_llm_new_token = on
return on_callback


def build_validation_ui(
report_validation, attribute_report_validation_messages, report_data, file_name
):
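The new `remove_connection_bar` helper above wraps an arbitrary callable in an `LLMCallback` whose `on_llm_new_token` hook simply forwards each event, which — together with the `fix loading and connection bar to openai` bullet — suggests it is used to dismiss a connection/progress indicator once streaming begins. A minimal usage sketch, with the Streamlit placeholder and wiring assumed rather than taken from this commit:

```python
# Illustrative only: clearing a "connecting to OpenAI..." placeholder when the first token arrives.
import streamlit as st

from app.util.ui_components import remove_connection_bar

status = st.empty()
status.info("Connecting to OpenAI...")

# remove_connection_bar accepts any callable; here it simply empties the placeholder.
callback = remove_connection_bar(lambda _token: status.empty())
# The callback would then be passed to the streaming generation call
# (the exact parameter name is an assumption).
```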
20 changes: 1 addition & 19 deletions app/workflows/attribute_patterns/README.md
@@ -1,21 +1,3 @@
# Attribute Patterns

The **Attribute Patterns** workflow generates intelligence reports on attribute patterns detected in streams of case records.

## How it works

1. [**Input**] Case records representing categorical attributes of data subjects observed at a point in time. Units are treated as anonymous and independent.
2. [**Process**] Categorical attributes are modelled as a dynamic graph, where nodes represent attribute values in a given time window and edges represent the co-occurrences of attribute values.
3. [**Process**] A technique called [Graph Fusion Encoder Embedding](https://arxiv.org/abs/2303.18051) is used to embed the dynamic attribute graph into a multi-dimensional space.
4. [**Process**] Within each time period, attribute patterns are detected as combinations of attributes all moving towards one another in the embedding space.
5. [**Output**] Attribute patterns CSV file. Can be created and used independently without any AI or embedding calls.
6. [**AI Calls**] For patterns of interest selected by the user, generative AI is used to create AI pattern reports.
7. [**Output**] AI pattern report MD/PDF file(s) describing the nature of the pattern, its progression over time, top co-occurring attribute values, possible explanations, and suggested actions.

## Input requirements

- The input data file should be in CSV format and represent individual data subjects.
- Individual data subjects may be represented by a single row, in which case no identifier is required, or by multiple rows, in which case an identifier is required to link these rows into a single record.
- For attribute pattern detection, each individual must be represented as a collection of discrete (i.e., categorical or binary) attributes. Any continuous attributes must first be quantized via the user interface.
- Given the goal of identifying attribute patterns, no direct identifiers (e.g., names, aliases, ids, phone numbers, email addresses, street addresses) should be included in data outputs. Following the principle of [data minimization](https://en.wikipedia.org/wiki/Data_minimization), such direct identifiers should be removed from data inputs because they are not required for the processing purpose and create unnecessary risks for the data subject. Tools such as Microsoft Excel can be used to delete any direct identifier columns prior to use in Intelligence Toolkit.
- First converting any sensitive input dataset to a synthetic dataset using the Data Synthesis workflow will ensure that any detected attribute patterns can be safely shared without compromising the privacy of data subjects.
[Go to main code folder](../../../toolkit/attribute_patterns/README.md)
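The README moved out of this folder (now linked above) describes the workflow's first processing step: modelling case records as a dynamic graph in which nodes are attribute values within a time window and edges are their co-occurrences. A simplified sketch of that idea, using a toy record layout and networkx rather than the toolkit's actual implementation:

```python
# Simplified illustration of the graph-model step described in the README:
# nodes are attribute=value pairs per time window, edges count co-occurrences.
from itertools import combinations

import networkx as nx
import pandas as pd

records = pd.DataFrame(
    {
        "period": ["2024-01", "2024-01", "2024-02"],
        "city": ["A", "A", "B"],
        "product": ["X", "Y", "X"],
    }
)

graphs: dict[str, nx.Graph] = {}
for period, rows in records.groupby("period"):
    g = nx.Graph()
    for _, row in rows.iterrows():
        values = [f"{col}={row[col]}" for col in ("city", "product")]
        for a, b in combinations(values, 2):
            weight = g.get_edge_data(a, b, default={}).get("weight", 0)
            g.add_edge(a, b, weight=weight + 1)
    graphs[period] = g
```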
2 changes: 1 addition & 1 deletion app/workflows/attribute_patterns/variables.py
@@ -7,7 +7,7 @@
import streamlit as st
from app.util.session_variable import SessionVariable

import python.attribute_patterns.prompts as prompts
import toolkit.attribute_patterns.prompts as prompts


class SessionVariables:
31 changes: 13 additions & 18 deletions app/workflows/attribute_patterns/workflow.py
@@ -1,8 +1,6 @@
# Copyright (c) 2024 Microsoft Corporation. All rights reserved.
# Licensed under the MIT license. See LICENSE file in the project.
#
import os

import altair as alt
import streamlit as st
import app.workflows.attribute_patterns.variables as ap_variables
@@ -15,31 +13,28 @@
)
from app.util import ui_components

from python.attribute_patterns import prompts
from python.attribute_patterns.embedding import generate_embedding
from python.attribute_patterns.model import (
from toolkit.attribute_patterns import get_readme as get_intro
from toolkit.attribute_patterns import prompts
from toolkit.attribute_patterns.embedding import generate_embedding
from toolkit.attribute_patterns.model import (
compute_attribute_counts,
create_time_series_df,
detect_patterns,
generate_graph_model,
prepare_graph,
)
from python.attribute_patterns.record_counter import RecordCounter


def get_intro():
file_path = os.path.join(os.path.dirname(__file__), "README.md")
with open(file_path) as file:
return file.read()
from toolkit.attribute_patterns.record_counter import RecordCounter


def create(sv: ap_variables.SessionVariables, workflow):
intro_tab, uploader_tab, detect_tab, explain_tab = st.tabs([
"Attribute patterns workflow:",
"Create graph model",
"Detect patterns",
"Generate AI pattern reports",
])
intro_tab, uploader_tab, detect_tab, explain_tab = st.tabs(
[
"Attribute patterns workflow:",
"Create graph model",
"Detect patterns",
"Generate AI pattern reports",
]
)
selected_pattern = ""
graph_df = None
with intro_tab:
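Note that the app-side `get_intro` helper, which read the workflow's local README.md, is replaced by `get_readme` imported from `toolkit.attribute_patterns`. Presumably the same file-reading logic now lives next to the toolkit package's own README; a plausible shape, assumed rather than verified against the toolkit source:

```python
# Assumed shape of get_readme in toolkit/attribute_patterns/__init__.py (not verified).
import os


def get_readme() -> str:
    file_path = os.path.join(os.path.dirname(__file__), "README.md")
    with open(file_path) as file:
        return file.read()
```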
2 changes: 1 addition & 1 deletion app/workflows/data_synthesis/config.py
@@ -5,7 +5,7 @@

import plotly.express as px

from python.helpers.constants import CACHE_PATH
from toolkit.helpers.constants import CACHE_PATH

cache_dir = os.path.join(CACHE_PATH, "data_synthesis")
outputs_dir = os.path.join(cache_dir, "outputs")
5 changes: 4 additions & 1 deletion app/workflows/question_answering/variables.py
@@ -1,8 +1,11 @@
# Copyright (c) 2024 Microsoft Corporation. All rights reserved.
import random

import streamlit as st

import toolkit.question_answering.prompts as prompts
from app.util.session_variable import SessionVariable
import python.question_answering.prompts as prompts


class SessionVariables:
prefix = None
14 changes: 7 additions & 7 deletions app/workflows/question_answering/workflow.py
@@ -8,18 +8,18 @@
from seaborn import color_palette
from streamlit_agraph import Config, Edge, Node, agraph

import python.question_answering.input_processor as input_processor
import python.question_answering.prompts as prompts
import python.question_answering.question_answerer as question_answerer
import toolkit.question_answering.input_processor as input_processor
import toolkit.question_answering.prompts as prompts
import toolkit.question_answering.question_answerer as question_answerer
from app.util import ui_components
from app.util.download_pdf import add_download_pdf
from app.util.openai_wrapper import UIOpenAIConfiguration
from app.util.session_variables import SessionVariables
from app.workflows.question_answering import config
from python.AI.base_embedder import BaseEmbedder
from python.AI.defaults import CHUNK_SIZE
from python.AI.local_embedder import LocalEmbedder
from python.AI.openai_embedder import OpenAIEmbedder
from toolkit.AI.base_embedder import BaseEmbedder
from toolkit.AI.defaults import CHUNK_SIZE
from toolkit.AI.local_embedder import LocalEmbedder
from toolkit.AI.openai_embedder import OpenAIEmbedder

sv_home = SessionVariables("home")
ai_configuration = UIOpenAIConfiguration().get_configuration()
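This workflow now imports `BaseEmbedder`, `LocalEmbedder`, and `OpenAIEmbedder` from `toolkit.AI`, which — together with the `fix local embed` bullet — suggests text embedding can run either locally or against the configured OpenAI endpoint. A hedged sketch of how that choice might be wired; the constructor arguments are assumptions, not the toolkit's actual signatures:

```python
# Illustrative embedder selection; real constructors and signatures may differ.
from toolkit.AI.base_embedder import BaseEmbedder
from toolkit.AI.local_embedder import LocalEmbedder
from toolkit.AI.openai_embedder import OpenAIEmbedder


def choose_embedder(ai_configuration, use_local: bool) -> BaseEmbedder:
    if use_local:
        # Assumed: embeds with a local model, no API calls.
        return LocalEmbedder()
    # Assumed: embeds via the configured OpenAI / Azure OpenAI endpoint.
    return OpenAIEmbedder(ai_configuration)
```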
1 change: 0 additions & 1 deletion app/workflows/record_matching/config.py
@@ -2,7 +2,6 @@
# Licensed under the MIT license. See LICENSE file in the project.
#

att_val_sep = "=="
list_sep = "; "
max_rows_to_show = 1000
entity_label = "Entity"
8 changes: 4 additions & 4 deletions app/workflows/record_matching/functions.py
@@ -2,13 +2,13 @@
# Licensed under the MIT license. See LICENSE file in the project.
#
import streamlit as st

from app.util.openai_wrapper import UIOpenAIConfiguration
from app.util.session_variables import SessionVariables
from app.workflows.record_matching import config

from python.AI.base_embedder import BaseEmbedder
from python.AI.local_embedder import LocalEmbedder
from python.AI.openai_embedder import OpenAIEmbedder
from toolkit.AI.base_embedder import BaseEmbedder
from toolkit.AI.local_embedder import LocalEmbedder
from toolkit.AI.openai_embedder import OpenAIEmbedder

sv_home = SessionVariables("home")
