separate risk networks logic (#26)
* created risk_networks folder

* separate logic for happy path for Entity-Attribute

* happy path even on UI ok

* risk networks entity attributes create data model notebook

* add node indexer

* add type

* add infer and index nodes

* add logic in rn and tests

* change python folder to toolkit

* change folder name on code to import from toolkit folder

* fix test

* add max cluster size

* noqa on test removed

* fix max network size

* add detect notebook

* add flags rendering, logic and tests

* naming refactoring

* fix loading and connection bar to openai

* code fixes and separation

* add fixed notebooks

* add index infer notebook

* fix tests and inferring

* fixes and separation

* fix local embed

* fixed on tests

* push_before_class

* constant fixes

* import fixes and notebooks

* change main notebook

* delete report

* fixed after PR and build

* fix developing md

* fix installer
dayesouza authored Aug 23, 2024
1 parent b60bb17 commit aef6e9f
Showing 102 changed files with 6,091 additions and 1,956 deletions.
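Most of the files below change only their imports, reflecting the rename of the `python` package to `toolkit` noted in the commit message. As a purely hypothetical sketch (not taken from this commit) of how such a bulk import rewrite could be scripted:

```python
# Hypothetical helper, not part of this commit: rewrite "python." imports to "toolkit.".
import re
from pathlib import Path


def rewrite_imports(root: str) -> None:
    for path in Path(root).rglob("*.py"):
        text = path.read_text(encoding="utf-8")
        # Only touch the leading package name; keep the rest of the import line intact.
        new_text = re.sub(r"\b(from|import)(\s+)python\.", r"\1\2toolkit.", text)
        if new_text != text:
            path.write_text(new_text, encoding="utf-8")


if __name__ == "__main__":
    rewrite_imports(".")
```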
2 changes: 2 additions & 0 deletions .gitignore
@@ -9,6 +9,7 @@ __pycache__/
# Virtual environment
venv/
env/
*.lcov

# Streamlit
.cache/
@@ -52,6 +53,7 @@ app/wkhtmltox/*.exe
*.coverage
**/messages
**/xpia*
*.lcov

**/studio_tests/
.venv
2 changes: 1 addition & 1 deletion .vsts-ci.yml
@@ -63,7 +63,7 @@ stages:
inputs:
workingDirectory: ./
targetType: "inline"
script: pytest
script: pytest -vv
- job: buildAndPush
displayName: BuildAndPushContainer
dependsOn: validate
2 changes: 1 addition & 1 deletion DEVELOPING.md
@@ -58,7 +58,7 @@ Open venv/Scripts/Activate.ps1, add the following lines after line 167:
$env:AZURE_OPENAI_ENDPOINT="https://<ENDPOINT>.openai.azure.com/"
```
## Running code-only
- [Attribute Patterns](./python/attribute_patterns/README.md)
- [Attribute Patterns](./toolkit/attribute_patterns/README.md)

- [Example](./examples/attribute_patterns.ipynb): See an example of how to run the code with your data to obtain results without the need to run the UI.

4 changes: 2 additions & 2 deletions app/pages/Settings.py
@@ -20,8 +20,8 @@
)
from util.secrets_handler import SecretsHandler

from python.AI.vector_store import VectorStore
from python.helpers.constants import CACHE_PATH
from toolkit.AI.vector_store import VectorStore
from toolkit.helpers.constants import CACHE_PATH


def on_change(handler, key=None, value=None):
2 changes: 1 addition & 1 deletion app/util/openai_wrapper.py
@@ -1,7 +1,7 @@
# Copyright (c) 2024 Microsoft Corporation. All rights reserved.
# Licensed under the MIT license. See LICENSE file in the project.
#
from python.AI.openai_configuration import OpenAIConfiguration
from toolkit.AI.openai_configuration import OpenAIConfiguration

from .secrets_handler import SecretsHandler

53 changes: 34 additions & 19 deletions app/util/ui_components.py
@@ -15,10 +15,10 @@
from app.util.enums import Mode
from app.util.openai_wrapper import UIOpenAIConfiguration

import python.AI.utils as utils
from python.AI.classes import LLMCallback
from python.AI.client import OpenAIClient
from python.AI.defaults import DEFAULT_MAX_INPUT_TOKENS
import toolkit.AI.utils as utils
from toolkit.AI.classes import LLMCallback
from toolkit.AI.client import OpenAIClient
from toolkit.AI.defaults import DEFAULT_MAX_INPUT_TOKENS


def return_token_count(text: str) -> int:
@@ -145,11 +145,13 @@ def generative_batch_ai_component(
batch_count = batch_count_raw + 1 if batch_count_remaining != 0 else batch_count_raw
batch_messages = []

full_prompt = " ".join([
system_prompt_var.value["report_prompt"],
instructions_text,
system_prompt_var.value["safety_prompt"],
])
full_prompt = " ".join(
[
system_prompt_var.value["report_prompt"],
instructions_text,
system_prompt_var.value["safety_prompt"],
]
)
for _i in range(batch_count):
batch = batch_val[batch_offset : min(batch_offset + batch_size, len(batch_val))]
batch_offset += batch_size
@@ -685,17 +687,21 @@ def prepare_input_df(
for _i, row in melted.iterrows():
if row["Attribute"] in expanded_atts:
if str(row["Value"]) not in ["", "<NA>"]:
new_rows.append([
row["Subject ID"],
row["Attribute"] + "_" + str(row["Value"]),
"1",
])
new_rows.append(
[
row["Subject ID"],
row["Attribute"] + "_" + str(row["Value"]),
"1",
]
)
else:
new_rows.append([
row["Subject ID"],
row["Attribute"],
str(row["Value"]),
])
new_rows.append(
[
row["Subject ID"],
row["Attribute"],
str(row["Value"]),
]
)
melted = pd.DataFrame(
new_rows, columns=["Subject ID", "Attribute", "Value"]
)
@@ -750,6 +756,15 @@ def on(text):
return on_callback


def remove_connection_bar(fn):
def on(_):
fn(_)

on_callback = LLMCallback()
on_callback.on_llm_new_token = on
return on_callback


def build_validation_ui(
report_validation, attribute_report_validation_messages, report_data, file_name
):
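The new `remove_connection_bar` helper above wraps an arbitrary callable in an `LLMCallback` whose `on_llm_new_token` hook simply forwards each event, which — together with the `fix loading and connection bar to openai` bullet — suggests it is used to dismiss a connection/progress indicator once streaming begins. A minimal usage sketch, with the Streamlit placeholder and wiring assumed rather than taken from this commit:

```python
# Illustrative only: clearing a "connecting to OpenAI..." placeholder when the first token arrives.
import streamlit as st

from app.util.ui_components import remove_connection_bar

status = st.empty()
status.info("Connecting to OpenAI...")

# remove_connection_bar accepts any callable; here it simply empties the placeholder.
callback = remove_connection_bar(lambda _token: status.empty())
# The callback would then be passed to the streaming generation call
# (the exact parameter name is an assumption).
```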
20 changes: 1 addition & 19 deletions app/workflows/attribute_patterns/README.md
@@ -1,21 +1,3 @@
# Attribute Patterns

The **Attribute Patterns** workflow generates intelligence reports on attribute patterns detected in streams of case records.

## How it works

1. [**Input**] Case records representing categorical attributes of data subjects observed at a point in time. Units are treated as anonymous and independent.
2. [**Process**] Categorical attributes are modelled as a dynamic graph, where nodes represent attribute values in a given time window and edges represent the co-occurrences of attribute values.
3. [**Process**] A technique called [Graph Fusion Encoder Embedding](https://arxiv.org/abs/2303.18051) is used to embed the dynamic attribute graph into a multi-dimensional space.
4. [**Process**] Within each time period, attribute patterns are detected as combinations of attributes all moving towards one another in the embedding space.
5. [**Output**] Attribute patterns CSV file. Can be created and used independently without any AI or embedding calls.
6. [**AI Calls**] For patterns of interest selected by the user, generative AI is used to create AI pattern reports.
7. [**Output**] AI pattern report MD/PDF file(s) describing the nature of the pattern, its progression over time, top co-occurring attribute values, possible explanations, and suggested actions.

## Input requirements

- The input data file should be in CSV format and represent individual data subjects.
- Individual data subjects may be represented by a single row, in which case no identifier is required, or by multiple rows, in which case an identifier is required to link these rows into a single record.
- For attribute pattern detection, each individual must be represented as a collection of discrete (i.e., categorical or binary) attributes. Any continuous attributes must first be quantized via the user interface.
- Given the goal of identifying attribute patterns, no direct identifiers (e.g., names, aliases, ids, phone numbers, email addresses, street addresses) should be included in data outputs. Following the principle of [data minimization](https://en.wikipedia.org/wiki/Data_minimization), such direct identifiers should be removed from data inputs because they are not required for the processing purpose and create unnecessary risks for the data subject. Tools such as Microsoft Excel can be used to delete any direct identifier columns prior to use in Intelligence Toolkit.
- First converting any sensitive input dataset to a synthetic dataset using the Data Synthesis workflow will ensure that any detected attribute patterns can be safely shared without compromising the privacy of data subjects.
[Go to main code folder](../../../toolkit/attribute_patterns/README.md)
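The README moved out of this folder (now linked above) describes the workflow's first processing step: modelling case records as a dynamic graph in which nodes are attribute values within a time window and edges are their co-occurrences. A simplified sketch of that idea, using a toy record layout and networkx rather than the toolkit's actual implementation:

```python
# Simplified illustration of the graph-model step described in the README:
# nodes are attribute=value pairs per time window, edges count co-occurrences.
from itertools import combinations

import networkx as nx
import pandas as pd

records = pd.DataFrame(
    {
        "period": ["2024-01", "2024-01", "2024-02"],
        "city": ["A", "A", "B"],
        "product": ["X", "Y", "X"],
    }
)

graphs: dict[str, nx.Graph] = {}
for period, rows in records.groupby("period"):
    g = nx.Graph()
    for _, row in rows.iterrows():
        values = [f"{col}={row[col]}" for col in ("city", "product")]
        for a, b in combinations(values, 2):
            weight = g.get_edge_data(a, b, default={}).get("weight", 0)
            g.add_edge(a, b, weight=weight + 1)
    graphs[period] = g
```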
2 changes: 1 addition & 1 deletion app/workflows/attribute_patterns/variables.py
@@ -7,7 +7,7 @@
import streamlit as st
from app.util.session_variable import SessionVariable

import python.attribute_patterns.prompts as prompts
import toolkit.attribute_patterns.prompts as prompts


class SessionVariables:
31 changes: 13 additions & 18 deletions app/workflows/attribute_patterns/workflow.py
@@ -1,8 +1,6 @@
# Copyright (c) 2024 Microsoft Corporation. All rights reserved.
# Licensed under the MIT license. See LICENSE file in the project.
#
import os

import altair as alt
import streamlit as st
import app.workflows.attribute_patterns.variables as ap_variables
@@ -15,31 +13,28 @@
)
from app.util import ui_components

from python.attribute_patterns import prompts
from python.attribute_patterns.embedding import generate_embedding
from python.attribute_patterns.model import (
from toolkit.attribute_patterns import get_readme as get_intro
from toolkit.attribute_patterns import prompts
from toolkit.attribute_patterns.embedding import generate_embedding
from toolkit.attribute_patterns.model import (
compute_attribute_counts,
create_time_series_df,
detect_patterns,
generate_graph_model,
prepare_graph,
)
from python.attribute_patterns.record_counter import RecordCounter


def get_intro():
file_path = os.path.join(os.path.dirname(__file__), "README.md")
with open(file_path) as file:
return file.read()
from toolkit.attribute_patterns.record_counter import RecordCounter


def create(sv: ap_variables.SessionVariables, workflow):
intro_tab, uploader_tab, detect_tab, explain_tab = st.tabs([
"Attribute patterns workflow:",
"Create graph model",
"Detect patterns",
"Generate AI pattern reports",
])
intro_tab, uploader_tab, detect_tab, explain_tab = st.tabs(
[
"Attribute patterns workflow:",
"Create graph model",
"Detect patterns",
"Generate AI pattern reports",
]
)
selected_pattern = ""
graph_df = None
with intro_tab:
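Note that the app-side `get_intro` helper, which read the workflow's local README.md, is replaced by `get_readme` imported from `toolkit.attribute_patterns`. Presumably the same file-reading logic now lives next to the toolkit package's own README; a plausible shape, assumed rather than verified against the toolkit source:

```python
# Assumed shape of get_readme in toolkit/attribute_patterns/__init__.py (not verified).
import os


def get_readme() -> str:
    file_path = os.path.join(os.path.dirname(__file__), "README.md")
    with open(file_path) as file:
        return file.read()
```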
2 changes: 1 addition & 1 deletion app/workflows/data_synthesis/config.py
@@ -5,7 +5,7 @@

import plotly.express as px

from python.helpers.constants import CACHE_PATH
from toolkit.helpers.constants import CACHE_PATH

cache_dir = os.path.join(CACHE_PATH, "data_synthesis")
outputs_dir = os.path.join(cache_dir, "outputs")
5 changes: 4 additions & 1 deletion app/workflows/question_answering/variables.py
@@ -1,8 +1,11 @@
# Copyright (c) 2024 Microsoft Corporation. All rights reserved.
import random

import streamlit as st

import toolkit.question_answering.prompts as prompts
from app.util.session_variable import SessionVariable
import python.question_answering.prompts as prompts


class SessionVariables:
prefix = None
14 changes: 7 additions & 7 deletions app/workflows/question_answering/workflow.py
@@ -8,18 +8,18 @@
from seaborn import color_palette
from streamlit_agraph import Config, Edge, Node, agraph

import python.question_answering.input_processor as input_processor
import python.question_answering.prompts as prompts
import python.question_answering.question_answerer as question_answerer
import toolkit.question_answering.input_processor as input_processor
import toolkit.question_answering.prompts as prompts
import toolkit.question_answering.question_answerer as question_answerer
from app.util import ui_components
from app.util.download_pdf import add_download_pdf
from app.util.openai_wrapper import UIOpenAIConfiguration
from app.util.session_variables import SessionVariables
from app.workflows.question_answering import config
from python.AI.base_embedder import BaseEmbedder
from python.AI.defaults import CHUNK_SIZE
from python.AI.local_embedder import LocalEmbedder
from python.AI.openai_embedder import OpenAIEmbedder
from toolkit.AI.base_embedder import BaseEmbedder
from toolkit.AI.defaults import CHUNK_SIZE
from toolkit.AI.local_embedder import LocalEmbedder
from toolkit.AI.openai_embedder import OpenAIEmbedder

sv_home = SessionVariables("home")
ai_configuration = UIOpenAIConfiguration().get_configuration()
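This workflow now imports `BaseEmbedder`, `LocalEmbedder`, and `OpenAIEmbedder` from `toolkit.AI`, which — together with the `fix local embed` bullet — suggests text embedding can run either locally or against the configured OpenAI endpoint. A hedged sketch of how that choice might be wired; the constructor arguments are assumptions, not the toolkit's actual signatures:

```python
# Illustrative embedder selection; real constructors and signatures may differ.
from toolkit.AI.base_embedder import BaseEmbedder
from toolkit.AI.local_embedder import LocalEmbedder
from toolkit.AI.openai_embedder import OpenAIEmbedder


def choose_embedder(ai_configuration, use_local: bool) -> BaseEmbedder:
    if use_local:
        # Assumed: embeds with a local model, no API calls.
        return LocalEmbedder()
    # Assumed: embeds via the configured OpenAI / Azure OpenAI endpoint.
    return OpenAIEmbedder(ai_configuration)
```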
1 change: 0 additions & 1 deletion app/workflows/record_matching/config.py
@@ -2,7 +2,6 @@
# Licensed under the MIT license. See LICENSE file in the project.
#

att_val_sep = "=="
list_sep = "; "
max_rows_to_show = 1000
entity_label = "Entity"
8 changes: 4 additions & 4 deletions app/workflows/record_matching/functions.py
@@ -2,13 +2,13 @@
# Licensed under the MIT license. See LICENSE file in the project.
#
import streamlit as st

from app.util.openai_wrapper import UIOpenAIConfiguration
from app.util.session_variables import SessionVariables
from app.workflows.record_matching import config

from python.AI.base_embedder import BaseEmbedder
from python.AI.local_embedder import LocalEmbedder
from python.AI.openai_embedder import OpenAIEmbedder
from toolkit.AI.base_embedder import BaseEmbedder
from toolkit.AI.local_embedder import LocalEmbedder
from toolkit.AI.openai_embedder import OpenAIEmbedder

sv_home = SessionVariables("home")
