Skip to content

Commit

Permalink
Ccg api (microsoft#66)
Browse files (browse the repository at this point in the history)
* move detect to api

* change ccg api to use polars only

* fix ranked

* add example notebook

* change fn names

* fix pr comment

* fix fn

* add example

* add notebook run
  • Loading branch information
dayesouza authored and scrt-dev committed Oct 30, 2024
1 parent ebba3b1 commit bf47899
Show file tree
Hide file tree
Showing 14 changed files with 713 additions and 223 deletions.
3 changes: 0 additions & 3 deletions app/workflows/anonymize_case_data/__init__.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,2 @@
# Copyright (c) 2024 Microsoft Corporation. All rights reserved.
# Licensed under the MIT license. See LICENSE file in the project.

from toolkit.anonymize_case_data.api import AnonymizeCaseData
from toolkit.anonymize_case_data.error_report import ErrorReport
4 changes: 3 additions & 1 deletion app/workflows/compare_case_groups/variables.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,11 @@
# Copyright (c) 2024 Microsoft Corporation. All rights reserved.
import random

import pandas as pd
import streamlit as st

import toolkit.compare_case_groups.prompts as prompts
from app.util.session_variable import SessionVariable
from toolkit.compare_case_groups.api import CompareCaseGroups


class SessionVariables:
Expand All @@ -16,6 +16,8 @@ def __init__(self, prefix):
self.create_session(prefix)

def create_session(self, prefix):
self.workflow_object = SessionVariable(CompareCaseGroups(), prefix)

self.case_groups_input_df = SessionVariable(pd.DataFrame(), prefix)
self.case_groups_binned_df = SessionVariable(pd.DataFrame(), prefix)
self.case_groups_final_df = SessionVariable(pd.DataFrame(), prefix)
Expand Down
152 changes: 18 additions & 134 deletions app/workflows/compare_case_groups/workflow.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,17 +10,7 @@
import app.workflows.compare_case_groups.variables as gn_variables
from app.util import ui_components
from app.util.download_pdf import add_download_pdf
from toolkit.compare_case_groups import prompts
from toolkit.compare_case_groups.build_dataframes import (
build_attribute_df,
build_grouped_df,
build_ranked_df,
filter_df,
)
from toolkit.compare_case_groups.temporal_process import (
build_temporal_data,
create_window_df,
)
from toolkit.compare_case_groups import CompareCaseGroups, prompts
from toolkit.helpers.df_functions import fix_null_ints


Expand All @@ -42,6 +32,7 @@ def create(sv: gn_variables.SessionVariables, workflow=None):
"View example outputs"
]
)
ccg: CompareCaseGroups = CompareCaseGroups()

with intro_tab:
file_content = get_intro()
Expand Down Expand Up @@ -134,116 +125,15 @@ def create(sv: gn_variables.SessionVariables, workflow=None):
sv.case_groups_aggregates.value = aggregates
sv.case_groups_temporal.value = temporal

for group in sv.case_groups_groups.value:
sv.case_groups_final_df.value = sv.case_groups_final_df.value[
sv.case_groups_final_df.value[group] != ""
]

sv.case_groups_model_df.value = sv.case_groups_final_df.value.copy(
deep=True
)

sv.case_groups_model_df.value = (
sv.case_groups_model_df.value.replace("", None)
)
initial_row_count = len(sv.case_groups_model_df.value)

filtered_df = (
filter_df(sv.case_groups_model_df.value, filters)
if len(filters) > 0
else sv.case_groups_model_df.value
)

grouped_df = build_grouped_df(filtered_df, groups)

attributes_df = build_attribute_df(
pl.from_pandas(filtered_df), groups, aggregates
)

temporal_df = pl.DataFrame()
temporal_atts = []
# create Window df
if temporal is not None and temporal != "":
window_df = create_window_df(
groups, temporal, aggregates, pl.from_pandas(filtered_df)
)

temporal_atts = sorted(
sv.case_groups_model_df.value[temporal].astype(str).unique()
)

temporal_df = build_temporal_data(
window_df, groups, temporal_atts, temporal
)
# Create overall df
ranked_df = build_ranked_df(
temporal_df,
pl.from_pandas(grouped_df),
attributes_df,
temporal or "",
ccg.create_data_summary(
pl.from_pandas(sv.case_groups_final_df.value),
filters,
groups,
aggregates,
temporal,
)
ranked_df = ranked_df.to_pandas()

sv.case_groups_model_df.value = (
ranked_df[
[
*[g.lower() for g in groups],
"group_count",
"group_rank",
"attribute_value",
"attribute_count",
"attribute_rank",
f"{temporal}_window",
f"{temporal}_window_count",
f"{temporal}_window_rank",
f"{temporal}_window_delta",
]
]
if temporal != ""
else ranked_df[
[
*[g.lower() for g in groups],
"group_count",
"group_rank",
"attribute_value",
"attribute_count",
"attribute_rank",
]
]
)
groups_text = (
"[" + ", ".join(["**" + g + "**" for g in groups]) + "]"
)
filters_text = (
"["
+ ", ".join(
["**" + f.replace(":", "\\:") + "**" for f in filters]
)
+ "]"
)

filtered_row_count = len(filtered_df)
dataset_proportion = int(
round(
100 * filtered_row_count / initial_row_count
if initial_row_count > 0
else 0,
0,
)
)
description = "This table shows:"
description += (
f"\n- A summary of **{len(filtered_df)}** data records matching {filters_text}, representing **{dataset_proportion}%** of the overall dataset with values for all grouping attributes"
if len(filters) > 0
else f"\n- A summary of all **{initial_row_count}** data records with values for all grouping attributes"
)
description += f"\n- The **group_count** of records for all {groups_text} groups, and corresponding **group_rank**"
description += f"\n- The **attribute_count** of each **attribute_value** for all {groups_text} groups, and corresponding **attribute_rank**"
if temporal != "":
description += f"\n- The **{temporal}_window_count** of each **attribute_value** for each **{temporal}_window** for all {groups_text} groups, and corresponding **{temporal}_window_rank**"
description += f"\n- The **{temporal}_window_delta**, or change in the **attribute_value_count** for successive **{temporal}_window** values, within each {groups_text} group"
sv.case_groups_description.value = description
sv.case_groups_description.value = ccg.get_summary_description()
sv.case_groups_model_df.value = ccg.model_df.to_pandas()
st.rerun()
if len(sv.case_groups_model_df.value) > 0:
st.dataframe(
Expand Down Expand Up @@ -290,27 +180,21 @@ def create(sv: gn_variables.SessionVariables, workflow=None):
value=sv.case_groups_top_groups.value,
)
fdf = sv.case_groups_model_df.value.copy(deep=True)
filter_description = ""
if len(selected_groups) > 0:
fdf = fdf[
fdf.set_index(sv.case_groups_groups.value).index.isin(
selected_groups
)
]
filter_description = f'Filtered to the following groups only: {", ".join([str(s) for s in selected_groups])}'
elif top_group_ranks:
fdf = fdf[fdf["group_rank"] <= top_group_ranks]
filter_description = (
f"Filtered to the top {top_group_ranks} groups by record count"
)
num_rows = len(fdf)
report_data, filter_description = ccg.get_report_data(
selected_groups if len(selected_groups) > 0 else None,
top_group_ranks if top_group_ranks > 0 else None,
)
num_rows = len(report_data)
st.markdown(f"##### Filtered data summary to report on ({num_rows} rows)")
st.dataframe(fdf, hide_index=True, use_container_width=True, height=280)
variables = {
"description": sv.case_groups_description.value,
"dataset": fdf.to_csv(index=False, encoding="utf-8-sig"),
"dataset": report_data.to_pandas().to_csv(
index=False, encoding="utf-8-sig"
),
"filters": filter_description,
}

generate, messages, reset = ui_components.generative_ai_component(
sv.case_groups_system_prompt, variables
)
Expand Down
Loading

0 comments on commit bf47899

Please sign in to comment.