Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

change fix_null_ints #72

Merged
merged 7 commits into from
Oct 30, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
23 changes: 8 additions & 15 deletions app/util/df_functions.py
Original file line number Diff line number Diff line change
@@ -1,29 +1,22 @@
# Copyright (c) 2024 Microsoft Corporation. All rights reserved.
import math
import sys
import re
import numpy as np

import pandas as pd
from dateutil import parser as dateparser


def fix_null_ints(in_df):
def fix_null_ints(in_df: pd.DataFrame) -> pd.DataFrame:
df = in_df.copy()
for col, dt in zip(df.columns, df.dtypes, strict=False):
if dt == "float64":
idf = df[[col]].copy()
idf["float"] = [x if not np.isnan(x) else 0 for x in idf[col]]
idf["int"] = [int(x) if not np.isnan(x) else 0 for x in idf[col]]
idf["float_s"] = [x if not np.isnan(x) else -sys.maxsize for x in idf[col]]
idf["int_s"] = [
int(x) if not np.isnan(x) else -sys.maxsize for x in idf[col]
]
fsum = idf["float"].sum()
isum = idf["int"].sum()
if int(fsum) == int(isum):
df[col] = idf["int_s"]
try:
df[col] = df[col].astype("Int64")
df[col] = df[col].replace(-sys.maxsize, np.nan)
df[col] = df[col].where(pd.notna(df[col]), pd.NA)
df[col] = df[col].astype(str).replace("<NA>", "")
except Exception as e:
print(f"Error converting column {col} to Int64: {e}")

return df


Expand Down
10 changes: 2 additions & 8 deletions app/util/ui_components.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
from toolkit.AI.classes import LLMCallback
from toolkit.AI.client import OpenAIClient
from toolkit.AI.defaults import DEFAULT_MAX_INPUT_TOKENS
from toolkit.helpers import df_functions
from toolkit.helpers.texts import clean_for_column_name


Expand Down Expand Up @@ -654,14 +655,7 @@ def prepare_stage(df_name):

if not initialized or suppress_zeros:
st.session_state[f"{workflow}_suppress_zeros"] = suppress_zeros
for col in last_df.columns:
unique_values = list([str(x) for x in last_df[col].unique()])
is_three_with_none = len(unique_values) == 3 and last_df[col].isna().any()
if len(unique_values) <= 2 or is_three_with_none:
if "0" in unique_values or "0.0" in unique_values:
this_df[col] = last_df[col].astype(str).replace("0", np.nan).replace("0.0", np.nan)
elif 'False' in unique_values:
this_df[col] = last_df[col].astype(str).replace('False', np.nan)
this_df = df_functions.supress_boolean_binary(last_df, this_df)
df_updated("suppress_null")
if not suppress_zeros:
for col in this_df.columns:
Expand Down
1,186 changes: 654 additions & 532 deletions example_notebooks/anonymize_case_data.ipynb

Large diffs are not rendered by default.

159 changes: 114 additions & 45 deletions example_notebooks/compare_case_groups.ipynb

Large diffs are not rendered by default.

76 changes: 38 additions & 38 deletions example_notebooks/detect_case_patterns.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -20,13 +20,14 @@
"name": "stderr",
"output_type": "stream",
"text": [
"c:\\Users\\daedge\\AppData\\Local\\pypoetry\\Cache\\virtualenvs\\intelligence-toolkit-lXFNld9n-py3.11\\Lib\\site-packages\\tqdm\\auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
" from .autonotebook import tqdm as notebook_tqdm\n"
"/home/ddesouza/Projects/intelligence-toolkit/.venv/lib/python3.11/site-packages/sentence_transformers/cross_encoder/CrossEncoder.py:13: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
" from tqdm.autonotebook import tqdm, trange\n"
]
}
],
"source": [
"import sys\n",
"\n",
"sys.path.append(\"..\")\n",
"import os\n",
"from toolkit.detect_case_patterns import DetectCasePatterns\n",
Expand All @@ -50,6 +51,9 @@
],
"source": [
"# Create the workflow object\n",
"from toolkit.helpers import df_functions\n",
"\n",
"\n",
"dcp = DetectCasePatterns()\n",
"# Set the AI configuration\n",
"ai_configuration = OpenAIConfiguration(\n",
Expand All @@ -64,7 +68,7 @@
"data_path = \"../example_outputs/detect_case_patterns/customer_complaints/customer_complaints_prepared.csv\"\n",
"case_data = pd.read_csv(data_path)\n",
"# Map missing values and binary False to empty strings, since we only care about the presence of attributes\n",
"case_data = case_data.astype(str).replace(\"False\", \"\").replace(\"nan\", \"\")\n",
"case_data = df_functions.supress_boolean_binary(case_data)\n",
"print(\"Loaded data\")"
]
},
Expand All @@ -83,10 +87,7 @@
],
"source": [
"# Generate the graph model\n",
"dcp.generate_graph_model(\n",
" df=case_data,\n",
" period_col=\"period\"\n",
")\n",
"dcp.generate_graph_model(df=case_data, period_col=\"period\")\n",
"print(\"Generated graph model\")"
]
},
Expand Down Expand Up @@ -124,10 +125,7 @@
],
"source": [
"# Detect the case patterns\n",
"dcp.detect_patterns(\n",
" min_pattern_count=10,\n",
" max_pattern_length=5\n",
")\n",
"dcp.detect_patterns(min_pattern_count=10, max_pattern_length=5)\n",
"print(\"Detected case patterns\")"
]
},
Expand Down Expand Up @@ -219,6 +217,7 @@
"name": "stdout",
"output_type": "stream",
"text": [
"Computing attribute counts for pattern: age_range:(50-60] & city:Springfield & delivery_issue:True & product_code:G & service_issue:True with period: 2023-H1 for period column: period\n",
" AttributeValue Count\n",
"0 age_range:(50-60] 18\n",
"1 city:Springfield 18\n",
Expand All @@ -235,7 +234,7 @@
"# Compute related attribute counts for the example pattern\n",
"att_counts = dcp.compute_attribute_counts(\n",
" selected_pattern=example_pattern[\"pattern\"],\n",
" selected_pattern_period=example_pattern[\"period\"]\n",
" selected_pattern_period=example_pattern[\"period\"],\n",
")\n",
"print(att_counts)"
]
Expand All @@ -249,23 +248,23 @@
"name": "stderr",
"output_type": "stream",
"text": [
"c:\\Users\\daedge\\AppData\\Local\\pypoetry\\Cache\\virtualenvs\\intelligence-toolkit-lXFNld9n-py3.11\\Lib\\site-packages\\altair\\utils\\core.py:384: FutureWarning: the convert_dtype parameter is deprecated and will be removed in a future version. Do ``ser.astype(object).apply()`` instead if you want ``convert_dtype=False``.\n",
"/home/ddesouza/Projects/intelligence-toolkit/.venv/lib/python3.11/site-packages/altair/utils/core.py:384: FutureWarning: the convert_dtype parameter is deprecated and will be removed in a future version. Do ``ser.astype(object).apply()`` instead if you want ``convert_dtype=False``.\n",
" col = df[col_name].apply(to_list_if_array, convert_dtype=False)\n",
"c:\\Users\\daedge\\AppData\\Local\\pypoetry\\Cache\\virtualenvs\\intelligence-toolkit-lXFNld9n-py3.11\\Lib\\site-packages\\altair\\utils\\core.py:384: FutureWarning: the convert_dtype parameter is deprecated and will be removed in a future version. Do ``ser.astype(object).apply()`` instead if you want ``convert_dtype=False``.\n",
"/home/ddesouza/Projects/intelligence-toolkit/.venv/lib/python3.11/site-packages/altair/utils/core.py:384: FutureWarning: the convert_dtype parameter is deprecated and will be removed in a future version. Do ``ser.astype(object).apply()`` instead if you want ``convert_dtype=False``.\n",
" col = df[col_name].apply(to_list_if_array, convert_dtype=False)\n"
]
},
{
"data": {
"text/html": [
"\n",
"<div id=\"altair-viz-e72620d0fd0146d9aaecd839822393ea\"></div>\n",
"<div id=\"altair-viz-ed50583bea9b4b46bdbb8a1b166cd7fb\"></div>\n",
"<script type=\"text/javascript\">\n",
" var VEGA_DEBUG = (typeof VEGA_DEBUG == \"undefined\") ? {} : VEGA_DEBUG;\n",
" (function(spec, embedOpt){\n",
" let outputDiv = document.currentScript.previousElementSibling;\n",
" if (outputDiv.id !== \"altair-viz-e72620d0fd0146d9aaecd839822393ea\") {\n",
" outputDiv = document.getElementById(\"altair-viz-e72620d0fd0146d9aaecd839822393ea\");\n",
" if (outputDiv.id !== \"altair-viz-ed50583bea9b4b46bdbb8a1b166cd7fb\") {\n",
" outputDiv = document.getElementById(\"altair-viz-ed50583bea9b4b46bdbb8a1b166cd7fb\");\n",
" }\n",
" const paths = {\n",
" \"vega\": \"https://cdn.jsdelivr.net/npm//vega@5?noext\",\n",
Expand Down Expand Up @@ -324,10 +323,10 @@
}
],
"source": [
"# Create the time series chart \n",
"# Create the time series chart\n",
"chart = dcp.create_time_series_chart(\n",
" selected_pattern=example_pattern[\"pattern\"],\n",
" selected_pattern_period=example_pattern[\"period\"]\n",
" selected_pattern_period=example_pattern[\"period\"],\n",
")\n",
"chart"
]
Expand All @@ -341,39 +340,40 @@
"name": "stdout",
"output_type": "stream",
"text": [
"Computing attribute counts for pattern: age_range:(50-60] & city:Springfield & delivery_issue:True & product_code:G & service_issue:True with period: 2023-H1 for period column: period\n",
"# Pattern Report\n",
"\n",
"**Pattern: age_range:(50-60] & city:Springfield & delivery_issue:True & product_code:G & service_issue:True**\n",
"\n",
"This pattern describes a group of cases where individuals aged between 50 and 60, living in Springfield, experienced both delivery and service issues with product code G.\n",
"This pattern identifies a group of individuals aged between 50 and 60 years old, residing in Springfield, who have experienced both delivery and service issues with product code G.\n",
"\n",
"## Pattern observation\n",
"\n",
"The pattern was observed only in the first half of 2023, with 18 cases matching the pattern. In all other periods from 2020 to 2025, there were no cases matching this pattern. This sudden appearance in 2023-H1 suggests a specific issue or event during this time that affected this demographic and product combination.\n",
"The pattern was observed only in the first half of 2023, with 18 cases matching the pattern. In all other periods from 2020 to 2025, no cases were recorded. This sudden appearance in 2023-H1 suggests a specific issue or change during this time that affected this demographic and product.\n",
"\n",
"## Pattern context\n",
"\n",
"In addition to the attributes defining the pattern, some cases also reported quality issues (7 cases), description issues (4 cases), and price issues (4 cases). This information suggests that, beyond delivery and service issues, there were other concerns with the product, which might have contributed to the overall dissatisfaction.\n",
"In addition to the attributes defining the pattern, some cases also reported quality issues (7 cases), description issues (4 cases), and price issues (4 cases). This information suggests that, beyond delivery and service problems, there were also concerns about the product's quality, description, and pricing. Understanding these additional issues can help in diagnosing the root causes of the pattern.\n",
"\n",
"## Possible explanations\n",
"\n",
"1. A new batch of product G might have been released in early 2023, leading to increased delivery and service issues.\n",
"2. Changes in the delivery service provider or logistics in Springfield could have caused these issues.\n",
"3. A local event or policy change in Springfield might have impacted the delivery and service quality.\n",
"4. The demographic group aged 50-60 might have specific needs or expectations that were not met by product G.\n",
"5. There could have been a marketing campaign targeting this age group in Springfield, leading to increased purchases and subsequent issues.\n",
"6. A temporary staffing issue at the service center could have led to service problems during this period.\n",
"7. External factors, such as weather conditions or supply chain disruptions, might have affected deliveries in Springfield.\n",
"1. A change in the delivery process or service protocol for product G in Springfield during 2023-H1.\n",
"2. An increase in demand for product G among the 50-60 age group in Springfield, leading to service and delivery strains.\n",
"3. A specific event or promotion targeting this demographic in Springfield, resulting in increased purchases and subsequent issues.\n",
"4. A new supplier or logistics partner introduced in 2023-H1 that caused disruptions.\n",
"5. A regional event in Springfield that affected delivery and service operations.\n",
"6. Changes in the product itself, such as packaging or features, that led to increased complaints.\n",
"7. A data collection or reporting change in 2023-H1 that captured issues more effectively than in previous periods.\n",
"\n",
"## Suggested actions\n",
"\n",
"1. Investigate the specific causes of delivery and service issues for product G in Springfield during 2023-H1.\n",
"2. Conduct a customer satisfaction survey targeting the affected demographic to gather more insights.\n",
"3. Review and improve the logistics and service processes for product G in Springfield.\n",
"4. Provide additional training to service staff to better handle issues related to product G.\n",
"5. Consider offering compensation or discounts to affected customers to restore trust.\n",
"6. Analyze the quality control processes for product G to address any underlying quality issues.\n",
"7. Monitor the situation closely in subsequent periods to ensure the issues do not recur.\n"
"1. Investigate the delivery and service processes for product G in Springfield to identify any changes or issues that arose in 2023-H1.\n",
"2. Conduct customer feedback sessions with the affected demographic to understand their specific concerns.\n",
"3. Review the quality control measures for product G to address the reported quality issues.\n",
"4. Analyze the product description and pricing strategies to ensure they meet customer expectations.\n",
"5. Collaborate with logistics partners to improve delivery reliability in Springfield.\n",
"6. Implement targeted customer service training to better handle issues related to product G.\n",
"7. Monitor the situation in subsequent periods to ensure that the pattern does not reoccur.\n"
]
}
],
Expand All @@ -382,15 +382,15 @@
"explanation = dcp.explain_pattern(\n",
" selected_pattern=example_pattern[\"pattern\"],\n",
" selected_pattern_period=example_pattern[\"period\"],\n",
" ai_instructions=prompts.user_prompt\n",
" ai_instructions=prompts.user_prompt,\n",
")\n",
"print(explanation)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "intelligence-toolkit-lXFNld9n-py3.11",
"display_name": ".venv",
"language": "python",
"name": "python3"
},
Expand Down
Loading
Loading