microsoft · dayesouza · Oct 30, 2024 · Oct 30, 2024 · Oct 30, 2024 · Oct 30, 2024
@@ -1,29 +1,22 @@
 # Copyright (c) 2024 Microsoft Corporation. All rights reserved.
 import math
-import sys
 import re
-import numpy as np
+
 import pandas as pd
 from dateutil import parser as dateparser
 
 
-def fix_null_ints(in_df):
+def fix_null_ints(in_df: pd.DataFrame) -> pd.DataFrame:
     df = in_df.copy()
     for col, dt in zip(df.columns, df.dtypes, strict=False):
         if dt == "float64":
-            idf = df[[col]].copy()
-            idf["float"] = [x if not np.isnan(x) else 0 for x in idf[col]]
-            idf["int"] = [int(x) if not np.isnan(x) else 0 for x in idf[col]]
-            idf["float_s"] = [x if not np.isnan(x) else -sys.maxsize for x in idf[col]]
-            idf["int_s"] = [
-                int(x) if not np.isnan(x) else -sys.maxsize for x in idf[col]
-            ]
-            fsum = idf["float"].sum()
-            isum = idf["int"].sum()
-            if int(fsum) == int(isum):
-                df[col] = idf["int_s"]
+            try:
                 df[col] = df[col].astype("Int64")
-                df[col] = df[col].replace(-sys.maxsize, np.nan)
+                df[col] = df[col].where(pd.notna(df[col]), pd.NA)
+                df[col] = df[col].astype(str).replace("<NA>", "")
+            except Exception as e:
+                print(f"Error converting column {col} to Int64: {e}")
+
     return df
 
 

@@ -19,6 +19,7 @@
 from toolkit.AI.classes import LLMCallback
 from toolkit.AI.client import OpenAIClient
 from toolkit.AI.defaults import DEFAULT_MAX_INPUT_TOKENS
+from toolkit.helpers import df_functions
 from toolkit.helpers.texts import clean_for_column_name
 
 
@@ -654,14 +655,7 @@ def prepare_stage(df_name):
 
         if not initialized or suppress_zeros:
             st.session_state[f"{workflow}_suppress_zeros"] = suppress_zeros
-            for col in last_df.columns:
-                unique_values = list([str(x) for x in last_df[col].unique()])
-                is_three_with_none = len(unique_values) == 3 and last_df[col].isna().any()
-                if len(unique_values) <= 2 or is_three_with_none:
-                    if "0" in unique_values or "0.0" in unique_values:
-                        this_df[col] = last_df[col].astype(str).replace("0", np.nan).replace("0.0", np.nan)
-                    elif 'False' in unique_values:
-                        this_df[col] = last_df[col].astype(str).replace('False', np.nan)
+            this_df = df_functions.supress_boolean_binary(last_df, this_df)
             df_updated("suppress_null")
         if not suppress_zeros:
             for col in this_df.columns:

@@ -20,13 +20,14 @@
      "name": "stderr",
      "output_type": "stream",
      "text": [
-      "c:\\Users\\daedge\\AppData\\Local\\pypoetry\\Cache\\virtualenvs\\intelligence-toolkit-lXFNld9n-py3.11\\Lib\\site-packages\\tqdm\\auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
-      "  from .autonotebook import tqdm as notebook_tqdm\n"
+      "/home/ddesouza/Projects/intelligence-toolkit/.venv/lib/python3.11/site-packages/sentence_transformers/cross_encoder/CrossEncoder.py:13: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
+      "  from tqdm.autonotebook import tqdm, trange\n"
      ]
     }
    ],
    "source": [
     "import sys\n",
+    "\n",
     "sys.path.append(\"..\")\n",
     "import os\n",
     "from toolkit.detect_case_patterns import DetectCasePatterns\n",
@@ -50,6 +51,9 @@
    ],
    "source": [
     "# Create the workflow object\n",
+    "from toolkit.helpers import df_functions\n",
+    "\n",
+    "\n",
     "dcp = DetectCasePatterns()\n",
     "# Set the AI configuration\n",
     "ai_configuration = OpenAIConfiguration(\n",
@@ -64,7 +68,7 @@
     "data_path = \"../example_outputs/detect_case_patterns/customer_complaints/customer_complaints_prepared.csv\"\n",
     "case_data = pd.read_csv(data_path)\n",
     "# Map missing values and binary False to empty strings, since we only care about the presence of attributes\n",
-    "case_data = case_data.astype(str).replace(\"False\", \"\").replace(\"nan\", \"\")\n",
+    "case_data = df_functions.supress_boolean_binary(case_data)\n",
     "print(\"Loaded data\")"
    ]
   },
@@ -83,10 +87,7 @@
    ],
    "source": [
     "# Generate the graph model\n",
-    "dcp.generate_graph_model(\n",
-    "    df=case_data,\n",
-    "    period_col=\"period\"\n",
-    ")\n",
+    "dcp.generate_graph_model(df=case_data, period_col=\"period\")\n",
     "print(\"Generated graph model\")"
    ]
   },
@@ -124,10 +125,7 @@
    ],
    "source": [
     "# Detect the case patterns\n",
-    "dcp.detect_patterns(\n",
-    "    min_pattern_count=10,\n",
-    "    max_pattern_length=5\n",
-    ")\n",
+    "dcp.detect_patterns(min_pattern_count=10, max_pattern_length=5)\n",
     "print(\"Detected case patterns\")"
    ]
   },
@@ -219,6 +217,7 @@
      "name": "stdout",
      "output_type": "stream",
      "text": [
+      "Computing attribute counts for pattern: age_range:(50-60] & city:Springfield & delivery_issue:True & product_code:G & service_issue:True with period: 2023-H1 for period column: period\n",
       "           AttributeValue  Count\n",
       "0       age_range:(50-60]     18\n",
       "1        city:Springfield     18\n",
@@ -235,7 +234,7 @@
     "# Compute related attribute counts for the example pattern\n",
     "att_counts = dcp.compute_attribute_counts(\n",
     "    selected_pattern=example_pattern[\"pattern\"],\n",
-    "    selected_pattern_period=example_pattern[\"period\"]\n",
+    "    selected_pattern_period=example_pattern[\"period\"],\n",
     ")\n",
     "print(att_counts)"
    ]
@@ -249,23 +248,23 @@
      "name": "stderr",
      "output_type": "stream",
      "text": [
-      "c:\\Users\\daedge\\AppData\\Local\\pypoetry\\Cache\\virtualenvs\\intelligence-toolkit-lXFNld9n-py3.11\\Lib\\site-packages\\altair\\utils\\core.py:384: FutureWarning: the convert_dtype parameter is deprecated and will be removed in a future version.  Do ``ser.astype(object).apply()`` instead if you want ``convert_dtype=False``.\n",
+      "/home/ddesouza/Projects/intelligence-toolkit/.venv/lib/python3.11/site-packages/altair/utils/core.py:384: FutureWarning: the convert_dtype parameter is deprecated and will be removed in a future version.  Do ``ser.astype(object).apply()`` instead if you want ``convert_dtype=False``.\n",
       "  col = df[col_name].apply(to_list_if_array, convert_dtype=False)\n",
-      "c:\\Users\\daedge\\AppData\\Local\\pypoetry\\Cache\\virtualenvs\\intelligence-toolkit-lXFNld9n-py3.11\\Lib\\site-packages\\altair\\utils\\core.py:384: FutureWarning: the convert_dtype parameter is deprecated and will be removed in a future version.  Do ``ser.astype(object).apply()`` instead if you want ``convert_dtype=False``.\n",
+      "/home/ddesouza/Projects/intelligence-toolkit/.venv/lib/python3.11/site-packages/altair/utils/core.py:384: FutureWarning: the convert_dtype parameter is deprecated and will be removed in a future version.  Do ``ser.astype(object).apply()`` instead if you want ``convert_dtype=False``.\n",
       "  col = df[col_name].apply(to_list_if_array, convert_dtype=False)\n"
      ]
     },
     {
      "data": {
       "text/html": [
        "\n",
-       "<div id=\"altair-viz-e72620d0fd0146d9aaecd839822393ea\"></div>\n",
+       "<div id=\"altair-viz-ed50583bea9b4b46bdbb8a1b166cd7fb\"></div>\n",
        "<script type=\"text/javascript\">\n",
        "  var VEGA_DEBUG = (typeof VEGA_DEBUG == \"undefined\") ? {} : VEGA_DEBUG;\n",
        "  (function(spec, embedOpt){\n",
        "    let outputDiv = document.currentScript.previousElementSibling;\n",
-       "    if (outputDiv.id !== \"altair-viz-e72620d0fd0146d9aaecd839822393ea\") {\n",
-       "      outputDiv = document.getElementById(\"altair-viz-e72620d0fd0146d9aaecd839822393ea\");\n",
+       "    if (outputDiv.id !== \"altair-viz-ed50583bea9b4b46bdbb8a1b166cd7fb\") {\n",
+       "      outputDiv = document.getElementById(\"altair-viz-ed50583bea9b4b46bdbb8a1b166cd7fb\");\n",
        "    }\n",
        "    const paths = {\n",
        "      \"vega\": \"https://cdn.jsdelivr.net/npm//vega@5?noext\",\n",
@@ -324,10 +323,10 @@
     }
    ],
    "source": [
-    "# Create the time series chart \n",
+    "# Create the time series chart\n",
     "chart = dcp.create_time_series_chart(\n",
     "    selected_pattern=example_pattern[\"pattern\"],\n",
-    "    selected_pattern_period=example_pattern[\"period\"]\n",
+    "    selected_pattern_period=example_pattern[\"period\"],\n",
     ")\n",
     "chart"
    ]
@@ -341,39 +340,40 @@
      "name": "stdout",
      "output_type": "stream",
      "text": [
+      "Computing attribute counts for pattern: age_range:(50-60] & city:Springfield & delivery_issue:True & product_code:G & service_issue:True with period: 2023-H1 for period column: period\n",
       "# Pattern Report\n",
       "\n",
       "**Pattern: age_range:(50-60] & city:Springfield & delivery_issue:True & product_code:G & service_issue:True**\n",
       "\n",
-      "This pattern describes a group of cases where individuals aged between 50 and 60, living in Springfield, experienced both delivery and service issues with product code G.\n",
+      "This pattern identifies a group of individuals aged between 50 and 60 years old, residing in Springfield, who have experienced both delivery and service issues with product code G.\n",
       "\n",
       "## Pattern observation\n",
       "\n",
-      "The pattern was observed only in the first half of 2023, with 18 cases matching the pattern. In all other periods from 2020 to 2025, there were no cases matching this pattern. This sudden appearance in 2023-H1 suggests a specific issue or event during this time that affected this demographic and product combination.\n",
+      "The pattern was observed only in the first half of 2023, with 18 cases matching the pattern. In all other periods from 2020 to 2025, no cases were recorded. This sudden appearance in 2023-H1 suggests a specific issue or change during this time that affected this demographic and product.\n",
       "\n",
       "## Pattern context\n",
       "\n",
-      "In addition to the attributes defining the pattern, some cases also reported quality issues (7 cases), description issues (4 cases), and price issues (4 cases). This information suggests that, beyond delivery and service issues, there were other concerns with the product, which might have contributed to the overall dissatisfaction.\n",
+      "In addition to the attributes defining the pattern, some cases also reported quality issues (7 cases), description issues (4 cases), and price issues (4 cases). This information suggests that, beyond delivery and service problems, there were also concerns about the product's quality, description, and pricing. Understanding these additional issues can help in diagnosing the root causes of the pattern.\n",
       "\n",
       "## Possible explanations\n",
       "\n",
-      "1. A new batch of product G might have been released in early 2023, leading to increased delivery and service issues.\n",
-      "2. Changes in the delivery service provider or logistics in Springfield could have caused these issues.\n",
-      "3. A local event or policy change in Springfield might have impacted the delivery and service quality.\n",
-      "4. The demographic group aged 50-60 might have specific needs or expectations that were not met by product G.\n",
-      "5. There could have been a marketing campaign targeting this age group in Springfield, leading to increased purchases and subsequent issues.\n",
-      "6. A temporary staffing issue at the service center could have led to service problems during this period.\n",
-      "7. External factors, such as weather conditions or supply chain disruptions, might have affected deliveries in Springfield.\n",
+      "1. A change in the delivery process or service protocol for product G in Springfield during 2023-H1.\n",
+      "2. An increase in demand for product G among the 50-60 age group in Springfield, leading to service and delivery strains.\n",
+      "3. A specific event or promotion targeting this demographic in Springfield, resulting in increased purchases and subsequent issues.\n",
+      "4. A new supplier or logistics partner introduced in 2023-H1 that caused disruptions.\n",
+      "5. A regional event in Springfield that affected delivery and service operations.\n",
+      "6. Changes in the product itself, such as packaging or features, that led to increased complaints.\n",
+      "7. A data collection or reporting change in 2023-H1 that captured issues more effectively than in previous periods.\n",
       "\n",
       "## Suggested actions\n",
       "\n",
-      "1. Investigate the specific causes of delivery and service issues for product G in Springfield during 2023-H1.\n",
-      "2. Conduct a customer satisfaction survey targeting the affected demographic to gather more insights.\n",
-      "3. Review and improve the logistics and service processes for product G in Springfield.\n",
-      "4. Provide additional training to service staff to better handle issues related to product G.\n",
-      "5. Consider offering compensation or discounts to affected customers to restore trust.\n",
-      "6. Analyze the quality control processes for product G to address any underlying quality issues.\n",
-      "7. Monitor the situation closely in subsequent periods to ensure the issues do not recur.\n"
+      "1. Investigate the delivery and service processes for product G in Springfield to identify any changes or issues that arose in 2023-H1.\n",
+      "2. Conduct customer feedback sessions with the affected demographic to understand their specific concerns.\n",
+      "3. Review the quality control measures for product G to address the reported quality issues.\n",
+      "4. Analyze the product description and pricing strategies to ensure they meet customer expectations.\n",
+      "5. Collaborate with logistics partners to improve delivery reliability in Springfield.\n",
+      "6. Implement targeted customer service training to better handle issues related to product G.\n",
+      "7. Monitor the situation in subsequent periods to ensure that the pattern does not reoccur.\n"
      ]
     }
    ],
@@ -382,15 +382,15 @@
     "explanation = dcp.explain_pattern(\n",
     "    selected_pattern=example_pattern[\"pattern\"],\n",
     "    selected_pattern_period=example_pattern[\"period\"],\n",
-    "    ai_instructions=prompts.user_prompt\n",
+    "    ai_instructions=prompts.user_prompt,\n",
     ")\n",
     "print(explanation)"
    ]
   }
  ],
  "metadata": {
   "kernelspec": {
-   "display_name": "intelligence-toolkit-lXFNld9n-py3.11",
+   "display_name": ".venv",
    "language": "python",
    "name": "python3"
   },