diff --git a/app/util/df_functions.py b/app/util/df_functions.py index 36a24861..72101c8c 100644 --- a/app/util/df_functions.py +++ b/app/util/df_functions.py @@ -1,29 +1,22 @@ # Copyright (c) 2024 Microsoft Corporation. All rights reserved. import math -import sys import re -import numpy as np + import pandas as pd from dateutil import parser as dateparser -def fix_null_ints(in_df): +def fix_null_ints(in_df: pd.DataFrame) -> pd.DataFrame: df = in_df.copy() for col, dt in zip(df.columns, df.dtypes, strict=False): if dt == "float64": - idf = df[[col]].copy() - idf["float"] = [x if not np.isnan(x) else 0 for x in idf[col]] - idf["int"] = [int(x) if not np.isnan(x) else 0 for x in idf[col]] - idf["float_s"] = [x if not np.isnan(x) else -sys.maxsize for x in idf[col]] - idf["int_s"] = [ - int(x) if not np.isnan(x) else -sys.maxsize for x in idf[col] - ] - fsum = idf["float"].sum() - isum = idf["int"].sum() - if int(fsum) == int(isum): - df[col] = idf["int_s"] + try: df[col] = df[col].astype("Int64") - df[col] = df[col].replace(-sys.maxsize, np.nan) + df[col] = df[col].where(pd.notna(df[col]), pd.NA) + df[col] = df[col].astype(str).replace("", "") + except Exception as e: + print(f"Error converting column {col} to Int64: {e}") + return df diff --git a/app/util/ui_components.py b/app/util/ui_components.py index b3f79b89..f90245d5 100644 --- a/app/util/ui_components.py +++ b/app/util/ui_components.py @@ -19,6 +19,7 @@ from toolkit.AI.classes import LLMCallback from toolkit.AI.client import OpenAIClient from toolkit.AI.defaults import DEFAULT_MAX_INPUT_TOKENS +from toolkit.helpers import df_functions from toolkit.helpers.texts import clean_for_column_name @@ -654,14 +655,7 @@ def prepare_stage(df_name): if not initialized or suppress_zeros: st.session_state[f"{workflow}_suppress_zeros"] = suppress_zeros - for col in last_df.columns: - unique_values = list([str(x) for x in last_df[col].unique()]) - is_three_with_none = len(unique_values) == 3 and last_df[col].isna().any() - if len(unique_values) <= 2 or is_three_with_none: - if "0" in unique_values or "0.0" in unique_values: - this_df[col] = last_df[col].astype(str).replace("0", np.nan).replace("0.0", np.nan) - elif 'False' in unique_values: - this_df[col] = last_df[col].astype(str).replace('False', np.nan) + this_df = df_functions.supress_boolean_binary(last_df, this_df) df_updated("suppress_null") if not suppress_zeros: for col in this_df.columns: diff --git a/example_notebooks/anonymize_case_data.ipynb b/example_notebooks/anonymize_case_data.ipynb index 5b143213..f1777781 100644 --- a/example_notebooks/anonymize_case_data.ipynb +++ b/example_notebooks/anonymize_case_data.ipynb @@ -15,11 +15,25 @@ "cell_type": "code", "execution_count": 1, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/home/ddesouza/Projects/intelligence-toolkit/.venv/lib/python3.11/site-packages/sentence_transformers/cross_encoder/CrossEncoder.py:13: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", + " from tqdm.autonotebook import tqdm, trange\n" + ] + } + ], "source": [ "import sys\n", + "\n", "sys.path.append(\"..\")\n", - "from toolkit.anonymize_case_data import AnonymizeCaseData, SynthesizabilityStatistics, color_schemes\n", + "from toolkit.anonymize_case_data import (\n", + " AnonymizeCaseData,\n", + " SynthesizabilityStatistics,\n", + " color_schemes,\n", + ")\n", "import pandas as pd" ] }, @@ -38,12 +52,16 @@ ], "source": [ "# Create the workflow object\n", + "\n", + "from toolkit.helpers import df_functions\n", + "\n", + "\n", "acd = AnonymizeCaseData()\n", "# Load the sensitive data\n", "data_path = \"../example_outputs/anonymize_case_data/customer_complaints/customer_complaints_prepared.csv\"\n", "sensitive_data = pd.read_csv(data_path)\n", "# Map missing values and binary False to empty strings, since we only care about the presence of attributes\n", - "sensitive_data = sensitive_data.astype(str).replace(\"False\", \"\").replace(\"nan\", \"\")\n", + "sensitive_data = df_functions.supress_boolean_binary(sensitive_data)\n", "print(\"Loaded data\")" ] }, @@ -62,7 +80,9 @@ ], "source": [ "# Check the synthesizabiluty of the data\n", - "synthesizability_stats: SynthesizabilityStatistics = acd.analyze_synthesizability(sensitive_data)\n", + "synthesizability_stats: SynthesizabilityStatistics = acd.analyze_synthesizability(\n", + " sensitive_data\n", + ")\n", "print(synthesizability_stats)" ] }, @@ -94,68 +114,94 @@ "metadata": {}, "outputs": [ { - "name": "stdout", - "output_type": "stream", - "text": [ - " selections protected_count\n", - "7161 record_count 3115\n", - "7072 age_range:(30-40] 1285\n", - "4221 period:2023-H2 1085\n", - "1689 period:2023-H1 1069\n", - "1149 quality_issue:True 957\n" - ] + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
selectionsprotected_count
6431record_count3007
6357age_range:(30-40]1303
1519period:2023-H11076
3747period:2023-H21076
2812service_issue:True954
\n", + "
" + ], + "text/plain": [ + " selections protected_count\n", + "6431 record_count 3007\n", + "6357 age_range:(30-40] 1303\n", + "1519 period:2023-H1 1076\n", + "3747 period:2023-H2 1076\n", + "2812 service_issue:True 954" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" } ], "source": [ "# Inspect the anonymous aggregate data\n", - "print(acd.aggregate_df.head())" + "acd.aggregate_df.head()" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - " city age_range price_issue quality_issue service_issue \\\n", - "0 Mountainview (40-50] True \n", - "1 Mountainview (40-50] True \n", - "2 (20-30] True \n", - "3 (20-30] True \n", - "4 (20-30] True \n", - "\n", - " delivery_issue description_issue product_code period \n", - "0 True True C 2023-H1 \n", - "1 True True C 2023-H2 \n", - "2 True True C 2023-H2 \n", - "3 True True C 2023-H2 \n", - "4 True True C 2023-H1 \n" - ] - } - ], - "source": [ - "# Inspect the anonymous synthetic data\n", - "print(acd.synthetic_df.head())" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " Length Count +/- Error Suppressed % Fabricated %\n", - "0 1 160.66 +/- 5.72 4.07 % 0.00 %\n", - "1 2 23.85 +/- 5.19 9.19 % 0.21 %\n", - "2 3 6.85 +/- 3.66 17.78 % 2.61 %\n", - "3 4 2.85 +/- 2.22 36.81 % 7.84 %\n", - "4 Overall 6.88 +/- 3.07 17.95 % 2.53 %\n" + "0 1 160.66 +/- 6.45 4.68 % 0.00 %\n", + "1 2 23.85 +/- 5.18 9.85 % 0.10 %\n", + "2 3 6.85 +/- 4.63 19.43 % 3.45 %\n", + "3 4 2.85 +/- 2.87 42.14 % 7.54 %\n", + "4 Overall 6.88 +/- 3.84 20.04 % 2.71 %\n" ] } ], @@ -172,38 +218,38 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 7, "metadata": {}, "outputs": [ { - "name": "stdout", - "output_type": "stream", - "text": [ - " Length Count +/- Error Suppressed % Fabricated %\n", - "0 1 160.66 +/- 5.72 4.07 % 0.00 %\n", - "1 2 23.85 +/- 11.05 9.19 % 0.11 %\n", - "2 3 6.85 +/- 5.19 18.58 % 1.29 %\n", - "3 4 2.85 +/- 2.65 40.03 % 4.69 %\n", - "4 Overall 6.88 +/- 4.54 18.98 % 1.14 %\n" - ] - } - ], - "source": [ - "# Inspect the error report for the synthetic data\n", - "# Length represents the length of the attribute value combination being counted\n", - "# Error represents the mean absolute error in the count of the attribute value combination,\n", - "# calculated as the absolute difference between the actual count and the anonymized/protected count divided by the actual count\n", - "# Suppressed % represents the percentage of attribute value combination counts that were suppressed, out of the total count of attribute value combinations\n", - "# Fabricated % represents the percentage of attribute value combination counts that were fabricated, out of the total count of attribute value combinations\n", - "\n", - "print(acd.synthetic_error_report)" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": {}, - "outputs": [ + "data": { + "text/html": [ + " \n", + " " + ] + }, + "metadata": {}, + "output_type": "display_data" + }, { "data": { "application/vnd.plotly.v1+json": { @@ -235,9 +281,9 @@ ], "xaxis": "x", "y": [ - 1285, - 767, - 764 + 1303, + 780, + 749 ], "yaxis": "y" }, @@ -264,60 +310,60 @@ ], "xaxis": "x", "y": [ - 1085, - 1069 + 1076, + 1076 ], "yaxis": "y" }, { "alignmentgroup": "True", - "hovertemplate": "Attribute=quality_issue
Attribute Value=%{x}
Count=%{y}", - "legendgroup": "quality_issue", + "hovertemplate": "Attribute=service_issue
Attribute Value=%{x}
Count=%{y}", + "legendgroup": "service_issue", "marker": { "color": "#85660D", "pattern": { "shape": "" } }, - "name": "quality_issue", - "offsetgroup": "quality_issue", + "name": "service_issue", + "offsetgroup": "service_issue", "orientation": "v", "showlegend": true, "textposition": "auto", "texttemplate": "%{y}", "type": "bar", "x": [ - "quality_issue:True" + "service_issue:True" ], "xaxis": "x", "y": [ - 957 + 954 ], "yaxis": "y" }, { "alignmentgroup": "True", - "hovertemplate": "Attribute=service_issue
Attribute Value=%{x}
Count=%{y}", - "legendgroup": "service_issue", + "hovertemplate": "Attribute=quality_issue
Attribute Value=%{x}
Count=%{y}", + "legendgroup": "quality_issue", "marker": { "color": "#782AB6", "pattern": { "shape": "" } }, - "name": "service_issue", - "offsetgroup": "service_issue", + "name": "quality_issue", + "offsetgroup": "quality_issue", "orientation": "v", "showlegend": true, "textposition": "auto", "texttemplate": "%{y}", "type": "bar", "x": [ - "service_issue:True" + "quality_issue:True" ], "xaxis": "x", "y": [ - 941 + 944 ], "yaxis": "y" }, @@ -343,7 +389,7 @@ ], "xaxis": "x", "y": [ - 932 + 936 ], "yaxis": "y" }, @@ -369,7 +415,7 @@ ], "xaxis": "x", "y": [ - 904 + 920 ], "yaxis": "y" }, @@ -395,7 +441,7 @@ ], "xaxis": "x", "y": [ - 723 + 725 ], "yaxis": "y" } @@ -1251,7 +1297,34 @@ } } } - } + }, + "text/html": [ + "
" + ] }, "metadata": {}, "output_type": "display_data" @@ -1260,20 +1333,20 @@ "source": [ "# Create example top attributes bar chart\n", "bar_chart, bar_chart_df = acd.get_bar_chart_fig(\n", - " selection=[], # Prefilter the dataset by adding attribute values here\n", + " selection=[], # Prefilter the dataset by adding attribute values here\n", " show_attributes=[],\n", " unit=\"Customer\",\n", " width=1000,\n", " height=600,\n", " scheme=color_schemes[\"Alphabet\"],\n", - " num_values=10\n", + " num_values=10,\n", ")\n", "bar_chart.show()" ] }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 8, "metadata": {}, "outputs": [ { @@ -1311,15 +1384,15 @@ ], "xaxis": "x", "y": [ - 12, - 18, - 24, - 49, + 7, 20, + 19, + 40, + 14, 77, - 331, - 351, - 13 + 328, + 356, + 17 ], "yaxis": "y" }, @@ -1351,14 +1424,14 @@ ], "xaxis": "x", "y": [ - 6, - 27, - 11, - 16, - 21, - 38, - 305, - 259, + 24, + 23, + 14, + 23, + 37, + 42, + 291, + 263, 6 ], "yaxis": "y" @@ -1391,15 +1464,15 @@ ], "xaxis": "x", "y": [ - 8, - 18, - 33, - 42, - 60, - 61, - 384, - 321, - 0 + 15, + 13, + 31, + 27, + 64, + 55, + 391, + 329, + 4 ], "yaxis": "y" }, @@ -1431,15 +1504,15 @@ ], "xaxis": "x", "y": [ - 13, - 32, - 36, - 20, - 61, + 16, + 44, + 26, + 21, 60, - 337, - 368, - 6 + 65, + 345, + 365, + 4 ], "yaxis": "y" }, @@ -1471,15 +1544,15 @@ ], "xaxis": "x", "y": [ - 9, - 43, - 11, - 50, - 39, - 37, - 318, - 368, - 11 + 13, + 54, + 33, + 68, + 47, + 34, + 325, + 359, + 26 ], "yaxis": "y" } @@ -2333,7 +2406,34 @@ } } } - } + }, + "text/html": [ + "
" + ] }, "metadata": {}, "output_type": "display_data" @@ -2342,20 +2442,26 @@ "source": [ "# Create example time series line chart\n", "line_chart, line_chart_df = acd.get_line_chart_fig(\n", - " selection=[], # Prefilter the dataset by adding attribute values here\n", - " series_attributes=[\"quality_issue\", \"price_issue\", \"service_issue\", \"delivery_issue\", \"description_issue\"],\n", + " selection=[], # Prefilter the dataset by adding attribute values here\n", + " series_attributes=[\n", + " \"quality_issue\",\n", + " \"price_issue\",\n", + " \"service_issue\",\n", + " \"delivery_issue\",\n", + " \"description_issue\",\n", + " ],\n", " time_attribute=\"period\",\n", " unit=\"Customer\",\n", " width=1000,\n", " height=600,\n", - " scheme=color_schemes[\"Alphabet\"]\n", + " scheme=color_schemes[\"Alphabet\"],\n", ")\n", "line_chart.show()" ] }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 9, "metadata": {}, "outputs": [ { @@ -2368,130 +2474,133 @@ { "link": { "color": [ - "#b6a5bf", - "#b3afb5", + "#b3aeb6", "#be89db", - "#b89fc5", - "#b6a4c0", - "#b99cc8", - "#b7a0c4", - "#bc90d4", - "#bc92d2", - "#ba97cd", - "#b99aca", - "#bf86de", - "#bb94d0", - "#b4adb7", - "#ba99cb", - "#bf88dc", - "#c476ee", - "#b99cc8", - "#b99aca", - "#b7a1c3", - "#b99aca", - "#b4aaba", + "#b5a7bd", + "#b6a3c1", + "#b3aeb6", + "#b5a8bc", + "#b6a5bf", "#b7a0c4", - "#bc90d4", - "#b89dc7", - "#b7a3c1", - "#b89fc5", - "#b4acb8", - "#b89ec6", - "#c085df", - "#b6a4c0", + "#bd8fd5", + "#b6a6be", + "#c083e1", + "#c180e4", + "#bc91d3", + "#b99bc9", + "#b5a9bb", + "#b2b0b4", + "#b2b2b2", + "#b2b2b2", "#b4abb9", - "#c475ef", - "#bc92d2", "#b2b2b2", - "#b89fc5", + "#bd8dd7", + "#b4abb9", + "#b99bc9", + "#bd8cd8", + "#ba96ce", + "#ba96ce", + "#c180e4", + "#c085df", + "#b7a0c4", "#b2b2b2", - "#c475ef", - "#bb93d1", - "#b7a2c2", - "#b6a6be", - "#ba99cb", - "#b99cc8", + "#b7a3c1", + "#bc90d4", "#b2b2b2", + "#b89dc7", + "#b7a0c4", "#bd8ed6", - "#c181e3", - "#c573f1", - "#b5aaba", - "#c181e3", - "#b3aeb6", - "#c27ee6", - "#bc91d3", - "#b89fc5", + "#bd8ed6", + "#b5a7bd", + "#c671f3", "#b2b2b2", "#bb93d1", + "#c27ee6", "#b2b2b2", - "#ba99cb", + "#c965ff", + "#bd8cd8", "#b2b2b2", "#b2b2b2", + "#bd8cd8", "#b2b2b2", + "#b5a9bb", + "#b99aca", "#b2b2b2", + "#b5a9bb", "#b2b2b2", + "#b5a8bc", + "#c084e0", + "#c181e3", + "#b5a7bd", + "#b4adb7", + "#b5a7bd", + "#c76cf8", + "#bd8cd8", "#b2b2b2", "#c965ff", + "#bc90d4", "#b2b2b2", - "#b6a6be", "#c965ff", - "#bd8cd8", - "#bb93d1", - "#c37be9", - "#b6a4c0", - "#c082e2", + "#c76df7", "#b2b2b2", "#b2b2b2", - "#be89db", + "#c475ef", "#b2b2b2", + "#ba99cb", + "#bf88dc", + "#c965ff", + "#b5aaba", + "#c965ff", + "#c965ff", + "#ba96ce", + "#ba96ce", "#c965ff", - "#b4aaba", - "#c084e0", "#b2b2b2", "#b2b2b2", - "#c379eb", "#c965ff", - "#c379eb", "#c965ff", "#b2b2b2", + "#bf86de", "#b2b2b2", "#c965ff", + "#c965ff", "#b2b2b2", + "#c084e0", + "#c965ff", "#b2b2b2", - "#c180e4", - "#b2b2b2", - "#b6a6be", "#b2b2b2", "#b2b2b2", - "#c965ff", - "#bb93d1", "#b2b2b2", "#b2b2b2" ], "customdata": [ [ - 20, - 0.17699115044247787 + 6, + 0.05454545454545454 ], [ - 5, - 0.046296296296296294 + 57, + 0.5428571428571428 ], [ - 55, - 0.5445544554455446 + 15, + 0.1485148514851485 ], [ - 19, - 0.25333333333333335 + 16, + 0.19753086419753085 ], [ - 13, - 0.18055555555555555 + 4, + 0.056338028169014086 ], [ - 16, - 0.2909090909090909 + 8, + 0.13114754098360656 + ], + [ + 10, + 0.1724137931034483 ], [ 13, @@ -2499,263 +2608,243 @@ ], [ 23, - 0.4423076923076923 + 0.46938775510204084 + ], + [ + 8, + 0.16666666666666666 ], [ - 21, - 0.42857142857142855 + 29, + 0.6170212765957447 ], [ - 16, - 0.35555555555555557 + 29, + 0.6590909090909091 ], [ - 14, - 0.3181818181818182 + 19, + 0.4318181818181818 ], [ - 24, - 0.5714285714285714 + 12, + 0.3 ], [ - 16, - 0.3902439024390244 + 5, + 0.125 ], [ - 3, - 0.07894736842105263 + 1, + 0.029411764705882353 ], [ - 12, - 0.3333333333333333 + 0, + 0 ], [ - 19, - 0.5588235294117647 + 0, + 0 ], [ - 27, - 0.7941176470588235 + 3, + 0.09375 ], [ - 10, - 0.29411764705882354 + 0, + 0 ], [ - 10, - 0.3225806451612903 + 15, + 0.4838709677419355 ], [ - 7, - 0.22580645161290322 + 3, + 0.0967741935483871 ], [ - 10, - 0.3225806451612903 + 9, + 0.3 ], [ - 3, - 0.10714285714285714 + 15, + 0.5 ], [ - 6, - 0.23076923076923078 + 11, + 0.36666666666666664 ], [ 11, - 0.44 + 0.3793103448275862 ], [ - 7, - 0.28 + 19, + 0.6551724137931034 ], [ - 5, - 0.2 + 16, + 0.5925925925925926 ], [ 6, - 0.25 + 0.23076923076923078 ], [ - 2, - 0.08695652173913043 + 0, + 0 ], [ - 6, - 0.2608695652173913 + 5, + 0.2 ], [ - 13, - 0.5909090909090909 + 11, + 0.4583333333333333 ], [ - 4, - 0.18181818181818182 + 0, + 0 ], [ - 2, - 0.09090909090909091 + 6, + 0.2727272727272727 ], [ - 17, - 0.8095238095238095 + 5, + 0.23809523809523808 ], [ - 9, - 0.42857142857142855 + 10, + 0.47619047619047616 ], [ - 0, - 0 + 10, + 0.47619047619047616 ], [ - 5, - 0.25 + 3, + 0.14285714285714285 + ], + [ + 17, + 0.85 ], [ 0, 0 ], - [ - 16, - 0.8 - ], [ 8, 0.4 ], [ - 4, - 0.21052631578947367 + 13, + 0.6842105263157895 ], [ - 3, - 0.16666666666666666 + 0, + 0 ], [ - 6, - 0.3333333333333333 + 18, + 1 ], [ - 5, - 0.29411764705882354 + 9, + 0.5 ], [ 0, 0 ], [ - 8, - 0.47058823529411764 + 0, + 0 ], [ - 11, - 0.6470588235294118 + 9, + 0.5 ], [ - 14, - 0.8235294117647058 + 0, + 0 ], [ 2, - 0.11764705882352941 - ], - [ - 11, - 0.6470588235294118 + 0.125 ], [ - 1, - 0.058823529411764705 - ], - [ - 11, - 0.6875 - ], - [ - 7, - 0.4375 - ], - [ - 4, - 0.25 + 5, + 0.3125 ], [ 0, 0 ], [ - 6, - 0.4 + 2, + 0.125 ], [ 0, 0 ], [ - 5, - 0.3333333333333333 + 2, + 0.13333333333333333 ], [ - 0, - 0 + 9, + 0.6 ], [ - 0, - 0 + 9, + 0.6428571428571429 ], [ - 0, - 0 + 2, + 0.15384615384615385 ], [ - 0, - 0 + 1, + 0.07692307692307693 ], [ - 0, - 0 + 2, + 0.15384615384615385 ], [ - 0, - 0 + 12, + 0.9230769230769231 ], [ - 12, - 1 + 6, + 0.5 ], [ 0, 0 ], [ - 2, - 0.16666666666666666 - ], - [ - 12, + 11, 1 ], - [ - 6, - 0.5 - ], [ 5, - 0.4166666666666667 + 0.45454545454545453 ], [ - 8, - 0.7272727272727273 + 0, + 0 ], [ - 2, - 0.18181818181818182 + 11, + 1 ], [ - 7, - 0.6363636363636364 + 10, + 0.9090909090909091 ], [ 0, @@ -2766,44 +2855,44 @@ 0 ], [ - 6, - 0.5454545454545454 + 8, + 0.8 ], [ 0, 0 ], [ - 10, - 1 + 3, + 0.3333333333333333 ], [ - 1, - 0.1 + 5, + 0.5555555555555556 ], [ - 6, - 0.6 + 9, + 1 ], [ - 0, - 0 + 1, + 0.1111111111111111 ], [ - 0, - 0 + 9, + 1 ], [ - 6, - 0.75 + 9, + 1 ], [ - 8, - 1 + 3, + 0.375 ], [ - 6, - 0.75 + 3, + 0.375 ], [ 8, @@ -2822,8 +2911,8 @@ 1 ], [ - 0, - 0 + 7, + 1 ], [ 0, @@ -2831,31 +2920,43 @@ ], [ 4, - 0.6666666666666666 + 0.5714285714285714 ], [ 0, 0 ], [ - 1, - 0.16666666666666666 + 6, + 1 ], [ - 0, - 0 + 5, + 1 ], [ 0, 0 ], + [ + 3, + 0.6 + ], [ 5, 1 ], [ - 2, - 0.4 + 0, + 0 + ], + [ + 0, + 0 + ], + [ + 0, + 0 ], [ 0, @@ -2868,275 +2969,267 @@ ], "hovertemplate": "city: %{source.label} + product_code: %{target.label} = %{value:.0f}
+ price_issue:True = %{customdata[0]}
Proportion = %{customdata[1]:.1%}", "source": [ - 11, 13, - 22, - 22, 20, - 13, - 22, - 13, - 16, 11, + 18, 20, - 13, 15, - 19, 13, - 9, - 19, - 24, 20, + 17, 11, - 2, + 13, + 13, + 13, + 18, 11, + 14, + 9, + 13, 11, 19, - 13, - 15, + 18, + 17, 21, - 13, - 12, - 19, 11, + 17, 2, - 2, - 24, - 18, - 18, - 13, + 17, 19, - 23, - 2, - 21, + 13, 12, - 20, - 9, - 21, + 12, + 13, + 14, + 7, 17, + 9, + 18, + 15, 7, - 20, - 16, + 19, 9, + 18, 21, + 18, + 2, 9, - 9, + 20, + 2, 12, + 11, + 2, 18, + 15, + 9, + 7, 14, + 15, 19, - 16, - 7, - 22, - 24, - 23, - 18, 19, - 12, - 23, + 15, + 16, 21, - 22, + 7, + 15, 17, + 2, + 16, + 2, + 21, + 15, + 11, + 16, 19, 16, 16, - 9, - 17, - 11, - 23, - 21, - 22, - 22, - 7, 14, - 7, - 20, - 24, - 21, + 18, + 19, 12, 14, - 9, - 7, - 14, - 24, - 18, + 17, 12, 14, + 21, + 9, 15, + 19, + 14, 2, - 21, - 18, - 7 + 11, + 9, + 20, + 9, + 2, + 12, + 16, + 20, + 20 ], "target": [ - 4, 3, 0, - 8, - 1, 4, + 1, + 8, 3, 1, 3, - 10, 0, - 0, - 5, 10, 8, - 5, + 4, + 0, 0, - 5, - 6, 3, - 6, 5, - 8, - 6, - 6, + 5, + 10, 8, 8, + 6, 10, 5, 5, 1, - 1, 4, - 6, - 0, + 8, 5, 5, - 8, - 6, - 0, - 10, 4, - 10, 6, - 3, + 6, 8, - 3, - 3, + 0, + 6, 1, + 4, + 0, 3, - 5, - 1, 10, - 8, + 3, 8, 6, - 4, - 5, 10, - 5, - 4, 8, 6, - 3, - 6, - 4, - 6, + 1, + 0, + 5, + 1, 6, + 5, 6, + 10, + 1, + 10, 1, 6, + 4, + 8, 8, + 0, + 10, 4, - 3, - 6, 5, + 5, + 6, 1, + 4, 10, - 1, - 1, + 6, + 5, + 0, 3, 0, + 4, + 3, + 3, 8, - 8, - 0, - 0, + 3, + 3, 1, - 8, 6, 10, 0, - 3, + 5, + 1, 1, - 8, 3, + 0, 8, + 10, 4, 10, + 3, + 1, + 4, 5 ], "value": [ - 113, - 108, + 110, + 105, 101, - 75, - 72, - 55, + 81, + 71, + 61, + 58, 55, - 52, 49, - 45, + 48, + 47, 44, - 42, - 41, - 38, - 36, - 34, - 34, + 44, + 40, + 40, 34, + 33, + 33, + 32, 31, 31, 31, - 28, + 30, + 30, + 30, + 29, + 29, + 27, + 26, 26, - 25, - 25, 25, 24, 23, - 23, - 22, - 22, 22, 21, 21, - 20, - 20, + 21, + 21, 20, 20, 20, 19, + 19, + 18, + 18, + 18, 18, 18, - 17, - 17, - 17, - 17, - 17, - 17, - 17, 17, 16, 16, 16, + 16, 15, 15, 15, - 15, - 15, - 14, - 14, 14, 13, 13, - 12, - 12, - 12, - 12, + 13, + 13, 12, 12, 11, @@ -3148,10 +3241,12 @@ 10, 10, 10, - 10, 9, - 8, - 8, + 9, + 9, + 9, + 9, + 9, 8, 8, 8, @@ -3159,15 +3254,18 @@ 7, 7, 7, + 7, + 7, 6, 6, - 6, - 6, - 6, 5, 5, 5, 5, + 5, + 5, + 4, + 4, 4 ] }, @@ -3188,16 +3286,13 @@ "Hilltop", "Hilltown", "Lakeside", - "Meadowland", "Meadowville", "Mountainview", - "Mountainville", "Riverbend", "Riverside", "Rivertown", "Seaside", "Springfield", - "Sunnyvale", "Valleyview" ], "line": { @@ -4036,7 +4131,34 @@ }, "width": 1000 } - } + }, + "text/html": [ + "
" + ] }, "metadata": {}, "output_type": "display_data" @@ -4045,14 +4167,14 @@ "source": [ "# Create example alluvial/flow chart of city-product relationships\n", "flow_chart, flow_chart_df = acd.get_flow_chart_fig(\n", - " selection=[], # Prefilter the dataset by adding attribute values here\n", + " selection=[], # Prefilter the dataset by adding attribute values here\n", " source_attribute=\"city\",\n", " target_attribute=\"product_code\",\n", " highlight_attribute=\"price_issue:True\",\n", " unit=\"Customer\",\n", " width=1000,\n", " height=600,\n", - " scheme=color_schemes[\"Alphabet\"]\n", + " scheme=color_schemes[\"Alphabet\"],\n", ")\n", "flow_chart.show()" ] @@ -4060,7 +4182,7 @@ ], "metadata": { "kernelspec": { - "display_name": "intelligence-toolkit-lXFNld9n-py3.11", + "display_name": ".venv", "language": "python", "name": "python3" }, diff --git a/example_notebooks/compare_case_groups.ipynb b/example_notebooks/compare_case_groups.ipynb index da17ef6d..81652161 100644 --- a/example_notebooks/compare_case_groups.ipynb +++ b/example_notebooks/compare_case_groups.ipynb @@ -49,8 +49,9 @@ "source": [ "# Create the workflow object\n", "import os\n", + "from toolkit.helpers import df_functions\n", "from toolkit.AI.openai_configuration import OpenAIConfiguration\n", - "\n", + "import pandas as pd\n", "\n", "ccg = CompareCaseGroups()\n", "\n", @@ -64,7 +65,8 @@ "ccg.set_ai_configuration(ai_configuration)\n", "\n", "data_path = \"../example_outputs/compare_case_groups/customer_complaints/customer_complaints_prepared.csv\"\n", - "customer_cases = pl.read_csv(data_path)\n", + "customer_cases = pd.read_csv(data_path)\n", + "customer_cases = pl.from_pandas(df_functions.supress_boolean_binary(customer_cases))\n", "print(\"Loaded data\")" ] }, @@ -127,6 +129,26 @@ "cell_type": "code", "execution_count": 5, "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "9646" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "len(ccg.model_df)" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, "outputs": [ { "data": { @@ -138,7 +160,7 @@ " white-space: pre-wrap;\n", "}\n", "\n", - "shape: (5, 10)
citygroup_countgroup_rankattribute_valueattribute_countattribute_rankperiod_windowperiod_window_countperiod_window_rankperiod_window_delta
stru32i32stru32i32stru32i32i32
"Baytown"850"delivery_issue…649"2020-H1"140
"Baytown"850"delivery_issue…256"2020-H1"030
"Baytown"850"description_is…556"2020-H1"140
"Baytown"850"description_is…341"2020-H1"030
"Baytown"850"price_issue:fa…556"2020-H1"140
" + "shape: (5, 10)
citygroup_countgroup_rankattribute_valueattribute_countattribute_rankperiod_windowperiod_window_countperiod_window_rankperiod_window_delta
stru32i32stru32i32stru32i32i32
"Baytown"850"delivery_issue…256"2020-H1"030
"Baytown"850"description_is…341"2020-H1"030
"Baytown"850"price_issue:Tr…350"2020-H1"020
"Baytown"850"product_code:A…138"2020-H1"020
"Baytown"850"product_code:D…227"2020-H1"020
" ], "text/plain": [ "shape: (5, 10)\n", @@ -149,24 +171,22 @@ "│ ┆ u32 ┆ i32 ┆ str ┆ ┆ str ┆ --- ┆ i32 ┆ --- │\n", "│ ┆ ┆ ┆ ┆ ┆ ┆ u32 ┆ ┆ i32 │\n", "╞═════════╪════════════╪═══════════╪═══════════╪═══╪═══════════╪═══════════╪═══════════╪═══════════╡\n", - "│ Baytown ┆ 8 ┆ 50 ┆ delivery_ ┆ … ┆ 2020-H1 ┆ 1 ┆ 4 ┆ 0 │\n", - "│ ┆ ┆ ┆ issue:fal ┆ ┆ ┆ ┆ ┆ │\n", - "│ ┆ ┆ ┆ se ┆ ┆ ┆ ┆ ┆ │\n", "│ Baytown ┆ 8 ┆ 50 ┆ delivery_ ┆ … ┆ 2020-H1 ┆ 0 ┆ 3 ┆ 0 │\n", - "│ ┆ ┆ ┆ issue:tru ┆ ┆ ┆ ┆ ┆ │\n", + "│ ┆ ┆ ┆ issue:Tru ┆ ┆ ┆ ┆ ┆ │\n", "│ ┆ ┆ ┆ e ┆ ┆ ┆ ┆ ┆ │\n", - "│ Baytown ┆ 8 ┆ 50 ┆ descripti ┆ … ┆ 2020-H1 ┆ 1 ┆ 4 ┆ 0 │\n", - "│ ┆ ┆ ┆ on_issue: ┆ ┆ ┆ ┆ ┆ │\n", - "│ ┆ ┆ ┆ false ┆ ┆ ┆ ┆ ┆ │\n", "│ Baytown ┆ 8 ┆ 50 ┆ descripti ┆ … ┆ 2020-H1 ┆ 0 ┆ 3 ┆ 0 │\n", "│ ┆ ┆ ┆ on_issue: ┆ ┆ ┆ ┆ ┆ │\n", - "│ ┆ ┆ ┆ true ┆ ┆ ┆ ┆ ┆ │\n", - "│ Baytown ┆ 8 ┆ 50 ┆ price_iss ┆ … ┆ 2020-H1 ┆ 1 ┆ 4 ┆ 0 │\n", - "│ ┆ ┆ ┆ ue:false ┆ ┆ ┆ ┆ ┆ │\n", + "│ ┆ ┆ ┆ True ┆ ┆ ┆ ┆ ┆ │\n", + "│ Baytown ┆ 8 ┆ 50 ┆ price_iss ┆ … ┆ 2020-H1 ┆ 0 ┆ 2 ┆ 0 │\n", + "│ ┆ ┆ ┆ ue:True ┆ ┆ ┆ ┆ ┆ │\n", + "│ Baytown ┆ 8 ┆ 50 ┆ product_c ┆ … ┆ 2020-H1 ┆ 0 ┆ 2 ┆ 0 │\n", + "│ ┆ ┆ ┆ ode:A ┆ ┆ ┆ ┆ ┆ │\n", + "│ Baytown ┆ 8 ┆ 50 ┆ product_c ┆ … ┆ 2020-H1 ┆ 0 ┆ 2 ┆ 0 │\n", + "│ ┆ ┆ ┆ ode:D ┆ ┆ ┆ ┆ ┆ │\n", "└─────────┴────────────┴───────────┴───────────┴───┴───────────┴───────────┴───────────┴───────────┘" ] }, - "execution_count": 5, + "execution_count": 6, "metadata": {}, "output_type": "execute_result" } @@ -177,7 +197,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 7, "metadata": {}, "outputs": [ { @@ -186,7 +206,7 @@ "'This table shows:\\n- A summary of all **2769** data records with values for all grouping attributes\\n- The **group_count** of records for all [**city**] groups, and corresponding **group_rank**\\n- The **attribute_count** of each **attribute_value** for all [**city**] groups, and corresponding **attribute_rank**\\n- The **period_window_count** of each **attribute_value** for each **period_window** for all [**city**] groups, and corresponding **period_window_rank**\\n- The **period_window_delta**, or change in the **attribute_value_count** for successive **period_window** values, within each [**city**] group'" ] }, - "execution_count": 6, + "execution_count": 7, "metadata": {}, "output_type": "execute_result" } @@ -197,76 +217,125 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 8, "metadata": {}, "outputs": [], "source": [ "# Select groups to generate reports\n", "# By group name\n", - "selected_groups = [\"Lakeside\"]\n", + "selected_groups = [{\"city\": \"Lakeside\"}]\n", "# OR\n", "# By top n groups\n", "top_group_ranks = 10\n", "\n", - "report_data, filter_description = ccg.get_report_data(selected_groups=selected_groups)" + "report_data, filter_description = ccg.get_report_data(top_group_ranks=top_group_ranks)" ] }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 9, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "# Group Comparison Report: Lakeside\n", + "# Group Comparison Report\n", "\n", "## Introduction\n", "\n", - "This report focuses on the dataset filtered to include only the city group \"Lakeside.\" The dataset provides a comprehensive overview of various issues and product codes over different time periods, from the first half of 2020 to the second half of 2026. The analysis includes counts, ranks, and changes in these attributes over time.\n", + "This report provides a detailed comparison of the top 10 city groups based on record count from a dataset containing 2769 records. The dataset includes information on various issues and product codes across different time periods. The focus is on analyzing the trends and changes in these attributes over time for each city group.\n", + "\n", + "## Group Filters\n", "\n", - "## Data Summary\n", + "The dataset is filtered to include only the top 10 city groups by record count. These groups are:\n", "\n", - "The dataset consists of 349 records for the city group \"Lakeside.\" The analysis covers several attributes, including delivery issues, description issues, price issues, product codes, quality issues, and service issues. Each attribute is evaluated for its occurrence and rank within the group, as well as its changes over successive time periods.\n", + "1. Lakeside (349 records)\n", + "2. Springfield (265 records)\n", + "3. Hilltop (259 records)\n", + "4. Rivertown (204 records)\n", + "5. Riverside (184 records)\n", + "6. Seaside (127 records)\n", + "7. Mountainview (119 records)\n", + "8. Brookside (111 records)\n", + "9. Greenfield (104 records)\n", + "10. Meadowville (94 records)\n", "\n", "## Key Findings\n", "\n", - "### Delivery Issues\n", + "### Lakeside\n", + "\n", + "- **Record Count**: Lakeside has the highest record count with 349 records.\n", + "- **Top Issues**: Quality and service issues are the most frequent, each with 123 occurrences (rank 1).\n", + "- **Product Codes**: Product code C is the most common with 102 occurrences (rank 1).\n", + "- **Trends**: There was a significant increase in price issues in 2023-H1 (31 occurrences, delta +27) and a subsequent decrease in 2024-H1 (0 occurrences, delta -48).\n", + "\n", + "### Springfield\n", + "\n", + "- **Record Count**: Springfield ranks second with 265 records.\n", + "- **Top Issues**: Quality issues are the most frequent with 96 occurrences (rank 2).\n", + "- **Product Codes**: Product code A is the most common with 98 occurrences (rank 1).\n", + "- **Trends**: There was a notable increase in delivery issues in 2023-H1 (44 occurrences, delta +43) followed by a decrease in 2024-H1 (0 occurrences, delta -38).\n", + "\n", + "### Hilltop\n", + "\n", + "- **Record Count**: Hilltop ranks third with 259 records.\n", + "- **Top Issues**: Delivery issues are the most frequent with 104 occurrences (rank 1).\n", + "- **Product Codes**: Product code D is the most common with 107 occurrences (rank 1).\n", + "- **Trends**: There was a significant increase in service issues in 2023-H2 (55 occurrences, delta +39).\n", + "\n", + "### Rivertown\n", + "\n", + "- **Record Count**: Rivertown ranks fourth with 204 records.\n", + "- **Top Issues**: Quality issues are the most frequent with 92 occurrences (rank 3).\n", + "- **Product Codes**: Product code B is the most common with 79 occurrences (rank 1).\n", + "- **Trends**: There was a substantial increase in price issues in 2023-H1 (39 occurrences, delta +35) followed by a decrease in 2023-H2 (17 occurrences, delta -22).\n", + "\n", + "### Riverside\n", "\n", - "- **False Delivery Issues**: The count of records without delivery issues is consistently high, with a peak in the second half of 2023 (124 records, rank 1). There is a significant increase in the first half of 2023 (76 records, delta +63) compared to the previous period.\n", - "- **True Delivery Issues**: The count of records with delivery issues remains relatively low, with a notable increase in the second half of 2023 (40 records, rank 2, delta +22).\n", + "- **Record Count**: Riverside ranks fifth with 184 records.\n", + "- **Top Issues**: Price issues are the most frequent with 76 occurrences (rank 3).\n", + "- **Product Codes**: Product code A is the most common with 41 occurrences (rank 3).\n", + "- **Trends**: There was a notable increase in description issues in 2023-H1 (39 occurrences, delta +37).\n", "\n", - "### Description Issues\n", + "### Seaside\n", "\n", - "- **False Description Issues**: The records without description issues show a similar pattern to delivery issues, with a peak in the second half of 2023 (131 records, rank 1). The first half of 2023 also shows a substantial increase (76 records, delta +63).\n", - "- **True Description Issues**: The records with description issues increase in the second half of 2023 (33 records, rank 1, delta +15).\n", + "- **Record Count**: Seaside ranks sixth with 127 records.\n", + "- **Top Issues**: Service issues are the most frequent with 44 occurrences (rank 6).\n", + "- **Product Codes**: Product code G is the most common with 25 occurrences (rank 5).\n", + "- **Trends**: There was a significant increase in delivery issues in 2023-H1 (23 occurrences, delta +18).\n", "\n", - "### Price Issues\n", + "### Mountainview\n", "\n", - "- **False Price Issues**: The count of records without price issues peaks in the second half of 2023 (116 records, rank 1), with a significant increase in the first half of 2023 (63 records, delta +49).\n", - "- **True Price Issues**: The records with price issues also increase in the second half of 2023 (48 records, rank 1, delta +17).\n", + "- **Record Count**: Mountainview ranks seventh with 119 records.\n", + "- **Top Issues**: Delivery issues are the most frequent with 58 occurrences (rank 4).\n", + "- **Product Codes**: Product code C is the most common with 54 occurrences (rank 2).\n", + "- **Trends**: There was a substantial increase in description issues in 2023-H1 (32 occurrences, delta +28).\n", "\n", - "### Product Codes\n", + "### Brookside\n", "\n", - "- **Product Code A**: The count of records with product code A increases in the second half of 2023 (29 records, rank 1, delta +21).\n", - "- **Product Code B**: The records with product code B show a steady increase, peaking in the second half of 2023 (26 records, rank 1, delta +2).\n", - "- **Product Code C**: The count of records with product code C increases in the second half of 2023 (27 records, rank 2, delta +4).\n", - "- **Product Code D**: The records with product code D show a significant increase in the second half of 2023 (35 records, rank 2, delta +28).\n", + "- **Record Count**: Brookside ranks eighth with 111 records.\n", + "- **Top Issues**: Price issues are the most frequent with 45 occurrences (rank 6).\n", + "- **Product Codes**: Product code F is the most common with 23 occurrences (rank 4).\n", + "- **Trends**: There was a significant increase in price issues in 2023-H1 (27 occurrences, delta +27).\n", "\n", - "### Quality Issues\n", + "### Greenfield\n", "\n", - "- **False Quality Issues**: The records without quality issues peak in the second half of 2023 (86 records, rank 2, delta +26).\n", - "- **True Quality Issues**: The records with quality issues increase significantly in the second half of 2023 (78 records, rank 1, delta +44).\n", + "- **Record Count**: Greenfield ranks ninth with 104 records.\n", + "- **Top Issues**: Description issues are the most frequent with 38 occurrences (rank 5).\n", + "- **Product Codes**: Product code E is the most common with 25 occurrences (rank 4).\n", + "- **Trends**: There was a notable increase in description issues in 2023-H1 (20 occurrences, delta +19).\n", "\n", - "### Service Issues\n", + "### Meadowville\n", "\n", - "- **False Service Issues**: The count of records without service issues peaks in the second half of 2023 (117 records, rank 1, delta +47).\n", - "- **True Service Issues**: The records with service issues also increase significantly in the second half of 2023 (47 records, rank 2, delta +23).\n", + "- **Record Count**: Meadowville ranks tenth with 94 records.\n", + "- **Top Issues**: Price issues are the most frequent with 30 occurrences (rank 11).\n", + "- **Product Codes**: Product code E is the most common with 33 occurrences (rank 1).\n", + "- **Trends**: There was a significant increase in delivery issues in 2023-H2 (17 occurrences, delta +11).\n", "\n", "## Conclusion\n", "\n", - "The analysis of the Lakeside group reveals significant trends in various issues and product codes over time. Notably, there are substantial increases in records without delivery, description, and price issues in the first half of 2023, followed by peaks in the second half of 2023. Similarly, records with true issues also show notable increases during these periods. The product codes exhibit varying trends, with some codes showing significant increases in the latter half of 2023. Overall, the data provides valuable insights into the dynamics of issues and product codes within the Lakeside group over the analyzed periods.\n" + "The analysis reveals distinct patterns and trends in issues and product codes across the top 10 city groups. Lakeside and Springfield show the highest record counts, with quality and service issues being prevalent. Significant changes in issue occurrences are observed in various periods, indicating potential areas for further investigation and improvement.\n" ] } ], diff --git a/example_notebooks/detect_case_patterns.ipynb b/example_notebooks/detect_case_patterns.ipynb index 3ecfcae6..a18bd55f 100644 --- a/example_notebooks/detect_case_patterns.ipynb +++ b/example_notebooks/detect_case_patterns.ipynb @@ -20,13 +20,14 @@ "name": "stderr", "output_type": "stream", "text": [ - "c:\\Users\\daedge\\AppData\\Local\\pypoetry\\Cache\\virtualenvs\\intelligence-toolkit-lXFNld9n-py3.11\\Lib\\site-packages\\tqdm\\auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", - " from .autonotebook import tqdm as notebook_tqdm\n" + "/home/ddesouza/Projects/intelligence-toolkit/.venv/lib/python3.11/site-packages/sentence_transformers/cross_encoder/CrossEncoder.py:13: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", + " from tqdm.autonotebook import tqdm, trange\n" ] } ], "source": [ "import sys\n", + "\n", "sys.path.append(\"..\")\n", "import os\n", "from toolkit.detect_case_patterns import DetectCasePatterns\n", @@ -50,6 +51,9 @@ ], "source": [ "# Create the workflow object\n", + "from toolkit.helpers import df_functions\n", + "\n", + "\n", "dcp = DetectCasePatterns()\n", "# Set the AI configuration\n", "ai_configuration = OpenAIConfiguration(\n", @@ -64,7 +68,7 @@ "data_path = \"../example_outputs/detect_case_patterns/customer_complaints/customer_complaints_prepared.csv\"\n", "case_data = pd.read_csv(data_path)\n", "# Map missing values and binary False to empty strings, since we only care about the presence of attributes\n", - "case_data = case_data.astype(str).replace(\"False\", \"\").replace(\"nan\", \"\")\n", + "case_data = df_functions.supress_boolean_binary(case_data)\n", "print(\"Loaded data\")" ] }, @@ -83,10 +87,7 @@ ], "source": [ "# Generate the graph model\n", - "dcp.generate_graph_model(\n", - " df=case_data,\n", - " period_col=\"period\"\n", - ")\n", + "dcp.generate_graph_model(df=case_data, period_col=\"period\")\n", "print(\"Generated graph model\")" ] }, @@ -124,10 +125,7 @@ ], "source": [ "# Detect the case patterns\n", - "dcp.detect_patterns(\n", - " min_pattern_count=10,\n", - " max_pattern_length=5\n", - ")\n", + "dcp.detect_patterns(min_pattern_count=10, max_pattern_length=5)\n", "print(\"Detected case patterns\")" ] }, @@ -219,6 +217,7 @@ "name": "stdout", "output_type": "stream", "text": [ + "Computing attribute counts for pattern: age_range:(50-60] & city:Springfield & delivery_issue:True & product_code:G & service_issue:True with period: 2023-H1 for period column: period\n", " AttributeValue Count\n", "0 age_range:(50-60] 18\n", "1 city:Springfield 18\n", @@ -235,7 +234,7 @@ "# Compute related attribute counts for the example pattern\n", "att_counts = dcp.compute_attribute_counts(\n", " selected_pattern=example_pattern[\"pattern\"],\n", - " selected_pattern_period=example_pattern[\"period\"]\n", + " selected_pattern_period=example_pattern[\"period\"],\n", ")\n", "print(att_counts)" ] @@ -249,9 +248,9 @@ "name": "stderr", "output_type": "stream", "text": [ - "c:\\Users\\daedge\\AppData\\Local\\pypoetry\\Cache\\virtualenvs\\intelligence-toolkit-lXFNld9n-py3.11\\Lib\\site-packages\\altair\\utils\\core.py:384: FutureWarning: the convert_dtype parameter is deprecated and will be removed in a future version. Do ``ser.astype(object).apply()`` instead if you want ``convert_dtype=False``.\n", + "/home/ddesouza/Projects/intelligence-toolkit/.venv/lib/python3.11/site-packages/altair/utils/core.py:384: FutureWarning: the convert_dtype parameter is deprecated and will be removed in a future version. Do ``ser.astype(object).apply()`` instead if you want ``convert_dtype=False``.\n", " col = df[col_name].apply(to_list_if_array, convert_dtype=False)\n", - "c:\\Users\\daedge\\AppData\\Local\\pypoetry\\Cache\\virtualenvs\\intelligence-toolkit-lXFNld9n-py3.11\\Lib\\site-packages\\altair\\utils\\core.py:384: FutureWarning: the convert_dtype parameter is deprecated and will be removed in a future version. Do ``ser.astype(object).apply()`` instead if you want ``convert_dtype=False``.\n", + "/home/ddesouza/Projects/intelligence-toolkit/.venv/lib/python3.11/site-packages/altair/utils/core.py:384: FutureWarning: the convert_dtype parameter is deprecated and will be removed in a future version. Do ``ser.astype(object).apply()`` instead if you want ``convert_dtype=False``.\n", " col = df[col_name].apply(to_list_if_array, convert_dtype=False)\n" ] }, @@ -259,13 +258,13 @@ "data": { "text/html": [ "\n", - "
\n", + "
\n", "