diff --git a/python-snippet/bubble_chart_optbinning.ipynb b/python-snippet/bubble_chart_optbinning.ipynb
new file mode 100644
index 0000000..e9c6efb
--- /dev/null
+++ b/python-snippet/bubble_chart_optbinning.ipynb
@@ -0,0 +1,432 @@
+{
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import pandas as pd\n",
+ "from sklearn.linear_model import LogisticRegression\n",
+ "from sklearn.metrics import roc_auc_score\n",
+ "\n",
+ "# Load the German Credit dataset from the UCI repository.\n",
+ "url = \"https://archive.ics.uci.edu/ml/machine-learning-databases/statlog/german/german.data\"\n",
+ "column_names = [\n",
+ "    \"existing_account\",\n",
+ "    \"duration_month\",\n",
+ "    \"credit_history\",\n",
+ "    \"purpose\",\n",
+ "    \"credit_amount\",\n",
+ "    \"savings_account\",\n",
+ "    \"employment_since\",\n",
+ "    \"installment_rate\",\n",
+ "    \"personal_status_sex\",\n",
+ "    \"other_debtors\",\n",
+ "    \"present_residence\",\n",
+ "    \"property\",\n",
+ "    \"age\",\n",
+ "    \"other_installment_plans\",\n",
+ "    \"housing\",\n",
+ "    \"existing_credits\",\n",
+ "    \"job\",\n",
+ "    \"people_liable\",\n",
+ "    \"telephone\",\n",
+ "    \"foreign_worker\",\n",
+ "    \"credit_risk\",\n",
+ "]\n",
+ "\n",
+ "# The file is space-separated and is read without a header row; the target is\n",
+ "# recoded to binary in the same step (1 -> 0, 2 -> 1, anything else -> NaN).\n",
+ "df = pd.read_csv(url, sep=\" \", header=None, names=column_names).assign(\n",
+ "    credit_risk=lambda frame: frame['credit_risk'].map({1: 0, 2: 1})\n",
+ ")\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 18,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from sklearn.base import TransformerMixin, BaseEstimator\n",
+ "import logging\n",
+ "from optbinning import BinningProcess\n",
+ "import matplotlib.pyplot as plt\n",
+ "\n",
+ "class InformationValues(BaseEstimator, TransformerMixin):\n",
+ "    \"\"\"Scikit-learn style transformer wrapping optbinning's ``BinningProcess``.\n",
+ "\n",
+ "    ``fit`` bins every column of ``X`` against the target ``y`` using the same\n",
+ "    binning options for all features, then caches the per-bin tables\n",
+ "    (``t_woe``) and the per-feature Information Value totals (``t_iv``).\n",
+ "    ``transform`` delegates to the fitted ``BinningProcess``.\n",
+ "\n",
+ "    Parameters\n",
+ "    ----------\n",
+ "    monotonic_trend, max_n_bins, prebinning_method, min_event_rate_diff\n",
+ "        Forwarded unchanged to every feature through ``binning_fit_params``.\n",
+ "    \"\"\"\n",
+ "\n",
+ "    def __init__(self, monotonic_trend='auto_asc_desc', max_n_bins=5, prebinning_method='cart', min_event_rate_diff=0.0002):\n",
+ "        self.monotonic_trend = monotonic_trend\n",
+ "        self.max_n_bins = max_n_bins\n",
+ "        self.prebinning_method = prebinning_method\n",
+ "        self.min_event_rate_diff = min_event_rate_diff\n",
+ "        # Fitted state; stays None until fit() has run.\n",
+ "        self.binning_process = None\n",
+ "        self.t_iv = None\n",
+ "        self.t_woe = None\n",
+ "\n",
+ "    def _check_is_fitted(self):\n",
+ "        \"\"\"Raise ``NotFittedError`` when ``fit`` has not been called yet.\"\"\"\n",
+ "        if getattr(self, 'binning_process', None) is None:\n",
+ "            # NotFittedError subclasses both ValueError and AttributeError, so\n",
+ "            # callers that caught the previous AttributeError keep working.\n",
+ "            from sklearn.exceptions import NotFittedError\n",
+ "            raise NotFittedError('This InformationValues instance is not fitted yet. Call fit() first.')\n",
+ "\n",
+ "    def fit(self, X, y):\n",
+ "        \"\"\"Fit one binning per column of ``X`` and cache the WoE / IV tables.\n",
+ "\n",
+ "        Parameters\n",
+ "        ----------\n",
+ "        X : pandas.DataFrame\n",
+ "            Feature frame; object-dtype columns are treated as categorical.\n",
+ "        y : array-like\n",
+ "            Target passed straight to ``BinningProcess.fit``.\n",
+ "\n",
+ "        Returns\n",
+ "        -------\n",
+ "        self\n",
+ "        \"\"\"\n",
+ "        self.feats = X.columns.tolist()\n",
+ "        self.cat_feats = X.select_dtypes(include=['object']).columns.tolist()\n",
+ "        # A list comprehension (rather than a set difference) keeps column order.\n",
+ "        self.num_feats = [item for item in self.feats if item not in self.cat_feats]\n",
+ "        logging.info(\n",
+ "            'Processing Information Value, Total features %d. Categorical features %d. Numeric features %d',\n",
+ "            len(self.feats), len(self.cat_feats), len(self.num_feats),\n",
+ "        )\n",
+ "\n",
+ "        # Every feature gets its own copy of the same binning options.\n",
+ "        shared_params = {\n",
+ "            'monotonic_trend': self.monotonic_trend,\n",
+ "            'max_n_bins': self.max_n_bins,\n",
+ "            'prebinning_method': self.prebinning_method,\n",
+ "            'min_event_rate_diff': self.min_event_rate_diff,\n",
+ "        }\n",
+ "        binning_fit_params = {fs: dict(shared_params) for fs in self.feats}\n",
+ "\n",
+ "        binning_process = BinningProcess(\n",
+ "            variable_names=self.feats,\n",
+ "            categorical_variables=self.cat_feats,\n",
+ "            binning_fit_params=binning_fit_params,\n",
+ "        )\n",
+ "        binning_process.fit(X, y)\n",
+ "        self.binning_process = binning_process\n",
+ "\n",
+ "        # Collect each feature's full bin table, and its 'Totals' row for the IV summary.\n",
+ "        woe_tables, iv_tables = [], []\n",
+ "        for fs in self.binning_process.summary().name:\n",
+ "            tbl = self.binning_process.get_binned_variable(fs).binning_table.build().assign(variable=fs)\n",
+ "            woe_tables.append(tbl)\n",
+ "            iv_tables.append(tbl.loc[tbl.index == 'Totals', ['variable', 'IV']])\n",
+ "\n",
+ "        self.t_woe = pd.concat(woe_tables, axis=0)\n",
+ "        self.t_iv = pd.concat(iv_tables, axis=0)\n",
+ "        return self\n",
+ "\n",
+ "    def transform(self, X):\n",
+ "        \"\"\"Return ``X`` transformed by the fitted ``BinningProcess``.\n",
+ "\n",
+ "        Raises\n",
+ "        ------\n",
+ "        sklearn.exceptions.NotFittedError\n",
+ "            If ``fit`` has not been called.\n",
+ "        \"\"\"\n",
+ "        self._check_is_fitted()\n",
+ "        return self.binning_process.transform(X)\n",
+ "\n",
+ "    def get_feature_names_in(self):\n",
+ "        \"\"\"Return the column names seen by ``fit``.\"\"\"\n",
+ "        return self.feats\n",
+ "\n",
+ "    def get_feature_names_out(self, input_features=None):\n",
+ "        \"\"\"Return the output column names (identical to the input names).\n",
+ "\n",
+ "        ``input_features`` is accepted only so the method matches the signature\n",
+ "        scikit-learn uses when it calls this method; it is ignored.\n",
+ "        \"\"\"\n",
+ "        return self.feats\n",
+ "\n",
+ "    def get_table_woe(self):\n",
+ "        \"\"\"Return the concatenated per-bin tables, or ``None`` before ``fit``.\"\"\"\n",
+ "        return self.t_woe\n",
+ "\n",
+ "    def get_table_iv(self):\n",
+ "        \"\"\"Return the per-feature IV totals, or ``None`` before ``fit``.\"\"\"\n",
+ "        return self.t_iv\n",
+ "\n",
+ "    def get_binning_table(self, feature_name):\n",
+ "        \"\"\"Build and return the binning table of one fitted feature.\n",
+ "\n",
+ "        Raises\n",
+ "        ------\n",
+ "        sklearn.exceptions.NotFittedError\n",
+ "            If ``fit`` has not been called.\n",
+ "        ValueError\n",
+ "            If ``feature_name`` was not a column of the fitted data.\n",
+ "        \"\"\"\n",
+ "        self._check_is_fitted()\n",
+ "        if feature_name not in self.feats:\n",
+ "            raise ValueError(f\"Feature '{feature_name}' not found in the data.\")\n",
+ "        return self.binning_process.get_binned_variable(feature_name).binning_table.build()\n",
+ "\n",
+ "    @staticmethod\n",
+ "    def _bin_midpoint(bin_label):\n",
+ "        \"\"\"Map a numeric bin label such as ``'[735.50, 3913.50)'`` to an x position.\n",
+ "\n",
+ "        A bin with two finite bounds maps to their arithmetic mean; a bin with\n",
+ "        one infinite bound maps to its finite bound. Returns ``None`` for any\n",
+ "        label that is not a two-sided numeric interval ('Special', 'Missing',\n",
+ "        the empty 'Totals' label, non-string bins) and for ``(-inf, inf)``,\n",
+ "        which has no finite position to plot.\n",
+ "        \"\"\"\n",
+ "        try:\n",
+ "            lower_txt, upper_txt = bin_label.strip('[]()').split(', ')\n",
+ "            lower, upper = float(lower_txt), float(upper_txt)\n",
+ "        except (AttributeError, TypeError, ValueError):\n",
+ "            return None\n",
+ "\n",
+ "        neg_inf, pos_inf = float('-inf'), float('inf')\n",
+ "        if lower == neg_inf and upper == pos_inf:\n",
+ "            return None\n",
+ "        if upper == pos_inf:\n",
+ "            return lower\n",
+ "        if lower == neg_inf:\n",
+ "            return upper\n",
+ "        return (lower + upper) / 2\n",
+ "\n",
+ "    def plot_bubble_feature(self, feature_name):\n",
+ "        \"\"\"Draw a bubble chart of event rate per numeric bin for one feature.\n",
+ "\n",
+ "        The x position is the bin mid-point (see ``_bin_midpoint``), the y\n",
+ "        position is the bin's event rate, and the bubble area is proportional\n",
+ "        to the bin count. Rows without a numeric mid-point are skipped; when\n",
+ "        none remain a warning is logged and nothing is drawn.\n",
+ "\n",
+ "        Raises\n",
+ "        ------\n",
+ "        sklearn.exceptions.NotFittedError\n",
+ "            If ``fit`` has not been called.\n",
+ "        ValueError\n",
+ "            If ``feature_name`` was not a column of the fitted data.\n",
+ "        \"\"\"\n",
+ "        # get_binning_table validates both the fitted state and the feature name.\n",
+ "        binning_table = self.get_binning_table(feature_name)\n",
+ "\n",
+ "        plot_data = binning_table[['Bin', 'Count', 'Event rate']].copy()\n",
+ "        plot_data['MidPoint'] = plot_data['Bin'].apply(self._bin_midpoint)\n",
+ "        plot_data = plot_data.dropna(subset=['MidPoint'])\n",
+ "        if plot_data.empty:\n",
+ "            logging.warning('No numeric bins to plot for feature %s; skipping bubble chart.', feature_name)\n",
+ "            return\n",
+ "\n",
+ "        # assign() builds a new frame, so no write lands on the dropna() result.\n",
+ "        plot_data = plot_data.assign(\n",
+ "            MidPoint=plot_data['MidPoint'].astype(float),\n",
+ "            BubbleSize=plot_data['Count'] / plot_data['Count'].max() * 1000,\n",
+ "        )\n",
+ "\n",
+ "        fig, ax = plt.subplots(figsize=(10, 6))\n",
+ "        ax.scatter(plot_data['MidPoint'], plot_data['Event rate'], s=plot_data['BubbleSize'], alpha=0.5, color='pink')\n",
+ "\n",
+ "        # Label every bubble with its event rate as a percentage.\n",
+ "        for _, row in plot_data.iterrows():\n",
+ "            ax.text(row['MidPoint'], row['Event rate'], f\"{row['Event rate'] * 100:.2f}%\", ha='center', va='center')\n",
+ "\n",
+ "        ax.set_xlabel('Bin Mid-Point')\n",
+ "        ax.set_ylabel('Event Rate')\n",
+ "        ax.set_title(f'Bubble Plot of Event Rate by Bin for {feature_name}')\n",
+ "        ax.grid(True)\n",
+ "        plt.show()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 19,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "ROC AUC Score: 0.8022\n",
+ "Information Value Table\n",
+ " variable IV\n",
+ "Totals existing_account 0.587325\n",
+ "Totals duration_month 0.257173\n",
+ "Totals credit_history 0.368192\n",
+ "Totals purpose 0.229094\n",
+ "Totals credit_amount 0.182718\n",
+ "Totals savings_account 0.162927\n",
+ "Totals employment_since 0.066179\n",
+ "Totals installment_rate 0.011137\n",
+ "Totals personal_status_sex 0.045733\n",
+ "Totals other_debtors 0.008107\n",
+ "Totals present_residence 0.007464\n",
+ "Totals property 0.145759\n",
+ "Totals age 0.157060\n",
+ "Totals other_installment_plans 0.034371\n",
+ "Totals housing 0.110042\n",
+ "Totals existing_credits 0.016714\n",
+ "Totals job 0.009923\n",
+ "Totals people_liable 0.000045\n",
+ "Totals telephone 0.006171\n",
+ "Totals foreign_worker 0.000000\n"
+ ]
+ }
+ ],
+ "source": [
+ "import pandas as pd\n",
+ "from optbinning import BinningProcess\n",
+ "from sklearn.linear_model import LogisticRegression\n",
+ "from sklearn.model_selection import train_test_split\n",
+ "from sklearn.metrics import roc_auc_score\n",
+ "\n",
+ "# Relies on `df` and `InformationValues` defined in the cells above.\n",
+ "\n",
+ "# Separate the predictors from the binary target and hold out 30% for testing.\n",
+ "X = df.drop(columns='credit_risk')\n",
+ "y = df['credit_risk']\n",
+ "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)\n",
+ "\n",
+ "# Learn the bins on the training split only.\n",
+ "iv_transformer = InformationValues(monotonic_trend='auto_asc_desc', max_n_bins=5, prebinning_method='cart')\n",
+ "iv_transformer.fit(X_train, y_train)\n",
+ "\n",
+ "# Apply the fitted binning to both splits.\n",
+ "X_train_woe = iv_transformer.transform(X_train)\n",
+ "X_test_woe = iv_transformer.transform(X_test)\n",
+ "\n",
+ "# Logistic regression on the transformed features.\n",
+ "logreg = LogisticRegression(solver='liblinear')\n",
+ "logreg.fit(X_train_woe, y_train)\n",
+ "\n",
+ "# Score the hold-out split with ROC AUC on the second predict_proba column.\n",
+ "y_pred_prob = logreg.predict_proba(X_test_woe)[:, 1]\n",
+ "roc_auc = roc_auc_score(y_test, y_pred_prob)\n",
+ "print(f\"ROC AUC Score: {roc_auc:.4f}\")\n",
+ "\n",
+ "# Tables cached by the transformer during fit.\n",
+ "woe_table = iv_transformer.get_table_woe()\n",
+ "iv_table = iv_transformer.get_table_iv()\n",
+ "\n",
+ "print(\"Information Value Table\")\n",
+ "print(iv_table)\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 20,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " Bin | \n",
+ " Count | \n",
+ " Count (%) | \n",
+ " Non-event | \n",
+ " Event | \n",
+ " Event rate | \n",
+ " WoE | \n",
+ " IV | \n",
+ " JS | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " (-inf, 735.50) | \n",
+ " 36 | \n",
+ " 0.051429 | \n",
+ " 29 | \n",
+ " 7 | \n",
+ " 0.194444 | \n",
+ " 0.567276 | \n",
+ " 0.014505 | \n",
+ " 0.001789 | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " [735.50, 3913.50) | \n",
+ " 473 | \n",
+ " 0.675714 | \n",
+ " 352 | \n",
+ " 121 | \n",
+ " 0.255814 | \n",
+ " 0.213731 | \n",
+ " 0.029486 | \n",
+ " 0.003679 | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " [3913.50, 7839.50) | \n",
+ " 129 | \n",
+ " 0.184286 | \n",
+ " 83 | \n",
+ " 46 | \n",
+ " 0.356589 | \n",
+ " -0.263911 | \n",
+ " 0.013473 | \n",
+ " 0.001679 | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " [7839.50, inf) | \n",
+ " 62 | \n",
+ " 0.088571 | \n",
+ " 27 | \n",
+ " 35 | \n",
+ " 0.564516 | \n",
+ " -1.113621 | \n",
+ " 0.125254 | \n",
+ " 0.014895 | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " Special | \n",
+ " 0 | \n",
+ " 0.000000 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0.000000 | \n",
+ " 0.0 | \n",
+ " 0.000000 | \n",
+ " 0.000000 | \n",
+ "
\n",
+ " \n",
+ " 5 | \n",
+ " Missing | \n",
+ " 0 | \n",
+ " 0.000000 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0.000000 | \n",
+ " 0.0 | \n",
+ " 0.000000 | \n",
+ " 0.000000 | \n",
+ "
\n",
+ " \n",
+ " Totals | \n",
+ " | \n",
+ " 700 | \n",
+ " 1.000000 | \n",
+ " 491 | \n",
+ " 209 | \n",
+ " 0.298571 | \n",
+ " | \n",
+ " 0.182718 | \n",
+ " 0.022042 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " Bin Count Count (%) Non-event Event Event rate \\\n",
+ "0 (-inf, 735.50) 36 0.051429 29 7 0.194444 \n",
+ "1 [735.50, 3913.50) 473 0.675714 352 121 0.255814 \n",
+ "2 [3913.50, 7839.50) 129 0.184286 83 46 0.356589 \n",
+ "3 [7839.50, inf) 62 0.088571 27 35 0.564516 \n",
+ "4 Special 0 0.000000 0 0 0.000000 \n",
+ "5 Missing 0 0.000000 0 0 0.000000 \n",
+ "Totals 700 1.000000 491 209 0.298571 \n",
+ "\n",
+ " WoE IV JS \n",
+ "0 0.567276 0.014505 0.001789 \n",
+ "1 0.213731 0.029486 0.003679 \n",
+ "2 -0.263911 0.013473 0.001679 \n",
+ "3 -1.113621 0.125254 0.014895 \n",
+ "4 0.0 0.000000 0.000000 \n",
+ "5 0.0 0.000000 0.000000 \n",
+ "Totals 0.182718 0.022042 "
+ ]
+ },
+ "execution_count": 20,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Inspect the fitted bins of a single feature.\n",
+ "iv_transformer.get_binning_table(feature_name='credit_amount')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 21,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "image/png": "",
+ "text/plain": [
+ "