From 232b8a81149bb02c2b68ea168b97ddae74b10b87 Mon Sep 17 00:00:00 2001
From: itlubber <1830611168@qq.com>
Date: Fri, 2 Aug 2024 00:41:39 +0800
Subject: [PATCH 1/2] add ExhaustiveSelector method

---
 scorecardpipeline/feature_selection.py | 231 ++++++++++++++++++++++++-
 1 file changed, 225 insertions(+), 6 deletions(-)

diff --git a/scorecardpipeline/feature_selection.py b/scorecardpipeline/feature_selection.py
index 8663f28..afa03e9 100644
--- a/scorecardpipeline/feature_selection.py
+++ b/scorecardpipeline/feature_selection.py
@@ -5,25 +5,34 @@
 @Site : itlubber.art
 """
+import operator as op
+import sys
+import types
+from copy import deepcopy
+from functools import reduce
+from itertools import chain, combinations
 from functools import partial
 from abc import ABCMeta, abstractmethod

 import math
 import numpy as np
 import pandas as pd
-from copy import deepcopy
 from joblib import Parallel, delayed
-from sklearn.utils import _safe_indexing
+from scipy.stats import sem
+from scipy.stats import t
+from sklearn.metrics import check_scoring, get_scorer
+from sklearn.model_selection import cross_val_score
 from sklearn.utils._encode import _unique
 from sklearn.utils._mask import _get_mask
 from sklearn.model_selection import check_cv
 from sklearn.preprocessing import LabelEncoder
 from sklearn.linear_model import LinearRegression
+from sklearn.utils import _safe_indexing, check_X_y
 from sklearn.model_selection import StratifiedKFold, GroupKFold
 from sklearn.utils.sparsefuncs import mean_variance_axis, min_max_axis
 from sklearn.utils.validation import check_is_fitted, check_array, indexable
-from sklearn.base import BaseEstimator, TransformerMixin, clone, is_classifier
+from sklearn.base import BaseEstimator, TransformerMixin, clone, is_classifier, MetaEstimatorMixin
 from sklearn.feature_selection import RFECV, RFE, SelectFromModel, SelectKBest
 from sklearn.feature_selection._from_model import _calculate_threshold, _get_feature_importances
 # from statsmodels.stats.outliers_influence import variance_inflation_factor
@@ -699,11 +708,209 @@ def __init__(self, estimator, target="target", threshold=1.0, norm_order=1, impo
         super().__init__(estimator, target=target, threshold=threshold, norm_order=norm_order, importance_getter=importance_getter, cv=cv, n_runs=n_runs, **kwargs)


-class BorutaSelector(SelectorMixin):
-    pass
+class ExhaustiveSelector(SelectorMixin, MetaEstimatorMixin):
+    """Exhaustive feature selection for classification and regression.
+
+    Parameters
+    ----------
+    estimator : scikit-learn classifier or regressor
+
+    min_features : int (default: 1)
+        Minimum number of features to select.
+
+    max_features : int (default: 1)
+        Maximum number of features to select.
+
+    verbose : int (default: 0)
+        If non-zero, prints one progress line per evaluated feature subset to stdout.
+
+    scoring : str or callable (default: 'accuracy')
+        Scoring metric in {'accuracy', 'f1', 'precision', 'recall', 'roc_auc'} for classifiers,
+        {'neg_mean_absolute_error', 'neg_mean_squared_error', 'neg_median_absolute_error', 'r2'} for regressors,
+        or a callable object or function with signature ``scorer(estimator, X, y)``.
+
+    cv : int (default: 3)
+        Scikit-learn cross-validation generator or `int`.
+        If the estimator is a classifier (or y consists of integer class labels), stratified k-fold is performed; otherwise regular k-fold cross-validation is used.
+        No cross-validation if cv is None, False, or 0.
+
+    n_jobs : int (default: None)
+        The number of CPUs to use for evaluating different feature subsets in parallel. -1 means 'all CPUs'.
+
+    pre_dispatch : int or str (default: '2*n_jobs')
+        Controls the number of jobs that get dispatched during parallel execution if `n_jobs > 1` or `n_jobs=-1`.
+        Reducing this number can be useful to avoid an explosion of memory consumption when more jobs get dispatched than CPUs can process.
+        This parameter can be:
+        None, in which case all the jobs are immediately created and spawned; use this for lightweight and fast-running jobs to avoid delays due to on-demand spawning of the jobs.
+        An int, giving the exact number of total jobs that are spawned.
+        A str, giving an expression as a function of n_jobs, as in `2*n_jobs`.
+
+    Attributes
+    ----------
+    subset_info_ : list of dicts
+        One dictionary per evaluated feature subset, with the following keys:
+        'support_mask', boolean mask of the features in the subset;
+        'cv_scores', cross-validation scores of the subset;
+        'avg_score', mean of the cross-validation scores.
+
+    support_mask_ : array-like of booleans
+        Boolean mask of the finally chosen features.
+
+    best_idx_ : int
+        Index of the best feature subset in ``subset_info_``.
+
+    best_score_ : float
+        Cross-validation average score of the selected subset.
+
+    best_feature_indices_ : array-like, shape = (n_selected_features,)
+        Feature indices of the selected feature subset.
+
+    Examples
+    --------
+    >>> from sklearn.neighbors import KNeighborsClassifier
+    >>> from sklearn.datasets import load_iris
+    >>> from scorecardpipeline.feature_selection import ExhaustiveSelector
+    >>> X, y = load_iris(return_X_y=True, as_frame=True)
+    >>> knn = KNeighborsClassifier(n_neighbors=3)
+    >>> efs = ExhaustiveSelector(knn, min_features=1, max_features=4, cv=3)
+    >>> efs.fit(X, y)
+    ExhaustiveSelector(estimator=KNeighborsClassifier(n_neighbors=3), max_features=4)
+    >>> efs.best_score_
+    0.9733333333333333
+    >>> efs.best_idx_
+    12
+    """
+    def __init__(self, estimator, min_features=1, max_features=1, scoring="accuracy", cv=3, verbose=0, n_jobs=None, pre_dispatch='2*n_jobs'):
+        self.estimator = estimator
+        self.min_features = min_features
+        self.max_features = max_features
+        self.scoring = scoring
+        self.cv = cv
+        self.verbose = verbose
+        self.n_jobs = n_jobs
+        self.pre_dispatch = pre_dispatch
+
+    def _validate_params(self, x, y):
+        x, y = check_X_y(x, y, estimator=self.estimator)
+        _, n_features = x.shape
+        if not isinstance(self.max_features, int) or self.max_features > n_features or self.max_features < 1:
+            raise AttributeError("max_features must be an int between 1 and %d" % n_features)
+        if not isinstance(self.min_features, int) or self.min_features > n_features or self.min_features < 1:
+            raise AttributeError("min_features must be an int between 1 and %d" % n_features)
+        if self.max_features < self.min_features:
+            raise AttributeError("min_features must be less than or equal to max_features")
+        return x, y
+
+    @staticmethod
+    def _calc_score(estimator, x, y, indices, groups=None, scoring=None, cv=None, **fit_params):
+        _, n_features = x.shape
+        mask = np.isin(np.arange(n_features), indices)
+        x = x[:, mask]
+
+        if cv is None:
+            # no cross-validation: fit once and score on the training data
+            try:
+                estimator.fit(x, y, **fit_params)
+            except Exception:
+                scores = np.nan
+            else:
+                # scoring is a scorer callable produced by check_scoring
+                scores = scoring(estimator, x, y)
+            scores = np.asarray([scores], dtype=np.float64)
+        else:
+            scores = cross_val_score(estimator, x, y, groups=groups, cv=cv, scoring=scoring, n_jobs=None, pre_dispatch='2*n_jobs', error_score=np.nan, fit_params=fit_params)
+
+        return mask, scores
+
+    @staticmethod
+    def ncr(n, r):
+        """Return the number of combinations of length r from n items.
+
+        Parameters
+        ----------
+        n : int
+            Total number of items.
+        r : int
+            Number of items to select from n.
+
+        Returns
+        -------
+        int
+            Number of combinations.
+        """
+        r = min(r, n - r)
+        if r == 0:
+            return 1
+        numerator = reduce(op.mul, range(n, n - r, -1))
+        denominator = reduce(op.mul, range(1, r + 1))
+        return numerator // denominator
+
+    @staticmethod
+    def _calc_confidence(scores, confidence=0.95):
+        std_err = sem(scores)
+        bound = std_err * t.ppf((1 + confidence) / 2.0, len(scores) - 1)
+        return bound, std_err
+
+    def fit(self, X, y, groups=None, **fit_params):
+        """Perform feature selection and learn model from training data.
+
+        Parameters
+        ----------
+        X : array-like of shape (n_samples, n_features)
+        y : array-like of shape (n_samples,)
+            Target values.
+        groups : array-like of shape (n_samples,)
+            Group labels for the samples used while splitting the dataset into train/test set. Passed to the fit method of the cross-validator.
+        fit_params : dict
+            Parameters to pass to the fit method of the estimator.
+
+        Returns
+        -------
+        self : ExhaustiveSelector
+        """
+        X, y = self._validate_params(X, y)
+        _, n_features = X.shape
+        min_features, max_features = self.min_features, self.max_features
+        candidates = chain.from_iterable(combinations(range(n_features), r=i) for i in range(min_features, max_features + 1))
+        # chain has no __len__, so count the combinations analytically
+        n_combinations = sum(self.ncr(n=n_features, r=i) for i in range(min_features, max_features + 1))
+
+        estimator = self.estimator
+        scoring = check_scoring(estimator, scoring=self.scoring)
+        cv = self.cv if self.cv else None  # treat None/False/0 as "no cross-validation"
+        parallel = Parallel(n_jobs=self.n_jobs, pre_dispatch=self.pre_dispatch)
+        work = enumerate(parallel(delayed(self._calc_score)(clone(estimator), X, y, c, groups=groups, scoring=scoring, cv=cv, **fit_params) for c in candidates))
+
+        subset_info = []
+        append_subset_info = subset_info.append
+        try:
+            for iteration, (mask, cv_scores) in work:
+                avg_score = np.nanmean(cv_scores).item()
+                append_subset_info({"support_mask": mask, "cv_scores": cv_scores, "avg_score": avg_score})
+                if self.verbose:
+                    print("Feature set: %d/%d, avg score: %.3f" % (iteration + 1, n_combinations, avg_score))
+        except KeyboardInterrupt:
+            print("Stopping early due to keyboard interrupt...")
+        finally:
+            max_score = float("-inf")
+            best_idx, best_info = -1, {}
+            for i, info in enumerate(subset_info):
+                if info["avg_score"] > max_score:
+                    max_score = info["avg_score"]
+                    best_idx, best_info = i, info
+            self.subset_info_ = subset_info
+            self.support_mask_ = best_info["support_mask"]
+            self.best_idx_ = best_idx
+            self.best_score_ = max_score
+            self.best_feature_indices_ = np.where(best_info["support_mask"])[0]
+        return self
+
+    def _get_support_mask(self):
+        check_is_fitted(self, "support_mask_")
+        return self.support_mask_


-class ExhaustiveSelector(SelectorMixin):
+class BorutaSelector(SelectorMixin):
     pass


@@ -717,3 +924,15 @@ class FeatureImportanceSelector(SelectorMixin):

 class StabilitySelector(SelectorMixin):
     pass
+
+
+class RFESelector(SelectorMixin):
+    pass
+
+
+class SequentialFeatureSelector(SelectorMixin):
+    pass
+
+
+# NOTE: this stub shadows sklearn.feature_selection.SelectFromModel imported above
+class SelectFromModel(SelectorMixin):
+    pass
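A minimal usage sketch for the `ExhaustiveSelector` added in PATCH 1/2 above. It assumes the patch is applied and that the package's `SelectorMixin` exposes scikit-learn's `transform()`; the printed values are illustrative, not recorded output.

```python
from sklearn.datasets import load_iris
from sklearn.neighbors import KNeighborsClassifier

from scorecardpipeline.feature_selection import ExhaustiveSelector

X, y = load_iris(return_X_y=True)

# Exhaustively score every subset of 1..4 of the 4 iris features
# (15 candidate subsets) with 3-fold stratified CV.
selector = ExhaustiveSelector(KNeighborsClassifier(n_neighbors=3), min_features=1, max_features=4, scoring="accuracy", cv=3, n_jobs=-1)
selector.fit(X, y)

print(selector.best_score_)            # mean CV accuracy of the winning subset
print(selector.best_feature_indices_)  # indices of the winning features
X_best = selector.transform(X)         # keep only those columns
```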
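Because `fit` evaluates every candidate subset, the cost grows as the sum of binomial coefficients that `ncr` computes. The sketch below reproduces that count and checks it against `math.comb`, which is useful for budgeting runtime before calling `fit`; the feature counts are made-up examples.

```python
import math
import operator as op
from functools import reduce

def ncr(n, r):
    # Binomial coefficient via a running product, as in the patch.
    r = min(r, n - r)
    if r == 0:
        return 1
    numerator = reduce(op.mul, range(n, n - r, -1))
    denominator = reduce(op.mul, range(1, r + 1))
    return numerator // denominator

n_features, min_features, max_features = 20, 1, 3
n_candidates = sum(ncr(n_features, r) for r in range(min_features, max_features + 1))
assert n_candidates == sum(math.comb(n_features, r) for r in range(min_features, max_features + 1))
print(n_candidates)  # 20 + 190 + 1140 = 1350 subsets, each cross-validated
```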
From d97cc27f64f40c09c9fad4fb70fd76b846a47197 Mon Sep 17 00:00:00 2001
From: itlubber <1830611168@qq.com>
Date: Sun, 11 Aug 2024 15:17:28 +0800
Subject: [PATCH 2/2] add rule save method

---
 scorecardpipeline/rule.py | 22 ++++++++++++++++++++++
 1 file changed, 22 insertions(+)

diff --git a/scorecardpipeline/rule.py b/scorecardpipeline/rule.py
index ca83f1f..b246f4c 100644
--- a/scorecardpipeline/rule.py
+++ b/scorecardpipeline/rule.py
@@ -14,6 +14,7 @@
 from sklearn.metrics import f1_score, recall_score, accuracy_score, precision_score

 from .processing import feature_bin_stats, Combiner
+from .excel_writer import dataframe2excel


 def _get_context(X, feature_names):
@@ -404,3 +405,24 @@ def __invert__(self):
         r.result_ = np.logical_not(self.result())
         r._state = RuleState.APPLIED
         return r
+
+    @staticmethod
+    def save(report, excel_writer, sheet_name=None, merge_column=None, percent_cols=None, condition_cols=None, custom_cols=None, custom_format="#,##0", color_cols=None, start_col=2, start_row=2, **kwargs):
+        """Save a rule report to Excel; the parameters match https://scorecardpipeline.itlubber.art/scorecardpipeline.html#scorecardpipeline.dataframe2excel
+        """
+        if merge_column:
+            merge_column = [c for c in report.columns if (isinstance(c, tuple) and c[-1] in merge_column) or (not isinstance(c, tuple) and c in merge_column)]
+
+        if percent_cols:
+            percent_cols = [c for c in report.columns if (isinstance(c, tuple) and c[-1] in percent_cols) or (not isinstance(c, tuple) and c in percent_cols)]
+
+        if condition_cols:
+            condition_cols = [c for c in report.columns if (isinstance(c, tuple) and c[-1] in condition_cols) or (not isinstance(c, tuple) and c in condition_cols)]
+
+        if custom_cols:
+            custom_cols = [c for c in report.columns if (isinstance(c, tuple) and c[-1] in custom_cols) or (not isinstance(c, tuple) and c in custom_cols)]
+
+        if color_cols:
+            color_cols = [c for c in report.columns if (isinstance(c, tuple) and c[-1] in color_cols) or (not isinstance(c, tuple) and c in color_cols)]
+
+        dataframe2excel(report, excel_writer, sheet_name=sheet_name, merge_column=merge_column, percent_cols=percent_cols, condition_cols=condition_cols, custom_cols=custom_cols, custom_format=custom_format, color_cols=color_cols, start_col=start_col, start_row=start_row, **kwargs)
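A usage sketch for the new `Rule.save` helper added in PATCH 2/2. It assumes `dataframe2excel` accepts a file path (or an `ExcelWriter`) as the linked documentation suggests; the `report` frame here is toy data standing in for a real rule report, which may carry tuple (MultiIndex) column labels that `save` matches by their last element.

```python
import pandas as pd

from scorecardpipeline.rule import Rule

# Toy stand-in for a rule-statistics report; real reports may use tuple
# (MultiIndex) column labels, which save() matches by their last element.
report = pd.DataFrame({
    "rule": ["score < 500", "age < 18"],
    "hit_rate": [0.12, 0.03],
    "bad_rate": [0.45, 0.60],
})

# Column selectors are normalized against report.columns, then forwarded
# verbatim to dataframe2excel.
Rule.save(report, "rule_report.xlsx", sheet_name="rules", percent_cols=["hit_rate", "bad_rate"])
```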