From ae23378d99a8c4bbac8366137869b6947d4843e7 Mon Sep 17 00:00:00 2001 From: itlubber <1830611168@qq.com> Date: Wed, 14 Aug 2024 14:04:10 +0800 Subject: [PATCH] fix feature selection methods --- scorecardpipeline/feature_selection.py | 43 ++++++++++++++++++++------ 1 file changed, 33 insertions(+), 10 deletions(-) diff --git a/scorecardpipeline/feature_selection.py b/scorecardpipeline/feature_selection.py index 0147b6e..c8e133a 100644 --- a/scorecardpipeline/feature_selection.py +++ b/scorecardpipeline/feature_selection.py @@ -5,7 +5,7 @@ @Site : itlubber.art """ -import operator as op +import operator import sys import types from copy import deepcopy @@ -31,7 +31,7 @@ from sklearn.utils import _safe_indexing, check_X_y from sklearn.model_selection import StratifiedKFold, GroupKFold from sklearn.utils.sparsefuncs import mean_variance_axis, min_max_axis -from sklearn.utils.validation import check_is_fitted, check_array, indexable +from sklearn.utils.validation import check_is_fitted, check_array, indexable, column_or_1d from sklearn.base import BaseEstimator, TransformerMixin, clone, is_classifier, MetaEstimatorMixin from sklearn.feature_selection import RFECV, RFE, SelectFromModel, SelectKBest from sklearn.feature_selection._from_model import _calculate_threshold, _get_feature_importances @@ -42,6 +42,12 @@ class SelectorMixin(BaseEstimator, TransformerMixin): + def __init__(self): + self.select_columns = None + self.scores_ = None + self.dropped = None + self.n_features_in_ = None + def transform(self, x): check_is_fitted(self, "select_columns") return x[[col for col in self.select_columns if col in x.columns]] @@ -50,9 +56,14 @@ def __call__(self, *args, **kwargs): self.fit(*args, **kwargs) return self.select_columns + def fit(self, x, y=None): + pass + class TypeSelector(SelectorMixin): + def __init__(self, dtype_include=None, dtype_exclude=None, exclude=None): + super().__init__() self.dtype_include = dtype_include self.dtype_exclude = dtype_exclude self.exclude = exclude @@ -85,6 +96,7 @@ def fit(self, x: pd.DataFrame, y=None, **fit_params): class RegexSelector(SelectorMixin): def __init__(self, pattern=None, exclude=None): + super().__init__() self.pattern = pattern self.exclude = exclude @@ -130,6 +142,7 @@ def mode_ratio(x, dropna=True): class NanSelector(SelectorMixin): def __init__(self, threshold=0.95, missing_values=np.nan, exclude=None, **kwargs): + super().__init__() self.exclude = exclude self.threshold = threshold self.missing_values = missing_values @@ -161,6 +174,7 @@ def fit(self, x: pd.DataFrame, y=None): class ModeSelector(SelectorMixin): def __init__(self, threshold=0.95, exclude=None, dropna=True, n_jobs=None, **kwargs): + super().__init__() self.dropna = dropna self.exclude = exclude self.threshold = threshold @@ -202,6 +216,7 @@ class CardinalitySelector(SelectorMixin): >>> cs.fit_transform(x) """ def __init__(self, threshold=10, exclude=None, dropna=True): + super().__init__() self.exclude = exclude self.threshold = threshold self.dropna = dropna @@ -261,6 +276,7 @@ def _IV(x, y, regularization=1.0, n_jobs=None): class InformationValueSelector(SelectorMixin): def __init__(self, threshold=0.02, target="target", regularization=1.0, methods=None, n_jobs=None, **kwargs): + super().__init__() self.dropped = None self.select_columns = None self.scores_ = None @@ -351,6 +367,7 @@ class LiftSelector(SelectorMixin): Lift scores of features. """ def __init__(self, target="target", threshold=3.0, n_jobs=None, methods=None, **kwargs): + super().__init__() self.threshold = threshold self.n_jobs = n_jobs self.target = target @@ -386,6 +403,7 @@ class VarianceSelector(SelectorMixin): """Feature selector that removes all low-variance features.""" def __init__(self, threshold=0.0, exclude=None): + super().__init__() self.threshold = threshold if exclude is not None: self.exclude = exclude if isinstance(exclude, (list, np.ndarray)) else [exclude] @@ -412,7 +430,7 @@ def fit(self, x, y=None): if np.all(~np.isfinite(scores) | (scores <= self.threshold)): msg = "No feature in x meets the variance threshold {0:.5f}" - if X.shape[0] == 1: + if x.shape[0] == 1: msg += " (x contains only one sample)" raise ValueError(msg.format(self.threshold)) @@ -444,6 +462,7 @@ def __init__(self, threshold=4.0, exclude=None, missing=-1, n_jobs=None): :param missing: 缺失值默认填充 -1 :param n_jobs: 线程数 """ + super().__init__() self.threshold = threshold self.missing = missing self.n_jobs = n_jobs @@ -471,6 +490,7 @@ def fit(self, x: pd.DataFrame, y=None): class CorrSelector(SelectorMixin): def __init__(self, threshold=0.7, method="pearson", weights=None, exclude=None, **kwargs): + super().__init__() self.threshold = threshold self.method = method self.weights = weights @@ -572,6 +592,7 @@ def PSI(train, test, n_jobs=None, verbose=0, pre_dispatch='2*n_jobs'): class PSISelector(SelectorMixin): def __init__(self, threshold=0.1, cv=None, method=None, exclude=None, n_jobs=None, verbose=0, pre_dispatch='2*n_jobs', **kwargs): + super().__init__() self.threshold = threshold self.cv = cv self.method = method @@ -622,6 +643,7 @@ def fit(self, x: pd.DataFrame, y=None, groups=None): class NullImportanceSelector(SelectorMixin): def __init__(self, estimator, target="target", threshold=1.0, norm_order=1, importance_getter='auto', cv=3, n_runs=5, **kwargs): + super().__init__() self.estimator = estimator self.threshold = threshold self.norm_order = norm_order @@ -766,10 +788,10 @@ class ExhaustiveSelector(SelectorMixin, MetaEstimatorMixin): ----------- >>> from sklearn.neighbors import KNeighborsClassifier >>> from sklearn.datasets import load_iris - >>> from scorecardpipeline.feature_selection.exhaustive_feature_selector import ExhaustiveFeatureSelector + >>> from scorecardpipeline.feature_selection import ExhaustiveSelector >>> X, y = load_iris(return_X_y=True, as_frame=True) >>> knn = KNeighborsClassifier(n_neighbors=3) - >>> efs = ExhaustiveFeatureSelector(knn, min_features=1, max_features=4, cv=3) + >>> efs = ExhaustiveSelector(knn, min_features=1, max_features=4, cv=3) >>> efs.fit(X, y) ExhaustiveFeatureSelector(estimator=KNeighborsClassifier(n_neighbors=3), max_features=4) >>> efs.best_score_ @@ -778,6 +800,7 @@ class ExhaustiveSelector(SelectorMixin, MetaEstimatorMixin): 12 """ def __init__(self, estimator, min_features=1, max_features=1, scoring="accuracy", cv=3, verbose=0, n_jobs=None, pre_dispatch='2*n_jobs'): + super().__init__() self.estimator = estimator self.min_features = min_features self.max_features = max_features @@ -788,7 +811,7 @@ def __init__(self, estimator, min_features=1, max_features=1, scoring="accuracy" self.pre_dispatch = pre_dispatch def _validate_params(self, x, y): - check_X_y(X, y, estimator=self.estimator) + check_X_y(x, y, estimator=self.estimator) _, n_features = x.shape if not isinstance(self.min_features, int) or (self.max_features > n_features or self.max_features < 1): raise AttributeError("max_features must be smaller than %d and larger than 0" % (n_features + 1)) @@ -808,7 +831,7 @@ def _calc_score(estimator, x, y, indices, groups=None, scoring=None, cv=None, ** if cv is None: try: estimator.fit(x, y, **fit_params) - except Exception: + except: scores = np.nan else: scores = _score(estimator, x, y, scoring) @@ -914,7 +937,7 @@ class BorutaSelector(SelectorMixin): def __init__(self): # 对原始特征进行复制一份,并且将其按行进行随机打乱,称为Shadow Feature。将Shadow Feature与原始特征Real Feature进行横向拼接在一起,使用某种模型(随机森林、GBDT)进行计算特征重要性。将Shadow Feature中重要性最高的值为基准,删除Real Feature中重要性低于其的特征。多重复几个迭代。(一般来说随机生成的特征效果不如原始的,因此可以以Shadow Feature的特征重要性作为基准来判断Real Feature的好坏) - pass + super().__init__() class MICSelector(SelectorMixin): @@ -937,5 +960,5 @@ class SequentialFeatureSelector(SelectorMixin): pass -class SelectFromModel(SelectorMixin): - pass +# class SelectFromModel(SelectorMixin): +# pass