From a2b0c9c42080e8a5fb705d9b0022e8bc715a0872 Mon Sep 17 00:00:00 2001 From: Morgan-Sell Date: Fri, 15 Apr 2022 18:08:54 -0700 Subject: [PATCH 01/29] initial commit --- feature_engine/discretisation/target_mean.py | 95 ++++++++++++++++++++ 1 file changed, 95 insertions(+) create mode 100644 feature_engine/discretisation/target_mean.py diff --git a/feature_engine/discretisation/target_mean.py b/feature_engine/discretisation/target_mean.py new file mode 100644 index 000000000..f2a40bdae --- /dev/null +++ b/feature_engine/discretisation/target_mean.py @@ -0,0 +1,95 @@ +import warnings +from typing import Dict, List, Optional, Union + +import pandas as pd + +from feature_engine.discretisation.base_discretiser import BaseDiscretiser +from feature_engine._docstrings.methods import ( + _fit_not_learn_docstring, + _fit_transform_docstring +) +from feature_engine._docstrings.fit_attributes import ( + _variables_attribute_docstring, + _feature_names_in_docstring, + _n_features_in_docstring, +) +from feature_engine._docstrings.substitute import Substitution +from feature_engine.tags import _return_tags + + +@Substitution( + return_objects=BaseDiscretiser._return_object_docstring, + return_boundaries=BaseDiscretiser._return_boundaries_docstring, + binner_dict_=BaseDiscretiser._binner_dict_docstring, + transform=BaseDiscretiser._transform_docstring, + variables_=_variables_attribute_docstring, + feature_names_in_=_feature_names_in_docstring, + n_features_in_=_n_features_in_docstring, + fit=_fit_not_learn_docstring, + fit_transform=_fit_transform_docstring, +) +class TargetMeanDiscretiser(BaseDiscretiser): + """ + + Parameters + ---------- + binning_dict: dict + The dictionary with the variable to interval limits pairs. + + {return_object} + + {return_boundaries} + + errors: string, default='ignore' + Indicates what to do when a value is outside the limits indicated in the + 'binning_dict'. If 'raise', the transformation will raise an error. 
+ If 'ignore', values outside the limits are returned as NaN + and a warning will be raised instead. + + Attributes + ---------- + {binner_dict_} + + {variables_} + + {feature_names_in_} + + {n_features_in_} + + Methods + ------- + {fit} + + {fit_transform} + + {transform} + + See Also + -------- + pandas.cut + """ + + def __init__( + self, + binning_dict: Dict[Union[str, int], List[Union[str, int]]], + return_object: bool = False, + return_boundaries: bool = False, + errors: str = "ignore", + ) -> None: + + if not isinstance(binning_dict, dict): + raise ValueError( + "binning_dict must be a dictionary with the interval limits per " + f"variable. Got {binning_dict} instead." + ) + + if errors not in ["ignore", "raise"]: + raise ValueError( + "errors only takes values 'ignore' and 'raise. " + f"Got {errors} instead." + ) + + super().__init__(return_object, return_boundaries) + + self.binning_dict = binning_dict + self.errors = errors \ No newline at end of file From e517953d851b13834d3049ee8c3a888cedc54a8a Mon Sep 17 00:00:00 2001 From: Morgan-Sell Date: Sat, 16 Apr 2022 10:23:03 -0700 Subject: [PATCH 02/29] create fit() --- feature_engine/discretisation/target_mean.py | 21 +++++++++++++++++++- 1 file changed, 20 insertions(+), 1 deletion(-) diff --git a/feature_engine/discretisation/target_mean.py b/feature_engine/discretisation/target_mean.py index f2a40bdae..38b6e6432 100644 --- a/feature_engine/discretisation/target_mean.py +++ b/feature_engine/discretisation/target_mean.py @@ -92,4 +92,23 @@ def __init__( super().__init__(return_object, return_boundaries) self.binning_dict = binning_dict - self.errors = errors \ No newline at end of file + self.errors = errors + + def fit(self, X: pd.DataFrame, y:Optional[pd.Series] = None): + """ + This transformer does not learn any parameter. + Parameters + ---------- + X: pandas dataframe of shape = [n_samples, n_features] + The training dataset. Can be the entire dataframe, not just the + variables to be transformed. 
+ y: None + y is not needed in this transformer. You can pass y or None. + """ + # check dataframe + X = super()._fit_from_dict(X, self.binning_dict) + + # create this attribute for consistency with the rest of the discretisers + self.binner_dict_ = self.binning_dict + + def transform(self,): \ No newline at end of file From c823f7c4b52432327cfbf3642a096e7b8313186a Mon Sep 17 00:00:00 2001 From: Morgan-Sell Date: Sat, 16 Apr 2022 10:50:08 -0700 Subject: [PATCH 03/29] update init() --- feature_engine/discretisation/target_mean.py | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/feature_engine/discretisation/target_mean.py b/feature_engine/discretisation/target_mean.py index 38b6e6432..4ed086330 100644 --- a/feature_engine/discretisation/target_mean.py +++ b/feature_engine/discretisation/target_mean.py @@ -71,16 +71,22 @@ class TargetMeanDiscretiser(BaseDiscretiser): def __init__( self, - binning_dict: Dict[Union[str, int], List[Union[str, int]]], - return_object: bool = False, - return_boundaries: bool = False, + variables: Union[None, int, str, List[Union[str, int]]] = None, + strategy: str = "equal-frequency", + binning_dict: Dict[Union[str, int], List[Union[str, int]]] = None, errors: str = "ignore", ) -> None: - if not isinstance(binning_dict, dict): + if strategy not in ("equal-frequency", "equal-width", "arbitrary"): raise ValueError( - "binning_dict must be a dictionary with the interval limits per " - f"variable. Got {binning_dict} instead." + "strategy must equal 'arbitrary', 'equal-frequency', 'equal-width'. " + f"Got {strategy} instead." + ) + + if strategy == "arbitrary" and not isinstance(binning_dict, dict): + raise ValueError( + "If 'arbitrary' is the selected strategy, then binning_dict must be a " + f"dictionary with the interval limits per variable. Got {binning_dict} instead." 
) if errors not in ["ignore", "raise"]: From 20b902a69244485ae72cd1f07e90033d4706be14 Mon Sep 17 00:00:00 2001 From: Morgan-Sell Date: Sat, 16 Apr 2022 11:11:59 -0700 Subject: [PATCH 04/29] expand init() and fit() functionality --- feature_engine/discretisation/target_mean.py | 53 +++++++++++++++++--- 1 file changed, 47 insertions(+), 6 deletions(-) diff --git a/feature_engine/discretisation/target_mean.py b/feature_engine/discretisation/target_mean.py index 4ed086330..d76d51de5 100644 --- a/feature_engine/discretisation/target_mean.py +++ b/feature_engine/discretisation/target_mean.py @@ -14,7 +14,23 @@ _n_features_in_docstring, ) from feature_engine._docstrings.substitute import Substitution +from feature_engine.dataframe_checks import ( + _check_contains_inf, + _check_contains_na, + _check_X_matches_training_df, + check_X, +) +from feature_engine.discretisation import ( + ArbitraryDiscretiser, + EqualFrequencyDiscretiser, + EqualWidthDiscretiser +) +from feature_engine.encoding import MeanEncoder from feature_engine.tags import _return_tags +from feature_engine.variable_manipulation import ( + _check_input_parameter_variables, + _find_or_check_numerical_variables, +) @Substitution( @@ -48,9 +64,11 @@ class TargetMeanDiscretiser(BaseDiscretiser): Attributes ---------- + {variables_} + {binner_dict_} - {variables_} + {feature_names_in_} @@ -77,7 +95,7 @@ def __init__( errors: str = "ignore", ) -> None: - if strategy not in ("equal-frequency", "equal-width", "arbitrary"): + if strategy not in ("arbitrary", "equal-frequency", "equal-width"): raise ValueError( "strategy must equal 'arbitrary', 'equal-frequency', 'equal-width'. " f"Got {strategy} instead." @@ -111,10 +129,33 @@ def fit(self, X: pd.DataFrame, y:Optional[pd.Series] = None): y: None y is not needed in this transformer. You can pass y or None. 
""" - # check dataframe - X = super()._fit_from_dict(X, self.binning_dict) + # checks if dataset contains na or inf + # TODO: Will this identify the numerical variables? + # + X = super().transform(X) # create this attribute for consistency with the rest of the discretisers - self.binner_dict_ = self.binning_dict + if self.strategy == "arbitrary": + # check dataframe + X = super()._fit_from_dict(X, self.binning_dict) + self.binner_dict_ = self.binning_dict + + + + def transform(self, X: pd.DataFrame) -> pd.DataFrame: + """ + + Parameters + ---------- + X: pandas dataframe of shape = [n_samples, n_features] + The data to transform. + + Returns + ------- + X_new: pandas dataframe of shape = [n_samples, n_features] + The transformed data with the means of the discrete variables. + + """ - def transform(self,): \ No newline at end of file + # checks if dataset contains na or inf + X = super().transform(X) \ No newline at end of file From 6403cf897b0277d949d311a73c4ec32d06223bc0 Mon Sep 17 00:00:00 2001 From: Morgan-Sell Date: Sat, 16 Apr 2022 11:45:59 -0700 Subject: [PATCH 05/29] add functionality to fit() --- feature_engine/discretisation/target_mean.py | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/feature_engine/discretisation/target_mean.py b/feature_engine/discretisation/target_mean.py index d76d51de5..e102fb278 100644 --- a/feature_engine/discretisation/target_mean.py +++ b/feature_engine/discretisation/target_mean.py @@ -129,10 +129,19 @@ def fit(self, X: pd.DataFrame, y:Optional[pd.Series] = None): y: None y is not needed in this transformer. You can pass y or None. """ - # checks if dataset contains na or inf - # TODO: Will this identify the numerical variables? 
- # - X = super().transform(X) + # check if 'X' is a dataframe + X = check_X(X) + + # identify numerical variables + self.variables_numerical_ = _find_or_check_numerical_variables( + X, self.variables + ) + + # check for missing values + _check_contains_na(X, self.variables_numerical_) + + # check for inf + _check_contains_inf(X, self.variables_numerical_) # create this attribute for consistency with the rest of the discretisers if self.strategy == "arbitrary": @@ -141,7 +150,6 @@ def fit(self, X: pd.DataFrame, y:Optional[pd.Series] = None): self.binner_dict_ = self.binning_dict - def transform(self, X: pd.DataFrame) -> pd.DataFrame: """ From 9a0d662a818828e6dc8627741e218ce661a8fa7f Mon Sep 17 00:00:00 2001 From: Morgan-Sell Date: Sat, 16 Apr 2022 12:00:29 -0700 Subject: [PATCH 06/29] create _make_discretiser() --- feature_engine/discretisation/target_mean.py | 46 ++++++++++++++++---- 1 file changed, 37 insertions(+), 9 deletions(-) diff --git a/feature_engine/discretisation/target_mean.py b/feature_engine/discretisation/target_mean.py index e102fb278..aae9af3c3 100644 --- a/feature_engine/discretisation/target_mean.py +++ b/feature_engine/discretisation/target_mean.py @@ -44,7 +44,7 @@ fit=_fit_not_learn_docstring, fit_transform=_fit_transform_docstring, ) -class TargetMeanDiscretiser(BaseDiscretiser): +class TargetMeanDiscretiser(BaseeTargetMeanEstimator, BaseDiscretiser): """ Parameters @@ -90,14 +90,15 @@ class TargetMeanDiscretiser(BaseDiscretiser): def __init__( self, variables: Union[None, int, str, List[Union[str, int]]] = None, - strategy: str = "equal-frequency", + bins: int = 5, + strategy: str = "equal_frequency", binning_dict: Dict[Union[str, int], List[Union[str, int]]] = None, errors: str = "ignore", ) -> None: - - if strategy not in ("arbitrary", "equal-frequency", "equal-width"): + # TODO: do we include ArbitraryDiscretiser? 
+ if strategy not in ("arbitrary", "equal_frequency", "equal_width"): raise ValueError( - "strategy must equal 'arbitrary', 'equal-frequency', 'equal-width'. " + "strategy must equal 'arbitrary', 'equal_frequency', 'equal_width'. " f"Got {strategy} instead." ) @@ -113,8 +114,9 @@ def __init__( f"Got {errors} instead." ) - super().__init__(return_object, return_boundaries) - + self.variables = variables + self.bins = bins + self.strategy = strategy self.binning_dict = binning_dict self.errors = errors @@ -136,7 +138,7 @@ def fit(self, X: pd.DataFrame, y:Optional[pd.Series] = None): self.variables_numerical_ = _find_or_check_numerical_variables( X, self.variables ) - + # check for missing values _check_contains_na(X, self.variables_numerical_) @@ -166,4 +168,30 @@ def transform(self, X: pd.DataFrame) -> pd.DataFrame: """ # checks if dataset contains na or inf - X = super().transform(X) \ No newline at end of file + X = super().transform(X) + + def _make_discretiser(self): + """ + Instantiate the ArbirtraryDiscretiser, EqualFrequencyDiscretiser, or + EqualWidthDiscretiser. + """ + # TODO: do we include ArbitraryDiscretiser? 
+ if self.strategy == "arbitrary": + discretiser = ArbitraryDiscretiser( + + ) + + elif self.strategy == "equal_frequency": + discretiser = EqualFrequencyDiscretiser( + q=self.bins, + variables=self.variables_numerical_, + return_boundaries=True, + ) + else: + discretiser = EqualWidthDiscretiser( + bins=self.bins, + variables=self.variables_numerical_, + return_boundaries=True + ) + + return discretiser \ No newline at end of file From de4ae94ea5165c3528e8f2b2b2f3a8a2147862d7 Mon Sep 17 00:00:00 2001 From: Morgan-Sell Date: Sat, 16 Apr 2022 16:00:06 -0700 Subject: [PATCH 07/29] create _make_pipeline --- feature_engine/discretisation/target_mean.py | 20 ++++++++++++++++++-- 1 file changed, 18 insertions(+), 2 deletions(-) diff --git a/feature_engine/discretisation/target_mean.py b/feature_engine/discretisation/target_mean.py index aae9af3c3..31213aad8 100644 --- a/feature_engine/discretisation/target_mean.py +++ b/feature_engine/discretisation/target_mean.py @@ -32,6 +32,7 @@ _find_or_check_numerical_variables, ) +from sklearn.pipeline import Pipeline @Substitution( return_objects=BaseDiscretiser._return_object_docstring, @@ -151,7 +152,6 @@ def fit(self, X: pd.DataFrame, y:Optional[pd.Series] = None): X = super()._fit_from_dict(X, self.binning_dict) self.binner_dict_ = self.binning_dict - def transform(self, X: pd.DataFrame) -> pd.DataFrame: """ @@ -194,4 +194,20 @@ def _make_discretiser(self): return_boundaries=True ) - return discretiser \ No newline at end of file + return discretiser + + def _make_pipeline(self): + """ + Instantiate target mean encoder and create pipeline of selected + discretiser and encoder. 
+ """ + encoder = MeanEncoder(variables=self.variables_numerical_, errors="raise") + + pipeline = Pipeline( + [ + ("discretiser", self._make_discretiser()), + ("encoder", encoder), + ] + ) + + return pipeline \ No newline at end of file From b6fac50b0e2a8c1cd8e94b806b7e23ebd80f340b Mon Sep 17 00:00:00 2001 From: Morgan-Sell Date: Sat, 16 Apr 2022 20:42:31 -0700 Subject: [PATCH 08/29] expand fit() --- feature_engine/discretisation/target_mean.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/feature_engine/discretisation/target_mean.py b/feature_engine/discretisation/target_mean.py index 31213aad8..4f4bedfe4 100644 --- a/feature_engine/discretisation/target_mean.py +++ b/feature_engine/discretisation/target_mean.py @@ -45,7 +45,7 @@ fit=_fit_not_learn_docstring, fit_transform=_fit_transform_docstring, ) -class TargetMeanDiscretiser(BaseeTargetMeanEstimator, BaseDiscretiser): +class TargetMeanDiscretiser(BaseDiscretiser): """ Parameters From a2360a62f45186a46032e8b894f4b4b6cc42864e Mon Sep 17 00:00:00 2001 From: Morgan-Sell Date: Sun, 17 Apr 2022 17:26:59 -0700 Subject: [PATCH 09/29] remove ArbitraryDiscretiser and correspdoning attributes --- feature_engine/discretisation/target_mean.py | 31 ++++---------------- 1 file changed, 5 insertions(+), 26 deletions(-) diff --git a/feature_engine/discretisation/target_mean.py b/feature_engine/discretisation/target_mean.py index 4f4bedfe4..0f0956748 100644 --- a/feature_engine/discretisation/target_mean.py +++ b/feature_engine/discretisation/target_mean.py @@ -93,23 +93,15 @@ def __init__( variables: Union[None, int, str, List[Union[str, int]]] = None, bins: int = 5, strategy: str = "equal_frequency", - binning_dict: Dict[Union[str, int], List[Union[str, int]]] = None, errors: str = "ignore", ) -> None: - # TODO: do we include ArbitraryDiscretiser? 
- if strategy not in ("arbitrary", "equal_frequency", "equal_width"): + if strategy not in ("equal_frequency", "equal_width"): raise ValueError( - "strategy must equal 'arbitrary', 'equal_frequency', 'equal_width'. " + "strategy must equal 'equal_frequency' or 'equal_width'. " f"Got {strategy} instead." ) - if strategy == "arbitrary" and not isinstance(binning_dict, dict): - raise ValueError( - "If 'arbitrary' is the selected strategy, then binning_dict must be a " - f"dictionary with the interval limits per variable. Got {binning_dict} instead." - ) - - if errors not in ["ignore", "raise"]: + if errors not in ("ignore", "raise"): raise ValueError( "errors only takes values 'ignore' and 'raise. " f"Got {errors} instead." @@ -118,7 +110,6 @@ def __init__( self.variables = variables self.bins = bins self.strategy = strategy - self.binning_dict = binning_dict self.errors = errors def fit(self, X: pd.DataFrame, y:Optional[pd.Series] = None): @@ -146,11 +137,6 @@ def fit(self, X: pd.DataFrame, y:Optional[pd.Series] = None): # check for inf _check_contains_inf(X, self.variables_numerical_) - # create this attribute for consistency with the rest of the discretisers - if self.strategy == "arbitrary": - # check dataframe - X = super()._fit_from_dict(X, self.binning_dict) - self.binner_dict_ = self.binning_dict def transform(self, X: pd.DataFrame) -> pd.DataFrame: """ @@ -172,16 +158,9 @@ def transform(self, X: pd.DataFrame) -> pd.DataFrame: def _make_discretiser(self): """ - Instantiate the ArbirtraryDiscretiser, EqualFrequencyDiscretiser, or - EqualWidthDiscretiser. + Instantiate the EqualFrequencyDiscretiser or EqualWidthDiscretiser. """ - # TODO: do we include ArbitraryDiscretiser? 
- if self.strategy == "arbitrary": - discretiser = ArbitraryDiscretiser( - - ) - - elif self.strategy == "equal_frequency": + if self.strategy == "equal_frequency": discretiser = EqualFrequencyDiscretiser( q=self.bins, variables=self.variables_numerical_, From bf2fc626aa0fc1c31c8168861359d18ff30b0d02 Mon Sep 17 00:00:00 2001 From: Morgan-Sell Date: Sun, 17 Apr 2022 18:23:44 -0700 Subject: [PATCH 10/29] update fit() --- feature_engine/discretisation/target_mean.py | 31 +++++++++++--------- 1 file changed, 17 insertions(+), 14 deletions(-) diff --git a/feature_engine/discretisation/target_mean.py b/feature_engine/discretisation/target_mean.py index 0f0956748..dfd390930 100644 --- a/feature_engine/discretisation/target_mean.py +++ b/feature_engine/discretisation/target_mean.py @@ -95,6 +95,11 @@ def __init__( strategy: str = "equal_frequency", errors: str = "ignore", ) -> None: + + if not isinstance(bins, int): + raise ValueError( + f"bins must be an integer. Got {bins} instead." + ) if strategy not in ("equal_frequency", "equal_width"): raise ValueError( "strategy must equal 'equal_frequency' or 'equal_width'. " @@ -107,21 +112,24 @@ def __init__( f"Got {errors} instead." ) - self.variables = variables + self.variables = _check_input_parameter_variables(variables) self.bins = bins self.strategy = strategy self.errors = errors - def fit(self, X: pd.DataFrame, y:Optional[pd.Series] = None): + def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None): """ - This transformer does not learn any parameter. + This transformer discretises the selected numerical variables and + learns the target mean value for each bin. + Parameters ---------- X: pandas dataframe of shape = [n_samples, n_features] The training dataset. Can be the entire dataframe, not just the variables to be transformed. - y: None - y is not needed in this transformer. You can pass y or None. + + y : pandas series of shape = [n_samples,] + The target variable. 
""" # check if 'X' is a dataframe X = check_X(X) @@ -131,13 +139,15 @@ def fit(self, X: pd.DataFrame, y:Optional[pd.Series] = None): X, self.variables ) + # create dataframe to use for target values. + self.x_target_ = X[self.variables_numerical_].copy() + # check for missing values _check_contains_na(X, self.variables_numerical_) # check for inf _check_contains_inf(X, self.variables_numerical_) - def transform(self, X: pd.DataFrame) -> pd.DataFrame: """ @@ -175,18 +185,11 @@ def _make_discretiser(self): return discretiser - def _make_pipeline(self): + def _calc_target_mean_for_discretised_variables(self): """ Instantiate target mean encoder and create pipeline of selected discretiser and encoder. """ - encoder = MeanEncoder(variables=self.variables_numerical_, errors="raise") - pipeline = Pipeline( - [ - ("discretiser", self._make_discretiser()), - ("encoder", encoder), - ] - ) return pipeline \ No newline at end of file From 23baacbbd847bba7393a68ea71c615bc391cd191 Mon Sep 17 00:00:00 2001 From: Morgan-Sell Date: Tue, 19 Apr 2022 15:36:11 -0700 Subject: [PATCH 11/29] update fit() --- feature_engine/discretisation/target_mean.py | 20 +++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-) diff --git a/feature_engine/discretisation/target_mean.py b/feature_engine/discretisation/target_mean.py index dfd390930..63f651306 100644 --- a/feature_engine/discretisation/target_mean.py +++ b/feature_engine/discretisation/target_mean.py @@ -119,8 +119,8 @@ def __init__( def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None): """ - This transformer discretises the selected numerical variables and - learns the target mean value for each bin. + Learn the boundaries of the selected dicretiser's intervals / bins + for the chosen numerical variables. Parameters ---------- @@ -129,18 +129,18 @@ def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None): variables to be transformed. y : pandas series of shape = [n_samples,] - The target variable. 
+ y is not needed in this discretiser. You can pass y or None. """ # check if 'X' is a dataframe X = check_X(X) - # identify numerical variables + # identify numerical variables self.variables_numerical_ = _find_or_check_numerical_variables( X, self.variables ) # create dataframe to use for target values. - self.x_target_ = X[self.variables_numerical_].copy() + self.X_target_ = X[self.variables_numerical_].copy() # check for missing values _check_contains_na(X, self.variables_numerical_) @@ -148,6 +148,16 @@ def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None): # check for inf _check_contains_inf(X, self.variables_numerical_) + # discretise + self._discretiser = self._make_discretiser() + self._discretiser.fit(X) + + # store input features + self.n_features_in_ = X.shape[1] + self.feature_names_in_ = list(X.columns) + + return self + def transform(self, X: pd.DataFrame) -> pd.DataFrame: """ From 82506460f0f3a4b0a58824904fe0b96582e6f316 Mon Sep 17 00:00:00 2001 From: Morgan-Sell Date: Tue, 19 Apr 2022 15:52:33 -0700 Subject: [PATCH 12/29] update transform() and _encode_X() --- feature_engine/discretisation/target_mean.py | 44 ++++++++++++++++---- 1 file changed, 36 insertions(+), 8 deletions(-) diff --git a/feature_engine/discretisation/target_mean.py b/feature_engine/discretisation/target_mean.py index 63f651306..3c21510c4 100644 --- a/feature_engine/discretisation/target_mean.py +++ b/feature_engine/discretisation/target_mean.py @@ -160,6 +160,8 @@ def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None): def transform(self, X: pd.DataFrame) -> pd.DataFrame: """ + Replace original values by the average of the target mean value per bin + for each of the variables. Parameters ---------- @@ -168,13 +170,32 @@ def transform(self, X: pd.DataFrame) -> pd.DataFrame: Returns ------- - X_new: pandas dataframe of shape = [n_samples, n_features] - The transformed data with the means of the discrete variables. 
+ X_enc: pandas dataframe of shape = [n_samples, n_features] + The transformed data with the means of the selected numerical variables. """ + # check that fit method has been called + check_is_fitted(self) - # checks if dataset contains na or inf - X = super().transform(X) + # check that input is a dataframe + X = check_X(X) + + # check that input data contain number of columns as the fitted df + _check_X_matches_training_df(X, self.n_features_in_) + + # check for missing values + _check_contains_na(X, self.variables_numerical_) + + # check for infinite values + _check_contains_inf(X, self.variables_numerical_) + + # discretise + X_disc = self._discretiser.transform(X) + + # encode + X_enc = self._encode_X(X_disc) + + return X_enc def _make_discretiser(self): """ @@ -195,11 +216,18 @@ def _make_discretiser(self): return discretiser - def _calc_target_mean_for_discretised_variables(self): + def _encode_X(self, X): """ - Instantiate target mean encoder and create pipeline of selected - discretiser and encoder. + Calculate the mean of each bin using the initial values (prior to + discretisation) for each selected variable. Replace the discrete value + (bin) with the corresponding mean. 
""" + X_enc = X.copy() + X_enc[self.variables_numerical_] = X_enc[self.variables_numerical_].astype(str) + for variable in self.variables_numerical_: + encoder = MeanEncoder(variables=variable) + encoder.fit(X_enc, self.X_target_[variable]) + X_enc = encoder.transform(X_enc) - return pipeline \ No newline at end of file + return X_enc \ No newline at end of file From 0ac284c07b415f97a4bad941b495aca006aac157 Mon Sep 17 00:00:00 2001 From: Morgan-Sell Date: Tue, 19 Apr 2022 16:33:06 -0700 Subject: [PATCH 13/29] add TargetMeanDiscretiser to test_check_estimator_discretisers.py --- .../test_discretisation/test_check_estimator_discretisers.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tests/test_discretisation/test_check_estimator_discretisers.py b/tests/test_discretisation/test_check_estimator_discretisers.py index d501f46f1..553c05ecd 100644 --- a/tests/test_discretisation/test_check_estimator_discretisers.py +++ b/tests/test_discretisation/test_check_estimator_discretisers.py @@ -7,6 +7,7 @@ DecisionTreeDiscretiser, EqualFrequencyDiscretiser, EqualWidthDiscretiser, + TargetMeanDiscretiser, ) from tests.estimator_checks.estimator_checks import check_feature_engine_estimator @@ -14,7 +15,8 @@ DecisionTreeDiscretiser(regression=False), EqualFrequencyDiscretiser(), EqualWidthDiscretiser(), - ArbitraryDiscretiser(binning_dict={"0": [-np.Inf, 0, np.Inf]}), + ArbitraryDiscretiser(binning_dict={"0": [-np.Inf, 0, np.Inf]}) + TargetMeanDiscretiser(), ] From 265fd08ffa798db44b4cbf3af59fc2519eac5bd1 Mon Sep 17 00:00:00 2001 From: Morgan-Sell Date: Tue, 19 Apr 2022 17:03:08 -0700 Subject: [PATCH 14/29] create test_target_mean_discretiser.py includes initial test --- feature_engine/discretisation/__init__.py | 1 + feature_engine/discretisation/target_mean.py | 2 +- .../test_target_mean_discretiser.py | 37 +++++++++++++++++++ 3 files changed, 39 insertions(+), 1 deletion(-) create mode 100644 tests/test_discretisation/test_target_mean_discretiser.py diff 
--git a/feature_engine/discretisation/__init__.py b/feature_engine/discretisation/__init__.py index a305c8a93..0c140d8d2 100644 --- a/feature_engine/discretisation/__init__.py +++ b/feature_engine/discretisation/__init__.py @@ -13,4 +13,5 @@ "EqualFrequencyDiscretiser", "EqualWidthDiscretiser", "ArbitraryDiscretiser", + "TargetMeanDiscretiser" ] diff --git a/feature_engine/discretisation/target_mean.py b/feature_engine/discretisation/target_mean.py index 3c21510c4..95e5d2111 100644 --- a/feature_engine/discretisation/target_mean.py +++ b/feature_engine/discretisation/target_mean.py @@ -91,7 +91,7 @@ class TargetMeanDiscretiser(BaseDiscretiser): def __init__( self, variables: Union[None, int, str, List[Union[str, int]]] = None, - bins: int = 5, + bins: int = 10, strategy: str = "equal_frequency", errors: str = "ignore", ) -> None: diff --git a/tests/test_discretisation/test_target_mean_discretiser.py b/tests/test_discretisation/test_target_mean_discretiser.py new file mode 100644 index 000000000..365ce8ba4 --- /dev/null +++ b/tests/test_discretisation/test_target_mean_discretiser.py @@ -0,0 +1,37 @@ +import pandas as pd +import pytest +from sklearn.exceptions import NotFittedError + +from feature_engine.discretisation import TargetMeanDiscretiser + + +def test_equal_frequency_automatically_find_variables_and_return_as_numeric( + df_normal_dist +): + # test case 1: auto-select variables, return_object=False + transformer = TargetMeanDiscretiser( + strategy="equal_frequency", bins=10, variables=None, return_object=False + ) + X = transformer.fit_transform(df_normal_dist) + + # fit parameters + _, bins = pd.cut(x=df_normal_dist["var"], bins=10, retbins=True, duplicates="drop") + bins[0] = float("-inf") + bins[len(bins) - 1] = float("inf") + + # transform output + X_t = [x for x in range(0, 10)] + + # test init params + assert transformer.bins == 10 + assert transformer.variables is None + assert transformer.return_object is None + # test fit attr + assert 
transformer.variables_ == ["var"] + assert transformer.n_features_in_ == 1 + # test transform output + assert (transformer.binner_dict_["var"] == bins).all() + assert all(x for x in X["var"].unique() if x not in X_t) + # in equal-frequency discretisation, all intervals have the same proportion of values + assert len((X["var"].value_counts()).unqiue()) == 1 + From f576e3d9d04c67f22e69beb5a72960a65a10c665 Mon Sep 17 00:00:00 2001 From: Morgan-Sell Date: Wed, 20 Apr 2022 21:20:28 -0700 Subject: [PATCH 15/29] update unit tests --- .../test_target_mean_discretiser.py | 41 ++++++++++++++++++- 1 file changed, 40 insertions(+), 1 deletion(-) diff --git a/tests/test_discretisation/test_target_mean_discretiser.py b/tests/test_discretisation/test_target_mean_discretiser.py index 365ce8ba4..8094c2e67 100644 --- a/tests/test_discretisation/test_target_mean_discretiser.py +++ b/tests/test_discretisation/test_target_mean_discretiser.py @@ -8,7 +8,7 @@ def test_equal_frequency_automatically_find_variables_and_return_as_numeric( df_normal_dist ): - # test case 1: auto-select variables, return_object=False + # fit discretiser and transform dataset transformer = TargetMeanDiscretiser( strategy="equal_frequency", bins=10, variables=None, return_object=False ) @@ -35,3 +35,42 @@ def test_equal_frequency_automatically_find_variables_and_return_as_numeric( # in equal-frequency discretisation, all intervals have the same proportion of values assert len((X["var"].value_counts()).unqiue()) == 1 + +def test_equal_width_automatically_find_variables_and_return_as_numeric( + df_normal_dist +): + transformer = TargetMeanDiscretiser( + strategy="equal_width", bins=10, variables=None, return_object=False + ) + X = transformer.fit_transform(df_normal_dist) + + # fit parameters + _, bins = pd.qcut(x=df_normal_dist["var"], q=10, retbins=True, duplicates="drop") + bins[0] = float("-inf") + bins[len(bins) - 1] = float("inf") + + # transform output + X_t = [x for x in range(0, 10)] + val_counts = [18, 
17, 16, 13, 11, 7, 7, 5, 5, 1] + + # init params + assert transformer.bins == 10 + assert transformer.variables is None + assert transformer.return_object is False + # fit params + assert transformer.variables_ == ["var"] + assert transformer.n_features_in_ == 1 + # transform params + assert (transformer.binner_dict_["var"] == bins).all() + assert all(x for x in X["var"].unique() if x not in X_t) + # in equal-width discretisation, intervals have number of values + assert all(x for x in ["var"].value_counts() if x not in val_counts) + + +def test_automatically_find_variables_and_return_as_object(df_normal_dist): + # equal-frequency + transformer = TargetMeanDiscretiser( + strategy="equal_frequency", bins=10, variables=None, return_object=True + ) + X = transformer.fit_transform(df_normal_dist) + assert X["var"].dtypes == "O" From 86cbbf573dd821cba8115b3150be544606715b15 Mon Sep 17 00:00:00 2001 From: Morgan-Sell Date: Wed, 20 Apr 2022 21:29:41 -0700 Subject: [PATCH 16/29] edit docstring --- feature_engine/discretisation/target_mean.py | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/feature_engine/discretisation/target_mean.py b/feature_engine/discretisation/target_mean.py index 95e5d2111..92581dee0 100644 --- a/feature_engine/discretisation/target_mean.py +++ b/feature_engine/discretisation/target_mean.py @@ -39,6 +39,7 @@ return_boundaries=BaseDiscretiser._return_boundaries_docstring, binner_dict_=BaseDiscretiser._binner_dict_docstring, transform=BaseDiscretiser._transform_docstring, + variables=_variables_numerical_docstring, variables_=_variables_attribute_docstring, feature_names_in_=_feature_names_in_docstring, n_features_in_=_n_features_in_docstring, @@ -50,12 +51,14 @@ class TargetMeanDiscretiser(BaseDiscretiser): Parameters ---------- - binning_dict: dict - The dictionary with the variable to interval limits pairs. 
+ strategy: str, default='equal_width' + Whether the bins should be of equal width ('equal_width') or equal frequency + ('equal_frequency'). - {return_object} + {variables} - {return_boundaries} + bins: int, default=10 + Desired number of equal-width or equal-frequency intervals / bins. errors: string, default='ignore' Indicates what to do when a value is outside the limits indicated in the @@ -69,8 +72,6 @@ class TargetMeanDiscretiser(BaseDiscretiser): {binner_dict_} - - {feature_names_in_} {n_features_in_} From 20317eea7f52f5facb847b8a3aba4358f96d6e9a Mon Sep 17 00:00:00 2001 From: Morgan-Sell Date: Wed, 20 Apr 2022 21:43:02 -0700 Subject: [PATCH 17/29] add tests --- .../test_target_mean_discretiser.py | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/tests/test_discretisation/test_target_mean_discretiser.py b/tests/test_discretisation/test_target_mean_discretiser.py index 8094c2e67..46afb2625 100644 --- a/tests/test_discretisation/test_target_mean_discretiser.py +++ b/tests/test_discretisation/test_target_mean_discretiser.py @@ -67,10 +67,8 @@ def test_equal_width_automatically_find_variables_and_return_as_numeric( assert all(x for x in ["var"].value_counts() if x not in val_counts) -def test_automatically_find_variables_and_return_as_object(df_normal_dist): - # equal-frequency - transformer = TargetMeanDiscretiser( - strategy="equal_frequency", bins=10, variables=None, return_object=True - ) - X = transformer.fit_transform(df_normal_dist) - assert X["var"].dtypes == "O" +@pytest.mark.parameterize("_bins", [4.2, "python", ["data", "science"]]) +def test_error_when_bins_not_integer(_bins): + with pytest.raises(ValueError): + TargetMeanDiscretiser(bins=_bins) + From f6761275c63fc0af0dd25cc294960935b80f54c4 Mon Sep 17 00:00:00 2001 From: Morgan-Sell Date: Sat, 7 May 2022 12:26:16 -0400 Subject: [PATCH 18/29] update fit() --- feature_engine/discretisation/target_mean.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git 
a/feature_engine/discretisation/target_mean.py b/feature_engine/discretisation/target_mean.py index 92581dee0..8d1398ef2 100644 --- a/feature_engine/discretisation/target_mean.py +++ b/feature_engine/discretisation/target_mean.py @@ -19,6 +19,7 @@ _check_contains_na, _check_X_matches_training_df, check_X, + check_X_y, ) from feature_engine.discretisation import ( ArbitraryDiscretiser, @@ -118,7 +119,7 @@ def __init__( self.strategy = strategy self.errors = errors - def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None): + def fit(self, X: pd.DataFrame, y: pd.Series): """ Learn the boundaries of the selected dicretiser's intervals / bins for the chosen numerical variables. @@ -133,7 +134,7 @@ def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None): y is not needed in this discretiser. You can pass y or None. """ # check if 'X' is a dataframe - X = check_X(X) + X, y = check_X_y(X, y) # identify numerical variables self.variables_numerical_ = _find_or_check_numerical_variables( From c6372ba4c90be2cb20cae92ed25c34ac49ab3467 Mon Sep 17 00:00:00 2001 From: Morgan-Sell Date: Sun, 8 May 2022 18:50:49 -0700 Subject: [PATCH 19/29] (1) add _make_pipeline(); and (2) update fit() and transform() --- feature_engine/discretisation/target_mean.py | 47 ++++++++------------ 1 file changed, 19 insertions(+), 28 deletions(-) diff --git a/feature_engine/discretisation/target_mean.py b/feature_engine/discretisation/target_mean.py index 8d1398ef2..fe1943b95 100644 --- a/feature_engine/discretisation/target_mean.py +++ b/feature_engine/discretisation/target_mean.py @@ -1,7 +1,8 @@ -import warnings -from typing import Dict, List, Optional, Union +from typing import List, Union import pandas as pd +from sklearn.pipeline import Pipeline +from sklearn.utils.validation import check_is_fitted from feature_engine.discretisation.base_discretiser import BaseDiscretiser from feature_engine._docstrings.methods import ( @@ -27,13 +28,11 @@ EqualWidthDiscretiser ) from 
feature_engine.encoding import MeanEncoder -from feature_engine.tags import _return_tags from feature_engine.variable_manipulation import ( _check_input_parameter_variables, _find_or_check_numerical_variables, ) -from sklearn.pipeline import Pipeline @Substitution( return_objects=BaseDiscretiser._return_object_docstring, @@ -141,18 +140,15 @@ def fit(self, X: pd.DataFrame, y: pd.Series): X, self.variables ) - # create dataframe to use for target values. - self.X_target_ = X[self.variables_numerical_].copy() - # check for missing values _check_contains_na(X, self.variables_numerical_) # check for inf _check_contains_inf(X, self.variables_numerical_) - # discretise - self._discretiser = self._make_discretiser() - self._discretiser.fit(X) + # instantiate pipeline + self._pipeline = self._make_pipeline() + self._pipeline.fit(X, y) # store input features self.n_features_in_ = X.shape[1] @@ -191,13 +187,10 @@ def transform(self, X: pd.DataFrame) -> pd.DataFrame: # check for infinite values _check_contains_inf(X, self.variables_numerical_) - # discretise - X_disc = self._discretiser.transform(X) - - # encode - X_enc = self._encode_X(X_disc) + # discretise and encode + X_tr = self._pipeline.transform(X) - return X_enc + return X_tr def _make_discretiser(self): """ @@ -218,18 +211,16 @@ def _make_discretiser(self): return discretiser - def _encode_X(self, X): + def _make_pipeline(self): """ - Calculate the mean of each bin using the initial values (prior to - discretisation) for each selected variable. Replace the discrete value - (bin) with the corresponding mean. + Instantiate pipeline comprised of discretiser and encoder. 
""" - X_enc = X.copy() - X_enc[self.variables_numerical_] = X_enc[self.variables_numerical_].astype(str) - - for variable in self.variables_numerical_: - encoder = MeanEncoder(variables=variable) - encoder.fit(X_enc, self.X_target_[variable]) - X_enc = encoder.transform(X_enc) + pipe = Pipeline([ + ("discretiser", self._make_discretiser()), + ("encoder", MeanEncoder( + variables=self.variables_numerical_, + ignore_format=True) + )] + ) - return X_enc \ No newline at end of file + return pipe \ No newline at end of file From d843d0e08f5228bcfcd70bf1941275544bc251a6 Mon Sep 17 00:00:00 2001 From: Morgan-Sell Date: Sun, 8 May 2022 18:51:30 -0700 Subject: [PATCH 20/29] fix style error --- feature_engine/discretisation/target_mean.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/feature_engine/discretisation/target_mean.py b/feature_engine/discretisation/target_mean.py index fe1943b95..993d3d846 100644 --- a/feature_engine/discretisation/target_mean.py +++ b/feature_engine/discretisation/target_mean.py @@ -223,4 +223,5 @@ def _make_pipeline(self): )] ) - return pipe \ No newline at end of file + return pipe + \ No newline at end of file From 138b20186dc2f20f224765741874cc4de9b8a847 Mon Sep 17 00:00:00 2001 From: Morgan-Sell Date: Mon, 9 May 2022 16:26:48 -0700 Subject: [PATCH 21/29] create unit test and fix bugs --- feature_engine/discretisation/__init__.py | 1 + feature_engine/discretisation/target_mean.py | 2 +- .../test_target_mean_discretiser.py | 82 +++++-------------- 3 files changed, 23 insertions(+), 62 deletions(-) diff --git a/feature_engine/discretisation/__init__.py b/feature_engine/discretisation/__init__.py index 0c140d8d2..d91eff1e7 100644 --- a/feature_engine/discretisation/__init__.py +++ b/feature_engine/discretisation/__init__.py @@ -7,6 +7,7 @@ from .decision_tree import DecisionTreeDiscretiser from .equal_frequency import EqualFrequencyDiscretiser from .equal_width import EqualWidthDiscretiser +from .target_mean import 
TargetMeanDiscretiser __all__ = [ "DecisionTreeDiscretiser", diff --git a/feature_engine/discretisation/target_mean.py b/feature_engine/discretisation/target_mean.py index 993d3d846..c2f3b6c9c 100644 --- a/feature_engine/discretisation/target_mean.py +++ b/feature_engine/discretisation/target_mean.py @@ -5,6 +5,7 @@ from sklearn.utils.validation import check_is_fitted from feature_engine.discretisation.base_discretiser import BaseDiscretiser +from feature_engine._docstrings.class_inputs import _variables_numerical_docstring from feature_engine._docstrings.methods import ( _fit_not_learn_docstring, _fit_transform_docstring @@ -224,4 +225,3 @@ def _make_pipeline(self): ) return pipe - \ No newline at end of file diff --git a/tests/test_discretisation/test_target_mean_discretiser.py b/tests/test_discretisation/test_target_mean_discretiser.py index 46afb2625..cbaf43c82 100644 --- a/tests/test_discretisation/test_target_mean_discretiser.py +++ b/tests/test_discretisation/test_target_mean_discretiser.py @@ -5,70 +5,30 @@ from feature_engine.discretisation import TargetMeanDiscretiser -def test_equal_frequency_automatically_find_variables_and_return_as_numeric( - df_normal_dist -): - # fit discretiser and transform dataset - transformer = TargetMeanDiscretiser( - strategy="equal_frequency", bins=10, variables=None, return_object=False - ) - X = transformer.fit_transform(df_normal_dist) - - # fit parameters - _, bins = pd.cut(x=df_normal_dist["var"], bins=10, retbins=True, duplicates="drop") - bins[0] = float("-inf") - bins[len(bins) - 1] = float("inf") - - # transform output - X_t = [x for x in range(0, 10)] - - # test init params - assert transformer.bins == 10 - assert transformer.variables is None - assert transformer.return_object is None - # test fit attr - assert transformer.variables_ == ["var"] - assert transformer.n_features_in_ == 1 - # test transform output - assert (transformer.binner_dict_["var"] == bins).all() - assert all(x for x in X["var"].unique() if x 
not in X_t) - # in equal-frequency discretisation, all intervals have the same proportion of values - assert len((X["var"].value_counts()).unqiue()) == 1 +def test_discretiser_using_equal_frequency(): + data = { + "var_A": list(range(1, 11)), + "var_B": list(range(2, 22, 2)), + "var_C": ["A"] * 3 + ["B"] + ["C"] * 4 + ["D"] * 2, + "var_D": list(range(3, 33, 3)), + } + df = pd.DataFrame(data) + target = list(range(10)) - -def test_equal_width_automatically_find_variables_and_return_as_numeric( - df_normal_dist -): transformer = TargetMeanDiscretiser( - strategy="equal_width", bins=10, variables=None, return_object=False + variables=["var_A", "var_D"], + bins=2 ) - X = transformer.fit_transform(df_normal_dist) - - # fit parameters - _, bins = pd.qcut(x=df_normal_dist["var"], q=10, retbins=True, duplicates="drop") - bins[0] = float("-inf") - bins[len(bins) - 1] = float("inf") - - # transform output - X_t = [x for x in range(0, 10)] - val_counts = [18, 17, 16, 13, 11, 7, 7, 5, 5, 1] - - # init params - assert transformer.bins == 10 - assert transformer.variables is None - assert transformer.return_object is False - # fit params - assert transformer.variables_ == ["var"] - assert transformer.n_features_in_ == 1 - # transform params - assert (transformer.binner_dict_["var"] == bins).all() - assert all(x for x in X["var"].unique() if x not in X_t) - # in equal-width discretisation, intervals have number of values - assert all(x for x in ["var"].value_counts() if x not in val_counts) + df_tr = transformer.fit_transform(df, target) -@pytest.mark.parameterize("_bins", [4.2, "python", ["data", "science"]]) -def test_error_when_bins_not_integer(_bins): - with pytest.raises(ValueError): - TargetMeanDiscretiser(bins=_bins) + # + expected_results = { + "var_A": [2.0] * 5 + [7.0] * 5, + "var_B": list(range(2, 22, 2)), + "var_C": ["A"] * 3 + ["B"] + ["C"] * 4 + ["D"] * 2, + "var_D": [2.0] * 5 + [7.0] * 5, + } + expected_results_df = pd.DataFrame(expected_results) + assert 
df_tr.equals(expected_results_df) From 5a229d4e7f27573128a523434398e54f612ca995 Mon Sep 17 00:00:00 2001 From: Morgan-Sell Date: Mon, 9 May 2022 16:43:35 -0700 Subject: [PATCH 22/29] create test_equal_width_strategy --- .../test_target_mean_discretiser.py | 33 +++++++++++++++++-- 1 file changed, 30 insertions(+), 3 deletions(-) diff --git a/tests/test_discretisation/test_target_mean_discretiser.py b/tests/test_discretisation/test_target_mean_discretiser.py index cbaf43c82..03c5deead 100644 --- a/tests/test_discretisation/test_target_mean_discretiser.py +++ b/tests/test_discretisation/test_target_mean_discretiser.py @@ -5,7 +5,7 @@ from feature_engine.discretisation import TargetMeanDiscretiser -def test_discretiser_using_equal_frequency(): +def test_equal_frequency_strategy(): data = { "var_A": list(range(1, 11)), "var_B": list(range(2, 22, 2)), @@ -17,12 +17,12 @@ def test_discretiser_using_equal_frequency(): transformer = TargetMeanDiscretiser( variables=["var_A", "var_D"], - bins=2 + bins=2, + strategy="equal_frequency", ) df_tr = transformer.fit_transform(df, target) - # expected_results = { "var_A": [2.0] * 5 + [7.0] * 5, "var_B": list(range(2, 22, 2)), @@ -32,3 +32,30 @@ def test_discretiser_using_equal_frequency(): expected_results_df = pd.DataFrame(expected_results) assert df_tr.equals(expected_results_df) + + +def test_equal_width_strategy(): + data = { + "var_W": list(range(5, 55, 5)), + "var_X": ["W"] * 3 + ["X"] + ["Y"] * 4 + ["Z"] * 2, + "var_Y": list(range(3, 33, 3)), + "var_Z": list(range(4, 44, 4)), + } + df = pd.DataFrame(data) + target = list(range(10, 30, 2)) + transformer = TargetMeanDiscretiser( + variables=["var_Y", "var_Z"], + bins=3, + strategy="equal_width", + ) + df_tr = transformer.fit_transform(df, target) + + expected_results = { + "var_W": list(range(5, 55, 5)), + "var_X": ["W"] * 3 + ["X"] + ["Y"] * 4 + ["Z"] * 2, + "var_Y": [13.0] * 4 + [20.0] * 3 + [26.0] * 3, + "var_Z": [13.0] * 4 + [20.0] * 3 + [26.0] * 3, + } + 
expected_results_df = pd.DataFrame(expected_results) + + assert df_tr.equals(expected_results_df) From 82f5acc834aeb99fd52d0ae52bc47a91e45b89e9 Mon Sep 17 00:00:00 2001 From: Morgan-Sell Date: Tue, 10 May 2022 16:43:36 -0700 Subject: [PATCH 23/29] fix errors --- feature_engine/discretisation/target_mean.py | 15 +++++++-------- .../test_target_mean_discretiser.py | 2 -- 2 files changed, 7 insertions(+), 10 deletions(-) diff --git a/feature_engine/discretisation/target_mean.py b/feature_engine/discretisation/target_mean.py index c2f3b6c9c..132789afc 100644 --- a/feature_engine/discretisation/target_mean.py +++ b/feature_engine/discretisation/target_mean.py @@ -4,16 +4,15 @@ from sklearn.pipeline import Pipeline from sklearn.utils.validation import check_is_fitted -from feature_engine.discretisation.base_discretiser import BaseDiscretiser from feature_engine._docstrings.class_inputs import _variables_numerical_docstring -from feature_engine._docstrings.methods import ( - _fit_not_learn_docstring, - _fit_transform_docstring -) from feature_engine._docstrings.fit_attributes import ( - _variables_attribute_docstring, _feature_names_in_docstring, _n_features_in_docstring, + _variables_attribute_docstring, +) +from feature_engine._docstrings.methods import ( + _fit_not_learn_docstring, + _fit_transform_docstring, ) from feature_engine._docstrings.substitute import Substitution from feature_engine.dataframe_checks import ( @@ -24,10 +23,10 @@ check_X_y, ) from feature_engine.discretisation import ( - ArbitraryDiscretiser, EqualFrequencyDiscretiser, - EqualWidthDiscretiser + EqualWidthDiscretiser, ) +from feature_engine.discretisation.base_discretiser import BaseDiscretiser from feature_engine.encoding import MeanEncoder from feature_engine.variable_manipulation import ( _check_input_parameter_variables, diff --git a/tests/test_discretisation/test_target_mean_discretiser.py b/tests/test_discretisation/test_target_mean_discretiser.py index 03c5deead..ded841fcc 100644 --- 
a/tests/test_discretisation/test_target_mean_discretiser.py +++ b/tests/test_discretisation/test_target_mean_discretiser.py @@ -1,6 +1,4 @@ import pandas as pd -import pytest -from sklearn.exceptions import NotFittedError from feature_engine.discretisation import TargetMeanDiscretiser From ddd56e5e42109f860829f7806fbe205587d6bbc2 Mon Sep 17 00:00:00 2001 From: Morgan-Sell Date: Tue, 10 May 2022 16:48:27 -0700 Subject: [PATCH 24/29] create rst file --- docs/api_doc/discretisation/TargetMeanDiscretiser.rst | 5 +++++ 1 file changed, 5 insertions(+) create mode 100644 docs/api_doc/discretisation/TargetMeanDiscretiser.rst diff --git a/docs/api_doc/discretisation/TargetMeanDiscretiser.rst b/docs/api_doc/discretisation/TargetMeanDiscretiser.rst new file mode 100644 index 000000000..437ff623d --- /dev/null +++ b/docs/api_doc/discretisation/TargetMeanDiscretiser.rst @@ -0,0 +1,5 @@ +TargetMeanDiscretiser +===================== + +.. autoclass:: feature_engine.discretisation.TargetMeanDiscretiser + :members: \ No newline at end of file From 0a923a608a7fb9b491061eaf29c1f5ab12fea4b8 Mon Sep 17 00:00:00 2001 From: Morgan-Sell Date: Tue, 10 May 2022 17:52:49 -0700 Subject: [PATCH 25/29] start user guide w/ demo --- .../discretisation/TargetMeanDiscretiser.rst | 84 +++++++++++++++++++ 1 file changed, 84 insertions(+) create mode 100644 docs/user_guide/discretisation/TargetMeanDiscretiser.rst diff --git a/docs/user_guide/discretisation/TargetMeanDiscretiser.rst b/docs/user_guide/discretisation/TargetMeanDiscretiser.rst new file mode 100644 index 000000000..4889bc6ed --- /dev/null +++ b/docs/user_guide/discretisation/TargetMeanDiscretiser.rst @@ -0,0 +1,84 @@ +.. _target_mean_discretiser: + +.. currentmodule:: feature_engine.discretisation + +TargetMeanDiscretiser +===================== + +The :class:`TargetMeanDiscretiser()` sorts numerical variables and organizes the values into bins +using either :class:`EqualFrequencyDiscretiser()` or :class:`EqualWidthDiscretiser()`. 
Once the numerical +variables are separated into bins, :class:`MeanEncoder()` replaces categories with the mean of the +target per bin interval. The number of bins is determined by the user. + +Let's look at an example using the California Housing Dataset. + +First, let's load the data and separate it into train and test: + +.. code:: python + + import numpy as np + import pandas as pd + import matplotlib.pyplot as plt + from sklearn.datasets import fetch_california_housing + from sklearn.model_selection import train_test_split + + from feature_engine.discretisation import TargetMeanDiscretiser + + # Load dataset + california_dataset = fetch_california_housing() + data = pd.DataFrame(california_dataset.data, columns=california_dataset.feature_names) + + # Separate into train and test sets + X_train, X_test, y_train, y_test = train_test_split( + data, california_dataset["target"], test_size=0.3, + random_state=0) + +Now, we set up the :class:`TargetMeanDiscretiser()` to encode the discretised bins and replace +the bin indices only in the 3 indicated variables using the :class:`EqualFrequencyDiscretiser()`: + +.. code:: python + + # set up the discretisation transformer + disc = TargetMeanDiscretiser(variables=["HouseAge", "AveRooms", "Population"], + strategy="equal_frequency", + bins=5) + + # fit the transformer + disc.fit(X_train, y_train) + +With `fit()` the transformer learns the boundaries of each interval. Then, we can go +ahead and sort the values into the intervals. The transformer learns the target mean +value for each interval, which is stored in the `encoder_dict_` attribute: + +.. code:: python + + disc._pipeline["encoder"].encoder_dict_ + +The `encoder_dict_` contains the mean value of the target per bin interval, per variable. +So we can easily use this dictionary to map the numbers to the discretised bins. + +.. 
code:: python + + {'HouseAge': {Interval(-inf, 17.0, closed='right'): 2.0806529160739684, + Interval(17.0, 25.0, closed='right'): 2.097539197771588, + Interval(25.0, 33.0, closed='right'): 2.0686614742967993, + Interval(33.0, 40.0, closed='right'): 2.1031412685185185, + Interval(40.0, inf, closed='right'): 2.0266248845381525}, + 'AveRooms': {Interval(-inf, 4.281, closed='right'): 2.0751556984478934, + Interval(4.281, 4.94, closed='right'): 2.0353196247563354, + Interval(4.94, 5.524, closed='right'): 2.122038111675127, + Interval(5.524, 6.258, closed='right'): 2.0422810965372507, + Interval(6.258, inf, closed='right'): 2.103166361757106}, + 'Population': {Interval(-inf, 709.0, closed='right'): 2.0853869883779685, + Interval(709.0, 1004.0, closed='right'): 2.0658340239808153, + Interval(1004.0, 1346.0, closed='right'): 2.0712619255907487, + Interval(1346.0, 1905.0, closed='right'): 2.0454417591204397, + Interval(1905.0, inf, closed='right'): 2.108366283914729}} + +We can now go ahead and replace the bins with the numbers: + +.. code:: python + + # transform the data + train_t = disc.transform(X_train) + test_t = disc.transform(X_test) \ No newline at end of file From d2782034668b1f26a46f0bac298f25634f55439c Mon Sep 17 00:00:00 2001 From: Morgan-Sell Date: Tue, 10 May 2022 18:04:53 -0700 Subject: [PATCH 26/29] fix style error --- tests/test_discretisation/test_check_estimator_discretisers.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_discretisation/test_check_estimator_discretisers.py b/tests/test_discretisation/test_check_estimator_discretisers.py index 553c05ecd..4d4154598 100644 --- a/tests/test_discretisation/test_check_estimator_discretisers.py +++ b/tests/test_discretisation/test_check_estimator_discretisers.py @@ -15,7 +15,7 @@ DecisionTreeDiscretiser(regression=False), EqualFrequencyDiscretiser(), EqualWidthDiscretiser(), - ArbitraryDiscretiser(binning_dict={"0": [-np.Inf, 0, np.Inf]}) + ArbitraryDiscretiser(binning_dict={"0": 
[-np.Inf, 0, np.Inf]}), TargetMeanDiscretiser(), ] From 1a83491d3cb277e9c595dd5cda5e1cc50580bb0b Mon Sep 17 00:00:00 2001 From: Morgan-Sell Date: Tue, 10 May 2022 18:15:03 -0700 Subject: [PATCH 27/29] update docs/index.rst --- docs/index.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/index.rst b/docs/index.rst index a38ee6b19..aa117b131 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -138,6 +138,7 @@ Variable Discretisation: Discretisers - :doc:`api_doc/discretisation/EqualFrequencyDiscretiser`: sorts variable into equal frequency intervals - :doc:`api_doc/discretisation/EqualWidthDiscretiser`: sorts variable into equal width intervals - :doc:`api_doc/discretisation/DecisionTreeDiscretiser`: uses decision trees to create finite variables +- :doc:`api_doc/discretisation/TargetMeanDiscretiser`: sorts variable into equal frequency or equal width intervals then replaces intervals by the target mean Outlier Capping or Removal -------------------------- From 8d7de98fab9cec76f2d30f090c6dd974614ce983 Mon Sep 17 00:00:00 2001 From: Morgan-Sell Date: Tue, 10 May 2022 18:26:39 -0700 Subject: [PATCH 28/29] update api_doc/discretisation/index.rst --- docs/api_doc/discretisation/index.rst | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/docs/api_doc/discretisation/index.rst b/docs/api_doc/discretisation/index.rst index ac000c160..47121ea80 100644 --- a/docs/api_doc/discretisation/index.rst +++ b/docs/api_doc/discretisation/index.rst @@ -16,7 +16,8 @@ into continuous intervals. :class:`EqualFrequencyDiscretiser()` Sorts values into intervals with similar number of observations. :class:`EqualWidthDiscretiser()` Sorts values into intervals of equal size. :class:`ArbitraryDiscretiser()` Sorts values into intervals predefined by the user. -:class:`DecisionTreeDiscretiser()` Replaces values by predictions of a decision tree, which are discrete +:class:`DecisionTreeDiscretiser()` Replaces values by predictions of a decision tree, which are discrete. 
+:class:`TargetMeanDiscretiser()` Sorts variable into equal frequency or equal width intervals then replaces intervals by the target mean. ===================================== ======================================================================== @@ -28,6 +29,7 @@ into continuous intervals. EqualWidthDiscretiser ArbitraryDiscretiser DecisionTreeDiscretiser + TargetMeanDiscretiser Additional transformers for discretisation ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ From cddf8737cb5c17f9e6232031e5c477fc5b4f8ac7 Mon Sep 17 00:00:00 2001 From: Morgan-Sell Date: Wed, 11 May 2022 16:32:50 -0700 Subject: [PATCH 29/29] fix errors --- docs/user_guide/discretisation/index.rst | 1 + feature_engine/discretisation/target_mean.py | 25 ++++++++++---------- 2 files changed, 13 insertions(+), 13 deletions(-) diff --git a/docs/user_guide/discretisation/index.rst b/docs/user_guide/discretisation/index.rst index f61daa022..796b1627f 100644 --- a/docs/user_guide/discretisation/index.rst +++ b/docs/user_guide/discretisation/index.rst @@ -34,3 +34,4 @@ Throughout the user guide, we point to jupyter notebooks that showcase this func EqualWidthDiscretiser ArbitraryDiscretiser DecisionTreeDiscretiser + TargetMeanDiscretiser diff --git a/feature_engine/discretisation/target_mean.py b/feature_engine/discretisation/target_mean.py index 132789afc..f0e6e2535 100644 --- a/feature_engine/discretisation/target_mean.py +++ b/feature_engine/discretisation/target_mean.py @@ -136,15 +136,15 @@ def fit(self, X: pd.DataFrame, y: pd.Series): X, y = check_X_y(X, y) # identify numerical variables - self.variables_numerical_ = _find_or_check_numerical_variables( + self.variables_ = _find_or_check_numerical_variables( X, self.variables ) # check for missing values - _check_contains_na(X, self.variables_numerical_) + _check_contains_na(X, self.variables_) # check for inf - _check_contains_inf(X, self.variables_numerical_) + _check_contains_inf(X, self.variables_) # instantiate pipeline self._pipeline = 
self._make_pipeline() @@ -182,10 +182,10 @@ def transform(self, X: pd.DataFrame) -> pd.DataFrame: _check_X_matches_training_df(X, self.n_features_in_) # check for missing values - _check_contains_na(X, self.variables_numerical_) + _check_contains_na(X, self.variables_) # check for infinite values - _check_contains_inf(X, self.variables_numerical_) + _check_contains_inf(X, self.variables_) # discretise and encode X_tr = self._pipeline.transform(X) @@ -199,14 +199,16 @@ def _make_discretiser(self): if self.strategy == "equal_frequency": discretiser = EqualFrequencyDiscretiser( q=self.bins, - variables=self.variables_numerical_, + variables=self.variables_, return_boundaries=True, + return_object=True, ) else: discretiser = EqualWidthDiscretiser( bins=self.bins, - variables=self.variables_numerical_, - return_boundaries=True + variables=self.variables_, + return_boundaries=True, + return_object=True, ) return discretiser @@ -217,10 +219,7 @@ def _make_pipeline(self): """ pipe = Pipeline([ ("discretiser", self._make_discretiser()), - ("encoder", MeanEncoder( - variables=self.variables_numerical_, - ignore_format=True) - )] - ) + ("encoder", MeanEncoder(variables=self.variables_)) + ]) return pipe