Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

TargetMeanDiscretiser: sorts variables in bins and replaces bins by target mean value #419

Open
wants to merge 29 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 17 commits
Commits
Show all changes
29 commits
Select commit Hold shift + click to select a range
a2b0c9c
initial commit
Morgan-Sell Apr 16, 2022
e517953
create fit()
Morgan-Sell Apr 16, 2022
c823f7c
update init()
Morgan-Sell Apr 16, 2022
20b902a
expand init() and fit() functionality
Morgan-Sell Apr 16, 2022
6403cf8
add functionality to fit()
Morgan-Sell Apr 16, 2022
9a0d662
create _make_discretiser()
Morgan-Sell Apr 16, 2022
de4ae94
create _make_pipeline
Morgan-Sell Apr 16, 2022
b6fac50
expand fit()
Morgan-Sell Apr 17, 2022
a2360a6
remove ArbitraryDiscretiser and corresponding attributes
Morgan-Sell Apr 18, 2022
bf2fc62
update fit()
Morgan-Sell Apr 18, 2022
23baacb
update fit()
Morgan-Sell Apr 19, 2022
8250646
update transform() and _encode_X()
Morgan-Sell Apr 19, 2022
0ac284c
add TargetMeanDiscretiser to test_check_estimator_discretisers.py
Morgan-Sell Apr 19, 2022
265fd08
create test_target_mean_discretiser.py includes initial test
Morgan-Sell Apr 20, 2022
f576e3d
update unit tests
Morgan-Sell Apr 21, 2022
86cbbf5
edit docstring
Morgan-Sell Apr 21, 2022
20317ee
add tests
Morgan-Sell Apr 21, 2022
f676127
update fit()
Morgan-Sell May 7, 2022
c6372ba
(1) add _make_pipeline(); and (2) update fit() and transform()
Morgan-Sell May 9, 2022
d843d0e
fix style error
Morgan-Sell May 9, 2022
138b201
create unit test and fix bugs
Morgan-Sell May 9, 2022
5a229d4
create test_equal_width_strategy
Morgan-Sell May 9, 2022
82f5acc
fix errors
Morgan-Sell May 10, 2022
ddd56e5
create rst file
Morgan-Sell May 10, 2022
0a923a6
start user guide w/ demo
Morgan-Sell May 11, 2022
d278203
fix style error
Morgan-Sell May 11, 2022
1a83491
update docs/index.rst
Morgan-Sell May 11, 2022
8d7de98
update api_doc/discretisation/index.rst
Morgan-Sell May 11, 2022
cddf873
fix errors
Morgan-Sell May 11, 2022
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions feature_engine/discretisation/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,4 +13,5 @@
"EqualFrequencyDiscretiser",
"EqualWidthDiscretiser",
"ArbitraryDiscretiser",
"TargetMeanDiscretiser"
]
234 changes: 234 additions & 0 deletions feature_engine/discretisation/target_mean.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,234 @@
import warnings
from typing import Dict, List, Optional, Union

import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.utils.validation import check_is_fitted

from feature_engine._docstrings.class_inputs import _variables_numerical_docstring
from feature_engine._docstrings.fit_attributes import (
    _feature_names_in_docstring,
    _n_features_in_docstring,
    _variables_attribute_docstring,
)
from feature_engine._docstrings.methods import (
    _fit_not_learn_docstring,
    _fit_transform_docstring,
)
from feature_engine._docstrings.substitute import Substitution
from feature_engine.dataframe_checks import (
    _check_contains_inf,
    _check_contains_na,
    _check_X_matches_training_df,
    check_X,
)
from feature_engine.discretisation import (
    ArbitraryDiscretiser,
    EqualFrequencyDiscretiser,
    EqualWidthDiscretiser,
)
from feature_engine.discretisation.base_discretiser import BaseDiscretiser
from feature_engine.encoding import MeanEncoder
from feature_engine.tags import _return_tags
from feature_engine.variable_manipulation import (
    _check_input_parameter_variables,
    _find_or_check_numerical_variables,
)

@Substitution(
    return_objects=BaseDiscretiser._return_object_docstring,
    return_boundaries=BaseDiscretiser._return_boundaries_docstring,
    binner_dict_=BaseDiscretiser._binner_dict_docstring,
    transform=BaseDiscretiser._transform_docstring,
    variables=_variables_numerical_docstring,
    variables_=_variables_attribute_docstring,
    feature_names_in_=_feature_names_in_docstring,
    n_features_in_=_n_features_in_docstring,
    fit=_fit_not_learn_docstring,
    fit_transform=_fit_transform_docstring,
)
class TargetMeanDiscretiser(BaseDiscretiser):
    """
    The TargetMeanDiscretiser() sorts numerical variables into equal-width or
    equal-frequency intervals / bins and then replaces every interval by a mean
    value learned for that interval during fit().

    The interval limits are learned with an EqualWidthDiscretiser or an
    EqualFrequencyDiscretiser, depending on 'strategy'. The mean of each interval
    is learned with a MeanEncoder, one per variable.

    Parameters
    ----------
    strategy: str, default='equal_frequency'
        Whether the bins should be of equal width ('equal_width') or equal
        frequency ('equal_frequency').

    {variables}

    bins: int, default=10
        Desired number of equal-width or equal-frequency intervals / bins.

    errors: string, default='ignore'
        Takes values 'ignore' and 'raise'. Intended to indicate what to do when a
        value is outside the learned limits: if 'raise', raise an error; if
        'ignore', return NaN and raise a warning. NOTE: the value is currently
        only validated and stored; it is not used by fit() or transform().

    Attributes
    ----------
    {variables_}

    {binner_dict_}

    {feature_names_in_}

    {n_features_in_}

    Methods
    -------
    {fit}

    {fit_transform}

    {transform}

    See Also
    --------
    pandas.cut
    pandas.qcut
    """

    def __init__(
        self,
        variables: Union[None, int, str, List[Union[str, int]]] = None,
        bins: int = 10,
        strategy: str = "equal_frequency",
        errors: str = "ignore",
    ) -> None:

        if not isinstance(bins, int):
            raise ValueError(
                f"bins must be an integer. Got {bins} instead."
            )

        if strategy not in ("equal_frequency", "equal_width"):
            raise ValueError(
                "strategy must equal 'equal_frequency' or 'equal_width'. "
                f"Got {strategy} instead."
            )

        if errors not in ("ignore", "raise"):
            raise ValueError(
                "errors only takes values 'ignore' and 'raise'. "
                f"Got {errors} instead."
            )

        self.variables = _check_input_parameter_variables(variables)
        self.bins = bins
        self.strategy = strategy
        # NOTE(review): 'errors' is stored but nothing in this class reads it yet.
        self.errors = errors

    def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None):
        """
        Learn the boundaries of the selected discretiser's intervals / bins
        for the chosen numerical variables, and the mean value that will replace
        each interval.

        Parameters
        ----------
        X: pandas dataframe of shape = [n_samples, n_features]
            The training dataset. Can be the entire dataframe, not just the
            variables to be transformed.

        y : pandas series of shape = [n_samples,]
            y is not needed in this discretiser. You can pass y or None.

        Returns
        -------
        self: the fitted transformer.
        """
        # check if 'X' is a dataframe
        X = check_X(X)

        # identify numerical variables; stored as 'variables_', the attribute
        # documented in the class docstring
        self.variables_ = _find_or_check_numerical_variables(X, self.variables)

        # check for missing values
        _check_contains_na(X, self.variables_)

        # check for inf
        _check_contains_inf(X, self.variables_)

        # learn the interval limits
        self._discretiser = self._make_discretiser()
        self._discretiser.fit(X)

        # expose the limits learned by the underlying discretiser
        self.binner_dict_ = self._discretiser.binner_dict_

        # learn the per-interval means once, here, so that transform() does not
        # depend on the training data and works on dataframes of any length
        self._encoders = self._fit_encoders(X)

        # store input features
        self.n_features_in_ = X.shape[1]
        self.feature_names_in_ = list(X.columns)

        return self

    def transform(self, X: pd.DataFrame) -> pd.DataFrame:
        """
        Sort the variable values into the learned intervals and replace each
        interval by the mean value learned for it during fit().

        Parameters
        ----------
        X: pandas dataframe of shape = [n_samples, n_features]
            The data to transform.

        Returns
        -------
        X_enc: pandas dataframe of shape = [n_samples, n_features]
            The transformed data with the means of the selected numerical variables.

        """
        # check that fit method has been called
        check_is_fitted(self)

        # check that input is a dataframe
        X = check_X(X)

        # check that input data contain number of columns as the fitted df
        _check_X_matches_training_df(X, self.n_features_in_)

        # check for missing values
        _check_contains_na(X, self.variables_)

        # check for infinite values
        _check_contains_inf(X, self.variables_)

        # discretise
        X_disc = self._discretiser.transform(X)

        # encode
        X_enc = self._encode_X(X_disc)

        return X_enc

    def _make_discretiser(self):
        """
        Instantiate the EqualFrequencyDiscretiser or EqualWidthDiscretiser,
        according to 'strategy', for the variables found in fit().
        """
        if self.strategy == "equal_frequency":
            discretiser = EqualFrequencyDiscretiser(
                q=self.bins,
                variables=self.variables_,
                return_boundaries=True,
            )
        else:
            discretiser = EqualWidthDiscretiser(
                bins=self.bins,
                variables=self.variables_,
                return_boundaries=True,
            )

        return discretiser

    def _bins_as_str(self, X: pd.DataFrame) -> pd.DataFrame:
        """
        Return a copy of X in which the discretised variables are cast to str,
        so that the MeanEncoder handles the intervals as categories.
        """
        X_str = X.copy()
        X_str[self.variables_] = X_str[self.variables_].astype(str)
        return X_str

    def _fit_encoders(self, X: pd.DataFrame) -> dict:
        """
        Fit one MeanEncoder per variable on the discretised training data and
        return them in a dictionary keyed by variable name.

        The value averaged within each interval is the variable's own value
        prior to discretisation.
        """
        # NOTE(review): 'y' is not used; the encoder target is the original
        # variable itself - confirm whether the target 'y' should be used instead.
        X_binned = self._bins_as_str(self._discretiser.transform(X))

        encoders = {}
        for variable in self.variables_:
            encoder = MeanEncoder(variables=variable)
            encoder.fit(X_binned, X[variable])
            encoders[variable] = encoder

        return encoders

    def _encode_X(self, X: pd.DataFrame) -> pd.DataFrame:
        """
        Replace the discrete value (bin) of each selected variable with the
        corresponding mean learned during fit().

        Parameters
        ----------
        X: pandas dataframe
            The output of the fitted discretiser's transform().
        """
        X_enc = self._bins_as_str(X)

        for variable in self.variables_:
            X_enc = self._encoders[variable].transform(X_enc)

        return X_enc
Original file line number Diff line number Diff line change
Expand Up @@ -7,14 +7,16 @@
DecisionTreeDiscretiser,
EqualFrequencyDiscretiser,
EqualWidthDiscretiser,
TargetMeanDiscretiser,
)
from tests.estimator_checks.estimator_checks import check_feature_engine_estimator

# Estimators run through the shared feature-engine estimator checks.
_estimators = [
    DecisionTreeDiscretiser(regression=False),
    EqualFrequencyDiscretiser(),
    EqualWidthDiscretiser(),
    # np.inf (lowercase) is the spelling available in every NumPy version.
    ArbitraryDiscretiser(binning_dict={"0": [-np.inf, 0, np.inf]}),
    TargetMeanDiscretiser(),
]


Expand Down
74 changes: 74 additions & 0 deletions tests/test_discretisation/test_target_mean_discretiser.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,74 @@
import pandas as pd
import pytest
from sklearn.exceptions import NotFittedError
Morgan-Sell marked this conversation as resolved.
Show resolved Hide resolved

from feature_engine.discretisation import TargetMeanDiscretiser


def test_equal_frequency_automatically_find_variables_and_return_as_numeric(
    df_normal_dist
):
    """Equal-frequency strategy: variables are found automatically, the limits
    match pandas.qcut and every interval holds the same number of rows."""
    # fit discretiser and transform dataset
    transformer = TargetMeanDiscretiser(
        strategy="equal_frequency", bins=10, variables=None
    )
    X = transformer.fit_transform(df_normal_dist)

    # expected limits: equal-frequency intervals come from quantiles (qcut),
    # with the outer limits opened to -inf / inf
    _, bins = pd.qcut(x=df_normal_dist["var"], q=10, retbins=True, duplicates="drop")
    bins[0] = float("-inf")
    bins[-1] = float("inf")

    # test init params
    assert transformer.bins == 10
    assert transformer.variables is None
    assert transformer.strategy == "equal_frequency"
    # test fit attr
    assert transformer.variables_ == ["var"]
    assert transformer.n_features_in_ == 1
    # test transform output
    assert (transformer.binner_dict_["var"] == bins).all()
    # every interval is replaced by a single value, so at most 10 distinct values
    assert X["var"].nunique() <= 10
    # in equal-frequency discretisation, all intervals have the same proportion of values
    assert len(X["var"].value_counts().unique()) == 1


def test_equal_width_automatically_find_variables_and_return_as_numeric(
    df_normal_dist
):
    """Equal-width strategy: variables are found automatically and the limits
    match pandas.cut."""
    transformer = TargetMeanDiscretiser(
        strategy="equal_width", bins=10, variables=None
    )
    X = transformer.fit_transform(df_normal_dist)

    # expected limits: equal-width intervals come from pandas.cut, with the
    # outer limits opened to -inf / inf
    _, bins = pd.cut(x=df_normal_dist["var"], bins=10, retbins=True, duplicates="drop")
    bins[0] = float("-inf")
    bins[-1] = float("inf")

    # expected number of observations per interval
    val_counts = [18, 17, 16, 13, 11, 7, 7, 5, 5, 1]

    # init params
    assert transformer.bins == 10
    assert transformer.variables is None
    assert transformer.strategy == "equal_width"
    # fit params
    assert transformer.variables_ == ["var"]
    assert transformer.n_features_in_ == 1
    # transform params
    assert (transformer.binner_dict_["var"] == bins).all()
    # every interval is replaced by a single value, so at most 10 distinct values
    assert X["var"].nunique() <= 10
    # in equal-width discretisation, intervals hold different numbers of values
    assert all(count in val_counts for count in X["var"].value_counts())


@pytest.mark.parametrize("_bins", [4.2, "python", ["data", "science"]])
def test_error_when_bins_not_integer(_bins):
    """The constructor rejects any 'bins' value that is not an integer."""
    with pytest.raises(ValueError):
        TargetMeanDiscretiser(bins=_bins)