From 220bdd773510cb8ce32359bc8bad3ccf31f9dc19 Mon Sep 17 00:00:00 2001 From: Tamar Grey Date: Thu, 15 Sep 2022 15:53:36 -0400 Subject: [PATCH 01/20] Initial implementation of ordinal encoder --- evalml/pipelines/__init__.py | 1 + evalml/pipelines/components/__init__.py | 1 + .../components/transformers/__init__.py | 1 + .../transformers/encoders/__init__.py | 3 + .../transformers/encoders/onehot_encoder.py | 1 + .../transformers/encoders/ordinal_encoder.py | 220 +++++++++++++++++ .../component_tests/test_ordinal_encoder.py | 232 ++++++++++++++++++ 7 files changed, 459 insertions(+) create mode 100644 evalml/pipelines/components/transformers/encoders/ordinal_encoder.py create mode 100644 evalml/tests/component_tests/test_ordinal_encoder.py diff --git a/evalml/pipelines/__init__.py b/evalml/pipelines/__init__.py index 7c8e148502..4a5b71f102 100644 --- a/evalml/pipelines/__init__.py +++ b/evalml/pipelines/__init__.py @@ -43,6 +43,7 @@ VowpalWabbitRegressor, DropNaNRowsTransformer, TimeSeriesRegularizer, + OrdinalEncoder, ) from evalml.pipelines.component_graph import ComponentGraph diff --git a/evalml/pipelines/components/__init__.py b/evalml/pipelines/components/__init__.py index 22d3b82179..66225776a5 100644 --- a/evalml/pipelines/components/__init__.py +++ b/evalml/pipelines/components/__init__.py @@ -66,6 +66,7 @@ ReplaceNullableTypes, DropNaNRowsTransformer, TimeSeriesRegularizer, + OrdinalEncoder, ) from evalml.pipelines.components.ensemble import ( StackedEnsembleClassifier, diff --git a/evalml/pipelines/components/transformers/__init__.py b/evalml/pipelines/components/transformers/__init__.py index 3c9f15285d..8f97550741 100644 --- a/evalml/pipelines/components/transformers/__init__.py +++ b/evalml/pipelines/components/transformers/__init__.py @@ -4,6 +4,7 @@ OneHotEncoder, TargetEncoder, LabelEncoder, + OrdinalEncoder, ) from evalml.pipelines.components.transformers.feature_selection import ( FeatureSelector, diff --git a/evalml/pipelines/components/transformers/encoders/__init__.py b/evalml/pipelines/components/transformers/encoders/__init__.py index 03c988ce22..7584b45ce9 100644 --- a/evalml/pipelines/components/transformers/encoders/__init__.py +++ b/evalml/pipelines/components/transformers/encoders/__init__.py @@ -6,3 +6,6 @@ TargetEncoder, ) from evalml.pipelines.components.transformers.encoders.label_encoder import LabelEncoder +from evalml.pipelines.components.transformers.encoders.ordinal_encoder import ( + OrdinalEncoder, +) diff --git a/evalml/pipelines/components/transformers/encoders/onehot_encoder.py b/evalml/pipelines/components/transformers/encoders/onehot_encoder.py index 0ac457a48b..084b172151 100644 --- a/evalml/pipelines/components/transformers/encoders/onehot_encoder.py +++ b/evalml/pipelines/components/transformers/encoders/onehot_encoder.py @@ -110,6 +110,7 @@ def fit(self, X, y=None): top_n = self.parameters["top_n"] X = infer_feature_types(X) if self.features_to_encode is None: + # --> should update to not include ordinals? Maybe that's configurable based on whether ordinal encoder is used? self.features_to_encode = self._get_cat_cols(X) X_t = X diff --git a/evalml/pipelines/components/transformers/encoders/ordinal_encoder.py b/evalml/pipelines/components/transformers/encoders/ordinal_encoder.py new file mode 100644 index 0000000000..4b6c37100d --- /dev/null +++ b/evalml/pipelines/components/transformers/encoders/ordinal_encoder.py @@ -0,0 +1,220 @@ +"""A transformer that encodes categorical features in a one-hot numeric array.""" +from pdb import set_trace + +import numpy as np +import pandas as pd +import woodwork as ww +from sklearn.preprocessing import OrdinalEncoder as SKOrdinalEncoder +from woodwork.logical_types import Ordinal + +from evalml.pipelines.components import ComponentBaseMeta +from evalml.pipelines.components.transformers.transformer import Transformer +from evalml.utils import infer_feature_types + +"""A transformer that encodes categorical features in a one-hot numeric array.""" + + +class OrdinalEncoderMeta(ComponentBaseMeta): + METHODS_TO_CHECK = ComponentBaseMeta.METHODS_TO_CHECK + [ + "get_feature_names", + ] + + +class OrdinalEncoder(Transformer, metaclass=OrdinalEncoderMeta): + name = "Ordinal Encoder" + hyperparameter_ranges = {} + """{}""" + + def __init__( + self, + top_n=10, + features_to_encode=None, + categories=None, + handle_unknown="error", + unknown_value=None, + encoded_missing_value=np.nan, # --> maybe this should be np.nan since that's the utils ddefault + random_seed=0, + **kwargs, + ): + parameters = { + "top_n": top_n, + "features_to_encode": features_to_encode, + "categories": categories, # --> the cols must have their categories set - so maybe don't need this set? + "handle_unknown": handle_unknown, + "unknown_value": unknown_value, + "encoded_missing_value": encoded_missing_value, + } + parameters.update(kwargs) + + # Check correct inputs + unknown_input_options = ["use_encoded_value", "error"] + if handle_unknown not in unknown_input_options: + raise ValueError( + "Invalid input {} for handle_unknown".format(handle_unknown), + ) + if handle_unknown == "use_encoded_value" and unknown_value is None: + raise ValueError( + "To use encoded value for unknown categories, unknown_value must" + "be specified as either np.nan or as an int that is distinct from" + "the other encoded categories ", + ) + if top_n is not None and categories is not None: + raise ValueError("Cannot use categories and top_n arguments simultaneously") + + # --> add a check for encoded_missing_values is int or npnan? What about unknown value? + self.features_to_encode = features_to_encode + self._encoder = None + + super().__init__( + parameters=parameters, + component_obj=None, + random_seed=random_seed, + ) + self._initial_state = self.random_seed + self._provenance = {} + + @staticmethod + def _get_ordinal_cols(X): + """Get names of ordinal columns in the input DataFrame.""" + return list(X.ww.select(include=["ordinal"], return_schema=True).columns) + + def fit(self, X, y=None): + top_n = self.parameters["top_n"] + # --> we don't ever infer as Ordinal if theyre not set before this wont get used + X = infer_feature_types(X) + if self.features_to_encode is None: + # --> should update to not include ordinals? Maybe that's configurable based on whether ordinal encoder is used? + self.features_to_encode = self._get_ordinal_cols(X) + + X_t = X + invalid_features = [ + col for col in self.features_to_encode if col not in list(X.columns) + ] + if len(invalid_features) > 0: + # --> what if features to encode includes non ordinal cols? + raise ValueError( + "Could not find and encode {} in input data.".format( + ", ".join(invalid_features), + ), + ) + + # helper util to handle unknown ? Probs not needed bc I think the encoder can do wha twe need + # --> handle categories logic - includes topn - which means we do need to do value counts when theres more than n values + if len(self.features_to_encode) == 0: + categories = "auto" + elif self.parameters["categories"] is not None: + categories = self.parameters["categories"] + if len(categories) != len(self.features_to_encode) or not isinstance( + categories[0], + list, + ): + raise ValueError( + "Categories argument must contain a list of categories for each categorical feature", + ) + # --> should we compare with the ordinal categories to make sure they're all at least in there? + # --> if so, add a test + else: + categories = [] + ww_logical_types = X.ww.logical_types + for col in X_t[self.features_to_encode]: + ltype = ww_logical_types[col] + assert isinstance(ltype, Ordinal) + # --> if this is sampled data, the order might not be accurate? + column_categories = ltype.order + + if top_n is None or len(column_categories) <= top_n: + unique_values = column_categories + else: + value_counts = X_t[col].value_counts(dropna=False).to_frame() + # --> is it worth comparing to the column's order? maybe not + value_counts = value_counts.sample( + frac=1, + random_state=self._initial_state, + ) + value_counts = value_counts.sort_values( + [col], + ascending=False, + kind="mergesort", + ) + unique_values = value_counts.head(top_n).index.tolist() + unique_values = np.sort(unique_values) + categories.append(unique_values) + + self._encoder = SKOrdinalEncoder( + categories=categories, + handle_unknown=self.parameters["handle_unknown"], + unknown_value=self.parameters["unknown_value"], + encoded_missing_value=self.parameters["encoded_missing_value"], + ) + + self._encoder.fit(X_t[self.features_to_encode]) + # --> logic to set up input parameters? + return self + + def transform(self, X, y=None): + X = infer_feature_types(X) + + X_copy = X.ww.copy() + X = X.ww.drop(columns=self.features_to_encode) + + # Call sklearn's transform on the ordinal columns + if len(self.features_to_encode) > 0: + X_cat = pd.DataFrame( + self._encoder.transform(X_copy[self.features_to_encode]), + index=X_copy.index, + ) + X_cat.columns = self._get_feature_names() + # --> could we do Integer or IntegerNullably? Maybe but Double is simpler + X_cat.ww.init(logical_types={c: "Double" for c in X_cat.columns}) + self._feature_names = X_cat.columns + + X = ww.utils.concat_columns([X, X_cat]) + + return X + + def _get_feature_names(self): + """Return feature names for the ordinal features after fitting. + + Since ordinal encoding creates one encoded feature per column in features_to_encode, feature + names are formatted as {column_name}_ordinally_encoded --> choose a better name?? maybe one that includes how many categories were encoded + + Returns: + np.ndarray: The feature names after encoding, provided in the same order as input_features. + """ + self._features_to_drop = [] + unique_names = [] + provenance = {} + for col_name in self.features_to_encode: + encoded_name = f"{col_name}_ordinally_encoded" + unique_names.append(encoded_name) + provenance[col_name] = [encoded_name] + # --> make sure provenance should point to a list even with only one element + self._provenance = provenance + return unique_names + + def categories(self, feature_name): + # --> need to make sure this works + """Returns a list of the unique categories to be encoded for the particular feature, in order. + + Args: + feature_name (str): The name of any feature provided to one-hot encoder during fit. + + Returns: + np.ndarray: The unique categories, in the same dtype as they were provided during fit. + + Raises: + ValueError: If feature was not provided to one-hot encoder as a training feature. + """ + try: + index = self.features_to_encode.index(feature_name) + except Exception: + raise ValueError( + f'Feature "{feature_name}" was not provided to one-hot encoder as a training feature', + ) + return self._encoder.categories_[index] + + def get_feature_names(self): + return self._get_feature_names() + + def _get_feature_provenance(self): + return self._provenance diff --git a/evalml/tests/component_tests/test_ordinal_encoder.py b/evalml/tests/component_tests/test_ordinal_encoder.py new file mode 100644 index 0000000000..4082f0e5b1 --- /dev/null +++ b/evalml/tests/component_tests/test_ordinal_encoder.py @@ -0,0 +1,232 @@ +import re + +import numpy as np +import pandas as pd +import pytest +from woodwork.logical_types import Ordinal + +from evalml.pipelines.components import OrdinalEncoder + + +def set_first_three_columns_to_ordinal_with_categories(X, categories): + X.ww.init( + logical_types={ + "col_1": Ordinal(order=categories[0]), + "col_2": Ordinal(order=categories[1]), + "col_3": Ordinal(order=categories[2]), + }, + ) + return X + + +def test_init(): + parameters = { + "top_n": 10, + "features_to_encode": None, + "categories": None, + "handle_unknown": "error", + "unknown_value": None, + "encoded_missing_value": np.nan, + } + encoder = OrdinalEncoder() + assert encoder.parameters == parameters + + +def test_parameters(): + encoder = OrdinalEncoder(top_n=123) + expected_parameters = { + "top_n": 123, + "features_to_encode": None, + "categories": None, + "handle_unknown": "error", + "unknown_value": None, + "encoded_missing_value": np.nan, + } + assert encoder.parameters == expected_parameters + + +def test_invalid_inputs(): + error_msg = "Invalid input {} for handle_unknown".format("bananas") + with pytest.raises(ValueError, match=error_msg): + encoder = OrdinalEncoder(handle_unknown="bananas") + + error_msg = ( + "To use encoded value for unknown categories, unknown_value must" + "be specified as either np.nan or as an int that is distinct from" + "the other encoded categories " + ) + with pytest.raises(ValueError, match=error_msg): + encoder = OrdinalEncoder(handle_unknown="use_encoded_value") + + X = pd.DataFrame( + { + "col_1": ["a", "b", "c", "d", "a"], + "col_2": ["a", "b", "a", "c", "b"], + "col_3": ["a", "a", "a", "a", "a"], + }, + ) + X = set_first_three_columns_to_ordinal_with_categories( + X, + [["a", "b", "c", "d"], ["a", "b", "c"], ["a"]], + ) + encoder = OrdinalEncoder(top_n=None, categories=[["a", "b"], ["a", "c"]]) + error_msg = "Categories argument must contain a list of categories for each categorical feature" + with pytest.raises(ValueError, match=error_msg): + encoder.fit(X) + + encoder = OrdinalEncoder(top_n=None, categories=["a", "b", "c"]) + error_msg = "Categories argument must contain a list of categories for each categorical feature" + with pytest.raises(ValueError, match=error_msg): + encoder.fit(X) + + categories = [["a", "b", "c", "d"], ["a", "b", "c"], ["a", "b"]] + with pytest.raises( + ValueError, + match="Cannot use categories and top_n arguments simultaneously", + ): + OrdinalEncoder(top_n=11, categories=categories, random_seed=2) + + +# --> test no transformation when ordinal type not set and hyes when explicitly set +# --> test feats to encode includes non ordinals + + +def test_ohe_features_to_encode_col_missing(): + X = pd.DataFrame({"col_1": [2, 0, 1, 0, 0], "col_2": ["a", "b", "a", "c", "d"]}) + + encoder = OrdinalEncoder(top_n=5, features_to_encode=["col_3", "col_4"]) + + with pytest.raises(ValueError, match="Could not find and encode"): + encoder.fit(X) + + +def test_ordinal_encoder_is_no_op_for_not_ordinal_features(): + + encoder = OrdinalEncoder(handle_missing="error") + X = pd.DataFrame( + { + "col_1": [1.2, 3.2, None, 4.7], + "col_2": [4.5, 8.9, 11.2, 23.4], + "col_3": [True, False, True, True], + "col_4": [ + "a", + "b", + "a", + "c", + ], + }, + ) + X.ww.init( + logical_types={ + "col_1": "Double", + "col_2": "Integer", + "col_3": "Boolean", + "col_4": "Categorical", + }, + ) + X_t = encoder.fit_transform(X) + # --> need transform implemented for this to mean anything + pd.testing.assert_frame_equal(X_t, X) + + +def test_ordinal_encoder_recognizes_ordinal_columns(): + X = pd.DataFrame( + { + "col_1": ["a", "b", "c", "d", "a"], + "col_2": ["a", "b", "b", "c", "b"], + "col_3": ["a", "a", "a", "a", "a"], + "col_4": [1, 2, 3, 4, 5], + }, + ) + encoder = OrdinalEncoder() + encoder.fit(X) + assert not encoder.features_to_encode + + categories = [["a", "b", "c", "d"], ["a", "b", "c"], ["a"]] + X = set_first_three_columns_to_ordinal_with_categories(X, categories=categories) + + encoder = OrdinalEncoder() + encoder.fit(X) + assert encoder.features_to_encode == ["col_1", "col_2", "col_3"] + assert encoder.features_to_encode == list(encoder._encoder.feature_names_in_) + + encoder = OrdinalEncoder(features_to_encode=["col_1"]) + encoder.fit(X) + assert encoder.features_to_encode == ["col_1"] + assert encoder.features_to_encode == list(encoder._encoder.feature_names_in_) + expected_categories = [categories[0]] + for i, category_list in enumerate(encoder._encoder.categories_): + assert list(category_list) == expected_categories[i] + + +def test_ordinal_encoder_categories_set_correctly(): + X = pd.DataFrame( + { + "col_1": ["a", "b", "c", "d", "a"], + "col_2": ["a", "b", "b", "c", "b"], + "col_3": ["a", "a", "a", "a", "a"], + "col_4": [1, 2, 3, 4, 5], + }, + ) + categories = [["a", "b", "c", "d"], ["a", "b", "c"], ["a"]] + X = set_first_three_columns_to_ordinal_with_categories(X, categories=categories) + + encoder = OrdinalEncoder() + encoder.fit(X) + for i, category_list in enumerate(encoder._encoder.categories_): + assert list(category_list) == categories[i] + + subset_categories = [["a"], ["a"], ["a"]] + encoder = OrdinalEncoder(top_n=None, categories=subset_categories) + with pytest.raises(ValueError) as exec_info: + encoder.fit(X) + assert "Found unknown categories" in exec_info.value.args[0] + + encoder = OrdinalEncoder( + top_n=None, + categories=subset_categories, + handle_unknown="use_encoded_value", + unknown_value=-1, + ) + encoder.fit(X) + for i, category_list in enumerate(encoder._encoder.categories_): + assert list(category_list) == subset_categories[i] + + # --> feels weird that you have to supply these values when just topn is set + # --> do we need to mention tie behavior for top_n? + encoder = OrdinalEncoder( + top_n=1, + handle_unknown="use_encoded_value", + unknown_value=-1, + ) + encoder.fit(X) + expected_categories = [["a"], ["b"], ["a"]] + for i, category_list in enumerate(encoder._encoder.categories_): + assert list(category_list) == expected_categories[i] + + +# --> test feature names +# --> test encoded feature values +# --> null values + + +def test_ordinal_encoder_transform(): + X = pd.DataFrame( + { + "col_1": ["a", "b", "c", "d", "d"], + "col_2": ["a", "b", "a", "c", "b"], + "col_3": ["a", "a", "a", "a", "a"], + }, + ) + X.ww.init( + logical_types={ + "col_1": Ordinal(order=["a", "b", "c", "d"]), + "col_2": Ordinal(order=["a", "b", "c"]), + "col_3": "categorical", + }, + ) + # Test NaN will be counted as a category if within the top_n + encoder = OrdinalEncoder(handle_missing="as_category") + encoder.fit(X) + X_t = encoder.transform(X) + assert False From 833a08d34525c237639825c4650c94aeb080f83b Mon Sep 17 00:00:00 2001 From: Tamar Grey Date: Fri, 16 Sep 2022 13:50:45 -0400 Subject: [PATCH 02/20] Add more tests and fix encoder behavior --- .../transformers/encoders/ordinal_encoder.py | 15 +- .../component_tests/test_one_hot_encoder.py | 1 + .../component_tests/test_ordinal_encoder.py | 410 +++++++++++++++++- 3 files changed, 407 insertions(+), 19 deletions(-) diff --git a/evalml/pipelines/components/transformers/encoders/ordinal_encoder.py b/evalml/pipelines/components/transformers/encoders/ordinal_encoder.py index 4b6c37100d..78576c5517 100644 --- a/evalml/pipelines/components/transformers/encoders/ordinal_encoder.py +++ b/evalml/pipelines/components/transformers/encoders/ordinal_encoder.py @@ -11,11 +11,12 @@ from evalml.pipelines.components.transformers.transformer import Transformer from evalml.utils import infer_feature_types -"""A transformer that encodes categorical features in a one-hot numeric array.""" +"""A transformer that encodes ordinal features.""" class OrdinalEncoderMeta(ComponentBaseMeta): METHODS_TO_CHECK = ComponentBaseMeta.METHODS_TO_CHECK + [ + # --> probs need to add categories back "get_feature_names", ] @@ -131,14 +132,21 @@ def fit(self, X, y=None): frac=1, random_state=self._initial_state, ) + # --> make sure this is sorting on the number value_counts = value_counts.sort_values( [col], ascending=False, kind="mergesort", ) unique_values = value_counts.head(top_n).index.tolist() - unique_values = np.sort(unique_values) - categories.append(unique_values) + + categories.append(list(unique_values)) + + # Add any null values into the categories lists so that they can get handled correctly + if isinstance(categories, list): + for i, col in enumerate(X_t[self.features_to_encode]): + if X_t[col].isna().any(): + categories[i] += [np.nan] self._encoder = SKOrdinalEncoder( categories=categories, @@ -148,7 +156,6 @@ def fit(self, X, y=None): ) self._encoder.fit(X_t[self.features_to_encode]) - # --> logic to set up input parameters? return self def transform(self, X, y=None): diff --git a/evalml/tests/component_tests/test_one_hot_encoder.py b/evalml/tests/component_tests/test_one_hot_encoder.py index 942a0b6f0a..0f287f4c15 100644 --- a/evalml/tests/component_tests/test_one_hot_encoder.py +++ b/evalml/tests/component_tests/test_one_hot_encoder.py @@ -483,6 +483,7 @@ def test_more_top_n_unique_values_large(): def test_categorical_dtype(): + # --> I dont get what this is testin # test that columns with the categorical type are encoded properly X = pd.DataFrame( { diff --git a/evalml/tests/component_tests/test_ordinal_encoder.py b/evalml/tests/component_tests/test_ordinal_encoder.py index 4082f0e5b1..fdc0f25330 100644 --- a/evalml/tests/component_tests/test_ordinal_encoder.py +++ b/evalml/tests/component_tests/test_ordinal_encoder.py @@ -91,7 +91,7 @@ def test_invalid_inputs(): # --> test feats to encode includes non ordinals -def test_ohe_features_to_encode_col_missing(): +def test_ordinal_encoder_col_missing(): X = pd.DataFrame({"col_1": [2, 0, 1, 0, 0], "col_2": ["a", "b", "a", "c", "d"]}) encoder = OrdinalEncoder(top_n=5, features_to_encode=["col_3", "col_4"]) @@ -100,8 +100,7 @@ def test_ohe_features_to_encode_col_missing(): encoder.fit(X) -def test_ordinal_encoder_is_no_op_for_not_ordinal_features(): - +def test_ordinal_encoder_is_no_op_for_df_of_non_ordinal_features(): encoder = OrdinalEncoder(handle_missing="error") X = pd.DataFrame( { @@ -125,7 +124,6 @@ def test_ordinal_encoder_is_no_op_for_not_ordinal_features(): }, ) X_t = encoder.fit_transform(X) - # --> need transform implemented for this to mean anything pd.testing.assert_frame_equal(X_t, X) @@ -150,6 +148,7 @@ def test_ordinal_encoder_recognizes_ordinal_columns(): assert encoder.features_to_encode == ["col_1", "col_2", "col_3"] assert encoder.features_to_encode == list(encoder._encoder.feature_names_in_) + # --> this isn't really testing its ability to recognize ordinals - its testing features to encode encoder = OrdinalEncoder(features_to_encode=["col_1"]) encoder.fit(X) assert encoder.features_to_encode == ["col_1"] @@ -159,7 +158,12 @@ def test_ordinal_encoder_recognizes_ordinal_columns(): assert list(category_list) == expected_categories[i] -def test_ordinal_encoder_categories_set_correctly(): +# --> test setting non ordinal col in features to encode + + +def test_ordinal_encoder_categories_set_correctly_from_fit(): + # The SKOrdinalEncoder.categories_ attribute is what determines what gets encoded + # So we're checking how that gets set during fit X = pd.DataFrame( { "col_1": ["a", "b", "c", "d", "a"], @@ -171,17 +175,22 @@ def test_ordinal_encoder_categories_set_correctly(): categories = [["a", "b", "c", "d"], ["a", "b", "c"], ["a"]] X = set_first_three_columns_to_ordinal_with_categories(X, categories=categories) + # No parameters specified encoder = OrdinalEncoder() encoder.fit(X) for i, category_list in enumerate(encoder._encoder.categories_): assert list(category_list) == categories[i] - subset_categories = [["a"], ["a"], ["a"]] - encoder = OrdinalEncoder(top_n=None, categories=subset_categories) - with pytest.raises(ValueError) as exec_info: - encoder.fit(X) - assert "Found unknown categories" in exec_info.value.args[0] + # Categories set explicitly - means top_n must be set to None + # --> this behavior should be tested elsewhere?? + # encoder = OrdinalEncoder(top_n=None, categories=subset_categories) + # with pytest.raises(ValueError) as exec_info: + # encoder.fit(X) + # assert "Found unknown categories" in exec_info.value.args[0] + # Categories set at init explicitly - means we have to set top_n to None + # and handle the unknown case + subset_categories = [["a"], ["a"], ["a"]] encoder = OrdinalEncoder( top_n=None, categories=subset_categories, @@ -194,6 +203,7 @@ def test_ordinal_encoder_categories_set_correctly(): # --> feels weird that you have to supply these values when just topn is set # --> do we need to mention tie behavior for top_n? + # Categories not specified, but top_n specified to limit categories encoder = OrdinalEncoder( top_n=1, handle_unknown="use_encoded_value", @@ -207,9 +217,6 @@ def test_ordinal_encoder_categories_set_correctly(): # --> test feature names # --> test encoded feature values -# --> null values - - def test_ordinal_encoder_transform(): X = pd.DataFrame( { @@ -221,7 +228,8 @@ def test_ordinal_encoder_transform(): X.ww.init( logical_types={ "col_1": Ordinal(order=["a", "b", "c", "d"]), - "col_2": Ordinal(order=["a", "b", "c"]), + # Order is not alphabetical + "col_2": Ordinal(order=["c", "b", "a"]), "col_3": "categorical", }, ) @@ -229,4 +237,376 @@ def test_ordinal_encoder_transform(): encoder = OrdinalEncoder(handle_missing="as_category") encoder.fit(X) X_t = encoder.transform(X) - assert False + assert set(X_t.columns) == { + "col_1_ordinally_encoded", + "col_2_ordinally_encoded", + "col_3", + } + pd.testing.assert_series_equal( + X_t["col_1_ordinally_encoded"], + pd.Series([0, 1, 2, 3, 3], name="col_1_ordinally_encoded", dtype="float64"), + ) + pd.testing.assert_series_equal( + X_t["col_2_ordinally_encoded"], + pd.Series([2, 1, 2, 0, 1], name="col_2_ordinally_encoded", dtype="float64"), + ) + + +def test_null_values_in_dataframe(): + X = pd.DataFrame( + { + "col_1": ["a", "b", "c", "d", np.nan], + "col_2": [ + "a", + "b", + "a", + "c", + "c", + ], # --> add test where one is none and the other is nan and another is pd.na + "col_3": ["a", "a", "a", "a", "a"], + }, + ) + # Note - we cant include the null value in the categories used by Woodwork + # because it sets the pandas dtypes' categories and they can't include a null value + categories = [["a", "b", "c", "d"], ["a", "b", "c"], ["a"]] + X.ww.init( + logical_types={ + "col_1": Ordinal(order=categories[0]), + "col_2": Ordinal(order=categories[1]), + }, + ) + + # With no args set, nan doesn't get encoded into any value + encoder = OrdinalEncoder() + encoder.fit(X) + X_t = encoder.transform(X) + assert pd.isna(X_t["col_1_ordinally_encoded"].iloc[-1]) + + # If we handle unknowns with an encoded value, the nan will be set to that value + encoder = OrdinalEncoder(encoded_missing_value=-1) + encoder.fit(X) + X_t = encoder.transform(X) + assert X_t["col_1_ordinally_encoded"].iloc[-1] == -1 + + # --> not sure that this is the desired behavior - in ohe it gets treated as its own category + # Test NaN doesn't get counted as a category to encode, so it still gets + # encoded as missing and not unknown even if it's not in the top n + X = pd.DataFrame( + { + "col_1": ["a", "a", "c", "c", np.nan], + "col_2": ["a", "b", "a", "c", "b"], + "col_3": ["a", "a", "a", "a", "a"], + "col_4": [2, 0, 1, np.nan, 0], + }, + ) + categories = [["a", "b", "c"], ["a", "b", "c"], ["a"]] + X.ww.init( + logical_types={ + "col_1": Ordinal(order=categories[0]), + "col_2": Ordinal(order=categories[1]), + }, + ) + encoder = OrdinalEncoder( + top_n=2, + handle_unknown="use_encoded_value", + unknown_value=-1, + ) + encoder.fit(X) + X_t = encoder.transform(X) + assert pd.isna(X_t["col_1_ordinally_encoded"].iloc[-1]) + assert X_t["col_2_ordinally_encoded"].iloc[3] == -1 + + # Test handle_missing='error' throws an error + # --> not currently an option - should we add? + # encoder_error = OrdinalEncoder(handle_missing="error") + + # X = pd.DataFrame( + # { + # "col_1": [np.nan, "b", "c", "d", "e", "f", "g"], + # "col_2": ["a", "c", "d", "b", "e", "e", "f"], + # "col_3": ["a", "a", "a", "a", "a", "a", "b"], + # "col_4": [2, 0, 1, 3, 0, 1, 2], + # }, + # ) + # X.ww.init(logical_types={"col_1": "categorical"}) + # with pytest.raises(ValueError, match="Input contains NaN"): + # encoder_error.fit(X) + + +def test_ordinal_encoder_diff_na_types(): + X = pd.DataFrame( + { + "col_1": ["a", "b", "c", "d", np.nan], + "col_2": ["a", "b", "a", "c", None], + "col_3": ["a", "a", "a", "a", pd.NA], + }, + ) + categories = [["a", "b", "c", "d"], ["a", "b", "c"], ["a"]] + X.ww.init( + logical_types={ + "col_1": Ordinal(order=categories[0]), + "col_2": Ordinal(order=categories[1]), + "col_3": Ordinal(order=categories[2]), + }, + ) + encoder = OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1) + encoder.fit(X) + X_t = encoder.transform(X) + # Confirm all are still null + assert pd.isna(X_t["col_1_ordinally_encoded"].iloc[-1]) + assert pd.isna(X_t["col_2_ordinally_encoded"].iloc[-1]) + assert pd.isna(X_t["col_3_ordinally_encoded"].iloc[-1]) + + +# --> diff combinations of parameters + + +def test_handle_unknown(): + X = pd.DataFrame( + { + "col_1": ["a", "b", "c", "d", "e", "f", "g"], + "col_2": ["a", "c", "d", "b", "e", "e", "f"], + "col_3": ["a", "a", "a", "a", "a", "a", "b"], + "col_4": [2, 0, 1, 3, 0, 1, 2], + }, + ) + categories = [ + ["a", "b", "c", "d", "e", "f", "g"], + ["a", "b", "c", "d", "e", "f"], + ["a", "b"], + ] + X = set_first_three_columns_to_ordinal_with_categories(X, categories=categories) + + encoder = OrdinalEncoder(handle_unknown="error") + encoder.fit(X) + assert isinstance(encoder.transform(X), pd.DataFrame) + + X = pd.DataFrame( + { + "col_1": ["x", "b", "c", "d", "e", "f", "g"], + "col_2": ["a", "c", "d", "b", "e", "e", "f"], + "col_3": ["a", "a", "a", "a", "a", "a", "b"], + "col_4": [2, 0, 1, 3, 0, 1, 2], + }, + ) + categories = [ + ["x", "b", "c", "d", "e", "f", "g"], + ["a", "b", "c", "d", "e", "f"], + ["a", "b"], + ] + X = set_first_three_columns_to_ordinal_with_categories(X, categories=categories) + with pytest.raises(ValueError) as exec_info: + # Using the encoder that was fit on data without x + encoder.transform(X) + assert "Found unknown categories" in exec_info.value.args[0] + + +# --> passed in categories have a different sorted order than that of the data itsef - use ordinal order as sourceo f truth and just inpput param as ways to specify what subset + + +def test_no_top_n(): + # test all categories in all columns are encoded when top_n is None + X = pd.DataFrame( + { + "col_1": ["a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k"], + "col_2": ["a", "c", "d", "b", "e", "e", "f", "a", "b", "c", "d"], + "col_3": ["a", "a", "a", "a", "a", "a", "b", "a", "a", "b", "b"], + "col_4": [2, 0, 1, 3, 0, 1, 2, 0, 2, 1, 2], + }, + ) + X.ww.init( + logical_types={ + "col_1": Ordinal( + order=["a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k"], + ), + "col_2": Ordinal(order=["a", "b", "c", "d", "e", "f"]), + }, + ) + expected_col_names = set( + ["col_3", "col_4", "col_1_ordinally_encoded", "col_2_ordinally_encoded"], + ) + + encoder = OrdinalEncoder(top_n=None, handle_unknown="error", random_seed=2) + encoder.fit(X) + X_t = encoder.transform(X) + + col_names = set(X_t.columns) + assert col_names == expected_col_names + + # Make sure unknown values cause an error + X_new = pd.DataFrame( + { + "col_1": ["a", "b", "c", "x"], + "col_2": ["a", "c", "d", "b"], + "col_3": ["a", "a", "a", "a"], + "col_4": [2, 0, 1, 3], + }, + ) + X_new.ww.init( + logical_types={ + "col_1": Ordinal(order=["a", "b", "c", "x"]), + "col_2": Ordinal(order=["a", "b", "c", "d"]), + }, + ) + with pytest.raises(ValueError) as exec_info: + encoder.transform(X_new) + assert "Found unknown categories" in exec_info.value.args[0] + + +# def test_categories(): +# X = pd.DataFrame( +# { +# "col_1": ["a", "b", "c", "d", "e", "f", "g"], +# "col_2": ["a", "c", "d", "b", "e", "e", "f"], +# "col_3": ["a", "a", "a", "a", "a", "a", "b"], +# "col_4": [2, 0, 1, 3, 0, 1, 2], +# }, +# ) +# X = set_first_three_columns_to_categorical(X) + +# categories = [["a", "b", "c", "d"], ["a", "b", "c"], ["a", "b"]] + +# # test categories value works +# encoder = OrdinalEncoder(top_n=None, categories=categories, random_seed=2) +# encoder.fit(X) +# X_t = encoder.transform(X) + +# col_names = set(X_t.columns) +# expected_col_names = set( +# [ +# "col_1_a", +# "col_1_b", +# "col_1_c", +# "col_1_d", +# "col_2_a", +# "col_2_b", +# "col_2_c", +# "col_3_a", +# "col_3_b", +# "col_4", +# ], +# ) +# assert X_t.shape == (7, 10) +# assert col_names == expected_col_names + +# # test categories with top_n errors +# with pytest.raises( +# ValueError, +# match="Cannot use categories and top_n arguments simultaneously", +# ): +# encoder = OrdinalEncoder(top_n=10, categories=categories, random_seed=2) + + +# def test_less_than_top_n_unique_values(): +# # test that columns with less than n unique values encodes properly +# X = pd.DataFrame( +# { +# "col_1": ["a", "b", "c", "d", "a"], +# "col_2": ["a", "b", "a", "c", "b"], +# "col_3": ["a", "a", "a", "a", "a"], +# "col_4": [2, 0, 1, 0, 0], +# }, +# ) +# X.ww.init(logical_types={"col_1": "categorical", "col_2": "categorical"}) +# encoder = OrdinalEncoder(top_n=5) +# encoder.fit(X) +# X_t = encoder.transform(X) +# expected_col_names = set( +# [ +# "col_1_a", +# "col_1_b", +# "col_1_c", +# "col_1_d", +# "col_2_a", +# "col_2_b", +# "col_2_c", +# "col_3_a", +# "col_4", +# ], +# ) +# col_names = set(X_t.columns) +# assert col_names == expected_col_names + + +# def test_more_top_n_unique_values(): +# # test that columns with >= n unique values encodes properly +# X = pd.DataFrame( +# { +# "col_1": ["a", "b", "c", "d", "e", "f", "g"], +# "col_2": ["a", "c", "d", "b", "e", "e", "f"], +# "col_3": ["a", "a", "a", "a", "a", "a", "b"], +# "col_4": [2, 0, 1, 3, 0, 1, 2], +# }, +# ) +# X = set_first_three_columns_to_categorical(X) + +# random_seed = 2 + +# encoder = OrdinalEncoder(top_n=5, random_seed=random_seed) +# encoder.fit(X) +# X_t = encoder.transform(X) + +# # Conversion changes the resulting dataframe dtype, resulting in a different random state, so we need make the conversion here too +# X = infer_feature_types(X) +# col_1_counts = X["col_1"].value_counts(dropna=False).to_frame() +# col_1_counts = col_1_counts.sample(frac=1, random_state=random_seed) +# col_1_counts = col_1_counts.sort_values( +# ["col_1"], +# ascending=False, +# kind="mergesort", +# ) +# col_1_samples = col_1_counts.head(encoder.parameters["top_n"]).index.tolist() + +# col_2_counts = X["col_2"].value_counts(dropna=False).to_frame() +# col_2_counts = col_2_counts.sample(frac=1, random_state=random_seed) +# col_2_counts = col_2_counts.sort_values( +# ["col_2"], +# ascending=False, +# kind="mergesort", +# ) +# col_2_samples = col_2_counts.head(encoder.parameters["top_n"]).index.tolist() + +# expected_col_names = set(["col_2_e", "col_3_b", "col_4"]) +# for val in col_1_samples: +# expected_col_names.add("col_1_" + val) +# for val in col_2_samples: +# expected_col_names.add("col_2_" + val) + +# col_names = set(X_t.columns) +# assert col_names == expected_col_names + + +# def test_more_top_n_unique_values_large(): +# X = pd.DataFrame( +# { +# "col_1": ["a", "b", "c", "d", "e", "f", "g", "h", "i"], +# "col_2": ["a", "a", "a", "b", "b", "c", "c", "d", "e"], +# "col_3": ["a", "a", "a", "b", "b", "b", "c", "c", "d"], +# "col_4": [2, 0, 1, 3, 0, 1, 2, 4, 1], +# }, +# ) +# X = set_first_three_columns_to_categorical(X) +# random_seed = 2 + +# encoder = OrdinalEncoder(top_n=3, random_seed=random_seed) +# encoder.fit(X) +# X_t = encoder.transform(X) + +# # Conversion changes the resulting dataframe dtype, resulting in a different random state, so we need make the conversion here too +# X = infer_feature_types(X) +# col_1_counts = X["col_1"].value_counts(dropna=False).to_frame() +# col_1_counts = col_1_counts.sample(frac=1, random_state=random_seed) +# col_1_counts = col_1_counts.sort_values( +# ["col_1"], +# ascending=False, +# kind="mergesort", +# ) +# col_1_samples = col_1_counts.head(encoder.parameters["top_n"]).index.tolist() +# expected_col_names = set( +# ["col_2_a", "col_2_b", "col_2_c", "col_3_a", "col_3_b", "col_3_c", "col_4"], +# ) +# for val in col_1_samples: +# expected_col_names.add("col_1_" + val) + +# col_names = set(X_t.columns) +# assert col_names == expected_col_names From fec0cb0c445fd4288250e81046ef08b4c21f28b0 Mon Sep 17 00:00:00 2001 From: Tamar Grey Date: Fri, 16 Sep 2022 14:40:17 -0400 Subject: [PATCH 03/20] More tests and use ltype order as source of truth --- .../transformers/encoders/ordinal_encoder.py | 18 +- .../component_tests/test_ordinal_encoder.py | 174 +++++++++++------- 2 files changed, 122 insertions(+), 70 deletions(-) diff --git a/evalml/pipelines/components/transformers/encoders/ordinal_encoder.py b/evalml/pipelines/components/transformers/encoders/ordinal_encoder.py index 78576c5517..8d7c8a223b 100644 --- a/evalml/pipelines/components/transformers/encoders/ordinal_encoder.py +++ b/evalml/pipelines/components/transformers/encoders/ordinal_encoder.py @@ -104,14 +104,26 @@ def fit(self, X, y=None): if len(self.features_to_encode) == 0: categories = "auto" elif self.parameters["categories"] is not None: - categories = self.parameters["categories"] - if len(categories) != len(self.features_to_encode) or not isinstance( - categories[0], + input_categories = self.parameters["categories"] + if len(input_categories) != len(self.features_to_encode) or not isinstance( + input_categories[0], list, ): raise ValueError( "Categories argument must contain a list of categories for each categorical feature", ) + + # Categories should be in the same order as the data's Ordinal.order categories + # even if it's a subset + # --> refactor this to be nicer + categories = [] + for i, col_categories in enumerate(input_categories): + categories_order = X.ww.logical_types[X.columns[i]].order + ordered_categories = [ + cat for cat in categories_order if cat in col_categories + ] + categories.append(ordered_categories) + # --> should we compare with the ordinal categories to make sure they're all at least in there? # --> if so, add a test else: diff --git a/evalml/tests/component_tests/test_ordinal_encoder.py b/evalml/tests/component_tests/test_ordinal_encoder.py index fdc0f25330..45144f5d58 100644 --- a/evalml/tests/component_tests/test_ordinal_encoder.py +++ b/evalml/tests/component_tests/test_ordinal_encoder.py @@ -453,79 +453,119 @@ def test_no_top_n(): assert "Found unknown categories" in exec_info.value.args[0] -# def test_categories(): -# X = pd.DataFrame( -# { -# "col_1": ["a", "b", "c", "d", "e", "f", "g"], -# "col_2": ["a", "c", "d", "b", "e", "e", "f"], -# "col_3": ["a", "a", "a", "a", "a", "a", "b"], -# "col_4": [2, 0, 1, 3, 0, 1, 2], -# }, -# ) -# X = set_first_three_columns_to_categorical(X) +def test_categories_set_at_init(): + X = pd.DataFrame( + { + "col_1": ["a", "b", "c", "d", "e", "f", "g"], + "col_2": ["a", "c", "d", "b", "e", "e", "f"], + "col_3": ["a", "a", "a", "a", "a", "a", "b"], + "col_4": [2, 0, 1, 3, 0, 1, 2], + }, + ) + full_categories = [ + ["a", "b", "c", "d", "e", "f", "g"], + ["a", "b", "c", "d", "e", "f"], + ["a", "b"], + ] + X = set_first_three_columns_to_ordinal_with_categories( + X, + categories=full_categories, + ) -# categories = [["a", "b", "c", "d"], ["a", "b", "c"], ["a", "b"]] + categories = [["a", "b", "c", "d"], ["a", "b", "c"], ["a", "b"]] -# # test categories value works -# encoder = OrdinalEncoder(top_n=None, categories=categories, random_seed=2) -# encoder.fit(X) -# X_t = encoder.transform(X) + # test categories value works when transforming + encoder = OrdinalEncoder( + top_n=None, + categories=categories, + handle_unknown="use_encoded_value", + unknown_value=-1, + random_seed=2, + ) + encoder.fit(X) + X_t = encoder.transform(X) -# col_names = set(X_t.columns) -# expected_col_names = set( -# [ -# "col_1_a", -# "col_1_b", -# "col_1_c", -# "col_1_d", -# "col_2_a", -# "col_2_b", -# "col_2_c", -# "col_3_a", -# "col_3_b", -# "col_4", -# ], -# ) -# assert X_t.shape == (7, 10) -# assert col_names == expected_col_names + assert list(X_t["col_1_ordinally_encoded"]) == [0, 1, 2, 3, -1, -1, -1] + assert list(X_t["col_2_ordinally_encoded"]) == [0, 2, -1, 1, -1, -1, -1] + assert list(X_t["col_3_ordinally_encoded"]) == [0, 0, 0, 0, 0, 0, 1] -# # test categories with top_n errors -# with pytest.raises( -# ValueError, -# match="Cannot use categories and top_n arguments simultaneously", -# ): -# encoder = OrdinalEncoder(top_n=10, categories=categories, random_seed=2) +def test_categories_different_order_from_ltype(): + # The order of categories comes from the Ordinal.order property of the data. + # Categories passed in as input to the encoder just determine what subset should + # be used. + X = pd.DataFrame( + { + "col_1": ["a", "b", "c", "d", "e", "f", "g"], + "col_2": ["a", "c", "d", "b", "e", "e", "f"], + "col_3": ["a", "a", "a", "a", "a", "a", "b"], + "col_4": [2, 0, 1, 3, 0, 1, 2], + }, + ) + full_categories = [ + ["a", "b", "c", "d", "e", "f", "g"], + ["a", "b", "c", "d", "e", "f"], + ["a", "b"], + ] + X = set_first_three_columns_to_ordinal_with_categories( + X, + categories=full_categories, + ) -# def test_less_than_top_n_unique_values(): -# # test that columns with less than n unique values encodes properly -# X = pd.DataFrame( -# { -# "col_1": ["a", "b", "c", "d", "a"], -# "col_2": ["a", "b", "a", "c", "b"], -# "col_3": ["a", "a", "a", "a", "a"], -# "col_4": [2, 0, 1, 0, 0], -# }, -# ) -# X.ww.init(logical_types={"col_1": "categorical", "col_2": "categorical"}) -# encoder = OrdinalEncoder(top_n=5) -# encoder.fit(X) -# X_t = encoder.transform(X) -# expected_col_names = set( -# [ -# "col_1_a", -# "col_1_b", -# "col_1_c", -# "col_1_d", -# "col_2_a", -# "col_2_b", -# "col_2_c", -# "col_3_a", -# "col_4", -# ], -# ) -# col_names = set(X_t.columns) -# assert col_names == expected_col_names + # The order doesn't match the full categories above but outputted data will still match above + categories = [["d", "a", "c", "b"], ["c", "b", "a"], ["b", "a"]] + + # test categories value works when transforming + encoder = OrdinalEncoder( + top_n=None, + categories=categories, + handle_unknown="use_encoded_value", + unknown_value=-1, + random_seed=2, + ) + encoder.fit(X) + X_t = encoder.transform(X) + + assert list(X_t["col_1_ordinally_encoded"]) == [0, 1, 2, 3, -1, -1, -1] + assert list(X_t["col_2_ordinally_encoded"]) == [0, 2, -1, 1, -1, -1, -1] + assert list(X_t["col_3_ordinally_encoded"]) == [0, 0, 0, 0, 0, 0, 1] + + +def test_less_than_top_n_unique_values(): + X = pd.DataFrame( + { + "col_1": ["a", "b", "c", "d", "d"], + "col_2": ["a", "b", "a", "c", "b"], + "col_3": ["a", "a", "a", "a", "a"], + "col_4": [2, 0, 1, 0, 0], + }, + ) + X.ww.init( + logical_types={ + "col_1": Ordinal(order=["a", "b", "c", "d"]), + "col_2": Ordinal(order=["c", "b", "a"]), + "col_3": "categorical", + }, + ) + + encoder = OrdinalEncoder(top_n=5) + encoder.fit(X) + X_t = encoder.transform(X) + + assert set(X_t.columns) == { + "col_1_ordinally_encoded", + "col_2_ordinally_encoded", + "col_3", + "col_4", + } + pd.testing.assert_series_equal( + X_t["col_1_ordinally_encoded"], + pd.Series([0, 1, 2, 3, 3], name="col_1_ordinally_encoded", dtype="float64"), + ) + pd.testing.assert_series_equal( + X_t["col_2_ordinally_encoded"], + pd.Series([2, 1, 2, 0, 1], name="col_2_ordinally_encoded", dtype="float64"), + ) # def test_more_top_n_unique_values(): From 4d3bd08cb9401322ab2268512273df04dadfab9d Mon Sep 17 00:00:00 2001 From: Tamar Grey Date: Tue, 20 Sep 2022 13:48:19 -0400 Subject: [PATCH 04/20] add more tests and make sure categories order matches ordinal ltype's --- .../transformers/encoders/ordinal_encoder.py | 10 +- .../component_tests/test_ordinal_encoder.py | 335 ++++++++++++++---- 2 files changed, 283 insertions(+), 62 deletions(-) diff --git a/evalml/pipelines/components/transformers/encoders/ordinal_encoder.py b/evalml/pipelines/components/transformers/encoders/ordinal_encoder.py index 8d7c8a223b..f41b639250 100644 --- a/evalml/pipelines/components/transformers/encoders/ordinal_encoder.py +++ b/evalml/pipelines/components/transformers/encoders/ordinal_encoder.py @@ -140,6 +140,7 @@ def fit(self, X, y=None): else: value_counts = X_t[col].value_counts(dropna=False).to_frame() # --> is it worth comparing to the column's order? maybe not + # --> i assume randomness here is so that we aren' always using the same columns? value_counts = value_counts.sample( frac=1, random_state=self._initial_state, @@ -152,7 +153,14 @@ def fit(self, X, y=None): ) unique_values = value_counts.head(top_n).index.tolist() - categories.append(list(unique_values)) + # Categories should be in the same order as the data's Ordinal.order categories + categories_order = X.ww.logical_types[col].order + unique_values_in_order = [ + cat for cat in categories_order if cat in unique_values + ] + categories.append(unique_values_in_order) + + # Categories should be in the same order as the data's Ordinal.order categories # Add any null values into the categories lists so that they can get handled correctly if isinstance(categories, list): diff --git a/evalml/tests/component_tests/test_ordinal_encoder.py b/evalml/tests/component_tests/test_ordinal_encoder.py index 45144f5d58..799f261776 100644 --- a/evalml/tests/component_tests/test_ordinal_encoder.py +++ b/evalml/tests/component_tests/test_ordinal_encoder.py @@ -5,6 +5,7 @@ import pytest from woodwork.logical_types import Ordinal +from evalml.exceptions import ComponentNotYetFittedError from evalml.pipelines.components import OrdinalEncoder @@ -233,7 +234,6 @@ def test_ordinal_encoder_transform(): "col_3": "categorical", }, ) - # Test NaN will be counted as a category if within the top_n encoder = OrdinalEncoder(handle_missing="as_category") encoder.fit(X) X_t = encoder.transform(X) @@ -568,85 +568,298 @@ def test_less_than_top_n_unique_values(): ) -# def test_more_top_n_unique_values(): -# # test that columns with >= n unique values encodes properly +def test_more_top_n_unique_values(): + # test that columns with >= n unique values encodes properly + X = pd.DataFrame( + { + "col_1": ["a", "b", "c", "d", "e", "f", "g"], + "col_2": ["a", "c", "d", "b", "e", "e", "f"], + "col_3": ["a", "a", "a", "a", "a", "a", "b"], + "col_4": [2, 0, 1, 3, 0, 1, 2], + }, + ) + full_categories = [ + ["a", "b", "c", "d", "e", "f", "g"], + ["a", "b", "c", "d", "e", "f"], + ["a", "b"], + ] + X = set_first_three_columns_to_ordinal_with_categories( + X, + categories=full_categories, + ) + + random_seed = 2 + + encoder = OrdinalEncoder( + top_n=5, + random_seed=random_seed, + handle_unknown="use_encoded_value", + unknown_value=-1, + ) + encoder.fit(X) + X_t = encoder.transform(X) + + # With random seed, selected categories are e, b, d, c, g + assert list(X_t["col_1_ordinally_encoded"]) == [-1, 0, 1, 2, 3, -1, 4] + assert list(X_t["col_2_ordinally_encoded"]) == [0, 2, 3, 1, 4, 4, -1] + assert list(X_t["col_3_ordinally_encoded"]) == [0, 0, 0, 0, 0, 0, 1] + + +def test_numpy_input(): + X = np.array([[2, 0, 1, 0, 0], [3, 2, 5, 1, 3]]) + encoder = OrdinalEncoder() + encoder.fit(X) + X_t = encoder.transform(X) + pd.testing.assert_frame_equal(pd.DataFrame(X), X_t, check_dtype=False) + + +@pytest.mark.parametrize( + "index", + [ + list(range(-5, 0)), + list(range(100, 105)), + [f"row_{i}" for i in range(5)], + pd.date_range("2020-09-08", periods=5), + ], +) +def test_ordinal_encoder_preserves_custom_index(index): + df = pd.DataFrame( + {"categories": [f"cat_{i}" for i in range(5)], "numbers": np.arange(5)}, + index=index, + ) + encoder = OrdinalEncoder() + new_df = encoder.fit_transform(df) + pd.testing.assert_index_equal(new_df.index, df.index) + assert not new_df.isna().any(axis=None) + + +# def test_ordinal_encoder_categories(): # X = pd.DataFrame( -# { -# "col_1": ["a", "b", "c", "d", "e", "f", "g"], -# "col_2": ["a", "c", "d", "b", "e", "e", "f"], -# "col_3": ["a", "a", "a", "a", "a", "a", "b"], -# "col_4": [2, 0, 1, 3, 0, 1, 2], -# }, +# {"col_1": ["a"] * 10, "col_2": ["a"] * 3 + ["b"] * 3 + ["c"] * 2 + ["d"] * 2}, # ) -# X = set_first_three_columns_to_categorical(X) +# X.ww.init(logical_types={"col_2": "categorical"}) +# encoder = OrdinalEncoder(top_n=2) +# with pytest.raises( +# ComponentNotYetFittedError, +# match="This OrdinalEncoder is not fitted yet. You must fit OrdinalEncoder before calling categories.", +# ): +# encoder.categories("col_1") -# random_seed = 2 +# encoder.fit(X) +# np.testing.assert_array_equal(encoder.categories("col_1"), np.array(["a"])) +# np.testing.assert_array_equal(encoder.categories("col_2"), np.array(["a", "b"])) +# with pytest.raises( +# ValueError, +# match='Feature "col_12345" was not provided to ordinal encoder as a training feature', +# ): +# encoder.categories("col_12345") -# encoder = OrdinalEncoder(top_n=5, random_seed=random_seed) + +# def test_ohe_get_feature_names(): +# X = pd.DataFrame( +# {"col_1": ["a"] * 10, "col_2": ["a"] * 3 + ["b"] * 3 + ["c"] * 2 + ["d"] * 2}, +# ) +# X.ww.init(logical_types={"col_2": "categorical"}) +# ohe = OneHotEncoder(top_n=2) +# with pytest.raises( +# ComponentNotYetFittedError, +# match="This OneHotEncoder is not fitted yet. You must fit OneHotEncoder before calling get_feature_names.", +# ): +# ohe.get_feature_names() +# ohe.fit(X) +# np.testing.assert_array_equal( +# ohe.get_feature_names(), +# np.array(["col_1_a", "col_2_a", "col_2_b"]), +# ) + +# X = pd.DataFrame({"col_1": ["a"] * 4 + ["b"] * 6, "col_2": ["b"] * 3 + ["c"] * 7}) +# ohe = OneHotEncoder(drop="if_binary") +# ohe.fit(X) +# np.testing.assert_array_equal( +# ohe.get_feature_names(), +# np.array(["col_1_a", "col_2_b"]), +# ) + + +# def test_ohe_features_to_encode(): +# # Test feature that doesn't need encoding and +# # feature that needs encoding but is not specified remain untouched +# X = pd.DataFrame({"col_1": [2, 0, 1, 0, 0], "col_2": ["a", "b", "a", "c", "d"]}) + +# encoder = OneHotEncoder(top_n=5, features_to_encode=["col_1"]) # encoder.fit(X) # X_t = encoder.transform(X) +# expected_col_names = set(["col_1_0", "col_1_1", "col_1_2", "col_2"]) +# col_names = set(X_t.columns) +# assert col_names == expected_col_names +# assert [X_t[col].dtype == "uint8" for col in X_t] -# # Conversion changes the resulting dataframe dtype, resulting in a different random state, so we need make the conversion here too -# X = infer_feature_types(X) -# col_1_counts = X["col_1"].value_counts(dropna=False).to_frame() -# col_1_counts = col_1_counts.sample(frac=1, random_state=random_seed) -# col_1_counts = col_1_counts.sort_values( -# ["col_1"], -# ascending=False, -# kind="mergesort", -# ) -# col_1_samples = col_1_counts.head(encoder.parameters["top_n"]).index.tolist() - -# col_2_counts = X["col_2"].value_counts(dropna=False).to_frame() -# col_2_counts = col_2_counts.sample(frac=1, random_state=random_seed) -# col_2_counts = col_2_counts.sort_values( -# ["col_2"], -# ascending=False, -# kind="mergesort", +# encoder = OneHotEncoder(top_n=5, features_to_encode=["col_1", "col_2"]) +# encoder.fit(X) +# X_t = encoder.transform(X) +# expected_col_names = set( +# ["col_1_0", "col_1_1", "col_1_2", "col_2_a", "col_2_b", "col_2_c", "col_2_d"], # ) -# col_2_samples = col_2_counts.head(encoder.parameters["top_n"]).index.tolist() +# col_names = set(X_t.columns) +# assert col_names == expected_col_names +# assert [X_t[col].dtype == "uint8" for col in X_t] -# expected_col_names = set(["col_2_e", "col_3_b", "col_4"]) -# for val in col_1_samples: -# expected_col_names.add("col_1_" + val) -# for val in col_2_samples: -# expected_col_names.add("col_2_" + val) +# def test_ohe_features_to_encode_col_missing(): +# X = pd.DataFrame({"col_1": [2, 0, 1, 0, 0], "col_2": ["a", "b", "a", "c", "d"]}) + +# encoder = OneHotEncoder(top_n=5, features_to_encode=["col_3", "col_4"]) + +# with pytest.raises(ValueError, match="Could not find and encode"): +# encoder.fit(X) + + +# def test_ohe_features_to_encode_no_col_names(): +# X = pd.DataFrame([["b", 0], ["a", 1], ["b", 1]]) +# encoder = OneHotEncoder(top_n=5, features_to_encode=[0]) +# encoder.fit(X) +# X_t = encoder.transform(X) +# expected_col_names = set([1, "0_a"]) # col_names = set(X_t.columns) # assert col_names == expected_col_names +# assert [X_t[col].dtype == "uint8" for col in X_t] -# def test_more_top_n_unique_values_large(): -# X = pd.DataFrame( +# def test_ohe_top_n_categories_always_the_same(): +# df = pd.DataFrame( # { -# "col_1": ["a", "b", "c", "d", "e", "f", "g", "h", "i"], -# "col_2": ["a", "a", "a", "b", "b", "c", "c", "d", "e"], -# "col_3": ["a", "a", "a", "b", "b", "b", "c", "c", "d"], -# "col_4": [2, 0, 1, 3, 0, 1, 2, 4, 1], +# "categories": ["cat_1"] * 5 +# + ["cat_2"] * 4 +# + ["cat_3"] * 3 +# + ["cat_4"] * 3 +# + ["cat_5"] * 3, +# "numbers": range(18), # }, # ) -# X = set_first_three_columns_to_categorical(X) -# random_seed = 2 -# encoder = OrdinalEncoder(top_n=3, random_seed=random_seed) -# encoder.fit(X) -# X_t = encoder.transform(X) +# def check_df_equality(random_seed): +# ohe = OneHotEncoder(top_n=4, random_seed=random_seed) +# df1 = ohe.fit_transform(df) +# df2 = ohe.fit_transform(df) +# assert_frame_equal(df1, df2) + +# check_df_equality(5) +# check_df_equality(get_random_seed(5)) -# # Conversion changes the resulting dataframe dtype, resulting in a different random state, so we need make the conversion here too -# X = infer_feature_types(X) -# col_1_counts = X["col_1"].value_counts(dropna=False).to_frame() -# col_1_counts = col_1_counts.sample(frac=1, random_state=random_seed) -# col_1_counts = col_1_counts.sort_values( -# ["col_1"], -# ascending=False, -# kind="mergesort", + +# def test_ohe_column_names_unique(): +# df = pd.DataFrame({"A": ["x_y"], "A_x": ["y"]}) +# df.ww.init(logical_types={"A": "categorical", "A_x": "categorical"}) +# df_transformed = OneHotEncoder().fit_transform(df) +# assert set(df_transformed.columns) == {"A_x_y", "A_x_y_1"} + +# df = pd.DataFrame( +# { +# "A": ["x_y", "z", "z"], +# "A_x": [ +# "y", +# "a", +# "a", +# ], +# "A_x_y": ["1", "y", "y"], +# }, # ) -# col_1_samples = col_1_counts.head(encoder.parameters["top_n"]).index.tolist() -# expected_col_names = set( -# ["col_2_a", "col_2_b", "col_2_c", "col_3_a", "col_3_b", "col_3_c", "col_4"], +# df.ww.init( +# logical_types={ +# "A": "categorical", +# "A_x": "categorical", +# "A_x_y": "categorical", +# }, # ) -# for val in col_1_samples: -# expected_col_names.add("col_1_" + val) +# df_transformed = OneHotEncoder().fit_transform(df) +# # category y in A_x gets mapped to A_x_y_1 because A_x_y already exists +# # category 1 in A_x_y gets mapped to A_x_y_1_1 because A_x_y_1 already exists +# assert set(df_transformed.columns) == {"A_x_y", "A_x_y_1", "A_x_y_1_1"} -# col_names = set(X_t.columns) -# assert col_names == expected_col_names +# df = pd.DataFrame( +# {"A": ["x_y", "z", "a"], "A_x": ["y_1", "y", "b"], "A_x_y": ["1", "y", "c"]}, +# ) +# df.ww.init( +# logical_types={ +# "A": "categorical", +# "A_x": "categorical", +# "A_x_y": "categorical", +# }, +# ) +# df_transformed = OneHotEncoder().fit_transform(df) +# # category y in A_x gets mapped to A_x_y_1 because A_x_y already exists +# # category y_1 in A_x gets mapped to A_x_y_1_1 because A_x_y_1 already exists +# # category 1 in A_x_y gets mapped to A_x_y_1_2 because A_x_y_1_1 already exists +# assert set(df_transformed.columns) == { +# "A_x_y", +# "A_z", +# "A_a", +# "A_x_y_1", +# "A_x_y_1_1", +# "A_x_b", +# "A_x_y_1_2", +# "A_x_y_y", +# "A_x_y_c", +# } + + +# @pytest.mark.parametrize( +# "X_df", +# [ +# pd.DataFrame( +# pd.to_datetime(["20190902", "20200519", "20190607"], format="%Y%m%d"), +# ), +# pd.DataFrame(pd.Series([1, 2, 3], dtype="Int64")), +# pd.DataFrame(pd.Series([1.0, 2.0, 3.0], dtype="float")), +# pd.DataFrame(pd.Series(["a", "b", "a"], dtype="category")), +# pd.DataFrame(pd.Series([True, False, True], dtype="boolean")), +# pd.DataFrame( +# pd.Series( +# ["this will be a natural language column because length", "yay", "hay"], +# dtype="string", +# ), +# ), +# ], +# ) +# def test_ohe_woodwork_custom_overrides_returned_by_components(X_df): +# y = pd.Series([1, 2, 1]) +# override_types = [Integer, Double, Categorical, NaturalLanguage, Datetime, Boolean] +# for logical_type in override_types: +# try: +# X = X_df.copy() +# X.ww.init(logical_types={0: logical_type}) +# except (TypeConversionError, ValueError, TypeError): +# continue +# if X.loc[:, 0].isna().all(): +# # Casting the fourth and fifth dataframes to datetime will produce all NaNs +# continue + +# ohe = OneHotEncoder() +# ohe.fit(X, y) +# transformed = ohe.transform(X, y) +# assert isinstance(transformed, pd.DataFrame) +# if logical_type != Categorical: +# assert {k: type(v) for k, v in transformed.ww.logical_types.items()} == { +# 0: logical_type, +# } + + +# def test_ohe_output_bools(): +# X = pd.DataFrame( +# { +# "bool": [bool(i % 2) for i in range(100)], +# "categorical": ["dog"] * 20 + ["cat"] * 40 + ["fish"] * 40, +# "integers": [i for i in range(100)], +# }, +# ) +# X.ww.init() +# y = pd.Series([i % 2 for i in range(100)]) +# y.ww.init() +# ohe = OneHotEncoder() +# output = ohe.fit_transform(X, y) +# for name, types in output.ww.types["Logical Type"].items(): +# if name == "integers": +# assert str(types) == "Integer" +# else: +# assert str(types) == "Boolean" +# assert len(output.columns) == 5 From 1aa2e9781ec69701520ba4b4527d4268671be3ee Mon Sep 17 00:00:00 2001 From: Tamar Grey Date: Tue, 20 Sep 2022 14:42:09 -0400 Subject: [PATCH 05/20] Add remaining tests --- .../transformers/encoders/ordinal_encoder.py | 8 +- .../component_tests/test_ordinal_encoder.py | 383 +++++++++--------- 2 files changed, 195 insertions(+), 196 deletions(-) diff --git a/evalml/pipelines/components/transformers/encoders/ordinal_encoder.py b/evalml/pipelines/components/transformers/encoders/ordinal_encoder.py index f41b639250..8ab7ef6817 100644 --- a/evalml/pipelines/components/transformers/encoders/ordinal_encoder.py +++ b/evalml/pipelines/components/transformers/encoders/ordinal_encoder.py @@ -16,7 +16,7 @@ class OrdinalEncoderMeta(ComponentBaseMeta): METHODS_TO_CHECK = ComponentBaseMeta.METHODS_TO_CHECK + [ - # --> probs need to add categories back + "categories", "get_feature_names", ] @@ -224,19 +224,19 @@ def categories(self, feature_name): """Returns a list of the unique categories to be encoded for the particular feature, in order. Args: - feature_name (str): The name of any feature provided to one-hot encoder during fit. + feature_name (str): The name of any feature provided to ordinal encoder during fit. Returns: np.ndarray: The unique categories, in the same dtype as they were provided during fit. Raises: - ValueError: If feature was not provided to one-hot encoder as a training feature. + ValueError: If feature was not provided to ordinal encoder as a training feature. """ try: index = self.features_to_encode.index(feature_name) except Exception: raise ValueError( - f'Feature "{feature_name}" was not provided to one-hot encoder as a training feature', + f'Feature "{feature_name}" was not provided to ordinal encoder as a training feature', ) return self._encoder.categories_[index] diff --git a/evalml/tests/component_tests/test_ordinal_encoder.py b/evalml/tests/component_tests/test_ordinal_encoder.py index 799f261776..d1ff0bd605 100644 --- a/evalml/tests/component_tests/test_ordinal_encoder.py +++ b/evalml/tests/component_tests/test_ordinal_encoder.py @@ -7,6 +7,7 @@ from evalml.exceptions import ComponentNotYetFittedError from evalml.pipelines.components import OrdinalEncoder +from evalml.utils import get_random_seed def set_first_three_columns_to_ordinal_with_categories(X, categories): @@ -288,7 +289,7 @@ def test_null_values_in_dataframe(): X_t = encoder.transform(X) assert X_t["col_1_ordinally_encoded"].iloc[-1] == -1 - # --> not sure that this is the desired behavior - in ohe it gets treated as its own category + # --> not sure that this is the desired behavior - in ordinal_encoder it gets treated as its own category # Test NaN doesn't get counted as a category to encode, so it still gets # encoded as missing and not unknown even if it's not in the top n X = pd.DataFrame( @@ -633,174 +634,165 @@ def test_ordinal_encoder_preserves_custom_index(index): assert not new_df.isna().any(axis=None) -# def test_ordinal_encoder_categories(): -# X = pd.DataFrame( -# {"col_1": ["a"] * 10, "col_2": ["a"] * 3 + ["b"] * 3 + ["c"] * 2 + ["d"] * 2}, -# ) -# X.ww.init(logical_types={"col_2": "categorical"}) -# encoder = OrdinalEncoder(top_n=2) -# with pytest.raises( -# ComponentNotYetFittedError, -# match="This OrdinalEncoder is not fitted yet. You must fit OrdinalEncoder before calling categories.", -# ): -# encoder.categories("col_1") - -# encoder.fit(X) -# np.testing.assert_array_equal(encoder.categories("col_1"), np.array(["a"])) -# np.testing.assert_array_equal(encoder.categories("col_2"), np.array(["a", "b"])) -# with pytest.raises( -# ValueError, -# match='Feature "col_12345" was not provided to ordinal encoder as a training feature', -# ): -# encoder.categories("col_12345") - - -# def test_ohe_get_feature_names(): -# X = pd.DataFrame( -# {"col_1": ["a"] * 10, "col_2": ["a"] * 3 + ["b"] * 3 + ["c"] * 2 + ["d"] * 2}, -# ) -# X.ww.init(logical_types={"col_2": "categorical"}) -# ohe = OneHotEncoder(top_n=2) -# with pytest.raises( -# ComponentNotYetFittedError, -# match="This OneHotEncoder is not fitted yet. You must fit OneHotEncoder before calling get_feature_names.", -# ): -# ohe.get_feature_names() -# ohe.fit(X) -# np.testing.assert_array_equal( -# ohe.get_feature_names(), -# np.array(["col_1_a", "col_2_a", "col_2_b"]), -# ) - -# X = pd.DataFrame({"col_1": ["a"] * 4 + ["b"] * 6, "col_2": ["b"] * 3 + ["c"] * 7}) -# ohe = OneHotEncoder(drop="if_binary") -# ohe.fit(X) -# np.testing.assert_array_equal( -# ohe.get_feature_names(), -# np.array(["col_1_a", "col_2_b"]), -# ) - - -# def test_ohe_features_to_encode(): -# # Test feature that doesn't need encoding and -# # feature that needs encoding but is not specified remain untouched -# X = pd.DataFrame({"col_1": [2, 0, 1, 0, 0], "col_2": ["a", "b", "a", "c", "d"]}) - -# encoder = OneHotEncoder(top_n=5, features_to_encode=["col_1"]) -# encoder.fit(X) -# X_t = encoder.transform(X) -# expected_col_names = set(["col_1_0", "col_1_1", "col_1_2", "col_2"]) -# col_names = set(X_t.columns) -# assert col_names == expected_col_names -# assert [X_t[col].dtype == "uint8" for col in X_t] - -# encoder = OneHotEncoder(top_n=5, features_to_encode=["col_1", "col_2"]) -# encoder.fit(X) -# X_t = encoder.transform(X) -# expected_col_names = set( -# ["col_1_0", "col_1_1", "col_1_2", "col_2_a", "col_2_b", "col_2_c", "col_2_d"], -# ) -# col_names = set(X_t.columns) -# assert col_names == expected_col_names -# assert [X_t[col].dtype == "uint8" for col in X_t] - - -# def test_ohe_features_to_encode_col_missing(): -# X = pd.DataFrame({"col_1": [2, 0, 1, 0, 0], "col_2": ["a", "b", "a", "c", "d"]}) - -# encoder = OneHotEncoder(top_n=5, features_to_encode=["col_3", "col_4"]) - -# with pytest.raises(ValueError, match="Could not find and encode"): -# encoder.fit(X) - - -# def test_ohe_features_to_encode_no_col_names(): -# X = pd.DataFrame([["b", 0], ["a", 1], ["b", 1]]) -# encoder = OneHotEncoder(top_n=5, features_to_encode=[0]) -# encoder.fit(X) -# X_t = encoder.transform(X) -# expected_col_names = set([1, "0_a"]) -# col_names = set(X_t.columns) -# assert col_names == expected_col_names -# assert [X_t[col].dtype == "uint8" for col in X_t] - - -# def test_ohe_top_n_categories_always_the_same(): -# df = pd.DataFrame( -# { -# "categories": ["cat_1"] * 5 -# + ["cat_2"] * 4 -# + ["cat_3"] * 3 -# + ["cat_4"] * 3 -# + ["cat_5"] * 3, -# "numbers": range(18), -# }, -# ) - -# def check_df_equality(random_seed): -# ohe = OneHotEncoder(top_n=4, random_seed=random_seed) -# df1 = ohe.fit_transform(df) -# df2 = ohe.fit_transform(df) -# assert_frame_equal(df1, df2) - -# check_df_equality(5) -# check_df_equality(get_random_seed(5)) - - -# def test_ohe_column_names_unique(): -# df = pd.DataFrame({"A": ["x_y"], "A_x": ["y"]}) -# df.ww.init(logical_types={"A": "categorical", "A_x": "categorical"}) -# df_transformed = OneHotEncoder().fit_transform(df) -# assert set(df_transformed.columns) == {"A_x_y", "A_x_y_1"} - -# df = pd.DataFrame( -# { -# "A": ["x_y", "z", "z"], -# "A_x": [ -# "y", -# "a", -# "a", -# ], -# "A_x_y": ["1", "y", "y"], -# }, -# ) -# df.ww.init( -# logical_types={ -# "A": "categorical", -# "A_x": "categorical", -# "A_x_y": "categorical", -# }, -# ) -# df_transformed = OneHotEncoder().fit_transform(df) -# # category y in A_x gets mapped to A_x_y_1 because A_x_y already exists -# # category 1 in A_x_y gets mapped to A_x_y_1_1 because A_x_y_1 already exists -# assert set(df_transformed.columns) == {"A_x_y", "A_x_y_1", "A_x_y_1_1"} - -# df = pd.DataFrame( -# {"A": ["x_y", "z", "a"], "A_x": ["y_1", "y", "b"], "A_x_y": ["1", "y", "c"]}, -# ) -# df.ww.init( -# logical_types={ -# "A": "categorical", -# "A_x": "categorical", -# "A_x_y": "categorical", -# }, -# ) -# df_transformed = OneHotEncoder().fit_transform(df) -# # category y in A_x gets mapped to A_x_y_1 because A_x_y already exists -# # category y_1 in A_x gets mapped to A_x_y_1_1 because A_x_y_1 already exists -# # category 1 in A_x_y gets mapped to A_x_y_1_2 because A_x_y_1_1 already exists -# assert set(df_transformed.columns) == { -# "A_x_y", -# "A_z", -# "A_a", -# "A_x_y_1", -# "A_x_y_1_1", -# "A_x_b", -# "A_x_y_1_2", -# "A_x_y_y", -# "A_x_y_c", -# } +def test_ordinal_encoder_categories(): + X = pd.DataFrame( + {"col_1": ["a"] * 10, "col_2": ["a"] * 3 + ["b"] * 3 + ["c"] * 2 + ["d"] * 2}, + ) + X.ww.init( + logical_types={ + "col_1": Ordinal(order=["a"]), + "col_2": Ordinal(order=["a", "b", "c", "d"]), + }, + ) + encoder = OrdinalEncoder( + top_n=2, + handle_unknown="use_encoded_value", + unknown_value=-1, + ) + with pytest.raises( + ComponentNotYetFittedError, + match="This OrdinalEncoder is not fitted yet. You must fit OrdinalEncoder before calling categories.", + ): + encoder.categories("col_1") + + encoder.fit(X) + np.testing.assert_array_equal(encoder.categories("col_1"), np.array(["a"])) + np.testing.assert_array_equal(encoder.categories("col_2"), np.array(["a", "b"])) + with pytest.raises( + ValueError, + match='Feature "col_12345" was not provided to ordinal encoder as a training feature', + ): + encoder.categories("col_12345") + + +def test_ordinal_encoder_get_feature_names(): + X = pd.DataFrame( + {"col_1": ["a"] * 10, "col_2": ["a"] * 3 + ["b"] * 3 + ["c"] * 2 + ["d"] * 2}, + ) + X.ww.init( + logical_types={ + "col_1": Ordinal(order=["a"]), + "col_2": Ordinal(order=["a", "b", "c", "d"]), + }, + ) + + ordinal_encoder = OrdinalEncoder( + top_n=2, + handle_unknown="use_encoded_value", + unknown_value=-1, + ) + with pytest.raises( + ComponentNotYetFittedError, + match="This OrdinalEncoder is not fitted yet. You must fit OrdinalEncoder before calling get_feature_names.", + ): + ordinal_encoder.get_feature_names() + ordinal_encoder.fit(X) + np.testing.assert_array_equal( + ordinal_encoder.get_feature_names(), + np.array(["col_1_ordinally_encoded", "col_2_ordinally_encoded"]), + ) + + ordinal_encoder = OrdinalEncoder(features_to_encode=["col_2"]) + ordinal_encoder.fit(X) + np.testing.assert_array_equal( + ordinal_encoder.get_feature_names(), + np.array(["col_2_ordinally_encoded"]), + ) + + +def test_ordinal_encoder_features_to_encode(): + # Test feature that doesn't need encoding and + # feature that needs encoding but is not specified remain untouched + X = pd.DataFrame({"col_1": [2, 0, 1, 0, 0], "col_2": ["a", "b", "a", "c", "d"]}) + X.ww.init( + logical_types={ + "col_1": Ordinal(order=[0, 1, 2]), + "col_2": Ordinal(order=["a", "b", "c", "d"]), + }, + ) + + encoder = OrdinalEncoder(top_n=5, features_to_encode=["col_1"]) + encoder.fit(X) + X_t = encoder.transform(X) + expected_col_names = set(["col_2", "col_1_ordinally_encoded"]) + col_names = set(X_t.columns) + assert col_names == expected_col_names + assert [X_t[col].dtype == "uint8" for col in X_t] + + encoder = OrdinalEncoder(top_n=5, features_to_encode=["col_1", "col_2"]) + encoder.fit(X) + X_t = encoder.transform(X) + expected_col_names = set( + ["col_1_ordinally_encoded", "col_2_ordinally_encoded"], + ) + col_names = set(X_t.columns) + assert col_names == expected_col_names + assert [X_t[col].dtype == "uint8" for col in X_t] + + +def test_ordinal_encoder_features_to_encode_col_missing(): + X = pd.DataFrame({"col_1": [2, 0, 1, 0, 0], "col_2": ["a", "b", "a", "c", "d"]}) + X.ww.init( + logical_types={ + "col_1": Ordinal(order=[0, 1, 2]), + "col_2": Ordinal(order=["a", "b", "c", "d"]), + }, + ) + + encoder = OrdinalEncoder(top_n=5, features_to_encode=["col_3", "col_4"]) + + with pytest.raises(ValueError, match="Could not find and encode"): + encoder.fit(X) + + +def test_ordinal_encoder_features_to_encode_no_col_names(): + X = pd.DataFrame([["b", 0], ["a", 1], ["b", 1]]) + X.ww.init( + logical_types={ + 0: Ordinal(order=["b", "a"]), + 1: Ordinal(order=[0, 1]), + }, + ) + encoder = OrdinalEncoder(top_n=5, features_to_encode=[0]) + encoder.fit(X) + X_t = encoder.transform(X) + expected_col_names = set([1, "0_ordinally_encoded"]) + col_names = set(X_t.columns) + assert col_names == expected_col_names + assert [X_t[col].dtype == "uint8" for col in X_t] + + +def test_ordinal_encoder_top_n_categories_always_the_same_when_no_ties(): + df = pd.DataFrame( + { + "categories": ["cat_1"] * 5 + + ["cat_2"] * 4 + + ["cat_3"] * 3 + + ["cat_4"] * 3 + + ["cat_5"] * 3, + "numbers": range(18), + }, + ) + df.ww.init( + logical_types={ + "categories": Ordinal(order=["cat_1", "cat_2", "cat_3", "cat_4", "cat_5"]), + }, + ) + + def check_df_equality(random_seed): + ordinal_encoder = OrdinalEncoder( + top_n=4, + random_seed=random_seed, + handle_unknown="use_encoded_value", + unknown_value=-1, + ) + + df1 = ordinal_encoder.fit_transform(df) + df2 = ordinal_encoder.fit_transform(df) + pd.testing.assert_frame_equal(df1, df2) + + check_df_equality(5) + check_df_equality(get_random_seed(5)) # @pytest.mark.parametrize( @@ -821,7 +813,8 @@ def test_ordinal_encoder_preserves_custom_index(index): # ), # ], # ) -# def test_ohe_woodwork_custom_overrides_returned_by_components(X_df): +# def test_ordinal_encoder_woodwork_custom_overrides_returned_by_components(X_df): +# --> what's the point of adding this?? # y = pd.Series([1, 2, 1]) # override_types = [Integer, Double, Categorical, NaturalLanguage, Datetime, Boolean] # for logical_type in override_types: @@ -834,9 +827,9 @@ def test_ordinal_encoder_preserves_custom_index(index): # # Casting the fourth and fifth dataframes to datetime will produce all NaNs # continue -# ohe = OneHotEncoder() -# ohe.fit(X, y) -# transformed = ohe.transform(X, y) +# ordinal_encoder = OrdinalEncoder() +# ordinal_encoder.fit(X, y) +# transformed = ordinal_encoder.transform(X, y) # assert isinstance(transformed, pd.DataFrame) # if logical_type != Categorical: # assert {k: type(v) for k, v in transformed.ww.logical_types.items()} == { @@ -844,22 +837,28 @@ def test_ordinal_encoder_preserves_custom_index(index): # } -# def test_ohe_output_bools(): -# X = pd.DataFrame( -# { -# "bool": [bool(i % 2) for i in range(100)], -# "categorical": ["dog"] * 20 + ["cat"] * 40 + ["fish"] * 40, -# "integers": [i for i in range(100)], -# }, -# ) -# X.ww.init() -# y = pd.Series([i % 2 for i in range(100)]) -# y.ww.init() -# ohe = OneHotEncoder() -# output = ohe.fit_transform(X, y) -# for name, types in output.ww.types["Logical Type"].items(): -# if name == "integers": -# assert str(types) == "Integer" -# else: -# assert str(types) == "Boolean" -# assert len(output.columns) == 5 +def test_ordinal_encoder_output_doubles(): + X = pd.DataFrame( + { + "bool": [bool(i % 2) for i in range(100)], + "categorical": ["dog"] * 20 + ["cat"] * 40 + ["fish"] * 40, + "integers": [i for i in range(100)], + }, + ) + X.ww.init( + logical_types={ + "categorical": Ordinal(order=["dog", "cat", "fish"]), + }, + ) + y = pd.Series([i % 2 for i in range(100)]) + y.ww.init() + ordinal_encoder = OrdinalEncoder() + output = ordinal_encoder.fit_transform(X, y) + for name, types in output.ww.types["Logical Type"].items(): + if name == "integers": + assert str(types) == "Integer" + elif name == "bool": + assert str(types) == "Boolean" + else: + assert str(types) == "Double" + assert len(output.columns) == len(X.columns) From becaf898c57514aa89f3ae346e6e61e2f10a2bbd Mon Sep 17 00:00:00 2001 From: Tamar Grey Date: Tue, 20 Sep 2022 16:26:40 -0400 Subject: [PATCH 06/20] record which tests weren't converted --- .../component_tests/test_ordinal_encoder.py | 67 ++++++++++++++++++- 1 file changed, 64 insertions(+), 3 deletions(-) diff --git a/evalml/tests/component_tests/test_ordinal_encoder.py b/evalml/tests/component_tests/test_ordinal_encoder.py index d1ff0bd605..08e87cb87a 100644 --- a/evalml/tests/component_tests/test_ordinal_encoder.py +++ b/evalml/tests/component_tests/test_ordinal_encoder.py @@ -360,6 +360,7 @@ def test_ordinal_encoder_diff_na_types(): # --> diff combinations of parameters +# --> test args that can be arrays as arrays def test_handle_unknown(): @@ -762,7 +763,7 @@ def test_ordinal_encoder_features_to_encode_no_col_names(): assert [X_t[col].dtype == "uint8" for col in X_t] -def test_ordinal_encoder_top_n_categories_always_the_same_when_no_ties(): +def test_ordinal_encoder_top_n_categories_always_the_same(): df = pd.DataFrame( { "categories": ["cat_1"] * 5 @@ -813,8 +814,7 @@ def check_df_equality(random_seed): # ), # ], # ) -# def test_ordinal_encoder_woodwork_custom_overrides_returned_by_components(X_df): -# --> what's the point of adding this?? + # y = pd.Series([1, 2, 1]) # override_types = [Integer, Double, Categorical, NaturalLanguage, Datetime, Boolean] # for logical_type in override_types: @@ -843,6 +843,7 @@ def test_ordinal_encoder_output_doubles(): "bool": [bool(i % 2) for i in range(100)], "categorical": ["dog"] * 20 + ["cat"] * 40 + ["fish"] * 40, "integers": [i for i in range(100)], + "doubles": [i * 1.0 for i in range(100)], }, ) X.ww.init( @@ -862,3 +863,63 @@ def test_ordinal_encoder_output_doubles(): else: assert str(types) == "Double" assert len(output.columns) == len(X.columns) + + +@pytest.mark.parametrize("data_type", ["list", "np", "pd_no_index", "pd_index", "ww"]) +def test_data_types(data_type): + if data_type == "list": + X = [["a"], ["b"], ["c"]] * 5 + elif data_type == "np": + X = np.array([["a"], ["b"], ["c"]] * 5) + elif data_type == "pd_no_index": + X = pd.DataFrame(["a", "b", "c"] * 5) + elif data_type == "pd_index": + # --> doing int 0 here might defeat the purpose of the no index one? + X = pd.DataFrame(["a", "b", "c"] * 5, columns=[0]) + elif data_type == "ww": + X = pd.DataFrame(["a", "b", "c"] * 5) + X.ww.init( + logical_types={ + 0: Ordinal(order=["a", "b", "c"]), + }, + ) + encoder = OrdinalEncoder() + encoder.fit(X) + X_t = encoder.transform(X) + + if data_type != "ww": + # Woodwork wont infer Ordinal, so none of the other types will encode features + assert not encoder.features_to_encode + expected_df = pd.DataFrame( + [["a"], ["b"], ["c"]] * 5, + columns=[0], + dtype="category", + ) + pd.testing.assert_frame_equal(X_t, expected_df) + else: + assert list(X_t.columns) == ["0_ordinally_encoded"] + expected_df = pd.DataFrame( + [[0], [1], [2]] * 5, + columns=["0_ordinally_encoded"], + dtype="float64", + ) + pd.testing.assert_frame_equal(X_t, expected_df) + + +""" +Tests I didn't include from the ohe tests and why + +were not relevant to the ordinal encoder + - test_drop_first + - test_drop_binary + - test_drop_parameter_is_array + - test_drop_binary_and_top_n_2 + - test_ohe_column_names_unique +Couldn't understand the reason for + - test_categorical_dtype + - test_all_numerical_dtype + - test_ordinal_encoder_woodwork_custom_overrides_returned_by_components +Seemed redundant to other tests + - test_more_top_n_unique_values_large + - test_large_number_of_categories - kind of just another test of top_n arg +""" From 8db09051986ffdd18117312f8ca6af3f918863a7 Mon Sep 17 00:00:00 2001 From: Tamar Grey Date: Tue, 20 Sep 2022 17:45:59 -0400 Subject: [PATCH 07/20] Start cleaning up and covering edge cases --- .../transformers/encoders/ordinal_encoder.py | 59 ++++++++++++++----- .../component_tests/test_ordinal_encoder.py | 59 ++++++++++++++++--- 2 files changed, 96 insertions(+), 22 deletions(-) diff --git a/evalml/pipelines/components/transformers/encoders/ordinal_encoder.py b/evalml/pipelines/components/transformers/encoders/ordinal_encoder.py index 8ab7ef6817..8ae62d50e3 100644 --- a/evalml/pipelines/components/transformers/encoders/ordinal_encoder.py +++ b/evalml/pipelines/components/transformers/encoders/ordinal_encoder.py @@ -22,6 +22,36 @@ class OrdinalEncoderMeta(ComponentBaseMeta): class OrdinalEncoder(Transformer, metaclass=OrdinalEncoderMeta): + """A transformer that encodes ordinal features as an array of ordinal integers representing + the relative order of categories. + + Args: + top_n (int): Number of categories per column to encode. If None, all categories will be encoded. + Otherwise, the `n` most frequent will be encoded and all others will be handled as unknown values. + To not have unknown values raise an error, set handle_unknown to "use_encoded_value". + Defaults to 10. + features_to_encode (list[str]): List of columns to encode. All other columns will remain untouched. + If None, all appropriate columns will be encoded. Defaults to None. + categories (list[list[str]]): A two dimensional list of categories, where `categories[i]` is a list of the categories + for the column at index `i`. The order of categories for a column does not matter. + Any category not present in categories will be handled as an unknown value. + To not have unknown values raise an error, set handle_unknown to "use_encoded_value". + This can also be `None` or `"auto"` if `top_n` is not None. Cannot be specified if top_n + is specified. Defaults to None. + handle_unknown ("error" or "use_encoded_value"): Whether to ignore or error for unknown categories + for a feature encountered during `fit` or `transform`. When set to "error", + an error will be raised when an unknown category is found. + When set to "use_encoded_value", unknown categories will be encoded as the value given + for the parameter unknown_value. Defaults to "error." + unknown_value (int or np.nan): The value to use for unknown categories seen during fit or transform. + Required when the parameter handle_unknown is set to "use_encoded_value." + The value has to be distinct from the values used to encode any of the categories in fit. + Defaults to None. + encoded_missing_value (int or np.nan): The value to use for missing (null) values seen during + fit or transform. Defaults to np.nan. #--> do we need an option to raise an error here? + random_seed (int): Seed for the random number generator. Defaults to 0. + """ + name = "Ordinal Encoder" hyperparameter_ranges = {} """{}""" @@ -33,14 +63,14 @@ def __init__( categories=None, handle_unknown="error", unknown_value=None, - encoded_missing_value=np.nan, # --> maybe this should be np.nan since that's the utils ddefault + encoded_missing_value=np.nan, random_seed=0, **kwargs, ): parameters = { "top_n": top_n, "features_to_encode": features_to_encode, - "categories": categories, # --> the cols must have their categories set - so maybe don't need this set? + "categories": categories, "handle_unknown": handle_unknown, "unknown_value": unknown_value, "encoded_missing_value": encoded_missing_value, @@ -62,7 +92,6 @@ def __init__( if top_n is not None and categories is not None: raise ValueError("Cannot use categories and top_n arguments simultaneously") - # --> add a check for encoded_missing_values is int or npnan? What about unknown value? self.features_to_encode = features_to_encode self._encoder = None @@ -81,26 +110,30 @@ def _get_ordinal_cols(X): def fit(self, X, y=None): top_n = self.parameters["top_n"] - # --> we don't ever infer as Ordinal if theyre not set before this wont get used + # Ordinal type is not inferred by Woodwork, so if it wasn't set before, it won't be set at init X = infer_feature_types(X) if self.features_to_encode is None: - # --> should update to not include ordinals? Maybe that's configurable based on whether ordinal encoder is used? self.features_to_encode = self._get_ordinal_cols(X) + else: + logical_types = X.ww.logical_types + for col in self.features_to_encode: + ltype = logical_types[col] + if not isinstance(ltype, Ordinal): + raise TypeError( + f"Column {col} specified in features_to_encode is not Ordinal in nature", + ) X_t = X invalid_features = [ col for col in self.features_to_encode if col not in list(X.columns) ] if len(invalid_features) > 0: - # --> what if features to encode includes non ordinal cols? raise ValueError( "Could not find and encode {} in input data.".format( ", ".join(invalid_features), ), ) - # helper util to handle unknown ? Probs not needed bc I think the encoder can do wha twe need - # --> handle categories logic - includes topn - which means we do need to do value counts when theres more than n values if len(self.features_to_encode) == 0: categories = "auto" elif self.parameters["categories"] is not None: @@ -113,12 +146,13 @@ def fit(self, X, y=None): "Categories argument must contain a list of categories for each categorical feature", ) - # Categories should be in the same order as the data's Ordinal.order categories - # even if it's a subset + # Categories, as they're passed into SKOrdinalEncoder should be in the same order + # as the data's Ordinal.order categories even if it's a subset # --> refactor this to be nicer categories = [] for i, col_categories in enumerate(input_categories): categories_order = X.ww.logical_types[X.columns[i]].order + ordered_categories = [ cat for cat in categories_order if cat in col_categories ] @@ -131,16 +165,13 @@ def fit(self, X, y=None): ww_logical_types = X.ww.logical_types for col in X_t[self.features_to_encode]: ltype = ww_logical_types[col] - assert isinstance(ltype, Ordinal) - # --> if this is sampled data, the order might not be accurate? + column_categories = ltype.order if top_n is None or len(column_categories) <= top_n: unique_values = column_categories else: value_counts = X_t[col].value_counts(dropna=False).to_frame() - # --> is it worth comparing to the column's order? maybe not - # --> i assume randomness here is so that we aren' always using the same columns? value_counts = value_counts.sample( frac=1, random_state=self._initial_state, diff --git a/evalml/tests/component_tests/test_ordinal_encoder.py b/evalml/tests/component_tests/test_ordinal_encoder.py index 08e87cb87a..598071d352 100644 --- a/evalml/tests/component_tests/test_ordinal_encoder.py +++ b/evalml/tests/component_tests/test_ordinal_encoder.py @@ -1,8 +1,10 @@ import re +from multiprocessing.sharedctypes import Value import numpy as np import pandas as pd import pytest +from pyexpat import features from woodwork.logical_types import Ordinal from evalml.exceptions import ComponentNotYetFittedError @@ -93,15 +95,51 @@ def test_invalid_inputs(): # --> test feats to encode includes non ordinals -def test_ordinal_encoder_col_missing(): +def test_top_n_error_without_handle_unknown(): X = pd.DataFrame({"col_1": [2, 0, 1, 0, 0], "col_2": ["a", "b", "a", "c", "d"]}) + X.ww.init(logical_types={"col_2": Ordinal(order=["a", "b", "c", "d"])}) - encoder = OrdinalEncoder(top_n=5, features_to_encode=["col_3", "col_4"]) + encoder = OrdinalEncoder(top_n=2) - with pytest.raises(ValueError, match="Could not find and encode"): + error_segment = "Found unknown categories" + with pytest.raises(ValueError, match=error_segment): + encoder.fit(X) + + +def test_features_to_encode_non_ordinal_cols(): + X = pd.DataFrame({"col_1": [2, 0, 1, 0, 0], "col_2": ["a", "b", "a", "c", "d"]}) + X.ww.init(logical_types={"col_2": Ordinal(order=["a", "b", "c", "d"])}) + + encoder = OrdinalEncoder(features_to_encode=["col_1"]) + error = "Column col_1 specified in features_to_encode is not Ordinal in nature" + with pytest.raises(TypeError, match=error): encoder.fit(X) +def test_categories_specified_not_present_in_data(): + """Make sure that we can handle categories during fit that aren't present in + the data so that they can be seen during transform + """ + X = pd.DataFrame({"col_1": ["a", "b", "a", "c", "d"]}) + X.ww.init(logical_types={"col_1": Ordinal(order=["a", "b", "c", "d"])}) + # --> weird that we have to put empty list for columns that arent ordinal in nature when specifing categories.... + # check with OHE if that's the case?? + # -->THIS IS STILL FAILING + + encoder = OrdinalEncoder( + top_n=None, + categories=[["a", "x"]], + handle_unknown="use_encoded_value", + unknown_value=-1, + ) + encoder.fit(X) + + X_2 = pd.DataFrame({"col_1": ["a", "b", "a", "c", "x"]}) + X_2.ww.init(logical_types={"col_1": Ordinal(order=["a", "b", "c", "x"])}) + X_t = encoder.transform(X_2) + assert list(X_t["col_1_ordinally_encoded"]) == [0, -1, 0, -1, 1] + + def test_ordinal_encoder_is_no_op_for_df_of_non_ordinal_features(): encoder = OrdinalEncoder(handle_missing="error") X = pd.DataFrame( @@ -350,13 +388,13 @@ def test_ordinal_encoder_diff_na_types(): "col_3": Ordinal(order=categories[2]), }, ) - encoder = OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1) + encoder = OrdinalEncoder(encoded_missing_value=-1) encoder.fit(X) X_t = encoder.transform(X) - # Confirm all are still null - assert pd.isna(X_t["col_1_ordinally_encoded"].iloc[-1]) - assert pd.isna(X_t["col_2_ordinally_encoded"].iloc[-1]) - assert pd.isna(X_t["col_3_ordinally_encoded"].iloc[-1]) + # Confirm were recognized as null and encoded + assert X_t["col_1_ordinally_encoded"].iloc[-1] == -1 + assert X_t["col_2_ordinally_encoded"].iloc[-1] == -1 + assert X_t["col_3_ordinally_encoded"].iloc[-1] == -1 # --> diff combinations of parameters @@ -615,6 +653,11 @@ def test_numpy_input(): pd.testing.assert_frame_equal(pd.DataFrame(X), X_t, check_dtype=False) +# --> thest case when top_n > # categories and handle unknown is error +# --> case when top_n < # cats and handle unknown is error +# --> unknown value not seen during fit seen at transform + + @pytest.mark.parametrize( "index", [ From c9aa18453a075a47510d0946c51b5e498e04dcec Mon Sep 17 00:00:00 2001 From: Tamar Grey Date: Wed, 28 Sep 2022 10:34:30 -0500 Subject: [PATCH 08/20] Finish filling out tests and remove unnecessary arrow comments --- .../transformers/encoders/ordinal_encoder.py | 44 +++--- .../component_tests/test_ordinal_encoder.py | 135 ++++++++---------- 2 files changed, 80 insertions(+), 99 deletions(-) diff --git a/evalml/pipelines/components/transformers/encoders/ordinal_encoder.py b/evalml/pipelines/components/transformers/encoders/ordinal_encoder.py index 8ae62d50e3..c3cd680454 100644 --- a/evalml/pipelines/components/transformers/encoders/ordinal_encoder.py +++ b/evalml/pipelines/components/transformers/encoders/ordinal_encoder.py @@ -31,9 +31,10 @@ class OrdinalEncoder(Transformer, metaclass=OrdinalEncoderMeta): To not have unknown values raise an error, set handle_unknown to "use_encoded_value". Defaults to 10. features_to_encode (list[str]): List of columns to encode. All other columns will remain untouched. - If None, all appropriate columns will be encoded. Defaults to None. + If None, all appropriate columns will be encoded. Defaults to None. The order of columns does not matter. categories (list[list[str]]): A two dimensional list of categories, where `categories[i]` is a list of the categories - for the column at index `i`. The order of categories for a column does not matter. + for the column at index `i` in the dataframes passed in at fit and transform. + The order of categories specified for a column does not matter. Any category not present in categories will be handled as an unknown value. To not have unknown values raise an error, set handle_unknown to "use_encoded_value". This can also be `None` or `"auto"` if `top_n` is not None. Cannot be specified if top_n @@ -48,7 +49,7 @@ class OrdinalEncoder(Transformer, metaclass=OrdinalEncoderMeta): The value has to be distinct from the values used to encode any of the categories in fit. Defaults to None. encoded_missing_value (int or np.nan): The value to use for missing (null) values seen during - fit or transform. Defaults to np.nan. #--> do we need an option to raise an error here? + fit or transform. Defaults to np.nan. random_seed (int): Seed for the random number generator. Defaults to 0. """ @@ -115,6 +116,18 @@ def fit(self, X, y=None): if self.features_to_encode is None: self.features_to_encode = self._get_ordinal_cols(X) else: + # When features_to_encode is user-specified, check that all columns are present + # and have the Ordinal logical type + not_present_features = [ + col for col in self.features_to_encode if col not in list(X.columns) + ] + if len(not_present_features) > 0: + raise ValueError( + "Could not find and encode {} in input data.".format( + ", ".join(not_present_features), + ), + ) + logical_types = X.ww.logical_types for col in self.features_to_encode: ltype = logical_types[col] @@ -123,17 +136,12 @@ def fit(self, X, y=None): f"Column {col} specified in features_to_encode is not Ordinal in nature", ) - X_t = X - invalid_features = [ - col for col in self.features_to_encode if col not in list(X.columns) - ] - if len(invalid_features) > 0: - raise ValueError( - "Could not find and encode {} in input data.".format( - ", ".join(invalid_features), - ), - ) + # Put features_to_encode in the same relative order as the columns in the dataframe + self.features_to_encode = [ + col for col in X.columns if col in self.features_to_encode + ] + X_t = X if len(self.features_to_encode) == 0: categories = "auto" elif self.parameters["categories"] is not None: @@ -143,7 +151,7 @@ def fit(self, X, y=None): list, ): raise ValueError( - "Categories argument must contain a list of categories for each categorical feature", + "Categories argument must contain a list of categories for each ordinal feature", ) # Categories, as they're passed into SKOrdinalEncoder should be in the same order @@ -151,15 +159,12 @@ def fit(self, X, y=None): # --> refactor this to be nicer categories = [] for i, col_categories in enumerate(input_categories): - categories_order = X.ww.logical_types[X.columns[i]].order + categories_order = X.ww.logical_types[self.features_to_encode[i]].order ordered_categories = [ cat for cat in categories_order if cat in col_categories ] categories.append(ordered_categories) - - # --> should we compare with the ordinal categories to make sure they're all at least in there? - # --> if so, add a test else: categories = [] ww_logical_types = X.ww.logical_types @@ -176,7 +181,6 @@ def fit(self, X, y=None): frac=1, random_state=self._initial_state, ) - # --> make sure this is sorting on the number value_counts = value_counts.sort_values( [col], ascending=False, @@ -222,7 +226,6 @@ def transform(self, X, y=None): index=X_copy.index, ) X_cat.columns = self._get_feature_names() - # --> could we do Integer or IntegerNullably? Maybe but Double is simpler X_cat.ww.init(logical_types={c: "Double" for c in X_cat.columns}) self._feature_names = X_cat.columns @@ -251,7 +254,6 @@ def _get_feature_names(self): return unique_names def categories(self, feature_name): - # --> need to make sure this works """Returns a list of the unique categories to be encoded for the particular feature, in order. Args: diff --git a/evalml/tests/component_tests/test_ordinal_encoder.py b/evalml/tests/component_tests/test_ordinal_encoder.py index 598071d352..dc2d5e298d 100644 --- a/evalml/tests/component_tests/test_ordinal_encoder.py +++ b/evalml/tests/component_tests/test_ordinal_encoder.py @@ -74,12 +74,16 @@ def test_invalid_inputs(): [["a", "b", "c", "d"], ["a", "b", "c"], ["a"]], ) encoder = OrdinalEncoder(top_n=None, categories=[["a", "b"], ["a", "c"]]) - error_msg = "Categories argument must contain a list of categories for each categorical feature" + error_msg = ( + "Categories argument must contain a list of categories for each ordinal feature" + ) with pytest.raises(ValueError, match=error_msg): encoder.fit(X) encoder = OrdinalEncoder(top_n=None, categories=["a", "b", "c"]) - error_msg = "Categories argument must contain a list of categories for each categorical feature" + error_msg = ( + "Categories argument must contain a list of categories for each ordinal feature" + ) with pytest.raises(ValueError, match=error_msg): encoder.fit(X) @@ -91,8 +95,48 @@ def test_invalid_inputs(): OrdinalEncoder(top_n=11, categories=categories, random_seed=2) -# --> test no transformation when ordinal type not set and hyes when explicitly set -# --> test feats to encode includes non ordinals +def test_categories_list_not_passed_in_for_non_ordinal_column(): + """We indicate that the categories argument must contain categories only for each ordinal + feature, so test the case where we pass in a categories list when not every column is ordinal. + """ + X = pd.DataFrame( + { + "col_1": [2, 0, 1, 0, 0], + "col_2": ["a", "b", "a", "c", "d"], + "col_3": ["x", "x", "x", "y", "y"], + }, + ) + X.ww.init(logical_types={"col_2": Ordinal(order=["a", "b", "c", "d"])}) + + encoder = OrdinalEncoder(top_n=None, categories=[["a", "b", "c", "d"]]) + encoder.fit(X) + + assert len(encoder._encoder.categories_) == len(encoder.features_to_encode) + + error = 'Feature "col_1" was not provided to ordinal encoder as a training feature' + with pytest.raises(ValueError, match=error): + encoder.categories("col_1") + + # When features_to_encode is passed in, confirm the order there doesn't matter and order of + # cols in X is still used in indexing into categories + X.ww.init( + logical_types={ + "col_2": Ordinal(order=["a", "b", "c", "d"]), + "col_3": Ordinal(order=["x", "y"]), + }, + ) + encoder = OrdinalEncoder( + top_n=None, + # features_to_encode passed in different order than the dataframe's cols + features_to_encode=["col_3", "col_2"], + # categories' order still matches the dataframe's cols + categories=[["a", "b", "c", "d"], ["x", "y"]], + ) + encoder.fit(X) + + assert len(encoder._encoder.categories_) == len(encoder.features_to_encode) + set(encoder.categories("col_2")) == {"a", "b", "c", "d"} + set(encoder.categories("col_3")) == {"x", "y"} def test_top_n_error_without_handle_unknown(): @@ -118,13 +162,14 @@ def test_features_to_encode_non_ordinal_cols(): def test_categories_specified_not_present_in_data(): """Make sure that we can handle categories during fit that aren't present in - the data so that they can be seen during transform + the data so that they can be seen during transform. Note that because we fit on the + Ordinal.order passed in at fit, that order is the source of truth for + potential categories available at transform. In this test, that means that "x", + though not in the data at fit, must be in the order in order to not be viewed + as an unknown value at transform. """ X = pd.DataFrame({"col_1": ["a", "b", "a", "c", "d"]}) - X.ww.init(logical_types={"col_1": Ordinal(order=["a", "b", "c", "d"])}) - # --> weird that we have to put empty list for columns that arent ordinal in nature when specifing categories.... - # check with OHE if that's the case?? - # -->THIS IS STILL FAILING + X.ww.init(logical_types={"col_1": Ordinal(order=["a", "b", "c", "d", "x"])}) encoder = OrdinalEncoder( top_n=None, @@ -133,9 +178,10 @@ def test_categories_specified_not_present_in_data(): unknown_value=-1, ) encoder.fit(X) + assert set(encoder.categories("col_1")) == {"a", "x"} X_2 = pd.DataFrame({"col_1": ["a", "b", "a", "c", "x"]}) - X_2.ww.init(logical_types={"col_1": Ordinal(order=["a", "b", "c", "x"])}) + X_2.ww.init(logical_types={"col_1": Ordinal(order=["a", "b", "c", "d", "x"])}) X_t = encoder.transform(X_2) assert list(X_t["col_1_ordinally_encoded"]) == [0, -1, 0, -1, 1] @@ -198,9 +244,6 @@ def test_ordinal_encoder_recognizes_ordinal_columns(): assert list(category_list) == expected_categories[i] -# --> test setting non ordinal col in features to encode - - def test_ordinal_encoder_categories_set_correctly_from_fit(): # The SKOrdinalEncoder.categories_ attribute is what determines what gets encoded # So we're checking how that gets set during fit @@ -221,13 +264,6 @@ def test_ordinal_encoder_categories_set_correctly_from_fit(): for i, category_list in enumerate(encoder._encoder.categories_): assert list(category_list) == categories[i] - # Categories set explicitly - means top_n must be set to None - # --> this behavior should be tested elsewhere?? - # encoder = OrdinalEncoder(top_n=None, categories=subset_categories) - # with pytest.raises(ValueError) as exec_info: - # encoder.fit(X) - # assert "Found unknown categories" in exec_info.value.args[0] - # Categories set at init explicitly - means we have to set top_n to None # and handle the unknown case subset_categories = [["a"], ["a"], ["a"]] @@ -241,8 +277,6 @@ def test_ordinal_encoder_categories_set_correctly_from_fit(): for i, category_list in enumerate(encoder._encoder.categories_): assert list(category_list) == subset_categories[i] - # --> feels weird that you have to supply these values when just topn is set - # --> do we need to mention tie behavior for top_n? # Categories not specified, but top_n specified to limit categories encoder = OrdinalEncoder( top_n=1, @@ -255,8 +289,6 @@ def test_ordinal_encoder_categories_set_correctly_from_fit(): assert list(category_list) == expected_categories[i] -# --> test feature names -# --> test encoded feature values def test_ordinal_encoder_transform(): X = pd.DataFrame( { @@ -301,7 +333,7 @@ def test_null_values_in_dataframe(): "a", "c", "c", - ], # --> add test where one is none and the other is nan and another is pd.na + ], "col_3": ["a", "a", "a", "a", "a"], }, ) @@ -397,10 +429,6 @@ def test_ordinal_encoder_diff_na_types(): assert X_t["col_3_ordinally_encoded"].iloc[-1] == -1 -# --> diff combinations of parameters -# --> test args that can be arrays as arrays - - def test_handle_unknown(): X = pd.DataFrame( { @@ -441,9 +469,6 @@ def test_handle_unknown(): assert "Found unknown categories" in exec_info.value.args[0] -# --> passed in categories have a different sorted order than that of the data itsef - use ordinal order as sourceo f truth and just inpput param as ways to specify what subset - - def test_no_top_n(): # test all categories in all columns are encoded when top_n is None X = pd.DataFrame( @@ -653,11 +678,6 @@ def test_numpy_input(): pd.testing.assert_frame_equal(pd.DataFrame(X), X_t, check_dtype=False) -# --> thest case when top_n > # categories and handle unknown is error -# --> case when top_n < # cats and handle unknown is error -# --> unknown value not seen during fit seen at transform - - @pytest.mark.parametrize( "index", [ @@ -839,47 +859,6 @@ def check_df_equality(random_seed): check_df_equality(get_random_seed(5)) -# @pytest.mark.parametrize( -# "X_df", -# [ -# pd.DataFrame( -# pd.to_datetime(["20190902", "20200519", "20190607"], format="%Y%m%d"), -# ), -# pd.DataFrame(pd.Series([1, 2, 3], dtype="Int64")), -# pd.DataFrame(pd.Series([1.0, 2.0, 3.0], dtype="float")), -# pd.DataFrame(pd.Series(["a", "b", "a"], dtype="category")), -# pd.DataFrame(pd.Series([True, False, True], dtype="boolean")), -# pd.DataFrame( -# pd.Series( -# ["this will be a natural language column because length", "yay", "hay"], -# dtype="string", -# ), -# ), -# ], -# ) - -# y = pd.Series([1, 2, 1]) -# override_types = [Integer, Double, Categorical, NaturalLanguage, Datetime, Boolean] -# for logical_type in override_types: -# try: -# X = X_df.copy() -# X.ww.init(logical_types={0: logical_type}) -# except (TypeConversionError, ValueError, TypeError): -# continue -# if X.loc[:, 0].isna().all(): -# # Casting the fourth and fifth dataframes to datetime will produce all NaNs -# continue - -# ordinal_encoder = OrdinalEncoder() -# ordinal_encoder.fit(X, y) -# transformed = ordinal_encoder.transform(X, y) -# assert isinstance(transformed, pd.DataFrame) -# if logical_type != Categorical: -# assert {k: type(v) for k, v in transformed.ww.logical_types.items()} == { -# 0: logical_type, -# } - - def test_ordinal_encoder_output_doubles(): X = pd.DataFrame( { From 30c2d4cda47758bcff1987318928942f2e581a5c Mon Sep 17 00:00:00 2001 From: Tamar Grey Date: Wed, 28 Sep 2022 12:08:27 -0500 Subject: [PATCH 09/20] clean up --- .../transformers/encoders/onehot_encoder.py | 2 +- .../transformers/encoders/ordinal_encoder.py | 60 ++++--- .../component_tests/test_one_hot_encoder.py | 1 - .../component_tests/test_ordinal_encoder.py | 148 +++++++++++++----- 4 files changed, 147 insertions(+), 64 deletions(-) diff --git a/evalml/pipelines/components/transformers/encoders/onehot_encoder.py b/evalml/pipelines/components/transformers/encoders/onehot_encoder.py index 084b172151..de6a6efd6d 100644 --- a/evalml/pipelines/components/transformers/encoders/onehot_encoder.py +++ b/evalml/pipelines/components/transformers/encoders/onehot_encoder.py @@ -110,7 +110,7 @@ def fit(self, X, y=None): top_n = self.parameters["top_n"] X = infer_feature_types(X) if self.features_to_encode is None: - # --> should update to not include ordinals? Maybe that's configurable based on whether ordinal encoder is used? + # --> should update to not include ordinals once the ord encoder is integragted? Maybe that's configurable based on whether ordinal encoder is used? self.features_to_encode = self._get_cat_cols(X) X_t = X diff --git a/evalml/pipelines/components/transformers/encoders/ordinal_encoder.py b/evalml/pipelines/components/transformers/encoders/ordinal_encoder.py index c3cd680454..3d0d727eb4 100644 --- a/evalml/pipelines/components/transformers/encoders/ordinal_encoder.py +++ b/evalml/pipelines/components/transformers/encoders/ordinal_encoder.py @@ -142,9 +142,13 @@ def fit(self, X, y=None): ] X_t = X + + ww_logical_types = X.ww.logical_types if len(self.features_to_encode) == 0: + # No ordinal features present - categories don't need to be specified categories = "auto" elif self.parameters["categories"] is not None: + # Categories specified - make sure they match the ordinal columns input_categories = self.parameters["categories"] if len(input_categories) != len(self.features_to_encode) or not isinstance( input_categories[0], @@ -156,27 +160,29 @@ def fit(self, X, y=None): # Categories, as they're passed into SKOrdinalEncoder should be in the same order # as the data's Ordinal.order categories even if it's a subset - # --> refactor this to be nicer categories = [] for i, col_categories in enumerate(input_categories): - categories_order = X.ww.logical_types[self.features_to_encode[i]].order + categories_order = ww_logical_types[self.features_to_encode[i]].order ordered_categories = [ cat for cat in categories_order if cat in col_categories ] categories.append(ordered_categories) else: + # Categories unspecified - use ordered categories from a columns' Ordinal logical type + # and account for top_n categories = [] - ww_logical_types = X.ww.logical_types for col in X_t[self.features_to_encode]: ltype = ww_logical_types[col] + # Copy the order list, since we might mutate it later by adding nans + # and don't want to impact the Woodwork types + column_ordered_categories = ltype.order.copy() - column_categories = ltype.order - - if top_n is None or len(column_categories) <= top_n: - unique_values = column_categories + if top_n is None or len(column_ordered_categories) <= top_n: + unique_values = column_ordered_categories else: - value_counts = X_t[col].value_counts(dropna=False).to_frame() + # Don't count nans as a value + value_counts = X_t[col].value_counts(dropna=True).to_frame() value_counts = value_counts.sample( frac=1, random_state=self._initial_state, @@ -187,19 +193,22 @@ def fit(self, X, y=None): kind="mergesort", ) unique_values = value_counts.head(top_n).index.tolist() - - # Categories should be in the same order as the data's Ordinal.order categories - categories_order = X.ww.logical_types[col].order - unique_values_in_order = [ - cat for cat in categories_order if cat in unique_values - ] - categories.append(unique_values_in_order) - - # Categories should be in the same order as the data's Ordinal.order categories - - # Add any null values into the categories lists so that they can get handled correctly + # Categories should be in the same order as the data's Ordinal.order categories + # instead of sorted by most frequent + unique_values = [ + cat for cat in column_ordered_categories if cat in unique_values + ] + + categories.append(unique_values) + + # Add any null values into the categories lists so that they aren't treated as unknown values + # This is needed because Ordinal.order won't indicate if nulls are present, and SKOrdinalEncoder + # requires any null values be present in the categories list if they are to be encoded as + # missing values if isinstance(categories, list): for i, col in enumerate(X_t[self.features_to_encode]): + # --> handle nans as error here if that's a behavior we want + # --> and if we want the option to handle nans as unknowns do that here as well if X_t[col].isna().any(): categories[i] += [np.nan] @@ -219,17 +228,17 @@ def transform(self, X, y=None): X_copy = X.ww.copy() X = X.ww.drop(columns=self.features_to_encode) - # Call sklearn's transform on the ordinal columns + # Call sklearn's transform on only the ordinal columns if len(self.features_to_encode) > 0: - X_cat = pd.DataFrame( + X_ord = pd.DataFrame( self._encoder.transform(X_copy[self.features_to_encode]), index=X_copy.index, ) - X_cat.columns = self._get_feature_names() - X_cat.ww.init(logical_types={c: "Double" for c in X_cat.columns}) - self._feature_names = X_cat.columns + X_ord.columns = self._get_feature_names() + X_ord.ww.init(logical_types={c: "Double" for c in X_ord.columns}) + self._feature_names = X_ord.columns - X = ww.utils.concat_columns([X, X_cat]) + X = ww.utils.concat_columns([X, X_ord]) return X @@ -249,7 +258,6 @@ def _get_feature_names(self): encoded_name = f"{col_name}_ordinally_encoded" unique_names.append(encoded_name) provenance[col_name] = [encoded_name] - # --> make sure provenance should point to a list even with only one element self._provenance = provenance return unique_names diff --git a/evalml/tests/component_tests/test_one_hot_encoder.py b/evalml/tests/component_tests/test_one_hot_encoder.py index 0f287f4c15..942a0b6f0a 100644 --- a/evalml/tests/component_tests/test_one_hot_encoder.py +++ b/evalml/tests/component_tests/test_one_hot_encoder.py @@ -483,7 +483,6 @@ def test_more_top_n_unique_values_large(): def test_categorical_dtype(): - # --> I dont get what this is testin # test that columns with the categorical type are encoded properly X = pd.DataFrame( { diff --git a/evalml/tests/component_tests/test_ordinal_encoder.py b/evalml/tests/component_tests/test_ordinal_encoder.py index dc2d5e298d..14f8926eab 100644 --- a/evalml/tests/component_tests/test_ordinal_encoder.py +++ b/evalml/tests/component_tests/test_ordinal_encoder.py @@ -234,7 +234,6 @@ def test_ordinal_encoder_recognizes_ordinal_columns(): assert encoder.features_to_encode == ["col_1", "col_2", "col_3"] assert encoder.features_to_encode == list(encoder._encoder.feature_names_in_) - # --> this isn't really testing its ability to recognize ordinals - its testing features to encode encoder = OrdinalEncoder(features_to_encode=["col_1"]) encoder.fit(X) assert encoder.features_to_encode == ["col_1"] @@ -359,74 +358,113 @@ def test_null_values_in_dataframe(): X_t = encoder.transform(X) assert X_t["col_1_ordinally_encoded"].iloc[-1] == -1 + +def test_ordinal_encoder_diff_na_types(): + X = pd.DataFrame( + { + "col_1": ["a", "b", "c", "d", np.nan], + "col_2": ["a", "b", "a", "c", None], + "col_3": ["a", "a", "a", "a", pd.NA], + }, + ) + categories = [["a", "b", "c", "d"], ["a", "b", "c"], ["a"]] + X.ww.init( + logical_types={ + "col_1": Ordinal(order=categories[0]), + "col_2": Ordinal(order=categories[1]), + "col_3": Ordinal(order=categories[2]), + }, + ) + encoder = OrdinalEncoder(encoded_missing_value=-1) + encoder.fit(X) + X_t = encoder.transform(X) + # Confirm were recognized as null and encoded + assert X_t["col_1_ordinally_encoded"].iloc[-1] == -1 + assert X_t["col_2_ordinally_encoded"].iloc[-1] == -1 + assert X_t["col_3_ordinally_encoded"].iloc[-1] == -1 + + +def test_null_values_with_top_n(): # --> not sure that this is the desired behavior - in ordinal_encoder it gets treated as its own category - # Test NaN doesn't get counted as a category to encode, so it still gets - # encoded as missing and not unknown even if it's not in the top n + """Null values shouldn't get counted towards the top_n, so check that if nan is inside or + outside of top_n, the behavior doesn't change.""" + # nan would be in the top_n of col 1 but not 2 if it counted towards top_n X = pd.DataFrame( { - "col_1": ["a", "a", "c", "c", np.nan], - "col_2": ["a", "b", "a", "c", "b"], + "col_1": ["a", "b", "c", np.nan, np.nan], + "col_2": [ + "a", + "b", + "a", + "c", + np.nan, + ], "col_3": ["a", "a", "a", "a", "a"], - "col_4": [2, 0, 1, np.nan, 0], }, ) - categories = [["a", "b", "c"], ["a", "b", "c"], ["a"]] + # Note - we cant include the null value in the categories used by Woodwork + # because it sets the pandas dtypes' categories and they can't include a null value + categories = [["a", "b", "c", "d"], ["a", "b", "c"]] X.ww.init( logical_types={ "col_1": Ordinal(order=categories[0]), "col_2": Ordinal(order=categories[1]), }, ) + + # Nans should not get handled as unknown even if they're not within the top n encoder = OrdinalEncoder( - top_n=2, + top_n=1, handle_unknown="use_encoded_value", unknown_value=-1, ) encoder.fit(X) X_t = encoder.transform(X) + # Check that none of the null values were treated as unknown values assert pd.isna(X_t["col_1_ordinally_encoded"].iloc[-1]) - assert X_t["col_2_ordinally_encoded"].iloc[3] == -1 - - # Test handle_missing='error' throws an error - # --> not currently an option - should we add? - # encoder_error = OrdinalEncoder(handle_missing="error") - - # X = pd.DataFrame( - # { - # "col_1": [np.nan, "b", "c", "d", "e", "f", "g"], - # "col_2": ["a", "c", "d", "b", "e", "e", "f"], - # "col_3": ["a", "a", "a", "a", "a", "a", "b"], - # "col_4": [2, 0, 1, 3, 0, 1, 2], - # }, - # ) - # X.ww.init(logical_types={"col_1": "categorical"}) - # with pytest.raises(ValueError, match="Input contains NaN"): - # encoder_error.fit(X) + assert pd.isna(X_t["col_1_ordinally_encoded"].iloc[-2]) + assert pd.isna(X_t["col_2_ordinally_encoded"].iloc[-1]) -def test_ordinal_encoder_diff_na_types(): +def test_null_values_with_categories_specified(): + """Nans aren't treated by Woodwork as categories in ordinal cols, so they shouldn't + have an impact on the categories parameter and be handled entirely independently.""" X = pd.DataFrame( { - "col_1": ["a", "b", "c", "d", np.nan], - "col_2": ["a", "b", "a", "c", None], - "col_3": ["a", "a", "a", "a", pd.NA], + "col_1": ["a", "b", "c", np.nan, np.nan], + "col_2": [ + "a", + "b", + "a", + "c", + np.nan, + ], + "col_3": ["a", "a", "a", "a", "a"], }, ) - categories = [["a", "b", "c", "d"], ["a", "b", "c"], ["a"]] + # Note - we cant include the null value in the categories used by Woodwork + # because it sets the pandas dtypes' categories and they can't include a null value + categories = [["a", "b", "c", "d"], ["a", "b", "c"]] X.ww.init( logical_types={ "col_1": Ordinal(order=categories[0]), "col_2": Ordinal(order=categories[1]), - "col_3": Ordinal(order=categories[2]), }, ) - encoder = OrdinalEncoder(encoded_missing_value=-1) + + # Try putting a nan in the categories list in one of the columns but not the other + encoder = OrdinalEncoder( + categories=[["a"], ["a", np.nan]], + handle_unknown="use_encoded_value", + unknown_value=-1, + top_n=None, + ) encoder.fit(X) X_t = encoder.transform(X) - # Confirm were recognized as null and encoded - assert X_t["col_1_ordinally_encoded"].iloc[-1] == -1 - assert X_t["col_2_ordinally_encoded"].iloc[-1] == -1 - assert X_t["col_3_ordinally_encoded"].iloc[-1] == -1 + # Check that the null values were handled as missing even when they're present in categories + assert pd.isna(X_t["col_1_ordinally_encoded"].iloc[-1]) + assert pd.isna(X_t["col_1_ordinally_encoded"].iloc[-2]) + assert pd.isna(X_t["col_2_ordinally_encoded"].iloc[-1]) def test_handle_unknown(): @@ -555,6 +593,44 @@ def test_categories_set_at_init(): assert list(X_t["col_3_ordinally_encoded"]) == [0, 0, 0, 0, 0, 0, 1] +def test_categories_includes_not_present_value(): + """This tests the case where the categories we pass into the encoder include + values that aren't in the data or even the Ordinal.order.""" + X = pd.DataFrame( + { + "col_1": ["a", "b", "c", "d", "e", "f", "g"], + "col_2": ["a", "c", "d", "b", "e", "e", "f"], + "col_3": ["a", "a", "a", "a", "a", "a", "b"], + "col_4": [2, 0, 1, 3, 0, 1, 2], + }, + ) + full_categories = [ + ["a", "b", "c", "d", "e", "f", "g"], + ["a", "b", "c", "d", "e", "f"], + ["a", "b"], + ] + X = set_first_three_columns_to_ordinal_with_categories( + X, + categories=full_categories, + ) + + # Categories passed in has value "x" that's not in the data + categories = [["a", "x"], ["a", "x"], ["a", "x"]] + + # test categories value works when transforming + encoder = OrdinalEncoder( + top_n=None, + categories=categories, + handle_unknown="use_encoded_value", + unknown_value=-1, + random_seed=2, + ) + encoder.fit(X) + assert set(encoder.categories("col_1")) == {"a"} + assert set(encoder.categories("col_2")) == {"a"} + assert set(encoder.categories("col_3")) == {"a"} + + def test_categories_different_order_from_ltype(): # The order of categories comes from the Ordinal.order property of the data. # Categories passed in as input to the encoder just determine what subset should From 036e074b8b9026a1b3d9dae03cc412fb84ef8126 Mon Sep 17 00:00:00 2001 From: Tamar Grey Date: Wed, 28 Sep 2022 12:25:45 -0500 Subject: [PATCH 10/20] lint fix and fill out docstrings --- .../transformers/encoders/ordinal_encoder.py | 40 ++++++++++++++++--- .../component_tests/test_ordinal_encoder.py | 4 -- 2 files changed, 35 insertions(+), 9 deletions(-) diff --git a/evalml/pipelines/components/transformers/encoders/ordinal_encoder.py b/evalml/pipelines/components/transformers/encoders/ordinal_encoder.py index 3d0d727eb4..35ac1b13d6 100644 --- a/evalml/pipelines/components/transformers/encoders/ordinal_encoder.py +++ b/evalml/pipelines/components/transformers/encoders/ordinal_encoder.py @@ -1,6 +1,4 @@ -"""A transformer that encodes categorical features in a one-hot numeric array.""" -from pdb import set_trace - +"""A transformer that encodes ordinal features as an array of ordinal integers representing the relative order of categories.""" import numpy as np import pandas as pd import woodwork as ww @@ -15,6 +13,8 @@ class OrdinalEncoderMeta(ComponentBaseMeta): + """A version of the ComponentBaseMeta class which includes validation on an additional ordinal-encoder-specific method `categories`.""" + METHODS_TO_CHECK = ComponentBaseMeta.METHODS_TO_CHECK + [ "categories", "get_feature_names", @@ -22,8 +22,7 @@ class OrdinalEncoderMeta(ComponentBaseMeta): class OrdinalEncoder(Transformer, metaclass=OrdinalEncoderMeta): - """A transformer that encodes ordinal features as an array of ordinal integers representing - the relative order of categories. + """A transformer that encodes ordinal features as an array of ordinal integers representing the relative order of categories. Args: top_n (int): Number of categories per column to encode. If None, all categories will be encoded. @@ -110,6 +109,19 @@ def _get_ordinal_cols(X): return list(X.ww.select(include=["ordinal"], return_schema=True).columns) def fit(self, X, y=None): + """Fits the ordinal encoder component. + + Args: + X (pd.DataFrame): The input training data of shape [n_samples, n_features]. + y (pd.Series, optional): The target training data of length [n_samples]. + + Returns: + self + + Raises: + ValueError: If encoding a column failed. + TypeError: If non-Ordinal columns are specified in features_to_encode. + """ top_n = self.parameters["top_n"] # Ordinal type is not inferred by Woodwork, so if it wasn't set before, it won't be set at init X = infer_feature_types(X) @@ -223,6 +235,17 @@ def fit(self, X, y=None): return self def transform(self, X, y=None): + """Ordinally encode the input data. + + Args: + X (pd.DataFrame): Features to encode. + y (pd.Series): Ignored. + + Returns: + pd.DataFrame: Transformed data, where each ordinal feature has been encoded into + a numerical column using where of ordinal integers represent + the relative order of categories. + """ X = infer_feature_types(X) X_copy = X.ww.copy() @@ -282,6 +305,13 @@ def categories(self, feature_name): return self._encoder.categories_[index] def get_feature_names(self): + """Return feature names for the ordinal features after fitting. + + Feature names are formatted as {column name}_ordinally_encoded. + + Returns: + np.ndarray: The feature names after encoding, provided in the same order as input_features. + """ return self._get_feature_names() def _get_feature_provenance(self): diff --git a/evalml/tests/component_tests/test_ordinal_encoder.py b/evalml/tests/component_tests/test_ordinal_encoder.py index 14f8926eab..14b316fb5b 100644 --- a/evalml/tests/component_tests/test_ordinal_encoder.py +++ b/evalml/tests/component_tests/test_ordinal_encoder.py @@ -1,10 +1,6 @@ -import re -from multiprocessing.sharedctypes import Value - import numpy as np import pandas as pd import pytest -from pyexpat import features from woodwork.logical_types import Ordinal from evalml.exceptions import ComponentNotYetFittedError From 12af4d22dd13dc9d6adf2aaeab909a5065d0a839 Mon Sep 17 00:00:00 2001 From: Tamar Grey Date: Wed, 28 Sep 2022 13:03:12 -0500 Subject: [PATCH 11/20] Add release note --- docs/source/release_notes.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/source/release_notes.rst b/docs/source/release_notes.rst index f5324d42bf..d009bad43c 100644 --- a/docs/source/release_notes.rst +++ b/docs/source/release_notes.rst @@ -20,6 +20,7 @@ Release Notes * Enhancements * Enhanced Decomposer with ``determine_periodicity`` function to automatically determine periodicity of seasonal target. :pr:`3729` * Enhanced Decomposer with ``set_seasonal_period`` function to set a ``Decomposer`` object's seasonal period automatically. :pr:`3729` + * Added ``OrdinalEncoder`` component :pr:`3736` * Fixes * Fixed holdout warning message showing when using default parameters :pr:`3727` * Fixed bug in Oversampler where categorical dtypes would fail :pr:`3732` From a4d9971eca757173d1bba6627ce806b531c1dde6 Mon Sep 17 00:00:00 2001 From: Tamar Grey Date: Wed, 28 Sep 2022 13:40:59 -0500 Subject: [PATCH 12/20] Fix broken component tests --- .../components/transformers/encoders/ordinal_encoder.py | 8 ++++++-- evalml/tests/component_tests/test_ordinal_encoder.py | 4 ++-- evalml/tests/component_tests/test_utils.py | 1 + 3 files changed, 9 insertions(+), 4 deletions(-) diff --git a/evalml/pipelines/components/transformers/encoders/ordinal_encoder.py b/evalml/pipelines/components/transformers/encoders/ordinal_encoder.py index 35ac1b13d6..35f845a275 100644 --- a/evalml/pipelines/components/transformers/encoders/ordinal_encoder.py +++ b/evalml/pipelines/components/transformers/encoders/ordinal_encoder.py @@ -63,7 +63,7 @@ def __init__( categories=None, handle_unknown="error", unknown_value=None, - encoded_missing_value=np.nan, + encoded_missing_value=None, random_seed=0, **kwargs, ): @@ -224,11 +224,15 @@ def fit(self, X, y=None): if X_t[col].isna().any(): categories[i] += [np.nan] + encoded_missing_value = self.parameters["encoded_missing_value"] + if encoded_missing_value is None: + encoded_missing_value = np.nan + self._encoder = SKOrdinalEncoder( categories=categories, handle_unknown=self.parameters["handle_unknown"], unknown_value=self.parameters["unknown_value"], - encoded_missing_value=self.parameters["encoded_missing_value"], + encoded_missing_value=encoded_missing_value, ) self._encoder.fit(X_t[self.features_to_encode]) diff --git a/evalml/tests/component_tests/test_ordinal_encoder.py b/evalml/tests/component_tests/test_ordinal_encoder.py index 14b316fb5b..9482cb6945 100644 --- a/evalml/tests/component_tests/test_ordinal_encoder.py +++ b/evalml/tests/component_tests/test_ordinal_encoder.py @@ -26,7 +26,7 @@ def test_init(): "categories": None, "handle_unknown": "error", "unknown_value": None, - "encoded_missing_value": np.nan, + "encoded_missing_value": None, } encoder = OrdinalEncoder() assert encoder.parameters == parameters @@ -40,7 +40,7 @@ def test_parameters(): "categories": None, "handle_unknown": "error", "unknown_value": None, - "encoded_missing_value": np.nan, + "encoded_missing_value": None, } assert encoder.parameters == expected_parameters diff --git a/evalml/tests/component_tests/test_utils.py b/evalml/tests/component_tests/test_utils.py index d8af59e978..aac9e16f20 100644 --- a/evalml/tests/component_tests/test_utils.py +++ b/evalml/tests/component_tests/test_utils.py @@ -87,6 +87,7 @@ "LightGBM Classifier", "LightGBM Regressor", "Oversampler", + "Ordinal Encoder", "Polynomial Decomposer", "Prophet Regressor", "Target Encoder", From 85a53351aa56ce48a343151e4a94a3329134fe7b Mon Sep 17 00:00:00 2001 From: Tamar Grey Date: Wed, 28 Sep 2022 14:29:44 -0500 Subject: [PATCH 13/20] Remove remaining arrow comments --- .../components/transformers/encoders/onehot_encoder.py | 1 - .../components/transformers/encoders/ordinal_encoder.py | 4 +--- evalml/tests/component_tests/test_ordinal_encoder.py | 2 -- 3 files changed, 1 insertion(+), 6 deletions(-) diff --git a/evalml/pipelines/components/transformers/encoders/onehot_encoder.py b/evalml/pipelines/components/transformers/encoders/onehot_encoder.py index de6a6efd6d..0ac457a48b 100644 --- a/evalml/pipelines/components/transformers/encoders/onehot_encoder.py +++ b/evalml/pipelines/components/transformers/encoders/onehot_encoder.py @@ -110,7 +110,6 @@ def fit(self, X, y=None): top_n = self.parameters["top_n"] X = infer_feature_types(X) if self.features_to_encode is None: - # --> should update to not include ordinals once the ord encoder is integragted? Maybe that's configurable based on whether ordinal encoder is used? self.features_to_encode = self._get_cat_cols(X) X_t = X diff --git a/evalml/pipelines/components/transformers/encoders/ordinal_encoder.py b/evalml/pipelines/components/transformers/encoders/ordinal_encoder.py index 35f845a275..60b9c48bdb 100644 --- a/evalml/pipelines/components/transformers/encoders/ordinal_encoder.py +++ b/evalml/pipelines/components/transformers/encoders/ordinal_encoder.py @@ -219,8 +219,6 @@ def fit(self, X, y=None): # missing values if isinstance(categories, list): for i, col in enumerate(X_t[self.features_to_encode]): - # --> handle nans as error here if that's a behavior we want - # --> and if we want the option to handle nans as unknowns do that here as well if X_t[col].isna().any(): categories[i] += [np.nan] @@ -273,7 +271,7 @@ def _get_feature_names(self): """Return feature names for the ordinal features after fitting. Since ordinal encoding creates one encoded feature per column in features_to_encode, feature - names are formatted as {column_name}_ordinally_encoded --> choose a better name?? maybe one that includes how many categories were encoded + names are formatted as {column_name}_ordinally_encoded Returns: np.ndarray: The feature names after encoding, provided in the same order as input_features. diff --git a/evalml/tests/component_tests/test_ordinal_encoder.py b/evalml/tests/component_tests/test_ordinal_encoder.py index 9482cb6945..094a27e2b8 100644 --- a/evalml/tests/component_tests/test_ordinal_encoder.py +++ b/evalml/tests/component_tests/test_ordinal_encoder.py @@ -381,7 +381,6 @@ def test_ordinal_encoder_diff_na_types(): def test_null_values_with_top_n(): - # --> not sure that this is the desired behavior - in ordinal_encoder it gets treated as its own category """Null values shouldn't get counted towards the top_n, so check that if nan is inside or outside of top_n, the behavior doesn't change.""" # nan would be in the top_n of col 1 but not 2 if it counted towards top_n @@ -968,7 +967,6 @@ def test_data_types(data_type): elif data_type == "pd_no_index": X = pd.DataFrame(["a", "b", "c"] * 5) elif data_type == "pd_index": - # --> doing int 0 here might defeat the purpose of the no index one? X = pd.DataFrame(["a", "b", "c"] * 5, columns=[0]) elif data_type == "ww": X = pd.DataFrame(["a", "b", "c"] * 5) From bb6011184b0e9aa3654e2203d56600211875e705 Mon Sep 17 00:00:00 2001 From: Tamar Grey Date: Fri, 30 Sep 2022 15:36:14 -0500 Subject: [PATCH 14/20] Remove top_n parameter --- .../transformers/encoders/ordinal_encoder.py | 41 +-- .../component_tests/test_ordinal_encoder.py | 280 +----------------- 2 files changed, 15 insertions(+), 306 deletions(-) diff --git a/evalml/pipelines/components/transformers/encoders/ordinal_encoder.py b/evalml/pipelines/components/transformers/encoders/ordinal_encoder.py index 60b9c48bdb..451b128f9d 100644 --- a/evalml/pipelines/components/transformers/encoders/ordinal_encoder.py +++ b/evalml/pipelines/components/transformers/encoders/ordinal_encoder.py @@ -25,10 +25,6 @@ class OrdinalEncoder(Transformer, metaclass=OrdinalEncoderMeta): """A transformer that encodes ordinal features as an array of ordinal integers representing the relative order of categories. Args: - top_n (int): Number of categories per column to encode. If None, all categories will be encoded. - Otherwise, the `n` most frequent will be encoded and all others will be handled as unknown values. - To not have unknown values raise an error, set handle_unknown to "use_encoded_value". - Defaults to 10. features_to_encode (list[str]): List of columns to encode. All other columns will remain untouched. If None, all appropriate columns will be encoded. Defaults to None. The order of columns does not matter. categories (list[list[str]]): A two dimensional list of categories, where `categories[i]` is a list of the categories @@ -36,8 +32,7 @@ class OrdinalEncoder(Transformer, metaclass=OrdinalEncoderMeta): The order of categories specified for a column does not matter. Any category not present in categories will be handled as an unknown value. To not have unknown values raise an error, set handle_unknown to "use_encoded_value". - This can also be `None` or `"auto"` if `top_n` is not None. Cannot be specified if top_n - is specified. Defaults to None. + Defaults to None. handle_unknown ("error" or "use_encoded_value"): Whether to ignore or error for unknown categories for a feature encountered during `fit` or `transform`. When set to "error", an error will be raised when an unknown category is found. @@ -50,6 +45,7 @@ class OrdinalEncoder(Transformer, metaclass=OrdinalEncoderMeta): encoded_missing_value (int or np.nan): The value to use for missing (null) values seen during fit or transform. Defaults to np.nan. random_seed (int): Seed for the random number generator. Defaults to 0. + # --> is random seed still necessary? """ name = "Ordinal Encoder" @@ -58,17 +54,15 @@ class OrdinalEncoder(Transformer, metaclass=OrdinalEncoderMeta): def __init__( self, - top_n=10, features_to_encode=None, categories=None, handle_unknown="error", unknown_value=None, encoded_missing_value=None, random_seed=0, - **kwargs, + **kwargs, # --> why do we have kwargs if they arent used?? Maybe needed for larger evalml pipeline ): parameters = { - "top_n": top_n, "features_to_encode": features_to_encode, "categories": categories, "handle_unknown": handle_unknown, @@ -89,8 +83,6 @@ def __init__( "be specified as either np.nan or as an int that is distinct from" "the other encoded categories ", ) - if top_n is not None and categories is not None: - raise ValueError("Cannot use categories and top_n arguments simultaneously") self.features_to_encode = features_to_encode self._encoder = None @@ -122,7 +114,6 @@ def fit(self, X, y=None): ValueError: If encoding a column failed. TypeError: If non-Ordinal columns are specified in features_to_encode. """ - top_n = self.parameters["top_n"] # Ordinal type is not inferred by Woodwork, so if it wasn't set before, it won't be set at init X = infer_feature_types(X) if self.features_to_encode is None: @@ -182,36 +173,12 @@ def fit(self, X, y=None): categories.append(ordered_categories) else: # Categories unspecified - use ordered categories from a columns' Ordinal logical type - # and account for top_n categories = [] for col in X_t[self.features_to_encode]: ltype = ww_logical_types[col] # Copy the order list, since we might mutate it later by adding nans # and don't want to impact the Woodwork types - column_ordered_categories = ltype.order.copy() - - if top_n is None or len(column_ordered_categories) <= top_n: - unique_values = column_ordered_categories - else: - # Don't count nans as a value - value_counts = X_t[col].value_counts(dropna=True).to_frame() - value_counts = value_counts.sample( - frac=1, - random_state=self._initial_state, - ) - value_counts = value_counts.sort_values( - [col], - ascending=False, - kind="mergesort", - ) - unique_values = value_counts.head(top_n).index.tolist() - # Categories should be in the same order as the data's Ordinal.order categories - # instead of sorted by most frequent - unique_values = [ - cat for cat in column_ordered_categories if cat in unique_values - ] - - categories.append(unique_values) + categories.append(ltype.order.copy()) # Add any null values into the categories lists so that they aren't treated as unknown values # This is needed because Ordinal.order won't indicate if nulls are present, and SKOrdinalEncoder diff --git a/evalml/tests/component_tests/test_ordinal_encoder.py b/evalml/tests/component_tests/test_ordinal_encoder.py index 094a27e2b8..31ff688c5f 100644 --- a/evalml/tests/component_tests/test_ordinal_encoder.py +++ b/evalml/tests/component_tests/test_ordinal_encoder.py @@ -21,7 +21,6 @@ def set_first_three_columns_to_ordinal_with_categories(X, categories): def test_init(): parameters = { - "top_n": 10, "features_to_encode": None, "categories": None, "handle_unknown": "error", @@ -33,14 +32,13 @@ def test_init(): def test_parameters(): - encoder = OrdinalEncoder(top_n=123) + encoder = OrdinalEncoder(encoded_missing_value=-1) expected_parameters = { - "top_n": 123, "features_to_encode": None, "categories": None, "handle_unknown": "error", "unknown_value": None, - "encoded_missing_value": None, + "encoded_missing_value": -1, } assert encoder.parameters == expected_parameters @@ -69,27 +67,20 @@ def test_invalid_inputs(): X, [["a", "b", "c", "d"], ["a", "b", "c"], ["a"]], ) - encoder = OrdinalEncoder(top_n=None, categories=[["a", "b"], ["a", "c"]]) + encoder = OrdinalEncoder(categories=[["a", "b"], ["a", "c"]]) error_msg = ( "Categories argument must contain a list of categories for each ordinal feature" ) with pytest.raises(ValueError, match=error_msg): encoder.fit(X) - encoder = OrdinalEncoder(top_n=None, categories=["a", "b", "c"]) + encoder = OrdinalEncoder(categories=["a", "b", "c"]) error_msg = ( "Categories argument must contain a list of categories for each ordinal feature" ) with pytest.raises(ValueError, match=error_msg): encoder.fit(X) - categories = [["a", "b", "c", "d"], ["a", "b", "c"], ["a", "b"]] - with pytest.raises( - ValueError, - match="Cannot use categories and top_n arguments simultaneously", - ): - OrdinalEncoder(top_n=11, categories=categories, random_seed=2) - def test_categories_list_not_passed_in_for_non_ordinal_column(): """We indicate that the categories argument must contain categories only for each ordinal @@ -104,7 +95,7 @@ def test_categories_list_not_passed_in_for_non_ordinal_column(): ) X.ww.init(logical_types={"col_2": Ordinal(order=["a", "b", "c", "d"])}) - encoder = OrdinalEncoder(top_n=None, categories=[["a", "b", "c", "d"]]) + encoder = OrdinalEncoder(categories=[["a", "b", "c", "d"]]) encoder.fit(X) assert len(encoder._encoder.categories_) == len(encoder.features_to_encode) @@ -122,7 +113,6 @@ def test_categories_list_not_passed_in_for_non_ordinal_column(): }, ) encoder = OrdinalEncoder( - top_n=None, # features_to_encode passed in different order than the dataframe's cols features_to_encode=["col_3", "col_2"], # categories' order still matches the dataframe's cols @@ -135,17 +125,6 @@ def test_categories_list_not_passed_in_for_non_ordinal_column(): set(encoder.categories("col_3")) == {"x", "y"} -def test_top_n_error_without_handle_unknown(): - X = pd.DataFrame({"col_1": [2, 0, 1, 0, 0], "col_2": ["a", "b", "a", "c", "d"]}) - X.ww.init(logical_types={"col_2": Ordinal(order=["a", "b", "c", "d"])}) - - encoder = OrdinalEncoder(top_n=2) - - error_segment = "Found unknown categories" - with pytest.raises(ValueError, match=error_segment): - encoder.fit(X) - - def test_features_to_encode_non_ordinal_cols(): X = pd.DataFrame({"col_1": [2, 0, 1, 0, 0], "col_2": ["a", "b", "a", "c", "d"]}) X.ww.init(logical_types={"col_2": Ordinal(order=["a", "b", "c", "d"])}) @@ -168,7 +147,6 @@ def test_categories_specified_not_present_in_data(): X.ww.init(logical_types={"col_1": Ordinal(order=["a", "b", "c", "d", "x"])}) encoder = OrdinalEncoder( - top_n=None, categories=[["a", "x"]], handle_unknown="use_encoded_value", unknown_value=-1, @@ -259,11 +237,9 @@ def test_ordinal_encoder_categories_set_correctly_from_fit(): for i, category_list in enumerate(encoder._encoder.categories_): assert list(category_list) == categories[i] - # Categories set at init explicitly - means we have to set top_n to None - # and handle the unknown case + # Categories set at init explicitly - means we have to handle the unknown case subset_categories = [["a"], ["a"], ["a"]] encoder = OrdinalEncoder( - top_n=None, categories=subset_categories, handle_unknown="use_encoded_value", unknown_value=-1, @@ -272,17 +248,6 @@ def test_ordinal_encoder_categories_set_correctly_from_fit(): for i, category_list in enumerate(encoder._encoder.categories_): assert list(category_list) == subset_categories[i] - # Categories not specified, but top_n specified to limit categories - encoder = OrdinalEncoder( - top_n=1, - handle_unknown="use_encoded_value", - unknown_value=-1, - ) - encoder.fit(X) - expected_categories = [["a"], ["b"], ["a"]] - for i, category_list in enumerate(encoder._encoder.categories_): - assert list(category_list) == expected_categories[i] - def test_ordinal_encoder_transform(): X = pd.DataFrame( @@ -380,47 +345,6 @@ def test_ordinal_encoder_diff_na_types(): assert X_t["col_3_ordinally_encoded"].iloc[-1] == -1 -def test_null_values_with_top_n(): - """Null values shouldn't get counted towards the top_n, so check that if nan is inside or - outside of top_n, the behavior doesn't change.""" - # nan would be in the top_n of col 1 but not 2 if it counted towards top_n - X = pd.DataFrame( - { - "col_1": ["a", "b", "c", np.nan, np.nan], - "col_2": [ - "a", - "b", - "a", - "c", - np.nan, - ], - "col_3": ["a", "a", "a", "a", "a"], - }, - ) - # Note - we cant include the null value in the categories used by Woodwork - # because it sets the pandas dtypes' categories and they can't include a null value - categories = [["a", "b", "c", "d"], ["a", "b", "c"]] - X.ww.init( - logical_types={ - "col_1": Ordinal(order=categories[0]), - "col_2": Ordinal(order=categories[1]), - }, - ) - - # Nans should not get handled as unknown even if they're not within the top n - encoder = OrdinalEncoder( - top_n=1, - handle_unknown="use_encoded_value", - unknown_value=-1, - ) - encoder.fit(X) - X_t = encoder.transform(X) - # Check that none of the null values were treated as unknown values - assert pd.isna(X_t["col_1_ordinally_encoded"].iloc[-1]) - assert pd.isna(X_t["col_1_ordinally_encoded"].iloc[-2]) - assert pd.isna(X_t["col_2_ordinally_encoded"].iloc[-1]) - - def test_null_values_with_categories_specified(): """Nans aren't treated by Woodwork as categories in ordinal cols, so they shouldn't have an impact on the categories parameter and be handled entirely independently.""" @@ -452,7 +376,6 @@ def test_null_values_with_categories_specified(): categories=[["a"], ["a", np.nan]], handle_unknown="use_encoded_value", unknown_value=-1, - top_n=None, ) encoder.fit(X) X_t = encoder.transform(X) @@ -502,55 +425,6 @@ def test_handle_unknown(): assert "Found unknown categories" in exec_info.value.args[0] -def test_no_top_n(): - # test all categories in all columns are encoded when top_n is None - X = pd.DataFrame( - { - "col_1": ["a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k"], - "col_2": ["a", "c", "d", "b", "e", "e", "f", "a", "b", "c", "d"], - "col_3": ["a", "a", "a", "a", "a", "a", "b", "a", "a", "b", "b"], - "col_4": [2, 0, 1, 3, 0, 1, 2, 0, 2, 1, 2], - }, - ) - X.ww.init( - logical_types={ - "col_1": Ordinal( - order=["a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k"], - ), - "col_2": Ordinal(order=["a", "b", "c", "d", "e", "f"]), - }, - ) - expected_col_names = set( - ["col_3", "col_4", "col_1_ordinally_encoded", "col_2_ordinally_encoded"], - ) - - encoder = OrdinalEncoder(top_n=None, handle_unknown="error", random_seed=2) - encoder.fit(X) - X_t = encoder.transform(X) - - col_names = set(X_t.columns) - assert col_names == expected_col_names - - # Make sure unknown values cause an error - X_new = pd.DataFrame( - { - "col_1": ["a", "b", "c", "x"], - "col_2": ["a", "c", "d", "b"], - "col_3": ["a", "a", "a", "a"], - "col_4": [2, 0, 1, 3], - }, - ) - X_new.ww.init( - logical_types={ - "col_1": Ordinal(order=["a", "b", "c", "x"]), - "col_2": Ordinal(order=["a", "b", "c", "d"]), - }, - ) - with pytest.raises(ValueError) as exec_info: - encoder.transform(X_new) - assert "Found unknown categories" in exec_info.value.args[0] - - def test_categories_set_at_init(): X = pd.DataFrame( { @@ -574,7 +448,6 @@ def test_categories_set_at_init(): # test categories value works when transforming encoder = OrdinalEncoder( - top_n=None, categories=categories, handle_unknown="use_encoded_value", unknown_value=-1, @@ -614,7 +487,6 @@ def test_categories_includes_not_present_value(): # test categories value works when transforming encoder = OrdinalEncoder( - top_n=None, categories=categories, handle_unknown="use_encoded_value", unknown_value=-1, @@ -653,7 +525,6 @@ def test_categories_different_order_from_ltype(): # test categories value works when transforming encoder = OrdinalEncoder( - top_n=None, categories=categories, handle_unknown="use_encoded_value", unknown_value=-1, @@ -667,80 +538,6 @@ def test_categories_different_order_from_ltype(): assert list(X_t["col_3_ordinally_encoded"]) == [0, 0, 0, 0, 0, 0, 1] -def test_less_than_top_n_unique_values(): - X = pd.DataFrame( - { - "col_1": ["a", "b", "c", "d", "d"], - "col_2": ["a", "b", "a", "c", "b"], - "col_3": ["a", "a", "a", "a", "a"], - "col_4": [2, 0, 1, 0, 0], - }, - ) - X.ww.init( - logical_types={ - "col_1": Ordinal(order=["a", "b", "c", "d"]), - "col_2": Ordinal(order=["c", "b", "a"]), - "col_3": "categorical", - }, - ) - - encoder = OrdinalEncoder(top_n=5) - encoder.fit(X) - X_t = encoder.transform(X) - - assert set(X_t.columns) == { - "col_1_ordinally_encoded", - "col_2_ordinally_encoded", - "col_3", - "col_4", - } - pd.testing.assert_series_equal( - X_t["col_1_ordinally_encoded"], - pd.Series([0, 1, 2, 3, 3], name="col_1_ordinally_encoded", dtype="float64"), - ) - pd.testing.assert_series_equal( - X_t["col_2_ordinally_encoded"], - pd.Series([2, 1, 2, 0, 1], name="col_2_ordinally_encoded", dtype="float64"), - ) - - -def test_more_top_n_unique_values(): - # test that columns with >= n unique values encodes properly - X = pd.DataFrame( - { - "col_1": ["a", "b", "c", "d", "e", "f", "g"], - "col_2": ["a", "c", "d", "b", "e", "e", "f"], - "col_3": ["a", "a", "a", "a", "a", "a", "b"], - "col_4": [2, 0, 1, 3, 0, 1, 2], - }, - ) - full_categories = [ - ["a", "b", "c", "d", "e", "f", "g"], - ["a", "b", "c", "d", "e", "f"], - ["a", "b"], - ] - X = set_first_three_columns_to_ordinal_with_categories( - X, - categories=full_categories, - ) - - random_seed = 2 - - encoder = OrdinalEncoder( - top_n=5, - random_seed=random_seed, - handle_unknown="use_encoded_value", - unknown_value=-1, - ) - encoder.fit(X) - X_t = encoder.transform(X) - - # With random seed, selected categories are e, b, d, c, g - assert list(X_t["col_1_ordinally_encoded"]) == [-1, 0, 1, 2, 3, -1, 4] - assert list(X_t["col_2_ordinally_encoded"]) == [0, 2, 3, 1, 4, 4, -1] - assert list(X_t["col_3_ordinally_encoded"]) == [0, 0, 0, 0, 0, 0, 1] - - def test_numpy_input(): X = np.array([[2, 0, 1, 0, 0], [3, 2, 5, 1, 3]]) encoder = OrdinalEncoder() @@ -780,7 +577,7 @@ def test_ordinal_encoder_categories(): }, ) encoder = OrdinalEncoder( - top_n=2, + categories=[["a"], ["a", "b"]], handle_unknown="use_encoded_value", unknown_value=-1, ) @@ -810,9 +607,7 @@ def test_ordinal_encoder_get_feature_names(): "col_2": Ordinal(order=["a", "b", "c", "d"]), }, ) - ordinal_encoder = OrdinalEncoder( - top_n=2, handle_unknown="use_encoded_value", unknown_value=-1, ) @@ -845,8 +640,7 @@ def test_ordinal_encoder_features_to_encode(): "col_2": Ordinal(order=["a", "b", "c", "d"]), }, ) - - encoder = OrdinalEncoder(top_n=5, features_to_encode=["col_1"]) + encoder = OrdinalEncoder(features_to_encode=["col_1"]) encoder.fit(X) X_t = encoder.transform(X) expected_col_names = set(["col_2", "col_1_ordinally_encoded"]) @@ -854,7 +648,7 @@ def test_ordinal_encoder_features_to_encode(): assert col_names == expected_col_names assert [X_t[col].dtype == "uint8" for col in X_t] - encoder = OrdinalEncoder(top_n=5, features_to_encode=["col_1", "col_2"]) + encoder = OrdinalEncoder(features_to_encode=["col_1", "col_2"]) encoder.fit(X) X_t = encoder.transform(X) expected_col_names = set( @@ -874,7 +668,7 @@ def test_ordinal_encoder_features_to_encode_col_missing(): }, ) - encoder = OrdinalEncoder(top_n=5, features_to_encode=["col_3", "col_4"]) + encoder = OrdinalEncoder(features_to_encode=["col_3", "col_4"]) with pytest.raises(ValueError, match="Could not find and encode"): encoder.fit(X) @@ -888,7 +682,7 @@ def test_ordinal_encoder_features_to_encode_no_col_names(): 1: Ordinal(order=[0, 1]), }, ) - encoder = OrdinalEncoder(top_n=5, features_to_encode=[0]) + encoder = OrdinalEncoder(features_to_encode=[0]) encoder.fit(X) X_t = encoder.transform(X) expected_col_names = set([1, "0_ordinally_encoded"]) @@ -897,39 +691,6 @@ def test_ordinal_encoder_features_to_encode_no_col_names(): assert [X_t[col].dtype == "uint8" for col in X_t] -def test_ordinal_encoder_top_n_categories_always_the_same(): - df = pd.DataFrame( - { - "categories": ["cat_1"] * 5 - + ["cat_2"] * 4 - + ["cat_3"] * 3 - + ["cat_4"] * 3 - + ["cat_5"] * 3, - "numbers": range(18), - }, - ) - df.ww.init( - logical_types={ - "categories": Ordinal(order=["cat_1", "cat_2", "cat_3", "cat_4", "cat_5"]), - }, - ) - - def check_df_equality(random_seed): - ordinal_encoder = OrdinalEncoder( - top_n=4, - random_seed=random_seed, - handle_unknown="use_encoded_value", - unknown_value=-1, - ) - - df1 = ordinal_encoder.fit_transform(df) - df2 = ordinal_encoder.fit_transform(df) - pd.testing.assert_frame_equal(df1, df2) - - check_df_equality(5) - check_df_equality(get_random_seed(5)) - - def test_ordinal_encoder_output_doubles(): X = pd.DataFrame( { @@ -996,22 +757,3 @@ def test_data_types(data_type): dtype="float64", ) pd.testing.assert_frame_equal(X_t, expected_df) - - -""" -Tests I didn't include from the ohe tests and why - -were not relevant to the ordinal encoder - - test_drop_first - - test_drop_binary - - test_drop_parameter_is_array - - test_drop_binary_and_top_n_2 - - test_ohe_column_names_unique -Couldn't understand the reason for - - test_categorical_dtype - - test_all_numerical_dtype - - test_ordinal_encoder_woodwork_custom_overrides_returned_by_components -Seemed redundant to other tests - - test_more_top_n_unique_values_large - - test_large_number_of_categories - kind of just another test of top_n arg -""" From 38a8e014d1e8add7c637ec75ad04589453fe75d5 Mon Sep 17 00:00:00 2001 From: Tamar Grey Date: Fri, 30 Sep 2022 16:17:26 -0500 Subject: [PATCH 15/20] Shorter MR comments --- .../transformers/encoders/ordinal_encoder.py | 20 +++--- .../component_tests/test_ordinal_encoder.py | 70 ++++++++----------- 2 files changed, 39 insertions(+), 51 deletions(-) diff --git a/evalml/pipelines/components/transformers/encoders/ordinal_encoder.py b/evalml/pipelines/components/transformers/encoders/ordinal_encoder.py index 451b128f9d..9eae6cc80d 100644 --- a/evalml/pipelines/components/transformers/encoders/ordinal_encoder.py +++ b/evalml/pipelines/components/transformers/encoders/ordinal_encoder.py @@ -147,9 +147,10 @@ def fit(self, X, y=None): X_t = X ww_logical_types = X.ww.logical_types + categories = [] if len(self.features_to_encode) == 0: - # No ordinal features present - categories don't need to be specified - categories = "auto" + # No ordinal features present - no transformation can take place so return early + return self elif self.parameters["categories"] is not None: # Categories specified - make sure they match the ordinal columns input_categories = self.parameters["categories"] @@ -163,7 +164,6 @@ def fit(self, X, y=None): # Categories, as they're passed into SKOrdinalEncoder should be in the same order # as the data's Ordinal.order categories even if it's a subset - categories = [] for i, col_categories in enumerate(input_categories): categories_order = ww_logical_types[self.features_to_encode[i]].order @@ -173,7 +173,6 @@ def fit(self, X, y=None): categories.append(ordered_categories) else: # Categories unspecified - use ordered categories from a columns' Ordinal logical type - categories = [] for col in X_t[self.features_to_encode]: ltype = ww_logical_types[col] # Copy the order list, since we might mutate it later by adding nans @@ -184,10 +183,9 @@ def fit(self, X, y=None): # This is needed because Ordinal.order won't indicate if nulls are present, and SKOrdinalEncoder # requires any null values be present in the categories list if they are to be encoded as # missing values - if isinstance(categories, list): - for i, col in enumerate(X_t[self.features_to_encode]): - if X_t[col].isna().any(): - categories[i] += [np.nan] + for i, col in enumerate(X_t[self.features_to_encode]): + if X_t[col].isna().any(): + categories[i] += [np.nan] encoded_missing_value = self.parameters["encoded_missing_value"] if encoded_missing_value is None: @@ -238,7 +236,7 @@ def _get_feature_names(self): """Return feature names for the ordinal features after fitting. Since ordinal encoding creates one encoded feature per column in features_to_encode, feature - names are formatted as {column_name}_ordinally_encoded + names are formatted as {column_name}_ordinal_encoding Returns: np.ndarray: The feature names after encoding, provided in the same order as input_features. @@ -247,7 +245,7 @@ def _get_feature_names(self): unique_names = [] provenance = {} for col_name in self.features_to_encode: - encoded_name = f"{col_name}_ordinally_encoded" + encoded_name = f"{col_name}_ordinal_encoding" unique_names.append(encoded_name) provenance[col_name] = [encoded_name] self._provenance = provenance @@ -276,7 +274,7 @@ def categories(self, feature_name): def get_feature_names(self): """Return feature names for the ordinal features after fitting. - Feature names are formatted as {column name}_ordinally_encoded. + Feature names are formatted as {column name}_ordinal_encoding. Returns: np.ndarray: The feature names after encoding, provided in the same order as input_features. diff --git a/evalml/tests/component_tests/test_ordinal_encoder.py b/evalml/tests/component_tests/test_ordinal_encoder.py index 31ff688c5f..40fde163e5 100644 --- a/evalml/tests/component_tests/test_ordinal_encoder.py +++ b/evalml/tests/component_tests/test_ordinal_encoder.py @@ -157,7 +157,7 @@ def test_categories_specified_not_present_in_data(): X_2 = pd.DataFrame({"col_1": ["a", "b", "a", "c", "x"]}) X_2.ww.init(logical_types={"col_1": Ordinal(order=["a", "b", "c", "d", "x"])}) X_t = encoder.transform(X_2) - assert list(X_t["col_1_ordinally_encoded"]) == [0, -1, 0, -1, 1] + assert list(X_t["col_1_ordinal_encoding"]) == [0, -1, 0, -1, 1] def test_ordinal_encoder_is_no_op_for_df_of_non_ordinal_features(): @@ -269,17 +269,17 @@ def test_ordinal_encoder_transform(): encoder.fit(X) X_t = encoder.transform(X) assert set(X_t.columns) == { - "col_1_ordinally_encoded", - "col_2_ordinally_encoded", + "col_1_ordinal_encoding", + "col_2_ordinal_encoding", "col_3", } pd.testing.assert_series_equal( - X_t["col_1_ordinally_encoded"], - pd.Series([0, 1, 2, 3, 3], name="col_1_ordinally_encoded", dtype="float64"), + X_t["col_1_ordinal_encoding"], + pd.Series([0, 1, 2, 3, 3], name="col_1_ordinal_encoding", dtype="float64"), ) pd.testing.assert_series_equal( - X_t["col_2_ordinally_encoded"], - pd.Series([2, 1, 2, 0, 1], name="col_2_ordinally_encoded", dtype="float64"), + X_t["col_2_ordinal_encoding"], + pd.Series([2, 1, 2, 0, 1], name="col_2_ordinal_encoding", dtype="float64"), ) @@ -311,13 +311,13 @@ def test_null_values_in_dataframe(): encoder = OrdinalEncoder() encoder.fit(X) X_t = encoder.transform(X) - assert pd.isna(X_t["col_1_ordinally_encoded"].iloc[-1]) + assert pd.isna(X_t["col_1_ordinal_encoding"].iloc[-1]) # If we handle unknowns with an encoded value, the nan will be set to that value encoder = OrdinalEncoder(encoded_missing_value=-1) encoder.fit(X) X_t = encoder.transform(X) - assert X_t["col_1_ordinally_encoded"].iloc[-1] == -1 + assert X_t["col_1_ordinal_encoding"].iloc[-1] == -1 def test_ordinal_encoder_diff_na_types(): @@ -340,9 +340,9 @@ def test_ordinal_encoder_diff_na_types(): encoder.fit(X) X_t = encoder.transform(X) # Confirm were recognized as null and encoded - assert X_t["col_1_ordinally_encoded"].iloc[-1] == -1 - assert X_t["col_2_ordinally_encoded"].iloc[-1] == -1 - assert X_t["col_3_ordinally_encoded"].iloc[-1] == -1 + assert X_t["col_1_ordinal_encoding"].iloc[-1] == -1 + assert X_t["col_2_ordinal_encoding"].iloc[-1] == -1 + assert X_t["col_3_ordinal_encoding"].iloc[-1] == -1 def test_null_values_with_categories_specified(): @@ -380,9 +380,9 @@ def test_null_values_with_categories_specified(): encoder.fit(X) X_t = encoder.transform(X) # Check that the null values were handled as missing even when they're present in categories - assert pd.isna(X_t["col_1_ordinally_encoded"].iloc[-1]) - assert pd.isna(X_t["col_1_ordinally_encoded"].iloc[-2]) - assert pd.isna(X_t["col_2_ordinally_encoded"].iloc[-1]) + assert pd.isna(X_t["col_1_ordinal_encoding"].iloc[-1]) + assert pd.isna(X_t["col_1_ordinal_encoding"].iloc[-2]) + assert pd.isna(X_t["col_2_ordinal_encoding"].iloc[-1]) def test_handle_unknown(): @@ -456,9 +456,9 @@ def test_categories_set_at_init(): encoder.fit(X) X_t = encoder.transform(X) - assert list(X_t["col_1_ordinally_encoded"]) == [0, 1, 2, 3, -1, -1, -1] - assert list(X_t["col_2_ordinally_encoded"]) == [0, 2, -1, 1, -1, -1, -1] - assert list(X_t["col_3_ordinally_encoded"]) == [0, 0, 0, 0, 0, 0, 1] + assert list(X_t["col_1_ordinal_encoding"]) == [0, 1, 2, 3, -1, -1, -1] + assert list(X_t["col_2_ordinal_encoding"]) == [0, 2, -1, 1, -1, -1, -1] + assert list(X_t["col_3_ordinal_encoding"]) == [0, 0, 0, 0, 0, 0, 1] def test_categories_includes_not_present_value(): @@ -533,17 +533,9 @@ def test_categories_different_order_from_ltype(): encoder.fit(X) X_t = encoder.transform(X) - assert list(X_t["col_1_ordinally_encoded"]) == [0, 1, 2, 3, -1, -1, -1] - assert list(X_t["col_2_ordinally_encoded"]) == [0, 2, -1, 1, -1, -1, -1] - assert list(X_t["col_3_ordinally_encoded"]) == [0, 0, 0, 0, 0, 0, 1] - - -def test_numpy_input(): - X = np.array([[2, 0, 1, 0, 0], [3, 2, 5, 1, 3]]) - encoder = OrdinalEncoder() - encoder.fit(X) - X_t = encoder.transform(X) - pd.testing.assert_frame_equal(pd.DataFrame(X), X_t, check_dtype=False) + assert list(X_t["col_1_ordinal_encoding"]) == [0, 1, 2, 3, -1, -1, -1] + assert list(X_t["col_2_ordinal_encoding"]) == [0, 2, -1, 1, -1, -1, -1] + assert list(X_t["col_3_ordinal_encoding"]) == [0, 0, 0, 0, 0, 0, 1] @pytest.mark.parametrize( @@ -619,14 +611,14 @@ def test_ordinal_encoder_get_feature_names(): ordinal_encoder.fit(X) np.testing.assert_array_equal( ordinal_encoder.get_feature_names(), - np.array(["col_1_ordinally_encoded", "col_2_ordinally_encoded"]), + np.array(["col_1_ordinal_encoding", "col_2_ordinal_encoding"]), ) ordinal_encoder = OrdinalEncoder(features_to_encode=["col_2"]) ordinal_encoder.fit(X) np.testing.assert_array_equal( ordinal_encoder.get_feature_names(), - np.array(["col_2_ordinally_encoded"]), + np.array(["col_2_ordinal_encoding"]), ) @@ -643,7 +635,7 @@ def test_ordinal_encoder_features_to_encode(): encoder = OrdinalEncoder(features_to_encode=["col_1"]) encoder.fit(X) X_t = encoder.transform(X) - expected_col_names = set(["col_2", "col_1_ordinally_encoded"]) + expected_col_names = set(["col_2", "col_1_ordinal_encoding"]) col_names = set(X_t.columns) assert col_names == expected_col_names assert [X_t[col].dtype == "uint8" for col in X_t] @@ -652,7 +644,7 @@ def test_ordinal_encoder_features_to_encode(): encoder.fit(X) X_t = encoder.transform(X) expected_col_names = set( - ["col_1_ordinally_encoded", "col_2_ordinally_encoded"], + ["col_1_ordinal_encoding", "col_2_ordinal_encoding"], ) col_names = set(X_t.columns) assert col_names == expected_col_names @@ -685,7 +677,7 @@ def test_ordinal_encoder_features_to_encode_no_col_names(): encoder = OrdinalEncoder(features_to_encode=[0]) encoder.fit(X) X_t = encoder.transform(X) - expected_col_names = set([1, "0_ordinally_encoded"]) + expected_col_names = set([1, "0_ordinal_encoding"]) col_names = set(X_t.columns) assert col_names == expected_col_names assert [X_t[col].dtype == "uint8" for col in X_t] @@ -719,15 +711,13 @@ def test_ordinal_encoder_output_doubles(): assert len(output.columns) == len(X.columns) -@pytest.mark.parametrize("data_type", ["list", "np", "pd_no_index", "pd_index", "ww"]) +@pytest.mark.parametrize("data_type", ["list", "np", "pd", "ww"]) def test_data_types(data_type): if data_type == "list": X = [["a"], ["b"], ["c"]] * 5 elif data_type == "np": X = np.array([["a"], ["b"], ["c"]] * 5) - elif data_type == "pd_no_index": - X = pd.DataFrame(["a", "b", "c"] * 5) - elif data_type == "pd_index": + elif data_type == "pd": X = pd.DataFrame(["a", "b", "c"] * 5, columns=[0]) elif data_type == "ww": X = pd.DataFrame(["a", "b", "c"] * 5) @@ -750,10 +740,10 @@ def test_data_types(data_type): ) pd.testing.assert_frame_equal(X_t, expected_df) else: - assert list(X_t.columns) == ["0_ordinally_encoded"] + assert list(X_t.columns) == ["0_ordinal_encoding"] expected_df = pd.DataFrame( [[0], [1], [2]] * 5, - columns=["0_ordinally_encoded"], + columns=["0_ordinal_encoding"], dtype="float64", ) pd.testing.assert_frame_equal(X_t, expected_df) From 4d81962419f4fcbc347e0b04d6b87980e2371a0b Mon Sep 17 00:00:00 2001 From: Tamar Grey Date: Mon, 3 Oct 2022 09:25:07 -0500 Subject: [PATCH 16/20] split errors in two --- .../transformers/encoders/ordinal_encoder.py | 14 ++++++++------ .../tests/component_tests/test_ordinal_encoder.py | 11 +++-------- 2 files changed, 11 insertions(+), 14 deletions(-) diff --git a/evalml/pipelines/components/transformers/encoders/ordinal_encoder.py b/evalml/pipelines/components/transformers/encoders/ordinal_encoder.py index 9eae6cc80d..3636a302e9 100644 --- a/evalml/pipelines/components/transformers/encoders/ordinal_encoder.py +++ b/evalml/pipelines/components/transformers/encoders/ordinal_encoder.py @@ -154,14 +154,16 @@ def fit(self, X, y=None): elif self.parameters["categories"] is not None: # Categories specified - make sure they match the ordinal columns input_categories = self.parameters["categories"] - if len(input_categories) != len(self.features_to_encode) or not isinstance( - input_categories[0], - list, - ): + + if len(input_categories) != len(self.features_to_encode): raise ValueError( - "Categories argument must contain a list of categories for each ordinal feature", + "Categories argument must contain as many elements as there are Ordinal features.", ) + if not all(isinstance(cats, list) for cats in input_categories): + raise ValueError( + "Each element of the categories argument must be a list.", + ) # Categories, as they're passed into SKOrdinalEncoder should be in the same order # as the data's Ordinal.order categories even if it's a subset for i, col_categories in enumerate(input_categories): @@ -210,7 +212,7 @@ def transform(self, X, y=None): Returns: pd.DataFrame: Transformed data, where each ordinal feature has been encoded into - a numerical column using where of ordinal integers represent + a numerical column where ordinal integers represent the relative order of categories. """ X = infer_feature_types(X) diff --git a/evalml/tests/component_tests/test_ordinal_encoder.py b/evalml/tests/component_tests/test_ordinal_encoder.py index 40fde163e5..8eee387c5a 100644 --- a/evalml/tests/component_tests/test_ordinal_encoder.py +++ b/evalml/tests/component_tests/test_ordinal_encoder.py @@ -5,7 +5,6 @@ from evalml.exceptions import ComponentNotYetFittedError from evalml.pipelines.components import OrdinalEncoder -from evalml.utils import get_random_seed def set_first_three_columns_to_ordinal_with_categories(X, categories): @@ -68,16 +67,12 @@ def test_invalid_inputs(): [["a", "b", "c", "d"], ["a", "b", "c"], ["a"]], ) encoder = OrdinalEncoder(categories=[["a", "b"], ["a", "c"]]) - error_msg = ( - "Categories argument must contain a list of categories for each ordinal feature" - ) + error_msg = "Categories argument must contain as many elements as there are Ordinal features." with pytest.raises(ValueError, match=error_msg): encoder.fit(X) - encoder = OrdinalEncoder(categories=["a", "b", "c"]) - error_msg = ( - "Categories argument must contain a list of categories for each ordinal feature" - ) + encoder = OrdinalEncoder(categories=[["a", "b"], 1, ["a"]]) + error_msg = "Each element of the categories argument must be a list." with pytest.raises(ValueError, match=error_msg): encoder.fit(X) From dd9f05b72d60e8b2f28798019f94282a6a8c522e Mon Sep 17 00:00:00 2001 From: Tamar Grey Date: Mon, 3 Oct 2022 09:46:25 -0500 Subject: [PATCH 17/20] Refactor transform to avoid unnecessary copying --- .../transformers/encoders/ordinal_encoder.py | 26 ++++++++++--------- 1 file changed, 14 insertions(+), 12 deletions(-) diff --git a/evalml/pipelines/components/transformers/encoders/ordinal_encoder.py b/evalml/pipelines/components/transformers/encoders/ordinal_encoder.py index 3636a302e9..49c86230c2 100644 --- a/evalml/pipelines/components/transformers/encoders/ordinal_encoder.py +++ b/evalml/pipelines/components/transformers/encoders/ordinal_encoder.py @@ -217,22 +217,24 @@ def transform(self, X, y=None): """ X = infer_feature_types(X) - X_copy = X.ww.copy() - X = X.ww.drop(columns=self.features_to_encode) + if not self.features_to_encode: + # If there are no features to encode, X needs no transformation, so return a copy + return X.ww.copy() + + X_orig = X.ww.drop(columns=self.features_to_encode) # Call sklearn's transform on only the ordinal columns - if len(self.features_to_encode) > 0: - X_ord = pd.DataFrame( - self._encoder.transform(X_copy[self.features_to_encode]), - index=X_copy.index, - ) - X_ord.columns = self._get_feature_names() - X_ord.ww.init(logical_types={c: "Double" for c in X_ord.columns}) - self._feature_names = X_ord.columns + X_t = pd.DataFrame( + self._encoder.transform(X[self.features_to_encode]), + index=X.index, + ) + X_t.columns = self._get_feature_names() + X_t.ww.init(logical_types={c: "Double" for c in X_t.columns}) + self._feature_names = X_t.columns - X = ww.utils.concat_columns([X, X_ord]) + X_t = ww.utils.concat_columns([X_orig, X_t]) - return X + return X_t def _get_feature_names(self): """Return feature names for the ordinal features after fitting. From e82b13a95d0c22a570a20604b6824b1f48a71995 Mon Sep 17 00:00:00 2001 From: Tamar Grey Date: Mon, 3 Oct 2022 17:08:50 -0500 Subject: [PATCH 18/20] MR comments --- .../transformers/encoders/ordinal_encoder.py | 25 ++++++++----------- .../component_tests/test_ordinal_encoder.py | 14 +++++------ 2 files changed, 18 insertions(+), 21 deletions(-) diff --git a/evalml/pipelines/components/transformers/encoders/ordinal_encoder.py b/evalml/pipelines/components/transformers/encoders/ordinal_encoder.py index 49c86230c2..7de0bb876b 100644 --- a/evalml/pipelines/components/transformers/encoders/ordinal_encoder.py +++ b/evalml/pipelines/components/transformers/encoders/ordinal_encoder.py @@ -45,7 +45,6 @@ class OrdinalEncoder(Transformer, metaclass=OrdinalEncoderMeta): encoded_missing_value (int or np.nan): The value to use for missing (null) values seen during fit or transform. Defaults to np.nan. random_seed (int): Seed for the random number generator. Defaults to 0. - # --> is random seed still necessary? """ name = "Ordinal Encoder" @@ -60,7 +59,7 @@ def __init__( unknown_value=None, encoded_missing_value=None, random_seed=0, - **kwargs, # --> why do we have kwargs if they arent used?? Maybe needed for larger evalml pipeline + **kwargs, ): parameters = { "features_to_encode": features_to_encode, @@ -85,7 +84,7 @@ def __init__( ) self.features_to_encode = features_to_encode - self._encoder = None + self._component_obj = None super().__init__( parameters=parameters, @@ -144,8 +143,6 @@ def fit(self, X, y=None): col for col in X.columns if col in self.features_to_encode ] - X_t = X - ww_logical_types = X.ww.logical_types categories = [] if len(self.features_to_encode) == 0: @@ -175,7 +172,7 @@ def fit(self, X, y=None): categories.append(ordered_categories) else: # Categories unspecified - use ordered categories from a columns' Ordinal logical type - for col in X_t[self.features_to_encode]: + for col in X[self.features_to_encode]: ltype = ww_logical_types[col] # Copy the order list, since we might mutate it later by adding nans # and don't want to impact the Woodwork types @@ -185,22 +182,22 @@ def fit(self, X, y=None): # This is needed because Ordinal.order won't indicate if nulls are present, and SKOrdinalEncoder # requires any null values be present in the categories list if they are to be encoded as # missing values - for i, col in enumerate(X_t[self.features_to_encode]): - if X_t[col].isna().any(): + for i, col in enumerate(X[self.features_to_encode]): + if X[col].isna().any(): categories[i] += [np.nan] encoded_missing_value = self.parameters["encoded_missing_value"] if encoded_missing_value is None: encoded_missing_value = np.nan - self._encoder = SKOrdinalEncoder( + self._component_obj = SKOrdinalEncoder( categories=categories, handle_unknown=self.parameters["handle_unknown"], unknown_value=self.parameters["unknown_value"], encoded_missing_value=encoded_missing_value, ) - self._encoder.fit(X_t[self.features_to_encode]) + self._component_obj.fit(X[self.features_to_encode]) return self def transform(self, X, y=None): @@ -218,14 +215,14 @@ def transform(self, X, y=None): X = infer_feature_types(X) if not self.features_to_encode: - # If there are no features to encode, X needs no transformation, so return a copy - return X.ww.copy() + # If there are no features to encode, X needs no transformation + return X X_orig = X.ww.drop(columns=self.features_to_encode) # Call sklearn's transform on only the ordinal columns X_t = pd.DataFrame( - self._encoder.transform(X[self.features_to_encode]), + self._component_obj.transform(X[self.features_to_encode]), index=X.index, ) X_t.columns = self._get_feature_names() @@ -273,7 +270,7 @@ def categories(self, feature_name): raise ValueError( f'Feature "{feature_name}" was not provided to ordinal encoder as a training feature', ) - return self._encoder.categories_[index] + return self._component_obj.categories_[index] def get_feature_names(self): """Return feature names for the ordinal features after fitting. diff --git a/evalml/tests/component_tests/test_ordinal_encoder.py b/evalml/tests/component_tests/test_ordinal_encoder.py index 8eee387c5a..2f52cc7840 100644 --- a/evalml/tests/component_tests/test_ordinal_encoder.py +++ b/evalml/tests/component_tests/test_ordinal_encoder.py @@ -93,7 +93,7 @@ def test_categories_list_not_passed_in_for_non_ordinal_column(): encoder = OrdinalEncoder(categories=[["a", "b", "c", "d"]]) encoder.fit(X) - assert len(encoder._encoder.categories_) == len(encoder.features_to_encode) + assert len(encoder._component_obj.categories_) == len(encoder.features_to_encode) error = 'Feature "col_1" was not provided to ordinal encoder as a training feature' with pytest.raises(ValueError, match=error): @@ -115,7 +115,7 @@ def test_categories_list_not_passed_in_for_non_ordinal_column(): ) encoder.fit(X) - assert len(encoder._encoder.categories_) == len(encoder.features_to_encode) + assert len(encoder._component_obj.categories_) == len(encoder.features_to_encode) set(encoder.categories("col_2")) == {"a", "b", "c", "d"} set(encoder.categories("col_3")) == {"x", "y"} @@ -201,14 +201,14 @@ def test_ordinal_encoder_recognizes_ordinal_columns(): encoder = OrdinalEncoder() encoder.fit(X) assert encoder.features_to_encode == ["col_1", "col_2", "col_3"] - assert encoder.features_to_encode == list(encoder._encoder.feature_names_in_) + assert encoder.features_to_encode == list(encoder._component_obj.feature_names_in_) encoder = OrdinalEncoder(features_to_encode=["col_1"]) encoder.fit(X) assert encoder.features_to_encode == ["col_1"] - assert encoder.features_to_encode == list(encoder._encoder.feature_names_in_) + assert encoder.features_to_encode == list(encoder._component_obj.feature_names_in_) expected_categories = [categories[0]] - for i, category_list in enumerate(encoder._encoder.categories_): + for i, category_list in enumerate(encoder._component_obj.categories_): assert list(category_list) == expected_categories[i] @@ -229,7 +229,7 @@ def test_ordinal_encoder_categories_set_correctly_from_fit(): # No parameters specified encoder = OrdinalEncoder() encoder.fit(X) - for i, category_list in enumerate(encoder._encoder.categories_): + for i, category_list in enumerate(encoder._component_obj.categories_): assert list(category_list) == categories[i] # Categories set at init explicitly - means we have to handle the unknown case @@ -240,7 +240,7 @@ def test_ordinal_encoder_categories_set_correctly_from_fit(): unknown_value=-1, ) encoder.fit(X) - for i, category_list in enumerate(encoder._encoder.categories_): + for i, category_list in enumerate(encoder._component_obj.categories_): assert list(category_list) == subset_categories[i] From 94ca85ce7117ea796431077dcb0886ad1fa3d470 Mon Sep 17 00:00:00 2001 From: Tamar Grey Date: Tue, 4 Oct 2022 10:03:21 -0500 Subject: [PATCH 19/20] Make categories arg a dict --- .../transformers/encoders/ordinal_encoder.py | 47 ++++--- .../component_tests/test_ordinal_encoder.py | 124 +++++++++++------- 2 files changed, 99 insertions(+), 72 deletions(-) diff --git a/evalml/pipelines/components/transformers/encoders/ordinal_encoder.py b/evalml/pipelines/components/transformers/encoders/ordinal_encoder.py index 7de0bb876b..425dcab1d7 100644 --- a/evalml/pipelines/components/transformers/encoders/ordinal_encoder.py +++ b/evalml/pipelines/components/transformers/encoders/ordinal_encoder.py @@ -27,10 +27,9 @@ class OrdinalEncoder(Transformer, metaclass=OrdinalEncoderMeta): Args: features_to_encode (list[str]): List of columns to encode. All other columns will remain untouched. If None, all appropriate columns will be encoded. Defaults to None. The order of columns does not matter. - categories (list[list[str]]): A two dimensional list of categories, where `categories[i]` is a list of the categories - for the column at index `i` in the dataframes passed in at fit and transform. - The order of categories specified for a column does not matter. - Any category not present in categories will be handled as an unknown value. + categories (dict[str, list[str]]): A dictionary mapping column names to their categories + in the dataframes passed in at fit and transform. The order of categories specified for a column does not matter. + Any category found in the data that is not present in categories will be handled as an unknown value. To not have unknown values raise an error, set handle_unknown to "use_encoded_value". Defaults to None. handle_unknown ("error" or "use_encoded_value"): Whether to ignore or error for unknown categories @@ -139,12 +138,12 @@ def fit(self, X, y=None): ) # Put features_to_encode in the same relative order as the columns in the dataframe - self.features_to_encode = [ - col for col in X.columns if col in self.features_to_encode - ] + # self.features_to_encode = [ + # col for col in X.columns if col in self.features_to_encode + # ] ww_logical_types = X.ww.logical_types - categories = [] + categories = {} if len(self.features_to_encode) == 0: # No ordinal features present - no transformation can take place so return early return self @@ -154,44 +153,50 @@ def fit(self, X, y=None): if len(input_categories) != len(self.features_to_encode): raise ValueError( - "Categories argument must contain as many elements as there are Ordinal features.", + "Categories argument must contain as many elements as there are features to encode.", ) - if not all(isinstance(cats, list) for cats in input_categories): + if not all(isinstance(cats, list) for cats in input_categories.values()): raise ValueError( - "Each element of the categories argument must be a list.", + "Each of the values in the categories argument must be a list.", ) # Categories, as they're passed into SKOrdinalEncoder should be in the same order # as the data's Ordinal.order categories even if it's a subset - for i, col_categories in enumerate(input_categories): - categories_order = ww_logical_types[self.features_to_encode[i]].order + for col_name in self.features_to_encode: + col_categories = input_categories[col_name] + categories_order = ww_logical_types[col_name].order ordered_categories = [ cat for cat in categories_order if cat in col_categories ] - categories.append(ordered_categories) + categories[col_name] = ordered_categories else: # Categories unspecified - use ordered categories from a columns' Ordinal logical type - for col in X[self.features_to_encode]: - ltype = ww_logical_types[col] + for col_name in self.features_to_encode: + ltype = ww_logical_types[col_name] # Copy the order list, since we might mutate it later by adding nans # and don't want to impact the Woodwork types - categories.append(ltype.order.copy()) + categories[col_name] = ltype.order.copy() # Add any null values into the categories lists so that they aren't treated as unknown values # This is needed because Ordinal.order won't indicate if nulls are present, and SKOrdinalEncoder # requires any null values be present in the categories list if they are to be encoded as # missing values - for i, col in enumerate(X[self.features_to_encode]): - if X[col].isna().any(): - categories[i] += [np.nan] + for col_name in self.features_to_encode: + if X[col_name].isna().any(): + categories[col_name].append(np.nan) + + # sklearn needs categories to be a list in the order of the columns in features_to_encode + categories_for_sk_encoder = [ + categories[col_name] for col_name in self.features_to_encode + ] encoded_missing_value = self.parameters["encoded_missing_value"] if encoded_missing_value is None: encoded_missing_value = np.nan self._component_obj = SKOrdinalEncoder( - categories=categories, + categories=categories_for_sk_encoder, handle_unknown=self.parameters["handle_unknown"], unknown_value=self.parameters["unknown_value"], encoded_missing_value=encoded_missing_value, diff --git a/evalml/tests/component_tests/test_ordinal_encoder.py b/evalml/tests/component_tests/test_ordinal_encoder.py index 2f52cc7840..780d60bad7 100644 --- a/evalml/tests/component_tests/test_ordinal_encoder.py +++ b/evalml/tests/component_tests/test_ordinal_encoder.py @@ -10,9 +10,9 @@ def set_first_three_columns_to_ordinal_with_categories(X, categories): X.ww.init( logical_types={ - "col_1": Ordinal(order=categories[0]), - "col_2": Ordinal(order=categories[1]), - "col_3": Ordinal(order=categories[2]), + "col_1": Ordinal(order=categories["col_1"]), + "col_2": Ordinal(order=categories["col_2"]), + "col_3": Ordinal(order=categories["col_3"]), }, ) return X @@ -64,15 +64,17 @@ def test_invalid_inputs(): ) X = set_first_three_columns_to_ordinal_with_categories( X, - [["a", "b", "c", "d"], ["a", "b", "c"], ["a"]], + {"col_1": ["a", "b", "c", "d"], "col_2": ["a", "b", "c"], "col_3": ["a"]}, ) - encoder = OrdinalEncoder(categories=[["a", "b"], ["a", "c"]]) - error_msg = "Categories argument must contain as many elements as there are Ordinal features." + encoder = OrdinalEncoder(categories={"col_1": ["a", "b"], "col_2": ["a", "c"]}) + error_msg = "Categories argument must contain as many elements as there are features to encode." with pytest.raises(ValueError, match=error_msg): encoder.fit(X) - encoder = OrdinalEncoder(categories=[["a", "b"], 1, ["a"]]) - error_msg = "Each element of the categories argument must be a list." + encoder = OrdinalEncoder( + categories={"col_1": 1, "col_2": ["a", "c"], "col_3": ["a", "b"]}, + ) + error_msg = "Each of the values in the categories argument must be a list." with pytest.raises(ValueError, match=error_msg): encoder.fit(X) @@ -90,7 +92,7 @@ def test_categories_list_not_passed_in_for_non_ordinal_column(): ) X.ww.init(logical_types={"col_2": Ordinal(order=["a", "b", "c", "d"])}) - encoder = OrdinalEncoder(categories=[["a", "b", "c", "d"]]) + encoder = OrdinalEncoder(categories={"col_2": ["a", "b", "c", "d"]}) encoder.fit(X) assert len(encoder._component_obj.categories_) == len(encoder.features_to_encode) @@ -99,8 +101,8 @@ def test_categories_list_not_passed_in_for_non_ordinal_column(): with pytest.raises(ValueError, match=error): encoder.categories("col_1") - # When features_to_encode is passed in, confirm the order there doesn't matter and order of - # cols in X is still used in indexing into categories + # When features_to_encode is passed in, confirm the order there doesn't matter + # in indexing into categories_ X.ww.init( logical_types={ "col_2": Ordinal(order=["a", "b", "c", "d"]), @@ -110,8 +112,7 @@ def test_categories_list_not_passed_in_for_non_ordinal_column(): encoder = OrdinalEncoder( # features_to_encode passed in different order than the dataframe's cols features_to_encode=["col_3", "col_2"], - # categories' order still matches the dataframe's cols - categories=[["a", "b", "c", "d"], ["x", "y"]], + categories={"col_2": ["a", "b", "c", "d"], "col_3": ["x", "y"]}, ) encoder.fit(X) @@ -119,6 +120,10 @@ def test_categories_list_not_passed_in_for_non_ordinal_column(): set(encoder.categories("col_2")) == {"a", "b", "c", "d"} set(encoder.categories("col_3")) == {"x", "y"} + X_t = encoder.transform(X) + assert list(X_t["col_2_ordinal_encoding"]) == [0, 1, 0, 2, 3] + assert list(X_t["col_3_ordinal_encoding"]) == [0, 0, 0, 1, 1] + def test_features_to_encode_non_ordinal_cols(): X = pd.DataFrame({"col_1": [2, 0, 1, 0, 0], "col_2": ["a", "b", "a", "c", "d"]}) @@ -142,7 +147,7 @@ def test_categories_specified_not_present_in_data(): X.ww.init(logical_types={"col_1": Ordinal(order=["a", "b", "c", "d", "x"])}) encoder = OrdinalEncoder( - categories=[["a", "x"]], + categories={"col_1": ["a", "x"]}, handle_unknown="use_encoded_value", unknown_value=-1, ) @@ -195,7 +200,11 @@ def test_ordinal_encoder_recognizes_ordinal_columns(): encoder.fit(X) assert not encoder.features_to_encode - categories = [["a", "b", "c", "d"], ["a", "b", "c"], ["a"]] + categories = { + "col_1": ["a", "b", "c", "d"], + "col_2": ["a", "b", "c"], + "col_3": ["a"], + } X = set_first_three_columns_to_ordinal_with_categories(X, categories=categories) encoder = OrdinalEncoder() @@ -207,9 +216,9 @@ def test_ordinal_encoder_recognizes_ordinal_columns(): encoder.fit(X) assert encoder.features_to_encode == ["col_1"] assert encoder.features_to_encode == list(encoder._component_obj.feature_names_in_) - expected_categories = [categories[0]] + expected_categories = {"col_1": categories["col_1"]} for i, category_list in enumerate(encoder._component_obj.categories_): - assert list(category_list) == expected_categories[i] + assert list(category_list) == expected_categories[f"col_{i + 1}"] def test_ordinal_encoder_categories_set_correctly_from_fit(): @@ -223,17 +232,21 @@ def test_ordinal_encoder_categories_set_correctly_from_fit(): "col_4": [1, 2, 3, 4, 5], }, ) - categories = [["a", "b", "c", "d"], ["a", "b", "c"], ["a"]] + categories = { + "col_1": ["a", "b", "c", "d"], + "col_2": ["a", "b", "c"], + "col_3": ["a"], + } X = set_first_three_columns_to_ordinal_with_categories(X, categories=categories) # No parameters specified encoder = OrdinalEncoder() encoder.fit(X) for i, category_list in enumerate(encoder._component_obj.categories_): - assert list(category_list) == categories[i] + assert list(category_list) == categories[f"col_{i + 1}"] # Categories set at init explicitly - means we have to handle the unknown case - subset_categories = [["a"], ["a"], ["a"]] + subset_categories = {"col_1": ["a"], "col_2": ["a"], "col_3": ["a"]} encoder = OrdinalEncoder( categories=subset_categories, handle_unknown="use_encoded_value", @@ -241,7 +254,7 @@ def test_ordinal_encoder_categories_set_correctly_from_fit(): ) encoder.fit(X) for i, category_list in enumerate(encoder._component_obj.categories_): - assert list(category_list) == subset_categories[i] + assert list(category_list) == subset_categories[f"col_{i + 1}"] def test_ordinal_encoder_transform(): @@ -368,7 +381,7 @@ def test_null_values_with_categories_specified(): # Try putting a nan in the categories list in one of the columns but not the other encoder = OrdinalEncoder( - categories=[["a"], ["a", np.nan]], + categories={"col_1": ["a"], "col_2": ["a", np.nan]}, handle_unknown="use_encoded_value", unknown_value=-1, ) @@ -389,11 +402,11 @@ def test_handle_unknown(): "col_4": [2, 0, 1, 3, 0, 1, 2], }, ) - categories = [ - ["a", "b", "c", "d", "e", "f", "g"], - ["a", "b", "c", "d", "e", "f"], - ["a", "b"], - ] + categories = { + "col_1": ["a", "b", "c", "d", "e", "f", "g"], + "col_2": ["a", "b", "c", "d", "e", "f"], + "col_3": ["a", "b"], + } X = set_first_three_columns_to_ordinal_with_categories(X, categories=categories) encoder = OrdinalEncoder(handle_unknown="error") @@ -408,11 +421,11 @@ def test_handle_unknown(): "col_4": [2, 0, 1, 3, 0, 1, 2], }, ) - categories = [ - ["x", "b", "c", "d", "e", "f", "g"], - ["a", "b", "c", "d", "e", "f"], - ["a", "b"], - ] + categories = { + "col_1": ["x", "b", "c", "d", "e", "f", "g"], + "col_2": ["a", "b", "c", "d", "e", "f"], + "col_3": ["a", "b"], + } X = set_first_three_columns_to_ordinal_with_categories(X, categories=categories) with pytest.raises(ValueError) as exec_info: # Using the encoder that was fit on data without x @@ -429,17 +442,22 @@ def test_categories_set_at_init(): "col_4": [2, 0, 1, 3, 0, 1, 2], }, ) - full_categories = [ - ["a", "b", "c", "d", "e", "f", "g"], - ["a", "b", "c", "d", "e", "f"], - ["a", "b"], - ] + + full_categories = { + "col_1": ["a", "b", "c", "d", "e", "f", "g"], + "col_2": ["a", "b", "c", "d", "e", "f"], + "col_3": ["a", "b"], + } X = set_first_three_columns_to_ordinal_with_categories( X, categories=full_categories, ) - categories = [["a", "b", "c", "d"], ["a", "b", "c"], ["a", "b"]] + categories = { + "col_1": ["a", "b", "c", "d"], + "col_2": ["a", "b", "c"], + "col_3": ["a", "b"], + } # test categories value works when transforming encoder = OrdinalEncoder( @@ -467,18 +485,18 @@ def test_categories_includes_not_present_value(): "col_4": [2, 0, 1, 3, 0, 1, 2], }, ) - full_categories = [ - ["a", "b", "c", "d", "e", "f", "g"], - ["a", "b", "c", "d", "e", "f"], - ["a", "b"], - ] + full_categories = { + "col_1": ["a", "b", "c", "d", "e", "f", "g"], + "col_2": ["a", "b", "c", "d", "e", "f"], + "col_3": ["a", "b"], + } X = set_first_three_columns_to_ordinal_with_categories( X, categories=full_categories, ) # Categories passed in has value "x" that's not in the data - categories = [["a", "x"], ["a", "x"], ["a", "x"]] + categories = {"col_1": ["a", "x"], "col_2": ["a", "x"], "col_3": ["a", "x"]} # test categories value works when transforming encoder = OrdinalEncoder( @@ -505,18 +523,22 @@ def test_categories_different_order_from_ltype(): "col_4": [2, 0, 1, 3, 0, 1, 2], }, ) - full_categories = [ - ["a", "b", "c", "d", "e", "f", "g"], - ["a", "b", "c", "d", "e", "f"], - ["a", "b"], - ] + full_categories = { + "col_1": ["a", "b", "c", "d", "e", "f", "g"], + "col_2": ["a", "b", "c", "d", "e", "f"], + "col_3": ["a", "b"], + } X = set_first_three_columns_to_ordinal_with_categories( X, categories=full_categories, ) # The order doesn't match the full categories above but outputted data will still match above - categories = [["d", "a", "c", "b"], ["c", "b", "a"], ["b", "a"]] + categories = { + "col_1": ["d", "a", "c", "b"], + "col_2": ["c", "b", "a"], + "col_3": ["b", "a"], + } # test categories value works when transforming encoder = OrdinalEncoder( @@ -564,7 +586,7 @@ def test_ordinal_encoder_categories(): }, ) encoder = OrdinalEncoder( - categories=[["a"], ["a", "b"]], + categories={"col_1": ["a"], "col_2": ["a", "b"]}, handle_unknown="use_encoded_value", unknown_value=-1, ) From ee81b32ae6b06991bcbbf5d6b7c784366cb5a3fe Mon Sep 17 00:00:00 2001 From: Tamar Grey Date: Tue, 4 Oct 2022 15:27:47 -0500 Subject: [PATCH 20/20] remove commented code --- .../components/transformers/encoders/ordinal_encoder.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/evalml/pipelines/components/transformers/encoders/ordinal_encoder.py b/evalml/pipelines/components/transformers/encoders/ordinal_encoder.py index 425dcab1d7..937dfa0823 100644 --- a/evalml/pipelines/components/transformers/encoders/ordinal_encoder.py +++ b/evalml/pipelines/components/transformers/encoders/ordinal_encoder.py @@ -137,11 +137,6 @@ def fit(self, X, y=None): f"Column {col} specified in features_to_encode is not Ordinal in nature", ) - # Put features_to_encode in the same relative order as the columns in the dataframe - # self.features_to_encode = [ - # col for col in X.columns if col in self.features_to_encode - # ] - ww_logical_types = X.ww.logical_types categories = {} if len(self.features_to_encode) == 0: