diff --git a/docs/source/release_notes.rst b/docs/source/release_notes.rst index f5324d42bf..d009bad43c 100644 --- a/docs/source/release_notes.rst +++ b/docs/source/release_notes.rst @@ -20,6 +20,7 @@ Release Notes * Enhancements * Enhanced Decomposer with ``determine_periodicity`` function to automatically determine periodicity of seasonal target. :pr:`3729` * Enhanced Decomposer with ``set_seasonal_period`` function to set a ``Decomposer`` object's seasonal period automatically. :pr:`3729` + * Added ``OrdinalEncoder`` component :pr:`3736` * Fixes * Fixed holdout warning message showing when using default parameters :pr:`3727` * Fixed bug in Oversampler where categorical dtypes would fail :pr:`3732` diff --git a/evalml/pipelines/__init__.py b/evalml/pipelines/__init__.py index 7c8e148502..4a5b71f102 100644 --- a/evalml/pipelines/__init__.py +++ b/evalml/pipelines/__init__.py @@ -43,6 +43,7 @@ VowpalWabbitRegressor, DropNaNRowsTransformer, TimeSeriesRegularizer, + OrdinalEncoder, ) from evalml.pipelines.component_graph import ComponentGraph diff --git a/evalml/pipelines/components/__init__.py b/evalml/pipelines/components/__init__.py index 22d3b82179..66225776a5 100644 --- a/evalml/pipelines/components/__init__.py +++ b/evalml/pipelines/components/__init__.py @@ -66,6 +66,7 @@ ReplaceNullableTypes, DropNaNRowsTransformer, TimeSeriesRegularizer, + OrdinalEncoder, ) from evalml.pipelines.components.ensemble import ( StackedEnsembleClassifier, diff --git a/evalml/pipelines/components/transformers/__init__.py b/evalml/pipelines/components/transformers/__init__.py index 3c9f15285d..8f97550741 100644 --- a/evalml/pipelines/components/transformers/__init__.py +++ b/evalml/pipelines/components/transformers/__init__.py @@ -4,6 +4,7 @@ OneHotEncoder, TargetEncoder, LabelEncoder, + OrdinalEncoder, ) from evalml.pipelines.components.transformers.feature_selection import ( FeatureSelector, diff --git a/evalml/pipelines/components/transformers/encoders/__init__.py 
b/evalml/pipelines/components/transformers/encoders/__init__.py index 03c988ce22..7584b45ce9 100644 --- a/evalml/pipelines/components/transformers/encoders/__init__.py +++ b/evalml/pipelines/components/transformers/encoders/__init__.py @@ -6,3 +6,6 @@ TargetEncoder, ) from evalml.pipelines.components.transformers.encoders.label_encoder import LabelEncoder +from evalml.pipelines.components.transformers.encoders.ordinal_encoder import ( + OrdinalEncoder, +) diff --git a/evalml/pipelines/components/transformers/encoders/ordinal_encoder.py b/evalml/pipelines/components/transformers/encoders/ordinal_encoder.py new file mode 100644 index 0000000000..937dfa0823 --- /dev/null +++ b/evalml/pipelines/components/transformers/encoders/ordinal_encoder.py @@ -0,0 +1,286 @@ +"""A transformer that encodes ordinal features as an array of ordinal integers representing the relative order of categories.""" +import numpy as np +import pandas as pd +import woodwork as ww +from sklearn.preprocessing import OrdinalEncoder as SKOrdinalEncoder +from woodwork.logical_types import Ordinal + +from evalml.pipelines.components import ComponentBaseMeta +from evalml.pipelines.components.transformers.transformer import Transformer +from evalml.utils import infer_feature_types + +"""A transformer that encodes ordinal features.""" + + +class OrdinalEncoderMeta(ComponentBaseMeta): + """A version of the ComponentBaseMeta class which includes validation on an additional ordinal-encoder-specific method `categories`.""" + + METHODS_TO_CHECK = ComponentBaseMeta.METHODS_TO_CHECK + [ + "categories", + "get_feature_names", + ] + + +class OrdinalEncoder(Transformer, metaclass=OrdinalEncoderMeta): + """A transformer that encodes ordinal features as an array of ordinal integers representing the relative order of categories. + + Args: + features_to_encode (list[str]): List of columns to encode. All other columns will remain untouched. + If None, all appropriate columns will be encoded. Defaults to None. 
The order of columns does not matter. + categories (dict[str, list[str]]): A dictionary mapping column names to their categories + in the dataframes passed in at fit and transform. The order of categories specified for a column does not matter. + Any category found in the data that is not present in categories will be handled as an unknown value. + To not have unknown values raise an error, set handle_unknown to "use_encoded_value". + Defaults to None. + handle_unknown ("error" or "use_encoded_value"): Whether to ignore or error for unknown categories + for a feature encountered during `fit` or `transform`. When set to "error", + an error will be raised when an unknown category is found. + When set to "use_encoded_value", unknown categories will be encoded as the value given + for the parameter unknown_value. Defaults to "error." + unknown_value (int or np.nan): The value to use for unknown categories seen during fit or transform. + Required when the parameter handle_unknown is set to "use_encoded_value." + The value has to be distinct from the values used to encode any of the categories in fit. + Defaults to None. + encoded_missing_value (int or np.nan): The value to use for missing (null) values seen during + fit or transform. Defaults to np.nan. + random_seed (int): Seed for the random number generator. Defaults to 0. 
+ """ + + name = "Ordinal Encoder" + hyperparameter_ranges = {} + """{}""" + + def __init__( + self, + features_to_encode=None, + categories=None, + handle_unknown="error", + unknown_value=None, + encoded_missing_value=None, + random_seed=0, + **kwargs, + ): + parameters = { + "features_to_encode": features_to_encode, + "categories": categories, + "handle_unknown": handle_unknown, + "unknown_value": unknown_value, + "encoded_missing_value": encoded_missing_value, + } + parameters.update(kwargs) + + # Check correct inputs + unknown_input_options = ["use_encoded_value", "error"] + if handle_unknown not in unknown_input_options: + raise ValueError( + "Invalid input {} for handle_unknown".format(handle_unknown), + ) + if handle_unknown == "use_encoded_value" and unknown_value is None: + raise ValueError( + "To use encoded value for unknown categories, unknown_value must" + "be specified as either np.nan or as an int that is distinct from" + "the other encoded categories ", + ) + + self.features_to_encode = features_to_encode + self._component_obj = None + + super().__init__( + parameters=parameters, + component_obj=None, + random_seed=random_seed, + ) + self._initial_state = self.random_seed + self._provenance = {} + + @staticmethod + def _get_ordinal_cols(X): + """Get names of ordinal columns in the input DataFrame.""" + return list(X.ww.select(include=["ordinal"], return_schema=True).columns) + + def fit(self, X, y=None): + """Fits the ordinal encoder component. + + Args: + X (pd.DataFrame): The input training data of shape [n_samples, n_features]. + y (pd.Series, optional): The target training data of length [n_samples]. + + Returns: + self + + Raises: + ValueError: If encoding a column failed. + TypeError: If non-Ordinal columns are specified in features_to_encode. 
+ """ + # Ordinal type is not inferred by Woodwork, so if it wasn't set before, it won't be set at init + X = infer_feature_types(X) + if self.features_to_encode is None: + self.features_to_encode = self._get_ordinal_cols(X) + else: + # When features_to_encode is user-specified, check that all columns are present + # and have the Ordinal logical type + not_present_features = [ + col for col in self.features_to_encode if col not in list(X.columns) + ] + if len(not_present_features) > 0: + raise ValueError( + "Could not find and encode {} in input data.".format( + ", ".join(not_present_features), + ), + ) + + logical_types = X.ww.logical_types + for col in self.features_to_encode: + ltype = logical_types[col] + if not isinstance(ltype, Ordinal): + raise TypeError( + f"Column {col} specified in features_to_encode is not Ordinal in nature", + ) + + ww_logical_types = X.ww.logical_types + categories = {} + if len(self.features_to_encode) == 0: + # No ordinal features present - no transformation can take place so return early + return self + elif self.parameters["categories"] is not None: + # Categories specified - make sure they match the ordinal columns + input_categories = self.parameters["categories"] + + if len(input_categories) != len(self.features_to_encode): + raise ValueError( + "Categories argument must contain as many elements as there are features to encode.", + ) + + if not all(isinstance(cats, list) for cats in input_categories.values()): + raise ValueError( + "Each of the values in the categories argument must be a list.", + ) + # Categories, as they're passed into SKOrdinalEncoder should be in the same order + # as the data's Ordinal.order categories even if it's a subset + for col_name in self.features_to_encode: + col_categories = input_categories[col_name] + categories_order = ww_logical_types[col_name].order + + ordered_categories = [ + cat for cat in categories_order if cat in col_categories + ] + categories[col_name] = ordered_categories + else: + # 
Categories unspecified - use ordered categories from a columns' Ordinal logical type + for col_name in self.features_to_encode: + ltype = ww_logical_types[col_name] + # Copy the order list, since we might mutate it later by adding nans + # and don't want to impact the Woodwork types + categories[col_name] = ltype.order.copy() + + # Add any null values into the categories lists so that they aren't treated as unknown values + # This is needed because Ordinal.order won't indicate if nulls are present, and SKOrdinalEncoder + # requires any null values be present in the categories list if they are to be encoded as + # missing values + for col_name in self.features_to_encode: + if X[col_name].isna().any(): + categories[col_name].append(np.nan) + + # sklearn needs categories to be a list in the order of the columns in features_to_encode + categories_for_sk_encoder = [ + categories[col_name] for col_name in self.features_to_encode + ] + + encoded_missing_value = self.parameters["encoded_missing_value"] + if encoded_missing_value is None: + encoded_missing_value = np.nan + + self._component_obj = SKOrdinalEncoder( + categories=categories_for_sk_encoder, + handle_unknown=self.parameters["handle_unknown"], + unknown_value=self.parameters["unknown_value"], + encoded_missing_value=encoded_missing_value, + ) + + self._component_obj.fit(X[self.features_to_encode]) + return self + + def transform(self, X, y=None): + """Ordinally encode the input data. + + Args: + X (pd.DataFrame): Features to encode. + y (pd.Series): Ignored. + + Returns: + pd.DataFrame: Transformed data, where each ordinal feature has been encoded into + a numerical column where ordinal integers represent + the relative order of categories. 
+ """ + X = infer_feature_types(X) + + if not self.features_to_encode: + # If there are no features to encode, X needs no transformation + return X + + X_orig = X.ww.drop(columns=self.features_to_encode) + + # Call sklearn's transform on only the ordinal columns + X_t = pd.DataFrame( + self._component_obj.transform(X[self.features_to_encode]), + index=X.index, + ) + X_t.columns = self._get_feature_names() + X_t.ww.init(logical_types={c: "Double" for c in X_t.columns}) + self._feature_names = X_t.columns + + X_t = ww.utils.concat_columns([X_orig, X_t]) + + return X_t + + def _get_feature_names(self): + """Return feature names for the ordinal features after fitting. + + Since ordinal encoding creates one encoded feature per column in features_to_encode, feature + names are formatted as {column_name}_ordinal_encoding + + Returns: + np.ndarray: The feature names after encoding, provided in the same order as input_features. + """ + self._features_to_drop = [] + unique_names = [] + provenance = {} + for col_name in self.features_to_encode: + encoded_name = f"{col_name}_ordinal_encoding" + unique_names.append(encoded_name) + provenance[col_name] = [encoded_name] + self._provenance = provenance + return unique_names + + def categories(self, feature_name): + """Returns a list of the unique categories to be encoded for the particular feature, in order. + + Args: + feature_name (str): The name of any feature provided to ordinal encoder during fit. + + Returns: + np.ndarray: The unique categories, in the same dtype as they were provided during fit. + + Raises: + ValueError: If feature was not provided to ordinal encoder as a training feature. 
+        """
+        try:
+            index = self.features_to_encode.index(feature_name)
+        except ValueError:
+            raise ValueError(
+                f'Feature "{feature_name}" was not provided to ordinal encoder as a training feature',
+            ) from None
+        return self._component_obj.categories_[index]
+
+    def get_feature_names(self):
+        """Return feature names for the ordinal features after fitting.
+
+        Feature names are formatted as {column name}_ordinal_encoding.
+
+        Returns:
+            list[str]: The feature names after encoding, provided in the same order as features_to_encode.
+        """
+        return self._get_feature_names()
+
+    def _get_feature_provenance(self):
+        return self._provenance
diff --git a/evalml/tests/component_tests/test_ordinal_encoder.py b/evalml/tests/component_tests/test_ordinal_encoder.py
new file mode 100644
index 0000000000..780d60bad7
--- /dev/null
+++ b/evalml/tests/component_tests/test_ordinal_encoder.py
@@ -0,0 +1,766 @@
+import numpy as np
+import pandas as pd
+import pytest
+from woodwork.logical_types import Ordinal
+
+from evalml.exceptions import ComponentNotYetFittedError
+from evalml.pipelines.components import OrdinalEncoder
+
+
+def set_first_three_columns_to_ordinal_with_categories(X, categories):
+    X.ww.init(
+        logical_types={
+            "col_1": Ordinal(order=categories["col_1"]),
+            "col_2": Ordinal(order=categories["col_2"]),
+            "col_3": Ordinal(order=categories["col_3"]),
+        },
+    )
+    return X
+
+
+def test_init():
+    parameters = {
+        "features_to_encode": None,
+        "categories": None,
+        "handle_unknown": "error",
+        "unknown_value": None,
+        "encoded_missing_value": None,
+    }
+    encoder = OrdinalEncoder()
+    assert encoder.parameters == parameters
+
+
+def test_parameters():
+    encoder = OrdinalEncoder(encoded_missing_value=-1)
+    expected_parameters = {
+        "features_to_encode": None,
+        "categories": None,
+        "handle_unknown": "error",
+        "unknown_value": None,
+        "encoded_missing_value": -1,
+    }
+    assert encoder.parameters == expected_parameters
+
+
+def test_invalid_inputs():
+    error_msg = "Invalid input {} for
handle_unknown".format("bananas") + with pytest.raises(ValueError, match=error_msg): + encoder = OrdinalEncoder(handle_unknown="bananas") + + error_msg = ( + "To use encoded value for unknown categories, unknown_value must" + "be specified as either np.nan or as an int that is distinct from" + "the other encoded categories " + ) + with pytest.raises(ValueError, match=error_msg): + encoder = OrdinalEncoder(handle_unknown="use_encoded_value") + + X = pd.DataFrame( + { + "col_1": ["a", "b", "c", "d", "a"], + "col_2": ["a", "b", "a", "c", "b"], + "col_3": ["a", "a", "a", "a", "a"], + }, + ) + X = set_first_three_columns_to_ordinal_with_categories( + X, + {"col_1": ["a", "b", "c", "d"], "col_2": ["a", "b", "c"], "col_3": ["a"]}, + ) + encoder = OrdinalEncoder(categories={"col_1": ["a", "b"], "col_2": ["a", "c"]}) + error_msg = "Categories argument must contain as many elements as there are features to encode." + with pytest.raises(ValueError, match=error_msg): + encoder.fit(X) + + encoder = OrdinalEncoder( + categories={"col_1": 1, "col_2": ["a", "c"], "col_3": ["a", "b"]}, + ) + error_msg = "Each of the values in the categories argument must be a list." + with pytest.raises(ValueError, match=error_msg): + encoder.fit(X) + + +def test_categories_list_not_passed_in_for_non_ordinal_column(): + """We indicate that the categories argument must contain categories only for each ordinal + feature, so test the case where we pass in a categories list when not every column is ordinal. 
+    """
+    X = pd.DataFrame(
+        {
+            "col_1": [2, 0, 1, 0, 0],
+            "col_2": ["a", "b", "a", "c", "d"],
+            "col_3": ["x", "x", "x", "y", "y"],
+        },
+    )
+    X.ww.init(logical_types={"col_2": Ordinal(order=["a", "b", "c", "d"])})
+
+    encoder = OrdinalEncoder(categories={"col_2": ["a", "b", "c", "d"]})
+    encoder.fit(X)
+
+    assert len(encoder._component_obj.categories_) == len(encoder.features_to_encode)
+
+    error = 'Feature "col_1" was not provided to ordinal encoder as a training feature'
+    with pytest.raises(ValueError, match=error):
+        encoder.categories("col_1")
+
+    # When features_to_encode is passed in, confirm the order there doesn't matter
+    # in indexing into categories_
+    X.ww.init(
+        logical_types={
+            "col_2": Ordinal(order=["a", "b", "c", "d"]),
+            "col_3": Ordinal(order=["x", "y"]),
+        },
+    )
+    encoder = OrdinalEncoder(
+        # features_to_encode passed in different order than the dataframe's cols
+        features_to_encode=["col_3", "col_2"],
+        categories={"col_2": ["a", "b", "c", "d"], "col_3": ["x", "y"]},
+    )
+    encoder.fit(X)
+
+    assert len(encoder._component_obj.categories_) == len(encoder.features_to_encode)
+    assert set(encoder.categories("col_2")) == {"a", "b", "c", "d"}
+    assert set(encoder.categories("col_3")) == {"x", "y"}
+
+    X_t = encoder.transform(X)
+    assert list(X_t["col_2_ordinal_encoding"]) == [0, 1, 0, 2, 3]
+    assert list(X_t["col_3_ordinal_encoding"]) == [0, 0, 0, 1, 1]
+
+
+def test_features_to_encode_non_ordinal_cols():
+    X = pd.DataFrame({"col_1": [2, 0, 1, 0, 0], "col_2": ["a", "b", "a", "c", "d"]})
+    X.ww.init(logical_types={"col_2": Ordinal(order=["a", "b", "c", "d"])})
+
+    encoder = OrdinalEncoder(features_to_encode=["col_1"])
+    error = "Column col_1 specified in features_to_encode is not Ordinal in nature"
+    with pytest.raises(TypeError, match=error):
+        encoder.fit(X)
+
+
+def test_categories_specified_not_present_in_data():
+    """Make sure that we can handle categories during fit that aren't present in
+    the data so that they can be seen during transform.
Note that because we fit on the
+    Ordinal.order passed in at fit, that order is the source of truth for
+    potential categories available at transform. In this test, that means that "x",
+    though not in the data at fit, must be in the order in order to not be viewed
+    as an unknown value at transform.
+    """
+    X = pd.DataFrame({"col_1": ["a", "b", "a", "c", "d"]})
+    X.ww.init(logical_types={"col_1": Ordinal(order=["a", "b", "c", "d", "x"])})
+
+    encoder = OrdinalEncoder(
+        categories={"col_1": ["a", "x"]},
+        handle_unknown="use_encoded_value",
+        unknown_value=-1,
+    )
+    encoder.fit(X)
+    assert set(encoder.categories("col_1")) == {"a", "x"}
+
+    X_2 = pd.DataFrame({"col_1": ["a", "b", "a", "c", "x"]})
+    X_2.ww.init(logical_types={"col_1": Ordinal(order=["a", "b", "c", "d", "x"])})
+    X_t = encoder.transform(X_2)
+    assert list(X_t["col_1_ordinal_encoding"]) == [0, -1, 0, -1, 1]
+
+
+def test_ordinal_encoder_is_no_op_for_df_of_non_ordinal_features():
+    encoder = OrdinalEncoder()
+    X = pd.DataFrame(
+        {
+            "col_1": [1.2, 3.2, None, 4.7],
+            "col_2": [4.5, 8.9, 11.2, 23.4],
+            "col_3": [True, False, True, True],
+            "col_4": [
+                "a",
+                "b",
+                "a",
+                "c",
+            ],
+        },
+    )
+    X.ww.init(
+        logical_types={
+            "col_1": "Double",
+            "col_2": "Integer",
+            "col_3": "Boolean",
+            "col_4": "Categorical",
+        },
+    )
+    X_t = encoder.fit_transform(X)
+    pd.testing.assert_frame_equal(X_t, X)
+
+
+def test_ordinal_encoder_recognizes_ordinal_columns():
+    X = pd.DataFrame(
+        {
+            "col_1": ["a", "b", "c", "d", "a"],
+            "col_2": ["a", "b", "b", "c", "b"],
+            "col_3": ["a", "a", "a", "a", "a"],
+            "col_4": [1, 2, 3, 4, 5],
+        },
+    )
+    encoder = OrdinalEncoder()
+    encoder.fit(X)
+    assert not encoder.features_to_encode
+
+    categories = {
+        "col_1": ["a", "b", "c", "d"],
+        "col_2": ["a", "b", "c"],
+        "col_3": ["a"],
+    }
+    X = set_first_three_columns_to_ordinal_with_categories(X, categories=categories)
+
+    encoder = OrdinalEncoder()
+    encoder.fit(X)
+    assert encoder.features_to_encode ==
["col_1", "col_2", "col_3"] + assert encoder.features_to_encode == list(encoder._component_obj.feature_names_in_) + + encoder = OrdinalEncoder(features_to_encode=["col_1"]) + encoder.fit(X) + assert encoder.features_to_encode == ["col_1"] + assert encoder.features_to_encode == list(encoder._component_obj.feature_names_in_) + expected_categories = {"col_1": categories["col_1"]} + for i, category_list in enumerate(encoder._component_obj.categories_): + assert list(category_list) == expected_categories[f"col_{i + 1}"] + + +def test_ordinal_encoder_categories_set_correctly_from_fit(): + # The SKOrdinalEncoder.categories_ attribute is what determines what gets encoded + # So we're checking how that gets set during fit + X = pd.DataFrame( + { + "col_1": ["a", "b", "c", "d", "a"], + "col_2": ["a", "b", "b", "c", "b"], + "col_3": ["a", "a", "a", "a", "a"], + "col_4": [1, 2, 3, 4, 5], + }, + ) + categories = { + "col_1": ["a", "b", "c", "d"], + "col_2": ["a", "b", "c"], + "col_3": ["a"], + } + X = set_first_three_columns_to_ordinal_with_categories(X, categories=categories) + + # No parameters specified + encoder = OrdinalEncoder() + encoder.fit(X) + for i, category_list in enumerate(encoder._component_obj.categories_): + assert list(category_list) == categories[f"col_{i + 1}"] + + # Categories set at init explicitly - means we have to handle the unknown case + subset_categories = {"col_1": ["a"], "col_2": ["a"], "col_3": ["a"]} + encoder = OrdinalEncoder( + categories=subset_categories, + handle_unknown="use_encoded_value", + unknown_value=-1, + ) + encoder.fit(X) + for i, category_list in enumerate(encoder._component_obj.categories_): + assert list(category_list) == subset_categories[f"col_{i + 1}"] + + +def test_ordinal_encoder_transform(): + X = pd.DataFrame( + { + "col_1": ["a", "b", "c", "d", "d"], + "col_2": ["a", "b", "a", "c", "b"], + "col_3": ["a", "a", "a", "a", "a"], + }, + ) + X.ww.init( + logical_types={ + "col_1": Ordinal(order=["a", "b", "c", "d"]), + # Order 
is not alphabetical
+            "col_2": Ordinal(order=["c", "b", "a"]),
+            "col_3": "categorical",
+        },
+    )
+    encoder = OrdinalEncoder()
+    encoder.fit(X)
+    X_t = encoder.transform(X)
+    assert set(X_t.columns) == {
+        "col_1_ordinal_encoding",
+        "col_2_ordinal_encoding",
+        "col_3",
+    }
+    pd.testing.assert_series_equal(
+        X_t["col_1_ordinal_encoding"],
+        pd.Series([0, 1, 2, 3, 3], name="col_1_ordinal_encoding", dtype="float64"),
+    )
+    pd.testing.assert_series_equal(
+        X_t["col_2_ordinal_encoding"],
+        pd.Series([2, 1, 2, 0, 1], name="col_2_ordinal_encoding", dtype="float64"),
+    )
+
+
+def test_null_values_in_dataframe():
+    X = pd.DataFrame(
+        {
+            "col_1": ["a", "b", "c", "d", np.nan],
+            "col_2": [
+                "a",
+                "b",
+                "a",
+                "c",
+                "c",
+            ],
+            "col_3": ["a", "a", "a", "a", "a"],
+        },
+    )
+    # Note - we can't include the null value in the categories used by Woodwork
+    # because it sets the pandas dtypes' categories and they can't include a null value
+    categories = [["a", "b", "c", "d"], ["a", "b", "c"], ["a"]]
+    X.ww.init(
+        logical_types={
+            "col_1": Ordinal(order=categories[0]),
+            "col_2": Ordinal(order=categories[1]),
+        },
+    )
+
+    # With no args set, nan doesn't get encoded into any value
+    encoder = OrdinalEncoder()
+    encoder.fit(X)
+    X_t = encoder.transform(X)
+    assert pd.isna(X_t["col_1_ordinal_encoding"].iloc[-1])
+
+    # If encoded_missing_value is set, the nan will be encoded as that value
+    encoder = OrdinalEncoder(encoded_missing_value=-1)
+    encoder.fit(X)
+    X_t = encoder.transform(X)
+    assert X_t["col_1_ordinal_encoding"].iloc[-1] == -1
+
+
+def test_ordinal_encoder_diff_na_types():
+    X = pd.DataFrame(
+        {
+            "col_1": ["a", "b", "c", "d", np.nan],
+            "col_2": ["a", "b", "a", "c", None],
+            "col_3": ["a", "a", "a", "a", pd.NA],
+        },
+    )
+    categories = [["a", "b", "c", "d"], ["a", "b", "c"], ["a"]]
+    X.ww.init(
+        logical_types={
+            "col_1": Ordinal(order=categories[0]),
+            "col_2": Ordinal(order=categories[1]),
+            "col_3":
Ordinal(order=categories[2]), + }, + ) + encoder = OrdinalEncoder(encoded_missing_value=-1) + encoder.fit(X) + X_t = encoder.transform(X) + # Confirm were recognized as null and encoded + assert X_t["col_1_ordinal_encoding"].iloc[-1] == -1 + assert X_t["col_2_ordinal_encoding"].iloc[-1] == -1 + assert X_t["col_3_ordinal_encoding"].iloc[-1] == -1 + + +def test_null_values_with_categories_specified(): + """Nans aren't treated by Woodwork as categories in ordinal cols, so they shouldn't + have an impact on the categories parameter and be handled entirely independently.""" + X = pd.DataFrame( + { + "col_1": ["a", "b", "c", np.nan, np.nan], + "col_2": [ + "a", + "b", + "a", + "c", + np.nan, + ], + "col_3": ["a", "a", "a", "a", "a"], + }, + ) + # Note - we cant include the null value in the categories used by Woodwork + # because it sets the pandas dtypes' categories and they can't include a null value + categories = [["a", "b", "c", "d"], ["a", "b", "c"]] + X.ww.init( + logical_types={ + "col_1": Ordinal(order=categories[0]), + "col_2": Ordinal(order=categories[1]), + }, + ) + + # Try putting a nan in the categories list in one of the columns but not the other + encoder = OrdinalEncoder( + categories={"col_1": ["a"], "col_2": ["a", np.nan]}, + handle_unknown="use_encoded_value", + unknown_value=-1, + ) + encoder.fit(X) + X_t = encoder.transform(X) + # Check that the null values were handled as missing even when they're present in categories + assert pd.isna(X_t["col_1_ordinal_encoding"].iloc[-1]) + assert pd.isna(X_t["col_1_ordinal_encoding"].iloc[-2]) + assert pd.isna(X_t["col_2_ordinal_encoding"].iloc[-1]) + + +def test_handle_unknown(): + X = pd.DataFrame( + { + "col_1": ["a", "b", "c", "d", "e", "f", "g"], + "col_2": ["a", "c", "d", "b", "e", "e", "f"], + "col_3": ["a", "a", "a", "a", "a", "a", "b"], + "col_4": [2, 0, 1, 3, 0, 1, 2], + }, + ) + categories = { + "col_1": ["a", "b", "c", "d", "e", "f", "g"], + "col_2": ["a", "b", "c", "d", "e", "f"], + "col_3": ["a", 
"b"], + } + X = set_first_three_columns_to_ordinal_with_categories(X, categories=categories) + + encoder = OrdinalEncoder(handle_unknown="error") + encoder.fit(X) + assert isinstance(encoder.transform(X), pd.DataFrame) + + X = pd.DataFrame( + { + "col_1": ["x", "b", "c", "d", "e", "f", "g"], + "col_2": ["a", "c", "d", "b", "e", "e", "f"], + "col_3": ["a", "a", "a", "a", "a", "a", "b"], + "col_4": [2, 0, 1, 3, 0, 1, 2], + }, + ) + categories = { + "col_1": ["x", "b", "c", "d", "e", "f", "g"], + "col_2": ["a", "b", "c", "d", "e", "f"], + "col_3": ["a", "b"], + } + X = set_first_three_columns_to_ordinal_with_categories(X, categories=categories) + with pytest.raises(ValueError) as exec_info: + # Using the encoder that was fit on data without x + encoder.transform(X) + assert "Found unknown categories" in exec_info.value.args[0] + + +def test_categories_set_at_init(): + X = pd.DataFrame( + { + "col_1": ["a", "b", "c", "d", "e", "f", "g"], + "col_2": ["a", "c", "d", "b", "e", "e", "f"], + "col_3": ["a", "a", "a", "a", "a", "a", "b"], + "col_4": [2, 0, 1, 3, 0, 1, 2], + }, + ) + + full_categories = { + "col_1": ["a", "b", "c", "d", "e", "f", "g"], + "col_2": ["a", "b", "c", "d", "e", "f"], + "col_3": ["a", "b"], + } + X = set_first_three_columns_to_ordinal_with_categories( + X, + categories=full_categories, + ) + + categories = { + "col_1": ["a", "b", "c", "d"], + "col_2": ["a", "b", "c"], + "col_3": ["a", "b"], + } + + # test categories value works when transforming + encoder = OrdinalEncoder( + categories=categories, + handle_unknown="use_encoded_value", + unknown_value=-1, + random_seed=2, + ) + encoder.fit(X) + X_t = encoder.transform(X) + + assert list(X_t["col_1_ordinal_encoding"]) == [0, 1, 2, 3, -1, -1, -1] + assert list(X_t["col_2_ordinal_encoding"]) == [0, 2, -1, 1, -1, -1, -1] + assert list(X_t["col_3_ordinal_encoding"]) == [0, 0, 0, 0, 0, 0, 1] + + +def test_categories_includes_not_present_value(): + """This tests the case where the categories we pass into the 
encoder include + values that aren't in the data or even the Ordinal.order.""" + X = pd.DataFrame( + { + "col_1": ["a", "b", "c", "d", "e", "f", "g"], + "col_2": ["a", "c", "d", "b", "e", "e", "f"], + "col_3": ["a", "a", "a", "a", "a", "a", "b"], + "col_4": [2, 0, 1, 3, 0, 1, 2], + }, + ) + full_categories = { + "col_1": ["a", "b", "c", "d", "e", "f", "g"], + "col_2": ["a", "b", "c", "d", "e", "f"], + "col_3": ["a", "b"], + } + X = set_first_three_columns_to_ordinal_with_categories( + X, + categories=full_categories, + ) + + # Categories passed in has value "x" that's not in the data + categories = {"col_1": ["a", "x"], "col_2": ["a", "x"], "col_3": ["a", "x"]} + + # test categories value works when transforming + encoder = OrdinalEncoder( + categories=categories, + handle_unknown="use_encoded_value", + unknown_value=-1, + random_seed=2, + ) + encoder.fit(X) + assert set(encoder.categories("col_1")) == {"a"} + assert set(encoder.categories("col_2")) == {"a"} + assert set(encoder.categories("col_3")) == {"a"} + + +def test_categories_different_order_from_ltype(): + # The order of categories comes from the Ordinal.order property of the data. + # Categories passed in as input to the encoder just determine what subset should + # be used. 
+ X = pd.DataFrame( + { + "col_1": ["a", "b", "c", "d", "e", "f", "g"], + "col_2": ["a", "c", "d", "b", "e", "e", "f"], + "col_3": ["a", "a", "a", "a", "a", "a", "b"], + "col_4": [2, 0, 1, 3, 0, 1, 2], + }, + ) + full_categories = { + "col_1": ["a", "b", "c", "d", "e", "f", "g"], + "col_2": ["a", "b", "c", "d", "e", "f"], + "col_3": ["a", "b"], + } + X = set_first_three_columns_to_ordinal_with_categories( + X, + categories=full_categories, + ) + + # The order doesn't match the full categories above but outputted data will still match above + categories = { + "col_1": ["d", "a", "c", "b"], + "col_2": ["c", "b", "a"], + "col_3": ["b", "a"], + } + + # test categories value works when transforming + encoder = OrdinalEncoder( + categories=categories, + handle_unknown="use_encoded_value", + unknown_value=-1, + random_seed=2, + ) + encoder.fit(X) + X_t = encoder.transform(X) + + assert list(X_t["col_1_ordinal_encoding"]) == [0, 1, 2, 3, -1, -1, -1] + assert list(X_t["col_2_ordinal_encoding"]) == [0, 2, -1, 1, -1, -1, -1] + assert list(X_t["col_3_ordinal_encoding"]) == [0, 0, 0, 0, 0, 0, 1] + + +@pytest.mark.parametrize( + "index", + [ + list(range(-5, 0)), + list(range(100, 105)), + [f"row_{i}" for i in range(5)], + pd.date_range("2020-09-08", periods=5), + ], +) +def test_ordinal_encoder_preserves_custom_index(index): + df = pd.DataFrame( + {"categories": [f"cat_{i}" for i in range(5)], "numbers": np.arange(5)}, + index=index, + ) + encoder = OrdinalEncoder() + new_df = encoder.fit_transform(df) + pd.testing.assert_index_equal(new_df.index, df.index) + assert not new_df.isna().any(axis=None) + + +def test_ordinal_encoder_categories(): + X = pd.DataFrame( + {"col_1": ["a"] * 10, "col_2": ["a"] * 3 + ["b"] * 3 + ["c"] * 2 + ["d"] * 2}, + ) + X.ww.init( + logical_types={ + "col_1": Ordinal(order=["a"]), + "col_2": Ordinal(order=["a", "b", "c", "d"]), + }, + ) + encoder = OrdinalEncoder( + categories={"col_1": ["a"], "col_2": ["a", "b"]}, + 
handle_unknown="use_encoded_value", + unknown_value=-1, + ) + with pytest.raises( + ComponentNotYetFittedError, + match="This OrdinalEncoder is not fitted yet. You must fit OrdinalEncoder before calling categories.", + ): + encoder.categories("col_1") + + encoder.fit(X) + np.testing.assert_array_equal(encoder.categories("col_1"), np.array(["a"])) + np.testing.assert_array_equal(encoder.categories("col_2"), np.array(["a", "b"])) + with pytest.raises( + ValueError, + match='Feature "col_12345" was not provided to ordinal encoder as a training feature', + ): + encoder.categories("col_12345") + + +def test_ordinal_encoder_get_feature_names(): + X = pd.DataFrame( + {"col_1": ["a"] * 10, "col_2": ["a"] * 3 + ["b"] * 3 + ["c"] * 2 + ["d"] * 2}, + ) + X.ww.init( + logical_types={ + "col_1": Ordinal(order=["a"]), + "col_2": Ordinal(order=["a", "b", "c", "d"]), + }, + ) + ordinal_encoder = OrdinalEncoder( + handle_unknown="use_encoded_value", + unknown_value=-1, + ) + with pytest.raises( + ComponentNotYetFittedError, + match="This OrdinalEncoder is not fitted yet. 
def test_ordinal_encoder_features_to_encode():
    """Check that only the columns named in ``features_to_encode`` are encoded.

    A column that is Ordinal but not listed must pass through under its
    original name; every listed column must be replaced by its
    ``<name>_ordinal_encoding`` counterpart.
    """
    X = pd.DataFrame({"col_1": [2, 0, 1, 0, 0], "col_2": ["a", "b", "a", "c", "d"]})
    X.ww.init(
        logical_types={
            "col_1": Ordinal(order=[0, 1, 2]),
            "col_2": Ordinal(order=["a", "b", "c", "d"]),
        },
    )

    # Only col_1 requested: col_2 stays untouched.
    encoder = OrdinalEncoder(features_to_encode=["col_1"])
    encoder.fit(X)
    X_t = encoder.transform(X)
    assert set(X_t.columns) == {"col_2", "col_1_ordinal_encoding"}
    # Wrap the dtype check in all(): asserting a bare list passes for any
    # non-empty list. Encoded columns are float64, matching the frame that
    # test_data_types in this module compares against.
    encoded_cols = ["col_1_ordinal_encoding"]
    assert all(X_t[col].dtype == "float64" for col in encoded_cols)

    # Both columns requested: both are replaced by encoded columns.
    encoder = OrdinalEncoder(features_to_encode=["col_1", "col_2"])
    encoder.fit(X)
    X_t = encoder.transform(X)
    encoded_cols = ["col_1_ordinal_encoding", "col_2_ordinal_encoding"]
    assert set(X_t.columns) == set(encoded_cols)
    assert all(X_t[col].dtype == "float64" for col in encoded_cols)


def test_ordinal_encoder_features_to_encode_col_missing():
    """Fitting must raise ``ValueError`` when requested columns are absent from the data."""
    X = pd.DataFrame({"col_1": [2, 0, 1, 0, 0], "col_2": ["a", "b", "a", "c", "d"]})
    X.ww.init(
        logical_types={
            "col_1": Ordinal(order=[0, 1, 2]),
            "col_2": Ordinal(order=["a", "b", "c", "d"]),
        },
    )

    encoder = OrdinalEncoder(features_to_encode=["col_3", "col_4"])

    with pytest.raises(ValueError, match="Could not find and encode"):
        encoder.fit(X)


def test_ordinal_encoder_features_to_encode_no_col_names():
    """Check ``features_to_encode`` with integer column labels (no explicit names)."""
    X = pd.DataFrame([["b", 0], ["a", 1], ["b", 1]])
    X.ww.init(
        logical_types={
            0: Ordinal(order=["b", "a"]),
            1: Ordinal(order=[0, 1]),
        },
    )
    encoder = OrdinalEncoder(features_to_encode=[0])
    encoder.fit(X)
    X_t = encoder.transform(X)

    # Column 1 keeps its integer label; column 0 is renamed with the suffix.
    assert set(X_t.columns) == {1, "0_ordinal_encoding"}
    # all() makes the dtype check effective (a bare list is always truthy).
    encoded_cols = ["0_ordinal_encoding"]
    assert all(X_t[col].dtype == "float64" for col in encoded_cols)


def test_ordinal_encoder_output_doubles():
    """Check the Woodwork logical types of the transformed frame.

    The integer and boolean inputs must keep their Integer / Boolean types;
    every other output column, including the encoded one, must be Double.
    The transform must not change the number of columns.
    """
    X = pd.DataFrame(
        {
            "bool": [bool(i % 2) for i in range(100)],
            "categorical": ["dog"] * 20 + ["cat"] * 40 + ["fish"] * 40,
            "integers": list(range(100)),
            "doubles": [i * 1.0 for i in range(100)],
        },
    )
    X.ww.init(
        logical_types={
            "categorical": Ordinal(order=["dog", "cat", "fish"]),
        },
    )
    y = pd.Series([i % 2 for i in range(100)])
    y.ww.init()

    ordinal_encoder = OrdinalEncoder()
    output = ordinal_encoder.fit_transform(X, y)

    for name, types in output.ww.types["Logical Type"].items():
        if name == "integers":
            assert str(types) == "Integer"
        elif name == "bool":
            assert str(types) == "Boolean"
        else:
            assert str(types) == "Double"
    assert len(output.columns) == len(X.columns)
+ assert list(X_t.columns) == ["0_ordinal_encoding"] + expected_df = pd.DataFrame( + [[0], [1], [2]] * 5, + columns=["0_ordinal_encoding"], + dtype="float64", + ) + pd.testing.assert_frame_equal(X_t, expected_df) diff --git a/evalml/tests/component_tests/test_utils.py b/evalml/tests/component_tests/test_utils.py index d8af59e978..aac9e16f20 100644 --- a/evalml/tests/component_tests/test_utils.py +++ b/evalml/tests/component_tests/test_utils.py @@ -87,6 +87,7 @@ "LightGBM Classifier", "LightGBM Regressor", "Oversampler", + "Ordinal Encoder", "Polynomial Decomposer", "Prophet Regressor", "Target Encoder",