diff --git a/evalml/pipelines/components/transformers/encoders/ordinal_encoder.py b/evalml/pipelines/components/transformers/encoders/ordinal_encoder.py index 7de0bb876b..425dcab1d7 100644 --- a/evalml/pipelines/components/transformers/encoders/ordinal_encoder.py +++ b/evalml/pipelines/components/transformers/encoders/ordinal_encoder.py @@ -27,10 +27,9 @@ class OrdinalEncoder(Transformer, metaclass=OrdinalEncoderMeta): Args: features_to_encode (list[str]): List of columns to encode. All other columns will remain untouched. If None, all appropriate columns will be encoded. Defaults to None. The order of columns does not matter. - categories (list[list[str]]): A two dimensional list of categories, where `categories[i]` is a list of the categories - for the column at index `i` in the dataframes passed in at fit and transform. - The order of categories specified for a column does not matter. - Any category not present in categories will be handled as an unknown value. + categories (dict[str, list[str]]): A dictionary mapping column names to their categories + in the dataframes passed in at fit and transform. The order of categories specified for a column does not matter. + Any category found in the data that is not present in categories will be handled as an unknown value. To not have unknown values raise an error, set handle_unknown to "use_encoded_value". Defaults to None. handle_unknown ("error" or "use_encoded_value"): Whether to ignore or error for unknown categories @@ -139,12 +138,12 @@ def fit(self, X, y=None): ) # Put features_to_encode in the same relative order as the columns in the dataframe - self.features_to_encode = [ - col for col in X.columns if col in self.features_to_encode - ] + # self.features_to_encode = [ + # col for col in X.columns if col in self.features_to_encode + # ] ww_logical_types = X.ww.logical_types - categories = [] + categories = {} if len(self.features_to_encode) == 0: # No ordinal features present - no transformation can take place so return early return self @@ -154,44 +153,50 @@ def fit(self, X, y=None): if len(input_categories) != len(self.features_to_encode): raise ValueError( - "Categories argument must contain as many elements as there are Ordinal features.", + "Categories argument must contain as many elements as there are features to encode.", ) - if not all(isinstance(cats, list) for cats in input_categories): + if not all(isinstance(cats, list) for cats in input_categories.values()): raise ValueError( - "Each element of the categories argument must be a list.", + "Each of the values in the categories argument must be a list.", ) # Categories, as they're passed into SKOrdinalEncoder should be in the same order # as the data's Ordinal.order categories even if it's a subset - for i, col_categories in enumerate(input_categories): - categories_order = ww_logical_types[self.features_to_encode[i]].order + for col_name in self.features_to_encode: + col_categories = input_categories[col_name] + categories_order = ww_logical_types[col_name].order ordered_categories = [ cat for cat in categories_order if cat in col_categories ] - categories.append(ordered_categories) + categories[col_name] = ordered_categories else: # Categories unspecified - use ordered categories from a columns' Ordinal logical type - for col in X[self.features_to_encode]: - ltype = ww_logical_types[col] + for col_name in self.features_to_encode: + ltype = ww_logical_types[col_name] # Copy the order list, since we might mutate it later by adding nans # and don't want to impact the Woodwork types - categories.append(ltype.order.copy()) + categories[col_name] = ltype.order.copy() # Add any null values into the categories lists so that they aren't treated as unknown values # This is needed because Ordinal.order won't indicate if nulls are present, and SKOrdinalEncoder # requires any null values be present in the categories list if they are to be encoded as # missing values - for i, col in enumerate(X[self.features_to_encode]): - if X[col].isna().any(): - categories[i] += [np.nan] + for col_name in self.features_to_encode: + if X[col_name].isna().any(): + categories[col_name].append(np.nan) + + # sklearn needs categories to be a list in the order of the columns in features_to_encode + categories_for_sk_encoder = [ + categories[col_name] for col_name in self.features_to_encode + ] encoded_missing_value = self.parameters["encoded_missing_value"] if encoded_missing_value is None: encoded_missing_value = np.nan self._component_obj = SKOrdinalEncoder( - categories=categories, + categories=categories_for_sk_encoder, handle_unknown=self.parameters["handle_unknown"], unknown_value=self.parameters["unknown_value"], encoded_missing_value=encoded_missing_value, diff --git a/evalml/tests/component_tests/test_ordinal_encoder.py b/evalml/tests/component_tests/test_ordinal_encoder.py index 2f52cc7840..780d60bad7 100644 --- a/evalml/tests/component_tests/test_ordinal_encoder.py +++ b/evalml/tests/component_tests/test_ordinal_encoder.py @@ -10,9 +10,9 @@ def set_first_three_columns_to_ordinal_with_categories(X, categories): X.ww.init( logical_types={ - "col_1": Ordinal(order=categories[0]), - "col_2": Ordinal(order=categories[1]), - "col_3": Ordinal(order=categories[2]), + "col_1": Ordinal(order=categories["col_1"]), + "col_2": Ordinal(order=categories["col_2"]), + "col_3": Ordinal(order=categories["col_3"]), }, ) return X @@ -64,15 +64,17 @@ def test_invalid_inputs(): ) X = set_first_three_columns_to_ordinal_with_categories( X, - [["a", "b", "c", "d"], ["a", "b", "c"], ["a"]], + {"col_1": ["a", "b", "c", "d"], "col_2": ["a", "b", "c"], "col_3": ["a"]}, ) - encoder = OrdinalEncoder(categories=[["a", "b"], ["a", "c"]]) - error_msg = "Categories argument must contain as many elements as there are Ordinal features." + encoder = OrdinalEncoder(categories={"col_1": ["a", "b"], "col_2": ["a", "c"]}) + error_msg = "Categories argument must contain as many elements as there are features to encode." with pytest.raises(ValueError, match=error_msg): encoder.fit(X) - encoder = OrdinalEncoder(categories=[["a", "b"], 1, ["a"]]) - error_msg = "Each element of the categories argument must be a list." + encoder = OrdinalEncoder( + categories={"col_1": 1, "col_2": ["a", "c"], "col_3": ["a", "b"]}, + ) + error_msg = "Each of the values in the categories argument must be a list." with pytest.raises(ValueError, match=error_msg): encoder.fit(X) @@ -90,7 +92,7 @@ def test_categories_list_not_passed_in_for_non_ordinal_column(): ) X.ww.init(logical_types={"col_2": Ordinal(order=["a", "b", "c", "d"])}) - encoder = OrdinalEncoder(categories=[["a", "b", "c", "d"]]) + encoder = OrdinalEncoder(categories={"col_2": ["a", "b", "c", "d"]}) encoder.fit(X) assert len(encoder._component_obj.categories_) == len(encoder.features_to_encode) @@ -99,8 +101,8 @@ def test_categories_list_not_passed_in_for_non_ordinal_column(): with pytest.raises(ValueError, match=error): encoder.categories("col_1") - # When features_to_encode is passed in, confirm the order there doesn't matter and order of - # cols in X is still used in indexing into categories + # When features_to_encode is passed in, confirm the order there doesn't matter + # in indexing into categories_ X.ww.init( logical_types={ "col_2": Ordinal(order=["a", "b", "c", "d"]), @@ -110,8 +112,7 @@ def test_categories_list_not_passed_in_for_non_ordinal_column(): encoder = OrdinalEncoder( # features_to_encode passed in different order than the dataframe's cols features_to_encode=["col_3", "col_2"], - # categories' order still matches the dataframe's cols - categories=[["a", "b", "c", "d"], ["x", "y"]], + categories={"col_2": ["a", "b", "c", "d"], "col_3": ["x", "y"]}, ) encoder.fit(X) @@ -119,6 +120,10 @@ def test_categories_list_not_passed_in_for_non_ordinal_column(): set(encoder.categories("col_2")) == {"a", "b", "c", "d"} set(encoder.categories("col_3")) == {"x", "y"} + X_t = encoder.transform(X) + assert list(X_t["col_2_ordinal_encoding"]) == [0, 1, 0, 2, 3] + assert list(X_t["col_3_ordinal_encoding"]) == [0, 0, 0, 1, 1] + def test_features_to_encode_non_ordinal_cols(): X = pd.DataFrame({"col_1": [2, 0, 1, 0, 0], "col_2": ["a", "b", "a", "c", "d"]}) @@ -142,7 +147,7 @@ def test_categories_specified_not_present_in_data(): X.ww.init(logical_types={"col_1": Ordinal(order=["a", "b", "c", "d", "x"])}) encoder = OrdinalEncoder( - categories=[["a", "x"]], + categories={"col_1": ["a", "x"]}, handle_unknown="use_encoded_value", unknown_value=-1, ) @@ -195,7 +200,11 @@ def test_ordinal_encoder_recognizes_ordinal_columns(): encoder.fit(X) assert not encoder.features_to_encode - categories = [["a", "b", "c", "d"], ["a", "b", "c"], ["a"]] + categories = { + "col_1": ["a", "b", "c", "d"], + "col_2": ["a", "b", "c"], + "col_3": ["a"], + } X = set_first_three_columns_to_ordinal_with_categories(X, categories=categories) encoder = OrdinalEncoder() @@ -207,9 +216,9 @@ def test_ordinal_encoder_recognizes_ordinal_columns(): encoder.fit(X) assert encoder.features_to_encode == ["col_1"] assert encoder.features_to_encode == list(encoder._component_obj.feature_names_in_) - expected_categories = [categories[0]] + expected_categories = {"col_1": categories["col_1"]} for i, category_list in enumerate(encoder._component_obj.categories_): - assert list(category_list) == expected_categories[i] + assert list(category_list) == expected_categories[f"col_{i + 1}"] def test_ordinal_encoder_categories_set_correctly_from_fit(): @@ -223,17 +232,21 @@ def test_ordinal_encoder_categories_set_correctly_from_fit(): "col_4": [1, 2, 3, 4, 5], }, ) - categories = [["a", "b", "c", "d"], ["a", "b", "c"], ["a"]] + categories = { + "col_1": ["a", "b", "c", "d"], + "col_2": ["a", "b", "c"], + "col_3": ["a"], + } X = set_first_three_columns_to_ordinal_with_categories(X, categories=categories) # No parameters specified encoder = OrdinalEncoder() encoder.fit(X) for i, category_list in enumerate(encoder._component_obj.categories_): - assert list(category_list) == categories[i] + assert list(category_list) == categories[f"col_{i + 1}"] # Categories set at init explicitly - means we have to handle the unknown case - subset_categories = [["a"], ["a"], ["a"]] + subset_categories = {"col_1": ["a"], "col_2": ["a"], "col_3": ["a"]} encoder = OrdinalEncoder( categories=subset_categories, handle_unknown="use_encoded_value", @@ -241,7 +254,7 @@ def test_ordinal_encoder_categories_set_correctly_from_fit(): ) encoder.fit(X) for i, category_list in enumerate(encoder._component_obj.categories_): - assert list(category_list) == subset_categories[i] + assert list(category_list) == subset_categories[f"col_{i + 1}"] def test_ordinal_encoder_transform(): @@ -368,7 +381,7 @@ def test_null_values_with_categories_specified(): # Try putting a nan in the categories list in one of the columns but not the other encoder = OrdinalEncoder( - categories=[["a"], ["a", np.nan]], + categories={"col_1": ["a"], "col_2": ["a", np.nan]}, handle_unknown="use_encoded_value", unknown_value=-1, ) @@ -389,11 +402,11 @@ def test_handle_unknown(): "col_4": [2, 0, 1, 3, 0, 1, 2], }, ) - categories = [ - ["a", "b", "c", "d", "e", "f", "g"], - ["a", "b", "c", "d", "e", "f"], - ["a", "b"], - ] + categories = { + "col_1": ["a", "b", "c", "d", "e", "f", "g"], + "col_2": ["a", "b", "c", "d", "e", "f"], + "col_3": ["a", "b"], + } X = set_first_three_columns_to_ordinal_with_categories(X, categories=categories) encoder = OrdinalEncoder(handle_unknown="error") @@ -408,11 +421,11 @@ def test_handle_unknown(): "col_4": [2, 0, 1, 3, 0, 1, 2], }, ) - categories = [ - ["x", "b", "c", "d", "e", "f", "g"], - ["a", "b", "c", "d", "e", "f"], - ["a", "b"], - ] + categories = { + "col_1": ["x", "b", "c", "d", "e", "f", "g"], + "col_2": ["a", "b", "c", "d", "e", "f"], + "col_3": ["a", "b"], + } X = set_first_three_columns_to_ordinal_with_categories(X, categories=categories) with pytest.raises(ValueError) as exec_info: # Using the encoder that was fit on data without x @@ -429,17 +442,22 @@ def test_categories_set_at_init(): "col_4": [2, 0, 1, 3, 0, 1, 2], }, ) - full_categories = [ - ["a", "b", "c", "d", "e", "f", "g"], - ["a", "b", "c", "d", "e", "f"], - ["a", "b"], - ] + + full_categories = { + "col_1": ["a", "b", "c", "d", "e", "f", "g"], + "col_2": ["a", "b", "c", "d", "e", "f"], + "col_3": ["a", "b"], + } X = set_first_three_columns_to_ordinal_with_categories( X, categories=full_categories, ) - categories = [["a", "b", "c", "d"], ["a", "b", "c"], ["a", "b"]] + categories = { + "col_1": ["a", "b", "c", "d"], + "col_2": ["a", "b", "c"], + "col_3": ["a", "b"], + } # test categories value works when transforming encoder = OrdinalEncoder( @@ -467,18 +485,18 @@ def test_categories_includes_not_present_value(): "col_4": [2, 0, 1, 3, 0, 1, 2], }, ) - full_categories = [ - ["a", "b", "c", "d", "e", "f", "g"], - ["a", "b", "c", "d", "e", "f"], - ["a", "b"], - ] + full_categories = { + "col_1": ["a", "b", "c", "d", "e", "f", "g"], + "col_2": ["a", "b", "c", "d", "e", "f"], + "col_3": ["a", "b"], + } X = set_first_three_columns_to_ordinal_with_categories( X, categories=full_categories, ) # Categories passed in has value "x" that's not in the data - categories = [["a", "x"], ["a", "x"], ["a", "x"]] + categories = {"col_1": ["a", "x"], "col_2": ["a", "x"], "col_3": ["a", "x"]} # test categories value works when transforming encoder = OrdinalEncoder( @@ -505,18 +523,22 @@ def test_categories_different_order_from_ltype(): "col_4": [2, 0, 1, 3, 0, 1, 2], }, ) - full_categories = [ - ["a", "b", "c", "d", "e", "f", "g"], - ["a", "b", "c", "d", "e", "f"], - ["a", "b"], - ] + full_categories = { + "col_1": ["a", "b", "c", "d", "e", "f", "g"], + "col_2": ["a", "b", "c", "d", "e", "f"], + "col_3": ["a", "b"], + } X = set_first_three_columns_to_ordinal_with_categories( X, categories=full_categories, ) # The order doesn't match the full categories above but outputted data will still match above - categories = [["d", "a", "c", "b"], ["c", "b", "a"], ["b", "a"]] + categories = { + "col_1": ["d", "a", "c", "b"], + "col_2": ["c", "b", "a"], + "col_3": ["b", "a"], + } # test categories value works when transforming encoder = OrdinalEncoder( @@ -564,7 +586,7 @@ def test_ordinal_encoder_categories(): }, ) encoder = OrdinalEncoder( - categories=[["a"], ["a", "b"]], + categories={"col_1": ["a"], "col_2": ["a", "b"]}, handle_unknown="use_encoded_value", unknown_value=-1, )