Make categories arg a dict

alteryx · Oct 4, 2022 · 54386f0 · 54386f0
1 parent b6d9dec
commit 54386f0
Show file tree

Hide file tree

Showing 2 changed files with 99 additions and 72 deletions.
diff --git a/evalml/pipelines/components/transformers/encoders/ordinal_encoder.py b/evalml/pipelines/components/transformers/encoders/ordinal_encoder.py
@@ -27,10 +27,9 @@ class OrdinalEncoder(Transformer, metaclass=OrdinalEncoderMeta):
     Args:
         features_to_encode (list[str]): List of columns to encode. All other columns will remain untouched.
             If None, all appropriate columns will be encoded. Defaults to None. The order of columns does not matter.
-        categories (list[list[str]]): A two dimensional list of categories, where `categories[i]` is a list of the categories
-            for the column at index `i` in the dataframes passed in at fit and transform.
-            The order of categories specified for a column does not matter.
-            Any category not present in categories will be handled as an unknown value.
+        categories (dict[str, list[str]]): A dictionary mapping column names to their categories
+            in the dataframes passed in at fit and transform. The order of categories specified for a column does not matter.
+            Any category found in the data that is not present in categories will be handled as an unknown value.
             To not have unknown values raise an error, set handle_unknown to "use_encoded_value".
             Defaults to None.
         handle_unknown ("error" or "use_encoded_value"): Whether to ignore or error for unknown categories
@@ -139,12 +138,12 @@ def fit(self, X, y=None):
                     )
 
             # Put features_to_encode in the same relative order as the columns in the dataframe
-            self.features_to_encode = [
-                col for col in X.columns if col in self.features_to_encode
-            ]
+            # self.features_to_encode = [
+            #     col for col in X.columns if col in self.features_to_encode
+            # ]
 
         ww_logical_types = X.ww.logical_types
-        categories = []
+        categories = {}
         if len(self.features_to_encode) == 0:
             # No ordinal features present - no transformation can take place so return early
             return self
@@ -154,44 +153,50 @@ def fit(self, X, y=None):
 
             if len(input_categories) != len(self.features_to_encode):
                 raise ValueError(
-                    "Categories argument must contain as many elements as there are Ordinal features.",
+                    "Categories argument must contain as many elements as there are features to encode.",
                 )
 
-            if not all(isinstance(cats, list) for cats in input_categories):
+            if not all(isinstance(cats, list) for cats in input_categories.values()):
                 raise ValueError(
-                    "Each element of the categories argument must be a list.",
+                    "Each of the values in the categories argument must be a list.",
                 )
             # Categories, as they're passed into SKOrdinalEncoder should be in the same order
             # as the data's Ordinal.order categories even if it's a subset
-            for i, col_categories in enumerate(input_categories):
-                categories_order = ww_logical_types[self.features_to_encode[i]].order
+            for col_name in self.features_to_encode:
+                col_categories = input_categories[col_name]
+                categories_order = ww_logical_types[col_name].order
 
                 ordered_categories = [
                     cat for cat in categories_order if cat in col_categories
                 ]
-                categories.append(ordered_categories)
+                categories[col_name] = ordered_categories
         else:
             # Categories unspecified - use ordered categories from a column's Ordinal logical type
-            for col in X[self.features_to_encode]:
-                ltype = ww_logical_types[col]
+            for col_name in self.features_to_encode:
+                ltype = ww_logical_types[col_name]
                 # Copy the order list, since we might mutate it later by adding nans
                 # and don't want to impact the Woodwork types
-                categories.append(ltype.order.copy())
+                categories[col_name] = ltype.order.copy()
 
         # Add any null values into the categories lists so that they aren't treated as unknown values
         # This is needed because Ordinal.order won't indicate if nulls are present, and SKOrdinalEncoder
         # requires any null values be present in the categories list if they are to be encoded as
         # missing values
-        for i, col in enumerate(X[self.features_to_encode]):
-            if X[col].isna().any():
-                categories[i] += [np.nan]
+        for col_name in self.features_to_encode:
+            if X[col_name].isna().any():
+                categories[col_name].append(np.nan)
+
+        # sklearn needs categories to be a list in the order of the columns in features_to_encode
+        categories_for_sk_encoder = [
+            categories[col_name] for col_name in self.features_to_encode
+        ]
 
         encoded_missing_value = self.parameters["encoded_missing_value"]
         if encoded_missing_value is None:
             encoded_missing_value = np.nan
 
         self._component_obj = SKOrdinalEncoder(
-            categories=categories,
+            categories=categories_for_sk_encoder,
             handle_unknown=self.parameters["handle_unknown"],
             unknown_value=self.parameters["unknown_value"],
             encoded_missing_value=encoded_missing_value,

diff --git a/evalml/tests/component_tests/test_ordinal_encoder.py b/evalml/tests/component_tests/test_ordinal_encoder.py
@@ -10,9 +10,9 @@
 def set_first_three_columns_to_ordinal_with_categories(X, categories):
     X.ww.init(
         logical_types={
-            "col_1": Ordinal(order=categories[0]),
-            "col_2": Ordinal(order=categories[1]),
-            "col_3": Ordinal(order=categories[2]),
+            "col_1": Ordinal(order=categories["col_1"]),
+            "col_2": Ordinal(order=categories["col_2"]),
+            "col_3": Ordinal(order=categories["col_3"]),
         },
     )
     return X
@@ -64,15 +64,17 @@ def test_invalid_inputs():
     )
     X = set_first_three_columns_to_ordinal_with_categories(
         X,
-        [["a", "b", "c", "d"], ["a", "b", "c"], ["a"]],
+        {"col_1": ["a", "b", "c", "d"], "col_2": ["a", "b", "c"], "col_3": ["a"]},
     )
-    encoder = OrdinalEncoder(categories=[["a", "b"], ["a", "c"]])
-    error_msg = "Categories argument must contain as many elements as there are Ordinal features."
+    encoder = OrdinalEncoder(categories={"col_1": ["a", "b"], "col_2": ["a", "c"]})
+    error_msg = "Categories argument must contain as many elements as there are features to encode."
     with pytest.raises(ValueError, match=error_msg):
         encoder.fit(X)
 
-    encoder = OrdinalEncoder(categories=[["a", "b"], 1, ["a"]])
-    error_msg = "Each element of the categories argument must be a list."
+    encoder = OrdinalEncoder(
+        categories={"col_1": 1, "col_2": ["a", "c"], "col_3": ["a", "b"]},
+    )
+    error_msg = "Each of the values in the categories argument must be a list."
     with pytest.raises(ValueError, match=error_msg):
         encoder.fit(X)
 
@@ -90,7 +92,7 @@ def test_categories_list_not_passed_in_for_non_ordinal_column():
     )
     X.ww.init(logical_types={"col_2": Ordinal(order=["a", "b", "c", "d"])})
 
-    encoder = OrdinalEncoder(categories=[["a", "b", "c", "d"]])
+    encoder = OrdinalEncoder(categories={"col_2": ["a", "b", "c", "d"]})
     encoder.fit(X)
 
     assert len(encoder._component_obj.categories_) == len(encoder.features_to_encode)
@@ -99,8 +101,8 @@ def test_categories_list_not_passed_in_for_non_ordinal_column():
     with pytest.raises(ValueError, match=error):
         encoder.categories("col_1")
 
-    # When features_to_encode is passed in, confirm the order there doesn't matter and order of
-    # cols in X is still used in indexing into categories
+    # When features_to_encode is passed in, confirm the order there doesn't matter
+    # in indexing into categories_
     X.ww.init(
         logical_types={
             "col_2": Ordinal(order=["a", "b", "c", "d"]),
@@ -110,15 +112,18 @@ def test_categories_list_not_passed_in_for_non_ordinal_column():
     encoder = OrdinalEncoder(
         # features_to_encode passed in different order than the dataframe's cols
         features_to_encode=["col_3", "col_2"],
-        # categories' order still matches the dataframe's cols
-        categories=[["a", "b", "c", "d"], ["x", "y"]],
+        categories={"col_2": ["a", "b", "c", "d"], "col_3": ["x", "y"]},
     )
     encoder.fit(X)
 
     assert len(encoder._component_obj.categories_) == len(encoder.features_to_encode)
     set(encoder.categories("col_2")) == {"a", "b", "c", "d"}
     set(encoder.categories("col_3")) == {"x", "y"}
 
+    X_t = encoder.transform(X)
+    assert list(X_t["col_2_ordinal_encoding"]) == [0, 1, 0, 2, 3]
+    assert list(X_t["col_3_ordinal_encoding"]) == [0, 0, 0, 1, 1]
+
 
 def test_features_to_encode_non_ordinal_cols():
     X = pd.DataFrame({"col_1": [2, 0, 1, 0, 0], "col_2": ["a", "b", "a", "c", "d"]})
@@ -142,7 +147,7 @@ def test_categories_specified_not_present_in_data():
     X.ww.init(logical_types={"col_1": Ordinal(order=["a", "b", "c", "d", "x"])})
 
     encoder = OrdinalEncoder(
-        categories=[["a", "x"]],
+        categories={"col_1": ["a", "x"]},
         handle_unknown="use_encoded_value",
         unknown_value=-1,
     )
@@ -195,7 +200,11 @@ def test_ordinal_encoder_recognizes_ordinal_columns():
     encoder.fit(X)
     assert not encoder.features_to_encode
 
-    categories = [["a", "b", "c", "d"], ["a", "b", "c"], ["a"]]
+    categories = {
+        "col_1": ["a", "b", "c", "d"],
+        "col_2": ["a", "b", "c"],
+        "col_3": ["a"],
+    }
     X = set_first_three_columns_to_ordinal_with_categories(X, categories=categories)
 
     encoder = OrdinalEncoder()
@@ -207,9 +216,9 @@ def test_ordinal_encoder_recognizes_ordinal_columns():
     encoder.fit(X)
     assert encoder.features_to_encode == ["col_1"]
     assert encoder.features_to_encode == list(encoder._component_obj.feature_names_in_)
-    expected_categories = [categories[0]]
+    expected_categories = {"col_1": categories["col_1"]}
     for i, category_list in enumerate(encoder._component_obj.categories_):
-        assert list(category_list) == expected_categories[i]
+        assert list(category_list) == expected_categories[f"col_{i + 1}"]
 
 
 def test_ordinal_encoder_categories_set_correctly_from_fit():
@@ -223,25 +232,29 @@ def test_ordinal_encoder_categories_set_correctly_from_fit():
             "col_4": [1, 2, 3, 4, 5],
         },
     )
-    categories = [["a", "b", "c", "d"], ["a", "b", "c"], ["a"]]
+    categories = {
+        "col_1": ["a", "b", "c", "d"],
+        "col_2": ["a", "b", "c"],
+        "col_3": ["a"],
+    }
     X = set_first_three_columns_to_ordinal_with_categories(X, categories=categories)
 
     # No parameters specified
     encoder = OrdinalEncoder()
     encoder.fit(X)
     for i, category_list in enumerate(encoder._component_obj.categories_):
-        assert list(category_list) == categories[i]
+        assert list(category_list) == categories[f"col_{i + 1}"]
 
     # Categories set at init explicitly - means we have to handle the unknown case
-    subset_categories = [["a"], ["a"], ["a"]]
+    subset_categories = {"col_1": ["a"], "col_2": ["a"], "col_3": ["a"]}
     encoder = OrdinalEncoder(
         categories=subset_categories,
         handle_unknown="use_encoded_value",
         unknown_value=-1,
     )
     encoder.fit(X)
     for i, category_list in enumerate(encoder._component_obj.categories_):
-        assert list(category_list) == subset_categories[i]
+        assert list(category_list) == subset_categories[f"col_{i + 1}"]
 
 
 def test_ordinal_encoder_transform():
@@ -368,7 +381,7 @@ def test_null_values_with_categories_specified():
 
     # Try putting a nan in the categories list in one of the columns but not the other
     encoder = OrdinalEncoder(
-        categories=[["a"], ["a", np.nan]],
+        categories={"col_1": ["a"], "col_2": ["a", np.nan]},
         handle_unknown="use_encoded_value",
         unknown_value=-1,
     )
@@ -389,11 +402,11 @@ def test_handle_unknown():
             "col_4": [2, 0, 1, 3, 0, 1, 2],
         },
     )
-    categories = [
-        ["a", "b", "c", "d", "e", "f", "g"],
-        ["a", "b", "c", "d", "e", "f"],
-        ["a", "b"],
-    ]
+    categories = {
+        "col_1": ["a", "b", "c", "d", "e", "f", "g"],
+        "col_2": ["a", "b", "c", "d", "e", "f"],
+        "col_3": ["a", "b"],
+    }
     X = set_first_three_columns_to_ordinal_with_categories(X, categories=categories)
 
     encoder = OrdinalEncoder(handle_unknown="error")
@@ -408,11 +421,11 @@ def test_handle_unknown():
             "col_4": [2, 0, 1, 3, 0, 1, 2],
         },
     )
-    categories = [
-        ["x", "b", "c", "d", "e", "f", "g"],
-        ["a", "b", "c", "d", "e", "f"],
-        ["a", "b"],
-    ]
+    categories = {
+        "col_1": ["x", "b", "c", "d", "e", "f", "g"],
+        "col_2": ["a", "b", "c", "d", "e", "f"],
+        "col_3": ["a", "b"],
+    }
     X = set_first_three_columns_to_ordinal_with_categories(X, categories=categories)
     with pytest.raises(ValueError) as exec_info:
         # Using the encoder that was fit on data without x
@@ -429,17 +442,22 @@ def test_categories_set_at_init():
             "col_4": [2, 0, 1, 3, 0, 1, 2],
         },
     )
-    full_categories = [
-        ["a", "b", "c", "d", "e", "f", "g"],
-        ["a", "b", "c", "d", "e", "f"],
-        ["a", "b"],
-    ]
+
+    full_categories = {
+        "col_1": ["a", "b", "c", "d", "e", "f", "g"],
+        "col_2": ["a", "b", "c", "d", "e", "f"],
+        "col_3": ["a", "b"],
+    }
     X = set_first_three_columns_to_ordinal_with_categories(
         X,
         categories=full_categories,
     )
 
-    categories = [["a", "b", "c", "d"], ["a", "b", "c"], ["a", "b"]]
+    categories = {
+        "col_1": ["a", "b", "c", "d"],
+        "col_2": ["a", "b", "c"],
+        "col_3": ["a", "b"],
+    }
 
     # test categories value works when transforming
     encoder = OrdinalEncoder(
@@ -467,18 +485,18 @@ def test_categories_includes_not_present_value():
             "col_4": [2, 0, 1, 3, 0, 1, 2],
         },
     )
-    full_categories = [
-        ["a", "b", "c", "d", "e", "f", "g"],
-        ["a", "b", "c", "d", "e", "f"],
-        ["a", "b"],
-    ]
+    full_categories = {
+        "col_1": ["a", "b", "c", "d", "e", "f", "g"],
+        "col_2": ["a", "b", "c", "d", "e", "f"],
+        "col_3": ["a", "b"],
+    }
     X = set_first_three_columns_to_ordinal_with_categories(
         X,
         categories=full_categories,
     )
 
     # Categories passed in has value "x" that's not in the data
-    categories = [["a", "x"], ["a", "x"], ["a", "x"]]
+    categories = {"col_1": ["a", "x"], "col_2": ["a", "x"], "col_3": ["a", "x"]}
 
     # test categories value works when transforming
     encoder = OrdinalEncoder(
@@ -505,18 +523,22 @@ def test_categories_different_order_from_ltype():
             "col_4": [2, 0, 1, 3, 0, 1, 2],
         },
     )
-    full_categories = [
-        ["a", "b", "c", "d", "e", "f", "g"],
-        ["a", "b", "c", "d", "e", "f"],
-        ["a", "b"],
-    ]
+    full_categories = {
+        "col_1": ["a", "b", "c", "d", "e", "f", "g"],
+        "col_2": ["a", "b", "c", "d", "e", "f"],
+        "col_3": ["a", "b"],
+    }
     X = set_first_three_columns_to_ordinal_with_categories(
         X,
         categories=full_categories,
     )
 
     # The order doesn't match the full categories above but outputted data will still match above
-    categories = [["d", "a", "c", "b"], ["c", "b", "a"], ["b", "a"]]
+    categories = {
+        "col_1": ["d", "a", "c", "b"],
+        "col_2": ["c", "b", "a"],
+        "col_3": ["b", "a"],
+    }
 
     # test categories value works when transforming
     encoder = OrdinalEncoder(
@@ -564,7 +586,7 @@ def test_ordinal_encoder_categories():
         },
     )
     encoder = OrdinalEncoder(
-        categories=[["a"], ["a", "b"]],
+        categories={"col_1": ["a"], "col_2": ["a", "b"]},
         handle_unknown="use_encoded_value",
         unknown_value=-1,
     )