Skip to content

Commit

Permalink
Make categories arg a dict
Browse files Browse the repository at this point in the history
  • Loading branch information
Tamar Grey committed Oct 4, 2022
1 parent b6d9dec commit 54386f0
Show file tree
Hide file tree
Showing 2 changed files with 99 additions and 72 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -27,10 +27,9 @@ class OrdinalEncoder(Transformer, metaclass=OrdinalEncoderMeta):
Args:
features_to_encode (list[str]): List of columns to encode. All other columns will remain untouched.
If None, all appropriate columns will be encoded. Defaults to None. The order of columns does not matter.
categories (list[list[str]]): A two dimensional list of categories, where `categories[i]` is a list of the categories
for the column at index `i` in the dataframes passed in at fit and transform.
The order of categories specified for a column does not matter.
Any category not present in categories will be handled as an unknown value.
categories (dict[str, list[str]]): A dictionary mapping column names to their categories
in the dataframes passed in at fit and transform. The order of categories specified for a column does not matter.
Any category found in the data that is not present in categories will be handled as an unknown value.
To not have unknown values raise an error, set handle_unknown to "use_encoded_value".
Defaults to None.
handle_unknown ("error" or "use_encoded_value"): Whether to ignore or error for unknown categories
Expand Down Expand Up @@ -139,12 +138,12 @@ def fit(self, X, y=None):
)

# Put features_to_encode in the same relative order as the columns in the dataframe
self.features_to_encode = [
col for col in X.columns if col in self.features_to_encode
]
# self.features_to_encode = [
# col for col in X.columns if col in self.features_to_encode
# ]

ww_logical_types = X.ww.logical_types
categories = []
categories = {}
if len(self.features_to_encode) == 0:
# No ordinal features present - no transformation can take place so return early
return self
Expand All @@ -154,44 +153,50 @@ def fit(self, X, y=None):

if len(input_categories) != len(self.features_to_encode):
raise ValueError(
"Categories argument must contain as many elements as there are Ordinal features.",
"Categories argument must contain as many elements as there are features to encode.",
)

if not all(isinstance(cats, list) for cats in input_categories):
if not all(isinstance(cats, list) for cats in input_categories.values()):
raise ValueError(
"Each element of the categories argument must be a list.",
"Each of the values in the categories argument must be a list.",
)
# Categories, as they're passed into SKOrdinalEncoder, should be in the same order
# as the data's Ordinal.order categories, even if they're only a subset of them
for i, col_categories in enumerate(input_categories):
categories_order = ww_logical_types[self.features_to_encode[i]].order
for col_name in self.features_to_encode:
col_categories = input_categories[col_name]
categories_order = ww_logical_types[col_name].order

ordered_categories = [
cat for cat in categories_order if cat in col_categories
]
categories.append(ordered_categories)
categories[col_name] = ordered_categories
else:
# Categories unspecified - use ordered categories from a column's Ordinal logical type
for col in X[self.features_to_encode]:
ltype = ww_logical_types[col]
for col_name in self.features_to_encode:
ltype = ww_logical_types[col_name]
# Copy the order list, since we might mutate it later by adding nans
# and don't want to impact the Woodwork types
categories.append(ltype.order.copy())
categories[col_name] = ltype.order.copy()

# Add any null values into the categories lists so that they aren't treated as unknown values
# This is needed because Ordinal.order won't indicate if nulls are present, and SKOrdinalEncoder
# requires any null values be present in the categories list if they are to be encoded as
# missing values
for i, col in enumerate(X[self.features_to_encode]):
if X[col].isna().any():
categories[i] += [np.nan]
for col_name in self.features_to_encode:
if X[col_name].isna().any():
categories[col_name].append(np.nan)

# sklearn needs categories to be a list in the order of the columns in features_to_encode
categories_for_sk_encoder = [
categories[col_name] for col_name in self.features_to_encode
]

encoded_missing_value = self.parameters["encoded_missing_value"]
if encoded_missing_value is None:
encoded_missing_value = np.nan

self._component_obj = SKOrdinalEncoder(
categories=categories,
categories=categories_for_sk_encoder,
handle_unknown=self.parameters["handle_unknown"],
unknown_value=self.parameters["unknown_value"],
encoded_missing_value=encoded_missing_value,
Expand Down
124 changes: 73 additions & 51 deletions evalml/tests/component_tests/test_ordinal_encoder.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,9 +10,9 @@
def set_first_three_columns_to_ordinal_with_categories(X, categories):
X.ww.init(
logical_types={
"col_1": Ordinal(order=categories[0]),
"col_2": Ordinal(order=categories[1]),
"col_3": Ordinal(order=categories[2]),
"col_1": Ordinal(order=categories["col_1"]),
"col_2": Ordinal(order=categories["col_2"]),
"col_3": Ordinal(order=categories["col_3"]),
},
)
return X
Expand Down Expand Up @@ -64,15 +64,17 @@ def test_invalid_inputs():
)
X = set_first_three_columns_to_ordinal_with_categories(
X,
[["a", "b", "c", "d"], ["a", "b", "c"], ["a"]],
{"col_1": ["a", "b", "c", "d"], "col_2": ["a", "b", "c"], "col_3": ["a"]},
)
encoder = OrdinalEncoder(categories=[["a", "b"], ["a", "c"]])
error_msg = "Categories argument must contain as many elements as there are Ordinal features."
encoder = OrdinalEncoder(categories={"col_1": ["a", "b"], "col_2": ["a", "c"]})
error_msg = "Categories argument must contain as many elements as there are features to encode."
with pytest.raises(ValueError, match=error_msg):
encoder.fit(X)

encoder = OrdinalEncoder(categories=[["a", "b"], 1, ["a"]])
error_msg = "Each element of the categories argument must be a list."
encoder = OrdinalEncoder(
categories={"col_1": 1, "col_2": ["a", "c"], "col_3": ["a", "b"]},
)
error_msg = "Each of the values in the categories argument must be a list."
with pytest.raises(ValueError, match=error_msg):
encoder.fit(X)

Expand All @@ -90,7 +92,7 @@ def test_categories_list_not_passed_in_for_non_ordinal_column():
)
X.ww.init(logical_types={"col_2": Ordinal(order=["a", "b", "c", "d"])})

encoder = OrdinalEncoder(categories=[["a", "b", "c", "d"]])
encoder = OrdinalEncoder(categories={"col_2": ["a", "b", "c", "d"]})
encoder.fit(X)

assert len(encoder._component_obj.categories_) == len(encoder.features_to_encode)
Expand All @@ -99,8 +101,8 @@ def test_categories_list_not_passed_in_for_non_ordinal_column():
with pytest.raises(ValueError, match=error):
encoder.categories("col_1")

# When features_to_encode is passed in, confirm the order there doesn't matter and order of
# cols in X is still used in indexing into categories
# When features_to_encode is passed in, confirm the order there doesn't matter
# in indexing into categories_
X.ww.init(
logical_types={
"col_2": Ordinal(order=["a", "b", "c", "d"]),
Expand All @@ -110,15 +112,18 @@ def test_categories_list_not_passed_in_for_non_ordinal_column():
encoder = OrdinalEncoder(
# features_to_encode passed in different order than the dataframe's cols
features_to_encode=["col_3", "col_2"],
# categories' order still matches the dataframe's cols
categories=[["a", "b", "c", "d"], ["x", "y"]],
categories={"col_2": ["a", "b", "c", "d"], "col_3": ["x", "y"]},
)
encoder.fit(X)

# Every check below must be an `assert`: a bare `==` comparison is evaluated and
# then discarded, so it can never make the test fail.
assert len(encoder._component_obj.categories_) == len(encoder.features_to_encode)
assert set(encoder.categories("col_2")) == {"a", "b", "c", "d"}
assert set(encoder.categories("col_3")) == {"x", "y"}

# The encoded output should be keyed by column name, regardless of the order in
# which features_to_encode or the categories dict listed the columns.
X_t = encoder.transform(X)
assert list(X_t["col_2_ordinal_encoding"]) == [0, 1, 0, 2, 3]
assert list(X_t["col_3_ordinal_encoding"]) == [0, 0, 0, 1, 1]


def test_features_to_encode_non_ordinal_cols():
X = pd.DataFrame({"col_1": [2, 0, 1, 0, 0], "col_2": ["a", "b", "a", "c", "d"]})
Expand All @@ -142,7 +147,7 @@ def test_categories_specified_not_present_in_data():
X.ww.init(logical_types={"col_1": Ordinal(order=["a", "b", "c", "d", "x"])})

encoder = OrdinalEncoder(
categories=[["a", "x"]],
categories={"col_1": ["a", "x"]},
handle_unknown="use_encoded_value",
unknown_value=-1,
)
Expand Down Expand Up @@ -195,7 +200,11 @@ def test_ordinal_encoder_recognizes_ordinal_columns():
encoder.fit(X)
assert not encoder.features_to_encode

categories = [["a", "b", "c", "d"], ["a", "b", "c"], ["a"]]
categories = {
"col_1": ["a", "b", "c", "d"],
"col_2": ["a", "b", "c"],
"col_3": ["a"],
}
X = set_first_three_columns_to_ordinal_with_categories(X, categories=categories)

encoder = OrdinalEncoder()
Expand All @@ -207,9 +216,9 @@ def test_ordinal_encoder_recognizes_ordinal_columns():
encoder.fit(X)
assert encoder.features_to_encode == ["col_1"]
assert encoder.features_to_encode == list(encoder._component_obj.feature_names_in_)
expected_categories = [categories[0]]
expected_categories = {"col_1": categories["col_1"]}
for i, category_list in enumerate(encoder._component_obj.categories_):
assert list(category_list) == expected_categories[i]
assert list(category_list) == expected_categories[f"col_{i + 1}"]


def test_ordinal_encoder_categories_set_correctly_from_fit():
Expand All @@ -223,25 +232,29 @@ def test_ordinal_encoder_categories_set_correctly_from_fit():
"col_4": [1, 2, 3, 4, 5],
},
)
categories = [["a", "b", "c", "d"], ["a", "b", "c"], ["a"]]
categories = {
"col_1": ["a", "b", "c", "d"],
"col_2": ["a", "b", "c"],
"col_3": ["a"],
}
X = set_first_three_columns_to_ordinal_with_categories(X, categories=categories)

# No parameters specified
encoder = OrdinalEncoder()
encoder.fit(X)
for i, category_list in enumerate(encoder._component_obj.categories_):
assert list(category_list) == categories[i]
assert list(category_list) == categories[f"col_{i + 1}"]

# Categories set at init explicitly - means we have to handle the unknown case
subset_categories = [["a"], ["a"], ["a"]]
subset_categories = {"col_1": ["a"], "col_2": ["a"], "col_3": ["a"]}
encoder = OrdinalEncoder(
categories=subset_categories,
handle_unknown="use_encoded_value",
unknown_value=-1,
)
encoder.fit(X)
for i, category_list in enumerate(encoder._component_obj.categories_):
assert list(category_list) == subset_categories[i]
assert list(category_list) == subset_categories[f"col_{i + 1}"]


def test_ordinal_encoder_transform():
Expand Down Expand Up @@ -368,7 +381,7 @@ def test_null_values_with_categories_specified():

# Try putting a nan in the categories list in one of the columns but not the other
encoder = OrdinalEncoder(
categories=[["a"], ["a", np.nan]],
categories={"col_1": ["a"], "col_2": ["a", np.nan]},
handle_unknown="use_encoded_value",
unknown_value=-1,
)
Expand All @@ -389,11 +402,11 @@ def test_handle_unknown():
"col_4": [2, 0, 1, 3, 0, 1, 2],
},
)
categories = [
["a", "b", "c", "d", "e", "f", "g"],
["a", "b", "c", "d", "e", "f"],
["a", "b"],
]
categories = {
"col_1": ["a", "b", "c", "d", "e", "f", "g"],
"col_2": ["a", "b", "c", "d", "e", "f"],
"col_3": ["a", "b"],
}
X = set_first_three_columns_to_ordinal_with_categories(X, categories=categories)

encoder = OrdinalEncoder(handle_unknown="error")
Expand All @@ -408,11 +421,11 @@ def test_handle_unknown():
"col_4": [2, 0, 1, 3, 0, 1, 2],
},
)
categories = [
["x", "b", "c", "d", "e", "f", "g"],
["a", "b", "c", "d", "e", "f"],
["a", "b"],
]
categories = {
"col_1": ["x", "b", "c", "d", "e", "f", "g"],
"col_2": ["a", "b", "c", "d", "e", "f"],
"col_3": ["a", "b"],
}
X = set_first_three_columns_to_ordinal_with_categories(X, categories=categories)
with pytest.raises(ValueError) as exec_info:
# Using the encoder that was fit on data without x
Expand All @@ -429,17 +442,22 @@ def test_categories_set_at_init():
"col_4": [2, 0, 1, 3, 0, 1, 2],
},
)
full_categories = [
["a", "b", "c", "d", "e", "f", "g"],
["a", "b", "c", "d", "e", "f"],
["a", "b"],
]

full_categories = {
"col_1": ["a", "b", "c", "d", "e", "f", "g"],
"col_2": ["a", "b", "c", "d", "e", "f"],
"col_3": ["a", "b"],
}
X = set_first_three_columns_to_ordinal_with_categories(
X,
categories=full_categories,
)

categories = [["a", "b", "c", "d"], ["a", "b", "c"], ["a", "b"]]
categories = {
"col_1": ["a", "b", "c", "d"],
"col_2": ["a", "b", "c"],
"col_3": ["a", "b"],
}

# test categories value works when transforming
encoder = OrdinalEncoder(
Expand Down Expand Up @@ -467,18 +485,18 @@ def test_categories_includes_not_present_value():
"col_4": [2, 0, 1, 3, 0, 1, 2],
},
)
full_categories = [
["a", "b", "c", "d", "e", "f", "g"],
["a", "b", "c", "d", "e", "f"],
["a", "b"],
]
full_categories = {
"col_1": ["a", "b", "c", "d", "e", "f", "g"],
"col_2": ["a", "b", "c", "d", "e", "f"],
"col_3": ["a", "b"],
}
X = set_first_three_columns_to_ordinal_with_categories(
X,
categories=full_categories,
)

# Categories passed in has value "x" that's not in the data
categories = [["a", "x"], ["a", "x"], ["a", "x"]]
categories = {"col_1": ["a", "x"], "col_2": ["a", "x"], "col_3": ["a", "x"]}

# test categories value works when transforming
encoder = OrdinalEncoder(
Expand All @@ -505,18 +523,22 @@ def test_categories_different_order_from_ltype():
"col_4": [2, 0, 1, 3, 0, 1, 2],
},
)
full_categories = [
["a", "b", "c", "d", "e", "f", "g"],
["a", "b", "c", "d", "e", "f"],
["a", "b"],
]
full_categories = {
"col_1": ["a", "b", "c", "d", "e", "f", "g"],
"col_2": ["a", "b", "c", "d", "e", "f"],
"col_3": ["a", "b"],
}
X = set_first_three_columns_to_ordinal_with_categories(
X,
categories=full_categories,
)

# The order doesn't match the full categories above but outputted data will still match above
categories = [["d", "a", "c", "b"], ["c", "b", "a"], ["b", "a"]]
categories = {
"col_1": ["d", "a", "c", "b"],
"col_2": ["c", "b", "a"],
"col_3": ["b", "a"],
}

# test categories value works when transforming
encoder = OrdinalEncoder(
Expand Down Expand Up @@ -564,7 +586,7 @@ def test_ordinal_encoder_categories():
},
)
encoder = OrdinalEncoder(
categories=[["a"], ["a", "b"]],
categories={"col_1": ["a"], "col_2": ["a", "b"]},
handle_unknown="use_encoded_value",
unknown_value=-1,
)
Expand Down

0 comments on commit 54386f0

Please sign in to comment.