Skip to content

Commit

Permalink
MR comments
Browse files Browse the repository at this point in the history
  • Loading branch information
Tamar Grey committed Oct 4, 2022
1 parent f977005 commit 72a9dbf
Show file tree
Hide file tree
Showing 2 changed files with 18 additions and 21 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,6 @@ class OrdinalEncoder(Transformer, metaclass=OrdinalEncoderMeta):
encoded_missing_value (int or np.nan): The value to use for missing (null) values seen during
fit or transform. Defaults to np.nan.
random_seed (int): Seed for the random number generator. Defaults to 0.
# --> is random seed still necessary?
"""

name = "Ordinal Encoder"
Expand All @@ -60,7 +59,7 @@ def __init__(
unknown_value=None,
encoded_missing_value=None,
random_seed=0,
**kwargs, # --> why do we have kwargs if they arent used?? Maybe needed for larger evalml pipeline
**kwargs,
):
parameters = {
"features_to_encode": features_to_encode,
Expand All @@ -85,7 +84,7 @@ def __init__(
)

self.features_to_encode = features_to_encode
self._encoder = None
self._component_obj = None

super().__init__(
parameters=parameters,
Expand Down Expand Up @@ -144,8 +143,6 @@ def fit(self, X, y=None):
col for col in X.columns if col in self.features_to_encode
]

X_t = X

ww_logical_types = X.ww.logical_types
categories = []
if len(self.features_to_encode) == 0:
Expand Down Expand Up @@ -175,7 +172,7 @@ def fit(self, X, y=None):
categories.append(ordered_categories)
else:
# Categories unspecified - use ordered categories from a column's Ordinal logical type
for col in X_t[self.features_to_encode]:
for col in X[self.features_to_encode]:
ltype = ww_logical_types[col]
# Copy the order list, since we might mutate it later by adding nans
# and don't want to impact the Woodwork types
Expand All @@ -185,22 +182,22 @@ def fit(self, X, y=None):
# This is needed because Ordinal.order won't indicate if nulls are present, and SKOrdinalEncoder
# requires any null values be present in the categories list if they are to be encoded as
# missing values
for i, col in enumerate(X_t[self.features_to_encode]):
if X_t[col].isna().any():
for i, col in enumerate(X[self.features_to_encode]):
if X[col].isna().any():
categories[i] += [np.nan]

encoded_missing_value = self.parameters["encoded_missing_value"]
if encoded_missing_value is None:
encoded_missing_value = np.nan

self._encoder = SKOrdinalEncoder(
self._component_obj = SKOrdinalEncoder(
categories=categories,
handle_unknown=self.parameters["handle_unknown"],
unknown_value=self.parameters["unknown_value"],
encoded_missing_value=encoded_missing_value,
)

self._encoder.fit(X_t[self.features_to_encode])
self._component_obj.fit(X[self.features_to_encode])
return self

def transform(self, X, y=None):
Expand All @@ -218,14 +215,14 @@ def transform(self, X, y=None):
X = infer_feature_types(X)

if not self.features_to_encode:
# If there are no features to encode, X needs no transformation, so return a copy
return X.ww.copy()
# If there are no features to encode, X needs no transformation
return X

X_orig = X.ww.drop(columns=self.features_to_encode)

# Call sklearn's transform on only the ordinal columns
X_t = pd.DataFrame(
self._encoder.transform(X[self.features_to_encode]),
self._component_obj.transform(X[self.features_to_encode]),
index=X.index,
)
X_t.columns = self._get_feature_names()
Expand Down Expand Up @@ -273,7 +270,7 @@ def categories(self, feature_name):
raise ValueError(
f'Feature "{feature_name}" was not provided to ordinal encoder as a training feature',
)
return self._encoder.categories_[index]
return self._component_obj.categories_[index]

def get_feature_names(self):
"""Return feature names for the ordinal features after fitting.
Expand Down
14 changes: 7 additions & 7 deletions evalml/tests/component_tests/test_ordinal_encoder.py
Original file line number Diff line number Diff line change
Expand Up @@ -93,7 +93,7 @@ def test_categories_list_not_passed_in_for_non_ordinal_column():
encoder = OrdinalEncoder(categories=[["a", "b", "c", "d"]])
encoder.fit(X)

assert len(encoder._encoder.categories_) == len(encoder.features_to_encode)
assert len(encoder._component_obj.categories_) == len(encoder.features_to_encode)

error = 'Feature "col_1" was not provided to ordinal encoder as a training feature'
with pytest.raises(ValueError, match=error):
Expand All @@ -115,7 +115,7 @@ def test_categories_list_not_passed_in_for_non_ordinal_column():
)
encoder.fit(X)

assert len(encoder._encoder.categories_) == len(encoder.features_to_encode)
assert len(encoder._component_obj.categories_) == len(encoder.features_to_encode)
set(encoder.categories("col_2")) == {"a", "b", "c", "d"}
set(encoder.categories("col_3")) == {"x", "y"}

Expand Down Expand Up @@ -201,14 +201,14 @@ def test_ordinal_encoder_recognizes_ordinal_columns():
encoder = OrdinalEncoder()
encoder.fit(X)
assert encoder.features_to_encode == ["col_1", "col_2", "col_3"]
assert encoder.features_to_encode == list(encoder._encoder.feature_names_in_)
assert encoder.features_to_encode == list(encoder._component_obj.feature_names_in_)

encoder = OrdinalEncoder(features_to_encode=["col_1"])
encoder.fit(X)
assert encoder.features_to_encode == ["col_1"]
assert encoder.features_to_encode == list(encoder._encoder.feature_names_in_)
assert encoder.features_to_encode == list(encoder._component_obj.feature_names_in_)
expected_categories = [categories[0]]
for i, category_list in enumerate(encoder._encoder.categories_):
for i, category_list in enumerate(encoder._component_obj.categories_):
assert list(category_list) == expected_categories[i]


Expand All @@ -229,7 +229,7 @@ def test_ordinal_encoder_categories_set_correctly_from_fit():
# No parameters specified
encoder = OrdinalEncoder()
encoder.fit(X)
for i, category_list in enumerate(encoder._encoder.categories_):
for i, category_list in enumerate(encoder._component_obj.categories_):
assert list(category_list) == categories[i]

# Categories set at init explicitly - means we have to handle the unknown case
Expand All @@ -240,7 +240,7 @@ def test_ordinal_encoder_categories_set_correctly_from_fit():
unknown_value=-1,
)
encoder.fit(X)
for i, category_list in enumerate(encoder._encoder.categories_):
for i, category_list in enumerate(encoder._component_obj.categories_):
assert list(category_list) == subset_categories[i]


Expand Down

0 comments on commit 72a9dbf

Please sign in to comment.