Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Adding SKLearn OrdinalEncoder as a Transformer #157

Open
wants to merge 9 commits into
base: main
Choose a base branch
from
Open
Binary file added ImputerExperiments/data/r/.DS_Store
Binary file not shown.
1 change: 0 additions & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,6 @@
#TODO update this
from setuptools import setup, find_packages


def calculate_version():
initpy = open('tpot2/_version.py').read().split('\n')
version = list(filter(lambda x: '__version__' in x, initpy))[0].split('\'')[1]
Expand Down
2 changes: 1 addition & 1 deletion tpot2/builtin_modules/__init__.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
from .feature_set_selector import FeatureSetSelector
from .zero_count import ZeroCount
from .column_one_hot_encoder import ColumnOneHotEncoder
from .column_one_hot_encoder import ColumnOneHotEncoder, ColumnOrdinalEncoder
from .arithmetictransformer import ArithmeticTransformer
from .arithmetictransformer import AddTransformer, mul_neg_1_Transformer, MulTransformer, SafeReciprocalTransformer, EQTransformer, NETransformer, GETransformer, GTTransformer, LETransformer, LTTransformer, MinTransformer, MaxTransformer, ZeroTransformer, OneTransformer, NTransformer
from .passthrough import Passthrough, SkipTransformer
Expand Down
137 changes: 136 additions & 1 deletion tpot2/builtin_modules/column_one_hot_encoder.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.utils import check_array
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
import sklearn

import pandas as pd
Expand Down Expand Up @@ -203,3 +203,138 @@ def transform(self, X):
return pd.concat([X_not_sel.reset_index(drop=True), X_sel.reset_index(drop=True)], axis=1)
else:
return np.hstack((X_not_sel, X_sel))

class ColumnOrdinalEncoder(BaseEstimator, TransformerMixin):
    """Ordinal-encode a chosen subset of columns with sklearn's OrdinalEncoder.

    Columns that are not selected are passed through unchanged. When only a
    subset of the columns is selected, the output places the untouched columns
    first and the encoded columns after them.

    Parameters
    ----------
    columns : str or list, default='auto'
        Determines which columns to ordinal encode with
        sklearn.preprocessing.OrdinalEncoder.
        - 'auto' : Columns chosen by ``auto_select_categorical_features``
          (helper defined elsewhere in this module).
        - 'categorical' : All non-numeric columns (pandas DataFrame only).
        - 'numeric' : All numeric columns (pandas DataFrame only).
        - 'all' : Select all features.
        - list : An explicit list of columns to select.

    handle_unknown, unknown_value, encoded_missing_value, min_frequency, max_categories :
        See sklearn.preprocessing.OrdinalEncoder. ``unknown_value`` is only
        forwarded to the encoder when ``handle_unknown='use_encoded_value'``;
        in any other mode sklearn requires it to be ``None``.

    Attributes
    ----------
    columns_ : list
        Columns selected during ``fit``.
    enc : sklearn.preprocessing.OrdinalEncoder
        The fitted encoder. Only set when at least one column was selected.
    """

    def __init__(self, columns='auto', handle_unknown='error', unknown_value = -1, encoded_missing_value = np.nan, min_frequency=None,max_categories=None):
        # sklearn convention: store constructor arguments verbatim and defer
        # all validation to fit().
        self.columns = columns
        self.handle_unknown = handle_unknown
        self.unknown_value = unknown_value
        self.encoded_missing_value = encoded_missing_value
        self.min_frequency = min_frequency
        self.max_categories = max_categories

    def fit(self, X, y=None):
        """Select the columns to encode and fit an OrdinalEncoder on them.

        Parameters
        ----------
        X : array-like, shape=(n_samples, n_features)
            pandas DataFrame or numpy array holding the training data.
        y : array-like, shape=(n_samples,), optional
            Ignored. Present for sklearn API compatibility.

        Returns
        -------
        self : ColumnOrdinalEncoder
            The fitted transformer.

        Raises
        ------
        ValueError
            If ``columns`` is 'categorical' or 'numeric' and X is not a
            DataFrame, or if ``columns`` is not a recognised option.
        """
        if (self.columns == "categorical" or self.columns == "numeric") and not isinstance(X, pd.DataFrame):
            raise ValueError(f"Invalid value for columns: {self.columns}. "
                             "Only 'all' or <list> is supported for np arrays")

        if self.columns == "categorical":
            self.columns_ = list(X.select_dtypes(exclude='number').columns)
        elif self.columns == "numeric":
            self.columns_ = [col for col in X.columns if is_numeric_dtype(X[col])]
        elif self.columns == "auto":
            self.columns_ = auto_select_categorical_features(X)
        elif self.columns == "all":
            if isinstance(X, pd.DataFrame):
                # Store a plain list, like the other branches do.
                self.columns_ = list(X.columns)
            else:
                self.columns_ = list(range(X.shape[1]))
        elif isinstance(self.columns, list):
            self.columns_ = self.columns
        else:
            raise ValueError(f"Invalid value for columns: {self.columns}")

        # Nothing to encode: transform() will pass X through untouched.
        if len(self.columns_) == 0:
            return self

        # sklearn's OrdinalEncoder raises a TypeError if unknown_value is set
        # while handle_unknown != 'use_encoded_value'. Our constructor default
        # is handle_unknown='error' with unknown_value=-1, so only forward the
        # value in the mode where it is meaningful.
        if self.handle_unknown == "use_encoded_value":
            unknown_value = self.unknown_value
        else:
            unknown_value = None

        self.enc = sklearn.preprocessing.OrdinalEncoder(categories='auto',
                                                        handle_unknown = self.handle_unknown,
                                                        unknown_value = unknown_value,
                                                        encoded_missing_value = self.encoded_missing_value,
                                                        min_frequency = self.min_frequency,
                                                        max_categories = self.max_categories)
        #TODO make this more consistent with sklearn baseimputer/baseencoder

        if len(self.columns_) == X.shape[1]:
            # Every column is selected, so no split is needed.
            self.enc.fit(X)
        else:
            X_sel, _ = _X_selected(X, self.columns_)
            self.enc.fit(X_sel)

        return self

    def transform(self, X):
        """Ordinal-encode the columns selected during ``fit``.

        Parameters
        ----------
        X : array-like, shape=(n_samples, n_features)
            pandas DataFrame or numpy array with the same columns seen in fit.

        Returns
        -------
        X_out : array-like
            - X itself when no columns were selected during fit.
            - The encoder's output for X when every column was selected.
            - Otherwise the unselected columns followed by the encoded
              columns: a DataFrame (with a reset index) when X is a
              DataFrame, else a numpy array.
        """
        if len(self.columns_) == 0:
            return X

        #TODO make this more consistent with sklearn baseimputer/baseencoder

        if len(self.columns_) == X.shape[1]:
            return self.enc.transform(X)

        X_sel, X_not_sel = _X_selected(X, self.columns_)
        X_sel = self.enc.transform(X_sel)

        if isinstance(X, pd.DataFrame):
            # Re-wrap the encoded block so it can be concatenated by position
            # with the untouched columns.
            X_sel = pd.DataFrame(X_sel, columns=self.enc.get_feature_names_out())
            return pd.concat([X_not_sel.reset_index(drop=True), X_sel.reset_index(drop=True)], axis=1)

        return np.hstack((X_not_sel, X_sel))
18 changes: 9 additions & 9 deletions tpot2/config/get_configspace.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,7 @@
from tpot2.builtin_modules import genetic_encoders, feature_encoding_frequency_selector
from tpot2.builtin_modules import AddTransformer, mul_neg_1_Transformer, MulTransformer, SafeReciprocalTransformer, EQTransformer, NETransformer, GETransformer, GTTransformer, LETransformer, LTTransformer, MinTransformer, MaxTransformer, ZeroTransformer, OneTransformer, NTransformer
from tpot2.builtin_modules.genetic_encoders import DominantEncoder, RecessiveEncoder, HeterosisEncoder, UnderDominanceEncoder, OverDominanceEncoder
from tpot2.builtin_modules import ZeroCount, ColumnOneHotEncoder, PassKBinsDiscretizer
from tpot2.builtin_modules import ZeroCount, ColumnOneHotEncoder, ColumnOrdinalEncoder, PassKBinsDiscretizer
from tpot2.builtin_modules import Passthrough, SkipTransformer
from sklearn.linear_model import SGDClassifier, LogisticRegression, SGDRegressor, Ridge, Lasso, ElasticNet, Lars, LassoLars, LassoLarsCV, RidgeCV, ElasticNetCV, PassiveAggressiveClassifier, ARDRegression
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier, ExtraTreesClassifier, GradientBoostingClassifier, ExtraTreesRegressor, ExtraTreesClassifier, AdaBoostRegressor, AdaBoostClassifier, GradientBoostingRegressor,RandomForestRegressor, BaggingRegressor, ExtraTreesRegressor, HistGradientBoostingClassifier, HistGradientBoostingRegressor
Expand All @@ -86,7 +86,7 @@
import sklearn.calibration


all_methods = [SGDClassifier, RandomForestClassifier, ExtraTreesClassifier, GradientBoostingClassifier, MLPClassifier, DecisionTreeClassifier, XGBClassifier, KNeighborsClassifier, SVC, LogisticRegression, LGBMClassifier, LinearSVC, GaussianNB, BernoulliNB, MultinomialNB, ExtraTreesRegressor, RandomForestRegressor, GradientBoostingRegressor, BaggingRegressor, DecisionTreeRegressor, KNeighborsRegressor, XGBRegressor, ZeroCount, ColumnOneHotEncoder, Binarizer, FastICA, FeatureAgglomeration, MaxAbsScaler, MinMaxScaler, Normalizer, Nystroem, PCA, PolynomialFeatures, RBFSampler, RobustScaler, StandardScaler, SelectFwe, SelectPercentile, VarianceThreshold, SGDRegressor, Ridge, Lasso, ElasticNet, Lars, LassoLars, LassoLarsCV, RidgeCV, SVR, LinearSVR, AdaBoostRegressor, GradientBoostingRegressor, RandomForestRegressor, BaggingRegressor, ExtraTreesRegressor, DecisionTreeRegressor, KNeighborsRegressor, ElasticNetCV,
all_methods = [SGDClassifier, RandomForestClassifier, ExtraTreesClassifier, GradientBoostingClassifier, MLPClassifier, DecisionTreeClassifier, XGBClassifier, KNeighborsClassifier, SVC, LogisticRegression, LGBMClassifier, LinearSVC, GaussianNB, BernoulliNB, MultinomialNB, ExtraTreesRegressor, RandomForestRegressor, GradientBoostingRegressor, BaggingRegressor, DecisionTreeRegressor, KNeighborsRegressor, XGBRegressor, ZeroCount, ColumnOneHotEncoder, ColumnOrdinalEncoder, Binarizer, FastICA, FeatureAgglomeration, MaxAbsScaler, MinMaxScaler, Normalizer, Nystroem, PCA, PolynomialFeatures, RBFSampler, RobustScaler, StandardScaler, SelectFwe, SelectPercentile, VarianceThreshold, SGDRegressor, Ridge, Lasso, ElasticNet, Lars, LassoLars, LassoLarsCV, RidgeCV, SVR, LinearSVR, AdaBoostRegressor, GradientBoostingRegressor, RandomForestRegressor, BaggingRegressor, ExtraTreesRegressor, DecisionTreeRegressor, KNeighborsRegressor, ElasticNetCV,
AdaBoostClassifier,MLPRegressor,
GaussianProcessRegressor, HistGradientBoostingClassifier, HistGradientBoostingRegressor,
AddTransformer, mul_neg_1_Transformer, MulTransformer, SafeReciprocalTransformer, EQTransformer, NETransformer, GETransformer, GTTransformer, LETransformer, LTTransformer, MinTransformer, MaxTransformer, ZeroTransformer, OneTransformer, NTransformer,
Expand Down Expand Up @@ -155,19 +155,17 @@
"selectors": ["SelectFwe", "SelectPercentile", "VarianceThreshold",],
"selectors_classification": ["SelectFwe", "SelectPercentile", "VarianceThreshold", "RFE_classification", "SelectFromModel_classification"],
"selectors_regression": ["SelectFwe", "SelectPercentile", "VarianceThreshold", "RFE_regression", "SelectFromModel_regression"],

"classifiers" : ["LGBMClassifier", "BaggingClassifier", 'AdaBoostClassifier', 'BernoulliNB', 'DecisionTreeClassifier', 'ExtraTreesClassifier', 'GaussianNB', 'HistGradientBoostingClassifier', 'KNeighborsClassifier','LinearDiscriminantAnalysis', 'LogisticRegression', 'MLPClassifier', 'MultinomialNB', "QuadraticDiscriminantAnalysis", 'RandomForestClassifier', 'SGDClassifier', 'XGBClassifier'],
"regressors" : ["LGBMRegressor", 'AdaBoostRegressor', "ARDRegression", 'DecisionTreeRegressor', 'ExtraTreesRegressor', 'HistGradientBoostingRegressor', 'KNeighborsRegressor', 'LinearSVR', "MLPRegressor", 'RandomForestRegressor', 'SGDRegressor', 'XGBRegressor'],


"transformers": ["KBinsDiscretizer", "Binarizer", "PCA", "ZeroCount", "ColumnOneHotEncoder", "FastICA", "FeatureAgglomeration", "Nystroem", "RBFSampler", "QuantileTransformer", "PowerTransformer"],
"transformers": ["KBinsDiscretizer", "Binarizer", "PCA", "ZeroCount", "FastICA", "FeatureAgglomeration", "Nystroem", "RBFSampler", "QuantileTransformer", "PowerTransformer", "ColumnOneHotEncoder", "ColumnOrdinalEncoder"],

"scalers": ["MinMaxScaler", "RobustScaler", "StandardScaler", "MaxAbsScaler", "Normalizer", ],
"all_transformers" : ["transformers", "scalers"],

"arithmatic": ["AddTransformer", "mul_neg_1_Transformer", "MulTransformer", "SafeReciprocalTransformer", "EQTransformer", "NETransformer", "GETransformer", "GTTransformer", "LETransformer", "LTTransformer", "MinTransformer", "MaxTransformer"],
"imputers": ["SimpleImputer", "IterativeImputer", "KNNImputer"],
"skrebate": ["ReliefF", "SURF", "SURFstar", "MultiSURF"],
"genetic_encoders": ["DominantEncoder", "RecessiveEncoder", "HeterosisEncoder", "UnderDominanceEncoder", "OverDominanceEncoder"],

"classifiers_sklearnex" : ["RandomForestClassifier_sklearnex", "LogisticRegression_sklearnex", "KNeighborsClassifier_sklearnex", "SVC_sklearnex","NuSVC_sklearnex"],
"regressors_sklearnex" : ["LinearRegression_sklearnex", "Ridge_sklearnex", "Lasso_sklearnex", "ElasticNet_sklearnex", "SVR_sklearnex", "NuSVR_sklearnex", "RandomForestRegressor_sklearnex", "KNeighborsRegressor_sklearnex"],
"genetic encoders" : ["DominantEncoder", "RecessiveEncoder", "HeterosisEncoder", "UnderDominanceEncoder", "OverDominanceEncoder"],
Expand Down Expand Up @@ -352,8 +350,6 @@ def get_configspace(name, n_classes=3, n_samples=1000, n_features=100, random_st
return transformers.get_QuantileTransformer_configspace(n_samples=n_samples, random_state=random_state)
case "RobustScaler":
return transformers.RobustScaler_configspace
case "ColumnOneHotEncoder":
return {}
case "MaxAbsScaler":
return {}
case "PolynomialFeatures":
Expand All @@ -364,6 +360,10 @@ def get_configspace(name, n_classes=3, n_samples=1000, n_features=100, random_st
return transformers.get_passkbinsdiscretizer_configspace(random_state=random_state)
case "KBinsDiscretizer":
return transformers.get_passkbinsdiscretizer_configspace(random_state=random_state)
case "ColumnOneHotEncoder":
return {}
case "ColumnOrdinalEncoder":
return {}

#selectors.py
case "SelectFwe":
Expand Down
2 changes: 2 additions & 0 deletions tpot2/config/transformers.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,8 @@

OneHotEncoder_configspace = {} #TODO include the parameter for max unique values

OrdinalEncoder_configspace = {} #TODO include the parameter for max unique values

def get_FastICA_configspace(n_features=100, random_state=None):

space = {
Expand Down
Loading