diff --git a/ImputerExperiments/data/r/.DS_Store b/ImputerExperiments/data/r/.DS_Store new file mode 100644 index 00000000..605eba82 Binary files /dev/null and b/ImputerExperiments/data/r/.DS_Store differ diff --git a/setup.py b/setup.py index b1d194f8..de5e960b 100644 --- a/setup.py +++ b/setup.py @@ -3,7 +3,6 @@ #TODO update this from setuptools import setup, find_packages - def calculate_version(): initpy = open('tpot2/_version.py').read().split('\n') version = list(filter(lambda x: '__version__' in x, initpy))[0].split('\'')[1] diff --git a/tpot2/builtin_modules/__init__.py b/tpot2/builtin_modules/__init__.py index eff7a085..b4b8bac9 100644 --- a/tpot2/builtin_modules/__init__.py +++ b/tpot2/builtin_modules/__init__.py @@ -1,6 +1,6 @@ from .feature_set_selector import FeatureSetSelector from .zero_count import ZeroCount -from .column_one_hot_encoder import ColumnOneHotEncoder +from .column_one_hot_encoder import ColumnOneHotEncoder, ColumnOrdinalEncoder from .arithmetictransformer import ArithmeticTransformer from .arithmetictransformer import AddTransformer, mul_neg_1_Transformer, MulTransformer, SafeReciprocalTransformer, EQTransformer, NETransformer, GETransformer, GTTransformer, LETransformer, LTTransformer, MinTransformer, MaxTransformer, ZeroTransformer, OneTransformer, NTransformer from .passthrough import Passthrough, SkipTransformer diff --git a/tpot2/builtin_modules/column_one_hot_encoder.py b/tpot2/builtin_modules/column_one_hot_encoder.py index 0fdbc810..1854c355 100644 --- a/tpot2/builtin_modules/column_one_hot_encoder.py +++ b/tpot2/builtin_modules/column_one_hot_encoder.py @@ -37,7 +37,7 @@ from sklearn.base import BaseEstimator, TransformerMixin from sklearn.utils import check_array -from sklearn.preprocessing import OneHotEncoder +from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder import sklearn import pandas as pd @@ -203,3 +203,138 @@ def transform(self, X): return pd.concat([X_not_sel.reset_index(drop=True), 
X_sel.reset_index(drop=True)], axis=1) else: return np.hstack((X_not_sel, X_sel)) + +class ColumnOrdinalEncoder(BaseEstimator, TransformerMixin): + + + def __init__(self, columns='auto', handle_unknown='error', unknown_value = -1, encoded_missing_value = np.nan, min_frequency=None,max_categories=None): + ''' + + Parameters + ---------- + + columns : str, list, default='auto' + Determines which columns to ordinal encode with sklearn.preprocessing.OrdinalEncoder. + - 'auto' : Automatically select categorical features based on columns with less than 10 unique values + - 'categorical' : Automatically select categorical features + - 'numeric' : Automatically select numeric features + - 'all' : Select all features + - list : A list of columns to select + + handle_unknown, unknown_value, encoded_missing_value, min_frequency, max_categories : see sklearn.preprocessing.OrdinalEncoder + + ''' + + self.columns = columns + self.handle_unknown = handle_unknown + self.unknown_value = unknown_value + self.encoded_missing_value = encoded_missing_value + self.min_frequency = min_frequency + self.max_categories = max_categories + + + + def fit(self, X, y=None): + """Fit OrdinalEncoder to X. + + Selects the columns given by the columns parameter and fits a + sklearn.preprocessing.OrdinalEncoder on them. Returns self. + + Parameters + ---------- + X : array-like or sparse matrix, shape=(n_samples, n_features) + Dense array or sparse matrix. + y: array-like {n_samples,} (Optional, ignored) + Feature labels + """ + + if (self.columns == "categorical" or self.columns == "numeric") and not isinstance(X, pd.DataFrame): + raise ValueError(f"Invalid value for columns: {self.columns}. 
" + "Only 'all' or is supported for np arrays") + + if self.columns == "categorical": + self.columns_ = list(X.select_dtypes(exclude='number').columns) + elif self.columns == "numeric": + self.columns_ = [col for col in X.columns if is_numeric_dtype(X[col])] + elif self.columns == "auto": + self.columns_ = auto_select_categorical_features(X) + elif self.columns == "all": + if isinstance(X, pd.DataFrame): + self.columns_ = X.columns + else: + self.columns_ = list(range(X.shape[1])) + elif isinstance(self.columns, list): + self.columns_ = self.columns + else: + raise ValueError(f"Invalid value for columns: {self.columns}") + + if len(self.columns_) == 0: + return self + + self.enc = sklearn.preprocessing.OrdinalEncoder(categories='auto', + handle_unknown = self.handle_unknown, + unknown_value = self.unknown_value, + encoded_missing_value = self.encoded_missing_value, + min_frequency = self.min_frequency, + max_categories = self.max_categories) + #TODO make this more consistent with sklearn baseimputer/baseencoder + ''' + if isinstance(X, pd.DataFrame): + self.enc.set_output(transform="pandas") + for col in X.columns: + # check if the column name is not a string + if not isinstance(col, str): + # if it's not a string, rename the column with "X" prefix + X.rename(columns={col: f"X{col}"}, inplace=True) + ''' + + if len(self.columns_) == X.shape[1]: + X_sel = self.enc.fit(X) + else: + X_sel, X_not_sel = _X_selected(X, self.columns_) + X_sel = self.enc.fit(X_sel) + + return self + + def transform(self, X): + """Transform X using one-hot encoding. + + Parameters + ---------- + X : array-like or sparse matrix, shape=(n_samples, n_features) + Dense array or sparse matrix. + + Returns + ------- + X_out : sparse matrix if sparse=True else a 2-d array, dtype=int + Transformed input. 
+ """ + + + if len(self.columns_) == 0: + return X + + #TODO make this more consistent with sklearn baseimputer/baseencoder + ''' + if isinstance(X, pd.DataFrame): + for col in X.columns: + # check if the column name is not a string + if not isinstance(col, str): + # if it's not a string, rename the column with "X" prefix + X.rename(columns={col: f"X{col}"}, inplace=True) + ''' + + if len(self.columns_) == X.shape[1]: + return self.enc.transform(X) + else: + + X_sel, X_not_sel= _X_selected(X, self.columns_) + X_sel = self.enc.transform(X_sel) + + #If X is dataframe + if isinstance(X, pd.DataFrame): + + X_sel = pd.DataFrame(X_sel, columns=self.enc.get_feature_names_out()) + return pd.concat([X_not_sel.reset_index(drop=True), X_sel.reset_index(drop=True)], axis=1) + else: + return np.hstack((X_not_sel, X_sel)) \ No newline at end of file diff --git a/tpot2/config/get_configspace.py b/tpot2/config/get_configspace.py index 72eac9a6..8f73c06a 100644 --- a/tpot2/config/get_configspace.py +++ b/tpot2/config/get_configspace.py @@ -60,7 +60,7 @@ from tpot2.builtin_modules import genetic_encoders, feature_encoding_frequency_selector from tpot2.builtin_modules import AddTransformer, mul_neg_1_Transformer, MulTransformer, SafeReciprocalTransformer, EQTransformer, NETransformer, GETransformer, GTTransformer, LETransformer, LTTransformer, MinTransformer, MaxTransformer, ZeroTransformer, OneTransformer, NTransformer from tpot2.builtin_modules.genetic_encoders import DominantEncoder, RecessiveEncoder, HeterosisEncoder, UnderDominanceEncoder, OverDominanceEncoder -from tpot2.builtin_modules import ZeroCount, ColumnOneHotEncoder, PassKBinsDiscretizer +from tpot2.builtin_modules import ZeroCount, ColumnOneHotEncoder, ColumnOrdinalEncoder, PassKBinsDiscretizer from tpot2.builtin_modules import Passthrough, SkipTransformer from sklearn.linear_model import SGDClassifier, LogisticRegression, SGDRegressor, Ridge, Lasso, ElasticNet, Lars, LassoLars, LassoLarsCV, RidgeCV, ElasticNetCV, 
PassiveAggressiveClassifier, ARDRegression from sklearn.ensemble import BaggingClassifier, RandomForestClassifier, ExtraTreesClassifier, GradientBoostingClassifier, ExtraTreesRegressor, ExtraTreesClassifier, AdaBoostRegressor, AdaBoostClassifier, GradientBoostingRegressor,RandomForestRegressor, BaggingRegressor, ExtraTreesRegressor, HistGradientBoostingClassifier, HistGradientBoostingRegressor @@ -86,7 +86,7 @@ import sklearn.calibration -all_methods = [SGDClassifier, RandomForestClassifier, ExtraTreesClassifier, GradientBoostingClassifier, MLPClassifier, DecisionTreeClassifier, XGBClassifier, KNeighborsClassifier, SVC, LogisticRegression, LGBMClassifier, LinearSVC, GaussianNB, BernoulliNB, MultinomialNB, ExtraTreesRegressor, RandomForestRegressor, GradientBoostingRegressor, BaggingRegressor, DecisionTreeRegressor, KNeighborsRegressor, XGBRegressor, ZeroCount, ColumnOneHotEncoder, Binarizer, FastICA, FeatureAgglomeration, MaxAbsScaler, MinMaxScaler, Normalizer, Nystroem, PCA, PolynomialFeatures, RBFSampler, RobustScaler, StandardScaler, SelectFwe, SelectPercentile, VarianceThreshold, SGDRegressor, Ridge, Lasso, ElasticNet, Lars, LassoLars, LassoLarsCV, RidgeCV, SVR, LinearSVR, AdaBoostRegressor, GradientBoostingRegressor, RandomForestRegressor, BaggingRegressor, ExtraTreesRegressor, DecisionTreeRegressor, KNeighborsRegressor, ElasticNetCV, +all_methods = [SGDClassifier, RandomForestClassifier, ExtraTreesClassifier, GradientBoostingClassifier, MLPClassifier, DecisionTreeClassifier, XGBClassifier, KNeighborsClassifier, SVC, LogisticRegression, LGBMClassifier, LinearSVC, GaussianNB, BernoulliNB, MultinomialNB, ExtraTreesRegressor, RandomForestRegressor, GradientBoostingRegressor, BaggingRegressor, DecisionTreeRegressor, KNeighborsRegressor, XGBRegressor, ZeroCount, ColumnOneHotEncoder, ColumnOrdinalEncoder, Binarizer, FastICA, FeatureAgglomeration, MaxAbsScaler, MinMaxScaler, Normalizer, Nystroem, PCA, PolynomialFeatures, RBFSampler, RobustScaler, StandardScaler, 
SelectFwe, SelectPercentile, VarianceThreshold, SGDRegressor, Ridge, Lasso, ElasticNet, Lars, LassoLars, LassoLarsCV, RidgeCV, SVR, LinearSVR, AdaBoostRegressor, GradientBoostingRegressor, RandomForestRegressor, BaggingRegressor, ExtraTreesRegressor, DecisionTreeRegressor, KNeighborsRegressor, ElasticNetCV, AdaBoostClassifier,MLPRegressor, GaussianProcessRegressor, HistGradientBoostingClassifier, HistGradientBoostingRegressor, AddTransformer, mul_neg_1_Transformer, MulTransformer, SafeReciprocalTransformer, EQTransformer, NETransformer, GETransformer, GTTransformer, LETransformer, LTTransformer, MinTransformer, MaxTransformer, ZeroTransformer, OneTransformer, NTransformer, @@ -155,19 +155,17 @@ "selectors": ["SelectFwe", "SelectPercentile", "VarianceThreshold",], "selectors_classification": ["SelectFwe", "SelectPercentile", "VarianceThreshold", "RFE_classification", "SelectFromModel_classification"], "selectors_regression": ["SelectFwe", "SelectPercentile", "VarianceThreshold", "RFE_regression", "SelectFromModel_regression"], + "classifiers" : ["LGBMClassifier", "BaggingClassifier", 'AdaBoostClassifier', 'BernoulliNB', 'DecisionTreeClassifier', 'ExtraTreesClassifier', 'GaussianNB', 'HistGradientBoostingClassifier', 'KNeighborsClassifier','LinearDiscriminantAnalysis', 'LogisticRegression', 'MLPClassifier', 'MultinomialNB', "QuadraticDiscriminantAnalysis", 'RandomForestClassifier', 'SGDClassifier', 'XGBClassifier'], "regressors" : ["LGBMRegressor", 'AdaBoostRegressor', "ARDRegression", 'DecisionTreeRegressor', 'ExtraTreesRegressor', 'HistGradientBoostingRegressor', 'KNeighborsRegressor', 'LinearSVR', "MLPRegressor", 'RandomForestRegressor', 'SGDRegressor', 'XGBRegressor'], - - - "transformers": ["KBinsDiscretizer", "Binarizer", "PCA", "ZeroCount", "ColumnOneHotEncoder", "FastICA", "FeatureAgglomeration", "Nystroem", "RBFSampler", "QuantileTransformer", "PowerTransformer"], + "transformers": ["KBinsDiscretizer", "Binarizer", "PCA", "ZeroCount", "FastICA", 
"FeatureAgglomeration", "Nystroem", "RBFSampler", "QuantileTransformer", "PowerTransformer", "ColumnOneHotEncoder", "ColumnOrdinalEncoder"], + "scalers": ["MinMaxScaler", "RobustScaler", "StandardScaler", "MaxAbsScaler", "Normalizer", ], "all_transformers" : ["transformers", "scalers"], - "arithmatic": ["AddTransformer", "mul_neg_1_Transformer", "MulTransformer", "SafeReciprocalTransformer", "EQTransformer", "NETransformer", "GETransformer", "GTTransformer", "LETransformer", "LTTransformer", "MinTransformer", "MaxTransformer"], "imputers": ["SimpleImputer", "IterativeImputer", "KNNImputer"], "skrebate": ["ReliefF", "SURF", "SURFstar", "MultiSURF"], "genetic_encoders": ["DominantEncoder", "RecessiveEncoder", "HeterosisEncoder", "UnderDominanceEncoder", "OverDominanceEncoder"], - "classifiers_sklearnex" : ["RandomForestClassifier_sklearnex", "LogisticRegression_sklearnex", "KNeighborsClassifier_sklearnex", "SVC_sklearnex","NuSVC_sklearnex"], "regressors_sklearnex" : ["LinearRegression_sklearnex", "Ridge_sklearnex", "Lasso_sklearnex", "ElasticNet_sklearnex", "SVR_sklearnex", "NuSVR_sklearnex", "RandomForestRegressor_sklearnex", "KNeighborsRegressor_sklearnex"], "genetic encoders" : ["DominantEncoder", "RecessiveEncoder", "HeterosisEncoder", "UnderDominanceEncoder", "OverDominanceEncoder"], @@ -352,8 +350,6 @@ def get_configspace(name, n_classes=3, n_samples=1000, n_features=100, random_st return transformers.get_QuantileTransformer_configspace(n_samples=n_samples, random_state=random_state) case "RobustScaler": return transformers.RobustScaler_configspace - case "ColumnOneHotEncoder": - return {} case "MaxAbsScaler": return {} case "PolynomialFeatures": @@ -364,6 +360,10 @@ def get_configspace(name, n_classes=3, n_samples=1000, n_features=100, random_st return transformers.get_passkbinsdiscretizer_configspace(random_state=random_state) case "KBinsDiscretizer": return transformers.get_passkbinsdiscretizer_configspace(random_state=random_state) + case 
"ColumnOneHotEncoder": + return {} + case "ColumnOrdinalEncoder": + return {} #selectors.py case "SelectFwe": diff --git a/tpot2/config/transformers.py b/tpot2/config/transformers.py index 08c9a7aa..bbbd1392 100644 --- a/tpot2/config/transformers.py +++ b/tpot2/config/transformers.py @@ -62,6 +62,8 @@ OneHotEncoder_configspace = {} #TODO include the parameter for max unique values +OrdinalEncoder_configspace = {} #TODO include the parameter for max unique values + def get_FastICA_configspace(n_features=100, random_state=None): space = {