diff --git a/tpot/base.py b/tpot/base.py
index 626bb247..64caaeae 100644
--- a/tpot/base.py
+++ b/tpot/base.py
@@ -65,7 +65,7 @@
 from .operator_utils import TPOTOperatorClassFactory, Operator, ARGType
 from .export_utils import export_pipeline, expr_to_tree, generate_pipeline_code
 from .decorators import _pre_test
-from .builtins import CombineDFs, StackingEstimator
+from .builtins import CombineDFs, StackingEstimator, IdentityTransformer
 
 from .config.classifier_light import classifier_config_dict_light
 from .config.regressor_light import regressor_config_dict_light
@@ -110,8 +110,8 @@ class TPOTBase(BaseEstimator):
     def __init__(self, generations=100, population_size=100, offspring_size=None,
                  mutation_rate=0.9, crossover_rate=0.1,
                  scoring=None, cv=5, subsample=1.0, n_jobs=1,
-                 max_time_mins=None, max_eval_time_mins=5,
-                 random_state=None, config_dict=None, template=None,
+                 max_time_mins=None, max_eval_time_mins=5, random_state=None,
+                 config_dict=None, preprocess_config_dict=None, template=None,
                  warm_start=False, memory=None,
                  use_dask=False, periodic_checkpoint_folder=None,
                  early_stop=None, verbosity=0, disable_update_check=False):
@@ -210,6 +210,11 @@ def __init__(self, generations=100, population_size=100, offspring_size=None,
             String 'TPOT sparse':
                 TPOT uses a configuration dictionary with a one-hot-encoder and the
                 operators normally included in TPOT that also support sparse matrices.
+        preprocess_config_dict: a Python dictionary, optional (default: None)
+            Python dictionary:
+                A dictionary containing metadata on features for mandatory preprocessing.
+                Valid keys are 'text_columns', 'numeric_columns', 'categorical_columns',
+                'text_transformer', 'categorical_transformer' and 'impute'.
         template: string (default: None)
             Template of predefined pipeline structure. The option is for specifying a desired structure
             for the machine learning pipeline evaluated in TPOT. So far this option only supports
@@ -286,6 +291,7 @@ def __init__(self, generations=100, population_size=100, offspring_size=None,
         self.periodic_checkpoint_folder = periodic_checkpoint_folder
         self.early_stop = early_stop
         self.config_dict = config_dict
+        self.preprocess_config_dict = preprocess_config_dict
         self.template = template
         self.warm_start = warm_start
         self.memory = memory
@@ -391,5 +397,57 @@ def _setup_config(self, config_dict):
         else:
             self._config_dict = self.default_config_dict
 
+    def _setup_preprocess_config(self, config_dict):
+        if self.template is None:
+            self.template = 'RandomTree'
+        elif 'PreprocessTransformer' in self.template:
+            return
+
+        if config_dict:
+            # check for valid keys...
+            preprocess_keys = ['text_columns', 'numeric_columns', 'categorical_columns',
+                               'text_transformer', 'categorical_transformer', 'impute']
+            column_transform_keys = ['text_columns', 'numeric_columns', 'categorical_columns',
+                                     'text_transformer', 'categorical_transformer']
+            check_keys_ = []
+            column_transform_ = []
+            for k in config_dict.keys():
+                if k not in preprocess_keys:
+                    check_keys_.append(k)
+                if k in column_transform_keys:
+                    column_transform_.append(k)
+
+            if len(check_keys_) > 0:
+                raise ValueError(
+                    'It appears the preprocessing configuration contains invalid keys: {}. '
+                    'Valid keys are text_columns, numeric_columns, categorical_columns, '
+                    'text_transformer, categorical_transformer and impute.'.format(', '.join(check_keys_))
+                )
+
+            # override some settings...
+            if config_dict.get('impute', None) is False:
+                # a pre-fitted identity imputer makes TPOT pass data through unimputed
+                self._fitted_imputer = IdentityTransformer()
+
+            if len(column_transform_) > 0:
+                column_transform_dict = {}
+                for k in ['text_columns', 'numeric_columns', 'categorical_columns']:
+                    if config_dict.get(k) is not None:
+                        column_transform_dict[k] = [config_dict[k]]
+                for k in ['text_transformer', 'categorical_transformer']:
+                    if k in config_dict:
+                        # force transformers to be a list
+                        if isinstance(config_dict[k], list):
+                            column_transform_dict[k] = config_dict[k]
+                        else:
+                            column_transform_dict[k] = [config_dict[k]]
+                self._config_dict['tpot.builtins.PreprocessTransformer'] = column_transform_dict
+                self.config_dict = copy(self._config_dict)
+                # self.template was defaulted to 'RandomTree' above, so prepending always
+                # yields a pipeline that starts with the mandatory preprocessing step
+                self.template = "{}-{}".format('PreprocessTransformer', self.template)
+        else:
+            self._preprocess_config_dict = {}
+
 
     def _read_config_file(self, config_path):
         if os.path.isfile(config_path):
@@ -427,10 +485,11 @@ def _setup_pset(self):
 
     def _add_operators(self):
-        main_type = ["Classifier", "Regressor", "Selector", "Transformer"]
-        ret_types = []
-        self.op_list = []
-        if self.template == None: # default pipeline structure
-            step_in_type = np.ndarray
+        def add_randomtree(ret_types=[]):
+            # default free-form search space; a non-empty ret_types continues
+            # from the output type of the preceding template step
+            if len(ret_types) > 0:
+                step_in_type = ret_types[-1]
+            else:
+                step_in_type = np.ndarray
             step_ret_type = Output_Array
             for operator in self.operators:
                 arg_types = operator.parameter_types()[0][1:]
@@ -445,6 +504,12 @@ def _add_operators(self):
                     self._pset.addPrimitive(operator, *tree_p_types)
                     self._import_hash_and_add_terminals(operator, arg_types)
             self._pset.addPrimitive(CombineDFs(), [step_in_type, step_in_type], step_in_type)
+
+        main_type = ["Classifier", "Regressor", "Selector", "Transformer"]
+        ret_types = []
+        self.op_list = []
+        if self.template is None or self.template == 'RandomTree': # default pipeline structure
+            add_randomtree()
         else:
             gp_types = {}
             for idx, step in enumerate(self._template_comp):
@@ -464,6 +529,8 @@ def _add_operators(self):
                     step_ret_type = Output_Array
                 if step == 'CombineDFs':
                     self._pset.addPrimitive(CombineDFs(), [step_in_type, step_in_type], step_in_type)
+                elif step == 'RandomTree':
+                    add_randomtree(ret_types)
                 elif main_type.count(step): # if the step is a main type
                     for operator in self.operators:
                         arg_types = operator.parameter_types()[0][1:]
@@ -555,6 +622,8 @@ def _fit_init(self):
 
         self._setup_config(self.config_dict)
 
+        self._setup_preprocess_config(self.preprocess_config_dict)
+
         self._setup_template(self.template)
 
         self.operators = []
diff --git a/tpot/builtins/__init__.py b/tpot/builtins/__init__.py
index fe6c7f50..7c907680 100644
--- a/tpot/builtins/__init__.py
+++ b/tpot/builtins/__init__.py
@@ -29,3 +29,4 @@
 from .one_hot_encoder import OneHotEncoder, auto_select_categorical_features, _transform_selected
 from .feature_transformers import CategoricalSelector, ContinuousSelector
 from .feature_set_selector import FeatureSetSelector
+from .preprocessing import IdentityTransformer, PreprocessTransformer
diff --git a/tpot/builtins/preprocessing.py b/tpot/builtins/preprocessing.py
new file mode 100644
index 00000000..25689d74
--- /dev/null
+++ b/tpot/builtins/preprocessing.py
@@ -0,0 +1,101 @@
+
+import sys
+import os
+from importlib import import_module
+
+from sklearn.base import TransformerMixin, BaseEstimator
+from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer, HashingVectorizer
+from sklearn.preprocessing import OneHotEncoder
+from sklearn.compose import ColumnTransformer
+
+import_loader = {
+    'TfidfVectorizer': TfidfVectorizer,
+    'CountVectorizer': CountVectorizer,
+    'HashingVectorizer': HashingVectorizer,
+    'OneHotEncoder': OneHotEncoder
+}
+
+
+def load_scoring_function(scoring_func):
+    """
+    Converts a dotted path such as 'mymodule.myfunc' into the myfunc
+    object itself, so TPOT receives a callable.
+    """
+    if scoring_func and ("." in scoring_func):
+        try:
+            module_name, func_name = scoring_func.rsplit('.', 1)
+
+            module_path = os.getcwd()
+            sys.path.insert(0, module_path)
+            scoring_func = getattr(import_module(module_name), func_name)
+            sys.path.pop(0)
+
+            print('manual scoring function: {}'.format(scoring_func))
+            print('taken from module: {}'.format(module_name))
+        except Exception as e:
+            print('failed importing custom scoring function, error: {}'.format(str(e)))
+            raise ValueError(e)
+
+    return scoring_func
+
+
+class IdentityTransformer(TransformerMixin, BaseEstimator):
+    """Transformer that returns its input unchanged."""
+
+    def fit(self, X, y=None, **fit_params):
+        return self
+
+    def transform(self, X):
+        return X
+
+
+class PreprocessTransformer(TransformerMixin, BaseEstimator):
+    """Routes text, numeric and categorical columns through a ColumnTransformer."""
+
+    def __init__(self, numeric_columns=None, categorical_columns=None, text_columns=None,
+                 text_transformer='TfidfVectorizer', categorical_transformer='OneHotEncoder'):
+        self.numeric_columns = numeric_columns
+        self.categorical_columns = categorical_columns
+        self.text_columns = text_columns
+        self.text_transformer = text_transformer
+        self.categorical_transformer = categorical_transformer
+
+    def _setup_columns(self):
+        def to_list(value):
+            # normalize None or a single column name into a list
+            if value is None:
+                return []
+            if isinstance(value, list):
+                return value
+            return [value]
+
+        numeric_columns = to_list(self.numeric_columns)
+        categorical_columns = to_list(self.categorical_columns)
+        text_columns = to_list(self.text_columns)
+
+        column_list = []
+        if len(text_columns) > 0:
+            load_func = import_loader.get(self.text_transformer, self.text_transformer)
+            if isinstance(load_func, str):
+                load_func = load_scoring_function(load_func)
+            # text vectorizers expect 1-d input, so each column gets its own transformer
+            for idx, text in enumerate(text_columns):
+                column_list.append(('text' + str(idx), load_func(), text))
+
+        if len(numeric_columns) > 0:
+            column_list.append(('numeric', IdentityTransformer(), numeric_columns))
+
+        if len(categorical_columns) > 0:
+            load_func = import_loader.get(self.categorical_transformer, self.categorical_transformer)
+            if isinstance(load_func, str):
+                load_func = load_scoring_function(load_func)
+            column_list.append(('categorical', load_func(), categorical_columns))
+
+        self.column_transformer = ColumnTransformer(column_list)
+
+    def fit(self, X, y=None):
+        self._setup_columns()
+        self.column_transformer.fit(X, y)
+        return self
+
+    def transform(self, X):
+        return self.column_transformer.transform(X)
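
For reviewers, a minimal sketch of how the new preprocess_config_dict argument is meant to be wired up, based only on the keys validated in _setup_preprocess_config. The DataFrame and its column names ('review', 'age', 'country') are hypothetical, and whether fit() accepts raw text columns end to end depends on the rest of this branch:

import pandas as pd
from tpot import TPOTClassifier

train = pd.DataFrame({
    'review':  ['great product', 'poor service', 'works fine', 'broke fast'],
    'age':     [34, 52, 41, 28],
    'country': ['US', 'DE', 'US', 'FR'],
})
target = [1, 0, 1, 0]

tpot = TPOTClassifier(
    generations=5, population_size=20, verbosity=2,
    preprocess_config_dict={
        'text_columns': 'review',                    # vectorized by text_transformer
        'numeric_columns': ['age'],                  # passed through unchanged
        'categorical_columns': ['country'],          # encoded by categorical_transformer
        'text_transformer': 'TfidfVectorizer',
        'categorical_transformer': 'OneHotEncoder',
        'impute': False,                             # swaps in IdentityTransformer as the imputer
    },
)
# template becomes 'PreprocessTransformer-RandomTree'; end-to-end fitting
# depends on the rest of this branch
tpot.fit(train, target)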
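
Independent of the TPOT plumbing, PreprocessTransformer can be exercised standalone, which may help with unit tests. The toy data below is made up, and sklearn >= 0.20 is assumed for ColumnTransformer:

import pandas as pd
from tpot.builtins import PreprocessTransformer

X = pd.DataFrame({
    'review':  ['good product', 'bad service', 'great value'],
    'age':     [34, 52, 41],
    'country': ['US', 'DE', 'US'],
})

pt = PreprocessTransformer(
    numeric_columns=['age'],
    categorical_columns=['country'],
    text_columns='review',      # a bare string is normalized to a one-element list
)
Xt = pt.fit_transform(X)        # tf-idf block + passthrough 'age' + one-hot 'country'
print(Xt.shape)                 # (3, n_tfidf_terms + 1 + n_country_levels)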