Allow for Flexible Preprocessing #897

Open · wants to merge 4 commits into base: development
80 changes: 72 additions & 8 deletions tpot/base.py
@@ -65,7 +65,7 @@
from .operator_utils import TPOTOperatorClassFactory, Operator, ARGType
from .export_utils import export_pipeline, expr_to_tree, generate_pipeline_code
from .decorators import _pre_test
-from .builtins import CombineDFs, StackingEstimator
+from .builtins import CombineDFs, StackingEstimator, IdentityTransformer

from .config.classifier_light import classifier_config_dict_light
from .config.regressor_light import regressor_config_dict_light
@@ -110,8 +110,8 @@ class TPOTBase(BaseEstimator):
    def __init__(self, generations=100, population_size=100, offspring_size=None,
                 mutation_rate=0.9, crossover_rate=0.1,
                 scoring=None, cv=5, subsample=1.0, n_jobs=1,
-                 max_time_mins=None, max_eval_time_mins=5,
-                 random_state=None, config_dict=None, template=None,
+                 max_time_mins=None, max_eval_time_mins=5, random_state=None,
+                 config_dict=None, preprocess_config_dict=None, template=None,
                 warm_start=False, memory=None, use_dask=False,
                 periodic_checkpoint_folder=None, early_stop=None,
                 verbosity=0, disable_update_check=False):
@@ -210,6 +210,10 @@ def __init__(self, generations=100, population_size=100, offspring_size=None,
        String 'TPOT sparse':
            TPOT uses a configuration dictionary with a one-hot-encoder and the
            operators normally included in TPOT that also support sparse matrices.
        preprocess_config_dict: a Python dictionary, optional (default: None)
            Python dictionary:
                A dictionary containing metadata on features for mandatory preprocessing.
                Examples coming soon.
        template: string (default: None)
            Template of predefined pipeline structure. The option is for specifying a desired structure
            for the machine learning pipeline evaluated in TPOT. So far this option only supports
@@ -286,6 +290,7 @@ def __init__(self, generations=100, population_size=100, offspring_size=None,
        self.periodic_checkpoint_folder = periodic_checkpoint_folder
        self.early_stop = early_stop
        self.config_dict = config_dict
        self.preprocess_config_dict = preprocess_config_dict
        self.template = template
        self.warm_start = warm_start
        self.memory = memory
@@ -391,6 +396,55 @@ def _setup_config(self, config_dict):
        else:
            self._config_dict = self.default_config_dict

    def _setup_preprocess_config(self, config_dict):
        if self.template is None:
            self.template = 'RandomTree'
        elif 'PreprocessTransformer' in self.template:
            return
        if config_dict:
            # check for valid keys; the transformer keys must be accepted here too,
            # since they are handled further below
            preprocess_keys = ['text_columns', 'numeric_columns', 'categorical_columns',
                               'impute', 'text_transformer', 'categorical_transformer']
            column_transform_keys = ['text_columns', 'numeric_columns', 'categorical_columns',
                                     'text_transformer', 'categorical_transformer']
            check_keys_ = []
            column_transform_ = []
            for k in config_dict.keys():
                if k not in preprocess_keys:
                    check_keys_.append(k)
                if k in column_transform_keys:
                    column_transform_.append(k)

            if len(check_keys_) > 0:
                raise Exception(
                    'It appears the preprocessing configuration contains invalid keys: {}. '
                    'Valid keys are text_columns, numeric_columns, categorical_columns, '
                    'impute, text_transformer, categorical_transformer.'.format(', '.join(check_keys_))
                )

            # override some settings...
            if config_dict.get('impute', None) is False:

Contributor Author: This line deals with #836 - it might be overloading this PR and might be an item for later?

Contributor: I think it is more related to #889. I think we need to add imputation into config_dict too. We may allow TPOT to skip imputation if the pipeline only has XGBClassifier or XGBRegressor.

                self._fitted_imputer = IdentityTransformer()

            if len(column_transform_) > 0:
                column_transform_dict = {}
                for k in ['text_columns', 'numeric_columns', 'categorical_columns']:
                    if config_dict.get(k) is not None:
                        column_transform_dict[k] = [config_dict[k]]
                for k in ['text_transformer', 'categorical_transformer']:
                    if k in config_dict:
                        # force transformers to be a list
                        if isinstance(config_dict[k], list):
                            column_transform_dict[k] = config_dict[k]
                        else:
                            column_transform_dict[k] = [config_dict[k]]
                self._config_dict['tpot.builtins.PreprocessTransformer'] = column_transform_dict

Contributor Author: This injection could be dangerous - do we have opinions on how it is supposed to be handled?

Contributor: I think preprocess_config_dict should be an argument within PreprocessTransformer instead of TPOT. And users should be able to customize it via config_dict (a rough sketch of this appears just after the method below).

Contributor Author: Yes, this is certainly possible (and technically possible right now, with no changes to TPOT master) via the use of templates - I think the question arises related to #507: whether it's possible to have a "built-in" configuration with text or not.

Maybe the answer is that we can't.

            self.config_dict = copy(self._config_dict)
            if self.template is None:
                self.template = "PreprocessTransformer-RandomTree"
            else:
                self.template = "{}-{}".format('PreprocessTransformer', self.template)
        else:
            self._preprocess_config_dict = {}
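
The docstring above still says "Examples coming soon", so here is a rough sketch of what a preprocess_config_dict might look like, inferred only from the keys this method validates; the column names and values are hypothetical:

from tpot import TPOTClassifier

preprocess_config = {
    'numeric_columns': ['age', 'income'],   # passed through unchanged
    'categorical_columns': ['city'],        # encoded by categorical_transformer
    'text_columns': 'review',               # vectorized by text_transformer
    'text_transformer': 'TfidfVectorizer',  # an import_loader key or a dotted path
    'impute': False,                        # swaps the imputer for IdentityTransformer
}

tpot = TPOTClassifier(preprocess_config_dict=preprocess_config)
# _setup_preprocess_config injects tpot.builtins.PreprocessTransformer into the
# operator config and prefixes the template, e.g. 'PreprocessTransformer-RandomTree'.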

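For comparison, a rough sketch of the reviewer's suggestion from the thread above - configuring PreprocessTransformer through config_dict rather than a new TPOT argument. The nested values are lists because that is the wrapping _setup_preprocess_config itself performs; none of this exists in TPOT master:

custom_config = {
    'tpot.builtins.PreprocessTransformer': {
        'text_columns': [['review']],
        'categorical_columns': [['city']],
        'text_transformer': ['TfidfVectorizer'],
    },
    # ...plus the usual estimator/transformer entries...
}

tpot = TPOTClassifier(config_dict=custom_config,
                      template='PreprocessTransformer-RandomTree')
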

    def _read_config_file(self, config_path):
        if os.path.isfile(config_path):
@@ -427,11 +481,11 @@ def _setup_pset(self):


    def _add_operators(self):
-        main_type = ["Classifier", "Regressor", "Selector", "Transformer"]
-        ret_types = []
-        self.op_list = []
-        if self.template == None: # default pipeline structure
-            step_in_type = np.ndarray
+        def add_randomtree(ret_types=[]):
+            if len(ret_types) > 0:
+                step_in_type = ret_types[-1]
+            else:
+                step_in_type = np.ndarray
            step_ret_type = Output_Array
            for operator in self.operators:
                arg_types = operator.parameter_types()[0][1:]
@@ -445,6 +499,12 @@ def _add_operators(self):
                    self._pset.addPrimitive(operator, *tree_p_types)
                self._import_hash_and_add_terminals(operator, arg_types)
            self._pset.addPrimitive(CombineDFs(), [step_in_type, step_in_type], step_in_type)

main_type = ["Classifier", "Regressor", "Selector", "Transformer"]
ret_types = []
self.op_list = []
if self.template == None or self.template == 'RandomTree': # default pipeline structure
add_randomtree()
else:
gp_types = {}
for idx, step in enumerate(self._template_comp):
@@ -464,6 +524,8 @@ def _add_operators(self):
                step_ret_type = Output_Array
                if step == 'CombineDFs':
                    self._pset.addPrimitive(CombineDFs(), [step_in_type, step_in_type], step_in_type)
                elif step == 'RandomTree':
                    add_randomtree(ret_types)
                elif main_type.count(step):  # if the step is a main type
                    for operator in self.operators:
                        arg_types = operator.parameter_types()[0][1:]
@@ -555,6 +617,8 @@ def _fit_init(self):

        self._setup_config(self.config_dict)

        self._setup_preprocess_config(self.preprocess_config_dict)

        self._setup_template(self.template)

        self.operators = []
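With 'RandomTree' now accepted as a template step (see _add_operators above), a template can pin a preprocessing front-end while leaving the remainder of the pipeline free to evolve. A minimal, illustrative invocation - the numeric arguments are arbitrary:

from tpot import TPOTClassifier

tpot = TPOTClassifier(
    generations=5,
    population_size=20,
    template='PreprocessTransformer-RandomTree',  # fixed first step, evolved remainder
    verbosity=2,
)
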
1 change: 1 addition & 0 deletions tpot/builtins/__init__.py
@@ -29,3 +29,4 @@
from .one_hot_encoder import OneHotEncoder, auto_select_categorical_features, _transform_selected
from .feature_transformers import CategoricalSelector, ContinuousSelector
from .feature_set_selector import FeatureSetSelector
from .preprocessing import IdentityTransformer, PreprocessTransformer
93 changes: 93 additions & 0 deletions tpot/builtins/preprocessing.py
@@ -0,0 +1,93 @@

import sys
import os
from importlib import import_module

from sklearn.base import TransformerMixin, BaseEstimator
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer, HashingVectorizer
from sklearn.preprocessing import OneHotEncoder

# maps the short names accepted in configs to the vectorizer/encoder classes
import_loader = {
    'TfidfVectorizer': TfidfVectorizer,
    'CountVectorizer': CountVectorizer,
    'HashingVectorizer': HashingVectorizer,
    'OneHotEncoder': OneHotEncoder
}


def load_scoring_function(scoring_func):

Contributor Author: Help with getting relative imports working would be appreciated here... this is copied from tpot.driver.load_scoring_function.

Contributor: Hmm, directly importing via from ..driver import load_scoring_function will cause some conflicts with

from .tpot import TPOTClassifier, TPOTRegressor
from ._version import __version__

A workaround is to move this function to tpot/metrics.py and then add from ..metrics import load_scoring_function to tpot/builtins/preprocessing.py.

"""
converts mymodule.myfunc in the myfunc
object itself so tpot receives a scoring function
"""
if scoring_func and ("." in scoring_func):
try:
module_name, func_name = scoring_func.rsplit('.', 1)

module_path = os.getcwd()
sys.path.insert(0, module_path)
scoring_func = getattr(import_module(module_name), func_name)
sys.path.pop(0)

print('manual scoring function: {}'.format(scoring_func))
print('taken from module: {}'.format(module_name))
except Exception as e:
print('failed importing custom scoring function, error: {}'.format(str(e)))
raise ValueError(e)

return scoring_func
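
An illustrative call, where my_metrics.weighted_f1 is a hypothetical function defined in a my_metrics.py in the current working directory:

scorer = load_scoring_function('my_metrics.weighted_f1')
# rsplits on the last '.', temporarily prepends os.getcwd() to sys.path,
# and returns the resolved function object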

class IdentityTransformer(TransformerMixin, BaseEstimator):
    """Identity-transformer for doing literally nothing"""

    def fit(self, X, y=None, **fit_params):
        return self

    def transform(self, X):
        return X

class PreprocessTransformer(TransformerMixin):
    def __init__(self, numeric_columns=[], categorical_columns=[], text_columns=[],
                 text_transformer='TfidfVectorizer', categorical_transformer='OneHotEncoder'):
        self.numeric_columns = numeric_columns
        self.categorical_columns = categorical_columns
        self.text_columns = text_columns
        self.text_transformer = text_transformer
        self.categorical_transformer = categorical_transformer

    def _setup_columns(self):
        def text_to_list(text):
            if isinstance(text, list):
                return text
            return [text]

        self.numeric_columns = text_to_list(self.numeric_columns)
        self.categorical_columns = text_to_list(self.categorical_columns)
        self.text_columns = text_to_list(self.text_columns)

        column_list = []
        if len(self.text_columns) > 0:
            load_func = import_loader.get(self.text_transformer, self.text_transformer)
            if isinstance(load_func, str):
                load_func = load_scoring_function(load_func)
            for idx, text in enumerate(self.text_columns):
                # each text column gets its own vectorizer, keyed 'text0', 'text1', ...
                column_list.append(('text' + str(idx), load_func(), text))

        if len(self.numeric_columns) > 0:
            column_list.append(('numeric', IdentityTransformer(), self.numeric_columns))

        if len(self.categorical_columns) > 0:
            load_func = import_loader.get(self.categorical_transformer, self.categorical_transformer)
            if isinstance(load_func, str):
                load_func = load_scoring_function(load_func)
            column_list.append(('categorical', load_func(), self.categorical_columns))

        self.column_transformer = ColumnTransformer(column_list)

    def fit(self, X, y=None):
        self._setup_columns()
        self.column_transformer.fit(X, y)
        return self

    def transform(self, X):
        return self.column_transformer.transform(X)
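
A minimal sketch of the new transformer in isolation, assuming a pandas DataFrame input; the column names and data are hypothetical:

import pandas as pd

from tpot.builtins import PreprocessTransformer

df = pd.DataFrame({
    'age': [23, 45, 31],
    'city': ['NY', 'SF', 'NY'],
    'review': ['good service', 'too slow', 'really great'],
})

pre = PreprocessTransformer(
    numeric_columns=['age'],       # passed through IdentityTransformer
    categorical_columns=['city'],  # one-hot encoded by default
    text_columns='review',         # a single name is wrapped into a list by _setup_columns
)
X_t = pre.fit(df).transform(df)    # delegates to the assembled ColumnTransformer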