Allow for Flexible Preprocessing #897
base: development
@@ -65,7 +65,7 @@
 from .operator_utils import TPOTOperatorClassFactory, Operator, ARGType
 from .export_utils import export_pipeline, expr_to_tree, generate_pipeline_code
 from .decorators import _pre_test
-from .builtins import CombineDFs, StackingEstimator
+from .builtins import CombineDFs, StackingEstimator, IdentityTransformer

 from .config.classifier_light import classifier_config_dict_light
 from .config.regressor_light import regressor_config_dict_light
@@ -110,8 +110,8 @@ class TPOTBase(BaseEstimator):
     def __init__(self, generations=100, population_size=100, offspring_size=None,
                  mutation_rate=0.9, crossover_rate=0.1,
                  scoring=None, cv=5, subsample=1.0, n_jobs=1,
-                 max_time_mins=None, max_eval_time_mins=5,
-                 random_state=None, config_dict=None, template=None,
+                 max_time_mins=None, max_eval_time_mins=5, random_state=None,
+                 config_dict=None, preprocess_config_dict=None, template=None,
                  warm_start=False, memory=None, use_dask=False,
                  periodic_checkpoint_folder=None, early_stop=None,
                  verbosity=0, disable_update_check=False):
@@ -210,6 +210,10 @@ def __init__(self, generations=100, population_size=100, offspring_size=None,
         String 'TPOT sparse':
             TPOT uses a configuration dictionary with a one-hot-encoder and the
             operators normally included in TPOT that also support sparse matrices.
+    preprocess_config_dict: a Python dictionary, optional (default: None)
+        Python dictionary:
+            A dictionary containing metadata on features for mandatory preprocessing.
+            Examples coming soon.
     template: string (default: None)
         Template of predefined pipeline structure. The option is for specifying a desired structure
         for the machine learning pipeline evaluated in TPOT. So far this option only supports
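Since the docstring's examples are still "coming soon", a plausible shape for the dictionary can be read off the validation logic in `_setup_preprocess_config` below; the column names here are invented for illustration:

```python
# Hypothetical preprocess_config_dict; only 'text_columns', 'numeric_columns',
# 'categorical_columns' and 'impute' pass the key check below.
preprocess_config = {
    'text_columns': 'review_text',         # column(s) routed to a text vectorizer
    'numeric_columns': ['age', 'income'],  # passed through unchanged
    'categorical_columns': ['color'],      # one-hot encoded by default
    'impute': False,                       # swaps the fitted imputer for IdentityTransformer
}
```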
@@ -286,6 +290,7 @@ def __init__(self, generations=100, population_size=100, offspring_size=None,
         self.periodic_checkpoint_folder = periodic_checkpoint_folder
         self.early_stop = early_stop
         self.config_dict = config_dict
+        self.preprocess_config_dict = preprocess_config_dict
         self.template = template
         self.warm_start = warm_start
         self.memory = memory
@@ -391,6 +396,55 @@ def _setup_config(self, config_dict):
         else:
             self._config_dict = self.default_config_dict

+    def _setup_preprocess_config(self, config_dict):
+        if self.template is None:
+            self.template = 'RandomTree'
+        elif 'PreprocessTransformer' in self.template:
+            return
+        if config_dict:
+            # check for valid keys...
+            preprocess_keys = ['text_columns', 'numeric_columns', 'categorical_columns', 'impute']
+            column_transform_keys = ['text_columns', 'numeric_columns', 'categorical_columns', 'text_transformer', 'categorical_transformer']
+            check_keys_ = []
+            column_transform_ = []
+            for k in config_dict.keys():
+                if k not in preprocess_keys:
+                    check_keys_.append(k)
+                if k in column_transform_keys:
+                    column_transform_.append(k)
+
+            if len(check_keys_) > 0:
+                raise Exception(
+                    'It appears preprocessing configuration contains invalid keys: {}. '
+                    'Valid keys are text_columns, numeric_columns, '
+                    'categorical_columns, impute.'.format(', '.join(check_keys_))
+                )
+
+            # override some settings...
+            if config_dict.get('impute', None) is False:
+                self._fitted_imputer = IdentityTransformer()
+
+            if len(column_transform_) > 0:
+                column_transform_dict = {}
+                for k in ['text_columns', 'numeric_columns', 'categorical_columns']:
+                    if config_dict.get(k) is not None:
+                        column_transform_dict[k] = [config_dict[k]]
+                for k in ['text_transformer', 'categorical_transformer']:
+                    if k in config_dict:
+                        # force transformers to be a list
+                        if isinstance(config_dict[k], list):
+                            column_transform_dict[k] = config_dict[k]
+                        else:
+                            column_transform_dict[k] = [config_dict[k]]
+                self._config_dict['tpot.builtins.PreprocessTransformer'] = column_transform_dict
[Review comment] This injection could be dangerous - do we have opinions on how it is supposed to be handled?

[Review comment] I think …

[Review comment] Yes, this is certainly possible (and possible right now, with no changes to TPOT master, technically) via the use of templates. I think the question arises related to #507: whether it's possible to have a "built-in" configuration with text or not. Maybe the answer is we can't.
+            self.config_dict = copy(self._config_dict)
+            if self.template is None:
+                self.template = "PreprocessTransformer-RandomTree"
+            else:
+                self.template = "{}-{}".format('PreprocessTransformer', self.template)
+        else:
+            self._preprocess_config_dict = {}
+
     def _read_config_file(self, config_path):
         if os.path.isfile(config_path):
@@ -427,11 +481,11 @@ def _setup_pset(self):


     def _add_operators(self):
-        main_type = ["Classifier", "Regressor", "Selector", "Transformer"]
-        ret_types = []
-        self.op_list = []
-        if self.template == None:  # default pipeline structure
-            step_in_type = np.ndarray
+        def add_randomtree(ret_types=[]):
+            if len(ret_types) > 0:
+                step_in_type = ret_types[-1]
+            else:
+                step_in_type = np.ndarray
             step_ret_type = Output_Array
             for operator in self.operators:
                 arg_types = operator.parameter_types()[0][1:]
@@ -445,6 +499,12 @@ def _add_operators(self):
                 self._pset.addPrimitive(operator, *tree_p_types)
                 self._import_hash_and_add_terminals(operator, arg_types)
             self._pset.addPrimitive(CombineDFs(), [step_in_type, step_in_type], step_in_type)
+
+        main_type = ["Classifier", "Regressor", "Selector", "Transformer"]
+        ret_types = []
+        self.op_list = []
+        if self.template == None or self.template == 'RandomTree':  # default pipeline structure
+            add_randomtree()
         else:
             gp_types = {}
             for idx, step in enumerate(self._template_comp):
@@ -464,6 +524,8 @@ def _add_operators(self):
                 step_ret_type = Output_Array
             if step == 'CombineDFs':
                 self._pset.addPrimitive(CombineDFs(), [step_in_type, step_in_type], step_in_type)
+            elif step == 'RandomTree':
+                add_randomtree(ret_types)
             elif main_type.count(step):  # if the step is a main type
                 for operator in self.operators:
                     arg_types = operator.parameter_types()[0][1:]
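Together, these hunks let 'RandomTree' act both as the implicit default and as an explicit template step, expanding into TPOT's usual free-form GP-evolved subtree. An illustrative call, assuming the existing template API is otherwise unchanged:

```python
# The evolved subtree is constrained to come after the fixed preprocessor step.
tpot = TPOTClassifier(template='PreprocessTransformer-RandomTree')
```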
@@ -555,6 +617,8 @@ def _fit_init(self):

         self._setup_config(self.config_dict)

+        self._setup_preprocess_config(self.preprocess_config_dict)
+
         self._setup_template(self.template)

         self.operators = []
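Note the ordering in `_fit_init`: `_setup_preprocess_config` must run after `_setup_config`, because it writes into the already-populated `self._config_dict`, and before `_setup_template`, because it may rewrite `self.template` to prepend the `PreprocessTransformer` step.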
@@ -0,0 +1,93 @@ (new file)
import sys
import os
from importlib import import_module

from sklearn.base import TransformerMixin, BaseEstimator
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer, HashingVectorizer
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

import_loader = {
    'TfidfVectorizer': TfidfVectorizer,
    'CountVectorizer': CountVectorizer,
    'HashingVectorizer': HashingVectorizer,
    'OneHotEncoder': OneHotEncoder
}
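As `_setup_columns` below uses it, the mapping is consulted with the string itself as the fallback, so unknown names fall through to `load_scoring_function` for dotted-path resolution:

```python
# Known name -> sklearn class; anything else comes back as the string.
import_loader.get('CountVectorizer', 'CountVectorizer')        # the CountVectorizer class
import_loader.get('mypkg.MyVectorizer', 'mypkg.MyVectorizer')  # the string, resolved later
```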
def load_scoring_function(scoring_func):

[Review comment] Help for getting relative imports working would be appreciated here...

[Review comment] Hmm, directly importing via

    from .tpot import TPOTClassifier, TPOTRegressor
    from ._version import __version__

A workaround is to move this function to …
    """
    Converts 'mymodule.myfunc' into the myfunc object
    itself, so TPOT receives a scoring function.
    """
    if scoring_func and ("." in scoring_func):
        try:
            module_name, func_name = scoring_func.rsplit('.', 1)

            module_path = os.getcwd()
            sys.path.insert(0, module_path)
            scoring_func = getattr(import_module(module_name), func_name)
            sys.path.pop(0)

            print('manual scoring function: {}'.format(scoring_func))
            print('taken from module: {}'.format(module_name))
        except Exception as e:
            print('failed importing custom scoring function, error: {}'.format(str(e)))
            raise ValueError(e)

    return scoring_func
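For example, with a hypothetical `my_metrics.py` sitting in the current working directory (which the function temporarily pushes onto `sys.path`):

```python
# my_metrics.py defines balanced_accuracy_cost(y_true, y_pred) -- invented names.
scorer = load_scoring_function('my_metrics.balanced_accuracy_cost')

# Strings without a dot are returned unchanged, e.g. built-in scorer names.
load_scoring_function('accuracy')  # -> 'accuracy'
```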
class IdentityTransformer(TransformerMixin, BaseEstimator):
    """Identity transformer for doing literally nothing"""
    def fit(self, X, y=None, **fit_params):
        return self

    def transform(self, X):
        return X
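The no-op is deliberate: numeric columns are routed through it below, and `_setup_preprocess_config` swaps it in for the imputer when `impute` is False. A minimal check:

```python
X = [[1, 2], [3, 4]]
assert IdentityTransformer().fit_transform(X) == X  # input returned untouched
```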
class PreprocessTransformer(TransformerMixin):
    def __init__(self, numeric_columns=[], categorical_columns=[], text_columns=[],
                 text_transformer='TfidfVectorizer', categorical_transformer='OneHotEncoder'):
        self.numeric_columns = numeric_columns
        self.categorical_columns = categorical_columns
        self.text_columns = text_columns
        self.text_transformer = text_transformer
        self.categorical_transformer = categorical_transformer

    def _setup_columns(self):
        def text_to_list(text):
            if type(text) is list:
                return text
            return [text]

        self.numeric_columns = text_to_list(self.numeric_columns)
        self.categorical_columns = text_to_list(self.categorical_columns)
        self.text_columns = text_to_list(self.text_columns)

        column_list = []
        if len(self.text_columns) > 0:
            load_func = import_loader.get(self.text_transformer, self.text_transformer)
            if isinstance(load_func, str):
                load_func = load_scoring_function(load_func)
            for idx, text in enumerate(self.text_columns):
                column_list.append(('text' + str(idx), load_func(), text))

        if len(self.numeric_columns) > 0:
            column_list.append(('numeric', IdentityTransformer(), self.numeric_columns))

        if len(self.categorical_columns) > 0:
            load_func = import_loader.get(self.categorical_transformer, self.categorical_transformer)
            if isinstance(load_func, str):
                load_func = load_scoring_function(load_func)
            column_list.append(('categorical', load_func(), self.categorical_columns))

        self.column_transformer = ColumnTransformer(column_list)

    def fit(self, X, y=None):
        self._setup_columns()
        self.column_transformer.fit(X, y)
        return self

    def transform(self, X):
        return self.column_transformer.transform(X)
[Review comment] This line deals with #836 - it might be overloading this PR and might be an item for later?

[Review comment] I think it is more related to #889. I think we need to add imputation into config_dict too. We may allow TPOT to skip imputation if the pipeline only has XGBClassifier or XGBRegressor.