"""Smoke test for seeding TPOT with a customized initial population."""
from tpot import TPOTClassifier
from sklearn.datasets import load_digits
from sklearn.model_selection import train_test_split

# Pipelines in TPOT's string form, passed to TPOTClassifier below as the
# customized part of the initial population.
individual_str1 = 'MultinomialNB(input_matrix, MultinomialNB__alpha=0.1, MultinomialNB__fit_prior=True)'
individual_str2 = 'GaussianNB(DecisionTreeClassifier(input_matrix, DecisionTreeClassifier__criterion=entropy, DecisionTreeClassifier__max_depth=4, DecisionTreeClassifier__min_samples_leaf=17, DecisionTreeClassifier__min_samples_split=13))'
individual_str3 = 'GaussianNB(SelectFwe(CombineDFs(input_matrix, ZeroCount(input_matrix))))'


def test_customized_initial_population():
    """Fit TPOT with three customized pipelines in a population of five.

    The work lives in a function (rather than at module level) so that
    importing this file during test collection does not trigger a fit.
    """
    digits = load_digits()
    X_train, X_test, y_train, y_test = train_test_split(
        digits.data,
        digits.target,
        train_size=0.75,
        test_size=0.25,
        random_state=42,
    )

    est = TPOTClassifier(
        generations=3,
        population_size=5,
        verbosity=2,
        random_state=42,
        config_dict=None,
        customized_initial_population=[individual_str1, individual_str2, individual_str3],
    )
    est.fit(X_train, y_train)

    score = est.score(X_test, y_test)
    print(score)
    # Minimal sanity check: the fit completed and produced a usable score.
    assert 0.0 <= score <= 1.0


if __name__ == "__main__":
    # Keep the file runnable as a plain script, as it was originally.
    test_customized_initial_population()
@@ -310,6 +311,7 @@ def __init__( self.disable_update_check = disable_update_check self.random_state = random_state self.log_file = log_file + self.customized_initial_population = customized_initial_population def _setup_template(self, template): self.template = template @@ -557,6 +559,9 @@ def _setup_toolbox(self): self._toolbox.register( "population", tools.initRepeat, list, self._toolbox.individual ) + self._toolbox.register( + "customized_population", self._initPopulation_customized, customized_initial_population = self.customized_initial_population + ) self._toolbox.register("compile", self._compile_to_sklearn) self._toolbox.register("select", tools.selNSGA2) self._toolbox.register("mate", self._mate_operator) @@ -764,7 +769,10 @@ def fit(self, features, target, sample_weight=None, groups=None): # assign population, self._pop can only be not None if warm_start is enabled if not self._pop: - self._pop = self._toolbox.population(n=self.population_size) + if not self.customized_initial_population: + self._pop = self._toolbox.population(n=self.population_size) # generate initial population by default + else: + self._pop = self._toolbox.customized_population(customized_initial_population=self.customized_initial_population) # generate initial population by custom def pareto_eq(ind1, ind2): """Determine whether two individuals are equal on the Pareto front. 
@@ -2026,6 +2034,21 @@ def _generate(self, pset, min_, max_, condition, type_=None): stack.append((depth + 1, arg)) return expr + def _initPopulation_customized(self, customized_initial_population): + iniPop = [] # a list of pipelines + for individual_str in customized_initial_population: + individual = creator.Individual.from_string(individual_str, self._pset) # converting individual_str to individual + iniPop.append(individual) + "check if #customized initial pipelines <= #population" + if len(iniPop) <= self.population_size: + for _ in range(self.population_size - len(iniPop)): + individual_rand = self._toolbox.individual() + iniPop.append(individual_rand) + print(len(customized_initial_population), "customized pipelines +", self.population_size - len(customized_initial_population), "randomized pipelines as initial population.") + else: + raise Exception("the number of customized initial pipelines > the number of population size!") + return iniPop + @property def classes_(self):