From 12dd729714dc47ed877b670e191cdf80725bcc64 Mon Sep 17 00:00:00 2001 From: Matthias Feurer Date: Fri, 4 Aug 2023 12:30:26 +0200 Subject: [PATCH 1/4] Add example on intensify for cross-validation. --- .../4_intensify_crossvalidation.py | 119 ++++++++++++++++++ 1 file changed, 119 insertions(+) create mode 100644 examples/4_advanced_optimizer/4_intensify_crossvalidation.py diff --git a/examples/4_advanced_optimizer/4_intensify_crossvalidation.py b/examples/4_advanced_optimizer/4_intensify_crossvalidation.py new file mode 100644 index 000000000..067479b87 --- /dev/null +++ b/examples/4_advanced_optimizer/4_intensify_crossvalidation.py @@ -0,0 +1,119 @@ +""" +Speeding up Cross-Validation with Intensification +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +An example of optimizing a simple support vector machine on the digits dataset. In contrast to the +[simple example](examples/1_basics/2_svm_cv.py), in which all cross-validation folds are executed +at once, we use the intensification mechanism described in the original +[SMAC paper](https://link.springer.com/chapter/10.1007/978-3-642-25566-3_40) as also demonstrated +by [Auto-WEKA](https://dl.acm.org/doi/10.1145/2487575.2487629). +""" +__copyright__ = "Copyright 2023, AutoML.org Freiburg-Hannover" +__license__ = "3-clause BSD" + +N_FOLDS = 10 # Global variable that determines the number of folds + +from ConfigSpace import Configuration, ConfigurationSpace, Float +from sklearn import datasets, svm +from sklearn.model_selection import StratifiedKFold + +from smac import HyperparameterOptimizationFacade, Scenario +from smac.intensifier import Intensifier + +# We load the digits dataset, a small-scale 10-class digit recognition dataset +X, y = datasets.load_digits(return_X_y=True) + + +class SVM: + @property + def configspace(self) -> ConfigurationSpace: + # Build Configuration Space which defines all parameters and their ranges + cs = ConfigurationSpace(seed=0) + + # First we create our hyperparameters + C = Float("C", (2 ** - 5, 2 ** 15), default=1.0, log=True) + gamma = Float("gamma", (2 ** -15, 2 ** 3), default=1.0, log=True) + + # Add hyperparameters to our configspace + cs.add_hyperparameters([C, gamma]) + + return cs + + def train(self, config: Configuration, instance: str, seed: int = 0) -> float: + """Creates a SVM based on a configuration and evaluate on the given fold of the digits dataset + + Parameters + ---------- + config: Configuration + The configuration to train the SVM. + instance: str + The name of the instance this configuration should be evaluated on. This is always of type + string by definition. In our case we cast to int, but this could also be the filename of a + problem instance to be loaded. + seed: int + The seed used for this call. + """ + instance = int(instance) + config_dict = config.get_dictionary() + classifier = svm.SVC(**config_dict, random_state=seed) + splitter = StratifiedKFold(n_splits=N_FOLDS, shuffle=True, random_state=seed) + for k, (train_idx, test_idx) in enumerate(splitter.split(X=X, y=y)): + if k != instance: + continue + else: + train_X = X[train_idx] + train_y = y[train_idx] + test_X = X[test_idx] + test_y = y[test_idx] + classifier.fit(train_X, train_y) + cost = 1 - classifier.score(test_X, test_y) + + return cost + + +if __name__ == "__main__": + classifier = SVM() + + # Next, we create an object, holding general information about the run + scenario = Scenario( + classifier.configspace, + n_trials=50, # We want to run max 50 trials (combination of config and seed) + instances=[f"{i}" for i in range(N_FOLDS)], # Specify all instances by their name (as a string) + deterministic=True # To simplify the problem we make SMAC believe that we have a deterministic + # optimization problem. + + ) + + # We want to run the facade's default initial design, but we want to change the number + # of initial configs to 5. + initial_design = HyperparameterOptimizationFacade.get_initial_design(scenario, n_configs=5) + + # Now we use SMAC to find the best hyperparameters + smac = HyperparameterOptimizationFacade( + scenario, + classifier.train, + initial_design=initial_design, + overwrite=True, # If the run exists, we overwrite it; alternatively, we can continue from last state + # The next line defines the intensifier, i.e., the module that governs the selection of + # instance-seed pairs. Since we set deterministic to True above, it only governs the instance in + # this example. Technically, it is not necessary to create the intensifier as a user, but it is + # necessary to do so because we change the argument max_config_calls (the number of instance-seed pairs + # per configuration to try) to the number of cross-validation folds, while the default would be 3. + intensifier = Intensifier(scenario=scenario, max_config_calls=N_FOLDS, seed=0) + ) + + incumbent = smac.optimize() + + # Get cost of default configuration + default_cost = smac.validate(classifier.configspace.get_default_configuration()) + print(f"Default cost: {default_cost}") + + # Let's calculate the cost of the incumbent + incumbent_cost = smac.validate(incumbent) + print(f"Incumbent cost: {incumbent_cost}") + + # Let's see how many configurations we have evaluated. If this number is higher than 5, we have looked + # at more configurations than would have been possible with regular cross-validation, where the number + # of configurations would be determined by the number of trials divided by the number of folds (50 / 10). + runhistory = smac.runhistory + print(f"Number of evaluated configurations: {len(runhistory.config_ids)}") \ No newline at end of file From 98f1d862d2d9496b09d8b1c2ef280f91b56a2c0a Mon Sep 17 00:00:00 2001 From: Matthias Feurer Date: Tue, 8 Aug 2023 14:14:51 +0200 Subject: [PATCH 2/4] Improve example --- .../4_advanced_optimizer/4_intensify_crossvalidation.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/examples/4_advanced_optimizer/4_intensify_crossvalidation.py b/examples/4_advanced_optimizer/4_intensify_crossvalidation.py index 067479b87..8ba112d06 100644 --- a/examples/4_advanced_optimizer/4_intensify_crossvalidation.py +++ b/examples/4_advanced_optimizer/4_intensify_crossvalidation.py @@ -54,8 +54,7 @@ def train(self, config: Configuration, instance: str, seed: int = 0) -> float: The seed used for this call. """ instance = int(instance) - config_dict = config.get_dictionary() - classifier = svm.SVC(**config_dict, random_state=seed) + classifier = svm.SVC(**config, random_state=seed) splitter = StratifiedKFold(n_splits=N_FOLDS, shuffle=True, random_state=seed) for k, (train_idx, test_idx) in enumerate(splitter.split(X=X, y=y)): if k != instance: @@ -79,6 +78,7 @@ def train(self, config: Configuration, instance: str, seed: int = 0) -> float: classifier.configspace, n_trials=50, # We want to run max 50 trials (combination of config and seed) instances=[f"{i}" for i in range(N_FOLDS)], # Specify all instances by their name (as a string) + instance_features={f"{i}": [i] for i in range(N_FOLDS)}, # breaks SMAC deterministic=True # To simplify the problem we make SMAC believe that we have a deterministic # optimization problem. @@ -99,7 +99,7 @@ def train(self, config: Configuration, instance: str, seed: int = 0) -> float: # this example. Technically, it is not necessary to create the intensifier as a user, but it is # necessary to do so because we change the argument max_config_calls (the number of instance-seed pairs # per configuration to try) to the number of cross-validation folds, while the default would be 3. - intensifier = Intensifier(scenario=scenario, max_config_calls=N_FOLDS, seed=0) + intensifier=Intensifier(scenario=scenario, max_config_calls=N_FOLDS, seed=0) ) incumbent = smac.optimize() From 3532150c3356a13f2f4d34b4b187c2c48ffe27c1 Mon Sep 17 00:00:00 2001 From: Matthias Feurer Date: Tue, 28 Nov 2023 14:30:30 +0100 Subject: [PATCH 3/4] Address Difan's comment --- .../4_intensify_crossvalidation.py | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/examples/4_advanced_optimizer/4_intensify_crossvalidation.py b/examples/4_advanced_optimizer/4_intensify_crossvalidation.py index 8ba112d06..d215dd8ec 100644 --- a/examples/4_advanced_optimizer/4_intensify_crossvalidation.py +++ b/examples/4_advanced_optimizer/4_intensify_crossvalidation.py @@ -6,7 +6,11 @@ [simple example](examples/1_basics/2_svm_cv.py), in which all cross-validation folds are executed at once, we use the intensification mechanism described in the original [SMAC paper](https://link.springer.com/chapter/10.1007/978-3-642-25566-3_40) as also demonstrated -by [Auto-WEKA](https://dl.acm.org/doi/10.1145/2487575.2487629). +by [Auto-WEKA](https://dl.acm.org/doi/10.1145/2487575.2487629). This mechanism allows us to +terminate the evaluation of a configuration if after a certain number of folds, the configuration +is found to be worse than the incumbent configuration. This is especially useful if the evaluation +of a configuration is expensive, e.g., if we have to train a neural network or if we have to +evaluate the configuration on a large dataset. """ __copyright__ = "Copyright 2023, AutoML.org Freiburg-Hannover" __license__ = "3-clause BSD" @@ -76,7 +80,11 @@ def train(self, config: Configuration, instance: str, seed: int = 0) -> float: # Next, we create an object, holding general information about the run scenario = Scenario( classifier.configspace, - n_trials=50, # We want to run max 50 trials (combination of config and seed) + n_trials=50, # We want to run max 50 trials (combination of config and instances in the case of + # deterministic=True. In the case of deterministic=False, this would be the + # combination of instances, seeds and configs). The number of distinct configurations + # evaluated by SMAC will be lower than this number because some of the configurations + # will be executed on more than one instance (CV fold). instances=[f"{i}" for i in range(N_FOLDS)], # Specify all instances by their name (as a string) instance_features={f"{i}": [i] for i in range(N_FOLDS)}, # breaks SMAC deterministic=True # To simplify the problem we make SMAC believe that we have a deterministic From af897fa3b005e8c666fdf3b2214ddabd2951fe8d Mon Sep 17 00:00:00 2001 From: Matthias Feurer Date: Thu, 8 Feb 2024 10:36:04 +0100 Subject: [PATCH 4/4] Update CHANGELOG.md --- CHANGELOG.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index dbcac80e4..adec41194 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,9 +1,10 @@ # 2.0.2 ## Improvements -- Add an error when we get an empty dict data_to_scatter so that we can avoid an internal error caused in Dask precautiously +- Add an error when we get an empty dict data_to_scatter so that we can avoid an internal error caused in Dask precautiously. - Add experimental instruction for installing SMAC in Windows via a WSL. - More detailed documentation regarding continuing runs. +- Add a new example that demonstrates the use of intensification to speed up cross-validation for machine learning. ## Bugfixes - Fix bug in the incumbent selection in the case that multi-fidelity is combined with multi-objective (#1019).