From 12dd729714dc47ed877b670e191cdf80725bcc64 Mon Sep 17 00:00:00 2001
From: Matthias Feurer <feurerm@informatik.uni-freiburg.de>
Date: Fri, 4 Aug 2023 12:30:26 +0200
Subject: [PATCH 1/4] Add example on intensify for cross-validation.

---
 .../4_intensify_crossvalidation.py            | 119 ++++++++++++++++++
 1 file changed, 119 insertions(+)
 create mode 100644 examples/4_advanced_optimizer/4_intensify_crossvalidation.py

diff --git a/examples/4_advanced_optimizer/4_intensify_crossvalidation.py b/examples/4_advanced_optimizer/4_intensify_crossvalidation.py
new file mode 100644
index 000000000..067479b87
--- /dev/null
+++ b/examples/4_advanced_optimizer/4_intensify_crossvalidation.py
@@ -0,0 +1,119 @@
+"""
+Speeding up Cross-Validation with Intensification
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+An example of optimizing a simple support vector machine on the digits dataset. In contrast to the
+[simple example](examples/1_basics/2_svm_cv.py), in which all cross-validation folds are executed
+at once, we use the intensification mechanism described in the original 
+[SMAC paper](https://link.springer.com/chapter/10.1007/978-3-642-25566-3_40) as also demonstrated
+by [Auto-WEKA](https://dl.acm.org/doi/10.1145/2487575.2487629). 
+"""
+__copyright__ = "Copyright 2023, AutoML.org Freiburg-Hannover"
+__license__ = "3-clause BSD"
+
+N_FOLDS = 10  # Global variable that determines the number of folds
+
+from ConfigSpace import Configuration, ConfigurationSpace, Float
+from sklearn import datasets, svm
+from sklearn.model_selection import StratifiedKFold
+
+from smac import HyperparameterOptimizationFacade, Scenario
+from smac.intensifier import Intensifier
+
+# We load the digits dataset, a small-scale 10-class digit recognition dataset
+X, y = datasets.load_digits(return_X_y=True)
+
+
+class SVM:
+    @property
+    def configspace(self) -> ConfigurationSpace:
+        # Build Configuration Space which defines all parameters and their ranges
+        cs = ConfigurationSpace(seed=0)
+
+        # First we create our hyperparameters
+        C = Float("C", (2 ** - 5, 2 ** 15), default=1.0, log=True)
+        gamma = Float("gamma", (2 ** -15, 2 ** 3), default=1.0, log=True)
+
+        # Add hyperparameters to our configspace
+        cs.add_hyperparameters([C, gamma])
+
+        return cs
+
+    def train(self, config: Configuration, instance: str, seed: int = 0) -> float:
+        """Creates an SVM based on a configuration and evaluates it on the given fold of the digits dataset
+        
+        Parameters
+        ----------
+        config: Configuration
+            The configuration to train the SVM.
+        instance: str
+            The name of the instance this configuration should be evaluated on. This is always of type
+            string by definition. In our case we cast to int, but this could also be the filename of a
+            problem instance to be loaded.
+        seed: int
+            The seed used for this call.
+        """
+        instance = int(instance)
+        config_dict = config.get_dictionary()
+        classifier = svm.SVC(**config_dict, random_state=seed)
+        splitter = StratifiedKFold(n_splits=N_FOLDS, shuffle=True, random_state=seed)
+        for k, (train_idx, test_idx) in enumerate(splitter.split(X=X, y=y)):
+            if k != instance:
+                continue
+            else:
+                train_X = X[train_idx]
+                train_y = y[train_idx]
+                test_X = X[test_idx]
+                test_y = y[test_idx]
+                classifier.fit(train_X, train_y)
+                cost = 1 - classifier.score(test_X, test_y)
+
+        return cost
+
+
+if __name__ == "__main__":
+    classifier = SVM()
+
+    # Next, we create an object, holding general information about the run
+    scenario = Scenario(
+        classifier.configspace,
+        n_trials=50,  # We want to run max 50 trials (combination of config and seed)
+        instances=[f"{i}" for i in range(N_FOLDS)],  # Specify all instances by their name (as a string)
+        deterministic=True  # To simplify the problem we make SMAC believe that we have a deterministic
+                            # optimization problem.
+        
+    )
+
+    # We want to run the facade's default initial design, but we want to change the number
+    # of initial configs to 5.
+    initial_design = HyperparameterOptimizationFacade.get_initial_design(scenario, n_configs=5)
+
+    # Now we use SMAC to find the best hyperparameters
+    smac = HyperparameterOptimizationFacade(
+        scenario,
+        classifier.train,
+        initial_design=initial_design,
+        overwrite=True,  # If the run exists, we overwrite it; alternatively, we can continue from last state
+        # The next line defines the intensifier, i.e., the module that governs the selection of 
+        # instance-seed pairs. Since we set deterministic to True above, it only governs the instance in
+        # this example. Technically, it is not necessary to create the intensifier as a user, but it is
+        # necessary to do so because we change the argument max_config_calls (the number of instance-seed pairs
+        # per configuration to try) to the number of cross-validation folds, while the default would be 3.
+        intensifier = Intensifier(scenario=scenario, max_config_calls=N_FOLDS, seed=0)
+    )
+
+    incumbent = smac.optimize()
+
+    # Get cost of default configuration
+    default_cost = smac.validate(classifier.configspace.get_default_configuration())
+    print(f"Default cost: {default_cost}")
+
+    # Let's calculate the cost of the incumbent
+    incumbent_cost = smac.validate(incumbent)
+    print(f"Incumbent cost: {incumbent_cost}")
+
+    # Let's see how many configurations we have evaluated. If this number is higher than 5, we have looked
+    # at more configurations than would have been possible with regular cross-validation, where the number
+    # of configurations would be determined by the number of trials divided by the number of folds (50 / 10).
+    runhistory = smac.runhistory
+    print(f"Number of evaluated configurations: {len(runhistory.config_ids)}")
\ No newline at end of file

From 98f1d862d2d9496b09d8b1c2ef280f91b56a2c0a Mon Sep 17 00:00:00 2001
From: Matthias Feurer <feurerm@informatik.uni-freiburg.de>
Date: Tue, 8 Aug 2023 14:14:51 +0200
Subject: [PATCH 2/4] Improve example

---
 .../4_advanced_optimizer/4_intensify_crossvalidation.py     | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/examples/4_advanced_optimizer/4_intensify_crossvalidation.py b/examples/4_advanced_optimizer/4_intensify_crossvalidation.py
index 067479b87..8ba112d06 100644
--- a/examples/4_advanced_optimizer/4_intensify_crossvalidation.py
+++ b/examples/4_advanced_optimizer/4_intensify_crossvalidation.py
@@ -54,8 +54,7 @@ def train(self, config: Configuration, instance: str, seed: int = 0) -> float:
             The seed used for this call.
         """
         instance = int(instance)
-        config_dict = config.get_dictionary()
-        classifier = svm.SVC(**config_dict, random_state=seed)
+        classifier = svm.SVC(**config, random_state=seed)
         splitter = StratifiedKFold(n_splits=N_FOLDS, shuffle=True, random_state=seed)
         for k, (train_idx, test_idx) in enumerate(splitter.split(X=X, y=y)):
             if k != instance:
@@ -79,6 +78,7 @@ def train(self, config: Configuration, instance: str, seed: int = 0) -> float:
         classifier.configspace,
         n_trials=50,  # We want to run max 50 trials (combination of config and seed)
         instances=[f"{i}" for i in range(N_FOLDS)],  # Specify all instances by their name (as a string)
+        instance_features={f"{i}": [i] for i in range(N_FOLDS)}, # breaks SMAC
         deterministic=True  # To simplify the problem we make SMAC believe that we have a deterministic
                             # optimization problem.
         
@@ -99,7 +99,7 @@ def train(self, config: Configuration, instance: str, seed: int = 0) -> float:
         # this example. Technically, it is not necessary to create the intensifier as a user, but it is
         # necessary to do so because we change the argument max_config_calls (the number of instance-seed pairs
         # per configuration to try) to the number of cross-validation folds, while the default would be 3.
-        intensifier = Intensifier(scenario=scenario, max_config_calls=N_FOLDS, seed=0)
+        intensifier=Intensifier(scenario=scenario, max_config_calls=N_FOLDS, seed=0)
     )
 
     incumbent = smac.optimize()

From 3532150c3356a13f2f4d34b4b187c2c48ffe27c1 Mon Sep 17 00:00:00 2001
From: Matthias Feurer <feurerm@informatik.uni-freiburg.de>
Date: Tue, 28 Nov 2023 14:30:30 +0100
Subject: [PATCH 3/4] Address Difan's comment

---
 .../4_intensify_crossvalidation.py                   | 12 ++++++++++--
 1 file changed, 10 insertions(+), 2 deletions(-)

diff --git a/examples/4_advanced_optimizer/4_intensify_crossvalidation.py b/examples/4_advanced_optimizer/4_intensify_crossvalidation.py
index 8ba112d06..d215dd8ec 100644
--- a/examples/4_advanced_optimizer/4_intensify_crossvalidation.py
+++ b/examples/4_advanced_optimizer/4_intensify_crossvalidation.py
@@ -6,7 +6,11 @@
 [simple example](examples/1_basics/2_svm_cv.py), in which all cross-validation folds are executed
 at once, we use the intensification mechanism described in the original 
 [SMAC paper](https://link.springer.com/chapter/10.1007/978-3-642-25566-3_40) as also demonstrated
-by [Auto-WEKA](https://dl.acm.org/doi/10.1145/2487575.2487629). 
+by [Auto-WEKA](https://dl.acm.org/doi/10.1145/2487575.2487629). This mechanism allows us to
+terminate the evaluation of a configuration if, after a certain number of folds, the configuration
+is found to be worse than the incumbent configuration. This is especially useful if the evaluation
+of a configuration is expensive, e.g., if we have to train a neural network or if we have to
+evaluate the configuration on a large dataset.
 """
 __copyright__ = "Copyright 2023, AutoML.org Freiburg-Hannover"
 __license__ = "3-clause BSD"
@@ -76,7 +80,11 @@ def train(self, config: Configuration, instance: str, seed: int = 0) -> float:
     # Next, we create an object, holding general information about the run
     scenario = Scenario(
         classifier.configspace,
-        n_trials=50,  # We want to run max 50 trials (combination of config and seed)
+        n_trials=50,  # We want to run at most 50 trials. With deterministic=True, a trial is a
+                      # combination of config and instance; with deterministic=False, it would be a
+                      # combination of config, instance and seed. The number of distinct configurations
+                      # evaluated by SMAC will be lower than this number because some of the configurations
+                      # will be executed on more than one instance (CV fold).
         instances=[f"{i}" for i in range(N_FOLDS)],  # Specify all instances by their name (as a string)
         instance_features={f"{i}": [i] for i in range(N_FOLDS)}, # breaks SMAC
         deterministic=True  # To simplify the problem we make SMAC believe that we have a deterministic

From af897fa3b005e8c666fdf3b2214ddabd2951fe8d Mon Sep 17 00:00:00 2001
From: Matthias Feurer <feurerm@informatik.uni-freiburg.de>
Date: Thu, 8 Feb 2024 10:36:04 +0100
Subject: [PATCH 4/4] Update CHANGELOG.md

---
 CHANGELOG.md | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index dbcac80e4..adec41194 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,9 +1,10 @@
 # 2.0.2
 
 ## Improvements
-- Add an error when we get an empty dict data_to_scatter so that we can avoid an internal error caused in Dask precautiously
+- As a precaution, add an error when we get an empty dict data_to_scatter so that we can avoid an internal error caused in Dask.
 - Add experimental instruction for installing SMAC in Windows via a WSL.
 - More detailed documentation regarding continuing runs.
+- Add a new example that demonstrates the use of intensification to speed up cross-validation for machine learning.
 
 ## Bugfixes
 - Fix bug in the incumbent selection in the case that multi-fidelity is combined with multi-objective (#1019).