1174 replace hard coded indices in runhistory (#1180)

* First version to update load and save * Update logging docs * Remove config_id duplication * Update RunHistory * Update docstyle * Fix example * Update docs * Add changes to changelog * Adapt docs * Update CodeStyle
automl · Jan 13, 2025 · f68ac63 · f68ac63
1 parent aed7769
commit f68ac63
Show file tree

Hide file tree

Showing 7 changed files with 69 additions and 48 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,5 +1,8 @@
 # 2.3.0
 
+## Improvements
+- Adapt RunHistory to be human-readable (#1174)
+
 ## Documentation
 - Update windows install guide (#952)
 - Correct intensifier for Algorithm Configuration Facade (#1162, #1165)

diff --git a/docs/1_installation.md b/docs/1_installation.md
@@ -1,5 +1,4 @@
 # Installation
-
 ## Requirements
 
 SMAC is written in python3 and therefore requires an environment with python>=3.8.

diff --git a/docs/advanced_usage/2_multi_fidelity.md b/docs/advanced_usage/2_multi_fidelity.md
@@ -13,5 +13,11 @@ target function but ``min_budget`` and ``max_budget`` are used internally to det
 each stage. That's also the reason why ``min_budget`` and ``max_budget`` are *not required* when using instances: 
 The ``max_budget`` is simply the max number of instances, whereas the ``min_budget`` is simply 1.
 
+!!! warning
+    ``smac.main.config_selector.ConfigSelector`` contains the ``min_trials`` parameter. This parameter determines
+    how many samples are required to train the surrogate model. If budgets are involved, the highest budgets 
+    are checked first. For example, if min_trials is three, but we find only two trials in the runhistory for
+    the highest budget, we will use trials of a lower budget instead.
+
 Please have a look into our [multi-fidelity examples](Multi-Fidelity and Multi-Instances) to see how to use
 multi-fidelity optimization in real-world applications.
diff --git a/docs/advanced_usage/8_logging.md b/docs/advanced_usage/8_logging.md
@@ -99,21 +99,22 @@ The runhistory.json is split into four parts. `stats`, `data`, `configs`, and `c
 `data` contains a list of entries, one for each configuration.
 ```json
   "data": [
-    [
-      1,                            # config_id
-      null,                         # instance or None
-      209652396,                    # seed or None
-      null,                         # budget or None
-      5.4345623938566385,           # cost
-      6.699562072753906e-05,        # time
-      6.299999999992423e-05,        # cpu_time
-      1,                            # status
-      1733133181.2144582,           # start_time
-      1733133181.21695,             # end_time
-      {}                            # additional_info
-    ],
+    {
+      "config_id": 1,
+      "instance": null,
+      "seed": 209652396,
+      "budget": 2.7777777777777777,
+      "cost": 2147483647.0,
+      "time": 0.0,
+      "cpu_time": 0.0,
+      "status": 0,
+      "starttime": 0.0,
+      "endtime": 0.0,
+      "additional_info": {}
+    },
     ...
   ]
+
 ```
 
 `configs` is a human-readable dictionary of configurations, where the keys are the one-based `config_id`. It is important to note that in `runhistory.json`, the indexing is zero-based.

diff --git a/smac/facade/hyperband_facade.py b/smac/facade/hyperband_facade.py
@@ -14,6 +14,13 @@ class HyperbandFacade(RandomFacade):
 
     Uses Random Aggressive Online Racing (ROAR) to compare configurations, a random
     initial design and the Hyperband intensifier.
+
+
+    !!! warning
+        ``smac.main.config_selector.ConfigSelector`` contains the ``min_trials`` parameter. This parameter determines
+        how many samples are required to train the surrogate model. If budgets are involved, the highest budgets
+        are checked first. For example, if min_trials is three, but we find only two trials in the runhistory for
+        the highest budget, we will use trials of a lower budget instead.
     """
 
     @staticmethod

diff --git a/smac/facade/multi_fidelity_facade.py b/smac/facade/multi_fidelity_facade.py
@@ -14,7 +14,14 @@
 
 
 class MultiFidelityFacade(HyperparameterOptimizationFacade):
-    """This facade configures SMAC in a multi-fidelity setting."""
+    """This facade configures SMAC in a multi-fidelity setting.
+
+    !!! warning
+        ``smac.main.config_selector.ConfigSelector`` contains the ``min_trials`` parameter. This parameter determines
+        how many samples are required to train the surrogate model. If budgets are involved, the highest budgets
+        are checked first. For example, if min_trials is three, but we find only two trials in the runhistory for
+        the highest budget, we will use trials of a lower budget instead.
+    """
 
     @staticmethod
     def get_intensifier(  # type: ignore

diff --git a/smac/runhistory/runhistory.py b/smac/runhistory/runhistory.py
@@ -768,25 +768,25 @@ def save(self, filename: str | Path = "runhistory.json") -> None:
         ----------
         filename : str | Path, defaults to "runhistory.json"
         """
-        data = []
+        data = list()
         for k, v in self._data.items():
-            data += [
-                (
-                    int(k.config_id),
-                    str(k.instance) if k.instance is not None else None,
-                    int(k.seed) if k.seed is not None else None,
-                    float(k.budget) if k.budget is not None else None,
-                    v.cost,
-                    v.time,
-                    v.cpu_time,
-                    v.status,
-                    v.starttime,
-                    v.endtime,
-                    v.additional_info,
-                )
-            ]
+            data.append(
+                {
+                    "config_id": int(k.config_id),
+                    "instance": str(k.instance) if k.instance is not None else None,
+                    "seed": int(k.seed) if k.seed is not None else None,
+                    "budget": float(k.budget) if k.budget is not None else None,
+                    "cost": v.cost,
+                    "time": v.time,
+                    "cpu_time": v.cpu_time,
+                    "status": v.status,
+                    "starttime": v.starttime,
+                    "endtime": v.endtime,
+                    "additional_info": v.additional_info,
+                }
+            )
 
-        config_ids_to_serialize = set([entry[0] for entry in data])
+        config_ids_to_serialize = set([entry["config_id"] for entry in data])
         configs = {}
         config_origins = {}
         for id_, config in self._ids_config.items():
@@ -858,31 +858,29 @@ def load(self, filename: str | Path, configspace: ConfigurationSpace) -> None:
         # Important to use add method to use all data structure correctly
         # NOTE: These hardcoded indices can easily lead to trouble
         for entry in data["data"]:
-            # Set n_objectives first
             if self._n_objectives == -1:
-                if isinstance(entry[4], (float, int)):
+                if isinstance(entry["cost"], (float, int)):
                     self._n_objectives = 1
                 else:
-                    self._n_objectives = len(entry[4])
+                    self._n_objectives = len(entry["cost"])
 
             cost: list[float] | float
             if self._n_objectives == 1:
-                cost = float(entry[4])
+                cost = float(entry["cost"])
             else:
-                cost = [float(x) for x in entry[4]]
-
+                cost = [float(x) for x in entry["cost"]]
             self.add(
-                config=self._ids_config[int(entry[0])],
+                config=self._ids_config[int(entry["config_id"])],
                 cost=cost,
-                time=float(entry[5]),
-                cpu_time=float(entry[6]),
-                status=StatusType(entry[7]),
-                instance=entry[1],
-                seed=entry[2],
-                budget=entry[3],
-                starttime=entry[8],
-                endtime=entry[9],
-                additional_info=entry[10],
+                time=entry["time"],
+                cpu_time=entry["cpu_time"],
+                status=StatusType(entry["status"]),
+                instance=entry["instance"],
+                seed=entry["seed"],
+                budget=entry["budget"],
+                starttime=entry["starttime"],
+                endtime=entry["endtime"],
+                additional_info=entry["additional_info"],
             )
 
         # Although adding trials should give us the same stats, the trajectory might be different