Skip to content

Commit

Permalink
-
Browse files Browse the repository at this point in the history
  • Loading branch information
Delaunay committed Feb 27, 2024
1 parent 0bf6348 commit 015ce01
Show file tree
Hide file tree
Showing 8 changed files with 261 additions and 34 deletions.
104 changes: 104 additions & 0 deletions docs/process.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1,104 @@
Request For Proposal
====================

Preparing
---------

1. Make sure milabench supports the targeted hardware

* NVIDIA
* AMD

2. Create a milabench configuration for your RFP
Milabench comes with a wide variety of benchmarks.
   You should select and weight each benchmark according to your
   target hardware.

.. code-block:: yaml
include:
- base.yaml
llama:
enabled: true
weight: 1.0
resnet50:
enabled: true
weight: 1.0
.. code-block:: bash
milabench resolve myconfig.yaml > RFP.yaml
3. Prepare a container for your RFP


.. code-block:: docker
FROM milabench:cuda-v1.2.3
COPY RFP.yaml .../RFP.yaml
ENV MILABENCH_CONFIG=".../RFP.yaml"
CMD milabench run
4. Hot fixes

* Disable a benchmark
   * Update the container


Vendor Instructions
-------------------

1. The vendor needs to create a system configuration that
   specifies the different compute nodes milabench will use

.. code-block:: yaml
system:
sshkey: <privatekey>
arch: cuda
docker_image: ghcr.io/mila-iqia/milabench:cuda-nightly
nodes:
- name: node1
ip: 192.168.0.25
main: true
port: 8123
user: <username>
- name: node2
ip: 192.168.0.26
main: false
user: <username>
2. Run milabench

.. code-block:: bash
export MILABENCH_IMAGE=ghcr.io/mila-iqia/milabench:cuda-nightly
# create ...
mkdir -p configs
mkdir -p results
# put your vendor specific configuration
vi configs/system.yaml
#
docker pull $MILABENCH_IMAGE
# run milabench
docker run -it --rm --gpus all --network host --ipc=host --privileged \
-v $SSH_KEY_FILE:/milabench/id_milabench \
-v $(pwd)/results:/milabench/envs/runs \
-v $(pwd)/configs:/milabench/envs/configs \
$MILABENCH_IMAGE \
milabench run --system /milabench/envs/configs/system.yaml
6 changes: 3 additions & 3 deletions milabench/_version.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
"""This file is generated, do not modify"""

__tag__ = "v0.0.6-41-g932e30e"
__commit__ = "932e30e79513fdd2448cedaf98a003bb4b5b9148"
__date__ = "2024-01-17 14:33:14 -0500"
__tag__ = "paice-v1"
__commit__ = "0bf63487d2b99d46c2c205a65b9b5b6c6e298e43"
__date__ = "2024-02-26 12:53:49 -0500"
80 changes: 80 additions & 0 deletions milabench/cli/resolve.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,80 @@
from dataclasses import dataclass

from coleo import Option, tooled

from milabench.config import _config_layers, merge


@dataclass
class Arguments:
    """Arguments for the ``milabench resolve`` CLI command."""

    # Path to the milabench configuration file to resolve.
    config: str


@tooled
def arguments():
    """Collect the CLI arguments for ``milabench resolve`` via coleo.

    NOTE: the bare annotated name below is how coleo declares a
    command-line option; it is populated by the ``@tooled`` machinery
    at call time. Do not convert it into a regular assignment.
    """
    # The configuration file to resolve.
    config: str

    return Arguments(config)


@tooled
def cli_resolve(args=None):
    """Resolve a milabench configuration into a single flat YAML file.

    Loads ``args.config`` (flattening its ``include`` chain through
    ``_config_layers``/``merge``), keeps only the benchmarks that are
    enabled plus every configuration they (transitively) inherit from,
    strips fields that are filled in at resolve time, and prints the
    result as YAML on stdout.
    """
    import sys

    import yaml

    if args is None:
        args = arguments()

    overrides = {}
    configs = [args.config, overrides]

    # Flatten the include chain into a single dict.
    config = {}
    for layer in _config_layers(configs):
        config = merge(config, layer)

    #
    # Only keep enabled benchmarks
    #
    wip_config = {}
    for benchname, benchconfig in config.items():
        if benchconfig.get("enabled", False):
            wip_config[benchname] = benchconfig

    #
    # Keep the parents as well, following ``inherits`` chains
    # transitively so a kept parent's own parent is not dropped.
    #
    pending = {
        c["inherits"] for c in wip_config.values() if c.get("inherits")
    }
    while pending:
        parent = pending.pop()
        if parent in wip_config:
            continue

        parentconfig = config.get(parent)
        if parentconfig is None:
            # Original code would KeyError here; warn and keep going so a
            # dangling ``inherits`` does not abort the whole resolve.
            print(f"Missing parent configuration {parent}", file=sys.stderr)
            continue

        wip_config[parent] = parentconfig

        grandparent = parentconfig.get("inherits")
        if grandparent:
            pending.add(grandparent)

    #
    # Remove fields that only make sense once resolved
    #
    resolved = ("dirs", "config_file", "config_base")
    for benchconfig in wip_config.values():
        for field in resolved:
            benchconfig.pop(field, None)

    print(yaml.dump(wip_config))


if __name__ == "__main__":
    # Standalone invocation for development: resolve the standard config.
    default_config = "/workspaces/milabench/config/standard.yaml"
    cli_resolve(Arguments(default_config))
12 changes: 6 additions & 6 deletions milabench/log.py
Original file line number Diff line number Diff line change
Expand Up @@ -300,9 +300,9 @@ def on_data(self, entry, data, row):
load = int(data.get("load", 0) * 100)
currm, totalm = data.get("memory", [0, 0])
temp = int(data.get("temperature", 0))
row[
f"gpu:{gpuid}"
] = f"{load}% load | {currm:.0f}/{totalm:.0f} MB | {temp}C"
row[f"gpu:{gpuid}"] = (
f"{load}% load | {currm:.0f}/{totalm:.0f} MB | {temp}C"
)
row["gpu_load"] = f"{load}%"
row["gpu_mem"] = f"{currm:.0f}/{totalm:.0f} MB"
row["gpu_temp"] = f"{temp}C"
Expand Down Expand Up @@ -376,9 +376,9 @@ def on_data(self, entry, data, row):
load = int(data.get("load", 0) * 100)
currm, totalm = data.get("memory", [0, 0])
temp = int(data.get("temperature", 0))
row[
f"gpu:{gpuid}"
] = f"{load}% load | {currm:.0f}/{totalm:.0f} MB | {temp}C"
row[f"gpu:{gpuid}"] = (
f"{load}% load | {currm:.0f}/{totalm:.0f} MB | {temp}C"
)
else:
task = data.pop("task", "")
units = data.pop("units", "")
Expand Down
1 change: 0 additions & 1 deletion milabench/merge.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
"""Utilities to merge dictionaries and other data structures."""


from collections import deque
from functools import reduce
from typing import Union
Expand Down
59 changes: 37 additions & 22 deletions milabench/report.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,42 +6,51 @@
from pandas import DataFrame

from milabench.utils import error_guard
from milabench.summary import Summary

nan = math.nan

H = HTML()


@error_guard({})
def _make_row(summary, compare, weights):
def _make_row(summary, compare, config):
mkey = "train_rate"
metric = "mean"
row = {}
row = {
"n": nan,
"fail": nan,
"perf": nan,
"std%": nan,
"sem%": nan,
"peak_memory": nan,
"score": nan,
"weight": config.get("weight", summary.get("weight", 0)),
"enabled": config.get("enabled", summary.get("enabled", 0)),
}

if not summary:
return row

row["n"] = summary["n"] if summary else nan
row["fail"] = summary["failures"] if summary else nan
row["perf"] = summary[mkey][metric] if summary else nan
#
row["n"] = summary["n"]
row["fail"] = summary["failures"] + row["n"] > 0
row["perf"] = summary[mkey][metric]

if compare:
row["perf_base"] = compare[mkey][metric]
row["perf_ratio"] = row["perf_adj"] / row["perf_base"]

row["std%"] = summary[mkey]["std"] / summary[mkey][metric] if summary else nan
row["sem%"] = summary[mkey]["sem"] / summary[mkey][metric] if summary else nan
row["std%"] = summary[mkey]["std"] / summary[mkey][metric]
row["sem%"] = summary[mkey]["sem"] / summary[mkey][metric]
# row["iqr%"] = (summary[mkey]["q3"] - summary[mkey]["q1"]) / summary[mkey]["median"] if summary else nan
row["peak_memory"] = (
max(
(data["memory"]["max"] for data in summary["gpu_load"].values())
if summary["gpu_load"]
else [-1]
)
if summary
else nan
)

if summary["gpu_load"]:
memory = [data["memory"]["max"] for data in summary["gpu_load"].values()]
row["peak_memory"] = max(memory)

# Sum of all the GPU performance
# to get the overall perf of the whole machine

if "per_gpu" in summary:
acc = 0
for _, metrics in summary["per_gpu"].items():
Expand All @@ -51,10 +60,7 @@ def _make_row(summary, compare, weights):

success_ratio = 1 - row["fail"] / row["n"]
score = (acc if acc > 0 else row["perf"]) * success_ratio

row["score"] = score
row["weight"] = weights.get("weight", summary.get("weight", 0))
# ----

return row

Expand Down Expand Up @@ -187,6 +193,15 @@ def _report_pergpu(entries, measure="50"):


def make_dataframe(summary, compare=None, weights=None):
if weights is not None:
# We overriden the config
required = weights.keys()

for key in required:
if key not in summary:
print(f"Missing benchmark {key}")
summary[key] = {"name": key, "n": 0, "successes": 0, "failures": 0}

if weights is None:
weights = dict()

Expand Down Expand Up @@ -214,7 +229,7 @@ def make_dataframe(summary, compare=None, weights=None):

@error_guard({})
def make_report(
summary,
summary: dict[str, Summary],
compare=None,
html=None,
compare_gpus=False,
Expand Down Expand Up @@ -248,7 +263,7 @@ def make_report(
def _score(column):
# This computes a weighted geometric mean
perf = df[column]
weights = df["weight"]
weights = df["weight"] * df["enabled"].astype(int)
logscore = np.sum(np.log(perf) * weights) / np.sum(weights)
return np.exp(logscore)

Expand Down
1 change: 1 addition & 0 deletions milabench/scripts/vcs.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
"""Use to retrieve GIT version info, this file cannot import milabench modules
as it is executed as part of the installation process"""

import os
import subprocess
import warnings
Expand Down
Loading

0 comments on commit 015ce01

Please sign in to comment.