From 4311da84c28a09f2fd1ee3c3abca65703f9ec1a2 Mon Sep 17 00:00:00 2001 From: Setepenre Date: Fri, 1 Mar 2024 16:04:13 +0000 Subject: [PATCH] update capacity resolution --- milabench/common.py | 9 +++++---- milabench/config.py | 28 +++++++++++++++++++++------- milabench/log.py | 12 ++++++------ milabench/merge.py | 1 - milabench/scripts/vcs.py | 1 + milabench/sizer.py | 9 +-------- 6 files changed, 34 insertions(+), 26 deletions(-) diff --git a/milabench/common.py b/milabench/common.py index 70789b212..35f9cf125 100644 --- a/milabench/common.py +++ b/milabench/common.py @@ -75,7 +75,7 @@ def arguments(): # Define capabilities capabilities: Option = "" - + return CommonArguments( config, system, @@ -91,7 +91,7 @@ def arguments(): def get_multipack(args = None, run_name=None, overrides={}): if args is None: args = arguments() - + override = [ o if re.match(pattern=r"[.\w]+=", string=o) else f"={o}" for o in args.override ] @@ -225,13 +225,14 @@ def _get_multipack( arch = deduce_arch() base_defaults = get_base_defaults( - base=args.base, - arch=arch, + base=args.base, + arch=arch, run_name=run_name ) system_config = build_system_config( args.system, defaults={"system": base_defaults["_defaults"]["system"]}, + gpu=True ) overrides = merge({"*": system_config}, overrides) diff --git a/milabench/config.py b/milabench/config.py index fa3b85f47..5daca593c 100644 --- a/milabench/config.py +++ b/milabench/config.py @@ -174,16 +174,29 @@ def resolve_addresses(nodes): return self -def get_gpu_capacity(): - capacity = float("+inf") +def get_gpu_capacity(strict=False): + try: + capacity = 0 + + for k, v in get_gpu_info()["gpus"].items(): + capacity = min(v["memory"]["total"], capacity) + + return capacity + except: + print("GPU not available, defaulting to 0 MiB") + if strict: + raise + return 0 - for k, v in get_gpu_info()["gpus"].items(): - capacity = min(v["memory"]["total"], capacity) - return capacity +def is_autoscale_enabled(): + return ( + os.getenv("MILABENCH_SIZER_AUTO", 
False) + or os.getenv("MILABENCH_SIZER_MULTIPLE") is not None + ) -def build_system_config(config_file, defaults=None): +def build_system_config(config_file, defaults=None, gpu=True): """Load the system configuration, verify its validity and resolve ip addresses Notes @@ -204,7 +217,8 @@ def build_system_config(config_file, defaults=None): system = config.get("system", {}) - if "gpu" not in system: + # capacity is only required if batch resizer is enabled + if (gpu or is_autoscale_enabled()) and not "gpu" not in system: system["gpu"] = {"capacity": f"{int(get_gpu_capacity())} MiB"} if system.get("sshkey") is not None: diff --git a/milabench/log.py b/milabench/log.py index 5826d309b..a6f7388a9 100644 --- a/milabench/log.py +++ b/milabench/log.py @@ -300,9 +300,9 @@ def on_data(self, entry, data, row): load = int(data.get("load", 0) * 100) currm, totalm = data.get("memory", [0, 0]) temp = int(data.get("temperature", 0)) - row[ - f"gpu:{gpuid}" - ] = f"{load}% load | {currm:.0f}/{totalm:.0f} MB | {temp}C" + row[f"gpu:{gpuid}"] = ( + f"{load}% load | {currm:.0f}/{totalm:.0f} MB | {temp}C" + ) row["gpu_load"] = f"{load}%" row["gpu_mem"] = f"{currm:.0f}/{totalm:.0f} MB" row["gpu_temp"] = f"{temp}C" @@ -376,9 +376,9 @@ def on_data(self, entry, data, row): load = int(data.get("load", 0) * 100) currm, totalm = data.get("memory", [0, 0]) temp = int(data.get("temperature", 0)) - row[ - f"gpu:{gpuid}" - ] = f"{load}% load | {currm:.0f}/{totalm:.0f} MB | {temp}C" + row[f"gpu:{gpuid}"] = ( + f"{load}% load | {currm:.0f}/{totalm:.0f} MB | {temp}C" + ) else: task = data.pop("task", "") units = data.pop("units", "") diff --git a/milabench/merge.py b/milabench/merge.py index e5010c629..a9efa4cec 100644 --- a/milabench/merge.py +++ b/milabench/merge.py @@ -1,6 +1,5 @@ """Utilities to merge dictionaries and other data structures.""" - from collections import deque from functools import reduce from typing import Union diff --git a/milabench/scripts/vcs.py b/milabench/scripts/vcs.py 
index f1a8c4ddf..0f895f886 100644 --- a/milabench/scripts/vcs.py +++ b/milabench/scripts/vcs.py @@ -1,5 +1,6 @@ """Use to retrieve GIT version info, this file cannot import milabench modules as it is executed as part of the installation process""" + import os import subprocess import warnings diff --git a/milabench/sizer.py b/milabench/sizer.py index a2aa8b87b..29ca55288 100644 --- a/milabench/sizer.py +++ b/milabench/sizer.py @@ -6,7 +6,7 @@ import numpy as np import yaml -from .config import system_global +from .config import system_global, is_autoscale_enabled from .validation.validation import ValidationLayer ROOT = os.path.dirname(__file__) @@ -14,13 +14,6 @@ default_scaling_config = os.path.join(ROOT, "..", "config", "scaling.yaml") -def is_autoscale_enabled(): - return ( - os.getenv("MILABENCH_SIZER_AUTO", False) - or os.getenv("MILABENCH_SIZER_MULTIPLE") is not None - ) - - def getenv(name, type): value = os.getenv(name)