From 66e9e155a3bebe5d3e7eb6c43faa7611dc89bfad Mon Sep 17 00:00:00 2001
From: Setepenre
Date: Fri, 1 Mar 2024 16:04:13 +0000
Subject: [PATCH] update capacity resolution

Make the GPU capacity resolution tolerate machines without a GPU.

* get_gpu_capacity reports the smallest GPU memory (0 when no GPU is
  listed) and falls back to 0 MiB when the GPU query fails, unless
  strict=True, in which case the error is re-raised.
* build_system_config only fills in system.gpu when the system
  configuration does not already provide it.
* is_autoscale_enabled moves from sizer.py to config.py.
* auto_size returns None when no capacity is known.
* the no-<arch> tags are renamed unsupported-<arch>.
* import sorting / formatting cleanups.
---
 .github/workflows/tests.yml |  4 ++--
 config/base.yaml            |  2 +-
 milabench/cli/publish.py    | 17 ++++++++---------
 milabench/common.py         |  9 +++++----
 milabench/config.py         | 29 ++++++++++++++++++++++-------
 milabench/log.py            | 12 ++++++------
 milabench/merge.py          |  1 -
 milabench/scripts/vcs.py    |  1 +
 milabench/sizer.py          | 14 +++++---------
 9 files changed, 50 insertions(+), 39 deletions(-)

diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml
index 1192bd0d7..7d456f9bb 100644
--- a/.github/workflows/tests.yml
+++ b/.github/workflows/tests.yml
@@ -26,9 +26,9 @@ jobs:
       matrix:
         include:
           - arch: cuda
-            exclude : "no-cuda"
+            exclude : "unsupported-cuda"
           # - arch: rocm
-          #   exclude : "no-rocm"
+          #   exclude : "unsupported-rocm"
 
     runs-on: [self-hosted, "${{ matrix.arch }}"]
 
diff --git a/config/base.yaml b/config/base.yaml
index e5043e8e4..ddb804cdc 100644
--- a/config/base.yaml
+++ b/config/base.yaml
@@ -521,7 +521,7 @@ rwkv:
   tags:
     - llm
     - rnn
-    - no-rocm
+    - unsupported-rocm
   plan:
     method: per_gpu
   argv:
diff --git a/milabench/cli/publish.py b/milabench/cli/publish.py
index cb60812d3..077cda9cd 100644
--- a/milabench/cli/publish.py
+++ b/milabench/cli/publish.py
@@ -1,19 +1,18 @@
-import re
 import json
-import subprocess
-from contextlib import contextmanager
 import multiprocessing
-from dataclasses import dataclass
-from urllib.parse import urlparse, ParseResult
-import time
-import threading
-import signal
 import os
+import re
+import signal
+import subprocess
 import sys
+import threading
+import time
+from contextlib import contextmanager
+from dataclasses import dataclass
+from urllib.parse import ParseResult, urlparse
 
 from coleo import Option, tooled
 
-
 SLEEP = 0.01
 _INIT = 0
 _READY = 1
diff --git a/milabench/common.py b/milabench/common.py
index 70789b212..35f9cf125 100644
--- a/milabench/common.py
+++ b/milabench/common.py
@@ -75,7 +75,7 @@ def arguments():
 
     # Define capabilities
     capabilities: Option = ""
-    
+
     return CommonArguments(
         config,
         system,
@@ -91,7 +91,7 @@ def arguments():
 def get_multipack(args = None, run_name=None, overrides={}):
     if args is None:
         args = arguments()
-    
+
     override = [
         o if re.match(pattern=r"[.\w]+=", string=o) else f"={o}" for o in args.override
     ]
@@ -225,13 +225,14 @@ def _get_multipack(
 
     arch = deduce_arch()
     base_defaults = get_base_defaults(
-        base=args.base, 
-        arch=arch, 
+        base=args.base,
+        arch=arch,
         run_name=run_name
     )
     system_config = build_system_config(
         args.system,
         defaults={"system": base_defaults["_defaults"]["system"]},
+        gpu=True
     )
     overrides = merge({"*": system_config}, overrides)
 
diff --git a/milabench/config.py b/milabench/config.py
index fa3b85f47..b6617688b 100644
--- a/milabench/config.py
+++ b/milabench/config.py
@@ -1,4 +1,5 @@
 import contextvars
+import os
 import socket
 
 import psutil
@@ -174,16 +175,29 @@ def resolve_addresses(nodes):
     return self
 
 
-def get_gpu_capacity():
-    capacity = float("+inf")
+def get_gpu_capacity(strict=False):
+    try:
+        # Smallest GPU memory wins; with no GPU listed the capacity is 0
+        gpus = get_gpu_info()["gpus"].values()
+
+        capacity = min((v["memory"]["total"] for v in gpus), default=0)
+
+        return capacity
+    except Exception:
+        print("GPU not available, defaulting to 0 MiB")
+        if strict:
+            raise
+        return 0
 
-    for k, v in get_gpu_info()["gpus"].items():
-        capacity = min(v["memory"]["total"], capacity)
 
-    return capacity
+def is_autoscale_enabled():
+    return (
+        os.getenv("MILABENCH_SIZER_AUTO", False)
+        or os.getenv("MILABENCH_SIZER_MULTIPLE") is not None
+    )
 
 
-def build_system_config(config_file, defaults=None):
+def build_system_config(config_file, defaults=None, gpu=True):
     """Load the system configuration, verify its validity and resolve ip addresses
 
     Notes
@@ -204,7 +218,8 @@ def build_system_config(config_file, defaults=None):
 
     system = config.get("system", {})
 
-    if "gpu" not in system:
+    # capacity is only required if batch resizer is enabled
+    if (gpu or is_autoscale_enabled()) and "gpu" not in system:
         system["gpu"] = {"capacity": f"{int(get_gpu_capacity())} MiB"}
 
     if system.get("sshkey") is not None:
diff --git a/milabench/log.py b/milabench/log.py
index 5826d309b..a6f7388a9 100644
--- a/milabench/log.py
+++ b/milabench/log.py
@@ -300,9 +300,9 @@ def on_data(self, entry, data, row):
                 load = int(data.get("load", 0) * 100)
                 currm, totalm = data.get("memory", [0, 0])
                 temp = int(data.get("temperature", 0))
-                row[
-                    f"gpu:{gpuid}"
-                ] = f"{load}% load | {currm:.0f}/{totalm:.0f} MB | {temp}C"
+                row[f"gpu:{gpuid}"] = (
+                    f"{load}% load | {currm:.0f}/{totalm:.0f} MB | {temp}C"
+                )
                 row["gpu_load"] = f"{load}%"
                 row["gpu_mem"] = f"{currm:.0f}/{totalm:.0f} MB"
                 row["gpu_temp"] = f"{temp}C"
@@ -376,9 +376,9 @@ def on_data(self, entry, data, row):
                 load = int(data.get("load", 0) * 100)
                 currm, totalm = data.get("memory", [0, 0])
                 temp = int(data.get("temperature", 0))
-                row[
-                    f"gpu:{gpuid}"
-                ] = f"{load}% load | {currm:.0f}/{totalm:.0f} MB | {temp}C"
+                row[f"gpu:{gpuid}"] = (
+                    f"{load}% load | {currm:.0f}/{totalm:.0f} MB | {temp}C"
+                )
         else:
             task = data.pop("task", "")
             units = data.pop("units", "")
diff --git a/milabench/merge.py b/milabench/merge.py
index e5010c629..a9efa4cec 100644
--- a/milabench/merge.py
+++ b/milabench/merge.py
@@ -1,6 +1,5 @@
 """Utilities to merge dictionaries and other data structures."""
 
-
 from collections import deque
 from functools import reduce
 from typing import Union
diff --git a/milabench/scripts/vcs.py b/milabench/scripts/vcs.py
index f1a8c4ddf..0f895f886 100644
--- a/milabench/scripts/vcs.py
+++ b/milabench/scripts/vcs.py
@@ -1,5 +1,6 @@
 """Use to retrieve GIT version info, this file cannot import milabench modules
 as it is executed as part of the installation process"""
+
 import os
 import subprocess
 import warnings
diff --git a/milabench/sizer.py b/milabench/sizer.py
index a2aa8b87b..4ce2a3f22 100644
--- a/milabench/sizer.py
+++ b/milabench/sizer.py
@@ -6,7 +6,7 @@
 import numpy as np
 import yaml
 
-from .config import system_global
+from .config import is_autoscale_enabled, system_global
 from .validation.validation import ValidationLayer
 
 ROOT = os.path.dirname(__file__)
@@ -14,13 +14,6 @@
 default_scaling_config = os.path.join(ROOT, "..", "config", "scaling.yaml")
 
 
-def is_autoscale_enabled():
-    return (
-        os.getenv("MILABENCH_SIZER_AUTO", False)
-        or os.getenv("MILABENCH_SIZER_MULTIPLE") is not None
-    )
-
-
 def getenv(name, type):
     value = os.getenv(name)
 
@@ -109,6 +102,9 @@ def get_capacity(self, capacity):
     def auto_size(self, benchmark, capacity):
         capacity = self.get_capacity(capacity)
 
+        if capacity is None:
+            return None
+
         config = self.benchscaling(benchmark)
 
         data = list(sorted(config["model"].items(), key=lambda x: x[0]))
@@ -182,7 +178,7 @@ def scale_argv(pack, argv):
     sizer = sizer_global.get()
     system = system_global.get()
 
-    capacity = system["gpu"]["capacity"]
+    capacity = system.get("gpu", dict()).get("capacity")
 
     return sizer.argv(pack, capacity, argv)
 