Skip to content

Commit

Permalink
update capacity resolution
Browse files Browse the repository at this point in the history
  • Loading branch information
Delaunay committed Mar 1, 2024
1 parent 10dfb57 commit 4311da8
Show file tree
Hide file tree
Showing 6 changed files with 34 additions and 26 deletions.
9 changes: 5 additions & 4 deletions milabench/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -75,7 +75,7 @@ def arguments():

# Define capabilities
capabilities: Option = ""

return CommonArguments(
config,
system,
Expand All @@ -91,7 +91,7 @@ def arguments():
def get_multipack(args = None, run_name=None, overrides={}):
if args is None:
args = arguments()

override = [
o if re.match(pattern=r"[.\w]+=", string=o) else f"={o}" for o in args.override
]
Expand Down Expand Up @@ -225,13 +225,14 @@ def _get_multipack(

arch = deduce_arch()
base_defaults = get_base_defaults(
base=args.base,
arch=arch,
base=args.base,
arch=arch,
run_name=run_name
)
system_config = build_system_config(
args.system,
defaults={"system": base_defaults["_defaults"]["system"]},
gpu=True
)
overrides = merge({"*": system_config}, overrides)

Expand Down
28 changes: 21 additions & 7 deletions milabench/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -174,16 +174,29 @@ def resolve_addresses(nodes):
return self


def get_gpu_capacity():
capacity = float("+inf")
def get_gpu_capacity(strict=False):
    """Return the smallest total memory reported across the detected GPUs.

    The value is the minimum of ``v["memory"]["total"]`` over every entry of
    ``get_gpu_info()["gpus"]`` (presumably expressed in MiB, as the fallback
    message suggests -- confirm against ``get_gpu_info``).

    Args:
        strict: when true, re-raise the error hit while querying the GPUs
            instead of falling back to 0.

    Returns:
        The smallest per-GPU total memory, or 0 when no GPU is listed or
        when the query fails and ``strict`` is false.
    """
    try:
        totals = [
            gpu["memory"]["total"] for gpu in get_gpu_info()["gpus"].values()
        ]
    except Exception:
        print("GPU not available, defaulting to 0 MiB")
        if strict:
            raise
        return 0

    # Seeding the reduction with 0 would pin the result to 0 regardless of
    # the hardware; take the real minimum and use 0 only when no GPU exists.
    return min(totals, default=0)

for k, v in get_gpu_info()["gpus"].items():
capacity = min(v["memory"]["total"], capacity)

return capacity
def is_autoscale_enabled():
    """Report whether autoscaling is requested through the environment.

    Returns:
        The raw value of ``MILABENCH_SIZER_AUTO`` when it is set to a
        non-empty string (any non-empty value counts, including ``"0"``);
        otherwise ``True`` if ``MILABENCH_SIZER_MULTIPLE`` is defined and
        ``False`` if it is not.
    """
    auto_flag = os.getenv("MILABENCH_SIZER_AUTO", False)
    if auto_flag:
        return auto_flag

    # An unset or empty AUTO flag defers to the presence of a multiple.
    return os.getenv("MILABENCH_SIZER_MULTIPLE") is not None


def build_system_config(config_file, defaults=None):
def build_system_config(config_file, defaults=None, gpu=True):
"""Load the system configuration, verify its validity and resolve ip addresses
Notes
Expand All @@ -204,7 +217,8 @@ def build_system_config(config_file, defaults=None):

system = config.get("system", {})

if "gpu" not in system:
# capacity is only required if batch resizer is enabled
if (gpu or is_autoscale_enabled()) and not "gpu" not in system:
system["gpu"] = {"capacity": f"{int(get_gpu_capacity())} MiB"}

if system.get("sshkey") is not None:
Expand Down
12 changes: 6 additions & 6 deletions milabench/log.py
Original file line number Diff line number Diff line change
Expand Up @@ -300,9 +300,9 @@ def on_data(self, entry, data, row):
load = int(data.get("load", 0) * 100)
currm, totalm = data.get("memory", [0, 0])
temp = int(data.get("temperature", 0))
row[
f"gpu:{gpuid}"
] = f"{load}% load | {currm:.0f}/{totalm:.0f} MB | {temp}C"
row[f"gpu:{gpuid}"] = (
f"{load}% load | {currm:.0f}/{totalm:.0f} MB | {temp}C"
)
row["gpu_load"] = f"{load}%"
row["gpu_mem"] = f"{currm:.0f}/{totalm:.0f} MB"
row["gpu_temp"] = f"{temp}C"
Expand Down Expand Up @@ -376,9 +376,9 @@ def on_data(self, entry, data, row):
load = int(data.get("load", 0) * 100)
currm, totalm = data.get("memory", [0, 0])
temp = int(data.get("temperature", 0))
row[
f"gpu:{gpuid}"
] = f"{load}% load | {currm:.0f}/{totalm:.0f} MB | {temp}C"
row[f"gpu:{gpuid}"] = (
f"{load}% load | {currm:.0f}/{totalm:.0f} MB | {temp}C"
)
else:
task = data.pop("task", "")
units = data.pop("units", "")
Expand Down
1 change: 0 additions & 1 deletion milabench/merge.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
"""Utilities to merge dictionaries and other data structures."""


from collections import deque
from functools import reduce
from typing import Union
Expand Down
1 change: 1 addition & 0 deletions milabench/scripts/vcs.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
"""Use to retrieve GIT version info, this file cannot import milabench modules
as it is executed as part of the installation process"""

import os
import subprocess
import warnings
Expand Down
9 changes: 1 addition & 8 deletions milabench/sizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,21 +6,14 @@
import numpy as np
import yaml

from .config import system_global
from .config import system_global, is_autoscale_enabled
from .validation.validation import ValidationLayer

ROOT = os.path.dirname(__file__)

default_scaling_config = os.path.join(ROOT, "..", "config", "scaling.yaml")


def is_autoscale_enabled():
    """Report whether autoscaling is requested through the environment.

    Returns:
        The raw value of ``MILABENCH_SIZER_AUTO`` when it is set to a
        non-empty string (any non-empty value counts, including ``"0"``);
        otherwise ``True`` if ``MILABENCH_SIZER_MULTIPLE`` is defined and
        ``False`` if it is not.
    """
    auto_flag = os.getenv("MILABENCH_SIZER_AUTO", False)
    if auto_flag:
        return auto_flag

    # An unset or empty AUTO flag defers to the presence of a multiple.
    return os.getenv("MILABENCH_SIZER_MULTIPLE") is not None


def getenv(name, type):
value = os.getenv(name)

Expand Down

0 comments on commit 4311da8

Please sign in to comment.