diff --git a/config/slurm.yaml b/config/slurm.yaml
new file mode 100644
index 000000000..414c84a2d
--- /dev/null
+++ b/config/slurm.yaml
@@ -0,0 +1,53 @@
+#
+# sbatch arguments for the different run profiles
+#
+
+multi-node-full:
+  # DGX run: 2 nodes x 8 A100 80GB SXM4
+  - --partition=staff-idt
+  - -w cn-d[003-004]
+  - --ntasks=1
+  - --gpus-per-task=a100l:8
+  - --exclusive
+  - --nodes=2
+  - --cpus-per-task=128
+  - --time=1:30:00
+  - --ntasks-per-node=1
+  - --mem=0
+
+single-node-full:
+  # DGX run: 1 node x 8 A100 80GB SXM4
+  - --partition=staff-idt
+  - -w cn-d[003-004]
+  - --ntasks=1
+  - --gpus-per-task=a100l:8
+  - --exclusive
+  - --nodes=1
+  - --cpus-per-task=128
+  - --time=1:30:00
+  - --ntasks-per-node=1
+  - --mem=0
+
+multi-node-small:
+  # Any GPU, 2 nodes x 2 GPUs
+  - --partition=staff-idt
+  - --ntasks=1
+  - --gpus-per-task=2
+  - --exclusive
+  - --nodes=2
+  - --cpus-per-task=16
+  - --time=1:30:00
+  - --ntasks-per-node=1
+  - --mem=64G
+
+single-node-small:
+  # Any GPU, 1 node x 2 GPUs
+  - --partition=staff-idt
+  - --ntasks=1
+  - --gpus-per-task=2
+  - --exclusive
+  - --nodes=1
+  - --cpus-per-task=16
+  - --time=1:30:00
+  - --ntasks-per-node=1
+  - --mem=64G
diff --git a/milabench/_version.py b/milabench/_version.py
index d9b6bef1c..bdd9c4326 100644
--- a/milabench/_version.py
+++ b/milabench/_version.py
@@ -1,5 +1,5 @@
 """This file is generated, do not modify"""
 
-__tag__ = "v0.1.0-12-g39e7cce9"
-__commit__ = "39e7cce9aec8a9e1ae7713137f287353ce718875"
-__date__ = "2024-06-17 13:41:35 -0400"
+__tag__ = "v0.1.0-20-g7246295a"
+__commit__ = "7246295a356186b55fa4b2b75480e3700c279b15"
+__date__ = "2024-06-20 09:18:17 -0400"
diff --git a/milabench/cli/pr.py b/milabench/cli/pr.py
index 4a8403dcc..c924ad5b3 100644
--- a/milabench/cli/pr.py
+++ b/milabench/cli/pr.py
@@ -4,7 +4,7 @@
 from coleo import Option, tooled
 
 from ..common import _short_make_report
-from ..schedule import post_comment_on_pr
+from .schedule import post_comment_on_pr
 
 
 # fmt: off
diff --git a/milabench/cli/schedule.py b/milabench/cli/schedule.py
index f720d366d..26d45d5b2 100644
--- a/milabench/cli/schedule.py
+++ b/milabench/cli/schedule.py
@@ -5,6 +5,7 @@
 
 import importlib_resources
 import requests
+import yaml
 from coleo import Option, tooled
 
 
@@ -14,6 +15,7 @@ class Arguments:
     sync: bool = False
     dry : bool = False
     args: list = field(default_factory=list)
+    profile: str = None
 # fmt: on
 
 
@@ -25,11 +27,29 @@ def arguments():
     # Print the command and return without running it
     dry: Option & bool = False
 
-    # pip arguments
+    # sbatch run profile
+    profile: Option & str = None
+
+    # script arguments
     # [remainder]
     args: Option = []
 
-    return Arguments(sync, dry, args)
+    return Arguments(sync, dry, args, profile)
+
+
+def get_sbatch_profiles(profile, default):
+    ROOT = os.path.dirname(__file__)
+    default_scaling_config = os.path.join(ROOT, "..", "..", "config", "slurm.yaml")
+
+    with open(default_scaling_config, "r") as fp:
+        sbatch_profiles = yaml.safe_load(fp)
+
+    args = sbatch_profiles.get(profile)
+
+    if args is None:
+        args = sbatch_profiles.get(default)
+
+    return args
 
 
 @tooled
@@ -39,9 +59,9 @@ def cli_schedule(args=None):
     if args is None:
         args = arguments()
 
-    launch_milabench(args.args, sbatch_args=None, dry=args.dry, sync=args.sync)
-
+    sbatch_args = get_sbatch_profiles(args.profile, "single-node-small")
+    launch_milabench(args.args, sbatch_args=sbatch_args, dry=args.dry, sync=args.sync)
 
 
 def popen(cmd, callback=None):
@@ -120,6 +140,7 @@ class SetupOptions:
     config: str = "milabench/config/standard.yaml"
     env: str = "./env"
     python: str = "3.9"
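+    # entry point in milabench_run.bash to invoke ("run" or "pin")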
"3.9" + fun: str = "run" def deduce_remote(self, current_branch): prefix = "refs/heads/" @@ -164,35 +185,25 @@ def arguments(self): self.env, "-p", self.python, + "-f", + self.fun ] def launch_milabench(args, sbatch_args=None, dry: bool = False, sync: bool = False): sbatch_script = ( - importlib_resources.files(__name__) / "scripts" / "milabench_run.bash" + os.path.abspath(importlib_resources.files(__name__) / ".." / "scripts" / "milabench_run.bash") ) sbatch_script = str(sbatch_script) - # salloc --gres=gpu:rtx8000:1 --mem=64G --cpus-per-gpu=4 - - if sbatch_args is None: - sbatch_args = [ - "--ntasks=1", - "--gpus-per-task=rtx8000:2", - "--cpus-per-task=8", - "--time=01:30:00", - "--ntasks-per-node=1", - "--mem=64G", - ] - script_args = SetupOptions() script_args.deduce_from_repository() script_args = script_args.arguments() cmd = sbatch_args + [sbatch_script] + script_args + args + print("sbatch " + " ".join(cmd)) if dry: - print("sbatch " + " ".join(cmd)) code = 0 else: code, _ = sbatch(cmd, sync=sync, tags=None) diff --git a/milabench/cli/slurm.py b/milabench/cli/slurm.py index 44f306b15..db68dbf0e 100644 --- a/milabench/cli/slurm.py +++ b/milabench/cli/slurm.py @@ -2,9 +2,8 @@ import os from coleo import tooled -from voir.instruments.gpu import get_gpu_info -from ..slurm import expand_node_list +from ..system import get_gpu_capacity @tooled @@ -26,18 +25,73 @@ def make_node(i, ip): return node - capacity = float("+inf") - - for _, v in get_gpu_info("cuda")["gpus"].items(): - capacity = min(v["memory"]["total"], capacity) - # nvidia-smi --query-gpu=memory.total --format=csv system = { "arch": "cuda", - "gpu": {"capacity": f"{int(capacity)} MiB"}, "nodes": [make_node(i, ip) for i, ip in enumerate(node_list)], } + capacity = get_gpu_capacity() + if capacity > 0: + system["gpu"] = { + "capacity": f"{capacity} MiB" + } + import yaml print(yaml.dump({"system": system})) + + +def expand_range(s): + numbers = [] + count = 0 + + for i in s.split(","): + if "-" not in i: + count = len(i) + numbers.append(i) + else: + start, end = i.split("-") + count = len(start) + + for n in range(int(start), int(end) + 1): + numbers.append(f"{n:0{count}d}") + + return numbers + + +def expand_node_list(node_list): + nodes = [] + s = 0 + + while s < len(node_list): + if node_list[s] == ",": + s += 1 + + next = node_list.find(",", s) + range_start = node_list.find("[", s) + range_end = node_list.find("]", s) + + # Found a range + if range_start != -1 and (next == -1 or range_start < next): + node_name = node_list[s:range_start] + + range = node_list[range_start + 1 : range_end] + + for i in expand_range(range): + nodes.append(f"{node_name}{i}") + + # eat the ] + s = range_end + 1 + + else: + if next == -1: + next = len(node_list) + + node_name = node_list[s:next] + nodes.append(node_name) + + # eat the , + s = next + 1 + + return nodes diff --git a/milabench/config.py b/milabench/config.py index cee0a98a8..2003c3af6 100644 --- a/milabench/config.py +++ b/milabench/config.py @@ -26,7 +26,7 @@ def get_run_count(): def get_base_folder(): config = config_global.get() - return XPath(config["dirs"]["base"]) + return XPath(config["_defaults"]["dirs"]["base"]) def relative_to(pth, cwd): pth = XPath(pth).expanduser() diff --git a/milabench/schedule.py b/milabench/schedule.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/milabench/scripts/milabench_run.bash b/milabench/scripts/milabench_run.bash index e37200ab3..52241c0ff 100755 --- a/milabench/scripts/milabench_run.bash +++ 
@@ -7,7 +7,7 @@ function usage() {
     echo "Usage: $0 [-m] [-p]"
     echo "  -h          Display this help message."
-    echo "  -b arch     GPU arch           (default: cuda)"
+    echo "  -a arch     GPU arch           (default: cuda)"
     echo "  -b BRANCH   Branch to checkout (default: master)"
     echo "  -o ORIGIN   Origin to use      (default: github/mila/milabench)"
     echo "  -c CONFIG   Configuration      (default: milabench/config/standard.yaml)"
@@ -17,64 +17,70 @@ function usage() {
     exit 1
 }
 
-function parse_args() {
-    ARCH="cuda"
-    PYTHON="3.9"
-    BRANCH="master"
-    ORIGIN="https://github.com/mila-iqia/milabench.git"
-    LOC="$SLURM_TMPDIR"
-    CONFIG="$LOC/milabench/config/standard.yaml"
-    BASE="$LOC/base"
-    ENV="./env"
-    REMAINING_ARGS=""
-
-    while getopts ":hm:p:e:b:o:c:" opt; do
-        case $opt in
-        h)
+ARCH="cuda"
+PYTHON="3.10"
+BRANCH="master"
+ORIGIN="https://github.com/mila-iqia/milabench.git"
+LOC="$SLURM_TMPDIR/$SLURM_JOB_ID"
+CONFIG="$LOC/milabench/config/standard.yaml"
+BASE="$LOC/base"
+ENV="./env"
+REMAINING_ARGS=""
+FUN="run"
+
+while getopts ":hm:p:e:b:o:c:f:a:l:" opt; do
+    case $opt in
+    h)
+        usage
+        ;;
+    f)
+        FUN="$OPTARG"
+        ;;
+    p)
+        PYTHON="$OPTARG"
+        ;;
+    b)
+        BRANCH="$OPTARG"
+        ;;
+    o)
+        ORIGIN="$OPTARG"
+        ;;
+    c)
+        CONFIG="$OPTARG"
+        ;;
+    e)
+        ENV="$OPTARG"
+        ;;
+    a)
+        ARCH="$OPTARG"
+        ;;
+    l)
+        # FIX ME
+        LOC="$OPTARG"
+        CONFIG="$LOC/milabench/config/standard.yaml"
+        BASE="$LOC/base"
+        ;;
+    :)
+        echo "Option -$OPTARG requires an argument." >&2
         usage
         ;;
-        p)
-            PYTHON="$OPTARG"
-            ;;
-        b)
-            BRANCH="$OPTARG"
-            ;;
-        o)
-            ORIGIN="$OPTARG"
-            ;;
-        c)
-            CONFIG="$OPTARG"
-            ;;
-        e)
-            ENV="$OPTARG"
-            ;;
-        a)
-            ARCH="$OPTARG"
-            ;;
-        l)
-            # FIX ME
-            LOC="$OPTARG"
-            CONFIG="$LOC/milabench/config/standard.yaml"
-            BASE="$LOC/base"
-            ;;
-        :)
-            echo "Option -$OPTARG requires an argument." >&2
-            usage
-            ;;
-        esac
-    done
-
-    shift "$((OPTIND-1))"
-    REMAINING_ARGS="$@"
-
-    echo "  PYTHON: $PYTHON"
-    echo "  branch: $BRANCH"
-    echo "  origin: $ORIGIN"
-    echo "  config: $CONFIG"
-    echo "  env: $ENV"
-    echo "  args: $REMAINING_ARGS"
+    esac
+done
+
+shift "$((OPTIND-1))"
+REMAINING_ARGS="$@"
+
+echo "  PYTHON: $PYTHON"
+echo "  branch: $BRANCH"
+echo "  origin: $ORIGIN"
+echo "  config: $CONFIG"
+echo "  env: $ENV"
+echo "  args: $REMAINING_ARGS"
+echo "  loc: $LOC"
+
+mkdir -p $LOC
+cd $LOC
 
-}
 
 function conda_env() {
@@ -112,15 +118,17 @@ function setup() {
     #
     # Fetch the repo
     #
+    cd $LOC
     git clone --single-branch --depth 1 -b $BRANCH $ORIGIN
 
     python -m pip install -e ./milabench
-
+    (
+        cd milabench
+        git status
+    )
     SYSTEM="$LOC/system.yaml"
 }
 
 function pin() {
-    parse_args
-
     conda_env
 
     setup
@@ -138,8 +146,7 @@
 }
 
 function run() {
-    parse_args
-
+
     conda_env
 
     setup
@@ -148,8 +155,8 @@
 
     echo "System"
     echo "------"
-    milabench slurm_system
     milabench slurm_system > $SYSTEM
+    cat $SYSTEM
 
     module load gcc/9.3.0
     module load cuda/11.8
@@ -178,4 +185,13 @@
     echo "----"
     echo "Done after $SECONDS"
     echo ""
-}
\ No newline at end of file
+}
+
+case "$FUN" in
+    run)
+        run
+        ;;
+    pin)
+        pin
+        ;;
+esac
\ No newline at end of file
diff --git a/milabench/slurm.py b/milabench/slurm.py
deleted file mode 100644
index cadf0f73f..000000000
--- a/milabench/slurm.py
+++ /dev/null
@@ -1,53 +0,0 @@
-def expand_range(s):
-    numbers = []
-    count = 0
-
-    for i in s.split(","):
-        if "-" not in i:
-            count = len(i)
-            numbers.append(i)
-        else:
-            start, end = i.split("-")
-            count = len(start)
-
-            for n in range(int(start), int(end) + 1):
-                numbers.append(f"{n:0{count}d}")
-
-    return numbers
-
-
-def expand_node_list(node_list):
-    nodes = []
-    s = 0
-
-    while s < len(node_list):
-        if node_list[s] == ",":
-            s += 1
-
-        next = node_list.find(",", s)
-        range_start = node_list.find("[", s)
-        range_end = node_list.find("]", s)
-
-        # Found a range
-        if range_start != -1 and (next == -1 or range_start < next):
-            node_name = node_list[s:range_start]
-
-            range = node_list[range_start + 1 : range_end]
-
-            for i in expand_range(range):
-                nodes.append(f"{node_name}{i}")
-
-            # eat the ]
-            s = range_end + 1
-
-        else:
-            if next == -1:
-                next = len(node_list)
-
-            node_name = node_list[s:next]
-            nodes.append(node_name)
-
-            # eat the ,
-            s = next + 1
-
-    return nodes
diff --git a/milabench/system.py b/milabench/system.py
index 45379e25f..6edbba93d 100644
--- a/milabench/system.py
+++ b/milabench/system.py
@@ -2,6 +2,7 @@
 import os
 import socket
 from dataclasses import dataclass, field
+import sys
 
 import psutil
 import yaml
@@ -30,7 +31,7 @@ def print_once(*args, **kwargs):
     def _print():
         nonlocal printed
         if printed == 0:
-            print(*args, **kwargs)
+            print(*args, **kwargs, file=sys.stderr)
             printed += 1
 
     return _print
@@ -181,6 +182,19 @@ def _resolve_ip(ip):
     return hostname, aliaslist, ipaddrlist, lazy_raise
 
 
+def _fix_weird(hostname):
+    if hostname.endswith(".server.mila.quebec.server.mila.quebec"):
+        print()
+        print("Hostname was extra long for no reason")
+        print(hostname, socket.gethostname())
+        print()
+
+        # why is this happening
+        hostname = hostname[: -len(".server.mila.quebec")]
+
+    return hostname
+
+
 def resolve_addresses(nodes):
     # Note: it is possible for self to be none
     # if we are running milabench on a node that is not part of the system
@@ -193,24 +207,20 @@ def resolve_addresses(nodes):
     for node in nodes:
         hostname, aliaslist, ipaddrlist, lazy_raise = _resolve_ip(node["ip"])
 
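+        # strip the doubled ".server.mila.quebec" suffix returned by some hostname lookups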
+        hostname = _fix_weird(hostname)
+
         node["hostname"] = hostname
         node["aliaslist"] = aliaslist
         node["ipaddrlist"] = ipaddrlist
 
-        if hostname.endswith(".server.mila.quebec.server.mila.quebec"):
-            print()
-            print("Hostname was extra long for no reason")
-            print(hostname, socket.gethostname())
-            print()
-
-            # why is this happening
-            hostname = hostname[: -len(".server.mila.quebec")]
-
         is_local = (
             ("127.0.0.1" in ipaddrlist)
             or (hostname in ("localhost", socket.gethostname()))
+            or (socket.gethostname().startswith(hostname))
             or len(ip_list.intersection(ipaddrlist)) > 0
         )
+        # cn-g005 cn-g005.server.mila.quebec
+        # print(hostname, socket.gethostname())
         node["local"] = is_local
 
         if is_local:
@@ -232,7 +242,7 @@ def get_gpu_capacity(strict=False):
         for k, v in get_gpu_info()["gpus"].items():
             capacity = min(v["memory"]["total"], capacity)
 
-        return capacity
+        return int(capacity)
     except:
         print("GPU not available, defaulting to 0 MiB")
         if strict: