From b67bf0d3b7975a79f3ca270ed37efae35faf3e8a Mon Sep 17 00:00:00 2001
From: "pierre.delaunay"
Date: Thu, 20 Jun 2024 09:47:45 -0400
Subject: [PATCH] Add new slurm sbatch profile

---
 config/slurm.yaml                    | 53 ++++++++++++++++++++++++++++
 milabench/_version.py                |  6 ++--
 milabench/cli/schedule.py            | 41 +++++++++++++--------
 milabench/schedule.py                |  0
 milabench/scripts/milabench_run.bash | 23 ++++++++----
 5 files changed, 99 insertions(+), 24 deletions(-)
 create mode 100644 config/slurm.yaml
 delete mode 100644 milabench/schedule.py

diff --git a/config/slurm.yaml b/config/slurm.yaml
new file mode 100644
index 000000000..414c84a2d
--- /dev/null
+++ b/config/slurm.yaml
@@ -0,0 +1,53 @@
+#
+# sbatch arguments for the different run profiles
+#
+
+multi-node-full:
+  # DGX run: 2 nodes x 8 A100 80GB SXM4
+  - --partition=staff-idt
+  - -w cn-d[003-004]
+  - --ntasks=1
+  - --gpus-per-task=a100l:8
+  - --exclusive
+  - --nodes=2
+  - --cpus-per-task=128
+  - --time=1:30:00
+  - --ntasks-per-node=1
+  - --mem=0
+
+single-node-full:
+  # DGX run: 1 node x 8 A100 80GB SXM4
+  - --partition=staff-idt
+  - -w cn-d[003-004]
+  - --ntasks=1
+  - --gpus-per-task=a100l:8
+  - --exclusive
+  - --nodes=1
+  - --cpus-per-task=128
+  - --time=1:30:00
+  - --ntasks-per-node=1
+  - --mem=0
+
+multi-node-small:
+  # Any GPU, 2 nodes x 2 GPUs
+  - --partition=staff-idt
+  - --ntasks=1
+  - --gpus-per-task=2
+  - --exclusive
+  - --nodes=2
+  - --cpus-per-task=16
+  - --time=1:30:00
+  - --ntasks-per-node=1
+  - --mem=64G
+
+single-node-small:
+  # Any GPU, 1 node x 2 GPUs
+  - --partition=staff-idt
+  - --ntasks=1
+  - --gpus-per-task=2
+  - --exclusive
+  - --nodes=1
+  - --cpus-per-task=16
+  - --time=1:30:00
+  - --ntasks-per-node=1
+  - --mem=64G
diff --git a/milabench/_version.py b/milabench/_version.py
index d9b6bef1c..bdd9c4326 100644
--- a/milabench/_version.py
+++ b/milabench/_version.py
@@ -1,5 +1,5 @@
 """This file is generated, do not modify"""
 
-__tag__ = "v0.1.0-12-g39e7cce9"
-__commit__ = "39e7cce9aec8a9e1ae7713137f287353ce718875"
-__date__ = "2024-06-17 13:41:35 -0400"
+__tag__ = "v0.1.0-20-g7246295a"
+__commit__ = "7246295a356186b55fa4b2b75480e3700c279b15"
+__date__ = "2024-06-20 09:18:17 -0400"
diff --git a/milabench/cli/schedule.py b/milabench/cli/schedule.py
index f720d366d..95f3c306e 100644
--- a/milabench/cli/schedule.py
+++ b/milabench/cli/schedule.py
@@ -5,6 +5,7 @@
 
 import importlib_resources
 import requests
+import yaml
 
 from coleo import Option, tooled
 
@@ -14,6 +15,7 @@ class Arguments:
     sync: bool = False
     dry : bool = False
     args: list = field(default_factory=list)
+    profile: str = None
 # fmt: on
 
 
@@ -25,11 +27,29 @@ def arguments():
     # Print the command and return without running it
     dry: Option & bool = False
 
+    # sbatch run profile
+    profile: Option & str = None
+
     # pip arguments
     # [remainder]
     args: Option = []
 
-    return Arguments(sync, dry, args)
+    return Arguments(sync, dry, args, profile)
+
+
+def get_sbatch_profiles(profile, default):
+    ROOT = os.path.dirname(__file__)
+    default_scaling_config = os.path.join(ROOT, "..", "..", "config", "slurm.yaml")
+
+    with open(default_scaling_config, "r") as fp:
+        sbatch_profiles = yaml.safe_load(fp)
+
+    args = sbatch_profiles.get(profile)
+
+    if args is None:
+        args = sbatch_profiles.get(default)
+
+    return args
 
 
 @tooled
@@ -39,9 +59,9 @@ def cli_schedule(args=None):
     if args is None:
         args = arguments()
 
-    launch_milabench(args.args, sbatch_args=None, dry=args.dry, sync=args.sync)
-
+    sbatch_args = get_sbatch_profiles(args.profile, "single-node-small")
+    launch_milabench(args.args, sbatch_args=sbatch_args, dry=args.dry, sync=args.sync)
 
 
 def popen(cmd, callback=None):
@@ -120,6 +140,7 @@ class SetupOptions:
     config: str = "milabench/config/standard.yaml"
     env: str = "./env"
    python: str = "3.9"
+    fun: str = "run"
 
     def deduce_remote(self, current_branch):
         prefix = "refs/heads/"
@@ -164,6 +185,8 @@ def arguments(self):
             self.env,
             "-p",
             self.python,
+            "-f",
+            self.fun,
         ]
 
 
@@ -173,18 +196,6 @@ def launch_milabench(args, sbatch_args=None, dry: bool = False, sync: bool = Fal
     )
     sbatch_script = str(sbatch_script)
 
-    # salloc --gres=gpu:rtx8000:1 --mem=64G --cpus-per-gpu=4
-
-    if sbatch_args is None:
-        sbatch_args = [
-            "--ntasks=1",
-            "--gpus-per-task=rtx8000:2",
-            "--cpus-per-task=8",
-            "--time=01:30:00",
-            "--ntasks-per-node=1",
-            "--mem=64G",
-        ]
-
     script_args = SetupOptions()
     script_args.deduce_from_repository()
     script_args = script_args.arguments()
diff --git a/milabench/schedule.py b/milabench/schedule.py
deleted file mode 100644
index e69de29bb..000000000
diff --git a/milabench/scripts/milabench_run.bash b/milabench/scripts/milabench_run.bash
index e37200ab3..2abefd735 100755
--- a/milabench/scripts/milabench_run.bash
+++ b/milabench/scripts/milabench_run.bash
@@ -27,12 +27,16 @@ function parse_args() {
   BASE="$LOC/base"
   ENV="./env"
   REMAINING_ARGS=""
+  FUN="run"
 
-  while getopts ":hm:p:e:b:o:c:" opt; do
+  while getopts ":hm:p:e:b:o:c:f:" opt; do
     case $opt in
       h)
         usage
         ;;
+      f)
+        FUN="$OPTARG"
+        ;;
       p)
         PYTHON="$OPTARG"
         ;;
@@ -119,8 +123,6 @@ function setup() {
 }
 
 function pin() {
-  parse_args
-
   conda_env
   setup
 
@@ -138,8 +140,6 @@ function pin() {
 }
 
 function run() {
-  parse_args
-
   conda_env
   setup
 
@@ -178,4 +178,15 @@ function run() {
   echo "----"
   echo "Done after $SECONDS"
   echo ""
-}
\ No newline at end of file
+}
+
+parse_args
+
+case "$FUN" in
+  run)
+    run
+    ;;
+  pin)
+    pin
+    ;;
+esac
\ No newline at end of file
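
Usage sketch (not part of the patch). The new --profile option looks up one of the
sbatch argument lists from config/slurm.yaml by name and falls back to
"single-node-small", replacing the sbatch defaults that were previously hard-coded
in launch_milabench(). A minimal reproduction of that lookup, assuming the file is
read from config/slurm.yaml relative to the current directory (the real
get_sbatch_profiles() resolves the path relative to the module):

    # Resolve an sbatch profile the way get_sbatch_profiles() does.
    import yaml

    with open("config/slurm.yaml") as fp:
        profiles = yaml.safe_load(fp)

    requested = "multi-node-full"            # value that would come from --profile
    sbatch_args = profiles.get(requested)
    if sbatch_args is None:                  # unknown or missing profile
        sbatch_args = profiles.get("single-node-small")

    print(sbatch_args)  # ["--partition=staff-idt", "-w cn-d[003-004]", ...]

Assuming the entry point is exposed as "milabench schedule" (the subcommand name is
not shown in this patch), a run would be launched with something like
"milabench schedule --profile multi-node-full". On the script side, the new -f flag
selects which function milabench_run.bash dispatches to via the case statement at
the bottom ("run" by default, "pin" with -f pin).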