From d2f7caac43060ff0960c3e99e56545f78f3a027d Mon Sep 17 00:00:00 2001 From: "pierre.delaunay" Date: Thu, 20 Jun 2024 09:47:45 -0400 Subject: [PATCH] Add new slurm sbatch profile --- config/slurm.yaml | 53 ++++++++++ milabench/_version.py | 6 +- milabench/cli/pr.py | 2 +- milabench/cli/schedule.py | 47 +++++---- milabench/cli/slurm.py | 70 +++++++++++-- milabench/schedule.py | 0 milabench/scripts/milabench_run.bash | 142 +++++++++++++++------------ milabench/slurm.py | 53 ---------- milabench/system.py | 32 +++--- 9 files changed, 248 insertions(+), 157 deletions(-) create mode 100644 config/slurm.yaml delete mode 100644 milabench/schedule.py delete mode 100644 milabench/slurm.py diff --git a/config/slurm.yaml b/config/slurm.yaml new file mode 100644 index 000000000..414c84a2d --- /dev/null +++ b/config/slurm.yaml @@ -0,0 +1,53 @@ +# +# SBatch arguments for different run profile +# + +multi-node-full: + # DGX run: 2 nodes x 8 A100 80Go SXM4 + - --partition=staff-idt + - -w cn-d[003-004] + - --ntasks=1 + - --gpus-per-task=a100l:8 + - --exclusive + - --nodes=2 + - --cpus-per-task=128 + - --time=1:30:00 + - --ntasks-per-node=1 + - --mem=0 + +single-node-full: + # DGX run: 1 node x 8 A100 80Go SXM4 + - --partition=staff-idt + - -w cn-d[003-004] + - --ntasks=1 + - --gpus-per-task=a100l:8 + - --exclusive + - --nodes=1 + - --cpus-per-task=128 + - --time=1:30:00 + - --ntasks-per-node=1 + - --mem=0 + +multi-node-small: + # Any GPU, 2 nodes x 2 GPU + - --partition=staff-idt + - --ntasks=1 + - --gpus-per-task=2 + - --exclusive + - --nodes=2 + - --cpus-per-task=16 + - --time=1:30:00 + - --ntasks-per-node=1 + - --mem=64G + +single-node-small: + # Any GPU, 1 node x 2 GPU + - --partition=staff-idt + - --ntasks=1 + - --gpus-per-task=2 + - --exclusive + - --nodes=1 + - --cpus-per-task=16 + - --time=1:30:00 + - --ntasks-per-node=1 + - --mem=64G diff --git a/milabench/_version.py b/milabench/_version.py index d9b6bef1c..bdd9c4326 100644 --- a/milabench/_version.py +++ b/milabench/_version.py @@ -1,5 +1,5 @@ """This file is generated, do not modify""" -__tag__ = "v0.1.0-12-g39e7cce9" -__commit__ = "39e7cce9aec8a9e1ae7713137f287353ce718875" -__date__ = "2024-06-17 13:41:35 -0400" +__tag__ = "v0.1.0-20-g7246295a" +__commit__ = "7246295a356186b55fa4b2b75480e3700c279b15" +__date__ = "2024-06-20 09:18:17 -0400" diff --git a/milabench/cli/pr.py b/milabench/cli/pr.py index 4a8403dcc..c924ad5b3 100644 --- a/milabench/cli/pr.py +++ b/milabench/cli/pr.py @@ -4,7 +4,7 @@ from coleo import Option, tooled from ..common import _short_make_report -from ..schedule import post_comment_on_pr +from .schedule import post_comment_on_pr # fmt: off diff --git a/milabench/cli/schedule.py b/milabench/cli/schedule.py index f720d366d..26d45d5b2 100644 --- a/milabench/cli/schedule.py +++ b/milabench/cli/schedule.py @@ -5,6 +5,7 @@ import importlib_resources import requests +import yaml from coleo import Option, tooled @@ -14,6 +15,7 @@ class Arguments: sync: bool = False dry : bool = False args: list = field(default_factory=list) + profile: str = None # fmt: on @@ -25,11 +27,29 @@ def arguments(): # Print the command and return without running it dry: Option & bool = False - # pip arguments + # sbatch run profile + profile: Option & str = None + + # script arguments # [remainder] args: Option = [] - return Arguments(sync, dry, args) + return Arguments(sync, dry, args, profile) + + +def get_sbatch_profiles(profile, default): + ROOT = os.path.dirname(__file__) + default_scaling_config = os.path.join(ROOT, "..", "..", "config", 
"slurm.yaml") + + with open(default_scaling_config, "r") as fp: + sbatch_profiles = yaml.safe_load(fp) + + args = sbatch_profiles.get(profile) + + if args is None: + args = sbatch_profiles.get(default) + + return args @tooled @@ -39,9 +59,9 @@ def cli_schedule(args=None): if args is None: args = arguments() - launch_milabench(args.args, sbatch_args=None, dry=args.dry, sync=args.sync) - + sbatch_args = get_sbatch_profiles(args.profile, "single-node-small") + launch_milabench(args.args, sbatch_args=sbatch_args, dry=args.dry, sync=args.sync) def popen(cmd, callback=None): @@ -120,6 +140,7 @@ class SetupOptions: config: str = "milabench/config/standard.yaml" env: str = "./env" python: str = "3.9" + fun: str = "run" def deduce_remote(self, current_branch): prefix = "refs/heads/" @@ -164,35 +185,25 @@ def arguments(self): self.env, "-p", self.python, + "-f", + self.fun ] def launch_milabench(args, sbatch_args=None, dry: bool = False, sync: bool = False): sbatch_script = ( - importlib_resources.files(__name__) / "scripts" / "milabench_run.bash" + os.path.abspath(importlib_resources.files(__name__) / ".." / "scripts" / "milabench_run.bash") ) sbatch_script = str(sbatch_script) - # salloc --gres=gpu:rtx8000:1 --mem=64G --cpus-per-gpu=4 - - if sbatch_args is None: - sbatch_args = [ - "--ntasks=1", - "--gpus-per-task=rtx8000:2", - "--cpus-per-task=8", - "--time=01:30:00", - "--ntasks-per-node=1", - "--mem=64G", - ] - script_args = SetupOptions() script_args.deduce_from_repository() script_args = script_args.arguments() cmd = sbatch_args + [sbatch_script] + script_args + args + print("sbatch " + " ".join(cmd)) if dry: - print("sbatch " + " ".join(cmd)) code = 0 else: code, _ = sbatch(cmd, sync=sync, tags=None) diff --git a/milabench/cli/slurm.py b/milabench/cli/slurm.py index 44f306b15..db68dbf0e 100644 --- a/milabench/cli/slurm.py +++ b/milabench/cli/slurm.py @@ -2,9 +2,8 @@ import os from coleo import tooled -from voir.instruments.gpu import get_gpu_info -from ..slurm import expand_node_list +from ..system import get_gpu_capacity @tooled @@ -26,18 +25,73 @@ def make_node(i, ip): return node - capacity = float("+inf") - - for _, v in get_gpu_info("cuda")["gpus"].items(): - capacity = min(v["memory"]["total"], capacity) - # nvidia-smi --query-gpu=memory.total --format=csv system = { "arch": "cuda", - "gpu": {"capacity": f"{int(capacity)} MiB"}, "nodes": [make_node(i, ip) for i, ip in enumerate(node_list)], } + capacity = get_gpu_capacity() + if capacity > 0: + system["gpu"] = { + "capacity": f"{capacity} MiB" + } + import yaml print(yaml.dump({"system": system})) + + +def expand_range(s): + numbers = [] + count = 0 + + for i in s.split(","): + if "-" not in i: + count = len(i) + numbers.append(i) + else: + start, end = i.split("-") + count = len(start) + + for n in range(int(start), int(end) + 1): + numbers.append(f"{n:0{count}d}") + + return numbers + + +def expand_node_list(node_list): + nodes = [] + s = 0 + + while s < len(node_list): + if node_list[s] == ",": + s += 1 + + next = node_list.find(",", s) + range_start = node_list.find("[", s) + range_end = node_list.find("]", s) + + # Found a range + if range_start != -1 and (next == -1 or range_start < next): + node_name = node_list[s:range_start] + + range = node_list[range_start + 1 : range_end] + + for i in expand_range(range): + nodes.append(f"{node_name}{i}") + + # eat the ] + s = range_end + 1 + + else: + if next == -1: + next = len(node_list) + + node_name = node_list[s:next] + nodes.append(node_name) + + # eat the , + s = next + 1 + + return 
nodes diff --git a/milabench/schedule.py b/milabench/schedule.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/milabench/scripts/milabench_run.bash b/milabench/scripts/milabench_run.bash index e37200ab3..bc4b9c516 100755 --- a/milabench/scripts/milabench_run.bash +++ b/milabench/scripts/milabench_run.bash @@ -7,7 +7,7 @@ function usage() { echo "Usage: $0 [-m] [-p]" echo " -h Display this help message." - echo " -b arch GPU arch (default: cuda)" + echo " -a arch GPU arch (default: cuda)" echo " -b BRANCH Branch to checkout (default: master)" echo " -o ORIGIN Origin to use (default: github/mila/milabench)" echo " -c CONFIG Configuration (default: milabench/config/standard.yaml)" @@ -17,64 +17,70 @@ function usage() { exit 1 } -function parse_args() { - ARCH="cuda" - PYTHON="3.9" - BRANCH="master" - ORIGIN="https://github.com/mila-iqia/milabench.git" - LOC="$SLURM_TMPDIR" - CONFIG="$LOC/milabench/config/standard.yaml" - BASE="$LOC/base" - ENV="./env" - REMAINING_ARGS="" - - while getopts ":hm:p:e:b:o:c:" opt; do - case $opt in - h) +ARCH="cuda" +PYTHON="3.9" +BRANCH="master" +ORIGIN="https://github.com/mila-iqia/milabench.git" +LOC="$SLURM_TMPDIR/$SLURM_JOB_ID" +CONFIG="$LOC/milabench/config/standard.yaml" +BASE="$LOC/base" +ENV="./env" +REMAINING_ARGS="" +FUN="run" + +while getopts ":hm:p:e:b:o:c:f:" opt; do + case $opt in + h) + usage + ;; + f) + FUN="$OPTARG" + ;; + p) + PYTHON="$OPTARG" + ;; + b) + BRANCH="$OPTARG" + ;; + o) + ORIGIN="$OPTARG" + ;; + c) + CONFIG="$OPTARG" + ;; + e) + ENV="$OPTARG" + ;; + a) + ARCH="$OPTARG" + ;; + l) + # FIX ME + LOC="$OPTARG" + CONFIG="$LOC/milabench/config/standard.yaml" + BASE="$LOC/base" + ;; + :) + echo "Option -$OPTARG requires an argument." >&2 usage ;; - p) - PYTHON="$OPTARG" - ;; - b) - BRANCH="$OPTARG" - ;; - o) - ORIGIN="$OPTARG" - ;; - c) - CONFIG="$OPTARG" - ;; - e) - ENV="$OPTARG" - ;; - a) - ARCH="$OPTARG" - ;; - l) - # FIX ME - LOC="$OPTARG" - CONFIG="$LOC/milabench/config/standard.yaml" - BASE="$LOC/base" - ;; - :) - echo "Option -$OPTARG requires an argument." 
>&2 - usage - ;; - esac - done - - shift "$((OPTIND-1))" - REMAINING_ARGS="$@" - - echo " PYTHON: $PYTHON" - echo " branch: $BRANCH" - echo " origin: $ORIGIN" - echo " config: $CONFIG" - echo " env: $ENV" - echo " args: $REMAINING_ARGS" + esac +done + +shift "$((OPTIND-1))" +REMAINING_ARGS="$@" + +echo " PYTHON: $PYTHON" +echo " branch: $BRANCH" +echo " origin: $ORIGIN" +echo " config: $CONFIG" +echo " env: $ENV" +echo " args: $REMAINING_ARGS" +echo " loc: $LOC" + +mkdir -p $LOC +cd $LOC -} function conda_env() { # @@ -112,15 +118,17 @@ function setup() { # # Fetch the repo # + cd $LOC git clone --single-branch --depth 1 -b $BRANCH $ORIGIN python -m pip install -e ./milabench - + ( + cd milabench + git status + ) SYSTEM="$LOC/system.yaml" } function pin() { - parse_args - conda_env setup @@ -138,8 +146,7 @@ function pin() { } function run() { - parse_args - + conda_env setup @@ -148,8 +155,8 @@ function run() { echo "System" echo "------" - milabench slurm_system milabench slurm_system > $SYSTEM + cat $SYSTEM module load gcc/9.3.0 module load cuda/11.8 @@ -178,4 +185,13 @@ function run() { echo "----" echo "Done after $SECONDS" echo "" -} \ No newline at end of file +} + +case "$FUN" in + run) + run + ;; + pin) + pin + ;; +esac \ No newline at end of file diff --git a/milabench/slurm.py b/milabench/slurm.py deleted file mode 100644 index cadf0f73f..000000000 --- a/milabench/slurm.py +++ /dev/null @@ -1,53 +0,0 @@ -def expand_range(s): - numbers = [] - count = 0 - - for i in s.split(","): - if "-" not in i: - count = len(i) - numbers.append(i) - else: - start, end = i.split("-") - count = len(start) - - for n in range(int(start), int(end) + 1): - numbers.append(f"{n:0{count}d}") - - return numbers - - -def expand_node_list(node_list): - nodes = [] - s = 0 - - while s < len(node_list): - if node_list[s] == ",": - s += 1 - - next = node_list.find(",", s) - range_start = node_list.find("[", s) - range_end = node_list.find("]", s) - - # Found a range - if range_start != -1 and (next == -1 or range_start < next): - node_name = node_list[s:range_start] - - range = node_list[range_start + 1 : range_end] - - for i in expand_range(range): - nodes.append(f"{node_name}{i}") - - # eat the ] - s = range_end + 1 - - else: - if next == -1: - next = len(node_list) - - node_name = node_list[s:next] - nodes.append(node_name) - - # eat the , - s = next + 1 - - return nodes diff --git a/milabench/system.py b/milabench/system.py index 45379e25f..a50c4ab82 100644 --- a/milabench/system.py +++ b/milabench/system.py @@ -2,6 +2,7 @@ import os import socket from dataclasses import dataclass, field +import sys import psutil import yaml @@ -30,7 +31,7 @@ def print_once(*args, **kwargs): def _print(): nonlocal printed if printed == 0: - print(*args, **kwargs) + print(*args, **kwargs, file=sys.stderr) printed += 1 return _print @@ -181,6 +182,19 @@ def _resolve_ip(ip): return hostname, aliaslist, ipaddrlist, lazy_raise +def _fix_weird(hostname): + if hostname.endswith(".server.mila.quebec.server.mila.quebec"): + print() + print("Hostname was extra long for no reason") + print(hostname, socket.gethostname()) + print() + + # why is this happening + hostname = hostname[: -len(".server.mila.quebec")] + + return hostname + + def resolve_addresses(nodes): # Note: it is possible for self to be none # if we are running milabench on a node that is not part of the system @@ -193,24 +207,20 @@ def resolve_addresses(nodes): for node in nodes: hostname, aliaslist, ipaddrlist, lazy_raise = _resolve_ip(node["ip"]) + hostname = 
_fix_weird(hostname) + node["hostname"] = hostname node["aliaslist"] = aliaslist node["ipaddrlist"] = ipaddrlist - if hostname.endswith(".server.mila.quebec.server.mila.quebec"): - print() - print("Hostname was extra long for no reason") - print(hostname, socket.gethostname()) - print() - - # why is this happening - hostname = hostname[: -len(".server.mila.quebec")] - is_local = ( ("127.0.0.1" in ipaddrlist) or (hostname in ("localhost", socket.gethostname())) + or (socket.gethostname().startswith(hostname)) or len(ip_list.intersection(ipaddrlist)) > 0 ) + # cn-g005 cn-g005.server.mila.quebec + print(hostname, socket.gethostname()) node["local"] = is_local if is_local: @@ -232,7 +242,7 @@ def get_gpu_capacity(strict=False): for k, v in get_gpu_info()["gpus"].items(): capacity = min(v["memory"]["total"], capacity) - return capacity + return int(capacity) except: print("GPU not available, defaulting to 0 MiB") if strict:
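
Example usage (a sketch, not part of the patch itself): assuming the cli_schedule entry point in milabench/cli/schedule.py is exposed as "milabench schedule", the new --profile option selects one of the sbatch argument lists defined in config/slurm.yaml, falling back to the "single-node-small" profile when the requested name is not found.

    # print the generated sbatch command without submitting it
    milabench schedule --profile multi-node-full --dry

    # submit using the small single-node profile (also the fallback default)
    milabench schedule --profile single-node-small

Any remaining arguments are appended after the sbatch script (milabench/scripts/milabench_run.bash), which now dispatches on its -f flag to either its run or pin function (run by default, as set in SetupOptions).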