From 29e519b94737824663bb4cbb8d7cbec081059769 Mon Sep 17 00:00:00 2001
From: "pierre.delaunay"
Date: Fri, 26 Jul 2024 11:46:54 -0400
Subject: [PATCH] Always set OMP_NUM_THREADS

Set OMP_NUM_THREADS once in the base Package.make_env, resolved from the
{cpu_per_gpu} placeholder, instead of duplicating the override in every
benchmark's benchfile. Also make GPU monitoring rank-aware and rename the
multi-GPU/multi-node configs to the -gpus/-nodes suffix convention.
---
 benchmarks/accelerate_opt/benchfile.py |  6 ----
 benchmarks/diffusion/benchfile.py      |  6 ----
 benchmarks/dinov2/benchfile.py         |  5 +--
 benchmarks/dinov2/dev.yaml             | 10 ------
 benchmarks/lightning/benchfile.py      |  6 ----
 benchmarks/llama/benchfile.py          |  6 ----
 benchmarks/timm/benchfile.py           |  6 ----
 benchmate/benchmate/monitor.py         | 24 +++++++++++---
 config/base.yaml                       | 46 ++++++++++++++++++++++----
 milabench/pack.py                      |  4 +++
 milabench/sizer.py                     |  5 +++
 11 files changed, 68 insertions(+), 56 deletions(-)

diff --git a/benchmarks/accelerate_opt/benchfile.py b/benchmarks/accelerate_opt/benchfile.py
index 746ee5f00..23ef7aba8 100644
--- a/benchmarks/accelerate_opt/benchfile.py
+++ b/benchmarks/accelerate_opt/benchfile.py
@@ -12,12 +12,6 @@ class AccelerateBenchmark(Package):
     base_requirements = "requirements.in"
 
-    def make_env(self):
-        env = super().make_env()
-        value = self.resolve_argument("--cpus_per_gpu", 8)
-        env["OMP_NUM_THREADS"] = str(value)
-        return env
-
     def build_prepare_plan(self):
         return CmdCommand(
             self,
diff --git a/benchmarks/diffusion/benchfile.py b/benchmarks/diffusion/benchfile.py
index ed2614fbb..2458070ce 100644
--- a/benchmarks/diffusion/benchfile.py
+++ b/benchmarks/diffusion/benchfile.py
@@ -17,12 +17,6 @@ class Diffusion(Package):
 
     # You can remove the functions below if you don't need to modify them.
 
-    def make_env(self):
-        return {
-            **super().make_env(),
-            "OMP_NUM_THREADS": str(self.config.get("cpus_per_gpu", 8)),
-        }
-
     async def install(self):
         await super().install()  # super() call installs the requirements
 
diff --git a/benchmarks/dinov2/benchfile.py b/benchmarks/dinov2/benchfile.py
index 901c146ec..ddfc4bc06 100644
--- a/benchmarks/dinov2/benchfile.py
+++ b/benchmarks/dinov2/benchfile.py
@@ -28,10 +28,7 @@ def working_directory(self):
     def make_env(self):
         # Return a dict of environment variables for prepare_script and
         # main_script.
-        return {
-            "OMP_NUM_THREADS": str(8),
-            **super().make_env()
-        }
+        return super().make_env()
 
     async def install(self):
         await super().install()
diff --git a/benchmarks/dinov2/dev.yaml b/benchmarks/dinov2/dev.yaml
index bef609deb..6868b18a6 100644
--- a/benchmarks/dinov2/dev.yaml
+++ b/benchmarks/dinov2/dev.yaml
@@ -11,16 +11,6 @@ _dinov2:
     --output-dir: "{milabench_extra}/output"
     --no-resume: true
 
-dinov2-large:
-  inherits: _dinov2
-  argv:
-    --config-file: src/dinov2/configs/train/vitl14.yaml
-    # THOSE NEED TO BE LAST
-    train.dataset_path=ImageNet:split=TRAIN:root={milabench_data}/FakeImageNet:extra={milabench_data}/FakeImageNet: true
-    train.batch_size_per_gpu=32: true
-    train.saveckp_freq=100: true
-    train.num_workers=10: true
-
 dinov2-giant:
   inherits: _dinov2
 
diff --git a/benchmarks/lightning/benchfile.py b/benchmarks/lightning/benchfile.py
index 09926711f..8e2a4cf81 100644
--- a/benchmarks/lightning/benchfile.py
+++ b/benchmarks/lightning/benchfile.py
@@ -7,12 +7,6 @@ class LightningBenchmark(Package):
     prepare_script = "prepare.py"
     main_script = "main.py"
 
-    def make_env(self):
-        return {
-            **super().make_env(),
-            "OMP_NUM_THREADS": str(self.config.get("cpus_per_gpu", 8)),
-        }
-
     def build_run_plan(self):
         # self.config is not the right config for this
         plan = super().build_run_plan()
diff --git a/benchmarks/llama/benchfile.py b/benchmarks/llama/benchfile.py
index b7bc0032e..977e825f5 100644
--- a/benchmarks/llama/benchfile.py
+++ b/benchmarks/llama/benchfile.py
@@ -6,12 +6,6 @@ class LLAMA(Package):
     base_requirements = "requirements.in"
     main_script = "main.py"
 
-    def make_env(self):
-        return {
-            **super().make_env(),
-            "OMP_NUM_THREADS": str(self.config.get("cpus_per_gpu", 8)),
-        }
-
     async def install(self):
         await super().install()
 
diff --git a/benchmarks/timm/benchfile.py b/benchmarks/timm/benchfile.py
index 94be19e6b..52a31ba1d 100644
--- a/benchmarks/timm/benchfile.py
+++ b/benchmarks/timm/benchfile.py
@@ -12,12 +12,6 @@ class TimmBenchmarkPack(Package):
     @property
     def working_directory(self):
         return self.dirs.code / "pytorch-image-models"
-
-    def make_env(self):
-        return {
-            **super().make_env(),
-            "OMP_NUM_THREADS": str(self.config.get("cpus_per_gpu", 8)),
-        }
 
     @property
     def argv(self):
diff --git a/benchmate/benchmate/monitor.py b/benchmate/benchmate/monitor.py
index 7064edb72..a2dc2a4a0 100644
--- a/benchmate/benchmate/monitor.py
+++ b/benchmate/benchmate/monitor.py
@@ -136,6 +136,13 @@ def milabench_sys_monitor(monogpu=False):
 
 
+def get_rank():
+    try:
+        return int(os.getenv("RANK", -1))
+    except ValueError:
+        return -1
+
+
 def voirfile_monitor(ov, options):
     from voir.instruments import early_stop, log, dash
 
@@ -148,11 +155,18 @@ def voirfile_monitor(ov, options):
         )
     ]
 
-    if int(os.getenv("RANK", 0)) == 0:
-        instruments.append(early_stop(n=options.stop, key="rate", task="train", signal="stop"))
-        instruments.append(monitor_node(poll_interval=options.gpu_poll))
+    rank = get_rank()
 
-    if os.getenv("RANK", -1) == -1:
+    # ranks -1 (unset) and 0 both run the early-stop instrument
+    if rank <= 0:
+        instruments.append(early_stop(n=options.stop, key="rate", task="train", signal="stop"))
+
+    # RANK unset means a mono-GPU run: monitor the single local GPU
+    if rank == -1:
         instruments.append(monitor_monogpu(poll_interval=options.gpu_poll))
-
+
+    # when RANK is set, only the main rank monitors the whole node
+    if rank == 0:
+        instruments.append(monitor_node(poll_interval=options.gpu_poll))
+
     ov.require(*instruments)
diff --git a/config/base.yaml b/config/base.yaml
index e47a78648..9cc2cca97 100644
--- a/config/base.yaml
+++ b/config/base.yaml
@@ -226,7 +226,7 @@ resnet50-noio:
     --batch-size: 256
     --loader: synthetic_fixed
 
-resnet152-ddp:
+resnet152-ddp-gpus:
   inherits: _torchvision_ddp
   tags:
     - vision
@@ -391,7 +391,7 @@ resnet152:
     --model: resnet152
     --batch-size: 256
 
-resnet152-multi:
+resnet152-gpus:
   inherits: resnet152
   tags:
     - multigpu
@@ -427,7 +427,7 @@ davit_large:
     --batch-size: 128
     --lr-base: 0.01
 
-davit_large-multi:
+davit_large-gpus:
   inherits: davit_large
   tags:
     - multigpu
@@ -446,7 +446,7 @@ focalnet:
   argv:
     --model: focalnet_base_lrf
 
-opt-1_3b:
+opt-1_3b-gpus:
   inherits: _accelerate_opt
   tags:
     - multigpu
@@ -458,7 +458,7 @@
     use_deepspeed: false
     num_machines: 1
 
-opt-1_3b-multinode:
+opt-1_3b-nodes:
   inherits: opt-1_3b
 
   tags:
@@ -469,7 +469,7 @@
   docker_image: "ghcr.io/mila-iqia/milabench:cuda-nightly"
   num_machines: 2
 
-opt-6_7b:
+opt-6_7b-gpus:
   inherits: _accelerate_opt
   tags:
     - multigpu
@@ -480,7 +480,7 @@
     num_machines: 1
 
 
-opt-6_7b-multinode:
+opt-6_7b-nodes:
   inherits: opt-6_7b
   tags:
     - multinode
@@ -693,3 +693,35 @@ lightning-gpus:
   plan:
     method: njobs
     n: 1
+
+_dinov2:
+  inherits: _defaults
+  definition: ../benchmarks/dinov2
+  install_group: torch
+  plan:
+    method: njobs
+    n: 1
+
+  argv:
+    --output-dir: "{milabench_extra}/output"
+    --no-resume: true
+
+dinov2-large-gpus:
+  inherits: _dinov2
+  argv:
+    --config-file: src/dinov2/configs/train/vitl14.yaml
+    # THESE NEED TO BE LAST
+    train.dataset_path=ImageNet:split=TRAIN:root={milabench_data}/FakeImageNet:extra={milabench_data}/FakeImageNet: true
+    train.batch_size_per_gpu=32: true
+    train.saveckp_freq=100: true
+    train.num_workers=10: true
+
+dinov2-giant-gpus:
+  inherits: _dinov2
+  argv:
+    --config-file: src/dinov2/configs/train/vitg14.yaml
+    # THESE NEED TO BE LAST
+    train.dataset_path=ImageNet:split=TRAIN:root={milabench_data}/FakeImageNet:extra={milabench_data}/FakeImageNet: true
+    train.batch_size_per_gpu=32: true
+    train.saveckp_freq=100: true
+    train.num_workers=10: true
diff --git a/milabench/pack.py b/milabench/pack.py
index 214b4c7e1..60a5df2f7 100644
--- a/milabench/pack.py
+++ b/milabench/pack.py
@@ -329,11 +329,15 @@ def make_env(self):
             "MILABENCH_CONFIG": json.dumps(self.config),
         }
         """
+        from .sizer import resolve_placeholder
+
         env = {
             f"MILABENCH_DIR_{name.upper()}": path
             for name, path in self.config["dirs"].items()
         }
+        env["OMP_NUM_THREADS"] = resolve_placeholder(self, "{cpu_per_gpu}")
+
         env["MILABENCH_CONFIG"] = json.dumps(self.config)
 
         if self.phase == "prepare" or self.phase == "run":
             # XDG_CACHE_HOME controls basically all caches (pip, torch, huggingface,
diff --git a/milabench/sizer.py b/milabench/sizer.py
index cdcb57695..b2d4840f9 100644
--- a/milabench/sizer.py
+++ b/milabench/sizer.py
@@ -367,6 +367,11 @@ def auto_eval(arg):
 
     return auto_eval
 
+def resolve_placeholder(pack, value):
+    resolver = new_argument_resolver(pack)
+    return resolver(value)
+
+
 def resolve_argv(pack, argv):
     resolver = new_argument_resolver(pack)
     argv = list(argv)
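-- 

Reviewer notes. A rough sketch of what the new make_env wiring does: resolve_placeholder feeds "{cpu_per_gpu}" through new_argument_resolver and the result lands in OMP_NUM_THREADS. The names carrying a _sketch suffix are hypothetical, and the division below (visible CPUs split evenly across GPUs) is an assumption for illustration, not necessarily the resolver's exact formula:

    import multiprocessing

    def resolve_placeholder_sketch(template, ngpu):
        # assumed behaviour: split visible CPUs evenly across GPUs
        ncpu = multiprocessing.cpu_count()
        return template.format(cpu_per_gpu=max(1, ncpu // max(1, ngpu)))

    # on a 64-core, 8-GPU node this would print "8"
    print(resolve_placeholder_sketch("{cpu_per_gpu}", ngpu=8))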
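The net effect on every pack can be stated as an invariant; check_omp_env is illustrative (it is not a test added by this patch) and pack stands for any milabench Package instance:

    def check_omp_env(pack):
        env = pack.make_env()
        # the base class now sets OMP_NUM_THREADS for every pack, so no
        # benchfile needs its own override anymore
        assert "OMP_NUM_THREADS" in env
        assert int(env["OMP_NUM_THREADS"]) >= 1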
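The monitor dispatch in voirfile_monitor can be summarized with this standalone sketch. get_rank mirrors the patched helper; pick_instruments and the string labels are stand-ins for the real voir instruments:

    import os

    def get_rank():
        # RANK is set by distributed launchers (e.g. torchrun); absent or
        # malformed means a single-process run
        try:
            return int(os.getenv("RANK", -1))
        except ValueError:
            return -1

    def pick_instruments(rank):
        chosen = []
        if rank <= 0:        # ranks -1 and 0 both run early stopping
            chosen.append("early_stop")
        if rank == -1:       # RANK unset: monitor the single local GPU
            chosen.append("monitor_monogpu")
        if rank == 0:        # main rank: monitor the whole node
            chosen.append("monitor_node")
        return chosen

    assert pick_instruments(-1) == ["early_stop", "monitor_monogpu"]
    assert pick_instruments(0) == ["early_stop", "monitor_node"]
    assert pick_instruments(3) == []   # other ranks run no monitors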