From 67047d6f4634ac37ad861b53e35afb088a05dcc2 Mon Sep 17 00:00:00 2001
From: "pierre.delaunay" <delaunap@rtx5.server.mila.quebec>
Date: Tue, 23 Jul 2024 13:58:00 -0400
Subject: [PATCH] Fix TIMM benchmarks

---
 benchmarks/timm/benchfile.py    | 11 +++++--
 benchmarks/timm/voirfile.py     |  9 ++----
 benchmate/benchmate/benchrun.py | 53 +++++++++++++++++++++++++++++++++
 benchmate/benchmate/metrics.py  |  1 +
 benchmate/pyproject.toml        |  4 +++
 milabench/commands/__init__.py  | 53 +++++++++++++++++++++++----------
 milabench/commands/executors.py |  2 ++
 scripts/article/run_cuda_dev.sh |  2 ++
 8 files changed, 111 insertions(+), 24 deletions(-)
 create mode 100644 benchmate/benchmate/benchrun.py

diff --git a/benchmarks/timm/benchfile.py b/benchmarks/timm/benchfile.py
index cb93787d6..7e9220046 100644
--- a/benchmarks/timm/benchfile.py
+++ b/benchmarks/timm/benchfile.py
@@ -39,9 +39,14 @@ async def install(self):
         await super().install()
 
     def build_run_plan(self):
-        # self.config is not the right config for this
-        plan = super().build_run_plan()
-        return TorchRunCommand(plan, use_stdout=True)
+        import milabench.commands as cmd
+        main = self.dirs.code / self.main_script
+
+        # torchrun ... -m voir ... train_script ...
+        return TorchRunCommand(
+            cmd.VoirCommand(cmd.PackCommand(self, *self.argv, lazy=True), cwd=main.parent, module=True),
+            module=True
+        )
 
 
 __pack__ = TimmBenchmarkPack
diff --git a/benchmarks/timm/voirfile.py b/benchmarks/timm/voirfile.py
index 96922d144..b1b33cc49 100644
--- a/benchmarks/timm/voirfile.py
+++ b/benchmarks/timm/voirfile.py
@@ -35,16 +35,13 @@ def instrument_main(ov, options: Config):
     import torchcompat.core as accelerator
     from benchmate.observer import BenchObserver
 
-    from timm.utils.distributed import is_global_primary
-
     observer = BenchObserver(
         accelerator.Event, 
         earlystop=options.stop + options.skip,
-        rank=int(os.getenv("RANK", 0)),
-        device=accelerator.fetch_device(int(os.getenv("RANK", 0))),
         backward_callback=accelerator.mark_step,
         step_callback=accelerator.mark_step,
-        batch_size_fn=lambda x: len(x[0])
+        batch_size_fn=lambda x: len(x[0]),
+        stdout=False,
     )
 
     probe = ov.probe("/timm.data.loader/create_loader() as loader", overridable=True)
@@ -71,7 +68,7 @@ def instrument_main(ov, options: Config):
         monitor(poll_interval=options.gpu_poll) 
     ] 
 
-    if is_global_primary:
+    if int(os.getenv("RANK", 0)) == 0:
         instruments.append(early_stop(n=options.stop, key="rate", task="train", signal="stop"))
 
     ov.require(*instruments)
diff --git a/benchmate/benchmate/benchrun.py b/benchmate/benchmate/benchrun.py
new file mode 100644
index 000000000..80c56758a
--- /dev/null
+++ b/benchmate/benchmate/benchrun.py
@@ -0,0 +1,53 @@
+#!/usr/bin/env python3
+
+import os
+import subprocess
+
+from torch.distributed.run import main as torchrun
+import torch.distributed.elastic.multiprocessing.api as elastic
+import torch.distributed.elastic.multiprocessing.subprocess_handler as sub
+
+
+class NewSubprocessHandler(sub.SubprocessHandler):  # SubprocessHandler variant whose _popen can forward one extra fd
+    def _popen(self, args, env) -> subprocess.Popen:  # start the child process and return its Popen object
+        kwargs = {}  # optional Popen arguments, filled in only when DATA_FD is set
+
+        if fd := os.getenv("DATA_FD"):  # DATA_FD carries a file descriptor number as text
+            kwargs["pass_fds"] = [int(fd)]  # ask Popen to keep that descriptor open in the child
+
+        return subprocess.Popen(
+            args=args,
+            env=env,
+            stdout=self._stdout,  # NOTE(review): presumably set by the base class __init__ - confirm
+            stderr=self._stderr,
+            **kwargs,
+        )
+
+def get_subprocess_handler(  # factory: builds a NewSubprocessHandler from the given arguments
+    entrypoint: str,
+    args,
+    env,
+    stdout: str,
+    stderr: str,
+    local_rank_id: int,
+):
+    return NewSubprocessHandler(  # every argument is forwarded unchanged, by keyword
+        entrypoint=entrypoint,
+        args=args,
+        env=env,
+        stdout=stdout,
+        stderr=stderr,
+        local_rank_id=local_rank_id,
+    )
+
+
+def main(args=None):  # benchrun entry point: patch the elastic module, then run torchrun
+    elastic.get_subprocess_handler = get_subprocess_handler  # swap in the factory defined in this file
+    elastic.SubprocessHandler = NewSubprocessHandler  # and the handler class that factory returns
+
+    torchrun(args)  # args is passed straight through (None by default)
+
+
+
+if __name__ == "__main__":
+    main()
diff --git a/benchmate/benchmate/metrics.py b/benchmate/benchmate/metrics.py
index 360291b19..e1ca5480c 100644
--- a/benchmate/benchmate/metrics.py
+++ b/benchmate/benchmate/metrics.py
@@ -39,6 +39,7 @@ def give_push():
     if ov is not None:
         return ov.give
 
+    print("No overseer found")
     return file_push()
 
 
diff --git a/benchmate/pyproject.toml b/benchmate/pyproject.toml
index 3b59a17ee..a5fd339d9 100644
--- a/benchmate/pyproject.toml
+++ b/benchmate/pyproject.toml
@@ -22,3 +22,7 @@ combine_as_imports = true
 
 [tool.poetry-git-version-plugin]
 alpha_version_format = '{version}a{distance}+{commit_hash}'
+
+
+[tool.poetry.scripts]
+benchrun = "benchmate.benchrun:main"
\ No newline at end of file
diff --git a/milabench/commands/__init__.py b/milabench/commands/__init__.py
index d4ae64861..399864fca 100644
--- a/milabench/commands/__init__.py
+++ b/milabench/commands/__init__.py
@@ -506,36 +506,50 @@ def _argv(self, **kwargs) -> List:
 
 
 class TorchrunAllGPU(WrapperCommand):
-    def __init__(self, executor: SingleCmdCommand, *torchrun_argv, **kwargs) -> None:
+    def __init__(self, executor: SingleCmdCommand, *torchrun_argv, module=False, **kwargs) -> None:
         # Some vendors force us to have weird venv that can resolve weirdly
         # use absolute paths to avoid issues
 
         binfolder = executor.pack.config["dirs"]["venv"]
+        self.module=module
+
+        # benchrun is a wrapper around torchrun
+        # which inserts the voir file descriptor
         super().__init__(
-            executor, f"{binfolder}/bin/torchrun", *torchrun_argv, **kwargs
+            executor, f"{binfolder}/bin/benchrun", *torchrun_argv, **kwargs
         )
 
     def _argv(self, **kwargs):
         devices = self.pack.config.get("devices", [])
         nproc = len(devices)
         if nproc > 1:
-            argv = [*super()._argv(**kwargs), f"--nproc-per-node={nproc}"]
+            # spawn,fork,forkserver
+            argv = [
+                *super()._argv(**kwargs), 
+                f"--nproc-per-node={nproc}", 
+                # "--start-method=forkserver"
+            ]
 
             # Check if the sub-executor targets a module or not
             cmd = next(iter(self.exec.argv()), None)
 
-            if cmd:
-                # python or voir; tell it to not prepend python since we are doing it
-                if cmd in ("python", "voir"):
-                    argv.append("--no-python")
+            if self.module:
+                argv.append("-m")
+
+            else:
+                if cmd:
+                    # python or voir; tell it to not prepend python since we are doing it
+                    if cmd in ("python", "voir"):
+                        argv.append("--no-python")
 
-                # if the command exists and it is not a path assume it is a module
-                # script is not a file, maybe it is a module
-                elif not XPath(cmd).exists():
-                    argv.append("-m")
+                    # if the command exists and it is not a path assume it is a module
+                    # script is not a file, maybe it is a module
+                    elif not XPath(cmd).exists():
+                        argv.append("-m")
 
-            # everything after torchrun args are script args
-            argv.append("--")
+                # everything after torchrun args are script args
+                argv.append("--")
+            
             return argv
         return []
 
@@ -665,14 +679,23 @@ class VoirCommand(WrapperCommand):
         executor: `Command` to be executed
         *voir_argv: voir command line arguments list
         **kwargs: kwargs to be passed to the `pack.execute()`
+        module: bool, use the voir module instead of the voir wrapper script.
+            This is useful for torchrun: when a module is used,
+            the main torchrun process can be reused for rank=0, enabling
+            voir to work using its file descriptor.
     """
 
-    def __init__(self, executor: SingleCmdCommand, *voir_argv, **kwargs) -> None:
+    def __init__(self, executor: SingleCmdCommand, *voir_argv, module=False, **kwargs) -> None:
         # Some vendors force us to have weird venv that can resolve weirdly
         # use absolute paths to avoid issues
         binfolder = executor.pack.config["dirs"]["venv"]
+        voir = f"{binfolder}/bin/voir"
+
+        if module:
+            voir = "voir"
+
         super().__init__(
-            executor, f"{binfolder}/bin/voir", **{"setsid": True, **kwargs}
+            executor, voir, **{"setsid": True, **kwargs}
         )
         self.voir_argv = voir_argv
 
diff --git a/milabench/commands/executors.py b/milabench/commands/executors.py
index 1ac456639..4ca96b80a 100644
--- a/milabench/commands/executors.py
+++ b/milabench/commands/executors.py
@@ -32,6 +32,8 @@ async def execute(pack, *args, cwd=None, env={}, external=False, use_stdout=Fals
     sized_args = scale_argv(pack, args)
     final_args = resolve_argv(pack, sized_args)
 
+    print(cwd, use_stdout)
+
     return await run(
         final_args,
         **kwargs,
diff --git a/scripts/article/run_cuda_dev.sh b/scripts/article/run_cuda_dev.sh
index dffe620f5..1144386c6 100644
--- a/scripts/article/run_cuda_dev.sh
+++ b/scripts/article/run_cuda_dev.sh
@@ -94,9 +94,11 @@ fi
 (
     . $MILABENCH_WORDIR/env/bin/activate
     pip install -e $MILABENCH_WORDIR/voir
+    pip install -e $MILABENCH_SOURCE/benchmate
 
     . $BENCHMARK_VENV/bin/activate
     pip install -e $MILABENCH_WORDIR/voir
+    pip install -e $MILABENCH_SOURCE/benchmate
 )
 
 if [ "$MILABENCH_PREPARE" -eq 0 ]; then