From 67047d6f4634ac37ad861b53e35afb088a05dcc2 Mon Sep 17 00:00:00 2001 From: "pierre.delaunay" Date: Tue, 23 Jul 2024 13:58:00 -0400 Subject: [PATCH] Fix TIMM benchmarks --- benchmarks/timm/benchfile.py | 11 +++++-- benchmarks/timm/voirfile.py | 9 ++---- benchmate/benchmate/benchrun.py | 53 +++++++++++++++++++++++++++++++++ benchmate/benchmate/metrics.py | 1 + benchmate/pyproject.toml | 4 +++ milabench/commands/__init__.py | 53 +++++++++++++++++++++++---------- milabench/commands/executors.py | 2 ++ scripts/article/run_cuda_dev.sh | 2 ++ 8 files changed, 111 insertions(+), 24 deletions(-) create mode 100644 benchmate/benchmate/benchrun.py diff --git a/benchmarks/timm/benchfile.py b/benchmarks/timm/benchfile.py index cb93787d6..7e9220046 100644 --- a/benchmarks/timm/benchfile.py +++ b/benchmarks/timm/benchfile.py @@ -39,9 +39,14 @@ async def install(self): await super().install() def build_run_plan(self): - # self.config is not the right config for this - plan = super().build_run_plan() - return TorchRunCommand(plan, use_stdout=True) + import milabench.commands as cmd + main = self.dirs.code / self.main_script + + # torchrun ... -m voir ... train_script ... 
+ return TorchRunCommand( + cmd.VoirCommand(cmd.PackCommand(self, *self.argv, lazy=True), cwd=main.parent, module=True), + module=True + ) __pack__ = TimmBenchmarkPack diff --git a/benchmarks/timm/voirfile.py b/benchmarks/timm/voirfile.py index 96922d144..b1b33cc49 100644 --- a/benchmarks/timm/voirfile.py +++ b/benchmarks/timm/voirfile.py @@ -35,16 +35,13 @@ def instrument_main(ov, options: Config): import torchcompat.core as accelerator from benchmate.observer import BenchObserver - from timm.utils.distributed import is_global_primary - observer = BenchObserver( accelerator.Event, earlystop=options.stop + options.skip, - rank=int(os.getenv("RANK", 0)), - device=accelerator.fetch_device(int(os.getenv("RANK", 0))), backward_callback=accelerator.mark_step, step_callback=accelerator.mark_step, - batch_size_fn=lambda x: len(x[0]) + batch_size_fn=lambda x: len(x[0]), + stdout=False, ) probe = ov.probe("/timm.data.loader/create_loader() as loader", overridable=True) @@ -71,7 +68,7 @@ def instrument_main(ov, options: Config): monitor(poll_interval=options.gpu_poll) ] - if is_global_primary: + if int(os.getenv("RANK", 0)) == 0: instruments.append(early_stop(n=options.stop, key="rate", task="train", signal="stop")) ov.require(*instruments) diff --git a/benchmate/benchmate/benchrun.py b/benchmate/benchmate/benchrun.py new file mode 100644 index 000000000..80c56758a --- /dev/null +++ b/benchmate/benchmate/benchrun.py @@ -0,0 +1,53 @@ +#!/usr/bin/env python3 + +import os +import subprocess + +from torch.distributed.run import main as torchrun +import torch.distributed.elastic.multiprocessing.api as elastic +import torch.distributed.elastic.multiprocessing.subprocess_handler as sub + + +class NewSubprocessHandler(sub.SubprocessHandler): + def _popen(self, args, env) -> subprocess.Popen: + kwargs = {} + + if fd := os.getenv("DATA_FD"): + kwargs["pass_fds"] = [int(fd)] + + return subprocess.Popen( + args=args, + env=env, + stdout=self._stdout, + stderr=self._stderr, + **kwargs, + 
) + +def get_subprocess_handler( + entrypoint: str, + args, + env, + stdout: str, + stderr: str, + local_rank_id: int, +): + return NewSubprocessHandler( + entrypoint=entrypoint, + args=args, + env=env, + stdout=stdout, + stderr=stderr, + local_rank_id=local_rank_id, + ) + + +def main(args=None): + elastic.get_subprocess_handler = get_subprocess_handler + elastic.SubprocessHandler = NewSubprocessHandler + + torchrun(args) + + + +if __name__ == "__main__": + main() diff --git a/benchmate/benchmate/metrics.py b/benchmate/benchmate/metrics.py index 360291b19..e1ca5480c 100644 --- a/benchmate/benchmate/metrics.py +++ b/benchmate/benchmate/metrics.py @@ -39,6 +39,7 @@ def give_push(): if ov is not None: return ov.give + print("No overseer found") return file_push() diff --git a/benchmate/pyproject.toml b/benchmate/pyproject.toml index 3b59a17ee..a5fd339d9 100644 --- a/benchmate/pyproject.toml +++ b/benchmate/pyproject.toml @@ -22,3 +22,7 @@ combine_as_imports = true [tool.poetry-git-version-plugin] alpha_version_format = '{version}a{distance}+{commit_hash}' + + +[tool.poetry.scripts] +benchrun = "benchmate.benchrun:main" \ No newline at end of file diff --git a/milabench/commands/__init__.py b/milabench/commands/__init__.py index d4ae64861..399864fca 100644 --- a/milabench/commands/__init__.py +++ b/milabench/commands/__init__.py @@ -506,36 +506,50 @@ def _argv(self, **kwargs) -> List: class TorchrunAllGPU(WrapperCommand): - def __init__(self, executor: SingleCmdCommand, *torchrun_argv, **kwargs) -> None: + def __init__(self, executor: SingleCmdCommand, *torchrun_argv, module=False, **kwargs) -> None: # Some vendors force us to have weird venv that can resolve weirdly # use absolute paths to avoid issues binfolder = executor.pack.config["dirs"]["venv"] + self.module=module + + # benchrun is a wrapper around torchrun + # which inserts the voir file descriptor super().__init__( - executor, 
f"{binfolder}/bin/benchrun", *torchrun_argv, **kwargs ) def _argv(self, **kwargs): devices = self.pack.config.get("devices", []) nproc = len(devices) if nproc > 1: - argv = [*super()._argv(**kwargs), f"--nproc-per-node={nproc}"] + # spawn,fork,forkserver + argv = [ + *super()._argv(**kwargs), + f"--nproc-per-node={nproc}", + # "--start-method=forkserver" + ] # Check if the sub-executor targets a module or not cmd = next(iter(self.exec.argv()), None) - if cmd: - # python or voir; tell it to not prepend python since we are doing it - if cmd in ("python", "voir"): - argv.append("--no-python") + if self.module: + argv.append("-m") + + else: + if cmd: + # python or voir; tell it to not prepend python since we are doing it + if cmd in ("python", "voir"): + argv.append("--no-python") - # if the command exists and it is not a path assume it is a module - # script is not a file, maybe it is a module - elif not XPath(cmd).exists(): - argv.append("-m") + # if the command exists and it is not a path assume it is a module + # script is not a file, maybe it is a module + elif not XPath(cmd).exists(): + argv.append("-m") - # everything after torchrun args are script args - argv.append("--") + # everything after torchrun args are script args + argv.append("--") + return argv return [] @@ -665,14 +679,23 @@ class VoirCommand(WrapperCommand): executor: `Command` to be executed *voir_argv: voir command line arguments list **kwargs: kwargs to be passed to the `pack.execute()` + module: bool use voir module instead of voir wrapper. + this is useful for torchrun since when a module is used + the main torchrun process can be reused for rank=0 enabling + voir to work using file descriptor. 
""" - def __init__(self, executor: SingleCmdCommand, *voir_argv, **kwargs) -> None: + def __init__(self, executor: SingleCmdCommand, *voir_argv, module=False, **kwargs) -> None: # Some vendors force us to have weird venv that can resolve weirdly # use absolute paths to avoid issues binfolder = executor.pack.config["dirs"]["venv"] + voir = f"{binfolder}/bin/voir" + + if module: + voir = "voir" + super().__init__( - executor, f"{binfolder}/bin/voir", **{"setsid": True, **kwargs} + executor, voir, **{"setsid": True, **kwargs} ) self.voir_argv = voir_argv diff --git a/milabench/commands/executors.py b/milabench/commands/executors.py index 1ac456639..4ca96b80a 100644 --- a/milabench/commands/executors.py +++ b/milabench/commands/executors.py @@ -32,6 +32,8 @@ async def execute(pack, *args, cwd=None, env={}, external=False, use_stdout=Fals sized_args = scale_argv(pack, args) final_args = resolve_argv(pack, sized_args) + print(cwd, use_stdout) + return await run( final_args, **kwargs, diff --git a/scripts/article/run_cuda_dev.sh b/scripts/article/run_cuda_dev.sh index dffe620f5..1144386c6 100644 --- a/scripts/article/run_cuda_dev.sh +++ b/scripts/article/run_cuda_dev.sh @@ -94,9 +94,11 @@ fi ( . $MILABENCH_WORDIR/env/bin/activate pip install -e $MILABENCH_WORDIR/voir + pip install -e $MILABENCH_SOURCE/benchmate . $BENCHMARK_VENV/bin/activate pip install -e $MILABENCH_WORDIR/voir + pip install -e $MILABENCH_SOURCE/benchmate ) if [ "$MILABENCH_PREPARE" -eq 0 ]; then