Skip to content

Commit

Permalink
Fix TIMM benchmarks
Browse files Browse the repository at this point in the history
  • Loading branch information
pierre.delaunay committed Jul 23, 2024
1 parent eadc264 commit 67047d6
Show file tree
Hide file tree
Showing 8 changed files with 111 additions and 24 deletions.
11 changes: 8 additions & 3 deletions benchmarks/timm/benchfile.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,9 +39,14 @@ async def install(self):
await super().install()

def build_run_plan(self):
# self.config is not the right config for this
plan = super().build_run_plan()
return TorchRunCommand(plan, use_stdout=True)
import milabench.commands as cmd
main = self.dirs.code / self.main_script

# torchrun ... -m voir ... train_script ...
return TorchRunCommand(
cmd.VoirCommand(cmd.PackCommand(self, *self.argv, lazy=True), cwd=main.parent, module=True),
module=True
)


__pack__ = TimmBenchmarkPack
9 changes: 3 additions & 6 deletions benchmarks/timm/voirfile.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,16 +35,13 @@ def instrument_main(ov, options: Config):
import torchcompat.core as accelerator
from benchmate.observer import BenchObserver

from timm.utils.distributed import is_global_primary

observer = BenchObserver(
accelerator.Event,
earlystop=options.stop + options.skip,
rank=int(os.getenv("RANK", 0)),
device=accelerator.fetch_device(int(os.getenv("RANK", 0))),
backward_callback=accelerator.mark_step,
step_callback=accelerator.mark_step,
batch_size_fn=lambda x: len(x[0])
batch_size_fn=lambda x: len(x[0]),
stdout=False,
)

probe = ov.probe("/timm.data.loader/create_loader() as loader", overridable=True)
Expand All @@ -71,7 +68,7 @@ def instrument_main(ov, options: Config):
monitor(poll_interval=options.gpu_poll)
]

if is_global_primary:
if int(os.getenv("RANK", 0)) == 0:
instruments.append(early_stop(n=options.stop, key="rate", task="train", signal="stop"))

ov.require(*instruments)
Expand Down
53 changes: 53 additions & 0 deletions benchmate/benchmate/benchrun.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
#!/usr/bin/env python3

import os
import subprocess

from torch.distributed.run import main as torchrun
import torch.distributed.elastic.multiprocessing.api as elastic
import torch.distributed.elastic.multiprocessing.subprocess_handler as sub


class NewSubprocessHandler(sub.SubprocessHandler):
def _popen(self, args, env) -> subprocess.Popen:
kwargs = {}

if fd := os.getenv("DATA_FD"):
kwargs["pass_fds"] = [int(fd)]

return subprocess.Popen(
args=args,
env=env,
stdout=self._stdout,
stderr=self._stderr,
**kwargs,
)

def get_subprocess_handler(
    entrypoint: str,
    args,
    env,
    stdout: str,
    stderr: str,
    local_rank_id: int,
):
    """Build the subprocess handler torchrun uses for one local rank.

    Drop-in replacement for torch's ``get_subprocess_handler`` factory:
    identical signature, but it returns a ``NewSubprocessHandler`` so
    extra file descriptors can be forwarded to the worker.
    """
    handler_kwargs = dict(
        entrypoint=entrypoint,
        args=args,
        env=env,
        stdout=stdout,
        stderr=stderr,
        local_rank_id=local_rank_id,
    )
    return NewSubprocessHandler(**handler_kwargs)


def main(args=None):
    """Entry point: run torchrun with our subprocess handler patched in.

    The elastic multiprocessing module is monkey-patched (both the
    factory function and the handler class) so that every local-rank
    worker is spawned through ``NewSubprocessHandler``, then control is
    handed to ``torch.distributed.run.main``.
    """
    for attr, replacement in (
        ("get_subprocess_handler", get_subprocess_handler),
        ("SubprocessHandler", NewSubprocessHandler),
    ):
        setattr(elastic, attr, replacement)

    torchrun(args)



# Allow direct execution (`python benchrun.py`); the `benchrun` console
# script declared in pyproject.toml points at main() as well.
if __name__ == "__main__":
    main()
1 change: 1 addition & 0 deletions benchmate/benchmate/metrics.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@ def give_push():
if ov is not None:
return ov.give

print("No overseer found")
return file_push()


Expand Down
4 changes: 4 additions & 0 deletions benchmate/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -22,3 +22,7 @@ combine_as_imports = true

[tool.poetry-git-version-plugin]
alpha_version_format = '{version}a{distance}+{commit_hash}'


[tool.poetry.scripts]
benchrun = "benchmate.benchrun:main"
53 changes: 38 additions & 15 deletions milabench/commands/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -506,36 +506,50 @@ def _argv(self, **kwargs) -> List:


class TorchrunAllGPU(WrapperCommand):
def __init__(self, executor: SingleCmdCommand, *torchrun_argv, **kwargs) -> None:
def __init__(self, executor: SingleCmdCommand, *torchrun_argv, module=False, **kwargs) -> None:
# Some vendors force us to have weird venv that can resolve weirdly
# use absolute paths to avoid issues

binfolder = executor.pack.config["dirs"]["venv"]
self.module=module

# benchrun is a wrapper around torchrun
# which inserts the voir file descriptor
super().__init__(
executor, f"{binfolder}/bin/torchrun", *torchrun_argv, **kwargs
executor, f"{binfolder}/bin/benchrun", *torchrun_argv, **kwargs
)

def _argv(self, **kwargs):
devices = self.pack.config.get("devices", [])
nproc = len(devices)
if nproc > 1:
argv = [*super()._argv(**kwargs), f"--nproc-per-node={nproc}"]
# spawn,fork,forkserver
argv = [
*super()._argv(**kwargs),
f"--nproc-per-node={nproc}",
# "--start-method=forkserver"
]

# Check if the sub-executor targets a module or not
cmd = next(iter(self.exec.argv()), None)

if cmd:
# python or voir; tell it to not prepend python since we are doing it
if cmd in ("python", "voir"):
argv.append("--no-python")
if self.module:
argv.append("-m")

else:
if cmd:
# python or voir; tell it to not prepend python since we are doing it
if cmd in ("python", "voir"):
argv.append("--no-python")

# if the command exists and it is not a path assume it is a module
# script is not a file, maybe it is a module
elif not XPath(cmd).exists():
argv.append("-m")
# if the command exists and it is not a path assume it is a module
# script is not a file, maybe it is a module
elif not XPath(cmd).exists():
argv.append("-m")

# everything after torchrun args are script args
argv.append("--")
# everything after torchrun args are script args
argv.append("--")

return argv
return []

Expand Down Expand Up @@ -665,14 +679,23 @@ class VoirCommand(WrapperCommand):
executor: `Command` to be executed
*voir_argv: voir command line arguments list
**kwargs: kwargs to be passed to the `pack.execute()`
    module: bool — use the voir module instead of the voir wrapper script.
        This is useful for torchrun: when a module is used, the main
        torchrun process can be reused for rank=0, enabling voir to
        work through the file descriptor.
"""

def __init__(self, executor: SingleCmdCommand, *voir_argv, **kwargs) -> None:
def __init__(self, executor: SingleCmdCommand, *voir_argv, module=False, **kwargs) -> None:
# Some vendors force us to have weird venv that can resolve weirdly
# use absolute paths to avoid issues
binfolder = executor.pack.config["dirs"]["venv"]
voir = f"{binfolder}/bin/voir"

if module:
voir = "voir"

super().__init__(
executor, f"{binfolder}/bin/voir", **{"setsid": True, **kwargs}
executor, voir, **{"setsid": True, **kwargs}
)
self.voir_argv = voir_argv

Expand Down
2 changes: 2 additions & 0 deletions milabench/commands/executors.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,8 @@ async def execute(pack, *args, cwd=None, env={}, external=False, use_stdout=Fals
sized_args = scale_argv(pack, args)
final_args = resolve_argv(pack, sized_args)

print(cwd, use_stdout)

return await run(
final_args,
**kwargs,
Expand Down
2 changes: 2 additions & 0 deletions scripts/article/run_cuda_dev.sh
Original file line number Diff line number Diff line change
Expand Up @@ -94,9 +94,11 @@ fi
(
. $MILABENCH_WORDIR/env/bin/activate
pip install -e $MILABENCH_WORDIR/voir
pip install -e $MILABENCH_SOURCE/benchmate

. $BENCHMARK_VENV/bin/activate
pip install -e $MILABENCH_WORDIR/voir
pip install -e $MILABENCH_SOURCE/benchmate
)

if [ "$MILABENCH_PREPARE" -eq 0 ]; then
Expand Down

0 comments on commit 67047d6

Please sign in to comment.