From 29e519b94737824663bb4cbb8d7cbec081059769 Mon Sep 17 00:00:00 2001
From: "pierre.delaunay"
Date: Fri, 26 Jul 2024 11:46:54 -0400
Subject: [PATCH] Always set OMP_NUM_THREADS

Set OMP_NUM_THREADS once in the base Package.make_env, resolved from the
{cpu_per_gpu} placeholder, instead of duplicating the override in every
benchmark's benchfile. Also make GPU monitoring rank-aware and rename the
multi-GPU/multi-node configs to the -gpus/-nodes suffix convention.
---
 benchmarks/accelerate_opt/benchfile.py |  6 ----
 benchmarks/diffusion/benchfile.py      |  6 ----
 benchmarks/dinov2/benchfile.py         |  5 +--
 benchmarks/dinov2/dev.yaml             | 10 ------
 benchmarks/lightning/benchfile.py      |  6 ----
 benchmarks/llama/benchfile.py          |  6 ----
 benchmarks/timm/benchfile.py           |  6 ----
 benchmate/benchmate/monitor.py         | 24 +++++++++++---
 config/base.yaml                       | 46 ++++++++++++++++++++++----
 milabench/pack.py                      |  4 +++
 milabench/sizer.py                     |  5 +++
 11 files changed, 68 insertions(+), 56 deletions(-)

diff --git a/benchmarks/accelerate_opt/benchfile.py b/benchmarks/accelerate_opt/benchfile.py
index 746ee5f00..23ef7aba8 100644
--- a/benchmarks/accelerate_opt/benchfile.py
+++ b/benchmarks/accelerate_opt/benchfile.py
@@ -12,12 +12,6 @@ class AccelerateBenchmark(Package):
     base_requirements = "requirements.in"
 
-    def make_env(self):
-        env = super().make_env()
-        value = self.resolve_argument("--cpus_per_gpu", 8)
-        env["OMP_NUM_THREADS"] = str(value)
-        return env
-
     def build_prepare_plan(self):
         return CmdCommand(
             self,
diff --git a/benchmarks/diffusion/benchfile.py b/benchmarks/diffusion/benchfile.py
index ed2614fbb..2458070ce 100644
--- a/benchmarks/diffusion/benchfile.py
+++ b/benchmarks/diffusion/benchfile.py
@@ -17,12 +17,6 @@ class Diffusion(Package):
 
     # You can remove the functions below if you don't need to modify them.
 
-    def make_env(self):
-        return {
-            **super().make_env(),
-            "OMP_NUM_THREADS": str(self.config.get("cpus_per_gpu", 8)),
-        }
-
     async def install(self):
         await super().install()  # super() call installs the requirements
 
diff --git a/benchmarks/dinov2/benchfile.py b/benchmarks/dinov2/benchfile.py
index 901c146ec..ddfc4bc06 100644
--- a/benchmarks/dinov2/benchfile.py
+++ b/benchmarks/dinov2/benchfile.py
@@ -28,10 +28,7 @@ def working_directory(self):
     def make_env(self):
         # Return a dict of environment variables for prepare_script and
         # main_script.
-        return {
-            "OMP_NUM_THREADS": str(8),
-            **super().make_env()
-        }
+        return super().make_env()
 
     async def install(self):
         await super().install()
diff --git a/benchmarks/dinov2/dev.yaml b/benchmarks/dinov2/dev.yaml
index bef609deb..6868b18a6 100644
--- a/benchmarks/dinov2/dev.yaml
+++ b/benchmarks/dinov2/dev.yaml
@@ -11,16 +11,6 @@ _dinov2:
     --output-dir: "{milabench_extra}/output"
     --no-resume: true
 
-dinov2-large:
-  inherits: _dinov2
-  argv:
-    --config-file: src/dinov2/configs/train/vitl14.yaml
-    # THOSE NEED TO BE LAST
-    train.dataset_path=ImageNet:split=TRAIN:root={milabench_data}/FakeImageNet:extra={milabench_data}/FakeImageNet: true
-    train.batch_size_per_gpu=32: true
-    train.saveckp_freq=100: true
-    train.num_workers=10: true
-
 dinov2-giant:
   inherits: _dinov2
 
diff --git a/benchmarks/lightning/benchfile.py b/benchmarks/lightning/benchfile.py
index 09926711f..8e2a4cf81 100644
--- a/benchmarks/lightning/benchfile.py
+++ b/benchmarks/lightning/benchfile.py
@@ -7,12 +7,6 @@ class LightningBenchmark(Package):
     prepare_script = "prepare.py"
     main_script = "main.py"
 
-    def make_env(self):
-        return {
-            **super().make_env(),
-            "OMP_NUM_THREADS": str(self.config.get("cpus_per_gpu", 8)),
-        }
-
     def build_run_plan(self):
         # self.config is not the right config for this
         plan = super().build_run_plan()
diff --git a/benchmarks/llama/benchfile.py b/benchmarks/llama/benchfile.py
index b7bc0032e..977e825f5 100644
--- a/benchmarks/llama/benchfile.py
+++ b/benchmarks/llama/benchfile.py
@@ -6,12 +6,6 @@ class LLAMA(Package):
     base_requirements = "requirements.in"
     main_script = "main.py"
 
-    def make_env(self):
-        return {
-            **super().make_env(),
-            "OMP_NUM_THREADS": str(self.config.get("cpus_per_gpu", 8)),
-        }
-
     async def install(self):
         await super().install()
 
diff --git a/benchmarks/timm/benchfile.py b/benchmarks/timm/benchfile.py
index 94be19e6b..52a31ba1d 100644
--- a/benchmarks/timm/benchfile.py
+++ b/benchmarks/timm/benchfile.py
@@ -12,12 +12,6 @@ class TimmBenchmarkPack(Package):
     @property
     def working_directory(self):
         return self.dirs.code / "pytorch-image-models"
-
-    def make_env(self):
-        return {
-            **super().make_env(),
-            "OMP_NUM_THREADS": str(self.config.get("cpus_per_gpu", 8)),
-        }
 
     @property
     def argv(self):
diff --git a/benchmate/benchmate/monitor.py b/benchmate/benchmate/monitor.py
index 7064edb72..a2dc2a4a0 100644
--- a/benchmate/benchmate/monitor.py
+++ b/benchmate/benchmate/monitor.py
@@ -136,6 +136,13 @@ def milabench_sys_monitor(monogpu=False):
 
 
+def get_rank():
+    try:
+        return int(os.getenv("RANK", -1))
+    except ValueError:
+        return -1
+
+
 def voirfile_monitor(ov, options):
     from voir.instruments import early_stop, log, dash
 
@@ -148,11 +155,18 @@ def voirfile_monitor(ov, options):
         )
     ]
 
-    if int(os.getenv("RANK", 0)) == 0:
-        instruments.append(early_stop(n=options.stop, key="rate", task="train", signal="stop"))
-        instruments.append(monitor_node(poll_interval=options.gpu_poll))
+    rank = get_rank()
 
-    if os.getenv("RANK", -1) == -1:
+    # ranks -1 (unset) and 0 both run the early-stop instrument
+    if rank <= 0:
+        instruments.append(early_stop(n=options.stop, key="rate", task="train", signal="stop"))
+
+    # RANK unset means a mono-GPU run: monitor the single local GPU
+    if rank == -1:
         instruments.append(monitor_monogpu(poll_interval=options.gpu_poll))
-
+
+    # when RANK is set, only the main rank monitors the whole node
+    if rank == 0:
+        instruments.append(monitor_node(poll_interval=options.gpu_poll))
+
     ov.require(*instruments)
diff --git a/config/base.yaml b/config/base.yaml
index e47a78648..9cc2cca97 100644
--- a/config/base.yaml
+++ b/config/base.yaml
@@ -226,7 +226,7 @@ resnet50-noio:
     --batch-size: 256
     --loader: synthetic_fixed
 
-resnet152-ddp:
+resnet152-ddp-gpus:
   inherits: _torchvision_ddp
   tags:
     - vision
@@ -391,7 +391,7 @@ resnet152:
     --model: resnet152
     --batch-size: 256
 
-resnet152-multi:
+resnet152-gpus:
   inherits: resnet152
   tags:
     - multigpu
@@ -427,7 +427,7 @@ davit_large:
     --batch-size: 128
     --lr-base: 0.01
 
-davit_large-multi:
+davit_large-gpus:
   inherits: davit_large
   tags:
     - multigpu
@@ -446,7 +446,7 @@ focalnet:
   argv:
     --model: focalnet_base_lrf
 
-opt-1_3b:
+opt-1_3b-gpus:
   inherits: _accelerate_opt
   tags:
     - multigpu
@@ -458,7 +458,7 @@
     use_deepspeed: false
     num_machines: 1
 
-opt-1_3b-multinode:
+opt-1_3b-nodes:
   inherits: opt-1_3b
 
   tags:
@@ -469,7 +469,7 @@
   docker_image: "ghcr.io/mila-iqia/milabench:cuda-nightly"
   num_machines: 2
 
-opt-6_7b:
+opt-6_7b-gpus:
   inherits: _accelerate_opt
   tags:
     - multigpu
@@ -480,7 +480,7 @@
     num_machines: 1
 
 
-opt-6_7b-multinode:
+opt-6_7b-nodes:
   inherits: opt-6_7b
   tags:
     - multinode
@@ -693,3 +693,35 @@ lightning-gpus:
   plan:
     method: njobs
     n: 1
+
+_dinov2:
+  inherits: _defaults
+  definition: ../benchmarks/dinov2
+  install_group: torch
+  plan:
+    method: njobs
+    n: 1
+
+  argv:
+    --output-dir: "{milabench_extra}/output"
+    --no-resume: true
+
+dinov2-large-gpus:
+  inherits: _dinov2
+  argv:
+    --config-file: src/dinov2/configs/train/vitl14.yaml
+    # THESE NEED TO BE LAST
+    train.dataset_path=ImageNet:split=TRAIN:root={milabench_data}/FakeImageNet:extra={milabench_data}/FakeImageNet: true
+    train.batch_size_per_gpu=32: true
+    train.saveckp_freq=100: true
+    train.num_workers=10: true
+
+dinov2-giant-gpus:
+  inherits: _dinov2
+  argv:
+    --config-file: src/dinov2/configs/train/vitg14.yaml
+    # THESE NEED TO BE LAST
+    train.dataset_path=ImageNet:split=TRAIN:root={milabench_data}/FakeImageNet:extra={milabench_data}/FakeImageNet: true
+    train.batch_size_per_gpu=32: true
+    train.saveckp_freq=100: true
+    train.num_workers=10: true
diff --git a/milabench/pack.py b/milabench/pack.py
index 214b4c7e1..60a5df2f7 100644
--- a/milabench/pack.py
+++ b/milabench/pack.py
@@ -329,11 +329,15 @@ def make_env(self):
             "MILABENCH_CONFIG": json.dumps(self.config),
         }
         """
+        from .sizer import resolve_placeholder
+
         env = {
             f"MILABENCH_DIR_{name.upper()}": path
             for name, path in self.config["dirs"].items()
         }
+        env["OMP_NUM_THREADS"] = resolve_placeholder(self, "{cpu_per_gpu}")
+
         env["MILABENCH_CONFIG"] = json.dumps(self.config)
 
         if self.phase == "prepare" or self.phase == "run":
             # XDG_CACHE_HOME controls basically all caches (pip, torch, huggingface,
diff --git a/milabench/sizer.py b/milabench/sizer.py
index cdcb57695..b2d4840f9 100644
--- a/milabench/sizer.py
+++ b/milabench/sizer.py
@@ -367,6 +367,11 @@ def auto_eval(arg):
 
     return auto_eval
 
+def resolve_placeholder(pack, value):
+    resolver = new_argument_resolver(pack)
+    return resolver(value)
+
+
 def resolve_argv(pack, argv):
     resolver = new_argument_resolver(pack)
     argv = list(argv)
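-- 

Reviewer notes. A rough sketch of what the new make_env wiring does: resolve_placeholder feeds "{cpu_per_gpu}" through new_argument_resolver and the result lands in OMP_NUM_THREADS. The names carrying a _sketch suffix are hypothetical, and the division below (visible CPUs split evenly across GPUs) is an assumption for illustration, not necessarily the resolver's exact formula:

    import multiprocessing

    def resolve_placeholder_sketch(template, ngpu):
        # assumed behaviour: split visible CPUs evenly across GPUs
        ncpu = multiprocessing.cpu_count()
        return template.format(cpu_per_gpu=max(1, ncpu // max(1, ngpu)))

    # on a 64-core, 8-GPU node this would print "8"
    print(resolve_placeholder_sketch("{cpu_per_gpu}", ngpu=8))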
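The net effect on every pack can be stated as an invariant; check_omp_env is illustrative (it is not a test added by this patch) and pack stands for any milabench Package instance:

    def check_omp_env(pack):
        env = pack.make_env()
        # the base class now sets OMP_NUM_THREADS for every pack, so no
        # benchfile needs its own override anymore
        assert "OMP_NUM_THREADS" in env
        assert int(env["OMP_NUM_THREADS"]) >= 1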
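The monitor dispatch in voirfile_monitor can be summarized with this standalone sketch. get_rank mirrors the patched helper; pick_instruments and the string labels are stand-ins for the real voir instruments:

    import os

    def get_rank():
        # RANK is set by distributed launchers (e.g. torchrun); absent or
        # malformed means a single-process run
        try:
            return int(os.getenv("RANK", -1))
        except ValueError:
            return -1

    def pick_instruments(rank):
        chosen = []
        if rank <= 0:        # ranks -1 and 0 both run early stopping
            chosen.append("early_stop")
        if rank == -1:       # RANK unset: monitor the single local GPU
            chosen.append("monitor_monogpu")
        if rank == 0:        # main rank: monitor the whole node
            chosen.append("monitor_node")
        return chosen

    assert pick_instruments(-1) == ["early_stop", "monitor_monogpu"]
    assert pick_instruments(0) == ["early_stop", "monitor_node"]
    assert pick_instruments(3) == []   # other ranks run no monitors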