Always set OMP_NUM_THREADS
pierre.delaunay committed Jul 26, 2024
1 parent f96c947 commit 29e519b
Showing 11 changed files with 68 additions and 56 deletions.
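In short: each benchmark's benchfile.py used to override make_env to set OMP_NUM_THREADS itself, usually from a cpus_per_gpu value defaulting to 8. This commit removes those per-benchmark overrides and sets the variable once in the base Package.make_env (milabench/pack.py), resolving it from the "{cpu_per_gpu}" placeholder through a new resolve_placeholder helper in milabench/sizer.py. Since OMP_NUM_THREADS caps the OpenMP thread pool used by native libraries such as PyTorch's CPU kernels, every benchmark now gets a consistently derived thread count instead of depending on each benchfile remembering to set one. The commit also renames the multi-GPU and multi-node configs in config/base.yaml and promotes the dinov2 benchmarks from dev.yaml into base.yaml.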
6 changes: 0 additions & 6 deletions benchmarks/accelerate_opt/benchfile.py
@@ -12,12 +12,6 @@
 class AccelerateBenchmark(Package):
     base_requirements = "requirements.in"
 
-    def make_env(self):
-        env = super().make_env()
-        value = self.resolve_argument("--cpus_per_gpu", 8)
-        env["OMP_NUM_THREADS"] = str(value)
-        return env
-
     def build_prepare_plan(self):
         return CmdCommand(
             self,
6 changes: 0 additions & 6 deletions benchmarks/diffusion/benchfile.py
@@ -17,12 +17,6 @@ class Diffusion(Package):

     # You can remove the functions below if you don't need to modify them.
 
-    def make_env(self):
-        return {
-            **super().make_env(),
-            "OMP_NUM_THREADS": str(self.config.get("cpus_per_gpu", 8)),
-        }
-
     async def install(self):
         await super().install()  # super() call installs the requirements

5 changes: 1 addition & 4 deletions benchmarks/dinov2/benchfile.py
@@ -28,10 +28,7 @@ def working_directory(self):
     def make_env(self):
         # Return a dict of environment variables for prepare_script and
         # main_script.
-        return {
-            "OMP_NUM_THREADS": str(8),
-            **super().make_env()
-        }
+        return super().make_env()
 
     async def install(self):
         await super().install()
10 changes: 0 additions & 10 deletions benchmarks/dinov2/dev.yaml
@@ -11,16 +11,6 @@ _dinov2:
     --output-dir: "{milabench_extra}/output"
     --no-resume: true
 
-dinov2-large:
-  inherits: _dinov2
-  argv:
-    --config-file: src/dinov2/configs/train/vitl14.yaml
-    # THOSE NEED TO BE LAST
-    train.dataset_path=ImageNet:split=TRAIN:root={milabench_data}/FakeImageNet:extra={milabench_data}/FakeImageNet: true
-    train.batch_size_per_gpu=32: true
-    train.saveckp_freq=100: true
-    train.num_workers=10: true
-
 
 dinov2-giant:
   inherits: _dinov2
6 changes: 0 additions & 6 deletions benchmarks/lightning/benchfile.py
@@ -7,12 +7,6 @@ class LightningBenchmark(Package):
     prepare_script = "prepare.py"
     main_script = "main.py"
 
-    def make_env(self):
-        return {
-            **super().make_env(),
-            "OMP_NUM_THREADS": str(self.config.get("cpus_per_gpu", 8)),
-        }
-
     def build_run_plan(self):
         # self.config is not the right config for this
         plan = super().build_run_plan()
6 changes: 0 additions & 6 deletions benchmarks/llama/benchfile.py
@@ -6,12 +6,6 @@ class LLAMA(Package):
     base_requirements = "requirements.in"
     main_script = "main.py"
 
-    def make_env(self):
-        return {
-            **super().make_env(),
-            "OMP_NUM_THREADS": str(self.config.get("cpus_per_gpu", 8)),
-        }
-
     async def install(self):
         await super().install()

6 changes: 0 additions & 6 deletions benchmarks/timm/benchfile.py
@@ -12,12 +12,6 @@ class TimmBenchmarkPack(Package):
     @property
     def working_directory(self):
         return self.dirs.code / "pytorch-image-models"
 
-    def make_env(self):
-        return {
-            **super().make_env(),
-            "OMP_NUM_THREADS": str(self.config.get("cpus_per_gpu", 8)),
-        }
-
     @property
     def argv(self):
24 changes: 19 additions & 5 deletions benchmate/benchmate/monitor.py
@@ -136,6 +136,13 @@ def milabench_sys_monitor(monogpu=False):



+def get_rank():
+    try:
+        return int(os.getenv("RANK", -1))
+    except:
+        return -1
+
+
 def voirfile_monitor(ov, options):
     from voir.instruments import early_stop, log, dash

@@ -148,11 +155,18 @@ def voirfile_monitor(ov, options):
         )
     ]
 
-    if int(os.getenv("RANK", 0)) == 0:
-        instruments.append(early_stop(n=options.stop, key="rate", task="train", signal="stop"))
-        instruments.append(monitor_node(poll_interval=options.gpu_poll))
+    rank = get_rank()
+
+    # -1 & 0 early stop
+    if rank <= 0:
+        instruments.append(early_stop(n=options.stop, key="rate", task="train", signal="stop"))
 
-    if os.getenv("RANK", -1) == -1:
+    # mono gpu if rank is not set
+    if rank == -1:
         instruments.append(monitor_monogpu(poll_interval=options.gpu_poll))
-
+
+    # rank is set only monitor main rank
+    if rank == 0:
+        instruments.append(monitor_node(poll_interval=options.gpu_poll))
 
     ov.require(*instruments)
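The rewritten dispatch gives three behaviors depending on RANK: mono-GPU runs (RANK unset) early-stop and monitor their own GPU, the main rank of a distributed run (RANK=0) early-stops and monitors the whole node, and other ranks add no monitors. A minimal self-contained sketch of that selection, assuming the usual launcher convention that torch.distributed sets RANK per process (the string names stand in for the real voir instruments):

    import os

    def get_rank():
        try:
            return int(os.getenv("RANK", -1))
        except ValueError:  # the committed code uses a bare except
            return -1

    def instruments_for(rank):
        chosen = []
        if rank <= 0:    # unset (-1) and main rank (0) both early-stop
            chosen.append("early_stop")
        if rank == -1:   # RANK unset: mono-GPU run, watch this GPU only
            chosen.append("monitor_monogpu")
        if rank == 0:    # RANK set: only the main rank monitors the node
            chosen.append("monitor_node")
        return chosen

    assert instruments_for(-1) == ["early_stop", "monitor_monogpu"]
    assert instruments_for(0) == ["early_stop", "monitor_node"]
    assert instruments_for(3) == []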
46 changes: 39 additions & 7 deletions config/base.yaml
@@ -226,7 +226,7 @@ resnet50-noio:
     --batch-size: 256
     --loader: synthetic_fixed
 
-resnet152-ddp:
+resnet152-ddp-gpus:
   inherits: _torchvision_ddp
   tags:
     - vision
@@ -391,7 +391,7 @@ resnet152:
     --model: resnet152
     --batch-size: 256
 
-resnet152-multi:
+resnet152-gpus:
   inherits: resnet152
   tags:
     - multigpu
@@ -427,7 +427,7 @@ davit_large:
     --batch-size: 128
     --lr-base: 0.01
 
-davit_large-multi:
+davit_large-gpus:
   inherits: davit_large
   tags:
     - multigpu
@@ -446,7 +446,7 @@ focalnet:
   argv:
     --model: focalnet_base_lrf
 
-opt-1_3b:
+opt-1_3b-gpus:
   inherits: _accelerate_opt
   tags:
     - multigpu
@@ -458,7 +458,7 @@ opt-1_3b:
   use_deepspeed: false
   num_machines: 1
 
-opt-1_3b-multinode:
+opt-1_3b-nodes:
   inherits: opt-1_3b
 
   tags:
@@ -469,7 +469,7 @@ opt-1_3b-multinode:
   docker_image: "ghcr.io/mila-iqia/milabench:cuda-nightly"
   num_machines: 2
 
-opt-6_7b:
+opt-6_7b-gpus:
   inherits: _accelerate_opt
   tags:
     - multigpu
@@ -480,7 +480,7 @@
 
   num_machines: 1
 
-opt-6_7b-multinode:
+opt-6_7b-nodes:
   inherits: opt-6_7b
   tags:
     - multinode
@@ -693,3 +693,35 @@ lightning-gpus:
   plan:
     method: njobs
     n: 1
+
+_dinov2:
+  inherits: _defaults
+  definition: ../benchmarks/dinov2
+  install_group: torch
+  plan:
+    method: njobs
+    n: 1
+
+  argv:
+    --output-dir: "{milabench_extra}/output"
+    --no-resume: true
+
+dinov2-large-gpus:
+  inherits: _dinov2
+  argv:
+    --config-file: src/dinov2/configs/train/vitl14.yaml
+    # THOSE NEED TO BE LAST
+    train.dataset_path=ImageNet:split=TRAIN:root={milabench_data}/FakeImageNet:extra={milabench_data}/FakeImageNet: true
+    train.batch_size_per_gpu=32: true
+    train.saveckp_freq=100: true
+    train.num_workers=10: true
+
+dinov2-giant-gpus:
+  inherits: _dinov2
+  argv:
+    --config-file: src/dinov2/configs/train/vitg14.yaml
+    # THOSE NEED TO BE LAST
+    train.dataset_path=ImageNet:split=TRAIN:root={milabench_data}/FakeImageNet:extra={milabench_data}/FakeImageNet: true
+    train.batch_size_per_gpu=32: true
+    train.saveckp_freq=100: true
+    train.num_workers=10: true
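The renames above follow one convention: the -multi suffix becomes -gpus and -multinode becomes -nodes, matching the existing multigpu and multinode tags, while resnet152-ddp gains the same -gpus suffix. The _dinov2 block with its dinov2-large-gpus and dinov2-giant-gpus entries is the configuration removed from benchmarks/dinov2/dev.yaml earlier in this diff, now promoted into config/base.yaml.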
4 changes: 4 additions & 0 deletions milabench/pack.py
@@ -329,11 +329,15 @@ def make_env(self):
"MILABENCH_CONFIG": json.dumps(self.config),
}
"""
from .sizer import resolve_placeholder

env = {
f"MILABENCH_DIR_{name.upper()}": path
for name, path in self.config["dirs"].items()
}

env["OMP_NUM_THREADS"] = resolve_placeholder(self, "{cpu_per_gpu}")

env["MILABENCH_CONFIG"] = json.dumps(self.config)
if self.phase == "prepare" or self.phase == "run":
# XDG_CACHE_HOME controls basically all caches (pip, torch, huggingface,
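A minimal sketch of the effect, using a stand-in resolver (illustrative only: the real new_argument_resolver in milabench/sizer.py does considerably more than string formatting):

    # Hypothetical stand-in for resolve_placeholder(self, "{cpu_per_gpu}"):
    # expand a placeholder from the benchmark's resolved configuration.
    def resolve_placeholder_sketch(config: dict, template: str) -> str:
        return template.format(**config)

    env = {"MILABENCH_DIR_DATA": "/data"}  # dir variables, as before
    env["OMP_NUM_THREADS"] = resolve_placeholder_sketch(
        {"cpu_per_gpu": 4}, "{cpu_per_gpu}"
    )
    assert env["OMP_NUM_THREADS"] == "4"   # always present, always a string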
5 changes: 5 additions & 0 deletions milabench/sizer.py
@@ -367,6 +367,11 @@ def auto_eval(arg):
     return auto_eval
 
 
+def resolve_placeholder(pack, value):
+    resolver = new_argument_resolver(pack)
+    return resolver(value)
+
+
 def resolve_argv(pack, argv):
     resolver = new_argument_resolver(pack)
     argv = list(argv)
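resolve_placeholder is a thin convenience wrapper: resolve_argv below already builds the same resolver for whole argv lists, and the new helper applies it to a single template string, e.g. resolve_placeholder(pack, "{cpu_per_gpu}") returning something like "8", which is exactly what the make_env change in milabench/pack.py calls.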
