diff --git a/.gitignore b/.gitignore index 284d84773..482e776df 100644 --- a/.gitignore +++ b/.gitignore @@ -55,3 +55,4 @@ benchmarks/voir benchmarks/*/base/ benchmarks/lightning/lightning_logs/ +benchmarks/*/src/ \ No newline at end of file diff --git a/.pin/constraints-cuda-torch.txt b/.pin/constraints-cuda-torch.txt index 4527b5fb9..15343ce73 100644 --- a/.pin/constraints-cuda-torch.txt +++ b/.pin/constraints-cuda-torch.txt @@ -2,7 +2,7 @@ # This file is autogenerated by pip-compile with Python 3.10 # by the following command: # -# pip-compile --output-file=.pin/constraints-cuda-torch.txt .pin/tmp-constraints.txt benchmarks/accelerate_opt/requirements.in benchmarks/brax/requirements.in benchmarks/diffusion/requirements.in benchmarks/dlrm/requirements.in benchmarks/flops/requirements.in benchmarks/huggingface/requirements.in benchmarks/lightning/requirements.in benchmarks/llama/requirements.in benchmarks/stargan/requirements.in benchmarks/super-slomo/requirements.in benchmarks/timm/requirements.in benchmarks/torchvision/requirements.in benchmarks/torchvision_ddp/requirements.in +# pip-compile --output-file=.pin/constraints-cuda-torch.txt .pin/tmp-constraints.txt benchmarks/accelerate_opt/requirements.in benchmarks/brax/requirements.in benchmarks/diffusion/requirements.in benchmarks/dinov2/requirements.in benchmarks/dlrm/requirements.in benchmarks/flops/requirements.in benchmarks/huggingface/requirements.in benchmarks/lightning/requirements.in benchmarks/llama/requirements.in benchmarks/stargan/requirements.in benchmarks/super-slomo/requirements.in benchmarks/timm/requirements.in benchmarks/torchvision/requirements.in benchmarks/torchvision_ddp/requirements.in # --extra-index-url https://pypi.ngc.nvidia.com --extra-index-url https://download.pytorch.org/whl/cu121 @@ -58,7 +58,9 @@ chex==0.1.86 click==8.1.7 # via flask cloudpickle==3.0.0 - # via gym + # via + # gym + # submitit codefind==0.1.6 # via ptera contextlib2==21.6.0 @@ -137,6 +139,8 @@ fsspec[http]==2024.5.0 # torchx future==1.0.0 # via -r benchmarks/dlrm/requirements.in +fvcore==0.1.5.post20221221 + # via -r benchmarks/dinov2/requirements.in gdown==5.2.0 # via -r benchmarks/stargan/requirements.in giving==0.4.2 @@ -181,6 +185,10 @@ importlib-resources==6.4.0 # argklass # etils # torchcompat +iopath==0.1.10 + # via + # -r benchmarks/dinov2/requirements.in + # fvcore itsdangerous==2.2.0 # via flask jax[cuda12]==0.4.28 @@ -283,6 +291,7 @@ numpy==1.26.4 # fairscale # fbgemm-gpu # flax + # fvcore # gym # jax # jaxlib @@ -307,6 +316,7 @@ numpy==1.26.4 # torchvision # transformers # trimesh + # xformers nvidia-cublas-cu12==12.1.3.1 # via # jax @@ -358,7 +368,9 @@ nvidia-nvjitlink-cu12==12.5.82 nvidia-nvtx-cu12==12.1.105 # via torch omegaconf==2.3.0 - # via voir + # via + # -r benchmarks/dinov2/requirements.in + # voir onnx==1.16.1 # via -r benchmarks/dlrm/requirements.in opencv-python==4.10.0.84 @@ -394,9 +406,13 @@ pandas==2.2.2 # evaluate pillow==10.4.0 # via + # -r benchmarks/huggingface/requirements.in # brax # diffusers + # fvcore # torchvision +portalocker==2.10.1 + # via iopath protobuf==4.25.3 # via # onnx @@ -448,6 +464,7 @@ pyyaml==6.0.1 # accelerate # datasets # flax + # fvcore # huggingface-hub # lightning # ml-collections @@ -456,6 +473,7 @@ pyyaml==6.0.1 # pytorch-lightning # torchx # transformers + # yacs reactivex==4.0.4 # via giving regex==2024.5.15 @@ -486,6 +504,7 @@ scikit-learn==1.5.1 # via -r benchmarks/dlrm/requirements.in scipy==1.14.0 # via + # -r benchmarks/dinov2/requirements.in # brax # jax # jaxlib @@ 
-503,10 +522,14 @@ six==1.16.0 # tensorboard soupsieve==2.5 # via beautifulsoup4 +submitit==1.5.1 + # via -r benchmarks/dinov2/requirements.in sympy==1.13.1 # via torch tabulate==0.9.0 - # via torchx + # via + # fvcore + # torchx tensorboard==2.17.0 # via -r benchmarks/dlrm/requirements.in tensorboard-data-server==0.7.2 @@ -518,7 +541,9 @@ tensorstore==0.1.63 # flax # orbax-checkpoint termcolor==2.4.0 - # via fire + # via + # fire + # fvcore threadpoolctl==3.5.0 # via scikit-learn tokenizers==0.19.1 @@ -529,6 +554,7 @@ torch==2.3.1+cu121 # via # -r benchmarks/accelerate_opt/requirements.in # -r benchmarks/brax/requirements.in + # -r benchmarks/dinov2/requirements.in # -r benchmarks/dlrm/requirements.in # -r benchmarks/flops/requirements.in # -r benchmarks/huggingface/requirements.in @@ -549,6 +575,7 @@ torch==2.3.1+cu121 # torchmetrics # torchvision # torchviz + # xformers torchaudio==2.3.1+cu121 # via -r benchmarks/accelerate_opt/requirements.in torchcompat==1.1.4 @@ -560,6 +587,7 @@ torchcompat==1.1.4 # -r benchmarks/torchvision_ddp/requirements.in torchmetrics==1.0.3 # via + # -r benchmarks/dinov2/requirements.in # lightning # pytorch-lightning # torchrec @@ -569,6 +597,7 @@ torchvision==0.18.1+cu121 # via # -r benchmarks/accelerate_opt/requirements.in # -r benchmarks/diffusion/requirements.in + # -r benchmarks/dinov2/requirements.in # -r benchmarks/flops/requirements.in # -r benchmarks/lightning/requirements.in # -r benchmarks/stargan/requirements.in @@ -591,8 +620,10 @@ tqdm==4.66.4 # datasets # deepspeed # evaluate + # fvcore # gdown # huggingface-hub + # iopath # lightning # pytorch-lightning # torchrec @@ -616,6 +647,7 @@ typing-extensions==4.12.2 # etils # flax # huggingface-hub + # iopath # lightning # lightning-utilities # orbax-checkpoint @@ -624,6 +656,7 @@ typing-extensions==4.12.2 # pyre-extensions # pytorch-lightning # reactivex + # submitit # torch # typing-inspect typing-inspect==0.9.0 @@ -637,12 +670,13 @@ urllib3==1.26.19 # torchx varname==0.10.0 # via giving -voir==0.2.16 +voir==0.2.17 # via # -c .pin/../constraints/cuda.txt # -r benchmarks/accelerate_opt/requirements.in # -r benchmarks/brax/requirements.in # -r benchmarks/diffusion/requirements.in + # -r benchmarks/dinov2/requirements.in # -r benchmarks/dlrm/requirements.in # -r benchmarks/flops/requirements.in # -r benchmarks/huggingface/requirements.in @@ -657,10 +691,14 @@ werkzeug==3.0.3 # via # flask # tensorboard +xformers==0.0.27 + # via -r benchmarks/dinov2/requirements.in xxhash==3.4.1 # via # datasets # evaluate +yacs==0.1.8 + # via fvcore yarl==1.9.4 # via aiohttp zipp==3.19.2 diff --git a/.pin/constraints-hpu-torch.txt b/.pin/constraints-hpu-torch.txt index bf7d5b5f8..0c6d3ff9b 100644 --- a/.pin/constraints-hpu-torch.txt +++ b/.pin/constraints-hpu-torch.txt @@ -586,7 +586,7 @@ urllib3==1.26.19 # torchx varname==0.10.0 # via giving -voir==0.2.16 +voir==0.2.17 # via # -c .pin/../constraints/hpu.txt # -r benchmarks/accelerate_opt/requirements.in diff --git a/.pin/constraints-rocm-torch.txt b/.pin/constraints-rocm-torch.txt index 26e15dc77..ec212c16b 100644 --- a/.pin/constraints-rocm-torch.txt +++ b/.pin/constraints-rocm-torch.txt @@ -566,7 +566,7 @@ urllib3==1.26.19 # torchx varname==0.10.0 # via giving -voir==0.2.16 +voir==0.2.17 # via # -c .pin/../constraints/rocm.txt # -r benchmarks/accelerate_opt/requirements.in diff --git a/.pin/constraints-xpu-torch.txt b/.pin/constraints-xpu-torch.txt index 266c72861..91cf0dceb 100644 --- a/.pin/constraints-xpu-torch.txt +++ b/.pin/constraints-xpu-torch.txt @@ -568,7 
+568,7 @@ urllib3==1.26.19 # torchx varname==0.10.0 # via giving -voir==0.2.16 +voir==0.2.17 # via # -c .pin/../constraints/xpu.txt # -r benchmarks/accelerate_opt/requirements.in diff --git a/benchmarks/_templates/simple/requirements.in b/benchmarks/_templates/simple/requirements.in index 94575179d..3c2db790a 100644 --- a/benchmarks/_templates/simple/requirements.in +++ b/benchmarks/_templates/simple/requirements.in @@ -1,2 +1,2 @@ -voir>=0.2.9,<0.3 +voir>=0.2.17,<0.3 torch \ No newline at end of file diff --git a/benchmarks/_templates/stdout/main.py b/benchmarks/_templates/stdout/main.py index f8013bb0f..3bf94d63a 100644 --- a/benchmarks/_templates/stdout/main.py +++ b/benchmarks/_templates/stdout/main.py @@ -15,32 +15,45 @@ def criterion(*args, **kwargs): return random.normalvariate(0, 1) -def main(): - device = accelerator.fetch_device(0) # <= This is your cuda device +def prepare_voir(): + from benchmate.observer import BenchObserver + from benchmate.monitor import bench_monitor observer = BenchObserver( - batch_size_fn=lambda batch: 1, + accelerator.Event, + earlystop=65, + batch_size_fn=lambda x: len(x[0]), + raise_stop_program=False, stdout=True, ) + + return observer, bench_monitor + + +def main(): + device = accelerator.fetch_device(0) # <= This is your cuda device + + observer, monitor = prepare_voir() + # optimizer = observer.optimizer(optimizer) - # criterion = observer.criterion(criterion) dataloader = list(range(6000)) - for epoch in range(10000): - for i in observer.iterate(dataloader): - # avoid .item() - # avoid torch.cuda; use accelerator from torchcompat instead - # avoid torch.cuda.synchronize or accelerator.synchronize - - # y = model(i) - loss = criterion() - # loss.backward() - # optimizer.step() - - observer.record_loss(loss) - - time.sleep(0.1) + with monitor(): + for epoch in range(10000): + for i in observer.iterate(dataloader): + # avoid .item() + # avoid torch.cuda; use accelerator from torchcompat instead + # avoid torch.cuda.synchronize or accelerator.synchronize + + # y = model(i) + loss = criterion() + # loss.backward() + # optimizer.step() + + observer.record_loss(loss) + + time.sleep(0.1) assert epoch < 2, "milabench stopped the train script before the end of training" assert i < 72, "milabench stopped the train script before the end of training" diff --git a/benchmarks/_templates/stdout/requirements.in b/benchmarks/_templates/stdout/requirements.in index 94575179d..3c2db790a 100644 --- a/benchmarks/_templates/stdout/requirements.in +++ b/benchmarks/_templates/stdout/requirements.in @@ -1,2 +1,2 @@ -voir>=0.2.9,<0.3 +voir>=0.2.17,<0.3 torch \ No newline at end of file diff --git a/benchmarks/_templates/voir/requirements.in b/benchmarks/_templates/voir/requirements.in index 94575179d..3c2db790a 100644 --- a/benchmarks/_templates/voir/requirements.in +++ b/benchmarks/_templates/voir/requirements.in @@ -1,2 +1,2 @@ -voir>=0.2.9,<0.3 +voir>=0.2.17,<0.3 torch \ No newline at end of file diff --git a/benchmarks/accelerate_opt/benchfile.py b/benchmarks/accelerate_opt/benchfile.py index 746ee5f00..23ef7aba8 100644 --- a/benchmarks/accelerate_opt/benchfile.py +++ b/benchmarks/accelerate_opt/benchfile.py @@ -12,12 +12,6 @@ class AccelerateBenchmark(Package): base_requirements = "requirements.in" - def make_env(self): - env = super().make_env() - value = self.resolve_argument("--cpus_per_gpu", 8) - env["OMP_NUM_THREADS"] = str(value) - return env - def build_prepare_plan(self): return CmdCommand( self, diff --git a/benchmarks/accelerate_opt/requirements.cuda.txt 
b/benchmarks/accelerate_opt/requirements.cuda.txt index 7c629e53d..7d0efa24f 100644 --- a/benchmarks/accelerate_opt/requirements.cuda.txt +++ b/benchmarks/accelerate_opt/requirements.cuda.txt @@ -394,7 +394,7 @@ varname==0.10.0 # via # -c .pin/../.pin/constraints-cuda-torch.txt # giving -voir==0.2.16 +voir==0.2.17 # via # -c .pin/../.pin/constraints-cuda-torch.txt # -c .pin/../constraints/cuda.txt diff --git a/benchmarks/accelerate_opt/requirements.hpu.txt b/benchmarks/accelerate_opt/requirements.hpu.txt index 263b23b58..cff04e771 100644 --- a/benchmarks/accelerate_opt/requirements.hpu.txt +++ b/benchmarks/accelerate_opt/requirements.hpu.txt @@ -393,7 +393,7 @@ varname==0.10.0 # via # -c .pin/../.pin/constraints-hpu-torch.txt # giving -voir==0.2.16 +voir==0.2.17 # via # -c .pin/../.pin/constraints-hpu-torch.txt # -c .pin/../constraints/hpu.txt diff --git a/benchmarks/accelerate_opt/requirements.rocm.txt b/benchmarks/accelerate_opt/requirements.rocm.txt index 4767bb31a..4eb7b7dde 100644 --- a/benchmarks/accelerate_opt/requirements.rocm.txt +++ b/benchmarks/accelerate_opt/requirements.rocm.txt @@ -342,7 +342,7 @@ varname==0.10.0 # via # -c .pin/../.pin/constraints-rocm-torch.txt # giving -voir==0.2.16 +voir==0.2.17 # via # -c .pin/../.pin/constraints-rocm-torch.txt # -c .pin/../constraints/rocm.txt diff --git a/benchmarks/accelerate_opt/requirements.xpu.txt b/benchmarks/accelerate_opt/requirements.xpu.txt index 43c58b6b5..8dbb8e6c0 100644 --- a/benchmarks/accelerate_opt/requirements.xpu.txt +++ b/benchmarks/accelerate_opt/requirements.xpu.txt @@ -341,7 +341,7 @@ varname==0.10.0 # via # -c .pin/../.pin/constraints-xpu-torch.txt # giving -voir==0.2.16 +voir==0.2.17 # via # -c .pin/../.pin/constraints-xpu-torch.txt # -c .pin/../constraints/xpu.txt diff --git a/benchmarks/brax/requirements.cuda.txt b/benchmarks/brax/requirements.cuda.txt index 2a424416a..ea6216a23 100644 --- a/benchmarks/brax/requirements.cuda.txt +++ b/benchmarks/brax/requirements.cuda.txt @@ -433,7 +433,7 @@ varname==0.10.0 # via # -c .pin/../.pin/constraints-cuda-torch.txt # giving -voir==0.2.16 +voir==0.2.17 # via # -c .pin/../.pin/constraints-cuda-torch.txt # -c .pin/../constraints/cuda.txt diff --git a/benchmarks/brax/requirements.hpu.txt b/benchmarks/brax/requirements.hpu.txt index 697cc34c5..204757ced 100644 --- a/benchmarks/brax/requirements.hpu.txt +++ b/benchmarks/brax/requirements.hpu.txt @@ -432,7 +432,7 @@ varname==0.10.0 # via # -c .pin/../.pin/constraints-hpu-torch.txt # giving -voir==0.2.16 +voir==0.2.17 # via # -c .pin/../.pin/constraints-hpu-torch.txt # -c .pin/../constraints/hpu.txt diff --git a/benchmarks/brax/requirements.rocm.txt b/benchmarks/brax/requirements.rocm.txt index d73451da4..ea219cfb4 100644 --- a/benchmarks/brax/requirements.rocm.txt +++ b/benchmarks/brax/requirements.rocm.txt @@ -414,7 +414,7 @@ varname==0.10.0 # via # -c .pin/../.pin/constraints-rocm-torch.txt # giving -voir==0.2.16 +voir==0.2.17 # via # -c .pin/../.pin/constraints-rocm-torch.txt # -c .pin/../constraints/rocm.txt diff --git a/benchmarks/brax/requirements.xpu.txt b/benchmarks/brax/requirements.xpu.txt index ae6d4212c..c08c7bdaa 100644 --- a/benchmarks/brax/requirements.xpu.txt +++ b/benchmarks/brax/requirements.xpu.txt @@ -410,7 +410,7 @@ varname==0.10.0 # via # -c .pin/../.pin/constraints-xpu-torch.txt # giving -voir==0.2.16 +voir==0.2.17 # via # -c .pin/../.pin/constraints-xpu-torch.txt # -c .pin/../constraints/xpu.txt diff --git a/benchmarks/diffusion/benchfile.py b/benchmarks/diffusion/benchfile.py index 
ed2614fbb..2458070ce 100644 --- a/benchmarks/diffusion/benchfile.py +++ b/benchmarks/diffusion/benchfile.py @@ -17,12 +17,6 @@ class Diffusion(Package): # You can remove the functions below if you don't need to modify them. - def make_env(self): - return { - **super().make_env(), - "OMP_NUM_THREADS": str(self.config.get("cpus_per_gpu", 8)), - } - async def install(self): await super().install() # super() call installs the requirements diff --git a/benchmarks/diffusion/main.py b/benchmarks/diffusion/main.py index cff53451a..bd6668dab 100644 --- a/benchmarks/diffusion/main.py +++ b/benchmarks/diffusion/main.py @@ -125,17 +125,28 @@ def collate_fn(examples): input_ids = torch.stack([example["input_ids"] for example in examples]) return {"pixel_values": pixel_values, "input_ids": input_ids} + import os + total_samples = args.batch_size * 70 * int(os.getenv("WORLD_SIZE", 1)) + # DataLoaders creation: return torch.utils.data.DataLoader( train_dataset, - shuffle=True, + shuffle=False, + # This should be a distributed sampler + # but the dataset is a bit small so epochs are too small as well + sampler=torch.utils.data.RandomSampler( + train_dataset, + replacement=True, + num_samples=total_samples + ), collate_fn=collate_fn, batch_size=args.batch_size, num_workers=args.num_workers, persistent_workers=True, ) -def train(args: Arguments): + +def train(observer, args: Arguments): weight_dtype = torch.bfloat16 accelerator = Accelerator( @@ -145,18 +156,6 @@ def train(args: Arguments): loader = dataset(accelerator, args) - from benchmate.observer import BenchObserver - - def batch_size(x): - return x["pixel_values"].shape[0] - - observer = BenchObserver( - earlystop=60, - raise_stop_program=True, - batch_size_fn=batch_size, - stdout=True - ) - encoder, vae, unet = models(accelerator, args) optimizer = torch.optim.AdamW( @@ -213,18 +212,37 @@ def batch_size(x): optimizer.zero_grad() + +def prepare_voir(): + from benchmate.observer import BenchObserver + from benchmate.monitor import bench_monitor + def batch_size(x): + return x["pixel_values"].shape[0] + + observer = BenchObserver( + earlystop=60, + raise_stop_program=True, + batch_size_fn=batch_size, + stdout=True + ) + + return observer, bench_monitor + def main(): from benchmate.metrics import StopProgram - try: - from argklass import ArgumentParser - parser = ArgumentParser() - parser.add_arguments(Arguments) - config, _ = parser.parse_known_args() + observer, monitor = prepare_voir() + + with monitor(): + try: + from argklass import ArgumentParser + parser = ArgumentParser() + parser.add_arguments(Arguments) + config, _ = parser.parse_known_args() - train(config) - except StopProgram: - pass + train(observer, config) + except StopProgram: + pass if __name__ == "__main__": diff --git a/benchmarks/diffusion/requirements.cuda.txt b/benchmarks/diffusion/requirements.cuda.txt index 16d5d651a..c628dd36f 100644 --- a/benchmarks/diffusion/requirements.cuda.txt +++ b/benchmarks/diffusion/requirements.cuda.txt @@ -364,7 +364,7 @@ varname==0.10.0 # via # -c .pin/../.pin/constraints-cuda-torch.txt # giving -voir==0.2.16 +voir==0.2.17 # via # -c .pin/../.pin/constraints-cuda-torch.txt # -c .pin/../constraints/cuda.txt diff --git a/benchmarks/diffusion/requirements.in b/benchmarks/diffusion/requirements.in index 4a7a2c824..f1a0c6310 100644 --- a/benchmarks/diffusion/requirements.in +++ b/benchmarks/diffusion/requirements.in @@ -1,4 +1,4 @@ -voir>=0.2.9,<0.3 +voir>=0.2.17,<0.3 diffusers diffusers[torch] accelerate diff --git a/benchmarks/dinov2/Makefile 
b/benchmarks/dinov2/Makefile new file mode 100644 index 000000000..dcffcb373 --- /dev/null +++ b/benchmarks/dinov2/Makefile @@ -0,0 +1,30 @@ +# Use global base if possible +ifndef MILABENCH_BASE + MILABENCH_BASE="base" +endif + +export MILABENCH_BASE + +BENCH_NAME=dinov2-giant-gpus +MILABENCH_CONFIG=dev.yaml +MILABENCH_ARGS=--config $(MILABENCH_CONFIG) --base $(MILABENCH_BASE) + +all: install prepare single gpus nodes + +install: + milabench install $(MILABENCH_ARGS) --force + +prepare: + milabench prepare $(MILABENCH_ARGS) + +tests: install prepare + milabench run $(MILABENCH_ARGS) + +single: + CUDA_VISIBLE_DEVICES=0 milabench run $(MILABENCH_ARGS) --select $(BENCH_NAME) + +gpus: + MILABENCH_SIZER_BATCH_SIZE=16 milabench run $(MILABENCH_ARGS) --select $(BENCH_NAME) + +nodes: + milabench run $(MILABENCH_ARGS) --select $(BENCH_NAME) diff --git a/benchmarks/dinov2/README.md b/benchmarks/dinov2/README.md new file mode 100644 index 000000000..ba5695ac6 --- /dev/null +++ b/benchmarks/dinov2/README.md @@ -0,0 +1,4 @@ + +# Dinov2 + +Rewrite this README to explain what the benchmark is! diff --git a/benchmarks/dinov2/benchfile.py b/benchmarks/dinov2/benchfile.py new file mode 100644 index 000000000..ddfc4bc06 --- /dev/null +++ b/benchmarks/dinov2/benchfile.py @@ -0,0 +1,53 @@ +from milabench.pack import Package +from milabench.commands import TorchrunAllNodes, TorchrunAllGPU, ListCommand + + +SOURCE_DIR = "src" +REPO_URL = "https://github.com/facebookresearch/dinov2" +BRANCH = "e1277af2ba9496fbadf7aec6eba56e8d882d1e35" + + +class Dinov2(Package): + # Requirements file installed by install(). It can be empty or absent. + base_requirements = "requirements.in" + + # The preparation script called by prepare(). It must be executable, + # but it can be any type of script. It can be empty or absent. + prepare_script = "prepare.py" + + # The main script called by run(). It must be a Python file. It has to + # be present. + main_script = "main.py" + + # You can remove the functions below if you don't need to modify them. + + @property + def working_directory(self): + return self.dirs.code / SOURCE_DIR + + def make_env(self): + # Return a dict of environment variables for prepare_script and + # main_script. + return super().make_env() + + async def install(self): + await super().install() + + source_destination = self.dirs.code / SOURCE_DIR + if not source_destination.exists(): + source_destination.clone_subtree( + REPO_URL, BRANCH + ) + + async def prepare(self): + await super().prepare() # super() call executes prepare_script + + def build_run_plan(self): + # self.config is not the right config for this + plan = super().build_run_plan() + + return TorchrunAllNodes(plan).use_stdout() + + + +__pack__ = Dinov2 diff --git a/benchmarks/dinov2/dev.yaml b/benchmarks/dinov2/dev.yaml new file mode 100644 index 000000000..6c8411d3d --- /dev/null +++ b/benchmarks/dinov2/dev.yaml @@ -0,0 +1,25 @@ + +_dinov2: + inherits: _defaults + definition: .
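+  # Dev-only config; `make tests` in the Makefile above runs the full install/prepare/run cycle against it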
+ install-variant: unpinned + install_group: torch + plan: + method: njobs + n: 1 + + argv: + --output-dir: "{milabench_extra}/output" + --no-resume: true + + +dinov2-giant-gpus: + inherits: _dinov2 + argv: + --config-file: src/dinov2/configs/train/vitg14.yaml + # THESE NEED TO BE LAST + train.dataset_path=ImageNet:split=TRAIN:root={milabench_data}/FakeImageNet:extra={milabench_data}/FakeImageNet: true + train.batch_size_per_gpu=32: true + train.saveckp_freq=100: true + train.num_workers=10: true diff --git a/benchmarks/dinov2/main.py b/benchmarks/dinov2/main.py new file mode 100755 index 000000000..6afcfb730 --- /dev/null +++ b/benchmarks/dinov2/main.py @@ -0,0 +1,12 @@ +#!/usr/bin/env python + +import os + + +if __name__ == "__main__": + import sys + sys.path.append(os.path.dirname(__file__) + "/src/") + + from dinov2.train.train import main, get_args_parser + args = get_args_parser(add_help=True).parse_args() + main(args) diff --git a/benchmarks/dinov2/prepare.py b/benchmarks/dinov2/prepare.py new file mode 100755 index 000000000..b4c49f428 --- /dev/null +++ b/benchmarks/dinov2/prepare.py @@ -0,0 +1,31 @@ +#!/usr/bin/env python + +import os +from benchmate.datagen import generate_fakeimagenet, device_count + + +if __name__ == "__main__": + import sys + sys.path.append(os.path.dirname(__file__) + "/src/") + + # dinov2 configures itself for SLURM when it detects a job id; + # clear it so the prepare step runs standalone + if os.getenv("SLURM_JOB_ID"): + del os.environ["SLURM_JOB_ID"] + + from argparse import Namespace + from dinov2.train.train import setup, get_args_parser + + args = get_args_parser(add_help=True).parse_args() + cfg = setup(args) + + args = Namespace( + batch_size=cfg["train"]["batch_size_per_gpu"], + batch_count=60, + device_count=device_count(), + device=None, + image_size=[3, 384, 384], + val=0.1, + test=0.1 + ) + generate_fakeimagenet(args) diff --git a/benchmarks/dinov2/requirements.cuda.txt b/benchmarks/dinov2/requirements.cuda.txt new file mode 100644 index 000000000..a92790725 --- /dev/null +++ b/benchmarks/dinov2/requirements.cuda.txt @@ -0,0 +1,271 @@ +# +# This file is autogenerated by pip-compile with Python 3.10 +# by the following command: +# +# pip-compile --output-file=benchmarks/dinov2/requirements.cuda.txt .pin/tmp-constraints-cuda-dinov2-giant-gpus.txt benchmarks/dinov2/requirements.in +# +--extra-index-url https://pypi.ngc.nvidia.com +--extra-index-url https://download.pytorch.org/whl/cu121 +--find-links https://storage.googleapis.com/jax-releases/jax_cuda_releases.html +--trusted-host pypi.ngc.nvidia.com + +antlr4-python3-runtime==4.9.3 + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # omegaconf +asttokens==2.4.1 + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # giving +cloudpickle==3.0.0 + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # submitit +codefind==0.1.6 + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # ptera +executing==1.2.0 + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # varname +filelock==3.15.4 + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # torch + # triton +fsspec==2024.5.0 + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # torch +fvcore==0.1.5.post20221221 + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # -r benchmarks/dinov2/requirements.in +giving==0.4.2 + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # ptera + # voir +iopath==0.1.10 + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # -r benchmarks/dinov2/requirements.in + # fvcore +jinja2==3.1.4 + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # torch
+lightning-utilities==0.11.5 + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # torchmetrics +markdown-it-py==3.0.0 + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # rich +markupsafe==2.1.5 + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # jinja2 +mdurl==0.1.2 + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # markdown-it-py +mpmath==1.3.0 + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # sympy +networkx==3.3 + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # torch +numpy==1.26.4 + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # fvcore + # scipy + # torchmetrics + # torchvision + # xformers +nvidia-cublas-cu12==12.1.3.1 + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # nvidia-cudnn-cu12 + # nvidia-cusolver-cu12 + # torch +nvidia-cuda-cupti-cu12==12.1.105 + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # torch +nvidia-cuda-nvrtc-cu12==12.1.105 + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # torch +nvidia-cuda-runtime-cu12==12.1.105 + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # torch +nvidia-cudnn-cu12==8.9.2.26 + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # torch +nvidia-cufft-cu12==11.0.2.54 + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # torch +nvidia-curand-cu12==10.3.2.106 + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # torch +nvidia-cusolver-cu12==11.4.5.107 + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # torch +nvidia-cusparse-cu12==12.1.0.106 + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # nvidia-cusolver-cu12 + # torch +nvidia-nccl-cu12==2.20.5 + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # torch +nvidia-nvjitlink-cu12==12.5.82 + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # nvidia-cusolver-cu12 + # nvidia-cusparse-cu12 +nvidia-nvtx-cu12==12.1.105 + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # torch +omegaconf==2.3.0 + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # -r benchmarks/dinov2/requirements.in + # voir +ovld==0.3.5 + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # voir +packaging==24.1 + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # lightning-utilities + # torchmetrics +pillow==10.4.0 + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # fvcore + # torchvision +portalocker==2.10.1 + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # iopath +psutil==5.9.8 + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # voir +ptera==1.4.1 + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # voir +pygments==2.18.0 + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # rich +pynvml==11.5.3 + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # voir +pyyaml==6.0.1 + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # fvcore + # omegaconf + # yacs +reactivex==4.0.4 + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # giving +rich==13.7.1 + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # voir +scipy==1.14.0 + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # -r benchmarks/dinov2/requirements.in +six==1.16.0 + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # asttokens +submitit==1.5.1 + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # -r benchmarks/dinov2/requirements.in +sympy==1.13.1 + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # torch +tabulate==0.9.0 + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # fvcore +termcolor==2.4.0 + # via + # -c .pin/../.pin/constraints-cuda-torch.txt 
+ # fvcore +torch==2.3.1+cu121 + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # -r benchmarks/dinov2/requirements.in + # torchmetrics + # torchvision + # xformers +torchmetrics==1.0.3 + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # -r benchmarks/dinov2/requirements.in +torchvision==0.18.1+cu121 + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # -r benchmarks/dinov2/requirements.in +tqdm==4.66.4 + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # fvcore + # iopath +triton==2.3.1 + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # torch +typing-extensions==4.12.2 + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # iopath + # lightning-utilities + # reactivex + # submitit + # torch +varname==0.10.0 + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # giving +voir==0.2.17 + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # -c .pin/../constraints/cuda.txt + # -r benchmarks/dinov2/requirements.in +xformers==0.0.27 + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # -r benchmarks/dinov2/requirements.in +yacs==0.1.8 + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # fvcore + +# The following packages are considered to be unsafe in a requirements file: +# setuptools diff --git a/benchmarks/dinov2/requirements.in b/benchmarks/dinov2/requirements.in new file mode 100644 index 000000000..72ae21e22 --- /dev/null +++ b/benchmarks/dinov2/requirements.in @@ -0,0 +1,10 @@ +voir>=0.2.17,<0.3 +torch +xformers +torchvision +torchmetrics +omegaconf +fvcore +iopath +submitit +scipy \ No newline at end of file diff --git a/benchmarks/dinov2/voirfile.py b/benchmarks/dinov2/voirfile.py new file mode 100644 index 000000000..f358914dc --- /dev/null +++ b/benchmarks/dinov2/voirfile.py @@ -0,0 +1,100 @@ +from dataclasses import dataclass + +from voir.phase import StopProgram +from voir import configurable +from benchmate.observer import BenchObserver +from benchmate.monitor import voirfile_monitor + + +@dataclass +class Config: + """voir configuration""" + + # Whether to display the dash or not + dash: bool = False + + # How often to log the rates + interval: str = "1s" + + # Number of rates to skip before logging + skip: int = 5 + + # Number of rates to log before stopping + stop: int = 60 + + # Number of seconds between each gpu poll + gpu_poll: int = 3 + + +@configurable +def instrument_main(ov, options: Config): + yield ov.phases.init + + import os + import sys + sys.path.append(os.path.dirname(__file__) + "/src/") + + yield ov.phases.load_script + + # GPU monitor, rate, loss etc... 
+ voirfile_monitor(ov, options) + + code_patch(ov) + + # + # Insert milabench tools + # + def batch_size(x): + return x["collated_global_crops"].shape[0] + + observer = BenchObserver( + earlystop=options.stop + options.skip, + batch_size_fn=batch_size, + ) + + probe = ov.probe("/dinov2.data.loaders/make_data_loader() as loader", overridable=True) + probe['loader'].override(observer.loader) + + probe = ov.probe("/dinov2.train.train/do_train > losses_reduced", overridable=True) + probe["losses_reduced"].override(observer.record_loss) + + probe = ov.probe("/dinov2.train.train/build_optimizer() as optimizer", overridable=True) + probe['optimizer'].override(observer.optimizer) + + # + # Run the benchmark + # + try: + yield ov.phases.run_script + except StopProgram: + print("early stopped") + + + +def code_patch(ov): + # FIX dinov2 code using ptera + import os + + from torchvision.datasets import ImageFolder + import torch + import dinov2.train.train + + class SSLMetaArch2(dinov2.train.train.SSLMetaArch): + def fsdp_synchronize_streams(self): + if self.need_to_synchronize_fsdp_streams: + torch.cuda.synchronize() + self.need_to_synchronize_fsdp_streams = False + + + dinov2.train.train.SSLMetaArch = SSLMetaArch2 + dinov2.train.ssl_meta_arch.reshard_fsdp_model = lambda *args: None + + probe = ov.probe("/dinov2.distributed/_is_slurm_job_process() as is_slurm", overridable=True) + probe['is_slurm'].override(lambda *args: False) + + def override_parsed_dataset(results): + class_, kwargs = results + return ImageFolder, {"root": os.path.join(kwargs["root"], "train")} + + probe = ov.probe("/dinov2.data.loaders/_parse_dataset_str() as dataset_kwargs", overridable=True) + probe['dataset_kwargs'].override(override_parsed_dataset) diff --git a/benchmarks/dlrm/requirements.cuda.txt b/benchmarks/dlrm/requirements.cuda.txt index eb3bf343d..6f80d9bec 100644 --- a/benchmarks/dlrm/requirements.cuda.txt +++ b/benchmarks/dlrm/requirements.cuda.txt @@ -341,7 +341,7 @@ varname==0.10.0 # via # -c .pin/../.pin/constraints-cuda-torch.txt # giving -voir==0.2.16 +voir==0.2.17 # via # -c .pin/../.pin/constraints-cuda-torch.txt # -c .pin/../constraints/cuda.txt diff --git a/benchmarks/dlrm/requirements.hpu.txt b/benchmarks/dlrm/requirements.hpu.txt index 539fb0f9b..39134d4d0 100644 --- a/benchmarks/dlrm/requirements.hpu.txt +++ b/benchmarks/dlrm/requirements.hpu.txt @@ -340,7 +340,7 @@ varname==0.10.0 # via # -c .pin/../.pin/constraints-hpu-torch.txt # giving -voir==0.2.16 +voir==0.2.17 # via # -c .pin/../.pin/constraints-hpu-torch.txt # -c .pin/../constraints/hpu.txt diff --git a/benchmarks/dlrm/requirements.rocm.txt b/benchmarks/dlrm/requirements.rocm.txt index 9714b7989..a6f19b5d6 100644 --- a/benchmarks/dlrm/requirements.rocm.txt +++ b/benchmarks/dlrm/requirements.rocm.txt @@ -289,7 +289,7 @@ varname==0.10.0 # via # -c .pin/../.pin/constraints-rocm-torch.txt # giving -voir==0.2.16 +voir==0.2.17 # via # -c .pin/../.pin/constraints-rocm-torch.txt # -c .pin/../constraints/rocm.txt diff --git a/benchmarks/dlrm/requirements.xpu.txt b/benchmarks/dlrm/requirements.xpu.txt index 2edf5e00d..caf645af3 100644 --- a/benchmarks/dlrm/requirements.xpu.txt +++ b/benchmarks/dlrm/requirements.xpu.txt @@ -285,7 +285,7 @@ varname==0.10.0 # via # -c .pin/../.pin/constraints-xpu-torch.txt # giving -voir==0.2.16 +voir==0.2.17 # via # -c .pin/../.pin/constraints-xpu-torch.txt # -c .pin/../constraints/xpu.txt diff --git a/benchmarks/flops/requirements.cuda.txt b/benchmarks/flops/requirements.cuda.txt index 91aee8272..8feccde84 100644 ---
a/benchmarks/flops/requirements.cuda.txt +++ b/benchmarks/flops/requirements.cuda.txt @@ -202,7 +202,7 @@ varname==0.10.0 # via # -c .pin/../.pin/constraints-cuda-torch.txt # giving -voir==0.2.16 +voir==0.2.17 # via # -c .pin/../.pin/constraints-cuda-torch.txt # -c .pin/../constraints/cuda.txt diff --git a/benchmarks/flops/requirements.hpu.txt b/benchmarks/flops/requirements.hpu.txt index cb07bf54e..8d32e877d 100644 --- a/benchmarks/flops/requirements.hpu.txt +++ b/benchmarks/flops/requirements.hpu.txt @@ -201,7 +201,7 @@ varname==0.10.0 # via # -c .pin/../.pin/constraints-hpu-torch.txt # giving -voir==0.2.16 +voir==0.2.17 # via # -c .pin/../.pin/constraints-hpu-torch.txt # -c .pin/../constraints/hpu.txt diff --git a/benchmarks/flops/requirements.rocm.txt b/benchmarks/flops/requirements.rocm.txt index dd1c08962..40cf98b59 100644 --- a/benchmarks/flops/requirements.rocm.txt +++ b/benchmarks/flops/requirements.rocm.txt @@ -150,7 +150,7 @@ varname==0.10.0 # via # -c .pin/../.pin/constraints-rocm-torch.txt # giving -voir==0.2.16 +voir==0.2.17 # via # -c .pin/../.pin/constraints-rocm-torch.txt # -c .pin/../constraints/rocm.txt diff --git a/benchmarks/flops/requirements.xpu.txt b/benchmarks/flops/requirements.xpu.txt index dffb8d45c..0f73c2b33 100644 --- a/benchmarks/flops/requirements.xpu.txt +++ b/benchmarks/flops/requirements.xpu.txt @@ -167,7 +167,7 @@ varname==0.10.0 # via # -c .pin/../.pin/constraints-xpu-torch.txt # giving -voir==0.2.16 +voir==0.2.17 # via # -c .pin/../.pin/constraints-xpu-torch.txt # -c .pin/../constraints/xpu.txt diff --git a/benchmarks/huggingface/Makefile b/benchmarks/huggingface/Makefile new file mode 100644 index 000000000..b0a774b89 --- /dev/null +++ b/benchmarks/huggingface/Makefile @@ -0,0 +1,33 @@ +# Use global base if possible +ifndef MILABENCH_BASE + MILABENCH_BASE="base" +endif + +export MILABENCH_BASE + +BENCH_NAME=dinov2_large +MILABENCH_CONFIG=dev.yaml +MILABENCH_ARGS=--config $(MILABENCH_CONFIG) --base $(MILABENCH_BASE) + +all: install prepare single gpus nodes + +install: + milabench install $(MILABENCH_ARGS) --force + +prepare: + milabench prepare $(MILABENCH_ARGS) + +tests: install prepare + milabench run $(MILABENCH_ARGS) + +debug: + CUDA_VISIBLE_DEVICES=0 milabench run $(MILABENCH_ARGS) --select $(BENCH_NAME)-single + +single: + milabench run $(MILABENCH_ARGS) --select $(BENCH_NAME)-single + +gpus: + milabench run $(MILABENCH_ARGS) --select $(BENCH_NAME)-gpus + +nodes: + milabench run $(MILABENCH_ARGS) --select $(BENCH_NAME)-nodes diff --git a/benchmarks/huggingface/bench/__main__.py b/benchmarks/huggingface/bench/__main__.py index e19d3f9a2..f774afa97 100644 --- a/benchmarks/huggingface/bench/__main__.py +++ b/benchmarks/huggingface/bench/__main__.py @@ -41,6 +41,17 @@ def step(self, optimizer): def update(self): pass + +def make_dataloader(args, info): + data = SyntheticData( + n=args.batch_size, + repeat=100000, + generators=generators[info.category](info), + ) + return DataLoader( + data, batch_size=args.batch_size, num_workers=args.num_workers + ) + class Runner: def __init__(self, args): accelerator.set_enable_tf32(is_tf32_allowed(args)) @@ -50,22 +61,18 @@ def __init__(self, args): self.device = accelerator.fetch_device(0) self.batch_size = args.batch_size - info = models[args.model]() - self.model = info.model.to(self.device) + self.info = models[args.model](args) + self.model = self.info.model.to(self.device) self.optimizer = optim.Adam(self.model.parameters(), lr=args.lr) # this causes the bench to fail for one model (reformer) #
dtype=float_dtype(args.precision) self.model, self.optimizer = accelerator.optimize(self.model, optimizer=self.optimizer) - self.data = SyntheticData( - n=args.batch_size, - repeat=100000, - generators=generators[info.category](info), - ) - self.loader = DataLoader( - self.data, batch_size=args.batch_size, num_workers=args.num_workers - ) + if hasattr(self.info, "dataloader"): + self.loader = self.info.dataloader + else: + self.loader = make_dataloader(args, self.info) self.amp_scaler = NoScale() if torch.cuda.is_available(): diff --git a/benchmarks/huggingface/bench/models.py b/benchmarks/huggingface/bench/models.py index 0af98414b..e36780e1c 100644 --- a/benchmarks/huggingface/bench/models.py +++ b/benchmarks/huggingface/bench/models.py @@ -15,8 +15,23 @@ def _make(category, config): return getattr(transformers, category).from_config(config) + +def synthetic_dataset(info, args): + from .synth import SyntheticData, generators + from torch.utils.data import DataLoader + + data = SyntheticData( + n=args.batch_size, + repeat=100000, + generators=generators[info.category](info), + ) + return DataLoader( + data, batch_size=args.batch_size, num_workers=args.num_workers + ) + + @register_model -def Opt350m(): +def Opt350m(args): category = "AutoModelForCausalLM" config = AutoConfig.from_pretrained("facebook/opt-350m") return NS( @@ -29,7 +44,7 @@ def Opt350m(): @register_model -def GPT2(): +def GPT2(args): category = "AutoModelForCausalLM" config = AutoConfig.from_pretrained("gpt2") return NS( @@ -42,7 +57,7 @@ def GPT2(): @register_model -def GPT2_large(): +def GPT2_large(args): category = "AutoModelForCausalLM" config = AutoConfig.from_pretrained("gpt2-large") return NS( @@ -55,7 +70,7 @@ def GPT2_large(): @register_model -def T5(): +def T5(args): category = "AutoModelForSeq2SeqLM" config = AutoConfig.from_pretrained("t5-small") return NS( @@ -68,7 +83,7 @@ def T5(): @register_model -def T5_base(): +def T5_base(args): category = "AutoModelForSeq2SeqLM" config = AutoConfig.from_pretrained("t5-base") return NS( @@ -81,7 +96,7 @@ def T5_base(): @register_model -def T5_large(): +def T5_large(args): category = "AutoModelForSeq2SeqLM" config = AutoConfig.from_pretrained("t5-large") return NS( @@ -94,7 +109,7 @@ def T5_large(): @register_model -def Bart(): +def Bart(args): category = "AutoModelForSeq2SeqLM" config = AutoConfig.from_pretrained("facebook/bart-base") return NS( @@ -107,7 +122,7 @@ def Bart(): @register_model -def Reformer(): +def Reformer(args): category = "AutoModelForMaskedLM" config = ReformerConfig() if not config.num_buckets: @@ -122,7 +137,7 @@ def Reformer(): @register_model -def BigBird(): +def BigBird(args): category = "AutoModelForMaskedLM" config = BigBirdConfig(attention_type="block_sparse") return NS( @@ -135,7 +150,7 @@ def BigBird(): @register_model -def Albert(): +def Albert(args): category = "AutoModelForMaskedLM" config = AutoConfig.from_pretrained("albert-base-v2") return NS( @@ -148,7 +163,7 @@ def Albert(): @register_model -def DistilBert(): +def DistilBert(args): category = "AutoModelForMaskedLM" config = AutoConfig.from_pretrained("distilbert-base-uncased") return NS( @@ -161,7 +176,7 @@ def DistilBert(): @register_model -def Longformer(): +def Longformer(args): category = "AutoModelForMaskedLM" config = AutoConfig.from_pretrained("allenai/longformer-base-4096") return NS( @@ -174,7 +189,7 @@ def Longformer(): @register_model -def Bert(): +def Bert(args): category = "AutoModelForMaskedLM" config = BertConfig() return NS( @@ -187,7 +202,7 @@ def Bert(): 
@register_model -def Bert_large(): +def Bert_large(args): category = "AutoModelForMaskedLM" config = BertConfig(hidden_size=1024, num_hidden_layers=24, num_attention_heads=16) return NS( @@ -200,7 +215,7 @@ @register_model -def Whisper(): +def Whisper(args): category = "AutoModelForAudioClassification" config = AutoConfig.from_pretrained("openai/whisper-tiny") return NS( @@ -212,3 +227,112 @@ def Whisper(): extractor_class=transformers.WhisperFeatureExtractor, model=_make(category, config), ) + + +def dataset_ade20k(args, transform): + from datasets import load_dataset + from torch.utils.data import DataLoader + from collections import defaultdict + import torch + + dataset = load_dataset("helenlu/ade20k", trust_remote_code=True)["train"] + scenes = defaultdict(int) + + def collate_function(data): + # {'image': <PIL.Image>, 'conditioning_image': <PIL.Image>, 'text': 'bathroom'} + images = [] + conditioning_images = [] + texts = [] + labels = [] + + def get_label(txt): + # Note: this is wrong for a real use case because the label + # would change depending on the shuffling + nonlocal scenes + + label = scenes.get(txt) + if label is None: + label = len(scenes) + scenes[txt] = label + return label + + for items in data: + image = items["image"] + conditioning_image = items["conditioning_image"] + text = items["text"] + label = get_label(text) + + texts.append(text) + labels.append(label) + images.append(transform(image, return_tensors="pt")["pixel_values"]) + conditioning_images.append(transform(conditioning_image, return_tensors="pt")["pixel_values"]) + + return { + "pixel_values": torch.cat(images), + "conditioning_images": torch.cat(conditioning_images), + "labels": torch.tensor(labels, dtype=torch.long) + } + + loader = DataLoader( + dataset, + batch_size=args.batch_size, + num_workers=args.num_workers, + collate_fn=collate_function + ) + + for i in loader: + assert i['pixel_values'].shape == (args.batch_size, 3, 224, 224) + print(i['pixel_values'].shape) + print(i['conditioning_images'].shape) + print(i['labels'].shape) + break + + return loader + + +@register_model +def dinov2_large(args): + category = "AutoModel" + config = AutoConfig.from_pretrained("facebook/dinov2-large") + + + def criterion(model_output, dataloader_input): + mask = dataloader_input["conditioning_images"] + print(model_output) + return 0 + + processor = transformers.AutoImageProcessor.from_pretrained('facebook/dinov2-large') + return NS( + category=category, + config=config, + train_length=512, + eval_length=1024, + model=_make(category, config), + transform=processor, + dataloader=dataset_ade20k(args, processor), + model_inputs=lambda x: {"pixel_values": x["pixel_values"]}, + criterion=criterion + ) + + + + + + + +@register_model +def dinov2_giant(args): + category = "AutoModel" + config = AutoConfig.from_pretrained("facebook/dinov2-giant") + processor = transformers.AutoImageProcessor.from_pretrained('facebook/dinov2-large') + + return NS( + category=category, + config=config, + train_length=512, + eval_length=1024, + model=_make(category, config), + transform=processor, + dataloader=dataset_ade20k(args, processor), + model_inputs=lambda x: {"pixel_values": x["pixel_values"]} + ) diff --git a/benchmarks/huggingface/dev.yaml b/benchmarks/huggingface/dev.yaml new file mode 100644 index 000000000..d77a4bf74 --- /dev/null +++ b/benchmarks/huggingface/dev.yaml @@ -0,0 +1,38 @@ + + + +_huggingface: + inherits: _defaults + definition: .
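+  # Dev-only config; driven by the Makefile above (make single / gpus / nodes)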
+ install-variant: unpinned + install_group: torch + + argv: + --model: dinov2_large + +dinov2_large-single: + inherits: _huggingface + + num_machines: 1 + plan: + method: per_gpu + +dinov2_large-gpus: + inherits: _huggingface + + num_machines: 1 + plan: + method: njobs + n: 1 + +dinov2_large-nodes: + inherits: _huggingface + + num_machines: 2 + plan: + method: njobs + n: 1 + + requires_capabilities: + - "len(nodes) >= ${num_machines}" diff --git a/benchmarks/huggingface/prepare.py b/benchmarks/huggingface/prepare.py index d1bdaf280..1f5f80850 100755 --- a/benchmarks/huggingface/prepare.py +++ b/benchmarks/huggingface/prepare.py @@ -7,7 +7,7 @@ args = parser().parse_args() print(f"Preparing {args.model}") make_config = models[args.model] - make_config() + make_config(args) # bert dataset # t5 dataset diff --git a/benchmarks/huggingface/requirements.cuda.txt b/benchmarks/huggingface/requirements.cuda.txt index 40ab23769..7e577f680 100644 --- a/benchmarks/huggingface/requirements.cuda.txt +++ b/benchmarks/huggingface/requirements.cuda.txt @@ -152,6 +152,10 @@ packaging==24.1 # -c .pin/../.pin/constraints-cuda-torch.txt # huggingface-hub # transformers +pillow==10.4.0 + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # -r benchmarks/huggingface/requirements.in psutil==5.9.8 # via # -c .pin/../.pin/constraints-cuda-torch.txt @@ -238,7 +242,7 @@ varname==0.10.0 # via # -c .pin/../.pin/constraints-cuda-torch.txt # giving -voir==0.2.16 +voir==0.2.17 # via # -c .pin/../.pin/constraints-cuda-torch.txt # -c .pin/../constraints/cuda.txt diff --git a/benchmarks/huggingface/requirements.hpu.txt b/benchmarks/huggingface/requirements.hpu.txt index 335c565d6..09452bf07 100644 --- a/benchmarks/huggingface/requirements.hpu.txt +++ b/benchmarks/huggingface/requirements.hpu.txt @@ -237,7 +237,7 @@ varname==0.10.0 # via # -c .pin/../.pin/constraints-hpu-torch.txt # giving -voir==0.2.16 +voir==0.2.17 # via # -c .pin/../.pin/constraints-hpu-torch.txt # -c .pin/../constraints/hpu.txt diff --git a/benchmarks/huggingface/requirements.in b/benchmarks/huggingface/requirements.in index 85d2c3a0a..bd8338ab8 100644 --- a/benchmarks/huggingface/requirements.in +++ b/benchmarks/huggingface/requirements.in @@ -1,3 +1,4 @@ torch transformers voir +pillow diff --git a/benchmarks/huggingface/requirements.rocm.txt b/benchmarks/huggingface/requirements.rocm.txt index 73dbe77b7..ff17b478f 100644 --- a/benchmarks/huggingface/requirements.rocm.txt +++ b/benchmarks/huggingface/requirements.rocm.txt @@ -186,7 +186,7 @@ varname==0.10.0 # via # -c .pin/../.pin/constraints-rocm-torch.txt # giving -voir==0.2.16 +voir==0.2.17 # via # -c .pin/../.pin/constraints-rocm-torch.txt # -c .pin/../constraints/rocm.txt diff --git a/benchmarks/huggingface/requirements.xpu.txt b/benchmarks/huggingface/requirements.xpu.txt index 1084c700c..c1806ada3 100644 --- a/benchmarks/huggingface/requirements.xpu.txt +++ b/benchmarks/huggingface/requirements.xpu.txt @@ -182,7 +182,7 @@ varname==0.10.0 # via # -c .pin/../.pin/constraints-xpu-torch.txt # giving -voir==0.2.16 +voir==0.2.17 # via # -c .pin/../.pin/constraints-xpu-torch.txt # -c .pin/../constraints/xpu.txt diff --git a/benchmarks/lightning/benchfile.py b/benchmarks/lightning/benchfile.py index 09926711f..8e2a4cf81 100644 --- a/benchmarks/lightning/benchfile.py +++ b/benchmarks/lightning/benchfile.py @@ -7,12 +7,6 @@ class LightningBenchmark(Package): prepare_script = "prepare.py" main_script = "main.py" - def make_env(self): - return { - **super().make_env(), - "OMP_NUM_THREADS": 
str(self.config.get("cpus_per_gpu", 8)), - } - def build_run_plan(self): # self.config is not the right config for this plan = super().build_run_plan() diff --git a/benchmarks/lightning/main.py b/benchmarks/lightning/main.py index a83d3b48a..b31f3880c 100644 --- a/benchmarks/lightning/main.py +++ b/benchmarks/lightning/main.py @@ -33,6 +33,21 @@ def configure_optimizers(self): return optimizer + +def prepare_voir(): + from benchmate.observer import BenchObserver + from benchmate.monitor import bench_monitor + + observer = BenchObserver( + accelerator.Event, + earlystop=65, + batch_size_fn=lambda x: len(x[0]), + raise_stop_program=False, + stdout=True, + ) + + return observer, bench_monitor + def main(): parser = argparse.ArgumentParser(description='simple distributed training job') parser.add_argument( @@ -58,19 +73,12 @@ def main(): model = TorchvisionLightning(model) - dataset = imagenet_dataloader(args, model, rank, world_size) + - from benchmate.observer import BenchObserver - accelerator.set_enable_tf32(True) - observer = BenchObserver( - accelerator.Event, - earlystop=65, - batch_size_fn=lambda x: len(x[0]), - raise_stop_program=False, - stdout=True, - ) + observer, monitor = prepare_voir() + loader = observer.loader(imagenet_dataloader(args, model, rank, world_size)) # train model trainer = L.Trainer( @@ -85,7 +93,9 @@ def main(): reload_dataloaders_every_n_epochs=1, max_steps=100 ) - trainer.fit(model=model, train_dataloaders=observer.loader(dataset)) + + with monitor(): + trainer.fit(model=model, train_dataloaders=loader) print("finished: ", rank) diff --git a/benchmarks/lightning/requirements.cuda.txt b/benchmarks/lightning/requirements.cuda.txt index 7032b78ec..583a583d2 100644 --- a/benchmarks/lightning/requirements.cuda.txt +++ b/benchmarks/lightning/requirements.cuda.txt @@ -272,7 +272,7 @@ varname==0.10.0 # via # -c .pin/../.pin/constraints-cuda-torch.txt # giving -voir==0.2.16 +voir==0.2.17 # via # -c .pin/../.pin/constraints-cuda-torch.txt # -c .pin/../constraints/cuda.txt diff --git a/benchmarks/llama/benchfile.py b/benchmarks/llama/benchfile.py index b7bc0032e..977e825f5 100644 --- a/benchmarks/llama/benchfile.py +++ b/benchmarks/llama/benchfile.py @@ -6,12 +6,6 @@ class LLAMA(Package): base_requirements = "requirements.in" main_script = "main.py" - def make_env(self): - return { - **super().make_env(), - "OMP_NUM_THREADS": str(self.config.get("cpus_per_gpu", 8)), - } - async def install(self): await super().install() diff --git a/benchmarks/llama/requirements.cuda.txt b/benchmarks/llama/requirements.cuda.txt index bd0e84db2..771e8d653 100644 --- a/benchmarks/llama/requirements.cuda.txt +++ b/benchmarks/llama/requirements.cuda.txt @@ -333,7 +333,7 @@ varname==0.10.0 # via # -c .pin/../.pin/constraints-cuda-torch.txt # giving -voir==0.2.16 +voir==0.2.17 # via # -c .pin/../.pin/constraints-cuda-torch.txt # -c .pin/../constraints/cuda.txt diff --git a/benchmarks/llama/requirements.hpu.txt b/benchmarks/llama/requirements.hpu.txt index d00a6c54c..e13785223 100644 --- a/benchmarks/llama/requirements.hpu.txt +++ b/benchmarks/llama/requirements.hpu.txt @@ -332,7 +332,7 @@ varname==0.10.0 # via # -c .pin/../.pin/constraints-hpu-torch.txt # giving -voir==0.2.16 +voir==0.2.17 # via # -c .pin/../.pin/constraints-hpu-torch.txt # -c .pin/../constraints/hpu.txt diff --git a/benchmarks/llama/requirements.rocm.txt b/benchmarks/llama/requirements.rocm.txt index 1e21d8476..df86f9eaf 100644 --- a/benchmarks/llama/requirements.rocm.txt +++ b/benchmarks/llama/requirements.rocm.txt @@ -281,7 
+281,7 @@ varname==0.10.0 # via # -c .pin/../.pin/constraints-rocm-torch.txt # giving -voir==0.2.16 +voir==0.2.17 # via # -c .pin/../.pin/constraints-rocm-torch.txt # -c .pin/../constraints/rocm.txt diff --git a/benchmarks/llama/requirements.xpu.txt b/benchmarks/llama/requirements.xpu.txt index 5d7223852..bd69f7e55 100644 --- a/benchmarks/llama/requirements.xpu.txt +++ b/benchmarks/llama/requirements.xpu.txt @@ -277,7 +277,7 @@ varname==0.10.0 # via # -c .pin/../.pin/constraints-xpu-torch.txt # giving -voir==0.2.16 +voir==0.2.17 # via # -c .pin/../.pin/constraints-xpu-torch.txt # -c .pin/../constraints/xpu.txt diff --git a/benchmarks/rwkv/requirements.cuda.txt b/benchmarks/rwkv/requirements.cuda.txt index 495244706..82472b46f 100644 --- a/benchmarks/rwkv/requirements.cuda.txt +++ b/benchmarks/rwkv/requirements.cuda.txt @@ -266,7 +266,7 @@ varname==0.10.0 # via # -c .pin/../.pin/constraints-cuda-torch.txt # giving -voir==0.2.16 +voir==0.2.17 # via -r benchmarks/rwkv/requirements.in yarl==1.9.2 # via diff --git a/benchmarks/rwkv/requirements.hpu.txt b/benchmarks/rwkv/requirements.hpu.txt index d5b7ee978..6655c6d45 100644 --- a/benchmarks/rwkv/requirements.hpu.txt +++ b/benchmarks/rwkv/requirements.hpu.txt @@ -265,7 +265,7 @@ varname==0.10.0 # via # -c .pin/../.pin/constraints-hpu-torch.txt # giving -voir==0.2.16 +voir==0.2.17 # via -r milabench/benchmarks/rwkv/requirements.in yarl==1.9.4 # via diff --git a/benchmarks/rwkv/requirements.rocm.txt b/benchmarks/rwkv/requirements.rocm.txt index 25c68e2c0..2a1877032 100644 --- a/benchmarks/rwkv/requirements.rocm.txt +++ b/benchmarks/rwkv/requirements.rocm.txt @@ -214,7 +214,7 @@ varname==0.10.0 # via # -c .pin/../.pin/constraints-rocm-torch.txt # giving -voir==0.2.16 +voir==0.2.17 # via -r benchmarks/rwkv/requirements.in yarl==1.9.2 # via diff --git a/benchmarks/rwkv/requirements.xpu.txt b/benchmarks/rwkv/requirements.xpu.txt index 2fecadcd5..4f21c1a01 100644 --- a/benchmarks/rwkv/requirements.xpu.txt +++ b/benchmarks/rwkv/requirements.xpu.txt @@ -267,7 +267,7 @@ varname==0.10.0 # via # -c .pin/../.pin/constraints-xpu-torch.txt # giving -voir==0.2.16 +voir==0.2.17 # via # -c .pin/../.pin/constraints-xpu-torch.txt # -c .pin/../constraints/xpu.txt diff --git a/benchmarks/stargan/requirements.cuda.txt b/benchmarks/stargan/requirements.cuda.txt index 9cb7eb695..01f4812bf 100644 --- a/benchmarks/stargan/requirements.cuda.txt +++ b/benchmarks/stargan/requirements.cuda.txt @@ -231,7 +231,7 @@ varname==0.10.0 # via # -c .pin/../.pin/constraints-cuda-torch.txt # giving -voir==0.2.16 +voir==0.2.17 # via # -c .pin/../.pin/constraints-cuda-torch.txt # -c .pin/../constraints/cuda.txt diff --git a/benchmarks/stargan/requirements.hpu.txt b/benchmarks/stargan/requirements.hpu.txt index 07661a575..90f8f6131 100644 --- a/benchmarks/stargan/requirements.hpu.txt +++ b/benchmarks/stargan/requirements.hpu.txt @@ -230,7 +230,7 @@ varname==0.10.0 # via # -c .pin/../.pin/constraints-hpu-torch.txt # giving -voir==0.2.16 +voir==0.2.17 # via # -c .pin/../.pin/constraints-hpu-torch.txt # -c .pin/../constraints/hpu.txt diff --git a/benchmarks/stargan/requirements.rocm.txt b/benchmarks/stargan/requirements.rocm.txt index b9b59daa9..545feace3 100644 --- a/benchmarks/stargan/requirements.rocm.txt +++ b/benchmarks/stargan/requirements.rocm.txt @@ -179,7 +179,7 @@ varname==0.10.0 # via # -c .pin/../.pin/constraints-rocm-torch.txt # giving -voir==0.2.16 +voir==0.2.17 # via # -c .pin/../.pin/constraints-rocm-torch.txt # -c .pin/../constraints/rocm.txt diff --git 
a/benchmarks/stargan/requirements.xpu.txt b/benchmarks/stargan/requirements.xpu.txt index d74970aed..fc8e41942 100644 --- a/benchmarks/stargan/requirements.xpu.txt +++ b/benchmarks/stargan/requirements.xpu.txt @@ -177,7 +177,7 @@ varname==0.10.0 # via # -c .pin/../.pin/constraints-xpu-torch.txt # giving -voir==0.2.16 +voir==0.2.17 # via # -c .pin/../.pin/constraints-xpu-torch.txt # -c .pin/../constraints/xpu.txt diff --git a/benchmarks/super-slomo/requirements.cuda.txt b/benchmarks/super-slomo/requirements.cuda.txt index 0cc1b2000..c4f5fda2d 100644 --- a/benchmarks/super-slomo/requirements.cuda.txt +++ b/benchmarks/super-slomo/requirements.cuda.txt @@ -199,7 +199,7 @@ varname==0.10.0 # via # -c .pin/../.pin/constraints-cuda-torch.txt # giving -voir==0.2.16 +voir==0.2.17 # via # -c .pin/../.pin/constraints-cuda-torch.txt # -c .pin/../constraints/cuda.txt diff --git a/benchmarks/super-slomo/requirements.hpu.txt b/benchmarks/super-slomo/requirements.hpu.txt index f3a9430ee..4b81352dd 100644 --- a/benchmarks/super-slomo/requirements.hpu.txt +++ b/benchmarks/super-slomo/requirements.hpu.txt @@ -198,7 +198,7 @@ varname==0.10.0 # via # -c .pin/../.pin/constraints-hpu-torch.txt # giving -voir==0.2.16 +voir==0.2.17 # via # -c .pin/../.pin/constraints-hpu-torch.txt # -c .pin/../constraints/hpu.txt diff --git a/benchmarks/super-slomo/requirements.rocm.txt b/benchmarks/super-slomo/requirements.rocm.txt index d5a8e913f..a157466d1 100644 --- a/benchmarks/super-slomo/requirements.rocm.txt +++ b/benchmarks/super-slomo/requirements.rocm.txt @@ -147,7 +147,7 @@ varname==0.10.0 # via # -c .pin/../.pin/constraints-rocm-torch.txt # giving -voir==0.2.16 +voir==0.2.17 # via # -c .pin/../.pin/constraints-rocm-torch.txt # -c .pin/../constraints/rocm.txt diff --git a/benchmarks/super-slomo/requirements.xpu.txt b/benchmarks/super-slomo/requirements.xpu.txt index cbd3246f3..65c4ab94d 100644 --- a/benchmarks/super-slomo/requirements.xpu.txt +++ b/benchmarks/super-slomo/requirements.xpu.txt @@ -164,7 +164,7 @@ varname==0.10.0 # via # -c .pin/../.pin/constraints-xpu-torch.txt # giving -voir==0.2.16 +voir==0.2.17 # via # -c .pin/../.pin/constraints-xpu-torch.txt # -c .pin/../constraints/xpu.txt diff --git a/benchmarks/timm/benchfile.py b/benchmarks/timm/benchfile.py index 94be19e6b..52a31ba1d 100644 --- a/benchmarks/timm/benchfile.py +++ b/benchmarks/timm/benchfile.py @@ -12,12 +12,6 @@ class TimmBenchmarkPack(Package): @property def working_directory(self): return self.dirs.code / "pytorch-image-models" - - def make_env(self): - return { - **super().make_env(), - "OMP_NUM_THREADS": str(self.config.get("cpus_per_gpu", 8)), - } @property def argv(self): diff --git a/benchmarks/timm/requirements.cuda.txt b/benchmarks/timm/requirements.cuda.txt index 6d10fab4e..d2eff2657 100644 --- a/benchmarks/timm/requirements.cuda.txt +++ b/benchmarks/timm/requirements.cuda.txt @@ -230,7 +230,7 @@ varname==0.10.0 # via # -c .pin/../.pin/constraints-cuda-torch.txt # giving -voir==0.2.16 +voir==0.2.17 # via # -c .pin/../.pin/constraints-cuda-torch.txt # -c .pin/../constraints/cuda.txt diff --git a/benchmarks/timm/requirements.hpu.txt b/benchmarks/timm/requirements.hpu.txt index 42ec5ab89..c44d335cd 100644 --- a/benchmarks/timm/requirements.hpu.txt +++ b/benchmarks/timm/requirements.hpu.txt @@ -229,7 +229,7 @@ varname==0.10.0 # via # -c .pin/../.pin/constraints-hpu-torch.txt # giving -voir==0.2.16 +voir==0.2.17 # via # -c .pin/../.pin/constraints-hpu-torch.txt # -c .pin/../constraints/hpu.txt diff --git 
a/benchmarks/timm/requirements.rocm.txt b/benchmarks/timm/requirements.rocm.txt index 6bfbef023..996a684ee 100644 --- a/benchmarks/timm/requirements.rocm.txt +++ b/benchmarks/timm/requirements.rocm.txt @@ -178,7 +178,7 @@ varname==0.10.0 # via # -c .pin/../.pin/constraints-rocm-torch.txt # giving -voir==0.2.16 +voir==0.2.17 # via # -c .pin/../.pin/constraints-rocm-torch.txt # -c .pin/../constraints/rocm.txt diff --git a/benchmarks/timm/requirements.xpu.txt b/benchmarks/timm/requirements.xpu.txt index 9280b2fe6..78f3c4a00 100644 --- a/benchmarks/timm/requirements.xpu.txt +++ b/benchmarks/timm/requirements.xpu.txt @@ -176,7 +176,7 @@ varname==0.10.0 # via # -c .pin/../.pin/constraints-xpu-torch.txt # giving -voir==0.2.16 +voir==0.2.17 # via # -c .pin/../.pin/constraints-xpu-torch.txt # -c .pin/../constraints/xpu.txt diff --git a/benchmarks/torchvision/requirements.cuda.txt b/benchmarks/torchvision/requirements.cuda.txt index ea33e86f2..496c06da7 100644 --- a/benchmarks/torchvision/requirements.cuda.txt +++ b/benchmarks/torchvision/requirements.cuda.txt @@ -202,7 +202,7 @@ varname==0.10.0 # via # -c .pin/../.pin/constraints-cuda-torch.txt # giving -voir==0.2.16 +voir==0.2.17 # via # -c .pin/../.pin/constraints-cuda-torch.txt # -c .pin/../constraints/cuda.txt diff --git a/benchmarks/torchvision/requirements.hpu.txt b/benchmarks/torchvision/requirements.hpu.txt index 08f384f9c..6ff49f3e0 100644 --- a/benchmarks/torchvision/requirements.hpu.txt +++ b/benchmarks/torchvision/requirements.hpu.txt @@ -201,7 +201,7 @@ varname==0.10.0 # via # -c .pin/../.pin/constraints-hpu-torch.txt # giving -voir==0.2.16 +voir==0.2.17 # via # -c .pin/../.pin/constraints-hpu-torch.txt # -c .pin/../constraints/hpu.txt diff --git a/benchmarks/torchvision/requirements.rocm.txt b/benchmarks/torchvision/requirements.rocm.txt index 01b9ff663..949948e40 100644 --- a/benchmarks/torchvision/requirements.rocm.txt +++ b/benchmarks/torchvision/requirements.rocm.txt @@ -150,7 +150,7 @@ varname==0.10.0 # via # -c .pin/../.pin/constraints-rocm-torch.txt # giving -voir==0.2.16 +voir==0.2.17 # via # -c .pin/../.pin/constraints-rocm-torch.txt # -c .pin/../constraints/rocm.txt diff --git a/benchmarks/torchvision/requirements.xpu.txt b/benchmarks/torchvision/requirements.xpu.txt index 61a0b51e1..677f04f8b 100644 --- a/benchmarks/torchvision/requirements.xpu.txt +++ b/benchmarks/torchvision/requirements.xpu.txt @@ -167,7 +167,7 @@ varname==0.10.0 # via # -c .pin/../.pin/constraints-xpu-torch.txt # giving -voir==0.2.16 +voir==0.2.17 # via # -c .pin/../.pin/constraints-xpu-torch.txt # -c .pin/../constraints/xpu.txt diff --git a/benchmarks/torchvision_ddp/requirements.cuda.txt b/benchmarks/torchvision_ddp/requirements.cuda.txt index 68bacfebb..20b56dbed 100644 --- a/benchmarks/torchvision_ddp/requirements.cuda.txt +++ b/benchmarks/torchvision_ddp/requirements.cuda.txt @@ -202,7 +202,7 @@ varname==0.10.0 # via # -c .pin/../.pin/constraints-cuda-torch.txt # giving -voir==0.2.16 +voir==0.2.17 # via # -c .pin/../.pin/constraints-cuda-torch.txt # -c .pin/../constraints/cuda.txt diff --git a/benchmarks/torchvision_ddp/requirements.rocm.txt b/benchmarks/torchvision_ddp/requirements.rocm.txt index 150ce9d63..790298ffb 100644 --- a/benchmarks/torchvision_ddp/requirements.rocm.txt +++ b/benchmarks/torchvision_ddp/requirements.rocm.txt @@ -150,7 +150,7 @@ varname==0.10.0 # via # -c .pin/../.pin/constraints-rocm-torch.txt # giving -voir==0.2.16 +voir==0.2.17 # via # -c .pin/../.pin/constraints-rocm-torch.txt # -c .pin/../constraints/rocm.txt diff --git 
a/benchmarks/torchvision_ddp/requirements.xpu.txt b/benchmarks/torchvision_ddp/requirements.xpu.txt index 9dd665ed2..4465de4fe 100644 --- a/benchmarks/torchvision_ddp/requirements.xpu.txt +++ b/benchmarks/torchvision_ddp/requirements.xpu.txt @@ -167,7 +167,7 @@ varname==0.10.0 # via # -c .pin/../.pin/constraints-xpu-torch.txt # giving -voir==0.2.16 +voir==0.2.17 # via # -c .pin/../.pin/constraints-xpu-torch.txt # -c .pin/../constraints/xpu.txt diff --git a/benchmate/benchmate/datagen.py b/benchmate/benchmate/datagen.py index 0fee96a4d..a7a753099 100644 --- a/benchmate/benchmate/datagen.py +++ b/benchmate/benchmate/datagen.py @@ -85,10 +85,9 @@ def device_count(): return acc.device_count() except: return 1 + -def generate_fakeimagenet(): - # config = json.loads(os.environ["MILABENCH_CONFIG"]) - +def fakeimagenet_args(): parser = argparse.ArgumentParser() parser.add_argument("--batch-size", default=512, type=int) parser.add_argument("--batch-count", default=60, type=int) @@ -97,8 +96,15 @@ def generate_fakeimagenet(): parser.add_argument("--image-size", default=[3, 384, 384], type=int, nargs="+") parser.add_argument("--val", default=0.1, type=float, nargs="+") parser.add_argument("--test", default=0.1, type=float, nargs="+") - args, _ = parser.parse_known_args() + return args + + +def generate_fakeimagenet(args=None): + # config = json.loads(os.environ["MILABENCH_CONFIG"]) + + if args is None: + args = fakeimagenet_args() if overrides := os.getenv("MILABENCH_TESTING_PREPARE"): bs, bc = overrides.split(",") diff --git a/benchmate/benchmate/monitor.py b/benchmate/benchmate/monitor.py index a42a91fc3..dd8202ba1 100644 --- a/benchmate/benchmate/monitor.py +++ b/benchmate/benchmate/monitor.py @@ -121,6 +121,19 @@ def monogpu_monitor(*args, **kwargs): yield log + +@contextmanager +def bench_monitor(*args, **kwargs): + if int(os.getenv("RANK", -1)) == -1: + with monogpu_monitor(*args, **kwargs) as mon: + yield mon + + elif int(os.getenv("RANK", -1)) == 0: + with multigpu_monitor(*args, **kwargs) as mon: + yield mon + else: + yield + # # Legacy compatibility # @@ -135,3 +148,38 @@ def milabench_sys_monitor(monogpu=False): return setupvoir(monogpu) + +def get_rank(): + try: + return int(os.getenv("RANK", -1)) + except: + return -1 + + +def voirfile_monitor(ov, options): + from voir.instruments import early_stop, log, dash + + if options.dash: + ov.require(dash) + + instruments = [ + log( + "value", "progress", "rate", "units", "loss", "gpudata", context="task" + ) + ] + + rank = get_rank() + + # -1 & 0 early stop + if rank <= 0: + instruments.append(early_stop(n=options.stop, key="rate", task="train", signal="stop")) + + # mono gpu if rank is not set + if rank == -1: + instruments.append(monitor_monogpu(poll_interval=options.gpu_poll)) + + # rank is set only monitor main rank + if rank == 0: + instruments.append(monitor_node(poll_interval=options.gpu_poll)) + + ov.require(*instruments) diff --git a/config/base.yaml b/config/base.yaml index e47a78648..1b50bdeaf 100644 --- a/config/base.yaml +++ b/config/base.yaml @@ -112,13 +112,6 @@ _timm: --dataset: "FakeImageNet" --workers: "auto({n_worker}, 8)" -_sb3: - inherits: _defaults - definition: ../benchmarks/stable_baselines3 - group: sb3 - plan: - method: njobs - n: 1 _accelerate_opt: inherits: _defaults @@ -226,7 +219,7 @@ resnet50-noio: --batch-size: 256 --loader: synthetic_fixed -resnet152-ddp: +resnet152-ddp-gpus: inherits: _torchvision_ddp tags: - vision @@ -391,7 +384,7 @@ resnet152: --model: resnet152 --batch-size: 256 -resnet152-multi: 
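The bench_monitor and voirfile_monitor helpers added above in benchmate/benchmate/monitor.py both dispatch on the RANK environment variable that torchrun sets: unset (treated as -1) means a plain single-process run, 0 is the main rank of a distributed job, and every other rank stays silent so metrics are not reported twice. A minimal, self-contained sketch of that dispatch pattern (the monitor factories here are placeholders, not benchmate's actual instruments):

```python
import os
from contextlib import contextmanager


def get_rank() -> int:
    """RANK is set by torchrun; unset or non-integer values mean mono-GPU (-1)."""
    try:
        return int(os.getenv("RANK", -1))
    except ValueError:
        return -1


@contextmanager
def rank_aware_monitor(make_mono, make_multi):
    rank = get_rank()
    if rank == -1:
        # No launcher: monitor the single local GPU.
        with make_mono() as mon:
            yield mon
    elif rank == 0:
        # Distributed run: only the main rank publishes node-wide stats.
        with make_multi() as mon:
            yield mon
    else:
        # Secondary ranks emit nothing, avoiding duplicated metrics.
        yield None
```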
+resnet152-gpus: inherits: resnet152 tags: - multigpu @@ -427,7 +420,7 @@ davit_large: --batch-size: 128 --lr-base: 0.01 -davit_large-multi: +davit_large-gpus: inherits: davit_large tags: - multigpu @@ -446,7 +439,7 @@ focalnet: argv: --model: focalnet_base_lrf -opt-1_3b: +opt-1_3b-gpus: inherits: _accelerate_opt tags: - multigpu @@ -458,8 +451,8 @@ opt-1_3b: use_deepspeed: false num_machines: 1 -opt-1_3b-multinode: - inherits: opt-1_3b +opt-1_3b-nodes: + inherits: opt-1_3b-gpus tags: - multinode @@ -469,7 +462,7 @@ opt-1_3b-multinode: docker_image: "ghcr.io/mila-iqia/milabench:cuda-nightly" num_machines: 2 -opt-6_7b: +opt-6_7b-gpus: inherits: _accelerate_opt tags: - multigpu @@ -480,8 +473,8 @@ opt-6_7b: num_machines: 1 -opt-6_7b-multinode: - inherits: opt-6_7b +opt-6_7b-nodes: + inherits: opt-6_7b-gpus tags: - multinode @@ -533,40 +526,6 @@ super-slomo: --loader: pytorch --num_workers: "auto({n_worker}, 8)" -ppo: - inherits: _sb3 - tags: - - rl - - argv: - --algo: ppo - --env: HalfCheetahBulletEnv-v0 - -n: '-1' - --num-threads: '-1' - --seed: '0' - --vec-env: subproc - --device: auto - --: [-params, n_envs:16, n_steps:512, n_epochs:20, n_timesteps:50000] - -td3: - inherits: _sb3 - tags: - - rl - - argv: - --algo: td3 - --env: HalfCheetahBulletEnv-v0 # Default: CartPole-v1 - --n-eval-envs: '1' - --n-timesteps: '50000' # Default: '-1' - --num-threads: '-1' - --log-interval: '-1' - --eval-episodes: '5' - --save-freq: '-1' - --seed: '0' # Default: -1 - --vec-env: subproc # Default: dummy - --device: auto - --n-trials: '10' # Default: 500 - --n-jobs: '1' dlrm: inherits: _defaults @@ -658,8 +617,9 @@ _diffusion: install_group: torch argv: - --train_batch_size: 32 --num_epochs: 5 + --batch_size: 32 + --num_workers: "auto({n_worker}, 8)" diffusion-gpus: inherits: _diffusion @@ -680,6 +640,7 @@ _lightning: --loader: pytorch --data: "{milabench_data}/FakeImageNet" --model: resnet152 + --batch-size: 16 lightning: inherits: _lightning @@ -693,3 +654,26 @@ lightning-gpus: plan: method: njobs n: 1 + +_dinov2: + inherits: _defaults + definition: ../benchmarks/dinov2 + install_group: torch + plan: + method: njobs + n: 1 + + argv: + --output-dir: "{milabench_extra}/output" + --no-resume: true + + +dinov2-giant-gpus: + inherits: _dinov2 + argv: + --config-file: src/dinov2/configs/train/vitg14.yaml + # THOSE NEED TO BE LAST + train.dataset_path=ImageNet:split=TRAIN:root={milabench_data}/FakeImageNet:extra={milabench_data}/FakeImageNet: true + train.batch_size_per_gpu=32: true + train.saveckp_freq=100: true + train.num_workers=10: true diff --git a/config/retired.yaml b/config/retired.yaml new file mode 100644 index 000000000..a88983b38 --- /dev/null +++ b/config/retired.yaml @@ -0,0 +1,42 @@ +_sb3: + inherits: _defaults + definition: ../benchmarks/stable_baselines3 + group: sb3 + plan: + method: njobs + n: 1 + +ppo: + inherits: _sb3 + tags: + - rl + + argv: + --algo: ppo + --env: HalfCheetahBulletEnv-v0 + -n: '-1' + --num-threads: '-1' + --seed: '0' + --vec-env: subproc + --device: auto + --: [-params, n_envs:16, n_steps:512, n_epochs:20, n_timesteps:50000] + +td3: + inherits: _sb3 + tags: + - rl + + argv: + --algo: td3 + --env: HalfCheetahBulletEnv-v0 # Default: CartPole-v1 + --n-eval-envs: '1' + --n-timesteps: '50000' # Default: '-1' + --num-threads: '-1' + --log-interval: '-1' + --eval-episodes: '5' + --save-freq: '-1' + --seed: '0' # Default: -1 + --vec-env: subproc # Default: dummy + --device: auto + --n-trials: '10' # Default: 500 + --n-jobs: '1' diff --git a/config/scaling.yaml b/config/scaling.yaml 
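In the new dinov2-giant-gpus entry above, ordinary --flag: value pairs sit next to Hydra-style train.key=value: true entries, and the comment notes the latter must come last. One plausible reading of that convention (a sketch only; milabench's actual argv rendering may differ) is that a true value marks a bare token to append after the regular flags:

```python
def render_argv(argv: dict) -> list[str]:
    """Flatten a milabench-style argv mapping into CLI tokens.

    Entries whose value is True are emitted verbatim as bare tokens;
    everything else becomes a "--flag value" pair. Bare tokens go last,
    which is what the DINOv2 config overrides require.
    """
    flags, positionals = [], []
    for key, value in argv.items():
        if value is True:
            positionals.append(key)
        else:
            flags.extend([key, str(value)])
    return flags + positionals


# {"--config-file": "src/dinov2/configs/train/vitg14.yaml",
#  "train.batch_size_per_gpu=32": True}
# -> ["--config-file", "src/dinov2/configs/train/vitg14.yaml",
#     "train.batch_size_per_gpu=32"]
```

This ordering matches the generated commands in the regression scripts below, where the train.* overrides trail the --output-dir, --no-resume and --config-file arguments.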
index 21b1e47bf..c6cf1bf6c 100644 --- a/config/scaling.yaml +++ b/config/scaling.yaml @@ -354,3 +354,28 @@ whisper: 128: 71634.375 MiB 144: 80412.75 MiB optimized: 128 + + +diffusion-gpus: + arg: --batch_size + model: + 1: 23082 MiB + 16: 37778 MiB + 32: 57808 MiB + 48: 80698 MiB + optimized: 32 + + +lightning-gpus: + arg: --batch-size + model: + 1: 4542 MiB + 16: 5692 MiB + 128: 15858 MiB + optimized: 16 + +dinov2-giant-gpus: + arg: train.batch_size_per_gpu={batch_size} + model: + 32: 69614 MiB + optimized: 32 diff --git a/config/standard.yaml b/config/standard.yaml index 3f43a8055..d7f55c9f0 100644 --- a/config/standard.yaml +++ b/config/standard.yaml @@ -77,7 +77,7 @@ resnet152: enabled: true weight: 1.0 -resnet152-multi: +resnet152-gpus: enabled: true weight: 5.0 @@ -89,23 +89,23 @@ davit_large: enabled: true weight: 1.0 -davit_large-multi: +davit_large-gpus: enabled: true weight: 5.0 -opt-1_3b: +opt-1_3b-gpus: enabled: true weight: 5.0 -opt-1_3b-multinode: +opt-1_3b-nodes: enabled: true weight: 10.0 -opt-6_7b: +opt-6_7b-gpus: enabled: true weight: 5.0 -opt-6_7b-multinode: +opt-6_7b-nodes: enabled: true weight: 10.0 @@ -151,10 +151,4 @@ brax: ################## # Disabled tests # -################## - -ppo: - enabled: false - -td3: - enabled: false +################## \ No newline at end of file diff --git a/milabench/_version.py b/milabench/_version.py index 2b0595841..d8ae9287b 100644 --- a/milabench/_version.py +++ b/milabench/_version.py @@ -1,5 +1,5 @@ """This file is generated, do not modify""" -__tag__ = "v0.1.0-36-g67047d6f" -__commit__ = "67047d6f4634ac37ad861b53e35afb088a05dcc2" -__date__ = "2024-07-23 13:58:00 -0400" +__tag__ = "v0.1.0-30-g64aa548b" +__commit__ = "64aa548ba07d3c6bb298e435b8ac43c69eb75738" +__date__ = "2024-07-26 13:07:25 -0400" diff --git a/milabench/cli/dry.py b/milabench/cli/dry.py index dabc5a1de..80d55d6ea 100644 --- a/milabench/cli/dry.py +++ b/milabench/cli/dry.py @@ -134,6 +134,7 @@ class Arguments: capacity: int = 80000 withenv: bool = True usevoir: bool = False + ncpu: int = 256 # fmt: on @@ -142,13 +143,14 @@ def arguments(): ngpu: Option & int = 8 capacity: Option & int = 80000 nnodes: Option & int = 2 + ncpu: Option & int = 256 # [negate] withenv: Option & bool = True # [negate] usevoir: Option & bool = True - return Arguments(nnodes, ngpu, capacity, withenv, usevoir) + return Arguments(nnodes, ngpu, capacity, withenv, usevoir, ncpu) @tooled @@ -180,6 +182,18 @@ def multipack_args(conf: Arguments): return args +@contextmanager +def with_env(**kwargs): + for k, v in kwargs.items(): + if v: + os.environ[k] = str(v) + + yield + + for k, v in kwargs.items(): + if v: + del os.environ[k] + @tooled def cli_dry(args=None): @@ -192,7 +206,7 @@ def cli_dry(args=None): args = arguments() with disable_voir(enabled=False), enable_offline(enabled=True): - with assume_gpu(args.ngpu, args.capacity, enabled=True): + with assume_gpu(args.ngpu, args.capacity, enabled=True), with_env(MILABENCH_CPU_TOTAL_COUNT=args.ncpu): repeat = 1 mp = get_multipack(multipack_args(args), run_name="dev") gen = BashGenerator() diff --git a/milabench/metadata.py b/milabench/metadata.py index 3aae4d612..eba6891b1 100644 --- a/milabench/metadata.py +++ b/milabench/metadata.py @@ -26,7 +26,7 @@ def _get_gpu_info(): def fetch_torch_version(pack): cwd = pack.dirs.code exec_env = pack.full_env(dict()) - + result = subprocess.run( [str(x) for x in ["python", torchversion.__file__]], env=exec_env, diff --git a/milabench/pack.py b/milabench/pack.py index 214b4c7e1..60a5df2f7 100644 --- 
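The with_env helper added to milabench/cli/dry.py above sets the given environment variables for the duration of the block and deletes them on exit; note that a variable that was already set before entering is deleted rather than restored. A restoring variant looks like this (a sketch under that observation, not the committed helper):

```python
import os
from contextlib import contextmanager


@contextmanager
def with_env(**kwargs):
    """Temporarily set environment variables, restoring prior values on exit."""
    saved = {}
    for k, v in kwargs.items():
        if v is not None:
            saved[k] = os.environ.get(k)  # remember what was there before
            os.environ[k] = str(v)
    try:
        yield
    finally:
        for k, old in saved.items():
            if old is None:
                os.environ.pop(k, None)
            else:
                os.environ[k] = old
```

cli_dry then wraps script generation in with_env(MILABENCH_CPU_TOTAL_COUNT=args.ncpu), so the dry-run commands are computed against a fixed core count instead of whatever machine runs the tests.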
a/milabench/pack.py +++ b/milabench/pack.py @@ -329,11 +329,15 @@ def make_env(self): "MILABENCH_CONFIG": json.dumps(self.config), } """ + from .sizer import resolve_placeholder + env = { f"MILABENCH_DIR_{name.upper()}": path for name, path in self.config["dirs"].items() } + env["OMP_NUM_THREADS"] = resolve_placeholder(self, "{cpu_per_gpu}") + env["MILABENCH_CONFIG"] = json.dumps(self.config) if self.phase == "prepare" or self.phase == "run": # XDG_CACHE_HOME controls basically all caches (pip, torch, huggingface, diff --git a/milabench/sizer.py b/milabench/sizer.py index cdcb57695..bc88e355a 100644 --- a/milabench/sizer.py +++ b/milabench/sizer.py @@ -172,7 +172,7 @@ def argv(self, benchmark, capacity, argv): return argv newsize = self.size(benchmark, capacity) - if newsize is None: return argv @@ -181,7 +181,24 @@ def argv(self, benchmark, capacity, argv): argname = config.get("arg") if argname is None: return argv + + # placeholder replace + # train.batch_size_per_gpu={batch_size} + placeholder = "{batch_size}" + if placeholder in argname: + newval = argname.format(batch_size=str(newsize)) + + for i, arg in enumerate(argv): + if str(arg).startswith(argname[0:-len(placeholder)]): + break + else: + return argv + [newval] + + argv[i] = newval + return argv + # positional argument replace + # --argname {batch_size} for i, arg in enumerate(argv): if str(arg).endswith(argname): break @@ -342,7 +359,7 @@ def auto(value, default): def clamp(x, mn=options.cpu_min, mx=options.cpu_max): return min(max(x, mn), mx) - total_cpu = multiprocessing.cpu_count() + total_cpu = options.total_count or multiprocessing.cpu_count() total_available = total_cpu - options.reserved_cores context["cpu_count"] = total_available @@ -367,6 +384,11 @@ def auto_eval(arg): return auto_eval +def resolve_placeholder(pack, value): + resolver = new_argument_resolver(pack) + return resolver(value) + + def resolve_argv(pack, argv): resolver = new_argument_resolver(pack) argv = list(argv) diff --git a/milabench/system.py b/milabench/system.py index 35776946f..7db61e5ea 100644 --- a/milabench/system.py +++ b/milabench/system.py @@ -93,6 +93,10 @@ def option(name, etype, default=None): return None +def defaultfield(name, type, default=None): + return field(default_factory=lambda: option(name, type, default)) + + def is_autoscale_enabled(): return option("sizer.auto", int, 0) > 0 @@ -105,68 +109,70 @@ def default_save_location(): @dataclass class SizerOptions: # overrides the batch size to use for all benchmarks - size: int = option("sizer.batch_size", int) + size: int = defaultfield("sizer.batch_size", int, None) # Enables auto batch resize - autoscale: bool = option("sizer.auto", int, 0) + autoscale: bool = defaultfield("sizer.auto", int, 0) # Constrain the batch size to be a multiple of a number - multiple: int = option("sizer.multiple", int, 8) + multiple: int = defaultfield("sizer.multiple", int, 8) # Constrain the batch size to be a power of a specified base (usually 2) - power: int = option("sizer.power", int) + power: int = defaultfield("sizer.power", int) # Use the optimized batch size - optimized: bool = option("sizer.optimized", int) + optimized: bool = defaultfield("sizer.optimized", int) # Set a target VRAM capacity to use - capacity: str = option("sizer.capacity", str) + capacity: str = defaultfield("sizer.capacity", str) # Save the batch size, VRAM usage data to a scaling file - save: str = option("sizer.save", str, None) + save: str = defaultfield("sizer.save", str, None)
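The sizer.py hunk above handles two shapes of batch-size argument: the flag form, where the value follows a --batch-size-style flag, and the new single-token placeholder form used by dinov2-giant-gpus in config/scaling.yaml (train.batch_size_per_gpu={batch_size}). A condensed sketch of the substitution (the flag branch assumes the value token immediately follows the flag, as the truncated context suggests; the real method also consults the memory-scaling model to choose newsize):

```python
def substitute_batch_size(argv: list[str], argname: str, newsize: int) -> list[str]:
    """Rewrite (or append) the batch-size argument in an argv list."""
    argv = list(argv)
    placeholder = "{batch_size}"
    if placeholder in argname:
        # Single-token form, e.g. "train.batch_size_per_gpu={batch_size}".
        prefix = argname[: -len(placeholder)]
        newval = argname.format(batch_size=str(newsize))
        for i, arg in enumerate(argv):
            if str(arg).startswith(prefix):
                argv[i] = newval
                break
        else:
            argv.append(newval)  # override not present yet: append it
        return argv
    # Flag form, e.g. "--batch-size 32": the value token follows the flag.
    for i, arg in enumerate(argv):
        if str(arg).endswith(argname):
            argv[i + 1] = str(newsize)
            break
    return argv
```

@dataclass class CPUOptions: - enabled: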
bool = option("cpu.auto", bool, False) + enabled: bool = defaultfield("cpu.auto", bool, False) + + total_count: bool = defaultfield("cpu.total_count", int, None) # max number of CPU per GPU - cpu_max: int = option("cpu.max", int, 16) + cpu_max: int = defaultfield("cpu.max", int, 16) # min number of CPU per GPU - cpu_min: int = option("cpu.min", int, 2) + cpu_min: int = defaultfield("cpu.min", int, 2) # reserved CPU cores (i.e not available for the benchmark) - reserved_cores: int = option("cpu.reserved_cores", int, 0) + reserved_cores: int = defaultfield("cpu.reserved_cores", int, 0) # Number of workers (ignores cpu_max and cpu_min) - n_workers: int = option("cpu.n_workers", int) + n_workers: int = defaultfield("cpu.n_workers", int) @dataclass class DatasetConfig: # If use buffer is true then datasets are copied to the buffer before running the benchmark - use_buffer: bool = option("data.use_buffer", bool, default=False) + use_buffer: bool = defaultfield("data.use_buffer", bool, default=False) # buffer location to copy the datasets bfore running the benchmarks - buffer: str = option("data.buffer", str, default="${dirs.base}/buffer") + buffer: str = defaultfield("data.buffer", str, default="${dirs.base}/buffer") @dataclass class Dirs: """Common directories used by milabench. This can be used to override location in case compute node do not have internet access.""" - venv: str = option("dirs.venv", str, default="${dirs.base}/venv/${install_group}") - data: str = option("dirs.data", str, default="${dirs.base}/data") - runs: str = option("dirs.runs", str, default="${dirs.base}/runs") - extra: str = option("dirs.extra", str, default="${dirs.base}/extra/${group}") - cache: str = option("dirs.cache", str, default="${dirs.base}/cache") + venv: str = defaultfield("dirs.venv", str, default="${dirs.base}/venv/${install_group}") + data: str = defaultfield("dirs.data", str, default="${dirs.base}/data") + runs: str = defaultfield("dirs.runs", str, default="${dirs.base}/runs") + extra: str = defaultfield("dirs.extra", str, default="${dirs.base}/extra/${group}") + cache: str = defaultfield("dirs.cache", str, default="${dirs.base}/cache") @dataclass class Torchrun: - port: int = option("torchrun.port", int, default=29400) - backend: str = option("torchrun.backend", str, default="c10d") + port: int = defaultfield("torchrun.port", int, default=29400) + backend: str = defaultfield("torchrun.backend", str, default="c10d") @dataclass @@ -180,6 +186,7 @@ class Options: @dataclass class GPUConfig: + arch: str = defaultfield("gpu.arch", str, None) capacity: str = None @@ -194,23 +201,23 @@ class Nodes: @dataclass class Github: - pat: str = option("github.path", str, None) + pat: str = defaultfield("github.path", str, None) @dataclass class SystemConfig: """This is meant to be an exhaustive list of all the environment overrides""" - arch: str = getenv("MILABENCH_GPU_ARCH", str) + arch: str = defaultfield("gpu.arch", str, None) sshkey: str = None docker_image: str = None nodes: list[Nodes] = field(default_factory=list) gpu: GPUConfig = None options: Options = None - base: str = option("base", str, None) - config: str = option("config", str, None) - dash: bool = option("dash", bool, 1) - noterm: bool = option("noterm", bool, 0) + base: str = defaultfield("base", str, None) + config: str = defaultfield("config", str, None) + dash: bool = defaultfield("dash", bool, 1) + noterm: bool = defaultfield("noterm", bool, 0) github: Github = None diff --git a/poetry.lock b/poetry.lock index dae2f20fd..ec0f16753 100644 --- 
a/poetry.lock +++ b/poetry.lock @@ -718,13 +718,13 @@ files = [ [[package]] name = "importlib-metadata" -version = "8.1.0" +version = "8.2.0" description = "Read metadata from Python packages" optional = false python-versions = ">=3.8" files = [ - {file = "importlib_metadata-8.1.0-py3-none-any.whl", hash = "sha256:3cd29f739ed65973840b068e3132135ce954c254d48b5b640484467ef7ab3c8c"}, - {file = "importlib_metadata-8.1.0.tar.gz", hash = "sha256:fcdcb1d5ead7bdf3dd32657bb94ebe9d2aabfe89a19782ddc32da5041d6ebfb4"}, + {file = "importlib_metadata-8.2.0-py3-none-any.whl", hash = "sha256:11901fa0c2f97919b288679932bb64febaeacf289d18ac84dd68cb2e74213369"}, + {file = "importlib_metadata-8.2.0.tar.gz", hash = "sha256:72e8d4399996132204f9a16dcc751af254a48f8d1b20b9ff0f98d4a8f901e73d"}, ] [package.dependencies] @@ -2129,13 +2129,13 @@ test = ["covdefaults (>=2.3)", "coverage (>=7.2.7)", "coverage-enable-subprocess [[package]] name = "voir" -version = "0.2.16" +version = "0.2.17" description = "Instrument, extend and visualize your programs" optional = false python-versions = "<4.0,>=3.7" files = [ - {file = "voir-0.2.16-py3-none-any.whl", hash = "sha256:d0beee6778e4d37f6c087362f55baa526b286399b509443b547ca3844332808c"}, - {file = "voir-0.2.16.tar.gz", hash = "sha256:73feda0b941e9247ca333611b9a8207ab6d154b2f6c5cf34f25770204aef9b1e"}, + {file = "voir-0.2.17-py3-none-any.whl", hash = "sha256:238aa6a5855aae389648880fb42aa4105a658368663de51d7bc1514890ee70ae"}, + {file = "voir-0.2.17.tar.gz", hash = "sha256:dcb1daa820ff8b12df8298d25d4f788d8580fe9eac3d80adad82aa157a4a1ac6"}, ] [package.dependencies] @@ -2190,4 +2190,4 @@ test = ["big-O", "importlib-resources", "jaraco.functools", "jaraco.itertools", [metadata] lock-version = "2.0" python-versions = ">=3.10,<4.0" -content-hash = "ac862de7cec59d54e21cdc3280c72810aa15b88da6e9b6e67fb03d7e89228177" +content-hash = "59901f6d97314b2a67cac2cf9c4300cb5bde2feba01b0198b20c8ac477adae05" diff --git a/pyproject.toml b/pyproject.toml index 6695d6440..6a1693bf6 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -10,7 +10,7 @@ authors = [ license = "MIT" [tool.poetry.dependencies] -voir = ">=0.2.14" +voir = ">=0.2.17" benchmate = {path = "benchmate", develop = false} python = ">=3.10,<4.0" giving = "^0.4.0" diff --git a/scripts/article/run_cuda_dev.sh b/scripts/article/run_cuda_dev.sh index 1144386c6..1a0ef0520 100644 --- a/scripts/article/run_cuda_dev.sh +++ b/scripts/article/run_cuda_dev.sh @@ -43,9 +43,9 @@ install_prepare() { fi if [ ! -d "$MILABENCH_WORDIR/voir" ]; then - - git clone https://github.com/Delaunay/voir.git -b patch-4 - git clone https://github.com/Delaunay/torchcompat.git + echo "" + # git clone https://github.com/Delaunay/voir.git + # git clone https://github.com/Delaunay/torchcompat.git fi . $MILABENCH_WORDIR/env/bin/activate @@ -59,20 +59,22 @@ install_prepare() { milabench install "$@" which pip - pip install -e $MILABENCH_WORDIR/voir - pip install -e $MILABENCH_WORDIR/torchcompat + # pip install -e $MILABENCH_WORDIR/voir + # pip install -e $MILABENCH_WORDIR/torchcompat ( . 
$BENCHMARK_VENV/bin/activate which pip - pip install -e $MILABENCH_WORDIR/voir - pip install -e $MILABENCH_WORDIR/torchcompat - pip install torch torchvision torchaudio + #pip install -e $MILABENCH_WORDIR/voir + # pip install -e $MILABENCH_WORDIR/torchcompat + # pip install torch torchvision torchaudio + + # pip install fvcore xFormers # DALI stuff - pip install --extra-index-url https://pypi.nvidia.com --upgrade nvidia-dali-cuda120 - pip install nvidia-pyindex - pip install nvidia-nvjpeg-cu12 + # pip install --extra-index-url https://pypi.nvidia.com --upgrade nvidia-dali-cuda120 + # pip install nvidia-pyindex + # pip install nvidia-nvjpeg-cu12 ) # @@ -82,28 +84,16 @@ install_prepare() { module load cuda/12.3.2 -if [ ! -d "$MILABENCH_WORDIR" ]; then +if [ ! -d "$MILABENCH_WORDIR/results/venv/torch" ]; then install_prepare else echo "Reusing previous install" . $MILABENCH_WORDIR/env/bin/activate fi - - -( - . $MILABENCH_WORDIR/env/bin/activate - pip install -e $MILABENCH_WORDIR/voir - pip install -e $MILABENCH_SOURCE/benchmate - - . $BENCHMARK_VENV/bin/activate - pip install -e $MILABENCH_WORDIR/voir - pip install -e $MILABENCH_SOURCE/benchmate -) - if [ "$MILABENCH_PREPARE" -eq 0 ]; then cd $MILABENCH_WORDIR - + # # Run the benchmarks milabench run "$@" diff --git a/tests/test_command_reg.py b/tests/test_command_reg.py index 2a5699b4c..ece2abae1 100644 --- a/tests/test_command_reg.py +++ b/tests/test_command_reg.py @@ -27,9 +27,9 @@ def test_command_reg_one_node(set_reg_env, tmp_path, capsys, file_regression): args.ngpu = 8 args.capacity = 80000 args.nnodes = 1 + args.ncpu = 4 cli_dry(args) - compare(str(tmp_path), capsys, file_regression) @@ -38,6 +38,7 @@ def test_command_reg_two_nodes(set_reg_env, tmp_path, capsys, file_regression): args.ngpu = 8 args.capacity = 80000 args.nnodes = 2 + args.ncpu = 4 cli_dry(args) diff --git a/tests/test_command_reg/test_command_reg_one_node.txt b/tests/test_command_reg/test_command_reg_one_node.txt index 35f198150..0be96ab18 100644 --- a/tests/test_command_reg/test_command_reg_one_node.txt +++ b/tests/test_command_reg/test_command_reg_one_node.txt @@ -15,8 +15,8 @@ export MILABENCH_DIR_DATA=$BASE/data export MILABENCH_DIR_RUNS=$BASE/runs export MILABENCH_DIR_EXTRA=$BASE/extra/llm export MILABENCH_DIR_CACHE=$BASE/cache +export OMP_NUM_THREADS=4 export MILABENCH_CONFIG='{"system": {"arch": "cuda", "sshkey": null, "nodes": [{"ip": "127.0.0.1", "main": true, "name": "0", "port": 22, "user": "username", "hostname": "127.0.0.1", "aliaslist": [], "ipaddrlist": ["127.0.0.1"], "local": true}], "self": {"ip": "127.0.0.1", "main": true, "name": "0", "port": 22, "user": "username", "hostname": "127.0.0.1", "aliaslist": [], "ipaddrlist": ["127.0.0.1"], "local": true}}, "dirs": {"base": "$BASE", "venv": "$BASE/venv/torch", "data": "$BASE/data", "runs": "$BASE/runs", "extra": "$BASE/extra/llm", "cache": "$BASE/cache"}, "group": "llm", "install_group": "torch", "install_variant": "cuda", "run_name": "dev", "enabled": true, "capabilities": {"nodes": 1}, "max_duration": 800, "voir": {"options": {"stop": 30, "interval": "1s"}}, "validation": {"usage": {"gpu_load_threshold": 0.5, "gpu_mem_threshold": 0.5}}, "config_base": "$SRC/milabench/config", "config_file": "$SRC/milabench/config/standard.yaml", "definition": "$SRC/milabench/benchmarks/llama", "tags": ["inference", "llm", "nlp"], "plan": {"method": "per_gpu"}, "weight": 1.0, "name": "llama", "tag": ["llama"]}' -export OMP_NUM_THREADS=8 echo "---" echo "llama" @@ -124,8 +124,8 @@ time ( ) echo "---" -echo
"resnet152-ddp" -echo "=============" +echo "resnet152-ddp-gpus" +echo "==================" time ( $SRC/milabench/benchmarks/torchvision_ddp/activator $BASE/venv/torch $SRC/milabench/benchmarks/torchvision_ddp/main.py --epochs 10 --num-workers 8 --loader torch --data $BASE/data/FakeImageNet --model resnet152 --batch-size 256 & wait @@ -327,10 +327,10 @@ time ( ) echo "---" -echo "resnet152-multi" -echo "===============" +echo "resnet152-gpus" +echo "==============" time ( - $BASE/venv/torch/bin/benchrun --nproc-per-node=8 -m python $SRC/milabench/benchmarks/timm/pytorch-image-models/train.py --amp --amp-dtype bfloat16 --device cuda --val-split --data-dir $BASE/data --dataset FakeImageNet --workers 8 --model resnet152 --batch-size 256 --output $BASE/extra/timm/dev/resnet152-multi.0 --checkpoint-hist 1 & + $BASE/venv/torch/bin/benchrun --nproc-per-node=8 -m python $SRC/milabench/benchmarks/timm/pytorch-image-models/train.py --amp --amp-dtype bfloat16 --device cuda --val-split --data-dir $BASE/data --dataset FakeImageNet --workers 8 --model resnet152 --batch-size 256 --output $BASE/extra/timm/dev/resnet152-gpus.0 --checkpoint-hist 1 & wait ) @@ -350,10 +350,10 @@ time ( ) echo "---" -echo "davit_large-multi" -echo "=================" +echo "davit_large-gpus" +echo "================" time ( - $BASE/venv/torch/bin/benchrun --nproc-per-node=8 -m python $SRC/milabench/benchmarks/timm/pytorch-image-models/train.py --amp --amp-dtype bfloat16 --device cuda --val-split --data-dir $BASE/data --dataset FakeImageNet --workers 8 --model davit_large --batch-size 128 --lr-base 0.01 --output $BASE/extra/timm/dev/davit_large-multi.0 --checkpoint-hist 1 & + $BASE/venv/torch/bin/benchrun --nproc-per-node=8 -m python $SRC/milabench/benchmarks/timm/pytorch-image-models/train.py --amp --amp-dtype bfloat16 --device cuda --val-split --data-dir $BASE/data --dataset FakeImageNet --workers 8 --model davit_large --batch-size 128 --lr-base 0.01 --output $BASE/extra/timm/dev/davit_large-gpus.0 --checkpoint-hist 1 & wait ) @@ -373,32 +373,32 @@ time ( ) echo "---" -echo "opt-1_3b" -echo "========" +echo "opt-1_3b-gpus" +echo "=============" time ( $SRC/milabench/milabench/scripts/activator $BASE/venv/torch accelerate launch --mixed_precision=bf16 --dynamo_backend=no --machine_rank=0 --num_machines=1 --multi_gpu --gradient_accumulation_steps=1 --num_cpu_threads_per_process=8 --main_process_ip=127.0.0.1 --main_process_port=22 --num_processes=8 $SRC/milabench/benchmarks/accelerate_opt/main.py --max_train_steps 100 --dataset_name wikitext --dataset_config_name wikitext-103-v1 --dataset_rev b08601e --validation_split_percentage 5 --per_gpu_batch_size 1 --cpus_per_gpu 8 --cache $BASE/cache --model_name facebook/opt-1.3b & wait ) echo "---" -echo "opt-1_3b-multinode" -echo "==================" +echo "opt-1_3b-nodes" +echo "==============" time ( $SRC/milabench/milabench/scripts/activator $BASE/venv/torch accelerate launch --mixed_precision=bf16 --dynamo_backend=no --machine_rank=0 --num_machines=1 --multi_gpu --gradient_accumulation_steps=1 --num_cpu_threads_per_process=8 --main_process_ip=127.0.0.1 --main_process_port=22 --num_processes=8 $SRC/milabench/benchmarks/accelerate_opt/main.py --max_train_steps 100 --dataset_name wikitext --dataset_config_name wikitext-103-v1 --dataset_rev b08601e --validation_split_percentage 5 --per_gpu_batch_size 1 --cpus_per_gpu 8 --cache $BASE/cache --model_name facebook/opt-1.3b & wait ) echo "---" -echo "opt-6_7b" -echo "========" +echo "opt-6_7b-gpus" +echo "=============" time ( 
$SRC/milabench/milabench/scripts/activator $BASE/venv/torch accelerate launch --mixed_precision=bf16 --dynamo_backend=no --machine_rank=0 --num_machines=1 --use_deepspeed --deepspeed_multinode_launcher=standard --zero_stage=2 --gradient_accumulation_steps=1 --num_cpu_threads_per_process=8 --main_process_ip=127.0.0.1 --main_process_port=22 --num_processes=8 $SRC/milabench/benchmarks/accelerate_opt/main.py --max_train_steps 100 --dataset_name wikitext --dataset_config_name wikitext-103-v1 --dataset_rev b08601e --validation_split_percentage 5 --per_gpu_batch_size 1 --cpus_per_gpu 8 --cache $BASE/cache --model_name facebook/opt-6.7b & wait ) echo "---" -echo "opt-6_7b-multinode" -echo "==================" +echo "opt-6_7b-nodes" +echo "==============" time ( $SRC/milabench/milabench/scripts/activator $BASE/venv/torch accelerate launch --mixed_precision=bf16 --dynamo_backend=no --machine_rank=0 --num_machines=1 --use_deepspeed --deepspeed_multinode_launcher=standard --zero_stage=2 --gradient_accumulation_steps=1 --num_cpu_threads_per_process=8 --main_process_ip=127.0.0.1 --main_process_port=22 --num_processes=8 $SRC/milabench/benchmarks/accelerate_opt/main.py --max_train_steps 100 --dataset_name wikitext --dataset_config_name wikitext-103-v1 --dataset_rev b08601e --validation_split_percentage 5 --per_gpu_batch_size 1 --cpus_per_gpu 8 --cache $BASE/cache --model_name facebook/opt-6.7b & wait @@ -454,7 +454,7 @@ echo "---" echo "diffusion-gpus" echo "==============" time ( - $SRC/milabench/milabench/scripts/activator $BASE/venv/torch accelerate launch --mixed_precision=bf16 --dynamo_backend=no --machine_rank=0 --num_machines=1 --multi_gpu --gradient_accumulation_steps=1 --num_cpu_threads_per_process=4 --main_process_ip=127.0.0.1 --main_process_port=22 --num_processes=8 $SRC/milabench/benchmarks/diffusion/main.py --train_batch_size 32 --num_epochs 5 & + $SRC/milabench/milabench/scripts/activator $BASE/venv/torch accelerate launch --mixed_precision=bf16 --dynamo_backend=no --machine_rank=0 --num_machines=1 --multi_gpu --gradient_accumulation_steps=1 --num_cpu_threads_per_process=4 --main_process_ip=127.0.0.1 --main_process_port=22 --num_processes=8 $SRC/milabench/benchmarks/diffusion/main.py --num_epochs 5 --batch_size 32 --num_workers 8 & wait ) @@ -462,14 +462,14 @@ echo "---" echo "lightning" echo "=========" time ( - CUDA_VISIBLE_DEVICES=0 python $SRC/milabench/benchmarks/lightning/main.py --epochs 10 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model resnet152 & - CUDA_VISIBLE_DEVICES=1 python $SRC/milabench/benchmarks/lightning/main.py --epochs 10 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model resnet152 & - CUDA_VISIBLE_DEVICES=2 python $SRC/milabench/benchmarks/lightning/main.py --epochs 10 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model resnet152 & - CUDA_VISIBLE_DEVICES=3 python $SRC/milabench/benchmarks/lightning/main.py --epochs 10 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model resnet152 & - CUDA_VISIBLE_DEVICES=4 python $SRC/milabench/benchmarks/lightning/main.py --epochs 10 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model resnet152 & - CUDA_VISIBLE_DEVICES=5 python $SRC/milabench/benchmarks/lightning/main.py --epochs 10 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model resnet152 & - CUDA_VISIBLE_DEVICES=6 python $SRC/milabench/benchmarks/lightning/main.py --epochs 10 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model resnet152 & 
- CUDA_VISIBLE_DEVICES=7 python $SRC/milabench/benchmarks/lightning/main.py --epochs 10 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model resnet152 & + CUDA_VISIBLE_DEVICES=0 python $SRC/milabench/benchmarks/lightning/main.py --epochs 10 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model resnet152 --batch-size 16 & + CUDA_VISIBLE_DEVICES=1 python $SRC/milabench/benchmarks/lightning/main.py --epochs 10 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model resnet152 --batch-size 16 & + CUDA_VISIBLE_DEVICES=2 python $SRC/milabench/benchmarks/lightning/main.py --epochs 10 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model resnet152 --batch-size 16 & + CUDA_VISIBLE_DEVICES=3 python $SRC/milabench/benchmarks/lightning/main.py --epochs 10 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model resnet152 --batch-size 16 & + CUDA_VISIBLE_DEVICES=4 python $SRC/milabench/benchmarks/lightning/main.py --epochs 10 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model resnet152 --batch-size 16 & + CUDA_VISIBLE_DEVICES=5 python $SRC/milabench/benchmarks/lightning/main.py --epochs 10 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model resnet152 --batch-size 16 & + CUDA_VISIBLE_DEVICES=6 python $SRC/milabench/benchmarks/lightning/main.py --epochs 10 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model resnet152 --batch-size 16 & + CUDA_VISIBLE_DEVICES=7 python $SRC/milabench/benchmarks/lightning/main.py --epochs 10 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model resnet152 --batch-size 16 & wait ) @@ -477,7 +477,15 @@ echo "---" echo "lightning-gpus" echo "==============" time ( - $BASE/venv/torch/bin/benchrun --nnodes=1 --rdzv-backend=c10d --rdzv-endpoint=127.0.0.1:29400 --master-addr=127.0.0.1 --master-port=29400 --nproc-per-node=8 --no-python -- python $SRC/milabench/benchmarks/lightning/main.py --epochs 10 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model resnet152 & + $BASE/venv/torch/bin/benchrun --nnodes=1 --rdzv-backend=c10d --rdzv-endpoint=127.0.0.1:29400 --master-addr=127.0.0.1 --master-port=29400 --nproc-per-node=8 --no-python -- python $SRC/milabench/benchmarks/lightning/main.py --epochs 10 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model resnet152 --batch-size 16 & + wait +) + +echo "---" +echo "dinov2-giant-gpus" +echo "=================" +time ( + $BASE/venv/torch/bin/benchrun --nnodes=1 --rdzv-backend=c10d --rdzv-endpoint=127.0.0.1:29400 --master-addr=127.0.0.1 --master-port=29400 --nproc-per-node=8 --no-python -- python $SRC/milabench/benchmarks/dinov2/main.py --output-dir $BASE/extra/dinov2-giant-gpus/output --no-resume --config-file src/dinov2/configs/train/vitg14.yaml train.dataset_path=ImageNet:split=TRAIN:root=$BASE/data/FakeImageNet:extra=$BASE/data/FakeImageNet train.batch_size_per_gpu=32 train.saveckp_freq=100 train.num_workers=10 & wait ) diff --git a/tests/test_command_reg/test_command_reg_two_nodes.txt b/tests/test_command_reg/test_command_reg_two_nodes.txt index 2817f77f8..6f51b4e93 100644 --- a/tests/test_command_reg/test_command_reg_two_nodes.txt +++ b/tests/test_command_reg/test_command_reg_two_nodes.txt @@ -15,8 +15,8 @@ export MILABENCH_DIR_DATA=$BASE/data export MILABENCH_DIR_RUNS=$BASE/runs export MILABENCH_DIR_EXTRA=$BASE/extra/llm export MILABENCH_DIR_CACHE=$BASE/cache +export OMP_NUM_THREADS=4 export MILABENCH_CONFIG='{"system": {"arch": "cuda", 
"sshkey": null, "nodes": [{"ip": "127.0.0.1", "main": true, "name": "0", "port": 22, "user": "username", "hostname": "127.0.0.1", "aliaslist": [], "ipaddrlist": ["127.0.0.1"], "local": true}, {"ip": "192.168.0.11", "main": false, "name": "1", "port": 22, "user": "username", "hostname": "192.168.0.11", "aliaslist": [], "ipaddrlist": ["192.168.0.11"], "local": false}], "self": {"ip": "127.0.0.1", "main": true, "name": "0", "port": 22, "user": "username", "hostname": "127.0.0.1", "aliaslist": [], "ipaddrlist": ["127.0.0.1"], "local": true}}, "dirs": {"base": "$BASE", "venv": "$BASE/venv/torch", "data": "$BASE/data", "runs": "$BASE/runs", "extra": "$BASE/extra/llm", "cache": "$BASE/cache"}, "group": "llm", "install_group": "torch", "install_variant": "cuda", "run_name": "dev", "enabled": true, "capabilities": {"nodes": 1}, "max_duration": 800, "voir": {"options": {"stop": 30, "interval": "1s"}}, "validation": {"usage": {"gpu_load_threshold": 0.5, "gpu_mem_threshold": 0.5}}, "config_base": "$SRC/milabench/config", "config_file": "$SRC/milabench/config/standard.yaml", "definition": "$SRC/milabench/benchmarks/llama", "tags": ["inference", "llm", "nlp"], "plan": {"method": "per_gpu"}, "weight": 1.0, "name": "llama", "tag": ["llama"]}' -export OMP_NUM_THREADS=8 echo "---" echo "llama" @@ -124,8 +124,8 @@ time ( ) echo "---" -echo "resnet152-ddp" -echo "=============" +echo "resnet152-ddp-gpus" +echo "==================" time ( $SRC/milabench/benchmarks/torchvision_ddp/activator $BASE/venv/torch $SRC/milabench/benchmarks/torchvision_ddp/main.py --epochs 10 --num-workers 8 --loader torch --data $BASE/data/FakeImageNet --model resnet152 --batch-size 256 & wait @@ -327,10 +327,10 @@ time ( ) echo "---" -echo "resnet152-multi" -echo "===============" +echo "resnet152-gpus" +echo "==============" time ( - $BASE/venv/torch/bin/benchrun --nproc-per-node=8 -m python $SRC/milabench/benchmarks/timm/pytorch-image-models/train.py --amp --amp-dtype bfloat16 --device cuda --val-split --data-dir $BASE/data --dataset FakeImageNet --workers 8 --model resnet152 --batch-size 256 --output $BASE/extra/timm/dev/resnet152-multi.0 --checkpoint-hist 1 & + $BASE/venv/torch/bin/benchrun --nproc-per-node=8 -m python $SRC/milabench/benchmarks/timm/pytorch-image-models/train.py --amp --amp-dtype bfloat16 --device cuda --val-split --data-dir $BASE/data --dataset FakeImageNet --workers 8 --model resnet152 --batch-size 256 --output $BASE/extra/timm/dev/resnet152-gpus.0 --checkpoint-hist 1 & wait ) @@ -350,10 +350,10 @@ time ( ) echo "---" -echo "davit_large-multi" -echo "=================" +echo "davit_large-gpus" +echo "================" time ( - $BASE/venv/torch/bin/benchrun --nproc-per-node=8 -m python $SRC/milabench/benchmarks/timm/pytorch-image-models/train.py --amp --amp-dtype bfloat16 --device cuda --val-split --data-dir $BASE/data --dataset FakeImageNet --workers 8 --model davit_large --batch-size 128 --lr-base 0.01 --output $BASE/extra/timm/dev/davit_large-multi.0 --checkpoint-hist 1 & + $BASE/venv/torch/bin/benchrun --nproc-per-node=8 -m python $SRC/milabench/benchmarks/timm/pytorch-image-models/train.py --amp --amp-dtype bfloat16 --device cuda --val-split --data-dir $BASE/data --dataset FakeImageNet --workers 8 --model davit_large --batch-size 128 --lr-base 0.01 --output $BASE/extra/timm/dev/davit_large-gpus.0 --checkpoint-hist 1 & wait ) @@ -373,16 +373,16 @@ time ( ) echo "---" -echo "opt-1_3b" -echo "========" +echo "opt-1_3b-gpus" +echo "=============" time ( $SRC/milabench/milabench/scripts/activator 
$BASE/venv/torch accelerate launch --mixed_precision=bf16 --dynamo_backend=no --machine_rank=0 --num_machines=1 --multi_gpu --gradient_accumulation_steps=1 --num_cpu_threads_per_process=8 --main_process_ip=127.0.0.1 --main_process_port=22 --num_processes=8 $SRC/milabench/benchmarks/accelerate_opt/main.py --max_train_steps 100 --dataset_name wikitext --dataset_config_name wikitext-103-v1 --dataset_rev b08601e --validation_split_percentage 5 --per_gpu_batch_size 1 --cpus_per_gpu 8 --cache $BASE/cache --model_name facebook/opt-1.3b & wait ) echo "---" -echo "opt-1_3b-multinode" -echo "==================" +echo "opt-1_3b-nodes" +echo "==============" time ( $SRC/milabench/milabench/scripts/activator $BASE/venv/torch accelerate launch --mixed_precision=bf16 --dynamo_backend=no --machine_rank=0 --num_machines=2 --multi_gpu --gradient_accumulation_steps=1 --num_cpu_threads_per_process=8 --main_process_ip=127.0.0.1 --main_process_port=22 --num_processes=16 & ssh -oCheckHostIP=no -oStrictHostKeyChecking=no -oPasswordAuthentication=no -oPasswordAuthentication=no -p 22 username@192.168.0.11 $SRC/milabench/milabench/scripts/activator $BASE/venv/torch accelerate launch --mixed_precision=bf16 --dynamo_backend=no --machine_rank=1 --num_machines=2 --multi_gpu --gradient_accumulation_steps=1 --num_cpu_threads_per_process=8 --main_process_ip=127.0.0.1 --main_process_port=22 --num_processes=16 & @@ -390,16 +390,16 @@ time ( ) echo "---" -echo "opt-6_7b" -echo "========" +echo "opt-6_7b-gpus" +echo "=============" time ( $SRC/milabench/milabench/scripts/activator $BASE/venv/torch accelerate launch --mixed_precision=bf16 --dynamo_backend=no --machine_rank=0 --num_machines=1 --use_deepspeed --deepspeed_multinode_launcher=standard --zero_stage=2 --gradient_accumulation_steps=1 --num_cpu_threads_per_process=8 --main_process_ip=127.0.0.1 --main_process_port=22 --num_processes=8 $SRC/milabench/benchmarks/accelerate_opt/main.py --max_train_steps 100 --dataset_name wikitext --dataset_config_name wikitext-103-v1 --dataset_rev b08601e --validation_split_percentage 5 --per_gpu_batch_size 1 --cpus_per_gpu 8 --cache $BASE/cache --model_name facebook/opt-6.7b & wait ) echo "---" -echo "opt-6_7b-multinode" -echo "==================" +echo "opt-6_7b-nodes" +echo "==============" time ( $SRC/milabench/milabench/scripts/activator $BASE/venv/torch accelerate launch --mixed_precision=bf16 --dynamo_backend=no --machine_rank=0 --num_machines=2 --use_deepspeed --deepspeed_multinode_launcher=standard --zero_stage=2 --gradient_accumulation_steps=1 --num_cpu_threads_per_process=8 --main_process_ip=127.0.0.1 --main_process_port=22 --num_processes=16 & ssh -oCheckHostIP=no -oStrictHostKeyChecking=no -oPasswordAuthentication=no -oPasswordAuthentication=no -p 22 username@192.168.0.11 $SRC/milabench/milabench/scripts/activator $BASE/venv/torch accelerate launch --mixed_precision=bf16 --dynamo_backend=no --machine_rank=1 --num_machines=2 --use_deepspeed --deepspeed_multinode_launcher=standard --zero_stage=2 --gradient_accumulation_steps=1 --num_cpu_threads_per_process=8 --main_process_ip=127.0.0.1 --main_process_port=22 --num_processes=16 & @@ -456,7 +456,7 @@ echo "---" echo "diffusion-gpus" echo "==============" time ( - $SRC/milabench/milabench/scripts/activator $BASE/venv/torch accelerate launch --mixed_precision=bf16 --dynamo_backend=no --machine_rank=0 --num_machines=1 --multi_gpu --gradient_accumulation_steps=1 --num_cpu_threads_per_process=4 --main_process_ip=127.0.0.1 --main_process_port=22 --num_processes=8 
$SRC/milabench/benchmarks/diffusion/main.py --train_batch_size 32 --num_epochs 5 & + $SRC/milabench/milabench/scripts/activator $BASE/venv/torch accelerate launch --mixed_precision=bf16 --dynamo_backend=no --machine_rank=0 --num_machines=1 --multi_gpu --gradient_accumulation_steps=1 --num_cpu_threads_per_process=4 --main_process_ip=127.0.0.1 --main_process_port=22 --num_processes=8 $SRC/milabench/benchmarks/diffusion/main.py --num_epochs 5 --batch_size 32 --num_workers 8 & wait ) @@ -464,14 +464,14 @@ echo "---" echo "lightning" echo "=========" time ( - CUDA_VISIBLE_DEVICES=0 python $SRC/milabench/benchmarks/lightning/main.py --epochs 10 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model resnet152 & - CUDA_VISIBLE_DEVICES=1 python $SRC/milabench/benchmarks/lightning/main.py --epochs 10 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model resnet152 & - CUDA_VISIBLE_DEVICES=2 python $SRC/milabench/benchmarks/lightning/main.py --epochs 10 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model resnet152 & - CUDA_VISIBLE_DEVICES=3 python $SRC/milabench/benchmarks/lightning/main.py --epochs 10 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model resnet152 & - CUDA_VISIBLE_DEVICES=4 python $SRC/milabench/benchmarks/lightning/main.py --epochs 10 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model resnet152 & - CUDA_VISIBLE_DEVICES=5 python $SRC/milabench/benchmarks/lightning/main.py --epochs 10 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model resnet152 & - CUDA_VISIBLE_DEVICES=6 python $SRC/milabench/benchmarks/lightning/main.py --epochs 10 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model resnet152 & - CUDA_VISIBLE_DEVICES=7 python $SRC/milabench/benchmarks/lightning/main.py --epochs 10 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model resnet152 & + CUDA_VISIBLE_DEVICES=0 python $SRC/milabench/benchmarks/lightning/main.py --epochs 10 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model resnet152 --batch-size 16 & + CUDA_VISIBLE_DEVICES=1 python $SRC/milabench/benchmarks/lightning/main.py --epochs 10 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model resnet152 --batch-size 16 & + CUDA_VISIBLE_DEVICES=2 python $SRC/milabench/benchmarks/lightning/main.py --epochs 10 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model resnet152 --batch-size 16 & + CUDA_VISIBLE_DEVICES=3 python $SRC/milabench/benchmarks/lightning/main.py --epochs 10 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model resnet152 --batch-size 16 & + CUDA_VISIBLE_DEVICES=4 python $SRC/milabench/benchmarks/lightning/main.py --epochs 10 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model resnet152 --batch-size 16 & + CUDA_VISIBLE_DEVICES=5 python $SRC/milabench/benchmarks/lightning/main.py --epochs 10 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model resnet152 --batch-size 16 & + CUDA_VISIBLE_DEVICES=6 python $SRC/milabench/benchmarks/lightning/main.py --epochs 10 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model resnet152 --batch-size 16 & + CUDA_VISIBLE_DEVICES=7 python $SRC/milabench/benchmarks/lightning/main.py --epochs 10 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model resnet152 --batch-size 16 & wait ) @@ -479,7 +479,15 @@ echo "---" echo "lightning-gpus" echo "==============" time ( - 
$BASE/venv/torch/bin/benchrun --nnodes=1 --rdzv-backend=c10d --rdzv-endpoint=127.0.0.1:29400 --master-addr=127.0.0.1 --master-port=29400 --nproc-per-node=8 --no-python -- python $SRC/milabench/benchmarks/lightning/main.py --epochs 10 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model resnet152 & + $BASE/venv/torch/bin/benchrun --nnodes=1 --rdzv-backend=c10d --rdzv-endpoint=127.0.0.1:29400 --master-addr=127.0.0.1 --master-port=29400 --nproc-per-node=8 --no-python -- python $SRC/milabench/benchmarks/lightning/main.py --epochs 10 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model resnet152 --batch-size 16 & + wait +) + +echo "---" +echo "dinov2-giant-gpus" +echo "=================" +time ( + $BASE/venv/torch/bin/benchrun --nnodes=1 --rdzv-backend=c10d --rdzv-endpoint=127.0.0.1:29400 --master-addr=127.0.0.1 --master-port=29400 --nproc-per-node=8 --no-python -- python $SRC/milabench/benchmarks/dinov2/main.py --output-dir $BASE/extra/dinov2-giant-gpus/output --no-resume --config-file src/dinov2/configs/train/vitg14.yaml train.dataset_path=ImageNet:split=TRAIN:root=$BASE/data/FakeImageNet:extra=$BASE/data/FakeImageNet train.batch_size_per_gpu=32 train.saveckp_freq=100 train.num_workers=10 & wait ) diff --git a/tests/test_mock/test_milabench_bad_run.txt b/tests/test_mock/test_milabench_bad_run.txt index f30881ec1..e9b4ffa42 100644 --- a/tests/test_mock/test_milabench_bad_run.txt +++ b/tests/test_mock/test_milabench_bad_run.txt @@ -13,7 +13,7 @@ benchio.0 | Traceback (most recent call last): | File "$TMP/venv/benchio/bin/voir", line 8, in | sys.exit(main()) - | File "$TMP/venv/benchio/lib/python3.10/site-packages/voir/cli.py", line 124, in main + | File "$TMP/venv/benchio/lib/python3.10/site-packages/voir/cli.py", line 128, in main | ov(sys.argv[1:] if argv is None else argv) | File "$TMP/venv/benchio/lib/python3.10/site-packages/voir/phase.py", line 331, in __call__ | self._run(*args, **kwargs) @@ -35,7 +35,7 @@ benchio.1 | Traceback (most recent call last): | File "$TMP/venv/benchio/bin/voir", line 8, in | sys.exit(main()) - | File "$TMP/venv/benchio/lib/python3.10/site-packages/voir/cli.py", line 124, in main + | File "$TMP/venv/benchio/lib/python3.10/site-packages/voir/cli.py", line 128, in main | ov(sys.argv[1:] if argv is None else argv) | File "$TMP/venv/benchio/lib/python3.10/site-packages/voir/phase.py", line 331, in __call__ | self._run(*args, **kwargs)
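Across both regenerated expected scripts, OMP_NUM_THREADS is now exported once near the top, with a value derived from the tests' pinned args.ncpu = 4 rather than from the host's core count, and pack.make_env sets it per benchmark via resolve_placeholder("{cpu_per_gpu}"). The sourcing side of that computation, per the sizer.py hunk earlier (a sketch; the full resolver also clamps between cpu.min and cpu.max and accounts for the device count):

```python
import multiprocessing
import os


def total_cpu(reserved_cores: int = 0) -> int:
    """Core count used when resolving "{cpu_per_gpu}" for OMP_NUM_THREADS.

    MILABENCH_CPU_TOTAL_COUNT (set from the dry-run CLI's new --ncpu flag via
    CPUOptions.total_count) overrides detection, so regenerated regression
    scripts come out identical on any host.
    """
    override = int(os.getenv("MILABENCH_CPU_TOTAL_COUNT", "0"))
    return (override or multiprocessing.cpu_count()) - reserved_cores
```

The remaining hunks in tests/test_mock/test_milabench_bad_run.txt only track the voir 0.2.16 to 0.2.17 bump: the expected traceback's cli.py line number moved from 124 to 128.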