diff --git a/.pin/constraints-cuda-torch.txt b/.pin/constraints-cuda-torch.txt index 8efdeccc2..15343ce73 100644 --- a/.pin/constraints-cuda-torch.txt +++ b/.pin/constraints-cuda-torch.txt @@ -2,7 +2,7 @@ # This file is autogenerated by pip-compile with Python 3.10 # by the following command: # -# pip-compile --output-file=.pin/constraints-cuda-torch.txt .pin/tmp-constraints.txt benchmarks/accelerate_opt/requirements.in benchmarks/brax/requirements.in benchmarks/diffusion/requirements.in benchmarks/dlrm/requirements.in benchmarks/flops/requirements.in benchmarks/huggingface/requirements.in benchmarks/lightning/requirements.in benchmarks/llama/requirements.in benchmarks/stargan/requirements.in benchmarks/super-slomo/requirements.in benchmarks/timm/requirements.in benchmarks/torchvision/requirements.in benchmarks/torchvision_ddp/requirements.in +# pip-compile --output-file=.pin/constraints-cuda-torch.txt .pin/tmp-constraints.txt benchmarks/accelerate_opt/requirements.in benchmarks/brax/requirements.in benchmarks/diffusion/requirements.in benchmarks/dinov2/requirements.in benchmarks/dlrm/requirements.in benchmarks/flops/requirements.in benchmarks/huggingface/requirements.in benchmarks/lightning/requirements.in benchmarks/llama/requirements.in benchmarks/stargan/requirements.in benchmarks/super-slomo/requirements.in benchmarks/timm/requirements.in benchmarks/torchvision/requirements.in benchmarks/torchvision_ddp/requirements.in # --extra-index-url https://pypi.ngc.nvidia.com --extra-index-url https://download.pytorch.org/whl/cu121 @@ -58,7 +58,9 @@ chex==0.1.86 click==8.1.7 # via flask cloudpickle==3.0.0 - # via gym + # via + # gym + # submitit codefind==0.1.6 # via ptera contextlib2==21.6.0 @@ -137,6 +139,8 @@ fsspec[http]==2024.5.0 # torchx future==1.0.0 # via -r benchmarks/dlrm/requirements.in +fvcore==0.1.5.post20221221 + # via -r benchmarks/dinov2/requirements.in gdown==5.2.0 # via -r benchmarks/stargan/requirements.in giving==0.4.2 @@ -181,6 +185,10 @@ importlib-resources==6.4.0 # argklass # etils # torchcompat +iopath==0.1.10 + # via + # -r benchmarks/dinov2/requirements.in + # fvcore itsdangerous==2.2.0 # via flask jax[cuda12]==0.4.28 @@ -283,6 +291,7 @@ numpy==1.26.4 # fairscale # fbgemm-gpu # flax + # fvcore # gym # jax # jaxlib @@ -307,6 +316,7 @@ numpy==1.26.4 # torchvision # transformers # trimesh + # xformers nvidia-cublas-cu12==12.1.3.1 # via # jax @@ -358,7 +368,9 @@ nvidia-nvjitlink-cu12==12.5.82 nvidia-nvtx-cu12==12.1.105 # via torch omegaconf==2.3.0 - # via voir + # via + # -r benchmarks/dinov2/requirements.in + # voir onnx==1.16.1 # via -r benchmarks/dlrm/requirements.in opencv-python==4.10.0.84 @@ -397,7 +409,10 @@ pillow==10.4.0 # -r benchmarks/huggingface/requirements.in # brax # diffusers + # fvcore # torchvision +portalocker==2.10.1 + # via iopath protobuf==4.25.3 # via # onnx @@ -449,6 +464,7 @@ pyyaml==6.0.1 # accelerate # datasets # flax + # fvcore # huggingface-hub # lightning # ml-collections @@ -457,6 +473,7 @@ pyyaml==6.0.1 # pytorch-lightning # torchx # transformers + # yacs reactivex==4.0.4 # via giving regex==2024.5.15 @@ -487,6 +504,7 @@ scikit-learn==1.5.1 # via -r benchmarks/dlrm/requirements.in scipy==1.14.0 # via + # -r benchmarks/dinov2/requirements.in # brax # jax # jaxlib @@ -504,10 +522,14 @@ six==1.16.0 # tensorboard soupsieve==2.5 # via beautifulsoup4 +submitit==1.5.1 + # via -r benchmarks/dinov2/requirements.in sympy==1.13.1 # via torch tabulate==0.9.0 - # via torchx + # via + # fvcore + # torchx tensorboard==2.17.0 # via -r 
benchmarks/dlrm/requirements.in tensorboard-data-server==0.7.2 @@ -519,7 +541,9 @@ tensorstore==0.1.63 # flax # orbax-checkpoint termcolor==2.4.0 - # via fire + # via + # fire + # fvcore threadpoolctl==3.5.0 # via scikit-learn tokenizers==0.19.1 @@ -530,6 +554,7 @@ torch==2.3.1+cu121 # via # -r benchmarks/accelerate_opt/requirements.in # -r benchmarks/brax/requirements.in + # -r benchmarks/dinov2/requirements.in # -r benchmarks/dlrm/requirements.in # -r benchmarks/flops/requirements.in # -r benchmarks/huggingface/requirements.in @@ -550,6 +575,7 @@ torch==2.3.1+cu121 # torchmetrics # torchvision # torchviz + # xformers torchaudio==2.3.1+cu121 # via -r benchmarks/accelerate_opt/requirements.in torchcompat==1.1.4 @@ -561,6 +587,7 @@ torchcompat==1.1.4 # -r benchmarks/torchvision_ddp/requirements.in torchmetrics==1.0.3 # via + # -r benchmarks/dinov2/requirements.in # lightning # pytorch-lightning # torchrec @@ -570,6 +597,7 @@ torchvision==0.18.1+cu121 # via # -r benchmarks/accelerate_opt/requirements.in # -r benchmarks/diffusion/requirements.in + # -r benchmarks/dinov2/requirements.in # -r benchmarks/flops/requirements.in # -r benchmarks/lightning/requirements.in # -r benchmarks/stargan/requirements.in @@ -592,8 +620,10 @@ tqdm==4.66.4 # datasets # deepspeed # evaluate + # fvcore # gdown # huggingface-hub + # iopath # lightning # pytorch-lightning # torchrec @@ -617,6 +647,7 @@ typing-extensions==4.12.2 # etils # flax # huggingface-hub + # iopath # lightning # lightning-utilities # orbax-checkpoint @@ -625,6 +656,7 @@ typing-extensions==4.12.2 # pyre-extensions # pytorch-lightning # reactivex + # submitit # torch # typing-inspect typing-inspect==0.9.0 @@ -644,6 +676,7 @@ voir==0.2.17 # -r benchmarks/accelerate_opt/requirements.in # -r benchmarks/brax/requirements.in # -r benchmarks/diffusion/requirements.in + # -r benchmarks/dinov2/requirements.in # -r benchmarks/dlrm/requirements.in # -r benchmarks/flops/requirements.in # -r benchmarks/huggingface/requirements.in @@ -658,10 +691,14 @@ werkzeug==3.0.3 # via # flask # tensorboard +xformers==0.0.27 + # via -r benchmarks/dinov2/requirements.in xxhash==3.4.1 # via # datasets # evaluate +yacs==0.1.8 + # via fvcore yarl==1.9.4 # via aiohttp zipp==3.19.2 diff --git a/benchmarks/brax/requirements.cuda.txt b/benchmarks/dinov2/requirements.cuda.txt similarity index 53% rename from benchmarks/brax/requirements.cuda.txt rename to benchmarks/dinov2/requirements.cuda.txt index ea6216a23..a92790725 100644 --- a/benchmarks/brax/requirements.cuda.txt +++ b/benchmarks/dinov2/requirements.cuda.txt @@ -2,24 +2,13 @@ # This file is autogenerated by pip-compile with Python 3.10 # by the following command: # -# pip-compile --output-file=benchmarks/brax/requirements.cuda.txt .pin/tmp-constraints-cuda-brax.txt benchmarks/brax/requirements.in +# pip-compile --output-file=benchmarks/dinov2/requirements.cuda.txt .pin/tmp-constraints-cuda-dinov2-giant-gpus.txt benchmarks/dinov2/requirements.in # --extra-index-url https://pypi.ngc.nvidia.com --extra-index-url https://download.pytorch.org/whl/cu121 --find-links https://storage.googleapis.com/jax-releases/jax_cuda_releases.html --trusted-host pypi.ngc.nvidia.com -absl-py==2.1.0 - # via - # -c .pin/../.pin/constraints-cuda-torch.txt - # brax - # chex - # dm-env - # ml-collections - # mujoco - # mujoco-mjx - # optax - # orbax-checkpoint antlr4-python3-runtime==4.9.3 # via # -c .pin/../.pin/constraints-cuda-torch.txt @@ -28,50 +17,14 @@ asttokens==2.4.1 # via # -c .pin/../.pin/constraints-cuda-torch.txt # giving 
-blinker==1.8.2 - # via - # -c .pin/../.pin/constraints-cuda-torch.txt - # flask -brax==0.10.5 - # via - # -c .pin/../.pin/constraints-cuda-torch.txt - # -r benchmarks/brax/requirements.in -chex==0.1.86 - # via - # -c .pin/../.pin/constraints-cuda-torch.txt - # optax -click==8.1.7 - # via - # -c .pin/../.pin/constraints-cuda-torch.txt - # flask cloudpickle==3.0.0 # via # -c .pin/../.pin/constraints-cuda-torch.txt - # gym + # submitit codefind==0.1.6 # via # -c .pin/../.pin/constraints-cuda-torch.txt # ptera -contextlib2==21.6.0 - # via - # -c .pin/../.pin/constraints-cuda-torch.txt - # ml-collections -dm-env==1.6 - # via - # -c .pin/../.pin/constraints-cuda-torch.txt - # brax -dm-tree==0.1.8 - # via - # -c .pin/../.pin/constraints-cuda-torch.txt - # dm-env -etils[epath,epy]==1.7.0 - # via - # -c .pin/../.pin/constraints-cuda-torch.txt - # brax - # mujoco - # mujoco-mjx - # optax - # orbax-checkpoint executing==1.2.0 # via # -c .pin/../.pin/constraints-cuda-torch.txt @@ -81,92 +34,32 @@ filelock==3.15.4 # -c .pin/../.pin/constraints-cuda-torch.txt # torch # triton -flask==3.0.3 - # via - # -c .pin/../.pin/constraints-cuda-torch.txt - # brax - # flask-cors -flask-cors==4.0.1 - # via - # -c .pin/../.pin/constraints-cuda-torch.txt - # brax -flax==0.8.5 - # via - # -c .pin/../.pin/constraints-cuda-torch.txt - # brax fsspec==2024.5.0 # via # -c .pin/../.pin/constraints-cuda-torch.txt - # etils # torch +fvcore==0.1.5.post20221221 + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # -r benchmarks/dinov2/requirements.in giving==0.4.2 # via # -c .pin/../.pin/constraints-cuda-torch.txt # ptera # voir -glfw==2.7.0 - # via - # -c .pin/../.pin/constraints-cuda-torch.txt - # mujoco -grpcio==1.65.1 - # via - # -c .pin/../.pin/constraints-cuda-torch.txt - # brax -gym==0.26.2 - # via - # -c .pin/../.pin/constraints-cuda-torch.txt - # brax -gym-notices==0.0.8 - # via - # -c .pin/../.pin/constraints-cuda-torch.txt - # gym -importlib-resources==6.4.0 - # via - # -c .pin/../.pin/constraints-cuda-torch.txt - # etils -itsdangerous==2.2.0 - # via - # -c .pin/../.pin/constraints-cuda-torch.txt - # flask -jax[cuda12]==0.4.28 - # via - # -c .pin/../.pin/constraints-cuda-torch.txt - # -r benchmarks/brax/requirements.in - # brax - # chex - # flax - # jaxopt - # mujoco-mjx - # optax - # orbax-checkpoint -jax-cuda12-pjrt==0.4.28 +iopath==0.1.10 # via # -c .pin/../.pin/constraints-cuda-torch.txt - # jax-cuda12-plugin -jax-cuda12-plugin==0.4.28 - # via - # -c .pin/../.pin/constraints-cuda-torch.txt - # jax -jaxlib==0.4.28+cuda12.cudnn89 - # via - # -c .pin/../.pin/constraints-cuda-torch.txt - # brax - # chex - # jax - # jaxopt - # mujoco-mjx - # optax - # orbax-checkpoint -jaxopt==0.8.3 - # via - # -c .pin/../.pin/constraints-cuda-torch.txt - # brax + # -r benchmarks/dinov2/requirements.in + # fvcore jinja2==3.1.4 # via # -c .pin/../.pin/constraints-cuda-torch.txt - # brax - # flask # torch +lightning-utilities==0.11.5 + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # torchmetrics markdown-it-py==3.0.0 # via # -c .pin/../.pin/constraints-cuda-torch.txt @@ -175,43 +68,14 @@ markupsafe==2.1.5 # via # -c .pin/../.pin/constraints-cuda-torch.txt # jinja2 - # werkzeug mdurl==0.1.2 # via # -c .pin/../.pin/constraints-cuda-torch.txt # markdown-it-py -ml-collections==0.1.1 - # via - # -c .pin/../.pin/constraints-cuda-torch.txt - # brax -ml-dtypes==0.4.0 - # via - # -c .pin/../.pin/constraints-cuda-torch.txt - # jax - # jaxlib - # tensorstore mpmath==1.3.0 # via # -c .pin/../.pin/constraints-cuda-torch.txt # sympy 
-msgpack==1.0.8 - # via - # -c .pin/../.pin/constraints-cuda-torch.txt - # flax - # orbax-checkpoint -mujoco==3.2.0 - # via - # -c .pin/../.pin/constraints-cuda-torch.txt - # brax - # mujoco-mjx -mujoco-mjx==3.2.0 - # via - # -c .pin/../.pin/constraints-cuda-torch.txt - # brax -nest-asyncio==1.6.0 - # via - # -c .pin/../.pin/constraints-cuda-torch.txt - # orbax-checkpoint networkx==3.3 # via # -c .pin/../.pin/constraints-cuda-torch.txt @@ -219,39 +83,21 @@ networkx==3.3 numpy==1.26.4 # via # -c .pin/../.pin/constraints-cuda-torch.txt - # brax - # chex - # dm-env - # flax - # gym - # jax - # jaxlib - # jaxopt - # ml-dtypes - # mujoco - # opt-einsum - # optax - # orbax-checkpoint + # fvcore # scipy - # tensorboardx - # tensorstore - # trimesh + # torchmetrics + # torchvision + # xformers nvidia-cublas-cu12==12.1.3.1 # via # -c .pin/../.pin/constraints-cuda-torch.txt - # jax # nvidia-cudnn-cu12 # nvidia-cusolver-cu12 # torch nvidia-cuda-cupti-cu12==12.1.105 # via # -c .pin/../.pin/constraints-cuda-torch.txt - # jax # torch -nvidia-cuda-nvcc-cu12==12.5.82 - # via - # -c .pin/../.pin/constraints-cuda-torch.txt - # jax nvidia-cuda-nvrtc-cu12==12.1.105 # via # -c .pin/../.pin/constraints-cuda-torch.txt @@ -259,17 +105,14 @@ nvidia-cuda-nvrtc-cu12==12.1.105 nvidia-cuda-runtime-cu12==12.1.105 # via # -c .pin/../.pin/constraints-cuda-torch.txt - # jax # torch nvidia-cudnn-cu12==8.9.2.26 # via # -c .pin/../.pin/constraints-cuda-torch.txt - # jax # torch nvidia-cufft-cu12==11.0.2.54 # via # -c .pin/../.pin/constraints-cuda-torch.txt - # jax # torch nvidia-curand-cu12==10.3.2.106 # via @@ -278,23 +121,19 @@ nvidia-curand-cu12==10.3.2.106 nvidia-cusolver-cu12==11.4.5.107 # via # -c .pin/../.pin/constraints-cuda-torch.txt - # jax # torch nvidia-cusparse-cu12==12.1.0.106 # via # -c .pin/../.pin/constraints-cuda-torch.txt - # jax # nvidia-cusolver-cu12 # torch nvidia-nccl-cu12==2.20.5 # via # -c .pin/../.pin/constraints-cuda-torch.txt - # jax # torch nvidia-nvjitlink-cu12==12.5.82 # via # -c .pin/../.pin/constraints-cuda-torch.txt - # jax # nvidia-cusolver-cu12 # nvidia-cusparse-cu12 nvidia-nvtx-cu12==12.1.105 @@ -304,21 +143,8 @@ nvidia-nvtx-cu12==12.1.105 omegaconf==2.3.0 # via # -c .pin/../.pin/constraints-cuda-torch.txt + # -r benchmarks/dinov2/requirements.in # voir -opt-einsum==3.3.0 - # via - # -c .pin/../.pin/constraints-cuda-torch.txt - # jax -optax==0.2.3 - # via - # -c .pin/../.pin/constraints-cuda-torch.txt - # brax - # flax -orbax-checkpoint==0.5.22 - # via - # -c .pin/../.pin/constraints-cuda-torch.txt - # brax - # flax ovld==0.3.5 # via # -c .pin/../.pin/constraints-cuda-torch.txt @@ -326,16 +152,17 @@ ovld==0.3.5 packaging==24.1 # via # -c .pin/../.pin/constraints-cuda-torch.txt - # tensorboardx + # lightning-utilities + # torchmetrics pillow==10.4.0 # via # -c .pin/../.pin/constraints-cuda-torch.txt - # brax -protobuf==4.25.3 + # fvcore + # torchvision +portalocker==2.10.1 # via # -c .pin/../.pin/constraints-cuda-torch.txt - # orbax-checkpoint - # tensorboardx + # iopath psutil==5.9.8 # via # -c .pin/../.pin/constraints-cuda-torch.txt @@ -352,21 +179,12 @@ pynvml==11.5.3 # via # -c .pin/../.pin/constraints-cuda-torch.txt # voir -pyopengl==3.1.7 - # via - # -c .pin/../.pin/constraints-cuda-torch.txt - # mujoco -pytinyrenderer==0.0.14 - # via - # -c .pin/../.pin/constraints-cuda-torch.txt - # brax pyyaml==6.0.1 # via # -c .pin/../.pin/constraints-cuda-torch.txt - # flax - # ml-collections + # fvcore # omegaconf - # orbax-checkpoint + # yacs reactivex==4.0.4 # via # -c 
.pin/../.pin/constraints-cuda-torch.txt @@ -374,47 +192,51 @@ reactivex==4.0.4 rich==13.7.1 # via # -c .pin/../.pin/constraints-cuda-torch.txt - # flax # voir scipy==1.14.0 # via # -c .pin/../.pin/constraints-cuda-torch.txt - # brax - # jax - # jaxlib - # jaxopt - # mujoco-mjx + # -r benchmarks/dinov2/requirements.in six==1.16.0 # via # -c .pin/../.pin/constraints-cuda-torch.txt # asttokens - # ml-collections +submitit==1.5.1 + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # -r benchmarks/dinov2/requirements.in sympy==1.13.1 # via # -c .pin/../.pin/constraints-cuda-torch.txt # torch -tensorboardx==2.6.2.2 +tabulate==0.9.0 # via # -c .pin/../.pin/constraints-cuda-torch.txt - # brax -tensorstore==0.1.63 + # fvcore +termcolor==2.4.0 # via # -c .pin/../.pin/constraints-cuda-torch.txt - # flax - # orbax-checkpoint -toolz==0.12.1 + # fvcore +torch==2.3.1+cu121 # via # -c .pin/../.pin/constraints-cuda-torch.txt - # chex -torch==2.3.1+cu121 + # -r benchmarks/dinov2/requirements.in + # torchmetrics + # torchvision + # xformers +torchmetrics==1.0.3 + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # -r benchmarks/dinov2/requirements.in +torchvision==0.18.1+cu121 # via # -c .pin/../.pin/constraints-cuda-torch.txt - # -r benchmarks/brax/requirements.in -trimesh==4.4.3 + # -r benchmarks/dinov2/requirements.in +tqdm==4.66.4 # via # -c .pin/../.pin/constraints-cuda-torch.txt - # brax - # mujoco-mjx + # fvcore + # iopath triton==2.3.1 # via # -c .pin/../.pin/constraints-cuda-torch.txt @@ -422,12 +244,10 @@ triton==2.3.1 typing-extensions==4.12.2 # via # -c .pin/../.pin/constraints-cuda-torch.txt - # brax - # chex - # etils - # flax - # orbax-checkpoint + # iopath + # lightning-utilities # reactivex + # submitit # torch varname==0.10.0 # via @@ -437,12 +257,15 @@ voir==0.2.17 # via # -c .pin/../.pin/constraints-cuda-torch.txt # -c .pin/../constraints/cuda.txt - # -r benchmarks/brax/requirements.in -werkzeug==3.0.3 + # -r benchmarks/dinov2/requirements.in +xformers==0.0.27 # via # -c .pin/../.pin/constraints-cuda-torch.txt - # flask -zipp==3.19.2 + # -r benchmarks/dinov2/requirements.in +yacs==0.1.8 # via # -c .pin/../.pin/constraints-cuda-torch.txt - # etils + # fvcore + +# The following packages are considered to be unsafe in a requirements file: +# setuptools diff --git a/benchmarks/huggingface/prepare.py b/benchmarks/huggingface/prepare.py index d1bdaf280..1f5f80850 100755 --- a/benchmarks/huggingface/prepare.py +++ b/benchmarks/huggingface/prepare.py @@ -7,7 +7,7 @@ args = parser().parse_args() print(f"Preparing {args.model}") make_config = models[args.model] - make_config() + make_config(args) # bert dataset # t5 dataset diff --git a/benchmate/benchmate/datagen.py b/benchmate/benchmate/datagen.py index daf0ed075..a7a753099 100644 --- a/benchmate/benchmate/datagen.py +++ b/benchmate/benchmate/datagen.py @@ -97,6 +97,8 @@ def fakeimagenet_args(): parser.add_argument("--val", default=0.1, type=float, nargs="+") parser.add_argument("--test", default=0.1, type=float, nargs="+") args, _ = parser.parse_known_args() + return args + def generate_fakeimagenet(args=None): # config = json.loads(os.environ["MILABENCH_CONFIG"]) diff --git a/milabench/_version.py b/milabench/_version.py index eddbdfb72..d8ae9287b 100644 --- a/milabench/_version.py +++ b/milabench/_version.py @@ -1,5 +1,5 @@ """This file is generated, do not modify""" -__tag__ = "v0.1.0-23-gb9954e68" -__commit__ = "b9954e68e71a29fff2e7b16d8bcfaf7646629992" -__date__ = "2024-07-25 12:06:00 -0400" +__tag__ = "v0.1.0-30-g64aa548b" 
+__commit__ = "64aa548ba07d3c6bb298e435b8ac43c69eb75738" +__date__ = "2024-07-26 13:07:25 -0400" diff --git a/scripts/article/run_cuda_dev.sh b/scripts/article/run_cuda_dev.sh index 5e69f88cb..7651864e8 100644 --- a/scripts/article/run_cuda_dev.sh +++ b/scripts/article/run_cuda_dev.sh @@ -51,7 +51,7 @@ install_prepare() { . $MILABENCH_WORDIR/env/bin/activate pip install -e $MILABENCH_SOURCE - # milabench pin --variant cuda "$@" + milabench pin --variant cuda "$@" # # Install milabench's benchmarks in their venv @@ -69,7 +69,7 @@ install_prepare() { # pip install -e $MILABENCH_WORDIR/torchcompat # pip install torch torchvision torchaudio - pip install fvcore xFormers + # pip install fvcore xFormers # DALI stuff # pip install --extra-index-url https://pypi.nvidia.com --upgrade nvidia-dali-cuda120 diff --git a/tests/test_command_reg/test_command_reg_one_node.txt b/tests/test_command_reg/test_command_reg_one_node.txt index 35f198150..95d1d5f4c 100644 --- a/tests/test_command_reg/test_command_reg_one_node.txt +++ b/tests/test_command_reg/test_command_reg_one_node.txt @@ -15,8 +15,8 @@ export MILABENCH_DIR_DATA=$BASE/data export MILABENCH_DIR_RUNS=$BASE/runs export MILABENCH_DIR_EXTRA=$BASE/extra/llm export MILABENCH_DIR_CACHE=$BASE/cache +export OMP_NUM_THREADS=128 export MILABENCH_CONFIG='{"system": {"arch": "cuda", "sshkey": null, "nodes": [{"ip": "127.0.0.1", "main": true, "name": "0", "port": 22, "user": "username", "hostname": "127.0.0.1", "aliaslist": [], "ipaddrlist": ["127.0.0.1"], "local": true}], "self": {"ip": "127.0.0.1", "main": true, "name": "0", "port": 22, "user": "username", "hostname": "127.0.0.1", "aliaslist": [], "ipaddrlist": ["127.0.0.1"], "local": true}}, "dirs": {"base": "$BASE", "venv": "$BASE/venv/torch", "data": "$BASE/data", "runs": "$BASE/runs", "extra": "$BASE/extra/llm", "cache": "$BASE/cache"}, "group": "llm", "install_group": "torch", "install_variant": "cuda", "run_name": "dev", "enabled": true, "capabilities": {"nodes": 1}, "max_duration": 800, "voir": {"options": {"stop": 30, "interval": "1s"}}, "validation": {"usage": {"gpu_load_threshold": 0.5, "gpu_mem_threshold": 0.5}}, "config_base": "$SRC/milabench/config", "config_file": "$SRC/milabench/config/standard.yaml", "definition": "$SRC/milabench/benchmarks/llama", "tags": ["inference", "llm", "nlp"], "plan": {"method": "per_gpu"}, "weight": 1.0, "name": "llama", "tag": ["llama"]}' -export OMP_NUM_THREADS=8 echo "---" echo "llama" @@ -124,8 +124,8 @@ time ( ) echo "---" -echo "resnet152-ddp" -echo "=============" +echo "resnet152-ddp-gpus" +echo "==================" time ( $SRC/milabench/benchmarks/torchvision_ddp/activator $BASE/venv/torch $SRC/milabench/benchmarks/torchvision_ddp/main.py --epochs 10 --num-workers 8 --loader torch --data $BASE/data/FakeImageNet --model resnet152 --batch-size 256 & wait @@ -327,10 +327,10 @@ time ( ) echo "---" -echo "resnet152-multi" -echo "===============" +echo "resnet152-gpus" +echo "==============" time ( - $BASE/venv/torch/bin/benchrun --nproc-per-node=8 -m python $SRC/milabench/benchmarks/timm/pytorch-image-models/train.py --amp --amp-dtype bfloat16 --device cuda --val-split --data-dir $BASE/data --dataset FakeImageNet --workers 8 --model resnet152 --batch-size 256 --output $BASE/extra/timm/dev/resnet152-multi.0 --checkpoint-hist 1 & + $BASE/venv/torch/bin/benchrun --nproc-per-node=8 -m python $SRC/milabench/benchmarks/timm/pytorch-image-models/train.py --amp --amp-dtype bfloat16 --device cuda --val-split --data-dir $BASE/data --dataset FakeImageNet --workers 8 --model 
resnet152 --batch-size 256 --output $BASE/extra/timm/dev/resnet152-gpus.0 --checkpoint-hist 1 & wait ) @@ -350,10 +350,10 @@ time ( ) echo "---" -echo "davit_large-multi" -echo "=================" +echo "davit_large-gpus" +echo "================" time ( - $BASE/venv/torch/bin/benchrun --nproc-per-node=8 -m python $SRC/milabench/benchmarks/timm/pytorch-image-models/train.py --amp --amp-dtype bfloat16 --device cuda --val-split --data-dir $BASE/data --dataset FakeImageNet --workers 8 --model davit_large --batch-size 128 --lr-base 0.01 --output $BASE/extra/timm/dev/davit_large-multi.0 --checkpoint-hist 1 & + $BASE/venv/torch/bin/benchrun --nproc-per-node=8 -m python $SRC/milabench/benchmarks/timm/pytorch-image-models/train.py --amp --amp-dtype bfloat16 --device cuda --val-split --data-dir $BASE/data --dataset FakeImageNet --workers 8 --model davit_large --batch-size 128 --lr-base 0.01 --output $BASE/extra/timm/dev/davit_large-gpus.0 --checkpoint-hist 1 & wait ) @@ -373,32 +373,32 @@ time ( ) echo "---" -echo "opt-1_3b" -echo "========" +echo "opt-1_3b-gpus" +echo "=============" time ( $SRC/milabench/milabench/scripts/activator $BASE/venv/torch accelerate launch --mixed_precision=bf16 --dynamo_backend=no --machine_rank=0 --num_machines=1 --multi_gpu --gradient_accumulation_steps=1 --num_cpu_threads_per_process=8 --main_process_ip=127.0.0.1 --main_process_port=22 --num_processes=8 $SRC/milabench/benchmarks/accelerate_opt/main.py --max_train_steps 100 --dataset_name wikitext --dataset_config_name wikitext-103-v1 --dataset_rev b08601e --validation_split_percentage 5 --per_gpu_batch_size 1 --cpus_per_gpu 8 --cache $BASE/cache --model_name facebook/opt-1.3b & wait ) echo "---" -echo "opt-1_3b-multinode" -echo "==================" +echo "opt-1_3b-nodes" +echo "==============" time ( $SRC/milabench/milabench/scripts/activator $BASE/venv/torch accelerate launch --mixed_precision=bf16 --dynamo_backend=no --machine_rank=0 --num_machines=1 --multi_gpu --gradient_accumulation_steps=1 --num_cpu_threads_per_process=8 --main_process_ip=127.0.0.1 --main_process_port=22 --num_processes=8 $SRC/milabench/benchmarks/accelerate_opt/main.py --max_train_steps 100 --dataset_name wikitext --dataset_config_name wikitext-103-v1 --dataset_rev b08601e --validation_split_percentage 5 --per_gpu_batch_size 1 --cpus_per_gpu 8 --cache $BASE/cache --model_name facebook/opt-1.3b & wait ) echo "---" -echo "opt-6_7b" -echo "========" +echo "opt-6_7b-gpus" +echo "=============" time ( $SRC/milabench/milabench/scripts/activator $BASE/venv/torch accelerate launch --mixed_precision=bf16 --dynamo_backend=no --machine_rank=0 --num_machines=1 --use_deepspeed --deepspeed_multinode_launcher=standard --zero_stage=2 --gradient_accumulation_steps=1 --num_cpu_threads_per_process=8 --main_process_ip=127.0.0.1 --main_process_port=22 --num_processes=8 $SRC/milabench/benchmarks/accelerate_opt/main.py --max_train_steps 100 --dataset_name wikitext --dataset_config_name wikitext-103-v1 --dataset_rev b08601e --validation_split_percentage 5 --per_gpu_batch_size 1 --cpus_per_gpu 8 --cache $BASE/cache --model_name facebook/opt-6.7b & wait ) echo "---" -echo "opt-6_7b-multinode" -echo "==================" +echo "opt-6_7b-nodes" +echo "==============" time ( $SRC/milabench/milabench/scripts/activator $BASE/venv/torch accelerate launch --mixed_precision=bf16 --dynamo_backend=no --machine_rank=0 --num_machines=1 --use_deepspeed --deepspeed_multinode_launcher=standard --zero_stage=2 --gradient_accumulation_steps=1 --num_cpu_threads_per_process=8 
--main_process_ip=127.0.0.1 --main_process_port=22 --num_processes=8 $SRC/milabench/benchmarks/accelerate_opt/main.py --max_train_steps 100 --dataset_name wikitext --dataset_config_name wikitext-103-v1 --dataset_rev b08601e --validation_split_percentage 5 --per_gpu_batch_size 1 --cpus_per_gpu 8 --cache $BASE/cache --model_name facebook/opt-6.7b & wait @@ -454,7 +454,7 @@ echo "---" echo "diffusion-gpus" echo "==============" time ( - $SRC/milabench/milabench/scripts/activator $BASE/venv/torch accelerate launch --mixed_precision=bf16 --dynamo_backend=no --machine_rank=0 --num_machines=1 --multi_gpu --gradient_accumulation_steps=1 --num_cpu_threads_per_process=4 --main_process_ip=127.0.0.1 --main_process_port=22 --num_processes=8 $SRC/milabench/benchmarks/diffusion/main.py --train_batch_size 32 --num_epochs 5 & + $SRC/milabench/milabench/scripts/activator $BASE/venv/torch accelerate launch --mixed_precision=bf16 --dynamo_backend=no --machine_rank=0 --num_machines=1 --multi_gpu --gradient_accumulation_steps=1 --num_cpu_threads_per_process=4 --main_process_ip=127.0.0.1 --main_process_port=22 --num_processes=8 $SRC/milabench/benchmarks/diffusion/main.py --num_epochs 5 --batch_size 32 --num_workers 8 & wait ) @@ -462,14 +462,14 @@ echo "---" echo "lightning" echo "=========" time ( - CUDA_VISIBLE_DEVICES=0 python $SRC/milabench/benchmarks/lightning/main.py --epochs 10 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model resnet152 & - CUDA_VISIBLE_DEVICES=1 python $SRC/milabench/benchmarks/lightning/main.py --epochs 10 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model resnet152 & - CUDA_VISIBLE_DEVICES=2 python $SRC/milabench/benchmarks/lightning/main.py --epochs 10 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model resnet152 & - CUDA_VISIBLE_DEVICES=3 python $SRC/milabench/benchmarks/lightning/main.py --epochs 10 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model resnet152 & - CUDA_VISIBLE_DEVICES=4 python $SRC/milabench/benchmarks/lightning/main.py --epochs 10 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model resnet152 & - CUDA_VISIBLE_DEVICES=5 python $SRC/milabench/benchmarks/lightning/main.py --epochs 10 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model resnet152 & - CUDA_VISIBLE_DEVICES=6 python $SRC/milabench/benchmarks/lightning/main.py --epochs 10 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model resnet152 & - CUDA_VISIBLE_DEVICES=7 python $SRC/milabench/benchmarks/lightning/main.py --epochs 10 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model resnet152 & + CUDA_VISIBLE_DEVICES=0 python $SRC/milabench/benchmarks/lightning/main.py --epochs 10 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model resnet152 --batch-size 16 & + CUDA_VISIBLE_DEVICES=1 python $SRC/milabench/benchmarks/lightning/main.py --epochs 10 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model resnet152 --batch-size 16 & + CUDA_VISIBLE_DEVICES=2 python $SRC/milabench/benchmarks/lightning/main.py --epochs 10 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model resnet152 --batch-size 16 & + CUDA_VISIBLE_DEVICES=3 python $SRC/milabench/benchmarks/lightning/main.py --epochs 10 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model resnet152 --batch-size 16 & + CUDA_VISIBLE_DEVICES=4 python $SRC/milabench/benchmarks/lightning/main.py --epochs 10 --num-workers 8 --loader pytorch --data 
$BASE/data/FakeImageNet --model resnet152 --batch-size 16 & + CUDA_VISIBLE_DEVICES=5 python $SRC/milabench/benchmarks/lightning/main.py --epochs 10 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model resnet152 --batch-size 16 & + CUDA_VISIBLE_DEVICES=6 python $SRC/milabench/benchmarks/lightning/main.py --epochs 10 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model resnet152 --batch-size 16 & + CUDA_VISIBLE_DEVICES=7 python $SRC/milabench/benchmarks/lightning/main.py --epochs 10 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model resnet152 --batch-size 16 & wait ) @@ -477,7 +477,15 @@ echo "---" echo "lightning-gpus" echo "==============" time ( - $BASE/venv/torch/bin/benchrun --nnodes=1 --rdzv-backend=c10d --rdzv-endpoint=127.0.0.1:29400 --master-addr=127.0.0.1 --master-port=29400 --nproc-per-node=8 --no-python -- python $SRC/milabench/benchmarks/lightning/main.py --epochs 10 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model resnet152 & + $BASE/venv/torch/bin/benchrun --nnodes=1 --rdzv-backend=c10d --rdzv-endpoint=127.0.0.1:29400 --master-addr=127.0.0.1 --master-port=29400 --nproc-per-node=8 --no-python -- python $SRC/milabench/benchmarks/lightning/main.py --epochs 10 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model resnet152 --batch-size 16 & + wait +) + +echo "---" +echo "dinov2-giant-gpus" +echo "=================" +time ( + $BASE/venv/torch/bin/benchrun --nnodes=1 --rdzv-backend=c10d --rdzv-endpoint=127.0.0.1:29400 --master-addr=127.0.0.1 --master-port=29400 --nproc-per-node=8 --no-python -- python $SRC/milabench/benchmarks/dinov2/main.py --output-dir $BASE/extra/dinov2-giant-gpus/output --no-resume --config-file src/dinov2/configs/train/vitg14.yaml train.dataset_path=ImageNet:split=TRAIN:root=$BASE/data/FakeImageNet:extra=$BASE/data/FakeImageNet train.batch_size_per_gpu=32 train.saveckp_freq=100 train.num_workers=10 & wait ) diff --git a/tests/test_command_reg/test_command_reg_two_nodes.txt b/tests/test_command_reg/test_command_reg_two_nodes.txt index 2817f77f8..387d2d474 100644 --- a/tests/test_command_reg/test_command_reg_two_nodes.txt +++ b/tests/test_command_reg/test_command_reg_two_nodes.txt @@ -15,8 +15,8 @@ export MILABENCH_DIR_DATA=$BASE/data export MILABENCH_DIR_RUNS=$BASE/runs export MILABENCH_DIR_EXTRA=$BASE/extra/llm export MILABENCH_DIR_CACHE=$BASE/cache +export OMP_NUM_THREADS=128 export MILABENCH_CONFIG='{"system": {"arch": "cuda", "sshkey": null, "nodes": [{"ip": "127.0.0.1", "main": true, "name": "0", "port": 22, "user": "username", "hostname": "127.0.0.1", "aliaslist": [], "ipaddrlist": ["127.0.0.1"], "local": true}, {"ip": "192.168.0.11", "main": false, "name": "1", "port": 22, "user": "username", "hostname": "192.168.0.11", "aliaslist": [], "ipaddrlist": ["192.168.0.11"], "local": false}], "self": {"ip": "127.0.0.1", "main": true, "name": "0", "port": 22, "user": "username", "hostname": "127.0.0.1", "aliaslist": [], "ipaddrlist": ["127.0.0.1"], "local": true}}, "dirs": {"base": "$BASE", "venv": "$BASE/venv/torch", "data": "$BASE/data", "runs": "$BASE/runs", "extra": "$BASE/extra/llm", "cache": "$BASE/cache"}, "group": "llm", "install_group": "torch", "install_variant": "cuda", "run_name": "dev", "enabled": true, "capabilities": {"nodes": 1}, "max_duration": 800, "voir": {"options": {"stop": 30, "interval": "1s"}}, "validation": {"usage": {"gpu_load_threshold": 0.5, "gpu_mem_threshold": 0.5}}, "config_base": "$SRC/milabench/config", "config_file": 
"$SRC/milabench/config/standard.yaml", "definition": "$SRC/milabench/benchmarks/llama", "tags": ["inference", "llm", "nlp"], "plan": {"method": "per_gpu"}, "weight": 1.0, "name": "llama", "tag": ["llama"]}' -export OMP_NUM_THREADS=8 echo "---" echo "llama" @@ -124,8 +124,8 @@ time ( ) echo "---" -echo "resnet152-ddp" -echo "=============" +echo "resnet152-ddp-gpus" +echo "==================" time ( $SRC/milabench/benchmarks/torchvision_ddp/activator $BASE/venv/torch $SRC/milabench/benchmarks/torchvision_ddp/main.py --epochs 10 --num-workers 8 --loader torch --data $BASE/data/FakeImageNet --model resnet152 --batch-size 256 & wait @@ -327,10 +327,10 @@ time ( ) echo "---" -echo "resnet152-multi" -echo "===============" +echo "resnet152-gpus" +echo "==============" time ( - $BASE/venv/torch/bin/benchrun --nproc-per-node=8 -m python $SRC/milabench/benchmarks/timm/pytorch-image-models/train.py --amp --amp-dtype bfloat16 --device cuda --val-split --data-dir $BASE/data --dataset FakeImageNet --workers 8 --model resnet152 --batch-size 256 --output $BASE/extra/timm/dev/resnet152-multi.0 --checkpoint-hist 1 & + $BASE/venv/torch/bin/benchrun --nproc-per-node=8 -m python $SRC/milabench/benchmarks/timm/pytorch-image-models/train.py --amp --amp-dtype bfloat16 --device cuda --val-split --data-dir $BASE/data --dataset FakeImageNet --workers 8 --model resnet152 --batch-size 256 --output $BASE/extra/timm/dev/resnet152-gpus.0 --checkpoint-hist 1 & wait ) @@ -350,10 +350,10 @@ time ( ) echo "---" -echo "davit_large-multi" -echo "=================" +echo "davit_large-gpus" +echo "================" time ( - $BASE/venv/torch/bin/benchrun --nproc-per-node=8 -m python $SRC/milabench/benchmarks/timm/pytorch-image-models/train.py --amp --amp-dtype bfloat16 --device cuda --val-split --data-dir $BASE/data --dataset FakeImageNet --workers 8 --model davit_large --batch-size 128 --lr-base 0.01 --output $BASE/extra/timm/dev/davit_large-multi.0 --checkpoint-hist 1 & + $BASE/venv/torch/bin/benchrun --nproc-per-node=8 -m python $SRC/milabench/benchmarks/timm/pytorch-image-models/train.py --amp --amp-dtype bfloat16 --device cuda --val-split --data-dir $BASE/data --dataset FakeImageNet --workers 8 --model davit_large --batch-size 128 --lr-base 0.01 --output $BASE/extra/timm/dev/davit_large-gpus.0 --checkpoint-hist 1 & wait ) @@ -373,16 +373,16 @@ time ( ) echo "---" -echo "opt-1_3b" -echo "========" +echo "opt-1_3b-gpus" +echo "=============" time ( $SRC/milabench/milabench/scripts/activator $BASE/venv/torch accelerate launch --mixed_precision=bf16 --dynamo_backend=no --machine_rank=0 --num_machines=1 --multi_gpu --gradient_accumulation_steps=1 --num_cpu_threads_per_process=8 --main_process_ip=127.0.0.1 --main_process_port=22 --num_processes=8 $SRC/milabench/benchmarks/accelerate_opt/main.py --max_train_steps 100 --dataset_name wikitext --dataset_config_name wikitext-103-v1 --dataset_rev b08601e --validation_split_percentage 5 --per_gpu_batch_size 1 --cpus_per_gpu 8 --cache $BASE/cache --model_name facebook/opt-1.3b & wait ) echo "---" -echo "opt-1_3b-multinode" -echo "==================" +echo "opt-1_3b-nodes" +echo "==============" time ( $SRC/milabench/milabench/scripts/activator $BASE/venv/torch accelerate launch --mixed_precision=bf16 --dynamo_backend=no --machine_rank=0 --num_machines=2 --multi_gpu --gradient_accumulation_steps=1 --num_cpu_threads_per_process=8 --main_process_ip=127.0.0.1 --main_process_port=22 --num_processes=16 & ssh -oCheckHostIP=no -oStrictHostKeyChecking=no -oPasswordAuthentication=no 
-oPasswordAuthentication=no -p 22 username@192.168.0.11 $SRC/milabench/milabench/scripts/activator $BASE/venv/torch accelerate launch --mixed_precision=bf16 --dynamo_backend=no --machine_rank=1 --num_machines=2 --multi_gpu --gradient_accumulation_steps=1 --num_cpu_threads_per_process=8 --main_process_ip=127.0.0.1 --main_process_port=22 --num_processes=16 & @@ -390,16 +390,16 @@ time ( ) echo "---" -echo "opt-6_7b" -echo "========" +echo "opt-6_7b-gpus" +echo "=============" time ( $SRC/milabench/milabench/scripts/activator $BASE/venv/torch accelerate launch --mixed_precision=bf16 --dynamo_backend=no --machine_rank=0 --num_machines=1 --use_deepspeed --deepspeed_multinode_launcher=standard --zero_stage=2 --gradient_accumulation_steps=1 --num_cpu_threads_per_process=8 --main_process_ip=127.0.0.1 --main_process_port=22 --num_processes=8 $SRC/milabench/benchmarks/accelerate_opt/main.py --max_train_steps 100 --dataset_name wikitext --dataset_config_name wikitext-103-v1 --dataset_rev b08601e --validation_split_percentage 5 --per_gpu_batch_size 1 --cpus_per_gpu 8 --cache $BASE/cache --model_name facebook/opt-6.7b & wait ) echo "---" -echo "opt-6_7b-multinode" -echo "==================" +echo "opt-6_7b-nodes" +echo "==============" time ( $SRC/milabench/milabench/scripts/activator $BASE/venv/torch accelerate launch --mixed_precision=bf16 --dynamo_backend=no --machine_rank=0 --num_machines=2 --use_deepspeed --deepspeed_multinode_launcher=standard --zero_stage=2 --gradient_accumulation_steps=1 --num_cpu_threads_per_process=8 --main_process_ip=127.0.0.1 --main_process_port=22 --num_processes=16 & ssh -oCheckHostIP=no -oStrictHostKeyChecking=no -oPasswordAuthentication=no -oPasswordAuthentication=no -p 22 username@192.168.0.11 $SRC/milabench/milabench/scripts/activator $BASE/venv/torch accelerate launch --mixed_precision=bf16 --dynamo_backend=no --machine_rank=1 --num_machines=2 --use_deepspeed --deepspeed_multinode_launcher=standard --zero_stage=2 --gradient_accumulation_steps=1 --num_cpu_threads_per_process=8 --main_process_ip=127.0.0.1 --main_process_port=22 --num_processes=16 & @@ -456,7 +456,7 @@ echo "---" echo "diffusion-gpus" echo "==============" time ( - $SRC/milabench/milabench/scripts/activator $BASE/venv/torch accelerate launch --mixed_precision=bf16 --dynamo_backend=no --machine_rank=0 --num_machines=1 --multi_gpu --gradient_accumulation_steps=1 --num_cpu_threads_per_process=4 --main_process_ip=127.0.0.1 --main_process_port=22 --num_processes=8 $SRC/milabench/benchmarks/diffusion/main.py --train_batch_size 32 --num_epochs 5 & + $SRC/milabench/milabench/scripts/activator $BASE/venv/torch accelerate launch --mixed_precision=bf16 --dynamo_backend=no --machine_rank=0 --num_machines=1 --multi_gpu --gradient_accumulation_steps=1 --num_cpu_threads_per_process=4 --main_process_ip=127.0.0.1 --main_process_port=22 --num_processes=8 $SRC/milabench/benchmarks/diffusion/main.py --num_epochs 5 --batch_size 32 --num_workers 8 & wait ) @@ -464,14 +464,14 @@ echo "---" echo "lightning" echo "=========" time ( - CUDA_VISIBLE_DEVICES=0 python $SRC/milabench/benchmarks/lightning/main.py --epochs 10 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model resnet152 & - CUDA_VISIBLE_DEVICES=1 python $SRC/milabench/benchmarks/lightning/main.py --epochs 10 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model resnet152 & - CUDA_VISIBLE_DEVICES=2 python $SRC/milabench/benchmarks/lightning/main.py --epochs 10 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model 
resnet152 & - CUDA_VISIBLE_DEVICES=3 python $SRC/milabench/benchmarks/lightning/main.py --epochs 10 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model resnet152 & - CUDA_VISIBLE_DEVICES=4 python $SRC/milabench/benchmarks/lightning/main.py --epochs 10 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model resnet152 & - CUDA_VISIBLE_DEVICES=5 python $SRC/milabench/benchmarks/lightning/main.py --epochs 10 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model resnet152 & - CUDA_VISIBLE_DEVICES=6 python $SRC/milabench/benchmarks/lightning/main.py --epochs 10 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model resnet152 & - CUDA_VISIBLE_DEVICES=7 python $SRC/milabench/benchmarks/lightning/main.py --epochs 10 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model resnet152 & + CUDA_VISIBLE_DEVICES=0 python $SRC/milabench/benchmarks/lightning/main.py --epochs 10 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model resnet152 --batch-size 16 & + CUDA_VISIBLE_DEVICES=1 python $SRC/milabench/benchmarks/lightning/main.py --epochs 10 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model resnet152 --batch-size 16 & + CUDA_VISIBLE_DEVICES=2 python $SRC/milabench/benchmarks/lightning/main.py --epochs 10 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model resnet152 --batch-size 16 & + CUDA_VISIBLE_DEVICES=3 python $SRC/milabench/benchmarks/lightning/main.py --epochs 10 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model resnet152 --batch-size 16 & + CUDA_VISIBLE_DEVICES=4 python $SRC/milabench/benchmarks/lightning/main.py --epochs 10 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model resnet152 --batch-size 16 & + CUDA_VISIBLE_DEVICES=5 python $SRC/milabench/benchmarks/lightning/main.py --epochs 10 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model resnet152 --batch-size 16 & + CUDA_VISIBLE_DEVICES=6 python $SRC/milabench/benchmarks/lightning/main.py --epochs 10 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model resnet152 --batch-size 16 & + CUDA_VISIBLE_DEVICES=7 python $SRC/milabench/benchmarks/lightning/main.py --epochs 10 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model resnet152 --batch-size 16 & wait ) @@ -479,7 +479,15 @@ echo "---" echo "lightning-gpus" echo "==============" time ( - $BASE/venv/torch/bin/benchrun --nnodes=1 --rdzv-backend=c10d --rdzv-endpoint=127.0.0.1:29400 --master-addr=127.0.0.1 --master-port=29400 --nproc-per-node=8 --no-python -- python $SRC/milabench/benchmarks/lightning/main.py --epochs 10 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model resnet152 & + $BASE/venv/torch/bin/benchrun --nnodes=1 --rdzv-backend=c10d --rdzv-endpoint=127.0.0.1:29400 --master-addr=127.0.0.1 --master-port=29400 --nproc-per-node=8 --no-python -- python $SRC/milabench/benchmarks/lightning/main.py --epochs 10 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model resnet152 --batch-size 16 & + wait +) + +echo "---" +echo "dinov2-giant-gpus" +echo "=================" +time ( + $BASE/venv/torch/bin/benchrun --nnodes=1 --rdzv-backend=c10d --rdzv-endpoint=127.0.0.1:29400 --master-addr=127.0.0.1 --master-port=29400 --nproc-per-node=8 --no-python -- python $SRC/milabench/benchmarks/dinov2/main.py --output-dir $BASE/extra/dinov2-giant-gpus/output --no-resume --config-file src/dinov2/configs/train/vitg14.yaml 
train.dataset_path=ImageNet:split=TRAIN:root=$BASE/data/FakeImageNet:extra=$BASE/data/FakeImageNet train.batch_size_per_gpu=32 train.saveckp_freq=100 train.num_workers=10 & wait ) diff --git a/tests/test_mock/test_milabench_bad_run.txt b/tests/test_mock/test_milabench_bad_run.txt index f30881ec1..e9b4ffa42 100644 --- a/tests/test_mock/test_milabench_bad_run.txt +++ b/tests/test_mock/test_milabench_bad_run.txt @@ -13,7 +13,7 @@ benchio.0 | Traceback (most recent call last): | File "$TMP/venv/benchio/bin/voir", line 8, in | sys.exit(main()) - | File "$TMP/venv/benchio/lib/python3.10/site-packages/voir/cli.py", line 124, in main + | File "$TMP/venv/benchio/lib/python3.10/site-packages/voir/cli.py", line 128, in main | ov(sys.argv[1:] if argv is None else argv) | File "$TMP/venv/benchio/lib/python3.10/site-packages/voir/phase.py", line 331, in __call__ | self._run(*args, **kwargs) @@ -35,7 +35,7 @@ benchio.1 | Traceback (most recent call last): | File "$TMP/venv/benchio/bin/voir", line 8, in | sys.exit(main()) - | File "$TMP/venv/benchio/lib/python3.10/site-packages/voir/cli.py", line 124, in main + | File "$TMP/venv/benchio/lib/python3.10/site-packages/voir/cli.py", line 128, in main | ov(sys.argv[1:] if argv is None else argv) | File "$TMP/venv/benchio/lib/python3.10/site-packages/voir/phase.py", line 331, in __call__ | self._run(*args, **kwargs)
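
Note on the benchmate/benchmate/datagen.py hunk: fakeimagenet_args() built an argument parser and parsed the options but never returned the result, so callers received None; the added `return args` is the whole fix. A minimal, self-contained sketch of the presumed call pattern follows (the real body of generate_fakeimagenet is not shown in this diff, so its use of the helper below is an assumption for illustration only):

import argparse

def fakeimagenet_args():
    # Mirrors the parser shown in the hunk above; without the newly
    # added `return args`, this function would implicitly return None.
    parser = argparse.ArgumentParser()
    parser.add_argument("--val", default=0.1, type=float, nargs="+")
    parser.add_argument("--test", default=0.1, type=float, nargs="+")
    args, _ = parser.parse_known_args()
    return args

def generate_fakeimagenet(args=None):
    # Assumed fallback: when no namespace is supplied, reuse the
    # defaults parsed by fakeimagenet_args().
    if args is None:
        args = fakeimagenet_args()
    print(args.val, args.test)

if __name__ == "__main__":
    generate_fakeimagenet()

The benchmarks/huggingface/prepare.py hunk is the caller-side counterpart of the same kind of fix: the parsed args are now forwarded into make_config(args) instead of being dropped.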