diff --git a/.gitignore b/.gitignore index 482e776df..90a1e78d7 100644 --- a/.gitignore +++ b/.gitignore @@ -32,7 +32,8 @@ scripts/article/cuda/ scripts/article/xpu/ dependencies/ -benchmarks/gflownet/gflownet +benchmarks/recursiongfn/gflownet +benchmarks/recursiongfn/logs/ scripts/inventory.yaml output/ diff --git a/.pin/constraints-cuda-gnn.txt b/.pin/constraints-cuda-gnn.txt new file mode 100644 index 000000000..cc12cdab2 --- /dev/null +++ b/.pin/constraints-cuda-gnn.txt @@ -0,0 +1,337 @@ +# +# This file is autogenerated by pip-compile with Python 3.10 +# by the following command: +# +# pip-compile --output-file=.pin/constraints-cuda-gnn.txt .pin/tmp-constraints.txt benchmarks/geo_gnn/requirements-pre.in benchmarks/geo_gnn/requirements.in benchmarks/recursiongfn/requirements.in constraints/extra/gnn.cuda.txt +# +--extra-index-url https://pypi.ngc.nvidia.com +--extra-index-url https://download.pytorch.org/whl/cu121 +--find-links https://data.pyg.org/whl/torch-2.3.0+cu121.html +--trusted-host pypi.ngc.nvidia.com + +absl-py==2.1.0 + # via tensorboard +aiohappyeyeballs==2.4.0 + # via aiohttp +aiohttp==3.10.5 + # via torch-geometric +aiosignal==1.3.1 + # via aiohttp +antlr4-python3-runtime==4.9.3 + # via omegaconf +asttokens==2.4.1 + # via giving +async-timeout==4.0.3 + # via aiohttp +attrs==24.2.0 + # via aiohttp +blosc2==2.7.1 + # via tables +botorch==0.11.3 + # via -r benchmarks/recursiongfn/requirements.in +certifi==2024.7.4 + # via + # requests + # sentry-sdk +charset-normalizer==3.3.2 + # via requests +click==8.1.7 + # via wandb +codefind==0.1.6 + # via ptera +cvxopt==1.3.2 + # via -r benchmarks/recursiongfn/requirements.in +docker-pycreds==0.4.0 + # via wandb +executing==1.2.0 + # via varname +filelock==3.15.4 + # via + # torch + # triton +frozenlist==1.4.1 + # via + # aiohttp + # aiosignal +fsspec==2024.6.1 + # via + # torch + # torch-geometric +gitdb==4.0.11 + # via gitpython +gitpython==3.1.43 + # via + # -r benchmarks/recursiongfn/requirements.in + # wandb +giving==0.4.2 + # via + # ptera + # voir +gpytorch==1.12 + # via + # -r benchmarks/recursiongfn/requirements.in + # botorch +grpcio==1.66.0 + # via tensorboard +idna==3.8 + # via + # requests + # yarl +jaxtyping==0.2.33 + # via linear-operator +jinja2==3.1.4 + # via + # torch + # torch-geometric +joblib==1.4.2 + # via scikit-learn +linear-operator==0.5.2 + # via + # botorch + # gpytorch +markdown==3.7 + # via tensorboard +markdown-it-py==3.0.0 + # via rich +markupsafe==2.1.5 + # via + # jinja2 + # werkzeug +mdurl==0.1.2 + # via markdown-it-py +mpmath==1.3.0 + # via + # botorch + # gpytorch + # sympy +msgpack==1.0.8 + # via blosc2 +multidict==6.0.5 + # via + # aiohttp + # yarl +multipledispatch==1.0.0 + # via botorch +ndindex==1.8 + # via blosc2 +networkx==3.3 + # via + # -r benchmarks/recursiongfn/requirements.in + # torch +numexpr==2.10.1 + # via + # blosc2 + # tables +numpy==1.26.4 + # via + # -r benchmarks/geo_gnn/requirements.in + # blosc2 + # botorch + # numexpr + # opt-einsum + # pandas + # pyarrow + # pyro-ppl + # rdkit + # scikit-learn + # scipy + # tables + # tensorboard + # torch-geometric +nvidia-cublas-cu12==12.1.3.1 + # via + # nvidia-cudnn-cu12 + # nvidia-cusolver-cu12 + # torch +nvidia-cuda-cupti-cu12==12.1.105 + # via torch +nvidia-cuda-nvrtc-cu12==12.1.105 + # via torch +nvidia-cuda-runtime-cu12==12.1.105 + # via torch +nvidia-cudnn-cu12==8.9.2.26 + # via torch +nvidia-cufft-cu12==11.0.2.54 + # via torch +nvidia-curand-cu12==10.3.2.106 + # via torch +nvidia-cusolver-cu12==11.4.5.107 + # via torch 
+nvidia-cusparse-cu12==12.1.0.106 + # via + # nvidia-cusolver-cu12 + # torch +nvidia-ml-py==12.560.30 + # via voir +nvidia-nccl-cu12==2.20.5 + # via torch +nvidia-nvjitlink-cu12==12.6.20 + # via + # nvidia-cusolver-cu12 + # nvidia-cusparse-cu12 +nvidia-nvtx-cu12==12.1.105 + # via torch +omegaconf==2.3.0 + # via + # -r benchmarks/recursiongfn/requirements.in + # voir +opt-einsum==3.3.0 + # via pyro-ppl +ovld==0.3.9 + # via voir +packaging==24.1 + # via + # tables + # tensorboard +pandas==2.2.2 + # via + # -r benchmarks/geo_gnn/requirements.in + # -r benchmarks/recursiongfn/requirements.in +pillow==10.4.0 + # via rdkit +platformdirs==4.2.2 + # via wandb +protobuf==5.27.3 + # via + # tensorboard + # wandb +psutil==5.9.8 + # via + # torch-geometric + # voir + # wandb +ptera==1.4.1 + # via voir +py-cpuinfo==9.0.0 + # via + # blosc2 + # tables +pyarrow==17.0.0 + # via -r benchmarks/recursiongfn/requirements.in +pygments==2.18.0 + # via rich +pyparsing==3.1.4 + # via torch-geometric +pyro-api==0.1.2 + # via pyro-ppl +pyro-ppl==1.9.1 + # via + # -r benchmarks/recursiongfn/requirements.in + # botorch +python-dateutil==2.9.0.post0 + # via pandas +pytz==2024.1 + # via pandas +pyyaml==6.0.2 + # via + # omegaconf + # wandb +rdkit==2024.3.5 + # via + # -r benchmarks/geo_gnn/requirements.in + # -r benchmarks/recursiongfn/requirements.in +reactivex==4.0.4 + # via giving +requests==2.32.3 + # via + # torch-geometric + # wandb +rich==13.8.0 + # via voir +scikit-learn==1.5.1 + # via + # gpytorch + # torch-geometric +scipy==1.14.1 + # via + # -r benchmarks/recursiongfn/requirements.in + # botorch + # gpytorch + # linear-operator + # scikit-learn + # torch-cluster + # torch-geometric + # torch-sparse +sentry-sdk==2.13.0 + # via wandb +setproctitle==1.3.3 + # via wandb +six==1.16.0 + # via + # asttokens + # docker-pycreds + # python-dateutil + # tensorboard +smmap==5.0.1 + # via gitdb +sympy==1.13.2 + # via torch +tables==3.10.1 + # via -r benchmarks/recursiongfn/requirements.in +tensorboard==2.17.1 + # via -r benchmarks/recursiongfn/requirements.in +tensorboard-data-server==0.7.2 + # via tensorboard +threadpoolctl==3.5.0 + # via scikit-learn +torch==2.3.1+cu121 + # via + # -r benchmarks/geo_gnn/requirements-pre.in + # -r benchmarks/recursiongfn/requirements.in + # -r constraints/extra/gnn.cuda.txt + # botorch + # linear-operator + # pyro-ppl +torch-cluster==1.6.3+pt23cu121 + # via + # -r benchmarks/geo_gnn/requirements.in + # -r benchmarks/recursiongfn/requirements.in +torch-geometric==2.5.3 + # via + # -r benchmarks/geo_gnn/requirements.in + # -r benchmarks/recursiongfn/requirements.in +torch-scatter==2.1.2+pt23cu121 + # via + # -r benchmarks/geo_gnn/requirements.in + # -r benchmarks/recursiongfn/requirements.in +torch-sparse==0.6.18+pt23cu121 + # via + # -r benchmarks/geo_gnn/requirements.in + # -r benchmarks/recursiongfn/requirements.in +tqdm==4.66.5 + # via + # pyro-ppl + # torch-geometric +triton==2.3.1 + # via torch +typeguard==2.13.3 + # via + # jaxtyping + # linear-operator +typing-extensions==4.12.2 + # via + # reactivex + # tables + # torch +tzdata==2024.1 + # via pandas +urllib3==2.2.2 + # via + # requests + # sentry-sdk +varname==0.10.0 + # via giving +voir==0.2.19 + # via + # -c .pin/../constraints/cuda.txt + # -r benchmarks/geo_gnn/requirements.in + # -r benchmarks/recursiongfn/requirements.in +wandb==0.17.7 + # via -r benchmarks/recursiongfn/requirements.in +werkzeug==3.0.4 + # via tensorboard +yarl==1.9.4 + # via aiohttp + +# The following packages are considered to be unsafe in a requirements 
file: +# setuptools diff --git a/.pin/constraints-cuda-torch.txt b/.pin/constraints-cuda-torch.txt index d691495d5..09c97e23f 100644 --- a/.pin/constraints-cuda-torch.txt +++ b/.pin/constraints-cuda-torch.txt @@ -2,7 +2,7 @@ # This file is autogenerated by pip-compile with Python 3.10 # by the following command: # -# pip-compile --output-file=.pin/constraints-cuda-torch.txt .pin/tmp-constraints.txt benchmarks/brax/requirements.in benchmarks/diffusion/requirements.in benchmarks/dinov2/requirements.in benchmarks/flops/requirements.in benchmarks/huggingface/requirements.in benchmarks/lightning/requirements.in benchmarks/llama/requirements.in benchmarks/llm/requirements.in benchmarks/super-slomo/requirements.in benchmarks/timm/requirements.in benchmarks/torchvision/requirements.in benchmarks/torchvision_ddp/requirements.in +# pip-compile --output-file=.pin/constraints-cuda-torch.txt .pin/tmp-constraints.txt benchmarks/brax/requirements.in benchmarks/diffusion/requirements.in benchmarks/dinov2/requirements.in benchmarks/flops/requirements.in benchmarks/huggingface/requirements.in benchmarks/lightning/requirements.in benchmarks/llama/requirements.in benchmarks/llm/requirements.in benchmarks/super-slomo/requirements.in benchmarks/timm/requirements.in benchmarks/torchatari/requirements.in benchmarks/torchvision/requirements.in benchmarks/torchvision_ddp/requirements.in constraints/extra/torch.cuda.txt # --extra-index-url https://pypi.ngc.nvidia.com --extra-index-url https://download.pytorch.org/whl/cu121 @@ -19,13 +19,14 @@ absl-py==2.1.0 # mujoco-mjx # optax # orbax-checkpoint + # tensorboard accelerate==0.33.0 # via # -r benchmarks/diffusion/requirements.in # diffusers -aiohappyeyeballs==2.3.4 +aiohappyeyeballs==2.4.0 # via aiohttp -aiohttp==3.10.0 +aiohttp==3.10.5 # via # datasets # fsspec @@ -33,6 +34,8 @@ aiosignal==1.3.1 # via aiohttp antlr4-python3-runtime==4.9.3 # via omegaconf +appdirs==1.4.4 + # via cantilever argklass==1.4.4 # via # -r benchmarks/diffusion/requirements.in @@ -41,14 +44,16 @@ asttokens==2.4.1 # via giving async-timeout==4.0.3 # via aiohttp -attrs==23.2.0 +attrs==24.2.0 # via aiohttp blinker==1.8.2 # via flask -blobfile==2.1.1 +blobfile==3.0.0 # via torchtune brax==0.10.5 # via -r benchmarks/brax/requirements.in +cantilever==0.1.0 + # via -r benchmarks/torchatari/requirements.in certifi==2024.7.4 # via requests charset-normalizer==3.3.2 @@ -60,26 +65,33 @@ click==8.1.7 cloudpickle==3.0.0 # via # gym + # gymnasium # submitit codefind==0.1.6 # via ptera contextlib2==21.6.0 # via ml-collections -datasets==2.20.0 +datasets==2.21.0 # via # -r benchmarks/diffusion/requirements.in # -r benchmarks/llama/requirements.in # torchtune -diffusers[torch]==0.29.2 +diffusers[torch]==0.30.1 # via -r benchmarks/diffusion/requirements.in dill==0.3.8 # via # datasets # multiprocess dm-env==1.6 - # via brax + # via + # brax + # envpool dm-tree==0.1.8 # via dm-env +docstring-parser==0.16 + # via tyro +envpool==0.8.4 + # via -r benchmarks/torchatari/requirements.in etils[epath,epy]==1.7.0 # via # brax @@ -91,6 +103,8 @@ executing==1.2.0 # via varname fairscale==0.4.13 # via -r benchmarks/llama/requirements.in +farama-notifications==0.0.4 + # via gymnasium filelock==3.15.4 # via # blobfile @@ -108,13 +122,13 @@ flask==3.0.3 # flask-cors flask-cors==4.0.1 # via brax -flax==0.8.5 +flax==0.9.0 # via brax frozenlist==1.4.1 # via # aiohttp # aiosignal -fsspec[http]==2024.5.0 +fsspec[http]==2024.6.1 # via # datasets # etils @@ -130,15 +144,22 @@ giving==0.4.2 # voir glfw==2.7.0 # via mujoco 
-grpcio==1.65.2 - # via brax -gym==0.26.2 - # via brax +grpcio==1.66.0 + # via + # brax + # tensorboard +gym==0.23.1 + # via + # -r benchmarks/torchatari/requirements.in + # brax + # envpool gym-notices==0.0.8 # via gym +gymnasium==0.29.1 + # via envpool hjson==3.1.0 # via argklass -huggingface-hub==0.24.5 +huggingface-hub==0.24.6 # via # -r benchmarks/timm/requirements.in # accelerate @@ -147,15 +168,18 @@ huggingface-hub==0.24.5 # tokenizers # torchtune # transformers -idna==3.7 +humanize==4.10.0 + # via orbax-checkpoint +idna==3.8 # via # requests # yarl -importlib-metadata==8.2.0 +importlib-metadata==8.4.0 # via diffusers -importlib-resources==6.4.0 +importlib-resources==6.4.4 # via # argklass + # cantilever # etils # torchcompat iopath==0.1.10 @@ -167,6 +191,7 @@ itsdangerous==2.2.0 jax[cuda12]==0.4.31 # via # -r benchmarks/brax/requirements.in + # -r constraints/extra/torch.cuda.txt # brax # chex # flax @@ -194,15 +219,17 @@ jinja2==3.1.4 # brax # flask # torch -lightning==2.3.3 +lightning==2.4.0 # via -r benchmarks/lightning/requirements.in lightning-utilities==0.11.6 # via # lightning # pytorch-lightning # torchmetrics -lxml==4.9.4 +lxml==5.3.0 # via blobfile +markdown==3.7 + # via tensorboard markdown-it-py==3.0.0 # via rich markupsafe==2.1.5 @@ -224,11 +251,11 @@ msgpack==1.0.8 # via # flax # orbax-checkpoint -mujoco==3.2.0 +mujoco==3.2.2 # via # brax # mujoco-mjx -mujoco-mjx==3.2.0 +mujoco-mjx==3.2.2 # via brax multidict==6.0.5 # via @@ -243,20 +270,21 @@ networkx==3.3 numpy==1.26.4 # via # -r benchmarks/super-slomo/requirements.in + # -r benchmarks/torchatari/requirements.in # accelerate # brax # chex # datasets # diffusers # dm-env + # envpool # fairscale - # flax # fvcore # gym + # gymnasium # jax # jaxlib # jaxopt - # lightning # ml-dtypes # mujoco # opencv-python @@ -265,8 +293,8 @@ numpy==1.26.4 # orbax-checkpoint # pandas # pyarrow - # pytorch-lightning # scipy + # tensorboard # tensorboardx # tensorstore # torchmetrics @@ -312,6 +340,8 @@ nvidia-cusparse-cu12==12.1.0.106 # jax-cuda12-plugin # nvidia-cusolver-cu12 # torch +nvidia-ml-py==12.560.30 + # via voir nvidia-nccl-cu12==2.20.5 # via # jax-cuda12-plugin @@ -336,20 +366,24 @@ optax==0.2.3 # via # brax # flax -orbax-checkpoint==0.5.23 +optree==0.12.1 + # via envpool +orbax-checkpoint==0.6.1 # via # brax # flax -ovld==0.3.6 +ovld==0.3.9 # via voir packaging==24.1 # via # accelerate # datasets + # envpool # huggingface-hub # lightning # lightning-utilities # pytorch-lightning + # tensorboard # tensorboardx # torchmetrics # transformers @@ -367,6 +401,7 @@ portalocker==2.10.1 protobuf==5.27.3 # via # orbax-checkpoint + # tensorboard # tensorboardx psutil==5.9.8 # via @@ -376,25 +411,21 @@ ptera==1.4.1 # via voir pyarrow==17.0.0 # via datasets -pyarrow-hotfix==0.6 - # via datasets pycryptodomex==3.20.0 # via blobfile pygments==2.18.0 # via rich -pynvml==11.5.3 - # via voir pyopengl==3.1.7 # via mujoco python-dateutil==2.9.0.post0 # via pandas pytinyrenderer==0.0.14 # via brax -pytorch-lightning==2.3.3 +pytorch-lightning==2.4.0 # via lightning pytz==2024.1 # via pandas -pyyaml==6.0.1 +pyyaml==6.0.2 # via # -r benchmarks/llm/requirements.in # -r benchmarks/timm/requirements.in @@ -424,18 +455,19 @@ requests==2.32.3 # huggingface-hub # tiktoken # transformers -rich==13.7.1 +rich==13.8.0 # via # flax + # tyro # voir -safetensors==0.4.3 +safetensors==0.4.4 # via # -r benchmarks/timm/requirements.in # accelerate # diffusers # torchtune # transformers -scipy==1.14.0 +scipy==1.14.1 # via # -r benchmarks/dinov2/requirements.in # 
brax @@ -447,21 +479,28 @@ sentencepiece==0.2.0 # via # -r benchmarks/llama/requirements.in # torchtune +shtab==1.7.1 + # via tyro six==1.16.0 # via # asttokens # fire # ml-collections # python-dateutil + # tensorboard submitit==1.5.1 # via -r benchmarks/dinov2/requirements.in -sympy==1.13.1 +sympy==1.13.2 # via torch tabulate==0.9.0 # via fvcore +tensorboard==2.17.1 + # via -r benchmarks/torchatari/requirements.in +tensorboard-data-server==0.7.2 + # via tensorboard tensorboardx==2.6.2.2 # via brax -tensorstore==0.1.63 +tensorstore==0.1.64 # via # flax # orbax-checkpoint @@ -486,6 +525,7 @@ torch==2.4.0+cu121 # -r benchmarks/llm/requirements.in # -r benchmarks/super-slomo/requirements.in # -r benchmarks/timm/requirements.in + # -r benchmarks/torchatari/requirements.in # -r benchmarks/torchvision/requirements.in # -r benchmarks/torchvision_ddp/requirements.in # accelerate @@ -503,9 +543,10 @@ torchcompat==1.1.4 # -c .pin/../constraints/cuda.txt # -r benchmarks/flops/requirements.in # -r benchmarks/lightning/requirements.in + # -r benchmarks/torchatari/requirements.in # -r benchmarks/torchvision/requirements.in # -r benchmarks/torchvision_ddp/requirements.in -torchmetrics==1.4.0.post0 +torchmetrics==1.4.1 # via # -r benchmarks/dinov2/requirements.in # lightning @@ -522,7 +563,7 @@ torchvision==0.19.0+cu121 # -r benchmarks/timm/requirements.in # -r benchmarks/torchvision/requirements.in # -r benchmarks/torchvision_ddp/requirements.in -tqdm==4.66.4 +tqdm==4.66.5 # via # -r benchmarks/diffusion/requirements.in # -r benchmarks/flops/requirements.in @@ -537,32 +578,40 @@ tqdm==4.66.4 # pytorch-lightning # torchtune # transformers -transformers==4.43.3 +transformers==4.44.2 # via # -r benchmarks/diffusion/requirements.in # -r benchmarks/huggingface/requirements.in # -r benchmarks/llama/requirements.in -trimesh==4.4.3 +trimesh==4.4.7 # via # brax # mujoco-mjx triton==3.0.0 # via torch +types-protobuf==5.27.0.20240626 + # via envpool typing-extensions==4.12.2 # via # brax # chex + # envpool # etils # flax + # gymnasium # huggingface-hub # iopath # lightning # lightning-utilities + # optree # orbax-checkpoint # pytorch-lightning # reactivex # submitit # torch + # tyro +tyro==0.8.10 + # via -r benchmarks/torchatari/requirements.in tzdata==2024.1 # via pandas urllib3==2.2.2 @@ -584,19 +633,22 @@ voir==0.2.19 # -r benchmarks/llm/requirements.in # -r benchmarks/super-slomo/requirements.in # -r benchmarks/timm/requirements.in + # -r benchmarks/torchatari/requirements.in # -r benchmarks/torchvision/requirements.in # -r benchmarks/torchvision_ddp/requirements.in -werkzeug==3.0.3 - # via flask +werkzeug==3.0.4 + # via + # flask + # tensorboard xformers==0.0.27.post2 # via -r benchmarks/dinov2/requirements.in -xxhash==3.4.1 +xxhash==3.5.0 # via datasets yacs==0.1.8 # via fvcore yarl==1.9.4 # via aiohttp -zipp==3.19.2 +zipp==3.20.1 # via # etils # importlib-metadata diff --git a/.pin/constraints-rocm-gnn.txt b/.pin/constraints-rocm-gnn.txt new file mode 100644 index 000000000..dd945fc95 --- /dev/null +++ b/.pin/constraints-rocm-gnn.txt @@ -0,0 +1,305 @@ +# +# This file is autogenerated by pip-compile with Python 3.10 +# by the following command: +# +# pip-compile --output-file=.pin/constraints-rocm-gnn.txt .pin/tmp-constraints.txt benchmarks/geo_gnn/requirements-pre.in benchmarks/geo_gnn/requirements.in benchmarks/recursiongfn/requirements.in +# +--extra-index-url https://download.pytorch.org/whl/rocm6.0 + +absl-py==2.1.0 + # via tensorboard +aiohappyeyeballs==2.4.0 + # via aiohttp +aiohttp==3.10.5 + # via 
torch-geometric +aiosignal==1.3.1 + # via aiohttp +antlr4-python3-runtime==4.9.3 + # via omegaconf +asttokens==2.4.1 + # via giving +async-timeout==4.0.3 + # via aiohttp +attrs==24.2.0 + # via aiohttp +blosc2==2.7.1 + # via tables +botorch==0.11.3 + # via gflownet +certifi==2024.7.4 + # via + # requests + # sentry-sdk +charset-normalizer==3.3.2 + # via requests +click==8.1.7 + # via wandb +codefind==0.1.6 + # via ptera +cvxopt==1.3.2 + # via gflownet +docker-pycreds==0.4.0 + # via wandb +executing==1.2.0 + # via varname +filelock==3.15.4 + # via + # pytorch-triton-rocm + # torch +frozenlist==1.4.1 + # via + # aiohttp + # aiosignal +fsspec==2024.6.1 + # via + # torch + # torch-geometric +gflownet @ git+https://github.com/Delaunay/gflownet@milabench + # via -r benchmarks/recursiongfn/requirements.in +gitdb==4.0.11 + # via gitpython +gitpython==3.1.43 + # via + # gflownet + # wandb +giving==0.4.2 + # via + # ptera + # voir +gpytorch==1.12 + # via + # botorch + # gflownet +grpcio==1.65.5 + # via tensorboard +idna==3.7 + # via + # requests + # yarl +jaxtyping==0.2.33 + # via linear-operator +jinja2==3.1.4 + # via + # torch + # torch-geometric +joblib==1.4.2 + # via scikit-learn +linear-operator==0.5.2 + # via + # botorch + # gpytorch +markdown==3.7 + # via tensorboard +markdown-it-py==3.0.0 + # via rich +markupsafe==2.1.5 + # via + # jinja2 + # werkzeug +mdurl==0.1.2 + # via markdown-it-py +mpmath==1.3.0 + # via + # botorch + # gpytorch + # sympy +msgpack==1.0.8 + # via blosc2 +multidict==6.0.5 + # via + # aiohttp + # yarl +multipledispatch==1.0.0 + # via botorch +ndindex==1.8 + # via blosc2 +networkx==3.3 + # via + # gflownet + # torch +numexpr==2.10.1 + # via + # blosc2 + # tables +numpy==1.26.4 + # via + # -r benchmarks/geo_gnn/requirements.in + # blosc2 + # botorch + # numexpr + # opt-einsum + # pandas + # pyarrow + # pyro-ppl + # rdkit + # scikit-learn + # scipy + # tables + # tensorboard + # torch-geometric +omegaconf==2.3.0 + # via + # gflownet + # voir +opt-einsum==3.3.0 + # via pyro-ppl +ovld==0.3.8 + # via voir +packaging==24.1 + # via + # tables + # tensorboard +pandas==2.2.2 + # via + # -r benchmarks/geo_gnn/requirements.in + # gflownet +pillow==10.4.0 + # via rdkit +platformdirs==4.2.2 + # via wandb +protobuf==5.27.3 + # via + # tensorboard + # wandb +psutil==5.9.8 + # via + # torch-geometric + # voir + # wandb +ptera==1.4.1 + # via voir +py-cpuinfo==9.0.0 + # via + # blosc2 + # tables +pyarrow==17.0.0 + # via gflownet +pygments==2.18.0 + # via rich +pynvml==11.5.3 + # via voir +pyparsing==3.1.2 + # via torch-geometric +pyro-api==0.1.2 + # via pyro-ppl +pyro-ppl==1.9.1 + # via + # botorch + # gflownet +python-dateutil==2.9.0.post0 + # via pandas +pytorch-triton-rocm==3.0.0 + # via torch +pytz==2024.1 + # via pandas +pyyaml==6.0.2 + # via + # omegaconf + # wandb +rdkit==2024.3.5 + # via + # -r benchmarks/geo_gnn/requirements.in + # gflownet +reactivex==4.0.4 + # via giving +requests==2.32.3 + # via + # torch-geometric + # wandb +rich==13.7.1 + # via voir +scikit-learn==1.5.1 + # via + # gpytorch + # torch-geometric +scipy==1.14.0 + # via + # botorch + # gflownet + # gpytorch + # linear-operator + # scikit-learn + # torch-cluster + # torch-geometric + # torch-sparse +sentry-sdk==2.13.0 + # via wandb +setproctitle==1.3.3 + # via wandb +six==1.16.0 + # via + # asttokens + # docker-pycreds + # python-dateutil + # tensorboard +smmap==5.0.1 + # via gitdb +sympy==1.13.2 + # via torch +tables==3.10.1 + # via gflownet +tensorboard==2.17.1 + # via gflownet +tensorboard-data-server==0.7.2 + # 
via tensorboard +threadpoolctl==3.5.0 + # via scikit-learn +torch==2.4.0+rocm6.0 + # via + # -r benchmarks/geo_gnn/requirements-pre.in + # -r benchmarks/recursiongfn/requirements.in + # botorch + # gflownet + # linear-operator + # pyro-ppl +torch-cluster==1.6.3 + # via + # -r benchmarks/geo_gnn/requirements.in + # gflownet +torch-geometric==2.5.3 + # via + # -r benchmarks/geo_gnn/requirements.in + # gflownet +torch-scatter==2.1.2 + # via + # -r benchmarks/geo_gnn/requirements.in + # gflownet +torch-sparse==0.6.18 + # via + # -r benchmarks/geo_gnn/requirements.in + # gflownet +tqdm==4.66.5 + # via + # pyro-ppl + # torch-geometric +typeguard==2.13.3 + # via + # jaxtyping + # linear-operator +typing-extensions==4.12.2 + # via + # reactivex + # tables + # torch +tzdata==2024.1 + # via pandas +urllib3==2.2.2 + # via + # requests + # sentry-sdk +varname==0.10.0 + # via giving +voir==0.2.17 + # via + # -c .pin/../constraints/rocm.txt + # -r benchmarks/geo_gnn/requirements.in + # -r benchmarks/recursiongfn/requirements.in +wandb==0.17.7 + # via gflownet +werkzeug==3.0.3 + # via tensorboard +yarl==1.9.4 + # via aiohttp + +# The following packages are considered to be unsafe in a requirements file: +# setuptools diff --git a/.pin/constraints-rocm-torch.txt b/.pin/constraints-rocm-torch.txt index f2a057ae5..4fe6ae9da 100644 --- a/.pin/constraints-rocm-torch.txt +++ b/.pin/constraints-rocm-torch.txt @@ -2,12 +2,9 @@ # This file is autogenerated by pip-compile with Python 3.10 # by the following command: # -# pip-compile --output-file=.pin/constraints-rocm-torch.txt .pin/tmp-constraints.txt benchmarks/accelerate_opt/requirements.in benchmarks/brax/requirements.in benchmarks/dlrm/requirements.in benchmarks/flops/requirements.in benchmarks/huggingface/requirements.in benchmarks/llama/requirements.in benchmarks/stargan/requirements.in benchmarks/super-slomo/requirements.in benchmarks/timm/requirements.in benchmarks/torchvision/requirements.in benchmarks/torchvision_ddp/requirements.in +# pip-compile --output-file=.pin/constraints-rocm-torch.txt .pin/tmp-constraints.txt benchmarks/brax/requirements.in benchmarks/diffusion/requirements.in benchmarks/dinov2/requirements.in benchmarks/flops/requirements.in benchmarks/huggingface/requirements.in benchmarks/lightning/requirements.in benchmarks/llama/requirements.in benchmarks/llm/requirements.in benchmarks/super-slomo/requirements.in benchmarks/timm/requirements.in benchmarks/torchatari/requirements.in benchmarks/torchvision/requirements.in benchmarks/torchvision_ddp/requirements.in # ---extra-index-url https://pypi.ngc.nvidia.com --extra-index-url https://download.pytorch.org/whl/rocm6.0 ---find-links https://storage.googleapis.com/jax-releases/jax_cuda_releases.html ---trusted-host pypi.ngc.nvidia.com absl-py==2.1.0 # via @@ -20,31 +17,41 @@ absl-py==2.1.0 # optax # orbax-checkpoint # tensorboard -accelerate==0.32.1 - # via -r benchmarks/accelerate_opt/requirements.in -aiohttp==3.9.5 +accelerate==0.33.0 + # via + # -r benchmarks/diffusion/requirements.in + # diffusers +aiohappyeyeballs==2.4.0 + # via aiohttp +aiohttp==3.10.5 # via # datasets # fsspec aiosignal==1.3.1 # via aiohttp -annotated-types==0.7.0 - # via pydantic antlr4-python3-runtime==4.9.3 # via omegaconf +appdirs==1.4.4 + # via cantilever +argklass==1.4.4 + # via + # -r benchmarks/diffusion/requirements.in + # -r benchmarks/llm/requirements.in asttokens==2.4.1 # via giving async-timeout==4.0.3 # via aiohttp -attrs==23.2.0 +attrs==24.2.0 # via aiohttp -beautifulsoup4==4.12.3 - # via gdown 
blinker==1.8.2 # via flask +blobfile==2.1.1 + # via torchtune brax==0.10.5 # via -r benchmarks/brax/requirements.in -certifi==2024.6.2 +cantilever==0.1.0 + # via -r benchmarks/torchatari/requirements.in +certifi==2024.7.4 # via requests charset-normalizer==3.3.2 # via requests @@ -53,31 +60,35 @@ chex==0.1.86 click==8.1.7 # via flask cloudpickle==3.0.0 - # via gym + # via + # gym + # gymnasium + # submitit codefind==0.1.6 # via ptera contextlib2==21.6.0 # via ml-collections -datasets==2.20.0 +datasets==2.21.0 # via - # -r benchmarks/accelerate_opt/requirements.in + # -r benchmarks/diffusion/requirements.in # -r benchmarks/llama/requirements.in - # evaluate -deepspeed==0.14.4 - # via -r benchmarks/accelerate_opt/requirements.in + # torchtune +diffusers[torch]==0.30.0 + # via -r benchmarks/diffusion/requirements.in dill==0.3.8 # via # datasets - # evaluate # multiprocess dm-env==1.6 - # via brax + # via + # brax + # envpool dm-tree==0.1.8 # via dm-env -docker==7.1.0 - # via torchx docstring-parser==0.16 - # via torchx + # via tyro +envpool==0.8.4 + # via -r benchmarks/torchatari/requirements.in etils[epath,epy]==1.7.0 # via # brax @@ -85,22 +96,20 @@ etils[epath,epy]==1.7.0 # mujoco-mjx # optax # orbax-checkpoint -evaluate==0.4.2 - # via -r benchmarks/accelerate_opt/requirements.in executing==1.2.0 # via varname fairscale==0.4.13 # via -r benchmarks/llama/requirements.in -fbgemm-gpu==0.7.0+rocm6.0 - # via torchrec +farama-notifications==0.0.4 + # via gymnasium filelock==3.15.4 # via + # blobfile # datasets - # gdown + # diffusers # huggingface-hub # pytorch-triton-rocm # torch - # torchx # transformers fire==0.6.0 # via -r benchmarks/llama/requirements.in @@ -116,57 +125,67 @@ frozenlist==1.4.1 # via # aiohttp # aiosignal -fsspec[http]==2024.5.0 +fsspec[http]==2024.6.1 # via # datasets # etils - # evaluate # huggingface-hub + # lightning + # pytorch-lightning # torch - # torchx -future==1.0.0 - # via -r benchmarks/dlrm/requirements.in -gdown==5.2.0 - # via -r benchmarks/stargan/requirements.in +fvcore==0.1.5.post20221221 + # via -r benchmarks/dinov2/requirements.in giving==0.4.2 # via # ptera # voir glfw==2.7.0 # via mujoco -graphviz==0.20.3 - # via torchviz -grpcio==1.65.1 +grpcio==1.65.5 # via # brax # tensorboard -gym==0.26.2 - # via brax +gym==0.23.1 + # via + # -r benchmarks/torchatari/requirements.in + # brax + # envpool gym-notices==0.0.8 # via gym +gymnasium==0.29.1 + # via envpool hjson==3.1.0 - # via deepspeed -huggingface-hub==0.23.5 + # via argklass +huggingface-hub==0.24.6 # via # -r benchmarks/timm/requirements.in # accelerate # datasets - # evaluate + # diffusers # tokenizers + # torchtune # transformers +humanize==4.10.0 + # via orbax-checkpoint idna==3.7 # via # requests # yarl -importlib-metadata==8.0.0 - # via torchx -importlib-resources==6.4.0 +importlib-metadata==8.4.0 + # via diffusers +importlib-resources==6.4.3 # via + # argklass + # cantilever # etils # torchcompat +iopath==0.1.10 + # via + # -r benchmarks/dinov2/requirements.in + # fvcore itsdangerous==2.2.0 # via flask -jax[cuda12]==0.4.30 +jax==0.4.31 # via # -r benchmarks/brax/requirements.in # brax @@ -176,11 +195,7 @@ jax[cuda12]==0.4.30 # mujoco-mjx # optax # orbax-checkpoint -jax-cuda12-pjrt==0.4.30 - # via jax-cuda12-plugin -jax-cuda12-plugin[with-cuda]==0.4.30 - # via jax -jaxlib==0.4.30 +jaxlib==0.4.31 # via # brax # chex @@ -196,11 +211,16 @@ jinja2==3.1.4 # brax # flask # torch -joblib==1.4.2 - # via scikit-learn -lightning-utilities==0.11.5 - # via torchmetrics -markdown==3.6 +lightning==2.4.0 + # via 
-r benchmarks/lightning/requirements.in +lightning-utilities==0.11.6 + # via + # lightning + # pytorch-lightning + # torchmetrics +lxml==4.9.4 + # via blobfile +markdown==3.7 # via tensorboard markdown-it-py==3.0.0 # via rich @@ -223,100 +243,64 @@ msgpack==1.0.8 # via # flax # orbax-checkpoint -mujoco==3.2.0 +mujoco==3.2.2 # via # brax # mujoco-mjx -mujoco-mjx==3.2.0 +mujoco-mjx==3.2.2 # via brax multidict==6.0.5 # via # aiohttp # yarl multiprocess==0.70.16 - # via - # datasets - # evaluate -mypy-extensions==1.0.0 - # via typing-inspect + # via datasets nest-asyncio==1.6.0 # via orbax-checkpoint networkx==3.3 # via torch -ninja==1.11.1.1 - # via deepspeed numpy==1.26.4 # via - # -r benchmarks/dlrm/requirements.in - # -r benchmarks/stargan/requirements.in # -r benchmarks/super-slomo/requirements.in + # -r benchmarks/torchatari/requirements.in # accelerate # brax # chex # datasets - # deepspeed + # diffusers # dm-env - # evaluate + # envpool # fairscale - # fbgemm-gpu # flax + # fvcore # gym + # gymnasium # jax # jaxlib # jaxopt # ml-dtypes # mujoco - # onnx # opencv-python # opt-einsum # optax # orbax-checkpoint # pandas # pyarrow - # scikit-learn # scipy # tensorboard # tensorboardx # tensorstore # torchmetrics + # torchtune # torchvision # transformers # trimesh -nvidia-cublas-cu12==12.5.3.2 - # via - # jax-cuda12-plugin - # nvidia-cudnn-cu12 - # nvidia-cusolver-cu12 -nvidia-cuda-cupti-cu12==12.5.82 - # via jax-cuda12-plugin -nvidia-cuda-nvcc-cu12==12.5.82 - # via jax-cuda12-plugin -nvidia-cuda-runtime-cu12==12.5.82 - # via jax-cuda12-plugin -nvidia-cudnn-cu12==9.2.1.18 - # via jax-cuda12-plugin -nvidia-cufft-cu12==11.2.3.61 - # via jax-cuda12-plugin -nvidia-cusolver-cu12==11.6.3.83 - # via jax-cuda12-plugin -nvidia-cusparse-cu12==12.5.1.3 - # via - # jax-cuda12-plugin - # nvidia-cusolver-cu12 -nvidia-ml-py==12.555.43 - # via deepspeed -nvidia-nccl-cu12==2.22.3 - # via jax-cuda12-plugin -nvidia-nvjitlink-cu12==12.5.82 - # via - # jax-cuda12-plugin - # nvidia-cufft-cu12 - # nvidia-cusolver-cu12 - # nvidia-cusparse-cu12 + # xformers omegaconf==2.3.0 - # via voir -onnx==1.16.1 - # via -r benchmarks/dlrm/requirements.in + # via + # -r benchmarks/dinov2/requirements.in + # torchtune + # voir opencv-python==4.10.0.84 # via -r benchmarks/super-slomo/requirements.in opt-einsum==3.3.0 @@ -325,122 +309,125 @@ optax==0.2.3 # via # brax # flax -orbax-checkpoint==0.5.21 +optree==0.12.1 + # via envpool +orbax-checkpoint==0.6.0 # via # brax # flax -ovld==0.3.5 +ovld==0.3.8 # via voir packaging==24.1 # via # accelerate # datasets - # deepspeed - # evaluate + # envpool # huggingface-hub + # lightning # lightning-utilities + # pytorch-lightning + # tensorboard # tensorboardx # torchmetrics # transformers pandas==2.2.2 - # via - # datasets - # evaluate + # via datasets pillow==10.4.0 # via + # -r benchmarks/huggingface/requirements.in # brax + # diffusers + # fvcore # torchvision -protobuf==4.25.3 +portalocker==2.10.1 + # via iopath +protobuf==5.27.3 # via - # onnx # orbax-checkpoint # tensorboard # tensorboardx psutil==5.9.8 # via # accelerate - # deepspeed # voir ptera==1.4.1 # via voir -py-cpuinfo==9.0.0 - # via deepspeed pyarrow==17.0.0 # via datasets -pyarrow-hotfix==0.6 - # via datasets -pydantic==2.7.4 - # via deepspeed -pydantic-core==2.18.4 - # via pydantic -pydot==3.0.1 - # via -r benchmarks/dlrm/requirements.in +pycryptodomex==3.20.0 + # via blobfile pygments==2.18.0 # via rich pynvml==11.5.3 # via voir pyopengl==3.1.7 # via mujoco -pyparsing==3.1.2 - # via pydot -pyre-extensions==0.0.30 - # via 
torchx -pysocks==1.7.1 - # via requests python-dateutil==2.9.0.post0 # via pandas pytinyrenderer==0.0.14 # via brax -pytorch-triton-rocm==2.3.1 +pytorch-lightning==2.4.0 + # via lightning +pytorch-triton-rocm==3.0.0 # via torch pytz==2024.1 # via pandas -pyyaml==6.0.1 +pyyaml==6.0.2 # via + # -r benchmarks/llm/requirements.in # -r benchmarks/timm/requirements.in # accelerate # datasets # flax + # fvcore # huggingface-hub + # lightning # ml-collections # omegaconf # orbax-checkpoint - # torchx + # pytorch-lightning # transformers + # yacs reactivex==4.0.4 # via giving -regex==2024.5.15 - # via transformers -requests[socks]==2.32.3 +regex==2024.7.24 + # via + # diffusers + # tiktoken + # transformers +requests==2.32.3 # via # datasets - # docker - # evaluate - # gdown + # diffusers # huggingface-hub + # tiktoken # transformers rich==13.7.1 # via - # -r benchmarks/accelerate_opt/requirements.in # flax + # tyro # voir -safetensors==0.4.3 +safetensors==0.4.4 # via # -r benchmarks/timm/requirements.in # accelerate + # diffusers + # torchtune # transformers -scikit-learn==1.5.1 - # via -r benchmarks/dlrm/requirements.in scipy==1.14.0 # via + # -r benchmarks/dinov2/requirements.in # brax # jax # jaxlib # jaxopt # mujoco-mjx - # scikit-learn sentencepiece==0.2.0 - # via -r benchmarks/llama/requirements.in + # via + # -r benchmarks/llama/requirements.in + # torchtune +shtab==1.7.1 + # via tyro six==1.16.0 # via # asttokens @@ -448,104 +435,165 @@ six==1.16.0 # ml-collections # python-dateutil # tensorboard -soupsieve==2.5 - # via beautifulsoup4 -sympy==1.13.0 +submitit==1.5.1 + # via -r benchmarks/dinov2/requirements.in +sympy==1.13.2 # via torch tabulate==0.9.0 - # via torchx -tensorboard==2.17.0 - # via -r benchmarks/dlrm/requirements.in + # via fvcore +tensorboard==2.17.1 + # via -r benchmarks/torchatari/requirements.in tensorboard-data-server==0.7.2 # via tensorboard tensorboardx==2.6.2.2 # via brax -tensorstore==0.1.63 +tensorstore==0.1.64 # via # flax # orbax-checkpoint termcolor==2.4.0 - # via fire -threadpoolctl==3.5.0 - # via scikit-learn + # via + # fire + # fvcore +tiktoken==0.7.0 + # via torchtune tokenizers==0.19.1 # via transformers toolz==0.12.1 # via chex -tqdm==4.66.4 +torch==2.4.0+rocm6.0 # via - # -r benchmarks/dlrm/requirements.in + # -r benchmarks/brax/requirements.in + # -r benchmarks/dinov2/requirements.in + # -r benchmarks/flops/requirements.in + # -r benchmarks/huggingface/requirements.in + # -r benchmarks/lightning/requirements.in + # -r benchmarks/llama/requirements.in + # -r benchmarks/llm/requirements.in + # -r benchmarks/super-slomo/requirements.in + # -r benchmarks/timm/requirements.in + # -r benchmarks/torchatari/requirements.in + # -r benchmarks/torchvision/requirements.in + # -r benchmarks/torchvision_ddp/requirements.in + # accelerate + # diffusers + # fairscale + # lightning + # pytorch-lightning + # torchmetrics + # torchvision + # xformers +torchao==0.3.1 + # via torchtune +torchcompat==1.1.4 + # via + # -c .pin/../constraints/rocm.txt + # -r benchmarks/flops/requirements.in + # -r benchmarks/lightning/requirements.in + # -r benchmarks/torchatari/requirements.in + # -r benchmarks/torchvision/requirements.in + # -r benchmarks/torchvision_ddp/requirements.in +torchmetrics==1.4.1 + # via + # -r benchmarks/dinov2/requirements.in + # lightning + # pytorch-lightning +torchtune==0.2.1 + # via -r benchmarks/llm/requirements.in +torchvision==0.19.0+rocm6.0 + # via + # -r benchmarks/diffusion/requirements.in + # -r benchmarks/dinov2/requirements.in + # -r 
benchmarks/flops/requirements.in + # -r benchmarks/lightning/requirements.in + # -r benchmarks/super-slomo/requirements.in + # -r benchmarks/timm/requirements.in + # -r benchmarks/torchvision/requirements.in + # -r benchmarks/torchvision_ddp/requirements.in +tqdm==4.66.5 + # via + # -r benchmarks/diffusion/requirements.in # -r benchmarks/flops/requirements.in # -r benchmarks/super-slomo/requirements.in # -r benchmarks/torchvision/requirements.in # -r benchmarks/torchvision_ddp/requirements.in # datasets - # deepspeed - # evaluate - # gdown + # fvcore # huggingface-hub - # torchrec + # iopath + # lightning + # pytorch-lightning + # torchtune # transformers -transformers==4.42.4 +transformers==4.44.1 # via - # -r benchmarks/accelerate_opt/requirements.in + # -r benchmarks/diffusion/requirements.in # -r benchmarks/huggingface/requirements.in # -r benchmarks/llama/requirements.in -trimesh==4.4.3 +trimesh==4.4.7 # via # brax # mujoco-mjx +types-protobuf==5.27.0.20240626 + # via envpool typing-extensions==4.12.2 # via # brax # chex + # envpool # etils # flax + # gymnasium # huggingface-hub + # iopath + # lightning # lightning-utilities + # optree # orbax-checkpoint - # pydantic - # pydantic-core - # pyre-extensions + # pytorch-lightning # reactivex + # submitit # torch - # typing-inspect -typing-inspect==0.9.0 - # via pyre-extensions + # tyro +tyro==0.8.8 + # via -r benchmarks/torchatari/requirements.in tzdata==2024.1 # via pandas -urllib3==1.26.19 +urllib3==2.2.2 # via - # docker + # blobfile # requests - # torchx varname==0.10.0 # via giving voir==0.2.19 # via # -c .pin/../constraints/rocm.txt - # -r benchmarks/accelerate_opt/requirements.in # -r benchmarks/brax/requirements.in - # -r benchmarks/dlrm/requirements.in + # -r benchmarks/diffusion/requirements.in + # -r benchmarks/dinov2/requirements.in # -r benchmarks/flops/requirements.in # -r benchmarks/huggingface/requirements.in + # -r benchmarks/lightning/requirements.in # -r benchmarks/llama/requirements.in - # -r benchmarks/stargan/requirements.in + # -r benchmarks/llm/requirements.in # -r benchmarks/super-slomo/requirements.in # -r benchmarks/timm/requirements.in + # -r benchmarks/torchatari/requirements.in # -r benchmarks/torchvision/requirements.in # -r benchmarks/torchvision_ddp/requirements.in werkzeug==3.0.3 # via # flask # tensorboard -xxhash==3.4.1 - # via - # datasets - # evaluate +xformers==0.0.27.post2 + # via -r benchmarks/dinov2/requirements.in +xxhash==3.5.0 + # via datasets +yacs==0.1.8 + # via fvcore yarl==1.9.4 # via aiohttp -zipp==3.19.2 +zipp==3.20.0 # via # etils # importlib-metadata diff --git a/.pin/constraints-xpu-torch.txt b/.pin/constraints-xpu-torch.txt index 71a3d6f33..9e4276398 100644 --- a/.pin/constraints-xpu-torch.txt +++ b/.pin/constraints-xpu-torch.txt @@ -2,12 +2,10 @@ # This file is autogenerated by pip-compile with Python 3.10 # by the following command: # -# pip-compile --output-file=.pin/constraints-xpu-torch.txt .pin/tmp-constraints.txt benchmarks/accelerate_opt/requirements.in benchmarks/brax/requirements.in benchmarks/dlrm/requirements.in benchmarks/flops/requirements.in benchmarks/huggingface/requirements.in benchmarks/llama/requirements.in benchmarks/stargan/requirements.in benchmarks/super-slomo/requirements.in benchmarks/timm/requirements.in benchmarks/torchvision/requirements.in benchmarks/torchvision_ddp/requirements.in +# pip-compile --output-file=.pin/constraints-xpu-torch.txt .pin/tmp-constraints.txt benchmarks/brax/requirements.in benchmarks/diffusion/requirements.in 
benchmarks/dinov2/requirements.in benchmarks/flops/requirements.in benchmarks/huggingface/requirements.in benchmarks/lightning/requirements.in benchmarks/llama/requirements.in benchmarks/llm/requirements.in benchmarks/super-slomo/requirements.in benchmarks/timm/requirements.in benchmarks/torchvision/requirements.in benchmarks/torchvision_ddp/requirements.in constraints/extra/torch.xpu.txt # ---extra-index-url https://pypi.ngc.nvidia.com +--extra-index-url https://download.pytorch.org/whl/cpu --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/us/ ---find-links https://storage.googleapis.com/jax-releases/jax_cuda_releases.html ---trusted-host pypi.ngc.nvidia.com absl-py==2.1.0 # via @@ -19,32 +17,37 @@ absl-py==2.1.0 # mujoco-mjx # optax # orbax-checkpoint - # tensorboard -accelerate==0.32.1 - # via -r benchmarks/accelerate_opt/requirements.in -aiohttp==3.9.5 +accelerate==0.33.0 + # via + # -r benchmarks/diffusion/requirements.in + # diffusers +aiohappyeyeballs==2.3.5 + # via aiohttp +aiohttp==3.10.2 # via # datasets # fsspec aiosignal==1.3.1 # via aiohttp -annotated-types==0.7.0 - # via pydantic antlr4-python3-runtime==4.9.3 # via omegaconf +argklass==1.4.4 + # via + # -r benchmarks/diffusion/requirements.in + # -r benchmarks/llm/requirements.in asttokens==2.4.1 # via giving async-timeout==4.0.3 # via aiohttp -attrs==23.2.0 +attrs==24.2.0 # via aiohttp -beautifulsoup4==4.12.3 - # via gdown blinker==1.8.2 # via flask +blobfile==2.1.1 + # via torchtune brax==0.10.5 # via -r benchmarks/brax/requirements.in -certifi==2024.6.2 +certifi==2024.7.4 # via requests charset-normalizer==3.3.2 # via requests @@ -53,31 +56,28 @@ chex==0.1.86 click==8.1.7 # via flask cloudpickle==3.0.0 - # via gym + # via + # gym + # submitit codefind==0.1.6 # via ptera contextlib2==21.6.0 # via ml-collections datasets==2.20.0 # via - # -r benchmarks/accelerate_opt/requirements.in + # -r benchmarks/diffusion/requirements.in # -r benchmarks/llama/requirements.in - # evaluate -deepspeed==0.14.4 - # via -r benchmarks/accelerate_opt/requirements.in + # torchtune +diffusers[torch]==0.30.0 + # via -r benchmarks/diffusion/requirements.in dill==0.3.8 # via # datasets - # evaluate # multiprocess dm-env==1.6 # via brax dm-tree==0.1.8 # via dm-env -docker==7.1.0 - # via torchx -docstring-parser==0.16 - # via torchx etils[epath,epy]==1.7.0 # via # brax @@ -85,21 +85,17 @@ etils[epath,epy]==1.7.0 # mujoco-mjx # optax # orbax-checkpoint -evaluate==0.4.2 - # via -r benchmarks/accelerate_opt/requirements.in executing==1.2.0 # via varname fairscale==0.4.13 # via -r benchmarks/llama/requirements.in -fbgemm-gpu==0.7.0 - # via torchrec filelock==3.15.4 # via + # blobfile # datasets - # gdown + # diffusers # huggingface-hub # torch - # torchx # transformers fire==0.6.0 # via -r benchmarks/llama/requirements.in @@ -119,53 +115,65 @@ fsspec[http]==2024.5.0 # via # datasets # etils - # evaluate # huggingface-hub + # lightning + # pytorch-lightning # torch - # torchx -future==1.0.0 - # via -r benchmarks/dlrm/requirements.in -gdown==5.2.0 - # via -r benchmarks/stargan/requirements.in +fvcore==0.1.5.post20221221 + # via -r benchmarks/dinov2/requirements.in giving==0.4.2 # via # ptera # voir glfw==2.7.0 # via mujoco -graphviz==0.20.3 - # via torchviz -grpcio==1.65.1 - # via - # brax - # tensorboard +grpcio==1.65.4 + # via brax gym==0.26.2 # via brax gym-notices==0.0.8 # via gym hjson==3.1.0 - # via deepspeed -huggingface-hub==0.24.0 + # via argklass +huggingface-hub==0.24.5 # via # -r benchmarks/timm/requirements.in # 
accelerate # datasets - # evaluate + # diffusers # tokenizers + # torchtune # transformers idna==3.7 # via # requests # yarl -importlib-metadata==8.0.0 - # via torchx +importlib-metadata==8.2.0 + # via diffusers importlib-resources==6.4.0 # via + # argklass # etils # torchcompat +intel-extension-for-openxla==0.3.0 + # via + # -c .pin/../constraints/xpu.txt + # -r constraints/extra/torch.xpu.txt +intel-extension-for-pytorch==2.3.100 + # via + # -c .pin/../constraints/xpu.txt + # -r constraints/extra/torch.xpu.txt +intel-extension-for-pytorch-deepspeed==2.1.40 + # via + # -c .pin/../constraints/xpu.txt + # -r constraints/extra/torch.xpu.txt +iopath==0.1.10 + # via + # -r benchmarks/dinov2/requirements.in + # fvcore itsdangerous==2.2.0 # via flask -jax[cuda12]==0.4.30 +jax==0.4.31 # via # -r benchmarks/brax/requirements.in # brax @@ -175,11 +183,7 @@ jax[cuda12]==0.4.30 # mujoco-mjx # optax # orbax-checkpoint -jax-cuda12-pjrt==0.4.30 - # via jax-cuda12-plugin -jax-cuda12-plugin[with-cuda]==0.4.30 - # via jax -jaxlib==0.4.30 +jaxlib==0.4.31 # via # brax # chex @@ -195,12 +199,15 @@ jinja2==3.1.4 # brax # flask # torch -joblib==1.4.2 - # via scikit-learn -lightning-utilities==0.11.5 - # via torchmetrics -markdown==3.6 - # via tensorboard +lightning==2.4.0 + # via -r benchmarks/lightning/requirements.in +lightning-utilities==0.11.6 + # via + # lightning + # pytorch-lightning + # torchmetrics +lxml==4.9.4 + # via blobfile markdown-it-py==3.0.0 # via rich markupsafe==2.1.5 @@ -222,100 +229,66 @@ msgpack==1.0.8 # via # flax # orbax-checkpoint -mujoco==3.2.0 +mujoco==3.2.2 # via # brax # mujoco-mjx -mujoco-mjx==3.2.0 +mujoco-mjx==3.2.2 # via brax multidict==6.0.5 # via # aiohttp # yarl multiprocess==0.70.16 - # via - # datasets - # evaluate -mypy-extensions==1.0.0 - # via typing-inspect + # via datasets nest-asyncio==1.6.0 # via orbax-checkpoint networkx==3.3 # via torch -ninja==1.11.1.1 - # via deepspeed numpy==1.26.4 # via - # -r benchmarks/dlrm/requirements.in - # -r benchmarks/stargan/requirements.in # -r benchmarks/super-slomo/requirements.in # accelerate # brax # chex # datasets - # deepspeed + # diffusers # dm-env - # evaluate # fairscale - # fbgemm-gpu # flax + # fvcore # gym + # intel-extension-for-openxla + # intel-extension-for-pytorch # jax # jaxlib # jaxopt # ml-dtypes # mujoco - # onnx # opencv-python # opt-einsum # optax # orbax-checkpoint # pandas # pyarrow - # scikit-learn # scipy - # tensorboard # tensorboardx # tensorstore # torchmetrics + # torchtune # torchvision # transformers # trimesh -nvidia-cublas-cu12==12.5.3.2 - # via - # jax-cuda12-plugin - # nvidia-cudnn-cu12 - # nvidia-cusolver-cu12 -nvidia-cuda-cupti-cu12==12.5.82 - # via jax-cuda12-plugin -nvidia-cuda-nvcc-cu12==12.5.82 - # via jax-cuda12-plugin -nvidia-cuda-runtime-cu12==12.5.82 - # via jax-cuda12-plugin -nvidia-cudnn-cu12==9.2.1.18 - # via jax-cuda12-plugin -nvidia-cufft-cu12==11.2.3.61 - # via jax-cuda12-plugin -nvidia-cusolver-cu12==11.6.3.83 - # via jax-cuda12-plugin -nvidia-cusparse-cu12==12.5.1.3 - # via - # jax-cuda12-plugin - # nvidia-cusolver-cu12 -nvidia-ml-py==12.555.43 - # via deepspeed -nvidia-nccl-cu12==2.22.3 - # via jax-cuda12-plugin -nvidia-nvjitlink-cu12==12.5.82 - # via - # jax-cuda12-plugin - # nvidia-cufft-cu12 - # nvidia-cusolver-cu12 - # nvidia-cusparse-cu12 + # xformers omegaconf==2.3.0 - # via voir -onnx==1.16.1 - # via -r benchmarks/dlrm/requirements.in + # via + # -r benchmarks/dinov2/requirements.in + # torchtune + # voir +oneccl-bind-pt==2.1.400+xpu + # via + # -c 
.pin/../constraints/xpu.txt + # -r constraints/extra/torch.xpu.txt opencv-python==4.10.0.84 # via -r benchmarks/super-slomo/requirements.in opt-einsum==3.3.0 @@ -324,221 +297,226 @@ optax==0.2.3 # via # brax # flax -orbax-checkpoint==0.5.21 +orbax-checkpoint==0.5.23 # via # brax # flax -ovld==0.3.5 +ovld==0.3.8 # via voir packaging==24.1 # via # accelerate # datasets - # deepspeed - # evaluate # huggingface-hub + # intel-extension-for-pytorch + # lightning # lightning-utilities + # pytorch-lightning # tensorboardx # torchmetrics # transformers pandas==2.2.2 - # via - # datasets - # evaluate + # via datasets pillow==10.4.0 # via + # -r benchmarks/huggingface/requirements.in # brax + # diffusers + # fvcore # torchvision -protobuf==4.25.3 +portalocker==2.10.1 + # via iopath +protobuf==5.27.3 # via - # onnx # orbax-checkpoint - # tensorboard # tensorboardx psutil==5.9.8 # via # accelerate - # deepspeed + # intel-extension-for-pytorch # voir ptera==1.4.1 # via voir -py-cpuinfo==9.0.0 - # via deepspeed pyarrow==17.0.0 # via datasets pyarrow-hotfix==0.6 # via datasets -pydantic==2.7.4 - # via deepspeed -pydantic-core==2.18.4 - # via pydantic -pydot==3.0.1 - # via -r benchmarks/dlrm/requirements.in +pycryptodomex==3.20.0 + # via blobfile pygments==2.18.0 # via rich pynvml==11.5.3 # via voir pyopengl==3.1.7 # via mujoco -pyparsing==3.1.2 - # via pydot -pyre-extensions==0.0.30 - # via torchx -pysocks==1.7.1 - # via requests python-dateutil==2.9.0.post0 # via pandas pytinyrenderer==0.0.14 # via brax +pytorch-lightning==2.4.0 + # via lightning pytz==2024.1 # via pandas -pyyaml==6.0.1 +pyyaml==6.0.2 # via + # -r benchmarks/llm/requirements.in # -r benchmarks/timm/requirements.in # accelerate # datasets # flax + # fvcore # huggingface-hub + # lightning # ml-collections # omegaconf # orbax-checkpoint - # torchx + # pytorch-lightning # transformers + # yacs reactivex==4.0.4 # via giving -regex==2024.5.15 - # via transformers -requests[socks]==2.32.3 +regex==2024.7.24 + # via + # diffusers + # tiktoken + # transformers +requests==2.32.3 # via # datasets - # docker - # evaluate - # gdown + # diffusers # huggingface-hub - # torchvision + # tiktoken # transformers rich==13.7.1 # via - # -r benchmarks/accelerate_opt/requirements.in # flax # voir -safetensors==0.4.3 +safetensors==0.4.4 # via # -r benchmarks/timm/requirements.in # accelerate + # diffusers + # torchtune # transformers -scikit-learn==1.5.1 - # via -r benchmarks/dlrm/requirements.in -scipy==1.14.0 +scipy==1.11.4 # via + # -r benchmarks/dinov2/requirements.in # brax + # intel-extension-for-openxla # jax # jaxlib # jaxopt # mujoco-mjx - # scikit-learn sentencepiece==0.2.0 - # via -r benchmarks/llama/requirements.in + # via + # -r benchmarks/llama/requirements.in + # torchtune six==1.16.0 # via # asttokens # fire # ml-collections # python-dateutil - # tensorboard -soupsieve==2.5 - # via beautifulsoup4 -sympy==1.13.0 +submitit==1.5.1 + # via -r benchmarks/dinov2/requirements.in +sympy==1.13.1 # via torch tabulate==0.9.0 - # via torchx -tensorboard==2.17.0 - # via -r benchmarks/dlrm/requirements.in -tensorboard-data-server==0.7.2 - # via tensorboard + # via fvcore tensorboardx==2.6.2.2 # via brax -tensorstore==0.1.63 +tensorstore==0.1.64 # via # flax # orbax-checkpoint termcolor==2.4.0 - # via fire -threadpoolctl==3.5.0 - # via scikit-learn + # via + # fire + # fvcore +tiktoken==0.7.0 + # via torchtune tokenizers==0.19.1 # via transformers toolz==0.12.1 # via chex -torch==2.1.0.post2+cxx11.abi +torch==2.4.0+cpu # via # -c .pin/../constraints/xpu.txt - # 
-r benchmarks/accelerate_opt/requirements.in # -r benchmarks/brax/requirements.in - # -r benchmarks/dlrm/requirements.in + # -r benchmarks/dinov2/requirements.in # -r benchmarks/flops/requirements.in # -r benchmarks/huggingface/requirements.in + # -r benchmarks/lightning/requirements.in # -r benchmarks/llama/requirements.in - # -r benchmarks/stargan/requirements.in + # -r benchmarks/llm/requirements.in # -r benchmarks/super-slomo/requirements.in # -r benchmarks/timm/requirements.in # -r benchmarks/torchvision/requirements.in # -r benchmarks/torchvision_ddp/requirements.in + # -r constraints/extra/torch.xpu.txt # accelerate - # deepspeed + # diffusers # fairscale + # lightning + # pytorch-lightning # torchaudio # torchmetrics # torchvision - # torchviz -torchaudio==2.1.0.post2+cxx11.abi + # xformers +torchao==0.3.1+cpu + # via torchtune +torchaudio==2.4.0+cpu # via # -c .pin/../constraints/xpu.txt - # -r benchmarks/accelerate_opt/requirements.in + # -r constraints/extra/torch.xpu.txt torchcompat==1.1.4 # via # -c .pin/../constraints/xpu.txt # -r benchmarks/flops/requirements.in + # -r benchmarks/lightning/requirements.in # -r benchmarks/torchvision/requirements.in # -r benchmarks/torchvision_ddp/requirements.in -torchmetrics==1.0.3 - # via torchrec -torchrec==0.7.0 - # via -r benchmarks/dlrm/requirements.in -torchvision==0.16.0.post2+cxx11.abi + # -r constraints/extra/torch.xpu.txt +torchmetrics==1.4.1 + # via + # -r benchmarks/dinov2/requirements.in + # lightning + # pytorch-lightning +torchtune==0.2.1+cpu + # via -r benchmarks/llm/requirements.in +torchvision==0.19.0+cpu # via # -c .pin/../constraints/xpu.txt - # -r benchmarks/accelerate_opt/requirements.in + # -r benchmarks/diffusion/requirements.in + # -r benchmarks/dinov2/requirements.in # -r benchmarks/flops/requirements.in - # -r benchmarks/stargan/requirements.in + # -r benchmarks/lightning/requirements.in # -r benchmarks/super-slomo/requirements.in # -r benchmarks/timm/requirements.in # -r benchmarks/torchvision/requirements.in # -r benchmarks/torchvision_ddp/requirements.in -torchviz==0.0.2 - # via -r benchmarks/dlrm/requirements.in -torchx==0.7.0 - # via -r benchmarks/dlrm/requirements.in -tqdm==4.66.4 + # -r constraints/extra/torch.xpu.txt +tqdm==4.66.5 # via - # -r benchmarks/dlrm/requirements.in + # -r benchmarks/diffusion/requirements.in # -r benchmarks/flops/requirements.in # -r benchmarks/super-slomo/requirements.in # -r benchmarks/torchvision/requirements.in # -r benchmarks/torchvision_ddp/requirements.in # datasets - # deepspeed - # evaluate - # gdown + # fvcore # huggingface-hub - # torchrec + # iopath + # lightning + # pytorch-lightning + # torchtune # transformers -transformers==4.42.4 +transformers==4.44.0 # via - # -r benchmarks/accelerate_opt/requirements.in + # -r benchmarks/diffusion/requirements.in # -r benchmarks/huggingface/requirements.in # -r benchmarks/llama/requirements.in -trimesh==4.4.3 +trimesh==4.4.4 # via # brax # mujoco-mjx @@ -549,47 +527,48 @@ typing-extensions==4.12.2 # etils # flax # huggingface-hub + # iopath + # lightning # lightning-utilities # orbax-checkpoint - # pydantic - # pydantic-core - # pyre-extensions + # pytorch-lightning # reactivex + # submitit # torch - # typing-inspect -typing-inspect==0.9.0 - # via pyre-extensions tzdata==2024.1 # via pandas -urllib3==1.26.19 +urllib3==2.2.2 # via - # docker + # blobfile # requests - # torchx varname==0.10.0 # via giving voir==0.2.19 # via # -c .pin/../constraints/xpu.txt - # -r benchmarks/accelerate_opt/requirements.in # -r 
benchmarks/brax/requirements.in - # -r benchmarks/dlrm/requirements.in + # -r benchmarks/diffusion/requirements.in + # -r benchmarks/dinov2/requirements.in # -r benchmarks/flops/requirements.in # -r benchmarks/huggingface/requirements.in + # -r benchmarks/lightning/requirements.in # -r benchmarks/llama/requirements.in - # -r benchmarks/stargan/requirements.in + # -r benchmarks/llm/requirements.in # -r benchmarks/super-slomo/requirements.in # -r benchmarks/timm/requirements.in # -r benchmarks/torchvision/requirements.in # -r benchmarks/torchvision_ddp/requirements.in + # -r constraints/extra/torch.xpu.txt werkzeug==3.0.3 - # via - # flask - # tensorboard + # via flask +wheel==0.44.0 + # via intel-extension-for-openxla +xformers==0.0.27.post2 + # via -r benchmarks/dinov2/requirements.in xxhash==3.4.1 - # via - # datasets - # evaluate + # via datasets +yacs==0.1.8 + # via fvcore yarl==1.9.4 # via aiohttp zipp==3.19.2 diff --git a/README.md b/README.md index 5531a1253..526398938 100644 --- a/README.md +++ b/README.md @@ -39,7 +39,7 @@ It will include all of the necessary data docker run -it --rm --ipc=host --gpus=all \ -v $(pwd)/results:/milabench/envs/runs \ $MILABENCH_IMAGE \ - milabench run + bash -c "milabench prepare && milabench run" ================= Benchmark results diff --git a/benchmarks/brax/requirements.cuda.txt b/benchmarks/brax/requirements.cuda.txt index db4ddb1c7..21e15c3bc 100644 --- a/benchmarks/brax/requirements.cuda.txt +++ b/benchmarks/brax/requirements.cuda.txt @@ -90,11 +90,11 @@ flask-cors==4.0.1 # via # -c .pin/../.pin/constraints-cuda-torch.txt # brax -flax==0.8.5 +flax==0.9.0 # via # -c .pin/../.pin/constraints-cuda-torch.txt # brax -fsspec==2024.5.0 +fsspec==2024.6.1 # via # -c .pin/../.pin/constraints-cuda-torch.txt # etils @@ -108,11 +108,11 @@ glfw==2.7.0 # via # -c .pin/../.pin/constraints-cuda-torch.txt # mujoco -grpcio==1.65.2 +grpcio==1.66.0 # via # -c .pin/../.pin/constraints-cuda-torch.txt # brax -gym==0.26.2 +gym==0.23.1 # via # -c .pin/../.pin/constraints-cuda-torch.txt # brax @@ -120,7 +120,11 @@ gym-notices==0.0.8 # via # -c .pin/../.pin/constraints-cuda-torch.txt # gym -importlib-resources==6.4.0 +humanize==4.10.0 + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # orbax-checkpoint +importlib-resources==6.4.4 # via # -c .pin/../.pin/constraints-cuda-torch.txt # etils @@ -128,7 +132,7 @@ itsdangerous==2.2.0 # via # -c .pin/../.pin/constraints-cuda-torch.txt # flask -jax[cuda12]==0.4.31 +jax==0.4.31 # via # -c .pin/../.pin/constraints-cuda-torch.txt # -r benchmarks/brax/requirements.in @@ -139,14 +143,6 @@ jax[cuda12]==0.4.31 # mujoco-mjx # optax # orbax-checkpoint -jax-cuda12-pjrt==0.4.31 - # via - # -c .pin/../.pin/constraints-cuda-torch.txt - # jax-cuda12-plugin -jax-cuda12-plugin[with-cuda]==0.4.31 - # via - # -c .pin/../.pin/constraints-cuda-torch.txt - # jax jaxlib==0.4.31 # via # -c .pin/../.pin/constraints-cuda-torch.txt @@ -199,12 +195,12 @@ msgpack==1.0.8 # -c .pin/../.pin/constraints-cuda-torch.txt # flax # orbax-checkpoint -mujoco==3.2.0 +mujoco==3.2.2 # via # -c .pin/../.pin/constraints-cuda-torch.txt # brax # mujoco-mjx -mujoco-mjx==3.2.0 +mujoco-mjx==3.2.2 # via # -c .pin/../.pin/constraints-cuda-torch.txt # brax @@ -222,7 +218,6 @@ numpy==1.26.4 # brax # chex # dm-env - # flax # gym # jax # jaxlib @@ -239,19 +234,13 @@ numpy==1.26.4 nvidia-cublas-cu12==12.1.3.1 # via # -c .pin/../.pin/constraints-cuda-torch.txt - # jax-cuda12-plugin # nvidia-cudnn-cu12 # nvidia-cusolver-cu12 # torch nvidia-cuda-cupti-cu12==12.1.105 # via # -c 
.pin/../.pin/constraints-cuda-torch.txt - # jax-cuda12-plugin # torch -nvidia-cuda-nvcc-cu12==12.6.20 - # via - # -c .pin/../.pin/constraints-cuda-torch.txt - # jax-cuda12-plugin nvidia-cuda-nvrtc-cu12==12.1.105 # via # -c .pin/../.pin/constraints-cuda-torch.txt @@ -259,17 +248,14 @@ nvidia-cuda-nvrtc-cu12==12.1.105 nvidia-cuda-runtime-cu12==12.1.105 # via # -c .pin/../.pin/constraints-cuda-torch.txt - # jax-cuda12-plugin # torch nvidia-cudnn-cu12==9.1.0.70 # via # -c .pin/../.pin/constraints-cuda-torch.txt - # jax-cuda12-plugin # torch nvidia-cufft-cu12==11.0.2.54 # via # -c .pin/../.pin/constraints-cuda-torch.txt - # jax-cuda12-plugin # torch nvidia-curand-cu12==10.3.2.106 # via @@ -278,23 +264,23 @@ nvidia-curand-cu12==10.3.2.106 nvidia-cusolver-cu12==11.4.5.107 # via # -c .pin/../.pin/constraints-cuda-torch.txt - # jax-cuda12-plugin # torch nvidia-cusparse-cu12==12.1.0.106 # via # -c .pin/../.pin/constraints-cuda-torch.txt - # jax-cuda12-plugin # nvidia-cusolver-cu12 # torch +nvidia-ml-py==12.560.30 + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # voir nvidia-nccl-cu12==2.20.5 # via # -c .pin/../.pin/constraints-cuda-torch.txt - # jax-cuda12-plugin # torch nvidia-nvjitlink-cu12==12.6.20 # via # -c .pin/../.pin/constraints-cuda-torch.txt - # jax-cuda12-plugin # nvidia-cusolver-cu12 # nvidia-cusparse-cu12 nvidia-nvtx-cu12==12.1.105 @@ -314,12 +300,12 @@ optax==0.2.3 # -c .pin/../.pin/constraints-cuda-torch.txt # brax # flax -orbax-checkpoint==0.5.23 +orbax-checkpoint==0.6.1 # via # -c .pin/../.pin/constraints-cuda-torch.txt # brax # flax -ovld==0.3.6 +ovld==0.3.9 # via # -c .pin/../.pin/constraints-cuda-torch.txt # voir @@ -348,10 +334,6 @@ pygments==2.18.0 # via # -c .pin/../.pin/constraints-cuda-torch.txt # rich -pynvml==11.5.3 - # via - # -c .pin/../.pin/constraints-cuda-torch.txt - # voir pyopengl==3.1.7 # via # -c .pin/../.pin/constraints-cuda-torch.txt @@ -360,7 +342,7 @@ pytinyrenderer==0.0.14 # via # -c .pin/../.pin/constraints-cuda-torch.txt # brax -pyyaml==6.0.1 +pyyaml==6.0.2 # via # -c .pin/../.pin/constraints-cuda-torch.txt # flax @@ -371,12 +353,12 @@ reactivex==4.0.4 # via # -c .pin/../.pin/constraints-cuda-torch.txt # giving -rich==13.7.1 +rich==13.8.0 # via # -c .pin/../.pin/constraints-cuda-torch.txt # flax # voir -scipy==1.14.0 +scipy==1.14.1 # via # -c .pin/../.pin/constraints-cuda-torch.txt # brax @@ -389,7 +371,7 @@ six==1.16.0 # -c .pin/../.pin/constraints-cuda-torch.txt # asttokens # ml-collections -sympy==1.13.1 +sympy==1.13.2 # via # -c .pin/../.pin/constraints-cuda-torch.txt # torch @@ -397,7 +379,7 @@ tensorboardx==2.6.2.2 # via # -c .pin/../.pin/constraints-cuda-torch.txt # brax -tensorstore==0.1.63 +tensorstore==0.1.64 # via # -c .pin/../.pin/constraints-cuda-torch.txt # flax @@ -410,7 +392,7 @@ torch==2.4.0+cu121 # via # -c .pin/../.pin/constraints-cuda-torch.txt # -r benchmarks/brax/requirements.in -trimesh==4.4.3 +trimesh==4.4.7 # via # -c .pin/../.pin/constraints-cuda-torch.txt # brax @@ -438,11 +420,11 @@ voir==0.2.19 # -c .pin/../.pin/constraints-cuda-torch.txt # -c .pin/../constraints/cuda.txt # -r benchmarks/brax/requirements.in -werkzeug==3.0.3 +werkzeug==3.0.4 # via # -c .pin/../.pin/constraints-cuda-torch.txt # flask -zipp==3.19.2 +zipp==3.20.1 # via # -c .pin/../.pin/constraints-cuda-torch.txt # etils diff --git a/benchmarks/brax/requirements.in b/benchmarks/brax/requirements.in index 8221238cf..cb8584f98 100644 --- a/benchmarks/brax/requirements.in +++ b/benchmarks/brax/requirements.in @@ -1,5 +1,4 @@ -jax[cuda12] ---find-links 
https://storage.googleapis.com/jax-releases/jax_cuda_releases.html +jax torch brax voir>=0.2.19,<0.3 diff --git a/benchmarks/brax/requirements.rocm.txt b/benchmarks/brax/requirements.rocm.txt index 22646c6c3..0c14e04d9 100644 --- a/benchmarks/brax/requirements.rocm.txt +++ b/benchmarks/brax/requirements.rocm.txt @@ -4,10 +4,7 @@ # # pip-compile --output-file=benchmarks/brax/requirements.rocm.txt .pin/tmp-constraints-rocm-brax.txt benchmarks/brax/requirements.in # ---extra-index-url https://pypi.ngc.nvidia.com --extra-index-url https://download.pytorch.org/whl/rocm6.0 ---find-links https://storage.googleapis.com/jax-releases/jax_cuda_releases.html ---trusted-host pypi.ngc.nvidia.com absl-py==2.1.0 # via @@ -94,7 +91,7 @@ flax==0.8.5 # via # -c .pin/../.pin/constraints-rocm-torch.txt # brax -fsspec==2024.5.0 +fsspec==2024.6.1 # via # -c .pin/../.pin/constraints-rocm-torch.txt # etils @@ -108,11 +105,11 @@ glfw==2.7.0 # via # -c .pin/../.pin/constraints-rocm-torch.txt # mujoco -grpcio==1.65.1 +grpcio==1.65.5 # via # -c .pin/../.pin/constraints-rocm-torch.txt # brax -gym==0.26.2 +gym==0.23.1 # via # -c .pin/../.pin/constraints-rocm-torch.txt # brax @@ -120,7 +117,11 @@ gym-notices==0.0.8 # via # -c .pin/../.pin/constraints-rocm-torch.txt # gym -importlib-resources==6.4.0 +humanize==4.10.0 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # orbax-checkpoint +importlib-resources==6.4.3 # via # -c .pin/../.pin/constraints-rocm-torch.txt # etils @@ -128,7 +129,7 @@ itsdangerous==2.2.0 # via # -c .pin/../.pin/constraints-rocm-torch.txt # flask -jax[cuda12]==0.4.30 +jax==0.4.31 # via # -c .pin/../.pin/constraints-rocm-torch.txt # -r benchmarks/brax/requirements.in @@ -139,15 +140,7 @@ jax[cuda12]==0.4.30 # mujoco-mjx # optax # orbax-checkpoint -jax-cuda12-pjrt==0.4.30 - # via - # -c .pin/../.pin/constraints-rocm-torch.txt - # jax-cuda12-plugin -jax-cuda12-plugin[with-cuda]==0.4.30 - # via - # -c .pin/../.pin/constraints-rocm-torch.txt - # jax -jaxlib==0.4.30 +jaxlib==0.4.31 # via # -c .pin/../.pin/constraints-rocm-torch.txt # brax @@ -199,12 +192,12 @@ msgpack==1.0.8 # -c .pin/../.pin/constraints-rocm-torch.txt # flax # orbax-checkpoint -mujoco==3.2.0 +mujoco==3.2.2 # via # -c .pin/../.pin/constraints-rocm-torch.txt # brax # mujoco-mjx -mujoco-mjx==3.2.0 +mujoco-mjx==3.2.2 # via # -c .pin/../.pin/constraints-rocm-torch.txt # brax @@ -236,52 +229,6 @@ numpy==1.26.4 # tensorboardx # tensorstore # trimesh -nvidia-cublas-cu12==12.5.3.2 - # via - # -c .pin/../.pin/constraints-rocm-torch.txt - # jax-cuda12-plugin - # nvidia-cudnn-cu12 - # nvidia-cusolver-cu12 -nvidia-cuda-cupti-cu12==12.5.82 - # via - # -c .pin/../.pin/constraints-rocm-torch.txt - # jax-cuda12-plugin -nvidia-cuda-nvcc-cu12==12.5.82 - # via - # -c .pin/../.pin/constraints-rocm-torch.txt - # jax-cuda12-plugin -nvidia-cuda-runtime-cu12==12.5.82 - # via - # -c .pin/../.pin/constraints-rocm-torch.txt - # jax-cuda12-plugin -nvidia-cudnn-cu12==9.2.1.18 - # via - # -c .pin/../.pin/constraints-rocm-torch.txt - # jax-cuda12-plugin -nvidia-cufft-cu12==11.2.3.61 - # via - # -c .pin/../.pin/constraints-rocm-torch.txt - # jax-cuda12-plugin -nvidia-cusolver-cu12==11.6.3.83 - # via - # -c .pin/../.pin/constraints-rocm-torch.txt - # jax-cuda12-plugin -nvidia-cusparse-cu12==12.5.1.3 - # via - # -c .pin/../.pin/constraints-rocm-torch.txt - # jax-cuda12-plugin - # nvidia-cusolver-cu12 -nvidia-nccl-cu12==2.22.3 - # via - # -c .pin/../.pin/constraints-rocm-torch.txt - # jax-cuda12-plugin -nvidia-nvjitlink-cu12==12.5.82 - # via - # -c 
.pin/../.pin/constraints-rocm-torch.txt - # jax-cuda12-plugin - # nvidia-cufft-cu12 - # nvidia-cusolver-cu12 - # nvidia-cusparse-cu12 omegaconf==2.3.0 # via # -c .pin/../.pin/constraints-rocm-torch.txt @@ -295,12 +242,12 @@ optax==0.2.3 # -c .pin/../.pin/constraints-rocm-torch.txt # brax # flax -orbax-checkpoint==0.5.21 +orbax-checkpoint==0.6.0 # via # -c .pin/../.pin/constraints-rocm-torch.txt # brax # flax -ovld==0.3.5 +ovld==0.3.8 # via # -c .pin/../.pin/constraints-rocm-torch.txt # voir @@ -312,7 +259,7 @@ pillow==10.4.0 # via # -c .pin/../.pin/constraints-rocm-torch.txt # brax -protobuf==4.25.3 +protobuf==5.27.3 # via # -c .pin/../.pin/constraints-rocm-torch.txt # orbax-checkpoint @@ -341,11 +288,11 @@ pytinyrenderer==0.0.14 # via # -c .pin/../.pin/constraints-rocm-torch.txt # brax -pytorch-triton-rocm==2.3.1 +pytorch-triton-rocm==3.0.0 # via # -c .pin/../.pin/constraints-rocm-torch.txt # torch -pyyaml==6.0.1 +pyyaml==6.0.2 # via # -c .pin/../.pin/constraints-rocm-torch.txt # flax @@ -374,7 +321,7 @@ six==1.16.0 # -c .pin/../.pin/constraints-rocm-torch.txt # asttokens # ml-collections -sympy==1.13.0 +sympy==1.13.2 # via # -c .pin/../.pin/constraints-rocm-torch.txt # torch @@ -382,7 +329,7 @@ tensorboardx==2.6.2.2 # via # -c .pin/../.pin/constraints-rocm-torch.txt # brax -tensorstore==0.1.63 +tensorstore==0.1.64 # via # -c .pin/../.pin/constraints-rocm-torch.txt # flax @@ -391,11 +338,11 @@ toolz==0.12.1 # via # -c .pin/../.pin/constraints-rocm-torch.txt # chex -torch==2.3.1+rocm6.0 +torch==2.4.0+rocm6.0 # via # -c .pin/../.pin/constraints-rocm-torch.txt # -r benchmarks/brax/requirements.in -trimesh==4.4.3 +trimesh==4.4.7 # via # -c .pin/../.pin/constraints-rocm-torch.txt # brax @@ -423,7 +370,7 @@ werkzeug==3.0.3 # via # -c .pin/../.pin/constraints-rocm-torch.txt # flask -zipp==3.19.2 +zipp==3.20.0 # via # -c .pin/../.pin/constraints-rocm-torch.txt # etils diff --git a/benchmarks/brax/requirements.xpu.txt b/benchmarks/brax/requirements.xpu.txt index 7a2405d7d..5e7dbe294 100644 --- a/benchmarks/brax/requirements.xpu.txt +++ b/benchmarks/brax/requirements.xpu.txt @@ -4,10 +4,8 @@ # # pip-compile --output-file=benchmarks/brax/requirements.xpu.txt .pin/tmp-constraints-xpu-brax.txt benchmarks/brax/requirements.in # ---extra-index-url https://pypi.ngc.nvidia.com +--extra-index-url https://download.pytorch.org/whl/cpu --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/us/ ---find-links https://storage.googleapis.com/jax-releases/jax_cuda_releases.html ---trusted-host pypi.ngc.nvidia.com absl-py==2.1.0 # via @@ -107,7 +105,7 @@ glfw==2.7.0 # via # -c .pin/../.pin/constraints-xpu-torch.txt # mujoco -grpcio==1.65.1 +grpcio==1.65.4 # via # -c .pin/../.pin/constraints-xpu-torch.txt # brax @@ -127,7 +125,7 @@ itsdangerous==2.2.0 # via # -c .pin/../.pin/constraints-xpu-torch.txt # flask -jax[cuda12]==0.4.30 +jax==0.4.31 # via # -c .pin/../.pin/constraints-xpu-torch.txt # -r benchmarks/brax/requirements.in @@ -138,15 +136,7 @@ jax[cuda12]==0.4.30 # mujoco-mjx # optax # orbax-checkpoint -jax-cuda12-pjrt==0.4.30 - # via - # -c .pin/../.pin/constraints-xpu-torch.txt - # jax-cuda12-plugin -jax-cuda12-plugin[with-cuda]==0.4.30 - # via - # -c .pin/../.pin/constraints-xpu-torch.txt - # jax -jaxlib==0.4.30 +jaxlib==0.4.31 # via # -c .pin/../.pin/constraints-xpu-torch.txt # brax @@ -198,12 +188,12 @@ msgpack==1.0.8 # -c .pin/../.pin/constraints-xpu-torch.txt # flax # orbax-checkpoint -mujoco==3.2.0 +mujoco==3.2.2 # via # -c .pin/../.pin/constraints-xpu-torch.txt # brax # mujoco-mjx 
-mujoco-mjx==3.2.0 +mujoco-mjx==3.2.2 # via # -c .pin/../.pin/constraints-xpu-torch.txt # brax @@ -235,52 +225,6 @@ numpy==1.26.4 # tensorboardx # tensorstore # trimesh -nvidia-cublas-cu12==12.5.3.2 - # via - # -c .pin/../.pin/constraints-xpu-torch.txt - # jax-cuda12-plugin - # nvidia-cudnn-cu12 - # nvidia-cusolver-cu12 -nvidia-cuda-cupti-cu12==12.5.82 - # via - # -c .pin/../.pin/constraints-xpu-torch.txt - # jax-cuda12-plugin -nvidia-cuda-nvcc-cu12==12.5.82 - # via - # -c .pin/../.pin/constraints-xpu-torch.txt - # jax-cuda12-plugin -nvidia-cuda-runtime-cu12==12.5.82 - # via - # -c .pin/../.pin/constraints-xpu-torch.txt - # jax-cuda12-plugin -nvidia-cudnn-cu12==9.2.1.18 - # via - # -c .pin/../.pin/constraints-xpu-torch.txt - # jax-cuda12-plugin -nvidia-cufft-cu12==11.2.3.61 - # via - # -c .pin/../.pin/constraints-xpu-torch.txt - # jax-cuda12-plugin -nvidia-cusolver-cu12==11.6.3.83 - # via - # -c .pin/../.pin/constraints-xpu-torch.txt - # jax-cuda12-plugin -nvidia-cusparse-cu12==12.5.1.3 - # via - # -c .pin/../.pin/constraints-xpu-torch.txt - # jax-cuda12-plugin - # nvidia-cusolver-cu12 -nvidia-nccl-cu12==2.22.3 - # via - # -c .pin/../.pin/constraints-xpu-torch.txt - # jax-cuda12-plugin -nvidia-nvjitlink-cu12==12.5.82 - # via - # -c .pin/../.pin/constraints-xpu-torch.txt - # jax-cuda12-plugin - # nvidia-cufft-cu12 - # nvidia-cusolver-cu12 - # nvidia-cusparse-cu12 omegaconf==2.3.0 # via # -c .pin/../.pin/constraints-xpu-torch.txt @@ -294,12 +238,12 @@ optax==0.2.3 # -c .pin/../.pin/constraints-xpu-torch.txt # brax # flax -orbax-checkpoint==0.5.21 +orbax-checkpoint==0.5.23 # via # -c .pin/../.pin/constraints-xpu-torch.txt # brax # flax -ovld==0.3.5 +ovld==0.3.8 # via # -c .pin/../.pin/constraints-xpu-torch.txt # voir @@ -311,7 +255,7 @@ pillow==10.4.0 # via # -c .pin/../.pin/constraints-xpu-torch.txt # brax -protobuf==4.25.3 +protobuf==5.27.3 # via # -c .pin/../.pin/constraints-xpu-torch.txt # orbax-checkpoint @@ -340,7 +284,7 @@ pytinyrenderer==0.0.14 # via # -c .pin/../.pin/constraints-xpu-torch.txt # brax -pyyaml==6.0.1 +pyyaml==6.0.2 # via # -c .pin/../.pin/constraints-xpu-torch.txt # flax @@ -356,7 +300,7 @@ rich==13.7.1 # -c .pin/../.pin/constraints-xpu-torch.txt # flax # voir -scipy==1.14.0 +scipy==1.11.4 # via # -c .pin/../.pin/constraints-xpu-torch.txt # brax @@ -369,7 +313,7 @@ six==1.16.0 # -c .pin/../.pin/constraints-xpu-torch.txt # asttokens # ml-collections -sympy==1.13.0 +sympy==1.13.1 # via # -c .pin/../.pin/constraints-xpu-torch.txt # torch @@ -377,7 +321,7 @@ tensorboardx==2.6.2.2 # via # -c .pin/../.pin/constraints-xpu-torch.txt # brax -tensorstore==0.1.63 +tensorstore==0.1.64 # via # -c .pin/../.pin/constraints-xpu-torch.txt # flax @@ -386,12 +330,13 @@ toolz==0.12.1 # via # -c .pin/../.pin/constraints-xpu-torch.txt # chex -torch==2.1.0.post2+cxx11.abi +torch==2.4.0+cpu # via # -c .pin/../.pin/constraints-xpu-torch.txt + # -c .pin/../constraints/extra/torch.xpu.txt # -c .pin/../constraints/xpu.txt # -r benchmarks/brax/requirements.in -trimesh==4.4.3 +trimesh==4.4.4 # via # -c .pin/../.pin/constraints-xpu-torch.txt # brax @@ -413,6 +358,7 @@ varname==0.10.0 voir==0.2.19 # via # -c .pin/../.pin/constraints-xpu-torch.txt + # -c .pin/../constraints/extra/torch.xpu.txt # -c .pin/../constraints/xpu.txt # -r benchmarks/brax/requirements.in werkzeug==3.0.3 diff --git a/benchmarks/diffusion/main.py b/benchmarks/diffusion/main.py index bd6668dab..2b4fe9bfd 100644 --- a/benchmarks/diffusion/main.py +++ b/benchmarks/diffusion/main.py @@ -4,8 +4,6 @@ import math import random -from 
contextlib import nullcontext -from pathlib import Path import numpy as np import torch @@ -14,7 +12,6 @@ from accelerate import Accelerator from datasets import load_dataset from torchvision import transforms -from tqdm.auto import tqdm from transformers import CLIPTextModel, CLIPTokenizer from diffusers import AutoencoderKL, DDPMScheduler, UNet2DConditionModel @@ -44,6 +41,7 @@ class Arguments: lr_scheduler: str = "constant" lr_warmup_steps: int = 500 epochs: int = 10 + cache: str = None def models(accelerator, args: Arguments): diff --git a/benchmarks/diffusion/prepare.py b/benchmarks/diffusion/prepare.py index be7de0312..ed9e3f333 100755 --- a/benchmarks/diffusion/prepare.py +++ b/benchmarks/diffusion/prepare.py @@ -2,10 +2,6 @@ from dataclasses import dataclass import os -from transformers import CLIPTextModel, CLIPTokenizer - -from diffusers import AutoencoderKL, UNet2DConditionModel, DDPMScheduler -from datasets import load_dataset @dataclass @@ -14,6 +10,7 @@ class TrainingConfig: dataset: str = "lambdalabs/naruto-blip-captions" revision: str = None variant: str = None + cache: str = None def main(): @@ -22,6 +19,16 @@ def main(): parser = ArgumentParser() parser.add_arguments(TrainingConfig) args, _ = parser.parse_known_args() + # -- + + if args.cache: + os.environ["XDG_CACHE_HOME"] = str(args.cache) + + # -- + from transformers import CLIPTextModel, CLIPTokenizer + from diffusers import AutoencoderKL, UNet2DConditionModel, DDPMScheduler + from datasets import load_dataset + _ = load_dataset(args.dataset) diff --git a/benchmarks/diffusion/requirements.cuda.txt b/benchmarks/diffusion/requirements.cuda.txt index 250051be7..74ce5bd0a 100644 --- a/benchmarks/diffusion/requirements.cuda.txt +++ b/benchmarks/diffusion/requirements.cuda.txt @@ -14,11 +14,11 @@ accelerate==0.33.0 # -c .pin/../.pin/constraints-cuda-torch.txt # -r benchmarks/diffusion/requirements.in # diffusers -aiohappyeyeballs==2.3.4 +aiohappyeyeballs==2.4.0 # via # -c .pin/../.pin/constraints-cuda-torch.txt # aiohttp -aiohttp==3.10.0 +aiohttp==3.10.5 # via # -c .pin/../.pin/constraints-cuda-torch.txt # datasets @@ -43,7 +43,7 @@ async-timeout==4.0.3 # via # -c .pin/../.pin/constraints-cuda-torch.txt # aiohttp -attrs==23.2.0 +attrs==24.2.0 # via # -c .pin/../.pin/constraints-cuda-torch.txt # aiohttp @@ -59,11 +59,11 @@ codefind==0.1.6 # via # -c .pin/../.pin/constraints-cuda-torch.txt # ptera -datasets==2.20.0 +datasets==2.21.0 # via # -c .pin/../.pin/constraints-cuda-torch.txt # -r benchmarks/diffusion/requirements.in -diffusers[torch]==0.29.2 +diffusers[torch]==0.30.1 # via # -c .pin/../.pin/constraints-cuda-torch.txt # -r benchmarks/diffusion/requirements.in @@ -90,7 +90,7 @@ frozenlist==1.4.1 # -c .pin/../.pin/constraints-cuda-torch.txt # aiohttp # aiosignal -fsspec[http]==2024.5.0 +fsspec[http]==2024.6.1 # via # -c .pin/../.pin/constraints-cuda-torch.txt # datasets @@ -105,7 +105,7 @@ hjson==3.1.0 # via # -c .pin/../.pin/constraints-cuda-torch.txt # argklass -huggingface-hub==0.24.5 +huggingface-hub==0.24.6 # via # -c .pin/../.pin/constraints-cuda-torch.txt # accelerate @@ -113,16 +113,16 @@ huggingface-hub==0.24.5 # diffusers # tokenizers # transformers -idna==3.7 +idna==3.8 # via # -c .pin/../.pin/constraints-cuda-torch.txt # requests # yarl -importlib-metadata==8.2.0 +importlib-metadata==8.4.0 # via # -c .pin/../.pin/constraints-cuda-torch.txt # diffusers -importlib-resources==6.4.0 +importlib-resources==6.4.4 # via # -c .pin/../.pin/constraints-cuda-torch.txt # argklass @@ -208,6 +208,10 @@ 
nvidia-cusparse-cu12==12.1.0.106 # -c .pin/../.pin/constraints-cuda-torch.txt # nvidia-cusolver-cu12 # torch +nvidia-ml-py==12.560.30 + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # voir nvidia-nccl-cu12==2.20.5 # via # -c .pin/../.pin/constraints-cuda-torch.txt @@ -225,7 +229,7 @@ omegaconf==2.3.0 # via # -c .pin/../.pin/constraints-cuda-torch.txt # voir -ovld==0.3.6 +ovld==0.3.9 # via # -c .pin/../.pin/constraints-cuda-torch.txt # voir @@ -258,18 +262,10 @@ pyarrow==17.0.0 # via # -c .pin/../.pin/constraints-cuda-torch.txt # datasets -pyarrow-hotfix==0.6 - # via - # -c .pin/../.pin/constraints-cuda-torch.txt - # datasets pygments==2.18.0 # via # -c .pin/../.pin/constraints-cuda-torch.txt # rich -pynvml==11.5.3 - # via - # -c .pin/../.pin/constraints-cuda-torch.txt - # voir python-dateutil==2.9.0.post0 # via # -c .pin/../.pin/constraints-cuda-torch.txt @@ -278,7 +274,7 @@ pytz==2024.1 # via # -c .pin/../.pin/constraints-cuda-torch.txt # pandas -pyyaml==6.0.1 +pyyaml==6.0.2 # via # -c .pin/../.pin/constraints-cuda-torch.txt # accelerate @@ -302,11 +298,11 @@ requests==2.32.3 # diffusers # huggingface-hub # transformers -rich==13.7.1 +rich==13.8.0 # via # -c .pin/../.pin/constraints-cuda-torch.txt # voir -safetensors==0.4.3 +safetensors==0.4.4 # via # -c .pin/../.pin/constraints-cuda-torch.txt # accelerate @@ -317,7 +313,7 @@ six==1.16.0 # -c .pin/../.pin/constraints-cuda-torch.txt # asttokens # python-dateutil -sympy==1.13.1 +sympy==1.13.2 # via # -c .pin/../.pin/constraints-cuda-torch.txt # torch @@ -335,14 +331,14 @@ torchvision==0.19.0+cu121 # via # -c .pin/../.pin/constraints-cuda-torch.txt # -r benchmarks/diffusion/requirements.in -tqdm==4.66.4 +tqdm==4.66.5 # via # -c .pin/../.pin/constraints-cuda-torch.txt # -r benchmarks/diffusion/requirements.in # datasets # huggingface-hub # transformers -transformers==4.43.3 +transformers==4.44.2 # via # -c .pin/../.pin/constraints-cuda-torch.txt # -r benchmarks/diffusion/requirements.in @@ -373,7 +369,7 @@ voir==0.2.19 # -c .pin/../.pin/constraints-cuda-torch.txt # -c .pin/../constraints/cuda.txt # -r benchmarks/diffusion/requirements.in -xxhash==3.4.1 +xxhash==3.5.0 # via # -c .pin/../.pin/constraints-cuda-torch.txt # datasets @@ -381,7 +377,7 @@ yarl==1.9.4 # via # -c .pin/../.pin/constraints-cuda-torch.txt # aiohttp -zipp==3.19.2 +zipp==3.20.1 # via # -c .pin/../.pin/constraints-cuda-torch.txt # importlib-metadata diff --git a/benchmarks/diffusion/requirements.rocm.txt b/benchmarks/diffusion/requirements.rocm.txt new file mode 100644 index 000000000..5d0fd6e3f --- /dev/null +++ b/benchmarks/diffusion/requirements.rocm.txt @@ -0,0 +1,328 @@ +# +# This file is autogenerated by pip-compile with Python 3.10 +# by the following command: +# +# pip-compile --output-file=benchmarks/diffusion/requirements.rocm.txt .pin/tmp-constraints-rocm-diffusion-nodes.txt benchmarks/diffusion/requirements.in +# +--extra-index-url https://download.pytorch.org/whl/rocm6.0 + +accelerate==0.33.0 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # -r benchmarks/diffusion/requirements.in + # diffusers +aiohappyeyeballs==2.4.0 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # aiohttp +aiohttp==3.10.5 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # datasets + # fsspec +aiosignal==1.3.1 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # aiohttp +antlr4-python3-runtime==4.9.3 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # omegaconf +argklass==1.4.4 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # -r 
benchmarks/diffusion/requirements.in +asttokens==2.4.1 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # giving +async-timeout==4.0.3 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # aiohttp +attrs==24.2.0 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # aiohttp +certifi==2024.7.4 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # requests +charset-normalizer==3.3.2 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # requests +codefind==0.1.6 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # ptera +datasets==2.21.0 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # -r benchmarks/diffusion/requirements.in +diffusers[torch]==0.30.0 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # -r benchmarks/diffusion/requirements.in +dill==0.3.8 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # datasets + # multiprocess +executing==1.2.0 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # varname +filelock==3.15.4 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # datasets + # diffusers + # huggingface-hub + # pytorch-triton-rocm + # torch + # transformers +frozenlist==1.4.1 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # aiohttp + # aiosignal +fsspec[http]==2024.6.1 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # datasets + # huggingface-hub + # torch +giving==0.4.2 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # ptera + # voir +hjson==3.1.0 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # argklass +huggingface-hub==0.24.6 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # accelerate + # datasets + # diffusers + # tokenizers + # transformers +idna==3.7 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # requests + # yarl +importlib-metadata==8.4.0 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # diffusers +importlib-resources==6.4.3 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # argklass +jinja2==3.1.4 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # torch +markdown-it-py==3.0.0 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # rich +markupsafe==2.1.5 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # jinja2 +mdurl==0.1.2 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # markdown-it-py +mpmath==1.3.0 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # sympy +multidict==6.0.5 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # aiohttp + # yarl +multiprocess==0.70.16 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # datasets +networkx==3.3 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # torch +numpy==1.26.4 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # accelerate + # datasets + # diffusers + # pandas + # pyarrow + # torchvision + # transformers +omegaconf==2.3.0 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # voir +ovld==0.3.8 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # voir +packaging==24.1 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # accelerate + # datasets + # huggingface-hub + # transformers +pandas==2.2.2 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # datasets +pillow==10.4.0 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # diffusers + # torchvision +psutil==5.9.8 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # accelerate + # voir +ptera==1.4.1 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # voir +pyarrow==17.0.0 + # via + # -c 
.pin/../.pin/constraints-rocm-torch.txt + # datasets +pygments==2.18.0 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # rich +pynvml==11.5.3 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # voir +python-dateutil==2.9.0.post0 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # pandas +pytorch-triton-rocm==3.0.0 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # torch +pytz==2024.1 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # pandas +pyyaml==6.0.2 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # accelerate + # datasets + # huggingface-hub + # omegaconf + # transformers +reactivex==4.0.4 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # giving +regex==2024.7.24 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # diffusers + # transformers +requests==2.32.3 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # datasets + # diffusers + # huggingface-hub + # transformers +rich==13.7.1 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # voir +safetensors==0.4.4 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # accelerate + # diffusers + # transformers +six==1.16.0 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # asttokens + # python-dateutil +sympy==1.13.2 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # torch +tokenizers==0.19.1 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # transformers +torch==2.4.0+rocm6.0 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # accelerate + # diffusers + # torchvision +torchvision==0.19.0+rocm6.0 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # -r benchmarks/diffusion/requirements.in +tqdm==4.66.5 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # -r benchmarks/diffusion/requirements.in + # datasets + # huggingface-hub + # transformers +transformers==4.44.1 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # -r benchmarks/diffusion/requirements.in +typing-extensions==4.12.2 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # huggingface-hub + # reactivex + # torch +tzdata==2024.1 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # pandas +urllib3==2.2.2 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # requests +varname==0.10.0 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # giving +voir==0.2.17 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # -c .pin/../constraints/rocm.txt + # -r benchmarks/diffusion/requirements.in +xxhash==3.5.0 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # datasets +yarl==1.9.4 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # aiohttp +zipp==3.20.0 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # importlib-metadata diff --git a/benchmarks/diffusion/requirements.xpu.txt b/benchmarks/diffusion/requirements.xpu.txt new file mode 100644 index 000000000..62a1aba1e --- /dev/null +++ b/benchmarks/diffusion/requirements.xpu.txt @@ -0,0 +1,333 @@ +# +# This file is autogenerated by pip-compile with Python 3.10 +# by the following command: +# +# pip-compile --output-file=benchmarks/diffusion/requirements.xpu.txt .pin/tmp-constraints-xpu-diffusion-nodes.txt benchmarks/diffusion/requirements.in +# +--extra-index-url https://download.pytorch.org/whl/cpu +--extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/us/ + +accelerate==0.33.0 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # -r benchmarks/diffusion/requirements.in + # diffusers +aiohappyeyeballs==2.3.5 + # via + # -c 
.pin/../.pin/constraints-xpu-torch.txt + # aiohttp +aiohttp==3.10.2 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # datasets + # fsspec +aiosignal==1.3.1 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # aiohttp +antlr4-python3-runtime==4.9.3 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # omegaconf +argklass==1.4.4 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # -r benchmarks/diffusion/requirements.in +asttokens==2.4.1 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # giving +async-timeout==4.0.3 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # aiohttp +attrs==24.2.0 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # aiohttp +certifi==2024.7.4 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # requests +charset-normalizer==3.3.2 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # requests +codefind==0.1.6 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # ptera +datasets==2.20.0 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # -r benchmarks/diffusion/requirements.in +diffusers[torch]==0.30.0 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # -r benchmarks/diffusion/requirements.in +dill==0.3.8 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # datasets + # multiprocess +executing==1.2.0 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # varname +filelock==3.15.4 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # datasets + # diffusers + # huggingface-hub + # torch + # transformers +frozenlist==1.4.1 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # aiohttp + # aiosignal +fsspec[http]==2024.5.0 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # datasets + # huggingface-hub + # torch +giving==0.4.2 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # ptera + # voir +hjson==3.1.0 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # argklass +huggingface-hub==0.24.5 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # accelerate + # datasets + # diffusers + # tokenizers + # transformers +idna==3.7 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # requests + # yarl +importlib-metadata==8.2.0 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # diffusers +importlib-resources==6.4.0 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # argklass +jinja2==3.1.4 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # torch +markdown-it-py==3.0.0 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # rich +markupsafe==2.1.5 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # jinja2 +mdurl==0.1.2 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # markdown-it-py +mpmath==1.3.0 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # sympy +multidict==6.0.5 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # aiohttp + # yarl +multiprocess==0.70.16 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # datasets +networkx==3.3 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # torch +numpy==1.26.4 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # accelerate + # datasets + # diffusers + # pandas + # pyarrow + # torchvision + # transformers +omegaconf==2.3.0 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # voir +ovld==0.3.8 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # voir +packaging==24.1 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # accelerate + # datasets + # huggingface-hub + # transformers +pandas==2.2.2 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # 
datasets +pillow==10.4.0 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # diffusers + # torchvision +psutil==5.9.8 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # accelerate + # voir +ptera==1.4.1 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # voir +pyarrow==17.0.0 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # datasets +pyarrow-hotfix==0.6 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # datasets +pygments==2.18.0 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # rich +pynvml==11.5.3 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # voir +python-dateutil==2.9.0.post0 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # pandas +pytz==2024.1 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # pandas +pyyaml==6.0.2 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # accelerate + # datasets + # huggingface-hub + # omegaconf + # transformers +reactivex==4.0.4 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # giving +regex==2024.7.24 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # diffusers + # transformers +requests==2.32.3 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # datasets + # diffusers + # huggingface-hub + # transformers +rich==13.7.1 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # voir +safetensors==0.4.4 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # accelerate + # diffusers + # transformers +six==1.16.0 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # asttokens + # python-dateutil +sympy==1.13.1 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # torch +tokenizers==0.19.1 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # transformers +torch==2.4.0+cpu + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # -c .pin/../constraints/extra/torch.xpu.txt + # -c .pin/../constraints/xpu.txt + # accelerate + # diffusers + # torchvision +torchvision==0.19.0+cpu + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # -c .pin/../constraints/extra/torch.xpu.txt + # -c .pin/../constraints/xpu.txt + # -r benchmarks/diffusion/requirements.in +tqdm==4.66.5 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # -r benchmarks/diffusion/requirements.in + # datasets + # huggingface-hub + # transformers +transformers==4.44.0 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # -r benchmarks/diffusion/requirements.in +typing-extensions==4.12.2 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # huggingface-hub + # reactivex + # torch +tzdata==2024.1 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # pandas +urllib3==2.2.2 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # requests +varname==0.10.0 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # giving +voir==0.2.17 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # -c .pin/../constraints/extra/torch.xpu.txt + # -c .pin/../constraints/xpu.txt + # -r benchmarks/diffusion/requirements.in +xxhash==3.4.1 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # datasets +yarl==1.9.4 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # aiohttp +zipp==3.19.2 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # importlib-metadata diff --git a/benchmarks/dinov2/benchfile.py b/benchmarks/dinov2/benchfile.py index ddfc4bc06..214a013f8 100644 --- a/benchmarks/dinov2/benchfile.py +++ b/benchmarks/dinov2/benchfile.py @@ -3,8 +3,8 @@ SOURCE_DIR = "src" -REPO_URL = "https://github.com/facebookresearch/dinov2" -BRANCH = 
"e1277af2ba9496fbadf7aec6eba56e8d882d1e35" +REPO_URL = "https://github.com/Delaunay/dinov2" +BRANCH = "451bc15a084f42cc97c21e3bc0be9e9158f9049c" class Dinov2(Package): @@ -28,7 +28,8 @@ def working_directory(self): def make_env(self): # Return a dict of environment variables for prepare_script and # main_script. - return super().make_env() + env = super().make_env() + return env async def install(self): await super().install() diff --git a/benchmarks/dinov2/requirements.cuda.txt b/benchmarks/dinov2/requirements.cuda.txt index 4a29be4a7..16e579e9d 100644 --- a/benchmarks/dinov2/requirements.cuda.txt +++ b/benchmarks/dinov2/requirements.cuda.txt @@ -2,7 +2,7 @@ # This file is autogenerated by pip-compile with Python 3.10 # by the following command: # -# pip-compile --output-file=benchmarks/dinov2/requirements.cuda.txt .pin/tmp-constraints-cuda-dinov2-giant-nodes.txt benchmarks/dinov2/requirements.in +# pip-compile --output-file=benchmarks/dinov2/requirements.cuda.txt .pin/tmp-constraints-cuda-dinov2-giant-gpus.txt benchmarks/dinov2/requirements.in # --extra-index-url https://pypi.ngc.nvidia.com --extra-index-url https://download.pytorch.org/whl/cu121 @@ -34,7 +34,7 @@ filelock==3.15.4 # -c .pin/../.pin/constraints-cuda-torch.txt # torch # triton -fsspec==2024.5.0 +fsspec==2024.6.1 # via # -c .pin/../.pin/constraints-cuda-torch.txt # torch @@ -127,6 +127,10 @@ nvidia-cusparse-cu12==12.1.0.106 # -c .pin/../.pin/constraints-cuda-torch.txt # nvidia-cusolver-cu12 # torch +nvidia-ml-py==12.560.30 + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # voir nvidia-nccl-cu12==2.20.5 # via # -c .pin/../.pin/constraints-cuda-torch.txt @@ -145,7 +149,7 @@ omegaconf==2.3.0 # -c .pin/../.pin/constraints-cuda-torch.txt # -r benchmarks/dinov2/requirements.in # voir -ovld==0.3.6 +ovld==0.3.9 # via # -c .pin/../.pin/constraints-cuda-torch.txt # voir @@ -175,11 +179,7 @@ pygments==2.18.0 # via # -c .pin/../.pin/constraints-cuda-torch.txt # rich -pynvml==11.5.3 - # via - # -c .pin/../.pin/constraints-cuda-torch.txt - # voir -pyyaml==6.0.1 +pyyaml==6.0.2 # via # -c .pin/../.pin/constraints-cuda-torch.txt # fvcore @@ -189,11 +189,11 @@ reactivex==4.0.4 # via # -c .pin/../.pin/constraints-cuda-torch.txt # giving -rich==13.7.1 +rich==13.8.0 # via # -c .pin/../.pin/constraints-cuda-torch.txt # voir -scipy==1.14.0 +scipy==1.14.1 # via # -c .pin/../.pin/constraints-cuda-torch.txt # -r benchmarks/dinov2/requirements.in @@ -205,7 +205,7 @@ submitit==1.5.1 # via # -c .pin/../.pin/constraints-cuda-torch.txt # -r benchmarks/dinov2/requirements.in -sympy==1.13.1 +sympy==1.13.2 # via # -c .pin/../.pin/constraints-cuda-torch.txt # torch @@ -224,7 +224,7 @@ torch==2.4.0+cu121 # torchmetrics # torchvision # xformers -torchmetrics==1.4.0.post0 +torchmetrics==1.4.1 # via # -c .pin/../.pin/constraints-cuda-torch.txt # -r benchmarks/dinov2/requirements.in @@ -232,7 +232,7 @@ torchvision==0.19.0+cu121 # via # -c .pin/../.pin/constraints-cuda-torch.txt # -r benchmarks/dinov2/requirements.in -tqdm==4.66.4 +tqdm==4.66.5 # via # -c .pin/../.pin/constraints-cuda-torch.txt # fvcore diff --git a/benchmarks/dinov2/requirements.rocm.txt b/benchmarks/dinov2/requirements.rocm.txt new file mode 100644 index 000000000..c46ba9819 --- /dev/null +++ b/benchmarks/dinov2/requirements.rocm.txt @@ -0,0 +1,216 @@ +# +# This file is autogenerated by pip-compile with Python 3.10 +# by the following command: +# +# pip-compile --output-file=benchmarks/dinov2/requirements.rocm.txt .pin/tmp-constraints-rocm-dinov2-giant-gpus.txt 
benchmarks/dinov2/requirements.in +# +--extra-index-url https://download.pytorch.org/whl/rocm6.0 + +antlr4-python3-runtime==4.9.3 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # omegaconf +asttokens==2.4.1 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # giving +cloudpickle==3.0.0 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # submitit +codefind==0.1.6 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # ptera +executing==1.2.0 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # varname +filelock==3.15.4 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # pytorch-triton-rocm + # torch +fsspec==2024.6.1 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # torch +fvcore==0.1.5.post20221221 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # -r benchmarks/dinov2/requirements.in +giving==0.4.2 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # ptera + # voir +iopath==0.1.10 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # -r benchmarks/dinov2/requirements.in + # fvcore +jinja2==3.1.4 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # torch +lightning-utilities==0.11.6 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # torchmetrics +markdown-it-py==3.0.0 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # rich +markupsafe==2.1.5 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # jinja2 +mdurl==0.1.2 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # markdown-it-py +mpmath==1.3.0 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # sympy +networkx==3.3 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # torch +numpy==1.26.4 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # fvcore + # scipy + # torchmetrics + # torchvision + # xformers +omegaconf==2.3.0 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # -r benchmarks/dinov2/requirements.in + # voir +ovld==0.3.8 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # voir +packaging==24.1 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # lightning-utilities + # torchmetrics +pillow==10.4.0 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # fvcore + # torchvision +portalocker==2.10.1 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # iopath +psutil==5.9.8 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # voir +ptera==1.4.1 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # voir +pygments==2.18.0 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # rich +pynvml==11.5.3 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # voir +pytorch-triton-rocm==3.0.0 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # torch +pyyaml==6.0.2 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # fvcore + # omegaconf + # yacs +reactivex==4.0.4 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # giving +rich==13.7.1 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # voir +scipy==1.14.0 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # -r benchmarks/dinov2/requirements.in +six==1.16.0 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # asttokens +submitit==1.5.1 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # -r benchmarks/dinov2/requirements.in +sympy==1.13.2 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # torch +tabulate==0.9.0 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # fvcore +termcolor==2.4.0 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # fvcore +torch==2.4.0+rocm6.0 + 
# via + # -c .pin/../.pin/constraints-rocm-torch.txt + # -r benchmarks/dinov2/requirements.in + # torchmetrics + # torchvision + # xformers +torchmetrics==1.4.1 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # -r benchmarks/dinov2/requirements.in +torchvision==0.19.0+rocm6.0 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # -r benchmarks/dinov2/requirements.in +tqdm==4.66.5 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # fvcore + # iopath +typing-extensions==4.12.2 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # iopath + # lightning-utilities + # reactivex + # submitit + # torch +varname==0.10.0 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # giving +voir==0.2.17 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # -c .pin/../constraints/rocm.txt + # -r benchmarks/dinov2/requirements.in +xformers==0.0.27.post2 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # -r benchmarks/dinov2/requirements.in +yacs==0.1.8 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # fvcore + +# The following packages are considered to be unsafe in a requirements file: +# setuptools diff --git a/benchmarks/dinov2/requirements.xpu.txt b/benchmarks/dinov2/requirements.xpu.txt new file mode 100644 index 000000000..032296c6f --- /dev/null +++ b/benchmarks/dinov2/requirements.xpu.txt @@ -0,0 +1,217 @@ +# +# This file is autogenerated by pip-compile with Python 3.10 +# by the following command: +# +# pip-compile --output-file=benchmarks/dinov2/requirements.xpu.txt .pin/tmp-constraints-xpu-dinov2-giant-nodes.txt benchmarks/dinov2/requirements.in +# +--extra-index-url https://download.pytorch.org/whl/cpu +--extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/us/ + +antlr4-python3-runtime==4.9.3 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # omegaconf +asttokens==2.4.1 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # giving +cloudpickle==3.0.0 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # submitit +codefind==0.1.6 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # ptera +executing==1.2.0 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # varname +filelock==3.15.4 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # torch +fsspec==2024.5.0 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # torch +fvcore==0.1.5.post20221221 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # -r benchmarks/dinov2/requirements.in +giving==0.4.2 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # ptera + # voir +iopath==0.1.10 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # -r benchmarks/dinov2/requirements.in + # fvcore +jinja2==3.1.4 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # torch +lightning-utilities==0.11.6 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # torchmetrics +markdown-it-py==3.0.0 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # rich +markupsafe==2.1.5 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # jinja2 +mdurl==0.1.2 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # markdown-it-py +mpmath==1.3.0 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # sympy +networkx==3.3 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # torch +numpy==1.26.4 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # fvcore + # scipy + # torchmetrics + # torchvision + # xformers +omegaconf==2.3.0 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # -r benchmarks/dinov2/requirements.in + # voir +ovld==0.3.8 + 
# via + # -c .pin/../.pin/constraints-xpu-torch.txt + # voir +packaging==24.1 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # lightning-utilities + # torchmetrics +pillow==10.4.0 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # fvcore + # torchvision +portalocker==2.10.1 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # iopath +psutil==5.9.8 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # voir +ptera==1.4.1 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # voir +pygments==2.18.0 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # rich +pynvml==11.5.3 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # voir +pyyaml==6.0.2 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # fvcore + # omegaconf + # yacs +reactivex==4.0.4 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # giving +rich==13.7.1 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # voir +scipy==1.11.4 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # -r benchmarks/dinov2/requirements.in +six==1.16.0 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # asttokens +submitit==1.5.1 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # -r benchmarks/dinov2/requirements.in +sympy==1.13.1 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # torch +tabulate==0.9.0 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # fvcore +termcolor==2.4.0 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # fvcore +torch==2.4.0+cpu + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # -c .pin/../constraints/extra/torch.xpu.txt + # -c .pin/../constraints/xpu.txt + # -r benchmarks/dinov2/requirements.in + # torchmetrics + # torchvision + # xformers +torchmetrics==1.4.1 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # -r benchmarks/dinov2/requirements.in +torchvision==0.19.0+cpu + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # -c .pin/../constraints/extra/torch.xpu.txt + # -c .pin/../constraints/xpu.txt + # -r benchmarks/dinov2/requirements.in +tqdm==4.66.5 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # fvcore + # iopath +typing-extensions==4.12.2 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # iopath + # lightning-utilities + # reactivex + # submitit + # torch +varname==0.10.0 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # giving +voir==0.2.17 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # -c .pin/../constraints/extra/torch.xpu.txt + # -c .pin/../constraints/xpu.txt + # -r benchmarks/dinov2/requirements.in +xformers==0.0.27.post2 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # -r benchmarks/dinov2/requirements.in +yacs==0.1.8 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # fvcore + +# The following packages are considered to be unsafe in a requirements file: +# setuptools diff --git a/benchmarks/dinov2/voirfile.py b/benchmarks/dinov2/voirfile.py index f358914dc..fdc616b83 100644 --- a/benchmarks/dinov2/voirfile.py +++ b/benchmarks/dinov2/voirfile.py @@ -26,11 +26,30 @@ class Config: gpu_poll: int = 3 +def populate_slurm(): + import json + import os + + config = json.loads(os.environ["MILABENCH_CONFIG"]) + + nodes = [n["name"] for n in config["system"]["nodes"]] + + env = { + "SLURM_JOB_ID": "123", + "SLURM_JOB_NUM_NODES": "2", + "SLURM_JOB_NODELIST": ",".join(nodes), + "SLURM_NTASKS": str(len(config["system"]["nodes"])), + "SLURM_PROCID": "2", # RANK + "SLURM_LOCALID": "1", # Local RANK + } + + @configurable def instrument_main(ov, options: Config): + import os + 
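+    # NOTE: populate_slurm() above builds the env dict of mock SLURM
+    # variables but never applies or returns it, so as written calling it
+    # has no effect; it would presumably need something like
+    # os.environ.update(env) (a hypothetical fix, not part of this patch)
+    # before the mocked variables become visible to SLURM detection.
+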
yield ov.phases.init - import os import sys sys.path.append(os.path.dirname(__file__) + "/src/") diff --git a/benchmarks/flops/activator b/benchmarks/flops/activator deleted file mode 100755 index 083c28cb1..000000000 --- a/benchmarks/flops/activator +++ /dev/null @@ -1,7 +0,0 @@ -#!/bin/bash - -venv="$1" -shift - -source "$venv"/bin/activate -exec "$@" diff --git a/benchmarks/flops/requirements.cuda.txt b/benchmarks/flops/requirements.cuda.txt index 75ea2eb79..8553edece 100644 --- a/benchmarks/flops/requirements.cuda.txt +++ b/benchmarks/flops/requirements.cuda.txt @@ -30,7 +30,7 @@ filelock==3.15.4 # -c .pin/../.pin/constraints-cuda-torch.txt # torch # triton -fsspec==2024.5.0 +fsspec==2024.6.1 # via # -c .pin/../.pin/constraints-cuda-torch.txt # torch @@ -39,7 +39,7 @@ giving==0.4.2 # -c .pin/../.pin/constraints-cuda-torch.txt # ptera # voir -importlib-resources==6.4.0 +importlib-resources==6.4.4 # via # -c .pin/../.pin/constraints-cuda-torch.txt # torchcompat @@ -110,6 +110,10 @@ nvidia-cusparse-cu12==12.1.0.106 # -c .pin/../.pin/constraints-cuda-torch.txt # nvidia-cusolver-cu12 # torch +nvidia-ml-py==12.560.30 + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # voir nvidia-nccl-cu12==2.20.5 # via # -c .pin/../.pin/constraints-cuda-torch.txt @@ -127,7 +131,7 @@ omegaconf==2.3.0 # via # -c .pin/../.pin/constraints-cuda-torch.txt # voir -ovld==0.3.6 +ovld==0.3.9 # via # -c .pin/../.pin/constraints-cuda-torch.txt # voir @@ -147,11 +151,7 @@ pygments==2.18.0 # via # -c .pin/../.pin/constraints-cuda-torch.txt # rich -pynvml==11.5.3 - # via - # -c .pin/../.pin/constraints-cuda-torch.txt - # voir -pyyaml==6.0.1 +pyyaml==6.0.2 # via # -c .pin/../.pin/constraints-cuda-torch.txt # omegaconf @@ -159,7 +159,7 @@ reactivex==4.0.4 # via # -c .pin/../.pin/constraints-cuda-torch.txt # giving -rich==13.7.1 +rich==13.8.0 # via # -c .pin/../.pin/constraints-cuda-torch.txt # voir @@ -167,7 +167,7 @@ six==1.16.0 # via # -c .pin/../.pin/constraints-cuda-torch.txt # asttokens -sympy==1.13.1 +sympy==1.13.2 # via # -c .pin/../.pin/constraints-cuda-torch.txt # torch @@ -185,7 +185,7 @@ torchvision==0.19.0+cu121 # via # -c .pin/../.pin/constraints-cuda-torch.txt # -r benchmarks/flops/requirements.in -tqdm==4.66.4 +tqdm==4.66.5 # via # -c .pin/../.pin/constraints-cuda-torch.txt # -r benchmarks/flops/requirements.in diff --git a/benchmarks/flops/requirements.rocm.txt b/benchmarks/flops/requirements.rocm.txt index 953732347..d9ac15eb5 100644 --- a/benchmarks/flops/requirements.rocm.txt +++ b/benchmarks/flops/requirements.rocm.txt @@ -4,10 +4,7 @@ # # pip-compile --output-file=benchmarks/flops/requirements.rocm.txt .pin/tmp-constraints-rocm-flops.txt benchmarks/flops/requirements.in # ---extra-index-url https://pypi.ngc.nvidia.com --extra-index-url https://download.pytorch.org/whl/rocm6.0 ---find-links https://storage.googleapis.com/jax-releases/jax_cuda_releases.html ---trusted-host pypi.ngc.nvidia.com antlr4-python3-runtime==4.9.3 # via @@ -30,7 +27,7 @@ filelock==3.15.4 # -c .pin/../.pin/constraints-rocm-torch.txt # pytorch-triton-rocm # torch -fsspec==2024.5.0 +fsspec==2024.6.1 # via # -c .pin/../.pin/constraints-rocm-torch.txt # torch @@ -39,7 +36,7 @@ giving==0.4.2 # -c .pin/../.pin/constraints-rocm-torch.txt # ptera # voir -importlib-resources==6.4.0 +importlib-resources==6.4.3 # via # -c .pin/../.pin/constraints-rocm-torch.txt # torchcompat @@ -75,7 +72,7 @@ omegaconf==2.3.0 # via # -c .pin/../.pin/constraints-rocm-torch.txt # voir -ovld==0.3.5 +ovld==0.3.8 # via # -c 
.pin/../.pin/constraints-rocm-torch.txt # voir @@ -99,11 +96,11 @@ pynvml==11.5.3 # via # -c .pin/../.pin/constraints-rocm-torch.txt # voir -pytorch-triton-rocm==2.3.1 +pytorch-triton-rocm==3.0.0 # via # -c .pin/../.pin/constraints-rocm-torch.txt # torch -pyyaml==6.0.1 +pyyaml==6.0.2 # via # -c .pin/../.pin/constraints-rocm-torch.txt # omegaconf @@ -119,11 +116,11 @@ six==1.16.0 # via # -c .pin/../.pin/constraints-rocm-torch.txt # asttokens -sympy==1.13.0 +sympy==1.13.2 # via # -c .pin/../.pin/constraints-rocm-torch.txt # torch -torch==2.3.1+rocm6.0 +torch==2.4.0+rocm6.0 # via # -c .pin/../.pin/constraints-rocm-torch.txt # -r benchmarks/flops/requirements.in @@ -133,11 +130,11 @@ torchcompat==1.1.4 # -c .pin/../.pin/constraints-rocm-torch.txt # -c .pin/../constraints/rocm.txt # -r benchmarks/flops/requirements.in -torchvision==0.18.1+rocm6.0 +torchvision==0.19.0+rocm6.0 # via # -c .pin/../.pin/constraints-rocm-torch.txt # -r benchmarks/flops/requirements.in -tqdm==4.66.4 +tqdm==4.66.5 # via # -c .pin/../.pin/constraints-rocm-torch.txt # -r benchmarks/flops/requirements.in diff --git a/benchmarks/flops/requirements.xpu.txt b/benchmarks/flops/requirements.xpu.txt index ed57d25f6..087e29b9a 100644 --- a/benchmarks/flops/requirements.xpu.txt +++ b/benchmarks/flops/requirements.xpu.txt @@ -4,10 +4,8 @@ # # pip-compile --output-file=benchmarks/flops/requirements.xpu.txt .pin/tmp-constraints-xpu-flops.txt benchmarks/flops/requirements.in # ---extra-index-url https://pypi.ngc.nvidia.com +--extra-index-url https://download.pytorch.org/whl/cpu --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/us/ ---find-links https://storage.googleapis.com/jax-releases/jax_cuda_releases.html ---trusted-host pypi.ngc.nvidia.com antlr4-python3-runtime==4.9.3 # via @@ -17,14 +15,6 @@ asttokens==2.4.1 # via # -c .pin/../.pin/constraints-xpu-torch.txt # giving -certifi==2024.6.2 - # via - # -c .pin/../.pin/constraints-xpu-torch.txt - # requests -charset-normalizer==3.3.2 - # via - # -c .pin/../.pin/constraints-xpu-torch.txt - # requests codefind==0.1.6 # via # -c .pin/../.pin/constraints-xpu-torch.txt @@ -46,10 +36,6 @@ giving==0.4.2 # -c .pin/../.pin/constraints-xpu-torch.txt # ptera # voir -idna==3.7 - # via - # -c .pin/../.pin/constraints-xpu-torch.txt - # requests importlib-resources==6.4.0 # via # -c .pin/../.pin/constraints-xpu-torch.txt @@ -86,7 +72,7 @@ omegaconf==2.3.0 # via # -c .pin/../.pin/constraints-xpu-torch.txt # voir -ovld==0.3.5 +ovld==0.3.8 # via # -c .pin/../.pin/constraints-xpu-torch.txt # voir @@ -110,7 +96,7 @@ pynvml==11.5.3 # via # -c .pin/../.pin/constraints-xpu-torch.txt # voir -pyyaml==6.0.1 +pyyaml==6.0.2 # via # -c .pin/../.pin/constraints-xpu-torch.txt # omegaconf @@ -118,10 +104,6 @@ reactivex==4.0.4 # via # -c .pin/../.pin/constraints-xpu-torch.txt # giving -requests==2.32.3 - # via - # -c .pin/../.pin/constraints-xpu-torch.txt - # torchvision rich==13.7.1 # via # -c .pin/../.pin/constraints-xpu-torch.txt @@ -130,27 +112,30 @@ six==1.16.0 # via # -c .pin/../.pin/constraints-xpu-torch.txt # asttokens -sympy==1.13.0 +sympy==1.13.1 # via # -c .pin/../.pin/constraints-xpu-torch.txt # torch -torch==2.1.0.post2+cxx11.abi +torch==2.4.0+cpu # via # -c .pin/../.pin/constraints-xpu-torch.txt + # -c .pin/../constraints/extra/torch.xpu.txt # -c .pin/../constraints/xpu.txt # -r benchmarks/flops/requirements.in # torchvision torchcompat==1.1.4 # via # -c .pin/../.pin/constraints-xpu-torch.txt + # -c .pin/../constraints/extra/torch.xpu.txt # -c .pin/../constraints/xpu.txt # 
-r benchmarks/flops/requirements.in -torchvision==0.16.0.post2+cxx11.abi +torchvision==0.19.0+cpu # via # -c .pin/../.pin/constraints-xpu-torch.txt + # -c .pin/../constraints/extra/torch.xpu.txt # -c .pin/../constraints/xpu.txt # -r benchmarks/flops/requirements.in -tqdm==4.66.4 +tqdm==4.66.5 # via # -c .pin/../.pin/constraints-xpu-torch.txt # -r benchmarks/flops/requirements.in @@ -159,10 +144,6 @@ typing-extensions==4.12.2 # -c .pin/../.pin/constraints-xpu-torch.txt # reactivex # torch -urllib3==1.26.19 - # via - # -c .pin/../.pin/constraints-xpu-torch.txt - # requests varname==0.10.0 # via # -c .pin/../.pin/constraints-xpu-torch.txt @@ -170,5 +151,6 @@ varname==0.10.0 voir==0.2.19 # via # -c .pin/../.pin/constraints-xpu-torch.txt + # -c .pin/../constraints/extra/torch.xpu.txt # -c .pin/../constraints/xpu.txt # -r benchmarks/flops/requirements.in diff --git a/benchmarks/llm/main.py b/benchmarks/geo_gnn/.pin/tmp-constraints-cuda-pna.txt similarity index 100% rename from benchmarks/llm/main.py rename to benchmarks/geo_gnn/.pin/tmp-constraints-cuda-pna.txt diff --git a/benchmarks/geo_gnn/Makefile b/benchmarks/geo_gnn/Makefile new file mode 100644 index 000000000..7645407e2 --- /dev/null +++ b/benchmarks/geo_gnn/Makefile @@ -0,0 +1,35 @@ +# Use global base if possible +ifndef MILABENCH_BASE + MILABENCH_BASE="base" +endif + +export MILABENCH_BASE +export MILABENCH_GPU_ARCH=cuda + +BENCH_NAME=geo_gnn +MILABENCH_CONFIG=dev.yaml +MILABENCH_ARGS=--config $(MILABENCH_CONFIG) --base $(MILABENCH_BASE) + +all: + install prepare single gpus nodes + +install: + milabench install $(MILABENCH_ARGS) --update + +prepare: + milabench prepare $(MILABENCH_ARGS) + +tests: # install + milabench run $(MILABENCH_ARGS) --select pna + +single: + milabench run $(MILABENCH_ARGS) --select $(BENCH_NAME)-single + +gpus: + milabench run $(MILABENCH_ARGS) --select $(BENCH_NAME)-gpus + +nodes: + milabench run $(MILABENCH_ARGS) --select $(BENCH_NAME)-nodes + +pin: + milabench pin $(MILABENCH_ARGS) diff --git a/benchmarks/geo_gnn/README.md b/benchmarks/geo_gnn/README.md new file mode 100644 index 000000000..b82e3c019 --- /dev/null +++ b/benchmarks/geo_gnn/README.md @@ -0,0 +1,4 @@ + +# Dimenet + +Rewrite this README to explain what the benchmark is! diff --git a/benchmarks/geo_gnn/bench/models.py b/benchmarks/geo_gnn/bench/models.py new file mode 100644 index 000000000..0868724a0 --- /dev/null +++ b/benchmarks/geo_gnn/bench/models.py @@ -0,0 +1,68 @@ +from types import SimpleNamespace as NS + +from torch_geometric.nn.models import PNA as _PNA, DimeNet as _DimeNet + +models = {} + + +def register_model(fn): + models[fn.__name__] = fn + return fn + + +@register_model +def DimeNet(args, sample, **extras): + # The directional message passing neural network (DimeNet) from the “Directional Message Passing for Molecular Graphs” paper. + # DimeNet transforms messages based on the angle between them in a rotation-equivariant fashion. 
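+    # A minimal sketch of how this registry is consumed (model names as
+    # registered in this module; `args` and `sample` supplied by the
+    # caller, extra keywords are absorbed by **extras):
+    #
+    #     info = models["DimeNet"](args, sample=sample, degree=None)
+    #     net = info.model
+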
+ + # PCQM4Mv2Subset: Data(x=[18, 9], edge_index=[2, 40], edge_attr=[40, 3], y=3.0476751256, pos=[18, 3], smiles='Cc1ccc([C@H]2[CH]c3cnccc3[N]C2=O)cc1') + # QM9: Data(x=[5, 11], edge_index=[2, 8], edge_attr=[8, 4], y=[1, 19], pos=[5, 3], z=[5], smiles='[H]C([H])([H])[H]', name='gdb_1', idx=[1]) + try: + batch_size, out_channels = sample.y.shape + except: + out_channels = 1 + + return NS( + category="3d", + model=_DimeNet( + hidden_channels=64, + out_channels=out_channels, + num_blocks=6, + num_bilinear=8, + num_spherical=7, + num_radial=6, + cutoff=10.0, + envelope_exponent=5, + num_before_skip=1, + num_after_skip=2, + num_output_layers=3, + ), + ) + + +@register_model +def PNA(args, sample, degree): + # The Graph Neural Network from the “Principal Neighbourhood Aggregation for Graph Nets” paper, + # using the PNAConv operator for message passing. + + out_channels = 1 + if hasattr(sample.y, "shape") and len(sample.y.shape) > 1: + out_channels = sample.y.shape[-1] + + _, in_channels = sample.x.shape + + return NS( + category="2d", + model=_PNA( + # Basic GCNN setup + in_channels=in_channels, + out_channels=out_channels, + hidden_channels=64, + num_layers=64, + # https://pytorch-geometric.readthedocs.io/en/latest/generated/torch_geometric.nn.conv.PNAConv.html + aggregators=['mean', 'min', 'max', 'std'], + scalers=['identity', 'amplification', 'attenuation'], + # Histogram of in-degrees of nodes in the training set, used by scalers to normalize + deg=degree(), + ), + ) diff --git a/benchmarks/geo_gnn/benchfile.py b/benchmarks/geo_gnn/benchfile.py new file mode 100644 index 000000000..cb6565b68 --- /dev/null +++ b/benchmarks/geo_gnn/benchfile.py @@ -0,0 +1,30 @@ +from milabench.pack import Package + + +class Dimenet(Package): + # Requirements file installed by install(). It can be empty or absent. + base_requirements = ["requirements-pre.in", "requirements.in"] + + # The preparation script called by prepare(). It must be executable, + # but it can be any type of script. It can be empty or absent. + prepare_script = "prepare.py" + + # The main script called by run(). It must be a Python file. It has to + # be present. + main_script = "main.py" + + # You can remove the functions below if you don't need to modify them. + + def make_env(self): + # Return a dict of environment variables for prepare_script and + # main_script. + return super().make_env() + + async def install(self): + await super().install() # super() call installs the requirements + + async def prepare(self): + await super().prepare() # super() call executes prepare_script + + +__pack__ = Dimenet diff --git a/benchmarks/geo_gnn/dev.yaml b/benchmarks/geo_gnn/dev.yaml new file mode 100644 index 000000000..7fadaea5f --- /dev/null +++ b/benchmarks/geo_gnn/dev.yaml @@ -0,0 +1,22 @@ +dimenet: + inherits: _defaults + definition: . + install-variant: cuda + install_group: torch + plan: + method: per_gpu + argv: + --model: 'DimeNet' + --num-samples: 10000 + --use3d: True + +pna: + inherits: _defaults + definition: . 
diff --git a/benchmarks/geo_gnn/dev.yaml b/benchmarks/geo_gnn/dev.yaml
new file mode 100644
index 000000000..7fadaea5f
--- /dev/null
+++ b/benchmarks/geo_gnn/dev.yaml
@@ -0,0 +1,22 @@
+dimenet:
+  inherits: _defaults
+  definition: .
+  install-variant: cuda
+  install_group: torch
+  plan:
+    method: per_gpu
+  argv:
+    --model: 'DimeNet'
+    --num-samples: 10000
+    --use3d: True
+
+pna:
+  inherits: _defaults
+  definition: .
+  install-variant: cuda
+  install_group: torch
+  plan:
+    method: per_gpu
+  argv:
+    --model: 'PNA'
+    --num-samples: 10000
\ No newline at end of file
diff --git a/benchmarks/geo_gnn/main.py b/benchmarks/geo_gnn/main.py
new file mode 100644
index 000000000..714707f65
--- /dev/null
+++ b/benchmarks/geo_gnn/main.py
@@ -0,0 +1,202 @@
+import argparse
+import os
+
+import torch
+import torch.nn as nn
+import torch.optim as optim
+import torchcompat.core as accelerator
+from bench.models import models
+from pcqm4m_subset import PCQM4Mv2Subset
+from torch_geometric.datasets import QM9
+from torch_geometric.loader import DataLoader
+
+from benchmate.observer import BenchObserver
+
+
+def parser():
+    parser = argparse.ArgumentParser(description="Geometric GNN")
+    parser.add_argument(
+        "--batch-size",
+        type=int,
+        default=16,
+        metavar="N",
+        help="input batch size for training (default: 16)",
+    )
+    parser.add_argument(
+        "--epochs",
+        type=int,
+        default=20,
+        metavar="N",
+        help="Number of epochs to train (default: 20)",
+    )
+    parser.add_argument("--model", type=str, help="GNN name", required=True)
+    parser.add_argument(
+        "--num-samples",
+        type=int,
+        help="Number of samples to process in the dataset",
+        default=10000,
+    )
+    parser.add_argument(
+        "--lr",
+        type=float,
+        default=1e-4,
+        metavar="LR",
+        help="learning rate (default: 0.0001)",
+    )
+    parser.add_argument(
+        "--seed",
+        type=int,
+        default=1234,
+        metavar="S",
+        help="random seed (default: 1234)",
+    )
+    parser.add_argument(
+        "--num-workers",
+        type=int,
+        default=0,
+        help="number of workers for data loading",
+    )
+    parser.add_argument(
+        "--use3d",
+        action="store_true",
+        default=False,
+        help="Use 3D coordinates with data",
+    )
+    parser.add_argument(
+        "--root",
+        type=str,
+        default=os.environ["MILABENCH_DIR_DATA"],
+        help="Dataset path",
+    )
+    return parser
+
+
+def train_degree(train_dataset):
+    from torch_geometric.utils import degree
+
+    # Compute the maximum in-degree in the training data.
+    max_degree = -1
+    for data in train_dataset:
+        d = degree(data.edge_index[1], num_nodes=data.num_nodes, dtype=torch.long)
+        max_degree = max(max_degree, int(d.max()))
+
+    # Compute the in-degree histogram tensor
+    deg = torch.zeros(max_degree + 1, dtype=torch.long)
+    for data in train_dataset:
+        d = degree(data.edge_index[1], num_nodes=data.num_nodes, dtype=torch.long)
+        deg += torch.bincount(d, minlength=deg.numel())
+
+    return deg
+
+
+def mean(dataset):
+    import numpy as np
+    return np.mean([dataset.get(i).y for i in range(len(dataset))])
+
+def std(dataset):
+    import numpy as np
+    return np.std([dataset.get(i).y for i in range(len(dataset))])
+
+
+def main():
+    args = parser().parse_args()
+
+    def batch_size(x):
+        shape = x.y.shape
+        return shape[0]
+
+    observer = BenchObserver(batch_size_fn=batch_size)
+
+    # train_dataset = PCQM4Mv2Subset(args.num_samples, args.root)
+    train_dataset = QM9(args.root)
+
+    sample = next(iter(train_dataset))
+
+    info = models[args.model](args,
+        sample=sample,
+        degree=lambda: train_degree(train_dataset),
+    )
+
+    TRAIN_mean, TRAIN_std = (
+        mean(train_dataset).item(),
+        std(train_dataset).item(),
+    )
+    print("Train mean: {}\tTrain std: {}".format(TRAIN_mean, TRAIN_std))
+
+    DataLoaderClass = DataLoader
+    dataloader_kwargs = {}
+
+    train_loader = DataLoaderClass(
+        train_dataset,
+        batch_size=args.batch_size,
+        shuffle=True,
+        num_workers=args.num_workers,
+        **dataloader_kwargs
+    )
+
+    device = accelerator.fetch_device(0)
+    model = info.model.to(device)
+
+    criterion = nn.L1Loss()
+
+    # set up optimizer
+    # different learning rate for different part of GNN
+    model_param_group = [{"params": model.parameters(), "lr": args.lr}]
+    optimizer = optim.Adam(model_param_group, lr=args.lr, weight_decay=0)
+
+    lr_scheduler = None
+    lr_scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, args.epochs)
+
+    num_batches = len(train_loader)
+    for epoch in range(1, args.epochs + 1):
+        model.train()
+
+        for step, batch in enumerate(observer.iterate(train_loader)):
+            # QM9 => DataBatch(x=[290, 11], edge_index=[2, 602], edge_attr=[602, 4], y=[16, 19], pos=[290, 3], z=[290], smiles=[16], name=[16], idx=[16], batch=[290], ptr=[17])
+            # PCQM4Mv2Subset => DataBatch(x=[229, 9], edge_index=[2, 476], edge_attr=[476, 3], y=[16], pos=[229, 3], smiles=[16], batch=[229], ptr=[17])
+            batch = batch.to(device)
+
+            if args.use3d:
+
+                if hasattr(batch, "z"):
+                    z = batch.z
+                else:
+                    z = batch.batch
+
+                molecule_repr = model(z=z, pos=batch.pos, batch=batch.batch)
+            else:
+                molecule_repr = model(x=batch.x, batch=batch.batch, edge_index=batch.edge_index, batch_size=batch_size(batch))
+
+            pred = molecule_repr.squeeze()
+
+            # Dimenet : pred: torch.Size([ 16, 19])
+            # PNA     : pred: torch.Size([292, 19])  (per-node output when fed x=batch.x; 292 = batch.x.shape[0])
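+            # NOTE: if pred is per-node while batch.y is per-graph, the view()
+            # below cannot line up (304 target values vs 292 rows in the example
+            # above); the 2d path likely needs a graph-level readout first.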
+            # batch   : torch.Size([ 16, 19])
+            # print(molecule_repr.shape)
+            # print(batch.y.shape)
+
+            B = pred.size()[0]
+            y = batch.y.view(B, -1)
+            # normalize
+            y = (y - TRAIN_mean) / TRAIN_std
+
+            loss = criterion(pred, y)
+
+            optimizer.zero_grad()
+            loss.backward()
+            optimizer.step()
+
+            lr_scheduler.step(epoch - 1 + step / num_batches)
+
+            observer.record_loss(loss)
+
+        lr_scheduler.step()
+
+        print("Epoch: {}\tLoss: {}".format(epoch, loss.item()))
+
+
+if __name__ == "__main__":
+    main()
diff --git a/benchmarks/geo_gnn/pcqm4m_subset.py b/benchmarks/geo_gnn/pcqm4m_subset.py
new file mode 100644
index 000000000..615aea2bb
--- /dev/null
+++ b/benchmarks/geo_gnn/pcqm4m_subset.py
@@ -0,0 +1,108 @@
+import os
+import os.path as osp
+from typing import Any, Callable, Dict, List, Optional
+
+import numpy as np
+import torch
+from rdkit import Chem
+from torch_geometric.data import Data, download_url, extract_tar
+from torch_geometric.data.data import BaseData
+from torch_geometric.datasets import PCQM4Mv2
+from torch_geometric.utils import from_smiles as _from_smiles
+from tqdm import tqdm
+
+
+class PCQM4Mv2Subset(PCQM4Mv2):
+    suppl_url = "http://ogb-data.stanford.edu/data/lsc/pcqm4m-v2-train.sdf.tar.gz"
+
+    def __init__(
+        self,
+        size: int,
+        root: str,
+        split: str = "train",
+        transform: Optional[Callable] = None,
+        backend: str = "sqlite",
+        from_smiles: Optional[Callable] = None,
+    ) -> None:
+        assert split in ["train", "val", "test", "holdout"]
+
+        self.size = size
+
+        schema = {
+            "x": dict(dtype=torch.int64, size=(-1, 9)),
+            "edge_index": dict(dtype=torch.int64, size=(2, -1)),
+            "edge_attr": dict(dtype=torch.int64, size=(-1, 3)),
+            "smiles": str,
+            "pos": dict(dtype=torch.float32, size=(-1, 3)),
+            "y": float,
+        }
+
+        self.from_smiles = from_smiles or _from_smiles
+        super(PCQM4Mv2, self).__init__(root, transform, backend=backend, schema=schema)
+
+        split_idx = torch.load(self.raw_paths[1])
+        self._indices = split_idx[self.split_mapping[split]].tolist()
+
+    @property
+    def raw_file_names(self):
+        return super().raw_file_names + [
+            osp.join("pcqm4m-v2", "raw", "pcqm4m-v2-train.sdf")
+        ]
+
+    def download(self):
+        print(self.raw_paths)
+        if all(os.path.exists(path) for path in self.raw_paths):
+            return
+
+        # Download 2d graphs
+        print(self.raw_dir)
+        super().download()
+
+        # Download 3D coordinates
+        file_path = download_url(self.suppl_url, self.raw_dir)
+        # md5sum: fd72bce606e7ddf36c2a832badeec6ab
+        extract_tar(file_path, osp.join(self.raw_dir, "pcqm4m-v2", "raw"), mode="r:gz")
+        os.unlink(file_path)
+
+    def process(self) -> None:
+        import pandas as pd
+
+        df = pd.read_csv(self.raw_paths[0])
+
+        data_list: List[Data] = []
+        suppl = Chem.SDMolSupplier(self.raw_paths[-1])
+        iterator = enumerate(zip(df["smiles"], df["homolumogap"], suppl))
+        for i, (smiles, y, extra) in tqdm(iterator, total=min(len(df), self.size)):
+            # data = from_smiles(smiles)
+            data = self.from_smiles(Chem.MolToSmiles(extra))
+            data.y = y
+            data.pos = torch.tensor(
+                extra.GetConformer().GetPositions(), dtype=torch.float
+            )
+
+            data_list.append(data)
+            if (
+                i + 1 == len(df) or (i + 1) % 1000 == 0 or i >= self.size
+            ):  # Write batch-wise:
+                self.extend(data_list)
+                data_list = []
+
+            if i >= self.size:
+                break
+
+    def __len__(self):
+        return min(super().__len__(), self.size)
+
+    def len(self):
+        return min(super().len(), self.size)
+
+    def mean(self):
+        return np.mean([self.get(i).y for i in range(len(self))])
+
+    def std(self):
+        return np.std([self.get(i).y for i in range(len(self))])
+
+    def serialize(self, data:
BaseData) -> Dict[str, Any]: + rval = super().serialize(data) + rval["pos"] = data.pos + return rval diff --git a/benchmarks/geo_gnn/prepare.py b/benchmarks/geo_gnn/prepare.py new file mode 100755 index 000000000..2b352f8ce --- /dev/null +++ b/benchmarks/geo_gnn/prepare.py @@ -0,0 +1,32 @@ +#!/usr/bin/env python +import argparse +import os + +from pcqm4m_subset import PCQM4Mv2Subset +from torch_geometric.datasets import QM9 + + +def parser(): + parser = argparse.ArgumentParser(description="Geometric GNN") + parser.add_argument( + "--num-samples", + type=int, + help="Number of samples to process in the dataset", + default=10000, + ) + parser.add_argument( + "--root", + type=str, + default=os.environ["MILABENCH_DIR_DATA"], + help="Dataset path", + ) + return parser + + +if __name__ == "__main__": + args, _ = parser().parse_known_args() + + # TODO: Handle argument for the number of samples + train_dataset = QM9(args.root) + # dataset = PCQM4Mv2Subset(args.num_samples, root=args.root) + diff --git a/benchmarks/geo_gnn/requirements-pre.cuda.txt b/benchmarks/geo_gnn/requirements-pre.cuda.txt new file mode 100644 index 000000000..3b11e80df --- /dev/null +++ b/benchmarks/geo_gnn/requirements-pre.cuda.txt @@ -0,0 +1,104 @@ +# +# This file is autogenerated by pip-compile with Python 3.10 +# by the following command: +# +# pip-compile --output-file=benchmarks/geo_gnn/requirements-pre.cuda.txt .pin/tmp-constraints-cuda-geo_gnn.txt benchmarks/geo_gnn/requirements-pre.in +# +--extra-index-url https://pypi.ngc.nvidia.com +--extra-index-url https://download.pytorch.org/whl/cu121 +--find-links https://data.pyg.org/whl/torch-2.3.0+cu121.html +--trusted-host pypi.ngc.nvidia.com + +filelock==3.15.4 + # via + # -c .pin/../.pin/constraints-cuda-gnn.txt + # torch + # triton +fsspec==2024.6.1 + # via + # -c .pin/../.pin/constraints-cuda-gnn.txt + # torch +jinja2==3.1.4 + # via + # -c .pin/../.pin/constraints-cuda-gnn.txt + # torch +markupsafe==2.1.5 + # via + # -c .pin/../.pin/constraints-cuda-gnn.txt + # jinja2 +mpmath==1.3.0 + # via + # -c .pin/../.pin/constraints-cuda-gnn.txt + # sympy +networkx==3.3 + # via + # -c .pin/../.pin/constraints-cuda-gnn.txt + # torch +nvidia-cublas-cu12==12.1.3.1 + # via + # -c .pin/../.pin/constraints-cuda-gnn.txt + # nvidia-cudnn-cu12 + # nvidia-cusolver-cu12 + # torch +nvidia-cuda-cupti-cu12==12.1.105 + # via + # -c .pin/../.pin/constraints-cuda-gnn.txt + # torch +nvidia-cuda-nvrtc-cu12==12.1.105 + # via + # -c .pin/../.pin/constraints-cuda-gnn.txt + # torch +nvidia-cuda-runtime-cu12==12.1.105 + # via + # -c .pin/../.pin/constraints-cuda-gnn.txt + # torch +nvidia-cudnn-cu12==8.9.2.26 + # via + # -c .pin/../.pin/constraints-cuda-gnn.txt + # torch +nvidia-cufft-cu12==11.0.2.54 + # via + # -c .pin/../.pin/constraints-cuda-gnn.txt + # torch +nvidia-curand-cu12==10.3.2.106 + # via + # -c .pin/../.pin/constraints-cuda-gnn.txt + # torch +nvidia-cusolver-cu12==11.4.5.107 + # via + # -c .pin/../.pin/constraints-cuda-gnn.txt + # torch +nvidia-cusparse-cu12==12.1.0.106 + # via + # -c .pin/../.pin/constraints-cuda-gnn.txt + # nvidia-cusolver-cu12 + # torch +nvidia-nccl-cu12==2.20.5 + # via + # -c .pin/../.pin/constraints-cuda-gnn.txt + # torch +nvidia-nvjitlink-cu12==12.6.20 + # via + # -c .pin/../.pin/constraints-cuda-gnn.txt + # nvidia-cusolver-cu12 + # nvidia-cusparse-cu12 +nvidia-nvtx-cu12==12.1.105 + # via + # -c .pin/../.pin/constraints-cuda-gnn.txt + # torch +sympy==1.13.2 + # via + # -c .pin/../.pin/constraints-cuda-gnn.txt + # torch +torch==2.3.1+cu121 + # via + # -c 
.pin/../.pin/constraints-cuda-gnn.txt + # -r benchmarks/geo_gnn/requirements-pre.in +triton==2.3.1 + # via + # -c .pin/../.pin/constraints-cuda-gnn.txt + # torch +typing-extensions==4.12.2 + # via + # -c .pin/../.pin/constraints-cuda-gnn.txt + # torch diff --git a/benchmarks/geo_gnn/requirements-pre.in b/benchmarks/geo_gnn/requirements-pre.in new file mode 100644 index 000000000..08ed5eeb4 --- /dev/null +++ b/benchmarks/geo_gnn/requirements-pre.in @@ -0,0 +1 @@ +torch \ No newline at end of file diff --git a/benchmarks/geo_gnn/requirements-pre.rocm.txt b/benchmarks/geo_gnn/requirements-pre.rocm.txt new file mode 100644 index 000000000..3aded346f --- /dev/null +++ b/benchmarks/geo_gnn/requirements-pre.rocm.txt @@ -0,0 +1,49 @@ +# +# This file is autogenerated by pip-compile with Python 3.10 +# by the following command: +# +# pip-compile --output-file=benchmarks/geo_gnn/requirements-pre.rocm.txt .pin/tmp-constraints-rocm-geo_gnn.txt benchmarks/geo_gnn/requirements-pre.in +# +--extra-index-url https://download.pytorch.org/whl/rocm6.0 + +filelock==3.15.4 + # via + # -c .pin/../.pin/constraints-rocm-gnn.txt + # pytorch-triton-rocm + # torch +fsspec==2024.6.1 + # via + # -c .pin/../.pin/constraints-rocm-gnn.txt + # torch +jinja2==3.1.4 + # via + # -c .pin/../.pin/constraints-rocm-gnn.txt + # torch +markupsafe==2.1.5 + # via + # -c .pin/../.pin/constraints-rocm-gnn.txt + # jinja2 +mpmath==1.3.0 + # via + # -c .pin/../.pin/constraints-rocm-gnn.txt + # sympy +networkx==3.3 + # via + # -c .pin/../.pin/constraints-rocm-gnn.txt + # torch +pytorch-triton-rocm==3.0.0 + # via + # -c .pin/../.pin/constraints-rocm-gnn.txt + # torch +sympy==1.13.2 + # via + # -c .pin/../.pin/constraints-rocm-gnn.txt + # torch +torch==2.4.0+rocm6.0 + # via + # -c .pin/../.pin/constraints-rocm-gnn.txt + # -r benchmarks/geo_gnn/requirements-pre.in +typing-extensions==4.12.2 + # via + # -c .pin/../.pin/constraints-rocm-gnn.txt + # torch diff --git a/benchmarks/geo_gnn/requirements.cuda.txt b/benchmarks/geo_gnn/requirements.cuda.txt new file mode 100644 index 000000000..8965d9007 --- /dev/null +++ b/benchmarks/geo_gnn/requirements.cuda.txt @@ -0,0 +1,339 @@ +# +# This file is autogenerated by pip-compile with Python 3.10 +# by the following command: +# +# pip-compile --output-file=benchmarks/geo_gnn/requirements.cuda.txt .pin/tmp-constraints-cuda-geo_gnn.txt benchmarks/geo_gnn/requirements-pre.cuda.txt benchmarks/geo_gnn/requirements.in +# +--extra-index-url https://pypi.ngc.nvidia.com +--extra-index-url https://download.pytorch.org/whl/cu121 +--find-links https://data.pyg.org/whl/torch-2.3.0+cu121.html +--trusted-host pypi.ngc.nvidia.com + +aiohappyeyeballs==2.4.0 + # via + # -c .pin/../.pin/constraints-cuda-gnn.txt + # aiohttp +aiohttp==3.10.5 + # via + # -c .pin/../.pin/constraints-cuda-gnn.txt + # torch-geometric +aiosignal==1.3.1 + # via + # -c .pin/../.pin/constraints-cuda-gnn.txt + # aiohttp +antlr4-python3-runtime==4.9.3 + # via + # -c .pin/../.pin/constraints-cuda-gnn.txt + # omegaconf +asttokens==2.4.1 + # via + # -c .pin/../.pin/constraints-cuda-gnn.txt + # giving +async-timeout==4.0.3 + # via + # -c .pin/../.pin/constraints-cuda-gnn.txt + # aiohttp +attrs==24.2.0 + # via + # -c .pin/../.pin/constraints-cuda-gnn.txt + # aiohttp +certifi==2024.7.4 + # via + # -c .pin/../.pin/constraints-cuda-gnn.txt + # requests +charset-normalizer==3.3.2 + # via + # -c .pin/../.pin/constraints-cuda-gnn.txt + # requests +codefind==0.1.6 + # via + # -c .pin/../.pin/constraints-cuda-gnn.txt + # ptera +executing==1.2.0 + # via + # -c 
.pin/../.pin/constraints-cuda-gnn.txt + # varname +filelock==3.15.4 + # via + # -c .pin/../.pin/constraints-cuda-gnn.txt + # -r benchmarks/geo_gnn/requirements-pre.cuda.txt + # torch + # triton +frozenlist==1.4.1 + # via + # -c .pin/../.pin/constraints-cuda-gnn.txt + # aiohttp + # aiosignal +fsspec==2024.6.1 + # via + # -c .pin/../.pin/constraints-cuda-gnn.txt + # -r benchmarks/geo_gnn/requirements-pre.cuda.txt + # torch + # torch-geometric +giving==0.4.2 + # via + # -c .pin/../.pin/constraints-cuda-gnn.txt + # ptera + # voir +idna==3.8 + # via + # -c .pin/../.pin/constraints-cuda-gnn.txt + # requests + # yarl +jinja2==3.1.4 + # via + # -c .pin/../.pin/constraints-cuda-gnn.txt + # -r benchmarks/geo_gnn/requirements-pre.cuda.txt + # torch + # torch-geometric +joblib==1.4.2 + # via + # -c .pin/../.pin/constraints-cuda-gnn.txt + # scikit-learn +markdown-it-py==3.0.0 + # via + # -c .pin/../.pin/constraints-cuda-gnn.txt + # rich +markupsafe==2.1.5 + # via + # -c .pin/../.pin/constraints-cuda-gnn.txt + # -r benchmarks/geo_gnn/requirements-pre.cuda.txt + # jinja2 +mdurl==0.1.2 + # via + # -c .pin/../.pin/constraints-cuda-gnn.txt + # markdown-it-py +mpmath==1.3.0 + # via + # -c .pin/../.pin/constraints-cuda-gnn.txt + # -r benchmarks/geo_gnn/requirements-pre.cuda.txt + # sympy +multidict==6.0.5 + # via + # -c .pin/../.pin/constraints-cuda-gnn.txt + # aiohttp + # yarl +networkx==3.3 + # via + # -c .pin/../.pin/constraints-cuda-gnn.txt + # -r benchmarks/geo_gnn/requirements-pre.cuda.txt + # torch +numpy==1.26.4 + # via + # -c .pin/../.pin/constraints-cuda-gnn.txt + # -r benchmarks/geo_gnn/requirements.in + # pandas + # rdkit + # scikit-learn + # scipy + # torch-geometric +nvidia-cublas-cu12==12.1.3.1 + # via + # -c .pin/../.pin/constraints-cuda-gnn.txt + # -r benchmarks/geo_gnn/requirements-pre.cuda.txt + # nvidia-cudnn-cu12 + # nvidia-cusolver-cu12 + # torch +nvidia-cuda-cupti-cu12==12.1.105 + # via + # -c .pin/../.pin/constraints-cuda-gnn.txt + # -r benchmarks/geo_gnn/requirements-pre.cuda.txt + # torch +nvidia-cuda-nvrtc-cu12==12.1.105 + # via + # -c .pin/../.pin/constraints-cuda-gnn.txt + # -r benchmarks/geo_gnn/requirements-pre.cuda.txt + # torch +nvidia-cuda-runtime-cu12==12.1.105 + # via + # -c .pin/../.pin/constraints-cuda-gnn.txt + # -r benchmarks/geo_gnn/requirements-pre.cuda.txt + # torch +nvidia-cudnn-cu12==8.9.2.26 + # via + # -c .pin/../.pin/constraints-cuda-gnn.txt + # -r benchmarks/geo_gnn/requirements-pre.cuda.txt + # torch +nvidia-cufft-cu12==11.0.2.54 + # via + # -c .pin/../.pin/constraints-cuda-gnn.txt + # -r benchmarks/geo_gnn/requirements-pre.cuda.txt + # torch +nvidia-curand-cu12==10.3.2.106 + # via + # -c .pin/../.pin/constraints-cuda-gnn.txt + # -r benchmarks/geo_gnn/requirements-pre.cuda.txt + # torch +nvidia-cusolver-cu12==11.4.5.107 + # via + # -c .pin/../.pin/constraints-cuda-gnn.txt + # -r benchmarks/geo_gnn/requirements-pre.cuda.txt + # torch +nvidia-cusparse-cu12==12.1.0.106 + # via + # -c .pin/../.pin/constraints-cuda-gnn.txt + # -r benchmarks/geo_gnn/requirements-pre.cuda.txt + # nvidia-cusolver-cu12 + # torch +nvidia-ml-py==12.560.30 + # via + # -c .pin/../.pin/constraints-cuda-gnn.txt + # voir +nvidia-nccl-cu12==2.20.5 + # via + # -c .pin/../.pin/constraints-cuda-gnn.txt + # -r benchmarks/geo_gnn/requirements-pre.cuda.txt + # torch +nvidia-nvjitlink-cu12==12.6.20 + # via + # -c .pin/../.pin/constraints-cuda-gnn.txt + # -r benchmarks/geo_gnn/requirements-pre.cuda.txt + # nvidia-cusolver-cu12 + # nvidia-cusparse-cu12 +nvidia-nvtx-cu12==12.1.105 + # via + # -c 
.pin/../.pin/constraints-cuda-gnn.txt + # -r benchmarks/geo_gnn/requirements-pre.cuda.txt + # torch +omegaconf==2.3.0 + # via + # -c .pin/../.pin/constraints-cuda-gnn.txt + # voir +ovld==0.3.9 + # via + # -c .pin/../.pin/constraints-cuda-gnn.txt + # voir +pandas==2.2.2 + # via + # -c .pin/../.pin/constraints-cuda-gnn.txt + # -r benchmarks/geo_gnn/requirements.in +pillow==10.4.0 + # via + # -c .pin/../.pin/constraints-cuda-gnn.txt + # rdkit +psutil==5.9.8 + # via + # -c .pin/../.pin/constraints-cuda-gnn.txt + # torch-geometric + # voir +ptera==1.4.1 + # via + # -c .pin/../.pin/constraints-cuda-gnn.txt + # voir +pygments==2.18.0 + # via + # -c .pin/../.pin/constraints-cuda-gnn.txt + # rich +pyparsing==3.1.4 + # via + # -c .pin/../.pin/constraints-cuda-gnn.txt + # torch-geometric +python-dateutil==2.9.0.post0 + # via + # -c .pin/../.pin/constraints-cuda-gnn.txt + # pandas +pytz==2024.1 + # via + # -c .pin/../.pin/constraints-cuda-gnn.txt + # pandas +pyyaml==6.0.2 + # via + # -c .pin/../.pin/constraints-cuda-gnn.txt + # omegaconf +rdkit==2024.3.5 + # via + # -c .pin/../.pin/constraints-cuda-gnn.txt + # -r benchmarks/geo_gnn/requirements.in +reactivex==4.0.4 + # via + # -c .pin/../.pin/constraints-cuda-gnn.txt + # giving +requests==2.32.3 + # via + # -c .pin/../.pin/constraints-cuda-gnn.txt + # torch-geometric +rich==13.8.0 + # via + # -c .pin/../.pin/constraints-cuda-gnn.txt + # voir +scikit-learn==1.5.1 + # via + # -c .pin/../.pin/constraints-cuda-gnn.txt + # torch-geometric +scipy==1.14.1 + # via + # -c .pin/../.pin/constraints-cuda-gnn.txt + # scikit-learn + # torch-cluster + # torch-geometric + # torch-sparse +six==1.16.0 + # via + # -c .pin/../.pin/constraints-cuda-gnn.txt + # asttokens + # python-dateutil +sympy==1.13.2 + # via + # -c .pin/../.pin/constraints-cuda-gnn.txt + # -r benchmarks/geo_gnn/requirements-pre.cuda.txt + # torch +threadpoolctl==3.5.0 + # via + # -c .pin/../.pin/constraints-cuda-gnn.txt + # scikit-learn +torch==2.3.1+cu121 + # via + # -c .pin/../.pin/constraints-cuda-gnn.txt + # -r benchmarks/geo_gnn/requirements-pre.cuda.txt +torch-cluster==1.6.3+pt23cu121 + # via + # -c .pin/../.pin/constraints-cuda-gnn.txt + # -r benchmarks/geo_gnn/requirements.in +torch-geometric==2.5.3 + # via + # -c .pin/../.pin/constraints-cuda-gnn.txt + # -r benchmarks/geo_gnn/requirements.in +torch-scatter==2.1.2+pt23cu121 + # via + # -c .pin/../.pin/constraints-cuda-gnn.txt + # -r benchmarks/geo_gnn/requirements.in +torch-sparse==0.6.18+pt23cu121 + # via + # -c .pin/../.pin/constraints-cuda-gnn.txt + # -r benchmarks/geo_gnn/requirements.in +tqdm==4.66.5 + # via + # -c .pin/../.pin/constraints-cuda-gnn.txt + # torch-geometric +triton==2.3.1 + # via + # -c .pin/../.pin/constraints-cuda-gnn.txt + # -r benchmarks/geo_gnn/requirements-pre.cuda.txt + # torch +typing-extensions==4.12.2 + # via + # -c .pin/../.pin/constraints-cuda-gnn.txt + # -r benchmarks/geo_gnn/requirements-pre.cuda.txt + # reactivex + # torch +tzdata==2024.1 + # via + # -c .pin/../.pin/constraints-cuda-gnn.txt + # pandas +urllib3==2.2.2 + # via + # -c .pin/../.pin/constraints-cuda-gnn.txt + # requests +varname==0.10.0 + # via + # -c .pin/../.pin/constraints-cuda-gnn.txt + # giving +voir==0.2.19 + # via + # -c .pin/../.pin/constraints-cuda-gnn.txt + # -c .pin/../constraints/cuda.txt + # -r benchmarks/geo_gnn/requirements.in +yarl==1.9.4 + # via + # -c .pin/../.pin/constraints-cuda-gnn.txt + # aiohttp diff --git a/benchmarks/geo_gnn/requirements.in b/benchmarks/geo_gnn/requirements.in new file mode 100644 index 
000000000..6fbdd7dea --- /dev/null +++ b/benchmarks/geo_gnn/requirements.in @@ -0,0 +1,9 @@ +voir>=0.2.17,<0.3 +torch-geometric +torch-cluster +torch-sparse +torch-scatter +pandas +rdkit +numpy<2.0 + diff --git a/benchmarks/geo_gnn/requirements.rocm.txt b/benchmarks/geo_gnn/requirements.rocm.txt new file mode 100644 index 000000000..60246f795 --- /dev/null +++ b/benchmarks/geo_gnn/requirements.rocm.txt @@ -0,0 +1,272 @@ +# +# This file is autogenerated by pip-compile with Python 3.10 +# by the following command: +# +# pip-compile --output-file=benchmarks/geo_gnn/requirements.rocm.txt .pin/tmp-constraints-rocm-geo_gnn.txt benchmarks/geo_gnn/requirements-pre.rocm.txt benchmarks/geo_gnn/requirements.in +# +--extra-index-url https://download.pytorch.org/whl/rocm6.0 + +aiohappyeyeballs==2.4.0 + # via + # -c .pin/../.pin/constraints-rocm-gnn.txt + # aiohttp +aiohttp==3.10.5 + # via + # -c .pin/../.pin/constraints-rocm-gnn.txt + # torch-geometric +aiosignal==1.3.1 + # via + # -c .pin/../.pin/constraints-rocm-gnn.txt + # aiohttp +antlr4-python3-runtime==4.9.3 + # via + # -c .pin/../.pin/constraints-rocm-gnn.txt + # omegaconf +asttokens==2.4.1 + # via + # -c .pin/../.pin/constraints-rocm-gnn.txt + # giving +async-timeout==4.0.3 + # via + # -c .pin/../.pin/constraints-rocm-gnn.txt + # aiohttp +attrs==24.2.0 + # via + # -c .pin/../.pin/constraints-rocm-gnn.txt + # aiohttp +certifi==2024.7.4 + # via + # -c .pin/../.pin/constraints-rocm-gnn.txt + # requests +charset-normalizer==3.3.2 + # via + # -c .pin/../.pin/constraints-rocm-gnn.txt + # requests +codefind==0.1.6 + # via + # -c .pin/../.pin/constraints-rocm-gnn.txt + # ptera +executing==1.2.0 + # via + # -c .pin/../.pin/constraints-rocm-gnn.txt + # varname +filelock==3.15.4 + # via + # -c .pin/../.pin/constraints-rocm-gnn.txt + # -r benchmarks/geo_gnn/requirements-pre.rocm.txt + # pytorch-triton-rocm + # torch +frozenlist==1.4.1 + # via + # -c .pin/../.pin/constraints-rocm-gnn.txt + # aiohttp + # aiosignal +fsspec==2024.6.1 + # via + # -c .pin/../.pin/constraints-rocm-gnn.txt + # -r benchmarks/geo_gnn/requirements-pre.rocm.txt + # torch + # torch-geometric +giving==0.4.2 + # via + # -c .pin/../.pin/constraints-rocm-gnn.txt + # ptera + # voir +idna==3.7 + # via + # -c .pin/../.pin/constraints-rocm-gnn.txt + # requests + # yarl +jinja2==3.1.4 + # via + # -c .pin/../.pin/constraints-rocm-gnn.txt + # -r benchmarks/geo_gnn/requirements-pre.rocm.txt + # torch + # torch-geometric +joblib==1.4.2 + # via + # -c .pin/../.pin/constraints-rocm-gnn.txt + # scikit-learn +markdown-it-py==3.0.0 + # via + # -c .pin/../.pin/constraints-rocm-gnn.txt + # rich +markupsafe==2.1.5 + # via + # -c .pin/../.pin/constraints-rocm-gnn.txt + # -r benchmarks/geo_gnn/requirements-pre.rocm.txt + # jinja2 +mdurl==0.1.2 + # via + # -c .pin/../.pin/constraints-rocm-gnn.txt + # markdown-it-py +mpmath==1.3.0 + # via + # -c .pin/../.pin/constraints-rocm-gnn.txt + # -r benchmarks/geo_gnn/requirements-pre.rocm.txt + # sympy +multidict==6.0.5 + # via + # -c .pin/../.pin/constraints-rocm-gnn.txt + # aiohttp + # yarl +networkx==3.3 + # via + # -c .pin/../.pin/constraints-rocm-gnn.txt + # -r benchmarks/geo_gnn/requirements-pre.rocm.txt + # torch +numpy==1.26.4 + # via + # -c .pin/../.pin/constraints-rocm-gnn.txt + # -r benchmarks/geo_gnn/requirements.in + # pandas + # rdkit + # scikit-learn + # scipy + # torch-geometric +omegaconf==2.3.0 + # via + # -c .pin/../.pin/constraints-rocm-gnn.txt + # voir +ovld==0.3.8 + # via + # -c .pin/../.pin/constraints-rocm-gnn.txt + # voir +pandas==2.2.2 + # via 
+ # -c .pin/../.pin/constraints-rocm-gnn.txt + # -r benchmarks/geo_gnn/requirements.in +pillow==10.4.0 + # via + # -c .pin/../.pin/constraints-rocm-gnn.txt + # rdkit +psutil==5.9.8 + # via + # -c .pin/../.pin/constraints-rocm-gnn.txt + # torch-geometric + # voir +ptera==1.4.1 + # via + # -c .pin/../.pin/constraints-rocm-gnn.txt + # voir +pygments==2.18.0 + # via + # -c .pin/../.pin/constraints-rocm-gnn.txt + # rich +pynvml==11.5.3 + # via + # -c .pin/../.pin/constraints-rocm-gnn.txt + # voir +pyparsing==3.1.2 + # via + # -c .pin/../.pin/constraints-rocm-gnn.txt + # torch-geometric +python-dateutil==2.9.0.post0 + # via + # -c .pin/../.pin/constraints-rocm-gnn.txt + # pandas +pytorch-triton-rocm==3.0.0 + # via + # -c .pin/../.pin/constraints-rocm-gnn.txt + # -r benchmarks/geo_gnn/requirements-pre.rocm.txt + # torch +pytz==2024.1 + # via + # -c .pin/../.pin/constraints-rocm-gnn.txt + # pandas +pyyaml==6.0.2 + # via + # -c .pin/../.pin/constraints-rocm-gnn.txt + # omegaconf +rdkit==2024.3.5 + # via + # -c .pin/../.pin/constraints-rocm-gnn.txt + # -r benchmarks/geo_gnn/requirements.in +reactivex==4.0.4 + # via + # -c .pin/../.pin/constraints-rocm-gnn.txt + # giving +requests==2.32.3 + # via + # -c .pin/../.pin/constraints-rocm-gnn.txt + # torch-geometric +rich==13.7.1 + # via + # -c .pin/../.pin/constraints-rocm-gnn.txt + # voir +scikit-learn==1.5.1 + # via + # -c .pin/../.pin/constraints-rocm-gnn.txt + # torch-geometric +scipy==1.14.0 + # via + # -c .pin/../.pin/constraints-rocm-gnn.txt + # scikit-learn + # torch-cluster + # torch-geometric + # torch-sparse +six==1.16.0 + # via + # -c .pin/../.pin/constraints-rocm-gnn.txt + # asttokens + # python-dateutil +sympy==1.13.2 + # via + # -c .pin/../.pin/constraints-rocm-gnn.txt + # -r benchmarks/geo_gnn/requirements-pre.rocm.txt + # torch +threadpoolctl==3.5.0 + # via + # -c .pin/../.pin/constraints-rocm-gnn.txt + # scikit-learn +torch==2.4.0+rocm6.0 + # via + # -c .pin/../.pin/constraints-rocm-gnn.txt + # -r benchmarks/geo_gnn/requirements-pre.rocm.txt +torch-cluster==1.6.3 + # via + # -c .pin/../.pin/constraints-rocm-gnn.txt + # -r benchmarks/geo_gnn/requirements.in +torch-geometric==2.5.3 + # via + # -c .pin/../.pin/constraints-rocm-gnn.txt + # -r benchmarks/geo_gnn/requirements.in +torch-scatter==2.1.2 + # via + # -c .pin/../.pin/constraints-rocm-gnn.txt + # -r benchmarks/geo_gnn/requirements.in +torch-sparse==0.6.18 + # via + # -c .pin/../.pin/constraints-rocm-gnn.txt + # -r benchmarks/geo_gnn/requirements.in +tqdm==4.66.5 + # via + # -c .pin/../.pin/constraints-rocm-gnn.txt + # torch-geometric +typing-extensions==4.12.2 + # via + # -c .pin/../.pin/constraints-rocm-gnn.txt + # -r benchmarks/geo_gnn/requirements-pre.rocm.txt + # reactivex + # torch +tzdata==2024.1 + # via + # -c .pin/../.pin/constraints-rocm-gnn.txt + # pandas +urllib3==2.2.2 + # via + # -c .pin/../.pin/constraints-rocm-gnn.txt + # requests +varname==0.10.0 + # via + # -c .pin/../.pin/constraints-rocm-gnn.txt + # giving +voir==0.2.17 + # via + # -c .pin/../.pin/constraints-rocm-gnn.txt + # -c .pin/../constraints/rocm.txt + # -r benchmarks/geo_gnn/requirements.in +yarl==1.9.4 + # via + # -c .pin/../.pin/constraints-rocm-gnn.txt + # aiohttp diff --git a/benchmarks/geo_gnn/voirfile.py b/benchmarks/geo_gnn/voirfile.py new file mode 100644 index 000000000..d93f886cd --- /dev/null +++ b/benchmarks/geo_gnn/voirfile.py @@ -0,0 +1,38 @@ +from dataclasses import dataclass + +from voir import configurable +from voir.instruments import dash, early_stop, log, rate +from benchmate.monitor 
import monitor_monogpu + +@dataclass +class Config: + """voir configuration""" + + # Whether to display the dash or not + dash: bool = False + + # How often to log the rates + interval: str = "1s" + + # Number of rates to skip before logging + skip: int = 5 + + # Number of rates to log before stopping + stop: int = 20 + + # Number of seconds between each gpu poll + gpu_poll: int = 3 + + +@configurable +def instrument_main(ov, options: Config): + yield ov.phases.init + + if options.dash: + ov.require(dash) + + ov.require( + log("value", "progress", "rate", "units", "loss", "gpudata", context="task"), + early_stop(n=options.stop, key="rate", task="train"), + monitor_monogpu(poll_interval=options.gpu_poll), + ) diff --git a/benchmarks/huggingface/requirements.cuda.txt b/benchmarks/huggingface/requirements.cuda.txt index 3f2e112b6..c9ba4dbdb 100644 --- a/benchmarks/huggingface/requirements.cuda.txt +++ b/benchmarks/huggingface/requirements.cuda.txt @@ -40,7 +40,7 @@ filelock==3.15.4 # torch # transformers # triton -fsspec==2024.5.0 +fsspec==2024.6.1 # via # -c .pin/../.pin/constraints-cuda-torch.txt # huggingface-hub @@ -50,12 +50,12 @@ giving==0.4.2 # -c .pin/../.pin/constraints-cuda-torch.txt # ptera # voir -huggingface-hub==0.24.5 +huggingface-hub==0.24.6 # via # -c .pin/../.pin/constraints-cuda-torch.txt # tokenizers # transformers -idna==3.7 +idna==3.8 # via # -c .pin/../.pin/constraints-cuda-torch.txt # requests @@ -126,6 +126,10 @@ nvidia-cusparse-cu12==12.1.0.106 # -c .pin/../.pin/constraints-cuda-torch.txt # nvidia-cusolver-cu12 # torch +nvidia-ml-py==12.560.30 + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # voir nvidia-nccl-cu12==2.20.5 # via # -c .pin/../.pin/constraints-cuda-torch.txt @@ -143,7 +147,7 @@ omegaconf==2.3.0 # via # -c .pin/../.pin/constraints-cuda-torch.txt # voir -ovld==0.3.6 +ovld==0.3.9 # via # -c .pin/../.pin/constraints-cuda-torch.txt # voir @@ -168,11 +172,7 @@ pygments==2.18.0 # via # -c .pin/../.pin/constraints-cuda-torch.txt # rich -pynvml==11.5.3 - # via - # -c .pin/../.pin/constraints-cuda-torch.txt - # voir -pyyaml==6.0.1 +pyyaml==6.0.2 # via # -c .pin/../.pin/constraints-cuda-torch.txt # huggingface-hub @@ -191,11 +191,11 @@ requests==2.32.3 # -c .pin/../.pin/constraints-cuda-torch.txt # huggingface-hub # transformers -rich==13.7.1 +rich==13.8.0 # via # -c .pin/../.pin/constraints-cuda-torch.txt # voir -safetensors==0.4.3 +safetensors==0.4.4 # via # -c .pin/../.pin/constraints-cuda-torch.txt # transformers @@ -203,7 +203,7 @@ six==1.16.0 # via # -c .pin/../.pin/constraints-cuda-torch.txt # asttokens -sympy==1.13.1 +sympy==1.13.2 # via # -c .pin/../.pin/constraints-cuda-torch.txt # torch @@ -215,12 +215,12 @@ torch==2.4.0+cu121 # via # -c .pin/../.pin/constraints-cuda-torch.txt # -r benchmarks/huggingface/requirements.in -tqdm==4.66.4 +tqdm==4.66.5 # via # -c .pin/../.pin/constraints-cuda-torch.txt # huggingface-hub # transformers -transformers==4.43.3 +transformers==4.44.2 # via # -c .pin/../.pin/constraints-cuda-torch.txt # -r benchmarks/huggingface/requirements.in diff --git a/benchmarks/huggingface/requirements.rocm.txt b/benchmarks/huggingface/requirements.rocm.txt index b5b910f65..1f54d841a 100644 --- a/benchmarks/huggingface/requirements.rocm.txt +++ b/benchmarks/huggingface/requirements.rocm.txt @@ -4,10 +4,7 @@ # # pip-compile --output-file=benchmarks/huggingface/requirements.rocm.txt .pin/tmp-constraints-rocm-hf.txt benchmarks/huggingface/requirements.in # ---extra-index-url https://pypi.ngc.nvidia.com --extra-index-url 
https://download.pytorch.org/whl/rocm6.0 ---find-links https://storage.googleapis.com/jax-releases/jax_cuda_releases.html ---trusted-host pypi.ngc.nvidia.com antlr4-python3-runtime==4.9.3 # via @@ -17,7 +14,7 @@ asttokens==2.4.1 # via # -c .pin/../.pin/constraints-rocm-torch.txt # giving -certifi==2024.6.2 +certifi==2024.7.4 # via # -c .pin/../.pin/constraints-rocm-torch.txt # requests @@ -40,7 +37,7 @@ filelock==3.15.4 # pytorch-triton-rocm # torch # transformers -fsspec==2024.5.0 +fsspec==2024.6.1 # via # -c .pin/../.pin/constraints-rocm-torch.txt # huggingface-hub @@ -50,7 +47,7 @@ giving==0.4.2 # -c .pin/../.pin/constraints-rocm-torch.txt # ptera # voir -huggingface-hub==0.23.5 +huggingface-hub==0.24.6 # via # -c .pin/../.pin/constraints-rocm-torch.txt # tokenizers @@ -91,7 +88,7 @@ omegaconf==2.3.0 # via # -c .pin/../.pin/constraints-rocm-torch.txt # voir -ovld==0.3.5 +ovld==0.3.8 # via # -c .pin/../.pin/constraints-rocm-torch.txt # voir @@ -100,6 +97,10 @@ packaging==24.1 # -c .pin/../.pin/constraints-rocm-torch.txt # huggingface-hub # transformers +pillow==10.4.0 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # -r benchmarks/huggingface/requirements.in psutil==5.9.8 # via # -c .pin/../.pin/constraints-rocm-torch.txt @@ -116,11 +117,11 @@ pynvml==11.5.3 # via # -c .pin/../.pin/constraints-rocm-torch.txt # voir -pytorch-triton-rocm==2.3.1 +pytorch-triton-rocm==3.0.0 # via # -c .pin/../.pin/constraints-rocm-torch.txt # torch -pyyaml==6.0.1 +pyyaml==6.0.2 # via # -c .pin/../.pin/constraints-rocm-torch.txt # huggingface-hub @@ -130,7 +131,7 @@ reactivex==4.0.4 # via # -c .pin/../.pin/constraints-rocm-torch.txt # giving -regex==2024.5.15 +regex==2024.7.24 # via # -c .pin/../.pin/constraints-rocm-torch.txt # transformers @@ -143,7 +144,7 @@ rich==13.7.1 # via # -c .pin/../.pin/constraints-rocm-torch.txt # voir -safetensors==0.4.3 +safetensors==0.4.4 # via # -c .pin/../.pin/constraints-rocm-torch.txt # transformers @@ -151,7 +152,7 @@ six==1.16.0 # via # -c .pin/../.pin/constraints-rocm-torch.txt # asttokens -sympy==1.13.0 +sympy==1.13.2 # via # -c .pin/../.pin/constraints-rocm-torch.txt # torch @@ -159,16 +160,16 @@ tokenizers==0.19.1 # via # -c .pin/../.pin/constraints-rocm-torch.txt # transformers -torch==2.3.1+rocm6.0 +torch==2.4.0+rocm6.0 # via # -c .pin/../.pin/constraints-rocm-torch.txt # -r benchmarks/huggingface/requirements.in -tqdm==4.66.4 +tqdm==4.66.5 # via # -c .pin/../.pin/constraints-rocm-torch.txt # huggingface-hub # transformers -transformers==4.42.4 +transformers==4.44.1 # via # -c .pin/../.pin/constraints-rocm-torch.txt # -r benchmarks/huggingface/requirements.in @@ -178,7 +179,7 @@ typing-extensions==4.12.2 # huggingface-hub # reactivex # torch -urllib3==1.26.19 +urllib3==2.2.2 # via # -c .pin/../.pin/constraints-rocm-torch.txt # requests diff --git a/benchmarks/huggingface/requirements.xpu.txt b/benchmarks/huggingface/requirements.xpu.txt index bcebec727..cb14810eb 100644 --- a/benchmarks/huggingface/requirements.xpu.txt +++ b/benchmarks/huggingface/requirements.xpu.txt @@ -4,10 +4,8 @@ # # pip-compile --output-file=benchmarks/huggingface/requirements.xpu.txt .pin/tmp-constraints-xpu-hf.txt benchmarks/huggingface/requirements.in # ---extra-index-url https://pypi.ngc.nvidia.com +--extra-index-url https://download.pytorch.org/whl/cpu --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/us/ ---find-links https://storage.googleapis.com/jax-releases/jax_cuda_releases.html ---trusted-host pypi.ngc.nvidia.com antlr4-python3-runtime==4.9.3 # 
via @@ -17,7 +15,7 @@ asttokens==2.4.1 # via # -c .pin/../.pin/constraints-xpu-torch.txt # giving -certifi==2024.6.2 +certifi==2024.7.4 # via # -c .pin/../.pin/constraints-xpu-torch.txt # requests @@ -49,7 +47,7 @@ giving==0.4.2 # -c .pin/../.pin/constraints-xpu-torch.txt # ptera # voir -huggingface-hub==0.24.0 +huggingface-hub==0.24.5 # via # -c .pin/../.pin/constraints-xpu-torch.txt # tokenizers @@ -90,7 +88,7 @@ omegaconf==2.3.0 # via # -c .pin/../.pin/constraints-xpu-torch.txt # voir -ovld==0.3.5 +ovld==0.3.8 # via # -c .pin/../.pin/constraints-xpu-torch.txt # voir @@ -99,6 +97,10 @@ packaging==24.1 # -c .pin/../.pin/constraints-xpu-torch.txt # huggingface-hub # transformers +pillow==10.4.0 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # -r benchmarks/huggingface/requirements.in psutil==5.9.8 # via # -c .pin/../.pin/constraints-xpu-torch.txt @@ -115,7 +117,7 @@ pynvml==11.5.3 # via # -c .pin/../.pin/constraints-xpu-torch.txt # voir -pyyaml==6.0.1 +pyyaml==6.0.2 # via # -c .pin/../.pin/constraints-xpu-torch.txt # huggingface-hub @@ -125,7 +127,7 @@ reactivex==4.0.4 # via # -c .pin/../.pin/constraints-xpu-torch.txt # giving -regex==2024.5.15 +regex==2024.7.24 # via # -c .pin/../.pin/constraints-xpu-torch.txt # transformers @@ -138,7 +140,7 @@ rich==13.7.1 # via # -c .pin/../.pin/constraints-xpu-torch.txt # voir -safetensors==0.4.3 +safetensors==0.4.4 # via # -c .pin/../.pin/constraints-xpu-torch.txt # transformers @@ -146,7 +148,7 @@ six==1.16.0 # via # -c .pin/../.pin/constraints-xpu-torch.txt # asttokens -sympy==1.13.0 +sympy==1.13.1 # via # -c .pin/../.pin/constraints-xpu-torch.txt # torch @@ -154,17 +156,18 @@ tokenizers==0.19.1 # via # -c .pin/../.pin/constraints-xpu-torch.txt # transformers -torch==2.1.0.post2+cxx11.abi +torch==2.4.0+cpu # via # -c .pin/../.pin/constraints-xpu-torch.txt + # -c .pin/../constraints/extra/torch.xpu.txt # -c .pin/../constraints/xpu.txt # -r benchmarks/huggingface/requirements.in -tqdm==4.66.4 +tqdm==4.66.5 # via # -c .pin/../.pin/constraints-xpu-torch.txt # huggingface-hub # transformers -transformers==4.42.4 +transformers==4.44.0 # via # -c .pin/../.pin/constraints-xpu-torch.txt # -r benchmarks/huggingface/requirements.in @@ -174,7 +177,7 @@ typing-extensions==4.12.2 # huggingface-hub # reactivex # torch -urllib3==1.26.19 +urllib3==2.2.2 # via # -c .pin/../.pin/constraints-xpu-torch.txt # requests @@ -185,5 +188,6 @@ varname==0.10.0 voir==0.2.19 # via # -c .pin/../.pin/constraints-xpu-torch.txt + # -c .pin/../constraints/extra/torch.xpu.txt # -c .pin/../constraints/xpu.txt # -r benchmarks/huggingface/requirements.in diff --git a/benchmarks/huggingface/tunableop_results0.csv b/benchmarks/huggingface/tunableop_results0.csv new file mode 100644 index 000000000..6a38d561a --- /dev/null +++ b/benchmarks/huggingface/tunableop_results0.csv @@ -0,0 +1,17 @@ +Validator,PT_VERSION,2.4.0 +Validator,ROCBLAS_VERSION,4.0.0-88df9726-dirty +Validator,HIPBLASLT_VERSION,0.6.0-592518e7 +Validator,ROCM_VERSION,6.0.0.0-91-08e5094 +Validator,GCN_ARCH_NAME,gfx942:sramecc+:xnack- +GemmTunableOp_float_NT,nt_768_3072_16384,Gemm_Rocblas_69720,0.751226 +GemmTunableOp_float_NT,nt_3072_768_16384,Gemm_Rocblas_69733,0.684042 +GemmTunableOp_float_NT,nt_768_768_16384,Gemm_Hipblaslt_NT_28806,0.264226 +GemmTunableOp_float_NT,nt_768_30522_16384,Gemm_Hipblaslt_NT_27808,5.73919 +GemmTunableOp_float_NN,nn_768_16384_3072,Gemm_Hipblaslt_NN_33293,0.701076 +GemmTunableOp_float_NN,nn_768_16384_768,Gemm_Hipblaslt_NN_33685,0.209309 
+GemmTunableOp_float_NN,nn_3072_16384_768,Gemm_Hipblaslt_NN_33225,0.69655 +GemmTunableOp_float_NN,nn_768_16384_30522,Gemm_Hipblaslt_NN_33924,5.81957 +GemmTunableOp_float_TN,tn_30522_16384_768,Default,6.06459 +GemmTunableOp_float_TN,tn_768_16384_3072,Gemm_Hipblaslt_TN_34830,0.584625 +GemmTunableOp_float_TN,tn_3072_16384_768,Gemm_Rocblas_69037,0.742789 +GemmTunableOp_float_TN,tn_768_16384_768,Gemm_Rocblas_69047,0.211827 diff --git a/benchmarks/huggingface/voirfile.py b/benchmarks/huggingface/voirfile.py index 0ed042a80..b2e3ddd14 100644 --- a/benchmarks/huggingface/voirfile.py +++ b/benchmarks/huggingface/voirfile.py @@ -25,7 +25,7 @@ class Config: stop: int = 20 # Number of seconds between each gpu poll - gpu_poll: int = 3 + gpu_poll: int = 1 @configurable diff --git a/benchmarks/lightning/requirements.cuda.txt b/benchmarks/lightning/requirements.cuda.txt index 88c732317..80f6b7f42 100644 --- a/benchmarks/lightning/requirements.cuda.txt +++ b/benchmarks/lightning/requirements.cuda.txt @@ -9,11 +9,11 @@ --find-links https://storage.googleapis.com/jax-releases/jax_cuda_releases.html --trusted-host pypi.ngc.nvidia.com -aiohappyeyeballs==2.3.4 +aiohappyeyeballs==2.4.0 # via # -c .pin/../.pin/constraints-cuda-torch.txt # aiohttp -aiohttp==3.10.0 +aiohttp==3.10.5 # via # -c .pin/../.pin/constraints-cuda-torch.txt # fsspec @@ -33,7 +33,7 @@ async-timeout==4.0.3 # via # -c .pin/../.pin/constraints-cuda-torch.txt # aiohttp -attrs==23.2.0 +attrs==24.2.0 # via # -c .pin/../.pin/constraints-cuda-torch.txt # aiohttp @@ -55,7 +55,7 @@ frozenlist==1.4.1 # -c .pin/../.pin/constraints-cuda-torch.txt # aiohttp # aiosignal -fsspec[http]==2024.5.0 +fsspec[http]==2024.6.1 # via # -c .pin/../.pin/constraints-cuda-torch.txt # lightning @@ -66,11 +66,11 @@ giving==0.4.2 # -c .pin/../.pin/constraints-cuda-torch.txt # ptera # voir -idna==3.7 +idna==3.8 # via # -c .pin/../.pin/constraints-cuda-torch.txt # yarl -importlib-resources==6.4.0 +importlib-resources==6.4.4 # via # -c .pin/../.pin/constraints-cuda-torch.txt # torchcompat @@ -78,7 +78,7 @@ jinja2==3.1.4 # via # -c .pin/../.pin/constraints-cuda-torch.txt # torch -lightning==2.3.3 +lightning==2.4.0 # via # -c .pin/../.pin/constraints-cuda-torch.txt # -r benchmarks/lightning/requirements.in @@ -116,8 +116,6 @@ networkx==3.3 numpy==1.26.4 # via # -c .pin/../.pin/constraints-cuda-torch.txt - # lightning - # pytorch-lightning # torchmetrics # torchvision nvidia-cublas-cu12==12.1.3.1 @@ -159,6 +157,10 @@ nvidia-cusparse-cu12==12.1.0.106 # -c .pin/../.pin/constraints-cuda-torch.txt # nvidia-cusolver-cu12 # torch +nvidia-ml-py==12.560.30 + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # voir nvidia-nccl-cu12==2.20.5 # via # -c .pin/../.pin/constraints-cuda-torch.txt @@ -176,7 +178,7 @@ omegaconf==2.3.0 # via # -c .pin/../.pin/constraints-cuda-torch.txt # voir -ovld==0.3.6 +ovld==0.3.9 # via # -c .pin/../.pin/constraints-cuda-torch.txt # voir @@ -203,15 +205,11 @@ pygments==2.18.0 # via # -c .pin/../.pin/constraints-cuda-torch.txt # rich -pynvml==11.5.3 - # via - # -c .pin/../.pin/constraints-cuda-torch.txt - # voir -pytorch-lightning==2.3.3 +pytorch-lightning==2.4.0 # via # -c .pin/../.pin/constraints-cuda-torch.txt # lightning -pyyaml==6.0.1 +pyyaml==6.0.2 # via # -c .pin/../.pin/constraints-cuda-torch.txt # lightning @@ -221,7 +219,7 @@ reactivex==4.0.4 # via # -c .pin/../.pin/constraints-cuda-torch.txt # giving -rich==13.7.1 +rich==13.8.0 # via # -c .pin/../.pin/constraints-cuda-torch.txt # voir @@ -229,7 +227,7 @@ six==1.16.0 # via # -c 
.pin/../.pin/constraints-cuda-torch.txt # asttokens -sympy==1.13.1 +sympy==1.13.2 # via # -c .pin/../.pin/constraints-cuda-torch.txt # torch @@ -246,7 +244,7 @@ torchcompat==1.1.4 # -c .pin/../.pin/constraints-cuda-torch.txt # -c .pin/../constraints/cuda.txt # -r benchmarks/lightning/requirements.in -torchmetrics==1.4.0.post0 +torchmetrics==1.4.1 # via # -c .pin/../.pin/constraints-cuda-torch.txt # lightning @@ -255,7 +253,7 @@ torchvision==0.19.0+cu121 # via # -c .pin/../.pin/constraints-cuda-torch.txt # -r benchmarks/lightning/requirements.in -tqdm==4.66.4 +tqdm==4.66.5 # via # -c .pin/../.pin/constraints-cuda-torch.txt # lightning diff --git a/benchmarks/lightning/requirements.rocm.txt b/benchmarks/lightning/requirements.rocm.txt new file mode 100644 index 000000000..26fdcedfa --- /dev/null +++ b/benchmarks/lightning/requirements.rocm.txt @@ -0,0 +1,233 @@ +# +# This file is autogenerated by pip-compile with Python 3.10 +# by the following command: +# +# pip-compile --output-file=benchmarks/lightning/requirements.rocm.txt .pin/tmp-constraints-rocm-lightning-gpus.txt benchmarks/lightning/requirements.in +# +--extra-index-url https://download.pytorch.org/whl/rocm6.0 + +aiohappyeyeballs==2.4.0 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # aiohttp +aiohttp==3.10.5 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # fsspec +aiosignal==1.3.1 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # aiohttp +antlr4-python3-runtime==4.9.3 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # omegaconf +asttokens==2.4.1 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # giving +async-timeout==4.0.3 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # aiohttp +attrs==24.2.0 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # aiohttp +codefind==0.1.6 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # ptera +executing==1.2.0 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # varname +filelock==3.15.4 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # pytorch-triton-rocm + # torch +frozenlist==1.4.1 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # aiohttp + # aiosignal +fsspec[http]==2024.6.1 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # lightning + # pytorch-lightning + # torch +giving==0.4.2 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # ptera + # voir +idna==3.7 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # yarl +importlib-resources==6.4.3 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # torchcompat +jinja2==3.1.4 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # torch +lightning==2.4.0 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # -r benchmarks/lightning/requirements.in +lightning-utilities==0.11.6 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # lightning + # pytorch-lightning + # torchmetrics +markdown-it-py==3.0.0 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # rich +markupsafe==2.1.5 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # jinja2 +mdurl==0.1.2 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # markdown-it-py +mpmath==1.3.0 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # sympy +multidict==6.0.5 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # aiohttp + # yarl +networkx==3.3 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # torch +numpy==1.26.4 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # torchmetrics + # torchvision +omegaconf==2.3.0 + # via + # -c 
.pin/../.pin/constraints-rocm-torch.txt + # voir +ovld==0.3.8 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # voir +packaging==24.1 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # lightning + # lightning-utilities + # pytorch-lightning + # torchmetrics +pillow==10.4.0 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # torchvision +psutil==5.9.8 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # voir +ptera==1.4.1 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # voir +pygments==2.18.0 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # rich +pynvml==11.5.3 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # voir +pytorch-lightning==2.4.0 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # lightning +pytorch-triton-rocm==3.0.0 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # torch +pyyaml==6.0.2 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # lightning + # omegaconf + # pytorch-lightning +reactivex==4.0.4 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # giving +rich==13.7.1 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # voir +six==1.16.0 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # asttokens +sympy==1.13.2 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # torch +torch==2.4.0+rocm6.0 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # -r benchmarks/lightning/requirements.in + # lightning + # pytorch-lightning + # torchmetrics + # torchvision +torchcompat==1.1.4 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # -c .pin/../constraints/rocm.txt + # -r benchmarks/lightning/requirements.in +torchmetrics==1.4.1 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # lightning + # pytorch-lightning +torchvision==0.19.0+rocm6.0 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # -r benchmarks/lightning/requirements.in +tqdm==4.66.5 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # lightning + # pytorch-lightning +typing-extensions==4.12.2 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # lightning + # lightning-utilities + # pytorch-lightning + # reactivex + # torch +varname==0.10.0 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # giving +voir==0.2.17 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # -c .pin/../constraints/rocm.txt + # -r benchmarks/lightning/requirements.in +yarl==1.9.4 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # aiohttp + +# The following packages are considered to be unsafe in a requirements file: +# setuptools diff --git a/benchmarks/lightning/requirements.xpu.txt b/benchmarks/lightning/requirements.xpu.txt new file mode 100644 index 000000000..338ee0fb9 --- /dev/null +++ b/benchmarks/lightning/requirements.xpu.txt @@ -0,0 +1,235 @@ +# +# This file is autogenerated by pip-compile with Python 3.10 +# by the following command: +# +# pip-compile --output-file=benchmarks/lightning/requirements.xpu.txt .pin/tmp-constraints-xpu-lightning-gpus.txt benchmarks/lightning/requirements.in +# +--extra-index-url https://download.pytorch.org/whl/cpu +--extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/us/ + +aiohappyeyeballs==2.3.5 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # aiohttp +aiohttp==3.10.2 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # fsspec +aiosignal==1.3.1 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # aiohttp +antlr4-python3-runtime==4.9.3 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # omegaconf 
+asttokens==2.4.1 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # giving +async-timeout==4.0.3 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # aiohttp +attrs==24.2.0 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # aiohttp +codefind==0.1.6 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # ptera +executing==1.2.0 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # varname +filelock==3.15.4 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # torch +frozenlist==1.4.1 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # aiohttp + # aiosignal +fsspec[http]==2024.5.0 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # lightning + # pytorch-lightning + # torch +giving==0.4.2 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # ptera + # voir +idna==3.7 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # yarl +importlib-resources==6.4.0 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # torchcompat +jinja2==3.1.4 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # torch +lightning==2.4.0 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # -r benchmarks/lightning/requirements.in +lightning-utilities==0.11.6 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # lightning + # pytorch-lightning + # torchmetrics +markdown-it-py==3.0.0 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # rich +markupsafe==2.1.5 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # jinja2 +mdurl==0.1.2 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # markdown-it-py +mpmath==1.3.0 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # sympy +multidict==6.0.5 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # aiohttp + # yarl +networkx==3.3 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # torch +numpy==1.26.4 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # torchmetrics + # torchvision +omegaconf==2.3.0 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # voir +ovld==0.3.8 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # voir +packaging==24.1 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # lightning + # lightning-utilities + # pytorch-lightning + # torchmetrics +pillow==10.4.0 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # torchvision +psutil==5.9.8 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # voir +ptera==1.4.1 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # voir +pygments==2.18.0 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # rich +pynvml==11.5.3 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # voir +pytorch-lightning==2.4.0 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # lightning +pyyaml==6.0.2 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # lightning + # omegaconf + # pytorch-lightning +reactivex==4.0.4 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # giving +rich==13.7.1 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # voir +six==1.16.0 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # asttokens +sympy==1.13.1 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # torch +torch==2.4.0+cpu + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # -c .pin/../constraints/extra/torch.xpu.txt + # -c .pin/../constraints/xpu.txt + # -r benchmarks/lightning/requirements.in + # lightning + # pytorch-lightning + # torchmetrics + # torchvision +torchcompat==1.1.4 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # -c .pin/../constraints/extra/torch.xpu.txt + # 
-c .pin/../constraints/xpu.txt + # -r benchmarks/lightning/requirements.in +torchmetrics==1.4.1 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # lightning + # pytorch-lightning +torchvision==0.19.0+cpu + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # -c .pin/../constraints/extra/torch.xpu.txt + # -c .pin/../constraints/xpu.txt + # -r benchmarks/lightning/requirements.in +tqdm==4.66.5 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # lightning + # pytorch-lightning +typing-extensions==4.12.2 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # lightning + # lightning-utilities + # pytorch-lightning + # reactivex + # torch +varname==0.10.0 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # giving +voir==0.2.17 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # -c .pin/../constraints/extra/torch.xpu.txt + # -c .pin/../constraints/xpu.txt + # -r benchmarks/lightning/requirements.in +yarl==1.9.4 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # aiohttp + +# The following packages are considered to be unsafe in a requirements file: +# setuptools diff --git a/benchmarks/llama/main.py b/benchmarks/llama/main.py index 696d16288..a17053296 100755 --- a/benchmarks/llama/main.py +++ b/benchmarks/llama/main.py @@ -83,6 +83,7 @@ def huggingface_main(args, model, config): # we just instantiate an untrained one println("Model") device = accelerator.fetch_device(0) + print(device) if args.pretrained: model = LlamaForCausalLM.from_pretrained(config["_name_or_path"]).to(device=device) @@ -93,7 +94,7 @@ def huggingface_main(args, model, config): pipeline = transformers.pipeline( "text-generation", model=model, - torch_dtype=torch.float16, + torch_dtype=torch.bfloat16, # device_map="cuda", tokenizer=tokenizer, device=device, diff --git a/benchmarks/llama/requirements.cuda.txt b/benchmarks/llama/requirements.cuda.txt index 98e990076..a26e5bec4 100644 --- a/benchmarks/llama/requirements.cuda.txt +++ b/benchmarks/llama/requirements.cuda.txt @@ -9,11 +9,11 @@ --find-links https://storage.googleapis.com/jax-releases/jax_cuda_releases.html --trusted-host pypi.ngc.nvidia.com -aiohappyeyeballs==2.3.4 +aiohappyeyeballs==2.4.0 # via # -c .pin/../.pin/constraints-cuda-torch.txt # aiohttp -aiohttp==3.10.0 +aiohttp==3.10.5 # via # -c .pin/../.pin/constraints-cuda-torch.txt # datasets @@ -34,7 +34,7 @@ async-timeout==4.0.3 # via # -c .pin/../.pin/constraints-cuda-torch.txt # aiohttp -attrs==23.2.0 +attrs==24.2.0 # via # -c .pin/../.pin/constraints-cuda-torch.txt # aiohttp @@ -50,7 +50,7 @@ codefind==0.1.6 # via # -c .pin/../.pin/constraints-cuda-torch.txt # ptera -datasets==2.20.0 +datasets==2.21.0 # via # -c .pin/../.pin/constraints-cuda-torch.txt # -r benchmarks/llama/requirements.in @@ -84,7 +84,7 @@ frozenlist==1.4.1 # -c .pin/../.pin/constraints-cuda-torch.txt # aiohttp # aiosignal -fsspec[http]==2024.5.0 +fsspec[http]==2024.6.1 # via # -c .pin/../.pin/constraints-cuda-torch.txt # datasets @@ -95,13 +95,13 @@ giving==0.4.2 # -c .pin/../.pin/constraints-cuda-torch.txt # ptera # voir -huggingface-hub==0.24.5 +huggingface-hub==0.24.6 # via # -c .pin/../.pin/constraints-cuda-torch.txt # datasets # tokenizers # transformers -idna==3.7 +idna==3.8 # via # -c .pin/../.pin/constraints-cuda-torch.txt # requests @@ -186,6 +186,10 @@ nvidia-cusparse-cu12==12.1.0.106 # -c .pin/../.pin/constraints-cuda-torch.txt # nvidia-cusolver-cu12 # torch +nvidia-ml-py==12.560.30 + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # voir nvidia-nccl-cu12==2.20.5 # via # -c 
.pin/../.pin/constraints-cuda-torch.txt @@ -203,7 +207,7 @@ omegaconf==2.3.0 # via # -c .pin/../.pin/constraints-cuda-torch.txt # voir -ovld==0.3.6 +ovld==0.3.9 # via # -c .pin/../.pin/constraints-cuda-torch.txt # voir @@ -229,18 +233,10 @@ pyarrow==17.0.0 # via # -c .pin/../.pin/constraints-cuda-torch.txt # datasets -pyarrow-hotfix==0.6 - # via - # -c .pin/../.pin/constraints-cuda-torch.txt - # datasets pygments==2.18.0 # via # -c .pin/../.pin/constraints-cuda-torch.txt # rich -pynvml==11.5.3 - # via - # -c .pin/../.pin/constraints-cuda-torch.txt - # voir python-dateutil==2.9.0.post0 # via # -c .pin/../.pin/constraints-cuda-torch.txt @@ -249,7 +245,7 @@ pytz==2024.1 # via # -c .pin/../.pin/constraints-cuda-torch.txt # pandas -pyyaml==6.0.1 +pyyaml==6.0.2 # via # -c .pin/../.pin/constraints-cuda-torch.txt # datasets @@ -270,11 +266,11 @@ requests==2.32.3 # datasets # huggingface-hub # transformers -rich==13.7.1 +rich==13.8.0 # via # -c .pin/../.pin/constraints-cuda-torch.txt # voir -safetensors==0.4.3 +safetensors==0.4.4 # via # -c .pin/../.pin/constraints-cuda-torch.txt # transformers @@ -288,7 +284,7 @@ six==1.16.0 # asttokens # fire # python-dateutil -sympy==1.13.1 +sympy==1.13.2 # via # -c .pin/../.pin/constraints-cuda-torch.txt # torch @@ -305,13 +301,13 @@ torch==2.4.0+cu121 # -c .pin/../.pin/constraints-cuda-torch.txt # -r benchmarks/llama/requirements.in # fairscale -tqdm==4.66.4 +tqdm==4.66.5 # via # -c .pin/../.pin/constraints-cuda-torch.txt # datasets # huggingface-hub # transformers -transformers==4.43.3 +transformers==4.44.2 # via # -c .pin/../.pin/constraints-cuda-torch.txt # -r benchmarks/llama/requirements.in @@ -342,7 +338,7 @@ voir==0.2.19 # -c .pin/../.pin/constraints-cuda-torch.txt # -c .pin/../constraints/cuda.txt # -r benchmarks/llama/requirements.in -xxhash==3.4.1 +xxhash==3.5.0 # via # -c .pin/../.pin/constraints-cuda-torch.txt # datasets diff --git a/benchmarks/llama/requirements.rocm.txt b/benchmarks/llama/requirements.rocm.txt index 25293406d..97c44bb0c 100644 --- a/benchmarks/llama/requirements.rocm.txt +++ b/benchmarks/llama/requirements.rocm.txt @@ -4,12 +4,13 @@ # # pip-compile --output-file=benchmarks/llama/requirements.rocm.txt .pin/tmp-constraints-rocm-llm.txt benchmarks/llama/requirements.in # ---extra-index-url https://pypi.ngc.nvidia.com --extra-index-url https://download.pytorch.org/whl/rocm6.0 ---find-links https://storage.googleapis.com/jax-releases/jax_cuda_releases.html ---trusted-host pypi.ngc.nvidia.com -aiohttp==3.9.5 +aiohappyeyeballs==2.4.0 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # aiohttp +aiohttp==3.10.5 # via # -c .pin/../.pin/constraints-rocm-torch.txt # datasets @@ -30,11 +31,11 @@ async-timeout==4.0.3 # via # -c .pin/../.pin/constraints-rocm-torch.txt # aiohttp -attrs==23.2.0 +attrs==24.2.0 # via # -c .pin/../.pin/constraints-rocm-torch.txt # aiohttp -certifi==2024.6.2 +certifi==2024.7.4 # via # -c .pin/../.pin/constraints-rocm-torch.txt # requests @@ -46,7 +47,7 @@ codefind==0.1.6 # via # -c .pin/../.pin/constraints-rocm-torch.txt # ptera -datasets==2.20.0 +datasets==2.21.0 # via # -c .pin/../.pin/constraints-rocm-torch.txt # -r benchmarks/llama/requirements.in @@ -80,7 +81,7 @@ frozenlist==1.4.1 # -c .pin/../.pin/constraints-rocm-torch.txt # aiohttp # aiosignal -fsspec[http]==2024.5.0 +fsspec[http]==2024.6.1 # via # -c .pin/../.pin/constraints-rocm-torch.txt # datasets @@ -91,7 +92,7 @@ giving==0.4.2 # -c .pin/../.pin/constraints-rocm-torch.txt # ptera # voir -huggingface-hub==0.23.5 +huggingface-hub==0.24.6 # via # -c 
.pin/../.pin/constraints-rocm-torch.txt # datasets @@ -147,7 +148,7 @@ omegaconf==2.3.0 # via # -c .pin/../.pin/constraints-rocm-torch.txt # voir -ovld==0.3.5 +ovld==0.3.8 # via # -c .pin/../.pin/constraints-rocm-torch.txt # voir @@ -173,10 +174,6 @@ pyarrow==17.0.0 # via # -c .pin/../.pin/constraints-rocm-torch.txt # datasets -pyarrow-hotfix==0.6 - # via - # -c .pin/../.pin/constraints-rocm-torch.txt - # datasets pygments==2.18.0 # via # -c .pin/../.pin/constraints-rocm-torch.txt @@ -189,7 +186,7 @@ python-dateutil==2.9.0.post0 # via # -c .pin/../.pin/constraints-rocm-torch.txt # pandas -pytorch-triton-rocm==2.3.1 +pytorch-triton-rocm==3.0.0 # via # -c .pin/../.pin/constraints-rocm-torch.txt # torch @@ -197,7 +194,7 @@ pytz==2024.1 # via # -c .pin/../.pin/constraints-rocm-torch.txt # pandas -pyyaml==6.0.1 +pyyaml==6.0.2 # via # -c .pin/../.pin/constraints-rocm-torch.txt # datasets @@ -208,7 +205,7 @@ reactivex==4.0.4 # via # -c .pin/../.pin/constraints-rocm-torch.txt # giving -regex==2024.5.15 +regex==2024.7.24 # via # -c .pin/../.pin/constraints-rocm-torch.txt # transformers @@ -222,7 +219,7 @@ rich==13.7.1 # via # -c .pin/../.pin/constraints-rocm-torch.txt # voir -safetensors==0.4.3 +safetensors==0.4.4 # via # -c .pin/../.pin/constraints-rocm-torch.txt # transformers @@ -236,7 +233,7 @@ six==1.16.0 # asttokens # fire # python-dateutil -sympy==1.13.0 +sympy==1.13.2 # via # -c .pin/../.pin/constraints-rocm-torch.txt # torch @@ -248,18 +245,18 @@ tokenizers==0.19.1 # via # -c .pin/../.pin/constraints-rocm-torch.txt # transformers -torch==2.3.1+rocm6.0 +torch==2.4.0+rocm6.0 # via # -c .pin/../.pin/constraints-rocm-torch.txt # -r benchmarks/llama/requirements.in # fairscale -tqdm==4.66.4 +tqdm==4.66.5 # via # -c .pin/../.pin/constraints-rocm-torch.txt # datasets # huggingface-hub # transformers -transformers==4.42.4 +transformers==4.44.1 # via # -c .pin/../.pin/constraints-rocm-torch.txt # -r benchmarks/llama/requirements.in @@ -273,7 +270,7 @@ tzdata==2024.1 # via # -c .pin/../.pin/constraints-rocm-torch.txt # pandas -urllib3==1.26.19 +urllib3==2.2.2 # via # -c .pin/../.pin/constraints-rocm-torch.txt # requests @@ -286,7 +283,7 @@ voir==0.2.19 # -c .pin/../.pin/constraints-rocm-torch.txt # -c .pin/../constraints/rocm.txt # -r benchmarks/llama/requirements.in -xxhash==3.4.1 +xxhash==3.5.0 # via # -c .pin/../.pin/constraints-rocm-torch.txt # datasets diff --git a/benchmarks/llama/requirements.xpu.txt b/benchmarks/llama/requirements.xpu.txt index 9ef2b6f8d..b852606fd 100644 --- a/benchmarks/llama/requirements.xpu.txt +++ b/benchmarks/llama/requirements.xpu.txt @@ -4,12 +4,14 @@ # # pip-compile --output-file=benchmarks/llama/requirements.xpu.txt .pin/tmp-constraints-xpu-llm.txt benchmarks/llama/requirements.in # ---extra-index-url https://pypi.ngc.nvidia.com +--extra-index-url https://download.pytorch.org/whl/cpu --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/us/ ---find-links https://storage.googleapis.com/jax-releases/jax_cuda_releases.html ---trusted-host pypi.ngc.nvidia.com -aiohttp==3.9.5 +aiohappyeyeballs==2.3.5 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # aiohttp +aiohttp==3.10.2 # via # -c .pin/../.pin/constraints-xpu-torch.txt # datasets @@ -30,11 +32,11 @@ async-timeout==4.0.3 # via # -c .pin/../.pin/constraints-xpu-torch.txt # aiohttp -attrs==23.2.0 +attrs==24.2.0 # via # -c .pin/../.pin/constraints-xpu-torch.txt # aiohttp -certifi==2024.6.2 +certifi==2024.7.4 # via # -c .pin/../.pin/constraints-xpu-torch.txt # requests @@ -90,7 +92,7 @@ 
giving==0.4.2 # -c .pin/../.pin/constraints-xpu-torch.txt # ptera # voir -huggingface-hub==0.24.0 +huggingface-hub==0.24.5 # via # -c .pin/../.pin/constraints-xpu-torch.txt # datasets @@ -146,7 +148,7 @@ omegaconf==2.3.0 # via # -c .pin/../.pin/constraints-xpu-torch.txt # voir -ovld==0.3.5 +ovld==0.3.8 # via # -c .pin/../.pin/constraints-xpu-torch.txt # voir @@ -192,7 +194,7 @@ pytz==2024.1 # via # -c .pin/../.pin/constraints-xpu-torch.txt # pandas -pyyaml==6.0.1 +pyyaml==6.0.2 # via # -c .pin/../.pin/constraints-xpu-torch.txt # datasets @@ -203,7 +205,7 @@ reactivex==4.0.4 # via # -c .pin/../.pin/constraints-xpu-torch.txt # giving -regex==2024.5.15 +regex==2024.7.24 # via # -c .pin/../.pin/constraints-xpu-torch.txt # transformers @@ -217,7 +219,7 @@ rich==13.7.1 # via # -c .pin/../.pin/constraints-xpu-torch.txt # voir -safetensors==0.4.3 +safetensors==0.4.4 # via # -c .pin/../.pin/constraints-xpu-torch.txt # transformers @@ -231,7 +233,7 @@ six==1.16.0 # asttokens # fire # python-dateutil -sympy==1.13.0 +sympy==1.13.1 # via # -c .pin/../.pin/constraints-xpu-torch.txt # torch @@ -243,19 +245,20 @@ tokenizers==0.19.1 # via # -c .pin/../.pin/constraints-xpu-torch.txt # transformers -torch==2.1.0.post2+cxx11.abi +torch==2.4.0+cpu # via # -c .pin/../.pin/constraints-xpu-torch.txt + # -c .pin/../constraints/extra/torch.xpu.txt # -c .pin/../constraints/xpu.txt # -r benchmarks/llama/requirements.in # fairscale -tqdm==4.66.4 +tqdm==4.66.5 # via # -c .pin/../.pin/constraints-xpu-torch.txt # datasets # huggingface-hub # transformers -transformers==4.42.4 +transformers==4.44.0 # via # -c .pin/../.pin/constraints-xpu-torch.txt # -r benchmarks/llama/requirements.in @@ -269,7 +272,7 @@ tzdata==2024.1 # via # -c .pin/../.pin/constraints-xpu-torch.txt # pandas -urllib3==1.26.19 +urllib3==2.2.2 # via # -c .pin/../.pin/constraints-xpu-torch.txt # requests @@ -280,6 +283,7 @@ varname==0.10.0 voir==0.2.19 # via # -c .pin/../.pin/constraints-xpu-torch.txt + # -c .pin/../constraints/extra/torch.xpu.txt # -c .pin/../constraints/xpu.txt # -r benchmarks/llama/requirements.in xxhash==3.4.1 diff --git a/benchmarks/llm/benchfile.py b/benchmarks/llm/benchfile.py index 1537ad556..6f8cadeee 100644 --- a/benchmarks/llm/benchfile.py +++ b/benchmarks/llm/benchfile.py @@ -1,7 +1,7 @@ from milabench.pack import Package -from milabench.commands import TorchrunAllGPU +from milabench.commands import TorchrunAllGPU, TorchrunAllNodes, ForeachNode from milabench.pack import BasePackage from milabench.commands import SimpleCommand @@ -15,7 +15,18 @@ def executable(self): # return True def __init__(self, pack: BasePackage, *torchrun_args, **kwargs): - super().__init__(pack, *torchrun_args, module=False, **kwargs) + super().__init__(pack, "run", *torchrun_args, module=False, **kwargs) + + +class TorchtuneAllNodes(TorchrunAllNodes): + def __init__(self, executor, *args, **kwargs) -> None: + base_exec = TorchrunAllNodes.make_base_executor( + Torchtune, + executor, + *args, + **kwargs + ) + ForeachNode.__init__(self, base_exec) class Llm(Package): @@ -31,7 +42,7 @@ async def install(self): def build_run_plan(self): exec = SimpleCommand(self) - return Torchtune(exec, "run").use_stdout() + return TorchtuneAllNodes(exec).use_stdout() __pack__ = Llm diff --git a/benchmarks/llm/dev.yaml b/benchmarks/llm/dev.yaml index 44386f209..e965769b1 100644 --- a/benchmarks/llm/dev.yaml +++ b/benchmarks/llm/dev.yaml @@ -13,6 +13,27 @@ _llm: method: per_gpu +llm-rlhf-single: + inherits: _llm + definition: . 
+  install-variant: unpinned
+  plan:
+    method: per_gpu
+
+  argv:
+    "{milabench_code}/recipes/lora_finetune_single_device.py": true
+    --config: "{milabench_code}/configs/llama3_8B_lora_single_device.yaml"
+    epochs=1: true
+    output_dir={milabench_extra}/output: true
+    tokenizer.path={milabench_data}/llama3_8B/original/tokenizer.model: true
+    checkpointer.checkpoint_dir={milabench_data}/llama3_8B/original: true
+    checkpointer.output_dir={milabench_data}/llama3_8B/: true
+    metric_logger.log_dir={milabench_extra}/metrics: true
+    repo_id="meta-llama/Meta-Llama-3.1-8B": true
+    batch_size=8: true
+    gradient_accumulation_steps=8: true
+
+
 llm-lora-single:
   inherits: _llm
   definition: .
diff --git a/benchmarks/llm/recipes/ppo_full_finetune_single_device.py b/benchmarks/llm/recipes/ppo_full_finetune_single_device.py
new file mode 100644
index 000000000..8ee77c06a
--- /dev/null
+++ b/benchmarks/llm/recipes/ppo_full_finetune_single_device.py
@@ -0,0 +1,1084 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+import math
+import os
+import sys
+from functools import partial
+from itertools import chain
+from typing import Any, Dict, List, Optional, Tuple
+from warnings import warn
+
+import torch
+from omegaconf import DictConfig, ListConfig
+from torch import nn
+from torch.optim import Optimizer
+from torch.utils.data import DataLoader, DistributedSampler
+from torchtune import config, modules, utils
+from torchtune.datasets import ConcatDataset
+from torchtune.modules import rlhf
+from torchtune.modules.rlhf import PPOStats, Trajectory
+from torchtune.recipe_interfaces import FTRecipeInterface
+from tqdm import tqdm
+
+
+log = utils.get_logger("DEBUG")
+
+
+class PPOFullFinetuneRecipeSingleDevice(FTRecipeInterface):
+    """
+    Full finetuning recipe for RLHF with PPO for dense transformer-based LLMs such as Llama2. This recipe is optimized
+    for single GPU training. Training on CPU is not supported.
+
+    This implementation is based on `Learning to summarize from human feedback
+    <https://arxiv.org/abs/2009.01325>`_.
+    """
+
+    def __init__(self, cfg: DictConfig) -> None:
+
+        self._device = utils.get_device(device=cfg.device)
+        self._dtype = utils.get_dtype(cfg.dtype, device=self._device)
+
+        # Disable for fp16, as we haven't validated "full" fp16 with this recipe, nor
+        # enabled necessary features such as gradient scaling.
+        if self._dtype == torch.float16:
+            raise RuntimeError(
+                "full fp16 training is not supported with this recipe. Please use bf16 or fp32 instead."
+            )
+
+        # logging attributes
+        self._output_dir = cfg.output_dir
+        self._log_every_n_steps = cfg.get("log_every_n_steps", 1)
+        self._log_peak_memory_stats = cfg.get("log_peak_memory_stats", False)
+
+        # These are public properties which are updated by the checkpoint loader
+        # when ``resume_from_checkpoint`` is `True` or validated in tests
+        self.seed = utils.set_seed(seed=cfg.seed)
+        # manually setting up a generator for the recipe
+        self._rng = torch.Generator(self._device).manual_seed(self.seed)
+        self._total_steps = 0
+        self._steps_run = 0
+        self._total_epochs = 0
+        self._epochs_run = 0
+        self.global_step = 0
+
+        # Training cfg
+        self._resume_from_checkpoint = cfg.resume_from_checkpoint
+        self._gradient_accumulation_steps = cfg.gradient_accumulation_steps
+
+    def setup(self, cfg: DictConfig) -> None:
+        """
+        Sets up the recipe state correctly. This includes setting recipe attributes based
+        on the ``resume_from_checkpoint`` flag.
+ """ + self._metric_logger = config.instantiate(cfg.metric_logger) + + # log config with parameter override + self._metric_logger.log_config(cfg) + + # setup checkpointers + ( + self._policy_checkpointer, + ref_policy_checkpointer, + self._value_checkpointer, + reward_checkpointer, + ) = self._setup_checkpointers( + cfg.checkpointer, + cfg.ref_policy_checkpointer, + cfg.value_checkpointer, + cfg.reward_checkpointer, + ) + + # load policy checkpoints + policy_model_checkpoint_dict = self._policy_checkpointer.load_checkpoint() + ref_policy_state_dict = ref_policy_checkpointer.load_checkpoint() + + # load reward and value model checkpoints + value_model_checkpoint_dict = self._value_checkpointer.load_checkpoint() + reward_model_state_dict = reward_checkpointer.load_checkpoint() + + # update recipe state + # ``_setup_model`` handles initialization and loading the state dict. This method + # should be called before ``_setup_optimizer`` since transforming the optimizer + # state dict requires the model + self._model_compile = cfg.compile + self._optimizer_in_bwd = cfg.optimizer_in_bwd + ( + self._policy_model, + self._value_model, + self._reward_model, + self._ref_policy_model, + ) = self._setup_model( + cfg_model=cfg.policy_model, + cfg_reward_value_model=cfg.reward_and_value_model, + enable_activation_checkpointing=cfg.enable_activation_checkpointing, + compile_model=self._model_compile, + policy_state_dict=policy_model_checkpoint_dict[utils.MODEL_KEY], + ref_policy_state_dict=ref_policy_state_dict[utils.MODEL_KEY], + value_model_state_dict=value_model_checkpoint_dict[utils.MODEL_KEY], + reward_model_state_dict=reward_model_state_dict[utils.MODEL_KEY], + ) + + # setup tokenizer + self._tokenizer = config.instantiate(cfg.tokenizer) + log.info("Tokenizer is initialized from file.") + + # _setup_optimizer should take in ckpt_dict only if training is resumed from + # checkpoint. Transforming the opt state dict is handled by this method + self._optimizer = self._setup_optimizer( + cfg_optimizer=cfg.optimizer, + optimizer_in_bwd=cfg.optimizer_in_bwd, + opt_state_dict=( + policy_model_checkpoint_dict[utils.OPT_KEY] + if self._resume_from_checkpoint + else None + ), + ) + + self._loss_fn = config.instantiate(cfg.loss) + log.info("Loss is initialized.") + + # sampler and dataloader depends on the tokenizer and should be set + # setup afterit is initialized + self._sampler, self._dataloader = self._setup_data( + cfg_dataset=cfg.dataset, + shuffle=cfg.shuffle, + batch_size=cfg.batch_size, + ) + + self._setup_training_parameters(cfg) + self._setup_training_hyperparameters(cfg) + + if self._resume_from_checkpoint: + self._update_recipe_state(policy_model_checkpoint_dict) + + # one "step" is a single gradient update update over a minibatch of trajectories + self.global_step = ( + self._steps_run + * self._ppo_epochs + * (self.batch_size // self._ppo_batch_size) + ) + + def _setup_training_hyperparameters(self, cfg) -> None: + """ + Sets up the training hyperparameters for the recipe. This includes the GAE hyperparameters, + generation hyperparameters, reward masking hyperparameters, and stop token ids. 
+ """ + + self._kl_coeff = cfg.kl_coeff + # GAE hyperparameters + self._gamma = cfg.gamma + self._lmbda = cfg.lmbda + self._whiten_rewards = cfg.whiten_rewards + + # trajectory generation args + self._temperature = cfg.temperature + self._top_k = cfg.top_k + self._max_generated_tokens = cfg.max_generated_tokens + + # reward masking args + self._min_response_length = cfg.min_response_length + self._penalise_no_eos = cfg.penalise_no_eos + self._reward_penalty = cfg.reward_penalty + + # lots of hand holding for stop tokens + if cfg.get("stop_token_ids", False): + stop_token_ids = cfg.stop_token_ids + if self._tokenizer.eos_id not in stop_token_ids: + warn( + f"tokenizer eos_id ({self._tokenizer.eos_id}) is not in stop_token_ids ({stop_token_ids})." + "This may lead to unexpected behaviour." + ) + else: + if not hasattr(self._tokenizer.stop_tokens): + warn( + "No stop tokens defined in tokenizer, and no stop_token_ids provided. This may lead to unexpected behaviour." + ) + stop_token_ids = [] + else: + stop_token_ids = self._tokenizer.stop_tokens + self._stop_token_ids = torch.tensor(stop_token_ids, device=self._device) + + def _setup_training_parameters(self, cfg: DictConfig) -> None: + """ + Validates and sets up parameters for used during training and for tracking training state, + batch sizes for model forward passes during trajectory generation, PPO minibatches, and + PPO microbatches for gradient accumulation. + + Raises + - ValueError if: + - batch_size is not divisible by forward_batch_size + - batch_size is not divisible by ppo_batch_size + - ppo_batch_size is not divisible by gradient_accumulation_steps + - num_steps is less than batch_size + - gradient_accumulation_steps > 1 and optimizer_in_bwd is True + """ + self.batch_size = cfg.batch_size + self._forward_batch_size = cfg.forward_batch_size + self._ppo_epochs = cfg.ppo_epochs + self._ppo_batch_size = cfg.ppo_batch_size + self._gradient_accumulation_steps = cfg.gradient_accumulation_steps + self._ppo_backward_batch_size = ( + cfg.ppo_batch_size // self._gradient_accumulation_steps + ) + + if self.batch_size % self._forward_batch_size != 0: + raise ValueError( + f"batch_size ({self.batch_size}) must be exactly divisible by " + f"forward_batch_size ({self._forward_batch_size})." + ) + if self.batch_size % self._ppo_batch_size != 0: + raise ValueError( + f"batch_size ({self.batch_size}) must be exactly divisible by " + f"ppo_batch_size ({self._ppo_batch_size})." + ) + if self._ppo_batch_size % self._gradient_accumulation_steps != 0: + raise ValueError( + f"ppo_batch_size ({self._ppo_batch_size}) must be exactly divisible " + f"by gradient_accumulation_steps ({self._gradient_accumulation_steps})." + ) + + if self._gradient_accumulation_steps > 1 and self._optimizer_in_bwd: + raise RuntimeError( + "Gradient accumulation is not supported with optimizer in bwd." + "Please set gradient_accumulation_steps=1, or optimizer_in_bwd=False." + ) + + self._total_steps = cfg.num_steps // self.batch_size + batches_per_epoch = max( + 1, len(self._dataloader) + ) # when we only have a single batch in the dataset + + self._total_epochs = math.ceil(self._total_steps / batches_per_epoch) + if self._total_steps == 0: + raise ValueError( + f"num_steps {cfg.num_steps} must be greater than the batch size {self.batch_size}." + ) + if self._total_steps < len(self._dataloader): + warn( + f"There are fewer total steps ({self._total_steps}, (num_steps//batch_size) " + f"than there are batches ({len(self._dataloader)}) in the dataset. 
" + f"Training will stop after ({self._total_steps}) steps without saving intermediate checkpoints" + ) + if (self._total_steps > batches_per_epoch) and ( + self._total_steps % batches_per_epoch != 0 + ): + warn( + f"num_steps ({cfg.num_steps}) is not exactly divisible by " + f"the number of batches in the dataset ({batches_per_epoch}). " + f"Intermediate checkpoints will only be saved every {batches_per_epoch} steps." + ) + log.info( + f"Total steps to run: {self._total_steps}, Total epochs to run: {self._total_epochs}" + ) + + def _setup_checkpointers( + self, + policy_cfg: DictConfig, + ref_policy_cfg: DictConfig, + value_cfg: DictConfig, + reward_cfg: DictConfig, + ) -> Tuple[ + utils.Checkpointer, utils.Checkpointer, utils.Checkpointer, utils.Checkpointer + ]: + """ + Sets up checkpointers for policy, reference policy, value, and reward models. + Only the policy checkpoint handles recipe state for resuming from checkpoints. + """ + + if not self._resume_from_checkpoint: + assert policy_cfg.checkpoint_dir == ref_policy_cfg.checkpoint_dir, ( + "Policy and reference policy should be loaded from the same checkpoint directories" + f"at the start of training. Found: {policy_cfg.checkpoint_dir} and" + f"{ref_policy_cfg.checkpoint_dir}" + ) + assert policy_cfg.checkpoint_files == ref_policy_cfg.checkpoint_files, ( + "Policy and reference policy should be loaded from the same checkpoint files" + f"at the start of training. Found: {policy_cfg.checkpoint_files} and" + f"{ref_policy_cfg.checkpoint_files}" + ) + + policy_checkpointer = config.instantiate( + policy_cfg, + resume_from_checkpoint=self._resume_from_checkpoint, + ) + + ref_policy_checkpointer = config.instantiate( + ref_policy_cfg, + resume_from_checkpoint=False, + ) + + value_checkpointer = config.instantiate( + value_cfg, + resume_from_checkpoint=False, + ) + + reward_checkpointer = config.instantiate( + reward_cfg, + resume_from_checkpoint=False, + ) + + return ( + policy_checkpointer, + ref_policy_checkpointer, + value_checkpointer, + reward_checkpointer, + ) + + def _setup_model( + self, + cfg_model: DictConfig, + cfg_reward_value_model: DictConfig, + enable_activation_checkpointing: bool, + compile_model: bool, + policy_state_dict: Dict[str, Any], + ref_policy_state_dict: Dict[str, Any], + value_model_state_dict: Dict[str, Any], + reward_model_state_dict: Dict[str, Any], + ) -> Tuple[nn.Module, nn.Module, nn.Module]: + """ + Sets up the policy model, reference policy model, reward model, and value model. 
+ """ + + with utils.set_default_dtype(self._dtype), self._device: + policy_model = config.instantiate(cfg_model) + ref_policy_model = config.instantiate(cfg_model) + reward_model = config.instantiate(cfg_reward_value_model) + value_model = config.instantiate(cfg_reward_value_model) + + if enable_activation_checkpointing: + utils.set_activation_checkpointing( + policy_model, auto_wrap_policy={modules.TransformerDecoderLayer} + ) + utils.set_activation_checkpointing( + value_model, auto_wrap_policy={modules.TransformerDecoderLayer} + ) + + policy_model.load_state_dict(policy_state_dict) + ref_policy_model.load_state_dict(ref_policy_state_dict) + + reward_missing, reward_unexpected = reward_model.load_state_dict( + reward_model_state_dict, strict=False + ) + value_missing, value_unexpected = value_model.load_state_dict( + value_model_state_dict, strict=False + ) + + # some extra validation for HF classifier checkpoints with a `score.bias` present + assert ( + reward_missing == value_missing == [] + ), f"Missing keys in reward ({reward_missing}) and value model ({value_missing}) state dicts." + + if reward_unexpected or value_unexpected: + # the only unexpected keys should be when pre-trained HF models were saved with + # bias=True in final classification layers. This happens when training a reward model with TRL. + assert ( + reward_unexpected == value_unexpected == ["output.bias"] + ), f"Unexpected keys in reward ({reward_unexpected}) and value model ({value_unexpected}) state dicts." + + # Validate models were loaded in with the expected dtype. + utils.validate_expected_param_dtype( + value_model.named_parameters(), dtype=self._dtype + ) + utils.validate_expected_param_dtype( + reward_model.named_parameters(), dtype=self._dtype + ) + utils.validate_expected_param_dtype( + value_model.named_parameters(), dtype=self._dtype + ) + utils.validate_expected_param_dtype( + ref_policy_model.named_parameters(), dtype=self._dtype + ) + + log.info(f"Models are initialized with precision {self._dtype}.") + + # disabling dropout if found - non-determinism leads to issues in e.g. comparing logprobs + # between ref policy and current policy + for module in policy_model.modules(): + if isinstance(module, torch.nn.Dropout): + warn( + f"Dropout found in {module}. This is likely to cause issues during training. Disabling." + ) + module.p = 0 + for module in value_model.modules(): + if isinstance(module, torch.nn.Dropout): + warn( + f"Dropout found in {module}. This is likely to cause issues during training. Disabling." + ) + module.p = 0 + + # disabling grad and dropout in reward and reference policy models + reward_model.eval() + ref_policy_model.eval() + + for p in reward_model.parameters(): + p.requires_grad = False + + for p in ref_policy_model.parameters(): + p.requires_grad = False + + # Compile model, if enabled. 
+        if compile_model:
+            backend = os.environ.get("TORCH_COMPILE_BACKEND", "inductor")
+            log.info("Compiling models with torch.compile...")
+
+            policy_model.compile(backend=backend)
+            reward_model.compile(backend=backend)
+            ref_policy_model.compile(backend=backend)
+            value_model.compile(backend=backend)
+
+        if self._device.type == "cuda":
+            memory_stats = utils.get_memory_stats(device=self._device)
+            utils.log_memory_stats(memory_stats)
+
+        return policy_model, value_model, reward_model, ref_policy_model
+
+    def _setup_optimizer(
+        self,
+        cfg_optimizer: DictConfig,
+        optimizer_in_bwd: bool = False,
+        opt_state_dict: Optional[Dict[str, Any]] = None,
+    ) -> Optimizer:
+
+        if optimizer_in_bwd:
+            # Maintain a dict of optims for every parameter.
+            optim_dict = {
+                p: config.instantiate(cfg_optimizer, [p])
+                for p in chain(
+                    self._policy_model.parameters(), self._value_model.parameters()
+                )
+            }
+            # Register optimizer step hooks on the models to run optimizer in backward.
+            utils.register_optim_in_bwd_hooks(
+                model=self._policy_model, optim_dict=optim_dict
+            )
+            utils.register_optim_in_bwd_hooks(
+                model=self._value_model, optim_dict=optim_dict
+            )
+            # Create a wrapper for checkpoint save/load of optimizer states when running in backward.
+            self._optim_ckpt_wrapper = utils.create_optim_in_bwd_wrapper(
+                model=self._policy_model, optim_dict=optim_dict
+            )
+            self._optim_ckpt_wrapper = utils.create_optim_in_bwd_wrapper(
+                model=self._value_model, optim_dict=optim_dict
+            )
+            # Load optimizer states. If optimizer states are being restored in an optimizer in backward
+            # run, these need to have been saved with the same setting. Cannot restore from runs that did not
+            # use optimizer in backward.
+            if opt_state_dict is not None:
+                try:
+                    self._optim_ckpt_wrapper.load_state_dict(opt_state_dict)
+                except BaseException as e:
+                    raise RuntimeError(
+                        "Failed loading in-backward optimizer checkpoints. "
+                        "Please make sure the run being restored from was using the in-backward optimizer."
+                    ) from e
+            log.info("In-backward optimizers are set up.")
+            return None
+        else:
+            optimizer = config.instantiate(
+                cfg_optimizer,
+                chain(self._policy_model.parameters(), self._value_model.parameters()),
+            )
+            if opt_state_dict:
+                optimizer.load_state_dict(opt_state_dict)
+
+            log.info("Optimizer is initialized.")
+            return optimizer
+
+    def _setup_data(
+        self, cfg_dataset: DictConfig, shuffle: bool, batch_size: int
+    ) -> Tuple[DistributedSampler, DataLoader]:
+        """
+        All data related setup happens here.
+        """
+        if isinstance(cfg_dataset, ListConfig):
+            datasets = [
+                config.instantiate(single_cfg_dataset, tokenizer=self._tokenizer)
+                for single_cfg_dataset in cfg_dataset
+            ]
+            ds = ConcatDataset(datasets=datasets)
+        else:
+            ds = config.instantiate(cfg_dataset, tokenizer=self._tokenizer)
+
+        sampler = DistributedSampler(
+            ds,
+            num_replicas=1,
+            rank=0,
+            shuffle=shuffle,
+            seed=0,
+        )
+        dataloader = DataLoader(
+            dataset=ds,
+            sampler=sampler,
+            batch_size=batch_size,
+            collate_fn=partial(
+                rlhf.left_padded_collate,
+                padding_idx=self._tokenizer.pad_id,
+            ),
+            drop_last=True,
+        )
+
+        return sampler, dataloader
+
+    def save_checkpoint(
+        self, epoch: int, is_intermediate_checkpoint: bool = False
+    ) -> None:
+        """
+        Save state dict to file. The recipe save_checkpoint method is responsible for
+        correctly creating the checkpoint dict and passing it to the checkpointer.
+ """ + policy_ckpt_dict = {utils.MODEL_KEY: self._policy_model.state_dict()} + value_ckpt_dict = {utils.MODEL_KEY: self._value_model.state_dict()} + + # if training is in-progress, checkpoint the optimizer state and rng state as well + if is_intermediate_checkpoint: + policy_ckpt_dict.update( + { + utils.SEED_KEY: self.seed, + utils.EPOCHS_KEY: self._epochs_run, + utils.TOTAL_EPOCHS_KEY: self._total_epochs, + utils.MAX_STEPS_KEY: self._total_steps, + utils.STEPS_KEY: self._steps_run, + utils.RNG_KEY: self._rng.get_state(), + } + ) + if not self._optimizer_in_bwd: + policy_ckpt_dict[utils.OPT_KEY] = self._optimizer.state_dict() + else: + policy_ckpt_dict[utils.OPT_KEY] = self._optim_ckpt_wrapper.state_dict() + + self._policy_checkpointer.save_checkpoint( + policy_ckpt_dict, + epoch=epoch, + intermediate_checkpoint=is_intermediate_checkpoint, + ) + + self._value_checkpointer.save_checkpoint( + value_ckpt_dict, + epoch=epoch, + intermediate_checkpoint=False, + ) + + def _update_recipe_state(self, ckpt_dict: Dict[str, Any]) -> None: + """ + Updates the recipe state from checkpoint. + """ + # If seed or total_steps, or total_epochs don't match, + # warn the user and overwrite. + try: + if ( + self.seed != ckpt_dict[utils.SEED_KEY] + or self._total_steps != ckpt_dict[utils.MAX_STEPS_KEY] + or self._total_epochs != ckpt_dict[utils.TOTAL_EPOCHS_KEY] + ): + warn( + message="""Configured value for seed, total_steps, or total_epochs + does not match the value stored in checkpoint.""" + ) + self.seed = utils.set_seed(seed=ckpt_dict[utils.SEED_KEY]) + self._rng.set_state(ckpt_dict[utils.RNG_KEY]) + self._steps_run = ckpt_dict[utils.STEPS_KEY] + self._total_steps = ckpt_dict[utils.MAX_STEPS_KEY] + self._total_epochs = ckpt_dict[utils.TOTAL_EPOCHS_KEY] + self._epochs_run = ckpt_dict[utils.EPOCHS_KEY] + + except KeyError as e: + raise KeyError from e( + "Checkpoint does not contain the required keys needed for updating recipe state." + "Are you sure you passed in the right recipe checkpoint?" + ) + + def generate_trajectory(self, input_ids: torch.Tensor) -> Trajectory: + """ + Generates a trajectory given the current policy and value models, the reference policy model, the reward model, + and batch of inputs. This is done over the following steps: + + 1: Generate responses, and logits corresponding to the responses using the current policy, + generating (query, response) pairs. + 2. Estimate logprobs of the generated responses using the current policy. + 3. Estimate values from the generated responses using the current value function. + 4. Replace any tokens in the response after the first stop token (usually EOS token) with padding, + producting truncated responses. + 5. Run the reward model on the (query, truncated-response) pairs. + 6. Mask out all the invalid values in the trajectory due to padding tokens. + + Args: + input_ids (torch.Tensor): tensor of input token IDs with shape [b, seq_length] + + Returns: + Trajectory: An instance of :class:`~torchtune.modules.rlhf.Trajectory` comprising + the current trajectory. 
+ """ + batch_size, context_length = input_ids.shape + + # step 1: generate responses, and logits corresponding to the responses using the current policy + query_responses, logits = rlhf.generate_with_logits( + model=self._policy_model, + prompt=input_ids, + max_generated_tokens=self._max_generated_tokens, + temperature=self._temperature, + top_k=self._top_k, + pad_id=self._tokenizer.pad_id, + rng=self._rng, + ) + + responses = query_responses[:, context_length:].clone() + query_response_padding_masks = query_responses == self._tokenizer.pad_id + + # step 1.1 create attention masks and position IDs for any padding tokens in inputs, used for future forward passes + masks = rlhf.get_causal_mask(~(query_response_padding_masks)) + position_ids = (~query_response_padding_masks).cumsum(-1) - ( + ~query_response_padding_masks + ).long() + position_ids = position_ids.type(torch.int) + + del query_response_padding_masks + + # step 2. estimate logprobs of the responses using the current policy + logits = logits[:, context_length - 1 :] + logprobs = rlhf.logits_to_logprobs(logits, responses, self._temperature) + + del logits + + # step 2.1 estimate logprobs of the responses using the reference policy + ref_logits = self._ref_policy_model( + query_responses, input_pos=position_ids, mask=masks + ) + ref_logits = rlhf.truncate_sequence_for_logprobs(ref_logits, context_length) + ref_logprobs = rlhf.logits_to_logprobs(ref_logits, responses, self._temperature) + + del ref_logits + + # step 3. estimate values from the responses using the value function + values = self._value_model(query_responses, input_pos=position_ids, mask=masks) + values = rlhf.truncate_sequence_for_logprobs(values, context_length).squeeze(-1) + + # step 4. replace any tokens in the responses after the first stop token (usually EOS token) with padding + # resulting in truncated responses + response_padding_masks, responses = rlhf.truncate_sequence_at_first_stop_token( + responses, self._stop_token_ids, self._tokenizer.pad_id + ) + + # step 5. run the reward model on the (query, truncated-response) pairs + scores = self._reward_model( + torch.cat([input_ids, responses], dim=1), + input_pos=position_ids, + mask=masks, + ) + + del responses + + # step 5.1 the scores from the reward model are the logits for the last non-padding token in + # each (query, truncated-response) pair + seq_lens = utils.get_unmasked_sequence_lengths(response_padding_masks) + scores = scores[torch.arange(batch_size), seq_lens + context_length].squeeze(-1) + + # step 5.2 if configured, apply any penalties for sequences without EOS tokens + # or shorter than a certain length + if self._penalise_no_eos or self._min_response_length: + reward_penalty_mask = rlhf.get_reward_penalty_mask( + response_padding_masks, + seq_lens, + self._penalise_no_eos, + self._min_response_length, + ) + scores[reward_penalty_mask] = self._reward_penalty + + # step 6. 
mask out all the invalid values in the trajectory due to padding tokens
+        logprobs[response_padding_masks] = 1.0
+        ref_logprobs[response_padding_masks] = 1.0
+
+        # step 6.1 values are masked out *after* the last valid token in the response
+        value_seq_idxs = torch.where(
+            (seq_lens > 0) & (seq_lens < self._max_generated_tokens - 1),
+            seq_lens + 1,
+            seq_lens,
+        )
+        value_padding_masks = response_padding_masks.clone()
+        value_padding_masks[
+            torch.arange(batch_size, device=value_padding_masks.device),
+            value_seq_idxs,
+        ] = False
+
+        values[value_padding_masks] = 0.0
+
+        return Trajectory(
+            query_responses=query_responses,
+            logprobs=logprobs,
+            ref_logprobs=ref_logprobs,
+            values=values,
+            masks=masks,
+            position_ids=position_ids,
+            response_padding_masks=response_padding_masks,
+            value_padding_masks=value_padding_masks,
+            value_seq_idxs=value_seq_idxs,
+            scores=scores,
+            seq_lens=seq_lens,
+        )
+
+    def generate_trajectory_batched(self, input_ids: torch.Tensor) -> Trajectory:
+        """
+        Generates a ``self.batch_size`` batch of trajectories using ``self._forward_batch_size``-sized sub-batches.
+        See ``generate_trajectory`` for more details.
+
+        Args:
+            input_ids (torch.Tensor): tensor of input token IDs with shape [b, seq_length]
+
+        Returns:
+            Trajectory: An instance of :class:`~torchtune.modules.rlhf.Trajectory`, comprising
+                the current trajectory.
+        """
+        trajectories: List[Trajectory] = []
+        with torch.no_grad():
+            for batch_start in range(0, self.batch_size, self._forward_batch_size):
+                batch_input_ids = input_ids[
+                    batch_start : batch_start + self._forward_batch_size
+                ]
+                trajectories.append(self.generate_trajectory(batch_input_ids))
+        return Trajectory(*map(torch.cat, zip(*trajectories)))
+
+    def train(self) -> None:
+        """
+        The core training loop."""
+
+        if self._model_compile:
+            log.info(
+                "NOTE: torch.compile is enabled and model is compiled in first forward. "
+                "Expect a relatively slow first iteration."
+            )
+        # zero out the gradients before starting training
+        if not self._optimizer_in_bwd:
+            self._optimizer.zero_grad()
+
+        training_completed = False
+        pbar = tqdm(total=self._total_steps, initial=self._steps_run)
+        for curr_epoch in range(self._epochs_run, self._total_epochs):
+            # Update the sampler to ensure data is correctly shuffled across epochs
+            # in case shuffle is True
+            self._sampler.set_epoch(curr_epoch)
+
+            for _, batch in enumerate(self._dataloader):
+                batch = batch.to(self._device)
+                _, context_length = batch.shape
+
+                # step 1. generate the trajectory using:
+                # - the current policy (pi_theta)
+                # - the current value function (V_phi)
+                # - the reference frozen policy model (pi_theta_0)
+                trajectory = self.generate_trajectory_batched(batch)
+
+                # step 2. get the rewards for the current trajectory. these are based on:
+                # - the divergence between the current policy and the reference policy
+                # - the scores from the reward model
+                rewards, kl, kl_rewards = rlhf.get_rewards_ppo(
+                    trajectory.scores,
+                    trajectory.logprobs,
+                    trajectory.ref_logprobs,
+                    self._kl_coeff,
+                    trajectory.value_seq_idxs,
+                )
+
+                # step 3. estimate the advantages using Generalized Advantage Estimation (GAE)
+                advantages, returns = rlhf.estimate_advantages(
+                    trajectory.values,
+                    rewards,
+                    self._gamma,
+                    self._lmbda,
+                    masks=~trajectory.response_padding_masks,
+                )
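+                # For reference, standard GAE (a sketch of what estimate_advantages
+                # computes, not this diff's code): with temporal differences
+                #   d_t = r_t + gamma * V(s_{t+1}) - V(s_t)
+                # advantages are the discounted sums
+                #   A_t = d_t + (gamma * lmbda) * A_{t+1}
+                # and returns = advantages + values, masked over padding tokens.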
+                # step 4. optimise using the PPO objective over multiple epochs
+                ppo_stats: List[PPOStats] = []
+                for _ in range(self._ppo_epochs):
+                    batch_idxs = torch.randperm(self.batch_size, device=self._device)
+                    for i in range(0, self.batch_size, self._ppo_batch_size):
+                        mini_batch_idxs = batch_idxs[i : i + self._ppo_batch_size]
+
+                        batch_ppo_stats: List[PPOStats] = []
+                        for j in range(
+                            0, self._ppo_batch_size, self._ppo_backward_batch_size
+                        ):
+                            backward_batch_idxs = mini_batch_idxs[
+                                j : j + self._ppo_backward_batch_size
+                            ]
+
+                            batch_trajectory = Trajectory(
+                                *map(
+                                    partial(
+                                        torch.index_select,
+                                        dim=0,
+                                        index=backward_batch_idxs,
+                                    ),
+                                    trajectory,
+                                )
+                            )
+                            batch_ppo_stats.append(
+                                self._ppo_step(
+                                    batch_trajectory,
+                                    advantages[backward_batch_idxs],
+                                    returns[backward_batch_idxs],
+                                    context_length,
+                                )
+                            )
+                            del batch_trajectory
+
+                        ppo_stats.append(PPOStats(*map(sum, zip(*batch_ppo_stats))))
+
+                        if not self._optimizer_in_bwd:
+                            self._optimizer.step()
+                            self._optimizer.zero_grad(set_to_none=True)
+
+                        self.global_step += 1
+
+                # step 5. profit
+                self._steps_run += 1
+                if self._steps_run % self._log_every_n_steps == 0:
+                    self.log_metrics(
+                        trajectory,
+                        PPOStats(*map(torch.stack, zip(*ppo_stats))),
+                        kl,
+                        kl_rewards,
+                    )
+                self.cleanup_after_step(
+                    trajectory, ppo_stats, advantages, returns, kl, kl_rewards
+                )
+                pbar.update(1)
+                if self._steps_run == self._total_steps:
+                    training_completed = True
+                    break
+
+            # save checkpoint at current epoch
+            self._epochs_run += 1
+
+            self.save_checkpoint(
+                curr_epoch, is_intermediate_checkpoint=not training_completed
+            )
+            if training_completed:
+                return
+
+    def _ppo_step(
+        self,
+        trajectory: Trajectory,
+        advantages: torch.Tensor,
+        returns: torch.Tensor,
+        context_length: int,
+    ) -> PPOStats:
+        """
+        Perform a single PPO optimisation step over a batch of trajectories and corresponding advantages and returns.
+
+        Args:
+            trajectory (Trajectory): a batch of trajectories
+            advantages (torch.Tensor): advantages corresponding to the trajectories
+            returns (torch.Tensor): returns corresponding to the trajectories
+            context_length (int): input ids sequence length
+
+        Returns:
+            PPOStats: An instance of :class:`~torchtune.modules.rlhf.PPOStats`, a NamedTuple containing:
+                - loss (torch.Tensor): The total PPO loss.
+                - policy_loss (torch.Tensor): The policy function loss.
+                - value_loss (torch.Tensor): The value function loss.
+                - ratios (torch.Tensor): The ratio between the current and old policy probabilities.
+                - clipfrac (torch.Tensor): The fraction of ratios that were clipped.
+                - approx_policy_kls: Average estimated KL divergence between the policy before and after the optimisation step.
+
+        """
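+        # The configured loss_fn is expected to implement a clipped PPO objective;
+        # schematically (a sketch, not this diff's code):
+        #   ratio = exp(pi_logprobs - old_logprobs)
+        #   policy_loss = -min(ratio * A, clamp(ratio, 1 - eps, 1 + eps) * A)
+        #   value_loss  = (phi_values - returns) ** 2   (possibly also clipped)
+        # with both terms masked over padding tokens.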
+        # estimate logprobs from the policy at the current optimisation step
+        pi_logits = self._policy_model(
+            trajectory.query_responses,
+            input_pos=trajectory.position_ids,
+            mask=trajectory.masks,
+        )
+        pi_logits = rlhf.truncate_sequence_for_logprobs(pi_logits, context_length)
+        pi_logprobs = rlhf.logits_to_logprobs(
+            pi_logits, trajectory.query_responses[:, context_length:], self._temperature
+        )
+        pi_logprobs[trajectory.response_padding_masks] = 1.0
+
+        del pi_logits
+
+        # estimate the values from the value function at the current optimisation step
+        phi_values = self._value_model(
+            trajectory.query_responses,
+            input_pos=trajectory.position_ids,
+            mask=trajectory.masks,
+        )
+
+        phi_values = rlhf.truncate_sequence_for_logprobs(
+            phi_values, context_length
+        ).squeeze(-1)
+        phi_values[trajectory.value_padding_masks] = 0.0
+
+        # calculate ppo loss
+        loss, policy_loss, value_loss, ratios, clipfrac = self._loss_fn(
+            trajectory.logprobs,
+            pi_logprobs,
+            advantages,
+            trajectory.values,
+            phi_values,
+            returns,
+            padding_masks=~trajectory.response_padding_masks,
+            value_padding_masks=~trajectory.value_padding_masks,
+        )
+
+        loss /= self._gradient_accumulation_steps
+        loss.backward()
+
+        with torch.no_grad():
+            approx_policy_kls = (
+                0.5 * (pi_logprobs - trajectory.logprobs).pow(2)
+            ).mean()
+
+        return PPOStats(
+            loss,
+            policy_loss / self._gradient_accumulation_steps,
+            value_loss / self._gradient_accumulation_steps,
+            ratios / self._gradient_accumulation_steps,
+            clipfrac / self._gradient_accumulation_steps,
+            approx_policy_kls / self._gradient_accumulation_steps,
+        )
+
+    def log_metrics(
+        self,
+        trajectory: Trajectory,
+        ppo_stats: PPOStats,
+        kl: torch.Tensor,
+        kl_rewards: torch.Tensor,
+    ) -> None:
+        """
+        Log metrics and statistics for the current step to the metric logger.
+        """
+        log_dict = {
+            "scores": trajectory.scores.mean(),
+            "num_stop_tokens": trajectory.response_padding_masks.any(-1).sum(),
+            "rlhf_reward": trajectory.scores.mean() + kl_rewards.sum(1).mean(),
+            "kl": kl.sum(1).mean(),
+            "kl_reward": kl_rewards.sum(1).mean(),
+            "loss": ppo_stats.loss.mean(),
+            "policy_loss": ppo_stats.policy_loss.mean(),
+            "value_loss": ppo_stats.value_loss.mean(),
+            "clipfrac": ppo_stats.clipfrac.mean(),
+            "ratios": ppo_stats.ratios.mean(),
+            "approx_policy_kl": ppo_stats.approx_policy_kls.mean(),
+            "response_lengths": trajectory.seq_lens.float().mean(),
+        }
+        if self._device.type == "cuda" and self._log_peak_memory_stats:
+            log_dict.update(utils.get_memory_stats(device=self._device))
+
+        self._metric_logger.log_dict(log_dict, step=self.global_step)
+
+    def cleanup_after_step(
+        self,
+        trajectory: Trajectory,
+        ppo_stats: PPOStats,
+        advantages: torch.Tensor,
+        returns: torch.Tensor,
+        kl: torch.Tensor,
+        kl_rewards: torch.Tensor,
+    ) -> None:
+        """
+        Cleanup tensors after each PPO step to free up memory.
+        """
+        # there shouldn't be any floating references to the individual tensors at this point, so gc can do its thing
+        for v in trajectory:
+            del v
+        del trajectory
+        for v in ppo_stats:
+            del v
+        del ppo_stats
+        del advantages
+        del returns
+        del kl
+        del kl_rewards
+
+    def cleanup(self, **kwargs) -> None:
+        self._metric_logger.close()
+
+
+@config.parse
+def recipe_main(cfg: DictConfig) -> None:
+    """
+    Entry point for the recipe.
+ + Configurable parameters are read in the following order: + - Parameters specified in config (see available configs through ``tune ls``) + - Overwritten by arguments from the command-line + """ + config.log_config(recipe_name="PPOFullFinetuneRecipeSingleDevice", cfg=cfg) + recipe = PPOFullFinetuneRecipeSingleDevice(cfg=cfg) + recipe.setup(cfg=cfg) + recipe.train() + recipe.cleanup() + + +if __name__ == "__main__": + sys.exit(recipe_main()) diff --git a/benchmarks/llm/requirements.cuda.txt b/benchmarks/llm/requirements.cuda.txt index b6c9752f0..9b54a8464 100644 --- a/benchmarks/llm/requirements.cuda.txt +++ b/benchmarks/llm/requirements.cuda.txt @@ -9,11 +9,11 @@ --find-links https://storage.googleapis.com/jax-releases/jax_cuda_releases.html --trusted-host pypi.ngc.nvidia.com -aiohappyeyeballs==2.3.4 +aiohappyeyeballs==2.4.0 # via # -c .pin/../.pin/constraints-cuda-torch.txt # aiohttp -aiohttp==3.10.0 +aiohttp==3.10.5 # via # -c .pin/../.pin/constraints-cuda-torch.txt # datasets @@ -38,11 +38,11 @@ async-timeout==4.0.3 # via # -c .pin/../.pin/constraints-cuda-torch.txt # aiohttp -attrs==23.2.0 +attrs==24.2.0 # via # -c .pin/../.pin/constraints-cuda-torch.txt # aiohttp -blobfile==2.1.1 +blobfile==3.0.0 # via # -c .pin/../.pin/constraints-cuda-torch.txt # torchtune @@ -58,7 +58,7 @@ codefind==0.1.6 # via # -c .pin/../.pin/constraints-cuda-torch.txt # ptera -datasets==2.20.0 +datasets==2.21.0 # via # -c .pin/../.pin/constraints-cuda-torch.txt # torchtune @@ -84,7 +84,7 @@ frozenlist==1.4.1 # -c .pin/../.pin/constraints-cuda-torch.txt # aiohttp # aiosignal -fsspec[http]==2024.5.0 +fsspec[http]==2024.6.1 # via # -c .pin/../.pin/constraints-cuda-torch.txt # datasets @@ -99,17 +99,17 @@ hjson==3.1.0 # via # -c .pin/../.pin/constraints-cuda-torch.txt # argklass -huggingface-hub==0.24.5 +huggingface-hub==0.24.6 # via # -c .pin/../.pin/constraints-cuda-torch.txt # datasets # torchtune -idna==3.7 +idna==3.8 # via # -c .pin/../.pin/constraints-cuda-torch.txt # requests # yarl -importlib-resources==6.4.0 +importlib-resources==6.4.4 # via # -c .pin/../.pin/constraints-cuda-torch.txt # argklass @@ -117,7 +117,7 @@ jinja2==3.1.4 # via # -c .pin/../.pin/constraints-cuda-torch.txt # torch -lxml==4.9.4 +lxml==5.3.0 # via # -c .pin/../.pin/constraints-cuda-torch.txt # blobfile @@ -196,6 +196,10 @@ nvidia-cusparse-cu12==12.1.0.106 # -c .pin/../.pin/constraints-cuda-torch.txt # nvidia-cusolver-cu12 # torch +nvidia-ml-py==12.560.30 + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # voir nvidia-nccl-cu12==2.20.5 # via # -c .pin/../.pin/constraints-cuda-torch.txt @@ -214,7 +218,7 @@ omegaconf==2.3.0 # -c .pin/../.pin/constraints-cuda-torch.txt # torchtune # voir -ovld==0.3.6 +ovld==0.3.9 # via # -c .pin/../.pin/constraints-cuda-torch.txt # voir @@ -239,10 +243,6 @@ pyarrow==17.0.0 # via # -c .pin/../.pin/constraints-cuda-torch.txt # datasets -pyarrow-hotfix==0.6 - # via - # -c .pin/../.pin/constraints-cuda-torch.txt - # datasets pycryptodomex==3.20.0 # via # -c .pin/../.pin/constraints-cuda-torch.txt @@ -251,10 +251,6 @@ pygments==2.18.0 # via # -c .pin/../.pin/constraints-cuda-torch.txt # rich -pynvml==11.5.3 - # via - # -c .pin/../.pin/constraints-cuda-torch.txt - # voir python-dateutil==2.9.0.post0 # via # -c .pin/../.pin/constraints-cuda-torch.txt @@ -263,7 +259,7 @@ pytz==2024.1 # via # -c .pin/../.pin/constraints-cuda-torch.txt # pandas -pyyaml==6.0.1 +pyyaml==6.0.2 # via # -c .pin/../.pin/constraints-cuda-torch.txt # -r benchmarks/llm/requirements.in @@ -284,11 +280,11 @@ requests==2.32.3 # 
datasets # huggingface-hub # tiktoken -rich==13.7.1 +rich==13.8.0 # via # -c .pin/../.pin/constraints-cuda-torch.txt # voir -safetensors==0.4.3 +safetensors==0.4.4 # via # -c .pin/../.pin/constraints-cuda-torch.txt # torchtune @@ -301,7 +297,7 @@ six==1.16.0 # -c .pin/../.pin/constraints-cuda-torch.txt # asttokens # python-dateutil -sympy==1.13.1 +sympy==1.13.2 # via # -c .pin/../.pin/constraints-cuda-torch.txt # torch @@ -321,7 +317,7 @@ torchtune==0.2.1+cu121 # via # -c .pin/../.pin/constraints-cuda-torch.txt # -r benchmarks/llm/requirements.in -tqdm==4.66.4 +tqdm==4.66.5 # via # -c .pin/../.pin/constraints-cuda-torch.txt # datasets @@ -355,7 +351,7 @@ voir==0.2.19 # -c .pin/../.pin/constraints-cuda-torch.txt # -c .pin/../constraints/cuda.txt # -r benchmarks/llm/requirements.in -xxhash==3.4.1 +xxhash==3.5.0 # via # -c .pin/../.pin/constraints-cuda-torch.txt # datasets diff --git a/benchmarks/llm/requirements.rocm.txt b/benchmarks/llm/requirements.rocm.txt new file mode 100644 index 000000000..ab5098d08 --- /dev/null +++ b/benchmarks/llm/requirements.rocm.txt @@ -0,0 +1,306 @@ +# +# This file is autogenerated by pip-compile with Python 3.10 +# by the following command: +# +# pip-compile --output-file=benchmarks/llm/requirements.rocm.txt .pin/tmp-constraints-rocm-llm-full-mp-nodes.txt benchmarks/llm/requirements.in +# +--extra-index-url https://download.pytorch.org/whl/rocm6.0 + +aiohappyeyeballs==2.4.0 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # aiohttp +aiohttp==3.10.5 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # datasets + # fsspec +aiosignal==1.3.1 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # aiohttp +antlr4-python3-runtime==4.9.3 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # omegaconf +argklass==1.4.4 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # -r benchmarks/llm/requirements.in +asttokens==2.4.1 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # giving +async-timeout==4.0.3 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # aiohttp +attrs==24.2.0 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # aiohttp +blobfile==2.1.1 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # torchtune +certifi==2024.7.4 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # requests +charset-normalizer==3.3.2 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # requests +codefind==0.1.6 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # ptera +datasets==2.21.0 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # torchtune +dill==0.3.8 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # datasets + # multiprocess +executing==1.2.0 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # varname +filelock==3.15.4 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # blobfile + # datasets + # huggingface-hub + # pytorch-triton-rocm + # torch +frozenlist==1.4.1 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # aiohttp + # aiosignal +fsspec[http]==2024.6.1 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # datasets + # huggingface-hub + # torch +giving==0.4.2 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # ptera + # voir +hjson==3.1.0 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # argklass +huggingface-hub==0.24.6 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # datasets + # torchtune +idna==3.7 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # requests + # yarl +importlib-resources==6.4.3 + # via + # -c 
.pin/../.pin/constraints-rocm-torch.txt + # argklass +jinja2==3.1.4 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # torch +lxml==4.9.4 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # blobfile +markdown-it-py==3.0.0 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # rich +markupsafe==2.1.5 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # jinja2 +mdurl==0.1.2 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # markdown-it-py +mpmath==1.3.0 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # sympy +multidict==6.0.5 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # aiohttp + # yarl +multiprocess==0.70.16 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # datasets +networkx==3.3 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # torch +numpy==1.26.4 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # datasets + # pandas + # pyarrow + # torchtune +omegaconf==2.3.0 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # torchtune + # voir +ovld==0.3.8 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # voir +packaging==24.1 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # datasets + # huggingface-hub +pandas==2.2.2 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # datasets +psutil==5.9.8 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # voir +ptera==1.4.1 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # voir +pyarrow==17.0.0 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # datasets +pycryptodomex==3.20.0 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # blobfile +pygments==2.18.0 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # rich +pynvml==11.5.3 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # voir +python-dateutil==2.9.0.post0 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # pandas +pytorch-triton-rocm==3.0.0 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # torch +pytz==2024.1 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # pandas +pyyaml==6.0.2 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # -r benchmarks/llm/requirements.in + # datasets + # huggingface-hub + # omegaconf +reactivex==4.0.4 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # giving +regex==2024.7.24 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # tiktoken +requests==2.32.3 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # datasets + # huggingface-hub + # tiktoken +rich==13.7.1 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # voir +safetensors==0.4.4 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # torchtune +sentencepiece==0.2.0 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # torchtune +six==1.16.0 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # asttokens + # python-dateutil +sympy==1.13.2 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # torch +tiktoken==0.7.0 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # torchtune +torch==2.4.0+rocm6.0 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # -r benchmarks/llm/requirements.in +torchao==0.3.1 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # torchtune +torchtune==0.2.1 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # -r benchmarks/llm/requirements.in +tqdm==4.66.5 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # datasets + # huggingface-hub + # torchtune +typing-extensions==4.12.2 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # 
huggingface-hub
+    #   reactivex
+    #   torch
+tzdata==2024.1
+    # via
+    #   -c .pin/../.pin/constraints-rocm-torch.txt
+    #   pandas
+urllib3==2.2.2
+    # via
+    #   -c .pin/../.pin/constraints-rocm-torch.txt
+    #   blobfile
+    #   requests
+varname==0.10.0
+    # via
+    #   -c .pin/../.pin/constraints-rocm-torch.txt
+    #   giving
+voir==0.2.17
+    # via
+    #   -c .pin/../.pin/constraints-rocm-torch.txt
+    #   -c .pin/../constraints/rocm.txt
+    #   -r benchmarks/llm/requirements.in
+xxhash==3.5.0
+    # via
+    #   -c .pin/../.pin/constraints-rocm-torch.txt
+    #   datasets
+yarl==1.9.4
+    # via
+    #   -c .pin/../.pin/constraints-rocm-torch.txt
+    #   aiohttp
diff --git a/benchmarks/recursiongfn/Makefile b/benchmarks/recursiongfn/Makefile
new file mode 100644
index 000000000..892657d09
--- /dev/null
+++ b/benchmarks/recursiongfn/Makefile
@@ -0,0 +1,31 @@
+# Use global base if possible
+ifndef MILABENCH_BASE
+	MILABENCH_BASE="base"
+endif
+
+export MILABENCH_BASE
+
+BENCH_NAME=recursiongfn
+MILABENCH_CONFIG=dev.yaml
+MILABENCH_ARGS=--config $(MILABENCH_CONFIG) --base $(MILABENCH_BASE)
+
+all: install prepare single gpus nodes
+
+install:
+	milabench install $(MILABENCH_ARGS) --force
+
+prepare:
+	milabench prepare $(MILABENCH_ARGS)
+
+tests: # install prepare
+	milabench run $(MILABENCH_ARGS)
+
+single:
+	milabench run $(MILABENCH_ARGS) --select $(BENCH_NAME)-single
+
+gpus:
+	milabench run $(MILABENCH_ARGS) --select $(BENCH_NAME)-gpus
+
+nodes:
+	milabench run $(MILABENCH_ARGS) --select $(BENCH_NAME)-nodes
diff --git a/benchmarks/recursiongfn/README.md b/benchmarks/recursiongfn/README.md
new file mode 100644
index 000000000..b8db87236
--- /dev/null
+++ b/benchmarks/recursiongfn/README.md
@@ -0,0 +1,4 @@
+
+# Recursiongfn
+
+Rewrite this README to explain what the benchmark is!
diff --git a/benchmarks/recursiongfn/benchfile.py b/benchmarks/recursiongfn/benchfile.py
new file mode 100644
index 000000000..1d5d46351
--- /dev/null
+++ b/benchmarks/recursiongfn/benchfile.py
@@ -0,0 +1,34 @@
+from milabench.pack import Package
+
+
+URL = "https://github.com/Delaunay/gflownet/"
+BRANCH = "milabench"
+
+class Recursiongfn(Package):
+    # Requirements file installed by install(). It can be empty or absent.
+    base_requirements = "requirements.in"
+
+    # The preparation script called by prepare(). It must be executable,
+    # but it can be any type of script. It can be empty or absent.
+    prepare_script = "prepare.py"
+
+    # The main script called by run(). It must be a Python file. It has to
+    # be present.
+    main_script = "main.py"
+
+    # You can remove the functions below if you don't need to modify them.
+    def clone(self):
+        gflownet = self.dirs.code / "gflownet"
+        if not gflownet.exists():
+            gflownet.clone_subtree(URL, BRANCH)
+
+    async def install(self):
+        self.clone()
+        await super().install()  # super() call installs the requirements
+
+    async def prepare(self):
+        await super().prepare()  # super() call executes prepare_script
+
+
+
+__pack__ = Recursiongfn
diff --git a/benchmarks/recursiongfn/dev.yaml b/benchmarks/recursiongfn/dev.yaml
new file mode 100644
index 000000000..8730968ff
--- /dev/null
+++ b/benchmarks/recursiongfn/dev.yaml
@@ -0,0 +1,15 @@
+
+recursiongfn:
+  inherits: _defaults
+  definition: .
+ install-variant: unpinned + install_group: torch + plan: + method: per_gpu + + argv: + --batch_size: 128 + --num_workers: 8 + --num_steps: 100 + --layer_width: 128 + --num_layers: 4 diff --git a/benchmarks/recursiongfn/main.py b/benchmarks/recursiongfn/main.py new file mode 100644 index 000000000..81d08e8aa --- /dev/null +++ b/benchmarks/recursiongfn/main.py @@ -0,0 +1,169 @@ +# This is the script run by milabench run (by default) +# It is possible to use a script from a GitHub repo if it is cloned using +# clone_subtree in the benchfile.py, in which case this file can simply +# be deleted. + +import datetime +import os +from pathlib import Path +from typing import Callable + +import sys +sys.path.append(os.path.join(os.path.dirname(__file__), "gflownet", "src")) + +import numpy as np +import torch.nn as nn +import torchcompat.core as accelerator +from gflownet.config import Config, init_empty +from gflownet.models import bengio2021flow +from gflownet.tasks.seh_frag import SEHFragTrainer, SEHTask +from gflownet.utils.conditioning import TemperatureConditional +from gflownet.utils.misc import get_worker_device +from torch import Tensor +from torch.utils.data import DataLoader, Dataset + +from benchmate.observer import BenchObserver + + +class SEHFragTrainerMonkeyPatch(SEHFragTrainer): + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + + self.batch_size_in_nodes = [] + + def batch_size(x): + """Measures the batch size as the sum of all nodes in the batch.""" + return self.batch_size_in_nodes.pop() + + self.observer = BenchObserver( + accelerator.Event, + earlystop=65, + batch_size_fn=batch_size, + raise_stop_program=False, + stdout=False, + ) + + def _maybe_resolve_shared_buffer(self, *args, **kwargs): + batch = super()._maybe_resolve_shared_buffer(*args, **kwargs) + + # Accumulate the size of all graphs in the batch measured in nodes. + acc = 0 + n = len(batch) + for i in range(n): + elem = batch[i] + acc += elem.x.shape[0] + + self.batch_size_in_nodes.append(acc) + return batch + + def step(self, loss: Tensor): + original_output = super().step(loss) + self.observer.record_loss(loss) + return original_output + + def build_training_data_loader(self) -> DataLoader: + original_output = super().build_training_data_loader() + return self.observer.loader(original_output) + + def setup_task(self): + self.task = SEHTaskMonkeyPatch( + dataset=self.training_data, + cfg=self.cfg, + rng=self.rng, + wrap_model=self._wrap_for_mp, + ) + + +class SEHTaskMonkeyPatch(SEHTask): + """Allows us to specify the location of the original model download.""" + + def __init__( + self, + dataset: Dataset, + cfg: Config, + rng: np.random.Generator = None, + wrap_model: Callable[[nn.Module], nn.Module] = None, + ): + self._wrap_model = wrap_model + self.rng = rng + self.models = self._load_task_models() + self.dataset = dataset + self.temperature_conditional = TemperatureConditional(cfg, rng) + self.num_cond_dim = self.temperature_conditional.encoding_size() + + def _load_task_models(self): + xdg_cache = os.environ["XDG_CACHE_HOME"] + model = bengio2021flow.load_original_model( + cache=True, + location=Path(os.path.join(xdg_cache, "bengio2021flow_proxy.pkl.gz")), + ) + model.to(get_worker_device()) + model = self._wrap_model(model) + return {"seh": model} + + +def main( + batch_size: int, num_workers: int, num_steps: int, layer_width: int, num_layers: int +): + # This script runs on an A100 with 8 cpus and 32Gb memory, but the A100 is probably + # overkill here. 
VRAM peaks at 6Gb and GPU usage peaks at 25%.
+
+ config = init_empty(Config())
+ config.print_every = 1
+ config.log_dir = f"./logs/debug_run_seh_frag_{datetime.datetime.now().strftime('%Y-%m-%d_%H-%M-%S')}"
+ config.device = accelerator.fetch_device(0) # This is your CUDA device.
+ config.overwrite_existing_exp = True
+
+ config.num_training_steps = num_steps # Change this to train for longer.
+ config.checkpoint_every = 5 # 500
+ config.validate_every = 0
+ config.num_final_gen_steps = 0
+ config.opt.lr_decay = 20_000
+ config.opt.clip_grad_type = "total_norm"
+ config.algo.sampling_tau = 0.9
+ config.cond.temperature.sample_dist = "constant"
+ config.cond.temperature.dist_params = [64.0]
+ config.replay.use = False
+
+ # Things it may be fun to play with.
+ config.num_workers = num_workers
+ config.model.num_emb = layer_width
+ config.model.num_layers = num_layers
+ # batch_size is consumed by the sampling configuration below.
+
+ if config.replay.use:
+ config.algo.num_from_policy = 0
+ config.replay.num_new_samples = batch_size
+ config.replay.num_from_replay = batch_size
+ else:
+ config.algo.num_from_policy = batch_size
+
+ # This may need to be adjusted if the batch_size is made bigger
+ config.mp_buffer_size = 32 * 1024**2 # 32Mb
+ trial = SEHFragTrainerMonkeyPatch(config, print_config=False)
+ trial.run()
+ trial.terminate()
+
+
+if __name__ == "__main__":
+ import argparse
+
+ parser = argparse.ArgumentParser(description="Recursion GFN benchmark")
+ parser.add_argument("-b", "--batch_size", help="Batch Size", type=int, default=128)
+ parser.add_argument("-n", "--num_workers", help="Number of Workers", type=int, default=8)
+ parser.add_argument(
+ "-s", "--num_steps", help="Number of Training Steps", type=int, default=100
+ )
+ parser.add_argument(
+ "-w", "--layer_width", help="Width of each policy hidden layer", type=int, default=128
+ )
+ parser.add_argument("-l", "--num_layers", help="Number of hidden layers", type=int, default=4)
+ args = parser.parse_args()
+
+ main(
+ args.batch_size,
+ args.num_workers,
+ args.num_steps,
+ args.layer_width,
+ args.num_layers,
+ )
diff --git a/benchmarks/recursiongfn/prepare.py b/benchmarks/recursiongfn/prepare.py
new file mode 100755
index 000000000..89cafada3
--- /dev/null
+++ b/benchmarks/recursiongfn/prepare.py
@@ -0,0 +1,25 @@
+#!/usr/bin/env python
+
+import os
+
+from pathlib import Path
+
+import sys
+sys.path.append(os.path.join(os.path.dirname(__file__), "gflownet", "src"))
+
+
+if __name__ == "__main__":
+ from gflownet.models.bengio2021flow import load_original_model
+
+ # If you need the whole configuration:
+ # config = json.loads(os.environ["MILABENCH_CONFIG"])
+ print("+ Full environment:\n{}\n***".format(os.environ))
+
+ xdg_cache = os.environ["XDG_CACHE_HOME"]
+
+ print("+ Loading proxy model weights to XDG_CACHE_HOME={}".format(xdg_cache))
+ _ = load_original_model(
+ cache=True,
+ location=Path(os.path.join(xdg_cache, "bengio2021flow_proxy.pkl.gz")),
+ )
+
diff --git a/benchmarks/recursiongfn/requirements.cuda.txt b/benchmarks/recursiongfn/requirements.cuda.txt
new file mode 100644
index 000000000..1aef7b7fb
--- /dev/null
+++ b/benchmarks/recursiongfn/requirements.cuda.txt
@@ -0,0 +1,495 @@
+#
+# This file is autogenerated by pip-compile with Python 3.10
+# by the following command:
+#
+# pip-compile --output-file=benchmarks/recursiongfn/requirements.cuda.txt .pin/tmp-constraints-cuda-recursiongfn_gnn.txt benchmarks/recursiongfn/requirements.in
+#
+--extra-index-url https://pypi.ngc.nvidia.com
+--extra-index-url
https://download.pytorch.org/whl/cu121 +--find-links https://data.pyg.org/whl/torch-2.3.0+cu121.html +--trusted-host pypi.ngc.nvidia.com + +absl-py==2.1.0 + # via + # -c .pin/../.pin/constraints-cuda-gnn.txt + # tensorboard +aiohappyeyeballs==2.4.0 + # via + # -c .pin/../.pin/constraints-cuda-gnn.txt + # aiohttp +aiohttp==3.10.5 + # via + # -c .pin/../.pin/constraints-cuda-gnn.txt + # torch-geometric +aiosignal==1.3.1 + # via + # -c .pin/../.pin/constraints-cuda-gnn.txt + # aiohttp +antlr4-python3-runtime==4.9.3 + # via + # -c .pin/../.pin/constraints-cuda-gnn.txt + # omegaconf +asttokens==2.4.1 + # via + # -c .pin/../.pin/constraints-cuda-gnn.txt + # giving +async-timeout==4.0.3 + # via + # -c .pin/../.pin/constraints-cuda-gnn.txt + # aiohttp +attrs==24.2.0 + # via + # -c .pin/../.pin/constraints-cuda-gnn.txt + # aiohttp +blosc2==2.7.1 + # via + # -c .pin/../.pin/constraints-cuda-gnn.txt + # tables +botorch==0.11.3 + # via + # -c .pin/../.pin/constraints-cuda-gnn.txt + # -r benchmarks/recursiongfn/requirements.in +certifi==2024.7.4 + # via + # -c .pin/../.pin/constraints-cuda-gnn.txt + # requests + # sentry-sdk +charset-normalizer==3.3.2 + # via + # -c .pin/../.pin/constraints-cuda-gnn.txt + # requests +click==8.1.7 + # via + # -c .pin/../.pin/constraints-cuda-gnn.txt + # wandb +codefind==0.1.6 + # via + # -c .pin/../.pin/constraints-cuda-gnn.txt + # ptera +cvxopt==1.3.2 + # via + # -c .pin/../.pin/constraints-cuda-gnn.txt + # -r benchmarks/recursiongfn/requirements.in +docker-pycreds==0.4.0 + # via + # -c .pin/../.pin/constraints-cuda-gnn.txt + # wandb +executing==1.2.0 + # via + # -c .pin/../.pin/constraints-cuda-gnn.txt + # varname +filelock==3.15.4 + # via + # -c .pin/../.pin/constraints-cuda-gnn.txt + # torch + # triton +frozenlist==1.4.1 + # via + # -c .pin/../.pin/constraints-cuda-gnn.txt + # aiohttp + # aiosignal +fsspec==2024.6.1 + # via + # -c .pin/../.pin/constraints-cuda-gnn.txt + # torch + # torch-geometric +gitdb==4.0.11 + # via + # -c .pin/../.pin/constraints-cuda-gnn.txt + # gitpython +gitpython==3.1.43 + # via + # -c .pin/../.pin/constraints-cuda-gnn.txt + # -r benchmarks/recursiongfn/requirements.in + # wandb +giving==0.4.2 + # via + # -c .pin/../.pin/constraints-cuda-gnn.txt + # ptera + # voir +gpytorch==1.12 + # via + # -c .pin/../.pin/constraints-cuda-gnn.txt + # -r benchmarks/recursiongfn/requirements.in + # botorch +grpcio==1.66.0 + # via + # -c .pin/../.pin/constraints-cuda-gnn.txt + # tensorboard +idna==3.8 + # via + # -c .pin/../.pin/constraints-cuda-gnn.txt + # requests + # yarl +jaxtyping==0.2.33 + # via + # -c .pin/../.pin/constraints-cuda-gnn.txt + # linear-operator +jinja2==3.1.4 + # via + # -c .pin/../.pin/constraints-cuda-gnn.txt + # torch + # torch-geometric +joblib==1.4.2 + # via + # -c .pin/../.pin/constraints-cuda-gnn.txt + # scikit-learn +linear-operator==0.5.2 + # via + # -c .pin/../.pin/constraints-cuda-gnn.txt + # botorch + # gpytorch +markdown==3.7 + # via + # -c .pin/../.pin/constraints-cuda-gnn.txt + # tensorboard +markdown-it-py==3.0.0 + # via + # -c .pin/../.pin/constraints-cuda-gnn.txt + # rich +markupsafe==2.1.5 + # via + # -c .pin/../.pin/constraints-cuda-gnn.txt + # jinja2 + # werkzeug +mdurl==0.1.2 + # via + # -c .pin/../.pin/constraints-cuda-gnn.txt + # markdown-it-py +mpmath==1.3.0 + # via + # -c .pin/../.pin/constraints-cuda-gnn.txt + # botorch + # gpytorch + # sympy +msgpack==1.0.8 + # via + # -c .pin/../.pin/constraints-cuda-gnn.txt + # blosc2 +multidict==6.0.5 + # via + # -c .pin/../.pin/constraints-cuda-gnn.txt + # aiohttp + # yarl 
+multipledispatch==1.0.0 + # via + # -c .pin/../.pin/constraints-cuda-gnn.txt + # botorch +ndindex==1.8 + # via + # -c .pin/../.pin/constraints-cuda-gnn.txt + # blosc2 +networkx==3.3 + # via + # -c .pin/../.pin/constraints-cuda-gnn.txt + # -r benchmarks/recursiongfn/requirements.in + # torch +numexpr==2.10.1 + # via + # -c .pin/../.pin/constraints-cuda-gnn.txt + # blosc2 + # tables +numpy==1.26.4 + # via + # -c .pin/../.pin/constraints-cuda-gnn.txt + # blosc2 + # botorch + # numexpr + # opt-einsum + # pandas + # pyarrow + # pyro-ppl + # rdkit + # scikit-learn + # scipy + # tables + # tensorboard + # torch-geometric +nvidia-cublas-cu12==12.1.3.1 + # via + # -c .pin/../.pin/constraints-cuda-gnn.txt + # nvidia-cudnn-cu12 + # nvidia-cusolver-cu12 + # torch +nvidia-cuda-cupti-cu12==12.1.105 + # via + # -c .pin/../.pin/constraints-cuda-gnn.txt + # torch +nvidia-cuda-nvrtc-cu12==12.1.105 + # via + # -c .pin/../.pin/constraints-cuda-gnn.txt + # torch +nvidia-cuda-runtime-cu12==12.1.105 + # via + # -c .pin/../.pin/constraints-cuda-gnn.txt + # torch +nvidia-cudnn-cu12==8.9.2.26 + # via + # -c .pin/../.pin/constraints-cuda-gnn.txt + # torch +nvidia-cufft-cu12==11.0.2.54 + # via + # -c .pin/../.pin/constraints-cuda-gnn.txt + # torch +nvidia-curand-cu12==10.3.2.106 + # via + # -c .pin/../.pin/constraints-cuda-gnn.txt + # torch +nvidia-cusolver-cu12==11.4.5.107 + # via + # -c .pin/../.pin/constraints-cuda-gnn.txt + # torch +nvidia-cusparse-cu12==12.1.0.106 + # via + # -c .pin/../.pin/constraints-cuda-gnn.txt + # nvidia-cusolver-cu12 + # torch +nvidia-ml-py==12.560.30 + # via + # -c .pin/../.pin/constraints-cuda-gnn.txt + # voir +nvidia-nccl-cu12==2.20.5 + # via + # -c .pin/../.pin/constraints-cuda-gnn.txt + # torch +nvidia-nvjitlink-cu12==12.6.20 + # via + # -c .pin/../.pin/constraints-cuda-gnn.txt + # nvidia-cusolver-cu12 + # nvidia-cusparse-cu12 +nvidia-nvtx-cu12==12.1.105 + # via + # -c .pin/../.pin/constraints-cuda-gnn.txt + # torch +omegaconf==2.3.0 + # via + # -c .pin/../.pin/constraints-cuda-gnn.txt + # -r benchmarks/recursiongfn/requirements.in + # voir +opt-einsum==3.3.0 + # via + # -c .pin/../.pin/constraints-cuda-gnn.txt + # pyro-ppl +ovld==0.3.9 + # via + # -c .pin/../.pin/constraints-cuda-gnn.txt + # voir +packaging==24.1 + # via + # -c .pin/../.pin/constraints-cuda-gnn.txt + # tables + # tensorboard +pandas==2.2.2 + # via + # -c .pin/../.pin/constraints-cuda-gnn.txt + # -r benchmarks/recursiongfn/requirements.in +pillow==10.4.0 + # via + # -c .pin/../.pin/constraints-cuda-gnn.txt + # rdkit +platformdirs==4.2.2 + # via + # -c .pin/../.pin/constraints-cuda-gnn.txt + # wandb +protobuf==5.27.3 + # via + # -c .pin/../.pin/constraints-cuda-gnn.txt + # tensorboard + # wandb +psutil==5.9.8 + # via + # -c .pin/../.pin/constraints-cuda-gnn.txt + # torch-geometric + # voir + # wandb +ptera==1.4.1 + # via + # -c .pin/../.pin/constraints-cuda-gnn.txt + # voir +py-cpuinfo==9.0.0 + # via + # -c .pin/../.pin/constraints-cuda-gnn.txt + # blosc2 + # tables +pyarrow==17.0.0 + # via + # -c .pin/../.pin/constraints-cuda-gnn.txt + # -r benchmarks/recursiongfn/requirements.in +pygments==2.18.0 + # via + # -c .pin/../.pin/constraints-cuda-gnn.txt + # rich +pyparsing==3.1.4 + # via + # -c .pin/../.pin/constraints-cuda-gnn.txt + # torch-geometric +pyro-api==0.1.2 + # via + # -c .pin/../.pin/constraints-cuda-gnn.txt + # pyro-ppl +pyro-ppl==1.9.1 + # via + # -c .pin/../.pin/constraints-cuda-gnn.txt + # -r benchmarks/recursiongfn/requirements.in + # botorch +python-dateutil==2.9.0.post0 + # via + # -c 
.pin/../.pin/constraints-cuda-gnn.txt + # pandas +pytz==2024.1 + # via + # -c .pin/../.pin/constraints-cuda-gnn.txt + # pandas +pyyaml==6.0.2 + # via + # -c .pin/../.pin/constraints-cuda-gnn.txt + # omegaconf + # wandb +rdkit==2024.3.5 + # via + # -c .pin/../.pin/constraints-cuda-gnn.txt + # -r benchmarks/recursiongfn/requirements.in +reactivex==4.0.4 + # via + # -c .pin/../.pin/constraints-cuda-gnn.txt + # giving +requests==2.32.3 + # via + # -c .pin/../.pin/constraints-cuda-gnn.txt + # torch-geometric + # wandb +rich==13.8.0 + # via + # -c .pin/../.pin/constraints-cuda-gnn.txt + # voir +scikit-learn==1.5.1 + # via + # -c .pin/../.pin/constraints-cuda-gnn.txt + # gpytorch + # torch-geometric +scipy==1.14.1 + # via + # -c .pin/../.pin/constraints-cuda-gnn.txt + # -r benchmarks/recursiongfn/requirements.in + # botorch + # gpytorch + # linear-operator + # scikit-learn + # torch-cluster + # torch-geometric + # torch-sparse +sentry-sdk==2.13.0 + # via + # -c .pin/../.pin/constraints-cuda-gnn.txt + # wandb +setproctitle==1.3.3 + # via + # -c .pin/../.pin/constraints-cuda-gnn.txt + # wandb +six==1.16.0 + # via + # -c .pin/../.pin/constraints-cuda-gnn.txt + # asttokens + # docker-pycreds + # python-dateutil + # tensorboard +smmap==5.0.1 + # via + # -c .pin/../.pin/constraints-cuda-gnn.txt + # gitdb +sympy==1.13.2 + # via + # -c .pin/../.pin/constraints-cuda-gnn.txt + # torch +tables==3.10.1 + # via + # -c .pin/../.pin/constraints-cuda-gnn.txt + # -r benchmarks/recursiongfn/requirements.in +tensorboard==2.17.1 + # via + # -c .pin/../.pin/constraints-cuda-gnn.txt + # -r benchmarks/recursiongfn/requirements.in +tensorboard-data-server==0.7.2 + # via + # -c .pin/../.pin/constraints-cuda-gnn.txt + # tensorboard +threadpoolctl==3.5.0 + # via + # -c .pin/../.pin/constraints-cuda-gnn.txt + # scikit-learn +torch==2.3.1+cu121 + # via + # -c .pin/../.pin/constraints-cuda-gnn.txt + # -r benchmarks/recursiongfn/requirements.in + # botorch + # linear-operator + # pyro-ppl +torch-cluster==1.6.3+pt23cu121 + # via + # -c .pin/../.pin/constraints-cuda-gnn.txt + # -r benchmarks/recursiongfn/requirements.in +torch-geometric==2.5.3 + # via + # -c .pin/../.pin/constraints-cuda-gnn.txt + # -r benchmarks/recursiongfn/requirements.in +torch-scatter==2.1.2+pt23cu121 + # via + # -c .pin/../.pin/constraints-cuda-gnn.txt + # -r benchmarks/recursiongfn/requirements.in +torch-sparse==0.6.18+pt23cu121 + # via + # -c .pin/../.pin/constraints-cuda-gnn.txt + # -r benchmarks/recursiongfn/requirements.in +tqdm==4.66.5 + # via + # -c .pin/../.pin/constraints-cuda-gnn.txt + # pyro-ppl + # torch-geometric +triton==2.3.1 + # via + # -c .pin/../.pin/constraints-cuda-gnn.txt + # torch +typeguard==2.13.3 + # via + # -c .pin/../.pin/constraints-cuda-gnn.txt + # jaxtyping + # linear-operator +typing-extensions==4.12.2 + # via + # -c .pin/../.pin/constraints-cuda-gnn.txt + # reactivex + # tables + # torch +tzdata==2024.1 + # via + # -c .pin/../.pin/constraints-cuda-gnn.txt + # pandas +urllib3==2.2.2 + # via + # -c .pin/../.pin/constraints-cuda-gnn.txt + # requests + # sentry-sdk +varname==0.10.0 + # via + # -c .pin/../.pin/constraints-cuda-gnn.txt + # giving +voir==0.2.19 + # via + # -c .pin/../.pin/constraints-cuda-gnn.txt + # -c .pin/../constraints/cuda.txt + # -r benchmarks/recursiongfn/requirements.in +wandb==0.17.7 + # via + # -c .pin/../.pin/constraints-cuda-gnn.txt + # -r benchmarks/recursiongfn/requirements.in +werkzeug==3.0.4 + # via + # -c .pin/../.pin/constraints-cuda-gnn.txt + # tensorboard +yarl==1.9.4 + # via + # -c 
.pin/../.pin/constraints-cuda-gnn.txt + # aiohttp + +# The following packages are considered to be unsafe in a requirements file: +# setuptools diff --git a/benchmarks/recursiongfn/requirements.in b/benchmarks/recursiongfn/requirements.in new file mode 100644 index 000000000..77df598d3 --- /dev/null +++ b/benchmarks/recursiongfn/requirements.in @@ -0,0 +1,20 @@ +voir>=0.2.17,<0.3 +torch +torch-geometric +torch-scatter +torch-sparse +torch-cluster +rdkit +tables +scipy +networkx +tensorboard +cvxopt +pyarrow +gitpython +botorch +pyro-ppl +gpytorch +omegaconf>=2.3 +wandb +pandas diff --git a/benchmarks/recursiongfn/requirements.rocm.txt b/benchmarks/recursiongfn/requirements.rocm.txt new file mode 100644 index 000000000..1bc73f14e --- /dev/null +++ b/benchmarks/recursiongfn/requirements.rocm.txt @@ -0,0 +1,445 @@ +# +# This file is autogenerated by pip-compile with Python 3.10 +# by the following command: +# +# pip-compile --output-file=benchmarks/recursiongfn/requirements.rocm.txt .pin/tmp-constraints-rocm-recursiongfn_gnn.txt benchmarks/recursiongfn/requirements.in +# +--extra-index-url https://download.pytorch.org/whl/rocm6.0 + +absl-py==2.1.0 + # via + # -c .pin/../.pin/constraints-rocm-gnn.txt + # tensorboard +aiohappyeyeballs==2.4.0 + # via + # -c .pin/../.pin/constraints-rocm-gnn.txt + # aiohttp +aiohttp==3.10.5 + # via + # -c .pin/../.pin/constraints-rocm-gnn.txt + # torch-geometric +aiosignal==1.3.1 + # via + # -c .pin/../.pin/constraints-rocm-gnn.txt + # aiohttp +antlr4-python3-runtime==4.9.3 + # via + # -c .pin/../.pin/constraints-rocm-gnn.txt + # omegaconf +asttokens==2.4.1 + # via + # -c .pin/../.pin/constraints-rocm-gnn.txt + # giving +async-timeout==4.0.3 + # via + # -c .pin/../.pin/constraints-rocm-gnn.txt + # aiohttp +attrs==24.2.0 + # via + # -c .pin/../.pin/constraints-rocm-gnn.txt + # aiohttp +blosc2==2.7.1 + # via + # -c .pin/../.pin/constraints-rocm-gnn.txt + # tables +botorch==0.11.3 + # via + # -c .pin/../.pin/constraints-rocm-gnn.txt + # gflownet +certifi==2024.7.4 + # via + # -c .pin/../.pin/constraints-rocm-gnn.txt + # requests + # sentry-sdk +charset-normalizer==3.3.2 + # via + # -c .pin/../.pin/constraints-rocm-gnn.txt + # requests +click==8.1.7 + # via + # -c .pin/../.pin/constraints-rocm-gnn.txt + # wandb +codefind==0.1.6 + # via + # -c .pin/../.pin/constraints-rocm-gnn.txt + # ptera +cvxopt==1.3.2 + # via + # -c .pin/../.pin/constraints-rocm-gnn.txt + # gflownet +docker-pycreds==0.4.0 + # via + # -c .pin/../.pin/constraints-rocm-gnn.txt + # wandb +executing==1.2.0 + # via + # -c .pin/../.pin/constraints-rocm-gnn.txt + # varname +filelock==3.15.4 + # via + # -c .pin/../.pin/constraints-rocm-gnn.txt + # pytorch-triton-rocm + # torch +frozenlist==1.4.1 + # via + # -c .pin/../.pin/constraints-rocm-gnn.txt + # aiohttp + # aiosignal +fsspec==2024.6.1 + # via + # -c .pin/../.pin/constraints-rocm-gnn.txt + # torch + # torch-geometric +gflownet @ git+https://github.com/Delaunay/gflownet@milabench + # via + # -c .pin/../.pin/constraints-rocm-gnn.txt + # -r benchmarks/recursiongfn/requirements.in +gitdb==4.0.11 + # via + # -c .pin/../.pin/constraints-rocm-gnn.txt + # gitpython +gitpython==3.1.43 + # via + # -c .pin/../.pin/constraints-rocm-gnn.txt + # gflownet + # wandb +giving==0.4.2 + # via + # -c .pin/../.pin/constraints-rocm-gnn.txt + # ptera + # voir +gpytorch==1.12 + # via + # -c .pin/../.pin/constraints-rocm-gnn.txt + # botorch + # gflownet +grpcio==1.65.5 + # via + # -c .pin/../.pin/constraints-rocm-gnn.txt + # tensorboard +idna==3.7 + # via + # -c 
.pin/../.pin/constraints-rocm-gnn.txt + # requests + # yarl +jaxtyping==0.2.33 + # via + # -c .pin/../.pin/constraints-rocm-gnn.txt + # linear-operator +jinja2==3.1.4 + # via + # -c .pin/../.pin/constraints-rocm-gnn.txt + # torch + # torch-geometric +joblib==1.4.2 + # via + # -c .pin/../.pin/constraints-rocm-gnn.txt + # scikit-learn +linear-operator==0.5.2 + # via + # -c .pin/../.pin/constraints-rocm-gnn.txt + # botorch + # gpytorch +markdown==3.7 + # via + # -c .pin/../.pin/constraints-rocm-gnn.txt + # tensorboard +markdown-it-py==3.0.0 + # via + # -c .pin/../.pin/constraints-rocm-gnn.txt + # rich +markupsafe==2.1.5 + # via + # -c .pin/../.pin/constraints-rocm-gnn.txt + # jinja2 + # werkzeug +mdurl==0.1.2 + # via + # -c .pin/../.pin/constraints-rocm-gnn.txt + # markdown-it-py +mpmath==1.3.0 + # via + # -c .pin/../.pin/constraints-rocm-gnn.txt + # botorch + # gpytorch + # sympy +msgpack==1.0.8 + # via + # -c .pin/../.pin/constraints-rocm-gnn.txt + # blosc2 +multidict==6.0.5 + # via + # -c .pin/../.pin/constraints-rocm-gnn.txt + # aiohttp + # yarl +multipledispatch==1.0.0 + # via + # -c .pin/../.pin/constraints-rocm-gnn.txt + # botorch +ndindex==1.8 + # via + # -c .pin/../.pin/constraints-rocm-gnn.txt + # blosc2 +networkx==3.3 + # via + # -c .pin/../.pin/constraints-rocm-gnn.txt + # gflownet + # torch +numexpr==2.10.1 + # via + # -c .pin/../.pin/constraints-rocm-gnn.txt + # blosc2 + # tables +numpy==1.26.4 + # via + # -c .pin/../.pin/constraints-rocm-gnn.txt + # blosc2 + # botorch + # numexpr + # opt-einsum + # pandas + # pyarrow + # pyro-ppl + # rdkit + # scikit-learn + # scipy + # tables + # tensorboard + # torch-geometric +omegaconf==2.3.0 + # via + # -c .pin/../.pin/constraints-rocm-gnn.txt + # gflownet + # voir +opt-einsum==3.3.0 + # via + # -c .pin/../.pin/constraints-rocm-gnn.txt + # pyro-ppl +ovld==0.3.8 + # via + # -c .pin/../.pin/constraints-rocm-gnn.txt + # voir +packaging==24.1 + # via + # -c .pin/../.pin/constraints-rocm-gnn.txt + # tables + # tensorboard +pandas==2.2.2 + # via + # -c .pin/../.pin/constraints-rocm-gnn.txt + # gflownet +pillow==10.4.0 + # via + # -c .pin/../.pin/constraints-rocm-gnn.txt + # rdkit +platformdirs==4.2.2 + # via + # -c .pin/../.pin/constraints-rocm-gnn.txt + # wandb +protobuf==5.27.3 + # via + # -c .pin/../.pin/constraints-rocm-gnn.txt + # tensorboard + # wandb +psutil==5.9.8 + # via + # -c .pin/../.pin/constraints-rocm-gnn.txt + # torch-geometric + # voir + # wandb +ptera==1.4.1 + # via + # -c .pin/../.pin/constraints-rocm-gnn.txt + # voir +py-cpuinfo==9.0.0 + # via + # -c .pin/../.pin/constraints-rocm-gnn.txt + # blosc2 + # tables +pyarrow==17.0.0 + # via + # -c .pin/../.pin/constraints-rocm-gnn.txt + # gflownet +pygments==2.18.0 + # via + # -c .pin/../.pin/constraints-rocm-gnn.txt + # rich +pynvml==11.5.3 + # via + # -c .pin/../.pin/constraints-rocm-gnn.txt + # voir +pyparsing==3.1.2 + # via + # -c .pin/../.pin/constraints-rocm-gnn.txt + # torch-geometric +pyro-api==0.1.2 + # via + # -c .pin/../.pin/constraints-rocm-gnn.txt + # pyro-ppl +pyro-ppl==1.9.1 + # via + # -c .pin/../.pin/constraints-rocm-gnn.txt + # botorch + # gflownet +python-dateutil==2.9.0.post0 + # via + # -c .pin/../.pin/constraints-rocm-gnn.txt + # pandas +pytorch-triton-rocm==3.0.0 + # via + # -c .pin/../.pin/constraints-rocm-gnn.txt + # torch +pytz==2024.1 + # via + # -c .pin/../.pin/constraints-rocm-gnn.txt + # pandas +pyyaml==6.0.2 + # via + # -c .pin/../.pin/constraints-rocm-gnn.txt + # omegaconf + # wandb +rdkit==2024.3.5 + # via + # -c .pin/../.pin/constraints-rocm-gnn.txt 
+ # gflownet +reactivex==4.0.4 + # via + # -c .pin/../.pin/constraints-rocm-gnn.txt + # giving +requests==2.32.3 + # via + # -c .pin/../.pin/constraints-rocm-gnn.txt + # torch-geometric + # wandb +rich==13.7.1 + # via + # -c .pin/../.pin/constraints-rocm-gnn.txt + # voir +scikit-learn==1.5.1 + # via + # -c .pin/../.pin/constraints-rocm-gnn.txt + # gpytorch + # torch-geometric +scipy==1.14.0 + # via + # -c .pin/../.pin/constraints-rocm-gnn.txt + # botorch + # gflownet + # gpytorch + # linear-operator + # scikit-learn + # torch-cluster + # torch-geometric + # torch-sparse +sentry-sdk==2.13.0 + # via + # -c .pin/../.pin/constraints-rocm-gnn.txt + # wandb +setproctitle==1.3.3 + # via + # -c .pin/../.pin/constraints-rocm-gnn.txt + # wandb +six==1.16.0 + # via + # -c .pin/../.pin/constraints-rocm-gnn.txt + # asttokens + # docker-pycreds + # python-dateutil + # tensorboard +smmap==5.0.1 + # via + # -c .pin/../.pin/constraints-rocm-gnn.txt + # gitdb +sympy==1.13.2 + # via + # -c .pin/../.pin/constraints-rocm-gnn.txt + # torch +tables==3.10.1 + # via + # -c .pin/../.pin/constraints-rocm-gnn.txt + # gflownet +tensorboard==2.17.1 + # via + # -c .pin/../.pin/constraints-rocm-gnn.txt + # gflownet +tensorboard-data-server==0.7.2 + # via + # -c .pin/../.pin/constraints-rocm-gnn.txt + # tensorboard +threadpoolctl==3.5.0 + # via + # -c .pin/../.pin/constraints-rocm-gnn.txt + # scikit-learn +torch==2.4.0+rocm6.0 + # via + # -c .pin/../.pin/constraints-rocm-gnn.txt + # -r benchmarks/recursiongfn/requirements.in + # botorch + # gflownet + # linear-operator + # pyro-ppl +torch-cluster==1.6.3 + # via + # -c .pin/../.pin/constraints-rocm-gnn.txt + # gflownet +torch-geometric==2.5.3 + # via + # -c .pin/../.pin/constraints-rocm-gnn.txt + # gflownet +torch-scatter==2.1.2 + # via + # -c .pin/../.pin/constraints-rocm-gnn.txt + # gflownet +torch-sparse==0.6.18 + # via + # -c .pin/../.pin/constraints-rocm-gnn.txt + # gflownet +tqdm==4.66.5 + # via + # -c .pin/../.pin/constraints-rocm-gnn.txt + # pyro-ppl + # torch-geometric +typeguard==2.13.3 + # via + # -c .pin/../.pin/constraints-rocm-gnn.txt + # jaxtyping + # linear-operator +typing-extensions==4.12.2 + # via + # -c .pin/../.pin/constraints-rocm-gnn.txt + # reactivex + # tables + # torch +tzdata==2024.1 + # via + # -c .pin/../.pin/constraints-rocm-gnn.txt + # pandas +urllib3==2.2.2 + # via + # -c .pin/../.pin/constraints-rocm-gnn.txt + # requests + # sentry-sdk +varname==0.10.0 + # via + # -c .pin/../.pin/constraints-rocm-gnn.txt + # giving +voir==0.2.17 + # via + # -c .pin/../.pin/constraints-rocm-gnn.txt + # -c .pin/../constraints/rocm.txt + # -r benchmarks/recursiongfn/requirements.in +wandb==0.17.7 + # via + # -c .pin/../.pin/constraints-rocm-gnn.txt + # gflownet +werkzeug==3.0.3 + # via + # -c .pin/../.pin/constraints-rocm-gnn.txt + # tensorboard +yarl==1.9.4 + # via + # -c .pin/../.pin/constraints-rocm-gnn.txt + # aiohttp + +# The following packages are considered to be unsafe in a requirements file: +# setuptools diff --git a/benchmarks/recursiongfn/voirfile.py b/benchmarks/recursiongfn/voirfile.py new file mode 100644 index 000000000..d93f886cd --- /dev/null +++ b/benchmarks/recursiongfn/voirfile.py @@ -0,0 +1,38 @@ +from dataclasses import dataclass + +from voir import configurable +from voir.instruments import dash, early_stop, log, rate +from benchmate.monitor import monitor_monogpu + +@dataclass +class Config: + """voir configuration""" + + # Whether to display the dash or not + dash: bool = False + + # How often to log the rates + interval: str = "1s" + 
+ # Number of rates to skip before logging + skip: int = 5 + + # Number of rates to log before stopping + stop: int = 20 + + # Number of seconds between each gpu poll + gpu_poll: int = 3 + + +@configurable +def instrument_main(ov, options: Config): + yield ov.phases.init + + if options.dash: + ov.require(dash) + + ov.require( + log("value", "progress", "rate", "units", "loss", "gpudata", context="task"), + early_stop(n=options.stop, key="rate", task="train"), + monitor_monogpu(poll_interval=options.gpu_poll), + ) diff --git a/benchmarks/stargan/README.md b/benchmarks/retired/stargan/README.md similarity index 100% rename from benchmarks/stargan/README.md rename to benchmarks/retired/stargan/README.md diff --git a/benchmarks/stargan/benchfile.py b/benchmarks/retired/stargan/benchfile.py similarity index 100% rename from benchmarks/stargan/benchfile.py rename to benchmarks/retired/stargan/benchfile.py diff --git a/benchmarks/stargan/prepare.py b/benchmarks/retired/stargan/prepare.py similarity index 100% rename from benchmarks/stargan/prepare.py rename to benchmarks/retired/stargan/prepare.py diff --git a/benchmarks/stargan/requirements.cuda.txt b/benchmarks/retired/stargan/requirements.cuda.txt similarity index 100% rename from benchmarks/stargan/requirements.cuda.txt rename to benchmarks/retired/stargan/requirements.cuda.txt diff --git a/benchmarks/stargan/requirements.hpu.txt b/benchmarks/retired/stargan/requirements.hpu.txt similarity index 100% rename from benchmarks/stargan/requirements.hpu.txt rename to benchmarks/retired/stargan/requirements.hpu.txt diff --git a/benchmarks/stargan/requirements.in b/benchmarks/retired/stargan/requirements.in similarity index 100% rename from benchmarks/stargan/requirements.in rename to benchmarks/retired/stargan/requirements.in diff --git a/benchmarks/stargan/requirements.rocm.txt b/benchmarks/retired/stargan/requirements.rocm.txt similarity index 100% rename from benchmarks/stargan/requirements.rocm.txt rename to benchmarks/retired/stargan/requirements.rocm.txt diff --git a/benchmarks/stargan/requirements.xpu.txt b/benchmarks/retired/stargan/requirements.xpu.txt similarity index 100% rename from benchmarks/stargan/requirements.xpu.txt rename to benchmarks/retired/stargan/requirements.xpu.txt diff --git a/benchmarks/stargan/stargan/LICENSE b/benchmarks/retired/stargan/stargan/LICENSE similarity index 100% rename from benchmarks/stargan/stargan/LICENSE rename to benchmarks/retired/stargan/stargan/LICENSE diff --git a/benchmarks/stargan/stargan/ORIGIN.md b/benchmarks/retired/stargan/stargan/ORIGIN.md similarity index 100% rename from benchmarks/stargan/stargan/ORIGIN.md rename to benchmarks/retired/stargan/stargan/ORIGIN.md diff --git a/benchmarks/stargan/stargan/README.md b/benchmarks/retired/stargan/stargan/README.md similarity index 100% rename from benchmarks/stargan/stargan/README.md rename to benchmarks/retired/stargan/stargan/README.md diff --git a/benchmarks/stargan/stargan/data_loader.py b/benchmarks/retired/stargan/stargan/data_loader.py similarity index 100% rename from benchmarks/stargan/stargan/data_loader.py rename to benchmarks/retired/stargan/stargan/data_loader.py diff --git a/benchmarks/stargan/stargan/download.sh b/benchmarks/retired/stargan/stargan/download.sh similarity index 100% rename from benchmarks/stargan/stargan/download.sh rename to benchmarks/retired/stargan/stargan/download.sh diff --git a/benchmarks/stargan/stargan/logger.py b/benchmarks/retired/stargan/stargan/logger.py similarity index 100% rename from 
benchmarks/stargan/stargan/logger.py rename to benchmarks/retired/stargan/stargan/logger.py diff --git a/benchmarks/stargan/stargan/main.py b/benchmarks/retired/stargan/stargan/main.py similarity index 100% rename from benchmarks/stargan/stargan/main.py rename to benchmarks/retired/stargan/stargan/main.py diff --git a/benchmarks/stargan/stargan/model.py b/benchmarks/retired/stargan/stargan/model.py similarity index 100% rename from benchmarks/stargan/stargan/model.py rename to benchmarks/retired/stargan/stargan/model.py diff --git a/benchmarks/stargan/stargan/solver.py b/benchmarks/retired/stargan/stargan/solver.py similarity index 100% rename from benchmarks/stargan/stargan/solver.py rename to benchmarks/retired/stargan/stargan/solver.py diff --git a/benchmarks/stargan/stargan/synth.py b/benchmarks/retired/stargan/stargan/synth.py similarity index 100% rename from benchmarks/stargan/stargan/synth.py rename to benchmarks/retired/stargan/stargan/synth.py diff --git a/benchmarks/stargan/voirfile.py b/benchmarks/retired/stargan/voirfile.py similarity index 100% rename from benchmarks/stargan/voirfile.py rename to benchmarks/retired/stargan/voirfile.py diff --git a/benchmarks/super-slomo/requirements.cuda.txt b/benchmarks/super-slomo/requirements.cuda.txt index e4e207911..88c4880e5 100644 --- a/benchmarks/super-slomo/requirements.cuda.txt +++ b/benchmarks/super-slomo/requirements.cuda.txt @@ -30,7 +30,7 @@ filelock==3.15.4 # -c .pin/../.pin/constraints-cuda-torch.txt # torch # triton -fsspec==2024.5.0 +fsspec==2024.6.1 # via # -c .pin/../.pin/constraints-cuda-torch.txt # torch @@ -108,6 +108,10 @@ nvidia-cusparse-cu12==12.1.0.106 # -c .pin/../.pin/constraints-cuda-torch.txt # nvidia-cusolver-cu12 # torch +nvidia-ml-py==12.560.30 + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # voir nvidia-nccl-cu12==2.20.5 # via # -c .pin/../.pin/constraints-cuda-torch.txt @@ -129,7 +133,7 @@ opencv-python==4.10.0.84 # via # -c .pin/../.pin/constraints-cuda-torch.txt # -r benchmarks/super-slomo/requirements.in -ovld==0.3.6 +ovld==0.3.9 # via # -c .pin/../.pin/constraints-cuda-torch.txt # voir @@ -149,11 +153,7 @@ pygments==2.18.0 # via # -c .pin/../.pin/constraints-cuda-torch.txt # rich -pynvml==11.5.3 - # via - # -c .pin/../.pin/constraints-cuda-torch.txt - # voir -pyyaml==6.0.1 +pyyaml==6.0.2 # via # -c .pin/../.pin/constraints-cuda-torch.txt # omegaconf @@ -161,7 +161,7 @@ reactivex==4.0.4 # via # -c .pin/../.pin/constraints-cuda-torch.txt # giving -rich==13.7.1 +rich==13.8.0 # via # -c .pin/../.pin/constraints-cuda-torch.txt # voir @@ -169,7 +169,7 @@ six==1.16.0 # via # -c .pin/../.pin/constraints-cuda-torch.txt # asttokens -sympy==1.13.1 +sympy==1.13.2 # via # -c .pin/../.pin/constraints-cuda-torch.txt # torch @@ -182,7 +182,7 @@ torchvision==0.19.0+cu121 # via # -c .pin/../.pin/constraints-cuda-torch.txt # -r benchmarks/super-slomo/requirements.in -tqdm==4.66.4 +tqdm==4.66.5 # via # -c .pin/../.pin/constraints-cuda-torch.txt # -r benchmarks/super-slomo/requirements.in diff --git a/benchmarks/super-slomo/requirements.rocm.txt b/benchmarks/super-slomo/requirements.rocm.txt index 4b1ce5b0c..d85fcf14a 100644 --- a/benchmarks/super-slomo/requirements.rocm.txt +++ b/benchmarks/super-slomo/requirements.rocm.txt @@ -4,10 +4,7 @@ # # pip-compile --output-file=benchmarks/super-slomo/requirements.rocm.txt .pin/tmp-constraints-rocm-super-slomo.txt benchmarks/super-slomo/requirements.in # ---extra-index-url https://pypi.ngc.nvidia.com --extra-index-url https://download.pytorch.org/whl/rocm6.0 ---find-links 
https://storage.googleapis.com/jax-releases/jax_cuda_releases.html ---trusted-host pypi.ngc.nvidia.com antlr4-python3-runtime==4.9.3 # via @@ -30,7 +27,7 @@ filelock==3.15.4 # -c .pin/../.pin/constraints-rocm-torch.txt # pytorch-triton-rocm # torch -fsspec==2024.5.0 +fsspec==2024.6.1 # via # -c .pin/../.pin/constraints-rocm-torch.txt # torch @@ -77,7 +74,7 @@ opencv-python==4.10.0.84 # via # -c .pin/../.pin/constraints-rocm-torch.txt # -r benchmarks/super-slomo/requirements.in -ovld==0.3.5 +ovld==0.3.8 # via # -c .pin/../.pin/constraints-rocm-torch.txt # voir @@ -101,11 +98,11 @@ pynvml==11.5.3 # via # -c .pin/../.pin/constraints-rocm-torch.txt # voir -pytorch-triton-rocm==2.3.1 +pytorch-triton-rocm==3.0.0 # via # -c .pin/../.pin/constraints-rocm-torch.txt # torch -pyyaml==6.0.1 +pyyaml==6.0.2 # via # -c .pin/../.pin/constraints-rocm-torch.txt # omegaconf @@ -121,20 +118,20 @@ six==1.16.0 # via # -c .pin/../.pin/constraints-rocm-torch.txt # asttokens -sympy==1.13.0 +sympy==1.13.2 # via # -c .pin/../.pin/constraints-rocm-torch.txt # torch -torch==2.3.1+rocm6.0 +torch==2.4.0+rocm6.0 # via # -c .pin/../.pin/constraints-rocm-torch.txt # -r benchmarks/super-slomo/requirements.in # torchvision -torchvision==0.18.1+rocm6.0 +torchvision==0.19.0+rocm6.0 # via # -c .pin/../.pin/constraints-rocm-torch.txt # -r benchmarks/super-slomo/requirements.in -tqdm==4.66.4 +tqdm==4.66.5 # via # -c .pin/../.pin/constraints-rocm-torch.txt # -r benchmarks/super-slomo/requirements.in diff --git a/benchmarks/super-slomo/requirements.xpu.txt b/benchmarks/super-slomo/requirements.xpu.txt index 2053d45b6..1a40b14fb 100644 --- a/benchmarks/super-slomo/requirements.xpu.txt +++ b/benchmarks/super-slomo/requirements.xpu.txt @@ -4,10 +4,8 @@ # # pip-compile --output-file=benchmarks/super-slomo/requirements.xpu.txt .pin/tmp-constraints-xpu-super-slomo.txt benchmarks/super-slomo/requirements.in # ---extra-index-url https://pypi.ngc.nvidia.com +--extra-index-url https://download.pytorch.org/whl/cpu --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/us/ ---find-links https://storage.googleapis.com/jax-releases/jax_cuda_releases.html ---trusted-host pypi.ngc.nvidia.com antlr4-python3-runtime==4.9.3 # via @@ -17,14 +15,6 @@ asttokens==2.4.1 # via # -c .pin/../.pin/constraints-xpu-torch.txt # giving -certifi==2024.6.2 - # via - # -c .pin/../.pin/constraints-xpu-torch.txt - # requests -charset-normalizer==3.3.2 - # via - # -c .pin/../.pin/constraints-xpu-torch.txt - # requests codefind==0.1.6 # via # -c .pin/../.pin/constraints-xpu-torch.txt @@ -46,10 +36,6 @@ giving==0.4.2 # -c .pin/../.pin/constraints-xpu-torch.txt # ptera # voir -idna==3.7 - # via - # -c .pin/../.pin/constraints-xpu-torch.txt - # requests jinja2==3.1.4 # via # -c .pin/../.pin/constraints-xpu-torch.txt @@ -88,7 +74,7 @@ opencv-python==4.10.0.84 # via # -c .pin/../.pin/constraints-xpu-torch.txt # -r benchmarks/super-slomo/requirements.in -ovld==0.3.5 +ovld==0.3.8 # via # -c .pin/../.pin/constraints-xpu-torch.txt # voir @@ -112,7 +98,7 @@ pynvml==11.5.3 # via # -c .pin/../.pin/constraints-xpu-torch.txt # voir -pyyaml==6.0.1 +pyyaml==6.0.2 # via # -c .pin/../.pin/constraints-xpu-torch.txt # omegaconf @@ -120,10 +106,6 @@ reactivex==4.0.4 # via # -c .pin/../.pin/constraints-xpu-torch.txt # giving -requests==2.32.3 - # via - # -c .pin/../.pin/constraints-xpu-torch.txt - # torchvision rich==13.7.1 # via # -c .pin/../.pin/constraints-xpu-torch.txt @@ -132,22 +114,24 @@ six==1.16.0 # via # -c .pin/../.pin/constraints-xpu-torch.txt # asttokens 
-sympy==1.13.0 +sympy==1.13.1 # via # -c .pin/../.pin/constraints-xpu-torch.txt # torch -torch==2.1.0.post2+cxx11.abi +torch==2.4.0+cpu # via # -c .pin/../.pin/constraints-xpu-torch.txt + # -c .pin/../constraints/extra/torch.xpu.txt # -c .pin/../constraints/xpu.txt # -r benchmarks/super-slomo/requirements.in # torchvision -torchvision==0.16.0.post2+cxx11.abi +torchvision==0.19.0+cpu # via # -c .pin/../.pin/constraints-xpu-torch.txt + # -c .pin/../constraints/extra/torch.xpu.txt # -c .pin/../constraints/xpu.txt # -r benchmarks/super-slomo/requirements.in -tqdm==4.66.4 +tqdm==4.66.5 # via # -c .pin/../.pin/constraints-xpu-torch.txt # -r benchmarks/super-slomo/requirements.in @@ -156,10 +140,6 @@ typing-extensions==4.12.2 # -c .pin/../.pin/constraints-xpu-torch.txt # reactivex # torch -urllib3==1.26.19 - # via - # -c .pin/../.pin/constraints-xpu-torch.txt - # requests varname==0.10.0 # via # -c .pin/../.pin/constraints-xpu-torch.txt @@ -167,5 +147,6 @@ varname==0.10.0 voir==0.2.19 # via # -c .pin/../.pin/constraints-xpu-torch.txt + # -c .pin/../constraints/extra/torch.xpu.txt # -c .pin/../constraints/xpu.txt # -r benchmarks/super-slomo/requirements.in diff --git a/benchmarks/timm/benchfile.py b/benchmarks/timm/benchfile.py index 52a31ba1d..df36de07e 100644 --- a/benchmarks/timm/benchfile.py +++ b/benchmarks/timm/benchfile.py @@ -30,9 +30,6 @@ async def install(self): "https://github.com/huggingface/pytorch-image-models", BRANCH ) - # Install TIMM first - # await self.pip_install("-e", str(timm)) - # install the rest, which might override what TIMM specified await super().install() diff --git a/benchmarks/timm/requirements.cuda.txt b/benchmarks/timm/requirements.cuda.txt index 84f2f328d..d7e100fb2 100644 --- a/benchmarks/timm/requirements.cuda.txt +++ b/benchmarks/timm/requirements.cuda.txt @@ -39,7 +39,7 @@ filelock==3.15.4 # huggingface-hub # torch # triton -fsspec==2024.5.0 +fsspec==2024.6.1 # via # -c .pin/../.pin/constraints-cuda-torch.txt # huggingface-hub @@ -49,11 +49,11 @@ giving==0.4.2 # -c .pin/../.pin/constraints-cuda-torch.txt # ptera # voir -huggingface-hub==0.24.5 +huggingface-hub==0.24.6 # via # -c .pin/../.pin/constraints-cuda-torch.txt # -r benchmarks/timm/requirements.in -idna==3.7 +idna==3.8 # via # -c .pin/../.pin/constraints-cuda-torch.txt # requests @@ -124,6 +124,10 @@ nvidia-cusparse-cu12==12.1.0.106 # -c .pin/../.pin/constraints-cuda-torch.txt # nvidia-cusolver-cu12 # torch +nvidia-ml-py==12.560.30 + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # voir nvidia-nccl-cu12==2.20.5 # via # -c .pin/../.pin/constraints-cuda-torch.txt @@ -141,7 +145,7 @@ omegaconf==2.3.0 # via # -c .pin/../.pin/constraints-cuda-torch.txt # voir -ovld==0.3.6 +ovld==0.3.9 # via # -c .pin/../.pin/constraints-cuda-torch.txt # voir @@ -165,11 +169,7 @@ pygments==2.18.0 # via # -c .pin/../.pin/constraints-cuda-torch.txt # rich -pynvml==11.5.3 - # via - # -c .pin/../.pin/constraints-cuda-torch.txt - # voir -pyyaml==6.0.1 +pyyaml==6.0.2 # via # -c .pin/../.pin/constraints-cuda-torch.txt # -r benchmarks/timm/requirements.in @@ -183,11 +183,11 @@ requests==2.32.3 # via # -c .pin/../.pin/constraints-cuda-torch.txt # huggingface-hub -rich==13.7.1 +rich==13.8.0 # via # -c .pin/../.pin/constraints-cuda-torch.txt # voir -safetensors==0.4.3 +safetensors==0.4.4 # via # -c .pin/../.pin/constraints-cuda-torch.txt # -r benchmarks/timm/requirements.in @@ -195,7 +195,7 @@ six==1.16.0 # via # -c .pin/../.pin/constraints-cuda-torch.txt # asttokens -sympy==1.13.1 +sympy==1.13.2 # via # -c 
.pin/../.pin/constraints-cuda-torch.txt # torch @@ -208,7 +208,7 @@ torchvision==0.19.0+cu121 # via # -c .pin/../.pin/constraints-cuda-torch.txt # -r benchmarks/timm/requirements.in -tqdm==4.66.4 +tqdm==4.66.5 # via # -c .pin/../.pin/constraints-cuda-torch.txt # huggingface-hub diff --git a/benchmarks/timm/requirements.rocm.txt b/benchmarks/timm/requirements.rocm.txt index a27a7da2d..8383f9e6b 100644 --- a/benchmarks/timm/requirements.rocm.txt +++ b/benchmarks/timm/requirements.rocm.txt @@ -4,10 +4,7 @@ # # pip-compile --output-file=benchmarks/timm/requirements.rocm.txt .pin/tmp-constraints-rocm-timm.txt benchmarks/timm/requirements.in # ---extra-index-url https://pypi.ngc.nvidia.com --extra-index-url https://download.pytorch.org/whl/rocm6.0 ---find-links https://storage.googleapis.com/jax-releases/jax_cuda_releases.html ---trusted-host pypi.ngc.nvidia.com antlr4-python3-runtime==4.9.3 # via @@ -17,7 +14,7 @@ asttokens==2.4.1 # via # -c .pin/../.pin/constraints-rocm-torch.txt # giving -certifi==2024.6.2 +certifi==2024.7.4 # via # -c .pin/../.pin/constraints-rocm-torch.txt # requests @@ -39,7 +36,7 @@ filelock==3.15.4 # huggingface-hub # pytorch-triton-rocm # torch -fsspec==2024.5.0 +fsspec==2024.6.1 # via # -c .pin/../.pin/constraints-rocm-torch.txt # huggingface-hub @@ -49,7 +46,7 @@ giving==0.4.2 # -c .pin/../.pin/constraints-rocm-torch.txt # ptera # voir -huggingface-hub==0.23.5 +huggingface-hub==0.24.6 # via # -c .pin/../.pin/constraints-rocm-torch.txt # -r benchmarks/timm/requirements.in @@ -89,7 +86,7 @@ omegaconf==2.3.0 # via # -c .pin/../.pin/constraints-rocm-torch.txt # voir -ovld==0.3.5 +ovld==0.3.8 # via # -c .pin/../.pin/constraints-rocm-torch.txt # voir @@ -117,11 +114,11 @@ pynvml==11.5.3 # via # -c .pin/../.pin/constraints-rocm-torch.txt # voir -pytorch-triton-rocm==2.3.1 +pytorch-triton-rocm==3.0.0 # via # -c .pin/../.pin/constraints-rocm-torch.txt # torch -pyyaml==6.0.1 +pyyaml==6.0.2 # via # -c .pin/../.pin/constraints-rocm-torch.txt # -r benchmarks/timm/requirements.in @@ -139,7 +136,7 @@ rich==13.7.1 # via # -c .pin/../.pin/constraints-rocm-torch.txt # voir -safetensors==0.4.3 +safetensors==0.4.4 # via # -c .pin/../.pin/constraints-rocm-torch.txt # -r benchmarks/timm/requirements.in @@ -147,20 +144,20 @@ six==1.16.0 # via # -c .pin/../.pin/constraints-rocm-torch.txt # asttokens -sympy==1.13.0 +sympy==1.13.2 # via # -c .pin/../.pin/constraints-rocm-torch.txt # torch -torch==2.3.1+rocm6.0 +torch==2.4.0+rocm6.0 # via # -c .pin/../.pin/constraints-rocm-torch.txt # -r benchmarks/timm/requirements.in # torchvision -torchvision==0.18.1+rocm6.0 +torchvision==0.19.0+rocm6.0 # via # -c .pin/../.pin/constraints-rocm-torch.txt # -r benchmarks/timm/requirements.in -tqdm==4.66.4 +tqdm==4.66.5 # via # -c .pin/../.pin/constraints-rocm-torch.txt # huggingface-hub @@ -170,7 +167,7 @@ typing-extensions==4.12.2 # huggingface-hub # reactivex # torch -urllib3==1.26.19 +urllib3==2.2.2 # via # -c .pin/../.pin/constraints-rocm-torch.txt # requests diff --git a/benchmarks/timm/requirements.xpu.txt b/benchmarks/timm/requirements.xpu.txt index d71eb8433..d7f993372 100644 --- a/benchmarks/timm/requirements.xpu.txt +++ b/benchmarks/timm/requirements.xpu.txt @@ -4,10 +4,9 @@ # # pip-compile --output-file=benchmarks/timm/requirements.xpu.txt .pin/tmp-constraints-xpu-timm.txt benchmarks/timm/requirements.in # ---extra-index-url https://pypi.ngc.nvidia.com +--extra-index-url https://download.pytorch.org/whl/cpu --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/us/ --find-links 
https://storage.googleapis.com/jax-releases/jax_cuda_releases.html
---trusted-host pypi.ngc.nvidia.com
antlr4-python3-runtime==4.9.3
# via
@@ -17,7 +16,7 @@ asttokens==2.4.1
# via
# -c .pin/../.pin/constraints-xpu-torch.txt
# giving
-certifi==2024.6.2
+certifi==2024.7.4
# via
# -c .pin/../.pin/constraints-xpu-torch.txt
# requests
@@ -48,7 +47,7 @@ giving==0.4.2
# -c .pin/../.pin/constraints-xpu-torch.txt
# ptera
# voir
-huggingface-hub==0.24.0
+huggingface-hub==0.24.5
# via
# -c .pin/../.pin/constraints-xpu-torch.txt
# -r benchmarks/timm/requirements.in
@@ -88,7 +87,7 @@ omegaconf==2.3.0
# via
# -c .pin/../.pin/constraints-xpu-torch.txt
# voir
-ovld==0.3.5
+ovld==0.3.8
# via
# -c .pin/../.pin/constraints-xpu-torch.txt
# voir
@@ -116,7 +115,7 @@ pynvml==11.5.3
# via
# -c .pin/../.pin/constraints-xpu-torch.txt
# voir
-pyyaml==6.0.1
+pyyaml==6.0.2
# via
# -c .pin/../.pin/constraints-xpu-torch.txt
# -r benchmarks/timm/requirements.in
@@ -130,12 +129,11 @@ requests==2.32.3
# via
# -c .pin/../.pin/constraints-xpu-torch.txt
# huggingface-hub
- # torchvision
rich==13.7.1
# via
# -c .pin/../.pin/constraints-xpu-torch.txt
# voir
-safetensors==0.4.3
+safetensors==0.4.4
# via
# -c .pin/../.pin/constraints-xpu-torch.txt
# -r benchmarks/timm/requirements.in
@@ -143,22 +141,22 @@ six==1.16.0
# via
# -c .pin/../.pin/constraints-xpu-torch.txt
# asttokens
-sympy==1.13.0
+sympy==1.13.1
# via
# -c .pin/../.pin/constraints-xpu-torch.txt
# torch
-torch==2.1.0.post2+cxx11.abi
+torch==2.4.0+cpu
# via
# -c .pin/../.pin/constraints-xpu-torch.txt
# -c .pin/../constraints/xpu.txt
# -r benchmarks/timm/requirements.in
# torchvision
-torchvision==0.16.0.post2+cxx11.abi
+torchvision==0.19.0+cpu
# via
# -c .pin/../.pin/constraints-xpu-torch.txt
# -c .pin/../constraints/xpu.txt
# -r benchmarks/timm/requirements.in
-tqdm==4.66.4
+tqdm==4.66.5
# via
# -c .pin/../.pin/constraints-xpu-torch.txt
# huggingface-hub
@@ -168,7 +166,7 @@ typing-extensions==4.12.2
# via
# -c .pin/../.pin/constraints-xpu-torch.txt
# huggingface-hub
# reactivex
# torch
-urllib3==1.26.19
+urllib3==2.2.2
# via
# -c .pin/../.pin/constraints-xpu-torch.txt
# requests
diff --git a/benchmarks/torchatari/Makefile b/benchmarks/torchatari/Makefile
new file mode 100644
index 000000000..9eb0a30c5
--- /dev/null
+++ b/benchmarks/torchatari/Makefile
@@ -0,0 +1,30 @@
+# Use global base if possible
+ifndef MILABENCH_BASE
+ MILABENCH_BASE="base"
+endif
+
+export MILABENCH_BASE
+
+BENCH_NAME=torchatari
+MILABENCH_CONFIG=dev.yaml
+MILABENCH_ARGS=--config $(MILABENCH_CONFIG) --base $(MILABENCH_BASE)
+
+all: install prepare single gpus nodes
+
+install:
+ milabench install $(MILABENCH_ARGS) --force
+
+prepare:
+ milabench prepare $(MILABENCH_ARGS)
+
+tests:
+ MILABENCH_CPU_AUTO=1 CUDA_VISIBLE_DEVICES=0,1 milabench run $(MILABENCH_ARGS)
+
+single:
+ MILABENCH_CPU_AUTO=1 milabench run $(MILABENCH_ARGS) --select $(BENCH_NAME)
+
+gpus:
+ milabench run $(MILABENCH_ARGS) --select $(BENCH_NAME)-gpus
+
+nodes:
+ milabench run $(MILABENCH_ARGS) --select $(BENCH_NAME)-nodes
diff --git a/benchmarks/torchatari/README.md b/benchmarks/torchatari/README.md
new file mode 100644
index 000000000..44de20162
--- /dev/null
+++ b/benchmarks/torchatari/README.md
@@ -0,0 +1,4 @@
+
+# Torch_ppo_atari_envpool
+
+CleanRL-style PPO on Atari (Breakout-v5 by default), using EnvPool for vectorized environment stepping. See main.py for the training loop and dev.yaml for the default arguments.
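Before the implementation files, it is worth spelling out the rollout sizing that main.py derives from the dev.yaml arguments (both files follow below). A minimal sketch of the arithmetic, assuming the `--num-envs: auto({cpu_per_gpu}, 128)` entry resolves to 128; the actual value is chosen by milabench at run time:

```python
# Rollout sizing as computed in torchatari/main.py, evaluated for the
# dev.yaml defaults. num_envs = 128 is an assumption for illustration.
num_envs = 128
num_steps = 128
num_minibatches = 16
total_timesteps = 1_000_000

batch_size = num_envs * num_steps               # 16384 transitions per rollout
minibatch_size = batch_size // num_minibatches  # 1024 transitions per gradient step
num_iterations = total_timesteps // batch_size  # 61 policy updates in total

print(batch_size, minibatch_size, num_iterations)
```

Each update therefore re-uses 16384 fresh transitions for 4 epochs (`--update-epochs`) of 16 minibatches each.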
diff --git a/benchmarks/torchatari/benchfile.py b/benchmarks/torchatari/benchfile.py new file mode 100644 index 000000000..1bf4ee785 --- /dev/null +++ b/benchmarks/torchatari/benchfile.py @@ -0,0 +1,31 @@ +from milabench.pack import Package + + +class Torchatari(Package): + # Requirements file installed by install(). It can be empty or absent. + base_requirements = "requirements.in" + + # The preparation script called by prepare(). It must be executable, + # but it can be any type of script. It can be empty or absent. + prepare_script = "prepare.py" + + # The main script called by run(). It must be a Python file. It has to + # be present. + main_script = "main.py" + + # You can remove the functions below if you don't need to modify them. + + def make_env(self): + # Return a dict of environment variables for prepare_script and + # main_script. + return super().make_env() + + async def install(self): + await super().install() # super() call installs the requirements + + async def prepare(self): + await super().prepare() # super() call executes prepare_script + + + +__pack__ = Torchatari diff --git a/benchmarks/torchatari/dev.yaml b/benchmarks/torchatari/dev.yaml new file mode 100644 index 000000000..d0df0df1a --- /dev/null +++ b/benchmarks/torchatari/dev.yaml @@ -0,0 +1,17 @@ + +torchatari: + max_duration: 600 + inherits: _defaults + definition: . + install-variant: unpinned + install_group: torch + plan: + method: per_gpu + + argv: + --num-minibatches: 16 + --update-epochs: 4 + --num-steps: 128 + --num-envs: auto({cpu_per_gpu}, 128) + --total-timesteps: 1000000 + --env-id: Breakout-v5 \ No newline at end of file diff --git a/benchmarks/torchatari/main.py b/benchmarks/torchatari/main.py new file mode 100644 index 000000000..62c9b3a07 --- /dev/null +++ b/benchmarks/torchatari/main.py @@ -0,0 +1,349 @@ +# docs and experiment results can be found at https://docs.cleanrl.dev/rl-algorithms/ppo/#ppo_atari_envpoolpy +import os +import random +import time +from collections import deque +from dataclasses import dataclass + +import envpool +import gym +import numpy as np +import torch +import torch.nn as nn +import torch.optim as optim +import tyro +from torch.distributions.categorical import Categorical +from torch.utils.tensorboard import SummaryWriter +import torchcompat.core as acc + +@dataclass +class Args: + exp_name: str = os.path.basename(__file__)[: -len(".py")] + """the name of this experiment""" + seed: int = 1 + """seed of the experiment""" + torch_deterministic: bool = True + """if toggled, `torch.backends.cudnn.deterministic=False`""" + cuda: bool = True + """if toggled, cuda will be enabled by default""" + track: bool = False + """if toggled, this experiment will be tracked with Weights and Biases""" + wandb_project_name: str = "cleanRL" + """the wandb's project name""" + wandb_entity: str = None + """the entity (team) of wandb's project""" + capture_video: bool = False + """whether to capture videos of the agent performances (check out `videos` folder)""" + + # Algorithm specific arguments + env_id: str = "Breakout-v5" + """the id of the environment""" + total_timesteps: int = 10000000 + """total timesteps of the experiments""" + learning_rate: float = 2.5e-4 + """the learning rate of the optimizer""" + num_envs: int = 128 + """the number of parallel game environments""" + num_steps: int = 128 + """the number of steps to run in each environment per policy rollout""" + anneal_lr: bool = True + """Toggle learning rate annealing for policy and value networks""" + gamma: float = 0.99 + 
"""the discount factor gamma""" + gae_lambda: float = 0.95 + """the lambda for the general advantage estimation""" + num_minibatches: int = 16 + """the number of mini-batches""" + update_epochs: int = 4 + """the K epochs to update the policy""" + norm_adv: bool = True + """Toggles advantages normalization""" + clip_coef: float = 0.1 + """the surrogate clipping coefficient""" + clip_vloss: bool = True + """Toggles whether or not to use a clipped loss for the value function, as per the paper.""" + ent_coef: float = 0.01 + """coefficient of the entropy""" + vf_coef: float = 0.5 + """coefficient of the value function""" + max_grad_norm: float = 0.5 + """the maximum norm for the gradient clipping""" + target_kl: float = None + """the target KL divergence threshold""" + + # to be filled in runtime + batch_size: int = 0 + """the batch size (computed in runtime)""" + minibatch_size: int = 0 + """the mini-batch size (computed in runtime)""" + num_iterations: int = 0 + """the number of iterations (computed in runtime)""" + + +class RecordEpisodeStatistics(gym.Wrapper): + def __init__(self, env, deque_size=100): + super().__init__(env) + self.num_envs = getattr(env, "num_envs", 1) + self.episode_returns = None + self.episode_lengths = None + + def reset(self, **kwargs): + observations = super().reset(**kwargs) + self.episode_returns = np.zeros(self.num_envs, dtype=np.float32) + self.episode_lengths = np.zeros(self.num_envs, dtype=np.int32) + self.lives = np.zeros(self.num_envs, dtype=np.int32) + self.returned_episode_returns = np.zeros(self.num_envs, dtype=np.float32) + self.returned_episode_lengths = np.zeros(self.num_envs, dtype=np.int32) + return observations + + def step(self, action): + observations, rewards, dones, infos = super().step(action) + self.episode_returns += infos["reward"] + self.episode_lengths += 1 + self.returned_episode_returns[:] = self.episode_returns + self.returned_episode_lengths[:] = self.episode_lengths + self.episode_returns *= 1 - infos["terminated"] + self.episode_lengths *= 1 - infos["terminated"] + infos["r"] = self.returned_episode_returns + infos["l"] = self.returned_episode_lengths + return ( + observations, + rewards, + dones, + infos, + ) + + +def layer_init(layer, std=np.sqrt(2), bias_const=0.0): + torch.nn.init.orthogonal_(layer.weight, std) + torch.nn.init.constant_(layer.bias, bias_const) + return layer + + +class Agent(nn.Module): + def __init__(self, envs): + super().__init__() + self.network = nn.Sequential( + layer_init(nn.Conv2d(4, 32, 8, stride=4)), + nn.ReLU(), + layer_init(nn.Conv2d(32, 64, 4, stride=2)), + nn.ReLU(), + layer_init(nn.Conv2d(64, 64, 3, stride=1)), + nn.ReLU(), + nn.Flatten(), + layer_init(nn.Linear(64 * 7 * 7, 512)), + nn.ReLU(), + ) + self.actor = layer_init(nn.Linear(512, envs.single_action_space.n), std=0.01) + self.critic = layer_init(nn.Linear(512, 1), std=1) + + def get_value(self, x): + return self.critic(self.network(x / 255.0)) + + def get_action_and_value(self, x, action=None): + hidden = self.network(x / 255.0) + logits = self.actor(hidden) + probs = Categorical(logits=logits) + if action is None: + action = probs.sample() + return action, probs.log_prob(action), probs.entropy(), self.critic(hidden) + + +def main(): + args = tyro.cli(Args) + args.batch_size = int(args.num_envs * args.num_steps) + args.minibatch_size = int(args.batch_size // args.num_minibatches) + args.num_iterations = args.total_timesteps // args.batch_size + run_name = f"{args.env_id}__{args.exp_name}__{args.seed}__{int(time.time())}" + if args.track: + 
import wandb + + wandb.init( + project=args.wandb_project_name, + entity=args.wandb_entity, + sync_tensorboard=True, + config=vars(args), + name=run_name, + monitor_gym=True, + save_code=True, + ) + writer = SummaryWriter(f"runs/{run_name}") + writer.add_text( + "hyperparameters", + "|param|value|\n|-|-|\n%s" % ("\n".join([f"|{key}|{value}|" for key, value in vars(args).items()])), + ) + + # TRY NOT TO MODIFY: seeding + random.seed(args.seed) + np.random.seed(args.seed) + torch.manual_seed(args.seed) + torch.backends.cudnn.deterministic = args.torch_deterministic + + device = acc.fetch_device(0) + + # env setup + envs = envpool.make( + args.env_id, + env_type="gym", + num_envs=args.num_envs, + episodic_life=True, + reward_clip=True, + seed=args.seed, + ) + envs.num_envs = args.num_envs + envs.single_action_space = envs.action_space + envs.single_observation_space = envs.observation_space + envs = RecordEpisodeStatistics(envs) + assert isinstance(envs.action_space, gym.spaces.Discrete), "only discrete action space is supported" + + agent = Agent(envs).to(device) + optimizer = optim.Adam(agent.parameters(), lr=args.learning_rate, eps=1e-5) + + # ALGO Logic: Storage setup + obs = torch.zeros((args.num_steps, args.num_envs) + envs.single_observation_space.shape).to(device) + actions = torch.zeros((args.num_steps, args.num_envs) + envs.single_action_space.shape).to(device) + logprobs = torch.zeros((args.num_steps, args.num_envs)).to(device) + rewards = torch.zeros((args.num_steps, args.num_envs)).to(device) + dones = torch.zeros((args.num_steps, args.num_envs)).to(device) + values = torch.zeros((args.num_steps, args.num_envs)).to(device) + avg_returns = deque(maxlen=20) + + # TRY NOT TO MODIFY: start the game + global_step = 0 + start_time = time.time() + next_obs = torch.Tensor(envs.reset()).to(device) + next_done = torch.zeros(args.num_envs).to(device) + iterations = range(1, args.num_iterations + 1) + + for iteration in iterations: + # Annealing the rate if instructed to do so. + if args.anneal_lr: + frac = 1.0 - (iteration - 1.0) / args.num_iterations + lrnow = frac * args.learning_rate + optimizer.param_groups[0]["lr"] = lrnow + + for step in range(0, args.num_steps): + global_step += args.num_envs + obs[step] = next_obs + dones[step] = next_done + + # ALGO LOGIC: action logic + with torch.no_grad(): + action, logprob, _, value = agent.get_action_and_value(next_obs) + values[step] = value.flatten() + actions[step] = action + logprobs[step] = logprob + + # TRY NOT TO MODIFY: execute the game and log data. 
+ next_obs, reward, next_done, info = envs.step(action.cpu().numpy()) + rewards[step] = torch.tensor(reward).to(device).view(-1) + next_obs, next_done = torch.Tensor(next_obs).to(device), torch.Tensor(next_done).to(device) + + for idx, d in enumerate(next_done): + if d and info["lives"][idx] == 0: + # print(f"global_step={global_step}, episodic_return={info['r'][idx]}") + avg_returns.append(info["r"][idx]) + writer.add_scalar("charts/avg_episodic_return", np.average(avg_returns), global_step) + writer.add_scalar("charts/episodic_return", info["r"][idx], global_step) + writer.add_scalar("charts/episodic_length", info["l"][idx], global_step) + + # bootstrap value if not done + with torch.no_grad(): + next_value = agent.get_value(next_obs).reshape(1, -1) + advantages = torch.zeros_like(rewards).to(device) + lastgaelam = 0 + for t in reversed(range(args.num_steps)): + if t == args.num_steps - 1: + nextnonterminal = 1.0 - next_done + nextvalues = next_value + else: + nextnonterminal = 1.0 - dones[t + 1] + nextvalues = values[t + 1] + delta = rewards[t] + args.gamma * nextvalues * nextnonterminal - values[t] + advantages[t] = lastgaelam = delta + args.gamma * args.gae_lambda * nextnonterminal * lastgaelam + returns = advantages + values + + # flatten the batch + b_obs = obs.reshape((-1,) + envs.single_observation_space.shape) + b_logprobs = logprobs.reshape(-1) + b_actions = actions.reshape((-1,) + envs.single_action_space.shape) + b_advantages = advantages.reshape(-1) + b_returns = returns.reshape(-1) + b_values = values.reshape(-1) + + # Optimizing the policy and value network + b_inds = np.arange(args.batch_size) + clipfracs = [] + for epoch in range(args.update_epochs): + np.random.shuffle(b_inds) + for start in range(0, args.batch_size, args.minibatch_size): + end = start + args.minibatch_size + mb_inds = b_inds[start:end] + + _, newlogprob, entropy, newvalue = agent.get_action_and_value(b_obs[mb_inds], b_actions.long()[mb_inds]) + logratio = newlogprob - b_logprobs[mb_inds] + ratio = logratio.exp() + + with torch.no_grad(): + # calculate approx_kl http://joschu.net/blog/kl-approx.html + old_approx_kl = (-logratio).mean() + approx_kl = ((ratio - 1) - logratio).mean() + clipfracs += [((ratio - 1.0).abs() > args.clip_coef).float().mean().item()] + + mb_advantages = b_advantages[mb_inds] + if args.norm_adv: + mb_advantages = (mb_advantages - mb_advantages.mean()) / (mb_advantages.std() + 1e-8) + + # Policy loss + pg_loss1 = -mb_advantages * ratio + pg_loss2 = -mb_advantages * torch.clamp(ratio, 1 - args.clip_coef, 1 + args.clip_coef) + pg_loss = torch.max(pg_loss1, pg_loss2).mean() + + # Value loss + newvalue = newvalue.view(-1) + if args.clip_vloss: + v_loss_unclipped = (newvalue - b_returns[mb_inds]) ** 2 + v_clipped = b_values[mb_inds] + torch.clamp( + newvalue - b_values[mb_inds], + -args.clip_coef, + args.clip_coef, + ) + v_loss_clipped = (v_clipped - b_returns[mb_inds]) ** 2 + v_loss_max = torch.max(v_loss_unclipped, v_loss_clipped) + v_loss = 0.5 * v_loss_max.mean() + else: + v_loss = 0.5 * ((newvalue - b_returns[mb_inds]) ** 2).mean() + + entropy_loss = entropy.mean() + loss = pg_loss - args.ent_coef * entropy_loss + v_loss * args.vf_coef + + optimizer.zero_grad() + loss.backward() + nn.utils.clip_grad_norm_(agent.parameters(), args.max_grad_norm) + optimizer.step() + + if args.target_kl is not None and approx_kl > args.target_kl: + break + + y_pred, y_true = b_values.cpu().numpy(), b_returns.cpu().numpy() + var_y = np.var(y_true) + explained_var = np.nan if var_y == 0 else 1 - 
np.var(y_true - y_pred) / var_y + + # TRY NOT TO MODIFY: record rewards for plotting purposes + writer.add_scalar("charts/learning_rate", optimizer.param_groups[0]["lr"], global_step) + writer.add_scalar("losses/value_loss", v_loss.item(), global_step) + writer.add_scalar("losses/policy_loss", pg_loss.item(), global_step) + writer.add_scalar("losses/entropy", entropy_loss.item(), global_step) + writer.add_scalar("losses/old_approx_kl", old_approx_kl.item(), global_step) + writer.add_scalar("losses/approx_kl", approx_kl.item(), global_step) + writer.add_scalar("losses/clipfrac", np.mean(clipfracs), global_step) + writer.add_scalar("losses/explained_variance", explained_var, global_step) + print("SPS:", int(global_step / (time.time() - start_time))) + writer.add_scalar("charts/SPS", int(global_step / (time.time() - start_time)), global_step) + + envs.close() + writer.close() + + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/benchmarks/torchatari/prepare.py b/benchmarks/torchatari/prepare.py new file mode 100755 index 000000000..32bd5901d --- /dev/null +++ b/benchmarks/torchatari/prepare.py @@ -0,0 +1,16 @@ +#!/usr/bin/env python + +import os + +if __name__ == "__main__": + # If you need the whole configuration: + # config = json.loads(os.environ["MILABENCH_CONFIG"]) + + data_directory = os.environ["MILABENCH_DIR_DATA"] + + # Download (or generate) the needed dataset(s). You are responsible + # for checking whether it has already been properly downloaded, and + # for doing nothing if it has been. + print("Hello I am doing some data stuff!") + + # If there is nothing to download or generate, just delete this file. diff --git a/benchmarks/torchatari/requirements.cuda.txt b/benchmarks/torchatari/requirements.cuda.txt new file mode 100644 index 000000000..8f15c6635 --- /dev/null +++ b/benchmarks/torchatari/requirements.cuda.txt @@ -0,0 +1,308 @@ +# +# This file is autogenerated by pip-compile with Python 3.10 +# by the following command: +# +# pip-compile --output-file=benchmarks/torchatari/requirements.cuda.txt .pin/tmp-constraints-cuda-torchatari.txt benchmarks/torchatari/requirements.in +# +--extra-index-url https://pypi.ngc.nvidia.com +--extra-index-url https://download.pytorch.org/whl/cu121 +--find-links https://storage.googleapis.com/jax-releases/jax_cuda_releases.html +--trusted-host pypi.ngc.nvidia.com + +absl-py==2.1.0 + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # dm-env + # tensorboard +antlr4-python3-runtime==4.9.3 + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # omegaconf +appdirs==1.4.4 + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # cantilever +asttokens==2.4.1 + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # giving +cantilever==0.1.0 + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # -r benchmarks/torchatari/requirements.in +cloudpickle==3.0.0 + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # gym + # gymnasium +codefind==0.1.6 + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # ptera +dm-env==1.6 + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # envpool +dm-tree==0.1.8 + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # dm-env +docstring-parser==0.16 + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # tyro +envpool==0.8.4 + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # -r benchmarks/torchatari/requirements.in +executing==1.2.0 + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # varname +farama-notifications==0.0.4 + # via + # -c 
.pin/../.pin/constraints-cuda-torch.txt + # gymnasium +filelock==3.15.4 + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # torch + # triton +fsspec==2024.6.1 + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # torch +giving==0.4.2 + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # ptera + # voir +grpcio==1.66.0 + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # tensorboard +gym==0.23.1 + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # -r benchmarks/torchatari/requirements.in + # envpool +gym-notices==0.0.8 + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # gym +gymnasium==0.29.1 + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # envpool +importlib-resources==6.4.4 + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # cantilever + # torchcompat +jinja2==3.1.4 + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # torch +markdown==3.7 + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # tensorboard +markdown-it-py==3.0.0 + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # rich +markupsafe==2.1.5 + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # jinja2 + # werkzeug +mdurl==0.1.2 + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # markdown-it-py +mpmath==1.3.0 + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # sympy +networkx==3.3 + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # torch +numpy==1.26.4 + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # -r benchmarks/torchatari/requirements.in + # dm-env + # envpool + # gym + # gymnasium + # tensorboard +nvidia-cublas-cu12==12.1.3.1 + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # nvidia-cudnn-cu12 + # nvidia-cusolver-cu12 + # torch +nvidia-cuda-cupti-cu12==12.1.105 + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # torch +nvidia-cuda-nvrtc-cu12==12.1.105 + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # torch +nvidia-cuda-runtime-cu12==12.1.105 + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # torch +nvidia-cudnn-cu12==9.1.0.70 + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # torch +nvidia-cufft-cu12==11.0.2.54 + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # torch +nvidia-curand-cu12==10.3.2.106 + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # torch +nvidia-cusolver-cu12==11.4.5.107 + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # torch +nvidia-cusparse-cu12==12.1.0.106 + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # nvidia-cusolver-cu12 + # torch +nvidia-ml-py==12.560.30 + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # voir +nvidia-nccl-cu12==2.20.5 + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # torch +nvidia-nvjitlink-cu12==12.6.20 + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # nvidia-cusolver-cu12 + # nvidia-cusparse-cu12 +nvidia-nvtx-cu12==12.1.105 + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # torch +omegaconf==2.3.0 + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # voir +optree==0.12.1 + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # envpool +ovld==0.3.9 + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # voir +packaging==24.1 + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # envpool + # tensorboard +protobuf==5.27.3 + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # tensorboard +psutil==5.9.8 + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # voir +ptera==1.4.1 + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # voir 
+pygments==2.18.0 + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # rich +pyyaml==6.0.2 + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # omegaconf +reactivex==4.0.4 + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # giving +rich==13.8.0 + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # tyro + # voir +shtab==1.7.1 + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # tyro +six==1.16.0 + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # asttokens + # tensorboard +sympy==1.13.2 + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # torch +tensorboard==2.17.1 + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # -r benchmarks/torchatari/requirements.in +tensorboard-data-server==0.7.2 + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # tensorboard +torch==2.4.0+cu121 + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # -r benchmarks/torchatari/requirements.in +torchcompat==1.1.4 + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # -c .pin/../constraints/cuda.txt + # -r benchmarks/torchatari/requirements.in +triton==3.0.0 + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # torch +types-protobuf==5.27.0.20240626 + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # envpool +typing-extensions==4.12.2 + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # envpool + # gymnasium + # optree + # reactivex + # torch + # tyro +tyro==0.8.10 + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # -r benchmarks/torchatari/requirements.in +varname==0.10.0 + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # giving +voir==0.2.19 + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # -c .pin/../constraints/cuda.txt + # -r benchmarks/torchatari/requirements.in +werkzeug==3.0.4 + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # tensorboard + +# The following packages are considered to be unsafe in a requirements file: +# setuptools diff --git a/benchmarks/torchatari/requirements.in b/benchmarks/torchatari/requirements.in new file mode 100644 index 000000000..c264f5563 --- /dev/null +++ b/benchmarks/torchatari/requirements.in @@ -0,0 +1,9 @@ +envpool +gym==0.23.1 +numpy +torch +tyro +voir +tensorboard +torchcompat +cantilever diff --git a/benchmarks/torchatari/requirements.rocm.txt b/benchmarks/torchatari/requirements.rocm.txt new file mode 100644 index 000000000..71fd92e51 --- /dev/null +++ b/benchmarks/torchatari/requirements.rocm.txt @@ -0,0 +1,253 @@ +# +# This file is autogenerated by pip-compile with Python 3.10 +# by the following command: +# +# pip-compile --output-file=benchmarks/torchatari/requirements.rocm.txt .pin/tmp-constraints-rocm-torchatari.txt benchmarks/torchatari/requirements.in +# +--extra-index-url https://download.pytorch.org/whl/rocm6.0 + +absl-py==2.1.0 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # dm-env + # tensorboard +antlr4-python3-runtime==4.9.3 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # omegaconf +appdirs==1.4.4 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # cantilever +asttokens==2.4.1 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # giving +cantilever==0.1.0 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # -r benchmarks/torchatari/requirements.in +cloudpickle==3.0.0 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # gym + # gymnasium +codefind==0.1.6 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # ptera +dm-env==1.6 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # envpool 
+dm-tree==0.1.8 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # dm-env +docstring-parser==0.16 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # tyro +envpool==0.8.4 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # -r benchmarks/torchatari/requirements.in +executing==1.2.0 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # varname +farama-notifications==0.0.4 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # gymnasium +filelock==3.15.4 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # pytorch-triton-rocm + # torch +fsspec==2024.6.1 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # torch +giving==0.4.2 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # ptera + # voir +grpcio==1.65.5 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # tensorboard +gym==0.23.1 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # -r benchmarks/torchatari/requirements.in + # envpool +gym-notices==0.0.8 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # gym +gymnasium==0.29.1 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # envpool +importlib-resources==6.4.3 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # cantilever + # torchcompat +jinja2==3.1.4 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # torch +markdown==3.7 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # tensorboard +markdown-it-py==3.0.0 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # rich +markupsafe==2.1.5 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # jinja2 + # werkzeug +mdurl==0.1.2 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # markdown-it-py +mpmath==1.3.0 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # sympy +networkx==3.3 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # torch +numpy==1.26.4 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # -r benchmarks/torchatari/requirements.in + # dm-env + # envpool + # gym + # gymnasium + # tensorboard +omegaconf==2.3.0 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # voir +optree==0.12.1 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # envpool +ovld==0.3.8 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # voir +packaging==24.1 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # envpool + # tensorboard +protobuf==5.27.3 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # tensorboard +psutil==5.9.8 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # voir +ptera==1.4.1 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # voir +pygments==2.18.0 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # rich +pynvml==11.5.3 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # voir +pytorch-triton-rocm==3.0.0 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # torch +pyyaml==6.0.2 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # omegaconf +reactivex==4.0.4 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # giving +rich==13.7.1 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # tyro + # voir +shtab==1.7.1 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # tyro +six==1.16.0 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # asttokens + # tensorboard +sympy==1.13.2 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # torch +tensorboard==2.17.1 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # -r benchmarks/torchatari/requirements.in +tensorboard-data-server==0.7.2 + # via + # -c 
.pin/../.pin/constraints-rocm-torch.txt + # tensorboard +torch==2.4.0+rocm6.0 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # -r benchmarks/torchatari/requirements.in +torchcompat==1.1.4 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # -c .pin/../constraints/rocm.txt + # -r benchmarks/torchatari/requirements.in +types-protobuf==5.27.0.20240626 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # envpool +typing-extensions==4.12.2 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # envpool + # gymnasium + # optree + # reactivex + # torch + # tyro +tyro==0.8.8 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # -r benchmarks/torchatari/requirements.in +varname==0.10.0 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # giving +voir==0.2.17 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # -c .pin/../constraints/rocm.txt + # -r benchmarks/torchatari/requirements.in +werkzeug==3.0.3 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # tensorboard + +# The following packages are considered to be unsafe in a requirements file: +# setuptools diff --git a/benchmarks/torchatari/voirfile.py b/benchmarks/torchatari/voirfile.py new file mode 100644 index 000000000..7b8873852 --- /dev/null +++ b/benchmarks/torchatari/voirfile.py @@ -0,0 +1,87 @@ +from dataclasses import dataclass + +from voir import configurable +from voir.phase import StopProgram +from benchmate.observer import BenchObserver +from benchmate.monitor import voirfile_monitor + + +@dataclass +class Config: + """voir configuration""" + + # Whether to display the dash or not + dash: bool = False + + # How often to log the rates + interval: str = "1s" + + # Number of rates to skip before logging + skip: int = 5 + + # Number of rates to log before stopping + stop: int = 20 + + # Number of seconds between each gpu poll + gpu_poll: int = 3 + + +@configurable +def instrument_main(ov, options: Config): + yield ov.phases.init + + # GPU monitor, rate, loss etc... 
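+ # voirfile_monitor starts voir's standard monitoring (GPU polling, rates). + # The ptera probes below (e.g. "//main > args") intercept variables inside + # the benchmark's main() so it can be instrumented without modifying it.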
+ voirfile_monitor(ov, options) + + yield ov.phases.load_script + + step_per_iteration = 0 + + def fetch_args(args): + nonlocal step_per_iteration + step_per_iteration = args.num_envs * args.num_steps + return args + + def batch_size(x): + return step_per_iteration + + observer = BenchObserver( + earlystop=options.stop + options.skip, + batch_size_fn=batch_size, + ) + + probe = ov.probe("//main > args", overridable=True) + probe['args'].override(fetch_args) + + # measure the time it took to execute the body + probe = ov.probe("//main > iterations", overridable=True) + probe['iterations'].override(observer.loader) + + # Too many losses + # probe = ov.probe("//main > loss", overridable=True) + # probe["loss"].override(observer.record_loss) + + def record_starts(writer): + old_add_scalar = writer.add_scalar + + def add_scalar(name, *values): + if name == "losses/value_loss": + observer.record_loss(values[0]) + old_add_scalar(name, *values) + + writer.add_scalar = add_scalar + return writer + + probe = ov.probe("//main > writer", overridable=True) + probe["writer"].override(record_starts) + + probe = ov.probe("//main > optimizer", overridable=True) + probe['optimizer'].override(observer.optimizer) + + # + # Run the benchmark + # + try: + yield ov.phases.run_script + except StopProgram: + print("early stopped") \ No newline at end of file diff --git a/benchmarks/torchvision/requirements.cuda.txt b/benchmarks/torchvision/requirements.cuda.txt index 0b1d03ae1..a24805963 100644 --- a/benchmarks/torchvision/requirements.cuda.txt +++ b/benchmarks/torchvision/requirements.cuda.txt @@ -30,7 +30,7 @@ filelock==3.15.4 # -c .pin/../.pin/constraints-cuda-torch.txt # torch # triton -fsspec==2024.5.0 +fsspec==2024.6.1 # via # -c .pin/../.pin/constraints-cuda-torch.txt # torch @@ -39,7 +39,7 @@ giving==0.4.2 # -c .pin/../.pin/constraints-cuda-torch.txt # ptera # voir -importlib-resources==6.4.0 +importlib-resources==6.4.4 # via # -c .pin/../.pin/constraints-cuda-torch.txt # torchcompat @@ -110,6 +110,10 @@ nvidia-cusparse-cu12==12.1.0.106 # -c .pin/../.pin/constraints-cuda-torch.txt # nvidia-cusolver-cu12 # torch +nvidia-ml-py==12.560.30 + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # voir nvidia-nccl-cu12==2.20.5 # via # -c .pin/../.pin/constraints-cuda-torch.txt @@ -127,7 +131,7 @@ omegaconf==2.3.0 # via # -c .pin/../.pin/constraints-cuda-torch.txt # voir -ovld==0.3.6 +ovld==0.3.9 # via # -c .pin/../.pin/constraints-cuda-torch.txt # voir @@ -147,11 +151,7 @@ pygments==2.18.0 # via # -c .pin/../.pin/constraints-cuda-torch.txt # rich -pynvml==11.5.3 - # via - # -c .pin/../.pin/constraints-cuda-torch.txt - # voir -pyyaml==6.0.1 +pyyaml==6.0.2 # via # -c .pin/../.pin/constraints-cuda-torch.txt # omegaconf @@ -159,7 +159,7 @@ reactivex==4.0.4 # via # -c .pin/../.pin/constraints-cuda-torch.txt # giving -rich==13.7.1 +rich==13.8.0 # via # -c .pin/../.pin/constraints-cuda-torch.txt # voir @@ -167,7 +167,7 @@ six==1.16.0 # via # -c .pin/../.pin/constraints-cuda-torch.txt # asttokens -sympy==1.13.1 +sympy==1.13.2 # via # -c .pin/../.pin/constraints-cuda-torch.txt # torch @@ -185,7 +185,7 @@ torchvision==0.19.0+cu121 # via # -c .pin/../.pin/constraints-cuda-torch.txt # -r benchmarks/torchvision/requirements.in -tqdm==4.66.4 +tqdm==4.66.5 # via # -c .pin/../.pin/constraints-cuda-torch.txt # -r benchmarks/torchvision/requirements.in diff --git a/benchmarks/torchvision/requirements.rocm.txt b/benchmarks/torchvision/requirements.rocm.txt index 2f0b78222..094eb29b6 100644 --- 
a/benchmarks/torchvision/requirements.rocm.txt +++ b/benchmarks/torchvision/requirements.rocm.txt @@ -4,10 +4,7 @@ # # pip-compile --output-file=benchmarks/torchvision/requirements.rocm.txt .pin/tmp-constraints-rocm-torchvision.txt benchmarks/torchvision/requirements.in # ---extra-index-url https://pypi.ngc.nvidia.com --extra-index-url https://download.pytorch.org/whl/rocm6.0 ---find-links https://storage.googleapis.com/jax-releases/jax_cuda_releases.html ---trusted-host pypi.ngc.nvidia.com antlr4-python3-runtime==4.9.3 # via @@ -30,7 +27,7 @@ filelock==3.15.4 # -c .pin/../.pin/constraints-rocm-torch.txt # pytorch-triton-rocm # torch -fsspec==2024.5.0 +fsspec==2024.6.1 # via # -c .pin/../.pin/constraints-rocm-torch.txt # torch @@ -39,7 +36,7 @@ giving==0.4.2 # -c .pin/../.pin/constraints-rocm-torch.txt # ptera # voir -importlib-resources==6.4.0 +importlib-resources==6.4.3 # via # -c .pin/../.pin/constraints-rocm-torch.txt # torchcompat @@ -75,7 +72,7 @@ omegaconf==2.3.0 # via # -c .pin/../.pin/constraints-rocm-torch.txt # voir -ovld==0.3.5 +ovld==0.3.8 # via # -c .pin/../.pin/constraints-rocm-torch.txt # voir @@ -99,11 +96,11 @@ pynvml==11.5.3 # via # -c .pin/../.pin/constraints-rocm-torch.txt # voir -pytorch-triton-rocm==2.3.1 +pytorch-triton-rocm==3.0.0 # via # -c .pin/../.pin/constraints-rocm-torch.txt # torch -pyyaml==6.0.1 +pyyaml==6.0.2 # via # -c .pin/../.pin/constraints-rocm-torch.txt # omegaconf @@ -119,11 +116,11 @@ six==1.16.0 # via # -c .pin/../.pin/constraints-rocm-torch.txt # asttokens -sympy==1.13.0 +sympy==1.13.2 # via # -c .pin/../.pin/constraints-rocm-torch.txt # torch -torch==2.3.1+rocm6.0 +torch==2.4.0+rocm6.0 # via # -c .pin/../.pin/constraints-rocm-torch.txt # -r benchmarks/torchvision/requirements.in @@ -133,11 +130,11 @@ torchcompat==1.1.4 # -c .pin/../.pin/constraints-rocm-torch.txt # -c .pin/../constraints/rocm.txt # -r benchmarks/torchvision/requirements.in -torchvision==0.18.1+rocm6.0 +torchvision==0.19.0+rocm6.0 # via # -c .pin/../.pin/constraints-rocm-torch.txt # -r benchmarks/torchvision/requirements.in -tqdm==4.66.4 +tqdm==4.66.5 # via # -c .pin/../.pin/constraints-rocm-torch.txt # -r benchmarks/torchvision/requirements.in diff --git a/benchmarks/torchvision/requirements.xpu.txt b/benchmarks/torchvision/requirements.xpu.txt index 6503a0c9e..3cd876972 100644 --- a/benchmarks/torchvision/requirements.xpu.txt +++ b/benchmarks/torchvision/requirements.xpu.txt @@ -4,10 +4,9 @@ # # pip-compile --output-file=benchmarks/torchvision/requirements.xpu.txt .pin/tmp-constraints-xpu-torchvision.txt benchmarks/torchvision/requirements.in # ---extra-index-url https://pypi.ngc.nvidia.com +--extra-index-url https://download.pytorch.org/whl/cpu --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/us/ --find-links https://storage.googleapis.com/jax-releases/jax_cuda_releases.html ---trusted-host pypi.ngc.nvidia.com antlr4-python3-runtime==4.9.3 # via @@ -17,14 +16,6 @@ asttokens==2.4.1 # via # -c .pin/../.pin/constraints-xpu-torch.txt # giving -certifi==2024.6.2 - # via - # -c .pin/../.pin/constraints-xpu-torch.txt - # requests -charset-normalizer==3.3.2 - # via - # -c .pin/../.pin/constraints-xpu-torch.txt - # requests codefind==0.1.6 # via # -c .pin/../.pin/constraints-xpu-torch.txt @@ -46,10 +37,6 @@ giving==0.4.2 # -c .pin/../.pin/constraints-xpu-torch.txt # ptera # voir -idna==3.7 - # via - # -c .pin/../.pin/constraints-xpu-torch.txt - # requests importlib-resources==6.4.0 # via # -c .pin/../.pin/constraints-xpu-torch.txt @@ -86,7 +73,7 @@ 
omegaconf==2.3.0 # via # -c .pin/../.pin/constraints-xpu-torch.txt # voir -ovld==0.3.5 +ovld==0.3.8 # via # -c .pin/../.pin/constraints-xpu-torch.txt # voir @@ -110,7 +97,7 @@ pynvml==11.5.3 # via # -c .pin/../.pin/constraints-xpu-torch.txt # voir -pyyaml==6.0.1 +pyyaml==6.0.2 # via # -c .pin/../.pin/constraints-xpu-torch.txt # omegaconf @@ -118,10 +105,6 @@ reactivex==4.0.4 # via # -c .pin/../.pin/constraints-xpu-torch.txt # giving -requests==2.32.3 - # via - # -c .pin/../.pin/constraints-xpu-torch.txt - # torchvision rich==13.7.1 # via # -c .pin/../.pin/constraints-xpu-torch.txt @@ -130,11 +113,11 @@ six==1.16.0 # via # -c .pin/../.pin/constraints-xpu-torch.txt # asttokens -sympy==1.13.0 +sympy==1.13.1 # via # -c .pin/../.pin/constraints-xpu-torch.txt # torch -torch==2.1.0.post2+cxx11.abi +torch==2.4.0+cpu # via # -c .pin/../.pin/constraints-xpu-torch.txt # -c .pin/../constraints/xpu.txt @@ -145,12 +128,12 @@ torchcompat==1.1.4 # -c .pin/../.pin/constraints-xpu-torch.txt # -c .pin/../constraints/xpu.txt # -r benchmarks/torchvision/requirements.in -torchvision==0.16.0.post2+cxx11.abi +torchvision==0.19.0+cpu # via # -c .pin/../.pin/constraints-xpu-torch.txt # -c .pin/../constraints/xpu.txt # -r benchmarks/torchvision/requirements.in -tqdm==4.66.4 +tqdm==4.66.5 # via # -c .pin/../.pin/constraints-xpu-torch.txt # -r benchmarks/torchvision/requirements.in @@ -159,10 +142,6 @@ typing-extensions==4.12.2 # -c .pin/../.pin/constraints-xpu-torch.txt # reactivex # torch -urllib3==1.26.19 - # via - # -c .pin/../.pin/constraints-xpu-torch.txt - # requests varname==0.10.0 # via # -c .pin/../.pin/constraints-xpu-torch.txt diff --git a/benchmarks/torchvision_ddp/requirements.cuda.txt b/benchmarks/torchvision_ddp/requirements.cuda.txt index 856b6c852..7c1971e1f 100644 --- a/benchmarks/torchvision_ddp/requirements.cuda.txt +++ b/benchmarks/torchvision_ddp/requirements.cuda.txt @@ -30,7 +30,7 @@ filelock==3.15.4 # -c .pin/../.pin/constraints-cuda-torch.txt # torch # triton -fsspec==2024.5.0 +fsspec==2024.6.1 # via # -c .pin/../.pin/constraints-cuda-torch.txt # torch @@ -39,7 +39,7 @@ giving==0.4.2 # -c .pin/../.pin/constraints-cuda-torch.txt # ptera # voir -importlib-resources==6.4.0 +importlib-resources==6.4.4 # via # -c .pin/../.pin/constraints-cuda-torch.txt # torchcompat @@ -110,6 +110,10 @@ nvidia-cusparse-cu12==12.1.0.106 # -c .pin/../.pin/constraints-cuda-torch.txt # nvidia-cusolver-cu12 # torch +nvidia-ml-py==12.560.30 + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # voir nvidia-nccl-cu12==2.20.5 # via # -c .pin/../.pin/constraints-cuda-torch.txt @@ -127,7 +131,7 @@ omegaconf==2.3.0 # via # -c .pin/../.pin/constraints-cuda-torch.txt # voir -ovld==0.3.6 +ovld==0.3.9 # via # -c .pin/../.pin/constraints-cuda-torch.txt # voir @@ -147,11 +151,7 @@ pygments==2.18.0 # via # -c .pin/../.pin/constraints-cuda-torch.txt # rich -pynvml==11.5.3 - # via - # -c .pin/../.pin/constraints-cuda-torch.txt - # voir -pyyaml==6.0.1 +pyyaml==6.0.2 # via # -c .pin/../.pin/constraints-cuda-torch.txt # omegaconf @@ -159,7 +159,7 @@ reactivex==4.0.4 # via # -c .pin/../.pin/constraints-cuda-torch.txt # giving -rich==13.7.1 +rich==13.8.0 # via # -c .pin/../.pin/constraints-cuda-torch.txt # voir @@ -167,7 +167,7 @@ six==1.16.0 # via # -c .pin/../.pin/constraints-cuda-torch.txt # asttokens -sympy==1.13.1 +sympy==1.13.2 # via # -c .pin/../.pin/constraints-cuda-torch.txt # torch @@ -185,7 +185,7 @@ torchvision==0.19.0+cu121 # via # -c .pin/../.pin/constraints-cuda-torch.txt # -r benchmarks/torchvision_ddp/requirements.in 
-tqdm==4.66.4 +tqdm==4.66.5 # via # -c .pin/../.pin/constraints-cuda-torch.txt # -r benchmarks/torchvision_ddp/requirements.in diff --git a/benchmarks/torchvision_ddp/requirements.rocm.txt b/benchmarks/torchvision_ddp/requirements.rocm.txt index 105c4a545..d1241db8b 100644 --- a/benchmarks/torchvision_ddp/requirements.rocm.txt +++ b/benchmarks/torchvision_ddp/requirements.rocm.txt @@ -4,10 +4,7 @@ # # pip-compile --output-file=benchmarks/torchvision_ddp/requirements.rocm.txt .pin/tmp-constraints-rocm-torchvision.txt benchmarks/torchvision_ddp/requirements.in # ---extra-index-url https://pypi.ngc.nvidia.com --extra-index-url https://download.pytorch.org/whl/rocm6.0 ---find-links https://storage.googleapis.com/jax-releases/jax_cuda_releases.html ---trusted-host pypi.ngc.nvidia.com antlr4-python3-runtime==4.9.3 # via @@ -30,7 +27,7 @@ filelock==3.15.4 # -c .pin/../.pin/constraints-rocm-torch.txt # pytorch-triton-rocm # torch -fsspec==2024.5.0 +fsspec==2024.6.1 # via # -c .pin/../.pin/constraints-rocm-torch.txt # torch @@ -39,7 +36,7 @@ giving==0.4.2 # -c .pin/../.pin/constraints-rocm-torch.txt # ptera # voir -importlib-resources==6.4.0 +importlib-resources==6.4.3 # via # -c .pin/../.pin/constraints-rocm-torch.txt # torchcompat @@ -75,7 +72,7 @@ omegaconf==2.3.0 # via # -c .pin/../.pin/constraints-rocm-torch.txt # voir -ovld==0.3.5 +ovld==0.3.8 # via # -c .pin/../.pin/constraints-rocm-torch.txt # voir @@ -99,11 +96,11 @@ pynvml==11.5.3 # via # -c .pin/../.pin/constraints-rocm-torch.txt # voir -pytorch-triton-rocm==2.3.1 +pytorch-triton-rocm==3.0.0 # via # -c .pin/../.pin/constraints-rocm-torch.txt # torch -pyyaml==6.0.1 +pyyaml==6.0.2 # via # -c .pin/../.pin/constraints-rocm-torch.txt # omegaconf @@ -119,11 +116,11 @@ six==1.16.0 # via # -c .pin/../.pin/constraints-rocm-torch.txt # asttokens -sympy==1.13.0 +sympy==1.13.2 # via # -c .pin/../.pin/constraints-rocm-torch.txt # torch -torch==2.3.1+rocm6.0 +torch==2.4.0+rocm6.0 # via # -c .pin/../.pin/constraints-rocm-torch.txt # -r benchmarks/torchvision_ddp/requirements.in @@ -133,11 +130,11 @@ torchcompat==1.1.4 # -c .pin/../.pin/constraints-rocm-torch.txt # -c .pin/../constraints/rocm.txt # -r benchmarks/torchvision_ddp/requirements.in -torchvision==0.18.1+rocm6.0 +torchvision==0.19.0+rocm6.0 # via # -c .pin/../.pin/constraints-rocm-torch.txt # -r benchmarks/torchvision_ddp/requirements.in -tqdm==4.66.4 +tqdm==4.66.5 # via # -c .pin/../.pin/constraints-rocm-torch.txt # -r benchmarks/torchvision_ddp/requirements.in diff --git a/benchmarks/torchvision_ddp/requirements.xpu.txt b/benchmarks/torchvision_ddp/requirements.xpu.txt index a4a3f6220..b3f732e86 100644 --- a/benchmarks/torchvision_ddp/requirements.xpu.txt +++ b/benchmarks/torchvision_ddp/requirements.xpu.txt @@ -4,10 +4,8 @@ # # pip-compile --output-file=benchmarks/torchvision_ddp/requirements.xpu.txt .pin/tmp-constraints-xpu-torchvision.txt benchmarks/torchvision_ddp/requirements.in # ---extra-index-url https://pypi.ngc.nvidia.com +--extra-index-url https://download.pytorch.org/whl/cpu --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/us/ ---find-links https://storage.googleapis.com/jax-releases/jax_cuda_releases.html ---trusted-host pypi.ngc.nvidia.com antlr4-python3-runtime==4.9.3 # via @@ -17,14 +15,6 @@ asttokens==2.4.1 # via # -c .pin/../.pin/constraints-xpu-torch.txt # giving -certifi==2024.6.2 - # via - # -c .pin/../.pin/constraints-xpu-torch.txt - # requests -charset-normalizer==3.3.2 - # via - # -c .pin/../.pin/constraints-xpu-torch.txt - # requests 
codefind==0.1.6 # via # -c .pin/../.pin/constraints-xpu-torch.txt @@ -46,10 +36,6 @@ giving==0.4.2 # -c .pin/../.pin/constraints-xpu-torch.txt # ptera # voir -idna==3.7 - # via - # -c .pin/../.pin/constraints-xpu-torch.txt - # requests importlib-resources==6.4.0 # via # -c .pin/../.pin/constraints-xpu-torch.txt @@ -86,7 +72,7 @@ omegaconf==2.3.0 # via # -c .pin/../.pin/constraints-xpu-torch.txt # voir -ovld==0.3.5 +ovld==0.3.8 # via # -c .pin/../.pin/constraints-xpu-torch.txt # voir @@ -110,7 +96,7 @@ pynvml==11.5.3 # via # -c .pin/../.pin/constraints-xpu-torch.txt # voir -pyyaml==6.0.1 +pyyaml==6.0.2 # via # -c .pin/../.pin/constraints-xpu-torch.txt # omegaconf @@ -118,10 +104,6 @@ reactivex==4.0.4 # via # -c .pin/../.pin/constraints-xpu-torch.txt # giving -requests==2.32.3 - # via - # -c .pin/../.pin/constraints-xpu-torch.txt - # torchvision rich==13.7.1 # via # -c .pin/../.pin/constraints-xpu-torch.txt @@ -130,11 +112,11 @@ six==1.16.0 # via # -c .pin/../.pin/constraints-xpu-torch.txt # asttokens -sympy==1.13.0 +sympy==1.13.1 # via # -c .pin/../.pin/constraints-xpu-torch.txt # torch -torch==2.1.0.post2+cxx11.abi +torch==2.4.0+cpu # via # -c .pin/../.pin/constraints-xpu-torch.txt # -c .pin/../constraints/xpu.txt @@ -145,12 +127,12 @@ torchcompat==1.1.4 # -c .pin/../.pin/constraints-xpu-torch.txt # -c .pin/../constraints/xpu.txt # -r benchmarks/torchvision_ddp/requirements.in -torchvision==0.16.0.post2+cxx11.abi +torchvision==0.19.0+cpu # via # -c .pin/../.pin/constraints-xpu-torch.txt # -c .pin/../constraints/xpu.txt # -r benchmarks/torchvision_ddp/requirements.in -tqdm==4.66.4 +tqdm==4.66.5 # via # -c .pin/../.pin/constraints-xpu-torch.txt # -r benchmarks/torchvision_ddp/requirements.in @@ -159,10 +141,6 @@ typing-extensions==4.12.2 # -c .pin/../.pin/constraints-xpu-torch.txt # reactivex # torch -urllib3==1.26.19 - # via - # -c .pin/../.pin/constraints-xpu-torch.txt - # requests varname==0.10.0 # via # -c .pin/../.pin/constraints-xpu-torch.txt diff --git a/benchmate/benchmate/observer.py b/benchmate/benchmate/observer.py index 9676e8261..0a40ae11e 100644 --- a/benchmate/benchmate/observer.py +++ b/benchmate/benchmate/observer.py @@ -75,10 +75,13 @@ def iterate(self, iterator, custom_step=False): def step(self): self.instance.step() + def original_dataloader(self): + return self.instance + def loader(self, loader, custom_step=False): """Wrap a dataloader or an iterable which enable accurate measuring of time spent in the loop's body""" - if self.instance: - return self.instance + if self.instance is not None: + return self.instance.loader cls = TimedIterator if custom_step: diff --git a/config/base.yaml b/config/base.yaml index 1d10341bb..3d02f33e6 100644 --- a/config/base.yaml +++ b/config/base.yaml @@ -106,7 +106,7 @@ _timm: argv: --amp: true --amp-dtype: bfloat16 - --device: '{arch}' + --device: '{device_name}' --val-split: '' --data-dir: "{milabench_data}" --dataset: "FakeImageNet" @@ -402,6 +402,7 @@ _diffusion: --num_epochs: 5 --batch_size: 32 --num_workers: "auto({n_worker}, 8)" + --cache: "{milabench_cache}" diffusion-single: inherits: _diffusion @@ -414,6 +415,8 @@ diffusion-gpus: num_machines: 1 diffusion-nodes: + tags: + - multinode inherits: _diffusion num_machines: 2 requires_capabilities: @@ -429,7 +432,7 @@ _lightning: --loader: pytorch --data: "{milabench_data}/FakeImageNet" --model: resnet152 - --batch-size: 16 + --batch-size: 256 lightning: inherits: _lightning @@ -463,7 +466,7 @@ dinov2-giant-single: method: per_gpu argv: - --config-file: 
src/dinov2/configs/train/vitg14.yaml + --config-file: "{benchmark_folder}/src/dinov2/configs/train/vitg14.yaml" # THOSE NEED TO BE LAST train.dataset_path=ImageNet:split=TRAIN:root={milabench_data}/FakeImageNet:extra={milabench_data}/FakeImageNet: true train.batch_size_per_gpu=32: true @@ -473,7 +476,7 @@ dinov2-giant-single: dinov2-giant-gpus: inherits: _dinov2 argv: - --config-file: src/dinov2/configs/train/vitg14.yaml + --config-file: "{benchmark_folder}/src/dinov2/configs/train/vitg14.yaml" # THOSE NEED TO BE LAST train.dataset_path=ImageNet:split=TRAIN:root={milabench_data}/FakeImageNet:extra={milabench_data}/FakeImageNet: true train.batch_size_per_gpu=32: true @@ -481,12 +484,16 @@ dinov2-giant-gpus: train.num_workers=10: true dinov2-giant-nodes: + enabled: false + tags: + - multinode + max_duration: 3600 inherits: _dinov2 argv: - --config-file: src/dinov2/configs/train/vitg14.yaml + --config-file: "{benchmark_folder}/src/dinov2/configs/train/vitg14.yaml" # THOSE NEED TO BE LAST - train.dataset_path=ImageNet:split=TRAIN:root={milabench_data}/FakeImageNet:extra={milabench_data}/FakeImageNet: true - train.batch_size_per_gpu=32: true + train.dataset_path=ImageFolder:root={milabench_data}/FakeImageNet: true + train.batch_size_per_gpu=12: true train.saveckp_freq=100: true train.num_workers=10: true @@ -546,6 +553,9 @@ llm-lora-ddp-gpus: llm-lora-ddp-nodes: + tags: + - multinode + max_duration: 3600 inherits: _llm plan: method: njobs @@ -611,6 +621,9 @@ llm-full-mp-gpus: llm-full-mp-nodes: + tags: + - multinode + max_duration: 3600 inherits: _llm plan: method: njobs @@ -635,3 +648,52 @@ llm-full-mp-nodes: - "len(nodes) >= ${num_machines}" +_geo_gnn: + inherits: _defaults + definition: . + # FIXME: torch cluster is laging behind pytorch + # we are forced to use torch==2.3 instead of torch==2.4 + install_group: gnn + group: geo_gnn + definition: ../benchmarks/geo_gnn + plan: + method: per_gpu + +dimenet: + inherits: _geo_gnn + argv: + --model: 'DimeNet' + --num-samples: 10000 + --use3d: True + + +recursiongfn: + inherits: _defaults + definition: ../benchmarks/recursiongfn + install_group: gnn + group: recursiongfn_gnn + plan: + method: per_gpu + + argv: + --batch_size: 128 + --num_workers: 8 + --num_steps: 100 + --layer_width: 128 + --num_layers: 4 + + +torchatari: + inherits: _defaults + definition: ../benchmarks/torchatari + install_group: torch + plan: + method: per_gpu + + argv: + --num-minibatches: 16 + --update-epochs: 4 + --num-steps: 128 + --num-envs: auto({cpu_per_gpu}, 128) + --total-timesteps: 1000000 + --env-id: Breakout-v5 diff --git a/config/scaling.yaml b/config/scaling.yaml index c6cf1bf6c..0a9907e5a 100644 --- a/config/scaling.yaml +++ b/config/scaling.yaml @@ -2,9 +2,10 @@ bert-fp16: arg: --batch-size model: 1: 4108.75 MiB + 2: 475.0 MiB 4: 1840.375 MiB 8: 8614.75 MiB - 16: 14254.75 MiB + 16: 475.0 MiB 32: 24604.75 MiB 40: 34157.9375 MiB 64: 47212.375 MiB @@ -16,9 +17,10 @@ bert-fp32: arg: --batch-size model: 1: 4206.75 MiB + 2: 475.0 MiB 4: 6652.375 MiB 8: 10240.75 MiB - 16: 17646.75 MiB + 16: 475.0 MiB 24: 28007.9375 MiB 32: 31568.75 MiB 64: 61196.375 MiB @@ -28,9 +30,10 @@ bert-tf32: arg: --batch-size model: 1: 4204.75 MiB + 2: 475.0 MiB 4: 6654.375 MiB 8: 10242.75 MiB - 16: 17648.75 MiB + 16: 475.0 MiB 24: 28009.9375 MiB 32: 31570.75 MiB 64: 61198.375 MiB @@ -40,9 +43,10 @@ bert-tf32-fp16: arg: --batch-size model: 1: 4108.75 MiB + 2: 475.0 MiB 4: 1840.375 MiB 8: 8614.75 MiB - 16: 14254.75 MiB + 16: 475.0 MiB 32: 24604.75 MiB 40: 34157.9375 MiB 64: 47212.375 MiB @@ -75,6 
+79,7 @@ convnext_large-fp32: model: 1: 3268.75 MiB 2: 3480.375 MiB + 4: 2060.75 MiB 8: 5824.75 MiB 16: 8774.75 MiB 32: 14548.75 MiB @@ -91,7 +96,7 @@ convnext_large-tf32: 1: 3268.75 MiB 2: 3480.375 MiB 8: 5824.75 MiB - 16: 8774.75 MiB + 16: 1768.75 MiB 32: 14548.75 MiB 64: 26274.75 MiB 72: 33081.9375 MiB @@ -156,6 +161,37 @@ davit_large-multi: 288: 65910.375 MiB 328: 81742.75 MiB optimized: 128 +diffusion-gpus: + arg: --batch_size + model: + 1: 23082 MiB + 2: 21818.75 MiB + 4: 23478.75 MiB + 8: 26500.75 MiB + 16: 36436.75 MiB + 32: 57808 MiB + 48: 80698 MiB + optimized: 32 +diffusion-nodes: + arg: --batch_size + model: + 1: 21686.75 MiB + 2: 21930.75 MiB + 4: 23510.75 MiB +diffusion-single: + arg: --batch_size + model: + 1: 21654.75 MiB + 2: 21818.75 MiB + 4: 23478.75 MiB +dimenet: {} +dinov2-giant-gpus: + arg: train.batch_size_per_gpu={batch_size} + model: + 32: 69614 MiB + optimized: 32 +dinov2-giant-single: + arg: train.batch_size_per_gpu={batch_size} dlrm: {} focalnet: arg: --batch-size @@ -178,7 +214,37 @@ focalnet: optimized: 128 fp16: {} fp32: {} +lightning: + arg: --batch-size +lightning-gpus: + arg: --batch-size + model: + 1: 4542 MiB + 2: 1158.75 MiB + 4: 1156.75 MiB + 8: 1260.75 MiB + 16: 4150.75 MiB + 128: 15858 MiB + optimized: 16 llama: {} +llm-full-mp-gpus: + arg: batch_size={batch_size} +llm-full-mp-nodes: + arg: batch_size={batch_size} +llm-lora-ddp-gpus: + arg: batch_size={batch_size} + model: + 1: 12418.75 MiB +llm-lora-ddp-nodes: + arg: batch_size={batch_size} +llm-lora-mp-gpus: + arg: batch_size={batch_size} +llm-lora-single: + arg: batch_size={batch_size} + model: + 1: 23196.75 MiB + 2: 27694.75 MiB + 16: 45076.75 MiB opt-1_3b: arg: --per_gpu_batch_size model: @@ -189,19 +255,25 @@ opt-1_3b-multinode: model: 1: 42126 MiB optimized: 1 -opt-6_7b: {} +opt-6_7b: + arg: --per_gpu_batch_size opt-6_7b-multinode: arg: --per_gpu_batch_size model: 1: 55380 MiB optimized: 1 +recursiongfn: + arg: --batch_size + model: + 2: 1134.75 MiB + 4: 1140.75 MiB reformer: arg: --batch-size model: 1: 1916.75 MiB 4: 3004.375 MiB 8: 4512.75 MiB - 16: 7486.75 MiB + 16: 7082.75 MiB 24: 10470.75 MiB 32: 13454.75 MiB 64: 25408.75 MiB @@ -215,9 +287,10 @@ regnet_y_128gf: arg: --batch-size model: 1: 6876.75 MiB + 2: 475.0 MiB 4: 9062.375 MiB 8: 8524.75 MiB - 16: 11426.75 MiB + 16: 1234.75 MiB 24: 18523.9375 MiB 32: 18324.75 MiB 56: 31165.9375 MiB @@ -248,7 +321,14 @@ resnet152: 576: 58588.375 MiB 640: 81354.75 MiB optimized: 128 -resnet152-ddp: {} +resnet152-ddp: + arg: --batch-size +resnet152-ddp-gpus: + arg: --batch-size + model: + 1: 2084.75 MiB + 2: 2122.75 MiB + 4: 2260.75 MiB resnet152-multi: arg: --batch-size model: @@ -294,7 +374,8 @@ resnet50: 1552: 81146.75 MiB 1560: 81590.75 MiB optimized: 64 -resnet50-noio: {} +resnet50-noio: + arg: --batch-size rwkv: arg: --micro_bsz model: @@ -317,9 +398,10 @@ super-slomo: arg: --train_batch_size model: 1: 3016.75 MiB + 2: 3506.75 MiB 4: 5884.375 MiB 8: 10288.75 MiB - 16: 18718.75 MiB + 16: 16914.75 MiB 24: 29777.9375 MiB 32: 33934.375 MiB 56: 61837.9375 MiB @@ -333,11 +415,17 @@ t5: 2: 6384.375 MiB 4: 10620.375 MiB 8: 18684.75 MiB - 16: 35448.75 MiB + 16: 33990.75 MiB 24: 54479.9375 MiB 32: 66760.375 MiB optimized: 128 tf32: {} +torchatari: + arg: --num-steps + model: + 1: 1124.75 MiB + 2: 1138.75 MiB + 4: 1166.75 MiB whisper: arg: --batch-size model: @@ -354,28 +442,3 @@ whisper: 128: 71634.375 MiB 144: 80412.75 MiB optimized: 128 - - -diffusion-gpus: - arg: --batch_size - model: - 1: 23082 MiB - 16: 37778 MiB - 32: 57808 MiB - 48: 80698 MiB - 
optimized: 32 - - -lightning-gpus: - arg: --batch-size - model: - 1: 4542 MiB - 16: 5692 MiB - 128: 15858 MiB - optimized: 16 - -dinov2-giant-gpus: - arg: train.batch_size_per_gpu={batch_size} - model: - 32: 69614 MiB - optimized: 32 diff --git a/constraints/extra/gnn.cuda.txt b/constraints/extra/gnn.cuda.txt new file mode 100644 index 000000000..e5decec56 --- /dev/null +++ b/constraints/extra/gnn.cuda.txt @@ -0,0 +1,4 @@ +--find-links https://data.pyg.org/whl/torch-2.3.0+cu121.html + +torch>=2.3.0,<2.4.0 + diff --git a/constraints/extra/gnn.hpu.txt b/constraints/extra/gnn.hpu.txt new file mode 100644 index 000000000..e69de29bb diff --git a/constraints/extra/gnn.rocm.txt b/constraints/extra/gnn.rocm.txt new file mode 100644 index 000000000..e69de29bb diff --git a/constraints/extra/gnn.xpu.txt b/constraints/extra/gnn.xpu.txt new file mode 100644 index 000000000..e69de29bb diff --git a/constraints/extra/torch.cuda.txt b/constraints/extra/torch.cuda.txt new file mode 100644 index 000000000..aba504237 --- /dev/null +++ b/constraints/extra/torch.cuda.txt @@ -0,0 +1,2 @@ +jax[cuda12] +--find-links https://storage.googleapis.com/jax-releases/jax_cuda_releases.html diff --git a/constraints/extra/torch.hpu.txt b/constraints/extra/torch.hpu.txt new file mode 100644 index 000000000..1d21c1779 --- /dev/null +++ b/constraints/extra/torch.hpu.txt @@ -0,0 +1,5 @@ + +# +# +voir >= 0.2.15 +torchcompat >= 1.0.0 diff --git a/constraints/extra/torch.rocm.txt b/constraints/extra/torch.rocm.txt new file mode 100644 index 000000000..870d923a2 --- /dev/null +++ b/constraints/extra/torch.rocm.txt @@ -0,0 +1 @@ +# No jax only a container for it diff --git a/constraints/extra/torch.xpu.txt b/constraints/extra/torch.xpu.txt new file mode 100644 index 000000000..6b7454cbc --- /dev/null +++ b/constraints/extra/torch.xpu.txt @@ -0,0 +1,20 @@ +--extra-index-url https://download.pytorch.org/whl/cpu +--extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/us/ + +# +# Including a package in a constraints file does not trigger installation of the package. +# +torch +torchvision +torchaudio +intel-extension-for-pytorch +oneccl_bind_pt +intel-extension-for-pytorch-deepspeed + +# for jax as well +intel-extension-for-openxla + +# +# +voir >= 0.2.15 +torchcompat >= 1.0.0 diff --git a/constraints/xpu.txt b/constraints/xpu.txt index 37d21a00a..5aa7739a2 100644 --- a/constraints/xpu.txt +++ b/constraints/xpu.txt @@ -1,16 +1,16 @@ -# --extra-index-url https://download.pytorch.org/whl/cpu +--extra-index-url https://download.pytorch.org/whl/cpu --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/us/ # # Including a package in a constraints file does not trigger installation of the package. # -torch>=2.1.0 -torchvision>=0.16.0a0 -torchaudio>=2.1.0a0 -intel-extension-for-pytorch>=2.1.10+xpu -oneccl_bind_pt==2.1.100+xpu -intel-extension-for-pytorch-deepspeed>=2.1.30 -intel-extension-for-openxla>=0.3.0 +torch +torchvision +torchaudio +intel-extension-for-pytorch +oneccl_bind_pt +intel-extension-for-pytorch-deepspeed +intel-extension-for-openxla # # diff --git a/docs/execution_modes.rst b/docs/execution_modes.rst new file mode 100644 index 000000000..8d40fc44d --- /dev/null +++ b/docs/execution_modes.rst @@ -0,0 +1,93 @@ +Milabench processes overview +============================ + +* milabench main process + * gather metrics from benchmark processes, save them to file + * manages the benchmarks (timeout etc...) 
+ + * if ``per_gpu`` is used, milabench will launch one process per GPU (sets ``CUDA_VISIBLE_DEVICES``) + * each process logs its GPU data + * might spawn a monitor process + * will init pynvml + * dataloader will also spawn process workers + * usually not using GPU + + * if ``njobs`` is used, milabench will launch a single process (torchrun) + * torchrun in turn will spawn one process per GPU + * RANK 0 is used for logging + * RANK 0 might spawn a monitor process + * will init pynvml + * dataloader will also spawn process workers + * usually not using GPU + +Plan +---- + +per_gpu ++++++++ + +``per_gpu``: used for single-GPU benchmarks; spawns one process per GPU, each running the same benchmark + +.. code-block:: yaml + + _torchvision: + inherits: _defaults + definition: ../benchmarks/torchvision + group: torchvision + install_group: torch + plan: + method: per_gpu + +Milabench will essentially execute something akin to the following. + +.. code-block:: bash + + echo "---" + echo "fp16" + echo "====" + time ( + CUDA_VISIBLE_DEVICES=0 $SRC/milabench/benchmarks/flops/activator $BASE/venv/torch $SRC/milabench/benchmarks/flops/main.py --number 30 --repeat 90 --m 8192 --n 8192 --dtype fp16 & + CUDA_VISIBLE_DEVICES=1 $SRC/milabench/benchmarks/flops/activator $BASE/venv/torch $SRC/milabench/benchmarks/flops/main.py --number 30 --repeat 90 --m 8192 --n 8192 --dtype fp16 & + CUDA_VISIBLE_DEVICES=2 $SRC/milabench/benchmarks/flops/activator $BASE/venv/torch $SRC/milabench/benchmarks/flops/main.py --number 30 --repeat 90 --m 8192 --n 8192 --dtype fp16 & + CUDA_VISIBLE_DEVICES=3 $SRC/milabench/benchmarks/flops/activator $BASE/venv/torch $SRC/milabench/benchmarks/flops/main.py --number 30 --repeat 90 --m 8192 --n 8192 --dtype fp16 & + CUDA_VISIBLE_DEVICES=4 $SRC/milabench/benchmarks/flops/activator $BASE/venv/torch $SRC/milabench/benchmarks/flops/main.py --number 30 --repeat 90 --m 8192 --n 8192 --dtype fp16 & + CUDA_VISIBLE_DEVICES=5 $SRC/milabench/benchmarks/flops/activator $BASE/venv/torch $SRC/milabench/benchmarks/flops/main.py --number 30 --repeat 90 --m 8192 --n 8192 --dtype fp16 & + CUDA_VISIBLE_DEVICES=6 $SRC/milabench/benchmarks/flops/activator $BASE/venv/torch $SRC/milabench/benchmarks/flops/main.py --number 30 --repeat 90 --m 8192 --n 8192 --dtype fp16 & + CUDA_VISIBLE_DEVICES=7 $SRC/milabench/benchmarks/flops/activator $BASE/venv/torch $SRC/milabench/benchmarks/flops/main.py --number 30 --repeat 90 --m 8192 --n 8192 --dtype fp16 & + wait + ) + +njobs ++++++ + +``njobs``: used to launch a single job that can see all the GPUs. + +.. code-block:: yaml + + _torchvision_ddp: + inherits: _defaults + definition: ../benchmarks/torchvision_ddp + group: torchvision + install_group: torch + plan: + method: njobs + n: 1 + +Milabench will essentially execute something akin to the following. + +.. 
code-block:: bash + + echo "---" + echo "lightning-gpus" + echo "==============" + time ( + $BASE/venv/torch/bin/benchrun --nnodes=1 --rdzv-backend=c10d --rdzv-endpoint=127.0.0.1:29400 --master-addr=127.0.0.1 --master-port=29400 --nproc-per-node=8 --no-python -- python $SRC/milabench/benchmarks/lightning/main.py --epochs 10 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model resnet152 --batch-size 16 & + wait + ) + + + + + + + diff --git a/extra/torch_ppo_atari_envpool/mark_torch_ppo_atari_envpool b/extra/torch_ppo_atari_envpool/mark_torch_ppo_atari_envpool new file mode 100644 index 000000000..e69de29bb diff --git a/milabench/_version.py b/milabench/_version.py index d24b7975c..6a9a689f2 100644 --- a/milabench/_version.py +++ b/milabench/_version.py @@ -1,5 +1,5 @@ """This file is generated, do not modify""" -__tag__ = "v0.1.0-34-g93521fd7" -__commit__ = "93521fd70a02719076f64253ac4ae3b4a444c739" -__date__ = "2024-08-22 19:02:01 +0000" +__tag__ = "v0.1.0-51-g3d185d1" +__commit__ = "3d185d15af22876b0dece6f296e179754b316a26" +__date__ = "2024-08-28 11:52:25 -0400" diff --git a/milabench/cli/__init__.py b/milabench/cli/__init__.py index 205942e47..5a1f122c5 100644 --- a/milabench/cli/__init__.py +++ b/milabench/cli/__init__.py @@ -21,7 +21,8 @@ from .summary import cli_summary from .resolve import cli_resolve from .new import cli_new - +from .env import cli_env +from .prepare_run import cli_prepare_run class Main: def new(): @@ -94,6 +95,13 @@ def matrix(): def resolve(): return cli_resolve() + + def env(): + """Print milabench environment variables""" + cli_env() + + def prepare_run(): + cli_prepare_run() def main(argv=None): diff --git a/milabench/cli/dry.py b/milabench/cli/dry.py index 80d55d6ea..010269223 100644 --- a/milabench/cli/dry.py +++ b/milabench/cli/dry.py @@ -169,7 +169,7 @@ def multipack_args(conf: Arguments): "ip": f"192.168.0.{i + 10}" if i != 0 else "127.0.0.1", "user": "username", "main": i == 0, - "port": 22, + "sshport": 22, } for i in range(conf.nnodes) ], diff --git a/milabench/cli/env.py b/milabench/cli/env.py new file mode 100644 index 000000000..3725aa9df --- /dev/null +++ b/milabench/cli/env.py @@ -0,0 +1,27 @@ + + +from milabench.system import _global_options, as_environment_variable, SystemConfig + + +from dataclasses import asdict + + +def cli_env(): + _ = SystemConfig() + + # import yaml + # print(yaml.dump(asdict(_))) + + for k, option in _global_options.items(): + env_name = as_environment_variable(k) + value = option["value"] + default = option["default"] + + if value is None or value == default: + print("# ", end="") + + print(f"export {env_name}={value}") + + +if __name__ == "__main__": + cli_env() diff --git a/milabench/cli/install.py b/milabench/cli/install.py index 00977aea3..10d33a1da 100644 --- a/milabench/cli/install.py +++ b/milabench/cli/install.py @@ -12,8 +12,10 @@ @dataclass class Arguments: force: bool = False + update: bool = False shorttrace: bool = False variant: str = None + # fmt: on @@ -22,13 +24,16 @@ def arguments(): # Force install force: Option & bool = False + # Update package + update: Option & bool = False + # On error show full stacktrace shorttrace: Option & bool = False # Install variant variant: Option & str = None - return Arguments(force, shorttrace, variant) + return Arguments(force, update, shorttrace, variant) @tooled @@ -39,10 +44,13 @@ def cli_install(args=None): overrides = {"*": {"install_variant": args.variant}} if args.variant else {} - if args.force: - mp = 
get_multipack(run_name="install.{time}", overrides=overrides) - for pack in mp.packs.values(): + + mp = get_multipack(run_name="install.{time}", overrides=overrides) + for pack in mp.packs.values(): + if args.force or args.update: pack.install_mark_file.rm() + + if args.force: pack.dirs.venv.rm() mp = get_multipack(run_name="install.{time}", overrides=overrides) diff --git a/milabench/cli/prepare_run.py b/milabench/cli/prepare_run.py new file mode 100644 index 000000000..58b5fe559 --- /dev/null +++ b/milabench/cli/prepare_run.py @@ -0,0 +1,15 @@ +from coleo import tooled + +from .prepare import cli_prepare +from .run import cli_run + +@tooled +def cli_prepare_run(args=None): + """Prepare a benchmark: download datasets, weights etc.""" + + rc = cli_prepare() + + if rc == 0: + rc = cli_run() + + return rc diff --git a/milabench/cli/slurm.py b/milabench/cli/slurm.py index db68dbf0e..35f1fe94e 100644 --- a/milabench/cli/slurm.py +++ b/milabench/cli/slurm.py @@ -1,23 +1,25 @@ import getpass import os - +import socket +import subprocess from coleo import tooled -from ..system import get_gpu_capacity +from ..system import get_gpu_capacity, is_loopback, resolve_hostname, gethostname -@tooled -def cli_slurm_system(): - """Generate a system file based of slurm environment variables""" - node_list = expand_node_list(os.getenv("SLURM_JOB_NODELIST", "")) +def make_node_list_from_slurm(node_list): def make_node(i, ip): + hostname, local = resolve_hostname(ip) + node = { "name": ip, - "ip": ip, + "ip": hostname, + "hostname": gethostname(ip), "user": getpass.getuser(), - "main": i == 0, + "main": local, + "sshport": 22, } if i == 0: @@ -26,9 +28,46 @@ def make_node(i, ip): return node # nvidia-smi --query-gpu=memory.total --format=csv + + nodes = [make_node(i, ip) for i, ip in enumerate(node_list)] + + # ensure there is a main node + # either it is the local node or the first node + for node in nodes: + if node.get("main", False): + break + else: + nodes[0]["main"] = True + + return nodes + + +@tooled +def cli_slurm_system(): + """Generate a system file based on slurm environment variables""" + + node_list = expand_node_list(os.getenv("SLURM_JOB_NODELIST", "")) + + if len(node_list) > 0: + nodes = make_node_list_from_slurm(node_list) + else: + self = socket.gethostname() + nodes = [{ + "name": self, + "ip": self, + "hostname": self, + "user": getpass.getuser(), + "main": True, + "sshport": 22, + }] + + + from milabench.system import resolve_addresses + resolve_addresses(nodes) + system = { "arch": "cuda", - "nodes": [make_node(i, ip) for i, ip in enumerate(node_list)], + "nodes": nodes, } capacity = get_gpu_capacity() diff --git a/milabench/commands/__init__.py b/milabench/commands/__init__.py index bee42baf5..e6d3639d5 100644 --- a/milabench/commands/__init__.py +++ b/milabench/commands/__init__.py @@ -456,7 +456,7 @@ def is_local(self): if localnode is not None: return (False # The ip belongs to the local node - or self.host in localnode["ipaddrlist"] + or self.host in localnode.get("ipaddrlist", []) # The hostname is the local node or self.host == localnode["hostname"] ) @@ -485,7 +485,7 @@ def _argv(self, **kwargs) -> List: argv.append(f"-i{key}") argv.append(host) - return argv + return argv # + ["env", "-i"] class SCPCommand(SSHCommand, CmdCommand): @@ -577,16 +577,24 @@ def _argv(self, **kwargs): return [] +def node_address(node): + """Favour the hostname as it is the most consistent name across machines""" + host = node.get("hostname") + ip = node.get("ip") + return host or ip + + class 
class ForeachNode(ListCommand): def __init__(self, executor: Command, **kwargs) -> None: super().__init__(None, **kwargs) self.options.update(kwargs) self.executor = executor + self.base_tags = self.executor.pack.config["tag"] def make_new_node_pack(self, rank, node, base) -> "BasePackage": """Make a new environment/config for the run""" config = base.pack.config - tags = [*config["tag"], node["name"]] + tags = [*self.base_tags, node["name"]] # Workers do not send training data # tag it as such so validation can ignore this pack @@ -630,10 +638,10 @@ def executors(self): ) worker = SSHCommand( - host=node["ip"], + host=node_address(node), user=node["user"], key=key, - port=node.get("port", 22), + port=node.get("sshport", 22), executor=self.make_new_node_executor(rank, node, self.executor), **options ) @@ -653,31 +661,43 @@ def copy(self, pack): class TorchrunAllNodes(ForeachNode): """executes torchrun on multiple machines""" - def __init__(self, executor: Command, **kwargs) -> None: + @staticmethod + def make_base_executor(cls, executor, *args, **kwargs): config = executor.pack.config max_num = config.get("num_machines", 1) - self.nodes = select_nodes(config["system"]["nodes"], max_num) + nodes = select_nodes(config["system"]["nodes"], max_num) - main = self.nodes[0] + main = nodes[0] # node[port] is for SSH - main_host = main["ip"] + main_host = node_address(main) # add them as option so we could tweak them if necessary main_port = option("torchrun.port", int, default=29400) backend = option("torchrun.backend", str, default="c10d") main_addr = f"{main_host}:{main_port}" + + config = executor.pack.config + + return cls( executor, - f"--nnodes={len(self.nodes)}", + f"--nnodes={len(nodes)}", f"--rdzv-backend={backend}", f"--rdzv-endpoint={main_addr}", - f"--master-addr={main_host}", - f"--master-port={main_port}", + # f"--master-addr={main_host}", + # f"--master-port={main_port}", + *args, **kwargs ) + def __init__(self, executor: Command, *args, **kwargs) -> None: + base_exec = TorchrunAllNodes.make_base_executor( + TorchrunAllGPU, + executor, + *args, + **kwargs ) super().__init__(base_exec) @@ -852,7 +872,7 @@ def __init__(self, pack: pack.BasePackage, **kwargs): super().__init__(pack, **kwargs) def _argv(self, **_) -> List: - return [f"{self.pack.dirs.code / 'activator'}", f"{self.pack.dirs.venv}"] + return [activator_script(), f"{self.pack.dirs.venv}", f"{self.pack.dirs.cache}"] @@ -874,9 +894,10 @@ def make_new_node_executor(self, rank, node, base): config = base.pack.config pack = self.make_new_node_pack(rank, node, base) - + executor = base.copy(pack) + return DockerRunCommand( - AccelerateLaunchCommand(pack, rank=rank), + AccelerateLaunchCommand(executor, rank=rank, **self.options), config["system"].get("docker_image"), ) @@ -948,6 +969,8 @@ def _argv(self, **_) -> List: deepspeed_argv = [] cpu_per_process = self.pack.resolve_argument('--cpus_per_gpu', 4) + main_port = option("torchrun.port", int, default=29400) + return [ # -- Run the command in the right venv # This could be inside the SSH Command @@ -956,6 +979,7 @@ def _argv(self, **_) -> List: # inside a specifc venv activator_script(), f"{self.pack.dirs.venv}", + f"{self.pack.dirs.cache}", # -- "accelerate", "launch", @@ -967,7 +991,7 @@ def _argv(self, **_) -> List: f"--gradient_accumulation_steps={self.pack.config.get('gradient_accumulation_steps', 1)}", f"--num_cpu_threads_per_process={cpu_per_process}", f"--main_process_ip={manager['ip']}", - f"--main_process_port={manager['port']}",
f"--main_process_port={main_port}", f"--num_processes={nproc}", *self.accelerate_argv, ] diff --git a/milabench/common.py b/milabench/common.py index 5849e05fe..135e45545 100644 --- a/milabench/common.py +++ b/milabench/common.py @@ -141,7 +141,7 @@ def get_base_defaults(base, arch="none", run_name="none"): { "name": "local", "ip": "127.0.0.1", - "port": 8123, + "sshport": 22, "user": user, "main": True, } diff --git a/milabench/multi.py b/milabench/multi.py index b09eeecca..f734e40d5 100644 --- a/milabench/multi.py +++ b/milabench/multi.py @@ -83,6 +83,23 @@ def make_execution_plan(pack, step=0, repeat=1): return exec_plan +async def copy_base_to_workers(setup): + # Note: when we use docker we do not need to install + # so this should be ignored + if is_main_local(setup) and is_multinode(setup): + print("Coping main setup from this node to worker") + # copy the main setup to the workers + # so it copies the bench venv already, no need for python + from milabench.remote import copy_folder + from milabench.system import SystemConfig + + # we copy the entire content of base + # FIXME: handle custom (venv, cache, data, etc...) directories + # + copy_plan = copy_folder(setup, SystemConfig().base) + remote_task = asyncio.create_task(copy_plan.execute()) + await asyncio.wait([remote_task]) + class MultiPackage: def __init__(self, packs): @@ -140,6 +157,7 @@ async def do_install(self): remote_task = None if is_remote(setup): + print("Current node is outside of our system") # We are outside system, setup the main node first remote_plan = milabench_remote_install(setup, setup_for="main") remote_task = asyncio.create_task(remote_plan.execute()) @@ -148,15 +166,18 @@ async def do_install(self): # We do not install benchmarks on that node return - elif is_main_local(setup) and is_multinode(setup): - # We are the main node, setup workers - remote_plan = milabench_remote_install(setup, setup_for="worker") - remote_task = asyncio.create_task(remote_plan.execute()) + # elif is_main_local(setup) and is_multinode(setup): + # # this was executing install on the remote node but then it needed python to be available + # # We are the main node, setup workers + # remote_plan = milabench_remote_install(setup, setup_for="worker") + # remote_task = asyncio.create_task(remote_plan.execute()) # do the installation step with phase_lock("install"): await self.do_phase("install", remote_task, "checked_install") + await copy_base_to_workers(setup) + async def do_prepare(self): setup = self.setup_pack() remote_task = None @@ -168,13 +189,17 @@ async def do_prepare(self): return - elif is_main_local(setup) and is_multinode(setup): - remote_plan = milabench_remote_prepare(setup, run_for="worker") - remote_task = asyncio.create_task(remote_plan.execute()) + # elif is_main_local(setup) and is_multinode(setup): + # remote_plan = milabench_remote_prepare(setup, run_for="worker") + # remote_task = asyncio.create_task(remote_plan.execute()) with phase_lock("prepare"): await self.do_phase("prepare", remote_task, "prepare") + # Prepare is done on the main node + # copy the result there + await copy_base_to_workers(setup) + async def do_run(self, repeat=1): setup = self.setup_pack() @@ -207,7 +232,7 @@ async def do_run(self, repeat=1): await pack.message_error(exc) async def do_pin( - self, pip_compile_args, constraints: list = tuple(), from_scratch=False + self, pip_compile_args, constraints: list = tuple(), from_scratch=False, requirements: list = tuple() ): groups = defaultdict(dict) for pack in self.packs.values(): @@ -215,11 
@@ -215,11 +240,13 @@ async def do_pin( igrp = pack.config["install_group"] ivar = pack.config["install_variant"] ivar_constraints: XPath = here.parent / "constraints" / f"{ivar}.txt" + base_reqs = pack.requirements_map().keys() if ivar_constraints.exists(): constraints = {ivar_constraints, *constraints} - groups[igrp].update({req: pack for req in base_reqs}) + groups[igrp].update({req: pack for req in base_reqs}) + for constraint in constraints: print("Using constraint file:", constraint) @@ -231,19 +258,28 @@ async def do_pin( for ig, (reqs, packs) in groups.items(): if len(packs) < len(reqs): if len(set(p.config["group"] for p in packs)) > 1: - raise Exception( - f"Install group '{ig}' contains benchmarks that have more than" + print( + f"WARNING: Install group '{ig}' contains benchmarks that have more than" " one requirements file. Please isolate such benchmarks in their" " own install_group." ) for ig, (reqs, packs) in groups.items(): packs = list(packs) + pack0 = packs[0] + + ivar = pack0.config["install_variant"] + ivar_requirements: XPath = here.parent / "constraints" / "extra" / f"{ig}.{ivar}.txt" + + if ivar_requirements.exists(): + reqs.add(ivar_requirements) + if len(packs) == 1: (pack,) = packs await pack.pin( pip_compile_args=pip_compile_args, constraints=constraints, + requirements=requirements ) else: pack0 = packs[0] @@ -253,7 +289,7 @@ async def do_pin( constraint_path = pindir / "tmp-constraints.txt" constraint_files = make_constraints_file( - constraint_path, constraints, str(here.parent) + constraint_path, constraints, str(here.parent), requirements=requirements ) ig_constraint_path = pindir / f"constraints-{ivar}-{ig}.txt" @@ -278,6 +314,7 @@ async def do_pin( pip_compile_args=pip_compile_args, constraints=new_constraints, working_dir=here.parent, + requirements=requirements ) async def count_runs(self, repeat): diff --git a/milabench/pack.py b/milabench/pack.py index 60a5df2f7..b557b7729 100644 --- a/milabench/pack.py +++ b/milabench/pack.py @@ -398,6 +398,7 @@ async def pin( input_files: Sequence = tuple(), constraints: Sequence = tuple(), working_dir=None, + requirements: Sequence = tuple(), ): """Pin versions to requirements file.
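
The requirements plumbed through do_pin and pin end up in the generated tmp-constraints file next to the constraint entries: "-c" lines for constraint files and "-r" lines for the new extra requirements, both prefixed with ../ because the file lives under ./.pin/. A minimal sketch of that writing step (paths are illustrative; the real helper also relativizes them against the working directory):

    from pathlib import Path

    def write_constraints(pth: Path, constraints, requirements) -> None:
        # mirrors the "-c ../..." / "-r ../..." layout of make_constraints_file
        pth.parent.mkdir(parents=True, exist_ok=True)
        with open(pth, "w") as tfile:
            tfile.write("\n".join(f"-c ../{c}" for c in constraints) + "\n")
            tfile.write("\n".join(f"-r ../{r}" for r in requirements) + "\n")

    write_constraints(
        Path(".pin/tmp-constraints.txt"),
        ["constraints/cuda.txt"],
        ["constraints/extra/gnn.cuda.txt"],
    )
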
@@ -407,6 +408,9 @@ async def pin( input_files: A list of inputs to piptools compile constraint: The constraint file """ + if working_dir is None: + working_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), "..")) + ivar = self.config.get("install_variant", None) if ivar == "unpinned": @@ -426,7 +430,10 @@ async def pin( grp = self.config["group"] constraint_path = XPath(".pin") / f"tmp-constraints-{ivar}-{grp}.txt" constraint_files = make_constraints_file( - constraint_path, constraints, working_dir + constraint_path, + constraints, + working_dir, + requirements=requirements, ) current_input_files = constraint_files + (base_reqs, *input_files) diff --git a/milabench/remote.py b/milabench/remote.py index bf5963183..7e1eef85c 100644 --- a/milabench/remote.py +++ b/milabench/remote.py @@ -70,9 +70,48 @@ def milabench_remote_sync(pack, worker): def should_run_for(worker, setup_for): if setup_for == "worker": - return not worker["main"] + return not worker.get("main", False) + + return worker.get("main", False) + + +def worker_commands(pack, worker_plan, setup_for="worker"): + nodes = pack.config["system"]["nodes"] + copy = [] + node_packs = [] + + for node in nodes: + node_pack = None + + if should_run_for(node, setup_for): + node_pack = worker_pack(pack, node) + + cmds = worker_plan(node_pack, node) + + if not isinstance(cmds, list): + cmds = [cmds] + copy.extend(cmds) + + node_packs.append(node_pack) + + return ListCommand(*copy) + + +def sshnode(node, cmd): + host = node["ip"] + user = node["user"] + port = node.get("sshport", 22) + return SSHCommand(cmd, user=user, host=host, port=port) + + +def copy_folder(pack, folder, setup_for="worker"): + def copy_to_worker(nodepack, node): + return [ + sshnode(node, CmdCommand(nodepack, "mkdir", "-p", folder)), + CmdCommand(nodepack, *rsync(node, folder)) + ] + return worker_commands(pack, copy_to_worker, setup_for=setup_for) - return worker["main"] def milabench_remote_setup_plan(pack, setup_for="worker") -> SequenceCommand: @@ -87,22 +126,16 @@ def milabench_remote_setup_plan(pack, setup_for="worker") -> SequenceCommand: copy = [] node_packs = [] - for node in nodes: - node_pack = None - - if should_run_for(node, setup_for): - node_pack = worker_pack(pack, node) - copy.append(CmdCommand(node_pack, *rsync(node, INSTALL_FOLDER))) - - node_packs.append(node_pack) + copy_source = copy_folder(pack, INSTALL_FOLDER, setup_for) install = [] + for i, node in enumerate(nodes): if should_run_for(node, setup_for): install.append(pip_install_milabench(node_packs[i], node, INSTALL_FOLDER)) return SequenceCommand( - ListCommand(*copy), + copy_source, ListCommand(*install), ) @@ -146,7 +179,7 @@ def is_multinode(pack): count = 0 nodes = pack.config["system"]["nodes"] for node in nodes: - if not node["main"]: + if not node.get("main", False): count += 1 return count > 0 @@ -159,12 +192,12 @@ def is_remote(pack): def is_main_local(pack): """Only the local main can send remote commands to remote""" self = pack.config["system"]["self"] - return self is not None and self["local"] and self["main"] + return self is not None and self["local"] and self.get("main", False) def is_worker(pack): self = pack.config["system"]["self"] - return self is not None and (not self["main"]) + return self is not None and (not self.get("main", False)) def _sanity(pack, setup_for):
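
The activator wrapper that these remote commands go through now takes a cache directory as its second argument and exports it as XDG_CACHE_HOME before entering the venv, as the next hunk shows. A rough Python equivalent of the wrapper's behaviour (the real script is bash; the venv activation is approximated here with VIRTUAL_ENV/PATH):

    # Approximate behaviour of milabench/scripts/activator:
    # argv is <venv> <cache> <command...>; the command replaces this process.
    import os

    def activator(argv: list) -> None:
        venv, cache, *cmd = argv
        os.environ["XDG_CACHE_HOME"] = cache
        os.environ["VIRTUAL_ENV"] = venv
        os.environ["PATH"] = f"{venv}/bin:" + os.environ.get("PATH", "")
        os.execvp(cmd[0], cmd)  # like `exec "$@"` after activation
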
+cache="$1" +shift + +echo "$cache" +export XDG_CACHE_HOME=$cache + source "$venv"/bin/activate exec "$@" diff --git a/milabench/sizer.py b/milabench/sizer.py index 2ae877213..b3fa40478 100644 --- a/milabench/sizer.py +++ b/milabench/sizer.py @@ -261,20 +261,37 @@ def on_start(self, entry): self.max_usage = float("-inf") config = self.memory.setdefault(self.benchname, dict()) - scalingarg = config.get("arg", None) + template = config.get("arg", None) - if scalingarg is None: + if template is None: self.benchname = None return + + placeholder = "{batch_size}" + argstart = template.replace(placeholder, "") + is_template = False found = None for i, arg in enumerate(argv): - if arg.endswith(scalingarg): + if arg.endswith(template): + found = i + break + + # + if arg.startswith(argstart): found = i + is_template = True break if found: - self.batch_size = int(argv[found + 1]) + if is_template: + arg = argv[found] + value = arg.replace(argstart, "") + self.batch_size = int(value) + else: + self.batch_size = int(argv[found + 1]) + else: + print("Count not find batch_size argument") def on_data(self, entry): if self.filepath is None: @@ -331,6 +348,23 @@ def report(self, *args): yaml.dump(newdata, file) +def arch_to_device(arch): + device_types = [ + "cpu", + "cuda", + "ipu", + "xpu", + "mkldnn", + "opengl", "opencl", "ideep", "hip", "ve", + "fpga", "maia", "xla", "lazy", "vulkan", "mps", "meta", + "hpu", "mtia", "privateuseone" + ] + arch_to_device = {t:t for t in device_types} + arch_to_device["rocm"] = "cuda" + return arch_to_device.get(arch, "cpu") + + + def new_argument_resolver(pack): system_config = system_global.get() if system_config is None: @@ -339,16 +373,17 @@ def new_argument_resolver(pack): context = deepcopy(system_config) arch = context.get("arch", "cpu") + device_count_used = 1 + device_count_system = len(get_gpu_info()["gpus"]) if hasattr(pack, "config"): - device_count = len(pack.config.get("devices", [0])) - else: - device_count = len(get_gpu_info()["gpus"]) + device_count_used = len(pack.config.get("devices", [0])) + + if device_count_used <= 0: + device_count_used = 1 ccl = {"hpu": "hccl", "cuda": "nccl", "rocm": "rccl", "xpu": "ccl", "cpu": "gloo"} - if device_count <= 0: - device_count = 1 cpu_opt = CPUOptions() def auto(value, default): @@ -363,13 +398,14 @@ def clamp(x, mn=cpu_opt.cpu_min, mx=cpu_opt.cpu_max): total_available = total_cpu - cpu_opt.reserved_cores context["cpu_count"] = total_available - context["cpu_per_gpu"] = total_available // device_count + context["cpu_per_gpu"] = total_available // max(device_count_system, 1) context["n_worker"] = clamp(context["cpu_per_gpu"]) if cpu_opt.n_workers is not None: context["n_worker"] = cpu_opt.n_workers context["arch"] = arch + context["device_name"] = arch_to_device(arch) context["ccl"] = ccl.get(arch, "gloo") context["milabench_base"] = option("base", str, default="") @@ -381,6 +417,7 @@ def clamp(x, mn=cpu_opt.cpu_min, mx=cpu_opt.cpu_max): context["milabench_runs"] = dirs.get('runs', "") context["milabench_cache"] = dirs.get('cache', "") context["milabench_name"] = pack.config.get("name", None) + context["benchmark_folder"] = pack.config.get('definition', None) def auto_eval(arg): newvalue = str(arg).format(**context) diff --git a/milabench/system.py b/milabench/system.py index 7db61e5ea..d29f4cd27 100644 --- a/milabench/system.py +++ b/milabench/system.py @@ -3,7 +3,10 @@ import socket from dataclasses import dataclass, field import sys +import subprocess from contextlib import contextmanager +import ipaddress + 
diff --git a/milabench/system.py b/milabench/system.py index 7db61e5ea..d29f4cd27 100644 --- a/milabench/system.py +++ b/milabench/system.py @@ -3,7 +3,10 @@ import socket from dataclasses import dataclass, field import sys +import subprocess from contextlib import contextmanager +import ipaddress + import psutil import yaml from voir.instruments.gpu import get_gpu_info @@ -14,6 +17,21 @@ system_global = contextvars.ContextVar("system", default=None) +def get_gpu_capacity(strict=False): + try: + capacity = 1e24 + + for k, v in get_gpu_info()["gpus"].items(): + capacity = min(v["memory"]["total"], capacity) + + return int(capacity) + except: + print("GPU not available, defaulting to 0 MiB") + if strict: + raise + return 0 + + def getenv(name, expected_type): value = os.getenv(name) @@ -66,8 +84,6 @@ def option(name, etype, default=None): system = system_global.get() if system: options = system.get("options", dict()) - else: - warn_no_config() frags = name.split(".") env_name = as_environment_variable(name) @@ -124,7 +140,7 @@ class SizerOptions: optimized: bool = defaultfield("sizer.optimized", int) # Set a target VRAM capacity to use - capacity: str = defaultfield("sizer.capacity", str) + capacity: str = defaultfield("sizer.capacity", str, None) # Save the batch size, VRM usage data to a scaling file save: str = defaultfield("sizer.save", str, None) @@ -177,17 +193,17 @@ class Torchrun: @dataclass class Options: - sizer: SizerOptions - cpu: CPUOptions - dataset: DatasetConfig - dirs: Dirs - torchrun: Torchrun + sizer: SizerOptions = SizerOptions() + cpu: CPUOptions = CPUOptions() + dataset: DatasetConfig = DatasetConfig() + dirs: Dirs = Dirs() + torchrun: Torchrun = Torchrun() @dataclass class GPUConfig: arch: str = defaultfield("gpu.arch", str, None) - capacity: str = None + capacity: str = defaultfield("gpu.capacity", str, str(get_gpu_capacity())) @dataclass @@ -204,21 +220,29 @@ class Github: pat: str = defaultfield("github.path", str, None) +def default_device(): + try: + gpu_info = get_gpu_info() + return gpu_info["arch"] + except: + return "cpu" + + @dataclass class SystemConfig: """This is meant to be an exhaustive list of all the environment overrides""" - arch: str = defaultfield("gpu.arch", str, None) - sshkey: str = None + arch: str = defaultfield("gpu.arch", str, default_device()) + sshkey: str = defaultfield("ssh", str, "~/.ssh/id_rsa") docker_image: str = None nodes: list[Nodes] = field(default_factory=list) - gpu: GPUConfig = None - options: Options = None + gpu: GPUConfig = GPUConfig() + options: Options = Options() base: str = defaultfield("base", str, None) config: str = defaultfield("config", str, None) dash: bool = defaultfield("dash", bool, 1) noterm: bool = defaultfield("noterm", bool, 0) - github: Github = None + github: Github = Github() def check_node_config(nodes): @@ -249,6 +273,18 @@ def get_remote_ip(): return set(result) +def is_loopback(address: str) -> bool: + try: + # Create an IP address object + ip = ipaddress.ip_address(address) + # Check if the address is a loopback address + return ip.is_loopback + except ValueError: + # If the address is invalid, return False + return False + + + def _resolve_ip(ip): hostname = ip aliaslist = [] @@ -304,7 +340,7 @@ def enable_offline(enabled): offline = old -def resolve_addresses(nodes): +def _resolve_addresses(nodes): # Note: it is possible for self to be none # if we are running milabench on a node that is not part of the system # in that case it should still work; the local is then going to @@ -327,12 +363,14 @@ or (hostname in ("localhost", socket.gethostname(), "127.0.0.1")) or (socket.gethostname().startswith(hostname)) or len(ip_list.intersection(ipaddrlist)) > 0 + or any([is_loopback(ip) for ip in ipaddrlist]) ) + # cn-g005 cn-g005.server.mila.quebec # print(hostname,
socket.gethostname()) node["local"] = is_local - if is_local: + if is_local and self is None: self = node node["ipaddrlist"] = list(set(list(ip_list) + list(ipaddrlist))) @@ -345,19 +383,64 @@ def resolve_addresses(nodes): return self -def get_gpu_capacity(strict=False): +def gethostname(host): try: - capacity = 0 + # "-oCheckHostIP=no", + # "-oPasswordAuthentication=no", + return subprocess.check_output([ + "ssh", + "-oCheckHostIP=no", + "-oPasswordAuthentication=no", + "-oStrictHostKeyChecking=no", host, "cat", "/etc/hostname"], text=True).strip() + except: + print("Could not resolve hostname") + return host - for k, v in get_gpu_info()["gpus"].items(): - capacity = min(v["memory"]["total"], capacity) - return int(capacity) +def resolve_hostname(ip): + try: + hostname, _, iplist = socket.gethostbyaddr(ip) + + for ip in iplist: + if is_loopback(ip): + return hostname, True + + return hostname, hostname == socket.gethostname() + except: - print("GPU not available, defaulting to 0 MiB") - if strict: - raise - return 0 + if offline: + return ip, False + + raise + +def resolve_node_address(node): + hostname, local = resolve_hostname(node["ip"]) + + node["hostname"] = hostname + node["local"] = local + + if local: + # `gethostbyaddr` returns `cn-d003` but we want `cn-d003.server.mila.quebec` + # else torchrun does not recognize the main node + node["hostname"] = socket.gethostname() + + return local + + +def resolve_addresses(nodes): + if offline: + for n in nodes: + n["hostname"] = n["ip"] + + return nodes[0] + + self = None + + for node in nodes: + if resolve_node_address(node): + self = node + + return self def build_system_config(config_file, defaults=None, gpu=True): diff --git a/milabench/utils.py b/milabench/utils.py index 2e732200d..8495d117e 100644 --- a/milabench/utils.py +++ b/milabench/utils.py @@ -114,7 +114,7 @@ def relativize(pth, working_dir): return pth -def make_constraints_file(pth, constraints, working_dir): +def make_constraints_file(pth, constraints, working_dir, requirements=tuple()): if constraints: constraint_file = XPath(working_dir) / XPath(pth) os.makedirs(constraint_file.parent, exist_ok=True) @@ -122,7 +122,10 @@ def make_constraints_file(pth, constraints, working_dir): # We prefix the constraint with ../ because we are creating a constraint # file in ./.pin/,but containing constraints with paths relative to ./ tfile.write( - "\n".join([f"-c ../{relativize(c, working_dir)}" for c in constraints]) + "\n".join([f"-c ../{relativize(c, working_dir)}" for c in constraints]) + "\n" + ) + tfile.write( + "\n".join([f"-r ../{relativize(r, working_dir)}" for r in requirements]) + "\n" ) return (constraint_file,) else: @@ -231,7 +234,7 @@ def select_nodes(nodes, n): ranked = [] for node in nodes: - if node["main"]: + if node.get("main", False): ranked.insert(0, node) else: ranked.append(node) @@ -242,7 +245,7 @@ def select_nodes(nodes, n): def enumerate_rank(nodes): rank = 1 for node in nodes: - if node["main"]: + if node.get("main", False): yield 0, node else: yield rank, node diff --git a/poetry.lock b/poetry.lock index 037d00a5c..1276e7e8f 100644 --- a/poetry.lock +++ b/poetry.lock @@ -682,17 +682,17 @@ test = ["objgraph", "psutil"] [[package]] name = "hrepr" -version = "0.4.1" +version = "0.7.3" description = "Extensible HTML representation for Python objects." 
optional = false -python-versions = ">=3.6,<4.0" +python-versions = ">=3.9" files = [ - {file = "hrepr-0.4.1-py3-none-any.whl", hash = "sha256:b1a010a8be820cbc2aba41863831985001319961cb303a59134472ec9df5972a"}, - {file = "hrepr-0.4.1.tar.gz", hash = "sha256:52c2d379c08992f236a5004a8cb86716f7ccf8fb367b043af3704ffc97d04bc4"}, + {file = "hrepr-0.7.3-py3-none-any.whl", hash = "sha256:ad6ce531ee97ed280d79a3235a3b67008ecd4cdd921941c097ce1fbb8912ffd1"}, + {file = "hrepr-0.7.3.tar.gz", hash = "sha256:9b0f8480d0bec912dd16b8f06d7008c9bfd9408508df81465703aab4c35024a8"}, ] [package.dependencies] -ovld = ">=0.3.2,<0.4.0" +ovld = ">=0.3.6,<0.4.0" [[package]] name = "idna" @@ -1026,13 +1026,13 @@ PyYAML = ">=5.1.0" [[package]] name = "ovld" -version = "0.3.5" +version = "0.3.9" description = "Overloading Python functions" optional = false -python-versions = "<4.0,>=3.8" +python-versions = ">=3.8" files = [ - {file = "ovld-0.3.5-py3-none-any.whl", hash = "sha256:d36604a9ff7202d5639ebefd6ff97955ce5b04ffff0c7f0ade6ddc3189ca9846"}, - {file = "ovld-0.3.5.tar.gz", hash = "sha256:838358bc800d5bf3a66afcd6d59f0826eda7a598f48f885a9c8662169ef29813"}, + {file = "ovld-0.3.9-py3-none-any.whl", hash = "sha256:41c9c6555dc7749f71a020dcbc335dd834585876bfbb09d27fd9a5be40bb6e57"}, + {file = "ovld-0.3.9.tar.gz", hash = "sha256:ef7eda584f62266fb3260345a91f0d888b938652fc790f3a95b349237e262f0b"}, ] [[package]] @@ -1450,17 +1450,6 @@ snappy = ["python-snappy"] test = ["pytest (>=7)"] zstd = ["zstandard"] -[[package]] -name = "pynvml" -version = "11.5.3" -description = "Python utilities for the NVIDIA Management Library" -optional = false -python-versions = ">=3.6" -files = [ - {file = "pynvml-11.5.3-py3-none-any.whl", hash = "sha256:a5fba3ab14febda50d19dbda012ef62ae0aed45b7ccc07af0bc5be79223e450c"}, - {file = "pynvml-11.5.3.tar.gz", hash = "sha256:183d223ae487e5f00402d8da06c68c978ef8a9295793ee75559839c6ade7b229"}, -] - [[package]] name = "pyproject-hooks" version = "1.1.0" @@ -2201,4 +2190,4 @@ test = ["big-O", "importlib-resources", "jaraco.functools", "jaraco.itertools", [metadata] lock-version = "2.0" python-versions = ">=3.10,<4.0" -content-hash = "c7d08e99853b23573817ead28e8e40883529dd95f0646a35ed1eed96daf4e2b9" +content-hash = "b0283769e6ab814b9c62b13d6dc68f01dbc27156b8d0cb0f03f1490aaaf384e6" diff --git a/pyproject.toml b/pyproject.toml index 802ce02ff..0d4a6d62d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -20,14 +20,11 @@ requests = "^2.26.0" nox = "^2021.10.1" GitPython = "^3.1.24" PyYAML = "^6.0" -ovld = "^0.3.2" -hrepr = "^0.4.0" blessed = "^1.19.1" pathspec = "^0.9.0" cp-template = "^0.3.0" pandas = ">=1.4.2" numpy = ">=1.23.0,<2.0.0" -pynvml = "^11.4.1" tqdm = "^4.64.1" pip-tools = "^7.4.1" rich = "^13.3.2" @@ -39,6 +36,7 @@ py-cpuinfo = "^9.0.0" psutil = "^5.9.5" importlib-resources = "^6.1.0" filelock = "^3.15.3" +hrepr = ">=0.7.0" [tool.poetry.group.dev.dependencies] black = ">=21.10b0" diff --git a/scripts/article/run_cuda.sh b/scripts/article/run_cuda.sh index 405d7e3fd..c8c151d80 100644 --- a/scripts/article/run_cuda.sh +++ b/scripts/article/run_cuda.sh @@ -31,7 +31,7 @@ install_prepare() { if [ -z "${MILABENCH_SOURCE}" ]; then if [ ! 
-d "$MILABENCH_WORDIR/milabench" ]; then - git clone https://github.com/mila-iqia/milabench.git + git clone https://github.com/mila-iqia/milabench.git -b staging fi export MILABENCH_SOURCE="$MILABENCH_WORDIR/milabench" fi @@ -40,10 +40,12 @@ install_prepare() { pip install -e $MILABENCH_SOURCE + milabench slurm_system > $MILABENCH_WORDIR/system.yaml + # # Install milabench's benchmarks in their venv # - milabench install $ARGS + milabench install --system $MILABENCH_WORDIR/system.yaml $ARGS which pip @@ -60,7 +62,7 @@ install_prepare() { # # Generate/download datasets, download models etc... - milabench prepare $ARGS + milabench prepare --system $MILABENCH_WORDIR/system.yaml $ARGS } module load cuda/12.3.2 @@ -78,7 +80,7 @@ if [ "$MILABENCH_PREPARE" -eq 0 ]; then # # Run the benchmakrs - milabench run "$@" + milabench run --system $MILABENCH_WORDIR/system.yaml "$@" # # Display report diff --git a/scripts/article/run_cuda_dev.sh b/scripts/article/run_cuda_dev.sh index 7980d41d4..c21730b7a 100644 --- a/scripts/article/run_cuda_dev.sh +++ b/scripts/article/run_cuda_dev.sh @@ -3,9 +3,16 @@ set -ex # export MILABENCH_SOURCE=$HOME/milabench +# +# # put those on the shared drived +# export MILABENCH_DIRS_DATA=/home/mila/d/delaunap/scratch/milabench/data +# export MILABENCH_DIRS_VENV=/home/mila/d/delaunap/scratch/milabench/venv +# export MILABENCH_DIRS_RUNS=/home/mila/d/delaunap/scratch/milabench/runs +# +# # mkdir /tmp/workspace && cd /tmp/workspace # conda activate py310 -# +# bash $HOME/milabench/scripts/article/run_cuda_dev.sh # export MILABENCH_GPU_ARCH=cuda @@ -14,8 +21,13 @@ export MILABENCH_WORDIR="$(pwd)/$MILABENCH_GPU_ARCH" export MILABENCH_BASE="$MILABENCH_WORDIR/results" export MILABENCH_CONFIG="$MILABENCH_WORDIR/milabench/config/standard.yaml" export MILABENCH_VENV="$MILABENCH_WORDIR/env" -export BENCHMARK_VENV="$MILABENCH_WORDIR/results/venv/torch" +export MILABENCH_SYSTEM="$MILABENCH_WORDIR/system.yaml" +if [ -z "${MILABENCH_DIRS_VENV}" ]; then + export BENCHMARK_VENV="$MILABENCH_WORDIR/results/venv/torch" +else + export BENCHMARK_VENV="$MILABENCH_DIRS_VENV/"'${install_group}' +fi if [ -z "${MILABENCH_PREPARE}" ]; then export MILABENCH_PREPARE=0 @@ -51,20 +63,25 @@ install_prepare() { . $MILABENCH_WORDIR/env/bin/activate pip install -e $MILABENCH_SOURCE - # milabench pin --variant cuda --from-scratch "$@" + # need torch for pinning + pip install torch + milabench pin --variant cuda --from-scratch "$@" + + milabench slurm_system > $MILABENCH_WORDIR/system.yaml # # Install milabench's benchmarks in their venv # - milabench install "$@" + milabench install --system $MILABENCH_WORDIR/system.yaml "$@" which pip # pip install -e $MILABENCH_WORDIR/voir # pip install -e $MILABENCH_WORDIR/torchcompat ( - . $BENCHMARK_VENV/bin/activate - which pip + echo "Pass" + # . $BENCHMARK_VENV/bin/activate + # which pip #pip install -e $MILABENCH_WORDIR/voir # pip install -e $MILABENCH_WORDIR/torchcompat # pip install torch torchvision torchaudio @@ -79,12 +96,12 @@ install_prepare() { # # Generate/download datasets, download models etc... - milabench prepare "$@" + milabench prepare --system $MILABENCH_WORDIR/system.yaml "$@" } module load cuda/12.3.2 -if [ ! -d "$MILABENCH_WORDIR/results/venv/torch" ]; then +if [ ! -d "$MILABENCH_VENV" ]; then install_prepare else echo "Reusing previous install" @@ -92,29 +109,16 @@ else fi -( - . $MILABENCH_WORDIR/env/bin/activate - pip show setuptools - pip show pip - pip install git+https://github.com/Delaunay/voir.git@patch-8 -) - -( - . 
$BENCHMARK_VENV/bin/activate - pip show setuptools - pip show pip - pip install git+https://github.com/Delaunay/voir.git@patch-8 -) - - if [ "$MILABENCH_PREPARE" -eq 0 ]; then cd $MILABENCH_WORDIR + # milabench prepare --system $MILABENCH_WORDIR/system.yaml "$@" + # milabench prepare "$@" # # Run the benchmakrs - milabench run "$@" + milabench run --system $MILABENCH_WORDIR/system.yaml "$@" # # Display report diff --git a/scripts/article/run_rocm.sh b/scripts/article/run_rocm.sh index 79e736c20..b8a15fb76 100644 --- a/scripts/article/run_rocm.sh +++ b/scripts/article/run_rocm.sh @@ -31,7 +31,7 @@ install_prepare() { # Override/add package to milabench venv here # which pip - # pip install ... + pip uninstall pynvml ( . $BENCHMARK_VENV/bin/activate @@ -41,7 +41,24 @@ install_prepare() { # which pip pip uninstall torch torchvision torchaudio - pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/rocm6.0 + pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/rocm6.1 + pip uninstall pynvml + + # sudo apt-get install lld + # https://github.com/ROCm/jax/releases/tag/rocm-jaxlib-v0.4.30 + # does not really work + pip install https://github.com/ROCm/jax/releases/download/rocm-jaxlib-v0.4.30/jaxlib-0.4.30+rocm611-cp310-cp310-manylinux2014_x86_64.whl + pip install https://github.com/ROCm/jax/archive/refs/tags/rocm-jaxlib-v0.4.30.tar.gz + + # + FORCE_CUDA=1 pip install -U -v --no-build-isolation git+https://github.com/rusty1s/pytorch_cluster.git + FORCE_CUDA=1 pip install -U -v --no-build-isolation git+https://github.com/rusty1s/pytorch_scatter.git + FORCE_CUDA=1 pip install -U -v --no-build-isolation git+https://github.com/rusty1s/pytorch_sparse.git + + # takes forever to compile + # https://github.com/ROCm/xformers + pip install -v -U --no-build-isolation --no-deps git+https://github.com/ROCm/xformers.git@develop#egg=xformers + pip install -v -U --no-build-isolation --no-deps git+https://github.com/ROCm/flash-attention.git ) # diff --git a/scripts/article/run_update_batch_size.sh b/scripts/article/run_update_batch_size.sh new file mode 100644 index 000000000..f839f952d --- /dev/null +++ b/scripts/article/run_update_batch_size.sh @@ -0,0 +1,33 @@ + + + + +export MILABENCH_SIZER_AUTO=1 +export MILABENCH_SIZER_BATCH_SIZE=1 +FINAL_OUTPUT="$HOME/batch_x_worker" +export MILABENCH_SIZER_SAVE="$FINAL_OUTPUT/scaling.yaml" +milabench run --system $MILABENCH_WORDIR/system.yaml --exclude llama + +export MILABENCH_SIZER_AUTO=1 +export MILABENCH_SIZER_BATCH_SIZE=2 +FINAL_OUTPUT="$HOME/batch_x_worker" +export MILABENCH_SIZER_SAVE="$FINAL_OUTPUT/scaling.yaml" +milabench run --system $MILABENCH_WORDIR/system.yaml --exclude llama + +export MILABENCH_SIZER_AUTO=1 +export MILABENCH_SIZER_BATCH_SIZE=4 +FINAL_OUTPUT="$HOME/batch_x_worker" +export MILABENCH_SIZER_SAVE="$FINAL_OUTPUT/scaling.yaml" +milabench run --system $MILABENCH_WORDIR/system.yaml --exclude llama + +export MILABENCH_SIZER_AUTO=1 +export MILABENCH_SIZER_BATCH_SIZE=8 +FINAL_OUTPUT="$HOME/batch_x_worker" +export MILABENCH_SIZER_SAVE="$FINAL_OUTPUT/scaling.yaml" +milabench run --system $MILABENCH_WORDIR/system.yaml --exclude llama + +export MILABENCH_SIZER_AUTO=1 +export MILABENCH_SIZER_BATCH_SIZE=16 +FINAL_OUTPUT="$HOME/batch_x_worker" +export MILABENCH_SIZER_SAVE="$FINAL_OUTPUT/scaling.yaml" +milabench run --system $MILABENCH_WORDIR/system.yaml --exclude llama \ No newline at end of file diff --git a/tests/test_command_reg/test_command_reg_one_node.txt 
b/tests/test_command_reg/test_command_reg_one_node.txt index 7f1d5dc83..f3ff218ae 100644 --- a/tests/test_command_reg/test_command_reg_one_node.txt +++ b/tests/test_command_reg/test_command_reg_one_node.txt @@ -15,8 +15,8 @@ export MILABENCH_DIR_DATA=$BASE/data export MILABENCH_DIR_RUNS=$BASE/runs export MILABENCH_DIR_EXTRA=$BASE/extra/llm export MILABENCH_DIR_CACHE=$BASE/cache -export OMP_NUM_THREADS=4 -export MILABENCH_CONFIG='{"system": {"arch": "cuda", "sshkey": null, "nodes": [{"ip": "127.0.0.1", "main": true, "name": "0", "port": 22, "user": "username", "hostname": "127.0.0.1", "aliaslist": [], "ipaddrlist": ["127.0.0.1"], "local": true}], "self": {"ip": "127.0.0.1", "main": true, "name": "0", "port": 22, "user": "username", "hostname": "127.0.0.1", "aliaslist": [], "ipaddrlist": ["127.0.0.1"], "local": true}}, "dirs": {"base": "$BASE", "venv": "$BASE/venv/torch", "data": "$BASE/data", "runs": "$BASE/runs", "extra": "$BASE/extra/llm", "cache": "$BASE/cache"}, "group": "llm", "install_group": "torch", "install_variant": "cuda", "run_name": "dev", "enabled": true, "capabilities": {"nodes": 1}, "max_duration": 800, "voir": {"options": {"stop": 30, "interval": "1s"}}, "validation": {"usage": {"gpu_load_threshold": 0.5, "gpu_mem_threshold": 0.5}}, "config_base": "$SRC/milabench/config", "config_file": "$SRC/milabench/config/standard.yaml", "definition": "$SRC/milabench/benchmarks/llama", "tags": ["inference", "llm", "nlp"], "plan": {"method": "per_gpu"}, "weight": 1.0, "name": "llama", "tag": ["llama"]}' +export OMP_NUM_THREADS=0 +export MILABENCH_CONFIG='{"system": {"arch": "cuda", "sshkey": null, "nodes": [{"ip": "127.0.0.1", "main": true, "name": "0", "sshport": 22, "user": "username", "hostname": "127.0.0.1"}], "self": {"ip": "127.0.0.1", "main": true, "name": "0", "sshport": 22, "user": "username", "hostname": "127.0.0.1"}}, "dirs": {"base": "$BASE", "venv": "$BASE/venv/torch", "data": "$BASE/data", "runs": "$BASE/runs", "extra": "$BASE/extra/llm", "cache": "$BASE/cache"}, "group": "llm", "install_group": "torch", "install_variant": "cuda", "run_name": "dev", "enabled": true, "capabilities": {"nodes": 1}, "max_duration": 800, "voir": {"options": {"stop": 30, "interval": "1s"}}, "validation": {"usage": {"gpu_load_threshold": 0.5, "gpu_mem_threshold": 0.5}}, "config_base": "$SRC/milabench/config", "config_file": "$SRC/milabench/config/standard.yaml", "definition": "$SRC/milabench/benchmarks/llama", "tags": ["inference", "llm", "nlp"], "plan": {"method": "per_gpu"}, "weight": 1.0, "name": "llama", "tag": ["llama"]}' echo "---" echo "llama" @@ -37,14 +37,14 @@ echo "---" echo "fp16" echo "====" time ( - CUDA_VISIBLE_DEVICES=0 $SRC/milabench/benchmarks/flops/activator $BASE/venv/torch $SRC/milabench/benchmarks/flops/main.py --number 30 --repeat 90 --m 8192 --n 8192 --dtype fp16 & - CUDA_VISIBLE_DEVICES=1 $SRC/milabench/benchmarks/flops/activator $BASE/venv/torch $SRC/milabench/benchmarks/flops/main.py --number 30 --repeat 90 --m 8192 --n 8192 --dtype fp16 & - CUDA_VISIBLE_DEVICES=2 $SRC/milabench/benchmarks/flops/activator $BASE/venv/torch $SRC/milabench/benchmarks/flops/main.py --number 30 --repeat 90 --m 8192 --n 8192 --dtype fp16 & - CUDA_VISIBLE_DEVICES=3 $SRC/milabench/benchmarks/flops/activator $BASE/venv/torch $SRC/milabench/benchmarks/flops/main.py --number 30 --repeat 90 --m 8192 --n 8192 --dtype fp16 & - CUDA_VISIBLE_DEVICES=4 $SRC/milabench/benchmarks/flops/activator $BASE/venv/torch $SRC/milabench/benchmarks/flops/main.py --number 30 --repeat 90 --m 8192 --n 8192 --dtype fp16 
& - CUDA_VISIBLE_DEVICES=5 $SRC/milabench/benchmarks/flops/activator $BASE/venv/torch $SRC/milabench/benchmarks/flops/main.py --number 30 --repeat 90 --m 8192 --n 8192 --dtype fp16 & - CUDA_VISIBLE_DEVICES=6 $SRC/milabench/benchmarks/flops/activator $BASE/venv/torch $SRC/milabench/benchmarks/flops/main.py --number 30 --repeat 90 --m 8192 --n 8192 --dtype fp16 & - CUDA_VISIBLE_DEVICES=7 $SRC/milabench/benchmarks/flops/activator $BASE/venv/torch $SRC/milabench/benchmarks/flops/main.py --number 30 --repeat 90 --m 8192 --n 8192 --dtype fp16 & + CUDA_VISIBLE_DEVICES=0 $SRC/milabench/milabench/scripts/activator $BASE/venv/torch $BASE/cache $SRC/milabench/benchmarks/flops/main.py --number 30 --repeat 90 --m 8192 --n 8192 --dtype fp16 & + CUDA_VISIBLE_DEVICES=1 $SRC/milabench/milabench/scripts/activator $BASE/venv/torch $BASE/cache $SRC/milabench/benchmarks/flops/main.py --number 30 --repeat 90 --m 8192 --n 8192 --dtype fp16 & + CUDA_VISIBLE_DEVICES=2 $SRC/milabench/milabench/scripts/activator $BASE/venv/torch $BASE/cache $SRC/milabench/benchmarks/flops/main.py --number 30 --repeat 90 --m 8192 --n 8192 --dtype fp16 & + CUDA_VISIBLE_DEVICES=3 $SRC/milabench/milabench/scripts/activator $BASE/venv/torch $BASE/cache $SRC/milabench/benchmarks/flops/main.py --number 30 --repeat 90 --m 8192 --n 8192 --dtype fp16 & + CUDA_VISIBLE_DEVICES=4 $SRC/milabench/milabench/scripts/activator $BASE/venv/torch $BASE/cache $SRC/milabench/benchmarks/flops/main.py --number 30 --repeat 90 --m 8192 --n 8192 --dtype fp16 & + CUDA_VISIBLE_DEVICES=5 $SRC/milabench/milabench/scripts/activator $BASE/venv/torch $BASE/cache $SRC/milabench/benchmarks/flops/main.py --number 30 --repeat 90 --m 8192 --n 8192 --dtype fp16 & + CUDA_VISIBLE_DEVICES=6 $SRC/milabench/milabench/scripts/activator $BASE/venv/torch $BASE/cache $SRC/milabench/benchmarks/flops/main.py --number 30 --repeat 90 --m 8192 --n 8192 --dtype fp16 & + CUDA_VISIBLE_DEVICES=7 $SRC/milabench/milabench/scripts/activator $BASE/venv/torch $BASE/cache $SRC/milabench/benchmarks/flops/main.py --number 30 --repeat 90 --m 8192 --n 8192 --dtype fp16 & wait ) @@ -52,14 +52,14 @@ echo "---" echo "bf16" echo "====" time ( - CUDA_VISIBLE_DEVICES=0 $SRC/milabench/benchmarks/flops/activator $BASE/venv/torch $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype bf16 & - CUDA_VISIBLE_DEVICES=1 $SRC/milabench/benchmarks/flops/activator $BASE/venv/torch $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype bf16 & - CUDA_VISIBLE_DEVICES=2 $SRC/milabench/benchmarks/flops/activator $BASE/venv/torch $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype bf16 & - CUDA_VISIBLE_DEVICES=3 $SRC/milabench/benchmarks/flops/activator $BASE/venv/torch $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype bf16 & - CUDA_VISIBLE_DEVICES=4 $SRC/milabench/benchmarks/flops/activator $BASE/venv/torch $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype bf16 & - CUDA_VISIBLE_DEVICES=5 $SRC/milabench/benchmarks/flops/activator $BASE/venv/torch $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype bf16 & - CUDA_VISIBLE_DEVICES=6 $SRC/milabench/benchmarks/flops/activator $BASE/venv/torch $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype bf16 & - CUDA_VISIBLE_DEVICES=7 $SRC/milabench/benchmarks/flops/activator $BASE/venv/torch 
$SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype bf16 & + CUDA_VISIBLE_DEVICES=0 $SRC/milabench/milabench/scripts/activator $BASE/venv/torch $BASE/cache $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype bf16 & + CUDA_VISIBLE_DEVICES=1 $SRC/milabench/milabench/scripts/activator $BASE/venv/torch $BASE/cache $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype bf16 & + CUDA_VISIBLE_DEVICES=2 $SRC/milabench/milabench/scripts/activator $BASE/venv/torch $BASE/cache $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype bf16 & + CUDA_VISIBLE_DEVICES=3 $SRC/milabench/milabench/scripts/activator $BASE/venv/torch $BASE/cache $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype bf16 & + CUDA_VISIBLE_DEVICES=4 $SRC/milabench/milabench/scripts/activator $BASE/venv/torch $BASE/cache $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype bf16 & + CUDA_VISIBLE_DEVICES=5 $SRC/milabench/milabench/scripts/activator $BASE/venv/torch $BASE/cache $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype bf16 & + CUDA_VISIBLE_DEVICES=6 $SRC/milabench/milabench/scripts/activator $BASE/venv/torch $BASE/cache $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype bf16 & + CUDA_VISIBLE_DEVICES=7 $SRC/milabench/milabench/scripts/activator $BASE/venv/torch $BASE/cache $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype bf16 & wait ) @@ -67,14 +67,14 @@ echo "---" echo "tf32" echo "====" time ( - CUDA_VISIBLE_DEVICES=0 $SRC/milabench/benchmarks/flops/activator $BASE/venv/torch $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype fp32 --tf32 & - CUDA_VISIBLE_DEVICES=1 $SRC/milabench/benchmarks/flops/activator $BASE/venv/torch $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype fp32 --tf32 & - CUDA_VISIBLE_DEVICES=2 $SRC/milabench/benchmarks/flops/activator $BASE/venv/torch $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype fp32 --tf32 & - CUDA_VISIBLE_DEVICES=3 $SRC/milabench/benchmarks/flops/activator $BASE/venv/torch $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype fp32 --tf32 & - CUDA_VISIBLE_DEVICES=4 $SRC/milabench/benchmarks/flops/activator $BASE/venv/torch $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype fp32 --tf32 & - CUDA_VISIBLE_DEVICES=5 $SRC/milabench/benchmarks/flops/activator $BASE/venv/torch $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype fp32 --tf32 & - CUDA_VISIBLE_DEVICES=6 $SRC/milabench/benchmarks/flops/activator $BASE/venv/torch $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype fp32 --tf32 & - CUDA_VISIBLE_DEVICES=7 $SRC/milabench/benchmarks/flops/activator $BASE/venv/torch $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype fp32 --tf32 & + CUDA_VISIBLE_DEVICES=0 $SRC/milabench/milabench/scripts/activator $BASE/venv/torch $BASE/cache $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype fp32 --tf32 & + CUDA_VISIBLE_DEVICES=1 $SRC/milabench/milabench/scripts/activator $BASE/venv/torch $BASE/cache 
$SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype fp32 --tf32 & + CUDA_VISIBLE_DEVICES=2 $SRC/milabench/milabench/scripts/activator $BASE/venv/torch $BASE/cache $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype fp32 --tf32 & + CUDA_VISIBLE_DEVICES=3 $SRC/milabench/milabench/scripts/activator $BASE/venv/torch $BASE/cache $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype fp32 --tf32 & + CUDA_VISIBLE_DEVICES=4 $SRC/milabench/milabench/scripts/activator $BASE/venv/torch $BASE/cache $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype fp32 --tf32 & + CUDA_VISIBLE_DEVICES=5 $SRC/milabench/milabench/scripts/activator $BASE/venv/torch $BASE/cache $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype fp32 --tf32 & + CUDA_VISIBLE_DEVICES=6 $SRC/milabench/milabench/scripts/activator $BASE/venv/torch $BASE/cache $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype fp32 --tf32 & + CUDA_VISIBLE_DEVICES=7 $SRC/milabench/milabench/scripts/activator $BASE/venv/torch $BASE/cache $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype fp32 --tf32 & wait ) @@ -82,14 +82,14 @@ echo "---" echo "fp32" echo "====" time ( - CUDA_VISIBLE_DEVICES=0 $SRC/milabench/benchmarks/flops/activator $BASE/venv/torch $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype fp32 & - CUDA_VISIBLE_DEVICES=1 $SRC/milabench/benchmarks/flops/activator $BASE/venv/torch $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype fp32 & - CUDA_VISIBLE_DEVICES=2 $SRC/milabench/benchmarks/flops/activator $BASE/venv/torch $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype fp32 & - CUDA_VISIBLE_DEVICES=3 $SRC/milabench/benchmarks/flops/activator $BASE/venv/torch $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype fp32 & - CUDA_VISIBLE_DEVICES=4 $SRC/milabench/benchmarks/flops/activator $BASE/venv/torch $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype fp32 & - CUDA_VISIBLE_DEVICES=5 $SRC/milabench/benchmarks/flops/activator $BASE/venv/torch $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype fp32 & - CUDA_VISIBLE_DEVICES=6 $SRC/milabench/benchmarks/flops/activator $BASE/venv/torch $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype fp32 & - CUDA_VISIBLE_DEVICES=7 $SRC/milabench/benchmarks/flops/activator $BASE/venv/torch $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype fp32 & + CUDA_VISIBLE_DEVICES=0 $SRC/milabench/milabench/scripts/activator $BASE/venv/torch $BASE/cache $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype fp32 & + CUDA_VISIBLE_DEVICES=1 $SRC/milabench/milabench/scripts/activator $BASE/venv/torch $BASE/cache $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype fp32 & + CUDA_VISIBLE_DEVICES=2 $SRC/milabench/milabench/scripts/activator $BASE/venv/torch $BASE/cache $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype fp32 & + CUDA_VISIBLE_DEVICES=3 $SRC/milabench/milabench/scripts/activator $BASE/venv/torch $BASE/cache $SRC/milabench/benchmarks/flops/main.py --number 
10 --repeat 90 --m 8192 --n 8192 --dtype fp32 & + CUDA_VISIBLE_DEVICES=4 $SRC/milabench/milabench/scripts/activator $BASE/venv/torch $BASE/cache $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype fp32 & + CUDA_VISIBLE_DEVICES=5 $SRC/milabench/milabench/scripts/activator $BASE/venv/torch $BASE/cache $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype fp32 & + CUDA_VISIBLE_DEVICES=6 $SRC/milabench/milabench/scripts/activator $BASE/venv/torch $BASE/cache $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype fp32 & + CUDA_VISIBLE_DEVICES=7 $SRC/milabench/milabench/scripts/activator $BASE/venv/torch $BASE/cache $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype fp32 & wait ) @@ -127,7 +127,7 @@ echo "---" echo "resnet152-ddp-gpus" echo "==================" time ( - $SRC/milabench/benchmarks/torchvision_ddp/activator $BASE/venv/torch $SRC/milabench/benchmarks/torchvision_ddp/main.py --epochs 10 --num-workers 8 --loader torch --data $BASE/data/FakeImageNet --model resnet152 --batch-size 256 & + $SRC/milabench/milabench/scripts/activator $BASE/venv/torch $BASE/cache $SRC/milabench/benchmarks/torchvision_ddp/main.py --epochs 10 --num-workers 8 --loader torch --data $BASE/data/FakeImageNet --model resnet152 --batch-size 256 & wait ) @@ -353,7 +353,7 @@ echo "---" echo "diffusion-single" echo "================" time ( - $SRC/milabench/milabench/scripts/activator $BASE/venv/torch accelerate launch --mixed_precision=bf16 --dynamo_backend=no --machine_rank=0 --num_machines=1 --multi_gpu --gradient_accumulation_steps=1 --num_cpu_threads_per_process=4 --main_process_ip=127.0.0.1 --main_process_port=22 --num_processes=8 $SRC/milabench/benchmarks/diffusion/main.py --num_epochs 5 --batch_size 32 --num_workers 8 & + $SRC/milabench/milabench/scripts/activator $BASE/venv/torch $BASE/cache accelerate launch --mixed_precision=bf16 --dynamo_backend=no --machine_rank=0 --num_machines=1 --multi_gpu --gradient_accumulation_steps=1 --num_cpu_threads_per_process=4 --main_process_ip=127.0.0.1 --main_process_port=29400 --num_processes=8 $SRC/milabench/benchmarks/diffusion/main.py --num_epochs 5 --batch_size 32 --num_workers 8 --cache $BASE/cache & wait ) @@ -361,7 +361,7 @@ echo "---" echo "diffusion-gpus" echo "==============" time ( - $SRC/milabench/milabench/scripts/activator $BASE/venv/torch accelerate launch --mixed_precision=bf16 --dynamo_backend=no --machine_rank=0 --num_machines=1 --multi_gpu --gradient_accumulation_steps=1 --num_cpu_threads_per_process=4 --main_process_ip=127.0.0.1 --main_process_port=22 --num_processes=8 $SRC/milabench/benchmarks/diffusion/main.py --num_epochs 5 --batch_size 32 --num_workers 8 & + $SRC/milabench/milabench/scripts/activator $BASE/venv/torch $BASE/cache accelerate launch --mixed_precision=bf16 --dynamo_backend=no --machine_rank=0 --num_machines=1 --multi_gpu --gradient_accumulation_steps=1 --num_cpu_threads_per_process=4 --main_process_ip=127.0.0.1 --main_process_port=29400 --num_processes=8 $SRC/milabench/benchmarks/diffusion/main.py --num_epochs 5 --batch_size 32 --num_workers 8 --cache $BASE/cache & wait ) @@ -369,7 +369,7 @@ echo "---" echo "diffusion-nodes" echo "===============" time ( - $SRC/milabench/milabench/scripts/activator $BASE/venv/torch accelerate launch --mixed_precision=bf16 --dynamo_backend=no --machine_rank=0 --num_machines=1 --multi_gpu --gradient_accumulation_steps=1 --num_cpu_threads_per_process=4 
--main_process_ip=127.0.0.1 --main_process_port=22 --num_processes=8 $SRC/milabench/benchmarks/diffusion/main.py --num_epochs 5 --batch_size 32 --num_workers 8 &
+    $SRC/milabench/milabench/scripts/activator $BASE/venv/torch $BASE/cache accelerate launch --mixed_precision=bf16 --dynamo_backend=no --machine_rank=0 --num_machines=1 --multi_gpu --gradient_accumulation_steps=1 --num_cpu_threads_per_process=4 --main_process_ip=127.0.0.1 --main_process_port=29400 --num_processes=8 $SRC/milabench/benchmarks/diffusion/main.py --num_epochs 5 --batch_size 32 --num_workers 8 --cache $BASE/cache &
     wait
 )
@@ -377,14 +377,14 @@
 echo "---"
 echo "lightning"
 echo "========="
 time (
-    CUDA_VISIBLE_DEVICES=0 python $SRC/milabench/benchmarks/lightning/main.py --epochs 10 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model resnet152 --batch-size 16 &
-    CUDA_VISIBLE_DEVICES=1 python $SRC/milabench/benchmarks/lightning/main.py --epochs 10 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model resnet152 --batch-size 16 &
-    CUDA_VISIBLE_DEVICES=2 python $SRC/milabench/benchmarks/lightning/main.py --epochs 10 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model resnet152 --batch-size 16 &
-    CUDA_VISIBLE_DEVICES=3 python $SRC/milabench/benchmarks/lightning/main.py --epochs 10 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model resnet152 --batch-size 16 &
-    CUDA_VISIBLE_DEVICES=4 python $SRC/milabench/benchmarks/lightning/main.py --epochs 10 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model resnet152 --batch-size 16 &
-    CUDA_VISIBLE_DEVICES=5 python $SRC/milabench/benchmarks/lightning/main.py --epochs 10 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model resnet152 --batch-size 16 &
-    CUDA_VISIBLE_DEVICES=6 python $SRC/milabench/benchmarks/lightning/main.py --epochs 10 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model resnet152 --batch-size 16 &
-    CUDA_VISIBLE_DEVICES=7 python $SRC/milabench/benchmarks/lightning/main.py --epochs 10 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model resnet152 --batch-size 16 &
+    CUDA_VISIBLE_DEVICES=0 python $SRC/milabench/benchmarks/lightning/main.py --epochs 10 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model resnet152 --batch-size 256 &
+    CUDA_VISIBLE_DEVICES=1 python $SRC/milabench/benchmarks/lightning/main.py --epochs 10 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model resnet152 --batch-size 256 &
+    CUDA_VISIBLE_DEVICES=2 python $SRC/milabench/benchmarks/lightning/main.py --epochs 10 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model resnet152 --batch-size 256 &
+    CUDA_VISIBLE_DEVICES=3 python $SRC/milabench/benchmarks/lightning/main.py --epochs 10 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model resnet152 --batch-size 256 &
+    CUDA_VISIBLE_DEVICES=4 python $SRC/milabench/benchmarks/lightning/main.py --epochs 10 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model resnet152 --batch-size 256 &
+    CUDA_VISIBLE_DEVICES=5 python $SRC/milabench/benchmarks/lightning/main.py --epochs 10 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model resnet152 --batch-size 256 &
+    CUDA_VISIBLE_DEVICES=6 python $SRC/milabench/benchmarks/lightning/main.py --epochs 10 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model resnet152 --batch-size 256 &
+    CUDA_VISIBLE_DEVICES=7 python $SRC/milabench/benchmarks/lightning/main.py --epochs 10 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model resnet152 --batch-size 256 &
     wait
 )
@@ -392,7 +392,7 @@
 echo "---"
 echo "lightning-gpus"
 echo "=============="
 time (
-    $BASE/venv/torch/bin/benchrun --nnodes=1 --rdzv-backend=c10d --rdzv-endpoint=127.0.0.1:29400 --master-addr=127.0.0.1 --master-port=29400 --nproc-per-node=8 --no-python -- python $SRC/milabench/benchmarks/lightning/main.py --epochs 10 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model resnet152 --batch-size 16 &
+    $BASE/venv/torch/bin/benchrun --nnodes=1 --rdzv-backend=c10d --rdzv-endpoint=127.0.0.1:29400 --nproc-per-node=8 --no-python -- python $SRC/milabench/benchmarks/lightning/main.py --epochs 10 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model resnet152 --batch-size 256 &
     wait
 )
@@ -400,14 +400,14 @@
 echo "---"
 echo "dinov2-giant-single"
 echo "==================="
 time (
-    CUDA_VISIBLE_DEVICES=0 python $SRC/milabench/benchmarks/dinov2/main.py --output-dir $BASE/extra/dinov2-giant-single/output --no-resume --config-file src/dinov2/configs/train/vitg14.yaml train.dataset_path=ImageNet:split=TRAIN:root=$BASE/data/FakeImageNet:extra=$BASE/data/FakeImageNet train.batch_size_per_gpu=32 train.saveckp_freq=100 train.num_workers=10 &
-    CUDA_VISIBLE_DEVICES=1 python $SRC/milabench/benchmarks/dinov2/main.py --output-dir $BASE/extra/dinov2-giant-single/output --no-resume --config-file src/dinov2/configs/train/vitg14.yaml train.dataset_path=ImageNet:split=TRAIN:root=$BASE/data/FakeImageNet:extra=$BASE/data/FakeImageNet train.batch_size_per_gpu=32 train.saveckp_freq=100 train.num_workers=10 &
-    CUDA_VISIBLE_DEVICES=2 python $SRC/milabench/benchmarks/dinov2/main.py --output-dir $BASE/extra/dinov2-giant-single/output --no-resume --config-file src/dinov2/configs/train/vitg14.yaml train.dataset_path=ImageNet:split=TRAIN:root=$BASE/data/FakeImageNet:extra=$BASE/data/FakeImageNet train.batch_size_per_gpu=32 train.saveckp_freq=100 train.num_workers=10 &
-    CUDA_VISIBLE_DEVICES=3 python $SRC/milabench/benchmarks/dinov2/main.py --output-dir $BASE/extra/dinov2-giant-single/output --no-resume --config-file src/dinov2/configs/train/vitg14.yaml train.dataset_path=ImageNet:split=TRAIN:root=$BASE/data/FakeImageNet:extra=$BASE/data/FakeImageNet train.batch_size_per_gpu=32 train.saveckp_freq=100 train.num_workers=10 &
-    CUDA_VISIBLE_DEVICES=4 python $SRC/milabench/benchmarks/dinov2/main.py --output-dir $BASE/extra/dinov2-giant-single/output --no-resume --config-file src/dinov2/configs/train/vitg14.yaml train.dataset_path=ImageNet:split=TRAIN:root=$BASE/data/FakeImageNet:extra=$BASE/data/FakeImageNet train.batch_size_per_gpu=32 train.saveckp_freq=100 train.num_workers=10 &
-    CUDA_VISIBLE_DEVICES=5 python $SRC/milabench/benchmarks/dinov2/main.py --output-dir $BASE/extra/dinov2-giant-single/output --no-resume --config-file src/dinov2/configs/train/vitg14.yaml train.dataset_path=ImageNet:split=TRAIN:root=$BASE/data/FakeImageNet:extra=$BASE/data/FakeImageNet train.batch_size_per_gpu=32 train.saveckp_freq=100 train.num_workers=10 &
-    CUDA_VISIBLE_DEVICES=6 python $SRC/milabench/benchmarks/dinov2/main.py --output-dir $BASE/extra/dinov2-giant-single/output --no-resume --config-file src/dinov2/configs/train/vitg14.yaml train.dataset_path=ImageNet:split=TRAIN:root=$BASE/data/FakeImageNet:extra=$BASE/data/FakeImageNet train.batch_size_per_gpu=32 train.saveckp_freq=100 train.num_workers=10 &
-    CUDA_VISIBLE_DEVICES=7 python $SRC/milabench/benchmarks/dinov2/main.py --output-dir $BASE/extra/dinov2-giant-single/output --no-resume --config-file src/dinov2/configs/train/vitg14.yaml train.dataset_path=ImageNet:split=TRAIN:root=$BASE/data/FakeImageNet:extra=$BASE/data/FakeImageNet train.batch_size_per_gpu=32 train.saveckp_freq=100 train.num_workers=10 &
+    CUDA_VISIBLE_DEVICES=0 python $SRC/milabench/benchmarks/dinov2/main.py --output-dir $BASE/extra/dinov2-giant-single/output --no-resume --config-file $SRC/milabench/benchmarks/dinov2/src/dinov2/configs/train/vitg14.yaml train.dataset_path=ImageNet:split=TRAIN:root=$BASE/data/FakeImageNet:extra=$BASE/data/FakeImageNet train.batch_size_per_gpu=32 train.saveckp_freq=100 train.num_workers=10 &
+    CUDA_VISIBLE_DEVICES=1 python $SRC/milabench/benchmarks/dinov2/main.py --output-dir $BASE/extra/dinov2-giant-single/output --no-resume --config-file $SRC/milabench/benchmarks/dinov2/src/dinov2/configs/train/vitg14.yaml train.dataset_path=ImageNet:split=TRAIN:root=$BASE/data/FakeImageNet:extra=$BASE/data/FakeImageNet train.batch_size_per_gpu=32 train.saveckp_freq=100 train.num_workers=10 &
+    CUDA_VISIBLE_DEVICES=2 python $SRC/milabench/benchmarks/dinov2/main.py --output-dir $BASE/extra/dinov2-giant-single/output --no-resume --config-file $SRC/milabench/benchmarks/dinov2/src/dinov2/configs/train/vitg14.yaml train.dataset_path=ImageNet:split=TRAIN:root=$BASE/data/FakeImageNet:extra=$BASE/data/FakeImageNet train.batch_size_per_gpu=32 train.saveckp_freq=100 train.num_workers=10 &
+    CUDA_VISIBLE_DEVICES=3 python $SRC/milabench/benchmarks/dinov2/main.py --output-dir $BASE/extra/dinov2-giant-single/output --no-resume --config-file $SRC/milabench/benchmarks/dinov2/src/dinov2/configs/train/vitg14.yaml train.dataset_path=ImageNet:split=TRAIN:root=$BASE/data/FakeImageNet:extra=$BASE/data/FakeImageNet train.batch_size_per_gpu=32 train.saveckp_freq=100 train.num_workers=10 &
+    CUDA_VISIBLE_DEVICES=4 python $SRC/milabench/benchmarks/dinov2/main.py --output-dir $BASE/extra/dinov2-giant-single/output --no-resume --config-file $SRC/milabench/benchmarks/dinov2/src/dinov2/configs/train/vitg14.yaml train.dataset_path=ImageNet:split=TRAIN:root=$BASE/data/FakeImageNet:extra=$BASE/data/FakeImageNet train.batch_size_per_gpu=32 train.saveckp_freq=100 train.num_workers=10 &
+    CUDA_VISIBLE_DEVICES=5 python $SRC/milabench/benchmarks/dinov2/main.py --output-dir $BASE/extra/dinov2-giant-single/output --no-resume --config-file $SRC/milabench/benchmarks/dinov2/src/dinov2/configs/train/vitg14.yaml train.dataset_path=ImageNet:split=TRAIN:root=$BASE/data/FakeImageNet:extra=$BASE/data/FakeImageNet train.batch_size_per_gpu=32 train.saveckp_freq=100 train.num_workers=10 &
+    CUDA_VISIBLE_DEVICES=6 python $SRC/milabench/benchmarks/dinov2/main.py --output-dir $BASE/extra/dinov2-giant-single/output --no-resume --config-file $SRC/milabench/benchmarks/dinov2/src/dinov2/configs/train/vitg14.yaml train.dataset_path=ImageNet:split=TRAIN:root=$BASE/data/FakeImageNet:extra=$BASE/data/FakeImageNet train.batch_size_per_gpu=32 train.saveckp_freq=100 train.num_workers=10 &
+    CUDA_VISIBLE_DEVICES=7 python $SRC/milabench/benchmarks/dinov2/main.py --output-dir $BASE/extra/dinov2-giant-single/output --no-resume --config-file $SRC/milabench/benchmarks/dinov2/src/dinov2/configs/train/vitg14.yaml train.dataset_path=ImageNet:split=TRAIN:root=$BASE/data/FakeImageNet:extra=$BASE/data/FakeImageNet train.batch_size_per_gpu=32 train.saveckp_freq=100 train.num_workers=10 &
     wait
 )
@@ -415,15 +415,7 @@
 echo "---"
 echo "dinov2-giant-gpus"
 echo "================="
 time (
-    $BASE/venv/torch/bin/benchrun --nnodes=1 --rdzv-backend=c10d --rdzv-endpoint=127.0.0.1:29400 --master-addr=127.0.0.1 --master-port=29400 --nproc-per-node=8 --no-python -- python $SRC/milabench/benchmarks/dinov2/main.py --output-dir $BASE/extra/dinov2-giant-gpus/output --no-resume --config-file src/dinov2/configs/train/vitg14.yaml train.dataset_path=ImageNet:split=TRAIN:root=$BASE/data/FakeImageNet:extra=$BASE/data/FakeImageNet train.batch_size_per_gpu=32 train.saveckp_freq=100 train.num_workers=10 &
-    wait
-)
-
-echo "---"
-echo "dinov2-giant-nodes"
-echo "=================="
-time (
-    $BASE/venv/torch/bin/benchrun --nnodes=1 --rdzv-backend=c10d --rdzv-endpoint=127.0.0.1:29400 --master-addr=127.0.0.1 --master-port=29400 --nproc-per-node=8 --no-python -- python $SRC/milabench/benchmarks/dinov2/main.py --output-dir $BASE/extra/dinov2-giant-nodes/output --no-resume --config-file src/dinov2/configs/train/vitg14.yaml train.dataset_path=ImageNet:split=TRAIN:root=$BASE/data/FakeImageNet:extra=$BASE/data/FakeImageNet train.batch_size_per_gpu=32 train.saveckp_freq=100 train.num_workers=10 &
+    $BASE/venv/torch/bin/benchrun --nnodes=1 --rdzv-backend=c10d --rdzv-endpoint=127.0.0.1:29400 --nproc-per-node=8 --no-python -- python $SRC/milabench/benchmarks/dinov2/main.py --output-dir $BASE/extra/dinov2-giant-gpus/output --no-resume --config-file $SRC/milabench/benchmarks/dinov2/src/dinov2/configs/train/vitg14.yaml train.dataset_path=ImageNet:split=TRAIN:root=$BASE/data/FakeImageNet:extra=$BASE/data/FakeImageNet train.batch_size_per_gpu=32 train.saveckp_freq=100 train.num_workers=10 &
     wait
 )
@@ -446,7 +438,7 @@
 echo "---"
 echo "llm-lora-ddp-gpus"
 echo "================="
 time (
-    $BASE/venv/torch/bin/tune run --nproc-per-node=8 -- $SRC/milabench/benchmarks/llm/recipes/lora_finetune_distributed.py --config $SRC/milabench/benchmarks/llm/configs/llama3_8B_lora_single_device.yaml epochs=1 output_dir=$BASE/extra/llm-lora-ddp-gpus/output tokenizer.path=$BASE/data/llama3_8B/original/tokenizer.model checkpointer.checkpoint_dir=$BASE/data/llama3_8B/original checkpointer.output_dir=$BASE/data/llama3_8B/ metric_logger.log_dir=$BASE/extra/llm-lora-ddp-gpus/metrics repo_id="meta-llama/Meta-Llama-3.1-8B" batch_size=8 gradient_accumulation_steps=8 &
+    $BASE/venv/torch/bin/tune run --nnodes=1 --rdzv-backend=c10d --rdzv-endpoint=127.0.0.1:29400 --nproc-per-node=8 -- $SRC/milabench/benchmarks/llm/recipes/lora_finetune_distributed.py --config $SRC/milabench/benchmarks/llm/configs/llama3_8B_lora_single_device.yaml epochs=1 output_dir=$BASE/extra/llm-lora-ddp-gpus/output tokenizer.path=$BASE/data/llama3_8B/original/tokenizer.model checkpointer.checkpoint_dir=$BASE/data/llama3_8B/original checkpointer.output_dir=$BASE/data/llama3_8B/ metric_logger.log_dir=$BASE/extra/llm-lora-ddp-gpus/metrics repo_id="meta-llama/Meta-Llama-3.1-8B" batch_size=8 gradient_accumulation_steps=8 &
     wait
 )
@@ -454,7 +446,7 @@
 echo "---"
 echo "llm-lora-ddp-nodes"
 echo "=================="
 time (
-    $BASE/venv/torch/bin/tune run --nproc-per-node=8 -- $SRC/milabench/benchmarks/llm/recipes/lora_finetune_distributed.py --config $SRC/milabench/benchmarks/llm/configs/llama3_8B_lora_single_device.yaml epochs=1 output_dir=$BASE/extra/llm-lora-ddp-nodes/output tokenizer.path=$BASE/data/llama3_8B/original/tokenizer.model checkpointer.checkpoint_dir=$BASE/data/llama3_8B/original checkpointer.output_dir=$BASE/data/llama3_8B/ metric_logger.log_dir=$BASE/extra/llm-lora-ddp-nodes/metrics repo_id="meta-llama/Meta-Llama-3.1-8B" batch_size=8 gradient_accumulation_steps=8 &
+    $BASE/venv/torch/bin/tune run --nnodes=1 --rdzv-backend=c10d --rdzv-endpoint=127.0.0.1:29400 --nproc-per-node=8 -- $SRC/milabench/benchmarks/llm/recipes/lora_finetune_distributed.py --config $SRC/milabench/benchmarks/llm/configs/llama3_8B_lora_single_device.yaml epochs=1 output_dir=$BASE/extra/llm-lora-ddp-nodes/output tokenizer.path=$BASE/data/llama3_8B/original/tokenizer.model checkpointer.checkpoint_dir=$BASE/data/llama3_8B/original checkpointer.output_dir=$BASE/data/llama3_8B/ metric_logger.log_dir=$BASE/extra/llm-lora-ddp-nodes/metrics repo_id="meta-llama/Meta-Llama-3.1-8B" batch_size=8 gradient_accumulation_steps=8 &
     wait
 )
@@ -462,7 +454,7 @@
 echo "---"
 echo "llm-lora-mp-gpus"
 echo "================"
 time (
-    $BASE/venv/torch/bin/tune run --nproc-per-node=8 -- $SRC/milabench/benchmarks/llm/recipes/lora_finetune_distributed.py --config $SRC/milabench/benchmarks/llm/configs/llama3_70B_lora.yaml epochs=1 output_dir=$BASE/extra/llm-lora-mp-gpus/output tokenizer.path=$BASE/data/llama3_70B/original/tokenizer.model checkpointer.checkpoint_dir=$BASE/data/llama3_70B checkpointer.output_dir=$BASE/data/llama3_70B/ metric_logger.log_dir=$BASE/extra/llm-lora-mp-gpus/metrics repo_id="meta-llama/Meta-Llama-3.1-70B" batch_size=8 gradient_accumulation_steps=1 &
+    $BASE/venv/torch/bin/tune run --nnodes=1 --rdzv-backend=c10d --rdzv-endpoint=127.0.0.1:29400 --nproc-per-node=8 -- $SRC/milabench/benchmarks/llm/recipes/lora_finetune_distributed.py --config $SRC/milabench/benchmarks/llm/configs/llama3_70B_lora.yaml epochs=1 output_dir=$BASE/extra/llm-lora-mp-gpus/output tokenizer.path=$BASE/data/llama3_70B/original/tokenizer.model checkpointer.checkpoint_dir=$BASE/data/llama3_70B checkpointer.output_dir=$BASE/data/llama3_70B/ metric_logger.log_dir=$BASE/extra/llm-lora-mp-gpus/metrics repo_id="meta-llama/Meta-Llama-3.1-70B" batch_size=8 gradient_accumulation_steps=1 &
     wait
 )
@@ -470,7 +462,7 @@
 echo "---"
 echo "llm-full-mp-gpus"
 echo "================"
 time (
-    $BASE/venv/torch/bin/tune run --nproc-per-node=8 -- $SRC/milabench/benchmarks/llm/recipes/full_finetune_distributed.py --config $SRC/milabench/benchmarks/llm/configs/llama3_70B_full.yaml epochs=1 output_dir=$BASE/extra/llm-full-mp-gpus/output tokenizer.path=$BASE/data/llama3_70B/original/tokenizer.model checkpointer.checkpoint_dir=$BASE/data/llama3_70B checkpointer.output_dir=$BASE/data/llama3_70B/ metric_logger.log_dir=$BASE/extra/llm-full-mp-gpus/metrics repo_id="meta-llama/Meta-Llama-3.1-70B" safetensors=true batch_size=2 gradient_accumulation_steps=1 &
+    $BASE/venv/torch/bin/tune run --nnodes=1 --rdzv-backend=c10d --rdzv-endpoint=127.0.0.1:29400 --nproc-per-node=8 -- $SRC/milabench/benchmarks/llm/recipes/full_finetune_distributed.py --config $SRC/milabench/benchmarks/llm/configs/llama3_70B_full.yaml epochs=1 output_dir=$BASE/extra/llm-full-mp-gpus/output tokenizer.path=$BASE/data/llama3_70B/original/tokenizer.model checkpointer.checkpoint_dir=$BASE/data/llama3_70B checkpointer.output_dir=$BASE/data/llama3_70B/ metric_logger.log_dir=$BASE/extra/llm-full-mp-gpus/metrics repo_id="meta-llama/Meta-Llama-3.1-70B" safetensors=true batch_size=2 gradient_accumulation_steps=1 &
     wait
 )
@@ -478,7 +470,52 @@
 echo "---"
 echo "llm-full-mp-nodes"
 echo "================="
 time (
-    $BASE/venv/torch/bin/tune run --nproc-per-node=8 -- $SRC/milabench/benchmarks/llm/recipes/full_finetune_distributed.py --config $SRC/milabench/benchmarks/llm/configs/llama3_70B_full.yaml epochs=1 output_dir=$BASE/extra/llm-full-mp-nodes/output tokenizer.path=$BASE/data/llama3_70B/original/tokenizer.model checkpointer.checkpoint_dir=$BASE/data/llama3_70B checkpointer.output_dir=$BASE/data/llama3_70B/ metric_logger.log_dir=$BASE/extra/llm-full-mp-nodes/metrics repo_id="meta-llama/Meta-Llama-3.1-70B" safetensors=true batch_size=2 gradient_accumulation_steps=1 &
+    $BASE/venv/torch/bin/tune run --nnodes=1 --rdzv-backend=c10d --rdzv-endpoint=127.0.0.1:29400 --nproc-per-node=8 -- $SRC/milabench/benchmarks/llm/recipes/full_finetune_distributed.py --config $SRC/milabench/benchmarks/llm/configs/llama3_70B_full.yaml epochs=1 output_dir=$BASE/extra/llm-full-mp-nodes/output tokenizer.path=$BASE/data/llama3_70B/original/tokenizer.model checkpointer.checkpoint_dir=$BASE/data/llama3_70B checkpointer.output_dir=$BASE/data/llama3_70B/ metric_logger.log_dir=$BASE/extra/llm-full-mp-nodes/metrics repo_id="meta-llama/Meta-Llama-3.1-70B" safetensors=true batch_size=2 gradient_accumulation_steps=1 &
+    wait
+)
+
+echo "---"
+echo "dimenet"
+echo "======="
+time (
+    CUDA_VISIBLE_DEVICES=0 python $SRC/milabench/benchmarks/geo_gnn/main.py --model DimeNet --num-samples 10000 --use3d &
+    CUDA_VISIBLE_DEVICES=1 python $SRC/milabench/benchmarks/geo_gnn/main.py --model DimeNet --num-samples 10000 --use3d &
+    CUDA_VISIBLE_DEVICES=2 python $SRC/milabench/benchmarks/geo_gnn/main.py --model DimeNet --num-samples 10000 --use3d &
+    CUDA_VISIBLE_DEVICES=3 python $SRC/milabench/benchmarks/geo_gnn/main.py --model DimeNet --num-samples 10000 --use3d &
+    CUDA_VISIBLE_DEVICES=4 python $SRC/milabench/benchmarks/geo_gnn/main.py --model DimeNet --num-samples 10000 --use3d &
+    CUDA_VISIBLE_DEVICES=5 python $SRC/milabench/benchmarks/geo_gnn/main.py --model DimeNet --num-samples 10000 --use3d &
+    CUDA_VISIBLE_DEVICES=6 python $SRC/milabench/benchmarks/geo_gnn/main.py --model DimeNet --num-samples 10000 --use3d &
+    CUDA_VISIBLE_DEVICES=7 python $SRC/milabench/benchmarks/geo_gnn/main.py --model DimeNet --num-samples 10000 --use3d &
+    wait
+)
+
+echo "---"
+echo "recursiongfn"
+echo "============"
+time (
+    CUDA_VISIBLE_DEVICES=0 python $SRC/milabench/benchmarks/recursiongfn/main.py --batch_size 128 --num_workers 8 --num_steps 100 --layer_width 128 --num_layers 4 &
+    CUDA_VISIBLE_DEVICES=1 python $SRC/milabench/benchmarks/recursiongfn/main.py --batch_size 128 --num_workers 8 --num_steps 100 --layer_width 128 --num_layers 4 &
+    CUDA_VISIBLE_DEVICES=2 python $SRC/milabench/benchmarks/recursiongfn/main.py --batch_size 128 --num_workers 8 --num_steps 100 --layer_width 128 --num_layers 4 &
+    CUDA_VISIBLE_DEVICES=3 python $SRC/milabench/benchmarks/recursiongfn/main.py --batch_size 128 --num_workers 8 --num_steps 100 --layer_width 128 --num_layers 4 &
+    CUDA_VISIBLE_DEVICES=4 python $SRC/milabench/benchmarks/recursiongfn/main.py --batch_size 128 --num_workers 8 --num_steps 100 --layer_width 128 --num_layers 4 &
+    CUDA_VISIBLE_DEVICES=5 python $SRC/milabench/benchmarks/recursiongfn/main.py --batch_size 128 --num_workers 8 --num_steps 100 --layer_width 128 --num_layers 4 &
+    CUDA_VISIBLE_DEVICES=6 python $SRC/milabench/benchmarks/recursiongfn/main.py --batch_size 128 --num_workers 8 --num_steps 100 --layer_width 128 --num_layers 4 &
+    CUDA_VISIBLE_DEVICES=7 python $SRC/milabench/benchmarks/recursiongfn/main.py --batch_size 128 --num_workers 8 --num_steps 100 --layer_width 128 --num_layers 4 &
+    wait
+)
+
+echo "---"
+echo "torchatari"
+echo "=========="
+time (
+    CUDA_VISIBLE_DEVICES=0 python $SRC/milabench/benchmarks/torchatari/main.py --num-minibatches 16 --update-epochs 4 --num-steps 128 --num-envs 128 --total-timesteps 1000000 --env-id Breakout-v5 &
+    CUDA_VISIBLE_DEVICES=1 python $SRC/milabench/benchmarks/torchatari/main.py --num-minibatches 16 --update-epochs 4 --num-steps 128 --num-envs 128 --total-timesteps 1000000 --env-id Breakout-v5 &
+    CUDA_VISIBLE_DEVICES=2 python $SRC/milabench/benchmarks/torchatari/main.py --num-minibatches 16 --update-epochs 4 --num-steps 128 --num-envs 128 --total-timesteps 1000000 --env-id Breakout-v5 &
+    CUDA_VISIBLE_DEVICES=3 python $SRC/milabench/benchmarks/torchatari/main.py --num-minibatches 16 --update-epochs 4 --num-steps 128 --num-envs 128 --total-timesteps 1000000 --env-id Breakout-v5 &
+    CUDA_VISIBLE_DEVICES=4 python $SRC/milabench/benchmarks/torchatari/main.py --num-minibatches 16 --update-epochs 4 --num-steps 128 --num-envs 128 --total-timesteps 1000000 --env-id Breakout-v5 &
+    CUDA_VISIBLE_DEVICES=5 python $SRC/milabench/benchmarks/torchatari/main.py --num-minibatches 16 --update-epochs 4 --num-steps 128 --num-envs 128 --total-timesteps 1000000 --env-id Breakout-v5 &
+    CUDA_VISIBLE_DEVICES=6 python $SRC/milabench/benchmarks/torchatari/main.py --num-minibatches 16 --update-epochs 4 --num-steps 128 --num-envs 128 --total-timesteps 1000000 --env-id Breakout-v5 &
+    CUDA_VISIBLE_DEVICES=7 python $SRC/milabench/benchmarks/torchatari/main.py --num-minibatches 16 --update-epochs 4 --num-steps 128 --num-envs 128 --total-timesteps 1000000 --env-id Breakout-v5 &
     wait
 )
diff --git a/tests/test_command_reg/test_command_reg_two_nodes.txt b/tests/test_command_reg/test_command_reg_two_nodes.txt
index 479a57859..bda22033e 100644
--- a/tests/test_command_reg/test_command_reg_two_nodes.txt
+++ b/tests/test_command_reg/test_command_reg_two_nodes.txt
@@ -15,8 +15,8 @@
 export MILABENCH_DIR_DATA=$BASE/data
 export MILABENCH_DIR_RUNS=$BASE/runs
 export MILABENCH_DIR_EXTRA=$BASE/extra/llm
 export MILABENCH_DIR_CACHE=$BASE/cache
-export OMP_NUM_THREADS=4
-export MILABENCH_CONFIG='{"system": {"arch": "cuda", "sshkey": null, "nodes": [{"ip": "127.0.0.1", "main": true, "name": "0", "port": 22, "user": "username", "hostname": "127.0.0.1", "aliaslist": [], "ipaddrlist": ["127.0.0.1"], "local": true}, {"ip": "192.168.0.11", "main": false, "name": "1", "port": 22, "user": "username", "hostname": "192.168.0.11", "aliaslist": [], "ipaddrlist": ["192.168.0.11"], "local": false}], "self": {"ip": "127.0.0.1", "main": true, "name": "0", "port": 22, "user": "username", "hostname": "127.0.0.1", "aliaslist": [], "ipaddrlist": ["127.0.0.1"], "local": true}}, "dirs": {"base": "$BASE", "venv": "$BASE/venv/torch", "data": "$BASE/data", "runs": "$BASE/runs", "extra": "$BASE/extra/llm", "cache": "$BASE/cache"}, "group": "llm", "install_group": "torch", "install_variant": "cuda", "run_name": "dev", "enabled": true, "capabilities": {"nodes": 1}, "max_duration": 800, "voir": {"options": {"stop": 30, "interval": "1s"}}, "validation": {"usage": {"gpu_load_threshold": 0.5, "gpu_mem_threshold": 0.5}}, "config_base": "$SRC/milabench/config", "config_file": "$SRC/milabench/config/standard.yaml", "definition": "$SRC/milabench/benchmarks/llama", "tags": ["inference", "llm", "nlp"], "plan": {"method": "per_gpu"}, "weight": 1.0, "name": "llama", "tag": ["llama"]}'
+export OMP_NUM_THREADS=0
+export MILABENCH_CONFIG='{"system": {"arch": "cuda", "sshkey": null, "nodes": [{"ip": "127.0.0.1", "main": true, "name": "0", "sshport": 22, "user": "username", "hostname": "127.0.0.1"}, {"ip": "192.168.0.11", "main": false, "name": "1", "sshport": 22, "user": "username", "hostname": "192.168.0.11"}], "self": {"ip": "127.0.0.1", "main": true, "name": "0", "sshport": 22, "user": "username", "hostname": "127.0.0.1"}}, "dirs": {"base": "$BASE", "venv": "$BASE/venv/torch", "data": "$BASE/data", "runs": "$BASE/runs", "extra": "$BASE/extra/llm", "cache": "$BASE/cache"}, "group": "llm", "install_group": "torch", "install_variant": "cuda", "run_name": "dev", "enabled": true, "capabilities": {"nodes": 1}, "max_duration": 800, "voir": {"options": {"stop": 30, "interval": "1s"}}, "validation": {"usage": {"gpu_load_threshold": 0.5, "gpu_mem_threshold": 0.5}}, "config_base": "$SRC/milabench/config", "config_file": "$SRC/milabench/config/standard.yaml", "definition": "$SRC/milabench/benchmarks/llama", "tags": ["inference", "llm", "nlp"], "plan": {"method": "per_gpu"}, "weight": 1.0, "name": "llama", "tag": ["llama"]}'
 echo "---"
 echo "llama"
@@ -37,14 +37,14 @@
 echo "---"
 echo "fp16"
 echo "===="
 time (
-    CUDA_VISIBLE_DEVICES=0 $SRC/milabench/benchmarks/flops/activator $BASE/venv/torch $SRC/milabench/benchmarks/flops/main.py --number 30 --repeat 90 --m 8192 --n 8192 --dtype fp16 &
-    CUDA_VISIBLE_DEVICES=1 $SRC/milabench/benchmarks/flops/activator $BASE/venv/torch $SRC/milabench/benchmarks/flops/main.py --number 30 --repeat 90 --m 8192 --n 8192 --dtype fp16 &
-    CUDA_VISIBLE_DEVICES=2 $SRC/milabench/benchmarks/flops/activator $BASE/venv/torch $SRC/milabench/benchmarks/flops/main.py --number 30 --repeat 90 --m 8192 --n 8192 --dtype fp16 &
-    CUDA_VISIBLE_DEVICES=3 $SRC/milabench/benchmarks/flops/activator $BASE/venv/torch $SRC/milabench/benchmarks/flops/main.py --number 30 --repeat 90 --m 8192 --n 8192 --dtype fp16 &
-    CUDA_VISIBLE_DEVICES=4 $SRC/milabench/benchmarks/flops/activator $BASE/venv/torch $SRC/milabench/benchmarks/flops/main.py --number 30 --repeat 90 --m 8192 --n 8192 --dtype fp16 &
-    CUDA_VISIBLE_DEVICES=5 $SRC/milabench/benchmarks/flops/activator $BASE/venv/torch $SRC/milabench/benchmarks/flops/main.py --number 30 --repeat 90 --m 8192 --n 8192 --dtype fp16 &
-    CUDA_VISIBLE_DEVICES=6 $SRC/milabench/benchmarks/flops/activator $BASE/venv/torch $SRC/milabench/benchmarks/flops/main.py --number 30 --repeat 90 --m 8192 --n 8192 --dtype fp16 &
-    CUDA_VISIBLE_DEVICES=7 $SRC/milabench/benchmarks/flops/activator $BASE/venv/torch $SRC/milabench/benchmarks/flops/main.py --number 30 --repeat 90 --m 8192 --n 8192 --dtype fp16 &
+    CUDA_VISIBLE_DEVICES=0 $SRC/milabench/milabench/scripts/activator $BASE/venv/torch $BASE/cache $SRC/milabench/benchmarks/flops/main.py --number 30 --repeat 90 --m 8192 --n 8192 --dtype fp16 &
+    CUDA_VISIBLE_DEVICES=1 $SRC/milabench/milabench/scripts/activator $BASE/venv/torch $BASE/cache $SRC/milabench/benchmarks/flops/main.py --number 30 --repeat 90 --m 8192 --n 8192 --dtype fp16 &
+    CUDA_VISIBLE_DEVICES=2 $SRC/milabench/milabench/scripts/activator $BASE/venv/torch $BASE/cache $SRC/milabench/benchmarks/flops/main.py --number 30 --repeat 90 --m 8192 --n 8192 --dtype fp16 &
+    CUDA_VISIBLE_DEVICES=3 $SRC/milabench/milabench/scripts/activator $BASE/venv/torch $BASE/cache $SRC/milabench/benchmarks/flops/main.py --number 30 --repeat 90 --m 8192 --n 8192 --dtype fp16 &
+    CUDA_VISIBLE_DEVICES=4 $SRC/milabench/milabench/scripts/activator $BASE/venv/torch $BASE/cache $SRC/milabench/benchmarks/flops/main.py --number 30 --repeat 90 --m 8192 --n 8192 --dtype fp16 &
+    CUDA_VISIBLE_DEVICES=5 $SRC/milabench/milabench/scripts/activator $BASE/venv/torch $BASE/cache $SRC/milabench/benchmarks/flops/main.py --number 30 --repeat 90 --m 8192 --n 8192 --dtype fp16 &
+    CUDA_VISIBLE_DEVICES=6 $SRC/milabench/milabench/scripts/activator $BASE/venv/torch $BASE/cache $SRC/milabench/benchmarks/flops/main.py --number 30 --repeat 90 --m 8192 --n 8192 --dtype fp16 &
+    CUDA_VISIBLE_DEVICES=7 $SRC/milabench/milabench/scripts/activator $BASE/venv/torch $BASE/cache $SRC/milabench/benchmarks/flops/main.py --number 30 --repeat 90 --m 8192 --n 8192 --dtype fp16 &
     wait
 )
@@ -52,14 +52,14 @@
 echo "---"
 echo "bf16"
 echo "===="
 time (
-    CUDA_VISIBLE_DEVICES=0 $SRC/milabench/benchmarks/flops/activator $BASE/venv/torch $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype bf16 &
-    CUDA_VISIBLE_DEVICES=1 $SRC/milabench/benchmarks/flops/activator $BASE/venv/torch $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype bf16 &
-    CUDA_VISIBLE_DEVICES=2 $SRC/milabench/benchmarks/flops/activator $BASE/venv/torch $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype bf16 &
-    CUDA_VISIBLE_DEVICES=3 $SRC/milabench/benchmarks/flops/activator $BASE/venv/torch $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype bf16 &
-    CUDA_VISIBLE_DEVICES=4 $SRC/milabench/benchmarks/flops/activator $BASE/venv/torch $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype bf16 &
-    CUDA_VISIBLE_DEVICES=5 $SRC/milabench/benchmarks/flops/activator $BASE/venv/torch $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype bf16 &
-    CUDA_VISIBLE_DEVICES=6 $SRC/milabench/benchmarks/flops/activator $BASE/venv/torch $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype bf16 &
-    CUDA_VISIBLE_DEVICES=7 $SRC/milabench/benchmarks/flops/activator $BASE/venv/torch $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype bf16 &
+    CUDA_VISIBLE_DEVICES=0 $SRC/milabench/milabench/scripts/activator $BASE/venv/torch $BASE/cache $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype bf16 &
+    CUDA_VISIBLE_DEVICES=1 $SRC/milabench/milabench/scripts/activator $BASE/venv/torch $BASE/cache $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype bf16 &
+    CUDA_VISIBLE_DEVICES=2 $SRC/milabench/milabench/scripts/activator $BASE/venv/torch $BASE/cache $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype bf16 &
+    CUDA_VISIBLE_DEVICES=3 $SRC/milabench/milabench/scripts/activator $BASE/venv/torch $BASE/cache $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype bf16 &
+    CUDA_VISIBLE_DEVICES=4 $SRC/milabench/milabench/scripts/activator $BASE/venv/torch $BASE/cache $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype bf16 &
+    CUDA_VISIBLE_DEVICES=5 $SRC/milabench/milabench/scripts/activator $BASE/venv/torch $BASE/cache $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype bf16 &
+    CUDA_VISIBLE_DEVICES=6 $SRC/milabench/milabench/scripts/activator $BASE/venv/torch $BASE/cache $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype bf16 &
+    CUDA_VISIBLE_DEVICES=7 $SRC/milabench/milabench/scripts/activator $BASE/venv/torch $BASE/cache $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype bf16 &
     wait
 )
@@ -67,14 +67,14 @@
 echo "---"
 echo "tf32"
 echo "===="
 time (
-    CUDA_VISIBLE_DEVICES=0 $SRC/milabench/benchmarks/flops/activator $BASE/venv/torch $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype fp32 --tf32 &
-    CUDA_VISIBLE_DEVICES=1 $SRC/milabench/benchmarks/flops/activator $BASE/venv/torch $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype fp32 --tf32 &
-    CUDA_VISIBLE_DEVICES=2 $SRC/milabench/benchmarks/flops/activator $BASE/venv/torch $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype fp32 --tf32 &
-    CUDA_VISIBLE_DEVICES=3 $SRC/milabench/benchmarks/flops/activator $BASE/venv/torch $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype fp32 --tf32 &
-    CUDA_VISIBLE_DEVICES=4 $SRC/milabench/benchmarks/flops/activator $BASE/venv/torch $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype fp32 --tf32 &
-    CUDA_VISIBLE_DEVICES=5 $SRC/milabench/benchmarks/flops/activator $BASE/venv/torch $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype fp32 --tf32 &
-    CUDA_VISIBLE_DEVICES=6 $SRC/milabench/benchmarks/flops/activator $BASE/venv/torch $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype fp32 --tf32 &
-    CUDA_VISIBLE_DEVICES=7 $SRC/milabench/benchmarks/flops/activator $BASE/venv/torch $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype fp32 --tf32 &
+    CUDA_VISIBLE_DEVICES=0 $SRC/milabench/milabench/scripts/activator $BASE/venv/torch $BASE/cache $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype fp32 --tf32 &
+    CUDA_VISIBLE_DEVICES=1 $SRC/milabench/milabench/scripts/activator $BASE/venv/torch $BASE/cache $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype fp32 --tf32 &
+    CUDA_VISIBLE_DEVICES=2 $SRC/milabench/milabench/scripts/activator $BASE/venv/torch $BASE/cache $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype fp32 --tf32 &
+    CUDA_VISIBLE_DEVICES=3 $SRC/milabench/milabench/scripts/activator $BASE/venv/torch $BASE/cache $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype fp32 --tf32 &
+    CUDA_VISIBLE_DEVICES=4 $SRC/milabench/milabench/scripts/activator $BASE/venv/torch $BASE/cache $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype fp32 --tf32 &
+    CUDA_VISIBLE_DEVICES=5 $SRC/milabench/milabench/scripts/activator $BASE/venv/torch $BASE/cache $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype fp32 --tf32 &
+    CUDA_VISIBLE_DEVICES=6 $SRC/milabench/milabench/scripts/activator $BASE/venv/torch $BASE/cache $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype fp32 --tf32 &
+    CUDA_VISIBLE_DEVICES=7 $SRC/milabench/milabench/scripts/activator $BASE/venv/torch $BASE/cache $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype fp32 --tf32 &
     wait
 )
@@ -82,14 +82,14 @@
 echo "---"
 echo "fp32"
 echo "===="
 time (
-    CUDA_VISIBLE_DEVICES=0 $SRC/milabench/benchmarks/flops/activator $BASE/venv/torch $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype fp32 &
-    CUDA_VISIBLE_DEVICES=1 $SRC/milabench/benchmarks/flops/activator $BASE/venv/torch $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype fp32 &
-    CUDA_VISIBLE_DEVICES=2 $SRC/milabench/benchmarks/flops/activator $BASE/venv/torch $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype fp32 &
-    CUDA_VISIBLE_DEVICES=3 $SRC/milabench/benchmarks/flops/activator $BASE/venv/torch $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype fp32 &
-    CUDA_VISIBLE_DEVICES=4 $SRC/milabench/benchmarks/flops/activator $BASE/venv/torch $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype fp32 &
-    CUDA_VISIBLE_DEVICES=5 $SRC/milabench/benchmarks/flops/activator $BASE/venv/torch $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype fp32 &
-    CUDA_VISIBLE_DEVICES=6 $SRC/milabench/benchmarks/flops/activator $BASE/venv/torch $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype fp32 &
-    CUDA_VISIBLE_DEVICES=7 $SRC/milabench/benchmarks/flops/activator $BASE/venv/torch $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype fp32 &
+    CUDA_VISIBLE_DEVICES=0 $SRC/milabench/milabench/scripts/activator $BASE/venv/torch $BASE/cache $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype fp32 &
+    CUDA_VISIBLE_DEVICES=1 $SRC/milabench/milabench/scripts/activator $BASE/venv/torch $BASE/cache $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype fp32 &
+    CUDA_VISIBLE_DEVICES=2 $SRC/milabench/milabench/scripts/activator $BASE/venv/torch $BASE/cache $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype fp32 &
+    CUDA_VISIBLE_DEVICES=3 $SRC/milabench/milabench/scripts/activator $BASE/venv/torch $BASE/cache $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype fp32 &
+    CUDA_VISIBLE_DEVICES=4 $SRC/milabench/milabench/scripts/activator $BASE/venv/torch $BASE/cache $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype fp32 &
+    CUDA_VISIBLE_DEVICES=5 $SRC/milabench/milabench/scripts/activator $BASE/venv/torch $BASE/cache $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype fp32 &
+    CUDA_VISIBLE_DEVICES=6 $SRC/milabench/milabench/scripts/activator $BASE/venv/torch $BASE/cache $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype fp32 &
+    CUDA_VISIBLE_DEVICES=7 $SRC/milabench/milabench/scripts/activator $BASE/venv/torch $BASE/cache $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype fp32 &
     wait
 )
@@ -127,7 +127,7 @@
 echo "---"
 echo "resnet152-ddp-gpus"
 echo "=================="
 time (
-    $SRC/milabench/benchmarks/torchvision_ddp/activator $BASE/venv/torch $SRC/milabench/benchmarks/torchvision_ddp/main.py --epochs 10 --num-workers 8 --loader torch --data $BASE/data/FakeImageNet --model resnet152 --batch-size 256 &
+    $SRC/milabench/milabench/scripts/activator $BASE/venv/torch $BASE/cache $SRC/milabench/benchmarks/torchvision_ddp/main.py --epochs 10 --num-workers 8 --loader torch --data $BASE/data/FakeImageNet --model resnet152 --batch-size 256 &
     wait
 )
@@ -353,7 +353,7 @@
 echo "---"
 echo "diffusion-single"
 echo "================"
 time (
-    $SRC/milabench/milabench/scripts/activator $BASE/venv/torch accelerate launch --mixed_precision=bf16 --dynamo_backend=no --machine_rank=0 --num_machines=1 --multi_gpu --gradient_accumulation_steps=1 --num_cpu_threads_per_process=4 --main_process_ip=127.0.0.1 --main_process_port=22 --num_processes=8 $SRC/milabench/benchmarks/diffusion/main.py --num_epochs 5 --batch_size 32 --num_workers 8 &
+    $SRC/milabench/milabench/scripts/activator $BASE/venv/torch $BASE/cache accelerate launch --mixed_precision=bf16 --dynamo_backend=no --machine_rank=0 --num_machines=1 --multi_gpu --gradient_accumulation_steps=1 --num_cpu_threads_per_process=4 --main_process_ip=127.0.0.1 --main_process_port=29400 --num_processes=8 $SRC/milabench/benchmarks/diffusion/main.py --num_epochs 5 --batch_size 32 --num_workers 8 --cache $BASE/cache &
     wait
 )
@@ -361,7 +361,7 @@
 echo "---"
 echo "diffusion-gpus"
 echo "=============="
 time (
-    $SRC/milabench/milabench/scripts/activator $BASE/venv/torch accelerate launch --mixed_precision=bf16 --dynamo_backend=no --machine_rank=0 --num_machines=1 --multi_gpu --gradient_accumulation_steps=1 --num_cpu_threads_per_process=4 --main_process_ip=127.0.0.1 --main_process_port=22 --num_processes=8 $SRC/milabench/benchmarks/diffusion/main.py --num_epochs 5 --batch_size 32 --num_workers 8 &
+    $SRC/milabench/milabench/scripts/activator $BASE/venv/torch $BASE/cache accelerate launch --mixed_precision=bf16 --dynamo_backend=no --machine_rank=0 --num_machines=1 --multi_gpu --gradient_accumulation_steps=1 --num_cpu_threads_per_process=4 --main_process_ip=127.0.0.1 --main_process_port=29400 --num_processes=8 $SRC/milabench/benchmarks/diffusion/main.py --num_epochs 5 --batch_size 32 --num_workers 8 --cache $BASE/cache &
     wait
 )
@@ -369,8 +369,8 @@
 echo "---"
 echo "diffusion-nodes"
 echo "==============="
 time (
-    $SRC/milabench/milabench/scripts/activator $BASE/venv/torch accelerate launch --mixed_precision=bf16 --dynamo_backend=no --machine_rank=0 --num_machines=2 --multi_gpu --gradient_accumulation_steps=1 --num_cpu_threads_per_process=4 --main_process_ip=127.0.0.1 --main_process_port=22 --num_processes=16 &
-    ssh -oCheckHostIP=no -oStrictHostKeyChecking=no -oPasswordAuthentication=no -oPasswordAuthentication=no -p 22 username@192.168.0.11 $SRC/milabench/milabench/scripts/activator $BASE/venv/torch accelerate launch --mixed_precision=bf16 --dynamo_backend=no --machine_rank=1 --num_machines=2 --multi_gpu --gradient_accumulation_steps=1 --num_cpu_threads_per_process=4 --main_process_ip=127.0.0.1 --main_process_port=22 --num_processes=16 &
+    $SRC/milabench/milabench/scripts/activator $BASE/venv/torch $BASE/cache accelerate launch --mixed_precision=bf16 --dynamo_backend=no --machine_rank=0 --num_machines=2 --multi_gpu --gradient_accumulation_steps=1 --num_cpu_threads_per_process=4 --main_process_ip=127.0.0.1 --main_process_port=29400 --num_processes=16 $SRC/milabench/benchmarks/diffusion/main.py --num_epochs 5 --batch_size 32 --num_workers 8 --cache $BASE/cache &
+    ssh -oCheckHostIP=no -oStrictHostKeyChecking=no -oPasswordAuthentication=no -oPasswordAuthentication=no -p 22 username@192.168.0.11 $SRC/milabench/milabench/scripts/activator $BASE/venv/torch $BASE/cache accelerate launch --mixed_precision=bf16 --dynamo_backend=no --machine_rank=1 --num_machines=2 --multi_gpu --gradient_accumulation_steps=1 --num_cpu_threads_per_process=4 --main_process_ip=127.0.0.1 --main_process_port=29400 --num_processes=16 $SRC/milabench/benchmarks/diffusion/main.py --num_epochs 5 --batch_size 32 --num_workers 8 --cache $BASE/cache &
     wait
 )
@@ -378,14 +378,14 @@
 echo "---"
 echo "lightning"
 echo "========="
 time (
-    CUDA_VISIBLE_DEVICES=0 python $SRC/milabench/benchmarks/lightning/main.py --epochs 10 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model resnet152 --batch-size 16 &
-    CUDA_VISIBLE_DEVICES=1 python $SRC/milabench/benchmarks/lightning/main.py --epochs 10 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model resnet152 --batch-size 16 &
-    CUDA_VISIBLE_DEVICES=2 python $SRC/milabench/benchmarks/lightning/main.py --epochs 10 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model resnet152 --batch-size 16 &
-    CUDA_VISIBLE_DEVICES=3 python $SRC/milabench/benchmarks/lightning/main.py --epochs 10 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model resnet152 --batch-size 16 &
-    CUDA_VISIBLE_DEVICES=4 python $SRC/milabench/benchmarks/lightning/main.py --epochs 10 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model resnet152 --batch-size 16 &
-    CUDA_VISIBLE_DEVICES=5 python $SRC/milabench/benchmarks/lightning/main.py --epochs 10 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model resnet152 --batch-size 16 &
-    CUDA_VISIBLE_DEVICES=6 python $SRC/milabench/benchmarks/lightning/main.py --epochs 10 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model resnet152 --batch-size 16 &
-    CUDA_VISIBLE_DEVICES=7 python $SRC/milabench/benchmarks/lightning/main.py --epochs 10 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model resnet152 --batch-size 16 &
+    CUDA_VISIBLE_DEVICES=0 python $SRC/milabench/benchmarks/lightning/main.py --epochs 10 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model resnet152 --batch-size 256 &
+    CUDA_VISIBLE_DEVICES=1 python $SRC/milabench/benchmarks/lightning/main.py --epochs 10 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model resnet152 --batch-size 256 &
+    CUDA_VISIBLE_DEVICES=2 python $SRC/milabench/benchmarks/lightning/main.py --epochs 10 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model resnet152 --batch-size 256 &
+    CUDA_VISIBLE_DEVICES=3 python $SRC/milabench/benchmarks/lightning/main.py --epochs 10 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model resnet152 --batch-size 256 &
+    CUDA_VISIBLE_DEVICES=4 python $SRC/milabench/benchmarks/lightning/main.py --epochs 10 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model resnet152 --batch-size 256 &
+    CUDA_VISIBLE_DEVICES=5 python $SRC/milabench/benchmarks/lightning/main.py --epochs 10 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model resnet152 --batch-size 256 &
+    CUDA_VISIBLE_DEVICES=6 python $SRC/milabench/benchmarks/lightning/main.py --epochs 10 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model resnet152 --batch-size 256 &
+    CUDA_VISIBLE_DEVICES=7 python $SRC/milabench/benchmarks/lightning/main.py --epochs 10 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model resnet152 --batch-size 256 &
     wait
 )
@@ -393,7 +393,7 @@
 echo "---"
 echo "lightning-gpus"
 echo "=============="
 time (
-    $BASE/venv/torch/bin/benchrun --nnodes=1 --rdzv-backend=c10d --rdzv-endpoint=127.0.0.1:29400 --master-addr=127.0.0.1 --master-port=29400 --nproc-per-node=8 --no-python -- python $SRC/milabench/benchmarks/lightning/main.py --epochs 10 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model resnet152 --batch-size 16 &
+    $BASE/venv/torch/bin/benchrun --nnodes=1 --rdzv-backend=c10d --rdzv-endpoint=127.0.0.1:29400 --nproc-per-node=8 --no-python -- python $SRC/milabench/benchmarks/lightning/main.py --epochs 10 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model resnet152 --batch-size 256 &
     wait
 )
@@ -401,14 +401,14 @@
 echo "---"
 echo "dinov2-giant-single"
 echo "==================="
 time (
-    CUDA_VISIBLE_DEVICES=0 python $SRC/milabench/benchmarks/dinov2/main.py --output-dir $BASE/extra/dinov2-giant-single/output --no-resume --config-file src/dinov2/configs/train/vitg14.yaml train.dataset_path=ImageNet:split=TRAIN:root=$BASE/data/FakeImageNet:extra=$BASE/data/FakeImageNet train.batch_size_per_gpu=32 train.saveckp_freq=100 train.num_workers=10 &
-    CUDA_VISIBLE_DEVICES=1 python $SRC/milabench/benchmarks/dinov2/main.py --output-dir $BASE/extra/dinov2-giant-single/output --no-resume --config-file src/dinov2/configs/train/vitg14.yaml train.dataset_path=ImageNet:split=TRAIN:root=$BASE/data/FakeImageNet:extra=$BASE/data/FakeImageNet train.batch_size_per_gpu=32 train.saveckp_freq=100 train.num_workers=10 &
-    CUDA_VISIBLE_DEVICES=2 python $SRC/milabench/benchmarks/dinov2/main.py --output-dir $BASE/extra/dinov2-giant-single/output --no-resume --config-file src/dinov2/configs/train/vitg14.yaml train.dataset_path=ImageNet:split=TRAIN:root=$BASE/data/FakeImageNet:extra=$BASE/data/FakeImageNet train.batch_size_per_gpu=32 train.saveckp_freq=100 train.num_workers=10 &
-    CUDA_VISIBLE_DEVICES=3 python $SRC/milabench/benchmarks/dinov2/main.py --output-dir $BASE/extra/dinov2-giant-single/output --no-resume --config-file src/dinov2/configs/train/vitg14.yaml train.dataset_path=ImageNet:split=TRAIN:root=$BASE/data/FakeImageNet:extra=$BASE/data/FakeImageNet train.batch_size_per_gpu=32 train.saveckp_freq=100 train.num_workers=10 &
-    CUDA_VISIBLE_DEVICES=4 python $SRC/milabench/benchmarks/dinov2/main.py --output-dir $BASE/extra/dinov2-giant-single/output --no-resume --config-file src/dinov2/configs/train/vitg14.yaml train.dataset_path=ImageNet:split=TRAIN:root=$BASE/data/FakeImageNet:extra=$BASE/data/FakeImageNet train.batch_size_per_gpu=32 train.saveckp_freq=100 train.num_workers=10 &
-    CUDA_VISIBLE_DEVICES=5 python $SRC/milabench/benchmarks/dinov2/main.py --output-dir $BASE/extra/dinov2-giant-single/output --no-resume --config-file src/dinov2/configs/train/vitg14.yaml train.dataset_path=ImageNet:split=TRAIN:root=$BASE/data/FakeImageNet:extra=$BASE/data/FakeImageNet train.batch_size_per_gpu=32 train.saveckp_freq=100 train.num_workers=10 &
-    CUDA_VISIBLE_DEVICES=6 python $SRC/milabench/benchmarks/dinov2/main.py --output-dir $BASE/extra/dinov2-giant-single/output --no-resume --config-file src/dinov2/configs/train/vitg14.yaml train.dataset_path=ImageNet:split=TRAIN:root=$BASE/data/FakeImageNet:extra=$BASE/data/FakeImageNet train.batch_size_per_gpu=32 train.saveckp_freq=100 train.num_workers=10 &
-    CUDA_VISIBLE_DEVICES=7 python $SRC/milabench/benchmarks/dinov2/main.py --output-dir $BASE/extra/dinov2-giant-single/output --no-resume --config-file src/dinov2/configs/train/vitg14.yaml train.dataset_path=ImageNet:split=TRAIN:root=$BASE/data/FakeImageNet:extra=$BASE/data/FakeImageNet train.batch_size_per_gpu=32 train.saveckp_freq=100 train.num_workers=10 &
+    CUDA_VISIBLE_DEVICES=0 python $SRC/milabench/benchmarks/dinov2/main.py --output-dir $BASE/extra/dinov2-giant-single/output --no-resume --config-file $SRC/milabench/benchmarks/dinov2/src/dinov2/configs/train/vitg14.yaml train.dataset_path=ImageNet:split=TRAIN:root=$BASE/data/FakeImageNet:extra=$BASE/data/FakeImageNet train.batch_size_per_gpu=32 train.saveckp_freq=100 train.num_workers=10 &
+    CUDA_VISIBLE_DEVICES=1 python $SRC/milabench/benchmarks/dinov2/main.py --output-dir $BASE/extra/dinov2-giant-single/output --no-resume --config-file $SRC/milabench/benchmarks/dinov2/src/dinov2/configs/train/vitg14.yaml train.dataset_path=ImageNet:split=TRAIN:root=$BASE/data/FakeImageNet:extra=$BASE/data/FakeImageNet train.batch_size_per_gpu=32 train.saveckp_freq=100 train.num_workers=10 &
+    CUDA_VISIBLE_DEVICES=2 python $SRC/milabench/benchmarks/dinov2/main.py --output-dir $BASE/extra/dinov2-giant-single/output --no-resume --config-file $SRC/milabench/benchmarks/dinov2/src/dinov2/configs/train/vitg14.yaml train.dataset_path=ImageNet:split=TRAIN:root=$BASE/data/FakeImageNet:extra=$BASE/data/FakeImageNet train.batch_size_per_gpu=32 train.saveckp_freq=100 train.num_workers=10 &
+    CUDA_VISIBLE_DEVICES=3 python $SRC/milabench/benchmarks/dinov2/main.py --output-dir $BASE/extra/dinov2-giant-single/output --no-resume --config-file $SRC/milabench/benchmarks/dinov2/src/dinov2/configs/train/vitg14.yaml train.dataset_path=ImageNet:split=TRAIN:root=$BASE/data/FakeImageNet:extra=$BASE/data/FakeImageNet train.batch_size_per_gpu=32 train.saveckp_freq=100 train.num_workers=10 &
+    CUDA_VISIBLE_DEVICES=4 python $SRC/milabench/benchmarks/dinov2/main.py --output-dir $BASE/extra/dinov2-giant-single/output --no-resume --config-file $SRC/milabench/benchmarks/dinov2/src/dinov2/configs/train/vitg14.yaml train.dataset_path=ImageNet:split=TRAIN:root=$BASE/data/FakeImageNet:extra=$BASE/data/FakeImageNet train.batch_size_per_gpu=32 train.saveckp_freq=100 train.num_workers=10 &
+    CUDA_VISIBLE_DEVICES=5 python $SRC/milabench/benchmarks/dinov2/main.py --output-dir $BASE/extra/dinov2-giant-single/output --no-resume --config-file $SRC/milabench/benchmarks/dinov2/src/dinov2/configs/train/vitg14.yaml train.dataset_path=ImageNet:split=TRAIN:root=$BASE/data/FakeImageNet:extra=$BASE/data/FakeImageNet train.batch_size_per_gpu=32 train.saveckp_freq=100 train.num_workers=10 &
+    CUDA_VISIBLE_DEVICES=6 python $SRC/milabench/benchmarks/dinov2/main.py --output-dir $BASE/extra/dinov2-giant-single/output --no-resume --config-file $SRC/milabench/benchmarks/dinov2/src/dinov2/configs/train/vitg14.yaml train.dataset_path=ImageNet:split=TRAIN:root=$BASE/data/FakeImageNet:extra=$BASE/data/FakeImageNet train.batch_size_per_gpu=32 train.saveckp_freq=100 train.num_workers=10 &
+    CUDA_VISIBLE_DEVICES=7 python $SRC/milabench/benchmarks/dinov2/main.py --output-dir $BASE/extra/dinov2-giant-single/output --no-resume --config-file $SRC/milabench/benchmarks/dinov2/src/dinov2/configs/train/vitg14.yaml train.dataset_path=ImageNet:split=TRAIN:root=$BASE/data/FakeImageNet:extra=$BASE/data/FakeImageNet train.batch_size_per_gpu=32 train.saveckp_freq=100 train.num_workers=10 &
     wait
 )
@@ -416,16 +416,7 @@
 echo "---"
 echo "dinov2-giant-gpus"
 echo "================="
 time (
-    $BASE/venv/torch/bin/benchrun --nnodes=1 --rdzv-backend=c10d --rdzv-endpoint=127.0.0.1:29400 --master-addr=127.0.0.1 --master-port=29400 --nproc-per-node=8 --no-python -- python $SRC/milabench/benchmarks/dinov2/main.py --output-dir $BASE/extra/dinov2-giant-gpus/output --no-resume --config-file src/dinov2/configs/train/vitg14.yaml train.dataset_path=ImageNet:split=TRAIN:root=$BASE/data/FakeImageNet:extra=$BASE/data/FakeImageNet train.batch_size_per_gpu=32 train.saveckp_freq=100 train.num_workers=10 &
-    wait
-)
-
-echo "---"
-echo "dinov2-giant-nodes"
-echo "=================="
-time (
-    $BASE/venv/torch/bin/benchrun --nnodes=2 --rdzv-backend=c10d --rdzv-endpoint=127.0.0.1:29400 --master-addr=127.0.0.1 --master-port=29400 --nproc-per-node=8 --no-python -- python $SRC/milabench/benchmarks/dinov2/main.py --output-dir $BASE/extra/dinov2-giant-nodes/output --no-resume --config-file src/dinov2/configs/train/vitg14.yaml train.dataset_path=ImageNet:split=TRAIN:root=$BASE/data/FakeImageNet:extra=$BASE/data/FakeImageNet train.batch_size_per_gpu=32 train.saveckp_freq=100 train.num_workers=10 &
-    ssh -oCheckHostIP=no -oStrictHostKeyChecking=no -oPasswordAuthentication=no -oPasswordAuthentication=no -p 22 username@192.168.0.11 $BASE/venv/torch/bin/benchrun --nnodes=2 --rdzv-backend=c10d --rdzv-endpoint=127.0.0.1:29400 --master-addr=127.0.0.1 --master-port=29400 --nproc-per-node=8 --no-python -- python $SRC/milabench/benchmarks/dinov2/main.py --output-dir $BASE/extra/dinov2-giant-nodes/output --no-resume --config-file src/dinov2/configs/train/vitg14.yaml train.dataset_path=ImageNet:split=TRAIN:root=$BASE/data/FakeImageNet:extra=$BASE/data/FakeImageNet train.batch_size_per_gpu=32 train.saveckp_freq=100 train.num_workers=10 &
+    $BASE/venv/torch/bin/benchrun --nnodes=1 --rdzv-backend=c10d --rdzv-endpoint=127.0.0.1:29400 --nproc-per-node=8 --no-python -- python $SRC/milabench/benchmarks/dinov2/main.py --output-dir $BASE/extra/dinov2-giant-gpus/output --no-resume --config-file $SRC/milabench/benchmarks/dinov2/src/dinov2/configs/train/vitg14.yaml train.dataset_path=ImageNet:split=TRAIN:root=$BASE/data/FakeImageNet:extra=$BASE/data/FakeImageNet train.batch_size_per_gpu=32 train.saveckp_freq=100 train.num_workers=10 &
     wait
 )
@@ -448,7 +439,7 @@
 echo "---"
 echo "llm-lora-ddp-gpus"
 echo "================="
 time (
-    $BASE/venv/torch/bin/tune run --nproc-per-node=8 -- $SRC/milabench/benchmarks/llm/recipes/lora_finetune_distributed.py --config $SRC/milabench/benchmarks/llm/configs/llama3_8B_lora_single_device.yaml epochs=1 output_dir=$BASE/extra/llm-lora-ddp-gpus/output tokenizer.path=$BASE/data/llama3_8B/original/tokenizer.model checkpointer.checkpoint_dir=$BASE/data/llama3_8B/original checkpointer.output_dir=$BASE/data/llama3_8B/ metric_logger.log_dir=$BASE/extra/llm-lora-ddp-gpus/metrics repo_id="meta-llama/Meta-Llama-3.1-8B" batch_size=8 gradient_accumulation_steps=8 &
+    $BASE/venv/torch/bin/tune run --nnodes=1 --rdzv-backend=c10d --rdzv-endpoint=127.0.0.1:29400 --nproc-per-node=8 -- $SRC/milabench/benchmarks/llm/recipes/lora_finetune_distributed.py --config $SRC/milabench/benchmarks/llm/configs/llama3_8B_lora_single_device.yaml epochs=1 output_dir=$BASE/extra/llm-lora-ddp-gpus/output tokenizer.path=$BASE/data/llama3_8B/original/tokenizer.model checkpointer.checkpoint_dir=$BASE/data/llama3_8B/original checkpointer.output_dir=$BASE/data/llama3_8B/ metric_logger.log_dir=$BASE/extra/llm-lora-ddp-gpus/metrics repo_id="meta-llama/Meta-Llama-3.1-8B" batch_size=8 gradient_accumulation_steps=8 &
     wait
 )
@@ -456,7 +447,8 @@
 echo "---"
 echo "llm-lora-ddp-nodes"
 echo "=================="
 time (
-    $BASE/venv/torch/bin/tune run --nproc-per-node=8 -- $SRC/milabench/benchmarks/llm/recipes/lora_finetune_distributed.py --config $SRC/milabench/benchmarks/llm/configs/llama3_8B_lora_single_device.yaml epochs=1 output_dir=$BASE/extra/llm-lora-ddp-nodes/output tokenizer.path=$BASE/data/llama3_8B/original/tokenizer.model checkpointer.checkpoint_dir=$BASE/data/llama3_8B/original checkpointer.output_dir=$BASE/data/llama3_8B/ metric_logger.log_dir=$BASE/extra/llm-lora-ddp-nodes/metrics repo_id="meta-llama/Meta-Llama-3.1-8B" batch_size=8 gradient_accumulation_steps=8 &
+    $BASE/venv/torch/bin/tune run --nnodes=2 --rdzv-backend=c10d --rdzv-endpoint=127.0.0.1:29400 --nproc-per-node=8 -- $SRC/milabench/benchmarks/llm/recipes/lora_finetune_distributed.py --config $SRC/milabench/benchmarks/llm/configs/llama3_8B_lora_single_device.yaml epochs=1 output_dir=$BASE/extra/llm-lora-ddp-nodes/output tokenizer.path=$BASE/data/llama3_8B/original/tokenizer.model checkpointer.checkpoint_dir=$BASE/data/llama3_8B/original checkpointer.output_dir=$BASE/data/llama3_8B/ metric_logger.log_dir=$BASE/extra/llm-lora-ddp-nodes/metrics repo_id="meta-llama/Meta-Llama-3.1-8B" batch_size=8 gradient_accumulation_steps=8 &
+    ssh -oCheckHostIP=no -oStrictHostKeyChecking=no -oPasswordAuthentication=no -oPasswordAuthentication=no -p 22 username@192.168.0.11 $BASE/venv/torch/bin/tune run --nnodes=2 --rdzv-backend=c10d --rdzv-endpoint=127.0.0.1:29400 --nproc-per-node=8 -- $SRC/milabench/benchmarks/llm/recipes/lora_finetune_distributed.py --config $SRC/milabench/benchmarks/llm/configs/llama3_8B_lora_single_device.yaml epochs=1 output_dir=$BASE/extra/llm-lora-ddp-nodes/output tokenizer.path=$BASE/data/llama3_8B/original/tokenizer.model checkpointer.checkpoint_dir=$BASE/data/llama3_8B/original checkpointer.output_dir=$BASE/data/llama3_8B/ metric_logger.log_dir=$BASE/extra/llm-lora-ddp-nodes/metrics repo_id="meta-llama/Meta-Llama-3.1-8B" batch_size=8 gradient_accumulation_steps=8 &
     wait
 )
@@ -464,7 +456,7 @@
 echo "---"
 echo "llm-lora-mp-gpus"
 echo "================"
 time (
-    $BASE/venv/torch/bin/tune run --nproc-per-node=8 -- $SRC/milabench/benchmarks/llm/recipes/lora_finetune_distributed.py --config $SRC/milabench/benchmarks/llm/configs/llama3_70B_lora.yaml epochs=1 output_dir=$BASE/extra/llm-lora-mp-gpus/output tokenizer.path=$BASE/data/llama3_70B/original/tokenizer.model checkpointer.checkpoint_dir=$BASE/data/llama3_70B checkpointer.output_dir=$BASE/data/llama3_70B/ metric_logger.log_dir=$BASE/extra/llm-lora-mp-gpus/metrics repo_id="meta-llama/Meta-Llama-3.1-70B" batch_size=8 gradient_accumulation_steps=1 &
+    $BASE/venv/torch/bin/tune run --nnodes=1 --rdzv-backend=c10d --rdzv-endpoint=127.0.0.1:29400 --nproc-per-node=8 -- $SRC/milabench/benchmarks/llm/recipes/lora_finetune_distributed.py --config $SRC/milabench/benchmarks/llm/configs/llama3_70B_lora.yaml epochs=1 output_dir=$BASE/extra/llm-lora-mp-gpus/output tokenizer.path=$BASE/data/llama3_70B/original/tokenizer.model checkpointer.checkpoint_dir=$BASE/data/llama3_70B checkpointer.output_dir=$BASE/data/llama3_70B/ metric_logger.log_dir=$BASE/extra/llm-lora-mp-gpus/metrics repo_id="meta-llama/Meta-Llama-3.1-70B" batch_size=8 gradient_accumulation_steps=1 &
     wait
 )
@@ -472,7 +464,7 @@
 echo "---"
 echo "llm-full-mp-gpus"
 echo "================"
 time (
-    $BASE/venv/torch/bin/tune run --nproc-per-node=8 -- $SRC/milabench/benchmarks/llm/recipes/full_finetune_distributed.py --config $SRC/milabench/benchmarks/llm/configs/llama3_70B_full.yaml epochs=1 output_dir=$BASE/extra/llm-full-mp-gpus/output tokenizer.path=$BASE/data/llama3_70B/original/tokenizer.model checkpointer.checkpoint_dir=$BASE/data/llama3_70B checkpointer.output_dir=$BASE/data/llama3_70B/ metric_logger.log_dir=$BASE/extra/llm-full-mp-gpus/metrics repo_id="meta-llama/Meta-Llama-3.1-70B" safetensors=true batch_size=2 gradient_accumulation_steps=1 &
+    $BASE/venv/torch/bin/tune run --nnodes=1 --rdzv-backend=c10d --rdzv-endpoint=127.0.0.1:29400 --nproc-per-node=8 -- $SRC/milabench/benchmarks/llm/recipes/full_finetune_distributed.py --config $SRC/milabench/benchmarks/llm/configs/llama3_70B_full.yaml epochs=1 output_dir=$BASE/extra/llm-full-mp-gpus/output tokenizer.path=$BASE/data/llama3_70B/original/tokenizer.model checkpointer.checkpoint_dir=$BASE/data/llama3_70B checkpointer.output_dir=$BASE/data/llama3_70B/ metric_logger.log_dir=$BASE/extra/llm-full-mp-gpus/metrics repo_id="meta-llama/Meta-Llama-3.1-70B" safetensors=true batch_size=2 gradient_accumulation_steps=1 &
     wait
 )
@@ -480,7 +472,53 @@
 echo "---"
 echo "llm-full-mp-nodes"
 echo "================="
 time (
-    $BASE/venv/torch/bin/tune run --nproc-per-node=8 -- $SRC/milabench/benchmarks/llm/recipes/full_finetune_distributed.py --config $SRC/milabench/benchmarks/llm/configs/llama3_70B_full.yaml epochs=1 output_dir=$BASE/extra/llm-full-mp-nodes/output tokenizer.path=$BASE/data/llama3_70B/original/tokenizer.model checkpointer.checkpoint_dir=$BASE/data/llama3_70B checkpointer.output_dir=$BASE/data/llama3_70B/ metric_logger.log_dir=$BASE/extra/llm-full-mp-nodes/metrics repo_id="meta-llama/Meta-Llama-3.1-70B" safetensors=true batch_size=2 gradient_accumulation_steps=1 &
+    $BASE/venv/torch/bin/tune run --nnodes=2 --rdzv-backend=c10d --rdzv-endpoint=127.0.0.1:29400 --nproc-per-node=8 -- $SRC/milabench/benchmarks/llm/recipes/full_finetune_distributed.py --config $SRC/milabench/benchmarks/llm/configs/llama3_70B_full.yaml epochs=1 output_dir=$BASE/extra/llm-full-mp-nodes/output tokenizer.path=$BASE/data/llama3_70B/original/tokenizer.model checkpointer.checkpoint_dir=$BASE/data/llama3_70B checkpointer.output_dir=$BASE/data/llama3_70B/ metric_logger.log_dir=$BASE/extra/llm-full-mp-nodes/metrics repo_id="meta-llama/Meta-Llama-3.1-70B" safetensors=true batch_size=2 gradient_accumulation_steps=1 &
+    ssh -oCheckHostIP=no -oStrictHostKeyChecking=no -oPasswordAuthentication=no -oPasswordAuthentication=no -p 22 username@192.168.0.11 $BASE/venv/torch/bin/tune run --nnodes=2 --rdzv-backend=c10d --rdzv-endpoint=127.0.0.1:29400 --nproc-per-node=8 -- $SRC/milabench/benchmarks/llm/recipes/full_finetune_distributed.py --config $SRC/milabench/benchmarks/llm/configs/llama3_70B_full.yaml epochs=1 output_dir=$BASE/extra/llm-full-mp-nodes/output tokenizer.path=$BASE/data/llama3_70B/original/tokenizer.model checkpointer.checkpoint_dir=$BASE/data/llama3_70B checkpointer.output_dir=$BASE/data/llama3_70B/ metric_logger.log_dir=$BASE/extra/llm-full-mp-nodes/metrics repo_id="meta-llama/Meta-Llama-3.1-70B" safetensors=true batch_size=2 gradient_accumulation_steps=1 &
+    wait
+)
+
+echo "---"
+echo "dimenet"
+echo "======="
+time (
+    CUDA_VISIBLE_DEVICES=0 python $SRC/milabench/benchmarks/geo_gnn/main.py --model DimeNet --num-samples 10000 --use3d &
+    CUDA_VISIBLE_DEVICES=1 python $SRC/milabench/benchmarks/geo_gnn/main.py --model DimeNet --num-samples 10000 --use3d &
+    CUDA_VISIBLE_DEVICES=2 python $SRC/milabench/benchmarks/geo_gnn/main.py --model DimeNet --num-samples 10000 --use3d &
+    CUDA_VISIBLE_DEVICES=3 python $SRC/milabench/benchmarks/geo_gnn/main.py --model DimeNet --num-samples 10000 --use3d &
+    CUDA_VISIBLE_DEVICES=4 python $SRC/milabench/benchmarks/geo_gnn/main.py --model DimeNet --num-samples 10000 --use3d &
+    CUDA_VISIBLE_DEVICES=5 python $SRC/milabench/benchmarks/geo_gnn/main.py --model DimeNet --num-samples 10000 --use3d &
+    CUDA_VISIBLE_DEVICES=6 python $SRC/milabench/benchmarks/geo_gnn/main.py --model DimeNet --num-samples 10000 --use3d &
+    CUDA_VISIBLE_DEVICES=7 python $SRC/milabench/benchmarks/geo_gnn/main.py --model DimeNet --num-samples 10000 --use3d &
+    wait
+)
+
+echo "---"
+echo "recursiongfn"
+echo "============"
+time (
+    CUDA_VISIBLE_DEVICES=0 python $SRC/milabench/benchmarks/recursiongfn/main.py --batch_size 128 --num_workers 8 --num_steps 100 --layer_width 128 --num_layers 4 &
+    CUDA_VISIBLE_DEVICES=1 python $SRC/milabench/benchmarks/recursiongfn/main.py --batch_size 128 --num_workers 8 --num_steps 100 --layer_width 128 --num_layers 4 &
+    CUDA_VISIBLE_DEVICES=2 python $SRC/milabench/benchmarks/recursiongfn/main.py --batch_size 128 --num_workers 8 --num_steps 100 --layer_width 128 --num_layers 4 &
+    CUDA_VISIBLE_DEVICES=3 python $SRC/milabench/benchmarks/recursiongfn/main.py --batch_size 128 --num_workers 8 --num_steps 100 --layer_width 128 --num_layers 4 &
+    CUDA_VISIBLE_DEVICES=4 python $SRC/milabench/benchmarks/recursiongfn/main.py --batch_size 128 --num_workers 8 --num_steps 100 --layer_width 128 --num_layers 4 &
+    CUDA_VISIBLE_DEVICES=5 python $SRC/milabench/benchmarks/recursiongfn/main.py --batch_size 128 --num_workers 8 --num_steps 100 --layer_width 128 --num_layers 4 &
+    CUDA_VISIBLE_DEVICES=6 python $SRC/milabench/benchmarks/recursiongfn/main.py --batch_size 128 --num_workers 8 --num_steps 100 --layer_width 128 --num_layers 4 &
+    CUDA_VISIBLE_DEVICES=7 python $SRC/milabench/benchmarks/recursiongfn/main.py --batch_size 128 --num_workers 8 --num_steps 100 --layer_width 128 --num_layers 4 &
+    wait
+)
+
+echo "---"
+echo "torchatari"
+echo "=========="
+time (
+    CUDA_VISIBLE_DEVICES=0 python $SRC/milabench/benchmarks/torchatari/main.py --num-minibatches 16 --update-epochs 4 --num-steps 128 --num-envs 128 --total-timesteps 1000000 --env-id Breakout-v5 &
+    CUDA_VISIBLE_DEVICES=1 python $SRC/milabench/benchmarks/torchatari/main.py --num-minibatches 16 --update-epochs 4 --num-steps 128 --num-envs 128 --total-timesteps 1000000 --env-id Breakout-v5 &
+    CUDA_VISIBLE_DEVICES=2 python $SRC/milabench/benchmarks/torchatari/main.py --num-minibatches 16 --update-epochs 4 --num-steps 128 --num-envs 128 --total-timesteps 1000000 --env-id Breakout-v5 &
+    CUDA_VISIBLE_DEVICES=3 python $SRC/milabench/benchmarks/torchatari/main.py --num-minibatches 16 --update-epochs 4 --num-steps 128 --num-envs 128 --total-timesteps 1000000 --env-id Breakout-v5 &
+    CUDA_VISIBLE_DEVICES=4 python $SRC/milabench/benchmarks/torchatari/main.py --num-minibatches 16 --update-epochs 4 --num-steps 128 --num-envs 128 --total-timesteps 1000000 --env-id Breakout-v5 &
+    CUDA_VISIBLE_DEVICES=5 python $SRC/milabench/benchmarks/torchatari/main.py --num-minibatches 16 --update-epochs 4 --num-steps 128 --num-envs 128 --total-timesteps 1000000 --env-id Breakout-v5 &
+    CUDA_VISIBLE_DEVICES=6 python $SRC/milabench/benchmarks/torchatari/main.py --num-minibatches 16 --update-epochs 4 --num-steps 128 --num-envs 128 --total-timesteps 1000000 --env-id Breakout-v5 &
+    CUDA_VISIBLE_DEVICES=7 python $SRC/milabench/benchmarks/torchatari/main.py --num-minibatches 16 --update-epochs 4 --num-steps 128 --num-envs 128 --total-timesteps 1000000 --env-id Breakout-v5 &
     wait
 )