From 4fb687c744cb55f7f9098fd67a5eecbb7c6fe609 Mon Sep 17 00:00:00 2001
From: Your Name
Date: Wed, 30 Oct 2024 13:31:28 +0000
Subject: [PATCH 01/20] Tweaks

---
 milabench/_version.py       |  6 +++---
 scripts/article/run_cuda.sh | 12 ++++++------
 2 files changed, 9 insertions(+), 9 deletions(-)

diff --git a/milabench/_version.py b/milabench/_version.py
index cdd2418d..e2795a03 100644
--- a/milabench/_version.py
+++ b/milabench/_version.py
@@ -1,5 +1,5 @@
 """This file is generated, do not modify"""

-__tag__ = "v1.0.0_RC1-9-g6d1e1140"
-__commit__ = "6d1e114000cc4200ea307330032234db6696e40d"
-__date__ = "2024-09-30 14:39:43 -0400"
+__tag__ = "v1.0.0_RC1-12-g3b87cb4"
+__commit__ = "3b87cb465e855be452953273c314ab01024e0925"
+__date__ = "2024-10-09 12:04:43 -0400"
diff --git a/scripts/article/run_cuda.sh b/scripts/article/run_cuda.sh
index ba4c1ae3..47b21313 100644
--- a/scripts/article/run_cuda.sh
+++ b/scripts/article/run_cuda.sh
@@ -49,8 +49,8 @@ install_prepare() {
    # Install milabench's benchmarks in their venv
    #
    # pip install torch
-    milabench pin --variant cuda --from-scratch $ARGS
-    milabench install --system $MILABENCH_WORDIR/system.yaml $ARGS
+    # milabench pin --variant cuda --from-scratch $ARGS
+    milabench install # --system $MILABENCH_WORDIR/system.yaml $ARGS

    which pip
@@ -67,10 +67,10 @@ install_prepare() {
    #
    # Generate/download datasets, download models etc...
-    milabench prepare --system $MILABENCH_WORDIR/system.yaml $ARGS
+    milabench prepare # --system $MILABENCH_WORDIR/system.yaml $ARGS
 }

-module load cuda/12.3.2
+# module load cuda/12.3.2

 if [ ! -d "$MILABENCH_WORDIR/results" ]; then
     install_prepare
@@ -89,7 +89,7 @@ if [ "$MILABENCH_PREPARE" -eq 0 ]; then
    # rm -rf $MILABENCH_WORDIR/results/venv/
    # rm -rf $MILABENCH_WORDIR/results/extra
    # milabench install --system $MILABENCH_WORDIR/system.yaml
-    milabench prepare --system $MILABENCH_WORDIR/system.yaml $ARGS
+    milabench prepare # --system $MILABENCH_WORDIR/system.yaml $ARGS

    (
        . $BENCHMARK_VENV/bin/activate
@@ -117,7 +117,7 @@ if [ "$MILABENCH_PREPARE" -eq 0 ]; then
    #     milabench run --run-name "c$CAPACITY.{time}" --system $MILABENCH_WORDIR/system.yaml $ARGS || true
    # done

-    milabench run --system $MILABENCH_WORDIR/system.yaml $ARGS
+    milabench run # --system $MILABENCH_WORDIR/system.yaml $ARGS

    #
    #   Display report

From a849fc2f6b2ed1204d5f3aa1f27640d5fbd99dd9 Mon Sep 17 00:00:00 2001
From: Your Name
Date: Wed, 30 Oct 2024 13:38:08 +0000
Subject: [PATCH 02/20] -

---
 milabench/system.py         |  1 +
 scripts/article/run_cuda.sh | 32 +++++++-------------------------
 2 files changed, 8 insertions(+), 25 deletions(-)

diff --git a/milabench/system.py b/milabench/system.py
index c237baf2..3a50d143 100644
--- a/milabench/system.py
+++ b/milabench/system.py
@@ -406,6 +406,7 @@ def resolve_hostname(ip):
         if is_loopback(ip):
             return hostname, True

+        return socket.gethostname(), hostname.startswith(socket.gethostname())
         return hostname, hostname == socket.gethostname()

     except:
diff --git a/scripts/article/run_cuda.sh b/scripts/article/run_cuda.sh
index 47b21313..e56b2da1 100644
--- a/scripts/article/run_cuda.sh
+++ b/scripts/article/run_cuda.sh
@@ -50,7 +50,7 @@ install_prepare() {
    #
    # pip install torch
    # milabench pin --variant cuda --from-scratch $ARGS
-    milabench install # --system $MILABENCH_WORDIR/system.yaml $ARGS
+    milabench install --system $MILABENCH_WORDIR/system.yaml $ARGS

    which pip
@@ -67,7 +67,7 @@ install_prepare() {
    #
    # Generate/download datasets, download models etc...
-    milabench prepare # --system $MILABENCH_WORDIR/system.yaml $ARGS
+    milabench prepare --system $MILABENCH_WORDIR/system.yaml $ARGS
 }

 # module load cuda/12.3.2
@@ -87,9 +87,10 @@ if [ "$MILABENCH_PREPARE" -eq 0 ]; then
    # pip install torch
    # milabench pin --variant cuda --from-scratch
    # rm -rf $MILABENCH_WORDIR/results/venv/
-    # rm -rf $MILABENCH_WORDIR/results/extra
-    # milabench install --system $MILABENCH_WORDIR/system.yaml
-    milabench prepare # --system $MILABENCH_WORDIR/system.yaml $ARGS
+    rm -rf $MILABENCH_WORDIR/results/extra
+
+    milabench install --system $MILABENCH_WORDIR/system.yaml
+    milabench prepare --system $MILABENCH_WORDIR/system.yaml $ARGS

    (
        . $BENCHMARK_VENV/bin/activate
@@ -98,26 +99,7 @@ if [ "$MILABENCH_PREPARE" -eq 0 ]; then
        # pip install torchao --no-input
    )

-    # pip install torch
-    # milabench pin --variant cuda --from-scratch
-    # milabench install --system $MILABENCH_WORDIR/system.yaml --force $ARGS
-    # milabench prepare --system $MILABENCH_WORDIR/system.yaml $ARGS
-
-    # ARGS="--select resnet50-noio,brax,lightning,dinov2-giant-single,dinov2-giant-gpus,llm-lora-ddp-gpus,llm-lora-ddp-nodes,llm-lora-mp-gpus,llm-full-mp-gpus,llm-full-mp-nodes,dqn,ppo,dimenet,llava-single,rlhf-single,rlhf-gpus,vjepa-single,vjepa-gpus"
-
-    # MEMORY_CAPACITY=("4Go" "8Go" "16Go" "32Go" "64Go" "80Go")
-    # # MEMORY_CAPACITY=("2048" "4096" "8192")
-
-    # # Run the benchmakrs
-    # for CAPACITY in "${MEMORY_CAPACITY[@]}"; do
-    #     export MILABENCH_SIZER_AUTO=1
-    #     export MILABENCH_SIZER_MULTIPLE=8
-    #     export MILABENCH_SIZER_CAPACITY=$CAPACITY
-    #     # export MILABENCH_SIZER_BATCH_SIZE=$CAPACITY
-    #     milabench run --run-name "c$CAPACITY.{time}" --system $MILABENCH_WORDIR/system.yaml $ARGS || true
-    # done
-
-    milabench run # --system $MILABENCH_WORDIR/system.yaml $ARGS
+    milabench run --system $MILABENCH_WORDIR/system.yaml $ARGS

    #
    #   Display report

From dafcbf94c4e5957c2db0cdbd592eb5923d39ad80 Mon Sep 17 00:00:00 2001
From: Your Name
Date: Wed, 30 Oct 2024 14:10:38 +0000
Subject: [PATCH 03/20] Ignore prepare & install runs for reports

---
 milabench/compare.py        | 3 +++
 scripts/article/run_cuda.sh | 9 +++++++--
 2 files changed, 10 insertions(+), 2 deletions(-)

diff --git a/milabench/compare.py b/milabench/compare.py
index cae06820..d4d6299e 100644
--- a/milabench/compare.py
+++ b/milabench/compare.py
@@ -27,6 +27,9 @@ def fetch_runs(folder, filter):
     runs = []
     ignored = 0
     for run in os.listdir(folder):
+        if run.startswith("install") or run.startswith("prepare"):
+            continue
+
         if filter is not None and (not fnmatch.fnmatch(run, filter)):
             ignored += 1
             continue
diff --git a/scripts/article/run_cuda.sh b/scripts/article/run_cuda.sh
index e56b2da1..12ffffe6 100644
--- a/scripts/article/run_cuda.sh
+++ b/scripts/article/run_cuda.sh
@@ -72,7 +72,7 @@ install_prepare() {

 # module load cuda/12.3.2

-if [ ! -d "$MILABENCH_WORDIR/results" ]; then
+if [ ! -d "$MILABENCH_WORDIR/env" ]; then
     install_prepare
 else
     echo "Reusing previous install"
@@ -104,4 +104,9 @@ if [ "$MILABENCH_PREPARE" -eq 0 ]; then
    #
    #   Display report
    milabench report --runs $MILABENCH_WORDIR/results/runs
-fi
\ No newline at end of file
+fi
+
+
+# rsync -av mila@172.29.171.42:~/rocm/results/cache ~/cuda/results/cache
+# rsync -av mila@172.29.171.42:~/rocm/results/data ~/cuda/results/data
+# rsync -av mila@172.29.171.42:~/rocm/results/cache ~/cuda/results/cache
\ No newline at end of file

From 783a13b4bd87db7fad8e848191a18557c40b6eb5 Mon Sep 17 00:00:00 2001
From: Setepenre
Date: Sat, 16 Nov 2024 13:06:45 -0500
Subject: [PATCH 04/20] Update report.py

---
 milabench/report.py | 11 ++++++++---
 1 file changed, 8 insertions(+), 3 deletions(-)

diff --git a/milabench/report.py b/milabench/report.py
index c54ed8dd..da68c80a 100644
--- a/milabench/report.py
+++ b/milabench/report.py
@@ -417,12 +417,17 @@ def _score(column):
         # This computes a weighted geometric mean

         # perf can be object np.float64 !?
-        perf = df[column].astype(float)
+        # success_ratio = 1 - row["fail"] / max(row["n"], 1)
+
+        # score = (acc if acc > 0 else row["perf"]) * success_ratio
+        score = df[column].astype(float)

         weights = df["weight"] * df["enabled"].astype(int)
-        weight_total = np.sum(weights)
+        # if total weight is 0 ?
+        weight_total = np.sum(weights)

-        logscore = np.sum(np.log(perf) * weights) / weight_total
+        # score cannot be 0
+        logscore = np.sum(np.log(score + 1) * weights) / weight_total
         return np.exp(logscore)
     except ZeroDivisionError:
         return 0
"Command": import milabench.commands as cmd - - pack = cmd.PackCommand(self, *self.argv, lazy=True) - # pack = cmd.VoirCommand(pack, cwd=main.parent) - pack = cmd.ActivatorCommand(pack) - return pack.use_stdout() - + + use_stdout = True + + if use_stdout: + main = self.dirs.code / self.main_script + pack = cmd.PackCommand(self, *self.argv, lazy=True) + return pack.use_stdout() + else: + main = self.dirs.code / self.main_script + pack = cmd.PackCommand(self, *self.argv, lazy=True) + pack = cmd.VoirCommand(pack, cwd=main.parent) + return pack __pack__ = FlopsBenchmarch diff --git a/benchmarks/flops/dev.yaml b/benchmarks/flops/dev.yaml new file mode 100644 index 00000000..5106bf40 --- /dev/null +++ b/benchmarks/flops/dev.yaml @@ -0,0 +1,56 @@ + + +_flops: + inherits: _defaults + definition: . + group: flops + install-variant: unpinned + install_group: torch + plan: + method: per_gpu + + tags: + - diagnostic + - flops + - monogpu + - nobatch + + argv: + --number: 30 + --repeat: 90 + + +fp16: + inherits: _flops + + argv: + --number: 30 + --repeat: 10 + --m: 8192 + --n: 8192 + --dtype: fp16 + +bf16: + inherits: _flops + + argv: + --m: 8192 + --n: 8192 + --dtype: bf16 + +tf32: + inherits: _flops + + argv: + --m: 8192 + --n: 8192 + --dtype: fp32 + --tf32: true + +fp32: + inherits: _flops + + argv: + --m: 256 + --n: 256 + --dtype: fp32 diff --git a/benchmarks/flops/dev/extra/flops/mark_torch b/benchmarks/flops/dev/extra/flops/mark_torch new file mode 100644 index 00000000..e69de29b diff --git a/benchmarks/flops/main.py b/benchmarks/flops/main.py index e4f05c17..ba03518a 100755 --- a/benchmarks/flops/main.py +++ b/benchmarks/flops/main.py @@ -109,11 +109,12 @@ def main(): log, monitor = setupvoir() + # FIXME + #with monitor: f(args.number, args.repeat, args.m, args.n, TERA, dtypes[args.dtype], log) monitor.stop() - if __name__ == "__main__": main() print("done") diff --git a/benchmarks/flops/requirements.cpu.txt b/benchmarks/flops/requirements.cpu.txt new file mode 100644 index 00000000..88f8b61e --- /dev/null +++ b/benchmarks/flops/requirements.cpu.txt @@ -0,0 +1,5 @@ +torch +torchvision +torchcompat +tqdm +voir diff --git a/benchmarks/flops/simple.sh b/benchmarks/flops/simple.sh new file mode 100644 index 00000000..3f54d424 --- /dev/null +++ b/benchmarks/flops/simple.sh @@ -0,0 +1,13 @@ + + + + +export MILABENCH_BASE="$(pwd)/dev" +export MILABENCH_CONFIG="$(pwd)/dev.yaml" + + +milabench install --select fp32 + +milabench prepare --select fp32 + +milabench run --select fp32 diff --git a/benchmarks/geo_gnn/modelsize.py b/benchmarks/geo_gnn/modelsize.py new file mode 100644 index 00000000..0b65655a --- /dev/null +++ b/benchmarks/geo_gnn/modelsize.py @@ -0,0 +1,36 @@ +from torch_geometric.nn.models import PNA as _PNA, DimeNet as _DimeNet + +import torch + +from benchmate.models import model_size + + +print(model_size(_DimeNet( + hidden_channels=64, + out_channels=1, + num_blocks=6, + num_bilinear=8, + num_spherical=7, + num_radial=6, + cutoff=10.0, + envelope_exponent=5, + num_before_skip=1, + num_after_skip=2, + num_output_layers=3, +) + +)) + +print(model_size( +_PNA( + # Basic GCNN setup + in_channels=1, + out_channels=1, + hidden_channels=64, + num_layers=64, + # https://pytorch-geometric.readthedocs.io/en/latest/generated/torch_geometric.nn.conv.PNAConv.html + aggregators=['mean', 'min', 'max', 'std'], + scalers=['identity', 'amplification', 'attenuation'], + # Histogram of in-degrees of nodes in the training set, used by scalers to normalize + deg=torch.tensor(4), +))) \ No newline at end of 
file diff --git a/benchmarks/purejaxrl/dqn.py b/benchmarks/purejaxrl/dqn.py index fc0a97b8..85e9e8b5 100644 --- a/benchmarks/purejaxrl/dqn.py +++ b/benchmarks/purejaxrl/dqn.py @@ -98,6 +98,11 @@ def train(rng): init_x = jnp.zeros(env.observation_space(env_params).shape) network_params = network.init(_rng, init_x) + + param_count = sum(x.size for x in jax.tree.leaves(network_params)) + print("PARAM COUNT", param_count) + + def linear_schedule(count): frac = 1.0 - (count / config["NUM_UPDATES"]) return config["LR"] * frac diff --git a/benchmarks/purejaxrl/ppo.py b/benchmarks/purejaxrl/ppo.py index 0cc8896c..a70e195e 100644 --- a/benchmarks/purejaxrl/ppo.py +++ b/benchmarks/purejaxrl/ppo.py @@ -107,9 +107,15 @@ def train(rng): network = ActorCritic( env.action_space(env_params).shape[0], activation=config["ACTIVATION"] ) + + rng, _rng = jax.random.split(rng) init_x = jnp.zeros(env.observation_space(env_params).shape) network_params = network.init(_rng, init_x) + + param_count = sum(x.size for x in jax.tree.leaves(network_params)) + print("PARAM COUNT", param_count) + if config["ANNEAL_LR"]: tx = optax.chain( optax.clip_by_global_norm(config["MAX_GRAD_NORM"]), diff --git a/benchmarks/recursiongfn/main.py b/benchmarks/recursiongfn/main.py index 81d08e8a..7099247d 100644 --- a/benchmarks/recursiongfn/main.py +++ b/benchmarks/recursiongfn/main.py @@ -92,11 +92,13 @@ def __init__( self.num_cond_dim = self.temperature_conditional.encoding_size() def _load_task_models(self): - xdg_cache = os.environ["XDG_CACHE_HOME"] + xdg_cache = os.environ.get("XDG_CACHE_HOME") model = bengio2021flow.load_original_model( cache=True, location=Path(os.path.join(xdg_cache, "bengio2021flow_proxy.pkl.gz")), ) + from benchmate.models import model_size + print(model_size(model)) model.to(get_worker_device()) model = self._wrap_model(model) return {"seh": model} diff --git a/benchmarks/torchatari/main.py b/benchmarks/torchatari/main.py index bf5b7ef6..898a8cab 100644 --- a/benchmarks/torchatari/main.py +++ b/benchmarks/torchatari/main.py @@ -201,7 +201,10 @@ def main(): envs = RecordEpisodeStatistics(envs) assert isinstance(envs.action_space, gym.spaces.Discrete), "only discrete action space is supported" + + from benchmate.models import model_size agent = Agent(envs).to(device) + print(model_size(agent)) optimizer = optim.Adam(agent.parameters(), lr=args.learning_rate, eps=1e-5) # ALGO Logic: Storage setup diff --git a/benchmate/benchmate/models.py b/benchmate/benchmate/models.py new file mode 100644 index 00000000..efd13e2c --- /dev/null +++ b/benchmate/benchmate/models.py @@ -0,0 +1,36 @@ + + +def model_summary(model, input_shape): + try: + from torchsummary import summary + + summary(model, input_shape) + except: + print("Could not print summary") + + +def model_size(model): + param_size = 0 + param_count = 0 + for param in model.parameters(): + param_count += param.nelement() + param_size += param.nelement() * param.element_size() + + buffer_size = 0 + buffer_count = 0 + for buff in model.buffers(): + buffer_count += buff.nelement() + buffer_size += buff.nelement() * buff.element_size() + + return { + "param": { + "count": param_count, + "size": param_size / 1024**2, + "unit": "MB" + }, + "buffer": { + "count": buffer_count, + "size": buffer_size / 1024**2, + "unit": "MB" + } + } diff --git a/benchmate/benchmate/monitor.py b/benchmate/benchmate/monitor.py index 0ad34a3d..5c63796e 100644 --- a/benchmate/benchmate/monitor.py +++ b/benchmate/benchmate/monitor.py @@ -13,7 +13,25 @@ from voir.instruments.io import 
io_monitor from voir.instruments.network import network_monitor from voir.instruments.monitor import monitor +from voir.helpers import current_overseer +from .metrics import sumggle_push, give_push, file_push + + +def auto_push(): + # Milabench managed: we need to push metrics to it + if int(os.getenv("MILABENCH_MANAGED", 0)) == 1: + + # Using voir, DATA_FD is defined as well + ov = current_overseer.get() + if ov is not None: + return ov.give + + # Not using Voir, using structured stdout + return sumggle_push() + + # Not using milabench; using stdout + return file_push() @instrument_definition @@ -41,16 +59,10 @@ def monitor_node(ov, poll_interval=1, arch=None): def _smuggle_monitor(poll_interval=10, worker_init=None, **monitors): - data_file = SmuggleWriter(sys.stdout) + log = auto_push() + def mblog(data): - nonlocal data_file - - if data_file is not None: - try: - print(json.dumps(data), file=data_file) - except ValueError: - pass - # print("Is bench ending?, ignoring ValueError") + log(**data) def get(): t = time.time() diff --git a/milabench/_version.py b/milabench/_version.py index cdd2418d..5f55a16d 100644 --- a/milabench/_version.py +++ b/milabench/_version.py @@ -1,5 +1,5 @@ """This file is generated, do not modify""" -__tag__ = "v1.0.0_RC1-9-g6d1e1140" -__commit__ = "6d1e114000cc4200ea307330032234db6696e40d" -__date__ = "2024-09-30 14:39:43 -0400" +__tag__ = "v0.1.0-129-ga60a3aa" +__commit__ = "a60a3aae21e87e46bcce403620a3f56c12878554" +__date__ = "2024-11-06 22:52:12 -0500" diff --git a/milabench/pack.py b/milabench/pack.py index 1cdde093..20feca39 100644 --- a/milabench/pack.py +++ b/milabench/pack.py @@ -335,6 +335,8 @@ def make_env(self): f"MILABENCH_DIR_{name.upper()}": path for name, path in self.config["dirs"].items() } + + env["MILABENCH_MANAGED"] = "1" env["OMP_NUM_THREADS"] = resolve_placeholder(self, "{cpu_per_gpu}") From 8148e5388cd3ecbc2b6ea8e4c2a4efe6c63333f8 Mon Sep 17 00:00:00 2001 From: Pierre Delaunay Date: Thu, 21 Nov 2024 13:02:00 -0500 Subject: [PATCH 06/20] Tweaks --- benchmarks/flops/benchfile.py | 8 +++----- benchmate/benchmate/monitor.py | 13 +++++++++---- milabench/commands/executors.py | 3 +++ milabench/report.py | 4 ++-- 4 files changed, 17 insertions(+), 11 deletions(-) diff --git a/benchmarks/flops/benchfile.py b/benchmarks/flops/benchfile.py index 9117caf0..3090a3c7 100644 --- a/benchmarks/flops/benchfile.py +++ b/benchmarks/flops/benchfile.py @@ -8,16 +8,14 @@ class FlopsBenchmarch(Package): def build_run_plan(self) -> "Command": import milabench.commands as cmd - + main = self.dirs.code / self.main_script + pack = cmd.PackCommand(self, *self.argv, lazy=True) + use_stdout = True if use_stdout: - main = self.dirs.code / self.main_script - pack = cmd.PackCommand(self, *self.argv, lazy=True) return pack.use_stdout() else: - main = self.dirs.code / self.main_script - pack = cmd.PackCommand(self, *self.argv, lazy=True) pack = cmd.VoirCommand(pack, cwd=main.parent) return pack diff --git a/benchmate/benchmate/monitor.py b/benchmate/benchmate/monitor.py index 5c63796e..294d0c88 100644 --- a/benchmate/benchmate/monitor.py +++ b/benchmate/benchmate/monitor.py @@ -19,16 +19,21 @@ def auto_push(): + # use_stdout = int(os.getenv("MILABENCH_USE_STDOUT", 0)) + mb_managed = int(os.getenv("MILABENCH_MANAGED", 0)) + # Milabench managed: we need to push metrics to it - if int(os.getenv("MILABENCH_MANAGED", 0)) == 1: - + if mb_managed == 1: # Using voir, DATA_FD is defined as well ov = current_overseer.get() if ov is not None: return ov.give - + # Not using Voir, 
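The `benchmate.models.model_size` helper added in this patch returns parameter and buffer counts together with their memory footprint in MB. A hypothetical usage sketch (the `torch.nn.Linear` stand-in is illustrative only, not one of the benchmark models):

```python
import torch.nn as nn

from benchmate.models import model_size

model = nn.Linear(1024, 1024)  # stand-in for a real benchmark model
stats = model_size(model)

# 1024*1024 weights + 1024 biases = 1,049,600 fp32 params, ~4.0 MB
print(stats["param"]["count"], stats["param"]["size"], stats["param"]["unit"])
print(stats["buffer"]["count"])  # Linear has no buffers, so 0
```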
From 8148e5388cd3ecbc2b6ea8e4c2a4efe6c63333f8 Mon Sep 17 00:00:00 2001
From: Pierre Delaunay
Date: Thu, 21 Nov 2024 13:02:00 -0500
Subject: [PATCH 06/20] Tweaks

---
 benchmarks/flops/benchfile.py   |  8 +++-----
 benchmate/benchmate/monitor.py  | 13 +++++++++----
 milabench/commands/executors.py |  3 +++
 milabench/report.py             |  4 ++--
 4 files changed, 17 insertions(+), 11 deletions(-)

diff --git a/benchmarks/flops/benchfile.py b/benchmarks/flops/benchfile.py
index 9117caf0..3090a3c7 100644
--- a/benchmarks/flops/benchfile.py
+++ b/benchmarks/flops/benchfile.py
@@ -8,16 +8,14 @@ class FlopsBenchmarch(Package):
     def build_run_plan(self) -> "Command":
         import milabench.commands as cmd
-
+        main = self.dirs.code / self.main_script
+        pack = cmd.PackCommand(self, *self.argv, lazy=True)
+
         use_stdout = True

         if use_stdout:
-            main = self.dirs.code / self.main_script
-            pack = cmd.PackCommand(self, *self.argv, lazy=True)
             return pack.use_stdout()
         else:
-            main = self.dirs.code / self.main_script
-            pack = cmd.PackCommand(self, *self.argv, lazy=True)
             pack = cmd.VoirCommand(pack, cwd=main.parent)
             return pack
diff --git a/benchmate/benchmate/monitor.py b/benchmate/benchmate/monitor.py
index 5c63796e..294d0c88 100644
--- a/benchmate/benchmate/monitor.py
+++ b/benchmate/benchmate/monitor.py
@@ -19,16 +19,21 @@


 def auto_push():
+    # use_stdout = int(os.getenv("MILABENCH_USE_STDOUT", 0))
+    mb_managed = int(os.getenv("MILABENCH_MANAGED", 0))
+
     # Milabench managed: we need to push metrics to it
-    if int(os.getenv("MILABENCH_MANAGED", 0)) == 1:
-
+    if mb_managed == 1:
         # Using voir, DATA_FD is defined as well
         ov = current_overseer.get()
         if ov is not None:
             return ov.give
-
+
         # Not using Voir, using structured stdout
-        return sumggle_push()
+        if int(os.getenv("MILABENCH_USE_STDOUT", 0)) == 1:
+            return sumggle_push()
+
+        raise RuntimeError("Could not find something to push to")

     # Not using milabench; using stdout
     return file_push()
diff --git a/milabench/commands/executors.py b/milabench/commands/executors.py
index f0402d29..807a261e 100644
--- a/milabench/commands/executors.py
+++ b/milabench/commands/executors.py
@@ -32,6 +32,9 @@ async def execute(pack, *args, cwd=None, env={}, external=False, use_stdout=Fals
     sized_args = scale_argv(pack, args)
     final_args = resolve_argv(pack, sized_args)

+    if use_stdout:
+        exec_env["MILABENCH_USE_STDOUT"] = "1"
+
     return await run(
         final_args,
         **kwargs,
diff --git a/milabench/report.py b/milabench/report.py
index c54ed8dd..bdc4999c 100644
--- a/milabench/report.py
+++ b/milabench/report.py
@@ -525,12 +525,12 @@ def pandas_to_string(df, formatters=_formatters):
     # Compute column size
     col_size = defaultdict(int)
     for index, row in df.iterrows():
-        col_size["bench"] = max(col_size["bench"], len(index))
+        col_size["bench"] = max(col_size["bench"], len(index), len("bench"))
         for col, val in zip(columns, row):
             fmt = formatters.get(col)
             if fmt is not None:
                 val = fmt(val)
-            col_size[col] = max(col_size[col], len(val))
+            col_size[col] = max(col_size[col], len(val), len(col))

     # Generate report
     sep = " | "
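After this patch the metric sink is resolved from two environment variables: `MILABENCH_MANAGED` (set in `milabench/pack.py` by the previous patch) and `MILABENCH_USE_STDOUT` (set in `executors.py` above when a command opts into stdout). A condensed sketch of the dispatch order `auto_push` now follows; the function name and return labels here are illustrative, only the environment variables and the fallback order come from the hunks above:

```python
import os

def choose_sink(overseer=None):
    """Sketch of auto_push()'s dispatch order after this patch."""
    if int(os.getenv("MILABENCH_MANAGED", 0)) == 1:
        if overseer is not None:       # running under voir: push via ov.give
            return "overseer"
        if int(os.getenv("MILABENCH_USE_STDOUT", 0)) == 1:
            return "smuggled-stdout"   # sumggle_push()
        raise RuntimeError("Could not find something to push to")
    return "file"                      # file_push(): not managed by milabench
```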
From 3d2718031897bcc5eac93e1304442ba528f5840c Mon Sep 17 00:00:00 2001
From: Pierre Delaunay
Date: Thu, 21 Nov 2024 13:04:41 -0500
Subject: [PATCH 07/20] Revert bad commit

---
 benchmarks/diffusion/main.py                |  5 --
 benchmarks/flops/benchfile.py               | 14 ++----
 benchmarks/flops/dev.yaml                   | 56 ---------------------
 benchmarks/flops/dev/extra/flops/mark_torch |  0
 benchmarks/flops/main.py                    |  3 +-
 benchmarks/flops/requirements.cpu.txt       |  5 --
 benchmarks/flops/simple.sh                  | 13 -----
 benchmarks/geo_gnn/modelsize.py             | 36 --------------
 benchmarks/purejaxrl/dqn.py                 |  5 --
 benchmarks/purejaxrl/ppo.py                 |  6 ---
 benchmarks/recursiongfn/main.py             |  4 +-
 benchmarks/torchatari/main.py               |  3 --
 benchmate/benchmate/models.py               | 36 --------------
 benchmate/benchmate/monitor.py              | 35 ++++---------
 milabench/_version.py                       |  6 +--
 milabench/commands/executors.py             |  3 ---
 milabench/pack.py                           |  2 -
 milabench/report.py                         |  4 ++--
 18 files changed, 21 insertions(+), 215 deletions(-)
 delete mode 100644 benchmarks/flops/dev.yaml
 delete mode 100644 benchmarks/flops/dev/extra/flops/mark_torch
 delete mode 100644 benchmarks/flops/requirements.cpu.txt
 delete mode 100644 benchmarks/flops/simple.sh
 delete mode 100644 benchmarks/geo_gnn/modelsize.py
 delete mode 100644 benchmate/benchmate/models.py

diff --git a/benchmarks/diffusion/main.py b/benchmarks/diffusion/main.py
index c5b7757e..0bcb67d5 100755
--- a/benchmarks/diffusion/main.py
+++ b/benchmarks/diffusion/main.py
@@ -57,11 +57,6 @@ def models(accelerator, args: Arguments):
     unet = UNet2DConditionModel.from_pretrained(
         args.model, subfolder="unet", revision=args.revision, variant=args.variant
     )
-
-    from benchmate.models import model_size
-    print(model_size(unet))
-    print(model_size(encoder))
-    print(model_size(vae))

     vae.requires_grad_(False)
     encoder.requires_grad_(False)
diff --git a/benchmarks/flops/benchfile.py b/benchmarks/flops/benchfile.py
index 3090a3c7..59c5c4a7 100644
--- a/benchmarks/flops/benchfile.py
+++ b/benchmarks/flops/benchfile.py
@@ -8,16 +8,11 @@ class FlopsBenchmarch(Package):
     def build_run_plan(self) -> "Command":
         import milabench.commands as cmd
-        main = self.dirs.code / self.main_script
-        pack = cmd.PackCommand(self, *self.argv, lazy=True)
-
-        use_stdout = True
-
-        if use_stdout:
-            return pack.use_stdout()
-        else:
-            pack = cmd.VoirCommand(pack, cwd=main.parent)
-            return pack
+
+        pack = cmd.PackCommand(self, *self.argv, lazy=True)
+        # pack = cmd.VoirCommand(pack, cwd=main.parent)
+        pack = cmd.ActivatorCommand(pack)
+        return pack.use_stdout()
+

 __pack__ = FlopsBenchmarch
diff --git a/benchmarks/flops/dev.yaml b/benchmarks/flops/dev.yaml
deleted file mode 100644
index 5106bf40..00000000
--- a/benchmarks/flops/dev.yaml
+++ /dev/null
@@ -1,56 +0,0 @@
-
-
-_flops:
-  inherits: _defaults
-  definition: .
-  group: flops
-  install-variant: unpinned
-  install_group: torch
-  plan:
-    method: per_gpu
-
-  tags:
-    - diagnostic
-    - flops
-    - monogpu
-    - nobatch
-
-  argv:
-    --number: 30
-    --repeat: 90
-
-
-fp16:
-  inherits: _flops
-
-  argv:
-    --number: 30
-    --repeat: 10
-    --m: 8192
-    --n: 8192
-    --dtype: fp16
-
-bf16:
-  inherits: _flops
-
-  argv:
-    --m: 8192
-    --n: 8192
-    --dtype: bf16
-
-tf32:
-  inherits: _flops
-
-  argv:
-    --m: 8192
-    --n: 8192
-    --dtype: fp32
-    --tf32: true
-
-fp32:
-  inherits: _flops
-
-  argv:
-    --m: 256
-    --n: 256
-    --dtype: fp32
diff --git a/benchmarks/flops/dev/extra/flops/mark_torch b/benchmarks/flops/dev/extra/flops/mark_torch
deleted file mode 100644
index e69de29b..00000000
diff --git a/benchmarks/flops/main.py b/benchmarks/flops/main.py
index ba03518a..e4f05c17 100755
--- a/benchmarks/flops/main.py
+++ b/benchmarks/flops/main.py
@@ -109,12 +109,11 @@ def main():

     log, monitor = setupvoir()

-    # FIXME
-    #with monitor:
     f(args.number, args.repeat, args.m, args.n, TERA, dtypes[args.dtype], log)

     monitor.stop()

+
 if __name__ == "__main__":
     main()
     print("done")
diff --git a/benchmarks/flops/requirements.cpu.txt b/benchmarks/flops/requirements.cpu.txt
deleted file mode 100644
index 88f8b61e..00000000
--- a/benchmarks/flops/requirements.cpu.txt
+++ /dev/null
@@ -1,5 +0,0 @@
-torch
-torchvision
-torchcompat
-tqdm
-voir
diff --git a/benchmarks/flops/simple.sh b/benchmarks/flops/simple.sh
deleted file mode 100644
index 3f54d424..00000000
--- a/benchmarks/flops/simple.sh
+++ /dev/null
@@ -1,13 +0,0 @@
-
-
-
-
-export MILABENCH_BASE="$(pwd)/dev"
-export MILABENCH_CONFIG="$(pwd)/dev.yaml"
-
-
-milabench install --select fp32
-
-milabench prepare --select fp32
-
-milabench run --select fp32
diff --git a/benchmarks/geo_gnn/modelsize.py b/benchmarks/geo_gnn/modelsize.py
deleted file mode 100644
index 0b65655a..00000000
--- a/benchmarks/geo_gnn/modelsize.py
+++ /dev/null
@@ -1,36 +0,0 @@
-from torch_geometric.nn.models import PNA as _PNA, DimeNet as _DimeNet
-
-import torch
-
-from benchmate.models import model_size
-
-
-print(model_size(_DimeNet(
-    hidden_channels=64,
-    out_channels=1,
-    num_blocks=6,
-    num_bilinear=8,
-    num_spherical=7,
-    num_radial=6,
-    cutoff=10.0,
-    envelope_exponent=5,
-    num_before_skip=1,
-    num_after_skip=2,
-    num_output_layers=3,
-)
-
-))
-
-print(model_size(
-_PNA(
-    # Basic GCNN setup
-    in_channels=1,
-    out_channels=1,
-    hidden_channels=64,
-    num_layers=64,
-    # https://pytorch-geometric.readthedocs.io/en/latest/generated/torch_geometric.nn.conv.PNAConv.html
-    aggregators=['mean', 'min', 'max', 'std'],
-    scalers=['identity', 'amplification', 'attenuation'],
-    # Histogram of in-degrees of nodes in the training set, used by scalers to normalize
-    deg=torch.tensor(4),
-)))
\ No newline at end of file
diff --git a/benchmarks/purejaxrl/dqn.py b/benchmarks/purejaxrl/dqn.py
index 85e9e8b5..fc0a97b8 100644
--- a/benchmarks/purejaxrl/dqn.py
+++ b/benchmarks/purejaxrl/dqn.py
@@ -98,11 +98,6 @@ def train(rng):
         init_x = jnp.zeros(env.observation_space(env_params).shape)
         network_params = network.init(_rng, init_x)
-
-        param_count = sum(x.size for x in jax.tree.leaves(network_params))
-        print("PARAM COUNT", param_count)
-
-
         def linear_schedule(count):
             frac = 1.0 - (count / config["NUM_UPDATES"])
             return config["LR"] * frac
diff --git a/benchmarks/purejaxrl/ppo.py b/benchmarks/purejaxrl/ppo.py
index a70e195e..0cc8896c 100644
--- a/benchmarks/purejaxrl/ppo.py
+++ b/benchmarks/purejaxrl/ppo.py
@@ -107,15 +107,9 @@ def train(rng):
         network = ActorCritic(
             env.action_space(env_params).shape[0], activation=config["ACTIVATION"]
         )
-
-
         rng, _rng = jax.random.split(rng)
         init_x = jnp.zeros(env.observation_space(env_params).shape)
         network_params = network.init(_rng, init_x)
-
-        param_count = sum(x.size for x in jax.tree.leaves(network_params))
-        print("PARAM COUNT", param_count)
-
         if config["ANNEAL_LR"]:
             tx = optax.chain(
                 optax.clip_by_global_norm(config["MAX_GRAD_NORM"]),
diff --git a/benchmarks/recursiongfn/main.py b/benchmarks/recursiongfn/main.py
index 7099247d..81d08e8a 100644
--- a/benchmarks/recursiongfn/main.py
+++ b/benchmarks/recursiongfn/main.py
@@ -92,13 +92,11 @@ def __init__(
         self.num_cond_dim = self.temperature_conditional.encoding_size()

     def _load_task_models(self):
-        xdg_cache = os.environ.get("XDG_CACHE_HOME")
+        xdg_cache = os.environ["XDG_CACHE_HOME"]
         model = bengio2021flow.load_original_model(
             cache=True,
             location=Path(os.path.join(xdg_cache, "bengio2021flow_proxy.pkl.gz")),
         )
-        from benchmate.models import model_size
-        print(model_size(model))
         model.to(get_worker_device())
         model = self._wrap_model(model)
         return {"seh": model}
diff --git a/benchmarks/torchatari/main.py b/benchmarks/torchatari/main.py
index 898a8cab..bf5b7ef6 100644
--- a/benchmarks/torchatari/main.py
+++ b/benchmarks/torchatari/main.py
@@ -201,10 +201,7 @@ def main():
         envs = RecordEpisodeStatistics(envs)
         assert isinstance(envs.action_space, gym.spaces.Discrete), "only discrete action space is supported"

-
-    from benchmate.models import model_size
     agent = Agent(envs).to(device)
-    print(model_size(agent))
     optimizer = optim.Adam(agent.parameters(), lr=args.learning_rate, eps=1e-5)

     # ALGO Logic: Storage setup
diff --git a/benchmate/benchmate/models.py b/benchmate/benchmate/models.py
deleted file mode 100644
index efd13e2c..00000000
--- a/benchmate/benchmate/models.py
+++ /dev/null
@@ -1,36 +0,0 @@
-
-
-def model_summary(model, input_shape):
-    try:
-        from torchsummary import summary
-
-        summary(model, input_shape)
-    except:
-        print("Could not print summary")
-
-
-def model_size(model):
-    param_size = 0
-    param_count = 0
-    for param in model.parameters():
-        param_count += param.nelement()
-        param_size += param.nelement() * param.element_size()
-
-    buffer_size = 0
-    buffer_count = 0
-    for buff in model.buffers():
-        buffer_count += buff.nelement()
-        buffer_size += buff.nelement() * buff.element_size()
-
-    return {
-        "param": {
-            "count": param_count,
-            "size": param_size / 1024**2,
-            "unit": "MB"
-        },
-        "buffer": {
-            "count": buffer_count,
-            "size": buffer_size / 1024**2,
-            "unit": "MB"
-        }
-    }
diff --git a/benchmate/benchmate/monitor.py b/benchmate/benchmate/monitor.py
index 294d0c88..0ad34a3d 100644
--- a/benchmate/benchmate/monitor.py
+++ b/benchmate/benchmate/monitor.py
@@ -13,30 +13,7 @@
 from voir.instruments.io import io_monitor
 from voir.instruments.network import network_monitor
 from voir.instruments.monitor import monitor
-from voir.helpers import current_overseer
-
-from .metrics import sumggle_push, give_push, file_push
-
-
-def auto_push():
-    # use_stdout = int(os.getenv("MILABENCH_USE_STDOUT", 0))
-    mb_managed = int(os.getenv("MILABENCH_MANAGED", 0))
-
-    # Milabench managed: we need to push metrics to it
-    if mb_managed == 1:
-        # Using voir, DATA_FD is defined as well
-        ov = current_overseer.get()
-        if ov is not None:
-            return ov.give
-
-        # Not using Voir, using structured stdout
-        if int(os.getenv("MILABENCH_USE_STDOUT", 0)) == 1:
-            return sumggle_push()
-
-        raise RuntimeError("Could not find something to push to")
-
-    # Not using milabench; using stdout
-    return file_push()


 @instrument_definition
@@ -64,10 +41,16 @@ def monitor_node(ov, poll_interval=1, arch=None):


 def _smuggle_monitor(poll_interval=10, worker_init=None, **monitors):
-    log = auto_push()
-
+    data_file = SmuggleWriter(sys.stdout)
     def mblog(data):
-        log(**data)
+        nonlocal data_file
+
+        if data_file is not None:
+            try:
+                print(json.dumps(data), file=data_file)
+            except ValueError:
+                pass
+            # print("Is bench ending?, ignoring ValueError")

     def get():
         t = time.time()
diff --git a/milabench/_version.py b/milabench/_version.py
index 5f55a16d..cdd2418d 100644
--- a/milabench/_version.py
+++ b/milabench/_version.py
@@ -1,5 +1,5 @@
 """This file is generated, do not modify"""

-__tag__ = "v0.1.0-129-ga60a3aa"
-__commit__ = "a60a3aae21e87e46bcce403620a3f56c12878554"
-__date__ = "2024-11-06 22:52:12 -0500"
+__tag__ = "v1.0.0_RC1-9-g6d1e1140"
+__commit__ = "6d1e114000cc4200ea307330032234db6696e40d"
+__date__ = "2024-09-30 14:39:43 -0400"
diff --git a/milabench/commands/executors.py b/milabench/commands/executors.py
index 807a261e..f0402d29 100644
--- a/milabench/commands/executors.py
+++ b/milabench/commands/executors.py
@@ -32,9 +32,6 @@ async def execute(pack, *args, cwd=None, env={}, external=False, use_stdout=Fals
     sized_args = scale_argv(pack, args)
     final_args = resolve_argv(pack, sized_args)

-    if use_stdout:
-        exec_env["MILABENCH_USE_STDOUT"] = "1"
-
     return await run(
         final_args,
         **kwargs,
diff --git a/milabench/pack.py b/milabench/pack.py
index 20feca39..1cdde093 100644
--- a/milabench/pack.py
+++ b/milabench/pack.py
@@ -335,8 +335,6 @@ def make_env(self):
             f"MILABENCH_DIR_{name.upper()}": path
             for name, path in self.config["dirs"].items()
         }
-
-        env["MILABENCH_MANAGED"] = "1"

         env["OMP_NUM_THREADS"] = resolve_placeholder(self, "{cpu_per_gpu}")
diff --git a/milabench/report.py b/milabench/report.py
index bdc4999c..c54ed8dd 100644
--- a/milabench/report.py
+++ b/milabench/report.py
@@ -525,12 +525,12 @@ def pandas_to_string(df, formatters=_formatters):
     # Compute column size
     col_size = defaultdict(int)
     for index, row in df.iterrows():
-        col_size["bench"] = max(col_size["bench"], len(index), len("bench"))
+        col_size["bench"] = max(col_size["bench"], len(index))
         for col, val in zip(columns, row):
             fmt = formatters.get(col)
             if fmt is not None:
                 val = fmt(val)
-            col_size[col] = max(col_size[col], len(val), len(col))
+            col_size[col] = max(col_size[col], len(val))

     # Generate report
     sep = " | "

From 3d7d5f108d17a9ee8381b51eb0380964acf1e340 Mon Sep 17 00:00:00 2001
From: Pierre Delaunay
Date: Thu, 21 Nov 2024 13:02:00 -0500
Subject: [PATCH 08/20] Tweaks

---
 benchmarks/flops/benchfile.py               |  8 +++-----
 benchmarks/flops/dev/extra/flops/mark_torch |  0
 benchmate/benchmate/monitor.py              | 13 +++++++++----
 milabench/commands/executors.py             |  3 +++
 milabench/report.py                         |  4 ++--
 5 files changed, 17 insertions(+), 11 deletions(-)
 delete mode 100644 benchmarks/flops/dev/extra/flops/mark_torch

diff --git a/benchmarks/flops/benchfile.py b/benchmarks/flops/benchfile.py
index 9117caf0..3090a3c7 100644
--- a/benchmarks/flops/benchfile.py
+++ b/benchmarks/flops/benchfile.py
@@ -8,16 +8,14 @@ class FlopsBenchmarch(Package):
     def build_run_plan(self) -> "Command":
         import milabench.commands as cmd
-
+        main = self.dirs.code / self.main_script
+        pack = cmd.PackCommand(self, *self.argv, lazy=True)
+
         use_stdout = True

         if use_stdout:
-            main = self.dirs.code / self.main_script
-            pack = cmd.PackCommand(self, *self.argv, lazy=True)
             return pack.use_stdout()
         else:
-            main = self.dirs.code / self.main_script
-            pack = cmd.PackCommand(self, *self.argv, lazy=True)
             pack = cmd.VoirCommand(pack, cwd=main.parent)
             return pack
diff --git a/benchmarks/flops/dev/extra/flops/mark_torch b/benchmarks/flops/dev/extra/flops/mark_torch
deleted file mode 100644
index e69de29b..00000000
diff --git a/benchmate/benchmate/monitor.py b/benchmate/benchmate/monitor.py
index 5c63796e..294d0c88 100644
--- a/benchmate/benchmate/monitor.py
+++ b/benchmate/benchmate/monitor.py
@@ -19,16 +19,21 @@


 def auto_push():
+    # use_stdout = int(os.getenv("MILABENCH_USE_STDOUT", 0))
+    mb_managed = int(os.getenv("MILABENCH_MANAGED", 0))
+
     # Milabench managed: we need to push metrics to it
-    if int(os.getenv("MILABENCH_MANAGED", 0)) == 1:
-
+    if mb_managed == 1:
         # Using voir, DATA_FD is defined as well
         ov = current_overseer.get()
         if ov is not None:
             return ov.give
-
+
         # Not using Voir, using structured stdout
-        return sumggle_push()
+        if int(os.getenv("MILABENCH_USE_STDOUT", 0)) == 1:
+            return sumggle_push()
+
+        raise RuntimeError("Could not find something to push to")

     # Not using milabench; using stdout
     return file_push()
diff --git a/milabench/commands/executors.py b/milabench/commands/executors.py
index f0402d29..807a261e 100644
--- a/milabench/commands/executors.py
+++ b/milabench/commands/executors.py
@@ -32,6 +32,9 @@ async def execute(pack, *args, cwd=None, env={}, external=False, use_stdout=Fals
     sized_args = scale_argv(pack, args)
     final_args = resolve_argv(pack, sized_args)

+    if use_stdout:
+        exec_env["MILABENCH_USE_STDOUT"] = "1"
+
     return await run(
         final_args,
         **kwargs,
diff --git a/milabench/report.py b/milabench/report.py
index c54ed8dd..bdc4999c 100644
--- a/milabench/report.py
+++ b/milabench/report.py
@@ -525,12 +525,12 @@ def pandas_to_string(df, formatters=_formatters):
     # Compute column size
     col_size = defaultdict(int)
     for index, row in df.iterrows():
-        col_size["bench"] = max(col_size["bench"], len(index))
+        col_size["bench"] = max(col_size["bench"], len(index), len("bench"))
         for col, val in zip(columns, row):
             fmt = formatters.get(col)
             if fmt is not None:
                 val = fmt(val)
-            col_size[col] = max(col_size[col], len(val))
+            col_size[col] = max(col_size[col], len(val), len(col))

     # Generate report
     sep = " | "

From 798a186891a466db96047bbbd6ee38076343175c Mon Sep 17 00:00:00 2001
From: Setepenre
Date: Thu, 21 Nov 2024 13:11:12 -0500
Subject: [PATCH 09/20] Hpu (#292)

* HPU changes

* HPU pins

* Use HPU device

* Gaudi Tweaks

* Remove pinnning

* call prepare again

* Add docker image for HPU

* Tweaks

* -
* -
* -
* -
* -
* -
* -
* -
* -
* -
* -
* -
* -
* -
* -
* -
* -
* -
* -
* -
* -
* -
* -
* -
* -
* -
* -
* -
* -
* -
* -

---------

Co-authored-by: Your Name
---
 .pin/constraints-hpu-torch.txt                  | 861 +++++++++++++-----
 benchmarks/brax/requirements.hpu.txt            | 108 +--
 benchmarks/diffusion/requirements.hpu.txt       | 381 ++++++++
 benchmarks/dinov2/requirements.hpu.txt          | 267 ++++++
 benchmarks/flops/requirements.hpu.txt           |  47 +-
 benchmarks/geo_gnn/requirements-pre.hpu.txt     |  99 ++
 benchmarks/geo_gnn/requirements.hpu.txt         | 321 +++++++
 benchmarks/huggingface/requirements.hpu.txt     |  62 +-
 benchmarks/lightning/main.py                    |  22 +-
 benchmarks/lightning/requirements.hpu.txt       | 285 ++++++
 benchmarks/llama/requirements.hpu.txt           |  88 +-
 benchmarks/llava/benchfile.py                   |   4 +-
 benchmarks/llava/main.py                        |   6 +-
 benchmarks/llava/requirements.hpu.txt           | 343 +++++++
 benchmarks/llm/configs/llama3_70B_full.yaml     |   8 +-
 .../llm/recipes/full_finetune_distributed.py    |  44 +-
 .../recipes/full_finetune_single_device.py      |  12 +-
 .../llm/recipes/lora_finetune_distributed.py    |  27 +-
 .../recipes/lora_finetune_single_device.py      |  31 +-
 .../ppo_full_finetune_single_device.py          |  10 +-
 benchmarks/llm/requirements.hpu.txt             | 408 +++++++++
 benchmarks/purejaxrl/requirements.hpu.txt       | 743 +++++++++++++++
 benchmarks/recursiongfn/requirements.hpu.txt    | 493 ++++++++++
 benchmarks/rlhf/main.py                         |  11 +
 benchmarks/rlhf/requirements.hpu.txt            | 362 ++++++++
 benchmarks/timm/requirements.hpu.txt            |  55 +-
 benchmarks/torchatari/requirements.hpu.txt      | 304 +++++++
 benchmarks/torchvision/requirements.hpu.txt     |  47 +-
 benchmarks/torchvision_ddp/requirements.hpu.txt | 205 +++++
 benchmarks/vjepa/benchfile.py                   |   4 +-
 benchmarks/vjepa/main.py                        |  15 +-
 benchmarks/vjepa/requirements.hpu.txt           | 297 ++++++
 config/base.yaml                                |  22 +-
 constraints/extra/torch.hpu.txt                 |   5 -
 constraints/hpu.txt                             |  16 +-
 docker/Dockerfile-hpu                           |  42 +
 docker/Makefile                                 |  17 +
 milabench/_version.py                           |   6 +-
 milabench/remote.py                             |   2 +-
 milabench/system.py                             |   2 +-
 scripts/article/run_hpu.sh                      |  84 +-
 41 files changed, 5584 insertions(+), 582 deletions(-)
 create mode 100644 benchmarks/diffusion/requirements.hpu.txt
 create mode 100644 benchmarks/dinov2/requirements.hpu.txt
 create mode 100644 benchmarks/geo_gnn/requirements-pre.hpu.txt
 create mode 100644 benchmarks/geo_gnn/requirements.hpu.txt
 create mode 100644 benchmarks/lightning/requirements.hpu.txt
 create mode 100644 benchmarks/llava/requirements.hpu.txt
 create mode 100644 benchmarks/llm/requirements.hpu.txt
 create mode 100644 benchmarks/purejaxrl/requirements.hpu.txt
 create mode 100644 benchmarks/recursiongfn/requirements.hpu.txt
 create mode 100644 benchmarks/rlhf/requirements.hpu.txt
 create mode 100644 benchmarks/torchatari/requirements.hpu.txt
 create mode 100644 benchmarks/vjepa/requirements.hpu.txt
 create mode 100644 docker/Dockerfile-hpu
 create mode 100644 docker/Makefile

diff --git a/.pin/constraints-hpu-torch.txt b/.pin/constraints-hpu-torch.txt
index 6481e8c6..92a55858 100644
--- a/.pin/constraints-hpu-torch.txt
+++ b/.pin/constraints-hpu-torch.txt
@@ -2,204 +2,359 @@
 # This file is autogenerated by pip-compile with Python 3.10
 # by the following command:
 #
-#    pip-compile --output-file=.pin/constraints-hpu-torch.txt .pin/tmp-constraints.txt benchmarks/accelerate_opt/requirements.in benchmarks/brax/requirements.in benchmarks/dlrm/requirements.in benchmarks/flops/requirements.in benchmarks/huggingface/requirements.in benchmarks/llama/requirements.in benchmarks/stargan/requirements.in benchmarks/super-slomo/requirements.in benchmarks/timm/requirements.in benchmarks/torchvision/requirements.in benchmarks/torchvision_ddp/requirements.in
+#    pip-compile --output-file=.pin/constraints-hpu-torch.txt .pin/tmp-constraints.txt benchmarks/brax/requirements.in benchmarks/diffusion/requirements.in benchmarks/dinov2/requirements.in benchmarks/flops/requirements.in benchmarks/geo_gnn/requirements-pre.in benchmarks/geo_gnn/requirements.in benchmarks/huggingface/requirements.in benchmarks/lightning/requirements.in benchmarks/llama/requirements.in benchmarks/llava/requirements.in benchmarks/llm/requirements.in benchmarks/purejaxrl/requirements.in benchmarks/recursiongfn/requirements.in benchmarks/rlhf/requirements.in benchmarks/timm/requirements.in benchmarks/torchatari/requirements.in benchmarks/torchvision/requirements.in benchmarks/torchvision_ddp/requirements.in benchmarks/vjepa/requirements.in constraints/extra/torch.hpu.txt
 #
---extra-index-url https://pypi.ngc.nvidia.com
---find-links https://storage.googleapis.com/jax-releases/jax_cuda_releases.html
---trusted-host pypi.ngc.nvidia.com
-
 absl-py==2.1.0
     # via
    #   brax
    #   chex
+    #   distrax
    #   dm-env
    #   ml-collections
    #   mujoco
    #   mujoco-mjx
    #   optax
    #   orbax-checkpoint
+    #   rlax
    #   tensorboard
-accelerate==0.32.1
-    # via -r benchmarks/accelerate_opt/requirements.in
-aiohttp==3.9.5
+    #   tensorflow-probability
+accelerate==0.34.2
+    # via
+    #   -r benchmarks/diffusion/requirements.in
+    #   -r benchmarks/llava/requirements.in
+    #   -r benchmarks/llm/requirements.in
+    #   -r benchmarks/rlhf/requirements.in
+    #   diffusers
+    #   trl
+aiohappyeyeballs==2.4.3
+    # via aiohttp
+aiohttp==3.10.8
     # via
    #   datasets
    #   fsspec
+    #   torch-geometric
 aiosignal==1.3.1
     # via aiohttp
-annotated-types==0.7.0
-    # via pydantic
 antlr4-python3-runtime==4.9.3
     # via omegaconf
+appdirs==1.4.4
+    # via cantilever
+argklass==1.4.4
+    # via
+    #   -r benchmarks/diffusion/requirements.in
+    #   -r benchmarks/llm/requirements.in
+    #   -r benchmarks/purejaxrl/requirements.in
+astroid==3.3.4
+    # via pylint
 asttokens==2.4.1
     # via giving
 async-timeout==4.0.3
     # via aiohttp
-attrs==23.2.0
+attrs==24.2.0
     # via aiohttp
-beautifulsoup4==4.12.3
-    # via gdown
+beartype==0.19.0
+    # via -r benchmarks/vjepa/requirements.in
+black==24.8.0
+    # via navix
 blinker==1.8.2
     # via flask
+blobfile==3.0.0
+    # via
+    #   -r benchmarks/llm/requirements.txt
+    #   torchtune
+blosc2==2.7.1
+    # via tables
+botorch==0.12.0
+    # via -r benchmarks/recursiongfn/requirements.in
+braceexpand==0.1.7
+    # via
+    #   -r benchmarks/vjepa/requirements.in
+    #   webdataset
 brax==0.10.5
-    # via -r benchmarks/brax/requirements.in
-certifi==2024.6.2
-    # via requests
+    # via
+    #   -r benchmarks/brax/requirements.in
+    #   -r benchmarks/purejaxrl/requirements.in
+cantilever==0.1.0
+    # via -r benchmarks/torchatari/requirements.in
+certifi==2024.8.30
+    # via
+    #   requests
+    #   sentry-sdk
 charset-normalizer==3.3.2
     # via requests
-chex==0.1.86
-    # via optax
+chex==0.1.87
+    # via
+    #   distrax
+    #   evosax
+    #   flashbax
+    #   gymnax
+    #   optax
+    #   rlax
 click==8.1.7
-    # via flask
+    # via
+    #   black
+    #   flask
+    #   wandb
 cloudpickle==3.0.0
-    # via gym
-codefind==0.1.6
+    # via
+    #   gym
+    #   gymnasium
+    #   submitit
+    #   tensorflow-probability
+codefind==0.1.7
     # via ptera
 contextlib2==21.6.0
     # via ml-collections
-datasets==2.20.0
-    # via
-    #   -r benchmarks/accelerate_opt/requirements.in
+contourpy==1.3.0
+    # via matplotlib
+cvxopt==1.3.2
+    # via -r benchmarks/recursiongfn/requirements.in
+cycler==0.12.1
+    # via matplotlib
+datasets==3.0.1
+    # via
+    #   -r benchmarks/diffusion/requirements.in
    #   -r benchmarks/llama/requirements.in
-    #   evaluate
-deepspeed==0.14.4
-    # via -r benchmarks/accelerate_opt/requirements.in
+    #   -r benchmarks/llava/requirements.in
+    #   -r benchmarks/rlhf/requirements.in
+    #   torchtune
+    #   trl
+decorator==5.1.1
+    # via tensorflow-probability
+decord==0.6.0
+    # via -r benchmarks/vjepa/requirements.in
+diffusers[torch]==0.30.3
+    # via -r benchmarks/diffusion/requirements.in
 dill==0.3.8
     # via
    #   datasets
-    #   evaluate
    #   multiprocess
+    #   pylint
+distrax==0.1.5
+    # via
+    #   -r benchmarks/purejaxrl/requirements.in
+    #   rlax
 dm-env==1.6
-    # via brax
+    # via
+    #   brax
+    #   envpool
+    #   rlax
 dm-tree==0.1.8
-    # via dm-env
-docker==7.1.0
-    # via torchx
+    # via
+    #   dm-env
+    #   tensorflow-probability
+docker-pycreds==0.4.0
+    # via wandb
 docstring-parser==0.16
-    # via torchx
-etils[epath,epy]==1.7.0
+    # via tyro
+dotmap==1.3.30
+    # via evosax
+einops==0.8.0
+    # via -r benchmarks/vjepa/requirements.in
+envpool==0.8.4
+    # via -r benchmarks/torchatari/requirements.in
+etils[epath,epy]==1.9.4
     # via
    #   brax
    #   mujoco
    #   mujoco-mjx
    #   optax
    #   orbax-checkpoint
-evaluate==0.4.2
-    # via -r benchmarks/accelerate_opt/requirements.in
-executing==1.2.0
+evosax==0.1.6
+    # via -r benchmarks/purejaxrl/requirements.in
+exceptiongroup==1.2.2
+    # via pytest
+executing==2.1.0
     # via varname
 fairscale==0.4.13
-    # via -r benchmarks/llama/requirements.in
-fbgemm-gpu==0.7.0
-    # via torchrec
-filelock==3.15.4
     # via
+    #   -r benchmarks/llama/requirements.in
+    #   -r benchmarks/llm/requirements.in
+    #   -r benchmarks/llm/requirements.txt
+farama-notifications==0.0.4
+    # via gymnasium
+filelock==3.16.1
+    # via
+    #   blobfile
    #   datasets
-    #   gdown
+    #   diffusers
    #   huggingface-hub
    #   torch
-    #   torchx
    #   transformers
    #   triton
-fire==0.6.0
-    # via -r benchmarks/llama/requirements.in
+fire==0.7.0
+    # via
+    #   -r benchmarks/llama/requirements.in
+    #   -r benchmarks/llm/requirements.txt
+flake8==7.1.1
+    # via navix
+flashbax==0.1.2
+    # via -r benchmarks/purejaxrl/requirements.in
 flask==3.0.3
     # via
    #   brax
    #   flask-cors
-flask-cors==4.0.1
-    # via brax
-flax==0.8.5
+flask-cors==5.0.0
     # via brax
+flax==0.9.0
+    # via
+    #   -r benchmarks/purejaxrl/requirements.in
+    #   brax
+    #   evosax
+    #   flashbax
+    #   gymnax
+    #   navix
+fonttools==4.54.1
+    # via matplotlib
 frozenlist==1.4.1
     # via
    #   aiohttp
    #   aiosignal
-fsspec[http]==2024.5.0
+fsspec[http]==2024.6.1
     # via
    #   datasets
    #   etils
-    #   evaluate
    #   huggingface-hub
+    #   lightning
+    #   pytorch-lightning
    #   torch
-    #   torchx
-future==1.0.0
-    # via -r benchmarks/dlrm/requirements.in
-gdown==5.2.0
-    # via -r benchmarks/stargan/requirements.in
-giving==0.4.2
+    #   torch-geometric
+fvcore==0.1.5.post20221221
+    # via -r benchmarks/dinov2/requirements.in
+gast==0.6.0
+    # via tensorflow-probability
+gitdb==4.0.11
+    # via gitpython
+gitpython==3.1.43
+    # via
+    #   -r benchmarks/recursiongfn/requirements.in
+    #   wandb
+giving==0.4.3
     # via
    #   ptera
    #   voir
 glfw==2.7.0
     # via mujoco
-graphviz==0.20.3
-    # via torchviz
-grpcio==1.65.1
+gpytorch==1.13
+    # via
+    #   -r benchmarks/recursiongfn/requirements.in
+    #   botorch
+grpcio==1.66.2
     # via
    #   brax
    #   tensorboard
 gym==0.26.2
-    # via brax
+    # via
+    #   -r benchmarks/torchatari/requirements.in
+    #   brax
+    #   envpool
+    #   gymnax
 gym-notices==0.0.8
     # via gym
+gymnasium==0.29.1
+    # via
+    #   envpool
+    #   gymnax
+gymnax==0.0.8
+    # via
+    #   -c .pin/../constraints/hpu.txt
+    #   -r benchmarks/purejaxrl/requirements.in
 hjson==3.1.0
-    # via deepspeed
-huggingface-hub==0.24.0
+    # via argklass
+huggingface-hub==0.25.1
     # via
    #   -r benchmarks/timm/requirements.in
    #   accelerate
    #   datasets
-    #   evaluate
+    #   diffusers
+    #   timm
    #   tokenizers
+    #   torchtune
    #   transformers
-idna==3.7
+humanize==4.10.0
+    # via orbax-checkpoint
+idna==3.10
     # via
    #   requests
    #   yarl
-importlib-metadata==8.0.0
-    # via torchx
-importlib-resources==6.4.0
+importlib-metadata==8.5.0
+    # via diffusers
+importlib-resources==6.4.5
     # via
+    #   argklass
+    #   cantilever
    #   etils
    #   torchcompat
+iniconfig==2.0.0
+    # via pytest
+iopath==0.1.10
+    # via
+    #   -r benchmarks/dinov2/requirements.in
+    #   fvcore
+isort==5.13.2
+    # via pylint
 itsdangerous==2.2.0
     # via flask
-jax[cuda12]==0.4.28
+jax==0.4.33
     # via
    #   -r benchmarks/brax/requirements.in
+    #   -r benchmarks/purejaxrl/requirements.in
    #   brax
    #   chex
+    #   distrax
+    #   evosax
+    #   flashbax
    #   flax
+    #   gymnax
    #   jaxopt
    #   mujoco-mjx
    #   optax
    #   orbax-checkpoint
-jax-cuda12-pjrt==0.4.28
-    # via jax-cuda12-plugin
-jax-cuda12-plugin==0.4.28
-    # via jax
-jaxlib==0.4.28+cuda12.cudnn89
+    #   rlax
+jaxlib==0.4.33
     # via
    #   brax
    #   chex
+    #   distrax
+    #   evosax
+    #   flashbax
+    #   gymnax
    #   jax
    #   jaxopt
    #   mujoco-mjx
    #   optax
    #   orbax-checkpoint
+    #   rlax
 jaxopt==0.8.3
     # via brax
+jaxtyping==0.2.19
+    # via
+    #   gpytorch
+    #   linear-operator
 jinja2==3.1.4
     # via
    #   brax
    #   flask
    #   torch
+    #   torch-geometric
 joblib==1.4.2
     # via scikit-learn
-lightning-utilities==0.11.5
-    # via torchmetrics
-markdown==3.6
+kiwisolver==1.4.7
+    # via matplotlib
+lightning==2.4.0
+    # via -r benchmarks/lightning/requirements.in
+lightning-utilities==0.11.7
+    # via
+    #   lightning
+    #   pytorch-lightning
+    #   torchmetrics
+linear-operator==0.5.3
+    # via
+    #   botorch
+    #   gpytorch
+lxml==5.3.0
+    # via blobfile
+markdown==3.7
     # via tensorboard
 markdown-it-py==3.0.0
     # via rich
 markupsafe==2.1.5
     # via
    #   jinja2
    #   werkzeug
+matplotlib==3.9.2
+    # via
+    #   evosax
+    #   gymnax
+    #   seaborn
+mccabe==0.7.0
+    # via
+    #   flake8
+    #   pylint
 mdurl==0.1.2
     # via markdown-it-py
 ml-collections==0.1.1
     # via brax
-ml-dtypes==0.4.0
+ml-dtypes==0.5.0
     # via
    #   jax
    #   jaxlib
    #   tensorstore
 mpmath==1.3.0
-    # via sympy
-msgpack==1.0.8
     # via
+    #   botorch
+    #   gpytorch
+    #   linear-operator
+    #   sympy
+msgpack==1.1.0
+    # via
+    #   blosc2
    #   flax
    #   orbax-checkpoint
-mujoco==3.2.0
+mujoco==3.2.3
     # via
    #   brax
    #   mujoco-mjx
-mujoco-mjx==3.2.0
+mujoco-mjx==3.2.3
     # via brax
-multidict==6.0.5
+multidict==6.1.0
     # via
    #   aiohttp
    #   yarl
+multipledispatch==1.0.0
+    # via botorch
 multiprocess==0.70.16
-    # via
-    #   datasets
-    #   evaluate
+    # via datasets
 mypy-extensions==1.0.0
-    # via typing-inspect
+    # via black
+navix==0.7.0
+    # via -r benchmarks/purejaxrl/requirements.in
+ndindex==1.9.2
+    # via blosc2
 nest-asyncio==1.6.0
     # via orbax-checkpoint
 networkx==3.3
-    # via torch
-ninja==1.11.1.1
-    # via deepspeed
+    # via
+    #   -r benchmarks/recursiongfn/requirements.in
+    #   torch
+numexpr==2.10.1
+    # via
+    #   blosc2
+    #   tables
 numpy==1.26.4
     # via
-    #   -r benchmarks/dlrm/requirements.in
-    #   -r benchmarks/stargan/requirements.in
-    #   -r benchmarks/super-slomo/requirements.in
+    #   -r benchmarks/geo_gnn/requirements.in
+    #   -r benchmarks/llava/requirements.in
+    #   -r benchmarks/purejaxrl/requirements.in
+    #   -r benchmarks/torchatari/requirements.in
+    #   -r benchmarks/vjepa/requirements.in
    #   accelerate
+    #   blosc2
    #   brax
    #   chex
+    #   contourpy
    #   datasets
-    #   deepspeed
+    #   decord
+    #   diffusers
+    #   distrax
    #   dm-env
-    #   evaluate
+    #   envpool
+    #   evosax
    #   fairscale
-    #   fbgemm-gpu
-    #   flax
+    #   flashbax
+    #   fvcore
    #   gym
+    #   gymnasium
    #   jax
    #   jaxlib
    #   jaxopt
+    #   jaxtyping
+    #   matplotlib
    #   ml-dtypes
    #   mujoco
-    #   onnx
+    #   navix
+    #   numexpr
    #   opencv-python
-    #   opt-einsum
    #   optax
    #   orbax-checkpoint
    #   pandas
    #   pyarrow
+    #   pyro-ppl
+    #   rdkit
+    #   rlax
    #   scikit-learn
    #   scipy
+    #   seaborn
+    #   tables
    #   tensorboard
    #   tensorboardx
+    #   tensorflow-probability
    #   tensorstore
+    #   torch-geometric
    #   torchmetrics
+    #   torchtune
    #   torchvision
    #   transformers
    #   trimesh
+    #   trl
+    #   webdataset
+    #   xformers
 nvidia-cublas-cu12==12.1.3.1
     # via
-    #   jax
    #   nvidia-cudnn-cu12
    #   nvidia-cusolver-cu12
    #   torch
 nvidia-cuda-cupti-cu12==12.1.105
-    # via
-    #   jax
-    #   torch
-nvidia-cuda-nvcc-cu12==12.5.82
-    # via
-    #   jax
-    #   jax-cuda12-plugin
+    # via torch
 nvidia-cuda-nvrtc-cu12==12.1.105
     # via torch
 nvidia-cuda-runtime-cu12==12.1.105
-    # via
-    #   jax
-    #   torch
-nvidia-cudnn-cu12==8.9.2.26
-    # via
-    #   jax
-    #   torch
+    # via torch
+nvidia-cudnn-cu12==9.1.0.70
+    # via torch
 nvidia-cufft-cu12==11.0.2.54
-    # via
-    #   jax
-    #   torch
+    # via torch
 nvidia-curand-cu12==10.3.2.106
     # via torch
 nvidia-cusolver-cu12==11.4.5.107
-    # via
-    #   jax
-    #   torch
+    # via torch
 nvidia-cusparse-cu12==12.1.0.106
     # via
-    #   jax
    #   nvidia-cusolver-cu12
    #   torch
-nvidia-ml-py==12.555.43
-    # via deepspeed
+nvidia-ml-py==12.560.30
+    # via voir
 nvidia-nccl-cu12==2.20.5
+    # via torch
+nvidia-nvjitlink-cu12==12.6.77
     # via
-    #   jax
-    #   torch
-nvidia-nvjitlink-cu12==12.5.82
-    # via
-    #   jax
    #   nvidia-cusolver-cu12
    #   nvidia-cusparse-cu12
 nvidia-nvtx-cu12==12.1.105
     # via torch
 omegaconf==2.3.0
-    # via voir
-onnx==1.16.1
-    # via -r benchmarks/dlrm/requirements.in
+    # via
+    #   -r benchmarks/dinov2/requirements.in
+    #   -r benchmarks/recursiongfn/requirements.in
+    #   torchtune
+    #   voir
 opencv-python==4.10.0.84
-    # via -r benchmarks/super-slomo/requirements.in
-opt-einsum==3.3.0
-    # via jax
+    # via -r benchmarks/vjepa/requirements.in
+opt-einsum==3.4.0
+    # via
+    #   jax
+    #   pyro-ppl
 optax==0.2.3
     # via
+    #   -r benchmarks/purejaxrl/requirements.in
    #   brax
    #   flax
-orbax-checkpoint==0.5.21
+optree==0.13.0
+    # via envpool
+orbax-checkpoint==0.6.4
     # via
    #   brax
    #   flax
-ovld==0.3.5
+ovld==0.3.9
     # via voir
 packaging==24.1
     # via
    #   accelerate
+    #   black
    #   datasets
-    #   deepspeed
-    #   evaluate
+    #   envpool
    #   huggingface-hub
+    #   lightning
    #   lightning-utilities
+    #   matplotlib
+    #   pytest
+    #   pytorch-lightning
+    #   setuptools-scm
+    #   tables
+    #   tensorboard
    #   tensorboardx
    #   torchmetrics
    #   transformers
-pandas==2.2.2
+pandas==2.2.3
     # via
+    #   -r benchmarks/geo_gnn/requirements.in
+    #   -r benchmarks/recursiongfn/requirements.in
+    #   -r benchmarks/vjepa/requirements.in
    #   datasets
-    #   evaluate
+    #   seaborn
+pathspec==0.12.1
+    # via black
 pillow==10.4.0
     # via
+    #   -r benchmarks/huggingface/requirements.in
+    #   -r benchmarks/llava/requirements.in
    #   brax
+    #   diffusers
+    #   fvcore
+    #   matplotlib
+    #   navix
+    #   rdkit
    #   torchvision
-protobuf==4.25.3
+platformdirs==4.3.6
+    # via
+    #   black
+    #   pylint
+    #   wandb
+pluggy==1.5.0
+    # via pytest
+portalocker==2.10.1
+    # via iopath
+protobuf==5.28.2
     # via
-    #   onnx
    #   orbax-checkpoint
    #   tensorboard
    #   tensorboardx
+    #   wandb
 psutil==5.9.8
     # via
    #   accelerate
-    #   deepspeed
+    #   torch-geometric
    #   voir
+    #   wandb
 ptera==1.4.1
     # via voir
 py-cpuinfo==9.0.0
-    # via deepspeed
+    # via
+    #   blosc2
+    #   tables
 pyarrow==17.0.0
-    # via datasets
-pyarrow-hotfix==0.6
-    # via datasets
-pydantic==2.7.4
-    # via deepspeed
-pydantic-core==2.18.4
-    # via pydantic
-pydot==3.0.1
-    # via -r benchmarks/dlrm/requirements.in
+    # via
+    #   -r benchmarks/recursiongfn/requirements.in
+    #   datasets
+pycodestyle==2.12.1
+    # via flake8
+pycryptodomex==3.21.0
+    # via blobfile
+pyflakes==3.2.0
+    # via flake8
 pygments==2.18.0
     # via rich
-pynvml==11.5.3
-    # via voir
+pylint==3.3.1
+    # via navix
 pyopengl==3.1.7
     # via mujoco
-pyparsing==3.1.2
-    # via pydot
-pyre-extensions==0.0.30
-    # via torchx
-pysocks==1.7.1
-    # via requests
+pyparsing==3.1.4
+    # via
+    #   matplotlib
+    #   torch-geometric
+pyro-api==0.1.2
+    # via pyro-ppl
+pyro-ppl==1.9.1
+    # via
+    #   -r benchmarks/recursiongfn/requirements.in
+    #   botorch
+pytest==8.3.3
+    # via navix
 python-dateutil==2.9.0.post0
-    # via pandas
+    # via
+    #   matplotlib
+    #   pandas
 pytinyrenderer==0.0.14
     # via brax
-pytz==2024.1
+pytorch-lightning==2.4.0
+    # via lightning
+pytz==2024.2
     # via pandas
-pyyaml==6.0.1
+pyyaml==6.0.2
     # via
+    #   -r benchmarks/llm/requirements.in
    #   -r benchmarks/timm/requirements.in
+    #   -r benchmarks/vjepa/requirements.in
    #   accelerate
    #   datasets
+    #   evosax
    #   flax
+    #   fvcore
+    #   gymnax
    #   huggingface-hub
+    #   lightning
    #   ml-collections
    #   omegaconf
    #   orbax-checkpoint
-    #   torchx
+    #   pytorch-lightning
+    #   timm
    #   transformers
+    #   wandb
+    #   webdataset
+    #   yacs
+rdkit==2024.3.5
+    # via
+    #   -r benchmarks/geo_gnn/requirements.in
+    #   -r benchmarks/recursiongfn/requirements.in
 reactivex==4.0.4
     # via giving
-regex==2024.5.15
-    # via transformers
-requests[socks]==2.32.3
+regex==2024.9.11
+    # via
+    #   diffusers
+    #   tiktoken
+    #   transformers
+requests==2.32.3
     # via
    #   datasets
-    #   docker
-    #   evaluate
-    #   gdown
+    #   diffusers
    #   huggingface-hub
+    #   tiktoken
+    #   torch-geometric
    #   transformers
-rich==13.7.1
+    #   wandb
+rich==13.9.1
     # via
-    #   -r benchmarks/accelerate_opt/requirements.in
    #   flax
+    #   tyro
    #   voir
-safetensors==0.4.3
+rlax==0.1.6
+    # via navix
+safetensors==0.4.5
     # via
    #   -r benchmarks/timm/requirements.in
    #   accelerate
+    #   diffusers
+    #   timm
+    #   torchtune
    #   transformers
-scikit-learn==1.5.1
-    # via -r benchmarks/dlrm/requirements.in
-scipy==1.14.0
+scikit-learn==1.5.2
+    # via gpytorch
+scipy==1.14.1
     # via
+    #   -r benchmarks/dinov2/requirements.in
+    #   -r benchmarks/recursiongfn/requirements.in
+    #   botorch
    #   brax
+    #   gpytorch
    #   jax
    #   jaxlib
    #   jaxopt
+    #   linear-operator
    #   mujoco-mjx
    #   scikit-learn
+    #   torch-cluster
+    #   torch-sparse
+seaborn==0.13.2
+    # via gymnax
 sentencepiece==0.2.0
-    # via -r benchmarks/llama/requirements.in
+    # via
+    #   -r benchmarks/llama/requirements.in
+    #   torchtune
+sentry-sdk==2.15.0
+    # via wandb
+setproctitle==1.3.3
+    # via wandb
+setuptools-scm==8.1.0
+    # via navix
+shtab==1.7.1
+    # via tyro
 six==1.16.0
     # via
    #   asttokens
-    #   fire
+    #   docker-pycreds
    #   ml-collections
    #   python-dateutil
    #   tensorboard
-soupsieve==2.5
-    # via beautifulsoup4
-sympy==1.13.0
+    #   tensorflow-probability
+smmap==5.0.1
+    # via gitdb
+submitit==1.5.2
+    # via
+    #   -r benchmarks/dinov2/requirements.in
+    #   -r benchmarks/vjepa/requirements.in
+sympy==1.13.3
     # via torch
+tables==3.10.1
+    # via -r benchmarks/recursiongfn/requirements.in
 tabulate==0.9.0
-    # via torchx
-tensorboard==2.17.0
-    # via -r benchmarks/dlrm/requirements.in
+    # via fvcore
+tensorboard==2.18.0
+    # via
+    #   -r benchmarks/recursiongfn/requirements.in
+    #   -r benchmarks/torchatari/requirements.in
 tensorboard-data-server==0.7.2
     # via tensorboard
 tensorboardx==2.6.2.2
     # via brax
-tensorstore==0.1.63
+tensorflow-probability==0.24.0
+    # via distrax
+tensorstore==0.1.66
     # via
+    #   flashbax
    #   flax
    #   orbax-checkpoint
 termcolor==2.4.0
-    # via fire
+    # via
+    #   fire
+    #   fvcore
 threadpoolctl==3.5.0
     # via scikit-learn
+tiktoken==0.7.0
+    # via torchtune
+timm==1.0.9
+    # via -r benchmarks/vjepa/requirements.in
 tokenizers==0.19.1
     # via transformers
+tomli==2.0.2
+    # via
+    #   black
+    #   pylint
+    #   pytest
+    #   setuptools-scm
+tomlkit==0.13.2
+    # via pylint
 toolz==0.12.1
     # via chex
-torch==2.3.1
+torch==2.4.1
     # via
-    #   -r benchmarks/accelerate_opt/requirements.in
    #   -r benchmarks/brax/requirements.in
-    #   -r benchmarks/dlrm/requirements.in
+    #   -r benchmarks/dinov2/requirements.in
    #   -r benchmarks/flops/requirements.in
+    #   -r benchmarks/geo_gnn/requirements-pre.in
    #   -r benchmarks/huggingface/requirements.in
+    #   -r benchmarks/lightning/requirements.in
    #   -r benchmarks/llama/requirements.in
-    #   -r benchmarks/stargan/requirements.in
-    #   -r benchmarks/super-slomo/requirements.in
+    #   -r benchmarks/llava/requirements.in
+    #   -r benchmarks/llm/requirements.in
+    #   -r benchmarks/llm/requirements.txt
+    #   -r benchmarks/purejaxrl/requirements.in
+    #   -r benchmarks/recursiongfn/requirements.in
+    #   -r benchmarks/rlhf/requirements.in
    #   -r benchmarks/timm/requirements.in
+    #   -r benchmarks/torchatari/requirements.in
    #   -r benchmarks/torchvision/requirements.in
    #   -r benchmarks/torchvision_ddp/requirements.in
+    #   -r benchmarks/vjepa/requirements.in
    #   accelerate
-    #   deepspeed
+    #   botorch
+    #   diffusers
    #   fairscale
-    #   torchaudio
+    #   lightning
+    #   linear-operator
+    #   pyro-ppl
+    #   pytorch-lightning
+    #   timm
    #   torchmetrics
    #   torchvision
-    #   torchviz
-torchaudio==2.3.1
-    # via -r benchmarks/accelerate_opt/requirements.in
+    #   trl
+    #   xformers
+torch-cluster==1.6.3
+    # via
+    #   -r benchmarks/geo_gnn/requirements.in
+    #   -r benchmarks/recursiongfn/requirements.in
+torch-geometric==2.6.1
+    # via
+    #   -r benchmarks/geo_gnn/requirements.in
+    #   -r benchmarks/recursiongfn/requirements.in
+torch-scatter==2.1.2
+    # via
+    #   -r benchmarks/geo_gnn/requirements.in
+    #   -r benchmarks/recursiongfn/requirements.in
+torch-sparse==0.6.18
+    # via
+    #   -r benchmarks/geo_gnn/requirements.in
+    #   -r benchmarks/recursiongfn/requirements.in
+torchao==0.3.1
+    # via
+    #   -c .pin/../constraints/hpu.txt
+    #   -r benchmarks/llm/requirements.in
+    #   torchtune
 torchcompat==1.1.4
     # via
    #   -c .pin/../constraints/hpu.txt
    #   -r benchmarks/flops/requirements.in
+    #   -r benchmarks/lightning/requirements.in
+    #   -r benchmarks/torchatari/requirements.in
    #   -r benchmarks/torchvision/requirements.in
    #   -r benchmarks/torchvision_ddp/requirements.in
-torchmetrics==1.0.3
-    # via torchrec
-torchrec==0.7.0
-    # via -r benchmarks/dlrm/requirements.in
-torchvision==0.18.1
+torchmetrics==1.4.2
+    # via
+    #   -r benchmarks/dinov2/requirements.in
+    #   lightning
+    #   pytorch-lightning
+torchtune==0.2.1
+    # via
+    #   -c .pin/../constraints/hpu.txt
+    #   -r benchmarks/llm/requirements.in
+torchvision==0.19.1
     # via
-    #   -r benchmarks/accelerate_opt/requirements.in
+    #   -r benchmarks/diffusion/requirements.in
+    #   -r benchmarks/dinov2/requirements.in
    #   -r benchmarks/flops/requirements.in
-    #   -r benchmarks/stargan/requirements.in
-    #   -r benchmarks/super-slomo/requirements.in
+    #   -r benchmarks/lightning/requirements.in
    #   -r benchmarks/timm/requirements.in
    #   -r benchmarks/torchvision/requirements.in
    #   -r benchmarks/torchvision_ddp/requirements.in
-torchviz==0.0.2
-    # via -r benchmarks/dlrm/requirements.in
-torchx==0.7.0
-    # via -r benchmarks/dlrm/requirements.in
-tqdm==4.66.4
+    #   -r benchmarks/vjepa/requirements.in
+    #   timm
+tqdm==4.66.5
     # via
-    #   -r benchmarks/dlrm/requirements.in
+    #   -r benchmarks/diffusion/requirements.in
    #   -r benchmarks/flops/requirements.in
-    #   -r benchmarks/super-slomo/requirements.in
    #   -r benchmarks/torchvision/requirements.in
    #   -r benchmarks/torchvision_ddp/requirements.in
    #   datasets
-    #   deepspeed
-    #   evaluate
-    #   gdown
+    #   fvcore
    #   huggingface-hub
-    #   torchrec
+    #   iopath
+    #   lightning
+    #   pyro-ppl
+    #   pytorch-lightning
+    #   torch-geometric
+    #   torchtune
    #   transformers
-transformers==4.42.4
+transformers==4.44.2
     # via
-    #   -r benchmarks/accelerate_opt/requirements.in
+    #   -c .pin/../constraints/hpu.txt
+    #   -r benchmarks/diffusion/requirements.in
    #   -r benchmarks/huggingface/requirements.in
    #   -r benchmarks/llama/requirements.in
-trimesh==4.4.3
+    #   -r benchmarks/llava/requirements.in
+    #   -r benchmarks/llm/requirements.in
+    #   -r benchmarks/rlhf/requirements.in
+    #   trl
+trimesh==4.4.9
     # via
    #   brax
    #   mujoco-mjx
-triton==2.3.1
+triton==3.0.0
     # via torch
+trl==0.10.1
+    # via
+    #   -c .pin/../constraints/hpu.txt
+    #   -r benchmarks/rlhf/requirements.in
+typeguard==4.3.0
+    # via jaxtyping
+types-protobuf==5.28.0.20240924
+    # via envpool
 typing-extensions==4.12.2
     # via
+    #   astroid
+    #   black
+    #   botorch
    #   brax
    #   chex
+    #   envpool
    #   etils
+    #   flashbax
    #   flax
+    #   gymnasium
    #   huggingface-hub
+    #   iopath
+    #   jaxtyping
+    #   lightning
    #   lightning-utilities
+    #   multidict
+    #   navix
+    #   optree
    #   orbax-checkpoint
-    #   pydantic
-    #   pydantic-core
-    #   pyre-extensions
+    #   pytorch-lightning
    #   reactivex
+    #   rich
+    #   submitit
+    #   tables
    #   torch
-    #   typing-inspect
-typing-inspect==0.9.0
-    # via pyre-extensions
-tzdata==2024.1
+    #   typeguard
+    #   tyro
+tyro==0.8.11
+    # via
+    #   -r benchmarks/torchatari/requirements.in
+    #   navix
+    #   trl
+tzdata==2024.2
     # via pandas
-urllib3==1.26.19
+urllib3==2.2.3
     # via
-    #   docker
+    #   blobfile
    #   requests
-    #   torchx
-varname==0.10.0
+    #   sentry-sdk
+varname==0.13.3
     # via giving
 voir==0.2.19
     # via
    #   -c .pin/../constraints/hpu.txt
-    #   -r benchmarks/accelerate_opt/requirements.in
    #   -r benchmarks/brax/requirements.in
-    #   -r benchmarks/dlrm/requirements.in
+    #   -r benchmarks/diffusion/requirements.in
+    #   -r benchmarks/dinov2/requirements.in
    #   -r benchmarks/flops/requirements.in
+    #   -r benchmarks/geo_gnn/requirements.in
    #   -r benchmarks/huggingface/requirements.in
+    #   -r benchmarks/lightning/requirements.in
    #   -r benchmarks/llama/requirements.in
-    #   -r benchmarks/stargan/requirements.in
-    #   -r benchmarks/super-slomo/requirements.in
+    #   -r benchmarks/llava/requirements.in
+    #   -r benchmarks/llm/requirements.in
+    #   -r benchmarks/purejaxrl/requirements.in
+    #   -r benchmarks/recursiongfn/requirements.in
+    #   -r benchmarks/rlhf/requirements.in
    #   -r benchmarks/timm/requirements.in
+    #   -r benchmarks/torchatari/requirements.in
    #   -r benchmarks/torchvision/requirements.in
    #   -r benchmarks/torchvision_ddp/requirements.in
-werkzeug==3.0.3
+    #   -r benchmarks/vjepa/requirements.in
+wandb==0.18.3
+    # via
+    #   -r benchmarks/recursiongfn/requirements.in
+    #   navix
+webdataset==0.2.100
+    # via -r benchmarks/vjepa/requirements.in
+werkzeug==3.0.4
     # via
    #   flask
    #   tensorboard
-xxhash==3.4.1
-    # via
-    #   datasets
-    #   evaluate
-yarl==1.9.4
+xformers==0.0.28.post1
+    # via -r benchmarks/dinov2/requirements.in
+xxhash==3.5.0
+    # via datasets
+yacs==0.1.8
+    # via fvcore
+yarl==1.13.1
     # via aiohttp
-zipp==3.19.2
+zipp==3.20.2
     # via
    #   etils
    #   importlib-metadata
diff --git a/benchmarks/brax/requirements.hpu.txt b/benchmarks/brax/requirements.hpu.txt
index cae1147c..b02ff745 100644
--- a/benchmarks/brax/requirements.hpu.txt
+++ b/benchmarks/brax/requirements.hpu.txt
@@ -4,10 +4,6 @@
 #
 #    pip-compile --output-file=benchmarks/brax/requirements.hpu.txt .pin/tmp-constraints-hpu-brax.txt benchmarks/brax/requirements.in
 #
---extra-index-url https://pypi.ngc.nvidia.com
---find-links https://storage.googleapis.com/jax-releases/jax_cuda_releases.html
---trusted-host pypi.ngc.nvidia.com
-
 absl-py==2.1.0
     # via
    #   -c .pin/../.pin/constraints-hpu-torch.txt
    #   brax
    #   chex
    #   dm-env
    #   ml-collections
    #   mujoco
    #   mujoco-mjx
    #   optax
    #   orbax-checkpoint
    #   tensorboard
 brax==0.10.5
     # via
    #   -c .pin/../.pin/constraints-hpu-torch.txt
    #   -r benchmarks/brax/requirements.in
-chex==0.1.86
+chex==0.1.87
     # via
    #   -c .pin/../.pin/constraints-hpu-torch.txt
    #   optax
 cloudpickle==3.0.0
     # via
    #   -c .pin/../.pin/constraints-hpu-torch.txt
    #   gym
-codefind==0.1.6
+codefind==0.1.7
     # via
    #   -c .pin/../.pin/constraints-hpu-torch.txt
    #   ptera
 dm-env==1.6
     # via
    #   -c .pin/../.pin/constraints-hpu-torch.txt
    #   brax
 dm-tree==0.1.8
     # via
    #   -c .pin/../.pin/constraints-hpu-torch.txt
    #   dm-env
-etils[epath,epy]==1.7.0
+etils[epath,epy]==1.9.4
     # via
    #   -c .pin/../.pin/constraints-hpu-torch.txt
    #   brax
    #   mujoco
    #   mujoco-mjx
    #   optax
    #   orbax-checkpoint
-executing==1.2.0
     # via
    #   -c
.pin/../.pin/constraints-hpu-torch.txt # varname -filelock==3.15.4 +filelock==3.16.1 # via # -c .pin/../.pin/constraints-hpu-torch.txt # torch @@ -85,20 +81,20 @@ flask==3.0.3 # -c .pin/../.pin/constraints-hpu-torch.txt # brax # flask-cors -flask-cors==4.0.1 +flask-cors==5.0.0 # via # -c .pin/../.pin/constraints-hpu-torch.txt # brax -flax==0.8.5 +flax==0.9.0 # via # -c .pin/../.pin/constraints-hpu-torch.txt # brax -fsspec==2024.5.0 +fsspec==2024.6.1 # via # -c .pin/../.pin/constraints-hpu-torch.txt # etils # torch -giving==0.4.2 +giving==0.4.3 # via # -c .pin/../.pin/constraints-hpu-torch.txt # ptera @@ -107,7 +103,7 @@ glfw==2.7.0 # via # -c .pin/../.pin/constraints-hpu-torch.txt # mujoco -grpcio==1.65.1 +grpcio==1.66.2 # via # -c .pin/../.pin/constraints-hpu-torch.txt # brax @@ -119,7 +115,11 @@ gym-notices==0.0.8 # via # -c .pin/../.pin/constraints-hpu-torch.txt # gym -importlib-resources==6.4.0 +humanize==4.10.0 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # orbax-checkpoint +importlib-resources==6.4.5 # via # -c .pin/../.pin/constraints-hpu-torch.txt # etils @@ -127,7 +127,7 @@ itsdangerous==2.2.0 # via # -c .pin/../.pin/constraints-hpu-torch.txt # flask -jax[cuda12]==0.4.28 +jax==0.4.33 # via # -c .pin/../.pin/constraints-hpu-torch.txt # -r benchmarks/brax/requirements.in @@ -138,15 +138,7 @@ jax[cuda12]==0.4.28 # mujoco-mjx # optax # orbax-checkpoint -jax-cuda12-pjrt==0.4.28 - # via - # -c .pin/../.pin/constraints-hpu-torch.txt - # jax-cuda12-plugin -jax-cuda12-plugin==0.4.28 - # via - # -c .pin/../.pin/constraints-hpu-torch.txt - # jax -jaxlib==0.4.28+cuda12.cudnn89 +jaxlib==0.4.33 # via # -c .pin/../.pin/constraints-hpu-torch.txt # brax @@ -183,7 +175,7 @@ ml-collections==0.1.1 # via # -c .pin/../.pin/constraints-hpu-torch.txt # brax -ml-dtypes==0.4.0 +ml-dtypes==0.5.0 # via # -c .pin/../.pin/constraints-hpu-torch.txt # jax @@ -193,17 +185,17 @@ mpmath==1.3.0 # via # -c .pin/../.pin/constraints-hpu-torch.txt # sympy -msgpack==1.0.8 +msgpack==1.1.0 # via # -c .pin/../.pin/constraints-hpu-torch.txt # flax # orbax-checkpoint -mujoco==3.2.0 +mujoco==3.2.3 # via # -c .pin/../.pin/constraints-hpu-torch.txt # brax # mujoco-mjx -mujoco-mjx==3.2.0 +mujoco-mjx==3.2.3 # via # -c .pin/../.pin/constraints-hpu-torch.txt # brax @@ -221,14 +213,12 @@ numpy==1.26.4 # brax # chex # dm-env - # flax # gym # jax # jaxlib # jaxopt # ml-dtypes # mujoco - # opt-einsum # optax # orbax-checkpoint # scipy @@ -238,19 +228,13 @@ numpy==1.26.4 nvidia-cublas-cu12==12.1.3.1 # via # -c .pin/../.pin/constraints-hpu-torch.txt - # jax # nvidia-cudnn-cu12 # nvidia-cusolver-cu12 # torch nvidia-cuda-cupti-cu12==12.1.105 # via # -c .pin/../.pin/constraints-hpu-torch.txt - # jax # torch -nvidia-cuda-nvcc-cu12==12.5.82 - # via - # -c .pin/../.pin/constraints-hpu-torch.txt - # jax nvidia-cuda-nvrtc-cu12==12.1.105 # via # -c .pin/../.pin/constraints-hpu-torch.txt @@ -258,17 +242,14 @@ nvidia-cuda-nvrtc-cu12==12.1.105 nvidia-cuda-runtime-cu12==12.1.105 # via # -c .pin/../.pin/constraints-hpu-torch.txt - # jax # torch -nvidia-cudnn-cu12==8.9.2.26 +nvidia-cudnn-cu12==9.1.0.70 # via # -c .pin/../.pin/constraints-hpu-torch.txt - # jax # torch nvidia-cufft-cu12==11.0.2.54 # via # -c .pin/../.pin/constraints-hpu-torch.txt - # jax # torch nvidia-curand-cu12==10.3.2.106 # via @@ -277,23 +258,23 @@ nvidia-curand-cu12==10.3.2.106 nvidia-cusolver-cu12==11.4.5.107 # via # -c .pin/../.pin/constraints-hpu-torch.txt - # jax # torch nvidia-cusparse-cu12==12.1.0.106 # via # -c .pin/../.pin/constraints-hpu-torch.txt - # jax # 
nvidia-cusolver-cu12 # torch +nvidia-ml-py==12.560.30 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # voir nvidia-nccl-cu12==2.20.5 # via # -c .pin/../.pin/constraints-hpu-torch.txt - # jax # torch -nvidia-nvjitlink-cu12==12.5.82 +nvidia-nvjitlink-cu12==12.6.77 # via # -c .pin/../.pin/constraints-hpu-torch.txt - # jax # nvidia-cusolver-cu12 # nvidia-cusparse-cu12 nvidia-nvtx-cu12==12.1.105 @@ -304,7 +285,7 @@ omegaconf==2.3.0 # via # -c .pin/../.pin/constraints-hpu-torch.txt # voir -opt-einsum==3.3.0 +opt-einsum==3.4.0 # via # -c .pin/../.pin/constraints-hpu-torch.txt # jax @@ -313,12 +294,12 @@ optax==0.2.3 # -c .pin/../.pin/constraints-hpu-torch.txt # brax # flax -orbax-checkpoint==0.5.21 +orbax-checkpoint==0.6.4 # via # -c .pin/../.pin/constraints-hpu-torch.txt # brax # flax -ovld==0.3.5 +ovld==0.3.9 # via # -c .pin/../.pin/constraints-hpu-torch.txt # voir @@ -330,7 +311,7 @@ pillow==10.4.0 # via # -c .pin/../.pin/constraints-hpu-torch.txt # brax -protobuf==4.25.3 +protobuf==5.28.2 # via # -c .pin/../.pin/constraints-hpu-torch.txt # orbax-checkpoint @@ -347,10 +328,6 @@ pygments==2.18.0 # via # -c .pin/../.pin/constraints-hpu-torch.txt # rich -pynvml==11.5.3 - # via - # -c .pin/../.pin/constraints-hpu-torch.txt - # voir pyopengl==3.1.7 # via # -c .pin/../.pin/constraints-hpu-torch.txt @@ -359,7 +336,7 @@ pytinyrenderer==0.0.14 # via # -c .pin/../.pin/constraints-hpu-torch.txt # brax -pyyaml==6.0.1 +pyyaml==6.0.2 # via # -c .pin/../.pin/constraints-hpu-torch.txt # flax @@ -370,12 +347,12 @@ reactivex==4.0.4 # via # -c .pin/../.pin/constraints-hpu-torch.txt # giving -rich==13.7.1 +rich==13.9.1 # via # -c .pin/../.pin/constraints-hpu-torch.txt # flax # voir -scipy==1.14.0 +scipy==1.14.1 # via # -c .pin/../.pin/constraints-hpu-torch.txt # brax @@ -388,7 +365,7 @@ six==1.16.0 # -c .pin/../.pin/constraints-hpu-torch.txt # asttokens # ml-collections -sympy==1.13.0 +sympy==1.13.3 # via # -c .pin/../.pin/constraints-hpu-torch.txt # torch @@ -396,7 +373,7 @@ tensorboardx==2.6.2.2 # via # -c .pin/../.pin/constraints-hpu-torch.txt # brax -tensorstore==0.1.63 +tensorstore==0.1.66 # via # -c .pin/../.pin/constraints-hpu-torch.txt # flax @@ -405,16 +382,16 @@ toolz==0.12.1 # via # -c .pin/../.pin/constraints-hpu-torch.txt # chex -torch==2.3.1 +torch==2.4.1 # via # -c .pin/../.pin/constraints-hpu-torch.txt # -r benchmarks/brax/requirements.in -trimesh==4.4.3 +trimesh==4.4.9 # via # -c .pin/../.pin/constraints-hpu-torch.txt # brax # mujoco-mjx -triton==2.3.1 +triton==3.0.0 # via # -c .pin/../.pin/constraints-hpu-torch.txt # torch @@ -427,8 +404,9 @@ typing-extensions==4.12.2 # flax # orbax-checkpoint # reactivex + # rich # torch -varname==0.10.0 +varname==0.13.3 # via # -c .pin/../.pin/constraints-hpu-torch.txt # giving @@ -437,11 +415,11 @@ voir==0.2.19 # -c .pin/../.pin/constraints-hpu-torch.txt # -c .pin/../constraints/hpu.txt # -r benchmarks/brax/requirements.in -werkzeug==3.0.3 +werkzeug==3.0.4 # via # -c .pin/../.pin/constraints-hpu-torch.txt # flask -zipp==3.19.2 +zipp==3.20.2 # via # -c .pin/../.pin/constraints-hpu-torch.txt # etils diff --git a/benchmarks/diffusion/requirements.hpu.txt b/benchmarks/diffusion/requirements.hpu.txt new file mode 100644 index 00000000..88ccd569 --- /dev/null +++ b/benchmarks/diffusion/requirements.hpu.txt @@ -0,0 +1,381 @@ +# +# This file is autogenerated by pip-compile with Python 3.10 +# by the following command: +# +# pip-compile --output-file=benchmarks/diffusion/requirements.hpu.txt .pin/tmp-constraints-hpu-diffusion-nodes.txt 
benchmarks/diffusion/requirements.in +# +accelerate==0.34.2 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # -r benchmarks/diffusion/requirements.in + # diffusers +aiohappyeyeballs==2.4.3 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # aiohttp +aiohttp==3.10.8 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # datasets + # fsspec +aiosignal==1.3.1 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # aiohttp +antlr4-python3-runtime==4.9.3 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # omegaconf +argklass==1.4.4 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # -r benchmarks/diffusion/requirements.in +asttokens==2.4.1 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # giving +async-timeout==4.0.3 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # aiohttp +attrs==24.2.0 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # aiohttp +certifi==2024.8.30 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # requests +charset-normalizer==3.3.2 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # requests +codefind==0.1.7 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # ptera +datasets==3.0.1 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # -r benchmarks/diffusion/requirements.in +diffusers[torch]==0.30.3 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # -r benchmarks/diffusion/requirements.in +dill==0.3.8 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # datasets + # multiprocess +executing==2.1.0 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # varname +filelock==3.16.1 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # datasets + # diffusers + # huggingface-hub + # torch + # transformers + # triton +frozenlist==1.4.1 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # aiohttp + # aiosignal +fsspec[http]==2024.6.1 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # datasets + # huggingface-hub + # torch +giving==0.4.3 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # ptera + # voir +hjson==3.1.0 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # argklass +huggingface-hub==0.25.1 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # accelerate + # datasets + # diffusers + # tokenizers + # transformers +idna==3.10 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # requests + # yarl +importlib-metadata==8.5.0 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # diffusers +importlib-resources==6.4.5 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # argklass +jinja2==3.1.4 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # torch +markdown-it-py==3.0.0 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # rich +markupsafe==2.1.5 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # jinja2 +mdurl==0.1.2 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # markdown-it-py +mpmath==1.3.0 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # sympy +multidict==6.1.0 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # aiohttp + # yarl +multiprocess==0.70.16 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # datasets +networkx==3.3 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # torch +numpy==1.26.4 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # accelerate + # datasets + # diffusers + # pandas + # pyarrow + # torchvision + # transformers +nvidia-cublas-cu12==12.1.3.1 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # nvidia-cudnn-cu12 + # nvidia-cusolver-cu12 + # torch 
+nvidia-cuda-cupti-cu12==12.1.105 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # torch +nvidia-cuda-nvrtc-cu12==12.1.105 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # torch +nvidia-cuda-runtime-cu12==12.1.105 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # torch +nvidia-cudnn-cu12==9.1.0.70 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # torch +nvidia-cufft-cu12==11.0.2.54 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # torch +nvidia-curand-cu12==10.3.2.106 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # torch +nvidia-cusolver-cu12==11.4.5.107 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # torch +nvidia-cusparse-cu12==12.1.0.106 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # nvidia-cusolver-cu12 + # torch +nvidia-ml-py==12.560.30 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # voir +nvidia-nccl-cu12==2.20.5 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # torch +nvidia-nvjitlink-cu12==12.6.77 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # nvidia-cusolver-cu12 + # nvidia-cusparse-cu12 +nvidia-nvtx-cu12==12.1.105 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # torch +omegaconf==2.3.0 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # voir +ovld==0.3.9 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # voir +packaging==24.1 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # accelerate + # datasets + # huggingface-hub + # transformers +pandas==2.2.3 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # datasets +pillow==10.4.0 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # diffusers + # torchvision +psutil==5.9.8 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # accelerate + # voir +ptera==1.4.1 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # voir +pyarrow==17.0.0 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # datasets +pygments==2.18.0 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # rich +python-dateutil==2.9.0.post0 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # pandas +pytz==2024.2 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # pandas +pyyaml==6.0.2 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # accelerate + # datasets + # huggingface-hub + # omegaconf + # transformers +reactivex==4.0.4 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # giving +regex==2024.9.11 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # diffusers + # transformers +requests==2.32.3 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # datasets + # diffusers + # huggingface-hub + # transformers +rich==13.9.1 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # voir +safetensors==0.4.5 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # accelerate + # diffusers + # transformers +six==1.16.0 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # asttokens + # python-dateutil +sympy==1.13.3 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # torch +tokenizers==0.19.1 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # transformers +torch==2.4.1 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # accelerate + # diffusers + # torchvision +torchvision==0.19.1 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # -r benchmarks/diffusion/requirements.in +tqdm==4.66.5 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # -r benchmarks/diffusion/requirements.in + # datasets + # huggingface-hub + # transformers +transformers==4.44.2 + # via + 
# -c .pin/../.pin/constraints-hpu-torch.txt + # -c .pin/../constraints/hpu.txt + # -r benchmarks/diffusion/requirements.in +triton==3.0.0 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # torch +typing-extensions==4.12.2 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # huggingface-hub + # multidict + # reactivex + # rich + # torch +tzdata==2024.2 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # pandas +urllib3==2.2.3 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # requests +varname==0.13.3 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # giving +voir==0.2.19 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # -c .pin/../constraints/hpu.txt + # -r benchmarks/diffusion/requirements.in +xxhash==3.5.0 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # datasets +yarl==1.13.1 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # aiohttp +zipp==3.20.2 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # importlib-metadata diff --git a/benchmarks/dinov2/requirements.hpu.txt b/benchmarks/dinov2/requirements.hpu.txt new file mode 100644 index 00000000..4a11ccfb --- /dev/null +++ b/benchmarks/dinov2/requirements.hpu.txt @@ -0,0 +1,267 @@ +# +# This file is autogenerated by pip-compile with Python 3.10 +# by the following command: +# +# pip-compile --output-file=benchmarks/dinov2/requirements.hpu.txt .pin/tmp-constraints-hpu-dinov2-giant-gpus.txt benchmarks/dinov2/requirements.in +# +antlr4-python3-runtime==4.9.3 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # omegaconf +asttokens==2.4.1 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # giving +cloudpickle==3.0.0 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # submitit +codefind==0.1.7 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # ptera +executing==2.1.0 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # varname +filelock==3.16.1 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # torch + # triton +fsspec==2024.6.1 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # torch +fvcore==0.1.5.post20221221 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # -r benchmarks/dinov2/requirements.in +giving==0.4.3 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # ptera + # voir +iopath==0.1.10 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # -r benchmarks/dinov2/requirements.in + # fvcore +jinja2==3.1.4 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # torch +lightning-utilities==0.11.7 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # torchmetrics +markdown-it-py==3.0.0 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # rich +markupsafe==2.1.5 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # jinja2 +mdurl==0.1.2 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # markdown-it-py +mpmath==1.3.0 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # sympy +networkx==3.3 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # torch +numpy==1.26.4 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # fvcore + # scipy + # torchmetrics + # torchvision + # xformers +nvidia-cublas-cu12==12.1.3.1 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # nvidia-cudnn-cu12 + # nvidia-cusolver-cu12 + # torch +nvidia-cuda-cupti-cu12==12.1.105 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # torch +nvidia-cuda-nvrtc-cu12==12.1.105 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # torch +nvidia-cuda-runtime-cu12==12.1.105 + # via + # -c 
.pin/../.pin/constraints-hpu-torch.txt + # torch +nvidia-cudnn-cu12==9.1.0.70 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # torch +nvidia-cufft-cu12==11.0.2.54 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # torch +nvidia-curand-cu12==10.3.2.106 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # torch +nvidia-cusolver-cu12==11.4.5.107 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # torch +nvidia-cusparse-cu12==12.1.0.106 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # nvidia-cusolver-cu12 + # torch +nvidia-ml-py==12.560.30 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # voir +nvidia-nccl-cu12==2.20.5 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # torch +nvidia-nvjitlink-cu12==12.6.77 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # nvidia-cusolver-cu12 + # nvidia-cusparse-cu12 +nvidia-nvtx-cu12==12.1.105 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # torch +omegaconf==2.3.0 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # -r benchmarks/dinov2/requirements.in + # voir +ovld==0.3.9 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # voir +packaging==24.1 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # lightning-utilities + # torchmetrics +pillow==10.4.0 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # fvcore + # torchvision +portalocker==2.10.1 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # iopath +psutil==5.9.8 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # voir +ptera==1.4.1 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # voir +pygments==2.18.0 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # rich +pyyaml==6.0.2 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # fvcore + # omegaconf + # yacs +reactivex==4.0.4 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # giving +rich==13.9.1 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # voir +scipy==1.14.1 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # -r benchmarks/dinov2/requirements.in +six==1.16.0 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # asttokens +submitit==1.5.2 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # -r benchmarks/dinov2/requirements.in +sympy==1.13.3 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # torch +tabulate==0.9.0 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # fvcore +termcolor==2.4.0 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # fvcore +torch==2.4.1 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # -r benchmarks/dinov2/requirements.in + # torchmetrics + # torchvision + # xformers +torchmetrics==1.4.2 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # -r benchmarks/dinov2/requirements.in +torchvision==0.19.1 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # -r benchmarks/dinov2/requirements.in +tqdm==4.66.5 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # fvcore + # iopath +triton==3.0.0 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # torch +typing-extensions==4.12.2 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # iopath + # lightning-utilities + # reactivex + # rich + # submitit + # torch +varname==0.13.3 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # giving +voir==0.2.19 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # -c .pin/../constraints/hpu.txt + # -r benchmarks/dinov2/requirements.in +xformers==0.0.28.post1 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # -r 
benchmarks/dinov2/requirements.in +yacs==0.1.8 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # fvcore + +# The following packages are considered to be unsafe in a requirements file: +# setuptools diff --git a/benchmarks/flops/requirements.hpu.txt b/benchmarks/flops/requirements.hpu.txt index 77595d5f..91e5677f 100644 --- a/benchmarks/flops/requirements.hpu.txt +++ b/benchmarks/flops/requirements.hpu.txt @@ -4,10 +4,6 @@ # # pip-compile --output-file=benchmarks/flops/requirements.hpu.txt .pin/tmp-constraints-hpu-flops.txt benchmarks/flops/requirements.in # ---extra-index-url https://pypi.ngc.nvidia.com ---find-links https://storage.googleapis.com/jax-releases/jax_cuda_releases.html ---trusted-host pypi.ngc.nvidia.com - antlr4-python3-runtime==4.9.3 # via # -c .pin/../.pin/constraints-hpu-torch.txt @@ -16,29 +12,29 @@ asttokens==2.4.1 # via # -c .pin/../.pin/constraints-hpu-torch.txt # giving -codefind==0.1.6 +codefind==0.1.7 # via # -c .pin/../.pin/constraints-hpu-torch.txt # ptera -executing==1.2.0 +executing==2.1.0 # via # -c .pin/../.pin/constraints-hpu-torch.txt # varname -filelock==3.15.4 +filelock==3.16.1 # via # -c .pin/../.pin/constraints-hpu-torch.txt # torch # triton -fsspec==2024.5.0 +fsspec==2024.6.1 # via # -c .pin/../.pin/constraints-hpu-torch.txt # torch -giving==0.4.2 +giving==0.4.3 # via # -c .pin/../.pin/constraints-hpu-torch.txt # ptera # voir -importlib-resources==6.4.0 +importlib-resources==6.4.5 # via # -c .pin/../.pin/constraints-hpu-torch.txt # torchcompat @@ -88,7 +84,7 @@ nvidia-cuda-runtime-cu12==12.1.105 # via # -c .pin/../.pin/constraints-hpu-torch.txt # torch -nvidia-cudnn-cu12==8.9.2.26 +nvidia-cudnn-cu12==9.1.0.70 # via # -c .pin/../.pin/constraints-hpu-torch.txt # torch @@ -109,11 +105,15 @@ nvidia-cusparse-cu12==12.1.0.106 # -c .pin/../.pin/constraints-hpu-torch.txt # nvidia-cusolver-cu12 # torch +nvidia-ml-py==12.560.30 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # voir nvidia-nccl-cu12==2.20.5 # via # -c .pin/../.pin/constraints-hpu-torch.txt # torch -nvidia-nvjitlink-cu12==12.5.82 +nvidia-nvjitlink-cu12==12.6.77 # via # -c .pin/../.pin/constraints-hpu-torch.txt # nvidia-cusolver-cu12 @@ -126,7 +126,7 @@ omegaconf==2.3.0 # via # -c .pin/../.pin/constraints-hpu-torch.txt # voir -ovld==0.3.5 +ovld==0.3.9 # via # -c .pin/../.pin/constraints-hpu-torch.txt # voir @@ -146,11 +146,7 @@ pygments==2.18.0 # via # -c .pin/../.pin/constraints-hpu-torch.txt # rich -pynvml==11.5.3 - # via - # -c .pin/../.pin/constraints-hpu-torch.txt - # voir -pyyaml==6.0.1 +pyyaml==6.0.2 # via # -c .pin/../.pin/constraints-hpu-torch.txt # omegaconf @@ -158,7 +154,7 @@ reactivex==4.0.4 # via # -c .pin/../.pin/constraints-hpu-torch.txt # giving -rich==13.7.1 +rich==13.9.1 # via # -c .pin/../.pin/constraints-hpu-torch.txt # voir @@ -166,11 +162,11 @@ six==1.16.0 # via # -c .pin/../.pin/constraints-hpu-torch.txt # asttokens -sympy==1.13.0 +sympy==1.13.3 # via # -c .pin/../.pin/constraints-hpu-torch.txt # torch -torch==2.3.1 +torch==2.4.1 # via # -c .pin/../.pin/constraints-hpu-torch.txt # -r benchmarks/flops/requirements.in @@ -180,15 +176,15 @@ torchcompat==1.1.4 # -c .pin/../.pin/constraints-hpu-torch.txt # -c .pin/../constraints/hpu.txt # -r benchmarks/flops/requirements.in -torchvision==0.18.1 +torchvision==0.19.1 # via # -c .pin/../.pin/constraints-hpu-torch.txt # -r benchmarks/flops/requirements.in -tqdm==4.66.4 +tqdm==4.66.5 # via # -c .pin/../.pin/constraints-hpu-torch.txt # -r benchmarks/flops/requirements.in -triton==2.3.1 +triton==3.0.0 # via # -c 
.pin/../.pin/constraints-hpu-torch.txt # torch @@ -196,8 +192,9 @@ typing-extensions==4.12.2 # via # -c .pin/../.pin/constraints-hpu-torch.txt # reactivex + # rich # torch -varname==0.10.0 +varname==0.13.3 # via # -c .pin/../.pin/constraints-hpu-torch.txt # giving diff --git a/benchmarks/geo_gnn/requirements-pre.hpu.txt b/benchmarks/geo_gnn/requirements-pre.hpu.txt new file mode 100644 index 00000000..db910c1a --- /dev/null +++ b/benchmarks/geo_gnn/requirements-pre.hpu.txt @@ -0,0 +1,99 @@ +# +# This file is autogenerated by pip-compile with Python 3.10 +# by the following command: +# +# pip-compile --output-file=benchmarks/geo_gnn/requirements-pre.hpu.txt .pin/tmp-constraints-hpu-dimenet.txt benchmarks/geo_gnn/requirements-pre.in +# +filelock==3.16.1 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # torch + # triton +fsspec==2024.6.1 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # torch +jinja2==3.1.4 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # torch +markupsafe==2.1.5 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # jinja2 +mpmath==1.3.0 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # sympy +networkx==3.3 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # torch +nvidia-cublas-cu12==12.1.3.1 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # nvidia-cudnn-cu12 + # nvidia-cusolver-cu12 + # torch +nvidia-cuda-cupti-cu12==12.1.105 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # torch +nvidia-cuda-nvrtc-cu12==12.1.105 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # torch +nvidia-cuda-runtime-cu12==12.1.105 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # torch +nvidia-cudnn-cu12==9.1.0.70 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # torch +nvidia-cufft-cu12==11.0.2.54 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # torch +nvidia-curand-cu12==10.3.2.106 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # torch +nvidia-cusolver-cu12==11.4.5.107 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # torch +nvidia-cusparse-cu12==12.1.0.106 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # nvidia-cusolver-cu12 + # torch +nvidia-nccl-cu12==2.20.5 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # torch +nvidia-nvjitlink-cu12==12.6.77 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # nvidia-cusolver-cu12 + # nvidia-cusparse-cu12 +nvidia-nvtx-cu12==12.1.105 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # torch +sympy==1.13.3 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # torch +torch==2.4.1 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # -r benchmarks/geo_gnn/requirements-pre.in +triton==3.0.0 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # torch +typing-extensions==4.12.2 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # torch diff --git a/benchmarks/geo_gnn/requirements.hpu.txt b/benchmarks/geo_gnn/requirements.hpu.txt new file mode 100644 index 00000000..9c6bb6d6 --- /dev/null +++ b/benchmarks/geo_gnn/requirements.hpu.txt @@ -0,0 +1,321 @@ +# +# This file is autogenerated by pip-compile with Python 3.10 +# by the following command: +# +# pip-compile --output-file=benchmarks/geo_gnn/requirements.hpu.txt .pin/tmp-constraints-hpu-dimenet.txt benchmarks/geo_gnn/requirements-pre.hpu.txt benchmarks/geo_gnn/requirements.in +# +aiohappyeyeballs==2.4.3 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # aiohttp +aiohttp==3.10.8 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # torch-geometric 
+aiosignal==1.3.1 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # aiohttp +antlr4-python3-runtime==4.9.3 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # omegaconf +asttokens==2.4.1 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # giving +async-timeout==4.0.3 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # aiohttp +attrs==24.2.0 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # aiohttp +certifi==2024.8.30 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # requests +charset-normalizer==3.3.2 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # requests +codefind==0.1.7 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # ptera +executing==2.1.0 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # varname +filelock==3.16.1 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # -r benchmarks/geo_gnn/requirements-pre.hpu.txt + # torch + # triton +frozenlist==1.4.1 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # aiohttp + # aiosignal +fsspec==2024.6.1 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # -r benchmarks/geo_gnn/requirements-pre.hpu.txt + # torch + # torch-geometric +giving==0.4.3 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # ptera + # voir +idna==3.10 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # requests + # yarl +jinja2==3.1.4 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # -r benchmarks/geo_gnn/requirements-pre.hpu.txt + # torch + # torch-geometric +markdown-it-py==3.0.0 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # rich +markupsafe==2.1.5 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # -r benchmarks/geo_gnn/requirements-pre.hpu.txt + # jinja2 +mdurl==0.1.2 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # markdown-it-py +mpmath==1.3.0 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # -r benchmarks/geo_gnn/requirements-pre.hpu.txt + # sympy +multidict==6.1.0 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # aiohttp + # yarl +networkx==3.3 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # -r benchmarks/geo_gnn/requirements-pre.hpu.txt + # torch +numpy==1.26.4 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # -r benchmarks/geo_gnn/requirements.in + # pandas + # rdkit + # scipy + # torch-geometric +nvidia-cublas-cu12==12.1.3.1 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # -r benchmarks/geo_gnn/requirements-pre.hpu.txt + # nvidia-cudnn-cu12 + # nvidia-cusolver-cu12 + # torch +nvidia-cuda-cupti-cu12==12.1.105 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # -r benchmarks/geo_gnn/requirements-pre.hpu.txt + # torch +nvidia-cuda-nvrtc-cu12==12.1.105 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # -r benchmarks/geo_gnn/requirements-pre.hpu.txt + # torch +nvidia-cuda-runtime-cu12==12.1.105 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # -r benchmarks/geo_gnn/requirements-pre.hpu.txt + # torch +nvidia-cudnn-cu12==9.1.0.70 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # -r benchmarks/geo_gnn/requirements-pre.hpu.txt + # torch +nvidia-cufft-cu12==11.0.2.54 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # -r benchmarks/geo_gnn/requirements-pre.hpu.txt + # torch +nvidia-curand-cu12==10.3.2.106 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # -r benchmarks/geo_gnn/requirements-pre.hpu.txt + # torch +nvidia-cusolver-cu12==11.4.5.107 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # -r benchmarks/geo_gnn/requirements-pre.hpu.txt + # torch 
+nvidia-cusparse-cu12==12.1.0.106 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # -r benchmarks/geo_gnn/requirements-pre.hpu.txt + # nvidia-cusolver-cu12 + # torch +nvidia-ml-py==12.560.30 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # voir +nvidia-nccl-cu12==2.20.5 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # -r benchmarks/geo_gnn/requirements-pre.hpu.txt + # torch +nvidia-nvjitlink-cu12==12.6.77 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # -r benchmarks/geo_gnn/requirements-pre.hpu.txt + # nvidia-cusolver-cu12 + # nvidia-cusparse-cu12 +nvidia-nvtx-cu12==12.1.105 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # -r benchmarks/geo_gnn/requirements-pre.hpu.txt + # torch +omegaconf==2.3.0 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # voir +ovld==0.3.9 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # voir +pandas==2.2.3 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # -r benchmarks/geo_gnn/requirements.in +pillow==10.4.0 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # rdkit +psutil==5.9.8 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # torch-geometric + # voir +ptera==1.4.1 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # voir +pygments==2.18.0 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # rich +pyparsing==3.1.4 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # torch-geometric +python-dateutil==2.9.0.post0 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # pandas +pytz==2024.2 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # pandas +pyyaml==6.0.2 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # omegaconf +rdkit==2024.3.5 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # -r benchmarks/geo_gnn/requirements.in +reactivex==4.0.4 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # giving +requests==2.32.3 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # torch-geometric +rich==13.9.1 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # voir +scipy==1.14.1 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # torch-cluster + # torch-sparse +six==1.16.0 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # asttokens + # python-dateutil +sympy==1.13.3 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # -r benchmarks/geo_gnn/requirements-pre.hpu.txt + # torch +torch==2.4.1 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # -r benchmarks/geo_gnn/requirements-pre.hpu.txt +torch-cluster==1.6.3 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # -r benchmarks/geo_gnn/requirements.in +torch-geometric==2.6.1 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # -r benchmarks/geo_gnn/requirements.in +torch-scatter==2.1.2 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # -r benchmarks/geo_gnn/requirements.in +torch-sparse==0.6.18 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # -r benchmarks/geo_gnn/requirements.in +tqdm==4.66.5 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # torch-geometric +triton==3.0.0 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # -r benchmarks/geo_gnn/requirements-pre.hpu.txt + # torch +typing-extensions==4.12.2 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # -r benchmarks/geo_gnn/requirements-pre.hpu.txt + # multidict + # reactivex + # rich + # torch +tzdata==2024.2 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # pandas +urllib3==2.2.3 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # requests 
+varname==0.13.3 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # giving +voir==0.2.19 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # -c .pin/../constraints/hpu.txt + # -r benchmarks/geo_gnn/requirements.in +yarl==1.13.1 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # aiohttp diff --git a/benchmarks/huggingface/requirements.hpu.txt b/benchmarks/huggingface/requirements.hpu.txt index a504cba1..b5e21d99 100644 --- a/benchmarks/huggingface/requirements.hpu.txt +++ b/benchmarks/huggingface/requirements.hpu.txt @@ -4,10 +4,6 @@ # # pip-compile --output-file=benchmarks/huggingface/requirements.hpu.txt .pin/tmp-constraints-hpu-hf.txt benchmarks/huggingface/requirements.in # ---extra-index-url https://pypi.ngc.nvidia.com ---find-links https://storage.googleapis.com/jax-releases/jax_cuda_releases.html ---trusted-host pypi.ngc.nvidia.com - antlr4-python3-runtime==4.9.3 # via # -c .pin/../.pin/constraints-hpu-torch.txt @@ -16,7 +12,7 @@ asttokens==2.4.1 # via # -c .pin/../.pin/constraints-hpu-torch.txt # giving -certifi==2024.6.2 +certifi==2024.8.30 # via # -c .pin/../.pin/constraints-hpu-torch.txt # requests @@ -24,37 +20,37 @@ charset-normalizer==3.3.2 # via # -c .pin/../.pin/constraints-hpu-torch.txt # requests -codefind==0.1.6 +codefind==0.1.7 # via # -c .pin/../.pin/constraints-hpu-torch.txt # ptera -executing==1.2.0 +executing==2.1.0 # via # -c .pin/../.pin/constraints-hpu-torch.txt # varname -filelock==3.15.4 +filelock==3.16.1 # via # -c .pin/../.pin/constraints-hpu-torch.txt # huggingface-hub # torch # transformers # triton -fsspec==2024.5.0 +fsspec==2024.6.1 # via # -c .pin/../.pin/constraints-hpu-torch.txt # huggingface-hub # torch -giving==0.4.2 +giving==0.4.3 # via # -c .pin/../.pin/constraints-hpu-torch.txt # ptera # voir -huggingface-hub==0.24.0 +huggingface-hub==0.25.1 # via # -c .pin/../.pin/constraints-hpu-torch.txt # tokenizers # transformers -idna==3.7 +idna==3.10 # via # -c .pin/../.pin/constraints-hpu-torch.txt # requests @@ -104,7 +100,7 @@ nvidia-cuda-runtime-cu12==12.1.105 # via # -c .pin/../.pin/constraints-hpu-torch.txt # torch -nvidia-cudnn-cu12==8.9.2.26 +nvidia-cudnn-cu12==9.1.0.70 # via # -c .pin/../.pin/constraints-hpu-torch.txt # torch @@ -125,11 +121,15 @@ nvidia-cusparse-cu12==12.1.0.106 # -c .pin/../.pin/constraints-hpu-torch.txt # nvidia-cusolver-cu12 # torch +nvidia-ml-py==12.560.30 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # voir nvidia-nccl-cu12==2.20.5 # via # -c .pin/../.pin/constraints-hpu-torch.txt # torch -nvidia-nvjitlink-cu12==12.5.82 +nvidia-nvjitlink-cu12==12.6.77 # via # -c .pin/../.pin/constraints-hpu-torch.txt # nvidia-cusolver-cu12 @@ -142,7 +142,7 @@ omegaconf==2.3.0 # via # -c .pin/../.pin/constraints-hpu-torch.txt # voir -ovld==0.3.5 +ovld==0.3.9 # via # -c .pin/../.pin/constraints-hpu-torch.txt # voir @@ -151,6 +151,10 @@ packaging==24.1 # -c .pin/../.pin/constraints-hpu-torch.txt # huggingface-hub # transformers +pillow==10.4.0 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # -r benchmarks/huggingface/requirements.in psutil==5.9.8 # via # -c .pin/../.pin/constraints-hpu-torch.txt @@ -163,11 +167,7 @@ pygments==2.18.0 # via # -c .pin/../.pin/constraints-hpu-torch.txt # rich -pynvml==11.5.3 - # via - # -c .pin/../.pin/constraints-hpu-torch.txt - # voir -pyyaml==6.0.1 +pyyaml==6.0.2 # via # -c .pin/../.pin/constraints-hpu-torch.txt # huggingface-hub @@ -177,7 +177,7 @@ reactivex==4.0.4 # via # -c .pin/../.pin/constraints-hpu-torch.txt # giving -regex==2024.5.15 +regex==2024.9.11 # via # -c 
.pin/../.pin/constraints-hpu-torch.txt
     #   transformers
@@ -186,11 +186,11 @@ requests==2.32.3
     # via
     #   -c .pin/../.pin/constraints-hpu-torch.txt
     #   huggingface-hub
     #   transformers
-rich==13.7.1
+rich==13.9.1
     # via
     #   -c .pin/../.pin/constraints-hpu-torch.txt
     #   voir
-safetensors==0.4.3
+safetensors==0.4.5
     # via
     #   -c .pin/../.pin/constraints-hpu-torch.txt
     #   transformers
@@ -198,7 +198,7 @@ six==1.16.0
     # via
     #   -c .pin/../.pin/constraints-hpu-torch.txt
     #   asttokens
-sympy==1.13.0
+sympy==1.13.3
     # via
     #   -c .pin/../.pin/constraints-hpu-torch.txt
     #   torch
@@ -206,20 +206,21 @@ tokenizers==0.19.1
     # via
     #   -c .pin/../.pin/constraints-hpu-torch.txt
     #   transformers
-torch==2.3.1
+torch==2.4.1
     # via
     #   -c .pin/../.pin/constraints-hpu-torch.txt
     #   -r benchmarks/huggingface/requirements.in
-tqdm==4.66.4
+tqdm==4.66.5
     # via
     #   -c .pin/../.pin/constraints-hpu-torch.txt
     #   huggingface-hub
     #   transformers
-transformers==4.42.4
+transformers==4.44.2
     # via
     #   -c .pin/../.pin/constraints-hpu-torch.txt
+    #   -c .pin/../constraints/hpu.txt
     #   -r benchmarks/huggingface/requirements.in
-triton==2.3.1
+triton==3.0.0
     # via
     #   -c .pin/../.pin/constraints-hpu-torch.txt
     #   torch
@@ -228,12 +229,13 @@ typing-extensions==4.12.2
     # via
     #   -c .pin/../.pin/constraints-hpu-torch.txt
     #   huggingface-hub
     #   reactivex
+    #   rich
     #   torch
-urllib3==1.26.19
+urllib3==2.2.3
     # via
     #   -c .pin/../.pin/constraints-hpu-torch.txt
     #   requests
-varname==0.10.0
+varname==0.13.3
     # via
     #   -c .pin/../.pin/constraints-hpu-torch.txt
     #   giving
diff --git a/benchmarks/lightning/main.py b/benchmarks/lightning/main.py
index aca89ee4..4c3d1206 100644
--- a/benchmarks/lightning/main.py
+++ b/benchmarks/lightning/main.py
@@ -1,14 +1,17 @@
 #!/usr/bin/env python
+
 import argparse
 import os
 
+# FIXME: HPU only; lazy mode (1) for single-process runs, eager (0) when a distributed launcher sets WORLD_SIZE
+os.environ["PT_HPU_LAZY_MODE"] = str(int(int(os.getenv("WORLD_SIZE", -1)) <= 0))
+
 import torch
 import torch.nn.functional as F
 import lightning as L
 import torchvision.models as torchvision_models
-import torchcompat.core as accelerator
 
 from benchmate.dataloader import imagenet_dataloader, dataloader_arguments
 
@@ -37,7 +40,7 @@ def configure_optimizers(self):
 def prepare_voir():
     from benchmate.observer import BenchObserver
     from benchmate.monitor import bench_monitor
-    
+    import torchcompat.core as accelerator
     observer = BenchObserver(
         accelerator.Event,
         earlystop=100,
@@ -49,6 +52,10 @@ def prepare_voir():
     return observer, bench_monitor
 
 def main():
+    rank = int(os.getenv("RANK", 0))
+    world_size = int(os.getenv("WORLD_SIZE", 1))
+    local_world_size = int(os.getenv("LOCAL_WORLD_SIZE", 1))
+
     parser = argparse.ArgumentParser(description='simple distributed training job')
     parser.add_argument(
         "--epochs",
@@ -64,11 +71,10 @@ def main():
     args = parser.parse_args()
 
     model = getattr(torchvision_models, args.model)()
-    rank = int(os.getenv("RANK", 0))
-    world_size = int(os.getenv("WORLD_SIZE", 1))
-    local_world_size = int(os.getenv("LOCAL_WORLD_SIZE", 1))
-
+    import torchcompat.core as accelerator
+
+    # the launcher-provided LOCAL_WORLD_SIZE is authoritative here, not accelerator.device_count()
+    n = local_world_size
     nnodes = world_size // local_world_size
 
     model = TorchvisionLightning(model)
@@ -83,9 +89,9 @@ def main():
         accelerator="auto",
         devices=n,
         num_nodes=nnodes,
-        strategy="ddp",
+        strategy="auto",
         max_epochs=args.epochs,
-        precision="16-mixed",
+        precision="bf16-mixed",
         enable_checkpointing=False,
         enable_progress_bar=False,
         reload_dataloaders_every_n_epochs=1,
diff --git a/benchmarks/lightning/requirements.hpu.txt b/benchmarks/lightning/requirements.hpu.txt
new file mode 100644
index 00000000..f86fb064
--- /dev/null
+++ 
b/benchmarks/lightning/requirements.hpu.txt @@ -0,0 +1,285 @@ +# +# This file is autogenerated by pip-compile with Python 3.10 +# by the following command: +# +# pip-compile --output-file=benchmarks/lightning/requirements.hpu.txt .pin/tmp-constraints-hpu-lightning-gpus.txt benchmarks/lightning/requirements.in +# +aiohappyeyeballs==2.4.3 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # aiohttp +aiohttp==3.10.8 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # fsspec +aiosignal==1.3.1 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # aiohttp +antlr4-python3-runtime==4.9.3 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # omegaconf +asttokens==2.4.1 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # giving +async-timeout==4.0.3 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # aiohttp +attrs==24.2.0 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # aiohttp +codefind==0.1.7 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # ptera +executing==2.1.0 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # varname +filelock==3.16.1 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # torch + # triton +frozenlist==1.4.1 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # aiohttp + # aiosignal +fsspec[http]==2024.6.1 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # lightning + # pytorch-lightning + # torch +giving==0.4.3 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # ptera + # voir +idna==3.10 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # yarl +importlib-resources==6.4.5 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # torchcompat +jinja2==3.1.4 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # torch +lightning==2.4.0 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # -r benchmarks/lightning/requirements.in +lightning-utilities==0.11.7 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # lightning + # pytorch-lightning + # torchmetrics +markdown-it-py==3.0.0 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # rich +markupsafe==2.1.5 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # jinja2 +mdurl==0.1.2 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # markdown-it-py +mpmath==1.3.0 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # sympy +multidict==6.1.0 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # aiohttp + # yarl +networkx==3.3 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # torch +numpy==1.26.4 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # torchmetrics + # torchvision +nvidia-cublas-cu12==12.1.3.1 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # nvidia-cudnn-cu12 + # nvidia-cusolver-cu12 + # torch +nvidia-cuda-cupti-cu12==12.1.105 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # torch +nvidia-cuda-nvrtc-cu12==12.1.105 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # torch +nvidia-cuda-runtime-cu12==12.1.105 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # torch +nvidia-cudnn-cu12==9.1.0.70 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # torch +nvidia-cufft-cu12==11.0.2.54 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # torch +nvidia-curand-cu12==10.3.2.106 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # torch +nvidia-cusolver-cu12==11.4.5.107 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # torch +nvidia-cusparse-cu12==12.1.0.106 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # nvidia-cusolver-cu12 + # torch 
+nvidia-ml-py==12.560.30 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # voir +nvidia-nccl-cu12==2.20.5 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # torch +nvidia-nvjitlink-cu12==12.6.77 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # nvidia-cusolver-cu12 + # nvidia-cusparse-cu12 +nvidia-nvtx-cu12==12.1.105 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # torch +omegaconf==2.3.0 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # voir +ovld==0.3.9 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # voir +packaging==24.1 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # lightning + # lightning-utilities + # pytorch-lightning + # torchmetrics +pillow==10.4.0 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # torchvision +psutil==5.9.8 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # voir +ptera==1.4.1 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # voir +pygments==2.18.0 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # rich +pytorch-lightning==2.4.0 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # lightning +pyyaml==6.0.2 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # lightning + # omegaconf + # pytorch-lightning +reactivex==4.0.4 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # giving +rich==13.9.1 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # voir +six==1.16.0 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # asttokens +sympy==1.13.3 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # torch +torch==2.4.1 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # -r benchmarks/lightning/requirements.in + # lightning + # pytorch-lightning + # torchmetrics + # torchvision +torchcompat==1.1.4 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # -c .pin/../constraints/hpu.txt + # -r benchmarks/lightning/requirements.in +torchmetrics==1.4.2 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # lightning + # pytorch-lightning +torchvision==0.19.1 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # -r benchmarks/lightning/requirements.in +tqdm==4.66.5 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # lightning + # pytorch-lightning +triton==3.0.0 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # torch +typing-extensions==4.12.2 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # lightning + # lightning-utilities + # multidict + # pytorch-lightning + # reactivex + # rich + # torch +varname==0.13.3 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # giving +voir==0.2.19 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # -c .pin/../constraints/hpu.txt + # -r benchmarks/lightning/requirements.in +yarl==1.13.1 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # aiohttp + +# The following packages are considered to be unsafe in a requirements file: +# setuptools diff --git a/benchmarks/llama/requirements.hpu.txt b/benchmarks/llama/requirements.hpu.txt index 2368c150..9c01a4dd 100644 --- a/benchmarks/llama/requirements.hpu.txt +++ b/benchmarks/llama/requirements.hpu.txt @@ -4,11 +4,11 @@ # # pip-compile --output-file=benchmarks/llama/requirements.hpu.txt .pin/tmp-constraints-hpu-llm.txt benchmarks/llama/requirements.in # ---extra-index-url https://pypi.ngc.nvidia.com ---find-links https://storage.googleapis.com/jax-releases/jax_cuda_releases.html ---trusted-host pypi.ngc.nvidia.com - -aiohttp==3.9.5 +aiohappyeyeballs==2.4.3 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # aiohttp 
+aiohttp==3.10.8 # via # -c .pin/../.pin/constraints-hpu-torch.txt # datasets @@ -29,11 +29,11 @@ async-timeout==4.0.3 # via # -c .pin/../.pin/constraints-hpu-torch.txt # aiohttp -attrs==23.2.0 +attrs==24.2.0 # via # -c .pin/../.pin/constraints-hpu-torch.txt # aiohttp -certifi==2024.6.2 +certifi==2024.8.30 # via # -c .pin/../.pin/constraints-hpu-torch.txt # requests @@ -41,11 +41,11 @@ charset-normalizer==3.3.2 # via # -c .pin/../.pin/constraints-hpu-torch.txt # requests -codefind==0.1.6 +codefind==0.1.7 # via # -c .pin/../.pin/constraints-hpu-torch.txt # ptera -datasets==2.20.0 +datasets==3.0.1 # via # -c .pin/../.pin/constraints-hpu-torch.txt # -r benchmarks/llama/requirements.in @@ -54,7 +54,7 @@ dill==0.3.8 # -c .pin/../.pin/constraints-hpu-torch.txt # datasets # multiprocess -executing==1.2.0 +executing==2.1.0 # via # -c .pin/../.pin/constraints-hpu-torch.txt # varname @@ -62,7 +62,7 @@ fairscale==0.4.13 # via # -c .pin/../.pin/constraints-hpu-torch.txt # -r benchmarks/llama/requirements.in -filelock==3.15.4 +filelock==3.16.1 # via # -c .pin/../.pin/constraints-hpu-torch.txt # datasets @@ -70,7 +70,7 @@ filelock==3.15.4 # torch # transformers # triton -fire==0.6.0 +fire==0.7.0 # via # -c .pin/../.pin/constraints-hpu-torch.txt # -r benchmarks/llama/requirements.in @@ -79,24 +79,24 @@ frozenlist==1.4.1 # -c .pin/../.pin/constraints-hpu-torch.txt # aiohttp # aiosignal -fsspec[http]==2024.5.0 +fsspec[http]==2024.6.1 # via # -c .pin/../.pin/constraints-hpu-torch.txt # datasets # huggingface-hub # torch -giving==0.4.2 +giving==0.4.3 # via # -c .pin/../.pin/constraints-hpu-torch.txt # ptera # voir -huggingface-hub==0.24.0 +huggingface-hub==0.25.1 # via # -c .pin/../.pin/constraints-hpu-torch.txt # datasets # tokenizers # transformers -idna==3.7 +idna==3.10 # via # -c .pin/../.pin/constraints-hpu-torch.txt # requests @@ -121,7 +121,7 @@ mpmath==1.3.0 # via # -c .pin/../.pin/constraints-hpu-torch.txt # sympy -multidict==6.0.5 +multidict==6.1.0 # via # -c .pin/../.pin/constraints-hpu-torch.txt # aiohttp @@ -160,7 +160,7 @@ nvidia-cuda-runtime-cu12==12.1.105 # via # -c .pin/../.pin/constraints-hpu-torch.txt # torch -nvidia-cudnn-cu12==8.9.2.26 +nvidia-cudnn-cu12==9.1.0.70 # via # -c .pin/../.pin/constraints-hpu-torch.txt # torch @@ -181,11 +181,15 @@ nvidia-cusparse-cu12==12.1.0.106 # -c .pin/../.pin/constraints-hpu-torch.txt # nvidia-cusolver-cu12 # torch +nvidia-ml-py==12.560.30 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # voir nvidia-nccl-cu12==2.20.5 # via # -c .pin/../.pin/constraints-hpu-torch.txt # torch -nvidia-nvjitlink-cu12==12.5.82 +nvidia-nvjitlink-cu12==12.6.77 # via # -c .pin/../.pin/constraints-hpu-torch.txt # nvidia-cusolver-cu12 @@ -198,7 +202,7 @@ omegaconf==2.3.0 # via # -c .pin/../.pin/constraints-hpu-torch.txt # voir -ovld==0.3.5 +ovld==0.3.9 # via # -c .pin/../.pin/constraints-hpu-torch.txt # voir @@ -208,7 +212,7 @@ packaging==24.1 # datasets # huggingface-hub # transformers -pandas==2.2.2 +pandas==2.2.3 # via # -c .pin/../.pin/constraints-hpu-torch.txt # datasets @@ -224,27 +228,19 @@ pyarrow==17.0.0 # via # -c .pin/../.pin/constraints-hpu-torch.txt # datasets -pyarrow-hotfix==0.6 - # via - # -c .pin/../.pin/constraints-hpu-torch.txt - # datasets pygments==2.18.0 # via # -c .pin/../.pin/constraints-hpu-torch.txt # rich -pynvml==11.5.3 - # via - # -c .pin/../.pin/constraints-hpu-torch.txt - # voir python-dateutil==2.9.0.post0 # via # -c .pin/../.pin/constraints-hpu-torch.txt # pandas -pytz==2024.1 +pytz==2024.2 # via # -c .pin/../.pin/constraints-hpu-torch.txt 
# pandas -pyyaml==6.0.1 +pyyaml==6.0.2 # via # -c .pin/../.pin/constraints-hpu-torch.txt # datasets @@ -255,7 +251,7 @@ reactivex==4.0.4 # via # -c .pin/../.pin/constraints-hpu-torch.txt # giving -regex==2024.5.15 +regex==2024.9.11 # via # -c .pin/../.pin/constraints-hpu-torch.txt # transformers @@ -265,11 +261,11 @@ requests==2.32.3 # datasets # huggingface-hub # transformers -rich==13.7.1 +rich==13.9.1 # via # -c .pin/../.pin/constraints-hpu-torch.txt # voir -safetensors==0.4.3 +safetensors==0.4.5 # via # -c .pin/../.pin/constraints-hpu-torch.txt # transformers @@ -281,9 +277,8 @@ six==1.16.0 # via # -c .pin/../.pin/constraints-hpu-torch.txt # asttokens - # fire # python-dateutil -sympy==1.13.0 +sympy==1.13.3 # via # -c .pin/../.pin/constraints-hpu-torch.txt # torch @@ -295,22 +290,23 @@ tokenizers==0.19.1 # via # -c .pin/../.pin/constraints-hpu-torch.txt # transformers -torch==2.3.1 +torch==2.4.1 # via # -c .pin/../.pin/constraints-hpu-torch.txt # -r benchmarks/llama/requirements.in # fairscale -tqdm==4.66.4 +tqdm==4.66.5 # via # -c .pin/../.pin/constraints-hpu-torch.txt # datasets # huggingface-hub # transformers -transformers==4.42.4 +transformers==4.44.2 # via # -c .pin/../.pin/constraints-hpu-torch.txt + # -c .pin/../constraints/hpu.txt # -r benchmarks/llama/requirements.in -triton==2.3.1 +triton==3.0.0 # via # -c .pin/../.pin/constraints-hpu-torch.txt # torch @@ -318,17 +314,19 @@ typing-extensions==4.12.2 # via # -c .pin/../.pin/constraints-hpu-torch.txt # huggingface-hub + # multidict # reactivex + # rich # torch -tzdata==2024.1 +tzdata==2024.2 # via # -c .pin/../.pin/constraints-hpu-torch.txt # pandas -urllib3==1.26.19 +urllib3==2.2.3 # via # -c .pin/../.pin/constraints-hpu-torch.txt # requests -varname==0.10.0 +varname==0.13.3 # via # -c .pin/../.pin/constraints-hpu-torch.txt # giving @@ -337,11 +335,11 @@ voir==0.2.19 # -c .pin/../.pin/constraints-hpu-torch.txt # -c .pin/../constraints/hpu.txt # -r benchmarks/llama/requirements.in -xxhash==3.4.1 +xxhash==3.5.0 # via # -c .pin/../.pin/constraints-hpu-torch.txt # datasets -yarl==1.9.4 +yarl==1.13.1 # via # -c .pin/../.pin/constraints-hpu-torch.txt # aiohttp diff --git a/benchmarks/llava/benchfile.py b/benchmarks/llava/benchfile.py index 3bc06eaa..d6d40d6e 100644 --- a/benchmarks/llava/benchfile.py +++ b/benchmarks/llava/benchfile.py @@ -19,7 +19,9 @@ class Llava(Package): def make_env(self): # Return a dict of environment variables for prepare_script and # main_script. 
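# A sketch of what the make_env override below accomplishes (a hedged
# illustration, not part of the upstream hunk; it assumes the documented Gaudi
# semantics where PT_HPU_LAZY_MODE="1", the default, selects lazy graph mode
# and "0" selects eager execution, and that the Habana PyTorch bridge reads
# the variable at import time, so it must be in the child's environment
# before launch):
import os
import subprocess

env = dict(os.environ)
env["PT_HPU_LAZY_MODE"] = "0"  # eager execution; "1" would keep lazy mode
subprocess.run(["python", "benchmarks/llava/main.py"], env=env, check=True)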
-        return super().make_env()
+        env = super().make_env()
+        env["PT_HPU_LAZY_MODE"] = "0"
+        return env
 
     async def install(self):
         await super().install()  # super() call installs the requirements
diff --git a/benchmarks/llava/main.py b/benchmarks/llava/main.py
index 879baca0..233ae2eb 100755
--- a/benchmarks/llava/main.py
+++ b/benchmarks/llava/main.py
@@ -1,7 +1,6 @@
 #!/usr/bin/env python
 
 from dataclasses import dataclass
-
 import torch
 from accelerate import Accelerator
 from accelerate.utils import set_seed
@@ -90,8 +89,10 @@ def batch_size_fn(batch):
     optimizer = observer.optimizer(torch.optim.AdamW(model.parameters(), lr=5e-5))
     model, optimizer, dataloader = accelerator.prepare(model, optimizer, dataloader)
 
+    # model = torch.compile(model, backend="hpu_backend")
+
     for epoch in range(args.epochs):
         for i, batch in enumerate(observer.iterate(dataloader)):
             images = batch["images"][0]  # Access the first item in the list of images
             texts = batch["texts"]
             prompt = apply_chat_template(texts)
@@ -124,7 +125,9 @@ def batch_size_fn(batch):
 
             if accelerator.sync_gradients:
                 accelerator.clip_grad_norm_(model.parameters(), 1.0)
+            compat.mark_step()
             optimizer.step()
+            compat.mark_step()
             optimizer.zero_grad()
 
             observer.record_loss(loss)
diff --git a/benchmarks/llava/requirements.hpu.txt b/benchmarks/llava/requirements.hpu.txt
new file mode 100644
index 00000000..3bd40dff
--- /dev/null
+++ b/benchmarks/llava/requirements.hpu.txt
@@ -0,0 +1,343 @@
+#
+# This file is autogenerated by pip-compile with Python 3.10
+# by the following command:
+#
+#    pip-compile --output-file=benchmarks/llava/requirements.hpu.txt .pin/tmp-constraints-hpu-llava-single.txt benchmarks/llava/requirements.in
+#
+accelerate==0.34.2
+    # via
+    #   -c .pin/../.pin/constraints-hpu-torch.txt
+    #   -r benchmarks/llava/requirements.in
+aiohappyeyeballs==2.4.3
+    # via
+    #   -c .pin/../.pin/constraints-hpu-torch.txt
+    #   aiohttp
+aiohttp==3.10.8
+    # via
+    #   -c .pin/../.pin/constraints-hpu-torch.txt
+    #   datasets
+    #   fsspec
+aiosignal==1.3.1
+    # via
+    #   -c .pin/../.pin/constraints-hpu-torch.txt
+    #   aiohttp
+antlr4-python3-runtime==4.9.3
+    # via
+    #   -c .pin/../.pin/constraints-hpu-torch.txt
+    #   omegaconf
+asttokens==2.4.1
+    # via
+    #   -c .pin/../.pin/constraints-hpu-torch.txt
+    #   giving
+async-timeout==4.0.3
+    # via
+    #   -c .pin/../.pin/constraints-hpu-torch.txt
+    #   aiohttp
+attrs==24.2.0
+    # via
+    #   -c .pin/../.pin/constraints-hpu-torch.txt
+    #   aiohttp
+certifi==2024.8.30
+    # via
+    #   -c .pin/../.pin/constraints-hpu-torch.txt
+    #   requests
+charset-normalizer==3.3.2
+    # via
+    #   -c .pin/../.pin/constraints-hpu-torch.txt
+    #   requests
+codefind==0.1.7
+    # via
+    #   -c .pin/../.pin/constraints-hpu-torch.txt
+    #   ptera
+datasets==3.0.1
+    # via
+    #   -c .pin/../.pin/constraints-hpu-torch.txt
+    #   -r benchmarks/llava/requirements.in
+dill==0.3.8
+    # via
+    #   -c .pin/../.pin/constraints-hpu-torch.txt
+    #   datasets
+    #   multiprocess
+executing==2.1.0
+    # via
+    #   -c .pin/../.pin/constraints-hpu-torch.txt
+    #   varname
+filelock==3.16.1
+    # via
+    #   -c .pin/../.pin/constraints-hpu-torch.txt
+    #   datasets
+    #   huggingface-hub
+    #   torch
+    #   transformers
+    #   triton
+frozenlist==1.4.1
+    # via
+    #   -c .pin/../.pin/constraints-hpu-torch.txt
+    #   aiohttp
+    #   aiosignal
+fsspec[http]==2024.6.1
+    # via
+    #   -c .pin/../.pin/constraints-hpu-torch.txt
+    #   datasets
+    #   huggingface-hub
+    #   torch
+giving==0.4.3
+    # via
+    #   -c .pin/../.pin/constraints-hpu-torch.txt
+    #   ptera
+    #   voir
+huggingface-hub==0.25.1
+    # via
+    #   -c .pin/../.pin/constraints-hpu-torch.txt
+    #   accelerate
+    #   datasets
+    #   
tokenizers + # transformers +idna==3.10 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # requests + # yarl +jinja2==3.1.4 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # torch +markdown-it-py==3.0.0 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # rich +markupsafe==2.1.5 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # jinja2 +mdurl==0.1.2 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # markdown-it-py +mpmath==1.3.0 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # sympy +multidict==6.1.0 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # aiohttp + # yarl +multiprocess==0.70.16 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # datasets +networkx==3.3 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # torch +numpy==1.26.4 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # -r benchmarks/llava/requirements.in + # accelerate + # datasets + # pandas + # pyarrow + # transformers +nvidia-cublas-cu12==12.1.3.1 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # nvidia-cudnn-cu12 + # nvidia-cusolver-cu12 + # torch +nvidia-cuda-cupti-cu12==12.1.105 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # torch +nvidia-cuda-nvrtc-cu12==12.1.105 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # torch +nvidia-cuda-runtime-cu12==12.1.105 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # torch +nvidia-cudnn-cu12==9.1.0.70 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # torch +nvidia-cufft-cu12==11.0.2.54 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # torch +nvidia-curand-cu12==10.3.2.106 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # torch +nvidia-cusolver-cu12==11.4.5.107 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # torch +nvidia-cusparse-cu12==12.1.0.106 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # nvidia-cusolver-cu12 + # torch +nvidia-ml-py==12.560.30 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # voir +nvidia-nccl-cu12==2.20.5 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # torch +nvidia-nvjitlink-cu12==12.6.77 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # nvidia-cusolver-cu12 + # nvidia-cusparse-cu12 +nvidia-nvtx-cu12==12.1.105 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # torch +omegaconf==2.3.0 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # voir +ovld==0.3.9 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # voir +packaging==24.1 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # accelerate + # datasets + # huggingface-hub + # transformers +pandas==2.2.3 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # datasets +pillow==10.4.0 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # -r benchmarks/llava/requirements.in +psutil==5.9.8 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # accelerate + # voir +ptera==1.4.1 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # voir +pyarrow==17.0.0 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # datasets +pygments==2.18.0 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # rich +python-dateutil==2.9.0.post0 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # pandas +pytz==2024.2 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # pandas +pyyaml==6.0.2 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # accelerate + # datasets + # huggingface-hub + # omegaconf + # transformers +reactivex==4.0.4 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # giving 
+regex==2024.9.11 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # transformers +requests==2.32.3 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # datasets + # huggingface-hub + # transformers +rich==13.9.1 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # voir +safetensors==0.4.5 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # accelerate + # transformers +six==1.16.0 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # asttokens + # python-dateutil +sympy==1.13.3 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # torch +tokenizers==0.19.1 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # transformers +torch==2.4.1 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # -r benchmarks/llava/requirements.in + # accelerate +tqdm==4.66.5 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # datasets + # huggingface-hub + # transformers +transformers==4.44.2 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # -c .pin/../constraints/hpu.txt + # -r benchmarks/llava/requirements.in +triton==3.0.0 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # torch +typing-extensions==4.12.2 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # huggingface-hub + # multidict + # reactivex + # rich + # torch +tzdata==2024.2 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # pandas +urllib3==2.2.3 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # requests +varname==0.13.3 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # giving +voir==0.2.19 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # -c .pin/../constraints/hpu.txt + # -r benchmarks/llava/requirements.in +xxhash==3.5.0 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # datasets +yarl==1.13.1 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # aiohttp diff --git a/benchmarks/llm/configs/llama3_70B_full.yaml b/benchmarks/llm/configs/llama3_70B_full.yaml index ae5cf2af..22b52b79 100644 --- a/benchmarks/llm/configs/llama3_70B_full.yaml +++ b/benchmarks/llm/configs/llama3_70B_full.yaml @@ -82,7 +82,7 @@ optimizer: foreach: False # Note: highly recommended to use fused=True optimizer flag # with CPU offload for faster optimizer step. 
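# A sketch of the reasoning behind the flag flip in the hunk below (the patch
# itself does not state it; presumably the fused AdamW kernel only exists for
# some backends, so a device-agnostic benchmark cannot rely on it). Every name
# here is illustrative; the probe falls back to the unfused path on failure:
import torch

params = [torch.nn.Parameter(torch.zeros(8))]
for p in params:
    p.grad = torch.zeros_like(p)
try:
    optim = torch.optim.AdamW(params, lr=2e-5, foreach=False, fused=True)
    optim.step()  # unsupported devices raise at construction or on first step
except RuntimeError:
    optim = torch.optim.AdamW(params, lr=2e-5, foreach=False, fused=False)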
-  fused: True
+  fused: False
 
 loss:
   _component_: torch.nn.CrossEntropyLoss
@@ -94,9 +94,9 @@ gradient_accumulation_steps: 1
 device: cuda
 
 # Memory management
-enable_activation_checkpointing: True
-memory_efficient_fsdp_wrap: True
-fsdp_cpu_offload: True
+enable_activation_checkpointing: False
+memory_efficient_fsdp_wrap: False
+fsdp_cpu_offload: False
 
 # Reduced precision
 dtype: bf16
diff --git a/benchmarks/llm/recipes/full_finetune_distributed.py b/benchmarks/llm/recipes/full_finetune_distributed.py
index 3a51842d..19556ec7 100755
--- a/benchmarks/llm/recipes/full_finetune_distributed.py
+++ b/benchmarks/llm/recipes/full_finetune_distributed.py
@@ -16,6 +16,7 @@
 
 import torch
 from omegaconf import DictConfig, ListConfig
+import torchcompat.core as acc
 from torch import nn
 from torch.distributed import init_process_group
 from torch.distributed.fsdp import (
@@ -38,6 +39,8 @@
 
 log = utils.get_logger("DEBUG")
 
+HPU_UNSUPPORTED = False
+
 
 class FullFinetuneRecipeDistributed(FTRecipeInterface):
     """
@@ -98,7 +101,8 @@ class FullFinetuneRecipeDistributed(FTRecipeInterface):
 
     def __init__(self, cfg: DictConfig) -> None:
 
-        self._device = utils.get_device(device=cfg.device)
+        import os
+        self._device = acc.fetch_device(int(os.getenv("LOCAL_RANK", "0")))
         self._dtype = utils.get_dtype(cfg.dtype, device=self._device)
 
         if self._dtype == torch.float16:
@@ -131,7 +135,10 @@ def __init__(self, cfg: DictConfig) -> None:
 
         # These are public properties which are updated by the checkpoint loader
        # when ``resume_from_checkpoint`` is `True` or validated in tests
-        self.seed = utils.set_seed(seed=cfg.seed)
+        if HPU_UNSUPPORTED:
+            self.seed = utils.set_seed(seed=cfg.seed)
+        else:
+            self.seed = 1
         self.epochs_run = 0
         self.total_epochs = cfg.epochs
         self.max_steps_per_epoch = cfg.max_steps_per_epoch
@@ -351,8 +358,10 @@ def _setup_model(
         )
 
         if self._is_rank_zero:
-            memory_stats = utils.get_memory_stats(device=self._device)
-            utils.log_memory_stats(memory_stats)
+            if HPU_UNSUPPORTED:
+                pass
+                # memory_stats = utils.get_memory_stats(device=self._device)
+                # utils.log_memory_stats(memory_stats)
 
         # synchronize before training begins
         torch.distributed.barrier()
@@ -413,6 +422,7 @@ def _setup_data(
             dataset=ds,
             batch_size=batch_size,
             sampler=sampler,
+            # persistent_workers=True,
             collate_fn=partial(
                 utils.padded_collate,
                 padding_idx=self._tokenizer.pad_id,
@@ -543,31 +553,13 @@ def train(self) -> None:
                         f"{curr_epoch+1}|{self.global_step}|Loss: {loss_to_log}"
                     )
 
-                    # Log per-step metrics
-                    if (
-                        self.global_step % self._log_every_n_steps == 0
-                        and self._is_rank_zero
-                    ):
-                        time_per_step = time.perf_counter() - t0
-                        log_dict = {
-                            "loss": loss_to_log,
-                            "lr": self._optimizer.param_groups[0]["lr"],
-                            "tokens_per_second_per_gpu": num_tokens / time_per_step,
-                        }
-                        if self._log_peak_memory_stats:
-                            log_dict.update(utils.get_memory_stats(device=self._device))
-                        self._metric_logger.log_dict(
-                            log_dict,
-                            step=self.global_step,
-                        )
-
                     # Reset running stats for the next step
                     running_loss = 0
                     num_tokens = 0
                     t0 = time.perf_counter()
 
             self.epochs_run += 1
-            self.save_checkpoint(epoch=curr_epoch)
+            # self.save_checkpoint(epoch=curr_epoch)
 
     def cleanup(self) -> None:
         if self._is_rank_zero:
@@ -618,7 +610,8 @@ def recipe_main(cfg: DictConfig) -> None:
             "If using tune CLI, please specify --nnodes 1 and --nproc_per_node [num_gpus]"
         )
 
-    init_process_group(backend="gloo" if cfg.device == "cpu" else "nccl")
+    acc.init_process_group()
+
     if cfg.get("fsdp_cpu_offload", False):
         # Utilize all available CPU cores for intra-op parallelism. This provides ~2x
         # speed up when benchmarking fused AdamW on CPU
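# The recipe hunks above route device selection and process-group setup
# through torchcompat instead of hard-coding CUDA. Only fetch_device and
# init_process_group are taken from the patch; the rest of this sketch is
# assumed launch boilerplate:
import os

import torch
import torchcompat.core as acc

local_rank = int(os.getenv("LOCAL_RANK", "0"))  # set per worker by torchrun
device = acc.fetch_device(local_rank)           # e.g. cuda:N or hpu:N
acc.init_process_group()                        # backend matching the device
x = torch.ones(8, device=device)                # tensors/collectives use it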
diff --git a/benchmarks/llm/recipes/full_finetune_single_device.py b/benchmarks/llm/recipes/full_finetune_single_device.py
index 98322579..629b0e9a 100755
--- a/benchmarks/llm/recipes/full_finetune_single_device.py
+++ b/benchmarks/llm/recipes/full_finetune_single_device.py
@@ -97,7 +97,8 @@ class FullFinetuneRecipeSingleDevice(FTRecipeInterface):
     """
 
     def __init__(self, cfg: DictConfig) -> None:
-        self._device = utils.get_device(device=cfg.device)
+        import torchcompat.core as accelerator
+        self._device = accelerator.fetch_device(int(os.getenv("HABANA_VISIBLE_MODULES", "0").split(",")[0]))
         self._dtype = utils.get_dtype(cfg.dtype, device=self._device)
         # Disable for fp16, as we haven't validated "full" fp16 with this recipe, nor
         # enabled necessary features such as gradient scaling.
@@ -279,9 +280,9 @@ def _setup_model(
             log.info("Compiling model with torch.compile...")
             backend = os.environ.get("TORCH_COMPILE_BACKEND", "inductor")
             model.compile(backend=backend)
-        if self._device.type == "cuda":
-            memory_stats = utils.get_memory_stats(device=self._device)
-            utils.log_memory_stats(memory_stats)
+        # if self._device.type == "cuda":
+        #     memory_stats = utils.get_memory_stats(device=self._device)
+        #     utils.log_memory_stats(memory_stats)
 
         return model
 
@@ -487,8 +488,8 @@ def train(self) -> None:
                         ),
                         "tokens_per_second_per_gpu": num_tokens / time_per_step,
                     }
-                    if self._device.type == "cuda" and self._log_peak_memory_stats:
-                        log_dict.update(utils.get_memory_stats(device=self._device))
+                    # if self._device.type == "cuda" and self._log_peak_memory_stats:
+                    #     log_dict.update(utils.get_memory_stats(device=self._device))
                     self._metric_logger.log_dict(
                         log_dict,
                         step=self.global_step,
diff --git a/benchmarks/llm/recipes/lora_finetune_distributed.py b/benchmarks/llm/recipes/lora_finetune_distributed.py
index 18b736fb..ae7c5b40 100755
--- a/benchmarks/llm/recipes/lora_finetune_distributed.py
+++ b/benchmarks/llm/recipes/lora_finetune_distributed.py
@@ -16,6 +16,7 @@
 
 import torch
 from omegaconf import DictConfig, ListConfig
+import torchcompat.core as acc
 from torch import nn
 from torch.distributed import destroy_process_group, init_process_group
 
@@ -44,6 +45,9 @@
 
 log = utils.get_logger("DEBUG")
 
+HPU_UNSUPPORTED = False
+
+
 class LoRAFinetuneRecipeDistributed(FTRecipeInterface):
     """
     Distributed LoRA finetuning recipe for dense transformer-based LLMs such as Llama2.
This recipe supports @@ -108,7 +112,7 @@ class LoRAFinetuneRecipeDistributed(FTRecipeInterface): """ def __init__(self, cfg: DictConfig) -> None: - self._device = utils.get_device(device=cfg.device) + self._device = acc.fetch_device(int(os.getenv("LOCAL_RANK", "0"))) self._dtype = utils.get_dtype(cfg.dtype, device=self._device) if self._dtype == torch.float16: @@ -132,7 +136,11 @@ def __init__(self, cfg: DictConfig) -> None: # These attributes constitute the recipe state and are updated by ``load_checkpoint`` # when ``resume_from_checkpoint`` is ``True`` - self.seed = utils.set_seed(seed=cfg.seed) + if HPU_UNSUPPORTED: + self.seed = utils.set_seed(seed=cfg.seed) + else: + self.seed = 1 + self.epochs_run = 0 self.total_epochs = cfg.epochs self.max_steps_per_epoch = cfg.max_steps_per_epoch @@ -428,7 +436,7 @@ def _setup_model( # Initialize empty modules on all non-zero ranks param_init_fn=( lambda module: module.to_empty( - device=torch.device("cuda"), recurse=False + device=self._device, recurse=False ) if not self._is_rank_zero else None @@ -443,8 +451,10 @@ def _setup_model( model, auto_wrap_policy={modules.TransformerDecoderLayer} ) if self._is_rank_zero: - memory_stats = utils.get_memory_stats(device=self._device) - utils.log_memory_stats(memory_stats) + if HPU_UNSUPPORTED: + pass + # memory_stats = utils.get_memory_stats(device=self._device) + # utils.log_memory_stats(memory_stats) # synchronize before training begins torch.distributed.barrier() @@ -703,8 +713,9 @@ def train(self) -> None: "lr": self._optimizer.param_groups[0]["lr"], "tokens_per_second_per_gpu": num_tokens / time_per_step, } - if self._log_peak_memory_stats: - log_dict.update(utils.get_memory_stats(device=self._device)) + # if self._log_peak_memory_stats: + # if HPU_UNSUPPORTED: + # log_dict.update(utils.get_memory_stats(device=self._device)) self._metric_logger.log_dict( log_dict, step=self.global_step, @@ -773,7 +784,7 @@ def recipe_main(cfg: DictConfig) -> None: "If using tune CLI, please specify --nnodes 1 and --nproc_per_node [num_gpus]" ) os.environ["TORCH_NCCL_AVOID_RECORD_STREAMS"] = "1" - init_process_group(backend="gloo" if cfg.device == "cpu" else "nccl") + acc.init_process_group() config.log_config(recipe_name="LoRAFinetuneRecipeDistributed", cfg=cfg) diff --git a/benchmarks/llm/recipes/lora_finetune_single_device.py b/benchmarks/llm/recipes/lora_finetune_single_device.py index cf5256ea..9060d203 100755 --- a/benchmarks/llm/recipes/lora_finetune_single_device.py +++ b/benchmarks/llm/recipes/lora_finetune_single_device.py @@ -101,8 +101,9 @@ class LoRAFinetuneRecipeSingleDevice(FTRecipeInterface): """ def __init__(self, cfg: DictConfig) -> None: - - self._device = utils.get_device(device=cfg.device) + import torchcompat.core as accelerator + + self._device = accelerator.fetch_device(int(os.getenv("HABANA_VISIBLE_MODULES", "0").split(",")[0])) # Reduced precision logic self._dtype = utils.get_dtype(cfg.dtype, device=self._device) # fp16 precision is explicitly disabled as it is not supported in this @@ -388,9 +389,9 @@ def _setup_model( log.info("Compiling model with torch.compile...") backend = os.environ.get("TORCH_COMPILE_BACKEND", "inductor") model.compile(backend=backend) - if self._device.type == "cuda": - memory_stats = utils.get_memory_stats(device=self._device) - utils.log_memory_stats(memory_stats) + # if self._device.type == "cuda": + # memory_stats = utils.get_memory_stats(device=self._device) + # utils.log_memory_stats(memory_stats) return model def _setup_optimizer( @@ -528,7 +529,8 @@ def 
train(self) -> None: """ The core training loop. """ - + import torchcompat.core as accelerator + if self._model_compile: log.info( "NOTE: torch.compile is enabled and model is compiled in first forward. Expect a relatively slow first iteration." @@ -579,10 +581,13 @@ def train(self) -> None: loss = self._loss_fn(logits, labels) / self._gradient_accumulation_steps running_loss += loss loss.backward() + accelerator.mark_step() # Step with optimizer if (idx + 1) % self._gradient_accumulation_steps == 0: self._optimizer.step() + accelerator.mark_step() + self._optimizer.zero_grad(set_to_none=True) self._lr_scheduler.step() # Update the number of steps when the weights are updated @@ -603,13 +608,13 @@ def train(self) -> None: "lr": self._optimizer.param_groups[0]["lr"], "tokens_per_second_per_gpu": num_tokens / time_per_step, } - if ( - self._device.type == "cuda" - and self._log_peak_memory_stats - ): - log_dict.update( - utils.get_memory_stats(device=self._device) - ) + # if ( + # self._device.type == "cuda" + # and self._log_peak_memory_stats + # ): + # log_dict.update( + # utils.get_memory_stats(device=self._device) + # ) self._metric_logger.log_dict( log_dict, step=self.global_step, diff --git a/benchmarks/llm/recipes/ppo_full_finetune_single_device.py b/benchmarks/llm/recipes/ppo_full_finetune_single_device.py index 8ee77c06..fbf8630a 100644 --- a/benchmarks/llm/recipes/ppo_full_finetune_single_device.py +++ b/benchmarks/llm/recipes/ppo_full_finetune_single_device.py @@ -496,9 +496,9 @@ def _setup_model( ref_policy_model.compile(backend=backend) value_model.compile(backend=backend) - if self._device.type == "cuda": - memory_stats = utils.get_memory_stats(device=self._device) - utils.log_memory_stats(memory_stats) + # if self._device.type == "cuda": + # memory_stats = utils.get_memory_stats(device=self._device) + # utils.log_memory_stats(memory_stats) return policy_model, value_model, reward_model, ref_policy_model @@ -1031,8 +1031,8 @@ def log_metrics( "approx_policy_kl": ppo_stats.approx_policy_kls.mean(), "response_lengths": trajectory.seq_lens.float().mean(), } - if self._device.type == "cuda" and self._log_peak_memory_stats: - log_dict.update(utils.get_memory_stats(device=self._device)) + # if self._device.type == "cuda" and self._log_peak_memory_stats: + # log_dict.update(utils.get_memory_stats(device=self._device)) self._metric_logger.log_dict(log_dict, step=self.global_step) diff --git a/benchmarks/llm/requirements.hpu.txt b/benchmarks/llm/requirements.hpu.txt new file mode 100644 index 00000000..9b88be53 --- /dev/null +++ b/benchmarks/llm/requirements.hpu.txt @@ -0,0 +1,408 @@ +# +# This file is autogenerated by pip-compile with Python 3.10 +# by the following command: +# +# pip-compile --output-file=benchmarks/llm/requirements.hpu.txt .pin/tmp-constraints-hpu-llm-full-mp-nodes.txt benchmarks/llm/requirements.in +# +accelerate==0.34.2 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # -r benchmarks/llm/requirements.in +aiohappyeyeballs==2.4.3 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # aiohttp +aiohttp==3.10.8 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # datasets + # fsspec +aiosignal==1.3.1 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # aiohttp +antlr4-python3-runtime==4.9.3 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # omegaconf +argklass==1.4.4 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # -r benchmarks/llm/requirements.in +asttokens==2.4.1 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # giving 
+async-timeout==4.0.3 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # aiohttp +attrs==24.2.0 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # aiohttp +blobfile==3.0.0 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # -r benchmarks/llm/requirements.txt + # torchtune +certifi==2024.8.30 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # requests +charset-normalizer==3.3.2 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # requests +codefind==0.1.7 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # ptera +datasets==3.0.1 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # torchtune +dill==0.3.8 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # datasets + # multiprocess +executing==2.1.0 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # varname +fairscale==0.4.13 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # -r benchmarks/llm/requirements.in + # -r benchmarks/llm/requirements.txt +filelock==3.16.1 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # blobfile + # datasets + # huggingface-hub + # torch + # transformers + # triton +fire==0.7.0 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # -r benchmarks/llm/requirements.txt +frozenlist==1.4.1 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # aiohttp + # aiosignal +fsspec[http]==2024.6.1 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # datasets + # huggingface-hub + # torch +giving==0.4.3 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # ptera + # voir +hjson==3.1.0 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # argklass +huggingface-hub==0.25.1 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # accelerate + # datasets + # tokenizers + # torchtune + # transformers +idna==3.10 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # requests + # yarl +importlib-resources==6.4.5 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # argklass +jinja2==3.1.4 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # torch +lxml==5.3.0 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # blobfile +markdown-it-py==3.0.0 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # rich +markupsafe==2.1.5 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # jinja2 +mdurl==0.1.2 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # markdown-it-py +mpmath==1.3.0 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # sympy +multidict==6.1.0 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # aiohttp + # yarl +multiprocess==0.70.16 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # datasets +networkx==3.3 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # torch +numpy==1.26.4 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # accelerate + # datasets + # fairscale + # pandas + # pyarrow + # torchtune + # transformers +nvidia-cublas-cu12==12.1.3.1 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # nvidia-cudnn-cu12 + # nvidia-cusolver-cu12 + # torch +nvidia-cuda-cupti-cu12==12.1.105 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # torch +nvidia-cuda-nvrtc-cu12==12.1.105 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # torch +nvidia-cuda-runtime-cu12==12.1.105 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # torch +nvidia-cudnn-cu12==9.1.0.70 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # torch +nvidia-cufft-cu12==11.0.2.54 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # torch +nvidia-curand-cu12==10.3.2.106 + # via + # -c 
.pin/../.pin/constraints-hpu-torch.txt + # torch +nvidia-cusolver-cu12==11.4.5.107 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # torch +nvidia-cusparse-cu12==12.1.0.106 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # nvidia-cusolver-cu12 + # torch +nvidia-ml-py==12.560.30 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # voir +nvidia-nccl-cu12==2.20.5 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # torch +nvidia-nvjitlink-cu12==12.6.77 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # nvidia-cusolver-cu12 + # nvidia-cusparse-cu12 +nvidia-nvtx-cu12==12.1.105 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # torch +omegaconf==2.3.0 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # torchtune + # voir +ovld==0.3.9 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # voir +packaging==24.1 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # accelerate + # datasets + # huggingface-hub + # transformers +pandas==2.2.3 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # datasets +psutil==5.9.8 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # accelerate + # voir +ptera==1.4.1 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # voir +pyarrow==17.0.0 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # datasets +pycryptodomex==3.21.0 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # blobfile +pygments==2.18.0 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # rich +python-dateutil==2.9.0.post0 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # pandas +pytz==2024.2 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # pandas +pyyaml==6.0.2 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # -r benchmarks/llm/requirements.in + # accelerate + # datasets + # huggingface-hub + # omegaconf + # transformers +reactivex==4.0.4 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # giving +regex==2024.9.11 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # tiktoken + # transformers +requests==2.32.3 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # datasets + # huggingface-hub + # tiktoken + # transformers +rich==13.9.1 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # voir +safetensors==0.4.5 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # accelerate + # torchtune + # transformers +sentencepiece==0.2.0 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # torchtune +six==1.16.0 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # asttokens + # python-dateutil +sympy==1.13.3 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # torch +termcolor==2.4.0 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # fire +tiktoken==0.7.0 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # torchtune +tokenizers==0.19.1 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # transformers +torch==2.4.1 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # -r benchmarks/llm/requirements.in + # -r benchmarks/llm/requirements.txt + # accelerate + # fairscale +torchao==0.3.1 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # -c .pin/../constraints/hpu.txt + # -r benchmarks/llm/requirements.in + # torchtune +torchtune==0.2.1 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # -c .pin/../constraints/hpu.txt + # -r benchmarks/llm/requirements.in +tqdm==4.66.5 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # datasets + # huggingface-hub + # torchtune + # transformers +transformers==4.44.2 + # via + # -c 
.pin/../.pin/constraints-hpu-torch.txt + # -c .pin/../constraints/hpu.txt + # -r benchmarks/llm/requirements.in +triton==3.0.0 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # torch +typing-extensions==4.12.2 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # huggingface-hub + # multidict + # reactivex + # rich + # torch +tzdata==2024.2 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # pandas +urllib3==2.2.3 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # blobfile + # requests +varname==0.13.3 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # giving +voir==0.2.19 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # -c .pin/../constraints/hpu.txt + # -r benchmarks/llm/requirements.in +xxhash==3.5.0 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # datasets +yarl==1.13.1 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # aiohttp diff --git a/benchmarks/purejaxrl/requirements.hpu.txt b/benchmarks/purejaxrl/requirements.hpu.txt new file mode 100644 index 00000000..aeb2b110 --- /dev/null +++ b/benchmarks/purejaxrl/requirements.hpu.txt @@ -0,0 +1,743 @@ +# +# This file is autogenerated by pip-compile with Python 3.10 +# by the following command: +# +# pip-compile --output-file=benchmarks/purejaxrl/requirements.hpu.txt .pin/tmp-constraints-hpu-ppo.txt benchmarks/purejaxrl/requirements.in +# +absl-py==2.1.0 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # brax + # chex + # distrax + # dm-env + # ml-collections + # mujoco + # mujoco-mjx + # optax + # orbax-checkpoint + # rlax + # tensorflow-probability +antlr4-python3-runtime==4.9.3 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # omegaconf +argklass==1.4.4 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # -r benchmarks/purejaxrl/requirements.in +astroid==3.3.4 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # pylint +asttokens==2.4.1 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # giving +black==24.8.0 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # navix +blinker==1.8.2 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # flask +brax==0.10.5 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # -r benchmarks/purejaxrl/requirements.in +certifi==2024.8.30 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # requests + # sentry-sdk +charset-normalizer==3.3.2 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # requests +chex==0.1.87 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # distrax + # evosax + # flashbax + # gymnax + # optax + # rlax +click==8.1.7 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # black + # flask + # wandb +cloudpickle==3.0.0 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # gym + # gymnasium + # tensorflow-probability +codefind==0.1.7 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # ptera +contextlib2==21.6.0 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # ml-collections +contourpy==1.3.0 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # matplotlib +cycler==0.12.1 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # matplotlib +decorator==5.1.1 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # tensorflow-probability +dill==0.3.8 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # pylint +distrax==0.1.5 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # -r benchmarks/purejaxrl/requirements.in + # rlax +dm-env==1.6 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # brax + # rlax +dm-tree==0.1.8 + # via + # -c 
.pin/../.pin/constraints-hpu-torch.txt + # dm-env + # tensorflow-probability +docker-pycreds==0.4.0 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # wandb +docstring-parser==0.16 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # tyro +dotmap==1.3.30 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # evosax +etils[epath,epy]==1.9.4 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # brax + # mujoco + # mujoco-mjx + # optax + # orbax-checkpoint +evosax==0.1.6 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # -r benchmarks/purejaxrl/requirements.in +exceptiongroup==1.2.2 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # pytest +executing==2.1.0 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # varname +farama-notifications==0.0.4 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # gymnasium +filelock==3.16.1 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # torch + # triton +flake8==7.1.1 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # navix +flashbax==0.1.2 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # -r benchmarks/purejaxrl/requirements.in +flask==3.0.3 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # brax + # flask-cors +flask-cors==5.0.0 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # brax +flax==0.9.0 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # -r benchmarks/purejaxrl/requirements.in + # brax + # evosax + # flashbax + # gymnax + # navix +fonttools==4.54.1 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # matplotlib +fsspec==2024.6.1 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # etils + # torch +gast==0.6.0 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # tensorflow-probability +gitdb==4.0.11 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # gitpython +gitpython==3.1.43 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # wandb +giving==0.4.3 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # ptera + # voir +glfw==2.7.0 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # mujoco +grpcio==1.66.2 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # brax +gym==0.26.2 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # brax + # gymnax +gym-notices==0.0.8 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # gym +gymnasium==0.29.1 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # gymnax +gymnax==0.0.8 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # -c .pin/../constraints/hpu.txt + # -r benchmarks/purejaxrl/requirements.in +hjson==3.1.0 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # argklass +humanize==4.10.0 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # orbax-checkpoint +idna==3.10 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # requests +importlib-resources==6.4.5 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # argklass + # etils +iniconfig==2.0.0 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # pytest +isort==5.13.2 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # pylint +itsdangerous==2.2.0 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # flask +jax==0.4.33 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # -r benchmarks/purejaxrl/requirements.in + # brax + # chex + # distrax + # evosax + # flashbax + # flax + # gymnax + # jaxopt + # mujoco-mjx + # optax + # orbax-checkpoint + # rlax +jaxlib==0.4.33 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # brax + # chex + # distrax + # evosax + # flashbax + 
# gymnax + # jax + # jaxopt + # mujoco-mjx + # optax + # orbax-checkpoint + # rlax +jaxopt==0.8.3 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # brax +jinja2==3.1.4 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # brax + # flask + # torch +kiwisolver==1.4.7 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # matplotlib +markdown-it-py==3.0.0 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # rich +markupsafe==2.1.5 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # jinja2 + # werkzeug +matplotlib==3.9.2 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # evosax + # gymnax + # seaborn +mccabe==0.7.0 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # flake8 + # pylint +mdurl==0.1.2 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # markdown-it-py +ml-collections==0.1.1 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # brax +ml-dtypes==0.5.0 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # jax + # jaxlib + # tensorstore +mpmath==1.3.0 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # sympy +msgpack==1.1.0 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # flax + # orbax-checkpoint +mujoco==3.2.3 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # brax + # mujoco-mjx +mujoco-mjx==3.2.3 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # brax +mypy-extensions==1.0.0 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # black +navix==0.7.0 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # -r benchmarks/purejaxrl/requirements.in +nest-asyncio==1.6.0 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # orbax-checkpoint +networkx==3.3 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # torch +numpy==1.26.4 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # -r benchmarks/purejaxrl/requirements.in + # brax + # chex + # contourpy + # distrax + # dm-env + # evosax + # flashbax + # gym + # gymnasium + # jax + # jaxlib + # jaxopt + # matplotlib + # ml-dtypes + # mujoco + # navix + # optax + # orbax-checkpoint + # pandas + # rlax + # scipy + # seaborn + # tensorboardx + # tensorflow-probability + # tensorstore + # trimesh +nvidia-cublas-cu12==12.1.3.1 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # nvidia-cudnn-cu12 + # nvidia-cusolver-cu12 + # torch +nvidia-cuda-cupti-cu12==12.1.105 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # torch +nvidia-cuda-nvrtc-cu12==12.1.105 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # torch +nvidia-cuda-runtime-cu12==12.1.105 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # torch +nvidia-cudnn-cu12==9.1.0.70 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # torch +nvidia-cufft-cu12==11.0.2.54 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # torch +nvidia-curand-cu12==10.3.2.106 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # torch +nvidia-cusolver-cu12==11.4.5.107 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # torch +nvidia-cusparse-cu12==12.1.0.106 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # nvidia-cusolver-cu12 + # torch +nvidia-ml-py==12.560.30 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # voir +nvidia-nccl-cu12==2.20.5 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # torch +nvidia-nvjitlink-cu12==12.6.77 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # nvidia-cusolver-cu12 + # nvidia-cusparse-cu12 +nvidia-nvtx-cu12==12.1.105 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # torch +omegaconf==2.3.0 + # via + # 
-c .pin/../.pin/constraints-hpu-torch.txt + # voir +opt-einsum==3.4.0 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # jax +optax==0.2.3 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # -r benchmarks/purejaxrl/requirements.in + # brax + # flax +orbax-checkpoint==0.6.4 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # brax + # flax +ovld==0.3.9 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # voir +packaging==24.1 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # black + # matplotlib + # pytest + # setuptools-scm + # tensorboardx +pandas==2.2.3 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # seaborn +pathspec==0.12.1 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # black +pillow==10.4.0 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # brax + # matplotlib + # navix +platformdirs==4.3.6 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # black + # pylint + # wandb +pluggy==1.5.0 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # pytest +protobuf==5.28.2 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # orbax-checkpoint + # tensorboardx + # wandb +psutil==5.9.8 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # voir + # wandb +ptera==1.4.1 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # voir +pycodestyle==2.12.1 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # flake8 +pyflakes==3.2.0 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # flake8 +pygments==2.18.0 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # rich +pylint==3.3.1 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # navix +pyopengl==3.1.7 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # mujoco +pyparsing==3.1.4 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # matplotlib +pytest==8.3.3 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # navix +python-dateutil==2.9.0.post0 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # matplotlib + # pandas +pytinyrenderer==0.0.14 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # brax +pytz==2024.2 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # pandas +pyyaml==6.0.2 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # evosax + # flax + # gymnax + # ml-collections + # omegaconf + # orbax-checkpoint + # wandb +reactivex==4.0.4 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # giving +requests==2.32.3 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # wandb +rich==13.9.1 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # flax + # tyro + # voir +rlax==0.1.6 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # navix +scipy==1.14.1 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # brax + # jax + # jaxlib + # jaxopt + # mujoco-mjx +seaborn==0.13.2 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # gymnax +sentry-sdk==2.15.0 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # wandb +setproctitle==1.3.3 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # wandb +setuptools-scm==8.1.0 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # navix +shtab==1.7.1 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # tyro +six==1.16.0 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # asttokens + # docker-pycreds + # ml-collections + # python-dateutil + # tensorflow-probability +smmap==5.0.1 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # gitdb +sympy==1.13.3 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # torch +tensorboardx==2.6.2.2 + 
# via + # -c .pin/../.pin/constraints-hpu-torch.txt + # brax +tensorflow-probability==0.24.0 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # distrax +tensorstore==0.1.66 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # flashbax + # flax + # orbax-checkpoint +tomli==2.0.2 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # black + # pylint + # pytest + # setuptools-scm +tomlkit==0.13.2 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # pylint +toolz==0.12.1 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # chex +torch==2.4.1 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # -r benchmarks/purejaxrl/requirements.in +trimesh==4.4.9 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # brax + # mujoco-mjx +triton==3.0.0 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # torch +typing-extensions==4.12.2 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # astroid + # black + # brax + # chex + # etils + # flashbax + # flax + # gymnasium + # navix + # orbax-checkpoint + # reactivex + # rich + # torch + # tyro +tyro==0.8.11 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # navix +tzdata==2024.2 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # pandas +urllib3==2.2.3 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # requests + # sentry-sdk +varname==0.13.3 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # giving +voir==0.2.19 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # -c .pin/../constraints/hpu.txt + # -r benchmarks/purejaxrl/requirements.in +wandb==0.18.3 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # navix +werkzeug==3.0.4 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # flask +zipp==3.20.2 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # etils + +# The following packages are considered to be unsafe in a requirements file: +# setuptools diff --git a/benchmarks/recursiongfn/requirements.hpu.txt b/benchmarks/recursiongfn/requirements.hpu.txt new file mode 100644 index 00000000..4e362ae6 --- /dev/null +++ b/benchmarks/recursiongfn/requirements.hpu.txt @@ -0,0 +1,493 @@ +# +# This file is autogenerated by pip-compile with Python 3.10 +# by the following command: +# +# pip-compile --output-file=benchmarks/recursiongfn/requirements.hpu.txt .pin/tmp-constraints-hpu-recursiongfn.txt benchmarks/recursiongfn/requirements.in +# +absl-py==2.1.0 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # tensorboard +aiohappyeyeballs==2.4.3 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # aiohttp +aiohttp==3.10.8 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # torch-geometric +aiosignal==1.3.1 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # aiohttp +antlr4-python3-runtime==4.9.3 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # omegaconf +asttokens==2.4.1 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # giving +async-timeout==4.0.3 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # aiohttp +attrs==24.2.0 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # aiohttp +blosc2==2.7.1 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # tables +botorch==0.12.0 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # -r benchmarks/recursiongfn/requirements.in +certifi==2024.8.30 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # requests + # sentry-sdk +charset-normalizer==3.3.2 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # requests +click==8.1.7 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt 
+ # wandb +codefind==0.1.7 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # ptera +cvxopt==1.3.2 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # -r benchmarks/recursiongfn/requirements.in +docker-pycreds==0.4.0 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # wandb +executing==2.1.0 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # varname +filelock==3.16.1 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # torch + # triton +frozenlist==1.4.1 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # aiohttp + # aiosignal +fsspec==2024.6.1 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # torch + # torch-geometric +gitdb==4.0.11 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # gitpython +gitpython==3.1.43 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # -r benchmarks/recursiongfn/requirements.in + # wandb +giving==0.4.3 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # ptera + # voir +gpytorch==1.13 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # -r benchmarks/recursiongfn/requirements.in + # botorch +grpcio==1.66.2 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # tensorboard +idna==3.10 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # requests + # yarl +jaxtyping==0.2.19 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # gpytorch + # linear-operator +jinja2==3.1.4 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # torch + # torch-geometric +joblib==1.4.2 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # scikit-learn +linear-operator==0.5.3 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # botorch + # gpytorch +markdown==3.7 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # tensorboard +markdown-it-py==3.0.0 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # rich +markupsafe==2.1.5 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # jinja2 + # werkzeug +mdurl==0.1.2 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # markdown-it-py +mpmath==1.3.0 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # botorch + # gpytorch + # linear-operator + # sympy +msgpack==1.1.0 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # blosc2 +multidict==6.1.0 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # aiohttp + # yarl +multipledispatch==1.0.0 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # botorch +ndindex==1.9.2 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # blosc2 +networkx==3.3 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # -r benchmarks/recursiongfn/requirements.in + # torch +numexpr==2.10.1 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # blosc2 + # tables +numpy==1.26.4 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # blosc2 + # jaxtyping + # numexpr + # pandas + # pyarrow + # pyro-ppl + # rdkit + # scikit-learn + # scipy + # tables + # tensorboard + # torch-geometric +nvidia-cublas-cu12==12.1.3.1 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # nvidia-cudnn-cu12 + # nvidia-cusolver-cu12 + # torch +nvidia-cuda-cupti-cu12==12.1.105 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # torch +nvidia-cuda-nvrtc-cu12==12.1.105 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # torch +nvidia-cuda-runtime-cu12==12.1.105 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # torch +nvidia-cudnn-cu12==9.1.0.70 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # torch +nvidia-cufft-cu12==11.0.2.54 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt 
+ # torch +nvidia-curand-cu12==10.3.2.106 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # torch +nvidia-cusolver-cu12==11.4.5.107 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # torch +nvidia-cusparse-cu12==12.1.0.106 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # nvidia-cusolver-cu12 + # torch +nvidia-ml-py==12.560.30 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # voir +nvidia-nccl-cu12==2.20.5 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # torch +nvidia-nvjitlink-cu12==12.6.77 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # nvidia-cusolver-cu12 + # nvidia-cusparse-cu12 +nvidia-nvtx-cu12==12.1.105 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # torch +omegaconf==2.3.0 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # -r benchmarks/recursiongfn/requirements.in + # voir +opt-einsum==3.4.0 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # pyro-ppl +ovld==0.3.9 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # voir +packaging==24.1 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # tables + # tensorboard +pandas==2.2.3 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # -r benchmarks/recursiongfn/requirements.in +pillow==10.4.0 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # rdkit +platformdirs==4.3.6 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # wandb +protobuf==5.28.2 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # tensorboard + # wandb +psutil==5.9.8 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # torch-geometric + # voir + # wandb +ptera==1.4.1 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # voir +py-cpuinfo==9.0.0 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # blosc2 + # tables +pyarrow==17.0.0 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # -r benchmarks/recursiongfn/requirements.in +pygments==2.18.0 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # rich +pyparsing==3.1.4 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # torch-geometric +pyro-api==0.1.2 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # pyro-ppl +pyro-ppl==1.9.1 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # -r benchmarks/recursiongfn/requirements.in + # botorch +python-dateutil==2.9.0.post0 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # pandas +pytz==2024.2 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # pandas +pyyaml==6.0.2 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # omegaconf + # wandb +rdkit==2024.3.5 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # -r benchmarks/recursiongfn/requirements.in +reactivex==4.0.4 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # giving +requests==2.32.3 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # torch-geometric + # wandb +rich==13.9.1 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # voir +scikit-learn==1.5.2 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # gpytorch +scipy==1.14.1 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # -r benchmarks/recursiongfn/requirements.in + # botorch + # gpytorch + # linear-operator + # scikit-learn + # torch-cluster + # torch-sparse +sentry-sdk==2.15.0 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # wandb +setproctitle==1.3.3 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # wandb +six==1.16.0 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # asttokens + # docker-pycreds + # python-dateutil + # tensorboard +smmap==5.0.1 + 
# via + # -c .pin/../.pin/constraints-hpu-torch.txt + # gitdb +sympy==1.13.3 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # torch +tables==3.10.1 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # -r benchmarks/recursiongfn/requirements.in +tensorboard==2.18.0 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # -r benchmarks/recursiongfn/requirements.in +tensorboard-data-server==0.7.2 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # tensorboard +threadpoolctl==3.5.0 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # scikit-learn +torch==2.4.1 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # -r benchmarks/recursiongfn/requirements.in + # botorch + # linear-operator + # pyro-ppl +torch-cluster==1.6.3 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # -r benchmarks/recursiongfn/requirements.in +torch-geometric==2.6.1 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # -r benchmarks/recursiongfn/requirements.in +torch-scatter==2.1.2 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # -r benchmarks/recursiongfn/requirements.in +torch-sparse==0.6.18 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # -r benchmarks/recursiongfn/requirements.in +tqdm==4.66.5 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # pyro-ppl + # torch-geometric +triton==3.0.0 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # torch +typeguard==4.3.0 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # jaxtyping +typing-extensions==4.12.2 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # botorch + # jaxtyping + # multidict + # reactivex + # rich + # tables + # torch + # typeguard +tzdata==2024.2 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # pandas +urllib3==2.2.3 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # requests + # sentry-sdk +varname==0.13.3 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # giving +voir==0.2.19 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # -c .pin/../constraints/hpu.txt + # -r benchmarks/recursiongfn/requirements.in +wandb==0.18.3 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # -r benchmarks/recursiongfn/requirements.in +werkzeug==3.0.4 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # tensorboard +yarl==1.13.1 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # aiohttp + +# The following packages are considered to be unsafe in a requirements file: +# setuptools diff --git a/benchmarks/rlhf/main.py b/benchmarks/rlhf/main.py index 0be12d28..3a5f1dda 100755 --- a/benchmarks/rlhf/main.py +++ b/benchmarks/rlhf/main.py @@ -2,6 +2,7 @@ import shutil +import accelerate from accelerate import PartialState from datasets import load_dataset from transformers import ( @@ -15,10 +16,16 @@ from trl.trainer.ppov2_trainer import PPOv2Config, PPOv2Trainer from trl.trainer.utils import SIMPLE_QUERY_CHAT_TEMPLATE +import torchcompat.core as compat + class PPOv2TrainerIntrumented(PPOv2Trainer): def __init__(self, config: PPOv2Config, *args, **kwargs): config.report_to = [] + + # FIXME: better way to monkeypatch this ? 
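# A sketch of the monkey-patch applied just below (standard Python attribute
# rebinding; only compat.accelerate.Accelerator comes from the patch). The
# swap must happen before super().__init__ constructs the trainer, and it
# only affects call sites that resolve accelerate.Accelerator at call time;
# code that did `from accelerate import Accelerator` keeps the original class.
import accelerate
import torchcompat.core as compat

_original = accelerate.Accelerator
accelerate.Accelerator = compat.accelerate.Accelerator  # patch in place
try:
    pass  # construct the PPOv2 trainer here; trl picks up the patched class
finally:
    accelerate.Accelerator = _original  # optional: restore once constructed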
+ # Use the compatibility accelerator class + accelerate.Accelerator = compat.accelerate.Accelerator super().__init__(config, *args, **kwargs) def batch_size_fn(batch): @@ -46,9 +53,13 @@ def save_model(self, *args, **kwargs): def main(): + parser = HfArgumentParser((PPOv2Config, ModelConfig)) config, model_config = parser.parse_args_into_dataclasses() + + import torchcompat.core + # remove output_dir if exists shutil.rmtree(config.output_dir, ignore_errors=True) diff --git a/benchmarks/rlhf/requirements.hpu.txt b/benchmarks/rlhf/requirements.hpu.txt new file mode 100644 index 00000000..a6c12765 --- /dev/null +++ b/benchmarks/rlhf/requirements.hpu.txt @@ -0,0 +1,362 @@ +# +# This file is autogenerated by pip-compile with Python 3.10 +# by the following command: +# +# pip-compile --output-file=benchmarks/rlhf/requirements.hpu.txt .pin/tmp-constraints-hpu-rlhf-gpus.txt benchmarks/rlhf/requirements.in +# +accelerate==0.34.2 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # -r benchmarks/rlhf/requirements.in + # trl +aiohappyeyeballs==2.4.3 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # aiohttp +aiohttp==3.10.8 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # datasets + # fsspec +aiosignal==1.3.1 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # aiohttp +antlr4-python3-runtime==4.9.3 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # omegaconf +asttokens==2.4.1 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # giving +async-timeout==4.0.3 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # aiohttp +attrs==24.2.0 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # aiohttp +certifi==2024.8.30 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # requests +charset-normalizer==3.3.2 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # requests +codefind==0.1.7 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # ptera +datasets==3.0.1 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # -r benchmarks/rlhf/requirements.in + # trl +dill==0.3.8 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # datasets + # multiprocess +docstring-parser==0.16 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # tyro +executing==2.1.0 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # varname +filelock==3.16.1 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # datasets + # huggingface-hub + # torch + # transformers + # triton +frozenlist==1.4.1 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # aiohttp + # aiosignal +fsspec[http]==2024.6.1 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # datasets + # huggingface-hub + # torch +giving==0.4.3 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # ptera + # voir +huggingface-hub==0.25.1 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # accelerate + # datasets + # tokenizers + # transformers +idna==3.10 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # requests + # yarl +jinja2==3.1.4 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # torch +markdown-it-py==3.0.0 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # rich +markupsafe==2.1.5 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # jinja2 +mdurl==0.1.2 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # markdown-it-py +mpmath==1.3.0 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # sympy +multidict==6.1.0 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # aiohttp + # yarl +multiprocess==0.70.16 + # via + # -c 
.pin/../.pin/constraints-hpu-torch.txt + # datasets +networkx==3.3 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # torch +numpy==1.26.4 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # accelerate + # datasets + # pandas + # pyarrow + # transformers + # trl +nvidia-cublas-cu12==12.1.3.1 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # nvidia-cudnn-cu12 + # nvidia-cusolver-cu12 + # torch +nvidia-cuda-cupti-cu12==12.1.105 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # torch +nvidia-cuda-nvrtc-cu12==12.1.105 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # torch +nvidia-cuda-runtime-cu12==12.1.105 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # torch +nvidia-cudnn-cu12==9.1.0.70 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # torch +nvidia-cufft-cu12==11.0.2.54 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # torch +nvidia-curand-cu12==10.3.2.106 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # torch +nvidia-cusolver-cu12==11.4.5.107 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # torch +nvidia-cusparse-cu12==12.1.0.106 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # nvidia-cusolver-cu12 + # torch +nvidia-ml-py==12.560.30 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # voir +nvidia-nccl-cu12==2.20.5 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # torch +nvidia-nvjitlink-cu12==12.6.77 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # nvidia-cusolver-cu12 + # nvidia-cusparse-cu12 +nvidia-nvtx-cu12==12.1.105 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # torch +omegaconf==2.3.0 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # voir +ovld==0.3.9 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # voir +packaging==24.1 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # accelerate + # datasets + # huggingface-hub + # transformers +pandas==2.2.3 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # datasets +psutil==5.9.8 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # accelerate + # voir +ptera==1.4.1 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # voir +pyarrow==17.0.0 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # datasets +pygments==2.18.0 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # rich +python-dateutil==2.9.0.post0 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # pandas +pytz==2024.2 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # pandas +pyyaml==6.0.2 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # accelerate + # datasets + # huggingface-hub + # omegaconf + # transformers +reactivex==4.0.4 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # giving +regex==2024.9.11 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # transformers +requests==2.32.3 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # datasets + # huggingface-hub + # transformers +rich==13.9.1 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # tyro + # voir +safetensors==0.4.5 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # accelerate + # transformers +shtab==1.7.1 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # tyro +six==1.16.0 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # asttokens + # python-dateutil +sympy==1.13.3 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # torch +tokenizers==0.19.1 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # transformers +torch==2.4.1 + # via + # -c 
.pin/../.pin/constraints-hpu-torch.txt + # -r benchmarks/rlhf/requirements.in + # accelerate + # trl +tqdm==4.66.5 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # datasets + # huggingface-hub + # transformers +transformers==4.44.2 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # -c .pin/../constraints/hpu.txt + # -r benchmarks/rlhf/requirements.in + # trl +triton==3.0.0 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # torch +trl==0.10.1 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # -c .pin/../constraints/hpu.txt + # -r benchmarks/rlhf/requirements.in +typing-extensions==4.12.2 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # huggingface-hub + # multidict + # reactivex + # rich + # torch + # tyro +tyro==0.8.11 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # trl +tzdata==2024.2 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # pandas +urllib3==2.2.3 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # requests +varname==0.13.3 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # giving +voir==0.2.19 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # -c .pin/../constraints/hpu.txt + # -r benchmarks/rlhf/requirements.in +xxhash==3.5.0 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # datasets +yarl==1.13.1 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # aiohttp diff --git a/benchmarks/timm/requirements.hpu.txt b/benchmarks/timm/requirements.hpu.txt index 432c91bc..e626bd1f 100644 --- a/benchmarks/timm/requirements.hpu.txt +++ b/benchmarks/timm/requirements.hpu.txt @@ -4,10 +4,6 @@ # # pip-compile --output-file=benchmarks/timm/requirements.hpu.txt .pin/tmp-constraints-hpu-timm.txt benchmarks/timm/requirements.in # ---extra-index-url https://pypi.ngc.nvidia.com ---find-links https://storage.googleapis.com/jax-releases/jax_cuda_releases.html ---trusted-host pypi.ngc.nvidia.com - antlr4-python3-runtime==4.9.3 # via # -c .pin/../.pin/constraints-hpu-torch.txt @@ -16,7 +12,7 @@ asttokens==2.4.1 # via # -c .pin/../.pin/constraints-hpu-torch.txt # giving -certifi==2024.6.2 +certifi==2024.8.30 # via # -c .pin/../.pin/constraints-hpu-torch.txt # requests @@ -24,35 +20,35 @@ charset-normalizer==3.3.2 # via # -c .pin/../.pin/constraints-hpu-torch.txt # requests -codefind==0.1.6 +codefind==0.1.7 # via # -c .pin/../.pin/constraints-hpu-torch.txt # ptera -executing==1.2.0 +executing==2.1.0 # via # -c .pin/../.pin/constraints-hpu-torch.txt # varname -filelock==3.15.4 +filelock==3.16.1 # via # -c .pin/../.pin/constraints-hpu-torch.txt # huggingface-hub # torch # triton -fsspec==2024.5.0 +fsspec==2024.6.1 # via # -c .pin/../.pin/constraints-hpu-torch.txt # huggingface-hub # torch -giving==0.4.2 +giving==0.4.3 # via # -c .pin/../.pin/constraints-hpu-torch.txt # ptera # voir -huggingface-hub==0.24.0 +huggingface-hub==0.25.1 # via # -c .pin/../.pin/constraints-hpu-torch.txt # -r benchmarks/timm/requirements.in -idna==3.7 +idna==3.10 # via # -c .pin/../.pin/constraints-hpu-torch.txt # requests @@ -102,7 +98,7 @@ nvidia-cuda-runtime-cu12==12.1.105 # via # -c .pin/../.pin/constraints-hpu-torch.txt # torch -nvidia-cudnn-cu12==8.9.2.26 +nvidia-cudnn-cu12==9.1.0.70 # via # -c .pin/../.pin/constraints-hpu-torch.txt # torch @@ -123,11 +119,15 @@ nvidia-cusparse-cu12==12.1.0.106 # -c .pin/../.pin/constraints-hpu-torch.txt # nvidia-cusolver-cu12 # torch +nvidia-ml-py==12.560.30 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # voir nvidia-nccl-cu12==2.20.5 # via # -c .pin/../.pin/constraints-hpu-torch.txt # torch 
-nvidia-nvjitlink-cu12==12.5.82 +nvidia-nvjitlink-cu12==12.6.77 # via # -c .pin/../.pin/constraints-hpu-torch.txt # nvidia-cusolver-cu12 @@ -140,7 +140,7 @@ omegaconf==2.3.0 # via # -c .pin/../.pin/constraints-hpu-torch.txt # voir -ovld==0.3.5 +ovld==0.3.9 # via # -c .pin/../.pin/constraints-hpu-torch.txt # voir @@ -164,11 +164,7 @@ pygments==2.18.0 # via # -c .pin/../.pin/constraints-hpu-torch.txt # rich -pynvml==11.5.3 - # via - # -c .pin/../.pin/constraints-hpu-torch.txt - # voir -pyyaml==6.0.1 +pyyaml==6.0.2 # via # -c .pin/../.pin/constraints-hpu-torch.txt # -r benchmarks/timm/requirements.in @@ -182,11 +178,11 @@ requests==2.32.3 # via # -c .pin/../.pin/constraints-hpu-torch.txt # huggingface-hub -rich==13.7.1 +rich==13.9.1 # via # -c .pin/../.pin/constraints-hpu-torch.txt # voir -safetensors==0.4.3 +safetensors==0.4.5 # via # -c .pin/../.pin/constraints-hpu-torch.txt # -r benchmarks/timm/requirements.in @@ -194,24 +190,24 @@ six==1.16.0 # via # -c .pin/../.pin/constraints-hpu-torch.txt # asttokens -sympy==1.13.0 +sympy==1.13.3 # via # -c .pin/../.pin/constraints-hpu-torch.txt # torch -torch==2.3.1 +torch==2.4.1 # via # -c .pin/../.pin/constraints-hpu-torch.txt # -r benchmarks/timm/requirements.in # torchvision -torchvision==0.18.1 +torchvision==0.19.1 # via # -c .pin/../.pin/constraints-hpu-torch.txt # -r benchmarks/timm/requirements.in -tqdm==4.66.4 +tqdm==4.66.5 # via # -c .pin/../.pin/constraints-hpu-torch.txt # huggingface-hub -triton==2.3.1 +triton==3.0.0 # via # -c .pin/../.pin/constraints-hpu-torch.txt # torch @@ -220,12 +216,13 @@ typing-extensions==4.12.2 # -c .pin/../.pin/constraints-hpu-torch.txt # huggingface-hub # reactivex + # rich # torch -urllib3==1.26.19 +urllib3==2.2.3 # via # -c .pin/../.pin/constraints-hpu-torch.txt # requests -varname==0.10.0 +varname==0.13.3 # via # -c .pin/../.pin/constraints-hpu-torch.txt # giving diff --git a/benchmarks/torchatari/requirements.hpu.txt b/benchmarks/torchatari/requirements.hpu.txt new file mode 100644 index 00000000..6d7369df --- /dev/null +++ b/benchmarks/torchatari/requirements.hpu.txt @@ -0,0 +1,304 @@ +# +# This file is autogenerated by pip-compile with Python 3.10 +# by the following command: +# +# pip-compile --output-file=benchmarks/torchatari/requirements.hpu.txt .pin/tmp-constraints-hpu-torchatari.txt benchmarks/torchatari/requirements.in +# +absl-py==2.1.0 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # dm-env + # tensorboard +antlr4-python3-runtime==4.9.3 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # omegaconf +appdirs==1.4.4 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # cantilever +asttokens==2.4.1 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # giving +cantilever==0.1.0 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # -r benchmarks/torchatari/requirements.in +cloudpickle==3.0.0 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # gym + # gymnasium +codefind==0.1.7 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # ptera +dm-env==1.6 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # envpool +dm-tree==0.1.8 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # dm-env +docstring-parser==0.16 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # tyro +envpool==0.8.4 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # -r benchmarks/torchatari/requirements.in +executing==2.1.0 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # varname +farama-notifications==0.0.4 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # 
gymnasium +filelock==3.16.1 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # torch + # triton +fsspec==2024.6.1 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # torch +giving==0.4.3 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # ptera + # voir +grpcio==1.66.2 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # tensorboard +gym==0.26.2 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # -r benchmarks/torchatari/requirements.in + # envpool +gym-notices==0.0.8 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # gym +gymnasium==0.29.1 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # envpool +importlib-resources==6.4.5 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # cantilever + # torchcompat +jinja2==3.1.4 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # torch +markdown==3.7 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # tensorboard +markdown-it-py==3.0.0 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # rich +markupsafe==2.1.5 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # jinja2 + # werkzeug +mdurl==0.1.2 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # markdown-it-py +mpmath==1.3.0 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # sympy +networkx==3.3 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # torch +numpy==1.26.4 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # -r benchmarks/torchatari/requirements.in + # dm-env + # envpool + # gym + # gymnasium + # tensorboard +nvidia-cublas-cu12==12.1.3.1 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # nvidia-cudnn-cu12 + # nvidia-cusolver-cu12 + # torch +nvidia-cuda-cupti-cu12==12.1.105 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # torch +nvidia-cuda-nvrtc-cu12==12.1.105 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # torch +nvidia-cuda-runtime-cu12==12.1.105 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # torch +nvidia-cudnn-cu12==9.1.0.70 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # torch +nvidia-cufft-cu12==11.0.2.54 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # torch +nvidia-curand-cu12==10.3.2.106 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # torch +nvidia-cusolver-cu12==11.4.5.107 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # torch +nvidia-cusparse-cu12==12.1.0.106 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # nvidia-cusolver-cu12 + # torch +nvidia-ml-py==12.560.30 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # voir +nvidia-nccl-cu12==2.20.5 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # torch +nvidia-nvjitlink-cu12==12.6.77 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # nvidia-cusolver-cu12 + # nvidia-cusparse-cu12 +nvidia-nvtx-cu12==12.1.105 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # torch +omegaconf==2.3.0 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # voir +optree==0.13.0 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # envpool +ovld==0.3.9 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # voir +packaging==24.1 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # envpool + # tensorboard +protobuf==5.28.2 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # tensorboard +psutil==5.9.8 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # voir +ptera==1.4.1 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # voir +pygments==2.18.0 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # rich +pyyaml==6.0.2 
+ # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # omegaconf +reactivex==4.0.4 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # giving +rich==13.9.1 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # tyro + # voir +shtab==1.7.1 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # tyro +six==1.16.0 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # asttokens + # tensorboard +sympy==1.13.3 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # torch +tensorboard==2.18.0 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # -r benchmarks/torchatari/requirements.in +tensorboard-data-server==0.7.2 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # tensorboard +torch==2.4.1 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # -r benchmarks/torchatari/requirements.in +torchcompat==1.1.4 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # -c .pin/../constraints/hpu.txt + # -r benchmarks/torchatari/requirements.in +triton==3.0.0 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # torch +types-protobuf==5.28.0.20240924 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # envpool +typing-extensions==4.12.2 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # envpool + # gymnasium + # optree + # reactivex + # rich + # torch + # tyro +tyro==0.8.11 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # -r benchmarks/torchatari/requirements.in +varname==0.13.3 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # giving +voir==0.2.19 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # -c .pin/../constraints/hpu.txt + # -r benchmarks/torchatari/requirements.in +werkzeug==3.0.4 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # tensorboard + +# The following packages are considered to be unsafe in a requirements file: +# setuptools diff --git a/benchmarks/torchvision/requirements.hpu.txt b/benchmarks/torchvision/requirements.hpu.txt index 369a1753..f0b47e91 100644 --- a/benchmarks/torchvision/requirements.hpu.txt +++ b/benchmarks/torchvision/requirements.hpu.txt @@ -4,10 +4,6 @@ # # pip-compile --output-file=benchmarks/torchvision/requirements.hpu.txt .pin/tmp-constraints-hpu-torchvision.txt benchmarks/torchvision/requirements.in # ---extra-index-url https://pypi.ngc.nvidia.com ---find-links https://storage.googleapis.com/jax-releases/jax_cuda_releases.html ---trusted-host pypi.ngc.nvidia.com - antlr4-python3-runtime==4.9.3 # via # -c .pin/../.pin/constraints-hpu-torch.txt @@ -16,29 +12,29 @@ asttokens==2.4.1 # via # -c .pin/../.pin/constraints-hpu-torch.txt # giving -codefind==0.1.6 +codefind==0.1.7 # via # -c .pin/../.pin/constraints-hpu-torch.txt # ptera -executing==1.2.0 +executing==2.1.0 # via # -c .pin/../.pin/constraints-hpu-torch.txt # varname -filelock==3.15.4 +filelock==3.16.1 # via # -c .pin/../.pin/constraints-hpu-torch.txt # torch # triton -fsspec==2024.5.0 +fsspec==2024.6.1 # via # -c .pin/../.pin/constraints-hpu-torch.txt # torch -giving==0.4.2 +giving==0.4.3 # via # -c .pin/../.pin/constraints-hpu-torch.txt # ptera # voir -importlib-resources==6.4.0 +importlib-resources==6.4.5 # via # -c .pin/../.pin/constraints-hpu-torch.txt # torchcompat @@ -88,7 +84,7 @@ nvidia-cuda-runtime-cu12==12.1.105 # via # -c .pin/../.pin/constraints-hpu-torch.txt # torch -nvidia-cudnn-cu12==8.9.2.26 +nvidia-cudnn-cu12==9.1.0.70 # via # -c .pin/../.pin/constraints-hpu-torch.txt # torch @@ -109,11 +105,15 @@ nvidia-cusparse-cu12==12.1.0.106 # -c .pin/../.pin/constraints-hpu-torch.txt # nvidia-cusolver-cu12 # torch 
+nvidia-ml-py==12.560.30 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # voir nvidia-nccl-cu12==2.20.5 # via # -c .pin/../.pin/constraints-hpu-torch.txt # torch -nvidia-nvjitlink-cu12==12.5.82 +nvidia-nvjitlink-cu12==12.6.77 # via # -c .pin/../.pin/constraints-hpu-torch.txt # nvidia-cusolver-cu12 @@ -126,7 +126,7 @@ omegaconf==2.3.0 # via # -c .pin/../.pin/constraints-hpu-torch.txt # voir -ovld==0.3.5 +ovld==0.3.9 # via # -c .pin/../.pin/constraints-hpu-torch.txt # voir @@ -146,11 +146,7 @@ pygments==2.18.0 # via # -c .pin/../.pin/constraints-hpu-torch.txt # rich -pynvml==11.5.3 - # via - # -c .pin/../.pin/constraints-hpu-torch.txt - # voir -pyyaml==6.0.1 +pyyaml==6.0.2 # via # -c .pin/../.pin/constraints-hpu-torch.txt # omegaconf @@ -158,7 +154,7 @@ reactivex==4.0.4 # via # -c .pin/../.pin/constraints-hpu-torch.txt # giving -rich==13.7.1 +rich==13.9.1 # via # -c .pin/../.pin/constraints-hpu-torch.txt # voir @@ -166,11 +162,11 @@ six==1.16.0 # via # -c .pin/../.pin/constraints-hpu-torch.txt # asttokens -sympy==1.13.0 +sympy==1.13.3 # via # -c .pin/../.pin/constraints-hpu-torch.txt # torch -torch==2.3.1 +torch==2.4.1 # via # -c .pin/../.pin/constraints-hpu-torch.txt # -r benchmarks/torchvision/requirements.in @@ -180,15 +176,15 @@ torchcompat==1.1.4 # -c .pin/../.pin/constraints-hpu-torch.txt # -c .pin/../constraints/hpu.txt # -r benchmarks/torchvision/requirements.in -torchvision==0.18.1 +torchvision==0.19.1 # via # -c .pin/../.pin/constraints-hpu-torch.txt # -r benchmarks/torchvision/requirements.in -tqdm==4.66.4 +tqdm==4.66.5 # via # -c .pin/../.pin/constraints-hpu-torch.txt # -r benchmarks/torchvision/requirements.in -triton==2.3.1 +triton==3.0.0 # via # -c .pin/../.pin/constraints-hpu-torch.txt # torch @@ -196,8 +192,9 @@ typing-extensions==4.12.2 # via # -c .pin/../.pin/constraints-hpu-torch.txt # reactivex + # rich # torch -varname==0.10.0 +varname==0.13.3 # via # -c .pin/../.pin/constraints-hpu-torch.txt # giving diff --git a/benchmarks/torchvision_ddp/requirements.hpu.txt b/benchmarks/torchvision_ddp/requirements.hpu.txt index e69de29b..a4174e7b 100644 --- a/benchmarks/torchvision_ddp/requirements.hpu.txt +++ b/benchmarks/torchvision_ddp/requirements.hpu.txt @@ -0,0 +1,205 @@ +# +# This file is autogenerated by pip-compile with Python 3.10 +# by the following command: +# +# pip-compile --output-file=benchmarks/torchvision_ddp/requirements.hpu.txt .pin/tmp-constraints-hpu-torchvision.txt benchmarks/torchvision_ddp/requirements.in +# +antlr4-python3-runtime==4.9.3 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # omegaconf +asttokens==2.4.1 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # giving +codefind==0.1.7 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # ptera +executing==2.1.0 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # varname +filelock==3.16.1 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # torch + # triton +fsspec==2024.6.1 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # torch +giving==0.4.3 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # ptera + # voir +importlib-resources==6.4.5 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # torchcompat +jinja2==3.1.4 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # torch +markdown-it-py==3.0.0 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # rich +markupsafe==2.1.5 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # jinja2 +mdurl==0.1.2 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # markdown-it-py +mpmath==1.3.0 + 
# via + # -c .pin/../.pin/constraints-hpu-torch.txt + # sympy +networkx==3.3 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # torch +numpy==1.26.4 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # torchvision +nvidia-cublas-cu12==12.1.3.1 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # nvidia-cudnn-cu12 + # nvidia-cusolver-cu12 + # torch +nvidia-cuda-cupti-cu12==12.1.105 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # torch +nvidia-cuda-nvrtc-cu12==12.1.105 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # torch +nvidia-cuda-runtime-cu12==12.1.105 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # torch +nvidia-cudnn-cu12==9.1.0.70 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # torch +nvidia-cufft-cu12==11.0.2.54 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # torch +nvidia-curand-cu12==10.3.2.106 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # torch +nvidia-cusolver-cu12==11.4.5.107 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # torch +nvidia-cusparse-cu12==12.1.0.106 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # nvidia-cusolver-cu12 + # torch +nvidia-ml-py==12.560.30 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # voir +nvidia-nccl-cu12==2.20.5 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # torch +nvidia-nvjitlink-cu12==12.6.77 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # nvidia-cusolver-cu12 + # nvidia-cusparse-cu12 +nvidia-nvtx-cu12==12.1.105 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # torch +omegaconf==2.3.0 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # voir +ovld==0.3.9 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # voir +pillow==10.4.0 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # torchvision +psutil==5.9.8 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # voir +ptera==1.4.1 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # voir +pygments==2.18.0 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # rich +pyyaml==6.0.2 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # omegaconf +reactivex==4.0.4 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # giving +rich==13.9.1 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # voir +six==1.16.0 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # asttokens +sympy==1.13.3 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # torch +torch==2.4.1 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # -r benchmarks/torchvision_ddp/requirements.in + # torchvision +torchcompat==1.1.4 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # -c .pin/../constraints/hpu.txt + # -r benchmarks/torchvision_ddp/requirements.in +torchvision==0.19.1 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # -r benchmarks/torchvision_ddp/requirements.in +tqdm==4.66.5 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # -r benchmarks/torchvision_ddp/requirements.in +triton==3.0.0 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # torch +typing-extensions==4.12.2 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # reactivex + # rich + # torch +varname==0.13.3 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # giving +voir==0.2.19 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # -c .pin/../constraints/hpu.txt + # -r benchmarks/torchvision_ddp/requirements.in diff --git a/benchmarks/vjepa/benchfile.py b/benchmarks/vjepa/benchfile.py index d25b47b5..228023ce 100644 --- 
a/benchmarks/vjepa/benchfile.py +++ b/benchmarks/vjepa/benchfile.py @@ -23,7 +23,9 @@ class Vjepa(Package): def make_env(self): # Return a dict of environment variables for prepare_script and # main_script. - return super().make_env() + env = super().make_env() + env["PT_HPU_LAZY_MODE"] = "0" + return env async def install(self): vjepa = self.dirs.code / "jepa" diff --git a/benchmarks/vjepa/main.py b/benchmarks/vjepa/main.py index 18377b92..55981859 100644 --- a/benchmarks/vjepa/main.py +++ b/benchmarks/vjepa/main.py @@ -475,14 +475,19 @@ def reg_fn(z): scaler.unscale_(optimizer) else: loss.backward() + if (epoch > warmup) and (clip_grad is not None): _enc_norm = torch.nn.utils.clip_grad_norm_(encoder.parameters(), clip_grad) _pred_norm = torch.nn.utils.clip_grad_norm_(predictor.parameters(), clip_grad) + + acc.mark_step() if mixed_precision: scaler.step(optimizer) scaler.update() else: optimizer.step() + acc.mark_step() + grad_stats = grad_logger(encoder.named_parameters()) grad_stats.global_norm = float(_enc_norm) grad_stats_pred = grad_logger(predictor.named_parameters()) @@ -506,7 +511,8 @@ def reg_fn(z): grad_stats_pred, optim_stats, ) - (loss, loss_jepa, loss_reg, _new_lr, _new_wd, grad_stats, grad_stats_pred, optim_stats,), gpu_etime_ms = gpu_timer(train_step) + loss, loss_jepa, loss_reg, _new_lr, _new_wd, grad_stats, grad_stats_pred, optim_stats = train_step() + iter_elapsed_time_ms = (time.time() - itr_start_time) * 1000. loss_meter.update(loss) input_var = float(AllReduce.apply(clips.view(clips.shape[0], -1).var(dim=1).mean(dim=0))) @@ -515,7 +521,7 @@ def reg_fn(z): input_var_min_meter.update(input_var_min) jepa_loss_meter.update(loss_jepa) reg_loss_meter.update(loss_reg) - gpu_time_meter.update(gpu_etime_ms) + # gpu_time_meter.update(gpu_etime_ms) wall_time_meter.update(iter_elapsed_time_ms) observer.record_loss(loss) @@ -530,7 +536,6 @@ def log_stats(): loss_reg, grad_stats.global_norm, grad_stats_pred.global_norm, - gpu_etime_ms, iter_elapsed_time_ms) if (itr % log_freq == 0) or np.isnan(loss) or np.isinf(loss): logger.info( @@ -637,7 +642,11 @@ def main(): params["nodes"] = nnodes params["tasks_per_node"] = gpu_per_nodes + print("HERE", os.getenv("RANK", -1) ) if os.getenv("RANK", -1) != -1: + print("INIT PROCESS GROUP HERE") + print(acc) + print(acc.init_process_group) acc.init_process_group() try: diff --git a/benchmarks/vjepa/requirements.hpu.txt b/benchmarks/vjepa/requirements.hpu.txt new file mode 100644 index 00000000..b1c986ec --- /dev/null +++ b/benchmarks/vjepa/requirements.hpu.txt @@ -0,0 +1,297 @@ +# +# This file is autogenerated by pip-compile with Python 3.10 +# by the following command: +# +# pip-compile --output-file=benchmarks/vjepa/requirements.hpu.txt .pin/tmp-constraints-hpu-vjepa-gpus.txt benchmarks/vjepa/requirements.in +# +antlr4-python3-runtime==4.9.3 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # omegaconf +asttokens==2.4.1 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # giving +beartype==0.19.0 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # -r benchmarks/vjepa/requirements.in +braceexpand==0.1.7 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # -r benchmarks/vjepa/requirements.in + # webdataset +certifi==2024.8.30 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # requests +charset-normalizer==3.3.2 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # requests +cloudpickle==3.0.0 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # submitit +codefind==0.1.7 + # via + # -c 
.pin/../.pin/constraints-hpu-torch.txt + # ptera +decord==0.6.0 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # -r benchmarks/vjepa/requirements.in +einops==0.8.0 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # -r benchmarks/vjepa/requirements.in +executing==2.1.0 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # varname +filelock==3.16.1 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # huggingface-hub + # torch + # triton +fsspec==2024.6.1 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # huggingface-hub + # torch +giving==0.4.3 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # ptera + # voir +huggingface-hub==0.25.1 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # timm +idna==3.10 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # requests +jinja2==3.1.4 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # torch +markdown-it-py==3.0.0 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # rich +markupsafe==2.1.5 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # jinja2 +mdurl==0.1.2 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # markdown-it-py +mpmath==1.3.0 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # sympy +networkx==3.3 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # torch +numpy==1.26.4 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # -r benchmarks/vjepa/requirements.in + # decord + # opencv-python + # pandas + # torchvision + # webdataset +nvidia-cublas-cu12==12.1.3.1 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # nvidia-cudnn-cu12 + # nvidia-cusolver-cu12 + # torch +nvidia-cuda-cupti-cu12==12.1.105 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # torch +nvidia-cuda-nvrtc-cu12==12.1.105 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # torch +nvidia-cuda-runtime-cu12==12.1.105 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # torch +nvidia-cudnn-cu12==9.1.0.70 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # torch +nvidia-cufft-cu12==11.0.2.54 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # torch +nvidia-curand-cu12==10.3.2.106 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # torch +nvidia-cusolver-cu12==11.4.5.107 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # torch +nvidia-cusparse-cu12==12.1.0.106 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # nvidia-cusolver-cu12 + # torch +nvidia-ml-py==12.560.30 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # voir +nvidia-nccl-cu12==2.20.5 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # torch +nvidia-nvjitlink-cu12==12.6.77 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # nvidia-cusolver-cu12 + # nvidia-cusparse-cu12 +nvidia-nvtx-cu12==12.1.105 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # torch +omegaconf==2.3.0 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # voir +opencv-python==4.10.0.84 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # -r benchmarks/vjepa/requirements.in +ovld==0.3.9 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # voir +packaging==24.1 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # huggingface-hub +pandas==2.2.3 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # -r benchmarks/vjepa/requirements.in +pillow==10.4.0 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # torchvision +psutil==5.9.8 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # voir +ptera==1.4.1 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # 
voir +pygments==2.18.0 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # rich +python-dateutil==2.9.0.post0 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # pandas +pytz==2024.2 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # pandas +pyyaml==6.0.2 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # -r benchmarks/vjepa/requirements.in + # huggingface-hub + # omegaconf + # timm + # webdataset +reactivex==4.0.4 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # giving +requests==2.32.3 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # huggingface-hub +rich==13.9.1 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # voir +safetensors==0.4.5 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # timm +six==1.16.0 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # asttokens + # python-dateutil +submitit==1.5.2 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # -r benchmarks/vjepa/requirements.in +sympy==1.13.3 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # torch +timm==1.0.9 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # -r benchmarks/vjepa/requirements.in +torch==2.4.1 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # -r benchmarks/vjepa/requirements.in + # timm + # torchvision +torchvision==0.19.1 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # -r benchmarks/vjepa/requirements.in + # timm +tqdm==4.66.5 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # huggingface-hub +triton==3.0.0 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # torch +typing-extensions==4.12.2 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # huggingface-hub + # reactivex + # rich + # submitit + # torch +tzdata==2024.2 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # pandas +urllib3==2.2.3 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # requests +varname==0.13.3 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # giving +voir==0.2.19 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # -c .pin/../constraints/hpu.txt + # -r benchmarks/vjepa/requirements.in +webdataset==0.2.100 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # -r benchmarks/vjepa/requirements.in diff --git a/config/base.yaml b/config/base.yaml index d7926799..1a64d550 100644 --- a/config/base.yaml +++ b/config/base.yaml @@ -348,7 +348,7 @@ reformer: - monogpu argv: --model: "Reformer" - --batch-size: 64 + --batch-size: 32 whisper: inherits: _hf @@ -541,7 +541,7 @@ _llm: tags: - nlp - llm - max_duration: 1200 + max_duration: 3600 num_machines: 1 inherits: _defaults definition: ../benchmarks/llm @@ -566,6 +566,7 @@ llm-lora-single: repo_id="meta-llama/Meta-Llama-3.1-8B": true batch_size=8: true gradient_accumulation_steps=8: true + device={device_name}: true llm-lora-ddp-gpus: @@ -587,7 +588,7 @@ llm-lora-ddp-gpus: repo_id="meta-llama/Meta-Llama-3.1-8B": true batch_size=8: true gradient_accumulation_steps=8: true - + device={device_name}: true llm-lora-ddp-nodes: tags: @@ -610,7 +611,7 @@ llm-lora-ddp-nodes: repo_id="meta-llama/Meta-Llama-3.1-8B": true batch_size=8: true gradient_accumulation_steps=8: true - + device={device_name}: true num_machines: 2 requires_capabilities: - "len(nodes) >= ${num_machines}" @@ -636,8 +637,12 @@ llm-lora-mp-gpus: repo_id="meta-llama/Meta-Llama-3.1-70B": true batch_size=8: true gradient_accumulation_steps=1: true - + device={device_name}: true + llm-full-mp-gpus: + voir: + options: + stop: 30 inherits: _llm tags: - multigpu @@ -658,7 +663,8 @@ 
llm-full-mp-gpus:
     safetensors=true: true
     batch_size=2: true
     gradient_accumulation_steps=1: true
-
+    device={device_name}: true
+
 llm-full-mp-nodes:
   tags:
     - multinode
@@ -681,7 +687,8 @@ llm-full-mp-nodes:
     safetensors=true: true
     batch_size=2: true
     gradient_accumulation_steps=1: true
-
+    device={device_name}: true
+
   num_machines: 2
   requires_capabilities:
     - "len(nodes) >= ${num_machines}"
@@ -781,6 +788,7 @@ torchatari:
     --env-id: Breakout-v5
 
 _llava:
+  max_duration: 3600
   inherits: _defaults
   definition: ../benchmarks/llava
   install_group: torch
diff --git a/constraints/extra/torch.hpu.txt b/constraints/extra/torch.hpu.txt
index 1d21c177..e69de29b 100644
--- a/constraints/extra/torch.hpu.txt
+++ b/constraints/extra/torch.hpu.txt
@@ -1,5 +0,0 @@
-
-#
-#
-voir >= 0.2.15
-torchcompat >= 1.0.0
diff --git a/constraints/hpu.txt b/constraints/hpu.txt
index 23a110bd..9f6fe957 100644
--- a/constraints/hpu.txt
+++ b/constraints/hpu.txt
@@ -1,8 +1,16 @@
-# FIXME
-# Add
-
 #
 #
 voir >= 0.2.19
 torchcompat >= 1.0.0
-gymnax >= 0.0.8
\ No newline at end of file
+gymnax >= 0.0.8
+trl<0.11.0
+
+# latest torchtune is slower than before and causes failures;
+# the next version of pytorch seems to work better,
+# so pending a new release of pytorch this is what we get
+torchtune<0.3.0
+
+# transformers added torchao support recently,
+# but only in its most recent version, which we do not support yet
+transformers<4.45.0
+torchvision
\ No newline at end of file
diff --git a/docker/Dockerfile-hpu b/docker/Dockerfile-hpu
new file mode 100644
index 00000000..932959cd
--- /dev/null
+++ b/docker/Dockerfile-hpu
@@ -0,0 +1,42 @@
+# FROM artifactory-kfs.habana-labs.com/docker-local/1.17.0/ubuntu22.04/habanalabs/pytorch-installer-2.3.1:1.17.0-462
+
+FROM vault.habana.ai/gaudi-docker/1.17.0/ubuntu22.04/habanalabs/pytorch-installer-2.3.1:latest
+
+ENV MILABENCH_GPU_ARCH=hpu
+
+WORKDIR /workspace
+
+ENV MILABENCH_CONFIG="/workspace/milabench/config/standard.yaml"
+
+ENV MILABENCH_WORDIR="/workspace/${MILABENCH_GPU_ARCH}"
+ENV MILABENCH_BASE="${MILABENCH_WORDIR}/results"
+ENV MILABENCH_VENV="${MILABENCH_WORDIR}/env"
+ENV BENCHMARK_VENV="${MILABENCH_WORDIR}/results/venv/torch"
+
+ARG BENCH=lightning
+
+RUN mkdir -p ${MILABENCH_WORDIR}
+RUN pip install virtualenv
+RUN virtualenv --system-site-packages $MILABENCH_VENV
+
+ARG CACHEBUST=1
+RUN echo "$CACHEBUST"
+RUN git clone https://github.com/mila-iqia/milabench.git -b $MILABENCH_GPU_ARCH
+RUN $MILABENCH_VENV/bin/pip install -e milabench
+
+RUN . $MILABENCH_VENV/bin/activate && milabench install --use-current-env --select "${BENCH}"
+
+RUN $MILABENCH_VENV/bin/pip uninstall torch torchvision torchaudio -y
+RUN sed -i 's/pic.numpy(force=True)/pic.numpy()/' /usr/local/lib/python3.10/dist-packages/torchvision/transforms/functional.py
+
+# This does not work
+# RUN . $MILABENCH_VENV/bin/activate && milabench prepare --use-current-env --select "${BENCH}"
+
+
+
+# RUN . $MILABENCH_VENV/bin/activate && milabench run --use-current-env --select $BENCH
+# RUN huggingface-cli login --token $MILABENCH_HF_TOKEN
+
+# docker build --build-arg CACHEBUST=`git rev-parse hpu` -f Dockerfile-hpu -t dockerfile-hpu .
+# docker run -it --runtime=habana -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none --shm-size 50G --cap-add=sys_nice --net=host dockerfile-hpu:latest bash
+# . $MILABENCH_VENV/bin/activate && milabench prepare --use-current-env --select lightning && milabench run --use-current-env --select lightning
diff --git a/docker/Makefile b/docker/Makefile
new file mode 100644
index 00000000..93a40270
--- /dev/null
+++ b/docker/Makefile
@@ -0,0 +1,17 @@
+
+
+
+bench = rlhf-gpus
+# bench = "lightning"
+lazy = 0
+
+hpu:
+	git add --all
+	git commit -m "-" || true
+	git push origin hpu
+	docker rmi -f $$(docker images --filter "dangling=true" -q --no-trunc) || true
+	# docker system prune -a -f
+	# docker image prune -a -f
+	docker build --build-arg BENCH=$(bench) --build-arg CACHEBUST=`git rev-parse hpu` -f Dockerfile-hpu -t dockerfile-hpu .
+	docker run --rm -it --runtime=habana -e PT_HPU_LAZY_MODE=$(lazy) -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none --shm-size 50G --cap-add=sys_nice --net=host dockerfile-hpu:latest bash -c '. $$MILABENCH_VENV/bin/activate && milabench install --use-current-env --select $(bench) && pip uninstall torch torchvision torchaudio -y && milabench prepare --use-current-env --select $(bench) && milabench run --use-current-env $(args) --select $(bench)'
+
diff --git a/milabench/_version.py b/milabench/_version.py
index e2795a03..a5982314 100644
--- a/milabench/_version.py
+++ b/milabench/_version.py
@@ -1,5 +1,5 @@
 """This file is generated, do not modify"""
 
-__tag__ = "v1.0.0_RC1-12-g3b87cb4"
-__commit__ = "3b87cb465e855be452953273c314ab01024e0925"
-__date__ = "2024-10-09 12:04:43 -0400"
+__tag__ = "v1.0.0_RC1-50-gd2c8ba2"
+__commit__ = "d2c8ba2c67e19026293381bdbddeb3f30ba0ee64"
+__date__ = "2024-11-11 18:51:45 +0000"
diff --git a/milabench/remote.py b/milabench/remote.py
index 7e1eef85..cbe9696b 100644
--- a/milabench/remote.py
+++ b/milabench/remote.py
@@ -100,7 +100,7 @@ def worker_commands(pack, worker_plan, setup_for="worker"):
 def sshnode(node, cmd):
     host = node["ip"]
     user = node["user"]
-    port = node["sshport"]
+    port = node.get("sshport", 22)
 
     return SSHCommand(cmd, user=user, host=host, port=port)
 
diff --git a/milabench/system.py b/milabench/system.py
index 3a50d143..2d5a6ca8 100644
--- a/milabench/system.py
+++ b/milabench/system.py
@@ -328,7 +328,7 @@ def _fix_weird(hostname):
 
 # If true that means we cannot resolve the ip addresses
 # so we ignore errors
-offline = False
+offline = True
 
 
 @contextmanager
diff --git a/scripts/article/run_hpu.sh b/scripts/article/run_hpu.sh
index 5d875ca1..8f6126d2 100644
--- a/scripts/article/run_hpu.sh
+++ b/scripts/article/run_hpu.sh
@@ -9,68 +9,84 @@ set -ex
 export MILABENCH_GPU_ARCH=hpu
 export MILABENCH_WORDIR="$(pwd)/$MILABENCH_GPU_ARCH"
 export MILABENCH_BASE="$MILABENCH_WORDIR/results"
-export MILABENCH_CONFIG="$MILABENCH_WORDIR/milabench/config/standard.yaml"
 export MILABENCH_VENV="$MILABENCH_WORDIR/env"
 export BENCHMARK_VENV="$MILABENCH_WORDIR/results/venv/torch"
 
+export PT_HPU_LAZY_MODE=0
+
+if [ -z "${MILABENCH_SOURCE}" ]; then
+    export MILABENCH_CONFIG="$MILABENCH_WORDIR/milabench/config/standard.yaml"
+else
+    export MILABENCH_CONFIG="$MILABENCH_SOURCE/config/standard.yaml"
+fi
+
 if [ -z "${MILABENCH_PREPARE}" ]; then
     export MILABENCH_PREPARE=0
 fi
 
+ARGS="$@"
+
 install_prepare() {
     mkdir -p $MILABENCH_WORDIR
     cd $MILABENCH_WORDIR
 
    virtualenv $MILABENCH_WORDIR/env
 
-    git clone https://github.com/mila-iqia/milabench.git
-    git clone https://github.com/huggingface/optimum-habana.git
+    if [ -z "${MILABENCH_SOURCE}" ]; then
+        if [ ! -d "$MILABENCH_WORDIR/milabench" ]; then
+            git clone https://github.com/mila-iqia/milabench.git
+        fi
+        export MILABENCH_SOURCE="$MILABENCH_WORDIR/milabench"
+    fi
+
+    git clone https://github.com/huggingface/optimum-habana.git -b v1.13.2
 
     # wget -nv https://vault.habana.ai/artifactory/gaudi-installer/1.15.1/habanalabs-installer.sh
-    wget -nv https://vault.habana.ai/artifactory/gaudi-installer/1.16.1/habanalabs-installer.sh
+    # wget -nv https://vault.habana.ai/artifactory/gaudi-installer/1.16.1/habanalabs-installer.sh
+    wget -nv https://vault.habana.ai/artifactory/gaudi-installer/1.17.1/habanalabs-installer.sh
     chmod +x habanalabs-installer.sh
 
     . $MILABENCH_WORDIR/env/bin/activate
-    pip install -e $MILABENCH_WORDIR/milabench
-
-
-    #
-    # Install milabench's benchmarks in their venv
-    #
-    milabench install
+    pip install -e $MILABENCH_SOURCE
 
     which pip
 
     # Override dependencies for HPU
     # milabench needs pyhlml
     export HABANALABS_VIRTUAL_DIR=$MILABENCH_VENV
-    ./habanalabs-installer.sh install -t dependencies --venv -y
-    ./habanalabs-installer.sh install -t pytorch --venv -y
+    ./habanalabs-installer.sh install -t dependencies --venv -y || true
+    ./habanalabs-installer.sh install -t pytorch --venv -y || true
+
+    #
+    # Install milabench's benchmarks in their venv
+    #
+    # milabench pin --variant hpu --from-scratch $ARGS
+    milabench install $ARGS
 
     (
         . $BENCHMARK_VENV/bin/activate
         which pip
-        pip install -e $MILABENCH_WORDIR/optimum-habana
-
-        (
-            cd $MILABENCH_WORDIR/milabench/benchmarks/dlrm/dlrm;
-            git remote add me https://github.com/Delaunay/dlrm.git
-            git fetch me
-            git checkout me/main
-        )
+        pip install --no-deps -e $MILABENCH_WORDIR/optimum-habana
 
         # Override dependencies for HPU
         # benchmarks need pytorch
-        pip uninstall torch torchvision torchaudio
+        pip uninstall torch torchvision torchaudio -y
         export HABANALABS_VIRTUAL_DIR=$BENCHMARK_VENV
-        ./habanalabs-installer.sh install -t dependencies --venv -y
-        ./habanalabs-installer.sh install -t pytorch --venv -y
+        ./habanalabs-installer.sh install -t dependencies --venv -y || true
+        ./habanalabs-installer.sh install -t pytorch --venv -y || true
+
+        if [ -z "${MILABENCH_HF_TOKEN}" ]; then
+            echo "Missing token"
+        else
+            huggingface-cli login --token $MILABENCH_HF_TOKEN
+        fi
     )
 
     #
     # Generate/download datasets, download models etc...
     #
-    milabench prepare
+    # sed -i 's/pic.numpy(force=True)/pic.numpy()/' $BENCHMARK_VENV/lib/python3.10/dist-packages/torchvision/transforms/functional.py
+    # sed -i 's/range(hpu.device_count())/range(len(available_modules))/' $BENCHMARK_VENV/lib/site-packages/habana_frameworks/torch/hpu/_utils.py
+    milabench prepare $ARGS
 }
 
 if [ ! -d "$MILABENCH_WORDIR" ]; then
@@ -81,12 +97,28 @@ else
 fi
 
+(
+    . $BENCHMARK_VENV/bin/activate
+    pip install lightning-habana
+    pip install habana-media-loader
+    # git clone https://github.com/Delaunay/torchcompat.git
+    # git clone https://github.com/Delaunay/voir.git -b hpu
+    pip uninstall torchcompat voir -y
+    pip install -e $MILABENCH_WORDIR/torchcompat
+    pip install -e $MILABENCH_WORDIR/voir
+    pip install -e $MILABENCH_WORDIR/optimum-habana
+    # pip install habana_dataloader
+)
+
 if [ "$MILABENCH_PREPARE" -eq 0 ]; then
     cd $MILABENCH_WORDIR
 
+    # python -c "import torch; print(torch.__version__)"
+    milabench prepare $ARGS --system $MILABENCH_WORDIR/system.yaml
+
     #
     # Run the benchmakrs
-    milabench run "$@"
+    milabench run $ARGS --system $MILABENCH_WORDIR/system.yaml
 
     #
     # Display report

From 0b93d59cd198efaf6eacdf774b749a07c9bb1b37 Mon Sep 17 00:00:00 2001
From: Setepenre
Date: Thu, 21 Nov 2024 13:12:35 -0500
Subject: [PATCH 10/20] Rocm (#293)

* ROCm changes

* Update ping

* -

* Cleanup the rocm script

* use rocm branch

* -

* Update run_rocm.sh

---------

Co-authored-by: Your Name
---
 .pin/constraints-rocm-torch.txt                  | 557 +++++++++++---
 benchmarks/brax/benchfile.py                     |   6 +-
 benchmarks/brax/requirements.rocm.txt            |  73 +-
 benchmarks/diffusion/requirements.rocm.txt       |  75 +-
 benchmarks/dinov2/requirements.rocm.txt          |  43 +-
 benchmarks/flops/requirements.rocm.txt           |  33 +-
 benchmarks/geo_gnn/requirements-pre.rocm.txt     |  30 +-
 benchmarks/geo_gnn/requirements.rocm.txt         | 185 +++--
 benchmarks/huggingface/requirements.rocm.txt     |  44 +-
 benchmarks/lightning/requirements.rocm.txt       |  50 +-
 benchmarks/llama/requirements.rocm.txt           |  64 +-
 benchmarks/llava/requirements.rocm.txt           | 293 ++++
 benchmarks/llm/requirements.rocm.txt             | 118 ++-
 benchmarks/purejaxrl/benchfile.py                |   4 +-
 benchmarks/purejaxrl/main.py                     |   1 +
 benchmarks/purejaxrl/requirements.rocm.txt       | 693 ++++++++++++++++++
 benchmarks/recursiongfn/requirements.rocm.txt    | 328 ++++-----
 benchmarks/rlhf/requirements.in                  |   1 +
 benchmarks/rlhf/requirements.rocm.txt            | 313 ++++
 benchmarks/timm/requirements.rocm.txt            |  41 +-
 benchmarks/torchatari/requirements.rocm.txt      |  49 +-
 benchmarks/torchvision/requirements.rocm.txt     |  33 +-
 .../torchvision_ddp/requirements.rocm.txt        |  33 +-
 benchmarks/vjepa/requirements.rocm.txt           | 247 +++++++
 config/base.yaml                                 |   4 +-
 constraints/rocm.txt                             |  15 +-
 milabench/_version.py                            |   6 +-
 scripts/article/run_cuda.sh                      |   4 +-
 scripts/article/run_rocm.sh                      |  76 +-
 29 files changed, 2723 insertions(+), 696 deletions(-)
 create mode 100644 benchmarks/llava/requirements.rocm.txt
 create mode 100644 benchmarks/purejaxrl/requirements.rocm.txt
 create mode 100644 benchmarks/rlhf/requirements.rocm.txt
 create mode 100644 benchmarks/vjepa/requirements.rocm.txt

diff --git a/.pin/constraints-rocm-torch.txt b/.pin/constraints-rocm-torch.txt
index 4fe6ae9d..ecc49d51 100644
--- a/.pin/constraints-rocm-torch.txt
+++ b/.pin/constraints-rocm-torch.txt
@@ -2,31 +2,39 @@
 # This file is autogenerated by pip-compile with Python 3.10
 # by the following command:
 #
-# pip-compile
--output-file=.pin/constraints-rocm-torch.txt .pin/tmp-constraints.txt benchmarks/brax/requirements.in benchmarks/diffusion/requirements.in benchmarks/dinov2/requirements.in benchmarks/flops/requirements.in benchmarks/geo_gnn/requirements-pre.in benchmarks/geo_gnn/requirements.in benchmarks/huggingface/requirements.in benchmarks/lightning/requirements.in benchmarks/llama/requirements.in benchmarks/llava/requirements.in benchmarks/llm/requirements.in benchmarks/purejaxrl/requirements.in benchmarks/recursiongfn/requirements.in benchmarks/rlhf/requirements.in benchmarks/timm/requirements.in benchmarks/torchatari/requirements.in benchmarks/torchvision/requirements.in benchmarks/torchvision_ddp/requirements.in benchmarks/vjepa/requirements.in constraints/extra/torch.rocm.txt # ---extra-index-url https://download.pytorch.org/whl/rocm6.0 +--extra-index-url https://download.pytorch.org/whl/rocm6.1 absl-py==2.1.0 # via # brax # chex + # distrax # dm-env # ml-collections # mujoco # mujoco-mjx # optax # orbax-checkpoint + # rlax # tensorboard -accelerate==0.33.0 + # tensorflow-probability +accelerate==0.34.2 # via # -r benchmarks/diffusion/requirements.in + # -r benchmarks/llava/requirements.in + # -r benchmarks/llm/requirements.in + # -r benchmarks/rlhf/requirements.in # diffusers -aiohappyeyeballs==2.4.0 + # trl +aiohappyeyeballs==2.4.3 # via aiohttp -aiohttp==3.10.5 +aiohttp==3.10.8 # via # datasets # fsspec + # torch-geometric aiosignal==1.3.1 # via aiohttp antlr4-python3-runtime==4.9.3 @@ -37,72 +45,137 @@ argklass==1.4.4 # via # -r benchmarks/diffusion/requirements.in # -r benchmarks/llm/requirements.in + # -r benchmarks/purejaxrl/requirements.in +astroid==3.3.4 + # via pylint asttokens==2.4.1 # via giving async-timeout==4.0.3 # via aiohttp attrs==24.2.0 # via aiohttp +beartype==0.19.0 + # via -r benchmarks/vjepa/requirements.in +black==24.8.0 + # via navix blinker==1.8.2 # via flask -blobfile==2.1.1 - # via torchtune +blobfile==3.0.0 + # via + # -r benchmarks/llm/requirements.txt + # torchtune +blosc2==2.7.1 + # via tables +botorch==0.12.0 + # via -r benchmarks/recursiongfn/requirements.in +braceexpand==0.1.7 + # via + # -r benchmarks/vjepa/requirements.in + # webdataset brax==0.10.5 - # via -r benchmarks/brax/requirements.in + # via + # -r benchmarks/brax/requirements.in + # -r benchmarks/purejaxrl/requirements.in cantilever==0.1.0 # via -r benchmarks/torchatari/requirements.in -certifi==2024.7.4 - # via requests +certifi==2024.8.30 + # via + # requests + # sentry-sdk charset-normalizer==3.3.2 # via requests -chex==0.1.86 - # via optax +chex==0.1.87 + # via + # distrax + # evosax + # flashbax + # gymnax + # optax + # rlax click==8.1.7 - # via flask + # via + # black + # flask + # wandb cloudpickle==3.0.0 # via # gym # gymnasium # submitit -codefind==0.1.6 + # tensorflow-probability +codefind==0.1.7 # via ptera contextlib2==21.6.0 # via ml-collections -datasets==2.21.0 +contourpy==1.3.0 + # via matplotlib +cvxopt==1.3.2 + # via -r benchmarks/recursiongfn/requirements.in +cycler==0.12.1 + # via matplotlib +datasets==3.0.1 # via # -r benchmarks/diffusion/requirements.in # -r benchmarks/llama/requirements.in + # -r benchmarks/llava/requirements.in + # -r benchmarks/rlhf/requirements.in # torchtune -diffusers[torch]==0.30.0 + # trl +decorator==5.1.1 + # via tensorflow-probability +decord==0.6.0 + # via -r benchmarks/vjepa/requirements.in +diffusers[torch]==0.30.3 # via -r benchmarks/diffusion/requirements.in dill==0.3.8 # via # datasets # multiprocess + # pylint +distrax==0.1.5 + # via + # -r 
benchmarks/purejaxrl/requirements.in + # rlax dm-env==1.6 # via # brax # envpool + # rlax dm-tree==0.1.8 - # via dm-env + # via + # dm-env + # tensorflow-probability +docker-pycreds==0.4.0 + # via wandb docstring-parser==0.16 # via tyro +dotmap==1.3.30 + # via evosax +einops==0.8.0 + # via -r benchmarks/vjepa/requirements.in envpool==0.8.4 # via -r benchmarks/torchatari/requirements.in -etils[epath,epy]==1.7.0 +etils[epath,epy]==1.9.4 # via # brax # mujoco # mujoco-mjx # optax # orbax-checkpoint -executing==1.2.0 +evosax==0.1.6 + # via -r benchmarks/purejaxrl/requirements.in +exceptiongroup==1.2.2 + # via pytest +executing==2.1.0 # via varname fairscale==0.4.13 - # via -r benchmarks/llama/requirements.in + # via + # -r benchmarks/llama/requirements.in + # -r benchmarks/llm/requirements.in + # -r benchmarks/llm/requirements.txt farama-notifications==0.0.4 # via gymnasium -filelock==3.15.4 +filelock==3.16.1 # via # blobfile # datasets @@ -111,16 +184,30 @@ filelock==3.15.4 # pytorch-triton-rocm # torch # transformers -fire==0.6.0 - # via -r benchmarks/llama/requirements.in +fire==0.7.0 + # via + # -r benchmarks/llama/requirements.in + # -r benchmarks/llm/requirements.txt +flake8==7.1.1 + # via navix +flashbax==0.1.2 + # via -r benchmarks/purejaxrl/requirements.in flask==3.0.3 # via # brax # flask-cors -flask-cors==4.0.1 - # via brax -flax==0.8.5 +flask-cors==5.0.0 # via brax +flax==0.9.0 + # via + # -r benchmarks/purejaxrl/requirements.in + # brax + # evosax + # flashbax + # gymnax + # navix +fonttools==4.54.1 + # via matplotlib frozenlist==1.4.1 # via # aiohttp @@ -133,92 +220,141 @@ fsspec[http]==2024.6.1 # lightning # pytorch-lightning # torch + # torch-geometric fvcore==0.1.5.post20221221 # via -r benchmarks/dinov2/requirements.in -giving==0.4.2 +gast==0.6.0 + # via tensorflow-probability +gitdb==4.0.11 + # via gitpython +gitpython==3.1.43 + # via + # -r benchmarks/recursiongfn/requirements.in + # wandb +giving==0.4.3 # via # ptera # voir glfw==2.7.0 # via mujoco -grpcio==1.65.5 +gpytorch==1.13 + # via + # -r benchmarks/recursiongfn/requirements.in + # botorch +grpcio==1.66.2 # via # brax # tensorboard -gym==0.23.1 +gym==0.26.2 # via # -r benchmarks/torchatari/requirements.in # brax # envpool + # gymnax gym-notices==0.0.8 # via gym gymnasium==0.29.1 - # via envpool + # via + # envpool + # gymnax +gymnax==0.0.8 + # via + # -c .pin/../constraints/rocm.txt + # -r benchmarks/purejaxrl/requirements.in hjson==3.1.0 # via argklass -huggingface-hub==0.24.6 +huggingface-hub==0.25.1 # via # -r benchmarks/timm/requirements.in # accelerate # datasets # diffusers + # timm # tokenizers # torchtune # transformers humanize==4.10.0 # via orbax-checkpoint -idna==3.7 +idna==3.10 # via # requests # yarl -importlib-metadata==8.4.0 +importlib-metadata==8.5.0 # via diffusers -importlib-resources==6.4.3 +importlib-resources==6.4.5 # via # argklass # cantilever # etils # torchcompat +iniconfig==2.0.0 + # via pytest iopath==0.1.10 # via # -r benchmarks/dinov2/requirements.in # fvcore +isort==5.13.2 + # via pylint itsdangerous==2.2.0 # via flask -jax==0.4.31 +jax==0.4.33 # via # -r benchmarks/brax/requirements.in + # -r benchmarks/purejaxrl/requirements.in # brax # chex + # distrax + # evosax + # flashbax # flax + # gymnax # jaxopt # mujoco-mjx # optax # orbax-checkpoint -jaxlib==0.4.31 + # rlax +jaxlib==0.4.33 # via # brax # chex + # distrax + # evosax + # flashbax + # gymnax # jax # jaxopt # mujoco-mjx # optax # orbax-checkpoint + # rlax jaxopt==0.8.3 # via brax +jaxtyping==0.2.19 + # via + # gpytorch + # 
linear-operator jinja2==3.1.4 # via # brax # flask # torch + # torch-geometric +joblib==1.4.2 + # via scikit-learn +kiwisolver==1.4.7 + # via matplotlib lightning==2.4.0 # via -r benchmarks/lightning/requirements.in -lightning-utilities==0.11.6 +lightning-utilities==0.11.7 # via # lightning # pytorch-lightning # torchmetrics -lxml==4.9.4 +linear-operator==0.5.3 + # via + # botorch + # gpytorch +lxml==5.3.0 # via blobfile markdown==3.7 # via tensorboard @@ -228,169 +364,284 @@ markupsafe==2.1.5 # via # jinja2 # werkzeug +matplotlib==3.9.2 + # via + # evosax + # gymnax + # seaborn +mccabe==0.7.0 + # via + # flake8 + # pylint mdurl==0.1.2 # via markdown-it-py ml-collections==0.1.1 # via brax -ml-dtypes==0.4.0 +ml-dtypes==0.5.0 # via # jax # jaxlib # tensorstore mpmath==1.3.0 - # via sympy -msgpack==1.0.8 # via + # botorch + # gpytorch + # linear-operator + # sympy +msgpack==1.1.0 + # via + # blosc2 # flax # orbax-checkpoint -mujoco==3.2.2 +mujoco==3.2.3 # via # brax # mujoco-mjx -mujoco-mjx==3.2.2 +mujoco-mjx==3.2.3 # via brax -multidict==6.0.5 +multidict==6.1.0 # via # aiohttp # yarl +multipledispatch==1.0.0 + # via botorch multiprocess==0.70.16 # via datasets +mypy-extensions==1.0.0 + # via black +navix==0.7.0 + # via -r benchmarks/purejaxrl/requirements.in +ndindex==1.9.2 + # via blosc2 nest-asyncio==1.6.0 # via orbax-checkpoint networkx==3.3 - # via torch + # via + # -r benchmarks/recursiongfn/requirements.in + # torch +numexpr==2.10.1 + # via + # blosc2 + # tables numpy==1.26.4 # via - # -r benchmarks/super-slomo/requirements.in + # -r benchmarks/geo_gnn/requirements.in + # -r benchmarks/llava/requirements.in + # -r benchmarks/purejaxrl/requirements.in # -r benchmarks/torchatari/requirements.in + # -r benchmarks/vjepa/requirements.in # accelerate + # blosc2 # brax # chex + # contourpy # datasets + # decord # diffusers + # distrax # dm-env # envpool + # evosax # fairscale - # flax + # flashbax # fvcore # gym # gymnasium # jax # jaxlib # jaxopt + # jaxtyping + # matplotlib # ml-dtypes # mujoco + # navix + # numexpr # opencv-python - # opt-einsum # optax # orbax-checkpoint # pandas # pyarrow + # pyro-ppl + # rdkit + # rlax + # scikit-learn # scipy + # seaborn + # tables # tensorboard # tensorboardx + # tensorflow-probability # tensorstore + # torch-geometric # torchmetrics # torchtune # torchvision # transformers # trimesh + # trl + # webdataset # xformers +nvidia-ml-py==12.560.30 + # via voir omegaconf==2.3.0 # via # -r benchmarks/dinov2/requirements.in + # -r benchmarks/recursiongfn/requirements.in # torchtune # voir opencv-python==4.10.0.84 - # via -r benchmarks/super-slomo/requirements.in -opt-einsum==3.3.0 - # via jax + # via -r benchmarks/vjepa/requirements.in +opt-einsum==3.4.0 + # via + # jax + # pyro-ppl optax==0.2.3 # via + # -r benchmarks/purejaxrl/requirements.in # brax # flax -optree==0.12.1 +optree==0.13.0 # via envpool -orbax-checkpoint==0.6.0 +orbax-checkpoint==0.6.4 # via # brax # flax -ovld==0.3.8 +ovld==0.3.9 # via voir packaging==24.1 # via # accelerate + # black # datasets # envpool # huggingface-hub # lightning # lightning-utilities + # matplotlib + # pytest # pytorch-lightning + # setuptools-scm + # tables # tensorboard # tensorboardx # torchmetrics # transformers -pandas==2.2.2 - # via datasets +pandas==2.2.3 + # via + # -r benchmarks/geo_gnn/requirements.in + # -r benchmarks/recursiongfn/requirements.in + # -r benchmarks/vjepa/requirements.in + # datasets + # seaborn +pathspec==0.12.1 + # via black pillow==10.4.0 # via # -r benchmarks/huggingface/requirements.in + # 
-r benchmarks/llava/requirements.in # brax # diffusers # fvcore + # matplotlib + # navix + # rdkit # torchvision +platformdirs==4.3.6 + # via + # black + # pylint + # wandb +pluggy==1.5.0 + # via pytest portalocker==2.10.1 # via iopath -protobuf==5.27.3 +protobuf==5.28.2 # via # orbax-checkpoint # tensorboard # tensorboardx + # wandb psutil==5.9.8 # via # accelerate + # torch-geometric # voir + # wandb ptera==1.4.1 # via voir +py-cpuinfo==9.0.0 + # via + # blosc2 + # tables pyarrow==17.0.0 - # via datasets -pycryptodomex==3.20.0 + # via + # -r benchmarks/recursiongfn/requirements.in + # datasets +pycodestyle==2.12.1 + # via flake8 +pycryptodomex==3.21.0 # via blobfile +pyflakes==3.2.0 + # via flake8 pygments==2.18.0 # via rich -pynvml==11.5.3 - # via voir +pylint==3.3.1 + # via navix pyopengl==3.1.7 # via mujoco +pyparsing==3.1.4 + # via + # matplotlib + # torch-geometric +pyro-api==0.1.2 + # via pyro-ppl +pyro-ppl==1.9.1 + # via + # -r benchmarks/recursiongfn/requirements.in + # botorch +pytest==8.3.3 + # via navix python-dateutil==2.9.0.post0 - # via pandas + # via + # matplotlib + # pandas pytinyrenderer==0.0.14 # via brax pytorch-lightning==2.4.0 # via lightning pytorch-triton-rocm==3.0.0 # via torch -pytz==2024.1 +pytz==2024.2 # via pandas pyyaml==6.0.2 # via # -r benchmarks/llm/requirements.in # -r benchmarks/timm/requirements.in + # -r benchmarks/vjepa/requirements.in # accelerate # datasets + # evosax # flax # fvcore + # gymnax # huggingface-hub # lightning # ml-collections # omegaconf # orbax-checkpoint # pytorch-lightning + # timm # transformers + # wandb + # webdataset # yacs +rdkit==2024.3.5 + # via + # -r benchmarks/geo_gnn/requirements.in + # -r benchmarks/recursiongfn/requirements.in reactivex==4.0.4 # via giving -regex==2024.7.24 +regex==2024.9.11 # via # diffusers # tiktoken @@ -401,90 +652,166 @@ requests==2.32.3 # diffusers # huggingface-hub # tiktoken + # torch-geometric # transformers -rich==13.7.1 + # wandb +rich==13.9.1 # via # flax # tyro # voir -safetensors==0.4.4 +rlax==0.1.6 + # via navix +safetensors==0.4.5 # via # -r benchmarks/timm/requirements.in # accelerate # diffusers + # timm # torchtune # transformers -scipy==1.14.0 +scikit-learn==1.5.2 + # via gpytorch +scipy==1.14.1 # via # -r benchmarks/dinov2/requirements.in + # -r benchmarks/recursiongfn/requirements.in + # botorch # brax + # gpytorch # jax # jaxlib # jaxopt + # linear-operator # mujoco-mjx + # scikit-learn + # torch-cluster + # torch-sparse +seaborn==0.13.2 + # via gymnax sentencepiece==0.2.0 # via # -r benchmarks/llama/requirements.in # torchtune +sentry-sdk==2.15.0 + # via wandb +setproctitle==1.3.3 + # via wandb +setuptools-scm==8.1.0 + # via navix shtab==1.7.1 # via tyro six==1.16.0 # via # asttokens - # fire + # docker-pycreds # ml-collections # python-dateutil # tensorboard -submitit==1.5.1 - # via -r benchmarks/dinov2/requirements.in -sympy==1.13.2 + # tensorflow-probability +smmap==5.0.1 + # via gitdb +submitit==1.5.2 + # via + # -r benchmarks/dinov2/requirements.in + # -r benchmarks/vjepa/requirements.in +sympy==1.13.3 # via torch +tables==3.10.1 + # via -r benchmarks/recursiongfn/requirements.in tabulate==0.9.0 # via fvcore -tensorboard==2.17.1 - # via -r benchmarks/torchatari/requirements.in +tensorboard==2.18.0 + # via + # -r benchmarks/recursiongfn/requirements.in + # -r benchmarks/torchatari/requirements.in tensorboard-data-server==0.7.2 # via tensorboard tensorboardx==2.6.2.2 # via brax -tensorstore==0.1.64 +tensorflow-probability==0.24.0 + # via distrax +tensorstore==0.1.66 # via + 
# flashbax # flax # orbax-checkpoint termcolor==2.4.0 # via # fire # fvcore +threadpoolctl==3.5.0 + # via scikit-learn tiktoken==0.7.0 # via torchtune +timm==1.0.9 + # via -r benchmarks/vjepa/requirements.in tokenizers==0.19.1 # via transformers +tomli==2.0.2 + # via + # black + # pylint + # pytest + # setuptools-scm +tomlkit==0.13.2 + # via pylint toolz==0.12.1 # via chex -torch==2.4.0+rocm6.0 +torch==2.4.1+rocm6.1 # via # -r benchmarks/brax/requirements.in # -r benchmarks/dinov2/requirements.in # -r benchmarks/flops/requirements.in + # -r benchmarks/geo_gnn/requirements-pre.in # -r benchmarks/huggingface/requirements.in # -r benchmarks/lightning/requirements.in # -r benchmarks/llama/requirements.in + # -r benchmarks/llava/requirements.in # -r benchmarks/llm/requirements.in - # -r benchmarks/super-slomo/requirements.in + # -r benchmarks/llm/requirements.txt + # -r benchmarks/purejaxrl/requirements.in + # -r benchmarks/recursiongfn/requirements.in + # -r benchmarks/rlhf/requirements.in # -r benchmarks/timm/requirements.in # -r benchmarks/torchatari/requirements.in # -r benchmarks/torchvision/requirements.in # -r benchmarks/torchvision_ddp/requirements.in + # -r benchmarks/vjepa/requirements.in # accelerate + # botorch # diffusers # fairscale # lightning + # linear-operator + # pyro-ppl # pytorch-lightning + # timm # torchmetrics # torchvision + # trl # xformers +torch-cluster==1.6.3 + # via + # -r benchmarks/geo_gnn/requirements.in + # -r benchmarks/recursiongfn/requirements.in +torch-geometric==2.6.1 + # via + # -r benchmarks/geo_gnn/requirements.in + # -r benchmarks/recursiongfn/requirements.in +torch-scatter==2.1.2 + # via + # -r benchmarks/geo_gnn/requirements.in + # -r benchmarks/recursiongfn/requirements.in +torch-sparse==0.6.18 + # via + # -r benchmarks/geo_gnn/requirements.in + # -r benchmarks/recursiongfn/requirements.in torchao==0.3.1 - # via torchtune + # via + # -c .pin/../constraints/rocm.txt + # -r benchmarks/llm/requirements.in + # torchtune torchcompat==1.1.4 # via # -c .pin/../constraints/rocm.txt @@ -493,28 +820,30 @@ torchcompat==1.1.4 # -r benchmarks/torchatari/requirements.in # -r benchmarks/torchvision/requirements.in # -r benchmarks/torchvision_ddp/requirements.in -torchmetrics==1.4.1 +torchmetrics==1.4.2 # via # -r benchmarks/dinov2/requirements.in # lightning # pytorch-lightning torchtune==0.2.1 - # via -r benchmarks/llm/requirements.in -torchvision==0.19.0+rocm6.0 + # via + # -c .pin/../constraints/rocm.txt + # -r benchmarks/llm/requirements.in +torchvision==0.19.1+rocm6.1 # via # -r benchmarks/diffusion/requirements.in # -r benchmarks/dinov2/requirements.in # -r benchmarks/flops/requirements.in # -r benchmarks/lightning/requirements.in - # -r benchmarks/super-slomo/requirements.in # -r benchmarks/timm/requirements.in # -r benchmarks/torchvision/requirements.in # -r benchmarks/torchvision_ddp/requirements.in + # -r benchmarks/vjepa/requirements.in + # timm tqdm==4.66.5 # via # -r benchmarks/diffusion/requirements.in # -r benchmarks/flops/requirements.in - # -r benchmarks/super-slomo/requirements.in # -r benchmarks/torchvision/requirements.in # -r benchmarks/torchvision_ddp/requirements.in # datasets @@ -522,48 +851,75 @@ tqdm==4.66.5 # huggingface-hub # iopath # lightning + # pyro-ppl # pytorch-lightning + # torch-geometric # torchtune # transformers -transformers==4.44.1 +transformers==4.44.2 # via + # -c .pin/../constraints/rocm.txt # -r benchmarks/diffusion/requirements.in # -r benchmarks/huggingface/requirements.in # -r benchmarks/llama/requirements.in 
-trimesh==4.4.7 + # -r benchmarks/llava/requirements.in + # -r benchmarks/llm/requirements.in + # -r benchmarks/rlhf/requirements.in + # trl +trimesh==4.4.9 # via # brax # mujoco-mjx -types-protobuf==5.27.0.20240626 +trl==0.10.1 + # via + # -c .pin/../constraints/rocm.txt + # -r benchmarks/rlhf/requirements.in +typeguard==4.3.0 + # via jaxtyping +types-protobuf==5.28.0.20240924 # via envpool typing-extensions==4.12.2 # via + # astroid + # black + # botorch # brax # chex # envpool # etils + # flashbax # flax # gymnasium # huggingface-hub # iopath + # jaxtyping # lightning # lightning-utilities + # multidict + # navix # optree # orbax-checkpoint # pytorch-lightning # reactivex + # rich # submitit + # tables # torch + # typeguard # tyro -tyro==0.8.8 - # via -r benchmarks/torchatari/requirements.in -tzdata==2024.1 +tyro==0.8.11 + # via + # -r benchmarks/torchatari/requirements.in + # navix + # trl +tzdata==2024.2 # via pandas -urllib3==2.2.2 +urllib3==2.2.3 # via # blobfile # requests -varname==0.10.0 + # sentry-sdk +varname==0.13.3 # via giving voir==0.2.19 # via @@ -572,28 +928,39 @@ voir==0.2.19 # -r benchmarks/diffusion/requirements.in # -r benchmarks/dinov2/requirements.in # -r benchmarks/flops/requirements.in + # -r benchmarks/geo_gnn/requirements.in # -r benchmarks/huggingface/requirements.in # -r benchmarks/lightning/requirements.in # -r benchmarks/llama/requirements.in + # -r benchmarks/llava/requirements.in # -r benchmarks/llm/requirements.in - # -r benchmarks/super-slomo/requirements.in + # -r benchmarks/purejaxrl/requirements.in + # -r benchmarks/recursiongfn/requirements.in + # -r benchmarks/rlhf/requirements.in # -r benchmarks/timm/requirements.in # -r benchmarks/torchatari/requirements.in # -r benchmarks/torchvision/requirements.in # -r benchmarks/torchvision_ddp/requirements.in -werkzeug==3.0.3 + # -r benchmarks/vjepa/requirements.in +wandb==0.18.3 + # via + # -r benchmarks/recursiongfn/requirements.in + # navix +webdataset==0.2.100 + # via -r benchmarks/vjepa/requirements.in +werkzeug==3.0.4 # via # flask # tensorboard -xformers==0.0.27.post2 +xformers==0.0.28.post1 # via -r benchmarks/dinov2/requirements.in xxhash==3.5.0 # via datasets yacs==0.1.8 # via fvcore -yarl==1.9.4 +yarl==1.13.1 # via aiohttp -zipp==3.20.0 +zipp==3.20.2 # via # etils # importlib-metadata diff --git a/benchmarks/brax/benchfile.py b/benchmarks/brax/benchfile.py index 0388956d..c3312813 100644 --- a/benchmarks/brax/benchfile.py +++ b/benchmarks/brax/benchfile.py @@ -5,5 +5,9 @@ class BraxBenchmark(Package): base_requirements = "requirements.in" main_script = "main.py" - + def make_env(self): + env = super().make_env() + env["XLA_PYTHON_CLIENT_PREALLOCATE"] = "False" + return env + __pack__ = BraxBenchmark diff --git a/benchmarks/brax/requirements.rocm.txt b/benchmarks/brax/requirements.rocm.txt index 0c14e04d..a1923520 100644 --- a/benchmarks/brax/requirements.rocm.txt +++ b/benchmarks/brax/requirements.rocm.txt @@ -4,7 +4,7 @@ # # pip-compile --output-file=benchmarks/brax/requirements.rocm.txt .pin/tmp-constraints-rocm-brax.txt benchmarks/brax/requirements.in # ---extra-index-url https://download.pytorch.org/whl/rocm6.0 +--extra-index-url https://download.pytorch.org/whl/rocm6.1 absl-py==2.1.0 # via @@ -33,7 +33,7 @@ brax==0.10.5 # via # -c .pin/../.pin/constraints-rocm-torch.txt # -r benchmarks/brax/requirements.in -chex==0.1.86 +chex==0.1.87 # via # -c .pin/../.pin/constraints-rocm-torch.txt # optax @@ -45,7 +45,7 @@ cloudpickle==3.0.0 # via # -c .pin/../.pin/constraints-rocm-torch.txt # gym 
-codefind==0.1.6 +codefind==0.1.7 # via # -c .pin/../.pin/constraints-rocm-torch.txt # ptera @@ -61,7 +61,7 @@ dm-tree==0.1.8 # via # -c .pin/../.pin/constraints-rocm-torch.txt # dm-env -etils[epath,epy]==1.7.0 +etils[epath,epy]==1.9.4 # via # -c .pin/../.pin/constraints-rocm-torch.txt # brax @@ -69,11 +69,11 @@ etils[epath,epy]==1.7.0 # mujoco-mjx # optax # orbax-checkpoint -executing==1.2.0 +executing==2.1.0 # via # -c .pin/../.pin/constraints-rocm-torch.txt # varname -filelock==3.15.4 +filelock==3.16.1 # via # -c .pin/../.pin/constraints-rocm-torch.txt # pytorch-triton-rocm @@ -83,11 +83,11 @@ flask==3.0.3 # -c .pin/../.pin/constraints-rocm-torch.txt # brax # flask-cors -flask-cors==4.0.1 +flask-cors==5.0.0 # via # -c .pin/../.pin/constraints-rocm-torch.txt # brax -flax==0.8.5 +flax==0.9.0 # via # -c .pin/../.pin/constraints-rocm-torch.txt # brax @@ -96,7 +96,7 @@ fsspec==2024.6.1 # -c .pin/../.pin/constraints-rocm-torch.txt # etils # torch -giving==0.4.2 +giving==0.4.3 # via # -c .pin/../.pin/constraints-rocm-torch.txt # ptera @@ -105,11 +105,11 @@ glfw==2.7.0 # via # -c .pin/../.pin/constraints-rocm-torch.txt # mujoco -grpcio==1.65.5 +grpcio==1.66.2 # via # -c .pin/../.pin/constraints-rocm-torch.txt # brax -gym==0.23.1 +gym==0.26.2 # via # -c .pin/../.pin/constraints-rocm-torch.txt # brax @@ -121,7 +121,7 @@ humanize==4.10.0 # via # -c .pin/../.pin/constraints-rocm-torch.txt # orbax-checkpoint -importlib-resources==6.4.3 +importlib-resources==6.4.5 # via # -c .pin/../.pin/constraints-rocm-torch.txt # etils @@ -129,7 +129,7 @@ itsdangerous==2.2.0 # via # -c .pin/../.pin/constraints-rocm-torch.txt # flask -jax==0.4.31 +jax==0.4.33 # via # -c .pin/../.pin/constraints-rocm-torch.txt # -r benchmarks/brax/requirements.in @@ -140,7 +140,7 @@ jax==0.4.31 # mujoco-mjx # optax # orbax-checkpoint -jaxlib==0.4.31 +jaxlib==0.4.33 # via # -c .pin/../.pin/constraints-rocm-torch.txt # brax @@ -177,7 +177,7 @@ ml-collections==0.1.1 # via # -c .pin/../.pin/constraints-rocm-torch.txt # brax -ml-dtypes==0.4.0 +ml-dtypes==0.5.0 # via # -c .pin/../.pin/constraints-rocm-torch.txt # jax @@ -187,17 +187,17 @@ mpmath==1.3.0 # via # -c .pin/../.pin/constraints-rocm-torch.txt # sympy -msgpack==1.0.8 +msgpack==1.1.0 # via # -c .pin/../.pin/constraints-rocm-torch.txt # flax # orbax-checkpoint -mujoco==3.2.2 +mujoco==3.2.3 # via # -c .pin/../.pin/constraints-rocm-torch.txt # brax # mujoco-mjx -mujoco-mjx==3.2.2 +mujoco-mjx==3.2.3 # via # -c .pin/../.pin/constraints-rocm-torch.txt # brax @@ -215,25 +215,27 @@ numpy==1.26.4 # brax # chex # dm-env - # flax # gym # jax # jaxlib # jaxopt # ml-dtypes # mujoco - # opt-einsum # optax # orbax-checkpoint # scipy # tensorboardx # tensorstore # trimesh +nvidia-ml-py==12.560.30 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # voir omegaconf==2.3.0 # via # -c .pin/../.pin/constraints-rocm-torch.txt # voir -opt-einsum==3.3.0 +opt-einsum==3.4.0 # via # -c .pin/../.pin/constraints-rocm-torch.txt # jax @@ -242,12 +244,12 @@ optax==0.2.3 # -c .pin/../.pin/constraints-rocm-torch.txt # brax # flax -orbax-checkpoint==0.6.0 +orbax-checkpoint==0.6.4 # via # -c .pin/../.pin/constraints-rocm-torch.txt # brax # flax -ovld==0.3.8 +ovld==0.3.9 # via # -c .pin/../.pin/constraints-rocm-torch.txt # voir @@ -259,7 +261,7 @@ pillow==10.4.0 # via # -c .pin/../.pin/constraints-rocm-torch.txt # brax -protobuf==5.27.3 +protobuf==5.28.2 # via # -c .pin/../.pin/constraints-rocm-torch.txt # orbax-checkpoint @@ -276,10 +278,6 @@ pygments==2.18.0 # via # -c .pin/../.pin/constraints-rocm-torch.txt 
# rich -pynvml==11.5.3 - # via - # -c .pin/../.pin/constraints-rocm-torch.txt - # voir pyopengl==3.1.7 # via # -c .pin/../.pin/constraints-rocm-torch.txt @@ -303,12 +301,12 @@ reactivex==4.0.4 # via # -c .pin/../.pin/constraints-rocm-torch.txt # giving -rich==13.7.1 +rich==13.9.1 # via # -c .pin/../.pin/constraints-rocm-torch.txt # flax # voir -scipy==1.14.0 +scipy==1.14.1 # via # -c .pin/../.pin/constraints-rocm-torch.txt # brax @@ -321,7 +319,7 @@ six==1.16.0 # -c .pin/../.pin/constraints-rocm-torch.txt # asttokens # ml-collections -sympy==1.13.2 +sympy==1.13.3 # via # -c .pin/../.pin/constraints-rocm-torch.txt # torch @@ -329,7 +327,7 @@ tensorboardx==2.6.2.2 # via # -c .pin/../.pin/constraints-rocm-torch.txt # brax -tensorstore==0.1.64 +tensorstore==0.1.66 # via # -c .pin/../.pin/constraints-rocm-torch.txt # flax @@ -338,11 +336,11 @@ toolz==0.12.1 # via # -c .pin/../.pin/constraints-rocm-torch.txt # chex -torch==2.4.0+rocm6.0 +torch==2.4.1+rocm6.1 # via # -c .pin/../.pin/constraints-rocm-torch.txt # -r benchmarks/brax/requirements.in -trimesh==4.4.7 +trimesh==4.4.9 # via # -c .pin/../.pin/constraints-rocm-torch.txt # brax @@ -356,8 +354,9 @@ typing-extensions==4.12.2 # flax # orbax-checkpoint # reactivex + # rich # torch -varname==0.10.0 +varname==0.13.3 # via # -c .pin/../.pin/constraints-rocm-torch.txt # giving @@ -366,11 +365,11 @@ voir==0.2.19 # -c .pin/../.pin/constraints-rocm-torch.txt # -c .pin/../constraints/rocm.txt # -r benchmarks/brax/requirements.in -werkzeug==3.0.3 +werkzeug==3.0.4 # via # -c .pin/../.pin/constraints-rocm-torch.txt # flask -zipp==3.20.0 +zipp==3.20.2 # via # -c .pin/../.pin/constraints-rocm-torch.txt # etils diff --git a/benchmarks/diffusion/requirements.rocm.txt b/benchmarks/diffusion/requirements.rocm.txt index 5d0fd6e3..ecedcbb4 100644 --- a/benchmarks/diffusion/requirements.rocm.txt +++ b/benchmarks/diffusion/requirements.rocm.txt @@ -4,18 +4,18 @@ # # pip-compile --output-file=benchmarks/diffusion/requirements.rocm.txt .pin/tmp-constraints-rocm-diffusion-nodes.txt benchmarks/diffusion/requirements.in # ---extra-index-url https://download.pytorch.org/whl/rocm6.0 +--extra-index-url https://download.pytorch.org/whl/rocm6.1 -accelerate==0.33.0 +accelerate==0.34.2 # via # -c .pin/../.pin/constraints-rocm-torch.txt # -r benchmarks/diffusion/requirements.in # diffusers -aiohappyeyeballs==2.4.0 +aiohappyeyeballs==2.4.3 # via # -c .pin/../.pin/constraints-rocm-torch.txt # aiohttp -aiohttp==3.10.5 +aiohttp==3.10.8 # via # -c .pin/../.pin/constraints-rocm-torch.txt # datasets @@ -44,7 +44,7 @@ attrs==24.2.0 # via # -c .pin/../.pin/constraints-rocm-torch.txt # aiohttp -certifi==2024.7.4 +certifi==2024.8.30 # via # -c .pin/../.pin/constraints-rocm-torch.txt # requests @@ -52,15 +52,15 @@ charset-normalizer==3.3.2 # via # -c .pin/../.pin/constraints-rocm-torch.txt # requests -codefind==0.1.6 +codefind==0.1.7 # via # -c .pin/../.pin/constraints-rocm-torch.txt # ptera -datasets==2.21.0 +datasets==3.0.1 # via # -c .pin/../.pin/constraints-rocm-torch.txt # -r benchmarks/diffusion/requirements.in -diffusers[torch]==0.30.0 +diffusers[torch]==0.30.3 # via # -c .pin/../.pin/constraints-rocm-torch.txt # -r benchmarks/diffusion/requirements.in @@ -69,11 +69,11 @@ dill==0.3.8 # -c .pin/../.pin/constraints-rocm-torch.txt # datasets # multiprocess -executing==1.2.0 +executing==2.1.0 # via # -c .pin/../.pin/constraints-rocm-torch.txt # varname -filelock==3.15.4 +filelock==3.16.1 # via # -c .pin/../.pin/constraints-rocm-torch.txt # datasets @@ -93,7 +93,7 @@ 
fsspec[http]==2024.6.1 # datasets # huggingface-hub # torch -giving==0.4.2 +giving==0.4.3 # via # -c .pin/../.pin/constraints-rocm-torch.txt # ptera @@ -102,7 +102,7 @@ hjson==3.1.0 # via # -c .pin/../.pin/constraints-rocm-torch.txt # argklass -huggingface-hub==0.24.6 +huggingface-hub==0.25.1 # via # -c .pin/../.pin/constraints-rocm-torch.txt # accelerate @@ -110,16 +110,16 @@ huggingface-hub==0.24.6 # diffusers # tokenizers # transformers -idna==3.7 +idna==3.10 # via # -c .pin/../.pin/constraints-rocm-torch.txt # requests # yarl -importlib-metadata==8.4.0 +importlib-metadata==8.5.0 # via # -c .pin/../.pin/constraints-rocm-torch.txt # diffusers -importlib-resources==6.4.3 +importlib-resources==6.4.5 # via # -c .pin/../.pin/constraints-rocm-torch.txt # argklass @@ -143,7 +143,7 @@ mpmath==1.3.0 # via # -c .pin/../.pin/constraints-rocm-torch.txt # sympy -multidict==6.0.5 +multidict==6.1.0 # via # -c .pin/../.pin/constraints-rocm-torch.txt # aiohttp @@ -166,11 +166,15 @@ numpy==1.26.4 # pyarrow # torchvision # transformers +nvidia-ml-py==12.560.30 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # voir omegaconf==2.3.0 # via # -c .pin/../.pin/constraints-rocm-torch.txt # voir -ovld==0.3.8 +ovld==0.3.9 # via # -c .pin/../.pin/constraints-rocm-torch.txt # voir @@ -181,7 +185,7 @@ packaging==24.1 # datasets # huggingface-hub # transformers -pandas==2.2.2 +pandas==2.2.3 # via # -c .pin/../.pin/constraints-rocm-torch.txt # datasets @@ -207,10 +211,6 @@ pygments==2.18.0 # via # -c .pin/../.pin/constraints-rocm-torch.txt # rich -pynvml==11.5.3 - # via - # -c .pin/../.pin/constraints-rocm-torch.txt - # voir python-dateutil==2.9.0.post0 # via # -c .pin/../.pin/constraints-rocm-torch.txt @@ -219,7 +219,7 @@ pytorch-triton-rocm==3.0.0 # via # -c .pin/../.pin/constraints-rocm-torch.txt # torch -pytz==2024.1 +pytz==2024.2 # via # -c .pin/../.pin/constraints-rocm-torch.txt # pandas @@ -235,7 +235,7 @@ reactivex==4.0.4 # via # -c .pin/../.pin/constraints-rocm-torch.txt # giving -regex==2024.7.24 +regex==2024.9.11 # via # -c .pin/../.pin/constraints-rocm-torch.txt # diffusers @@ -247,11 +247,11 @@ requests==2.32.3 # diffusers # huggingface-hub # transformers -rich==13.7.1 +rich==13.9.1 # via # -c .pin/../.pin/constraints-rocm-torch.txt # voir -safetensors==0.4.4 +safetensors==0.4.5 # via # -c .pin/../.pin/constraints-rocm-torch.txt # accelerate @@ -262,7 +262,7 @@ six==1.16.0 # -c .pin/../.pin/constraints-rocm-torch.txt # asttokens # python-dateutil -sympy==1.13.2 +sympy==1.13.3 # via # -c .pin/../.pin/constraints-rocm-torch.txt # torch @@ -270,13 +270,13 @@ tokenizers==0.19.1 # via # -c .pin/../.pin/constraints-rocm-torch.txt # transformers -torch==2.4.0+rocm6.0 +torch==2.4.1+rocm6.1 # via # -c .pin/../.pin/constraints-rocm-torch.txt # accelerate # diffusers # torchvision -torchvision==0.19.0+rocm6.0 +torchvision==0.19.1+rocm6.1 # via # -c .pin/../.pin/constraints-rocm-torch.txt # -r benchmarks/diffusion/requirements.in @@ -287,29 +287,32 @@ tqdm==4.66.5 # datasets # huggingface-hub # transformers -transformers==4.44.1 +transformers==4.44.2 # via # -c .pin/../.pin/constraints-rocm-torch.txt + # -c .pin/../constraints/rocm.txt # -r benchmarks/diffusion/requirements.in typing-extensions==4.12.2 # via # -c .pin/../.pin/constraints-rocm-torch.txt # huggingface-hub + # multidict # reactivex + # rich # torch -tzdata==2024.1 +tzdata==2024.2 # via # -c .pin/../.pin/constraints-rocm-torch.txt # pandas -urllib3==2.2.2 +urllib3==2.2.3 # via # -c .pin/../.pin/constraints-rocm-torch.txt # requests 
-varname==0.10.0 +varname==0.13.3 # via # -c .pin/../.pin/constraints-rocm-torch.txt # giving -voir==0.2.17 +voir==0.2.19 # via # -c .pin/../.pin/constraints-rocm-torch.txt # -c .pin/../constraints/rocm.txt @@ -318,11 +321,11 @@ xxhash==3.5.0 # via # -c .pin/../.pin/constraints-rocm-torch.txt # datasets -yarl==1.9.4 +yarl==1.13.1 # via # -c .pin/../.pin/constraints-rocm-torch.txt # aiohttp -zipp==3.20.0 +zipp==3.20.2 # via # -c .pin/../.pin/constraints-rocm-torch.txt # importlib-metadata diff --git a/benchmarks/dinov2/requirements.rocm.txt b/benchmarks/dinov2/requirements.rocm.txt index c46ba981..f8b7f43e 100644 --- a/benchmarks/dinov2/requirements.rocm.txt +++ b/benchmarks/dinov2/requirements.rocm.txt @@ -4,7 +4,7 @@ # # pip-compile --output-file=benchmarks/dinov2/requirements.rocm.txt .pin/tmp-constraints-rocm-dinov2-giant-gpus.txt benchmarks/dinov2/requirements.in # ---extra-index-url https://download.pytorch.org/whl/rocm6.0 +--extra-index-url https://download.pytorch.org/whl/rocm6.1 antlr4-python3-runtime==4.9.3 # via @@ -18,15 +18,15 @@ cloudpickle==3.0.0 # via # -c .pin/../.pin/constraints-rocm-torch.txt # submitit -codefind==0.1.6 +codefind==0.1.7 # via # -c .pin/../.pin/constraints-rocm-torch.txt # ptera -executing==1.2.0 +executing==2.1.0 # via # -c .pin/../.pin/constraints-rocm-torch.txt # varname -filelock==3.15.4 +filelock==3.16.1 # via # -c .pin/../.pin/constraints-rocm-torch.txt # pytorch-triton-rocm @@ -39,7 +39,7 @@ fvcore==0.1.5.post20221221 # via # -c .pin/../.pin/constraints-rocm-torch.txt # -r benchmarks/dinov2/requirements.in -giving==0.4.2 +giving==0.4.3 # via # -c .pin/../.pin/constraints-rocm-torch.txt # ptera @@ -53,7 +53,7 @@ jinja2==3.1.4 # via # -c .pin/../.pin/constraints-rocm-torch.txt # torch -lightning-utilities==0.11.6 +lightning-utilities==0.11.7 # via # -c .pin/../.pin/constraints-rocm-torch.txt # torchmetrics @@ -85,12 +85,16 @@ numpy==1.26.4 # torchmetrics # torchvision # xformers +nvidia-ml-py==12.560.30 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # voir omegaconf==2.3.0 # via # -c .pin/../.pin/constraints-rocm-torch.txt # -r benchmarks/dinov2/requirements.in # voir -ovld==0.3.8 +ovld==0.3.9 # via # -c .pin/../.pin/constraints-rocm-torch.txt # voir @@ -120,10 +124,6 @@ pygments==2.18.0 # via # -c .pin/../.pin/constraints-rocm-torch.txt # rich -pynvml==11.5.3 - # via - # -c .pin/../.pin/constraints-rocm-torch.txt - # voir pytorch-triton-rocm==3.0.0 # via # -c .pin/../.pin/constraints-rocm-torch.txt @@ -138,11 +138,11 @@ reactivex==4.0.4 # via # -c .pin/../.pin/constraints-rocm-torch.txt # giving -rich==13.7.1 +rich==13.9.1 # via # -c .pin/../.pin/constraints-rocm-torch.txt # voir -scipy==1.14.0 +scipy==1.14.1 # via # -c .pin/../.pin/constraints-rocm-torch.txt # -r benchmarks/dinov2/requirements.in @@ -150,11 +150,11 @@ six==1.16.0 # via # -c .pin/../.pin/constraints-rocm-torch.txt # asttokens -submitit==1.5.1 +submitit==1.5.2 # via # -c .pin/../.pin/constraints-rocm-torch.txt # -r benchmarks/dinov2/requirements.in -sympy==1.13.2 +sympy==1.13.3 # via # -c .pin/../.pin/constraints-rocm-torch.txt # torch @@ -166,18 +166,18 @@ termcolor==2.4.0 # via # -c .pin/../.pin/constraints-rocm-torch.txt # fvcore -torch==2.4.0+rocm6.0 +torch==2.4.1+rocm6.1 # via # -c .pin/../.pin/constraints-rocm-torch.txt # -r benchmarks/dinov2/requirements.in # torchmetrics # torchvision # xformers -torchmetrics==1.4.1 +torchmetrics==1.4.2 # via # -c .pin/../.pin/constraints-rocm-torch.txt # -r benchmarks/dinov2/requirements.in -torchvision==0.19.0+rocm6.0 
+torchvision==0.19.1+rocm6.1 # via # -c .pin/../.pin/constraints-rocm-torch.txt # -r benchmarks/dinov2/requirements.in @@ -192,18 +192,19 @@ typing-extensions==4.12.2 # iopath # lightning-utilities # reactivex + # rich # submitit # torch -varname==0.10.0 +varname==0.13.3 # via # -c .pin/../.pin/constraints-rocm-torch.txt # giving -voir==0.2.17 +voir==0.2.19 # via # -c .pin/../.pin/constraints-rocm-torch.txt # -c .pin/../constraints/rocm.txt # -r benchmarks/dinov2/requirements.in -xformers==0.0.27.post2 +xformers==0.0.28.post1 # via # -c .pin/../.pin/constraints-rocm-torch.txt # -r benchmarks/dinov2/requirements.in diff --git a/benchmarks/flops/requirements.rocm.txt b/benchmarks/flops/requirements.rocm.txt index d9ac15eb..fbc8952d 100644 --- a/benchmarks/flops/requirements.rocm.txt +++ b/benchmarks/flops/requirements.rocm.txt @@ -4,7 +4,7 @@ # # pip-compile --output-file=benchmarks/flops/requirements.rocm.txt .pin/tmp-constraints-rocm-flops.txt benchmarks/flops/requirements.in # ---extra-index-url https://download.pytorch.org/whl/rocm6.0 +--extra-index-url https://download.pytorch.org/whl/rocm6.1 antlr4-python3-runtime==4.9.3 # via @@ -14,15 +14,15 @@ asttokens==2.4.1 # via # -c .pin/../.pin/constraints-rocm-torch.txt # giving -codefind==0.1.6 +codefind==0.1.7 # via # -c .pin/../.pin/constraints-rocm-torch.txt # ptera -executing==1.2.0 +executing==2.1.0 # via # -c .pin/../.pin/constraints-rocm-torch.txt # varname -filelock==3.15.4 +filelock==3.16.1 # via # -c .pin/../.pin/constraints-rocm-torch.txt # pytorch-triton-rocm @@ -31,12 +31,12 @@ fsspec==2024.6.1 # via # -c .pin/../.pin/constraints-rocm-torch.txt # torch -giving==0.4.2 +giving==0.4.3 # via # -c .pin/../.pin/constraints-rocm-torch.txt # ptera # voir -importlib-resources==6.4.3 +importlib-resources==6.4.5 # via # -c .pin/../.pin/constraints-rocm-torch.txt # torchcompat @@ -68,11 +68,15 @@ numpy==1.26.4 # via # -c .pin/../.pin/constraints-rocm-torch.txt # torchvision +nvidia-ml-py==12.560.30 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # voir omegaconf==2.3.0 # via # -c .pin/../.pin/constraints-rocm-torch.txt # voir -ovld==0.3.8 +ovld==0.3.9 # via # -c .pin/../.pin/constraints-rocm-torch.txt # voir @@ -92,10 +96,6 @@ pygments==2.18.0 # via # -c .pin/../.pin/constraints-rocm-torch.txt # rich -pynvml==11.5.3 - # via - # -c .pin/../.pin/constraints-rocm-torch.txt - # voir pytorch-triton-rocm==3.0.0 # via # -c .pin/../.pin/constraints-rocm-torch.txt @@ -108,7 +108,7 @@ reactivex==4.0.4 # via # -c .pin/../.pin/constraints-rocm-torch.txt # giving -rich==13.7.1 +rich==13.9.1 # via # -c .pin/../.pin/constraints-rocm-torch.txt # voir @@ -116,11 +116,11 @@ six==1.16.0 # via # -c .pin/../.pin/constraints-rocm-torch.txt # asttokens -sympy==1.13.2 +sympy==1.13.3 # via # -c .pin/../.pin/constraints-rocm-torch.txt # torch -torch==2.4.0+rocm6.0 +torch==2.4.1+rocm6.1 # via # -c .pin/../.pin/constraints-rocm-torch.txt # -r benchmarks/flops/requirements.in @@ -130,7 +130,7 @@ torchcompat==1.1.4 # -c .pin/../.pin/constraints-rocm-torch.txt # -c .pin/../constraints/rocm.txt # -r benchmarks/flops/requirements.in -torchvision==0.19.0+rocm6.0 +torchvision==0.19.1+rocm6.1 # via # -c .pin/../.pin/constraints-rocm-torch.txt # -r benchmarks/flops/requirements.in @@ -142,8 +142,9 @@ typing-extensions==4.12.2 # via # -c .pin/../.pin/constraints-rocm-torch.txt # reactivex + # rich # torch -varname==0.10.0 +varname==0.13.3 # via # -c .pin/../.pin/constraints-rocm-torch.txt # giving diff --git a/benchmarks/geo_gnn/requirements-pre.rocm.txt 
b/benchmarks/geo_gnn/requirements-pre.rocm.txt index 3aded346..9b4cf02f 100644 --- a/benchmarks/geo_gnn/requirements-pre.rocm.txt +++ b/benchmarks/geo_gnn/requirements-pre.rocm.txt @@ -2,48 +2,48 @@ # This file is autogenerated by pip-compile with Python 3.10 # by the following command: # -# pip-compile --output-file=benchmarks/geo_gnn/requirements-pre.rocm.txt .pin/tmp-constraints-rocm-geo_gnn.txt benchmarks/geo_gnn/requirements-pre.in +# pip-compile --output-file=benchmarks/geo_gnn/requirements-pre.rocm.txt .pin/tmp-constraints-rocm-dimenet.txt benchmarks/geo_gnn/requirements-pre.in # ---extra-index-url https://download.pytorch.org/whl/rocm6.0 +--extra-index-url https://download.pytorch.org/whl/rocm6.1 -filelock==3.15.4 +filelock==3.16.1 # via - # -c .pin/../.pin/constraints-rocm-gnn.txt + # -c .pin/../.pin/constraints-rocm-torch.txt # pytorch-triton-rocm # torch fsspec==2024.6.1 # via - # -c .pin/../.pin/constraints-rocm-gnn.txt + # -c .pin/../.pin/constraints-rocm-torch.txt # torch jinja2==3.1.4 # via - # -c .pin/../.pin/constraints-rocm-gnn.txt + # -c .pin/../.pin/constraints-rocm-torch.txt # torch markupsafe==2.1.5 # via - # -c .pin/../.pin/constraints-rocm-gnn.txt + # -c .pin/../.pin/constraints-rocm-torch.txt # jinja2 mpmath==1.3.0 # via - # -c .pin/../.pin/constraints-rocm-gnn.txt + # -c .pin/../.pin/constraints-rocm-torch.txt # sympy networkx==3.3 # via - # -c .pin/../.pin/constraints-rocm-gnn.txt + # -c .pin/../.pin/constraints-rocm-torch.txt # torch pytorch-triton-rocm==3.0.0 # via - # -c .pin/../.pin/constraints-rocm-gnn.txt + # -c .pin/../.pin/constraints-rocm-torch.txt # torch -sympy==1.13.2 +sympy==1.13.3 # via - # -c .pin/../.pin/constraints-rocm-gnn.txt + # -c .pin/../.pin/constraints-rocm-torch.txt # torch -torch==2.4.0+rocm6.0 +torch==2.4.1+rocm6.1 # via - # -c .pin/../.pin/constraints-rocm-gnn.txt + # -c .pin/../.pin/constraints-rocm-torch.txt # -r benchmarks/geo_gnn/requirements-pre.in typing-extensions==4.12.2 # via - # -c .pin/../.pin/constraints-rocm-gnn.txt + # -c .pin/../.pin/constraints-rocm-torch.txt # torch diff --git a/benchmarks/geo_gnn/requirements.rocm.txt b/benchmarks/geo_gnn/requirements.rocm.txt index 60246f79..8dfacfe7 100644 --- a/benchmarks/geo_gnn/requirements.rocm.txt +++ b/benchmarks/geo_gnn/requirements.rocm.txt @@ -2,271 +2,258 @@ # This file is autogenerated by pip-compile with Python 3.10 # by the following command: # -# pip-compile --output-file=benchmarks/geo_gnn/requirements.rocm.txt .pin/tmp-constraints-rocm-geo_gnn.txt benchmarks/geo_gnn/requirements-pre.rocm.txt benchmarks/geo_gnn/requirements.in +# pip-compile --output-file=benchmarks/geo_gnn/requirements.rocm.txt .pin/tmp-constraints-rocm-dimenet.txt benchmarks/geo_gnn/requirements-pre.rocm.txt benchmarks/geo_gnn/requirements.in # ---extra-index-url https://download.pytorch.org/whl/rocm6.0 +--extra-index-url https://download.pytorch.org/whl/rocm6.1 -aiohappyeyeballs==2.4.0 +aiohappyeyeballs==2.4.3 # via - # -c .pin/../.pin/constraints-rocm-gnn.txt + # -c .pin/../.pin/constraints-rocm-torch.txt # aiohttp -aiohttp==3.10.5 +aiohttp==3.10.8 # via - # -c .pin/../.pin/constraints-rocm-gnn.txt + # -c .pin/../.pin/constraints-rocm-torch.txt # torch-geometric aiosignal==1.3.1 # via - # -c .pin/../.pin/constraints-rocm-gnn.txt + # -c .pin/../.pin/constraints-rocm-torch.txt # aiohttp antlr4-python3-runtime==4.9.3 # via - # -c .pin/../.pin/constraints-rocm-gnn.txt + # -c .pin/../.pin/constraints-rocm-torch.txt # omegaconf asttokens==2.4.1 # via - # -c .pin/../.pin/constraints-rocm-gnn.txt + # -c 
.pin/../.pin/constraints-rocm-torch.txt # giving async-timeout==4.0.3 # via - # -c .pin/../.pin/constraints-rocm-gnn.txt + # -c .pin/../.pin/constraints-rocm-torch.txt # aiohttp attrs==24.2.0 # via - # -c .pin/../.pin/constraints-rocm-gnn.txt + # -c .pin/../.pin/constraints-rocm-torch.txt # aiohttp -certifi==2024.7.4 +certifi==2024.8.30 # via - # -c .pin/../.pin/constraints-rocm-gnn.txt + # -c .pin/../.pin/constraints-rocm-torch.txt # requests charset-normalizer==3.3.2 # via - # -c .pin/../.pin/constraints-rocm-gnn.txt + # -c .pin/../.pin/constraints-rocm-torch.txt # requests -codefind==0.1.6 +codefind==0.1.7 # via - # -c .pin/../.pin/constraints-rocm-gnn.txt + # -c .pin/../.pin/constraints-rocm-torch.txt # ptera -executing==1.2.0 +executing==2.1.0 # via - # -c .pin/../.pin/constraints-rocm-gnn.txt + # -c .pin/../.pin/constraints-rocm-torch.txt # varname -filelock==3.15.4 +filelock==3.16.1 # via - # -c .pin/../.pin/constraints-rocm-gnn.txt + # -c .pin/../.pin/constraints-rocm-torch.txt # -r benchmarks/geo_gnn/requirements-pre.rocm.txt # pytorch-triton-rocm # torch frozenlist==1.4.1 # via - # -c .pin/../.pin/constraints-rocm-gnn.txt + # -c .pin/../.pin/constraints-rocm-torch.txt # aiohttp # aiosignal fsspec==2024.6.1 # via - # -c .pin/../.pin/constraints-rocm-gnn.txt + # -c .pin/../.pin/constraints-rocm-torch.txt # -r benchmarks/geo_gnn/requirements-pre.rocm.txt # torch # torch-geometric -giving==0.4.2 +giving==0.4.3 # via - # -c .pin/../.pin/constraints-rocm-gnn.txt + # -c .pin/../.pin/constraints-rocm-torch.txt # ptera # voir -idna==3.7 +idna==3.10 # via - # -c .pin/../.pin/constraints-rocm-gnn.txt + # -c .pin/../.pin/constraints-rocm-torch.txt # requests # yarl jinja2==3.1.4 # via - # -c .pin/../.pin/constraints-rocm-gnn.txt + # -c .pin/../.pin/constraints-rocm-torch.txt # -r benchmarks/geo_gnn/requirements-pre.rocm.txt # torch # torch-geometric -joblib==1.4.2 - # via - # -c .pin/../.pin/constraints-rocm-gnn.txt - # scikit-learn markdown-it-py==3.0.0 # via - # -c .pin/../.pin/constraints-rocm-gnn.txt + # -c .pin/../.pin/constraints-rocm-torch.txt # rich markupsafe==2.1.5 # via - # -c .pin/../.pin/constraints-rocm-gnn.txt + # -c .pin/../.pin/constraints-rocm-torch.txt # -r benchmarks/geo_gnn/requirements-pre.rocm.txt # jinja2 mdurl==0.1.2 # via - # -c .pin/../.pin/constraints-rocm-gnn.txt + # -c .pin/../.pin/constraints-rocm-torch.txt # markdown-it-py mpmath==1.3.0 # via - # -c .pin/../.pin/constraints-rocm-gnn.txt + # -c .pin/../.pin/constraints-rocm-torch.txt # -r benchmarks/geo_gnn/requirements-pre.rocm.txt # sympy -multidict==6.0.5 +multidict==6.1.0 # via - # -c .pin/../.pin/constraints-rocm-gnn.txt + # -c .pin/../.pin/constraints-rocm-torch.txt # aiohttp # yarl networkx==3.3 # via - # -c .pin/../.pin/constraints-rocm-gnn.txt + # -c .pin/../.pin/constraints-rocm-torch.txt # -r benchmarks/geo_gnn/requirements-pre.rocm.txt # torch numpy==1.26.4 # via - # -c .pin/../.pin/constraints-rocm-gnn.txt + # -c .pin/../.pin/constraints-rocm-torch.txt # -r benchmarks/geo_gnn/requirements.in # pandas # rdkit - # scikit-learn # scipy # torch-geometric +nvidia-ml-py==12.560.30 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # voir omegaconf==2.3.0 # via - # -c .pin/../.pin/constraints-rocm-gnn.txt + # -c .pin/../.pin/constraints-rocm-torch.txt # voir -ovld==0.3.8 +ovld==0.3.9 # via - # -c .pin/../.pin/constraints-rocm-gnn.txt + # -c .pin/../.pin/constraints-rocm-torch.txt # voir -pandas==2.2.2 +pandas==2.2.3 # via - # -c .pin/../.pin/constraints-rocm-gnn.txt + # -c 
.pin/../.pin/constraints-rocm-torch.txt # -r benchmarks/geo_gnn/requirements.in pillow==10.4.0 # via - # -c .pin/../.pin/constraints-rocm-gnn.txt + # -c .pin/../.pin/constraints-rocm-torch.txt # rdkit psutil==5.9.8 # via - # -c .pin/../.pin/constraints-rocm-gnn.txt + # -c .pin/../.pin/constraints-rocm-torch.txt # torch-geometric # voir ptera==1.4.1 # via - # -c .pin/../.pin/constraints-rocm-gnn.txt + # -c .pin/../.pin/constraints-rocm-torch.txt # voir pygments==2.18.0 # via - # -c .pin/../.pin/constraints-rocm-gnn.txt + # -c .pin/../.pin/constraints-rocm-torch.txt # rich -pynvml==11.5.3 - # via - # -c .pin/../.pin/constraints-rocm-gnn.txt - # voir -pyparsing==3.1.2 +pyparsing==3.1.4 # via - # -c .pin/../.pin/constraints-rocm-gnn.txt + # -c .pin/../.pin/constraints-rocm-torch.txt # torch-geometric python-dateutil==2.9.0.post0 # via - # -c .pin/../.pin/constraints-rocm-gnn.txt + # -c .pin/../.pin/constraints-rocm-torch.txt # pandas pytorch-triton-rocm==3.0.0 # via - # -c .pin/../.pin/constraints-rocm-gnn.txt + # -c .pin/../.pin/constraints-rocm-torch.txt # -r benchmarks/geo_gnn/requirements-pre.rocm.txt # torch -pytz==2024.1 +pytz==2024.2 # via - # -c .pin/../.pin/constraints-rocm-gnn.txt + # -c .pin/../.pin/constraints-rocm-torch.txt # pandas pyyaml==6.0.2 # via - # -c .pin/../.pin/constraints-rocm-gnn.txt + # -c .pin/../.pin/constraints-rocm-torch.txt # omegaconf rdkit==2024.3.5 # via - # -c .pin/../.pin/constraints-rocm-gnn.txt + # -c .pin/../.pin/constraints-rocm-torch.txt # -r benchmarks/geo_gnn/requirements.in reactivex==4.0.4 # via - # -c .pin/../.pin/constraints-rocm-gnn.txt + # -c .pin/../.pin/constraints-rocm-torch.txt # giving requests==2.32.3 # via - # -c .pin/../.pin/constraints-rocm-gnn.txt + # -c .pin/../.pin/constraints-rocm-torch.txt # torch-geometric -rich==13.7.1 +rich==13.9.1 # via - # -c .pin/../.pin/constraints-rocm-gnn.txt + # -c .pin/../.pin/constraints-rocm-torch.txt # voir -scikit-learn==1.5.1 +scipy==1.14.1 # via - # -c .pin/../.pin/constraints-rocm-gnn.txt - # torch-geometric -scipy==1.14.0 - # via - # -c .pin/../.pin/constraints-rocm-gnn.txt - # scikit-learn + # -c .pin/../.pin/constraints-rocm-torch.txt # torch-cluster - # torch-geometric # torch-sparse six==1.16.0 # via - # -c .pin/../.pin/constraints-rocm-gnn.txt + # -c .pin/../.pin/constraints-rocm-torch.txt # asttokens # python-dateutil -sympy==1.13.2 +sympy==1.13.3 # via - # -c .pin/../.pin/constraints-rocm-gnn.txt + # -c .pin/../.pin/constraints-rocm-torch.txt # -r benchmarks/geo_gnn/requirements-pre.rocm.txt # torch -threadpoolctl==3.5.0 +torch==2.4.1+rocm6.1 # via - # -c .pin/../.pin/constraints-rocm-gnn.txt - # scikit-learn -torch==2.4.0+rocm6.0 - # via - # -c .pin/../.pin/constraints-rocm-gnn.txt + # -c .pin/../.pin/constraints-rocm-torch.txt # -r benchmarks/geo_gnn/requirements-pre.rocm.txt torch-cluster==1.6.3 # via - # -c .pin/../.pin/constraints-rocm-gnn.txt + # -c .pin/../.pin/constraints-rocm-torch.txt # -r benchmarks/geo_gnn/requirements.in -torch-geometric==2.5.3 +torch-geometric==2.6.1 # via - # -c .pin/../.pin/constraints-rocm-gnn.txt + # -c .pin/../.pin/constraints-rocm-torch.txt # -r benchmarks/geo_gnn/requirements.in torch-scatter==2.1.2 # via - # -c .pin/../.pin/constraints-rocm-gnn.txt + # -c .pin/../.pin/constraints-rocm-torch.txt # -r benchmarks/geo_gnn/requirements.in torch-sparse==0.6.18 # via - # -c .pin/../.pin/constraints-rocm-gnn.txt + # -c .pin/../.pin/constraints-rocm-torch.txt # -r benchmarks/geo_gnn/requirements.in tqdm==4.66.5 # via - # -c 
.pin/../.pin/constraints-rocm-gnn.txt + # -c .pin/../.pin/constraints-rocm-torch.txt # torch-geometric typing-extensions==4.12.2 # via - # -c .pin/../.pin/constraints-rocm-gnn.txt + # -c .pin/../.pin/constraints-rocm-torch.txt # -r benchmarks/geo_gnn/requirements-pre.rocm.txt + # multidict # reactivex + # rich # torch -tzdata==2024.1 +tzdata==2024.2 # via - # -c .pin/../.pin/constraints-rocm-gnn.txt + # -c .pin/../.pin/constraints-rocm-torch.txt # pandas -urllib3==2.2.2 +urllib3==2.2.3 # via - # -c .pin/../.pin/constraints-rocm-gnn.txt + # -c .pin/../.pin/constraints-rocm-torch.txt # requests -varname==0.10.0 +varname==0.13.3 # via - # -c .pin/../.pin/constraints-rocm-gnn.txt + # -c .pin/../.pin/constraints-rocm-torch.txt # giving -voir==0.2.17 +voir==0.2.19 # via - # -c .pin/../.pin/constraints-rocm-gnn.txt + # -c .pin/../.pin/constraints-rocm-torch.txt # -c .pin/../constraints/rocm.txt # -r benchmarks/geo_gnn/requirements.in -yarl==1.9.4 +yarl==1.13.1 # via - # -c .pin/../.pin/constraints-rocm-gnn.txt + # -c .pin/../.pin/constraints-rocm-torch.txt # aiohttp diff --git a/benchmarks/huggingface/requirements.rocm.txt b/benchmarks/huggingface/requirements.rocm.txt index 1f54d841..653d2c59 100644 --- a/benchmarks/huggingface/requirements.rocm.txt +++ b/benchmarks/huggingface/requirements.rocm.txt @@ -4,7 +4,7 @@ # # pip-compile --output-file=benchmarks/huggingface/requirements.rocm.txt .pin/tmp-constraints-rocm-hf.txt benchmarks/huggingface/requirements.in # ---extra-index-url https://download.pytorch.org/whl/rocm6.0 +--extra-index-url https://download.pytorch.org/whl/rocm6.1 antlr4-python3-runtime==4.9.3 # via @@ -14,7 +14,7 @@ asttokens==2.4.1 # via # -c .pin/../.pin/constraints-rocm-torch.txt # giving -certifi==2024.7.4 +certifi==2024.8.30 # via # -c .pin/../.pin/constraints-rocm-torch.txt # requests @@ -22,15 +22,15 @@ charset-normalizer==3.3.2 # via # -c .pin/../.pin/constraints-rocm-torch.txt # requests -codefind==0.1.6 +codefind==0.1.7 # via # -c .pin/../.pin/constraints-rocm-torch.txt # ptera -executing==1.2.0 +executing==2.1.0 # via # -c .pin/../.pin/constraints-rocm-torch.txt # varname -filelock==3.15.4 +filelock==3.16.1 # via # -c .pin/../.pin/constraints-rocm-torch.txt # huggingface-hub @@ -42,17 +42,17 @@ fsspec==2024.6.1 # -c .pin/../.pin/constraints-rocm-torch.txt # huggingface-hub # torch -giving==0.4.2 +giving==0.4.3 # via # -c .pin/../.pin/constraints-rocm-torch.txt # ptera # voir -huggingface-hub==0.24.6 +huggingface-hub==0.25.1 # via # -c .pin/../.pin/constraints-rocm-torch.txt # tokenizers # transformers -idna==3.7 +idna==3.10 # via # -c .pin/../.pin/constraints-rocm-torch.txt # requests @@ -84,11 +84,15 @@ numpy==1.26.4 # via # -c .pin/../.pin/constraints-rocm-torch.txt # transformers +nvidia-ml-py==12.560.30 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # voir omegaconf==2.3.0 # via # -c .pin/../.pin/constraints-rocm-torch.txt # voir -ovld==0.3.8 +ovld==0.3.9 # via # -c .pin/../.pin/constraints-rocm-torch.txt # voir @@ -113,10 +117,6 @@ pygments==2.18.0 # via # -c .pin/../.pin/constraints-rocm-torch.txt # rich -pynvml==11.5.3 - # via - # -c .pin/../.pin/constraints-rocm-torch.txt - # voir pytorch-triton-rocm==3.0.0 # via # -c .pin/../.pin/constraints-rocm-torch.txt @@ -131,7 +131,7 @@ reactivex==4.0.4 # via # -c .pin/../.pin/constraints-rocm-torch.txt # giving -regex==2024.7.24 +regex==2024.9.11 # via # -c .pin/../.pin/constraints-rocm-torch.txt # transformers @@ -140,11 +140,11 @@ requests==2.32.3 # -c .pin/../.pin/constraints-rocm-torch.txt # 
huggingface-hub # transformers -rich==13.7.1 +rich==13.9.1 # via # -c .pin/../.pin/constraints-rocm-torch.txt # voir -safetensors==0.4.4 +safetensors==0.4.5 # via # -c .pin/../.pin/constraints-rocm-torch.txt # transformers @@ -152,7 +152,7 @@ six==1.16.0 # via # -c .pin/../.pin/constraints-rocm-torch.txt # asttokens -sympy==1.13.2 +sympy==1.13.3 # via # -c .pin/../.pin/constraints-rocm-torch.txt # torch @@ -160,7 +160,7 @@ tokenizers==0.19.1 # via # -c .pin/../.pin/constraints-rocm-torch.txt # transformers -torch==2.4.0+rocm6.0 +torch==2.4.1+rocm6.1 # via # -c .pin/../.pin/constraints-rocm-torch.txt # -r benchmarks/huggingface/requirements.in @@ -169,21 +169,23 @@ tqdm==4.66.5 # -c .pin/../.pin/constraints-rocm-torch.txt # huggingface-hub # transformers -transformers==4.44.1 +transformers==4.44.2 # via # -c .pin/../.pin/constraints-rocm-torch.txt + # -c .pin/../constraints/rocm.txt # -r benchmarks/huggingface/requirements.in typing-extensions==4.12.2 # via # -c .pin/../.pin/constraints-rocm-torch.txt # huggingface-hub # reactivex + # rich # torch -urllib3==2.2.2 +urllib3==2.2.3 # via # -c .pin/../.pin/constraints-rocm-torch.txt # requests -varname==0.10.0 +varname==0.13.3 # via # -c .pin/../.pin/constraints-rocm-torch.txt # giving diff --git a/benchmarks/lightning/requirements.rocm.txt b/benchmarks/lightning/requirements.rocm.txt index 26fdcedf..aee2b1ba 100644 --- a/benchmarks/lightning/requirements.rocm.txt +++ b/benchmarks/lightning/requirements.rocm.txt @@ -4,13 +4,13 @@ # # pip-compile --output-file=benchmarks/lightning/requirements.rocm.txt .pin/tmp-constraints-rocm-lightning-gpus.txt benchmarks/lightning/requirements.in # ---extra-index-url https://download.pytorch.org/whl/rocm6.0 +--extra-index-url https://download.pytorch.org/whl/rocm6.1 -aiohappyeyeballs==2.4.0 +aiohappyeyeballs==2.4.3 # via # -c .pin/../.pin/constraints-rocm-torch.txt # aiohttp -aiohttp==3.10.5 +aiohttp==3.10.8 # via # -c .pin/../.pin/constraints-rocm-torch.txt # fsspec @@ -34,15 +34,15 @@ attrs==24.2.0 # via # -c .pin/../.pin/constraints-rocm-torch.txt # aiohttp -codefind==0.1.6 +codefind==0.1.7 # via # -c .pin/../.pin/constraints-rocm-torch.txt # ptera -executing==1.2.0 +executing==2.1.0 # via # -c .pin/../.pin/constraints-rocm-torch.txt # varname -filelock==3.15.4 +filelock==3.16.1 # via # -c .pin/../.pin/constraints-rocm-torch.txt # pytorch-triton-rocm @@ -58,16 +58,16 @@ fsspec[http]==2024.6.1 # lightning # pytorch-lightning # torch -giving==0.4.2 +giving==0.4.3 # via # -c .pin/../.pin/constraints-rocm-torch.txt # ptera # voir -idna==3.7 +idna==3.10 # via # -c .pin/../.pin/constraints-rocm-torch.txt # yarl -importlib-resources==6.4.3 +importlib-resources==6.4.5 # via # -c .pin/../.pin/constraints-rocm-torch.txt # torchcompat @@ -79,7 +79,7 @@ lightning==2.4.0 # via # -c .pin/../.pin/constraints-rocm-torch.txt # -r benchmarks/lightning/requirements.in -lightning-utilities==0.11.6 +lightning-utilities==0.11.7 # via # -c .pin/../.pin/constraints-rocm-torch.txt # lightning @@ -101,7 +101,7 @@ mpmath==1.3.0 # via # -c .pin/../.pin/constraints-rocm-torch.txt # sympy -multidict==6.0.5 +multidict==6.1.0 # via # -c .pin/../.pin/constraints-rocm-torch.txt # aiohttp @@ -115,11 +115,15 @@ numpy==1.26.4 # -c .pin/../.pin/constraints-rocm-torch.txt # torchmetrics # torchvision +nvidia-ml-py==12.560.30 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # voir omegaconf==2.3.0 # via # -c .pin/../.pin/constraints-rocm-torch.txt # voir -ovld==0.3.8 +ovld==0.3.9 # via # -c .pin/../.pin/constraints-rocm-torch.txt # voir 
@@ -146,10 +150,6 @@ pygments==2.18.0 # via # -c .pin/../.pin/constraints-rocm-torch.txt # rich -pynvml==11.5.3 - # via - # -c .pin/../.pin/constraints-rocm-torch.txt - # voir pytorch-lightning==2.4.0 # via # -c .pin/../.pin/constraints-rocm-torch.txt @@ -168,7 +168,7 @@ reactivex==4.0.4 # via # -c .pin/../.pin/constraints-rocm-torch.txt # giving -rich==13.7.1 +rich==13.9.1 # via # -c .pin/../.pin/constraints-rocm-torch.txt # voir @@ -176,11 +176,11 @@ six==1.16.0 # via # -c .pin/../.pin/constraints-rocm-torch.txt # asttokens -sympy==1.13.2 +sympy==1.13.3 # via # -c .pin/../.pin/constraints-rocm-torch.txt # torch -torch==2.4.0+rocm6.0 +torch==2.4.1+rocm6.1 # via # -c .pin/../.pin/constraints-rocm-torch.txt # -r benchmarks/lightning/requirements.in @@ -193,12 +193,12 @@ torchcompat==1.1.4 # -c .pin/../.pin/constraints-rocm-torch.txt # -c .pin/../constraints/rocm.txt # -r benchmarks/lightning/requirements.in -torchmetrics==1.4.1 +torchmetrics==1.4.2 # via # -c .pin/../.pin/constraints-rocm-torch.txt # lightning # pytorch-lightning -torchvision==0.19.0+rocm6.0 +torchvision==0.19.1+rocm6.1 # via # -c .pin/../.pin/constraints-rocm-torch.txt # -r benchmarks/lightning/requirements.in @@ -212,19 +212,21 @@ typing-extensions==4.12.2 # -c .pin/../.pin/constraints-rocm-torch.txt # lightning # lightning-utilities + # multidict # pytorch-lightning # reactivex + # rich # torch -varname==0.10.0 +varname==0.13.3 # via # -c .pin/../.pin/constraints-rocm-torch.txt # giving -voir==0.2.17 +voir==0.2.19 # via # -c .pin/../.pin/constraints-rocm-torch.txt # -c .pin/../constraints/rocm.txt # -r benchmarks/lightning/requirements.in -yarl==1.9.4 +yarl==1.13.1 # via # -c .pin/../.pin/constraints-rocm-torch.txt # aiohttp diff --git a/benchmarks/llama/requirements.rocm.txt b/benchmarks/llama/requirements.rocm.txt index 97c44bb0..41a93e55 100644 --- a/benchmarks/llama/requirements.rocm.txt +++ b/benchmarks/llama/requirements.rocm.txt @@ -4,13 +4,13 @@ # # pip-compile --output-file=benchmarks/llama/requirements.rocm.txt .pin/tmp-constraints-rocm-llm.txt benchmarks/llama/requirements.in # ---extra-index-url https://download.pytorch.org/whl/rocm6.0 +--extra-index-url https://download.pytorch.org/whl/rocm6.1 -aiohappyeyeballs==2.4.0 +aiohappyeyeballs==2.4.3 # via # -c .pin/../.pin/constraints-rocm-torch.txt # aiohttp -aiohttp==3.10.5 +aiohttp==3.10.8 # via # -c .pin/../.pin/constraints-rocm-torch.txt # datasets @@ -35,7 +35,7 @@ attrs==24.2.0 # via # -c .pin/../.pin/constraints-rocm-torch.txt # aiohttp -certifi==2024.7.4 +certifi==2024.8.30 # via # -c .pin/../.pin/constraints-rocm-torch.txt # requests @@ -43,11 +43,11 @@ charset-normalizer==3.3.2 # via # -c .pin/../.pin/constraints-rocm-torch.txt # requests -codefind==0.1.6 +codefind==0.1.7 # via # -c .pin/../.pin/constraints-rocm-torch.txt # ptera -datasets==2.21.0 +datasets==3.0.1 # via # -c .pin/../.pin/constraints-rocm-torch.txt # -r benchmarks/llama/requirements.in @@ -56,7 +56,7 @@ dill==0.3.8 # -c .pin/../.pin/constraints-rocm-torch.txt # datasets # multiprocess -executing==1.2.0 +executing==2.1.0 # via # -c .pin/../.pin/constraints-rocm-torch.txt # varname @@ -64,7 +64,7 @@ fairscale==0.4.13 # via # -c .pin/../.pin/constraints-rocm-torch.txt # -r benchmarks/llama/requirements.in -filelock==3.15.4 +filelock==3.16.1 # via # -c .pin/../.pin/constraints-rocm-torch.txt # datasets @@ -72,7 +72,7 @@ filelock==3.15.4 # pytorch-triton-rocm # torch # transformers -fire==0.6.0 +fire==0.7.0 # via # -c .pin/../.pin/constraints-rocm-torch.txt # -r 
benchmarks/llama/requirements.in @@ -87,18 +87,18 @@ fsspec[http]==2024.6.1 # datasets # huggingface-hub # torch -giving==0.4.2 +giving==0.4.3 # via # -c .pin/../.pin/constraints-rocm-torch.txt # ptera # voir -huggingface-hub==0.24.6 +huggingface-hub==0.25.1 # via # -c .pin/../.pin/constraints-rocm-torch.txt # datasets # tokenizers # transformers -idna==3.7 +idna==3.10 # via # -c .pin/../.pin/constraints-rocm-torch.txt # requests @@ -123,7 +123,7 @@ mpmath==1.3.0 # via # -c .pin/../.pin/constraints-rocm-torch.txt # sympy -multidict==6.0.5 +multidict==6.1.0 # via # -c .pin/../.pin/constraints-rocm-torch.txt # aiohttp @@ -144,11 +144,15 @@ numpy==1.26.4 # pandas # pyarrow # transformers +nvidia-ml-py==12.560.30 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # voir omegaconf==2.3.0 # via # -c .pin/../.pin/constraints-rocm-torch.txt # voir -ovld==0.3.8 +ovld==0.3.9 # via # -c .pin/../.pin/constraints-rocm-torch.txt # voir @@ -158,7 +162,7 @@ packaging==24.1 # datasets # huggingface-hub # transformers -pandas==2.2.2 +pandas==2.2.3 # via # -c .pin/../.pin/constraints-rocm-torch.txt # datasets @@ -178,10 +182,6 @@ pygments==2.18.0 # via # -c .pin/../.pin/constraints-rocm-torch.txt # rich -pynvml==11.5.3 - # via - # -c .pin/../.pin/constraints-rocm-torch.txt - # voir python-dateutil==2.9.0.post0 # via # -c .pin/../.pin/constraints-rocm-torch.txt @@ -190,7 +190,7 @@ pytorch-triton-rocm==3.0.0 # via # -c .pin/../.pin/constraints-rocm-torch.txt # torch -pytz==2024.1 +pytz==2024.2 # via # -c .pin/../.pin/constraints-rocm-torch.txt # pandas @@ -205,7 +205,7 @@ reactivex==4.0.4 # via # -c .pin/../.pin/constraints-rocm-torch.txt # giving -regex==2024.7.24 +regex==2024.9.11 # via # -c .pin/../.pin/constraints-rocm-torch.txt # transformers @@ -215,11 +215,11 @@ requests==2.32.3 # datasets # huggingface-hub # transformers -rich==13.7.1 +rich==13.9.1 # via # -c .pin/../.pin/constraints-rocm-torch.txt # voir -safetensors==0.4.4 +safetensors==0.4.5 # via # -c .pin/../.pin/constraints-rocm-torch.txt # transformers @@ -231,9 +231,8 @@ six==1.16.0 # via # -c .pin/../.pin/constraints-rocm-torch.txt # asttokens - # fire # python-dateutil -sympy==1.13.2 +sympy==1.13.3 # via # -c .pin/../.pin/constraints-rocm-torch.txt # torch @@ -245,7 +244,7 @@ tokenizers==0.19.1 # via # -c .pin/../.pin/constraints-rocm-torch.txt # transformers -torch==2.4.0+rocm6.0 +torch==2.4.1+rocm6.1 # via # -c .pin/../.pin/constraints-rocm-torch.txt # -r benchmarks/llama/requirements.in @@ -256,25 +255,28 @@ tqdm==4.66.5 # datasets # huggingface-hub # transformers -transformers==4.44.1 +transformers==4.44.2 # via # -c .pin/../.pin/constraints-rocm-torch.txt + # -c .pin/../constraints/rocm.txt # -r benchmarks/llama/requirements.in typing-extensions==4.12.2 # via # -c .pin/../.pin/constraints-rocm-torch.txt # huggingface-hub + # multidict # reactivex + # rich # torch -tzdata==2024.1 +tzdata==2024.2 # via # -c .pin/../.pin/constraints-rocm-torch.txt # pandas -urllib3==2.2.2 +urllib3==2.2.3 # via # -c .pin/../.pin/constraints-rocm-torch.txt # requests -varname==0.10.0 +varname==0.13.3 # via # -c .pin/../.pin/constraints-rocm-torch.txt # giving @@ -287,7 +289,7 @@ xxhash==3.5.0 # via # -c .pin/../.pin/constraints-rocm-torch.txt # datasets -yarl==1.9.4 +yarl==1.13.1 # via # -c .pin/../.pin/constraints-rocm-torch.txt # aiohttp diff --git a/benchmarks/llava/requirements.rocm.txt b/benchmarks/llava/requirements.rocm.txt new file mode 100644 index 00000000..fe11f280 --- /dev/null +++ b/benchmarks/llava/requirements.rocm.txt @@ -0,0 +1,293 @@ 
+# +# This file is autogenerated by pip-compile with Python 3.10 +# by the following command: +# +# pip-compile --output-file=benchmarks/llava/requirements.rocm.txt .pin/tmp-constraints-rocm-llava-single.txt benchmarks/llava/requirements.in +# +--extra-index-url https://download.pytorch.org/whl/rocm6.1 + +accelerate==0.34.2 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # -r benchmarks/llava/requirements.in +aiohappyeyeballs==2.4.3 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # aiohttp +aiohttp==3.10.8 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # datasets + # fsspec +aiosignal==1.3.1 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # aiohttp +antlr4-python3-runtime==4.9.3 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # omegaconf +asttokens==2.4.1 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # giving +async-timeout==4.0.3 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # aiohttp +attrs==24.2.0 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # aiohttp +certifi==2024.8.30 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # requests +charset-normalizer==3.3.2 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # requests +codefind==0.1.7 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # ptera +datasets==3.0.1 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # -r benchmarks/llava/requirements.in +dill==0.3.8 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # datasets + # multiprocess +executing==2.1.0 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # varname +filelock==3.16.1 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # datasets + # huggingface-hub + # pytorch-triton-rocm + # torch + # transformers +frozenlist==1.4.1 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # aiohttp + # aiosignal +fsspec[http]==2024.6.1 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # datasets + # huggingface-hub + # torch +giving==0.4.3 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # ptera + # voir +huggingface-hub==0.25.1 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # accelerate + # datasets + # tokenizers + # transformers +idna==3.10 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # requests + # yarl +jinja2==3.1.4 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # torch +markdown-it-py==3.0.0 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # rich +markupsafe==2.1.5 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # jinja2 +mdurl==0.1.2 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # markdown-it-py +mpmath==1.3.0 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # sympy +multidict==6.1.0 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # aiohttp + # yarl +multiprocess==0.70.16 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # datasets +networkx==3.3 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # torch +numpy==1.26.4 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # -r benchmarks/llava/requirements.in + # accelerate + # datasets + # pandas + # pyarrow + # transformers +nvidia-ml-py==12.560.30 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # voir +omegaconf==2.3.0 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # voir +ovld==0.3.9 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # voir +packaging==24.1 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # accelerate + # datasets + # huggingface-hub + # transformers 
+pandas==2.2.3 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # datasets +pillow==10.4.0 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # -r benchmarks/llava/requirements.in +psutil==5.9.8 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # accelerate + # voir +ptera==1.4.1 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # voir +pyarrow==17.0.0 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # datasets +pygments==2.18.0 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # rich +python-dateutil==2.9.0.post0 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # pandas +pytorch-triton-rocm==3.0.0 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # torch +pytz==2024.2 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # pandas +pyyaml==6.0.2 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # accelerate + # datasets + # huggingface-hub + # omegaconf + # transformers +reactivex==4.0.4 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # giving +regex==2024.9.11 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # transformers +requests==2.32.3 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # datasets + # huggingface-hub + # transformers +rich==13.9.1 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # voir +safetensors==0.4.5 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # accelerate + # transformers +six==1.16.0 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # asttokens + # python-dateutil +sympy==1.13.3 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # torch +tokenizers==0.19.1 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # transformers +torch==2.4.1+rocm6.1 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # -r benchmarks/llava/requirements.in + # accelerate +tqdm==4.66.5 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # datasets + # huggingface-hub + # transformers +transformers==4.44.2 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # -c .pin/../constraints/rocm.txt + # -r benchmarks/llava/requirements.in +typing-extensions==4.12.2 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # huggingface-hub + # multidict + # reactivex + # rich + # torch +tzdata==2024.2 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # pandas +urllib3==2.2.3 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # requests +varname==0.13.3 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # giving +voir==0.2.19 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # -c .pin/../constraints/rocm.txt + # -r benchmarks/llava/requirements.in +xxhash==3.5.0 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # datasets +yarl==1.13.1 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # aiohttp diff --git a/benchmarks/llm/requirements.rocm.txt b/benchmarks/llm/requirements.rocm.txt index ab5098d0..055089f0 100644 --- a/benchmarks/llm/requirements.rocm.txt +++ b/benchmarks/llm/requirements.rocm.txt @@ -4,13 +4,17 @@ # # pip-compile --output-file=benchmarks/llm/requirements.rocm.txt .pin/tmp-constraints-rocm-llm-full-mp-nodes.txt benchmarks/llm/requirements.in # ---extra-index-url https://download.pytorch.org/whl/rocm6.0 +--extra-index-url https://download.pytorch.org/whl/rocm6.1 -aiohappyeyeballs==2.4.0 +accelerate==0.34.2 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # -r benchmarks/llm/requirements.in +aiohappyeyeballs==2.4.3 # via # -c .pin/../.pin/constraints-rocm-torch.txt # aiohttp -aiohttp==3.10.5 
+aiohttp==3.10.8 # via # -c .pin/../.pin/constraints-rocm-torch.txt # datasets @@ -39,11 +43,12 @@ attrs==24.2.0 # via # -c .pin/../.pin/constraints-rocm-torch.txt # aiohttp -blobfile==2.1.1 +blobfile==3.0.0 # via # -c .pin/../.pin/constraints-rocm-torch.txt + # -r benchmarks/llm/requirements.txt # torchtune -certifi==2024.7.4 +certifi==2024.8.30 # via # -c .pin/../.pin/constraints-rocm-torch.txt # requests @@ -51,11 +56,11 @@ charset-normalizer==3.3.2 # via # -c .pin/../.pin/constraints-rocm-torch.txt # requests -codefind==0.1.6 +codefind==0.1.7 # via # -c .pin/../.pin/constraints-rocm-torch.txt # ptera -datasets==2.21.0 +datasets==3.0.1 # via # -c .pin/../.pin/constraints-rocm-torch.txt # torchtune @@ -64,11 +69,16 @@ dill==0.3.8 # -c .pin/../.pin/constraints-rocm-torch.txt # datasets # multiprocess -executing==1.2.0 +executing==2.1.0 # via # -c .pin/../.pin/constraints-rocm-torch.txt # varname -filelock==3.15.4 +fairscale==0.4.13 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # -r benchmarks/llm/requirements.in + # -r benchmarks/llm/requirements.txt +filelock==3.16.1 # via # -c .pin/../.pin/constraints-rocm-torch.txt # blobfile @@ -76,6 +86,11 @@ filelock==3.15.4 # huggingface-hub # pytorch-triton-rocm # torch + # transformers +fire==0.7.0 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # -r benchmarks/llm/requirements.txt frozenlist==1.4.1 # via # -c .pin/../.pin/constraints-rocm-torch.txt @@ -87,7 +102,7 @@ fsspec[http]==2024.6.1 # datasets # huggingface-hub # torch -giving==0.4.2 +giving==0.4.3 # via # -c .pin/../.pin/constraints-rocm-torch.txt # ptera @@ -96,17 +111,20 @@ hjson==3.1.0 # via # -c .pin/../.pin/constraints-rocm-torch.txt # argklass -huggingface-hub==0.24.6 +huggingface-hub==0.25.1 # via # -c .pin/../.pin/constraints-rocm-torch.txt + # accelerate # datasets + # tokenizers # torchtune -idna==3.7 + # transformers +idna==3.10 # via # -c .pin/../.pin/constraints-rocm-torch.txt # requests # yarl -importlib-resources==6.4.3 +importlib-resources==6.4.5 # via # -c .pin/../.pin/constraints-rocm-torch.txt # argklass @@ -114,7 +132,7 @@ jinja2==3.1.4 # via # -c .pin/../.pin/constraints-rocm-torch.txt # torch -lxml==4.9.4 +lxml==5.3.0 # via # -c .pin/../.pin/constraints-rocm-torch.txt # blobfile @@ -134,7 +152,7 @@ mpmath==1.3.0 # via # -c .pin/../.pin/constraints-rocm-torch.txt # sympy -multidict==6.0.5 +multidict==6.1.0 # via # -c .pin/../.pin/constraints-rocm-torch.txt # aiohttp @@ -150,31 +168,41 @@ networkx==3.3 numpy==1.26.4 # via # -c .pin/../.pin/constraints-rocm-torch.txt + # accelerate # datasets + # fairscale # pandas # pyarrow # torchtune + # transformers +nvidia-ml-py==12.560.30 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # voir omegaconf==2.3.0 # via # -c .pin/../.pin/constraints-rocm-torch.txt # torchtune # voir -ovld==0.3.8 +ovld==0.3.9 # via # -c .pin/../.pin/constraints-rocm-torch.txt # voir packaging==24.1 # via # -c .pin/../.pin/constraints-rocm-torch.txt + # accelerate # datasets # huggingface-hub -pandas==2.2.2 + # transformers +pandas==2.2.3 # via # -c .pin/../.pin/constraints-rocm-torch.txt # datasets psutil==5.9.8 # via # -c .pin/../.pin/constraints-rocm-torch.txt + # accelerate # voir ptera==1.4.1 # via @@ -184,7 +212,7 @@ pyarrow==17.0.0 # via # -c .pin/../.pin/constraints-rocm-torch.txt # datasets -pycryptodomex==3.20.0 +pycryptodomex==3.21.0 # via # -c .pin/../.pin/constraints-rocm-torch.txt # blobfile @@ -192,10 +220,6 @@ pygments==2.18.0 # via # -c .pin/../.pin/constraints-rocm-torch.txt # rich -pynvml==11.5.3 - # via - 
# -c .pin/../.pin/constraints-rocm-torch.txt - # voir python-dateutil==2.9.0.post0 # via # -c .pin/../.pin/constraints-rocm-torch.txt @@ -204,7 +228,7 @@ pytorch-triton-rocm==3.0.0 # via # -c .pin/../.pin/constraints-rocm-torch.txt # torch -pytz==2024.1 +pytz==2024.2 # via # -c .pin/../.pin/constraints-rocm-torch.txt # pandas @@ -212,31 +236,37 @@ pyyaml==6.0.2 # via # -c .pin/../.pin/constraints-rocm-torch.txt # -r benchmarks/llm/requirements.in + # accelerate # datasets # huggingface-hub # omegaconf + # transformers reactivex==4.0.4 # via # -c .pin/../.pin/constraints-rocm-torch.txt # giving -regex==2024.7.24 +regex==2024.9.11 # via # -c .pin/../.pin/constraints-rocm-torch.txt # tiktoken + # transformers requests==2.32.3 # via # -c .pin/../.pin/constraints-rocm-torch.txt # datasets # huggingface-hub # tiktoken -rich==13.7.1 + # transformers +rich==13.9.1 # via # -c .pin/../.pin/constraints-rocm-torch.txt # voir -safetensors==0.4.4 +safetensors==0.4.5 # via # -c .pin/../.pin/constraints-rocm-torch.txt + # accelerate # torchtune + # transformers sentencepiece==0.2.0 # via # -c .pin/../.pin/constraints-rocm-torch.txt @@ -246,25 +276,39 @@ six==1.16.0 # -c .pin/../.pin/constraints-rocm-torch.txt # asttokens # python-dateutil -sympy==1.13.2 +sympy==1.13.3 # via # -c .pin/../.pin/constraints-rocm-torch.txt # torch +termcolor==2.4.0 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # fire tiktoken==0.7.0 # via # -c .pin/../.pin/constraints-rocm-torch.txt # torchtune -torch==2.4.0+rocm6.0 +tokenizers==0.19.1 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # transformers +torch==2.4.1+rocm6.1 # via # -c .pin/../.pin/constraints-rocm-torch.txt # -r benchmarks/llm/requirements.in + # -r benchmarks/llm/requirements.txt + # accelerate + # fairscale torchao==0.3.1 # via # -c .pin/../.pin/constraints-rocm-torch.txt + # -c .pin/../constraints/rocm.txt + # -r benchmarks/llm/requirements.in # torchtune torchtune==0.2.1 # via # -c .pin/../.pin/constraints-rocm-torch.txt + # -c .pin/../constraints/rocm.txt # -r benchmarks/llm/requirements.in tqdm==4.66.5 # via @@ -272,26 +316,34 @@ tqdm==4.66.5 # datasets # huggingface-hub # torchtune + # transformers +transformers==4.44.2 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # -c .pin/../constraints/rocm.txt + # -r benchmarks/llm/requirements.in typing-extensions==4.12.2 # via # -c .pin/../.pin/constraints-rocm-torch.txt # huggingface-hub + # multidict # reactivex + # rich # torch -tzdata==2024.1 +tzdata==2024.2 # via # -c .pin/../.pin/constraints-rocm-torch.txt # pandas -urllib3==2.2.2 +urllib3==2.2.3 # via # -c .pin/../.pin/constraints-rocm-torch.txt # blobfile # requests -varname==0.10.0 +varname==0.13.3 # via # -c .pin/../.pin/constraints-rocm-torch.txt # giving -voir==0.2.17 +voir==0.2.19 # via # -c .pin/../.pin/constraints-rocm-torch.txt # -c .pin/../constraints/rocm.txt @@ -300,7 +352,7 @@ xxhash==3.5.0 # via # -c .pin/../.pin/constraints-rocm-torch.txt # datasets -yarl==1.9.4 +yarl==1.13.1 # via # -c .pin/../.pin/constraints-rocm-torch.txt # aiohttp diff --git a/benchmarks/purejaxrl/benchfile.py b/benchmarks/purejaxrl/benchfile.py index 08a51cef..ab1c0ee7 100644 --- a/benchmarks/purejaxrl/benchfile.py +++ b/benchmarks/purejaxrl/benchfile.py @@ -18,7 +18,9 @@ class Template(Package): def make_env(self): # Return a dict of environment variables for prepare_script and # main_script. 
-        return super().make_env()
+        env = super().make_env()
+        env["XLA_PYTHON_CLIENT_PREALLOCATE"] = "False"
+        return env
 
     async def install(self):
         await super().install()  # super() call installs the requirements
diff --git a/benchmarks/purejaxrl/main.py b/benchmarks/purejaxrl/main.py
index f37c45e0..c3a3630d 100644
--- a/benchmarks/purejaxrl/main.py
+++ b/benchmarks/purejaxrl/main.py
@@ -6,6 +6,7 @@
 
 import argklass
+import torch # This is a bit of a trick to make jax use torch's packaged libs
 
 from dqn import add_dqn_command, main as dqn_main
 from ppo import add_ppo_command, main as ppo_main
 
diff --git a/benchmarks/purejaxrl/requirements.rocm.txt b/benchmarks/purejaxrl/requirements.rocm.txt
new file mode 100644
index 00000000..226415e0
--- /dev/null
+++ b/benchmarks/purejaxrl/requirements.rocm.txt
@@ -0,0 +1,693 @@
+#
+# This file is autogenerated by pip-compile with Python 3.10
+# by the following command:
+#
+#    pip-compile --output-file=benchmarks/purejaxrl/requirements.rocm.txt .pin/tmp-constraints-rocm-ppo.txt benchmarks/purejaxrl/requirements.in
+#
+--extra-index-url https://download.pytorch.org/whl/rocm6.1
+
+absl-py==2.1.0
+    # via
+    #   -c .pin/../.pin/constraints-rocm-torch.txt
+    #   brax
+    #   chex
+    #   distrax
+    #   dm-env
+    #   ml-collections
+    #   mujoco
+    #   mujoco-mjx
+    #   optax
+    #   orbax-checkpoint
+    #   rlax
+    #   tensorflow-probability
+antlr4-python3-runtime==4.9.3
+    # via
+    #   -c .pin/../.pin/constraints-rocm-torch.txt
+    #   omegaconf
+argklass==1.4.4
+    # via
+    #   -c .pin/../.pin/constraints-rocm-torch.txt
+    #   -r benchmarks/purejaxrl/requirements.in
+astroid==3.3.4
+    # via
+    #   -c .pin/../.pin/constraints-rocm-torch.txt
+    #   pylint
+asttokens==2.4.1
+    # via
+    #   -c .pin/../.pin/constraints-rocm-torch.txt
+    #   giving
+black==24.8.0
+    # via
+    #   -c .pin/../.pin/constraints-rocm-torch.txt
+    #   navix
+blinker==1.8.2
+    # via
+    #   -c .pin/../.pin/constraints-rocm-torch.txt
+    #   flask
+brax==0.10.5
+    # via
+    #   -c .pin/../.pin/constraints-rocm-torch.txt
+    #   -r benchmarks/purejaxrl/requirements.in
+certifi==2024.8.30
+    # via
+    #   -c .pin/../.pin/constraints-rocm-torch.txt
+    #   requests
+    #   sentry-sdk
+charset-normalizer==3.3.2
+    # via
+    #   -c .pin/../.pin/constraints-rocm-torch.txt
+    #   requests
+chex==0.1.87
+    # via
+    #   -c .pin/../.pin/constraints-rocm-torch.txt
+    #   distrax
+    #   evosax
+    #   flashbax
+    #   gymnax
+    #   optax
+    #   rlax
+click==8.1.7
+    # via
+    #   -c .pin/../.pin/constraints-rocm-torch.txt
+    #   black
+    #   flask
+    #   wandb
+cloudpickle==3.0.0
+    # via
+    #   -c .pin/../.pin/constraints-rocm-torch.txt
+    #   gym
+    #   gymnasium
+    #   tensorflow-probability
+codefind==0.1.7
+    # via
+    #   -c .pin/../.pin/constraints-rocm-torch.txt
+    #   ptera
+contextlib2==21.6.0
+    # via
+    #   -c .pin/../.pin/constraints-rocm-torch.txt
+    #   ml-collections
+contourpy==1.3.0
+    # via
+    #   -c .pin/../.pin/constraints-rocm-torch.txt
+    #   matplotlib
+cycler==0.12.1
+    # via
+    #   -c .pin/../.pin/constraints-rocm-torch.txt
+    #   matplotlib
+decorator==5.1.1
+    # via
+    #   -c .pin/../.pin/constraints-rocm-torch.txt
+    #   tensorflow-probability
+dill==0.3.8
+    # via
+    #   -c .pin/../.pin/constraints-rocm-torch.txt
+    #   pylint
+distrax==0.1.5
+    # via
+    #   -c .pin/../.pin/constraints-rocm-torch.txt
+    #   -r benchmarks/purejaxrl/requirements.in
+    #   rlax
+dm-env==1.6
+    # via
+    #   -c .pin/../.pin/constraints-rocm-torch.txt
+    #   brax
+    #   rlax
+dm-tree==0.1.8
+    # via
+    #   -c .pin/../.pin/constraints-rocm-torch.txt
+    #   dm-env
+    #   tensorflow-probability
+docker-pycreds==0.4.0
+    # via
+    #   -c .pin/../.pin/constraints-rocm-torch.txt
+    #   wandb
+docstring-parser==0.16
+    # via
+    #   -c
.pin/../.pin/constraints-rocm-torch.txt + # tyro +dotmap==1.3.30 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # evosax +etils[epath,epy]==1.9.4 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # brax + # mujoco + # mujoco-mjx + # optax + # orbax-checkpoint +evosax==0.1.6 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # -r benchmarks/purejaxrl/requirements.in +exceptiongroup==1.2.2 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # pytest +executing==2.1.0 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # varname +farama-notifications==0.0.4 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # gymnasium +filelock==3.16.1 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # pytorch-triton-rocm + # torch +flake8==7.1.1 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # navix +flashbax==0.1.2 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # -r benchmarks/purejaxrl/requirements.in +flask==3.0.3 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # brax + # flask-cors +flask-cors==5.0.0 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # brax +flax==0.9.0 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # -r benchmarks/purejaxrl/requirements.in + # brax + # evosax + # flashbax + # gymnax + # navix +fonttools==4.54.1 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # matplotlib +fsspec==2024.6.1 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # etils + # torch +gast==0.6.0 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # tensorflow-probability +gitdb==4.0.11 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # gitpython +gitpython==3.1.43 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # wandb +giving==0.4.3 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # ptera + # voir +glfw==2.7.0 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # mujoco +grpcio==1.66.2 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # brax +gym==0.26.2 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # brax + # gymnax +gym-notices==0.0.8 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # gym +gymnasium==0.29.1 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # gymnax +gymnax==0.0.8 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # -c .pin/../constraints/rocm.txt + # -r benchmarks/purejaxrl/requirements.in +hjson==3.1.0 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # argklass +humanize==4.10.0 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # orbax-checkpoint +idna==3.10 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # requests +importlib-resources==6.4.5 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # argklass + # etils +iniconfig==2.0.0 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # pytest +isort==5.13.2 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # pylint +itsdangerous==2.2.0 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # flask +jax==0.4.33 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # -r benchmarks/purejaxrl/requirements.in + # brax + # chex + # distrax + # evosax + # flashbax + # flax + # gymnax + # jaxopt + # mujoco-mjx + # optax + # orbax-checkpoint + # rlax +jaxlib==0.4.33 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # brax + # chex + # distrax + # evosax + # flashbax + # gymnax + # jax + # jaxopt + # mujoco-mjx + # optax + # orbax-checkpoint + # rlax +jaxopt==0.8.3 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + 
# brax +jinja2==3.1.4 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # brax + # flask + # torch +kiwisolver==1.4.7 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # matplotlib +markdown-it-py==3.0.0 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # rich +markupsafe==2.1.5 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # jinja2 + # werkzeug +matplotlib==3.9.2 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # evosax + # gymnax + # seaborn +mccabe==0.7.0 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # flake8 + # pylint +mdurl==0.1.2 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # markdown-it-py +ml-collections==0.1.1 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # brax +ml-dtypes==0.5.0 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # jax + # jaxlib + # tensorstore +mpmath==1.3.0 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # sympy +msgpack==1.1.0 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # flax + # orbax-checkpoint +mujoco==3.2.3 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # brax + # mujoco-mjx +mujoco-mjx==3.2.3 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # brax +mypy-extensions==1.0.0 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # black +navix==0.7.0 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # -r benchmarks/purejaxrl/requirements.in +nest-asyncio==1.6.0 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # orbax-checkpoint +networkx==3.3 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # torch +numpy==1.26.4 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # -r benchmarks/purejaxrl/requirements.in + # brax + # chex + # contourpy + # distrax + # dm-env + # evosax + # flashbax + # gym + # gymnasium + # jax + # jaxlib + # jaxopt + # matplotlib + # ml-dtypes + # mujoco + # navix + # optax + # orbax-checkpoint + # pandas + # rlax + # scipy + # seaborn + # tensorboardx + # tensorflow-probability + # tensorstore + # trimesh +nvidia-ml-py==12.560.30 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # voir +omegaconf==2.3.0 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # voir +opt-einsum==3.4.0 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # jax +optax==0.2.3 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # -r benchmarks/purejaxrl/requirements.in + # brax + # flax +orbax-checkpoint==0.6.4 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # brax + # flax +ovld==0.3.9 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # voir +packaging==24.1 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # black + # matplotlib + # pytest + # setuptools-scm + # tensorboardx +pandas==2.2.3 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # seaborn +pathspec==0.12.1 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # black +pillow==10.4.0 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # brax + # matplotlib + # navix +platformdirs==4.3.6 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # black + # pylint + # wandb +pluggy==1.5.0 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # pytest +protobuf==5.28.2 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # orbax-checkpoint + # tensorboardx + # wandb +psutil==5.9.8 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # voir + # wandb +ptera==1.4.1 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # voir +pycodestyle==2.12.1 + # via + # -c 
.pin/../.pin/constraints-rocm-torch.txt + # flake8 +pyflakes==3.2.0 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # flake8 +pygments==2.18.0 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # rich +pylint==3.3.1 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # navix +pyopengl==3.1.7 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # mujoco +pyparsing==3.1.4 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # matplotlib +pytest==8.3.3 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # navix +python-dateutil==2.9.0.post0 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # matplotlib + # pandas +pytinyrenderer==0.0.14 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # brax +pytorch-triton-rocm==3.0.0 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # torch +pytz==2024.2 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # pandas +pyyaml==6.0.2 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # evosax + # flax + # gymnax + # ml-collections + # omegaconf + # orbax-checkpoint + # wandb +reactivex==4.0.4 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # giving +requests==2.32.3 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # wandb +rich==13.9.1 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # flax + # tyro + # voir +rlax==0.1.6 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # navix +scipy==1.14.1 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # brax + # jax + # jaxlib + # jaxopt + # mujoco-mjx +seaborn==0.13.2 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # gymnax +sentry-sdk==2.15.0 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # wandb +setproctitle==1.3.3 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # wandb +setuptools-scm==8.1.0 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # navix +shtab==1.7.1 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # tyro +six==1.16.0 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # asttokens + # docker-pycreds + # ml-collections + # python-dateutil + # tensorflow-probability +smmap==5.0.1 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # gitdb +sympy==1.13.3 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # torch +tensorboardx==2.6.2.2 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # brax +tensorflow-probability==0.24.0 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # distrax +tensorstore==0.1.66 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # flashbax + # flax + # orbax-checkpoint +tomli==2.0.2 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # black + # pylint + # pytest + # setuptools-scm +tomlkit==0.13.2 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # pylint +toolz==0.12.1 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # chex +torch==2.4.1+rocm6.1 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # -r benchmarks/purejaxrl/requirements.in +trimesh==4.4.9 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # brax + # mujoco-mjx +typing-extensions==4.12.2 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # astroid + # black + # brax + # chex + # etils + # flashbax + # flax + # gymnasium + # navix + # orbax-checkpoint + # reactivex + # rich + # torch + # tyro +tyro==0.8.11 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # navix +tzdata==2024.2 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # pandas +urllib3==2.2.3 + # via + # -c 
.pin/../.pin/constraints-rocm-torch.txt + # requests + # sentry-sdk +varname==0.13.3 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # giving +voir==0.2.19 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # -c .pin/../constraints/rocm.txt + # -r benchmarks/purejaxrl/requirements.in +wandb==0.18.3 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # navix +werkzeug==3.0.4 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # flask +zipp==3.20.2 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # etils + +# The following packages are considered to be unsafe in a requirements file: +# setuptools diff --git a/benchmarks/recursiongfn/requirements.rocm.txt b/benchmarks/recursiongfn/requirements.rocm.txt index 1bc73f14..bcb64cdb 100644 --- a/benchmarks/recursiongfn/requirements.rocm.txt +++ b/benchmarks/recursiongfn/requirements.rocm.txt @@ -2,201 +2,198 @@ # This file is autogenerated by pip-compile with Python 3.10 # by the following command: # -# pip-compile --output-file=benchmarks/recursiongfn/requirements.rocm.txt .pin/tmp-constraints-rocm-recursiongfn_gnn.txt benchmarks/recursiongfn/requirements.in +# pip-compile --output-file=benchmarks/recursiongfn/requirements.rocm.txt .pin/tmp-constraints-rocm-recursiongfn.txt benchmarks/recursiongfn/requirements.in # ---extra-index-url https://download.pytorch.org/whl/rocm6.0 +--extra-index-url https://download.pytorch.org/whl/rocm6.1 absl-py==2.1.0 # via - # -c .pin/../.pin/constraints-rocm-gnn.txt + # -c .pin/../.pin/constraints-rocm-torch.txt # tensorboard -aiohappyeyeballs==2.4.0 +aiohappyeyeballs==2.4.3 # via - # -c .pin/../.pin/constraints-rocm-gnn.txt + # -c .pin/../.pin/constraints-rocm-torch.txt # aiohttp -aiohttp==3.10.5 +aiohttp==3.10.8 # via - # -c .pin/../.pin/constraints-rocm-gnn.txt + # -c .pin/../.pin/constraints-rocm-torch.txt # torch-geometric aiosignal==1.3.1 # via - # -c .pin/../.pin/constraints-rocm-gnn.txt + # -c .pin/../.pin/constraints-rocm-torch.txt # aiohttp antlr4-python3-runtime==4.9.3 # via - # -c .pin/../.pin/constraints-rocm-gnn.txt + # -c .pin/../.pin/constraints-rocm-torch.txt # omegaconf asttokens==2.4.1 # via - # -c .pin/../.pin/constraints-rocm-gnn.txt + # -c .pin/../.pin/constraints-rocm-torch.txt # giving async-timeout==4.0.3 # via - # -c .pin/../.pin/constraints-rocm-gnn.txt + # -c .pin/../.pin/constraints-rocm-torch.txt # aiohttp attrs==24.2.0 # via - # -c .pin/../.pin/constraints-rocm-gnn.txt + # -c .pin/../.pin/constraints-rocm-torch.txt # aiohttp blosc2==2.7.1 # via - # -c .pin/../.pin/constraints-rocm-gnn.txt + # -c .pin/../.pin/constraints-rocm-torch.txt # tables -botorch==0.11.3 +botorch==0.12.0 # via - # -c .pin/../.pin/constraints-rocm-gnn.txt - # gflownet -certifi==2024.7.4 + # -c .pin/../.pin/constraints-rocm-torch.txt + # -r benchmarks/recursiongfn/requirements.in +certifi==2024.8.30 # via - # -c .pin/../.pin/constraints-rocm-gnn.txt + # -c .pin/../.pin/constraints-rocm-torch.txt # requests # sentry-sdk charset-normalizer==3.3.2 # via - # -c .pin/../.pin/constraints-rocm-gnn.txt + # -c .pin/../.pin/constraints-rocm-torch.txt # requests click==8.1.7 # via - # -c .pin/../.pin/constraints-rocm-gnn.txt + # -c .pin/../.pin/constraints-rocm-torch.txt # wandb -codefind==0.1.6 +codefind==0.1.7 # via - # -c .pin/../.pin/constraints-rocm-gnn.txt + # -c .pin/../.pin/constraints-rocm-torch.txt # ptera cvxopt==1.3.2 # via - # -c .pin/../.pin/constraints-rocm-gnn.txt - # gflownet + # -c .pin/../.pin/constraints-rocm-torch.txt + # -r benchmarks/recursiongfn/requirements.in 
docker-pycreds==0.4.0 # via - # -c .pin/../.pin/constraints-rocm-gnn.txt + # -c .pin/../.pin/constraints-rocm-torch.txt # wandb -executing==1.2.0 +executing==2.1.0 # via - # -c .pin/../.pin/constraints-rocm-gnn.txt + # -c .pin/../.pin/constraints-rocm-torch.txt # varname -filelock==3.15.4 +filelock==3.16.1 # via - # -c .pin/../.pin/constraints-rocm-gnn.txt + # -c .pin/../.pin/constraints-rocm-torch.txt # pytorch-triton-rocm # torch frozenlist==1.4.1 # via - # -c .pin/../.pin/constraints-rocm-gnn.txt + # -c .pin/../.pin/constraints-rocm-torch.txt # aiohttp # aiosignal fsspec==2024.6.1 # via - # -c .pin/../.pin/constraints-rocm-gnn.txt + # -c .pin/../.pin/constraints-rocm-torch.txt # torch # torch-geometric -gflownet @ git+https://github.com/Delaunay/gflownet@milabench - # via - # -c .pin/../.pin/constraints-rocm-gnn.txt - # -r benchmarks/recursiongfn/requirements.in gitdb==4.0.11 # via - # -c .pin/../.pin/constraints-rocm-gnn.txt + # -c .pin/../.pin/constraints-rocm-torch.txt # gitpython gitpython==3.1.43 # via - # -c .pin/../.pin/constraints-rocm-gnn.txt - # gflownet + # -c .pin/../.pin/constraints-rocm-torch.txt + # -r benchmarks/recursiongfn/requirements.in # wandb -giving==0.4.2 +giving==0.4.3 # via - # -c .pin/../.pin/constraints-rocm-gnn.txt + # -c .pin/../.pin/constraints-rocm-torch.txt # ptera # voir -gpytorch==1.12 +gpytorch==1.13 # via - # -c .pin/../.pin/constraints-rocm-gnn.txt + # -c .pin/../.pin/constraints-rocm-torch.txt + # -r benchmarks/recursiongfn/requirements.in # botorch - # gflownet -grpcio==1.65.5 +grpcio==1.66.2 # via - # -c .pin/../.pin/constraints-rocm-gnn.txt + # -c .pin/../.pin/constraints-rocm-torch.txt # tensorboard -idna==3.7 +idna==3.10 # via - # -c .pin/../.pin/constraints-rocm-gnn.txt + # -c .pin/../.pin/constraints-rocm-torch.txt # requests # yarl -jaxtyping==0.2.33 +jaxtyping==0.2.19 # via - # -c .pin/../.pin/constraints-rocm-gnn.txt + # -c .pin/../.pin/constraints-rocm-torch.txt + # gpytorch # linear-operator jinja2==3.1.4 # via - # -c .pin/../.pin/constraints-rocm-gnn.txt + # -c .pin/../.pin/constraints-rocm-torch.txt # torch # torch-geometric joblib==1.4.2 # via - # -c .pin/../.pin/constraints-rocm-gnn.txt + # -c .pin/../.pin/constraints-rocm-torch.txt # scikit-learn -linear-operator==0.5.2 +linear-operator==0.5.3 # via - # -c .pin/../.pin/constraints-rocm-gnn.txt + # -c .pin/../.pin/constraints-rocm-torch.txt # botorch # gpytorch markdown==3.7 # via - # -c .pin/../.pin/constraints-rocm-gnn.txt + # -c .pin/../.pin/constraints-rocm-torch.txt # tensorboard markdown-it-py==3.0.0 # via - # -c .pin/../.pin/constraints-rocm-gnn.txt + # -c .pin/../.pin/constraints-rocm-torch.txt # rich markupsafe==2.1.5 # via - # -c .pin/../.pin/constraints-rocm-gnn.txt + # -c .pin/../.pin/constraints-rocm-torch.txt # jinja2 # werkzeug mdurl==0.1.2 # via - # -c .pin/../.pin/constraints-rocm-gnn.txt + # -c .pin/../.pin/constraints-rocm-torch.txt # markdown-it-py mpmath==1.3.0 # via - # -c .pin/../.pin/constraints-rocm-gnn.txt + # -c .pin/../.pin/constraints-rocm-torch.txt # botorch # gpytorch + # linear-operator # sympy -msgpack==1.0.8 +msgpack==1.1.0 # via - # -c .pin/../.pin/constraints-rocm-gnn.txt + # -c .pin/../.pin/constraints-rocm-torch.txt # blosc2 -multidict==6.0.5 +multidict==6.1.0 # via - # -c .pin/../.pin/constraints-rocm-gnn.txt + # -c .pin/../.pin/constraints-rocm-torch.txt # aiohttp # yarl multipledispatch==1.0.0 # via - # -c .pin/../.pin/constraints-rocm-gnn.txt + # -c .pin/../.pin/constraints-rocm-torch.txt # botorch -ndindex==1.8 +ndindex==1.9.2 # via - # -c 
.pin/../.pin/constraints-rocm-gnn.txt + # -c .pin/../.pin/constraints-rocm-torch.txt # blosc2 networkx==3.3 # via - # -c .pin/../.pin/constraints-rocm-gnn.txt - # gflownet + # -c .pin/../.pin/constraints-rocm-torch.txt + # -r benchmarks/recursiongfn/requirements.in # torch numexpr==2.10.1 # via - # -c .pin/../.pin/constraints-rocm-gnn.txt + # -c .pin/../.pin/constraints-rocm-torch.txt # blosc2 # tables numpy==1.26.4 # via - # -c .pin/../.pin/constraints-rocm-gnn.txt + # -c .pin/../.pin/constraints-rocm-torch.txt # blosc2 - # botorch + # jaxtyping # numexpr - # opt-einsum # pandas # pyarrow # pyro-ppl @@ -206,239 +203,240 @@ numpy==1.26.4 # tables # tensorboard # torch-geometric +nvidia-ml-py==12.560.30 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # voir omegaconf==2.3.0 # via - # -c .pin/../.pin/constraints-rocm-gnn.txt - # gflownet + # -c .pin/../.pin/constraints-rocm-torch.txt + # -r benchmarks/recursiongfn/requirements.in # voir -opt-einsum==3.3.0 +opt-einsum==3.4.0 # via - # -c .pin/../.pin/constraints-rocm-gnn.txt + # -c .pin/../.pin/constraints-rocm-torch.txt # pyro-ppl -ovld==0.3.8 +ovld==0.3.9 # via - # -c .pin/../.pin/constraints-rocm-gnn.txt + # -c .pin/../.pin/constraints-rocm-torch.txt # voir packaging==24.1 # via - # -c .pin/../.pin/constraints-rocm-gnn.txt + # -c .pin/../.pin/constraints-rocm-torch.txt # tables # tensorboard -pandas==2.2.2 +pandas==2.2.3 # via - # -c .pin/../.pin/constraints-rocm-gnn.txt - # gflownet + # -c .pin/../.pin/constraints-rocm-torch.txt + # -r benchmarks/recursiongfn/requirements.in pillow==10.4.0 # via - # -c .pin/../.pin/constraints-rocm-gnn.txt + # -c .pin/../.pin/constraints-rocm-torch.txt # rdkit -platformdirs==4.2.2 +platformdirs==4.3.6 # via - # -c .pin/../.pin/constraints-rocm-gnn.txt + # -c .pin/../.pin/constraints-rocm-torch.txt # wandb -protobuf==5.27.3 +protobuf==5.28.2 # via - # -c .pin/../.pin/constraints-rocm-gnn.txt + # -c .pin/../.pin/constraints-rocm-torch.txt # tensorboard # wandb psutil==5.9.8 # via - # -c .pin/../.pin/constraints-rocm-gnn.txt + # -c .pin/../.pin/constraints-rocm-torch.txt # torch-geometric # voir # wandb ptera==1.4.1 # via - # -c .pin/../.pin/constraints-rocm-gnn.txt + # -c .pin/../.pin/constraints-rocm-torch.txt # voir py-cpuinfo==9.0.0 # via - # -c .pin/../.pin/constraints-rocm-gnn.txt + # -c .pin/../.pin/constraints-rocm-torch.txt # blosc2 # tables pyarrow==17.0.0 # via - # -c .pin/../.pin/constraints-rocm-gnn.txt - # gflownet + # -c .pin/../.pin/constraints-rocm-torch.txt + # -r benchmarks/recursiongfn/requirements.in pygments==2.18.0 # via - # -c .pin/../.pin/constraints-rocm-gnn.txt + # -c .pin/../.pin/constraints-rocm-torch.txt # rich -pynvml==11.5.3 - # via - # -c .pin/../.pin/constraints-rocm-gnn.txt - # voir -pyparsing==3.1.2 +pyparsing==3.1.4 # via - # -c .pin/../.pin/constraints-rocm-gnn.txt + # -c .pin/../.pin/constraints-rocm-torch.txt # torch-geometric pyro-api==0.1.2 # via - # -c .pin/../.pin/constraints-rocm-gnn.txt + # -c .pin/../.pin/constraints-rocm-torch.txt # pyro-ppl pyro-ppl==1.9.1 # via - # -c .pin/../.pin/constraints-rocm-gnn.txt + # -c .pin/../.pin/constraints-rocm-torch.txt + # -r benchmarks/recursiongfn/requirements.in # botorch - # gflownet python-dateutil==2.9.0.post0 # via - # -c .pin/../.pin/constraints-rocm-gnn.txt + # -c .pin/../.pin/constraints-rocm-torch.txt # pandas pytorch-triton-rocm==3.0.0 # via - # -c .pin/../.pin/constraints-rocm-gnn.txt + # -c .pin/../.pin/constraints-rocm-torch.txt # torch -pytz==2024.1 +pytz==2024.2 # via - # -c 
.pin/../.pin/constraints-rocm-gnn.txt + # -c .pin/../.pin/constraints-rocm-torch.txt # pandas pyyaml==6.0.2 # via - # -c .pin/../.pin/constraints-rocm-gnn.txt + # -c .pin/../.pin/constraints-rocm-torch.txt # omegaconf # wandb rdkit==2024.3.5 # via - # -c .pin/../.pin/constraints-rocm-gnn.txt - # gflownet + # -c .pin/../.pin/constraints-rocm-torch.txt + # -r benchmarks/recursiongfn/requirements.in reactivex==4.0.4 # via - # -c .pin/../.pin/constraints-rocm-gnn.txt + # -c .pin/../.pin/constraints-rocm-torch.txt # giving requests==2.32.3 # via - # -c .pin/../.pin/constraints-rocm-gnn.txt + # -c .pin/../.pin/constraints-rocm-torch.txt # torch-geometric # wandb -rich==13.7.1 +rich==13.9.1 # via - # -c .pin/../.pin/constraints-rocm-gnn.txt + # -c .pin/../.pin/constraints-rocm-torch.txt # voir -scikit-learn==1.5.1 +scikit-learn==1.5.2 # via - # -c .pin/../.pin/constraints-rocm-gnn.txt + # -c .pin/../.pin/constraints-rocm-torch.txt # gpytorch - # torch-geometric -scipy==1.14.0 +scipy==1.14.1 # via - # -c .pin/../.pin/constraints-rocm-gnn.txt + # -c .pin/../.pin/constraints-rocm-torch.txt + # -r benchmarks/recursiongfn/requirements.in # botorch - # gflownet # gpytorch # linear-operator # scikit-learn # torch-cluster - # torch-geometric # torch-sparse -sentry-sdk==2.13.0 +sentry-sdk==2.15.0 # via - # -c .pin/../.pin/constraints-rocm-gnn.txt + # -c .pin/../.pin/constraints-rocm-torch.txt # wandb setproctitle==1.3.3 # via - # -c .pin/../.pin/constraints-rocm-gnn.txt + # -c .pin/../.pin/constraints-rocm-torch.txt # wandb six==1.16.0 # via - # -c .pin/../.pin/constraints-rocm-gnn.txt + # -c .pin/../.pin/constraints-rocm-torch.txt # asttokens # docker-pycreds # python-dateutil # tensorboard smmap==5.0.1 # via - # -c .pin/../.pin/constraints-rocm-gnn.txt + # -c .pin/../.pin/constraints-rocm-torch.txt # gitdb -sympy==1.13.2 +sympy==1.13.3 # via - # -c .pin/../.pin/constraints-rocm-gnn.txt + # -c .pin/../.pin/constraints-rocm-torch.txt # torch tables==3.10.1 # via - # -c .pin/../.pin/constraints-rocm-gnn.txt - # gflownet -tensorboard==2.17.1 + # -c .pin/../.pin/constraints-rocm-torch.txt + # -r benchmarks/recursiongfn/requirements.in +tensorboard==2.18.0 # via - # -c .pin/../.pin/constraints-rocm-gnn.txt - # gflownet + # -c .pin/../.pin/constraints-rocm-torch.txt + # -r benchmarks/recursiongfn/requirements.in tensorboard-data-server==0.7.2 # via - # -c .pin/../.pin/constraints-rocm-gnn.txt + # -c .pin/../.pin/constraints-rocm-torch.txt # tensorboard threadpoolctl==3.5.0 # via - # -c .pin/../.pin/constraints-rocm-gnn.txt + # -c .pin/../.pin/constraints-rocm-torch.txt # scikit-learn -torch==2.4.0+rocm6.0 +torch==2.4.1+rocm6.1 # via - # -c .pin/../.pin/constraints-rocm-gnn.txt + # -c .pin/../.pin/constraints-rocm-torch.txt # -r benchmarks/recursiongfn/requirements.in # botorch - # gflownet # linear-operator # pyro-ppl torch-cluster==1.6.3 # via - # -c .pin/../.pin/constraints-rocm-gnn.txt - # gflownet -torch-geometric==2.5.3 + # -c .pin/../.pin/constraints-rocm-torch.txt + # -r benchmarks/recursiongfn/requirements.in +torch-geometric==2.6.1 # via - # -c .pin/../.pin/constraints-rocm-gnn.txt - # gflownet + # -c .pin/../.pin/constraints-rocm-torch.txt + # -r benchmarks/recursiongfn/requirements.in torch-scatter==2.1.2 # via - # -c .pin/../.pin/constraints-rocm-gnn.txt - # gflownet + # -c .pin/../.pin/constraints-rocm-torch.txt + # -r benchmarks/recursiongfn/requirements.in torch-sparse==0.6.18 # via - # -c .pin/../.pin/constraints-rocm-gnn.txt - # gflownet + # -c .pin/../.pin/constraints-rocm-torch.txt + # -r 
benchmarks/recursiongfn/requirements.in tqdm==4.66.5 # via - # -c .pin/../.pin/constraints-rocm-gnn.txt + # -c .pin/../.pin/constraints-rocm-torch.txt # pyro-ppl # torch-geometric -typeguard==2.13.3 +typeguard==4.3.0 # via - # -c .pin/../.pin/constraints-rocm-gnn.txt + # -c .pin/../.pin/constraints-rocm-torch.txt # jaxtyping - # linear-operator typing-extensions==4.12.2 # via - # -c .pin/../.pin/constraints-rocm-gnn.txt + # -c .pin/../.pin/constraints-rocm-torch.txt + # botorch + # jaxtyping + # multidict # reactivex + # rich # tables # torch -tzdata==2024.1 + # typeguard +tzdata==2024.2 # via - # -c .pin/../.pin/constraints-rocm-gnn.txt + # -c .pin/../.pin/constraints-rocm-torch.txt # pandas -urllib3==2.2.2 +urllib3==2.2.3 # via - # -c .pin/../.pin/constraints-rocm-gnn.txt + # -c .pin/../.pin/constraints-rocm-torch.txt # requests # sentry-sdk -varname==0.10.0 +varname==0.13.3 # via - # -c .pin/../.pin/constraints-rocm-gnn.txt + # -c .pin/../.pin/constraints-rocm-torch.txt # giving -voir==0.2.17 +voir==0.2.19 # via - # -c .pin/../.pin/constraints-rocm-gnn.txt + # -c .pin/../.pin/constraints-rocm-torch.txt # -c .pin/../constraints/rocm.txt # -r benchmarks/recursiongfn/requirements.in -wandb==0.17.7 +wandb==0.18.3 # via - # -c .pin/../.pin/constraints-rocm-gnn.txt - # gflownet -werkzeug==3.0.3 + # -c .pin/../.pin/constraints-rocm-torch.txt + # -r benchmarks/recursiongfn/requirements.in +werkzeug==3.0.4 # via - # -c .pin/../.pin/constraints-rocm-gnn.txt + # -c .pin/../.pin/constraints-rocm-torch.txt # tensorboard -yarl==1.9.4 +yarl==1.13.1 # via - # -c .pin/../.pin/constraints-rocm-gnn.txt + # -c .pin/../.pin/constraints-rocm-torch.txt # aiohttp # The following packages are considered to be unsafe in a requirements file: diff --git a/benchmarks/rlhf/requirements.in b/benchmarks/rlhf/requirements.in index 045bca09..1cb6cd24 100644 --- a/benchmarks/rlhf/requirements.in +++ b/benchmarks/rlhf/requirements.in @@ -4,3 +4,4 @@ trl accelerate transformers datasets +einops \ No newline at end of file diff --git a/benchmarks/rlhf/requirements.rocm.txt b/benchmarks/rlhf/requirements.rocm.txt new file mode 100644 index 00000000..5b7f2726 --- /dev/null +++ b/benchmarks/rlhf/requirements.rocm.txt @@ -0,0 +1,313 @@ +# +# This file is autogenerated by pip-compile with Python 3.10 +# by the following command: +# +# pip-compile --output-file=benchmarks/rlhf/requirements.rocm.txt .pin/tmp-constraints-rocm-rlhf-gpus.txt benchmarks/rlhf/requirements.in +# +--extra-index-url https://download.pytorch.org/whl/rocm6.1 + +accelerate==0.34.2 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # -r benchmarks/rlhf/requirements.in + # trl +aiohappyeyeballs==2.4.3 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # aiohttp +aiohttp==3.10.8 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # datasets + # fsspec +aiosignal==1.3.1 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # aiohttp +antlr4-python3-runtime==4.9.3 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # omegaconf +asttokens==2.4.1 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # giving +async-timeout==4.0.3 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # aiohttp +attrs==24.2.0 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # aiohttp +certifi==2024.8.30 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # requests +charset-normalizer==3.3.2 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # requests +codefind==0.1.7 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # ptera 
+datasets==3.0.1 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # -r benchmarks/rlhf/requirements.in + # trl +dill==0.3.8 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # datasets + # multiprocess +docstring-parser==0.16 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # tyro +executing==2.1.0 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # varname +filelock==3.16.1 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # datasets + # huggingface-hub + # pytorch-triton-rocm + # torch + # transformers +frozenlist==1.4.1 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # aiohttp + # aiosignal +fsspec[http]==2024.6.1 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # datasets + # huggingface-hub + # torch +giving==0.4.3 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # ptera + # voir +huggingface-hub==0.25.1 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # accelerate + # datasets + # tokenizers + # transformers +idna==3.10 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # requests + # yarl +jinja2==3.1.4 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # torch +markdown-it-py==3.0.0 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # rich +markupsafe==2.1.5 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # jinja2 +mdurl==0.1.2 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # markdown-it-py +mpmath==1.3.0 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # sympy +multidict==6.1.0 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # aiohttp + # yarl +multiprocess==0.70.16 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # datasets +networkx==3.3 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # torch +numpy==1.26.4 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # accelerate + # datasets + # pandas + # pyarrow + # transformers + # trl +nvidia-ml-py==12.560.30 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # voir +omegaconf==2.3.0 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # voir +ovld==0.3.9 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # voir +packaging==24.1 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # accelerate + # datasets + # huggingface-hub + # transformers +pandas==2.2.3 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # datasets +psutil==5.9.8 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # accelerate + # voir +ptera==1.4.1 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # voir +pyarrow==17.0.0 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # datasets +pygments==2.18.0 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # rich +python-dateutil==2.9.0.post0 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # pandas +pytorch-triton-rocm==3.0.0 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # torch +pytz==2024.2 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # pandas +pyyaml==6.0.2 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # accelerate + # datasets + # huggingface-hub + # omegaconf + # transformers +reactivex==4.0.4 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # giving +regex==2024.9.11 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # transformers +requests==2.32.3 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # datasets + # huggingface-hub + # transformers +rich==13.9.1 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # tyro + # voir +safetensors==0.4.5 + # via 
+ # -c .pin/../.pin/constraints-rocm-torch.txt + # accelerate + # transformers +shtab==1.7.1 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # tyro +six==1.16.0 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # asttokens + # python-dateutil +sympy==1.13.3 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # torch +tokenizers==0.19.1 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # transformers +torch==2.4.1+rocm6.1 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # -r benchmarks/rlhf/requirements.in + # accelerate + # trl +tqdm==4.66.5 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # datasets + # huggingface-hub + # transformers +transformers==4.44.2 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # -c .pin/../constraints/rocm.txt + # -r benchmarks/rlhf/requirements.in + # trl +trl==0.10.1 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # -c .pin/../constraints/rocm.txt + # -r benchmarks/rlhf/requirements.in +typing-extensions==4.12.2 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # huggingface-hub + # multidict + # reactivex + # rich + # torch + # tyro +tyro==0.8.11 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # trl +tzdata==2024.2 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # pandas +urllib3==2.2.3 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # requests +varname==0.13.3 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # giving +voir==0.2.19 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # -c .pin/../constraints/rocm.txt + # -r benchmarks/rlhf/requirements.in +xxhash==3.5.0 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # datasets +yarl==1.13.1 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # aiohttp +einops \ No newline at end of file diff --git a/benchmarks/timm/requirements.rocm.txt b/benchmarks/timm/requirements.rocm.txt index 8383f9e6..18e83d95 100644 --- a/benchmarks/timm/requirements.rocm.txt +++ b/benchmarks/timm/requirements.rocm.txt @@ -4,7 +4,7 @@ # # pip-compile --output-file=benchmarks/timm/requirements.rocm.txt .pin/tmp-constraints-rocm-timm.txt benchmarks/timm/requirements.in # ---extra-index-url https://download.pytorch.org/whl/rocm6.0 +--extra-index-url https://download.pytorch.org/whl/rocm6.1 antlr4-python3-runtime==4.9.3 # via @@ -14,7 +14,7 @@ asttokens==2.4.1 # via # -c .pin/../.pin/constraints-rocm-torch.txt # giving -certifi==2024.7.4 +certifi==2024.8.30 # via # -c .pin/../.pin/constraints-rocm-torch.txt # requests @@ -22,15 +22,15 @@ charset-normalizer==3.3.2 # via # -c .pin/../.pin/constraints-rocm-torch.txt # requests -codefind==0.1.6 +codefind==0.1.7 # via # -c .pin/../.pin/constraints-rocm-torch.txt # ptera -executing==1.2.0 +executing==2.1.0 # via # -c .pin/../.pin/constraints-rocm-torch.txt # varname -filelock==3.15.4 +filelock==3.16.1 # via # -c .pin/../.pin/constraints-rocm-torch.txt # huggingface-hub @@ -41,16 +41,16 @@ fsspec==2024.6.1 # -c .pin/../.pin/constraints-rocm-torch.txt # huggingface-hub # torch -giving==0.4.2 +giving==0.4.3 # via # -c .pin/../.pin/constraints-rocm-torch.txt # ptera # voir -huggingface-hub==0.24.6 +huggingface-hub==0.25.1 # via # -c .pin/../.pin/constraints-rocm-torch.txt # -r benchmarks/timm/requirements.in -idna==3.7 +idna==3.10 # via # -c .pin/../.pin/constraints-rocm-torch.txt # requests @@ -82,11 +82,15 @@ numpy==1.26.4 # via # -c .pin/../.pin/constraints-rocm-torch.txt # torchvision +nvidia-ml-py==12.560.30 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # voir 
omegaconf==2.3.0 # via # -c .pin/../.pin/constraints-rocm-torch.txt # voir -ovld==0.3.8 +ovld==0.3.9 # via # -c .pin/../.pin/constraints-rocm-torch.txt # voir @@ -110,10 +114,6 @@ pygments==2.18.0 # via # -c .pin/../.pin/constraints-rocm-torch.txt # rich -pynvml==11.5.3 - # via - # -c .pin/../.pin/constraints-rocm-torch.txt - # voir pytorch-triton-rocm==3.0.0 # via # -c .pin/../.pin/constraints-rocm-torch.txt @@ -132,11 +132,11 @@ requests==2.32.3 # via # -c .pin/../.pin/constraints-rocm-torch.txt # huggingface-hub -rich==13.7.1 +rich==13.9.1 # via # -c .pin/../.pin/constraints-rocm-torch.txt # voir -safetensors==0.4.4 +safetensors==0.4.5 # via # -c .pin/../.pin/constraints-rocm-torch.txt # -r benchmarks/timm/requirements.in @@ -144,16 +144,16 @@ six==1.16.0 # via # -c .pin/../.pin/constraints-rocm-torch.txt # asttokens -sympy==1.13.2 +sympy==1.13.3 # via # -c .pin/../.pin/constraints-rocm-torch.txt # torch -torch==2.4.0+rocm6.0 +torch==2.4.1+rocm6.1 # via # -c .pin/../.pin/constraints-rocm-torch.txt # -r benchmarks/timm/requirements.in # torchvision -torchvision==0.19.0+rocm6.0 +torchvision==0.19.1+rocm6.1 # via # -c .pin/../.pin/constraints-rocm-torch.txt # -r benchmarks/timm/requirements.in @@ -166,12 +166,13 @@ typing-extensions==4.12.2 # -c .pin/../.pin/constraints-rocm-torch.txt # huggingface-hub # reactivex + # rich # torch -urllib3==2.2.2 +urllib3==2.2.3 # via # -c .pin/../.pin/constraints-rocm-torch.txt # requests -varname==0.10.0 +varname==0.13.3 # via # -c .pin/../.pin/constraints-rocm-torch.txt # giving diff --git a/benchmarks/torchatari/requirements.rocm.txt b/benchmarks/torchatari/requirements.rocm.txt index 71fd92e5..76fa829c 100644 --- a/benchmarks/torchatari/requirements.rocm.txt +++ b/benchmarks/torchatari/requirements.rocm.txt @@ -4,7 +4,7 @@ # # pip-compile --output-file=benchmarks/torchatari/requirements.rocm.txt .pin/tmp-constraints-rocm-torchatari.txt benchmarks/torchatari/requirements.in # ---extra-index-url https://download.pytorch.org/whl/rocm6.0 +--extra-index-url https://download.pytorch.org/whl/rocm6.1 absl-py==2.1.0 # via @@ -32,7 +32,7 @@ cloudpickle==3.0.0 # -c .pin/../.pin/constraints-rocm-torch.txt # gym # gymnasium -codefind==0.1.6 +codefind==0.1.7 # via # -c .pin/../.pin/constraints-rocm-torch.txt # ptera @@ -52,7 +52,7 @@ envpool==0.8.4 # via # -c .pin/../.pin/constraints-rocm-torch.txt # -r benchmarks/torchatari/requirements.in -executing==1.2.0 +executing==2.1.0 # via # -c .pin/../.pin/constraints-rocm-torch.txt # varname @@ -60,7 +60,7 @@ farama-notifications==0.0.4 # via # -c .pin/../.pin/constraints-rocm-torch.txt # gymnasium -filelock==3.15.4 +filelock==3.16.1 # via # -c .pin/../.pin/constraints-rocm-torch.txt # pytorch-triton-rocm @@ -69,16 +69,16 @@ fsspec==2024.6.1 # via # -c .pin/../.pin/constraints-rocm-torch.txt # torch -giving==0.4.2 +giving==0.4.3 # via # -c .pin/../.pin/constraints-rocm-torch.txt # ptera # voir -grpcio==1.65.5 +grpcio==1.66.2 # via # -c .pin/../.pin/constraints-rocm-torch.txt # tensorboard -gym==0.23.1 +gym==0.26.2 # via # -c .pin/../.pin/constraints-rocm-torch.txt # -r benchmarks/torchatari/requirements.in @@ -91,7 +91,7 @@ gymnasium==0.29.1 # via # -c .pin/../.pin/constraints-rocm-torch.txt # envpool -importlib-resources==6.4.3 +importlib-resources==6.4.5 # via # -c .pin/../.pin/constraints-rocm-torch.txt # cantilever @@ -134,15 +134,19 @@ numpy==1.26.4 # gym # gymnasium # tensorboard +nvidia-ml-py==12.560.30 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # voir omegaconf==2.3.0 # via # -c 
.pin/../.pin/constraints-rocm-torch.txt # voir -optree==0.12.1 +optree==0.13.0 # via # -c .pin/../.pin/constraints-rocm-torch.txt # envpool -ovld==0.3.8 +ovld==0.3.9 # via # -c .pin/../.pin/constraints-rocm-torch.txt # voir @@ -151,7 +155,7 @@ packaging==24.1 # -c .pin/../.pin/constraints-rocm-torch.txt # envpool # tensorboard -protobuf==5.27.3 +protobuf==5.28.2 # via # -c .pin/../.pin/constraints-rocm-torch.txt # tensorboard @@ -167,10 +171,6 @@ pygments==2.18.0 # via # -c .pin/../.pin/constraints-rocm-torch.txt # rich -pynvml==11.5.3 - # via - # -c .pin/../.pin/constraints-rocm-torch.txt - # voir pytorch-triton-rocm==3.0.0 # via # -c .pin/../.pin/constraints-rocm-torch.txt @@ -183,7 +183,7 @@ reactivex==4.0.4 # via # -c .pin/../.pin/constraints-rocm-torch.txt # giving -rich==13.7.1 +rich==13.9.1 # via # -c .pin/../.pin/constraints-rocm-torch.txt # tyro @@ -197,11 +197,11 @@ six==1.16.0 # -c .pin/../.pin/constraints-rocm-torch.txt # asttokens # tensorboard -sympy==1.13.2 +sympy==1.13.3 # via # -c .pin/../.pin/constraints-rocm-torch.txt # torch -tensorboard==2.17.1 +tensorboard==2.18.0 # via # -c .pin/../.pin/constraints-rocm-torch.txt # -r benchmarks/torchatari/requirements.in @@ -209,7 +209,7 @@ tensorboard-data-server==0.7.2 # via # -c .pin/../.pin/constraints-rocm-torch.txt # tensorboard -torch==2.4.0+rocm6.0 +torch==2.4.1+rocm6.1 # via # -c .pin/../.pin/constraints-rocm-torch.txt # -r benchmarks/torchatari/requirements.in @@ -218,7 +218,7 @@ torchcompat==1.1.4 # -c .pin/../.pin/constraints-rocm-torch.txt # -c .pin/../constraints/rocm.txt # -r benchmarks/torchatari/requirements.in -types-protobuf==5.27.0.20240626 +types-protobuf==5.28.0.20240924 # via # -c .pin/../.pin/constraints-rocm-torch.txt # envpool @@ -229,22 +229,23 @@ typing-extensions==4.12.2 # gymnasium # optree # reactivex + # rich # torch # tyro -tyro==0.8.8 +tyro==0.8.11 # via # -c .pin/../.pin/constraints-rocm-torch.txt # -r benchmarks/torchatari/requirements.in -varname==0.10.0 +varname==0.13.3 # via # -c .pin/../.pin/constraints-rocm-torch.txt # giving -voir==0.2.17 +voir==0.2.19 # via # -c .pin/../.pin/constraints-rocm-torch.txt # -c .pin/../constraints/rocm.txt # -r benchmarks/torchatari/requirements.in -werkzeug==3.0.3 +werkzeug==3.0.4 # via # -c .pin/../.pin/constraints-rocm-torch.txt # tensorboard diff --git a/benchmarks/torchvision/requirements.rocm.txt b/benchmarks/torchvision/requirements.rocm.txt index 094eb29b..08dfdebf 100644 --- a/benchmarks/torchvision/requirements.rocm.txt +++ b/benchmarks/torchvision/requirements.rocm.txt @@ -4,7 +4,7 @@ # # pip-compile --output-file=benchmarks/torchvision/requirements.rocm.txt .pin/tmp-constraints-rocm-torchvision.txt benchmarks/torchvision/requirements.in # ---extra-index-url https://download.pytorch.org/whl/rocm6.0 +--extra-index-url https://download.pytorch.org/whl/rocm6.1 antlr4-python3-runtime==4.9.3 # via @@ -14,15 +14,15 @@ asttokens==2.4.1 # via # -c .pin/../.pin/constraints-rocm-torch.txt # giving -codefind==0.1.6 +codefind==0.1.7 # via # -c .pin/../.pin/constraints-rocm-torch.txt # ptera -executing==1.2.0 +executing==2.1.0 # via # -c .pin/../.pin/constraints-rocm-torch.txt # varname -filelock==3.15.4 +filelock==3.16.1 # via # -c .pin/../.pin/constraints-rocm-torch.txt # pytorch-triton-rocm @@ -31,12 +31,12 @@ fsspec==2024.6.1 # via # -c .pin/../.pin/constraints-rocm-torch.txt # torch -giving==0.4.2 +giving==0.4.3 # via # -c .pin/../.pin/constraints-rocm-torch.txt # ptera # voir -importlib-resources==6.4.3 +importlib-resources==6.4.5 # via # -c 
.pin/../.pin/constraints-rocm-torch.txt # torchcompat @@ -68,11 +68,15 @@ numpy==1.26.4 # via # -c .pin/../.pin/constraints-rocm-torch.txt # torchvision +nvidia-ml-py==12.560.30 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # voir omegaconf==2.3.0 # via # -c .pin/../.pin/constraints-rocm-torch.txt # voir -ovld==0.3.8 +ovld==0.3.9 # via # -c .pin/../.pin/constraints-rocm-torch.txt # voir @@ -92,10 +96,6 @@ pygments==2.18.0 # via # -c .pin/../.pin/constraints-rocm-torch.txt # rich -pynvml==11.5.3 - # via - # -c .pin/../.pin/constraints-rocm-torch.txt - # voir pytorch-triton-rocm==3.0.0 # via # -c .pin/../.pin/constraints-rocm-torch.txt @@ -108,7 +108,7 @@ reactivex==4.0.4 # via # -c .pin/../.pin/constraints-rocm-torch.txt # giving -rich==13.7.1 +rich==13.9.1 # via # -c .pin/../.pin/constraints-rocm-torch.txt # voir @@ -116,11 +116,11 @@ six==1.16.0 # via # -c .pin/../.pin/constraints-rocm-torch.txt # asttokens -sympy==1.13.2 +sympy==1.13.3 # via # -c .pin/../.pin/constraints-rocm-torch.txt # torch -torch==2.4.0+rocm6.0 +torch==2.4.1+rocm6.1 # via # -c .pin/../.pin/constraints-rocm-torch.txt # -r benchmarks/torchvision/requirements.in @@ -130,7 +130,7 @@ torchcompat==1.1.4 # -c .pin/../.pin/constraints-rocm-torch.txt # -c .pin/../constraints/rocm.txt # -r benchmarks/torchvision/requirements.in -torchvision==0.19.0+rocm6.0 +torchvision==0.19.1+rocm6.1 # via # -c .pin/../.pin/constraints-rocm-torch.txt # -r benchmarks/torchvision/requirements.in @@ -142,8 +142,9 @@ typing-extensions==4.12.2 # via # -c .pin/../.pin/constraints-rocm-torch.txt # reactivex + # rich # torch -varname==0.10.0 +varname==0.13.3 # via # -c .pin/../.pin/constraints-rocm-torch.txt # giving diff --git a/benchmarks/torchvision_ddp/requirements.rocm.txt b/benchmarks/torchvision_ddp/requirements.rocm.txt index d1241db8..9eed9442 100644 --- a/benchmarks/torchvision_ddp/requirements.rocm.txt +++ b/benchmarks/torchvision_ddp/requirements.rocm.txt @@ -4,7 +4,7 @@ # # pip-compile --output-file=benchmarks/torchvision_ddp/requirements.rocm.txt .pin/tmp-constraints-rocm-torchvision.txt benchmarks/torchvision_ddp/requirements.in # ---extra-index-url https://download.pytorch.org/whl/rocm6.0 +--extra-index-url https://download.pytorch.org/whl/rocm6.1 antlr4-python3-runtime==4.9.3 # via @@ -14,15 +14,15 @@ asttokens==2.4.1 # via # -c .pin/../.pin/constraints-rocm-torch.txt # giving -codefind==0.1.6 +codefind==0.1.7 # via # -c .pin/../.pin/constraints-rocm-torch.txt # ptera -executing==1.2.0 +executing==2.1.0 # via # -c .pin/../.pin/constraints-rocm-torch.txt # varname -filelock==3.15.4 +filelock==3.16.1 # via # -c .pin/../.pin/constraints-rocm-torch.txt # pytorch-triton-rocm @@ -31,12 +31,12 @@ fsspec==2024.6.1 # via # -c .pin/../.pin/constraints-rocm-torch.txt # torch -giving==0.4.2 +giving==0.4.3 # via # -c .pin/../.pin/constraints-rocm-torch.txt # ptera # voir -importlib-resources==6.4.3 +importlib-resources==6.4.5 # via # -c .pin/../.pin/constraints-rocm-torch.txt # torchcompat @@ -68,11 +68,15 @@ numpy==1.26.4 # via # -c .pin/../.pin/constraints-rocm-torch.txt # torchvision +nvidia-ml-py==12.560.30 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # voir omegaconf==2.3.0 # via # -c .pin/../.pin/constraints-rocm-torch.txt # voir -ovld==0.3.8 +ovld==0.3.9 # via # -c .pin/../.pin/constraints-rocm-torch.txt # voir @@ -92,10 +96,6 @@ pygments==2.18.0 # via # -c .pin/../.pin/constraints-rocm-torch.txt # rich -pynvml==11.5.3 - # via - # -c .pin/../.pin/constraints-rocm-torch.txt - # voir pytorch-triton-rocm==3.0.0 # via # 
-c .pin/../.pin/constraints-rocm-torch.txt @@ -108,7 +108,7 @@ reactivex==4.0.4 # via # -c .pin/../.pin/constraints-rocm-torch.txt # giving -rich==13.7.1 +rich==13.9.1 # via # -c .pin/../.pin/constraints-rocm-torch.txt # voir @@ -116,11 +116,11 @@ six==1.16.0 # via # -c .pin/../.pin/constraints-rocm-torch.txt # asttokens -sympy==1.13.2 +sympy==1.13.3 # via # -c .pin/../.pin/constraints-rocm-torch.txt # torch -torch==2.4.0+rocm6.0 +torch==2.4.1+rocm6.1 # via # -c .pin/../.pin/constraints-rocm-torch.txt # -r benchmarks/torchvision_ddp/requirements.in @@ -130,7 +130,7 @@ torchcompat==1.1.4 # -c .pin/../.pin/constraints-rocm-torch.txt # -c .pin/../constraints/rocm.txt # -r benchmarks/torchvision_ddp/requirements.in -torchvision==0.19.0+rocm6.0 +torchvision==0.19.1+rocm6.1 # via # -c .pin/../.pin/constraints-rocm-torch.txt # -r benchmarks/torchvision_ddp/requirements.in @@ -142,8 +142,9 @@ typing-extensions==4.12.2 # via # -c .pin/../.pin/constraints-rocm-torch.txt # reactivex + # rich # torch -varname==0.10.0 +varname==0.13.3 # via # -c .pin/../.pin/constraints-rocm-torch.txt # giving diff --git a/benchmarks/vjepa/requirements.rocm.txt b/benchmarks/vjepa/requirements.rocm.txt new file mode 100644 index 00000000..a473fac7 --- /dev/null +++ b/benchmarks/vjepa/requirements.rocm.txt @@ -0,0 +1,247 @@ +# +# This file is autogenerated by pip-compile with Python 3.10 +# by the following command: +# +# pip-compile --output-file=benchmarks/vjepa/requirements.rocm.txt .pin/tmp-constraints-rocm-vjepa-gpus.txt benchmarks/vjepa/requirements.in +# +--extra-index-url https://download.pytorch.org/whl/rocm6.1 + +antlr4-python3-runtime==4.9.3 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # omegaconf +asttokens==2.4.1 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # giving +beartype==0.19.0 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # -r benchmarks/vjepa/requirements.in +braceexpand==0.1.7 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # -r benchmarks/vjepa/requirements.in + # webdataset +certifi==2024.8.30 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # requests +charset-normalizer==3.3.2 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # requests +cloudpickle==3.0.0 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # submitit +codefind==0.1.7 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # ptera +decord==0.6.0 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # -r benchmarks/vjepa/requirements.in +einops==0.8.0 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # -r benchmarks/vjepa/requirements.in +executing==2.1.0 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # varname +filelock==3.16.1 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # huggingface-hub + # pytorch-triton-rocm + # torch +fsspec==2024.6.1 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # huggingface-hub + # torch +giving==0.4.3 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # ptera + # voir +huggingface-hub==0.25.1 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # timm +idna==3.10 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # requests +jinja2==3.1.4 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # torch +markdown-it-py==3.0.0 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # rich +markupsafe==2.1.5 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # jinja2 +mdurl==0.1.2 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # markdown-it-py +mpmath==1.3.0 + # via 
+ # -c .pin/../.pin/constraints-rocm-torch.txt + # sympy +networkx==3.3 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # torch +numpy==1.26.4 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # -r benchmarks/vjepa/requirements.in + # decord + # opencv-python + # pandas + # torchvision + # webdataset +nvidia-ml-py==12.560.30 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # voir +omegaconf==2.3.0 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # voir +opencv-python==4.10.0.84 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # -r benchmarks/vjepa/requirements.in +ovld==0.3.9 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # voir +packaging==24.1 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # huggingface-hub +pandas==2.2.3 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # -r benchmarks/vjepa/requirements.in +pillow==10.4.0 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # torchvision +psutil==5.9.8 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # voir +ptera==1.4.1 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # voir +pygments==2.18.0 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # rich +python-dateutil==2.9.0.post0 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # pandas +pytorch-triton-rocm==3.0.0 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # torch +pytz==2024.2 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # pandas +pyyaml==6.0.2 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # -r benchmarks/vjepa/requirements.in + # huggingface-hub + # omegaconf + # timm + # webdataset +reactivex==4.0.4 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # giving +requests==2.32.3 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # huggingface-hub +rich==13.9.1 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # voir +safetensors==0.4.5 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # timm +six==1.16.0 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # asttokens + # python-dateutil +submitit==1.5.2 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # -r benchmarks/vjepa/requirements.in +sympy==1.13.3 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # torch +timm==1.0.9 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # -r benchmarks/vjepa/requirements.in +torch==2.4.1+rocm6.1 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # -r benchmarks/vjepa/requirements.in + # timm + # torchvision +torchvision==0.19.1+rocm6.1 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # -r benchmarks/vjepa/requirements.in + # timm +tqdm==4.66.5 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # huggingface-hub +typing-extensions==4.12.2 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # huggingface-hub + # reactivex + # rich + # submitit + # torch +tzdata==2024.2 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # pandas +urllib3==2.2.3 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # requests +varname==0.13.3 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # giving +voir==0.2.19 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # -c .pin/../constraints/rocm.txt + # -r benchmarks/vjepa/requirements.in +webdataset==0.2.100 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # -r benchmarks/vjepa/requirements.in diff --git a/config/base.yaml b/config/base.yaml index 1a64d550..b9b104d7 100644 --- a/config/base.yaml +++ b/config/base.yaml @@ -66,7 
+66,7 @@ llama:
   definition: ../benchmarks/llama
   group: llm
   install_group: torch
-  max_duration: 800
+  max_duration: 3600
   tags:
     - nlp
     - llm
@@ -700,6 +700,7 @@ _purejaxrl:
     - monogpu
     - gym
     - rl
+    - jax
   definition: ../benchmarks/purejaxrl
   plan:
     method: per_gpu
@@ -891,6 +892,7 @@ cleanrljax:
   definition: ../benchmarks/cleanrl_jax
   tags:
     - monogpu
+    - jax
   plan:
     method: per_gpu
diff --git a/constraints/rocm.txt b/constraints/rocm.txt
index b86ce00d..cc158557 100644
--- a/constraints/rocm.txt
+++ b/constraints/rocm.txt
@@ -1,7 +1,20 @@
---extra-index-url https://download.pytorch.org/whl/rocm6.0
+--extra-index-url https://download.pytorch.org/whl/rocm6.1
 #
 #
 voir >= 0.2.19
 torchcompat >= 1.0.0
 gymnax >= 0.0.8
+
+
+trl<0.11.0
+
+# the latest torchtune is slower than before and causes failures;
+# the next version of pytorch seems to work better,
+# so pending a new pytorch release this is what we get
+torchtune<0.3.0
+
+# transformers added torchao support recently,
+# but only in its most recent version, which we do not support yet
+transformers<4.45.0
+torchao
\ No newline at end of file
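A constraints file like the one above only caps versions during dependency resolution; it never installs anything by itself, which is why every generated requirements file in this series points back at it through `-c .pin/../constraints/rocm.txt` lines. A minimal sketch of what a pin such as `transformers<4.45.0` means, using the `packaging` library (an assumption for illustration, not code from this repository):

    from packaging.specifiers import SpecifierSet
    from packaging.version import Version

    # the pin excludes the newest release, whose torchao support is not handled yet
    spec = SpecifierSet("<4.45.0")
    print(Version("4.44.2") in spec)  # True  -> still resolvable
    print(Version("4.45.0") in spec)  # False -> rejected by the constraint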
-d "$MILABENCH_WORDIR/milabench" ]; then + git clone https://github.com/mila-iqia/milabench.git -b rocm + fi + export MILABENCH_SOURCE="$MILABENCH_WORDIR/milabench" + fi . $MILABENCH_WORDIR/env/bin/activate - pip install -e $MILABENCH_WORDIR/milabench + pip install -e $MILABENCH_SOURCE + # # Install milabench's benchmarks in their venv # - milabench install + # pip install torch --index-url https://download.pytorch.org/whl/rocm6.1 + # milabench pin --variant rocm --from-scratch $ARGS + milabench install $ARGS # # Override/add package to milabench venv here @@ -36,35 +60,48 @@ install_prepare() { ( . $BENCHMARK_VENV/bin/activate + pip install ninja + + if [ -z "${MILABENCH_HF_TOKEN}" ]; then + echo "Missing token" + else + huggingface-cli login --token $MILABENCH_HF_TOKEN + fi + # # Override/add package to the benchmark venv here # which pip - pip uninstall torch torchvision torchaudio - pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/rocm6.1 - pip uninstall pynvml - # sudo apt-get install lld # https://github.com/ROCm/jax/releases/tag/rocm-jaxlib-v0.4.30 - # does not really work pip install https://github.com/ROCm/jax/releases/download/rocm-jaxlib-v0.4.30/jaxlib-0.4.30+rocm611-cp310-cp310-manylinux2014_x86_64.whl pip install https://github.com/ROCm/jax/archive/refs/tags/rocm-jaxlib-v0.4.30.tar.gz - # - FORCE_CUDA=1 pip install -U -v --no-build-isolation git+https://github.com/rusty1s/pytorch_cluster.git - FORCE_CUDA=1 pip install -U -v --no-build-isolation git+https://github.com/rusty1s/pytorch_scatter.git - FORCE_CUDA=1 pip install -U -v --no-build-isolation git+https://github.com/rusty1s/pytorch_sparse.git + pip uninstall torch_cluster torch_scatter torch_sparse -y + FORCE_ONLY_CUDA=1 pip install -U -v --use-pep517 --no-build-isolation git+https://github.com/rusty1s/pytorch_cluster.git + FORCE_ONLY_CUDA=1 pip install -U -v --use-pep517 --no-build-isolation git+https://github.com/rusty1s/pytorch_scatter.git + FORCE_ONLY_CUDA=1 pip install -U -v --use-pep517 --no-build-isolation git+https://github.com/rusty1s/pytorch_sparse.git # takes forever to compile # https://github.com/ROCm/xformers - pip install -v -U --no-build-isolation --no-deps git+https://github.com/ROCm/xformers.git@develop#egg=xformers - pip install -v -U --no-build-isolation --no-deps git+https://github.com/ROCm/flash-attention.git + pip uninstall xformers + pip install xformers --index-url https://download.pytorch.org/whl/rocm6.1 + # pip install -v -U --no-build-isolation --no-deps git+https://github.com/ROCm/xformers.git@develop#egg=xformers + # pip install -v -U --no-build-isolation --no-deps git+https://github.com/facebookresearch/xformers.git + # pip install xformers -U --index-url https://download.pytorch.org/whl/rocm6.1 + + pip uninstall flash-attention + pip install -v -U --no-build-isolation --use-pep517 --no-deps git+https://github.com/ROCm/flash-attention.git + pip uninstall pynvml nvidia-ml-py -y + + pip install einops ) + pip uninstall pynvml nvidia-ml-py -y # # Generate/download datasets, download models etc... # - milabench prepare + milabench prepare $ARGS } if [ ! -d "$MILABENCH_WORDIR" ]; then @@ -74,11 +111,12 @@ else . 
 
-cd $MILABENCH_WORDIR
+
+milabench prepare $ARGS
 
 #
 # Run the benchmarks
-milabench run "$@"
+milabench run $ARGS
 
 #
 # Display report

From 491505f0c72248824a26926df05e22f4c764ecbe Mon Sep 17 00:00:00 2001
From: Setepenre
Date: Thu, 21 Nov 2024 13:15:47 -0500
Subject: [PATCH 11/20] Multirun system (#308)

* ROCm changes
* Update ping
* -
* Cleanup the rocm script
* use rocm branch
* -
* New multi run system
* multinode tweaks
* make sure system config is applied before running
* Update matrix run
* Tweaks

---------

Co-authored-by: Your Name
---
 .../llm/recipes/full_finetune_distributed.py |  1 -
 config/base.yaml                             |  2 +-
 config/examples/system.yaml                  | 30 +++++++++
 config/scaling.yaml                          |  2 +
 milabench/_version.py                        |  7 ++-
 milabench/alt_async.py                       |  2 +
 milabench/cli/run.py                         | 42 +++++++++----
 milabench/commands/__init__.py               |  8 ++-
 milabench/compare.py                         |  2 +
 milabench/config.py                          | 10 ++-
 milabench/remote.py                          |  4 +-
 milabench/sizer.py                           | 25 +++++---
 milabench/system.py                          | 63 ++++++++++++++++++-
 scripts/article/run_rocm.sh                  | 25 +++++++-
 tests/test_system_matrix.py                  | 40 ++++++++++++
 15 files changed, 226 insertions(+), 37 deletions(-)
 create mode 100644 tests/test_system_matrix.py

diff --git a/benchmarks/llm/recipes/full_finetune_distributed.py b/benchmarks/llm/recipes/full_finetune_distributed.py
index 19556ec7..f8d58e2f 100755
--- a/benchmarks/llm/recipes/full_finetune_distributed.py
+++ b/benchmarks/llm/recipes/full_finetune_distributed.py
@@ -100,7 +100,6 @@ class FullFinetuneRecipeDistributed(FTRecipeInterface):
     """
 
     def __init__(self, cfg: DictConfig) -> None:
-        import os
 
         self._device = acc.fetch_device(int(os.getenv("LOCAL_RANK", "0")))
         self._dtype = utils.get_dtype(cfg.dtype, device=self._device)
diff --git a/config/base.yaml b/config/base.yaml
index b9b104d7..38dfc4d3 100644
--- a/config/base.yaml
+++ b/config/base.yaml
@@ -208,7 +208,7 @@ resnet50-noio:
   inherits: _torchvision
   voir:
     options:
-      stop: 1000
+      stop: 500
       interval: "1s"
 
   tags:
diff --git a/config/examples/system.yaml b/config/examples/system.yaml
index 7b84c48d..78cf3957 100644
--- a/config/examples/system.yaml
+++ b/config/examples/system.yaml
@@ -26,3 +26,33 @@ system:
       ip: 192.168.11.13
       main: false
       user: username
+
+
+
+
+multirun:
+  runs:
+    # Force batch size to populate the sizing model
+    - name: "bs{sizer.batch_size}"
+      matrix:
+        sizer.auto: 1
+        sizer.batch_size: [1, 2, 4, 8, 16, 32, 64, 128]
+        sizer.save: ["scaling.yaml"]
+
+    # Matrix run
+    - name: "c{sizer.capacity}_m{sizer.multiple}_w{cpu.n_workers}"
+      matrix:
+        cpu.auto: 1
+        cpu.n_workers: [2, 4, 8, 16, 32]
+        sizer.auto: 1
+        sizer.capacity: [4Go, 8Go, 16Go, 32Go, 64Go, All]
+        sizer.multiple: 8
+        sizer.save: ["scaling.yaml"]
+
+    # Auto run
+    - name: "auto"
+      matrix:
+        cpu.auto: 1
+        sizer.auto: 1
+        sizer.multiple: 8
+        sizer.save: ["scaling.yaml"]
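The semantics of a `matrix` entry can be read off the implementation later in this patch: list values combine as a cartesian product, scalars apply to every run, and `name` is a template rendered from the flattened keys. Counting the three entries above gives 8 + 30 + 1 = 39 runs, which is exactly what the new test at the end of this patch asserts. A rough standalone sketch in plain Python, not code from the patch:

    from itertools import product

    matrix = {
        "cpu.n_workers": [2, 4, 8, 16, 32],
        "sizer.capacity": ["4Go", "8Go", "16Go", "32Go", "64Go", "All"],
        "sizer.multiple": 8,  # scalar: broadcast to every run
    }

    # wrap scalars so every key combines uniformly
    values = [v if isinstance(v, list) else [v] for v in matrix.values()]
    for combo in product(*values):
        run = dict(zip(matrix, combo))
        print("c{}_m{}_w{}".format(
            run["sizer.capacity"], run["sizer.multiple"], run["cpu.n_workers"]))
    # -> c4Go_m8_w2, c8Go_m8_w2, ... 30 names for this entry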
"784b38e77b90116047e3de893c22c2f7d3225179" +__date__ = "2024-10-18 15:58:46 +0000" + diff --git a/milabench/alt_async.py b/milabench/alt_async.py index 8608196d..6fc9f64c 100644 --- a/milabench/alt_async.py +++ b/milabench/alt_async.py @@ -190,6 +190,8 @@ def run(argv, setsid=None, process_accumulator=None, info={}, **kwargs): destroy(*mx.processes) yield entry + # mx.close() + def proceed(coro): loop = FeedbackEventLoop() diff --git a/milabench/cli/run.py b/milabench/cli/run.py index f5e75b70..f04427af 100644 --- a/milabench/cli/run.py +++ b/milabench/cli/run.py @@ -23,6 +23,7 @@ from ..report import make_report from ..sizer import MemoryUsageExtractor from ..summary import make_summary +from ..system import multirun, apply_system, SizerOptions, option # fmt: off @@ -72,12 +73,7 @@ def _fetch_arch(mp): return None -@tooled -def cli_run(args=None): - """Run the benchmarks.""" - if args is None: - args = arguments() - +def run(mp, args, name): layers = validation_names(args.validations) dash_class = { @@ -85,13 +81,7 @@ def cli_run(args=None): "long": LongDashFormatter, "no": None, }.get(args.dash, None) - - mp = get_multipack(run_name=args.run_name) - arch = _fetch_arch(mp) - - # Initialize the backend here so we can retrieve GPU stats - init_arch(arch) - + success = run_with_loggers( mp.do_run(repeat=args.repeat), loggers=[ @@ -136,3 +126,29 @@ def cli_run(args=None): ) return success + + +@tooled +def cli_run(args=None): + """Run the benchmarks.""" + if args is None: + args = arguments() + + # Load the configuration and system + mp = get_multipack(run_name=args.run_name) + arch = _fetch_arch(mp) + + # Initialize the backend here so we can retrieve GPU stats + init_arch(arch) + + success = 0 + for name, conf in multirun(): + run_name = name or args.run_name + + # Note that this function overrides the system config + mp = get_multipack(run_name=run_name) + + with apply_system(conf): + success += run(mp, args, run_name) + + return success diff --git a/milabench/commands/__init__.py b/milabench/commands/__init__.py index e97ac4e5..4a8f1e90 100644 --- a/milabench/commands/__init__.py +++ b/milabench/commands/__init__.py @@ -451,6 +451,11 @@ def _find_node_config(self) -> Dict: return {} def is_local(self): + local = self._is_local() + print("is_local", self.host, local) + return local + + def _is_local(self): localnode = self.pack.config["system"]["self"] if localnode is not None: @@ -581,7 +586,7 @@ def node_address(node): """Favour Hostname as it is the most consistent name across machines""" host = node.get("hostname") ip = node.get("ip") - return host or ip + return ip or hostname class ForeachNode(ListCommand): @@ -637,6 +642,7 @@ def executors(self): **self.options ) + print(rank, node, node_address(node)) worker = SSHCommand( host=node_address(node), user=node["user"], diff --git a/milabench/compare.py b/milabench/compare.py index d4d6299e..32f95c64 100644 --- a/milabench/compare.py +++ b/milabench/compare.py @@ -26,6 +26,7 @@ def fetch_runs(folder, filter): runs = [] ignored = 0 + for run in os.listdir(folder): if run.startswith("install") or run.startswith("prepare"): continue @@ -43,6 +44,7 @@ def fetch_runs(folder, filter): date = retrieve_datetime_from_name(date) else: name = run + date = None if date is None: date = datetime.fromtimestamp(os.path.getmtime(pth)) diff --git a/milabench/config.py b/milabench/config.py index 039a85cc..9a2d519c 100644 --- a/milabench/config.py +++ b/milabench/config.py @@ -100,11 +100,15 @@ def combine_args(args, kwargs): yield kwargs else: key, 
diff --git a/milabench/config.py b/milabench/config.py
index 039a85cc..9a2d519c 100644
--- a/milabench/config.py
+++ b/milabench/config.py
@@ -100,11 +100,15 @@ def combine_args(args, kwargs):
         yield kwargs
     else:
         key, values = args.popitem()
-        for value in values:
-            kwargs[key] = value
+
+        try:
+            for value in values:
+                kwargs[key] = value
+                yield from combine_args(deepcopy(args), kwargs)
+        except TypeError:
+            # a scalar (non-iterable) value is used as-is
+            kwargs[key] = values
             yield from combine_args(deepcopy(args), kwargs)
 
-
 def expand_matrix(name, bench_config):
     if "matrix" not in bench_config:
         return [(name, bench_config)]
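One subtlety in the new `combine_args` above, as a hedged reading of the code rather than documented behaviour: scalars are detected by letting iteration fail, and since strings iterate without raising, a bare string value would be expanded character by character. That is presumably why the multirun example wraps lone strings in lists, as in `sizer.save: ["scaling.yaml"]`. A standalone illustration:

    # iteration-based scalar detection, outside of milabench
    for values in ([1, 2], 8, "4Go"):
        try:
            items = [v for v in values]  # lists iterate; ints raise TypeError
        except TypeError:
            items = [values]
        print(items)  # [1, 2], then [8], then ['4', 'G', 'o'] -- chars!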
diff --git a/milabench/remote.py b/milabench/remote.py
index cbe9696b..c92166fd 100644
--- a/milabench/remote.py
+++ b/milabench/remote.py
@@ -124,7 +124,6 @@ def milabench_remote_setup_plan(pack, setup_for="worker") -> SequenceCommand:
     nodes = pack.config["system"]["nodes"]
 
     copy = []
-    node_packs = []
 
     copy_source = copy_folder(pack, INSTALL_FOLDER, setup_for)
 
     for i, node in enumerate(nodes):
         if should_run_for(node, setup_for):
-            install.append(pip_install_milabench(node_packs[i], node, INSTALL_FOLDER))
+            node_pack = worker_pack(pack, node)
+            install.append(pip_install_milabench(node_pack, node, INSTALL_FOLDER))
 
     return SequenceCommand(
         copy_source,
diff --git a/milabench/sizer.py b/milabench/sizer.py
index 75002edb..00d6d2b6 100644
--- a/milabench/sizer.py
+++ b/milabench/sizer.py
@@ -53,8 +53,7 @@ def to_octet(value: str) -> float:
 class Sizer:
     """Automatically scale the batch size to match GPU spec"""
 
-    def __init__(self, options=SizerOptions(), scaling_config=None):
-        self.options = options
+    def __init__(self, scaling_config=None):
         self.path = scaling_config
 
         if scaling_config is None:
@@ -62,6 +61,10 @@ def __init__(self, options=SizerOptions(), scaling_config=None):
 
         with open(scaling_config, "r") as sconf:
             self.scaling_config = yaml.safe_load(sconf)
+
+    @property
+    def options(self):
+        return SizerOptions()
 
     def benchscaling(self, benchmark):
         # key
@@ -165,6 +168,10 @@ def find_batch_size(self, benchmark, event):
         return -1
 
     def argv(self, benchmark, capacity, argv):
+        newargv = self._argv(benchmark, capacity, argv)
+        return newargv
+
+    def _argv(self, benchmark, capacity, argv):
         """Find the batch size and override it with a new value"""
 
         config = self.benchscaling(benchmark)
@@ -214,11 +221,12 @@ def argv(self, benchmark, capacity, argv):
 
 
 def batch_sizer() -> Sizer:
-    sizer = sizer_global.get()
-    if sizer is None:
-        sizer_global.set(Sizer())
-        return batch_sizer()
-    return sizer
+    return Sizer()
+    # sizer = sizer_global.get()
+    # if sizer is None:
+    #     sizer_global.set(Sizer())
+    #     return batch_sizer()
+    # return sizer
 
 def get_batch_size(config, start_event):
@@ -242,8 +250,9 @@ class MemoryUsageExtractor(ValidationLayer):
     """Extract max memory usage per benchmark to populate the memory model"""
 
     def __init__(self):
+
+        self.filepath = option("sizer.save", str, None)
         sizer = batch_sizer()
-        self.filepath = sizer.options.save
         self.memory = deepcopy(sizer.scaling_config)
         self.scaling = None
         self.benchname = None
diff --git a/milabench/system.py b/milabench/system.py
index 2d5a6ca8..9aa49975 100644
--- a/milabench/system.py
+++ b/milabench/system.py
@@ -1,4 +1,5 @@
 import contextvars
+from copy import deepcopy
 import ipaddress
 import os
 import socket
@@ -15,7 +16,7 @@ from .merge import merge
 
 system_global = contextvars.ContextVar("system", default=None)
-
+multirun_global = contextvars.ContextVar("multirun", default=None)
 
 def get_gpu_capacity(strict=False):
     try:
@@ -79,6 +80,60 @@ def as_environment_variable(name):
     return "MILABENCH_" + "_".join(map(str.upper, frags))
 
 
+def multirun():
+    multirun = multirun_global.get()
+
+    if multirun is None or len(multirun) == 0:
+        yield None, dict()
+        return
+
+    runs = multirun.get("runs", dict())
+
+    from .config import combine_args
+    import time
+    from types import SimpleNamespace
+
+    def unflatten(dct):
+        result = {}
+        for k, v in dct.items():
+            l = result
+            frags = k.split(".")
+            for frag in frags[:-1]:
+                l = l.setdefault(frag, SimpleNamespace())
+            setattr(l, frags[-1], v)
+
+        return result
+
+    for run_matrix in runs:
+        arguments = run_matrix["matrix"]
+
+        for run in combine_args(arguments, dict()):
+            template_name = run_matrix["name"]
+
+            ctx = unflatten(run)
+            ctx['time'] = int(time.time())
+            run_name = template_name.format(**ctx)
+
+            yield run_name, run
+
+
+@contextmanager
+def apply_system(config: dict):
+    system = system_global.get()
+    old = deepcopy(system)
+
+    for k, v in config.items():
+        frags = k.split(".")
+
+        lookup = system.setdefault("options", {})
+        for f in frags[:-1]:
+            lookup = lookup.setdefault(f, {})
+        lookup[frags[-1]] = v
+
+
+    yield
+    system_global.set(old)
+
+
 def option(name, etype, default=None):
     options = dict()
     system = system_global.get()
@@ -401,11 +456,12 @@ def gethostname(host):
 def resolve_hostname(ip):
     try:
         hostname, _, iplist = socket.gethostbyaddr(ip)
-
+
         for ip in iplist:
             if is_loopback(ip):
                 return hostname, True
 
+        # FIXME
         return socket.gethostname(), hostname.startswith(socket.gethostname())
         return hostname, hostname == socket.gethostname()
 
     except:
@@ -465,6 +521,9 @@
     config = merge(defaults, config)
 
     system = config.get("system", {})
+    multirun = config.get("multirun", {})
+
+    multirun_global.set(multirun)
     system_global.set(system)
 
     # capacity is only required if batch resizer is enabled
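A short note on why `unflatten` above builds `SimpleNamespace` objects rather than nested dicts: `str.format` resolves dotted fields through attribute access, so a run-name template such as `bs{sizer.batch_size}` needs a real attribute to read. A two-line sketch, not code from this patch:

    from types import SimpleNamespace

    ctx = {"sizer": SimpleNamespace(batch_size=64), "time": 1732000000}
    print("bs{sizer.batch_size}.{time}".format(**ctx))  # -> bs64.1732000000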
diff --git a/scripts/article/run_rocm.sh b/scripts/article/run_rocm.sh
index fbb9da83..0fc2bf16 100644
--- a/scripts/article/run_rocm.sh
+++ b/scripts/article/run_rocm.sh
@@ -11,6 +11,7 @@
 export ROCM_PATH="/opt/rocm"
 export MILABENCH_BASE="$MILABENCH_WORDIR/results"
 export MILABENCH_VENV="$MILABENCH_WORDIR/env"
 export BENCHMARK_VENV="$MILABENCH_WORDIR/results/venv/torch"
+export MILABENCH_SIZER_SAVE="$MILABENCH_WORDIR/scaling.yaml"
 
 if [ -z "${MILABENCH_SOURCE}" ]; then
     export MILABENCH_CONFIG="$MILABENCH_WORDIR/milabench/config/standard.yaml"
@@ -24,6 +25,17 @@
 export TORCH_ROCM_ARCH_LIST="$GPU"
 export ROCM_TARGETS="$GPU"
 export PYTORCH_ROCM_ARCH="$GPU"
 
+if [ -z "${MILABENCH_SOURCE}" ]; then
+    export MILABENCH_CONFIG="$MILABENCH_WORDIR/milabench/config/standard.yaml"
+else
+    export MILABENCH_CONFIG="$MILABENCH_SOURCE/config/standard.yaml"
+fi
+
+
+export GPU="$(/opt/rocm/lib/llvm/bin/amdgpu-arch | head -n 1)"
+export TORCH_ROCM_ARCH_LIST="$GPU"
+export ROCM_TARGETS="$GPU"
+export PYTORCH_ROCM_ARCH="$GPU"
 
 ARGS="$@"
 
 install_prepare() {
     mkdir -p $MILABENCH_WORDIR
@@ -75,7 +87,7 @@ install_prepare() {
 
         # https://github.com/ROCm/jax/releases/tag/rocm-jaxlib-v0.4.30
         pip install https://github.com/ROCm/jax/releases/download/rocm-jaxlib-v0.4.30/jaxlib-0.4.30+rocm611-cp310-cp310-manylinux2014_x86_64.whl
-        pip install https://github.com/ROCm/jax/archive/refs/tags/rocm-jaxlib-v0.4.30.tar.gz
+        pip install https://github.com/ROCm/jax/archive/refs/tags/rocm-jaxlib-v0.4.30.tar.gz
 
         pip uninstall torch_cluster torch_scatter torch_sparse -y
         FORCE_ONLY_CUDA=1 pip install -U -v --use-pep517 --no-build-isolation git+https://github.com/rusty1s/pytorch_cluster.git
@@ -111,12 +123,19 @@ else
     . $MILABENCH_WORDIR/env/bin/activate
 fi
 
+(
+    . $BENCHMARK_VENV/bin/activate
+    pip install xformers --index-url https://download.pytorch.org/whl/rocm6.1
+)
+
+# milabench install $ARGS --system $MILABENCH_WORDIR/system.yaml
 
-milabench prepare $ARGS
+# milabench prepare $ARGS --system $MILABENCH_WORDIR/system.yaml
 
 #
 # Run the benchmarks
-milabench run $ARGS
+milabench run $ARGS --system $MILABENCH_WORDIR/system.yaml
+
 
 #
 # Display report
diff --git a/tests/test_system_matrix.py b/tests/test_system_matrix.py
new file mode 100644
index 00000000..ed537881
--- /dev/null
+++ b/tests/test_system_matrix.py
@@ -0,0 +1,40 @@
+
+
+
+
+from milabench.system import multirun, build_system_config, enable_offline, option, apply_system, SizerOptions
+
+from milabench.testing import official_config
+
+
+def test_system_matrix():
+    with enable_offline(True):
+        sys = build_system_config(official_config("examples/system"))
+
+        n = 0
+        for name, conf in multirun():
+            print(name, conf)
+            n += 1
+
+        assert n == 39
+
+
+def test_apply_system_matrix():
+    with enable_offline(True):
+        sys = build_system_config(official_config("examples/system"))
+
+        for name, conf in multirun():
+            with apply_system(conf):
+
+                # Apply system worked and changed the config
+                for k, v in conf.items():
+                    assert option(k, lambda x: x) == v
+
+
+                assert SizerOptions().save == option("sizer.save", lambda x: x)
+
+
+
+if __name__ == "__main__":
+    test_apply_system_matrix()

From deb271ab4ad39f67938a824d00df717b3029d7f0 Mon Sep 17 00:00:00 2001
From: Pierre Delaunay
Date: Thu, 21 Nov 2024 14:19:22 -0500
Subject: [PATCH 12/20] Add monitor tag to templates

---
 benchmarks/_templates/simple/dev.yaml | 2 ++
 benchmarks/_templates/stdout/dev.yaml | 3 ++-
 benchmarks/_templates/voir/dev.yaml   | 2 ++
 3 files changed, 6 insertions(+), 1 deletion(-)

diff --git a/benchmarks/_templates/simple/dev.yaml b/benchmarks/_templates/simple/dev.yaml
index e3aa9467..affcc977 100644
--- a/benchmarks/_templates/simple/dev.yaml
+++ b/benchmarks/_templates/simple/dev.yaml
@@ -6,3 +6,5 @@ template:
   install_group: torch
   plan:
     method: per_gpu
+  tags:
+    - monogpu
diff --git a/benchmarks/_templates/stdout/dev.yaml b/benchmarks/_templates/stdout/dev.yaml
index 2b7e75a3..24c7b813 100644
--- a/benchmarks/_templates/stdout/dev.yaml
+++ b/benchmarks/_templates/stdout/dev.yaml
@@ -3,7 +3,8 @@ _template:
   definition: .
install-variant: unpinned install_group: torch - + tags: + - monogpu #argv: # --train_batch_size: 32 # --num_epochs: 5 diff --git a/benchmarks/_templates/voir/dev.yaml b/benchmarks/_templates/voir/dev.yaml index e3aa9467..affcc977 100644 --- a/benchmarks/_templates/voir/dev.yaml +++ b/benchmarks/_templates/voir/dev.yaml @@ -6,3 +6,5 @@ template: install_group: torch plan: method: per_gpu + tags: + - monogpu From a8415d3da9f91aa1ac23d932dff2c70fe580e556 Mon Sep 17 00:00:00 2001 From: Pierre Delaunay Date: Thu, 21 Nov 2024 14:35:55 -0500 Subject: [PATCH 13/20] Do not rely on DNS to resolve nodes --- milabench/remote.py | 2 +- milabench/system.py | 71 ++++++++------------------------------------- 2 files changed, 13 insertions(+), 60 deletions(-) diff --git a/milabench/remote.py b/milabench/remote.py index c92166fd..27660f75 100644 --- a/milabench/remote.py +++ b/milabench/remote.py @@ -192,7 +192,7 @@ def is_remote(pack): def is_main_local(pack): """Only the local main can send remote commands to remote""" self = pack.config["system"]["self"] - return self is not None and self["local"] and self.get("main", False) + return self is not None and self.get("local", True) and self.get("main", False) def is_worker(pack): diff --git a/milabench/system.py b/milabench/system.py index 9aa49975..bd8298c6 100644 --- a/milabench/system.py +++ b/milabench/system.py @@ -323,6 +323,7 @@ def get_remote_ip(): for interface, address_list in addresses.items(): for address in address_list: + # if address.family in (socket.AF_INET, socket.AF_INET6): if interface in stats and getattr(stats[interface], "isup"): result.append(address.address) @@ -341,46 +342,6 @@ def is_loopback(address: str) -> bool: -def _resolve_ip(ip): - hostname = ip - aliaslist = [] - ipaddrlist = [ip] - lazy_raise = None - - if not offline: - # Resolve the IP - try: - hostname, aliaslist, ipaddrlist = socket.gethostbyaddr(ip) - lazy_raise = None - - except socket.herror as err: - lazy_raise = err - - except socket.gaierror as err: - # Get Addr Info (GAI) Error - # - # When we are connecting to a node through a ssh proxy jump - # the node IPs/Hostnames are not available until we reach - # the first node inside the cluster - # - lazy_raise = err - - return hostname, aliaslist, ipaddrlist, lazy_raise - - -def _fix_weird(hostname): - if hostname.endswith(".server.mila.quebec.server.mila.quebec"): - print() - print("Hostname was extra long for no reason") - print(hostname, socket.gethostname()) - print() - - # why is this happening - hostname = hostname[: -len(".server.mila.quebec")] - - return hostname - - # If true that means we cannot resolve the ip addresses # so we ignore errors offline = True @@ -406,29 +367,21 @@ def _resolve_addresses(nodes): ip_list = get_remote_ip() for node in nodes: - hostname, aliaslist, ipaddrlist, lazy_raise = _resolve_ip(node["ip"]) - - hostname = _fix_weird(hostname) - - node["hostname"] = hostname - node["aliaslist"] = aliaslist - node["ipaddrlist"] = ipaddrlist - - is_local = ( - ("127.0.0.1" in ipaddrlist) - or (hostname in ("localhost", socket.gethostname(), "127.0.0.1")) - or (socket.gethostname().startswith(hostname)) - or len(ip_list.intersection(ipaddrlist)) > 0 - or any([is_loopback(ip) for ip in ipaddrlist]) - ) - - # cn-g005 cn-g005.server.mila.quebec - # print(hostname, socket.gethostname()) + ip = node["ip"] + + is_local = is_loopback(ip) + + if ip in ip_list: + is_local = True + node["local"] = is_local + + if is_local: + node["hostname"] = socket.gethostname() if is_local and self is None: self = 
node - node["ipaddrlist"] = list(set(list(ip_list) + list(ipaddrlist))) + node["ipaddrlist"] = list(set(list(ip_list))) # if self is node we might be outisde the cluster # which explains why we could not resolve the IP of the nodes From d5cbbf50dfffd05ccda970f6c3e12cd2599bf339 Mon Sep 17 00:00:00 2001 From: Pierre Delaunay Date: Thu, 21 Nov 2024 17:07:45 -0500 Subject: [PATCH 14/20] Update README --- README.md | 143 +++-- milabench/_version.py | 6 +- scripts/article/run_cuda.sh | 9 +- .../test_command_reg_one_node.txt | 604 ----------------- .../test_command_reg_two_nodes.txt | 607 ------------------ .../test_capabilities.py | 0 6 files changed, 90 insertions(+), 1279 deletions(-) delete mode 100644 tests/test_command_reg/test_command_reg_one_node.txt delete mode 100644 tests/test_command_reg/test_command_reg_two_nodes.txt rename tests/{ => test_validation}/test_capabilities.py (100%) diff --git a/README.md b/README.md index 52639893..163906d0 100644 --- a/README.md +++ b/README.md @@ -20,62 +20,23 @@ evaluating current and future hardware in a research environment. * Focussed on training * Ease of use * Pytorch focused -* ROCm & NVIDIA +* ROCm, NVIDIA, Intel OneAPI, Habana Gaudi (Synapse) * Independent ## Getting Started -The easiest way to run milabbench is to run it with one of its docker image. -It will include all of the necessary data - - - # Choose the image you want to use - export MILABENCH_IMAGE=ghcr.io/mila-iqia/milabench:cuda-nightly - - # Pull the image we are going to run - docker pull $MILABENCH_IMAGE - - # Run milabench - docker run -it --rm --ipc=host --gpus=all \ - -v $(pwd)/results:/milabench/envs/runs \ - $MILABENCH_IMAGE \ - bash -c "milabench prepare && milabench run" - - ================= - Benchmark results - ================= - fail n perf sem% std% peak_memory score weight - bert-fp16 0 8 155.08 0.3% 4.3% 24552 1241.260310 0.00 - bert-fp32 0 8 29.52 0.0% 0.5% 31524 236.337218 0.00 - bert-tf32 0 8 120.46 0.4% 6.1% 31524 964.713297 0.00 - bert-tf32-fp16 0 8 154.76 0.3% 4.1% 24552 1238.477257 3.00 - convnext_large-fp16 0 8 337.48 0.9% 14.0% 27658 2741.604444 0.00 - convnext_large-fp32 0 8 44.61 0.8% 12.6% 49786 354.207225 0.00 - convnext_large-tf32 0 8 135.99 0.7% 11.2% 49786 1089.394916 0.00 - convnext_large-tf32-fp16 0 8 338.58 0.8% 13.0% 27658 2744.325170 3.00 - davit_large 0 8 312.79 0.3% 6.7% 35058 2515.326450 1.00 - davit_large-multi 0 1 2401.65 1.0% 7.7% 42232 2401.651720 5.00 - dlrm 0 1 188777.20 1.8% 14.0% 3194 188777.203190 1.00 - focalnet 0 8 400.47 0.2% 5.4% 26604 3215.431924 2.00 - opt-1_3b 0 1 26.71 0.1% 0.4% 44116 26.714365 5.00 - opt-1_3b-multinode 0 2 34.62 0.2% 1.0% 43552 34.618292 10.00 - opt-6_7b 0 1 14.32 0.0% 0.1% 55750 14.319587 5.00 - opt-6_7b-multinode 0 2 10.79 0.1% 0.7% 49380 10.792595 10.00 - reformer 0 8 61.70 0.0% 0.9% 25376 494.110834 1.00 - regnet_y_128gf 0 8 99.96 0.2% 5.0% 31840 803.012507 2.00 - resnet152 0 8 710.18 0.3% 6.2% 36732 5710.828608 1.00 - resnet152-multi 0 1 5367.34 1.0% 8.1% 38638 5367.338469 5.00 - resnet50 0 8 984.43 0.9% 19.1% 5026 7927.257351 1.00 - rwkv 0 8 428.65 0.2% 3.8% 5546 3435.097716 1.00 - stargan 0 8 51.32 1.8% 40.8% 37848 413.238870 1.00 - super-slomo 0 8 41.63 0.1% 2.3% 34082 332.395065 1.00 - t5 0 8 48.05 0.2% 3.9% 35466 384.317023 2.00 - whisper 0 8 248.16 0.0% 0.6% 37006 1985.861017 1.00 - - Scores - ------ - Failure rate: 0.00% (PASS) - Score: 219.06 + + git clone https://github.com/mila-iqia/milabench.git + + pip install -e milabench + + export MILABENCH_GPU_ARCH=cuda + + milabench install --base 
workspace --config milabench/config/standard.yaml --select fp32 + + milabench prepare --base workspace --config milabench/config/standard.yaml --select fp32 + + milabench run --base workspace --config milabench/config/standard.yaml --select fp32 ## Details @@ -84,13 +45,77 @@ The benchmark suite has been validated on the following configurations: | Python version | GPU | Configuration file | | - | - | - | -| 3.10 (conda) | 2 node x 8xNVIDIA A100 80GB | config/standard.yaml | -| 3.9.12 (conda) | 8x NVIDIA RTX8000 48GB | config/standard.yaml | -| 3.9.16 (conda) | 2x NVIDIA K80 | config/ci.yaml | -| 3.9.16 (conda) | 2x AMD MI100 | config/ci.yaml | -| 3.9.16 (conda) | 4x AMD MI250 | config/standard.yaml | +| 3.10 | 2 node x 8xNVIDIA A100 80GB | config/standard.yaml | +| 3.10 | 2 node x 8xMI300X | config/standard.yaml | +| 3.10 | 1 node x 8xGaudi2 | config/standard.yaml | We are working on validating it on more configurations and will update the above table as we do. - - +## Report + + ================= + Benchmark results + ================= + + System + ------ + cpu: AMD EPYC 7742 64-Core Processor + n_cpu: 128 + product: NVIDIA A100-SXM4-80GB + n_gpu: 8 + memory: 81920.0 + + Breakdown + --------- + bench | fail | n | ngpu | perf | sem% | std% | peak_memory | score | weight + brax | 0 | 1 | 8 | 730035.71 | 0.1% | 0.4% | 2670 | 730035.71 | 1.00 + diffusion-gpus | 0 | 1 | 8 | 117.67 | 1.5% | 11.7% | 59944 | 117.67 | 1.00 + diffusion-single | 0 | 8 | 1 | 25.02 | 0.8% | 17.9% | 53994 | 202.10 | 1.00 + dimenet | 0 | 8 | 1 | 366.85 | 0.7% | 16.2% | 2302 | 2973.32 | 1.00 + dinov2-giant-gpus | 0 | 1 | 8 | 445.68 | 0.4% | 3.0% | 69614 | 445.68 | 1.00 + dinov2-giant-single | 0 | 8 | 1 | 53.54 | 0.4% | 9.5% | 74646 | 432.65 | 1.00 + dqn | 0 | 8 | 1 | 23089954554.91 | 1.1% | 89.9% | 62106 | 184480810548.20 | 1.00 + bf16 | 0 | 8 | 1 | 293.43 | 0.2% | 6.3% | 1788 | 2361.16 | 0.00 + fp16 | 0 | 8 | 1 | 289.26 | 0.1% | 3.6% | 1788 | 2321.65 | 0.00 + fp32 | 0 | 8 | 1 | 19.14 | 0.0% | 0.7% | 2166 | 153.21 | 0.00 + tf32 | 0 | 8 | 1 | 146.63 | 0.1% | 3.6% | 2166 | 1177.04 | 0.00 + bert-fp16 | 0 | 8 | 1 | 263.73 | 1.1% | 16.7% | nan | 2165.37 | 0.00 + bert-fp32 | 0 | 8 | 1 | 44.84 | 0.6% | 9.6% | 21170 | 364.52 | 0.00 + bert-tf32 | 0 | 8 | 1 | 141.95 | 0.9% | 14.1% | 1764 | 1162.94 | 0.00 + bert-tf32-fp16 | 0 | 8 | 1 | 265.04 | 1.0% | 15.6% | nan | 2175.59 | 3.00 + reformer | 0 | 8 | 1 | 62.29 | 0.3% | 6.0% | 25404 | 501.89 | 1.00 + t5 | 0 | 8 | 1 | 51.40 | 0.5% | 9.9% | 34390 | 416.14 | 2.00 + whisper | 0 | 8 | 1 | 481.95 | 1.0% | 21.4% | 8520 | 3897.53 | 1.00 + lightning | 0 | 8 | 1 | 680.22 | 1.0% | 22.7% | 27360 | 5506.90 | 1.00 + lightning-gpus | 0 | 1 | 8 | 3504.74 | 7.9% | 62.9% | 28184 | 3504.74 | 1.00 + llava-single | 1 | 8 | 1 | 2.28 | 0.4% | 9.6% | 72556 | 14.12 | 1.00 + llama | 0 | 8 | 1 | 484.86 | 4.4% | 80.0% | 27820 | 3680.86 | 1.00 + llm-full-mp-gpus | 0 | 1 | 8 | 193.92 | 3.1% | 16.2% | 48470 | 193.92 | 1.00 + llm-lora-ddp-gpus | 0 | 1 | 8 | 16738.58 | 0.4% | 2.0% | 36988 | 16738.58 | 1.00 + llm-lora-mp-gpus | 0 | 1 | 8 | 1980.63 | 2.2% | 11.8% | 55972 | 1980.63 | 1.00 + llm-lora-single | 0 | 8 | 1 | 2724.95 | 0.2% | 3.0% | 49926 | 21861.99 | 1.00 + ppo | 0 | 8 | 1 | 3114264.32 | 1.6% | 57.2% | 62206 | 24915954.98 | 1.00 + recursiongfn | 0 | 8 | 1 | 7080.67 | 1.2% | 27.1% | 10292 | 57038.34 | 1.00 + rlhf-gpus | 0 | 1 | 8 | 6314.94 | 2.1% | 11.2% | 21730 | 6314.94 | 1.00 + rlhf-single | 0 | 8 | 1 | 1143.72 | 0.4% | 8.4% | 19566 | 9174.52 | 1.00 + focalnet | 0 | 8 | 1 | 375.07 | 0.7% | 14.9% | 23536 
| 3038.83 | 2.00 + torchatari | 0 | 8 | 1 | 5848.88 | 0.6% | 12.7% | 3834 | 46613.34 | 1.00 + convnext_large-fp16 | 0 | 8 | 1 | 330.93 | 1.5% | 22.9% | 27376 | 2711.46 | 0.00 + convnext_large-fp32 | 0 | 8 | 1 | 59.49 | 0.6% | 9.8% | 55950 | 483.84 | 0.00 + convnext_large-tf32 | 0 | 8 | 1 | 155.41 | 0.9% | 14.3% | 49650 | 1273.31 | 0.00 + convnext_large-tf32-fp16 | 0 | 8 | 1 | 322.28 | 1.6% | 24.5% | 27376 | 2637.88 | 3.00 + regnet_y_128gf | 0 | 8 | 1 | 119.46 | 0.5% | 10.0% | 29762 | 966.96 | 2.00 + resnet152-ddp-gpus | 0 | 1 | 8 | 3843.06 | 5.2% | 39.3% | 27980 | 3843.06 | 0.00 + resnet50 | 0 | 8 | 1 | 932.95 | 2.4% | 52.2% | 14848 | 7524.25 | 1.00 + resnet50-noio | 0 | 8 | 1 | 1163.88 | 0.3% | 6.7% | 27480 | 9385.35 | 0.00 + vjepa-gpus | 0 | 1 | 8 | 130.13 | 5.9% | 46.8% | 64244 | 130.13 | 1.00 + vjepa-single | 0 | 8 | 1 | 21.29 | 1.0% | 22.4% | 58552 | 172.11 | 1.00 + + Scores + ------ + Failure rate: 0.38% (PASS) + Score: 4175.57 + + Errors + ------ + 1 errors, details in HTML report. \ No newline at end of file diff --git a/milabench/_version.py b/milabench/_version.py index a3f4e1b4..281e1d0a 100644 --- a/milabench/_version.py +++ b/milabench/_version.py @@ -1,5 +1,5 @@ """This file is generated, do not modify""" -__tag__ = "v1.0.0_RC1-18-g784b38e" -__commit__ = "784b38e77b90116047e3de893c22c2f7d3225179" -__date__ = "2024-10-18 15:58:46 +0000" +__tag__ = "v0.1.0-146-ga8415d3" +__commit__ = "a8415d3da9f91aa1ac23d932dff2c70fe580e556" +__date__ = "2024-11-21 14:35:55 -0500" diff --git a/scripts/article/run_cuda.sh b/scripts/article/run_cuda.sh index 0c2c1dae..9ef13b7d 100644 --- a/scripts/article/run_cuda.sh +++ b/scripts/article/run_cuda.sh @@ -84,15 +84,12 @@ if [ "$MILABENCH_PREPARE" -eq 0 ]; then . $MILABENCH_WORDIR/env/bin/activate - - # pip install torch # milabench pin --variant cuda --from-scratch # rm -rf $MILABENCH_WORDIR/results/venv/ - rm -rf $MILABENCH_WORDIR/results/extra - - milabench install --system $MILABENCH_WORDIR/system.yaml - milabench prepare --system $MILABENCH_WORDIR/system.yaml $ARGS + # rm -rf $MILABENCH_WORDIR/results/extra + # milabench install --system $MILABENCH_WORDIR/system.yaml + # milabench prepare --system $MILABENCH_WORDIR/system.yaml $ARGS ( . 
$BENCHMARK_VENV/bin/activate diff --git a/tests/test_command_reg/test_command_reg_one_node.txt b/tests/test_command_reg/test_command_reg_one_node.txt deleted file mode 100644 index 3a511bb6..00000000 --- a/tests/test_command_reg/test_command_reg_one_node.txt +++ /dev/null @@ -1,604 +0,0 @@ -#!/bin/sh - -echo "---" -echo "Virtual Env" -echo "===========" -export VIRTUAL_ENV=$BASE/venv/torch - -source $VIRTUAL_ENV/bin/activate -echo "---" -echo "Milabench" -echo "=========" -export MILABENCH_DIR_BASE=$BASE -export MILABENCH_DIR_VENV=$BASE/venv/torch -export MILABENCH_DIR_DATA=$BASE/data -export MILABENCH_DIR_RUNS=$BASE/runs -export MILABENCH_DIR_EXTRA=$BASE/extra/llm -export MILABENCH_DIR_CACHE=$BASE/cache -export OMP_NUM_THREADS=0 -export MILABENCH_CONFIG='{"system": {"arch": "cuda", "sshkey": null, "nodes": [{"ip": "127.0.0.1", "main": true, "name": "0", "sshport": 22, "user": "username", "hostname": "127.0.0.1"}], "self": {"ip": "127.0.0.1", "main": true, "name": "0", "sshport": 22, "user": "username", "hostname": "127.0.0.1"}}, "dirs": {"base": "$BASE", "venv": "$BASE/venv/torch", "data": "$BASE/data", "runs": "$BASE/runs", "extra": "$BASE/extra/llm", "cache": "$BASE/cache"}, "group": "llm", "install_group": "torch", "install_variant": "cuda", "run_name": "dev", "enabled": true, "capabilities": {"nodes": 1}, "max_duration": 800, "voir": {"options": {"stop": 30, "interval": "1s"}}, "validation": {"usage": {"gpu_load_threshold": 0.5, "gpu_mem_threshold": 0.5}}, "config_base": "$SRC/milabench/config", "config_file": "$SRC/milabench/config/standard.yaml", "definition": "$SRC/milabench/benchmarks/llama", "tags": ["inference", "llm", "monogpu", "nlp", "nobatch"], "plan": {"method": "per_gpu"}, "weight": 1.0, "name": "llama", "tag": ["llama"]}' - -echo "---" -echo "llama" -echo "=====" -time ( - CUDA_VISIBLE_DEVICES=0 python $SRC/milabench/benchmarks/llama/main.py --cache $BASE/cache & - CUDA_VISIBLE_DEVICES=1 python $SRC/milabench/benchmarks/llama/main.py --cache $BASE/cache & - CUDA_VISIBLE_DEVICES=2 python $SRC/milabench/benchmarks/llama/main.py --cache $BASE/cache & - CUDA_VISIBLE_DEVICES=3 python $SRC/milabench/benchmarks/llama/main.py --cache $BASE/cache & - CUDA_VISIBLE_DEVICES=4 python $SRC/milabench/benchmarks/llama/main.py --cache $BASE/cache & - CUDA_VISIBLE_DEVICES=5 python $SRC/milabench/benchmarks/llama/main.py --cache $BASE/cache & - CUDA_VISIBLE_DEVICES=6 python $SRC/milabench/benchmarks/llama/main.py --cache $BASE/cache & - CUDA_VISIBLE_DEVICES=7 python $SRC/milabench/benchmarks/llama/main.py --cache $BASE/cache & - wait -) - -echo "---" -echo "fp16" -echo "====" -time ( - CUDA_VISIBLE_DEVICES=0 $SRC/milabench/milabench/scripts/activator $BASE/venv/torch $BASE/cache $SRC/milabench/benchmarks/flops/main.py --number 30 --repeat 90 --m 8192 --n 8192 --dtype fp16 & - CUDA_VISIBLE_DEVICES=1 $SRC/milabench/milabench/scripts/activator $BASE/venv/torch $BASE/cache $SRC/milabench/benchmarks/flops/main.py --number 30 --repeat 90 --m 8192 --n 8192 --dtype fp16 & - CUDA_VISIBLE_DEVICES=2 $SRC/milabench/milabench/scripts/activator $BASE/venv/torch $BASE/cache $SRC/milabench/benchmarks/flops/main.py --number 30 --repeat 90 --m 8192 --n 8192 --dtype fp16 & - CUDA_VISIBLE_DEVICES=3 $SRC/milabench/milabench/scripts/activator $BASE/venv/torch $BASE/cache $SRC/milabench/benchmarks/flops/main.py --number 30 --repeat 90 --m 8192 --n 8192 --dtype fp16 & - CUDA_VISIBLE_DEVICES=4 $SRC/milabench/milabench/scripts/activator $BASE/venv/torch $BASE/cache $SRC/milabench/benchmarks/flops/main.py --number 30 
--repeat 90 --m 8192 --n 8192 --dtype fp16 & - CUDA_VISIBLE_DEVICES=5 $SRC/milabench/milabench/scripts/activator $BASE/venv/torch $BASE/cache $SRC/milabench/benchmarks/flops/main.py --number 30 --repeat 90 --m 8192 --n 8192 --dtype fp16 & - CUDA_VISIBLE_DEVICES=6 $SRC/milabench/milabench/scripts/activator $BASE/venv/torch $BASE/cache $SRC/milabench/benchmarks/flops/main.py --number 30 --repeat 90 --m 8192 --n 8192 --dtype fp16 & - CUDA_VISIBLE_DEVICES=7 $SRC/milabench/milabench/scripts/activator $BASE/venv/torch $BASE/cache $SRC/milabench/benchmarks/flops/main.py --number 30 --repeat 90 --m 8192 --n 8192 --dtype fp16 & - wait -) - -echo "---" -echo "bf16" -echo "====" -time ( - CUDA_VISIBLE_DEVICES=0 $SRC/milabench/milabench/scripts/activator $BASE/venv/torch $BASE/cache $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype bf16 & - CUDA_VISIBLE_DEVICES=1 $SRC/milabench/milabench/scripts/activator $BASE/venv/torch $BASE/cache $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype bf16 & - CUDA_VISIBLE_DEVICES=2 $SRC/milabench/milabench/scripts/activator $BASE/venv/torch $BASE/cache $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype bf16 & - CUDA_VISIBLE_DEVICES=3 $SRC/milabench/milabench/scripts/activator $BASE/venv/torch $BASE/cache $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype bf16 & - CUDA_VISIBLE_DEVICES=4 $SRC/milabench/milabench/scripts/activator $BASE/venv/torch $BASE/cache $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype bf16 & - CUDA_VISIBLE_DEVICES=5 $SRC/milabench/milabench/scripts/activator $BASE/venv/torch $BASE/cache $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype bf16 & - CUDA_VISIBLE_DEVICES=6 $SRC/milabench/milabench/scripts/activator $BASE/venv/torch $BASE/cache $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype bf16 & - CUDA_VISIBLE_DEVICES=7 $SRC/milabench/milabench/scripts/activator $BASE/venv/torch $BASE/cache $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype bf16 & - wait -) - -echo "---" -echo "tf32" -echo "====" -time ( - CUDA_VISIBLE_DEVICES=0 $SRC/milabench/milabench/scripts/activator $BASE/venv/torch $BASE/cache $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype fp32 --tf32 & - CUDA_VISIBLE_DEVICES=1 $SRC/milabench/milabench/scripts/activator $BASE/venv/torch $BASE/cache $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype fp32 --tf32 & - CUDA_VISIBLE_DEVICES=2 $SRC/milabench/milabench/scripts/activator $BASE/venv/torch $BASE/cache $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype fp32 --tf32 & - CUDA_VISIBLE_DEVICES=3 $SRC/milabench/milabench/scripts/activator $BASE/venv/torch $BASE/cache $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype fp32 --tf32 & - CUDA_VISIBLE_DEVICES=4 $SRC/milabench/milabench/scripts/activator $BASE/venv/torch $BASE/cache $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype fp32 --tf32 & - CUDA_VISIBLE_DEVICES=5 $SRC/milabench/milabench/scripts/activator $BASE/venv/torch $BASE/cache $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype fp32 --tf32 & - CUDA_VISIBLE_DEVICES=6 
$SRC/milabench/milabench/scripts/activator $BASE/venv/torch $BASE/cache $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype fp32 --tf32 &
-    CUDA_VISIBLE_DEVICES=7 $SRC/milabench/milabench/scripts/activator $BASE/venv/torch $BASE/cache $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype fp32 --tf32 &
-    wait
-)
-
-echo "---"
-echo "fp32"
-echo "===="
-time (
-    CUDA_VISIBLE_DEVICES=0 $SRC/milabench/milabench/scripts/activator $BASE/venv/torch $BASE/cache $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype fp32 &
-    CUDA_VISIBLE_DEVICES=1 $SRC/milabench/milabench/scripts/activator $BASE/venv/torch $BASE/cache $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype fp32 &
-    CUDA_VISIBLE_DEVICES=2 $SRC/milabench/milabench/scripts/activator $BASE/venv/torch $BASE/cache $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype fp32 &
-    CUDA_VISIBLE_DEVICES=3 $SRC/milabench/milabench/scripts/activator $BASE/venv/torch $BASE/cache $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype fp32 &
-    CUDA_VISIBLE_DEVICES=4 $SRC/milabench/milabench/scripts/activator $BASE/venv/torch $BASE/cache $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype fp32 &
-    CUDA_VISIBLE_DEVICES=5 $SRC/milabench/milabench/scripts/activator $BASE/venv/torch $BASE/cache $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype fp32 &
-    CUDA_VISIBLE_DEVICES=6 $SRC/milabench/milabench/scripts/activator $BASE/venv/torch $BASE/cache $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype fp32 &
-    CUDA_VISIBLE_DEVICES=7 $SRC/milabench/milabench/scripts/activator $BASE/venv/torch $BASE/cache $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype fp32 &
-    wait
-)
-
-echo "---"
-echo "resnet50"
-echo "========"
-time (
-    CUDA_VISIBLE_DEVICES=0 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model resnet50 --batch-size 256 &
-    CUDA_VISIBLE_DEVICES=1 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model resnet50 --batch-size 256 &
-    CUDA_VISIBLE_DEVICES=2 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model resnet50 --batch-size 256 &
-    CUDA_VISIBLE_DEVICES=3 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model resnet50 --batch-size 256 &
-    CUDA_VISIBLE_DEVICES=4 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model resnet50 --batch-size 256 &
-    CUDA_VISIBLE_DEVICES=5 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model resnet50 --batch-size 256 &
-    CUDA_VISIBLE_DEVICES=6 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model resnet50 --batch-size 256 &
-    CUDA_VISIBLE_DEVICES=7 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model resnet50 --batch-size 256 &
-    wait
-)
-
-echo "---"
-echo "resnet50-noio"
-echo "============="
-time (
-    CUDA_VISIBLE_DEVICES=0 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader synthetic_fixed --data $BASE/data/FakeImageNet --model resnet50 --batch-size 256 &
-    CUDA_VISIBLE_DEVICES=1 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader synthetic_fixed --data $BASE/data/FakeImageNet --model resnet50 --batch-size 256 &
-    CUDA_VISIBLE_DEVICES=2 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader synthetic_fixed --data $BASE/data/FakeImageNet --model resnet50 --batch-size 256 &
-    CUDA_VISIBLE_DEVICES=3 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader synthetic_fixed --data $BASE/data/FakeImageNet --model resnet50 --batch-size 256 &
-    CUDA_VISIBLE_DEVICES=4 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader synthetic_fixed --data $BASE/data/FakeImageNet --model resnet50 --batch-size 256 &
-    CUDA_VISIBLE_DEVICES=5 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader synthetic_fixed --data $BASE/data/FakeImageNet --model resnet50 --batch-size 256 &
-    CUDA_VISIBLE_DEVICES=6 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader synthetic_fixed --data $BASE/data/FakeImageNet --model resnet50 --batch-size 256 &
-    CUDA_VISIBLE_DEVICES=7 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader synthetic_fixed --data $BASE/data/FakeImageNet --model resnet50 --batch-size 256 &
-    wait
-)
-
-echo "---"
-echo "resnet152-ddp-gpus"
-echo "=================="
-time (
-    $SRC/milabench/milabench/scripts/activator $BASE/venv/torch $BASE/cache $SRC/milabench/benchmarks/torchvision_ddp/main.py --epochs 10 --num-workers 8 --loader torch --data $BASE/data/FakeImageNet --model resnet152 --batch-size 256 &
-    wait
-)
-
-echo "---"
-echo "convnext_large-fp32"
-echo "==================="
-time (
-    CUDA_VISIBLE_DEVICES=0 python $SRC/milabench/benchmarks/torchvision/main.py --precision fp32 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 &
-    CUDA_VISIBLE_DEVICES=1 python $SRC/milabench/benchmarks/torchvision/main.py --precision fp32 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 &
-    CUDA_VISIBLE_DEVICES=2 python $SRC/milabench/benchmarks/torchvision/main.py --precision fp32 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 &
-    CUDA_VISIBLE_DEVICES=3 python $SRC/milabench/benchmarks/torchvision/main.py --precision fp32 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 &
-    CUDA_VISIBLE_DEVICES=4 python $SRC/milabench/benchmarks/torchvision/main.py --precision fp32 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 &
-    CUDA_VISIBLE_DEVICES=5 python $SRC/milabench/benchmarks/torchvision/main.py --precision fp32 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 &
-    CUDA_VISIBLE_DEVICES=6 python $SRC/milabench/benchmarks/torchvision/main.py --precision fp32 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 &
-    CUDA_VISIBLE_DEVICES=7 python $SRC/milabench/benchmarks/torchvision/main.py --precision fp32 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 &
-    wait
-)
-
-echo "---"
-echo "convnext_large-fp16"
-echo "==================="
-time (
-    CUDA_VISIBLE_DEVICES=0 python $SRC/milabench/benchmarks/torchvision/main.py --precision fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 &
-    CUDA_VISIBLE_DEVICES=1 python $SRC/milabench/benchmarks/torchvision/main.py --precision fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 &
-    CUDA_VISIBLE_DEVICES=2 python $SRC/milabench/benchmarks/torchvision/main.py --precision fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 &
-    CUDA_VISIBLE_DEVICES=3 python $SRC/milabench/benchmarks/torchvision/main.py --precision fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 &
-    CUDA_VISIBLE_DEVICES=4 python $SRC/milabench/benchmarks/torchvision/main.py --precision fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 &
-    CUDA_VISIBLE_DEVICES=5 python $SRC/milabench/benchmarks/torchvision/main.py --precision fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 &
-    CUDA_VISIBLE_DEVICES=6 python $SRC/milabench/benchmarks/torchvision/main.py --precision fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 &
-    CUDA_VISIBLE_DEVICES=7 python $SRC/milabench/benchmarks/torchvision/main.py --precision fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 &
-    wait
-)
-
-echo "---"
-echo "convnext_large-tf32"
-echo "==================="
-time (
-    CUDA_VISIBLE_DEVICES=0 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 &
-    CUDA_VISIBLE_DEVICES=1 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 &
-    CUDA_VISIBLE_DEVICES=2 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 &
-    CUDA_VISIBLE_DEVICES=3 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 &
-    CUDA_VISIBLE_DEVICES=4 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 &
-    CUDA_VISIBLE_DEVICES=5 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 &
-    CUDA_VISIBLE_DEVICES=6 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 &
-    CUDA_VISIBLE_DEVICES=7 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 &
-    wait
-)
-
-echo "---"
-echo "convnext_large-tf32-fp16"
-echo "========================"
-time (
-    CUDA_VISIBLE_DEVICES=0 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 &
-    CUDA_VISIBLE_DEVICES=1 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 &
-    CUDA_VISIBLE_DEVICES=2 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 &
-    CUDA_VISIBLE_DEVICES=3 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 &
-    CUDA_VISIBLE_DEVICES=4 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 &
-    CUDA_VISIBLE_DEVICES=5 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 &
-    CUDA_VISIBLE_DEVICES=6 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 &
-    CUDA_VISIBLE_DEVICES=7 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 &
-    wait
-)
-
-echo "---"
-echo "regnet_y_128gf"
-echo "=============="
-time (
-    CUDA_VISIBLE_DEVICES=0 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model regnet_y_128gf --batch-size 64 &
-    CUDA_VISIBLE_DEVICES=1 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model regnet_y_128gf --batch-size 64 &
-    CUDA_VISIBLE_DEVICES=2 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model regnet_y_128gf --batch-size 64 &
-    CUDA_VISIBLE_DEVICES=3 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model regnet_y_128gf --batch-size 64 &
-    CUDA_VISIBLE_DEVICES=4 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model regnet_y_128gf --batch-size 64 &
-    CUDA_VISIBLE_DEVICES=5 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model regnet_y_128gf --batch-size 64 &
-    CUDA_VISIBLE_DEVICES=6 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model regnet_y_128gf --batch-size 64 &
-    CUDA_VISIBLE_DEVICES=7 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model regnet_y_128gf --batch-size 64 &
-    wait
-)
-
-echo "---"
-echo "bert-fp32"
-echo "========="
-time (
-    CUDA_VISIBLE_DEVICES=0 python -m bench --precision fp32 --num-workers 8 --model Bert --batch-size 32 &
-    CUDA_VISIBLE_DEVICES=1 python -m bench --precision fp32 --num-workers 8 --model Bert --batch-size 32 &
-    CUDA_VISIBLE_DEVICES=2 python -m bench --precision fp32 --num-workers 8 --model Bert --batch-size 32 &
-    CUDA_VISIBLE_DEVICES=3 python -m bench --precision fp32 --num-workers 8 --model Bert --batch-size 32 &
-    CUDA_VISIBLE_DEVICES=4 python -m bench --precision fp32 --num-workers 8 --model Bert --batch-size 32 &
-    CUDA_VISIBLE_DEVICES=5 python -m bench --precision fp32 --num-workers 8 --model Bert --batch-size 32 &
-    CUDA_VISIBLE_DEVICES=6 python -m bench --precision fp32 --num-workers 8 --model Bert --batch-size 32 &
-    CUDA_VISIBLE_DEVICES=7 python -m bench --precision fp32 --num-workers 8 --model Bert --batch-size 32 &
-    wait
-)
-
-echo "---"
-echo "bert-fp16"
-echo "========="
-time (
-    CUDA_VISIBLE_DEVICES=0 python -m bench --precision fp16 --num-workers 8 --model Bert --batch-size 32 &
-    CUDA_VISIBLE_DEVICES=1 python -m bench --precision fp16 --num-workers 8 --model Bert --batch-size 32 &
-    CUDA_VISIBLE_DEVICES=2 python -m bench --precision fp16 --num-workers 8 --model Bert --batch-size 32 &
-    CUDA_VISIBLE_DEVICES=3 python -m bench --precision fp16 --num-workers 8 --model Bert --batch-size 32 &
-    CUDA_VISIBLE_DEVICES=4 python -m bench --precision fp16 --num-workers 8 --model Bert --batch-size 32 &
-    CUDA_VISIBLE_DEVICES=5 python -m bench --precision fp16 --num-workers 8 --model Bert --batch-size 32 &
-    CUDA_VISIBLE_DEVICES=6 python -m bench --precision fp16 --num-workers 8 --model Bert --batch-size 32 &
-    CUDA_VISIBLE_DEVICES=7 python -m bench --precision fp16 --num-workers 8 --model Bert --batch-size 32 &
-    wait
-)
-
-echo "---"
-echo "bert-tf32"
-echo "========="
-time (
-    CUDA_VISIBLE_DEVICES=0 python -m bench --precision tf32 --num-workers 8 --model Bert --batch-size 32 &
-    CUDA_VISIBLE_DEVICES=1 python -m bench --precision tf32 --num-workers 8 --model Bert --batch-size 32 &
-    CUDA_VISIBLE_DEVICES=2 python -m bench --precision tf32 --num-workers 8 --model Bert --batch-size 32 &
-    CUDA_VISIBLE_DEVICES=3 python -m bench --precision tf32 --num-workers 8 --model Bert --batch-size 32 &
-    CUDA_VISIBLE_DEVICES=4 python -m bench --precision tf32 --num-workers 8 --model Bert --batch-size 32 &
-    CUDA_VISIBLE_DEVICES=5 python -m bench --precision tf32 --num-workers 8 --model Bert --batch-size 32 &
-    CUDA_VISIBLE_DEVICES=6 python -m bench --precision tf32 --num-workers 8 --model Bert --batch-size 32 &
-    CUDA_VISIBLE_DEVICES=7 python -m bench --precision tf32 --num-workers 8 --model Bert --batch-size 32 &
-    wait
-)
-
-echo "---"
-echo "bert-tf32-fp16"
-echo "=============="
-time (
-    CUDA_VISIBLE_DEVICES=0 python -m bench --precision tf32-fp16 --num-workers 8 --model Bert --batch-size 32 &
-    CUDA_VISIBLE_DEVICES=1 python -m bench --precision tf32-fp16 --num-workers 8 --model Bert --batch-size 32 &
-    CUDA_VISIBLE_DEVICES=2 python -m bench --precision tf32-fp16 --num-workers 8 --model Bert --batch-size 32 &
-    CUDA_VISIBLE_DEVICES=3 python -m bench --precision tf32-fp16 --num-workers 8 --model Bert --batch-size 32 &
-    CUDA_VISIBLE_DEVICES=4 python -m bench --precision tf32-fp16 --num-workers 8 --model Bert --batch-size 32 &
-    CUDA_VISIBLE_DEVICES=5 python -m bench --precision tf32-fp16 --num-workers 8 --model Bert --batch-size 32 &
-    CUDA_VISIBLE_DEVICES=6 python -m bench --precision tf32-fp16 --num-workers 8 --model Bert --batch-size 32 &
-    CUDA_VISIBLE_DEVICES=7 python -m bench --precision tf32-fp16 --num-workers 8 --model Bert --batch-size 32 &
-    wait
-)
-
-echo "---"
-echo "t5"
-echo "=="
-time (
-    CUDA_VISIBLE_DEVICES=0 python -m bench --precision tf32-fp16 --num-workers 8 --model T5 --batch-size 16 &
-    CUDA_VISIBLE_DEVICES=1 python -m bench --precision tf32-fp16 --num-workers 8 --model T5 --batch-size 16 &
-    CUDA_VISIBLE_DEVICES=2 python -m bench --precision tf32-fp16 --num-workers 8 --model T5 --batch-size 16 &
-    CUDA_VISIBLE_DEVICES=3 python -m bench --precision tf32-fp16 --num-workers 8 --model T5 --batch-size 16 &
-    CUDA_VISIBLE_DEVICES=4 python -m bench --precision tf32-fp16 --num-workers 8 --model T5 --batch-size 16 &
-    CUDA_VISIBLE_DEVICES=5 python -m bench --precision tf32-fp16 --num-workers 8 --model T5 --batch-size 16 &
-    CUDA_VISIBLE_DEVICES=6 python -m bench --precision tf32-fp16 --num-workers 8 --model T5 --batch-size 16 &
-    CUDA_VISIBLE_DEVICES=7 python -m bench --precision tf32-fp16 --num-workers 8 --model T5 --batch-size 16 &
-    wait
-)
-
-echo "---"
-echo "reformer"
-echo "========"
-time (
-    CUDA_VISIBLE_DEVICES=0 python -m bench --precision tf32-fp16 --num-workers 8 --model Reformer --batch-size 64 &
-    CUDA_VISIBLE_DEVICES=1 python -m bench --precision tf32-fp16 --num-workers 8 --model Reformer --batch-size 64 &
-    CUDA_VISIBLE_DEVICES=2 python -m bench --precision tf32-fp16 --num-workers 8 --model Reformer --batch-size 64 &
-    CUDA_VISIBLE_DEVICES=3 python -m bench --precision tf32-fp16 --num-workers 8 --model Reformer --batch-size 64 &
-    CUDA_VISIBLE_DEVICES=4 python -m bench --precision tf32-fp16 --num-workers 8 --model Reformer --batch-size 64 &
-    CUDA_VISIBLE_DEVICES=5 python -m bench --precision tf32-fp16 --num-workers 8 --model Reformer --batch-size 64 &
-    CUDA_VISIBLE_DEVICES=6 python -m bench --precision tf32-fp16 --num-workers 8 --model Reformer --batch-size 64 &
-    CUDA_VISIBLE_DEVICES=7 python -m bench --precision tf32-fp16 --num-workers 8 --model Reformer --batch-size 64 &
-    wait
-)
-
-echo "---"
-echo "whisper"
-echo "======="
-time (
-    CUDA_VISIBLE_DEVICES=0 python -m bench --precision tf32-fp16 --num-workers 8 --model Whisper --batch-size 64 &
-    CUDA_VISIBLE_DEVICES=1 python -m bench --precision tf32-fp16 --num-workers 8 --model Whisper --batch-size 64 &
-    CUDA_VISIBLE_DEVICES=2 python -m bench --precision tf32-fp16 --num-workers 8 --model Whisper --batch-size 64 &
-    CUDA_VISIBLE_DEVICES=3 python -m bench --precision tf32-fp16 --num-workers 8 --model Whisper --batch-size 64 &
-    CUDA_VISIBLE_DEVICES=4 python -m bench --precision tf32-fp16 --num-workers 8 --model Whisper --batch-size 64 &
-    CUDA_VISIBLE_DEVICES=5 python -m bench --precision tf32-fp16 --num-workers 8 --model Whisper --batch-size 64 &
-    CUDA_VISIBLE_DEVICES=6 python -m bench --precision tf32-fp16 --num-workers 8 --model Whisper --batch-size 64 &
-    CUDA_VISIBLE_DEVICES=7 python -m bench --precision tf32-fp16 --num-workers 8 --model Whisper --batch-size 64 &
-    wait
-)
-
-echo "---"
-echo "focalnet"
-echo "========"
-time (
-    CUDA_VISIBLE_DEVICES=0 python $SRC/milabench/benchmarks/timm/pytorch-image-models/train.py --amp --amp-dtype bfloat16 --device cuda --val-split --data-dir $BASE/data --dataset FakeImageNet --workers 8 --model focalnet_base_lrf --output $BASE/extra/timm/dev/focalnet.D0 --checkpoint-hist 1 &
-    CUDA_VISIBLE_DEVICES=1 python $SRC/milabench/benchmarks/timm/pytorch-image-models/train.py --amp --amp-dtype bfloat16 --device cuda --val-split --data-dir $BASE/data --dataset FakeImageNet --workers 8 --model focalnet_base_lrf --output $BASE/extra/timm/dev/focalnet.D1 --checkpoint-hist 1 &
-    CUDA_VISIBLE_DEVICES=2 python $SRC/milabench/benchmarks/timm/pytorch-image-models/train.py --amp --amp-dtype bfloat16 --device cuda --val-split --data-dir $BASE/data --dataset FakeImageNet --workers 8 --model focalnet_base_lrf --output $BASE/extra/timm/dev/focalnet.D2 --checkpoint-hist 1 &
-    CUDA_VISIBLE_DEVICES=3 python $SRC/milabench/benchmarks/timm/pytorch-image-models/train.py --amp --amp-dtype bfloat16 --device cuda --val-split --data-dir $BASE/data --dataset FakeImageNet --workers 8 --model focalnet_base_lrf --output $BASE/extra/timm/dev/focalnet.D3 --checkpoint-hist 1 &
-    CUDA_VISIBLE_DEVICES=4 python $SRC/milabench/benchmarks/timm/pytorch-image-models/train.py --amp --amp-dtype bfloat16 --device cuda --val-split --data-dir $BASE/data --dataset FakeImageNet --workers 8 --model focalnet_base_lrf --output $BASE/extra/timm/dev/focalnet.D4 --checkpoint-hist 1 &
-    CUDA_VISIBLE_DEVICES=5 python $SRC/milabench/benchmarks/timm/pytorch-image-models/train.py --amp --amp-dtype bfloat16 --device cuda --val-split --data-dir $BASE/data --dataset FakeImageNet --workers 8 --model focalnet_base_lrf --output $BASE/extra/timm/dev/focalnet.D5 --checkpoint-hist 1 &
-    CUDA_VISIBLE_DEVICES=6 python $SRC/milabench/benchmarks/timm/pytorch-image-models/train.py --amp --amp-dtype bfloat16 --device cuda --val-split --data-dir $BASE/data --dataset FakeImageNet --workers 8 --model focalnet_base_lrf --output $BASE/extra/timm/dev/focalnet.D6 --checkpoint-hist 1 &
-    CUDA_VISIBLE_DEVICES=7 python $SRC/milabench/benchmarks/timm/pytorch-image-models/train.py --amp --amp-dtype bfloat16 --device cuda --val-split --data-dir $BASE/data --dataset FakeImageNet --workers 8 --model focalnet_base_lrf --output $BASE/extra/timm/dev/focalnet.D7 --checkpoint-hist 1 &
-    wait
-)
-
-echo "---"
-echo "brax"
-echo "===="
-time (
-    python $SRC/milabench/benchmarks/brax/main.py --episode-length 20 --batch-size 1024 --num-minibatches 32 --num-envs 8192 &
-    wait
-)
-
-echo "---"
-echo "diffusion-single"
-echo "================"
-time (
-    CUDA_VISIBLE_DEVICES=0 $SRC/milabench/benchmarks/diffusion/main.py --num_epochs 5 --batch_size 32 --num_workers 8 --cache $BASE/cache &
-    CUDA_VISIBLE_DEVICES=1 $SRC/milabench/benchmarks/diffusion/main.py --num_epochs 5 --batch_size 32 --num_workers 8 --cache $BASE/cache &
-    CUDA_VISIBLE_DEVICES=2 $SRC/milabench/benchmarks/diffusion/main.py --num_epochs 5 --batch_size 32 --num_workers 8 --cache $BASE/cache &
-    CUDA_VISIBLE_DEVICES=3 $SRC/milabench/benchmarks/diffusion/main.py --num_epochs 5 --batch_size 32 --num_workers 8 --cache $BASE/cache &
-    CUDA_VISIBLE_DEVICES=4 $SRC/milabench/benchmarks/diffusion/main.py --num_epochs 5 --batch_size 32 --num_workers 8 --cache $BASE/cache &
-    CUDA_VISIBLE_DEVICES=5 $SRC/milabench/benchmarks/diffusion/main.py --num_epochs 5 --batch_size 32 --num_workers 8 --cache $BASE/cache &
-    CUDA_VISIBLE_DEVICES=6 $SRC/milabench/benchmarks/diffusion/main.py --num_epochs 5 --batch_size 32 --num_workers 8 --cache $BASE/cache &
-    CUDA_VISIBLE_DEVICES=7 $SRC/milabench/benchmarks/diffusion/main.py --num_epochs 5 --batch_size 32 --num_workers 8 --cache $BASE/cache &
-    wait
-)
-
-echo "---"
-echo "diffusion-gpus"
-echo "=============="
-time (
-    $SRC/milabench/milabench/scripts/activator $BASE/venv/torch $BASE/cache accelerate launch --mixed_precision=bf16 --dynamo_backend=no --machine_rank=0 --num_machines=1 --multi_gpu --gradient_accumulation_steps=1 --num_cpu_threads_per_process=4 --main_process_ip=127.0.0.1 --main_process_port=29400 --num_processes=8 $SRC/milabench/benchmarks/diffusion/main.py --num_epochs 5 --batch_size 32 --num_workers 8 --cache $BASE/cache &
-    wait
-)
-
-echo "---"
-echo "diffusion-nodes"
-echo "==============="
-time (
-    $SRC/milabench/milabench/scripts/activator $BASE/venv/torch $BASE/cache accelerate launch --mixed_precision=bf16 --dynamo_backend=no --machine_rank=0 --num_machines=1 --multi_gpu --gradient_accumulation_steps=1 --num_cpu_threads_per_process=4 --main_process_ip=127.0.0.1 --main_process_port=29400 --num_processes=8 $SRC/milabench/benchmarks/diffusion/main.py --num_epochs 5 --batch_size 32 --num_workers 8 --cache $BASE/cache &
-    wait
-)
-
-echo "---"
-echo "lightning"
-echo "========="
-time (
-    CUDA_VISIBLE_DEVICES=0 python $SRC/milabench/benchmarks/lightning/main.py --epochs 10 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model resnet152 --batch-size 256 &
-    CUDA_VISIBLE_DEVICES=1 python $SRC/milabench/benchmarks/lightning/main.py --epochs 10 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model resnet152 --batch-size 256 &
-    CUDA_VISIBLE_DEVICES=2 python $SRC/milabench/benchmarks/lightning/main.py --epochs 10 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model resnet152 --batch-size 256 &
-    CUDA_VISIBLE_DEVICES=3 python $SRC/milabench/benchmarks/lightning/main.py --epochs 10 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model resnet152 --batch-size 256 &
-    CUDA_VISIBLE_DEVICES=4 python $SRC/milabench/benchmarks/lightning/main.py --epochs 10 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model resnet152 --batch-size 256 &
-    CUDA_VISIBLE_DEVICES=5 python $SRC/milabench/benchmarks/lightning/main.py --epochs 10 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model resnet152 --batch-size 256 &
-    CUDA_VISIBLE_DEVICES=6 python $SRC/milabench/benchmarks/lightning/main.py --epochs 10 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model resnet152 --batch-size 256 &
-    CUDA_VISIBLE_DEVICES=7 python $SRC/milabench/benchmarks/lightning/main.py --epochs 10 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model resnet152 --batch-size 256 &
-    wait
-)
-
-echo "---"
-echo "lightning-gpus"
-echo "=============="
-time (
-    $BASE/venv/torch/bin/benchrun --nnodes=1 --rdzv-backend=static --rdzv-endpoint=127.0.0.1:29400 --master-addr=127.0.0.1 --master-port=29400 --local-ranks-filter=0 --nproc-per-node=8 --no-python -- python $SRC/milabench/benchmarks/lightning/main.py --epochs 10 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model resnet152 --batch-size 256 &
-    wait
-)
-
-echo "---"
-echo "dinov2-giant-single"
-echo "==================="
-time (
-    CUDA_VISIBLE_DEVICES=0 python $SRC/milabench/benchmarks/dinov2/main.py --output-dir $BASE/extra/dinov2-giant-single/output --no-resume --config-file $SRC/milabench/benchmarks/dinov2/src/dinov2/configs/train/vitg14.yaml train.dataset_path=ImageNet:split=TRAIN:root=$BASE/data/FakeImageNet:extra=$BASE/data/FakeImageNet train.batch_size_per_gpu=32 train.saveckp_freq=100 train.num_workers=10 &
-    CUDA_VISIBLE_DEVICES=1 python $SRC/milabench/benchmarks/dinov2/main.py --output-dir $BASE/extra/dinov2-giant-single/output --no-resume --config-file $SRC/milabench/benchmarks/dinov2/src/dinov2/configs/train/vitg14.yaml train.dataset_path=ImageNet:split=TRAIN:root=$BASE/data/FakeImageNet:extra=$BASE/data/FakeImageNet train.batch_size_per_gpu=32 train.saveckp_freq=100 train.num_workers=10 &
-    CUDA_VISIBLE_DEVICES=2 python $SRC/milabench/benchmarks/dinov2/main.py --output-dir $BASE/extra/dinov2-giant-single/output --no-resume --config-file $SRC/milabench/benchmarks/dinov2/src/dinov2/configs/train/vitg14.yaml train.dataset_path=ImageNet:split=TRAIN:root=$BASE/data/FakeImageNet:extra=$BASE/data/FakeImageNet train.batch_size_per_gpu=32 train.saveckp_freq=100 train.num_workers=10 &
-    CUDA_VISIBLE_DEVICES=3 python $SRC/milabench/benchmarks/dinov2/main.py --output-dir $BASE/extra/dinov2-giant-single/output --no-resume --config-file $SRC/milabench/benchmarks/dinov2/src/dinov2/configs/train/vitg14.yaml train.dataset_path=ImageNet:split=TRAIN:root=$BASE/data/FakeImageNet:extra=$BASE/data/FakeImageNet train.batch_size_per_gpu=32 train.saveckp_freq=100 train.num_workers=10 &
-    CUDA_VISIBLE_DEVICES=4 python $SRC/milabench/benchmarks/dinov2/main.py --output-dir $BASE/extra/dinov2-giant-single/output --no-resume --config-file $SRC/milabench/benchmarks/dinov2/src/dinov2/configs/train/vitg14.yaml train.dataset_path=ImageNet:split=TRAIN:root=$BASE/data/FakeImageNet:extra=$BASE/data/FakeImageNet train.batch_size_per_gpu=32 train.saveckp_freq=100 train.num_workers=10 &
-    CUDA_VISIBLE_DEVICES=5 python $SRC/milabench/benchmarks/dinov2/main.py --output-dir $BASE/extra/dinov2-giant-single/output --no-resume --config-file $SRC/milabench/benchmarks/dinov2/src/dinov2/configs/train/vitg14.yaml train.dataset_path=ImageNet:split=TRAIN:root=$BASE/data/FakeImageNet:extra=$BASE/data/FakeImageNet train.batch_size_per_gpu=32 train.saveckp_freq=100 train.num_workers=10 &
-    CUDA_VISIBLE_DEVICES=6 python $SRC/milabench/benchmarks/dinov2/main.py --output-dir $BASE/extra/dinov2-giant-single/output --no-resume --config-file $SRC/milabench/benchmarks/dinov2/src/dinov2/configs/train/vitg14.yaml train.dataset_path=ImageNet:split=TRAIN:root=$BASE/data/FakeImageNet:extra=$BASE/data/FakeImageNet train.batch_size_per_gpu=32 train.saveckp_freq=100 train.num_workers=10 &
-    CUDA_VISIBLE_DEVICES=7 python $SRC/milabench/benchmarks/dinov2/main.py --output-dir $BASE/extra/dinov2-giant-single/output --no-resume --config-file $SRC/milabench/benchmarks/dinov2/src/dinov2/configs/train/vitg14.yaml train.dataset_path=ImageNet:split=TRAIN:root=$BASE/data/FakeImageNet:extra=$BASE/data/FakeImageNet train.batch_size_per_gpu=32 train.saveckp_freq=100 train.num_workers=10 &
-    wait
-)
-
-echo "---"
-echo "dinov2-giant-gpus"
-echo "================="
-time (
-    $BASE/venv/torch/bin/benchrun --nnodes=1 --rdzv-backend=static --rdzv-endpoint=127.0.0.1:29400 --master-addr=127.0.0.1 --master-port=29400 --local-ranks-filter=0 --nproc-per-node=8 --no-python -- python $SRC/milabench/benchmarks/dinov2/main.py --output-dir $BASE/extra/dinov2-giant-gpus/output --no-resume --config-file $SRC/milabench/benchmarks/dinov2/src/dinov2/configs/train/vitg14.yaml train.dataset_path=ImageNet:split=TRAIN:root=$BASE/data/FakeImageNet:extra=$BASE/data/FakeImageNet train.batch_size_per_gpu=32 train.saveckp_freq=100 train.num_workers=10 &
-    wait
-)
-
-echo "---"
-echo "llm-lora-single"
-echo "==============="
-time (
-    CUDA_VISIBLE_DEVICES=0 $SRC/milabench/benchmarks/llm/recipes/lora_finetune_single_device.py --config $SRC/milabench/benchmarks/llm/configs/llama3_8B_lora_single_device.yaml epochs=1 output_dir=$BASE/extra/llm-lora-single/output tokenizer.path=$BASE/data/llama3_8B/original/tokenizer.model checkpointer.checkpoint_dir=$BASE/data/llama3_8B/original checkpointer.output_dir=$BASE/data/llama3_8B/ metric_logger.log_dir=$BASE/extra/llm-lora-single/metrics repo_id="meta-llama/Meta-Llama-3.1-8B" batch_size=8 gradient_accumulation_steps=8 &
-    CUDA_VISIBLE_DEVICES=1 $SRC/milabench/benchmarks/llm/recipes/lora_finetune_single_device.py --config $SRC/milabench/benchmarks/llm/configs/llama3_8B_lora_single_device.yaml epochs=1 output_dir=$BASE/extra/llm-lora-single/output tokenizer.path=$BASE/data/llama3_8B/original/tokenizer.model checkpointer.checkpoint_dir=$BASE/data/llama3_8B/original checkpointer.output_dir=$BASE/data/llama3_8B/ metric_logger.log_dir=$BASE/extra/llm-lora-single/metrics repo_id="meta-llama/Meta-Llama-3.1-8B" batch_size=8 gradient_accumulation_steps=8 &
-    CUDA_VISIBLE_DEVICES=2 $SRC/milabench/benchmarks/llm/recipes/lora_finetune_single_device.py --config $SRC/milabench/benchmarks/llm/configs/llama3_8B_lora_single_device.yaml epochs=1 output_dir=$BASE/extra/llm-lora-single/output tokenizer.path=$BASE/data/llama3_8B/original/tokenizer.model checkpointer.checkpoint_dir=$BASE/data/llama3_8B/original checkpointer.output_dir=$BASE/data/llama3_8B/ metric_logger.log_dir=$BASE/extra/llm-lora-single/metrics repo_id="meta-llama/Meta-Llama-3.1-8B" batch_size=8 gradient_accumulation_steps=8 &
-    CUDA_VISIBLE_DEVICES=3 $SRC/milabench/benchmarks/llm/recipes/lora_finetune_single_device.py --config $SRC/milabench/benchmarks/llm/configs/llama3_8B_lora_single_device.yaml epochs=1 output_dir=$BASE/extra/llm-lora-single/output tokenizer.path=$BASE/data/llama3_8B/original/tokenizer.model checkpointer.checkpoint_dir=$BASE/data/llama3_8B/original checkpointer.output_dir=$BASE/data/llama3_8B/ metric_logger.log_dir=$BASE/extra/llm-lora-single/metrics repo_id="meta-llama/Meta-Llama-3.1-8B" batch_size=8 gradient_accumulation_steps=8 &
-    CUDA_VISIBLE_DEVICES=4 $SRC/milabench/benchmarks/llm/recipes/lora_finetune_single_device.py --config $SRC/milabench/benchmarks/llm/configs/llama3_8B_lora_single_device.yaml epochs=1 output_dir=$BASE/extra/llm-lora-single/output tokenizer.path=$BASE/data/llama3_8B/original/tokenizer.model checkpointer.checkpoint_dir=$BASE/data/llama3_8B/original checkpointer.output_dir=$BASE/data/llama3_8B/ metric_logger.log_dir=$BASE/extra/llm-lora-single/metrics repo_id="meta-llama/Meta-Llama-3.1-8B" batch_size=8 gradient_accumulation_steps=8 &
-    CUDA_VISIBLE_DEVICES=5 $SRC/milabench/benchmarks/llm/recipes/lora_finetune_single_device.py --config $SRC/milabench/benchmarks/llm/configs/llama3_8B_lora_single_device.yaml epochs=1 output_dir=$BASE/extra/llm-lora-single/output tokenizer.path=$BASE/data/llama3_8B/original/tokenizer.model checkpointer.checkpoint_dir=$BASE/data/llama3_8B/original checkpointer.output_dir=$BASE/data/llama3_8B/ metric_logger.log_dir=$BASE/extra/llm-lora-single/metrics repo_id="meta-llama/Meta-Llama-3.1-8B" batch_size=8 gradient_accumulation_steps=8 &
-    CUDA_VISIBLE_DEVICES=6 $SRC/milabench/benchmarks/llm/recipes/lora_finetune_single_device.py --config $SRC/milabench/benchmarks/llm/configs/llama3_8B_lora_single_device.yaml epochs=1 output_dir=$BASE/extra/llm-lora-single/output tokenizer.path=$BASE/data/llama3_8B/original/tokenizer.model checkpointer.checkpoint_dir=$BASE/data/llama3_8B/original checkpointer.output_dir=$BASE/data/llama3_8B/ metric_logger.log_dir=$BASE/extra/llm-lora-single/metrics repo_id="meta-llama/Meta-Llama-3.1-8B" batch_size=8 gradient_accumulation_steps=8 &
-    CUDA_VISIBLE_DEVICES=7 $SRC/milabench/benchmarks/llm/recipes/lora_finetune_single_device.py --config $SRC/milabench/benchmarks/llm/configs/llama3_8B_lora_single_device.yaml epochs=1 output_dir=$BASE/extra/llm-lora-single/output tokenizer.path=$BASE/data/llama3_8B/original/tokenizer.model checkpointer.checkpoint_dir=$BASE/data/llama3_8B/original checkpointer.output_dir=$BASE/data/llama3_8B/ metric_logger.log_dir=$BASE/extra/llm-lora-single/metrics repo_id="meta-llama/Meta-Llama-3.1-8B" batch_size=8 gradient_accumulation_steps=8 &
-    wait
-)
-
-echo "---"
-echo "llm-lora-ddp-gpus"
-echo "================="
-time (
-    $BASE/venv/torch/bin/tune run --nnodes=1 --rdzv-backend=static --rdzv-endpoint=127.0.0.1:29400 --master-addr=127.0.0.1 --master-port=29400 --local-ranks-filter=0 --nproc-per-node=8 -- $SRC/milabench/benchmarks/llm/recipes/lora_finetune_distributed.py --config $SRC/milabench/benchmarks/llm/configs/llama3_8B_lora_single_device.yaml epochs=1 output_dir=$BASE/extra/llm-lora-ddp-gpus/output tokenizer.path=$BASE/data/llama3_8B/original/tokenizer.model checkpointer.checkpoint_dir=$BASE/data/llama3_8B/original checkpointer.output_dir=$BASE/data/llama3_8B/ metric_logger.log_dir=$BASE/extra/llm-lora-ddp-gpus/metrics repo_id="meta-llama/Meta-Llama-3.1-8B" batch_size=8 gradient_accumulation_steps=8 &
-    wait
-)
-
-echo "---"
-echo "llm-lora-ddp-nodes"
-echo "=================="
-time (
-    $BASE/venv/torch/bin/tune run --nnodes=1 --rdzv-backend=static --rdzv-endpoint=127.0.0.1:29400 --master-addr=127.0.0.1 --master-port=29400 --local-ranks-filter=0 --nproc-per-node=8 -- $SRC/milabench/benchmarks/llm/recipes/lora_finetune_distributed.py --config $SRC/milabench/benchmarks/llm/configs/llama3_8B_lora_single_device.yaml epochs=1 output_dir=$BASE/extra/llm-lora-ddp-nodes/output tokenizer.path=$BASE/data/llama3_8B/original/tokenizer.model checkpointer.checkpoint_dir=$BASE/data/llama3_8B/original checkpointer.output_dir=$BASE/data/llama3_8B/ metric_logger.log_dir=$BASE/extra/llm-lora-ddp-nodes/metrics repo_id="meta-llama/Meta-Llama-3.1-8B" batch_size=8 gradient_accumulation_steps=8 &
-    wait
-)
-
-echo "---"
-echo "llm-lora-mp-gpus"
-echo "================"
-time (
-    $BASE/venv/torch/bin/tune run --nnodes=1 --rdzv-backend=static --rdzv-endpoint=127.0.0.1:29400 --master-addr=127.0.0.1 --master-port=29400 --local-ranks-filter=0 --nproc-per-node=8 -- $SRC/milabench/benchmarks/llm/recipes/lora_finetune_distributed.py --config $SRC/milabench/benchmarks/llm/configs/llama3_70B_lora.yaml epochs=1 output_dir=$BASE/extra/llm-lora-mp-gpus/output tokenizer.path=$BASE/data/llama3_70B/original/tokenizer.model checkpointer.checkpoint_dir=$BASE/data/llama3_70B checkpointer.output_dir=$BASE/data/llama3_70B/ safetensors=true metric_logger.log_dir=$BASE/extra/llm-lora-mp-gpus/metrics repo_id="meta-llama/Meta-Llama-3.1-70B" batch_size=8 gradient_accumulation_steps=1 &
-    wait
-)
-
-echo "---"
-echo "llm-full-mp-gpus"
-echo "================"
-time (
-    $BASE/venv/torch/bin/tune run --nnodes=1 --rdzv-backend=static --rdzv-endpoint=127.0.0.1:29400 --master-addr=127.0.0.1 --master-port=29400 --local-ranks-filter=0 --nproc-per-node=8 -- $SRC/milabench/benchmarks/llm/recipes/full_finetune_distributed.py --config $SRC/milabench/benchmarks/llm/configs/llama3_70B_full.yaml epochs=1 output_dir=$BASE/extra/llm-full-mp-gpus/output tokenizer.path=$BASE/data/llama3_70B/original/tokenizer.model checkpointer.checkpoint_dir=$BASE/data/llama3_70B checkpointer.output_dir=$BASE/data/llama3_70B/ metric_logger.log_dir=$BASE/extra/llm-full-mp-gpus/metrics repo_id="meta-llama/Meta-Llama-3.1-70B" safetensors=true batch_size=2 gradient_accumulation_steps=1 &
-    wait
-)
-
-echo "---"
-echo "llm-full-mp-nodes"
-echo "================="
-time (
-    $BASE/venv/torch/bin/tune run --nnodes=1 --rdzv-backend=static --rdzv-endpoint=127.0.0.1:29400 --master-addr=127.0.0.1 --master-port=29400 --local-ranks-filter=0 --nproc-per-node=8 -- $SRC/milabench/benchmarks/llm/recipes/full_finetune_distributed.py --config $SRC/milabench/benchmarks/llm/configs/llama3_70B_full.yaml epochs=1 output_dir=$BASE/extra/llm-full-mp-nodes/output tokenizer.path=$BASE/data/llama3_70B/original/tokenizer.model checkpointer.checkpoint_dir=$BASE/data/llama3_70B checkpointer.output_dir=$BASE/data/llama3_70B/ metric_logger.log_dir=$BASE/extra/llm-full-mp-nodes/metrics repo_id="meta-llama/Meta-Llama-3.1-70B" safetensors=true batch_size=2 gradient_accumulation_steps=1 &
-    wait
-)
-
-echo "---"
-echo "dqn"
-echo "==="
-time (
-    CUDA_VISIBLE_DEVICES=0 python $SRC/milabench/benchmarks/purejaxrl/main.py dqn --num_envs 128 --buffer_batch_size 128 --env_name CartPole-v1 --training_interval 10 &
-    CUDA_VISIBLE_DEVICES=1 python $SRC/milabench/benchmarks/purejaxrl/main.py dqn --num_envs 128 --buffer_batch_size 128 --env_name CartPole-v1 --training_interval 10 &
-    CUDA_VISIBLE_DEVICES=2 python $SRC/milabench/benchmarks/purejaxrl/main.py dqn --num_envs 128 --buffer_batch_size 128 --env_name CartPole-v1 --training_interval 10 &
-    CUDA_VISIBLE_DEVICES=3 python $SRC/milabench/benchmarks/purejaxrl/main.py dqn --num_envs 128 --buffer_batch_size 128 --env_name CartPole-v1 --training_interval 10 &
-    CUDA_VISIBLE_DEVICES=4 python $SRC/milabench/benchmarks/purejaxrl/main.py dqn --num_envs 128 --buffer_batch_size 128 --env_name CartPole-v1 --training_interval 10 &
-    CUDA_VISIBLE_DEVICES=5 python $SRC/milabench/benchmarks/purejaxrl/main.py dqn --num_envs 128 --buffer_batch_size 128 --env_name CartPole-v1 --training_interval 10 &
-    CUDA_VISIBLE_DEVICES=6 python $SRC/milabench/benchmarks/purejaxrl/main.py dqn --num_envs 128 --buffer_batch_size 128 --env_name CartPole-v1 --training_interval 10 &
-    CUDA_VISIBLE_DEVICES=7 python $SRC/milabench/benchmarks/purejaxrl/main.py dqn --num_envs 128 --buffer_batch_size 128 --env_name CartPole-v1 --training_interval 10 &
-    wait
-)
-
-echo "---"
-echo "ppo"
-echo "==="
-time (
-    CUDA_VISIBLE_DEVICES=0 python $SRC/milabench/benchmarks/purejaxrl/main.py ppo --num_envs 128 --num_steps 10 --num_minibatches 32 --update_epochs 4 --env_name hopper --total_timesteps 200000 &
-    CUDA_VISIBLE_DEVICES=1 python $SRC/milabench/benchmarks/purejaxrl/main.py ppo --num_envs 128 --num_steps 10 --num_minibatches 32 --update_epochs 4 --env_name hopper --total_timesteps 200000 &
-    CUDA_VISIBLE_DEVICES=2 python $SRC/milabench/benchmarks/purejaxrl/main.py ppo --num_envs 128 --num_steps 10 --num_minibatches 32 --update_epochs 4 --env_name hopper --total_timesteps 200000 &
-    CUDA_VISIBLE_DEVICES=3 python $SRC/milabench/benchmarks/purejaxrl/main.py ppo --num_envs 128 --num_steps 10 --num_minibatches 32 --update_epochs 4 --env_name hopper --total_timesteps 200000 &
-    CUDA_VISIBLE_DEVICES=4 python $SRC/milabench/benchmarks/purejaxrl/main.py ppo --num_envs 128 --num_steps 10 --num_minibatches 32 --update_epochs 4 --env_name hopper --total_timesteps 200000 &
-    CUDA_VISIBLE_DEVICES=5 python $SRC/milabench/benchmarks/purejaxrl/main.py ppo --num_envs 128 --num_steps 10 --num_minibatches 32 --update_epochs 4 --env_name hopper --total_timesteps 200000 &
-    CUDA_VISIBLE_DEVICES=6 python $SRC/milabench/benchmarks/purejaxrl/main.py ppo --num_envs 128 --num_steps 10 --num_minibatches 32 --update_epochs 4 --env_name hopper --total_timesteps 200000 &
-    CUDA_VISIBLE_DEVICES=7 python $SRC/milabench/benchmarks/purejaxrl/main.py ppo --num_envs 128 --num_steps 10 --num_minibatches 32 --update_epochs 4 --env_name hopper --total_timesteps 200000 &
-    wait
-)
-
-echo "---"
-echo "dimenet"
-echo "======="
-time (
-    CUDA_VISIBLE_DEVICES=0 python $SRC/milabench/benchmarks/geo_gnn/main.py --model DimeNet --num-samples 10000 --use3d &
-    CUDA_VISIBLE_DEVICES=1 python $SRC/milabench/benchmarks/geo_gnn/main.py --model DimeNet --num-samples 10000 --use3d &
-    CUDA_VISIBLE_DEVICES=2 python $SRC/milabench/benchmarks/geo_gnn/main.py --model DimeNet --num-samples 10000 --use3d &
-    CUDA_VISIBLE_DEVICES=3 python $SRC/milabench/benchmarks/geo_gnn/main.py --model DimeNet --num-samples 10000 --use3d &
-    CUDA_VISIBLE_DEVICES=4 python $SRC/milabench/benchmarks/geo_gnn/main.py --model DimeNet --num-samples 10000 --use3d &
-    CUDA_VISIBLE_DEVICES=5 python $SRC/milabench/benchmarks/geo_gnn/main.py --model DimeNet --num-samples 10000 --use3d &
-    CUDA_VISIBLE_DEVICES=6 python $SRC/milabench/benchmarks/geo_gnn/main.py --model DimeNet --num-samples 10000 --use3d &
-    CUDA_VISIBLE_DEVICES=7 python $SRC/milabench/benchmarks/geo_gnn/main.py --model DimeNet --num-samples 10000 --use3d &
-    wait
-)
-
-echo "---"
-echo "recursiongfn"
-echo "============"
-time (
-    CUDA_VISIBLE_DEVICES=0 python $SRC/milabench/benchmarks/recursiongfn/main.py --batch_size 128 --num_workers 8 --num_steps 100 --layer_width 128 --num_layers 4 &
-    CUDA_VISIBLE_DEVICES=1 python $SRC/milabench/benchmarks/recursiongfn/main.py --batch_size 128 --num_workers 8 --num_steps 100 --layer_width 128 --num_layers 4 &
-    CUDA_VISIBLE_DEVICES=2 python $SRC/milabench/benchmarks/recursiongfn/main.py --batch_size 128 --num_workers 8 --num_steps 100 --layer_width 128 --num_layers 4 &
-    CUDA_VISIBLE_DEVICES=3 python $SRC/milabench/benchmarks/recursiongfn/main.py --batch_size 128 --num_workers 8 --num_steps 100 --layer_width 128 --num_layers 4 &
-    CUDA_VISIBLE_DEVICES=4 python $SRC/milabench/benchmarks/recursiongfn/main.py --batch_size 128 --num_workers 8 --num_steps 100 --layer_width 128 --num_layers 4 &
-    CUDA_VISIBLE_DEVICES=5 python $SRC/milabench/benchmarks/recursiongfn/main.py --batch_size 128 --num_workers 8 --num_steps 100 --layer_width 128 --num_layers 4 &
-    CUDA_VISIBLE_DEVICES=6 python $SRC/milabench/benchmarks/recursiongfn/main.py --batch_size 128 --num_workers 8 --num_steps 100 --layer_width 128 --num_layers 4 &
-    CUDA_VISIBLE_DEVICES=7 python $SRC/milabench/benchmarks/recursiongfn/main.py --batch_size 128 --num_workers 8 --num_steps 100 --layer_width 128 --num_layers 4 &
-    wait
-)
-
-echo "---"
-echo "torchatari"
-echo "=========="
-time (
-    CUDA_VISIBLE_DEVICES=0 python $SRC/milabench/benchmarks/torchatari/main.py --num-minibatches 16 --update-epochs 4 --num-steps 128 --num-envs 128 --total-timesteps 1000000 --env-id Breakout-v5 &
-    CUDA_VISIBLE_DEVICES=1 python $SRC/milabench/benchmarks/torchatari/main.py --num-minibatches 16 --update-epochs 4 --num-steps 128 --num-envs 128 --total-timesteps 1000000 --env-id Breakout-v5 &
-    CUDA_VISIBLE_DEVICES=2 python $SRC/milabench/benchmarks/torchatari/main.py --num-minibatches 16 --update-epochs 4 --num-steps 128 --num-envs 128 --total-timesteps 1000000 --env-id Breakout-v5 &
-    CUDA_VISIBLE_DEVICES=3 python $SRC/milabench/benchmarks/torchatari/main.py --num-minibatches 16 --update-epochs 4 --num-steps 128 --num-envs 128 --total-timesteps 1000000 --env-id Breakout-v5 &
-    CUDA_VISIBLE_DEVICES=4 python $SRC/milabench/benchmarks/torchatari/main.py --num-minibatches 16 --update-epochs 4 --num-steps 128 --num-envs 128 --total-timesteps 1000000 --env-id Breakout-v5 &
-    CUDA_VISIBLE_DEVICES=5 python $SRC/milabench/benchmarks/torchatari/main.py --num-minibatches 16 --update-epochs 4 --num-steps 128 --num-envs 128 --total-timesteps 1000000 --env-id Breakout-v5 &
-    CUDA_VISIBLE_DEVICES=6 python $SRC/milabench/benchmarks/torchatari/main.py --num-minibatches 16 --update-epochs 4 --num-steps 128 --num-envs 128 --total-timesteps 1000000 --env-id Breakout-v5 &
-    CUDA_VISIBLE_DEVICES=7 python $SRC/milabench/benchmarks/torchatari/main.py --num-minibatches 16 --update-epochs 4 --num-steps 128 --num-envs 128 --total-timesteps 1000000 --env-id Breakout-v5 &
-    wait
-)
-
-echo "---"
-echo "llava-single"
-echo "============"
-time (
-    CUDA_VISIBLE_DEVICES=0 $SRC/milabench/benchmarks/llava/main.py --batch_size 1 --num_workers 4 --gradient_accumulation_steps 1 &
-    CUDA_VISIBLE_DEVICES=1 $SRC/milabench/benchmarks/llava/main.py --batch_size 1 --num_workers 4 --gradient_accumulation_steps 1 &
-    CUDA_VISIBLE_DEVICES=2 $SRC/milabench/benchmarks/llava/main.py --batch_size 1 --num_workers 4 --gradient_accumulation_steps 1 &
-    CUDA_VISIBLE_DEVICES=3 $SRC/milabench/benchmarks/llava/main.py --batch_size 1 --num_workers 4 --gradient_accumulation_steps 1 &
-    CUDA_VISIBLE_DEVICES=4 $SRC/milabench/benchmarks/llava/main.py --batch_size 1 --num_workers 4 --gradient_accumulation_steps 1 &
-    CUDA_VISIBLE_DEVICES=5 $SRC/milabench/benchmarks/llava/main.py --batch_size 1 --num_workers 4 --gradient_accumulation_steps 1 &
-    CUDA_VISIBLE_DEVICES=6 $SRC/milabench/benchmarks/llava/main.py --batch_size 1 --num_workers 4 --gradient_accumulation_steps 1 &
-    CUDA_VISIBLE_DEVICES=7 $SRC/milabench/benchmarks/llava/main.py --batch_size 1 --num_workers 4 --gradient_accumulation_steps 1 &
-    wait
-)
-
-echo "---"
-echo "rlhf-single"
-echo "==========="
-time (
-    CUDA_VISIBLE_DEVICES=0 $SRC/milabench/benchmarks/rlhf/main.py --output_dir $BASE/extra/rlhf-single/output --model_name_or_path EleutherAI/pythia-1b-deduped --per_device_train_batch_size 64 --logging_strategy no --log_level critical --bf16 &
-    CUDA_VISIBLE_DEVICES=1 $SRC/milabench/benchmarks/rlhf/main.py --output_dir $BASE/extra/rlhf-single/output --model_name_or_path EleutherAI/pythia-1b-deduped --per_device_train_batch_size 64 --logging_strategy no --log_level critical --bf16 &
-    CUDA_VISIBLE_DEVICES=2 $SRC/milabench/benchmarks/rlhf/main.py --output_dir $BASE/extra/rlhf-single/output --model_name_or_path EleutherAI/pythia-1b-deduped --per_device_train_batch_size 64 --logging_strategy no --log_level critical --bf16 &
-    CUDA_VISIBLE_DEVICES=3 $SRC/milabench/benchmarks/rlhf/main.py --output_dir $BASE/extra/rlhf-single/output --model_name_or_path EleutherAI/pythia-1b-deduped --per_device_train_batch_size 64 --logging_strategy no --log_level critical --bf16 &
-    CUDA_VISIBLE_DEVICES=4 $SRC/milabench/benchmarks/rlhf/main.py --output_dir $BASE/extra/rlhf-single/output --model_name_or_path EleutherAI/pythia-1b-deduped --per_device_train_batch_size 64 --logging_strategy no --log_level critical --bf16 &
-    CUDA_VISIBLE_DEVICES=5 $SRC/milabench/benchmarks/rlhf/main.py --output_dir $BASE/extra/rlhf-single/output --model_name_or_path EleutherAI/pythia-1b-deduped --per_device_train_batch_size 64 --logging_strategy no --log_level critical --bf16 &
-    CUDA_VISIBLE_DEVICES=6 $SRC/milabench/benchmarks/rlhf/main.py --output_dir $BASE/extra/rlhf-single/output --model_name_or_path EleutherAI/pythia-1b-deduped --per_device_train_batch_size 64 --logging_strategy no --log_level critical --bf16 &
-    CUDA_VISIBLE_DEVICES=7 $SRC/milabench/benchmarks/rlhf/main.py --output_dir $BASE/extra/rlhf-single/output --model_name_or_path EleutherAI/pythia-1b-deduped --per_device_train_batch_size 64 --logging_strategy no --log_level critical --bf16 &
-    wait
-)
-
-echo "---"
-echo "rlhf-gpus"
-echo "========="
-time (
-    $SRC/milabench/milabench/scripts/activator $BASE/venv/torch $BASE/cache accelerate launch --mixed_precision=bf16 --dynamo_backend=no --machine_rank=0 --num_machines=1 --multi_gpu --gradient_accumulation_steps=1 --num_cpu_threads_per_process=4 --main_process_ip=127.0.0.1 --main_process_port=29400 --num_processes=8 $SRC/milabench/benchmarks/rlhf/main.py --output_dir $BASE/extra/rlhf-gpus/output --model_name_or_path EleutherAI/pythia-1b-deduped --per_device_train_batch_size 64 --logging_strategy no --log_level critical --bf16 &
-    wait
-)
-
-echo "---"
-echo "vjepa-single"
-echo "============"
-time (
-    CUDA_VISIBLE_DEVICES=0 python $SRC/milabench/benchmarks/vjepa/main.py --batch_size 24 --num_workers 12 --dataset $BASE/data/FakeVideo/video_metainfo.csv --output $BASE/extra/vjepa-single &
-    CUDA_VISIBLE_DEVICES=1 python $SRC/milabench/benchmarks/vjepa/main.py --batch_size 24 --num_workers 12 --dataset $BASE/data/FakeVideo/video_metainfo.csv --output $BASE/extra/vjepa-single &
-    CUDA_VISIBLE_DEVICES=2 python $SRC/milabench/benchmarks/vjepa/main.py --batch_size 24 --num_workers 12 --dataset $BASE/data/FakeVideo/video_metainfo.csv --output $BASE/extra/vjepa-single &
-    CUDA_VISIBLE_DEVICES=3 python $SRC/milabench/benchmarks/vjepa/main.py --batch_size 24 --num_workers 12 --dataset $BASE/data/FakeVideo/video_metainfo.csv --output $BASE/extra/vjepa-single &
-    CUDA_VISIBLE_DEVICES=4 python $SRC/milabench/benchmarks/vjepa/main.py --batch_size 24 --num_workers 12 --dataset $BASE/data/FakeVideo/video_metainfo.csv --output $BASE/extra/vjepa-single &
-    CUDA_VISIBLE_DEVICES=5 python $SRC/milabench/benchmarks/vjepa/main.py --batch_size 24 --num_workers 12 --dataset $BASE/data/FakeVideo/video_metainfo.csv --output $BASE/extra/vjepa-single &
-    CUDA_VISIBLE_DEVICES=6 python $SRC/milabench/benchmarks/vjepa/main.py --batch_size 24 --num_workers 12 --dataset $BASE/data/FakeVideo/video_metainfo.csv --output $BASE/extra/vjepa-single &
-    CUDA_VISIBLE_DEVICES=7 python $SRC/milabench/benchmarks/vjepa/main.py --batch_size 24 --num_workers 12 --dataset $BASE/data/FakeVideo/video_metainfo.csv --output $BASE/extra/vjepa-single &
-    wait
-)
-
-echo "---"
-echo "vjepa-gpus"
-echo "=========="
-time (
-    $BASE/venv/torch/bin/benchrun --nnodes=1 --rdzv-backend=static --rdzv-endpoint=127.0.0.1:29400 --master-addr=127.0.0.1 --master-port=29400 --local-ranks-filter=0 --nproc-per-node=8 --no-python -- python $SRC/milabench/benchmarks/vjepa/main.py --batch_size 24 --num_workers 12 --dataset $BASE/data/FakeVideo/video_metainfo.csv --output $BASE/extra/vjepa-gpus &
-    wait
-)
-
diff --git a/tests/test_command_reg/test_command_reg_two_nodes.txt b/tests/test_command_reg/test_command_reg_two_nodes.txt
deleted file mode 100644
index 3004505d..00000000
--- a/tests/test_command_reg/test_command_reg_two_nodes.txt
+++ /dev/null
@@ -1,607 +0,0 @@
-#!/bin/sh
-
-echo "---"
-echo "Virtual Env"
-echo "==========="
-export VIRTUAL_ENV=$BASE/venv/torch
-
-source $VIRTUAL_ENV/bin/activate
-echo "---"
-echo "Milabench"
-echo "========="
-export MILABENCH_DIR_BASE=$BASE
-export MILABENCH_DIR_VENV=$BASE/venv/torch
-export MILABENCH_DIR_DATA=$BASE/data
-export MILABENCH_DIR_RUNS=$BASE/runs
-export MILABENCH_DIR_EXTRA=$BASE/extra/llm
-export MILABENCH_DIR_CACHE=$BASE/cache
-export OMP_NUM_THREADS=0
-export MILABENCH_CONFIG='{"system": {"arch": "cuda", "sshkey": null, "nodes": [{"ip": "127.0.0.1", "main": true, "name": "0", "sshport": 22, "user": "username", "hostname": "127.0.0.1"}, {"ip": "192.168.0.11", "main": false, "name": "1", "sshport": 22, "user": "username", "hostname": "192.168.0.11"}], "self": {"ip": "127.0.0.1", "main": true, "name": "0", "sshport": 22, "user": "username", "hostname": "127.0.0.1"}}, "dirs": {"base": "$BASE", "venv": "$BASE/venv/torch", "data": "$BASE/data", "runs": "$BASE/runs", "extra": "$BASE/extra/llm", "cache": "$BASE/cache"}, "group": "llm", "install_group": "torch", "install_variant": "cuda", "run_name": "dev", "enabled": true, "capabilities": {"nodes": 1}, "max_duration": 800, "voir": {"options": {"stop": 30, "interval": "1s"}}, "validation": {"usage": {"gpu_load_threshold": 0.5, "gpu_mem_threshold": 0.5}}, "config_base": "$SRC/milabench/config", "config_file": "$SRC/milabench/config/standard.yaml", "definition": "$SRC/milabench/benchmarks/llama", "tags": ["inference", "llm", "monogpu", "nlp", "nobatch"], "plan": {"method": "per_gpu"}, "weight": 1.0, "name": "llama", "tag": ["llama"]}'
-
-echo "---"
-echo "llama"
-echo "====="
-time (
-    CUDA_VISIBLE_DEVICES=0 python $SRC/milabench/benchmarks/llama/main.py --cache $BASE/cache &
-    CUDA_VISIBLE_DEVICES=1 python $SRC/milabench/benchmarks/llama/main.py --cache $BASE/cache &
-    CUDA_VISIBLE_DEVICES=2 python $SRC/milabench/benchmarks/llama/main.py --cache $BASE/cache &
-    CUDA_VISIBLE_DEVICES=3 python $SRC/milabench/benchmarks/llama/main.py --cache $BASE/cache &
-    CUDA_VISIBLE_DEVICES=4 python $SRC/milabench/benchmarks/llama/main.py --cache $BASE/cache &
-    CUDA_VISIBLE_DEVICES=5 python $SRC/milabench/benchmarks/llama/main.py --cache $BASE/cache &
-    CUDA_VISIBLE_DEVICES=6 python $SRC/milabench/benchmarks/llama/main.py --cache $BASE/cache &
-    CUDA_VISIBLE_DEVICES=7 python $SRC/milabench/benchmarks/llama/main.py --cache $BASE/cache &
-    wait
-)
-
-echo "---"
-echo "fp16"
-echo "===="
-time (
-    CUDA_VISIBLE_DEVICES=0 $SRC/milabench/milabench/scripts/activator $BASE/venv/torch $BASE/cache $SRC/milabench/benchmarks/flops/main.py --number 30 --repeat 90 --m 8192 --n 8192 --dtype fp16 &
-    CUDA_VISIBLE_DEVICES=1 $SRC/milabench/milabench/scripts/activator $BASE/venv/torch $BASE/cache $SRC/milabench/benchmarks/flops/main.py --number 30 --repeat 90 --m 8192 --n 8192 --dtype fp16 &
-    CUDA_VISIBLE_DEVICES=2 $SRC/milabench/milabench/scripts/activator $BASE/venv/torch $BASE/cache $SRC/milabench/benchmarks/flops/main.py --number 30 --repeat 90 --m 8192 --n 8192 --dtype fp16 &
-    CUDA_VISIBLE_DEVICES=3 $SRC/milabench/milabench/scripts/activator $BASE/venv/torch $BASE/cache $SRC/milabench/benchmarks/flops/main.py --number 30 --repeat 90 --m 8192 --n 8192 --dtype fp16 &
-    CUDA_VISIBLE_DEVICES=4 $SRC/milabench/milabench/scripts/activator $BASE/venv/torch $BASE/cache $SRC/milabench/benchmarks/flops/main.py --number 30 --repeat 90 --m 8192 --n 8192 --dtype fp16 &
-    CUDA_VISIBLE_DEVICES=5 $SRC/milabench/milabench/scripts/activator $BASE/venv/torch $BASE/cache $SRC/milabench/benchmarks/flops/main.py --number 30 --repeat 90 --m 8192 --n 8192 --dtype fp16 &
-    CUDA_VISIBLE_DEVICES=6 $SRC/milabench/milabench/scripts/activator $BASE/venv/torch $BASE/cache $SRC/milabench/benchmarks/flops/main.py --number 30 --repeat 90 --m 8192 --n 8192 --dtype fp16 &
-    CUDA_VISIBLE_DEVICES=7 $SRC/milabench/milabench/scripts/activator $BASE/venv/torch $BASE/cache $SRC/milabench/benchmarks/flops/main.py --number 30 --repeat 90 --m 8192 --n 8192 --dtype fp16 &
-    wait
-)
-
-echo "---"
-echo "bf16"
-echo "===="
-time (
-    CUDA_VISIBLE_DEVICES=0 $SRC/milabench/milabench/scripts/activator $BASE/venv/torch $BASE/cache $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype bf16 &
-    CUDA_VISIBLE_DEVICES=1 $SRC/milabench/milabench/scripts/activator $BASE/venv/torch $BASE/cache $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype bf16 &
-    CUDA_VISIBLE_DEVICES=2 $SRC/milabench/milabench/scripts/activator $BASE/venv/torch $BASE/cache $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype bf16 &
-    CUDA_VISIBLE_DEVICES=3 $SRC/milabench/milabench/scripts/activator $BASE/venv/torch $BASE/cache $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype bf16 &
-    CUDA_VISIBLE_DEVICES=4 $SRC/milabench/milabench/scripts/activator $BASE/venv/torch $BASE/cache $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype bf16 &
-    CUDA_VISIBLE_DEVICES=5 $SRC/milabench/milabench/scripts/activator $BASE/venv/torch $BASE/cache $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype bf16 &
-    CUDA_VISIBLE_DEVICES=6 $SRC/milabench/milabench/scripts/activator $BASE/venv/torch $BASE/cache $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype bf16 &
-    CUDA_VISIBLE_DEVICES=7 $SRC/milabench/milabench/scripts/activator $BASE/venv/torch $BASE/cache $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype bf16 &
-    wait
-)
-
-echo "---"
-echo "tf32"
-echo "===="
-time (
-    CUDA_VISIBLE_DEVICES=0 $SRC/milabench/milabench/scripts/activator $BASE/venv/torch $BASE/cache $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype fp32 --tf32 &
-    CUDA_VISIBLE_DEVICES=1 $SRC/milabench/milabench/scripts/activator $BASE/venv/torch $BASE/cache $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype fp32 --tf32 &
-    CUDA_VISIBLE_DEVICES=2 $SRC/milabench/milabench/scripts/activator $BASE/venv/torch $BASE/cache $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype fp32 --tf32 &
-    CUDA_VISIBLE_DEVICES=3 $SRC/milabench/milabench/scripts/activator $BASE/venv/torch $BASE/cache $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype fp32 --tf32 &
-    CUDA_VISIBLE_DEVICES=4 $SRC/milabench/milabench/scripts/activator $BASE/venv/torch $BASE/cache $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype fp32 --tf32 &
-    CUDA_VISIBLE_DEVICES=5 $SRC/milabench/milabench/scripts/activator $BASE/venv/torch $BASE/cache $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype fp32 --tf32 &
-    CUDA_VISIBLE_DEVICES=6 $SRC/milabench/milabench/scripts/activator $BASE/venv/torch $BASE/cache $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype fp32 --tf32 &
-    CUDA_VISIBLE_DEVICES=7 $SRC/milabench/milabench/scripts/activator $BASE/venv/torch $BASE/cache $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype fp32 --tf32 &
-    wait
-)
-
-echo "---"
-echo "fp32"
-echo "===="
-time (
-    CUDA_VISIBLE_DEVICES=0 $SRC/milabench/milabench/scripts/activator $BASE/venv/torch $BASE/cache $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype fp32 &
-    CUDA_VISIBLE_DEVICES=1 $SRC/milabench/milabench/scripts/activator $BASE/venv/torch $BASE/cache $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype fp32 &
-    CUDA_VISIBLE_DEVICES=2 $SRC/milabench/milabench/scripts/activator $BASE/venv/torch $BASE/cache $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype fp32 &
-    CUDA_VISIBLE_DEVICES=3 $SRC/milabench/milabench/scripts/activator $BASE/venv/torch $BASE/cache $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype fp32 &
-    CUDA_VISIBLE_DEVICES=4 $SRC/milabench/milabench/scripts/activator $BASE/venv/torch $BASE/cache $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype fp32 &
-    CUDA_VISIBLE_DEVICES=5 $SRC/milabench/milabench/scripts/activator $BASE/venv/torch $BASE/cache $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype fp32 &
-    CUDA_VISIBLE_DEVICES=6 $SRC/milabench/milabench/scripts/activator $BASE/venv/torch $BASE/cache $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype fp32 &
-    CUDA_VISIBLE_DEVICES=7 $SRC/milabench/milabench/scripts/activator $BASE/venv/torch $BASE/cache $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype fp32 &
-    wait
-)
-
-echo "---"
-echo "resnet50"
-echo "========"
-time (
-    CUDA_VISIBLE_DEVICES=0 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model resnet50 --batch-size 256 &
-    CUDA_VISIBLE_DEVICES=1 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model resnet50 --batch-size 256 &
-    CUDA_VISIBLE_DEVICES=2 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model resnet50 --batch-size 256 &
-    CUDA_VISIBLE_DEVICES=3 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model resnet50 --batch-size 256 &
-    CUDA_VISIBLE_DEVICES=4 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model resnet50 --batch-size 256 &
-    CUDA_VISIBLE_DEVICES=5 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model resnet50 --batch-size 256 &
-    CUDA_VISIBLE_DEVICES=6 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model resnet50 --batch-size 256 &
-    CUDA_VISIBLE_DEVICES=7 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model resnet50 --batch-size 256 &
-    wait
-)
-
-echo "---"
-echo "resnet50-noio"
-echo "============="
-time (
-    CUDA_VISIBLE_DEVICES=0 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader synthetic_fixed --data $BASE/data/FakeImageNet --model resnet50 --batch-size 256 &
-    CUDA_VISIBLE_DEVICES=1 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader synthetic_fixed --data $BASE/data/FakeImageNet --model resnet50 --batch-size 256 &
-    CUDA_VISIBLE_DEVICES=2 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader synthetic_fixed --data $BASE/data/FakeImageNet --model resnet50 --batch-size 256 &
-    CUDA_VISIBLE_DEVICES=3 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader synthetic_fixed --data $BASE/data/FakeImageNet --model resnet50 --batch-size 256 &
-    CUDA_VISIBLE_DEVICES=4 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader synthetic_fixed --data $BASE/data/FakeImageNet --model resnet50 --batch-size 256 &
-    CUDA_VISIBLE_DEVICES=5 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader synthetic_fixed --data $BASE/data/FakeImageNet --model resnet50 --batch-size 256 &
-    CUDA_VISIBLE_DEVICES=6 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader synthetic_fixed --data $BASE/data/FakeImageNet --model resnet50 --batch-size 256 &
-    CUDA_VISIBLE_DEVICES=7 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader synthetic_fixed --data $BASE/data/FakeImageNet --model resnet50 --batch-size 256 &
-    wait
-)
-
-echo "---"
-echo "resnet152-ddp-gpus"
-echo "=================="
-time (
-    $SRC/milabench/milabench/scripts/activator $BASE/venv/torch $BASE/cache $SRC/milabench/benchmarks/torchvision_ddp/main.py --epochs 10 --num-workers 8 --loader torch --data $BASE/data/FakeImageNet --model resnet152 --batch-size 256 &
-    wait
-)
-
-echo "---"
-echo "convnext_large-fp32"
-echo "==================="
-time (
-    CUDA_VISIBLE_DEVICES=0 python $SRC/milabench/benchmarks/torchvision/main.py --precision fp32 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 &
-    CUDA_VISIBLE_DEVICES=1 python $SRC/milabench/benchmarks/torchvision/main.py --precision fp32 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 &
-    CUDA_VISIBLE_DEVICES=2 python $SRC/milabench/benchmarks/torchvision/main.py --precision fp32 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 &
-    CUDA_VISIBLE_DEVICES=3 python $SRC/milabench/benchmarks/torchvision/main.py --precision fp32 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 &
-    CUDA_VISIBLE_DEVICES=4 python $SRC/milabench/benchmarks/torchvision/main.py --precision fp32 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 &
-    CUDA_VISIBLE_DEVICES=5 python $SRC/milabench/benchmarks/torchvision/main.py --precision fp32 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 &
-    CUDA_VISIBLE_DEVICES=6 python $SRC/milabench/benchmarks/torchvision/main.py --precision fp32 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 &
-    CUDA_VISIBLE_DEVICES=7 python $SRC/milabench/benchmarks/torchvision/main.py --precision fp32 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 &
-    wait
-)
-
-echo "---"
-echo "convnext_large-fp16"
-echo "==================="
-time (
-    CUDA_VISIBLE_DEVICES=0 python $SRC/milabench/benchmarks/torchvision/main.py --precision fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 &
-    CUDA_VISIBLE_DEVICES=1 python $SRC/milabench/benchmarks/torchvision/main.py --precision fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 &
-    CUDA_VISIBLE_DEVICES=2 python $SRC/milabench/benchmarks/torchvision/main.py --precision fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 &
-    CUDA_VISIBLE_DEVICES=3 python $SRC/milabench/benchmarks/torchvision/main.py --precision fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128
& - CUDA_VISIBLE_DEVICES=4 python $SRC/milabench/benchmarks/torchvision/main.py --precision fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 & - CUDA_VISIBLE_DEVICES=5 python $SRC/milabench/benchmarks/torchvision/main.py --precision fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 & - CUDA_VISIBLE_DEVICES=6 python $SRC/milabench/benchmarks/torchvision/main.py --precision fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 & - CUDA_VISIBLE_DEVICES=7 python $SRC/milabench/benchmarks/torchvision/main.py --precision fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 & - wait -) - -echo "---" -echo "convnext_large-tf32" -echo "===================" -time ( - CUDA_VISIBLE_DEVICES=0 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 & - CUDA_VISIBLE_DEVICES=1 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 & - CUDA_VISIBLE_DEVICES=2 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 & - CUDA_VISIBLE_DEVICES=3 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 & - CUDA_VISIBLE_DEVICES=4 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 & - CUDA_VISIBLE_DEVICES=5 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 & - CUDA_VISIBLE_DEVICES=6 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 & - CUDA_VISIBLE_DEVICES=7 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 & - wait -) - -echo "---" -echo "convnext_large-tf32-fp16" -echo "========================" -time ( - CUDA_VISIBLE_DEVICES=0 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 & - CUDA_VISIBLE_DEVICES=1 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 & - CUDA_VISIBLE_DEVICES=2 python 
$SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 & - CUDA_VISIBLE_DEVICES=3 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 & - CUDA_VISIBLE_DEVICES=4 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 & - CUDA_VISIBLE_DEVICES=5 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 & - CUDA_VISIBLE_DEVICES=6 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 & - CUDA_VISIBLE_DEVICES=7 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 & - wait -) - -echo "---" -echo "regnet_y_128gf" -echo "==============" -time ( - CUDA_VISIBLE_DEVICES=0 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model regnet_y_128gf --batch-size 64 & - CUDA_VISIBLE_DEVICES=1 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model regnet_y_128gf --batch-size 64 & - CUDA_VISIBLE_DEVICES=2 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model regnet_y_128gf --batch-size 64 & - CUDA_VISIBLE_DEVICES=3 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model regnet_y_128gf --batch-size 64 & - CUDA_VISIBLE_DEVICES=4 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model regnet_y_128gf --batch-size 64 & - CUDA_VISIBLE_DEVICES=5 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model regnet_y_128gf --batch-size 64 & - CUDA_VISIBLE_DEVICES=6 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model regnet_y_128gf --batch-size 64 & - CUDA_VISIBLE_DEVICES=7 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model regnet_y_128gf --batch-size 64 & - wait -) - -echo "---" -echo "bert-fp32" -echo "=========" -time ( - CUDA_VISIBLE_DEVICES=0 python -m bench --precision fp32 --num-workers 8 --model Bert 
--batch-size 32 & - CUDA_VISIBLE_DEVICES=1 python -m bench --precision fp32 --num-workers 8 --model Bert --batch-size 32 & - CUDA_VISIBLE_DEVICES=2 python -m bench --precision fp32 --num-workers 8 --model Bert --batch-size 32 & - CUDA_VISIBLE_DEVICES=3 python -m bench --precision fp32 --num-workers 8 --model Bert --batch-size 32 & - CUDA_VISIBLE_DEVICES=4 python -m bench --precision fp32 --num-workers 8 --model Bert --batch-size 32 & - CUDA_VISIBLE_DEVICES=5 python -m bench --precision fp32 --num-workers 8 --model Bert --batch-size 32 & - CUDA_VISIBLE_DEVICES=6 python -m bench --precision fp32 --num-workers 8 --model Bert --batch-size 32 & - CUDA_VISIBLE_DEVICES=7 python -m bench --precision fp32 --num-workers 8 --model Bert --batch-size 32 & - wait -) - -echo "---" -echo "bert-fp16" -echo "=========" -time ( - CUDA_VISIBLE_DEVICES=0 python -m bench --precision fp16 --num-workers 8 --model Bert --batch-size 32 & - CUDA_VISIBLE_DEVICES=1 python -m bench --precision fp16 --num-workers 8 --model Bert --batch-size 32 & - CUDA_VISIBLE_DEVICES=2 python -m bench --precision fp16 --num-workers 8 --model Bert --batch-size 32 & - CUDA_VISIBLE_DEVICES=3 python -m bench --precision fp16 --num-workers 8 --model Bert --batch-size 32 & - CUDA_VISIBLE_DEVICES=4 python -m bench --precision fp16 --num-workers 8 --model Bert --batch-size 32 & - CUDA_VISIBLE_DEVICES=5 python -m bench --precision fp16 --num-workers 8 --model Bert --batch-size 32 & - CUDA_VISIBLE_DEVICES=6 python -m bench --precision fp16 --num-workers 8 --model Bert --batch-size 32 & - CUDA_VISIBLE_DEVICES=7 python -m bench --precision fp16 --num-workers 8 --model Bert --batch-size 32 & - wait -) - -echo "---" -echo "bert-tf32" -echo "=========" -time ( - CUDA_VISIBLE_DEVICES=0 python -m bench --precision tf32 --num-workers 8 --model Bert --batch-size 32 & - CUDA_VISIBLE_DEVICES=1 python -m bench --precision tf32 --num-workers 8 --model Bert --batch-size 32 & - CUDA_VISIBLE_DEVICES=2 python -m bench --precision tf32 --num-workers 8 --model Bert --batch-size 32 & - CUDA_VISIBLE_DEVICES=3 python -m bench --precision tf32 --num-workers 8 --model Bert --batch-size 32 & - CUDA_VISIBLE_DEVICES=4 python -m bench --precision tf32 --num-workers 8 --model Bert --batch-size 32 & - CUDA_VISIBLE_DEVICES=5 python -m bench --precision tf32 --num-workers 8 --model Bert --batch-size 32 & - CUDA_VISIBLE_DEVICES=6 python -m bench --precision tf32 --num-workers 8 --model Bert --batch-size 32 & - CUDA_VISIBLE_DEVICES=7 python -m bench --precision tf32 --num-workers 8 --model Bert --batch-size 32 & - wait -) - -echo "---" -echo "bert-tf32-fp16" -echo "==============" -time ( - CUDA_VISIBLE_DEVICES=0 python -m bench --precision tf32-fp16 --num-workers 8 --model Bert --batch-size 32 & - CUDA_VISIBLE_DEVICES=1 python -m bench --precision tf32-fp16 --num-workers 8 --model Bert --batch-size 32 & - CUDA_VISIBLE_DEVICES=2 python -m bench --precision tf32-fp16 --num-workers 8 --model Bert --batch-size 32 & - CUDA_VISIBLE_DEVICES=3 python -m bench --precision tf32-fp16 --num-workers 8 --model Bert --batch-size 32 & - CUDA_VISIBLE_DEVICES=4 python -m bench --precision tf32-fp16 --num-workers 8 --model Bert --batch-size 32 & - CUDA_VISIBLE_DEVICES=5 python -m bench --precision tf32-fp16 --num-workers 8 --model Bert --batch-size 32 & - CUDA_VISIBLE_DEVICES=6 python -m bench --precision tf32-fp16 --num-workers 8 --model Bert --batch-size 32 & - CUDA_VISIBLE_DEVICES=7 python -m bench --precision tf32-fp16 --num-workers 8 --model Bert --batch-size 32 & - wait -) - -echo "---" 
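Every benchmark block in this removed script follows the same fan-out pattern: pin one copy of the workload to each GPU via CUDA_VISIBLE_DEVICES, launch it in the background, then wait for the slowest process before the timing ends. As a minimal sketch (assuming an 8-GPU node, and reusing the same invocation as the unrolled t5 block below), the pattern is:

    # One benchmark process per GPU; `wait` blocks until all of them exit,
    # so `time` reports the duration of the slowest device.
    time (
        for gpu in 0 1 2 3 4 5 6 7; do
            CUDA_VISIBLE_DEVICES=$gpu python -m bench --precision tf32-fp16 --num-workers 8 --model T5 --batch-size 16 &
        done
        wait
    )

The script unrolls this loop for every entry, which keeps each device's command editable in isolation at the cost of repetition.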
-echo "t5" -echo "==" -time ( - CUDA_VISIBLE_DEVICES=0 python -m bench --precision tf32-fp16 --num-workers 8 --model T5 --batch-size 16 & - CUDA_VISIBLE_DEVICES=1 python -m bench --precision tf32-fp16 --num-workers 8 --model T5 --batch-size 16 & - CUDA_VISIBLE_DEVICES=2 python -m bench --precision tf32-fp16 --num-workers 8 --model T5 --batch-size 16 & - CUDA_VISIBLE_DEVICES=3 python -m bench --precision tf32-fp16 --num-workers 8 --model T5 --batch-size 16 & - CUDA_VISIBLE_DEVICES=4 python -m bench --precision tf32-fp16 --num-workers 8 --model T5 --batch-size 16 & - CUDA_VISIBLE_DEVICES=5 python -m bench --precision tf32-fp16 --num-workers 8 --model T5 --batch-size 16 & - CUDA_VISIBLE_DEVICES=6 python -m bench --precision tf32-fp16 --num-workers 8 --model T5 --batch-size 16 & - CUDA_VISIBLE_DEVICES=7 python -m bench --precision tf32-fp16 --num-workers 8 --model T5 --batch-size 16 & - wait -) - -echo "---" -echo "reformer" -echo "========" -time ( - CUDA_VISIBLE_DEVICES=0 python -m bench --precision tf32-fp16 --num-workers 8 --model Reformer --batch-size 64 & - CUDA_VISIBLE_DEVICES=1 python -m bench --precision tf32-fp16 --num-workers 8 --model Reformer --batch-size 64 & - CUDA_VISIBLE_DEVICES=2 python -m bench --precision tf32-fp16 --num-workers 8 --model Reformer --batch-size 64 & - CUDA_VISIBLE_DEVICES=3 python -m bench --precision tf32-fp16 --num-workers 8 --model Reformer --batch-size 64 & - CUDA_VISIBLE_DEVICES=4 python -m bench --precision tf32-fp16 --num-workers 8 --model Reformer --batch-size 64 & - CUDA_VISIBLE_DEVICES=5 python -m bench --precision tf32-fp16 --num-workers 8 --model Reformer --batch-size 64 & - CUDA_VISIBLE_DEVICES=6 python -m bench --precision tf32-fp16 --num-workers 8 --model Reformer --batch-size 64 & - CUDA_VISIBLE_DEVICES=7 python -m bench --precision tf32-fp16 --num-workers 8 --model Reformer --batch-size 64 & - wait -) - -echo "---" -echo "whisper" -echo "=======" -time ( - CUDA_VISIBLE_DEVICES=0 python -m bench --precision tf32-fp16 --num-workers 8 --model Whisper --batch-size 64 & - CUDA_VISIBLE_DEVICES=1 python -m bench --precision tf32-fp16 --num-workers 8 --model Whisper --batch-size 64 & - CUDA_VISIBLE_DEVICES=2 python -m bench --precision tf32-fp16 --num-workers 8 --model Whisper --batch-size 64 & - CUDA_VISIBLE_DEVICES=3 python -m bench --precision tf32-fp16 --num-workers 8 --model Whisper --batch-size 64 & - CUDA_VISIBLE_DEVICES=4 python -m bench --precision tf32-fp16 --num-workers 8 --model Whisper --batch-size 64 & - CUDA_VISIBLE_DEVICES=5 python -m bench --precision tf32-fp16 --num-workers 8 --model Whisper --batch-size 64 & - CUDA_VISIBLE_DEVICES=6 python -m bench --precision tf32-fp16 --num-workers 8 --model Whisper --batch-size 64 & - CUDA_VISIBLE_DEVICES=7 python -m bench --precision tf32-fp16 --num-workers 8 --model Whisper --batch-size 64 & - wait -) - -echo "---" -echo "focalnet" -echo "========" -time ( - CUDA_VISIBLE_DEVICES=0 python $SRC/milabench/benchmarks/timm/pytorch-image-models/train.py --amp --amp-dtype bfloat16 --device cuda --val-split --data-dir $BASE/data --dataset FakeImageNet --workers 8 --model focalnet_base_lrf --output $BASE/extra/timm/dev/focalnet.D0 --checkpoint-hist 1 & - CUDA_VISIBLE_DEVICES=1 python $SRC/milabench/benchmarks/timm/pytorch-image-models/train.py --amp --amp-dtype bfloat16 --device cuda --val-split --data-dir $BASE/data --dataset FakeImageNet --workers 8 --model focalnet_base_lrf --output $BASE/extra/timm/dev/focalnet.D1 --checkpoint-hist 1 & - CUDA_VISIBLE_DEVICES=2 python 
$SRC/milabench/benchmarks/timm/pytorch-image-models/train.py --amp --amp-dtype bfloat16 --device cuda --val-split --data-dir $BASE/data --dataset FakeImageNet --workers 8 --model focalnet_base_lrf --output $BASE/extra/timm/dev/focalnet.D2 --checkpoint-hist 1 & - CUDA_VISIBLE_DEVICES=3 python $SRC/milabench/benchmarks/timm/pytorch-image-models/train.py --amp --amp-dtype bfloat16 --device cuda --val-split --data-dir $BASE/data --dataset FakeImageNet --workers 8 --model focalnet_base_lrf --output $BASE/extra/timm/dev/focalnet.D3 --checkpoint-hist 1 & - CUDA_VISIBLE_DEVICES=4 python $SRC/milabench/benchmarks/timm/pytorch-image-models/train.py --amp --amp-dtype bfloat16 --device cuda --val-split --data-dir $BASE/data --dataset FakeImageNet --workers 8 --model focalnet_base_lrf --output $BASE/extra/timm/dev/focalnet.D4 --checkpoint-hist 1 & - CUDA_VISIBLE_DEVICES=5 python $SRC/milabench/benchmarks/timm/pytorch-image-models/train.py --amp --amp-dtype bfloat16 --device cuda --val-split --data-dir $BASE/data --dataset FakeImageNet --workers 8 --model focalnet_base_lrf --output $BASE/extra/timm/dev/focalnet.D5 --checkpoint-hist 1 & - CUDA_VISIBLE_DEVICES=6 python $SRC/milabench/benchmarks/timm/pytorch-image-models/train.py --amp --amp-dtype bfloat16 --device cuda --val-split --data-dir $BASE/data --dataset FakeImageNet --workers 8 --model focalnet_base_lrf --output $BASE/extra/timm/dev/focalnet.D6 --checkpoint-hist 1 & - CUDA_VISIBLE_DEVICES=7 python $SRC/milabench/benchmarks/timm/pytorch-image-models/train.py --amp --amp-dtype bfloat16 --device cuda --val-split --data-dir $BASE/data --dataset FakeImageNet --workers 8 --model focalnet_base_lrf --output $BASE/extra/timm/dev/focalnet.D7 --checkpoint-hist 1 & - wait -) - -echo "---" -echo "brax" -echo "====" -time ( - python $SRC/milabench/benchmarks/brax/main.py --episode-length 20 --batch-size 1024 --num-minibatches 32 --num-envs 8192 & - wait -) - -echo "---" -echo "diffusion-single" -echo "================" -time ( - CUDA_VISIBLE_DEVICES=0 $SRC/milabench/benchmarks/diffusion/main.py --num_epochs 5 --batch_size 32 --num_workers 8 --cache $BASE/cache & - CUDA_VISIBLE_DEVICES=1 $SRC/milabench/benchmarks/diffusion/main.py --num_epochs 5 --batch_size 32 --num_workers 8 --cache $BASE/cache & - CUDA_VISIBLE_DEVICES=2 $SRC/milabench/benchmarks/diffusion/main.py --num_epochs 5 --batch_size 32 --num_workers 8 --cache $BASE/cache & - CUDA_VISIBLE_DEVICES=3 $SRC/milabench/benchmarks/diffusion/main.py --num_epochs 5 --batch_size 32 --num_workers 8 --cache $BASE/cache & - CUDA_VISIBLE_DEVICES=4 $SRC/milabench/benchmarks/diffusion/main.py --num_epochs 5 --batch_size 32 --num_workers 8 --cache $BASE/cache & - CUDA_VISIBLE_DEVICES=5 $SRC/milabench/benchmarks/diffusion/main.py --num_epochs 5 --batch_size 32 --num_workers 8 --cache $BASE/cache & - CUDA_VISIBLE_DEVICES=6 $SRC/milabench/benchmarks/diffusion/main.py --num_epochs 5 --batch_size 32 --num_workers 8 --cache $BASE/cache & - CUDA_VISIBLE_DEVICES=7 $SRC/milabench/benchmarks/diffusion/main.py --num_epochs 5 --batch_size 32 --num_workers 8 --cache $BASE/cache & - wait -) - -echo "---" -echo "diffusion-gpus" -echo "==============" -time ( - $SRC/milabench/milabench/scripts/activator $BASE/venv/torch $BASE/cache accelerate launch --mixed_precision=bf16 --dynamo_backend=no --machine_rank=0 --num_machines=1 --multi_gpu --gradient_accumulation_steps=1 --num_cpu_threads_per_process=4 --main_process_ip=127.0.0.1 --main_process_port=29400 --num_processes=8 $SRC/milabench/benchmarks/diffusion/main.py --num_epochs 5 
--batch_size 32 --num_workers 8 --cache $BASE/cache & - wait -) - -echo "---" -echo "diffusion-nodes" -echo "===============" -time ( - $SRC/milabench/milabench/scripts/activator $BASE/venv/torch $BASE/cache accelerate launch --mixed_precision=bf16 --dynamo_backend=no --machine_rank=0 --num_machines=2 --multi_gpu --gradient_accumulation_steps=1 --num_cpu_threads_per_process=4 --main_process_ip=127.0.0.1 --main_process_port=29400 --num_processes=16 $SRC/milabench/benchmarks/diffusion/main.py --num_epochs 5 --batch_size 32 --num_workers 8 --cache $BASE/cache & - ssh -oCheckHostIP=no -oStrictHostKeyChecking=no -oPasswordAuthentication=no -oPasswordAuthentication=no -p 22 username@192.168.0.11 $SRC/milabench/milabench/scripts/activator $BASE/venv/torch $BASE/cache accelerate launch --mixed_precision=bf16 --dynamo_backend=no --machine_rank=1 --num_machines=2 --multi_gpu --gradient_accumulation_steps=1 --num_cpu_threads_per_process=4 --main_process_ip=127.0.0.1 --main_process_port=29400 --num_processes=16 $SRC/milabench/benchmarks/diffusion/main.py --num_epochs 5 --batch_size 32 --num_workers 8 --cache $BASE/cache & - wait -) - -echo "---" -echo "lightning" -echo "=========" -time ( - CUDA_VISIBLE_DEVICES=0 python $SRC/milabench/benchmarks/lightning/main.py --epochs 10 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model resnet152 --batch-size 256 & - CUDA_VISIBLE_DEVICES=1 python $SRC/milabench/benchmarks/lightning/main.py --epochs 10 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model resnet152 --batch-size 256 & - CUDA_VISIBLE_DEVICES=2 python $SRC/milabench/benchmarks/lightning/main.py --epochs 10 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model resnet152 --batch-size 256 & - CUDA_VISIBLE_DEVICES=3 python $SRC/milabench/benchmarks/lightning/main.py --epochs 10 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model resnet152 --batch-size 256 & - CUDA_VISIBLE_DEVICES=4 python $SRC/milabench/benchmarks/lightning/main.py --epochs 10 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model resnet152 --batch-size 256 & - CUDA_VISIBLE_DEVICES=5 python $SRC/milabench/benchmarks/lightning/main.py --epochs 10 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model resnet152 --batch-size 256 & - CUDA_VISIBLE_DEVICES=6 python $SRC/milabench/benchmarks/lightning/main.py --epochs 10 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model resnet152 --batch-size 256 & - CUDA_VISIBLE_DEVICES=7 python $SRC/milabench/benchmarks/lightning/main.py --epochs 10 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model resnet152 --batch-size 256 & - wait -) - -echo "---" -echo "lightning-gpus" -echo "==============" -time ( - $BASE/venv/torch/bin/benchrun --nnodes=1 --rdzv-backend=static --rdzv-endpoint=127.0.0.1:29400 --master-addr=127.0.0.1 --master-port=29400 --local-ranks-filter=0 --nproc-per-node=8 --no-python -- python $SRC/milabench/benchmarks/lightning/main.py --epochs 10 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model resnet152 --batch-size 256 & - wait -) - -echo "---" -echo "dinov2-giant-single" -echo "===================" -time ( - CUDA_VISIBLE_DEVICES=0 python $SRC/milabench/benchmarks/dinov2/main.py --output-dir $BASE/extra/dinov2-giant-single/output --no-resume --config-file $SRC/milabench/benchmarks/dinov2/src/dinov2/configs/train/vitg14.yaml 
train.dataset_path=ImageNet:split=TRAIN:root=$BASE/data/FakeImageNet:extra=$BASE/data/FakeImageNet train.batch_size_per_gpu=32 train.saveckp_freq=100 train.num_workers=10 & - CUDA_VISIBLE_DEVICES=1 python $SRC/milabench/benchmarks/dinov2/main.py --output-dir $BASE/extra/dinov2-giant-single/output --no-resume --config-file $SRC/milabench/benchmarks/dinov2/src/dinov2/configs/train/vitg14.yaml train.dataset_path=ImageNet:split=TRAIN:root=$BASE/data/FakeImageNet:extra=$BASE/data/FakeImageNet train.batch_size_per_gpu=32 train.saveckp_freq=100 train.num_workers=10 & - CUDA_VISIBLE_DEVICES=2 python $SRC/milabench/benchmarks/dinov2/main.py --output-dir $BASE/extra/dinov2-giant-single/output --no-resume --config-file $SRC/milabench/benchmarks/dinov2/src/dinov2/configs/train/vitg14.yaml train.dataset_path=ImageNet:split=TRAIN:root=$BASE/data/FakeImageNet:extra=$BASE/data/FakeImageNet train.batch_size_per_gpu=32 train.saveckp_freq=100 train.num_workers=10 & - CUDA_VISIBLE_DEVICES=3 python $SRC/milabench/benchmarks/dinov2/main.py --output-dir $BASE/extra/dinov2-giant-single/output --no-resume --config-file $SRC/milabench/benchmarks/dinov2/src/dinov2/configs/train/vitg14.yaml train.dataset_path=ImageNet:split=TRAIN:root=$BASE/data/FakeImageNet:extra=$BASE/data/FakeImageNet train.batch_size_per_gpu=32 train.saveckp_freq=100 train.num_workers=10 & - CUDA_VISIBLE_DEVICES=4 python $SRC/milabench/benchmarks/dinov2/main.py --output-dir $BASE/extra/dinov2-giant-single/output --no-resume --config-file $SRC/milabench/benchmarks/dinov2/src/dinov2/configs/train/vitg14.yaml train.dataset_path=ImageNet:split=TRAIN:root=$BASE/data/FakeImageNet:extra=$BASE/data/FakeImageNet train.batch_size_per_gpu=32 train.saveckp_freq=100 train.num_workers=10 & - CUDA_VISIBLE_DEVICES=5 python $SRC/milabench/benchmarks/dinov2/main.py --output-dir $BASE/extra/dinov2-giant-single/output --no-resume --config-file $SRC/milabench/benchmarks/dinov2/src/dinov2/configs/train/vitg14.yaml train.dataset_path=ImageNet:split=TRAIN:root=$BASE/data/FakeImageNet:extra=$BASE/data/FakeImageNet train.batch_size_per_gpu=32 train.saveckp_freq=100 train.num_workers=10 & - CUDA_VISIBLE_DEVICES=6 python $SRC/milabench/benchmarks/dinov2/main.py --output-dir $BASE/extra/dinov2-giant-single/output --no-resume --config-file $SRC/milabench/benchmarks/dinov2/src/dinov2/configs/train/vitg14.yaml train.dataset_path=ImageNet:split=TRAIN:root=$BASE/data/FakeImageNet:extra=$BASE/data/FakeImageNet train.batch_size_per_gpu=32 train.saveckp_freq=100 train.num_workers=10 & - CUDA_VISIBLE_DEVICES=7 python $SRC/milabench/benchmarks/dinov2/main.py --output-dir $BASE/extra/dinov2-giant-single/output --no-resume --config-file $SRC/milabench/benchmarks/dinov2/src/dinov2/configs/train/vitg14.yaml train.dataset_path=ImageNet:split=TRAIN:root=$BASE/data/FakeImageNet:extra=$BASE/data/FakeImageNet train.batch_size_per_gpu=32 train.saveckp_freq=100 train.num_workers=10 & - wait -) - -echo "---" -echo "dinov2-giant-gpus" -echo "=================" -time ( - $BASE/venv/torch/bin/benchrun --nnodes=1 --rdzv-backend=static --rdzv-endpoint=127.0.0.1:29400 --master-addr=127.0.0.1 --master-port=29400 --local-ranks-filter=0 --nproc-per-node=8 --no-python -- python $SRC/milabench/benchmarks/dinov2/main.py --output-dir $BASE/extra/dinov2-giant-gpus/output --no-resume --config-file $SRC/milabench/benchmarks/dinov2/src/dinov2/configs/train/vitg14.yaml train.dataset_path=ImageNet:split=TRAIN:root=$BASE/data/FakeImageNet:extra=$BASE/data/FakeImageNet train.batch_size_per_gpu=32 
train.saveckp_freq=100 train.num_workers=10 & - wait -) - -echo "---" -echo "llm-lora-single" -echo "===============" -time ( - CUDA_VISIBLE_DEVICES=0 $SRC/milabench/benchmarks/llm/recipes/lora_finetune_single_device.py --config $SRC/milabench/benchmarks/llm/configs/llama3_8B_lora_single_device.yaml epochs=1 output_dir=$BASE/extra/llm-lora-single/output tokenizer.path=$BASE/data/llama3_8B/original/tokenizer.model checkpointer.checkpoint_dir=$BASE/data/llama3_8B/original checkpointer.output_dir=$BASE/data/llama3_8B/ metric_logger.log_dir=$BASE/extra/llm-lora-single/metrics repo_id="meta-llama/Meta-Llama-3.1-8B" batch_size=8 gradient_accumulation_steps=8 & - CUDA_VISIBLE_DEVICES=1 $SRC/milabench/benchmarks/llm/recipes/lora_finetune_single_device.py --config $SRC/milabench/benchmarks/llm/configs/llama3_8B_lora_single_device.yaml epochs=1 output_dir=$BASE/extra/llm-lora-single/output tokenizer.path=$BASE/data/llama3_8B/original/tokenizer.model checkpointer.checkpoint_dir=$BASE/data/llama3_8B/original checkpointer.output_dir=$BASE/data/llama3_8B/ metric_logger.log_dir=$BASE/extra/llm-lora-single/metrics repo_id="meta-llama/Meta-Llama-3.1-8B" batch_size=8 gradient_accumulation_steps=8 & - CUDA_VISIBLE_DEVICES=2 $SRC/milabench/benchmarks/llm/recipes/lora_finetune_single_device.py --config $SRC/milabench/benchmarks/llm/configs/llama3_8B_lora_single_device.yaml epochs=1 output_dir=$BASE/extra/llm-lora-single/output tokenizer.path=$BASE/data/llama3_8B/original/tokenizer.model checkpointer.checkpoint_dir=$BASE/data/llama3_8B/original checkpointer.output_dir=$BASE/data/llama3_8B/ metric_logger.log_dir=$BASE/extra/llm-lora-single/metrics repo_id="meta-llama/Meta-Llama-3.1-8B" batch_size=8 gradient_accumulation_steps=8 & - CUDA_VISIBLE_DEVICES=3 $SRC/milabench/benchmarks/llm/recipes/lora_finetune_single_device.py --config $SRC/milabench/benchmarks/llm/configs/llama3_8B_lora_single_device.yaml epochs=1 output_dir=$BASE/extra/llm-lora-single/output tokenizer.path=$BASE/data/llama3_8B/original/tokenizer.model checkpointer.checkpoint_dir=$BASE/data/llama3_8B/original checkpointer.output_dir=$BASE/data/llama3_8B/ metric_logger.log_dir=$BASE/extra/llm-lora-single/metrics repo_id="meta-llama/Meta-Llama-3.1-8B" batch_size=8 gradient_accumulation_steps=8 & - CUDA_VISIBLE_DEVICES=4 $SRC/milabench/benchmarks/llm/recipes/lora_finetune_single_device.py --config $SRC/milabench/benchmarks/llm/configs/llama3_8B_lora_single_device.yaml epochs=1 output_dir=$BASE/extra/llm-lora-single/output tokenizer.path=$BASE/data/llama3_8B/original/tokenizer.model checkpointer.checkpoint_dir=$BASE/data/llama3_8B/original checkpointer.output_dir=$BASE/data/llama3_8B/ metric_logger.log_dir=$BASE/extra/llm-lora-single/metrics repo_id="meta-llama/Meta-Llama-3.1-8B" batch_size=8 gradient_accumulation_steps=8 & - CUDA_VISIBLE_DEVICES=5 $SRC/milabench/benchmarks/llm/recipes/lora_finetune_single_device.py --config $SRC/milabench/benchmarks/llm/configs/llama3_8B_lora_single_device.yaml epochs=1 output_dir=$BASE/extra/llm-lora-single/output tokenizer.path=$BASE/data/llama3_8B/original/tokenizer.model checkpointer.checkpoint_dir=$BASE/data/llama3_8B/original checkpointer.output_dir=$BASE/data/llama3_8B/ metric_logger.log_dir=$BASE/extra/llm-lora-single/metrics repo_id="meta-llama/Meta-Llama-3.1-8B" batch_size=8 gradient_accumulation_steps=8 & - CUDA_VISIBLE_DEVICES=6 $SRC/milabench/benchmarks/llm/recipes/lora_finetune_single_device.py --config $SRC/milabench/benchmarks/llm/configs/llama3_8B_lora_single_device.yaml epochs=1 
output_dir=$BASE/extra/llm-lora-single/output tokenizer.path=$BASE/data/llama3_8B/original/tokenizer.model checkpointer.checkpoint_dir=$BASE/data/llama3_8B/original checkpointer.output_dir=$BASE/data/llama3_8B/ metric_logger.log_dir=$BASE/extra/llm-lora-single/metrics repo_id="meta-llama/Meta-Llama-3.1-8B" batch_size=8 gradient_accumulation_steps=8 & - CUDA_VISIBLE_DEVICES=7 $SRC/milabench/benchmarks/llm/recipes/lora_finetune_single_device.py --config $SRC/milabench/benchmarks/llm/configs/llama3_8B_lora_single_device.yaml epochs=1 output_dir=$BASE/extra/llm-lora-single/output tokenizer.path=$BASE/data/llama3_8B/original/tokenizer.model checkpointer.checkpoint_dir=$BASE/data/llama3_8B/original checkpointer.output_dir=$BASE/data/llama3_8B/ metric_logger.log_dir=$BASE/extra/llm-lora-single/metrics repo_id="meta-llama/Meta-Llama-3.1-8B" batch_size=8 gradient_accumulation_steps=8 & - wait -) - -echo "---" -echo "llm-lora-ddp-gpus" -echo "=================" -time ( - $BASE/venv/torch/bin/tune run --nnodes=1 --rdzv-backend=static --rdzv-endpoint=127.0.0.1:29400 --master-addr=127.0.0.1 --master-port=29400 --local-ranks-filter=0 --nproc-per-node=8 -- $SRC/milabench/benchmarks/llm/recipes/lora_finetune_distributed.py --config $SRC/milabench/benchmarks/llm/configs/llama3_8B_lora_single_device.yaml epochs=1 output_dir=$BASE/extra/llm-lora-ddp-gpus/output tokenizer.path=$BASE/data/llama3_8B/original/tokenizer.model checkpointer.checkpoint_dir=$BASE/data/llama3_8B/original checkpointer.output_dir=$BASE/data/llama3_8B/ metric_logger.log_dir=$BASE/extra/llm-lora-ddp-gpus/metrics repo_id="meta-llama/Meta-Llama-3.1-8B" batch_size=8 gradient_accumulation_steps=8 & - wait -) - -echo "---" -echo "llm-lora-ddp-nodes" -echo "==================" -time ( - $BASE/venv/torch/bin/tune run --nnodes=2 --rdzv-backend=static --rdzv-endpoint=127.0.0.1:29400 --master-addr=127.0.0.1 --master-port=29400 --local-ranks-filter=0 --node-rank=0 --local-addr=127.0.0.1 --rdzv-conf=rank=0 --nproc-per-node=8 -- $SRC/milabench/benchmarks/llm/recipes/lora_finetune_distributed.py --config $SRC/milabench/benchmarks/llm/configs/llama3_8B_lora_single_device.yaml epochs=1 output_dir=$BASE/extra/llm-lora-ddp-nodes/output tokenizer.path=$BASE/data/llama3_8B/original/tokenizer.model checkpointer.checkpoint_dir=$BASE/data/llama3_8B/original checkpointer.output_dir=$BASE/data/llama3_8B/ metric_logger.log_dir=$BASE/extra/llm-lora-ddp-nodes/metrics repo_id="meta-llama/Meta-Llama-3.1-8B" batch_size=8 gradient_accumulation_steps=8 & - ssh -oCheckHostIP=no -oStrictHostKeyChecking=no -oPasswordAuthentication=no -oPasswordAuthentication=no -p 22 username@192.168.0.11 $BASE/venv/torch/bin/tune run --nnodes=2 --rdzv-backend=static --rdzv-endpoint=127.0.0.1:29400 --master-addr=127.0.0.1 --master-port=29400 --local-ranks-filter=0 --node-rank=1 --local-addr=192.168.0.11 --rdzv-conf=rank=1 --nproc-per-node=8 -- $SRC/milabench/benchmarks/llm/recipes/lora_finetune_distributed.py --config $SRC/milabench/benchmarks/llm/configs/llama3_8B_lora_single_device.yaml epochs=1 output_dir=$BASE/extra/llm-lora-ddp-nodes/output tokenizer.path=$BASE/data/llama3_8B/original/tokenizer.model checkpointer.checkpoint_dir=$BASE/data/llama3_8B/original checkpointer.output_dir=$BASE/data/llama3_8B/ metric_logger.log_dir=$BASE/extra/llm-lora-ddp-nodes/metrics repo_id="meta-llama/Meta-Llama-3.1-8B" batch_size=8 gradient_accumulation_steps=8 & - wait -) - -echo "---" -echo "llm-lora-mp-gpus" -echo "================" -time ( - $BASE/venv/torch/bin/tune run --nnodes=1 
--rdzv-backend=static --rdzv-endpoint=127.0.0.1:29400 --master-addr=127.0.0.1 --master-port=29400 --local-ranks-filter=0 --nproc-per-node=8 -- $SRC/milabench/benchmarks/llm/recipes/lora_finetune_distributed.py --config $SRC/milabench/benchmarks/llm/configs/llama3_70B_lora.yaml epochs=1 output_dir=$BASE/extra/llm-lora-mp-gpus/output tokenizer.path=$BASE/data/llama3_70B/original/tokenizer.model checkpointer.checkpoint_dir=$BASE/data/llama3_70B checkpointer.output_dir=$BASE/data/llama3_70B/ safetensors=true metric_logger.log_dir=$BASE/extra/llm-lora-mp-gpus/metrics repo_id="meta-llama/Meta-Llama-3.1-70B" batch_size=8 gradient_accumulation_steps=1 & - wait -) - -echo "---" -echo "llm-full-mp-gpus" -echo "================" -time ( - $BASE/venv/torch/bin/tune run --nnodes=1 --rdzv-backend=static --rdzv-endpoint=127.0.0.1:29400 --master-addr=127.0.0.1 --master-port=29400 --local-ranks-filter=0 --nproc-per-node=8 -- $SRC/milabench/benchmarks/llm/recipes/full_finetune_distributed.py --config $SRC/milabench/benchmarks/llm/configs/llama3_70B_full.yaml epochs=1 output_dir=$BASE/extra/llm-full-mp-gpus/output tokenizer.path=$BASE/data/llama3_70B/original/tokenizer.model checkpointer.checkpoint_dir=$BASE/data/llama3_70B checkpointer.output_dir=$BASE/data/llama3_70B/ metric_logger.log_dir=$BASE/extra/llm-full-mp-gpus/metrics repo_id="meta-llama/Meta-Llama-3.1-70B" safetensors=true batch_size=2 gradient_accumulation_steps=1 & - wait -) - -echo "---" -echo "llm-full-mp-nodes" -echo "=================" -time ( - $BASE/venv/torch/bin/tune run --nnodes=2 --rdzv-backend=static --rdzv-endpoint=127.0.0.1:29400 --master-addr=127.0.0.1 --master-port=29400 --local-ranks-filter=0 --node-rank=0 --local-addr=127.0.0.1 --rdzv-conf=rank=0 --nproc-per-node=8 -- $SRC/milabench/benchmarks/llm/recipes/full_finetune_distributed.py --config $SRC/milabench/benchmarks/llm/configs/llama3_70B_full.yaml epochs=1 output_dir=$BASE/extra/llm-full-mp-nodes/output tokenizer.path=$BASE/data/llama3_70B/original/tokenizer.model checkpointer.checkpoint_dir=$BASE/data/llama3_70B checkpointer.output_dir=$BASE/data/llama3_70B/ metric_logger.log_dir=$BASE/extra/llm-full-mp-nodes/metrics repo_id="meta-llama/Meta-Llama-3.1-70B" safetensors=true batch_size=2 gradient_accumulation_steps=1 & - ssh -oCheckHostIP=no -oStrictHostKeyChecking=no -oPasswordAuthentication=no -oPasswordAuthentication=no -p 22 username@192.168.0.11 $BASE/venv/torch/bin/tune run --nnodes=2 --rdzv-backend=static --rdzv-endpoint=127.0.0.1:29400 --master-addr=127.0.0.1 --master-port=29400 --local-ranks-filter=0 --node-rank=1 --local-addr=192.168.0.11 --rdzv-conf=rank=1 --nproc-per-node=8 -- $SRC/milabench/benchmarks/llm/recipes/full_finetune_distributed.py --config $SRC/milabench/benchmarks/llm/configs/llama3_70B_full.yaml epochs=1 output_dir=$BASE/extra/llm-full-mp-nodes/output tokenizer.path=$BASE/data/llama3_70B/original/tokenizer.model checkpointer.checkpoint_dir=$BASE/data/llama3_70B checkpointer.output_dir=$BASE/data/llama3_70B/ metric_logger.log_dir=$BASE/extra/llm-full-mp-nodes/metrics repo_id="meta-llama/Meta-Llama-3.1-70B" safetensors=true batch_size=2 gradient_accumulation_steps=1 & - wait -) - -echo "---" -echo "dqn" -echo "===" -time ( - CUDA_VISIBLE_DEVICES=0 python $SRC/milabench/benchmarks/purejaxrl/main.py dqn --num_envs 128 --buffer_batch_size 128 --env_name CartPole-v1 --training_interval 10 & - CUDA_VISIBLE_DEVICES=1 python $SRC/milabench/benchmarks/purejaxrl/main.py dqn --num_envs 128 --buffer_batch_size 128 --env_name CartPole-v1 --training_interval 10 & - 
CUDA_VISIBLE_DEVICES=2 python $SRC/milabench/benchmarks/purejaxrl/main.py dqn --num_envs 128 --buffer_batch_size 128 --env_name CartPole-v1 --training_interval 10 & - CUDA_VISIBLE_DEVICES=3 python $SRC/milabench/benchmarks/purejaxrl/main.py dqn --num_envs 128 --buffer_batch_size 128 --env_name CartPole-v1 --training_interval 10 & - CUDA_VISIBLE_DEVICES=4 python $SRC/milabench/benchmarks/purejaxrl/main.py dqn --num_envs 128 --buffer_batch_size 128 --env_name CartPole-v1 --training_interval 10 & - CUDA_VISIBLE_DEVICES=5 python $SRC/milabench/benchmarks/purejaxrl/main.py dqn --num_envs 128 --buffer_batch_size 128 --env_name CartPole-v1 --training_interval 10 & - CUDA_VISIBLE_DEVICES=6 python $SRC/milabench/benchmarks/purejaxrl/main.py dqn --num_envs 128 --buffer_batch_size 128 --env_name CartPole-v1 --training_interval 10 & - CUDA_VISIBLE_DEVICES=7 python $SRC/milabench/benchmarks/purejaxrl/main.py dqn --num_envs 128 --buffer_batch_size 128 --env_name CartPole-v1 --training_interval 10 & - wait -) - -echo "---" -echo "ppo" -echo "===" -time ( - CUDA_VISIBLE_DEVICES=0 python $SRC/milabench/benchmarks/purejaxrl/main.py ppo --num_envs 128 --num_steps 10 --num_minibatches 32 --update_epochs 4 --env_name hopper --total_timesteps 200000 & - CUDA_VISIBLE_DEVICES=1 python $SRC/milabench/benchmarks/purejaxrl/main.py ppo --num_envs 128 --num_steps 10 --num_minibatches 32 --update_epochs 4 --env_name hopper --total_timesteps 200000 & - CUDA_VISIBLE_DEVICES=2 python $SRC/milabench/benchmarks/purejaxrl/main.py ppo --num_envs 128 --num_steps 10 --num_minibatches 32 --update_epochs 4 --env_name hopper --total_timesteps 200000 & - CUDA_VISIBLE_DEVICES=3 python $SRC/milabench/benchmarks/purejaxrl/main.py ppo --num_envs 128 --num_steps 10 --num_minibatches 32 --update_epochs 4 --env_name hopper --total_timesteps 200000 & - CUDA_VISIBLE_DEVICES=4 python $SRC/milabench/benchmarks/purejaxrl/main.py ppo --num_envs 128 --num_steps 10 --num_minibatches 32 --update_epochs 4 --env_name hopper --total_timesteps 200000 & - CUDA_VISIBLE_DEVICES=5 python $SRC/milabench/benchmarks/purejaxrl/main.py ppo --num_envs 128 --num_steps 10 --num_minibatches 32 --update_epochs 4 --env_name hopper --total_timesteps 200000 & - CUDA_VISIBLE_DEVICES=6 python $SRC/milabench/benchmarks/purejaxrl/main.py ppo --num_envs 128 --num_steps 10 --num_minibatches 32 --update_epochs 4 --env_name hopper --total_timesteps 200000 & - CUDA_VISIBLE_DEVICES=7 python $SRC/milabench/benchmarks/purejaxrl/main.py ppo --num_envs 128 --num_steps 10 --num_minibatches 32 --update_epochs 4 --env_name hopper --total_timesteps 200000 & - wait -) - -echo "---" -echo "dimenet" -echo "=======" -time ( - CUDA_VISIBLE_DEVICES=0 python $SRC/milabench/benchmarks/geo_gnn/main.py --model DimeNet --num-samples 10000 --use3d & - CUDA_VISIBLE_DEVICES=1 python $SRC/milabench/benchmarks/geo_gnn/main.py --model DimeNet --num-samples 10000 --use3d & - CUDA_VISIBLE_DEVICES=2 python $SRC/milabench/benchmarks/geo_gnn/main.py --model DimeNet --num-samples 10000 --use3d & - CUDA_VISIBLE_DEVICES=3 python $SRC/milabench/benchmarks/geo_gnn/main.py --model DimeNet --num-samples 10000 --use3d & - CUDA_VISIBLE_DEVICES=4 python $SRC/milabench/benchmarks/geo_gnn/main.py --model DimeNet --num-samples 10000 --use3d & - CUDA_VISIBLE_DEVICES=5 python $SRC/milabench/benchmarks/geo_gnn/main.py --model DimeNet --num-samples 10000 --use3d & - CUDA_VISIBLE_DEVICES=6 python $SRC/milabench/benchmarks/geo_gnn/main.py --model DimeNet --num-samples 10000 --use3d & - CUDA_VISIBLE_DEVICES=7 python 
$SRC/milabench/benchmarks/geo_gnn/main.py --model DimeNet --num-samples 10000 --use3d & - wait -) - -echo "---" -echo "recursiongfn" -echo "============" -time ( - CUDA_VISIBLE_DEVICES=0 python $SRC/milabench/benchmarks/recursiongfn/main.py --batch_size 128 --num_workers 8 --num_steps 100 --layer_width 128 --num_layers 4 & - CUDA_VISIBLE_DEVICES=1 python $SRC/milabench/benchmarks/recursiongfn/main.py --batch_size 128 --num_workers 8 --num_steps 100 --layer_width 128 --num_layers 4 & - CUDA_VISIBLE_DEVICES=2 python $SRC/milabench/benchmarks/recursiongfn/main.py --batch_size 128 --num_workers 8 --num_steps 100 --layer_width 128 --num_layers 4 & - CUDA_VISIBLE_DEVICES=3 python $SRC/milabench/benchmarks/recursiongfn/main.py --batch_size 128 --num_workers 8 --num_steps 100 --layer_width 128 --num_layers 4 & - CUDA_VISIBLE_DEVICES=4 python $SRC/milabench/benchmarks/recursiongfn/main.py --batch_size 128 --num_workers 8 --num_steps 100 --layer_width 128 --num_layers 4 & - CUDA_VISIBLE_DEVICES=5 python $SRC/milabench/benchmarks/recursiongfn/main.py --batch_size 128 --num_workers 8 --num_steps 100 --layer_width 128 --num_layers 4 & - CUDA_VISIBLE_DEVICES=6 python $SRC/milabench/benchmarks/recursiongfn/main.py --batch_size 128 --num_workers 8 --num_steps 100 --layer_width 128 --num_layers 4 & - CUDA_VISIBLE_DEVICES=7 python $SRC/milabench/benchmarks/recursiongfn/main.py --batch_size 128 --num_workers 8 --num_steps 100 --layer_width 128 --num_layers 4 & - wait -) - -echo "---" -echo "torchatari" -echo "==========" -time ( - CUDA_VISIBLE_DEVICES=0 python $SRC/milabench/benchmarks/torchatari/main.py --num-minibatches 16 --update-epochs 4 --num-steps 128 --num-envs 128 --total-timesteps 1000000 --env-id Breakout-v5 & - CUDA_VISIBLE_DEVICES=1 python $SRC/milabench/benchmarks/torchatari/main.py --num-minibatches 16 --update-epochs 4 --num-steps 128 --num-envs 128 --total-timesteps 1000000 --env-id Breakout-v5 & - CUDA_VISIBLE_DEVICES=2 python $SRC/milabench/benchmarks/torchatari/main.py --num-minibatches 16 --update-epochs 4 --num-steps 128 --num-envs 128 --total-timesteps 1000000 --env-id Breakout-v5 & - CUDA_VISIBLE_DEVICES=3 python $SRC/milabench/benchmarks/torchatari/main.py --num-minibatches 16 --update-epochs 4 --num-steps 128 --num-envs 128 --total-timesteps 1000000 --env-id Breakout-v5 & - CUDA_VISIBLE_DEVICES=4 python $SRC/milabench/benchmarks/torchatari/main.py --num-minibatches 16 --update-epochs 4 --num-steps 128 --num-envs 128 --total-timesteps 1000000 --env-id Breakout-v5 & - CUDA_VISIBLE_DEVICES=5 python $SRC/milabench/benchmarks/torchatari/main.py --num-minibatches 16 --update-epochs 4 --num-steps 128 --num-envs 128 --total-timesteps 1000000 --env-id Breakout-v5 & - CUDA_VISIBLE_DEVICES=6 python $SRC/milabench/benchmarks/torchatari/main.py --num-minibatches 16 --update-epochs 4 --num-steps 128 --num-envs 128 --total-timesteps 1000000 --env-id Breakout-v5 & - CUDA_VISIBLE_DEVICES=7 python $SRC/milabench/benchmarks/torchatari/main.py --num-minibatches 16 --update-epochs 4 --num-steps 128 --num-envs 128 --total-timesteps 1000000 --env-id Breakout-v5 & - wait -) - -echo "---" -echo "llava-single" -echo "============" -time ( - CUDA_VISIBLE_DEVICES=0 $SRC/milabench/benchmarks/llava/main.py --batch_size 1 --num_workers 4 --gradient_accumulation_steps 1 & - CUDA_VISIBLE_DEVICES=1 $SRC/milabench/benchmarks/llava/main.py --batch_size 1 --num_workers 4 --gradient_accumulation_steps 1 & - CUDA_VISIBLE_DEVICES=2 $SRC/milabench/benchmarks/llava/main.py --batch_size 1 --num_workers 4 
--gradient_accumulation_steps 1 & - CUDA_VISIBLE_DEVICES=3 $SRC/milabench/benchmarks/llava/main.py --batch_size 1 --num_workers 4 --gradient_accumulation_steps 1 & - CUDA_VISIBLE_DEVICES=4 $SRC/milabench/benchmarks/llava/main.py --batch_size 1 --num_workers 4 --gradient_accumulation_steps 1 & - CUDA_VISIBLE_DEVICES=5 $SRC/milabench/benchmarks/llava/main.py --batch_size 1 --num_workers 4 --gradient_accumulation_steps 1 & - CUDA_VISIBLE_DEVICES=6 $SRC/milabench/benchmarks/llava/main.py --batch_size 1 --num_workers 4 --gradient_accumulation_steps 1 & - CUDA_VISIBLE_DEVICES=7 $SRC/milabench/benchmarks/llava/main.py --batch_size 1 --num_workers 4 --gradient_accumulation_steps 1 & - wait -) - -echo "---" -echo "rlhf-single" -echo "===========" -time ( - CUDA_VISIBLE_DEVICES=0 $SRC/milabench/benchmarks/rlhf/main.py --output_dir $BASE/extra/rlhf-single/output --model_name_or_path EleutherAI/pythia-1b-deduped --per_device_train_batch_size 64 --logging_strategy no --log_level critical --bf16 & - CUDA_VISIBLE_DEVICES=1 $SRC/milabench/benchmarks/rlhf/main.py --output_dir $BASE/extra/rlhf-single/output --model_name_or_path EleutherAI/pythia-1b-deduped --per_device_train_batch_size 64 --logging_strategy no --log_level critical --bf16 & - CUDA_VISIBLE_DEVICES=2 $SRC/milabench/benchmarks/rlhf/main.py --output_dir $BASE/extra/rlhf-single/output --model_name_or_path EleutherAI/pythia-1b-deduped --per_device_train_batch_size 64 --logging_strategy no --log_level critical --bf16 & - CUDA_VISIBLE_DEVICES=3 $SRC/milabench/benchmarks/rlhf/main.py --output_dir $BASE/extra/rlhf-single/output --model_name_or_path EleutherAI/pythia-1b-deduped --per_device_train_batch_size 64 --logging_strategy no --log_level critical --bf16 & - CUDA_VISIBLE_DEVICES=4 $SRC/milabench/benchmarks/rlhf/main.py --output_dir $BASE/extra/rlhf-single/output --model_name_or_path EleutherAI/pythia-1b-deduped --per_device_train_batch_size 64 --logging_strategy no --log_level critical --bf16 & - CUDA_VISIBLE_DEVICES=5 $SRC/milabench/benchmarks/rlhf/main.py --output_dir $BASE/extra/rlhf-single/output --model_name_or_path EleutherAI/pythia-1b-deduped --per_device_train_batch_size 64 --logging_strategy no --log_level critical --bf16 & - CUDA_VISIBLE_DEVICES=6 $SRC/milabench/benchmarks/rlhf/main.py --output_dir $BASE/extra/rlhf-single/output --model_name_or_path EleutherAI/pythia-1b-deduped --per_device_train_batch_size 64 --logging_strategy no --log_level critical --bf16 & - CUDA_VISIBLE_DEVICES=7 $SRC/milabench/benchmarks/rlhf/main.py --output_dir $BASE/extra/rlhf-single/output --model_name_or_path EleutherAI/pythia-1b-deduped --per_device_train_batch_size 64 --logging_strategy no --log_level critical --bf16 & - wait -) - -echo "---" -echo "rlhf-gpus" -echo "=========" -time ( - $SRC/milabench/milabench/scripts/activator $BASE/venv/torch $BASE/cache accelerate launch --mixed_precision=bf16 --dynamo_backend=no --machine_rank=0 --num_machines=1 --multi_gpu --gradient_accumulation_steps=1 --num_cpu_threads_per_process=4 --main_process_ip=127.0.0.1 --main_process_port=29400 --num_processes=8 $SRC/milabench/benchmarks/rlhf/main.py --output_dir $BASE/extra/rlhf-gpus/output --model_name_or_path EleutherAI/pythia-1b-deduped --per_device_train_batch_size 64 --logging_strategy no --log_level critical --bf16 & - wait -) - -echo "---" -echo "vjepa-single" -echo "============" -time ( - CUDA_VISIBLE_DEVICES=0 python $SRC/milabench/benchmarks/vjepa/main.py --batch_size 24 --num_workers 12 --dataset $BASE/data/FakeVideo/video_metainfo.csv --output 
$BASE/extra/vjepa-single & - CUDA_VISIBLE_DEVICES=1 python $SRC/milabench/benchmarks/vjepa/main.py --batch_size 24 --num_workers 12 --dataset $BASE/data/FakeVideo/video_metainfo.csv --output $BASE/extra/vjepa-single & - CUDA_VISIBLE_DEVICES=2 python $SRC/milabench/benchmarks/vjepa/main.py --batch_size 24 --num_workers 12 --dataset $BASE/data/FakeVideo/video_metainfo.csv --output $BASE/extra/vjepa-single & - CUDA_VISIBLE_DEVICES=3 python $SRC/milabench/benchmarks/vjepa/main.py --batch_size 24 --num_workers 12 --dataset $BASE/data/FakeVideo/video_metainfo.csv --output $BASE/extra/vjepa-single & - CUDA_VISIBLE_DEVICES=4 python $SRC/milabench/benchmarks/vjepa/main.py --batch_size 24 --num_workers 12 --dataset $BASE/data/FakeVideo/video_metainfo.csv --output $BASE/extra/vjepa-single & - CUDA_VISIBLE_DEVICES=5 python $SRC/milabench/benchmarks/vjepa/main.py --batch_size 24 --num_workers 12 --dataset $BASE/data/FakeVideo/video_metainfo.csv --output $BASE/extra/vjepa-single & - CUDA_VISIBLE_DEVICES=6 python $SRC/milabench/benchmarks/vjepa/main.py --batch_size 24 --num_workers 12 --dataset $BASE/data/FakeVideo/video_metainfo.csv --output $BASE/extra/vjepa-single & - CUDA_VISIBLE_DEVICES=7 python $SRC/milabench/benchmarks/vjepa/main.py --batch_size 24 --num_workers 12 --dataset $BASE/data/FakeVideo/video_metainfo.csv --output $BASE/extra/vjepa-single & - wait -) - -echo "---" -echo "vjepa-gpus" -echo "==========" -time ( - $BASE/venv/torch/bin/benchrun --nnodes=1 --rdzv-backend=static --rdzv-endpoint=127.0.0.1:29400 --master-addr=127.0.0.1 --master-port=29400 --local-ranks-filter=0 --nproc-per-node=8 --no-python -- python $SRC/milabench/benchmarks/vjepa/main.py --batch_size 24 --num_workers 12 --dataset $BASE/data/FakeVideo/video_metainfo.csv --output $BASE/extra/vjepa-gpus & - wait -) - diff --git a/tests/test_capabilities.py b/tests/test_validation/test_capabilities.py similarity index 100% rename from tests/test_capabilities.py rename to tests/test_validation/test_capabilities.py From 9a29f06d689482cb3c1b72d044403d0866f31e7f Mon Sep 17 00:00:00 2001 From: Pierre Delaunay Date: Thu, 21 Nov 2024 17:12:15 -0500 Subject: [PATCH 15/20] undo gaudi2 config for llm --- benchmarks/llm/configs/llama3_70B_full.yaml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/benchmarks/llm/configs/llama3_70B_full.yaml b/benchmarks/llm/configs/llama3_70B_full.yaml index 22b52b79..703eb876 100644 --- a/benchmarks/llm/configs/llama3_70B_full.yaml +++ b/benchmarks/llm/configs/llama3_70B_full.yaml @@ -82,7 +82,7 @@ optimizer: foreach: False # Note: highly recommended to use fused=True optimizer flag # with CPU offload for faster optimizer step. - fused: False + fused: true loss: _component_: torch.nn.CrossEntropyLoss @@ -94,9 +94,9 @@ gradient_accumulation_steps: 1 device: cuda # Memory management -enable_activation_checkpointing: false -memory_efficient_fsdp_wrap: false -fsdp_cpu_offload: false +enable_activation_checkpointing: true +memory_efficient_fsdp_wrap: true +fsdp_cpu_offload: true # Reduced precision dtype: bf16 From 06fa1be26201f04bb2281ecd86e70da7e3522623 Mon Sep 17 00:00:00 2001 From: Setepenre Date: Thu, 21 Nov 2024 19:44:48 -0500 Subject: [PATCH 16/20] Update README.md --- README.md | 152 +++++++++++++++++++++++++++--------------------------- 1 file changed, 76 insertions(+), 76 deletions(-) diff --git a/README.md b/README.md index 163906d0..a2f8ce50 100644 --- a/README.md +++ b/README.md @@ -25,18 +25,18 @@ evaluating current and future hardware in a research environment. 
## Getting Started - - git clone https://github.com/mila-iqia/milabench.git - pip install -e milabench - - export MILABENCH_GPU_ARCH=cuda - - milabench install --base workspace --config milabench/config/standard.yaml --select fp32 + git clone https://github.com/mila-iqia/milabench.git + + pip install -e milabench - milabench prepare --base workspace --config milabench/config/standard.yaml --select fp32 + export MILABENCH_GPU_ARCH=cuda - milabench run --base workspace --config milabench/config/standard.yaml --select fp32 + milabench install --base workspace --config milabench/config/standard.yaml --select fp32 + + milabench prepare --base workspace --config milabench/config/standard.yaml --select fp32 + + milabench run --base workspace --config milabench/config/standard.yaml --select fp32 ## Details @@ -52,70 +52,70 @@ The benchmark suite has been validated on the following configurations: We are working on validating it on more configurations and will update the above table as we do. ## Report - - ================= - Benchmark results - ================= - - System - ------ - cpu: AMD EPYC 7742 64-Core Processor - n_cpu: 128 - product: NVIDIA A100-SXM4-80GB - n_gpu: 8 - memory: 81920.0 - - Breakdown - --------- - bench | fail | n | ngpu | perf | sem% | std% | peak_memory | score | weight - brax | 0 | 1 | 8 | 730035.71 | 0.1% | 0.4% | 2670 | 730035.71 | 1.00 - diffusion-gpus | 0 | 1 | 8 | 117.67 | 1.5% | 11.7% | 59944 | 117.67 | 1.00 - diffusion-single | 0 | 8 | 1 | 25.02 | 0.8% | 17.9% | 53994 | 202.10 | 1.00 - dimenet | 0 | 8 | 1 | 366.85 | 0.7% | 16.2% | 2302 | 2973.32 | 1.00 - dinov2-giant-gpus | 0 | 1 | 8 | 445.68 | 0.4% | 3.0% | 69614 | 445.68 | 1.00 - dinov2-giant-single | 0 | 8 | 1 | 53.54 | 0.4% | 9.5% | 74646 | 432.65 | 1.00 - dqn | 0 | 8 | 1 | 23089954554.91 | 1.1% | 89.9% | 62106 | 184480810548.20 | 1.00 - bf16 | 0 | 8 | 1 | 293.43 | 0.2% | 6.3% | 1788 | 2361.16 | 0.00 - fp16 | 0 | 8 | 1 | 289.26 | 0.1% | 3.6% | 1788 | 2321.65 | 0.00 - fp32 | 0 | 8 | 1 | 19.14 | 0.0% | 0.7% | 2166 | 153.21 | 0.00 - tf32 | 0 | 8 | 1 | 146.63 | 0.1% | 3.6% | 2166 | 1177.04 | 0.00 - bert-fp16 | 0 | 8 | 1 | 263.73 | 1.1% | 16.7% | nan | 2165.37 | 0.00 - bert-fp32 | 0 | 8 | 1 | 44.84 | 0.6% | 9.6% | 21170 | 364.52 | 0.00 - bert-tf32 | 0 | 8 | 1 | 141.95 | 0.9% | 14.1% | 1764 | 1162.94 | 0.00 - bert-tf32-fp16 | 0 | 8 | 1 | 265.04 | 1.0% | 15.6% | nan | 2175.59 | 3.00 - reformer | 0 | 8 | 1 | 62.29 | 0.3% | 6.0% | 25404 | 501.89 | 1.00 - t5 | 0 | 8 | 1 | 51.40 | 0.5% | 9.9% | 34390 | 416.14 | 2.00 - whisper | 0 | 8 | 1 | 481.95 | 1.0% | 21.4% | 8520 | 3897.53 | 1.00 - lightning | 0 | 8 | 1 | 680.22 | 1.0% | 22.7% | 27360 | 5506.90 | 1.00 - lightning-gpus | 0 | 1 | 8 | 3504.74 | 7.9% | 62.9% | 28184 | 3504.74 | 1.00 - llava-single | 1 | 8 | 1 | 2.28 | 0.4% | 9.6% | 72556 | 14.12 | 1.00 - llama | 0 | 8 | 1 | 484.86 | 4.4% | 80.0% | 27820 | 3680.86 | 1.00 - llm-full-mp-gpus | 0 | 1 | 8 | 193.92 | 3.1% | 16.2% | 48470 | 193.92 | 1.00 - llm-lora-ddp-gpus | 0 | 1 | 8 | 16738.58 | 0.4% | 2.0% | 36988 | 16738.58 | 1.00 - llm-lora-mp-gpus | 0 | 1 | 8 | 1980.63 | 2.2% | 11.8% | 55972 | 1980.63 | 1.00 - llm-lora-single | 0 | 8 | 1 | 2724.95 | 0.2% | 3.0% | 49926 | 21861.99 | 1.00 - ppo | 0 | 8 | 1 | 3114264.32 | 1.6% | 57.2% | 62206 | 24915954.98 | 1.00 - recursiongfn | 0 | 8 | 1 | 7080.67 | 1.2% | 27.1% | 10292 | 57038.34 | 1.00 - rlhf-gpus | 0 | 1 | 8 | 6314.94 | 2.1% | 11.2% | 21730 | 6314.94 | 1.00 - rlhf-single | 0 | 8 | 1 | 1143.72 | 0.4% | 8.4% | 19566 | 9174.52 | 1.00 - focalnet | 0 | 8 | 1 | 375.07 | 
0.7% | 14.9% | 23536 | 3038.83 | 2.00
- torchatari | 0 | 8 | 1 | 5848.88 | 0.6% | 12.7% | 3834 | 46613.34 | 1.00
- convnext_large-fp16 | 0 | 8 | 1 | 330.93 | 1.5% | 22.9% | 27376 | 2711.46 | 0.00
- convnext_large-fp32 | 0 | 8 | 1 | 59.49 | 0.6% | 9.8% | 55950 | 483.84 | 0.00
- convnext_large-tf32 | 0 | 8 | 1 | 155.41 | 0.9% | 14.3% | 49650 | 1273.31 | 0.00
- convnext_large-tf32-fp16 | 0 | 8 | 1 | 322.28 | 1.6% | 24.5% | 27376 | 2637.88 | 3.00
- regnet_y_128gf | 0 | 8 | 1 | 119.46 | 0.5% | 10.0% | 29762 | 966.96 | 2.00
- resnet152-ddp-gpus | 0 | 1 | 8 | 3843.06 | 5.2% | 39.3% | 27980 | 3843.06 | 0.00
- resnet50 | 0 | 8 | 1 | 932.95 | 2.4% | 52.2% | 14848 | 7524.25 | 1.00
- resnet50-noio | 0 | 8 | 1 | 1163.88 | 0.3% | 6.7% | 27480 | 9385.35 | 0.00
- vjepa-gpus | 0 | 1 | 8 | 130.13 | 5.9% | 46.8% | 64244 | 130.13 | 1.00
- vjepa-single | 0 | 8 | 1 | 21.29 | 1.0% | 22.4% | 58552 | 172.11 | 1.00
-
- Scores
- ------
- Failure rate: 0.38% (PASS)
- Score: 4175.57
-
- Errors
- ------
- 1 errors, details in HTML report.
\ No newline at end of file
+
+ =================
+ Benchmark results
+ =================
+
+ System
+ ------
+ cpu: AMD EPYC 7742 64-Core Processor
+ n_cpu: 128
+ product: NVIDIA A100-SXM4-80GB
+ n_gpu: 8
+ memory: 81920.0
+
+ Breakdown
+ ---------
+ bench | fail | n | ngpu | perf | sem% | std% | peak_memory | score | weight
+ brax | 0 | 1 | 8 | 730035.71 | 0.1% | 0.4% | 2670 | 730035.71 | 1.00
+ diffusion-gpus | 0 | 1 | 8 | 117.67 | 1.5% | 11.7% | 59944 | 117.67 | 1.00
+ diffusion-single | 0 | 8 | 1 | 25.02 | 0.8% | 17.9% | 53994 | 202.10 | 1.00
+ dimenet | 0 | 8 | 1 | 366.85 | 0.7% | 16.2% | 2302 | 2973.32 | 1.00
+ dinov2-giant-gpus | 0 | 1 | 8 | 445.68 | 0.4% | 3.0% | 69614 | 445.68 | 1.00
+ dinov2-giant-single | 0 | 8 | 1 | 53.54 | 0.4% | 9.5% | 74646 | 432.65 | 1.00
+ dqn | 0 | 8 | 1 | 23089954554.91 | 1.1% | 89.9% | 62106 | 184480810548.20 | 1.00
+ bf16 | 0 | 8 | 1 | 293.43 | 0.2% | 6.3% | 1788 | 2361.16 | 0.00
+ fp16 | 0 | 8 | 1 | 289.26 | 0.1% | 3.6% | 1788 | 2321.65 | 0.00
+ fp32 | 0 | 8 | 1 | 19.14 | 0.0% | 0.7% | 2166 | 153.21 | 0.00
+ tf32 | 0 | 8 | 1 | 146.63 | 0.1% | 3.6% | 2166 | 1177.04 | 0.00
+ bert-fp16 | 0 | 8 | 1 | 263.73 | 1.1% | 16.7% | nan | 2165.37 | 0.00
+ bert-fp32 | 0 | 8 | 1 | 44.84 | 0.6% | 9.6% | 21170 | 364.52 | 0.00
+ bert-tf32 | 0 | 8 | 1 | 141.95 | 0.9% | 14.1% | 1764 | 1162.94 | 0.00
+ bert-tf32-fp16 | 0 | 8 | 1 | 265.04 | 1.0% | 15.6% | nan | 2175.59 | 3.00
+ reformer | 0 | 8 | 1 | 62.29 | 0.3% | 6.0% | 25404 | 501.89 | 1.00
+ t5 | 0 | 8 | 1 | 51.40 | 0.5% | 9.9% | 34390 | 416.14 | 2.00
+ whisper | 0 | 8 | 1 | 481.95 | 1.0% | 21.4% | 8520 | 3897.53 | 1.00
+ lightning | 0 | 8 | 1 | 680.22 | 1.0% | 22.7% | 27360 | 5506.90 | 1.00
+ lightning-gpus | 0 | 1 | 8 | 3504.74 | 7.9% | 62.9% | 28184 | 3504.74 | 1.00
+ llava-single | 1 | 8 | 1 | 2.28 | 0.4% | 9.6% | 72556 | 14.12 | 1.00
+ llama | 0 | 8 | 1 | 484.86 | 4.4% | 80.0% | 27820 | 3680.86 | 1.00
+ llm-full-mp-gpus | 0 | 1 | 8 | 193.92 | 3.1% | 16.2% | 48470 | 193.92 | 1.00
+ llm-lora-ddp-gpus | 0 | 1 | 8 | 16738.58 | 0.4% | 2.0% | 36988 | 16738.58 | 1.00
+ llm-lora-mp-gpus | 0 | 1 | 8 | 1980.63 | 2.2% | 11.8% | 55972 | 1980.63 | 1.00
+ llm-lora-single | 0 | 8 | 1 | 2724.95 | 0.2% | 3.0% | 49926 | 21861.99 | 1.00
+ ppo | 0 | 8 | 1 | 3114264.32 | 1.6% | 57.2% | 62206 | 24915954.98 | 1.00
+ recursiongfn | 0 | 8 | 1 | 7080.67 | 1.2% | 27.1% | 10292 | 57038.34 | 1.00
+ rlhf-gpus | 0 | 1 | 8 | 6314.94 | 2.1% | 11.2% | 21730 | 6314.94 | 1.00
+ rlhf-single | 0 | 8 | 1 | 1143.72 | 0.4% | 8.4% | 19566 | 9174.52 | 1.00
+ focalnet | 0 | 8 | 1 | 375.07 | 0.7% | 14.9% | 23536 | 3038.83 | 2.00
+ torchatari | 0 | 8 | 1 | 5848.88 | 0.6% | 12.7% | 3834 | 46613.34 | 1.00
+ convnext_large-fp16 | 0 | 8 | 1 | 330.93 | 1.5% | 22.9% | 27376 | 2711.46 | 0.00
+ convnext_large-fp32 | 0 | 8 | 1 | 59.49 | 0.6% | 9.8% | 55950 | 483.84 | 0.00
+ convnext_large-tf32 | 0 | 8 | 1 | 155.41 | 0.9% | 14.3% | 49650 | 1273.31 | 0.00
+ convnext_large-tf32-fp16 | 0 | 8 | 1 | 322.28 | 1.6% | 24.5% | 27376 | 2637.88 | 3.00
+ regnet_y_128gf | 0 | 8 | 1 | 119.46 | 0.5% | 10.0% | 29762 | 966.96 | 2.00
+ resnet152-ddp-gpus | 0 | 1 | 8 | 3843.06 | 5.2% | 39.3% | 27980 | 3843.06 | 0.00
+ resnet50 | 0 | 8 | 1 | 932.95 | 2.4% | 52.2% | 14848 | 7524.25 | 1.00
+ resnet50-noio | 0 | 8 | 1 | 1163.88 | 0.3% | 6.7% | 27480 | 9385.35 | 0.00
+ vjepa-gpus | 0 | 1 | 8 | 130.13 | 5.9% | 46.8% | 64244 | 130.13 | 1.00
+ vjepa-single | 0 | 8 | 1 | 21.29 | 1.0% | 22.4% | 58552 | 172.11 | 1.00
+
+ Scores
+ ------
+ Failure rate: 0.38% (PASS)
+ Score: 4175.57
+
+ Errors
+ ------
+ 1 errors, details in HTML report.

From ab6b4129e3b957e51de5dbf6647b2f8d10cfb5d7 Mon Sep 17 00:00:00 2001
From: Setepenre
Date: Thu, 21 Nov 2024 20:59:32 -0500
Subject: [PATCH 17/20] Add missing tags to tests config (#312)

* Add missing tags to tests config
* Add revision
* Add regression files
* update test_memory_tracking

---------

Co-authored-by: Pierre Delaunay
---
 benchmarks/llava/main.py | 6 +-
 benchmarks/llava/prepare.py | 6 +-
 milabench/sizer.py | 9 +-
 milabench/system.py | 5 +
 tests/config/argerror.yaml | 2 +
 tests/config/benchio.yaml | 4 +-
 tests/config/benchio_bad.yaml | 3 +-
 tests/config/scaling.yaml | 2 +
 .../test_command_reg_one_node.txt | 619 ++++++++++++++++
 .../test_command_reg_two_nodes.txt | 676 ++++++++++++++++++
 tests/test_scaler.py | 41 +-
 tests/test_summary/test_compare.txt | 4 +-
 tests/test_summary/test_report.txt | 2 +-
 .../test_report_folder_does_average.txt | 2 +-
 tests/test_validation.py | 71 +-
 15 files changed, 1388 insertions(+), 64 deletions(-)
 create mode 100644 tests/test_command_reg/test_command_reg_one_node.txt
 create mode 100644 tests/test_command_reg/test_command_reg_two_nodes.txt

diff --git a/benchmarks/llava/main.py b/benchmarks/llava/main.py
index 233ae2eb..6c49b04a 100755
--- a/benchmarks/llava/main.py
+++ b/benchmarks/llava/main.py
@@ -62,8 +62,12 @@ def main():
         "llava-hf/llava-1.5-7b-hf",
         torch_dtype=torch.bfloat16,
         device_map=compat.device_type,
+        revision="a272c74b2481d8aff3aa6fc2c4bf891fe57334fb"
+    )
+    processor = AutoProcessor.from_pretrained(
+        "llava-hf/llava-1.5-7b-hf",
+        revision="a272c74b2481d8aff3aa6fc2c4bf891fe57334fb"
     )
-    processor = AutoProcessor.from_pretrained("llava-hf/llava-1.5-7b-hf")

     # Load dataset and create DataLoader
     dataset = load_dataset("HuggingFaceM4/the_cauldron", "aokvqa")["train"]
diff --git a/benchmarks/llava/prepare.py b/benchmarks/llava/prepare.py
index afa480b8..5e8b018f 100755
--- a/benchmarks/llava/prepare.py
+++ b/benchmarks/llava/prepare.py
@@ -11,8 +11,12 @@ def main():
         "llava-hf/llava-1.5-7b-hf",
         torch_dtype=torch.float32, # Change to float32
         device_map="auto",
+        revision="a272c74b2481d8aff3aa6fc2c4bf891fe57334fb"
+    )
+    _ = AutoProcessor.from_pretrained(
+        "llava-hf/llava-1.5-7b-hf",
+        revision="a272c74b2481d8aff3aa6fc2c4bf891fe57334fb"
     )
-    _ = AutoProcessor.from_pretrained("llava-hf/llava-1.5-7b-hf")

     # Load dataset and create DataLoader
     _ = load_dataset("HuggingFaceM4/the_cauldron", "aokvqa")["train"]
diff --git a/milabench/sizer.py b/milabench/sizer.py
index 00d6d2b6..4bd62bc7 100644
--- a/milabench/sizer.py
+++ b/milabench/sizer.py
@@ -53,9 +53,10 @@ def to_octet(value: str) -> float:
 class Sizer:
     """Automatically scale the batch size to match GPU spec"""

-    def __init__(self, scaling_config=None):
+    def __init__(self, sizer=None, scaling_config=option("sizer.config", etype=str)):
         self.path = scaling_config
-
+        self.sizer_override = sizer
+
         if scaling_config is None:
             scaling_config = default_scaling_config

@@ -64,6 +65,8 @@ def __init__(self, scaling_config=None):

     @property
     def options(self):
+        if self.sizer_override:
+            return self.sizer_override
         return SizerOptions()

     def benchscaling(self, benchmark):
@@ -252,7 +255,7 @@ class MemoryUsageExtractor(ValidationLayer):
     def __init__(self):
         self.filepath = option("sizer.save", str, None)

-        sizer = batch_sizer()
+        sizer = Sizer()
         self.memory = deepcopy(sizer.scaling_config)
         self.scaling = None
         self.benchname = None
diff --git a/milabench/system.py b/milabench/system.py
index bd8298c6..691d06bd 100644
--- a/milabench/system.py
+++ b/milabench/system.py
@@ -121,6 +121,11 @@ def apply_system(config: dict):
     system = system_global.get()
     old = deepcopy(system)

+    if system is None:
+        system = dict()
+        system_global.set(system)
+        system = system_global.get()
+
     for k, v in config.items():
         frags = k.split(".")

diff --git a/tests/config/argerror.yaml b/tests/config/argerror.yaml
index 49ad733c..59041b72 100644
--- a/tests/config/argerror.yaml
+++ b/tests/config/argerror.yaml
@@ -9,3 +9,5 @@ benchio:
     n: 1
   argv:
     --start: 0
+  tags:
+    - monogpu
\ No newline at end of file
diff --git a/tests/config/benchio.yaml b/tests/config/benchio.yaml
index f2c694e2..50c352ca 100644
--- a/tests/config/benchio.yaml
+++ b/tests/config/benchio.yaml
@@ -4,4 +4,6 @@ benchio:
   weight: 2
   plan:
     method: njobs
-    n: 2
\ No newline at end of file
+    n: 2
+  tags:
+    - monogpu
\ No newline at end of file
diff --git a/tests/config/benchio_bad.yaml b/tests/config/benchio_bad.yaml
index ac0b2f82..51b15ac4 100644
--- a/tests/config/benchio_bad.yaml
+++ b/tests/config/benchio_bad.yaml
@@ -8,4 +8,5 @@ benchio:
   argv:
     --bad: true

-
+  tags:
+    - monogpu
\ No newline at end of file
diff --git a/tests/config/scaling.yaml b/tests/config/scaling.yaml
index 664996f7..3f3b032e 100644
--- a/tests/config/scaling.yaml
+++ b/tests/config/scaling.yaml
@@ -5,3 +5,5 @@ benchio:
     64: 12Go
     128: 24Go
     256: 48Go
+  tags:
+    - monogpu
\ No newline at end of file
diff --git a/tests/test_command_reg/test_command_reg_one_node.txt b/tests/test_command_reg/test_command_reg_one_node.txt
new file mode 100644
index 00000000..af21f4cd
--- /dev/null
+++ b/tests/test_command_reg/test_command_reg_one_node.txt
@@ -0,0 +1,619 @@
+#!/bin/sh
+
+echo "---"
+echo "Virtual Env"
+echo "==========="
+export VIRTUAL_ENV=$BASE/venv/torch
+
+source $VIRTUAL_ENV/bin/activate
+echo "---"
+echo "Milabench"
+echo "========="
+export MILABENCH_DIR_BASE=$BASE
+export MILABENCH_DIR_VENV=$BASE/venv/torch
+export MILABENCH_DIR_DATA=$BASE/data
+export MILABENCH_DIR_RUNS=$BASE/runs
+export MILABENCH_DIR_EXTRA=$BASE/extra/llm
+export MILABENCH_DIR_CACHE=$BASE/cache
+export OMP_NUM_THREADS=0
+export MILABENCH_CONFIG='{"system": {"arch": "cuda", "sshkey": null, "nodes": [{"ip": "127.0.0.1", "main": true, "name": "0", "sshport": 22, "user": "username", "hostname": "127.0.0.1"}], "self": {"ip": "127.0.0.1", "main": true, "name": "0", "sshport": 22, "user": "username", "hostname": "127.0.0.1"}}, "dirs": {"base": "$BASE", "venv": "$BASE/venv/torch", "data": "$BASE/data", "runs": "$BASE/runs", "extra": "$BASE/extra/llm", "cache": "$BASE/cache"},
"group": "llm", "install_group": "torch", "install_variant": "cuda", "run_name": "dev", "enabled": true, "capabilities": {"nodes": 1}, "max_duration": 3600, "voir": {"options": {"stop": 30, "interval": "1s"}}, "validation": {"usage": {"gpu_load_threshold": 0.5, "gpu_mem_threshold": 0.5}}, "config_base": "$SRC/milabench/config", "config_file": "$SRC/milabench/config/standard.yaml", "definition": "$SRC/milabench/benchmarks/llama", "tags": ["inference", "llm", "monogpu", "nlp", "nobatch"], "plan": {"method": "per_gpu"}, "weight": 1.0, "name": "llama", "tag": ["llama"]}' + +echo "---" +echo "llama" +echo "=====" +time ( + CUDA_VISIBLE_DEVICES=0 python $SRC/milabench/benchmarks/llama/main.py --cache $BASE/cache & + CUDA_VISIBLE_DEVICES=1 python $SRC/milabench/benchmarks/llama/main.py --cache $BASE/cache & + CUDA_VISIBLE_DEVICES=2 python $SRC/milabench/benchmarks/llama/main.py --cache $BASE/cache & + CUDA_VISIBLE_DEVICES=3 python $SRC/milabench/benchmarks/llama/main.py --cache $BASE/cache & + CUDA_VISIBLE_DEVICES=4 python $SRC/milabench/benchmarks/llama/main.py --cache $BASE/cache & + CUDA_VISIBLE_DEVICES=5 python $SRC/milabench/benchmarks/llama/main.py --cache $BASE/cache & + CUDA_VISIBLE_DEVICES=6 python $SRC/milabench/benchmarks/llama/main.py --cache $BASE/cache & + CUDA_VISIBLE_DEVICES=7 python $SRC/milabench/benchmarks/llama/main.py --cache $BASE/cache & + wait +) + +echo "---" +echo "fp16" +echo "====" +time ( + CUDA_VISIBLE_DEVICES=0 $SRC/milabench/benchmarks/flops/main.py --number 30 --repeat 90 --m 8192 --n 8192 --dtype fp16 & + CUDA_VISIBLE_DEVICES=1 $SRC/milabench/benchmarks/flops/main.py --number 30 --repeat 90 --m 8192 --n 8192 --dtype fp16 & + CUDA_VISIBLE_DEVICES=2 $SRC/milabench/benchmarks/flops/main.py --number 30 --repeat 90 --m 8192 --n 8192 --dtype fp16 & + CUDA_VISIBLE_DEVICES=3 $SRC/milabench/benchmarks/flops/main.py --number 30 --repeat 90 --m 8192 --n 8192 --dtype fp16 & + CUDA_VISIBLE_DEVICES=4 $SRC/milabench/benchmarks/flops/main.py --number 30 --repeat 90 --m 8192 --n 8192 --dtype fp16 & + CUDA_VISIBLE_DEVICES=5 $SRC/milabench/benchmarks/flops/main.py --number 30 --repeat 90 --m 8192 --n 8192 --dtype fp16 & + CUDA_VISIBLE_DEVICES=6 $SRC/milabench/benchmarks/flops/main.py --number 30 --repeat 90 --m 8192 --n 8192 --dtype fp16 & + CUDA_VISIBLE_DEVICES=7 $SRC/milabench/benchmarks/flops/main.py --number 30 --repeat 90 --m 8192 --n 8192 --dtype fp16 & + wait +) + +echo "---" +echo "bf16" +echo "====" +time ( + CUDA_VISIBLE_DEVICES=0 $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype bf16 & + CUDA_VISIBLE_DEVICES=1 $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype bf16 & + CUDA_VISIBLE_DEVICES=2 $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype bf16 & + CUDA_VISIBLE_DEVICES=3 $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype bf16 & + CUDA_VISIBLE_DEVICES=4 $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype bf16 & + CUDA_VISIBLE_DEVICES=5 $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype bf16 & + CUDA_VISIBLE_DEVICES=6 $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype bf16 & + CUDA_VISIBLE_DEVICES=7 $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype bf16 & + wait +) + +echo "---" +echo "tf32" +echo "====" +time ( + CUDA_VISIBLE_DEVICES=0 
$SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype fp32 --tf32 & + CUDA_VISIBLE_DEVICES=1 $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype fp32 --tf32 & + CUDA_VISIBLE_DEVICES=2 $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype fp32 --tf32 & + CUDA_VISIBLE_DEVICES=3 $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype fp32 --tf32 & + CUDA_VISIBLE_DEVICES=4 $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype fp32 --tf32 & + CUDA_VISIBLE_DEVICES=5 $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype fp32 --tf32 & + CUDA_VISIBLE_DEVICES=6 $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype fp32 --tf32 & + CUDA_VISIBLE_DEVICES=7 $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype fp32 --tf32 & + wait +) + +echo "---" +echo "fp32" +echo "====" +time ( + CUDA_VISIBLE_DEVICES=0 $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype fp32 & + CUDA_VISIBLE_DEVICES=1 $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype fp32 & + CUDA_VISIBLE_DEVICES=2 $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype fp32 & + CUDA_VISIBLE_DEVICES=3 $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype fp32 & + CUDA_VISIBLE_DEVICES=4 $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype fp32 & + CUDA_VISIBLE_DEVICES=5 $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype fp32 & + CUDA_VISIBLE_DEVICES=6 $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype fp32 & + CUDA_VISIBLE_DEVICES=7 $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype fp32 & + wait +) + +echo "---" +echo "resnet50" +echo "========" +time ( + CUDA_VISIBLE_DEVICES=0 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model resnet50 --batch-size 256 & + CUDA_VISIBLE_DEVICES=1 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model resnet50 --batch-size 256 & + CUDA_VISIBLE_DEVICES=2 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model resnet50 --batch-size 256 & + CUDA_VISIBLE_DEVICES=3 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model resnet50 --batch-size 256 & + CUDA_VISIBLE_DEVICES=4 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model resnet50 --batch-size 256 & + CUDA_VISIBLE_DEVICES=5 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model resnet50 --batch-size 256 & + CUDA_VISIBLE_DEVICES=6 python 
$SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model resnet50 --batch-size 256 & + CUDA_VISIBLE_DEVICES=7 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model resnet50 --batch-size 256 & + wait +) + +echo "---" +echo "resnet50-noio" +echo "=============" +time ( + CUDA_VISIBLE_DEVICES=0 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader synthetic_fixed --data $BASE/data/FakeImageNet --model resnet50 --batch-size 256 & + CUDA_VISIBLE_DEVICES=1 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader synthetic_fixed --data $BASE/data/FakeImageNet --model resnet50 --batch-size 256 & + CUDA_VISIBLE_DEVICES=2 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader synthetic_fixed --data $BASE/data/FakeImageNet --model resnet50 --batch-size 256 & + CUDA_VISIBLE_DEVICES=3 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader synthetic_fixed --data $BASE/data/FakeImageNet --model resnet50 --batch-size 256 & + CUDA_VISIBLE_DEVICES=4 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader synthetic_fixed --data $BASE/data/FakeImageNet --model resnet50 --batch-size 256 & + CUDA_VISIBLE_DEVICES=5 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader synthetic_fixed --data $BASE/data/FakeImageNet --model resnet50 --batch-size 256 & + CUDA_VISIBLE_DEVICES=6 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader synthetic_fixed --data $BASE/data/FakeImageNet --model resnet50 --batch-size 256 & + CUDA_VISIBLE_DEVICES=7 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader synthetic_fixed --data $BASE/data/FakeImageNet --model resnet50 --batch-size 256 & + wait +) + +echo "---" +echo "resnet152-ddp-gpus" +echo "==================" +time ( + $SRC/milabench/milabench/scripts/activator $BASE/venv/torch $BASE/cache $SRC/milabench/benchmarks/torchvision_ddp/main.py --epochs 10 --num-workers 8 --loader torch --data $BASE/data/FakeImageNet --model resnet152 --batch-size 256 & + wait +) + +echo "---" +echo "convnext_large-fp32" +echo "===================" +time ( + CUDA_VISIBLE_DEVICES=0 python $SRC/milabench/benchmarks/torchvision/main.py --precision fp32 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 & + CUDA_VISIBLE_DEVICES=1 python $SRC/milabench/benchmarks/torchvision/main.py --precision fp32 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 & + CUDA_VISIBLE_DEVICES=2 python $SRC/milabench/benchmarks/torchvision/main.py --precision fp32 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large 
--batch-size 128 & + CUDA_VISIBLE_DEVICES=3 python $SRC/milabench/benchmarks/torchvision/main.py --precision fp32 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 & + CUDA_VISIBLE_DEVICES=4 python $SRC/milabench/benchmarks/torchvision/main.py --precision fp32 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 & + CUDA_VISIBLE_DEVICES=5 python $SRC/milabench/benchmarks/torchvision/main.py --precision fp32 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 & + CUDA_VISIBLE_DEVICES=6 python $SRC/milabench/benchmarks/torchvision/main.py --precision fp32 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 & + CUDA_VISIBLE_DEVICES=7 python $SRC/milabench/benchmarks/torchvision/main.py --precision fp32 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 & + wait +) + +echo "---" +echo "convnext_large-fp16" +echo "===================" +time ( + CUDA_VISIBLE_DEVICES=0 python $SRC/milabench/benchmarks/torchvision/main.py --precision fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 & + CUDA_VISIBLE_DEVICES=1 python $SRC/milabench/benchmarks/torchvision/main.py --precision fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 & + CUDA_VISIBLE_DEVICES=2 python $SRC/milabench/benchmarks/torchvision/main.py --precision fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 & + CUDA_VISIBLE_DEVICES=3 python $SRC/milabench/benchmarks/torchvision/main.py --precision fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 & + CUDA_VISIBLE_DEVICES=4 python $SRC/milabench/benchmarks/torchvision/main.py --precision fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 & + CUDA_VISIBLE_DEVICES=5 python $SRC/milabench/benchmarks/torchvision/main.py --precision fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 & + CUDA_VISIBLE_DEVICES=6 python $SRC/milabench/benchmarks/torchvision/main.py --precision fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 & + CUDA_VISIBLE_DEVICES=7 python $SRC/milabench/benchmarks/torchvision/main.py --precision fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 & + wait +) + +echo "---" +echo "convnext_large-tf32" +echo "===================" +time ( + CUDA_VISIBLE_DEVICES=0 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 & + CUDA_VISIBLE_DEVICES=1 python 
$SRC/milabench/benchmarks/torchvision/main.py --precision tf32 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 & + CUDA_VISIBLE_DEVICES=2 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 & + CUDA_VISIBLE_DEVICES=3 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 & + CUDA_VISIBLE_DEVICES=4 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 & + CUDA_VISIBLE_DEVICES=5 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 & + CUDA_VISIBLE_DEVICES=6 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 & + CUDA_VISIBLE_DEVICES=7 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 & + wait +) + +echo "---" +echo "convnext_large-tf32-fp16" +echo "========================" +time ( + CUDA_VISIBLE_DEVICES=0 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 & + CUDA_VISIBLE_DEVICES=1 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 & + CUDA_VISIBLE_DEVICES=2 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 & + CUDA_VISIBLE_DEVICES=3 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 & + CUDA_VISIBLE_DEVICES=4 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 & + CUDA_VISIBLE_DEVICES=5 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 & + CUDA_VISIBLE_DEVICES=6 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 & + CUDA_VISIBLE_DEVICES=7 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader 
pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 & + wait +) + +echo "---" +echo "regnet_y_128gf" +echo "==============" +time ( + CUDA_VISIBLE_DEVICES=0 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model regnet_y_128gf --batch-size 64 & + CUDA_VISIBLE_DEVICES=1 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model regnet_y_128gf --batch-size 64 & + CUDA_VISIBLE_DEVICES=2 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model regnet_y_128gf --batch-size 64 & + CUDA_VISIBLE_DEVICES=3 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model regnet_y_128gf --batch-size 64 & + CUDA_VISIBLE_DEVICES=4 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model regnet_y_128gf --batch-size 64 & + CUDA_VISIBLE_DEVICES=5 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model regnet_y_128gf --batch-size 64 & + CUDA_VISIBLE_DEVICES=6 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model regnet_y_128gf --batch-size 64 & + CUDA_VISIBLE_DEVICES=7 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model regnet_y_128gf --batch-size 64 & + wait +) + +echo "---" +echo "bert-fp32" +echo "=========" +time ( + CUDA_VISIBLE_DEVICES=0 python -m bench --precision fp32 --num-workers 8 --model Bert --batch-size 32 & + CUDA_VISIBLE_DEVICES=1 python -m bench --precision fp32 --num-workers 8 --model Bert --batch-size 32 & + CUDA_VISIBLE_DEVICES=2 python -m bench --precision fp32 --num-workers 8 --model Bert --batch-size 32 & + CUDA_VISIBLE_DEVICES=3 python -m bench --precision fp32 --num-workers 8 --model Bert --batch-size 32 & + CUDA_VISIBLE_DEVICES=4 python -m bench --precision fp32 --num-workers 8 --model Bert --batch-size 32 & + CUDA_VISIBLE_DEVICES=5 python -m bench --precision fp32 --num-workers 8 --model Bert --batch-size 32 & + CUDA_VISIBLE_DEVICES=6 python -m bench --precision fp32 --num-workers 8 --model Bert --batch-size 32 & + CUDA_VISIBLE_DEVICES=7 python -m bench --precision fp32 --num-workers 8 --model Bert --batch-size 32 & + wait +) + +echo "---" +echo "bert-fp16" +echo "=========" +time ( + CUDA_VISIBLE_DEVICES=0 python -m bench --precision fp16 --num-workers 8 --model Bert --batch-size 32 & + CUDA_VISIBLE_DEVICES=1 python -m bench --precision fp16 --num-workers 8 --model Bert --batch-size 32 & + CUDA_VISIBLE_DEVICES=2 python -m bench --precision fp16 --num-workers 8 --model Bert --batch-size 32 & + CUDA_VISIBLE_DEVICES=3 python -m bench --precision fp16 --num-workers 8 --model Bert --batch-size 32 & + CUDA_VISIBLE_DEVICES=4 python -m bench --precision fp16 --num-workers 8 --model Bert 
--batch-size 32 & + CUDA_VISIBLE_DEVICES=5 python -m bench --precision fp16 --num-workers 8 --model Bert --batch-size 32 & + CUDA_VISIBLE_DEVICES=6 python -m bench --precision fp16 --num-workers 8 --model Bert --batch-size 32 & + CUDA_VISIBLE_DEVICES=7 python -m bench --precision fp16 --num-workers 8 --model Bert --batch-size 32 & + wait +) + +echo "---" +echo "bert-tf32" +echo "=========" +time ( + CUDA_VISIBLE_DEVICES=0 python -m bench --precision tf32 --num-workers 8 --model Bert --batch-size 32 & + CUDA_VISIBLE_DEVICES=1 python -m bench --precision tf32 --num-workers 8 --model Bert --batch-size 32 & + CUDA_VISIBLE_DEVICES=2 python -m bench --precision tf32 --num-workers 8 --model Bert --batch-size 32 & + CUDA_VISIBLE_DEVICES=3 python -m bench --precision tf32 --num-workers 8 --model Bert --batch-size 32 & + CUDA_VISIBLE_DEVICES=4 python -m bench --precision tf32 --num-workers 8 --model Bert --batch-size 32 & + CUDA_VISIBLE_DEVICES=5 python -m bench --precision tf32 --num-workers 8 --model Bert --batch-size 32 & + CUDA_VISIBLE_DEVICES=6 python -m bench --precision tf32 --num-workers 8 --model Bert --batch-size 32 & + CUDA_VISIBLE_DEVICES=7 python -m bench --precision tf32 --num-workers 8 --model Bert --batch-size 32 & + wait +) + +echo "---" +echo "bert-tf32-fp16" +echo "==============" +time ( + CUDA_VISIBLE_DEVICES=0 python -m bench --precision tf32-fp16 --num-workers 8 --model Bert --batch-size 32 & + CUDA_VISIBLE_DEVICES=1 python -m bench --precision tf32-fp16 --num-workers 8 --model Bert --batch-size 32 & + CUDA_VISIBLE_DEVICES=2 python -m bench --precision tf32-fp16 --num-workers 8 --model Bert --batch-size 32 & + CUDA_VISIBLE_DEVICES=3 python -m bench --precision tf32-fp16 --num-workers 8 --model Bert --batch-size 32 & + CUDA_VISIBLE_DEVICES=4 python -m bench --precision tf32-fp16 --num-workers 8 --model Bert --batch-size 32 & + CUDA_VISIBLE_DEVICES=5 python -m bench --precision tf32-fp16 --num-workers 8 --model Bert --batch-size 32 & + CUDA_VISIBLE_DEVICES=6 python -m bench --precision tf32-fp16 --num-workers 8 --model Bert --batch-size 32 & + CUDA_VISIBLE_DEVICES=7 python -m bench --precision tf32-fp16 --num-workers 8 --model Bert --batch-size 32 & + wait +) + +echo "---" +echo "t5" +echo "==" +time ( + CUDA_VISIBLE_DEVICES=0 python -m bench --precision tf32-fp16 --num-workers 8 --model T5 --batch-size 16 & + CUDA_VISIBLE_DEVICES=1 python -m bench --precision tf32-fp16 --num-workers 8 --model T5 --batch-size 16 & + CUDA_VISIBLE_DEVICES=2 python -m bench --precision tf32-fp16 --num-workers 8 --model T5 --batch-size 16 & + CUDA_VISIBLE_DEVICES=3 python -m bench --precision tf32-fp16 --num-workers 8 --model T5 --batch-size 16 & + CUDA_VISIBLE_DEVICES=4 python -m bench --precision tf32-fp16 --num-workers 8 --model T5 --batch-size 16 & + CUDA_VISIBLE_DEVICES=5 python -m bench --precision tf32-fp16 --num-workers 8 --model T5 --batch-size 16 & + CUDA_VISIBLE_DEVICES=6 python -m bench --precision tf32-fp16 --num-workers 8 --model T5 --batch-size 16 & + CUDA_VISIBLE_DEVICES=7 python -m bench --precision tf32-fp16 --num-workers 8 --model T5 --batch-size 16 & + wait +) + +echo "---" +echo "reformer" +echo "========" +time ( + CUDA_VISIBLE_DEVICES=0 python -m bench --precision tf32-fp16 --num-workers 8 --model Reformer --batch-size 32 & + CUDA_VISIBLE_DEVICES=1 python -m bench --precision tf32-fp16 --num-workers 8 --model Reformer --batch-size 32 & + CUDA_VISIBLE_DEVICES=2 python -m bench --precision tf32-fp16 --num-workers 8 --model Reformer --batch-size 32 & + CUDA_VISIBLE_DEVICES=3 
python -m bench --precision tf32-fp16 --num-workers 8 --model Reformer --batch-size 32 & + CUDA_VISIBLE_DEVICES=4 python -m bench --precision tf32-fp16 --num-workers 8 --model Reformer --batch-size 32 & + CUDA_VISIBLE_DEVICES=5 python -m bench --precision tf32-fp16 --num-workers 8 --model Reformer --batch-size 32 & + CUDA_VISIBLE_DEVICES=6 python -m bench --precision tf32-fp16 --num-workers 8 --model Reformer --batch-size 32 & + CUDA_VISIBLE_DEVICES=7 python -m bench --precision tf32-fp16 --num-workers 8 --model Reformer --batch-size 32 & + wait +) + +echo "---" +echo "whisper" +echo "=======" +time ( + CUDA_VISIBLE_DEVICES=0 python -m bench --precision tf32-fp16 --num-workers 8 --model Whisper --batch-size 64 & + CUDA_VISIBLE_DEVICES=1 python -m bench --precision tf32-fp16 --num-workers 8 --model Whisper --batch-size 64 & + CUDA_VISIBLE_DEVICES=2 python -m bench --precision tf32-fp16 --num-workers 8 --model Whisper --batch-size 64 & + CUDA_VISIBLE_DEVICES=3 python -m bench --precision tf32-fp16 --num-workers 8 --model Whisper --batch-size 64 & + CUDA_VISIBLE_DEVICES=4 python -m bench --precision tf32-fp16 --num-workers 8 --model Whisper --batch-size 64 & + CUDA_VISIBLE_DEVICES=5 python -m bench --precision tf32-fp16 --num-workers 8 --model Whisper --batch-size 64 & + CUDA_VISIBLE_DEVICES=6 python -m bench --precision tf32-fp16 --num-workers 8 --model Whisper --batch-size 64 & + CUDA_VISIBLE_DEVICES=7 python -m bench --precision tf32-fp16 --num-workers 8 --model Whisper --batch-size 64 & + wait +) + +echo "---" +echo "focalnet" +echo "========" +time ( + CUDA_VISIBLE_DEVICES=0 python $SRC/milabench/benchmarks/timm/pytorch-image-models/train.py --amp --amp-dtype bfloat16 --device cuda --val-split --data-dir $BASE/data --dataset FakeImageNet --workers 8 --model focalnet_base_lrf --output $BASE/extra/timm/dev/focalnet.D0 --checkpoint-hist 1 & + CUDA_VISIBLE_DEVICES=1 python $SRC/milabench/benchmarks/timm/pytorch-image-models/train.py --amp --amp-dtype bfloat16 --device cuda --val-split --data-dir $BASE/data --dataset FakeImageNet --workers 8 --model focalnet_base_lrf --output $BASE/extra/timm/dev/focalnet.D1 --checkpoint-hist 1 & + CUDA_VISIBLE_DEVICES=2 python $SRC/milabench/benchmarks/timm/pytorch-image-models/train.py --amp --amp-dtype bfloat16 --device cuda --val-split --data-dir $BASE/data --dataset FakeImageNet --workers 8 --model focalnet_base_lrf --output $BASE/extra/timm/dev/focalnet.D2 --checkpoint-hist 1 & + CUDA_VISIBLE_DEVICES=3 python $SRC/milabench/benchmarks/timm/pytorch-image-models/train.py --amp --amp-dtype bfloat16 --device cuda --val-split --data-dir $BASE/data --dataset FakeImageNet --workers 8 --model focalnet_base_lrf --output $BASE/extra/timm/dev/focalnet.D3 --checkpoint-hist 1 & + CUDA_VISIBLE_DEVICES=4 python $SRC/milabench/benchmarks/timm/pytorch-image-models/train.py --amp --amp-dtype bfloat16 --device cuda --val-split --data-dir $BASE/data --dataset FakeImageNet --workers 8 --model focalnet_base_lrf --output $BASE/extra/timm/dev/focalnet.D4 --checkpoint-hist 1 & + CUDA_VISIBLE_DEVICES=5 python $SRC/milabench/benchmarks/timm/pytorch-image-models/train.py --amp --amp-dtype bfloat16 --device cuda --val-split --data-dir $BASE/data --dataset FakeImageNet --workers 8 --model focalnet_base_lrf --output $BASE/extra/timm/dev/focalnet.D5 --checkpoint-hist 1 & + CUDA_VISIBLE_DEVICES=6 python $SRC/milabench/benchmarks/timm/pytorch-image-models/train.py --amp --amp-dtype bfloat16 --device cuda --val-split --data-dir $BASE/data --dataset FakeImageNet --workers 8 --model 
focalnet_base_lrf --output $BASE/extra/timm/dev/focalnet.D6 --checkpoint-hist 1 & + CUDA_VISIBLE_DEVICES=7 python $SRC/milabench/benchmarks/timm/pytorch-image-models/train.py --amp --amp-dtype bfloat16 --device cuda --val-split --data-dir $BASE/data --dataset FakeImageNet --workers 8 --model focalnet_base_lrf --output $BASE/extra/timm/dev/focalnet.D7 --checkpoint-hist 1 & + wait +) + +echo "---" +echo "brax" +echo "====" +time ( + python $SRC/milabench/benchmarks/brax/main.py --episode-length 20 --batch-size 1024 --num-minibatches 32 --num-envs 8192 & + wait +) + +echo "---" +echo "diffusion-single" +echo "================" +time ( + CUDA_VISIBLE_DEVICES=0 $SRC/milabench/benchmarks/diffusion/main.py --num_epochs 5 --batch_size 32 --num_workers 8 --cache $BASE/cache & + CUDA_VISIBLE_DEVICES=1 $SRC/milabench/benchmarks/diffusion/main.py --num_epochs 5 --batch_size 32 --num_workers 8 --cache $BASE/cache & + CUDA_VISIBLE_DEVICES=2 $SRC/milabench/benchmarks/diffusion/main.py --num_epochs 5 --batch_size 32 --num_workers 8 --cache $BASE/cache & + CUDA_VISIBLE_DEVICES=3 $SRC/milabench/benchmarks/diffusion/main.py --num_epochs 5 --batch_size 32 --num_workers 8 --cache $BASE/cache & + CUDA_VISIBLE_DEVICES=4 $SRC/milabench/benchmarks/diffusion/main.py --num_epochs 5 --batch_size 32 --num_workers 8 --cache $BASE/cache & + CUDA_VISIBLE_DEVICES=5 $SRC/milabench/benchmarks/diffusion/main.py --num_epochs 5 --batch_size 32 --num_workers 8 --cache $BASE/cache & + CUDA_VISIBLE_DEVICES=6 $SRC/milabench/benchmarks/diffusion/main.py --num_epochs 5 --batch_size 32 --num_workers 8 --cache $BASE/cache & + CUDA_VISIBLE_DEVICES=7 $SRC/milabench/benchmarks/diffusion/main.py --num_epochs 5 --batch_size 32 --num_workers 8 --cache $BASE/cache & + wait +) + +echo "---" +echo "diffusion-gpus" +echo "==============" +time ( + $SRC/milabench/milabench/scripts/activator $BASE/venv/torch $BASE/cache accelerate launch --mixed_precision=bf16 --dynamo_backend=no --machine_rank=0 --num_machines=1 --multi_gpu --gradient_accumulation_steps=1 --num_cpu_threads_per_process=4 --main_process_ip=127.0.0.1 --main_process_port=29400 --num_processes=8 $SRC/milabench/benchmarks/diffusion/main.py --num_epochs 5 --batch_size 32 --num_workers 8 --cache $BASE/cache & + wait +) + +echo "---" +echo "diffusion-nodes" +echo "===============" +time ( + $SRC/milabench/milabench/scripts/activator $BASE/venv/torch $BASE/cache accelerate launch --mixed_precision=bf16 --dynamo_backend=no --machine_rank=0 --num_machines=1 --multi_gpu --gradient_accumulation_steps=1 --num_cpu_threads_per_process=4 --main_process_ip=127.0.0.1 --main_process_port=29400 --num_processes=8 $SRC/milabench/benchmarks/diffusion/main.py --num_epochs 5 --batch_size 32 --num_workers 8 --cache $BASE/cache & + wait +) + +echo "---" +echo "lightning" +echo "=========" +time ( + CUDA_VISIBLE_DEVICES=0 python $SRC/milabench/benchmarks/lightning/main.py --epochs 10 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model resnet152 --batch-size 256 & + CUDA_VISIBLE_DEVICES=1 python $SRC/milabench/benchmarks/lightning/main.py --epochs 10 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model resnet152 --batch-size 256 & + CUDA_VISIBLE_DEVICES=2 python $SRC/milabench/benchmarks/lightning/main.py --epochs 10 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model resnet152 --batch-size 256 & + CUDA_VISIBLE_DEVICES=3 python $SRC/milabench/benchmarks/lightning/main.py --epochs 10 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet 
--model resnet152 --batch-size 256 & + CUDA_VISIBLE_DEVICES=4 python $SRC/milabench/benchmarks/lightning/main.py --epochs 10 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model resnet152 --batch-size 256 & + CUDA_VISIBLE_DEVICES=5 python $SRC/milabench/benchmarks/lightning/main.py --epochs 10 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model resnet152 --batch-size 256 & + CUDA_VISIBLE_DEVICES=6 python $SRC/milabench/benchmarks/lightning/main.py --epochs 10 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model resnet152 --batch-size 256 & + CUDA_VISIBLE_DEVICES=7 python $SRC/milabench/benchmarks/lightning/main.py --epochs 10 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model resnet152 --batch-size 256 & + wait +) + +echo "---" +echo "lightning-gpus" +echo "==============" +time ( + $BASE/venv/torch/bin/benchrun --nnodes=1 --rdzv-backend=static --rdzv-endpoint=127.0.0.1:29400 --master-addr=127.0.0.1 --master-port=29400 --local-ranks-filter=0 --nproc-per-node=8 --no-python -- python $SRC/milabench/benchmarks/lightning/main.py --epochs 10 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model resnet152 --batch-size 256 & + wait +) + +echo "---" +echo "dinov2-giant-single" +echo "===================" +time ( + CUDA_VISIBLE_DEVICES=0 python $SRC/milabench/benchmarks/dinov2/main.py --output-dir $BASE/extra/dinov2-giant-single/output --no-resume --config-file $SRC/milabench/benchmarks/dinov2/src/dinov2/configs/train/vitg14.yaml train.dataset_path=ImageNet:split=TRAIN:root=$BASE/data/FakeImageNet:extra=$BASE/data/FakeImageNet train.batch_size_per_gpu=32 train.saveckp_freq=100 train.num_workers=10 & + CUDA_VISIBLE_DEVICES=1 python $SRC/milabench/benchmarks/dinov2/main.py --output-dir $BASE/extra/dinov2-giant-single/output --no-resume --config-file $SRC/milabench/benchmarks/dinov2/src/dinov2/configs/train/vitg14.yaml train.dataset_path=ImageNet:split=TRAIN:root=$BASE/data/FakeImageNet:extra=$BASE/data/FakeImageNet train.batch_size_per_gpu=32 train.saveckp_freq=100 train.num_workers=10 & + CUDA_VISIBLE_DEVICES=2 python $SRC/milabench/benchmarks/dinov2/main.py --output-dir $BASE/extra/dinov2-giant-single/output --no-resume --config-file $SRC/milabench/benchmarks/dinov2/src/dinov2/configs/train/vitg14.yaml train.dataset_path=ImageNet:split=TRAIN:root=$BASE/data/FakeImageNet:extra=$BASE/data/FakeImageNet train.batch_size_per_gpu=32 train.saveckp_freq=100 train.num_workers=10 & + CUDA_VISIBLE_DEVICES=3 python $SRC/milabench/benchmarks/dinov2/main.py --output-dir $BASE/extra/dinov2-giant-single/output --no-resume --config-file $SRC/milabench/benchmarks/dinov2/src/dinov2/configs/train/vitg14.yaml train.dataset_path=ImageNet:split=TRAIN:root=$BASE/data/FakeImageNet:extra=$BASE/data/FakeImageNet train.batch_size_per_gpu=32 train.saveckp_freq=100 train.num_workers=10 & + CUDA_VISIBLE_DEVICES=4 python $SRC/milabench/benchmarks/dinov2/main.py --output-dir $BASE/extra/dinov2-giant-single/output --no-resume --config-file $SRC/milabench/benchmarks/dinov2/src/dinov2/configs/train/vitg14.yaml train.dataset_path=ImageNet:split=TRAIN:root=$BASE/data/FakeImageNet:extra=$BASE/data/FakeImageNet train.batch_size_per_gpu=32 train.saveckp_freq=100 train.num_workers=10 & + CUDA_VISIBLE_DEVICES=5 python $SRC/milabench/benchmarks/dinov2/main.py --output-dir $BASE/extra/dinov2-giant-single/output --no-resume --config-file $SRC/milabench/benchmarks/dinov2/src/dinov2/configs/train/vitg14.yaml 
train.dataset_path=ImageNet:split=TRAIN:root=$BASE/data/FakeImageNet:extra=$BASE/data/FakeImageNet train.batch_size_per_gpu=32 train.saveckp_freq=100 train.num_workers=10 & + CUDA_VISIBLE_DEVICES=6 python $SRC/milabench/benchmarks/dinov2/main.py --output-dir $BASE/extra/dinov2-giant-single/output --no-resume --config-file $SRC/milabench/benchmarks/dinov2/src/dinov2/configs/train/vitg14.yaml train.dataset_path=ImageNet:split=TRAIN:root=$BASE/data/FakeImageNet:extra=$BASE/data/FakeImageNet train.batch_size_per_gpu=32 train.saveckp_freq=100 train.num_workers=10 & + CUDA_VISIBLE_DEVICES=7 python $SRC/milabench/benchmarks/dinov2/main.py --output-dir $BASE/extra/dinov2-giant-single/output --no-resume --config-file $SRC/milabench/benchmarks/dinov2/src/dinov2/configs/train/vitg14.yaml train.dataset_path=ImageNet:split=TRAIN:root=$BASE/data/FakeImageNet:extra=$BASE/data/FakeImageNet train.batch_size_per_gpu=32 train.saveckp_freq=100 train.num_workers=10 & + wait +) + +echo "---" +echo "dinov2-giant-gpus" +echo "=================" +time ( + $BASE/venv/torch/bin/benchrun --nnodes=1 --rdzv-backend=static --rdzv-endpoint=127.0.0.1:29400 --master-addr=127.0.0.1 --master-port=29400 --local-ranks-filter=0 --nproc-per-node=8 --no-python -- python $SRC/milabench/benchmarks/dinov2/main.py --output-dir $BASE/extra/dinov2-giant-gpus/output --no-resume --config-file $SRC/milabench/benchmarks/dinov2/src/dinov2/configs/train/vitg14.yaml train.dataset_path=ImageNet:split=TRAIN:root=$BASE/data/FakeImageNet:extra=$BASE/data/FakeImageNet train.batch_size_per_gpu=32 train.saveckp_freq=100 train.num_workers=10 & + wait +) + +echo "---" +echo "llm-lora-single" +echo "===============" +time ( + CUDA_VISIBLE_DEVICES=0 $SRC/milabench/benchmarks/llm/recipes/lora_finetune_single_device.py --config $SRC/milabench/benchmarks/llm/configs/llama3_8B_lora_single_device.yaml epochs=1 output_dir=$BASE/extra/llm-lora-single/output tokenizer.path=$BASE/data/llama3_8B/original/tokenizer.model checkpointer.checkpoint_dir=$BASE/data/llama3_8B/original checkpointer.output_dir=$BASE/data/llama3_8B/ metric_logger.log_dir=$BASE/extra/llm-lora-single/metrics repo_id="meta-llama/Meta-Llama-3.1-8B" batch_size=8 gradient_accumulation_steps=8 device=cuda & + CUDA_VISIBLE_DEVICES=1 $SRC/milabench/benchmarks/llm/recipes/lora_finetune_single_device.py --config $SRC/milabench/benchmarks/llm/configs/llama3_8B_lora_single_device.yaml epochs=1 output_dir=$BASE/extra/llm-lora-single/output tokenizer.path=$BASE/data/llama3_8B/original/tokenizer.model checkpointer.checkpoint_dir=$BASE/data/llama3_8B/original checkpointer.output_dir=$BASE/data/llama3_8B/ metric_logger.log_dir=$BASE/extra/llm-lora-single/metrics repo_id="meta-llama/Meta-Llama-3.1-8B" batch_size=8 gradient_accumulation_steps=8 device=cuda & + CUDA_VISIBLE_DEVICES=2 $SRC/milabench/benchmarks/llm/recipes/lora_finetune_single_device.py --config $SRC/milabench/benchmarks/llm/configs/llama3_8B_lora_single_device.yaml epochs=1 output_dir=$BASE/extra/llm-lora-single/output tokenizer.path=$BASE/data/llama3_8B/original/tokenizer.model checkpointer.checkpoint_dir=$BASE/data/llama3_8B/original checkpointer.output_dir=$BASE/data/llama3_8B/ metric_logger.log_dir=$BASE/extra/llm-lora-single/metrics repo_id="meta-llama/Meta-Llama-3.1-8B" batch_size=8 gradient_accumulation_steps=8 device=cuda & + CUDA_VISIBLE_DEVICES=3 $SRC/milabench/benchmarks/llm/recipes/lora_finetune_single_device.py --config $SRC/milabench/benchmarks/llm/configs/llama3_8B_lora_single_device.yaml epochs=1 
output_dir=$BASE/extra/llm-lora-single/output tokenizer.path=$BASE/data/llama3_8B/original/tokenizer.model checkpointer.checkpoint_dir=$BASE/data/llama3_8B/original checkpointer.output_dir=$BASE/data/llama3_8B/ metric_logger.log_dir=$BASE/extra/llm-lora-single/metrics repo_id="meta-llama/Meta-Llama-3.1-8B" batch_size=8 gradient_accumulation_steps=8 device=cuda & + CUDA_VISIBLE_DEVICES=4 $SRC/milabench/benchmarks/llm/recipes/lora_finetune_single_device.py --config $SRC/milabench/benchmarks/llm/configs/llama3_8B_lora_single_device.yaml epochs=1 output_dir=$BASE/extra/llm-lora-single/output tokenizer.path=$BASE/data/llama3_8B/original/tokenizer.model checkpointer.checkpoint_dir=$BASE/data/llama3_8B/original checkpointer.output_dir=$BASE/data/llama3_8B/ metric_logger.log_dir=$BASE/extra/llm-lora-single/metrics repo_id="meta-llama/Meta-Llama-3.1-8B" batch_size=8 gradient_accumulation_steps=8 device=cuda & + CUDA_VISIBLE_DEVICES=5 $SRC/milabench/benchmarks/llm/recipes/lora_finetune_single_device.py --config $SRC/milabench/benchmarks/llm/configs/llama3_8B_lora_single_device.yaml epochs=1 output_dir=$BASE/extra/llm-lora-single/output tokenizer.path=$BASE/data/llama3_8B/original/tokenizer.model checkpointer.checkpoint_dir=$BASE/data/llama3_8B/original checkpointer.output_dir=$BASE/data/llama3_8B/ metric_logger.log_dir=$BASE/extra/llm-lora-single/metrics repo_id="meta-llama/Meta-Llama-3.1-8B" batch_size=8 gradient_accumulation_steps=8 device=cuda & + CUDA_VISIBLE_DEVICES=6 $SRC/milabench/benchmarks/llm/recipes/lora_finetune_single_device.py --config $SRC/milabench/benchmarks/llm/configs/llama3_8B_lora_single_device.yaml epochs=1 output_dir=$BASE/extra/llm-lora-single/output tokenizer.path=$BASE/data/llama3_8B/original/tokenizer.model checkpointer.checkpoint_dir=$BASE/data/llama3_8B/original checkpointer.output_dir=$BASE/data/llama3_8B/ metric_logger.log_dir=$BASE/extra/llm-lora-single/metrics repo_id="meta-llama/Meta-Llama-3.1-8B" batch_size=8 gradient_accumulation_steps=8 device=cuda & + CUDA_VISIBLE_DEVICES=7 $SRC/milabench/benchmarks/llm/recipes/lora_finetune_single_device.py --config $SRC/milabench/benchmarks/llm/configs/llama3_8B_lora_single_device.yaml epochs=1 output_dir=$BASE/extra/llm-lora-single/output tokenizer.path=$BASE/data/llama3_8B/original/tokenizer.model checkpointer.checkpoint_dir=$BASE/data/llama3_8B/original checkpointer.output_dir=$BASE/data/llama3_8B/ metric_logger.log_dir=$BASE/extra/llm-lora-single/metrics repo_id="meta-llama/Meta-Llama-3.1-8B" batch_size=8 gradient_accumulation_steps=8 device=cuda & + wait +) + +echo "---" +echo "llm-lora-ddp-gpus" +echo "=================" +time ( + $BASE/venv/torch/bin/tune run --nnodes=1 --rdzv-backend=static --rdzv-endpoint=127.0.0.1:29400 --master-addr=127.0.0.1 --master-port=29400 --local-ranks-filter=0 --nproc-per-node=8 -- $SRC/milabench/benchmarks/llm/recipes/lora_finetune_distributed.py --config $SRC/milabench/benchmarks/llm/configs/llama3_8B_lora_single_device.yaml epochs=1 output_dir=$BASE/extra/llm-lora-ddp-gpus/output tokenizer.path=$BASE/data/llama3_8B/original/tokenizer.model checkpointer.checkpoint_dir=$BASE/data/llama3_8B/original checkpointer.output_dir=$BASE/data/llama3_8B/ metric_logger.log_dir=$BASE/extra/llm-lora-ddp-gpus/metrics repo_id="meta-llama/Meta-Llama-3.1-8B" batch_size=8 gradient_accumulation_steps=8 device=cuda & + wait +) + +echo "---" +echo "llm-lora-ddp-nodes" +echo "==================" +time ( + $BASE/venv/torch/bin/tune run --nnodes=1 --rdzv-backend=static --rdzv-endpoint=127.0.0.1:29400 
--master-addr=127.0.0.1 --master-port=29400 --local-ranks-filter=0 --nproc-per-node=8 -- $SRC/milabench/benchmarks/llm/recipes/lora_finetune_distributed.py --config $SRC/milabench/benchmarks/llm/configs/llama3_8B_lora_single_device.yaml epochs=1 output_dir=$BASE/extra/llm-lora-ddp-nodes/output tokenizer.path=$BASE/data/llama3_8B/original/tokenizer.model checkpointer.checkpoint_dir=$BASE/data/llama3_8B/original checkpointer.output_dir=$BASE/data/llama3_8B/ metric_logger.log_dir=$BASE/extra/llm-lora-ddp-nodes/metrics repo_id="meta-llama/Meta-Llama-3.1-8B" batch_size=8 gradient_accumulation_steps=8 device=cuda & + wait +) + +echo "---" +echo "llm-lora-mp-gpus" +echo "================" +time ( + $BASE/venv/torch/bin/tune run --nnodes=1 --rdzv-backend=static --rdzv-endpoint=127.0.0.1:29400 --master-addr=127.0.0.1 --master-port=29400 --local-ranks-filter=0 --nproc-per-node=8 -- $SRC/milabench/benchmarks/llm/recipes/lora_finetune_distributed.py --config $SRC/milabench/benchmarks/llm/configs/llama3_70B_lora.yaml epochs=1 output_dir=$BASE/extra/llm-lora-mp-gpus/output tokenizer.path=$BASE/data/llama3_70B/original/tokenizer.model checkpointer.checkpoint_dir=$BASE/data/llama3_70B checkpointer.output_dir=$BASE/data/llama3_70B/ safetensors=true metric_logger.log_dir=$BASE/extra/llm-lora-mp-gpus/metrics repo_id="meta-llama/Meta-Llama-3.1-70B" batch_size=8 gradient_accumulation_steps=1 device=cuda & + wait +) + +echo "---" +echo "llm-full-mp-gpus" +echo "================" +time ( + $BASE/venv/torch/bin/tune run --nnodes=1 --rdzv-backend=static --rdzv-endpoint=127.0.0.1:29400 --master-addr=127.0.0.1 --master-port=29400 --local-ranks-filter=0 --nproc-per-node=8 -- $SRC/milabench/benchmarks/llm/recipes/full_finetune_distributed.py --config $SRC/milabench/benchmarks/llm/configs/llama3_70B_full.yaml epochs=1 output_dir=$BASE/extra/llm-full-mp-gpus/output tokenizer.path=$BASE/data/llama3_70B/original/tokenizer.model checkpointer.checkpoint_dir=$BASE/data/llama3_70B checkpointer.output_dir=$BASE/data/llama3_70B/ metric_logger.log_dir=$BASE/extra/llm-full-mp-gpus/metrics repo_id="meta-llama/Meta-Llama-3.1-70B" safetensors=true batch_size=2 gradient_accumulation_steps=1 device=cuda & + wait +) + +echo "---" +echo "llm-full-mp-nodes" +echo "=================" +time ( + $BASE/venv/torch/bin/tune run --nnodes=1 --rdzv-backend=static --rdzv-endpoint=127.0.0.1:29400 --master-addr=127.0.0.1 --master-port=29400 --local-ranks-filter=0 --nproc-per-node=8 -- $SRC/milabench/benchmarks/llm/recipes/full_finetune_distributed.py --config $SRC/milabench/benchmarks/llm/configs/llama3_70B_full.yaml epochs=1 output_dir=$BASE/extra/llm-full-mp-nodes/output tokenizer.path=$BASE/data/llama3_70B/original/tokenizer.model checkpointer.checkpoint_dir=$BASE/data/llama3_70B checkpointer.output_dir=$BASE/data/llama3_70B/ metric_logger.log_dir=$BASE/extra/llm-full-mp-nodes/metrics repo_id="meta-llama/Meta-Llama-3.1-70B" safetensors=true batch_size=2 gradient_accumulation_steps=1 device=cuda & + wait +) + +echo "---" +echo "dqn" +echo "===" +time ( + CUDA_VISIBLE_DEVICES=0 python $SRC/milabench/benchmarks/purejaxrl/main.py dqn --num_envs 128 --buffer_size 131072 --buffer_batch_size 65536 --env_name CartPole-v1 --training_interval 10 & + CUDA_VISIBLE_DEVICES=1 python $SRC/milabench/benchmarks/purejaxrl/main.py dqn --num_envs 128 --buffer_size 131072 --buffer_batch_size 65536 --env_name CartPole-v1 --training_interval 10 & + CUDA_VISIBLE_DEVICES=2 python $SRC/milabench/benchmarks/purejaxrl/main.py dqn --num_envs 128 --buffer_size 131072 
--buffer_batch_size 65536 --env_name CartPole-v1 --training_interval 10 &
+ CUDA_VISIBLE_DEVICES=3 python $SRC/milabench/benchmarks/purejaxrl/main.py dqn --num_envs 128 --buffer_size 131072 --buffer_batch_size 65536 --env_name CartPole-v1 --training_interval 10 &
+ CUDA_VISIBLE_DEVICES=4 python $SRC/milabench/benchmarks/purejaxrl/main.py dqn --num_envs 128 --buffer_size 131072 --buffer_batch_size 65536 --env_name CartPole-v1 --training_interval 10 &
+ CUDA_VISIBLE_DEVICES=5 python $SRC/milabench/benchmarks/purejaxrl/main.py dqn --num_envs 128 --buffer_size 131072 --buffer_batch_size 65536 --env_name CartPole-v1 --training_interval 10 &
+ CUDA_VISIBLE_DEVICES=6 python $SRC/milabench/benchmarks/purejaxrl/main.py dqn --num_envs 128 --buffer_size 131072 --buffer_batch_size 65536 --env_name CartPole-v1 --training_interval 10 &
+ CUDA_VISIBLE_DEVICES=7 python $SRC/milabench/benchmarks/purejaxrl/main.py dqn --num_envs 128 --buffer_size 131072 --buffer_batch_size 65536 --env_name CartPole-v1 --training_interval 10 &
+ wait
+)
+
+echo "---"
+echo "ppo"
+echo "==="
+time (
+ CUDA_VISIBLE_DEVICES=0 python $SRC/milabench/benchmarks/purejaxrl/main.py ppo --num_envs 128 --num_steps 10 --num_minibatches 32 --update_epochs 4 --env_name hopper --total_timesteps 2000000 &
+ CUDA_VISIBLE_DEVICES=1 python $SRC/milabench/benchmarks/purejaxrl/main.py ppo --num_envs 128 --num_steps 10 --num_minibatches 32 --update_epochs 4 --env_name hopper --total_timesteps 2000000 &
+ CUDA_VISIBLE_DEVICES=2 python $SRC/milabench/benchmarks/purejaxrl/main.py ppo --num_envs 128 --num_steps 10 --num_minibatches 32 --update_epochs 4 --env_name hopper --total_timesteps 2000000 &
+ CUDA_VISIBLE_DEVICES=3 python $SRC/milabench/benchmarks/purejaxrl/main.py ppo --num_envs 128 --num_steps 10 --num_minibatches 32 --update_epochs 4 --env_name hopper --total_timesteps 2000000 &
+ CUDA_VISIBLE_DEVICES=4 python $SRC/milabench/benchmarks/purejaxrl/main.py ppo --num_envs 128 --num_steps 10 --num_minibatches 32 --update_epochs 4 --env_name hopper --total_timesteps 2000000 &
+ CUDA_VISIBLE_DEVICES=5 python $SRC/milabench/benchmarks/purejaxrl/main.py ppo --num_envs 128 --num_steps 10 --num_minibatches 32 --update_epochs 4 --env_name hopper --total_timesteps 2000000 &
+ CUDA_VISIBLE_DEVICES=6 python $SRC/milabench/benchmarks/purejaxrl/main.py ppo --num_envs 128 --num_steps 10 --num_minibatches 32 --update_epochs 4 --env_name hopper --total_timesteps 2000000 &
+ CUDA_VISIBLE_DEVICES=7 python $SRC/milabench/benchmarks/purejaxrl/main.py ppo --num_envs 128 --num_steps 10 --num_minibatches 32 --update_epochs 4 --env_name hopper --total_timesteps 2000000 &
+ wait
+)
+
+echo "---"
+echo "pna"
+echo "==="
+time (
+ CUDA_VISIBLE_DEVICES=0 python $SRC/milabench/benchmarks/geo_gnn/main.py --model PNA --num-samples 100000 --batch-size 4096 --num-workers 0 &
+ CUDA_VISIBLE_DEVICES=1 python $SRC/milabench/benchmarks/geo_gnn/main.py --model PNA --num-samples 100000 --batch-size 4096 --num-workers 0 &
+ CUDA_VISIBLE_DEVICES=2 python $SRC/milabench/benchmarks/geo_gnn/main.py --model PNA --num-samples 100000 --batch-size 4096 --num-workers 0 &
+ CUDA_VISIBLE_DEVICES=3 python $SRC/milabench/benchmarks/geo_gnn/main.py --model PNA --num-samples 100000 --batch-size 4096 --num-workers 0 &
+ CUDA_VISIBLE_DEVICES=4 python $SRC/milabench/benchmarks/geo_gnn/main.py --model PNA --num-samples 100000 --batch-size 4096 --num-workers 0 &
+ CUDA_VISIBLE_DEVICES=5 python $SRC/milabench/benchmarks/geo_gnn/main.py --model PNA --num-samples 100000 --batch-size 4096 --num-workers 0 &
+ CUDA_VISIBLE_DEVICES=6 python $SRC/milabench/benchmarks/geo_gnn/main.py --model PNA --num-samples 100000 --batch-size 4096 --num-workers 0 &
+ CUDA_VISIBLE_DEVICES=7 python $SRC/milabench/benchmarks/geo_gnn/main.py --model PNA --num-samples 100000 --batch-size 4096 --num-workers 0 &
+ wait
+)
+
+echo "---"
+echo "dimenet"
+echo "======="
+time (
+ CUDA_VISIBLE_DEVICES=0 python $SRC/milabench/benchmarks/geo_gnn/main.py --model DimeNet --num-samples 100000 --use3d --batch-size 16 --num-workers 0 &
+ CUDA_VISIBLE_DEVICES=1 python $SRC/milabench/benchmarks/geo_gnn/main.py --model DimeNet --num-samples 100000 --use3d --batch-size 16 --num-workers 0 &
+ CUDA_VISIBLE_DEVICES=2 python $SRC/milabench/benchmarks/geo_gnn/main.py --model DimeNet --num-samples 100000 --use3d --batch-size 16 --num-workers 0 &
+ CUDA_VISIBLE_DEVICES=3 python $SRC/milabench/benchmarks/geo_gnn/main.py --model DimeNet --num-samples 100000 --use3d --batch-size 16 --num-workers 0 &
+ CUDA_VISIBLE_DEVICES=4 python $SRC/milabench/benchmarks/geo_gnn/main.py --model DimeNet --num-samples 100000 --use3d --batch-size 16 --num-workers 0 &
+ CUDA_VISIBLE_DEVICES=5 python $SRC/milabench/benchmarks/geo_gnn/main.py --model DimeNet --num-samples 100000 --use3d --batch-size 16 --num-workers 0 &
+ CUDA_VISIBLE_DEVICES=6 python $SRC/milabench/benchmarks/geo_gnn/main.py --model DimeNet --num-samples 100000 --use3d --batch-size 16 --num-workers 0 &
+ CUDA_VISIBLE_DEVICES=7 python $SRC/milabench/benchmarks/geo_gnn/main.py --model DimeNet --num-samples 100000 --use3d --batch-size 16 --num-workers 0 &
+ wait
+)
+
+echo "---"
+echo "recursiongfn"
+echo "============"
+time (
+ CUDA_VISIBLE_DEVICES=0 python $SRC/milabench/benchmarks/recursiongfn/main.py --batch_size 128 --num_workers 8 --num_steps 100 --layer_width 128 --num_layers 4 &
+ CUDA_VISIBLE_DEVICES=1 python $SRC/milabench/benchmarks/recursiongfn/main.py --batch_size 128 --num_workers 8 --num_steps 100 --layer_width 128 --num_layers 4 &
+ CUDA_VISIBLE_DEVICES=2 python $SRC/milabench/benchmarks/recursiongfn/main.py --batch_size 128 --num_workers 8 --num_steps 100 --layer_width 128 --num_layers 4 &
+ CUDA_VISIBLE_DEVICES=3 python $SRC/milabench/benchmarks/recursiongfn/main.py --batch_size 128 --num_workers 8 --num_steps 100 --layer_width 128 --num_layers 4 &
+ CUDA_VISIBLE_DEVICES=4 python $SRC/milabench/benchmarks/recursiongfn/main.py --batch_size 128 --num_workers 8 --num_steps 100 --layer_width 128 --num_layers 4 &
+ CUDA_VISIBLE_DEVICES=5 python $SRC/milabench/benchmarks/recursiongfn/main.py --batch_size 128 --num_workers 8 --num_steps 100 --layer_width 128 --num_layers 4 &
+ CUDA_VISIBLE_DEVICES=6 python $SRC/milabench/benchmarks/recursiongfn/main.py --batch_size 128 --num_workers 8 --num_steps 100 --layer_width 128 --num_layers 4 &
+ CUDA_VISIBLE_DEVICES=7 python $SRC/milabench/benchmarks/recursiongfn/main.py --batch_size 128 --num_workers 8 --num_steps 100 --layer_width 128 --num_layers 4 &
+ wait
+)
+
+echo "---"
+echo "torchatari"
+echo "=========="
+time (
+ CUDA_VISIBLE_DEVICES=0 python $SRC/milabench/benchmarks/torchatari/main.py --num-minibatches 16 --update-epochs 4 --num-steps 128 --num-envs 128 --total-timesteps 1000000 --env-id Breakout-v5 &
+ CUDA_VISIBLE_DEVICES=1 python $SRC/milabench/benchmarks/torchatari/main.py --num-minibatches 16 --update-epochs 4 --num-steps 128 --num-envs 128 --total-timesteps 1000000 --env-id Breakout-v5 &
+ CUDA_VISIBLE_DEVICES=2 python $SRC/milabench/benchmarks/torchatari/main.py --num-minibatches 16 --update-epochs 4 --num-steps 128 --num-envs 128 --total-timesteps 1000000 --env-id Breakout-v5 &
+ CUDA_VISIBLE_DEVICES=3 python $SRC/milabench/benchmarks/torchatari/main.py --num-minibatches 16 --update-epochs 4 --num-steps 128 --num-envs 128 --total-timesteps 1000000 --env-id Breakout-v5 &
+ CUDA_VISIBLE_DEVICES=4 python $SRC/milabench/benchmarks/torchatari/main.py --num-minibatches 16 --update-epochs 4 --num-steps 128 --num-envs 128 --total-timesteps 1000000 --env-id Breakout-v5 &
+ CUDA_VISIBLE_DEVICES=5 python $SRC/milabench/benchmarks/torchatari/main.py --num-minibatches 16 --update-epochs 4 --num-steps 128 --num-envs 128 --total-timesteps 1000000 --env-id Breakout-v5 &
+ CUDA_VISIBLE_DEVICES=6 python $SRC/milabench/benchmarks/torchatari/main.py --num-minibatches 16 --update-epochs 4 --num-steps 128 --num-envs 128 --total-timesteps 1000000 --env-id Breakout-v5 &
+ CUDA_VISIBLE_DEVICES=7 python $SRC/milabench/benchmarks/torchatari/main.py --num-minibatches 16 --update-epochs 4 --num-steps 128 --num-envs 128 --total-timesteps 1000000 --env-id Breakout-v5 &
+ wait
+)
+
+echo "---"
+echo "llava-single"
+echo "============"
+time (
+ CUDA_VISIBLE_DEVICES=0 $SRC/milabench/benchmarks/llava/main.py --batch_size 1 --num_workers 4 --gradient_accumulation_steps 1 &
+ CUDA_VISIBLE_DEVICES=1 $SRC/milabench/benchmarks/llava/main.py --batch_size 1 --num_workers 4 --gradient_accumulation_steps 1 &
+ CUDA_VISIBLE_DEVICES=2 $SRC/milabench/benchmarks/llava/main.py --batch_size 1 --num_workers 4 --gradient_accumulation_steps 1 &
+ CUDA_VISIBLE_DEVICES=3 $SRC/milabench/benchmarks/llava/main.py --batch_size 1 --num_workers 4 --gradient_accumulation_steps 1 &
+ CUDA_VISIBLE_DEVICES=4 $SRC/milabench/benchmarks/llava/main.py --batch_size 1 --num_workers 4 --gradient_accumulation_steps 1 &
+ CUDA_VISIBLE_DEVICES=5 $SRC/milabench/benchmarks/llava/main.py --batch_size 1 --num_workers 4 --gradient_accumulation_steps 1 &
+ CUDA_VISIBLE_DEVICES=6 $SRC/milabench/benchmarks/llava/main.py --batch_size 1 --num_workers 4 --gradient_accumulation_steps 1 &
+ CUDA_VISIBLE_DEVICES=7 $SRC/milabench/benchmarks/llava/main.py --batch_size 1 --num_workers 4 --gradient_accumulation_steps 1 &
+ wait
+)
+
+echo "---"
+echo "rlhf-single"
+echo "==========="
+time (
+ CUDA_VISIBLE_DEVICES=0 $SRC/milabench/benchmarks/rlhf/main.py --output_dir $BASE/extra/rlhf-single/output --model_name_or_path EleutherAI/pythia-1b-deduped --per_device_train_batch_size 64 --logging_strategy no --log_level critical --bf16 &
+ CUDA_VISIBLE_DEVICES=1 $SRC/milabench/benchmarks/rlhf/main.py --output_dir $BASE/extra/rlhf-single/output --model_name_or_path EleutherAI/pythia-1b-deduped --per_device_train_batch_size 64 --logging_strategy no --log_level critical --bf16 &
+ CUDA_VISIBLE_DEVICES=2 $SRC/milabench/benchmarks/rlhf/main.py --output_dir $BASE/extra/rlhf-single/output --model_name_or_path EleutherAI/pythia-1b-deduped --per_device_train_batch_size 64 --logging_strategy no --log_level critical --bf16 &
+ CUDA_VISIBLE_DEVICES=3 $SRC/milabench/benchmarks/rlhf/main.py --output_dir $BASE/extra/rlhf-single/output --model_name_or_path EleutherAI/pythia-1b-deduped --per_device_train_batch_size 64 --logging_strategy no --log_level critical --bf16 &
+ CUDA_VISIBLE_DEVICES=4 $SRC/milabench/benchmarks/rlhf/main.py --output_dir $BASE/extra/rlhf-single/output --model_name_or_path EleutherAI/pythia-1b-deduped --per_device_train_batch_size 64 --logging_strategy no --log_level critical --bf16 &
+ CUDA_VISIBLE_DEVICES=5 $SRC/milabench/benchmarks/rlhf/main.py --output_dir $BASE/extra/rlhf-single/output --model_name_or_path EleutherAI/pythia-1b-deduped --per_device_train_batch_size 64 --logging_strategy no --log_level critical --bf16 &
+ CUDA_VISIBLE_DEVICES=6 $SRC/milabench/benchmarks/rlhf/main.py --output_dir $BASE/extra/rlhf-single/output --model_name_or_path EleutherAI/pythia-1b-deduped --per_device_train_batch_size 64 --logging_strategy no --log_level critical --bf16 &
+ CUDA_VISIBLE_DEVICES=7 $SRC/milabench/benchmarks/rlhf/main.py --output_dir $BASE/extra/rlhf-single/output --model_name_or_path EleutherAI/pythia-1b-deduped --per_device_train_batch_size 64 --logging_strategy no --log_level critical --bf16 &
+ wait
+)
+
+echo "---"
+echo "rlhf-gpus"
+echo "========="
+time (
+ $SRC/milabench/milabench/scripts/activator $BASE/venv/torch $BASE/cache accelerate launch --mixed_precision=bf16 --dynamo_backend=no --machine_rank=0 --num_machines=1 --multi_gpu --gradient_accumulation_steps=1 --num_cpu_threads_per_process=4 --main_process_ip=127.0.0.1 --main_process_port=29400 --num_processes=8 $SRC/milabench/benchmarks/rlhf/main.py --output_dir $BASE/extra/rlhf-gpus/output --model_name_or_path EleutherAI/pythia-1b-deduped --per_device_train_batch_size 64 --logging_strategy no --log_level critical --bf16 &
+ wait
+)
+
+echo "---"
+echo "vjepa-single"
+echo "============"
+time (
+ CUDA_VISIBLE_DEVICES=0 python $SRC/milabench/benchmarks/vjepa/main.py --batch_size 24 --num_workers 12 --dataset $BASE/data/FakeVideo/video_metainfo.csv --output $BASE/extra/vjepa-single &
+ CUDA_VISIBLE_DEVICES=1 python $SRC/milabench/benchmarks/vjepa/main.py --batch_size 24 --num_workers 12 --dataset $BASE/data/FakeVideo/video_metainfo.csv --output $BASE/extra/vjepa-single &
+ CUDA_VISIBLE_DEVICES=2 python $SRC/milabench/benchmarks/vjepa/main.py --batch_size 24 --num_workers 12 --dataset $BASE/data/FakeVideo/video_metainfo.csv --output $BASE/extra/vjepa-single &
+ CUDA_VISIBLE_DEVICES=3 python $SRC/milabench/benchmarks/vjepa/main.py --batch_size 24 --num_workers 12 --dataset $BASE/data/FakeVideo/video_metainfo.csv --output $BASE/extra/vjepa-single &
+ CUDA_VISIBLE_DEVICES=4 python $SRC/milabench/benchmarks/vjepa/main.py --batch_size 24 --num_workers 12 --dataset $BASE/data/FakeVideo/video_metainfo.csv --output $BASE/extra/vjepa-single &
+ CUDA_VISIBLE_DEVICES=5 python $SRC/milabench/benchmarks/vjepa/main.py --batch_size 24 --num_workers 12 --dataset $BASE/data/FakeVideo/video_metainfo.csv --output $BASE/extra/vjepa-single &
+ CUDA_VISIBLE_DEVICES=6 python $SRC/milabench/benchmarks/vjepa/main.py --batch_size 24 --num_workers 12 --dataset $BASE/data/FakeVideo/video_metainfo.csv --output $BASE/extra/vjepa-single &
+ CUDA_VISIBLE_DEVICES=7 python $SRC/milabench/benchmarks/vjepa/main.py --batch_size 24 --num_workers 12 --dataset $BASE/data/FakeVideo/video_metainfo.csv --output $BASE/extra/vjepa-single &
+ wait
+)
+
+echo "---"
+echo "vjepa-gpus"
+echo "=========="
+time (
+ $BASE/venv/torch/bin/benchrun --nnodes=1 --rdzv-backend=static --rdzv-endpoint=127.0.0.1:29400 --master-addr=127.0.0.1 --master-port=29400 --local-ranks-filter=0 --nproc-per-node=8 --no-python -- python $SRC/milabench/benchmarks/vjepa/main.py --batch_size 24 --num_workers 12 --dataset $BASE/data/FakeVideo/video_metainfo.csv --output $BASE/extra/vjepa-gpus &
+ wait
+)
+
diff --git a/tests/test_command_reg/test_command_reg_two_nodes.txt b/tests/test_command_reg/test_command_reg_two_nodes.txt
new file mode 100644
index 00000000..5e516e3f
--- /dev/null
+++ b/tests/test_command_reg/test_command_reg_two_nodes.txt
@@ -0,0 +1,676 @@
+#!/bin/sh
+
+echo "---"
+echo "Virtual Env" +echo "===========" +export VIRTUAL_ENV=$BASE/venv/torch + +source $VIRTUAL_ENV/bin/activate +echo "---" +echo "Milabench" +echo "=========" +export MILABENCH_DIR_BASE=$BASE +export MILABENCH_DIR_VENV=$BASE/venv/torch +export MILABENCH_DIR_DATA=$BASE/data +export MILABENCH_DIR_RUNS=$BASE/runs +export MILABENCH_DIR_EXTRA=$BASE/extra/llm +export MILABENCH_DIR_CACHE=$BASE/cache +export OMP_NUM_THREADS=0 +export MILABENCH_CONFIG='{"system": {"arch": "cuda", "sshkey": null, "nodes": [{"ip": "127.0.0.1", "main": true, "name": "0", "sshport": 22, "user": "username", "hostname": "127.0.0.1"}, {"ip": "192.168.0.11", "main": false, "name": "1", "sshport": 22, "user": "username", "hostname": "192.168.0.11"}], "self": {"ip": "127.0.0.1", "main": true, "name": "0", "sshport": 22, "user": "username", "hostname": "127.0.0.1"}}, "dirs": {"base": "$BASE", "venv": "$BASE/venv/torch", "data": "$BASE/data", "runs": "$BASE/runs", "extra": "$BASE/extra/llm", "cache": "$BASE/cache"}, "group": "llm", "install_group": "torch", "install_variant": "cuda", "run_name": "dev", "enabled": true, "capabilities": {"nodes": 1}, "max_duration": 3600, "voir": {"options": {"stop": 30, "interval": "1s"}}, "validation": {"usage": {"gpu_load_threshold": 0.5, "gpu_mem_threshold": 0.5}}, "config_base": "$SRC/milabench/config", "config_file": "$SRC/milabench/config/standard.yaml", "definition": "$SRC/milabench/benchmarks/llama", "tags": ["inference", "llm", "monogpu", "nlp", "nobatch"], "plan": {"method": "per_gpu"}, "weight": 1.0, "name": "llama", "tag": ["llama"]}' + +echo "---" +echo "llama" +echo "=====" +time ( + CUDA_VISIBLE_DEVICES=0 python $SRC/milabench/benchmarks/llama/main.py --cache $BASE/cache & + CUDA_VISIBLE_DEVICES=1 python $SRC/milabench/benchmarks/llama/main.py --cache $BASE/cache & + CUDA_VISIBLE_DEVICES=2 python $SRC/milabench/benchmarks/llama/main.py --cache $BASE/cache & + CUDA_VISIBLE_DEVICES=3 python $SRC/milabench/benchmarks/llama/main.py --cache $BASE/cache & + CUDA_VISIBLE_DEVICES=4 python $SRC/milabench/benchmarks/llama/main.py --cache $BASE/cache & + CUDA_VISIBLE_DEVICES=5 python $SRC/milabench/benchmarks/llama/main.py --cache $BASE/cache & + CUDA_VISIBLE_DEVICES=6 python $SRC/milabench/benchmarks/llama/main.py --cache $BASE/cache & + CUDA_VISIBLE_DEVICES=7 python $SRC/milabench/benchmarks/llama/main.py --cache $BASE/cache & + wait +) + +echo "---" +echo "fp16" +echo "====" +time ( + CUDA_VISIBLE_DEVICES=0 $SRC/milabench/benchmarks/flops/main.py --number 30 --repeat 90 --m 8192 --n 8192 --dtype fp16 & + CUDA_VISIBLE_DEVICES=1 $SRC/milabench/benchmarks/flops/main.py --number 30 --repeat 90 --m 8192 --n 8192 --dtype fp16 & + CUDA_VISIBLE_DEVICES=2 $SRC/milabench/benchmarks/flops/main.py --number 30 --repeat 90 --m 8192 --n 8192 --dtype fp16 & + CUDA_VISIBLE_DEVICES=3 $SRC/milabench/benchmarks/flops/main.py --number 30 --repeat 90 --m 8192 --n 8192 --dtype fp16 & + CUDA_VISIBLE_DEVICES=4 $SRC/milabench/benchmarks/flops/main.py --number 30 --repeat 90 --m 8192 --n 8192 --dtype fp16 & + CUDA_VISIBLE_DEVICES=5 $SRC/milabench/benchmarks/flops/main.py --number 30 --repeat 90 --m 8192 --n 8192 --dtype fp16 & + CUDA_VISIBLE_DEVICES=6 $SRC/milabench/benchmarks/flops/main.py --number 30 --repeat 90 --m 8192 --n 8192 --dtype fp16 & + CUDA_VISIBLE_DEVICES=7 $SRC/milabench/benchmarks/flops/main.py --number 30 --repeat 90 --m 8192 --n 8192 --dtype fp16 & + wait +) + +echo "---" +echo "bf16" +echo "====" +time ( + CUDA_VISIBLE_DEVICES=0 $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 
--m 8192 --n 8192 --dtype bf16 & + CUDA_VISIBLE_DEVICES=1 $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype bf16 & + CUDA_VISIBLE_DEVICES=2 $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype bf16 & + CUDA_VISIBLE_DEVICES=3 $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype bf16 & + CUDA_VISIBLE_DEVICES=4 $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype bf16 & + CUDA_VISIBLE_DEVICES=5 $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype bf16 & + CUDA_VISIBLE_DEVICES=6 $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype bf16 & + CUDA_VISIBLE_DEVICES=7 $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype bf16 & + wait +) + +echo "---" +echo "tf32" +echo "====" +time ( + CUDA_VISIBLE_DEVICES=0 $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype fp32 --tf32 & + CUDA_VISIBLE_DEVICES=1 $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype fp32 --tf32 & + CUDA_VISIBLE_DEVICES=2 $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype fp32 --tf32 & + CUDA_VISIBLE_DEVICES=3 $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype fp32 --tf32 & + CUDA_VISIBLE_DEVICES=4 $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype fp32 --tf32 & + CUDA_VISIBLE_DEVICES=5 $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype fp32 --tf32 & + CUDA_VISIBLE_DEVICES=6 $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype fp32 --tf32 & + CUDA_VISIBLE_DEVICES=7 $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype fp32 --tf32 & + wait +) + +echo "---" +echo "fp32" +echo "====" +time ( + CUDA_VISIBLE_DEVICES=0 $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype fp32 & + CUDA_VISIBLE_DEVICES=1 $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype fp32 & + CUDA_VISIBLE_DEVICES=2 $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype fp32 & + CUDA_VISIBLE_DEVICES=3 $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype fp32 & + CUDA_VISIBLE_DEVICES=4 $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype fp32 & + CUDA_VISIBLE_DEVICES=5 $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype fp32 & + CUDA_VISIBLE_DEVICES=6 $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype fp32 & + CUDA_VISIBLE_DEVICES=7 $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype fp32 & + wait +) + +echo "---" +echo "resnet50" +echo "========" +time ( + CUDA_VISIBLE_DEVICES=0 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model resnet50 --batch-size 256 & + CUDA_VISIBLE_DEVICES=1 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model resnet50 --batch-size 256 & + 
CUDA_VISIBLE_DEVICES=2 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model resnet50 --batch-size 256 & + CUDA_VISIBLE_DEVICES=3 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model resnet50 --batch-size 256 & + CUDA_VISIBLE_DEVICES=4 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model resnet50 --batch-size 256 & + CUDA_VISIBLE_DEVICES=5 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model resnet50 --batch-size 256 & + CUDA_VISIBLE_DEVICES=6 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model resnet50 --batch-size 256 & + CUDA_VISIBLE_DEVICES=7 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model resnet50 --batch-size 256 & + wait +) + +echo "---" +echo "resnet50-noio" +echo "=============" +time ( + CUDA_VISIBLE_DEVICES=0 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader synthetic_fixed --data $BASE/data/FakeImageNet --model resnet50 --batch-size 256 & + CUDA_VISIBLE_DEVICES=1 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader synthetic_fixed --data $BASE/data/FakeImageNet --model resnet50 --batch-size 256 & + CUDA_VISIBLE_DEVICES=2 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader synthetic_fixed --data $BASE/data/FakeImageNet --model resnet50 --batch-size 256 & + CUDA_VISIBLE_DEVICES=3 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader synthetic_fixed --data $BASE/data/FakeImageNet --model resnet50 --batch-size 256 & + CUDA_VISIBLE_DEVICES=4 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader synthetic_fixed --data $BASE/data/FakeImageNet --model resnet50 --batch-size 256 & + CUDA_VISIBLE_DEVICES=5 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader synthetic_fixed --data $BASE/data/FakeImageNet --model resnet50 --batch-size 256 & + CUDA_VISIBLE_DEVICES=6 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader synthetic_fixed --data $BASE/data/FakeImageNet --model resnet50 --batch-size 256 & + CUDA_VISIBLE_DEVICES=7 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader synthetic_fixed --data $BASE/data/FakeImageNet --model resnet50 --batch-size 256 & + wait +) + +echo "---" +echo "resnet152-ddp-gpus" +echo "==================" +time ( + $SRC/milabench/milabench/scripts/activator 
$BASE/venv/torch $BASE/cache $SRC/milabench/benchmarks/torchvision_ddp/main.py --epochs 10 --num-workers 8 --loader torch --data $BASE/data/FakeImageNet --model resnet152 --batch-size 256 & + wait +) + +echo "---" +echo "convnext_large-fp32" +echo "===================" +time ( + CUDA_VISIBLE_DEVICES=0 python $SRC/milabench/benchmarks/torchvision/main.py --precision fp32 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 & + CUDA_VISIBLE_DEVICES=1 python $SRC/milabench/benchmarks/torchvision/main.py --precision fp32 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 & + CUDA_VISIBLE_DEVICES=2 python $SRC/milabench/benchmarks/torchvision/main.py --precision fp32 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 & + CUDA_VISIBLE_DEVICES=3 python $SRC/milabench/benchmarks/torchvision/main.py --precision fp32 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 & + CUDA_VISIBLE_DEVICES=4 python $SRC/milabench/benchmarks/torchvision/main.py --precision fp32 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 & + CUDA_VISIBLE_DEVICES=5 python $SRC/milabench/benchmarks/torchvision/main.py --precision fp32 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 & + CUDA_VISIBLE_DEVICES=6 python $SRC/milabench/benchmarks/torchvision/main.py --precision fp32 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 & + CUDA_VISIBLE_DEVICES=7 python $SRC/milabench/benchmarks/torchvision/main.py --precision fp32 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 & + wait +) + +echo "---" +echo "convnext_large-fp16" +echo "===================" +time ( + CUDA_VISIBLE_DEVICES=0 python $SRC/milabench/benchmarks/torchvision/main.py --precision fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 & + CUDA_VISIBLE_DEVICES=1 python $SRC/milabench/benchmarks/torchvision/main.py --precision fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 & + CUDA_VISIBLE_DEVICES=2 python $SRC/milabench/benchmarks/torchvision/main.py --precision fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 & + CUDA_VISIBLE_DEVICES=3 python $SRC/milabench/benchmarks/torchvision/main.py --precision fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 & + CUDA_VISIBLE_DEVICES=4 python $SRC/milabench/benchmarks/torchvision/main.py --precision fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 & + CUDA_VISIBLE_DEVICES=5 python $SRC/milabench/benchmarks/torchvision/main.py --precision fp16 --lr 0.01 --no-stdout --epochs 50 
--num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 & + CUDA_VISIBLE_DEVICES=6 python $SRC/milabench/benchmarks/torchvision/main.py --precision fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 & + CUDA_VISIBLE_DEVICES=7 python $SRC/milabench/benchmarks/torchvision/main.py --precision fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 & + wait +) + +echo "---" +echo "convnext_large-tf32" +echo "===================" +time ( + CUDA_VISIBLE_DEVICES=0 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 & + CUDA_VISIBLE_DEVICES=1 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 & + CUDA_VISIBLE_DEVICES=2 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 & + CUDA_VISIBLE_DEVICES=3 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 & + CUDA_VISIBLE_DEVICES=4 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 & + CUDA_VISIBLE_DEVICES=5 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 & + CUDA_VISIBLE_DEVICES=6 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 & + CUDA_VISIBLE_DEVICES=7 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 & + wait +) + +echo "---" +echo "convnext_large-tf32-fp16" +echo "========================" +time ( + CUDA_VISIBLE_DEVICES=0 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 & + CUDA_VISIBLE_DEVICES=1 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 & + CUDA_VISIBLE_DEVICES=2 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 & + CUDA_VISIBLE_DEVICES=3 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data 
$BASE/data/FakeImageNet --model convnext_large --batch-size 128 & + CUDA_VISIBLE_DEVICES=4 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 & + CUDA_VISIBLE_DEVICES=5 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 & + CUDA_VISIBLE_DEVICES=6 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 & + CUDA_VISIBLE_DEVICES=7 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 & + wait +) + +echo "---" +echo "regnet_y_128gf" +echo "==============" +time ( + CUDA_VISIBLE_DEVICES=0 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model regnet_y_128gf --batch-size 64 & + CUDA_VISIBLE_DEVICES=1 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model regnet_y_128gf --batch-size 64 & + CUDA_VISIBLE_DEVICES=2 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model regnet_y_128gf --batch-size 64 & + CUDA_VISIBLE_DEVICES=3 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model regnet_y_128gf --batch-size 64 & + CUDA_VISIBLE_DEVICES=4 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model regnet_y_128gf --batch-size 64 & + CUDA_VISIBLE_DEVICES=5 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model regnet_y_128gf --batch-size 64 & + CUDA_VISIBLE_DEVICES=6 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model regnet_y_128gf --batch-size 64 & + CUDA_VISIBLE_DEVICES=7 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model regnet_y_128gf --batch-size 64 & + wait +) + +echo "---" +echo "bert-fp32" +echo "=========" +time ( + CUDA_VISIBLE_DEVICES=0 python -m bench --precision fp32 --num-workers 8 --model Bert --batch-size 32 & + CUDA_VISIBLE_DEVICES=1 python -m bench --precision fp32 --num-workers 8 --model Bert --batch-size 32 & + CUDA_VISIBLE_DEVICES=2 python -m bench --precision fp32 --num-workers 8 --model Bert --batch-size 32 & + CUDA_VISIBLE_DEVICES=3 python -m bench --precision fp32 --num-workers 8 --model Bert --batch-size 32 & + CUDA_VISIBLE_DEVICES=4 python -m bench 
--precision fp32 --num-workers 8 --model Bert --batch-size 32 & + CUDA_VISIBLE_DEVICES=5 python -m bench --precision fp32 --num-workers 8 --model Bert --batch-size 32 & + CUDA_VISIBLE_DEVICES=6 python -m bench --precision fp32 --num-workers 8 --model Bert --batch-size 32 & + CUDA_VISIBLE_DEVICES=7 python -m bench --precision fp32 --num-workers 8 --model Bert --batch-size 32 & + wait +) + +echo "---" +echo "bert-fp16" +echo "=========" +time ( + CUDA_VISIBLE_DEVICES=0 python -m bench --precision fp16 --num-workers 8 --model Bert --batch-size 32 & + CUDA_VISIBLE_DEVICES=1 python -m bench --precision fp16 --num-workers 8 --model Bert --batch-size 32 & + CUDA_VISIBLE_DEVICES=2 python -m bench --precision fp16 --num-workers 8 --model Bert --batch-size 32 & + CUDA_VISIBLE_DEVICES=3 python -m bench --precision fp16 --num-workers 8 --model Bert --batch-size 32 & + CUDA_VISIBLE_DEVICES=4 python -m bench --precision fp16 --num-workers 8 --model Bert --batch-size 32 & + CUDA_VISIBLE_DEVICES=5 python -m bench --precision fp16 --num-workers 8 --model Bert --batch-size 32 & + CUDA_VISIBLE_DEVICES=6 python -m bench --precision fp16 --num-workers 8 --model Bert --batch-size 32 & + CUDA_VISIBLE_DEVICES=7 python -m bench --precision fp16 --num-workers 8 --model Bert --batch-size 32 & + wait +) + +echo "---" +echo "bert-tf32" +echo "=========" +time ( + CUDA_VISIBLE_DEVICES=0 python -m bench --precision tf32 --num-workers 8 --model Bert --batch-size 32 & + CUDA_VISIBLE_DEVICES=1 python -m bench --precision tf32 --num-workers 8 --model Bert --batch-size 32 & + CUDA_VISIBLE_DEVICES=2 python -m bench --precision tf32 --num-workers 8 --model Bert --batch-size 32 & + CUDA_VISIBLE_DEVICES=3 python -m bench --precision tf32 --num-workers 8 --model Bert --batch-size 32 & + CUDA_VISIBLE_DEVICES=4 python -m bench --precision tf32 --num-workers 8 --model Bert --batch-size 32 & + CUDA_VISIBLE_DEVICES=5 python -m bench --precision tf32 --num-workers 8 --model Bert --batch-size 32 & + CUDA_VISIBLE_DEVICES=6 python -m bench --precision tf32 --num-workers 8 --model Bert --batch-size 32 & + CUDA_VISIBLE_DEVICES=7 python -m bench --precision tf32 --num-workers 8 --model Bert --batch-size 32 & + wait +) + +echo "---" +echo "bert-tf32-fp16" +echo "==============" +time ( + CUDA_VISIBLE_DEVICES=0 python -m bench --precision tf32-fp16 --num-workers 8 --model Bert --batch-size 32 & + CUDA_VISIBLE_DEVICES=1 python -m bench --precision tf32-fp16 --num-workers 8 --model Bert --batch-size 32 & + CUDA_VISIBLE_DEVICES=2 python -m bench --precision tf32-fp16 --num-workers 8 --model Bert --batch-size 32 & + CUDA_VISIBLE_DEVICES=3 python -m bench --precision tf32-fp16 --num-workers 8 --model Bert --batch-size 32 & + CUDA_VISIBLE_DEVICES=4 python -m bench --precision tf32-fp16 --num-workers 8 --model Bert --batch-size 32 & + CUDA_VISIBLE_DEVICES=5 python -m bench --precision tf32-fp16 --num-workers 8 --model Bert --batch-size 32 & + CUDA_VISIBLE_DEVICES=6 python -m bench --precision tf32-fp16 --num-workers 8 --model Bert --batch-size 32 & + CUDA_VISIBLE_DEVICES=7 python -m bench --precision tf32-fp16 --num-workers 8 --model Bert --batch-size 32 & + wait +) + +echo "---" +echo "t5" +echo "==" +time ( + CUDA_VISIBLE_DEVICES=0 python -m bench --precision tf32-fp16 --num-workers 8 --model T5 --batch-size 16 & + CUDA_VISIBLE_DEVICES=1 python -m bench --precision tf32-fp16 --num-workers 8 --model T5 --batch-size 16 & + CUDA_VISIBLE_DEVICES=2 python -m bench --precision tf32-fp16 --num-workers 8 --model T5 --batch-size 16 & + 
CUDA_VISIBLE_DEVICES=3 python -m bench --precision tf32-fp16 --num-workers 8 --model T5 --batch-size 16 & + CUDA_VISIBLE_DEVICES=4 python -m bench --precision tf32-fp16 --num-workers 8 --model T5 --batch-size 16 & + CUDA_VISIBLE_DEVICES=5 python -m bench --precision tf32-fp16 --num-workers 8 --model T5 --batch-size 16 & + CUDA_VISIBLE_DEVICES=6 python -m bench --precision tf32-fp16 --num-workers 8 --model T5 --batch-size 16 & + CUDA_VISIBLE_DEVICES=7 python -m bench --precision tf32-fp16 --num-workers 8 --model T5 --batch-size 16 & + wait +) + +echo "---" +echo "reformer" +echo "========" +time ( + CUDA_VISIBLE_DEVICES=0 python -m bench --precision tf32-fp16 --num-workers 8 --model Reformer --batch-size 32 & + CUDA_VISIBLE_DEVICES=1 python -m bench --precision tf32-fp16 --num-workers 8 --model Reformer --batch-size 32 & + CUDA_VISIBLE_DEVICES=2 python -m bench --precision tf32-fp16 --num-workers 8 --model Reformer --batch-size 32 & + CUDA_VISIBLE_DEVICES=3 python -m bench --precision tf32-fp16 --num-workers 8 --model Reformer --batch-size 32 & + CUDA_VISIBLE_DEVICES=4 python -m bench --precision tf32-fp16 --num-workers 8 --model Reformer --batch-size 32 & + CUDA_VISIBLE_DEVICES=5 python -m bench --precision tf32-fp16 --num-workers 8 --model Reformer --batch-size 32 & + CUDA_VISIBLE_DEVICES=6 python -m bench --precision tf32-fp16 --num-workers 8 --model Reformer --batch-size 32 & + CUDA_VISIBLE_DEVICES=7 python -m bench --precision tf32-fp16 --num-workers 8 --model Reformer --batch-size 32 & + wait +) + +echo "---" +echo "whisper" +echo "=======" +time ( + CUDA_VISIBLE_DEVICES=0 python -m bench --precision tf32-fp16 --num-workers 8 --model Whisper --batch-size 64 & + CUDA_VISIBLE_DEVICES=1 python -m bench --precision tf32-fp16 --num-workers 8 --model Whisper --batch-size 64 & + CUDA_VISIBLE_DEVICES=2 python -m bench --precision tf32-fp16 --num-workers 8 --model Whisper --batch-size 64 & + CUDA_VISIBLE_DEVICES=3 python -m bench --precision tf32-fp16 --num-workers 8 --model Whisper --batch-size 64 & + CUDA_VISIBLE_DEVICES=4 python -m bench --precision tf32-fp16 --num-workers 8 --model Whisper --batch-size 64 & + CUDA_VISIBLE_DEVICES=5 python -m bench --precision tf32-fp16 --num-workers 8 --model Whisper --batch-size 64 & + CUDA_VISIBLE_DEVICES=6 python -m bench --precision tf32-fp16 --num-workers 8 --model Whisper --batch-size 64 & + CUDA_VISIBLE_DEVICES=7 python -m bench --precision tf32-fp16 --num-workers 8 --model Whisper --batch-size 64 & + wait +) + +echo "---" +echo "focalnet" +echo "========" +time ( + CUDA_VISIBLE_DEVICES=0 python $SRC/milabench/benchmarks/timm/pytorch-image-models/train.py --amp --amp-dtype bfloat16 --device cuda --val-split --data-dir $BASE/data --dataset FakeImageNet --workers 8 --model focalnet_base_lrf --output $BASE/extra/timm/dev/focalnet.D0 --checkpoint-hist 1 & + CUDA_VISIBLE_DEVICES=1 python $SRC/milabench/benchmarks/timm/pytorch-image-models/train.py --amp --amp-dtype bfloat16 --device cuda --val-split --data-dir $BASE/data --dataset FakeImageNet --workers 8 --model focalnet_base_lrf --output $BASE/extra/timm/dev/focalnet.D1 --checkpoint-hist 1 & + CUDA_VISIBLE_DEVICES=2 python $SRC/milabench/benchmarks/timm/pytorch-image-models/train.py --amp --amp-dtype bfloat16 --device cuda --val-split --data-dir $BASE/data --dataset FakeImageNet --workers 8 --model focalnet_base_lrf --output $BASE/extra/timm/dev/focalnet.D2 --checkpoint-hist 1 & + CUDA_VISIBLE_DEVICES=3 python $SRC/milabench/benchmarks/timm/pytorch-image-models/train.py --amp --amp-dtype bfloat16 
--device cuda --val-split --data-dir $BASE/data --dataset FakeImageNet --workers 8 --model focalnet_base_lrf --output $BASE/extra/timm/dev/focalnet.D3 --checkpoint-hist 1 &
+ CUDA_VISIBLE_DEVICES=4 python $SRC/milabench/benchmarks/timm/pytorch-image-models/train.py --amp --amp-dtype bfloat16 --device cuda --val-split --data-dir $BASE/data --dataset FakeImageNet --workers 8 --model focalnet_base_lrf --output $BASE/extra/timm/dev/focalnet.D4 --checkpoint-hist 1 &
+ CUDA_VISIBLE_DEVICES=5 python $SRC/milabench/benchmarks/timm/pytorch-image-models/train.py --amp --amp-dtype bfloat16 --device cuda --val-split --data-dir $BASE/data --dataset FakeImageNet --workers 8 --model focalnet_base_lrf --output $BASE/extra/timm/dev/focalnet.D5 --checkpoint-hist 1 &
+ CUDA_VISIBLE_DEVICES=6 python $SRC/milabench/benchmarks/timm/pytorch-image-models/train.py --amp --amp-dtype bfloat16 --device cuda --val-split --data-dir $BASE/data --dataset FakeImageNet --workers 8 --model focalnet_base_lrf --output $BASE/extra/timm/dev/focalnet.D6 --checkpoint-hist 1 &
+ CUDA_VISIBLE_DEVICES=7 python $SRC/milabench/benchmarks/timm/pytorch-image-models/train.py --amp --amp-dtype bfloat16 --device cuda --val-split --data-dir $BASE/data --dataset FakeImageNet --workers 8 --model focalnet_base_lrf --output $BASE/extra/timm/dev/focalnet.D7 --checkpoint-hist 1 &
+ wait
+)
+
+echo "---"
+echo "brax"
+echo "===="
+time (
+ python $SRC/milabench/benchmarks/brax/main.py --episode-length 20 --batch-size 1024 --num-minibatches 32 --num-envs 8192 &
+ wait
+)
+
+echo "---"
+echo "diffusion-single"
+echo "================"
+time (
+ CUDA_VISIBLE_DEVICES=0 $SRC/milabench/benchmarks/diffusion/main.py --num_epochs 5 --batch_size 32 --num_workers 8 --cache $BASE/cache &
+ CUDA_VISIBLE_DEVICES=1 $SRC/milabench/benchmarks/diffusion/main.py --num_epochs 5 --batch_size 32 --num_workers 8 --cache $BASE/cache &
+ CUDA_VISIBLE_DEVICES=2 $SRC/milabench/benchmarks/diffusion/main.py --num_epochs 5 --batch_size 32 --num_workers 8 --cache $BASE/cache &
+ CUDA_VISIBLE_DEVICES=3 $SRC/milabench/benchmarks/diffusion/main.py --num_epochs 5 --batch_size 32 --num_workers 8 --cache $BASE/cache &
+ CUDA_VISIBLE_DEVICES=4 $SRC/milabench/benchmarks/diffusion/main.py --num_epochs 5 --batch_size 32 --num_workers 8 --cache $BASE/cache &
+ CUDA_VISIBLE_DEVICES=5 $SRC/milabench/benchmarks/diffusion/main.py --num_epochs 5 --batch_size 32 --num_workers 8 --cache $BASE/cache &
+ CUDA_VISIBLE_DEVICES=6 $SRC/milabench/benchmarks/diffusion/main.py --num_epochs 5 --batch_size 32 --num_workers 8 --cache $BASE/cache &
+ CUDA_VISIBLE_DEVICES=7 $SRC/milabench/benchmarks/diffusion/main.py --num_epochs 5 --batch_size 32 --num_workers 8 --cache $BASE/cache &
+ wait
+)
+
+echo "---"
+echo "diffusion-gpus"
+echo "=============="
+time (
+ $SRC/milabench/milabench/scripts/activator $BASE/venv/torch $BASE/cache accelerate launch --mixed_precision=bf16 --dynamo_backend=no --machine_rank=0 --num_machines=1 --multi_gpu --gradient_accumulation_steps=1 --num_cpu_threads_per_process=4 --main_process_ip=127.0.0.1 --main_process_port=29400 --num_processes=8 $SRC/milabench/benchmarks/diffusion/main.py --num_epochs 5 --batch_size 32 --num_workers 8 --cache $BASE/cache &
+ wait
+)
+
+echo "---"
+echo "diffusion-nodes"
+echo "==============="
+time (
+ $SRC/milabench/milabench/scripts/activator $BASE/venv/torch $BASE/cache accelerate launch --mixed_precision=bf16 --dynamo_backend=no --machine_rank=0 --num_machines=2 --multi_gpu --gradient_accumulation_steps=1 --num_cpu_threads_per_process=4 --main_process_ip=127.0.0.1 --main_process_port=29400 --num_processes=16 $SRC/milabench/benchmarks/diffusion/main.py --num_epochs 5 --batch_size 32 --num_workers 8 --cache $BASE/cache &
+ ssh -oCheckHostIP=no -oStrictHostKeyChecking=no -oPasswordAuthentication=no -p 22 username@192.168.0.11 $SRC/milabench/milabench/scripts/activator $BASE/venv/torch $BASE/cache accelerate launch --mixed_precision=bf16 --dynamo_backend=no --machine_rank=1 --num_machines=2 --multi_gpu --gradient_accumulation_steps=1 --num_cpu_threads_per_process=4 --main_process_ip=127.0.0.1 --main_process_port=29400 --num_processes=16 $SRC/milabench/benchmarks/diffusion/main.py --num_epochs 5 --batch_size 32 --num_workers 8 --cache $BASE/cache &
+ wait
+)
+
+echo "---"
+echo "lightning"
+echo "========="
+time (
+ CUDA_VISIBLE_DEVICES=0 python $SRC/milabench/benchmarks/lightning/main.py --epochs 10 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model resnet152 --batch-size 256 &
+ CUDA_VISIBLE_DEVICES=1 python $SRC/milabench/benchmarks/lightning/main.py --epochs 10 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model resnet152 --batch-size 256 &
+ CUDA_VISIBLE_DEVICES=2 python $SRC/milabench/benchmarks/lightning/main.py --epochs 10 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model resnet152 --batch-size 256 &
+ CUDA_VISIBLE_DEVICES=3 python $SRC/milabench/benchmarks/lightning/main.py --epochs 10 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model resnet152 --batch-size 256 &
+ CUDA_VISIBLE_DEVICES=4 python $SRC/milabench/benchmarks/lightning/main.py --epochs 10 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model resnet152 --batch-size 256 &
+ CUDA_VISIBLE_DEVICES=5 python $SRC/milabench/benchmarks/lightning/main.py --epochs 10 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model resnet152 --batch-size 256 &
+ CUDA_VISIBLE_DEVICES=6 python $SRC/milabench/benchmarks/lightning/main.py --epochs 10 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model resnet152 --batch-size 256 &
+ CUDA_VISIBLE_DEVICES=7 python $SRC/milabench/benchmarks/lightning/main.py --epochs 10 --num-workers 8 --loader pytorch --data
$BASE/data/FakeImageNet --model resnet152 --batch-size 256 &
+ wait
+)
+
+echo "---"
+echo "lightning-gpus"
+echo "=============="
+time (
+ $BASE/venv/torch/bin/benchrun --nnodes=1 --rdzv-backend=static --rdzv-endpoint=127.0.0.1:29400 --master-addr=127.0.0.1 --master-port=29400 --local-ranks-filter=0 --nproc-per-node=8 --no-python -- python $SRC/milabench/benchmarks/lightning/main.py --epochs 10 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model resnet152 --batch-size 256 &
+ wait
+)
+
+echo "---"
+echo "dinov2-giant-single"
+echo "==================="
+time (
+ CUDA_VISIBLE_DEVICES=0 python $SRC/milabench/benchmarks/dinov2/main.py --output-dir $BASE/extra/dinov2-giant-single/output --no-resume --config-file $SRC/milabench/benchmarks/dinov2/src/dinov2/configs/train/vitg14.yaml train.dataset_path=ImageNet:split=TRAIN:root=$BASE/data/FakeImageNet:extra=$BASE/data/FakeImageNet train.batch_size_per_gpu=32 train.saveckp_freq=100 train.num_workers=10 &
+ CUDA_VISIBLE_DEVICES=1 python $SRC/milabench/benchmarks/dinov2/main.py --output-dir $BASE/extra/dinov2-giant-single/output --no-resume --config-file $SRC/milabench/benchmarks/dinov2/src/dinov2/configs/train/vitg14.yaml train.dataset_path=ImageNet:split=TRAIN:root=$BASE/data/FakeImageNet:extra=$BASE/data/FakeImageNet train.batch_size_per_gpu=32 train.saveckp_freq=100 train.num_workers=10 &
+ CUDA_VISIBLE_DEVICES=2 python $SRC/milabench/benchmarks/dinov2/main.py --output-dir $BASE/extra/dinov2-giant-single/output --no-resume --config-file $SRC/milabench/benchmarks/dinov2/src/dinov2/configs/train/vitg14.yaml train.dataset_path=ImageNet:split=TRAIN:root=$BASE/data/FakeImageNet:extra=$BASE/data/FakeImageNet train.batch_size_per_gpu=32 train.saveckp_freq=100 train.num_workers=10 &
+ CUDA_VISIBLE_DEVICES=3 python $SRC/milabench/benchmarks/dinov2/main.py --output-dir $BASE/extra/dinov2-giant-single/output --no-resume --config-file $SRC/milabench/benchmarks/dinov2/src/dinov2/configs/train/vitg14.yaml train.dataset_path=ImageNet:split=TRAIN:root=$BASE/data/FakeImageNet:extra=$BASE/data/FakeImageNet train.batch_size_per_gpu=32 train.saveckp_freq=100 train.num_workers=10 &
+ CUDA_VISIBLE_DEVICES=4 python $SRC/milabench/benchmarks/dinov2/main.py --output-dir $BASE/extra/dinov2-giant-single/output --no-resume --config-file $SRC/milabench/benchmarks/dinov2/src/dinov2/configs/train/vitg14.yaml train.dataset_path=ImageNet:split=TRAIN:root=$BASE/data/FakeImageNet:extra=$BASE/data/FakeImageNet train.batch_size_per_gpu=32 train.saveckp_freq=100 train.num_workers=10 &
+ CUDA_VISIBLE_DEVICES=5 python $SRC/milabench/benchmarks/dinov2/main.py --output-dir $BASE/extra/dinov2-giant-single/output --no-resume --config-file $SRC/milabench/benchmarks/dinov2/src/dinov2/configs/train/vitg14.yaml train.dataset_path=ImageNet:split=TRAIN:root=$BASE/data/FakeImageNet:extra=$BASE/data/FakeImageNet train.batch_size_per_gpu=32 train.saveckp_freq=100 train.num_workers=10 &
+ CUDA_VISIBLE_DEVICES=6 python $SRC/milabench/benchmarks/dinov2/main.py --output-dir $BASE/extra/dinov2-giant-single/output --no-resume --config-file $SRC/milabench/benchmarks/dinov2/src/dinov2/configs/train/vitg14.yaml train.dataset_path=ImageNet:split=TRAIN:root=$BASE/data/FakeImageNet:extra=$BASE/data/FakeImageNet train.batch_size_per_gpu=32 train.saveckp_freq=100 train.num_workers=10 &
+ CUDA_VISIBLE_DEVICES=7 python $SRC/milabench/benchmarks/dinov2/main.py --output-dir $BASE/extra/dinov2-giant-single/output --no-resume --config-file $SRC/milabench/benchmarks/dinov2/src/dinov2/configs/train/vitg14.yaml train.dataset_path=ImageNet:split=TRAIN:root=$BASE/data/FakeImageNet:extra=$BASE/data/FakeImageNet train.batch_size_per_gpu=32 train.saveckp_freq=100 train.num_workers=10 &
+ wait
+)
+
+echo "---"
+echo "dinov2-giant-gpus"
+echo "================="
+time (
+ $BASE/venv/torch/bin/benchrun --nnodes=1 --rdzv-backend=static --rdzv-endpoint=127.0.0.1:29400 --master-addr=127.0.0.1 --master-port=29400 --local-ranks-filter=0 --nproc-per-node=8 --no-python -- python $SRC/milabench/benchmarks/dinov2/main.py --output-dir $BASE/extra/dinov2-giant-gpus/output --no-resume --config-file $SRC/milabench/benchmarks/dinov2/src/dinov2/configs/train/vitg14.yaml train.dataset_path=ImageNet:split=TRAIN:root=$BASE/data/FakeImageNet:extra=$BASE/data/FakeImageNet train.batch_size_per_gpu=32 train.saveckp_freq=100 train.num_workers=10 &
+ wait
+)
+
+echo "---"
+echo "llm-lora-single"
+echo "==============="
+time (
+ CUDA_VISIBLE_DEVICES=0 $SRC/milabench/benchmarks/llm/recipes/lora_finetune_single_device.py --config $SRC/milabench/benchmarks/llm/configs/llama3_8B_lora_single_device.yaml epochs=1 output_dir=$BASE/extra/llm-lora-single/output tokenizer.path=$BASE/data/llama3_8B/original/tokenizer.model checkpointer.checkpoint_dir=$BASE/data/llama3_8B/original checkpointer.output_dir=$BASE/data/llama3_8B/ metric_logger.log_dir=$BASE/extra/llm-lora-single/metrics repo_id="meta-llama/Meta-Llama-3.1-8B" batch_size=8 gradient_accumulation_steps=8 device=cuda &
+ CUDA_VISIBLE_DEVICES=1 $SRC/milabench/benchmarks/llm/recipes/lora_finetune_single_device.py --config $SRC/milabench/benchmarks/llm/configs/llama3_8B_lora_single_device.yaml epochs=1 output_dir=$BASE/extra/llm-lora-single/output tokenizer.path=$BASE/data/llama3_8B/original/tokenizer.model checkpointer.checkpoint_dir=$BASE/data/llama3_8B/original checkpointer.output_dir=$BASE/data/llama3_8B/ metric_logger.log_dir=$BASE/extra/llm-lora-single/metrics repo_id="meta-llama/Meta-Llama-3.1-8B" batch_size=8 gradient_accumulation_steps=8 device=cuda &
+ CUDA_VISIBLE_DEVICES=2 $SRC/milabench/benchmarks/llm/recipes/lora_finetune_single_device.py --config $SRC/milabench/benchmarks/llm/configs/llama3_8B_lora_single_device.yaml epochs=1 output_dir=$BASE/extra/llm-lora-single/output tokenizer.path=$BASE/data/llama3_8B/original/tokenizer.model checkpointer.checkpoint_dir=$BASE/data/llama3_8B/original checkpointer.output_dir=$BASE/data/llama3_8B/ metric_logger.log_dir=$BASE/extra/llm-lora-single/metrics repo_id="meta-llama/Meta-Llama-3.1-8B" batch_size=8 gradient_accumulation_steps=8 device=cuda &
+ CUDA_VISIBLE_DEVICES=3 $SRC/milabench/benchmarks/llm/recipes/lora_finetune_single_device.py --config $SRC/milabench/benchmarks/llm/configs/llama3_8B_lora_single_device.yaml epochs=1 output_dir=$BASE/extra/llm-lora-single/output tokenizer.path=$BASE/data/llama3_8B/original/tokenizer.model checkpointer.checkpoint_dir=$BASE/data/llama3_8B/original checkpointer.output_dir=$BASE/data/llama3_8B/ metric_logger.log_dir=$BASE/extra/llm-lora-single/metrics repo_id="meta-llama/Meta-Llama-3.1-8B" batch_size=8 gradient_accumulation_steps=8 device=cuda &
+ CUDA_VISIBLE_DEVICES=4 $SRC/milabench/benchmarks/llm/recipes/lora_finetune_single_device.py --config $SRC/milabench/benchmarks/llm/configs/llama3_8B_lora_single_device.yaml epochs=1 output_dir=$BASE/extra/llm-lora-single/output tokenizer.path=$BASE/data/llama3_8B/original/tokenizer.model checkpointer.checkpoint_dir=$BASE/data/llama3_8B/original
checkpointer.output_dir=$BASE/data/llama3_8B/ metric_logger.log_dir=$BASE/extra/llm-lora-single/metrics repo_id="meta-llama/Meta-Llama-3.1-8B" batch_size=8 gradient_accumulation_steps=8 device=cuda &
+ CUDA_VISIBLE_DEVICES=5 $SRC/milabench/benchmarks/llm/recipes/lora_finetune_single_device.py --config $SRC/milabench/benchmarks/llm/configs/llama3_8B_lora_single_device.yaml epochs=1 output_dir=$BASE/extra/llm-lora-single/output tokenizer.path=$BASE/data/llama3_8B/original/tokenizer.model checkpointer.checkpoint_dir=$BASE/data/llama3_8B/original checkpointer.output_dir=$BASE/data/llama3_8B/ metric_logger.log_dir=$BASE/extra/llm-lora-single/metrics repo_id="meta-llama/Meta-Llama-3.1-8B" batch_size=8 gradient_accumulation_steps=8 device=cuda &
+ CUDA_VISIBLE_DEVICES=6 $SRC/milabench/benchmarks/llm/recipes/lora_finetune_single_device.py --config $SRC/milabench/benchmarks/llm/configs/llama3_8B_lora_single_device.yaml epochs=1 output_dir=$BASE/extra/llm-lora-single/output tokenizer.path=$BASE/data/llama3_8B/original/tokenizer.model checkpointer.checkpoint_dir=$BASE/data/llama3_8B/original checkpointer.output_dir=$BASE/data/llama3_8B/ metric_logger.log_dir=$BASE/extra/llm-lora-single/metrics repo_id="meta-llama/Meta-Llama-3.1-8B" batch_size=8 gradient_accumulation_steps=8 device=cuda &
+ CUDA_VISIBLE_DEVICES=7 $SRC/milabench/benchmarks/llm/recipes/lora_finetune_single_device.py --config $SRC/milabench/benchmarks/llm/configs/llama3_8B_lora_single_device.yaml epochs=1 output_dir=$BASE/extra/llm-lora-single/output tokenizer.path=$BASE/data/llama3_8B/original/tokenizer.model checkpointer.checkpoint_dir=$BASE/data/llama3_8B/original checkpointer.output_dir=$BASE/data/llama3_8B/ metric_logger.log_dir=$BASE/extra/llm-lora-single/metrics repo_id="meta-llama/Meta-Llama-3.1-8B" batch_size=8 gradient_accumulation_steps=8 device=cuda &
+ wait
+)
+
+echo "---"
+echo "llm-lora-ddp-gpus"
+echo "================="
+time (
+ $BASE/venv/torch/bin/tune run --nnodes=1 --rdzv-backend=static --rdzv-endpoint=127.0.0.1:29400 --master-addr=127.0.0.1 --master-port=29400 --local-ranks-filter=0 --nproc-per-node=8 -- $SRC/milabench/benchmarks/llm/recipes/lora_finetune_distributed.py --config $SRC/milabench/benchmarks/llm/configs/llama3_8B_lora_single_device.yaml epochs=1 output_dir=$BASE/extra/llm-lora-ddp-gpus/output tokenizer.path=$BASE/data/llama3_8B/original/tokenizer.model checkpointer.checkpoint_dir=$BASE/data/llama3_8B/original checkpointer.output_dir=$BASE/data/llama3_8B/ metric_logger.log_dir=$BASE/extra/llm-lora-ddp-gpus/metrics repo_id="meta-llama/Meta-Llama-3.1-8B" batch_size=8 gradient_accumulation_steps=8 device=cuda &
+ wait
+)
+
+echo "---"
+echo "llm-lora-ddp-nodes"
+echo "=================="
+time (
+ $BASE/venv/torch/bin/tune run --nnodes=2 --rdzv-backend=static --rdzv-endpoint=127.0.0.1:29400 --master-addr=127.0.0.1 --master-port=29400 --local-ranks-filter=0 --node-rank=0 --local-addr=127.0.0.1 --rdzv-conf=rank=0 --nproc-per-node=8 -- $SRC/milabench/benchmarks/llm/recipes/lora_finetune_distributed.py --config $SRC/milabench/benchmarks/llm/configs/llama3_8B_lora_single_device.yaml epochs=1 output_dir=$BASE/extra/llm-lora-ddp-nodes/output tokenizer.path=$BASE/data/llama3_8B/original/tokenizer.model checkpointer.checkpoint_dir=$BASE/data/llama3_8B/original checkpointer.output_dir=$BASE/data/llama3_8B/ metric_logger.log_dir=$BASE/extra/llm-lora-ddp-nodes/metrics repo_id="meta-llama/Meta-Llama-3.1-8B" batch_size=8 gradient_accumulation_steps=8 device=cuda &
+ ssh -oCheckHostIP=no -oStrictHostKeyChecking=no -oPasswordAuthentication=no -p 22 username@192.168.0.11 $BASE/venv/torch/bin/tune run --nnodes=2 --rdzv-backend=static --rdzv-endpoint=127.0.0.1:29400 --master-addr=127.0.0.1 --master-port=29400 --local-ranks-filter=0 --node-rank=1 --local-addr=192.168.0.11 --rdzv-conf=rank=1 --nproc-per-node=8 -- $SRC/milabench/benchmarks/llm/recipes/lora_finetune_distributed.py --config $SRC/milabench/benchmarks/llm/configs/llama3_8B_lora_single_device.yaml epochs=1 output_dir=$BASE/extra/llm-lora-ddp-nodes/output tokenizer.path=$BASE/data/llama3_8B/original/tokenizer.model checkpointer.checkpoint_dir=$BASE/data/llama3_8B/original checkpointer.output_dir=$BASE/data/llama3_8B/ metric_logger.log_dir=$BASE/extra/llm-lora-ddp-nodes/metrics repo_id="meta-llama/Meta-Llama-3.1-8B" batch_size=8 gradient_accumulation_steps=8 device=cuda &
+ wait
+)
+
+echo "---"
+echo "llm-lora-mp-gpus"
+echo "================"
+time (
+ $BASE/venv/torch/bin/tune run --nnodes=1 --rdzv-backend=static --rdzv-endpoint=127.0.0.1:29400 --master-addr=127.0.0.1 --master-port=29400 --local-ranks-filter=0 --nproc-per-node=8 -- $SRC/milabench/benchmarks/llm/recipes/lora_finetune_distributed.py --config $SRC/milabench/benchmarks/llm/configs/llama3_70B_lora.yaml epochs=1 output_dir=$BASE/extra/llm-lora-mp-gpus/output tokenizer.path=$BASE/data/llama3_70B/original/tokenizer.model checkpointer.checkpoint_dir=$BASE/data/llama3_70B checkpointer.output_dir=$BASE/data/llama3_70B/ safetensors=true metric_logger.log_dir=$BASE/extra/llm-lora-mp-gpus/metrics repo_id="meta-llama/Meta-Llama-3.1-70B" batch_size=8 gradient_accumulation_steps=1 device=cuda &
+ wait
+)
+
+echo "---"
+echo "llm-full-mp-gpus"
+echo "================"
+time (
+ $BASE/venv/torch/bin/tune run --nnodes=1 --rdzv-backend=static --rdzv-endpoint=127.0.0.1:29400 --master-addr=127.0.0.1 --master-port=29400 --local-ranks-filter=0 --nproc-per-node=8 -- $SRC/milabench/benchmarks/llm/recipes/full_finetune_distributed.py --config $SRC/milabench/benchmarks/llm/configs/llama3_70B_full.yaml epochs=1 output_dir=$BASE/extra/llm-full-mp-gpus/output tokenizer.path=$BASE/data/llama3_70B/original/tokenizer.model checkpointer.checkpoint_dir=$BASE/data/llama3_70B checkpointer.output_dir=$BASE/data/llama3_70B/
metric_logger.log_dir=$BASE/extra/llm-full-mp-gpus/metrics repo_id="meta-llama/Meta-Llama-3.1-70B" safetensors=true batch_size=2 gradient_accumulation_steps=1 device=cuda &
+ wait
+)
+
+echo "---"
+echo "llm-full-mp-nodes"
+echo "================="
+time (
+ $BASE/venv/torch/bin/tune run --nnodes=2 --rdzv-backend=static --rdzv-endpoint=127.0.0.1:29400 --master-addr=127.0.0.1 --master-port=29400 --local-ranks-filter=0 --node-rank=0 --local-addr=127.0.0.1 --rdzv-conf=rank=0 --nproc-per-node=8 -- $SRC/milabench/benchmarks/llm/recipes/full_finetune_distributed.py --config $SRC/milabench/benchmarks/llm/configs/llama3_70B_full.yaml epochs=1 output_dir=$BASE/extra/llm-full-mp-nodes/output tokenizer.path=$BASE/data/llama3_70B/original/tokenizer.model checkpointer.checkpoint_dir=$BASE/data/llama3_70B checkpointer.output_dir=$BASE/data/llama3_70B/ metric_logger.log_dir=$BASE/extra/llm-full-mp-nodes/metrics repo_id="meta-llama/Meta-Llama-3.1-70B" safetensors=true batch_size=2 gradient_accumulation_steps=1 device=cuda &
+ ssh -oCheckHostIP=no -oStrictHostKeyChecking=no -oPasswordAuthentication=no -p 22 username@192.168.0.11 $BASE/venv/torch/bin/tune run --nnodes=2 --rdzv-backend=static --rdzv-endpoint=127.0.0.1:29400 --master-addr=127.0.0.1 --master-port=29400 --local-ranks-filter=0 --node-rank=1 --local-addr=192.168.0.11 --rdzv-conf=rank=1 --nproc-per-node=8 -- $SRC/milabench/benchmarks/llm/recipes/full_finetune_distributed.py --config $SRC/milabench/benchmarks/llm/configs/llama3_70B_full.yaml epochs=1 output_dir=$BASE/extra/llm-full-mp-nodes/output tokenizer.path=$BASE/data/llama3_70B/original/tokenizer.model checkpointer.checkpoint_dir=$BASE/data/llama3_70B checkpointer.output_dir=$BASE/data/llama3_70B/ metric_logger.log_dir=$BASE/extra/llm-full-mp-nodes/metrics repo_id="meta-llama/Meta-Llama-3.1-70B" safetensors=true batch_size=2 gradient_accumulation_steps=1 device=cuda &
+ wait
+)
+
+echo "---"
+echo "dqn"
+echo "==="
+time (
+ CUDA_VISIBLE_DEVICES=0 python $SRC/milabench/benchmarks/purejaxrl/main.py dqn --num_envs 128 --buffer_size 131072 --buffer_batch_size 65536 --env_name CartPole-v1 --training_interval 10 &
+ CUDA_VISIBLE_DEVICES=1 python $SRC/milabench/benchmarks/purejaxrl/main.py dqn --num_envs 128 --buffer_size
131072 --buffer_batch_size 65536 --env_name CartPole-v1 --training_interval 10 & + CUDA_VISIBLE_DEVICES=2 python $SRC/milabench/benchmarks/purejaxrl/main.py dqn --num_envs 128 --buffer_size 131072 --buffer_batch_size 65536 --env_name CartPole-v1 --training_interval 10 & + CUDA_VISIBLE_DEVICES=3 python $SRC/milabench/benchmarks/purejaxrl/main.py dqn --num_envs 128 --buffer_size 131072 --buffer_batch_size 65536 --env_name CartPole-v1 --training_interval 10 & + CUDA_VISIBLE_DEVICES=4 python $SRC/milabench/benchmarks/purejaxrl/main.py dqn --num_envs 128 --buffer_size 131072 --buffer_batch_size 65536 --env_name CartPole-v1 --training_interval 10 & + CUDA_VISIBLE_DEVICES=5 python $SRC/milabench/benchmarks/purejaxrl/main.py dqn --num_envs 128 --buffer_size 131072 --buffer_batch_size 65536 --env_name CartPole-v1 --training_interval 10 & + CUDA_VISIBLE_DEVICES=6 python $SRC/milabench/benchmarks/purejaxrl/main.py dqn --num_envs 128 --buffer_size 131072 --buffer_batch_size 65536 --env_name CartPole-v1 --training_interval 10 & + CUDA_VISIBLE_DEVICES=7 python $SRC/milabench/benchmarks/purejaxrl/main.py dqn --num_envs 128 --buffer_size 131072 --buffer_batch_size 65536 --env_name CartPole-v1 --training_interval 10 & + wait +) + +echo "---" +echo "ppo" +echo "===" +time ( + CUDA_VISIBLE_DEVICES=0 python $SRC/milabench/benchmarks/purejaxrl/main.py ppo --num_envs 128 --num_steps 10 --num_minibatches 32 --update_epochs 4 --env_name hopper --total_timesteps 2000000 & + CUDA_VISIBLE_DEVICES=1 python $SRC/milabench/benchmarks/purejaxrl/main.py ppo --num_envs 128 --num_steps 10 --num_minibatches 32 --update_epochs 4 --env_name hopper --total_timesteps 2000000 & + CUDA_VISIBLE_DEVICES=2 python $SRC/milabench/benchmarks/purejaxrl/main.py ppo --num_envs 128 --num_steps 10 --num_minibatches 32 --update_epochs 4 --env_name hopper --total_timesteps 2000000 & + CUDA_VISIBLE_DEVICES=3 python $SRC/milabench/benchmarks/purejaxrl/main.py ppo --num_envs 128 --num_steps 10 --num_minibatches 32 --update_epochs 4 --env_name hopper --total_timesteps 2000000 & + CUDA_VISIBLE_DEVICES=4 python $SRC/milabench/benchmarks/purejaxrl/main.py ppo --num_envs 128 --num_steps 10 --num_minibatches 32 --update_epochs 4 --env_name hopper --total_timesteps 2000000 & + CUDA_VISIBLE_DEVICES=5 python $SRC/milabench/benchmarks/purejaxrl/main.py ppo --num_envs 128 --num_steps 10 --num_minibatches 32 --update_epochs 4 --env_name hopper --total_timesteps 2000000 & + CUDA_VISIBLE_DEVICES=6 python $SRC/milabench/benchmarks/purejaxrl/main.py ppo --num_envs 128 --num_steps 10 --num_minibatches 32 --update_epochs 4 --env_name hopper --total_timesteps 2000000 & + CUDA_VISIBLE_DEVICES=7 python $SRC/milabench/benchmarks/purejaxrl/main.py ppo --num_envs 128 --num_steps 10 --num_minibatches 32 --update_epochs 4 --env_name hopper --total_timesteps 2000000 & + wait +) + +echo "---" +echo "pna" +echo "===" +time ( + CUDA_VISIBLE_DEVICES=0 python $SRC/milabench/benchmarks/geo_gnn/main.py --model PNA --num-samples 100000 --batch-size 4096 --num-workers 0 & + CUDA_VISIBLE_DEVICES=1 python $SRC/milabench/benchmarks/geo_gnn/main.py --model PNA --num-samples 100000 --batch-size 4096 --num-workers 0 & + CUDA_VISIBLE_DEVICES=2 python $SRC/milabench/benchmarks/geo_gnn/main.py --model PNA --num-samples 100000 --batch-size 4096 --num-workers 0 & + CUDA_VISIBLE_DEVICES=3 python $SRC/milabench/benchmarks/geo_gnn/main.py --model PNA --num-samples 100000 --batch-size 4096 --num-workers 0 & + CUDA_VISIBLE_DEVICES=4 python $SRC/milabench/benchmarks/geo_gnn/main.py --model PNA 
--num-samples 100000 --batch-size 4096 --num-workers 0 & + CUDA_VISIBLE_DEVICES=5 python $SRC/milabench/benchmarks/geo_gnn/main.py --model PNA --num-samples 100000 --batch-size 4096 --num-workers 0 & + CUDA_VISIBLE_DEVICES=6 python $SRC/milabench/benchmarks/geo_gnn/main.py --model PNA --num-samples 100000 --batch-size 4096 --num-workers 0 & + CUDA_VISIBLE_DEVICES=7 python $SRC/milabench/benchmarks/geo_gnn/main.py --model PNA --num-samples 100000 --batch-size 4096 --num-workers 0 & + wait +) + +echo "---" +echo "dimenet" +echo "=======" +time ( + CUDA_VISIBLE_DEVICES=0 python $SRC/milabench/benchmarks/geo_gnn/main.py --model DimeNet --num-samples 100000 --use3d --batch-size 16 --num-workers 0 & + CUDA_VISIBLE_DEVICES=1 python $SRC/milabench/benchmarks/geo_gnn/main.py --model DimeNet --num-samples 100000 --use3d --batch-size 16 --num-workers 0 & + CUDA_VISIBLE_DEVICES=2 python $SRC/milabench/benchmarks/geo_gnn/main.py --model DimeNet --num-samples 100000 --use3d --batch-size 16 --num-workers 0 & + CUDA_VISIBLE_DEVICES=3 python $SRC/milabench/benchmarks/geo_gnn/main.py --model DimeNet --num-samples 100000 --use3d --batch-size 16 --num-workers 0 & + CUDA_VISIBLE_DEVICES=4 python $SRC/milabench/benchmarks/geo_gnn/main.py --model DimeNet --num-samples 100000 --use3d --batch-size 16 --num-workers 0 & + CUDA_VISIBLE_DEVICES=5 python $SRC/milabench/benchmarks/geo_gnn/main.py --model DimeNet --num-samples 100000 --use3d --batch-size 16 --num-workers 0 & + CUDA_VISIBLE_DEVICES=6 python $SRC/milabench/benchmarks/geo_gnn/main.py --model DimeNet --num-samples 100000 --use3d --batch-size 16 --num-workers 0 & + CUDA_VISIBLE_DEVICES=7 python $SRC/milabench/benchmarks/geo_gnn/main.py --model DimeNet --num-samples 100000 --use3d --batch-size 16 --num-workers 0 & + wait +) + +echo "---" +echo "recursiongfn" +echo "============" +time ( + CUDA_VISIBLE_DEVICES=0 python $SRC/milabench/benchmarks/recursiongfn/main.py --batch_size 128 --num_workers 8 --num_steps 100 --layer_width 128 --num_layers 4 & + CUDA_VISIBLE_DEVICES=1 python $SRC/milabench/benchmarks/recursiongfn/main.py --batch_size 128 --num_workers 8 --num_steps 100 --layer_width 128 --num_layers 4 & + CUDA_VISIBLE_DEVICES=2 python $SRC/milabench/benchmarks/recursiongfn/main.py --batch_size 128 --num_workers 8 --num_steps 100 --layer_width 128 --num_layers 4 & + CUDA_VISIBLE_DEVICES=3 python $SRC/milabench/benchmarks/recursiongfn/main.py --batch_size 128 --num_workers 8 --num_steps 100 --layer_width 128 --num_layers 4 & + CUDA_VISIBLE_DEVICES=4 python $SRC/milabench/benchmarks/recursiongfn/main.py --batch_size 128 --num_workers 8 --num_steps 100 --layer_width 128 --num_layers 4 & + CUDA_VISIBLE_DEVICES=5 python $SRC/milabench/benchmarks/recursiongfn/main.py --batch_size 128 --num_workers 8 --num_steps 100 --layer_width 128 --num_layers 4 & + CUDA_VISIBLE_DEVICES=6 python $SRC/milabench/benchmarks/recursiongfn/main.py --batch_size 128 --num_workers 8 --num_steps 100 --layer_width 128 --num_layers 4 & + CUDA_VISIBLE_DEVICES=7 python $SRC/milabench/benchmarks/recursiongfn/main.py --batch_size 128 --num_workers 8 --num_steps 100 --layer_width 128 --num_layers 4 & + wait +) + +echo "---" +echo "torchatari" +echo "==========" +time ( + CUDA_VISIBLE_DEVICES=0 python $SRC/milabench/benchmarks/torchatari/main.py --num-minibatches 16 --update-epochs 4 --num-steps 128 --num-envs 128 --total-timesteps 1000000 --env-id Breakout-v5 & + CUDA_VISIBLE_DEVICES=1 python $SRC/milabench/benchmarks/torchatari/main.py --num-minibatches 16 --update-epochs 4 --num-steps 128 
--num-envs 128 --total-timesteps 1000000 --env-id Breakout-v5 & + CUDA_VISIBLE_DEVICES=2 python $SRC/milabench/benchmarks/torchatari/main.py --num-minibatches 16 --update-epochs 4 --num-steps 128 --num-envs 128 --total-timesteps 1000000 --env-id Breakout-v5 & + CUDA_VISIBLE_DEVICES=3 python $SRC/milabench/benchmarks/torchatari/main.py --num-minibatches 16 --update-epochs 4 --num-steps 128 --num-envs 128 --total-timesteps 1000000 --env-id Breakout-v5 & + CUDA_VISIBLE_DEVICES=4 python $SRC/milabench/benchmarks/torchatari/main.py --num-minibatches 16 --update-epochs 4 --num-steps 128 --num-envs 128 --total-timesteps 1000000 --env-id Breakout-v5 & + CUDA_VISIBLE_DEVICES=5 python $SRC/milabench/benchmarks/torchatari/main.py --num-minibatches 16 --update-epochs 4 --num-steps 128 --num-envs 128 --total-timesteps 1000000 --env-id Breakout-v5 & + CUDA_VISIBLE_DEVICES=6 python $SRC/milabench/benchmarks/torchatari/main.py --num-minibatches 16 --update-epochs 4 --num-steps 128 --num-envs 128 --total-timesteps 1000000 --env-id Breakout-v5 & + CUDA_VISIBLE_DEVICES=7 python $SRC/milabench/benchmarks/torchatari/main.py --num-minibatches 16 --update-epochs 4 --num-steps 128 --num-envs 128 --total-timesteps 1000000 --env-id Breakout-v5 & + wait +) + +echo "---" +echo "llava-single" +echo "============" +time ( + CUDA_VISIBLE_DEVICES=0 $SRC/milabench/benchmarks/llava/main.py --batch_size 1 --num_workers 4 --gradient_accumulation_steps 1 & + CUDA_VISIBLE_DEVICES=1 $SRC/milabench/benchmarks/llava/main.py --batch_size 1 --num_workers 4 --gradient_accumulation_steps 1 & + CUDA_VISIBLE_DEVICES=2 $SRC/milabench/benchmarks/llava/main.py --batch_size 1 --num_workers 4 --gradient_accumulation_steps 1 & + CUDA_VISIBLE_DEVICES=3 $SRC/milabench/benchmarks/llava/main.py --batch_size 1 --num_workers 4 --gradient_accumulation_steps 1 & + CUDA_VISIBLE_DEVICES=4 $SRC/milabench/benchmarks/llava/main.py --batch_size 1 --num_workers 4 --gradient_accumulation_steps 1 & + CUDA_VISIBLE_DEVICES=5 $SRC/milabench/benchmarks/llava/main.py --batch_size 1 --num_workers 4 --gradient_accumulation_steps 1 & + CUDA_VISIBLE_DEVICES=6 $SRC/milabench/benchmarks/llava/main.py --batch_size 1 --num_workers 4 --gradient_accumulation_steps 1 & + CUDA_VISIBLE_DEVICES=7 $SRC/milabench/benchmarks/llava/main.py --batch_size 1 --num_workers 4 --gradient_accumulation_steps 1 & + wait +) + +echo "---" +echo "rlhf-single" +echo "===========" +time ( + CUDA_VISIBLE_DEVICES=0 $SRC/milabench/benchmarks/rlhf/main.py --output_dir $BASE/extra/rlhf-single/output --model_name_or_path EleutherAI/pythia-1b-deduped --per_device_train_batch_size 64 --logging_strategy no --log_level critical --bf16 & + CUDA_VISIBLE_DEVICES=1 $SRC/milabench/benchmarks/rlhf/main.py --output_dir $BASE/extra/rlhf-single/output --model_name_or_path EleutherAI/pythia-1b-deduped --per_device_train_batch_size 64 --logging_strategy no --log_level critical --bf16 & + CUDA_VISIBLE_DEVICES=2 $SRC/milabench/benchmarks/rlhf/main.py --output_dir $BASE/extra/rlhf-single/output --model_name_or_path EleutherAI/pythia-1b-deduped --per_device_train_batch_size 64 --logging_strategy no --log_level critical --bf16 & + CUDA_VISIBLE_DEVICES=3 $SRC/milabench/benchmarks/rlhf/main.py --output_dir $BASE/extra/rlhf-single/output --model_name_or_path EleutherAI/pythia-1b-deduped --per_device_train_batch_size 64 --logging_strategy no --log_level critical --bf16 & + CUDA_VISIBLE_DEVICES=4 $SRC/milabench/benchmarks/rlhf/main.py --output_dir $BASE/extra/rlhf-single/output --model_name_or_path 
EleutherAI/pythia-1b-deduped --per_device_train_batch_size 64 --logging_strategy no --log_level critical --bf16 & + CUDA_VISIBLE_DEVICES=5 $SRC/milabench/benchmarks/rlhf/main.py --output_dir $BASE/extra/rlhf-single/output --model_name_or_path EleutherAI/pythia-1b-deduped --per_device_train_batch_size 64 --logging_strategy no --log_level critical --bf16 & + CUDA_VISIBLE_DEVICES=6 $SRC/milabench/benchmarks/rlhf/main.py --output_dir $BASE/extra/rlhf-single/output --model_name_or_path EleutherAI/pythia-1b-deduped --per_device_train_batch_size 64 --logging_strategy no --log_level critical --bf16 & + CUDA_VISIBLE_DEVICES=7 $SRC/milabench/benchmarks/rlhf/main.py --output_dir $BASE/extra/rlhf-single/output --model_name_or_path EleutherAI/pythia-1b-deduped --per_device_train_batch_size 64 --logging_strategy no --log_level critical --bf16 & + wait +) + +echo "---" +echo "rlhf-gpus" +echo "=========" +time ( + $SRC/milabench/milabench/scripts/activator $BASE/venv/torch $BASE/cache accelerate launch --mixed_precision=bf16 --dynamo_backend=no --machine_rank=0 --num_machines=1 --multi_gpu --gradient_accumulation_steps=1 --num_cpu_threads_per_process=4 --main_process_ip=127.0.0.1 --main_process_port=29400 --num_processes=8 $SRC/milabench/benchmarks/rlhf/main.py --output_dir $BASE/extra/rlhf-gpus/output --model_name_or_path EleutherAI/pythia-1b-deduped --per_device_train_batch_size 64 --logging_strategy no --log_level critical --bf16 & + wait +) + +echo "---" +echo "vjepa-single" +echo "============" +time ( + CUDA_VISIBLE_DEVICES=0 python $SRC/milabench/benchmarks/vjepa/main.py --batch_size 24 --num_workers 12 --dataset $BASE/data/FakeVideo/video_metainfo.csv --output $BASE/extra/vjepa-single & + CUDA_VISIBLE_DEVICES=1 python $SRC/milabench/benchmarks/vjepa/main.py --batch_size 24 --num_workers 12 --dataset $BASE/data/FakeVideo/video_metainfo.csv --output $BASE/extra/vjepa-single & + CUDA_VISIBLE_DEVICES=2 python $SRC/milabench/benchmarks/vjepa/main.py --batch_size 24 --num_workers 12 --dataset $BASE/data/FakeVideo/video_metainfo.csv --output $BASE/extra/vjepa-single & + CUDA_VISIBLE_DEVICES=3 python $SRC/milabench/benchmarks/vjepa/main.py --batch_size 24 --num_workers 12 --dataset $BASE/data/FakeVideo/video_metainfo.csv --output $BASE/extra/vjepa-single & + CUDA_VISIBLE_DEVICES=4 python $SRC/milabench/benchmarks/vjepa/main.py --batch_size 24 --num_workers 12 --dataset $BASE/data/FakeVideo/video_metainfo.csv --output $BASE/extra/vjepa-single & + CUDA_VISIBLE_DEVICES=5 python $SRC/milabench/benchmarks/vjepa/main.py --batch_size 24 --num_workers 12 --dataset $BASE/data/FakeVideo/video_metainfo.csv --output $BASE/extra/vjepa-single & + CUDA_VISIBLE_DEVICES=6 python $SRC/milabench/benchmarks/vjepa/main.py --batch_size 24 --num_workers 12 --dataset $BASE/data/FakeVideo/video_metainfo.csv --output $BASE/extra/vjepa-single & + CUDA_VISIBLE_DEVICES=7 python $SRC/milabench/benchmarks/vjepa/main.py --batch_size 24 --num_workers 12 --dataset $BASE/data/FakeVideo/video_metainfo.csv --output $BASE/extra/vjepa-single & + wait +) + +echo "---" +echo "vjepa-gpus" +echo "==========" +time ( + $BASE/venv/torch/bin/benchrun --nnodes=1 --rdzv-backend=static --rdzv-endpoint=127.0.0.1:29400 --master-addr=127.0.0.1 --master-port=29400 --local-ranks-filter=0 --nproc-per-node=8 --no-python -- python $SRC/milabench/benchmarks/vjepa/main.py --batch_size 24 --num_workers 12 --dataset $BASE/data/FakeVideo/video_metainfo.csv --output $BASE/extra/vjepa-gpus & + wait +) + diff --git a/tests/test_scaler.py b/tests/test_scaler.py index 
f00a8979..07cdb2ed 100644 --- a/tests/test_scaler.py +++ b/tests/test_scaler.py @@ -76,28 +76,27 @@ def fakeexec(pack): def test_scaler_enabled(multipack, config): from milabench.system import system_global - import contextvars - - ctx = contextvars.copy_context() - - def update_ctx(): - sizer = Sizer( - SizerOptions( - size=None, - autoscale=True, - multiple=8, - ), - config("scaling"), - ) - sizer_global.set(sizer) - system = system_global.get() - gpu = system.setdefault("gpu", dict()) - gpu["capacity"] = "41920 MiB" - - ctx.run(update_ctx) + from milabench.system import apply_system + + conf = { + "gpu": { + "capacity": "41920 MiB" + }, + "options": { + "sizer": { + "multiple": 8 + } + } + } for k, pack in multipack.packs.items(): - assert ctx.run(lambda: fakeexec(pack)) == ["--batch_size", "232"] + # Sizer is only enabled when config is applied + assert fakeexec(pack) == [] + + with apply_system(conf): + for k, pack in multipack.packs.items(): + fakeexec(pack) == ["--batch_size", "232"] - # Sizer is only enabled inside the context + for k, pack in multipack.packs.items(): + # Sizer is only enabled when config is applied assert fakeexec(pack) == [] diff --git a/tests/test_summary/test_compare.txt b/tests/test_summary/test_compare.txt index c4dd7f6d..c3bb5bf0 100644 --- a/tests/test_summary/test_compare.txt +++ b/tests/test_summary/test_compare.txt @@ -1,5 +1,5 @@ | rijubigo | sedumoje - | 2023-03-24 | 2023-03-24 -bench | metric | 13:45:27 | 13:57:35 + | 2024-08-23 | 2024-08-23 +bench | metric | 09:22:03 | 09:22:03 ---------------------------------------------------------------- benchio | train_rate | 8780.41 | 8286.03 diff --git a/tests/test_summary/test_report.txt b/tests/test_summary/test_report.txt index b9f6ce02..2f4d3fe4 100644 --- a/tests/test_summary/test_report.txt +++ b/tests/test_summary/test_report.txt @@ -11,4 +11,4 @@ benchio | 0 | 4 | 0 | 7979.82 | 2.9% | 17.2% | nan | 79 Scores ------ Failure rate: 0.00% (PASS) -Score: 7979.82 +Score: 7980.82 diff --git a/tests/test_summary/test_report_folder_does_average.txt b/tests/test_summary/test_report_folder_does_average.txt index 9fda7a9c..8884a73a 100644 --- a/tests/test_summary/test_report_folder_does_average.txt +++ b/tests/test_summary/test_report_folder_does_average.txt @@ -11,4 +11,4 @@ benchio | 0 | 6 | 0 | 7878.45 | 2.5% | 18.0% | 24456 | 78 Scores ------ Failure rate: 0.00% (PASS) -Score: 7878.45 +Score: 7879.45 diff --git a/tests/test_validation.py b/tests/test_validation.py index d5f1007b..9ed9000a 100644 --- a/tests/test_validation.py +++ b/tests/test_validation.py @@ -76,39 +76,46 @@ def test_planning_layer_per_gpu_bad(replayfolder, monkeypatch): def test_memory_tracking(replayfolder, config, tmp_path): import contextvars - - from milabench.sizer import ( - MemoryUsageExtractor, - Sizer, - SizerOptions, - sizer_global, - system_global, - ) - - ctx = contextvars.copy_context() - - def update_ctx(): - sizer = Sizer( - SizerOptions( - size=None, - autoscale=True, - multiple=8, - ), - config("scaling"), + import yaml + from milabench.system import apply_system, option + + conf = { + "gpu": { + "capacity": "41920 MiB" + }, + "options": { + "sizer": { + "multiple": 8, + "autoscale": 1 + } + } + } + + with apply_system(conf): + from milabench.sizer import ( + MemoryUsageExtractor, + Sizer, + SizerOptions, + sizer_global, + system_global, ) - sizer_global.set(sizer) - system_global.set({"gpu": {"capacity": "41920 MiB"}}) - - ctx.run(update_ctx) - layer = ctx.run(lambda: MemoryUsageExtractor()) - - layer.filepath = 
f"{tmp_path}/dummy" - - assert 123 not in layer.memory["benchio"]["model"] - - ctx.run(lambda: replay_validation_scenario(replayfolder, layer, filename="usage")) - - assert 123 in layer.memory["benchio"]["model"] + + layer = MemoryUsageExtractor() + with open(config("scaling"), "r") as sconf: + layer.memory = yaml.safe_load(sconf) + + layer.filepath = f"{tmp_path}/dummy" + + print(system_global.get()) + # print(option("sizer.multiple", etype=int)) + # print(option("sizer.config", etype=str)) + # print(Sizer().scaling_config) + assert 123 not in layer.memory["benchio"]["model"] + + replay_validation_scenario(replayfolder, layer, filename="usage") + + # print(layer.memory) + assert 123 in layer.memory["benchio"]["model"] def test_exception_tracking(replayfolder, file_regression, capsys): From b0cfe430222d68b482c192c35e032668cbbbc8c3 Mon Sep 17 00:00:00 2001 From: Setepenre Date: Thu, 21 Nov 2024 21:16:28 -0500 Subject: [PATCH 18/20] Update Dockerfile-rocm --- docker/Dockerfile-rocm | 19 ++++++------------- 1 file changed, 6 insertions(+), 13 deletions(-) diff --git a/docker/Dockerfile-rocm b/docker/Dockerfile-rocm index 50466084..2290c40e 100644 --- a/docker/Dockerfile-rocm +++ b/docker/Dockerfile-rocm @@ -1,3 +1,4 @@ + FROM ubuntu:22.04 # Arguments @@ -10,16 +11,17 @@ ARG CONFIG=standard.yaml ENV MILABENCH_CONFIG_NAME=$CONFIG ENV MILABENCH_DOCKER=1 +ARG PYTHON="3.10" + # Paths # ----- ENV MILABENCH_CONFIG=/milabench/milabench/config/$MILABENCH_CONFIG_NAME ENV MILABENCH_BASE=/milabench/envs -ENV MILABENCH_OUTPUT=/milabench/results/ ENV MILABENCH_ARGS="" -ENV CONDA_PATH=/opt/anaconda - +ENV MILABENCH_OUTPUT="$MILABENCH_BASE/runs" +ENV BENCHMARK_VENV="$MILABENCH_BASE/venv" # Copy milabench # -------------- @@ -37,22 +39,13 @@ COPY . /milabench/milabench/ # build-essential: for rust RUN apt-get update &&\ - apt-get install -y git build-essential curl &&\ + apt-get install -y git build-essential curl python3.10 python-is-python3 python3-pip &&\ apt-get clean &&\ rm -rf /var/lib/apt/lists/* RUN curl https://sh.rustup.rs -sSf | sh -s -- -y ENV PATH="/root/.cargo/bin:${PATH}" -# Install Python -# -------------- - -# Install anaconda because milabench will need it later anyway -RUN curl https://repo.anaconda.com/miniconda/Miniconda3-py39_23.1.0-1-Linux-x86_64.sh -o ~/miniconda.sh && \ - /bin/bash ~/miniconda.sh -b -p $CONDA_PATH && rm ~/miniconda.sh -ENV PATH=$CONDA_PATH/bin:$PATH - - # Install Milabench # ----------------- From 14d628a49f276921508b2f8a8a87a1d5521d8ed9 Mon Sep 17 00:00:00 2001 From: Setepenre Date: Thu, 21 Nov 2024 21:23:26 -0500 Subject: [PATCH 19/20] Update README.md --- README.md | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/README.md b/README.md index a2f8ce50..6731e52f 100644 --- a/README.md +++ b/README.md @@ -115,7 +115,4 @@ We are working on validating it on more configurations and will update the above ------ Failure rate: 0.38% (PASS) Score: 4175.57 - - Errors - ------ - 1 errors, details in HTML report. 
+ From 665390198fb59c19803e49cedb8c883191ff1e8b Mon Sep 17 00:00:00 2001 From: "pierre.delaunay" Date: Thu, 21 Nov 2024 22:23:29 -0500 Subject: [PATCH 20/20] Revert HPU changes --- benchmarks/llm/configs/llama3_70B_full.yaml | 8 ++-- .../llm/recipes/full_finetune_distributed.py | 45 +++++++++++-------- 2 files changed, 30 insertions(+), 23 deletions(-) diff --git a/benchmarks/llm/configs/llama3_70B_full.yaml b/benchmarks/llm/configs/llama3_70B_full.yaml index 703eb876..ae5cf2af 100644 --- a/benchmarks/llm/configs/llama3_70B_full.yaml +++ b/benchmarks/llm/configs/llama3_70B_full.yaml @@ -82,7 +82,7 @@ optimizer: foreach: False # Note: highly recommended to use fused=True optimizer flag # with CPU offload for faster optimizer step. - fused: true + fused: True loss: _component_: torch.nn.CrossEntropyLoss @@ -94,9 +94,9 @@ gradient_accumulation_steps: 1 device: cuda # Memory management -enable_activation_checkpointing: true -memory_efficient_fsdp_wrap: true -fsdp_cpu_offload: true +enable_activation_checkpointing: True +memory_efficient_fsdp_wrap: True +fsdp_cpu_offload: True # Reduced precision dtype: bf16 diff --git a/benchmarks/llm/recipes/full_finetune_distributed.py b/benchmarks/llm/recipes/full_finetune_distributed.py index f8d58e2f..3a51842d 100755 --- a/benchmarks/llm/recipes/full_finetune_distributed.py +++ b/benchmarks/llm/recipes/full_finetune_distributed.py @@ -16,7 +16,6 @@ import torch from omegaconf import DictConfig, ListConfig -import torchcompat.core as acc from torch import nn from torch.distributed import init_process_group from torch.distributed.fsdp import ( @@ -39,8 +38,6 @@ log = utils.get_logger("DEBUG") -HPU_UNSUPPORTED = False - class FullFinetuneRecipeDistributed(FTRecipeInterface): """ @@ -100,8 +97,8 @@ class FullFinetuneRecipeDistributed(FTRecipeInterface): """ def __init__(self, cfg: DictConfig) -> None: - import os - self._device = acc.fetch_device(int(os.getenv("LOCAL_RANK", "0"))) + + self._device = utils.get_device(device=cfg.device) self._dtype = utils.get_dtype(cfg.dtype, device=self._device) if self._dtype == torch.float16: @@ -134,10 +131,7 @@ def __init__(self, cfg: DictConfig) -> None: # These are public properties which are updated by the checkpoint loader # when ``resume_from_checkpoint`` is `True` or validated in tests - if HPU_UNSUPPORTED: - self.seed = utils.set_seed(seed=cfg.seed) - else: - self.seed = 1 + self.seed = utils.set_seed(seed=cfg.seed) self.epochs_run = 0 self.total_epochs = cfg.epochs self.max_steps_per_epoch = cfg.max_steps_per_epoch @@ -357,10 +351,8 @@ def _setup_model( ) if self._is_rank_zero: - if HPU_UNSUPPORTED: - pass - #memory_stats = utils.get_memory_stats(device=self._device) - #utils.log_memory_stats(memory_stats) + memory_stats = utils.get_memory_stats(device=self._device) + utils.log_memory_stats(memory_stats) # synchronize before training begins torch.distributed.barrier() @@ -421,7 +413,6 @@ def _setup_data( dataset=ds, batch_size=batch_size, sampler=sampler, - # persistent_workers=True, collate_fn=partial( utils.padded_collate, padding_idx=self._tokenizer.pad_id, @@ -552,14 +543,31 @@ def train(self) -> None: f"{curr_epoch+1}|{self.global_step}|Loss: {loss_to_log}" ) + # Log per-step metrics + if ( + self.global_step % self._log_every_n_steps == 0 + and self._is_rank_zero + ): + time_per_step = time.perf_counter() - t0 + log_dict = { + "loss": loss_to_log, + "lr": self._optimizer.param_groups[0]["lr"], + "tokens_per_second_per_gpu": num_tokens / time_per_step, + } + if self._log_peak_memory_stats: + 
log_dict.update(utils.get_memory_stats(device=self._device)) + self._metric_logger.log_dict( + log_dict, + step=self.global_step, + ) + # Reset running stats for the next step running_loss = 0 num_tokens = 0 t0 = time.perf_counter() - - print("HERE") + self.epochs_run += 1 - # self.save_checkpoint(epoch=curr_epoch) + self.save_checkpoint(epoch=curr_epoch) def cleanup(self) -> None: if self._is_rank_zero: @@ -610,8 +618,7 @@ def recipe_main(cfg: DictConfig) -> None: "If using tune CLI, please specify --nnodes 1 and --nproc_per_node [num_gpus]" ) - acc.init_process_group() - + init_process_group(backend="gloo" if cfg.device == "cpu" else "nccl") if cfg.get("fsdp_cpu_offload", False): # Utilize all available CPU cores for intra-op parallelism. This provides ~2x # speed up when benchmarking fused AdamW on CPU