From 8e62d37035b022ab918213497dc0f139e1412f49 Mon Sep 17 00:00:00 2001
From: Setepenre <pierre.delaunay.tr@gmail.com>
Date: Mon, 6 Nov 2023 09:37:45 -0500
Subject: [PATCH] Add flops benchmark (#169)

---
 .gitignore                                    |   2 +
 .pin/constraints-cuda-torch.txt               |  31 ++-
 .pin/constraints-rocm-torch.txt               |  43 ++--
 .../accelerate_opt/requirements.cuda.txt      |  14 +-
 .../accelerate_opt/requirements.rocm.txt      |  16 +-
 benchmarks/dlrm/requirements.cuda.txt         |  17 +-
 benchmarks/dlrm/requirements.rocm.txt         |  19 +-
 benchmarks/flops/activator                    |   7 +
 benchmarks/flops/benchfile.py                 |  19 ++
 benchmarks/flops/main.py                      | 184 ++++++++++++++++++
 benchmarks/flops/prepare.py                   |   1 +
 benchmarks/flops/requirements.cuda.txt        | 153 +++++++++++++++
 benchmarks/flops/requirements.in              |   4 +
 benchmarks/flops/requirements.rocm.txt        | 162 +++++++++++++++
 benchmarks/huggingface/requirements.cuda.txt  |   8 +-
 benchmarks/huggingface/requirements.rocm.txt  |  10 +-
 benchmarks/rwkv/requirements.cuda.txt         |   8 +-
 benchmarks/rwkv/requirements.rocm.txt         |  10 +-
 benchmarks/stargan/requirements.cuda.txt      |   8 +-
 benchmarks/stargan/requirements.rocm.txt      |  10 +-
 benchmarks/super-slomo/requirements.cuda.txt  |   8 +-
 benchmarks/super-slomo/requirements.rocm.txt  |  10 +-
 benchmarks/timm/requirements.cuda.txt         |   8 +-
 benchmarks/timm/requirements.rocm.txt         |  10 +-
 benchmarks/torchvision/requirements.cuda.txt  |   8 +-
 benchmarks/torchvision/requirements.rocm.txt  |  10 +-
 config/base.yaml                              |  56 +++++-
 config/standard.yaml                          |  16 ++
 constraints/cuda.txt                          |   3 +-
 constraints/rocm.txt                          |   2 +-
 milabench/_version.py                         |   6 +-
 milabench/dashboard/__init__.py               |   0
 milabench/dashboard/live_report.py            |   0
 milabench/dashboard/rawoutput.py              |   0
 milabench/executors.py                        |  11 ++
 milabench/schedule.py                         |  10 +-
 milabench/scripts/milabench_docker.bash       |   5 +
 milabench/scripts/milabench_run.bash          |  16 +-
 milabench/scripts/setup.bash                  | 111 +++++++++++
 39 files changed, 876 insertions(+), 140 deletions(-)
 create mode 100755 benchmarks/flops/activator
 create mode 100644 benchmarks/flops/benchfile.py
 create mode 100755 benchmarks/flops/main.py
 create mode 100755 benchmarks/flops/prepare.py
 create mode 100644 benchmarks/flops/requirements.cuda.txt
 create mode 100644 benchmarks/flops/requirements.in
 create mode 100644 benchmarks/flops/requirements.rocm.txt
 create mode 100644 milabench/dashboard/__init__.py
 create mode 100644 milabench/dashboard/live_report.py
 create mode 100644 milabench/dashboard/rawoutput.py
 create mode 100644 milabench/scripts/milabench_docker.bash
 create mode 100644 milabench/scripts/setup.bash

diff --git a/.gitignore b/.gitignore
index 8e6de4a30..18dafb9c6 100644
--- a/.gitignore
+++ b/.gitignore
@@ -23,3 +23,5 @@ sqlite.db
 
 .no_report
 trash/
+workspace/
+slurm-*
diff --git a/.pin/constraints-cuda-torch.txt b/.pin/constraints-cuda-torch.txt
index bb30508fa..f52d2aa9d 100644
--- a/.pin/constraints-cuda-torch.txt
+++ b/.pin/constraints-cuda-torch.txt
@@ -2,13 +2,13 @@
 # This file is autogenerated by pip-compile with Python 3.11
 # by the following command:
 #
-#    pip-compile --config=pyproject.toml --output-file=.pin/constraints-cuda-torch.txt --resolver=backtracking .pin/tmp-constraints.txt benchmarks/accelerate_opt/requirements.in benchmarks/dlrm/requirements.in benchmarks/huggingface/requirements.in benchmarks/rwkv/requirements.in benchmarks/stargan/requirements.in benchmarks/super-slomo/requirements.in benchmarks/timm/requirements.in benchmarks/torchvision/requirements.in
+#    pip-compile --config=pyproject.toml --output-file=.pin/constraints-cuda-torch.txt --resolver=backtracking .pin/tmp-constraints.txt benchmarks/accelerate_opt/requirements.in benchmarks/dlrm/requirements.in benchmarks/flops/requirements.in benchmarks/huggingface/requirements.in benchmarks/rwkv/requirements.in benchmarks/stargan/requirements.in benchmarks/super-slomo/requirements.in benchmarks/timm/requirements.in benchmarks/torchvision/requirements.in
 #
 --extra-index-url https://download.pytorch.org/whl/cu118
 
 absl-py==2.0.0
     # via tensorboard
-accelerate==0.24.0
+accelerate==0.24.1
     # via -r benchmarks/accelerate_opt/requirements.in
 aiohttp==3.8.6
     # via
@@ -28,7 +28,7 @@ cachetools==5.3.2
     # via google-auth
 certifi==2023.7.22
     # via requests
-charset-normalizer==3.3.1
+charset-normalizer==3.3.2
     # via
     #   aiohttp
     #   requests
@@ -57,7 +57,7 @@ executing==1.2.0
     # via varname
 fbgemm-gpu==0.5.0+cu118
     # via torchrec
-filelock==3.12.4
+filelock==3.13.1
     # via
     #   huggingface-hub
     #   torch
@@ -82,7 +82,7 @@ giving==0.4.2
     # via
     #   ptera
     #   voir
-google-auth==2.23.3
+google-auth==2.23.4
     # via
     #   google-auth-oauthlib
     #   tensorboard
@@ -90,7 +90,7 @@ google-auth-oauthlib==1.1.0
     # via tensorboard
 graphviz==0.20.1
     # via torchviz
-grpcio==1.59.0
+grpcio==1.59.2
     # via tensorboard
 hjson==3.1.0
     # via deepspeed
@@ -116,7 +116,7 @@ lightning-utilities==0.9.0
     # via
     #   pytorch-lightning
     #   torchmetrics
-markdown==3.5
+markdown==3.5.1
     # via tensorboard
 markdown-it-py==3.0.0
     # via rich
@@ -138,7 +138,7 @@ multiprocess==0.70.15
     #   evaluate
 mypy-extensions==1.0.0
     # via typing-inspect
-networkx==3.2
+networkx==3.2.1
     # via torch
 ninja==1.11.1.1
     # via
@@ -167,7 +167,7 @@ oauthlib==3.2.2
     # via requests-oauthlib
 omegaconf==2.3.0
     # via voir
-onnx==1.14.1
+onnx==1.15.0
     # via -r benchmarks/dlrm/requirements.in
 opencv-python==4.8.1.78
     # via -r benchmarks/super-slomo/requirements.in
@@ -185,7 +185,7 @@ packaging==23.2
     #   pytorch-lightning
     #   torchmetrics
     #   transformers
-pandas==2.1.1
+pandas==2.1.2
     # via
     #   datasets
     #   evaluate
@@ -203,7 +203,7 @@ ptera==1.4.1
     # via voir
 py-cpuinfo==9.0.0
     # via deepspeed
-pyarrow==13.0.0
+pyarrow==14.0.0
     # via datasets
 pyasn1==0.5.0
     # via
@@ -294,7 +294,7 @@ tokenizers==0.14.1
     # via transformers
 torch==2.1.0+cu118
     # via
-    #   -r benchmarks/huggingface/requirements.in
+    #   -r benchmarks/stargan/requirements.in
     #   -r benchmarks/super-slomo/requirements.in
     #   accelerate
     #   deepspeed
@@ -313,8 +313,8 @@ torchrec==0.5.0+cu118
     # via -r benchmarks/dlrm/requirements.in
 torchvision==0.16.0+cu118
     # via
+    #   -r benchmarks/stargan/requirements.in
     #   -r benchmarks/super-slomo/requirements.in
-    #   -r benchmarks/torchvision/requirements.in
 torchviz==0.0.2
     # via -r benchmarks/dlrm/requirements.in
 torchx==0.6.0
@@ -340,7 +340,6 @@ typing-extensions==4.8.0
     # via
     #   huggingface-hub
     #   lightning-utilities
-    #   onnx
     #   pydantic
     #   pyre-extensions
     #   pytorch-lightning
@@ -359,9 +358,9 @@ urllib3==1.26.18
     #   torchx
 varname==0.10.0
     # via giving
-voir @ git+https://github.com/breuleux/voir.git
+voir==0.2.11
     # via
-    #   -r benchmarks/huggingface/requirements.in
+    #   -r benchmarks/stargan/requirements.in
     #   -r benchmarks/super-slomo/requirements.in
 websocket-client==1.6.4
     # via docker
diff --git a/.pin/constraints-rocm-torch.txt b/.pin/constraints-rocm-torch.txt
index 484e52a77..c50a448fe 100644
--- a/.pin/constraints-rocm-torch.txt
+++ b/.pin/constraints-rocm-torch.txt
@@ -2,13 +2,13 @@
 # This file is autogenerated by pip-compile with Python 3.11
 # by the following command:
 #
-#    pip-compile --config=pyproject.toml --output-file=.pin/constraints-rocm-torch.txt --resolver=backtracking .pin/tmp-constraints.txt benchmarks/accelerate_opt/requirements.in benchmarks/dlrm/requirements.in benchmarks/huggingface/requirements.in benchmarks/rwkv/requirements.in benchmarks/stargan/requirements.in benchmarks/super-slomo/requirements.in benchmarks/timm/requirements.in benchmarks/torchvision/requirements.in
+#    pip-compile --config=pyproject.toml --output-file=.pin/constraints-rocm-torch.txt --resolver=backtracking .pin/tmp-constraints.txt benchmarks/accelerate_opt/requirements.in benchmarks/dlrm/requirements.in benchmarks/flops/requirements.in benchmarks/huggingface/requirements.in benchmarks/rwkv/requirements.in benchmarks/stargan/requirements.in benchmarks/super-slomo/requirements.in benchmarks/timm/requirements.in benchmarks/torchvision/requirements.in
 #
 --extra-index-url https://download.pytorch.org/whl/rocm5.6/
 
 absl-py==2.0.0
     # via tensorboard
-accelerate==0.24.0
+accelerate==0.24.1
     # via -r benchmarks/accelerate_opt/requirements.in
 aiohttp==3.8.6
     # via
@@ -28,7 +28,7 @@ cachetools==5.3.2
     # via google-auth
 certifi==2023.7.22
     # via requests
-charset-normalizer==3.3.1
+charset-normalizer==3.3.2
     # via
     #   aiohttp
     #   requests
@@ -59,7 +59,7 @@ executing==1.2.0
     # via varname
 fbgemm-gpu==0.5.0
     # via torchrec
-filelock==3.12.4
+filelock==3.13.1
     # via
     #   huggingface-hub
     #   pytorch-triton-rocm
@@ -84,7 +84,7 @@ giving==0.4.2
     # via
     #   ptera
     #   voir
-google-auth==2.23.3
+google-auth==2.23.4
     # via
     #   google-auth-oauthlib
     #   tensorboard
@@ -92,7 +92,7 @@ google-auth-oauthlib==1.1.0
     # via tensorboard
 graphviz==0.20.1
     # via torchviz
-grpcio==1.59.0
+grpcio==1.59.2
     # via tensorboard
 hjson==3.1.0
     # via deepspeed
@@ -118,9 +118,9 @@ lightning-utilities==0.9.0
     # via
     #   pytorch-lightning
     #   torchmetrics
-lit==17.0.3
+lit==17.0.4
     # via pytorch-triton-rocm
-markdown==3.5
+markdown==3.5.1
     # via tensorboard
 markdown-it-py==3.0.0
     # via rich
@@ -142,7 +142,7 @@ multiprocess==0.70.15
     #   evaluate
 mypy-extensions==1.0.0
     # via typing-inspect
-networkx==3.2
+networkx==3.2.1
     # via torch
 ninja==1.11.1.1
     # via
@@ -151,7 +151,7 @@ ninja==1.11.1.1
 numpy==1.26.1
     # via
     #   -r benchmarks/dlrm/requirements.in
-    #   -r benchmarks/stargan/requirements.in
+    #   -r benchmarks/rwkv/requirements.in
     #   accelerate
     #   datasets
     #   deepspeed
@@ -172,7 +172,7 @@ oauthlib==3.2.2
     # via requests-oauthlib
 omegaconf==2.3.0
     # via voir
-onnx==1.14.1
+onnx==1.15.0
     # via -r benchmarks/dlrm/requirements.in
 opencv-python==4.8.1.78
     # via -r benchmarks/super-slomo/requirements.in
@@ -190,7 +190,7 @@ packaging==23.2
     #   pytorch-lightning
     #   torchmetrics
     #   transformers
-pandas==2.1.1
+pandas==2.1.2
     # via
     #   datasets
     #   evaluate
@@ -208,7 +208,7 @@ ptera==1.4.1
     # via voir
 py-cpuinfo==9.0.0
     # via deepspeed
-pyarrow==13.0.0
+pyarrow==14.0.0
     # via datasets
 pyasn1==0.5.0
     # via
@@ -301,8 +301,8 @@ tokenizers==0.14.1
     # via transformers
 torch==2.1.0+rocm5.6
     # via
-    #   -r benchmarks/stargan/requirements.in
-    #   -r benchmarks/timm/requirements.in
+    #   -r benchmarks/accelerate_opt/requirements.in
+    #   -r benchmarks/flops/requirements.in
     #   accelerate
     #   deepspeed
     #   pytorch-lightning
@@ -321,8 +321,8 @@ torchrec==0.5.0
     # via -r benchmarks/dlrm/requirements.in
 torchvision==0.16.0+rocm5.6
     # via
-    #   -r benchmarks/stargan/requirements.in
-    #   -r benchmarks/timm/requirements.in
+    #   -r benchmarks/accelerate_opt/requirements.in
+    #   -r benchmarks/flops/requirements.in
 torchviz==0.0.2
     # via -r benchmarks/dlrm/requirements.in
 torchx==0.6.0
@@ -330,7 +330,7 @@ torchx==0.6.0
 tqdm==4.66.1
     # via
     #   -r benchmarks/dlrm/requirements.in
-    #   -r benchmarks/torchvision/requirements.in
+    #   -r benchmarks/flops/requirements.in
     #   datasets
     #   deepspeed
     #   evaluate
@@ -346,7 +346,6 @@ typing-extensions==4.8.0
     # via
     #   huggingface-hub
     #   lightning-utilities
-    #   onnx
     #   pydantic
     #   pyre-extensions
     #   pytorch-lightning
@@ -365,10 +364,10 @@ urllib3==1.26.18
     #   torchx
 varname==0.10.0
     # via giving
-voir @ git+https://github.com/breuleux/voir.git
+voir==0.2.11
     # via
-    #   -r benchmarks/stargan/requirements.in
-    #   -r benchmarks/timm/requirements.in
+    #   -r benchmarks/accelerate_opt/requirements.in
+    #   -r benchmarks/flops/requirements.in
 websocket-client==1.6.4
     # via docker
 werkzeug==3.0.1
diff --git a/benchmarks/accelerate_opt/requirements.cuda.txt b/benchmarks/accelerate_opt/requirements.cuda.txt
index 563d7a85b..552fcc115 100644
--- a/benchmarks/accelerate_opt/requirements.cuda.txt
+++ b/benchmarks/accelerate_opt/requirements.cuda.txt
@@ -6,7 +6,7 @@
 #
 --extra-index-url https://download.pytorch.org/whl/cu118
 
-accelerate==0.24.0
+accelerate==0.24.1
     # via -r benchmarks/accelerate_opt/requirements.in
 aiohttp==3.8.6
     # via
@@ -37,7 +37,7 @@ certifi==2023.7.22
     # via
     #   -c .pin/../.pin/constraints-cuda-torch.txt
     #   requests
-charset-normalizer==3.3.1
+charset-normalizer==3.3.2
     # via
     #   -c .pin/../.pin/constraints-cuda-torch.txt
     #   aiohttp
@@ -64,7 +64,7 @@ executing==1.2.0
     # via
     #   -c .pin/../.pin/constraints-cuda-torch.txt
     #   varname
-filelock==3.12.4
+filelock==3.13.1
     # via
     #   -c .pin/../.pin/constraints-cuda-torch.txt
     #   huggingface-hub
@@ -135,7 +135,7 @@ multiprocess==0.70.15
     #   -c .pin/../.pin/constraints-cuda-torch.txt
     #   datasets
     #   evaluate
-networkx==3.2
+networkx==3.2.1
     # via
     #   -c .pin/../.pin/constraints-cuda-torch.txt
     #   torch
@@ -171,7 +171,7 @@ packaging==23.2
     #   evaluate
     #   huggingface-hub
     #   transformers
-pandas==2.1.1
+pandas==2.1.2
     # via
     #   -c .pin/../.pin/constraints-cuda-torch.txt
     #   datasets
@@ -193,7 +193,7 @@ py-cpuinfo==9.0.0
     # via
     #   -c .pin/../.pin/constraints-cuda-torch.txt
     #   deepspeed
-pyarrow==13.0.0
+pyarrow==14.0.0
     # via
     #   -c .pin/../.pin/constraints-cuda-torch.txt
     #   datasets
@@ -313,7 +313,7 @@ varname==0.10.0
     # via
     #   -c .pin/../.pin/constraints-cuda-torch.txt
     #   giving
-voir @ git+https://github.com/breuleux/voir.git
+voir==0.2.11
     # via -r benchmarks/accelerate_opt/requirements.in
 xxhash==3.4.1
     # via
diff --git a/benchmarks/accelerate_opt/requirements.rocm.txt b/benchmarks/accelerate_opt/requirements.rocm.txt
index c6c6f501d..8fc0ca376 100644
--- a/benchmarks/accelerate_opt/requirements.rocm.txt
+++ b/benchmarks/accelerate_opt/requirements.rocm.txt
@@ -6,7 +6,7 @@
 #
 --extra-index-url https://download.pytorch.org/whl/rocm5.6/
 
-accelerate==0.24.0
+accelerate==0.24.1
     # via -r benchmarks/accelerate_opt/requirements.in
 aiohttp==3.8.6
     # via
@@ -37,7 +37,7 @@ certifi==2023.7.22
     # via
     #   -c .pin/../.pin/constraints-rocm-torch.txt
     #   requests
-charset-normalizer==3.3.1
+charset-normalizer==3.3.2
     # via
     #   -c .pin/../.pin/constraints-rocm-torch.txt
     #   aiohttp
@@ -68,7 +68,7 @@ executing==1.2.0
     # via
     #   -c .pin/../.pin/constraints-rocm-torch.txt
     #   varname
-filelock==3.12.4
+filelock==3.13.1
     # via
     #   -c .pin/../.pin/constraints-rocm-torch.txt
     #   huggingface-hub
@@ -113,7 +113,7 @@ jinja2==3.1.2
     # via
     #   -c .pin/../.pin/constraints-rocm-torch.txt
     #   torch
-lit==17.0.3
+lit==17.0.4
     # via
     #   -c .pin/../.pin/constraints-rocm-torch.txt
     #   pytorch-triton-rocm
@@ -143,7 +143,7 @@ multiprocess==0.70.15
     #   -c .pin/../.pin/constraints-rocm-torch.txt
     #   datasets
     #   evaluate
-networkx==3.2
+networkx==3.2.1
     # via
     #   -c .pin/../.pin/constraints-rocm-torch.txt
     #   torch
@@ -179,7 +179,7 @@ packaging==23.2
     #   evaluate
     #   huggingface-hub
     #   transformers
-pandas==2.1.1
+pandas==2.1.2
     # via
     #   -c .pin/../.pin/constraints-rocm-torch.txt
     #   datasets
@@ -201,7 +201,7 @@ py-cpuinfo==9.0.0
     # via
     #   -c .pin/../.pin/constraints-rocm-torch.txt
     #   deepspeed
-pyarrow==13.0.0
+pyarrow==14.0.0
     # via
     #   -c .pin/../.pin/constraints-rocm-torch.txt
     #   datasets
@@ -322,7 +322,7 @@ varname==0.10.0
     # via
     #   -c .pin/../.pin/constraints-rocm-torch.txt
     #   giving
-voir @ git+https://github.com/breuleux/voir.git
+voir==0.2.11
     # via -r benchmarks/accelerate_opt/requirements.in
 xxhash==3.4.1
     # via
diff --git a/benchmarks/dlrm/requirements.cuda.txt b/benchmarks/dlrm/requirements.cuda.txt
index 438da3a4d..8e1993d6a 100644
--- a/benchmarks/dlrm/requirements.cuda.txt
+++ b/benchmarks/dlrm/requirements.cuda.txt
@@ -26,7 +26,7 @@ certifi==2023.7.22
     # via
     #   -c .pin/../.pin/constraints-cuda-torch.txt
     #   requests
-charset-normalizer==3.3.1
+charset-normalizer==3.3.2
     # via
     #   -c .pin/../.pin/constraints-cuda-torch.txt
     #   requests
@@ -50,7 +50,7 @@ fbgemm-gpu==0.5.0+cu118
     # via
     #   -c .pin/../.pin/constraints-cuda-torch.txt
     #   torchrec
-filelock==3.12.4
+filelock==3.13.1
     # via
     #   -c .pin/../.pin/constraints-cuda-torch.txt
     #   torch
@@ -68,7 +68,7 @@ giving==0.4.2
     #   -c .pin/../.pin/constraints-cuda-torch.txt
     #   ptera
     #   voir
-google-auth==2.23.3
+google-auth==2.23.4
     # via
     #   -c .pin/../.pin/constraints-cuda-torch.txt
     #   google-auth-oauthlib
@@ -81,7 +81,7 @@ graphviz==0.20.1
     # via
     #   -c .pin/../.pin/constraints-cuda-torch.txt
     #   torchviz
-grpcio==1.59.0
+grpcio==1.59.2
     # via
     #   -c .pin/../.pin/constraints-cuda-torch.txt
     #   tensorboard
@@ -105,7 +105,7 @@ lightning-utilities==0.9.0
     # via
     #   -c .pin/../.pin/constraints-cuda-torch.txt
     #   torchmetrics
-markdown==3.5
+markdown==3.5.1
     # via
     #   -c .pin/../.pin/constraints-cuda-torch.txt
     #   tensorboard
@@ -130,7 +130,7 @@ mypy-extensions==1.0.0
     # via
     #   -c .pin/../.pin/constraints-cuda-torch.txt
     #   typing-inspect
-networkx==3.2
+networkx==3.2.1
     # via
     #   -c .pin/../.pin/constraints-cuda-torch.txt
     #   torch
@@ -150,7 +150,7 @@ omegaconf==2.3.0
     # via
     #   -c .pin/../.pin/constraints-cuda-torch.txt
     #   voir
-onnx==1.14.1
+onnx==1.15.0
     # via -r benchmarks/dlrm/requirements.in
 ovld==0.3.2
     # via
@@ -281,7 +281,6 @@ typing-extensions==4.8.0
     # via
     #   -c .pin/../.pin/constraints-cuda-torch.txt
     #   lightning-utilities
-    #   onnx
     #   pyre-extensions
     #   reactivex
     #   torch
@@ -300,7 +299,7 @@ varname==0.10.0
     # via
     #   -c .pin/../.pin/constraints-cuda-torch.txt
     #   giving
-voir @ git+https://github.com/breuleux/voir.git
+voir==0.2.11
     # via -r benchmarks/dlrm/requirements.in
 websocket-client==1.6.4
     # via
diff --git a/benchmarks/dlrm/requirements.rocm.txt b/benchmarks/dlrm/requirements.rocm.txt
index 4ce758a16..fc2a93ad0 100644
--- a/benchmarks/dlrm/requirements.rocm.txt
+++ b/benchmarks/dlrm/requirements.rocm.txt
@@ -26,7 +26,7 @@ certifi==2023.7.22
     # via
     #   -c .pin/../.pin/constraints-rocm-torch.txt
     #   requests
-charset-normalizer==3.3.1
+charset-normalizer==3.3.2
     # via
     #   -c .pin/../.pin/constraints-rocm-torch.txt
     #   requests
@@ -54,7 +54,7 @@ fbgemm-gpu==0.5.0
     # via
     #   -c .pin/../.pin/constraints-rocm-torch.txt
     #   torchrec
-filelock==3.12.4
+filelock==3.13.1
     # via
     #   -c .pin/../.pin/constraints-rocm-torch.txt
     #   pytorch-triton-rocm
@@ -72,7 +72,7 @@ giving==0.4.2
     #   -c .pin/../.pin/constraints-rocm-torch.txt
     #   ptera
     #   voir
-google-auth==2.23.3
+google-auth==2.23.4
     # via
     #   -c .pin/../.pin/constraints-rocm-torch.txt
     #   google-auth-oauthlib
@@ -85,7 +85,7 @@ graphviz==0.20.1
     # via
     #   -c .pin/../.pin/constraints-rocm-torch.txt
     #   torchviz
-grpcio==1.59.0
+grpcio==1.59.2
     # via
     #   -c .pin/../.pin/constraints-rocm-torch.txt
     #   tensorboard
@@ -109,11 +109,11 @@ lightning-utilities==0.9.0
     # via
     #   -c .pin/../.pin/constraints-rocm-torch.txt
     #   torchmetrics
-lit==17.0.3
+lit==17.0.4
     # via
     #   -c .pin/../.pin/constraints-rocm-torch.txt
     #   pytorch-triton-rocm
-markdown==3.5
+markdown==3.5.1
     # via
     #   -c .pin/../.pin/constraints-rocm-torch.txt
     #   tensorboard
@@ -138,7 +138,7 @@ mypy-extensions==1.0.0
     # via
     #   -c .pin/../.pin/constraints-rocm-torch.txt
     #   typing-inspect
-networkx==3.2
+networkx==3.2.1
     # via
     #   -c .pin/../.pin/constraints-rocm-torch.txt
     #   torch
@@ -159,7 +159,7 @@ omegaconf==2.3.0
     # via
     #   -c .pin/../.pin/constraints-rocm-torch.txt
     #   voir
-onnx==1.14.1
+onnx==1.15.0
     # via -r benchmarks/dlrm/requirements.in
 ovld==0.3.2
     # via
@@ -291,7 +291,6 @@ typing-extensions==4.8.0
     # via
     #   -c .pin/../.pin/constraints-rocm-torch.txt
     #   lightning-utilities
-    #   onnx
     #   pyre-extensions
     #   reactivex
     #   torch
@@ -310,7 +309,7 @@ varname==0.10.0
     # via
     #   -c .pin/../.pin/constraints-rocm-torch.txt
     #   giving
-voir @ git+https://github.com/breuleux/voir.git
+voir==0.2.11
     # via -r benchmarks/dlrm/requirements.in
 websocket-client==1.6.4
     # via
diff --git a/benchmarks/flops/activator b/benchmarks/flops/activator
new file mode 100755
index 000000000..083c28cb1
--- /dev/null
+++ b/benchmarks/flops/activator
@@ -0,0 +1,7 @@
+#!/bin/bash
+
+venv="$1"
+shift
+
+source "$venv"/bin/activate
+exec "$@"
diff --git a/benchmarks/flops/benchfile.py b/benchmarks/flops/benchfile.py
new file mode 100644
index 000000000..b00415f0f
--- /dev/null
+++ b/benchmarks/flops/benchfile.py
@@ -0,0 +1,19 @@
+from milabench.pack import Package
+
+
+class FlopsBenchmarch(Package):
+    base_requirements = "requirements.in"
+    prepare_script = "prepare.py"
+    main_script = "main.py"
+    
+    def build_run_plan(self) -> "execs.Executor":
+        import milabench.executors as execs
+        
+        main = self.dirs.code / self.main_script
+        pack = execs.PackExecutor(self, *self.argv, lazy=True)
+        # pack = execs.VoirExecutor(pack, cwd=main.parent)
+        pack = execs.ActivatorExecutor(pack, use_stdout=True)
+        return pack
+    
+
+__pack__ = FlopsBenchmarch
diff --git a/benchmarks/flops/main.py b/benchmarks/flops/main.py
new file mode 100755
index 000000000..d72bf7186
--- /dev/null
+++ b/benchmarks/flops/main.py
@@ -0,0 +1,184 @@
+#!/usr/bin/env python
+
+from argparse import ArgumentParser
+import json
+import time
+import sys
+import multiprocessing
+
+import torch
+
+from voir.smuggle import SmuggleWriter
+from voir.instruments.gpu import get_gpu_info
+from voir.instruments.utils import Monitor
+
+KILO = 1e3
+MEGA = 1e6
+GIGA = 1e9
+TERA = 1e12
+EXA = 1e18
+
+
+def _worker(state, queue, func, delay):
+    import time
+
+    while state['running']:
+        queue.put(func())
+        time.sleep(delay)
+        
+class Monitor:
+    def __init__(self, delay, func):
+        self.manager = multiprocessing.Manager()
+        self.state = self.manager.dict()
+        self.state['running'] = True
+        self.results = multiprocessing.Queue()
+        self.process = multiprocessing.Process(
+            target=_worker, 
+            args=(self.state, self.results, func, delay),
+        )
+        
+    def start(self):
+        self.process.start()
+        
+    def stop(self):
+        self.state['running'] = False
+        self.process.join()
+
+
+def modelflops(model: torch.nn.Module, shape, repeat=10, dtype=torch.float32, unit=TERA):
+    # It is unclear how accurate thop's computation is:
+    # it says it returns MACs, but its method seems wrong.
+    from thop import profile
+    
+    # MAC: Multiply–accumulate operation
+    batch = torch.randn(*shape, dtype=dtype, device="cuda:0")
+
+    flops, _ = profile(model, inputs=(batch,))
+
+    with torch.no_grad():
+        # Prepare
+        torch.cuda.empty_cache()
+
+        batch = batch.cuda()
+        model = model.to(dtype=dtype, device="cuda:0")
+
+        torch.cuda.synchronize()
+
+        # Start
+        start = time.time()
+
+        for i in range(repeat):
+            _ = model(batch)
+
+        torch.cuda.synchronize()
+        end = time.time()
+        # --
+
+    return (flops * repeat) / (end - start) / unit
+
+
+
+def f(N, R=30, m=5000000, n=256, unit=TERA, dtype=torch.float32, log=None):
+    torch.cuda.empty_cache()
+    a = torch.eye(n, dtype=dtype, device="cuda:0")
+    x = torch.randn((m, n), dtype=dtype, device="cuda:0")
+    y = torch.zeros_like(x)
+
+    F = N * (2 * m * n * n + 2 * m * n * n)
+ 
+    for i in range(R): 
+        torch.cuda.synchronize()
+        ts = -time.time()
+        
+        for _ in range(N):
+            # No allocation in main loop using dual-out strategy
+            y = torch.mm(x, a, out=y)
+            x = torch.mm(y, a, out=x)
+        
+        torch.cuda.synchronize()
+        ts += time.time()
+        
+        if log is not None:
+            log({
+                "task": "train",
+                "rate": F / ts / unit,
+                "units": "Tflops"
+            })
+ 
+    torch.cuda.empty_cache()
+
+
+def setupvoir():
+    # Records are printed as JSON through voir's SmuggleWriter wrapping stdout
+    data_file = SmuggleWriter(sys.stdout)
+    # data_file = sys.stdout
+    
+    def log(data):
+        if data_file is not None:
+            data["t"] = time.time()
+            print(json.dumps(data), file=data_file)
+            
+            while not monitor.results.empty():
+                print(json.dumps(monitor.results.get()), file=data_file)
+        
+    def monitor_fn():
+        data = {
+            gpu["device"]: {
+                "memory": [
+                    gpu["memory"]["used"], 
+                    gpu["memory"]["total"],
+                ],
+                "load": gpu["utilization"]["compute"],
+                "temperature": gpu["temperature"],
+                "power": gpu["power"]
+            }
+            for gpu in get_gpu_info()["gpus"].values()
+        }
+        return {"task": "main", "gpudata": data, "t": time.time()}
+        
+    monitor = Monitor(0.5, monitor_fn)
+    monitor.start()
+    return log, monitor
+
+    
+
+def main():
+    dtypes = {
+        'bf16': torch.bfloat16,
+        'fp16': torch.float16,
+        'fp32': torch.float32,
+    }
+        
+    parser = ArgumentParser()
+    parser.add_argument('--repeat', type=int, default=100)
+    parser.add_argument('--number', type=int, default=100)
+    parser.add_argument('--m', type=int, default=256)
+    parser.add_argument('--n', type=int, default=256)
+    parser.add_argument('--dtype', type=str, default='fp32', choices=dtypes.keys())
+    parser.add_argument('--tf32', action='store_true', default=False)
+    
+    args = parser.parse_args()
+
+    torch.backends.cuda.matmul.allow_tf32 = False
+    if args.tf32:
+        torch.backends.cuda.matmul.allow_tf32 = True
+    
+    log, monitor = setupvoir()
+
+    f(
+        args.number,
+        args.repeat,
+        args.m,
+        args.n,
+        TERA,
+        dtypes[args.dtype],
+        log
+    )
+
+    monitor.stop()
+    
+if __name__ == "__main__":
+    main()
+
+
+
diff --git a/benchmarks/flops/prepare.py b/benchmarks/flops/prepare.py
new file mode 100755
index 000000000..4265cc3e6
--- /dev/null
+++ b/benchmarks/flops/prepare.py
@@ -0,0 +1 @@
+#!/usr/bin/env python
diff --git a/benchmarks/flops/requirements.cuda.txt b/benchmarks/flops/requirements.cuda.txt
new file mode 100644
index 000000000..65c830e7a
--- /dev/null
+++ b/benchmarks/flops/requirements.cuda.txt
@@ -0,0 +1,153 @@
+#
+# This file is autogenerated by pip-compile with Python 3.11
+# by the following command:
+#
+#    pip-compile --config=pyproject.toml --output-file=benchmarks/flops/requirements.cuda.txt --resolver=backtracking .pin/tmp-constraints-cuda-flops.txt benchmarks/flops/requirements.in
+#
+--extra-index-url https://download.pytorch.org/whl/cu118
+
+antlr4-python3-runtime==4.9.3
+    # via
+    #   -c .pin/../.pin/constraints-cuda-torch.txt
+    #   omegaconf
+asttokens==2.4.1
+    # via
+    #   -c .pin/../.pin/constraints-cuda-torch.txt
+    #   giving
+certifi==2023.7.22
+    # via
+    #   -c .pin/../.pin/constraints-cuda-torch.txt
+    #   requests
+charset-normalizer==3.3.2
+    # via
+    #   -c .pin/../.pin/constraints-cuda-torch.txt
+    #   requests
+codefind==0.1.3
+    # via
+    #   -c .pin/../.pin/constraints-cuda-torch.txt
+    #   ptera
+executing==1.2.0
+    # via
+    #   -c .pin/../.pin/constraints-cuda-torch.txt
+    #   varname
+filelock==3.13.1
+    # via
+    #   -c .pin/../.pin/constraints-cuda-torch.txt
+    #   torch
+    #   triton
+fsspec==2023.1.0
+    # via
+    #   -c .pin/../.pin/constraints-cuda-torch.txt
+    #   torch
+giving==0.4.2
+    # via
+    #   -c .pin/../.pin/constraints-cuda-torch.txt
+    #   ptera
+    #   voir
+idna==3.4
+    # via
+    #   -c .pin/../.pin/constraints-cuda-torch.txt
+    #   requests
+jinja2==3.1.2
+    # via
+    #   -c .pin/../.pin/constraints-cuda-torch.txt
+    #   torch
+markdown-it-py==3.0.0
+    # via
+    #   -c .pin/../.pin/constraints-cuda-torch.txt
+    #   rich
+markupsafe==2.1.3
+    # via
+    #   -c .pin/../.pin/constraints-cuda-torch.txt
+    #   jinja2
+mdurl==0.1.2
+    # via
+    #   -c .pin/../.pin/constraints-cuda-torch.txt
+    #   markdown-it-py
+mpmath==1.3.0
+    # via
+    #   -c .pin/../.pin/constraints-cuda-torch.txt
+    #   sympy
+networkx==3.2.1
+    # via
+    #   -c .pin/../.pin/constraints-cuda-torch.txt
+    #   torch
+numpy==1.26.1
+    # via
+    #   -c .pin/../.pin/constraints-cuda-torch.txt
+    #   torchvision
+omegaconf==2.3.0
+    # via
+    #   -c .pin/../.pin/constraints-cuda-torch.txt
+    #   voir
+ovld==0.3.2
+    # via
+    #   -c .pin/../.pin/constraints-cuda-torch.txt
+    #   voir
+pillow==10.1.0
+    # via
+    #   -c .pin/../.pin/constraints-cuda-torch.txt
+    #   torchvision
+ptera==1.4.1
+    # via
+    #   -c .pin/../.pin/constraints-cuda-torch.txt
+    #   voir
+pygments==2.16.1
+    # via
+    #   -c .pin/../.pin/constraints-cuda-torch.txt
+    #   rich
+pynvml==11.5.0
+    # via
+    #   -c .pin/../.pin/constraints-cuda-torch.txt
+    #   voir
+pyyaml==6.0.1
+    # via
+    #   -c .pin/../.pin/constraints-cuda-torch.txt
+    #   omegaconf
+reactivex==4.0.4
+    # via
+    #   -c .pin/../.pin/constraints-cuda-torch.txt
+    #   giving
+requests==2.31.0
+    # via
+    #   -c .pin/../.pin/constraints-cuda-torch.txt
+    #   torchvision
+rich==13.6.0
+    # via
+    #   -c .pin/../.pin/constraints-cuda-torch.txt
+    #   voir
+six==1.16.0
+    # via
+    #   -c .pin/../.pin/constraints-cuda-torch.txt
+    #   asttokens
+sympy==1.12
+    # via
+    #   -c .pin/../.pin/constraints-cuda-torch.txt
+    #   torch
+torch==2.1.0+cu118
+    # via
+    #   -r benchmarks/flops/requirements.in
+    #   torchvision
+torchvision==0.16.0+cu118
+    # via -r benchmarks/flops/requirements.in
+tqdm==4.66.1
+    # via -r benchmarks/flops/requirements.in
+triton==2.1.0
+    # via
+    #   -c .pin/../.pin/constraints-cuda-torch.txt
+    #   torch
+typing-extensions==4.8.0
+    # via
+    #   -c .pin/../.pin/constraints-cuda-torch.txt
+    #   reactivex
+    #   torch
+urllib3==1.26.18
+    # via
+    #   -c .pin/../.pin/constraints-cuda-torch.txt
+    #   requests
+varname==0.10.0
+    # via
+    #   -c .pin/../.pin/constraints-cuda-torch.txt
+    #   giving
+voir==0.2.11
+    # via -r benchmarks/flops/requirements.in
diff --git a/benchmarks/flops/requirements.in b/benchmarks/flops/requirements.in
new file mode 100644
index 000000000..7d30d94e7
--- /dev/null
+++ b/benchmarks/flops/requirements.in
@@ -0,0 +1,4 @@
+torch
+torchvision
+tqdm
+voir
diff --git a/benchmarks/flops/requirements.rocm.txt b/benchmarks/flops/requirements.rocm.txt
new file mode 100644
index 000000000..86e259787
--- /dev/null
+++ b/benchmarks/flops/requirements.rocm.txt
@@ -0,0 +1,162 @@
+#
+# This file is autogenerated by pip-compile with Python 3.11
+# by the following command:
+#
+#    pip-compile --config=pyproject.toml --output-file=benchmarks/flops/requirements.rocm.txt --resolver=backtracking .pin/tmp-constraints-rocm-flops.txt benchmarks/flops/requirements.in
+#
+--extra-index-url https://download.pytorch.org/whl/rocm5.6/
+
+antlr4-python3-runtime==4.9.3
+    # via
+    #   -c .pin/../.pin/constraints-rocm-torch.txt
+    #   omegaconf
+asttokens==2.4.1
+    # via
+    #   -c .pin/../.pin/constraints-rocm-torch.txt
+    #   giving
+certifi==2023.7.22
+    # via
+    #   -c .pin/../.pin/constraints-rocm-torch.txt
+    #   requests
+charset-normalizer==3.3.2
+    # via
+    #   -c .pin/../.pin/constraints-rocm-torch.txt
+    #   requests
+cmake==3.27.7
+    # via
+    #   -c .pin/../.pin/constraints-rocm-torch.txt
+    #   pytorch-triton-rocm
+codefind==0.1.3
+    # via
+    #   -c .pin/../.pin/constraints-rocm-torch.txt
+    #   ptera
+executing==1.2.0
+    # via
+    #   -c .pin/../.pin/constraints-rocm-torch.txt
+    #   varname
+filelock==3.13.1
+    # via
+    #   -c .pin/../.pin/constraints-rocm-torch.txt
+    #   pytorch-triton-rocm
+    #   torch
+fsspec==2023.1.0
+    # via
+    #   -c .pin/../.pin/constraints-rocm-torch.txt
+    #   torch
+giving==0.4.2
+    # via
+    #   -c .pin/../.pin/constraints-rocm-torch.txt
+    #   ptera
+    #   voir
+idna==3.4
+    # via
+    #   -c .pin/../.pin/constraints-rocm-torch.txt
+    #   requests
+jinja2==3.1.2
+    # via
+    #   -c .pin/../.pin/constraints-rocm-torch.txt
+    #   torch
+lit==17.0.4
+    # via
+    #   -c .pin/../.pin/constraints-rocm-torch.txt
+    #   pytorch-triton-rocm
+markdown-it-py==3.0.0
+    # via
+    #   -c .pin/../.pin/constraints-rocm-torch.txt
+    #   rich
+markupsafe==2.1.3
+    # via
+    #   -c .pin/../.pin/constraints-rocm-torch.txt
+    #   jinja2
+mdurl==0.1.2
+    # via
+    #   -c .pin/../.pin/constraints-rocm-torch.txt
+    #   markdown-it-py
+mpmath==1.3.0
+    # via
+    #   -c .pin/../.pin/constraints-rocm-torch.txt
+    #   sympy
+networkx==3.2.1
+    # via
+    #   -c .pin/../.pin/constraints-rocm-torch.txt
+    #   torch
+numpy==1.26.1
+    # via
+    #   -c .pin/../.pin/constraints-rocm-torch.txt
+    #   torchvision
+omegaconf==2.3.0
+    # via
+    #   -c .pin/../.pin/constraints-rocm-torch.txt
+    #   voir
+ovld==0.3.2
+    # via
+    #   -c .pin/../.pin/constraints-rocm-torch.txt
+    #   voir
+pillow==10.1.0
+    # via
+    #   -c .pin/../.pin/constraints-rocm-torch.txt
+    #   torchvision
+ptera==1.4.1
+    # via
+    #   -c .pin/../.pin/constraints-rocm-torch.txt
+    #   voir
+pygments==2.16.1
+    # via
+    #   -c .pin/../.pin/constraints-rocm-torch.txt
+    #   rich
+pynvml==11.5.0
+    # via
+    #   -c .pin/../.pin/constraints-rocm-torch.txt
+    #   voir
+pytorch-triton-rocm==2.1.0
+    # via
+    #   -c .pin/../.pin/constraints-rocm-torch.txt
+    #   torch
+pyyaml==6.0.1
+    # via
+    #   -c .pin/../.pin/constraints-rocm-torch.txt
+    #   omegaconf
+reactivex==4.0.4
+    # via
+    #   -c .pin/../.pin/constraints-rocm-torch.txt
+    #   giving
+requests==2.31.0
+    # via
+    #   -c .pin/../.pin/constraints-rocm-torch.txt
+    #   torchvision
+rich==13.6.0
+    # via
+    #   -c .pin/../.pin/constraints-rocm-torch.txt
+    #   voir
+six==1.16.0
+    # via
+    #   -c .pin/../.pin/constraints-rocm-torch.txt
+    #   asttokens
+sympy==1.12
+    # via
+    #   -c .pin/../.pin/constraints-rocm-torch.txt
+    #   torch
+torch==2.1.0+rocm5.6
+    # via
+    #   -r benchmarks/flops/requirements.in
+    #   pytorch-triton-rocm
+    #   torchvision
+torchvision==0.16.0+rocm5.6
+    # via -r benchmarks/flops/requirements.in
+tqdm==4.66.1
+    # via -r benchmarks/flops/requirements.in
+typing-extensions==4.8.0
+    # via
+    #   -c .pin/../.pin/constraints-rocm-torch.txt
+    #   reactivex
+    #   torch
+urllib3==1.26.18
+    # via
+    #   -c .pin/../.pin/constraints-rocm-torch.txt
+    #   requests
+varname==0.10.0
+    # via
+    #   -c .pin/../.pin/constraints-rocm-torch.txt
+    #   giving
+voir==0.2.11
+    # via -r benchmarks/flops/requirements.in
diff --git a/benchmarks/huggingface/requirements.cuda.txt b/benchmarks/huggingface/requirements.cuda.txt
index ce100130c..d4c17b767 100644
--- a/benchmarks/huggingface/requirements.cuda.txt
+++ b/benchmarks/huggingface/requirements.cuda.txt
@@ -18,7 +18,7 @@ certifi==2023.7.22
     # via
     #   -c .pin/../.pin/constraints-cuda-torch.txt
     #   requests
-charset-normalizer==3.3.1
+charset-normalizer==3.3.2
     # via
     #   -c .pin/../.pin/constraints-cuda-torch.txt
     #   requests
@@ -30,7 +30,7 @@ executing==1.2.0
     # via
     #   -c .pin/../.pin/constraints-cuda-torch.txt
     #   varname
-filelock==3.12.4
+filelock==3.13.1
     # via
     #   -c .pin/../.pin/constraints-cuda-torch.txt
     #   huggingface-hub
@@ -76,7 +76,7 @@ mpmath==1.3.0
     # via
     #   -c .pin/../.pin/constraints-cuda-torch.txt
     #   sympy
-networkx==3.2
+networkx==3.2.1
     # via
     #   -c .pin/../.pin/constraints-cuda-torch.txt
     #   torch
@@ -175,5 +175,5 @@ varname==0.10.0
     # via
     #   -c .pin/../.pin/constraints-cuda-torch.txt
     #   giving
-voir @ git+https://github.com/breuleux/voir.git
+voir==0.2.11
     # via -r benchmarks/huggingface/requirements.in
diff --git a/benchmarks/huggingface/requirements.rocm.txt b/benchmarks/huggingface/requirements.rocm.txt
index 8c1366c32..963defffa 100644
--- a/benchmarks/huggingface/requirements.rocm.txt
+++ b/benchmarks/huggingface/requirements.rocm.txt
@@ -18,7 +18,7 @@ certifi==2023.7.22
     # via
     #   -c .pin/../.pin/constraints-rocm-torch.txt
     #   requests
-charset-normalizer==3.3.1
+charset-normalizer==3.3.2
     # via
     #   -c .pin/../.pin/constraints-rocm-torch.txt
     #   requests
@@ -34,7 +34,7 @@ executing==1.2.0
     # via
     #   -c .pin/../.pin/constraints-rocm-torch.txt
     #   varname
-filelock==3.12.4
+filelock==3.13.1
     # via
     #   -c .pin/../.pin/constraints-rocm-torch.txt
     #   huggingface-hub
@@ -64,7 +64,7 @@ jinja2==3.1.2
     # via
     #   -c .pin/../.pin/constraints-rocm-torch.txt
     #   torch
-lit==17.0.3
+lit==17.0.4
     # via
     #   -c .pin/../.pin/constraints-rocm-torch.txt
     #   pytorch-triton-rocm
@@ -84,7 +84,7 @@ mpmath==1.3.0
     # via
     #   -c .pin/../.pin/constraints-rocm-torch.txt
     #   sympy
-networkx==3.2
+networkx==3.2.1
     # via
     #   -c .pin/../.pin/constraints-rocm-torch.txt
     #   torch
@@ -185,5 +185,5 @@ varname==0.10.0
     # via
     #   -c .pin/../.pin/constraints-rocm-torch.txt
     #   giving
-voir @ git+https://github.com/breuleux/voir.git
+voir==0.2.11
     # via -r benchmarks/huggingface/requirements.in
diff --git a/benchmarks/rwkv/requirements.cuda.txt b/benchmarks/rwkv/requirements.cuda.txt
index 13e39de84..b32c5f2f4 100644
--- a/benchmarks/rwkv/requirements.cuda.txt
+++ b/benchmarks/rwkv/requirements.cuda.txt
@@ -34,7 +34,7 @@ certifi==2023.7.22
     # via
     #   -c .pin/../.pin/constraints-cuda-torch.txt
     #   requests
-charset-normalizer==3.3.1
+charset-normalizer==3.3.2
     # via
     #   -c .pin/../.pin/constraints-cuda-torch.txt
     #   aiohttp
@@ -49,7 +49,7 @@ executing==1.2.0
     # via
     #   -c .pin/../.pin/constraints-cuda-torch.txt
     #   varname
-filelock==3.12.4
+filelock==3.13.1
     # via
     #   -c .pin/../.pin/constraints-cuda-torch.txt
     #   torch
@@ -108,7 +108,7 @@ multidict==6.0.4
     #   -c .pin/../.pin/constraints-cuda-torch.txt
     #   aiohttp
     #   yarl
-networkx==3.2
+networkx==3.2.1
     # via
     #   -c .pin/../.pin/constraints-cuda-torch.txt
     #   torch
@@ -223,7 +223,7 @@ varname==0.10.0
     # via
     #   -c .pin/../.pin/constraints-cuda-torch.txt
     #   giving
-voir @ git+https://github.com/breuleux/voir.git
+voir==0.2.11
     # via -r benchmarks/rwkv/requirements.in
 yarl==1.9.2
     # via
diff --git a/benchmarks/rwkv/requirements.rocm.txt b/benchmarks/rwkv/requirements.rocm.txt
index 1e7449465..bf060a8a4 100644
--- a/benchmarks/rwkv/requirements.rocm.txt
+++ b/benchmarks/rwkv/requirements.rocm.txt
@@ -34,7 +34,7 @@ certifi==2023.7.22
     # via
     #   -c .pin/../.pin/constraints-rocm-torch.txt
     #   requests
-charset-normalizer==3.3.1
+charset-normalizer==3.3.2
     # via
     #   -c .pin/../.pin/constraints-rocm-torch.txt
     #   aiohttp
@@ -53,7 +53,7 @@ executing==1.2.0
     # via
     #   -c .pin/../.pin/constraints-rocm-torch.txt
     #   varname
-filelock==3.12.4
+filelock==3.13.1
     # via
     #   -c .pin/../.pin/constraints-rocm-torch.txt
     #   pytorch-triton-rocm
@@ -91,7 +91,7 @@ lightning-utilities==0.9.0
     #   -c .pin/../.pin/constraints-rocm-torch.txt
     #   pytorch-lightning
     #   torchmetrics
-lit==17.0.3
+lit==17.0.4
     # via
     #   -c .pin/../.pin/constraints-rocm-torch.txt
     #   pytorch-triton-rocm
@@ -116,7 +116,7 @@ multidict==6.0.4
     #   -c .pin/../.pin/constraints-rocm-torch.txt
     #   aiohttp
     #   yarl
-networkx==3.2
+networkx==3.2.1
     # via
     #   -c .pin/../.pin/constraints-rocm-torch.txt
     #   torch
@@ -232,7 +232,7 @@ varname==0.10.0
     # via
     #   -c .pin/../.pin/constraints-rocm-torch.txt
     #   giving
-voir @ git+https://github.com/breuleux/voir.git
+voir==0.2.11
     # via -r benchmarks/rwkv/requirements.in
 yarl==1.9.2
     # via
diff --git a/benchmarks/stargan/requirements.cuda.txt b/benchmarks/stargan/requirements.cuda.txt
index 435795cc8..23572d865 100644
--- a/benchmarks/stargan/requirements.cuda.txt
+++ b/benchmarks/stargan/requirements.cuda.txt
@@ -18,7 +18,7 @@ certifi==2023.7.22
     # via
     #   -c .pin/../.pin/constraints-cuda-torch.txt
     #   requests
-charset-normalizer==3.3.1
+charset-normalizer==3.3.2
     # via
     #   -c .pin/../.pin/constraints-cuda-torch.txt
     #   requests
@@ -30,7 +30,7 @@ executing==1.2.0
     # via
     #   -c .pin/../.pin/constraints-cuda-torch.txt
     #   varname
-filelock==3.12.4
+filelock==3.13.1
     # via
     #   -c .pin/../.pin/constraints-cuda-torch.txt
     #   torch
@@ -68,7 +68,7 @@ mpmath==1.3.0
     # via
     #   -c .pin/../.pin/constraints-cuda-torch.txt
     #   sympy
-networkx==3.2
+networkx==3.2.1
     # via
     #   -c .pin/../.pin/constraints-cuda-torch.txt
     #   torch
@@ -147,5 +147,5 @@ varname==0.10.0
     # via
     #   -c .pin/../.pin/constraints-cuda-torch.txt
     #   giving
-voir @ git+https://github.com/breuleux/voir.git
+voir==0.2.11
     # via -r benchmarks/stargan/requirements.in
diff --git a/benchmarks/stargan/requirements.rocm.txt b/benchmarks/stargan/requirements.rocm.txt
index bbea36f98..1e0f5eccf 100644
--- a/benchmarks/stargan/requirements.rocm.txt
+++ b/benchmarks/stargan/requirements.rocm.txt
@@ -18,7 +18,7 @@ certifi==2023.7.22
     # via
     #   -c .pin/../.pin/constraints-rocm-torch.txt
     #   requests
-charset-normalizer==3.3.1
+charset-normalizer==3.3.2
     # via
     #   -c .pin/../.pin/constraints-rocm-torch.txt
     #   requests
@@ -34,7 +34,7 @@ executing==1.2.0
     # via
     #   -c .pin/../.pin/constraints-rocm-torch.txt
     #   varname
-filelock==3.12.4
+filelock==3.13.1
     # via
     #   -c .pin/../.pin/constraints-rocm-torch.txt
     #   pytorch-triton-rocm
@@ -56,7 +56,7 @@ jinja2==3.1.2
     # via
     #   -c .pin/../.pin/constraints-rocm-torch.txt
     #   torch
-lit==17.0.3
+lit==17.0.4
     # via
     #   -c .pin/../.pin/constraints-rocm-torch.txt
     #   pytorch-triton-rocm
@@ -76,7 +76,7 @@ mpmath==1.3.0
     # via
     #   -c .pin/../.pin/constraints-rocm-torch.txt
     #   sympy
-networkx==3.2
+networkx==3.2.1
     # via
     #   -c .pin/../.pin/constraints-rocm-torch.txt
     #   torch
@@ -156,5 +156,5 @@ varname==0.10.0
     # via
     #   -c .pin/../.pin/constraints-rocm-torch.txt
     #   giving
-voir @ git+https://github.com/breuleux/voir.git
+voir==0.2.11
     # via -r benchmarks/stargan/requirements.in
diff --git a/benchmarks/super-slomo/requirements.cuda.txt b/benchmarks/super-slomo/requirements.cuda.txt
index c4c93813e..657aa0053 100644
--- a/benchmarks/super-slomo/requirements.cuda.txt
+++ b/benchmarks/super-slomo/requirements.cuda.txt
@@ -18,7 +18,7 @@ certifi==2023.7.22
     # via
     #   -c .pin/../.pin/constraints-cuda-torch.txt
     #   requests
-charset-normalizer==3.3.1
+charset-normalizer==3.3.2
     # via
     #   -c .pin/../.pin/constraints-cuda-torch.txt
     #   requests
@@ -30,7 +30,7 @@ executing==1.2.0
     # via
     #   -c .pin/../.pin/constraints-cuda-torch.txt
     #   varname
-filelock==3.12.4
+filelock==3.13.1
     # via
     #   -c .pin/../.pin/constraints-cuda-torch.txt
     #   torch
@@ -68,7 +68,7 @@ mpmath==1.3.0
     # via
     #   -c .pin/../.pin/constraints-cuda-torch.txt
     #   sympy
-networkx==3.2
+networkx==3.2.1
     # via
     #   -c .pin/../.pin/constraints-cuda-torch.txt
     #   torch
@@ -152,5 +152,5 @@ varname==0.10.0
     # via
     #   -c .pin/../.pin/constraints-cuda-torch.txt
     #   giving
-voir @ git+https://github.com/breuleux/voir.git
+voir==0.2.11
     # via -r benchmarks/super-slomo/requirements.in
diff --git a/benchmarks/super-slomo/requirements.rocm.txt b/benchmarks/super-slomo/requirements.rocm.txt
index 30b2bd53c..230461051 100644
--- a/benchmarks/super-slomo/requirements.rocm.txt
+++ b/benchmarks/super-slomo/requirements.rocm.txt
@@ -18,7 +18,7 @@ certifi==2023.7.22
     # via
     #   -c .pin/../.pin/constraints-rocm-torch.txt
     #   requests
-charset-normalizer==3.3.1
+charset-normalizer==3.3.2
     # via
     #   -c .pin/../.pin/constraints-rocm-torch.txt
     #   requests
@@ -34,7 +34,7 @@ executing==1.2.0
     # via
     #   -c .pin/../.pin/constraints-rocm-torch.txt
     #   varname
-filelock==3.12.4
+filelock==3.13.1
     # via
     #   -c .pin/../.pin/constraints-rocm-torch.txt
     #   pytorch-triton-rocm
@@ -56,7 +56,7 @@ jinja2==3.1.2
     # via
     #   -c .pin/../.pin/constraints-rocm-torch.txt
     #   torch
-lit==17.0.3
+lit==17.0.4
     # via
     #   -c .pin/../.pin/constraints-rocm-torch.txt
     #   pytorch-triton-rocm
@@ -76,7 +76,7 @@ mpmath==1.3.0
     # via
     #   -c .pin/../.pin/constraints-rocm-torch.txt
     #   sympy
-networkx==3.2
+networkx==3.2.1
     # via
     #   -c .pin/../.pin/constraints-rocm-torch.txt
     #   torch
@@ -161,5 +161,5 @@ varname==0.10.0
     # via
     #   -c .pin/../.pin/constraints-rocm-torch.txt
     #   giving
-voir @ git+https://github.com/breuleux/voir.git
+voir==0.2.11
     # via -r benchmarks/super-slomo/requirements.in
diff --git a/benchmarks/timm/requirements.cuda.txt b/benchmarks/timm/requirements.cuda.txt
index 7a66f12e8..619a16aba 100644
--- a/benchmarks/timm/requirements.cuda.txt
+++ b/benchmarks/timm/requirements.cuda.txt
@@ -18,7 +18,7 @@ certifi==2023.7.22
     # via
     #   -c .pin/../.pin/constraints-cuda-torch.txt
     #   requests
-charset-normalizer==3.3.1
+charset-normalizer==3.3.2
     # via
     #   -c .pin/../.pin/constraints-cuda-torch.txt
     #   requests
@@ -30,7 +30,7 @@ executing==1.2.0
     # via
     #   -c .pin/../.pin/constraints-cuda-torch.txt
     #   varname
-filelock==3.12.4
+filelock==3.13.1
     # via
     #   -c .pin/../.pin/constraints-cuda-torch.txt
     #   huggingface-hub
@@ -72,7 +72,7 @@ mpmath==1.3.0
     # via
     #   -c .pin/../.pin/constraints-cuda-torch.txt
     #   sympy
-networkx==3.2
+networkx==3.2.1
     # via
     #   -c .pin/../.pin/constraints-cuda-torch.txt
     #   torch
@@ -164,5 +164,5 @@ varname==0.10.0
     # via
     #   -c .pin/../.pin/constraints-cuda-torch.txt
     #   giving
-voir @ git+https://github.com/breuleux/voir.git
+voir==0.2.11
     # via -r benchmarks/timm/requirements.in
diff --git a/benchmarks/timm/requirements.rocm.txt b/benchmarks/timm/requirements.rocm.txt
index e53283a33..6c09c25b0 100644
--- a/benchmarks/timm/requirements.rocm.txt
+++ b/benchmarks/timm/requirements.rocm.txt
@@ -18,7 +18,7 @@ certifi==2023.7.22
     # via
     #   -c .pin/../.pin/constraints-rocm-torch.txt
     #   requests
-charset-normalizer==3.3.1
+charset-normalizer==3.3.2
     # via
     #   -c .pin/../.pin/constraints-rocm-torch.txt
     #   requests
@@ -34,7 +34,7 @@ executing==1.2.0
     # via
     #   -c .pin/../.pin/constraints-rocm-torch.txt
     #   varname
-filelock==3.12.4
+filelock==3.13.1
     # via
     #   -c .pin/../.pin/constraints-rocm-torch.txt
     #   huggingface-hub
@@ -60,7 +60,7 @@ jinja2==3.1.2
     # via
     #   -c .pin/../.pin/constraints-rocm-torch.txt
     #   torch
-lit==17.0.3
+lit==17.0.4
     # via
     #   -c .pin/../.pin/constraints-rocm-torch.txt
     #   pytorch-triton-rocm
@@ -80,7 +80,7 @@ mpmath==1.3.0
     # via
     #   -c .pin/../.pin/constraints-rocm-torch.txt
     #   sympy
-networkx==3.2
+networkx==3.2.1
     # via
     #   -c .pin/../.pin/constraints-rocm-torch.txt
     #   torch
@@ -173,5 +173,5 @@ varname==0.10.0
     # via
     #   -c .pin/../.pin/constraints-rocm-torch.txt
     #   giving
-voir @ git+https://github.com/breuleux/voir.git
+voir==0.2.11
     # via -r benchmarks/timm/requirements.in
diff --git a/benchmarks/torchvision/requirements.cuda.txt b/benchmarks/torchvision/requirements.cuda.txt
index bb149eec3..e9740262c 100644
--- a/benchmarks/torchvision/requirements.cuda.txt
+++ b/benchmarks/torchvision/requirements.cuda.txt
@@ -18,7 +18,7 @@ certifi==2023.7.22
     # via
     #   -c .pin/../.pin/constraints-cuda-torch.txt
     #   requests
-charset-normalizer==3.3.1
+charset-normalizer==3.3.2
     # via
     #   -c .pin/../.pin/constraints-cuda-torch.txt
     #   requests
@@ -30,7 +30,7 @@ executing==1.2.0
     # via
     #   -c .pin/../.pin/constraints-cuda-torch.txt
     #   varname
-filelock==3.12.4
+filelock==3.13.1
     # via
     #   -c .pin/../.pin/constraints-cuda-torch.txt
     #   torch
@@ -68,7 +68,7 @@ mpmath==1.3.0
     # via
     #   -c .pin/../.pin/constraints-cuda-torch.txt
     #   sympy
-networkx==3.2
+networkx==3.2.1
     # via
     #   -c .pin/../.pin/constraints-cuda-torch.txt
     #   torch
@@ -149,5 +149,5 @@ varname==0.10.0
     # via
     #   -c .pin/../.pin/constraints-cuda-torch.txt
     #   giving
-voir @ git+https://github.com/breuleux/voir.git
+voir==0.2.11
     # via -r benchmarks/torchvision/requirements.in
diff --git a/benchmarks/torchvision/requirements.rocm.txt b/benchmarks/torchvision/requirements.rocm.txt
index 58fbe4cd8..40a8ade9b 100644
--- a/benchmarks/torchvision/requirements.rocm.txt
+++ b/benchmarks/torchvision/requirements.rocm.txt
@@ -18,7 +18,7 @@ certifi==2023.7.22
     # via
     #   -c .pin/../.pin/constraints-rocm-torch.txt
     #   requests
-charset-normalizer==3.3.1
+charset-normalizer==3.3.2
     # via
     #   -c .pin/../.pin/constraints-rocm-torch.txt
     #   requests
@@ -34,7 +34,7 @@ executing==1.2.0
     # via
     #   -c .pin/../.pin/constraints-rocm-torch.txt
     #   varname
-filelock==3.12.4
+filelock==3.13.1
     # via
     #   -c .pin/../.pin/constraints-rocm-torch.txt
     #   pytorch-triton-rocm
@@ -56,7 +56,7 @@ jinja2==3.1.2
     # via
     #   -c .pin/../.pin/constraints-rocm-torch.txt
     #   torch
-lit==17.0.3
+lit==17.0.4
     # via
     #   -c .pin/../.pin/constraints-rocm-torch.txt
     #   pytorch-triton-rocm
@@ -76,7 +76,7 @@ mpmath==1.3.0
     # via
     #   -c .pin/../.pin/constraints-rocm-torch.txt
     #   sympy
-networkx==3.2
+networkx==3.2.1
     # via
     #   -c .pin/../.pin/constraints-rocm-torch.txt
     #   torch
@@ -158,5 +158,5 @@ varname==0.10.0
     # via
     #   -c .pin/../.pin/constraints-rocm-torch.txt
     #   giving
-voir @ git+https://github.com/breuleux/voir.git
+voir==0.2.11
     # via -r benchmarks/torchvision/requirements.in
diff --git a/config/base.yaml b/config/base.yaml
index 109d3ee10..40595c674 100644
--- a/config/base.yaml
+++ b/config/base.yaml
@@ -23,6 +23,22 @@ _torchvision:
     --no-stdout: true
     --epochs: 50
 
+_flops:  # base entry inherited by the fp16, bf16, tf32 and fp32 benchmarks
+  inherits: _defaults
+  definition: ../benchmarks/flops
+  group: flops
+  install_group: torch
+  plan:
+    method: per_gpu
+
+  tags:
+    - diagnostic
+    - flops
+
+  argv:
+    --number: 10
+    --repeat: 90
+
 _hf:
   inherits: _defaults
   definition: ../benchmarks/huggingface
@@ -86,6 +102,44 @@ _accelerate_opt:
   use_deepspeed: true
   num_machines: 1
 
+
+fp16:  # sets --number to 30 (the _flops entry uses 10)
+  inherits: _flops
+
+  argv:
+    --number: 30
+    --repeat: 90
+    --m: 8192
+    --n: 8192
+    --dtype: fp16
+
+
+bf16:
+  inherits: _flops
+
+  argv:
+    --m: 8192
+    --n: 8192
+    --dtype: bf16
+
+tf32:  # same --dtype as fp32, with --tf32 switched on
+  inherits: _flops
+
+  argv:
+    --m: 8192
+    --n: 8192
+    --dtype: fp32
+    --tf32: true
+
+fp32:
+  inherits: _flops
+
+  argv:
+    --m: 8192
+    --n: 8192
+    --dtype: fp32
+
+
 resnet50:
   inherits: _torchvision
   tags:
@@ -100,7 +154,7 @@ resnet50:
 
 efficientnet_b4:
   inherits: _torchvision
-  tags:
+
   tags:
     - vision
     - classification
diff --git a/config/standard.yaml b/config/standard.yaml
index 809f0a134..2b2363934 100644
--- a/config/standard.yaml
+++ b/config/standard.yaml
@@ -121,6 +121,22 @@ rwkv:
   enabled: true
   weight: 1.0
 
+fp16:
+  enabled: true
+  weight: 0.0  # NOTE(review): presumably keeps the four flops entries out of the weighted score - confirm
+
+bf16:
+  enabled: true
+  weight: 0.0
+
+tf32:
+  enabled: true
+  weight: 0.0
+
+fp32:
+  enabled: true
+  weight: 0.0
+
 ##################
 # Disabled tests #
 ##################
diff --git a/constraints/cuda.txt b/constraints/cuda.txt
index c21d70b5e..cb2bbd770 100644
--- a/constraints/cuda.txt
+++ b/constraints/cuda.txt
@@ -1,2 +1,2 @@
 --extra-index-url https://download.pytorch.org/whl/cu118
-voir @ git+https://github.com/breuleux/voir.git
+voir > 0.2.10
diff --git a/constraints/rocm.txt b/constraints/rocm.txt
index 07a8feac5..9b46f6813 100644
--- a/constraints/rocm.txt
+++ b/constraints/rocm.txt
@@ -1,2 +1,2 @@
 --extra-index-url https://download.pytorch.org/whl/rocm5.6/
-voir @ git+https://github.com/breuleux/voir.git
+voir > 0.2.10
diff --git a/milabench/_version.py b/milabench/_version.py
index 8fc180822..f6782f930 100644
--- a/milabench/_version.py
+++ b/milabench/_version.py
@@ -1,5 +1,5 @@
 """This file is generated, do not modify"""
 
-__tag__ = "v0.0.6-43-g89f56f6"
-__commit__ = "89f56f670db0f22880d057262a320c935c217d77"
-__date__ = "2023-10-19 19:29:36 -0400"
+__tag__ = "v0.0.6-33-ga23bd12"
+__commit__ = "a23bd123851402b63116293dc7634b231b7e21b4"
+__date__ = "2023-10-31 13:04:11 -0400"
diff --git a/milabench/dashboard/__init__.py b/milabench/dashboard/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/milabench/dashboard/live_report.py b/milabench/dashboard/live_report.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/milabench/dashboard/rawoutput.py b/milabench/dashboard/rawoutput.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/milabench/executors.py b/milabench/executors.py
index 41d308919..b109c896b 100644
--- a/milabench/executors.py
+++ b/milabench/executors.py
@@ -630,6 +630,20 @@ def __init__(self, executor: Executor, gpus: list = None, **kwargs) -> None:
         super().__init__(*executors, **kwargs)
 
 
+class ActivatorExecutor(SingleCmdExecutor):
+    """Executor whose command line is the pack's `activator` script and venv directory."""
+
+    def __init__(self, pack: pack.BasePackage, **kwargs):
+        super().__init__(pack, **kwargs)
+
+    def _argv(self, **_) -> List:
+        """Return `[<pack code dir>/activator, <pack venv dir>]` as strings."""
+        return [
+            f"{self.pack.dirs.code / 'activator'}",
+            f"{self.pack.dirs.venv}"
+        ]
+
+
 # Accelerate
 class AccelerateLaunchExecutor(SingleCmdExecutor):
     """Execute a `BasePackage` with Accelerate
diff --git a/milabench/schedule.py b/milabench/schedule.py
index c572e0e38..21f4b25ed 100644
--- a/milabench/schedule.py
+++ b/milabench/schedule.py
@@ -23,10 +23,12 @@ def println(line):
     ) as process:
         def readoutput():
             process.stdout.flush()
-            line = process.stdout.readline()
+            # Iterate the stream itself to get whole lines; iterating the
+            # string returned by readline() would yield single characters.
+            for line in process.stdout:
 
-            if callback:
-                callback(line)
+                if callback:
+                    callback(line)
 
         try:
             while process.poll() is None:
@@ -130,10 +132,12 @@ def launch_milabench(args, sbatch_args=None, dry: bool = False, sync: bool = Fal
     sbatch_script = importlib_resources.files(__name__) / "scripts" / "milabench_run.bash"
     sbatch_script = str(sbatch_script)
 
+    # salloc --gres=gpu:rtx8000:1 --mem=64G --cpus-per-gpu=4
+
     if sbatch_args is None:
         sbatch_args = [
             "--ntasks=1",
-            "--gpus-per-task=rtx8000:1",
+            "--gpus-per-task=4g.40gb:1",
             "--cpus-per-task=4",
             "--time=01:30:00",
             "--ntasks-per-node=1",
diff --git a/milabench/scripts/milabench_docker.bash b/milabench/scripts/milabench_docker.bash
new file mode 100644
index 000000000..7a9bfcc19
--- /dev/null
+++ b/milabench/scripts/milabench_docker.bash
@@ -0,0 +1,5 @@
+#!/bin/bash
+
+
+# CPU only
+
diff --git a/milabench/scripts/milabench_run.bash b/milabench/scripts/milabench_run.bash
index cf502cbf4..693a80139 100755
--- a/milabench/scripts/milabench_run.bash
+++ b/milabench/scripts/milabench_run.bash
@@ -17,11 +17,14 @@ ARCH="cuda"
 PYTHON="3.9"
 BRANCH="master"
 ORIGIN="https://github.com/mila-iqia/milabench.git"
-CONFIG="$SLURM_TMPDIR/milabench/config/standard.yaml"
-BASE="$SLURM_TMPDIR/base"
+LOC="$SLURM_TMPDIR"
+CONFIG="$LOC/milabench/config/standard.yaml"
+BASE="$LOC/base"
 ENV="./env"
 REMAINING_ARGS=""
 
-while getopts ":hm:p:e:b:o:c:" opt; do
+# "a:" and "l:" have to be in the option string, otherwise getopts never
+# dispatches to the a) and l) arms of the case statement.
+while getopts ":hm:p:e:b:o:c:a:l:" opt; do
   case $opt in
     h)
@@ -45,6 +48,12 @@ while getopts ":hm:p:e:b:o:c:" opt; do
     a)
         ARCH="$OPTARG"
         ;;
+    l)
+        # FIXME: re-derives CONFIG and BASE, so -l overrides an earlier -c; pass -l first.
+        LOC="$OPTARG"
+        CONFIG="$LOC/milabench/config/standard.yaml"
+        BASE="$LOC/base"
+        ;;
     :)
         echo "Option -$OPTARG requires an argument." >&2
         usage
@@ -72,7 +81,7 @@ if [ -e $HOME/.credentials.env ]; then
   source $HOME/.credentials.env
 fi
 
-cd $SLURM_TMPDIR
+cd "$LOC" || exit 1
 #
 #   Create a new environment
 #
@@ -97,7 +106,7 @@ export MILABENCH_CONFIG=$CONFIG
 git clone --single-branch --depth 1 -b $BRANCH $ORIGIN
 python -m pip install -e ./milabench
 
-SYSTEM="$SLURM_TMPDIR/system.yaml"
+SYSTEM="$LOC/system.yaml"
 
 echo ""
 echo "System"
diff --git a/milabench/scripts/setup.bash b/milabench/scripts/setup.bash
new file mode 100644
index 000000000..dd3e3f496
--- /dev/null
+++ b/milabench/scripts/setup.bash
@@ -0,0 +1,127 @@
+#!/bin/bash
+#
+#   Prepare a milabench environment inside $SLURM_TMPDIR:
+#   parse the options, activate (creating it if needed) the conda
+#   environment, export the cache and MILABENCH_* variables, clone and
+#   install milabench, save `milabench slurm_system` to system.yaml and
+#   load the gcc and cuda modules.
+#
+
+# Print the option summary and exit with status 1.
+function usage() {
+  echo "Usage: $0 [-m] [-p]"
+  echo "  -h              Display this help message."
+  echo "  -a ARCH         GPU arch           (default: cuda)"
+  echo "  -b BRANCH       Branch to checkout (default: master)"
+  echo "  -o ORIGIN       Origin to use      (default: github/mila/milabench)"
+  echo "  -c CONFIG       Configuration      (default: milabench/config/standard.yaml)"
+  echo "  -e ENV          Environment        (default: ./env)"
+  echo "  -p PYTHON       Python version     (default: 3.9)"
+  echo "  ARGUMENT        Any additional argument you want to process."
+  exit 1
+}
+
+ARCH="cuda"
+PYTHON="3.9"
+BRANCH="master"
+ORIGIN="https://github.com/mila-iqia/milabench.git"
+CONFIG="$SLURM_TMPDIR/milabench/config/standard.yaml"
+BASE="$SLURM_TMPDIR/base"
+ENV="./env"
+REMAINING_ARGS=""
+
+# The leading ":" selects silent error reporting (missing arguments reach the
+# ":" arm, unknown options the "?" arm). "a:" must be listed for the a) arm to
+# run; "m:" is accepted but has no arm, so its value is read and dropped.
+while getopts ":hm:p:e:b:o:c:a:" opt; do
+  case $opt in
+    h)
+      usage
+      ;;
+    p)
+        PYTHON="$OPTARG"
+        ;;
+    b)
+        BRANCH="$OPTARG"
+        ;;
+    o)
+        ORIGIN="$OPTARG"
+        ;;
+    c)
+        CONFIG="$OPTARG"
+        ;;
+    e)
+        ENV="$OPTARG"
+        ;;
+    a)
+        ARCH="$OPTARG"
+        ;;
+    :)
+        echo "Option -$OPTARG requires an argument." >&2
+        usage
+        ;;
+    \?)
+        # Unknown options were silently skipped before; keep going but say so.
+        echo "Unknown option -$OPTARG ignored." >&2
+        ;;
+  esac
+done
+
+shift "$((OPTIND-1))"
+# "$*" joins the leftover arguments into a single string for display.
+REMAINING_ARGS="$*"
+
+echo "  PYTHON: $PYTHON"
+echo "  branch: $BRANCH"
+echo "  origin: $ORIGIN"
+echo "  config: $CONFIG"
+echo "     env: $ENV"
+echo "    args: $REMAINING_ARGS"
+#
+#   Fix problem with conda saying it is not "init properly"
+#
+CONDA_EXEC="$(which conda)"
+CONDA_BASE=$(dirname "$CONDA_EXEC")
+source "$CONDA_BASE/../etc/profile.d/conda.sh"
+
+if [ -e "$HOME/.credentials.env" ]; then
+  source "$HOME/.credentials.env"
+fi
+
+cd "$SLURM_TMPDIR"
+#
+#   Create a new environment
+#
+if [ ! -d "$ENV" ] && [ "$ENV" != "base" ] && [ ! -d "$CONDA_ENVS/$ENV" ]; then
+     conda create --prefix "$ENV" python="$PYTHON" -y
+fi
+conda activate "$ENV"
+
+export HF_HOME="$BASE/cache"
+export HF_DATASETS_CACHE="$BASE/cache"
+export TORCH_HOME="$BASE/cache"
+export XDG_CACHE_HOME="$BASE/cache"
+export MILABENCH_GPU_ARCH="$ARCH"
+
+export MILABENCH_DASH=no
+export PYTHONUNBUFFERED=1
+export MILABENCH_BASE="$BASE"
+export MILABENCH_CONFIG="$CONFIG"
+
+#
+# Fetch the repo
+#
+git clone --single-branch --depth 1 -b "$BRANCH" "$ORIGIN"
+python -m pip install -e ./milabench
+
+SYSTEM="$SLURM_TMPDIR/system.yaml"
+
+echo ""
+echo "System"
+echo "------"
+
+milabench slurm_system
+milabench slurm_system > "$SYSTEM"
+
+module load gcc/9.3.0
+module load cuda/11.8