From 397f127b6f5c7b5583f6184b64f5fb54d8a16b9d Mon Sep 17 00:00:00 2001
From: "pierre.delaunay"
Date: Tue, 17 Sep 2024 23:20:39 -0400
Subject: [PATCH] Add scaling config

---
 benchmarks/brax/main.py            |   3 +
 benchmarks/brax/voirfile.py        |   4 +-
 benchmarks/lightning/main.py       |   8 +-
 benchmarks/torchvision/voirfile.py |   2 +-
 benchmate/benchmate/monitor.py     |   4 +-
 config/base.yaml                   |   8 ++
 config/scaling.yaml                | 115 +++++++++++++++++++----------
 milabench/cli/list.py              |   9 ++-
 scripts/article/run_cuda.sh        |  18 ++++-
 9 files changed, 121 insertions(+), 50 deletions(-)

diff --git a/benchmarks/brax/main.py b/benchmarks/brax/main.py
index 572ce739c..6625bcd04 100644
--- a/benchmarks/brax/main.py
+++ b/benchmarks/brax/main.py
@@ -85,6 +85,9 @@ def run():
 
     args = parser.parse_args()
 
+    # args.num_envs = (args.batch_size * args.num_minibatches)
+
+
     train(
         environment=envs.get_environment(env_name=args.env),
         num_timesteps=args.num_timesteps,
diff --git a/benchmarks/brax/voirfile.py b/benchmarks/brax/voirfile.py
index fce6f66d0..3397dcb31 100644
--- a/benchmarks/brax/voirfile.py
+++ b/benchmarks/brax/voirfile.py
@@ -20,10 +20,10 @@ class Config:
     skip: int = 5
 
     # Number of rates to log before stopping
-    stop: int = 20
+    stop: int = 60
 
     # Number of seconds between each gpu poll
-    gpu_poll: int = 3
+    gpu_poll: int = 1
 
 
 @configurable
diff --git a/benchmarks/lightning/main.py b/benchmarks/lightning/main.py
index b31f3880c..aca89ee47 100644
--- a/benchmarks/lightning/main.py
+++ b/benchmarks/lightning/main.py
@@ -40,7 +40,7 @@ def prepare_voir():
 
     observer = BenchObserver(
         accelerator.Event,
-        earlystop=65,
+        earlystop=100,
         batch_size_fn=lambda x: len(x[0]),
         raise_stop_program=False,
         stdout=True,
@@ -73,8 +73,6 @@ def main():
 
     model = TorchvisionLightning(model)
 
-
-
     accelerator.set_enable_tf32(True)
     observer, monitor = prepare_voir()
 
@@ -91,10 +89,10 @@ def main():
         enable_checkpointing=False,
         enable_progress_bar=False,
         reload_dataloaders_every_n_epochs=1,
-        max_steps=100
+        max_steps=120
     )
 
-    with monitor():
+    with monitor(poll_interval=0.1):
         trainer.fit(model=model, train_dataloaders=loader)
 
     print("finished: ", rank)
diff --git a/benchmarks/torchvision/voirfile.py b/benchmarks/torchvision/voirfile.py
index ed3f0af7c..a05c99774 100644
--- a/benchmarks/torchvision/voirfile.py
+++ b/benchmarks/torchvision/voirfile.py
@@ -24,7 +24,7 @@ class Config:
     stop: int = 20
 
     # Number of seconds between each gpu poll
-    gpu_poll: int = 3
+    gpu_poll: float = 1
 
 
 @configurable
diff --git a/benchmate/benchmate/monitor.py b/benchmate/benchmate/monitor.py
index 5d2624201..0fe8fe025 100644
--- a/benchmate/benchmate/monitor.py
+++ b/benchmate/benchmate/monitor.py
@@ -17,7 +17,7 @@
 
 
 @instrument_definition
-def monitor_monogpu(ov, poll_interval=10, arch=None):
+def monitor_monogpu(ov, poll_interval=1, arch=None):
     return monitor(
         ov,
         poll_interval=poll_interval,
@@ -28,7 +28,7 @@
 
 
 @instrument_definition
-def monitor_node(ov, poll_interval=10, arch=None):
+def monitor_node(ov, poll_interval=1, arch=None):
     return monitor(
         ov,
         poll_interval=poll_interval,
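Note: `gpu_poll` and `poll_interval` above are the number of seconds between
GPU metric samples, so lowering them from 10 (or 3) to 1 collects several
times more samples per run. Below is a minimal sketch of such a fixed-interval
poller, assuming a `callback` that records one sample; it is illustrative
only, not milabench's actual monitor implementation:

    import threading
    import time

    def start_polling(callback, poll_interval=1.0):
        """Run `callback()` every `poll_interval` seconds in a daemon thread."""
        stop = threading.Event()

        def loop():
            # Event.wait doubles as an interruptible sleep: it returns True
            # (ending the loop) once stop.set() is called.
            while not stop.wait(poll_interval):
                callback()

        threading.Thread(target=loop, daemon=True).start()
        return stop

    # Sample once per second, matching gpu_poll: 1 above.
    stop = start_polling(lambda: print(time.time()), poll_interval=1.0)
    time.sleep(3)
    stop.set()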
diff --git a/config/base.yaml b/config/base.yaml
index dd69a4954..730ef78d8 100644
--- a/config/base.yaml
+++ b/config/base.yaml
@@ -209,6 +209,11 @@ resnet50:
 
 resnet50-noio:
   inherits: _torchvision
+  voir:
+    options:
+      stop: 1000
+      interval: "1s"
+
   tags:
     - vision
     - classification
@@ -372,12 +377,15 @@ focalnet:
     --model: focalnet_base_lrf
 
 brax:
+  # Brax requires very specific batch sizes to work,
+  # so the resizer cannot resize this bench.
   inherits: _defaults
   tags:
     - rl
     - jax
     - multigpu
     - gym
+    - nobatch
  definition: ../benchmarks/brax
  group: brax
  install_group: torch
diff --git a/config/scaling.yaml b/config/scaling.yaml
index 9b1dc36eb..5fb30494e 100644
--- a/config/scaling.yaml
+++ b/config/scaling.yaml
@@ -55,9 +55,10 @@ bert-tf32-fp16:
     112: 81140.75 MiB
   optimized: 128
 bf16: {}
-brax:
-  args: --batch-size
-
+brax:
+  arg: --batch-size
+  model:
+    1024: 4912.25 MiB
 convnext_large-fp16:
   arg: --batch-size
   model:
@@ -191,14 +192,26 @@ dimenet:
 dinov2-giant-gpus:
   arg: train.batch_size_per_gpu={batch_size}
   model:
+    1: 32240.25 MiB
+    2: 32252.25 MiB
+    4: 32404.25 MiB
+    16: 38350.25 MiB
     32: 69614 MiB
   optimized: 32
-dinov2-giant-single:
-  arg: train.batch_size_per_gpu={batch_size}
 dinov2-giant-nodes:
   arg: train.batch_size_per_gpu={batch_size}
-
+dinov2-giant-single:
+  arg: train.batch_size_per_gpu={batch_size}
+  model:
+    1: 20682.25 MiB
+    2: 20682.25 MiB
+    4: 20682.25 MiB
+    16: 52748.25 MiB
+    32: 74544.25 MiB
 dlrm: {}
+dqn:
+  arg: --buffer_batch_size
+  optimized: 128
 focalnet:
   arg: --batch-size
   model:
@@ -222,6 +235,14 @@ fp16: {}
 fp32: {}
 lightning:
   arg: --batch-size
+  model:
+    1: 1054.25 MiB
+    2: 1054.25 MiB
+    4: 1856.25 MiB
+    16: 4728.25 MiB
+    32: 6352.25 MiB
+    64: 1856.25 MiB
+    128: 14818.25 MiB
 lightning-gpus:
   arg: --batch-size
   model:
@@ -233,18 +254,47 @@ lightning-gpus:
     128: 15858 MiB
   optimized: 16
 llama: {}
+llava-gpus:
+  arg: --batch_size
+  optimized: 1
+llava-single:
+  arg: --batch_size
+  optimized: 1
 llm-full-mp-gpus:
   arg: batch_size={batch_size}
+  model:
+    1: 48964.25 MiB
+    2: 49214.25 MiB
+    4: 51310.25 MiB
+    16: 81536.25 MiB
 llm-full-mp-nodes:
   arg: batch_size={batch_size}
+  model:
+    1: 37340.25 MiB
+    2: 38112.25 MiB
+    4: 39110.25 MiB
+    16: 80638.25 MiB
 llm-lora-ddp-gpus:
   arg: batch_size={batch_size}
   model:
     1: 12418.75 MiB
+    2: 19026.25 MiB
+    4: 25464.25 MiB
+    16: 55834.25 MiB
+    32: 80268.25 MiB
 llm-lora-ddp-nodes:
   arg: batch_size={batch_size}
+  model:
+    2: 17202.25 MiB
+    4: 23956.25 MiB
+    16: 59730.25 MiB
+    32: 68932.25 MiB
 llm-lora-mp-gpus:
   arg: batch_size={batch_size}
+  model:
+    2: 38166.25 MiB
+    4: 43464.25 MiB
+    16: 77116.25 MiB
 llm-lora-single:
   arg: batch_size={batch_size}
   model:
@@ -268,6 +318,9 @@ opt-6_7b-multinode:
   model:
     1: 55380 MiB
   optimized: 1
+ppo:
+  arg: --num_minibatches
+  optimized: 32
 recursiongfn:
   arg: --batch_size
   model:
@@ -382,6 +435,18 @@ resnet50:
   optimized: 64
 resnet50-noio:
   arg: --batch-size
+  model:
+    1: 1594.25 MiB
+    2: 1652.25 MiB
+    4: 1854.25 MiB
+    16: 3052.25 MiB
+    32: 4690.25 MiB
+rlhf-gpus:
+  arg: --per_device_train_batch_size
+  optimized: 64
+rlhf-single:
+  arg: --per_device_train_batch_size
+  optimized: 64
 rwkv:
   arg: --micro_bsz
   model:
@@ -432,6 +497,12 @@ torchatari:
     1: 1124.75 MiB
     2: 1138.75 MiB
     4: 1166.75 MiB
+vjepa-gpus:
+  arg: --batch_size
+  optimized: 24
+vjepa-single:
+  arg: --batch_size
+  optimized: 24
 whisper:
   arg: --batch-size
   model:
@@ -448,35 +519,3 @@ whisper:
     128: 71634.375 MiB
     144: 80412.75 MiB
   optimized: 128
-
-llava-single:
-  arg: --batch_size
-  optimized: 1
-
-llava-gpus:
-  arg: --batch_size
-  optimized: 1
-
-rlhf-single:
-  arg: --per_device_train_batch_size
-  optimized: 64
-
-rlhf-gpus:
-  arg: --per_device_train_batch_size
-  optimized: 64
-
-vjepa-single:
-  arg: --batch_size
-  optimized: 24
-
-vjepa-gpus:
-  arg: --batch_size
-  optimized: 24
-
-ppo:
-  arg: --num_minibatches
-  optimized: 32
-
-dqn:
-  arg: --buffer_batch_size
-  optimized: 128
\ No newline at end of file
diff --git a/milabench/cli/list.py b/milabench/cli/list.py
index bfba35f9c..fda73bdf5 100644
--- a/milabench/cli/list.py
+++ b/milabench/cli/list.py
@@ -42,7 +42,14 @@ def add_bench(k, tags):
         else:
             add_bench(k, tags)
 
-    print(",".join(missing_benches))
+
+
+    # Quote each name so the list can be pasted into a shell command.
+    b = [f"\"{bench}\"" for bench in missing_benches]
+
+
+
+    print(" ".join(b))
 
 
 if __name__ == "__main__":
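Note: the `model` tables added to config/scaling.yaml map a batch size to the
peak GPU memory observed at that size. The sketch below shows how such a table
could drive automatic batch-size selection; `pick_batch_size` is a
hypothetical helper for illustration, not milabench's actual sizer API:

    def pick_batch_size(model_table, capacity_mib, multiple=1):
        """Return the largest measured batch size whose observed peak
        memory (in MiB) fits under capacity_mib, rounded down to a
        multiple of `multiple`."""
        fitting = [bs for bs, mem in model_table.items() if mem <= capacity_mib]
        if not fitting:
            return None  # nothing fits at this capacity
        best = max(fitting)
        return max(multiple, best - best % multiple)

    # The `lightning` table from the diff above, against an 8000 MiB budget:
    lightning = {1: 1054.25, 2: 1054.25, 4: 1856.25, 16: 4728.25,
                 32: 6352.25, 64: 1856.25, 128: 14818.25}
    print(pick_batch_size(lightning, capacity_mib=8000))  # -> 64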
diff --git a/scripts/article/run_cuda.sh b/scripts/article/run_cuda.sh
index b7b31eed3..99ebae544 100644
--- a/scripts/article/run_cuda.sh
+++ b/scripts/article/run_cuda.sh
@@ -84,14 +84,30 @@ if [ "$MILABENCH_PREPARE" -eq 0 ]; then
     . $MILABENCH_WORDIR/env/bin/activate
 
+    # milabench install --system $MILABENCH_WORDIR/system.yaml
+    # milabench prepare --system $MILABENCH_WORDIR/system.yaml $ARGS
     # pip install torch
     # milabench pin --variant cuda --from-scratch $ARGS
     # milabench install --system $MILABENCH_WORDIR/system.yaml --force $ARGS
+    ARGS="--select resnet50-noio,brax,lightning,dinov2-giant-single,dinov2-giant-gpus,llm-lora-ddp-gpus,llm-lora-ddp-nodes,llm-lora-mp-gpus,llm-full-mp-gpus,llm-full-mp-nodes,dqn,ppo,dimenet,llava-single,rlhf-single,rlhf-gpus,vjepa-single,vjepa-gpus"
+
+    # MEMORY_CAPACITY=("4Go" "8Go" "16Go" "32Go" "64Go" "80Go")
+    # MILABENCH_SIZER_MULTIPLE=16
+    # MILABENCH_SIZER_CAPACITY="$CAPACITY"
+
+    MEMORY_CAPACITY=("1" "2" "4" "16" "32" "64" "128")
+
+    BENCHES=("dqn" "ppo" "dimenet" "llava-single" "rlhf-single" "rlhf-gpus" "vjepa-single" "vjepa-gpus")
 
     # # Run the benchmarks
-    milabench run --system $MILABENCH_WORDIR/system.yaml $ARGS
+    for BENCH in "${BENCHES[@]}"; do
+        for CAPACITY in "${MEMORY_CAPACITY[@]}"; do
+            export MILABENCH_SIZER_AUTO=1
+            export MILABENCH_SIZER_BATCH_SIZE=$CAPACITY
+            milabench run --run-name "$BENCH.bs$CAPACITY.{time}" --system $MILABENCH_WORDIR/system.yaml --select $BENCH --exclude lightning-gpus
+        done
+    done
 
     # # Display report
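Note: despite its name, MEMORY_CAPACITY above now holds candidate batch sizes
(the commented-out "4Go".."80Go" values were memory capacities); each loop
iteration pins MILABENCH_SIZER_BATCH_SIZE and runs one bench, which is how
per-batch-size memory figures like those in config/scaling.yaml can be
collected. A rough Python equivalent of the sweep, assuming milabench is on
PATH ({time} is expanded by milabench itself, not by this script):

    import itertools
    import os
    import subprocess

    BENCHES = ["dqn", "ppo", "dimenet", "llava-single",
               "rlhf-single", "rlhf-gpus", "vjepa-single", "vjepa-gpus"]
    BATCH_SIZES = [1, 2, 4, 16, 32, 64, 128]
    SYSTEM = os.path.expandvars("$MILABENCH_WORDIR/system.yaml")

    # One run per (bench, batch size) pair, exactly like the shell loop.
    for bench, size in itertools.product(BENCHES, BATCH_SIZES):
        env = dict(os.environ,
                   MILABENCH_SIZER_AUTO="1",
                   MILABENCH_SIZER_BATCH_SIZE=str(size))
        subprocess.run(["milabench", "run",
                        "--run-name", f"{bench}.bs{size}.{{time}}",
                        "--system", SYSTEM,
                        "--select", bench,
                        "--exclude", "lightning-gpus"],
                       env=env, check=True)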