Commit

Update cmd regression
pierre.delaunay committed Aug 27, 2024
1 parent 67df463 commit d60a9b4
Showing 7 changed files with 340 additions and 184 deletions.
112 changes: 75 additions & 37 deletions config/scaling.yaml
@@ -2,9 +2,10 @@ bert-fp16:
   arg: --batch-size
   model:
     1: 4108.75 MiB
+    2: 475.0 MiB
     4: 1840.375 MiB
     8: 8614.75 MiB
-    16: 14254.75 MiB
+    16: 475.0 MiB
     32: 24604.75 MiB
     40: 34157.9375 MiB
     64: 47212.375 MiB
@@ -16,9 +17,10 @@ bert-fp32:
   arg: --batch-size
   model:
     1: 4206.75 MiB
+    2: 475.0 MiB
     4: 6652.375 MiB
     8: 10240.75 MiB
-    16: 17646.75 MiB
+    16: 475.0 MiB
     24: 28007.9375 MiB
     32: 31568.75 MiB
     64: 61196.375 MiB
@@ -28,9 +30,10 @@ bert-tf32:
   arg: --batch-size
   model:
     1: 4204.75 MiB
+    2: 475.0 MiB
     4: 6654.375 MiB
     8: 10242.75 MiB
-    16: 17648.75 MiB
+    16: 475.0 MiB
     24: 28009.9375 MiB
     32: 31570.75 MiB
     64: 61198.375 MiB
@@ -40,9 +43,10 @@ bert-tf32-fp16:
   arg: --batch-size
   model:
     1: 4108.75 MiB
+    2: 475.0 MiB
     4: 1840.375 MiB
     8: 8614.75 MiB
-    16: 14254.75 MiB
+    16: 475.0 MiB
     32: 24604.75 MiB
     40: 34157.9375 MiB
     64: 47212.375 MiB
@@ -75,6 +79,7 @@ convnext_large-fp32:
   model:
     1: 3268.75 MiB
     2: 3480.375 MiB
+    4: 2060.75 MiB
     8: 5824.75 MiB
     16: 8774.75 MiB
     32: 14548.75 MiB
@@ -91,7 +96,7 @@ convnext_large-tf32:
     1: 3268.75 MiB
     2: 3480.375 MiB
     8: 5824.75 MiB
-    16: 8774.75 MiB
+    16: 1768.75 MiB
     32: 14548.75 MiB
     64: 26274.75 MiB
     72: 33081.9375 MiB
@@ -156,6 +161,29 @@ davit_large-multi:
     288: 65910.375 MiB
     328: 81742.75 MiB
   optimized: 128
+diffusion-gpus:
+  arg: --batch_size
+  model:
+    1: 23082 MiB
+    2: 21818.75 MiB
+    4: 23478.75 MiB
+    8: 26500.75 MiB
+    16: 36436.75 MiB
+    32: 57808 MiB
+    48: 80698 MiB
+  optimized: 32
+diffusion-nodes:
+  arg: --batch_size
+diffusion-single:
+  arg: --batch_size
+dimenet: {}
+dinov2-giant-gpus:
+  arg: train.batch_size_per_gpu={batch_size}
+  model:
+    32: 69614 MiB
+  optimized: 32
+dinov2-giant-single:
+  arg: train.batch_size_per_gpu={batch_size}
 dlrm: {}
 focalnet:
   arg: --batch-size
@@ -178,7 +206,31 @@ focalnet:
   optimized: 128
 fp16: {}
 fp32: {}
+lightning:
+  arg: --batch-size
+lightning-gpus:
+  arg: --batch-size
+  model:
+    1: 4542 MiB
+    2: 1158.75 MiB
+    4: 1156.75 MiB
+    8: 1260.75 MiB
+    16: 4150.75 MiB
+    128: 15858 MiB
+  optimized: 16
 llama: {}
+llm-full-mp-gpus:
+  arg: batch_size={batch_size}
+llm-full-mp-nodes:
+  arg: batch_size={batch_size}
+llm-lora-ddp-gpus:
+  arg: batch_size={batch_size}
+llm-lora-ddp-nodes:
+  arg: batch_size={batch_size}
+llm-lora-mp-gpus:
+  arg: batch_size={batch_size}
+llm-lora-single:
+  arg: batch_size={batch_size}
 opt-1_3b:
   arg: --per_gpu_batch_size
   model:
@@ -189,19 +241,22 @@ opt-1_3b-multinode:
   model:
     1: 42126 MiB
   optimized: 1
-opt-6_7b: {}
+opt-6_7b:
+  arg: --per_gpu_batch_size
 opt-6_7b-multinode:
   arg: --per_gpu_batch_size
   model:
     1: 55380 MiB
   optimized: 1
+recursiongfn:
+  arg: --batch_size
 reformer:
   arg: --batch-size
   model:
     1: 1916.75 MiB
     4: 3004.375 MiB
     8: 4512.75 MiB
-    16: 7486.75 MiB
+    16: 7082.75 MiB
     24: 10470.75 MiB
     32: 13454.75 MiB
     64: 25408.75 MiB
@@ -215,9 +270,10 @@ regnet_y_128gf:
   arg: --batch-size
   model:
     1: 6876.75 MiB
+    2: 475.0 MiB
     4: 9062.375 MiB
     8: 8524.75 MiB
-    16: 11426.75 MiB
+    16: 1234.75 MiB
     24: 18523.9375 MiB
     32: 18324.75 MiB
     56: 31165.9375 MiB
@@ -248,7 +304,10 @@ resnet152:
     576: 58588.375 MiB
     640: 81354.75 MiB
   optimized: 128
-resnet152-ddp: {}
+resnet152-ddp:
+  arg: --batch-size
+resnet152-ddp-gpus:
+  arg: --batch-size
 resnet152-multi:
   arg: --batch-size
   model:
@@ -294,7 +353,8 @@ resnet50:
     1552: 81146.75 MiB
     1560: 81590.75 MiB
   optimized: 64
-resnet50-noio: {}
+resnet50-noio:
+  arg: --batch-size
 rwkv:
   arg: --micro_bsz
   model:
@@ -317,9 +377,10 @@ super-slomo:
   arg: --train_batch_size
   model:
     1: 3016.75 MiB
+    2: 3506.75 MiB
     4: 5884.375 MiB
     8: 10288.75 MiB
-    16: 18718.75 MiB
+    16: 16914.75 MiB
     24: 29777.9375 MiB
     32: 33934.375 MiB
     56: 61837.9375 MiB
@@ -333,11 +394,13 @@ t5:
     2: 6384.375 MiB
     4: 10620.375 MiB
     8: 18684.75 MiB
-    16: 35448.75 MiB
+    16: 33990.75 MiB
     24: 54479.9375 MiB
     32: 66760.375 MiB
   optimized: 128
 tf32: {}
+torchatari:
+  arg: --num-steps
 whisper:
   arg: --batch-size
   model:
     128: 71634.375 MiB
     144: 80412.75 MiB
   optimized: 128
-
-
-diffusion-gpus:
-  arg: --batch_size
-  model:
-    1: 23082 MiB
-    16: 37778 MiB
-    32: 57808 MiB
-    48: 80698 MiB
-  optimized: 32
-
-
-lightning-gpus:
-  arg: --batch-size
-  model:
-    1: 4542 MiB
-    16: 5692 MiB
-    128: 15858 MiB
-  optimized: 16
-
-dinov2-giant-gpus:
-  arg: train.batch_size_per_gpu={batch_size}
-  model:
-    32: 69614 MiB
-  optimized: 32
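
Each scaling.yaml entry maps a benchmark to the command-line argument that controls its batch size (arg), a model table of observed GPU memory per batch size, and a default optimized batch size; the trailing diffusion-gpus, lightning-gpus, and dinov2-giant-gpus entries deleted at the end of the file reappear earlier in alphabetical order with refreshed measurements. As an illustration of how such a table can be consumed, here is a minimal sketch, not part of this commit, assuming PyYAML and a checkout where config/scaling.yaml exists:

import yaml

# Minimal sketch (not part of this commit): pick the largest batch size in a
# scaling.yaml entry whose measured footprint fits a GPU memory budget.
def max_fitting_batch(entry, budget_mib):
    # entry["model"] maps batch size -> strings such as "4108.75 MiB"
    fitting = [
        batch
        for batch, usage in entry.get("model", {}).items()
        if float(str(usage).split()[0]) <= budget_mib
    ]
    return max(fitting) if fitting else None

with open("config/scaling.yaml") as f:
    scaling = yaml.safe_load(f)

entry = scaling["bert-fp16"]
print(entry["arg"], max_fitting_batch(entry, budget_mib=24000))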
6 changes: 3 additions & 3 deletions milabench/_version.py
@@ -1,5 +1,5 @@
 """This file is generated, do not modify"""
 
-__tag__ = "v0.1.0-42-gab24a2dc"
-__commit__ = "ab24a2dc21c2f22c40d85bfa935e0ab07ae30dd9"
-__date__ = "2024-08-20 10:02:58 -0400"
+__tag__ = "v0.1.0-42-g67df463d"
+__commit__ = "67df463d99bbee50087e84cbbdb547f55739f35e"
+__date__ = "2024-08-23 10:29:13 -0400"
8 changes: 7 additions & 1 deletion milabench/system.py
@@ -385,7 +385,13 @@ def _resolve_addresses(nodes):
 
 def gethostname(host):
     try:
-        return subprocess.check_output(["ssh", host, "cat", "/etc/hostname"], text=True).strip()
+        # "-oCheckHostIP=no",
+        # "-oPasswordAuthentication=no",
+        return subprocess.check_output([
+            "ssh",
+            "-oCheckHostIP=no",
+            "-oPasswordAuthentication=no",
+            "-oStrictHostKeyChecking=no", host, "cat", "/etc/hostname"], text=True).strip()
     except:
         print("Could not resolve hostname")
         return host
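
The added ssh options keep hostname resolution non-interactive: -oStrictHostKeyChecking=no avoids the host-key prompt on first contact with a node, -oPasswordAuthentication=no makes the call fail fast instead of hanging on a password prompt when key-based login is unavailable, and -oCheckHostIP=no tolerates known-hosts IP mismatches. A hypothetical usage sketch, assuming milabench is installed and "node1" is a reachable host that accepts key-based SSH:

# Hypothetical usage sketch; the helper above falls back to the input
# name and prints a warning whenever the SSH call fails.
from milabench.system import gethostname

print(gethostname("node1"))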
10 changes: 7 additions & 3 deletions scripts/article/run_cuda.sh
@@ -19,6 +19,8 @@ else
     export MILABENCH_CONFIG="$MILABENCH_SOURCE/config/standard.yaml"
 fi
 
+ARGS="$@"
+
 install_prepare() {
     mkdir -p $MILABENCH_WORDIR
     cd $MILABENCH_WORDIR
@@ -38,10 +40,12 @@ install_prepare() {
 
     pip install -e $MILABENCH_SOURCE
 
+    milabench slurm_system > $MILABENCH_WORDIR/system.yaml
+
     #
     # Install milabench's benchmarks in their venv
     #
-    milabench install "$@"
+    milabench install --system $MILABENCH_WORDIR/system.yaml $ARGS
 
     which pip
 
@@ -58,7 +62,7 @@ install_prepare() {
 
     #
     # Generate/download datasets, download models etc...
-    milabench prepare "$@"
+    milabench prepare --system $MILABENCH_WORDIR/system.yaml $ARGS
 }
 
 module load cuda/12.3.2
@@ -76,7 +80,7 @@ if [ "$MILABENCH_PREPARE" -eq 0 ]; then
 
     #
     # Run the benchmakrs
-    milabench run "$@"
+    milabench run --system $MILABENCH_WORDIR/system.yaml "$@"
 
     #
     # Display report
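
Taken together, the run_cuda.sh changes capture the script's arguments once in ARGS (inside the install_prepare function, "$@" would otherwise refer to the function's own arguments), generate system.yaml once via milabench slurm_system, and pass the same --system file to install, prepare, and run so that all three phases see an identical node topology.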
33 changes: 33 additions & 0 deletions scripts/article/run_update_batch_size.sh
@@ -0,0 +1,33 @@
+
+
+
+
+export MILABENCH_SIZER_AUTO=1
+export MILABENCH_SIZER_BATCH_SIZE=1
+FINAL_OUTPUT="$HOME/batch_x_worker"
+export MILABENCH_SIZER_SAVE="$FINAL_OUTPUT/scaling.yaml"
+milabench run --system $MILABENCH_WORDIR/system.yaml --exclude llama
+
+export MILABENCH_SIZER_AUTO=1
+export MILABENCH_SIZER_BATCH_SIZE=2
+FINAL_OUTPUT="$HOME/batch_x_worker"
+export MILABENCH_SIZER_SAVE="$FINAL_OUTPUT/scaling.yaml"
+milabench run --system $MILABENCH_WORDIR/system.yaml --exclude llama
+
+export MILABENCH_SIZER_AUTO=1
+export MILABENCH_SIZER_BATCH_SIZE=4
+FINAL_OUTPUT="$HOME/batch_x_worker"
+export MILABENCH_SIZER_SAVE="$FINAL_OUTPUT/scaling.yaml"
+milabench run --system $MILABENCH_WORDIR/system.yaml --exclude llama
+
+export MILABENCH_SIZER_AUTO=1
+export MILABENCH_SIZER_BATCH_SIZE=8
+FINAL_OUTPUT="$HOME/batch_x_worker"
+export MILABENCH_SIZER_SAVE="$FINAL_OUTPUT/scaling.yaml"
+milabench run --system $MILABENCH_WORDIR/system.yaml --exclude llama
+
+export MILABENCH_SIZER_AUTO=1
+export MILABENCH_SIZER_BATCH_SIZE=16
+FINAL_OUTPUT="$HOME/batch_x_worker"
+export MILABENCH_SIZER_SAVE="$FINAL_OUTPUT/scaling.yaml"
+milabench run --system $MILABENCH_WORDIR/system.yaml --exclude llama
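
The new script repeats one block per batch size (1, 2, 4, 8, and 16), changing only MILABENCH_SIZER_BATCH_SIZE while saving to the same scaling.yaml path. The same sweep could be driven by a loop; a minimal sketch, assuming MILABENCH_WORDIR is set and milabench is on PATH, written in Python to match the rest of the project:

import os
import subprocess

# Mirrors run_update_batch_size.sh with a loop instead of repeated blocks.
final_output = os.path.expanduser("~/batch_x_worker")

for batch_size in (1, 2, 4, 8, 16):
    env = dict(
        os.environ,
        MILABENCH_SIZER_AUTO="1",
        MILABENCH_SIZER_BATCH_SIZE=str(batch_size),
        MILABENCH_SIZER_SAVE=os.path.join(final_output, "scaling.yaml"),
    )
    subprocess.run(
        ["milabench", "run",
         "--system", os.path.join(os.environ["MILABENCH_WORDIR"], "system.yaml"),
         "--exclude", "llama"],
        env=env,
        check=False,  # keep sweeping even if one batch size fails
    )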