From a60a3aae21e87e46bcce403620a3f56c12878554 Mon Sep 17 00:00:00 2001 From: Satya Ortiz-Gagne Date: Wed, 6 Nov 2024 22:52:12 -0500 Subject: [PATCH] Revert "explore scaling" This reverts commit 226e45557df307910231a4a7e3ce4a26119106e3. --- config/scaling.og.yaml | 622 ----------------------------------------- config/scaling.yaml | 443 +++++++++++++++++++++++++---- milabench/common.py | 2 +- pyproject.toml | 5 - run_scaling.sh | 63 ----- scale.py | 235 ---------------- scale.sh | 33 --- 7 files changed, 397 insertions(+), 1006 deletions(-) delete mode 100644 config/scaling.og.yaml delete mode 100755 run_scaling.sh delete mode 100644 scale.py delete mode 100755 scale.sh diff --git a/config/scaling.og.yaml b/config/scaling.og.yaml deleted file mode 100644 index d9d3dbf9e..000000000 --- a/config/scaling.og.yaml +++ /dev/null @@ -1,622 +0,0 @@ -bert-fp16: - arg: --batch-size - model: - 1: 4108.75 MiB - 2: 475.0 MiB - 4: 1840.375 MiB - 8: 8614.75 MiB - 16: 475.0 MiB - 32: 24604.75 MiB - 40: 34157.9375 MiB - 64: 47212.375 MiB - 80: 57619.9375 MiB - 96: 69812.375 MiB - 112: 81140.75 MiB - optimized: 128 -bert-fp32: - arg: --batch-size - model: - 1: 4206.75 MiB - 2: 475.0 MiB - 4: 6652.375 MiB - 8: 10240.75 MiB - 16: 475.0 MiB - 24: 28007.9375 MiB - 32: 31568.75 MiB - 64: 61196.375 MiB - 80: 76034.75 MiB - optimized: 128 -bert-tf32: - arg: --batch-size - model: - 1: 4204.75 MiB - 2: 475.0 MiB - 4: 6654.375 MiB - 8: 10242.75 MiB - 16: 475.0 MiB - 24: 28009.9375 MiB - 32: 31570.75 MiB - 64: 61198.375 MiB - 80: 76036.75 MiB - optimized: 128 -bert-tf32-fp16: - arg: --batch-size - model: - 1: 4108.75 MiB - 2: 475.0 MiB - 4: 1840.375 MiB - 8: 8614.75 MiB - 16: 475.0 MiB - 32: 24604.75 MiB - 40: 34157.9375 MiB - 64: 47212.375 MiB - 80: 57619.9375 MiB - 96: 69812.375 MiB - 112: 81140.75 MiB - optimized: 128 -bf16: {} -brax: - arg: --batch-size - model: - 1024: 4912.25 MiB -cleanrljax: - arg: --num_steps - optimized: 128 -convnext_large-fp16: - arg: --batch-size - model: - 1: 3228.75 MiB - 2: 3844.375 MiB - 8: 4726.75 MiB - 16: 6254.75 MiB - 32: 9418.75 MiB - 40: 10940.75 MiB - 64: 15238.75 MiB - 128: 27466.75 MiB - 144: 34449.9375 MiB - 256: 51768.375 MiB - 288: 57966.375 MiB - 304: 65001.9375 MiB - 384: 76248.375 MiB - 416: 80628.75 MiB - optimized: 128 -convnext_large-fp32: - arg: --batch-size - model: - 1: 3268.75 MiB - 2: 3480.375 MiB - 4: 2060.75 MiB - 8: 5824.75 MiB - 16: 8774.75 MiB - 32: 14548.75 MiB - 64: 26274.75 MiB - 72: 31731.9375 MiB - 128: 47312.375 MiB - 160: 58608.375 MiB - 192: 70002.375 MiB - 216: 80694.75 MiB - optimized: 128 -convnext_large-tf32: - arg: --batch-size - model: - 1: 3268.75 MiB - 2: 3480.375 MiB - 8: 5824.75 MiB - 16: 1768.75 MiB - 32: 14548.75 MiB - 64: 26274.75 MiB - 72: 33081.9375 MiB - 128: 49678.375 MiB - 160: 61560.375 MiB - 192: 73548.375 MiB - 216: 80694.75 MiB - optimized: 128 -convnext_large-tf32-fp16: - arg: --batch-size - model: - 1: 3228.75 MiB - 2: 3844.375 MiB - 8: 4726.75 MiB - 16: 6254.75 MiB - 32: 9418.75 MiB - 40: 10940.75 MiB - 64: 15238.75 MiB - 128: 27466.75 MiB - 144: 34449.9375 MiB - 256: 51768.375 MiB - 288: 57966.375 MiB - 304: 61102.375 MiB - 384: 76248.375 MiB - 416: 80628.75 MiB - optimized: 128 -davit_large: - arg: --batch-size - model: - 1: 4882.75 MiB - 8: 6330.75 MiB - 16: 8216.75 MiB - 24: 10182.75 MiB - 32: 12240.75 MiB - 40: 16213.9375 MiB - 64: 19422.75 MiB - 96: 23732.375 MiB - 104: 29025.9375 MiB - 128: 34492.75 MiB - 224: 50258.375 MiB - 240: 57341.9375 MiB - 256: 56762.375 MiB - 288: 63488.375 MiB - 328: 81502.75 MiB - optimized: 128 -davit_large-multi: - arg: --batch-size - model: - 1: 4862.75 MiB - 8: 6330.75 MiB - 16: 8216.75 MiB - 24: 10730.75 MiB - 32: 12240.75 MiB - 40: 18087.9375 MiB - 64: 19422.75 MiB - 96: 25380.375 MiB - 104: 31099.9375 MiB - 128: 34248.75 MiB - 224: 52542.375 MiB - 240: 59663.9375 MiB - 256: 59080.375 MiB - 288: 65910.375 MiB - 328: 81742.75 MiB - optimized: 128 -diffusion-gpus: - arg: --batch_size - model: - 1: 23082 MiB - 2: 21818.75 MiB - 4: 23478.75 MiB - 8: 26500.75 MiB - 16: 36436.75 MiB - 32: 57808 MiB - 48: 80698 MiB - optimized: 32 -diffusion-nodes: - arg: --batch_size - model: - 1: 21686.75 MiB - 2: 21930.75 MiB - 4: 23510.75 MiB - 16: 40054.25 MiB - 32: 61512.25 MiB -diffusion-single: - arg: --batch_size - model: - 1: 21654.75 MiB - 2: 21818.75 MiB - 4: 23478.75 MiB - 16: 33850.25 MiB - 32: 55354.25 MiB -dimenet: - arg: --batch-size - model: - 2: 452.6875 MiB - 4: 1604.25 MiB - 24: 4776.25 MiB - 56: 6330.25 MiB - 64: 12274.25 MiB - 112: 15294.25 MiB - 128: 13002.25 MiB - 240: 67506.25 MiB - 280: 56556.25 MiB - 488: 80406.25 MiB -dinov2-giant-gpus: - arg: train.batch_size_per_gpu={batch_size} - model: - 1: 32240.25 MiB - 2: 32252.25 MiB - 4: 32404.25 MiB - 16: 38350.25 MiB - 24: 48856.25 MiB - 32: 72102.25 MiB - optimized: 32 -dinov2-giant-nodes: - arg: train.batch_size_per_gpu={batch_size} -dinov2-giant-single: - arg: train.batch_size_per_gpu={batch_size} - model: - 1: 20682.25 MiB - 2: 20682.25 MiB - 4: 20682.25 MiB - 16: 52748.25 MiB - 24: 60792.25 MiB - 32: 74544.25 MiB -dlrm: {} -dqn: - arg: --buffer_batch_size - model: - 1024: 81.81005859375 MiB - 2048: 83.40380859375 MiB - 32768: 131.21630859375 MiB - 65536: 182.21630859375 MiB - optimized: 128 -focalnet: - arg: --batch-size - model: - 1: 3128.75 MiB - 4: 3320.375 MiB - 8: 4368.75 MiB - 16: 5608.75 MiB - 24: 10291.9375 MiB - 32: 8566.75 MiB - 40: 9850.75 MiB - 64: 14750.75 MiB - 128: 26398.75 MiB - 144: 29995.9375 MiB - 256: 44272.375 MiB - 288: 49730.375 MiB - 312: 56993.9375 MiB - 384: 66894.375 MiB - 424: 81368.75 MiB - optimized: 128 -fp16: {} -fp32: {} -lightning: - arg: --batch-size - model: - 1: 1054.25 MiB - 2: 1054.25 MiB - 4: 1856.25 MiB - 16: 4728.25 MiB - 24: 5482.25 MiB - 32: 6352.25 MiB - 56: 1054.25 MiB - 64: 1856.25 MiB - 120: 14522.25 MiB - 128: 14818.25 MiB - 240: 25480.25 MiB - 488: 49042.25 MiB - 664: 65914.25 MiB -lightning-gpus: - arg: --batch-size - model: - 1: 4542 MiB - 2: 1158.75 MiB - 4: 1156.75 MiB - 8: 1260.75 MiB - 16: 4150.75 MiB - 48: 11056.25 MiB - 112: 16776.25 MiB - 128: 15858 MiB - 240: 28942.25 MiB - 504: 54100.25 MiB - 624: 65386.25 MiB - optimized: 16 -llama: {} -llava-gpus: - arg: --batch_size - optimized: 1 -llava-single: - arg: --batch_size - model: - 1: 72614.25 MiB - 2: 15168.25 MiB - 4: 72362.25 MiB - optimized: 1 -llm-full-mp-gpus: - arg: batch_size={batch_size} - model: - 1: 48964.25 MiB - 2: 49214.25 MiB - 4: 51310.25 MiB - 16: 81536.25 MiB -llm-full-mp-nodes: - arg: batch_size={batch_size} - model: - 1: 37340.25 MiB - 2: 38112.25 MiB - 4: 39110.25 MiB - 16: 80638.25 MiB -llm-lora-ddp-gpus: - arg: batch_size={batch_size} - model: - 1: 12418.75 MiB - 2: 19026.25 MiB - 4: 25464.25 MiB - 16: 55834.25 MiB - 32: 80268.25 MiB -llm-lora-ddp-nodes: - arg: batch_size={batch_size} - model: - 2: 17202.25 MiB - 4: 23956.25 MiB - 16: 59730.25 MiB - 32: 68932.25 MiB -llm-lora-mp-gpus: - arg: batch_size={batch_size} - model: - 2: 38166.25 MiB - 4: 43464.25 MiB - 16: 77116.25 MiB -llm-lora-single: - arg: batch_size={batch_size} - model: - 1: 23196.75 MiB - 2: 27694.75 MiB - 16: 45076.75 MiB -opt-1_3b: - arg: --per_gpu_batch_size - model: - 1: 38102.375 MiB - optimized: 1 -opt-1_3b-multinode: - arg: --per_gpu_batch_size - model: - 1: 42126 MiB - optimized: 1 -opt-6_7b: - arg: --per_gpu_batch_size -opt-6_7b-multinode: - arg: --per_gpu_batch_size - model: - 1: 55380 MiB - optimized: 1 -pna: - arg: --batch-size - model: - 4096: 39554.25 MiB -ppo: - arg: --num_steps - model: - 8: 80.791748046875 MiB - 16: 80.916748046875 MiB - 32: 81.166748046875 MiB - 64: 81.666748046875 MiB - 128: 82.666748046875 MiB - 1024: 96.666748046875 MiB - 2048: 132.484619140625 MiB - 4096: 205.328369140625 MiB - 2517448: 62094.25 MiB - optimized: 32 -recursiongfn: - arg: --batch_size - model: - 2: 1134.75 MiB - 4: 1140.75 MiB - 16: 1830.25 MiB - 32: 1342.25 MiB - 64: 4410.25 MiB - 128: 9160.25 MiB -reformer: - arg: --batch-size - model: - 1: 1916.75 MiB - 4: 3004.375 MiB - 8: 4512.75 MiB - 16: 7082.75 MiB - 24: 10470.75 MiB - 32: 13454.75 MiB - 64: 25408.75 MiB - 72: 32287.9375 MiB - 128: 49276.375 MiB - 160: 61212.375 MiB - 192: 73148.375 MiB - 208: 79120.75 MiB - optimized: 128 -regnet_y_128gf: - arg: --batch-size - model: - 1: 6876.75 MiB - 2: 475.0 MiB - 4: 9062.375 MiB - 8: 8524.75 MiB - 16: 1234.75 MiB - 24: 18523.9375 MiB - 32: 18324.75 MiB - 56: 31165.9375 MiB - 64: 31558.75 MiB - 128: 54094.375 MiB - 136: 61245.9375 MiB - 160: 64990.375 MiB - 184: 78714.75 MiB - optimized: 128 -resnet152: - arg: --batch-size - model: - 1: 2710.75 MiB - 8: 3298.75 MiB - 16: 4164.75 MiB - 32: 6202.75 MiB - 40: 9819.9375 MiB - 64: 10120.75 MiB - 72: 10860.75 MiB - 96: 11546.375 MiB - 104: 16105.9375 MiB - 128: 18076.75 MiB - 224: 24584.375 MiB - 256: 27310.375 MiB - 448: 46894.375 MiB - 472: 52891.9375 MiB - 512: 52622.375 MiB - 576: 58588.375 MiB - 640: 81354.75 MiB - optimized: 128 -resnet152-ddp: - arg: --batch-size -resnet152-ddp-gpus: - arg: --batch-size - model: - 1: 2084.75 MiB - 2: 2122.75 MiB - 4: 2260.75 MiB -resnet152-multi: - arg: --batch-size - model: - 1: 2600.75 MiB - 8: 3374.75 MiB - 16: 4148.75 MiB - 32: 6374.75 MiB - 40: 11181.9375 MiB - 64: 10338.75 MiB - 72: 10582.75 MiB - 96: 13170.375 MiB - 104: 17773.9375 MiB - 128: 18104.75 MiB - 224: 27566.375 MiB - 256: 29176.375 MiB - 448: 50024.375 MiB - 472: 56233.9375 MiB - 512: 55924.375 MiB - 576: 62102.375 MiB - 640: 81820.75 MiB - optimized: 128 -resnet50: - arg: --batch-size - model: - 1: 1962.75 MiB - 8: 2134.75 MiB - 16: 2460.75 MiB - 32: 3206.75 MiB - 40: 7439.9375 MiB - 64: 4734.75 MiB - 96: 6478.375 MiB - 112: 11103.9375 MiB - 128: 8242.75 MiB - 184: 11072.75 MiB - 256: 14854.75 MiB - 264: 19031.9375 MiB - 512: 27900.75 MiB - 544: 29358.375 MiB - 560: 34017.9375 MiB - 1024: 53806.375 MiB - 1152: 60310.375 MiB - 1440: 74694.375 MiB - 1552: 81146.75 MiB - 1560: 81590.75 MiB - optimized: 64 -resnet50-noio: - arg: --batch-size - model: - 1: 1594.25 MiB - 2: 1652.25 MiB - 4: 1854.25 MiB - 16: 3052.25 MiB - 32: 4690.25 MiB - 56: 7114.25 MiB - 136: 15194.25 MiB - 288: 30632.25 MiB - 592: 64483.8125 MiB - 736: 76050.25 MiB -rlhf-gpus: - arg: --per_device_train_batch_size - model: - 1: 13448.25 MiB - 2: 13594.25 MiB - 4: 13686.25 MiB - 16: 14606.25 MiB - 32: 17918.25 MiB - 64: 24374.25 MiB - 128: 25830.25 MiB - 136: 29442.25 MiB - 392: 15372.25 MiB - 520: 15808.25 MiB - optimized: 64 -rlhf-single: - arg: --per_device_train_batch_size - model: - 1: 8590.25 MiB - 2: 8650.25 MiB - 4: 8822.25 MiB - 16: 9694.25 MiB - 32: 12952.25 MiB - 40: 14638.25 MiB - 64: 19422.25 MiB - 120: 31048.25 MiB - 128: 32442.25 MiB - 280: 63262.25 MiB - 352: 77536.25 MiB - optimized: 64 -rwkv: - arg: --micro_bsz - model: - 1: 3602.75 MiB - 8: 4530.75 MiB - 16: 5594.75 MiB - 64: 11452.75 MiB - 128: 19448.75 MiB - 632: 81880.75 MiB - optimized: 16 -stargan: - arg: --batch_size - model: - 1: 37896.75 MiB - 8: 19165.75 MiB - 16: 62478.375 MiB - 32: 73824.75 MiB - optimized: 16 -super-slomo: - arg: --train_batch_size - model: - 1: 3016.75 MiB - 2: 3506.75 MiB - 4: 5884.375 MiB - 8: 10288.75 MiB - 16: 16914.75 MiB - 24: 29777.9375 MiB - 32: 33934.375 MiB - 56: 61837.9375 MiB - 64: 66072.375 MiB - 80: 81180.75 MiB - optimized: 32 -t5: - arg: --batch-size - model: - 1: 4396.75 MiB - 2: 6384.375 MiB - 4: 10620.375 MiB - 8: 18684.75 MiB - 16: 33990.75 MiB - 24: 54479.9375 MiB - 32: 66760.375 MiB - optimized: 128 -tf32: {} -torchatari: - arg: --num-steps - model: - 1: 1124.75 MiB - 1024: 20176.25 MiB - 2048: 39020.25 MiB - 4096: 76708.25 MiB -vjepa-gpus: - arg: --batch_size - model: - 1: 27196.25 MiB - 2: 28896.25 MiB - 4: 30784.25 MiB - 16: 52722.25 MiB - 32: 77124.25 MiB - optimized: 24 -vjepa-single: - arg: --batch_size - model: - 1: 6644.25 MiB - 2: 18984.25 MiB - 4: 11860.25 MiB - 8: 30764.25 MiB - 16: 45516.25 MiB - 24: 57574.25 MiB - 32: 67122.25 MiB - optimized: 24 -whisper: - arg: --batch-size - model: - 1: 2070.75 MiB - 4: 3828.375 MiB - 8: 6108.75 MiB - 16: 10540.75 MiB - 24: 18887.9375 MiB - 32: 19282.75 MiB - 48: 31841.9375 MiB - 64: 36728.75 MiB - 96: 54086.375 MiB - 104: 62409.9375 MiB - 128: 71634.375 MiB - 144: 80412.75 MiB - optimized: 128 diff --git a/config/scaling.yaml b/config/scaling.yaml index 8b918c26b..d9d3dbf9e 100644 --- a/config/scaling.yaml +++ b/config/scaling.yaml @@ -2,28 +2,63 @@ bert-fp16: arg: --batch-size model: 1: 4108.75 MiB - optimized: 1 + 2: 475.0 MiB + 4: 1840.375 MiB + 8: 8614.75 MiB + 16: 475.0 MiB + 32: 24604.75 MiB + 40: 34157.9375 MiB + 64: 47212.375 MiB + 80: 57619.9375 MiB + 96: 69812.375 MiB + 112: 81140.75 MiB + optimized: 128 bert-fp32: arg: --batch-size model: 1: 4206.75 MiB - optimized: 1 + 2: 475.0 MiB + 4: 6652.375 MiB + 8: 10240.75 MiB + 16: 475.0 MiB + 24: 28007.9375 MiB + 32: 31568.75 MiB + 64: 61196.375 MiB + 80: 76034.75 MiB + optimized: 128 bert-tf32: arg: --batch-size model: 1: 4204.75 MiB - optimized: 1 + 2: 475.0 MiB + 4: 6654.375 MiB + 8: 10242.75 MiB + 16: 475.0 MiB + 24: 28009.9375 MiB + 32: 31570.75 MiB + 64: 61198.375 MiB + 80: 76036.75 MiB + optimized: 128 bert-tf32-fp16: arg: --batch-size model: 1: 4108.75 MiB - optimized: 1 + 2: 475.0 MiB + 4: 1840.375 MiB + 8: 8614.75 MiB + 16: 475.0 MiB + 32: 24604.75 MiB + 40: 34157.9375 MiB + 64: 47212.375 MiB + 80: 57619.9375 MiB + 96: 69812.375 MiB + 112: 81140.75 MiB + optimized: 128 bf16: {} brax: arg: --batch-size model: 1024: 4912.25 MiB - optimized: 1 cleanrljax: arg: --num_steps optimized: 128 @@ -31,87 +66,229 @@ convnext_large-fp16: arg: --batch-size model: 1: 3228.75 MiB - optimized: 1 + 2: 3844.375 MiB + 8: 4726.75 MiB + 16: 6254.75 MiB + 32: 9418.75 MiB + 40: 10940.75 MiB + 64: 15238.75 MiB + 128: 27466.75 MiB + 144: 34449.9375 MiB + 256: 51768.375 MiB + 288: 57966.375 MiB + 304: 65001.9375 MiB + 384: 76248.375 MiB + 416: 80628.75 MiB + optimized: 128 convnext_large-fp32: arg: --batch-size model: 1: 3268.75 MiB - optimized: 1 + 2: 3480.375 MiB + 4: 2060.75 MiB + 8: 5824.75 MiB + 16: 8774.75 MiB + 32: 14548.75 MiB + 64: 26274.75 MiB + 72: 31731.9375 MiB + 128: 47312.375 MiB + 160: 58608.375 MiB + 192: 70002.375 MiB + 216: 80694.75 MiB + optimized: 128 convnext_large-tf32: arg: --batch-size model: 1: 3268.75 MiB - optimized: 1 + 2: 3480.375 MiB + 8: 5824.75 MiB + 16: 1768.75 MiB + 32: 14548.75 MiB + 64: 26274.75 MiB + 72: 33081.9375 MiB + 128: 49678.375 MiB + 160: 61560.375 MiB + 192: 73548.375 MiB + 216: 80694.75 MiB + optimized: 128 convnext_large-tf32-fp16: arg: --batch-size model: 1: 3228.75 MiB - optimized: 1 + 2: 3844.375 MiB + 8: 4726.75 MiB + 16: 6254.75 MiB + 32: 9418.75 MiB + 40: 10940.75 MiB + 64: 15238.75 MiB + 128: 27466.75 MiB + 144: 34449.9375 MiB + 256: 51768.375 MiB + 288: 57966.375 MiB + 304: 61102.375 MiB + 384: 76248.375 MiB + 416: 80628.75 MiB + optimized: 128 davit_large: arg: --batch-size model: 1: 4882.75 MiB - optimized: 1 + 8: 6330.75 MiB + 16: 8216.75 MiB + 24: 10182.75 MiB + 32: 12240.75 MiB + 40: 16213.9375 MiB + 64: 19422.75 MiB + 96: 23732.375 MiB + 104: 29025.9375 MiB + 128: 34492.75 MiB + 224: 50258.375 MiB + 240: 57341.9375 MiB + 256: 56762.375 MiB + 288: 63488.375 MiB + 328: 81502.75 MiB + optimized: 128 davit_large-multi: arg: --batch-size model: 1: 4862.75 MiB - optimized: 1 + 8: 6330.75 MiB + 16: 8216.75 MiB + 24: 10730.75 MiB + 32: 12240.75 MiB + 40: 18087.9375 MiB + 64: 19422.75 MiB + 96: 25380.375 MiB + 104: 31099.9375 MiB + 128: 34248.75 MiB + 224: 52542.375 MiB + 240: 59663.9375 MiB + 256: 59080.375 MiB + 288: 65910.375 MiB + 328: 81742.75 MiB + optimized: 128 diffusion-gpus: arg: --batch_size model: 1: 23082 MiB - optimized: 1 + 2: 21818.75 MiB + 4: 23478.75 MiB + 8: 26500.75 MiB + 16: 36436.75 MiB + 32: 57808 MiB + 48: 80698 MiB + optimized: 32 diffusion-nodes: arg: --batch_size model: 1: 21686.75 MiB - optimized: 1 + 2: 21930.75 MiB + 4: 23510.75 MiB + 16: 40054.25 MiB + 32: 61512.25 MiB diffusion-single: arg: --batch_size model: 1: 21654.75 MiB - optimized: 1 + 2: 21818.75 MiB + 4: 23478.75 MiB + 16: 33850.25 MiB + 32: 55354.25 MiB dimenet: arg: --batch-size model: 2: 452.6875 MiB - optimized: 1 + 4: 1604.25 MiB + 24: 4776.25 MiB + 56: 6330.25 MiB + 64: 12274.25 MiB + 112: 15294.25 MiB + 128: 13002.25 MiB + 240: 67506.25 MiB + 280: 56556.25 MiB + 488: 80406.25 MiB dinov2-giant-gpus: arg: train.batch_size_per_gpu={batch_size} model: 1: 32240.25 MiB - optimized: 1 + 2: 32252.25 MiB + 4: 32404.25 MiB + 16: 38350.25 MiB + 24: 48856.25 MiB + 32: 72102.25 MiB + optimized: 32 dinov2-giant-nodes: arg: train.batch_size_per_gpu={batch_size} dinov2-giant-single: arg: train.batch_size_per_gpu={batch_size} model: 1: 20682.25 MiB - optimized: 1 + 2: 20682.25 MiB + 4: 20682.25 MiB + 16: 52748.25 MiB + 24: 60792.25 MiB + 32: 74544.25 MiB dlrm: {} dqn: arg: --buffer_batch_size model: 1024: 81.81005859375 MiB - optimized: 1 + 2048: 83.40380859375 MiB + 32768: 131.21630859375 MiB + 65536: 182.21630859375 MiB + optimized: 128 focalnet: arg: --batch-size model: 1: 3128.75 MiB - optimized: 1 + 4: 3320.375 MiB + 8: 4368.75 MiB + 16: 5608.75 MiB + 24: 10291.9375 MiB + 32: 8566.75 MiB + 40: 9850.75 MiB + 64: 14750.75 MiB + 128: 26398.75 MiB + 144: 29995.9375 MiB + 256: 44272.375 MiB + 288: 49730.375 MiB + 312: 56993.9375 MiB + 384: 66894.375 MiB + 424: 81368.75 MiB + optimized: 128 fp16: {} fp32: {} lightning: arg: --batch-size model: 1: 1054.25 MiB - optimized: 1 + 2: 1054.25 MiB + 4: 1856.25 MiB + 16: 4728.25 MiB + 24: 5482.25 MiB + 32: 6352.25 MiB + 56: 1054.25 MiB + 64: 1856.25 MiB + 120: 14522.25 MiB + 128: 14818.25 MiB + 240: 25480.25 MiB + 488: 49042.25 MiB + 664: 65914.25 MiB lightning-gpus: arg: --batch-size model: 1: 4542 MiB - optimized: 1 + 2: 1158.75 MiB + 4: 1156.75 MiB + 8: 1260.75 MiB + 16: 4150.75 MiB + 48: 11056.25 MiB + 112: 16776.25 MiB + 128: 15858 MiB + 240: 28942.25 MiB + 504: 54100.25 MiB + 624: 65386.25 MiB + optimized: 16 llama: {} llava-gpus: arg: --batch_size @@ -120,37 +297,50 @@ llava-single: arg: --batch_size model: 1: 72614.25 MiB + 2: 15168.25 MiB + 4: 72362.25 MiB optimized: 1 llm-full-mp-gpus: arg: batch_size={batch_size} model: 1: 48964.25 MiB - optimized: 1 + 2: 49214.25 MiB + 4: 51310.25 MiB + 16: 81536.25 MiB llm-full-mp-nodes: arg: batch_size={batch_size} model: 1: 37340.25 MiB - optimized: 1 + 2: 38112.25 MiB + 4: 39110.25 MiB + 16: 80638.25 MiB llm-lora-ddp-gpus: arg: batch_size={batch_size} model: 1: 12418.75 MiB - optimized: 1 + 2: 19026.25 MiB + 4: 25464.25 MiB + 16: 55834.25 MiB + 32: 80268.25 MiB llm-lora-ddp-nodes: arg: batch_size={batch_size} model: 2: 17202.25 MiB - optimized: 1 + 4: 23956.25 MiB + 16: 59730.25 MiB + 32: 68932.25 MiB llm-lora-mp-gpus: arg: batch_size={batch_size} model: 2: 38166.25 MiB - optimized: 1 + 4: 43464.25 MiB + 16: 77116.25 MiB llm-lora-single: arg: batch_size={batch_size} model: 1: 23196.75 MiB - optimized: 1 + 2: 27694.75 MiB + 16: 45076.75 MiB opt-1_3b: arg: --per_gpu_batch_size model: @@ -172,102 +362,261 @@ pna: arg: --batch-size model: 4096: 39554.25 MiB - optimized: 1 ppo: arg: --num_steps model: 8: 80.791748046875 MiB - optimized: 1 + 16: 80.916748046875 MiB + 32: 81.166748046875 MiB + 64: 81.666748046875 MiB + 128: 82.666748046875 MiB + 1024: 96.666748046875 MiB + 2048: 132.484619140625 MiB + 4096: 205.328369140625 MiB + 2517448: 62094.25 MiB + optimized: 32 recursiongfn: arg: --batch_size model: 2: 1134.75 MiB - optimized: 1 + 4: 1140.75 MiB + 16: 1830.25 MiB + 32: 1342.25 MiB + 64: 4410.25 MiB + 128: 9160.25 MiB reformer: arg: --batch-size model: 1: 1916.75 MiB - optimized: 1 + 4: 3004.375 MiB + 8: 4512.75 MiB + 16: 7082.75 MiB + 24: 10470.75 MiB + 32: 13454.75 MiB + 64: 25408.75 MiB + 72: 32287.9375 MiB + 128: 49276.375 MiB + 160: 61212.375 MiB + 192: 73148.375 MiB + 208: 79120.75 MiB + optimized: 128 regnet_y_128gf: arg: --batch-size model: 1: 6876.75 MiB - optimized: 1 + 2: 475.0 MiB + 4: 9062.375 MiB + 8: 8524.75 MiB + 16: 1234.75 MiB + 24: 18523.9375 MiB + 32: 18324.75 MiB + 56: 31165.9375 MiB + 64: 31558.75 MiB + 128: 54094.375 MiB + 136: 61245.9375 MiB + 160: 64990.375 MiB + 184: 78714.75 MiB + optimized: 128 resnet152: arg: --batch-size model: 1: 2710.75 MiB - optimized: 1 + 8: 3298.75 MiB + 16: 4164.75 MiB + 32: 6202.75 MiB + 40: 9819.9375 MiB + 64: 10120.75 MiB + 72: 10860.75 MiB + 96: 11546.375 MiB + 104: 16105.9375 MiB + 128: 18076.75 MiB + 224: 24584.375 MiB + 256: 27310.375 MiB + 448: 46894.375 MiB + 472: 52891.9375 MiB + 512: 52622.375 MiB + 576: 58588.375 MiB + 640: 81354.75 MiB + optimized: 128 resnet152-ddp: arg: --batch-size resnet152-ddp-gpus: arg: --batch-size model: 1: 2084.75 MiB - optimized: 1 + 2: 2122.75 MiB + 4: 2260.75 MiB resnet152-multi: arg: --batch-size model: 1: 2600.75 MiB - optimized: 1 + 8: 3374.75 MiB + 16: 4148.75 MiB + 32: 6374.75 MiB + 40: 11181.9375 MiB + 64: 10338.75 MiB + 72: 10582.75 MiB + 96: 13170.375 MiB + 104: 17773.9375 MiB + 128: 18104.75 MiB + 224: 27566.375 MiB + 256: 29176.375 MiB + 448: 50024.375 MiB + 472: 56233.9375 MiB + 512: 55924.375 MiB + 576: 62102.375 MiB + 640: 81820.75 MiB + optimized: 128 resnet50: arg: --batch-size model: 1: 1962.75 MiB - optimized: 1 + 8: 2134.75 MiB + 16: 2460.75 MiB + 32: 3206.75 MiB + 40: 7439.9375 MiB + 64: 4734.75 MiB + 96: 6478.375 MiB + 112: 11103.9375 MiB + 128: 8242.75 MiB + 184: 11072.75 MiB + 256: 14854.75 MiB + 264: 19031.9375 MiB + 512: 27900.75 MiB + 544: 29358.375 MiB + 560: 34017.9375 MiB + 1024: 53806.375 MiB + 1152: 60310.375 MiB + 1440: 74694.375 MiB + 1552: 81146.75 MiB + 1560: 81590.75 MiB + optimized: 64 resnet50-noio: arg: --batch-size model: 1: 1594.25 MiB - optimized: 1 + 2: 1652.25 MiB + 4: 1854.25 MiB + 16: 3052.25 MiB + 32: 4690.25 MiB + 56: 7114.25 MiB + 136: 15194.25 MiB + 288: 30632.25 MiB + 592: 64483.8125 MiB + 736: 76050.25 MiB rlhf-gpus: arg: --per_device_train_batch_size model: 1: 13448.25 MiB - optimized: 1 + 2: 13594.25 MiB + 4: 13686.25 MiB + 16: 14606.25 MiB + 32: 17918.25 MiB + 64: 24374.25 MiB + 128: 25830.25 MiB + 136: 29442.25 MiB + 392: 15372.25 MiB + 520: 15808.25 MiB + optimized: 64 rlhf-single: arg: --per_device_train_batch_size model: 1: 8590.25 MiB - optimized: 1 + 2: 8650.25 MiB + 4: 8822.25 MiB + 16: 9694.25 MiB + 32: 12952.25 MiB + 40: 14638.25 MiB + 64: 19422.25 MiB + 120: 31048.25 MiB + 128: 32442.25 MiB + 280: 63262.25 MiB + 352: 77536.25 MiB + optimized: 64 rwkv: arg: --micro_bsz model: 1: 3602.75 MiB - optimized: 1 + 8: 4530.75 MiB + 16: 5594.75 MiB + 64: 11452.75 MiB + 128: 19448.75 MiB + 632: 81880.75 MiB + optimized: 16 stargan: arg: --batch_size model: 1: 37896.75 MiB - optimized: 1 + 8: 19165.75 MiB + 16: 62478.375 MiB + 32: 73824.75 MiB + optimized: 16 super-slomo: arg: --train_batch_size model: 1: 3016.75 MiB - optimized: 1 + 2: 3506.75 MiB + 4: 5884.375 MiB + 8: 10288.75 MiB + 16: 16914.75 MiB + 24: 29777.9375 MiB + 32: 33934.375 MiB + 56: 61837.9375 MiB + 64: 66072.375 MiB + 80: 81180.75 MiB + optimized: 32 t5: arg: --batch-size model: 1: 4396.75 MiB - optimized: 1 + 2: 6384.375 MiB + 4: 10620.375 MiB + 8: 18684.75 MiB + 16: 33990.75 MiB + 24: 54479.9375 MiB + 32: 66760.375 MiB + optimized: 128 tf32: {} torchatari: arg: --num-steps model: 1: 1124.75 MiB - optimized: 1 + 1024: 20176.25 MiB + 2048: 39020.25 MiB + 4096: 76708.25 MiB vjepa-gpus: arg: --batch_size model: 1: 27196.25 MiB - optimized: 1 + 2: 28896.25 MiB + 4: 30784.25 MiB + 16: 52722.25 MiB + 32: 77124.25 MiB + optimized: 24 vjepa-single: arg: --batch_size model: 1: 6644.25 MiB - optimized: 1 + 2: 18984.25 MiB + 4: 11860.25 MiB + 8: 30764.25 MiB + 16: 45516.25 MiB + 24: 57574.25 MiB + 32: 67122.25 MiB + optimized: 24 whisper: arg: --batch-size model: 1: 2070.75 MiB - optimized: 1 + 4: 3828.375 MiB + 8: 6108.75 MiB + 16: 10540.75 MiB + 24: 18887.9375 MiB + 32: 19282.75 MiB + 48: 31841.9375 MiB + 64: 36728.75 MiB + 96: 54086.375 MiB + 104: 62409.9375 MiB + 128: 71634.375 MiB + 144: 80412.75 MiB + optimized: 128 diff --git a/milabench/common.py b/milabench/common.py index 2c756183b..b533f892f 100644 --- a/milabench/common.py +++ b/milabench/common.py @@ -94,7 +94,7 @@ def get_multipack(args = None, run_name=None, overrides={}): args = arguments() override = [ - o if re.match(pattern=r"[.\w_-]+=", string=o) else f"={o}" for o in args.override + o if re.match(pattern=r"[.\w]+=", string=o) else f"={o}" for o in args.override ] override.extend( diff --git a/pyproject.toml b/pyproject.toml index 2e0a4f7b2..0d4a6d62d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -67,8 +67,3 @@ combine_as_imports = true [tool.poetry-git-version-plugin] alpha_version_format = '{version}a{distance}+{commit_hash}' - -[tool.hatch.envs.default] -python = "3.10" -python-sources = ["external"] -installer = "pip" diff --git a/run_scaling.sh b/run_scaling.sh deleted file mode 100755 index 37842b537..000000000 --- a/run_scaling.sh +++ /dev/null @@ -1,63 +0,0 @@ -#!/bin/bash - -for gpu in a100 a100l l40s rtx8000 v100 h100 -do -# squeue -u$USER -o "%A %j" | grep " mb-scale-${gpu}-1$" || \ -# sbatch --gpus-per-task=${gpu}:1 --cpus-per-task=6 --job-name "mb-scale-${gpu}-1" scale.sh "${gpu}" - squeue -u$USER -o "%A %j" | grep "mb-scale-${gpu}-4$" || \ - sbatch --gpus-per-task=${gpu}:4 --job-name "mb-scale-${gpu}-4" scale.sh "${gpu}" -done - -gpu=h100 -# while true -# do -# squeue -u$USER -o "%j" | grep "^mb-scale-${gpu}-1$" && sleep 5m || \ -# { sbatch --wait --time=3:0:0 --partition short-unkillable --gpus-per-task=${gpu}:1 --cpus-per-task=6 --job-name "mb-scale-${gpu}-1" scale.sh "${gpu}" \ -# && break ; } -# done - -while true -do - squeue -u$USER -h -n "mb-scale-${gpu}-4" -o"%L" | grep "" >/dev/null || \ - sbatch --time=3:0:0 --partition short-unkillable --gpus-per-task=${gpu}:4 --job-name "mb-scale-${gpu}-4" scale.sh "${gpu}" - - { date ; squeue -u$USER -n "mb-scale-${gpu}-4" ; } | timeout --foreground 5m less && break - - time_left=$(squeue -u$USER -h -n "mb-scale-${gpu}-4" -o"%L") - d=$(echo $time_left | cut -d"-" -f1) - hms=$(echo $time_left | cut -d"-" -f2) - - # echo $time_left - # echo $d - # echo $hms - - if [[ "$d" != "$hms" ]] - then - # some days left - continue - fi - - h=$(echo $hms | cut -d":" -f1) - m=$(echo $hms | cut -d":" -f2) - s=$(echo $hms | cut -d":" -f3) - - # echo $h - # echo $m - # echo $s - - if [[ -z "$s" ]] - then - s=$m - m=$h - h= - fi - - # echo $h - # echo $m - # echo $s - - if [[ -z "$h" ]] && [[ "$m" -lt "15" ]] - then - scontrol requeue $(squeue -u$USER -h -n "mb-scale-${gpu}-4" -o"%A") - fi -done diff --git a/scale.py b/scale.py deleted file mode 100644 index 66b2ab067..000000000 --- a/scale.py +++ /dev/null @@ -1,235 +0,0 @@ -from datetime import datetime -import logging -import os -from pathlib import Path -import signal -import subprocess -import sys -import time -from types import FrameType -import yaml - -logging.basicConfig(level=logging.DEBUG) - -gpu = subprocess.run( - [ - "bash", "-c", 'nvidia-smi -i 0 -q | grep "Product Name" | cut -d":" -f2' - ], - capture_output=True, - check=True, - encoding="utf8" -).stdout.strip() -gpu = gpu.replace(" ", "-") -gpu = gpu.replace("_", "-") - -if not gpu: - gpu = gpu or os.environ["MILABENCH_GPU"] - logging.warning(f"Could not find gpu using nvidia-smi. Using {gpu}") - - -def run(name:str, batch_size:int): - run_dir = Path(os.environ["MILABENCH_BASE"]) / f"runs/{gpu}_{name}_{batch_size}" - staging_dir = run_dir.parent / f"{run_dir.name}.staging" - - argv = sys.argv[1:] + [ - "--select", name, "--run-name", staging_dir.name, - "--override", f"{name}.plan.method=njobs", - "--override", f"{name}.plan.n=1" - ] - - if run_dir.exists(): - logging.info( - f"({batch_size}) Found existing run dir {run_dir}. Not executing " - f"{argv}" - ) - return True - - failed = [str(_d) for _d in run_dir.parent.glob(f"{run_dir.name}.failed_*")] - if len(failed) > 2: - logging.warning( - f"({batch_size}) Failed more than 2 times ({len(failed)}) {failed}. " - f"Not executing {argv}" - ) - return False - - returncode = None - cleanup = None - try: - logging.debug( - f"({batch_size}) Executing {argv}" - ) - p = subprocess.Popen( - args=argv, - env={ - **os.environ, - "MILABENCH_SIZER_BATCH_SIZE": str(batch_size), - "MILABENCH_SIZER_SAVE": f"config/scaling_{gpu}.yaml", - }, - ) - - def cleanup(signum: int, frame: FrameType | None): - if signum is not None: - logging.info(f"Received signal {signum}") - p.send_signal(signum) - - if p.poll() == 0: - logging.info(f"Execution succeeded {argv}") - if staging_dir.exists(): - staging_dir.rename(run_dir) - logging.info(f"Renamed {staging_dir} to {run_dir}") - else: - logging.error(f"Execution failed {argv}") - if staging_dir.exists(): - staging_dir.rename(staging_dir.with_suffix(f".failed_{datetime.now()}".replace(" ", "-"))) - - # signal.signal(signal.SIGTERM, cleanup) - # signal.signal(signal.SIGUSR1, cleanup) - - p.wait() - - returncode = p.poll() - - finally: - cleanup(None, None) - - return returncode == 0 - - -def check_batch_size(name:str, batch_size:int, scaling:Path, force=False): - scaling = get_scaling(scaling).get(name, {}) - - if "arg" not in scaling: - logging.warning( - f"({batch_size}) No scaling argument found for {name}. Not " - f"executing {sys.argv[1:]}" - ) - return False - - if not force and batch_size in scaling.get("model", {}): - logging.info( - f"({batch_size}) Found batch size {batch_size} in {name}'s scaling " - f"config. Not executing {sys.argv[1:]}" - ) - return True - - return run(name, batch_size) or batch_size in scaling.get("model", {}) - - -def round_even(number:int): - # Round to next even number - if int(number + 0.5) == 1: - return 1 - _ = int(number / 2 + 0.5) - int(number / 2) - return (int(number / 2) + _) * 2 - - -def test_round_even(): - for i in range(100): - print([(i, i:=round_even(i / 2)) for _ in range(100) if i > 1]) - - -def get_scaling(scaling_file:Path): - start_scaling = Path("config/scaling.yaml") - - retries = [True] * 5 - scaling = None - if not scaling_file.exists() and scaling_file != start_scaling: - start_scaling = get_scaling(start_scaling) - for name, conf in start_scaling.items(): - if isinstance(conf, dict) and "model" in conf: - first = sorted(conf["model"].items())[:1] - del conf["model"] - conf["model"] = { - k: v for k, v in first - } - if start_scaling: - scaling_file.write_text(yaml.dump(start_scaling)) - - while retries and not scaling: - logging.debug(f"Scaling in {scaling_file} is {scaling}") - time.sleep(5) - retries.pop() - scaling = yaml.safe_load(scaling_file.read_text()) or {} - return scaling - - -enabled = set() -for name, conf in yaml.safe_load(Path("config/standard.yaml").read_text()).items(): - if isinstance(conf, dict) and conf["enabled"]: - enabled.add(name) - -for name, conf in yaml.safe_load(Path("config/base.yaml").read_text()).items(): - if name in enabled and "multinode" in conf.get("tags", []): - enabled.remove(name) - -for name, conf in get_scaling(Path("config/scaling.og.yaml")).items(): - if name in enabled: - lower_batch_size = 1 - for batch_size in conf.get("model", {}): - while lower_batch_size * 2 <= batch_size: - lower_batch_size = lower_batch_size * 2 - - # MILABENCH_BASE="${MILABENCH_BASE:-$SCRATCH/data/milabench}" MILABENCH_CONFIG="${PWD}/config/standard.yaml" MILABENCH_SIZER_BATCH_SIZE=180 MILABENCH_SIZER_SAVE=config/scaling.yaml hatch run milabench run --system config/cloud-system.yaml.slurm__a100l_x4 --select bert-fp16 - - # find lower bound batch_size - # as long as the test passes, try to find a bigger batch_size - batch_size = None - while ( - check_batch_size(name, lower_batch_size * 2, Path(f"config/scaling_{gpu}.yaml")) - ): - batch_size = lower_batch_size - lower_batch_size = lower_batch_size * 2 - - logging.info(f"Found lower bound {lower_batch_size} for {name} in scaling") - - # In case the gpu doesn't support the highest recorded batch_size, find - # the lower bound for the current gpu - while ( - not batch_size and - not check_batch_size(name, lower_batch_size, Path(f"config/scaling_{gpu}.yaml")) and - lower_batch_size > 1 - ): - lower_batch_size = round_even(lower_batch_size / 2) - - upper_batch_size = lower_batch_size * 2 - - while ( - upper_batch_size > lower_batch_size and - # If we get a 5% difference between lower and upper we consider to - # have explored all possible scaling batch size - (upper_batch_size - lower_batch_size) / upper_batch_size >= 0.025 - ): - logging.info(f"Sweeping for upper bound between {lower_batch_size} and {upper_batch_size} for {name}") - - batch_size = round_even(lower_batch_size + (upper_batch_size - lower_batch_size) / 2) - result = check_batch_size(name, batch_size, Path(f"config/scaling_{gpu}.yaml")) - - if result: - lower_batch_size = batch_size - elif upper_batch_size == batch_size: - assert upper_batch_size - lower_batch_size <= 2 - break - else: - upper_batch_size = batch_size - - if not lower_batch_size: - logging.warning(f"Failed to find a batch size for bench {name}") - continue - - upper_batch_size = lower_batch_size - lower_batch_size = lower_batch_size / 2 - - while lower_batch_size <= upper_batch_size: - logging.info(f"Sweeping batch sizes from {lower_batch_size} to {upper_batch_size} for {name}") - - lower_batch_size = round_even(lower_batch_size) - - check_batch_size(name, lower_batch_size, Path(f"config/scaling_{gpu}.yaml")) - - if lower_batch_size == upper_batch_size: - break - - lower_batch_size += int((upper_batch_size - lower_batch_size) / 2) - - if (upper_batch_size - lower_batch_size) / upper_batch_size <= 0.025: - lower_batch_size = upper_batch_size diff --git a/scale.sh b/scale.sh deleted file mode 100755 index b36352966..000000000 --- a/scale.sh +++ /dev/null @@ -1,33 +0,0 @@ -#!/bin/bash - -#SBATCH --ntasks=1 -#SBATCH --ntasks-per-node=1 -#SBATCH --cpus-per-task=20 -#SBATCH --time=7-0:0:0 -#SBATCH --mem=350000 -#SBATCH --output=%x.%j.out - -exit_script() { - echo "Preemption signal $1, saving myself ${SLURM_JOB_ID}" - trap - $1 # clear the trap - # Optional: sends SIGTERM to child/sub processes - kill -s $1 -- -$$ & - sleep 5 - scontrol requeue ${SLURM_JOB_ID} -} - -kill_script() { - echo "Kill signal $1" -} - -trap "exit_script SIGTERM" SIGTERM -trap "exit_script SIGUSR1" SIGUSR1 - -export MILABENCH_BASE="${SCRATCH}/data/milabench" -export MILABENCH_CONFIG="${PWD}/config/standard.yaml" -export MILABENCH_SIZER_SAVE="${PWD}/config/scaling.yaml" -export PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION="upb" -SELECT= -EXCLUDE="--exclude multinode" - -MILABENCH_GPU="${1}" python3 scale.py hatch run milabench run $SELECT $EXCLUDE