From 397f127b6f5c7b5583f6184b64f5fb54d8a16b9d Mon Sep 17 00:00:00 2001
From: "pierre.delaunay"
Date: Tue, 17 Sep 2024 23:20:39 -0400
Subject: [PATCH] Add scaling config

---
 benchmarks/brax/main.py            |   3 +
 benchmarks/brax/voirfile.py        |   4 +-
 benchmarks/lightning/main.py       |   8 +-
 benchmarks/torchvision/voirfile.py |   2 +-
 benchmate/benchmate/monitor.py     |   4 +-
 config/base.yaml                   |   8 ++
 config/scaling.yaml                | 115 +++++++++++++++++++----------
 milabench/cli/list.py              |   9 ++-
 scripts/article/run_cuda.sh        |  18 ++++-
 9 files changed, 121 insertions(+), 50 deletions(-)

diff --git a/benchmarks/brax/main.py b/benchmarks/brax/main.py
index 572ce739c..6625bcd04 100644
--- a/benchmarks/brax/main.py
+++ b/benchmarks/brax/main.py
@@ -85,6 +85,9 @@ def run():
 
     args = parser.parse_args()
 
+    # args.num_envs = (args.batch_size * args.num_minibatches)
+
+
     train(
         environment=envs.get_environment(env_name=args.env),
         num_timesteps=args.num_timesteps,
diff --git a/benchmarks/brax/voirfile.py b/benchmarks/brax/voirfile.py
index fce6f66d0..3397dcb31 100644
--- a/benchmarks/brax/voirfile.py
+++ b/benchmarks/brax/voirfile.py
@@ -20,10 +20,10 @@ class Config:
     skip: int = 5
 
     # Number of rates to log before stopping
-    stop: int = 20
+    stop: int = 60
 
     # Number of seconds between each gpu poll
-    gpu_poll: int = 3
+    gpu_poll: int = 1
 
 
 @configurable
diff --git a/benchmarks/lightning/main.py b/benchmarks/lightning/main.py
index b31f3880c..aca89ee47 100644
--- a/benchmarks/lightning/main.py
+++ b/benchmarks/lightning/main.py
@@ -40,7 +40,7 @@ def prepare_voir():
 
     observer = BenchObserver(
         accelerator.Event,
-        earlystop=65,
+        earlystop=100,
         batch_size_fn=lambda x: len(x[0]),
         raise_stop_program=False,
         stdout=True,
@@ -73,8 +73,6 @@ def main():
 
     model = TorchvisionLightning(model)
 
-
-
     accelerator.set_enable_tf32(True)
     observer, monitor = prepare_voir()
 
@@ -91,10 +89,10 @@ def main():
         enable_checkpointing=False,
         enable_progress_bar=False,
         reload_dataloaders_every_n_epochs=1,
-        max_steps=100
+        max_steps=120
     )
 
-    with monitor():
+    with monitor(poll_interval=0.1):
         trainer.fit(model=model, train_dataloaders=loader)
 
     print("finished: ", rank)
diff --git a/benchmarks/torchvision/voirfile.py b/benchmarks/torchvision/voirfile.py
index ed3f0af7c..a05c99774 100644
--- a/benchmarks/torchvision/voirfile.py
+++ b/benchmarks/torchvision/voirfile.py
@@ -24,7 +24,7 @@ class Config:
     stop: int = 20
 
     # Number of seconds between each gpu poll
-    gpu_poll: int = 3
+    gpu_poll: float = 1
 
 
 @configurable
diff --git a/benchmate/benchmate/monitor.py b/benchmate/benchmate/monitor.py
index 5d2624201..0fe8fe025 100644
--- a/benchmate/benchmate/monitor.py
+++ b/benchmate/benchmate/monitor.py
@@ -17,7 +17,7 @@
 
 
 @instrument_definition
-def monitor_monogpu(ov, poll_interval=10, arch=None):
+def monitor_monogpu(ov, poll_interval=1, arch=None):
     return monitor(
         ov,
         poll_interval=poll_interval,
@@ -28,7 +28,7 @@
 
 
 @instrument_definition
-def monitor_node(ov, poll_interval=10, arch=None):
+def monitor_node(ov, poll_interval=1, arch=None):
     return monitor(
         ov,
         poll_interval=poll_interval,
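Note: `gpu_poll` and `poll_interval` above are the number of seconds between
GPU metric samples, so lowering them from 10 (or 3) to 1 collects several
times more samples per run. Below is a minimal sketch of such a fixed-interval
poller, assuming a `callback` that records one sample; it is illustrative
only, not milabench's actual monitor implementation:

    import threading
    import time

    def start_polling(callback, poll_interval=1.0):
        """Run `callback()` every `poll_interval` seconds in a daemon thread."""
        stop = threading.Event()

        def loop():
            # Event.wait doubles as an interruptible sleep: it returns True
            # (ending the loop) once stop.set() is called.
            while not stop.wait(poll_interval):
                callback()

        threading.Thread(target=loop, daemon=True).start()
        return stop

    # Sample once per second, matching gpu_poll: 1 above.
    stop = start_polling(lambda: print(time.time()), poll_interval=1.0)
    time.sleep(3)
    stop.set()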
diff --git a/config/base.yaml b/config/base.yaml
index dd69a4954..730ef78d8 100644
--- a/config/base.yaml
+++ b/config/base.yaml
@@ -209,6 +209,11 @@ resnet50:
 
 resnet50-noio:
   inherits: _torchvision
+  voir:
+    options:
+      stop: 1000
+      interval: "1s"
+
   tags:
     - vision
     - classification
@@ -372,12 +377,15 @@ focalnet:
     --model: focalnet_base_lrf
 
 brax:
+  # Brax requires very specific batch sizes to work,
+  # so the resizer cannot resize this bench.
   inherits: _defaults
   tags:
     - rl
     - jax
     - multigpu
     - gym
+    - nobatch
  definition: ../benchmarks/brax
  group: brax
  install_group: torch
diff --git a/config/scaling.yaml b/config/scaling.yaml
index 9b1dc36eb..5fb30494e 100644
--- a/config/scaling.yaml
+++ b/config/scaling.yaml
@@ -55,9 +55,10 @@ bert-tf32-fp16:
     112: 81140.75 MiB
   optimized: 128
 bf16: {}
-brax:
-  args: --batch-size
-
+brax:
+  arg: --batch-size
+  model:
+    1024: 4912.25 MiB
 convnext_large-fp16:
   arg: --batch-size
   model:
@@ -191,14 +192,26 @@ dimenet:
 dinov2-giant-gpus:
   arg: train.batch_size_per_gpu={batch_size}
   model:
+    1: 32240.25 MiB
+    2: 32252.25 MiB
+    4: 32404.25 MiB
+    16: 38350.25 MiB
     32: 69614 MiB
   optimized: 32
-dinov2-giant-single:
-  arg: train.batch_size_per_gpu={batch_size}
 dinov2-giant-nodes:
   arg: train.batch_size_per_gpu={batch_size}
-
+dinov2-giant-single:
+  arg: train.batch_size_per_gpu={batch_size}
+  model:
+    1: 20682.25 MiB
+    2: 20682.25 MiB
+    4: 20682.25 MiB
+    16: 52748.25 MiB
+    32: 74544.25 MiB
 dlrm: {}
+dqn:
+  arg: --buffer_batch_size
+  optimized: 128
 focalnet:
   arg: --batch-size
   model:
@@ -222,6 +235,14 @@ fp16: {}
 fp32: {}
 lightning:
   arg: --batch-size
+  model:
+    1: 1054.25 MiB
+    2: 1054.25 MiB
+    4: 1856.25 MiB
+    16: 4728.25 MiB
+    32: 6352.25 MiB
+    64: 1856.25 MiB
+    128: 14818.25 MiB
 lightning-gpus:
   arg: --batch-size
   model:
@@ -233,18 +254,47 @@ lightning-gpus:
     128: 15858 MiB
   optimized: 16
 llama: {}
+llava-gpus:
+  arg: --batch_size
+  optimized: 1
+llava-single:
+  arg: --batch_size
+  optimized: 1
 llm-full-mp-gpus:
   arg: batch_size={batch_size}
+  model:
+    1: 48964.25 MiB
+    2: 49214.25 MiB
+    4: 51310.25 MiB
+    16: 81536.25 MiB
 llm-full-mp-nodes:
   arg: batch_size={batch_size}
+  model:
+    1: 37340.25 MiB
+    2: 38112.25 MiB
+    4: 39110.25 MiB
+    16: 80638.25 MiB
 llm-lora-ddp-gpus:
   arg: batch_size={batch_size}
   model:
     1: 12418.75 MiB
+    2: 19026.25 MiB
+    4: 25464.25 MiB
+    16: 55834.25 MiB
+    32: 80268.25 MiB
 llm-lora-ddp-nodes:
   arg: batch_size={batch_size}
+  model:
+    2: 17202.25 MiB
+    4: 23956.25 MiB
+    16: 59730.25 MiB
+    32: 68932.25 MiB
 llm-lora-mp-gpus:
   arg: batch_size={batch_size}
+  model:
+    2: 38166.25 MiB
+    4: 43464.25 MiB
+    16: 77116.25 MiB
 llm-lora-single:
   arg: batch_size={batch_size}
   model:
@@ -268,6 +318,9 @@ opt-6_7b-multinode:
   model:
     1: 55380 MiB
   optimized: 1
+ppo:
+  arg: --num_minibatches
+  optimized: 32
 recursiongfn:
   arg: --batch_size
   model:
@@ -382,6 +435,18 @@ resnet50:
   optimized: 64
 resnet50-noio:
   arg: --batch-size
+  model:
+    1: 1594.25 MiB
+    2: 1652.25 MiB
+    4: 1854.25 MiB
+    16: 3052.25 MiB
+    32: 4690.25 MiB
+rlhf-gpus:
+  arg: --per_device_train_batch_size
+  optimized: 64
+rlhf-single:
+  arg: --per_device_train_batch_size
+  optimized: 64
 rwkv:
   arg: --micro_bsz
   model:
@@ -432,6 +497,12 @@ torchatari:
     1: 1124.75 MiB
     2: 1138.75 MiB
     4: 1166.75 MiB
+vjepa-gpus:
+  arg: --batch_size
+  optimized: 24
+vjepa-single:
+  arg: --batch_size
+  optimized: 24
 whisper:
   arg: --batch-size
   model:
@@ -448,35 +519,3 @@ whisper:
     128: 71634.375 MiB
     144: 80412.75 MiB
   optimized: 128
-
-llava-single:
-  arg: --batch_size
-  optimized: 1
-
-llava-gpus:
-  arg: --batch_size
-  optimized: 1
-
-rlhf-single:
-  arg: --per_device_train_batch_size
-  optimized: 64
-
-rlhf-gpus:
-  arg: --per_device_train_batch_size
-  optimized: 64
-
-vjepa-single:
-  arg: --batch_size
-  optimized: 24
-
-vjepa-gpus:
-  arg: --batch_size
-  optimized: 24
-
-ppo:
-  arg: --num_minibatches
-  optimized: 32
-
-dqn:
-  arg: --buffer_batch_size
-  optimized: 128
\ No newline at end of file
diff --git a/milabench/cli/list.py b/milabench/cli/list.py
index bfba35f9c..fda73bdf5 100644
--- a/milabench/cli/list.py
+++ b/milabench/cli/list.py
@@ -42,7 +42,14 @@ def add_bench(k, tags):
         else:
             add_bench(k, tags)
 
-    print(",".join(missing_benches))
+
+
+    # Quote each name so the list can be pasted into a shell command.
+    b = [f"\"{bench}\"" for bench in missing_benches]
+
+
+
+    print(" ".join(b))
 
 
 if __name__ == "__main__":
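Note: the `model` tables added to config/scaling.yaml map a batch size to the
peak GPU memory observed at that size. The sketch below shows how such a table
could drive automatic batch-size selection; `pick_batch_size` is a
hypothetical helper for illustration, not milabench's actual sizer API:

    def pick_batch_size(model_table, capacity_mib, multiple=1):
        """Return the largest measured batch size whose observed peak
        memory (in MiB) fits under capacity_mib, rounded down to a
        multiple of `multiple`."""
        fitting = [bs for bs, mem in model_table.items() if mem <= capacity_mib]
        if not fitting:
            return None  # nothing fits at this capacity
        best = max(fitting)
        return max(multiple, best - best % multiple)

    # The `lightning` table from the diff above, against an 8000 MiB budget:
    lightning = {1: 1054.25, 2: 1054.25, 4: 1856.25, 16: 4728.25,
                 32: 6352.25, 64: 1856.25, 128: 14818.25}
    print(pick_batch_size(lightning, capacity_mib=8000))  # -> 64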
diff --git a/scripts/article/run_cuda.sh b/scripts/article/run_cuda.sh
index b7b31eed3..99ebae544 100644
--- a/scripts/article/run_cuda.sh
+++ b/scripts/article/run_cuda.sh
@@ -84,14 +84,30 @@ if [ "$MILABENCH_PREPARE" -eq 0 ]; then
     . $MILABENCH_WORDIR/env/bin/activate
 
+    # milabench install --system $MILABENCH_WORDIR/system.yaml
+    # milabench prepare --system $MILABENCH_WORDIR/system.yaml $ARGS
     # pip install torch
     # milabench pin --variant cuda --from-scratch $ARGS
     # milabench install --system $MILABENCH_WORDIR/system.yaml --force $ARGS
+    ARGS="--select resnet50-noio,brax,lightning,dinov2-giant-single,dinov2-giant-gpus,llm-lora-ddp-gpus,llm-lora-ddp-nodes,llm-lora-mp-gpus,llm-full-mp-gpus,llm-full-mp-nodes,dqn,ppo,dimenet,llava-single,rlhf-single,rlhf-gpus,vjepa-single,vjepa-gpus"
+
+    # MEMORY_CAPACITY=("4Go" "8Go" "16Go" "32Go" "64Go" "80Go")
+    # MILABENCH_SIZER_MULTIPLE=16
+    # MILABENCH_SIZER_CAPACITY="$CAPACITY"
+
+    MEMORY_CAPACITY=("1" "2" "4" "16" "32" "64" "128")
+
+    BENCHES=("dqn" "ppo" "dimenet" "llava-single" "rlhf-single" "rlhf-gpus" "vjepa-single" "vjepa-gpus")
 
     # # Run the benchmarks
-    milabench run --system $MILABENCH_WORDIR/system.yaml $ARGS
+    for BENCH in "${BENCHES[@]}"; do
+        for CAPACITY in "${MEMORY_CAPACITY[@]}"; do
+            export MILABENCH_SIZER_AUTO=1
+            export MILABENCH_SIZER_BATCH_SIZE=$CAPACITY
+            milabench run --run-name "$BENCH.bs$CAPACITY.{time}" --system $MILABENCH_WORDIR/system.yaml --select $BENCH --exclude lightning-gpus
+        done
+    done
 
     # # Display report
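Note: despite its name, MEMORY_CAPACITY above now holds candidate batch sizes
(the commented-out "4Go".."80Go" values were memory capacities); each loop
iteration pins MILABENCH_SIZER_BATCH_SIZE and runs one bench, which is how
per-batch-size memory figures like those in config/scaling.yaml can be
collected. A rough Python equivalent of the sweep, assuming milabench is on
PATH ({time} is expanded by milabench itself, not by this script):

    import itertools
    import os
    import subprocess

    BENCHES = ["dqn", "ppo", "dimenet", "llava-single",
               "rlhf-single", "rlhf-gpus", "vjepa-single", "vjepa-gpus"]
    BATCH_SIZES = [1, 2, 4, 16, 32, 64, 128]
    SYSTEM = os.path.expandvars("$MILABENCH_WORDIR/system.yaml")

    # One run per (bench, batch size) pair, exactly like the shell loop.
    for bench, size in itertools.product(BENCHES, BATCH_SIZES):
        env = dict(os.environ,
                   MILABENCH_SIZER_AUTO="1",
                   MILABENCH_SIZER_BATCH_SIZE=str(size))
        subprocess.run(["milabench", "run",
                        "--run-name", f"{bench}.bs{size}.{{time}}",
                        "--system", SYSTEM,
                        "--select", bench,
                        "--exclude", "lightning-gpus"],
                       env=env, check=True)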