Skip to content

Commit

Permalink
Add scaling config
Browse files Browse the repository at this point in the history
  • Loading branch information
pierre.delaunay committed Sep 18, 2024
1 parent ca31349 commit 397f127
Show file tree
Hide file tree
Showing 9 changed files with 121 additions and 50 deletions.
3 changes: 3 additions & 0 deletions benchmarks/brax/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -85,6 +85,9 @@ def run():

args = parser.parse_args()

# args.num_envs = (args.batch_size * args.num_minibatches)


train(
environment=envs.get_environment(env_name=args.env),
num_timesteps=args.num_timesteps,
Expand Down
4 changes: 2 additions & 2 deletions benchmarks/brax/voirfile.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,10 +20,10 @@ class Config:
skip: int = 5

# Number of rates to log before stopping
stop: int = 20
stop: int = 60

# Number of seconds between each gpu poll
gpu_poll: int = 3
gpu_poll: int = 1


@configurable
Expand Down
8 changes: 3 additions & 5 deletions benchmarks/lightning/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@ def prepare_voir():

observer = BenchObserver(
accelerator.Event,
earlystop=65,
earlystop=100,
batch_size_fn=lambda x: len(x[0]),
raise_stop_program=False,
stdout=True,
Expand Down Expand Up @@ -73,8 +73,6 @@ def main():

model = TorchvisionLightning(model)



accelerator.set_enable_tf32(True)

observer, monitor = prepare_voir()
Expand All @@ -91,10 +89,10 @@ def main():
enable_checkpointing=False,
enable_progress_bar=False,
reload_dataloaders_every_n_epochs=1,
max_steps=100
max_steps=120
)

with monitor():
with monitor(poll_interval=0.1):
trainer.fit(model=model, train_dataloaders=loader)
print("finished: ", rank)

Expand Down
2 changes: 1 addition & 1 deletion benchmarks/torchvision/voirfile.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ class Config:
stop: int = 20

# Number of seconds between each gpu poll
gpu_poll: int = 3
gpu_poll: float = 1


@configurable
Expand Down
4 changes: 2 additions & 2 deletions benchmate/benchmate/monitor.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@


@instrument_definition
def monitor_monogpu(ov, poll_interval=10, arch=None):
def monitor_monogpu(ov, poll_interval=1, arch=None):
return monitor(
ov,
poll_interval=poll_interval,
Expand All @@ -28,7 +28,7 @@ def monitor_monogpu(ov, poll_interval=10, arch=None):


@instrument_definition
def monitor_node(ov, poll_interval=10, arch=None):
def monitor_node(ov, poll_interval=1, arch=None):
return monitor(
ov,
poll_interval=poll_interval,
Expand Down
8 changes: 8 additions & 0 deletions config/base.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -209,6 +209,11 @@ resnet50:

resnet50-noio:
inherits: _torchvision
voir:
options:
stop: 1000
interval: "1s"

tags:
- vision
- classification
Expand Down Expand Up @@ -372,12 +377,15 @@ focalnet:
--model: focalnet_base_lrf

brax:
# Brax requires very specific sizes to work,
# so the resizer cannot resize this bench
inherits: _defaults
tags:
- rl
- jax
- multigpu
- gym
- nobatch
definition: ../benchmarks/brax
group: brax
install_group: torch
Expand Down
115 changes: 77 additions & 38 deletions config/scaling.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -55,9 +55,10 @@ bert-tf32-fp16:
112: 81140.75 MiB
optimized: 128
bf16: {}
brax:
args: --batch-size

brax:
arg: --batch-size
model:
1024: 4912.25 MiB
convnext_large-fp16:
arg: --batch-size
model:
Expand Down Expand Up @@ -191,14 +192,26 @@ dimenet:
dinov2-giant-gpus:
arg: train.batch_size_per_gpu={batch_size}
model:
1: 32240.25 MiB
2: 32252.25 MiB
4: 32404.25 MiB
16: 38350.25 MiB
32: 69614 MiB
optimized: 32
dinov2-giant-single:
arg: train.batch_size_per_gpu={batch_size}
dinov2-giant-nodes:
arg: train.batch_size_per_gpu={batch_size}

dinov2-giant-single:
arg: train.batch_size_per_gpu={batch_size}
model:
1: 20682.25 MiB
2: 20682.25 MiB
4: 20682.25 MiB
16: 52748.25 MiB
32: 74544.25 MiB
dlrm: {}
dqn:
arg: --buffer_batch_size
optimized: 128
focalnet:
arg: --batch-size
model:
Expand All @@ -222,6 +235,14 @@ fp16: {}
fp32: {}
lightning:
arg: --batch-size
model:
1: 1054.25 MiB
2: 1054.25 MiB
4: 1856.25 MiB
16: 4728.25 MiB
32: 6352.25 MiB
64: 1856.25 MiB
128: 14818.25 MiB
lightning-gpus:
arg: --batch-size
model:
Expand All @@ -233,18 +254,47 @@ lightning-gpus:
128: 15858 MiB
optimized: 16
llama: {}
llava-gpus:
arg: --batch_size
optimized: 1
llava-single:
arg: --batch_size
optimized: 1
llm-full-mp-gpus:
arg: batch_size={batch_size}
model:
1: 48964.25 MiB
2: 49214.25 MiB
4: 51310.25 MiB
16: 81536.25 MiB
llm-full-mp-nodes:
arg: batch_size={batch_size}
model:
1: 37340.25 MiB
2: 38112.25 MiB
4: 39110.25 MiB
16: 80638.25 MiB
llm-lora-ddp-gpus:
arg: batch_size={batch_size}
model:
1: 12418.75 MiB
2: 19026.25 MiB
4: 25464.25 MiB
16: 55834.25 MiB
32: 80268.25 MiB
llm-lora-ddp-nodes:
arg: batch_size={batch_size}
model:
2: 17202.25 MiB
4: 23956.25 MiB
16: 59730.25 MiB
32: 68932.25 MiB
llm-lora-mp-gpus:
arg: batch_size={batch_size}
model:
2: 38166.25 MiB
4: 43464.25 MiB
16: 77116.25 MiB
llm-lora-single:
arg: batch_size={batch_size}
model:
Expand All @@ -268,6 +318,9 @@ opt-6_7b-multinode:
model:
1: 55380 MiB
optimized: 1
ppo:
arg: --num_minibatches
optimized: 32
recursiongfn:
arg: --batch_size
model:
Expand Down Expand Up @@ -382,6 +435,18 @@ resnet50:
optimized: 64
resnet50-noio:
arg: --batch-size
model:
1: 1594.25 MiB
2: 1652.25 MiB
4: 1854.25 MiB
16: 3052.25 MiB
32: 4690.25 MiB
rlhf-gpus:
arg: --per_device_train_batch_size
optimized: 64
rlhf-single:
arg: --per_device_train_batch_size
optimized: 64
rwkv:
arg: --micro_bsz
model:
Expand Down Expand Up @@ -432,6 +497,12 @@ torchatari:
1: 1124.75 MiB
2: 1138.75 MiB
4: 1166.75 MiB
vjepa-gpus:
arg: --batch_size
optimized: 24
vjepa-single:
arg: --batch_size
optimized: 24
whisper:
arg: --batch-size
model:
Expand All @@ -448,35 +519,3 @@ whisper:
128: 71634.375 MiB
144: 80412.75 MiB
optimized: 128

llava-single:
arg: --batch_size
optimized: 1

llava-gpus:
arg: --batch_size
optimized: 1

rlhf-single:
arg: --per_device_train_batch_size
optimized: 64

rlhf-gpus:
arg: --per_device_train_batch_size
optimized: 64

vjepa-single:
arg: --batch_size
optimized: 24

vjepa-gpus:
arg: --batch_size
optimized: 24

ppo:
arg: --num_minibatches
optimized: 32

dqn:
arg: --buffer_batch_size
optimized: 128
9 changes: 8 additions & 1 deletion milabench/cli/list.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,14 @@ def add_bench(k, tags):
else:
add_bench(k, tags)

print(",".join(missing_benches))


b = [f"\"{b}\"" for b in missing_benches]




print(" ".join(b))


if __name__ == "__main__":
Expand Down
18 changes: 17 additions & 1 deletion scripts/article/run_cuda.sh
Original file line number Diff line number Diff line change
Expand Up @@ -84,14 +84,30 @@ if [ "$MILABENCH_PREPARE" -eq 0 ]; then

. $MILABENCH_WORDIR/env/bin/activate

# milabench install --system $MILABENCH_WORDIR/system.yaml
# milabench prepare --system $MILABENCH_WORDIR/system.yaml $ARGS

# pip install torch
# milabench pin --variant cuda --from-scratch $ARGS
# milabench install --system $MILABENCH_WORDIR/system.yaml --force $ARGS
ARGS="--select resnet50-noio,brax,lightning,dinov2-giant-single,dinov2-giant-gpus,llm-lora-ddp-gpus,llm-lora-ddp-nodes,llm-lora-mp-gpus,llm-full-mp-gpus,llm-full-mp-nodes,dqn,ppo,dimenet,llava-single,rlhf-single,rlhf-gpus,vjepa-single,vjepa-gpus"

# MEMORY_CAPACITY=("4Go" "8Go" "16Go" "32Go" "64Go" "80Go")
# MILABENCH_SIZER_MULTIPLE=16
# MILABENCH_SIZER_CAPACITY="$CAPACITY"

MEMORY_CAPACITY=("1" "2" "4" "16" "32" "64" "128")

BENCHES=("dqn" "ppo" "dimenet" "llava-single" "rlhf-single" "rlhf-gpus" "vjepa-single" "vjepa-gpus")
#
# Run the benchmarks
milabench run --system $MILABENCH_WORDIR/system.yaml $ARGS
for BENCH in "${BENCHES[@]}"; do
for CAPACITY in "${MEMORY_CAPACITY[@]}"; do
export MILABENCH_SIZER_AUTO=1
export MILABENCH_SIZER_BATCH_SIZE=$CAPACITY
milabench run --run-name "$BENCH.bs$CAPACITY.{time}" --system $MILABENCH_WORDIR/system.yaml --select $BENCH --exclude lightning-gpus
done
done

#
# Display report
Expand Down

0 comments on commit 397f127

Please sign in to comment.