Commit

Update cmd regression
pierre.delaunay committed Aug 27, 2024
1 parent 67df463 commit d60a9b4
Showing 7 changed files with 340 additions and 184 deletions.
112 changes: 75 additions & 37 deletions config/scaling.yaml
@@ -2,9 +2,10 @@ bert-fp16:
   arg: --batch-size
   model:
     1: 4108.75 MiB
+    2: 475.0 MiB
     4: 1840.375 MiB
     8: 8614.75 MiB
-    16: 14254.75 MiB
+    16: 475.0 MiB
     32: 24604.75 MiB
     40: 34157.9375 MiB
     64: 47212.375 MiB
@@ -16,9 +17,10 @@ bert-fp32:
   arg: --batch-size
   model:
     1: 4206.75 MiB
+    2: 475.0 MiB
     4: 6652.375 MiB
     8: 10240.75 MiB
-    16: 17646.75 MiB
+    16: 475.0 MiB
     24: 28007.9375 MiB
     32: 31568.75 MiB
     64: 61196.375 MiB
@@ -28,9 +30,10 @@ bert-tf32:
   arg: --batch-size
   model:
     1: 4204.75 MiB
+    2: 475.0 MiB
     4: 6654.375 MiB
     8: 10242.75 MiB
-    16: 17648.75 MiB
+    16: 475.0 MiB
     24: 28009.9375 MiB
     32: 31570.75 MiB
     64: 61198.375 MiB
@@ -40,9 +43,10 @@ bert-tf32-fp16:
   arg: --batch-size
   model:
     1: 4108.75 MiB
+    2: 475.0 MiB
     4: 1840.375 MiB
     8: 8614.75 MiB
-    16: 14254.75 MiB
+    16: 475.0 MiB
     32: 24604.75 MiB
     40: 34157.9375 MiB
     64: 47212.375 MiB
@@ -75,6 +79,7 @@ convnext_large-fp32:
   model:
     1: 3268.75 MiB
     2: 3480.375 MiB
+    4: 2060.75 MiB
     8: 5824.75 MiB
     16: 8774.75 MiB
     32: 14548.75 MiB
@@ -91,7 +96,7 @@ convnext_large-tf32:
     1: 3268.75 MiB
     2: 3480.375 MiB
     8: 5824.75 MiB
-    16: 8774.75 MiB
+    16: 1768.75 MiB
     32: 14548.75 MiB
     64: 26274.75 MiB
     72: 33081.9375 MiB
@@ -156,6 +161,29 @@ davit_large-multi:
     288: 65910.375 MiB
     328: 81742.75 MiB
   optimized: 128
+diffusion-gpus:
+  arg: --batch_size
+  model:
+    1: 23082 MiB
+    2: 21818.75 MiB
+    4: 23478.75 MiB
+    8: 26500.75 MiB
+    16: 36436.75 MiB
+    32: 57808 MiB
+    48: 80698 MiB
+  optimized: 32
+diffusion-nodes:
+  arg: --batch_size
+diffusion-single:
+  arg: --batch_size
+dimenet: {}
+dinov2-giant-gpus:
+  arg: train.batch_size_per_gpu={batch_size}
+  model:
+    32: 69614 MiB
+  optimized: 32
+dinov2-giant-single:
+  arg: train.batch_size_per_gpu={batch_size}
 dlrm: {}
 focalnet:
   arg: --batch-size
@@ -178,7 +206,31 @@ focalnet:
   optimized: 128
 fp16: {}
 fp32: {}
+lightning:
+  arg: --batch-size
+lightning-gpus:
+  arg: --batch-size
+  model:
+    1: 4542 MiB
+    2: 1158.75 MiB
+    4: 1156.75 MiB
+    8: 1260.75 MiB
+    16: 4150.75 MiB
+    128: 15858 MiB
+  optimized: 16
 llama: {}
+llm-full-mp-gpus:
+  arg: batch_size={batch_size}
+llm-full-mp-nodes:
+  arg: batch_size={batch_size}
+llm-lora-ddp-gpus:
+  arg: batch_size={batch_size}
+llm-lora-ddp-nodes:
+  arg: batch_size={batch_size}
+llm-lora-mp-gpus:
+  arg: batch_size={batch_size}
+llm-lora-single:
+  arg: batch_size={batch_size}
 opt-1_3b:
   arg: --per_gpu_batch_size
   model:
@@ -189,19 +241,22 @@ opt-1_3b-multinode:
   model:
     1: 42126 MiB
   optimized: 1
-opt-6_7b: {}
+opt-6_7b:
+  arg: --per_gpu_batch_size
 opt-6_7b-multinode:
   arg: --per_gpu_batch_size
   model:
     1: 55380 MiB
   optimized: 1
+recursiongfn:
+  arg: --batch_size
 reformer:
   arg: --batch-size
   model:
     1: 1916.75 MiB
     4: 3004.375 MiB
     8: 4512.75 MiB
-    16: 7486.75 MiB
+    16: 7082.75 MiB
     24: 10470.75 MiB
     32: 13454.75 MiB
     64: 25408.75 MiB
@@ -215,9 +270,10 @@ regnet_y_128gf:
   arg: --batch-size
   model:
     1: 6876.75 MiB
+    2: 475.0 MiB
     4: 9062.375 MiB
     8: 8524.75 MiB
-    16: 11426.75 MiB
+    16: 1234.75 MiB
     24: 18523.9375 MiB
     32: 18324.75 MiB
     56: 31165.9375 MiB
@@ -248,7 +304,10 @@ resnet152:
     576: 58588.375 MiB
     640: 81354.75 MiB
   optimized: 128
-resnet152-ddp: {}
+resnet152-ddp:
+  arg: --batch-size
+resnet152-ddp-gpus:
+  arg: --batch-size
 resnet152-multi:
   arg: --batch-size
   model:
@@ -294,7 +353,8 @@ resnet50:
     1552: 81146.75 MiB
     1560: 81590.75 MiB
   optimized: 64
-resnet50-noio: {}
+resnet50-noio:
+  arg: --batch-size
 rwkv:
   arg: --micro_bsz
   model:
@@ -317,9 +377,10 @@ super-slomo:
   arg: --train_batch_size
   model:
     1: 3016.75 MiB
+    2: 3506.75 MiB
     4: 5884.375 MiB
     8: 10288.75 MiB
-    16: 18718.75 MiB
+    16: 16914.75 MiB
     24: 29777.9375 MiB
     32: 33934.375 MiB
     56: 61837.9375 MiB
@@ -333,11 +394,13 @@ t5:
     2: 6384.375 MiB
     4: 10620.375 MiB
     8: 18684.75 MiB
-    16: 35448.75 MiB
+    16: 33990.75 MiB
     24: 54479.9375 MiB
     32: 66760.375 MiB
   optimized: 128
 tf32: {}
+torchatari:
+  arg: --num-steps
 whisper:
   arg: --batch-size
   model:
     128: 71634.375 MiB
     144: 80412.75 MiB
   optimized: 128
-
-
-diffusion-gpus:
-  arg: --batch_size
-  model:
-    1: 23082 MiB
-    16: 37778 MiB
-    32: 57808 MiB
-    48: 80698 MiB
-  optimized: 32
-
-
-lightning-gpus:
-  arg: --batch-size
-  model:
-    1: 4542 MiB
-    16: 5692 MiB
-    128: 15858 MiB
-  optimized: 16
-
-dinov2-giant-gpus:
-  arg: train.batch_size_per_gpu={batch_size}
-  model:
-    32: 69614 MiB
-  optimized: 32
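
Each scaling.yaml entry maps a benchmark to the command-line argument that controls its batch size (arg), a model table of observed GPU memory per batch size, and a default optimized batch size; the trailing diffusion-gpus, lightning-gpus, and dinov2-giant-gpus entries deleted at the end of the file reappear earlier in alphabetical order with refreshed measurements. As an illustration of how such a table can be consumed, here is a minimal sketch, not part of this commit, assuming PyYAML and a checkout where config/scaling.yaml exists:

import yaml

# Minimal sketch (not part of this commit): pick the largest batch size in a
# scaling.yaml entry whose measured footprint fits a GPU memory budget.
def max_fitting_batch(entry, budget_mib):
    # entry["model"] maps batch size -> strings such as "4108.75 MiB"
    fitting = [
        batch
        for batch, usage in entry.get("model", {}).items()
        if float(str(usage).split()[0]) <= budget_mib
    ]
    return max(fitting) if fitting else None

with open("config/scaling.yaml") as f:
    scaling = yaml.safe_load(f)

entry = scaling["bert-fp16"]
print(entry["arg"], max_fitting_batch(entry, budget_mib=24000))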
6 changes: 3 additions & 3 deletions milabench/_version.py
@@ -1,5 +1,5 @@
 """This file is generated, do not modify"""
 
-__tag__ = "v0.1.0-42-gab24a2dc"
-__commit__ = "ab24a2dc21c2f22c40d85bfa935e0ab07ae30dd9"
-__date__ = "2024-08-20 10:02:58 -0400"
+__tag__ = "v0.1.0-42-g67df463d"
+__commit__ = "67df463d99bbee50087e84cbbdb547f55739f35e"
+__date__ = "2024-08-23 10:29:13 -0400"
8 changes: 7 additions & 1 deletion milabench/system.py
@@ -385,7 +385,13 @@ def _resolve_addresses(nodes):
 
 def gethostname(host):
     try:
-        return subprocess.check_output(["ssh", host, "cat", "/etc/hostname"], text=True).strip()
+        # "-oCheckHostIP=no",
+        # "-oPasswordAuthentication=no",
+        return subprocess.check_output([
+            "ssh",
+            "-oCheckHostIP=no",
+            "-oPasswordAuthentication=no",
+            "-oStrictHostKeyChecking=no", host, "cat", "/etc/hostname"], text=True).strip()
     except:
         print("Could not resolve hostname")
         return host
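
The added ssh options keep hostname resolution non-interactive: -oStrictHostKeyChecking=no avoids the host-key prompt on first contact with a node, -oPasswordAuthentication=no makes the call fail fast instead of hanging on a password prompt when key-based login is unavailable, and -oCheckHostIP=no tolerates known-hosts IP mismatches. A hypothetical usage sketch, assuming milabench is installed and "node1" is a reachable host that accepts key-based SSH:

# Hypothetical usage sketch; the helper above falls back to the input
# name and prints a warning whenever the SSH call fails.
from milabench.system import gethostname

print(gethostname("node1"))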
10 changes: 7 additions & 3 deletions scripts/article/run_cuda.sh
@@ -19,6 +19,8 @@ else
     export MILABENCH_CONFIG="$MILABENCH_SOURCE/config/standard.yaml"
 fi
 
+ARGS="$@"
+
 install_prepare() {
     mkdir -p $MILABENCH_WORDIR
     cd $MILABENCH_WORDIR
@@ -38,10 +40,12 @@ install_prepare() {
 
     pip install -e $MILABENCH_SOURCE
 
+    milabench slurm_system > $MILABENCH_WORDIR/system.yaml
+
     #
     # Install milabench's benchmarks in their venv
     #
-    milabench install "$@"
+    milabench install --system $MILABENCH_WORDIR/system.yaml $ARGS
 
     which pip
 
@@ -58,7 +62,7 @@ install_prepare() {
 
     #
     # Generate/download datasets, download models etc...
-    milabench prepare "$@"
+    milabench prepare --system $MILABENCH_WORDIR/system.yaml $ARGS
 }
 
 module load cuda/12.3.2
@@ -76,7 +80,7 @@ if [ "$MILABENCH_PREPARE" -eq 0 ]; then
 
     #
     # Run the benchmakrs
-    milabench run "$@"
+    milabench run --system $MILABENCH_WORDIR/system.yaml "$@"
 
     #
     # Display report
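
Taken together, the run_cuda.sh changes capture the script's arguments once in ARGS (inside the install_prepare function, "$@" would otherwise refer to the function's own arguments), generate system.yaml once via milabench slurm_system, and pass the same --system file to install, prepare, and run so that all three phases see an identical node topology.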
33 changes: 33 additions & 0 deletions scripts/article/run_update_batch_size.sh
@@ -0,0 +1,33 @@
+
+
+
+
+export MILABENCH_SIZER_AUTO=1
+export MILABENCH_SIZER_BATCH_SIZE=1
+FINAL_OUTPUT="$HOME/batch_x_worker"
+export MILABENCH_SIZER_SAVE="$FINAL_OUTPUT/scaling.yaml"
+milabench run --system $MILABENCH_WORDIR/system.yaml --exclude llama
+
+export MILABENCH_SIZER_AUTO=1
+export MILABENCH_SIZER_BATCH_SIZE=2
+FINAL_OUTPUT="$HOME/batch_x_worker"
+export MILABENCH_SIZER_SAVE="$FINAL_OUTPUT/scaling.yaml"
+milabench run --system $MILABENCH_WORDIR/system.yaml --exclude llama
+
+export MILABENCH_SIZER_AUTO=1
+export MILABENCH_SIZER_BATCH_SIZE=4
+FINAL_OUTPUT="$HOME/batch_x_worker"
+export MILABENCH_SIZER_SAVE="$FINAL_OUTPUT/scaling.yaml"
+milabench run --system $MILABENCH_WORDIR/system.yaml --exclude llama
+
+export MILABENCH_SIZER_AUTO=1
+export MILABENCH_SIZER_BATCH_SIZE=8
+FINAL_OUTPUT="$HOME/batch_x_worker"
+export MILABENCH_SIZER_SAVE="$FINAL_OUTPUT/scaling.yaml"
+milabench run --system $MILABENCH_WORDIR/system.yaml --exclude llama
+
+export MILABENCH_SIZER_AUTO=1
+export MILABENCH_SIZER_BATCH_SIZE=16
+FINAL_OUTPUT="$HOME/batch_x_worker"
+export MILABENCH_SIZER_SAVE="$FINAL_OUTPUT/scaling.yaml"
+milabench run --system $MILABENCH_WORDIR/system.yaml --exclude llama
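
The new script repeats one block per batch size (1, 2, 4, 8, and 16), changing only MILABENCH_SIZER_BATCH_SIZE while saving to the same scaling.yaml path. The same sweep could be driven by a loop; a minimal sketch, assuming MILABENCH_WORDIR is set and milabench is on PATH, written in Python to match the rest of the project:

import os
import subprocess

# Mirrors run_update_batch_size.sh with a loop instead of repeated blocks.
final_output = os.path.expanduser("~/batch_x_worker")

for batch_size in (1, 2, 4, 8, 16):
    env = dict(
        os.environ,
        MILABENCH_SIZER_AUTO="1",
        MILABENCH_SIZER_BATCH_SIZE=str(batch_size),
        MILABENCH_SIZER_SAVE=os.path.join(final_output, "scaling.yaml"),
    )
    subprocess.run(
        ["milabench", "run",
         "--system", os.path.join(os.environ["MILABENCH_WORDIR"], "system.yaml"),
         "--exclude", "llama"],
        env=env,
        check=False,  # keep sweeping even if one batch size fails
    )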