From d5cbbf50dfffd05ccda970f6c3e12cd2599bf339 Mon Sep 17 00:00:00 2001 From: Pierre Delaunay Date: Thu, 21 Nov 2024 17:07:45 -0500 Subject: [PATCH] Update README --- README.md | 143 +++-- milabench/_version.py | 6 +- scripts/article/run_cuda.sh | 9 +- .../test_command_reg_one_node.txt | 604 ----------------- .../test_command_reg_two_nodes.txt | 607 ------------------ .../test_capabilities.py | 0 6 files changed, 90 insertions(+), 1279 deletions(-) delete mode 100644 tests/test_command_reg/test_command_reg_one_node.txt delete mode 100644 tests/test_command_reg/test_command_reg_two_nodes.txt rename tests/{ => test_validation}/test_capabilities.py (100%) diff --git a/README.md b/README.md index 526398938..163906d02 100644 --- a/README.md +++ b/README.md @@ -20,62 +20,23 @@ evaluating current and future hardware in a research environment. * Focussed on training * Ease of use * Pytorch focused -* ROCm & NVIDIA +* ROCm, NVIDIA, Intel OneAPI, Habana Gaudi (Synapse) * Independent ## Getting Started -The easiest way to run milabbench is to run it with one of its docker image. -It will include all of the necessary data - - - # Choose the image you want to use - export MILABENCH_IMAGE=ghcr.io/mila-iqia/milabench:cuda-nightly - - # Pull the image we are going to run - docker pull $MILABENCH_IMAGE - - # Run milabench - docker run -it --rm --ipc=host --gpus=all \ - -v $(pwd)/results:/milabench/envs/runs \ - $MILABENCH_IMAGE \ - bash -c "milabench prepare && milabench run" - - ================= - Benchmark results - ================= - fail n perf sem% std% peak_memory score weight - bert-fp16 0 8 155.08 0.3% 4.3% 24552 1241.260310 0.00 - bert-fp32 0 8 29.52 0.0% 0.5% 31524 236.337218 0.00 - bert-tf32 0 8 120.46 0.4% 6.1% 31524 964.713297 0.00 - bert-tf32-fp16 0 8 154.76 0.3% 4.1% 24552 1238.477257 3.00 - convnext_large-fp16 0 8 337.48 0.9% 14.0% 27658 2741.604444 0.00 - convnext_large-fp32 0 8 44.61 0.8% 12.6% 49786 354.207225 0.00 - convnext_large-tf32 0 8 135.99 0.7% 11.2% 49786 1089.394916 0.00 - convnext_large-tf32-fp16 0 8 338.58 0.8% 13.0% 27658 2744.325170 3.00 - davit_large 0 8 312.79 0.3% 6.7% 35058 2515.326450 1.00 - davit_large-multi 0 1 2401.65 1.0% 7.7% 42232 2401.651720 5.00 - dlrm 0 1 188777.20 1.8% 14.0% 3194 188777.203190 1.00 - focalnet 0 8 400.47 0.2% 5.4% 26604 3215.431924 2.00 - opt-1_3b 0 1 26.71 0.1% 0.4% 44116 26.714365 5.00 - opt-1_3b-multinode 0 2 34.62 0.2% 1.0% 43552 34.618292 10.00 - opt-6_7b 0 1 14.32 0.0% 0.1% 55750 14.319587 5.00 - opt-6_7b-multinode 0 2 10.79 0.1% 0.7% 49380 10.792595 10.00 - reformer 0 8 61.70 0.0% 0.9% 25376 494.110834 1.00 - regnet_y_128gf 0 8 99.96 0.2% 5.0% 31840 803.012507 2.00 - resnet152 0 8 710.18 0.3% 6.2% 36732 5710.828608 1.00 - resnet152-multi 0 1 5367.34 1.0% 8.1% 38638 5367.338469 5.00 - resnet50 0 8 984.43 0.9% 19.1% 5026 7927.257351 1.00 - rwkv 0 8 428.65 0.2% 3.8% 5546 3435.097716 1.00 - stargan 0 8 51.32 1.8% 40.8% 37848 413.238870 1.00 - super-slomo 0 8 41.63 0.1% 2.3% 34082 332.395065 1.00 - t5 0 8 48.05 0.2% 3.9% 35466 384.317023 2.00 - whisper 0 8 248.16 0.0% 0.6% 37006 1985.861017 1.00 - - Scores - ------ - Failure rate: 0.00% (PASS) - Score: 219.06 + + git clone https://github.com/mila-iqia/milabench.git + + pip install -e milabench + + export MILABENCH_GPU_ARCH=cuda + + milabench install --base workspace --config milabench/config/standard.yaml --select fp32 + + milabench prepare --base workspace --config milabench/config/standard.yaml --select fp32 + + milabench run --base workspace --config milabench/config/standard.yaml --select fp32 ## Details @@ -84,13 +45,77 @@ The benchmark suite has been validated on the following configurations: | Python version | GPU | Configuration file | | - | - | - | -| 3.10 (conda) | 2 node x 8xNVIDIA A100 80GB | config/standard.yaml | -| 3.9.12 (conda) | 8x NVIDIA RTX8000 48GB | config/standard.yaml | -| 3.9.16 (conda) | 2x NVIDIA K80 | config/ci.yaml | -| 3.9.16 (conda) | 2x AMD MI100 | config/ci.yaml | -| 3.9.16 (conda) | 4x AMD MI250 | config/standard.yaml | +| 3.10 | 2 node x 8xNVIDIA A100 80GB | config/standard.yaml | +| 3.10 | 2 node x 8xMI300X | config/standard.yaml | +| 3.10 | 1 node x 8xGaudi2 | config/standard.yaml | We are working on validating it on more configurations and will update the above table as we do. - - +## Report + + ================= + Benchmark results + ================= + + System + ------ + cpu: AMD EPYC 7742 64-Core Processor + n_cpu: 128 + product: NVIDIA A100-SXM4-80GB + n_gpu: 8 + memory: 81920.0 + + Breakdown + --------- + bench | fail | n | ngpu | perf | sem% | std% | peak_memory | score | weight + brax | 0 | 1 | 8 | 730035.71 | 0.1% | 0.4% | 2670 | 730035.71 | 1.00 + diffusion-gpus | 0 | 1 | 8 | 117.67 | 1.5% | 11.7% | 59944 | 117.67 | 1.00 + diffusion-single | 0 | 8 | 1 | 25.02 | 0.8% | 17.9% | 53994 | 202.10 | 1.00 + dimenet | 0 | 8 | 1 | 366.85 | 0.7% | 16.2% | 2302 | 2973.32 | 1.00 + dinov2-giant-gpus | 0 | 1 | 8 | 445.68 | 0.4% | 3.0% | 69614 | 445.68 | 1.00 + dinov2-giant-single | 0 | 8 | 1 | 53.54 | 0.4% | 9.5% | 74646 | 432.65 | 1.00 + dqn | 0 | 8 | 1 | 23089954554.91 | 1.1% | 89.9% | 62106 | 184480810548.20 | 1.00 + bf16 | 0 | 8 | 1 | 293.43 | 0.2% | 6.3% | 1788 | 2361.16 | 0.00 + fp16 | 0 | 8 | 1 | 289.26 | 0.1% | 3.6% | 1788 | 2321.65 | 0.00 + fp32 | 0 | 8 | 1 | 19.14 | 0.0% | 0.7% | 2166 | 153.21 | 0.00 + tf32 | 0 | 8 | 1 | 146.63 | 0.1% | 3.6% | 2166 | 1177.04 | 0.00 + bert-fp16 | 0 | 8 | 1 | 263.73 | 1.1% | 16.7% | nan | 2165.37 | 0.00 + bert-fp32 | 0 | 8 | 1 | 44.84 | 0.6% | 9.6% | 21170 | 364.52 | 0.00 + bert-tf32 | 0 | 8 | 1 | 141.95 | 0.9% | 14.1% | 1764 | 1162.94 | 0.00 + bert-tf32-fp16 | 0 | 8 | 1 | 265.04 | 1.0% | 15.6% | nan | 2175.59 | 3.00 + reformer | 0 | 8 | 1 | 62.29 | 0.3% | 6.0% | 25404 | 501.89 | 1.00 + t5 | 0 | 8 | 1 | 51.40 | 0.5% | 9.9% | 34390 | 416.14 | 2.00 + whisper | 0 | 8 | 1 | 481.95 | 1.0% | 21.4% | 8520 | 3897.53 | 1.00 + lightning | 0 | 8 | 1 | 680.22 | 1.0% | 22.7% | 27360 | 5506.90 | 1.00 + lightning-gpus | 0 | 1 | 8 | 3504.74 | 7.9% | 62.9% | 28184 | 3504.74 | 1.00 + llava-single | 1 | 8 | 1 | 2.28 | 0.4% | 9.6% | 72556 | 14.12 | 1.00 + llama | 0 | 8 | 1 | 484.86 | 4.4% | 80.0% | 27820 | 3680.86 | 1.00 + llm-full-mp-gpus | 0 | 1 | 8 | 193.92 | 3.1% | 16.2% | 48470 | 193.92 | 1.00 + llm-lora-ddp-gpus | 0 | 1 | 8 | 16738.58 | 0.4% | 2.0% | 36988 | 16738.58 | 1.00 + llm-lora-mp-gpus | 0 | 1 | 8 | 1980.63 | 2.2% | 11.8% | 55972 | 1980.63 | 1.00 + llm-lora-single | 0 | 8 | 1 | 2724.95 | 0.2% | 3.0% | 49926 | 21861.99 | 1.00 + ppo | 0 | 8 | 1 | 3114264.32 | 1.6% | 57.2% | 62206 | 24915954.98 | 1.00 + recursiongfn | 0 | 8 | 1 | 7080.67 | 1.2% | 27.1% | 10292 | 57038.34 | 1.00 + rlhf-gpus | 0 | 1 | 8 | 6314.94 | 2.1% | 11.2% | 21730 | 6314.94 | 1.00 + rlhf-single | 0 | 8 | 1 | 1143.72 | 0.4% | 8.4% | 19566 | 9174.52 | 1.00 + focalnet | 0 | 8 | 1 | 375.07 | 0.7% | 14.9% | 23536 | 3038.83 | 2.00 + torchatari | 0 | 8 | 1 | 5848.88 | 0.6% | 12.7% | 3834 | 46613.34 | 1.00 + convnext_large-fp16 | 0 | 8 | 1 | 330.93 | 1.5% | 22.9% | 27376 | 2711.46 | 0.00 + convnext_large-fp32 | 0 | 8 | 1 | 59.49 | 0.6% | 9.8% | 55950 | 483.84 | 0.00 + convnext_large-tf32 | 0 | 8 | 1 | 155.41 | 0.9% | 14.3% | 49650 | 1273.31 | 0.00 + convnext_large-tf32-fp16 | 0 | 8 | 1 | 322.28 | 1.6% | 24.5% | 27376 | 2637.88 | 3.00 + regnet_y_128gf | 0 | 8 | 1 | 119.46 | 0.5% | 10.0% | 29762 | 966.96 | 2.00 + resnet152-ddp-gpus | 0 | 1 | 8 | 3843.06 | 5.2% | 39.3% | 27980 | 3843.06 | 0.00 + resnet50 | 0 | 8 | 1 | 932.95 | 2.4% | 52.2% | 14848 | 7524.25 | 1.00 + resnet50-noio | 0 | 8 | 1 | 1163.88 | 0.3% | 6.7% | 27480 | 9385.35 | 0.00 + vjepa-gpus | 0 | 1 | 8 | 130.13 | 5.9% | 46.8% | 64244 | 130.13 | 1.00 + vjepa-single | 0 | 8 | 1 | 21.29 | 1.0% | 22.4% | 58552 | 172.11 | 1.00 + + Scores + ------ + Failure rate: 0.38% (PASS) + Score: 4175.57 + + Errors + ------ + 1 errors, details in HTML report. \ No newline at end of file diff --git a/milabench/_version.py b/milabench/_version.py index a3f4e1b45..281e1d0af 100644 --- a/milabench/_version.py +++ b/milabench/_version.py @@ -1,5 +1,5 @@ """This file is generated, do not modify""" -__tag__ = "v1.0.0_RC1-18-g784b38e" -__commit__ = "784b38e77b90116047e3de893c22c2f7d3225179" -__date__ = "2024-10-18 15:58:46 +0000" +__tag__ = "v0.1.0-146-ga8415d3" +__commit__ = "a8415d3da9f91aa1ac23d932dff2c70fe580e556" +__date__ = "2024-11-21 14:35:55 -0500" diff --git a/scripts/article/run_cuda.sh b/scripts/article/run_cuda.sh index 0c2c1dae0..9ef13b7d3 100644 --- a/scripts/article/run_cuda.sh +++ b/scripts/article/run_cuda.sh @@ -84,15 +84,12 @@ if [ "$MILABENCH_PREPARE" -eq 0 ]; then . $MILABENCH_WORDIR/env/bin/activate - - # pip install torch # milabench pin --variant cuda --from-scratch # rm -rf $MILABENCH_WORDIR/results/venv/ - rm -rf $MILABENCH_WORDIR/results/extra - - milabench install --system $MILABENCH_WORDIR/system.yaml - milabench prepare --system $MILABENCH_WORDIR/system.yaml $ARGS + # rm -rf $MILABENCH_WORDIR/results/extra + # milabench install --system $MILABENCH_WORDIR/system.yaml + # milabench prepare --system $MILABENCH_WORDIR/system.yaml $ARGS ( . $BENCHMARK_VENV/bin/activate diff --git a/tests/test_command_reg/test_command_reg_one_node.txt b/tests/test_command_reg/test_command_reg_one_node.txt deleted file mode 100644 index 3a511bb65..000000000 --- a/tests/test_command_reg/test_command_reg_one_node.txt +++ /dev/null @@ -1,604 +0,0 @@ -#!/bin/sh - -echo "---" -echo "Virtual Env" -echo "===========" -export VIRTUAL_ENV=$BASE/venv/torch - -source $VIRTUAL_ENV/bin/activate -echo "---" -echo "Milabench" -echo "=========" -export MILABENCH_DIR_BASE=$BASE -export MILABENCH_DIR_VENV=$BASE/venv/torch -export MILABENCH_DIR_DATA=$BASE/data -export MILABENCH_DIR_RUNS=$BASE/runs -export MILABENCH_DIR_EXTRA=$BASE/extra/llm -export MILABENCH_DIR_CACHE=$BASE/cache -export OMP_NUM_THREADS=0 -export MILABENCH_CONFIG='{"system": {"arch": "cuda", "sshkey": null, "nodes": [{"ip": "127.0.0.1", "main": true, "name": "0", "sshport": 22, "user": "username", "hostname": "127.0.0.1"}], "self": {"ip": "127.0.0.1", "main": true, "name": "0", "sshport": 22, "user": "username", "hostname": "127.0.0.1"}}, "dirs": {"base": "$BASE", "venv": "$BASE/venv/torch", "data": "$BASE/data", "runs": "$BASE/runs", "extra": "$BASE/extra/llm", "cache": "$BASE/cache"}, "group": "llm", "install_group": "torch", "install_variant": "cuda", "run_name": "dev", "enabled": true, "capabilities": {"nodes": 1}, "max_duration": 800, "voir": {"options": {"stop": 30, "interval": "1s"}}, "validation": {"usage": {"gpu_load_threshold": 0.5, "gpu_mem_threshold": 0.5}}, "config_base": "$SRC/milabench/config", "config_file": "$SRC/milabench/config/standard.yaml", "definition": "$SRC/milabench/benchmarks/llama", "tags": ["inference", "llm", "monogpu", "nlp", "nobatch"], "plan": {"method": "per_gpu"}, "weight": 1.0, "name": "llama", "tag": ["llama"]}' - -echo "---" -echo "llama" -echo "=====" -time ( - CUDA_VISIBLE_DEVICES=0 python $SRC/milabench/benchmarks/llama/main.py --cache $BASE/cache & - CUDA_VISIBLE_DEVICES=1 python $SRC/milabench/benchmarks/llama/main.py --cache $BASE/cache & - CUDA_VISIBLE_DEVICES=2 python $SRC/milabench/benchmarks/llama/main.py --cache $BASE/cache & - CUDA_VISIBLE_DEVICES=3 python $SRC/milabench/benchmarks/llama/main.py --cache $BASE/cache & - CUDA_VISIBLE_DEVICES=4 python $SRC/milabench/benchmarks/llama/main.py --cache $BASE/cache & - CUDA_VISIBLE_DEVICES=5 python $SRC/milabench/benchmarks/llama/main.py --cache $BASE/cache & - CUDA_VISIBLE_DEVICES=6 python $SRC/milabench/benchmarks/llama/main.py --cache $BASE/cache & - CUDA_VISIBLE_DEVICES=7 python $SRC/milabench/benchmarks/llama/main.py --cache $BASE/cache & - wait -) - -echo "---" -echo "fp16" -echo "====" -time ( - CUDA_VISIBLE_DEVICES=0 $SRC/milabench/milabench/scripts/activator $BASE/venv/torch $BASE/cache $SRC/milabench/benchmarks/flops/main.py --number 30 --repeat 90 --m 8192 --n 8192 --dtype fp16 & - CUDA_VISIBLE_DEVICES=1 $SRC/milabench/milabench/scripts/activator $BASE/venv/torch $BASE/cache $SRC/milabench/benchmarks/flops/main.py --number 30 --repeat 90 --m 8192 --n 8192 --dtype fp16 & - CUDA_VISIBLE_DEVICES=2 $SRC/milabench/milabench/scripts/activator $BASE/venv/torch $BASE/cache $SRC/milabench/benchmarks/flops/main.py --number 30 --repeat 90 --m 8192 --n 8192 --dtype fp16 & - CUDA_VISIBLE_DEVICES=3 $SRC/milabench/milabench/scripts/activator $BASE/venv/torch $BASE/cache $SRC/milabench/benchmarks/flops/main.py --number 30 --repeat 90 --m 8192 --n 8192 --dtype fp16 & - CUDA_VISIBLE_DEVICES=4 $SRC/milabench/milabench/scripts/activator $BASE/venv/torch $BASE/cache $SRC/milabench/benchmarks/flops/main.py --number 30 --repeat 90 --m 8192 --n 8192 --dtype fp16 & - CUDA_VISIBLE_DEVICES=5 $SRC/milabench/milabench/scripts/activator $BASE/venv/torch $BASE/cache $SRC/milabench/benchmarks/flops/main.py --number 30 --repeat 90 --m 8192 --n 8192 --dtype fp16 & - CUDA_VISIBLE_DEVICES=6 $SRC/milabench/milabench/scripts/activator $BASE/venv/torch $BASE/cache $SRC/milabench/benchmarks/flops/main.py --number 30 --repeat 90 --m 8192 --n 8192 --dtype fp16 & - CUDA_VISIBLE_DEVICES=7 $SRC/milabench/milabench/scripts/activator $BASE/venv/torch $BASE/cache $SRC/milabench/benchmarks/flops/main.py --number 30 --repeat 90 --m 8192 --n 8192 --dtype fp16 & - wait -) - -echo "---" -echo "bf16" -echo "====" -time ( - CUDA_VISIBLE_DEVICES=0 $SRC/milabench/milabench/scripts/activator $BASE/venv/torch $BASE/cache $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype bf16 & - CUDA_VISIBLE_DEVICES=1 $SRC/milabench/milabench/scripts/activator $BASE/venv/torch $BASE/cache $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype bf16 & - CUDA_VISIBLE_DEVICES=2 $SRC/milabench/milabench/scripts/activator $BASE/venv/torch $BASE/cache $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype bf16 & - CUDA_VISIBLE_DEVICES=3 $SRC/milabench/milabench/scripts/activator $BASE/venv/torch $BASE/cache $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype bf16 & - CUDA_VISIBLE_DEVICES=4 $SRC/milabench/milabench/scripts/activator $BASE/venv/torch $BASE/cache $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype bf16 & - CUDA_VISIBLE_DEVICES=5 $SRC/milabench/milabench/scripts/activator $BASE/venv/torch $BASE/cache $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype bf16 & - CUDA_VISIBLE_DEVICES=6 $SRC/milabench/milabench/scripts/activator $BASE/venv/torch $BASE/cache $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype bf16 & - CUDA_VISIBLE_DEVICES=7 $SRC/milabench/milabench/scripts/activator $BASE/venv/torch $BASE/cache $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype bf16 & - wait -) - -echo "---" -echo "tf32" -echo "====" -time ( - CUDA_VISIBLE_DEVICES=0 $SRC/milabench/milabench/scripts/activator $BASE/venv/torch $BASE/cache $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype fp32 --tf32 & - CUDA_VISIBLE_DEVICES=1 $SRC/milabench/milabench/scripts/activator $BASE/venv/torch $BASE/cache $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype fp32 --tf32 & - CUDA_VISIBLE_DEVICES=2 $SRC/milabench/milabench/scripts/activator $BASE/venv/torch $BASE/cache $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype fp32 --tf32 & - CUDA_VISIBLE_DEVICES=3 $SRC/milabench/milabench/scripts/activator $BASE/venv/torch $BASE/cache $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype fp32 --tf32 & - CUDA_VISIBLE_DEVICES=4 $SRC/milabench/milabench/scripts/activator $BASE/venv/torch $BASE/cache $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype fp32 --tf32 & - CUDA_VISIBLE_DEVICES=5 $SRC/milabench/milabench/scripts/activator $BASE/venv/torch $BASE/cache $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype fp32 --tf32 & - CUDA_VISIBLE_DEVICES=6 $SRC/milabench/milabench/scripts/activator $BASE/venv/torch $BASE/cache $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype fp32 --tf32 & - CUDA_VISIBLE_DEVICES=7 $SRC/milabench/milabench/scripts/activator $BASE/venv/torch $BASE/cache $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype fp32 --tf32 & - wait -) - -echo "---" -echo "fp32" -echo "====" -time ( - CUDA_VISIBLE_DEVICES=0 $SRC/milabench/milabench/scripts/activator $BASE/venv/torch $BASE/cache $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype fp32 & - CUDA_VISIBLE_DEVICES=1 $SRC/milabench/milabench/scripts/activator $BASE/venv/torch $BASE/cache $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype fp32 & - CUDA_VISIBLE_DEVICES=2 $SRC/milabench/milabench/scripts/activator $BASE/venv/torch $BASE/cache $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype fp32 & - CUDA_VISIBLE_DEVICES=3 $SRC/milabench/milabench/scripts/activator $BASE/venv/torch $BASE/cache $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype fp32 & - CUDA_VISIBLE_DEVICES=4 $SRC/milabench/milabench/scripts/activator $BASE/venv/torch $BASE/cache $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype fp32 & - CUDA_VISIBLE_DEVICES=5 $SRC/milabench/milabench/scripts/activator $BASE/venv/torch $BASE/cache $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype fp32 & - CUDA_VISIBLE_DEVICES=6 $SRC/milabench/milabench/scripts/activator $BASE/venv/torch $BASE/cache $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype fp32 & - CUDA_VISIBLE_DEVICES=7 $SRC/milabench/milabench/scripts/activator $BASE/venv/torch $BASE/cache $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype fp32 & - wait -) - -echo "---" -echo "resnet50" -echo "========" -time ( - CUDA_VISIBLE_DEVICES=0 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model resnet50 --batch-size 256 & - CUDA_VISIBLE_DEVICES=1 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model resnet50 --batch-size 256 & - CUDA_VISIBLE_DEVICES=2 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model resnet50 --batch-size 256 & - CUDA_VISIBLE_DEVICES=3 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model resnet50 --batch-size 256 & - CUDA_VISIBLE_DEVICES=4 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model resnet50 --batch-size 256 & - CUDA_VISIBLE_DEVICES=5 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model resnet50 --batch-size 256 & - CUDA_VISIBLE_DEVICES=6 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model resnet50 --batch-size 256 & - CUDA_VISIBLE_DEVICES=7 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model resnet50 --batch-size 256 & - wait -) - -echo "---" -echo "resnet50-noio" -echo "=============" -time ( - CUDA_VISIBLE_DEVICES=0 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader synthetic_fixed --data $BASE/data/FakeImageNet --model resnet50 --batch-size 256 & - CUDA_VISIBLE_DEVICES=1 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader synthetic_fixed --data $BASE/data/FakeImageNet --model resnet50 --batch-size 256 & - CUDA_VISIBLE_DEVICES=2 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader synthetic_fixed --data $BASE/data/FakeImageNet --model resnet50 --batch-size 256 & - CUDA_VISIBLE_DEVICES=3 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader synthetic_fixed --data $BASE/data/FakeImageNet --model resnet50 --batch-size 256 & - CUDA_VISIBLE_DEVICES=4 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader synthetic_fixed --data $BASE/data/FakeImageNet --model resnet50 --batch-size 256 & - CUDA_VISIBLE_DEVICES=5 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader synthetic_fixed --data $BASE/data/FakeImageNet --model resnet50 --batch-size 256 & - CUDA_VISIBLE_DEVICES=6 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader synthetic_fixed --data $BASE/data/FakeImageNet --model resnet50 --batch-size 256 & - CUDA_VISIBLE_DEVICES=7 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader synthetic_fixed --data $BASE/data/FakeImageNet --model resnet50 --batch-size 256 & - wait -) - -echo "---" -echo "resnet152-ddp-gpus" -echo "==================" -time ( - $SRC/milabench/milabench/scripts/activator $BASE/venv/torch $BASE/cache $SRC/milabench/benchmarks/torchvision_ddp/main.py --epochs 10 --num-workers 8 --loader torch --data $BASE/data/FakeImageNet --model resnet152 --batch-size 256 & - wait -) - -echo "---" -echo "convnext_large-fp32" -echo "===================" -time ( - CUDA_VISIBLE_DEVICES=0 python $SRC/milabench/benchmarks/torchvision/main.py --precision fp32 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 & - CUDA_VISIBLE_DEVICES=1 python $SRC/milabench/benchmarks/torchvision/main.py --precision fp32 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 & - CUDA_VISIBLE_DEVICES=2 python $SRC/milabench/benchmarks/torchvision/main.py --precision fp32 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 & - CUDA_VISIBLE_DEVICES=3 python $SRC/milabench/benchmarks/torchvision/main.py --precision fp32 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 & - CUDA_VISIBLE_DEVICES=4 python $SRC/milabench/benchmarks/torchvision/main.py --precision fp32 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 & - CUDA_VISIBLE_DEVICES=5 python $SRC/milabench/benchmarks/torchvision/main.py --precision fp32 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 & - CUDA_VISIBLE_DEVICES=6 python $SRC/milabench/benchmarks/torchvision/main.py --precision fp32 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 & - CUDA_VISIBLE_DEVICES=7 python $SRC/milabench/benchmarks/torchvision/main.py --precision fp32 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 & - wait -) - -echo "---" -echo "convnext_large-fp16" -echo "===================" -time ( - CUDA_VISIBLE_DEVICES=0 python $SRC/milabench/benchmarks/torchvision/main.py --precision fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 & - CUDA_VISIBLE_DEVICES=1 python $SRC/milabench/benchmarks/torchvision/main.py --precision fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 & - CUDA_VISIBLE_DEVICES=2 python $SRC/milabench/benchmarks/torchvision/main.py --precision fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 & - CUDA_VISIBLE_DEVICES=3 python $SRC/milabench/benchmarks/torchvision/main.py --precision fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 & - CUDA_VISIBLE_DEVICES=4 python $SRC/milabench/benchmarks/torchvision/main.py --precision fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 & - CUDA_VISIBLE_DEVICES=5 python $SRC/milabench/benchmarks/torchvision/main.py --precision fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 & - CUDA_VISIBLE_DEVICES=6 python $SRC/milabench/benchmarks/torchvision/main.py --precision fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 & - CUDA_VISIBLE_DEVICES=7 python $SRC/milabench/benchmarks/torchvision/main.py --precision fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 & - wait -) - -echo "---" -echo "convnext_large-tf32" -echo "===================" -time ( - CUDA_VISIBLE_DEVICES=0 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 & - CUDA_VISIBLE_DEVICES=1 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 & - CUDA_VISIBLE_DEVICES=2 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 & - CUDA_VISIBLE_DEVICES=3 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 & - CUDA_VISIBLE_DEVICES=4 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 & - CUDA_VISIBLE_DEVICES=5 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 & - CUDA_VISIBLE_DEVICES=6 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 & - CUDA_VISIBLE_DEVICES=7 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 & - wait -) - -echo "---" -echo "convnext_large-tf32-fp16" -echo "========================" -time ( - CUDA_VISIBLE_DEVICES=0 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 & - CUDA_VISIBLE_DEVICES=1 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 & - CUDA_VISIBLE_DEVICES=2 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 & - CUDA_VISIBLE_DEVICES=3 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 & - CUDA_VISIBLE_DEVICES=4 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 & - CUDA_VISIBLE_DEVICES=5 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 & - CUDA_VISIBLE_DEVICES=6 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 & - CUDA_VISIBLE_DEVICES=7 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 & - wait -) - -echo "---" -echo "regnet_y_128gf" -echo "==============" -time ( - CUDA_VISIBLE_DEVICES=0 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model regnet_y_128gf --batch-size 64 & - CUDA_VISIBLE_DEVICES=1 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model regnet_y_128gf --batch-size 64 & - CUDA_VISIBLE_DEVICES=2 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model regnet_y_128gf --batch-size 64 & - CUDA_VISIBLE_DEVICES=3 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model regnet_y_128gf --batch-size 64 & - CUDA_VISIBLE_DEVICES=4 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model regnet_y_128gf --batch-size 64 & - CUDA_VISIBLE_DEVICES=5 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model regnet_y_128gf --batch-size 64 & - CUDA_VISIBLE_DEVICES=6 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model regnet_y_128gf --batch-size 64 & - CUDA_VISIBLE_DEVICES=7 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model regnet_y_128gf --batch-size 64 & - wait -) - -echo "---" -echo "bert-fp32" -echo "=========" -time ( - CUDA_VISIBLE_DEVICES=0 python -m bench --precision fp32 --num-workers 8 --model Bert --batch-size 32 & - CUDA_VISIBLE_DEVICES=1 python -m bench --precision fp32 --num-workers 8 --model Bert --batch-size 32 & - CUDA_VISIBLE_DEVICES=2 python -m bench --precision fp32 --num-workers 8 --model Bert --batch-size 32 & - CUDA_VISIBLE_DEVICES=3 python -m bench --precision fp32 --num-workers 8 --model Bert --batch-size 32 & - CUDA_VISIBLE_DEVICES=4 python -m bench --precision fp32 --num-workers 8 --model Bert --batch-size 32 & - CUDA_VISIBLE_DEVICES=5 python -m bench --precision fp32 --num-workers 8 --model Bert --batch-size 32 & - CUDA_VISIBLE_DEVICES=6 python -m bench --precision fp32 --num-workers 8 --model Bert --batch-size 32 & - CUDA_VISIBLE_DEVICES=7 python -m bench --precision fp32 --num-workers 8 --model Bert --batch-size 32 & - wait -) - -echo "---" -echo "bert-fp16" -echo "=========" -time ( - CUDA_VISIBLE_DEVICES=0 python -m bench --precision fp16 --num-workers 8 --model Bert --batch-size 32 & - CUDA_VISIBLE_DEVICES=1 python -m bench --precision fp16 --num-workers 8 --model Bert --batch-size 32 & - CUDA_VISIBLE_DEVICES=2 python -m bench --precision fp16 --num-workers 8 --model Bert --batch-size 32 & - CUDA_VISIBLE_DEVICES=3 python -m bench --precision fp16 --num-workers 8 --model Bert --batch-size 32 & - CUDA_VISIBLE_DEVICES=4 python -m bench --precision fp16 --num-workers 8 --model Bert --batch-size 32 & - CUDA_VISIBLE_DEVICES=5 python -m bench --precision fp16 --num-workers 8 --model Bert --batch-size 32 & - CUDA_VISIBLE_DEVICES=6 python -m bench --precision fp16 --num-workers 8 --model Bert --batch-size 32 & - CUDA_VISIBLE_DEVICES=7 python -m bench --precision fp16 --num-workers 8 --model Bert --batch-size 32 & - wait -) - -echo "---" -echo "bert-tf32" -echo "=========" -time ( - CUDA_VISIBLE_DEVICES=0 python -m bench --precision tf32 --num-workers 8 --model Bert --batch-size 32 & - CUDA_VISIBLE_DEVICES=1 python -m bench --precision tf32 --num-workers 8 --model Bert --batch-size 32 & - CUDA_VISIBLE_DEVICES=2 python -m bench --precision tf32 --num-workers 8 --model Bert --batch-size 32 & - CUDA_VISIBLE_DEVICES=3 python -m bench --precision tf32 --num-workers 8 --model Bert --batch-size 32 & - CUDA_VISIBLE_DEVICES=4 python -m bench --precision tf32 --num-workers 8 --model Bert --batch-size 32 & - CUDA_VISIBLE_DEVICES=5 python -m bench --precision tf32 --num-workers 8 --model Bert --batch-size 32 & - CUDA_VISIBLE_DEVICES=6 python -m bench --precision tf32 --num-workers 8 --model Bert --batch-size 32 & - CUDA_VISIBLE_DEVICES=7 python -m bench --precision tf32 --num-workers 8 --model Bert --batch-size 32 & - wait -) - -echo "---" -echo "bert-tf32-fp16" -echo "==============" -time ( - CUDA_VISIBLE_DEVICES=0 python -m bench --precision tf32-fp16 --num-workers 8 --model Bert --batch-size 32 & - CUDA_VISIBLE_DEVICES=1 python -m bench --precision tf32-fp16 --num-workers 8 --model Bert --batch-size 32 & - CUDA_VISIBLE_DEVICES=2 python -m bench --precision tf32-fp16 --num-workers 8 --model Bert --batch-size 32 & - CUDA_VISIBLE_DEVICES=3 python -m bench --precision tf32-fp16 --num-workers 8 --model Bert --batch-size 32 & - CUDA_VISIBLE_DEVICES=4 python -m bench --precision tf32-fp16 --num-workers 8 --model Bert --batch-size 32 & - CUDA_VISIBLE_DEVICES=5 python -m bench --precision tf32-fp16 --num-workers 8 --model Bert --batch-size 32 & - CUDA_VISIBLE_DEVICES=6 python -m bench --precision tf32-fp16 --num-workers 8 --model Bert --batch-size 32 & - CUDA_VISIBLE_DEVICES=7 python -m bench --precision tf32-fp16 --num-workers 8 --model Bert --batch-size 32 & - wait -) - -echo "---" -echo "t5" -echo "==" -time ( - CUDA_VISIBLE_DEVICES=0 python -m bench --precision tf32-fp16 --num-workers 8 --model T5 --batch-size 16 & - CUDA_VISIBLE_DEVICES=1 python -m bench --precision tf32-fp16 --num-workers 8 --model T5 --batch-size 16 & - CUDA_VISIBLE_DEVICES=2 python -m bench --precision tf32-fp16 --num-workers 8 --model T5 --batch-size 16 & - CUDA_VISIBLE_DEVICES=3 python -m bench --precision tf32-fp16 --num-workers 8 --model T5 --batch-size 16 & - CUDA_VISIBLE_DEVICES=4 python -m bench --precision tf32-fp16 --num-workers 8 --model T5 --batch-size 16 & - CUDA_VISIBLE_DEVICES=5 python -m bench --precision tf32-fp16 --num-workers 8 --model T5 --batch-size 16 & - CUDA_VISIBLE_DEVICES=6 python -m bench --precision tf32-fp16 --num-workers 8 --model T5 --batch-size 16 & - CUDA_VISIBLE_DEVICES=7 python -m bench --precision tf32-fp16 --num-workers 8 --model T5 --batch-size 16 & - wait -) - -echo "---" -echo "reformer" -echo "========" -time ( - CUDA_VISIBLE_DEVICES=0 python -m bench --precision tf32-fp16 --num-workers 8 --model Reformer --batch-size 64 & - CUDA_VISIBLE_DEVICES=1 python -m bench --precision tf32-fp16 --num-workers 8 --model Reformer --batch-size 64 & - CUDA_VISIBLE_DEVICES=2 python -m bench --precision tf32-fp16 --num-workers 8 --model Reformer --batch-size 64 & - CUDA_VISIBLE_DEVICES=3 python -m bench --precision tf32-fp16 --num-workers 8 --model Reformer --batch-size 64 & - CUDA_VISIBLE_DEVICES=4 python -m bench --precision tf32-fp16 --num-workers 8 --model Reformer --batch-size 64 & - CUDA_VISIBLE_DEVICES=5 python -m bench --precision tf32-fp16 --num-workers 8 --model Reformer --batch-size 64 & - CUDA_VISIBLE_DEVICES=6 python -m bench --precision tf32-fp16 --num-workers 8 --model Reformer --batch-size 64 & - CUDA_VISIBLE_DEVICES=7 python -m bench --precision tf32-fp16 --num-workers 8 --model Reformer --batch-size 64 & - wait -) - -echo "---" -echo "whisper" -echo "=======" -time ( - CUDA_VISIBLE_DEVICES=0 python -m bench --precision tf32-fp16 --num-workers 8 --model Whisper --batch-size 64 & - CUDA_VISIBLE_DEVICES=1 python -m bench --precision tf32-fp16 --num-workers 8 --model Whisper --batch-size 64 & - CUDA_VISIBLE_DEVICES=2 python -m bench --precision tf32-fp16 --num-workers 8 --model Whisper --batch-size 64 & - CUDA_VISIBLE_DEVICES=3 python -m bench --precision tf32-fp16 --num-workers 8 --model Whisper --batch-size 64 & - CUDA_VISIBLE_DEVICES=4 python -m bench --precision tf32-fp16 --num-workers 8 --model Whisper --batch-size 64 & - CUDA_VISIBLE_DEVICES=5 python -m bench --precision tf32-fp16 --num-workers 8 --model Whisper --batch-size 64 & - CUDA_VISIBLE_DEVICES=6 python -m bench --precision tf32-fp16 --num-workers 8 --model Whisper --batch-size 64 & - CUDA_VISIBLE_DEVICES=7 python -m bench --precision tf32-fp16 --num-workers 8 --model Whisper --batch-size 64 & - wait -) - -echo "---" -echo "focalnet" -echo "========" -time ( - CUDA_VISIBLE_DEVICES=0 python $SRC/milabench/benchmarks/timm/pytorch-image-models/train.py --amp --amp-dtype bfloat16 --device cuda --val-split --data-dir $BASE/data --dataset FakeImageNet --workers 8 --model focalnet_base_lrf --output $BASE/extra/timm/dev/focalnet.D0 --checkpoint-hist 1 & - CUDA_VISIBLE_DEVICES=1 python $SRC/milabench/benchmarks/timm/pytorch-image-models/train.py --amp --amp-dtype bfloat16 --device cuda --val-split --data-dir $BASE/data --dataset FakeImageNet --workers 8 --model focalnet_base_lrf --output $BASE/extra/timm/dev/focalnet.D1 --checkpoint-hist 1 & - CUDA_VISIBLE_DEVICES=2 python $SRC/milabench/benchmarks/timm/pytorch-image-models/train.py --amp --amp-dtype bfloat16 --device cuda --val-split --data-dir $BASE/data --dataset FakeImageNet --workers 8 --model focalnet_base_lrf --output $BASE/extra/timm/dev/focalnet.D2 --checkpoint-hist 1 & - CUDA_VISIBLE_DEVICES=3 python $SRC/milabench/benchmarks/timm/pytorch-image-models/train.py --amp --amp-dtype bfloat16 --device cuda --val-split --data-dir $BASE/data --dataset FakeImageNet --workers 8 --model focalnet_base_lrf --output $BASE/extra/timm/dev/focalnet.D3 --checkpoint-hist 1 & - CUDA_VISIBLE_DEVICES=4 python $SRC/milabench/benchmarks/timm/pytorch-image-models/train.py --amp --amp-dtype bfloat16 --device cuda --val-split --data-dir $BASE/data --dataset FakeImageNet --workers 8 --model focalnet_base_lrf --output $BASE/extra/timm/dev/focalnet.D4 --checkpoint-hist 1 & - CUDA_VISIBLE_DEVICES=5 python $SRC/milabench/benchmarks/timm/pytorch-image-models/train.py --amp --amp-dtype bfloat16 --device cuda --val-split --data-dir $BASE/data --dataset FakeImageNet --workers 8 --model focalnet_base_lrf --output $BASE/extra/timm/dev/focalnet.D5 --checkpoint-hist 1 & - CUDA_VISIBLE_DEVICES=6 python $SRC/milabench/benchmarks/timm/pytorch-image-models/train.py --amp --amp-dtype bfloat16 --device cuda --val-split --data-dir $BASE/data --dataset FakeImageNet --workers 8 --model focalnet_base_lrf --output $BASE/extra/timm/dev/focalnet.D6 --checkpoint-hist 1 & - CUDA_VISIBLE_DEVICES=7 python $SRC/milabench/benchmarks/timm/pytorch-image-models/train.py --amp --amp-dtype bfloat16 --device cuda --val-split --data-dir $BASE/data --dataset FakeImageNet --workers 8 --model focalnet_base_lrf --output $BASE/extra/timm/dev/focalnet.D7 --checkpoint-hist 1 & - wait -) - -echo "---" -echo "brax" -echo "====" -time ( - python $SRC/milabench/benchmarks/brax/main.py --episode-length 20 --batch-size 1024 --num-minibatches 32 --num-envs 8192 & - wait -) - -echo "---" -echo "diffusion-single" -echo "================" -time ( - CUDA_VISIBLE_DEVICES=0 $SRC/milabench/benchmarks/diffusion/main.py --num_epochs 5 --batch_size 32 --num_workers 8 --cache $BASE/cache & - CUDA_VISIBLE_DEVICES=1 $SRC/milabench/benchmarks/diffusion/main.py --num_epochs 5 --batch_size 32 --num_workers 8 --cache $BASE/cache & - CUDA_VISIBLE_DEVICES=2 $SRC/milabench/benchmarks/diffusion/main.py --num_epochs 5 --batch_size 32 --num_workers 8 --cache $BASE/cache & - CUDA_VISIBLE_DEVICES=3 $SRC/milabench/benchmarks/diffusion/main.py --num_epochs 5 --batch_size 32 --num_workers 8 --cache $BASE/cache & - CUDA_VISIBLE_DEVICES=4 $SRC/milabench/benchmarks/diffusion/main.py --num_epochs 5 --batch_size 32 --num_workers 8 --cache $BASE/cache & - CUDA_VISIBLE_DEVICES=5 $SRC/milabench/benchmarks/diffusion/main.py --num_epochs 5 --batch_size 32 --num_workers 8 --cache $BASE/cache & - CUDA_VISIBLE_DEVICES=6 $SRC/milabench/benchmarks/diffusion/main.py --num_epochs 5 --batch_size 32 --num_workers 8 --cache $BASE/cache & - CUDA_VISIBLE_DEVICES=7 $SRC/milabench/benchmarks/diffusion/main.py --num_epochs 5 --batch_size 32 --num_workers 8 --cache $BASE/cache & - wait -) - -echo "---" -echo "diffusion-gpus" -echo "==============" -time ( - $SRC/milabench/milabench/scripts/activator $BASE/venv/torch $BASE/cache accelerate launch --mixed_precision=bf16 --dynamo_backend=no --machine_rank=0 --num_machines=1 --multi_gpu --gradient_accumulation_steps=1 --num_cpu_threads_per_process=4 --main_process_ip=127.0.0.1 --main_process_port=29400 --num_processes=8 $SRC/milabench/benchmarks/diffusion/main.py --num_epochs 5 --batch_size 32 --num_workers 8 --cache $BASE/cache & - wait -) - -echo "---" -echo "diffusion-nodes" -echo "===============" -time ( - $SRC/milabench/milabench/scripts/activator $BASE/venv/torch $BASE/cache accelerate launch --mixed_precision=bf16 --dynamo_backend=no --machine_rank=0 --num_machines=1 --multi_gpu --gradient_accumulation_steps=1 --num_cpu_threads_per_process=4 --main_process_ip=127.0.0.1 --main_process_port=29400 --num_processes=8 $SRC/milabench/benchmarks/diffusion/main.py --num_epochs 5 --batch_size 32 --num_workers 8 --cache $BASE/cache & - wait -) - -echo "---" -echo "lightning" -echo "=========" -time ( - CUDA_VISIBLE_DEVICES=0 python $SRC/milabench/benchmarks/lightning/main.py --epochs 10 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model resnet152 --batch-size 256 & - CUDA_VISIBLE_DEVICES=1 python $SRC/milabench/benchmarks/lightning/main.py --epochs 10 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model resnet152 --batch-size 256 & - CUDA_VISIBLE_DEVICES=2 python $SRC/milabench/benchmarks/lightning/main.py --epochs 10 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model resnet152 --batch-size 256 & - CUDA_VISIBLE_DEVICES=3 python $SRC/milabench/benchmarks/lightning/main.py --epochs 10 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model resnet152 --batch-size 256 & - CUDA_VISIBLE_DEVICES=4 python $SRC/milabench/benchmarks/lightning/main.py --epochs 10 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model resnet152 --batch-size 256 & - CUDA_VISIBLE_DEVICES=5 python $SRC/milabench/benchmarks/lightning/main.py --epochs 10 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model resnet152 --batch-size 256 & - CUDA_VISIBLE_DEVICES=6 python $SRC/milabench/benchmarks/lightning/main.py --epochs 10 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model resnet152 --batch-size 256 & - CUDA_VISIBLE_DEVICES=7 python $SRC/milabench/benchmarks/lightning/main.py --epochs 10 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model resnet152 --batch-size 256 & - wait -) - -echo "---" -echo "lightning-gpus" -echo "==============" -time ( - $BASE/venv/torch/bin/benchrun --nnodes=1 --rdzv-backend=static --rdzv-endpoint=127.0.0.1:29400 --master-addr=127.0.0.1 --master-port=29400 --local-ranks-filter=0 --nproc-per-node=8 --no-python -- python $SRC/milabench/benchmarks/lightning/main.py --epochs 10 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model resnet152 --batch-size 256 & - wait -) - -echo "---" -echo "dinov2-giant-single" -echo "===================" -time ( - CUDA_VISIBLE_DEVICES=0 python $SRC/milabench/benchmarks/dinov2/main.py --output-dir $BASE/extra/dinov2-giant-single/output --no-resume --config-file $SRC/milabench/benchmarks/dinov2/src/dinov2/configs/train/vitg14.yaml train.dataset_path=ImageNet:split=TRAIN:root=$BASE/data/FakeImageNet:extra=$BASE/data/FakeImageNet train.batch_size_per_gpu=32 train.saveckp_freq=100 train.num_workers=10 & - CUDA_VISIBLE_DEVICES=1 python $SRC/milabench/benchmarks/dinov2/main.py --output-dir $BASE/extra/dinov2-giant-single/output --no-resume --config-file $SRC/milabench/benchmarks/dinov2/src/dinov2/configs/train/vitg14.yaml train.dataset_path=ImageNet:split=TRAIN:root=$BASE/data/FakeImageNet:extra=$BASE/data/FakeImageNet train.batch_size_per_gpu=32 train.saveckp_freq=100 train.num_workers=10 & - CUDA_VISIBLE_DEVICES=2 python $SRC/milabench/benchmarks/dinov2/main.py --output-dir $BASE/extra/dinov2-giant-single/output --no-resume --config-file $SRC/milabench/benchmarks/dinov2/src/dinov2/configs/train/vitg14.yaml train.dataset_path=ImageNet:split=TRAIN:root=$BASE/data/FakeImageNet:extra=$BASE/data/FakeImageNet train.batch_size_per_gpu=32 train.saveckp_freq=100 train.num_workers=10 & - CUDA_VISIBLE_DEVICES=3 python $SRC/milabench/benchmarks/dinov2/main.py --output-dir $BASE/extra/dinov2-giant-single/output --no-resume --config-file $SRC/milabench/benchmarks/dinov2/src/dinov2/configs/train/vitg14.yaml train.dataset_path=ImageNet:split=TRAIN:root=$BASE/data/FakeImageNet:extra=$BASE/data/FakeImageNet train.batch_size_per_gpu=32 train.saveckp_freq=100 train.num_workers=10 & - CUDA_VISIBLE_DEVICES=4 python $SRC/milabench/benchmarks/dinov2/main.py --output-dir $BASE/extra/dinov2-giant-single/output --no-resume --config-file $SRC/milabench/benchmarks/dinov2/src/dinov2/configs/train/vitg14.yaml train.dataset_path=ImageNet:split=TRAIN:root=$BASE/data/FakeImageNet:extra=$BASE/data/FakeImageNet train.batch_size_per_gpu=32 train.saveckp_freq=100 train.num_workers=10 & - CUDA_VISIBLE_DEVICES=5 python $SRC/milabench/benchmarks/dinov2/main.py --output-dir $BASE/extra/dinov2-giant-single/output --no-resume --config-file $SRC/milabench/benchmarks/dinov2/src/dinov2/configs/train/vitg14.yaml train.dataset_path=ImageNet:split=TRAIN:root=$BASE/data/FakeImageNet:extra=$BASE/data/FakeImageNet train.batch_size_per_gpu=32 train.saveckp_freq=100 train.num_workers=10 & - CUDA_VISIBLE_DEVICES=6 python $SRC/milabench/benchmarks/dinov2/main.py --output-dir $BASE/extra/dinov2-giant-single/output --no-resume --config-file $SRC/milabench/benchmarks/dinov2/src/dinov2/configs/train/vitg14.yaml train.dataset_path=ImageNet:split=TRAIN:root=$BASE/data/FakeImageNet:extra=$BASE/data/FakeImageNet train.batch_size_per_gpu=32 train.saveckp_freq=100 train.num_workers=10 & - CUDA_VISIBLE_DEVICES=7 python $SRC/milabench/benchmarks/dinov2/main.py --output-dir $BASE/extra/dinov2-giant-single/output --no-resume --config-file $SRC/milabench/benchmarks/dinov2/src/dinov2/configs/train/vitg14.yaml train.dataset_path=ImageNet:split=TRAIN:root=$BASE/data/FakeImageNet:extra=$BASE/data/FakeImageNet train.batch_size_per_gpu=32 train.saveckp_freq=100 train.num_workers=10 & - wait -) - -echo "---" -echo "dinov2-giant-gpus" -echo "=================" -time ( - $BASE/venv/torch/bin/benchrun --nnodes=1 --rdzv-backend=static --rdzv-endpoint=127.0.0.1:29400 --master-addr=127.0.0.1 --master-port=29400 --local-ranks-filter=0 --nproc-per-node=8 --no-python -- python $SRC/milabench/benchmarks/dinov2/main.py --output-dir $BASE/extra/dinov2-giant-gpus/output --no-resume --config-file $SRC/milabench/benchmarks/dinov2/src/dinov2/configs/train/vitg14.yaml train.dataset_path=ImageNet:split=TRAIN:root=$BASE/data/FakeImageNet:extra=$BASE/data/FakeImageNet train.batch_size_per_gpu=32 train.saveckp_freq=100 train.num_workers=10 & - wait -) - -echo "---" -echo "llm-lora-single" -echo "===============" -time ( - CUDA_VISIBLE_DEVICES=0 $SRC/milabench/benchmarks/llm/recipes/lora_finetune_single_device.py --config $SRC/milabench/benchmarks/llm/configs/llama3_8B_lora_single_device.yaml epochs=1 output_dir=$BASE/extra/llm-lora-single/output tokenizer.path=$BASE/data/llama3_8B/original/tokenizer.model checkpointer.checkpoint_dir=$BASE/data/llama3_8B/original checkpointer.output_dir=$BASE/data/llama3_8B/ metric_logger.log_dir=$BASE/extra/llm-lora-single/metrics repo_id="meta-llama/Meta-Llama-3.1-8B" batch_size=8 gradient_accumulation_steps=8 & - CUDA_VISIBLE_DEVICES=1 $SRC/milabench/benchmarks/llm/recipes/lora_finetune_single_device.py --config $SRC/milabench/benchmarks/llm/configs/llama3_8B_lora_single_device.yaml epochs=1 output_dir=$BASE/extra/llm-lora-single/output tokenizer.path=$BASE/data/llama3_8B/original/tokenizer.model checkpointer.checkpoint_dir=$BASE/data/llama3_8B/original checkpointer.output_dir=$BASE/data/llama3_8B/ metric_logger.log_dir=$BASE/extra/llm-lora-single/metrics repo_id="meta-llama/Meta-Llama-3.1-8B" batch_size=8 gradient_accumulation_steps=8 & - CUDA_VISIBLE_DEVICES=2 $SRC/milabench/benchmarks/llm/recipes/lora_finetune_single_device.py --config $SRC/milabench/benchmarks/llm/configs/llama3_8B_lora_single_device.yaml epochs=1 output_dir=$BASE/extra/llm-lora-single/output tokenizer.path=$BASE/data/llama3_8B/original/tokenizer.model checkpointer.checkpoint_dir=$BASE/data/llama3_8B/original checkpointer.output_dir=$BASE/data/llama3_8B/ metric_logger.log_dir=$BASE/extra/llm-lora-single/metrics repo_id="meta-llama/Meta-Llama-3.1-8B" batch_size=8 gradient_accumulation_steps=8 & - CUDA_VISIBLE_DEVICES=3 $SRC/milabench/benchmarks/llm/recipes/lora_finetune_single_device.py --config $SRC/milabench/benchmarks/llm/configs/llama3_8B_lora_single_device.yaml epochs=1 output_dir=$BASE/extra/llm-lora-single/output tokenizer.path=$BASE/data/llama3_8B/original/tokenizer.model checkpointer.checkpoint_dir=$BASE/data/llama3_8B/original checkpointer.output_dir=$BASE/data/llama3_8B/ metric_logger.log_dir=$BASE/extra/llm-lora-single/metrics repo_id="meta-llama/Meta-Llama-3.1-8B" batch_size=8 gradient_accumulation_steps=8 & - CUDA_VISIBLE_DEVICES=4 $SRC/milabench/benchmarks/llm/recipes/lora_finetune_single_device.py --config $SRC/milabench/benchmarks/llm/configs/llama3_8B_lora_single_device.yaml epochs=1 output_dir=$BASE/extra/llm-lora-single/output tokenizer.path=$BASE/data/llama3_8B/original/tokenizer.model checkpointer.checkpoint_dir=$BASE/data/llama3_8B/original checkpointer.output_dir=$BASE/data/llama3_8B/ metric_logger.log_dir=$BASE/extra/llm-lora-single/metrics repo_id="meta-llama/Meta-Llama-3.1-8B" batch_size=8 gradient_accumulation_steps=8 & - CUDA_VISIBLE_DEVICES=5 $SRC/milabench/benchmarks/llm/recipes/lora_finetune_single_device.py --config $SRC/milabench/benchmarks/llm/configs/llama3_8B_lora_single_device.yaml epochs=1 output_dir=$BASE/extra/llm-lora-single/output tokenizer.path=$BASE/data/llama3_8B/original/tokenizer.model checkpointer.checkpoint_dir=$BASE/data/llama3_8B/original checkpointer.output_dir=$BASE/data/llama3_8B/ metric_logger.log_dir=$BASE/extra/llm-lora-single/metrics repo_id="meta-llama/Meta-Llama-3.1-8B" batch_size=8 gradient_accumulation_steps=8 & - CUDA_VISIBLE_DEVICES=6 $SRC/milabench/benchmarks/llm/recipes/lora_finetune_single_device.py --config $SRC/milabench/benchmarks/llm/configs/llama3_8B_lora_single_device.yaml epochs=1 output_dir=$BASE/extra/llm-lora-single/output tokenizer.path=$BASE/data/llama3_8B/original/tokenizer.model checkpointer.checkpoint_dir=$BASE/data/llama3_8B/original checkpointer.output_dir=$BASE/data/llama3_8B/ metric_logger.log_dir=$BASE/extra/llm-lora-single/metrics repo_id="meta-llama/Meta-Llama-3.1-8B" batch_size=8 gradient_accumulation_steps=8 & - CUDA_VISIBLE_DEVICES=7 $SRC/milabench/benchmarks/llm/recipes/lora_finetune_single_device.py --config $SRC/milabench/benchmarks/llm/configs/llama3_8B_lora_single_device.yaml epochs=1 output_dir=$BASE/extra/llm-lora-single/output tokenizer.path=$BASE/data/llama3_8B/original/tokenizer.model checkpointer.checkpoint_dir=$BASE/data/llama3_8B/original checkpointer.output_dir=$BASE/data/llama3_8B/ metric_logger.log_dir=$BASE/extra/llm-lora-single/metrics repo_id="meta-llama/Meta-Llama-3.1-8B" batch_size=8 gradient_accumulation_steps=8 & - wait -) - -echo "---" -echo "llm-lora-ddp-gpus" -echo "=================" -time ( - $BASE/venv/torch/bin/tune run --nnodes=1 --rdzv-backend=static --rdzv-endpoint=127.0.0.1:29400 --master-addr=127.0.0.1 --master-port=29400 --local-ranks-filter=0 --nproc-per-node=8 -- $SRC/milabench/benchmarks/llm/recipes/lora_finetune_distributed.py --config $SRC/milabench/benchmarks/llm/configs/llama3_8B_lora_single_device.yaml epochs=1 output_dir=$BASE/extra/llm-lora-ddp-gpus/output tokenizer.path=$BASE/data/llama3_8B/original/tokenizer.model checkpointer.checkpoint_dir=$BASE/data/llama3_8B/original checkpointer.output_dir=$BASE/data/llama3_8B/ metric_logger.log_dir=$BASE/extra/llm-lora-ddp-gpus/metrics repo_id="meta-llama/Meta-Llama-3.1-8B" batch_size=8 gradient_accumulation_steps=8 & - wait -) - -echo "---" -echo "llm-lora-ddp-nodes" -echo "==================" -time ( - $BASE/venv/torch/bin/tune run --nnodes=1 --rdzv-backend=static --rdzv-endpoint=127.0.0.1:29400 --master-addr=127.0.0.1 --master-port=29400 --local-ranks-filter=0 --nproc-per-node=8 -- $SRC/milabench/benchmarks/llm/recipes/lora_finetune_distributed.py --config $SRC/milabench/benchmarks/llm/configs/llama3_8B_lora_single_device.yaml epochs=1 output_dir=$BASE/extra/llm-lora-ddp-nodes/output tokenizer.path=$BASE/data/llama3_8B/original/tokenizer.model checkpointer.checkpoint_dir=$BASE/data/llama3_8B/original checkpointer.output_dir=$BASE/data/llama3_8B/ metric_logger.log_dir=$BASE/extra/llm-lora-ddp-nodes/metrics repo_id="meta-llama/Meta-Llama-3.1-8B" batch_size=8 gradient_accumulation_steps=8 & - wait -) - -echo "---" -echo "llm-lora-mp-gpus" -echo "================" -time ( - $BASE/venv/torch/bin/tune run --nnodes=1 --rdzv-backend=static --rdzv-endpoint=127.0.0.1:29400 --master-addr=127.0.0.1 --master-port=29400 --local-ranks-filter=0 --nproc-per-node=8 -- $SRC/milabench/benchmarks/llm/recipes/lora_finetune_distributed.py --config $SRC/milabench/benchmarks/llm/configs/llama3_70B_lora.yaml epochs=1 output_dir=$BASE/extra/llm-lora-mp-gpus/output tokenizer.path=$BASE/data/llama3_70B/original/tokenizer.model checkpointer.checkpoint_dir=$BASE/data/llama3_70B checkpointer.output_dir=$BASE/data/llama3_70B/ safetensors=true metric_logger.log_dir=$BASE/extra/llm-lora-mp-gpus/metrics repo_id="meta-llama/Meta-Llama-3.1-70B" batch_size=8 gradient_accumulation_steps=1 & - wait -) - -echo "---" -echo "llm-full-mp-gpus" -echo "================" -time ( - $BASE/venv/torch/bin/tune run --nnodes=1 --rdzv-backend=static --rdzv-endpoint=127.0.0.1:29400 --master-addr=127.0.0.1 --master-port=29400 --local-ranks-filter=0 --nproc-per-node=8 -- $SRC/milabench/benchmarks/llm/recipes/full_finetune_distributed.py --config $SRC/milabench/benchmarks/llm/configs/llama3_70B_full.yaml epochs=1 output_dir=$BASE/extra/llm-full-mp-gpus/output tokenizer.path=$BASE/data/llama3_70B/original/tokenizer.model checkpointer.checkpoint_dir=$BASE/data/llama3_70B checkpointer.output_dir=$BASE/data/llama3_70B/ metric_logger.log_dir=$BASE/extra/llm-full-mp-gpus/metrics repo_id="meta-llama/Meta-Llama-3.1-70B" safetensors=true batch_size=2 gradient_accumulation_steps=1 & - wait -) - -echo "---" -echo "llm-full-mp-nodes" -echo "=================" -time ( - $BASE/venv/torch/bin/tune run --nnodes=1 --rdzv-backend=static --rdzv-endpoint=127.0.0.1:29400 --master-addr=127.0.0.1 --master-port=29400 --local-ranks-filter=0 --nproc-per-node=8 -- $SRC/milabench/benchmarks/llm/recipes/full_finetune_distributed.py --config $SRC/milabench/benchmarks/llm/configs/llama3_70B_full.yaml epochs=1 output_dir=$BASE/extra/llm-full-mp-nodes/output tokenizer.path=$BASE/data/llama3_70B/original/tokenizer.model checkpointer.checkpoint_dir=$BASE/data/llama3_70B checkpointer.output_dir=$BASE/data/llama3_70B/ metric_logger.log_dir=$BASE/extra/llm-full-mp-nodes/metrics repo_id="meta-llama/Meta-Llama-3.1-70B" safetensors=true batch_size=2 gradient_accumulation_steps=1 & - wait -) - -echo "---" -echo "dqn" -echo "===" -time ( - CUDA_VISIBLE_DEVICES=0 python $SRC/milabench/benchmarks/purejaxrl/main.py dqn --num_envs 128 --buffer_batch_size 128 --env_name CartPole-v1 --training_interval 10 & - CUDA_VISIBLE_DEVICES=1 python $SRC/milabench/benchmarks/purejaxrl/main.py dqn --num_envs 128 --buffer_batch_size 128 --env_name CartPole-v1 --training_interval 10 & - CUDA_VISIBLE_DEVICES=2 python $SRC/milabench/benchmarks/purejaxrl/main.py dqn --num_envs 128 --buffer_batch_size 128 --env_name CartPole-v1 --training_interval 10 & - CUDA_VISIBLE_DEVICES=3 python $SRC/milabench/benchmarks/purejaxrl/main.py dqn --num_envs 128 --buffer_batch_size 128 --env_name CartPole-v1 --training_interval 10 & - CUDA_VISIBLE_DEVICES=4 python $SRC/milabench/benchmarks/purejaxrl/main.py dqn --num_envs 128 --buffer_batch_size 128 --env_name CartPole-v1 --training_interval 10 & - CUDA_VISIBLE_DEVICES=5 python $SRC/milabench/benchmarks/purejaxrl/main.py dqn --num_envs 128 --buffer_batch_size 128 --env_name CartPole-v1 --training_interval 10 & - CUDA_VISIBLE_DEVICES=6 python $SRC/milabench/benchmarks/purejaxrl/main.py dqn --num_envs 128 --buffer_batch_size 128 --env_name CartPole-v1 --training_interval 10 & - CUDA_VISIBLE_DEVICES=7 python $SRC/milabench/benchmarks/purejaxrl/main.py dqn --num_envs 128 --buffer_batch_size 128 --env_name CartPole-v1 --training_interval 10 & - wait -) - -echo "---" -echo "ppo" -echo "===" -time ( - CUDA_VISIBLE_DEVICES=0 python $SRC/milabench/benchmarks/purejaxrl/main.py ppo --num_envs 128 --num_steps 10 --num_minibatches 32 --update_epochs 4 --env_name hopper --total_timesteps 200000 & - CUDA_VISIBLE_DEVICES=1 python $SRC/milabench/benchmarks/purejaxrl/main.py ppo --num_envs 128 --num_steps 10 --num_minibatches 32 --update_epochs 4 --env_name hopper --total_timesteps 200000 & - CUDA_VISIBLE_DEVICES=2 python $SRC/milabench/benchmarks/purejaxrl/main.py ppo --num_envs 128 --num_steps 10 --num_minibatches 32 --update_epochs 4 --env_name hopper --total_timesteps 200000 & - CUDA_VISIBLE_DEVICES=3 python $SRC/milabench/benchmarks/purejaxrl/main.py ppo --num_envs 128 --num_steps 10 --num_minibatches 32 --update_epochs 4 --env_name hopper --total_timesteps 200000 & - CUDA_VISIBLE_DEVICES=4 python $SRC/milabench/benchmarks/purejaxrl/main.py ppo --num_envs 128 --num_steps 10 --num_minibatches 32 --update_epochs 4 --env_name hopper --total_timesteps 200000 & - CUDA_VISIBLE_DEVICES=5 python $SRC/milabench/benchmarks/purejaxrl/main.py ppo --num_envs 128 --num_steps 10 --num_minibatches 32 --update_epochs 4 --env_name hopper --total_timesteps 200000 & - CUDA_VISIBLE_DEVICES=6 python $SRC/milabench/benchmarks/purejaxrl/main.py ppo --num_envs 128 --num_steps 10 --num_minibatches 32 --update_epochs 4 --env_name hopper --total_timesteps 200000 & - CUDA_VISIBLE_DEVICES=7 python $SRC/milabench/benchmarks/purejaxrl/main.py ppo --num_envs 128 --num_steps 10 --num_minibatches 32 --update_epochs 4 --env_name hopper --total_timesteps 200000 & - wait -) - -echo "---" -echo "dimenet" -echo "=======" -time ( - CUDA_VISIBLE_DEVICES=0 python $SRC/milabench/benchmarks/geo_gnn/main.py --model DimeNet --num-samples 10000 --use3d & - CUDA_VISIBLE_DEVICES=1 python $SRC/milabench/benchmarks/geo_gnn/main.py --model DimeNet --num-samples 10000 --use3d & - CUDA_VISIBLE_DEVICES=2 python $SRC/milabench/benchmarks/geo_gnn/main.py --model DimeNet --num-samples 10000 --use3d & - CUDA_VISIBLE_DEVICES=3 python $SRC/milabench/benchmarks/geo_gnn/main.py --model DimeNet --num-samples 10000 --use3d & - CUDA_VISIBLE_DEVICES=4 python $SRC/milabench/benchmarks/geo_gnn/main.py --model DimeNet --num-samples 10000 --use3d & - CUDA_VISIBLE_DEVICES=5 python $SRC/milabench/benchmarks/geo_gnn/main.py --model DimeNet --num-samples 10000 --use3d & - CUDA_VISIBLE_DEVICES=6 python $SRC/milabench/benchmarks/geo_gnn/main.py --model DimeNet --num-samples 10000 --use3d & - CUDA_VISIBLE_DEVICES=7 python $SRC/milabench/benchmarks/geo_gnn/main.py --model DimeNet --num-samples 10000 --use3d & - wait -) - -echo "---" -echo "recursiongfn" -echo "============" -time ( - CUDA_VISIBLE_DEVICES=0 python $SRC/milabench/benchmarks/recursiongfn/main.py --batch_size 128 --num_workers 8 --num_steps 100 --layer_width 128 --num_layers 4 & - CUDA_VISIBLE_DEVICES=1 python $SRC/milabench/benchmarks/recursiongfn/main.py --batch_size 128 --num_workers 8 --num_steps 100 --layer_width 128 --num_layers 4 & - CUDA_VISIBLE_DEVICES=2 python $SRC/milabench/benchmarks/recursiongfn/main.py --batch_size 128 --num_workers 8 --num_steps 100 --layer_width 128 --num_layers 4 & - CUDA_VISIBLE_DEVICES=3 python $SRC/milabench/benchmarks/recursiongfn/main.py --batch_size 128 --num_workers 8 --num_steps 100 --layer_width 128 --num_layers 4 & - CUDA_VISIBLE_DEVICES=4 python $SRC/milabench/benchmarks/recursiongfn/main.py --batch_size 128 --num_workers 8 --num_steps 100 --layer_width 128 --num_layers 4 & - CUDA_VISIBLE_DEVICES=5 python $SRC/milabench/benchmarks/recursiongfn/main.py --batch_size 128 --num_workers 8 --num_steps 100 --layer_width 128 --num_layers 4 & - CUDA_VISIBLE_DEVICES=6 python $SRC/milabench/benchmarks/recursiongfn/main.py --batch_size 128 --num_workers 8 --num_steps 100 --layer_width 128 --num_layers 4 & - CUDA_VISIBLE_DEVICES=7 python $SRC/milabench/benchmarks/recursiongfn/main.py --batch_size 128 --num_workers 8 --num_steps 100 --layer_width 128 --num_layers 4 & - wait -) - -echo "---" -echo "torchatari" -echo "==========" -time ( - CUDA_VISIBLE_DEVICES=0 python $SRC/milabench/benchmarks/torchatari/main.py --num-minibatches 16 --update-epochs 4 --num-steps 128 --num-envs 128 --total-timesteps 1000000 --env-id Breakout-v5 & - CUDA_VISIBLE_DEVICES=1 python $SRC/milabench/benchmarks/torchatari/main.py --num-minibatches 16 --update-epochs 4 --num-steps 128 --num-envs 128 --total-timesteps 1000000 --env-id Breakout-v5 & - CUDA_VISIBLE_DEVICES=2 python $SRC/milabench/benchmarks/torchatari/main.py --num-minibatches 16 --update-epochs 4 --num-steps 128 --num-envs 128 --total-timesteps 1000000 --env-id Breakout-v5 & - CUDA_VISIBLE_DEVICES=3 python $SRC/milabench/benchmarks/torchatari/main.py --num-minibatches 16 --update-epochs 4 --num-steps 128 --num-envs 128 --total-timesteps 1000000 --env-id Breakout-v5 & - CUDA_VISIBLE_DEVICES=4 python $SRC/milabench/benchmarks/torchatari/main.py --num-minibatches 16 --update-epochs 4 --num-steps 128 --num-envs 128 --total-timesteps 1000000 --env-id Breakout-v5 & - CUDA_VISIBLE_DEVICES=5 python $SRC/milabench/benchmarks/torchatari/main.py --num-minibatches 16 --update-epochs 4 --num-steps 128 --num-envs 128 --total-timesteps 1000000 --env-id Breakout-v5 & - CUDA_VISIBLE_DEVICES=6 python $SRC/milabench/benchmarks/torchatari/main.py --num-minibatches 16 --update-epochs 4 --num-steps 128 --num-envs 128 --total-timesteps 1000000 --env-id Breakout-v5 & - CUDA_VISIBLE_DEVICES=7 python $SRC/milabench/benchmarks/torchatari/main.py --num-minibatches 16 --update-epochs 4 --num-steps 128 --num-envs 128 --total-timesteps 1000000 --env-id Breakout-v5 & - wait -) - -echo "---" -echo "llava-single" -echo "============" -time ( - CUDA_VISIBLE_DEVICES=0 $SRC/milabench/benchmarks/llava/main.py --batch_size 1 --num_workers 4 --gradient_accumulation_steps 1 & - CUDA_VISIBLE_DEVICES=1 $SRC/milabench/benchmarks/llava/main.py --batch_size 1 --num_workers 4 --gradient_accumulation_steps 1 & - CUDA_VISIBLE_DEVICES=2 $SRC/milabench/benchmarks/llava/main.py --batch_size 1 --num_workers 4 --gradient_accumulation_steps 1 & - CUDA_VISIBLE_DEVICES=3 $SRC/milabench/benchmarks/llava/main.py --batch_size 1 --num_workers 4 --gradient_accumulation_steps 1 & - CUDA_VISIBLE_DEVICES=4 $SRC/milabench/benchmarks/llava/main.py --batch_size 1 --num_workers 4 --gradient_accumulation_steps 1 & - CUDA_VISIBLE_DEVICES=5 $SRC/milabench/benchmarks/llava/main.py --batch_size 1 --num_workers 4 --gradient_accumulation_steps 1 & - CUDA_VISIBLE_DEVICES=6 $SRC/milabench/benchmarks/llava/main.py --batch_size 1 --num_workers 4 --gradient_accumulation_steps 1 & - CUDA_VISIBLE_DEVICES=7 $SRC/milabench/benchmarks/llava/main.py --batch_size 1 --num_workers 4 --gradient_accumulation_steps 1 & - wait -) - -echo "---" -echo "rlhf-single" -echo "===========" -time ( - CUDA_VISIBLE_DEVICES=0 $SRC/milabench/benchmarks/rlhf/main.py --output_dir $BASE/extra/rlhf-single/output --model_name_or_path EleutherAI/pythia-1b-deduped --per_device_train_batch_size 64 --logging_strategy no --log_level critical --bf16 & - CUDA_VISIBLE_DEVICES=1 $SRC/milabench/benchmarks/rlhf/main.py --output_dir $BASE/extra/rlhf-single/output --model_name_or_path EleutherAI/pythia-1b-deduped --per_device_train_batch_size 64 --logging_strategy no --log_level critical --bf16 & - CUDA_VISIBLE_DEVICES=2 $SRC/milabench/benchmarks/rlhf/main.py --output_dir $BASE/extra/rlhf-single/output --model_name_or_path EleutherAI/pythia-1b-deduped --per_device_train_batch_size 64 --logging_strategy no --log_level critical --bf16 & - CUDA_VISIBLE_DEVICES=3 $SRC/milabench/benchmarks/rlhf/main.py --output_dir $BASE/extra/rlhf-single/output --model_name_or_path EleutherAI/pythia-1b-deduped --per_device_train_batch_size 64 --logging_strategy no --log_level critical --bf16 & - CUDA_VISIBLE_DEVICES=4 $SRC/milabench/benchmarks/rlhf/main.py --output_dir $BASE/extra/rlhf-single/output --model_name_or_path EleutherAI/pythia-1b-deduped --per_device_train_batch_size 64 --logging_strategy no --log_level critical --bf16 & - CUDA_VISIBLE_DEVICES=5 $SRC/milabench/benchmarks/rlhf/main.py --output_dir $BASE/extra/rlhf-single/output --model_name_or_path EleutherAI/pythia-1b-deduped --per_device_train_batch_size 64 --logging_strategy no --log_level critical --bf16 & - CUDA_VISIBLE_DEVICES=6 $SRC/milabench/benchmarks/rlhf/main.py --output_dir $BASE/extra/rlhf-single/output --model_name_or_path EleutherAI/pythia-1b-deduped --per_device_train_batch_size 64 --logging_strategy no --log_level critical --bf16 & - CUDA_VISIBLE_DEVICES=7 $SRC/milabench/benchmarks/rlhf/main.py --output_dir $BASE/extra/rlhf-single/output --model_name_or_path EleutherAI/pythia-1b-deduped --per_device_train_batch_size 64 --logging_strategy no --log_level critical --bf16 & - wait -) - -echo "---" -echo "rlhf-gpus" -echo "=========" -time ( - $SRC/milabench/milabench/scripts/activator $BASE/venv/torch $BASE/cache accelerate launch --mixed_precision=bf16 --dynamo_backend=no --machine_rank=0 --num_machines=1 --multi_gpu --gradient_accumulation_steps=1 --num_cpu_threads_per_process=4 --main_process_ip=127.0.0.1 --main_process_port=29400 --num_processes=8 $SRC/milabench/benchmarks/rlhf/main.py --output_dir $BASE/extra/rlhf-gpus/output --model_name_or_path EleutherAI/pythia-1b-deduped --per_device_train_batch_size 64 --logging_strategy no --log_level critical --bf16 & - wait -) - -echo "---" -echo "vjepa-single" -echo "============" -time ( - CUDA_VISIBLE_DEVICES=0 python $SRC/milabench/benchmarks/vjepa/main.py --batch_size 24 --num_workers 12 --dataset $BASE/data/FakeVideo/video_metainfo.csv --output $BASE/extra/vjepa-single & - CUDA_VISIBLE_DEVICES=1 python $SRC/milabench/benchmarks/vjepa/main.py --batch_size 24 --num_workers 12 --dataset $BASE/data/FakeVideo/video_metainfo.csv --output $BASE/extra/vjepa-single & - CUDA_VISIBLE_DEVICES=2 python $SRC/milabench/benchmarks/vjepa/main.py --batch_size 24 --num_workers 12 --dataset $BASE/data/FakeVideo/video_metainfo.csv --output $BASE/extra/vjepa-single & - CUDA_VISIBLE_DEVICES=3 python $SRC/milabench/benchmarks/vjepa/main.py --batch_size 24 --num_workers 12 --dataset $BASE/data/FakeVideo/video_metainfo.csv --output $BASE/extra/vjepa-single & - CUDA_VISIBLE_DEVICES=4 python $SRC/milabench/benchmarks/vjepa/main.py --batch_size 24 --num_workers 12 --dataset $BASE/data/FakeVideo/video_metainfo.csv --output $BASE/extra/vjepa-single & - CUDA_VISIBLE_DEVICES=5 python $SRC/milabench/benchmarks/vjepa/main.py --batch_size 24 --num_workers 12 --dataset $BASE/data/FakeVideo/video_metainfo.csv --output $BASE/extra/vjepa-single & - CUDA_VISIBLE_DEVICES=6 python $SRC/milabench/benchmarks/vjepa/main.py --batch_size 24 --num_workers 12 --dataset $BASE/data/FakeVideo/video_metainfo.csv --output $BASE/extra/vjepa-single & - CUDA_VISIBLE_DEVICES=7 python $SRC/milabench/benchmarks/vjepa/main.py --batch_size 24 --num_workers 12 --dataset $BASE/data/FakeVideo/video_metainfo.csv --output $BASE/extra/vjepa-single & - wait -) - -echo "---" -echo "vjepa-gpus" -echo "==========" -time ( - $BASE/venv/torch/bin/benchrun --nnodes=1 --rdzv-backend=static --rdzv-endpoint=127.0.0.1:29400 --master-addr=127.0.0.1 --master-port=29400 --local-ranks-filter=0 --nproc-per-node=8 --no-python -- python $SRC/milabench/benchmarks/vjepa/main.py --batch_size 24 --num_workers 12 --dataset $BASE/data/FakeVideo/video_metainfo.csv --output $BASE/extra/vjepa-gpus & - wait -) - diff --git a/tests/test_command_reg/test_command_reg_two_nodes.txt b/tests/test_command_reg/test_command_reg_two_nodes.txt deleted file mode 100644 index 3004505de..000000000 --- a/tests/test_command_reg/test_command_reg_two_nodes.txt +++ /dev/null @@ -1,607 +0,0 @@ -#!/bin/sh - -echo "---" -echo "Virtual Env" -echo "===========" -export VIRTUAL_ENV=$BASE/venv/torch - -source $VIRTUAL_ENV/bin/activate -echo "---" -echo "Milabench" -echo "=========" -export MILABENCH_DIR_BASE=$BASE -export MILABENCH_DIR_VENV=$BASE/venv/torch -export MILABENCH_DIR_DATA=$BASE/data -export MILABENCH_DIR_RUNS=$BASE/runs -export MILABENCH_DIR_EXTRA=$BASE/extra/llm -export MILABENCH_DIR_CACHE=$BASE/cache -export OMP_NUM_THREADS=0 -export MILABENCH_CONFIG='{"system": {"arch": "cuda", "sshkey": null, "nodes": [{"ip": "127.0.0.1", "main": true, "name": "0", "sshport": 22, "user": "username", "hostname": "127.0.0.1"}, {"ip": "192.168.0.11", "main": false, "name": "1", "sshport": 22, "user": "username", "hostname": "192.168.0.11"}], "self": {"ip": "127.0.0.1", "main": true, "name": "0", "sshport": 22, "user": "username", "hostname": "127.0.0.1"}}, "dirs": {"base": "$BASE", "venv": "$BASE/venv/torch", "data": "$BASE/data", "runs": "$BASE/runs", "extra": "$BASE/extra/llm", "cache": "$BASE/cache"}, "group": "llm", "install_group": "torch", "install_variant": "cuda", "run_name": "dev", "enabled": true, "capabilities": {"nodes": 1}, "max_duration": 800, "voir": {"options": {"stop": 30, "interval": "1s"}}, "validation": {"usage": {"gpu_load_threshold": 0.5, "gpu_mem_threshold": 0.5}}, "config_base": "$SRC/milabench/config", "config_file": "$SRC/milabench/config/standard.yaml", "definition": "$SRC/milabench/benchmarks/llama", "tags": ["inference", "llm", "monogpu", "nlp", "nobatch"], "plan": {"method": "per_gpu"}, "weight": 1.0, "name": "llama", "tag": ["llama"]}' - -echo "---" -echo "llama" -echo "=====" -time ( - CUDA_VISIBLE_DEVICES=0 python $SRC/milabench/benchmarks/llama/main.py --cache $BASE/cache & - CUDA_VISIBLE_DEVICES=1 python $SRC/milabench/benchmarks/llama/main.py --cache $BASE/cache & - CUDA_VISIBLE_DEVICES=2 python $SRC/milabench/benchmarks/llama/main.py --cache $BASE/cache & - CUDA_VISIBLE_DEVICES=3 python $SRC/milabench/benchmarks/llama/main.py --cache $BASE/cache & - CUDA_VISIBLE_DEVICES=4 python $SRC/milabench/benchmarks/llama/main.py --cache $BASE/cache & - CUDA_VISIBLE_DEVICES=5 python $SRC/milabench/benchmarks/llama/main.py --cache $BASE/cache & - CUDA_VISIBLE_DEVICES=6 python $SRC/milabench/benchmarks/llama/main.py --cache $BASE/cache & - CUDA_VISIBLE_DEVICES=7 python $SRC/milabench/benchmarks/llama/main.py --cache $BASE/cache & - wait -) - -echo "---" -echo "fp16" -echo "====" -time ( - CUDA_VISIBLE_DEVICES=0 $SRC/milabench/milabench/scripts/activator $BASE/venv/torch $BASE/cache $SRC/milabench/benchmarks/flops/main.py --number 30 --repeat 90 --m 8192 --n 8192 --dtype fp16 & - CUDA_VISIBLE_DEVICES=1 $SRC/milabench/milabench/scripts/activator $BASE/venv/torch $BASE/cache $SRC/milabench/benchmarks/flops/main.py --number 30 --repeat 90 --m 8192 --n 8192 --dtype fp16 & - CUDA_VISIBLE_DEVICES=2 $SRC/milabench/milabench/scripts/activator $BASE/venv/torch $BASE/cache $SRC/milabench/benchmarks/flops/main.py --number 30 --repeat 90 --m 8192 --n 8192 --dtype fp16 & - CUDA_VISIBLE_DEVICES=3 $SRC/milabench/milabench/scripts/activator $BASE/venv/torch $BASE/cache $SRC/milabench/benchmarks/flops/main.py --number 30 --repeat 90 --m 8192 --n 8192 --dtype fp16 & - CUDA_VISIBLE_DEVICES=4 $SRC/milabench/milabench/scripts/activator $BASE/venv/torch $BASE/cache $SRC/milabench/benchmarks/flops/main.py --number 30 --repeat 90 --m 8192 --n 8192 --dtype fp16 & - CUDA_VISIBLE_DEVICES=5 $SRC/milabench/milabench/scripts/activator $BASE/venv/torch $BASE/cache $SRC/milabench/benchmarks/flops/main.py --number 30 --repeat 90 --m 8192 --n 8192 --dtype fp16 & - CUDA_VISIBLE_DEVICES=6 $SRC/milabench/milabench/scripts/activator $BASE/venv/torch $BASE/cache $SRC/milabench/benchmarks/flops/main.py --number 30 --repeat 90 --m 8192 --n 8192 --dtype fp16 & - CUDA_VISIBLE_DEVICES=7 $SRC/milabench/milabench/scripts/activator $BASE/venv/torch $BASE/cache $SRC/milabench/benchmarks/flops/main.py --number 30 --repeat 90 --m 8192 --n 8192 --dtype fp16 & - wait -) - -echo "---" -echo "bf16" -echo "====" -time ( - CUDA_VISIBLE_DEVICES=0 $SRC/milabench/milabench/scripts/activator $BASE/venv/torch $BASE/cache $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype bf16 & - CUDA_VISIBLE_DEVICES=1 $SRC/milabench/milabench/scripts/activator $BASE/venv/torch $BASE/cache $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype bf16 & - CUDA_VISIBLE_DEVICES=2 $SRC/milabench/milabench/scripts/activator $BASE/venv/torch $BASE/cache $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype bf16 & - CUDA_VISIBLE_DEVICES=3 $SRC/milabench/milabench/scripts/activator $BASE/venv/torch $BASE/cache $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype bf16 & - CUDA_VISIBLE_DEVICES=4 $SRC/milabench/milabench/scripts/activator $BASE/venv/torch $BASE/cache $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype bf16 & - CUDA_VISIBLE_DEVICES=5 $SRC/milabench/milabench/scripts/activator $BASE/venv/torch $BASE/cache $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype bf16 & - CUDA_VISIBLE_DEVICES=6 $SRC/milabench/milabench/scripts/activator $BASE/venv/torch $BASE/cache $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype bf16 & - CUDA_VISIBLE_DEVICES=7 $SRC/milabench/milabench/scripts/activator $BASE/venv/torch $BASE/cache $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype bf16 & - wait -) - -echo "---" -echo "tf32" -echo "====" -time ( - CUDA_VISIBLE_DEVICES=0 $SRC/milabench/milabench/scripts/activator $BASE/venv/torch $BASE/cache $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype fp32 --tf32 & - CUDA_VISIBLE_DEVICES=1 $SRC/milabench/milabench/scripts/activator $BASE/venv/torch $BASE/cache $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype fp32 --tf32 & - CUDA_VISIBLE_DEVICES=2 $SRC/milabench/milabench/scripts/activator $BASE/venv/torch $BASE/cache $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype fp32 --tf32 & - CUDA_VISIBLE_DEVICES=3 $SRC/milabench/milabench/scripts/activator $BASE/venv/torch $BASE/cache $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype fp32 --tf32 & - CUDA_VISIBLE_DEVICES=4 $SRC/milabench/milabench/scripts/activator $BASE/venv/torch $BASE/cache $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype fp32 --tf32 & - CUDA_VISIBLE_DEVICES=5 $SRC/milabench/milabench/scripts/activator $BASE/venv/torch $BASE/cache $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype fp32 --tf32 & - CUDA_VISIBLE_DEVICES=6 $SRC/milabench/milabench/scripts/activator $BASE/venv/torch $BASE/cache $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype fp32 --tf32 & - CUDA_VISIBLE_DEVICES=7 $SRC/milabench/milabench/scripts/activator $BASE/venv/torch $BASE/cache $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype fp32 --tf32 & - wait -) - -echo "---" -echo "fp32" -echo "====" -time ( - CUDA_VISIBLE_DEVICES=0 $SRC/milabench/milabench/scripts/activator $BASE/venv/torch $BASE/cache $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype fp32 & - CUDA_VISIBLE_DEVICES=1 $SRC/milabench/milabench/scripts/activator $BASE/venv/torch $BASE/cache $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype fp32 & - CUDA_VISIBLE_DEVICES=2 $SRC/milabench/milabench/scripts/activator $BASE/venv/torch $BASE/cache $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype fp32 & - CUDA_VISIBLE_DEVICES=3 $SRC/milabench/milabench/scripts/activator $BASE/venv/torch $BASE/cache $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype fp32 & - CUDA_VISIBLE_DEVICES=4 $SRC/milabench/milabench/scripts/activator $BASE/venv/torch $BASE/cache $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype fp32 & - CUDA_VISIBLE_DEVICES=5 $SRC/milabench/milabench/scripts/activator $BASE/venv/torch $BASE/cache $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype fp32 & - CUDA_VISIBLE_DEVICES=6 $SRC/milabench/milabench/scripts/activator $BASE/venv/torch $BASE/cache $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype fp32 & - CUDA_VISIBLE_DEVICES=7 $SRC/milabench/milabench/scripts/activator $BASE/venv/torch $BASE/cache $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype fp32 & - wait -) - -echo "---" -echo "resnet50" -echo "========" -time ( - CUDA_VISIBLE_DEVICES=0 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model resnet50 --batch-size 256 & - CUDA_VISIBLE_DEVICES=1 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model resnet50 --batch-size 256 & - CUDA_VISIBLE_DEVICES=2 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model resnet50 --batch-size 256 & - CUDA_VISIBLE_DEVICES=3 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model resnet50 --batch-size 256 & - CUDA_VISIBLE_DEVICES=4 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model resnet50 --batch-size 256 & - CUDA_VISIBLE_DEVICES=5 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model resnet50 --batch-size 256 & - CUDA_VISIBLE_DEVICES=6 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model resnet50 --batch-size 256 & - CUDA_VISIBLE_DEVICES=7 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model resnet50 --batch-size 256 & - wait -) - -echo "---" -echo "resnet50-noio" -echo "=============" -time ( - CUDA_VISIBLE_DEVICES=0 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader synthetic_fixed --data $BASE/data/FakeImageNet --model resnet50 --batch-size 256 & - CUDA_VISIBLE_DEVICES=1 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader synthetic_fixed --data $BASE/data/FakeImageNet --model resnet50 --batch-size 256 & - CUDA_VISIBLE_DEVICES=2 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader synthetic_fixed --data $BASE/data/FakeImageNet --model resnet50 --batch-size 256 & - CUDA_VISIBLE_DEVICES=3 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader synthetic_fixed --data $BASE/data/FakeImageNet --model resnet50 --batch-size 256 & - CUDA_VISIBLE_DEVICES=4 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader synthetic_fixed --data $BASE/data/FakeImageNet --model resnet50 --batch-size 256 & - CUDA_VISIBLE_DEVICES=5 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader synthetic_fixed --data $BASE/data/FakeImageNet --model resnet50 --batch-size 256 & - CUDA_VISIBLE_DEVICES=6 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader synthetic_fixed --data $BASE/data/FakeImageNet --model resnet50 --batch-size 256 & - CUDA_VISIBLE_DEVICES=7 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader synthetic_fixed --data $BASE/data/FakeImageNet --model resnet50 --batch-size 256 & - wait -) - -echo "---" -echo "resnet152-ddp-gpus" -echo "==================" -time ( - $SRC/milabench/milabench/scripts/activator $BASE/venv/torch $BASE/cache $SRC/milabench/benchmarks/torchvision_ddp/main.py --epochs 10 --num-workers 8 --loader torch --data $BASE/data/FakeImageNet --model resnet152 --batch-size 256 & - wait -) - -echo "---" -echo "convnext_large-fp32" -echo "===================" -time ( - CUDA_VISIBLE_DEVICES=0 python $SRC/milabench/benchmarks/torchvision/main.py --precision fp32 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 & - CUDA_VISIBLE_DEVICES=1 python $SRC/milabench/benchmarks/torchvision/main.py --precision fp32 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 & - CUDA_VISIBLE_DEVICES=2 python $SRC/milabench/benchmarks/torchvision/main.py --precision fp32 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 & - CUDA_VISIBLE_DEVICES=3 python $SRC/milabench/benchmarks/torchvision/main.py --precision fp32 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 & - CUDA_VISIBLE_DEVICES=4 python $SRC/milabench/benchmarks/torchvision/main.py --precision fp32 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 & - CUDA_VISIBLE_DEVICES=5 python $SRC/milabench/benchmarks/torchvision/main.py --precision fp32 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 & - CUDA_VISIBLE_DEVICES=6 python $SRC/milabench/benchmarks/torchvision/main.py --precision fp32 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 & - CUDA_VISIBLE_DEVICES=7 python $SRC/milabench/benchmarks/torchvision/main.py --precision fp32 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 & - wait -) - -echo "---" -echo "convnext_large-fp16" -echo "===================" -time ( - CUDA_VISIBLE_DEVICES=0 python $SRC/milabench/benchmarks/torchvision/main.py --precision fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 & - CUDA_VISIBLE_DEVICES=1 python $SRC/milabench/benchmarks/torchvision/main.py --precision fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 & - CUDA_VISIBLE_DEVICES=2 python $SRC/milabench/benchmarks/torchvision/main.py --precision fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 & - CUDA_VISIBLE_DEVICES=3 python $SRC/milabench/benchmarks/torchvision/main.py --precision fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 & - CUDA_VISIBLE_DEVICES=4 python $SRC/milabench/benchmarks/torchvision/main.py --precision fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 & - CUDA_VISIBLE_DEVICES=5 python $SRC/milabench/benchmarks/torchvision/main.py --precision fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 & - CUDA_VISIBLE_DEVICES=6 python $SRC/milabench/benchmarks/torchvision/main.py --precision fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 & - CUDA_VISIBLE_DEVICES=7 python $SRC/milabench/benchmarks/torchvision/main.py --precision fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 & - wait -) - -echo "---" -echo "convnext_large-tf32" -echo "===================" -time ( - CUDA_VISIBLE_DEVICES=0 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 & - CUDA_VISIBLE_DEVICES=1 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 & - CUDA_VISIBLE_DEVICES=2 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 & - CUDA_VISIBLE_DEVICES=3 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 & - CUDA_VISIBLE_DEVICES=4 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 & - CUDA_VISIBLE_DEVICES=5 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 & - CUDA_VISIBLE_DEVICES=6 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 & - CUDA_VISIBLE_DEVICES=7 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 & - wait -) - -echo "---" -echo "convnext_large-tf32-fp16" -echo "========================" -time ( - CUDA_VISIBLE_DEVICES=0 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 & - CUDA_VISIBLE_DEVICES=1 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 & - CUDA_VISIBLE_DEVICES=2 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 & - CUDA_VISIBLE_DEVICES=3 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 & - CUDA_VISIBLE_DEVICES=4 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 & - CUDA_VISIBLE_DEVICES=5 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 & - CUDA_VISIBLE_DEVICES=6 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 & - CUDA_VISIBLE_DEVICES=7 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 & - wait -) - -echo "---" -echo "regnet_y_128gf" -echo "==============" -time ( - CUDA_VISIBLE_DEVICES=0 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model regnet_y_128gf --batch-size 64 & - CUDA_VISIBLE_DEVICES=1 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model regnet_y_128gf --batch-size 64 & - CUDA_VISIBLE_DEVICES=2 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model regnet_y_128gf --batch-size 64 & - CUDA_VISIBLE_DEVICES=3 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model regnet_y_128gf --batch-size 64 & - CUDA_VISIBLE_DEVICES=4 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model regnet_y_128gf --batch-size 64 & - CUDA_VISIBLE_DEVICES=5 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model regnet_y_128gf --batch-size 64 & - CUDA_VISIBLE_DEVICES=6 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model regnet_y_128gf --batch-size 64 & - CUDA_VISIBLE_DEVICES=7 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model regnet_y_128gf --batch-size 64 & - wait -) - -echo "---" -echo "bert-fp32" -echo "=========" -time ( - CUDA_VISIBLE_DEVICES=0 python -m bench --precision fp32 --num-workers 8 --model Bert --batch-size 32 & - CUDA_VISIBLE_DEVICES=1 python -m bench --precision fp32 --num-workers 8 --model Bert --batch-size 32 & - CUDA_VISIBLE_DEVICES=2 python -m bench --precision fp32 --num-workers 8 --model Bert --batch-size 32 & - CUDA_VISIBLE_DEVICES=3 python -m bench --precision fp32 --num-workers 8 --model Bert --batch-size 32 & - CUDA_VISIBLE_DEVICES=4 python -m bench --precision fp32 --num-workers 8 --model Bert --batch-size 32 & - CUDA_VISIBLE_DEVICES=5 python -m bench --precision fp32 --num-workers 8 --model Bert --batch-size 32 & - CUDA_VISIBLE_DEVICES=6 python -m bench --precision fp32 --num-workers 8 --model Bert --batch-size 32 & - CUDA_VISIBLE_DEVICES=7 python -m bench --precision fp32 --num-workers 8 --model Bert --batch-size 32 & - wait -) - -echo "---" -echo "bert-fp16" -echo "=========" -time ( - CUDA_VISIBLE_DEVICES=0 python -m bench --precision fp16 --num-workers 8 --model Bert --batch-size 32 & - CUDA_VISIBLE_DEVICES=1 python -m bench --precision fp16 --num-workers 8 --model Bert --batch-size 32 & - CUDA_VISIBLE_DEVICES=2 python -m bench --precision fp16 --num-workers 8 --model Bert --batch-size 32 & - CUDA_VISIBLE_DEVICES=3 python -m bench --precision fp16 --num-workers 8 --model Bert --batch-size 32 & - CUDA_VISIBLE_DEVICES=4 python -m bench --precision fp16 --num-workers 8 --model Bert --batch-size 32 & - CUDA_VISIBLE_DEVICES=5 python -m bench --precision fp16 --num-workers 8 --model Bert --batch-size 32 & - CUDA_VISIBLE_DEVICES=6 python -m bench --precision fp16 --num-workers 8 --model Bert --batch-size 32 & - CUDA_VISIBLE_DEVICES=7 python -m bench --precision fp16 --num-workers 8 --model Bert --batch-size 32 & - wait -) - -echo "---" -echo "bert-tf32" -echo "=========" -time ( - CUDA_VISIBLE_DEVICES=0 python -m bench --precision tf32 --num-workers 8 --model Bert --batch-size 32 & - CUDA_VISIBLE_DEVICES=1 python -m bench --precision tf32 --num-workers 8 --model Bert --batch-size 32 & - CUDA_VISIBLE_DEVICES=2 python -m bench --precision tf32 --num-workers 8 --model Bert --batch-size 32 & - CUDA_VISIBLE_DEVICES=3 python -m bench --precision tf32 --num-workers 8 --model Bert --batch-size 32 & - CUDA_VISIBLE_DEVICES=4 python -m bench --precision tf32 --num-workers 8 --model Bert --batch-size 32 & - CUDA_VISIBLE_DEVICES=5 python -m bench --precision tf32 --num-workers 8 --model Bert --batch-size 32 & - CUDA_VISIBLE_DEVICES=6 python -m bench --precision tf32 --num-workers 8 --model Bert --batch-size 32 & - CUDA_VISIBLE_DEVICES=7 python -m bench --precision tf32 --num-workers 8 --model Bert --batch-size 32 & - wait -) - -echo "---" -echo "bert-tf32-fp16" -echo "==============" -time ( - CUDA_VISIBLE_DEVICES=0 python -m bench --precision tf32-fp16 --num-workers 8 --model Bert --batch-size 32 & - CUDA_VISIBLE_DEVICES=1 python -m bench --precision tf32-fp16 --num-workers 8 --model Bert --batch-size 32 & - CUDA_VISIBLE_DEVICES=2 python -m bench --precision tf32-fp16 --num-workers 8 --model Bert --batch-size 32 & - CUDA_VISIBLE_DEVICES=3 python -m bench --precision tf32-fp16 --num-workers 8 --model Bert --batch-size 32 & - CUDA_VISIBLE_DEVICES=4 python -m bench --precision tf32-fp16 --num-workers 8 --model Bert --batch-size 32 & - CUDA_VISIBLE_DEVICES=5 python -m bench --precision tf32-fp16 --num-workers 8 --model Bert --batch-size 32 & - CUDA_VISIBLE_DEVICES=6 python -m bench --precision tf32-fp16 --num-workers 8 --model Bert --batch-size 32 & - CUDA_VISIBLE_DEVICES=7 python -m bench --precision tf32-fp16 --num-workers 8 --model Bert --batch-size 32 & - wait -) - -echo "---" -echo "t5" -echo "==" -time ( - CUDA_VISIBLE_DEVICES=0 python -m bench --precision tf32-fp16 --num-workers 8 --model T5 --batch-size 16 & - CUDA_VISIBLE_DEVICES=1 python -m bench --precision tf32-fp16 --num-workers 8 --model T5 --batch-size 16 & - CUDA_VISIBLE_DEVICES=2 python -m bench --precision tf32-fp16 --num-workers 8 --model T5 --batch-size 16 & - CUDA_VISIBLE_DEVICES=3 python -m bench --precision tf32-fp16 --num-workers 8 --model T5 --batch-size 16 & - CUDA_VISIBLE_DEVICES=4 python -m bench --precision tf32-fp16 --num-workers 8 --model T5 --batch-size 16 & - CUDA_VISIBLE_DEVICES=5 python -m bench --precision tf32-fp16 --num-workers 8 --model T5 --batch-size 16 & - CUDA_VISIBLE_DEVICES=6 python -m bench --precision tf32-fp16 --num-workers 8 --model T5 --batch-size 16 & - CUDA_VISIBLE_DEVICES=7 python -m bench --precision tf32-fp16 --num-workers 8 --model T5 --batch-size 16 & - wait -) - -echo "---" -echo "reformer" -echo "========" -time ( - CUDA_VISIBLE_DEVICES=0 python -m bench --precision tf32-fp16 --num-workers 8 --model Reformer --batch-size 64 & - CUDA_VISIBLE_DEVICES=1 python -m bench --precision tf32-fp16 --num-workers 8 --model Reformer --batch-size 64 & - CUDA_VISIBLE_DEVICES=2 python -m bench --precision tf32-fp16 --num-workers 8 --model Reformer --batch-size 64 & - CUDA_VISIBLE_DEVICES=3 python -m bench --precision tf32-fp16 --num-workers 8 --model Reformer --batch-size 64 & - CUDA_VISIBLE_DEVICES=4 python -m bench --precision tf32-fp16 --num-workers 8 --model Reformer --batch-size 64 & - CUDA_VISIBLE_DEVICES=5 python -m bench --precision tf32-fp16 --num-workers 8 --model Reformer --batch-size 64 & - CUDA_VISIBLE_DEVICES=6 python -m bench --precision tf32-fp16 --num-workers 8 --model Reformer --batch-size 64 & - CUDA_VISIBLE_DEVICES=7 python -m bench --precision tf32-fp16 --num-workers 8 --model Reformer --batch-size 64 & - wait -) - -echo "---" -echo "whisper" -echo "=======" -time ( - CUDA_VISIBLE_DEVICES=0 python -m bench --precision tf32-fp16 --num-workers 8 --model Whisper --batch-size 64 & - CUDA_VISIBLE_DEVICES=1 python -m bench --precision tf32-fp16 --num-workers 8 --model Whisper --batch-size 64 & - CUDA_VISIBLE_DEVICES=2 python -m bench --precision tf32-fp16 --num-workers 8 --model Whisper --batch-size 64 & - CUDA_VISIBLE_DEVICES=3 python -m bench --precision tf32-fp16 --num-workers 8 --model Whisper --batch-size 64 & - CUDA_VISIBLE_DEVICES=4 python -m bench --precision tf32-fp16 --num-workers 8 --model Whisper --batch-size 64 & - CUDA_VISIBLE_DEVICES=5 python -m bench --precision tf32-fp16 --num-workers 8 --model Whisper --batch-size 64 & - CUDA_VISIBLE_DEVICES=6 python -m bench --precision tf32-fp16 --num-workers 8 --model Whisper --batch-size 64 & - CUDA_VISIBLE_DEVICES=7 python -m bench --precision tf32-fp16 --num-workers 8 --model Whisper --batch-size 64 & - wait -) - -echo "---" -echo "focalnet" -echo "========" -time ( - CUDA_VISIBLE_DEVICES=0 python $SRC/milabench/benchmarks/timm/pytorch-image-models/train.py --amp --amp-dtype bfloat16 --device cuda --val-split --data-dir $BASE/data --dataset FakeImageNet --workers 8 --model focalnet_base_lrf --output $BASE/extra/timm/dev/focalnet.D0 --checkpoint-hist 1 & - CUDA_VISIBLE_DEVICES=1 python $SRC/milabench/benchmarks/timm/pytorch-image-models/train.py --amp --amp-dtype bfloat16 --device cuda --val-split --data-dir $BASE/data --dataset FakeImageNet --workers 8 --model focalnet_base_lrf --output $BASE/extra/timm/dev/focalnet.D1 --checkpoint-hist 1 & - CUDA_VISIBLE_DEVICES=2 python $SRC/milabench/benchmarks/timm/pytorch-image-models/train.py --amp --amp-dtype bfloat16 --device cuda --val-split --data-dir $BASE/data --dataset FakeImageNet --workers 8 --model focalnet_base_lrf --output $BASE/extra/timm/dev/focalnet.D2 --checkpoint-hist 1 & - CUDA_VISIBLE_DEVICES=3 python $SRC/milabench/benchmarks/timm/pytorch-image-models/train.py --amp --amp-dtype bfloat16 --device cuda --val-split --data-dir $BASE/data --dataset FakeImageNet --workers 8 --model focalnet_base_lrf --output $BASE/extra/timm/dev/focalnet.D3 --checkpoint-hist 1 & - CUDA_VISIBLE_DEVICES=4 python $SRC/milabench/benchmarks/timm/pytorch-image-models/train.py --amp --amp-dtype bfloat16 --device cuda --val-split --data-dir $BASE/data --dataset FakeImageNet --workers 8 --model focalnet_base_lrf --output $BASE/extra/timm/dev/focalnet.D4 --checkpoint-hist 1 & - CUDA_VISIBLE_DEVICES=5 python $SRC/milabench/benchmarks/timm/pytorch-image-models/train.py --amp --amp-dtype bfloat16 --device cuda --val-split --data-dir $BASE/data --dataset FakeImageNet --workers 8 --model focalnet_base_lrf --output $BASE/extra/timm/dev/focalnet.D5 --checkpoint-hist 1 & - CUDA_VISIBLE_DEVICES=6 python $SRC/milabench/benchmarks/timm/pytorch-image-models/train.py --amp --amp-dtype bfloat16 --device cuda --val-split --data-dir $BASE/data --dataset FakeImageNet --workers 8 --model focalnet_base_lrf --output $BASE/extra/timm/dev/focalnet.D6 --checkpoint-hist 1 & - CUDA_VISIBLE_DEVICES=7 python $SRC/milabench/benchmarks/timm/pytorch-image-models/train.py --amp --amp-dtype bfloat16 --device cuda --val-split --data-dir $BASE/data --dataset FakeImageNet --workers 8 --model focalnet_base_lrf --output $BASE/extra/timm/dev/focalnet.D7 --checkpoint-hist 1 & - wait -) - -echo "---" -echo "brax" -echo "====" -time ( - python $SRC/milabench/benchmarks/brax/main.py --episode-length 20 --batch-size 1024 --num-minibatches 32 --num-envs 8192 & - wait -) - -echo "---" -echo "diffusion-single" -echo "================" -time ( - CUDA_VISIBLE_DEVICES=0 $SRC/milabench/benchmarks/diffusion/main.py --num_epochs 5 --batch_size 32 --num_workers 8 --cache $BASE/cache & - CUDA_VISIBLE_DEVICES=1 $SRC/milabench/benchmarks/diffusion/main.py --num_epochs 5 --batch_size 32 --num_workers 8 --cache $BASE/cache & - CUDA_VISIBLE_DEVICES=2 $SRC/milabench/benchmarks/diffusion/main.py --num_epochs 5 --batch_size 32 --num_workers 8 --cache $BASE/cache & - CUDA_VISIBLE_DEVICES=3 $SRC/milabench/benchmarks/diffusion/main.py --num_epochs 5 --batch_size 32 --num_workers 8 --cache $BASE/cache & - CUDA_VISIBLE_DEVICES=4 $SRC/milabench/benchmarks/diffusion/main.py --num_epochs 5 --batch_size 32 --num_workers 8 --cache $BASE/cache & - CUDA_VISIBLE_DEVICES=5 $SRC/milabench/benchmarks/diffusion/main.py --num_epochs 5 --batch_size 32 --num_workers 8 --cache $BASE/cache & - CUDA_VISIBLE_DEVICES=6 $SRC/milabench/benchmarks/diffusion/main.py --num_epochs 5 --batch_size 32 --num_workers 8 --cache $BASE/cache & - CUDA_VISIBLE_DEVICES=7 $SRC/milabench/benchmarks/diffusion/main.py --num_epochs 5 --batch_size 32 --num_workers 8 --cache $BASE/cache & - wait -) - -echo "---" -echo "diffusion-gpus" -echo "==============" -time ( - $SRC/milabench/milabench/scripts/activator $BASE/venv/torch $BASE/cache accelerate launch --mixed_precision=bf16 --dynamo_backend=no --machine_rank=0 --num_machines=1 --multi_gpu --gradient_accumulation_steps=1 --num_cpu_threads_per_process=4 --main_process_ip=127.0.0.1 --main_process_port=29400 --num_processes=8 $SRC/milabench/benchmarks/diffusion/main.py --num_epochs 5 --batch_size 32 --num_workers 8 --cache $BASE/cache & - wait -) - -echo "---" -echo "diffusion-nodes" -echo "===============" -time ( - $SRC/milabench/milabench/scripts/activator $BASE/venv/torch $BASE/cache accelerate launch --mixed_precision=bf16 --dynamo_backend=no --machine_rank=0 --num_machines=2 --multi_gpu --gradient_accumulation_steps=1 --num_cpu_threads_per_process=4 --main_process_ip=127.0.0.1 --main_process_port=29400 --num_processes=16 $SRC/milabench/benchmarks/diffusion/main.py --num_epochs 5 --batch_size 32 --num_workers 8 --cache $BASE/cache & - ssh -oCheckHostIP=no -oStrictHostKeyChecking=no -oPasswordAuthentication=no -oPasswordAuthentication=no -p 22 username@192.168.0.11 $SRC/milabench/milabench/scripts/activator $BASE/venv/torch $BASE/cache accelerate launch --mixed_precision=bf16 --dynamo_backend=no --machine_rank=1 --num_machines=2 --multi_gpu --gradient_accumulation_steps=1 --num_cpu_threads_per_process=4 --main_process_ip=127.0.0.1 --main_process_port=29400 --num_processes=16 $SRC/milabench/benchmarks/diffusion/main.py --num_epochs 5 --batch_size 32 --num_workers 8 --cache $BASE/cache & - wait -) - -echo "---" -echo "lightning" -echo "=========" -time ( - CUDA_VISIBLE_DEVICES=0 python $SRC/milabench/benchmarks/lightning/main.py --epochs 10 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model resnet152 --batch-size 256 & - CUDA_VISIBLE_DEVICES=1 python $SRC/milabench/benchmarks/lightning/main.py --epochs 10 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model resnet152 --batch-size 256 & - CUDA_VISIBLE_DEVICES=2 python $SRC/milabench/benchmarks/lightning/main.py --epochs 10 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model resnet152 --batch-size 256 & - CUDA_VISIBLE_DEVICES=3 python $SRC/milabench/benchmarks/lightning/main.py --epochs 10 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model resnet152 --batch-size 256 & - CUDA_VISIBLE_DEVICES=4 python $SRC/milabench/benchmarks/lightning/main.py --epochs 10 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model resnet152 --batch-size 256 & - CUDA_VISIBLE_DEVICES=5 python $SRC/milabench/benchmarks/lightning/main.py --epochs 10 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model resnet152 --batch-size 256 & - CUDA_VISIBLE_DEVICES=6 python $SRC/milabench/benchmarks/lightning/main.py --epochs 10 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model resnet152 --batch-size 256 & - CUDA_VISIBLE_DEVICES=7 python $SRC/milabench/benchmarks/lightning/main.py --epochs 10 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model resnet152 --batch-size 256 & - wait -) - -echo "---" -echo "lightning-gpus" -echo "==============" -time ( - $BASE/venv/torch/bin/benchrun --nnodes=1 --rdzv-backend=static --rdzv-endpoint=127.0.0.1:29400 --master-addr=127.0.0.1 --master-port=29400 --local-ranks-filter=0 --nproc-per-node=8 --no-python -- python $SRC/milabench/benchmarks/lightning/main.py --epochs 10 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model resnet152 --batch-size 256 & - wait -) - -echo "---" -echo "dinov2-giant-single" -echo "===================" -time ( - CUDA_VISIBLE_DEVICES=0 python $SRC/milabench/benchmarks/dinov2/main.py --output-dir $BASE/extra/dinov2-giant-single/output --no-resume --config-file $SRC/milabench/benchmarks/dinov2/src/dinov2/configs/train/vitg14.yaml train.dataset_path=ImageNet:split=TRAIN:root=$BASE/data/FakeImageNet:extra=$BASE/data/FakeImageNet train.batch_size_per_gpu=32 train.saveckp_freq=100 train.num_workers=10 & - CUDA_VISIBLE_DEVICES=1 python $SRC/milabench/benchmarks/dinov2/main.py --output-dir $BASE/extra/dinov2-giant-single/output --no-resume --config-file $SRC/milabench/benchmarks/dinov2/src/dinov2/configs/train/vitg14.yaml train.dataset_path=ImageNet:split=TRAIN:root=$BASE/data/FakeImageNet:extra=$BASE/data/FakeImageNet train.batch_size_per_gpu=32 train.saveckp_freq=100 train.num_workers=10 & - CUDA_VISIBLE_DEVICES=2 python $SRC/milabench/benchmarks/dinov2/main.py --output-dir $BASE/extra/dinov2-giant-single/output --no-resume --config-file $SRC/milabench/benchmarks/dinov2/src/dinov2/configs/train/vitg14.yaml train.dataset_path=ImageNet:split=TRAIN:root=$BASE/data/FakeImageNet:extra=$BASE/data/FakeImageNet train.batch_size_per_gpu=32 train.saveckp_freq=100 train.num_workers=10 & - CUDA_VISIBLE_DEVICES=3 python $SRC/milabench/benchmarks/dinov2/main.py --output-dir $BASE/extra/dinov2-giant-single/output --no-resume --config-file $SRC/milabench/benchmarks/dinov2/src/dinov2/configs/train/vitg14.yaml train.dataset_path=ImageNet:split=TRAIN:root=$BASE/data/FakeImageNet:extra=$BASE/data/FakeImageNet train.batch_size_per_gpu=32 train.saveckp_freq=100 train.num_workers=10 & - CUDA_VISIBLE_DEVICES=4 python $SRC/milabench/benchmarks/dinov2/main.py --output-dir $BASE/extra/dinov2-giant-single/output --no-resume --config-file $SRC/milabench/benchmarks/dinov2/src/dinov2/configs/train/vitg14.yaml train.dataset_path=ImageNet:split=TRAIN:root=$BASE/data/FakeImageNet:extra=$BASE/data/FakeImageNet train.batch_size_per_gpu=32 train.saveckp_freq=100 train.num_workers=10 & - CUDA_VISIBLE_DEVICES=5 python $SRC/milabench/benchmarks/dinov2/main.py --output-dir $BASE/extra/dinov2-giant-single/output --no-resume --config-file $SRC/milabench/benchmarks/dinov2/src/dinov2/configs/train/vitg14.yaml train.dataset_path=ImageNet:split=TRAIN:root=$BASE/data/FakeImageNet:extra=$BASE/data/FakeImageNet train.batch_size_per_gpu=32 train.saveckp_freq=100 train.num_workers=10 & - CUDA_VISIBLE_DEVICES=6 python $SRC/milabench/benchmarks/dinov2/main.py --output-dir $BASE/extra/dinov2-giant-single/output --no-resume --config-file $SRC/milabench/benchmarks/dinov2/src/dinov2/configs/train/vitg14.yaml train.dataset_path=ImageNet:split=TRAIN:root=$BASE/data/FakeImageNet:extra=$BASE/data/FakeImageNet train.batch_size_per_gpu=32 train.saveckp_freq=100 train.num_workers=10 & - CUDA_VISIBLE_DEVICES=7 python $SRC/milabench/benchmarks/dinov2/main.py --output-dir $BASE/extra/dinov2-giant-single/output --no-resume --config-file $SRC/milabench/benchmarks/dinov2/src/dinov2/configs/train/vitg14.yaml train.dataset_path=ImageNet:split=TRAIN:root=$BASE/data/FakeImageNet:extra=$BASE/data/FakeImageNet train.batch_size_per_gpu=32 train.saveckp_freq=100 train.num_workers=10 & - wait -) - -echo "---" -echo "dinov2-giant-gpus" -echo "=================" -time ( - $BASE/venv/torch/bin/benchrun --nnodes=1 --rdzv-backend=static --rdzv-endpoint=127.0.0.1:29400 --master-addr=127.0.0.1 --master-port=29400 --local-ranks-filter=0 --nproc-per-node=8 --no-python -- python $SRC/milabench/benchmarks/dinov2/main.py --output-dir $BASE/extra/dinov2-giant-gpus/output --no-resume --config-file $SRC/milabench/benchmarks/dinov2/src/dinov2/configs/train/vitg14.yaml train.dataset_path=ImageNet:split=TRAIN:root=$BASE/data/FakeImageNet:extra=$BASE/data/FakeImageNet train.batch_size_per_gpu=32 train.saveckp_freq=100 train.num_workers=10 & - wait -) - -echo "---" -echo "llm-lora-single" -echo "===============" -time ( - CUDA_VISIBLE_DEVICES=0 $SRC/milabench/benchmarks/llm/recipes/lora_finetune_single_device.py --config $SRC/milabench/benchmarks/llm/configs/llama3_8B_lora_single_device.yaml epochs=1 output_dir=$BASE/extra/llm-lora-single/output tokenizer.path=$BASE/data/llama3_8B/original/tokenizer.model checkpointer.checkpoint_dir=$BASE/data/llama3_8B/original checkpointer.output_dir=$BASE/data/llama3_8B/ metric_logger.log_dir=$BASE/extra/llm-lora-single/metrics repo_id="meta-llama/Meta-Llama-3.1-8B" batch_size=8 gradient_accumulation_steps=8 & - CUDA_VISIBLE_DEVICES=1 $SRC/milabench/benchmarks/llm/recipes/lora_finetune_single_device.py --config $SRC/milabench/benchmarks/llm/configs/llama3_8B_lora_single_device.yaml epochs=1 output_dir=$BASE/extra/llm-lora-single/output tokenizer.path=$BASE/data/llama3_8B/original/tokenizer.model checkpointer.checkpoint_dir=$BASE/data/llama3_8B/original checkpointer.output_dir=$BASE/data/llama3_8B/ metric_logger.log_dir=$BASE/extra/llm-lora-single/metrics repo_id="meta-llama/Meta-Llama-3.1-8B" batch_size=8 gradient_accumulation_steps=8 & - CUDA_VISIBLE_DEVICES=2 $SRC/milabench/benchmarks/llm/recipes/lora_finetune_single_device.py --config $SRC/milabench/benchmarks/llm/configs/llama3_8B_lora_single_device.yaml epochs=1 output_dir=$BASE/extra/llm-lora-single/output tokenizer.path=$BASE/data/llama3_8B/original/tokenizer.model checkpointer.checkpoint_dir=$BASE/data/llama3_8B/original checkpointer.output_dir=$BASE/data/llama3_8B/ metric_logger.log_dir=$BASE/extra/llm-lora-single/metrics repo_id="meta-llama/Meta-Llama-3.1-8B" batch_size=8 gradient_accumulation_steps=8 & - CUDA_VISIBLE_DEVICES=3 $SRC/milabench/benchmarks/llm/recipes/lora_finetune_single_device.py --config $SRC/milabench/benchmarks/llm/configs/llama3_8B_lora_single_device.yaml epochs=1 output_dir=$BASE/extra/llm-lora-single/output tokenizer.path=$BASE/data/llama3_8B/original/tokenizer.model checkpointer.checkpoint_dir=$BASE/data/llama3_8B/original checkpointer.output_dir=$BASE/data/llama3_8B/ metric_logger.log_dir=$BASE/extra/llm-lora-single/metrics repo_id="meta-llama/Meta-Llama-3.1-8B" batch_size=8 gradient_accumulation_steps=8 & - CUDA_VISIBLE_DEVICES=4 $SRC/milabench/benchmarks/llm/recipes/lora_finetune_single_device.py --config $SRC/milabench/benchmarks/llm/configs/llama3_8B_lora_single_device.yaml epochs=1 output_dir=$BASE/extra/llm-lora-single/output tokenizer.path=$BASE/data/llama3_8B/original/tokenizer.model checkpointer.checkpoint_dir=$BASE/data/llama3_8B/original checkpointer.output_dir=$BASE/data/llama3_8B/ metric_logger.log_dir=$BASE/extra/llm-lora-single/metrics repo_id="meta-llama/Meta-Llama-3.1-8B" batch_size=8 gradient_accumulation_steps=8 & - CUDA_VISIBLE_DEVICES=5 $SRC/milabench/benchmarks/llm/recipes/lora_finetune_single_device.py --config $SRC/milabench/benchmarks/llm/configs/llama3_8B_lora_single_device.yaml epochs=1 output_dir=$BASE/extra/llm-lora-single/output tokenizer.path=$BASE/data/llama3_8B/original/tokenizer.model checkpointer.checkpoint_dir=$BASE/data/llama3_8B/original checkpointer.output_dir=$BASE/data/llama3_8B/ metric_logger.log_dir=$BASE/extra/llm-lora-single/metrics repo_id="meta-llama/Meta-Llama-3.1-8B" batch_size=8 gradient_accumulation_steps=8 & - CUDA_VISIBLE_DEVICES=6 $SRC/milabench/benchmarks/llm/recipes/lora_finetune_single_device.py --config $SRC/milabench/benchmarks/llm/configs/llama3_8B_lora_single_device.yaml epochs=1 output_dir=$BASE/extra/llm-lora-single/output tokenizer.path=$BASE/data/llama3_8B/original/tokenizer.model checkpointer.checkpoint_dir=$BASE/data/llama3_8B/original checkpointer.output_dir=$BASE/data/llama3_8B/ metric_logger.log_dir=$BASE/extra/llm-lora-single/metrics repo_id="meta-llama/Meta-Llama-3.1-8B" batch_size=8 gradient_accumulation_steps=8 & - CUDA_VISIBLE_DEVICES=7 $SRC/milabench/benchmarks/llm/recipes/lora_finetune_single_device.py --config $SRC/milabench/benchmarks/llm/configs/llama3_8B_lora_single_device.yaml epochs=1 output_dir=$BASE/extra/llm-lora-single/output tokenizer.path=$BASE/data/llama3_8B/original/tokenizer.model checkpointer.checkpoint_dir=$BASE/data/llama3_8B/original checkpointer.output_dir=$BASE/data/llama3_8B/ metric_logger.log_dir=$BASE/extra/llm-lora-single/metrics repo_id="meta-llama/Meta-Llama-3.1-8B" batch_size=8 gradient_accumulation_steps=8 & - wait -) - -echo "---" -echo "llm-lora-ddp-gpus" -echo "=================" -time ( - $BASE/venv/torch/bin/tune run --nnodes=1 --rdzv-backend=static --rdzv-endpoint=127.0.0.1:29400 --master-addr=127.0.0.1 --master-port=29400 --local-ranks-filter=0 --nproc-per-node=8 -- $SRC/milabench/benchmarks/llm/recipes/lora_finetune_distributed.py --config $SRC/milabench/benchmarks/llm/configs/llama3_8B_lora_single_device.yaml epochs=1 output_dir=$BASE/extra/llm-lora-ddp-gpus/output tokenizer.path=$BASE/data/llama3_8B/original/tokenizer.model checkpointer.checkpoint_dir=$BASE/data/llama3_8B/original checkpointer.output_dir=$BASE/data/llama3_8B/ metric_logger.log_dir=$BASE/extra/llm-lora-ddp-gpus/metrics repo_id="meta-llama/Meta-Llama-3.1-8B" batch_size=8 gradient_accumulation_steps=8 & - wait -) - -echo "---" -echo "llm-lora-ddp-nodes" -echo "==================" -time ( - $BASE/venv/torch/bin/tune run --nnodes=2 --rdzv-backend=static --rdzv-endpoint=127.0.0.1:29400 --master-addr=127.0.0.1 --master-port=29400 --local-ranks-filter=0 --node-rank=0 --local-addr=127.0.0.1 --rdzv-conf=rank=0 --nproc-per-node=8 -- $SRC/milabench/benchmarks/llm/recipes/lora_finetune_distributed.py --config $SRC/milabench/benchmarks/llm/configs/llama3_8B_lora_single_device.yaml epochs=1 output_dir=$BASE/extra/llm-lora-ddp-nodes/output tokenizer.path=$BASE/data/llama3_8B/original/tokenizer.model checkpointer.checkpoint_dir=$BASE/data/llama3_8B/original checkpointer.output_dir=$BASE/data/llama3_8B/ metric_logger.log_dir=$BASE/extra/llm-lora-ddp-nodes/metrics repo_id="meta-llama/Meta-Llama-3.1-8B" batch_size=8 gradient_accumulation_steps=8 & - ssh -oCheckHostIP=no -oStrictHostKeyChecking=no -oPasswordAuthentication=no -oPasswordAuthentication=no -p 22 username@192.168.0.11 $BASE/venv/torch/bin/tune run --nnodes=2 --rdzv-backend=static --rdzv-endpoint=127.0.0.1:29400 --master-addr=127.0.0.1 --master-port=29400 --local-ranks-filter=0 --node-rank=1 --local-addr=192.168.0.11 --rdzv-conf=rank=1 --nproc-per-node=8 -- $SRC/milabench/benchmarks/llm/recipes/lora_finetune_distributed.py --config $SRC/milabench/benchmarks/llm/configs/llama3_8B_lora_single_device.yaml epochs=1 output_dir=$BASE/extra/llm-lora-ddp-nodes/output tokenizer.path=$BASE/data/llama3_8B/original/tokenizer.model checkpointer.checkpoint_dir=$BASE/data/llama3_8B/original checkpointer.output_dir=$BASE/data/llama3_8B/ metric_logger.log_dir=$BASE/extra/llm-lora-ddp-nodes/metrics repo_id="meta-llama/Meta-Llama-3.1-8B" batch_size=8 gradient_accumulation_steps=8 & - wait -) - -echo "---" -echo "llm-lora-mp-gpus" -echo "================" -time ( - $BASE/venv/torch/bin/tune run --nnodes=1 --rdzv-backend=static --rdzv-endpoint=127.0.0.1:29400 --master-addr=127.0.0.1 --master-port=29400 --local-ranks-filter=0 --nproc-per-node=8 -- $SRC/milabench/benchmarks/llm/recipes/lora_finetune_distributed.py --config $SRC/milabench/benchmarks/llm/configs/llama3_70B_lora.yaml epochs=1 output_dir=$BASE/extra/llm-lora-mp-gpus/output tokenizer.path=$BASE/data/llama3_70B/original/tokenizer.model checkpointer.checkpoint_dir=$BASE/data/llama3_70B checkpointer.output_dir=$BASE/data/llama3_70B/ safetensors=true metric_logger.log_dir=$BASE/extra/llm-lora-mp-gpus/metrics repo_id="meta-llama/Meta-Llama-3.1-70B" batch_size=8 gradient_accumulation_steps=1 & - wait -) - -echo "---" -echo "llm-full-mp-gpus" -echo "================" -time ( - $BASE/venv/torch/bin/tune run --nnodes=1 --rdzv-backend=static --rdzv-endpoint=127.0.0.1:29400 --master-addr=127.0.0.1 --master-port=29400 --local-ranks-filter=0 --nproc-per-node=8 -- $SRC/milabench/benchmarks/llm/recipes/full_finetune_distributed.py --config $SRC/milabench/benchmarks/llm/configs/llama3_70B_full.yaml epochs=1 output_dir=$BASE/extra/llm-full-mp-gpus/output tokenizer.path=$BASE/data/llama3_70B/original/tokenizer.model checkpointer.checkpoint_dir=$BASE/data/llama3_70B checkpointer.output_dir=$BASE/data/llama3_70B/ metric_logger.log_dir=$BASE/extra/llm-full-mp-gpus/metrics repo_id="meta-llama/Meta-Llama-3.1-70B" safetensors=true batch_size=2 gradient_accumulation_steps=1 & - wait -) - -echo "---" -echo "llm-full-mp-nodes" -echo "=================" -time ( - $BASE/venv/torch/bin/tune run --nnodes=2 --rdzv-backend=static --rdzv-endpoint=127.0.0.1:29400 --master-addr=127.0.0.1 --master-port=29400 --local-ranks-filter=0 --node-rank=0 --local-addr=127.0.0.1 --rdzv-conf=rank=0 --nproc-per-node=8 -- $SRC/milabench/benchmarks/llm/recipes/full_finetune_distributed.py --config $SRC/milabench/benchmarks/llm/configs/llama3_70B_full.yaml epochs=1 output_dir=$BASE/extra/llm-full-mp-nodes/output tokenizer.path=$BASE/data/llama3_70B/original/tokenizer.model checkpointer.checkpoint_dir=$BASE/data/llama3_70B checkpointer.output_dir=$BASE/data/llama3_70B/ metric_logger.log_dir=$BASE/extra/llm-full-mp-nodes/metrics repo_id="meta-llama/Meta-Llama-3.1-70B" safetensors=true batch_size=2 gradient_accumulation_steps=1 & - ssh -oCheckHostIP=no -oStrictHostKeyChecking=no -oPasswordAuthentication=no -oPasswordAuthentication=no -p 22 username@192.168.0.11 $BASE/venv/torch/bin/tune run --nnodes=2 --rdzv-backend=static --rdzv-endpoint=127.0.0.1:29400 --master-addr=127.0.0.1 --master-port=29400 --local-ranks-filter=0 --node-rank=1 --local-addr=192.168.0.11 --rdzv-conf=rank=1 --nproc-per-node=8 -- $SRC/milabench/benchmarks/llm/recipes/full_finetune_distributed.py --config $SRC/milabench/benchmarks/llm/configs/llama3_70B_full.yaml epochs=1 output_dir=$BASE/extra/llm-full-mp-nodes/output tokenizer.path=$BASE/data/llama3_70B/original/tokenizer.model checkpointer.checkpoint_dir=$BASE/data/llama3_70B checkpointer.output_dir=$BASE/data/llama3_70B/ metric_logger.log_dir=$BASE/extra/llm-full-mp-nodes/metrics repo_id="meta-llama/Meta-Llama-3.1-70B" safetensors=true batch_size=2 gradient_accumulation_steps=1 & - wait -) - -echo "---" -echo "dqn" -echo "===" -time ( - CUDA_VISIBLE_DEVICES=0 python $SRC/milabench/benchmarks/purejaxrl/main.py dqn --num_envs 128 --buffer_batch_size 128 --env_name CartPole-v1 --training_interval 10 & - CUDA_VISIBLE_DEVICES=1 python $SRC/milabench/benchmarks/purejaxrl/main.py dqn --num_envs 128 --buffer_batch_size 128 --env_name CartPole-v1 --training_interval 10 & - CUDA_VISIBLE_DEVICES=2 python $SRC/milabench/benchmarks/purejaxrl/main.py dqn --num_envs 128 --buffer_batch_size 128 --env_name CartPole-v1 --training_interval 10 & - CUDA_VISIBLE_DEVICES=3 python $SRC/milabench/benchmarks/purejaxrl/main.py dqn --num_envs 128 --buffer_batch_size 128 --env_name CartPole-v1 --training_interval 10 & - CUDA_VISIBLE_DEVICES=4 python $SRC/milabench/benchmarks/purejaxrl/main.py dqn --num_envs 128 --buffer_batch_size 128 --env_name CartPole-v1 --training_interval 10 & - CUDA_VISIBLE_DEVICES=5 python $SRC/milabench/benchmarks/purejaxrl/main.py dqn --num_envs 128 --buffer_batch_size 128 --env_name CartPole-v1 --training_interval 10 & - CUDA_VISIBLE_DEVICES=6 python $SRC/milabench/benchmarks/purejaxrl/main.py dqn --num_envs 128 --buffer_batch_size 128 --env_name CartPole-v1 --training_interval 10 & - CUDA_VISIBLE_DEVICES=7 python $SRC/milabench/benchmarks/purejaxrl/main.py dqn --num_envs 128 --buffer_batch_size 128 --env_name CartPole-v1 --training_interval 10 & - wait -) - -echo "---" -echo "ppo" -echo "===" -time ( - CUDA_VISIBLE_DEVICES=0 python $SRC/milabench/benchmarks/purejaxrl/main.py ppo --num_envs 128 --num_steps 10 --num_minibatches 32 --update_epochs 4 --env_name hopper --total_timesteps 200000 & - CUDA_VISIBLE_DEVICES=1 python $SRC/milabench/benchmarks/purejaxrl/main.py ppo --num_envs 128 --num_steps 10 --num_minibatches 32 --update_epochs 4 --env_name hopper --total_timesteps 200000 & - CUDA_VISIBLE_DEVICES=2 python $SRC/milabench/benchmarks/purejaxrl/main.py ppo --num_envs 128 --num_steps 10 --num_minibatches 32 --update_epochs 4 --env_name hopper --total_timesteps 200000 & - CUDA_VISIBLE_DEVICES=3 python $SRC/milabench/benchmarks/purejaxrl/main.py ppo --num_envs 128 --num_steps 10 --num_minibatches 32 --update_epochs 4 --env_name hopper --total_timesteps 200000 & - CUDA_VISIBLE_DEVICES=4 python $SRC/milabench/benchmarks/purejaxrl/main.py ppo --num_envs 128 --num_steps 10 --num_minibatches 32 --update_epochs 4 --env_name hopper --total_timesteps 200000 & - CUDA_VISIBLE_DEVICES=5 python $SRC/milabench/benchmarks/purejaxrl/main.py ppo --num_envs 128 --num_steps 10 --num_minibatches 32 --update_epochs 4 --env_name hopper --total_timesteps 200000 & - CUDA_VISIBLE_DEVICES=6 python $SRC/milabench/benchmarks/purejaxrl/main.py ppo --num_envs 128 --num_steps 10 --num_minibatches 32 --update_epochs 4 --env_name hopper --total_timesteps 200000 & - CUDA_VISIBLE_DEVICES=7 python $SRC/milabench/benchmarks/purejaxrl/main.py ppo --num_envs 128 --num_steps 10 --num_minibatches 32 --update_epochs 4 --env_name hopper --total_timesteps 200000 & - wait -) - -echo "---" -echo "dimenet" -echo "=======" -time ( - CUDA_VISIBLE_DEVICES=0 python $SRC/milabench/benchmarks/geo_gnn/main.py --model DimeNet --num-samples 10000 --use3d & - CUDA_VISIBLE_DEVICES=1 python $SRC/milabench/benchmarks/geo_gnn/main.py --model DimeNet --num-samples 10000 --use3d & - CUDA_VISIBLE_DEVICES=2 python $SRC/milabench/benchmarks/geo_gnn/main.py --model DimeNet --num-samples 10000 --use3d & - CUDA_VISIBLE_DEVICES=3 python $SRC/milabench/benchmarks/geo_gnn/main.py --model DimeNet --num-samples 10000 --use3d & - CUDA_VISIBLE_DEVICES=4 python $SRC/milabench/benchmarks/geo_gnn/main.py --model DimeNet --num-samples 10000 --use3d & - CUDA_VISIBLE_DEVICES=5 python $SRC/milabench/benchmarks/geo_gnn/main.py --model DimeNet --num-samples 10000 --use3d & - CUDA_VISIBLE_DEVICES=6 python $SRC/milabench/benchmarks/geo_gnn/main.py --model DimeNet --num-samples 10000 --use3d & - CUDA_VISIBLE_DEVICES=7 python $SRC/milabench/benchmarks/geo_gnn/main.py --model DimeNet --num-samples 10000 --use3d & - wait -) - -echo "---" -echo "recursiongfn" -echo "============" -time ( - CUDA_VISIBLE_DEVICES=0 python $SRC/milabench/benchmarks/recursiongfn/main.py --batch_size 128 --num_workers 8 --num_steps 100 --layer_width 128 --num_layers 4 & - CUDA_VISIBLE_DEVICES=1 python $SRC/milabench/benchmarks/recursiongfn/main.py --batch_size 128 --num_workers 8 --num_steps 100 --layer_width 128 --num_layers 4 & - CUDA_VISIBLE_DEVICES=2 python $SRC/milabench/benchmarks/recursiongfn/main.py --batch_size 128 --num_workers 8 --num_steps 100 --layer_width 128 --num_layers 4 & - CUDA_VISIBLE_DEVICES=3 python $SRC/milabench/benchmarks/recursiongfn/main.py --batch_size 128 --num_workers 8 --num_steps 100 --layer_width 128 --num_layers 4 & - CUDA_VISIBLE_DEVICES=4 python $SRC/milabench/benchmarks/recursiongfn/main.py --batch_size 128 --num_workers 8 --num_steps 100 --layer_width 128 --num_layers 4 & - CUDA_VISIBLE_DEVICES=5 python $SRC/milabench/benchmarks/recursiongfn/main.py --batch_size 128 --num_workers 8 --num_steps 100 --layer_width 128 --num_layers 4 & - CUDA_VISIBLE_DEVICES=6 python $SRC/milabench/benchmarks/recursiongfn/main.py --batch_size 128 --num_workers 8 --num_steps 100 --layer_width 128 --num_layers 4 & - CUDA_VISIBLE_DEVICES=7 python $SRC/milabench/benchmarks/recursiongfn/main.py --batch_size 128 --num_workers 8 --num_steps 100 --layer_width 128 --num_layers 4 & - wait -) - -echo "---" -echo "torchatari" -echo "==========" -time ( - CUDA_VISIBLE_DEVICES=0 python $SRC/milabench/benchmarks/torchatari/main.py --num-minibatches 16 --update-epochs 4 --num-steps 128 --num-envs 128 --total-timesteps 1000000 --env-id Breakout-v5 & - CUDA_VISIBLE_DEVICES=1 python $SRC/milabench/benchmarks/torchatari/main.py --num-minibatches 16 --update-epochs 4 --num-steps 128 --num-envs 128 --total-timesteps 1000000 --env-id Breakout-v5 & - CUDA_VISIBLE_DEVICES=2 python $SRC/milabench/benchmarks/torchatari/main.py --num-minibatches 16 --update-epochs 4 --num-steps 128 --num-envs 128 --total-timesteps 1000000 --env-id Breakout-v5 & - CUDA_VISIBLE_DEVICES=3 python $SRC/milabench/benchmarks/torchatari/main.py --num-minibatches 16 --update-epochs 4 --num-steps 128 --num-envs 128 --total-timesteps 1000000 --env-id Breakout-v5 & - CUDA_VISIBLE_DEVICES=4 python $SRC/milabench/benchmarks/torchatari/main.py --num-minibatches 16 --update-epochs 4 --num-steps 128 --num-envs 128 --total-timesteps 1000000 --env-id Breakout-v5 & - CUDA_VISIBLE_DEVICES=5 python $SRC/milabench/benchmarks/torchatari/main.py --num-minibatches 16 --update-epochs 4 --num-steps 128 --num-envs 128 --total-timesteps 1000000 --env-id Breakout-v5 & - CUDA_VISIBLE_DEVICES=6 python $SRC/milabench/benchmarks/torchatari/main.py --num-minibatches 16 --update-epochs 4 --num-steps 128 --num-envs 128 --total-timesteps 1000000 --env-id Breakout-v5 & - CUDA_VISIBLE_DEVICES=7 python $SRC/milabench/benchmarks/torchatari/main.py --num-minibatches 16 --update-epochs 4 --num-steps 128 --num-envs 128 --total-timesteps 1000000 --env-id Breakout-v5 & - wait -) - -echo "---" -echo "llava-single" -echo "============" -time ( - CUDA_VISIBLE_DEVICES=0 $SRC/milabench/benchmarks/llava/main.py --batch_size 1 --num_workers 4 --gradient_accumulation_steps 1 & - CUDA_VISIBLE_DEVICES=1 $SRC/milabench/benchmarks/llava/main.py --batch_size 1 --num_workers 4 --gradient_accumulation_steps 1 & - CUDA_VISIBLE_DEVICES=2 $SRC/milabench/benchmarks/llava/main.py --batch_size 1 --num_workers 4 --gradient_accumulation_steps 1 & - CUDA_VISIBLE_DEVICES=3 $SRC/milabench/benchmarks/llava/main.py --batch_size 1 --num_workers 4 --gradient_accumulation_steps 1 & - CUDA_VISIBLE_DEVICES=4 $SRC/milabench/benchmarks/llava/main.py --batch_size 1 --num_workers 4 --gradient_accumulation_steps 1 & - CUDA_VISIBLE_DEVICES=5 $SRC/milabench/benchmarks/llava/main.py --batch_size 1 --num_workers 4 --gradient_accumulation_steps 1 & - CUDA_VISIBLE_DEVICES=6 $SRC/milabench/benchmarks/llava/main.py --batch_size 1 --num_workers 4 --gradient_accumulation_steps 1 & - CUDA_VISIBLE_DEVICES=7 $SRC/milabench/benchmarks/llava/main.py --batch_size 1 --num_workers 4 --gradient_accumulation_steps 1 & - wait -) - -echo "---" -echo "rlhf-single" -echo "===========" -time ( - CUDA_VISIBLE_DEVICES=0 $SRC/milabench/benchmarks/rlhf/main.py --output_dir $BASE/extra/rlhf-single/output --model_name_or_path EleutherAI/pythia-1b-deduped --per_device_train_batch_size 64 --logging_strategy no --log_level critical --bf16 & - CUDA_VISIBLE_DEVICES=1 $SRC/milabench/benchmarks/rlhf/main.py --output_dir $BASE/extra/rlhf-single/output --model_name_or_path EleutherAI/pythia-1b-deduped --per_device_train_batch_size 64 --logging_strategy no --log_level critical --bf16 & - CUDA_VISIBLE_DEVICES=2 $SRC/milabench/benchmarks/rlhf/main.py --output_dir $BASE/extra/rlhf-single/output --model_name_or_path EleutherAI/pythia-1b-deduped --per_device_train_batch_size 64 --logging_strategy no --log_level critical --bf16 & - CUDA_VISIBLE_DEVICES=3 $SRC/milabench/benchmarks/rlhf/main.py --output_dir $BASE/extra/rlhf-single/output --model_name_or_path EleutherAI/pythia-1b-deduped --per_device_train_batch_size 64 --logging_strategy no --log_level critical --bf16 & - CUDA_VISIBLE_DEVICES=4 $SRC/milabench/benchmarks/rlhf/main.py --output_dir $BASE/extra/rlhf-single/output --model_name_or_path EleutherAI/pythia-1b-deduped --per_device_train_batch_size 64 --logging_strategy no --log_level critical --bf16 & - CUDA_VISIBLE_DEVICES=5 $SRC/milabench/benchmarks/rlhf/main.py --output_dir $BASE/extra/rlhf-single/output --model_name_or_path EleutherAI/pythia-1b-deduped --per_device_train_batch_size 64 --logging_strategy no --log_level critical --bf16 & - CUDA_VISIBLE_DEVICES=6 $SRC/milabench/benchmarks/rlhf/main.py --output_dir $BASE/extra/rlhf-single/output --model_name_or_path EleutherAI/pythia-1b-deduped --per_device_train_batch_size 64 --logging_strategy no --log_level critical --bf16 & - CUDA_VISIBLE_DEVICES=7 $SRC/milabench/benchmarks/rlhf/main.py --output_dir $BASE/extra/rlhf-single/output --model_name_or_path EleutherAI/pythia-1b-deduped --per_device_train_batch_size 64 --logging_strategy no --log_level critical --bf16 & - wait -) - -echo "---" -echo "rlhf-gpus" -echo "=========" -time ( - $SRC/milabench/milabench/scripts/activator $BASE/venv/torch $BASE/cache accelerate launch --mixed_precision=bf16 --dynamo_backend=no --machine_rank=0 --num_machines=1 --multi_gpu --gradient_accumulation_steps=1 --num_cpu_threads_per_process=4 --main_process_ip=127.0.0.1 --main_process_port=29400 --num_processes=8 $SRC/milabench/benchmarks/rlhf/main.py --output_dir $BASE/extra/rlhf-gpus/output --model_name_or_path EleutherAI/pythia-1b-deduped --per_device_train_batch_size 64 --logging_strategy no --log_level critical --bf16 & - wait -) - -echo "---" -echo "vjepa-single" -echo "============" -time ( - CUDA_VISIBLE_DEVICES=0 python $SRC/milabench/benchmarks/vjepa/main.py --batch_size 24 --num_workers 12 --dataset $BASE/data/FakeVideo/video_metainfo.csv --output $BASE/extra/vjepa-single & - CUDA_VISIBLE_DEVICES=1 python $SRC/milabench/benchmarks/vjepa/main.py --batch_size 24 --num_workers 12 --dataset $BASE/data/FakeVideo/video_metainfo.csv --output $BASE/extra/vjepa-single & - CUDA_VISIBLE_DEVICES=2 python $SRC/milabench/benchmarks/vjepa/main.py --batch_size 24 --num_workers 12 --dataset $BASE/data/FakeVideo/video_metainfo.csv --output $BASE/extra/vjepa-single & - CUDA_VISIBLE_DEVICES=3 python $SRC/milabench/benchmarks/vjepa/main.py --batch_size 24 --num_workers 12 --dataset $BASE/data/FakeVideo/video_metainfo.csv --output $BASE/extra/vjepa-single & - CUDA_VISIBLE_DEVICES=4 python $SRC/milabench/benchmarks/vjepa/main.py --batch_size 24 --num_workers 12 --dataset $BASE/data/FakeVideo/video_metainfo.csv --output $BASE/extra/vjepa-single & - CUDA_VISIBLE_DEVICES=5 python $SRC/milabench/benchmarks/vjepa/main.py --batch_size 24 --num_workers 12 --dataset $BASE/data/FakeVideo/video_metainfo.csv --output $BASE/extra/vjepa-single & - CUDA_VISIBLE_DEVICES=6 python $SRC/milabench/benchmarks/vjepa/main.py --batch_size 24 --num_workers 12 --dataset $BASE/data/FakeVideo/video_metainfo.csv --output $BASE/extra/vjepa-single & - CUDA_VISIBLE_DEVICES=7 python $SRC/milabench/benchmarks/vjepa/main.py --batch_size 24 --num_workers 12 --dataset $BASE/data/FakeVideo/video_metainfo.csv --output $BASE/extra/vjepa-single & - wait -) - -echo "---" -echo "vjepa-gpus" -echo "==========" -time ( - $BASE/venv/torch/bin/benchrun --nnodes=1 --rdzv-backend=static --rdzv-endpoint=127.0.0.1:29400 --master-addr=127.0.0.1 --master-port=29400 --local-ranks-filter=0 --nproc-per-node=8 --no-python -- python $SRC/milabench/benchmarks/vjepa/main.py --batch_size 24 --num_workers 12 --dataset $BASE/data/FakeVideo/video_metainfo.csv --output $BASE/extra/vjepa-gpus & - wait -) - diff --git a/tests/test_capabilities.py b/tests/test_validation/test_capabilities.py similarity index 100% rename from tests/test_capabilities.py rename to tests/test_validation/test_capabilities.py