From 36056e0760ab22b83f4b4c59b3c4886e942a9725 Mon Sep 17 00:00:00 2001 From: chenyu Date: Fri, 11 Oct 2024 16:20:34 -0400 Subject: [PATCH] update mlperf systems and copy 4.1 to 5.0 (#7004) --- .../tinycorp/systems/tinybox_green.json | 4 +- .../tinycorp/systems/tinybox_red.json | 6 +- .../implementations/tinybox_green/README.md | 73 +++++++++++++++++++ .../implementations/tinybox_green/dev_beam.sh | 13 ++++ .../implementations/tinybox_green/dev_run.sh | 13 ++++ .../tinybox_green/run_and_time.sh | 23 ++++++ .../implementations/tinybox_red/README.md | 73 +++++++++++++++++++ .../implementations/tinybox_red/dev_beam.sh | 13 ++++ .../implementations/tinybox_red/dev_run.sh | 13 ++++ .../tinybox_red/run_and_time.sh | 23 ++++++ .../bert/implementations/tinybox_red/setup.sh | 8 ++ .../implementations/tinybox_green/README.md | 50 +++++++++++++ .../implementations/tinybox_green/dev_beam.sh | 13 ++++ .../implementations/tinybox_green/dev_run.sh | 15 ++++ .../tinybox_green/run_and_time.sh | 23 ++++++ .../implementations/tinybox_red/README.md | 50 +++++++++++++ .../implementations/tinybox_red/dev_beam.sh | 13 ++++ .../implementations/tinybox_red/dev_run.sh | 15 ++++ .../tinybox_red/run_and_time.sh | 23 ++++++ .../implementations/tinybox_red/setup.sh | 8 ++ .../tinycorp/systems/tinybox_green.json | 38 ++++++++++ .../tinycorp/systems/tinybox_red.json | 38 ++++++++++ 22 files changed, 543 insertions(+), 5 deletions(-) create mode 100644 examples/mlperf/training_submission_v5.0/tinycorp/benchmarks/bert/implementations/tinybox_green/README.md create mode 100755 examples/mlperf/training_submission_v5.0/tinycorp/benchmarks/bert/implementations/tinybox_green/dev_beam.sh create mode 100755 examples/mlperf/training_submission_v5.0/tinycorp/benchmarks/bert/implementations/tinybox_green/dev_run.sh create mode 100755 examples/mlperf/training_submission_v5.0/tinycorp/benchmarks/bert/implementations/tinybox_green/run_and_time.sh create mode 100644 
examples/mlperf/training_submission_v5.0/tinycorp/benchmarks/bert/implementations/tinybox_red/README.md create mode 100755 examples/mlperf/training_submission_v5.0/tinycorp/benchmarks/bert/implementations/tinybox_red/dev_beam.sh create mode 100755 examples/mlperf/training_submission_v5.0/tinycorp/benchmarks/bert/implementations/tinybox_red/dev_run.sh create mode 100755 examples/mlperf/training_submission_v5.0/tinycorp/benchmarks/bert/implementations/tinybox_red/run_and_time.sh create mode 100755 examples/mlperf/training_submission_v5.0/tinycorp/benchmarks/bert/implementations/tinybox_red/setup.sh create mode 100644 examples/mlperf/training_submission_v5.0/tinycorp/benchmarks/resnet/implementations/tinybox_green/README.md create mode 100755 examples/mlperf/training_submission_v5.0/tinycorp/benchmarks/resnet/implementations/tinybox_green/dev_beam.sh create mode 100755 examples/mlperf/training_submission_v5.0/tinycorp/benchmarks/resnet/implementations/tinybox_green/dev_run.sh create mode 100755 examples/mlperf/training_submission_v5.0/tinycorp/benchmarks/resnet/implementations/tinybox_green/run_and_time.sh create mode 100644 examples/mlperf/training_submission_v5.0/tinycorp/benchmarks/resnet/implementations/tinybox_red/README.md create mode 100755 examples/mlperf/training_submission_v5.0/tinycorp/benchmarks/resnet/implementations/tinybox_red/dev_beam.sh create mode 100755 examples/mlperf/training_submission_v5.0/tinycorp/benchmarks/resnet/implementations/tinybox_red/dev_run.sh create mode 100755 examples/mlperf/training_submission_v5.0/tinycorp/benchmarks/resnet/implementations/tinybox_red/run_and_time.sh create mode 100755 examples/mlperf/training_submission_v5.0/tinycorp/benchmarks/resnet/implementations/tinybox_red/setup.sh create mode 100644 examples/mlperf/training_submission_v5.0/tinycorp/systems/tinybox_green.json create mode 100644 examples/mlperf/training_submission_v5.0/tinycorp/systems/tinybox_red.json diff --git 
a/examples/mlperf/training_submission_v4.1/tinycorp/systems/tinybox_green.json b/examples/mlperf/training_submission_v4.1/tinycorp/systems/tinybox_green.json index 3eb042750258d..bb1ebba98c70c 100644 --- a/examples/mlperf/training_submission_v4.1/tinycorp/systems/tinybox_green.json +++ b/examples/mlperf/training_submission_v4.1/tinycorp/systems/tinybox_green.json @@ -1,7 +1,7 @@ { "submitter": "tinycorp", "division": "closed", - "status": "available", + "status": "Available on-premise", "system_name": "tinybox green", "number_of_nodes": "1", "host_processors_per_node": "1", @@ -28,7 +28,7 @@ "accelerator_interconnect_topology": "", "cooling": "air", "hw_notes": "", - "framework": "tinygrad, commit 0e8aa0e2886bf9a2d3ce093bce87305e182e6d4a", + "framework": "tinygrad, commit b5546912e24e0a864b35924da4efa5d71cfe368b", "other_software_stack": { "python": "3.10.12", "CUDA": "12.4" diff --git a/examples/mlperf/training_submission_v4.1/tinycorp/systems/tinybox_red.json b/examples/mlperf/training_submission_v4.1/tinycorp/systems/tinybox_red.json index 2f41680df3e98..6db104a7dbfce 100644 --- a/examples/mlperf/training_submission_v4.1/tinycorp/systems/tinybox_red.json +++ b/examples/mlperf/training_submission_v4.1/tinycorp/systems/tinybox_red.json @@ -1,7 +1,7 @@ { "submitter": "tinycorp", "division": "closed", - "status": "available", + "status": "Available on-premise", "system_name": "tinybox red", "number_of_nodes": "1", "host_processors_per_node": "1", @@ -28,10 +28,10 @@ "accelerator_interconnect_topology": "", "cooling": "air", "hw_notes": "", - "framework": "tinygrad, commit 0e8aa0e2886bf9a2d3ce093bce87305e182e6d4a", + "framework": "tinygrad, commit b5546912e24e0a864b35924da4efa5d71cfe368b", "other_software_stack": { "python": "3.10.12", - "ROCm": "6.1" + "ROCm": "6.1.3" }, "operating_system": "Ubuntu 22.04.4", "sw_notes": "" diff --git a/examples/mlperf/training_submission_v5.0/tinycorp/benchmarks/bert/implementations/tinybox_green/README.md 
b/examples/mlperf/training_submission_v5.0/tinycorp/benchmarks/bert/implementations/tinybox_green/README.md new file mode 100644 index 0000000000000..e79373658adec --- /dev/null +++ b/examples/mlperf/training_submission_v5.0/tinycorp/benchmarks/bert/implementations/tinybox_green/README.md @@ -0,0 +1,73 @@ +# 1. Problem + +This problem uses BERT for NLP. + +## Requirements + +Install tinygrad and mlperf-logging from master. +``` +git clone https://github.com/tinygrad/tinygrad.git +python3 -m pip install -e ".[mlperf]" +``` +Also install tqdm and tensorflow. +``` +pip install tqdm tensorflow +``` + +### tinybox_green +Install the p2p driver per [README](https://github.com/tinygrad/open-gpu-kernel-modules/blob/550.54.15-p2p/README.md) +This is the default on production tinybox green. + +### tinybox_red +Disable cwsr + increase mes timeout. +Install the custom amdgpu driver per [README](https://github.com/nimlgen/amdgpu_ubuntu_22_04/blob/v6.1.3/readme.md) + +# 2. Directions + +## Steps to download and verify data + +### 1. Download raw data + +``` +BASEDIR="/raid/datasets/wiki" WIKI_TRAIN=1 VERIFY_CHECKSUM=1 python3 extra/datasets/wikipedia_download.py +``` + +### 2. Preprocess train and validation data + +Note: The number of threads used for preprocessing is limited by available memory. With 128GB of RAM, a maximum of 16 threads is recommended. 
+ +#### Training: +``` +BASEDIR="/raid/datasets/wiki" NUM_WORKERS=16 python3 extra/datasets/wikipedia.py pre-train all +``` + +Generating a specific topic (Between 0 and 499) +``` +BASEDIR="/raid/datasets/wiki" python3 extra/datasets/wikipedia.py pre-train 42 +``` + +#### Validation: +``` +BASEDIR="/raid/datasets/wiki" python3 extra/datasets/wikipedia.py pre-eval +``` +## Running + +### tinybox_green + +#### Steps to run benchmark +``` +examples/mlperf/training_submission_v5.0/tinycorp/benchmarks/bert/implementations/tinybox_green/run_and_time.sh +``` + +### tinybox_red + +#### One time setup + +``` +examples/mlperf/training_submission_v5.0/tinycorp/benchmarks/bert/implementations/tinybox_red/setup.sh +``` + +#### Steps to run benchmark +``` +examples/mlperf/training_submission_v5.0/tinycorp/benchmarks/bert/implementations/tinybox_red/run_and_time.sh +``` \ No newline at end of file diff --git a/examples/mlperf/training_submission_v5.0/tinycorp/benchmarks/bert/implementations/tinybox_green/dev_beam.sh b/examples/mlperf/training_submission_v5.0/tinycorp/benchmarks/bert/implementations/tinybox_green/dev_beam.sh new file mode 100755 index 0000000000000..8814f62919fea --- /dev/null +++ b/examples/mlperf/training_submission_v5.0/tinycorp/benchmarks/bert/implementations/tinybox_green/dev_beam.sh @@ -0,0 +1,13 @@ +#!/bin/bash + +export PYTHONPATH="." 
+export MODEL="bert" +export DEFAULT_FLOAT="HALF" GPUS=6 BS=66 EVAL_BS=6 + +export BEAM=4 BEAM_UOPS_MAX=2000 BEAM_UPCAST_MAX=64 BEAM_LOCAL_MAX=512 +export IGNORE_JIT_FIRST_BEAM=1 +export BASEDIR="/raid/datasets/wiki" + +export BENCHMARK=10 DEBUG=2 + +python3 examples/mlperf/model_train.py diff --git a/examples/mlperf/training_submission_v5.0/tinycorp/benchmarks/bert/implementations/tinybox_green/dev_run.sh b/examples/mlperf/training_submission_v5.0/tinycorp/benchmarks/bert/implementations/tinybox_green/dev_run.sh new file mode 100755 index 0000000000000..8df5f5daddc57 --- /dev/null +++ b/examples/mlperf/training_submission_v5.0/tinycorp/benchmarks/bert/implementations/tinybox_green/dev_run.sh @@ -0,0 +1,13 @@ +#!/bin/bash + +export PYTHONPATH="." +export MODEL="bert" +export DEFAULT_FLOAT="HALF" GPUS=6 BS=66 EVAL_BS=6 + +export BEAM=4 BEAM_UOPS_MAX=2000 BEAM_UPCAST_MAX=64 BEAM_LOCAL_MAX=512 +export IGNORE_JIT_FIRST_BEAM=1 +export BASEDIR="/raid/datasets/wiki" + +export WANDB=1 PARALLEL=0 + +RUNMLPERF=1 python3 examples/mlperf/model_train.py \ No newline at end of file diff --git a/examples/mlperf/training_submission_v5.0/tinycorp/benchmarks/bert/implementations/tinybox_green/run_and_time.sh b/examples/mlperf/training_submission_v5.0/tinycorp/benchmarks/bert/implementations/tinybox_green/run_and_time.sh new file mode 100755 index 0000000000000..8adeb7a87729c --- /dev/null +++ b/examples/mlperf/training_submission_v5.0/tinycorp/benchmarks/bert/implementations/tinybox_green/run_and_time.sh @@ -0,0 +1,23 @@ +#!/bin/bash + +export PYTHONPATH="." 
+export MODEL="bert" +export SUBMISSION_PLATFORM="tinybox_green" +export DEFAULT_FLOAT="HALF" GPUS=6 BS=66 EVAL_BS=6 + +export BEAM=4 BEAM_UOPS_MAX=2000 BEAM_UPCAST_MAX=64 BEAM_LOCAL_MAX=512 +export IGNORE_JIT_FIRST_BEAM=1 +export BASEDIR="/raid/datasets/wiki" + +# pip install -e ".[mlperf]" +export LOGMLPERF=1 + +export SEED=$RANDOM +DATETIME=$(date "+%m%d%H%M") +LOGFILE="bert_green_${DATETIME}_${SEED}.log" + +# init +BENCHMARK=10 INITMLPERF=1 python3 examples/mlperf/model_train.py | tee $LOGFILE + +# run +PARALLEL=0 RUNMLPERF=1 python3 examples/mlperf/model_train.py | tee -a $LOGFILE diff --git a/examples/mlperf/training_submission_v5.0/tinycorp/benchmarks/bert/implementations/tinybox_red/README.md b/examples/mlperf/training_submission_v5.0/tinycorp/benchmarks/bert/implementations/tinybox_red/README.md new file mode 100644 index 0000000000000..e79373658adec --- /dev/null +++ b/examples/mlperf/training_submission_v5.0/tinycorp/benchmarks/bert/implementations/tinybox_red/README.md @@ -0,0 +1,73 @@ +# 1. Problem + +This problem uses BERT for NLP. + +## Requirements + +Install tinygrad and mlperf-logging from master. +``` +git clone https://github.com/tinygrad/tinygrad.git +python3 -m pip install -e ".[mlperf]" +``` +Also install tqdm and tensorflow. +``` +pip install tqdm tensorflow +``` + +### tinybox_green +Install the p2p driver per [README](https://github.com/tinygrad/open-gpu-kernel-modules/blob/550.54.15-p2p/README.md) +This is the default on production tinybox green. + +### tinybox_red +Disable cwsr + increase mes timeout. +Install the custom amdgpu driver per [README](https://github.com/nimlgen/amdgpu_ubuntu_22_04/blob/v6.1.3/readme.md) + +# 2. Directions + +## Steps to download and verify data + +### 1. Download raw data + +``` +BASEDIR="/raid/datasets/wiki" WIKI_TRAIN=1 VERIFY_CHECKSUM=1 python3 extra/datasets/wikipedia_download.py +``` + +### 2. 
Preprocess train and validation data + +Note: The number of threads used for preprocessing is limited by available memory. With 128GB of RAM, a maximum of 16 threads is recommended. + +#### Training: +``` +BASEDIR="/raid/datasets/wiki" NUM_WORKERS=16 python3 extra/datasets/wikipedia.py pre-train all +``` + +Generating a specific topic (Between 0 and 499) +``` +BASEDIR="/raid/datasets/wiki" python3 extra/datasets/wikipedia.py pre-train 42 +``` + +#### Validation: +``` +BASEDIR="/raid/datasets/wiki" python3 extra/datasets/wikipedia.py pre-eval +``` +## Running + +### tinybox_green + +#### Steps to run benchmark +``` +examples/mlperf/training_submission_v5.0/tinycorp/benchmarks/bert/implementations/tinybox_green/run_and_time.sh +``` + +### tinybox_red + +#### One time setup + +``` +examples/mlperf/training_submission_v5.0/tinycorp/benchmarks/bert/implementations/tinybox_red/setup.sh +``` + +#### Steps to run benchmark +``` +examples/mlperf/training_submission_v5.0/tinycorp/benchmarks/bert/implementations/tinybox_red/run_and_time.sh +``` \ No newline at end of file diff --git a/examples/mlperf/training_submission_v5.0/tinycorp/benchmarks/bert/implementations/tinybox_red/dev_beam.sh b/examples/mlperf/training_submission_v5.0/tinycorp/benchmarks/bert/implementations/tinybox_red/dev_beam.sh new file mode 100755 index 0000000000000..649d3942efcfb --- /dev/null +++ b/examples/mlperf/training_submission_v5.0/tinycorp/benchmarks/bert/implementations/tinybox_red/dev_beam.sh @@ -0,0 +1,13 @@ +#!/bin/bash + +export PYTHONPATH="." 
+export MODEL="bert" +export DEFAULT_FLOAT="HALF" GPUS=6 BS=66 EVAL_BS=6 + +export BEAM=3 +export IGNORE_JIT_FIRST_BEAM=1 +export BASEDIR="/raid/datasets/wiki" + +export BENCHMARK=10 DEBUG=2 + +python3 examples/mlperf/model_train.py diff --git a/examples/mlperf/training_submission_v5.0/tinycorp/benchmarks/bert/implementations/tinybox_red/dev_run.sh b/examples/mlperf/training_submission_v5.0/tinycorp/benchmarks/bert/implementations/tinybox_red/dev_run.sh new file mode 100755 index 0000000000000..9a6ee03432a6e --- /dev/null +++ b/examples/mlperf/training_submission_v5.0/tinycorp/benchmarks/bert/implementations/tinybox_red/dev_run.sh @@ -0,0 +1,13 @@ +#!/bin/bash + +export PYTHONPATH="." +export MODEL="bert" +export DEFAULT_FLOAT="HALF" GPUS=6 BS=66 EVAL_BS=6 + +export BEAM=3 +export IGNORE_JIT_FIRST_BEAM=1 +export BASEDIR="/raid/datasets/wiki" + +export WANDB=1 PARALLEL=0 + +RUNMLPERF=1 python3 examples/mlperf/model_train.py \ No newline at end of file diff --git a/examples/mlperf/training_submission_v5.0/tinycorp/benchmarks/bert/implementations/tinybox_red/run_and_time.sh b/examples/mlperf/training_submission_v5.0/tinycorp/benchmarks/bert/implementations/tinybox_red/run_and_time.sh new file mode 100755 index 0000000000000..9ca38adbcc588 --- /dev/null +++ b/examples/mlperf/training_submission_v5.0/tinycorp/benchmarks/bert/implementations/tinybox_red/run_and_time.sh @@ -0,0 +1,23 @@ +#!/bin/bash + +export PYTHONPATH="." 
+export MODEL="bert" +export SUBMISSION_PLATFORM="tinybox_red" +export DEFAULT_FLOAT="HALF" GPUS=6 BS=66 EVAL_BS=6 + +export BEAM=3 +export IGNORE_JIT_FIRST_BEAM=1 +export BASEDIR="/raid/datasets/wiki" + +# pip install -e ".[mlperf]" +export LOGMLPERF=1 + +export SEED=$RANDOM +DATETIME=$(date "+%m%d%H%M") +LOGFILE="bert_red_${DATETIME}_${SEED}.log" + +# init +BENCHMARK=10 INITMLPERF=1 python3 examples/mlperf/model_train.py | tee $LOGFILE + +# run +PARALLEL=0 RUNMLPERF=1 python3 examples/mlperf/model_train.py | tee -a $LOGFILE diff --git a/examples/mlperf/training_submission_v5.0/tinycorp/benchmarks/bert/implementations/tinybox_red/setup.sh b/examples/mlperf/training_submission_v5.0/tinycorp/benchmarks/bert/implementations/tinybox_red/setup.sh new file mode 100755 index 0000000000000..3d687cdb98b39 --- /dev/null +++ b/examples/mlperf/training_submission_v5.0/tinycorp/benchmarks/bert/implementations/tinybox_red/setup.sh @@ -0,0 +1,8 @@ +#!/bin/bash + +rocm-smi --setprofile compute +rocm-smi --setmclk 3 +rocm-smi --setperflevel high + +# power cap to 350W +# echo "350000000" | sudo tee /sys/class/drm/card{1..6}/device/hwmon/hwmon*/power1_cap diff --git a/examples/mlperf/training_submission_v5.0/tinycorp/benchmarks/resnet/implementations/tinybox_green/README.md b/examples/mlperf/training_submission_v5.0/tinycorp/benchmarks/resnet/implementations/tinybox_green/README.md new file mode 100644 index 0000000000000..d380cec5b55e1 --- /dev/null +++ b/examples/mlperf/training_submission_v5.0/tinycorp/benchmarks/resnet/implementations/tinybox_green/README.md @@ -0,0 +1,50 @@ +# 1. Problem + +This problem uses the ResNet-50 CNN to do image classification. + +## Requirements + +Install tinygrad and mlperf-logging from master. 
+``` +git clone https://github.com/tinygrad/tinygrad.git +python3 -m pip install -e ".[mlperf]" +``` + +### tinybox_green +Install the p2p driver per [README](https://github.com/tinygrad/open-gpu-kernel-modules/blob/550.54.15-p2p/README.md) +This is the default on production tinybox green. + +### tinybox_red +Disable cwsr +This is the default on production tinybox red. +``` +sudo vi /etc/modprobe.d/amdgpu.conf +cat <<EOF > /etc/modprobe.d/amdgpu.conf +options amdgpu cwsr_enable=0 +EOF +sudo update-initramfs -u +sudo reboot + +# validate +sudo cat /sys/module/amdgpu/parameters/cwsr_enable #= 0 +``` + +# 2. Directions + +## Steps to download and verify data + +``` +IMGNET_TRAIN=1 python3 extra/datasets/imagenet_download.py +``` + +## Steps for one time setup + +### tinybox_red +``` +examples/mlperf/training_submission_v5.0/tinycorp/benchmarks/resnet/implementations/tinybox_red/setup.sh +``` + +## Steps to run benchmark +``` +examples/mlperf/training_submission_v5.0/tinycorp/benchmarks/resnet/implementations/tinybox_red/run_and_time.sh +``` diff --git a/examples/mlperf/training_submission_v5.0/tinycorp/benchmarks/resnet/implementations/tinybox_green/dev_beam.sh b/examples/mlperf/training_submission_v5.0/tinycorp/benchmarks/resnet/implementations/tinybox_green/dev_beam.sh new file mode 100755 index 0000000000000..35bf39887e8fc --- /dev/null +++ b/examples/mlperf/training_submission_v5.0/tinycorp/benchmarks/resnet/implementations/tinybox_green/dev_beam.sh @@ -0,0 +1,13 @@ +#!/bin/bash + +export PYTHONPATH="." 
+export MODEL="resnet" +export DEFAULT_FLOAT="HALF" GPUS=6 BS=1536 EVAL_BS=192 + +export LAZYCACHE=0 RESET_STEP=0 + +export TRAIN_BEAM=4 IGNORE_JIT_FIRST_BEAM=1 BEAM_UOPS_MAX=1500 BEAM_UPCAST_MAX=64 BEAM_LOCAL_MAX=1024 BEAM_MIN_PROGRESS=10 BEAM_PADTO=0 + +export BENCHMARK=10 DEBUG=2 + +python3 examples/mlperf/model_train.py diff --git a/examples/mlperf/training_submission_v5.0/tinycorp/benchmarks/resnet/implementations/tinybox_green/dev_run.sh b/examples/mlperf/training_submission_v5.0/tinycorp/benchmarks/resnet/implementations/tinybox_green/dev_run.sh new file mode 100755 index 0000000000000..a13067da8a0d1 --- /dev/null +++ b/examples/mlperf/training_submission_v5.0/tinycorp/benchmarks/resnet/implementations/tinybox_green/dev_run.sh @@ -0,0 +1,15 @@ +#!/bin/bash + +export PYTHONPATH="." +export MODEL="resnet" +export DEFAULT_FLOAT="HALF" GPUS=6 BS=1536 EVAL_BS=192 + +export LAZYCACHE=0 RESET_STEP=0 + +export TRAIN_BEAM=4 IGNORE_JIT_FIRST_BEAM=1 BEAM_UOPS_MAX=1500 BEAM_UPCAST_MAX=64 BEAM_LOCAL_MAX=1024 BEAM_MIN_PROGRESS=10 BEAM_PADTO=0 + +export EVAL_START_EPOCH=3 EVAL_FREQ=4 + +export WANDB=1 PARALLEL=0 + +python3 examples/mlperf/model_train.py diff --git a/examples/mlperf/training_submission_v5.0/tinycorp/benchmarks/resnet/implementations/tinybox_green/run_and_time.sh b/examples/mlperf/training_submission_v5.0/tinycorp/benchmarks/resnet/implementations/tinybox_green/run_and_time.sh new file mode 100755 index 0000000000000..38fd9370bd782 --- /dev/null +++ b/examples/mlperf/training_submission_v5.0/tinycorp/benchmarks/resnet/implementations/tinybox_green/run_and_time.sh @@ -0,0 +1,23 @@ +#!/bin/bash + +export PYTHONPATH="." 
+export MODEL="resnet" +export SUBMISSION_PLATFORM="tinybox_green" +export DEFAULT_FLOAT="HALF" GPUS=6 BS=1536 EVAL_BS=192 + +export LAZYCACHE=0 RESET_STEP=0 + +export TRAIN_BEAM=4 IGNORE_JIT_FIRST_BEAM=1 BEAM_UOPS_MAX=1500 BEAM_UPCAST_MAX=64 BEAM_LOCAL_MAX=1024 BEAM_MIN_PROGRESS=10 BEAM_PADTO=0 + +# pip install -e ".[mlperf]" +export LOGMLPERF=1 + +export SEED=$RANDOM +DATETIME=$(date "+%m%d%H%M") +LOGFILE="resnet_green_${DATETIME}_${SEED}.log" + +# init +BENCHMARK=10 INITMLPERF=1 python3 examples/mlperf/model_train.py | tee $LOGFILE + +# run +PARALLEL=0 RUNMLPERF=1 EVAL_START_EPOCH=3 EVAL_FREQ=4 python3 examples/mlperf/model_train.py | tee -a $LOGFILE diff --git a/examples/mlperf/training_submission_v5.0/tinycorp/benchmarks/resnet/implementations/tinybox_red/README.md b/examples/mlperf/training_submission_v5.0/tinycorp/benchmarks/resnet/implementations/tinybox_red/README.md new file mode 100644 index 0000000000000..d380cec5b55e1 --- /dev/null +++ b/examples/mlperf/training_submission_v5.0/tinycorp/benchmarks/resnet/implementations/tinybox_red/README.md @@ -0,0 +1,50 @@ +# 1. Problem + +This problem uses the ResNet-50 CNN to do image classification. + +## Requirements + +Install tinygrad and mlperf-logging from master. +``` +git clone https://github.com/tinygrad/tinygrad.git +python3 -m pip install -e ".[mlperf]" +``` + +### tinybox_green +Install the p2p driver per [README](https://github.com/tinygrad/open-gpu-kernel-modules/blob/550.54.15-p2p/README.md) +This is the default on production tinybox green. + +### tinybox_red +Disable cwsr +This is the default on production tinybox red. +``` +sudo vi /etc/modprobe.d/amdgpu.conf +cat <<EOF > /etc/modprobe.d/amdgpu.conf +options amdgpu cwsr_enable=0 +EOF +sudo update-initramfs -u +sudo reboot + +# validate +sudo cat /sys/module/amdgpu/parameters/cwsr_enable #= 0 +``` + +# 2. 
Directions + +## Steps to download and verify data + +``` +IMGNET_TRAIN=1 python3 extra/datasets/imagenet_download.py +``` + +## Steps for one time setup + +### tinybox_red +``` +examples/mlperf/training_submission_v5.0/tinycorp/benchmarks/resnet/implementations/tinybox_red/setup.sh +``` + +## Steps to run benchmark +``` +examples/mlperf/training_submission_v5.0/tinycorp/benchmarks/resnet/implementations/tinybox_red/run_and_time.sh +``` diff --git a/examples/mlperf/training_submission_v5.0/tinycorp/benchmarks/resnet/implementations/tinybox_red/dev_beam.sh b/examples/mlperf/training_submission_v5.0/tinycorp/benchmarks/resnet/implementations/tinybox_red/dev_beam.sh new file mode 100755 index 0000000000000..581781b8645c1 --- /dev/null +++ b/examples/mlperf/training_submission_v5.0/tinycorp/benchmarks/resnet/implementations/tinybox_red/dev_beam.sh @@ -0,0 +1,13 @@ +#!/bin/bash + +export PYTHONPATH="." 
+export MODEL="resnet" +export DEFAULT_FLOAT="HALF" GPUS=6 BS=1536 EVAL_BS=192 + +export LAZYCACHE=0 RESET_STEP=0 + +export TRAIN_BEAM=4 IGNORE_JIT_FIRST_BEAM=1 BEAM_UOPS_MAX=2000 BEAM_UPCAST_MAX=96 BEAM_LOCAL_MAX=1024 BEAM_MIN_PROGRESS=5 BEAM_PADTO=0 + +export EVAL_START_EPOCH=3 EVAL_FREQ=4 + +export WANDB=1 PARALLEL=0 + +python3 examples/mlperf/model_train.py diff --git a/examples/mlperf/training_submission_v5.0/tinycorp/benchmarks/resnet/implementations/tinybox_red/run_and_time.sh b/examples/mlperf/training_submission_v5.0/tinycorp/benchmarks/resnet/implementations/tinybox_red/run_and_time.sh new file mode 100755 index 0000000000000..eb173a1930f4c --- /dev/null +++ b/examples/mlperf/training_submission_v5.0/tinycorp/benchmarks/resnet/implementations/tinybox_red/run_and_time.sh @@ -0,0 +1,23 @@ +#!/bin/bash + +export PYTHONPATH="." +export MODEL="resnet" +export SUBMISSION_PLATFORM="tinybox_red" +export DEFAULT_FLOAT="HALF" GPUS=6 BS=1536 EVAL_BS=192 + +export LAZYCACHE=0 RESET_STEP=0 + +export TRAIN_BEAM=4 IGNORE_JIT_FIRST_BEAM=1 BEAM_UOPS_MAX=2000 BEAM_UPCAST_MAX=96 BEAM_LOCAL_MAX=1024 BEAM_MIN_PROGRESS=5 BEAM_PADTO=0 + +# pip install -e ".[mlperf]" +export LOGMLPERF=1 + +export SEED=$RANDOM +DATETIME=$(date "+%m%d%H%M") +LOGFILE="resnet_red_${DATETIME}_${SEED}.log" + +# init +BENCHMARK=10 INITMLPERF=1 python3 examples/mlperf/model_train.py | tee $LOGFILE + +# run +PARALLEL=0 RUNMLPERF=1 EVAL_START_EPOCH=3 EVAL_FREQ=4 python3 examples/mlperf/model_train.py | tee -a $LOGFILE diff --git a/examples/mlperf/training_submission_v5.0/tinycorp/benchmarks/resnet/implementations/tinybox_red/setup.sh b/examples/mlperf/training_submission_v5.0/tinycorp/benchmarks/resnet/implementations/tinybox_red/setup.sh new file mode 100755 index 0000000000000..a9806164f4cd0 --- /dev/null +++ b/examples/mlperf/training_submission_v5.0/tinycorp/benchmarks/resnet/implementations/tinybox_red/setup.sh @@ -0,0 +1,8 @@ +#!/bin/bash + +rocm-smi --setprofile compute +rocm-smi --setmclk 3 
+rocm-smi --setperflevel high + +# power cap to 350W +echo "350000000" | sudo tee /sys/class/drm/card{1..6}/device/hwmon/hwmon*/power1_cap diff --git a/examples/mlperf/training_submission_v5.0/tinycorp/systems/tinybox_green.json b/examples/mlperf/training_submission_v5.0/tinycorp/systems/tinybox_green.json new file mode 100644 index 0000000000000..bb1ebba98c70c --- /dev/null +++ b/examples/mlperf/training_submission_v5.0/tinycorp/systems/tinybox_green.json @@ -0,0 +1,38 @@ +{ + "submitter": "tinycorp", + "division": "closed", + "status": "Available on-premise", + "system_name": "tinybox green", + "number_of_nodes": "1", + "host_processors_per_node": "1", + "host_processor_model_name": "AMD EPYC 7532 32-Core Processor", + "host_processor_core_count": "32", + "host_processor_vcpu_count": "64", + "host_processor_frequency": "", + "host_processor_caches": "", + "host_processor_interconnect": "", + "host_memory_capacity": "128GB", + "host_storage_type": "NVMe SSD", + "host_storage_capacity": "4 TB raid array + 1 TB boot", + "host_networking": "", + "host_networking_topology": "", + "host_memory_configuration": "8x 16GB DDR4", + "accelerators_per_node": "6", + "accelerator_model_name": "NVIDIA GeForce RTX 4090", + "accelerator_host_interconnect": "PCIe 4.0 x16", + "accelerator_frequency": "", + "accelerator_on-chip_memories": "", + "accelerator_memory_configuration": "GDDR6X", + "accelerator_memory_capacity": "24GB", + "accelerator_interconnect": "", + "accelerator_interconnect_topology": "", + "cooling": "air", + "hw_notes": "", + "framework": "tinygrad, commit b5546912e24e0a864b35924da4efa5d71cfe368b", + "other_software_stack": { + "python": "3.10.12", + "CUDA": "12.4" + }, + "operating_system": "Ubuntu 22.04.4", + "sw_notes": "" +} diff --git a/examples/mlperf/training_submission_v5.0/tinycorp/systems/tinybox_red.json b/examples/mlperf/training_submission_v5.0/tinycorp/systems/tinybox_red.json new file mode 100644 index 0000000000000..6db104a7dbfce --- /dev/null +++ 
b/examples/mlperf/training_submission_v5.0/tinycorp/systems/tinybox_red.json @@ -0,0 +1,38 @@ +{ + "submitter": "tinycorp", + "division": "closed", + "status": "Available on-premise", + "system_name": "tinybox red", + "number_of_nodes": "1", + "host_processors_per_node": "1", + "host_processor_model_name": "AMD EPYC 7532 32-Core Processor", + "host_processor_core_count": "32", + "host_processor_vcpu_count": "64", + "host_processor_frequency": "", + "host_processor_caches": "", + "host_processor_interconnect": "", + "host_memory_capacity": "128GB", + "host_storage_type": "NVMe SSD", + "host_storage_capacity": "4 TB raid array + 1 TB boot", + "host_networking": "", + "host_networking_topology": "", + "host_memory_configuration": "8x 16GB DDR4", + "accelerators_per_node": "6", + "accelerator_model_name": "AMD Radeon RX 7900 XTX", + "accelerator_host_interconnect": "PCIe 4.0 x16", + "accelerator_frequency": "", + "accelerator_on-chip_memories": "", + "accelerator_memory_configuration": "GDDR6", + "accelerator_memory_capacity": "24GB", + "accelerator_interconnect": "", + "accelerator_interconnect_topology": "", + "cooling": "air", + "hw_notes": "", + "framework": "tinygrad, commit b5546912e24e0a864b35924da4efa5d71cfe368b", + "other_software_stack": { + "python": "3.10.12", + "ROCm": "6.1.3" + }, + "operating_system": "Ubuntu 22.04.4", + "sw_notes": "" +}