update mlperf systems and copy 4.1 to 5.0 (tinygrad#7004)

chenyuxyz · Oct 11, 2024 · 36056e0 · 36056e0
1 parent 8831c69
commit 36056e0
Show file tree

Hide file tree

Showing 22 changed files with 543 additions and 5 deletions.
diff --git a/examples/mlperf/training_submission_v4.1/tinycorp/systems/tinybox_green.json b/examples/mlperf/training_submission_v4.1/tinycorp/systems/tinybox_green.json
@@ -1,7 +1,7 @@
 {
   "submitter": "tinycorp",
   "division": "closed",
-  "status": "available",
+  "status": "Available on-premise",
   "system_name": "tinybox green",
   "number_of_nodes": "1",
   "host_processors_per_node": "1",
@@ -28,7 +28,7 @@
   "accelerator_interconnect_topology": "",
   "cooling": "air",
   "hw_notes": "",
-  "framework": "tinygrad, commit 0e8aa0e2886bf9a2d3ce093bce87305e182e6d4a",
+  "framework": "tinygrad, commit b5546912e24e0a864b35924da4efa5d71cfe368b",
   "other_software_stack": {
     "python": "3.10.12",
     "CUDA": "12.4"

diff --git a/examples/mlperf/training_submission_v4.1/tinycorp/systems/tinybox_red.json b/examples/mlperf/training_submission_v4.1/tinycorp/systems/tinybox_red.json
@@ -1,7 +1,7 @@
 {
   "submitter": "tinycorp",
   "division": "closed",
-  "status": "available",
+  "status": "Available on-premise",
   "system_name": "tinybox red",
   "number_of_nodes": "1",
   "host_processors_per_node": "1",
@@ -28,10 +28,10 @@
   "accelerator_interconnect_topology": "",
   "cooling": "air",
   "hw_notes": "",
-  "framework": "tinygrad, commit 0e8aa0e2886bf9a2d3ce093bce87305e182e6d4a",
+  "framework": "tinygrad, commit b5546912e24e0a864b35924da4efa5d71cfe368b",
   "other_software_stack": {
     "python": "3.10.12",
-    "ROCm": "6.1"
+    "ROCm": "6.1.3"
   },
   "operating_system": "Ubuntu 22.04.4",
   "sw_notes": ""

diff --git a/...ubmission_v5.0/tinycorp/benchmarks/bert/implementations/tinybox_green/README.md b/...ubmission_v5.0/tinycorp/benchmarks/bert/implementations/tinybox_green/README.md
@@ -0,0 +1,73 @@
+# 1. Problem
+
+This problem uses BERT for NLP.
+
+## Requirements
+
+Install tinygrad and mlperf-logging from master.
+```
+git clone https://github.com/tinygrad/tinygrad.git
+python3 -m pip install -e ".[mlperf]"
+```
+Also install tqdm and tensorflow.
+```
+pip install tqdm tensorflow
+```
+
+### tinybox_green
+Install the p2p driver per [README](https://github.com/tinygrad/open-gpu-kernel-modules/blob/550.54.15-p2p/README.md).
+This is the default on production tinybox green.
+
+### tinybox_red
+Disable cwsr + increase mes timeout.
+Install the custom amdgpu driver per [README](https://github.com/nimlgen/amdgpu_ubuntu_22_04/blob/v6.1.3/readme.md)
+
+# 2. Directions
+
+## Steps to download and verify data
+
+### 1. Download raw data
+
+```
+BASEDIR="/raid/datasets/wiki" WIKI_TRAIN=1 VERIFY_CHECKSUM=1 python3 extra/datasets/wikipedia_download.py
+```
+
+### 2. Preprocess train and validation data
+
+Note: The number of threads used for preprocessing is limited by available memory. With 128GB of RAM, a maximum of 16 threads is recommended. 
+
+#### Training:
+```
+BASEDIR="/raid/datasets/wiki" NUM_WORKERS=16 python3 extra/datasets/wikipedia.py pre-train all
+```
+
+To generate a single specific topic (between 0 and 499):
+```
+BASEDIR="/raid/datasets/wiki" python3 extra/datasets/wikipedia.py pre-train 42
+```
+
+#### Validation:
+```
+BASEDIR="/raid/datasets/wiki" python3 extra/datasets/wikipedia.py pre-eval
+```
+## Running
+
+### tinybox_green
+
+#### Steps to run benchmark
+```
+examples/mlperf/training_submission_v5.0/tinycorp/benchmarks/bert/implementations/tinybox_green/run_and_time.sh
+```
+
+### tinybox_red
+
+#### One time setup
+
+```
+examples/mlperf/training_submission_v5.0/tinycorp/benchmarks/bert/implementations/tinybox_red/setup.sh
+```
+
+#### Steps to run benchmark
+```
+examples/mlperf/training_submission_v5.0/tinycorp/benchmarks/bert/implementations/tinybox_red/run_and_time.sh
+```
diff --git a/...aining_submission_v5.0/tinycorp/benchmarks/bert/implementations/tinybox_green/dev_beam.sh b/...aining_submission_v5.0/tinycorp/benchmarks/bert/implementations/tinybox_green/dev_beam.sh
@@ -0,0 +1,13 @@
+#!/bin/bash
+# BERT on tinybox green, dev/beam variant: export the settings below, then start model_train.py.
+export PYTHONPATH=. MODEL=bert  # PYTHONPATH and the script path are relative: launch from the repository root
+export DEFAULT_FLOAT=HALF
+export GPUS=6 BS=66 EVAL_BS=6
+
+export BEAM=4 IGNORE_JIT_FIRST_BEAM=1
+export BEAM_UOPS_MAX=2000 BEAM_UPCAST_MAX=64 BEAM_LOCAL_MAX=512
+export BASEDIR=/raid/datasets/wiki
+
+export DEBUG=2 BENCHMARK=10  # NOTE(review): presumably a short, verbose benchmark-only run - confirm in model_train.py
+
+python3 examples/mlperf/model_train.py
diff --git a/...raining_submission_v5.0/tinycorp/benchmarks/bert/implementations/tinybox_green/dev_run.sh b/...raining_submission_v5.0/tinycorp/benchmarks/bert/implementations/tinybox_green/dev_run.sh
@@ -0,0 +1,13 @@
+#!/bin/bash
+# BERT on tinybox green, dev run: export the settings below, then start model_train.py with RUNMLPERF=1.
+export PYTHONPATH=. MODEL=bert  # PYTHONPATH and the script path are relative: launch from the repository root
+export DEFAULT_FLOAT=HALF
+export GPUS=6 BS=66 EVAL_BS=6
+
+export BEAM=4 IGNORE_JIT_FIRST_BEAM=1
+export BEAM_UOPS_MAX=2000 BEAM_UPCAST_MAX=64 BEAM_LOCAL_MAX=512
+export BASEDIR=/raid/datasets/wiki
+
+export PARALLEL=0 WANDB=1
+
+RUNMLPERF=1 python3 examples/mlperf/model_train.py  # RUNMLPERF is set for this command only, not exported
diff --git a/...ng_submission_v5.0/tinycorp/benchmarks/bert/implementations/tinybox_green/run_and_time.sh b/...ng_submission_v5.0/tinycorp/benchmarks/bert/implementations/tinybox_green/run_and_time.sh
@@ -0,0 +1,23 @@
+#!/bin/bash
+
+export PYTHONPATH="."
+export MODEL="bert"
+export SUBMISSION_PLATFORM="tinybox_green"
+export DEFAULT_FLOAT="HALF" GPUS=6 BS=66 EVAL_BS=6
+
+export BEAM=4 BEAM_UOPS_MAX=2000 BEAM_UPCAST_MAX=64 BEAM_LOCAL_MAX=512
+export IGNORE_JIT_FIRST_BEAM=1
+export BASEDIR="/raid/datasets/wiki"
+
+# pip install -e ".[mlperf]"
+export LOGMLPERF=1
+
+export SEED=$RANDOM
+DATETIME=$(date "+%m%d%H%M")
+LOGFILE="bert_green_${DATETIME}_${SEED}.log"
+
+# init
+BENCHMARK=10 INITMLPERF=1 python3 examples/mlperf/model_train.py | tee $LOGFILE
+
+# run
+PARALLEL=0 RUNMLPERF=1 python3 examples/mlperf/model_train.py | tee -a $LOGFILE
diff --git a/..._submission_v5.0/tinycorp/benchmarks/bert/implementations/tinybox_red/README.md b/..._submission_v5.0/tinycorp/benchmarks/bert/implementations/tinybox_red/README.md
@@ -0,0 +1,73 @@
+# 1. Problem
+
+This problem uses BERT for NLP.
+
+## Requirements
+
+Install tinygrad and mlperf-logging from master.
+```
+git clone https://github.com/tinygrad/tinygrad.git
+python3 -m pip install -e ".[mlperf]"
+```
+Also install tqdm and tensorflow.
+```
+pip install tqdm tensorflow
+```
+
+### tinybox_green
+Install the p2p driver per [README](https://github.com/tinygrad/open-gpu-kernel-modules/blob/550.54.15-p2p/README.md).
+This is the default on production tinybox green.
+
+### tinybox_red
+Disable cwsr + increase mes timeout.
+Install the custom amdgpu driver per [README](https://github.com/nimlgen/amdgpu_ubuntu_22_04/blob/v6.1.3/readme.md)
+
+# 2. Directions
+
+## Steps to download and verify data
+
+### 1. Download raw data
+
+```
+BASEDIR="/raid/datasets/wiki" WIKI_TRAIN=1 VERIFY_CHECKSUM=1 python3 extra/datasets/wikipedia_download.py
+```
+
+### 2. Preprocess train and validation data
+
+Note: The number of threads used for preprocessing is limited by available memory. With 128GB of RAM, a maximum of 16 threads is recommended. 
+
+#### Training:
+```
+BASEDIR="/raid/datasets/wiki" NUM_WORKERS=16 python3 extra/datasets/wikipedia.py pre-train all
+```
+
+To generate a single specific topic (between 0 and 499):
+```
+BASEDIR="/raid/datasets/wiki" python3 extra/datasets/wikipedia.py pre-train 42
+```
+
+#### Validation:
+```
+BASEDIR="/raid/datasets/wiki" python3 extra/datasets/wikipedia.py pre-eval
+```
+## Running
+
+### tinybox_green
+
+#### Steps to run benchmark
+```
+examples/mlperf/training_submission_v5.0/tinycorp/benchmarks/bert/implementations/tinybox_green/run_and_time.sh
+```
+
+### tinybox_red
+
+#### One time setup
+
+```
+examples/mlperf/training_submission_v5.0/tinycorp/benchmarks/bert/implementations/tinybox_red/setup.sh
+```
+
+#### Steps to run benchmark
+```
+examples/mlperf/training_submission_v5.0/tinycorp/benchmarks/bert/implementations/tinybox_red/run_and_time.sh
+```
diff --git a/...training_submission_v5.0/tinycorp/benchmarks/bert/implementations/tinybox_red/dev_beam.sh b/...training_submission_v5.0/tinycorp/benchmarks/bert/implementations/tinybox_red/dev_beam.sh
@@ -0,0 +1,13 @@
+#!/bin/bash
+
+export PYTHONPATH="."
+export MODEL="bert"
+export DEFAULT_FLOAT="HALF" GPUS=6 BS=66 EVAL_BS=6
+
+export BEAM=3
+export IGNORE_JIT_FIRST_BEAM=1
+export BASEDIR="/raid/datasets/wiki"
+
+export BENCHMARK=10 DEBUG=2
+
+python3 examples/mlperf/model_train.py
diff --git a/.../training_submission_v5.0/tinycorp/benchmarks/bert/implementations/tinybox_red/dev_run.sh b/.../training_submission_v5.0/tinycorp/benchmarks/bert/implementations/tinybox_red/dev_run.sh
@@ -0,0 +1,13 @@
+#!/bin/bash
+# BERT on tinybox red, dev run: export the settings below, then start model_train.py with RUNMLPERF=1.
+export PYTHONPATH=. MODEL=bert  # PYTHONPATH and the script path are relative: launch from the repository root
+export DEFAULT_FLOAT=HALF
+export GPUS=6 BS=66 EVAL_BS=6
+
+export BEAM=3 IGNORE_JIT_FIRST_BEAM=1
+export BASEDIR=/raid/datasets/wiki
+
+export PARALLEL=0 WANDB=1
+
+# RUNMLPERF is set for this command only, not exported.
+RUNMLPERF=1 python3 examples/mlperf/model_train.py
diff --git a/...ning_submission_v5.0/tinycorp/benchmarks/bert/implementations/tinybox_red/run_and_time.sh b/...ning_submission_v5.0/tinycorp/benchmarks/bert/implementations/tinybox_red/run_and_time.sh
@@ -0,0 +1,23 @@
+#!/bin/bash
+
+export PYTHONPATH="."
+export MODEL="bert"
+export SUBMISSION_PLATFORM="tinybox_red"
+export DEFAULT_FLOAT="HALF" GPUS=6 BS=66 EVAL_BS=6
+
+export BEAM=3
+export IGNORE_JIT_FIRST_BEAM=1
+export BASEDIR="/raid/datasets/wiki"
+
+# pip install -e ".[mlperf]"
+export LOGMLPERF=1
+
+export SEED=$RANDOM
+DATETIME=$(date "+%m%d%H%M")
+LOGFILE="bert_red_${DATETIME}_${SEED}.log"
+
+# init
+BENCHMARK=10 INITMLPERF=1 python3 examples/mlperf/model_train.py | tee $LOGFILE
+
+# run
+PARALLEL=0 RUNMLPERF=1 python3 examples/mlperf/model_train.py | tee -a $LOGFILE
diff --git a/...rf/training_submission_v5.0/tinycorp/benchmarks/bert/implementations/tinybox_red/setup.sh b/...rf/training_submission_v5.0/tinycorp/benchmarks/bert/implementations/tinybox_red/setup.sh
@@ -0,0 +1,8 @@
+#!/bin/bash
+
+rocm-smi --setprofile compute
+rocm-smi --setmclk 3
+rocm-smi --setperflevel high
+
+# power cap to 350W
+# echo "350000000" | sudo tee /sys/class/drm/card{1..6}/device/hwmon/hwmon*/power1_cap
diff --git a/...mission_v5.0/tinycorp/benchmarks/resnet/implementations/tinybox_green/README.md b/...mission_v5.0/tinycorp/benchmarks/resnet/implementations/tinybox_green/README.md
@@ -0,0 +1,50 @@
+# 1. Problem
+
+This problem uses the ResNet-50 CNN to do image classification.
+
+## Requirements
+
+Install tinygrad and mlperf-logging from master.
+```
+git clone https://github.com/tinygrad/tinygrad.git
+python3 -m pip install -e ".[mlperf]"
+```
+
+### tinybox_green
+Install the p2p driver per [README](https://github.com/tinygrad/open-gpu-kernel-modules/blob/550.54.15-p2p/README.md).
+This is the default on production tinybox green.
+
+### tinybox_red
+Disable cwsr.
+This is the default on production tinybox red.
+```
+sudo vi /etc/modprobe.d/amdgpu.conf
+sudo tee /etc/modprobe.d/amdgpu.conf <<EOF
+options amdgpu cwsr_enable=0
+EOF
+sudo update-initramfs -u
+sudo reboot
+
+# validate
+sudo cat /sys/module/amdgpu/parameters/cwsr_enable #= 0
+```
+
+# 2. Directions
+
+## Steps to download and verify data
+
+```
+IMGNET_TRAIN=1 python3 extra/datasets/imagenet_download.py
+```
+
+## Steps for one time setup
+
+### tinybox_red
+```
+examples/mlperf/training_submission_v5.0/tinycorp/benchmarks/resnet/implementations/tinybox_red/setup.sh
+```
+
+## Steps to run benchmark
+```
+examples/mlperf/training_submission_v5.0/tinycorp/benchmarks/resnet/implementations/tinybox_red/run_and_time.sh
+```
diff --git a/...ning_submission_v5.0/tinycorp/benchmarks/resnet/implementations/tinybox_green/dev_beam.sh b/...ning_submission_v5.0/tinycorp/benchmarks/resnet/implementations/tinybox_green/dev_beam.sh
@@ -0,0 +1,13 @@
+#!/bin/bash
+# ResNet on tinybox green, dev/beam variant: export the settings below, then start model_train.py.
+export PYTHONPATH=. MODEL=resnet  # PYTHONPATH and the script path are relative: launch from the repository root
+export DEFAULT_FLOAT=HALF GPUS=6
+export BS=1536 EVAL_BS=192
+
+export RESET_STEP=0 LAZYCACHE=0
+
+export TRAIN_BEAM=4 IGNORE_JIT_FIRST_BEAM=1 BEAM_MIN_PROGRESS=10 BEAM_PADTO=0
+export BEAM_UOPS_MAX=1500 BEAM_UPCAST_MAX=64 BEAM_LOCAL_MAX=1024
+export DEBUG=2 BENCHMARK=10  # NOTE(review): presumably a short, verbose benchmark-only run - confirm in model_train.py
+
+python3 examples/mlperf/model_train.py
diff --git a/...ining_submission_v5.0/tinycorp/benchmarks/resnet/implementations/tinybox_green/dev_run.sh b/...ining_submission_v5.0/tinycorp/benchmarks/resnet/implementations/tinybox_green/dev_run.sh
@@ -0,0 +1,15 @@
+#!/bin/bash
+# ResNet on tinybox green, dev run: export the settings below, then start model_train.py.
+export PYTHONPATH=. MODEL=resnet  # PYTHONPATH and the script path are relative: launch from the repository root
+export DEFAULT_FLOAT=HALF GPUS=6
+export BS=1536 EVAL_BS=192
+
+export RESET_STEP=0 LAZYCACHE=0
+
+export TRAIN_BEAM=4 IGNORE_JIT_FIRST_BEAM=1 BEAM_MIN_PROGRESS=10 BEAM_PADTO=0
+export BEAM_UOPS_MAX=1500 BEAM_UPCAST_MAX=64 BEAM_LOCAL_MAX=1024
+
+export EVAL_START_EPOCH=3 EVAL_FREQ=4
+export PARALLEL=0 WANDB=1
+
+python3 examples/mlperf/model_train.py