# benchmark.yml
name: Benchmarks
on:
  push:
    branches:
      - master
      - update_benchmark
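
# Three benchmark jobs run on self-hosted runners: Mac (Metal), NVIDIA (CUDA), and a tinybox (AMD).
# Each job tees its timing output to text files and uploads them as a "Speed" artifact.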
jobs:
  testmacbenchmark:
    name: Mac Benchmark
    runs-on: [self-hosted, macOS]
    defaults:
      run:
        shell: bash -o pipefail {0}
    if: github.repository_owner == 'tinygrad'
    env:
      PYTHONPATH: .
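    # pipefail makes a step fail when the benchmark command fails, not just the trailing `tee`;
    # PYTHONPATH=. lets the example and test scripts import tinygrad from the checkout.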
    steps:
    - name: Checkout Code
      uses: actions/checkout@v4
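    # The large weights and datasets already live under ~/tinygrad on the runner; symlinking them
    # into the checkout means a benchmark run doesn't have to re-download them.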
    - name: Symlink models and datasets
      run: |
        mkdir -p weights
        ln -s ~/tinygrad/disassemblers/applegpu disassemblers/applegpu
        ln -s ~/tinygrad/weights/sd-v1-4.ckpt weights/sd-v1-4.ckpt
        ln -s ~/tinygrad/weights/bpe_simple_vocab_16e6.txt.gz weights/bpe_simple_vocab_16e6.txt.gz
        ln -s ~/tinygrad/weights/LLaMA weights/LLaMA
        ln -s ~/tinygrad/extra/datasets/cifar-10-python.tar.gz extra/datasets/cifar-10-python.tar.gz
    # TODO: why is this test not reliable?
    #- name: Run Stable Diffusion
    #  run: python3 examples/stable_diffusion.py --seed 0 --noshow --timing | tee sd.txt
    - name: Run model inference benchmark
      run: METAL=1 python3 test/external/external_model_benchmark.py
    - name: Test speed vs torch
      run: BIG=2 MPS=1 python3 test/test_speed_v_torch.py | tee torch_speed.txt
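    # DEBUG=2 makes tinygrad print per-kernel timing, so the GEMM throughput shows up in the log;
    # HALF=1 repeats the same matmul in float16 to exercise the tensor cores.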
    - name: Run Tensor Core GEMM
      run: |
        DEBUG=2 python3 extra/gemm/simple_matmul.py | tee matmul.txt
        DEBUG=2 HALF=1 python3 extra/gemm/simple_matmul.py | tee matmul_half.txt
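    # LLaMA and GPT2 generation each run twice, without (JIT=0) and with (JIT=1) the TinyJit,
    # so jitted and unjitted token times can be compared from the uploaded logs.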
    - name: Run LLaMA
      run: |
        JIT=0 python3 examples/llama.py --gen 1 --prompt "Hello." --count 10 --temperature 0 --timing | tee llama_unjitted.txt
        JIT=1 python3 examples/llama.py --gen 1 --prompt "Hello." --count 10 --temperature 0 --timing | tee llama_jitted.txt
    - name: Run GPT2
      run: |
        JIT=0 python3 examples/gpt2.py --prompt "Hello." --count 10 --temperature 0 --timing | tee gpt2_unjitted.txt
        JIT=1 python3 examples/gpt2.py --prompt "Hello." --count 10 --temperature 0 --timing | tee gpt2_jitted.txt
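    # STEPS=10 keeps hlb_cifar10 to a short training-speed smoke test rather than a full run.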
    - name: Run 10 CIFAR training steps
      run: STEPS=10 python3 examples/hlb_cifar10.py | tee train_cifar.txt
    # TODO: this is flaky too
    # - name: Run 10 CIFAR training steps w winograd
    #   run: WINO=1 STEPS=10 python3 examples/hlb_cifar10.py | tee train_cifar_wino.txt
    - uses: actions/upload-artifact@v4
      with:
        name: Speed (Mac)
        path: |
          onnx_inference_speed.csv
          torch_speed.txt
          train_cifar.txt
          train_cifar_wino.txt
          llama_unjitted.txt
          llama_jitted.txt
          gpt2_unjitted.txt
          gpt2_jitted.txt
          matmul.txt
          matmul_half.txt
          sd.txt

  testnvidiabenchmark:
    name: NVIDIA Benchmark
    runs-on: [self-hosted, Linux, CUDA]
    defaults:
      run:
        shell: bash -o pipefail {0}
    if: github.repository_owner == 'tinygrad'
    env:
      PYTHONPATH: .
    steps:
    - name: Checkout Code
      uses: actions/checkout@v4
    - name: Run model inference benchmark
      run: CUDA=1 python3 test/external/external_model_benchmark.py
    - name: Test speed vs torch
      run: CUDA=1 BIG=2 TORCHCUDA=1 python3 test/test_speed_v_torch.py | tee torch_speed.txt
    - name: Run GPT2
      run: |
        CUDA=1 JIT=0 python3 examples/gpt2.py --prompt "Hello." --count 10 --temperature 0 --timing | tee gpt2_unjitted.txt
        CUDA=1 JIT=1 python3 examples/gpt2.py --prompt "Hello." --count 10 --temperature 0 --timing | tee gpt2_jitted.txt
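    # HALF=1 runs the model in float16; BEAM=2 enables tinygrad's beam search over kernel
    # optimizations, and CACHELEVEL=0 disables the on-disk cache so the search runs fresh each time.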
    - name: Run GPT2 w HALF
      run: CUDA=1 JIT=1 HALF=1 python3 examples/gpt2.py --count 10 --temperature 0 --timing
    - name: Run GPT2 w HALF/BEAM
      run: CUDA=1 JIT=1 HALF=1 BEAM=2 CACHELEVEL=0 python3 examples/gpt2.py --count 10 --temperature 0 --timing | tee gpt2_half_beam.txt
    - uses: actions/upload-artifact@v4
      with:
        name: Speed (NVIDIA)
        path: |
          onnx_inference_speed.csv
          torch_speed.txt
          gpt2_unjitted.txt
          gpt2_jitted.txt
          gpt2_half_beam.txt

  testamdbenchmark:
    name: tinybox Benchmark
    runs-on: [self-hosted, Linux, tinybox]
    defaults:
      run:
        shell: bash -o pipefail {0}
    if: github.repository_owner == 'tinygrad'
    env:
      PYTHONPATH: .
    steps:
    - name: Checkout Code
      uses: actions/checkout@v4
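    # rocm-bandwidth-test reports host-to-device and device-to-device memory bandwidth for the
    # box's AMD GPUs, a quick look at the hardware before the benchmarks run.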
    - name: Show off tinybox
      run: /opt/rocm/bin/rocm-bandwidth-test
    - name: Symlink models and datasets
      run: |
        mkdir -p weights
        ln -s ~/tinygrad/weights/bpe_simple_vocab_16e6.txt.gz weights/bpe_simple_vocab_16e6.txt.gz
        ln -s ~/tinygrad/weights/LLaMA weights/LLaMA
        ln -s ~/tinygrad/extra/datasets/cifar-10-python.tar.gz extra/datasets/cifar-10-python.tar.gz
    - name: Run model inference benchmark
      run: GPU=1 python3 test/external/external_model_benchmark.py
    - name: Test speed vs torch
      run: BIG=2 TORCHCUDA=1 python3 test/test_speed_v_torch.py | tee torch_speed.txt
    - name: Run Tensor Core GEMM
      run: HIP=1 HALF=1 DEBUG=2 python3 extra/gemm/simple_matmul.py | tee matmul.txt
    - name: Run Stable Diffusion
      run: python3 examples/stable_diffusion.py --seed 0 --noshow --timing | tee sd.txt
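    # HIP=1 switches these runs from the default GPU (OpenCL) backend to tinygrad's HIP backend
    # for the AMD cards; as above, JIT=0/JIT=1 gives unjitted and jitted timings.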
    - name: Run LLaMA (with HIP)
      run: |
        HIP=1 JIT=0 python3 examples/llama.py --gen 1 --prompt "Hello." --count 10 --temperature 0 --timing | tee llama_unjitted.txt
        HIP=1 JIT=1 python3 examples/llama.py --gen 1 --prompt "Hello." --count 10 --temperature 0 --timing | tee llama_jitted.txt
    - name: Run GPT2 (with HIP)
      run: |
        HIP=1 JIT=0 python3 examples/gpt2.py --prompt "Hello." --count 10 --temperature 0 --timing | tee gpt2_unjitted.txt
        HIP=1 JIT=1 python3 examples/gpt2.py --prompt "Hello." --count 10 --temperature 0 --timing | tee gpt2_jitted.txt
    - name: Run 10 CIFAR training steps
      run: STEPS=10 python3 examples/hlb_cifar10.py | tee train_cifar.txt
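    # The full 1000-step CIFAR run trains in float16 (HALF=1) and is wrapped in `time` so the
    # total wall-clock for the whole training shows up in the job log.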
    - name: Run full CIFAR training w HALF
      run: time HALF=1 STEPS=1000 python3 examples/hlb_cifar10.py | tee train_cifar_half.txt
    # TODO: make wino faster so we can enable both
    # - name: Run 10 CIFAR training steps w winograd
    #   run: WINO=1 STEPS=10 python3 examples/hlb_cifar10.py | tee train_cifar_wino.txt
    # - name: Run 10 CIFAR training steps w WINO/HALF/HIP
    #   run: HALF=1 WINO=1 STEPS=10 python3 examples/hlb_cifar10.py | tee train_cifar_wino_half_hip.txt
    - uses: actions/upload-artifact@v4
      with:
        name: Speed (AMD)
        path: |
          onnx_inference_speed.csv
          torch_speed.txt
          train_cifar.txt
          train_cifar_half.txt
          train_cifar_wino.txt
          train_cifar_wino_half_hip.txt
          llama_unjitted.txt
          llama_jitted.txt
          gpt2_unjitted.txt
          gpt2_jitted.txt
          matmul.txt
          sd.txt