This repository has been archived by the owner on Nov 4, 2024. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
test: add tests comparing the fused op with unfused op
- Loading branch information
Showing
2 changed files
with
22 additions
and
7 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,7 +1,7 @@ | ||
name = "LuxLib" | ||
uuid = "82251201-b29d-42c6-8e01-566dec8acb11" | ||
authors = ["Avik Pal <[email protected]> and contributors"] | ||
version = "1.2.0" | ||
version = "1.2.1-DEV" | ||
|
||
[deps] | ||
ArrayInterface = "4fba245c-0d91-5ea0-9b3e-6abc04ee57a9" | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
897d842
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
LuxLib Benchmarks
layernorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/2 thread(s)
5312.5
ns5666
ns0.94
layernorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/4 thread(s)
7792
ns7459
ns1.04
layernorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/8 thread(s)
8000
ns8458
ns0.95
layernorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/1 thread(s)
6958.5
ns7291
ns0.95
layernorm(2, act=gelu, affine=false)(4 x 32)/forward/GPU/CUDA
119033
ns119078
ns1.00
layernorm(2, act=gelu, affine=false)(4 x 32)/forward/GPU/oneAPI
2748386
ns2538616
ns1.08
layernorm(2, act=gelu, affine=false)(4 x 32)/forward/GPU/Metal
825375
ns702792
ns1.17
layernorm(2, act=gelu, affine=false)(4 x 32)/forward/GPU/AMDGPU
401934
ns427074
ns0.94
layernorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/2 thread(s)
9583
ns10020.5
ns0.96
layernorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/4 thread(s)
9875
ns9750
ns1.01
layernorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/8 thread(s)
9875
ns10250
ns0.96
layernorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/1 thread(s)
9979
ns9895.5
ns1.01
layernorm(2, act=gelu, affine=false)(4 x 32)/zygote/GPU/CUDA
554263
ns551531
ns1.00
layernorm(2, act=gelu, affine=false)(4 x 32)/zygote/GPU/oneAPI
18522254
ns18148603
ns1.02
layernorm(2, act=gelu, affine=false)(4 x 32)/zygote/GPU/Metal
2713291
ns2222000
ns1.22
layernorm(2, act=gelu, affine=false)(4 x 32)/zygote/GPU/AMDGPU
671997
ns679576
ns0.99
bias_activation(32, act=relu)(32 x 128)/forward/CPU/2 thread(s)
7645.5
ns1271
ns6.02
bias_activation(32, act=relu)(32 x 128)/forward/CPU/4 thread(s)
7500
ns2729
ns2.75
bias_activation(32, act=relu)(32 x 128)/forward/CPU/8 thread(s)
9750
ns1708.5
ns5.71
bias_activation(32, act=relu)(32 x 128)/forward/CPU/1 thread(s)
8521
ns1708.5
ns4.99
bias_activation(32, act=relu)(32 x 128)/forward/GPU/CUDA
23694
ns21712
ns1.09
bias_activation(32, act=relu)(32 x 128)/forward/GPU/oneAPI
1287189.5
ns1291875
ns1.00
bias_activation(32, act=relu)(32 x 128)/forward/GPU/Metal
222062.5
ns183666
ns1.21
bias_activation(32, act=relu)(32 x 128)/forward/GPU/AMDGPU
31840
ns31345.5
ns1.02
bias_activation(32, act=relu)(32 x 128)/zygote/CPU/2 thread(s)
4770.5
ns3500
ns1.36
bias_activation(32, act=relu)(32 x 128)/zygote/CPU/4 thread(s)
5041
ns3333
ns1.51
bias_activation(32, act=relu)(32 x 128)/zygote/CPU/8 thread(s)
5583.5
ns4208.5
ns1.33
bias_activation(32, act=relu)(32 x 128)/zygote/CPU/1 thread(s)
5062
ns4375
ns1.16
bias_activation(32, act=relu)(32 x 128)/zygote/GPU/CUDA
145766
ns146456.5
ns1.00
bias_activation(32, act=relu)(32 x 128)/zygote/GPU/oneAPI
8553462
ns8037303.5
ns1.06
bias_activation(32, act=relu)(32 x 128)/zygote/GPU/Metal
1568604.5
ns1510917
ns1.04
bias_activation(32, act=relu)(32 x 128)/zygote/GPU/AMDGPU
146901
ns146682
ns1.00
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s)
56917
ns56500
ns1.01
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s)
47083
ns46875
ns1.00
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s)
47375
ns46833
ns1.01
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s)
82792
ns83459
ns0.99
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/CUDA
39154
ns36990
ns1.06
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/oneAPI
578385
ns664843
ns0.87
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/Metal
1060708
ns1340625
ns0.79
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/AMDGPU
81970
ns80736
ns1.02
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s)
2023687.5
ns2031000
ns1.00
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s)
2084333.5
ns2086333.5
ns1.00
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s)
2097166.5
ns2089292
ns1.00
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s)
1996667
ns1995354
ns1.00
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/CUDA
220055
ns232927.5
ns0.94
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/oneAPI
7652457
ns7734526
ns0.99
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/Metal
5389292
ns4323958
ns1.25
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/AMDGPU
1353254
ns1581446
ns0.86
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s)
147146
ns147042
ns1.00
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s)
149750
ns144625
ns1.04
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s)
146270.5
ns149833
ns0.98
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s)
150667
ns151895.5
ns0.99
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/GPU/CUDA
165828.5
ns166087
ns1.00
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/GPU/oneAPI
7443338.5
ns7754863
ns0.96
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/GPU/Metal
1542042
ns1479250
ns1.04
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/GPU/AMDGPU
204932
ns198942
ns1.03
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s)
1114916
ns1120063
ns1.00
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s)
1110250
ns1117666
ns0.99
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s)
1120437.5
ns1115750
ns1.00
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s)
1114709
ns1124875
ns0.99
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/CUDA
688383
ns721156.5
ns0.95
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/oneAPI
33804204
ns33562933.5
ns1.01
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/Metal
6685792
ns6149062.5
ns1.09
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/AMDGPU
1030010.5
ns1022579
ns1.01
layernorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/2 thread(s)
4479
ns4166
ns1.08
layernorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/4 thread(s)
5417
ns5041.5
ns1.07
layernorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/8 thread(s)
5583
ns6042
ns0.92
layernorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/1 thread(s)
4334
ns6250
ns0.69
layernorm(2, act=relu, affine=true)(4 x 32)/forward/GPU/CUDA
91302
ns95202.5
ns0.96
layernorm(2, act=relu, affine=true)(4 x 32)/forward/GPU/oneAPI
5274846.5
ns5313078
ns0.99
layernorm(2, act=relu, affine=true)(4 x 32)/forward/GPU/Metal
449229
ns416333.5
ns1.08
layernorm(2, act=relu, affine=true)(4 x 32)/forward/GPU/AMDGPU
69581
ns65661
ns1.06
layernorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/2 thread(s)
8458
ns9000
ns0.94
layernorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/4 thread(s)
8625
ns8709
ns0.99
layernorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/8 thread(s)
9292
ns9375
ns0.99
layernorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/1 thread(s)
8375
ns8417
ns1.00
layernorm(2, act=relu, affine=true)(4 x 32)/zygote/GPU/CUDA
588432.5
ns618225
ns0.95
layernorm(2, act=relu, affine=true)(4 x 32)/zygote/GPU/oneAPI
35019318
ns31699887
ns1.10
layernorm(2, act=relu, affine=true)(4 x 32)/zygote/GPU/Metal
6040187
ns5433375
ns1.11
layernorm(2, act=relu, affine=true)(4 x 32)/zygote/GPU/AMDGPU
387324
ns388724
ns1.00
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s)
17146
ns16229.5
ns1.06
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s)
18438
ns17500
ns1.05
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s)
21500
ns21916
ns0.98
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s)
17229.5
ns18542
ns0.93
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/GPU/CUDA
66199
ns68340
ns0.97
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/GPU/oneAPI
2949345
ns3114761
ns0.95
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/GPU/Metal
1266312.5
ns455354.5
ns2.78
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/GPU/AMDGPU
76211
ns75821
ns1.01
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s)
215958
ns213125
ns1.01
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s)
219125
ns212125
ns1.03
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s)
215188
ns214749.5
ns1.00
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s)
221708
ns223791
ns0.99
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/CUDA
351090
ns361191
ns0.97
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/oneAPI
14025976.5
ns13957207
ns1.00
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/Metal
5667541.5
ns5399125
ns1.05
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/AMDGPU
469564
ns468614
ns1.00
bias_activation(2, act=relu)(2 x 128)/forward/CPU/2 thread(s)
7584
ns625
ns12.13
bias_activation(2, act=relu)(2 x 128)/forward/CPU/4 thread(s)
8166.5
ns667
ns12.24
bias_activation(2, act=relu)(2 x 128)/forward/CPU/8 thread(s)
11750
ns875
ns13.43
bias_activation(2, act=relu)(2 x 128)/forward/CPU/1 thread(s)
8562.5
ns708
ns12.09
bias_activation(2, act=relu)(2 x 128)/forward/GPU/CUDA
22778
ns20782
ns1.10
bias_activation(2, act=relu)(2 x 128)/forward/GPU/oneAPI
1173529
ns1176905
ns1.00
bias_activation(2, act=relu)(2 x 128)/forward/GPU/Metal
301791
ns179000
ns1.69
bias_activation(2, act=relu)(2 x 128)/forward/GPU/AMDGPU
32530
ns31201
ns1.04
bias_activation(2, act=relu)(2 x 128)/zygote/CPU/2 thread(s)
2209
ns1458
ns1.52
bias_activation(2, act=relu)(2 x 128)/zygote/CPU/4 thread(s)
2417
ns1500
ns1.61
bias_activation(2, act=relu)(2 x 128)/zygote/CPU/8 thread(s)
2916.5
ns1541
ns1.89
bias_activation(2, act=relu)(2 x 128)/zygote/CPU/1 thread(s)
2375
ns1333.5
ns1.78
bias_activation(2, act=relu)(2 x 128)/zygote/GPU/CUDA
126097.5
ns128010.5
ns0.99
bias_activation(2, act=relu)(2 x 128)/zygote/GPU/oneAPI
9653655
ns9057994
ns1.07
bias_activation(2, act=relu)(2 x 128)/zygote/GPU/Metal
1533792
ns1474521
ns1.04
bias_activation(2, act=relu)(2 x 128)/zygote/GPU/AMDGPU
135982
ns136491
ns1.00
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s)
14041
ns7333
ns1.91
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s)
14167
ns6166
ns2.30
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s)
14458
ns6166
ns2.34
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s)
16709
ns10291
ns1.62
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/CUDA
32839
ns24318
ns1.35
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/oneAPI
1251313.5
ns1193537
ns1.05
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/Metal
609208
ns341583
ns1.78
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/AMDGPU
56260
ns47631
ns1.18
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s)
227375
ns231125
ns0.98
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s)
275292
ns270583
ns1.02
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s)
275000
ns270375
ns1.02
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s)
261458
ns213167
ns1.23
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/CUDA
202099.5
ns195209.5
ns1.04
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/oneAPI
31323717.5
ns31467862
ns1.00
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/Metal
8740042
ns9233666
ns0.95
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/AMDGPU
655201
ns645516
ns1.02
dense(32, bias=false, act=relu)(32 x 128)/forward/CPU/2 thread(s)
4125
ns4125
ns1
dense(32, bias=false, act=relu)(32 x 128)/forward/CPU/4 thread(s)
4125
ns4125
ns1
dense(32, bias=false, act=relu)(32 x 128)/forward/CPU/8 thread(s)
4125
ns4125
ns1
dense(32, bias=false, act=relu)(32 x 128)/forward/CPU/1 thread(s)
4083
ns4125
ns0.99
dense(32, bias=false, act=relu)(32 x 128)/forward/GPU/CUDA
22662
ns23938.5
ns0.95
dense(32, bias=false, act=relu)(32 x 128)/forward/GPU/oneAPI
2051373
ns2014824
ns1.02
dense(32, bias=false, act=relu)(32 x 128)/forward/GPU/Metal
219958
ns210750
ns1.04
dense(32, bias=false, act=relu)(32 x 128)/forward/GPU/AMDGPU
46610
ns48021
ns0.97
dense(32, bias=false, act=relu)(32 x 128)/zygote/CPU/2 thread(s)
21041
ns16916
ns1.24
dense(32, bias=false, act=relu)(32 x 128)/zygote/CPU/4 thread(s)
21791
ns17417
ns1.25
dense(32, bias=false, act=relu)(32 x 128)/zygote/CPU/8 thread(s)
22250
ns17208
ns1.29
dense(32, bias=false, act=relu)(32 x 128)/zygote/CPU/1 thread(s)
20917
ns16667
ns1.25
dense(32, bias=false, act=relu)(32 x 128)/zygote/GPU/CUDA
205015
ns198962
ns1.03
dense(32, bias=false, act=relu)(32 x 128)/zygote/GPU/oneAPI
10110167
ns10294946
ns0.98
dense(32, bias=false, act=relu)(32 x 128)/zygote/GPU/Metal
975584
ns900625
ns1.08
dense(32, bias=false, act=relu)(32 x 128)/zygote/GPU/AMDGPU
182977
ns172967
ns1.06
dense(512, bias=false, act=gelu)(512 x 128)/forward/CPU/2 thread(s)
509167
ns508125
ns1.00
dense(512, bias=false, act=gelu)(512 x 128)/forward/CPU/4 thread(s)
404417
ns404416
ns1.00
dense(512, bias=false, act=gelu)(512 x 128)/forward/CPU/8 thread(s)
405000
ns404792
ns1.00
dense(512, bias=false, act=gelu)(512 x 128)/forward/CPU/1 thread(s)
864791
ns865375
ns1.00
dense(512, bias=false, act=gelu)(512 x 128)/forward/GPU/CUDA
113334.5
ns113291
ns1.00
dense(512, bias=false, act=gelu)(512 x 128)/forward/GPU/oneAPI
392728
ns429336
ns0.91
dense(512, bias=false, act=gelu)(512 x 128)/forward/GPU/Metal
421604.5
ns432708
ns0.97
dense(512, bias=false, act=gelu)(512 x 128)/forward/GPU/AMDGPU
240942
ns242113
ns1.00
dense(512, bias=false, act=gelu)(512 x 128)/zygote/CPU/2 thread(s)
2318229.5
ns2329437
ns1.00
dense(512, bias=false, act=gelu)(512 x 128)/zygote/CPU/4 thread(s)
2030833
ns2034750
ns1.00
dense(512, bias=false, act=gelu)(512 x 128)/zygote/CPU/8 thread(s)
2041375
ns2031750
ns1.00
dense(512, bias=false, act=gelu)(512 x 128)/zygote/CPU/1 thread(s)
3280292
ns3193375
ns1.03
dense(512, bias=false, act=gelu)(512 x 128)/zygote/GPU/CUDA
250973.5
ns246406
ns1.02
dense(512, bias=false, act=gelu)(512 x 128)/zygote/GPU/oneAPI
12331811
ns12521873.5
ns0.98
dense(512, bias=false, act=gelu)(512 x 128)/zygote/GPU/Metal
1903125
ns1893250
ns1.01
dense(512, bias=false, act=gelu)(512 x 128)/zygote/GPU/AMDGPU
725307
ns744268
ns0.97
layernorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/2 thread(s)
5375
ns5187.5
ns1.04
layernorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/4 thread(s)
7604
ns7083
ns1.07
layernorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/8 thread(s)
8500
ns7354
ns1.16
layernorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/1 thread(s)
6458.5
ns7542
ns0.86
layernorm(2, act=relu, affine=false)(32 x 32)/forward/GPU/CUDA
89376.5
ns93165
ns0.96
layernorm(2, act=relu, affine=false)(32 x 32)/forward/GPU/oneAPI
5476745.5
ns5491281
ns1.00
layernorm(2, act=relu, affine=false)(32 x 32)/forward/GPU/Metal
762334
ns752833
ns1.01
layernorm(2, act=relu, affine=false)(32 x 32)/forward/GPU/AMDGPU
64761
ns65211
ns0.99
layernorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/2 thread(s)
10583
ns12167
ns0.87
layernorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/4 thread(s)
11958
ns11792
ns1.01
layernorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/8 thread(s)
11958
ns12374.5
ns0.97
layernorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/1 thread(s)
10792
ns11396
ns0.95
layernorm(2, act=relu, affine=false)(32 x 32)/zygote/GPU/CUDA
632512
ns647871
ns0.98
layernorm(2, act=relu, affine=false)(32 x 32)/zygote/GPU/oneAPI
38621174
ns39284056
ns0.98
layernorm(2, act=relu, affine=false)(32 x 32)/zygote/GPU/Metal
5666041
ns5190667
ns1.09
layernorm(2, act=relu, affine=false)(32 x 32)/zygote/GPU/AMDGPU
401324
ns411409
ns0.98
dense(2, bias=true, act=relu)(2 x 128)/forward/CPU/2 thread(s)
2625
ns500
ns5.25
dense(2, bias=true, act=relu)(2 x 128)/forward/CPU/4 thread(s)
2958
ns541
ns5.47
dense(2, bias=true, act=relu)(2 x 128)/forward/CPU/8 thread(s)
3250
ns500
ns6.50
dense(2, bias=true, act=relu)(2 x 128)/forward/CPU/1 thread(s)
2792
ns500
ns5.58
dense(2, bias=true, act=relu)(2 x 128)/forward/GPU/CUDA
30482.5
ns23724
ns1.28
dense(2, bias=true, act=relu)(2 x 128)/forward/GPU/oneAPI
2134963.5
ns2212056
ns0.97
dense(2, bias=true, act=relu)(2 x 128)/forward/GPU/Metal
340083
ns204584
ns1.66
dense(2, bias=true, act=relu)(2 x 128)/forward/GPU/AMDGPU
54341
ns47141
ns1.15
dense(2, bias=true, act=relu)(2 x 128)/zygote/CPU/2 thread(s)
10750
ns2125
ns5.06
dense(2, bias=true, act=relu)(2 x 128)/zygote/CPU/4 thread(s)
11833
ns2125
ns5.57
dense(2, bias=true, act=relu)(2 x 128)/zygote/CPU/8 thread(s)
13000
ns2167
ns6.00
dense(2, bias=true, act=relu)(2 x 128)/zygote/CPU/1 thread(s)
10625
ns2125
ns5
dense(2, bias=true, act=relu)(2 x 128)/zygote/GPU/CUDA
252151
ns227021
ns1.11
dense(2, bias=true, act=relu)(2 x 128)/zygote/GPU/oneAPI
13218302
ns11087876.5
ns1.19
dense(2, bias=true, act=relu)(2 x 128)/zygote/GPU/Metal
1962708.5
ns1921834
ns1.02
dense(2, bias=true, act=relu)(2 x 128)/zygote/GPU/AMDGPU
189561.5
ns172882
ns1.10
groupnorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/2 thread(s)
26500
ns8208
ns3.23
groupnorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/4 thread(s)
31771
ns9146
ns3.47
groupnorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/8 thread(s)
35000
ns9959
ns3.51
groupnorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/1 thread(s)
28479
ns8375
ns3.40
groupnorm(2, act=relu, affine=true)(32 x 32)/forward/GPU/CUDA
121854.5
ns104776
ns1.16
groupnorm(2, act=relu, affine=true)(32 x 32)/forward/GPU/oneAPI
3225119
ns3291769.5
ns0.98
groupnorm(2, act=relu, affine=true)(32 x 32)/forward/GPU/Metal
730917
ns468500
ns1.56
groupnorm(2, act=relu, affine=true)(32 x 32)/forward/GPU/AMDGPU
80315.5
ns72700.5
ns1.10
groupnorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/2 thread(s)
22791.5
ns17374.5
ns1.31
groupnorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/4 thread(s)
25542
ns18625
ns1.37
groupnorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/8 thread(s)
25334
ns18250
ns1.39
groupnorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/1 thread(s)
23000
ns18125
ns1.27
groupnorm(2, act=relu, affine=true)(32 x 32)/zygote/GPU/CUDA
616060
ns580515
ns1.06
groupnorm(2, act=relu, affine=true)(32 x 32)/zygote/GPU/oneAPI
18160748
ns17620571
ns1.03
groupnorm(2, act=relu, affine=true)(32 x 32)/zygote/GPU/Metal
5306187.5
ns4970938
ns1.07
groupnorm(2, act=relu, affine=true)(32 x 32)/zygote/GPU/AMDGPU
388424
ns381279
ns1.02
batchnorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/2 thread(s)
1667
ns459
ns3.63
batchnorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/4 thread(s)
2000
ns584
ns3.42
batchnorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/8 thread(s)
2167
ns625
ns3.47
batchnorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/1 thread(s)
1834
ns458
ns4.00
batchnorm(2, act=identity, affine=false)(32 x 32)/forward/GPU/CUDA
40493
ns35839
ns1.13
batchnorm(2, act=identity, affine=false)(32 x 32)/forward/GPU/oneAPI
1262646
ns1218575
ns1.04
batchnorm(2, act=identity, affine=false)(32 x 32)/forward/GPU/Metal
296417
ns423541
ns0.70
batchnorm(2, act=identity, affine=false)(32 x 32)/forward/GPU/AMDGPU
48340
ns46311
ns1.04
batchnorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/2 thread(s)
10000
ns9104
ns1.10
batchnorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/4 thread(s)
11187.5
ns9333
ns1.20
batchnorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/8 thread(s)
11958
ns9083
ns1.32
batchnorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/1 thread(s)
10583
ns9208
ns1.15
batchnorm(2, act=identity, affine=false)(32 x 32)/zygote/GPU/CUDA
266372
ns261166
ns1.02
batchnorm(2, act=identity, affine=false)(32 x 32)/zygote/GPU/oneAPI
19255152
ns18752145
ns1.03
batchnorm(2, act=identity, affine=false)(32 x 32)/zygote/GPU/Metal
4716875
ns4335125
ns1.09
batchnorm(2, act=identity, affine=false)(32 x 32)/zygote/GPU/AMDGPU
379563.5
ns367929
ns1.03
dense(512, bias=false, act=identity)(512 x 128)/forward/CPU/2 thread(s)
396417
ns395708
ns1.00
dense(512, bias=false, act=identity)(512 x 128)/forward/CPU/4 thread(s)
287875
ns288375
ns1.00
dense(512, bias=false, act=identity)(512 x 128)/forward/CPU/8 thread(s)
288125
ns288375
ns1.00
dense(512, bias=false, act=identity)(512 x 128)/forward/CPU/1 thread(s)
756000
ns756292
ns1.00
dense(512, bias=false, act=identity)(512 x 128)/forward/GPU/CUDA
111465
ns111964.5
ns1.00
dense(512, bias=false, act=identity)(512 x 128)/forward/GPU/oneAPI
323864.5
ns329610
ns0.98
dense(512, bias=false, act=identity)(512 x 128)/forward/GPU/Metal
367958
ns303771
ns1.21
dense(512, bias=false, act=identity)(512 x 128)/forward/GPU/AMDGPU
75531
ns75611
ns1.00
dense(512, bias=false, act=identity)(512 x 128)/zygote/CPU/2 thread(s)
1453958.5
ns1445541
ns1.01
dense(512, bias=false, act=identity)(512 x 128)/zygote/CPU/4 thread(s)
1136125
ns1129292
ns1.01
dense(512, bias=false, act=identity)(512 x 128)/zygote/CPU/8 thread(s)
1142437.5
ns1133875
ns1.01
dense(512, bias=false, act=identity)(512 x 128)/zygote/CPU/1 thread(s)
2444854
ns2356333
ns1.04
dense(512, bias=false, act=identity)(512 x 128)/zygote/GPU/CUDA
219029
ns210839
ns1.04
dense(512, bias=false, act=identity)(512 x 128)/zygote/GPU/oneAPI
10210546
ns10091107
ns1.01
dense(512, bias=false, act=identity)(512 x 128)/zygote/GPU/Metal
1657083
ns1639416
ns1.01
dense(512, bias=false, act=identity)(512 x 128)/zygote/GPU/AMDGPU
327328
ns322414
ns1.02
layernorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/2 thread(s)
7042
ns7042
ns1
layernorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/4 thread(s)
8250
ns8000
ns1.03
layernorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/8 thread(s)
8833
ns8833.5
ns1.00
layernorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/1 thread(s)
7209
ns7520.5
ns0.96
layernorm(2, act=relu, affine=true)(32 x 32)/forward/GPU/CUDA
141318
ns142989
ns0.99
layernorm(2, act=relu, affine=true)(32 x 32)/forward/GPU/oneAPI
5924265.5
ns5929780
ns1.00
layernorm(2, act=relu, affine=true)(32 x 32)/forward/GPU/Metal
440833
ns470791.5
ns0.94
layernorm(2, act=relu, affine=true)(32 x 32)/forward/GPU/AMDGPU
65171
ns66011
ns0.99
layernorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/2 thread(s)
12000
ns16208
ns0.74
layernorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/4 thread(s)
14812
ns14250
ns1.04
layernorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/8 thread(s)
14750
ns16000
ns0.92
layernorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/1 thread(s)
11917
ns15354.5
ns0.78
layernorm(2, act=relu, affine=true)(32 x 32)/zygote/GPU/CUDA
936057
ns963872.5
ns0.97
layernorm(2, act=relu, affine=true)(32 x 32)/zygote/GPU/oneAPI
42717572.5
ns42665593.5
ns1.00
layernorm(2, act=relu, affine=true)(32 x 32)/zygote/GPU/Metal
5924541.5
ns5541125
ns1.07
layernorm(2, act=relu, affine=true)(32 x 32)/zygote/GPU/AMDGPU
423354
ns426829
ns0.99
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s)
23584
ns24458
ns0.96
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s)
29312.5
ns26062.5
ns1.12
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s)
31187
ns29916.5
ns1.04
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s)
24833
ns25708.5
ns0.97
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/GPU/CUDA
197551
ns202495.5
ns0.98
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/GPU/oneAPI
7700816.5
ns8124671
ns0.95
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/GPU/Metal
605479
ns985584
ns0.61
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/GPU/AMDGPU
114941
ns114461
ns1.00
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s)
108084
ns109083
ns0.99
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s)
124167
ns152250
ns0.82
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s)
154542
ns152854
ns1.01
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s)
151166.5
ns142750
ns1.06
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/CUDA
1062517
ns1066908
ns1.00
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/oneAPI
44916929
ns41393438
ns1.09
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/Metal
6076604
ns5472042
ns1.11
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/AMDGPU
587946
ns588251
ns1.00
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s)
73750
ns75167
ns0.98
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s)
82958
ns74583
ns1.11
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s)
78916
ns84375
ns0.94
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s)
77042
ns74125
ns1.04
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/CUDA
204012
ns208606
ns0.98
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/oneAPI
7603031.5
ns7473638
ns1.02
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/Metal
530625
ns500875
ns1.06
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/AMDGPU
129202
ns129022
ns1.00
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s)
209875
ns304417
ns0.69
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s)
218333
ns302145.5
ns0.72
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s)
286250
ns267604
ns1.07
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s)
224708
ns221146.5
ns1.02
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/CUDA
1104104
ns1119561.5
ns0.99
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/oneAPI
40856770
ns40462234
ns1.01
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/Metal
6448250
ns6061271
ns1.06
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/AMDGPU
693286
ns695387
ns1.00
layernorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/2 thread(s)
15687.5
ns15729.5
ns1.00
layernorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/4 thread(s)
17458
ns17541
ns1.00
layernorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/8 thread(s)
18250
ns18000
ns1.01
layernorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/1 thread(s)
16812.5
ns17000
ns0.99
layernorm(2, act=gelu, affine=true)(32 x 32)/forward/GPU/CUDA
144830
ns148248.5
ns0.98
layernorm(2, act=gelu, affine=true)(32 x 32)/forward/GPU/oneAPI
5439912
ns5730909.5
ns0.95
layernorm(2, act=gelu, affine=true)(32 x 32)/forward/GPU/Metal
448250
ns745333
ns0.60
layernorm(2, act=gelu, affine=true)(32 x 32)/forward/GPU/AMDGPU
231562
ns232902
ns0.99
layernorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/2 thread(s)
24667
ns26937
ns0.92
layernorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/4 thread(s)
26229.5
ns26291.5
ns1.00
layernorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/8 thread(s)
27083
ns27291
ns0.99
layernorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/1 thread(s)
24833
ns26833.5
ns0.93
layernorm(2, act=gelu, affine=true)(32 x 32)/zygote/GPU/CUDA
963572.5
ns995021
ns0.97
layernorm(2, act=gelu, affine=true)(32 x 32)/zygote/GPU/oneAPI
40959335
ns39941943
ns1.03
layernorm(2, act=gelu, affine=true)(32 x 32)/zygote/GPU/Metal
6046333
ns5463292
ns1.11
layernorm(2, act=gelu, affine=true)(32 x 32)/zygote/GPU/AMDGPU
687187
ns692327
ns0.99
groupnorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/2 thread(s)
32208.5
ns10375
ns3.10
groupnorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/4 thread(s)
38208
ns11875
ns3.22
groupnorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/8 thread(s)
43375
ns12562
ns3.45
groupnorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/1 thread(s)
31459
ns11625
ns2.71
groupnorm(2, act=gelu, affine=true)(32 x 32)/forward/GPU/CUDA
138477.5
ns125968
ns1.10
groupnorm(2, act=gelu, affine=true)(32 x 32)/forward/GPU/oneAPI
3597639.5
ns3534875
ns1.02
groupnorm(2, act=gelu, affine=true)(32 x 32)/forward/GPU/Metal
880000
ns849958
ns1.04
groupnorm(2, act=gelu, affine=true)(32 x 32)/forward/GPU/AMDGPU
243662
ns236132
ns1.03
groupnorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/2 thread(s)
23270.5
ns22292
ns1.04
groupnorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/4 thread(s)
23917
ns21542
ns1.11
groupnorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/8 thread(s)
25145.5
ns23416
ns1.07
groupnorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/1 thread(s)
22645.5
ns22459
ns1.01
groupnorm(2, act=gelu, affine=true)(32 x 32)/zygote/GPU/CUDA
705404
ns709781
ns0.99
groupnorm(2, act=gelu, affine=true)(32 x 32)/zygote/GPU/oneAPI
22376022
ns21081902.5
ns1.06
groupnorm(2, act=gelu, affine=true)(32 x 32)/zygote/GPU/Metal
5486750
ns5312812.5
ns1.03
groupnorm(2, act=gelu, affine=true)(32 x 32)/zygote/GPU/AMDGPU
671427
ns671626
ns1.00
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s)
63271
ns63000
ns1.00
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s)
64396
ns64875
ns0.99
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s)
66666
ns67624.5
ns0.99
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s)
63375.5
ns70792
ns0.90
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/CUDA
106695.5
ns108732
ns0.98
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/oneAPI
3482035
ns3570568
ns0.98
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/Metal
1328458
ns463166.5
ns2.87
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/AMDGPU
236317
ns233653
ns1.01
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s)
437479.5
ns437250
ns1.00
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s)
464312.5
ns448250
ns1.04
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s)
451499.5
ns451208
ns1.00
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s)
485145.5
ns443667
ns1.09
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/CUDA
511151
ns523839.5
ns0.98
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/oneAPI
20924955
ns20377781.5
ns1.03
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/Metal
6149750
ns6056791
ns1.02
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/AMDGPU
716967
ns715783
ns1.00
layernorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/2 thread(s)
7542
ns7104.5
ns1.06
layernorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/4 thread(s)
7375
ns8125
ns0.91
layernorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/8 thread(s)
8500
ns8333
ns1.02
layernorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/1 thread(s)
7125
ns7729.5
ns0.92
layernorm(2, act=identity, affine=true)(32 x 32)/forward/GPU/CUDA
142876.5
ns147799
ns0.97
layernorm(2, act=identity, affine=true)(32 x 32)/forward/GPU/oneAPI
5717282
ns5614298
ns1.02
layernorm(2, act=identity, affine=true)(32 x 32)/forward/GPU/Metal
463208.5
ns704750
ns0.66
layernorm(2, act=identity, affine=true)(32 x 32)/forward/GPU/AMDGPU
64690
ns65321
ns0.99
layernorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/2 thread(s)
12958
ns14500
ns0.89
layernorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/4 thread(s)
13812
ns15437.5
ns0.89
layernorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/8 thread(s)
14417
ns14833
ns0.97
layernorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/1 thread(s)
15458
ns14146
ns1.09
layernorm(2, act=identity, affine=true)(32 x 32)/zygote/GPU/CUDA
934056
ns966324
ns0.97
layernorm(2, act=identity, affine=true)(32 x 32)/zygote/GPU/oneAPI
41113899
ns36660688
ns1.12
layernorm(2, act=identity, affine=true)(32 x 32)/zygote/GPU/Metal
5680771
ns5256874.5
ns1.08
layernorm(2, act=identity, affine=true)(32 x 32)/zygote/GPU/AMDGPU
396764
ns400984
ns0.99
batchedmm(512, Bsize=4)/forward/CPU/2 thread(s)
6145625
ns6153708
ns1.00
batchedmm(512, Bsize=4)/forward/CPU/4 thread(s)
6375834
ns6380458
ns1.00
batchedmm(512, Bsize=4)/forward/CPU/8 thread(s)
6379875
ns6380979.5
ns1.00
batchedmm(512, Bsize=4)/forward/CPU/1 thread(s)
11908958
ns11947959
ns1.00
batchedmm(512, Bsize=4)/forward/GPU/CUDA
348241
ns301662
ns1.15
batchedmm(512, Bsize=4)/forward/GPU/AMDGPU
302192.5
ns322583
ns0.94
batchedmm(512, Bsize=4)/zygote/CPU/2 thread(s)
19047770.5
ns19056521
ns1.00
batchedmm(512, Bsize=4)/zygote/CPU/4 thread(s)
19961208.5
ns19941000
ns1.00
batchedmm(512, Bsize=4)/zygote/CPU/8 thread(s)
19978625
ns19981146
ns1.00
batchedmm(512, Bsize=4)/zygote/CPU/1 thread(s)
36632228.5
ns36490833.5
ns1.00
batchedmm(512, Bsize=4)/zygote/GPU/CUDA
1017536
ns1026590
ns0.99
batchedmm(512, Bsize=4)/zygote/GPU/AMDGPU
1157817
ns1153502
ns1.00
dense(2, bias=true, act=gelu)(2 x 128)/forward/CPU/2 thread(s)
3208
ns917
ns3.50
dense(2, bias=true, act=gelu)(2 x 128)/forward/CPU/4 thread(s)
3541
ns959
ns3.69
dense(2, bias=true, act=gelu)(2 x 128)/forward/CPU/8 thread(s)
4084
ns1000
ns4.08
dense(2, bias=true, act=gelu)(2 x 128)/forward/CPU/1 thread(s)
3250
ns958
ns3.39
dense(2, bias=true, act=gelu)(2 x 128)/forward/GPU/CUDA
30273
ns23570
ns1.28
dense(2, bias=true, act=gelu)(2 x 128)/forward/GPU/oneAPI
2057669
ns2101433
ns0.98
dense(2, bias=true, act=gelu)(2 x 128)/forward/GPU/Metal
335958
ns203000
ns1.65
dense(2, bias=true, act=gelu)(2 x 128)/forward/GPU/AMDGPU
212322
ns207632
ns1.02
dense(2, bias=true, act=gelu)(2 x 128)/zygote/CPU/2 thread(s)
11417
ns3708
ns3.08
dense(2, bias=true, act=gelu)(2 x 128)/zygote/CPU/4 thread(s)
12291
ns3791
ns3.24
dense(2, bias=true, act=gelu)(2 x 128)/zygote/CPU/8 thread(s)
15000
ns3792
ns3.96
dense(2, bias=true, act=gelu)(2 x 128)/zygote/CPU/1 thread(s)
11459
ns3750
ns3.06
dense(2, bias=true, act=gelu)(2 x 128)/zygote/GPU/CUDA
300887
ns284692.5
ns1.06
dense(2, bias=true, act=gelu)(2 x 128)/zygote/GPU/oneAPI
11434302
ns11502827.5
ns0.99
dense(2, bias=true, act=gelu)(2 x 128)/zygote/GPU/Metal
2150875
ns2063354
ns1.04
dense(2, bias=true, act=gelu)(2 x 128)/zygote/GPU/AMDGPU
613366
ns625846
ns0.98
groupnorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/2 thread(s)
32583
ns7208
ns4.52
groupnorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/4 thread(s)
39625
ns8500
ns4.66
groupnorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/8 thread(s)
42125
ns9292
ns4.53
groupnorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/1 thread(s)
31291
ns8250
ns3.79
groupnorm(2, act=relu, affine=true)(4 x 32)/forward/GPU/CUDA
134275.5
ns122668.5
ns1.09
groupnorm(2, act=relu, affine=true)(4 x 32)/forward/GPU/oneAPI
3549585
ns3715127.5
ns0.96
groupnorm(2, act=relu, affine=true)(4 x 32)/forward/GPU/Metal
782479
ns787166
ns0.99
groupnorm(2, act=relu, affine=true)(4 x 32)/forward/GPU/AMDGPU
81161
ns72740
ns1.12
groupnorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/2 thread(s)
17959
ns11875
ns1.51
groupnorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/4 thread(s)
19937.5
ns12750
ns1.56
groupnorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/8 thread(s)
20375
ns12583
ns1.62
groupnorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/1 thread(s)
18291
ns12500
ns1.46
groupnorm(2, act=relu, affine=true)(4 x 32)/zygote/GPU/CUDA
653629
ns651999
ns1.00
groupnorm(2, act=relu, affine=true)(4 x 32)/zygote/GPU/oneAPI
21014605.5
ns22144306
ns0.95
groupnorm(2, act=relu, affine=true)(4 x 32)/zygote/GPU/Metal
4601291
ns4276208
ns1.08
groupnorm(2, act=relu, affine=true)(4 x 32)/zygote/GPU/AMDGPU
371429
ns359014
ns1.03
dense(2, bias=false, act=relu)(2 x 128)/forward/CPU/2 thread(s)
291
ns292
ns1.00
dense(2, bias=false, act=relu)(2 x 128)/forward/CPU/4 thread(s)
292
ns292
ns1
dense(2, bias=false, act=relu)(2 x 128)/forward/CPU/8 thread(s)
292
ns292
ns1
dense(2, bias=false, act=relu)(2 x 128)/forward/CPU/1 thread(s)
291
ns334
ns0.87
dense(2, bias=false, act=relu)(2 x 128)/forward/GPU/CUDA
22327
ns22720.5
ns0.98
dense(2, bias=false, act=relu)(2 x 128)/forward/GPU/oneAPI
2086130.5
ns2075647.5
ns1.01
dense(2, bias=false, act=relu)(2 x 128)/forward/GPU/Metal
324479
ns205083
ns1.58
dense(2, bias=false, act=relu)(2 x 128)/forward/GPU/AMDGPU
46841
ns47440
ns0.99
dense(2, bias=false, act=relu)(2 x 128)/zygote/CPU/2 thread(s)
6791
ns2875
ns2.36
dense(2, bias=false, act=relu)(2 x 128)/zygote/CPU/4 thread(s)
7208
ns3500
ns2.06
dense(2, bias=false, act=relu)(2 x 128)/zygote/CPU/8 thread(s)
9375
ns3333
ns2.81
dense(2, bias=false, act=relu)(2 x 128)/zygote/CPU/1 thread(s)
6667
ns3208
ns2.08
dense(2, bias=false, act=relu)(2 x 128)/zygote/GPU/CUDA
215371.5
ns206663
ns1.04
dense(2, bias=false, act=relu)(2 x 128)/zygote/GPU/oneAPI
10232521.5
ns9232071
ns1.11
dense(2, bias=false, act=relu)(2 x 128)/zygote/GPU/Metal
1703500
ns1552875
ns1.10
dense(2, bias=false, act=relu)(2 x 128)/zygote/GPU/AMDGPU
166071
ns156172
ns1.06
groupnorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/2 thread(s)
10167
ns10083
ns1.01
groupnorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/4 thread(s)
12875.5
ns11083
ns1.16
groupnorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/8 thread(s)
13125
ns12458
ns1.05
groupnorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/1 thread(s)
11083
ns11708
ns0.95
groupnorm(2, act=gelu, affine=false)(32 x 32)/forward/GPU/CUDA
120797.5
ns123476
ns0.98
groupnorm(2, act=gelu, affine=false)(32 x 32)/forward/GPU/oneAPI
3363540
ns3456473.5
ns0.97
groupnorm(2, act=gelu, affine=false)(32 x 32)/forward/GPU/Metal
935500
ns861479.5
ns1.09
groupnorm(2, act=gelu, affine=false)(32 x 32)/forward/GPU/AMDGPU
233122
ns236062
ns0.99
groupnorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/2 thread(s)
20709
ns20604
ns1.01
groupnorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/4 thread(s)
21875
ns23187.5
ns0.94
groupnorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/8 thread(s)
21750
ns23333
ns0.93
groupnorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/1 thread(s)
22625
ns21042
ns1.08
groupnorm(2, act=gelu, affine=false)(32 x 32)/zygote/GPU/CUDA
590585
ns607311
ns0.97
groupnorm(2, act=gelu, affine=false)(32 x 32)/zygote/GPU/oneAPI
20620797
ns20290582.5
ns1.02
groupnorm(2, act=gelu, affine=false)(32 x 32)/zygote/GPU/Metal
4822000
ns4254667
ns1.13
groupnorm(2, act=gelu, affine=false)(32 x 32)/zygote/GPU/AMDGPU
648361
ns645431.5
ns1.00
dense(32, bias=true, act=relu)(32 x 128)/forward/CPU/2 thread(s)
6833
ns4458
ns1.53
dense(32, bias=true, act=relu)(32 x 128)/forward/CPU/4 thread(s)
7041
ns4500
ns1.56
dense(32, bias=true, act=relu)(32 x 128)/forward/CPU/8 thread(s)
7833
ns4417
ns1.77
dense(32, bias=true, act=relu)(32 x 128)/forward/CPU/1 thread(s)
6875
ns4500
ns1.53
dense(32, bias=true, act=relu)(32 x 128)/forward/GPU/CUDA
31284
ns24732
ns1.26
dense(32, bias=true, act=relu)(32 x 128)/forward/GPU/oneAPI
2217654
ns2177168
ns1.02
dense(32, bias=true, act=relu)(32 x 128)/forward/GPU/Metal
229313
ns211459
ns1.08
dense(32, bias=true, act=relu)(32 x 128)/forward/GPU/AMDGPU
52301
ns47591
ns1.10
dense(32, bias=true, act=relu)(32 x 128)/zygote/CPU/2 thread(s)
26125
ns16375
ns1.60
dense(32, bias=true, act=relu)(32 x 128)/zygote/CPU/4 thread(s)
27209
ns16834
ns1.62
dense(32, bias=true, act=relu)(32 x 128)/zygote/CPU/8 thread(s)
30000
ns16458
ns1.82
dense(32, bias=true, act=relu)(32 x 128)/zygote/CPU/1 thread(s)
25834
ns16083
ns1.61
dense(32, bias=true, act=relu)(32 x 128)/zygote/GPU/CUDA
347032.5
ns332546.5
ns1.04
dense(32, bias=true, act=relu)(32 x 128)/zygote/GPU/oneAPI
13087814.5
ns12988178
ns1.01
dense(32, bias=true, act=relu)(32 x 128)/zygote/GPU/Metal
1080292
ns1511750
ns0.71
dense(32, bias=true, act=relu)(32 x 128)/zygote/GPU/AMDGPU
216482.5
ns208322
ns1.04
batchnorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/2 thread(s)
3334
ns2084
ns1.60
batchnorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/4 thread(s)
3458
ns2041
ns1.69
batchnorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/8 thread(s)
3875
ns2167
ns1.79
batchnorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/1 thread(s)
3417
ns2209
ns1.55
batchnorm(2, act=gelu, affine=false)(32 x 32)/forward/GPU/CUDA
41491.5
ns36551
ns1.14
batchnorm(2, act=gelu, affine=false)(32 x 32)/forward/GPU/oneAPI
1196910.5
ns1147028
ns1.04
batchnorm(2, act=gelu, affine=false)(32 x 32)/forward/GPU/Metal
397958
ns268042
ns1.48
batchnorm(2, act=gelu, affine=false)(32 x 32)/forward/GPU/AMDGPU
206202
ns204212
ns1.01
batchnorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/2 thread(s)
16917
ns17396
ns0.97
batchnorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/4 thread(s)
20833
ns17250
ns1.21
batchnorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/8 thread(s)
21208
ns17812.5
ns1.19
batchnorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/1 thread(s)
17687.5
ns19479
ns0.91
batchnorm(2, act=gelu, affine=false)(32 x 32)/zygote/GPU/CUDA
288016
ns297836
ns0.97
batchnorm(2, act=gelu, affine=false)(32 x 32)/zygote/GPU/oneAPI
21817074
ns21470855.5
ns1.02
batchnorm(2, act=gelu, affine=false)(32 x 32)/zygote/GPU/Metal
5201083
ns5022375
ns1.04
batchnorm(2, act=gelu, affine=false)(32 x 32)/zygote/GPU/AMDGPU
696531.5
ns686617
ns1.01
batchedmm(16, Bsize=512)/forward/CPU/2 thread(s)
55459
ns56395.5
ns0.98
batchedmm(16, Bsize=512)/forward/CPU/4 thread(s)
64896
ns65083
ns1.00
batchedmm(16, Bsize=512)/forward/CPU/8 thread(s)
65583.5
ns66250
ns0.99
batchedmm(16, Bsize=512)/forward/CPU/1 thread(s)
51541.5
ns51333
ns1.00
batchedmm(16, Bsize=512)/forward/GPU/CUDA
66456
ns66767.5
ns1.00
batchedmm(16, Bsize=512)/forward/GPU/AMDGPU
113921
ns115211
ns0.99
batchedmm(16, Bsize=512)/zygote/CPU/2 thread(s)
132500
ns197187.5
ns0.67
batchedmm(16, Bsize=512)/zygote/CPU/4 thread(s)
166374.5
ns163417
ns1.02
batchedmm(16, Bsize=512)/zygote/CPU/8 thread(s)
111500
ns163937.5
ns0.68
batchedmm(16, Bsize=512)/zygote/CPU/1 thread(s)
316833
ns315500
ns1.00
batchedmm(16, Bsize=512)/zygote/GPU/CUDA
217912
ns219712.5
ns0.99
batchedmm(16, Bsize=512)/zygote/GPU/AMDGPU
613066
ns611147
ns1.00
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s)
80625
ns105333
ns0.77
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s)
125645.5
ns81834
ns1.54
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s)
86146
ns86959
ns0.99
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s)
82959
ns86750
ns0.96
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/GPU/CUDA
193130
ns191740.5
ns1.01
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/GPU/oneAPI
5692161
ns5593567.5
ns1.02
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/GPU/Metal
1989666.5
ns2535645.5
ns0.78
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/GPU/AMDGPU
216662.5
ns204172
ns1.06
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s)
1912792
ns1915521
ns1.00
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s)
1921187.5
ns1914333
ns1.00
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s)
1912375
ns1911750
ns1.00
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s)
1908917
ns1879292
ns1.02
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/CUDA
526124
ns538609
ns0.98
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/oneAPI
27266064
ns24792062.5
ns1.10
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/Metal
8680374.5
ns8911395.5
ns0.97
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/AMDGPU
1069560
ns1067201
ns1.00
dense(2, bias=true, act=identity)(2 x 128)/forward/CPU/2 thread(s)
2375
ns292
ns8.13
dense(2, bias=true, act=identity)(2 x 128)/forward/CPU/4 thread(s)
2792
ns292
ns9.56
dense(2, bias=true, act=identity)(2 x 128)/forward/CPU/8 thread(s)
3459
ns292
ns11.85
dense(2, bias=true, act=identity)(2 x 128)/forward/CPU/1 thread(s)
2375
ns333
ns7.13
dense(2, bias=true, act=identity)(2 x 128)/forward/GPU/CUDA
28282
ns22127
ns1.28
dense(2, bias=true, act=identity)(2 x 128)/forward/GPU/oneAPI
2020733
ns2111782
ns0.96
dense(2, bias=true, act=identity)(2 x 128)/forward/GPU/Metal
355667
ns320417
ns1.11
dense(2, bias=true, act=identity)(2 x 128)/forward/GPU/AMDGPU
46231
ns41970
ns1.10
dense(2, bias=true, act=identity)(2 x 128)/zygote/CPU/2 thread(s)
9625
ns1792
ns5.37
dense(2, bias=true, act=identity)(2 x 128)/zygote/CPU/4 thread(s)
13459
ns1875
ns7.18
dense(2, bias=true, act=identity)(2 x 128)/zygote/CPU/8 thread(s)
14166
ns1875
ns7.56
dense(2, bias=true, act=identity)(2 x 128)/zygote/CPU/1 thread(s)
9625
ns1875
ns5.13
dense(2, bias=true, act=identity)(2 x 128)/zygote/GPU/CUDA
270635.5
ns255417.5
ns1.06
dense(2, bias=true, act=identity)(2 x 128)/zygote/GPU/oneAPI
9869416
ns10493115
ns0.94
dense(2, bias=true, act=identity)(2 x 128)/zygote/GPU/Metal
1067437.5
ns1487041
ns0.72
dense(2, bias=true, act=identity)(2 x 128)/zygote/GPU/AMDGPU
195496.5
ns183032
ns1.07
groupnorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/2 thread(s)
7958
ns7375
ns1.08
groupnorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/4 thread(s)
9854
ns9562.5
ns1.03
groupnorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/8 thread(s)
11000
ns11250
ns0.98
groupnorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/1 thread(s)
7521
ns11333
ns0.66
groupnorm(2, act=gelu, affine=false)(4 x 32)/forward/GPU/CUDA
116502.5
ns121634
ns0.96
groupnorm(2, act=gelu, affine=false)(4 x 32)/forward/GPU/oneAPI
3478612
ns3330370
ns1.04
groupnorm(2, act=gelu, affine=false)(4 x 32)/forward/GPU/Metal
901250
ns831000
ns1.08
groupnorm(2, act=gelu, affine=false)(4 x 32)/forward/GPU/AMDGPU
233553
ns235863
ns0.99
groupnorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/2 thread(s)
8500
ns8958
ns0.95
groupnorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/4 thread(s)
10042
ns10917
ns0.92
groupnorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/8 thread(s)
10208
ns11542
ns0.88
groupnorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/1 thread(s)
8708
ns9250
ns0.94
groupnorm(2, act=gelu, affine=false)(4 x 32)/zygote/GPU/CUDA
518097.5
ns536196
ns0.97
groupnorm(2, act=gelu, affine=false)(4 x 32)/zygote/GPU/oneAPI
21197993
ns20906072
ns1.01
groupnorm(2, act=gelu, affine=false)(4 x 32)/zygote/GPU/Metal
4329250
ns3661104.5
ns1.18
groupnorm(2, act=gelu, affine=false)(4 x 32)/zygote/GPU/AMDGPU
626366
ns620146.5
ns1.01
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s)
63291
ns56833
ns1.11
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s)
58084
ns46333
ns1.25
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s)
57292
ns47000
ns1.22
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s)
89791
ns83417
ns1.08
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/GPU/CUDA
50283
ns40185
ns1.25
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/GPU/oneAPI
1570099.5
ns1391043
ns1.13
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/GPU/Metal
1167250
ns1150167
ns1.01
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/GPU/AMDGPU
84641
ns77886
ns1.09
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s)
1912667
ns1925959
ns0.99
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s)
1975417
ns1932875
ns1.02
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s)
1966250
ns1975666
ns1.00
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s)
1870792
ns1853417
ns1.01
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/CUDA
232632
ns224336
ns1.04
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/oneAPI
33812417
ns33169959
ns1.02
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/Metal
11151146
ns11254125
ns0.99
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/AMDGPU
1177471
ns1176553
ns1.00
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s)
415042
ns416209
ns1.00
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s)
419333.5
ns418021.5
ns1.00
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s)
422000
ns423500
ns1.00
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s)
417187.5
ns417709
ns1.00
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/CUDA
207638.5
ns212391.5
ns0.98
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/oneAPI
7531726.5
ns7928224
ns0.95
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/Metal
542958.5
ns501042
ns1.08
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/AMDGPU
282777.5
ns283733
ns1.00
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s)
667541
ns689875.5
ns0.97
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s)
748979
ns744770.5
ns1.01
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s)
673708
ns684250
ns0.98
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s)
675250
ns683020.5
ns0.99
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/CUDA
1040445
ns1071393
ns0.97
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/oneAPI
44753534
ns45538634
ns0.98
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/Metal
6673666.5
ns6134687.5
ns1.09
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/AMDGPU
908713
ns911264.5
ns1.00
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s)
3514375
ns3426041.5
ns1.03
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s)
3451021
ns3415458.5
ns1.01
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s)
3440750
ns3440084
ns1.00
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s)
3449792
ns3459083
ns1.00
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/CUDA
184364
ns174794
ns1.05
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/oneAPI
7909340
ns8045126
ns0.98
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/Metal
1385709
ns1391250
ns1.00
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/AMDGPU
425164
ns426850
ns1.00
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s)
6177354
ns6168667
ns1.00
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s)
6248791
ns6210416
ns1.01
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s)
6199834
ns6205709
ns1.00
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s)
6163750
ns6247562.5
ns0.99
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/CUDA
983055
ns1017240
ns0.97
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/oneAPI
52497500
ns50293396
ns1.04
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/Metal
8007792
ns7732791.5
ns1.04
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/AMDGPU
1641310.5
ns1542501
ns1.06
dense(512, bias=true, act=gelu)(512 x 128)/forward/CPU/2 thread(s)
474292
ns473291
ns1.00
dense(512, bias=true, act=gelu)(512 x 128)/forward/CPU/4 thread(s)
345209
ns342875
ns1.01
dense(512, bias=true, act=gelu)(512 x 128)/forward/CPU/8 thread(s)
346500
ns341396
ns1.01
dense(512, bias=true, act=gelu)(512 x 128)/forward/CPU/1 thread(s)
905500
ns901791
ns1.00
dense(512, bias=true, act=gelu)(512 x 128)/forward/GPU/CUDA
54357
ns46836
ns1.16
dense(512, bias=true, act=gelu)(512 x 128)/forward/GPU/oneAPI
389107.5
ns381391
ns1.02
dense(512, bias=true, act=gelu)(512 x 128)/forward/GPU/Metal
404541
ns354270.5
ns1.14
dense(512, bias=true, act=gelu)(512 x 128)/forward/GPU/AMDGPU
247753
ns243143
ns1.02
dense(512, bias=true, act=gelu)(512 x 128)/zygote/CPU/2 thread(s)
2334208
ns2332208
ns1.00
dense(512, bias=true, act=gelu)(512 x 128)/zygote/CPU/4 thread(s)
2038708.5
ns2034354.5
ns1.00
dense(512, bias=true, act=gelu)(512 x 128)/zygote/CPU/8 thread(s)
2043542
ns2036500
ns1.00
dense(512, bias=true, act=gelu)(512 x 128)/zygote/CPU/1 thread(s)
3293041.5
ns3194416
ns1.03
dense(512, bias=true, act=gelu)(512 x 128)/zygote/GPU/CUDA
278906.5
ns273644.5
ns1.02
dense(512, bias=true, act=gelu)(512 x 128)/zygote/GPU/oneAPI
8914943
ns15628377
ns0.57
dense(512, bias=true, act=gelu)(512 x 128)/zygote/GPU/Metal
2088084
ns2136645.5
ns0.98
dense(512, bias=true, act=gelu)(512 x 128)/zygote/GPU/AMDGPU
756552
ns772838
ns0.98
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s)
62291
ns56292
ns1.11
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s)
58500
ns45834
ns1.28
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s)
56958
ns46125
ns1.23
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s)
89250
ns83209
ns1.07
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/CUDA
38171
ns28601
ns1.33
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/oneAPI
1017793
ns1335147
ns0.76
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/Metal
1166417
ns1124979
ns1.04
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/AMDGPU
86475.5
ns74305.5
ns1.16
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s)
2035708
ns2016104.5
ns1.01
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s)
2102479
ns2087291
ns1.01
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s)
2080937.5
ns2087917
ns1.00
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s)
2008875
ns1975958.5
ns1.02
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/CUDA
245090
ns240545
ns1.02
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/oneAPI
38180250
ns37474096
ns1.02
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/Metal
11967000
ns11883709
ns1.01
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/AMDGPU
1207986.5
ns1048951
ns1.15
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s)
57709
ns56542
ns1.02
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s)
48084
ns46354.5
ns1.04
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s)
49000
ns46666.5
ns1.05
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s)
83667
ns83750
ns1.00
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/GPU/CUDA
56367
ns50752
ns1.11
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/GPU/oneAPI
752295
ns835807
ns0.90
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/GPU/Metal
1087374.5
ns1048667
ns1.04
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/GPU/AMDGPU
80301
ns78556
ns1.02
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s)
1917209
ns1921000
ns1.00
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s)
1944583
ns1952958.5
ns1.00
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s)
1961125
ns1973000
ns0.99
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s)
1894375
ns1862417
ns1.02
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/CUDA
246493.5
ns246729
ns1.00
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/oneAPI
16926918.5
ns16959227
ns1.00
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/Metal
9855333.5
ns9957875
ns0.99
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/AMDGPU
1034015
ns1034211
ns1.00
batchnorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/2 thread(s)
1375
ns292
ns4.71
batchnorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/4 thread(s)
1667
ns416
ns4.01
batchnorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/8 thread(s)
1959
ns416
ns4.71
batchnorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/1 thread(s)
1375
ns292
ns4.71
batchnorm(2, act=identity, affine=false)(4 x 32)/forward/GPU/CUDA
39676
ns35694
ns1.11
batchnorm(2, act=identity, affine=false)(4 x 32)/forward/GPU/oneAPI
1198863
ns1211794.5
ns0.99
batchnorm(2, act=identity, affine=false)(4 x 32)/forward/GPU/Metal
286209
ns311771
ns0.92
batchnorm(2, act=identity, affine=false)(4 x 32)/forward/GPU/AMDGPU
48840
ns46570
ns1.05
batchnorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/2 thread(s)
7375
ns6604.5
ns1.12
batchnorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/4 thread(s)
9083
ns7291.5
ns1.25
batchnorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/8 thread(s)
9479.5
ns6666
ns1.42
batchnorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/1 thread(s)
7583
ns6709
ns1.13
batchnorm(2, act=identity, affine=false)(4 x 32)/zygote/GPU/CUDA
213194.5
ns213644
ns1.00
batchnorm(2, act=identity, affine=false)(4 x 32)/zygote/GPU/oneAPI
20651072
ns21642370
ns0.95
batchnorm(2, act=identity, affine=false)(4 x 32)/zygote/GPU/Metal
4692500
ns4349083.5
ns1.08
batchnorm(2, act=identity, affine=false)(4 x 32)/zygote/GPU/AMDGPU
380024
ns366543.5
ns1.04
dense(2, bias=false, act=identity)(2 x 128)/forward/CPU/2 thread(s)
250
ns250
ns1
dense(2, bias=false, act=identity)(2 x 128)/forward/CPU/4 thread(s)
291
ns291
ns1
dense(2, bias=false, act=identity)(2 x 128)/forward/CPU/8 thread(s)
291
ns292
ns1.00
dense(2, bias=false, act=identity)(2 x 128)/forward/CPU/1 thread(s)
250
ns292
ns0.86
dense(2, bias=false, act=identity)(2 x 128)/forward/GPU/CUDA
32533
ns32948
ns0.99
dense(2, bias=false, act=identity)(2 x 128)/forward/GPU/oneAPI
1268230
ns1191915
ns1.06
dense(2, bias=false, act=identity)(2 x 128)/forward/GPU/Metal
254979.5
ns153792
ns1.66
dense(2, bias=false, act=identity)(2 x 128)/forward/GPU/AMDGPU
37420
ns39081
ns0.96
dense(2, bias=false, act=identity)(2 x 128)/zygote/CPU/2 thread(s)
6042
ns3208
ns1.88
dense(2, bias=false, act=identity)(2 x 128)/zygote/CPU/4 thread(s)
7083
ns3041
ns2.33
dense(2, bias=false, act=identity)(2 x 128)/zygote/CPU/8 thread(s)
9333
ns3083
ns3.03
dense(2, bias=false, act=identity)(2 x 128)/zygote/CPU/1 thread(s)
6083
ns3083
ns1.97
dense(2, bias=false, act=identity)(2 x 128)/zygote/GPU/CUDA
199543
ns193915
ns1.03
dense(2, bias=false, act=identity)(2 x 128)/zygote/GPU/oneAPI
8717018
ns7217530
ns1.21
dense(2, bias=false, act=identity)(2 x 128)/zygote/GPU/Metal
950520.5
ns894250
ns1.06
dense(2, bias=false, act=identity)(2 x 128)/zygote/GPU/AMDGPU
164831
ns158472
ns1.04
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s)
437749.5
ns420583.5
ns1.04
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s)
487292
ns420833.5
ns1.16
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s)
466021
ns456166.5
ns1.02
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s)
442021
ns426229
ns1.04
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/CUDA
143480
ns140216.5
ns1.02
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/oneAPI
5976577
ns6258248
ns0.95
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/Metal
2179687.5
ns2682604
ns0.81
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/AMDGPU
370168.5
ns367294
ns1.01
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s)
3794500
ns3811479
ns1.00
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s)
3803417
ns3798000
ns1.00
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s)
3791458
ns3806125
ns1.00
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s)
3801709
ns3813437.5
ns1.00
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/CUDA
707529.5
ns724543
ns0.98
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/oneAPI
35214956
ns32785400
ns1.07
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/Metal
10857667
ns10852833
ns1.00
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/AMDGPU
1463934
ns1313993.5
ns1.11
batchedmm(512, Bsize=32)/forward/CPU/2 thread(s)
49798563
ns49807062.5
ns1.00
batchedmm(512, Bsize=32)/forward/CPU/4 thread(s)
35524209
ns35521583
ns1.00
batchedmm(512, Bsize=32)/forward/CPU/8 thread(s)
35534958
ns35517479
ns1.00
batchedmm(512, Bsize=32)/forward/CPU/1 thread(s)
97214791.5
ns97112834
ns1.00
batchedmm(512, Bsize=32)/forward/GPU/CUDA
1600126
ns1611615
ns0.99
batchedmm(512, Bsize=32)/forward/GPU/AMDGPU
1047610
ns1049140
ns1.00
batchedmm(512, Bsize=32)/zygote/CPU/2 thread(s)
153739771
ns153740041.5
ns1.00
batchedmm(512, Bsize=32)/zygote/CPU/4 thread(s)
112306958.5
ns112306083
ns1.00
batchedmm(512, Bsize=32)/zygote/CPU/8 thread(s)
112388250
ns112476667
ns1.00
batchedmm(512, Bsize=32)/zygote/CPU/1 thread(s)
294975583
ns295356541
ns1.00
batchedmm(512, Bsize=32)/zygote/GPU/CUDA
6485489.5
ns6485483
ns1.00
batchedmm(512, Bsize=32)/zygote/GPU/AMDGPU
5559847
ns5555702
ns1.00
bias_activation(32, act=tanh)(32 x 128)/forward/CPU/2 thread(s)
21209
ns15041.5
ns1.41
bias_activation(32, act=tanh)(32 x 128)/forward/CPU/4 thread(s)
20792
ns18375
ns1.13
bias_activation(32, act=tanh)(32 x 128)/forward/CPU/8 thread(s)
20667
ns16083
ns1.29
bias_activation(32, act=tanh)(32 x 128)/forward/CPU/1 thread(s)
23334
ns15646
ns1.49
bias_activation(32, act=tanh)(32 x 128)/forward/GPU/CUDA
23699
ns21271
ns1.11
bias_activation(32, act=tanh)(32 x 128)/forward/GPU/oneAPI
1173705
ns1120492.5
ns1.05
bias_activation(32, act=tanh)(32 x 128)/forward/GPU/Metal
222416.5
ns200000
ns1.11
bias_activation(32, act=tanh)(32 x 128)/forward/GPU/AMDGPU
28521
ns27480
ns1.04
bias_activation(32, act=tanh)(32 x 128)/zygote/CPU/2 thread(s)
11500
ns10666.5
ns1.08
bias_activation(32, act=tanh)(32 x 128)/zygote/CPU/4 thread(s)
10000
ns9042
ns1.11
bias_activation(32, act=tanh)(32 x 128)/zygote/CPU/8 thread(s)
10375
ns9437.5
ns1.10
bias_activation(32, act=tanh)(32 x 128)/zygote/CPU/1 thread(s)
18416
ns17042
ns1.08
bias_activation(32, act=tanh)(32 x 128)/zygote/GPU/CUDA
259109.5
ns267724
ns0.97
bias_activation(32, act=tanh)(32 x 128)/zygote/GPU/oneAPI
10048978
ns10072145
ns1.00
bias_activation(32, act=tanh)(32 x 128)/zygote/GPU/Metal
1578917
ns1541750
ns1.02
bias_activation(32, act=tanh)(32 x 128)/zygote/GPU/AMDGPU
147201.5
ns148171
ns0.99
groupnorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/2 thread(s)
24625
ns7709
ns3.19
groupnorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/4 thread(s)
27667
ns8709
ns3.18
groupnorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/8 thread(s)
30500
ns10708
ns2.85
groupnorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/1 thread(s)
23937.5
ns9708.5
ns2.47
groupnorm(2, act=gelu, affine=true)(4 x 32)/forward/GPU/CUDA
137987.5
ns129031
ns1.07
groupnorm(2, act=gelu, affine=true)(4 x 32)/forward/GPU/oneAPI
3369462
ns3486446
ns0.97
groupnorm(2, act=gelu, affine=true)(4 x 32)/forward/GPU/Metal
670209
ns797791
ns0.84
groupnorm(2, act=gelu, affine=true)(4 x 32)/forward/GPU/AMDGPU
243177.5
ns234732
ns1.04
groupnorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/2 thread(s)
10917
ns10458.5
ns1.04
groupnorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/4 thread(s)
11500
ns9833
ns1.17
groupnorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/8 thread(s)
11750
ns11333.5
ns1.04
groupnorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/1 thread(s)
10792
ns9125
ns1.18
groupnorm(2, act=gelu, affine=true)(4 x 32)/zygote/GPU/CUDA
621051
ns638866
ns0.97
groupnorm(2, act=gelu, affine=true)(4 x 32)/zygote/GPU/oneAPI
22648297
ns21816663
ns1.04
groupnorm(2, act=gelu, affine=true)(4 x 32)/zygote/GPU/Metal
4704896
ns4208187.5
ns1.12
groupnorm(2, act=gelu, affine=true)(4 x 32)/zygote/GPU/AMDGPU
650846
ns651461.5
ns1.00
groupnorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/2 thread(s)
8041
ns8625.5
ns0.93
groupnorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/4 thread(s)
10271
ns9729
ns1.06
groupnorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/8 thread(s)
11125
ns11521
ns0.97
groupnorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/1 thread(s)
9292
ns11042
ns0.84
groupnorm(2, act=relu, affine=false)(32 x 32)/forward/GPU/CUDA
119985.5
ns123974
ns0.97
groupnorm(2, act=relu, affine=false)(32 x 32)/forward/GPU/oneAPI
3391312.5
ns3315044
ns1.02
groupnorm(2, act=relu, affine=false)(32 x 32)/forward/GPU/Metal
895208
ns859750
ns1.04
groupnorm(2, act=relu, affine=false)(32 x 32)/forward/GPU/AMDGPU
71901
ns72471
ns0.99
groupnorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/2 thread(s)
13354
ns17583
ns0.76
groupnorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/4 thread(s)
13667
ns13458
ns1.02
groupnorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/8 thread(s)
13917
ns15166
ns0.92
groupnorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/1 thread(s)
13708
ns13083
ns1.05
groupnorm(2, act=relu, affine=false)(32 x 32)/zygote/GPU/CUDA
585616
ns608117
ns0.96
groupnorm(2, act=relu, affine=false)(32 x 32)/zygote/GPU/oneAPI
20399248
ns18976850.5
ns1.07
groupnorm(2, act=relu, affine=false)(32 x 32)/zygote/GPU/Metal
4221708
ns3989167
ns1.06
groupnorm(2, act=relu, affine=false)(32 x 32)/zygote/GPU/AMDGPU
351904
ns346933
ns1.01
batchnorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/2 thread(s)
1542
ns541
ns2.85
batchnorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/4 thread(s)
1750
ns625
ns2.80
batchnorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/8 thread(s)
1792
ns625
ns2.87
batchnorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/1 thread(s)
1583
ns584
ns2.71
batchnorm(2, act=gelu, affine=false)(4 x 32)/forward/GPU/CUDA
40136
ns35726
ns1.12
batchnorm(2, act=gelu, affine=false)(4 x 32)/forward/GPU/oneAPI
1237195
ns1170850
ns1.06
batchnorm(2, act=gelu, affine=false)(4 x 32)/forward/GPU/Metal
273959
ns255917
ns1.07
batchnorm(2, act=gelu, affine=false)(4 x 32)/forward/GPU/AMDGPU
207332
ns204512
ns1.01
batchnorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/2 thread(s)
8750
ns8604.5
ns1.02
batchnorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/4 thread(s)
9250
ns7625
ns1.21
batchnorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/8 thread(s)
9291
ns9250
ns1.00
batchnorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/1 thread(s)
8875
ns7584
ns1.17
batchnorm(2, act=gelu, affine=false)(4 x 32)/zygote/GPU/CUDA
227150.5
ns237837
ns0.96
batchnorm(2, act=gelu, affine=false)(4 x 32)/zygote/GPU/oneAPI
22981390
ns23133813.5
ns0.99
batchnorm(2, act=gelu, affine=false)(4 x 32)/zygote/GPU/Metal
4712916
ns4454021
ns1.06
batchnorm(2, act=gelu, affine=false)(4 x 32)/zygote/GPU/AMDGPU
674086
ns654907
ns1.03
bias_activation(32, act=gelu)(32 x 128)/forward/CPU/2 thread(s)
17875
ns12208
ns1.46
bias_activation(32, act=gelu)(32 x 128)/forward/CPU/4 thread(s)
19167
ns16208
ns1.18
bias_activation(32, act=gelu)(32 x 128)/forward/CPU/8 thread(s)
18896
ns15542
ns1.22
bias_activation(32, act=gelu)(32 x 128)/forward/CPU/1 thread(s)
18125
ns10229
ns1.77
bias_activation(32, act=gelu)(32 x 128)/forward/GPU/CUDA
24199.5
ns22887
ns1.06
bias_activation(32, act=gelu)(32 x 128)/forward/GPU/oneAPI
1135581
ns1146280
ns0.99
bias_activation(32, act=gelu)(32 x 128)/forward/GPU/Metal
208625.5
ns183250
ns1.14
bias_activation(32, act=gelu)(32 x 128)/forward/GPU/AMDGPU
187926.5
ns190602
ns0.99
bias_activation(32, act=gelu)(32 x 128)/zygote/CPU/2 thread(s)
32417
ns31917
ns1.02
bias_activation(32, act=gelu)(32 x 128)/zygote/CPU/4 thread(s)
32958
ns32334
ns1.02
bias_activation(32, act=gelu)(32 x 128)/zygote/CPU/8 thread(s)
33458
ns32334
ns1.03
bias_activation(32, act=gelu)(32 x 128)/zygote/CPU/1 thread(s)
32625
ns31792
ns1.03
bias_activation(32, act=gelu)(32 x 128)/zygote/GPU/CUDA
275193
ns282370
ns0.97
bias_activation(32, act=gelu)(32 x 128)/zygote/GPU/oneAPI
11054114
ns12675054
ns0.87
bias_activation(32, act=gelu)(32 x 128)/zygote/GPU/Metal
1674271
ns1664375
ns1.01
bias_activation(32, act=gelu)(32 x 128)/zygote/GPU/AMDGPU
588556
ns592261
ns0.99
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s)
455833.5
ns445708
ns1.02
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s)
470416.5
ns440416
ns1.07
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s)
445500
ns446125
ns1.00
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s)
442125
ns462250
ns0.96
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/CUDA
194972.5
ns194079.5
ns1.00
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/oneAPI
6233916
ns6009981
ns1.04
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/Metal
2002875
ns1948750
ns1.03
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/AMDGPU
368743
ns368473
ns1.00
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s)
3826416.5
ns3828708
ns1.00
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s)
3821625
ns3827249.5
ns1.00
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s)
3805291.5
ns3829459
ns0.99
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s)
3828770.5
ns3834708
ns1.00
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/CUDA
539774
ns555671
ns0.97
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/oneAPI
27246937
ns28291601.5
ns0.96
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/Metal
9665250
ns9332833
ns1.04
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/AMDGPU
1360323
ns1362449
ns1.00
batchedmm(512, Bsize=512)/forward/CPU/2 thread(s)
787624562.5
ns836902583.5
ns0.94
batchedmm(512, Bsize=512)/forward/CPU/4 thread(s)
541996916
ns545812333
ns0.99
batchedmm(512, Bsize=512)/forward/CPU/8 thread(s)
539785459
ns552742958
ns0.98
batchedmm(512, Bsize=512)/forward/CPU/1 thread(s)
1557728417
ns1515431791
ns1.03
batchedmm(512, Bsize=512)/forward/GPU/CUDA
22543125
ns22773250.5
ns0.99
batchedmm(512, Bsize=512)/forward/GPU/AMDGPU
14726018
ns14681704
ns1.00
batchedmm(512, Bsize=512)/zygote/CPU/2 thread(s)
2518400750
ns3618929167
ns0.70
batchedmm(512, Bsize=512)/zygote/CPU/4 thread(s)
1785169708
ns1786520209
ns1.00
batchedmm(512, Bsize=512)/zygote/CPU/8 thread(s)
1784676208
ns1811380625
ns0.99
batchedmm(512, Bsize=512)/zygote/CPU/1 thread(s)
5268664750
ns4749890834
ns1.11
batchedmm(512, Bsize=512)/zygote/GPU/CUDA
367578104
ns371829328
ns0.99
batchedmm(512, Bsize=512)/zygote/GPU/AMDGPU
88737971
ns89064682
ns1.00
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s)
75084
ns75813
ns0.99
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s)
76541.5
ns76708
ns1.00
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s)
78958
ns79437
ns0.99
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s)
75625
ns76979
ns0.98
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/GPU/CUDA
206590
ns213831.5
ns0.97
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/GPU/oneAPI
7826047.5
ns7889207
ns0.99
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/GPU/Metal
947916
ns504291
ns1.88
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/GPU/AMDGPU
120271
ns107541
ns1.12
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s)
193042
ns268729
ns0.72
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s)
278584
ns283625
ns0.98
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s)
194458
ns204145.5
ns0.95
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s)
249250
ns192875
ns1.29
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/CUDA
1038440
ns1071904.5
ns0.97
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/oneAPI
43499026
ns42887765
ns1.01
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/Metal
6277083
ns5838812.5
ns1.08
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/AMDGPU
658001
ns632041
ns1.04
batchedmm(512, Bsize=128)/forward/CPU/2 thread(s)
199276312.5
ns199435500
ns1.00
batchedmm(512, Bsize=128)/forward/CPU/4 thread(s)
139271583
ns139086375
ns1.00
batchedmm(512, Bsize=128)/forward/CPU/8 thread(s)
139246333
ns139238083
ns1.00
batchedmm(512, Bsize=128)/forward/CPU/1 thread(s)
388477666
ns389003125
ns1.00
batchedmm(512, Bsize=128)/forward/GPU/CUDA
5836579.5
ns5834940
ns1.00
batchedmm(512, Bsize=128)/forward/GPU/AMDGPU
3573103
ns3577266
ns1.00
batchedmm(512, Bsize=128)/zygote/CPU/2 thread(s)
619375645.5
ns616747896
ns1.00
batchedmm(512, Bsize=128)/zygote/CPU/4 thread(s)
439498458
ns438910291
ns1.00
batchedmm(512, Bsize=128)/zygote/CPU/8 thread(s)
439699604.5
ns439344770.5
ns1.00
batchedmm(512, Bsize=128)/zygote/CPU/1 thread(s)
1187020083
ns1178749375
ns1.01
batchedmm(512, Bsize=128)/zygote/GPU/CUDA
26508453
ns26592537.5
ns1.00
batchedmm(512, Bsize=128)/zygote/GPU/AMDGPU
22071416
ns22013573
ns1.00
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s)
13833
ns7292
ns1.90
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s)
13292
ns6291
ns2.11
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s)
13625
ns6250
ns2.18
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s)
16334
ns9959
ns1.64
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/GPU/CUDA
37105
ns28590.5
ns1.30
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/GPU/oneAPI
1214237
ns1242816
ns0.98
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/GPU/Metal
682166
ns342708
ns1.99
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/GPU/AMDGPU
56160
ns46790
ns1.20
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s)
219750
ns214875
ns1.02
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s)
228708
ns220542
ns1.04
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s)
229666.5
ns223250
ns1.03
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s)
213125
ns207000
ns1.03
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/CUDA
233596
ns227888
ns1.03
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/oneAPI
32648557
ns32088566
ns1.02
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/Metal
9102583
ns9056958
ns1.01
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/AMDGPU
556036
ns532636
ns1.04
groupnorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/2 thread(s)
8625
ns7500
ns1.15
groupnorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/4 thread(s)
9083.5
ns8459
ns1.07
groupnorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/8 thread(s)
10416
ns11166
ns0.93
groupnorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/1 thread(s)
8125
ns10125
ns0.80
groupnorm(2, act=relu, affine=false)(4 x 32)/forward/GPU/CUDA
116194
ns120432.5
ns0.96
groupnorm(2, act=relu, affine=false)(4 x 32)/forward/GPU/oneAPI
3482048
ns3400864
ns1.02
groupnorm(2, act=relu, affine=false)(4 x 32)/forward/GPU/Metal
900041.5
ns833917
ns1.08
groupnorm(2, act=relu, affine=false)(4 x 32)/forward/GPU/AMDGPU
73561
ns69170
ns1.06
groupnorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/2 thread(s)
7583
ns11687
ns0.65
groupnorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/4 thread(s)
8084
ns7875
ns1.03
groupnorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/8 thread(s)
8208
ns9083
ns0.90
groupnorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/1 thread(s)
7708
ns7791.5
ns0.99
groupnorm(2, act=relu, affine=false)(4 x 32)/zygote/GPU/CUDA
515823
ns540200
ns0.95
groupnorm(2, act=relu, affine=false)(4 x 32)/zygote/GPU/oneAPI
19542020
ns19905821.5
ns0.98
groupnorm(2, act=relu, affine=false)(4 x 32)/zygote/GPU/Metal
4141083
ns3738000
ns1.11
groupnorm(2, act=relu, affine=false)(4 x 32)/zygote/GPU/AMDGPU
319483
ns316443
ns1.01
batchnorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/2 thread(s)
6709
ns500
ns13.42
batchnorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/4 thread(s)
7000
ns500
ns14
batchnorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/8 thread(s)
7250
ns583
ns12.44
batchnorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/1 thread(s)
6833
ns500
ns13.67
batchnorm(2, act=identity, affine=true)(32 x 32)/forward/GPU/CUDA
35385
ns26859
ns1.32
batchnorm(2, act=identity, affine=true)(32 x 32)/forward/GPU/oneAPI
1225507.5
ns1218948
ns1.01
batchnorm(2, act=identity, affine=true)(32 x 32)/forward/GPU/Metal
317896
ns487291.5
ns0.65
batchnorm(2, act=identity, affine=true)(32 x 32)/forward/GPU/AMDGPU
58250
ns46600
ns1.25
batchnorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/2 thread(s)
15083
ns12042
ns1.25
batchnorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/4 thread(s)
16500
ns9500
ns1.74
batchnorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/8 thread(s)
17375
ns10666
ns1.63
batchnorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/1 thread(s)
15583
ns9375
ns1.66
batchnorm(2, act=identity, affine=true)(32 x 32)/zygote/GPU/CUDA
263078
ns259067.5
ns1.02
batchnorm(2, act=identity, affine=true)(32 x 32)/zygote/GPU/oneAPI
23218391
ns22720833.5
ns1.02
batchnorm(2, act=identity, affine=true)(32 x 32)/zygote/GPU/Metal
5365146
ns5032208
ns1.07
batchnorm(2, act=identity, affine=true)(32 x 32)/zygote/GPU/AMDGPU
398084
ns388914
ns1.02
bias_activation(512, act=gelu)(512 x 128)/forward/CPU/2 thread(s)
111604
ns105209
ns1.06
bias_activation(512, act=gelu)(512 x 128)/forward/CPU/4 thread(s)
106999.5
ns98958.5
ns1.08
bias_activation(512, act=gelu)(512 x 128)/forward/CPU/8 thread(s)
111125
ns100666
ns1.10
bias_activation(512, act=gelu)(512 x 128)/forward/CPU/1 thread(s)
158562.5
ns146584
ns1.08
bias_activation(512, act=gelu)(512 x 128)/forward/GPU/CUDA
27181
ns26010
ns1.05
bias_activation(512, act=gelu)(512 x 128)/forward/GPU/oneAPI
1210920.5
ns1202311.5
ns1.01
bias_activation(512, act=gelu)(512 x 128)/forward/GPU/Metal
268208
ns239416
ns1.12
bias_activation(512, act=gelu)(512 x 128)/forward/GPU/AMDGPU
193052
ns191122
ns1.01
bias_activation(512, act=gelu)(512 x 128)/zygote/CPU/2 thread(s)
479520.5
ns478959
ns1.00
bias_activation(512, act=gelu)(512 x 128)/zygote/CPU/4 thread(s)
510437.5
ns490458
ns1.04
bias_activation(512, act=gelu)(512 x 128)/zygote/CPU/8 thread(s)
480729
ns483458
ns0.99
bias_activation(512, act=gelu)(512 x 128)/zygote/CPU/1 thread(s)
479354.5
ns519792
ns0.92
bias_activation(512, act=gelu)(512 x 128)/zygote/GPU/CUDA
233277
ns238157
ns0.98
bias_activation(512, act=gelu)(512 x 128)/zygote/GPU/oneAPI
11412765
ns11712742
ns0.97
bias_activation(512, act=gelu)(512 x 128)/zygote/GPU/Metal
2209500
ns2063166.5
ns1.07
bias_activation(512, act=gelu)(512 x 128)/zygote/GPU/AMDGPU
604431
ns609226.5
ns0.99
batchedmm(16, Bsize=32)/forward/CPU/2 thread(s)
5021
ns5459
ns0.92
batchedmm(16, Bsize=32)/forward/CPU/4 thread(s)
5708.5
ns6937.5
ns0.82
batchedmm(16, Bsize=32)/forward/CPU/8 thread(s)
6333.5
ns6708
ns0.94
batchedmm(16, Bsize=32)/forward/CPU/1 thread(s)
6625
ns4479
ns1.48
batchedmm(16, Bsize=32)/forward/GPU/CUDA
16031
ns17171
ns0.93
batchedmm(16, Bsize=32)/forward/GPU/AMDGPU
84920
ns84830
ns1.00
batchedmm(16, Bsize=32)/zygote/CPU/2 thread(s)
12729.5
ns12709
ns1.00
batchedmm(16, Bsize=32)/zygote/CPU/4 thread(s)
11646
ns11208.5
ns1.04
batchedmm(16, Bsize=32)/zygote/CPU/8 thread(s)
12146
ns11979.5
ns1.01
batchedmm(16, Bsize=32)/zygote/CPU/1 thread(s)
17375
ns16792
ns1.03
batchedmm(16, Bsize=32)/zygote/GPU/CUDA
216325.5
ns219500
ns0.99
batchedmm(16, Bsize=32)/zygote/GPU/AMDGPU
366004
ns367374
ns1.00
batchedmm(16, Bsize=128)/forward/CPU/2 thread(s)
35312.5
ns35250
ns1.00
batchedmm(16, Bsize=128)/forward/CPU/4 thread(s)
51479
ns51958
ns0.99
batchedmm(16, Bsize=128)/forward/CPU/8 thread(s)
53042
ns53333
ns0.99
batchedmm(16, Bsize=128)/forward/CPU/1 thread(s)
13667
ns13792
ns0.99
batchedmm(16, Bsize=128)/forward/GPU/CUDA
21712
ns22473
ns0.97
batchedmm(16, Bsize=128)/forward/GPU/AMDGPU
91931
ns87211
ns1.05
batchedmm(16, Bsize=128)/zygote/CPU/2 thread(s)
37354.5
ns37208
ns1.00
batchedmm(16, Bsize=128)/zygote/CPU/4 thread(s)
44104
ns30979
ns1.42
batchedmm(16, Bsize=128)/zygote/CPU/8 thread(s)
32958
ns32729.5
ns1.01
batchedmm(16, Bsize=128)/zygote/CPU/1 thread(s)
57917
ns57375
ns1.01
batchedmm(16, Bsize=128)/zygote/GPU/CUDA
194626.5
ns198883
ns0.98
batchedmm(16, Bsize=128)/zygote/GPU/AMDGPU
399414
ns411165
ns0.97
bias_activation(2, act=tanh)(2 x 128)/forward/CPU/2 thread(s)
8542
ns1708
ns5.00
bias_activation(2, act=tanh)(2 x 128)/forward/CPU/4 thread(s)
9791.5
ns1917
ns5.11
bias_activation(2, act=tanh)(2 x 128)/forward/CPU/8 thread(s)
11625
ns2208
ns5.26
bias_activation(2, act=tanh)(2 x 128)/forward/CPU/1 thread(s)
9750
ns2020.5
ns4.83
bias_activation(2, act=tanh)(2 x 128)/forward/GPU/CUDA
23397
ns20890
ns1.12
bias_activation(2, act=tanh)(2 x 128)/forward/GPU/oneAPI
1258960
ns1182894
ns1.06
bias_activation(2, act=tanh)(2 x 128)/forward/GPU/Metal
305375
ns198895.5
ns1.54
bias_activation(2, act=tanh)(2 x 128)/forward/GPU/AMDGPU
34271
ns34491
ns0.99
bias_activation(2, act=tanh)(2 x 128)/zygote/CPU/2 thread(s)
3041
ns2250
ns1.35
bias_activation(2, act=tanh)(2 x 128)/zygote/CPU/4 thread(s)
3271
ns2125
ns1.54
bias_activation(2, act=tanh)(2 x 128)/zygote/CPU/8 thread(s)
3792
ns2541
ns1.49
bias_activation(2, act=tanh)(2 x 128)/zygote/CPU/1 thread(s)
3208
ns2375
ns1.35
bias_activation(2, act=tanh)(2 x 128)/zygote/GPU/CUDA
206318
ns209350.5
ns0.99
bias_activation(2, act=tanh)(2 x 128)/zygote/GPU/oneAPI
8856545.5
ns9223044
ns0.96
bias_activation(2, act=tanh)(2 x 128)/zygote/GPU/Metal
1504750.5
ns1571458
ns0.96
bias_activation(2, act=tanh)(2 x 128)/zygote/GPU/AMDGPU
141011.5
ns137241
ns1.03
layernorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/2 thread(s)
4792
ns3979.5
ns1.20
layernorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/4 thread(s)
4708.5
ns4916
ns0.96
layernorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/8 thread(s)
6834
ns6167
ns1.11
layernorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/1 thread(s)
4667
ns5562.5
ns0.84
layernorm(2, act=identity, affine=true)(4 x 32)/forward/GPU/CUDA
141147.5
ns148854.5
ns0.95
layernorm(2, act=identity, affine=true)(4 x 32)/forward/GPU/oneAPI
5871936.5
ns5416916
ns1.08
layernorm(2, act=identity, affine=true)(4 x 32)/forward/GPU/Metal
457167
ns433541
ns1.05
layernorm(2, act=identity, affine=true)(4 x 32)/forward/GPU/AMDGPU
68731
ns69351
ns0.99
layernorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/2 thread(s)
8458.5
ns8958
ns0.94
layernorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/4 thread(s)
8459
ns8584
ns0.99
layernorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/8 thread(s)
8750
ns9375
ns0.93
layernorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/1 thread(s)
8333
ns8208
ns1.02
layernorm(2, act=identity, affine=true)(4 x 32)/zygote/GPU/CUDA
861183
ns901778
ns0.95
layernorm(2, act=identity, affine=true)(4 x 32)/zygote/GPU/oneAPI
37057824
ns39101068.5
ns0.95
layernorm(2, act=identity, affine=true)(4 x 32)/zygote/GPU/Metal
5555937.5
ns5296271
ns1.05
layernorm(2, act=identity, affine=true)(4 x 32)/zygote/GPU/AMDGPU
385044
ns390164
ns0.99
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s)
58083
ns56792
ns1.02
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s)
59084
ns57792
ns1.02
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s)
59416
ns57667
ns1.03
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s)
59416
ns58625
ns1.01
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/CUDA
43710
ns38676
ns1.13
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/oneAPI
1203059
ns1256024
ns0.96
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/Metal
532666
ns328000
ns1.62
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/AMDGPU
207252
ns204982
ns1.01
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s)
449104.5
ns454396
ns0.99
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s)
465666.5
ns464875
ns1.00
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s)
467437
ns465042
ns1.01
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s)
435520.5
ns433750
ns1.00
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/CUDA
264164
ns274516.5
ns0.96
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/oneAPI
27468186
ns27766998
ns0.99
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/Metal
8246875
ns7963542
ns1.04
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/AMDGPU
831528
ns840618
ns0.99
batchedmm(128, Bsize=128)/forward/CPU/2 thread(s)
3290708
ns3290875
ns1.00
batchedmm(128, Bsize=128)/forward/CPU/4 thread(s)
2334854.5
ns2340916.5
ns1.00
batchedmm(128, Bsize=128)/forward/CPU/8 thread(s)
2339729
ns2344208.5
ns1.00
batchedmm(128, Bsize=128)/forward/CPU/1 thread(s)
6308458
ns6314083.5
ns1.00
batchedmm(128, Bsize=128)/forward/GPU/CUDA
204167
ns205766
ns0.99
batchedmm(128, Bsize=128)/forward/GPU/AMDGPU
218552
ns213542
ns1.02
batchedmm(128, Bsize=128)/zygote/CPU/2 thread(s)
11346209
ns11352771
ns1.00
batchedmm(128, Bsize=128)/zygote/CPU/4 thread(s)
8328312.5
ns8308208
ns1.00
batchedmm(128, Bsize=128)/zygote/CPU/8 thread(s)
8321834
ns8331229.5
ns1.00
batchedmm(128, Bsize=128)/zygote/CPU/1 thread(s)
21080084
ns21159458.5
ns1.00
batchedmm(128, Bsize=128)/zygote/GPU/CUDA
728462
ns735602
ns0.99
batchedmm(128, Bsize=128)/zygote/GPU/AMDGPU
1058000
ns1058910.5
ns1.00
layernorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/2 thread(s)
5083.5
ns3542
ns1.44
layernorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/4 thread(s)
6875
ns6646
ns1.03
layernorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/8 thread(s)
7083
ns7333
ns0.97
layernorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/1 thread(s)
6604
ns6875
ns0.96
layernorm(2, act=identity, affine=false)(4 x 32)/forward/GPU/CUDA
136287.5
ns141882
ns0.96
layernorm(2, act=identity, affine=false)(4 x 32)/forward/GPU/oneAPI
5642827
ns5384644
ns1.05
layernorm(2, act=identity, affine=false)(4 x 32)/forward/GPU/Metal
783520.5
ns792000
ns0.99
layernorm(2, act=identity, affine=false)(4 x 32)/forward/GPU/AMDGPU
55800
ns56381
ns0.99
layernorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/2 thread(s)
7000
ns9458
ns0.74
layernorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/4 thread(s)
7417
ns7583.5
ns0.98
layernorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/8 thread(s)
7375
ns7250
ns1.02
layernorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/1 thread(s)
7291.5
ns7458
ns0.98
layernorm(2, act=identity, affine=false)(4 x 32)/zygote/GPU/CUDA
747674.5
ns774451.5
ns0.97
layernorm(2, act=identity, affine=false)(4 x 32)/zygote/GPU/oneAPI
35501443
ns37102116
ns0.96
layernorm(2, act=identity, affine=false)(4 x 32)/zygote/GPU/Metal
5585312.5
ns5116062.5
ns1.09
layernorm(2, act=identity, affine=false)(4 x 32)/zygote/GPU/AMDGPU
365323
ns368734
ns0.99
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s)
110750
ns95500
ns1.16
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s)
127458.5
ns95041
ns1.34
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s)
122542
ns101334
ns1.21
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s)
117167
ns96958
ns1.21
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/CUDA
156753
ns153183
ns1.02
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/oneAPI
6003618
ns5925151
ns1.01
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/Metal
2136000
ns2007167
ns1.06
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/AMDGPU
226292.5
ns218112
ns1.04
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s)
2021500
ns2021874.5
ns1.00
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s)
2022042
ns2010334
ns1.01
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s)
2031021
ns2025458
ns1.00
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s)
2023917
ns2005917
ns1.01
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/CUDA
706711
ns723141
ns0.98
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/oneAPI
31552204.5
ns33170321
ns0.95
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/Metal
10690542
ns10803562.5
ns0.99
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/AMDGPU
1254492
ns1255352
ns1.00
batchedmm(2, Bsize=4)/forward/CPU/2 thread(s)
28833.5
ns29750
ns0.97
batchedmm(2, Bsize=4)/forward/CPU/4 thread(s)
36542
ns36291.5
ns1.01
batchedmm(2, Bsize=4)/forward/CPU/8 thread(s)
34917
ns35000
ns1.00
batchedmm(2, Bsize=4)/forward/CPU/1 thread(s)
708.5
ns708
ns1.00
batchedmm(2, Bsize=4)/forward/GPU/CUDA
15392
ns15831
ns0.97
batchedmm(2, Bsize=4)/forward/GPU/AMDGPU
79601
ns80041
ns0.99
batchedmm(2, Bsize=4)/zygote/CPU/2 thread(s)
3250
ns3417
ns0.95
batchedmm(2, Bsize=4)/zygote/CPU/4 thread(s)
3833
ns3000
ns1.28
batchedmm(2, Bsize=4)/zygote/CPU/8 thread(s)
3917
ns2958
ns1.32
batchedmm(2, Bsize=4)/zygote/CPU/1 thread(s)
2834
ns2292
ns1.24
batchedmm(2, Bsize=4)/zygote/GPU/CUDA
139825
ns144997
ns0.96
batchedmm(2, Bsize=4)/zygote/GPU/AMDGPU
340743
ns345563
ns0.99
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s)
8416
ns7167
ns1.17
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s)
7333
ns6208
ns1.18
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s)
7541
ns6042
ns1.25
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s)
11208
ns10458
ns1.07
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/GPU/CUDA
42506
ns37804.5
ns1.12
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/GPU/oneAPI
1167392.5
ns1127358
ns1.04
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/GPU/Metal
420187.5
ns324750
ns1.29
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/GPU/AMDGPU
50571
ns48830
ns1.04
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s)
213521
ns213833
ns1.00
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s)
229958
ns221229
ns1.04
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s)
222791.5
ns220667
ns1.01
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s)
215375
ns206167
ns1.04
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/CUDA
251022
ns251783
ns1.00
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/oneAPI
25955253.5
ns25462835
ns1.02
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/Metal
7930584
ns7855917
ns1.01
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/AMDGPU
574850
ns579016
ns0.99
dense(32, bias=true, act=identity)(32 x 128)/forward/CPU/2 thread(s)
6209
ns3917
ns1.59
dense(32, bias=true, act=identity)(32 x 128)/forward/CPU/4 thread(s)
6375
ns3958
ns1.61
dense(32, bias=true, act=identity)(32 x 128)/forward/CPU/8 thread(s)
6459
ns3917
ns1.65
dense(32, bias=true, act=identity)(32 x 128)/forward/CPU/1 thread(s)
6125
ns4167
ns1.47
dense(32, bias=true, act=identity)(32 x 128)/forward/GPU/CUDA
28584
ns22588
ns1.27
dense(32, bias=true, act=identity)(32 x 128)/forward/GPU/oneAPI
2152846
ns2083671
ns1.03
dense(32, bias=true, act=identity)(32 x 128)/forward/GPU/Metal
251125
ns226542
ns1.11
dense(32, bias=true, act=identity)(32 x 128)/forward/GPU/AMDGPU
47090
ns42771
ns1.10
dense(32, bias=true, act=identity)(32 x 128)/zygote/CPU/2 thread(s)
23167
ns14916
ns1.55
dense(32, bias=true, act=identity)(32 x 128)/zygote/CPU/4 thread(s)
24166
ns15083
ns1.60
dense(32, bias=true, act=identity)(32 x 128)/zygote/CPU/8 thread(s)
24375
ns14916
ns1.63
dense(32, bias=true, act=identity)(32 x 128)/zygote/CPU/1 thread(s)
23292
ns14792
ns1.57
dense(32, bias=true, act=identity)(32 x 128)/zygote/GPU/CUDA
333019.5
ns316521
ns1.05
dense(32, bias=true, act=identity)(32 x 128)/zygote/GPU/oneAPI
11744061
ns11265875.5
ns1.04
dense(32, bias=true, act=identity)(32 x 128)/zygote/GPU/Metal
1014854.5
ns963479.5
ns1.05
dense(32, bias=true, act=identity)(32 x 128)/zygote/GPU/AMDGPU
208872
ns193022
ns1.08
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s)
110000.5
ns101709
ns1.08
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s)
148604
ns99958
ns1.49
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s)
126750
ns106041
ns1.20
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s)
133125
ns102208
ns1.30
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/GPU/CUDA
148515
ns142614
ns1.04
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/GPU/oneAPI
5739795.5
ns5689078
ns1.01
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/GPU/Metal
2080104
ns2045292
ns1.02
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/GPU/AMDGPU
217122
ns214192
ns1.01
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s)
1912521
ns1924667
ns0.99
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s)
1906583
ns1842979
ns1.03
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s)
1884312.5
ns1918292
ns0.98
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s)
1920187.5
ns1901125
ns1.01
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/CUDA
696456
ns707209
ns0.98
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/oneAPI
31821334
ns31631954.5
ns1.01
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/Metal
10487959
ns10461667
ns1.00
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/AMDGPU
1218296
ns1220282
ns1.00
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s)
17542
ns16604
ns1.06
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s)
22458
ns18813
ns1.19
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s)
20771
ns21271
ns0.98
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s)
18771
ns18291
ns1.03
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/CUDA
112142.5
ns111618
ns1.00
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/oneAPI
3422627
ns3369345
ns1.02
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/Metal
1340625
ns464208
ns2.89
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/AMDGPU
80871
ns80435.5
ns1.01
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s)
215875
ns216042
ns1.00
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s)
253583
ns217458
ns1.17
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s)
217667
ns216708.5
ns1.00
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s)
216417
ns216395.5
ns1.00
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/CUDA
525953.5
ns534644
ns0.98
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/oneAPI
19492559.5
ns19551285.5
ns1.00
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/Metal
6121020.5
ns6104084
ns1.00
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/AMDGPU
476639.5
ns481515
ns0.99
batchedmm(16, Bsize=4)/forward/CPU/2 thread(s)
23979.5
ns23416.5
ns1.02
batchedmm(16, Bsize=4)/forward/CPU/4 thread(s)
32625
ns30395.5
ns1.07
batchedmm(16, Bsize=4)/forward/CPU/8 thread(s)
28250
ns28583
ns0.99
batchedmm(16, Bsize=4)/forward/CPU/1 thread(s)
1666.5
ns1250
ns1.33
batchedmm(16, Bsize=4)/forward/GPU/CUDA
16428
ns16607
ns0.99
batchedmm(16, Bsize=4)/forward/GPU/AMDGPU
81141
ns81651
ns0.99
batchedmm(16, Bsize=4)/zygote/CPU/2 thread(s)
5271
ns4729.5
ns1.11
batchedmm(16, Bsize=4)/zygote/CPU/4 thread(s)
5854.5
ns4916.5
ns1.19
batchedmm(16, Bsize=4)/zygote/CPU/8 thread(s)
6437.5
ns5104.5
ns1.26
batchedmm(16, Bsize=4)/zygote/CPU/1 thread(s)
5646
ns4875
ns1.16
batchedmm(16, Bsize=4)/zygote/GPU/CUDA
215206.5
ns212757
ns1.01
batchedmm(16, Bsize=4)/zygote/GPU/AMDGPU
379243.5
ns378384
ns1.00
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s)
303000
ns303792
ns1.00
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s)
305416.5
ns306416.5
ns1.00
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s)
308771
ns308125
ns1.00
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s)
305083
ns306917
ns0.99
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/CUDA
231043
ns235352.5
ns0.98
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/oneAPI
8121882
ns7753901
ns1.05
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/Metal
1184000
ns895000
ns1.32
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/AMDGPU
272543
ns273893
ns1.00
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s)
529833.5
ns532500
ns0.99
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s)
567729.5
ns561375
ns1.01
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s)
533292
ns533875
ns1.00
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s)
536958.5
ns538042
ns1.00
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/CUDA
1091736
ns1115910
ns0.98
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/oneAPI
45445443.5
ns43545460
ns1.04
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/Metal
6208000
ns5736646
ns1.08
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/AMDGPU
868528
ns855458
ns1.02
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s)
36042
ns18500
ns1.95
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s)
39083
ns23125
ns1.69
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s)
42458
ns20875
ns2.03
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s)
37041
ns20250
ns1.83
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/GPU/CUDA
131591
ns117298.5
ns1.12
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/GPU/oneAPI
3738074.5
ns3644245
ns1.03
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/GPU/Metal
1464375
ns475438
ns3.08
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/GPU/AMDGPU
87560
ns79291
ns1.10
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s)
215250.5
ns213125
ns1.01
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s)
215104.5
ns227959
ns0.94
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s)
215917
ns214479.5
ns1.01
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s)
219083.5
ns212750
ns1.03
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/CUDA
768516
ns769273
ns1.00
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/oneAPI
24673008
ns26817998
ns0.92
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/Metal
7384104
ns7163750
ns1.03
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/AMDGPU
532425
ns536785
ns0.99
layernorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/2 thread(s)
5500
ns5292
ns1.04
layernorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/4 thread(s)
7000
ns6979
ns1.00
layernorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/8 thread(s)
8542
ns8458.5
ns1.01
layernorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/1 thread(s)
6334
ns6958
ns0.91
layernorm(2, act=identity, affine=false)(32 x 32)/forward/GPU/CUDA
140673
ns144689.5
ns0.97
layernorm(2, act=identity, affine=false)(32 x 32)/forward/GPU/oneAPI
5635164.5
ns5674338
ns0.99
layernorm(2, act=identity, affine=false)(32 x 32)/forward/GPU/Metal
772083
ns763958
ns1.01
layernorm(2, act=identity, affine=false)(32 x 32)/forward/GPU/AMDGPU
67510
ns65951
ns1.02
layernorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/2 thread(s)
10271
ns9833
ns1.04
layernorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/4 thread(s)
10292
ns10395.5
ns0.99
layernorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/8 thread(s)
10833.5
ns9875
ns1.10
layernorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/1 thread(s)
10875
ns10166
ns1.07
layernorm(2, act=identity, affine=false)(32 x 32)/zygote/GPU/CUDA
833045.5
ns843305.5
ns0.99
layernorm(2, act=identity, affine=false)(32 x 32)/zygote/GPU/oneAPI
38670510
ns40229475
ns0.96
layernorm(2, act=identity, affine=false)(32 x 32)/zygote/GPU/Metal
5336083
ns5021354
ns1.06
layernorm(2, act=identity, affine=false)(32 x 32)/zygote/GPU/AMDGPU
390258.5
ns388453.5
ns1.00
layernorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/2 thread(s)
4709
ns5083
ns0.93
layernorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/4 thread(s)
5500
ns5645.5
ns0.97
layernorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/8 thread(s)
6709
ns7354
ns0.91
layernorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/1 thread(s)
6542
ns7459
ns0.88
layernorm(2, act=relu, affine=false)(4 x 32)/forward/GPU/CUDA
144256
ns148525.5
ns0.97
layernorm(2, act=relu, affine=false)(4 x 32)/forward/GPU/oneAPI
5567924
ns5807141
ns0.96
layernorm(2, act=relu, affine=false)(4 x 32)/forward/GPU/Metal
792979
ns768729
ns1.03
layernorm(2, act=relu, affine=false)(4 x 32)/forward/GPU/AMDGPU
66931
ns67441
ns0.99
layernorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/2 thread(s)
7125.5
ns7459
ns0.96
layernorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/4 thread(s)
7917
ns7750
ns1.02
layernorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/8 thread(s)
7917
ns7583
ns1.04
layernorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/1 thread(s)
7604.5
ns7291
ns1.04
layernorm(2, act=relu, affine=false)(4 x 32)/zygote/GPU/CUDA
790107
ns806597
ns0.98
layernorm(2, act=relu, affine=false)(4 x 32)/zygote/GPU/oneAPI
39023151
ns38873703
ns1.00
layernorm(2, act=relu, affine=false)(4 x 32)/zygote/GPU/Metal
5547667
ns5499042
ns1.01
layernorm(2, act=relu, affine=false)(4 x 32)/zygote/GPU/AMDGPU
391578.5
ns394693
ns0.99
batchedmm(128, Bsize=512)/forward/CPU/2 thread(s)
14365625
ns14393541
ns1.00
batchedmm(128, Bsize=512)/forward/CPU/4 thread(s)
10109792
ns10086042
ns1.00
batchedmm(128, Bsize=512)/forward/CPU/8 thread(s)
10132375
ns10132625
ns1.00
batchedmm(128, Bsize=512)/forward/CPU/1 thread(s)
27659333
ns27847083
ns0.99
batchedmm(128, Bsize=512)/forward/GPU/CUDA
534508
ns531501
ns1.01
batchedmm(128, Bsize=512)/forward/GPU/AMDGPU
392324
ns400094
ns0.98
batchedmm(128, Bsize=512)/zygote/CPU/2 thread(s)
45855833
ns45837667
ns1.00
batchedmm(128, Bsize=512)/zygote/CPU/4 thread(s)
33506395.5
ns33412125
ns1.00
batchedmm(128, Bsize=512)/zygote/CPU/8 thread(s)
33525958
ns33550792
ns1.00
batchedmm(128, Bsize=512)/zygote/CPU/1 thread(s)
85233208
ns85694750
ns0.99
batchedmm(128, Bsize=512)/zygote/GPU/CUDA
2804828.5
ns2655274
ns1.06
batchedmm(128, Bsize=512)/zygote/GPU/AMDGPU
3316671
ns3296132
ns1.01
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s)
83646
ns65750
ns1.27
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s)
87875
ns69354
ns1.27
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s)
90333
ns68834
ns1.31
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s)
85687.5
ns67708
ns1.27
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/CUDA
124763.5
ns125224.5
ns1.00
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/oneAPI
3719605
ns3321446
ns1.12
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/Metal
1478042
ns478792
ns3.09
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/AMDGPU
248002.5
ns228082
ns1.09
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s)
442062
ns442083
ns1.00
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s)
451167
ns452104
ns1.00
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s)
444167
ns442208
ns1.00
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s)
441479
ns444791
ns0.99
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/CUDA
747087
ns744155
ns1.00
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/oneAPI
26627384
ns26781484
ns0.99
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/Metal
7697145.5
ns7548250
ns1.02
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/AMDGPU
784227
ns785568
ns1.00
batchnorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/2 thread(s)
1750
ns500
ns3.50
batchnorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/4 thread(s)
1875
ns584
ns3.21
batchnorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/8 thread(s)
2000
ns583
ns3.43
batchnorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/1 thread(s)
1750
ns541
ns3.23
batchnorm(2, act=relu, affine=false)(32 x 32)/forward/GPU/CUDA
38564
ns33459
ns1.15
batchnorm(2, act=relu, affine=false)(32 x 32)/forward/GPU/oneAPI
1190356
ns1181669
ns1.01
batchnorm(2, act=relu, affine=false)(32 x 32)/forward/GPU/Metal
469896
ns266750
ns1.76
batchnorm(2, act=relu, affine=false)(32 x 32)/forward/GPU/AMDGPU
50030
ns47690
ns1.05
batchnorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/2 thread(s)
10250
ns9104.5
ns1.13
batchnorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/4 thread(s)
10938
ns8958
ns1.22
batchnorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/8 thread(s)
11042
ns9375
ns1.18
batchnorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/1 thread(s)
10625
ns8333
ns1.28
batchnorm(2, act=relu, affine=false)(32 x 32)/zygote/GPU/CUDA
286642
ns292729
ns0.98
batchnorm(2, act=relu, affine=false)(32 x 32)/zygote/GPU/oneAPI
21365525.5
ns21877451
ns0.98
batchnorm(2, act=relu, affine=false)(32 x 32)/zygote/GPU/Metal
4815583.5
ns4421083
ns1.09
batchnorm(2, act=relu, affine=false)(32 x 32)/zygote/GPU/AMDGPU
389993
ns376084
ns1.04
dense(32, bias=false, act=gelu)(32 x 128)/forward/CPU/2 thread(s)
9833
ns9834
ns1.00
dense(32, bias=false, act=gelu)(32 x 128)/forward/CPU/4 thread(s)
9834
ns9792
ns1.00
dense(32, bias=false, act=gelu)(32 x 128)/forward/CPU/8 thread(s)
9833
ns9833
ns1
dense(32, bias=false, act=gelu)(32 x 128)/forward/CPU/1 thread(s)
9792
ns9834
ns1.00
dense(32, bias=false, act=gelu)(32 x 128)/forward/GPU/CUDA
23280
ns23819
ns0.98
dense(32, bias=false, act=gelu)(32 x 128)/forward/GPU/oneAPI
2038143
ns1943243
ns1.05
dense(32, bias=false, act=gelu)(32 x 128)/forward/GPU/Metal
228792
ns211083
ns1.08
dense(32, bias=false, act=gelu)(32 x 128)/forward/GPU/AMDGPU
204342
ns209072
ns0.98
dense(32, bias=false, act=gelu)(32 x 128)/zygote/CPU/2 thread(s)
49584
ns45958
ns1.08
dense(32, bias=false, act=gelu)(32 x 128)/zygote/CPU/4 thread(s)
50542
ns46375
ns1.09
dense(32, bias=false, act=gelu)(32 x 128)/zygote/CPU/8 thread(s)
50708
ns46167
ns1.10
dense(32, bias=false, act=gelu)(32 x 128)/zygote/CPU/1 thread(s)
49917
ns45542
ns1.10
dense(32, bias=false, act=gelu)(32 x 128)/zygote/GPU/CUDA
308151
ns297740
ns1.03
dense(32, bias=false, act=gelu)(32 x 128)/zygote/GPU/oneAPI
9437098
ns13019378
ns0.72
dense(32, bias=false, act=gelu)(32 x 128)/zygote/GPU/Metal
1545500
ns1008520.5
ns1.53
dense(32, bias=false, act=gelu)(32 x 128)/zygote/GPU/AMDGPU
603836
ns610991
ns0.99
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s)
62834
ns56250
ns1.12
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s)
64292
ns57125
ns1.13
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s)
64333
ns57125
ns1.13
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s)
64250
ns57708.5
ns1.11
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/CUDA
39886
ns29558.5
ns1.35
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/oneAPI
1257152
ns1212552
ns1.04
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/Metal
638041.5
ns345084
ns1.85
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/AMDGPU
213412
ns204882
ns1.04
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s)
456084
ns449291.5
ns1.02
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s)
488791.5
ns482958
ns1.01
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s)
476146
ns465791
ns1.02
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s)
491750
ns434625
ns1.13
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/CUDA
263616
ns253081.5
ns1.04
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/oneAPI
32507864
ns31946764
ns1.02
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/Metal
9629125
ns9299875.5
ns1.04
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/AMDGPU
891718
ns887358.5
ns1.00
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s)
638875
ns639500
ns1.00
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s)
657062.5
ns610791
ns1.08
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s)
647917
ns650021
ns1.00
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s)
637833
ns613396
ns1.04
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/GPU/CUDA
209655
ns213054.5
ns0.98
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/GPU/oneAPI
8362863.5
ns8304459
ns1.01
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/GPU/Metal
1377917
ns1377667
ns1.00
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/GPU/AMDGPU
308858
ns314248
ns0.98
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s)
2231042
ns2230375
ns1.00
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s)
2234709
ns2241083
ns1.00
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s)
2231770.5
ns2226458
ns1.00
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s)
2224542
ns2044000
ns1.09
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/CUDA
969019
ns1009323.5
ns0.96
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/oneAPI
48212492
ns48595808
ns0.99
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/Metal
7164667
ns10250250
ns0.70
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/AMDGPU
1319082
ns1209503
ns1.09
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s)
36750.5
ns18583
ns1.98
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s)
40083
ns21500
ns1.86
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s)
42416
ns22084
ns1.92
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s)
36146
ns20333
ns1.78
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/CUDA
131167.5
ns115629.5
ns1.13
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/oneAPI
3717220.5
ns3530676
ns1.05
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/Metal
1489541.5
ns529396
ns2.81
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/AMDGPU
89901
ns79871
ns1.13
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s)
221125
ns219583.5
ns1.01
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s)
231999.5
ns228750
ns1.01
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s)
223062.5
ns221395.5
ns1.01
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s)
220250
ns219500
ns1.00
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/CUDA
745440.5
ns743488
ns1.00
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/oneAPI
23316075.5
ns26086313.5
ns0.89
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/Metal
7764958
ns7436521
ns1.04
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/AMDGPU
549685
ns556135
ns0.99
batchnorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/2 thread(s)
6833
ns500
ns13.67
batchnorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/4 thread(s)
7208
ns584
ns12.34
batchnorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/8 thread(s)
7375
ns584
ns12.63
batchnorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/1 thread(s)
6834
ns500
ns13.67
batchnorm(2, act=relu, affine=true)(32 x 32)/forward/GPU/CUDA
33512
ns24005
ns1.40
batchnorm(2, act=relu, affine=true)(32 x 32)/forward/GPU/oneAPI
1223090
ns1194343
ns1.02
batchnorm(2, act=relu, affine=true)(32 x 32)/forward/GPU/Metal
444542
ns283521
ns1.57
batchnorm(2, act=relu, affine=true)(32 x 32)/forward/GPU/AMDGPU
57271
ns47860
ns1.20
batchnorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/2 thread(s)
15833.5
ns9979
ns1.59
batchnorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/4 thread(s)
17146
ns10542
ns1.63
batchnorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/8 thread(s)
17041
ns9687.5
ns1.76
batchnorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/1 thread(s)
16687.5
ns9916.5
ns1.68
batchnorm(2, act=relu, affine=true)(32 x 32)/zygote/GPU/CUDA
282963.5
ns274665.5
ns1.03
batchnorm(2, act=relu, affine=true)(32 x 32)/zygote/GPU/oneAPI
24155856
ns25054245
ns0.96
batchnorm(2, act=relu, affine=true)(32 x 32)/zygote/GPU/Metal
5994417
ns4901583
ns1.22
batchnorm(2, act=relu, affine=true)(32 x 32)/zygote/GPU/AMDGPU
408498.5
ns403794
ns1.01
groupnorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/2 thread(s)
9458.5
ns7750
ns1.22
groupnorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/4 thread(s)
9167
ns8541
ns1.07
groupnorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/8 thread(s)
10416.5
ns9458
ns1.10
groupnorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/1 thread(s)
8541.5
ns10041
ns0.85
groupnorm(2, act=identity, affine=false)(4 x 32)/forward/GPU/CUDA
120739.5
ns122963.5
ns0.98
groupnorm(2, act=identity, affine=false)(4 x 32)/forward/GPU/oneAPI
3418426
ns3342683
ns1.02
groupnorm(2, act=identity, affine=false)(4 x 32)/forward/GPU/Metal
888833.5
ns828959
ns1.07
groupnorm(2, act=identity, affine=false)(4 x 32)/forward/GPU/AMDGPU
70321
ns70460
ns1.00
groupnorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/2 thread(s)
7500
ns7583
ns0.99
groupnorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/4 thread(s)
7667
ns7875
ns0.97
groupnorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/8 thread(s)
7895.5
ns7917
ns1.00
groupnorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/1 thread(s)
7417
ns7208
ns1.03
groupnorm(2, act=identity, affine=false)(4 x 32)/zygote/GPU/CUDA
513454.5
ns521824.5
ns0.98
groupnorm(2, act=identity, affine=false)(4 x 32)/zygote/GPU/oneAPI
17140740
ns17096205
ns1.00
groupnorm(2, act=identity, affine=false)(4 x 32)/zygote/GPU/Metal
3973625
ns3622437.5
ns1.10
groupnorm(2, act=identity, affine=false)(4 x 32)/zygote/GPU/AMDGPU
319713
ns323444
ns0.99
bias_activation(2, act=gelu)(2 x 128)/forward/CPU/2 thread(s)
9354
ns1375
ns6.80
bias_activation(2, act=gelu)(2 x 128)/forward/CPU/4 thread(s)
9542
ns1708
ns5.59
bias_activation(2, act=gelu)(2 x 128)/forward/CPU/8 thread(s)
10583
ns1875
ns5.64
bias_activation(2, act=gelu)(2 x 128)/forward/CPU/1 thread(s)
9229
ns1584
ns5.83
bias_activation(2, act=gelu)(2 x 128)/forward/GPU/CUDA
24142
ns22394
ns1.08
bias_activation(2, act=gelu)(2 x 128)/forward/GPU/oneAPI
1195519
ns1154621
ns1.04
bias_activation(2, act=gelu)(2 x 128)/forward/GPU/Metal
305208
ns310833
ns0.98
bias_activation(2, act=gelu)(2 x 128)/forward/GPU/AMDGPU
190361
ns190371.5
ns1.00
bias_activation(2, act=gelu)(2 x 128)/zygote/CPU/2 thread(s)
4145.5
ns3209
ns1.29
bias_activation(2, act=gelu)(2 x 128)/zygote/CPU/4 thread(s)
4208
ns3333
ns1.26
bias_activation(2, act=gelu)(2 x 128)/zygote/CPU/8 thread(s)
4750
ns3583
ns1.33
bias_activation(2, act=gelu)(2 x 128)/zygote/CPU/1 thread(s)
4167
ns3500
ns1.19
bias_activation(2, act=gelu)(2 x 128)/zygote/GPU/CUDA
226431.5
ns224060
ns1.01
bias_activation(2, act=gelu)(2 x 128)/zygote/GPU/oneAPI
10249888.5
ns9920013
ns1.03
bias_activation(2, act=gelu)(2 x 128)/zygote/GPU/Metal
1679312.5
ns1731417
ns0.97
bias_activation(2, act=gelu)(2 x 128)/zygote/GPU/AMDGPU
577155
ns581006
ns0.99
bias_activation(512, act=tanh)(512 x 128)/forward/CPU/2 thread(s)
155083
ns145687
ns1.06
bias_activation(512, act=tanh)(512 x 128)/forward/CPU/4 thread(s)
136375
ns128584
ns1.06
bias_activation(512, act=tanh)(512 x 128)/forward/CPU/8 thread(s)
140958
ns129625
ns1.09
bias_activation(512, act=tanh)(512 x 128)/forward/CPU/1 thread(s)
232833.5
ns226167
ns1.03
bias_activation(512, act=tanh)(512 x 128)/forward/GPU/CUDA
26998
ns25004
ns1.08
bias_activation(512, act=tanh)(512 x 128)/forward/GPU/oneAPI
1222721.5
ns1165561.5
ns1.05
bias_activation(512, act=tanh)(512 x 128)/forward/GPU/Metal
297875
ns248959
ns1.20
bias_activation(512, act=tanh)(512 x 128)/forward/GPU/AMDGPU
42431
ns40870
ns1.04
bias_activation(512, act=tanh)(512 x 128)/zygote/CPU/2 thread(s)
144458
ns143604
ns1.01
bias_activation(512, act=tanh)(512 x 128)/zygote/CPU/4 thread(s)
127291
ns130083
ns0.98
bias_activation(512, act=tanh)(512 x 128)/zygote/CPU/8 thread(s)
112104.5
ns111208
ns1.01
bias_activation(512, act=tanh)(512 x 128)/zygote/CPU/1 thread(s)
252250
ns251937.5
ns1.00
bias_activation(512, act=tanh)(512 x 128)/zygote/GPU/CUDA
219049
ns224391
ns0.98
bias_activation(512, act=tanh)(512 x 128)/zygote/GPU/oneAPI
10976395
ns10232573
ns1.07
bias_activation(512, act=tanh)(512 x 128)/zygote/GPU/Metal
2074312
ns1955250
ns1.06
bias_activation(512, act=tanh)(512 x 128)/zygote/GPU/AMDGPU
265923
ns267492
ns0.99
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s)
8583
ns7208
ns1.19
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s)
7292
ns6083
ns1.20
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s)
7292
ns6000
ns1.22
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s)
11333
ns10458
ns1.08
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/CUDA
38422
ns34049
ns1.13
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/oneAPI
1208855
ns1180224
ns1.02
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/Metal
374313
ns325584
ns1.15
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/AMDGPU
51281
ns50630
ns1.01
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s)
221417
ns219688
ns1.01
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s)
229791
ns237125
ns0.97
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s)
230458.5
ns228500
ns1.01
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s)
214041.5
ns212875
ns1.01
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/CUDA
259283
ns270641
ns0.96
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/oneAPI
27623036
ns29882407
ns0.92
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/Metal
8241896
ns8193250
ns1.01
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/AMDGPU
592306
ns592361
ns1.00
layernorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/2 thread(s)
15479
ns14125
ns1.10
layernorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/4 thread(s)
15375
ns15291.5
ns1.01
layernorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/8 thread(s)
17458
ns16792
ns1.04
layernorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/1 thread(s)
15542
ns16000
ns0.97
layernorm(2, act=gelu, affine=false)(32 x 32)/forward/GPU/CUDA
137835
ns143262
ns0.96
layernorm(2, act=gelu, affine=false)(32 x 32)/forward/GPU/oneAPI
5606741
ns5352196.5
ns1.05
layernorm(2, act=gelu, affine=false)(32 x 32)/forward/GPU/Metal
778728.5
ns756916.5
ns1.03
layernorm(2, act=gelu, affine=false)(32 x 32)/forward/GPU/AMDGPU
231852
ns233592
ns0.99
layernorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/2 thread(s)
23417
ns23895.5
ns0.98
layernorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/4 thread(s)
23791
ns24041.5
ns0.99
layernorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/8 thread(s)
24000
ns23542
ns1.02
layernorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/1 thread(s)
23937
ns23667
ns1.01
layernorm(2, act=gelu, affine=false)(32 x 32)/zygote/GPU/CUDA
858271
ns888831
ns0.97
layernorm(2, act=gelu, affine=false)(32 x 32)/zygote/GPU/oneAPI
38888220
ns38279760.5
ns1.02
layernorm(2, act=gelu, affine=false)(32 x 32)/zygote/GPU/Metal
5635500
ns5301166.5
ns1.06
layernorm(2, act=gelu, affine=false)(32 x 32)/zygote/GPU/AMDGPU
677086
ns679602
ns1.00
groupnorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/2 thread(s)
26604
ns8875
ns3.00
groupnorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/4 thread(s)
28250
ns9250
ns3.05
groupnorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/8 thread(s)
31333
ns11313
ns2.77
groupnorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/1 thread(s)
26812.5
ns9834
ns2.73
groupnorm(2, act=identity, affine=true)(32 x 32)/forward/GPU/CUDA
137010
ns126441
ns1.08
groupnorm(2, act=identity, affine=true)(32 x 32)/forward/GPU/oneAPI
3519222
ns3425975
ns1.03
groupnorm(2, act=identity, affine=true)(32 x 32)/forward/GPU/Metal
925417
ns886021
ns1.04
groupnorm(2, act=identity, affine=true)(32 x 32)/forward/GPU/AMDGPU
82411
ns73581
ns1.12
groupnorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/2 thread(s)
14792
ns14000
ns1.06
groupnorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/4 thread(s)
15708
ns14166.5
ns1.11
groupnorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/8 thread(s)
16000
ns14541
ns1.10
groupnorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/1 thread(s)
15645.5
ns13875
ns1.13
groupnorm(2, act=identity, affine=true)(32 x 32)/zygote/GPU/CUDA
668142
ns686454
ns0.97
groupnorm(2, act=identity, affine=true)(32 x 32)/zygote/GPU/oneAPI
20824375
ns21159530.5
ns0.98
groupnorm(2, act=identity, affine=true)(32 x 32)/zygote/GPU/Metal
5325770.5
ns5057854
ns1.05
groupnorm(2, act=identity, affine=true)(32 x 32)/zygote/GPU/AMDGPU
366524
ns368623
ns0.99
groupnorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/2 thread(s)
9312.5
ns6833
ns1.36
groupnorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/4 thread(s)
9416
ns9645.5
ns0.98
groupnorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/8 thread(s)
10583
ns10959
ns0.97
groupnorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/1 thread(s)
9542
ns9125
ns1.05
groupnorm(2, act=identity, affine=false)(32 x 32)/forward/GPU/CUDA
121280
ns125289
ns0.97
groupnorm(2, act=identity, affine=false)(32 x 32)/forward/GPU/oneAPI
3386034.5
ns3340336.5
ns1.01
groupnorm(2, act=identity, affine=false)(32 x 32)/forward/GPU/Metal
932375
ns858667
ns1.09
groupnorm(2, act=identity, affine=false)(32 x 32)/forward/GPU/AMDGPU
72561
ns73441
ns0.99
groupnorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/2 thread(s)
12354
ns12750
ns0.97
groupnorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/4 thread(s)
13000
ns12875
ns1.01
groupnorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/8 thread(s)
13042
ns12959
ns1.01
groupnorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/1 thread(s)
12458.5
ns12584
ns0.99
groupnorm(2, act=identity, affine=false)(32 x 32)/zygote/GPU/CUDA
545614
ns568824
ns0.96
groupnorm(2, act=identity, affine=false)(32 x 32)/zygote/GPU/oneAPI
19231212.5
ns20335817
ns0.95
groupnorm(2, act=identity, affine=false)(32 x 32)/zygote/GPU/Metal
4752396
ns4008167
ns1.19
groupnorm(2, act=identity, affine=false)(32 x 32)/zygote/GPU/AMDGPU
340553.5
ns341833
ns1.00
batchedmm(2, Bsize=128)/forward/CPU/2 thread(s)
26958
ns26604
ns1.01
batchedmm(2, Bsize=128)/forward/CPU/4 thread(s)
34792
ns35042
ns0.99
batchedmm(2, Bsize=128)/forward/CPU/8 thread(s)
32041.5
ns31437.5
ns1.02
batchedmm(2, Bsize=128)/forward/CPU/1 thread(s)
1958.5
ns1958
ns1.00
batchedmm(2, Bsize=128)/forward/GPU/CUDA
16169
ns16488
ns0.98
batchedmm(2, Bsize=128)/forward/GPU/AMDGPU
80481
ns80881
ns1.00
batchedmm(2, Bsize=128)/zygote/CPU/2 thread(s)
6042
ns5354
ns1.13
batchedmm(2, Bsize=128)/zygote/CPU/4 thread(s)
6208
ns5271
ns1.18
batchedmm(2, Bsize=128)/zygote/CPU/8 thread(s)
6520.5
ns5375
ns1.21
batchedmm(2, Bsize=128)/zygote/CPU/1 thread(s)
6834
ns6417
ns1.06
batchedmm(2, Bsize=128)/zygote/GPU/CUDA
141884.5
ns144829.5
ns0.98
batchedmm(2, Bsize=128)/zygote/GPU/AMDGPU
371004
ns371354
ns1.00
batchnorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/2 thread(s)
6458
ns250
ns25.83
batchnorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/4 thread(s)
6834
ns417
ns16.39
batchnorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/8 thread(s)
6875
ns375
ns18.33
batchnorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/1 thread(s)
6375
ns334
ns19.09
batchnorm(2, act=identity, affine=true)(4 x 32)/forward/GPU/CUDA
34623
ns26201
ns1.32
batchnorm(2, act=identity, affine=true)(4 x 32)/forward/GPU/oneAPI
1293519
ns1213684
ns1.07
batchnorm(2, act=identity, affine=true)(4 x 32)/forward/GPU/Metal
457312.5
ns435084
ns1.05
batchnorm(2, act=identity, affine=true)(4 x 32)/forward/GPU/AMDGPU
56171
ns47131
ns1.19
batchnorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/2 thread(s)
12916
ns6417
ns2.01
batchnorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/4 thread(s)
13791
ns6666
ns2.07
batchnorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/8 thread(s)
14084
ns6708
ns2.10
batchnorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/1 thread(s)
13042
ns6541
ns1.99
batchnorm(2, act=identity, affine=true)(4 x 32)/zygote/GPU/CUDA
198569
ns192082.5
ns1.03
batchnorm(2, act=identity, affine=true)(4 x 32)/zygote/GPU/oneAPI
24424265
ns23595307
ns1.04
batchnorm(2, act=identity, affine=true)(4 x 32)/zygote/GPU/Metal
5453125
ns4957208
ns1.10
batchnorm(2, act=identity, affine=true)(4 x 32)/zygote/GPU/AMDGPU
396759
ns388663.5
ns1.02
batchnorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/2 thread(s)
8292
ns1917
ns4.33
batchnorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/4 thread(s)
8625
ns2000
ns4.31
batchnorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/8 thread(s)
8833
ns2042
ns4.33
batchnorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/1 thread(s)
8333
ns1959
ns4.25
batchnorm(2, act=gelu, affine=true)(32 x 32)/forward/GPU/CUDA
35748
ns26999
ns1.32
batchnorm(2, act=gelu, affine=true)(32 x 32)/forward/GPU/oneAPI
1189931
ns1208214.5
ns0.98
batchnorm(2, act=gelu, affine=true)(32 x 32)/forward/GPU/Metal
324084
ns281958
ns1.15
batchnorm(2, act=gelu, affine=true)(32 x 32)/forward/GPU/AMDGPU
215022
ns206222
ns1.04
batchnorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/2 thread(s)
22770.5
ns16312.5
ns1.40
batchnorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/4 thread(s)
23812.5
ns17020.5
ns1.40
batchnorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/8 thread(s)
24291.5
ns16562.5
ns1.47
batchnorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/1 thread(s)
22458
ns16437.5
ns1.37
batchnorm(2, act=gelu, affine=true)(32 x 32)/zygote/GPU/CUDA
284982
ns281291
ns1.01
batchnorm(2, act=gelu, affine=true)(32 x 32)/zygote/GPU/oneAPI
26514332
ns25314200
ns1.05
batchnorm(2, act=gelu, affine=true)(32 x 32)/zygote/GPU/Metal
5718333
ns5387167
ns1.06
batchnorm(2, act=gelu, affine=true)(32 x 32)/zygote/GPU/AMDGPU
709637
ns705642
ns1.01
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s)
149125
ns148250
ns1.01
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s)
155917
ns175104
ns0.89
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s)
152500
ns154500
ns0.99
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s)
148250
ns148375
ns1.00
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/CUDA
200827
ns210020
ns0.96
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/oneAPI
7732253
ns7920169
ns0.98
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/Metal
1424250.5
ns1553375
ns0.92
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/AMDGPU
214342
ns236022
ns0.91
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s)
1322854.5
ns1326125
ns1.00
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s)
1324334
ns1317625
ns1.01
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s)
1306187.5
ns1267583
ns1.03
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s)
1319750
ns1330208
ns0.99
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/CUDA
894838
ns941055
ns0.95
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/oneAPI
47138720.5
ns46042204
ns1.02
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/Metal
6451042
ns9797270.5
ns0.66
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/AMDGPU
1104625
ns1107606
ns1.00
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s)
25541.5
ns23542
ns1.08
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s)
25166
ns25167
ns1.00
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s)
27666
ns28437.5
ns0.97
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s)
24084
ns24917
ns0.97
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/CUDA
236708
ns241297.5
ns0.98
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/oneAPI
7576228.5
ns7644187.5
ns0.99
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/Metal
1207792
ns558625
ns2.16
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/AMDGPU
114481
ns114946.5
ns1.00
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s)
117291.5
ns174646
ns0.67
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s)
119125.5
ns167916
ns0.71
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s)
119021
ns119708.5
ns0.99
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s)
129000
ns126750
ns1.02
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/CUDA
1066520
ns1108737
ns0.96
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/oneAPI
44035334
ns45003191
ns0.98
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/Metal
6154750
ns5870834
ns1.05
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/AMDGPU
614935
ns610886
ns1.01
batchnorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/2 thread(s)
6417
ns250
ns25.67
batchnorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/4 thread(s)
6750
ns417
ns16.19
batchnorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/8 thread(s)
6875
ns375
ns18.33
batchnorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/1 thread(s)
6458
ns250
ns25.83
batchnorm(2, act=relu, affine=true)(4 x 32)/forward/GPU/CUDA
32046
ns23373.5
ns1.37
batchnorm(2, act=relu, affine=true)(4 x 32)/forward/GPU/oneAPI
1188985
ns1207385.5
ns0.98
batchnorm(2, act=relu, affine=true)(4 x 32)/forward/GPU/Metal
304791.5
ns274541
ns1.11
batchnorm(2, act=relu, affine=true)(4 x 32)/forward/GPU/AMDGPU
56421
ns47321
ns1.19
batchnorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/2 thread(s)
12958
ns6458
ns2.01
batchnorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/4 thread(s)
13958
ns6708
ns2.08
batchnorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/8 thread(s)
14104
ns6625
ns2.13
batchnorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/1 thread(s)
12979.5
ns6521
ns1.99
batchnorm(2, act=relu, affine=true)(4 x 32)/zygote/GPU/CUDA
219681.5
ns207930.5
ns1.06
batchnorm(2, act=relu, affine=true)(4 x 32)/zygote/GPU/oneAPI
24127200
ns24020738
ns1.00
batchnorm(2, act=relu, affine=true)(4 x 32)/zygote/GPU/Metal
5367125
ns5321979
ns1.01
batchnorm(2, act=relu, affine=true)(4 x 32)/zygote/GPU/AMDGPU
404804
ns394454
ns1.03
layernorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/2 thread(s)
6042
ns5125
ns1.18
layernorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/4 thread(s)
6958
ns6000
ns1.16
layernorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/8 thread(s)
8000
ns7375
ns1.08
layernorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/1 thread(s)
5812.5
ns5500
ns1.06
layernorm(2, act=gelu, affine=true)(4 x 32)/forward/GPU/CUDA
143745
ns148415.5
ns0.97
layernorm(2, act=gelu, affine=true)(4 x 32)/forward/GPU/oneAPI
5574883
ns5743209.5
ns0.97
layernorm(2, act=gelu, affine=true)(4 x 32)/forward/GPU/Metal
721833
ns438042
ns1.65
layernorm(2, act=gelu, affine=true)(4 x 32)/forward/GPU/AMDGPU
232722
ns233753
ns1.00
layernorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/2 thread(s)
9875
ns9708.5
ns1.02
layernorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/4 thread(s)
10417
ns10500
ns0.99
layernorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/8 thread(s)
10375
ns10292
ns1.01
layernorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/1 thread(s)
10083.5
ns10000
ns1.01
layernorm(2, act=gelu, affine=true)(4 x 32)/zygote/GPU/CUDA
893962
ns921993
ns0.97
layernorm(2, act=gelu, affine=true)(4 x 32)/zygote/GPU/oneAPI
40305685.5
ns40800221
ns0.99
layernorm(2, act=gelu, affine=true)(4 x 32)/zygote/GPU/Metal
6022625
ns5516833
ns1.09
layernorm(2, act=gelu, affine=true)(4 x 32)/zygote/GPU/AMDGPU
667866
ns673881.5
ns0.99
dense(2, bias=false, act=gelu)(2 x 128)/forward/CPU/2 thread(s)
666
ns625
ns1.07
dense(2, bias=false, act=gelu)(2 x 128)/forward/CPU/4 thread(s)
666
ns625
ns1.07
dense(2, bias=false, act=gelu)(2 x 128)/forward/CPU/8 thread(s)
666
ns666
ns1
dense(2, bias=false, act=gelu)(2 x 128)/forward/CPU/1 thread(s)
667
ns667
ns1
dense(2, bias=false, act=gelu)(2 x 128)/forward/GPU/CUDA
22221.5
ns22961
ns0.97
dense(2, bias=false, act=gelu)(2 x 128)/forward/GPU/oneAPI
2080787.5
ns2040345
ns1.02
dense(2, bias=false, act=gelu)(2 x 128)/forward/GPU/Metal
253958
ns205708
ns1.23
dense(2, bias=false, act=gelu)(2 x 128)/forward/GPU/AMDGPU
206192
ns207722.5
ns0.99
dense(2, bias=false, act=gelu)(2 x 128)/zygote/CPU/2 thread(s)
7958
ns4625
ns1.72
dense(2, bias=false, act=gelu)(2 x 128)/zygote/CPU/4 thread(s)
8833
ns4958
ns1.78
dense(2, bias=false, act=gelu)(2 x 128)/zygote/CPU/8 thread(s)
8875
ns4792
ns1.85
dense(2, bias=false, act=gelu)(2 x 128)/zygote/CPU/1 thread(s)
8041
ns4625
ns1.74
dense(2, bias=false, act=gelu)(2 x 128)/zygote/GPU/CUDA
238671.5
ns232829.5
ns1.03
dense(2, bias=false, act=gelu)(2 x 128)/zygote/GPU/oneAPI
10055023.5
ns11262701.5
ns0.89
dense(2, bias=false, act=gelu)(2 x 128)/zygote/GPU/Metal
1611250
ns1643083.5
ns0.98
dense(2, bias=false, act=gelu)(2 x 128)/zygote/GPU/AMDGPU
575715
ns580356
ns0.99
groupnorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/2 thread(s)
24208
ns8166
ns2.96
groupnorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/4 thread(s)
26562.5
ns8250
ns3.22
groupnorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/8 thread(s)
29458
ns9458
ns3.11
groupnorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/1 thread(s)
25313
ns8979.5
ns2.82
groupnorm(2, act=identity, affine=true)(4 x 32)/forward/GPU/CUDA
134686.5
ns124075.5
ns1.09
groupnorm(2, act=identity, affine=true)(4 x 32)/forward/GPU/oneAPI
3666858.5
ns3484097
ns1.05
groupnorm(2, act=identity, affine=true)(4 x 32)/forward/GPU/Metal
819479.5
ns848979
ns0.97
groupnorm(2, act=identity, affine=true)(4 x 32)/forward/GPU/AMDGPU
82871
ns73621
ns1.13
groupnorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/2 thread(s)
9833
ns8396
ns1.17
groupnorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/4 thread(s)
10750
ns8584
ns1.25
groupnorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/8 thread(s)
10584
ns9084
ns1.17
groupnorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/1 thread(s)
9959
ns8334
ns1.19
groupnorm(2, act=identity, affine=true)(4 x 32)/zygote/GPU/CUDA
592874
ns601403
ns0.99
groupnorm(2, act=identity, affine=true)(4 x 32)/zygote/GPU/oneAPI
20598079.5
ns21381887.5
ns0.96
groupnorm(2, act=identity, affine=true)(4 x 32)/zygote/GPU/Metal
4586229.5
ns4049604
ns1.13
groupnorm(2, act=identity, affine=true)(4 x 32)/zygote/GPU/AMDGPU
342583
ns345603
ns0.99
batchedmm(128, Bsize=4)/forward/CPU/2 thread(s)
125959
ns123354
ns1.02
batchedmm(128, Bsize=4)/forward/CPU/4 thread(s)
129958
ns130833
ns0.99
batchedmm(128, Bsize=4)/forward/CPU/8 thread(s)
130021
ns130292
ns1.00
batchedmm(128, Bsize=4)/forward/CPU/1 thread(s)
181187.5
ns183083
ns0.99
batchedmm(128, Bsize=4)/forward/GPU/CUDA
45830
ns46276
ns0.99
batchedmm(128, Bsize=4)/forward/GPU/AMDGPU
105671
ns100861
ns1.05
batchedmm(128, Bsize=4)/zygote/CPU/2 thread(s)
325125
ns331291
ns0.98
batchedmm(128, Bsize=4)/zygote/CPU/4 thread(s)
323667
ns336312.5
ns0.96
batchedmm(128, Bsize=4)/zygote/CPU/8 thread(s)
316417
ns332416.5
ns0.95
batchedmm(128, Bsize=4)/zygote/CPU/1 thread(s)
616792
ns584792
ns1.05
batchedmm(128, Bsize=4)/zygote/GPU/CUDA
194713
ns195249
ns1.00
batchedmm(128, Bsize=4)/zygote/GPU/AMDGPU
508449.5
ns504285
ns1.01
dense(512, bias=true, act=identity)(512 x 128)/forward/CPU/2 thread(s)
400583
ns396500
ns1.01
dense(512, bias=true, act=identity)(512 x 128)/forward/CPU/4 thread(s)
290666
ns287958
ns1.01
dense(512, bias=true, act=identity)(512 x 128)/forward/CPU/8 thread(s)
291292
ns288167
ns1.01
dense(512, bias=true, act=identity)(512 x 128)/forward/CPU/1 thread(s)
759541
ns756292
ns1.00
dense(512, bias=true, act=identity)(512 x 128)/forward/GPU/CUDA
51490
ns43813
ns1.18
dense(512, bias=true, act=identity)(512 x 128)/forward/GPU/oneAPI
1452694
ns1397680
ns1.04
dense(512, bias=true, act=identity)(512 x 128)/forward/GPU/Metal
458875
ns359646
ns1.28
dense(512, bias=true, act=identity)(512 x 128)/forward/GPU/AMDGPU
84931
ns81271
ns1.05
dense(512, bias=true, act=identity)(512 x 128)/zygote/CPU/2 thread(s)
1458459
ns1447584
ns1.01
dense(512, bias=true, act=identity)(512 x 128)/zygote/CPU/4 thread(s)
1140687.5
ns1133917
ns1.01
dense(512, bias=true, act=identity)(512 x 128)/zygote/CPU/8 thread(s)
1149666.5
ns1135166.5
ns1.01
dense(512, bias=true, act=identity)(512 x 128)/zygote/CPU/1 thread(s)
2451791
ns2356062
ns1.04
dense(512, bias=true, act=identity)(512 x 128)/zygote/GPU/CUDA
274619
ns251976
ns1.09
dense(512, bias=true, act=identity)(512 x 128)/zygote/GPU/oneAPI
10187838
ns10628240
ns0.96
dense(512, bias=true, act=identity)(512 x 128)/zygote/GPU/Metal
1914208
ns1770646
ns1.08
dense(512, bias=true, act=identity)(512 x 128)/zygote/GPU/AMDGPU
358283
ns350644
ns1.02
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s)
633666
ns641750
ns0.99
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s)
663666.5
ns660333
ns1.01
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s)
645687.5
ns656625
ns0.98
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s)
632541
ns541646
ns1.17
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/CUDA
200663
ns206977
ns0.97
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/oneAPI
8382472
ns8394592
ns1.00
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/Metal
1352979.5
ns1331770.5
ns1.02
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/AMDGPU
307532.5
ns313564
ns0.98
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s)
2467667
ns2445250
ns1.01
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s)
2454750
ns2456229
ns1.00
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s)
2454500
ns2446833.5
ns1.00
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s)
2451167
ns2483750
ns0.99
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/CUDA
984218.5
ns1018661.5
ns0.97
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/oneAPI
50225251.5
ns53769994.5
ns0.93
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/Metal
7766292
ns9019125
ns0.86
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/AMDGPU
1380642
ns1436974
ns0.96
batchedmm(2, Bsize=32)/forward/CPU/2 thread(s)
32292
ns28875
ns1.12
batchedmm(2, Bsize=32)/forward/CPU/4 thread(s)
36875
ns36438
ns1.01
batchedmm(2, Bsize=32)/forward/CPU/8 thread(s)
34000
ns34354
ns0.99
batchedmm(2, Bsize=32)/forward/CPU/1 thread(s)
958.5
ns833
ns1.15
batchedmm(2, Bsize=32)/forward/GPU/CUDA
15278.5
ns15679
ns0.97
batchedmm(2, Bsize=32)/forward/GPU/AMDGPU
78690.5
ns79081
ns1.00
batchedmm(2, Bsize=32)/zygote/CPU/2 thread(s)
3792
ns3125
ns1.21
batchedmm(2, Bsize=32)/zygote/CPU/4 thread(s)
4333
ns3333
ns1.30
batchedmm(2, Bsize=32)/zygote/CPU/8 thread(s)
4583.5
ns3542
ns1.29
batchedmm(2, Bsize=32)/zygote/CPU/1 thread(s)
4124.5
ns3042
ns1.36
batchedmm(2, Bsize=32)/zygote/GPU/CUDA
140987
ns141592
ns1.00
batchedmm(2, Bsize=32)/zygote/GPU/AMDGPU
336043
ns340828.5
ns0.99
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s)
413209
ns404000
ns1.02
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s)
415792
ns408458
ns1.02
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s)
416395.5
ns407958
ns1.02
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s)
427145.5
ns420750
ns1.02
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/CUDA
54475
ns44015
ns1.24
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/oneAPI
1394063.5
ns1346061
ns1.04
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/Metal
1198125
ns1099750
ns1.09
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/AMDGPU
250702
ns240182
ns1.04
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s)
3877833
ns3854416
ns1.01
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s)
3995771
ns3977416.5
ns1.00
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s)
3886792
ns3995708.5
ns0.97
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s)
3754728.5
ns3786812.5
ns0.99
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/CUDA
255856
ns247915
ns1.03
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/oneAPI
37405566.5
ns38628061.5
ns0.97
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/Metal
11943833
ns11941666
ns1.00
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/AMDGPU
1432843
ns1249207.5
ns1.15
dense(32, bias=false, act=identity)(32 x 128)/forward/CPU/2 thread(s)
3958
ns4000
ns0.99
dense(32, bias=false, act=identity)(32 x 128)/forward/CPU/4 thread(s)
3917
ns3917
ns1
dense(32, bias=false, act=identity)(32 x 128)/forward/CPU/8 thread(s)
3917
ns3917
ns1
dense(32, bias=false, act=identity)(32 x 128)/forward/CPU/1 thread(s)
3917
ns3917
ns1
dense(32, bias=false, act=identity)(32 x 128)/forward/GPU/CUDA
33843
ns34055
ns0.99
dense(32, bias=false, act=identity)(32 x 128)/forward/GPU/oneAPI
1197554
ns1242873
ns0.96
dense(32, bias=false, act=identity)(32 x 128)/forward/GPU/Metal
177584
ns160875
ns1.10
dense(32, bias=false, act=identity)(32 x 128)/forward/GPU/AMDGPU
37790.5
ns38220
ns0.99
dense(32, bias=false, act=identity)(32 x 128)/zygote/CPU/2 thread(s)
19500
ns15625
ns1.25
dense(32, bias=false, act=identity)(32 x 128)/zygote/CPU/4 thread(s)
20083
ns15958
ns1.26
dense(32, bias=false, act=identity)(32 x 128)/zygote/CPU/8 thread(s)
20375
ns15958
ns1.28
dense(32, bias=false, act=identity)(32 x 128)/zygote/CPU/1 thread(s)
19708
ns15625
ns1.26
dense(32, bias=false, act=identity)(32 x 128)/zygote/GPU/CUDA
265715
ns257530
ns1.03
dense(32, bias=false, act=identity)(32 x 128)/zygote/GPU/oneAPI
10200910.5
ns8798187
ns1.16
dense(32, bias=false, act=identity)(32 x 128)/zygote/GPU/Metal
870334
ns839395.5
ns1.04
dense(32, bias=false, act=identity)(32 x 128)/zygote/GPU/AMDGPU
178112
ns167922
ns1.06
dense(512, bias=false, act=relu)(512 x 128)/forward/CPU/2 thread(s)
404500
ns403667
ns1.00
dense(512, bias=false, act=relu)(512 x 128)/forward/CPU/4 thread(s)
296167
ns295750
ns1.00
dense(512, bias=false, act=relu)(512 x 128)/forward/CPU/8 thread(s)
295667
ns295750
ns1.00
dense(512, bias=false, act=relu)(512 x 128)/forward/CPU/1 thread(s)
760375
ns760166
ns1.00
dense(512, bias=false, act=relu)(512 x 128)/forward/GPU/CUDA
112966
ns113514
ns1.00
dense(512, bias=false, act=relu)(512 x 128)/forward/GPU/oneAPI
1017004.5
ns1017055
ns1.00
dense(512, bias=false, act=relu)(512 x 128)/forward/GPU/Metal
439666
ns326291.5
ns1.35
dense(512, bias=false, act=relu)(512 x 128)/forward/GPU/AMDGPU
87800
ns87391
ns1.00
dense(512, bias=false, act=relu)(512 x 128)/zygote/CPU/2 thread(s)
1477375
ns1472208
ns1.00
dense(512, bias=false, act=relu)(512 x 128)/zygote/CPU/4 thread(s)
1147125
ns1161500
ns0.99
dense(512, bias=false, act=relu)(512 x 128)/zygote/CPU/8 thread(s)
1158208
ns1160625
ns1.00
dense(512, bias=false, act=relu)(512 x 128)/zygote/CPU/1 thread(s)
2470770.5
ns2378291
ns1.04
dense(512, bias=false, act=relu)(512 x 128)/zygote/GPU/CUDA
253070
ns245391
ns1.03
dense(512, bias=false, act=relu)(512 x 128)/zygote/GPU/oneAPI
8892806
ns10232371
ns0.87
dense(512, bias=false, act=relu)(512 x 128)/zygote/GPU/Metal
1857708
ns1858625
ns1.00
dense(512, bias=false, act=relu)(512 x 128)/zygote/GPU/AMDGPU
354333
ns356813
ns0.99
batchnorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/2 thread(s)
6708
ns500
ns13.42
batchnorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/4 thread(s)
7125
ns583
ns12.22
batchnorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/8 thread(s)
7125
ns583
ns12.22
batchnorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/1 thread(s)
6666
ns500
ns13.33
batchnorm(2, act=gelu, affine=true)(4 x 32)/forward/GPU/CUDA
34849
ns26329.5
ns1.32
batchnorm(2, act=gelu, affine=true)(4 x 32)/forward/GPU/oneAPI
1195460
ns1165109.5
ns1.03
batchnorm(2, act=gelu, affine=true)(4 x 32)/forward/GPU/Metal
444292
ns458750
ns0.97
batchnorm(2, act=gelu, affine=true)(4 x 32)/forward/GPU/AMDGPU
216212
ns207592
ns1.04
batchnorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/2 thread(s)
14000
ns7458
ns1.88
batchnorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/4 thread(s)
15125
ns7958
ns1.90
batchnorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/8 thread(s)
15584
ns7833
ns1.99
batchnorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/1 thread(s)
14250
ns7500
ns1.90
batchnorm(2, act=gelu, affine=true)(4 x 32)/zygote/GPU/CUDA
223556.5
ns220362.5
ns1.01
batchnorm(2, act=gelu, affine=true)(4 x 32)/zygote/GPU/oneAPI
27158587
ns24956286.5
ns1.09
batchnorm(2, act=gelu, affine=true)(4 x 32)/zygote/GPU/Metal
5345375.5
ns4949916.5
ns1.08
batchnorm(2, act=gelu, affine=true)(4 x 32)/zygote/GPU/AMDGPU
696921.5
ns695677
ns1.00
batchedmm(128, Bsize=32)/forward/CPU/2 thread(s)
832708
ns824979
ns1.01
batchedmm(128, Bsize=32)/forward/CPU/4 thread(s)
618166
ns619166
ns1.00
batchedmm(128, Bsize=32)/forward/CPU/8 thread(s)
611542
ns619291
ns0.99
batchedmm(128, Bsize=32)/forward/CPU/1 thread(s)
1540812.5
ns1521750
ns1.01
batchedmm(128, Bsize=32)/forward/GPU/CUDA
130337.5
ns130530.5
ns1.00
batchedmm(128, Bsize=32)/forward/GPU/AMDGPU
224742
ns228943
ns0.98
batchedmm(128, Bsize=32)/zygote/CPU/2 thread(s)
2662417
ns2673291.5
ns1.00
batchedmm(128, Bsize=32)/zygote/CPU/4 thread(s)
2007708
ns2003917
ns1.00
batchedmm(128, Bsize=32)/zygote/CPU/8 thread(s)
2003084
ns2004458
ns1.00
batchedmm(128, Bsize=32)/zygote/CPU/1 thread(s)
4932771
ns4938271
ns1.00
batchedmm(128, Bsize=32)/zygote/GPU/CUDA
261909.5
ns246670.5
ns1.06
batchedmm(128, Bsize=32)/zygote/GPU/AMDGPU
835813
ns761778
ns1.10
batchnorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/2 thread(s)
1375
ns291
ns4.73
batchnorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/4 thread(s)
1542
ns375
ns4.11
batchnorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/8 thread(s)
1583
ns333
ns4.75
batchnorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/1 thread(s)
1375
ns250
ns5.50
batchnorm(2, act=relu, affine=false)(4 x 32)/forward/GPU/CUDA
36888
ns32758
ns1.13
batchnorm(2, act=relu, affine=false)(4 x 32)/forward/GPU/oneAPI
1237057
ns1196400
ns1.03
batchnorm(2, act=relu, affine=false)(4 x 32)/forward/GPU/Metal
366667
ns263500
ns1.39
batchnorm(2, act=relu, affine=false)(4 x 32)/forward/GPU/AMDGPU
49661
ns46921
ns1.06
batchnorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/2 thread(s)
7687.5
ns6542
ns1.18
batchnorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/4 thread(s)
8542
ns6833
ns1.25
batchnorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/8 thread(s)
8291
ns6667
ns1.24
batchnorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/1 thread(s)
7958
ns6333
ns1.26
batchnorm(2, act=relu, affine=false)(4 x 32)/zygote/GPU/CUDA
219381
ns229162.5
ns0.96
batchnorm(2, act=relu, affine=false)(4 x 32)/zygote/GPU/oneAPI
20912774
ns21326390
ns0.98
batchnorm(2, act=relu, affine=false)(4 x 32)/zygote/GPU/Metal
4917833
ns4918333
ns1.00
batchnorm(2, act=relu, affine=false)(4 x 32)/zygote/GPU/AMDGPU
375813
ns360398.5
ns1.04
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s)
2401916.5
ns2389042
ns1.01
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s)
2401583
ns2375416
ns1.01
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s)
2379416
ns2399208
ns0.99
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s)
2371833
ns2395167
ns0.99
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/CUDA
198341.5
ns205752
ns0.96
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/oneAPI
8113331.5
ns7986200
ns1.02
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/Metal
2274958
ns1428354
ns1.59
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/AMDGPU
374084
ns375378.5
ns1.00
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s)
4636458
ns4650833
ns1.00
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s)
4653166.5
ns4663624.5
ns1.00
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s)
4641125
ns4666416.5
ns0.99
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s)
4652750
ns4657125
ns1.00
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/CUDA
889968
ns922860
ns0.96
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/oneAPI
46625557
ns50907571
ns0.92
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/Metal
6404438
ns6979416.5
ns0.92
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/AMDGPU
1356447.5
ns1386483.5
ns0.98
bias_activation(512, act=relu)(512 x 128)/forward/CPU/2 thread(s)
17208.5
ns13458.5
ns1.28
bias_activation(512, act=relu)(512 x 128)/forward/CPU/4 thread(s)
14583
ns7333
ns1.99
bias_activation(512, act=relu)(512 x 128)/forward/CPU/8 thread(s)
16313
ns7708
ns2.12
bias_activation(512, act=relu)(512 x 128)/forward/CPU/1 thread(s)
21229
ns6416.5
ns3.31
bias_activation(512, act=relu)(512 x 128)/forward/GPU/CUDA
25470
ns23918
ns1.06
bias_activation(512, act=relu)(512 x 128)/forward/GPU/oneAPI
1159611
ns1244282
ns0.93
bias_activation(512, act=relu)(512 x 128)/forward/GPU/Metal
267750
ns235958
ns1.13
bias_activation(512, act=relu)(512 x 128)/forward/GPU/AMDGPU
42811
ns40260
ns1.06
bias_activation(512, act=relu)(512 x 128)/zygote/CPU/2 thread(s)
45146
ns46271
ns0.98
bias_activation(512, act=relu)(512 x 128)/zygote/CPU/4 thread(s)
49833
ns63375
ns0.79
bias_activation(512, act=relu)(512 x 128)/zygote/CPU/8 thread(s)
34417
ns52500
ns0.66
bias_activation(512, act=relu)(512 x 128)/zygote/CPU/1 thread(s)
73000.5
ns33708.5
ns2.17
bias_activation(512, act=relu)(512 x 128)/zygote/GPU/CUDA
218060
ns220952
ns0.99
bias_activation(512, act=relu)(512 x 128)/zygote/GPU/oneAPI
10466633
ns10877336.5
ns0.96
bias_activation(512, act=relu)(512 x 128)/zygote/GPU/Metal
2129250
ns1059416
ns2.01
bias_activation(512, act=relu)(512 x 128)/zygote/GPU/AMDGPU
268402
ns264808
ns1.01
batchedmm(2, Bsize=512)/forward/CPU/2 thread(s)
20459
ns20208.5
ns1.01
batchedmm(2, Bsize=512)/forward/CPU/4 thread(s)
26208
ns25708
ns1.02
batchedmm(2, Bsize=512)/forward/CPU/8 thread(s)
25292
ns24770.5
ns1.02
batchedmm(2, Bsize=512)/forward/CPU/1 thread(s)
5333.5
ns5291
ns1.01
batchedmm(2, Bsize=512)/forward/GPU/CUDA
16594
ns17145
ns0.97
batchedmm(2, Bsize=512)/forward/GPU/AMDGPU
83491
ns83681
ns1.00
batchedmm(2, Bsize=512)/zygote/CPU/2 thread(s)
12541
ns12646
ns0.99
batchedmm(2, Bsize=512)/zygote/CPU/4 thread(s)
11375
ns10645.5
ns1.07
batchedmm(2, Bsize=512)/zygote/CPU/8 thread(s)
11625
ns10500
ns1.11
batchedmm(2, Bsize=512)/zygote/CPU/1 thread(s)
19084
ns18146
ns1.05
batchedmm(2, Bsize=512)/zygote/GPU/CUDA
227944.5
ns230722.5
ns0.99
batchedmm(2, Bsize=512)/zygote/GPU/AMDGPU
370203
ns371984
ns1.00
dense(512, bias=true, act=relu)(512 x 128)/forward/CPU/2 thread(s)
409416
ns405208
ns1.01
dense(512, bias=true, act=relu)(512 x 128)/forward/CPU/4 thread(s)
299958
ns297166
ns1.01
dense(512, bias=true, act=relu)(512 x 128)/forward/CPU/8 thread(s)
300250
ns297541
ns1.01
dense(512, bias=true, act=relu)(512 x 128)/forward/CPU/1 thread(s)
765750
ns762459
ns1.00
dense(512, bias=true, act=relu)(512 x 128)/forward/GPU/CUDA
53976
ns46892
ns1.15
dense(512, bias=true, act=relu)(512 x 128)/forward/GPU/oneAPI
1356509
ns1423487.5
ns0.95
dense(512, bias=true, act=relu)(512 x 128)/forward/GPU/Metal
442125
ns335000
ns1.32
dense(512, bias=true, act=relu)(512 x 128)/forward/GPU/AMDGPU
94470.5
ns88571
ns1.07
dense(512, bias=true, act=relu)(512 x 128)/zygote/CPU/2 thread(s)
1489667
ns1475875
ns1.01
dense(512, bias=true, act=relu)(512 x 128)/zygote/CPU/4 thread(s)
1171812
ns1169208
ns1.00
dense(512, bias=true, act=relu)(512 x 128)/zygote/CPU/8 thread(s)
1175459
ns1166834
ns1.01
dense(512, bias=true, act=relu)(512 x 128)/zygote/CPU/1 thread(s)
2480500
ns2378771
ns1.04
dense(512, bias=true, act=relu)(512 x 128)/zygote/GPU/CUDA
311892
ns287503
ns1.08
dense(512, bias=true, act=relu)(512 x 128)/zygote/GPU/oneAPI
13648236
ns12647035
ns1.08
dense(512, bias=true, act=relu)(512 x 128)/zygote/GPU/Metal
2072208.5
ns2003291.5
ns1.03
dense(512, bias=true, act=relu)(512 x 128)/zygote/GPU/AMDGPU
370933
ns380444
ns0.98
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s)
435250
ns432000
ns1.01
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s)
438084
ns436541
ns1.00
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s)
437333
ns436708
ns1.00
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s)
448333
ns448208
ns1.00
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/CUDA
61295
ns54845
ns1.12
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/oneAPI
1050666
ns1004553
ns1.05
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/Metal
1135104
ns1035833
ns1.10
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/AMDGPU
237222
ns234772.5
ns1.01
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s)
3895917
ns3891459
ns1.00
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s)
4001312.5
ns4027292
ns0.99
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s)
3913375.5
ns4026478.5
ns0.97
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s)
3807916.5
ns3684083
ns1.03
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/CUDA
261286
ns268195
ns0.97
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/oneAPI
29924983
ns32271096.5
ns0.93
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/Metal
9972333
ns10269354.5
ns0.97
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/AMDGPU
1208741
ns1382008.5
ns0.87
dense(32, bias=true, act=gelu)(32 x 128)/forward/CPU/2 thread(s)
11000
ns8750
ns1.26
dense(32, bias=true, act=gelu)(32 x 128)/forward/CPU/4 thread(s)
10292
ns7667
ns1.34
dense(32, bias=true, act=gelu)(32 x 128)/forward/CPU/8 thread(s)
10334
ns7667
ns1.35
dense(32, bias=true, act=gelu)(32 x 128)/forward/CPU/1 thread(s)
14625
ns12417
ns1.18
dense(32, bias=true, act=gelu)(32 x 128)/forward/GPU/CUDA
30723
ns24204
ns1.27
dense(32, bias=true, act=gelu)(32 x 128)/forward/GPU/oneAPI
2169986
ns2100905
ns1.03
dense(32, bias=true, act=gelu)(32 x 128)/forward/GPU/Metal
233208.5
ns211416
ns1.10
dense(32, bias=true, act=gelu)(32 x 128)/forward/GPU/AMDGPU
215396.5
ns209352
ns1.03
dense(32, bias=true, act=gelu)(32 x 128)/zygote/CPU/2 thread(s)
52791
ns45042
ns1.17
dense(32, bias=true, act=gelu)(32 x 128)/zygote/CPU/4 thread(s)
53583
ns45791
ns1.17
dense(32, bias=true, act=gelu)(32 x 128)/zygote/CPU/8 thread(s)
54083.5
ns45208
ns1.20
dense(32, bias=true, act=gelu)(32 x 128)/zygote/CPU/1 thread(s)
53125
ns44959
ns1.18
dense(32, bias=true, act=gelu)(32 x 128)/zygote/GPU/CUDA
366013
ns348332
ns1.05
dense(32, bias=true, act=gelu)(32 x 128)/zygote/GPU/oneAPI
13310261
ns12300844.5
ns1.08
dense(32, bias=true, act=gelu)(32 x 128)/zygote/GPU/Metal
1891437.5
ns1700187.5
ns1.11
dense(32, bias=true, act=gelu)(32 x 128)/zygote/GPU/AMDGPU
643336
ns655376
ns0.98
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s)
94209
ns121916.5
ns0.77
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s)
90833
ns144917
ns0.63
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s)
85958
ns88625
ns0.97
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s)
126167
ns105229.5
ns1.20
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/CUDA
190399.5
ns189408.5
ns1.01
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/oneAPI
5795468
ns5999999
ns0.97
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/Metal
1996458
ns1936000
ns1.03
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/AMDGPU
221047
ns220412
ns1.00
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s)
2017500
ns2017208
ns1.00
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s)
2011417
ns2018750
ns1.00
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s)
1801333.5
ns2014000
ns0.89
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s)
1978875
ns2017500
ns0.98
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/CUDA
531205
ns544732
ns0.98
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/oneAPI
28343892
ns27836425
ns1.02
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/Metal
9357625
ns9082333.5
ns1.03
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/AMDGPU
1089565
ns961460
ns1.13
This comment was automatically generated by workflow using github-action-benchmark.