This repository has been archived by the owner on Nov 4, 2024. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
fix: rollback custom gelu implementation
- Loading branch information
Showing
2 changed files
with
1 addition
and
39 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,7 +1,7 @@ | ||
name = "LuxLib" | ||
uuid = "82251201-b29d-42c6-8e01-566dec8acb11" | ||
authors = ["Avik Pal <[email protected]> and contributors"] | ||
-version = "1.3.0" | ||
+version = "1.3.1" | ||
|
||
[deps] | ||
ArrayInterface = "4fba245c-0d91-5ea0-9b3e-6abc04ee57a9" | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
6aad052
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
@JuliaRegistrator register
6aad052
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Registration pull request created: JuliaRegistries/General/116008
Tip: Release Notes
Did you know you can add release notes too? Add Markdown-formatted text underneath the registration comment, after the text
"Release notes:", and it will be added to the registry PR; if TagBot is installed, it will also be added to the
release that TagBot creates. For example:

    @JuliaRegistrator register

    Release notes:

    <Markdown-formatted release notes>

To add them here, just re-invoke the registration comment and the PR will be updated.
Tagging
After the above pull request is merged, it is recommended that a tag be created on this repository for the registered package version.
This will be done automatically if the Julia TagBot GitHub Action is installed, or it can be done manually through the GitHub interface, or via:

    git tag -a v1.3.1 -m "<description of version>" 6aad052
    git push origin v1.3.1
6aad052
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
LuxLib Benchmarks
layernorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/2 thread(s)
5291
ns7625
ns0.69
layernorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/4 thread(s)
7375
ns7333
ns1.01
layernorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/8 thread(s)
7687
ns7437
ns1.03
layernorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/1 thread(s)
6958
ns5500
ns1.27
layernorm(2, act=gelu, affine=false)(4 x 32)/forward/GPU/CUDA
111876
ns88183
ns1.27
layernorm(2, act=gelu, affine=false)(4 x 32)/forward/GPU/oneAPI
2746993
ns2389684
ns1.15
layernorm(2, act=gelu, affine=false)(4 x 32)/forward/GPU/AMDGPU
414534
ns405334
ns1.02
layernorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/2 thread(s)
10041.5
ns9916.5
ns1.01
layernorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/4 thread(s)
10125
ns9542
ns1.06
layernorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/8 thread(s)
10167
ns9792
ns1.04
layernorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/1 thread(s)
10000.5
ns10000
ns1.00
layernorm(2, act=gelu, affine=false)(4 x 32)/zygote/GPU/CUDA
497187
ns383362
ns1.30
layernorm(2, act=gelu, affine=false)(4 x 32)/zygote/GPU/oneAPI
17740695
ns17679354
ns1.00
layernorm(2, act=gelu, affine=false)(4 x 32)/zygote/GPU/AMDGPU
664206
ns677366
ns0.98
bias_activation(32, act=relu)(32 x 128)/forward/CPU/2 thread(s)
1479.5
ns2334
ns0.63
bias_activation(32, act=relu)(32 x 128)/forward/CPU/4 thread(s)
1459
ns1500
ns0.97
bias_activation(32, act=relu)(32 x 128)/forward/CPU/8 thread(s)
1875
ns1688
ns1.11
bias_activation(32, act=relu)(32 x 128)/forward/CPU/1 thread(s)
1583.5
ns1729.5
ns0.92
bias_activation(32, act=relu)(32 x 128)/forward/GPU/CUDA
19698
ns14281
ns1.38
bias_activation(32, act=relu)(32 x 128)/forward/GPU/oneAPI
1364290
ns1297688
ns1.05
bias_activation(32, act=relu)(32 x 128)/forward/GPU/AMDGPU
31630
ns30200
ns1.05
bias_activation(32, act=relu)(32 x 128)/zygote/CPU/2 thread(s)
4083
ns4271
ns0.96
bias_activation(32, act=relu)(32 x 128)/zygote/CPU/4 thread(s)
4416
ns4458
ns0.99
bias_activation(32, act=relu)(32 x 128)/zygote/CPU/8 thread(s)
4125
ns3750
ns1.10
bias_activation(32, act=relu)(32 x 128)/zygote/CPU/1 thread(s)
3291
ns3917
ns0.84
bias_activation(32, act=relu)(32 x 128)/zygote/GPU/CUDA
130509
ns106099.5
ns1.23
bias_activation(32, act=relu)(32 x 128)/zygote/GPU/oneAPI
9003854
ns9298154.5
ns0.97
bias_activation(32, act=relu)(32 x 128)/zygote/GPU/AMDGPU
149371
ns144956.5
ns1.03
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s)
57958
ns57333
ns1.01
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s)
46167
ns46750
ns0.99
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s)
46542
ns46250
ns1.01
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s)
82541
ns83708
ns0.99
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/CUDA
36502
ns30588.5
ns1.19
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/oneAPI
564405
ns572856.5
ns0.99
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/AMDGPU
81146
ns77970
ns1.04
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s)
2037625
ns2018916
ns1.01
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s)
2078416
ns2087937.5
ns1.00
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s)
2083625
ns2087229
ns1.00
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s)
2000875
ns1997063
ns1.00
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/CUDA
216924
ns182309
ns1.19
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/oneAPI
7524779
ns7656207
ns0.98
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/AMDGPU
1725786
ns1482305
ns1.16
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s)
152667
ns146584
ns1.04
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s)
168375
ns174667
ns0.96
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s)
152437.5
ns149333.5
ns1.02
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s)
193708
ns178791.5
ns1.08
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/GPU/CUDA
167125
ns167232
ns1.00
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/GPU/oneAPI
7312313
ns9038666
ns0.81
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/GPU/AMDGPU
213517
ns197432
ns1.08
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s)
1113104.5
ns1107750.5
ns1.00
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s)
1116334
ns1114208
ns1.00
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s)
1115000
ns1117604.5
ns1.00
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s)
1106770.5
ns1114000.5
ns0.99
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/CUDA
628256
ns537253
ns1.17
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/oneAPI
32195104
ns35616369
ns0.90
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/AMDGPU
1026645
ns1026475
ns1.00
layernorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/2 thread(s)
5166
ns5291
ns0.98
layernorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/4 thread(s)
4792
ns4645.5
ns1.03
layernorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/8 thread(s)
5917
ns5541
ns1.07
layernorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/1 thread(s)
4542
ns4166
ns1.09
layernorm(2, act=relu, affine=true)(4 x 32)/forward/GPU/CUDA
82840
ns60281
ns1.37
layernorm(2, act=relu, affine=true)(4 x 32)/forward/GPU/oneAPI
5343488
ns5328970.5
ns1.00
layernorm(2, act=relu, affine=true)(4 x 32)/forward/GPU/AMDGPU
67740
ns70560
ns0.96
layernorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/2 thread(s)
8708
ns8562.5
ns1.02
layernorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/4 thread(s)
8500
ns8459
ns1.00
layernorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/8 thread(s)
8708.5
ns9166.5
ns0.95
layernorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/1 thread(s)
8542
ns8750
ns0.98
layernorm(2, act=relu, affine=true)(4 x 32)/zygote/GPU/CUDA
548688
ns414715.5
ns1.32
layernorm(2, act=relu, affine=true)(4 x 32)/zygote/GPU/oneAPI
33264338
ns33923657
ns0.98
layernorm(2, act=relu, affine=true)(4 x 32)/zygote/GPU/AMDGPU
384004
ns387834
ns0.99
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s)
17709
ns17792
ns1.00
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s)
18625
ns17708
ns1.05
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s)
21375
ns21500
ns0.99
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s)
19583.5
ns17208.5
ns1.14
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/GPU/CUDA
61770.5
ns60282.5
ns1.02
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/GPU/oneAPI
3180292.5
ns3008486
ns1.06
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/GPU/AMDGPU
75881
ns75721
ns1.00
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s)
212208
ns212333
ns1.00
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s)
219208.5
ns212458
ns1.03
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s)
214875
ns213521
ns1.01
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s)
219958
ns222750
ns0.99
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/CUDA
324445
ns291687
ns1.11
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/oneAPI
13687318.5
ns14295306
ns0.96
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/AMDGPU
466224
ns471954.5
ns0.99
bias_activation(2, act=relu)(2 x 128)/forward/CPU/2 thread(s)
625
ns583
ns1.07
bias_activation(2, act=relu)(2 x 128)/forward/CPU/4 thread(s)
625
ns583
ns1.07
bias_activation(2, act=relu)(2 x 128)/forward/CPU/8 thread(s)
958
ns792
ns1.21
bias_activation(2, act=relu)(2 x 128)/forward/CPU/1 thread(s)
667
ns583.5
ns1.14
bias_activation(2, act=relu)(2 x 128)/forward/GPU/CUDA
18677
ns13225
ns1.41
bias_activation(2, act=relu)(2 x 128)/forward/GPU/oneAPI
1223151.5
ns1210151
ns1.01
bias_activation(2, act=relu)(2 x 128)/forward/GPU/AMDGPU
31400
ns30961
ns1.01
bias_activation(2, act=relu)(2 x 128)/zygote/CPU/2 thread(s)
1416.5
ns1541
ns0.92
bias_activation(2, act=relu)(2 x 128)/zygote/CPU/4 thread(s)
1417
ns1542
ns0.92
bias_activation(2, act=relu)(2 x 128)/zygote/CPU/8 thread(s)
1583
ns1542
ns1.03
bias_activation(2, act=relu)(2 x 128)/zygote/CPU/1 thread(s)
1416
ns1416.5
ns1.00
bias_activation(2, act=relu)(2 x 128)/zygote/GPU/CUDA
114301
ns92964
ns1.23
bias_activation(2, act=relu)(2 x 128)/zygote/GPU/oneAPI
8986516.5
ns9171879
ns0.98
bias_activation(2, act=relu)(2 x 128)/zygote/GPU/AMDGPU
135771
ns134891
ns1.01
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s)
7292
ns7375
ns0.99
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s)
6125
ns6125
ns1
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s)
6125
ns6167
ns0.99
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s)
9958
ns10125
ns0.98
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/CUDA
23537.5
ns18616
ns1.26
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/oneAPI
1250837
ns1243379.5
ns1.01
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/AMDGPU
47255.5
ns46921
ns1.01
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s)
220688
ns263062
ns0.84
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s)
235896
ns240459
ns0.98
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s)
229416
ns228792
ns1.00
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s)
255458.5
ns237750
ns1.07
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/CUDA
180772.5
ns154023
ns1.17
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/oneAPI
30816816.5
ns32407548
ns0.95
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/AMDGPU
642475
ns637591
ns1.01
dense(32, bias=false, act=relu)(32 x 128)/forward/CPU/2 thread(s)
4042
ns4083
ns0.99
dense(32, bias=false, act=relu)(32 x 128)/forward/CPU/4 thread(s)
4084
ns4125
ns0.99
dense(32, bias=false, act=relu)(32 x 128)/forward/CPU/8 thread(s)
4166
ns4167
ns1.00
dense(32, bias=false, act=relu)(32 x 128)/forward/CPU/1 thread(s)
4083
ns4083
ns1
dense(32, bias=false, act=relu)(32 x 128)/forward/GPU/CUDA
22833
ns20561
ns1.11
dense(32, bias=false, act=relu)(32 x 128)/forward/GPU/oneAPI
2018204
ns2115667
ns0.95
dense(32, bias=false, act=relu)(32 x 128)/forward/GPU/AMDGPU
46910
ns46550
ns1.01
dense(32, bias=false, act=relu)(32 x 128)/zygote/CPU/2 thread(s)
16541
ns17125
ns0.97
dense(32, bias=false, act=relu)(32 x 128)/zygote/CPU/4 thread(s)
16834
ns16750
ns1.01
dense(32, bias=false, act=relu)(32 x 128)/zygote/CPU/8 thread(s)
17084
ns16958
ns1.01
dense(32, bias=false, act=relu)(32 x 128)/zygote/CPU/1 thread(s)
16833
ns16416
ns1.03
dense(32, bias=false, act=relu)(32 x 128)/zygote/GPU/CUDA
182565
ns174545.5
ns1.05
dense(32, bias=false, act=relu)(32 x 128)/zygote/GPU/oneAPI
10544759
ns10156857.5
ns1.04
dense(32, bias=false, act=relu)(32 x 128)/zygote/GPU/AMDGPU
171221
ns173982
ns0.98
dense(512, bias=false, act=gelu)(512 x 128)/forward/CPU/2 thread(s)
493041
ns509375
ns0.97
dense(512, bias=false, act=gelu)(512 x 128)/forward/CPU/4 thread(s)
385667
ns405541
ns0.95
dense(512, bias=false, act=gelu)(512 x 128)/forward/CPU/8 thread(s)
386125
ns404292
ns0.96
dense(512, bias=false, act=gelu)(512 x 128)/forward/CPU/1 thread(s)
847250
ns864750
ns0.98
dense(512, bias=false, act=gelu)(512 x 128)/forward/GPU/CUDA
112997
ns117562
ns0.96
dense(512, bias=false, act=gelu)(512 x 128)/forward/GPU/oneAPI
408156.5
ns397557
ns1.03
dense(512, bias=false, act=gelu)(512 x 128)/forward/GPU/AMDGPU
242212
ns240702
ns1.01
dense(512, bias=false, act=gelu)(512 x 128)/zygote/CPU/2 thread(s)
2093437.5
ns2318458
ns0.90
dense(512, bias=false, act=gelu)(512 x 128)/zygote/CPU/4 thread(s)
1861958
ns2034500
ns0.92
dense(512, bias=false, act=gelu)(512 x 128)/zygote/CPU/8 thread(s)
1876833
ns2032084
ns0.92
dense(512, bias=false, act=gelu)(512 x 128)/zygote/CPU/1 thread(s)
3143021
ns3191167
ns0.98
dense(512, bias=false, act=gelu)(512 x 128)/zygote/GPU/CUDA
228687
ns202548
ns1.13
dense(512, bias=false, act=gelu)(512 x 128)/zygote/GPU/oneAPI
10334254.5
ns11415659
ns0.91
dense(512, bias=false, act=gelu)(512 x 128)/zygote/GPU/AMDGPU
743867
ns739097
ns1.01
layernorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/2 thread(s)
6167
ns5979.5
ns1.03
layernorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/4 thread(s)
6625
ns6312.5
ns1.05
layernorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/8 thread(s)
8333.5
ns8542
ns0.98
layernorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/1 thread(s)
7375
ns6542
ns1.13
layernorm(2, act=relu, affine=false)(32 x 32)/forward/GPU/CUDA
83073.5
ns84957.5
ns0.98
layernorm(2, act=relu, affine=false)(32 x 32)/forward/GPU/oneAPI
5613807.5
ns5409712
ns1.04
layernorm(2, act=relu, affine=false)(32 x 32)/forward/GPU/AMDGPU
65621
ns66831
ns0.98
layernorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/2 thread(s)
11042
ns11937.5
ns0.92
layernorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/4 thread(s)
10958.5
ns11541.5
ns0.95
layernorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/8 thread(s)
11645.5
ns11604
ns1.00
layernorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/1 thread(s)
12812.5
ns10583
ns1.21
layernorm(2, act=relu, affine=false)(32 x 32)/zygote/GPU/CUDA
595390
ns561493
ns1.06
layernorm(2, act=relu, affine=false)(32 x 32)/zygote/GPU/oneAPI
37940094
ns37617116
ns1.01
layernorm(2, act=relu, affine=false)(32 x 32)/zygote/GPU/AMDGPU
408370.5
ns405534
ns1.01
dense(2, bias=true, act=relu)(2 x 128)/forward/CPU/2 thread(s)
541
ns500
ns1.08
dense(2, bias=true, act=relu)(2 x 128)/forward/CPU/4 thread(s)
500
ns500
ns1
dense(2, bias=true, act=relu)(2 x 128)/forward/CPU/8 thread(s)
542
ns542
ns1
dense(2, bias=true, act=relu)(2 x 128)/forward/CPU/1 thread(s)
500
ns500
ns1
dense(2, bias=true, act=relu)(2 x 128)/forward/GPU/CUDA
23168
ns20286
ns1.14
dense(2, bias=true, act=relu)(2 x 128)/forward/GPU/oneAPI
2210796
ns2161771
ns1.02
dense(2, bias=true, act=relu)(2 x 128)/forward/GPU/AMDGPU
46950
ns51190
ns0.92
dense(2, bias=true, act=relu)(2 x 128)/zygote/CPU/2 thread(s)
2084
ns2084
ns1
dense(2, bias=true, act=relu)(2 x 128)/zygote/CPU/4 thread(s)
2125
ns2084
ns1.02
dense(2, bias=true, act=relu)(2 x 128)/zygote/CPU/8 thread(s)
2209
ns2208
ns1.00
dense(2, bias=true, act=relu)(2 x 128)/zygote/CPU/1 thread(s)
2084
ns2125
ns0.98
dense(2, bias=true, act=relu)(2 x 128)/zygote/GPU/CUDA
213006.5
ns223022
ns0.96
dense(2, bias=true, act=relu)(2 x 128)/zygote/GPU/oneAPI
11081491
ns10990252.5
ns1.01
dense(2, bias=true, act=relu)(2 x 128)/zygote/GPU/AMDGPU
181582.5
ns182361
ns1.00
groupnorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/2 thread(s)
8834
ns8625
ns1.02
groupnorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/4 thread(s)
8834
ns9520.5
ns0.93
groupnorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/8 thread(s)
10021.5
ns9334
ns1.07
groupnorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/1 thread(s)
8500
ns7917
ns1.07
groupnorm(2, act=relu, affine=true)(32 x 32)/forward/GPU/CUDA
99705.5
ns108611
ns0.92
groupnorm(2, act=relu, affine=true)(32 x 32)/forward/GPU/oneAPI
3198646
ns3137439.5
ns1.02
groupnorm(2, act=relu, affine=true)(32 x 32)/forward/GPU/AMDGPU
72221
ns74611
ns0.97
groupnorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/2 thread(s)
18834
ns18395.5
ns1.02
groupnorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/4 thread(s)
17479
ns16917
ns1.03
groupnorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/8 thread(s)
18458
ns18854
ns0.98
groupnorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/1 thread(s)
18895.5
ns18396
ns1.03
groupnorm(2, act=relu, affine=true)(32 x 32)/zygote/GPU/CUDA
566743
ns518312
ns1.09
groupnorm(2, act=relu, affine=true)(32 x 32)/zygote/GPU/oneAPI
18116750
ns16860013
ns1.07
groupnorm(2, act=relu, affine=true)(32 x 32)/zygote/GPU/AMDGPU
377315
ns380934
ns0.99
batchnorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/2 thread(s)
500
ns458
ns1.09
batchnorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/4 thread(s)
500
ns500
ns1
batchnorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/8 thread(s)
584
ns708
ns0.82
batchnorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/1 thread(s)
541
ns500
ns1.08
batchnorm(2, act=identity, affine=false)(32 x 32)/forward/GPU/CUDA
33362
ns27063
ns1.23
batchnorm(2, act=identity, affine=false)(32 x 32)/forward/GPU/oneAPI
1254721
ns1178178
ns1.06
batchnorm(2, act=identity, affine=false)(32 x 32)/forward/GPU/AMDGPU
46210
ns46160
ns1.00
batchnorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/2 thread(s)
8916.5
ns8500
ns1.05
batchnorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/4 thread(s)
9479.5
ns9020.5
ns1.05
batchnorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/8 thread(s)
9667
ns9208.5
ns1.05
batchnorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/1 thread(s)
9250
ns8937.5
ns1.03
batchnorm(2, act=identity, affine=false)(32 x 32)/zygote/GPU/CUDA
255341
ns166677
ns1.53
batchnorm(2, act=identity, affine=false)(32 x 32)/zygote/GPU/oneAPI
18499322
ns18801518
ns0.98
batchnorm(2, act=identity, affine=false)(32 x 32)/zygote/GPU/AMDGPU
366854.5
ns371663
ns0.99
dense(512, bias=false, act=identity)(512 x 128)/forward/CPU/2 thread(s)
398042
ns397208
ns1.00
dense(512, bias=false, act=identity)(512 x 128)/forward/CPU/4 thread(s)
288250
ns288208.5
ns1.00
dense(512, bias=false, act=identity)(512 x 128)/forward/CPU/8 thread(s)
288042
ns288000
ns1.00
dense(512, bias=false, act=identity)(512 x 128)/forward/CPU/1 thread(s)
755958
ns756583
ns1.00
dense(512, bias=false, act=identity)(512 x 128)/forward/GPU/CUDA
112430
ns110755
ns1.02
dense(512, bias=false, act=identity)(512 x 128)/forward/GPU/oneAPI
338275
ns333813
ns1.01
dense(512, bias=false, act=identity)(512 x 128)/forward/GPU/AMDGPU
74831
ns75971
ns0.98
dense(512, bias=false, act=identity)(512 x 128)/zygote/CPU/2 thread(s)
1408834
ns1448374.5
ns0.97
dense(512, bias=false, act=identity)(512 x 128)/zygote/CPU/4 thread(s)
1134937.5
ns1133083
ns1.00
dense(512, bias=false, act=identity)(512 x 128)/zygote/CPU/8 thread(s)
1133167
ns1131833
ns1.00
dense(512, bias=false, act=identity)(512 x 128)/zygote/CPU/1 thread(s)
2438875
ns2357875
ns1.03
dense(512, bias=false, act=identity)(512 x 128)/zygote/GPU/CUDA
198896
ns177520.5
ns1.12
dense(512, bias=false, act=identity)(512 x 128)/zygote/GPU/oneAPI
10071273
ns10029153
ns1.00
dense(512, bias=false, act=identity)(512 x 128)/zygote/GPU/AMDGPU
320874
ns322173
ns1.00
layernorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/2 thread(s)
7270.5
ns7291.5
ns1.00
layernorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/4 thread(s)
7583
ns6875
ns1.10
layernorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/8 thread(s)
8854.5
ns8666
ns1.02
layernorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/1 thread(s)
6917
ns7208
ns0.96
layernorm(2, act=relu, affine=true)(32 x 32)/forward/GPU/CUDA
136778.5
ns110478
ns1.24
layernorm(2, act=relu, affine=true)(32 x 32)/forward/GPU/oneAPI
5388548.5
ns5505252
ns0.98
layernorm(2, act=relu, affine=true)(32 x 32)/forward/GPU/AMDGPU
66211
ns65640
ns1.01
layernorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/2 thread(s)
14833.5
ns12145.5
ns1.22
layernorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/4 thread(s)
15791
ns14167
ns1.11
layernorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/8 thread(s)
14229.5
ns13792
ns1.03
layernorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/1 thread(s)
14709
ns14729
ns1.00
layernorm(2, act=relu, affine=true)(32 x 32)/zygote/GPU/CUDA
897273
ns664318.5
ns1.35
layernorm(2, act=relu, affine=true)(32 x 32)/zygote/GPU/oneAPI
42742507.5
ns42216111.5
ns1.01
layernorm(2, act=relu, affine=true)(32 x 32)/zygote/GPU/AMDGPU
425150
ns426745
ns1.00
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s)
27562.5
ns24770.5
ns1.11
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s)
25583
ns28375
ns0.90
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s)
30228.5
ns30459
ns0.99
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s)
28854
ns25729.5
ns1.12
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/GPU/CUDA
185009
ns167386
ns1.11
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/GPU/oneAPI
7764877.5
ns7615563
ns1.02
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/GPU/AMDGPU
115321
ns113401
ns1.02
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s)
106417
ns151292
ns0.70
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s)
151645.5
ns151187.5
ns1.00
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s)
153166
ns153583
ns1.00
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s)
150583
ns143875
ns1.05
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/CUDA
996849
ns857621
ns1.16
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/oneAPI
42717639
ns44631154
ns0.96
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/AMDGPU
587287
ns587816
ns1.00
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s)
74375
ns79833
ns0.93
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s)
75999.5
ns85583.5
ns0.89
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s)
80375
ns80437
ns1.00
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s)
87000
ns73583
ns1.18
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/CUDA
189182
ns168427.5
ns1.12
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/oneAPI
7755648
ns7736056
ns1.00
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/AMDGPU
128191.5
ns129412
ns0.99
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s)
295291
ns285333
ns1.03
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s)
319708
ns300667
ns1.06
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s)
247791.5
ns300791.5
ns0.82
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s)
273792
ns222625
ns1.23
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/CUDA
1010749
ns971830
ns1.04
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/oneAPI
41903716
ns41332252
ns1.01
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/AMDGPU
697424
ns696216
ns1.00
layernorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/2 thread(s)
13500
ns17000
ns0.79
layernorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/4 thread(s)
13209
ns16833
ns0.78
layernorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/8 thread(s)
14167
ns17125
ns0.83
layernorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/1 thread(s)
13125
ns16542
ns0.79
layernorm(2, act=gelu, affine=true)(32 x 32)/forward/GPU/CUDA
136045
ns112981
ns1.20
layernorm(2, act=gelu, affine=true)(32 x 32)/forward/GPU/oneAPI
5580516
ns5793916
ns0.96
layernorm(2, act=gelu, affine=true)(32 x 32)/forward/GPU/AMDGPU
233743
ns231572
ns1.01
layernorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/2 thread(s)
26604
ns28083.5
ns0.95
layernorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/4 thread(s)
26187.5
ns26500
ns0.99
layernorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/8 thread(s)
26625
ns28083.5
ns0.95
layernorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/1 thread(s)
28208.5
ns27187.5
ns1.04
layernorm(2, act=gelu, affine=true)(32 x 32)/zygote/GPU/CUDA
900814
ns696173.5
ns1.29
layernorm(2, act=gelu, affine=true)(32 x 32)/zygote/GPU/oneAPI
41017058
ns41169551
ns1.00
layernorm(2, act=gelu, affine=true)(32 x 32)/zygote/GPU/AMDGPU
690428
ns689617
ns1.00
groupnorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/2 thread(s)
12000
ns10292
ns1.17
groupnorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/4 thread(s)
11896
ns11333.5
ns1.05
groupnorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/8 thread(s)
12459
ns11750
ns1.06
groupnorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/1 thread(s)
10833
ns10250
ns1.06
groupnorm(2, act=gelu, affine=true)(32 x 32)/forward/GPU/CUDA
117378.5
ns111360
ns1.05
groupnorm(2, act=gelu, affine=true)(32 x 32)/forward/GPU/oneAPI
3507903
ns3372766
ns1.04
groupnorm(2, act=gelu, affine=true)(32 x 32)/forward/GPU/AMDGPU
236503
ns235923
ns1.00
groupnorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/2 thread(s)
22417
ns23687.5
ns0.95
groupnorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/4 thread(s)
22958
ns21375
ns1.07
groupnorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/8 thread(s)
23875
ns22583
ns1.06
groupnorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/1 thread(s)
22583
ns22375
ns1.01
groupnorm(2, act=gelu, affine=true)(32 x 32)/zygote/GPU/CUDA
660570
ns554045
ns1.19
groupnorm(2, act=gelu, affine=true)(32 x 32)/zygote/GPU/oneAPI
20618992
ns22400526.5
ns0.92
groupnorm(2, act=gelu, affine=true)(32 x 32)/zygote/GPU/AMDGPU
675828
ns674936
ns1.00
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s)
64666
ns63875
ns1.01
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s)
67083.5
ns65292
ns1.03
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s)
66167
ns66458
ns1.00
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s)
66875
ns62667
ns1.07
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/CUDA
100086
ns96846
ns1.03
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/oneAPI
3307399.5
ns3400257
ns0.97
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/AMDGPU
234107.5
ns235422
ns0.99
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s)
465000
ns437167
ns1.06
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s)
466167
ns485500
ns0.96
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s)
468625
ns486250
ns0.96
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s)
503833
ns442291
ns1.14
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/CUDA
483663
ns440935
ns1.10
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/oneAPI
21199224
ns20393573
ns1.04
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/AMDGPU
709238
ns716017
ns0.99
layernorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/2 thread(s)
7562.5
ns7208
ns1.05
layernorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/4 thread(s)
8083
ns7250
ns1.11
layernorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/8 thread(s)
8250
ns8646
ns0.95
layernorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/1 thread(s)
7083.5
ns6812.5
ns1.04
layernorm(2, act=identity, affine=true)(32 x 32)/forward/GPU/CUDA
134375
ns113059.5
ns1.19
layernorm(2, act=identity, affine=true)(32 x 32)/forward/GPU/oneAPI
5976128
ns5983032
ns1.00
layernorm(2, act=identity, affine=true)(32 x 32)/forward/GPU/AMDGPU
65651
ns64461
ns1.02
layernorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/2 thread(s)
14041.5
ns11875
ns1.18
layernorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/4 thread(s)
13125
ns13583
ns0.97
layernorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/8 thread(s)
14479
ns14854.5
ns0.97
layernorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/1 thread(s)
14625
ns14750
ns0.99
layernorm(2, act=identity, affine=true)(32 x 32)/zygote/GPU/CUDA
872555
ns670072
ns1.30
layernorm(2, act=identity, affine=true)(32 x 32)/zygote/GPU/oneAPI
40293875
ns40018921
ns1.01
layernorm(2, act=identity, affine=true)(32 x 32)/zygote/GPU/AMDGPU
400284
ns400084
ns1.00
batchedmm(512, Bsize=4)/forward/CPU/2 thread(s)
6157812.5
ns6149145.5
ns1.00
batchedmm(512, Bsize=4)/forward/CPU/4 thread(s)
6375333.5
ns6373791
ns1.00
batchedmm(512, Bsize=4)/forward/CPU/8 thread(s)
6376917
ns6369958
ns1.00
batchedmm(512, Bsize=4)/forward/CPU/1 thread(s)
11913125
ns11914917
ns1.00
batchedmm(512, Bsize=4)/forward/GPU/CUDA
346601.5
ns348199
ns1.00
batchedmm(512, Bsize=4)/forward/GPU/oneAPI
53593217
ns55221895
ns0.97
batchedmm(512, Bsize=4)/forward/GPU/AMDGPU
320474
ns318854
ns1.01
batchedmm(512, Bsize=4)/zygote/CPU/2 thread(s)
19110896
ns19112395.5
ns1.00
batchedmm(512, Bsize=4)/zygote/CPU/4 thread(s)
19977396
ns19954875
ns1.00
batchedmm(512, Bsize=4)/zygote/CPU/8 thread(s)
19903104
ns19933333
ns1.00
batchedmm(512, Bsize=4)/zygote/CPU/1 thread(s)
36496187.5
ns36546937.5
ns1.00
batchedmm(512, Bsize=4)/zygote/GPU/CUDA
1012562
ns1032394
ns0.98
batchedmm(512, Bsize=4)/zygote/GPU/oneAPI
77852170.5
ns78448314.5
ns0.99
batchedmm(512, Bsize=4)/zygote/GPU/AMDGPU
1157544
ns1157393
ns1.00
dense(2, bias=true, act=gelu)(2 x 128)/forward/CPU/2 thread(s)
1000
ns958
ns1.04
dense(2, bias=true, act=gelu)(2 x 128)/forward/CPU/4 thread(s)
1000
ns958
ns1.04
dense(2, bias=true, act=gelu)(2 x 128)/forward/CPU/8 thread(s)
1042
ns1000
ns1.04
dense(2, bias=true, act=gelu)(2 x 128)/forward/CPU/1 thread(s)
958
ns958
ns1
dense(2, bias=true, act=gelu)(2 x 128)/forward/GPU/CUDA
22944
ns20220
ns1.13
dense(2, bias=true, act=gelu)(2 x 128)/forward/GPU/oneAPI
2044697
ns2011379
ns1.02
dense(2, bias=true, act=gelu)(2 x 128)/forward/GPU/AMDGPU
207642
ns207432
ns1.00
dense(2, bias=true, act=gelu)(2 x 128)/zygote/CPU/2 thread(s)
3917
ns3667
ns1.07
dense(2, bias=true, act=gelu)(2 x 128)/zygote/CPU/4 thread(s)
4000
ns3667
ns1.09
dense(2, bias=true, act=gelu)(2 x 128)/zygote/CPU/8 thread(s)
4042
ns3750
ns1.08
dense(2, bias=true, act=gelu)(2 x 128)/zygote/CPU/1 thread(s)
4000
ns3625
ns1.10
dense(2, bias=true, act=gelu)(2 x 128)/zygote/GPU/CUDA
269119
ns242662
ns1.11
dense(2, bias=true, act=gelu)(2 x 128)/zygote/GPU/oneAPI
11661739
ns11613706.5
ns1.00
dense(2, bias=true, act=gelu)(2 x 128)/zygote/GPU/AMDGPU
625997
ns625907
ns1.00
groupnorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/2 thread(s)
8437.5
ns7229.5
ns1.17
groupnorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/4 thread(s)
8895.5
ns8208
ns1.08
groupnorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/8 thread(s)
9562
ns9063
ns1.06
groupnorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/1 thread(s)
8375
ns7833
ns1.07
groupnorm(2, act=relu, affine=true)(4 x 32)/forward/GPU/CUDA
113535
ns110132.5
ns1.03
groupnorm(2, act=relu, affine=true)(4 x 32)/forward/GPU/oneAPI
3443497.5
ns3376276
ns1.02
groupnorm(2, act=relu, affine=true)(4 x 32)/forward/GPU/AMDGPU
68271
ns72491
ns0.94
groupnorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/2 thread(s)
11896
ns11792
ns1.01
groupnorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/4 thread(s)
12021
ns11708
ns1.03
groupnorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/8 thread(s)
12792
ns12833
ns1.00
groupnorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/1 thread(s)
12583
ns12042
ns1.04
groupnorm(2, act=relu, affine=true)(4 x 32)/zygote/GPU/CUDA
597497
ns533463.5
ns1.12
groupnorm(2, act=relu, affine=true)(4 x 32)/zygote/GPU/oneAPI
21602127
ns22224767.5
ns0.97
groupnorm(2, act=relu, affine=true)(4 x 32)/zygote/GPU/AMDGPU
354444
ns357164
ns0.99
dense(2, bias=false, act=relu)(2 x 128)/forward/CPU/2 thread(s)
291
ns250
ns1.16
dense(2, bias=false, act=relu)(2 x 128)/forward/CPU/4 thread(s)
292
ns333
ns0.88
dense(2, bias=false, act=relu)(2 x 128)/forward/CPU/8 thread(s)
333
ns292
ns1.14
dense(2, bias=false, act=relu)(2 x 128)/forward/CPU/1 thread(s)
292
ns292
ns1
dense(2, bias=false, act=relu)(2 x 128)/forward/GPU/CUDA
22361
ns20014
ns1.12
dense(2, bias=false, act=relu)(2 x 128)/forward/GPU/oneAPI
1916584
ns2044805
ns0.94
dense(2, bias=false, act=relu)(2 x 128)/forward/GPU/AMDGPU
46890
ns46611
ns1.01
dense(2, bias=false, act=relu)(2 x 128)/zygote/CPU/2 thread(s)
3000
ns3167
ns0.95
dense(2, bias=false, act=relu)(2 x 128)/zygote/CPU/4 thread(s)
2958
ns2875
ns1.03
dense(2, bias=false, act=relu)(2 x 128)/zygote/CPU/8 thread(s)
3333
ns3083
ns1.08
dense(2, bias=false, act=relu)(2 x 128)/zygote/CPU/1 thread(s)
2917
ns2834
ns1.03
dense(2, bias=false, act=relu)(2 x 128)/zygote/GPU/CUDA
193738
ns168487.5
ns1.15
dense(2, bias=false, act=relu)(2 x 128)/zygote/GPU/oneAPI
9462333
ns9185467
ns1.03
dense(2, bias=false, act=relu)(2 x 128)/zygote/GPU/AMDGPU
156212
ns163482
ns0.96
groupnorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/2 thread(s)
10583
ns11500
ns0.92
groupnorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/4 thread(s)
11875
ns11292
ns1.05
groupnorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/8 thread(s)
13083.5
ns13562.5
ns0.96
groupnorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/1 thread(s)
12062.5
ns9687.5
ns1.25
groupnorm(2, act=gelu, affine=false)(32 x 32)/forward/GPU/CUDA
113976.5
ns110742.5
ns1.03
groupnorm(2, act=gelu, affine=false)(32 x 32)/forward/GPU/oneAPI
3275659
ns3318937
ns0.99
groupnorm(2, act=gelu, affine=false)(32 x 32)/forward/GPU/AMDGPU
236063
ns234383
ns1.01
groupnorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/2 thread(s)
21833.5
ns22041.5
ns0.99
groupnorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/4 thread(s)
22145.5
ns21312.5
ns1.04
groupnorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/8 thread(s)
23875
ns22292
ns1.07
groupnorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/1 thread(s)
22333
ns21375.5
ns1.04
groupnorm(2, act=gelu, affine=false)(32 x 32)/zygote/GPU/CUDA
547934
ns445412.5
ns1.23
groupnorm(2, act=gelu, affine=false)(32 x 32)/zygote/GPU/oneAPI
20491745
ns20307385
ns1.01
groupnorm(2, act=gelu, affine=false)(32 x 32)/zygote/GPU/AMDGPU
654438
ns648033
ns1.01
dense(32, bias=true, act=relu)(32 x 128)/forward/CPU/2 thread(s)
4375
ns4375
ns1
dense(32, bias=true, act=relu)(32 x 128)/forward/CPU/4 thread(s)
4458
ns4417
ns1.01
dense(32, bias=true, act=relu)(32 x 128)/forward/CPU/8 thread(s)
4417
ns4417
ns1
dense(32, bias=true, act=relu)(32 x 128)/forward/CPU/1 thread(s)
4417
ns4375
ns1.01
dense(32, bias=true, act=relu)(32 x 128)/forward/GPU/CUDA
23860
ns21103
ns1.13
dense(32, bias=true, act=relu)(32 x 128)/forward/GPU/oneAPI
2144860.5
ns2254531
ns0.95
dense(32, bias=true, act=relu)(32 x 128)/forward/GPU/AMDGPU
49061
ns47271
ns1.04
dense(32, bias=true, act=relu)(32 x 128)/zygote/CPU/2 thread(s)
16375
ns16542
ns0.99
dense(32, bias=true, act=relu)(32 x 128)/zygote/CPU/4 thread(s)
16666
ns16458
ns1.01
dense(32, bias=true, act=relu)(32 x 128)/zygote/CPU/8 thread(s)
16666
ns16667
ns1.00
dense(32, bias=true, act=relu)(32 x 128)/zygote/CPU/1 thread(s)
16541
ns16542
ns1.00
dense(32, bias=true, act=relu)(32 x 128)/zygote/GPU/CUDA
316685
ns292441
ns1.08
dense(32, bias=true, act=relu)(32 x 128)/zygote/GPU/oneAPI
12062386.5
ns12584045
ns0.96
dense(32, bias=true, act=relu)(32 x 128)/zygote/GPU/AMDGPU
209243
ns206702.5
ns1.01
batchnorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/2 thread(s)
2000
ns2041
ns0.98
batchnorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/4 thread(s)
2084
ns2042
ns1.02
batchnorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/8 thread(s)
2209
ns2083
ns1.06
batchnorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/1 thread(s)
2208
ns1916
ns1.15
batchnorm(2, act=gelu, affine=false)(32 x 32)/forward/GPU/CUDA
34477
ns27885
ns1.24
batchnorm(2, act=gelu, affine=false)(32 x 32)/forward/GPU/oneAPI
1229094
ns1248055
ns0.98
batchnorm(2, act=gelu, affine=false)(32 x 32)/forward/GPU/AMDGPU
203202
ns203262
ns1.00
batchnorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/2 thread(s)
18604
ns16833
ns1.11
batchnorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/4 thread(s)
18708
ns18250
ns1.03
batchnorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/8 thread(s)
18833.5
ns18125
ns1.04
batchnorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/1 thread(s)
21208.5
ns17667
ns1.20
batchnorm(2, act=gelu, affine=false)(32 x 32)/zygote/GPU/CUDA
282309
ns178504
ns1.58
batchnorm(2, act=gelu, affine=false)(32 x 32)/zygote/GPU/oneAPI
21098361
ns21525405
ns0.98
batchnorm(2, act=gelu, affine=false)(32 x 32)/zygote/GPU/AMDGPU
686013
ns684992
ns1.00
batchedmm(16, Bsize=512)/forward/CPU/2 thread(s)
59292
ns59104
ns1.00
batchedmm(16, Bsize=512)/forward/CPU/4 thread(s)
64917
ns65041
ns1.00
batchedmm(16, Bsize=512)/forward/CPU/8 thread(s)
66458
ns66583.5
ns1.00
batchedmm(16, Bsize=512)/forward/CPU/1 thread(s)
51625
ns51125
ns1.01
batchedmm(16, Bsize=512)/forward/GPU/CUDA
66488
ns71334
ns0.93
batchedmm(16, Bsize=512)/forward/GPU/oneAPI
88258686
ns89279199
ns0.99
batchedmm(16, Bsize=512)/forward/GPU/AMDGPU
118491
ns118362
ns1.00
batchedmm(16, Bsize=512)/zygote/CPU/2 thread(s)
175916.5
ns163062.5
ns1.08
batchedmm(16, Bsize=512)/zygote/CPU/4 thread(s)
153479
ns151271
ns1.01
batchedmm(16, Bsize=512)/zygote/CPU/8 thread(s)
160333.5
ns157250
ns1.02
batchedmm(16, Bsize=512)/zygote/CPU/1 thread(s)
224542
ns313146
ns0.72
batchedmm(16, Bsize=512)/zygote/GPU/CUDA
208290.5
ns195169
ns1.07
batchedmm(16, Bsize=512)/zygote/GPU/oneAPI
149475929.5
ns151578490.5
ns0.99
batchedmm(16, Bsize=512)/zygote/GPU/AMDGPU
608982
ns624817
ns0.97
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s)
81083
ns82145.5
ns0.99
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s)
83270.5
ns82749.5
ns1.01
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s)
124833.5
ns86667
ns1.44
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s)
85395.5
ns85000
ns1.00
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/GPU/CUDA
192029
ns186525
ns1.03
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/GPU/oneAPI
5900244
ns5756836
ns1.02
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/GPU/AMDGPU
202972.5
ns205352
ns0.99
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s)
1881145.5
ns1808020.5
ns1.04
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s)
1912667
ns1915916.5
ns1.00
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s)
1916083
ns1905270.5
ns1.01
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s)
1849250
ns1911375
ns0.97
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/CUDA
499932
ns475978
ns1.05
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/oneAPI
26802673
ns27045542
ns0.99
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/AMDGPU
1066872
ns1069182
ns1.00
dense(2, bias=true, act=identity)(2 x 128)/forward/CPU/2 thread(s)
292
ns292
ns1
dense(2, bias=true, act=identity)(2 x 128)/forward/CPU/4 thread(s)
292
ns291
ns1.00
dense(2, bias=true, act=identity)(2 x 128)/forward/CPU/8 thread(s)
292
ns292
ns1
dense(2, bias=true, act=identity)(2 x 128)/forward/CPU/1 thread(s)
292
ns291
ns1.00
dense(2, bias=true, act=identity)(2 x 128)/forward/GPU/CUDA
21422.5
ns18638
ns1.15
dense(2, bias=true, act=identity)(2 x 128)/forward/GPU/oneAPI
2063314.5
ns2108817.5
ns0.98
dense(2, bias=true, act=identity)(2 x 128)/forward/GPU/AMDGPU
41850
ns42830
ns0.98
dense(2, bias=true, act=identity)(2 x 128)/zygote/CPU/2 thread(s)
1792
ns1792
ns1
dense(2, bias=true, act=identity)(2 x 128)/zygote/CPU/4 thread(s)
1834
ns1792
ns1.02
dense(2, bias=true, act=identity)(2 x 128)/zygote/CPU/8 thread(s)
1834
ns1833
ns1.00
dense(2, bias=true, act=identity)(2 x 128)/zygote/CPU/1 thread(s)
1875
ns1791
ns1.05
dense(2, bias=true, act=identity)(2 x 128)/zygote/GPU/CUDA
241279.5
ns225657
ns1.07
dense(2, bias=true, act=identity)(2 x 128)/zygote/GPU/oneAPI
9975087
ns9833710
ns1.01
dense(2, bias=true, act=identity)(2 x 128)/zygote/GPU/AMDGPU
180262
ns182527.5
ns0.99
groupnorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/2 thread(s)
8166
ns8375
ns0.98
groupnorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/4 thread(s)
10292
ns9125
ns1.13
groupnorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/8 thread(s)
11208
ns11083
ns1.01
groupnorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/1 thread(s)
11042
ns8041
ns1.37
groupnorm(2, act=gelu, affine=false)(4 x 32)/forward/GPU/CUDA
113299.5
ns108185.5
ns1.05
groupnorm(2, act=gelu, affine=false)(4 x 32)/forward/GPU/oneAPI
3500381.5
ns3365841
ns1.04
groupnorm(2, act=gelu, affine=false)(4 x 32)/forward/GPU/AMDGPU
233333
ns232582
ns1.00
groupnorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/2 thread(s)
9917
ns10167
ns0.98
groupnorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/4 thread(s)
9834
ns9542
ns1.03
groupnorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/8 thread(s)
11458
ns10417
ns1.10
groupnorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/1 thread(s)
10417
ns9291
ns1.12
groupnorm(2, act=gelu, affine=false)(4 x 32)/zygote/GPU/CUDA
484445
ns420282.5
ns1.15
groupnorm(2, act=gelu, affine=false)(4 x 32)/zygote/GPU/oneAPI
18749564
ns20467429
ns0.92
groupnorm(2, act=gelu, affine=false)(4 x 32)/zygote/GPU/AMDGPU
627157
ns629687
ns1.00
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s)
58375
ns57916
ns1.01
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s)
47209
ns46583
ns1.01
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s)
46833
ns46458
ns1.01
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s)
82625
ns83583
ns0.99
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/GPU/CUDA
38276
ns32500
ns1.18
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/GPU/oneAPI
1341940
ns1374457
ns0.98
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/GPU/AMDGPU
75211
ns72281
ns1.04
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s)
1836770.5
ns1911000
ns0.96
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s)
1985937.5
ns1970187.5
ns1.01
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s)
1978479
ns1937771
ns1.02
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s)
1854291.5
ns1899667
ns0.98
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/CUDA
209126
ns176646
ns1.18
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/oneAPI
33357124
ns33503348
ns1.00
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/AMDGPU
1011361
ns1152023
ns0.88
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s)
267437.5
ns418084
ns0.64
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s)
270417
ns417375
ns0.65
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s)
270625
ns427542
ns0.63
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s)
268604.5
ns420250
ns0.64
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/CUDA
193011.5
ns173254.5
ns1.11
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/oneAPI
7986425
ns7736703
ns1.03
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/AMDGPU
282544
ns280773
ns1.01
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s)
588125
ns671833.5
ns0.88
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s)
688229.5
ns766666.5
ns0.90
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s)
688292
ns684542
ns1.01
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s)
593500
ns731041.5
ns0.81
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/CUDA
985216
ns887279
ns1.11
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/oneAPI
43272459
ns46741128
ns0.93
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/AMDGPU
911561
ns905534.5
ns1.01
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s)
2205542
ns3464375
ns0.64
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s)
2194083.5
ns3437833
ns0.64
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s)
2213708
ns3397500
ns0.65
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s)
2176167
ns3449958
ns0.63
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/CUDA
153511
ns148014
ns1.04
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/oneAPI
8157200
ns8945738
ns0.91
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/AMDGPU
445380
ns441160
ns1.01
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s)
5508979.5
ns6193666.5
ns0.89
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s)
5521979
ns6178645.5
ns0.89
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s)
5474458
ns6207958
ns0.88
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s)
5531895.5
ns6230917
ns0.89
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/CUDA
925959.5
ns821729
ns1.13
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/oneAPI
50527002
ns51511265
ns0.98
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/AMDGPU
1539832.5
ns1636158
ns0.94
dense(512, bias=true, act=gelu)(512 x 128)/forward/CPU/2 thread(s)
478666
ns473083.5
ns1.01
dense(512, bias=true, act=gelu)(512 x 128)/forward/CPU/4 thread(s)
346145.5
ns342041.5
ns1.01
dense(512, bias=true, act=gelu)(512 x 128)/forward/CPU/8 thread(s)
346083
ns341500
ns1.01
dense(512, bias=true, act=gelu)(512 x 128)/forward/CPU/1 thread(s)
909333
ns902375
ns1.01
dense(512, bias=true, act=gelu)(512 x 128)/forward/GPU/CUDA
46203
ns42882
ns1.08
dense(512, bias=true, act=gelu)(512 x 128)/forward/GPU/oneAPI
382606
ns400566
ns0.96
dense(512, bias=true, act=gelu)(512 x 128)/forward/GPU/AMDGPU
242913
ns241152
ns1.01
dense(512, bias=true, act=gelu)(512 x 128)/zygote/CPU/2 thread(s)
2111749.5
ns2324750
ns0.91
dense(512, bias=true, act=gelu)(512 x 128)/zygote/CPU/4 thread(s)
1861166.5
ns2038541.5
ns0.91
dense(512, bias=true, act=gelu)(512 x 128)/zygote/CPU/8 thread(s)
1866541
ns2032354
ns0.92
dense(512, bias=true, act=gelu)(512 x 128)/zygote/CPU/1 thread(s)
3130375
ns3197000
ns0.98
dense(512, bias=true, act=gelu)(512 x 128)/zygote/GPU/CUDA
258500
ns202331
ns1.28
dense(512, bias=true, act=gelu)(512 x 128)/zygote/GPU/oneAPI
15052922
ns12642725
ns1.19
dense(512, bias=true, act=gelu)(512 x 128)/zygote/GPU/AMDGPU
773039
ns763338
ns1.01
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s)
58125
ns57520.5
ns1.01
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s)
46334
ns46395.5
ns1.00
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s)
46167
ns45959
ns1.00
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s)
82542
ns83250
ns0.99
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/CUDA
27952
ns23227
ns1.20
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/oneAPI
1310631
ns1432334
ns0.92
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/AMDGPU
73681
ns75651
ns0.97
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s)
2039458
ns2029625
ns1.00
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s)
2089729.5
ns2079979
ns1.00
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s)
2087020.5
ns2070791
ns1.01
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s)
1978124.5
ns2000354
ns0.99
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/CUDA
221951
ns191580
ns1.16
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/oneAPI
36802380
ns35959863
ns1.02
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/AMDGPU
1041362
ns1041881.5
ns1.00
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s)
58417
ns57291
ns1.02
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s)
46958
ns46645.5
ns1.01
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s)
46417
ns46625
ns1.00
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s)
82334
ns83334
ns0.99
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/GPU/CUDA
47697.5
ns40746
ns1.17
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/GPU/oneAPI
816463
ns810264
ns1.01
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/GPU/AMDGPU
71371
ns80396
ns0.89
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s)
1926479.5
ns1890166
ns1.02
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s)
1973250
ns1976042
ns1.00
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s)
1973167
ns1971667
ns1.00
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s)
1898833
ns1895583
ns1.00
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/CUDA
228428
ns198522
ns1.15
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/oneAPI
17513790
ns17337732
ns1.01
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/AMDGPU
1026836.5
ns936080
ns1.10
batchnorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/2 thread(s)
292
ns291
ns1.00
batchnorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/4 thread(s)
333
ns291
ns1.14
batchnorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/8 thread(s)
416
ns416
ns1
batchnorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/1 thread(s)
334
ns333
ns1.00
batchnorm(2, act=identity, affine=false)(4 x 32)/forward/GPU/CUDA
33167
ns25307.5
ns1.31
batchnorm(2, act=identity, affine=false)(4 x 32)/forward/GPU/oneAPI
1174385.5
ns1259521
ns0.93
batchnorm(2, act=identity, affine=false)(4 x 32)/forward/GPU/AMDGPU
48501
ns46650
ns1.04
batchnorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/2 thread(s)
6416
ns6562.5
ns0.98
batchnorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/4 thread(s)
6834
ns6917
ns0.99
batchnorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/8 thread(s)
7250
ns7292
ns0.99
batchnorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/1 thread(s)
6333
ns6834
ns0.93
batchnorm(2, act=identity, affine=false)(4 x 32)/zygote/GPU/CUDA
199581
ns168328.5
ns1.19
batchnorm(2, act=identity, affine=false)(4 x 32)/zygote/GPU/oneAPI
19880225
ns20601648
ns0.96
batchnorm(2, act=identity, affine=false)(4 x 32)/zygote/GPU/AMDGPU
363764
ns371864
ns0.98
dense(2, bias=false, act=identity)(2 x 128)/forward/CPU/2 thread(s)
250
ns250
ns1
dense(2, bias=false, act=identity)(2 x 128)/forward/CPU/4 thread(s)
291
ns291
ns1
dense(2, bias=false, act=identity)(2 x 128)/forward/CPU/8 thread(s)
292
ns291
ns1.00
dense(2, bias=false, act=identity)(2 x 128)/forward/CPU/1 thread(s)
291
ns291
ns1
dense(2, bias=false, act=identity)(2 x 128)/forward/GPU/CUDA
32517
ns30302
ns1.07
dense(2, bias=false, act=identity)(2 x 128)/forward/GPU/oneAPI
1265101
ns1177600.5
ns1.07
dense(2, bias=false, act=identity)(2 x 128)/forward/GPU/AMDGPU
37771
ns37815.5
ns1.00
dense(2, bias=false, act=identity)(2 x 128)/zygote/CPU/2 thread(s)
3417
ns3542
ns0.96
dense(2, bias=false, act=identity)(2 x 128)/zygote/CPU/4 thread(s)
3375
ns2833
ns1.19
dense(2, bias=false, act=identity)(2 x 128)/zygote/CPU/8 thread(s)
3167
ns3250
ns0.97
dense(2, bias=false, act=identity)(2 x 128)/zygote/CPU/1 thread(s)
2792
ns2959
ns0.94
dense(2, bias=false, act=identity)(2 x 128)/zygote/GPU/CUDA
182053
ns169119
ns1.08
dense(2, bias=false, act=identity)(2 x 128)/zygote/GPU/oneAPI
9212477.5
ns7614831
ns1.21
dense(2, bias=false, act=identity)(2 x 128)/zygote/GPU/AMDGPU
158127
ns152811
ns1.03
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s)
460687.5
ns450021
ns1.02
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s)
478208.5
ns441041
ns1.08
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s)
500000
ns425041.5
ns1.18
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s)
470937
ns422292
ns1.12
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/CUDA
134071
ns130746.5
ns1.03
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/oneAPI
5855749
ns6115924
ns0.96
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/AMDGPU
366189
ns366698.5
ns1.00
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s)
4078667
ns3801375
ns1.07
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s)
4067771
ns3799958
ns1.07
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s)
4080625
ns3805000
ns1.07
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s)
4056354
ns3829062.5
ns1.06
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/CUDA
664164.5
ns640512
ns1.04
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/oneAPI
31731318
ns35444962
ns0.90
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/AMDGPU
1467136
ns1468321
ns1.00
batchedmm(512, Bsize=32)/forward/CPU/2 thread(s)
49955792
ns49831750
ns1.00
batchedmm(512, Bsize=32)/forward/CPU/4 thread(s)
35488958
ns35529708
ns1.00
batchedmm(512, Bsize=32)/forward/CPU/8 thread(s)
35531584
ns35490875
ns1.00
batchedmm(512, Bsize=32)/forward/CPU/1 thread(s)
97090437.5
ns97095125
ns1.00
batchedmm(512, Bsize=32)/forward/GPU/CUDA
1601101.5
ns1612269
ns0.99
batchedmm(512, Bsize=32)/forward/GPU/oneAPI
55729446
ns56680008
ns0.98
batchedmm(512, Bsize=32)/forward/GPU/AMDGPU
1044391.5
ns1041171
ns1.00
batchedmm(512, Bsize=32)/zygote/CPU/2 thread(s)
154677542
ns154466500.5
ns1.00
batchedmm(512, Bsize=32)/zygote/CPU/4 thread(s)
112413020.5
ns112376375
ns1.00
batchedmm(512, Bsize=32)/zygote/CPU/8 thread(s)
112347584
ns112311958
ns1.00
batchedmm(512, Bsize=32)/zygote/CPU/1 thread(s)
295444937.5
ns295244375
ns1.00
batchedmm(512, Bsize=32)/zygote/GPU/CUDA
6489665.5
ns6476168
ns1.00
batchedmm(512, Bsize=32)/zygote/GPU/oneAPI
124609188
ns174388525
ns0.71
batchedmm(512, Bsize=32)/zygote/GPU/AMDGPU
5586056.5
ns5549710
ns1.01
bias_activation(32, act=tanh)(32 x 128)/forward/CPU/2 thread(s)
19520.5
ns16979
ns1.15
bias_activation(32, act=tanh)(32 x 128)/forward/CPU/4 thread(s)
17458
ns19562.5
ns0.89
bias_activation(32, act=tanh)(32 x 128)/forward/CPU/8 thread(s)
17417
ns17188
ns1.01
bias_activation(32, act=tanh)(32 x 128)/forward/CPU/1 thread(s)
15750
ns15020.5
ns1.05
bias_activation(32, act=tanh)(32 x 128)/forward/GPU/CUDA
19821
ns14071
ns1.41
bias_activation(32, act=tanh)(32 x 128)/forward/GPU/oneAPI
1180885
ns1254861
ns0.94
bias_activation(32, act=tanh)(32 x 128)/forward/GPU/AMDGPU
26420
ns25910
ns1.02
bias_activation(32, act=tanh)(32 x 128)/zygote/CPU/2 thread(s)
10959
ns10520.5
ns1.04
bias_activation(32, act=tanh)(32 x 128)/zygote/CPU/4 thread(s)
9125.5
ns8709
ns1.05
bias_activation(32, act=tanh)(32 x 128)/zygote/CPU/8 thread(s)
9084
ns8917
ns1.02
bias_activation(32, act=tanh)(32 x 128)/zygote/CPU/1 thread(s)
17167
ns17479
ns0.98
bias_activation(32, act=tanh)(32 x 128)/zygote/GPU/CUDA
242736.5
ns209068
ns1.16
bias_activation(32, act=tanh)(32 x 128)/zygote/GPU/oneAPI
10064346.5
ns10230351.5
ns0.98
bias_activation(32, act=tanh)(32 x 128)/zygote/GPU/AMDGPU
149096.5
ns148622
ns1.00
groupnorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/2 thread(s)
8208.5
ns7750
ns1.06
groupnorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/4 thread(s)
10687.5
ns7854.5
ns1.36
groupnorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/8 thread(s)
10604.5
ns10334
ns1.03
groupnorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/1 thread(s)
8959
ns7583
ns1.18
groupnorm(2, act=gelu, affine=true)(4 x 32)/forward/GPU/CUDA
116211.5
ns111568.5
ns1.04
groupnorm(2, act=gelu, affine=true)(4 x 32)/forward/GPU/oneAPI
3615906
ns3718095.5
ns0.97
groupnorm(2, act=gelu, affine=true)(4 x 32)/forward/GPU/AMDGPU
234342
ns237553
ns0.99
groupnorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/2 thread(s)
10292
ns11541.5
ns0.89
groupnorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/4 thread(s)
10209
ns9687.5
ns1.05
groupnorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/8 thread(s)
11292
ns10708
ns1.05
groupnorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/1 thread(s)
9437.5
ns10709
ns0.88
groupnorm(2, act=gelu, affine=true)(4 x 32)/zygote/GPU/CUDA
575593
ns501739
ns1.15
groupnorm(2, act=gelu, affine=true)(4 x 32)/zygote/GPU/oneAPI
22955140
ns23065545
ns1.00
groupnorm(2, act=gelu, affine=true)(4 x 32)/zygote/GPU/AMDGPU
652487
ns655677
ns1.00
groupnorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/2 thread(s)
9250
ns8770.5
ns1.05
groupnorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/4 thread(s)
9875
ns9750
ns1.01
groupnorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/8 thread(s)
11041.5
ns10583
ns1.04
groupnorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/1 thread(s)
10333.5
ns8750
ns1.18
groupnorm(2, act=relu, affine=false)(32 x 32)/forward/GPU/CUDA
113252
ns53968
ns2.10
groupnorm(2, act=relu, affine=false)(32 x 32)/forward/GPU/oneAPI
3518835.5
ns3498205
ns1.01
groupnorm(2, act=relu, affine=false)(32 x 32)/forward/GPU/AMDGPU
72611
ns72631
ns1.00
groupnorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/2 thread(s)
16542
ns13459
ns1.23
groupnorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/4 thread(s)
15833
ns15479
ns1.02
groupnorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/8 thread(s)
15750
ns19209
ns0.82
groupnorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/1 thread(s)
16708
ns14125
ns1.18
groupnorm(2, act=relu, affine=false)(32 x 32)/zygote/GPU/CUDA
548922
ns250540
ns2.19
groupnorm(2, act=relu, affine=false)(32 x 32)/zygote/GPU/oneAPI
19847936.5
ns20620278
ns0.96
groupnorm(2, act=relu, affine=false)(32 x 32)/zygote/GPU/AMDGPU
343724
ns346043
ns0.99
batchnorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/2 thread(s)
500
ns459
ns1.09
batchnorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/4 thread(s)
584
ns500
ns1.17
batchnorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/8 thread(s)
625
ns583
ns1.07
batchnorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/1 thread(s)
584
ns458
ns1.28
batchnorm(2, act=gelu, affine=false)(4 x 32)/forward/GPU/CUDA
33202
ns26861
ns1.24
batchnorm(2, act=gelu, affine=false)(4 x 32)/forward/GPU/oneAPI
1238450.5
ns1254571
ns0.99
batchnorm(2, act=gelu, affine=false)(4 x 32)/forward/GPU/AMDGPU
204092
ns204762
ns1.00
batchnorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/2 thread(s)
8916
ns7208.5
ns1.24
batchnorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/4 thread(s)
9292
ns9000
ns1.03
batchnorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/8 thread(s)
9958
ns9125
ns1.09
batchnorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/1 thread(s)
12292
ns8166
ns1.51
batchnorm(2, act=gelu, affine=false)(4 x 32)/zygote/GPU/CUDA
220226.5
ns147122.5
ns1.50
batchnorm(2, act=gelu, affine=false)(4 x 32)/zygote/GPU/oneAPI
21905010
ns22634021
ns0.97
batchnorm(2, act=gelu, affine=false)(4 x 32)/zygote/GPU/AMDGPU
657387
ns659287
ns1.00
bias_activation(32, act=gelu)(32 x 128)/forward/CPU/2 thread(s)
17625
ns15416
ns1.14
bias_activation(32, act=gelu)(32 x 128)/forward/CPU/4 thread(s)
15958
ns16625
ns0.96
bias_activation(32, act=gelu)(32 x 128)/forward/CPU/8 thread(s)
15209
ns14917
ns1.02
bias_activation(32, act=gelu)(32 x 128)/forward/CPU/1 thread(s)
11291
ns11291
ns1
bias_activation(32, act=gelu)(32 x 128)/forward/GPU/CUDA
19970
ns13973
ns1.43
bias_activation(32, act=gelu)(32 x 128)/forward/GPU/oneAPI
1162812
ns1108916
ns1.05
bias_activation(32, act=gelu)(32 x 128)/forward/GPU/AMDGPU
188782
ns186562
ns1.01
bias_activation(32, act=gelu)(32 x 128)/zygote/CPU/2 thread(s)
35458
ns32000
ns1.11
bias_activation(32, act=gelu)(32 x 128)/zygote/CPU/4 thread(s)
35562
ns32000
ns1.11
bias_activation(32, act=gelu)(32 x 128)/zygote/CPU/8 thread(s)
35645.5
ns31958
ns1.12
bias_activation(32, act=gelu)(32 x 128)/zygote/CPU/1 thread(s)
35542
ns32167
ns1.10
bias_activation(32, act=gelu)(32 x 128)/zygote/GPU/CUDA
255892
ns109160
ns2.34
bias_activation(32, act=gelu)(32 x 128)/zygote/GPU/oneAPI
10845224.5
ns11487029
ns0.94
bias_activation(32, act=gelu)(32 x 128)/zygote/GPU/AMDGPU
591957
ns588817
ns1.01
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s)
448958
ns492875
ns0.91
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s)
453750
ns442125
ns1.03
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s)
492166
ns444958
ns1.11
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s)
453875
ns440604
ns1.03
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/CUDA
193846
ns188096.5
ns1.03
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/oneAPI
6007739
ns5891615
ns1.02
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/AMDGPU
367744
ns369779
ns0.99
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s)
4069208
ns3834584
ns1.06
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s)
4054291.5
ns3827292
ns1.06
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s)
4049270.5
ns3817250
ns1.06
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s)
4057500
ns3836104.5
ns1.06
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/CUDA
505408
ns382999
ns1.32
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/oneAPI
37330396
ns28452071
ns1.31
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/AMDGPU
1362695
ns1355634
ns1.01
batchedmm(512, Bsize=512)/forward/CPU/2 thread(s)
779601166
ns831622791.5
ns0.94
batchedmm(512, Bsize=512)/forward/CPU/4 thread(s)
542496166
ns544951167
ns1.00
batchedmm(512, Bsize=512)/forward/CPU/8 thread(s)
539989666
ns544430500
ns0.99
batchedmm(512, Bsize=512)/forward/CPU/1 thread(s)
1569938708
ns1552948271
ns1.01
batchedmm(512, Bsize=512)/forward/GPU/CUDA
22536712.5
ns22763244.5
ns0.99
batchedmm(512, Bsize=512)/forward/GPU/oneAPI
187859757.5
ns185795205
ns1.01
batchedmm(512, Bsize=512)/forward/GPU/AMDGPU
14732780
ns15420059
ns0.96
batchedmm(512, Bsize=512)/zygote/CPU/2 thread(s)
2505560125
ns3888050458
ns0.64
batchedmm(512, Bsize=512)/zygote/CPU/4 thread(s)
1783555333
ns3211667750
ns0.56
batchedmm(512, Bsize=512)/zygote/CPU/8 thread(s)
1792629375
ns1819585250
ns0.99
batchedmm(512, Bsize=512)/zygote/CPU/1 thread(s)
5216869375
ns4769468292
ns1.09
batchedmm(512, Bsize=512)/zygote/GPU/CUDA
118336848
ns118595684
ns1.00
batchedmm(512, Bsize=512)/zygote/GPU/oneAPI
935397218
ns1039230192
ns0.90
batchedmm(512, Bsize=512)/zygote/GPU/AMDGPU
88936600
ns88183228
ns1.01
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s)
78854.5
ns75333.5
ns1.05
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s)
76791
ns77458
ns0.99
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s)
79000
ns78584
ns1.01
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s)
79354
ns76292
ns1.04
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/GPU/CUDA
190682.5
ns93335
ns2.04
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/GPU/oneAPI
5473671
ns6083372
ns0.90
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/GPU/AMDGPU
108351
ns120232
ns0.90
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s)
294125
ns279333
ns1.05
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s)
289958
ns194937.5
ns1.49
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s)
261417
ns234771
ns1.11
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s)
238520.5
ns194125
ns1.23
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/CUDA
986968
ns451188
ns2.19
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/oneAPI
46526863
ns46239896
ns1.01
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/AMDGPU
636237
ns657366.5
ns0.97
batchedmm(512, Bsize=128)/forward/CPU/2 thread(s)
199699479
ns199509499.5
ns1.00
batchedmm(512, Bsize=128)/forward/CPU/4 thread(s)
139060584
ns139162834
ns1.00
batchedmm(512, Bsize=128)/forward/CPU/8 thread(s)
139030750
ns138977625
ns1.00
batchedmm(512, Bsize=128)/forward/CPU/1 thread(s)
388620875
ns388989959
ns1.00
batchedmm(512, Bsize=128)/forward/GPU/CUDA
5814292
ns5833602
ns1.00
batchedmm(512, Bsize=128)/forward/GPU/oneAPI
80005938
ns79568180.5
ns1.01
batchedmm(512, Bsize=128)/forward/GPU/AMDGPU
3574368
ns3573358
ns1.00
batchedmm(512, Bsize=128)/zygote/CPU/2 thread(s)
621021958
ns619161479.5
ns1.00
batchedmm(512, Bsize=128)/zygote/CPU/4 thread(s)
439183125
ns440796833
ns1.00
batchedmm(512, Bsize=128)/zygote/CPU/8 thread(s)
439329875
ns439294646
ns1.00
batchedmm(512, Bsize=128)/zygote/CPU/1 thread(s)
1194801708
ns1189363000
ns1.00
batchedmm(512, Bsize=128)/zygote/GPU/CUDA
26594102
ns26219564
ns1.01
batchedmm(512, Bsize=128)/zygote/GPU/oneAPI
295168041
ns283162239
ns1.04
batchedmm(512, Bsize=128)/zygote/GPU/AMDGPU
22131978
ns21927537.5
ns1.01
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s)
7291
ns7417
ns0.98
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s)
6291
ns6083.5
ns1.03
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s)
6125
ns6208
ns0.99
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s)
9959
ns10042
ns0.99
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/GPU/CUDA
26445
ns21654
ns1.22
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/GPU/oneAPI
1270170
ns1302067
ns0.98
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/GPU/AMDGPU
48281
ns48161
ns1.00
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s)
216209
ns212583
ns1.02
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s)
220416.5
ns228396
ns0.97
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s)
222625
ns222250
ns1.00
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s)
219125
ns213166.5
ns1.03
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/CUDA
214078.5
ns136607.5
ns1.57
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/oneAPI
29452909
ns29564519.5
ns1.00
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/AMDGPU
522765
ns524845
ns1.00
groupnorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/2 thread(s)
9416.5
ns9833.5
ns0.96
groupnorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/4 thread(s)
9041
ns7979
ns1.13
groupnorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/8 thread(s)
9833.5
ns10250
ns0.96
groupnorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/1 thread(s)
10791.5
ns7978.5
ns1.35
groupnorm(2, act=relu, affine=false)(4 x 32)/forward/GPU/CUDA
110026
ns51011
ns2.16
groupnorm(2, act=relu, affine=false)(4 x 32)/forward/GPU/oneAPI
3375913.5
ns3317085
ns1.02
groupnorm(2, act=relu, affine=false)(4 x 32)/forward/GPU/AMDGPU
72150
ns69811
ns1.03
groupnorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/2 thread(s)
10854.5
ns7333.5
ns1.48
groupnorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/4 thread(s)
7750
ns9625
ns0.81
groupnorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/8 thread(s)
9708
ns13562.5
ns0.72
groupnorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/1 thread(s)
11250
ns8250
ns1.36
groupnorm(2, act=relu, affine=false)(4 x 32)/zygote/GPU/CUDA
484552.5
ns242632
ns2.00
groupnorm(2, act=relu, affine=false)(4 x 32)/zygote/GPU/oneAPI
18934737
ns19151322
ns0.99
groupnorm(2, act=relu, affine=false)(4 x 32)/zygote/GPU/AMDGPU
313639
ns316738.5
ns0.99
batchnorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/2 thread(s)
459
ns416
ns1.10
batchnorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/4 thread(s)
500
ns708
ns0.71
batchnorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/8 thread(s)
542
ns500
ns1.08
batchnorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/1 thread(s)
667
ns500
ns1.33
batchnorm(2, act=identity, affine=true)(32 x 32)/forward/GPU/CUDA
24574
ns19752
ns1.24
batchnorm(2, act=identity, affine=true)(32 x 32)/forward/GPU/oneAPI
1221655.5
ns1203125.5
ns1.02
batchnorm(2, act=identity, affine=true)(32 x 32)/forward/GPU/AMDGPU
46721
ns46481
ns1.01
batchnorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/2 thread(s)
12292
ns8625
ns1.43
batchnorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/4 thread(s)
8896
ns10249.5
ns0.87
batchnorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/8 thread(s)
10583
ns9500
ns1.11
batchnorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/1 thread(s)
13666
ns10292
ns1.33
batchnorm(2, act=identity, affine=true)(32 x 32)/zygote/GPU/CUDA
243553
ns119755.5
ns2.03
batchnorm(2, act=identity, affine=true)(32 x 32)/zygote/GPU/oneAPI
23025269
ns25677237
ns0.90
batchnorm(2, act=identity, affine=true)(32 x 32)/zygote/GPU/AMDGPU
386704
ns388684
ns0.99
bias_activation(512, act=gelu)(512 x 128)/forward/CPU/2 thread(s)
111083
ns105959
ns1.05
bias_activation(512, act=gelu)(512 x 128)/forward/CPU/4 thread(s)
102541.5
ns98500
ns1.04
bias_activation(512, act=gelu)(512 x 128)/forward/CPU/8 thread(s)
103792
ns101021
ns1.03
bias_activation(512, act=gelu)(512 x 128)/forward/CPU/1 thread(s)
155083.5
ns146271
ns1.06
bias_activation(512, act=gelu)(512 x 128)/forward/GPU/CUDA
22624
ns16996
ns1.33
bias_activation(512, act=gelu)(512 x 128)/forward/GPU/oneAPI
791164
ns756914
ns1.05
bias_activation(512, act=gelu)(512 x 128)/forward/GPU/AMDGPU
191512
ns190327
ns1.01
bias_activation(512, act=gelu)(512 x 128)/zygote/CPU/2 thread(s)
567500
ns478333
ns1.19
bias_activation(512, act=gelu)(512 x 128)/zygote/CPU/4 thread(s)
573417
ns509583
ns1.13
bias_activation(512, act=gelu)(512 x 128)/zygote/CPU/8 thread(s)
549583.5
ns478459
ns1.15
bias_activation(512, act=gelu)(512 x 128)/zygote/CPU/1 thread(s)
537292
ns478458.5
ns1.12
bias_activation(512, act=gelu)(512 x 128)/zygote/GPU/CUDA
213930
ns113991
ns1.88
bias_activation(512, act=gelu)(512 x 128)/zygote/GPU/oneAPI
11700863.5
ns12514796
ns0.93
bias_activation(512, act=gelu)(512 x 128)/zygote/GPU/AMDGPU
608337
ns604977
ns1.01
batchedmm(16, Bsize=32)/forward/CPU/2 thread(s)
5750
ns5375
ns1.07
batchedmm(16, Bsize=32)/forward/CPU/4 thread(s)
5167
ns5333
ns0.97
batchedmm(16, Bsize=32)/forward/CPU/8 thread(s)
7667
ns7208
ns1.06
batchedmm(16, Bsize=32)/forward/CPU/1 thread(s)
4563
ns6729
ns0.68
batchedmm(16, Bsize=32)/forward/GPU/CUDA
16559
ns15434
ns1.07
batchedmm(16, Bsize=32)/forward/GPU/oneAPI
73950991
ns73679048
ns1.00
batchedmm(16, Bsize=32)/forward/GPU/AMDGPU
80275.5
ns79381
ns1.01
batchedmm(16, Bsize=32)/zygote/CPU/2 thread(s)
11958
ns12375
ns0.97
batchedmm(16, Bsize=32)/zygote/CPU/4 thread(s)
10750
ns11000
ns0.98
batchedmm(16, Bsize=32)/zygote/CPU/8 thread(s)
11583
ns10875
ns1.07
batchedmm(16, Bsize=32)/zygote/CPU/1 thread(s)
18167
ns16625
ns1.09
batchedmm(16, Bsize=32)/zygote/GPU/CUDA
203546
ns108121
ns1.88
batchedmm(16, Bsize=32)/zygote/GPU/oneAPI
98437217
ns100453387
ns0.98
batchedmm(16, Bsize=32)/zygote/GPU/AMDGPU
367244
ns364504
ns1.01
batchedmm(16, Bsize=128)/forward/CPU/2 thread(s)
38958
ns39375
ns0.99
batchedmm(16, Bsize=128)/forward/CPU/4 thread(s)
51125
ns51917
ns0.98
batchedmm(16, Bsize=128)/forward/CPU/8 thread(s)
52458
ns52770.5
ns0.99
batchedmm(16, Bsize=128)/forward/CPU/1 thread(s)
13770.5
ns13604
ns1.01
batchedmm(16, Bsize=128)/forward/GPU/CUDA
20666.5
ns20011
ns1.03
batchedmm(16, Bsize=128)/forward/GPU/oneAPI
77382892
ns79258230
ns0.98
batchedmm(16, Bsize=128)/forward/GPU/AMDGPU
87361
ns85481
ns1.02
batchedmm(16, Bsize=128)/zygote/CPU/2 thread(s)
36416
ns36271
ns1.00
batchedmm(16, Bsize=128)/zygote/CPU/4 thread(s)
30770.5
ns35313
ns0.87
batchedmm(16, Bsize=128)/zygote/CPU/8 thread(s)
35250
ns31291.5
ns1.13
batchedmm(16, Bsize=128)/zygote/CPU/1 thread(s)
58812.5
ns57750
ns1.02
batchedmm(16, Bsize=128)/zygote/GPU/CUDA
180756
ns121997.5
ns1.48
batchedmm(16, Bsize=128)/zygote/GPU/oneAPI
110794943
ns113144013
ns0.98
batchedmm(16, Bsize=128)/zygote/GPU/AMDGPU
408754
ns410244.5
ns1.00
bias_activation(2, act=tanh)(2 x 128)/forward/CPU/2 thread(s)
1750
ns1584
ns1.10
bias_activation(2, act=tanh)(2 x 128)/forward/CPU/4 thread(s)
1875
ns1750
ns1.07
bias_activation(2, act=tanh)(2 x 128)/forward/CPU/8 thread(s)
2125
ns2250
ns0.94
bias_activation(2, act=tanh)(2 x 128)/forward/CPU/1 thread(s)
1833
ns1687.5
ns1.09
bias_activation(2, act=tanh)(2 x 128)/forward/GPU/CUDA
19320
ns13818
ns1.40
bias_activation(2, act=tanh)(2 x 128)/forward/GPU/oneAPI
1202099
ns1224877
ns0.98
bias_activation(2, act=tanh)(2 x 128)/forward/GPU/AMDGPU
33080
ns32640
ns1.01
bias_activation(2, act=tanh)(2 x 128)/zygote/CPU/2 thread(s)
2395.5
ns2166
ns1.11
bias_activation(2, act=tanh)(2 x 128)/zygote/CPU/4 thread(s)
2333
ns2292
ns1.02
bias_activation(2, act=tanh)(2 x 128)/zygote/CPU/8 thread(s)
2375
ns2417
ns0.98
bias_activation(2, act=tanh)(2 x 128)/zygote/CPU/1 thread(s)
2375
ns2250
ns1.06
bias_activation(2, act=tanh)(2 x 128)/zygote/GPU/CUDA
193868.5
ns89827
ns2.16
bias_activation(2, act=tanh)(2 x 128)/zygote/GPU/oneAPI
9197836
ns9149897
ns1.01
bias_activation(2, act=tanh)(2 x 128)/zygote/GPU/AMDGPU
137016.5
ns136461
ns1.00
layernorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/2 thread(s)
5292
ns5666.5
ns0.93
layernorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/4 thread(s)
5750
ns4896
ns1.17
layernorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/8 thread(s)
6208
ns6333.5
ns0.98
layernorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/1 thread(s)
5958.5
ns5375
ns1.11
layernorm(2, act=identity, affine=true)(4 x 32)/forward/GPU/CUDA
139204.5
ns59437.5
ns2.34
layernorm(2, act=identity, affine=true)(4 x 32)/forward/GPU/oneAPI
5728892
ns5810721.5
ns0.99
layernorm(2, act=identity, affine=true)(4 x 32)/forward/GPU/AMDGPU
69071
ns68755.5
ns1.00
layernorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/2 thread(s)
8667
ns8167
ns1.06
layernorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/4 thread(s)
8625
ns8542
ns1.01
layernorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/8 thread(s)
8792
ns8500
ns1.03
layernorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/1 thread(s)
9145.5
ns9042
ns1.01
layernorm(2, act=identity, affine=true)(4 x 32)/zygote/GPU/CUDA
809144
ns383098.5
ns2.11
layernorm(2, act=identity, affine=true)(4 x 32)/zygote/GPU/oneAPI
39925856
ns38586019
ns1.03
layernorm(2, act=identity, affine=true)(4 x 32)/zygote/GPU/AMDGPU
387074
ns387674
ns1.00
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s)
55125
ns56708
ns0.97
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s)
55958
ns57666
ns0.97
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s)
56042
ns57625
ns0.97
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s)
56208
ns58250
ns0.96
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/CUDA
35813.5
ns30235
ns1.18
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/oneAPI
1246242
ns1254024
ns0.99
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/AMDGPU
202752
ns204092
ns0.99
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s)
489125
ns448000
ns1.09
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s)
532541.5
ns472083.5
ns1.13
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s)
505645.5
ns465125
ns1.09
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s)
470521
ns436541.5
ns1.08
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/CUDA
253767
ns170026
ns1.49
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/oneAPI
26667416
ns28109365.5
ns0.95
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/AMDGPU
833929
ns826388
ns1.01
batchedmm(128, Bsize=128)/forward/CPU/2 thread(s)
3319083.5
ns3312500
ns1.00
batchedmm(128, Bsize=128)/forward/CPU/4 thread(s)
2337292
ns2340084
ns1.00
batchedmm(128, Bsize=128)/forward/CPU/8 thread(s)
2337917
ns2339583.5
ns1.00
batchedmm(128, Bsize=128)/forward/CPU/1 thread(s)
6313500
ns6318792
ns1.00
batchedmm(128, Bsize=128)/forward/GPU/CUDA
204383
ns204725
ns1.00
batchedmm(128, Bsize=128)/forward/GPU/oneAPI
80623182
ns83409682
ns0.97
batchedmm(128, Bsize=128)/forward/GPU/AMDGPU
213737
ns240632
ns0.89
batchedmm(128, Bsize=128)/zygote/CPU/2 thread(s)
11497229
ns11441604
ns1.00
batchedmm(128, Bsize=128)/zygote/CPU/4 thread(s)
8328208.5
ns8301208
ns1.00
batchedmm(128, Bsize=128)/zygote/CPU/8 thread(s)
8338541.5
ns8329792
ns1.00
batchedmm(128, Bsize=128)/zygote/CPU/1 thread(s)
21078124.5
ns21184729.5
ns0.99
batchedmm(128, Bsize=128)/zygote/GPU/CUDA
737191.5
ns760406.5
ns0.97
batchedmm(128, Bsize=128)/zygote/GPU/oneAPI
126245472
ns125395684.5
ns1.01
batchedmm(128, Bsize=128)/zygote/GPU/AMDGPU
1058001
ns1063686
ns0.99
layernorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/2 thread(s)
4750
ns5666
ns0.84
layernorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/4 thread(s)
6875
ns5604.5
ns1.23
layernorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/8 thread(s)
6874.5
ns6438
ns1.07
layernorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/1 thread(s)
6791.5
ns6312.5
ns1.08
layernorm(2, act=identity, affine=false)(4 x 32)/forward/GPU/CUDA
130168
ns57453
ns2.27
layernorm(2, act=identity, affine=false)(4 x 32)/forward/GPU/oneAPI
5745155
ns5296827
ns1.08
layernorm(2, act=identity, affine=false)(4 x 32)/forward/GPU/AMDGPU
56791
ns56241
ns1.01
layernorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/2 thread(s)
7333
ns7125
ns1.03
layernorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/4 thread(s)
7312.5
ns7333
ns1.00
layernorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/8 thread(s)
9042
ns7250
ns1.25
layernorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/1 thread(s)
7375
ns8292
ns0.89
layernorm(2, act=identity, affine=false)(4 x 32)/zygote/GPU/CUDA
712471
ns367190
ns1.94
layernorm(2, act=identity, affine=false)(4 x 32)/zygote/GPU/oneAPI
35913527
ns35508394
ns1.01
layernorm(2, act=identity, affine=false)(4 x 32)/zygote/GPU/AMDGPU
368228.5
ns362159
ns1.02
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s)
150042
ns140708
ns1.07
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s)
93750
ns123917
ns0.76
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s)
126666
ns100667
ns1.26
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s)
97708
ns104958
ns0.93
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/CUDA
148678
ns127546.5
ns1.17
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/oneAPI
5748457.5
ns6179687.5
ns0.93
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/AMDGPU
203522
ns206197
ns0.99
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s)
2036375
ns1992625
ns1.02
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s)
2027000.5
ns2016083.5
ns1.01
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s)
2032104
ns2019875
ns1.01
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s)
2023625
ns2026687.5
ns1.00
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/CUDA
663877
ns432468
ns1.54
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/oneAPI
33751272
ns33529611.5
ns1.01
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/AMDGPU
1110211
ns1184812.5
ns0.94
batchedmm(2, Bsize=4)/forward/CPU/2 thread(s)
34208
ns32208.5
ns1.06
batchedmm(2, Bsize=4)/forward/CPU/4 thread(s)
36458
ns37167
ns0.98
batchedmm(2, Bsize=4)/forward/CPU/8 thread(s)
36083
ns35833
ns1.01
batchedmm(2, Bsize=4)/forward/CPU/1 thread(s)
708
ns583
ns1.21
batchedmm(2, Bsize=4)/forward/GPU/CUDA
15530
ns13995
ns1.11
batchedmm(2, Bsize=4)/forward/GPU/oneAPI
73822262.5
ns74212471
ns0.99
batchedmm(2, Bsize=4)/forward/GPU/AMDGPU
78911
ns79370
ns0.99
batchedmm(2, Bsize=4)/zygote/CPU/2 thread(s)
2542
ns2645.5
ns0.96
batchedmm(2, Bsize=4)/zygote/CPU/4 thread(s)
2833.5
ns2750
ns1.03
batchedmm(2, Bsize=4)/zygote/CPU/8 thread(s)
3500
ns3020.5
ns1.16
batchedmm(2, Bsize=4)/zygote/CPU/1 thread(s)
2209
ns2333
ns0.95
batchedmm(2, Bsize=4)/zygote/GPU/CUDA
136004.5
ns92140
ns1.48
batchedmm(2, Bsize=4)/zygote/GPU/oneAPI
93721653
ns94219800
ns0.99
batchedmm(2, Bsize=4)/zygote/GPU/AMDGPU
339263
ns341683
ns0.99
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s)
7250
ns7167
ns1.01
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s)
6084
ns6000
ns1.01
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s)
6083
ns6083
ns1
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s)
10125
ns10000
ns1.01
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/GPU/CUDA
35188
ns29283
ns1.20
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/GPU/oneAPI
1207919
ns1222925.5
ns0.99
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/GPU/AMDGPU
48090
ns48131
ns1.00
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s)
244041.5
ns248271
ns0.98
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s)
227416.5
ns221125
ns1.03
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s)
224625
ns221042
ns1.02
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s)
216417
ns216625
ns1.00
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/CUDA
236066
ns158765.5
ns1.49
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/oneAPI
28254417
ns26714332
ns1.06
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/AMDGPU
573176
ns569975.5
ns1.01
dense(32, bias=true, act=identity)(32 x 128)/forward/CPU/2 thread(s)
3917
ns3959
ns0.99
dense(32, bias=true, act=identity)(32 x 128)/forward/CPU/4 thread(s)
3958
ns4000
ns0.99
dense(32, bias=true, act=identity)(32 x 128)/forward/CPU/8 thread(s)
3958
ns3958
ns1
dense(32, bias=true, act=identity)(32 x 128)/forward/CPU/1 thread(s)
3917
ns3917
ns1
dense(32, bias=true, act=identity)(32 x 128)/forward/GPU/CUDA
21615
ns18821
ns1.15
dense(32, bias=true, act=identity)(32 x 128)/forward/GPU/oneAPI
2072507
ns2189549
ns0.95
dense(32, bias=true, act=identity)(32 x 128)/forward/GPU/AMDGPU
42031
ns41970
ns1.00
dense(32, bias=true, act=identity)(32 x 128)/zygote/CPU/2 thread(s)
14666
ns14958
ns0.98
dense(32, bias=true, act=identity)(32 x 128)/zygote/CPU/4 thread(s)
14958
ns15125
ns0.99
dense(32, bias=true, act=identity)(32 x 128)/zygote/CPU/8 thread(s)
14916.5
ns14917
ns1.00
dense(32, bias=true, act=identity)(32 x 128)/zygote/CPU/1 thread(s)
14917
ns14666
ns1.02
dense(32, bias=true, act=identity)(32 x 128)/zygote/GPU/CUDA
297040
ns163909.5
ns1.81
dense(32, bias=true, act=identity)(32 x 128)/zygote/GPU/oneAPI
11259133
ns11534435
ns0.98
dense(32, bias=true, act=identity)(32 x 128)/zygote/GPU/AMDGPU
188487
ns192582
ns0.98
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s)
120896
ns146416
ns0.83
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s)
103687.5
ns103750
ns1.00
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s)
130792
ns103791
ns1.26
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s)
100583
ns100208
ns1.00
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/GPU/CUDA
149147
ns127104
ns1.17
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/GPU/oneAPI
6201158
ns6092161.5
ns1.02
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/GPU/AMDGPU
204362
ns207422.5
ns0.99
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s)
1925625
ns1791000
ns1.08
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s)
1922584
ns1909958
ns1.01
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s)
1924687.5
ns1910875
ns1.01
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s)
1918000
ns1922250
ns1.00
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/CUDA
656144
ns418877
ns1.57
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/oneAPI
29883253.5
ns29586225
ns1.01
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/AMDGPU
1218242.5
ns1089381
ns1.12
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s)
19042
ns17291
ns1.10
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s)
21375
ns22583
ns0.95
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s)
20375
ns21062.5
ns0.97
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s)
18625
ns17417
ns1.07
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/CUDA
102936.5
ns61422.5
ns1.68
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/oneAPI
3227536
ns3492174
ns0.92
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/AMDGPU
80560.5
ns80420
ns1.00
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s)
216541.5
ns216354.5
ns1.00
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s)
239771
ns256145.5
ns0.94
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s)
223709
ns216500
ns1.03
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s)
247021
ns219146
ns1.13
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/CUDA
493608
ns272581
ns1.81
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/oneAPI
19781802
ns19535498.5
ns1.01
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/AMDGPU
479335
ns477435
ns1.00
batchedmm(16, Bsize=4)/forward/CPU/2 thread(s)
24792
ns26813
ns0.92
batchedmm(16, Bsize=4)/forward/CPU/4 thread(s)
30625
ns31333
ns0.98
batchedmm(16, Bsize=4)/forward/CPU/8 thread(s)
29334
ns28812.5
ns1.02
batchedmm(16, Bsize=4)/forward/CPU/1 thread(s)
1291
ns1312
ns0.98
batchedmm(16, Bsize=4)/forward/GPU/CUDA
15803
ns14764
ns1.07
batchedmm(16, Bsize=4)/forward/GPU/oneAPI
73776659
ns75193108
ns0.98
batchedmm(16, Bsize=4)/forward/GPU/AMDGPU
81571
ns81295.5
ns1.00
batchedmm(16, Bsize=4)/zygote/CPU/2 thread(s)
4770.5
ns5000
ns0.95
batchedmm(16, Bsize=4)/zygote/CPU/4 thread(s)
5125
ns4833.5
ns1.06
batchedmm(16, Bsize=4)/zygote/CPU/8 thread(s)
5396
ns5083.5
ns1.06
batchedmm(16, Bsize=4)/zygote/CPU/1 thread(s)
4500
ns4854
ns0.93
batchedmm(16, Bsize=4)/zygote/GPU/CUDA
200310
ns110219
ns1.82
batchedmm(16, Bsize=4)/zygote/GPU/oneAPI
93984240
ns96235287
ns0.98
batchedmm(16, Bsize=4)/zygote/GPU/AMDGPU
379744
ns380423.5
ns1.00
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s)
226292
ns304917
ns0.74
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s)
223000
ns306417
ns0.73
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s)
224167
ns307500
ns0.73
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s)
224042
ns306312
ns0.73
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/CUDA
218225
ns96559
ns2.26
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/oneAPI
7741481.5
ns8040746
ns0.96
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/AMDGPU
274597.5
ns273553
ns1.00
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s)
510000
ns534959
ns0.95
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s)
507812.5
ns578875
ns0.88
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s)
554604
ns532250
ns1.04
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s)
496958
ns532292
ns0.93
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/CUDA
1023666.5
ns478700.5
ns2.14
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/oneAPI
43947953
ns45273096.5
ns0.97
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/AMDGPU
871339
ns854594
ns1.02
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s)
21084
ns18833
ns1.12
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s)
19896
ns21500
ns0.93
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s)
21792
ns21500
ns1.01
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s)
20375
ns18729
ns1.09
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/GPU/CUDA
110346
ns61059
ns1.81
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/GPU/oneAPI
3666139
ns3648054
ns1.00
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/GPU/AMDGPU
79030
ns79701
ns0.99
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s)
212625
ns225292
ns0.94
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s)
220750
ns215459
ns1.02
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s)
216750
ns214416
ns1.01
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s)
215292
ns215625
ns1.00
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/CUDA
706350
ns315781.5
ns2.24
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/oneAPI
24785967
ns25685453.5
ns0.96
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/AMDGPU
536195
ns536640.5
ns1.00
layernorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/2 thread(s)
6292
ns6875
ns0.92
layernorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/4 thread(s)
6500
ns6729
ns0.97
layernorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/8 thread(s)
8667
ns7875.5
ns1.10
layernorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/1 thread(s)
7083
ns6187
ns1.14
layernorm(2, act=identity, affine=false)(32 x 32)/forward/GPU/CUDA
133065.5
ns59473
ns2.24
layernorm(2, act=identity, affine=false)(32 x 32)/forward/GPU/oneAPI
5772071
ns5742399
ns1.01
layernorm(2, act=identity, affine=false)(32 x 32)/forward/GPU/AMDGPU
65491
ns65660
ns1.00
layernorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/2 thread(s)
10416
ns9875
ns1.05
layernorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/4 thread(s)
9750
ns10541.5
ns0.92
layernorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/8 thread(s)
10291
ns10542
ns0.98
layernorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/1 thread(s)
10959
ns11395.5
ns0.96
layernorm(2, act=identity, affine=false)(32 x 32)/zygote/GPU/CUDA
769680.5
ns375474.5
ns2.05
layernorm(2, act=identity, affine=false)(32 x 32)/zygote/GPU/oneAPI
39162885
ns37560344
ns1.04
layernorm(2, act=identity, affine=false)(32 x 32)/zygote/GPU/AMDGPU
386734
ns385404
ns1.00
layernorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/2 thread(s)
5333
ns4958
ns1.08
layernorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/4 thread(s)
5792
ns5792
ns1
layernorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/8 thread(s)
7416
ns6937
ns1.07
layernorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/1 thread(s)
6833
ns4813
ns1.42
layernorm(2, act=relu, affine=false)(4 x 32)/forward/GPU/CUDA
135171
ns59336
ns2.28
layernorm(2, act=relu, affine=false)(4 x 32)/forward/GPU/oneAPI
5588783
ns5881412.5
ns0.95
layernorm(2, act=relu, affine=false)(4 x 32)/forward/GPU/AMDGPU
68351
ns66901
ns1.02
layernorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/2 thread(s)
7666
ns7333
ns1.05
layernorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/4 thread(s)
7375
ns7292
ns1.01
layernorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/8 thread(s)
7646
ns7625
ns1.00
layernorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/1 thread(s)
7458
ns7917
ns0.94
layernorm(2, act=relu, affine=false)(4 x 32)/zygote/GPU/CUDA
735742.5
ns400389
ns1.84
layernorm(2, act=relu, affine=false)(4 x 32)/zygote/GPU/oneAPI
40095077.5
ns41438719.5
ns0.97
layernorm(2, act=relu, affine=false)(4 x 32)/zygote/GPU/AMDGPU
390854
ns390804
ns1.00
batchedmm(128, Bsize=512)/forward/CPU/2 thread(s)
14524375
ns14514708
ns1.00
batchedmm(128, Bsize=512)/forward/CPU/4 thread(s)
10107916
ns10142334
ns1.00
batchedmm(128, Bsize=512)/forward/CPU/8 thread(s)
10121667
ns10128041
ns1.00
batchedmm(128, Bsize=512)/forward/CPU/1 thread(s)
27752459
ns27891250
ns1.00
batchedmm(128, Bsize=512)/forward/GPU/CUDA
528552
ns532579.5
ns0.99
batchedmm(128, Bsize=512)/forward/GPU/oneAPI
98194699
ns99192089
ns0.99
batchedmm(128, Bsize=512)/forward/GPU/AMDGPU
402574
ns394344
ns1.02
batchedmm(128, Bsize=512)/zygote/CPU/2 thread(s)
46533666.5
ns46256625
ns1.01
batchedmm(128, Bsize=512)/zygote/CPU/4 thread(s)
33493083
ns33475978.5
ns1.00
batchedmm(128, Bsize=512)/zygote/CPU/8 thread(s)
33509291
ns33502666
ns1.00
batchedmm(128, Bsize=512)/zygote/CPU/1 thread(s)
85143125
ns85530791
ns1.00
batchedmm(128, Bsize=512)/zygote/GPU/CUDA
2860259.5
ns3411776.5
ns0.84
batchedmm(128, Bsize=512)/zygote/GPU/oneAPI
194954753.5
ns197868624
ns0.99
batchedmm(128, Bsize=512)/zygote/GPU/AMDGPU
3304313.5
ns3281874
ns1.01
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s)
67916
ns66792
ns1.02
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s)
66833.5
ns67791
ns0.99
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s)
69271
ns69583
ns1.00
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s)
70875
ns66542
ns1.07
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/CUDA
101002.5
ns63585
ns1.59
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/oneAPI
3722652
ns3635639
ns1.02
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/AMDGPU
234933
ns238943
ns0.98
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s)
480708.5
ns482792
ns1.00
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s)
482250
ns490208.5
ns0.98
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s)
478395.5
ns443416
ns1.08
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s)
469167
ns443250
ns1.06
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/CUDA
703449
ns333625.5
ns2.11
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/oneAPI
28256303
ns27824814.5
ns1.02
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/AMDGPU
791863.5
ns796928.5
ns0.99
batchnorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/2 thread(s)
500
ns500
ns1
batchnorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/4 thread(s)
583
ns584
ns1.00
batchnorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/8 thread(s)
625
ns584
ns1.07
batchnorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/1 thread(s)
584
ns541
ns1.08
batchnorm(2, act=relu, affine=false)(32 x 32)/forward/GPU/CUDA
32037
ns26261
ns1.22
batchnorm(2, act=relu, affine=false)(32 x 32)/forward/GPU/oneAPI
1235389
ns1201042
ns1.03
batchnorm(2, act=relu, affine=false)(32 x 32)/forward/GPU/AMDGPU
49701
ns46591
ns1.07
batchnorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/2 thread(s)
8959
ns9624.5
ns0.93
batchnorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/4 thread(s)
8667
ns9458
ns0.92
batchnorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/8 thread(s)
9562.5
ns9042
ns1.06
batchnorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/1 thread(s)
8250
ns15042
ns0.55
batchnorm(2, act=relu, affine=false)(32 x 32)/zygote/GPU/CUDA
277434
ns154087.5
ns1.80
batchnorm(2, act=relu, affine=false)(32 x 32)/zygote/GPU/oneAPI
22231771.5
ns22365155
ns0.99
batchnorm(2, act=relu, affine=false)(32 x 32)/zygote/GPU/AMDGPU
376664
ns376374
ns1.00
dense(32, bias=false, act=gelu)(32 x 128)/forward/CPU/2 thread(s)
9625
ns9792
ns0.98
dense(32, bias=false, act=gelu)(32 x 128)/forward/CPU/4 thread(s)
9708
ns9875
ns0.98
dense(32, bias=false, act=gelu)(32 x 128)/forward/CPU/8 thread(s)
9667
ns9834
ns0.98
dense(32, bias=false, act=gelu)(32 x 128)/forward/CPU/1 thread(s)
9625
ns9792
ns0.98
dense(32, bias=false, act=gelu)(32 x 128)/forward/GPU/CUDA
23071
ns21245
ns1.09
dense(32, bias=false, act=gelu)(32 x 128)/forward/GPU/oneAPI
2068828
ns2109275
ns0.98
dense(32, bias=false, act=gelu)(32 x 128)/forward/GPU/AMDGPU
210172
ns207407
ns1.01
dense(32, bias=false, act=gelu)(32 x 128)/zygote/CPU/2 thread(s)
50291
ns45834
ns1.10
dense(32, bias=false, act=gelu)(32 x 128)/zygote/CPU/4 thread(s)
50417
ns46083
ns1.09
dense(32, bias=false, act=gelu)(32 x 128)/zygote/CPU/8 thread(s)
50875
ns48125
ns1.06
dense(32, bias=false, act=gelu)(32 x 128)/zygote/CPU/1 thread(s)
50583
ns46000
ns1.10
dense(32, bias=false, act=gelu)(32 x 128)/zygote/GPU/CUDA
274332
ns181985
ns1.51
dense(32, bias=false, act=gelu)(32 x 128)/zygote/GPU/oneAPI
12829198
ns12501764
ns1.03
dense(32, bias=false, act=gelu)(32 x 128)/zygote/GPU/AMDGPU
609897
ns599026
ns1.02
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s)
55125
ns56292
ns0.98
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s)
55875
ns57208
ns0.98
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s)
55917
ns57083
ns0.98
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s)
55917
ns57875
ns0.97
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/CUDA
27736
ns22599
ns1.23
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/oneAPI
1172928.5
ns1231700.5
ns0.95
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/AMDGPU
203487.5
ns210062.5
ns0.97
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s)
530167
ns491041.5
ns1.08
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s)
505208
ns503250
ns1.00
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s)
508854.5
ns465875
ns1.09
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s)
467249.5
ns440959
ns1.06
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/CUDA
234697
ns153666
ns1.53
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/oneAPI
33825831.5
ns33436886
ns1.01
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/AMDGPU
884779
ns880644
ns1.00
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s)
652145.5
ns646396
ns1.01
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s)
645666
ns656479
ns0.98
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s)
651250
ns592854.5
ns1.10
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s)
641645.5
ns616145.5
ns1.04
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/GPU/CUDA
186754
ns128259.5
ns1.46
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/GPU/oneAPI
8762924.5
ns8403444.5
ns1.04
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/GPU/AMDGPU
301613
ns302363
ns1.00
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s)
2247709
ns2232145.5
ns1.01
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s)
2260312.5
ns2230708
ns1.01
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s)
2234500
ns2231875
ns1.00
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s)
2220417
ns2259375
ns0.98
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/CUDA
904539
ns617840
ns1.46
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/oneAPI
49644380
ns50658009
ns0.98
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/AMDGPU
1208692
ns1318863
ns0.92
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s)
21125
ns22542
ns0.94
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s)
23708.5
ns19458
ns1.22
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s)
22229
ns22583
ns0.98
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s)
22083
ns19458
ns1.13
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/CUDA
106895
ns64266
ns1.66
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/oneAPI
3512219
ns3671624.5
ns0.96
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/AMDGPU
79091
ns79151
ns1.00
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s)
265500
ns224000
ns1.19
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s)
222041
ns254083
ns0.87
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s)
232667
ns221708
ns1.05
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s)
258208
ns220750
ns1.17
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/CUDA
700202.5
ns347077
ns2.02
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/oneAPI
26916118
ns25817148
ns1.04
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/AMDGPU
555335
ns554175.5
ns1.00
batchnorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/2 thread(s)
500
ns541
ns0.92
batchnorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/4 thread(s)
583
ns583
ns1
batchnorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/8 thread(s)
625
ns584
ns1.07
batchnorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/1 thread(s)
583
ns500
ns1.17
batchnorm(2, act=relu, affine=true)(32 x 32)/forward/GPU/CUDA
22789
ns18626
ns1.22
batchnorm(2, act=relu, affine=true)(32 x 32)/forward/GPU/oneAPI
1272891.5
ns1230354
ns1.03
batchnorm(2, act=relu, affine=true)(32 x 32)/forward/GPU/AMDGPU
48001
ns48171
ns1.00
batchnorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/2 thread(s)
9083
ns8917
ns1.02
batchnorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/4 thread(s)
9083
ns9875
ns0.92
batchnorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/8 thread(s)
9938
ns9812.5
ns1.01
batchnorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/1 thread(s)
9875
ns9270.5
ns1.07
batchnorm(2, act=relu, affine=true)(32 x 32)/zygote/GPU/CUDA
259037
ns136039.5
ns1.90
batchnorm(2, act=relu, affine=true)(32 x 32)/zygote/GPU/oneAPI
24963258.5
ns29131754
ns0.86
batchnorm(2, act=relu, affine=true)(32 x 32)/zygote/GPU/AMDGPU
395944
ns399044
ns0.99
groupnorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/2 thread(s)
7875
ns8042
ns0.98
groupnorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/4 thread(s)
10062.5
ns9312.5
ns1.08
groupnorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/8 thread(s)
10520.5
ns11292
ns0.93
groupnorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/1 thread(s)
10583
ns8334
ns1.27
groupnorm(2, act=identity, affine=false)(4 x 32)/forward/GPU/CUDA
113441
ns57821.5
ns1.96
groupnorm(2, act=identity, affine=false)(4 x 32)/forward/GPU/oneAPI
3336239
ns3436196.5
ns0.97
groupnorm(2, act=identity, affine=false)(4 x 32)/forward/GPU/AMDGPU
70210
ns69710.5
ns1.01
groupnorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/2 thread(s)
7833.5
ns7125
ns1.10
groupnorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/4 thread(s)
7625
ns7791
ns0.98
groupnorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/8 thread(s)
7959
ns7875
ns1.01
groupnorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/1 thread(s)
7708
ns7417
ns1.04
groupnorm(2, act=identity, affine=false)(4 x 32)/zygote/GPU/CUDA
472332
ns255977.5
ns1.85
groupnorm(2, act=identity, affine=false)(4 x 32)/zygote/GPU/oneAPI
17695066.5
ns18497059
ns0.96
groupnorm(2, act=identity, affine=false)(4 x 32)/zygote/GPU/AMDGPU
319678
ns319743
ns1.00
bias_activation(2, act=gelu)(2 x 128)/forward/CPU/2 thread(s)
1625
ns1375
ns1.18
bias_activation(2, act=gelu)(2 x 128)/forward/CPU/4 thread(s)
1916
ns1645.5
ns1.16
bias_activation(2, act=gelu)(2 x 128)/forward/CPU/8 thread(s)
2020.5
ns1917
ns1.05
bias_activation(2, act=gelu)(2 x 128)/forward/CPU/1 thread(s)
1583
ns1500
ns1.06
bias_activation(2, act=gelu)(2 x 128)/forward/GPU/CUDA
19708
ns13693.5
ns1.44
bias_activation(2, act=gelu)(2 x 128)/forward/GPU/oneAPI
1164130
ns1186814
ns0.98
bias_activation(2, act=gelu)(2 x 128)/forward/GPU/AMDGPU
189672
ns188882
ns1.00
bias_activation(2, act=gelu)(2 x 128)/zygote/CPU/2 thread(s)
3584
ns3291
ns1.09
bias_activation(2, act=gelu)(2 x 128)/zygote/CPU/4 thread(s)
3750
ns3479.5
ns1.08
bias_activation(2, act=gelu)(2 x 128)/zygote/CPU/8 thread(s)
3792
ns3625
ns1.05
bias_activation(2, act=gelu)(2 x 128)/zygote/CPU/1 thread(s)
3584
ns3291
ns1.09
bias_activation(2, act=gelu)(2 x 128)/zygote/GPU/CUDA
208603
ns104102.5
ns2.00
bias_activation(2, act=gelu)(2 x 128)/zygote/GPU/oneAPI
10416766
ns10382640
ns1.00
bias_activation(2, act=gelu)(2 x 128)/zygote/GPU/AMDGPU
583536
ns575736
ns1.01
bias_activation(512, act=tanh)(512 x 128)/forward/CPU/2 thread(s)
148916.5
ns146979
ns1.01
bias_activation(512, act=tanh)(512 x 128)/forward/CPU/4 thread(s)
127916
ns129042
ns0.99
bias_activation(512, act=tanh)(512 x 128)/forward/CPU/8 thread(s)
130229
ns129875
ns1.00
bias_activation(512, act=tanh)(512 x 128)/forward/CPU/1 thread(s)
225208
ns226000
ns1.00
bias_activation(512, act=tanh)(512 x 128)/forward/GPU/CUDA
22520
ns17312
ns1.30
bias_activation(512, act=tanh)(512 x 128)/forward/GPU/oneAPI
1193772.5
ns1216751.5
ns0.98
bias_activation(512, act=tanh)(512 x 128)/forward/GPU/AMDGPU
40501
ns39935.5
ns1.01
bias_activation(512, act=tanh)(512 x 128)/zygote/CPU/2 thread(s)
160729.5
ns159771
ns1.01
bias_activation(512, act=tanh)(512 x 128)/zygote/CPU/4 thread(s)
123458
ns110521
ns1.12
bias_activation(512, act=tanh)(512 x 128)/zygote/CPU/8 thread(s)
114792
ns136250
ns0.84
bias_activation(512, act=tanh)(512 x 128)/zygote/CPU/1 thread(s)
264249.5
ns251666.5
ns1.05
bias_activation(512, act=tanh)(512 x 128)/zygote/GPU/CUDA
208808
ns118480.5
ns1.76
bias_activation(512, act=tanh)(512 x 128)/zygote/GPU/oneAPI
10974999
ns10669966
ns1.03
bias_activation(512, act=tanh)(512 x 128)/zygote/GPU/AMDGPU
268837.5
ns265838
ns1.01
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s)
7333
ns7292
ns1.01
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s)
5959
ns6041
ns0.99
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s)
5959
ns6042
ns0.99
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s)
10042
ns10250
ns0.98
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/CUDA
32455
ns26774
ns1.21
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/oneAPI
1162105
ns1208039.5
ns0.96
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/AMDGPU
50660
ns48681
ns1.04
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s)
231563
ns219937.5
ns1.05
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s)
235208
ns227375
ns1.03
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s)
235250
ns228667
ns1.03
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s)
214541.5
ns212729.5
ns1.01
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/CUDA
252456
ns177762.5
ns1.42
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/oneAPI
25967285
ns28372083
ns0.92
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/AMDGPU
597316
ns589856
ns1.01
layernorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/2 thread(s)
12542
ns15958
ns0.79
layernorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/4 thread(s)
12833
ns16208.5
ns0.79
layernorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/8 thread(s)
14188
ns16687.5
ns0.85
layernorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/1 thread(s)
13625
ns14792
ns0.92
layernorm(2, act=gelu, affine=false)(32 x 32)/forward/GPU/CUDA
131390
ns63622
ns2.07
layernorm(2, act=gelu, affine=false)(32 x 32)/forward/GPU/oneAPI
5550478
ns5760147.5
ns0.96
layernorm(2, act=gelu, affine=false)(32 x 32)/forward/GPU/AMDGPU
233213
ns227543
ns1.02
layernorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/2 thread(s)
23666
ns23916
ns0.99
layernorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/4 thread(s)
23229.5
ns24500
ns0.95
layernorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/8 thread(s)
24625
ns23458
ns1.05
layernorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/1 thread(s)
23834
ns23000
ns1.04
layernorm(2, act=gelu, affine=false)(32 x 32)/zygote/GPU/CUDA
815266
ns431176
ns1.89
layernorm(2, act=gelu, affine=false)(32 x 32)/zygote/GPU/oneAPI
40999180
ns42796325.5
ns0.96
layernorm(2, act=gelu, affine=false)(32 x 32)/zygote/GPU/AMDGPU
686982
ns675657
ns1.02
groupnorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/2 thread(s)
8667
ns9167
ns0.95
groupnorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/4 thread(s)
10208
ns9834
ns1.04
groupnorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/8 thread(s)
10729
ns11021
ns0.97
groupnorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/1 thread(s)
9250
ns8729.5
ns1.06
groupnorm(2, act=identity, affine=true)(32 x 32)/forward/GPU/CUDA
116988
ns64004
ns1.83
groupnorm(2, act=identity, affine=true)(32 x 32)/forward/GPU/oneAPI
3575157.5
ns3525023
ns1.01
groupnorm(2, act=identity, affine=true)(32 x 32)/forward/GPU/AMDGPU
73990
ns73491
ns1.01
groupnorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/2 thread(s)
14459
ns14292
ns1.01
groupnorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/4 thread(s)
13833
ns13729
ns1.01
groupnorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/8 thread(s)
14395.5
ns14208
ns1.01
groupnorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/1 thread(s)
14250
ns13459
ns1.06
groupnorm(2, act=identity, affine=true)(32 x 32)/zygote/GPU/CUDA
625118
ns323073
ns1.93
groupnorm(2, act=identity, affine=true)(32 x 32)/zygote/GPU/oneAPI
21155361
ns21480471
ns0.98
groupnorm(2, act=identity, affine=true)(32 x 32)/zygote/GPU/AMDGPU
365734
ns371404
ns0.98
groupnorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/2 thread(s)
8959
ns8083
ns1.11
groupnorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/4 thread(s)
9709
ns10416.5
ns0.93
groupnorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/8 thread(s)
10854.5
ns10937.5
ns0.99
groupnorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/1 thread(s)
10291
ns9333
ns1.10
groupnorm(2, act=identity, affine=false)(32 x 32)/forward/GPU/CUDA
116936.5
ns66250
ns1.77
groupnorm(2, act=identity, affine=false)(32 x 32)/forward/GPU/oneAPI
3357605
ns3712952
ns0.90
groupnorm(2, act=identity, affine=false)(32 x 32)/forward/GPU/AMDGPU
73371
ns74871
ns0.98
groupnorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/2 thread(s)
12458
ns12708
ns0.98
groupnorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/4 thread(s)
12270.5
ns13020.5
ns0.94
groupnorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/8 thread(s)
12937.5
ns13333.5
ns0.97
groupnorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/1 thread(s)
13291
ns12417
ns1.07
groupnorm(2, act=identity, affine=false)(32 x 32)/zygote/GPU/CUDA
515797
ns286792
ns1.80
groupnorm(2, act=identity, affine=false)(32 x 32)/zygote/GPU/oneAPI
18556708
ns19725639
ns0.94
groupnorm(2, act=identity, affine=false)(32 x 32)/zygote/GPU/AMDGPU
340984
ns340593.5
ns1.00
batchedmm(2, Bsize=128)/forward/CPU/2 thread(s)
27562
ns29166
ns0.95
batchedmm(2, Bsize=128)/forward/CPU/4 thread(s)
33833.5
ns34604
ns0.98
batchedmm(2, Bsize=128)/forward/CPU/8 thread(s)
31542
ns32229.5
ns0.98
batchedmm(2, Bsize=128)/forward/CPU/1 thread(s)
1750
ns1750
ns1
batchedmm(2, Bsize=128)/forward/GPU/CUDA
16227
ns15001
ns1.08
batchedmm(2, Bsize=128)/forward/GPU/oneAPI
78590127
ns78965877
ns1.00
batchedmm(2, Bsize=128)/forward/GPU/AMDGPU
81231
ns86890
ns0.93
batchedmm(2, Bsize=128)/zygote/CPU/2 thread(s)
5291.5
ns5125
ns1.03
batchedmm(2, Bsize=128)/zygote/CPU/4 thread(s)
4979.5
ns5062.5
ns0.98
batchedmm(2, Bsize=128)/zygote/CPU/8 thread(s)
5416
ns5167
ns1.05
batchedmm(2, Bsize=128)/zygote/CPU/1 thread(s)
6458
ns6292
ns1.03
batchedmm(2, Bsize=128)/zygote/GPU/CUDA
135144
ns99425.5
ns1.36
batchedmm(2, Bsize=128)/zygote/GPU/oneAPI
109913428
ns110379934
ns1.00
batchedmm(2, Bsize=128)/zygote/GPU/AMDGPU
370274
ns383544
ns0.97
batchnorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/2 thread(s)
250
ns292
ns0.86
batchnorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/4 thread(s)
375
ns334
ns1.12
batchnorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/8 thread(s)
375
ns375
ns1
batchnorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/1 thread(s)
375
ns292
ns1.28
batchnorm(2, act=identity, affine=true)(4 x 32)/forward/GPU/CUDA
23995
ns19905
ns1.21
batchnorm(2, act=identity, affine=true)(4 x 32)/forward/GPU/oneAPI
1260687.5
ns1150337
ns1.10
batchnorm(2, act=identity, affine=true)(4 x 32)/forward/GPU/AMDGPU
47300
ns48921
ns0.97
batchnorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/2 thread(s)
6417
ns6292
ns1.02
batchnorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/4 thread(s)
6541
ns6458
ns1.01
batchnorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/8 thread(s)
6584
ns6708
ns0.98
batchnorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/1 thread(s)
6708
ns6208
ns1.08
batchnorm(2, act=identity, affine=true)(4 x 32)/zygote/GPU/CUDA
180494.5
ns127212
ns1.42
batchnorm(2, act=identity, affine=true)(4 x 32)/zygote/GPU/oneAPI
23985458.5
ns23911059
ns1.00
batchnorm(2, act=identity, affine=true)(4 x 32)/zygote/GPU/AMDGPU
386289
ns394834
ns0.98
batchnorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/2 thread(s)
2000
ns1958
ns1.02
batchnorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/4 thread(s)
2084
ns2041
ns1.02
batchnorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/8 thread(s)
2166
ns2042
ns1.06
batchnorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/1 thread(s)
2125
ns1958
ns1.09
batchnorm(2, act=gelu, affine=true)(32 x 32)/forward/GPU/CUDA
25421
ns20510
ns1.24
batchnorm(2, act=gelu, affine=true)(32 x 32)/forward/GPU/oneAPI
1185083
ns1241051
ns0.95
batchnorm(2, act=gelu, affine=true)(32 x 32)/forward/GPU/AMDGPU
206112
ns210527
ns0.98
batchnorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/2 thread(s)
16896
ns16896
ns1
batchnorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/4 thread(s)
17000
ns17125
ns0.99
batchnorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/8 thread(s)
18333
ns16750
ns1.09
batchnorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/1 thread(s)
17292
ns15875
ns1.09
batchnorm(2, act=gelu, affine=true)(32 x 32)/zygote/GPU/CUDA
264717
ns143434
ns1.85
batchnorm(2, act=gelu, affine=true)(32 x 32)/zygote/GPU/oneAPI
25260741
ns25814251
ns0.98
batchnorm(2, act=gelu, affine=true)(32 x 32)/zygote/GPU/AMDGPU
702657
ns704697
ns1.00
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s)
166125
ns174541
ns0.95
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s)
177603.5
ns147000
ns1.21
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s)
148958
ns152688
ns0.98
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s)
148917
ns150916
ns0.99
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/CUDA
187074
ns165022.5
ns1.13
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/oneAPI
7946915
ns7825451.5
ns1.02
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/AMDGPU
226902
ns226202.5
ns1.00
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s)
1327854.5
ns1318770.5
ns1.01
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s)
1318125
ns1320292
ns1.00
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s)
1326521.5
ns1329500
ns1.00
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s)
1295625
ns1333791.5
ns0.97
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/CUDA
844331.5
ns605687
ns1.39
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/oneAPI
47016714
ns46439481
ns1.01
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/AMDGPU
1001545
ns1061786
ns0.94
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s)
32583
ns25084
ns1.30
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s)
26000
ns25042
ns1.04
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s)
26541.5
ns27854
ns0.95
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s)
26124.5
ns24749.5
ns1.06
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/CUDA
226484
ns119756.5
ns1.89
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/oneAPI
7837953
ns7727314
ns1.01
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/AMDGPU
115201
ns116991
ns0.98
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s)
131875
ns131479
ns1.00
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s)
152250
ns171708
ns0.89
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s)
153750
ns127521
ns1.21
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s)
131625
ns117479
ns1.12
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/CUDA
970881
ns551436.5
ns1.76
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/oneAPI
45298380
ns45901726
ns0.99
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/AMDGPU
614061
ns610436
ns1.01
batchnorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/2 thread(s)
250
ns291
ns0.86
batchnorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/4 thread(s)
334
ns375
ns0.89
batchnorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/8 thread(s)
375
ns375
ns1
batchnorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/1 thread(s)
334
ns292
ns1.14
batchnorm(2, act=relu, affine=true)(4 x 32)/forward/GPU/CUDA
22483
ns17730.5
ns1.27
batchnorm(2, act=relu, affine=true)(4 x 32)/forward/GPU/oneAPI
1212860.5
ns1203450
ns1.01
batchnorm(2, act=relu, affine=true)(4 x 32)/forward/GPU/AMDGPU
49500
ns48751
ns1.02
batchnorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/2 thread(s)
6459
ns6416.5
ns1.01
batchnorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/4 thread(s)
6417
ns6542
ns0.98
batchnorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/8 thread(s)
6750
ns6833
ns0.99
batchnorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/1 thread(s)
6563
ns6167
ns1.06
batchnorm(2, act=relu, affine=true)(4 x 32)/zygote/GPU/CUDA
197111
ns136341
ns1.45
batchnorm(2, act=relu, affine=true)(4 x 32)/zygote/GPU/oneAPI
25776994
ns25006525.5
ns1.03
batchnorm(2, act=relu, affine=true)(4 x 32)/zygote/GPU/AMDGPU
390483
ns393949
ns0.99
layernorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/2 thread(s)
5750
ns6666
ns0.86
layernorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/4 thread(s)
6458
ns6625
ns0.97
layernorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/8 thread(s)
6979
ns6917
ns1.01
layernorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/1 thread(s)
5333
ns5666
ns0.94
layernorm(2, act=gelu, affine=true)(4 x 32)/forward/GPU/CUDA
136326
ns72501
ns1.88
layernorm(2, act=gelu, affine=true)(4 x 32)/forward/GPU/oneAPI
5775885
ns5996889
ns0.96
layernorm(2, act=gelu, affine=true)(4 x 32)/forward/GPU/AMDGPU
234652
ns233483
ns1.01
layernorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/2 thread(s)
10000
ns9917
ns1.01
layernorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/4 thread(s)
10334
ns10062.5
ns1.03
layernorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/8 thread(s)
10500
ns10250
ns1.02
layernorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/1 thread(s)
10125
ns9875
ns1.03
layernorm(2, act=gelu, affine=true)(4 x 32)/zygote/GPU/CUDA
837095
ns493431
ns1.70
layernorm(2, act=gelu, affine=true)(4 x 32)/zygote/GPU/oneAPI
40912011
ns41270237
ns0.99
layernorm(2, act=gelu, affine=true)(4 x 32)/zygote/GPU/AMDGPU
671137
ns675326
ns0.99
dense(2, bias=false, act=gelu)(2 x 128)/forward/CPU/2 thread(s)
667
ns666
ns1.00
dense(2, bias=false, act=gelu)(2 x 128)/forward/CPU/4 thread(s)
667
ns667
ns1
dense(2, bias=false, act=gelu)(2 x 128)/forward/CPU/8 thread(s)
750
ns667
ns1.12
dense(2, bias=false, act=gelu)(2 x 128)/forward/CPU/1 thread(s)
708
ns667
ns1.06
dense(2, bias=false, act=gelu)(2 x 128)/forward/GPU/CUDA
22523
ns20029
ns1.12
dense(2, bias=false, act=gelu)(2 x 128)/forward/GPU/oneAPI
2081039
ns2098576
ns0.99
dense(2, bias=false, act=gelu)(2 x 128)/forward/GPU/AMDGPU
208153
ns207872.5
ns1.00
dense(2, bias=false, act=gelu)(2 x 128)/zygote/CPU/2 thread(s)
4833
ns4542
ns1.06
dense(2, bias=false, act=gelu)(2 x 128)/zygote/CPU/4 thread(s)
4917
ns4625
ns1.06
dense(2, bias=false, act=gelu)(2 x 128)/zygote/CPU/8 thread(s)
5250
ns4791
ns1.10
dense(2, bias=false, act=gelu)(2 x 128)/zygote/CPU/1 thread(s)
4875
ns4625
ns1.05
dense(2, bias=false, act=gelu)(2 x 128)/zygote/GPU/CUDA
217286
ns167220.5
ns1.30
dense(2, bias=false, act=gelu)(2 x 128)/zygote/GPU/oneAPI
10433598
ns9409031.5
ns1.11
dense(2, bias=false, act=gelu)(2 x 128)/zygote/GPU/AMDGPU
579966
ns577916
ns1.00
groupnorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/2 thread(s)
7709
ns7854
ns0.98
groupnorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/4 thread(s)
9042
ns8875
ns1.02
groupnorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/8 thread(s)
10083.5
ns9750
ns1.03
groupnorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/1 thread(s)
9125
ns7334
ns1.24
groupnorm(2, act=identity, affine=true)(4 x 32)/forward/GPU/CUDA
114799.5
ns72767.5
ns1.58
groupnorm(2, act=identity, affine=true)(4 x 32)/forward/GPU/oneAPI
3549639
ns3713250
ns0.96
groupnorm(2, act=identity, affine=true)(4 x 32)/forward/GPU/AMDGPU
74946
ns77435.5
ns0.97
groupnorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/2 thread(s)
9125
ns8167
ns1.12
groupnorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/4 thread(s)
8459
ns8354.5
ns1.01
groupnorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/8 thread(s)
8958
ns9041
ns0.99
groupnorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/1 thread(s)
8542
ns8417
ns1.01
groupnorm(2, act=identity, affine=true)(4 x 32)/zygote/GPU/CUDA
551169
ns372607.5
ns1.48
groupnorm(2, act=identity, affine=true)(4 x 32)/zygote/GPU/oneAPI
20846570.5
ns21133871
ns0.99
groupnorm(2, act=identity, affine=true)(4 x 32)/zygote/GPU/AMDGPU
344893
ns345814
ns1.00
batchedmm(128, Bsize=4)/forward/CPU/2 thread(s)
126604.5
ns125395.5
ns1.01
batchedmm(128, Bsize=4)/forward/CPU/4 thread(s)
129541
ns129042
ns1.00
batchedmm(128, Bsize=4)/forward/CPU/8 thread(s)
130458
ns129959
ns1.00
batchedmm(128, Bsize=4)/forward/CPU/1 thread(s)
182896
ns180916
ns1.01
batchedmm(128, Bsize=4)/forward/GPU/CUDA
46147
ns44539
ns1.04
batchedmm(128, Bsize=4)/forward/GPU/oneAPI
73869604
ns75228887
ns0.98
batchedmm(128, Bsize=4)/forward/GPU/AMDGPU
104461
ns100291
ns1.04
batchedmm(128, Bsize=4)/zygote/CPU/2 thread(s)
341208
ns310917
ns1.10
batchedmm(128, Bsize=4)/zygote/CPU/4 thread(s)
327416
ns313833
ns1.04
batchedmm(128, Bsize=4)/zygote/CPU/8 thread(s)
345562.5
ns324083.5
ns1.07
batchedmm(128, Bsize=4)/zygote/CPU/1 thread(s)
569312.5
ns600354
ns0.95
batchedmm(128, Bsize=4)/zygote/GPU/CUDA
183705
ns150808
ns1.22
batchedmm(128, Bsize=4)/zygote/GPU/oneAPI
92315110
ns91943409
ns1.00
batchedmm(128, Bsize=4)/zygote/GPU/AMDGPU
502435
ns502450
ns1.00
dense(512, bias=true, act=identity)(512 x 128)/forward/CPU/2 thread(s)
399333
ns396750
ns1.01
dense(512, bias=true, act=identity)(512 x 128)/forward/CPU/4 thread(s)
288167
ns288145.5
ns1.00
dense(512, bias=true, act=identity)(512 x 128)/forward/CPU/8 thread(s)
288020.5
ns287583
ns1.00
dense(512, bias=true, act=identity)(512 x 128)/forward/CPU/1 thread(s)
755875
ns756625
ns1.00
dense(512, bias=true, act=identity)(512 x 128)/forward/GPU/CUDA
43522.5
ns40964
ns1.06
dense(512, bias=true, act=identity)(512 x 128)/forward/GPU/oneAPI
1347689.5
ns1391370
ns0.97
dense(512, bias=true, act=identity)(512 x 128)/forward/GPU/AMDGPU
80731
ns81511
ns0.99
dense(512, bias=true, act=identity)(512 x 128)/zygote/CPU/2 thread(s)
1404291.5
ns1449583.5
ns0.97
dense(512, bias=true, act=identity)(512 x 128)/zygote/CPU/4 thread(s)
1136208
ns1136667
ns1.00
dense(512, bias=true, act=identity)(512 x 128)/zygote/CPU/8 thread(s)
1136375
ns1134771
ns1.00
dense(512, bias=true, act=identity)(512 x 128)/zygote/CPU/1 thread(s)
2442125
ns2361041.5
ns1.03
dense(512, bias=true, act=identity)(512 x 128)/zygote/GPU/CUDA
242542
ns207930
ns1.17
dense(512, bias=true, act=identity)(512 x 128)/zygote/GPU/oneAPI
9970984
ns10356148
ns0.96
dense(512, bias=true, act=identity)(512 x 128)/zygote/GPU/AMDGPU
353108.5
ns355144
ns0.99
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s)
643667
ns647292
ns0.99
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s)
649416
ns578500
ns1.12
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s)
646791.5
ns639416
ns1.01
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s)
640271.5
ns656333
ns0.98
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/CUDA
184288
ns154081
ns1.20
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/oneAPI
8427619.5
ns8771052.5
ns0.96
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/AMDGPU
303113
ns306073.5
ns0.99
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s)
2480000
ns2453625
ns1.01
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s)
2441417
ns2424291
ns1.01
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s)
2445375
ns2442542
ns1.00
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s)
2435667
ns2470583
ns0.99
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/CUDA
927220.5
ns767152.5
ns1.21
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/oneAPI
53936788.5
ns52532777
ns1.03
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/AMDGPU
1316013
ns1399204.5
ns0.94
batchedmm(2, Bsize=32)/forward/CPU/2 thread(s)
33875
ns32604
ns1.04
batchedmm(2, Bsize=32)/forward/CPU/4 thread(s)
35271
ns36937.5
ns0.95
batchedmm(2, Bsize=32)/forward/CPU/8 thread(s)
34937.5
ns34542
ns1.01
batchedmm(2, Bsize=32)/forward/CPU/1 thread(s)
917
ns917
ns1
batchedmm(2, Bsize=32)/forward/GPU/CUDA
15816
ns14042
ns1.13
batchedmm(2, Bsize=32)/forward/GPU/oneAPI
76295890
ns78232811.5
ns0.98
batchedmm(2, Bsize=32)/forward/GPU/AMDGPU
79581
ns79530
ns1.00
batchedmm(2, Bsize=32)/zygote/CPU/2 thread(s)
3209
ns3000
ns1.07
batchedmm(2, Bsize=32)/zygote/CPU/4 thread(s)
3291
ns3084
ns1.07
batchedmm(2, Bsize=32)/zygote/CPU/8 thread(s)
3417
ns3333
ns1.03
batchedmm(2, Bsize=32)/zygote/CPU/1 thread(s)
3042
ns3000
ns1.01
batchedmm(2, Bsize=32)/zygote/GPU/CUDA
134276.5
ns100283
ns1.34
batchedmm(2, Bsize=32)/zygote/GPU/oneAPI
97067229
ns96545751
ns1.01
batchedmm(2, Bsize=32)/zygote/GPU/AMDGPU
337123
ns337334
ns1.00
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s)
437667
ns405958
ns1.08
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s)
437833
ns408209
ns1.07
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s)
438458.5
ns407958
ns1.07
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s)
447416.5
ns421459
ns1.06
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/CUDA
42161.5
ns36148.5
ns1.17
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/oneAPI
1435529
ns1554049.5
ns0.92
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/AMDGPU
241817.5
ns238757.5
ns1.01
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s)
4145167
ns3868375
ns1.07
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s)
4268333
ns3988562.5
ns1.07
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s)
4271604
ns3992667
ns1.07
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s)
4025417
ns3776708.5
ns1.07
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/CUDA
230700.5
ns193888
ns1.19
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/oneAPI
36716035
ns37305285.5
ns0.98
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/AMDGPU
1427924
ns1433244
ns1.00
dense(32, bias=false, act=identity)(32 x 128)/forward/CPU/2 thread(s)
3875
ns3917
ns0.99
dense(32, bias=false, act=identity)(32 x 128)/forward/CPU/4 thread(s)
3917
ns3917
ns1
dense(32, bias=false, act=identity)(32 x 128)/forward/CPU/8 thread(s)
3958
ns3917
ns1.01
dense(32, bias=false, act=identity)(32 x 128)/forward/CPU/1 thread(s)
3875
ns3917
ns0.99
dense(32, bias=false, act=identity)(32 x 128)/forward/GPU/CUDA
34754
ns32924.5
ns1.06
dense(32, bias=false, act=identity)(32 x 128)/forward/GPU/oneAPI
1243353
ns1242082
ns1.00
dense(32, bias=false, act=identity)(32 x 128)/forward/GPU/AMDGPU
40071
ns37990
ns1.05
dense(32, bias=false, act=identity)(32 x 128)/zygote/CPU/2 thread(s)
15458
ns15708
ns0.98
dense(32, bias=false, act=identity)(32 x 128)/zygote/CPU/4 thread(s)
16042
ns15750
ns1.02
dense(32, bias=false, act=identity)(32 x 128)/zygote/CPU/8 thread(s)
15875
ns15958
ns0.99
dense(32, bias=false, act=identity)(32 x 128)/zygote/CPU/1 thread(s)
15625
ns15500
ns1.01
dense(32, bias=false, act=identity)(32 x 128)/zygote/GPU/CUDA
252802
ns189381
ns1.33
dense(32, bias=false, act=identity)(32 x 128)/zygote/GPU/oneAPI
8940938
ns9458441
ns0.95
dense(32, bias=false, act=identity)(32 x 128)/zygote/GPU/AMDGPU
171532
ns169642
ns1.01
dense(512, bias=false, act=relu)(512 x 128)/forward/CPU/2 thread(s)
404167
ns404708
ns1.00
dense(512, bias=false, act=relu)(512 x 128)/forward/CPU/4 thread(s)
295916
ns295750
ns1.00
dense(512, bias=false, act=relu)(512 x 128)/forward/CPU/8 thread(s)
296417
ns295958
ns1.00
dense(512, bias=false, act=relu)(512 x 128)/forward/CPU/1 thread(s)
760709
ns761125
ns1.00
dense(512, bias=false, act=relu)(512 x 128)/forward/GPU/CUDA
113187
ns117898
ns0.96
dense(512, bias=false, act=relu)(512 x 128)/forward/GPU/oneAPI
1044690.5
ns1045095
ns1.00
dense(512, bias=false, act=relu)(512 x 128)/forward/GPU/AMDGPU
89211
ns87241
ns1.02
dense(512, bias=false, act=relu)(512 x 128)/zygote/CPU/2 thread(s)
1444875
ns1478500
ns0.98
dense(512, bias=false, act=relu)(512 x 128)/zygote/CPU/4 thread(s)
1158416
ns1159645.5
ns1.00
dense(512, bias=false, act=relu)(512 x 128)/zygote/CPU/8 thread(s)
1158333
ns1158042
ns1.00
dense(512, bias=false, act=relu)(512 x 128)/zygote/CPU/1 thread(s)
2464875
ns2384583
ns1.03
dense(512, bias=false, act=relu)(512 x 128)/zygote/GPU/CUDA
231034
ns189114
ns1.22
dense(512, bias=false, act=relu)(512 x 128)/zygote/GPU/oneAPI
10580994
ns9516529.5
ns1.11
dense(512, bias=false, act=relu)(512 x 128)/zygote/GPU/AMDGPU
352438
ns351188
ns1.00
batchnorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/2 thread(s)
583
ns500
ns1.17
batchnorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/4 thread(s)
625
ns583
ns1.07
batchnorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/8 thread(s)
583
ns583
ns1
batchnorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/1 thread(s)
625
ns500
ns1.25
batchnorm(2, act=gelu, affine=true)(4 x 32)/forward/GPU/CUDA
24556
ns18799
ns1.31
batchnorm(2, act=gelu, affine=true)(4 x 32)/forward/GPU/oneAPI
1215077.5
ns1188091
ns1.02
batchnorm(2, act=gelu, affine=true)(4 x 32)/forward/GPU/AMDGPU
207412
ns205912
ns1.01
batchnorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/2 thread(s)
7542
ns7292
ns1.03
batchnorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/4 thread(s)
7916
ns7583
ns1.04
batchnorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/8 thread(s)
7958.5
ns7917
ns1.01
batchnorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/1 thread(s)
7708
ns7375
ns1.05
batchnorm(2, act=gelu, affine=true)(4 x 32)/zygote/GPU/CUDA
202724.5
ns141427.5
ns1.43
batchnorm(2, act=gelu, affine=true)(4 x 32)/zygote/GPU/oneAPI
26299646
ns26708173
ns0.98
batchnorm(2, act=gelu, affine=true)(4 x 32)/zygote/GPU/AMDGPU
691927
ns683937
ns1.01
batchedmm(128, Bsize=32)/forward/CPU/2 thread(s)
833708.5
ns833042
ns1.00
batchedmm(128, Bsize=32)/forward/CPU/4 thread(s)
617667
ns621208
ns0.99
batchedmm(128, Bsize=32)/forward/CPU/8 thread(s)
620250
ns621791
ns1.00
batchedmm(128, Bsize=32)/forward/CPU/1 thread(s)
1558000
ns1550917
ns1.00
batchedmm(128, Bsize=32)/forward/GPU/CUDA
134627
ns134056.5
ns1.00
batchedmm(128, Bsize=32)/forward/GPU/oneAPI
75767504.5
ns77301649
ns0.98
batchedmm(128, Bsize=32)/forward/GPU/AMDGPU
232042
ns227902
ns1.02
batchedmm(128, Bsize=32)/zygote/CPU/2 thread(s)
2690520.5
ns2692167
ns1.00
batchedmm(128, Bsize=32)/zygote/CPU/4 thread(s)
2001666.5
ns1995500
ns1.00
batchedmm(128, Bsize=32)/zygote/CPU/8 thread(s)
2002375
ns2004812.5
ns1.00
batchedmm(128, Bsize=32)/zygote/CPU/1 thread(s)
4923459
ns4935000
ns1.00
batchedmm(128, Bsize=32)/zygote/GPU/CUDA
232967
ns249781
ns0.93
batchedmm(128, Bsize=32)/zygote/GPU/oneAPI
99203033
ns100408887.5
ns0.99
batchedmm(128, Bsize=32)/zygote/GPU/AMDGPU
768327.5
ns840633.5
ns0.91
batchnorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/2 thread(s)
250
ns291
ns0.86
batchnorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/4 thread(s)
334
ns375
ns0.89
batchnorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/8 thread(s)
375
ns375
ns1
batchnorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/1 thread(s)
375
ns292
ns1.28
batchnorm(2, act=relu, affine=false)(4 x 32)/forward/GPU/CUDA
31737
ns25355
ns1.25
batchnorm(2, act=relu, affine=false)(4 x 32)/forward/GPU/oneAPI
1097642
ns1311449
ns0.84
batchnorm(2, act=relu, affine=false)(4 x 32)/forward/GPU/AMDGPU
46990
ns46990
ns1
batchnorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/2 thread(s)
6250
ns6209
ns1.01
batchnorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/4 thread(s)
6334
ns6708.5
ns0.94
batchnorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/8 thread(s)
6667
ns6542
ns1.02
batchnorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/1 thread(s)
6500
ns6125
ns1.06
batchnorm(2, act=relu, affine=false)(4 x 32)/zygote/GPU/CUDA
216848.5
ns157347
ns1.38
batchnorm(2, act=relu, affine=false)(4 x 32)/zygote/GPU/oneAPI
22868710.5
ns21691879
ns1.05
batchnorm(2, act=relu, affine=false)(4 x 32)/zygote/GPU/AMDGPU
363084
ns365484
ns0.99
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s)
1756083
ns2366042
ns0.74
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s)
1773708.5
ns2395500
ns0.74
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s)
1731875
ns2374083
ns0.73
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s)
1723709
ns2382167
ns0.72
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/CUDA
185580
ns170643
ns1.09
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/oneAPI
8097715
ns8487051
ns0.95
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/AMDGPU
375774
ns374764
ns1.00
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s)
4363834
ns4646208
ns0.94
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s)
4360063
ns4643687
ns0.94
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s)
4378875
ns4660250
ns0.94
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s)
4369520.5
ns4569374.5
ns0.96
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/CUDA
829356
ns714837
ns1.16
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/oneAPI
48033448.5
ns50175411.5
ns0.96
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/AMDGPU
1396403.5
ns1351724
ns1.03
bias_activation(512, act=relu)(512 x 128)/forward/CPU/2 thread(s)
7146
ns7208
ns0.99
bias_activation(512, act=relu)(512 x 128)/forward/CPU/4 thread(s)
9833
ns7084
ns1.39
bias_activation(512, act=relu)(512 x 128)/forward/CPU/8 thread(s)
7250
ns7208
ns1.01
bias_activation(512, act=relu)(512 x 128)/forward/CPU/1 thread(s)
6875
ns6833
ns1.01
bias_activation(512, act=relu)(512 x 128)/forward/GPU/CUDA
21835
ns16063.5
ns1.36
bias_activation(512, act=relu)(512 x 128)/forward/GPU/oneAPI
1202854
ns1173405
ns1.03
bias_activation(512, act=relu)(512 x 128)/forward/GPU/AMDGPU
40090.5
ns39030
ns1.03
bias_activation(512, act=relu)(512 x 128)/zygote/CPU/2 thread(s)
68125
ns63792
ns1.07
bias_activation(512, act=relu)(512 x 128)/zygote/CPU/4 thread(s)
66458.5
ns32833
ns2.02
bias_activation(512, act=relu)(512 x 128)/zygote/CPU/8 thread(s)
51312.5
ns45917
ns1.12
bias_activation(512, act=relu)(512 x 128)/zygote/CPU/1 thread(s)
32958.5
ns45229.5
ns0.73
bias_activation(512, act=relu)(512 x 128)/zygote/GPU/CUDA
205180
ns163785
ns1.25
bias_activation(512, act=relu)(512 x 128)/zygote/GPU/oneAPI
10713432
ns10469728.5
ns1.02
bias_activation(512, act=relu)(512 x 128)/zygote/GPU/AMDGPU
269342.5
ns262653
ns1.03
batchedmm(2, Bsize=512)/forward/CPU/2 thread(s)
22083.5
ns20584
ns1.07
batchedmm(2, Bsize=512)/forward/CPU/4 thread(s)
25042
ns26208
ns0.96
batchedmm(2, Bsize=512)/forward/CPU/8 thread(s)
24666.5
ns23542
ns1.05
batchedmm(2, Bsize=512)/forward/CPU/1 thread(s)
5583
ns5125
ns1.09
batchedmm(2, Bsize=512)/forward/GPU/CUDA
17692
ns16017
ns1.10
batchedmm(2, Bsize=512)/forward/GPU/oneAPI
89463574
ns90340662
ns0.99
batchedmm(2, Bsize=512)/forward/GPU/AMDGPU
84500.5
ns84110
ns1.00
batchedmm(2, Bsize=512)/zygote/CPU/2 thread(s)
12041
ns11791
ns1.02
batchedmm(2, Bsize=512)/zygote/CPU/4 thread(s)
10167
ns10229.5
ns0.99
batchedmm(2, Bsize=512)/zygote/CPU/8 thread(s)
10584
ns10625
ns1.00
batchedmm(2, Bsize=512)/zygote/CPU/1 thread(s)
17770.5
ns17895.5
ns0.99
batchedmm(2, Bsize=512)/zygote/GPU/CUDA
217435
ns159148
ns1.37
batchedmm(2, Bsize=512)/zygote/GPU/oneAPI
145607100.5
ns149555538
ns0.97
batchedmm(2, Bsize=512)/zygote/GPU/AMDGPU
372684
ns367264
ns1.01
dense(512, bias=true, act=relu)(512 x 128)/forward/CPU/2 thread(s)
406209
ns406500
ns1.00
dense(512, bias=true, act=relu)(512 x 128)/forward/CPU/4 thread(s)
297375
ns297583
ns1.00
dense(512, bias=true, act=relu)(512 x 128)/forward/CPU/8 thread(s)
297291
ns297250
ns1.00
dense(512, bias=true, act=relu)(512 x 128)/forward/CPU/1 thread(s)
762584
ns762791
ns1.00
dense(512, bias=true, act=relu)(512 x 128)/forward/GPU/CUDA
46433
ns43249
ns1.07
dense(512, bias=true, act=relu)(512 x 128)/forward/GPU/oneAPI
1403980.5
ns1362482
ns1.03
dense(512, bias=true, act=relu)(512 x 128)/forward/GPU/AMDGPU
89281
ns87411
ns1.02
dense(512, bias=true, act=relu)(512 x 128)/zygote/CPU/2 thread(s)
1428979
ns1484125.5
ns0.96
dense(512, bias=true, act=relu)(512 x 128)/zygote/CPU/4 thread(s)
1164271
ns1167542
ns1.00
dense(512, bias=true, act=relu)(512 x 128)/zygote/CPU/8 thread(s)
1168292
ns1161667
ns1.01
dense(512, bias=true, act=relu)(512 x 128)/zygote/CPU/1 thread(s)
2470833
ns2387604.5
ns1.03
dense(512, bias=true, act=relu)(512 x 128)/zygote/GPU/CUDA
271835
ns213476
ns1.27
dense(512, bias=true, act=relu)(512 x 128)/zygote/GPU/oneAPI
11893591
ns13925589
ns0.85
dense(512, bias=true, act=relu)(512 x 128)/zygote/GPU/AMDGPU
378099
ns377604
ns1.00
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s)
437000
ns433583
ns1.01
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s)
440041
ns436917
ns1.01
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s)
438959
ns436666
ns1.01
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s)
449417
ns448291
ns1.00
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/CUDA
53469
ns45983
ns1.16
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/oneAPI
1006988
ns1048211.5
ns0.96
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/AMDGPU
234822
ns234192
ns1.00
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s)
4132333
ns3894625
ns1.06
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s)
4262646
ns4022709
ns1.06
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s)
4266645.5
ns4024624.5
ns1.06
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s)
4029729
ns3801916.5
ns1.06
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/CUDA
251074
ns210260
ns1.19
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/oneAPI
31753545
ns32692776
ns0.97
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/AMDGPU
1374018.5
ns1361238
ns1.01
dense(32, bias=true, act=gelu)(32 x 128)/forward/CPU/2 thread(s)
9542
ns8709
ns1.10
dense(32, bias=true, act=gelu)(32 x 128)/forward/CPU/4 thread(s)
8167
ns7667
ns1.07
dense(32, bias=true, act=gelu)(32 x 128)/forward/CPU/8 thread(s)
8167
ns7667
ns1.07
dense(32, bias=true, act=gelu)(32 x 128)/forward/CPU/1 thread(s)
13417
ns12375
ns1.08
dense(32, bias=true, act=gelu)(32 x 128)/forward/GPU/CUDA
23409
ns20402
ns1.15
dense(32, bias=true, act=gelu)(32 x 128)/forward/GPU/oneAPI
2209102
ns2188548.5
ns1.01
dense(32, bias=true, act=gelu)(32 x 128)/forward/GPU/AMDGPU
211472
ns210772
ns1.00
dense(32, bias=true, act=gelu)(32 x 128)/zygote/CPU/2 thread(s)
49709
ns45041
ns1.10
dense(32, bias=true, act=gelu)(32 x 128)/zygote/CPU/4 thread(s)
49667
ns45208
ns1.10
dense(32, bias=true, act=gelu)(32 x 128)/zygote/CPU/8 thread(s)
50250
ns45208
ns1.11
dense(32, bias=true, act=gelu)(32 x 128)/zygote/CPU/1 thread(s)
49792
ns44708
ns1.11
dense(32, bias=true, act=gelu)(32 x 128)/zygote/GPU/CUDA
333916
ns253192
ns1.32
dense(32, bias=true, act=gelu)(32 x 128)/zygote/GPU/oneAPI
10942581.5
ns14008146.5
ns0.78
dense(32, bias=true, act=gelu)(32 x 128)/zygote/GPU/AMDGPU
658106.5
ns653917
ns1.01
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s)
84687.5
ns82979
ns1.02
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s)
90459
ns126104.5
ns0.72
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s)
85792
ns86229.5
ns0.99
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s)
84021
ns84875
ns0.99
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/CUDA
191047
ns184626
ns1.03
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/oneAPI
5785931
ns6066708
ns0.95
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/AMDGPU
222432
ns219662
ns1.01
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s)
2027833
ns2017833
ns1.00
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s)
2014979.5
ns2016000
ns1.00
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s)
2016229.5
ns2006375
ns1.00
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s)
2015812.5
ns2025083
ns1.00
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/CUDA
505179
ns496955.5
ns1.02
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/oneAPI
28452120
ns27423881
ns1.04
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/AMDGPU
1086300
ns1040810
ns1.04
This comment was automatically generated by workflow using github-action-benchmark.