This repository has been archived by the owner on Nov 4, 2024. It is now read-only.
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
9510cfa
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
LuxLib Benchmarks
layernorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/2 thread(s)
5979
ns5479.5
ns1.09
layernorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/4 thread(s)
6083
ns6375
ns0.95
layernorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/8 thread(s)
8208
ns8000
ns1.03
layernorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/1 thread(s)
7209
ns6375
ns1.13
layernorm(2, act=gelu, affine=false)(4 x 32)/forward/GPU/CUDA
118214
ns119198
ns0.99
layernorm(2, act=gelu, affine=false)(4 x 32)/forward/GPU/oneAPI
2759609
ns2649209
ns1.04
layernorm(2, act=gelu, affine=false)(4 x 32)/forward/GPU/Metal
712042
ns704000
ns1.01
layernorm(2, act=gelu, affine=false)(4 x 32)/forward/GPU/AMDGPU
604929
ns417764
ns1.45
layernorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/2 thread(s)
9917
ns9812
ns1.01
layernorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/4 thread(s)
10250
ns9625
ns1.06
layernorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/8 thread(s)
10250
ns10042
ns1.02
layernorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/1 thread(s)
9937.5
ns9541
ns1.04
layernorm(2, act=gelu, affine=false)(4 x 32)/zygote/GPU/CUDA
557097
ns551456
ns1.01
layernorm(2, act=gelu, affine=false)(4 x 32)/zygote/GPU/oneAPI
17173060
ns16841216
ns1.02
layernorm(2, act=gelu, affine=false)(4 x 32)/zygote/GPU/Metal
2390459
ns2645125
ns0.90
layernorm(2, act=gelu, affine=false)(4 x 32)/zygote/GPU/AMDGPU
692707
ns659636
ns1.05
bias_activation(32, act=relu)(32 x 128)/forward/CPU/2 thread(s)
3041
ns1395.5
ns2.18
bias_activation(32, act=relu)(32 x 128)/forward/CPU/4 thread(s)
1417
ns1687.5
ns0.84
bias_activation(32, act=relu)(32 x 128)/forward/CPU/8 thread(s)
1792
ns1875
ns0.96
bias_activation(32, act=relu)(32 x 128)/forward/CPU/1 thread(s)
1833.5
ns2521
ns0.73
bias_activation(32, act=relu)(32 x 128)/forward/GPU/CUDA
21918
ns21867
ns1.00
bias_activation(32, act=relu)(32 x 128)/forward/GPU/oneAPI
1341543
ns1304894
ns1.03
bias_activation(32, act=relu)(32 x 128)/forward/GPU/Metal
198917
ns212604
ns0.94
bias_activation(32, act=relu)(32 x 128)/forward/GPU/AMDGPU
32340
ns30820.5
ns1.05
bias_activation(32, act=relu)(32 x 128)/zygote/CPU/2 thread(s)
4354.5
ns4209
ns1.03
bias_activation(32, act=relu)(32 x 128)/zygote/CPU/4 thread(s)
3792
ns4312.5
ns0.88
bias_activation(32, act=relu)(32 x 128)/zygote/CPU/8 thread(s)
4084
ns3917
ns1.04
bias_activation(32, act=relu)(32 x 128)/zygote/CPU/1 thread(s)
3834
ns4375
ns0.88
bias_activation(32, act=relu)(32 x 128)/zygote/GPU/CUDA
147392
ns146279
ns1.01
bias_activation(32, act=relu)(32 x 128)/zygote/GPU/oneAPI
8702415
ns8894773.5
ns0.98
bias_activation(32, act=relu)(32 x 128)/zygote/GPU/Metal
1537625
ns1523375
ns1.01
bias_activation(32, act=relu)(32 x 128)/zygote/GPU/AMDGPU
156613
ns148982
ns1.05
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s)
57792
ns57542
ns1.00
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s)
46667
ns46584
ns1.00
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s)
47042
ns39875
ns1.18
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s)
83416.5
ns83708
ns1.00
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/CUDA
37067
ns36787
ns1.01
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/oneAPI
549762
ns582007
ns0.94
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/Metal
1031250
ns985625
ns1.05
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/AMDGPU
84262
ns84391
ns1.00
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s)
2026542
ns2036583
ns1.00
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s)
2089291
ns2086750
ns1.00
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s)
2082209
ns2079917
ns1.00
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s)
2004562
ns1987312.5
ns1.01
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/CUDA
229332
ns227214
ns1.01
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/oneAPI
7560863
ns7854957
ns0.96
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/Metal
4611833
ns7818750
ns0.59
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/AMDGPU
1299247
ns967560
ns1.34
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s)
155125
ns154083
ns1.01
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s)
146459
ns146958
ns1.00
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s)
149416
ns149979.5
ns1.00
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s)
171583
ns165187.5
ns1.04
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/GPU/CUDA
166520
ns166381
ns1.00
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/GPU/oneAPI
7685074
ns7795058
ns0.99
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/GPU/Metal
1471541.5
ns1464583
ns1.00
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/GPU/AMDGPU
220947
ns207072
ns1.07
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s)
1112687.5
ns1110895.5
ns1.00
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s)
1109250
ns1103209
ns1.01
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s)
1110041.5
ns1118687
ns0.99
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s)
1115354
ns1109562.5
ns1.01
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/CUDA
715343.5
ns711437
ns1.01
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/oneAPI
34840940
ns33922938.5
ns1.03
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/Metal
6342708
ns6051917
ns1.05
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/AMDGPU
970326.5
ns1036360
ns0.94
layernorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/2 thread(s)
4708
ns5208
ns0.90
layernorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/4 thread(s)
5812
ns4271
ns1.36
layernorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/8 thread(s)
6125
ns5375
ns1.14
layernorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/1 thread(s)
5333
ns4584
ns1.16
layernorm(2, act=relu, affine=true)(4 x 32)/forward/GPU/CUDA
94815
ns94268
ns1.01
layernorm(2, act=relu, affine=true)(4 x 32)/forward/GPU/oneAPI
5182206
ns5136056
ns1.01
layernorm(2, act=relu, affine=true)(4 x 32)/forward/GPU/Metal
655209
ns711583
ns0.92
layernorm(2, act=relu, affine=true)(4 x 32)/forward/GPU/AMDGPU
71833
ns69481
ns1.03
layernorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/2 thread(s)
8625
ns8667
ns1.00
layernorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/4 thread(s)
8625
ns8500
ns1.01
layernorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/8 thread(s)
8708
ns8917
ns0.98
layernorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/1 thread(s)
8542
ns8333
ns1.03
layernorm(2, act=relu, affine=true)(4 x 32)/zygote/GPU/CUDA
606843
ns603970
ns1.00
layernorm(2, act=relu, affine=true)(4 x 32)/zygote/GPU/oneAPI
38524595
ns33683319.5
ns1.14
layernorm(2, act=relu, affine=true)(4 x 32)/zygote/GPU/Metal
5543417
ns5821292
ns0.95
layernorm(2, act=relu, affine=true)(4 x 32)/zygote/GPU/AMDGPU
415994
ns389889
ns1.07
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s)
18125
ns17729.5
ns1.02
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s)
18229.5
ns20042
ns0.91
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s)
21167
ns20584
ns1.03
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s)
18958
ns20416.5
ns0.93
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/GPU/CUDA
67302
ns66995
ns1.00
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/GPU/oneAPI
3257619
ns2897295
ns1.12
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/GPU/Metal
1274125
ns1301292
ns0.98
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/GPU/AMDGPU
85063
ns73931
ns1.15
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s)
211791
ns211625
ns1.00
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s)
212833
ns218875
ns0.97
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s)
220396
ns218667
ns1.01
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s)
219479
ns224875
ns0.98
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/CUDA
359730
ns357740
ns1.01
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/oneAPI
13755121.5
ns14308445
ns0.96
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/Metal
5707854
ns5704396
ns1.00
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/AMDGPU
496728
ns473855
ns1.05
bias_activation(2, act=relu)(2 x 128)/forward/CPU/2 thread(s)
750
ns625
ns1.20
bias_activation(2, act=relu)(2 x 128)/forward/CPU/4 thread(s)
770.5
ns666
ns1.16
bias_activation(2, act=relu)(2 x 128)/forward/CPU/8 thread(s)
812.5
ns750
ns1.08
bias_activation(2, act=relu)(2 x 128)/forward/CPU/1 thread(s)
625
ns666
ns0.94
bias_activation(2, act=relu)(2 x 128)/forward/GPU/CUDA
20915
ns20965
ns1.00
bias_activation(2, act=relu)(2 x 128)/forward/GPU/oneAPI
1138496
ns1157358.5
ns0.98
bias_activation(2, act=relu)(2 x 128)/forward/GPU/Metal
283312.5
ns283542
ns1.00
bias_activation(2, act=relu)(2 x 128)/forward/GPU/AMDGPU
34981
ns32571
ns1.07
bias_activation(2, act=relu)(2 x 128)/zygote/CPU/2 thread(s)
1458
ns1375
ns1.06
bias_activation(2, act=relu)(2 x 128)/zygote/CPU/4 thread(s)
1458
ns1375
ns1.06
bias_activation(2, act=relu)(2 x 128)/zygote/CPU/8 thread(s)
1458
ns1500
ns0.97
bias_activation(2, act=relu)(2 x 128)/zygote/CPU/1 thread(s)
1375
ns1334
ns1.03
bias_activation(2, act=relu)(2 x 128)/zygote/GPU/CUDA
127115.5
ns125947
ns1.01
bias_activation(2, act=relu)(2 x 128)/zygote/GPU/oneAPI
8947782
ns8433349.5
ns1.06
bias_activation(2, act=relu)(2 x 128)/zygote/GPU/Metal
1487333
ns1594979.5
ns0.93
bias_activation(2, act=relu)(2 x 128)/zygote/GPU/AMDGPU
140405
ns138471
ns1.01
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s)
7292
ns7334
ns0.99
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s)
6083
ns6125
ns0.99
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s)
6042
ns5333
ns1.13
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s)
10417
ns10417
ns1
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/CUDA
24005
ns23836
ns1.01
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/oneAPI
1270299
ns1232101.5
ns1.03
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/Metal
467959
ns583125
ns0.80
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/AMDGPU
48862
ns46460
ns1.05
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s)
234875
ns227708
ns1.03
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s)
227500
ns235583
ns0.97
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s)
265083
ns264667
ns1.00
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s)
249333
ns248583
ns1.00
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/CUDA
190814
ns190580
ns1.00
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/oneAPI
30562210
ns29562269.5
ns1.03
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/Metal
8899916.5
ns8564854.5
ns1.04
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/AMDGPU
676226
ns611281
ns1.11
dense(32, bias=false, act=relu)(32 x 128)/forward/CPU/2 thread(s)
4125
ns4084
ns1.01
dense(32, bias=false, act=relu)(32 x 128)/forward/CPU/4 thread(s)
4084
ns4084
ns1
dense(32, bias=false, act=relu)(32 x 128)/forward/CPU/8 thread(s)
4084
ns4125
ns0.99
dense(32, bias=false, act=relu)(32 x 128)/forward/CPU/1 thread(s)
4084
ns4125
ns0.99
dense(32, bias=false, act=relu)(32 x 128)/forward/GPU/CUDA
23878
ns23789
ns1.00
dense(32, bias=false, act=relu)(32 x 128)/forward/GPU/oneAPI
1989779
ns2018577
ns0.99
dense(32, bias=false, act=relu)(32 x 128)/forward/GPU/Metal
221875
ns219791.5
ns1.01
dense(32, bias=false, act=relu)(32 x 128)/forward/GPU/AMDGPU
52132
ns50370
ns1.03
dense(32, bias=false, act=relu)(32 x 128)/zygote/CPU/2 thread(s)
16833
ns16958
ns0.99
dense(32, bias=false, act=relu)(32 x 128)/zygote/CPU/4 thread(s)
16917
ns17083
ns0.99
dense(32, bias=false, act=relu)(32 x 128)/zygote/CPU/8 thread(s)
17083
ns17083
ns1
dense(32, bias=false, act=relu)(32 x 128)/zygote/CPU/1 thread(s)
16708
ns16666
ns1.00
dense(32, bias=false, act=relu)(32 x 128)/zygote/GPU/CUDA
198115
ns197449
ns1.00
dense(32, bias=false, act=relu)(32 x 128)/zygote/GPU/oneAPI
10848086
ns9693737.5
ns1.12
dense(32, bias=false, act=relu)(32 x 128)/zygote/GPU/Metal
921750
ns940458
ns0.98
dense(32, bias=false, act=relu)(32 x 128)/zygote/GPU/AMDGPU
188130
ns176226.5
ns1.07
dense(512, bias=false, act=gelu)(512 x 128)/forward/CPU/2 thread(s)
509958
ns509500
ns1.00
dense(512, bias=false, act=gelu)(512 x 128)/forward/CPU/4 thread(s)
404375
ns405083
ns1.00
dense(512, bias=false, act=gelu)(512 x 128)/forward/CPU/8 thread(s)
404666
ns332459
ns1.22
dense(512, bias=false, act=gelu)(512 x 128)/forward/CPU/1 thread(s)
864708.5
ns865125
ns1.00
dense(512, bias=false, act=gelu)(512 x 128)/forward/GPU/CUDA
113137
ns113130
ns1.00
dense(512, bias=false, act=gelu)(512 x 128)/forward/GPU/oneAPI
406001
ns391060
ns1.04
dense(512, bias=false, act=gelu)(512 x 128)/forward/GPU/Metal
454833
ns451416
ns1.01
dense(512, bias=false, act=gelu)(512 x 128)/forward/GPU/AMDGPU
252759
ns248703
ns1.02
dense(512, bias=false, act=gelu)(512 x 128)/zygote/CPU/2 thread(s)
2321792
ns2324333
ns1.00
dense(512, bias=false, act=gelu)(512 x 128)/zygote/CPU/4 thread(s)
2031312.5
ns2025375.5
ns1.00
dense(512, bias=false, act=gelu)(512 x 128)/zygote/CPU/8 thread(s)
2026292
ns1752833.5
ns1.16
dense(512, bias=false, act=gelu)(512 x 128)/zygote/CPU/1 thread(s)
3197437.5
ns3200583
ns1.00
dense(512, bias=false, act=gelu)(512 x 128)/zygote/GPU/CUDA
243343
ns244865
ns0.99
dense(512, bias=false, act=gelu)(512 x 128)/zygote/GPU/oneAPI
12608189
ns11656548
ns1.08
dense(512, bias=false, act=gelu)(512 x 128)/zygote/GPU/Metal
1833333
ns1966229
ns0.93
dense(512, bias=false, act=gelu)(512 x 128)/zygote/GPU/AMDGPU
766712
ns761317.5
ns1.01
layernorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/2 thread(s)
6354.5
ns6250
ns1.02
layernorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/4 thread(s)
7541.5
ns6145.5
ns1.23
layernorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/8 thread(s)
8250
ns7729
ns1.07
layernorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/1 thread(s)
6792
ns6375
ns1.07
layernorm(2, act=relu, affine=false)(32 x 32)/forward/GPU/CUDA
92699
ns93009
ns1.00
layernorm(2, act=relu, affine=false)(32 x 32)/forward/GPU/oneAPI
5474342
ns5406797
ns1.01
layernorm(2, act=relu, affine=false)(32 x 32)/forward/GPU/Metal
770459
ns758167
ns1.02
layernorm(2, act=relu, affine=false)(32 x 32)/forward/GPU/AMDGPU
62268
ns60110
ns1.04
layernorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/2 thread(s)
10416
ns10646
ns0.98
layernorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/4 thread(s)
12500
ns10542
ns1.19
layernorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/8 thread(s)
11875
ns11084
ns1.07
layernorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/1 thread(s)
11750
ns10375
ns1.13
layernorm(2, act=relu, affine=false)(32 x 32)/zygote/GPU/CUDA
650339
ns660576
ns0.98
layernorm(2, act=relu, affine=false)(32 x 32)/zygote/GPU/oneAPI
37332074.5
ns38819677
ns0.96
layernorm(2, act=relu, affine=false)(32 x 32)/zygote/GPU/Metal
5429250.5
ns5487104
ns0.99
layernorm(2, act=relu, affine=false)(32 x 32)/zygote/GPU/AMDGPU
444798
ns416424
ns1.07
dense(2, bias=true, act=relu)(2 x 128)/forward/CPU/2 thread(s)
500
ns500
ns1
dense(2, bias=true, act=relu)(2 x 128)/forward/CPU/4 thread(s)
542
ns500
ns1.08
dense(2, bias=true, act=relu)(2 x 128)/forward/CPU/8 thread(s)
542
ns541
ns1.00
dense(2, bias=true, act=relu)(2 x 128)/forward/CPU/1 thread(s)
541
ns542
ns1.00
dense(2, bias=true, act=relu)(2 x 128)/forward/GPU/CUDA
23458
ns23635
ns0.99
dense(2, bias=true, act=relu)(2 x 128)/forward/GPU/oneAPI
2134920
ns2221310
ns0.96
dense(2, bias=true, act=relu)(2 x 128)/forward/GPU/Metal
321917
ns319750
ns1.01
dense(2, bias=true, act=relu)(2 x 128)/forward/GPU/AMDGPU
54209
ns53401
ns1.02
dense(2, bias=true, act=relu)(2 x 128)/zygote/CPU/2 thread(s)
2083
ns2083
ns1
dense(2, bias=true, act=relu)(2 x 128)/zygote/CPU/4 thread(s)
2209
ns2083
ns1.06
dense(2, bias=true, act=relu)(2 x 128)/zygote/CPU/8 thread(s)
2208
ns2084
ns1.06
dense(2, bias=true, act=relu)(2 x 128)/zygote/CPU/1 thread(s)
2125
ns2125
ns1
dense(2, bias=true, act=relu)(2 x 128)/zygote/GPU/CUDA
230504
ns232566
ns0.99
dense(2, bias=true, act=relu)(2 x 128)/zygote/GPU/oneAPI
10796849.5
ns11381984
ns0.95
dense(2, bias=true, act=relu)(2 x 128)/zygote/GPU/Metal
1959959
ns1912541.5
ns1.02
dense(2, bias=true, act=relu)(2 x 128)/zygote/GPU/AMDGPU
184907
ns186466.5
ns0.99
groupnorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/2 thread(s)
8916
ns8375
ns1.06
groupnorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/4 thread(s)
9250
ns8750
ns1.06
groupnorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/8 thread(s)
11083
ns10438
ns1.06
groupnorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/1 thread(s)
9042
ns8958
ns1.01
groupnorm(2, act=relu, affine=true)(32 x 32)/forward/GPU/CUDA
102829
ns104173
ns0.99
groupnorm(2, act=relu, affine=true)(32 x 32)/forward/GPU/oneAPI
3264654
ns3244842
ns1.01
groupnorm(2, act=relu, affine=true)(32 x 32)/forward/GPU/Metal
786812
ns896708
ns0.88
groupnorm(2, act=relu, affine=true)(32 x 32)/forward/GPU/AMDGPU
78719
ns74231
ns1.06
groupnorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/2 thread(s)
17333.5
ns17708
ns0.98
groupnorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/4 thread(s)
18562.5
ns17750
ns1.05
groupnorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/8 thread(s)
19000
ns18187.5
ns1.04
groupnorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/1 thread(s)
18729
ns18041.5
ns1.04
groupnorm(2, act=relu, affine=true)(32 x 32)/zygote/GPU/CUDA
573932.5
ns610296
ns0.94
groupnorm(2, act=relu, affine=true)(32 x 32)/zygote/GPU/oneAPI
17434078.5
ns17126722
ns1.02
groupnorm(2, act=relu, affine=true)(32 x 32)/zygote/GPU/Metal
5154166.5
ns5229458
ns0.99
groupnorm(2, act=relu, affine=true)(32 x 32)/zygote/GPU/AMDGPU
408228
ns387209
ns1.05
batchnorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/2 thread(s)
625
ns500
ns1.25
batchnorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/4 thread(s)
583
ns625
ns0.93
batchnorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/8 thread(s)
625
ns541
ns1.16
batchnorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/1 thread(s)
500
ns500
ns1
batchnorm(2, act=identity, affine=false)(32 x 32)/forward/GPU/CUDA
35613.5
ns35555
ns1.00
batchnorm(2, act=identity, affine=false)(32 x 32)/forward/GPU/oneAPI
1174612
ns1100087
ns1.07
batchnorm(2, act=identity, affine=false)(32 x 32)/forward/GPU/Metal
443875
ns438541
ns1.01
batchnorm(2, act=identity, affine=false)(32 x 32)/forward/GPU/AMDGPU
50200
ns47930
ns1.05
batchnorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/2 thread(s)
8875
ns9312
ns0.95
batchnorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/4 thread(s)
10021
ns8125
ns1.23
batchnorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/8 thread(s)
9792
ns9792
ns1
batchnorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/1 thread(s)
9542
ns9146
ns1.04
batchnorm(2, act=identity, affine=false)(32 x 32)/zygote/GPU/CUDA
253397
ns256000
ns0.99
batchnorm(2, act=identity, affine=false)(32 x 32)/zygote/GPU/oneAPI
17680433
ns19311232
ns0.92
batchnorm(2, act=identity, affine=false)(32 x 32)/zygote/GPU/Metal
4449708.5
ns4774937.5
ns0.93
batchnorm(2, act=identity, affine=false)(32 x 32)/zygote/GPU/AMDGPU
414840
ns378844
ns1.10
dense(512, bias=false, act=identity)(512 x 128)/forward/CPU/2 thread(s)
397542
ns397000
ns1.00
dense(512, bias=false, act=identity)(512 x 128)/forward/CPU/4 thread(s)
287875
ns288125
ns1.00
dense(512, bias=false, act=identity)(512 x 128)/forward/CPU/8 thread(s)
287958
ns215667
ns1.34
dense(512, bias=false, act=identity)(512 x 128)/forward/CPU/1 thread(s)
756250
ns756875
ns1.00
dense(512, bias=false, act=identity)(512 x 128)/forward/GPU/CUDA
112466
ns111981
ns1.00
dense(512, bias=false, act=identity)(512 x 128)/forward/GPU/oneAPI
323442
ns320003
ns1.01
dense(512, bias=false, act=identity)(512 x 128)/forward/GPU/Metal
468208.5
ns365500
ns1.28
dense(512, bias=false, act=identity)(512 x 128)/forward/GPU/AMDGPU
80790
ns78230
ns1.03
dense(512, bias=false, act=identity)(512 x 128)/zygote/CPU/2 thread(s)
1450500
ns1460875
ns0.99
dense(512, bias=false, act=identity)(512 x 128)/zygote/CPU/4 thread(s)
1136437
ns1135291.5
ns1.00
dense(512, bias=false, act=identity)(512 x 128)/zygote/CPU/8 thread(s)
1133813
ns862687.5
ns1.31
dense(512, bias=false, act=identity)(512 x 128)/zygote/CPU/1 thread(s)
2359041.5
ns2357291
ns1.00
dense(512, bias=false, act=identity)(512 x 128)/zygote/GPU/CUDA
208101
ns209166.5
ns0.99
dense(512, bias=false, act=identity)(512 x 128)/zygote/GPU/oneAPI
10207256
ns9267436
ns1.10
dense(512, bias=false, act=identity)(512 x 128)/zygote/GPU/Metal
1607895.5
ns1516312.5
ns1.06
dense(512, bias=false, act=identity)(512 x 128)/zygote/GPU/AMDGPU
333532
ns323643
ns1.03
layernorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/2 thread(s)
7375
ns6667
ns1.11
layernorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/4 thread(s)
7604
ns6959
ns1.09
layernorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/8 thread(s)
9188
ns8958.5
ns1.03
layernorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/1 thread(s)
7417
ns7334
ns1.01
layernorm(2, act=relu, affine=true)(32 x 32)/forward/GPU/CUDA
138055
ns144567
ns0.95
layernorm(2, act=relu, affine=true)(32 x 32)/forward/GPU/oneAPI
5764079.5
ns5867002
ns0.98
layernorm(2, act=relu, affine=true)(32 x 32)/forward/GPU/Metal
447958
ns707270.5
ns0.63
layernorm(2, act=relu, affine=true)(32 x 32)/forward/GPU/AMDGPU
62411
ns70660
ns0.88
layernorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/2 thread(s)
13812
ns15395.5
ns0.90
layernorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/4 thread(s)
14667
ns12417
ns1.18
layernorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/8 thread(s)
14875
ns14250
ns1.04
layernorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/1 thread(s)
14167
ns13312
ns1.06
layernorm(2, act=relu, affine=true)(32 x 32)/zygote/GPU/CUDA
978862
ns958993.5
ns1.02
layernorm(2, act=relu, affine=true)(32 x 32)/zygote/GPU/oneAPI
42062469
ns40369162
ns1.04
layernorm(2, act=relu, affine=true)(32 x 32)/zygote/GPU/Metal
5817291.5
ns5752729.5
ns1.01
layernorm(2, act=relu, affine=true)(32 x 32)/zygote/GPU/AMDGPU
461364
ns433804
ns1.06
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s)
25354.5
ns24416
ns1.04
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s)
30458.5
ns26417
ns1.15
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s)
29708
ns28687
ns1.04
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s)
25458.5
ns26874.5
ns0.95
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/GPU/CUDA
202433.5
ns201880.5
ns1.00
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/GPU/oneAPI
7728083
ns8100056
ns0.95
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/GPU/Metal
921145.5
ns896833
ns1.03
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/GPU/AMDGPU
121551
ns114876.5
ns1.06
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s)
152834
ns148834
ns1.03
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s)
149583
ns104708
ns1.43
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s)
147083
ns153500
ns0.96
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s)
153917
ns116979
ns1.32
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/CUDA
1086649
ns1086710
ns1.00
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/oneAPI
41913933
ns41151661
ns1.02
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/Metal
5704750
ns5843229.5
ns0.98
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/AMDGPU
620779
ns594985
ns1.04
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s)
75750
ns73958
ns1.02
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s)
78187.5
ns76791.5
ns1.02
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s)
84895.5
ns80166
ns1.06
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s)
85458
ns75417
ns1.13
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/CUDA
208740.5
ns207189
ns1.01
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/oneAPI
7587978
ns7362606
ns1.03
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/Metal
522250
ns519687.5
ns1.00
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/AMDGPU
136042
ns126391.5
ns1.08
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s)
290000
ns297334
ns0.98
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s)
286937
ns221667
ns1.29
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s)
302125
ns288917
ns1.05
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s)
279416
ns221041.5
ns1.26
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/CUDA
1130753
ns1119401
ns1.01
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/oneAPI
43569010
ns41008184.5
ns1.06
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/Metal
6840729
ns6497687.5
ns1.05
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/AMDGPU
727137.5
ns694627
ns1.05
layernorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/2 thread(s)
17167
ns16417
ns1.05
layernorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/4 thread(s)
17084
ns16583
ns1.03
layernorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/8 thread(s)
17417
ns17792
ns0.98
layernorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/1 thread(s)
16291
ns16708
ns0.98
layernorm(2, act=gelu, affine=true)(32 x 32)/forward/GPU/CUDA
148129
ns147421
ns1.00
layernorm(2, act=gelu, affine=true)(32 x 32)/forward/GPU/oneAPI
5485127.5
ns5759467
ns0.95
layernorm(2, act=gelu, affine=true)(32 x 32)/forward/GPU/Metal
471208
ns427292
ns1.10
layernorm(2, act=gelu, affine=true)(32 x 32)/forward/GPU/AMDGPU
243625
ns237703
ns1.02
layernorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/2 thread(s)
27270.5
ns24833.5
ns1.10
layernorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/4 thread(s)
27333
ns27042
ns1.01
layernorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/8 thread(s)
26979
ns27166.5
ns0.99
layernorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/1 thread(s)
27916.5
ns27125
ns1.03
layernorm(2, act=gelu, affine=true)(32 x 32)/zygote/GPU/CUDA
988689
ns984196
ns1.00
layernorm(2, act=gelu, affine=true)(32 x 32)/zygote/GPU/oneAPI
41332749
ns40719457
ns1.02
layernorm(2, act=gelu, affine=true)(32 x 32)/zygote/GPU/Metal
5697083
ns5828333
ns0.98
layernorm(2, act=gelu, affine=true)(32 x 32)/zygote/GPU/AMDGPU
730974
ns714022
ns1.02
groupnorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/2 thread(s)
11542
ns11562.5
ns1.00
groupnorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/4 thread(s)
12250
ns10375
ns1.18
groupnorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/8 thread(s)
11750
ns12083
ns0.97
groupnorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/1 thread(s)
10583
ns11083
ns0.95
groupnorm(2, act=gelu, affine=true)(32 x 32)/forward/GPU/CUDA
124848
ns124895.5
ns1.00
groupnorm(2, act=gelu, affine=true)(32 x 32)/forward/GPU/oneAPI
3576014.5
ns3575871
ns1.00
groupnorm(2, act=gelu, affine=true)(32 x 32)/forward/GPU/Metal
857292
ns912833
ns0.94
groupnorm(2, act=gelu, affine=true)(32 x 32)/forward/GPU/AMDGPU
246370.5
ns242943
ns1.01
groupnorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/2 thread(s)
21167
ns21125
ns1.00
groupnorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/4 thread(s)
22562
ns21917
ns1.03
groupnorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/8 thread(s)
22292
ns22000
ns1.01
groupnorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/1 thread(s)
22041
ns21416
ns1.03
groupnorm(2, act=gelu, affine=true)(32 x 32)/zygote/GPU/CUDA
707198
ns706086.5
ns1.00
groupnorm(2, act=gelu, affine=true)(32 x 32)/zygote/GPU/oneAPI
23127801
ns21428227.5
ns1.08
groupnorm(2, act=gelu, affine=true)(32 x 32)/zygote/GPU/Metal
5332000
ns5387146
ns0.99
groupnorm(2, act=gelu, affine=true)(32 x 32)/zygote/GPU/AMDGPU
697815
ns673547
ns1.04
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s)
63500
ns64000.5
ns0.99
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s)
63187.5
ns63500
ns1.00
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s)
67604
ns66166
ns1.02
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s)
66000
ns62584
ns1.05
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/CUDA
106420
ns105629.5
ns1.01
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/oneAPI
3379318
ns3434086.5
ns0.98
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/Metal
1310604
ns1323250
ns0.99
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/AMDGPU
244896
ns237572
ns1.03
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s)
438125
ns448750
ns0.98
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s)
455375
ns437958
ns1.04
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s)
478166
ns446666
ns1.07
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s)
451334
ns449583
ns1.00
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/CUDA
518775.5
ns517219
ns1.00
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/oneAPI
20304887
ns21208755
ns0.96
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/Metal
6140333
ns5978042
ns1.03
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/AMDGPU
757127
ns730458
ns1.04
layernorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/2 thread(s)
7979
ns6958.5
ns1.15
layernorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/4 thread(s)
8041.5
ns6833
ns1.18
layernorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/8 thread(s)
7500
ns8041
ns0.93
layernorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/1 thread(s)
7604
ns7771
ns0.98
layernorm(2, act=identity, affine=true)(32 x 32)/forward/GPU/CUDA
146523.5
ns145909.5
ns1.00
layernorm(2, act=identity, affine=true)(32 x 32)/forward/GPU/oneAPI
5590898
ns5602766
ns1.00
layernorm(2, act=identity, affine=true)(32 x 32)/forward/GPU/Metal
435708
ns628395.5
ns0.69
layernorm(2, act=identity, affine=true)(32 x 32)/forward/GPU/AMDGPU
72661
ns58991
ns1.23
layernorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/2 thread(s)
13333
ns14042
ns0.95
layernorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/4 thread(s)
16917
ns15750
ns1.07
layernorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/8 thread(s)
15270.5
ns13917
ns1.10
layernorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/1 thread(s)
15083.5
ns13479
ns1.12
layernorm(2, act=identity, affine=true)(32 x 32)/zygote/GPU/CUDA
956333
ns954313
ns1.00
layernorm(2, act=identity, affine=true)(32 x 32)/zygote/GPU/oneAPI
38910703
ns38432249.5
ns1.01
layernorm(2, act=identity, affine=true)(32 x 32)/zygote/GPU/Metal
5504417
ns5549500
ns0.99
layernorm(2, act=identity, affine=true)(32 x 32)/zygote/GPU/AMDGPU
444671
ns404584
ns1.10
batchedmm(512, Bsize=4)/forward/CPU/2 thread(s)
6153583
ns6160416
ns1.00
batchedmm(512, Bsize=4)/forward/CPU/4 thread(s)
6374041
ns6378167
ns1.00
batchedmm(512, Bsize=4)/forward/CPU/8 thread(s)
6373813
ns3224791.5
ns1.98
batchedmm(512, Bsize=4)/forward/CPU/1 thread(s)
11924250
ns11924000
ns1.00
batchedmm(512, Bsize=4)/forward/GPU/CUDA
350545.5
ns301800.5
ns1.16
batchedmm(512, Bsize=4)/forward/GPU/AMDGPU
305468
ns294983
ns1.04
batchedmm(512, Bsize=4)/zygote/CPU/2 thread(s)
19103895.5
ns19104958
ns1.00
batchedmm(512, Bsize=4)/zygote/CPU/4 thread(s)
19992375
ns19957229
ns1.00
batchedmm(512, Bsize=4)/zygote/CPU/8 thread(s)
19908604
ns11123708.5
ns1.79
batchedmm(512, Bsize=4)/zygote/CPU/1 thread(s)
36548541.5
ns36532604
ns1.00
batchedmm(512, Bsize=4)/zygote/GPU/CUDA
1015793.5
ns1023618
ns0.99
batchedmm(512, Bsize=4)/zygote/GPU/AMDGPU
1217321
ns1158122
ns1.05
dense(2, bias=true, act=gelu)(2 x 128)/forward/CPU/2 thread(s)
959
ns917
ns1.05
dense(2, bias=true, act=gelu)(2 x 128)/forward/CPU/4 thread(s)
1000
ns958
ns1.04
dense(2, bias=true, act=gelu)(2 x 128)/forward/CPU/8 thread(s)
1000
ns958
ns1.04
dense(2, bias=true, act=gelu)(2 x 128)/forward/CPU/1 thread(s)
1000
ns959
ns1.04
dense(2, bias=true, act=gelu)(2 x 128)/forward/GPU/CUDA
23465
ns23554
ns1.00
dense(2, bias=true, act=gelu)(2 x 128)/forward/GPU/oneAPI
2075509
ns2143802
ns0.97
dense(2, bias=true, act=gelu)(2 x 128)/forward/GPU/Metal
318917
ns316188
ns1.01
dense(2, bias=true, act=gelu)(2 x 128)/forward/GPU/AMDGPU
221376
ns215672
ns1.03
dense(2, bias=true, act=gelu)(2 x 128)/zygote/CPU/2 thread(s)
3667
ns3625
ns1.01
dense(2, bias=true, act=gelu)(2 x 128)/zygote/CPU/4 thread(s)
3792
ns3667
ns1.03
dense(2, bias=true, act=gelu)(2 x 128)/zygote/CPU/8 thread(s)
3750
ns3666
ns1.02
dense(2, bias=true, act=gelu)(2 x 128)/zygote/CPU/1 thread(s)
3625
ns3666
ns0.99
dense(2, bias=true, act=gelu)(2 x 128)/zygote/GPU/CUDA
279973
ns283503
ns0.99
dense(2, bias=true, act=gelu)(2 x 128)/zygote/GPU/oneAPI
10954168
ns11257238
ns0.97
dense(2, bias=true, act=gelu)(2 x 128)/zygote/GPU/Metal
2079042
ns2086333.5
ns1.00
dense(2, bias=true, act=gelu)(2 x 128)/zygote/GPU/AMDGPU
643436.5
ns637297
ns1.01
groupnorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/2 thread(s)
8000.5
ns8000
ns1.00
groupnorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/4 thread(s)
9292
ns7958
ns1.17
groupnorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/8 thread(s)
9395.5
ns9042
ns1.04
groupnorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/1 thread(s)
8000.5
ns7854
ns1.02
groupnorm(2, act=relu, affine=true)(4 x 32)/forward/GPU/CUDA
120927.5
ns120818.5
ns1.00
groupnorm(2, act=relu, affine=true)(4 x 32)/forward/GPU/oneAPI
3366103
ns3517154
ns0.96
groupnorm(2, act=relu, affine=true)(4 x 32)/forward/GPU/Metal
786604.5
ns776959
ns1.01
groupnorm(2, act=relu, affine=true)(4 x 32)/forward/GPU/AMDGPU
72332
ns67641
ns1.07
groupnorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/2 thread(s)
11875
ns11729.5
ns1.01
groupnorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/4 thread(s)
12417
ns12250
ns1.01
groupnorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/8 thread(s)
13125
ns12334
ns1.06
groupnorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/1 thread(s)
12229.5
ns12458.5
ns0.98
groupnorm(2, act=relu, affine=true)(4 x 32)/zygote/GPU/CUDA
645035
ns643501
ns1.00
groupnorm(2, act=relu, affine=true)(4 x 32)/zygote/GPU/oneAPI
21375165.5
ns21447178
ns1.00
groupnorm(2, act=relu, affine=true)(4 x 32)/zygote/GPU/Metal
4983459
ns5189125.5
ns0.96
groupnorm(2, act=relu, affine=true)(4 x 32)/zygote/GPU/AMDGPU
388705.5
ns365334
ns1.06
dense(2, bias=false, act=relu)(2 x 128)/forward/CPU/2 thread(s)
291
ns250
ns1.16
dense(2, bias=false, act=relu)(2 x 128)/forward/CPU/4 thread(s)
333
ns291
ns1.14
dense(2, bias=false, act=relu)(2 x 128)/forward/CPU/8 thread(s)
292
ns292
ns1
dense(2, bias=false, act=relu)(2 x 128)/forward/CPU/1 thread(s)
292
ns291
ns1.00
dense(2, bias=false, act=relu)(2 x 128)/forward/GPU/CUDA
22899
ns22596
ns1.01
dense(2, bias=false, act=relu)(2 x 128)/forward/GPU/oneAPI
2132112.5
ns1951713
ns1.09
dense(2, bias=false, act=relu)(2 x 128)/forward/GPU/Metal
224458
ns225750
ns0.99
dense(2, bias=false, act=relu)(2 x 128)/forward/GPU/AMDGPU
53171
ns52251
ns1.02
dense(2, bias=false, act=relu)(2 x 128)/zygote/CPU/2 thread(s)
2875
ns3041
ns0.95
dense(2, bias=false, act=relu)(2 x 128)/zygote/CPU/4 thread(s)
3042
ns3208
ns0.95
dense(2, bias=false, act=relu)(2 x 128)/zygote/CPU/8 thread(s)
3208
ns3375
ns0.95
dense(2, bias=false, act=relu)(2 x 128)/zygote/CPU/1 thread(s)
3083
ns3042
ns1.01
dense(2, bias=false, act=relu)(2 x 128)/zygote/GPU/CUDA
204868.5
ns204741
ns1.00
dense(2, bias=false, act=relu)(2 x 128)/zygote/GPU/oneAPI
10211479.5
ns9227567
ns1.11
dense(2, bias=false, act=relu)(2 x 128)/zygote/GPU/Metal
1621667
ns1619250
ns1.00
dense(2, bias=false, act=relu)(2 x 128)/zygote/GPU/AMDGPU
168474.5
ns172842
ns0.97
groupnorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/2 thread(s)
12042
ns11250
ns1.07
groupnorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/4 thread(s)
11458
ns11334
ns1.01
groupnorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/8 thread(s)
12958
ns13125
ns0.99
groupnorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/1 thread(s)
11375
ns11458
ns0.99
groupnorm(2, act=gelu, affine=false)(32 x 32)/forward/GPU/CUDA
123020.5
ns121547.5
ns1.01
groupnorm(2, act=gelu, affine=false)(32 x 32)/forward/GPU/oneAPI
3354460
ns3353104
ns1.00
groupnorm(2, act=gelu, affine=false)(32 x 32)/forward/GPU/Metal
888334
ns869041
ns1.02
groupnorm(2, act=gelu, affine=false)(32 x 32)/forward/GPU/AMDGPU
244727
ns243193
ns1.01
groupnorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/2 thread(s)
20625
ns22000
ns0.94
groupnorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/4 thread(s)
22875.5
ns20583
ns1.11
groupnorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/8 thread(s)
23937
ns21167
ns1.13
groupnorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/1 thread(s)
21395.5
ns20791
ns1.03
groupnorm(2, act=gelu, affine=false)(32 x 32)/zygote/GPU/CUDA
601559
ns598450
ns1.01
groupnorm(2, act=gelu, affine=false)(32 x 32)/zygote/GPU/oneAPI
19862650.5
ns19931223.5
ns1.00
groupnorm(2, act=gelu, affine=false)(32 x 32)/zygote/GPU/Metal
4446729
ns4695229
ns0.95
groupnorm(2, act=gelu, affine=false)(32 x 32)/zygote/GPU/AMDGPU
689959
ns652706.5
ns1.06
dense(32, bias=true, act=relu)(32 x 128)/forward/CPU/2 thread(s)
4375
ns4375
ns1
dense(32, bias=true, act=relu)(32 x 128)/forward/CPU/4 thread(s)
4458
ns4417
ns1.01
dense(32, bias=true, act=relu)(32 x 128)/forward/CPU/8 thread(s)
4375
ns4416
ns0.99
dense(32, bias=true, act=relu)(32 x 128)/forward/CPU/1 thread(s)
4375
ns4416
ns0.99
dense(32, bias=true, act=relu)(32 x 128)/forward/GPU/CUDA
24235
ns24359
ns0.99
dense(32, bias=true, act=relu)(32 x 128)/forward/GPU/oneAPI
2133456
ns2166080
ns0.98
dense(32, bias=true, act=relu)(32 x 128)/forward/GPU/Metal
221791
ns223833
ns0.99
dense(32, bias=true, act=relu)(32 x 128)/forward/GPU/AMDGPU
58031
ns52541
ns1.10
dense(32, bias=true, act=relu)(32 x 128)/zygote/CPU/2 thread(s)
16500
ns16667
ns0.99
dense(32, bias=true, act=relu)(32 x 128)/zygote/CPU/4 thread(s)
16583
ns16500
ns1.01
dense(32, bias=true, act=relu)(32 x 128)/zygote/CPU/8 thread(s)
16708
ns16375
ns1.02
dense(32, bias=true, act=relu)(32 x 128)/zygote/CPU/1 thread(s)
16417
ns16333
ns1.01
dense(32, bias=true, act=relu)(32 x 128)/zygote/GPU/CUDA
332460.5
ns331128
ns1.00
dense(32, bias=true, act=relu)(32 x 128)/zygote/GPU/oneAPI
12635929.5
ns12599810
ns1.00
dense(32, bias=true, act=relu)(32 x 128)/zygote/GPU/Metal
1070583
ns1647875.5
ns0.65
dense(32, bias=true, act=relu)(32 x 128)/zygote/GPU/AMDGPU
227946
ns212037.5
ns1.08
batchnorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/2 thread(s)
2083
ns1959
ns1.06
batchnorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/4 thread(s)
2167
ns2083
ns1.04
batchnorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/8 thread(s)
2208
ns1958
ns1.13
batchnorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/1 thread(s)
2083
ns1958
ns1.06
batchnorm(2, act=gelu, affine=false)(32 x 32)/forward/GPU/CUDA
36306
ns35684
ns1.02
batchnorm(2, act=gelu, affine=false)(32 x 32)/forward/GPU/oneAPI
1295447
ns1146851
ns1.13
batchnorm(2, act=gelu, affine=false)(32 x 32)/forward/GPU/Metal
445333
ns441458.5
ns1.01
batchnorm(2, act=gelu, affine=false)(32 x 32)/forward/GPU/AMDGPU
212122.5
ns206802
ns1.03
batchnorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/2 thread(s)
17479
ns16645.5
ns1.05
batchnorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/4 thread(s)
18583
ns16750
ns1.11
batchnorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/8 thread(s)
17562.5
ns16562.5
ns1.06
batchnorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/1 thread(s)
16375
ns17208.5
ns0.95
batchnorm(2, act=gelu, affine=false)(32 x 32)/zygote/GPU/CUDA
295313
ns294264.5
ns1.00
batchnorm(2, act=gelu, affine=false)(32 x 32)/zygote/GPU/oneAPI
22051076
ns20813859
ns1.06
batchnorm(2, act=gelu, affine=false)(32 x 32)/zygote/GPU/Metal
5278167
ns5292083
ns1.00
batchnorm(2, act=gelu, affine=false)(32 x 32)/zygote/GPU/AMDGPU
731771
ns703797.5
ns1.04
batchedmm(16, Bsize=512)/forward/CPU/2 thread(s)
59500
ns59583.5
ns1.00
batchedmm(16, Bsize=512)/forward/CPU/4 thread(s)
65395.5
ns63625
ns1.03
batchedmm(16, Bsize=512)/forward/CPU/8 thread(s)
66271
ns62625
ns1.06
batchedmm(16, Bsize=512)/forward/CPU/1 thread(s)
51083
ns51292
ns1.00
batchedmm(16, Bsize=512)/forward/GPU/CUDA
66116
ns66405
ns1.00
batchedmm(16, Bsize=512)/forward/GPU/AMDGPU
117509
ns103511
ns1.14
batchedmm(16, Bsize=512)/zygote/CPU/2 thread(s)
135271
ns199395.5
ns0.68
batchedmm(16, Bsize=512)/zygote/CPU/4 thread(s)
165604.5
ns157250
ns1.05
batchedmm(16, Bsize=512)/zygote/CPU/8 thread(s)
124646
ns133937.5
ns0.93
batchedmm(16, Bsize=512)/zygote/CPU/1 thread(s)
319416
ns317729
ns1.01
batchedmm(16, Bsize=512)/zygote/GPU/CUDA
217110
ns216342
ns1.00
batchedmm(16, Bsize=512)/zygote/GPU/AMDGPU
617880
ns579316
ns1.07
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s)
129792
ns82458.5
ns1.57
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s)
123708
ns85271
ns1.45
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s)
88417
ns90209
ns0.98
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s)
128333
ns140417
ns0.91
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/GPU/CUDA
192466
ns192334
ns1.00
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/GPU/oneAPI
5745668.5
ns5533381
ns1.04
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/GPU/Metal
1821459
ns1893708
ns0.96
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/GPU/AMDGPU
173229
ns170101.5
ns1.02
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s)
1916917
ns1851687.5
ns1.04
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s)
1914042
ns1882334
ns1.02
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s)
1900791
ns1926500
ns0.99
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s)
1910792
ns1891958.5
ns1.01
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/CUDA
534488
ns532324
ns1.00
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/oneAPI
26461012
ns25979046
ns1.02
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/Metal
9244791
ns9683125
ns0.95
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/AMDGPU
1111539.5
ns1080090
ns1.03
dense(2, bias=true, act=identity)(2 x 128)/forward/CPU/2 thread(s)
292
ns291
ns1.00
dense(2, bias=true, act=identity)(2 x 128)/forward/CPU/4 thread(s)
292
ns292
ns1
dense(2, bias=true, act=identity)(2 x 128)/forward/CPU/8 thread(s)
292
ns291
ns1.00
dense(2, bias=true, act=identity)(2 x 128)/forward/CPU/1 thread(s)
291
ns292
ns1.00
dense(2, bias=true, act=identity)(2 x 128)/forward/GPU/CUDA
21568
ns21761
ns0.99
dense(2, bias=true, act=identity)(2 x 128)/forward/GPU/oneAPI
2163626
ns2115738
ns1.02
dense(2, bias=true, act=identity)(2 x 128)/forward/GPU/Metal
348458
ns346875
ns1.00
dense(2, bias=true, act=identity)(2 x 128)/forward/GPU/AMDGPU
48829
ns45220
ns1.08
dense(2, bias=true, act=identity)(2 x 128)/zygote/CPU/2 thread(s)
1833
ns1792
ns1.02
dense(2, bias=true, act=identity)(2 x 128)/zygote/CPU/4 thread(s)
1875
ns1750
ns1.07
dense(2, bias=true, act=identity)(2 x 128)/zygote/CPU/8 thread(s)
1875
ns1792
ns1.05
dense(2, bias=true, act=identity)(2 x 128)/zygote/CPU/1 thread(s)
1792
ns1792
ns1
dense(2, bias=true, act=identity)(2 x 128)/zygote/GPU/CUDA
252538
ns253104
ns1.00
dense(2, bias=true, act=identity)(2 x 128)/zygote/GPU/oneAPI
9959798
ns9490240.5
ns1.05
dense(2, bias=true, act=identity)(2 x 128)/zygote/GPU/Metal
1489771
ns1088979
ns1.37
dense(2, bias=true, act=identity)(2 x 128)/zygote/GPU/AMDGPU
206024.5
ns187502
ns1.10
groupnorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/2 thread(s)
9042
ns8084
ns1.12
groupnorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/4 thread(s)
9542
ns8438
ns1.13
groupnorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/8 thread(s)
11000.5
ns10875
ns1.01
groupnorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/1 thread(s)
11083
ns8209
ns1.35
groupnorm(2, act=gelu, affine=false)(4 x 32)/forward/GPU/CUDA
119681.5
ns119061
ns1.01
groupnorm(2, act=gelu, affine=false)(4 x 32)/forward/GPU/oneAPI
3343318
ns3459549.5
ns0.97
groupnorm(2, act=gelu, affine=false)(4 x 32)/forward/GPU/Metal
860042
ns880209
ns0.98
groupnorm(2, act=gelu, affine=false)(4 x 32)/forward/GPU/AMDGPU
245750
ns237872
ns1.03
groupnorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/2 thread(s)
9125
ns10167
ns0.90
groupnorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/4 thread(s)
11104
ns9208
ns1.21
groupnorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/8 thread(s)
12270.5
ns9500
ns1.29
groupnorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/1 thread(s)
9104
ns9167
ns0.99
groupnorm(2, act=gelu, affine=false)(4 x 32)/zygote/GPU/CUDA
528697
ns527070
ns1.00
groupnorm(2, act=gelu, affine=false)(4 x 32)/zygote/GPU/oneAPI
19440576
ns18222497.5
ns1.07
groupnorm(2, act=gelu, affine=false)(4 x 32)/zygote/GPU/Metal
4311875
ns4417458
ns0.98
groupnorm(2, act=gelu, affine=false)(4 x 32)/zygote/GPU/AMDGPU
666771
ns634411
ns1.05
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s)
58000
ns58417
ns0.99
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s)
47084
ns46333
ns1.02
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s)
46333
ns39500
ns1.17
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s)
83584
ns84083
ns0.99
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/GPU/CUDA
40198.5
ns39770
ns1.01
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/GPU/oneAPI
1413057.5
ns1341281.5
ns1.05
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/GPU/Metal
1100291.5
ns1100583.5
ns1.00
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/GPU/AMDGPU
79400.5
ns75935.5
ns1.05
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s)
1925417
ns1901542
ns1.01
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s)
1984708
ns1921833.5
ns1.03
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s)
1971416
ns1955833
ns1.01
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s)
1889167
ns1881792
ns1.00
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/CUDA
221269
ns221320
ns1.00
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/oneAPI
34580695
ns33766076
ns1.02
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/Metal
11340000
ns11588792
ns0.98
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/AMDGPU
1053058
ns1036440
ns1.02
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s)
418729
ns415958
ns1.01
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s)
419792
ns420042
ns1.00
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s)
420833
ns419875
ns1.00
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s)
417875
ns418708
ns1.00
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/CUDA
211247
ns210156.5
ns1.01
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/oneAPI
7703163.5
ns7606443
ns1.01
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/Metal
523750
ns522750
ns1.00
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/AMDGPU
291621
ns287858
ns1.01
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s)
680812.5
ns764709
ns0.89
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s)
671896
ns781812
ns0.86
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s)
675250
ns753417
ns0.90
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s)
792167
ns678791.5
ns1.17
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/CUDA
1061378
ns1059447
ns1.00
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/oneAPI
45261299.5
ns43854665.5
ns1.03
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/Metal
6367250
ns6323063
ns1.01
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/AMDGPU
972305
ns916300
ns1.06
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s)
3461166.5
ns3425978.5
ns1.01
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s)
3461270.5
ns3451792
ns1.00
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s)
3443750
ns3458979.5
ns1.00
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s)
3438417
ns3412708
ns1.01
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/CUDA
172022
ns170950
ns1.01
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/oneAPI
8433919
ns8189493
ns1.03
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/Metal
1372979
ns1396875
ns0.98
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/AMDGPU
422223
ns435150
ns0.97
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s)
6199166
ns6194166.5
ns1.00
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s)
6213333
ns6230791.5
ns1.00
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s)
6204084
ns6222854
ns1.00
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s)
6232604.5
ns6218875
ns1.00
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/CUDA
1006628
ns1001834
ns1.00
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/oneAPI
51512599
ns49254606
ns1.05
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/Metal
7379313
ns8528604
ns0.87
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/AMDGPU
1595282
ns1556125
ns1.03
dense(512, bias=true, act=gelu)(512 x 128)/forward/CPU/2 thread(s)
474750
ns472667
ns1.00
dense(512, bias=true, act=gelu)(512 x 128)/forward/CPU/4 thread(s)
341000
ns339875
ns1.00
dense(512, bias=true, act=gelu)(512 x 128)/forward/CPU/8 thread(s)
340375
ns253208
ns1.34
dense(512, bias=true, act=gelu)(512 x 128)/forward/CPU/1 thread(s)
903500
ns902000
ns1.00
dense(512, bias=true, act=gelu)(512 x 128)/forward/GPU/CUDA
46814
ns46534
ns1.01
dense(512, bias=true, act=gelu)(512 x 128)/forward/GPU/oneAPI
863564
ns886552
ns0.97
dense(512, bias=true, act=gelu)(512 x 128)/forward/GPU/Metal
430166
ns478875
ns0.90
dense(512, bias=true, act=gelu)(512 x 128)/forward/GPU/AMDGPU
252863
ns249963
ns1.01
dense(512, bias=true, act=gelu)(512 x 128)/zygote/CPU/2 thread(s)
2330583
ns2333750
ns1.00
dense(512, bias=true, act=gelu)(512 x 128)/zygote/CPU/4 thread(s)
2039374.5
ns2036625
ns1.00
dense(512, bias=true, act=gelu)(512 x 128)/zygote/CPU/8 thread(s)
2036396
ns1763167
ns1.15
dense(512, bias=true, act=gelu)(512 x 128)/zygote/CPU/1 thread(s)
3200125
ns3203312
ns1.00
dense(512, bias=true, act=gelu)(512 x 128)/zygote/GPU/CUDA
269438
ns258879
ns1.04
dense(512, bias=true, act=gelu)(512 x 128)/zygote/GPU/oneAPI
14982086
ns13032420
ns1.15
dense(512, bias=true, act=gelu)(512 x 128)/zygote/GPU/Metal
2160770.5
ns2178375
ns0.99
dense(512, bias=true, act=gelu)(512 x 128)/zygote/GPU/AMDGPU
790237
ns787818
ns1.00
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s)
57395.5
ns57542
ns1.00
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s)
46042
ns45875
ns1.00
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s)
45959
ns39458
ns1.16
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s)
82958
ns83791
ns0.99
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/CUDA
28489
ns28376
ns1.00
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/oneAPI
1441103.5
ns1391893
ns1.04
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/Metal
1102062
ns1124083
ns0.98
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/AMDGPU
79270.5
ns77840.5
ns1.02
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s)
2027000
ns2032250
ns1.00
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s)
2084854
ns2093187.5
ns1.00
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s)
2075437.5
ns2091917
ns0.99
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s)
1975063
ns1972229.5
ns1.00
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/CUDA
234775
ns235913
ns1.00
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/oneAPI
35760405
ns35452366
ns1.01
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/Metal
11437417
ns11558395.5
ns0.99
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/AMDGPU
1093992
ns1056250.5
ns1.04
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s)
57416
ns57708
ns0.99
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s)
46583
ns46625
ns1.00
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s)
46375
ns39875
ns1.16
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s)
83333
ns83916.5
ns0.99
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/GPU/CUDA
49617
ns49455
ns1.00
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/GPU/oneAPI
771905.5
ns809068
ns0.95
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/GPU/Metal
1082958
ns1084875
ns1.00
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/GPU/AMDGPU
78170
ns72105.5
ns1.08
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s)
1875541
ns1921083
ns0.98
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s)
1977333
ns1945916.5
ns1.02
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s)
1974500
ns1974729.5
ns1.00
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s)
1878333
ns1864791
ns1.01
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/CUDA
239355
ns238800.5
ns1.00
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/oneAPI
16395409.5
ns17238198
ns0.95
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/Metal
9631250
ns10023791.5
ns0.96
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/AMDGPU
975732
ns934629
ns1.04
batchnorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/2 thread(s)
333
ns291
ns1.14
batchnorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/4 thread(s)
375
ns292
ns1.28
batchnorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/8 thread(s)
417
ns333
ns1.25
batchnorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/1 thread(s)
333
ns333
ns1
batchnorm(2, act=identity, affine=false)(4 x 32)/forward/GPU/CUDA
34934
ns34886
ns1.00
batchnorm(2, act=identity, affine=false)(4 x 32)/forward/GPU/oneAPI
1184614.5
ns1200155
ns0.99
batchnorm(2, act=identity, affine=false)(4 x 32)/forward/GPU/Metal
274562.5
ns279833
ns0.98
batchnorm(2, act=identity, affine=false)(4 x 32)/forward/GPU/AMDGPU
51811
ns48281
ns1.07
batchnorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/2 thread(s)
6708
ns6792
ns0.99
batchnorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/4 thread(s)
7084
ns6208.5
ns1.14
batchnorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/8 thread(s)
7291.5
ns7000
ns1.04
batchnorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/1 thread(s)
7000
ns6667
ns1.05
batchnorm(2, act=identity, affine=false)(4 x 32)/zygote/GPU/CUDA
210987
ns212384.5
ns0.99
batchnorm(2, act=identity, affine=false)(4 x 32)/zygote/GPU/oneAPI
20992732
ns19751565
ns1.06
batchnorm(2, act=identity, affine=false)(4 x 32)/zygote/GPU/Metal
4248354.5
ns5078916.5
ns0.84
batchnorm(2, act=identity, affine=false)(4 x 32)/zygote/GPU/AMDGPU
406385
ns379104
ns1.07
dense(2, bias=false, act=identity)(2 x 128)/forward/CPU/2 thread(s)
250
ns250
ns1
dense(2, bias=false, act=identity)(2 x 128)/forward/CPU/4 thread(s)
292
ns250
ns1.17
dense(2, bias=false, act=identity)(2 x 128)/forward/CPU/8 thread(s)
292
ns291
ns1.00
dense(2, bias=false, act=identity)(2 x 128)/forward/CPU/1 thread(s)
250
ns250
ns1
dense(2, bias=false, act=identity)(2 x 128)/forward/GPU/CUDA
32512
ns32763
ns0.99
dense(2, bias=false, act=identity)(2 x 128)/forward/GPU/oneAPI
1266942
ns1167700
ns1.08
dense(2, bias=false, act=identity)(2 x 128)/forward/GPU/Metal
249625
ns253542
ns0.98
dense(2, bias=false, act=identity)(2 x 128)/forward/GPU/AMDGPU
46045.5
ns41150
ns1.12
dense(2, bias=false, act=identity)(2 x 128)/zygote/CPU/2 thread(s)
3000
ns3833
ns0.78
dense(2, bias=false, act=identity)(2 x 128)/zygote/CPU/4 thread(s)
3083
ns3041
ns1.01
dense(2, bias=false, act=identity)(2 x 128)/zygote/CPU/8 thread(s)
3042
ns3375
ns0.90
dense(2, bias=false, act=identity)(2 x 128)/zygote/CPU/1 thread(s)
2875
ns3125
ns0.92
dense(2, bias=false, act=identity)(2 x 128)/zygote/GPU/CUDA
189582.5
ns190584.5
ns0.99
dense(2, bias=false, act=identity)(2 x 128)/zygote/GPU/oneAPI
7324802.5
ns7912209
ns0.93
dense(2, bias=false, act=identity)(2 x 128)/zygote/GPU/Metal
935896
ns1265542
ns0.74
dense(2, bias=false, act=identity)(2 x 128)/zygote/GPU/AMDGPU
180597.5
ns153656.5
ns1.18
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s)
455354.5
ns454937
ns1.00
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s)
448250
ns454750
ns0.99
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s)
428521
ns458229
ns0.94
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s)
445375
ns427188
ns1.04
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/CUDA
137896
ns138010.5
ns1.00
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/oneAPI
5931991
ns5819207
ns1.02
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/Metal
1997916
ns2011000
ns0.99
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/AMDGPU
395586
ns325693
ns1.21
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s)
3804104
ns3801708.5
ns1.00
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s)
3808416
ns3811125
ns1.00
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s)
3806541.5
ns3821292
ns1.00
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s)
3799792
ns3815375
ns1.00
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/CUDA
711866.5
ns710674
ns1.00
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/oneAPI
32500206
ns32043185
ns1.01
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/Metal
11123208.5
ns10832625.5
ns1.03
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/AMDGPU
1305610
ns1491590
ns0.88
batchedmm(512, Bsize=32)/forward/CPU/2 thread(s)
49878875
ns49856479
ns1.00
batchedmm(512, Bsize=32)/forward/CPU/4 thread(s)
35553750
ns35516042
ns1.00
batchedmm(512, Bsize=32)/forward/CPU/8 thread(s)
35552083
ns26022291
ns1.37
batchedmm(512, Bsize=32)/forward/CPU/1 thread(s)
97154833
ns97102959
ns1.00
batchedmm(512, Bsize=32)/forward/GPU/CUDA
1599254.5
ns1594251.5
ns1.00
batchedmm(512, Bsize=32)/forward/GPU/AMDGPU
1094408
ns1009650
ns1.08
batchedmm(512, Bsize=32)/zygote/CPU/2 thread(s)
154338208.5
ns154623520.5
ns1.00
batchedmm(512, Bsize=32)/zygote/CPU/4 thread(s)
112497125
ns112350625
ns1.00
batchedmm(512, Bsize=32)/zygote/CPU/8 thread(s)
112457084
ns89065125
ns1.26
batchedmm(512, Bsize=32)/zygote/CPU/1 thread(s)
295404416.5
ns296081125
ns1.00
batchedmm(512, Bsize=32)/zygote/GPU/CUDA
6447376
ns6489845.5
ns0.99
batchedmm(512, Bsize=32)/zygote/GPU/AMDGPU
5620571
ns5556104
ns1.01
bias_activation(32, act=tanh)(32 x 128)/forward/CPU/2 thread(s)
19563
ns17312.5
ns1.13
bias_activation(32, act=tanh)(32 x 128)/forward/CPU/4 thread(s)
17833
ns16834
ns1.06
bias_activation(32, act=tanh)(32 x 128)/forward/CPU/8 thread(s)
16833
ns14291.5
ns1.18
bias_activation(32, act=tanh)(32 x 128)/forward/CPU/1 thread(s)
15458.5
ns15167
ns1.02
bias_activation(32, act=tanh)(32 x 128)/forward/GPU/CUDA
21582
ns21687
ns1.00
bias_activation(32, act=tanh)(32 x 128)/forward/GPU/oneAPI
1105119.5
ns1157478.5
ns0.95
bias_activation(32, act=tanh)(32 x 128)/forward/GPU/Metal
219625
ns218167
ns1.01
bias_activation(32, act=tanh)(32 x 128)/forward/GPU/AMDGPU
29820
ns27541
ns1.08
bias_activation(32, act=tanh)(32 x 128)/zygote/CPU/2 thread(s)
10792
ns11042
ns0.98
bias_activation(32, act=tanh)(32 x 128)/zygote/CPU/4 thread(s)
8958
ns9000.5
ns1.00
bias_activation(32, act=tanh)(32 x 128)/zygote/CPU/8 thread(s)
9146
ns7875
ns1.16
bias_activation(32, act=tanh)(32 x 128)/zygote/CPU/1 thread(s)
17479
ns17416.5
ns1.00
bias_activation(32, act=tanh)(32 x 128)/zygote/GPU/CUDA
261508.5
ns261161
ns1.00
bias_activation(32, act=tanh)(32 x 128)/zygote/GPU/oneAPI
10504088.5
ns9552185
ns1.10
bias_activation(32, act=tanh)(32 x 128)/zygote/GPU/Metal
1527687.5
ns1560042
ns0.98
bias_activation(32, act=tanh)(32 x 128)/zygote/GPU/AMDGPU
159993
ns155181
ns1.03
groupnorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/2 thread(s)
8916.5
ns8125
ns1.10
groupnorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/4 thread(s)
8958
ns8084
ns1.11
groupnorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/8 thread(s)
10541
ns10083.5
ns1.05
groupnorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/1 thread(s)
9084
ns8542
ns1.06
groupnorm(2, act=gelu, affine=true)(4 x 32)/forward/GPU/CUDA
116829.5
ns116504
ns1.00
groupnorm(2, act=gelu, affine=true)(4 x 32)/forward/GPU/oneAPI
3509968
ns3349407.5
ns1.05
groupnorm(2, act=gelu, affine=true)(4 x 32)/forward/GPU/Metal
790562.5
ns798667
ns0.99
groupnorm(2, act=gelu, affine=true)(4 x 32)/forward/GPU/AMDGPU
248174
ns238952.5
ns1.04
groupnorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/2 thread(s)
9541.5
ns9854
ns0.97
groupnorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/4 thread(s)
9833
ns10229.5
ns0.96
groupnorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/8 thread(s)
10375
ns10083
ns1.03
groupnorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/1 thread(s)
9604.5
ns9958
ns0.96
groupnorm(2, act=gelu, affine=true)(4 x 32)/zygote/GPU/CUDA
626007
ns623888
ns1.00
groupnorm(2, act=gelu, affine=true)(4 x 32)/zygote/GPU/oneAPI
21525719
ns22194230
ns0.97
groupnorm(2, act=gelu, affine=true)(4 x 32)/zygote/GPU/Metal
4884770.5
ns4515667
ns1.08
groupnorm(2, act=gelu, affine=true)(4 x 32)/zygote/GPU/AMDGPU
702793
ns656976
ns1.07
groupnorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/2 thread(s)
10042
ns9520.5
ns1.05
groupnorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/4 thread(s)
10000
ns9125
ns1.10
groupnorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/8 thread(s)
11125
ns11625
ns0.96
groupnorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/1 thread(s)
9729.5
ns9479.5
ns1.03
groupnorm(2, act=relu, affine=false)(32 x 32)/forward/GPU/CUDA
121691.5
ns120769
ns1.01
groupnorm(2, act=relu, affine=false)(32 x 32)/forward/GPU/oneAPI
3278367.5
ns3531092
ns0.93
groupnorm(2, act=relu, affine=false)(32 x 32)/forward/GPU/Metal
870854
ns888291
ns0.98
groupnorm(2, act=relu, affine=false)(32 x 32)/forward/GPU/AMDGPU
79151
ns79170
ns1.00
groupnorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/2 thread(s)
13291
ns14208
ns0.94
groupnorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/4 thread(s)
14228.5
ns13208.5
ns1.08
groupnorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/8 thread(s)
15250
ns16333
ns0.93
groupnorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/1 thread(s)
15167
ns17000
ns0.89
groupnorm(2, act=relu, affine=false)(32 x 32)/zygote/GPU/CUDA
595764
ns594781
ns1.00
groupnorm(2, act=relu, affine=false)(32 x 32)/zygote/GPU/oneAPI
20421555
ns19851682
ns1.03
groupnorm(2, act=relu, affine=false)(32 x 32)/zygote/GPU/Metal
4551104
ns4474458
ns1.02
groupnorm(2, act=relu, affine=false)(32 x 32)/zygote/GPU/AMDGPU
379417
ns357348.5
ns1.06
batchnorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/2 thread(s)
459
ns500
ns0.92
batchnorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/4 thread(s)
583
ns459
ns1.27
batchnorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/8 thread(s)
666
ns458
ns1.45
batchnorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/1 thread(s)
458
ns500
ns0.92
batchnorm(2, act=gelu, affine=false)(4 x 32)/forward/GPU/CUDA
34966
ns34855
ns1.00
batchnorm(2, act=gelu, affine=false)(4 x 32)/forward/GPU/oneAPI
1172686
ns1184802
ns0.99
batchnorm(2, act=gelu, affine=false)(4 x 32)/forward/GPU/Metal
273437.5
ns423042
ns0.65
batchnorm(2, act=gelu, affine=false)(4 x 32)/forward/GPU/AMDGPU
209974
ns209842
ns1.00
batchnorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/2 thread(s)
7145.5
ns7709
ns0.93
batchnorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/4 thread(s)
7625
ns7084
ns1.08
batchnorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/8 thread(s)
8834
ns7708
ns1.15
batchnorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/1 thread(s)
8396
ns8042
ns1.04
batchnorm(2, act=gelu, affine=false)(4 x 32)/zygote/GPU/CUDA
232144
ns231568.5
ns1.00
batchnorm(2, act=gelu, affine=false)(4 x 32)/zygote/GPU/oneAPI
23454434.5
ns22217593.5
ns1.06
batchnorm(2, act=gelu, affine=false)(4 x 32)/zygote/GPU/Metal
4465875
ns5660167
ns0.79
batchnorm(2, act=gelu, affine=false)(4 x 32)/zygote/GPU/AMDGPU
709093
ns679867
ns1.04
bias_activation(32, act=gelu)(32 x 128)/forward/CPU/2 thread(s)
16417
ns16042
ns1.02
bias_activation(32, act=gelu)(32 x 128)/forward/CPU/4 thread(s)
15417
ns15333
ns1.01
bias_activation(32, act=gelu)(32 x 128)/forward/CPU/8 thread(s)
14666
ns13854
ns1.06
bias_activation(32, act=gelu)(32 x 128)/forward/CPU/1 thread(s)
10583
ns10375
ns1.02
bias_activation(32, act=gelu)(32 x 128)/forward/GPU/CUDA
22370
ns22215
ns1.01
bias_activation(32, act=gelu)(32 x 128)/forward/GPU/oneAPI
1145949.5
ns1158702.5
ns0.99
bias_activation(32, act=gelu)(32 x 128)/forward/GPU/Metal
198667
ns205521
ns0.97
bias_activation(32, act=gelu)(32 x 128)/forward/GPU/AMDGPU
192503.5
ns194012
ns0.99
bias_activation(32, act=gelu)(32 x 128)/zygote/CPU/2 thread(s)
31709
ns31958
ns0.99
bias_activation(32, act=gelu)(32 x 128)/zygote/CPU/4 thread(s)
32250
ns32145.5
ns1.00
bias_activation(32, act=gelu)(32 x 128)/zygote/CPU/8 thread(s)
32250
ns32250
ns1
bias_activation(32, act=gelu)(32 x 128)/zygote/CPU/1 thread(s)
31750
ns32250
ns0.98
bias_activation(32, act=gelu)(32 x 128)/zygote/GPU/CUDA
275284
ns276502.5
ns1.00
bias_activation(32, act=gelu)(32 x 128)/zygote/GPU/oneAPI
12020472
ns11085623
ns1.08
bias_activation(32, act=gelu)(32 x 128)/zygote/GPU/Metal
1736729.5
ns1721729
ns1.01
bias_activation(32, act=gelu)(32 x 128)/zygote/GPU/AMDGPU
607877
ns605276.5
ns1.00
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s)
440083
ns474834
ns0.93
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s)
450750
ns445167
ns1.01
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s)
445500
ns486875
ns0.92
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s)
443521
ns474916
ns0.93
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/CUDA
194554
ns194410
ns1.00
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/oneAPI
6201210
ns5748288
ns1.08
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/Metal
1957625
ns2751937.5
ns0.71
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/AMDGPU
370197
ns326354
ns1.13
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s)
3828541
ns3823792
ns1.00
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s)
3823666
ns3824042
ns1.00
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s)
3797896
ns3849500
ns0.99
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s)
3830916
ns3847584
ns1.00
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/CUDA
542025
ns546410
ns0.99
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/oneAPI
27662239
ns27926309
ns0.99
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/Metal
9597062.5
ns10140750
ns0.95
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/AMDGPU
1406828
ns1388348.5
ns1.01
batchedmm(512, Bsize=512)/forward/CPU/2 thread(s)
835622667
ns782652917
ns1.07
batchedmm(512, Bsize=512)/forward/CPU/4 thread(s)
542659916
ns542161792
ns1.00
batchedmm(512, Bsize=512)/forward/CPU/8 thread(s)
543414375
ns420966458.5
ns1.29
batchedmm(512, Bsize=512)/forward/CPU/1 thread(s)
1511375063
ns1553203729.5
ns0.97
batchedmm(512, Bsize=512)/forward/GPU/CUDA
22747194
ns22558411.5
ns1.01
batchedmm(512, Bsize=512)/forward/GPU/AMDGPU
14291922
ns14062784.5
ns1.02
batchedmm(512, Bsize=512)/zygote/CPU/2 thread(s)
3023342458
ns2518008250
ns1.20
batchedmm(512, Bsize=512)/zygote/CPU/4 thread(s)
1783442458
ns1785714792
ns1.00
batchedmm(512, Bsize=512)/zygote/CPU/8 thread(s)
1798462750
ns1525039667
ns1.18
batchedmm(512, Bsize=512)/zygote/CPU/1 thread(s)
4766247958
ns4874366334
ns0.98
batchedmm(512, Bsize=512)/zygote/GPU/CUDA
367462031
ns367235490
ns1.00
batchedmm(512, Bsize=512)/zygote/GPU/AMDGPU
88049233
ns88231178
ns1.00
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s)
76166.5
ns77646
ns0.98
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s)
77000
ns75959
ns1.01
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s)
79625
ns82625
ns0.96
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s)
77458
ns77291
ns1.00
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/GPU/CUDA
213561
ns208602.5
ns1.02
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/GPU/oneAPI
7926068
ns8336540
ns0.95
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/GPU/Metal
520125
ns525229
ns0.99
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/GPU/AMDGPU
113802.5
ns109211
ns1.04
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s)
203959
ns199042
ns1.02
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s)
282979
ns262396
ns1.08
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s)
195667
ns276625
ns0.71
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s)
203125
ns287458
ns0.71
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/CUDA
1066917
ns1056833
ns1.01
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/oneAPI
42732466
ns40754174
ns1.05
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/Metal
6048208
ns6090583
ns0.99
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/AMDGPU
677885
ns646691
ns1.05
batchedmm(512, Bsize=128)/forward/CPU/2 thread(s)
199952791.5
ns199913000
ns1.00
batchedmm(512, Bsize=128)/forward/CPU/4 thread(s)
139336834
ns139280375
ns1.00
batchedmm(512, Bsize=128)/forward/CPU/8 thread(s)
139226875
ns104140916
ns1.34
batchedmm(512, Bsize=128)/forward/CPU/1 thread(s)
389244000
ns389020708
ns1.00
batchedmm(512, Bsize=128)/forward/GPU/CUDA
5812725
ns5827400
ns1.00
batchedmm(512, Bsize=128)/forward/GPU/AMDGPU
3450504
ns3419864.5
ns1.01
batchedmm(512, Bsize=128)/zygote/CPU/2 thread(s)
619039062
ns620313062.5
ns1.00
batchedmm(512, Bsize=128)/zygote/CPU/4 thread(s)
439416041
ns440225000
ns1.00
batchedmm(512, Bsize=128)/zygote/CPU/8 thread(s)
438636083.5
ns352767458
ns1.24
batchedmm(512, Bsize=128)/zygote/CPU/1 thread(s)
1188607167
ns1182963541
ns1.00
batchedmm(512, Bsize=128)/zygote/GPU/CUDA
26431639
ns26862507
ns0.98
batchedmm(512, Bsize=128)/zygote/GPU/AMDGPU
21894468
ns21755438
ns1.01
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s)
7333
ns7292
ns1.01
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s)
6250
ns6083
ns1.03
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s)
6083
ns5291
ns1.15
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s)
9959
ns10041
ns0.99
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/GPU/CUDA
28527
ns28028
ns1.02
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/GPU/oneAPI
1247952
ns1272660
ns0.98
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/GPU/Metal
430792
ns627458
ns0.69
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/GPU/AMDGPU
53561
ns48010
ns1.12
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s)
243708
ns220750
ns1.10
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s)
220250
ns220521
ns1.00
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s)
222125
ns221875
ns1.00
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s)
207959
ns209208.5
ns0.99
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/CUDA
225129
ns222206
ns1.01
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/oneAPI
30977876
ns29719216
ns1.04
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/Metal
9232312.5
ns9434666.5
ns0.98
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/AMDGPU
578042
ns527475
ns1.10
groupnorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/2 thread(s)
9396
ns8458.5
ns1.11
groupnorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/4 thread(s)
9500
ns9209
ns1.03
groupnorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/8 thread(s)
10771
ns10375
ns1.04
groupnorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/1 thread(s)
9583
ns8083
ns1.19
groupnorm(2, act=relu, affine=false)(4 x 32)/forward/GPU/CUDA
121662
ns119377.5
ns1.02
groupnorm(2, act=relu, affine=false)(4 x 32)/forward/GPU/oneAPI
3360687
ns3449983
ns0.97
groupnorm(2, act=relu, affine=false)(4 x 32)/forward/GPU/Metal
875708
ns855000
ns1.02
groupnorm(2, act=relu, affine=false)(4 x 32)/forward/GPU/AMDGPU
75411
ns72520
ns1.04
groupnorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/2 thread(s)
7208
ns8958.5
ns0.80
groupnorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/4 thread(s)
8125
ns7500
ns1.08
groupnorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/8 thread(s)
9584
ns10084
ns0.95
groupnorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/1 thread(s)
8917
ns10187.5
ns0.88
groupnorm(2, act=relu, affine=false)(4 x 32)/zygote/GPU/CUDA
529854.5
ns521950
ns1.02
groupnorm(2, act=relu, affine=false)(4 x 32)/zygote/GPU/oneAPI
19211637
ns18008002
ns1.07
groupnorm(2, act=relu, affine=false)(4 x 32)/zygote/GPU/Metal
4408249.5
ns4315292
ns1.02
groupnorm(2, act=relu, affine=false)(4 x 32)/zygote/GPU/AMDGPU
346897
ns321943
ns1.08
batchnorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/2 thread(s)
500
ns625
ns0.80
batchnorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/4 thread(s)
500
ns458
ns1.09
batchnorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/8 thread(s)
750
ns625
ns1.20
batchnorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/1 thread(s)
625
ns625
ns1
batchnorm(2, act=identity, affine=true)(32 x 32)/forward/GPU/CUDA
26993
ns26701
ns1.01
batchnorm(2, act=identity, affine=true)(32 x 32)/forward/GPU/oneAPI
1174455.5
ns1195571.5
ns0.98
batchnorm(2, act=identity, affine=true)(32 x 32)/forward/GPU/Metal
467146
ns459104
ns1.02
batchnorm(2, act=identity, affine=true)(32 x 32)/forward/GPU/AMDGPU
52391
ns48701
ns1.08
batchnorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/2 thread(s)
8584
ns10375
ns0.83
batchnorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/4 thread(s)
8792
ns8479
ns1.04
batchnorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/8 thread(s)
10834
ns11375
ns0.95
batchnorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/1 thread(s)
15375
ns9375
ns1.64
batchnorm(2, act=identity, affine=true)(32 x 32)/zygote/GPU/CUDA
257305
ns252977
ns1.02
batchnorm(2, act=identity, affine=true)(32 x 32)/zygote/GPU/oneAPI
23468849
ns24052360
ns0.98
batchnorm(2, act=identity, affine=true)(32 x 32)/zygote/GPU/Metal
5677333.5
ns5702709
ns1.00
batchnorm(2, act=identity, affine=true)(32 x 32)/zygote/GPU/AMDGPU
442295
ns397983.5
ns1.11
bias_activation(512, act=gelu)(512 x 128)/forward/CPU/2 thread(s)
106875
ns106500
ns1.00
bias_activation(512, act=gelu)(512 x 128)/forward/CPU/4 thread(s)
99584
ns98125
ns1.01
bias_activation(512, act=gelu)(512 x 128)/forward/CPU/8 thread(s)
100792
ns87479.5
ns1.15
bias_activation(512, act=gelu)(512 x 128)/forward/CPU/1 thread(s)
146750
ns147229
ns1.00
bias_activation(512, act=gelu)(512 x 128)/forward/GPU/CUDA
25363
ns24863
ns1.02
bias_activation(512, act=gelu)(512 x 128)/forward/GPU/oneAPI
1177107
ns1228355
ns0.96
bias_activation(512, act=gelu)(512 x 128)/forward/GPU/Metal
262833
ns263458.5
ns1.00
bias_activation(512, act=gelu)(512 x 128)/forward/GPU/AMDGPU
194834
ns190212
ns1.02
bias_activation(512, act=gelu)(512 x 128)/zygote/CPU/2 thread(s)
478584
ns478667
ns1.00
bias_activation(512, act=gelu)(512 x 128)/zygote/CPU/4 thread(s)
492000
ns509250
ns0.97
bias_activation(512, act=gelu)(512 x 128)/zygote/CPU/8 thread(s)
483042
ns518562.5
ns0.93
bias_activation(512, act=gelu)(512 x 128)/zygote/CPU/1 thread(s)
516875
ns520417
ns0.99
bias_activation(512, act=gelu)(512 x 128)/zygote/GPU/CUDA
236865
ns234381
ns1.01
bias_activation(512, act=gelu)(512 x 128)/zygote/GPU/oneAPI
11915915
ns11772054
ns1.01
bias_activation(512, act=gelu)(512 x 128)/zygote/GPU/Metal
2088333
ns2148312.5
ns0.97
bias_activation(512, act=gelu)(512 x 128)/zygote/GPU/AMDGPU
625564.5
ns621156
ns1.01
batchedmm(16, Bsize=32)/forward/CPU/2 thread(s)
7583
ns5375
ns1.41
batchedmm(16, Bsize=32)/forward/CPU/4 thread(s)
5833
ns5167
ns1.13
batchedmm(16, Bsize=32)/forward/CPU/8 thread(s)
7458
ns7500
ns0.99
batchedmm(16, Bsize=32)/forward/CPU/1 thread(s)
4562.5
ns4833.5
ns0.94
batchedmm(16, Bsize=32)/forward/GPU/CUDA
16602
ns16136
ns1.03
batchedmm(16, Bsize=32)/forward/GPU/AMDGPU
79982
ns79061
ns1.01
batchedmm(16, Bsize=32)/zygote/CPU/2 thread(s)
11750
ns14083
ns0.83
batchedmm(16, Bsize=32)/zygote/CPU/4 thread(s)
11208.5
ns10208.5
ns1.10
batchedmm(16, Bsize=32)/zygote/CPU/8 thread(s)
11750
ns10292
ns1.14
batchedmm(16, Bsize=32)/zygote/CPU/1 thread(s)
17458.5
ns16708
ns1.04
batchedmm(16, Bsize=32)/zygote/GPU/CUDA
216645
ns213958
ns1.01
batchedmm(16, Bsize=32)/zygote/GPU/AMDGPU
391539
ns374963
ns1.04
batchedmm(16, Bsize=128)/forward/CPU/2 thread(s)
39458
ns40000
ns0.99
batchedmm(16, Bsize=128)/forward/CPU/4 thread(s)
51292
ns50584
ns1.01
batchedmm(16, Bsize=128)/forward/CPU/8 thread(s)
52750
ns52458.5
ns1.01
batchedmm(16, Bsize=128)/forward/CPU/1 thread(s)
13792
ns13895.5
ns0.99
batchedmm(16, Bsize=128)/forward/GPU/CUDA
21950
ns19866
ns1.10
batchedmm(16, Bsize=128)/forward/GPU/AMDGPU
94612
ns87035.5
ns1.09
batchedmm(16, Bsize=128)/zygote/CPU/2 thread(s)
42187.5
ns38625
ns1.09
batchedmm(16, Bsize=128)/zygote/CPU/4 thread(s)
32750
ns30646
ns1.07
batchedmm(16, Bsize=128)/zygote/CPU/8 thread(s)
33000
ns30791.5
ns1.07
batchedmm(16, Bsize=128)/zygote/CPU/1 thread(s)
58271
ns57666
ns1.01
batchedmm(16, Bsize=128)/zygote/GPU/CUDA
195696
ns192524
ns1.02
batchedmm(16, Bsize=128)/zygote/GPU/AMDGPU
447525.5
ns416745
ns1.07
bias_activation(2, act=tanh)(2 x 128)/forward/CPU/2 thread(s)
1583.5
ns1604.5
ns0.99
bias_activation(2, act=tanh)(2 x 128)/forward/CPU/4 thread(s)
1917
ns1791
ns1.07
bias_activation(2, act=tanh)(2 x 128)/forward/CPU/8 thread(s)
2333
ns2042
ns1.14
bias_activation(2, act=tanh)(2 x 128)/forward/CPU/1 thread(s)
1812.5
ns1708
ns1.06
bias_activation(2, act=tanh)(2 x 128)/forward/GPU/CUDA
21498
ns21123
ns1.02
bias_activation(2, act=tanh)(2 x 128)/forward/GPU/oneAPI
1151336
ns1140764
ns1.01
bias_activation(2, act=tanh)(2 x 128)/forward/GPU/Metal
293083.5
ns294500
ns1.00
bias_activation(2, act=tanh)(2 x 128)/forward/GPU/AMDGPU
38250.5
ns30391
ns1.26
bias_activation(2, act=tanh)(2 x 128)/zygote/CPU/2 thread(s)
2125
ns2042
ns1.04
bias_activation(2, act=tanh)(2 x 128)/zygote/CPU/4 thread(s)
2312.5
ns2125
ns1.09
bias_activation(2, act=tanh)(2 x 128)/zygote/CPU/8 thread(s)
2208
ns2292
ns0.96
bias_activation(2, act=tanh)(2 x 128)/zygote/CPU/1 thread(s)
2208
ns2208
ns1
bias_activation(2, act=tanh)(2 x 128)/zygote/GPU/CUDA
206694
ns205122.5
ns1.01
bias_activation(2, act=tanh)(2 x 128)/zygote/GPU/oneAPI
8887387.5
ns8519681
ns1.04
bias_activation(2, act=tanh)(2 x 128)/zygote/GPU/Metal
1497979
ns1638500
ns0.91
bias_activation(2, act=tanh)(2 x 128)/zygote/GPU/AMDGPU
147833
ns139726.5
ns1.06
layernorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/2 thread(s)
5291.5
ns5709
ns0.93
layernorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/4 thread(s)
5250
ns5104
ns1.03
layernorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/8 thread(s)
6791.5
ns5750
ns1.18
layernorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/1 thread(s)
5562.5
ns4271
ns1.30
layernorm(2, act=identity, affine=true)(4 x 32)/forward/GPU/CUDA
148541
ns146388.5
ns1.01
layernorm(2, act=identity, affine=true)(4 x 32)/forward/GPU/oneAPI
5839841.5
ns5488369.5
ns1.06
layernorm(2, act=identity, affine=true)(4 x 32)/forward/GPU/Metal
451916
ns465291
ns0.97
layernorm(2, act=identity, affine=true)(4 x 32)/forward/GPU/AMDGPU
75451
ns72161
ns1.05
layernorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/2 thread(s)
8541.5
ns8479.5
ns1.01
layernorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/4 thread(s)
8709
ns8209
ns1.06
layernorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/8 thread(s)
9334
ns8750
ns1.07
layernorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/1 thread(s)
9000
ns9042
ns1.00
layernorm(2, act=identity, affine=true)(4 x 32)/zygote/GPU/CUDA
897209
ns884256.5
ns1.01
layernorm(2, act=identity, affine=true)(4 x 32)/zygote/GPU/oneAPI
39004376
ns38177021
ns1.02
layernorm(2, act=identity, affine=true)(4 x 32)/zygote/GPU/Metal
5527666
ns5496125
ns1.01
layernorm(2, act=identity, affine=true)(4 x 32)/zygote/GPU/AMDGPU
417789
ns394569
ns1.06
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s)
56875
ns56791
ns1.00
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s)
57750
ns57625
ns1.00
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s)
57542
ns56875
ns1.01
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s)
57916
ns58166
ns1.00
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/CUDA
38278
ns37427.5
ns1.02
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/oneAPI
1212677.5
ns1210467.5
ns1.00
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/Metal
466125
ns468667
ns0.99
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/AMDGPU
208395
ns208482
ns1.00
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s)
451167
ns487354.5
ns0.93
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s)
477500
ns501250
ns0.95
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s)
469250.5
ns492208.5
ns0.95
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s)
444875
ns437438
ns1.02
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/CUDA
271103
ns267413
ns1.01
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/oneAPI
26939835
ns26782051.5
ns1.01
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/Metal
8197917
ns8248375
ns0.99
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/AMDGPU
826719.5
ns839679
ns0.98
batchedmm(128, Bsize=128)/forward/CPU/2 thread(s)
3309791
ns3311333.5
ns1.00
batchedmm(128, Bsize=128)/forward/CPU/4 thread(s)
2333708
ns2340166.5
ns1.00
batchedmm(128, Bsize=128)/forward/CPU/8 thread(s)
2341958
ns1769958
ns1.32
batchedmm(128, Bsize=128)/forward/CPU/1 thread(s)
6325125
ns6319645.5
ns1.00
batchedmm(128, Bsize=128)/forward/GPU/CUDA
206333.5
ns205610
ns1.00
batchedmm(128, Bsize=128)/forward/GPU/AMDGPU
225750.5
ns202712
ns1.11
batchedmm(128, Bsize=128)/zygote/CPU/2 thread(s)
11474875
ns11497979
ns1.00
batchedmm(128, Bsize=128)/zygote/CPU/4 thread(s)
8336979
ns8319667
ns1.00
batchedmm(128, Bsize=128)/zygote/CPU/8 thread(s)
8319292
ns6588125
ns1.26
batchedmm(128, Bsize=128)/zygote/CPU/1 thread(s)
21266187.5
ns21221896
ns1.00
batchedmm(128, Bsize=128)/zygote/GPU/CUDA
735499
ns736463
ns1.00
batchedmm(128, Bsize=128)/zygote/GPU/AMDGPU
1123791.5
ns1065445
ns1.05
layernorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/2 thread(s)
7209
ns5562.5
ns1.30
layernorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/4 thread(s)
5145.5
ns4666.5
ns1.10
layernorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/8 thread(s)
6542
ns6437.5
ns1.02
layernorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/1 thread(s)
6792
ns6104
ns1.11
layernorm(2, act=identity, affine=false)(4 x 32)/forward/GPU/CUDA
141393
ns139569.5
ns1.01
layernorm(2, act=identity, affine=false)(4 x 32)/forward/GPU/oneAPI
5544700.5
ns5734965.5
ns0.97
layernorm(2, act=identity, affine=false)(4 x 32)/forward/GPU/Metal
751271
ns826042
ns0.91
layernorm(2, act=identity, affine=false)(4 x 32)/forward/GPU/AMDGPU
61961
ns59531
ns1.04
layernorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/2 thread(s)
7500
ns9333.5
ns0.80
layernorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/4 thread(s)
7375
ns7000
ns1.05
layernorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/8 thread(s)
10417
ns11875
ns0.88
layernorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/1 thread(s)
10458
ns8708
ns1.20
layernorm(2, act=identity, affine=false)(4 x 32)/zygote/GPU/CUDA
772087
ns764194
ns1.01
layernorm(2, act=identity, affine=false)(4 x 32)/zygote/GPU/oneAPI
34449457
ns34028843.5
ns1.01
layernorm(2, act=identity, affine=false)(4 x 32)/zygote/GPU/Metal
5194250
ns5176312.5
ns1.00
layernorm(2, act=identity, affine=false)(4 x 32)/zygote/GPU/AMDGPU
413259.5
ns378403
ns1.09
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s)
101625
ns99625
ns1.02
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s)
116167
ns136708
ns0.85
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s)
97521
ns101312.5
ns0.96
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s)
129583
ns129709
ns1.00
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/CUDA
152327.5
ns151420
ns1.01
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/oneAPI
5751979.5
ns6034399
ns0.95
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/Metal
2023792
ns1982667
ns1.02
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/AMDGPU
209935
ns206692
ns1.02
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s)
2000458
ns2031041
ns0.98
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s)
2039084
ns2037417
ns1.00
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s)
2004708
ns2036291
ns0.98
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s)
2042146
ns2038584
ns1.00
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/CUDA
718327
ns708221
ns1.01
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/oneAPI
31776627
ns31488037
ns1.01
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/Metal
11003520.5
ns11251291
ns0.98
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/AMDGPU
1109931
ns1126246
ns0.99
batchedmm(2, Bsize=4)/forward/CPU/2 thread(s)
33688
ns33459
ns1.01
batchedmm(2, Bsize=4)/forward/CPU/4 thread(s)
36229.5
ns36750
ns0.99
batchedmm(2, Bsize=4)/forward/CPU/8 thread(s)
36000
ns33833
ns1.06
batchedmm(2, Bsize=4)/forward/CPU/1 thread(s)
708
ns667
ns1.06
batchedmm(2, Bsize=4)/forward/GPU/CUDA
15790
ns15506
ns1.02
batchedmm(2, Bsize=4)/forward/GPU/AMDGPU
93042
ns86920
ns1.07
batchedmm(2, Bsize=4)/zygote/CPU/2 thread(s)
2625
ns4792
ns0.55
batchedmm(2, Bsize=4)/zygote/CPU/4 thread(s)
2958
ns2709
ns1.09
batchedmm(2, Bsize=4)/zygote/CPU/8 thread(s)
3583
ns3167
ns1.13
batchedmm(2, Bsize=4)/zygote/CPU/1 thread(s)
2979.5
ns2291.5
ns1.30
batchedmm(2, Bsize=4)/zygote/GPU/CUDA
143400.5
ns140769.5
ns1.02
batchedmm(2, Bsize=4)/zygote/GPU/AMDGPU
389544.5
ns351474
ns1.11
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s)
7250
ns7250
ns1
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s)
6041
ns6000
ns1.01
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s)
5708
ns5375
ns1.06
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s)
10041
ns10000
ns1.00
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/GPU/CUDA
37768.5
ns36795
ns1.03
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/GPU/oneAPI
1165405
ns1247042.5
ns0.93
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/GPU/Metal
353500
ns351333
ns1.01
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/GPU/AMDGPU
52952
ns49030
ns1.08
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s)
213770.5
ns213334
ns1.00
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s)
220312.5
ns220166.5
ns1.00
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s)
222750
ns228125
ns0.98
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s)
207646
ns206875
ns1.00
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/CUDA
248646
ns244945
ns1.02
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/oneAPI
26791984
ns24969632
ns1.07
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/Metal
7867770.5
ns7965166.5
ns0.99
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/AMDGPU
615929.5
ns578090.5
ns1.07
dense(32, bias=true, act=identity)(32 x 128)/forward/CPU/2 thread(s)
3917
ns3916
ns1.00
dense(32, bias=true, act=identity)(32 x 128)/forward/CPU/4 thread(s)
3958
ns3959
ns1.00
dense(32, bias=true, act=identity)(32 x 128)/forward/CPU/8 thread(s)
3917
ns3958
ns0.99
dense(32, bias=true, act=identity)(32 x 128)/forward/CPU/1 thread(s)
3917
ns3958
ns0.99
dense(32, bias=true, act=identity)(32 x 128)/forward/GPU/CUDA
22442
ns21762
ns1.03
dense(32, bias=true, act=identity)(32 x 128)/forward/GPU/oneAPI
2173165.5
ns2067928.5
ns1.05
dense(32, bias=true, act=identity)(32 x 128)/forward/GPU/Metal
240291
ns245104
ns0.98
dense(32, bias=true, act=identity)(32 x 128)/forward/GPU/AMDGPU
48101
ns45631
ns1.05
dense(32, bias=true, act=identity)(32 x 128)/zygote/CPU/2 thread(s)
14834
ns14875
ns1.00
dense(32, bias=true, act=identity)(32 x 128)/zygote/CPU/4 thread(s)
15000
ns14916
ns1.01
dense(32, bias=true, act=identity)(32 x 128)/zygote/CPU/8 thread(s)
14917
ns14667
ns1.02
dense(32, bias=true, act=identity)(32 x 128)/zygote/CPU/1 thread(s)
14666
ns14667
ns1.00
dense(32, bias=true, act=identity)(32 x 128)/zygote/GPU/CUDA
315075
ns310256.5
ns1.02
dense(32, bias=true, act=identity)(32 x 128)/zygote/GPU/oneAPI
11343180
ns11269459
ns1.01
dense(32, bias=true, act=identity)(32 x 128)/zygote/GPU/Metal
1001479.5
ns1000292
ns1.00
dense(32, bias=true, act=identity)(32 x 128)/zygote/GPU/AMDGPU
210165
ns193502
ns1.09
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s)
135792
ns102917
ns1.32
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s)
104291
ns103667
ns1.01
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s)
105021
ns108625
ns0.97
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s)
136667
ns131875
ns1.04
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/GPU/CUDA
146027
ns137366.5
ns1.06
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/GPU/oneAPI
6157059
ns5955500.5
ns1.03
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/GPU/Metal
1925791
ns1988958
ns0.97
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/GPU/AMDGPU
190028
ns200842
ns0.95
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s)
1910209
ns1926354.5
ns0.99
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s)
1920083
ns1913500
ns1.00
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s)
1909604.5
ns1917792
ns1.00
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s)
1930625
ns1936729
ns1.00
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/CUDA
700646
ns692519
ns1.01
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/oneAPI
30420268
ns33116808.5
ns0.92
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/Metal
10973125
ns11144584
ns0.98
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/AMDGPU
1244371
ns1078360.5
ns1.15
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s)
17833
ns17708
ns1.01
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s)
18146
ns22291.5
ns0.81
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s)
19834
ns21250
ns0.93
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s)
19958
ns19146
ns1.04
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/CUDA
111591.5
ns109241
ns1.02
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/oneAPI
3532207.5
ns3392625.5
ns1.04
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/Metal
1347125
ns1271125
ns1.06
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/AMDGPU
87019
ns81331
ns1.07
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s)
217604.5
ns221229.5
ns0.98
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s)
228270.5
ns216791
ns1.05
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s)
218417
ns230083.5
ns0.95
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s)
215833
ns216083.5
ns1.00
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/CUDA
530706
ns522920
ns1.01
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/oneAPI
20900864.5
ns19545470
ns1.07
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/Metal
6137312.5
ns6165645.5
ns1.00
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/AMDGPU
517847
ns476780
ns1.09
batchedmm(16, Bsize=4)/forward/CPU/2 thread(s)
27333
ns26250
ns1.04
batchedmm(16, Bsize=4)/forward/CPU/4 thread(s)
30104.5
ns31250
ns0.96
batchedmm(16, Bsize=4)/forward/CPU/8 thread(s)
30000
ns27875
ns1.08
batchedmm(16, Bsize=4)/forward/CPU/1 thread(s)
1833
ns1292
ns1.42
batchedmm(16, Bsize=4)/forward/GPU/CUDA
16696
ns16312
ns1.02
batchedmm(16, Bsize=4)/forward/GPU/AMDGPU
95970
ns87751
ns1.09
batchedmm(16, Bsize=4)/zygote/CPU/2 thread(s)
4375
ns6625
ns0.66
batchedmm(16, Bsize=4)/zygote/CPU/4 thread(s)
4895.5
ns4645.5
ns1.05
batchedmm(16, Bsize=4)/zygote/CPU/8 thread(s)
6000
ns4917
ns1.22
batchedmm(16, Bsize=4)/zygote/CPU/1 thread(s)
4834
ns4792
ns1.01
batchedmm(16, Bsize=4)/zygote/GPU/CUDA
210645.5
ns207882.5
ns1.01
batchedmm(16, Bsize=4)/zygote/GPU/AMDGPU
420858
ns402074
ns1.05
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s)
306167
ns305938
ns1.00
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s)
305959
ns305917
ns1.00
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s)
307583
ns307521
ns1.00
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s)
305542
ns305375
ns1.00
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/CUDA
233739
ns230214
ns1.02
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/oneAPI
7904754
ns7500239
ns1.05
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/Metal
570771
ns643000
ns0.89
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/AMDGPU
284229
ns280903
ns1.01
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s)
581667
ns538541
ns1.08
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s)
535791
ns549750
ns0.97
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s)
534333
ns542666
ns0.98
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s)
534959
ns529708
ns1.01
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/CUDA
1101751
ns1085631
ns1.01
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/oneAPI
46425480.5
ns44253871
ns1.05
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/Metal
6055125
ns6154687.5
ns0.98
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/AMDGPU
916717
ns872599
ns1.05
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s)
19167
ns19021
ns1.01
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s)
20396
ns19833.5
ns1.03
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s)
22375
ns22542
ns0.99
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s)
20708
ns21917
ns0.94
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/GPU/CUDA
116012
ns114174
ns1.02
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/GPU/oneAPI
3553813
ns3531348.5
ns1.01
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/GPU/Metal
1402542
ns1449271
ns0.97
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/GPU/AMDGPU
86330
ns81471
ns1.06
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s)
217375
ns218834
ns0.99
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s)
219625
ns227542
ns0.97
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s)
214250
ns219708
ns0.98
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s)
212500
ns212708
ns1.00
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/CUDA
762109
ns761865.5
ns1.00
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/oneAPI
24400401
ns24050167
ns1.01
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/Metal
7404417
ns7412916.5
ns1.00
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/AMDGPU
578169
ns543136
ns1.06
layernorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/2 thread(s)
7500
ns7125.5
ns1.05
layernorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/4 thread(s)
7041.5
ns6479
ns1.09
layernorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/8 thread(s)
7833
ns8458
ns0.93
layernorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/1 thread(s)
7250
ns6084
ns1.19
layernorm(2, act=identity, affine=false)(32 x 32)/forward/GPU/CUDA
143913.5
ns141785
ns1.02
layernorm(2, act=identity, affine=false)(32 x 32)/forward/GPU/oneAPI
5579609.5
ns5370056
ns1.04
layernorm(2, act=identity, affine=false)(32 x 32)/forward/GPU/Metal
787333.5
ns777458
ns1.01
layernorm(2, act=identity, affine=false)(32 x 32)/forward/GPU/AMDGPU
71680
ns69581
ns1.03
layernorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/2 thread(s)
10541
ns12958
ns0.81
layernorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/4 thread(s)
10645.5
ns9583.5
ns1.11
layernorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/8 thread(s)
10458
ns10687.5
ns0.98
layernorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/1 thread(s)
10167
ns9625
ns1.06
layernorm(2, act=identity, affine=false)(32 x 32)/zygote/GPU/CUDA
845090
ns832452.5
ns1.02
layernorm(2, act=identity, affine=false)(32 x 32)/zygote/GPU/oneAPI
38996778
ns38810557
ns1.00
layernorm(2, act=identity, affine=false)(32 x 32)/zygote/GPU/Metal
5177833
ns5231375
ns0.99
layernorm(2, act=identity, affine=false)(32 x 32)/zygote/GPU/AMDGPU
431434.5
ns395184
ns1.09
layernorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/2 thread(s)
5375
ns5145.5
ns1.04
layernorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/4 thread(s)
5604.5
ns4812.5
ns1.16
layernorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/8 thread(s)
6667
ns6958
ns0.96
layernorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/1 thread(s)
6250
ns6833
ns0.91
layernorm(2, act=relu, affine=false)(4 x 32)/forward/GPU/CUDA
147643
ns144967.5
ns1.02
layernorm(2, act=relu, affine=false)(4 x 32)/forward/GPU/oneAPI
5420881
ns5514807.5
ns0.98
layernorm(2, act=relu, affine=false)(4 x 32)/forward/GPU/Metal
770708
ns829125
ns0.93
layernorm(2, act=relu, affine=false)(4 x 32)/forward/GPU/AMDGPU
64090
ns70250
ns0.91
layernorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/2 thread(s)
7667
ns7770.5
ns0.99
layernorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/4 thread(s)
7834
ns7333
ns1.07
layernorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/8 thread(s)
8000
ns7667
ns1.04
layernorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/1 thread(s)
7625
ns7208
ns1.06
layernorm(2, act=relu, affine=false)(4 x 32)/zygote/GPU/CUDA
801461
ns790491
ns1.01
layernorm(2, act=relu, affine=false)(4 x 32)/zygote/GPU/oneAPI
38866348
ns37869840
ns1.03
layernorm(2, act=relu, affine=false)(4 x 32)/zygote/GPU/Metal
5549500
ns5670687
ns0.98
layernorm(2, act=relu, affine=false)(4 x 32)/zygote/GPU/AMDGPU
434220
ns398424.5
ns1.09
batchedmm(128, Bsize=512)/forward/CPU/2 thread(s)
14438042
ns14518959
ns0.99
batchedmm(128, Bsize=512)/forward/CPU/4 thread(s)
10127917
ns10120000
ns1.00
batchedmm(128, Bsize=512)/forward/CPU/8 thread(s)
10159833
ns7708791.5
ns1.32
batchedmm(128, Bsize=512)/forward/CPU/1 thread(s)
27954459
ns27832250
ns1.00
batchedmm(128, Bsize=512)/forward/GPU/CUDA
533907.5
ns532409
ns1.00
batchedmm(128, Bsize=512)/forward/GPU/AMDGPU
478160.5
ns399949.5
ns1.20
batchedmm(128, Bsize=512)/zygote/CPU/2 thread(s)
46256583.5
ns46375083.5
ns1.00
batchedmm(128, Bsize=512)/zygote/CPU/4 thread(s)
33464500
ns33404583.5
ns1.00
batchedmm(128, Bsize=512)/zygote/CPU/8 thread(s)
33570375
ns26627416.5
ns1.26
batchedmm(128, Bsize=512)/zygote/CPU/1 thread(s)
85876709
ns85835750
ns1.00
batchedmm(128, Bsize=512)/zygote/GPU/CUDA
2658893
ns2644453
ns1.01
batchedmm(128, Bsize=512)/zygote/GPU/AMDGPU
3344856
ns3278895
ns1.02
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s)
66792
ns66042
ns1.01
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s)
65979.5
ns66125
ns1.00
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s)
68375
ns70520.5
ns0.97
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s)
66750
ns67875
ns0.98
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/CUDA
123457.5
ns119873.5
ns1.03
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/oneAPI
3450048
ns3330724
ns1.04
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/Metal
1415271
ns1410021
ns1.00
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/AMDGPU
238451
ns229907.5
ns1.04
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s)
439917
ns453292
ns0.97
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s)
473125
ns441208
ns1.07
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s)
442479
ns450208
ns0.98
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s)
441875
ns445541
ns0.99
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/CUDA
741057
ns732886.5
ns1.01
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/oneAPI
27244548.5
ns26274297
ns1.04
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/Metal
7814416.5
ns7781500
ns1.00
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/AMDGPU
827592
ns794638
ns1.04
batchnorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/2 thread(s)
542
ns667
ns0.81
batchnorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/4 thread(s)
584
ns625
ns0.93
batchnorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/8 thread(s)
625
ns542
ns1.15
batchnorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/1 thread(s)
542
ns500
ns1.08
batchnorm(2, act=relu, affine=false)(32 x 32)/forward/GPU/CUDA
33032
ns32132
ns1.03
batchnorm(2, act=relu, affine=false)(32 x 32)/forward/GPU/oneAPI
1140283.5
ns1164338
ns0.98
batchnorm(2, act=relu, affine=false)(32 x 32)/forward/GPU/Metal
439500
ns431645.5
ns1.02
batchnorm(2, act=relu, affine=false)(32 x 32)/forward/GPU/AMDGPU
51400
ns49160
ns1.05
batchnorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/2 thread(s)
9125
ns8292
ns1.10
batchnorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/4 thread(s)
9167
ns8708
ns1.05
batchnorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/8 thread(s)
9667
ns9250
ns1.05
batchnorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/1 thread(s)
9167
ns8959
ns1.02
batchnorm(2, act=relu, affine=false)(32 x 32)/zygote/GPU/CUDA
290866
ns286401.5
ns1.02
batchnorm(2, act=relu, affine=false)(32 x 32)/zygote/GPU/oneAPI
29330832.5
ns21940598
ns1.34
batchnorm(2, act=relu, affine=false)(32 x 32)/zygote/GPU/Metal
5094625
ns5096125
ns1.00
batchnorm(2, act=relu, affine=false)(32 x 32)/zygote/GPU/AMDGPU
423181
ns388934
ns1.09
dense(32, bias=false, act=gelu)(32 x 128)/forward/CPU/2 thread(s)
9833
ns9792
ns1.00
dense(32, bias=false, act=gelu)(32 x 128)/forward/CPU/4 thread(s)
9833
ns9875
ns1.00
dense(32, bias=false, act=gelu)(32 x 128)/forward/CPU/8 thread(s)
9833
ns9833
ns1
dense(32, bias=false, act=gelu)(32 x 128)/forward/CPU/1 thread(s)
9792
ns9875
ns0.99
dense(32, bias=false, act=gelu)(32 x 128)/forward/GPU/CUDA
23856
ns23178
ns1.03
dense(32, bias=false, act=gelu)(32 x 128)/forward/GPU/oneAPI
2133194
ns1908743.5
ns1.12
dense(32, bias=false, act=gelu)(32 x 128)/forward/GPU/Metal
221187.5
ns222541
ns0.99
dense(32, bias=false, act=gelu)(32 x 128)/forward/GPU/AMDGPU
221570.5
ns217383
ns1.02
dense(32, bias=false, act=gelu)(32 x 128)/zygote/CPU/2 thread(s)
46084
ns45875
ns1.00
dense(32, bias=false, act=gelu)(32 x 128)/zygote/CPU/4 thread(s)
46209
ns45917
ns1.01
dense(32, bias=false, act=gelu)(32 x 128)/zygote/CPU/8 thread(s)
46708
ns46167
ns1.01
dense(32, bias=false, act=gelu)(32 x 128)/zygote/CPU/1 thread(s)
45833
ns45875
ns1.00
dense(32, bias=false, act=gelu)(32 x 128)/zygote/GPU/CUDA
295006
ns293089
ns1.01
dense(32, bias=false, act=gelu)(32 x 128)/zygote/GPU/oneAPI
12015326
ns10988297.5
ns1.09
dense(32, bias=false, act=gelu)(32 x 128)/zygote/GPU/Metal
1410479
ns982875
ns1.44
dense(32, bias=false, act=gelu)(32 x 128)/zygote/GPU/AMDGPU
632758
ns621107
ns1.02
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s)
56208
ns56250
ns1.00
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s)
57167
ns57125
ns1.00
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s)
57208
ns56334
ns1.02
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s)
57916
ns57792
ns1.00
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/CUDA
29619
ns28527
ns1.04
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/oneAPI
1174842
ns1186883
ns0.99
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/Metal
579563
ns578645.5
ns1.00
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/AMDGPU
208291
ns204943
ns1.02
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s)
448604.5
ns448333
ns1.00
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s)
467334
ns494125
ns0.95
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s)
465499.5
ns507583
ns0.92
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s)
442312
ns439437
ns1.01
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/CUDA
251481
ns247232
ns1.02
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/oneAPI
32120348
ns33216066
ns0.97
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/Metal
9690584
ns9499166
ns1.02
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/AMDGPU
916830
ns891519.5
ns1.03
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s)
644958
ns652937.5
ns0.99
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s)
645875
ns647333
ns1.00
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s)
633021
ns662854
ns0.95
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s)
658291.5
ns668500
ns0.98
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/GPU/CUDA
213385
ns207996
ns1.03
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/GPU/oneAPI
8467842
ns8125052.5
ns1.04
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/GPU/Metal
1345291
ns1384354.5
ns0.97
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/GPU/AMDGPU
276851.5
ns233282
ns1.19
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s)
2217750
ns2235042
ns0.99
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s)
2230604
ns2238979
ns1.00
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s)
2220792
ns2248959
ns0.99
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s)
2276833.5
ns2260792
ns1.01
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/CUDA
999797
ns984096
ns1.02
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/oneAPI
48712510
ns45382984
ns1.07
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/Metal
9179667
ns8132833.5
ns1.13
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/AMDGPU
1474035
ns1370494
ns1.08
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s)
19709
ns20958
ns0.94
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s)
19750
ns20000
ns0.99
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s)
22833.5
ns22667
ns1.01
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s)
21833
ns22083
ns0.99
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/CUDA
114451.5
ns113160
ns1.01
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/oneAPI
3653619
ns3278898
ns1.11
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/Metal
1435020.5
ns1472792
ns0.97
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/AMDGPU
86210.5
ns81561
ns1.06
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s)
220750
ns222313
ns0.99
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s)
256917
ns257542
ns1.00
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s)
220958
ns232250
ns0.95
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s)
220125
ns228000.5
ns0.97
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/CUDA
740583
ns734156.5
ns1.01
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/oneAPI
24806004
ns27357269
ns0.91
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/Metal
7609833
ns7692750
ns0.99
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/AMDGPU
592134
ns559476
ns1.06
batchnorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/2 thread(s)
542
ns542
ns1
batchnorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/4 thread(s)
625
ns583
ns1.07
batchnorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/8 thread(s)
625
ns541
ns1.16
batchnorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/1 thread(s)
500
ns500
ns1
batchnorm(2, act=relu, affine=true)(32 x 32)/forward/GPU/CUDA
23545
ns23248
ns1.01
batchnorm(2, act=relu, affine=true)(32 x 32)/forward/GPU/oneAPI
1179342
ns1222462
ns0.96
batchnorm(2, act=relu, affine=true)(32 x 32)/forward/GPU/Metal
451125
ns466625
ns0.97
batchnorm(2, act=relu, affine=true)(32 x 32)/forward/GPU/AMDGPU
52320
ns51870
ns1.01
batchnorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/2 thread(s)
9708
ns9167
ns1.06
batchnorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/4 thread(s)
9542
ns9208
ns1.04
batchnorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/8 thread(s)
10271
ns9292
ns1.11
batchnorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/1 thread(s)
10042
ns9312.5
ns1.08
batchnorm(2, act=relu, affine=true)(32 x 32)/zygote/GPU/CUDA
271980
ns268568
ns1.01
batchnorm(2, act=relu, affine=true)(32 x 32)/zygote/GPU/oneAPI
23890621
ns24289416
ns0.98
batchnorm(2, act=relu, affine=true)(32 x 32)/zygote/GPU/Metal
5928209
ns6049709
ns0.98
batchnorm(2, act=relu, affine=true)(32 x 32)/zygote/GPU/AMDGPU
462544
ns410500
ns1.13
groupnorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/2 thread(s)
8458
ns10333
ns0.82
groupnorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/4 thread(s)
9333
ns8458
ns1.10
groupnorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/8 thread(s)
11333
ns10354
ns1.09
groupnorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/1 thread(s)
10625
ns8333
ns1.28
groupnorm(2, act=identity, affine=false)(4 x 32)/forward/GPU/CUDA
122249.5
ns120393.5
ns1.02
groupnorm(2, act=identity, affine=false)(4 x 32)/forward/GPU/oneAPI
3377288.5
ns3445203
ns0.98
groupnorm(2, act=identity, affine=false)(4 x 32)/forward/GPU/Metal
857708
ns832874.5
ns1.03
groupnorm(2, act=identity, affine=false)(4 x 32)/forward/GPU/AMDGPU
78766
ns72921
ns1.08
groupnorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/2 thread(s)
7479
ns7583
ns0.99
groupnorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/4 thread(s)
8208
ns8208
ns1
groupnorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/8 thread(s)
7875
ns7417
ns1.06
groupnorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/1 thread(s)
7458.5
ns7770.5
ns0.96
groupnorm(2, act=identity, affine=false)(4 x 32)/zygote/GPU/CUDA
519030.5
ns511772
ns1.01
groupnorm(2, act=identity, affine=false)(4 x 32)/zygote/GPU/oneAPI
17279332
ns16339001
ns1.06
groupnorm(2, act=identity, affine=false)(4 x 32)/zygote/GPU/Metal
4021166
ns3959271
ns1.02
groupnorm(2, act=identity, affine=false)(4 x 32)/zygote/GPU/AMDGPU
354777.5
ns328364
ns1.08
bias_activation(2, act=gelu)(2 x 128)/forward/CPU/2 thread(s)
1709
ns1458
ns1.17
bias_activation(2, act=gelu)(2 x 128)/forward/CPU/4 thread(s)
1834
ns1542
ns1.19
bias_activation(2, act=gelu)(2 x 128)/forward/CPU/8 thread(s)
1958
ns1833
ns1.07
bias_activation(2, act=gelu)(2 x 128)/forward/CPU/1 thread(s)
1459
ns1541
ns0.95
bias_activation(2, act=gelu)(2 x 128)/forward/GPU/CUDA
22193
ns21725
ns1.02
bias_activation(2, act=gelu)(2 x 128)/forward/GPU/oneAPI
1121843
ns1136020
ns0.99
bias_activation(2, act=gelu)(2 x 128)/forward/GPU/Metal
293083
ns296000
ns0.99
bias_activation(2, act=gelu)(2 x 128)/forward/GPU/AMDGPU
196742
ns194712
ns1.01
bias_activation(2, act=gelu)(2 x 128)/zygote/CPU/2 thread(s)
3292
ns3250
ns1.01
bias_activation(2, act=gelu)(2 x 128)/zygote/CPU/4 thread(s)
3458
ns3250
ns1.06
bias_activation(2, act=gelu)(2 x 128)/zygote/CPU/8 thread(s)
3583
ns3500
ns1.02
bias_activation(2, act=gelu)(2 x 128)/zygote/CPU/1 thread(s)
3250
ns3209
ns1.01
bias_activation(2, act=gelu)(2 x 128)/zygote/GPU/CUDA
222520.5
ns220221.5
ns1.01
bias_activation(2, act=gelu)(2 x 128)/zygote/GPU/oneAPI
9986789
ns9698879
ns1.03
bias_activation(2, act=gelu)(2 x 128)/zygote/GPU/Metal
1730167
ns1612667
ns1.07
bias_activation(2, act=gelu)(2 x 128)/zygote/GPU/AMDGPU
596665
ns596166
ns1.00
bias_activation(512, act=tanh)(512 x 128)/forward/CPU/2 thread(s)
147979
ns148167
ns1.00
bias_activation(512, act=tanh)(512 x 128)/forward/CPU/4 thread(s)
128500
ns127709
ns1.01
bias_activation(512, act=tanh)(512 x 128)/forward/CPU/8 thread(s)
129874.5
ns107958.5
ns1.20
bias_activation(512, act=tanh)(512 x 128)/forward/CPU/1 thread(s)
226000
ns225958
ns1.00
bias_activation(512, act=tanh)(512 x 128)/forward/GPU/CUDA
24573
ns24338
ns1.01
bias_activation(512, act=tanh)(512 x 128)/forward/GPU/oneAPI
1208173
ns1138772
ns1.06
bias_activation(512, act=tanh)(512 x 128)/forward/GPU/Metal
293667
ns270854.5
ns1.08
bias_activation(512, act=tanh)(512 x 128)/forward/GPU/AMDGPU
40630
ns40151
ns1.01
bias_activation(512, act=tanh)(512 x 128)/zygote/CPU/2 thread(s)
142979.5
ns156125
ns0.92
bias_activation(512, act=tanh)(512 x 128)/zygote/CPU/4 thread(s)
110791
ns127209
ns0.87
bias_activation(512, act=tanh)(512 x 128)/zygote/CPU/8 thread(s)
110604.5
ns100750
ns1.10
bias_activation(512, act=tanh)(512 x 128)/zygote/CPU/1 thread(s)
252042
ns256666.5
ns0.98
bias_activation(512, act=tanh)(512 x 128)/zygote/GPU/CUDA
220880.5
ns218905
ns1.01
bias_activation(512, act=tanh)(512 x 128)/zygote/GPU/oneAPI
10430562
ns10030041
ns1.04
bias_activation(512, act=tanh)(512 x 128)/zygote/GPU/Metal
1988667
ns2003417
ns0.99
bias_activation(512, act=tanh)(512 x 128)/zygote/GPU/AMDGPU
256232
ns240417.5
ns1.07
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s)
7292
ns7292
ns1
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s)
5958
ns6083
ns0.98
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s)
6041
ns5375
ns1.12
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s)
10125
ns10375
ns0.98
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/CUDA
33867
ns32865
ns1.03
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/oneAPI
1251518.5
ns1134920.5
ns1.10
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/Metal
348625
ns562875
ns0.62
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/AMDGPU
54871
ns52191
ns1.05
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s)
246687.5
ns230854.5
ns1.07
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s)
263750
ns270500
ns0.98
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s)
228041
ns264875
ns0.86
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s)
214750.5
ns213771
ns1.00
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/CUDA
267151.5
ns263381.5
ns1.01
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/oneAPI
27588902
ns28212764
ns0.98
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/Metal
8321542
ns8517000
ns0.98
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/AMDGPU
575196
ns607266
ns0.95
layernorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/2 thread(s)
15875
ns14958
ns1.06
layernorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/4 thread(s)
15208
ns15500
ns0.98
layernorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/8 thread(s)
16937.5
ns16500
ns1.03
layernorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/1 thread(s)
14959
ns15625
ns0.96
layernorm(2, act=gelu, affine=false)(32 x 32)/forward/GPU/CUDA
143325
ns140749.5
ns1.02
layernorm(2, act=gelu, affine=false)(32 x 32)/forward/GPU/oneAPI
5498476
ns5465169
ns1.01
layernorm(2, act=gelu, affine=false)(32 x 32)/forward/GPU/Metal
761333
ns787125
ns0.97
layernorm(2, act=gelu, affine=false)(32 x 32)/forward/GPU/AMDGPU
239812
ns238512
ns1.01
layernorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/2 thread(s)
24000
ns22583
ns1.06
layernorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/4 thread(s)
23750
ns23500
ns1.01
layernorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/8 thread(s)
24437.5
ns24084
ns1.01
layernorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/1 thread(s)
23666
ns23167
ns1.02
layernorm(2, act=gelu, affine=false)(32 x 32)/zygote/GPU/CUDA
885563.5
ns875101
ns1.01
layernorm(2, act=gelu, affine=false)(32 x 32)/zygote/GPU/oneAPI
39845026.5
ns37582744
ns1.06
layernorm(2, act=gelu, affine=false)(32 x 32)/zygote/GPU/Metal
5521500
ns5600270.5
ns0.99
layernorm(2, act=gelu, affine=false)(32 x 32)/zygote/GPU/AMDGPU
720238
ns692048
ns1.04
groupnorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/2 thread(s)
9312.5
ns9125
ns1.02
groupnorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/4 thread(s)
10125
ns9250.5
ns1.09
groupnorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/8 thread(s)
11229
ns10521
ns1.07
groupnorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/1 thread(s)
9833
ns9209
ns1.07
groupnorm(2, act=identity, affine=true)(32 x 32)/forward/GPU/CUDA
126504.5
ns124561
ns1.02
groupnorm(2, act=identity, affine=true)(32 x 32)/forward/GPU/oneAPI
3392484.5
ns3393331
ns1.00
groupnorm(2, act=identity, affine=true)(32 x 32)/forward/GPU/Metal
805521
ns802083
ns1.00
groupnorm(2, act=identity, affine=true)(32 x 32)/forward/GPU/AMDGPU
85091
ns79030
ns1.08
groupnorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/2 thread(s)
13375
ns13750
ns0.97
groupnorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/4 thread(s)
14125
ns14125
ns1
groupnorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/8 thread(s)
14167
ns14125
ns1.00
groupnorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/1 thread(s)
14042
ns13917
ns1.01
groupnorm(2, act=identity, affine=true)(32 x 32)/zygote/GPU/CUDA
680177
ns670894
ns1.01
groupnorm(2, act=identity, affine=true)(32 x 32)/zygote/GPU/oneAPI
21529605
ns20295661
ns1.06
groupnorm(2, act=identity, affine=true)(32 x 32)/zygote/GPU/Metal
5139312.5
ns5274042
ns0.97
groupnorm(2, act=identity, affine=true)(32 x 32)/zygote/GPU/AMDGPU
404764
ns375405
ns1.08
groupnorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/2 thread(s)
9875
ns9208.5
ns1.07
groupnorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/4 thread(s)
9625
ns9167
ns1.05
groupnorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/8 thread(s)
10770.5
ns10438
ns1.03
groupnorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/1 thread(s)
10291
ns9584
ns1.07
groupnorm(2, act=identity, affine=false)(32 x 32)/forward/GPU/CUDA
125014
ns122339.5
ns1.02
groupnorm(2, act=identity, affine=false)(32 x 32)/forward/GPU/oneAPI
3361307
ns3319433.5
ns1.01
groupnorm(2, act=identity, affine=false)(32 x 32)/forward/GPU/Metal
886542
ns882875
ns1.00
groupnorm(2, act=identity, affine=false)(32 x 32)/forward/GPU/AMDGPU
80481
ns75581
ns1.06
groupnorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/2 thread(s)
12417
ns12333.5
ns1.01
groupnorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/4 thread(s)
13083.5
ns12645.5
ns1.03
groupnorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/8 thread(s)
13041.5
ns12708
ns1.03
groupnorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/1 thread(s)
12333
ns12708
ns0.97
groupnorm(2, act=identity, affine=false)(32 x 32)/zygote/GPU/CUDA
563574.5
ns557225
ns1.01
groupnorm(2, act=identity, affine=false)(32 x 32)/zygote/GPU/oneAPI
18790951
ns18661226
ns1.01
groupnorm(2, act=identity, affine=false)(32 x 32)/zygote/GPU/Metal
4467334
ns4435167
ns1.01
groupnorm(2, act=identity, affine=false)(32 x 32)/zygote/GPU/AMDGPU
372394
ns345844
ns1.08
batchedmm(2, Bsize=128)/forward/CPU/2 thread(s)
30292
ns30292
ns1
batchedmm(2, Bsize=128)/forward/CPU/4 thread(s)
34479.5
ns34021.5
ns1.01
batchedmm(2, Bsize=128)/forward/CPU/8 thread(s)
31625
ns30854.5
ns1.02
batchedmm(2, Bsize=128)/forward/CPU/1 thread(s)
1813
ns1791
ns1.01
batchedmm(2, Bsize=128)/forward/GPU/CUDA
16646
ns16303
ns1.02
batchedmm(2, Bsize=128)/forward/GPU/AMDGPU
89151
ns82211
ns1.08
batchedmm(2, Bsize=128)/zygote/CPU/2 thread(s)
5396
ns5270.5
ns1.02
batchedmm(2, Bsize=128)/zygote/CPU/4 thread(s)
5229
ns5354
ns0.98
batchedmm(2, Bsize=128)/zygote/CPU/8 thread(s)
5500
ns5375
ns1.02
batchedmm(2, Bsize=128)/zygote/CPU/1 thread(s)
6583
ns6625
ns0.99
batchedmm(2, Bsize=128)/zygote/GPU/CUDA
143042
ns140733
ns1.02
batchedmm(2, Bsize=128)/zygote/GPU/AMDGPU
400755
ns394064.5
ns1.02
batchnorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/2 thread(s)
375
ns250
ns1.50
batchnorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/4 thread(s)
375
ns292
ns1.28
batchnorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/8 thread(s)
375
ns250
ns1.50
batchnorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/1 thread(s)
292
ns292
ns1
batchnorm(2, act=identity, affine=true)(4 x 32)/forward/GPU/CUDA
26809
ns26135
ns1.03
batchnorm(2, act=identity, affine=true)(4 x 32)/forward/GPU/oneAPI
1181881
ns1123770.5
ns1.05
batchnorm(2, act=identity, affine=true)(4 x 32)/forward/GPU/Metal
443542
ns474625
ns0.93
batchnorm(2, act=identity, affine=true)(4 x 32)/forward/GPU/AMDGPU
50281
ns50311
ns1.00
batchnorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/2 thread(s)
6708
ns6375
ns1.05
batchnorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/4 thread(s)
6667
ns6145.5
ns1.08
batchnorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/8 thread(s)
7000
ns6416
ns1.09
batchnorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/1 thread(s)
6500
ns6416
ns1.01
batchnorm(2, act=identity, affine=true)(4 x 32)/zygote/GPU/CUDA
190632
ns187828
ns1.01
batchnorm(2, act=identity, affine=true)(4 x 32)/zygote/GPU/oneAPI
23906222
ns23626156
ns1.01
batchnorm(2, act=identity, affine=true)(4 x 32)/zygote/GPU/Metal
5728875
ns5544437.5
ns1.03
batchnorm(2, act=identity, affine=true)(4 x 32)/zygote/GPU/AMDGPU
430065
ns395104
ns1.09
batchnorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/2 thread(s)
1959
ns2042
ns0.96
batchnorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/4 thread(s)
2083
ns2000
ns1.04
batchnorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/8 thread(s)
2083
ns1959
ns1.06
batchnorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/1 thread(s)
1958
ns2000
ns0.98
batchnorm(2, act=gelu, affine=true)(32 x 32)/forward/GPU/CUDA
27026
ns26544
ns1.02
batchnorm(2, act=gelu, affine=true)(32 x 32)/forward/GPU/oneAPI
1206962
ns1165809
ns1.04
batchnorm(2, act=gelu, affine=true)(32 x 32)/forward/GPU/Metal
452063
ns461708.5
ns0.98
batchnorm(2, act=gelu, affine=true)(32 x 32)/forward/GPU/AMDGPU
211793
ns209972
ns1.01
batchnorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/2 thread(s)
16125
ns15792
ns1.02
batchnorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/4 thread(s)
17333
ns16375
ns1.06
batchnorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/8 thread(s)
17125
ns17000
ns1.01
batchnorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/1 thread(s)
16791.5
ns16084
ns1.04
batchnorm(2, act=gelu, affine=true)(32 x 32)/zygote/GPU/CUDA
279731.5
ns275962
ns1.01
batchnorm(2, act=gelu, affine=true)(32 x 32)/zygote/GPU/oneAPI
24291591
ns24890960.5
ns0.98
batchnorm(2, act=gelu, affine=true)(32 x 32)/zygote/GPU/Metal
5504875.5
ns5972833
ns0.92
batchnorm(2, act=gelu, affine=true)(32 x 32)/zygote/GPU/AMDGPU
753090
ns713667.5
ns1.06
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s)
151625
ns178250
ns0.85
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s)
151333.5
ns184187.5
ns0.82
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s)
151250
ns153417
ns0.99
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s)
153000
ns147459
ns1.04
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/CUDA
210001.5
ns204372
ns1.03
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/oneAPI
7845014
ns7857309.5
ns1.00
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/Metal
1393000
ns1392667
ns1.00
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/AMDGPU
239913
ns196752
ns1.22
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s)
1290166
ns1326895.5
ns0.97
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s)
1317125
ns1320625
ns1.00
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s)
1254583
ns1330833
ns0.94
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s)
1341125
ns1334750
ns1.00
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/CUDA
935505
ns917280
ns1.02
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/oneAPI
46976250.5
ns46023181
ns1.02
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/Metal
6512417
ns6714958.5
ns0.97
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/AMDGPU
1051794
ns1108992
ns0.95
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s)
25188
ns25229.5
ns1.00
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s)
25833
ns26583
ns0.97
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s)
27084
ns26833
ns1.01
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s)
24375
ns25917
ns0.94
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/CUDA
241581
ns239791.5
ns1.01
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/oneAPI
7794179
ns7972748
ns0.98
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/Metal
632583.5
ns980542
ns0.65
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/AMDGPU
120932
ns116941
ns1.03
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s)
118854.5
ns179917
ns0.66
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s)
117645.5
ns141604.5
ns0.83
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s)
119250
ns127354.5
ns0.94
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s)
118000
ns118604
ns0.99
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/CUDA
1108020
ns1092585
ns1.01
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/oneAPI
48152430
ns43816902.5
ns1.10
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/Metal
6196979.5
ns6033333
ns1.03
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/AMDGPU
640758
ns606086
ns1.06
batchnorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/2 thread(s)
333
ns291
ns1.14
batchnorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/4 thread(s)
334
ns334
ns1
batchnorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/8 thread(s)
375
ns292
ns1.28
batchnorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/1 thread(s)
250
ns292
ns0.86
batchnorm(2, act=relu, affine=true)(4 x 32)/forward/GPU/CUDA
23090
ns22970
ns1.01
batchnorm(2, act=relu, affine=true)(4 x 32)/forward/GPU/oneAPI
1208429
ns1175116
ns1.03
batchnorm(2, act=relu, affine=true)(4 x 32)/forward/GPU/Metal
442500
ns456125
ns0.97
batchnorm(2, act=relu, affine=true)(4 x 32)/forward/GPU/AMDGPU
51371
ns48591
ns1.06
batchnorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/2 thread(s)
6437.5
ns6625
ns0.97
batchnorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/4 thread(s)
6667
ns6750
ns0.99
batchnorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/8 thread(s)
7062.5
ns6542
ns1.08
batchnorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/1 thread(s)
6417
ns6459
ns0.99
batchnorm(2, act=relu, affine=true)(4 x 32)/zygote/GPU/CUDA
208209
ns204628
ns1.02
batchnorm(2, act=relu, affine=true)(4 x 32)/zygote/GPU/oneAPI
23792216
ns23603781
ns1.01
batchnorm(2, act=relu, affine=true)(4 x 32)/zygote/GPU/Metal
5691062.5
ns6092458
ns0.93
batchnorm(2, act=relu, affine=true)(4 x 32)/zygote/GPU/AMDGPU
437776
ns397554
ns1.10
layernorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/2 thread(s)
6791
ns6125
ns1.11
layernorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/4 thread(s)
6500
ns6334
ns1.03
layernorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/8 thread(s)
7979
ns6709
ns1.19
layernorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/1 thread(s)
6000
ns5937.5
ns1.01
layernorm(2, act=gelu, affine=true)(4 x 32)/forward/GPU/CUDA
149235
ns147027
ns1.02
layernorm(2, act=gelu, affine=true)(4 x 32)/forward/GPU/oneAPI
5562220
ns5559804
ns1.00
layernorm(2, act=gelu, affine=true)(4 x 32)/forward/GPU/Metal
701208.5
ns583167
ns1.20
layernorm(2, act=gelu, affine=true)(4 x 32)/forward/GPU/AMDGPU
243743.5
ns237472
ns1.03
layernorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/2 thread(s)
10208.5
ns9666.5
ns1.06
layernorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/4 thread(s)
10542
ns10041
ns1.05
layernorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/8 thread(s)
10375
ns10041
ns1.03
layernorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/1 thread(s)
9750
ns9854
ns0.99
layernorm(2, act=gelu, affine=true)(4 x 32)/zygote/GPU/CUDA
922802
ns910526.5
ns1.01
layernorm(2, act=gelu, affine=true)(4 x 32)/zygote/GPU/oneAPI
40299139
ns39406121
ns1.02
layernorm(2, act=gelu, affine=true)(4 x 32)/zygote/GPU/Metal
5880937.5
ns5909375
ns1.00
layernorm(2, act=gelu, affine=true)(4 x 32)/zygote/GPU/AMDGPU
710000
ns686288
ns1.03
dense(2, bias=false, act=gelu)(2 x 128)/forward/CPU/2 thread(s)
708
ns666
ns1.06
dense(2, bias=false, act=gelu)(2 x 128)/forward/CPU/4 thread(s)
667
ns667
ns1
dense(2, bias=false, act=gelu)(2 x 128)/forward/CPU/8 thread(s)
667
ns667
ns1
dense(2, bias=false, act=gelu)(2 x 128)/forward/CPU/1 thread(s)
667
ns667
ns1
dense(2, bias=false, act=gelu)(2 x 128)/forward/GPU/CUDA
23085
ns22655
ns1.02
dense(2, bias=false, act=gelu)(2 x 128)/forward/GPU/oneAPI
2118566
ns2037996
ns1.04
dense(2, bias=false, act=gelu)(2 x 128)/forward/GPU/Metal
222959
ns222583
ns1.00
dense(2, bias=false, act=gelu)(2 x 128)/forward/GPU/AMDGPU
219753
ns215862
ns1.02
dense(2, bias=false, act=gelu)(2 x 128)/zygote/CPU/2 thread(s)
4584
ns4584
ns1
dense(2, bias=false, act=gelu)(2 x 128)/zygote/CPU/4 thread(s)
4834
ns4584
ns1.05
dense(2, bias=false, act=gelu)(2 x 128)/zygote/CPU/8 thread(s)
4875
ns4625
ns1.05
dense(2, bias=false, act=gelu)(2 x 128)/zygote/CPU/1 thread(s)
4625
ns4625
ns1
dense(2, bias=false, act=gelu)(2 x 128)/zygote/GPU/CUDA
233956.5
ns232442.5
ns1.01
dense(2, bias=false, act=gelu)(2 x 128)/zygote/GPU/oneAPI
10123278
ns9881227
ns1.02
dense(2, bias=false, act=gelu)(2 x 128)/zygote/GPU/Metal
1643667
ns1690521
ns0.97
dense(2, bias=false, act=gelu)(2 x 128)/zygote/GPU/AMDGPU
602789
ns600181
ns1.00
groupnorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/2 thread(s)
8083.5
ns8562.5
ns0.94
groupnorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/4 thread(s)
8791
ns7937.5
ns1.11
groupnorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/8 thread(s)
9937.5
ns9771
ns1.02
groupnorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/1 thread(s)
8000
ns8520.5
ns0.94
groupnorm(2, act=identity, affine=true)(4 x 32)/forward/GPU/CUDA
124933.5
ns122197
ns1.02
groupnorm(2, act=identity, affine=true)(4 x 32)/forward/GPU/oneAPI
4600589
ns3361719
ns1.37
groupnorm(2, act=identity, affine=true)(4 x 32)/forward/GPU/Metal
757833.5
ns761542
ns1.00
groupnorm(2, act=identity, affine=true)(4 x 32)/forward/GPU/AMDGPU
86781
ns76241
ns1.14
groupnorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/2 thread(s)
8646
ns8792
ns0.98
groupnorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/4 thread(s)
8958
ns8459
ns1.06
groupnorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/8 thread(s)
8604.5
ns8875
ns0.97
groupnorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/1 thread(s)
8812.5
ns8750
ns1.01
groupnorm(2, act=identity, affine=true)(4 x 32)/zygote/GPU/CUDA
604520
ns595652
ns1.01
groupnorm(2, act=identity, affine=true)(4 x 32)/zygote/GPU/oneAPI
20356936
ns20278296
ns1.00
groupnorm(2, act=identity, affine=true)(4 x 32)/zygote/GPU/Metal
4765146
ns4718125
ns1.01
groupnorm(2, act=identity, affine=true)(4 x 32)/zygote/GPU/AMDGPU
383405
ns354274
ns1.08
batchedmm(128, Bsize=4)/forward/CPU/2 thread(s)
128437.5
ns125917
ns1.02
batchedmm(128, Bsize=4)/forward/CPU/4 thread(s)
129625
ns128958
ns1.01
batchedmm(128, Bsize=4)/forward/CPU/8 thread(s)
130042
ns96959
ns1.34
batchedmm(128, Bsize=4)/forward/CPU/1 thread(s)
182834
ns181416
ns1.01
batchedmm(128, Bsize=4)/forward/GPU/CUDA
46497
ns46106
ns1.01
batchedmm(128, Bsize=4)/forward/GPU/AMDGPU
103662
ns96666
ns1.07
batchedmm(128, Bsize=4)/zygote/CPU/2 thread(s)
302563
ns317875
ns0.95
batchedmm(128, Bsize=4)/zygote/CPU/4 thread(s)
327249.5
ns346375
ns0.94
batchedmm(128, Bsize=4)/zygote/CPU/8 thread(s)
313708
ns178979
ns1.75
batchedmm(128, Bsize=4)/zygote/CPU/1 thread(s)
603874.5
ns569062.5
ns1.06
batchedmm(128, Bsize=4)/zygote/GPU/CUDA
195592
ns191966
ns1.02
batchedmm(128, Bsize=4)/zygote/GPU/AMDGPU
538948
ns487875
ns1.10
dense(512, bias=true, act=identity)(512 x 128)/forward/CPU/2 thread(s)
397167
ns397125
ns1.00
dense(512, bias=true, act=identity)(512 x 128)/forward/CPU/4 thread(s)
288750
ns288292
ns1.00
dense(512, bias=true, act=identity)(512 x 128)/forward/CPU/8 thread(s)
288375
ns215791
ns1.34
dense(512, bias=true, act=identity)(512 x 128)/forward/CPU/1 thread(s)
757125
ns757959
ns1.00
dense(512, bias=true, act=identity)(512 x 128)/forward/GPU/CUDA
44085
ns43243.5
ns1.02
dense(512, bias=true, act=identity)(512 x 128)/forward/GPU/oneAPI
1443281
ns1345812
ns1.07
dense(512, bias=true, act=identity)(512 x 128)/forward/GPU/Metal
421187.5
ns404062.5
ns1.04
dense(512, bias=true, act=identity)(512 x 128)/forward/GPU/AMDGPU
87081
ns83381
ns1.04
dense(512, bias=true, act=identity)(512 x 128)/zygote/CPU/2 thread(s)
1454396
ns1459854
ns1.00
dense(512, bias=true, act=identity)(512 x 128)/zygote/CPU/4 thread(s)
1135375
ns1136645.5
ns1.00
dense(512, bias=true, act=identity)(512 x 128)/zygote/CPU/8 thread(s)
1132041.5
ns865270.5
ns1.31
dense(512, bias=true, act=identity)(512 x 128)/zygote/CPU/1 thread(s)
2362187.5
ns2359813
ns1.00
dense(512, bias=true, act=identity)(512 x 128)/zygote/GPU/CUDA
253149
ns259216
ns0.98
dense(512, bias=true, act=identity)(512 x 128)/zygote/GPU/oneAPI
11724690
ns11177773
ns1.05
dense(512, bias=true, act=identity)(512 x 128)/zygote/GPU/Metal
1784562.5
ns1833666
ns0.97
dense(512, bias=true, act=identity)(512 x 128)/zygote/GPU/AMDGPU
361220.5
ns349653.5
ns1.03
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s)
657875
ns642333
ns1.02
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s)
636062.5
ns649875
ns0.98
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s)
645333.5
ns660416.5
ns0.98
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s)
659250
ns623542
ns1.06
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/CUDA
206301
ns202604
ns1.02
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/oneAPI
8383365
ns7957177
ns1.05
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/Metal
1348479
ns1348791.5
ns1.00
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/AMDGPU
294724
ns265108
ns1.11
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s)
2436771
ns2448583
ns1.00
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s)
2445750
ns2452104
ns1.00
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s)
2450125
ns2473833
ns0.99
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s)
2475125
ns2455791
ns1.01
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/CUDA
1021725.5
ns1005284.5
ns1.02
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/oneAPI
53051132.5
ns50767854.5
ns1.04
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/Metal
9126083
ns10026166
ns0.91
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/AMDGPU
1419802
ns1511186
ns0.94
batchedmm(2, Bsize=32)/forward/CPU/2 thread(s)
33812.5
ns32375
ns1.04
batchedmm(2, Bsize=32)/forward/CPU/4 thread(s)
35417
ns35749.5
ns0.99
batchedmm(2, Bsize=32)/forward/CPU/8 thread(s)
34708.5
ns34312.5
ns1.01
batchedmm(2, Bsize=32)/forward/CPU/1 thread(s)
833
ns916
ns0.91
batchedmm(2, Bsize=32)/forward/GPU/CUDA
16136
ns15700
ns1.03
batchedmm(2, Bsize=32)/forward/GPU/AMDGPU
95052
ns81140
ns1.17
batchedmm(2, Bsize=32)/zygote/CPU/2 thread(s)
3083
ns3166
ns0.97
batchedmm(2, Bsize=32)/zygote/CPU/4 thread(s)
3333
ns3083
ns1.08
batchedmm(2, Bsize=32)/zygote/CPU/8 thread(s)
3333
ns3125
ns1.07
batchedmm(2, Bsize=32)/zygote/CPU/1 thread(s)
3167
ns3000
ns1.06
batchedmm(2, Bsize=32)/zygote/GPU/CUDA
142067
ns139352.5
ns1.02
batchedmm(2, Bsize=32)/zygote/GPU/AMDGPU
389191.5
ns344664
ns1.13
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s)
406166
ns405583
ns1.00
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s)
408167
ns408750
ns1.00
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s)
407833
ns403083
ns1.01
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s)
422208
ns422042
ns1.00
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/CUDA
43553
ns43343.5
ns1.00
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/oneAPI
1368822.5
ns1354478
ns1.01
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/Metal
1102709
ns1109583
ns0.99
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/AMDGPU
245914
ns240442
ns1.02
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s)
3863750
ns3869125
ns1.00
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s)
3994375.5
ns3994396
ns1.00
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s)
3960812
ns3999708
ns0.99
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s)
3790666.5
ns3774354.5
ns1.00
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/CUDA
247777
ns244251
ns1.01
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/oneAPI
39920104.5
ns35978667
ns1.11
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/Metal
11733541
ns11608750
ns1.01
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/AMDGPU
1266155.5
ns1245273.5
ns1.02
dense(32, bias=false, act=identity)(32 x 128)/forward/CPU/2 thread(s)
3917
ns3917
ns1
dense(32, bias=false, act=identity)(32 x 128)/forward/CPU/4 thread(s)
3958
ns3917
ns1.01
dense(32, bias=false, act=identity)(32 x 128)/forward/CPU/8 thread(s)
3958
ns3917
ns1.01
dense(32, bias=false, act=identity)(32 x 128)/forward/CPU/1 thread(s)
3875
ns3958
ns0.98
dense(32, bias=false, act=identity)(32 x 128)/forward/GPU/CUDA
34512
ns34866
ns0.99
dense(32, bias=false, act=identity)(32 x 128)/forward/GPU/oneAPI
1230578
ns1227111
ns1.00
dense(32, bias=false, act=identity)(32 x 128)/forward/GPU/Metal
176084
ns175291
ns1.00
dense(32, bias=false, act=identity)(32 x 128)/forward/GPU/AMDGPU
43730
ns42710
ns1.02
dense(32, bias=false, act=identity)(32 x 128)/zygote/CPU/2 thread(s)
15833
ns15750
ns1.01
dense(32, bias=false, act=identity)(32 x 128)/zygote/CPU/4 thread(s)
16042
ns15667
ns1.02
dense(32, bias=false, act=identity)(32 x 128)/zygote/CPU/8 thread(s)
16000
ns15500
ns1.03
dense(32, bias=false, act=identity)(32 x 128)/zygote/CPU/1 thread(s)
15417
ns15542
ns0.99
dense(32, bias=false, act=identity)(32 x 128)/zygote/GPU/CUDA
258709.5
ns256386
ns1.01
dense(32, bias=false, act=identity)(32 x 128)/zygote/GPU/oneAPI
8846251
ns8908913
ns0.99
dense(32, bias=false, act=identity)(32 x 128)/zygote/GPU/Metal
862749.5
ns872958
ns0.99
dense(32, bias=false, act=identity)(32 x 128)/zygote/GPU/AMDGPU
194923
ns174412
ns1.12
dense(512, bias=false, act=relu)(512 x 128)/forward/CPU/2 thread(s)
404209
ns404166
ns1.00
dense(512, bias=false, act=relu)(512 x 128)/forward/CPU/4 thread(s)
295750
ns295666
ns1.00
dense(512, bias=false, act=relu)(512 x 128)/forward/CPU/8 thread(s)
295459
ns221625
ns1.33
dense(512, bias=false, act=relu)(512 x 128)/forward/CPU/1 thread(s)
761083
ns760500
ns1.00
dense(512, bias=false, act=relu)(512 x 128)/forward/GPU/CUDA
113512
ns113218
ns1.00
dense(512, bias=false, act=relu)(512 x 128)/forward/GPU/oneAPI
1027530
ns1016425
ns1.01
dense(512, bias=false, act=relu)(512 x 128)/forward/GPU/Metal
395541
ns393437
ns1.01
dense(512, bias=false, act=relu)(512 x 128)/forward/GPU/AMDGPU
92702
ns90851
ns1.02
dense(512, bias=false, act=relu)(512 x 128)/zygote/CPU/2 thread(s)
1482166
ns1473333
ns1.01
dense(512, bias=false, act=relu)(512 x 128)/zygote/CPU/4 thread(s)
1160146
ns1161666
ns1.00
dense(512, bias=false, act=relu)(512 x 128)/zygote/CPU/8 thread(s)
1157000
ns888166.5
ns1.30
dense(512, bias=false, act=relu)(512 x 128)/zygote/CPU/1 thread(s)
2385416
ns2383791
ns1.00
dense(512, bias=false, act=relu)(512 x 128)/zygote/GPU/CUDA
244710.5
ns241468.5
ns1.01
dense(512, bias=false, act=relu)(512 x 128)/zygote/GPU/oneAPI
11288335
ns11846004
ns0.95
dense(512, bias=false, act=relu)(512 x 128)/zygote/GPU/Metal
1872833
ns1877938
ns1.00
dense(512, bias=false, act=relu)(512 x 128)/zygote/GPU/AMDGPU
361906
ns360704
ns1.00
batchnorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/2 thread(s)
500
ns500
ns1
batchnorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/4 thread(s)
583
ns583
ns1
batchnorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/8 thread(s)
584
ns459
ns1.27
batchnorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/1 thread(s)
459
ns542
ns0.85
batchnorm(2, act=gelu, affine=true)(4 x 32)/forward/GPU/CUDA
26536
ns25943
ns1.02
batchnorm(2, act=gelu, affine=true)(4 x 32)/forward/GPU/oneAPI
1153344.5
ns1192515
ns0.97
batchnorm(2, act=gelu, affine=true)(4 x 32)/forward/GPU/Metal
444792
ns470937.5
ns0.94
batchnorm(2, act=gelu, affine=true)(4 x 32)/forward/GPU/AMDGPU
213574
ns208143
ns1.03
batchnorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/2 thread(s)
7542
ns7458
ns1.01
batchnorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/4 thread(s)
7791
ns7583
ns1.03
batchnorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/8 thread(s)
8083
ns7458
ns1.08
batchnorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/1 thread(s)
7500
ns7709
ns0.97
batchnorm(2, act=gelu, affine=true)(4 x 32)/zygote/GPU/CUDA
215961
ns214477.5
ns1.01
batchnorm(2, act=gelu, affine=true)(4 x 32)/zygote/GPU/oneAPI
25096072
ns25777295.5
ns0.97
batchnorm(2, act=gelu, affine=true)(4 x 32)/zygote/GPU/Metal
5807500
ns5998979.5
ns0.97
batchnorm(2, act=gelu, affine=true)(4 x 32)/zygote/GPU/AMDGPU
743182
ns700287
ns1.06
batchedmm(128, Bsize=32)/forward/CPU/2 thread(s)
833166.5
ns831271
ns1.00
batchedmm(128, Bsize=32)/forward/CPU/4 thread(s)
620875
ns617041
ns1.01
batchedmm(128, Bsize=32)/forward/CPU/8 thread(s)
619375
ns470000
ns1.32
batchedmm(128, Bsize=32)/forward/CPU/1 thread(s)
1570729.5
ns1545709
ns1.02
batchedmm(128, Bsize=32)/forward/GPU/CUDA
129541
ns129860.5
ns1.00
batchedmm(128, Bsize=32)/forward/GPU/AMDGPU
179493
ns169171.5
ns1.06
batchedmm(128, Bsize=32)/zygote/CPU/2 thread(s)
2687271.5
ns2689145.5
ns1.00
batchedmm(128, Bsize=32)/zygote/CPU/4 thread(s)
2002417
ns2013250
ns0.99
batchedmm(128, Bsize=32)/zygote/CPU/8 thread(s)
2005437.5
ns1538125
ns1.30
batchedmm(128, Bsize=32)/zygote/CPU/1 thread(s)
4951500
ns4941375
ns1.00
batchedmm(128, Bsize=32)/zygote/GPU/CUDA
258930
ns241461
ns1.07
batchedmm(128, Bsize=32)/zygote/GPU/AMDGPU
923056
ns867019
ns1.06
batchnorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/2 thread(s)
333
ns291
ns1.14
batchnorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/4 thread(s)
375
ns375
ns1
batchnorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/8 thread(s)
375
ns333
ns1.13
batchnorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/1 thread(s)
292
ns292
ns1
batchnorm(2, act=relu, affine=false)(4 x 32)/forward/GPU/CUDA
32385
ns31985
ns1.01
batchnorm(2, act=relu, affine=false)(4 x 32)/forward/GPU/oneAPI
1154161
ns1142400.5
ns1.01
batchnorm(2, act=relu, affine=false)(4 x 32)/forward/GPU/Metal
273792
ns453291.5
ns0.60
batchnorm(2, act=relu, affine=false)(4 x 32)/forward/GPU/AMDGPU
51031
ns48580
ns1.05
batchnorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/2 thread(s)
6417
ns6250
ns1.03
batchnorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/4 thread(s)
6750
ns6375
ns1.06
batchnorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/8 thread(s)
6875
ns6416
ns1.07
batchnorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/1 thread(s)
6375
ns6166
ns1.03
batchnorm(2, act=relu, affine=false)(4 x 32)/zygote/GPU/CUDA
229170
ns224593
ns1.02
batchnorm(2, act=relu, affine=false)(4 x 32)/zygote/GPU/oneAPI
21853717
ns21127237.5
ns1.03
batchnorm(2, act=relu, affine=false)(4 x 32)/zygote/GPU/Metal
5377542
ns5053916
ns1.06
batchnorm(2, act=relu, affine=false)(4 x 32)/zygote/GPU/AMDGPU
408297
ns372504
ns1.10
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s)
2397334
ns2423917
ns0.99
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s)
2395792
ns2397291.5
ns1.00
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s)
2399146
ns2403792
ns1.00
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s)
2411125
ns2371125
ns1.02
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/CUDA
204989
ns203214
ns1.01
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/oneAPI
7992865.5
ns8123069
ns0.98
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/Metal
1435875
ns1393562
ns1.03
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/AMDGPU
384237
ns332763.5
ns1.15
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s)
4636500
ns4645250
ns1.00
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s)
4665291.5
ns4645125
ns1.00
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s)
4661083
ns4654250
ns1.00
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s)
4655083
ns4658042
ns1.00
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/CUDA
927269
ns910071
ns1.02
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/oneAPI
45794637
ns48057492
ns0.95
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/Metal
6682583.5
ns6619584
ns1.01
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/AMDGPU
1310394
ns1416215
ns0.93
bias_activation(512, act=relu)(512 x 128)/forward/CPU/2 thread(s)
7479.5
ns7438
ns1.01
bias_activation(512, act=relu)(512 x 128)/forward/CPU/4 thread(s)
7500
ns7083
ns1.06
bias_activation(512, act=relu)(512 x 128)/forward/CPU/8 thread(s)
7542
ns6958
ns1.08
bias_activation(512, act=relu)(512 x 128)/forward/CPU/1 thread(s)
6708
ns6979
ns0.96
bias_activation(512, act=relu)(512 x 128)/forward/GPU/CUDA
23953
ns23722
ns1.01
bias_activation(512, act=relu)(512 x 128)/forward/GPU/oneAPI
1173979.5
ns1176238
ns1.00
bias_activation(512, act=relu)(512 x 128)/forward/GPU/Metal
270854
ns263000
ns1.03
bias_activation(512, act=relu)(512 x 128)/forward/GPU/AMDGPU
42035.5
ns34150
ns1.23
bias_activation(512, act=relu)(512 x 128)/zygote/CPU/2 thread(s)
67667
ns68020.5
ns0.99
bias_activation(512, act=relu)(512 x 128)/zygote/CPU/4 thread(s)
47812.5
ns50312
ns0.95
bias_activation(512, act=relu)(512 x 128)/zygote/CPU/8 thread(s)
33917
ns53292
ns0.64
bias_activation(512, act=relu)(512 x 128)/zygote/CPU/1 thread(s)
32979.5
ns32583
ns1.01
bias_activation(512, act=relu)(512 x 128)/zygote/GPU/CUDA
222148
ns218170
ns1.02
bias_activation(512, act=relu)(512 x 128)/zygote/GPU/oneAPI
10439799.5
ns10824043
ns0.96
bias_activation(512, act=relu)(512 x 128)/zygote/GPU/Metal
1991792
ns2030958
ns0.98
bias_activation(512, act=relu)(512 x 128)/zygote/GPU/AMDGPU
267010
ns244333
ns1.09
batchedmm(2, Bsize=512)/forward/CPU/2 thread(s)
21770.5
ns21437
ns1.02
batchedmm(2, Bsize=512)/forward/CPU/4 thread(s)
26438
ns25333
ns1.04
batchedmm(2, Bsize=512)/forward/CPU/8 thread(s)
24959
ns23479.5
ns1.06
batchedmm(2, Bsize=512)/forward/CPU/1 thread(s)
5458
ns6083
ns0.90
batchedmm(2, Bsize=512)/forward/GPU/CUDA
17080.5
ns16786.5
ns1.02
batchedmm(2, Bsize=512)/forward/GPU/AMDGPU
83551
ns91501
ns0.91
batchedmm(2, Bsize=512)/zygote/CPU/2 thread(s)
11958.5
ns12208.5
ns0.98
batchedmm(2, Bsize=512)/zygote/CPU/4 thread(s)
10958
ns10083
ns1.09
batchedmm(2, Bsize=512)/zygote/CPU/8 thread(s)
10583
ns9458.5
ns1.12
batchedmm(2, Bsize=512)/zygote/CPU/1 thread(s)
17854.5
ns17854.5
ns1
batchedmm(2, Bsize=512)/zygote/GPU/CUDA
230933
ns228126
ns1.01
batchedmm(2, Bsize=512)/zygote/GPU/AMDGPU
405258
ns376824
ns1.08
dense(512, bias=true, act=relu)(512 x 128)/forward/CPU/2 thread(s)
406187.5
ns406500
ns1.00
dense(512, bias=true, act=relu)(512 x 128)/forward/CPU/4 thread(s)
297166
ns297312.5
ns1.00
dense(512, bias=true, act=relu)(512 x 128)/forward/CPU/8 thread(s)
297187.5
ns223791
ns1.33
dense(512, bias=true, act=relu)(512 x 128)/forward/CPU/1 thread(s)
762917
ns762958
ns1.00
dense(512, bias=true, act=relu)(512 x 128)/forward/GPU/CUDA
46934
ns46683
ns1.01
dense(512, bias=true, act=relu)(512 x 128)/forward/GPU/oneAPI
1403333
ns1412498.5
ns0.99
dense(512, bias=true, act=relu)(512 x 128)/forward/GPU/Metal
411187.5
ns476666.5
ns0.86
dense(512, bias=true, act=relu)(512 x 128)/forward/GPU/AMDGPU
92831
ns89121
ns1.04
dense(512, bias=true, act=relu)(512 x 128)/zygote/CPU/2 thread(s)
1505167
ns1499875
ns1.00
dense(512, bias=true, act=relu)(512 x 128)/zygote/CPU/4 thread(s)
1167833
ns1167833.5
ns1.00
dense(512, bias=true, act=relu)(512 x 128)/zygote/CPU/8 thread(s)
1168625
ns894271
ns1.31
dense(512, bias=true, act=relu)(512 x 128)/zygote/CPU/1 thread(s)
2388625
ns2389834
ns1.00
dense(512, bias=true, act=relu)(512 x 128)/zygote/GPU/CUDA
288977
ns292932.5
ns0.99
dense(512, bias=true, act=relu)(512 x 128)/zygote/GPU/oneAPI
14170527.5
ns13048501
ns1.09
dense(512, bias=true, act=relu)(512 x 128)/zygote/GPU/Metal
2071500
ns2098166
ns0.99
dense(512, bias=true, act=relu)(512 x 128)/zygote/GPU/AMDGPU
384147
ns380285
ns1.01
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s)
433875
ns433875
ns1
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s)
437208
ns436334
ns1.00
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s)
436750
ns430709
ns1.01
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s)
448250
ns448020.5
ns1.00
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/CUDA
55350
ns54564
ns1.01
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/oneAPI
1004353.5
ns1024914
ns0.98
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/Metal
1090667
ns1099208.5
ns0.99
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/AMDGPU
239614
ns236522.5
ns1.01
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s)
3885209
ns3897208
ns1.00
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s)
4024833
ns4021833
ns1.00
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s)
3930770.5
ns4027708
ns0.98
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s)
3816104
ns3812146
ns1.00
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/CUDA
268116
ns264154
ns1.01
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/oneAPI
30218847.5
ns31494055
ns0.96
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/Metal
10492979.5
ns10517749.5
ns1.00
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/AMDGPU
1300253
ns1245028
ns1.04
dense(32, bias=true, act=gelu)(32 x 128)/forward/CPU/2 thread(s)
8791
ns8750
ns1.00
dense(32, bias=true, act=gelu)(32 x 128)/forward/CPU/4 thread(s)
7667
ns7666
ns1.00
dense(32, bias=true, act=gelu)(32 x 128)/forward/CPU/8 thread(s)
7667
ns6834
ns1.12
dense(32, bias=true, act=gelu)(32 x 128)/forward/CPU/1 thread(s)
12416
ns12459
ns1.00
dense(32, bias=true, act=gelu)(32 x 128)/forward/GPU/CUDA
24893
ns24707
ns1.01
dense(32, bias=true, act=gelu)(32 x 128)/forward/GPU/oneAPI
2144175
ns2085760.5
ns1.03
dense(32, bias=true, act=gelu)(32 x 128)/forward/GPU/Metal
220584
ns225250
ns0.98
dense(32, bias=true, act=gelu)(32 x 128)/forward/GPU/AMDGPU
221494
ns215337.5
ns1.03
dense(32, bias=true, act=gelu)(32 x 128)/zygote/CPU/2 thread(s)
45125
ns45042
ns1.00
dense(32, bias=true, act=gelu)(32 x 128)/zygote/CPU/4 thread(s)
45125
ns45125
ns1
dense(32, bias=true, act=gelu)(32 x 128)/zygote/CPU/8 thread(s)
45167
ns45083
ns1.00
dense(32, bias=true, act=gelu)(32 x 128)/zygote/CPU/1 thread(s)
45020.5
ns45187.5
ns1.00
dense(32, bias=true, act=gelu)(32 x 128)/zygote/GPU/CUDA
350730
ns350283.5
ns1.00
dense(32, bias=true, act=gelu)(32 x 128)/zygote/GPU/oneAPI
13707515
ns11134325
ns1.23
dense(32, bias=true, act=gelu)(32 x 128)/zygote/GPU/Metal
1817750
ns1805125
ns1.01
dense(32, bias=true, act=gelu)(32 x 128)/zygote/GPU/AMDGPU
670762.5
ns662902
ns1.01
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s)
105937.5
ns93959
ns1.13
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s)
84541
ns129416
ns0.65
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s)
88500
ns87916.5
ns1.01
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s)
121479
ns125062.5
ns0.97
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/CUDA
189955
ns189645
ns1.00
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/oneAPI
5812012.5
ns5972246.5
ns0.97
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/Metal
2722250
ns1906021.5
ns1.43
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/AMDGPU
187524
ns201947
ns0.93
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s)
2003958
ns2011375
ns1.00
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s)
2016666.5
ns2017791
ns1.00
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s)
1982645.5
ns2029459
ns0.98
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s)
2047125
ns2017916.5
ns1.01
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/CUDA
543645
ns537811
ns1.01
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/oneAPI
28323206
ns27667805
ns1.02
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/Metal
9581833.5
ns9734479.5
ns0.98
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/AMDGPU
990259
ns1103102
ns0.90
This comment was automatically generated by workflow using github-action-benchmark.