Skip to content
This repository has been archived by the owner on Nov 4, 2024. It is now read-only.

Commit

Permalink
fix: windows testing for dropout
Browse files Browse the repository at this point in the history
  • Loading branch information
avik-pal committed Sep 5, 2024
1 parent da67a46 commit 9510cfa
Showing 1 changed file with 3 additions and 1 deletion.
4 changes: 3 additions & 1 deletion test/common_ops/dropout_tests.jl
Original file line number Diff line number Diff line change
@@ -105,9 +105,11 @@ end

soft_fail = T == Float16 ? Any[AutoFiniteDiff()] : []
skip_backends = length(x_shape) == 5 ? [AutoEnzyme()] : []
+ broken_backends = T == Float16 && Sys.iswindows() && length(x_shape) != 5 ?
+ [AutoEnzyme()] : []

  test_gradients(__f, x; atol=1.0f-3, rtol=1.0f-3, soft_fail, skip_backends,
- broken_backends=(T == Float16 && Sys.iswindows() ? [AutoEnzyme()] : []))
+ broken_backends)

@jet sum(first(dropout(
rng, x, mask, T(0.5), Val(true), Val(false), T(2), :)))
Expand Down

1 comment on commit 9510cfa

@github-actions
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

LuxLib Benchmarks

Benchmark suite Current: 9510cfa Previous: 1afc1c7 Ratio
layernorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/2 thread(s) 5979 ns 5479.5 ns 1.09
layernorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/4 thread(s) 6083 ns 6375 ns 0.95
layernorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/8 thread(s) 8208 ns 8000 ns 1.03
layernorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/1 thread(s) 7209 ns 6375 ns 1.13
layernorm(2, act=gelu, affine=false)(4 x 32)/forward/GPU/CUDA 118214 ns 119198 ns 0.99
layernorm(2, act=gelu, affine=false)(4 x 32)/forward/GPU/oneAPI 2759609 ns 2649209 ns 1.04
layernorm(2, act=gelu, affine=false)(4 x 32)/forward/GPU/Metal 712042 ns 704000 ns 1.01
layernorm(2, act=gelu, affine=false)(4 x 32)/forward/GPU/AMDGPU 604929 ns 417764 ns 1.45
layernorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/2 thread(s) 9917 ns 9812 ns 1.01
layernorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/4 thread(s) 10250 ns 9625 ns 1.06
layernorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/8 thread(s) 10250 ns 10042 ns 1.02
layernorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/1 thread(s) 9937.5 ns 9541 ns 1.04
layernorm(2, act=gelu, affine=false)(4 x 32)/zygote/GPU/CUDA 557097 ns 551456 ns 1.01
layernorm(2, act=gelu, affine=false)(4 x 32)/zygote/GPU/oneAPI 17173060 ns 16841216 ns 1.02
layernorm(2, act=gelu, affine=false)(4 x 32)/zygote/GPU/Metal 2390459 ns 2645125 ns 0.90
layernorm(2, act=gelu, affine=false)(4 x 32)/zygote/GPU/AMDGPU 692707 ns 659636 ns 1.05
bias_activation(32, act=relu)(32 x 128)/forward/CPU/2 thread(s) 3041 ns 1395.5 ns 2.18
bias_activation(32, act=relu)(32 x 128)/forward/CPU/4 thread(s) 1417 ns 1687.5 ns 0.84
bias_activation(32, act=relu)(32 x 128)/forward/CPU/8 thread(s) 1792 ns 1875 ns 0.96
bias_activation(32, act=relu)(32 x 128)/forward/CPU/1 thread(s) 1833.5 ns 2521 ns 0.73
bias_activation(32, act=relu)(32 x 128)/forward/GPU/CUDA 21918 ns 21867 ns 1.00
bias_activation(32, act=relu)(32 x 128)/forward/GPU/oneAPI 1341543 ns 1304894 ns 1.03
bias_activation(32, act=relu)(32 x 128)/forward/GPU/Metal 198917 ns 212604 ns 0.94
bias_activation(32, act=relu)(32 x 128)/forward/GPU/AMDGPU 32340 ns 30820.5 ns 1.05
bias_activation(32, act=relu)(32 x 128)/zygote/CPU/2 thread(s) 4354.5 ns 4209 ns 1.03
bias_activation(32, act=relu)(32 x 128)/zygote/CPU/4 thread(s) 3792 ns 4312.5 ns 0.88
bias_activation(32, act=relu)(32 x 128)/zygote/CPU/8 thread(s) 4084 ns 3917 ns 1.04
bias_activation(32, act=relu)(32 x 128)/zygote/CPU/1 thread(s) 3834 ns 4375 ns 0.88
bias_activation(32, act=relu)(32 x 128)/zygote/GPU/CUDA 147392 ns 146279 ns 1.01
bias_activation(32, act=relu)(32 x 128)/zygote/GPU/oneAPI 8702415 ns 8894773.5 ns 0.98
bias_activation(32, act=relu)(32 x 128)/zygote/GPU/Metal 1537625 ns 1523375 ns 1.01
bias_activation(32, act=relu)(32 x 128)/zygote/GPU/AMDGPU 156613 ns 148982 ns 1.05
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 57792 ns 57542 ns 1.00
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 46667 ns 46584 ns 1.00
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 47042 ns 39875 ns 1.18
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 83416.5 ns 83708 ns 1.00
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/CUDA 37067 ns 36787 ns 1.01
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/oneAPI 549762 ns 582007 ns 0.94
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/Metal 1031250 ns 985625 ns 1.05
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/AMDGPU 84262 ns 84391 ns 1.00
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 2026542 ns 2036583 ns 1.00
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 2089291 ns 2086750 ns 1.00
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 2082209 ns 2079917 ns 1.00
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 2004562 ns 1987312.5 ns 1.01
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 229332 ns 227214 ns 1.01
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/oneAPI 7560863 ns 7854957 ns 0.96
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/Metal 4611833 ns 7818750 ns 0.59
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/AMDGPU 1299247 ns 967560 ns 1.34
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 155125 ns 154083 ns 1.01
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 146459 ns 146958 ns 1.00
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 149416 ns 149979.5 ns 1.00
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 171583 ns 165187.5 ns 1.04
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/GPU/CUDA 166520 ns 166381 ns 1.00
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/GPU/oneAPI 7685074 ns 7795058 ns 0.99
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/GPU/Metal 1471541.5 ns 1464583 ns 1.00
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/GPU/AMDGPU 220947 ns 207072 ns 1.07
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 1112687.5 ns 1110895.5 ns 1.00
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 1109250 ns 1103209 ns 1.01
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 1110041.5 ns 1118687 ns 0.99
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 1115354 ns 1109562.5 ns 1.01
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 715343.5 ns 711437 ns 1.01
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/oneAPI 34840940 ns 33922938.5 ns 1.03
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/Metal 6342708 ns 6051917 ns 1.05
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/AMDGPU 970326.5 ns 1036360 ns 0.94
layernorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/2 thread(s) 4708 ns 5208 ns 0.90
layernorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/4 thread(s) 5812 ns 4271 ns 1.36
layernorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/8 thread(s) 6125 ns 5375 ns 1.14
layernorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/1 thread(s) 5333 ns 4584 ns 1.16
layernorm(2, act=relu, affine=true)(4 x 32)/forward/GPU/CUDA 94815 ns 94268 ns 1.01
layernorm(2, act=relu, affine=true)(4 x 32)/forward/GPU/oneAPI 5182206 ns 5136056 ns 1.01
layernorm(2, act=relu, affine=true)(4 x 32)/forward/GPU/Metal 655209 ns 711583 ns 0.92
layernorm(2, act=relu, affine=true)(4 x 32)/forward/GPU/AMDGPU 71833 ns 69481 ns 1.03
layernorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/2 thread(s) 8625 ns 8667 ns 1.00
layernorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/4 thread(s) 8625 ns 8500 ns 1.01
layernorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/8 thread(s) 8708 ns 8917 ns 0.98
layernorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/1 thread(s) 8542 ns 8333 ns 1.03
layernorm(2, act=relu, affine=true)(4 x 32)/zygote/GPU/CUDA 606843 ns 603970 ns 1.00
layernorm(2, act=relu, affine=true)(4 x 32)/zygote/GPU/oneAPI 38524595 ns 33683319.5 ns 1.14
layernorm(2, act=relu, affine=true)(4 x 32)/zygote/GPU/Metal 5543417 ns 5821292 ns 0.95
layernorm(2, act=relu, affine=true)(4 x 32)/zygote/GPU/AMDGPU 415994 ns 389889 ns 1.07
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 18125 ns 17729.5 ns 1.02
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 18229.5 ns 20042 ns 0.91
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 21167 ns 20584 ns 1.03
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 18958 ns 20416.5 ns 0.93
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/GPU/CUDA 67302 ns 66995 ns 1.00
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/GPU/oneAPI 3257619 ns 2897295 ns 1.12
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/GPU/Metal 1274125 ns 1301292 ns 0.98
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/GPU/AMDGPU 85063 ns 73931 ns 1.15
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 211791 ns 211625 ns 1.00
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 212833 ns 218875 ns 0.97
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 220396 ns 218667 ns 1.01
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 219479 ns 224875 ns 0.98
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 359730 ns 357740 ns 1.01
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/oneAPI 13755121.5 ns 14308445 ns 0.96
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/Metal 5707854 ns 5704396 ns 1.00
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/AMDGPU 496728 ns 473855 ns 1.05
bias_activation(2, act=relu)(2 x 128)/forward/CPU/2 thread(s) 750 ns 625 ns 1.20
bias_activation(2, act=relu)(2 x 128)/forward/CPU/4 thread(s) 770.5 ns 666 ns 1.16
bias_activation(2, act=relu)(2 x 128)/forward/CPU/8 thread(s) 812.5 ns 750 ns 1.08
bias_activation(2, act=relu)(2 x 128)/forward/CPU/1 thread(s) 625 ns 666 ns 0.94
bias_activation(2, act=relu)(2 x 128)/forward/GPU/CUDA 20915 ns 20965 ns 1.00
bias_activation(2, act=relu)(2 x 128)/forward/GPU/oneAPI 1138496 ns 1157358.5 ns 0.98
bias_activation(2, act=relu)(2 x 128)/forward/GPU/Metal 283312.5 ns 283542 ns 1.00
bias_activation(2, act=relu)(2 x 128)/forward/GPU/AMDGPU 34981 ns 32571 ns 1.07
bias_activation(2, act=relu)(2 x 128)/zygote/CPU/2 thread(s) 1458 ns 1375 ns 1.06
bias_activation(2, act=relu)(2 x 128)/zygote/CPU/4 thread(s) 1458 ns 1375 ns 1.06
bias_activation(2, act=relu)(2 x 128)/zygote/CPU/8 thread(s) 1458 ns 1500 ns 0.97
bias_activation(2, act=relu)(2 x 128)/zygote/CPU/1 thread(s) 1375 ns 1334 ns 1.03
bias_activation(2, act=relu)(2 x 128)/zygote/GPU/CUDA 127115.5 ns 125947 ns 1.01
bias_activation(2, act=relu)(2 x 128)/zygote/GPU/oneAPI 8947782 ns 8433349.5 ns 1.06
bias_activation(2, act=relu)(2 x 128)/zygote/GPU/Metal 1487333 ns 1594979.5 ns 0.93
bias_activation(2, act=relu)(2 x 128)/zygote/GPU/AMDGPU 140405 ns 138471 ns 1.01
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 7292 ns 7334 ns 0.99
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 6083 ns 6125 ns 0.99
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 6042 ns 5333 ns 1.13
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 10417 ns 10417 ns 1
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/CUDA 24005 ns 23836 ns 1.01
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/oneAPI 1270299 ns 1232101.5 ns 1.03
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/Metal 467959 ns 583125 ns 0.80
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/AMDGPU 48862 ns 46460 ns 1.05
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 234875 ns 227708 ns 1.03
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 227500 ns 235583 ns 0.97
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 265083 ns 264667 ns 1.00
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 249333 ns 248583 ns 1.00
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 190814 ns 190580 ns 1.00
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/oneAPI 30562210 ns 29562269.5 ns 1.03
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/Metal 8899916.5 ns 8564854.5 ns 1.04
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/AMDGPU 676226 ns 611281 ns 1.11
dense(32, bias=false, act=relu)(32 x 128)/forward/CPU/2 thread(s) 4125 ns 4084 ns 1.01
dense(32, bias=false, act=relu)(32 x 128)/forward/CPU/4 thread(s) 4084 ns 4084 ns 1
dense(32, bias=false, act=relu)(32 x 128)/forward/CPU/8 thread(s) 4084 ns 4125 ns 0.99
dense(32, bias=false, act=relu)(32 x 128)/forward/CPU/1 thread(s) 4084 ns 4125 ns 0.99
dense(32, bias=false, act=relu)(32 x 128)/forward/GPU/CUDA 23878 ns 23789 ns 1.00
dense(32, bias=false, act=relu)(32 x 128)/forward/GPU/oneAPI 1989779 ns 2018577 ns 0.99
dense(32, bias=false, act=relu)(32 x 128)/forward/GPU/Metal 221875 ns 219791.5 ns 1.01
dense(32, bias=false, act=relu)(32 x 128)/forward/GPU/AMDGPU 52132 ns 50370 ns 1.03
dense(32, bias=false, act=relu)(32 x 128)/zygote/CPU/2 thread(s) 16833 ns 16958 ns 0.99
dense(32, bias=false, act=relu)(32 x 128)/zygote/CPU/4 thread(s) 16917 ns 17083 ns 0.99
dense(32, bias=false, act=relu)(32 x 128)/zygote/CPU/8 thread(s) 17083 ns 17083 ns 1
dense(32, bias=false, act=relu)(32 x 128)/zygote/CPU/1 thread(s) 16708 ns 16666 ns 1.00
dense(32, bias=false, act=relu)(32 x 128)/zygote/GPU/CUDA 198115 ns 197449 ns 1.00
dense(32, bias=false, act=relu)(32 x 128)/zygote/GPU/oneAPI 10848086 ns 9693737.5 ns 1.12
dense(32, bias=false, act=relu)(32 x 128)/zygote/GPU/Metal 921750 ns 940458 ns 0.98
dense(32, bias=false, act=relu)(32 x 128)/zygote/GPU/AMDGPU 188130 ns 176226.5 ns 1.07
dense(512, bias=false, act=gelu)(512 x 128)/forward/CPU/2 thread(s) 509958 ns 509500 ns 1.00
dense(512, bias=false, act=gelu)(512 x 128)/forward/CPU/4 thread(s) 404375 ns 405083 ns 1.00
dense(512, bias=false, act=gelu)(512 x 128)/forward/CPU/8 thread(s) 404666 ns 332459 ns 1.22
dense(512, bias=false, act=gelu)(512 x 128)/forward/CPU/1 thread(s) 864708.5 ns 865125 ns 1.00
dense(512, bias=false, act=gelu)(512 x 128)/forward/GPU/CUDA 113137 ns 113130 ns 1.00
dense(512, bias=false, act=gelu)(512 x 128)/forward/GPU/oneAPI 406001 ns 391060 ns 1.04
dense(512, bias=false, act=gelu)(512 x 128)/forward/GPU/Metal 454833 ns 451416 ns 1.01
dense(512, bias=false, act=gelu)(512 x 128)/forward/GPU/AMDGPU 252759 ns 248703 ns 1.02
dense(512, bias=false, act=gelu)(512 x 128)/zygote/CPU/2 thread(s) 2321792 ns 2324333 ns 1.00
dense(512, bias=false, act=gelu)(512 x 128)/zygote/CPU/4 thread(s) 2031312.5 ns 2025375.5 ns 1.00
dense(512, bias=false, act=gelu)(512 x 128)/zygote/CPU/8 thread(s) 2026292 ns 1752833.5 ns 1.16
dense(512, bias=false, act=gelu)(512 x 128)/zygote/CPU/1 thread(s) 3197437.5 ns 3200583 ns 1.00
dense(512, bias=false, act=gelu)(512 x 128)/zygote/GPU/CUDA 243343 ns 244865 ns 0.99
dense(512, bias=false, act=gelu)(512 x 128)/zygote/GPU/oneAPI 12608189 ns 11656548 ns 1.08
dense(512, bias=false, act=gelu)(512 x 128)/zygote/GPU/Metal 1833333 ns 1966229 ns 0.93
dense(512, bias=false, act=gelu)(512 x 128)/zygote/GPU/AMDGPU 766712 ns 761317.5 ns 1.01
layernorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/2 thread(s) 6354.5 ns 6250 ns 1.02
layernorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/4 thread(s) 7541.5 ns 6145.5 ns 1.23
layernorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/8 thread(s) 8250 ns 7729 ns 1.07
layernorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/1 thread(s) 6792 ns 6375 ns 1.07
layernorm(2, act=relu, affine=false)(32 x 32)/forward/GPU/CUDA 92699 ns 93009 ns 1.00
layernorm(2, act=relu, affine=false)(32 x 32)/forward/GPU/oneAPI 5474342 ns 5406797 ns 1.01
layernorm(2, act=relu, affine=false)(32 x 32)/forward/GPU/Metal 770459 ns 758167 ns 1.02
layernorm(2, act=relu, affine=false)(32 x 32)/forward/GPU/AMDGPU 62268 ns 60110 ns 1.04
layernorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/2 thread(s) 10416 ns 10646 ns 0.98
layernorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/4 thread(s) 12500 ns 10542 ns 1.19
layernorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/8 thread(s) 11875 ns 11084 ns 1.07
layernorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/1 thread(s) 11750 ns 10375 ns 1.13
layernorm(2, act=relu, affine=false)(32 x 32)/zygote/GPU/CUDA 650339 ns 660576 ns 0.98
layernorm(2, act=relu, affine=false)(32 x 32)/zygote/GPU/oneAPI 37332074.5 ns 38819677 ns 0.96
layernorm(2, act=relu, affine=false)(32 x 32)/zygote/GPU/Metal 5429250.5 ns 5487104 ns 0.99
layernorm(2, act=relu, affine=false)(32 x 32)/zygote/GPU/AMDGPU 444798 ns 416424 ns 1.07
dense(2, bias=true, act=relu)(2 x 128)/forward/CPU/2 thread(s) 500 ns 500 ns 1
dense(2, bias=true, act=relu)(2 x 128)/forward/CPU/4 thread(s) 542 ns 500 ns 1.08
dense(2, bias=true, act=relu)(2 x 128)/forward/CPU/8 thread(s) 542 ns 541 ns 1.00
dense(2, bias=true, act=relu)(2 x 128)/forward/CPU/1 thread(s) 541 ns 542 ns 1.00
dense(2, bias=true, act=relu)(2 x 128)/forward/GPU/CUDA 23458 ns 23635 ns 0.99
dense(2, bias=true, act=relu)(2 x 128)/forward/GPU/oneAPI 2134920 ns 2221310 ns 0.96
dense(2, bias=true, act=relu)(2 x 128)/forward/GPU/Metal 321917 ns 319750 ns 1.01
dense(2, bias=true, act=relu)(2 x 128)/forward/GPU/AMDGPU 54209 ns 53401 ns 1.02
dense(2, bias=true, act=relu)(2 x 128)/zygote/CPU/2 thread(s) 2083 ns 2083 ns 1
dense(2, bias=true, act=relu)(2 x 128)/zygote/CPU/4 thread(s) 2209 ns 2083 ns 1.06
dense(2, bias=true, act=relu)(2 x 128)/zygote/CPU/8 thread(s) 2208 ns 2084 ns 1.06
dense(2, bias=true, act=relu)(2 x 128)/zygote/CPU/1 thread(s) 2125 ns 2125 ns 1
dense(2, bias=true, act=relu)(2 x 128)/zygote/GPU/CUDA 230504 ns 232566 ns 0.99
dense(2, bias=true, act=relu)(2 x 128)/zygote/GPU/oneAPI 10796849.5 ns 11381984 ns 0.95
dense(2, bias=true, act=relu)(2 x 128)/zygote/GPU/Metal 1959959 ns 1912541.5 ns 1.02
dense(2, bias=true, act=relu)(2 x 128)/zygote/GPU/AMDGPU 184907 ns 186466.5 ns 0.99
groupnorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/2 thread(s) 8916 ns 8375 ns 1.06
groupnorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/4 thread(s) 9250 ns 8750 ns 1.06
groupnorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/8 thread(s) 11083 ns 10438 ns 1.06
groupnorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/1 thread(s) 9042 ns 8958 ns 1.01
groupnorm(2, act=relu, affine=true)(32 x 32)/forward/GPU/CUDA 102829 ns 104173 ns 0.99
groupnorm(2, act=relu, affine=true)(32 x 32)/forward/GPU/oneAPI 3264654 ns 3244842 ns 1.01
groupnorm(2, act=relu, affine=true)(32 x 32)/forward/GPU/Metal 786812 ns 896708 ns 0.88
groupnorm(2, act=relu, affine=true)(32 x 32)/forward/GPU/AMDGPU 78719 ns 74231 ns 1.06
groupnorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/2 thread(s) 17333.5 ns 17708 ns 0.98
groupnorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/4 thread(s) 18562.5 ns 17750 ns 1.05
groupnorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/8 thread(s) 19000 ns 18187.5 ns 1.04
groupnorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/1 thread(s) 18729 ns 18041.5 ns 1.04
groupnorm(2, act=relu, affine=true)(32 x 32)/zygote/GPU/CUDA 573932.5 ns 610296 ns 0.94
groupnorm(2, act=relu, affine=true)(32 x 32)/zygote/GPU/oneAPI 17434078.5 ns 17126722 ns 1.02
groupnorm(2, act=relu, affine=true)(32 x 32)/zygote/GPU/Metal 5154166.5 ns 5229458 ns 0.99
groupnorm(2, act=relu, affine=true)(32 x 32)/zygote/GPU/AMDGPU 408228 ns 387209 ns 1.05
batchnorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/2 thread(s) 625 ns 500 ns 1.25
batchnorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/4 thread(s) 583 ns 625 ns 0.93
batchnorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/8 thread(s) 625 ns 541 ns 1.16
batchnorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/1 thread(s) 500 ns 500 ns 1
batchnorm(2, act=identity, affine=false)(32 x 32)/forward/GPU/CUDA 35613.5 ns 35555 ns 1.00
batchnorm(2, act=identity, affine=false)(32 x 32)/forward/GPU/oneAPI 1174612 ns 1100087 ns 1.07
batchnorm(2, act=identity, affine=false)(32 x 32)/forward/GPU/Metal 443875 ns 438541 ns 1.01
batchnorm(2, act=identity, affine=false)(32 x 32)/forward/GPU/AMDGPU 50200 ns 47930 ns 1.05
batchnorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/2 thread(s) 8875 ns 9312 ns 0.95
batchnorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/4 thread(s) 10021 ns 8125 ns 1.23
batchnorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/8 thread(s) 9792 ns 9792 ns 1
batchnorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/1 thread(s) 9542 ns 9146 ns 1.04
batchnorm(2, act=identity, affine=false)(32 x 32)/zygote/GPU/CUDA 253397 ns 256000 ns 0.99
batchnorm(2, act=identity, affine=false)(32 x 32)/zygote/GPU/oneAPI 17680433 ns 19311232 ns 0.92
batchnorm(2, act=identity, affine=false)(32 x 32)/zygote/GPU/Metal 4449708.5 ns 4774937.5 ns 0.93
batchnorm(2, act=identity, affine=false)(32 x 32)/zygote/GPU/AMDGPU 414840 ns 378844 ns 1.10
dense(512, bias=false, act=identity)(512 x 128)/forward/CPU/2 thread(s) 397542 ns 397000 ns 1.00
dense(512, bias=false, act=identity)(512 x 128)/forward/CPU/4 thread(s) 287875 ns 288125 ns 1.00
dense(512, bias=false, act=identity)(512 x 128)/forward/CPU/8 thread(s) 287958 ns 215667 ns 1.34
dense(512, bias=false, act=identity)(512 x 128)/forward/CPU/1 thread(s) 756250 ns 756875 ns 1.00
dense(512, bias=false, act=identity)(512 x 128)/forward/GPU/CUDA 112466 ns 111981 ns 1.00
dense(512, bias=false, act=identity)(512 x 128)/forward/GPU/oneAPI 323442 ns 320003 ns 1.01
dense(512, bias=false, act=identity)(512 x 128)/forward/GPU/Metal 468208.5 ns 365500 ns 1.28
dense(512, bias=false, act=identity)(512 x 128)/forward/GPU/AMDGPU 80790 ns 78230 ns 1.03
dense(512, bias=false, act=identity)(512 x 128)/zygote/CPU/2 thread(s) 1450500 ns 1460875 ns 0.99
dense(512, bias=false, act=identity)(512 x 128)/zygote/CPU/4 thread(s) 1136437 ns 1135291.5 ns 1.00
dense(512, bias=false, act=identity)(512 x 128)/zygote/CPU/8 thread(s) 1133813 ns 862687.5 ns 1.31
dense(512, bias=false, act=identity)(512 x 128)/zygote/CPU/1 thread(s) 2359041.5 ns 2357291 ns 1.00
dense(512, bias=false, act=identity)(512 x 128)/zygote/GPU/CUDA 208101 ns 209166.5 ns 0.99
dense(512, bias=false, act=identity)(512 x 128)/zygote/GPU/oneAPI 10207256 ns 9267436 ns 1.10
dense(512, bias=false, act=identity)(512 x 128)/zygote/GPU/Metal 1607895.5 ns 1516312.5 ns 1.06
dense(512, bias=false, act=identity)(512 x 128)/zygote/GPU/AMDGPU 333532 ns 323643 ns 1.03
layernorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/2 thread(s) 7375 ns 6667 ns 1.11
layernorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/4 thread(s) 7604 ns 6959 ns 1.09
layernorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/8 thread(s) 9188 ns 8958.5 ns 1.03
layernorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/1 thread(s) 7417 ns 7334 ns 1.01
layernorm(2, act=relu, affine=true)(32 x 32)/forward/GPU/CUDA 138055 ns 144567 ns 0.95
layernorm(2, act=relu, affine=true)(32 x 32)/forward/GPU/oneAPI 5764079.5 ns 5867002 ns 0.98
layernorm(2, act=relu, affine=true)(32 x 32)/forward/GPU/Metal 447958 ns 707270.5 ns 0.63
layernorm(2, act=relu, affine=true)(32 x 32)/forward/GPU/AMDGPU 62411 ns 70660 ns 0.88
layernorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/2 thread(s) 13812 ns 15395.5 ns 0.90
layernorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/4 thread(s) 14667 ns 12417 ns 1.18
layernorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/8 thread(s) 14875 ns 14250 ns 1.04
layernorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/1 thread(s) 14167 ns 13312 ns 1.06
layernorm(2, act=relu, affine=true)(32 x 32)/zygote/GPU/CUDA 978862 ns 958993.5 ns 1.02
layernorm(2, act=relu, affine=true)(32 x 32)/zygote/GPU/oneAPI 42062469 ns 40369162 ns 1.04
layernorm(2, act=relu, affine=true)(32 x 32)/zygote/GPU/Metal 5817291.5 ns 5752729.5 ns 1.01
layernorm(2, act=relu, affine=true)(32 x 32)/zygote/GPU/AMDGPU 461364 ns 433804 ns 1.06
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 25354.5 ns 24416 ns 1.04
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 30458.5 ns 26417 ns 1.15
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 29708 ns 28687 ns 1.04
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 25458.5 ns 26874.5 ns 0.95
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/GPU/CUDA 202433.5 ns 201880.5 ns 1.00
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/GPU/oneAPI 7728083 ns 8100056 ns 0.95
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/GPU/Metal 921145.5 ns 896833 ns 1.03
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/GPU/AMDGPU 121551 ns 114876.5 ns 1.06
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 152834 ns 148834 ns 1.03
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 149583 ns 104708 ns 1.43
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 147083 ns 153500 ns 0.96
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 153917 ns 116979 ns 1.32
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 1086649 ns 1086710 ns 1.00
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/oneAPI 41913933 ns 41151661 ns 1.02
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/Metal 5704750 ns 5843229.5 ns 0.98
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/AMDGPU 620779 ns 594985 ns 1.04
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 75750 ns 73958 ns 1.02
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 78187.5 ns 76791.5 ns 1.02
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 84895.5 ns 80166 ns 1.06
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 85458 ns 75417 ns 1.13
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/CUDA 208740.5 ns 207189 ns 1.01
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/oneAPI 7587978 ns 7362606 ns 1.03
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/Metal 522250 ns 519687.5 ns 1.00
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/AMDGPU 136042 ns 126391.5 ns 1.08
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 290000 ns 297334 ns 0.98
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 286937 ns 221667 ns 1.29
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 302125 ns 288917 ns 1.05
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 279416 ns 221041.5 ns 1.26
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 1130753 ns 1119401 ns 1.01
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/oneAPI 43569010 ns 41008184.5 ns 1.06
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/Metal 6840729 ns 6497687.5 ns 1.05
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/AMDGPU 727137.5 ns 694627 ns 1.05
layernorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/2 thread(s) 17167 ns 16417 ns 1.05
layernorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/4 thread(s) 17084 ns 16583 ns 1.03
layernorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/8 thread(s) 17417 ns 17792 ns 0.98
layernorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/1 thread(s) 16291 ns 16708 ns 0.98
layernorm(2, act=gelu, affine=true)(32 x 32)/forward/GPU/CUDA 148129 ns 147421 ns 1.00
layernorm(2, act=gelu, affine=true)(32 x 32)/forward/GPU/oneAPI 5485127.5 ns 5759467 ns 0.95
layernorm(2, act=gelu, affine=true)(32 x 32)/forward/GPU/Metal 471208 ns 427292 ns 1.10
layernorm(2, act=gelu, affine=true)(32 x 32)/forward/GPU/AMDGPU 243625 ns 237703 ns 1.02
layernorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/2 thread(s) 27270.5 ns 24833.5 ns 1.10
layernorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/4 thread(s) 27333 ns 27042 ns 1.01
layernorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/8 thread(s) 26979 ns 27166.5 ns 0.99
layernorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/1 thread(s) 27916.5 ns 27125 ns 1.03
layernorm(2, act=gelu, affine=true)(32 x 32)/zygote/GPU/CUDA 988689 ns 984196 ns 1.00
layernorm(2, act=gelu, affine=true)(32 x 32)/zygote/GPU/oneAPI 41332749 ns 40719457 ns 1.02
layernorm(2, act=gelu, affine=true)(32 x 32)/zygote/GPU/Metal 5697083 ns 5828333 ns 0.98
layernorm(2, act=gelu, affine=true)(32 x 32)/zygote/GPU/AMDGPU 730974 ns 714022 ns 1.02
groupnorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/2 thread(s) 11542 ns 11562.5 ns 1.00
groupnorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/4 thread(s) 12250 ns 10375 ns 1.18
groupnorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/8 thread(s) 11750 ns 12083 ns 0.97
groupnorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/1 thread(s) 10583 ns 11083 ns 0.95
groupnorm(2, act=gelu, affine=true)(32 x 32)/forward/GPU/CUDA 124848 ns 124895.5 ns 1.00
groupnorm(2, act=gelu, affine=true)(32 x 32)/forward/GPU/oneAPI 3576014.5 ns 3575871 ns 1.00
groupnorm(2, act=gelu, affine=true)(32 x 32)/forward/GPU/Metal 857292 ns 912833 ns 0.94
groupnorm(2, act=gelu, affine=true)(32 x 32)/forward/GPU/AMDGPU 246370.5 ns 242943 ns 1.01
groupnorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/2 thread(s) 21167 ns 21125 ns 1.00
groupnorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/4 thread(s) 22562 ns 21917 ns 1.03
groupnorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/8 thread(s) 22292 ns 22000 ns 1.01
groupnorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/1 thread(s) 22041 ns 21416 ns 1.03
groupnorm(2, act=gelu, affine=true)(32 x 32)/zygote/GPU/CUDA 707198 ns 706086.5 ns 1.00
groupnorm(2, act=gelu, affine=true)(32 x 32)/zygote/GPU/oneAPI 23127801 ns 21428227.5 ns 1.08
groupnorm(2, act=gelu, affine=true)(32 x 32)/zygote/GPU/Metal 5332000 ns 5387146 ns 0.99
groupnorm(2, act=gelu, affine=true)(32 x 32)/zygote/GPU/AMDGPU 697815 ns 673547 ns 1.04
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 63500 ns 64000.5 ns 0.99
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 63187.5 ns 63500 ns 1.00
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 67604 ns 66166 ns 1.02
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 66000 ns 62584 ns 1.05
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/CUDA 106420 ns 105629.5 ns 1.01
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/oneAPI 3379318 ns 3434086.5 ns 0.98
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/Metal 1310604 ns 1323250 ns 0.99
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/AMDGPU 244896 ns 237572 ns 1.03
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 438125 ns 448750 ns 0.98
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 455375 ns 437958 ns 1.04
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 478166 ns 446666 ns 1.07
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 451334 ns 449583 ns 1.00
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 518775.5 ns 517219 ns 1.00
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/oneAPI 20304887 ns 21208755 ns 0.96
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/Metal 6140333 ns 5978042 ns 1.03
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/AMDGPU 757127 ns 730458 ns 1.04
layernorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/2 thread(s) 7979 ns 6958.5 ns 1.15
layernorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/4 thread(s) 8041.5 ns 6833 ns 1.18
layernorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/8 thread(s) 7500 ns 8041 ns 0.93
layernorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/1 thread(s) 7604 ns 7771 ns 0.98
layernorm(2, act=identity, affine=true)(32 x 32)/forward/GPU/CUDA 146523.5 ns 145909.5 ns 1.00
layernorm(2, act=identity, affine=true)(32 x 32)/forward/GPU/oneAPI 5590898 ns 5602766 ns 1.00
layernorm(2, act=identity, affine=true)(32 x 32)/forward/GPU/Metal 435708 ns 628395.5 ns 0.69
layernorm(2, act=identity, affine=true)(32 x 32)/forward/GPU/AMDGPU 72661 ns 58991 ns 1.23
layernorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/2 thread(s) 13333 ns 14042 ns 0.95
layernorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/4 thread(s) 16917 ns 15750 ns 1.07
layernorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/8 thread(s) 15270.5 ns 13917 ns 1.10
layernorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/1 thread(s) 15083.5 ns 13479 ns 1.12
layernorm(2, act=identity, affine=true)(32 x 32)/zygote/GPU/CUDA 956333 ns 954313 ns 1.00
layernorm(2, act=identity, affine=true)(32 x 32)/zygote/GPU/oneAPI 38910703 ns 38432249.5 ns 1.01
layernorm(2, act=identity, affine=true)(32 x 32)/zygote/GPU/Metal 5504417 ns 5549500 ns 0.99
layernorm(2, act=identity, affine=true)(32 x 32)/zygote/GPU/AMDGPU 444671 ns 404584 ns 1.10
batchedmm(512, Bsize=4)/forward/CPU/2 thread(s) 6153583 ns 6160416 ns 1.00
batchedmm(512, Bsize=4)/forward/CPU/4 thread(s) 6374041 ns 6378167 ns 1.00
batchedmm(512, Bsize=4)/forward/CPU/8 thread(s) 6373813 ns 3224791.5 ns 1.98
batchedmm(512, Bsize=4)/forward/CPU/1 thread(s) 11924250 ns 11924000 ns 1.00
batchedmm(512, Bsize=4)/forward/GPU/CUDA 350545.5 ns 301800.5 ns 1.16
batchedmm(512, Bsize=4)/forward/GPU/AMDGPU 305468 ns 294983 ns 1.04
batchedmm(512, Bsize=4)/zygote/CPU/2 thread(s) 19103895.5 ns 19104958 ns 1.00
batchedmm(512, Bsize=4)/zygote/CPU/4 thread(s) 19992375 ns 19957229 ns 1.00
batchedmm(512, Bsize=4)/zygote/CPU/8 thread(s) 19908604 ns 11123708.5 ns 1.79
batchedmm(512, Bsize=4)/zygote/CPU/1 thread(s) 36548541.5 ns 36532604 ns 1.00
batchedmm(512, Bsize=4)/zygote/GPU/CUDA 1015793.5 ns 1023618 ns 0.99
batchedmm(512, Bsize=4)/zygote/GPU/AMDGPU 1217321 ns 1158122 ns 1.05
dense(2, bias=true, act=gelu)(2 x 128)/forward/CPU/2 thread(s) 959 ns 917 ns 1.05
dense(2, bias=true, act=gelu)(2 x 128)/forward/CPU/4 thread(s) 1000 ns 958 ns 1.04
dense(2, bias=true, act=gelu)(2 x 128)/forward/CPU/8 thread(s) 1000 ns 958 ns 1.04
dense(2, bias=true, act=gelu)(2 x 128)/forward/CPU/1 thread(s) 1000 ns 959 ns 1.04
dense(2, bias=true, act=gelu)(2 x 128)/forward/GPU/CUDA 23465 ns 23554 ns 1.00
dense(2, bias=true, act=gelu)(2 x 128)/forward/GPU/oneAPI 2075509 ns 2143802 ns 0.97
dense(2, bias=true, act=gelu)(2 x 128)/forward/GPU/Metal 318917 ns 316188 ns 1.01
dense(2, bias=true, act=gelu)(2 x 128)/forward/GPU/AMDGPU 221376 ns 215672 ns 1.03
dense(2, bias=true, act=gelu)(2 x 128)/zygote/CPU/2 thread(s) 3667 ns 3625 ns 1.01
dense(2, bias=true, act=gelu)(2 x 128)/zygote/CPU/4 thread(s) 3792 ns 3667 ns 1.03
dense(2, bias=true, act=gelu)(2 x 128)/zygote/CPU/8 thread(s) 3750 ns 3666 ns 1.02
dense(2, bias=true, act=gelu)(2 x 128)/zygote/CPU/1 thread(s) 3625 ns 3666 ns 0.99
dense(2, bias=true, act=gelu)(2 x 128)/zygote/GPU/CUDA 279973 ns 283503 ns 0.99
dense(2, bias=true, act=gelu)(2 x 128)/zygote/GPU/oneAPI 10954168 ns 11257238 ns 0.97
dense(2, bias=true, act=gelu)(2 x 128)/zygote/GPU/Metal 2079042 ns 2086333.5 ns 1.00
dense(2, bias=true, act=gelu)(2 x 128)/zygote/GPU/AMDGPU 643436.5 ns 637297 ns 1.01
groupnorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/2 thread(s) 8000.5 ns 8000 ns 1.00
groupnorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/4 thread(s) 9292 ns 7958 ns 1.17
groupnorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/8 thread(s) 9395.5 ns 9042 ns 1.04
groupnorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/1 thread(s) 8000.5 ns 7854 ns 1.02
groupnorm(2, act=relu, affine=true)(4 x 32)/forward/GPU/CUDA 120927.5 ns 120818.5 ns 1.00
groupnorm(2, act=relu, affine=true)(4 x 32)/forward/GPU/oneAPI 3366103 ns 3517154 ns 0.96
groupnorm(2, act=relu, affine=true)(4 x 32)/forward/GPU/Metal 786604.5 ns 776959 ns 1.01
groupnorm(2, act=relu, affine=true)(4 x 32)/forward/GPU/AMDGPU 72332 ns 67641 ns 1.07
groupnorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/2 thread(s) 11875 ns 11729.5 ns 1.01
groupnorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/4 thread(s) 12417 ns 12250 ns 1.01
groupnorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/8 thread(s) 13125 ns 12334 ns 1.06
groupnorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/1 thread(s) 12229.5 ns 12458.5 ns 0.98
groupnorm(2, act=relu, affine=true)(4 x 32)/zygote/GPU/CUDA 645035 ns 643501 ns 1.00
groupnorm(2, act=relu, affine=true)(4 x 32)/zygote/GPU/oneAPI 21375165.5 ns 21447178 ns 1.00
groupnorm(2, act=relu, affine=true)(4 x 32)/zygote/GPU/Metal 4983459 ns 5189125.5 ns 0.96
groupnorm(2, act=relu, affine=true)(4 x 32)/zygote/GPU/AMDGPU 388705.5 ns 365334 ns 1.06
dense(2, bias=false, act=relu)(2 x 128)/forward/CPU/2 thread(s) 291 ns 250 ns 1.16
dense(2, bias=false, act=relu)(2 x 128)/forward/CPU/4 thread(s) 333 ns 291 ns 1.14
dense(2, bias=false, act=relu)(2 x 128)/forward/CPU/8 thread(s) 292 ns 292 ns 1
dense(2, bias=false, act=relu)(2 x 128)/forward/CPU/1 thread(s) 292 ns 291 ns 1.00
dense(2, bias=false, act=relu)(2 x 128)/forward/GPU/CUDA 22899 ns 22596 ns 1.01
dense(2, bias=false, act=relu)(2 x 128)/forward/GPU/oneAPI 2132112.5 ns 1951713 ns 1.09
dense(2, bias=false, act=relu)(2 x 128)/forward/GPU/Metal 224458 ns 225750 ns 0.99
dense(2, bias=false, act=relu)(2 x 128)/forward/GPU/AMDGPU 53171 ns 52251 ns 1.02
dense(2, bias=false, act=relu)(2 x 128)/zygote/CPU/2 thread(s) 2875 ns 3041 ns 0.95
dense(2, bias=false, act=relu)(2 x 128)/zygote/CPU/4 thread(s) 3042 ns 3208 ns 0.95
dense(2, bias=false, act=relu)(2 x 128)/zygote/CPU/8 thread(s) 3208 ns 3375 ns 0.95
dense(2, bias=false, act=relu)(2 x 128)/zygote/CPU/1 thread(s) 3083 ns 3042 ns 1.01
dense(2, bias=false, act=relu)(2 x 128)/zygote/GPU/CUDA 204868.5 ns 204741 ns 1.00
dense(2, bias=false, act=relu)(2 x 128)/zygote/GPU/oneAPI 10211479.5 ns 9227567 ns 1.11
dense(2, bias=false, act=relu)(2 x 128)/zygote/GPU/Metal 1621667 ns 1619250 ns 1.00
dense(2, bias=false, act=relu)(2 x 128)/zygote/GPU/AMDGPU 168474.5 ns 172842 ns 0.97
groupnorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/2 thread(s) 12042 ns 11250 ns 1.07
groupnorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/4 thread(s) 11458 ns 11334 ns 1.01
groupnorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/8 thread(s) 12958 ns 13125 ns 0.99
groupnorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/1 thread(s) 11375 ns 11458 ns 0.99
groupnorm(2, act=gelu, affine=false)(32 x 32)/forward/GPU/CUDA 123020.5 ns 121547.5 ns 1.01
groupnorm(2, act=gelu, affine=false)(32 x 32)/forward/GPU/oneAPI 3354460 ns 3353104 ns 1.00
groupnorm(2, act=gelu, affine=false)(32 x 32)/forward/GPU/Metal 888334 ns 869041 ns 1.02
groupnorm(2, act=gelu, affine=false)(32 x 32)/forward/GPU/AMDGPU 244727 ns 243193 ns 1.01
groupnorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/2 thread(s) 20625 ns 22000 ns 0.94
groupnorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/4 thread(s) 22875.5 ns 20583 ns 1.11
groupnorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/8 thread(s) 23937 ns 21167 ns 1.13
groupnorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/1 thread(s) 21395.5 ns 20791 ns 1.03
groupnorm(2, act=gelu, affine=false)(32 x 32)/zygote/GPU/CUDA 601559 ns 598450 ns 1.01
groupnorm(2, act=gelu, affine=false)(32 x 32)/zygote/GPU/oneAPI 19862650.5 ns 19931223.5 ns 1.00
groupnorm(2, act=gelu, affine=false)(32 x 32)/zygote/GPU/Metal 4446729 ns 4695229 ns 0.95
groupnorm(2, act=gelu, affine=false)(32 x 32)/zygote/GPU/AMDGPU 689959 ns 652706.5 ns 1.06
dense(32, bias=true, act=relu)(32 x 128)/forward/CPU/2 thread(s) 4375 ns 4375 ns 1
dense(32, bias=true, act=relu)(32 x 128)/forward/CPU/4 thread(s) 4458 ns 4417 ns 1.01
dense(32, bias=true, act=relu)(32 x 128)/forward/CPU/8 thread(s) 4375 ns 4416 ns 0.99
dense(32, bias=true, act=relu)(32 x 128)/forward/CPU/1 thread(s) 4375 ns 4416 ns 0.99
dense(32, bias=true, act=relu)(32 x 128)/forward/GPU/CUDA 24235 ns 24359 ns 0.99
dense(32, bias=true, act=relu)(32 x 128)/forward/GPU/oneAPI 2133456 ns 2166080 ns 0.98
dense(32, bias=true, act=relu)(32 x 128)/forward/GPU/Metal 221791 ns 223833 ns 0.99
dense(32, bias=true, act=relu)(32 x 128)/forward/GPU/AMDGPU 58031 ns 52541 ns 1.10
dense(32, bias=true, act=relu)(32 x 128)/zygote/CPU/2 thread(s) 16500 ns 16667 ns 0.99
dense(32, bias=true, act=relu)(32 x 128)/zygote/CPU/4 thread(s) 16583 ns 16500 ns 1.01
dense(32, bias=true, act=relu)(32 x 128)/zygote/CPU/8 thread(s) 16708 ns 16375 ns 1.02
dense(32, bias=true, act=relu)(32 x 128)/zygote/CPU/1 thread(s) 16417 ns 16333 ns 1.01
dense(32, bias=true, act=relu)(32 x 128)/zygote/GPU/CUDA 332460.5 ns 331128 ns 1.00
dense(32, bias=true, act=relu)(32 x 128)/zygote/GPU/oneAPI 12635929.5 ns 12599810 ns 1.00
dense(32, bias=true, act=relu)(32 x 128)/zygote/GPU/Metal 1070583 ns 1647875.5 ns 0.65
dense(32, bias=true, act=relu)(32 x 128)/zygote/GPU/AMDGPU 227946 ns 212037.5 ns 1.08
batchnorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/2 thread(s) 2083 ns 1959 ns 1.06
batchnorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/4 thread(s) 2167 ns 2083 ns 1.04
batchnorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/8 thread(s) 2208 ns 1958 ns 1.13
batchnorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/1 thread(s) 2083 ns 1958 ns 1.06
batchnorm(2, act=gelu, affine=false)(32 x 32)/forward/GPU/CUDA 36306 ns 35684 ns 1.02
batchnorm(2, act=gelu, affine=false)(32 x 32)/forward/GPU/oneAPI 1295447 ns 1146851 ns 1.13
batchnorm(2, act=gelu, affine=false)(32 x 32)/forward/GPU/Metal 445333 ns 441458.5 ns 1.01
batchnorm(2, act=gelu, affine=false)(32 x 32)/forward/GPU/AMDGPU 212122.5 ns 206802 ns 1.03
batchnorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/2 thread(s) 17479 ns 16645.5 ns 1.05
batchnorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/4 thread(s) 18583 ns 16750 ns 1.11
batchnorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/8 thread(s) 17562.5 ns 16562.5 ns 1.06
batchnorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/1 thread(s) 16375 ns 17208.5 ns 0.95
batchnorm(2, act=gelu, affine=false)(32 x 32)/zygote/GPU/CUDA 295313 ns 294264.5 ns 1.00
batchnorm(2, act=gelu, affine=false)(32 x 32)/zygote/GPU/oneAPI 22051076 ns 20813859 ns 1.06
batchnorm(2, act=gelu, affine=false)(32 x 32)/zygote/GPU/Metal 5278167 ns 5292083 ns 1.00
batchnorm(2, act=gelu, affine=false)(32 x 32)/zygote/GPU/AMDGPU 731771 ns 703797.5 ns 1.04
batchedmm(16, Bsize=512)/forward/CPU/2 thread(s) 59500 ns 59583.5 ns 1.00
batchedmm(16, Bsize=512)/forward/CPU/4 thread(s) 65395.5 ns 63625 ns 1.03
batchedmm(16, Bsize=512)/forward/CPU/8 thread(s) 66271 ns 62625 ns 1.06
batchedmm(16, Bsize=512)/forward/CPU/1 thread(s) 51083 ns 51292 ns 1.00
batchedmm(16, Bsize=512)/forward/GPU/CUDA 66116 ns 66405 ns 1.00
batchedmm(16, Bsize=512)/forward/GPU/AMDGPU 117509 ns 103511 ns 1.14
batchedmm(16, Bsize=512)/zygote/CPU/2 thread(s) 135271 ns 199395.5 ns 0.68
batchedmm(16, Bsize=512)/zygote/CPU/4 thread(s) 165604.5 ns 157250 ns 1.05
batchedmm(16, Bsize=512)/zygote/CPU/8 thread(s) 124646 ns 133937.5 ns 0.93
batchedmm(16, Bsize=512)/zygote/CPU/1 thread(s) 319416 ns 317729 ns 1.01
batchedmm(16, Bsize=512)/zygote/GPU/CUDA 217110 ns 216342 ns 1.00
batchedmm(16, Bsize=512)/zygote/GPU/AMDGPU 617880 ns 579316 ns 1.07
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 129792 ns 82458.5 ns 1.57
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 123708 ns 85271 ns 1.45
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 88417 ns 90209 ns 0.98
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 128333 ns 140417 ns 0.91
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/GPU/CUDA 192466 ns 192334 ns 1.00
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/GPU/oneAPI 5745668.5 ns 5533381 ns 1.04
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/GPU/Metal 1821459 ns 1893708 ns 0.96
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/GPU/AMDGPU 173229 ns 170101.5 ns 1.02
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 1916917 ns 1851687.5 ns 1.04
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 1914042 ns 1882334 ns 1.02
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 1900791 ns 1926500 ns 0.99
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 1910792 ns 1891958.5 ns 1.01
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 534488 ns 532324 ns 1.00
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/oneAPI 26461012 ns 25979046 ns 1.02
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/Metal 9244791 ns 9683125 ns 0.95
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/AMDGPU 1111539.5 ns 1080090 ns 1.03
dense(2, bias=true, act=identity)(2 x 128)/forward/CPU/2 thread(s) 292 ns 291 ns 1.00
dense(2, bias=true, act=identity)(2 x 128)/forward/CPU/4 thread(s) 292 ns 292 ns 1
dense(2, bias=true, act=identity)(2 x 128)/forward/CPU/8 thread(s) 292 ns 291 ns 1.00
dense(2, bias=true, act=identity)(2 x 128)/forward/CPU/1 thread(s) 291 ns 292 ns 1.00
dense(2, bias=true, act=identity)(2 x 128)/forward/GPU/CUDA 21568 ns 21761 ns 0.99
dense(2, bias=true, act=identity)(2 x 128)/forward/GPU/oneAPI 2163626 ns 2115738 ns 1.02
dense(2, bias=true, act=identity)(2 x 128)/forward/GPU/Metal 348458 ns 346875 ns 1.00
dense(2, bias=true, act=identity)(2 x 128)/forward/GPU/AMDGPU 48829 ns 45220 ns 1.08
dense(2, bias=true, act=identity)(2 x 128)/zygote/CPU/2 thread(s) 1833 ns 1792 ns 1.02
dense(2, bias=true, act=identity)(2 x 128)/zygote/CPU/4 thread(s) 1875 ns 1750 ns 1.07
dense(2, bias=true, act=identity)(2 x 128)/zygote/CPU/8 thread(s) 1875 ns 1792 ns 1.05
dense(2, bias=true, act=identity)(2 x 128)/zygote/CPU/1 thread(s) 1792 ns 1792 ns 1
dense(2, bias=true, act=identity)(2 x 128)/zygote/GPU/CUDA 252538 ns 253104 ns 1.00
dense(2, bias=true, act=identity)(2 x 128)/zygote/GPU/oneAPI 9959798 ns 9490240.5 ns 1.05
dense(2, bias=true, act=identity)(2 x 128)/zygote/GPU/Metal 1489771 ns 1088979 ns 1.37
dense(2, bias=true, act=identity)(2 x 128)/zygote/GPU/AMDGPU 206024.5 ns 187502 ns 1.10
groupnorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/2 thread(s) 9042 ns 8084 ns 1.12
groupnorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/4 thread(s) 9542 ns 8438 ns 1.13
groupnorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/8 thread(s) 11000.5 ns 10875 ns 1.01
groupnorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/1 thread(s) 11083 ns 8209 ns 1.35
groupnorm(2, act=gelu, affine=false)(4 x 32)/forward/GPU/CUDA 119681.5 ns 119061 ns 1.01
groupnorm(2, act=gelu, affine=false)(4 x 32)/forward/GPU/oneAPI 3343318 ns 3459549.5 ns 0.97
groupnorm(2, act=gelu, affine=false)(4 x 32)/forward/GPU/Metal 860042 ns 880209 ns 0.98
groupnorm(2, act=gelu, affine=false)(4 x 32)/forward/GPU/AMDGPU 245750 ns 237872 ns 1.03
groupnorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/2 thread(s) 9125 ns 10167 ns 0.90
groupnorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/4 thread(s) 11104 ns 9208 ns 1.21
groupnorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/8 thread(s) 12270.5 ns 9500 ns 1.29
groupnorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/1 thread(s) 9104 ns 9167 ns 0.99
groupnorm(2, act=gelu, affine=false)(4 x 32)/zygote/GPU/CUDA 528697 ns 527070 ns 1.00
groupnorm(2, act=gelu, affine=false)(4 x 32)/zygote/GPU/oneAPI 19440576 ns 18222497.5 ns 1.07
groupnorm(2, act=gelu, affine=false)(4 x 32)/zygote/GPU/Metal 4311875 ns 4417458 ns 0.98
groupnorm(2, act=gelu, affine=false)(4 x 32)/zygote/GPU/AMDGPU 666771 ns 634411 ns 1.05
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 58000 ns 58417 ns 0.99
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 47084 ns 46333 ns 1.02
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 46333 ns 39500 ns 1.17
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 83584 ns 84083 ns 0.99
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/GPU/CUDA 40198.5 ns 39770 ns 1.01
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/GPU/oneAPI 1413057.5 ns 1341281.5 ns 1.05
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/GPU/Metal 1100291.5 ns 1100583.5 ns 1.00
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/GPU/AMDGPU 79400.5 ns 75935.5 ns 1.05
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 1925417 ns 1901542 ns 1.01
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 1984708 ns 1921833.5 ns 1.03
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 1971416 ns 1955833 ns 1.01
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 1889167 ns 1881792 ns 1.00
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 221269 ns 221320 ns 1.00
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/oneAPI 34580695 ns 33766076 ns 1.02
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/Metal 11340000 ns 11588792 ns 0.98
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/AMDGPU 1053058 ns 1036440 ns 1.02
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 418729 ns 415958 ns 1.01
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 419792 ns 420042 ns 1.00
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 420833 ns 419875 ns 1.00
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 417875 ns 418708 ns 1.00
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/CUDA 211247 ns 210156.5 ns 1.01
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/oneAPI 7703163.5 ns 7606443 ns 1.01
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/Metal 523750 ns 522750 ns 1.00
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/AMDGPU 291621 ns 287858 ns 1.01
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 680812.5 ns 764709 ns 0.89
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 671896 ns 781812 ns 0.86
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 675250 ns 753417 ns 0.90
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 792167 ns 678791.5 ns 1.17
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 1061378 ns 1059447 ns 1.00
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/oneAPI 45261299.5 ns 43854665.5 ns 1.03
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/Metal 6367250 ns 6323063 ns 1.01
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/AMDGPU 972305 ns 916300 ns 1.06
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 3461166.5 ns 3425978.5 ns 1.01
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 3461270.5 ns 3451792 ns 1.00
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 3443750 ns 3458979.5 ns 1.00
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 3438417 ns 3412708 ns 1.01
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/CUDA 172022 ns 170950 ns 1.01
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/oneAPI 8433919 ns 8189493 ns 1.03
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/Metal 1372979 ns 1396875 ns 0.98
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/AMDGPU 422223 ns 435150 ns 0.97
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 6199166 ns 6194166.5 ns 1.00
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 6213333 ns 6230791.5 ns 1.00
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 6204084 ns 6222854 ns 1.00
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 6232604.5 ns 6218875 ns 1.00
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 1006628 ns 1001834 ns 1.00
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/oneAPI 51512599 ns 49254606 ns 1.05
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/Metal 7379313 ns 8528604 ns 0.87
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/AMDGPU 1595282 ns 1556125 ns 1.03
dense(512, bias=true, act=gelu)(512 x 128)/forward/CPU/2 thread(s) 474750 ns 472667 ns 1.00
dense(512, bias=true, act=gelu)(512 x 128)/forward/CPU/4 thread(s) 341000 ns 339875 ns 1.00
dense(512, bias=true, act=gelu)(512 x 128)/forward/CPU/8 thread(s) 340375 ns 253208 ns 1.34
dense(512, bias=true, act=gelu)(512 x 128)/forward/CPU/1 thread(s) 903500 ns 902000 ns 1.00
dense(512, bias=true, act=gelu)(512 x 128)/forward/GPU/CUDA 46814 ns 46534 ns 1.01
dense(512, bias=true, act=gelu)(512 x 128)/forward/GPU/oneAPI 863564 ns 886552 ns 0.97
dense(512, bias=true, act=gelu)(512 x 128)/forward/GPU/Metal 430166 ns 478875 ns 0.90
dense(512, bias=true, act=gelu)(512 x 128)/forward/GPU/AMDGPU 252863 ns 249963 ns 1.01
dense(512, bias=true, act=gelu)(512 x 128)/zygote/CPU/2 thread(s) 2330583 ns 2333750 ns 1.00
dense(512, bias=true, act=gelu)(512 x 128)/zygote/CPU/4 thread(s) 2039374.5 ns 2036625 ns 1.00
dense(512, bias=true, act=gelu)(512 x 128)/zygote/CPU/8 thread(s) 2036396 ns 1763167 ns 1.15
dense(512, bias=true, act=gelu)(512 x 128)/zygote/CPU/1 thread(s) 3200125 ns 3203312 ns 1.00
dense(512, bias=true, act=gelu)(512 x 128)/zygote/GPU/CUDA 269438 ns 258879 ns 1.04
dense(512, bias=true, act=gelu)(512 x 128)/zygote/GPU/oneAPI 14982086 ns 13032420 ns 1.15
dense(512, bias=true, act=gelu)(512 x 128)/zygote/GPU/Metal 2160770.5 ns 2178375 ns 0.99
dense(512, bias=true, act=gelu)(512 x 128)/zygote/GPU/AMDGPU 790237 ns 787818 ns 1.00
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 57395.5 ns 57542 ns 1.00
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 46042 ns 45875 ns 1.00
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 45959 ns 39458 ns 1.16
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 82958 ns 83791 ns 0.99
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/CUDA 28489 ns 28376 ns 1.00
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/oneAPI 1441103.5 ns 1391893 ns 1.04
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/Metal 1102062 ns 1124083 ns 0.98
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/AMDGPU 79270.5 ns 77840.5 ns 1.02
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 2027000 ns 2032250 ns 1.00
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 2084854 ns 2093187.5 ns 1.00
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 2075437.5 ns 2091917 ns 0.99
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 1975063 ns 1972229.5 ns 1.00
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 234775 ns 235913 ns 1.00
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/oneAPI 35760405 ns 35452366 ns 1.01
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/Metal 11437417 ns 11558395.5 ns 0.99
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/AMDGPU 1093992 ns 1056250.5 ns 1.04
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 57416 ns 57708 ns 0.99
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 46583 ns 46625 ns 1.00
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 46375 ns 39875 ns 1.16
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 83333 ns 83916.5 ns 0.99
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/GPU/CUDA 49617 ns 49455 ns 1.00
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/GPU/oneAPI 771905.5 ns 809068 ns 0.95
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/GPU/Metal 1082958 ns 1084875 ns 1.00
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/GPU/AMDGPU 78170 ns 72105.5 ns 1.08
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 1875541 ns 1921083 ns 0.98
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 1977333 ns 1945916.5 ns 1.02
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 1974500 ns 1974729.5 ns 1.00
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 1878333 ns 1864791 ns 1.01
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 239355 ns 238800.5 ns 1.00
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/oneAPI 16395409.5 ns 17238198 ns 0.95
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/Metal 9631250 ns 10023791.5 ns 0.96
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/AMDGPU 975732 ns 934629 ns 1.04
batchnorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/2 thread(s) 333 ns 291 ns 1.14
batchnorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/4 thread(s) 375 ns 292 ns 1.28
batchnorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/8 thread(s) 417 ns 333 ns 1.25
batchnorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/1 thread(s) 333 ns 333 ns 1
batchnorm(2, act=identity, affine=false)(4 x 32)/forward/GPU/CUDA 34934 ns 34886 ns 1.00
batchnorm(2, act=identity, affine=false)(4 x 32)/forward/GPU/oneAPI 1184614.5 ns 1200155 ns 0.99
batchnorm(2, act=identity, affine=false)(4 x 32)/forward/GPU/Metal 274562.5 ns 279833 ns 0.98
batchnorm(2, act=identity, affine=false)(4 x 32)/forward/GPU/AMDGPU 51811 ns 48281 ns 1.07
batchnorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/2 thread(s) 6708 ns 6792 ns 0.99
batchnorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/4 thread(s) 7084 ns 6208.5 ns 1.14
batchnorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/8 thread(s) 7291.5 ns 7000 ns 1.04
batchnorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/1 thread(s) 7000 ns 6667 ns 1.05
batchnorm(2, act=identity, affine=false)(4 x 32)/zygote/GPU/CUDA 210987 ns 212384.5 ns 0.99
batchnorm(2, act=identity, affine=false)(4 x 32)/zygote/GPU/oneAPI 20992732 ns 19751565 ns 1.06
batchnorm(2, act=identity, affine=false)(4 x 32)/zygote/GPU/Metal 4248354.5 ns 5078916.5 ns 0.84
batchnorm(2, act=identity, affine=false)(4 x 32)/zygote/GPU/AMDGPU 406385 ns 379104 ns 1.07
dense(2, bias=false, act=identity)(2 x 128)/forward/CPU/2 thread(s) 250 ns 250 ns 1
dense(2, bias=false, act=identity)(2 x 128)/forward/CPU/4 thread(s) 292 ns 250 ns 1.17
dense(2, bias=false, act=identity)(2 x 128)/forward/CPU/8 thread(s) 292 ns 291 ns 1.00
dense(2, bias=false, act=identity)(2 x 128)/forward/CPU/1 thread(s) 250 ns 250 ns 1
dense(2, bias=false, act=identity)(2 x 128)/forward/GPU/CUDA 32512 ns 32763 ns 0.99
dense(2, bias=false, act=identity)(2 x 128)/forward/GPU/oneAPI 1266942 ns 1167700 ns 1.08
dense(2, bias=false, act=identity)(2 x 128)/forward/GPU/Metal 249625 ns 253542 ns 0.98
dense(2, bias=false, act=identity)(2 x 128)/forward/GPU/AMDGPU 46045.5 ns 41150 ns 1.12
dense(2, bias=false, act=identity)(2 x 128)/zygote/CPU/2 thread(s) 3000 ns 3833 ns 0.78
dense(2, bias=false, act=identity)(2 x 128)/zygote/CPU/4 thread(s) 3083 ns 3041 ns 1.01
dense(2, bias=false, act=identity)(2 x 128)/zygote/CPU/8 thread(s) 3042 ns 3375 ns 0.90
dense(2, bias=false, act=identity)(2 x 128)/zygote/CPU/1 thread(s) 2875 ns 3125 ns 0.92
dense(2, bias=false, act=identity)(2 x 128)/zygote/GPU/CUDA 189582.5 ns 190584.5 ns 0.99
dense(2, bias=false, act=identity)(2 x 128)/zygote/GPU/oneAPI 7324802.5 ns 7912209 ns 0.93
dense(2, bias=false, act=identity)(2 x 128)/zygote/GPU/Metal 935896 ns 1265542 ns 0.74
dense(2, bias=false, act=identity)(2 x 128)/zygote/GPU/AMDGPU 180597.5 ns 153656.5 ns 1.18
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 455354.5 ns 454937 ns 1.00
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 448250 ns 454750 ns 0.99
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 428521 ns 458229 ns 0.94
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 445375 ns 427188 ns 1.04
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/CUDA 137896 ns 138010.5 ns 1.00
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/oneAPI 5931991 ns 5819207 ns 1.02
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/Metal 1997916 ns 2011000 ns 0.99
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/AMDGPU 395586 ns 325693 ns 1.21
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 3804104 ns 3801708.5 ns 1.00
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 3808416 ns 3811125 ns 1.00
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 3806541.5 ns 3821292 ns 1.00
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 3799792 ns 3815375 ns 1.00
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 711866.5 ns 710674 ns 1.00
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/oneAPI 32500206 ns 32043185 ns 1.01
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/Metal 11123208.5 ns 10832625.5 ns 1.03
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/AMDGPU 1305610 ns 1491590 ns 0.88
batchedmm(512, Bsize=32)/forward/CPU/2 thread(s) 49878875 ns 49856479 ns 1.00
batchedmm(512, Bsize=32)/forward/CPU/4 thread(s) 35553750 ns 35516042 ns 1.00
batchedmm(512, Bsize=32)/forward/CPU/8 thread(s) 35552083 ns 26022291 ns 1.37
batchedmm(512, Bsize=32)/forward/CPU/1 thread(s) 97154833 ns 97102959 ns 1.00
batchedmm(512, Bsize=32)/forward/GPU/CUDA 1599254.5 ns 1594251.5 ns 1.00
batchedmm(512, Bsize=32)/forward/GPU/AMDGPU 1094408 ns 1009650 ns 1.08
batchedmm(512, Bsize=32)/zygote/CPU/2 thread(s) 154338208.5 ns 154623520.5 ns 1.00
batchedmm(512, Bsize=32)/zygote/CPU/4 thread(s) 112497125 ns 112350625 ns 1.00
batchedmm(512, Bsize=32)/zygote/CPU/8 thread(s) 112457084 ns 89065125 ns 1.26
batchedmm(512, Bsize=32)/zygote/CPU/1 thread(s) 295404416.5 ns 296081125 ns 1.00
batchedmm(512, Bsize=32)/zygote/GPU/CUDA 6447376 ns 6489845.5 ns 0.99
batchedmm(512, Bsize=32)/zygote/GPU/AMDGPU 5620571 ns 5556104 ns 1.01
bias_activation(32, act=tanh)(32 x 128)/forward/CPU/2 thread(s) 19563 ns 17312.5 ns 1.13
bias_activation(32, act=tanh)(32 x 128)/forward/CPU/4 thread(s) 17833 ns 16834 ns 1.06
bias_activation(32, act=tanh)(32 x 128)/forward/CPU/8 thread(s) 16833 ns 14291.5 ns 1.18
bias_activation(32, act=tanh)(32 x 128)/forward/CPU/1 thread(s) 15458.5 ns 15167 ns 1.02
bias_activation(32, act=tanh)(32 x 128)/forward/GPU/CUDA 21582 ns 21687 ns 1.00
bias_activation(32, act=tanh)(32 x 128)/forward/GPU/oneAPI 1105119.5 ns 1157478.5 ns 0.95
bias_activation(32, act=tanh)(32 x 128)/forward/GPU/Metal 219625 ns 218167 ns 1.01
bias_activation(32, act=tanh)(32 x 128)/forward/GPU/AMDGPU 29820 ns 27541 ns 1.08
bias_activation(32, act=tanh)(32 x 128)/zygote/CPU/2 thread(s) 10792 ns 11042 ns 0.98
bias_activation(32, act=tanh)(32 x 128)/zygote/CPU/4 thread(s) 8958 ns 9000.5 ns 1.00
bias_activation(32, act=tanh)(32 x 128)/zygote/CPU/8 thread(s) 9146 ns 7875 ns 1.16
bias_activation(32, act=tanh)(32 x 128)/zygote/CPU/1 thread(s) 17479 ns 17416.5 ns 1.00
bias_activation(32, act=tanh)(32 x 128)/zygote/GPU/CUDA 261508.5 ns 261161 ns 1.00
bias_activation(32, act=tanh)(32 x 128)/zygote/GPU/oneAPI 10504088.5 ns 9552185 ns 1.10
bias_activation(32, act=tanh)(32 x 128)/zygote/GPU/Metal 1527687.5 ns 1560042 ns 0.98
bias_activation(32, act=tanh)(32 x 128)/zygote/GPU/AMDGPU 159993 ns 155181 ns 1.03
groupnorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/2 thread(s) 8916.5 ns 8125 ns 1.10
groupnorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/4 thread(s) 8958 ns 8084 ns 1.11
groupnorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/8 thread(s) 10541 ns 10083.5 ns 1.05
groupnorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/1 thread(s) 9084 ns 8542 ns 1.06
groupnorm(2, act=gelu, affine=true)(4 x 32)/forward/GPU/CUDA 116829.5 ns 116504 ns 1.00
groupnorm(2, act=gelu, affine=true)(4 x 32)/forward/GPU/oneAPI 3509968 ns 3349407.5 ns 1.05
groupnorm(2, act=gelu, affine=true)(4 x 32)/forward/GPU/Metal 790562.5 ns 798667 ns 0.99
groupnorm(2, act=gelu, affine=true)(4 x 32)/forward/GPU/AMDGPU 248174 ns 238952.5 ns 1.04
groupnorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/2 thread(s) 9541.5 ns 9854 ns 0.97
groupnorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/4 thread(s) 9833 ns 10229.5 ns 0.96
groupnorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/8 thread(s) 10375 ns 10083 ns 1.03
groupnorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/1 thread(s) 9604.5 ns 9958 ns 0.96
groupnorm(2, act=gelu, affine=true)(4 x 32)/zygote/GPU/CUDA 626007 ns 623888 ns 1.00
groupnorm(2, act=gelu, affine=true)(4 x 32)/zygote/GPU/oneAPI 21525719 ns 22194230 ns 0.97
groupnorm(2, act=gelu, affine=true)(4 x 32)/zygote/GPU/Metal 4884770.5 ns 4515667 ns 1.08
groupnorm(2, act=gelu, affine=true)(4 x 32)/zygote/GPU/AMDGPU 702793 ns 656976 ns 1.07
groupnorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/2 thread(s) 10042 ns 9520.5 ns 1.05
groupnorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/4 thread(s) 10000 ns 9125 ns 1.10
groupnorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/8 thread(s) 11125 ns 11625 ns 0.96
groupnorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/1 thread(s) 9729.5 ns 9479.5 ns 1.03
groupnorm(2, act=relu, affine=false)(32 x 32)/forward/GPU/CUDA 121691.5 ns 120769 ns 1.01
groupnorm(2, act=relu, affine=false)(32 x 32)/forward/GPU/oneAPI 3278367.5 ns 3531092 ns 0.93
groupnorm(2, act=relu, affine=false)(32 x 32)/forward/GPU/Metal 870854 ns 888291 ns 0.98
groupnorm(2, act=relu, affine=false)(32 x 32)/forward/GPU/AMDGPU 79151 ns 79170 ns 1.00
groupnorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/2 thread(s) 13291 ns 14208 ns 0.94
groupnorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/4 thread(s) 14228.5 ns 13208.5 ns 1.08
groupnorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/8 thread(s) 15250 ns 16333 ns 0.93
groupnorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/1 thread(s) 15167 ns 17000 ns 0.89
groupnorm(2, act=relu, affine=false)(32 x 32)/zygote/GPU/CUDA 595764 ns 594781 ns 1.00
groupnorm(2, act=relu, affine=false)(32 x 32)/zygote/GPU/oneAPI 20421555 ns 19851682 ns 1.03
groupnorm(2, act=relu, affine=false)(32 x 32)/zygote/GPU/Metal 4551104 ns 4474458 ns 1.02
groupnorm(2, act=relu, affine=false)(32 x 32)/zygote/GPU/AMDGPU 379417 ns 357348.5 ns 1.06
batchnorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/2 thread(s) 459 ns 500 ns 0.92
batchnorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/4 thread(s) 583 ns 459 ns 1.27
batchnorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/8 thread(s) 666 ns 458 ns 1.45
batchnorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/1 thread(s) 458 ns 500 ns 0.92
batchnorm(2, act=gelu, affine=false)(4 x 32)/forward/GPU/CUDA 34966 ns 34855 ns 1.00
batchnorm(2, act=gelu, affine=false)(4 x 32)/forward/GPU/oneAPI 1172686 ns 1184802 ns 0.99
batchnorm(2, act=gelu, affine=false)(4 x 32)/forward/GPU/Metal 273437.5 ns 423042 ns 0.65
batchnorm(2, act=gelu, affine=false)(4 x 32)/forward/GPU/AMDGPU 209974 ns 209842 ns 1.00
batchnorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/2 thread(s) 7145.5 ns 7709 ns 0.93
batchnorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/4 thread(s) 7625 ns 7084 ns 1.08
batchnorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/8 thread(s) 8834 ns 7708 ns 1.15
batchnorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/1 thread(s) 8396 ns 8042 ns 1.04
batchnorm(2, act=gelu, affine=false)(4 x 32)/zygote/GPU/CUDA 232144 ns 231568.5 ns 1.00
batchnorm(2, act=gelu, affine=false)(4 x 32)/zygote/GPU/oneAPI 23454434.5 ns 22217593.5 ns 1.06
batchnorm(2, act=gelu, affine=false)(4 x 32)/zygote/GPU/Metal 4465875 ns 5660167 ns 0.79
batchnorm(2, act=gelu, affine=false)(4 x 32)/zygote/GPU/AMDGPU 709093 ns 679867 ns 1.04
bias_activation(32, act=gelu)(32 x 128)/forward/CPU/2 thread(s) 16417 ns 16042 ns 1.02
bias_activation(32, act=gelu)(32 x 128)/forward/CPU/4 thread(s) 15417 ns 15333 ns 1.01
bias_activation(32, act=gelu)(32 x 128)/forward/CPU/8 thread(s) 14666 ns 13854 ns 1.06
bias_activation(32, act=gelu)(32 x 128)/forward/CPU/1 thread(s) 10583 ns 10375 ns 1.02
bias_activation(32, act=gelu)(32 x 128)/forward/GPU/CUDA 22370 ns 22215 ns 1.01
bias_activation(32, act=gelu)(32 x 128)/forward/GPU/oneAPI 1145949.5 ns 1158702.5 ns 0.99
bias_activation(32, act=gelu)(32 x 128)/forward/GPU/Metal 198667 ns 205521 ns 0.97
bias_activation(32, act=gelu)(32 x 128)/forward/GPU/AMDGPU 192503.5 ns 194012 ns 0.99
bias_activation(32, act=gelu)(32 x 128)/zygote/CPU/2 thread(s) 31709 ns 31958 ns 0.99
bias_activation(32, act=gelu)(32 x 128)/zygote/CPU/4 thread(s) 32250 ns 32145.5 ns 1.00
bias_activation(32, act=gelu)(32 x 128)/zygote/CPU/8 thread(s) 32250 ns 32250 ns 1
bias_activation(32, act=gelu)(32 x 128)/zygote/CPU/1 thread(s) 31750 ns 32250 ns 0.98
bias_activation(32, act=gelu)(32 x 128)/zygote/GPU/CUDA 275284 ns 276502.5 ns 1.00
bias_activation(32, act=gelu)(32 x 128)/zygote/GPU/oneAPI 12020472 ns 11085623 ns 1.08
bias_activation(32, act=gelu)(32 x 128)/zygote/GPU/Metal 1736729.5 ns 1721729 ns 1.01
bias_activation(32, act=gelu)(32 x 128)/zygote/GPU/AMDGPU 607877 ns 605276.5 ns 1.00
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 440083 ns 474834 ns 0.93
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 450750 ns 445167 ns 1.01
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 445500 ns 486875 ns 0.92
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 443521 ns 474916 ns 0.93
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/CUDA 194554 ns 194410 ns 1.00
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/oneAPI 6201210 ns 5748288 ns 1.08
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/Metal 1957625 ns 2751937.5 ns 0.71
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/AMDGPU 370197 ns 326354 ns 1.13
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 3828541 ns 3823792 ns 1.00
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 3823666 ns 3824042 ns 1.00
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 3797896 ns 3849500 ns 0.99
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 3830916 ns 3847584 ns 1.00
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 542025 ns 546410 ns 0.99
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/oneAPI 27662239 ns 27926309 ns 0.99
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/Metal 9597062.5 ns 10140750 ns 0.95
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/AMDGPU 1406828 ns 1388348.5 ns 1.01
batchedmm(512, Bsize=512)/forward/CPU/2 thread(s) 835622667 ns 782652917 ns 1.07
batchedmm(512, Bsize=512)/forward/CPU/4 thread(s) 542659916 ns 542161792 ns 1.00
batchedmm(512, Bsize=512)/forward/CPU/8 thread(s) 543414375 ns 420966458.5 ns 1.29
batchedmm(512, Bsize=512)/forward/CPU/1 thread(s) 1511375063 ns 1553203729.5 ns 0.97
batchedmm(512, Bsize=512)/forward/GPU/CUDA 22747194 ns 22558411.5 ns 1.01
batchedmm(512, Bsize=512)/forward/GPU/AMDGPU 14291922 ns 14062784.5 ns 1.02
batchedmm(512, Bsize=512)/zygote/CPU/2 thread(s) 3023342458 ns 2518008250 ns 1.20
batchedmm(512, Bsize=512)/zygote/CPU/4 thread(s) 1783442458 ns 1785714792 ns 1.00
batchedmm(512, Bsize=512)/zygote/CPU/8 thread(s) 1798462750 ns 1525039667 ns 1.18
batchedmm(512, Bsize=512)/zygote/CPU/1 thread(s) 4766247958 ns 4874366334 ns 0.98
batchedmm(512, Bsize=512)/zygote/GPU/CUDA 367462031 ns 367235490 ns 1.00
batchedmm(512, Bsize=512)/zygote/GPU/AMDGPU 88049233 ns 88231178 ns 1.00
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 76166.5 ns 77646 ns 0.98
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 77000 ns 75959 ns 1.01
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 79625 ns 82625 ns 0.96
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 77458 ns 77291 ns 1.00
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/GPU/CUDA 213561 ns 208602.5 ns 1.02
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/GPU/oneAPI 7926068 ns 8336540 ns 0.95
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/GPU/Metal 520125 ns 525229 ns 0.99
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/GPU/AMDGPU 113802.5 ns 109211 ns 1.04
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 203959 ns 199042 ns 1.02
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 282979 ns 262396 ns 1.08
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 195667 ns 276625 ns 0.71
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 203125 ns 287458 ns 0.71
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 1066917 ns 1056833 ns 1.01
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/oneAPI 42732466 ns 40754174 ns 1.05
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/Metal 6048208 ns 6090583 ns 0.99
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/AMDGPU 677885 ns 646691 ns 1.05
batchedmm(512, Bsize=128)/forward/CPU/2 thread(s) 199952791.5 ns 199913000 ns 1.00
batchedmm(512, Bsize=128)/forward/CPU/4 thread(s) 139336834 ns 139280375 ns 1.00
batchedmm(512, Bsize=128)/forward/CPU/8 thread(s) 139226875 ns 104140916 ns 1.34
batchedmm(512, Bsize=128)/forward/CPU/1 thread(s) 389244000 ns 389020708 ns 1.00
batchedmm(512, Bsize=128)/forward/GPU/CUDA 5812725 ns 5827400 ns 1.00
batchedmm(512, Bsize=128)/forward/GPU/AMDGPU 3450504 ns 3419864.5 ns 1.01
batchedmm(512, Bsize=128)/zygote/CPU/2 thread(s) 619039062 ns 620313062.5 ns 1.00
batchedmm(512, Bsize=128)/zygote/CPU/4 thread(s) 439416041 ns 440225000 ns 1.00
batchedmm(512, Bsize=128)/zygote/CPU/8 thread(s) 438636083.5 ns 352767458 ns 1.24
batchedmm(512, Bsize=128)/zygote/CPU/1 thread(s) 1188607167 ns 1182963541 ns 1.00
batchedmm(512, Bsize=128)/zygote/GPU/CUDA 26431639 ns 26862507 ns 0.98
batchedmm(512, Bsize=128)/zygote/GPU/AMDGPU 21894468 ns 21755438 ns 1.01
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 7333 ns 7292 ns 1.01
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 6250 ns 6083 ns 1.03
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 6083 ns 5291 ns 1.15
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 9959 ns 10041 ns 0.99
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/GPU/CUDA 28527 ns 28028 ns 1.02
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/GPU/oneAPI 1247952 ns 1272660 ns 0.98
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/GPU/Metal 430792 ns 627458 ns 0.69
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/GPU/AMDGPU 53561 ns 48010 ns 1.12
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 243708 ns 220750 ns 1.10
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 220250 ns 220521 ns 1.00
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 222125 ns 221875 ns 1.00
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 207959 ns 209208.5 ns 0.99
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 225129 ns 222206 ns 1.01
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/oneAPI 30977876 ns 29719216 ns 1.04
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/Metal 9232312.5 ns 9434666.5 ns 0.98
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/AMDGPU 578042 ns 527475 ns 1.10
groupnorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/2 thread(s) 9396 ns 8458.5 ns 1.11
groupnorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/4 thread(s) 9500 ns 9209 ns 1.03
groupnorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/8 thread(s) 10771 ns 10375 ns 1.04
groupnorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/1 thread(s) 9583 ns 8083 ns 1.19
groupnorm(2, act=relu, affine=false)(4 x 32)/forward/GPU/CUDA 121662 ns 119377.5 ns 1.02
groupnorm(2, act=relu, affine=false)(4 x 32)/forward/GPU/oneAPI 3360687 ns 3449983 ns 0.97
groupnorm(2, act=relu, affine=false)(4 x 32)/forward/GPU/Metal 875708 ns 855000 ns 1.02
groupnorm(2, act=relu, affine=false)(4 x 32)/forward/GPU/AMDGPU 75411 ns 72520 ns 1.04
groupnorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/2 thread(s) 7208 ns 8958.5 ns 0.80
groupnorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/4 thread(s) 8125 ns 7500 ns 1.08
groupnorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/8 thread(s) 9584 ns 10084 ns 0.95
groupnorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/1 thread(s) 8917 ns 10187.5 ns 0.88
groupnorm(2, act=relu, affine=false)(4 x 32)/zygote/GPU/CUDA 529854.5 ns 521950 ns 1.02
groupnorm(2, act=relu, affine=false)(4 x 32)/zygote/GPU/oneAPI 19211637 ns 18008002 ns 1.07
groupnorm(2, act=relu, affine=false)(4 x 32)/zygote/GPU/Metal 4408249.5 ns 4315292 ns 1.02
groupnorm(2, act=relu, affine=false)(4 x 32)/zygote/GPU/AMDGPU 346897 ns 321943 ns 1.08
batchnorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/2 thread(s) 500 ns 625 ns 0.80
batchnorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/4 thread(s) 500 ns 458 ns 1.09
batchnorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/8 thread(s) 750 ns 625 ns 1.20
batchnorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/1 thread(s) 625 ns 625 ns 1
batchnorm(2, act=identity, affine=true)(32 x 32)/forward/GPU/CUDA 26993 ns 26701 ns 1.01
batchnorm(2, act=identity, affine=true)(32 x 32)/forward/GPU/oneAPI 1174455.5 ns 1195571.5 ns 0.98
batchnorm(2, act=identity, affine=true)(32 x 32)/forward/GPU/Metal 467146 ns 459104 ns 1.02
batchnorm(2, act=identity, affine=true)(32 x 32)/forward/GPU/AMDGPU 52391 ns 48701 ns 1.08
batchnorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/2 thread(s) 8584 ns 10375 ns 0.83
batchnorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/4 thread(s) 8792 ns 8479 ns 1.04
batchnorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/8 thread(s) 10834 ns 11375 ns 0.95
batchnorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/1 thread(s) 15375 ns 9375 ns 1.64
batchnorm(2, act=identity, affine=true)(32 x 32)/zygote/GPU/CUDA 257305 ns 252977 ns 1.02
batchnorm(2, act=identity, affine=true)(32 x 32)/zygote/GPU/oneAPI 23468849 ns 24052360 ns 0.98
batchnorm(2, act=identity, affine=true)(32 x 32)/zygote/GPU/Metal 5677333.5 ns 5702709 ns 1.00
batchnorm(2, act=identity, affine=true)(32 x 32)/zygote/GPU/AMDGPU 442295 ns 397983.5 ns 1.11
bias_activation(512, act=gelu)(512 x 128)/forward/CPU/2 thread(s) 106875 ns 106500 ns 1.00
bias_activation(512, act=gelu)(512 x 128)/forward/CPU/4 thread(s) 99584 ns 98125 ns 1.01
bias_activation(512, act=gelu)(512 x 128)/forward/CPU/8 thread(s) 100792 ns 87479.5 ns 1.15
bias_activation(512, act=gelu)(512 x 128)/forward/CPU/1 thread(s) 146750 ns 147229 ns 1.00
bias_activation(512, act=gelu)(512 x 128)/forward/GPU/CUDA 25363 ns 24863 ns 1.02
bias_activation(512, act=gelu)(512 x 128)/forward/GPU/oneAPI 1177107 ns 1228355 ns 0.96
bias_activation(512, act=gelu)(512 x 128)/forward/GPU/Metal 262833 ns 263458.5 ns 1.00
bias_activation(512, act=gelu)(512 x 128)/forward/GPU/AMDGPU 194834 ns 190212 ns 1.02
bias_activation(512, act=gelu)(512 x 128)/zygote/CPU/2 thread(s) 478584 ns 478667 ns 1.00
bias_activation(512, act=gelu)(512 x 128)/zygote/CPU/4 thread(s) 492000 ns 509250 ns 0.97
bias_activation(512, act=gelu)(512 x 128)/zygote/CPU/8 thread(s) 483042 ns 518562.5 ns 0.93
bias_activation(512, act=gelu)(512 x 128)/zygote/CPU/1 thread(s) 516875 ns 520417 ns 0.99
bias_activation(512, act=gelu)(512 x 128)/zygote/GPU/CUDA 236865 ns 234381 ns 1.01
bias_activation(512, act=gelu)(512 x 128)/zygote/GPU/oneAPI 11915915 ns 11772054 ns 1.01
bias_activation(512, act=gelu)(512 x 128)/zygote/GPU/Metal 2088333 ns 2148312.5 ns 0.97
bias_activation(512, act=gelu)(512 x 128)/zygote/GPU/AMDGPU 625564.5 ns 621156 ns 1.01
batchedmm(16, Bsize=32)/forward/CPU/2 thread(s) 7583 ns 5375 ns 1.41
batchedmm(16, Bsize=32)/forward/CPU/4 thread(s) 5833 ns 5167 ns 1.13
batchedmm(16, Bsize=32)/forward/CPU/8 thread(s) 7458 ns 7500 ns 0.99
batchedmm(16, Bsize=32)/forward/CPU/1 thread(s) 4562.5 ns 4833.5 ns 0.94
batchedmm(16, Bsize=32)/forward/GPU/CUDA 16602 ns 16136 ns 1.03
batchedmm(16, Bsize=32)/forward/GPU/AMDGPU 79982 ns 79061 ns 1.01
batchedmm(16, Bsize=32)/zygote/CPU/2 thread(s) 11750 ns 14083 ns 0.83
batchedmm(16, Bsize=32)/zygote/CPU/4 thread(s) 11208.5 ns 10208.5 ns 1.10
batchedmm(16, Bsize=32)/zygote/CPU/8 thread(s) 11750 ns 10292 ns 1.14
batchedmm(16, Bsize=32)/zygote/CPU/1 thread(s) 17458.5 ns 16708 ns 1.04
batchedmm(16, Bsize=32)/zygote/GPU/CUDA 216645 ns 213958 ns 1.01
batchedmm(16, Bsize=32)/zygote/GPU/AMDGPU 391539 ns 374963 ns 1.04
batchedmm(16, Bsize=128)/forward/CPU/2 thread(s) 39458 ns 40000 ns 0.99
batchedmm(16, Bsize=128)/forward/CPU/4 thread(s) 51292 ns 50584 ns 1.01
batchedmm(16, Bsize=128)/forward/CPU/8 thread(s) 52750 ns 52458.5 ns 1.01
batchedmm(16, Bsize=128)/forward/CPU/1 thread(s) 13792 ns 13895.5 ns 0.99
batchedmm(16, Bsize=128)/forward/GPU/CUDA 21950 ns 19866 ns 1.10
batchedmm(16, Bsize=128)/forward/GPU/AMDGPU 94612 ns 87035.5 ns 1.09
batchedmm(16, Bsize=128)/zygote/CPU/2 thread(s) 42187.5 ns 38625 ns 1.09
batchedmm(16, Bsize=128)/zygote/CPU/4 thread(s) 32750 ns 30646 ns 1.07
batchedmm(16, Bsize=128)/zygote/CPU/8 thread(s) 33000 ns 30791.5 ns 1.07
batchedmm(16, Bsize=128)/zygote/CPU/1 thread(s) 58271 ns 57666 ns 1.01
batchedmm(16, Bsize=128)/zygote/GPU/CUDA 195696 ns 192524 ns 1.02
batchedmm(16, Bsize=128)/zygote/GPU/AMDGPU 447525.5 ns 416745 ns 1.07
bias_activation(2, act=tanh)(2 x 128)/forward/CPU/2 thread(s) 1583.5 ns 1604.5 ns 0.99
bias_activation(2, act=tanh)(2 x 128)/forward/CPU/4 thread(s) 1917 ns 1791 ns 1.07
bias_activation(2, act=tanh)(2 x 128)/forward/CPU/8 thread(s) 2333 ns 2042 ns 1.14
bias_activation(2, act=tanh)(2 x 128)/forward/CPU/1 thread(s) 1812.5 ns 1708 ns 1.06
bias_activation(2, act=tanh)(2 x 128)/forward/GPU/CUDA 21498 ns 21123 ns 1.02
bias_activation(2, act=tanh)(2 x 128)/forward/GPU/oneAPI 1151336 ns 1140764 ns 1.01
bias_activation(2, act=tanh)(2 x 128)/forward/GPU/Metal 293083.5 ns 294500 ns 1.00
bias_activation(2, act=tanh)(2 x 128)/forward/GPU/AMDGPU 38250.5 ns 30391 ns 1.26
bias_activation(2, act=tanh)(2 x 128)/zygote/CPU/2 thread(s) 2125 ns 2042 ns 1.04
bias_activation(2, act=tanh)(2 x 128)/zygote/CPU/4 thread(s) 2312.5 ns 2125 ns 1.09
bias_activation(2, act=tanh)(2 x 128)/zygote/CPU/8 thread(s) 2208 ns 2292 ns 0.96
bias_activation(2, act=tanh)(2 x 128)/zygote/CPU/1 thread(s) 2208 ns 2208 ns 1
bias_activation(2, act=tanh)(2 x 128)/zygote/GPU/CUDA 206694 ns 205122.5 ns 1.01
bias_activation(2, act=tanh)(2 x 128)/zygote/GPU/oneAPI 8887387.5 ns 8519681 ns 1.04
bias_activation(2, act=tanh)(2 x 128)/zygote/GPU/Metal 1497979 ns 1638500 ns 0.91
bias_activation(2, act=tanh)(2 x 128)/zygote/GPU/AMDGPU 147833 ns 139726.5 ns 1.06
layernorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/2 thread(s) 5291.5 ns 5709 ns 0.93
layernorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/4 thread(s) 5250 ns 5104 ns 1.03
layernorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/8 thread(s) 6791.5 ns 5750 ns 1.18
layernorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/1 thread(s) 5562.5 ns 4271 ns 1.30
layernorm(2, act=identity, affine=true)(4 x 32)/forward/GPU/CUDA 148541 ns 146388.5 ns 1.01
layernorm(2, act=identity, affine=true)(4 x 32)/forward/GPU/oneAPI 5839841.5 ns 5488369.5 ns 1.06
layernorm(2, act=identity, affine=true)(4 x 32)/forward/GPU/Metal 451916 ns 465291 ns 0.97
layernorm(2, act=identity, affine=true)(4 x 32)/forward/GPU/AMDGPU 75451 ns 72161 ns 1.05
layernorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/2 thread(s) 8541.5 ns 8479.5 ns 1.01
layernorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/4 thread(s) 8709 ns 8209 ns 1.06
layernorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/8 thread(s) 9334 ns 8750 ns 1.07
layernorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/1 thread(s) 9000 ns 9042 ns 1.00
layernorm(2, act=identity, affine=true)(4 x 32)/zygote/GPU/CUDA 897209 ns 884256.5 ns 1.01
layernorm(2, act=identity, affine=true)(4 x 32)/zygote/GPU/oneAPI 39004376 ns 38177021 ns 1.02
layernorm(2, act=identity, affine=true)(4 x 32)/zygote/GPU/Metal 5527666 ns 5496125 ns 1.01
layernorm(2, act=identity, affine=true)(4 x 32)/zygote/GPU/AMDGPU 417789 ns 394569 ns 1.06
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 56875 ns 56791 ns 1.00
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 57750 ns 57625 ns 1.00
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 57542 ns 56875 ns 1.01
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 57916 ns 58166 ns 1.00
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/CUDA 38278 ns 37427.5 ns 1.02
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/oneAPI 1212677.5 ns 1210467.5 ns 1.00
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/Metal 466125 ns 468667 ns 0.99
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/AMDGPU 208395 ns 208482 ns 1.00
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 451167 ns 487354.5 ns 0.93
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 477500 ns 501250 ns 0.95
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 469250.5 ns 492208.5 ns 0.95
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 444875 ns 437438 ns 1.02
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 271103 ns 267413 ns 1.01
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/oneAPI 26939835 ns 26782051.5 ns 1.01
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/Metal 8197917 ns 8248375 ns 0.99
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/AMDGPU 826719.5 ns 839679 ns 0.98
batchedmm(128, Bsize=128)/forward/CPU/2 thread(s) 3309791 ns 3311333.5 ns 1.00
batchedmm(128, Bsize=128)/forward/CPU/4 thread(s) 2333708 ns 2340166.5 ns 1.00
batchedmm(128, Bsize=128)/forward/CPU/8 thread(s) 2341958 ns 1769958 ns 1.32
batchedmm(128, Bsize=128)/forward/CPU/1 thread(s) 6325125 ns 6319645.5 ns 1.00
batchedmm(128, Bsize=128)/forward/GPU/CUDA 206333.5 ns 205610 ns 1.00
batchedmm(128, Bsize=128)/forward/GPU/AMDGPU 225750.5 ns 202712 ns 1.11
batchedmm(128, Bsize=128)/zygote/CPU/2 thread(s) 11474875 ns 11497979 ns 1.00
batchedmm(128, Bsize=128)/zygote/CPU/4 thread(s) 8336979 ns 8319667 ns 1.00
batchedmm(128, Bsize=128)/zygote/CPU/8 thread(s) 8319292 ns 6588125 ns 1.26
batchedmm(128, Bsize=128)/zygote/CPU/1 thread(s) 21266187.5 ns 21221896 ns 1.00
batchedmm(128, Bsize=128)/zygote/GPU/CUDA 735499 ns 736463 ns 1.00
batchedmm(128, Bsize=128)/zygote/GPU/AMDGPU 1123791.5 ns 1065445 ns 1.05
layernorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/2 thread(s) 7209 ns 5562.5 ns 1.30
layernorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/4 thread(s) 5145.5 ns 4666.5 ns 1.10
layernorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/8 thread(s) 6542 ns 6437.5 ns 1.02
layernorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/1 thread(s) 6792 ns 6104 ns 1.11
layernorm(2, act=identity, affine=false)(4 x 32)/forward/GPU/CUDA 141393 ns 139569.5 ns 1.01
layernorm(2, act=identity, affine=false)(4 x 32)/forward/GPU/oneAPI 5544700.5 ns 5734965.5 ns 0.97
layernorm(2, act=identity, affine=false)(4 x 32)/forward/GPU/Metal 751271 ns 826042 ns 0.91
layernorm(2, act=identity, affine=false)(4 x 32)/forward/GPU/AMDGPU 61961 ns 59531 ns 1.04
layernorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/2 thread(s) 7500 ns 9333.5 ns 0.80
layernorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/4 thread(s) 7375 ns 7000 ns 1.05
layernorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/8 thread(s) 10417 ns 11875 ns 0.88
layernorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/1 thread(s) 10458 ns 8708 ns 1.20
layernorm(2, act=identity, affine=false)(4 x 32)/zygote/GPU/CUDA 772087 ns 764194 ns 1.01
layernorm(2, act=identity, affine=false)(4 x 32)/zygote/GPU/oneAPI 34449457 ns 34028843.5 ns 1.01
layernorm(2, act=identity, affine=false)(4 x 32)/zygote/GPU/Metal 5194250 ns 5176312.5 ns 1.00
layernorm(2, act=identity, affine=false)(4 x 32)/zygote/GPU/AMDGPU 413259.5 ns 378403 ns 1.09
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 101625 ns 99625 ns 1.02
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 116167 ns 136708 ns 0.85
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 97521 ns 101312.5 ns 0.96
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 129583 ns 129709 ns 1.00
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/CUDA 152327.5 ns 151420 ns 1.01
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/oneAPI 5751979.5 ns 6034399 ns 0.95
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/Metal 2023792 ns 1982667 ns 1.02
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/AMDGPU 209935 ns 206692 ns 1.02
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 2000458 ns 2031041 ns 0.98
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 2039084 ns 2037417 ns 1.00
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 2004708 ns 2036291 ns 0.98
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 2042146 ns 2038584 ns 1.00
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 718327 ns 708221 ns 1.01
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/oneAPI 31776627 ns 31488037 ns 1.01
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/Metal 11003520.5 ns 11251291 ns 0.98
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/AMDGPU 1109931 ns 1126246 ns 0.99
batchedmm(2, Bsize=4)/forward/CPU/2 thread(s) 33688 ns 33459 ns 1.01
batchedmm(2, Bsize=4)/forward/CPU/4 thread(s) 36229.5 ns 36750 ns 0.99
batchedmm(2, Bsize=4)/forward/CPU/8 thread(s) 36000 ns 33833 ns 1.06
batchedmm(2, Bsize=4)/forward/CPU/1 thread(s) 708 ns 667 ns 1.06
batchedmm(2, Bsize=4)/forward/GPU/CUDA 15790 ns 15506 ns 1.02
batchedmm(2, Bsize=4)/forward/GPU/AMDGPU 93042 ns 86920 ns 1.07
batchedmm(2, Bsize=4)/zygote/CPU/2 thread(s) 2625 ns 4792 ns 0.55
batchedmm(2, Bsize=4)/zygote/CPU/4 thread(s) 2958 ns 2709 ns 1.09
batchedmm(2, Bsize=4)/zygote/CPU/8 thread(s) 3583 ns 3167 ns 1.13
batchedmm(2, Bsize=4)/zygote/CPU/1 thread(s) 2979.5 ns 2291.5 ns 1.30
batchedmm(2, Bsize=4)/zygote/GPU/CUDA 143400.5 ns 140769.5 ns 1.02
batchedmm(2, Bsize=4)/zygote/GPU/AMDGPU 389544.5 ns 351474 ns 1.11
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 7250 ns 7250 ns 1
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 6041 ns 6000 ns 1.01
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 5708 ns 5375 ns 1.06
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 10041 ns 10000 ns 1.00
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/GPU/CUDA 37768.5 ns 36795 ns 1.03
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/GPU/oneAPI 1165405 ns 1247042.5 ns 0.93
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/GPU/Metal 353500 ns 351333 ns 1.01
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/GPU/AMDGPU 52952 ns 49030 ns 1.08
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 213770.5 ns 213334 ns 1.00
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 220312.5 ns 220166.5 ns 1.00
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 222750 ns 228125 ns 0.98
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 207646 ns 206875 ns 1.00
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 248646 ns 244945 ns 1.02
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/oneAPI 26791984 ns 24969632 ns 1.07
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/Metal 7867770.5 ns 7965166.5 ns 0.99
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/AMDGPU 615929.5 ns 578090.5 ns 1.07
dense(32, bias=true, act=identity)(32 x 128)/forward/CPU/2 thread(s) 3917 ns 3916 ns 1.00
dense(32, bias=true, act=identity)(32 x 128)/forward/CPU/4 thread(s) 3958 ns 3959 ns 1.00
dense(32, bias=true, act=identity)(32 x 128)/forward/CPU/8 thread(s) 3917 ns 3958 ns 0.99
dense(32, bias=true, act=identity)(32 x 128)/forward/CPU/1 thread(s) 3917 ns 3958 ns 0.99
dense(32, bias=true, act=identity)(32 x 128)/forward/GPU/CUDA 22442 ns 21762 ns 1.03
dense(32, bias=true, act=identity)(32 x 128)/forward/GPU/oneAPI 2173165.5 ns 2067928.5 ns 1.05
dense(32, bias=true, act=identity)(32 x 128)/forward/GPU/Metal 240291 ns 245104 ns 0.98
dense(32, bias=true, act=identity)(32 x 128)/forward/GPU/AMDGPU 48101 ns 45631 ns 1.05
dense(32, bias=true, act=identity)(32 x 128)/zygote/CPU/2 thread(s) 14834 ns 14875 ns 1.00
dense(32, bias=true, act=identity)(32 x 128)/zygote/CPU/4 thread(s) 15000 ns 14916 ns 1.01
dense(32, bias=true, act=identity)(32 x 128)/zygote/CPU/8 thread(s) 14917 ns 14667 ns 1.02
dense(32, bias=true, act=identity)(32 x 128)/zygote/CPU/1 thread(s) 14666 ns 14667 ns 1.00
dense(32, bias=true, act=identity)(32 x 128)/zygote/GPU/CUDA 315075 ns 310256.5 ns 1.02
dense(32, bias=true, act=identity)(32 x 128)/zygote/GPU/oneAPI 11343180 ns 11269459 ns 1.01
dense(32, bias=true, act=identity)(32 x 128)/zygote/GPU/Metal 1001479.5 ns 1000292 ns 1.00
dense(32, bias=true, act=identity)(32 x 128)/zygote/GPU/AMDGPU 210165 ns 193502 ns 1.09
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 135792 ns 102917 ns 1.32
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 104291 ns 103667 ns 1.01
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 105021 ns 108625 ns 0.97
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 136667 ns 131875 ns 1.04
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/GPU/CUDA 146027 ns 137366.5 ns 1.06
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/GPU/oneAPI 6157059 ns 5955500.5 ns 1.03
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/GPU/Metal 1925791 ns 1988958 ns 0.97
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/GPU/AMDGPU 190028 ns 200842 ns 0.95
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 1910209 ns 1926354.5 ns 0.99
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 1920083 ns 1913500 ns 1.00
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 1909604.5 ns 1917792 ns 1.00
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 1930625 ns 1936729 ns 1.00
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 700646 ns 692519 ns 1.01
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/oneAPI 30420268 ns 33116808.5 ns 0.92
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/Metal 10973125 ns 11144584 ns 0.98
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/AMDGPU 1244371 ns 1078360.5 ns 1.15
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 17833 ns 17708 ns 1.01
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 18146 ns 22291.5 ns 0.81
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 19834 ns 21250 ns 0.93
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 19958 ns 19146 ns 1.04
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/CUDA 111591.5 ns 109241 ns 1.02
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/oneAPI 3532207.5 ns 3392625.5 ns 1.04
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/Metal 1347125 ns 1271125 ns 1.06
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/AMDGPU 87019 ns 81331 ns 1.07
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 217604.5 ns 221229.5 ns 0.98
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 228270.5 ns 216791 ns 1.05
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 218417 ns 230083.5 ns 0.95
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 215833 ns 216083.5 ns 1.00
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 530706 ns 522920 ns 1.01
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/oneAPI 20900864.5 ns 19545470 ns 1.07
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/Metal 6137312.5 ns 6165645.5 ns 1.00
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/AMDGPU 517847 ns 476780 ns 1.09
batchedmm(16, Bsize=4)/forward/CPU/2 thread(s) 27333 ns 26250 ns 1.04
batchedmm(16, Bsize=4)/forward/CPU/4 thread(s) 30104.5 ns 31250 ns 0.96
batchedmm(16, Bsize=4)/forward/CPU/8 thread(s) 30000 ns 27875 ns 1.08
batchedmm(16, Bsize=4)/forward/CPU/1 thread(s) 1833 ns 1292 ns 1.42
batchedmm(16, Bsize=4)/forward/GPU/CUDA 16696 ns 16312 ns 1.02
batchedmm(16, Bsize=4)/forward/GPU/AMDGPU 95970 ns 87751 ns 1.09
batchedmm(16, Bsize=4)/zygote/CPU/2 thread(s) 4375 ns 6625 ns 0.66
batchedmm(16, Bsize=4)/zygote/CPU/4 thread(s) 4895.5 ns 4645.5 ns 1.05
batchedmm(16, Bsize=4)/zygote/CPU/8 thread(s) 6000 ns 4917 ns 1.22
batchedmm(16, Bsize=4)/zygote/CPU/1 thread(s) 4834 ns 4792 ns 1.01
batchedmm(16, Bsize=4)/zygote/GPU/CUDA 210645.5 ns 207882.5 ns 1.01
batchedmm(16, Bsize=4)/zygote/GPU/AMDGPU 420858 ns 402074 ns 1.05
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 306167 ns 305938 ns 1.00
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 305959 ns 305917 ns 1.00
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 307583 ns 307521 ns 1.00
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 305542 ns 305375 ns 1.00
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/CUDA 233739 ns 230214 ns 1.02
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/oneAPI 7904754 ns 7500239 ns 1.05
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/Metal 570771 ns 643000 ns 0.89
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/AMDGPU 284229 ns 280903 ns 1.01
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 581667 ns 538541 ns 1.08
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 535791 ns 549750 ns 0.97
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 534333 ns 542666 ns 0.98
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 534959 ns 529708 ns 1.01
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 1101751 ns 1085631 ns 1.01
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/oneAPI 46425480.5 ns 44253871 ns 1.05
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/Metal 6055125 ns 6154687.5 ns 0.98
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/AMDGPU 916717 ns 872599 ns 1.05
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 19167 ns 19021 ns 1.01
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 20396 ns 19833.5 ns 1.03
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 22375 ns 22542 ns 0.99
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 20708 ns 21917 ns 0.94
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/GPU/CUDA 116012 ns 114174 ns 1.02
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/GPU/oneAPI 3553813 ns 3531348.5 ns 1.01
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/GPU/Metal 1402542 ns 1449271 ns 0.97
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/GPU/AMDGPU 86330 ns 81471 ns 1.06
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 217375 ns 218834 ns 0.99
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 219625 ns 227542 ns 0.97
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 214250 ns 219708 ns 0.98
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 212500 ns 212708 ns 1.00
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 762109 ns 761865.5 ns 1.00
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/oneAPI 24400401 ns 24050167 ns 1.01
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/Metal 7404417 ns 7412916.5 ns 1.00
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/AMDGPU 578169 ns 543136 ns 1.06
layernorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/2 thread(s) 7500 ns 7125.5 ns 1.05
layernorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/4 thread(s) 7041.5 ns 6479 ns 1.09
layernorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/8 thread(s) 7833 ns 8458 ns 0.93
layernorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/1 thread(s) 7250 ns 6084 ns 1.19
layernorm(2, act=identity, affine=false)(32 x 32)/forward/GPU/CUDA 143913.5 ns 141785 ns 1.02
layernorm(2, act=identity, affine=false)(32 x 32)/forward/GPU/oneAPI 5579609.5 ns 5370056 ns 1.04
layernorm(2, act=identity, affine=false)(32 x 32)/forward/GPU/Metal 787333.5 ns 777458 ns 1.01
layernorm(2, act=identity, affine=false)(32 x 32)/forward/GPU/AMDGPU 71680 ns 69581 ns 1.03
layernorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/2 thread(s) 10541 ns 12958 ns 0.81
layernorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/4 thread(s) 10645.5 ns 9583.5 ns 1.11
layernorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/8 thread(s) 10458 ns 10687.5 ns 0.98
layernorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/1 thread(s) 10167 ns 9625 ns 1.06
layernorm(2, act=identity, affine=false)(32 x 32)/zygote/GPU/CUDA 845090 ns 832452.5 ns 1.02
layernorm(2, act=identity, affine=false)(32 x 32)/zygote/GPU/oneAPI 38996778 ns 38810557 ns 1.00
layernorm(2, act=identity, affine=false)(32 x 32)/zygote/GPU/Metal 5177833 ns 5231375 ns 0.99
layernorm(2, act=identity, affine=false)(32 x 32)/zygote/GPU/AMDGPU 431434.5 ns 395184 ns 1.09
layernorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/2 thread(s) 5375 ns 5145.5 ns 1.04
layernorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/4 thread(s) 5604.5 ns 4812.5 ns 1.16
layernorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/8 thread(s) 6667 ns 6958 ns 0.96
layernorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/1 thread(s) 6250 ns 6833 ns 0.91
layernorm(2, act=relu, affine=false)(4 x 32)/forward/GPU/CUDA 147643 ns 144967.5 ns 1.02
layernorm(2, act=relu, affine=false)(4 x 32)/forward/GPU/oneAPI 5420881 ns 5514807.5 ns 0.98
layernorm(2, act=relu, affine=false)(4 x 32)/forward/GPU/Metal 770708 ns 829125 ns 0.93
layernorm(2, act=relu, affine=false)(4 x 32)/forward/GPU/AMDGPU 64090 ns 70250 ns 0.91
layernorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/2 thread(s) 7667 ns 7770.5 ns 0.99
layernorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/4 thread(s) 7834 ns 7333 ns 1.07
layernorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/8 thread(s) 8000 ns 7667 ns 1.04
layernorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/1 thread(s) 7625 ns 7208 ns 1.06
layernorm(2, act=relu, affine=false)(4 x 32)/zygote/GPU/CUDA 801461 ns 790491 ns 1.01
layernorm(2, act=relu, affine=false)(4 x 32)/zygote/GPU/oneAPI 38866348 ns 37869840 ns 1.03
layernorm(2, act=relu, affine=false)(4 x 32)/zygote/GPU/Metal 5549500 ns 5670687 ns 0.98
layernorm(2, act=relu, affine=false)(4 x 32)/zygote/GPU/AMDGPU 434220 ns 398424.5 ns 1.09
batchedmm(128, Bsize=512)/forward/CPU/2 thread(s) 14438042 ns 14518959 ns 0.99
batchedmm(128, Bsize=512)/forward/CPU/4 thread(s) 10127917 ns 10120000 ns 1.00
batchedmm(128, Bsize=512)/forward/CPU/8 thread(s) 10159833 ns 7708791.5 ns 1.32
batchedmm(128, Bsize=512)/forward/CPU/1 thread(s) 27954459 ns 27832250 ns 1.00
batchedmm(128, Bsize=512)/forward/GPU/CUDA 533907.5 ns 532409 ns 1.00
batchedmm(128, Bsize=512)/forward/GPU/AMDGPU 478160.5 ns 399949.5 ns 1.20
batchedmm(128, Bsize=512)/zygote/CPU/2 thread(s) 46256583.5 ns 46375083.5 ns 1.00
batchedmm(128, Bsize=512)/zygote/CPU/4 thread(s) 33464500 ns 33404583.5 ns 1.00
batchedmm(128, Bsize=512)/zygote/CPU/8 thread(s) 33570375 ns 26627416.5 ns 1.26
batchedmm(128, Bsize=512)/zygote/CPU/1 thread(s) 85876709 ns 85835750 ns 1.00
batchedmm(128, Bsize=512)/zygote/GPU/CUDA 2658893 ns 2644453 ns 1.01
batchedmm(128, Bsize=512)/zygote/GPU/AMDGPU 3344856 ns 3278895 ns 1.02
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 66792 ns 66042 ns 1.01
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 65979.5 ns 66125 ns 1.00
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 68375 ns 70520.5 ns 0.97
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 66750 ns 67875 ns 0.98
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/CUDA 123457.5 ns 119873.5 ns 1.03
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/oneAPI 3450048 ns 3330724 ns 1.04
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/Metal 1415271 ns 1410021 ns 1.00
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/AMDGPU 238451 ns 229907.5 ns 1.04
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 439917 ns 453292 ns 0.97
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 473125 ns 441208 ns 1.07
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 442479 ns 450208 ns 0.98
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 441875 ns 445541 ns 0.99
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 741057 ns 732886.5 ns 1.01
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/oneAPI 27244548.5 ns 26274297 ns 1.04
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/Metal 7814416.5 ns 7781500 ns 1.00
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/AMDGPU 827592 ns 794638 ns 1.04
batchnorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/2 thread(s) 542 ns 667 ns 0.81
batchnorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/4 thread(s) 584 ns 625 ns 0.93
batchnorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/8 thread(s) 625 ns 542 ns 1.15
batchnorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/1 thread(s) 542 ns 500 ns 1.08
batchnorm(2, act=relu, affine=false)(32 x 32)/forward/GPU/CUDA 33032 ns 32132 ns 1.03
batchnorm(2, act=relu, affine=false)(32 x 32)/forward/GPU/oneAPI 1140283.5 ns 1164338 ns 0.98
batchnorm(2, act=relu, affine=false)(32 x 32)/forward/GPU/Metal 439500 ns 431645.5 ns 1.02
batchnorm(2, act=relu, affine=false)(32 x 32)/forward/GPU/AMDGPU 51400 ns 49160 ns 1.05
batchnorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/2 thread(s) 9125 ns 8292 ns 1.10
batchnorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/4 thread(s) 9167 ns 8708 ns 1.05
batchnorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/8 thread(s) 9667 ns 9250 ns 1.05
batchnorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/1 thread(s) 9167 ns 8959 ns 1.02
batchnorm(2, act=relu, affine=false)(32 x 32)/zygote/GPU/CUDA 290866 ns 286401.5 ns 1.02
batchnorm(2, act=relu, affine=false)(32 x 32)/zygote/GPU/oneAPI 29330832.5 ns 21940598 ns 1.34
batchnorm(2, act=relu, affine=false)(32 x 32)/zygote/GPU/Metal 5094625 ns 5096125 ns 1.00
batchnorm(2, act=relu, affine=false)(32 x 32)/zygote/GPU/AMDGPU 423181 ns 388934 ns 1.09
dense(32, bias=false, act=gelu)(32 x 128)/forward/CPU/2 thread(s) 9833 ns 9792 ns 1.00
dense(32, bias=false, act=gelu)(32 x 128)/forward/CPU/4 thread(s) 9833 ns 9875 ns 1.00
dense(32, bias=false, act=gelu)(32 x 128)/forward/CPU/8 thread(s) 9833 ns 9833 ns 1
dense(32, bias=false, act=gelu)(32 x 128)/forward/CPU/1 thread(s) 9792 ns 9875 ns 0.99
dense(32, bias=false, act=gelu)(32 x 128)/forward/GPU/CUDA 23856 ns 23178 ns 1.03
dense(32, bias=false, act=gelu)(32 x 128)/forward/GPU/oneAPI 2133194 ns 1908743.5 ns 1.12
dense(32, bias=false, act=gelu)(32 x 128)/forward/GPU/Metal 221187.5 ns 222541 ns 0.99
dense(32, bias=false, act=gelu)(32 x 128)/forward/GPU/AMDGPU 221570.5 ns 217383 ns 1.02
dense(32, bias=false, act=gelu)(32 x 128)/zygote/CPU/2 thread(s) 46084 ns 45875 ns 1.00
dense(32, bias=false, act=gelu)(32 x 128)/zygote/CPU/4 thread(s) 46209 ns 45917 ns 1.01
dense(32, bias=false, act=gelu)(32 x 128)/zygote/CPU/8 thread(s) 46708 ns 46167 ns 1.01
dense(32, bias=false, act=gelu)(32 x 128)/zygote/CPU/1 thread(s) 45833 ns 45875 ns 1.00
dense(32, bias=false, act=gelu)(32 x 128)/zygote/GPU/CUDA 295006 ns 293089 ns 1.01
dense(32, bias=false, act=gelu)(32 x 128)/zygote/GPU/oneAPI 12015326 ns 10988297.5 ns 1.09
dense(32, bias=false, act=gelu)(32 x 128)/zygote/GPU/Metal 1410479 ns 982875 ns 1.44
dense(32, bias=false, act=gelu)(32 x 128)/zygote/GPU/AMDGPU 632758 ns 621107 ns 1.02
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 56208 ns 56250 ns 1.00
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 57167 ns 57125 ns 1.00
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 57208 ns 56334 ns 1.02
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 57916 ns 57792 ns 1.00
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/CUDA 29619 ns 28527 ns 1.04
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/oneAPI 1174842 ns 1186883 ns 0.99
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/Metal 579563 ns 578645.5 ns 1.00
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/AMDGPU 208291 ns 204943 ns 1.02
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 448604.5 ns 448333 ns 1.00
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 467334 ns 494125 ns 0.95
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 465499.5 ns 507583 ns 0.92
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 442312 ns 439437 ns 1.01
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 251481 ns 247232 ns 1.02
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/oneAPI 32120348 ns 33216066 ns 0.97
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/Metal 9690584 ns 9499166 ns 1.02
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/AMDGPU 916830 ns 891519.5 ns 1.03
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 644958 ns 652937.5 ns 0.99
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 645875 ns 647333 ns 1.00
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 633021 ns 662854 ns 0.95
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 658291.5 ns 668500 ns 0.98
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/GPU/CUDA 213385 ns 207996 ns 1.03
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/GPU/oneAPI 8467842 ns 8125052.5 ns 1.04
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/GPU/Metal 1345291 ns 1384354.5 ns 0.97
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/GPU/AMDGPU 276851.5 ns 233282 ns 1.19
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 2217750 ns 2235042 ns 0.99
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 2230604 ns 2238979 ns 1.00
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 2220792 ns 2248959 ns 0.99
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 2276833.5 ns 2260792 ns 1.01
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 999797 ns 984096 ns 1.02
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/oneAPI 48712510 ns 45382984 ns 1.07
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/Metal 9179667 ns 8132833.5 ns 1.13
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/AMDGPU 1474035 ns 1370494 ns 1.08
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 19709 ns 20958 ns 0.94
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 19750 ns 20000 ns 0.99
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 22833.5 ns 22667 ns 1.01
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 21833 ns 22083 ns 0.99
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/CUDA 114451.5 ns 113160 ns 1.01
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/oneAPI 3653619 ns 3278898 ns 1.11
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/Metal 1435020.5 ns 1472792 ns 0.97
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/AMDGPU 86210.5 ns 81561 ns 1.06
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 220750 ns 222313 ns 0.99
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 256917 ns 257542 ns 1.00
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 220958 ns 232250 ns 0.95
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 220125 ns 228000.5 ns 0.97
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 740583 ns 734156.5 ns 1.01
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/oneAPI 24806004 ns 27357269 ns 0.91
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/Metal 7609833 ns 7692750 ns 0.99
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/AMDGPU 592134 ns 559476 ns 1.06
batchnorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/2 thread(s) 542 ns 542 ns 1
batchnorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/4 thread(s) 625 ns 583 ns 1.07
batchnorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/8 thread(s) 625 ns 541 ns 1.16
batchnorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/1 thread(s) 500 ns 500 ns 1
batchnorm(2, act=relu, affine=true)(32 x 32)/forward/GPU/CUDA 23545 ns 23248 ns 1.01
batchnorm(2, act=relu, affine=true)(32 x 32)/forward/GPU/oneAPI 1179342 ns 1222462 ns 0.96
batchnorm(2, act=relu, affine=true)(32 x 32)/forward/GPU/Metal 451125 ns 466625 ns 0.97
batchnorm(2, act=relu, affine=true)(32 x 32)/forward/GPU/AMDGPU 52320 ns 51870 ns 1.01
batchnorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/2 thread(s) 9708 ns 9167 ns 1.06
batchnorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/4 thread(s) 9542 ns 9208 ns 1.04
batchnorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/8 thread(s) 10271 ns 9292 ns 1.11
batchnorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/1 thread(s) 10042 ns 9312.5 ns 1.08
batchnorm(2, act=relu, affine=true)(32 x 32)/zygote/GPU/CUDA 271980 ns 268568 ns 1.01
batchnorm(2, act=relu, affine=true)(32 x 32)/zygote/GPU/oneAPI 23890621 ns 24289416 ns 0.98
batchnorm(2, act=relu, affine=true)(32 x 32)/zygote/GPU/Metal 5928209 ns 6049709 ns 0.98
batchnorm(2, act=relu, affine=true)(32 x 32)/zygote/GPU/AMDGPU 462544 ns 410500 ns 1.13
groupnorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/2 thread(s) 8458 ns 10333 ns 0.82
groupnorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/4 thread(s) 9333 ns 8458 ns 1.10
groupnorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/8 thread(s) 11333 ns 10354 ns 1.09
groupnorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/1 thread(s) 10625 ns 8333 ns 1.28
groupnorm(2, act=identity, affine=false)(4 x 32)/forward/GPU/CUDA 122249.5 ns 120393.5 ns 1.02
groupnorm(2, act=identity, affine=false)(4 x 32)/forward/GPU/oneAPI 3377288.5 ns 3445203 ns 0.98
groupnorm(2, act=identity, affine=false)(4 x 32)/forward/GPU/Metal 857708 ns 832874.5 ns 1.03
groupnorm(2, act=identity, affine=false)(4 x 32)/forward/GPU/AMDGPU 78766 ns 72921 ns 1.08
groupnorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/2 thread(s) 7479 ns 7583 ns 0.99
groupnorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/4 thread(s) 8208 ns 8208 ns 1
groupnorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/8 thread(s) 7875 ns 7417 ns 1.06
groupnorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/1 thread(s) 7458.5 ns 7770.5 ns 0.96
groupnorm(2, act=identity, affine=false)(4 x 32)/zygote/GPU/CUDA 519030.5 ns 511772 ns 1.01
groupnorm(2, act=identity, affine=false)(4 x 32)/zygote/GPU/oneAPI 17279332 ns 16339001 ns 1.06
groupnorm(2, act=identity, affine=false)(4 x 32)/zygote/GPU/Metal 4021166 ns 3959271 ns 1.02
groupnorm(2, act=identity, affine=false)(4 x 32)/zygote/GPU/AMDGPU 354777.5 ns 328364 ns 1.08
bias_activation(2, act=gelu)(2 x 128)/forward/CPU/2 thread(s) 1709 ns 1458 ns 1.17
bias_activation(2, act=gelu)(2 x 128)/forward/CPU/4 thread(s) 1834 ns 1542 ns 1.19
bias_activation(2, act=gelu)(2 x 128)/forward/CPU/8 thread(s) 1958 ns 1833 ns 1.07
bias_activation(2, act=gelu)(2 x 128)/forward/CPU/1 thread(s) 1459 ns 1541 ns 0.95
bias_activation(2, act=gelu)(2 x 128)/forward/GPU/CUDA 22193 ns 21725 ns 1.02
bias_activation(2, act=gelu)(2 x 128)/forward/GPU/oneAPI 1121843 ns 1136020 ns 0.99
bias_activation(2, act=gelu)(2 x 128)/forward/GPU/Metal 293083 ns 296000 ns 0.99
bias_activation(2, act=gelu)(2 x 128)/forward/GPU/AMDGPU 196742 ns 194712 ns 1.01
bias_activation(2, act=gelu)(2 x 128)/zygote/CPU/2 thread(s) 3292 ns 3250 ns 1.01
bias_activation(2, act=gelu)(2 x 128)/zygote/CPU/4 thread(s) 3458 ns 3250 ns 1.06
bias_activation(2, act=gelu)(2 x 128)/zygote/CPU/8 thread(s) 3583 ns 3500 ns 1.02
bias_activation(2, act=gelu)(2 x 128)/zygote/CPU/1 thread(s) 3250 ns 3209 ns 1.01
bias_activation(2, act=gelu)(2 x 128)/zygote/GPU/CUDA 222520.5 ns 220221.5 ns 1.01
bias_activation(2, act=gelu)(2 x 128)/zygote/GPU/oneAPI 9986789 ns 9698879 ns 1.03
bias_activation(2, act=gelu)(2 x 128)/zygote/GPU/Metal 1730167 ns 1612667 ns 1.07
bias_activation(2, act=gelu)(2 x 128)/zygote/GPU/AMDGPU 596665 ns 596166 ns 1.00
bias_activation(512, act=tanh)(512 x 128)/forward/CPU/2 thread(s) 147979 ns 148167 ns 1.00
bias_activation(512, act=tanh)(512 x 128)/forward/CPU/4 thread(s) 128500 ns 127709 ns 1.01
bias_activation(512, act=tanh)(512 x 128)/forward/CPU/8 thread(s) 129874.5 ns 107958.5 ns 1.20
bias_activation(512, act=tanh)(512 x 128)/forward/CPU/1 thread(s) 226000 ns 225958 ns 1.00
bias_activation(512, act=tanh)(512 x 128)/forward/GPU/CUDA 24573 ns 24338 ns 1.01
bias_activation(512, act=tanh)(512 x 128)/forward/GPU/oneAPI 1208173 ns 1138772 ns 1.06
bias_activation(512, act=tanh)(512 x 128)/forward/GPU/Metal 293667 ns 270854.5 ns 1.08
bias_activation(512, act=tanh)(512 x 128)/forward/GPU/AMDGPU 40630 ns 40151 ns 1.01
bias_activation(512, act=tanh)(512 x 128)/zygote/CPU/2 thread(s) 142979.5 ns 156125 ns 0.92
bias_activation(512, act=tanh)(512 x 128)/zygote/CPU/4 thread(s) 110791 ns 127209 ns 0.87
bias_activation(512, act=tanh)(512 x 128)/zygote/CPU/8 thread(s) 110604.5 ns 100750 ns 1.10
bias_activation(512, act=tanh)(512 x 128)/zygote/CPU/1 thread(s) 252042 ns 256666.5 ns 0.98
bias_activation(512, act=tanh)(512 x 128)/zygote/GPU/CUDA 220880.5 ns 218905 ns 1.01
bias_activation(512, act=tanh)(512 x 128)/zygote/GPU/oneAPI 10430562 ns 10030041 ns 1.04
bias_activation(512, act=tanh)(512 x 128)/zygote/GPU/Metal 1988667 ns 2003417 ns 0.99
bias_activation(512, act=tanh)(512 x 128)/zygote/GPU/AMDGPU 256232 ns 240417.5 ns 1.07
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 7292 ns 7292 ns 1
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 5958 ns 6083 ns 0.98
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 6041 ns 5375 ns 1.12
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 10125 ns 10375 ns 0.98
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/CUDA 33867 ns 32865 ns 1.03
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/oneAPI 1251518.5 ns 1134920.5 ns 1.10
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/Metal 348625 ns 562875 ns 0.62
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/AMDGPU 54871 ns 52191 ns 1.05
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 246687.5 ns 230854.5 ns 1.07
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 263750 ns 270500 ns 0.98
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 228041 ns 264875 ns 0.86
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 214750.5 ns 213771 ns 1.00
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 267151.5 ns 263381.5 ns 1.01
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/oneAPI 27588902 ns 28212764 ns 0.98
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/Metal 8321542 ns 8517000 ns 0.98
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/AMDGPU 575196 ns 607266 ns 0.95
layernorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/2 thread(s) 15875 ns 14958 ns 1.06
layernorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/4 thread(s) 15208 ns 15500 ns 0.98
layernorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/8 thread(s) 16937.5 ns 16500 ns 1.03
layernorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/1 thread(s) 14959 ns 15625 ns 0.96
layernorm(2, act=gelu, affine=false)(32 x 32)/forward/GPU/CUDA 143325 ns 140749.5 ns 1.02
layernorm(2, act=gelu, affine=false)(32 x 32)/forward/GPU/oneAPI 5498476 ns 5465169 ns 1.01
layernorm(2, act=gelu, affine=false)(32 x 32)/forward/GPU/Metal 761333 ns 787125 ns 0.97
layernorm(2, act=gelu, affine=false)(32 x 32)/forward/GPU/AMDGPU 239812 ns 238512 ns 1.01
layernorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/2 thread(s) 24000 ns 22583 ns 1.06
layernorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/4 thread(s) 23750 ns 23500 ns 1.01
layernorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/8 thread(s) 24437.5 ns 24084 ns 1.01
layernorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/1 thread(s) 23666 ns 23167 ns 1.02
layernorm(2, act=gelu, affine=false)(32 x 32)/zygote/GPU/CUDA 885563.5 ns 875101 ns 1.01
layernorm(2, act=gelu, affine=false)(32 x 32)/zygote/GPU/oneAPI 39845026.5 ns 37582744 ns 1.06
layernorm(2, act=gelu, affine=false)(32 x 32)/zygote/GPU/Metal 5521500 ns 5600270.5 ns 0.99
layernorm(2, act=gelu, affine=false)(32 x 32)/zygote/GPU/AMDGPU 720238 ns 692048 ns 1.04
groupnorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/2 thread(s) 9312.5 ns 9125 ns 1.02
groupnorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/4 thread(s) 10125 ns 9250.5 ns 1.09
groupnorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/8 thread(s) 11229 ns 10521 ns 1.07
groupnorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/1 thread(s) 9833 ns 9209 ns 1.07
groupnorm(2, act=identity, affine=true)(32 x 32)/forward/GPU/CUDA 126504.5 ns 124561 ns 1.02
groupnorm(2, act=identity, affine=true)(32 x 32)/forward/GPU/oneAPI 3392484.5 ns 3393331 ns 1.00
groupnorm(2, act=identity, affine=true)(32 x 32)/forward/GPU/Metal 805521 ns 802083 ns 1.00
groupnorm(2, act=identity, affine=true)(32 x 32)/forward/GPU/AMDGPU 85091 ns 79030 ns 1.08
groupnorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/2 thread(s) 13375 ns 13750 ns 0.97
groupnorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/4 thread(s) 14125 ns 14125 ns 1
groupnorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/8 thread(s) 14167 ns 14125 ns 1.00
groupnorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/1 thread(s) 14042 ns 13917 ns 1.01
groupnorm(2, act=identity, affine=true)(32 x 32)/zygote/GPU/CUDA 680177 ns 670894 ns 1.01
groupnorm(2, act=identity, affine=true)(32 x 32)/zygote/GPU/oneAPI 21529605 ns 20295661 ns 1.06
groupnorm(2, act=identity, affine=true)(32 x 32)/zygote/GPU/Metal 5139312.5 ns 5274042 ns 0.97
groupnorm(2, act=identity, affine=true)(32 x 32)/zygote/GPU/AMDGPU 404764 ns 375405 ns 1.08
groupnorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/2 thread(s) 9875 ns 9208.5 ns 1.07
groupnorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/4 thread(s) 9625 ns 9167 ns 1.05
groupnorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/8 thread(s) 10770.5 ns 10438 ns 1.03
groupnorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/1 thread(s) 10291 ns 9584 ns 1.07
groupnorm(2, act=identity, affine=false)(32 x 32)/forward/GPU/CUDA 125014 ns 122339.5 ns 1.02
groupnorm(2, act=identity, affine=false)(32 x 32)/forward/GPU/oneAPI 3361307 ns 3319433.5 ns 1.01
groupnorm(2, act=identity, affine=false)(32 x 32)/forward/GPU/Metal 886542 ns 882875 ns 1.00
groupnorm(2, act=identity, affine=false)(32 x 32)/forward/GPU/AMDGPU 80481 ns 75581 ns 1.06
groupnorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/2 thread(s) 12417 ns 12333.5 ns 1.01
groupnorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/4 thread(s) 13083.5 ns 12645.5 ns 1.03
groupnorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/8 thread(s) 13041.5 ns 12708 ns 1.03
groupnorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/1 thread(s) 12333 ns 12708 ns 0.97
groupnorm(2, act=identity, affine=false)(32 x 32)/zygote/GPU/CUDA 563574.5 ns 557225 ns 1.01
groupnorm(2, act=identity, affine=false)(32 x 32)/zygote/GPU/oneAPI 18790951 ns 18661226 ns 1.01
groupnorm(2, act=identity, affine=false)(32 x 32)/zygote/GPU/Metal 4467334 ns 4435167 ns 1.01
groupnorm(2, act=identity, affine=false)(32 x 32)/zygote/GPU/AMDGPU 372394 ns 345844 ns 1.08
batchedmm(2, Bsize=128)/forward/CPU/2 thread(s) 30292 ns 30292 ns 1
batchedmm(2, Bsize=128)/forward/CPU/4 thread(s) 34479.5 ns 34021.5 ns 1.01
batchedmm(2, Bsize=128)/forward/CPU/8 thread(s) 31625 ns 30854.5 ns 1.02
batchedmm(2, Bsize=128)/forward/CPU/1 thread(s) 1813 ns 1791 ns 1.01
batchedmm(2, Bsize=128)/forward/GPU/CUDA 16646 ns 16303 ns 1.02
batchedmm(2, Bsize=128)/forward/GPU/AMDGPU 89151 ns 82211 ns 1.08
batchedmm(2, Bsize=128)/zygote/CPU/2 thread(s) 5396 ns 5270.5 ns 1.02
batchedmm(2, Bsize=128)/zygote/CPU/4 thread(s) 5229 ns 5354 ns 0.98
batchedmm(2, Bsize=128)/zygote/CPU/8 thread(s) 5500 ns 5375 ns 1.02
batchedmm(2, Bsize=128)/zygote/CPU/1 thread(s) 6583 ns 6625 ns 0.99
batchedmm(2, Bsize=128)/zygote/GPU/CUDA 143042 ns 140733 ns 1.02
batchedmm(2, Bsize=128)/zygote/GPU/AMDGPU 400755 ns 394064.5 ns 1.02
batchnorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/2 thread(s) 375 ns 250 ns 1.50
batchnorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/4 thread(s) 375 ns 292 ns 1.28
batchnorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/8 thread(s) 375 ns 250 ns 1.50
batchnorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/1 thread(s) 292 ns 292 ns 1
batchnorm(2, act=identity, affine=true)(4 x 32)/forward/GPU/CUDA 26809 ns 26135 ns 1.03
batchnorm(2, act=identity, affine=true)(4 x 32)/forward/GPU/oneAPI 1181881 ns 1123770.5 ns 1.05
batchnorm(2, act=identity, affine=true)(4 x 32)/forward/GPU/Metal 443542 ns 474625 ns 0.93
batchnorm(2, act=identity, affine=true)(4 x 32)/forward/GPU/AMDGPU 50281 ns 50311 ns 1.00
batchnorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/2 thread(s) 6708 ns 6375 ns 1.05
batchnorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/4 thread(s) 6667 ns 6145.5 ns 1.08
batchnorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/8 thread(s) 7000 ns 6416 ns 1.09
batchnorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/1 thread(s) 6500 ns 6416 ns 1.01
batchnorm(2, act=identity, affine=true)(4 x 32)/zygote/GPU/CUDA 190632 ns 187828 ns 1.01
batchnorm(2, act=identity, affine=true)(4 x 32)/zygote/GPU/oneAPI 23906222 ns 23626156 ns 1.01
batchnorm(2, act=identity, affine=true)(4 x 32)/zygote/GPU/Metal 5728875 ns 5544437.5 ns 1.03
batchnorm(2, act=identity, affine=true)(4 x 32)/zygote/GPU/AMDGPU 430065 ns 395104 ns 1.09
batchnorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/2 thread(s) 1959 ns 2042 ns 0.96
batchnorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/4 thread(s) 2083 ns 2000 ns 1.04
batchnorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/8 thread(s) 2083 ns 1959 ns 1.06
batchnorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/1 thread(s) 1958 ns 2000 ns 0.98
batchnorm(2, act=gelu, affine=true)(32 x 32)/forward/GPU/CUDA 27026 ns 26544 ns 1.02
batchnorm(2, act=gelu, affine=true)(32 x 32)/forward/GPU/oneAPI 1206962 ns 1165809 ns 1.04
batchnorm(2, act=gelu, affine=true)(32 x 32)/forward/GPU/Metal 452063 ns 461708.5 ns 0.98
batchnorm(2, act=gelu, affine=true)(32 x 32)/forward/GPU/AMDGPU 211793 ns 209972 ns 1.01
batchnorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/2 thread(s) 16125 ns 15792 ns 1.02
batchnorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/4 thread(s) 17333 ns 16375 ns 1.06
batchnorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/8 thread(s) 17125 ns 17000 ns 1.01
batchnorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/1 thread(s) 16791.5 ns 16084 ns 1.04
batchnorm(2, act=gelu, affine=true)(32 x 32)/zygote/GPU/CUDA 279731.5 ns 275962 ns 1.01
batchnorm(2, act=gelu, affine=true)(32 x 32)/zygote/GPU/oneAPI 24291591 ns 24890960.5 ns 0.98
batchnorm(2, act=gelu, affine=true)(32 x 32)/zygote/GPU/Metal 5504875.5 ns 5972833 ns 0.92
batchnorm(2, act=gelu, affine=true)(32 x 32)/zygote/GPU/AMDGPU 753090 ns 713667.5 ns 1.06
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 151625 ns 178250 ns 0.85
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 151333.5 ns 184187.5 ns 0.82
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 151250 ns 153417 ns 0.99
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 153000 ns 147459 ns 1.04
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/CUDA 210001.5 ns 204372 ns 1.03
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/oneAPI 7845014 ns 7857309.5 ns 1.00
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/Metal 1393000 ns 1392667 ns 1.00
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/AMDGPU 239913 ns 196752 ns 1.22
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 1290166 ns 1326895.5 ns 0.97
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 1317125 ns 1320625 ns 1.00
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 1254583 ns 1330833 ns 0.94
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 1341125 ns 1334750 ns 1.00
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 935505 ns 917280 ns 1.02
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/oneAPI 46976250.5 ns 46023181 ns 1.02
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/Metal 6512417 ns 6714958.5 ns 0.97
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/AMDGPU 1051794 ns 1108992 ns 0.95
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 25188 ns 25229.5 ns 1.00
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 25833 ns 26583 ns 0.97
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 27084 ns 26833 ns 1.01
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 24375 ns 25917 ns 0.94
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/CUDA 241581 ns 239791.5 ns 1.01
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/oneAPI 7794179 ns 7972748 ns 0.98
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/Metal 632583.5 ns 980542 ns 0.65
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/AMDGPU 120932 ns 116941 ns 1.03
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 118854.5 ns 179917 ns 0.66
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 117645.5 ns 141604.5 ns 0.83
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 119250 ns 127354.5 ns 0.94
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 118000 ns 118604 ns 0.99
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 1108020 ns 1092585 ns 1.01
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/oneAPI 48152430 ns 43816902.5 ns 1.10
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/Metal 6196979.5 ns 6033333 ns 1.03
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/AMDGPU 640758 ns 606086 ns 1.06
batchnorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/2 thread(s) 333 ns 291 ns 1.14
batchnorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/4 thread(s) 334 ns 334 ns 1
batchnorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/8 thread(s) 375 ns 292 ns 1.28
batchnorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/1 thread(s) 250 ns 292 ns 0.86
batchnorm(2, act=relu, affine=true)(4 x 32)/forward/GPU/CUDA 23090 ns 22970 ns 1.01
batchnorm(2, act=relu, affine=true)(4 x 32)/forward/GPU/oneAPI 1208429 ns 1175116 ns 1.03
batchnorm(2, act=relu, affine=true)(4 x 32)/forward/GPU/Metal 442500 ns 456125 ns 0.97
batchnorm(2, act=relu, affine=true)(4 x 32)/forward/GPU/AMDGPU 51371 ns 48591 ns 1.06
batchnorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/2 thread(s) 6437.5 ns 6625 ns 0.97
batchnorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/4 thread(s) 6667 ns 6750 ns 0.99
batchnorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/8 thread(s) 7062.5 ns 6542 ns 1.08
batchnorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/1 thread(s) 6417 ns 6459 ns 0.99
batchnorm(2, act=relu, affine=true)(4 x 32)/zygote/GPU/CUDA 208209 ns 204628 ns 1.02
batchnorm(2, act=relu, affine=true)(4 x 32)/zygote/GPU/oneAPI 23792216 ns 23603781 ns 1.01
batchnorm(2, act=relu, affine=true)(4 x 32)/zygote/GPU/Metal 5691062.5 ns 6092458 ns 0.93
batchnorm(2, act=relu, affine=true)(4 x 32)/zygote/GPU/AMDGPU 437776 ns 397554 ns 1.10
layernorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/2 thread(s) 6791 ns 6125 ns 1.11
layernorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/4 thread(s) 6500 ns 6334 ns 1.03
layernorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/8 thread(s) 7979 ns 6709 ns 1.19
layernorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/1 thread(s) 6000 ns 5937.5 ns 1.01
layernorm(2, act=gelu, affine=true)(4 x 32)/forward/GPU/CUDA 149235 ns 147027 ns 1.02
layernorm(2, act=gelu, affine=true)(4 x 32)/forward/GPU/oneAPI 5562220 ns 5559804 ns 1.00
layernorm(2, act=gelu, affine=true)(4 x 32)/forward/GPU/Metal 701208.5 ns 583167 ns 1.20
layernorm(2, act=gelu, affine=true)(4 x 32)/forward/GPU/AMDGPU 243743.5 ns 237472 ns 1.03
layernorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/2 thread(s) 10208.5 ns 9666.5 ns 1.06
layernorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/4 thread(s) 10542 ns 10041 ns 1.05
layernorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/8 thread(s) 10375 ns 10041 ns 1.03
layernorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/1 thread(s) 9750 ns 9854 ns 0.99
layernorm(2, act=gelu, affine=true)(4 x 32)/zygote/GPU/CUDA 922802 ns 910526.5 ns 1.01
layernorm(2, act=gelu, affine=true)(4 x 32)/zygote/GPU/oneAPI 40299139 ns 39406121 ns 1.02
layernorm(2, act=gelu, affine=true)(4 x 32)/zygote/GPU/Metal 5880937.5 ns 5909375 ns 1.00
layernorm(2, act=gelu, affine=true)(4 x 32)/zygote/GPU/AMDGPU 710000 ns 686288 ns 1.03
dense(2, bias=false, act=gelu)(2 x 128)/forward/CPU/2 thread(s) 708 ns 666 ns 1.06
dense(2, bias=false, act=gelu)(2 x 128)/forward/CPU/4 thread(s) 667 ns 667 ns 1
dense(2, bias=false, act=gelu)(2 x 128)/forward/CPU/8 thread(s) 667 ns 667 ns 1
dense(2, bias=false, act=gelu)(2 x 128)/forward/CPU/1 thread(s) 667 ns 667 ns 1
dense(2, bias=false, act=gelu)(2 x 128)/forward/GPU/CUDA 23085 ns 22655 ns 1.02
dense(2, bias=false, act=gelu)(2 x 128)/forward/GPU/oneAPI 2118566 ns 2037996 ns 1.04
dense(2, bias=false, act=gelu)(2 x 128)/forward/GPU/Metal 222959 ns 222583 ns 1.00
dense(2, bias=false, act=gelu)(2 x 128)/forward/GPU/AMDGPU 219753 ns 215862 ns 1.02
dense(2, bias=false, act=gelu)(2 x 128)/zygote/CPU/2 thread(s) 4584 ns 4584 ns 1
dense(2, bias=false, act=gelu)(2 x 128)/zygote/CPU/4 thread(s) 4834 ns 4584 ns 1.05
dense(2, bias=false, act=gelu)(2 x 128)/zygote/CPU/8 thread(s) 4875 ns 4625 ns 1.05
dense(2, bias=false, act=gelu)(2 x 128)/zygote/CPU/1 thread(s) 4625 ns 4625 ns 1
dense(2, bias=false, act=gelu)(2 x 128)/zygote/GPU/CUDA 233956.5 ns 232442.5 ns 1.01
dense(2, bias=false, act=gelu)(2 x 128)/zygote/GPU/oneAPI 10123278 ns 9881227 ns 1.02
dense(2, bias=false, act=gelu)(2 x 128)/zygote/GPU/Metal 1643667 ns 1690521 ns 0.97
dense(2, bias=false, act=gelu)(2 x 128)/zygote/GPU/AMDGPU 602789 ns 600181 ns 1.00
groupnorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/2 thread(s) 8083.5 ns 8562.5 ns 0.94
groupnorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/4 thread(s) 8791 ns 7937.5 ns 1.11
groupnorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/8 thread(s) 9937.5 ns 9771 ns 1.02
groupnorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/1 thread(s) 8000 ns 8520.5 ns 0.94
groupnorm(2, act=identity, affine=true)(4 x 32)/forward/GPU/CUDA 124933.5 ns 122197 ns 1.02
groupnorm(2, act=identity, affine=true)(4 x 32)/forward/GPU/oneAPI 4600589 ns 3361719 ns 1.37
groupnorm(2, act=identity, affine=true)(4 x 32)/forward/GPU/Metal 757833.5 ns 761542 ns 1.00
groupnorm(2, act=identity, affine=true)(4 x 32)/forward/GPU/AMDGPU 86781 ns 76241 ns 1.14
groupnorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/2 thread(s) 8646 ns 8792 ns 0.98
groupnorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/4 thread(s) 8958 ns 8459 ns 1.06
groupnorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/8 thread(s) 8604.5 ns 8875 ns 0.97
groupnorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/1 thread(s) 8812.5 ns 8750 ns 1.01
groupnorm(2, act=identity, affine=true)(4 x 32)/zygote/GPU/CUDA 604520 ns 595652 ns 1.01
groupnorm(2, act=identity, affine=true)(4 x 32)/zygote/GPU/oneAPI 20356936 ns 20278296 ns 1.00
groupnorm(2, act=identity, affine=true)(4 x 32)/zygote/GPU/Metal 4765146 ns 4718125 ns 1.01
groupnorm(2, act=identity, affine=true)(4 x 32)/zygote/GPU/AMDGPU 383405 ns 354274 ns 1.08
batchedmm(128, Bsize=4)/forward/CPU/2 thread(s) 128437.5 ns 125917 ns 1.02
batchedmm(128, Bsize=4)/forward/CPU/4 thread(s) 129625 ns 128958 ns 1.01
batchedmm(128, Bsize=4)/forward/CPU/8 thread(s) 130042 ns 96959 ns 1.34
batchedmm(128, Bsize=4)/forward/CPU/1 thread(s) 182834 ns 181416 ns 1.01
batchedmm(128, Bsize=4)/forward/GPU/CUDA 46497 ns 46106 ns 1.01
batchedmm(128, Bsize=4)/forward/GPU/AMDGPU 103662 ns 96666 ns 1.07
batchedmm(128, Bsize=4)/zygote/CPU/2 thread(s) 302563 ns 317875 ns 0.95
batchedmm(128, Bsize=4)/zygote/CPU/4 thread(s) 327249.5 ns 346375 ns 0.94
batchedmm(128, Bsize=4)/zygote/CPU/8 thread(s) 313708 ns 178979 ns 1.75
batchedmm(128, Bsize=4)/zygote/CPU/1 thread(s) 603874.5 ns 569062.5 ns 1.06
batchedmm(128, Bsize=4)/zygote/GPU/CUDA 195592 ns 191966 ns 1.02
batchedmm(128, Bsize=4)/zygote/GPU/AMDGPU 538948 ns 487875 ns 1.10
dense(512, bias=true, act=identity)(512 x 128)/forward/CPU/2 thread(s) 397167 ns 397125 ns 1.00
dense(512, bias=true, act=identity)(512 x 128)/forward/CPU/4 thread(s) 288750 ns 288292 ns 1.00
dense(512, bias=true, act=identity)(512 x 128)/forward/CPU/8 thread(s) 288375 ns 215791 ns 1.34
dense(512, bias=true, act=identity)(512 x 128)/forward/CPU/1 thread(s) 757125 ns 757959 ns 1.00
dense(512, bias=true, act=identity)(512 x 128)/forward/GPU/CUDA 44085 ns 43243.5 ns 1.02
dense(512, bias=true, act=identity)(512 x 128)/forward/GPU/oneAPI 1443281 ns 1345812 ns 1.07
dense(512, bias=true, act=identity)(512 x 128)/forward/GPU/Metal 421187.5 ns 404062.5 ns 1.04
dense(512, bias=true, act=identity)(512 x 128)/forward/GPU/AMDGPU 87081 ns 83381 ns 1.04
dense(512, bias=true, act=identity)(512 x 128)/zygote/CPU/2 thread(s) 1454396 ns 1459854 ns 1.00
dense(512, bias=true, act=identity)(512 x 128)/zygote/CPU/4 thread(s) 1135375 ns 1136645.5 ns 1.00
dense(512, bias=true, act=identity)(512 x 128)/zygote/CPU/8 thread(s) 1132041.5 ns 865270.5 ns 1.31
dense(512, bias=true, act=identity)(512 x 128)/zygote/CPU/1 thread(s) 2362187.5 ns 2359813 ns 1.00
dense(512, bias=true, act=identity)(512 x 128)/zygote/GPU/CUDA 253149 ns 259216 ns 0.98
dense(512, bias=true, act=identity)(512 x 128)/zygote/GPU/oneAPI 11724690 ns 11177773 ns 1.05
dense(512, bias=true, act=identity)(512 x 128)/zygote/GPU/Metal 1784562.5 ns 1833666 ns 0.97
dense(512, bias=true, act=identity)(512 x 128)/zygote/GPU/AMDGPU 361220.5 ns 349653.5 ns 1.03
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 657875 ns 642333 ns 1.02
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 636062.5 ns 649875 ns 0.98
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 645333.5 ns 660416.5 ns 0.98
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 659250 ns 623542 ns 1.06
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/CUDA 206301 ns 202604 ns 1.02
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/oneAPI 8383365 ns 7957177 ns 1.05
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/Metal 1348479 ns 1348791.5 ns 1.00
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/AMDGPU 294724 ns 265108 ns 1.11
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 2436771 ns 2448583 ns 1.00
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 2445750 ns 2452104 ns 1.00
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 2450125 ns 2473833 ns 0.99
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 2475125 ns 2455791 ns 1.01
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 1021725.5 ns 1005284.5 ns 1.02
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/oneAPI 53051132.5 ns 50767854.5 ns 1.04
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/Metal 9126083 ns 10026166 ns 0.91
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/AMDGPU 1419802 ns 1511186 ns 0.94
batchedmm(2, Bsize=32)/forward/CPU/2 thread(s) 33812.5 ns 32375 ns 1.04
batchedmm(2, Bsize=32)/forward/CPU/4 thread(s) 35417 ns 35749.5 ns 0.99
batchedmm(2, Bsize=32)/forward/CPU/8 thread(s) 34708.5 ns 34312.5 ns 1.01
batchedmm(2, Bsize=32)/forward/CPU/1 thread(s) 833 ns 916 ns 0.91
batchedmm(2, Bsize=32)/forward/GPU/CUDA 16136 ns 15700 ns 1.03
batchedmm(2, Bsize=32)/forward/GPU/AMDGPU 95052 ns 81140 ns 1.17
batchedmm(2, Bsize=32)/zygote/CPU/2 thread(s) 3083 ns 3166 ns 0.97
batchedmm(2, Bsize=32)/zygote/CPU/4 thread(s) 3333 ns 3083 ns 1.08
batchedmm(2, Bsize=32)/zygote/CPU/8 thread(s) 3333 ns 3125 ns 1.07
batchedmm(2, Bsize=32)/zygote/CPU/1 thread(s) 3167 ns 3000 ns 1.06
batchedmm(2, Bsize=32)/zygote/GPU/CUDA 142067 ns 139352.5 ns 1.02
batchedmm(2, Bsize=32)/zygote/GPU/AMDGPU 389191.5 ns 344664 ns 1.13
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 406166 ns 405583 ns 1.00
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 408167 ns 408750 ns 1.00
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 407833 ns 403083 ns 1.01
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 422208 ns 422042 ns 1.00
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/CUDA 43553 ns 43343.5 ns 1.00
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/oneAPI 1368822.5 ns 1354478 ns 1.01
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/Metal 1102709 ns 1109583 ns 0.99
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/AMDGPU 245914 ns 240442 ns 1.02
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 3863750 ns 3869125 ns 1.00
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 3994375.5 ns 3994396 ns 1.00
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 3960812 ns 3999708 ns 0.99
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 3790666.5 ns 3774354.5 ns 1.00
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 247777 ns 244251 ns 1.01
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/oneAPI 39920104.5 ns 35978667 ns 1.11
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/Metal 11733541 ns 11608750 ns 1.01
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/AMDGPU 1266155.5 ns 1245273.5 ns 1.02
dense(32, bias=false, act=identity)(32 x 128)/forward/CPU/2 thread(s) 3917 ns 3917 ns 1
dense(32, bias=false, act=identity)(32 x 128)/forward/CPU/4 thread(s) 3958 ns 3917 ns 1.01
dense(32, bias=false, act=identity)(32 x 128)/forward/CPU/8 thread(s) 3958 ns 3917 ns 1.01
dense(32, bias=false, act=identity)(32 x 128)/forward/CPU/1 thread(s) 3875 ns 3958 ns 0.98
dense(32, bias=false, act=identity)(32 x 128)/forward/GPU/CUDA 34512 ns 34866 ns 0.99
dense(32, bias=false, act=identity)(32 x 128)/forward/GPU/oneAPI 1230578 ns 1227111 ns 1.00
dense(32, bias=false, act=identity)(32 x 128)/forward/GPU/Metal 176084 ns 175291 ns 1.00
dense(32, bias=false, act=identity)(32 x 128)/forward/GPU/AMDGPU 43730 ns 42710 ns 1.02
dense(32, bias=false, act=identity)(32 x 128)/zygote/CPU/2 thread(s) 15833 ns 15750 ns 1.01
dense(32, bias=false, act=identity)(32 x 128)/zygote/CPU/4 thread(s) 16042 ns 15667 ns 1.02
dense(32, bias=false, act=identity)(32 x 128)/zygote/CPU/8 thread(s) 16000 ns 15500 ns 1.03
dense(32, bias=false, act=identity)(32 x 128)/zygote/CPU/1 thread(s) 15417 ns 15542 ns 0.99
dense(32, bias=false, act=identity)(32 x 128)/zygote/GPU/CUDA 258709.5 ns 256386 ns 1.01
dense(32, bias=false, act=identity)(32 x 128)/zygote/GPU/oneAPI 8846251 ns 8908913 ns 0.99
dense(32, bias=false, act=identity)(32 x 128)/zygote/GPU/Metal 862749.5 ns 872958 ns 0.99
dense(32, bias=false, act=identity)(32 x 128)/zygote/GPU/AMDGPU 194923 ns 174412 ns 1.12
dense(512, bias=false, act=relu)(512 x 128)/forward/CPU/2 thread(s) 404209 ns 404166 ns 1.00
dense(512, bias=false, act=relu)(512 x 128)/forward/CPU/4 thread(s) 295750 ns 295666 ns 1.00
dense(512, bias=false, act=relu)(512 x 128)/forward/CPU/8 thread(s) 295459 ns 221625 ns 1.33
dense(512, bias=false, act=relu)(512 x 128)/forward/CPU/1 thread(s) 761083 ns 760500 ns 1.00
dense(512, bias=false, act=relu)(512 x 128)/forward/GPU/CUDA 113512 ns 113218 ns 1.00
dense(512, bias=false, act=relu)(512 x 128)/forward/GPU/oneAPI 1027530 ns 1016425 ns 1.01
dense(512, bias=false, act=relu)(512 x 128)/forward/GPU/Metal 395541 ns 393437 ns 1.01
dense(512, bias=false, act=relu)(512 x 128)/forward/GPU/AMDGPU 92702 ns 90851 ns 1.02
dense(512, bias=false, act=relu)(512 x 128)/zygote/CPU/2 thread(s) 1482166 ns 1473333 ns 1.01
dense(512, bias=false, act=relu)(512 x 128)/zygote/CPU/4 thread(s) 1160146 ns 1161666 ns 1.00
dense(512, bias=false, act=relu)(512 x 128)/zygote/CPU/8 thread(s) 1157000 ns 888166.5 ns 1.30
dense(512, bias=false, act=relu)(512 x 128)/zygote/CPU/1 thread(s) 2385416 ns 2383791 ns 1.00
dense(512, bias=false, act=relu)(512 x 128)/zygote/GPU/CUDA 244710.5 ns 241468.5 ns 1.01
dense(512, bias=false, act=relu)(512 x 128)/zygote/GPU/oneAPI 11288335 ns 11846004 ns 0.95
dense(512, bias=false, act=relu)(512 x 128)/zygote/GPU/Metal 1872833 ns 1877938 ns 1.00
dense(512, bias=false, act=relu)(512 x 128)/zygote/GPU/AMDGPU 361906 ns 360704 ns 1.00
batchnorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/2 thread(s) 500 ns 500 ns 1
batchnorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/4 thread(s) 583 ns 583 ns 1
batchnorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/8 thread(s) 584 ns 459 ns 1.27
batchnorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/1 thread(s) 459 ns 542 ns 0.85
batchnorm(2, act=gelu, affine=true)(4 x 32)/forward/GPU/CUDA 26536 ns 25943 ns 1.02
batchnorm(2, act=gelu, affine=true)(4 x 32)/forward/GPU/oneAPI 1153344.5 ns 1192515 ns 0.97
batchnorm(2, act=gelu, affine=true)(4 x 32)/forward/GPU/Metal 444792 ns 470937.5 ns 0.94
batchnorm(2, act=gelu, affine=true)(4 x 32)/forward/GPU/AMDGPU 213574 ns 208143 ns 1.03
batchnorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/2 thread(s) 7542 ns 7458 ns 1.01
batchnorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/4 thread(s) 7791 ns 7583 ns 1.03
batchnorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/8 thread(s) 8083 ns 7458 ns 1.08
batchnorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/1 thread(s) 7500 ns 7709 ns 0.97
batchnorm(2, act=gelu, affine=true)(4 x 32)/zygote/GPU/CUDA 215961 ns 214477.5 ns 1.01
batchnorm(2, act=gelu, affine=true)(4 x 32)/zygote/GPU/oneAPI 25096072 ns 25777295.5 ns 0.97
batchnorm(2, act=gelu, affine=true)(4 x 32)/zygote/GPU/Metal 5807500 ns 5998979.5 ns 0.97
batchnorm(2, act=gelu, affine=true)(4 x 32)/zygote/GPU/AMDGPU 743182 ns 700287 ns 1.06
batchedmm(128, Bsize=32)/forward/CPU/2 thread(s) 833166.5 ns 831271 ns 1.00
batchedmm(128, Bsize=32)/forward/CPU/4 thread(s) 620875 ns 617041 ns 1.01
batchedmm(128, Bsize=32)/forward/CPU/8 thread(s) 619375 ns 470000 ns 1.32
batchedmm(128, Bsize=32)/forward/CPU/1 thread(s) 1570729.5 ns 1545709 ns 1.02
batchedmm(128, Bsize=32)/forward/GPU/CUDA 129541 ns 129860.5 ns 1.00
batchedmm(128, Bsize=32)/forward/GPU/AMDGPU 179493 ns 169171.5 ns 1.06
batchedmm(128, Bsize=32)/zygote/CPU/2 thread(s) 2687271.5 ns 2689145.5 ns 1.00
batchedmm(128, Bsize=32)/zygote/CPU/4 thread(s) 2002417 ns 2013250 ns 0.99
batchedmm(128, Bsize=32)/zygote/CPU/8 thread(s) 2005437.5 ns 1538125 ns 1.30
batchedmm(128, Bsize=32)/zygote/CPU/1 thread(s) 4951500 ns 4941375 ns 1.00
batchedmm(128, Bsize=32)/zygote/GPU/CUDA 258930 ns 241461 ns 1.07
batchedmm(128, Bsize=32)/zygote/GPU/AMDGPU 923056 ns 867019 ns 1.06
batchnorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/2 thread(s) 333 ns 291 ns 1.14
batchnorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/4 thread(s) 375 ns 375 ns 1
batchnorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/8 thread(s) 375 ns 333 ns 1.13
batchnorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/1 thread(s) 292 ns 292 ns 1
batchnorm(2, act=relu, affine=false)(4 x 32)/forward/GPU/CUDA 32385 ns 31985 ns 1.01
batchnorm(2, act=relu, affine=false)(4 x 32)/forward/GPU/oneAPI 1154161 ns 1142400.5 ns 1.01
batchnorm(2, act=relu, affine=false)(4 x 32)/forward/GPU/Metal 273792 ns 453291.5 ns 0.60
batchnorm(2, act=relu, affine=false)(4 x 32)/forward/GPU/AMDGPU 51031 ns 48580 ns 1.05
batchnorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/2 thread(s) 6417 ns 6250 ns 1.03
batchnorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/4 thread(s) 6750 ns 6375 ns 1.06
batchnorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/8 thread(s) 6875 ns 6416 ns 1.07
batchnorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/1 thread(s) 6375 ns 6166 ns 1.03
batchnorm(2, act=relu, affine=false)(4 x 32)/zygote/GPU/CUDA 229170 ns 224593 ns 1.02
batchnorm(2, act=relu, affine=false)(4 x 32)/zygote/GPU/oneAPI 21853717 ns 21127237.5 ns 1.03
batchnorm(2, act=relu, affine=false)(4 x 32)/zygote/GPU/Metal 5377542 ns 5053916 ns 1.06
batchnorm(2, act=relu, affine=false)(4 x 32)/zygote/GPU/AMDGPU 408297 ns 372504 ns 1.10
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 2397334 ns 2423917 ns 0.99
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 2395792 ns 2397291.5 ns 1.00
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 2399146 ns 2403792 ns 1.00
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 2411125 ns 2371125 ns 1.02
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/CUDA 204989 ns 203214 ns 1.01
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/oneAPI 7992865.5 ns 8123069 ns 0.98
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/Metal 1435875 ns 1393562 ns 1.03
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/AMDGPU 384237 ns 332763.5 ns 1.15
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 4636500 ns 4645250 ns 1.00
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 4665291.5 ns 4645125 ns 1.00
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 4661083 ns 4654250 ns 1.00
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 4655083 ns 4658042 ns 1.00
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 927269 ns 910071 ns 1.02
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/oneAPI 45794637 ns 48057492 ns 0.95
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/Metal 6682583.5 ns 6619584 ns 1.01
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/AMDGPU 1310394 ns 1416215 ns 0.93
bias_activation(512, act=relu)(512 x 128)/forward/CPU/2 thread(s) 7479.5 ns 7438 ns 1.01
bias_activation(512, act=relu)(512 x 128)/forward/CPU/4 thread(s) 7500 ns 7083 ns 1.06
bias_activation(512, act=relu)(512 x 128)/forward/CPU/8 thread(s) 7542 ns 6958 ns 1.08
bias_activation(512, act=relu)(512 x 128)/forward/CPU/1 thread(s) 6708 ns 6979 ns 0.96
bias_activation(512, act=relu)(512 x 128)/forward/GPU/CUDA 23953 ns 23722 ns 1.01
bias_activation(512, act=relu)(512 x 128)/forward/GPU/oneAPI 1173979.5 ns 1176238 ns 1.00
bias_activation(512, act=relu)(512 x 128)/forward/GPU/Metal 270854 ns 263000 ns 1.03
bias_activation(512, act=relu)(512 x 128)/forward/GPU/AMDGPU 42035.5 ns 34150 ns 1.23
bias_activation(512, act=relu)(512 x 128)/zygote/CPU/2 thread(s) 67667 ns 68020.5 ns 0.99
bias_activation(512, act=relu)(512 x 128)/zygote/CPU/4 thread(s) 47812.5 ns 50312 ns 0.95
bias_activation(512, act=relu)(512 x 128)/zygote/CPU/8 thread(s) 33917 ns 53292 ns 0.64
bias_activation(512, act=relu)(512 x 128)/zygote/CPU/1 thread(s) 32979.5 ns 32583 ns 1.01
bias_activation(512, act=relu)(512 x 128)/zygote/GPU/CUDA 222148 ns 218170 ns 1.02
bias_activation(512, act=relu)(512 x 128)/zygote/GPU/oneAPI 10439799.5 ns 10824043 ns 0.96
bias_activation(512, act=relu)(512 x 128)/zygote/GPU/Metal 1991792 ns 2030958 ns 0.98
bias_activation(512, act=relu)(512 x 128)/zygote/GPU/AMDGPU 267010 ns 244333 ns 1.09
batchedmm(2, Bsize=512)/forward/CPU/2 thread(s) 21770.5 ns 21437 ns 1.02
batchedmm(2, Bsize=512)/forward/CPU/4 thread(s) 26438 ns 25333 ns 1.04
batchedmm(2, Bsize=512)/forward/CPU/8 thread(s) 24959 ns 23479.5 ns 1.06
batchedmm(2, Bsize=512)/forward/CPU/1 thread(s) 5458 ns 6083 ns 0.90
batchedmm(2, Bsize=512)/forward/GPU/CUDA 17080.5 ns 16786.5 ns 1.02
batchedmm(2, Bsize=512)/forward/GPU/AMDGPU 83551 ns 91501 ns 0.91
batchedmm(2, Bsize=512)/zygote/CPU/2 thread(s) 11958.5 ns 12208.5 ns 0.98
batchedmm(2, Bsize=512)/zygote/CPU/4 thread(s) 10958 ns 10083 ns 1.09
batchedmm(2, Bsize=512)/zygote/CPU/8 thread(s) 10583 ns 9458.5 ns 1.12
batchedmm(2, Bsize=512)/zygote/CPU/1 thread(s) 17854.5 ns 17854.5 ns 1
batchedmm(2, Bsize=512)/zygote/GPU/CUDA 230933 ns 228126 ns 1.01
batchedmm(2, Bsize=512)/zygote/GPU/AMDGPU 405258 ns 376824 ns 1.08
dense(512, bias=true, act=relu)(512 x 128)/forward/CPU/2 thread(s) 406187.5 ns 406500 ns 1.00
dense(512, bias=true, act=relu)(512 x 128)/forward/CPU/4 thread(s) 297166 ns 297312.5 ns 1.00
dense(512, bias=true, act=relu)(512 x 128)/forward/CPU/8 thread(s) 297187.5 ns 223791 ns 1.33
dense(512, bias=true, act=relu)(512 x 128)/forward/CPU/1 thread(s) 762917 ns 762958 ns 1.00
dense(512, bias=true, act=relu)(512 x 128)/forward/GPU/CUDA 46934 ns 46683 ns 1.01
dense(512, bias=true, act=relu)(512 x 128)/forward/GPU/oneAPI 1403333 ns 1412498.5 ns 0.99
dense(512, bias=true, act=relu)(512 x 128)/forward/GPU/Metal 411187.5 ns 476666.5 ns 0.86
dense(512, bias=true, act=relu)(512 x 128)/forward/GPU/AMDGPU 92831 ns 89121 ns 1.04
dense(512, bias=true, act=relu)(512 x 128)/zygote/CPU/2 thread(s) 1505167 ns 1499875 ns 1.00
dense(512, bias=true, act=relu)(512 x 128)/zygote/CPU/4 thread(s) 1167833 ns 1167833.5 ns 1.00
dense(512, bias=true, act=relu)(512 x 128)/zygote/CPU/8 thread(s) 1168625 ns 894271 ns 1.31
dense(512, bias=true, act=relu)(512 x 128)/zygote/CPU/1 thread(s) 2388625 ns 2389834 ns 1.00
dense(512, bias=true, act=relu)(512 x 128)/zygote/GPU/CUDA 288977 ns 292932.5 ns 0.99
dense(512, bias=true, act=relu)(512 x 128)/zygote/GPU/oneAPI 14170527.5 ns 13048501 ns 1.09
dense(512, bias=true, act=relu)(512 x 128)/zygote/GPU/Metal 2071500 ns 2098166 ns 0.99
dense(512, bias=true, act=relu)(512 x 128)/zygote/GPU/AMDGPU 384147 ns 380285 ns 1.01
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 433875 ns 433875 ns 1
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 437208 ns 436334 ns 1.00
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 436750 ns 430709 ns 1.01
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 448250 ns 448020.5 ns 1.00
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/CUDA 55350 ns 54564 ns 1.01
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/oneAPI 1004353.5 ns 1024914 ns 0.98
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/Metal 1090667 ns 1099208.5 ns 0.99
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/AMDGPU 239614 ns 236522.5 ns 1.01
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 3885209 ns 3897208 ns 1.00
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 4024833 ns 4021833 ns 1.00
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 3930770.5 ns 4027708 ns 0.98
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 3816104 ns 3812146 ns 1.00
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 268116 ns 264154 ns 1.01
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/oneAPI 30218847.5 ns 31494055 ns 0.96
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/Metal 10492979.5 ns 10517749.5 ns 1.00
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/AMDGPU 1300253 ns 1245028 ns 1.04
dense(32, bias=true, act=gelu)(32 x 128)/forward/CPU/2 thread(s) 8791 ns 8750 ns 1.00
dense(32, bias=true, act=gelu)(32 x 128)/forward/CPU/4 thread(s) 7667 ns 7666 ns 1.00
dense(32, bias=true, act=gelu)(32 x 128)/forward/CPU/8 thread(s) 7667 ns 6834 ns 1.12
dense(32, bias=true, act=gelu)(32 x 128)/forward/CPU/1 thread(s) 12416 ns 12459 ns 1.00
dense(32, bias=true, act=gelu)(32 x 128)/forward/GPU/CUDA 24893 ns 24707 ns 1.01
dense(32, bias=true, act=gelu)(32 x 128)/forward/GPU/oneAPI 2144175 ns 2085760.5 ns 1.03
dense(32, bias=true, act=gelu)(32 x 128)/forward/GPU/Metal 220584 ns 225250 ns 0.98
dense(32, bias=true, act=gelu)(32 x 128)/forward/GPU/AMDGPU 221494 ns 215337.5 ns 1.03
dense(32, bias=true, act=gelu)(32 x 128)/zygote/CPU/2 thread(s) 45125 ns 45042 ns 1.00
dense(32, bias=true, act=gelu)(32 x 128)/zygote/CPU/4 thread(s) 45125 ns 45125 ns 1
dense(32, bias=true, act=gelu)(32 x 128)/zygote/CPU/8 thread(s) 45167 ns 45083 ns 1.00
dense(32, bias=true, act=gelu)(32 x 128)/zygote/CPU/1 thread(s) 45020.5 ns 45187.5 ns 1.00
dense(32, bias=true, act=gelu)(32 x 128)/zygote/GPU/CUDA 350730 ns 350283.5 ns 1.00
dense(32, bias=true, act=gelu)(32 x 128)/zygote/GPU/oneAPI 13707515 ns 11134325 ns 1.23
dense(32, bias=true, act=gelu)(32 x 128)/zygote/GPU/Metal 1817750 ns 1805125 ns 1.01
dense(32, bias=true, act=gelu)(32 x 128)/zygote/GPU/AMDGPU 670762.5 ns 662902 ns 1.01
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 105937.5 ns 93959 ns 1.13
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 84541 ns 129416 ns 0.65
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 88500 ns 87916.5 ns 1.01
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 121479 ns 125062.5 ns 0.97
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/CUDA 189955 ns 189645 ns 1.00
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/oneAPI 5812012.5 ns 5972246.5 ns 0.97
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/Metal 2722250 ns 1906021.5 ns 1.43
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/AMDGPU 187524 ns 201947 ns 0.93
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 2003958 ns 2011375 ns 1.00
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 2016666.5 ns 2017791 ns 1.00
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 1982645.5 ns 2029459 ns 0.98
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 2047125 ns 2017916.5 ns 1.01
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 543645 ns 537811 ns 1.01
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/oneAPI 28323206 ns 27667805 ns 1.02
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/Metal 9581833.5 ns 9734479.5 ns 0.98
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/AMDGPU 990259 ns 1103102 ns 0.90

This comment was automatically generated by workflow using github-action-benchmark.

Please sign in to comment.