Skip to content
This repository has been archived by the owner on Nov 4, 2024. It is now read-only.

Commit

Permalink
test: add tests comparing the fused op with unfused op
Browse files Browse the repository at this point in the history
  • Loading branch information
avik-pal committed Sep 10, 2024
1 parent 40d9192 commit 897d842
Show file tree
Hide file tree
Showing 2 changed files with 22 additions and 7 deletions.
2 changes: 1 addition & 1 deletion Project.toml
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
name = "LuxLib"
uuid = "82251201-b29d-42c6-8e01-566dec8acb11"
authors = ["Avik Pal <[email protected]> and contributors"]
version = "1.2.0"
version = "1.2.1-DEV"

[deps]
ArrayInterface = "4fba245c-0d91-5ea0-9b3e-6abc04ee57a9"
Expand Down
27 changes: 21 additions & 6 deletions test/common_ops/dense_tests.jl
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,9 @@ using LuxLib, LuxTestUtils, Random, Test, Zygote, NNlib, StableRNGs

anonact = x -> x^3

# Unfused reference implementation of a dense layer without a bias:
# a plain matrix product followed by a broadcast of the activation.
function dense_simple(act, w, x, ::Nothing)
    z = w * x
    return act.(z)
end

# Unfused reference implementation of a dense layer with a bias:
# the bias is broadcast-added to the matrix product before the activation.
function dense_simple(act, w, x, b)
    z = w * x
    return act.(z .+ b)
end

function run_dense_testing(Tw, Tx, M, N, hasbias, activation, aType, mode, ongpu)
rng = StableRNG(1234)

Expand Down Expand Up @@ -44,6 +47,20 @@ function run_dense_testing(Tw, Tx, M, N, hasbias, activation, aType, mode, ongpu
(w, x, b) -> __f(activation, w, x, b)
end
test_gradients(__f_grad, w, x, bias; atol, rtol, skip_backends, soft_fail=fp16)

y_simple = dense_simple(activation, w, x, bias)
y_zyg = fused_dense_bias_activation(activation, w, x, bias)
@test y_simple ≈ y_zyg atol=atol rtol=rtol

_, ∂w_true, ∂x_true, ∂b_true = Zygote.gradient(
sum ∘ dense_simple, activation, w, x, bias)
_, ∂w_zyg, ∂x_zyg, ∂b_zyg = Zygote.gradient(
sum ∘ fused_dense_bias_activation, activation, w, x, bias)
@test ∂w_true ≈ ∂w_zyg atol=atol rtol=rtol
@test ∂x_true ≈ ∂x_zyg atol=atol rtol=rtol
if bias !== nothing
@test ∂b_true ≈ ∂b_zyg atol=atol rtol=rtol
end
end

const ALL_TEST_CONFIGS = Iterators.product(
Expand Down Expand Up @@ -149,14 +166,12 @@ end
@testitem "Enzyme.Forward patch: dense" tags=[:dense] setup=[SharedTestSetup] skip=:(using LuxTestUtils; !LuxTestUtils.ENZYME_TESTING_ENABLED) begin
using LuxLib, Random, LuxTestUtils, Enzyme

if LuxTestUtils.ENZYME_TESTING_ENABLED
x = rand(Float32, 2, 2)
x = rand(Float32, 2, 2)

f(x) = sum(abs2, LuxLib.Impl.matmul(x, x))
f(x) = sum(abs2, LuxLib.Impl.matmul(x, x))

# Just test that we don't crash
@test length(Enzyme.gradient(Forward, f, x)) == 4
end
# Just test that we don't crash
@test length(Enzyme.gradient(Forward, f, x)) == 4
end

@testitem "Enzyme rules for fused dense" tags=[:dense] setup=[SharedTestSetup] skip=:(using LuxTestUtils; !LuxTestUtils.ENZYME_TESTING_ENABLED) begin
Expand Down

1 comment on commit 897d842

@github-actions
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

LuxLib Benchmarks

Benchmark suite Current: 897d842 Previous: 40d9192 Ratio
layernorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/2 thread(s) 5312.5 ns 5666 ns 0.94
layernorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/4 thread(s) 7792 ns 7459 ns 1.04
layernorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/8 thread(s) 8000 ns 8458 ns 0.95
layernorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/1 thread(s) 6958.5 ns 7291 ns 0.95
layernorm(2, act=gelu, affine=false)(4 x 32)/forward/GPU/CUDA 119033 ns 119078 ns 1.00
layernorm(2, act=gelu, affine=false)(4 x 32)/forward/GPU/oneAPI 2748386 ns 2538616 ns 1.08
layernorm(2, act=gelu, affine=false)(4 x 32)/forward/GPU/Metal 825375 ns 702792 ns 1.17
layernorm(2, act=gelu, affine=false)(4 x 32)/forward/GPU/AMDGPU 401934 ns 427074 ns 0.94
layernorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/2 thread(s) 9583 ns 10020.5 ns 0.96
layernorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/4 thread(s) 9875 ns 9750 ns 1.01
layernorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/8 thread(s) 9875 ns 10250 ns 0.96
layernorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/1 thread(s) 9979 ns 9895.5 ns 1.01
layernorm(2, act=gelu, affine=false)(4 x 32)/zygote/GPU/CUDA 554263 ns 551531 ns 1.00
layernorm(2, act=gelu, affine=false)(4 x 32)/zygote/GPU/oneAPI 18522254 ns 18148603 ns 1.02
layernorm(2, act=gelu, affine=false)(4 x 32)/zygote/GPU/Metal 2713291 ns 2222000 ns 1.22
layernorm(2, act=gelu, affine=false)(4 x 32)/zygote/GPU/AMDGPU 671997 ns 679576 ns 0.99
bias_activation(32, act=relu)(32 x 128)/forward/CPU/2 thread(s) 7645.5 ns 1271 ns 6.02
bias_activation(32, act=relu)(32 x 128)/forward/CPU/4 thread(s) 7500 ns 2729 ns 2.75
bias_activation(32, act=relu)(32 x 128)/forward/CPU/8 thread(s) 9750 ns 1708.5 ns 5.71
bias_activation(32, act=relu)(32 x 128)/forward/CPU/1 thread(s) 8521 ns 1708.5 ns 4.99
bias_activation(32, act=relu)(32 x 128)/forward/GPU/CUDA 23694 ns 21712 ns 1.09
bias_activation(32, act=relu)(32 x 128)/forward/GPU/oneAPI 1287189.5 ns 1291875 ns 1.00
bias_activation(32, act=relu)(32 x 128)/forward/GPU/Metal 222062.5 ns 183666 ns 1.21
bias_activation(32, act=relu)(32 x 128)/forward/GPU/AMDGPU 31840 ns 31345.5 ns 1.02
bias_activation(32, act=relu)(32 x 128)/zygote/CPU/2 thread(s) 4770.5 ns 3500 ns 1.36
bias_activation(32, act=relu)(32 x 128)/zygote/CPU/4 thread(s) 5041 ns 3333 ns 1.51
bias_activation(32, act=relu)(32 x 128)/zygote/CPU/8 thread(s) 5583.5 ns 4208.5 ns 1.33
bias_activation(32, act=relu)(32 x 128)/zygote/CPU/1 thread(s) 5062 ns 4375 ns 1.16
bias_activation(32, act=relu)(32 x 128)/zygote/GPU/CUDA 145766 ns 146456.5 ns 1.00
bias_activation(32, act=relu)(32 x 128)/zygote/GPU/oneAPI 8553462 ns 8037303.5 ns 1.06
bias_activation(32, act=relu)(32 x 128)/zygote/GPU/Metal 1568604.5 ns 1510917 ns 1.04
bias_activation(32, act=relu)(32 x 128)/zygote/GPU/AMDGPU 146901 ns 146682 ns 1.00
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 56917 ns 56500 ns 1.01
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 47083 ns 46875 ns 1.00
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 47375 ns 46833 ns 1.01
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 82792 ns 83459 ns 0.99
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/CUDA 39154 ns 36990 ns 1.06
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/oneAPI 578385 ns 664843 ns 0.87
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/Metal 1060708 ns 1340625 ns 0.79
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/AMDGPU 81970 ns 80736 ns 1.02
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 2023687.5 ns 2031000 ns 1.00
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 2084333.5 ns 2086333.5 ns 1.00
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 2097166.5 ns 2089292 ns 1.00
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 1996667 ns 1995354 ns 1.00
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 220055 ns 232927.5 ns 0.94
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/oneAPI 7652457 ns 7734526 ns 0.99
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/Metal 5389292 ns 4323958 ns 1.25
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/AMDGPU 1353254 ns 1581446 ns 0.86
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 147146 ns 147042 ns 1.00
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 149750 ns 144625 ns 1.04
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 146270.5 ns 149833 ns 0.98
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 150667 ns 151895.5 ns 0.99
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/GPU/CUDA 165828.5 ns 166087 ns 1.00
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/GPU/oneAPI 7443338.5 ns 7754863 ns 0.96
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/GPU/Metal 1542042 ns 1479250 ns 1.04
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/GPU/AMDGPU 204932 ns 198942 ns 1.03
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 1114916 ns 1120063 ns 1.00
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 1110250 ns 1117666 ns 0.99
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 1120437.5 ns 1115750 ns 1.00
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 1114709 ns 1124875 ns 0.99
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 688383 ns 721156.5 ns 0.95
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/oneAPI 33804204 ns 33562933.5 ns 1.01
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/Metal 6685792 ns 6149062.5 ns 1.09
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/AMDGPU 1030010.5 ns 1022579 ns 1.01
layernorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/2 thread(s) 4479 ns 4166 ns 1.08
layernorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/4 thread(s) 5417 ns 5041.5 ns 1.07
layernorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/8 thread(s) 5583 ns 6042 ns 0.92
layernorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/1 thread(s) 4334 ns 6250 ns 0.69
layernorm(2, act=relu, affine=true)(4 x 32)/forward/GPU/CUDA 91302 ns 95202.5 ns 0.96
layernorm(2, act=relu, affine=true)(4 x 32)/forward/GPU/oneAPI 5274846.5 ns 5313078 ns 0.99
layernorm(2, act=relu, affine=true)(4 x 32)/forward/GPU/Metal 449229 ns 416333.5 ns 1.08
layernorm(2, act=relu, affine=true)(4 x 32)/forward/GPU/AMDGPU 69581 ns 65661 ns 1.06
layernorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/2 thread(s) 8458 ns 9000 ns 0.94
layernorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/4 thread(s) 8625 ns 8709 ns 0.99
layernorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/8 thread(s) 9292 ns 9375 ns 0.99
layernorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/1 thread(s) 8375 ns 8417 ns 1.00
layernorm(2, act=relu, affine=true)(4 x 32)/zygote/GPU/CUDA 588432.5 ns 618225 ns 0.95
layernorm(2, act=relu, affine=true)(4 x 32)/zygote/GPU/oneAPI 35019318 ns 31699887 ns 1.10
layernorm(2, act=relu, affine=true)(4 x 32)/zygote/GPU/Metal 6040187 ns 5433375 ns 1.11
layernorm(2, act=relu, affine=true)(4 x 32)/zygote/GPU/AMDGPU 387324 ns 388724 ns 1.00
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 17146 ns 16229.5 ns 1.06
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 18438 ns 17500 ns 1.05
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 21500 ns 21916 ns 0.98
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 17229.5 ns 18542 ns 0.93
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/GPU/CUDA 66199 ns 68340 ns 0.97
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/GPU/oneAPI 2949345 ns 3114761 ns 0.95
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/GPU/Metal 1266312.5 ns 455354.5 ns 2.78
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/GPU/AMDGPU 76211 ns 75821 ns 1.01
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 215958 ns 213125 ns 1.01
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 219125 ns 212125 ns 1.03
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 215188 ns 214749.5 ns 1.00
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 221708 ns 223791 ns 0.99
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 351090 ns 361191 ns 0.97
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/oneAPI 14025976.5 ns 13957207 ns 1.00
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/Metal 5667541.5 ns 5399125 ns 1.05
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/AMDGPU 469564 ns 468614 ns 1.00
bias_activation(2, act=relu)(2 x 128)/forward/CPU/2 thread(s) 7584 ns 625 ns 12.13
bias_activation(2, act=relu)(2 x 128)/forward/CPU/4 thread(s) 8166.5 ns 667 ns 12.24
bias_activation(2, act=relu)(2 x 128)/forward/CPU/8 thread(s) 11750 ns 875 ns 13.43
bias_activation(2, act=relu)(2 x 128)/forward/CPU/1 thread(s) 8562.5 ns 708 ns 12.09
bias_activation(2, act=relu)(2 x 128)/forward/GPU/CUDA 22778 ns 20782 ns 1.10
bias_activation(2, act=relu)(2 x 128)/forward/GPU/oneAPI 1173529 ns 1176905 ns 1.00
bias_activation(2, act=relu)(2 x 128)/forward/GPU/Metal 301791 ns 179000 ns 1.69
bias_activation(2, act=relu)(2 x 128)/forward/GPU/AMDGPU 32530 ns 31201 ns 1.04
bias_activation(2, act=relu)(2 x 128)/zygote/CPU/2 thread(s) 2209 ns 1458 ns 1.52
bias_activation(2, act=relu)(2 x 128)/zygote/CPU/4 thread(s) 2417 ns 1500 ns 1.61
bias_activation(2, act=relu)(2 x 128)/zygote/CPU/8 thread(s) 2916.5 ns 1541 ns 1.89
bias_activation(2, act=relu)(2 x 128)/zygote/CPU/1 thread(s) 2375 ns 1333.5 ns 1.78
bias_activation(2, act=relu)(2 x 128)/zygote/GPU/CUDA 126097.5 ns 128010.5 ns 0.99
bias_activation(2, act=relu)(2 x 128)/zygote/GPU/oneAPI 9653655 ns 9057994 ns 1.07
bias_activation(2, act=relu)(2 x 128)/zygote/GPU/Metal 1533792 ns 1474521 ns 1.04
bias_activation(2, act=relu)(2 x 128)/zygote/GPU/AMDGPU 135982 ns 136491 ns 1.00
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 14041 ns 7333 ns 1.91
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 14167 ns 6166 ns 2.30
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 14458 ns 6166 ns 2.34
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 16709 ns 10291 ns 1.62
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/CUDA 32839 ns 24318 ns 1.35
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/oneAPI 1251313.5 ns 1193537 ns 1.05
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/Metal 609208 ns 341583 ns 1.78
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/AMDGPU 56260 ns 47631 ns 1.18
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 227375 ns 231125 ns 0.98
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 275292 ns 270583 ns 1.02
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 275000 ns 270375 ns 1.02
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 261458 ns 213167 ns 1.23
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 202099.5 ns 195209.5 ns 1.04
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/oneAPI 31323717.5 ns 31467862 ns 1.00
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/Metal 8740042 ns 9233666 ns 0.95
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/AMDGPU 655201 ns 645516 ns 1.02
dense(32, bias=false, act=relu)(32 x 128)/forward/CPU/2 thread(s) 4125 ns 4125 ns 1
dense(32, bias=false, act=relu)(32 x 128)/forward/CPU/4 thread(s) 4125 ns 4125 ns 1
dense(32, bias=false, act=relu)(32 x 128)/forward/CPU/8 thread(s) 4125 ns 4125 ns 1
dense(32, bias=false, act=relu)(32 x 128)/forward/CPU/1 thread(s) 4083 ns 4125 ns 0.99
dense(32, bias=false, act=relu)(32 x 128)/forward/GPU/CUDA 22662 ns 23938.5 ns 0.95
dense(32, bias=false, act=relu)(32 x 128)/forward/GPU/oneAPI 2051373 ns 2014824 ns 1.02
dense(32, bias=false, act=relu)(32 x 128)/forward/GPU/Metal 219958 ns 210750 ns 1.04
dense(32, bias=false, act=relu)(32 x 128)/forward/GPU/AMDGPU 46610 ns 48021 ns 0.97
dense(32, bias=false, act=relu)(32 x 128)/zygote/CPU/2 thread(s) 21041 ns 16916 ns 1.24
dense(32, bias=false, act=relu)(32 x 128)/zygote/CPU/4 thread(s) 21791 ns 17417 ns 1.25
dense(32, bias=false, act=relu)(32 x 128)/zygote/CPU/8 thread(s) 22250 ns 17208 ns 1.29
dense(32, bias=false, act=relu)(32 x 128)/zygote/CPU/1 thread(s) 20917 ns 16667 ns 1.25
dense(32, bias=false, act=relu)(32 x 128)/zygote/GPU/CUDA 205015 ns 198962 ns 1.03
dense(32, bias=false, act=relu)(32 x 128)/zygote/GPU/oneAPI 10110167 ns 10294946 ns 0.98
dense(32, bias=false, act=relu)(32 x 128)/zygote/GPU/Metal 975584 ns 900625 ns 1.08
dense(32, bias=false, act=relu)(32 x 128)/zygote/GPU/AMDGPU 182977 ns 172967 ns 1.06
dense(512, bias=false, act=gelu)(512 x 128)/forward/CPU/2 thread(s) 509167 ns 508125 ns 1.00
dense(512, bias=false, act=gelu)(512 x 128)/forward/CPU/4 thread(s) 404417 ns 404416 ns 1.00
dense(512, bias=false, act=gelu)(512 x 128)/forward/CPU/8 thread(s) 405000 ns 404792 ns 1.00
dense(512, bias=false, act=gelu)(512 x 128)/forward/CPU/1 thread(s) 864791 ns 865375 ns 1.00
dense(512, bias=false, act=gelu)(512 x 128)/forward/GPU/CUDA 113334.5 ns 113291 ns 1.00
dense(512, bias=false, act=gelu)(512 x 128)/forward/GPU/oneAPI 392728 ns 429336 ns 0.91
dense(512, bias=false, act=gelu)(512 x 128)/forward/GPU/Metal 421604.5 ns 432708 ns 0.97
dense(512, bias=false, act=gelu)(512 x 128)/forward/GPU/AMDGPU 240942 ns 242113 ns 1.00
dense(512, bias=false, act=gelu)(512 x 128)/zygote/CPU/2 thread(s) 2318229.5 ns 2329437 ns 1.00
dense(512, bias=false, act=gelu)(512 x 128)/zygote/CPU/4 thread(s) 2030833 ns 2034750 ns 1.00
dense(512, bias=false, act=gelu)(512 x 128)/zygote/CPU/8 thread(s) 2041375 ns 2031750 ns 1.00
dense(512, bias=false, act=gelu)(512 x 128)/zygote/CPU/1 thread(s) 3280292 ns 3193375 ns 1.03
dense(512, bias=false, act=gelu)(512 x 128)/zygote/GPU/CUDA 250973.5 ns 246406 ns 1.02
dense(512, bias=false, act=gelu)(512 x 128)/zygote/GPU/oneAPI 12331811 ns 12521873.5 ns 0.98
dense(512, bias=false, act=gelu)(512 x 128)/zygote/GPU/Metal 1903125 ns 1893250 ns 1.01
dense(512, bias=false, act=gelu)(512 x 128)/zygote/GPU/AMDGPU 725307 ns 744268 ns 0.97
layernorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/2 thread(s) 5375 ns 5187.5 ns 1.04
layernorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/4 thread(s) 7604 ns 7083 ns 1.07
layernorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/8 thread(s) 8500 ns 7354 ns 1.16
layernorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/1 thread(s) 6458.5 ns 7542 ns 0.86
layernorm(2, act=relu, affine=false)(32 x 32)/forward/GPU/CUDA 89376.5 ns 93165 ns 0.96
layernorm(2, act=relu, affine=false)(32 x 32)/forward/GPU/oneAPI 5476745.5 ns 5491281 ns 1.00
layernorm(2, act=relu, affine=false)(32 x 32)/forward/GPU/Metal 762334 ns 752833 ns 1.01
layernorm(2, act=relu, affine=false)(32 x 32)/forward/GPU/AMDGPU 64761 ns 65211 ns 0.99
layernorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/2 thread(s) 10583 ns 12167 ns 0.87
layernorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/4 thread(s) 11958 ns 11792 ns 1.01
layernorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/8 thread(s) 11958 ns 12374.5 ns 0.97
layernorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/1 thread(s) 10792 ns 11396 ns 0.95
layernorm(2, act=relu, affine=false)(32 x 32)/zygote/GPU/CUDA 632512 ns 647871 ns 0.98
layernorm(2, act=relu, affine=false)(32 x 32)/zygote/GPU/oneAPI 38621174 ns 39284056 ns 0.98
layernorm(2, act=relu, affine=false)(32 x 32)/zygote/GPU/Metal 5666041 ns 5190667 ns 1.09
layernorm(2, act=relu, affine=false)(32 x 32)/zygote/GPU/AMDGPU 401324 ns 411409 ns 0.98
dense(2, bias=true, act=relu)(2 x 128)/forward/CPU/2 thread(s) 2625 ns 500 ns 5.25
dense(2, bias=true, act=relu)(2 x 128)/forward/CPU/4 thread(s) 2958 ns 541 ns 5.47
dense(2, bias=true, act=relu)(2 x 128)/forward/CPU/8 thread(s) 3250 ns 500 ns 6.50
dense(2, bias=true, act=relu)(2 x 128)/forward/CPU/1 thread(s) 2792 ns 500 ns 5.58
dense(2, bias=true, act=relu)(2 x 128)/forward/GPU/CUDA 30482.5 ns 23724 ns 1.28
dense(2, bias=true, act=relu)(2 x 128)/forward/GPU/oneAPI 2134963.5 ns 2212056 ns 0.97
dense(2, bias=true, act=relu)(2 x 128)/forward/GPU/Metal 340083 ns 204584 ns 1.66
dense(2, bias=true, act=relu)(2 x 128)/forward/GPU/AMDGPU 54341 ns 47141 ns 1.15
dense(2, bias=true, act=relu)(2 x 128)/zygote/CPU/2 thread(s) 10750 ns 2125 ns 5.06
dense(2, bias=true, act=relu)(2 x 128)/zygote/CPU/4 thread(s) 11833 ns 2125 ns 5.57
dense(2, bias=true, act=relu)(2 x 128)/zygote/CPU/8 thread(s) 13000 ns 2167 ns 6.00
dense(2, bias=true, act=relu)(2 x 128)/zygote/CPU/1 thread(s) 10625 ns 2125 ns 5
dense(2, bias=true, act=relu)(2 x 128)/zygote/GPU/CUDA 252151 ns 227021 ns 1.11
dense(2, bias=true, act=relu)(2 x 128)/zygote/GPU/oneAPI 13218302 ns 11087876.5 ns 1.19
dense(2, bias=true, act=relu)(2 x 128)/zygote/GPU/Metal 1962708.5 ns 1921834 ns 1.02
dense(2, bias=true, act=relu)(2 x 128)/zygote/GPU/AMDGPU 189561.5 ns 172882 ns 1.10
groupnorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/2 thread(s) 26500 ns 8208 ns 3.23
groupnorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/4 thread(s) 31771 ns 9146 ns 3.47
groupnorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/8 thread(s) 35000 ns 9959 ns 3.51
groupnorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/1 thread(s) 28479 ns 8375 ns 3.40
groupnorm(2, act=relu, affine=true)(32 x 32)/forward/GPU/CUDA 121854.5 ns 104776 ns 1.16
groupnorm(2, act=relu, affine=true)(32 x 32)/forward/GPU/oneAPI 3225119 ns 3291769.5 ns 0.98
groupnorm(2, act=relu, affine=true)(32 x 32)/forward/GPU/Metal 730917 ns 468500 ns 1.56
groupnorm(2, act=relu, affine=true)(32 x 32)/forward/GPU/AMDGPU 80315.5 ns 72700.5 ns 1.10
groupnorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/2 thread(s) 22791.5 ns 17374.5 ns 1.31
groupnorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/4 thread(s) 25542 ns 18625 ns 1.37
groupnorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/8 thread(s) 25334 ns 18250 ns 1.39
groupnorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/1 thread(s) 23000 ns 18125 ns 1.27
groupnorm(2, act=relu, affine=true)(32 x 32)/zygote/GPU/CUDA 616060 ns 580515 ns 1.06
groupnorm(2, act=relu, affine=true)(32 x 32)/zygote/GPU/oneAPI 18160748 ns 17620571 ns 1.03
groupnorm(2, act=relu, affine=true)(32 x 32)/zygote/GPU/Metal 5306187.5 ns 4970938 ns 1.07
groupnorm(2, act=relu, affine=true)(32 x 32)/zygote/GPU/AMDGPU 388424 ns 381279 ns 1.02
batchnorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/2 thread(s) 1667 ns 459 ns 3.63
batchnorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/4 thread(s) 2000 ns 584 ns 3.42
batchnorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/8 thread(s) 2167 ns 625 ns 3.47
batchnorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/1 thread(s) 1834 ns 458 ns 4.00
batchnorm(2, act=identity, affine=false)(32 x 32)/forward/GPU/CUDA 40493 ns 35839 ns 1.13
batchnorm(2, act=identity, affine=false)(32 x 32)/forward/GPU/oneAPI 1262646 ns 1218575 ns 1.04
batchnorm(2, act=identity, affine=false)(32 x 32)/forward/GPU/Metal 296417 ns 423541 ns 0.70
batchnorm(2, act=identity, affine=false)(32 x 32)/forward/GPU/AMDGPU 48340 ns 46311 ns 1.04
batchnorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/2 thread(s) 10000 ns 9104 ns 1.10
batchnorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/4 thread(s) 11187.5 ns 9333 ns 1.20
batchnorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/8 thread(s) 11958 ns 9083 ns 1.32
batchnorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/1 thread(s) 10583 ns 9208 ns 1.15
batchnorm(2, act=identity, affine=false)(32 x 32)/zygote/GPU/CUDA 266372 ns 261166 ns 1.02
batchnorm(2, act=identity, affine=false)(32 x 32)/zygote/GPU/oneAPI 19255152 ns 18752145 ns 1.03
batchnorm(2, act=identity, affine=false)(32 x 32)/zygote/GPU/Metal 4716875 ns 4335125 ns 1.09
batchnorm(2, act=identity, affine=false)(32 x 32)/zygote/GPU/AMDGPU 379563.5 ns 367929 ns 1.03
dense(512, bias=false, act=identity)(512 x 128)/forward/CPU/2 thread(s) 396417 ns 395708 ns 1.00
dense(512, bias=false, act=identity)(512 x 128)/forward/CPU/4 thread(s) 287875 ns 288375 ns 1.00
dense(512, bias=false, act=identity)(512 x 128)/forward/CPU/8 thread(s) 288125 ns 288375 ns 1.00
dense(512, bias=false, act=identity)(512 x 128)/forward/CPU/1 thread(s) 756000 ns 756292 ns 1.00
dense(512, bias=false, act=identity)(512 x 128)/forward/GPU/CUDA 111465 ns 111964.5 ns 1.00
dense(512, bias=false, act=identity)(512 x 128)/forward/GPU/oneAPI 323864.5 ns 329610 ns 0.98
dense(512, bias=false, act=identity)(512 x 128)/forward/GPU/Metal 367958 ns 303771 ns 1.21
dense(512, bias=false, act=identity)(512 x 128)/forward/GPU/AMDGPU 75531 ns 75611 ns 1.00
dense(512, bias=false, act=identity)(512 x 128)/zygote/CPU/2 thread(s) 1453958.5 ns 1445541 ns 1.01
dense(512, bias=false, act=identity)(512 x 128)/zygote/CPU/4 thread(s) 1136125 ns 1129292 ns 1.01
dense(512, bias=false, act=identity)(512 x 128)/zygote/CPU/8 thread(s) 1142437.5 ns 1133875 ns 1.01
dense(512, bias=false, act=identity)(512 x 128)/zygote/CPU/1 thread(s) 2444854 ns 2356333 ns 1.04
dense(512, bias=false, act=identity)(512 x 128)/zygote/GPU/CUDA 219029 ns 210839 ns 1.04
dense(512, bias=false, act=identity)(512 x 128)/zygote/GPU/oneAPI 10210546 ns 10091107 ns 1.01
dense(512, bias=false, act=identity)(512 x 128)/zygote/GPU/Metal 1657083 ns 1639416 ns 1.01
dense(512, bias=false, act=identity)(512 x 128)/zygote/GPU/AMDGPU 327328 ns 322414 ns 1.02
layernorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/2 thread(s) 7042 ns 7042 ns 1
layernorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/4 thread(s) 8250 ns 8000 ns 1.03
layernorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/8 thread(s) 8833 ns 8833.5 ns 1.00
layernorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/1 thread(s) 7209 ns 7520.5 ns 0.96
layernorm(2, act=relu, affine=true)(32 x 32)/forward/GPU/CUDA 141318 ns 142989 ns 0.99
layernorm(2, act=relu, affine=true)(32 x 32)/forward/GPU/oneAPI 5924265.5 ns 5929780 ns 1.00
layernorm(2, act=relu, affine=true)(32 x 32)/forward/GPU/Metal 440833 ns 470791.5 ns 0.94
layernorm(2, act=relu, affine=true)(32 x 32)/forward/GPU/AMDGPU 65171 ns 66011 ns 0.99
layernorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/2 thread(s) 12000 ns 16208 ns 0.74
layernorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/4 thread(s) 14812 ns 14250 ns 1.04
layernorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/8 thread(s) 14750 ns 16000 ns 0.92
layernorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/1 thread(s) 11917 ns 15354.5 ns 0.78
layernorm(2, act=relu, affine=true)(32 x 32)/zygote/GPU/CUDA 936057 ns 963872.5 ns 0.97
layernorm(2, act=relu, affine=true)(32 x 32)/zygote/GPU/oneAPI 42717572.5 ns 42665593.5 ns 1.00
layernorm(2, act=relu, affine=true)(32 x 32)/zygote/GPU/Metal 5924541.5 ns 5541125 ns 1.07
layernorm(2, act=relu, affine=true)(32 x 32)/zygote/GPU/AMDGPU 423354 ns 426829 ns 0.99
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 23584 ns 24458 ns 0.96
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 29312.5 ns 26062.5 ns 1.12
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 31187 ns 29916.5 ns 1.04
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 24833 ns 25708.5 ns 0.97
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/GPU/CUDA 197551 ns 202495.5 ns 0.98
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/GPU/oneAPI 7700816.5 ns 8124671 ns 0.95
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/GPU/Metal 605479 ns 985584 ns 0.61
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/GPU/AMDGPU 114941 ns 114461 ns 1.00
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 108084 ns 109083 ns 0.99
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 124167 ns 152250 ns 0.82
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 154542 ns 152854 ns 1.01
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 151166.5 ns 142750 ns 1.06
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 1062517 ns 1066908 ns 1.00
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/oneAPI 44916929 ns 41393438 ns 1.09
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/Metal 6076604 ns 5472042 ns 1.11
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/AMDGPU 587946 ns 588251 ns 1.00
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 73750 ns 75167 ns 0.98
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 82958 ns 74583 ns 1.11
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 78916 ns 84375 ns 0.94
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 77042 ns 74125 ns 1.04
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/CUDA 204012 ns 208606 ns 0.98
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/oneAPI 7603031.5 ns 7473638 ns 1.02
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/Metal 530625 ns 500875 ns 1.06
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/AMDGPU 129202 ns 129022 ns 1.00
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 209875 ns 304417 ns 0.69
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 218333 ns 302145.5 ns 0.72
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 286250 ns 267604 ns 1.07
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 224708 ns 221146.5 ns 1.02
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 1104104 ns 1119561.5 ns 0.99
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/oneAPI 40856770 ns 40462234 ns 1.01
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/Metal 6448250 ns 6061271 ns 1.06
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/AMDGPU 693286 ns 695387 ns 1.00
layernorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/2 thread(s) 15687.5 ns 15729.5 ns 1.00
layernorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/4 thread(s) 17458 ns 17541 ns 1.00
layernorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/8 thread(s) 18250 ns 18000 ns 1.01
layernorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/1 thread(s) 16812.5 ns 17000 ns 0.99
layernorm(2, act=gelu, affine=true)(32 x 32)/forward/GPU/CUDA 144830 ns 148248.5 ns 0.98
layernorm(2, act=gelu, affine=true)(32 x 32)/forward/GPU/oneAPI 5439912 ns 5730909.5 ns 0.95
layernorm(2, act=gelu, affine=true)(32 x 32)/forward/GPU/Metal 448250 ns 745333 ns 0.60
layernorm(2, act=gelu, affine=true)(32 x 32)/forward/GPU/AMDGPU 231562 ns 232902 ns 0.99
layernorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/2 thread(s) 24667 ns 26937 ns 0.92
layernorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/4 thread(s) 26229.5 ns 26291.5 ns 1.00
layernorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/8 thread(s) 27083 ns 27291 ns 0.99
layernorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/1 thread(s) 24833 ns 26833.5 ns 0.93
layernorm(2, act=gelu, affine=true)(32 x 32)/zygote/GPU/CUDA 963572.5 ns 995021 ns 0.97
layernorm(2, act=gelu, affine=true)(32 x 32)/zygote/GPU/oneAPI 40959335 ns 39941943 ns 1.03
layernorm(2, act=gelu, affine=true)(32 x 32)/zygote/GPU/Metal 6046333 ns 5463292 ns 1.11
layernorm(2, act=gelu, affine=true)(32 x 32)/zygote/GPU/AMDGPU 687187 ns 692327 ns 0.99
groupnorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/2 thread(s) 32208.5 ns 10375 ns 3.10
groupnorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/4 thread(s) 38208 ns 11875 ns 3.22
groupnorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/8 thread(s) 43375 ns 12562 ns 3.45
groupnorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/1 thread(s) 31459 ns 11625 ns 2.71
groupnorm(2, act=gelu, affine=true)(32 x 32)/forward/GPU/CUDA 138477.5 ns 125968 ns 1.10
groupnorm(2, act=gelu, affine=true)(32 x 32)/forward/GPU/oneAPI 3597639.5 ns 3534875 ns 1.02
groupnorm(2, act=gelu, affine=true)(32 x 32)/forward/GPU/Metal 880000 ns 849958 ns 1.04
groupnorm(2, act=gelu, affine=true)(32 x 32)/forward/GPU/AMDGPU 243662 ns 236132 ns 1.03
groupnorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/2 thread(s) 23270.5 ns 22292 ns 1.04
groupnorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/4 thread(s) 23917 ns 21542 ns 1.11
groupnorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/8 thread(s) 25145.5 ns 23416 ns 1.07
groupnorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/1 thread(s) 22645.5 ns 22459 ns 1.01
groupnorm(2, act=gelu, affine=true)(32 x 32)/zygote/GPU/CUDA 705404 ns 709781 ns 0.99
groupnorm(2, act=gelu, affine=true)(32 x 32)/zygote/GPU/oneAPI 22376022 ns 21081902.5 ns 1.06
groupnorm(2, act=gelu, affine=true)(32 x 32)/zygote/GPU/Metal 5486750 ns 5312812.5 ns 1.03
groupnorm(2, act=gelu, affine=true)(32 x 32)/zygote/GPU/AMDGPU 671427 ns 671626 ns 1.00
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 63271 ns 63000 ns 1.00
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 64396 ns 64875 ns 0.99
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 66666 ns 67624.5 ns 0.99
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 63375.5 ns 70792 ns 0.90
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/CUDA 106695.5 ns 108732 ns 0.98
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/oneAPI 3482035 ns 3570568 ns 0.98
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/Metal 1328458 ns 463166.5 ns 2.87
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/AMDGPU 236317 ns 233653 ns 1.01
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 437479.5 ns 437250 ns 1.00
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 464312.5 ns 448250 ns 1.04
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 451499.5 ns 451208 ns 1.00
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 485145.5 ns 443667 ns 1.09
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 511151 ns 523839.5 ns 0.98
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/oneAPI 20924955 ns 20377781.5 ns 1.03
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/Metal 6149750 ns 6056791 ns 1.02
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/AMDGPU 716967 ns 715783 ns 1.00
layernorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/2 thread(s) 7542 ns 7104.5 ns 1.06
layernorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/4 thread(s) 7375 ns 8125 ns 0.91
layernorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/8 thread(s) 8500 ns 8333 ns 1.02
layernorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/1 thread(s) 7125 ns 7729.5 ns 0.92
layernorm(2, act=identity, affine=true)(32 x 32)/forward/GPU/CUDA 142876.5 ns 147799 ns 0.97
layernorm(2, act=identity, affine=true)(32 x 32)/forward/GPU/oneAPI 5717282 ns 5614298 ns 1.02
layernorm(2, act=identity, affine=true)(32 x 32)/forward/GPU/Metal 463208.5 ns 704750 ns 0.66
layernorm(2, act=identity, affine=true)(32 x 32)/forward/GPU/AMDGPU 64690 ns 65321 ns 0.99
layernorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/2 thread(s) 12958 ns 14500 ns 0.89
layernorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/4 thread(s) 13812 ns 15437.5 ns 0.89
layernorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/8 thread(s) 14417 ns 14833 ns 0.97
layernorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/1 thread(s) 15458 ns 14146 ns 1.09
layernorm(2, act=identity, affine=true)(32 x 32)/zygote/GPU/CUDA 934056 ns 966324 ns 0.97
layernorm(2, act=identity, affine=true)(32 x 32)/zygote/GPU/oneAPI 41113899 ns 36660688 ns 1.12
layernorm(2, act=identity, affine=true)(32 x 32)/zygote/GPU/Metal 5680771 ns 5256874.5 ns 1.08
layernorm(2, act=identity, affine=true)(32 x 32)/zygote/GPU/AMDGPU 396764 ns 400984 ns 0.99
batchedmm(512, Bsize=4)/forward/CPU/2 thread(s) 6145625 ns 6153708 ns 1.00
batchedmm(512, Bsize=4)/forward/CPU/4 thread(s) 6375834 ns 6380458 ns 1.00
batchedmm(512, Bsize=4)/forward/CPU/8 thread(s) 6379875 ns 6380979.5 ns 1.00
batchedmm(512, Bsize=4)/forward/CPU/1 thread(s) 11908958 ns 11947959 ns 1.00
batchedmm(512, Bsize=4)/forward/GPU/CUDA 348241 ns 301662 ns 1.15
batchedmm(512, Bsize=4)/forward/GPU/AMDGPU 302192.5 ns 322583 ns 0.94
batchedmm(512, Bsize=4)/zygote/CPU/2 thread(s) 19047770.5 ns 19056521 ns 1.00
batchedmm(512, Bsize=4)/zygote/CPU/4 thread(s) 19961208.5 ns 19941000 ns 1.00
batchedmm(512, Bsize=4)/zygote/CPU/8 thread(s) 19978625 ns 19981146 ns 1.00
batchedmm(512, Bsize=4)/zygote/CPU/1 thread(s) 36632228.5 ns 36490833.5 ns 1.00
batchedmm(512, Bsize=4)/zygote/GPU/CUDA 1017536 ns 1026590 ns 0.99
batchedmm(512, Bsize=4)/zygote/GPU/AMDGPU 1157817 ns 1153502 ns 1.00
dense(2, bias=true, act=gelu)(2 x 128)/forward/CPU/2 thread(s) 3208 ns 917 ns 3.50
dense(2, bias=true, act=gelu)(2 x 128)/forward/CPU/4 thread(s) 3541 ns 959 ns 3.69
dense(2, bias=true, act=gelu)(2 x 128)/forward/CPU/8 thread(s) 4084 ns 1000 ns 4.08
dense(2, bias=true, act=gelu)(2 x 128)/forward/CPU/1 thread(s) 3250 ns 958 ns 3.39
dense(2, bias=true, act=gelu)(2 x 128)/forward/GPU/CUDA 30273 ns 23570 ns 1.28
dense(2, bias=true, act=gelu)(2 x 128)/forward/GPU/oneAPI 2057669 ns 2101433 ns 0.98
dense(2, bias=true, act=gelu)(2 x 128)/forward/GPU/Metal 335958 ns 203000 ns 1.65
dense(2, bias=true, act=gelu)(2 x 128)/forward/GPU/AMDGPU 212322 ns 207632 ns 1.02
dense(2, bias=true, act=gelu)(2 x 128)/zygote/CPU/2 thread(s) 11417 ns 3708 ns 3.08
dense(2, bias=true, act=gelu)(2 x 128)/zygote/CPU/4 thread(s) 12291 ns 3791 ns 3.24
dense(2, bias=true, act=gelu)(2 x 128)/zygote/CPU/8 thread(s) 15000 ns 3792 ns 3.96
dense(2, bias=true, act=gelu)(2 x 128)/zygote/CPU/1 thread(s) 11459 ns 3750 ns 3.06
dense(2, bias=true, act=gelu)(2 x 128)/zygote/GPU/CUDA 300887 ns 284692.5 ns 1.06
dense(2, bias=true, act=gelu)(2 x 128)/zygote/GPU/oneAPI 11434302 ns 11502827.5 ns 0.99
dense(2, bias=true, act=gelu)(2 x 128)/zygote/GPU/Metal 2150875 ns 2063354 ns 1.04
dense(2, bias=true, act=gelu)(2 x 128)/zygote/GPU/AMDGPU 613366 ns 625846 ns 0.98
groupnorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/2 thread(s) 32583 ns 7208 ns 4.52
groupnorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/4 thread(s) 39625 ns 8500 ns 4.66
groupnorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/8 thread(s) 42125 ns 9292 ns 4.53
groupnorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/1 thread(s) 31291 ns 8250 ns 3.79
groupnorm(2, act=relu, affine=true)(4 x 32)/forward/GPU/CUDA 134275.5 ns 122668.5 ns 1.09
groupnorm(2, act=relu, affine=true)(4 x 32)/forward/GPU/oneAPI 3549585 ns 3715127.5 ns 0.96
groupnorm(2, act=relu, affine=true)(4 x 32)/forward/GPU/Metal 782479 ns 787166 ns 0.99
groupnorm(2, act=relu, affine=true)(4 x 32)/forward/GPU/AMDGPU 81161 ns 72740 ns 1.12
groupnorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/2 thread(s) 17959 ns 11875 ns 1.51
groupnorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/4 thread(s) 19937.5 ns 12750 ns 1.56
groupnorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/8 thread(s) 20375 ns 12583 ns 1.62
groupnorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/1 thread(s) 18291 ns 12500 ns 1.46
groupnorm(2, act=relu, affine=true)(4 x 32)/zygote/GPU/CUDA 653629 ns 651999 ns 1.00
groupnorm(2, act=relu, affine=true)(4 x 32)/zygote/GPU/oneAPI 21014605.5 ns 22144306 ns 0.95
groupnorm(2, act=relu, affine=true)(4 x 32)/zygote/GPU/Metal 4601291 ns 4276208 ns 1.08
groupnorm(2, act=relu, affine=true)(4 x 32)/zygote/GPU/AMDGPU 371429 ns 359014 ns 1.03
dense(2, bias=false, act=relu)(2 x 128)/forward/CPU/2 thread(s) 291 ns 292 ns 1.00
dense(2, bias=false, act=relu)(2 x 128)/forward/CPU/4 thread(s) 292 ns 292 ns 1
dense(2, bias=false, act=relu)(2 x 128)/forward/CPU/8 thread(s) 292 ns 292 ns 1
dense(2, bias=false, act=relu)(2 x 128)/forward/CPU/1 thread(s) 291 ns 334 ns 0.87
dense(2, bias=false, act=relu)(2 x 128)/forward/GPU/CUDA 22327 ns 22720.5 ns 0.98
dense(2, bias=false, act=relu)(2 x 128)/forward/GPU/oneAPI 2086130.5 ns 2075647.5 ns 1.01
dense(2, bias=false, act=relu)(2 x 128)/forward/GPU/Metal 324479 ns 205083 ns 1.58
dense(2, bias=false, act=relu)(2 x 128)/forward/GPU/AMDGPU 46841 ns 47440 ns 0.99
dense(2, bias=false, act=relu)(2 x 128)/zygote/CPU/2 thread(s) 6791 ns 2875 ns 2.36
dense(2, bias=false, act=relu)(2 x 128)/zygote/CPU/4 thread(s) 7208 ns 3500 ns 2.06
dense(2, bias=false, act=relu)(2 x 128)/zygote/CPU/8 thread(s) 9375 ns 3333 ns 2.81
dense(2, bias=false, act=relu)(2 x 128)/zygote/CPU/1 thread(s) 6667 ns 3208 ns 2.08
dense(2, bias=false, act=relu)(2 x 128)/zygote/GPU/CUDA 215371.5 ns 206663 ns 1.04
dense(2, bias=false, act=relu)(2 x 128)/zygote/GPU/oneAPI 10232521.5 ns 9232071 ns 1.11
dense(2, bias=false, act=relu)(2 x 128)/zygote/GPU/Metal 1703500 ns 1552875 ns 1.10
dense(2, bias=false, act=relu)(2 x 128)/zygote/GPU/AMDGPU 166071 ns 156172 ns 1.06
groupnorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/2 thread(s) 10167 ns 10083 ns 1.01
groupnorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/4 thread(s) 12875.5 ns 11083 ns 1.16
groupnorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/8 thread(s) 13125 ns 12458 ns 1.05
groupnorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/1 thread(s) 11083 ns 11708 ns 0.95
groupnorm(2, act=gelu, affine=false)(32 x 32)/forward/GPU/CUDA 120797.5 ns 123476 ns 0.98
groupnorm(2, act=gelu, affine=false)(32 x 32)/forward/GPU/oneAPI 3363540 ns 3456473.5 ns 0.97
groupnorm(2, act=gelu, affine=false)(32 x 32)/forward/GPU/Metal 935500 ns 861479.5 ns 1.09
groupnorm(2, act=gelu, affine=false)(32 x 32)/forward/GPU/AMDGPU 233122 ns 236062 ns 0.99
groupnorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/2 thread(s) 20709 ns 20604 ns 1.01
groupnorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/4 thread(s) 21875 ns 23187.5 ns 0.94
groupnorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/8 thread(s) 21750 ns 23333 ns 0.93
groupnorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/1 thread(s) 22625 ns 21042 ns 1.08
groupnorm(2, act=gelu, affine=false)(32 x 32)/zygote/GPU/CUDA 590585 ns 607311 ns 0.97
groupnorm(2, act=gelu, affine=false)(32 x 32)/zygote/GPU/oneAPI 20620797 ns 20290582.5 ns 1.02
groupnorm(2, act=gelu, affine=false)(32 x 32)/zygote/GPU/Metal 4822000 ns 4254667 ns 1.13
groupnorm(2, act=gelu, affine=false)(32 x 32)/zygote/GPU/AMDGPU 648361 ns 645431.5 ns 1.00
dense(32, bias=true, act=relu)(32 x 128)/forward/CPU/2 thread(s) 6833 ns 4458 ns 1.53
dense(32, bias=true, act=relu)(32 x 128)/forward/CPU/4 thread(s) 7041 ns 4500 ns 1.56
dense(32, bias=true, act=relu)(32 x 128)/forward/CPU/8 thread(s) 7833 ns 4417 ns 1.77
dense(32, bias=true, act=relu)(32 x 128)/forward/CPU/1 thread(s) 6875 ns 4500 ns 1.53
dense(32, bias=true, act=relu)(32 x 128)/forward/GPU/CUDA 31284 ns 24732 ns 1.26
dense(32, bias=true, act=relu)(32 x 128)/forward/GPU/oneAPI 2217654 ns 2177168 ns 1.02
dense(32, bias=true, act=relu)(32 x 128)/forward/GPU/Metal 229313 ns 211459 ns 1.08
dense(32, bias=true, act=relu)(32 x 128)/forward/GPU/AMDGPU 52301 ns 47591 ns 1.10
dense(32, bias=true, act=relu)(32 x 128)/zygote/CPU/2 thread(s) 26125 ns 16375 ns 1.60
dense(32, bias=true, act=relu)(32 x 128)/zygote/CPU/4 thread(s) 27209 ns 16834 ns 1.62
dense(32, bias=true, act=relu)(32 x 128)/zygote/CPU/8 thread(s) 30000 ns 16458 ns 1.82
dense(32, bias=true, act=relu)(32 x 128)/zygote/CPU/1 thread(s) 25834 ns 16083 ns 1.61
dense(32, bias=true, act=relu)(32 x 128)/zygote/GPU/CUDA 347032.5 ns 332546.5 ns 1.04
dense(32, bias=true, act=relu)(32 x 128)/zygote/GPU/oneAPI 13087814.5 ns 12988178 ns 1.01
dense(32, bias=true, act=relu)(32 x 128)/zygote/GPU/Metal 1080292 ns 1511750 ns 0.71
dense(32, bias=true, act=relu)(32 x 128)/zygote/GPU/AMDGPU 216482.5 ns 208322 ns 1.04
batchnorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/2 thread(s) 3334 ns 2084 ns 1.60
batchnorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/4 thread(s) 3458 ns 2041 ns 1.69
batchnorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/8 thread(s) 3875 ns 2167 ns 1.79
batchnorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/1 thread(s) 3417 ns 2209 ns 1.55
batchnorm(2, act=gelu, affine=false)(32 x 32)/forward/GPU/CUDA 41491.5 ns 36551 ns 1.14
batchnorm(2, act=gelu, affine=false)(32 x 32)/forward/GPU/oneAPI 1196910.5 ns 1147028 ns 1.04
batchnorm(2, act=gelu, affine=false)(32 x 32)/forward/GPU/Metal 397958 ns 268042 ns 1.48
batchnorm(2, act=gelu, affine=false)(32 x 32)/forward/GPU/AMDGPU 206202 ns 204212 ns 1.01
batchnorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/2 thread(s) 16917 ns 17396 ns 0.97
batchnorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/4 thread(s) 20833 ns 17250 ns 1.21
batchnorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/8 thread(s) 21208 ns 17812.5 ns 1.19
batchnorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/1 thread(s) 17687.5 ns 19479 ns 0.91
batchnorm(2, act=gelu, affine=false)(32 x 32)/zygote/GPU/CUDA 288016 ns 297836 ns 0.97
batchnorm(2, act=gelu, affine=false)(32 x 32)/zygote/GPU/oneAPI 21817074 ns 21470855.5 ns 1.02
batchnorm(2, act=gelu, affine=false)(32 x 32)/zygote/GPU/Metal 5201083 ns 5022375 ns 1.04
batchnorm(2, act=gelu, affine=false)(32 x 32)/zygote/GPU/AMDGPU 696531.5 ns 686617 ns 1.01
batchedmm(16, Bsize=512)/forward/CPU/2 thread(s) 55459 ns 56395.5 ns 0.98
batchedmm(16, Bsize=512)/forward/CPU/4 thread(s) 64896 ns 65083 ns 1.00
batchedmm(16, Bsize=512)/forward/CPU/8 thread(s) 65583.5 ns 66250 ns 0.99
batchedmm(16, Bsize=512)/forward/CPU/1 thread(s) 51541.5 ns 51333 ns 1.00
batchedmm(16, Bsize=512)/forward/GPU/CUDA 66456 ns 66767.5 ns 1.00
batchedmm(16, Bsize=512)/forward/GPU/AMDGPU 113921 ns 115211 ns 0.99
batchedmm(16, Bsize=512)/zygote/CPU/2 thread(s) 132500 ns 197187.5 ns 0.67
batchedmm(16, Bsize=512)/zygote/CPU/4 thread(s) 166374.5 ns 163417 ns 1.02
batchedmm(16, Bsize=512)/zygote/CPU/8 thread(s) 111500 ns 163937.5 ns 0.68
batchedmm(16, Bsize=512)/zygote/CPU/1 thread(s) 316833 ns 315500 ns 1.00
batchedmm(16, Bsize=512)/zygote/GPU/CUDA 217912 ns 219712.5 ns 0.99
batchedmm(16, Bsize=512)/zygote/GPU/AMDGPU 613066 ns 611147 ns 1.00
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 80625 ns 105333 ns 0.77
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 125645.5 ns 81834 ns 1.54
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 86146 ns 86959 ns 0.99
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 82959 ns 86750 ns 0.96
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/GPU/CUDA 193130 ns 191740.5 ns 1.01
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/GPU/oneAPI 5692161 ns 5593567.5 ns 1.02
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/GPU/Metal 1989666.5 ns 2535645.5 ns 0.78
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/GPU/AMDGPU 216662.5 ns 204172 ns 1.06
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 1912792 ns 1915521 ns 1.00
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 1921187.5 ns 1914333 ns 1.00
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 1912375 ns 1911750 ns 1.00
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 1908917 ns 1879292 ns 1.02
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 526124 ns 538609 ns 0.98
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/oneAPI 27266064 ns 24792062.5 ns 1.10
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/Metal 8680374.5 ns 8911395.5 ns 0.97
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/AMDGPU 1069560 ns 1067201 ns 1.00
dense(2, bias=true, act=identity)(2 x 128)/forward/CPU/2 thread(s) 2375 ns 292 ns 8.13
dense(2, bias=true, act=identity)(2 x 128)/forward/CPU/4 thread(s) 2792 ns 292 ns 9.56
dense(2, bias=true, act=identity)(2 x 128)/forward/CPU/8 thread(s) 3459 ns 292 ns 11.85
dense(2, bias=true, act=identity)(2 x 128)/forward/CPU/1 thread(s) 2375 ns 333 ns 7.13
dense(2, bias=true, act=identity)(2 x 128)/forward/GPU/CUDA 28282 ns 22127 ns 1.28
dense(2, bias=true, act=identity)(2 x 128)/forward/GPU/oneAPI 2020733 ns 2111782 ns 0.96
dense(2, bias=true, act=identity)(2 x 128)/forward/GPU/Metal 355667 ns 320417 ns 1.11
dense(2, bias=true, act=identity)(2 x 128)/forward/GPU/AMDGPU 46231 ns 41970 ns 1.10
dense(2, bias=true, act=identity)(2 x 128)/zygote/CPU/2 thread(s) 9625 ns 1792 ns 5.37
dense(2, bias=true, act=identity)(2 x 128)/zygote/CPU/4 thread(s) 13459 ns 1875 ns 7.18
dense(2, bias=true, act=identity)(2 x 128)/zygote/CPU/8 thread(s) 14166 ns 1875 ns 7.56
dense(2, bias=true, act=identity)(2 x 128)/zygote/CPU/1 thread(s) 9625 ns 1875 ns 5.13
dense(2, bias=true, act=identity)(2 x 128)/zygote/GPU/CUDA 270635.5 ns 255417.5 ns 1.06
dense(2, bias=true, act=identity)(2 x 128)/zygote/GPU/oneAPI 9869416 ns 10493115 ns 0.94
dense(2, bias=true, act=identity)(2 x 128)/zygote/GPU/Metal 1067437.5 ns 1487041 ns 0.72
dense(2, bias=true, act=identity)(2 x 128)/zygote/GPU/AMDGPU 195496.5 ns 183032 ns 1.07
groupnorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/2 thread(s) 7958 ns 7375 ns 1.08
groupnorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/4 thread(s) 9854 ns 9562.5 ns 1.03
groupnorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/8 thread(s) 11000 ns 11250 ns 0.98
groupnorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/1 thread(s) 7521 ns 11333 ns 0.66
groupnorm(2, act=gelu, affine=false)(4 x 32)/forward/GPU/CUDA 116502.5 ns 121634 ns 0.96
groupnorm(2, act=gelu, affine=false)(4 x 32)/forward/GPU/oneAPI 3478612 ns 3330370 ns 1.04
groupnorm(2, act=gelu, affine=false)(4 x 32)/forward/GPU/Metal 901250 ns 831000 ns 1.08
groupnorm(2, act=gelu, affine=false)(4 x 32)/forward/GPU/AMDGPU 233553 ns 235863 ns 0.99
groupnorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/2 thread(s) 8500 ns 8958 ns 0.95
groupnorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/4 thread(s) 10042 ns 10917 ns 0.92
groupnorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/8 thread(s) 10208 ns 11542 ns 0.88
groupnorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/1 thread(s) 8708 ns 9250 ns 0.94
groupnorm(2, act=gelu, affine=false)(4 x 32)/zygote/GPU/CUDA 518097.5 ns 536196 ns 0.97
groupnorm(2, act=gelu, affine=false)(4 x 32)/zygote/GPU/oneAPI 21197993 ns 20906072 ns 1.01
groupnorm(2, act=gelu, affine=false)(4 x 32)/zygote/GPU/Metal 4329250 ns 3661104.5 ns 1.18
groupnorm(2, act=gelu, affine=false)(4 x 32)/zygote/GPU/AMDGPU 626366 ns 620146.5 ns 1.01
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 63291 ns 56833 ns 1.11
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 58084 ns 46333 ns 1.25
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 57292 ns 47000 ns 1.22
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 89791 ns 83417 ns 1.08
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/GPU/CUDA 50283 ns 40185 ns 1.25
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/GPU/oneAPI 1570099.5 ns 1391043 ns 1.13
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/GPU/Metal 1167250 ns 1150167 ns 1.01
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/GPU/AMDGPU 84641 ns 77886 ns 1.09
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 1912667 ns 1925959 ns 0.99
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 1975417 ns 1932875 ns 1.02
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 1966250 ns 1975666 ns 1.00
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 1870792 ns 1853417 ns 1.01
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 232632 ns 224336 ns 1.04
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/oneAPI 33812417 ns 33169959 ns 1.02
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/Metal 11151146 ns 11254125 ns 0.99
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/AMDGPU 1177471 ns 1176553 ns 1.00
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 415042 ns 416209 ns 1.00
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 419333.5 ns 418021.5 ns 1.00
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 422000 ns 423500 ns 1.00
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 417187.5 ns 417709 ns 1.00
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/CUDA 207638.5 ns 212391.5 ns 0.98
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/oneAPI 7531726.5 ns 7928224 ns 0.95
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/Metal 542958.5 ns 501042 ns 1.08
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/AMDGPU 282777.5 ns 283733 ns 1.00
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 667541 ns 689875.5 ns 0.97
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 748979 ns 744770.5 ns 1.01
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 673708 ns 684250 ns 0.98
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 675250 ns 683020.5 ns 0.99
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 1040445 ns 1071393 ns 0.97
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/oneAPI 44753534 ns 45538634 ns 0.98
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/Metal 6673666.5 ns 6134687.5 ns 1.09
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/AMDGPU 908713 ns 911264.5 ns 1.00
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 3514375 ns 3426041.5 ns 1.03
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 3451021 ns 3415458.5 ns 1.01
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 3440750 ns 3440084 ns 1.00
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 3449792 ns 3459083 ns 1.00
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/CUDA 184364 ns 174794 ns 1.05
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/oneAPI 7909340 ns 8045126 ns 0.98
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/Metal 1385709 ns 1391250 ns 1.00
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/AMDGPU 425164 ns 426850 ns 1.00
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 6177354 ns 6168667 ns 1.00
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 6248791 ns 6210416 ns 1.01
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 6199834 ns 6205709 ns 1.00
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 6163750 ns 6247562.5 ns 0.99
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 983055 ns 1017240 ns 0.97
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/oneAPI 52497500 ns 50293396 ns 1.04
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/Metal 8007792 ns 7732791.5 ns 1.04
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/AMDGPU 1641310.5 ns 1542501 ns 1.06
dense(512, bias=true, act=gelu)(512 x 128)/forward/CPU/2 thread(s) 474292 ns 473291 ns 1.00
dense(512, bias=true, act=gelu)(512 x 128)/forward/CPU/4 thread(s) 345209 ns 342875 ns 1.01
dense(512, bias=true, act=gelu)(512 x 128)/forward/CPU/8 thread(s) 346500 ns 341396 ns 1.01
dense(512, bias=true, act=gelu)(512 x 128)/forward/CPU/1 thread(s) 905500 ns 901791 ns 1.00
dense(512, bias=true, act=gelu)(512 x 128)/forward/GPU/CUDA 54357 ns 46836 ns 1.16
dense(512, bias=true, act=gelu)(512 x 128)/forward/GPU/oneAPI 389107.5 ns 381391 ns 1.02
dense(512, bias=true, act=gelu)(512 x 128)/forward/GPU/Metal 404541 ns 354270.5 ns 1.14
dense(512, bias=true, act=gelu)(512 x 128)/forward/GPU/AMDGPU 247753 ns 243143 ns 1.02
dense(512, bias=true, act=gelu)(512 x 128)/zygote/CPU/2 thread(s) 2334208 ns 2332208 ns 1.00
dense(512, bias=true, act=gelu)(512 x 128)/zygote/CPU/4 thread(s) 2038708.5 ns 2034354.5 ns 1.00
dense(512, bias=true, act=gelu)(512 x 128)/zygote/CPU/8 thread(s) 2043542 ns 2036500 ns 1.00
dense(512, bias=true, act=gelu)(512 x 128)/zygote/CPU/1 thread(s) 3293041.5 ns 3194416 ns 1.03
dense(512, bias=true, act=gelu)(512 x 128)/zygote/GPU/CUDA 278906.5 ns 273644.5 ns 1.02
dense(512, bias=true, act=gelu)(512 x 128)/zygote/GPU/oneAPI 8914943 ns 15628377 ns 0.57
dense(512, bias=true, act=gelu)(512 x 128)/zygote/GPU/Metal 2088084 ns 2136645.5 ns 0.98
dense(512, bias=true, act=gelu)(512 x 128)/zygote/GPU/AMDGPU 756552 ns 772838 ns 0.98
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 62291 ns 56292 ns 1.11
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 58500 ns 45834 ns 1.28
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 56958 ns 46125 ns 1.23
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 89250 ns 83209 ns 1.07
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/CUDA 38171 ns 28601 ns 1.33
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/oneAPI 1017793 ns 1335147 ns 0.76
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/Metal 1166417 ns 1124979 ns 1.04
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/AMDGPU 86475.5 ns 74305.5 ns 1.16
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 2035708 ns 2016104.5 ns 1.01
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 2102479 ns 2087291 ns 1.01
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 2080937.5 ns 2087917 ns 1.00
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 2008875 ns 1975958.5 ns 1.02
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 245090 ns 240545 ns 1.02
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/oneAPI 38180250 ns 37474096 ns 1.02
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/Metal 11967000 ns 11883709 ns 1.01
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/AMDGPU 1207986.5 ns 1048951 ns 1.15
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 57709 ns 56542 ns 1.02
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 48084 ns 46354.5 ns 1.04
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 49000 ns 46666.5 ns 1.05
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 83667 ns 83750 ns 1.00
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/GPU/CUDA 56367 ns 50752 ns 1.11
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/GPU/oneAPI 752295 ns 835807 ns 0.90
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/GPU/Metal 1087374.5 ns 1048667 ns 1.04
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/GPU/AMDGPU 80301 ns 78556 ns 1.02
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 1917209 ns 1921000 ns 1.00
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 1944583 ns 1952958.5 ns 1.00
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 1961125 ns 1973000 ns 0.99
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 1894375 ns 1862417 ns 1.02
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 246493.5 ns 246729 ns 1.00
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/oneAPI 16926918.5 ns 16959227 ns 1.00
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/Metal 9855333.5 ns 9957875 ns 0.99
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/AMDGPU 1034015 ns 1034211 ns 1.00
batchnorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/2 thread(s) 1375 ns 292 ns 4.71
batchnorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/4 thread(s) 1667 ns 416 ns 4.01
batchnorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/8 thread(s) 1959 ns 416 ns 4.71
batchnorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/1 thread(s) 1375 ns 292 ns 4.71
batchnorm(2, act=identity, affine=false)(4 x 32)/forward/GPU/CUDA 39676 ns 35694 ns 1.11
batchnorm(2, act=identity, affine=false)(4 x 32)/forward/GPU/oneAPI 1198863 ns 1211794.5 ns 0.99
batchnorm(2, act=identity, affine=false)(4 x 32)/forward/GPU/Metal 286209 ns 311771 ns 0.92
batchnorm(2, act=identity, affine=false)(4 x 32)/forward/GPU/AMDGPU 48840 ns 46570 ns 1.05
batchnorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/2 thread(s) 7375 ns 6604.5 ns 1.12
batchnorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/4 thread(s) 9083 ns 7291.5 ns 1.25
batchnorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/8 thread(s) 9479.5 ns 6666 ns 1.42
batchnorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/1 thread(s) 7583 ns 6709 ns 1.13
batchnorm(2, act=identity, affine=false)(4 x 32)/zygote/GPU/CUDA 213194.5 ns 213644 ns 1.00
batchnorm(2, act=identity, affine=false)(4 x 32)/zygote/GPU/oneAPI 20651072 ns 21642370 ns 0.95
batchnorm(2, act=identity, affine=false)(4 x 32)/zygote/GPU/Metal 4692500 ns 4349083.5 ns 1.08
batchnorm(2, act=identity, affine=false)(4 x 32)/zygote/GPU/AMDGPU 380024 ns 366543.5 ns 1.04
dense(2, bias=false, act=identity)(2 x 128)/forward/CPU/2 thread(s) 250 ns 250 ns 1
dense(2, bias=false, act=identity)(2 x 128)/forward/CPU/4 thread(s) 291 ns 291 ns 1
dense(2, bias=false, act=identity)(2 x 128)/forward/CPU/8 thread(s) 291 ns 292 ns 1.00
dense(2, bias=false, act=identity)(2 x 128)/forward/CPU/1 thread(s) 250 ns 292 ns 0.86
dense(2, bias=false, act=identity)(2 x 128)/forward/GPU/CUDA 32533 ns 32948 ns 0.99
dense(2, bias=false, act=identity)(2 x 128)/forward/GPU/oneAPI 1268230 ns 1191915 ns 1.06
dense(2, bias=false, act=identity)(2 x 128)/forward/GPU/Metal 254979.5 ns 153792 ns 1.66
dense(2, bias=false, act=identity)(2 x 128)/forward/GPU/AMDGPU 37420 ns 39081 ns 0.96
dense(2, bias=false, act=identity)(2 x 128)/zygote/CPU/2 thread(s) 6042 ns 3208 ns 1.88
dense(2, bias=false, act=identity)(2 x 128)/zygote/CPU/4 thread(s) 7083 ns 3041 ns 2.33
dense(2, bias=false, act=identity)(2 x 128)/zygote/CPU/8 thread(s) 9333 ns 3083 ns 3.03
dense(2, bias=false, act=identity)(2 x 128)/zygote/CPU/1 thread(s) 6083 ns 3083 ns 1.97
dense(2, bias=false, act=identity)(2 x 128)/zygote/GPU/CUDA 199543 ns 193915 ns 1.03
dense(2, bias=false, act=identity)(2 x 128)/zygote/GPU/oneAPI 8717018 ns 7217530 ns 1.21
dense(2, bias=false, act=identity)(2 x 128)/zygote/GPU/Metal 950520.5 ns 894250 ns 1.06
dense(2, bias=false, act=identity)(2 x 128)/zygote/GPU/AMDGPU 164831 ns 158472 ns 1.04
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 437749.5 ns 420583.5 ns 1.04
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 487292 ns 420833.5 ns 1.16
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 466021 ns 456166.5 ns 1.02
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 442021 ns 426229 ns 1.04
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/CUDA 143480 ns 140216.5 ns 1.02
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/oneAPI 5976577 ns 6258248 ns 0.95
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/Metal 2179687.5 ns 2682604 ns 0.81
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/AMDGPU 370168.5 ns 367294 ns 1.01
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 3794500 ns 3811479 ns 1.00
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 3803417 ns 3798000 ns 1.00
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 3791458 ns 3806125 ns 1.00
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 3801709 ns 3813437.5 ns 1.00
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 707529.5 ns 724543 ns 0.98
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/oneAPI 35214956 ns 32785400 ns 1.07
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/Metal 10857667 ns 10852833 ns 1.00
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/AMDGPU 1463934 ns 1313993.5 ns 1.11
batchedmm(512, Bsize=32)/forward/CPU/2 thread(s) 49798563 ns 49807062.5 ns 1.00
batchedmm(512, Bsize=32)/forward/CPU/4 thread(s) 35524209 ns 35521583 ns 1.00
batchedmm(512, Bsize=32)/forward/CPU/8 thread(s) 35534958 ns 35517479 ns 1.00
batchedmm(512, Bsize=32)/forward/CPU/1 thread(s) 97214791.5 ns 97112834 ns 1.00
batchedmm(512, Bsize=32)/forward/GPU/CUDA 1600126 ns 1611615 ns 0.99
batchedmm(512, Bsize=32)/forward/GPU/AMDGPU 1047610 ns 1049140 ns 1.00
batchedmm(512, Bsize=32)/zygote/CPU/2 thread(s) 153739771 ns 153740041.5 ns 1.00
batchedmm(512, Bsize=32)/zygote/CPU/4 thread(s) 112306958.5 ns 112306083 ns 1.00
batchedmm(512, Bsize=32)/zygote/CPU/8 thread(s) 112388250 ns 112476667 ns 1.00
batchedmm(512, Bsize=32)/zygote/CPU/1 thread(s) 294975583 ns 295356541 ns 1.00
batchedmm(512, Bsize=32)/zygote/GPU/CUDA 6485489.5 ns 6485483 ns 1.00
batchedmm(512, Bsize=32)/zygote/GPU/AMDGPU 5559847 ns 5555702 ns 1.00
bias_activation(32, act=tanh)(32 x 128)/forward/CPU/2 thread(s) 21209 ns 15041.5 ns 1.41
bias_activation(32, act=tanh)(32 x 128)/forward/CPU/4 thread(s) 20792 ns 18375 ns 1.13
bias_activation(32, act=tanh)(32 x 128)/forward/CPU/8 thread(s) 20667 ns 16083 ns 1.29
bias_activation(32, act=tanh)(32 x 128)/forward/CPU/1 thread(s) 23334 ns 15646 ns 1.49
bias_activation(32, act=tanh)(32 x 128)/forward/GPU/CUDA 23699 ns 21271 ns 1.11
bias_activation(32, act=tanh)(32 x 128)/forward/GPU/oneAPI 1173705 ns 1120492.5 ns 1.05
bias_activation(32, act=tanh)(32 x 128)/forward/GPU/Metal 222416.5 ns 200000 ns 1.11
bias_activation(32, act=tanh)(32 x 128)/forward/GPU/AMDGPU 28521 ns 27480 ns 1.04
bias_activation(32, act=tanh)(32 x 128)/zygote/CPU/2 thread(s) 11500 ns 10666.5 ns 1.08
bias_activation(32, act=tanh)(32 x 128)/zygote/CPU/4 thread(s) 10000 ns 9042 ns 1.11
bias_activation(32, act=tanh)(32 x 128)/zygote/CPU/8 thread(s) 10375 ns 9437.5 ns 1.10
bias_activation(32, act=tanh)(32 x 128)/zygote/CPU/1 thread(s) 18416 ns 17042 ns 1.08
bias_activation(32, act=tanh)(32 x 128)/zygote/GPU/CUDA 259109.5 ns 267724 ns 0.97
bias_activation(32, act=tanh)(32 x 128)/zygote/GPU/oneAPI 10048978 ns 10072145 ns 1.00
bias_activation(32, act=tanh)(32 x 128)/zygote/GPU/Metal 1578917 ns 1541750 ns 1.02
bias_activation(32, act=tanh)(32 x 128)/zygote/GPU/AMDGPU 147201.5 ns 148171 ns 0.99
groupnorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/2 thread(s) 24625 ns 7709 ns 3.19
groupnorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/4 thread(s) 27667 ns 8709 ns 3.18
groupnorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/8 thread(s) 30500 ns 10708 ns 2.85
groupnorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/1 thread(s) 23937.5 ns 9708.5 ns 2.47
groupnorm(2, act=gelu, affine=true)(4 x 32)/forward/GPU/CUDA 137987.5 ns 129031 ns 1.07
groupnorm(2, act=gelu, affine=true)(4 x 32)/forward/GPU/oneAPI 3369462 ns 3486446 ns 0.97
groupnorm(2, act=gelu, affine=true)(4 x 32)/forward/GPU/Metal 670209 ns 797791 ns 0.84
groupnorm(2, act=gelu, affine=true)(4 x 32)/forward/GPU/AMDGPU 243177.5 ns 234732 ns 1.04
groupnorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/2 thread(s) 10917 ns 10458.5 ns 1.04
groupnorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/4 thread(s) 11500 ns 9833 ns 1.17
groupnorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/8 thread(s) 11750 ns 11333.5 ns 1.04
groupnorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/1 thread(s) 10792 ns 9125 ns 1.18
groupnorm(2, act=gelu, affine=true)(4 x 32)/zygote/GPU/CUDA 621051 ns 638866 ns 0.97
groupnorm(2, act=gelu, affine=true)(4 x 32)/zygote/GPU/oneAPI 22648297 ns 21816663 ns 1.04
groupnorm(2, act=gelu, affine=true)(4 x 32)/zygote/GPU/Metal 4704896 ns 4208187.5 ns 1.12
groupnorm(2, act=gelu, affine=true)(4 x 32)/zygote/GPU/AMDGPU 650846 ns 651461.5 ns 1.00
groupnorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/2 thread(s) 8041 ns 8625.5 ns 0.93
groupnorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/4 thread(s) 10271 ns 9729 ns 1.06
groupnorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/8 thread(s) 11125 ns 11521 ns 0.97
groupnorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/1 thread(s) 9292 ns 11042 ns 0.84
groupnorm(2, act=relu, affine=false)(32 x 32)/forward/GPU/CUDA 119985.5 ns 123974 ns 0.97
groupnorm(2, act=relu, affine=false)(32 x 32)/forward/GPU/oneAPI 3391312.5 ns 3315044 ns 1.02
groupnorm(2, act=relu, affine=false)(32 x 32)/forward/GPU/Metal 895208 ns 859750 ns 1.04
groupnorm(2, act=relu, affine=false)(32 x 32)/forward/GPU/AMDGPU 71901 ns 72471 ns 0.99
groupnorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/2 thread(s) 13354 ns 17583 ns 0.76
groupnorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/4 thread(s) 13667 ns 13458 ns 1.02
groupnorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/8 thread(s) 13917 ns 15166 ns 0.92
groupnorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/1 thread(s) 13708 ns 13083 ns 1.05
groupnorm(2, act=relu, affine=false)(32 x 32)/zygote/GPU/CUDA 585616 ns 608117 ns 0.96
groupnorm(2, act=relu, affine=false)(32 x 32)/zygote/GPU/oneAPI 20399248 ns 18976850.5 ns 1.07
groupnorm(2, act=relu, affine=false)(32 x 32)/zygote/GPU/Metal 4221708 ns 3989167 ns 1.06
groupnorm(2, act=relu, affine=false)(32 x 32)/zygote/GPU/AMDGPU 351904 ns 346933 ns 1.01
batchnorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/2 thread(s) 1542 ns 541 ns 2.85
batchnorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/4 thread(s) 1750 ns 625 ns 2.80
batchnorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/8 thread(s) 1792 ns 625 ns 2.87
batchnorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/1 thread(s) 1583 ns 584 ns 2.71
batchnorm(2, act=gelu, affine=false)(4 x 32)/forward/GPU/CUDA 40136 ns 35726 ns 1.12
batchnorm(2, act=gelu, affine=false)(4 x 32)/forward/GPU/oneAPI 1237195 ns 1170850 ns 1.06
batchnorm(2, act=gelu, affine=false)(4 x 32)/forward/GPU/Metal 273959 ns 255917 ns 1.07
batchnorm(2, act=gelu, affine=false)(4 x 32)/forward/GPU/AMDGPU 207332 ns 204512 ns 1.01
batchnorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/2 thread(s) 8750 ns 8604.5 ns 1.02
batchnorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/4 thread(s) 9250 ns 7625 ns 1.21
batchnorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/8 thread(s) 9291 ns 9250 ns 1.00
batchnorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/1 thread(s) 8875 ns 7584 ns 1.17
batchnorm(2, act=gelu, affine=false)(4 x 32)/zygote/GPU/CUDA 227150.5 ns 237837 ns 0.96
batchnorm(2, act=gelu, affine=false)(4 x 32)/zygote/GPU/oneAPI 22981390 ns 23133813.5 ns 0.99
batchnorm(2, act=gelu, affine=false)(4 x 32)/zygote/GPU/Metal 4712916 ns 4454021 ns 1.06
batchnorm(2, act=gelu, affine=false)(4 x 32)/zygote/GPU/AMDGPU 674086 ns 654907 ns 1.03
bias_activation(32, act=gelu)(32 x 128)/forward/CPU/2 thread(s) 17875 ns 12208 ns 1.46
bias_activation(32, act=gelu)(32 x 128)/forward/CPU/4 thread(s) 19167 ns 16208 ns 1.18
bias_activation(32, act=gelu)(32 x 128)/forward/CPU/8 thread(s) 18896 ns 15542 ns 1.22
bias_activation(32, act=gelu)(32 x 128)/forward/CPU/1 thread(s) 18125 ns 10229 ns 1.77
bias_activation(32, act=gelu)(32 x 128)/forward/GPU/CUDA 24199.5 ns 22887 ns 1.06
bias_activation(32, act=gelu)(32 x 128)/forward/GPU/oneAPI 1135581 ns 1146280 ns 0.99
bias_activation(32, act=gelu)(32 x 128)/forward/GPU/Metal 208625.5 ns 183250 ns 1.14
bias_activation(32, act=gelu)(32 x 128)/forward/GPU/AMDGPU 187926.5 ns 190602 ns 0.99
bias_activation(32, act=gelu)(32 x 128)/zygote/CPU/2 thread(s) 32417 ns 31917 ns 1.02
bias_activation(32, act=gelu)(32 x 128)/zygote/CPU/4 thread(s) 32958 ns 32334 ns 1.02
bias_activation(32, act=gelu)(32 x 128)/zygote/CPU/8 thread(s) 33458 ns 32334 ns 1.03
bias_activation(32, act=gelu)(32 x 128)/zygote/CPU/1 thread(s) 32625 ns 31792 ns 1.03
bias_activation(32, act=gelu)(32 x 128)/zygote/GPU/CUDA 275193 ns 282370 ns 0.97
bias_activation(32, act=gelu)(32 x 128)/zygote/GPU/oneAPI 11054114 ns 12675054 ns 0.87
bias_activation(32, act=gelu)(32 x 128)/zygote/GPU/Metal 1674271 ns 1664375 ns 1.01
bias_activation(32, act=gelu)(32 x 128)/zygote/GPU/AMDGPU 588556 ns 592261 ns 0.99
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 455833.5 ns 445708 ns 1.02
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 470416.5 ns 440416 ns 1.07
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 445500 ns 446125 ns 1.00
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 442125 ns 462250 ns 0.96
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/CUDA 194972.5 ns 194079.5 ns 1.00
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/oneAPI 6233916 ns 6009981 ns 1.04
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/Metal 2002875 ns 1948750 ns 1.03
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/AMDGPU 368743 ns 368473 ns 1.00
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 3826416.5 ns 3828708 ns 1.00
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 3821625 ns 3827249.5 ns 1.00
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 3805291.5 ns 3829459 ns 0.99
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 3828770.5 ns 3834708 ns 1.00
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 539774 ns 555671 ns 0.97
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/oneAPI 27246937 ns 28291601.5 ns 0.96
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/Metal 9665250 ns 9332833 ns 1.04
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/AMDGPU 1360323 ns 1362449 ns 1.00
batchedmm(512, Bsize=512)/forward/CPU/2 thread(s) 787624562.5 ns 836902583.5 ns 0.94
batchedmm(512, Bsize=512)/forward/CPU/4 thread(s) 541996916 ns 545812333 ns 0.99
batchedmm(512, Bsize=512)/forward/CPU/8 thread(s) 539785459 ns 552742958 ns 0.98
batchedmm(512, Bsize=512)/forward/CPU/1 thread(s) 1557728417 ns 1515431791 ns 1.03
batchedmm(512, Bsize=512)/forward/GPU/CUDA 22543125 ns 22773250.5 ns 0.99
batchedmm(512, Bsize=512)/forward/GPU/AMDGPU 14726018 ns 14681704 ns 1.00
batchedmm(512, Bsize=512)/zygote/CPU/2 thread(s) 2518400750 ns 3618929167 ns 0.70
batchedmm(512, Bsize=512)/zygote/CPU/4 thread(s) 1785169708 ns 1786520209 ns 1.00
batchedmm(512, Bsize=512)/zygote/CPU/8 thread(s) 1784676208 ns 1811380625 ns 0.99
batchedmm(512, Bsize=512)/zygote/CPU/1 thread(s) 5268664750 ns 4749890834 ns 1.11
batchedmm(512, Bsize=512)/zygote/GPU/CUDA 367578104 ns 371829328 ns 0.99
batchedmm(512, Bsize=512)/zygote/GPU/AMDGPU 88737971 ns 89064682 ns 1.00
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 75084 ns 75813 ns 0.99
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 76541.5 ns 76708 ns 1.00
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 78958 ns 79437 ns 0.99
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 75625 ns 76979 ns 0.98
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/GPU/CUDA 206590 ns 213831.5 ns 0.97
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/GPU/oneAPI 7826047.5 ns 7889207 ns 0.99
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/GPU/Metal 947916 ns 504291 ns 1.88
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/GPU/AMDGPU 120271 ns 107541 ns 1.12
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 193042 ns 268729 ns 0.72
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 278584 ns 283625 ns 0.98
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 194458 ns 204145.5 ns 0.95
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 249250 ns 192875 ns 1.29
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 1038440 ns 1071904.5 ns 0.97
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/oneAPI 43499026 ns 42887765 ns 1.01
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/Metal 6277083 ns 5838812.5 ns 1.08
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/AMDGPU 658001 ns 632041 ns 1.04
batchedmm(512, Bsize=128)/forward/CPU/2 thread(s) 199276312.5 ns 199435500 ns 1.00
batchedmm(512, Bsize=128)/forward/CPU/4 thread(s) 139271583 ns 139086375 ns 1.00
batchedmm(512, Bsize=128)/forward/CPU/8 thread(s) 139246333 ns 139238083 ns 1.00
batchedmm(512, Bsize=128)/forward/CPU/1 thread(s) 388477666 ns 389003125 ns 1.00
batchedmm(512, Bsize=128)/forward/GPU/CUDA 5836579.5 ns 5834940 ns 1.00
batchedmm(512, Bsize=128)/forward/GPU/AMDGPU 3573103 ns 3577266 ns 1.00
batchedmm(512, Bsize=128)/zygote/CPU/2 thread(s) 619375645.5 ns 616747896 ns 1.00
batchedmm(512, Bsize=128)/zygote/CPU/4 thread(s) 439498458 ns 438910291 ns 1.00
batchedmm(512, Bsize=128)/zygote/CPU/8 thread(s) 439699604.5 ns 439344770.5 ns 1.00
batchedmm(512, Bsize=128)/zygote/CPU/1 thread(s) 1187020083 ns 1178749375 ns 1.01
batchedmm(512, Bsize=128)/zygote/GPU/CUDA 26508453 ns 26592537.5 ns 1.00
batchedmm(512, Bsize=128)/zygote/GPU/AMDGPU 22071416 ns 22013573 ns 1.00
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 13833 ns 7292 ns 1.90
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 13292 ns 6291 ns 2.11
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 13625 ns 6250 ns 2.18
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 16334 ns 9959 ns 1.64
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/GPU/CUDA 37105 ns 28590.5 ns 1.30
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/GPU/oneAPI 1214237 ns 1242816 ns 0.98
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/GPU/Metal 682166 ns 342708 ns 1.99
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/GPU/AMDGPU 56160 ns 46790 ns 1.20
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 219750 ns 214875 ns 1.02
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 228708 ns 220542 ns 1.04
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 229666.5 ns 223250 ns 1.03
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 213125 ns 207000 ns 1.03
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 233596 ns 227888 ns 1.03
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/oneAPI 32648557 ns 32088566 ns 1.02
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/Metal 9102583 ns 9056958 ns 1.01
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/AMDGPU 556036 ns 532636 ns 1.04
groupnorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/2 thread(s) 8625 ns 7500 ns 1.15
groupnorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/4 thread(s) 9083.5 ns 8459 ns 1.07
groupnorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/8 thread(s) 10416 ns 11166 ns 0.93
groupnorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/1 thread(s) 8125 ns 10125 ns 0.80
groupnorm(2, act=relu, affine=false)(4 x 32)/forward/GPU/CUDA 116194 ns 120432.5 ns 0.96
groupnorm(2, act=relu, affine=false)(4 x 32)/forward/GPU/oneAPI 3482048 ns 3400864 ns 1.02
groupnorm(2, act=relu, affine=false)(4 x 32)/forward/GPU/Metal 900041.5 ns 833917 ns 1.08
groupnorm(2, act=relu, affine=false)(4 x 32)/forward/GPU/AMDGPU 73561 ns 69170 ns 1.06
groupnorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/2 thread(s) 7583 ns 11687 ns 0.65
groupnorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/4 thread(s) 8084 ns 7875 ns 1.03
groupnorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/8 thread(s) 8208 ns 9083 ns 0.90
groupnorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/1 thread(s) 7708 ns 7791.5 ns 0.99
groupnorm(2, act=relu, affine=false)(4 x 32)/zygote/GPU/CUDA 515823 ns 540200 ns 0.95
groupnorm(2, act=relu, affine=false)(4 x 32)/zygote/GPU/oneAPI 19542020 ns 19905821.5 ns 0.98
groupnorm(2, act=relu, affine=false)(4 x 32)/zygote/GPU/Metal 4141083 ns 3738000 ns 1.11
groupnorm(2, act=relu, affine=false)(4 x 32)/zygote/GPU/AMDGPU 319483 ns 316443 ns 1.01
batchnorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/2 thread(s) 6709 ns 500 ns 13.42
batchnorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/4 thread(s) 7000 ns 500 ns 14
batchnorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/8 thread(s) 7250 ns 583 ns 12.44
batchnorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/1 thread(s) 6833 ns 500 ns 13.67
batchnorm(2, act=identity, affine=true)(32 x 32)/forward/GPU/CUDA 35385 ns 26859 ns 1.32
batchnorm(2, act=identity, affine=true)(32 x 32)/forward/GPU/oneAPI 1225507.5 ns 1218948 ns 1.01
batchnorm(2, act=identity, affine=true)(32 x 32)/forward/GPU/Metal 317896 ns 487291.5 ns 0.65
batchnorm(2, act=identity, affine=true)(32 x 32)/forward/GPU/AMDGPU 58250 ns 46600 ns 1.25
batchnorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/2 thread(s) 15083 ns 12042 ns 1.25
batchnorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/4 thread(s) 16500 ns 9500 ns 1.74
batchnorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/8 thread(s) 17375 ns 10666 ns 1.63
batchnorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/1 thread(s) 15583 ns 9375 ns 1.66
batchnorm(2, act=identity, affine=true)(32 x 32)/zygote/GPU/CUDA 263078 ns 259067.5 ns 1.02
batchnorm(2, act=identity, affine=true)(32 x 32)/zygote/GPU/oneAPI 23218391 ns 22720833.5 ns 1.02
batchnorm(2, act=identity, affine=true)(32 x 32)/zygote/GPU/Metal 5365146 ns 5032208 ns 1.07
batchnorm(2, act=identity, affine=true)(32 x 32)/zygote/GPU/AMDGPU 398084 ns 388914 ns 1.02
bias_activation(512, act=gelu)(512 x 128)/forward/CPU/2 thread(s) 111604 ns 105209 ns 1.06
bias_activation(512, act=gelu)(512 x 128)/forward/CPU/4 thread(s) 106999.5 ns 98958.5 ns 1.08
bias_activation(512, act=gelu)(512 x 128)/forward/CPU/8 thread(s) 111125 ns 100666 ns 1.10
bias_activation(512, act=gelu)(512 x 128)/forward/CPU/1 thread(s) 158562.5 ns 146584 ns 1.08
bias_activation(512, act=gelu)(512 x 128)/forward/GPU/CUDA 27181 ns 26010 ns 1.05
bias_activation(512, act=gelu)(512 x 128)/forward/GPU/oneAPI 1210920.5 ns 1202311.5 ns 1.01
bias_activation(512, act=gelu)(512 x 128)/forward/GPU/Metal 268208 ns 239416 ns 1.12
bias_activation(512, act=gelu)(512 x 128)/forward/GPU/AMDGPU 193052 ns 191122 ns 1.01
bias_activation(512, act=gelu)(512 x 128)/zygote/CPU/2 thread(s) 479520.5 ns 478959 ns 1.00
bias_activation(512, act=gelu)(512 x 128)/zygote/CPU/4 thread(s) 510437.5 ns 490458 ns 1.04
bias_activation(512, act=gelu)(512 x 128)/zygote/CPU/8 thread(s) 480729 ns 483458 ns 0.99
bias_activation(512, act=gelu)(512 x 128)/zygote/CPU/1 thread(s) 479354.5 ns 519792 ns 0.92
bias_activation(512, act=gelu)(512 x 128)/zygote/GPU/CUDA 233277 ns 238157 ns 0.98
bias_activation(512, act=gelu)(512 x 128)/zygote/GPU/oneAPI 11412765 ns 11712742 ns 0.97
bias_activation(512, act=gelu)(512 x 128)/zygote/GPU/Metal 2209500 ns 2063166.5 ns 1.07
bias_activation(512, act=gelu)(512 x 128)/zygote/GPU/AMDGPU 604431 ns 609226.5 ns 0.99
batchedmm(16, Bsize=32)/forward/CPU/2 thread(s) 5021 ns 5459 ns 0.92
batchedmm(16, Bsize=32)/forward/CPU/4 thread(s) 5708.5 ns 6937.5 ns 0.82
batchedmm(16, Bsize=32)/forward/CPU/8 thread(s) 6333.5 ns 6708 ns 0.94
batchedmm(16, Bsize=32)/forward/CPU/1 thread(s) 6625 ns 4479 ns 1.48
batchedmm(16, Bsize=32)/forward/GPU/CUDA 16031 ns 17171 ns 0.93
batchedmm(16, Bsize=32)/forward/GPU/AMDGPU 84920 ns 84830 ns 1.00
batchedmm(16, Bsize=32)/zygote/CPU/2 thread(s) 12729.5 ns 12709 ns 1.00
batchedmm(16, Bsize=32)/zygote/CPU/4 thread(s) 11646 ns 11208.5 ns 1.04
batchedmm(16, Bsize=32)/zygote/CPU/8 thread(s) 12146 ns 11979.5 ns 1.01
batchedmm(16, Bsize=32)/zygote/CPU/1 thread(s) 17375 ns 16792 ns 1.03
batchedmm(16, Bsize=32)/zygote/GPU/CUDA 216325.5 ns 219500 ns 0.99
batchedmm(16, Bsize=32)/zygote/GPU/AMDGPU 366004 ns 367374 ns 1.00
batchedmm(16, Bsize=128)/forward/CPU/2 thread(s) 35312.5 ns 35250 ns 1.00
batchedmm(16, Bsize=128)/forward/CPU/4 thread(s) 51479 ns 51958 ns 0.99
batchedmm(16, Bsize=128)/forward/CPU/8 thread(s) 53042 ns 53333 ns 0.99
batchedmm(16, Bsize=128)/forward/CPU/1 thread(s) 13667 ns 13792 ns 0.99
batchedmm(16, Bsize=128)/forward/GPU/CUDA 21712 ns 22473 ns 0.97
batchedmm(16, Bsize=128)/forward/GPU/AMDGPU 91931 ns 87211 ns 1.05
batchedmm(16, Bsize=128)/zygote/CPU/2 thread(s) 37354.5 ns 37208 ns 1.00
batchedmm(16, Bsize=128)/zygote/CPU/4 thread(s) 44104 ns 30979 ns 1.42
batchedmm(16, Bsize=128)/zygote/CPU/8 thread(s) 32958 ns 32729.5 ns 1.01
batchedmm(16, Bsize=128)/zygote/CPU/1 thread(s) 57917 ns 57375 ns 1.01
batchedmm(16, Bsize=128)/zygote/GPU/CUDA 194626.5 ns 198883 ns 0.98
batchedmm(16, Bsize=128)/zygote/GPU/AMDGPU 399414 ns 411165 ns 0.97
bias_activation(2, act=tanh)(2 x 128)/forward/CPU/2 thread(s) 8542 ns 1708 ns 5.00
bias_activation(2, act=tanh)(2 x 128)/forward/CPU/4 thread(s) 9791.5 ns 1917 ns 5.11
bias_activation(2, act=tanh)(2 x 128)/forward/CPU/8 thread(s) 11625 ns 2208 ns 5.26
bias_activation(2, act=tanh)(2 x 128)/forward/CPU/1 thread(s) 9750 ns 2020.5 ns 4.83
bias_activation(2, act=tanh)(2 x 128)/forward/GPU/CUDA 23397 ns 20890 ns 1.12
bias_activation(2, act=tanh)(2 x 128)/forward/GPU/oneAPI 1258960 ns 1182894 ns 1.06
bias_activation(2, act=tanh)(2 x 128)/forward/GPU/Metal 305375 ns 198895.5 ns 1.54
bias_activation(2, act=tanh)(2 x 128)/forward/GPU/AMDGPU 34271 ns 34491 ns 0.99
bias_activation(2, act=tanh)(2 x 128)/zygote/CPU/2 thread(s) 3041 ns 2250 ns 1.35
bias_activation(2, act=tanh)(2 x 128)/zygote/CPU/4 thread(s) 3271 ns 2125 ns 1.54
bias_activation(2, act=tanh)(2 x 128)/zygote/CPU/8 thread(s) 3792 ns 2541 ns 1.49
bias_activation(2, act=tanh)(2 x 128)/zygote/CPU/1 thread(s) 3208 ns 2375 ns 1.35
bias_activation(2, act=tanh)(2 x 128)/zygote/GPU/CUDA 206318 ns 209350.5 ns 0.99
bias_activation(2, act=tanh)(2 x 128)/zygote/GPU/oneAPI 8856545.5 ns 9223044 ns 0.96
bias_activation(2, act=tanh)(2 x 128)/zygote/GPU/Metal 1504750.5 ns 1571458 ns 0.96
bias_activation(2, act=tanh)(2 x 128)/zygote/GPU/AMDGPU 141011.5 ns 137241 ns 1.03
layernorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/2 thread(s) 4792 ns 3979.5 ns 1.20
layernorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/4 thread(s) 4708.5 ns 4916 ns 0.96
layernorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/8 thread(s) 6834 ns 6167 ns 1.11
layernorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/1 thread(s) 4667 ns 5562.5 ns 0.84
layernorm(2, act=identity, affine=true)(4 x 32)/forward/GPU/CUDA 141147.5 ns 148854.5 ns 0.95
layernorm(2, act=identity, affine=true)(4 x 32)/forward/GPU/oneAPI 5871936.5 ns 5416916 ns 1.08
layernorm(2, act=identity, affine=true)(4 x 32)/forward/GPU/Metal 457167 ns 433541 ns 1.05
layernorm(2, act=identity, affine=true)(4 x 32)/forward/GPU/AMDGPU 68731 ns 69351 ns 0.99
layernorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/2 thread(s) 8458.5 ns 8958 ns 0.94
layernorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/4 thread(s) 8459 ns 8584 ns 0.99
layernorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/8 thread(s) 8750 ns 9375 ns 0.93
layernorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/1 thread(s) 8333 ns 8208 ns 1.02
layernorm(2, act=identity, affine=true)(4 x 32)/zygote/GPU/CUDA 861183 ns 901778 ns 0.95
layernorm(2, act=identity, affine=true)(4 x 32)/zygote/GPU/oneAPI 37057824 ns 39101068.5 ns 0.95
layernorm(2, act=identity, affine=true)(4 x 32)/zygote/GPU/Metal 5555937.5 ns 5296271 ns 1.05
layernorm(2, act=identity, affine=true)(4 x 32)/zygote/GPU/AMDGPU 385044 ns 390164 ns 0.99
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 58083 ns 56792 ns 1.02
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 59084 ns 57792 ns 1.02
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 59416 ns 57667 ns 1.03
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 59416 ns 58625 ns 1.01
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/CUDA 43710 ns 38676 ns 1.13
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/oneAPI 1203059 ns 1256024 ns 0.96
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/Metal 532666 ns 328000 ns 1.62
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/AMDGPU 207252 ns 204982 ns 1.01
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 449104.5 ns 454396 ns 0.99
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 465666.5 ns 464875 ns 1.00
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 467437 ns 465042 ns 1.01
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 435520.5 ns 433750 ns 1.00
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 264164 ns 274516.5 ns 0.96
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/oneAPI 27468186 ns 27766998 ns 0.99
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/Metal 8246875 ns 7963542 ns 1.04
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/AMDGPU 831528 ns 840618 ns 0.99
batchedmm(128, Bsize=128)/forward/CPU/2 thread(s) 3290708 ns 3290875 ns 1.00
batchedmm(128, Bsize=128)/forward/CPU/4 thread(s) 2334854.5 ns 2340916.5 ns 1.00
batchedmm(128, Bsize=128)/forward/CPU/8 thread(s) 2339729 ns 2344208.5 ns 1.00
batchedmm(128, Bsize=128)/forward/CPU/1 thread(s) 6308458 ns 6314083.5 ns 1.00
batchedmm(128, Bsize=128)/forward/GPU/CUDA 204167 ns 205766 ns 0.99
batchedmm(128, Bsize=128)/forward/GPU/AMDGPU 218552 ns 213542 ns 1.02
batchedmm(128, Bsize=128)/zygote/CPU/2 thread(s) 11346209 ns 11352771 ns 1.00
batchedmm(128, Bsize=128)/zygote/CPU/4 thread(s) 8328312.5 ns 8308208 ns 1.00
batchedmm(128, Bsize=128)/zygote/CPU/8 thread(s) 8321834 ns 8331229.5 ns 1.00
batchedmm(128, Bsize=128)/zygote/CPU/1 thread(s) 21080084 ns 21159458.5 ns 1.00
batchedmm(128, Bsize=128)/zygote/GPU/CUDA 728462 ns 735602 ns 0.99
batchedmm(128, Bsize=128)/zygote/GPU/AMDGPU 1058000 ns 1058910.5 ns 1.00
layernorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/2 thread(s) 5083.5 ns 3542 ns 1.44
layernorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/4 thread(s) 6875 ns 6646 ns 1.03
layernorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/8 thread(s) 7083 ns 7333 ns 0.97
layernorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/1 thread(s) 6604 ns 6875 ns 0.96
layernorm(2, act=identity, affine=false)(4 x 32)/forward/GPU/CUDA 136287.5 ns 141882 ns 0.96
layernorm(2, act=identity, affine=false)(4 x 32)/forward/GPU/oneAPI 5642827 ns 5384644 ns 1.05
layernorm(2, act=identity, affine=false)(4 x 32)/forward/GPU/Metal 783520.5 ns 792000 ns 0.99
layernorm(2, act=identity, affine=false)(4 x 32)/forward/GPU/AMDGPU 55800 ns 56381 ns 0.99
layernorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/2 thread(s) 7000 ns 9458 ns 0.74
layernorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/4 thread(s) 7417 ns 7583.5 ns 0.98
layernorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/8 thread(s) 7375 ns 7250 ns 1.02
layernorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/1 thread(s) 7291.5 ns 7458 ns 0.98
layernorm(2, act=identity, affine=false)(4 x 32)/zygote/GPU/CUDA 747674.5 ns 774451.5 ns 0.97
layernorm(2, act=identity, affine=false)(4 x 32)/zygote/GPU/oneAPI 35501443 ns 37102116 ns 0.96
layernorm(2, act=identity, affine=false)(4 x 32)/zygote/GPU/Metal 5585312.5 ns 5116062.5 ns 1.09
layernorm(2, act=identity, affine=false)(4 x 32)/zygote/GPU/AMDGPU 365323 ns 368734 ns 0.99
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 110750 ns 95500 ns 1.16
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 127458.5 ns 95041 ns 1.34
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 122542 ns 101334 ns 1.21
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 117167 ns 96958 ns 1.21
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/CUDA 156753 ns 153183 ns 1.02
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/oneAPI 6003618 ns 5925151 ns 1.01
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/Metal 2136000 ns 2007167 ns 1.06
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/AMDGPU 226292.5 ns 218112 ns 1.04
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 2021500 ns 2021874.5 ns 1.00
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 2022042 ns 2010334 ns 1.01
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 2031021 ns 2025458 ns 1.00
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 2023917 ns 2005917 ns 1.01
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 706711 ns 723141 ns 0.98
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/oneAPI 31552204.5 ns 33170321 ns 0.95
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/Metal 10690542 ns 10803562.5 ns 0.99
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/AMDGPU 1254492 ns 1255352 ns 1.00
batchedmm(2, Bsize=4)/forward/CPU/2 thread(s) 28833.5 ns 29750 ns 0.97
batchedmm(2, Bsize=4)/forward/CPU/4 thread(s) 36542 ns 36291.5 ns 1.01
batchedmm(2, Bsize=4)/forward/CPU/8 thread(s) 34917 ns 35000 ns 1.00
batchedmm(2, Bsize=4)/forward/CPU/1 thread(s) 708.5 ns 708 ns 1.00
batchedmm(2, Bsize=4)/forward/GPU/CUDA 15392 ns 15831 ns 0.97
batchedmm(2, Bsize=4)/forward/GPU/AMDGPU 79601 ns 80041 ns 0.99
batchedmm(2, Bsize=4)/zygote/CPU/2 thread(s) 3250 ns 3417 ns 0.95
batchedmm(2, Bsize=4)/zygote/CPU/4 thread(s) 3833 ns 3000 ns 1.28
batchedmm(2, Bsize=4)/zygote/CPU/8 thread(s) 3917 ns 2958 ns 1.32
batchedmm(2, Bsize=4)/zygote/CPU/1 thread(s) 2834 ns 2292 ns 1.24
batchedmm(2, Bsize=4)/zygote/GPU/CUDA 139825 ns 144997 ns 0.96
batchedmm(2, Bsize=4)/zygote/GPU/AMDGPU 340743 ns 345563 ns 0.99
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 8416 ns 7167 ns 1.17
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 7333 ns 6208 ns 1.18
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 7541 ns 6042 ns 1.25
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 11208 ns 10458 ns 1.07
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/GPU/CUDA 42506 ns 37804.5 ns 1.12
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/GPU/oneAPI 1167392.5 ns 1127358 ns 1.04
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/GPU/Metal 420187.5 ns 324750 ns 1.29
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/GPU/AMDGPU 50571 ns 48830 ns 1.04
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 213521 ns 213833 ns 1.00
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 229958 ns 221229 ns 1.04
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 222791.5 ns 220667 ns 1.01
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 215375 ns 206167 ns 1.04
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 251022 ns 251783 ns 1.00
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/oneAPI 25955253.5 ns 25462835 ns 1.02
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/Metal 7930584 ns 7855917 ns 1.01
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/AMDGPU 574850 ns 579016 ns 0.99
dense(32, bias=true, act=identity)(32 x 128)/forward/CPU/2 thread(s) 6209 ns 3917 ns 1.59
dense(32, bias=true, act=identity)(32 x 128)/forward/CPU/4 thread(s) 6375 ns 3958 ns 1.61
dense(32, bias=true, act=identity)(32 x 128)/forward/CPU/8 thread(s) 6459 ns 3917 ns 1.65
dense(32, bias=true, act=identity)(32 x 128)/forward/CPU/1 thread(s) 6125 ns 4167 ns 1.47
dense(32, bias=true, act=identity)(32 x 128)/forward/GPU/CUDA 28584 ns 22588 ns 1.27
dense(32, bias=true, act=identity)(32 x 128)/forward/GPU/oneAPI 2152846 ns 2083671 ns 1.03
dense(32, bias=true, act=identity)(32 x 128)/forward/GPU/Metal 251125 ns 226542 ns 1.11
dense(32, bias=true, act=identity)(32 x 128)/forward/GPU/AMDGPU 47090 ns 42771 ns 1.10
dense(32, bias=true, act=identity)(32 x 128)/zygote/CPU/2 thread(s) 23167 ns 14916 ns 1.55
dense(32, bias=true, act=identity)(32 x 128)/zygote/CPU/4 thread(s) 24166 ns 15083 ns 1.60
dense(32, bias=true, act=identity)(32 x 128)/zygote/CPU/8 thread(s) 24375 ns 14916 ns 1.63
dense(32, bias=true, act=identity)(32 x 128)/zygote/CPU/1 thread(s) 23292 ns 14792 ns 1.57
dense(32, bias=true, act=identity)(32 x 128)/zygote/GPU/CUDA 333019.5 ns 316521 ns 1.05
dense(32, bias=true, act=identity)(32 x 128)/zygote/GPU/oneAPI 11744061 ns 11265875.5 ns 1.04
dense(32, bias=true, act=identity)(32 x 128)/zygote/GPU/Metal 1014854.5 ns 963479.5 ns 1.05
dense(32, bias=true, act=identity)(32 x 128)/zygote/GPU/AMDGPU 208872 ns 193022 ns 1.08
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 110000.5 ns 101709 ns 1.08
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 148604 ns 99958 ns 1.49
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 126750 ns 106041 ns 1.20
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 133125 ns 102208 ns 1.30
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/GPU/CUDA 148515 ns 142614 ns 1.04
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/GPU/oneAPI 5739795.5 ns 5689078 ns 1.01
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/GPU/Metal 2080104 ns 2045292 ns 1.02
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/GPU/AMDGPU 217122 ns 214192 ns 1.01
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 1912521 ns 1924667 ns 0.99
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 1906583 ns 1842979 ns 1.03
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 1884312.5 ns 1918292 ns 0.98
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 1920187.5 ns 1901125 ns 1.01
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 696456 ns 707209 ns 0.98
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/oneAPI 31821334 ns 31631954.5 ns 1.01
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/Metal 10487959 ns 10461667 ns 1.00
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/AMDGPU 1218296 ns 1220282 ns 1.00
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 17542 ns 16604 ns 1.06
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 22458 ns 18813 ns 1.19
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 20771 ns 21271 ns 0.98
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 18771 ns 18291 ns 1.03
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/CUDA 112142.5 ns 111618 ns 1.00
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/oneAPI 3422627 ns 3369345 ns 1.02
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/Metal 1340625 ns 464208 ns 2.89
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/AMDGPU 80871 ns 80435.5 ns 1.01
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 215875 ns 216042 ns 1.00
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 253583 ns 217458 ns 1.17
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 217667 ns 216708.5 ns 1.00
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 216417 ns 216395.5 ns 1.00
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 525953.5 ns 534644 ns 0.98
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/oneAPI 19492559.5 ns 19551285.5 ns 1.00
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/Metal 6121020.5 ns 6104084 ns 1.00
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/AMDGPU 476639.5 ns 481515 ns 0.99
batchedmm(16, Bsize=4)/forward/CPU/2 thread(s) 23979.5 ns 23416.5 ns 1.02
batchedmm(16, Bsize=4)/forward/CPU/4 thread(s) 32625 ns 30395.5 ns 1.07
batchedmm(16, Bsize=4)/forward/CPU/8 thread(s) 28250 ns 28583 ns 0.99
batchedmm(16, Bsize=4)/forward/CPU/1 thread(s) 1666.5 ns 1250 ns 1.33
batchedmm(16, Bsize=4)/forward/GPU/CUDA 16428 ns 16607 ns 0.99
batchedmm(16, Bsize=4)/forward/GPU/AMDGPU 81141 ns 81651 ns 0.99
batchedmm(16, Bsize=4)/zygote/CPU/2 thread(s) 5271 ns 4729.5 ns 1.11
batchedmm(16, Bsize=4)/zygote/CPU/4 thread(s) 5854.5 ns 4916.5 ns 1.19
batchedmm(16, Bsize=4)/zygote/CPU/8 thread(s) 6437.5 ns 5104.5 ns 1.26
batchedmm(16, Bsize=4)/zygote/CPU/1 thread(s) 5646 ns 4875 ns 1.16
batchedmm(16, Bsize=4)/zygote/GPU/CUDA 215206.5 ns 212757 ns 1.01
batchedmm(16, Bsize=4)/zygote/GPU/AMDGPU 379243.5 ns 378384 ns 1.00
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 303000 ns 303792 ns 1.00
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 305416.5 ns 306416.5 ns 1.00
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 308771 ns 308125 ns 1.00
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 305083 ns 306917 ns 0.99
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/CUDA 231043 ns 235352.5 ns 0.98
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/oneAPI 8121882 ns 7753901 ns 1.05
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/Metal 1184000 ns 895000 ns 1.32
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/AMDGPU 272543 ns 273893 ns 1.00
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 529833.5 ns 532500 ns 0.99
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 567729.5 ns 561375 ns 1.01
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 533292 ns 533875 ns 1.00
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 536958.5 ns 538042 ns 1.00
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 1091736 ns 1115910 ns 0.98
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/oneAPI 45445443.5 ns 43545460 ns 1.04
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/Metal 6208000 ns 5736646 ns 1.08
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/AMDGPU 868528 ns 855458 ns 1.02
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 36042 ns 18500 ns 1.95
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 39083 ns 23125 ns 1.69
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 42458 ns 20875 ns 2.03
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 37041 ns 20250 ns 1.83
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/GPU/CUDA 131591 ns 117298.5 ns 1.12
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/GPU/oneAPI 3738074.5 ns 3644245 ns 1.03
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/GPU/Metal 1464375 ns 475438 ns 3.08
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/GPU/AMDGPU 87560 ns 79291 ns 1.10
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 215250.5 ns 213125 ns 1.01
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 215104.5 ns 227959 ns 0.94
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 215917 ns 214479.5 ns 1.01
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 219083.5 ns 212750 ns 1.03
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 768516 ns 769273 ns 1.00
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/oneAPI 24673008 ns 26817998 ns 0.92
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/Metal 7384104 ns 7163750 ns 1.03
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/AMDGPU 532425 ns 536785 ns 0.99
layernorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/2 thread(s) 5500 ns 5292 ns 1.04
layernorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/4 thread(s) 7000 ns 6979 ns 1.00
layernorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/8 thread(s) 8542 ns 8458.5 ns 1.01
layernorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/1 thread(s) 6334 ns 6958 ns 0.91
layernorm(2, act=identity, affine=false)(32 x 32)/forward/GPU/CUDA 140673 ns 144689.5 ns 0.97
layernorm(2, act=identity, affine=false)(32 x 32)/forward/GPU/oneAPI 5635164.5 ns 5674338 ns 0.99
layernorm(2, act=identity, affine=false)(32 x 32)/forward/GPU/Metal 772083 ns 763958 ns 1.01
layernorm(2, act=identity, affine=false)(32 x 32)/forward/GPU/AMDGPU 67510 ns 65951 ns 1.02
layernorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/2 thread(s) 10271 ns 9833 ns 1.04
layernorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/4 thread(s) 10292 ns 10395.5 ns 0.99
layernorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/8 thread(s) 10833.5 ns 9875 ns 1.10
layernorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/1 thread(s) 10875 ns 10166 ns 1.07
layernorm(2, act=identity, affine=false)(32 x 32)/zygote/GPU/CUDA 833045.5 ns 843305.5 ns 0.99
layernorm(2, act=identity, affine=false)(32 x 32)/zygote/GPU/oneAPI 38670510 ns 40229475 ns 0.96
layernorm(2, act=identity, affine=false)(32 x 32)/zygote/GPU/Metal 5336083 ns 5021354 ns 1.06
layernorm(2, act=identity, affine=false)(32 x 32)/zygote/GPU/AMDGPU 390258.5 ns 388453.5 ns 1.00
layernorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/2 thread(s) 4709 ns 5083 ns 0.93
layernorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/4 thread(s) 5500 ns 5645.5 ns 0.97
layernorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/8 thread(s) 6709 ns 7354 ns 0.91
layernorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/1 thread(s) 6542 ns 7459 ns 0.88
layernorm(2, act=relu, affine=false)(4 x 32)/forward/GPU/CUDA 144256 ns 148525.5 ns 0.97
layernorm(2, act=relu, affine=false)(4 x 32)/forward/GPU/oneAPI 5567924 ns 5807141 ns 0.96
layernorm(2, act=relu, affine=false)(4 x 32)/forward/GPU/Metal 792979 ns 768729 ns 1.03
layernorm(2, act=relu, affine=false)(4 x 32)/forward/GPU/AMDGPU 66931 ns 67441 ns 0.99
layernorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/2 thread(s) 7125.5 ns 7459 ns 0.96
layernorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/4 thread(s) 7917 ns 7750 ns 1.02
layernorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/8 thread(s) 7917 ns 7583 ns 1.04
layernorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/1 thread(s) 7604.5 ns 7291 ns 1.04
layernorm(2, act=relu, affine=false)(4 x 32)/zygote/GPU/CUDA 790107 ns 806597 ns 0.98
layernorm(2, act=relu, affine=false)(4 x 32)/zygote/GPU/oneAPI 39023151 ns 38873703 ns 1.00
layernorm(2, act=relu, affine=false)(4 x 32)/zygote/GPU/Metal 5547667 ns 5499042 ns 1.01
layernorm(2, act=relu, affine=false)(4 x 32)/zygote/GPU/AMDGPU 391578.5 ns 394693 ns 0.99
batchedmm(128, Bsize=512)/forward/CPU/2 thread(s) 14365625 ns 14393541 ns 1.00
batchedmm(128, Bsize=512)/forward/CPU/4 thread(s) 10109792 ns 10086042 ns 1.00
batchedmm(128, Bsize=512)/forward/CPU/8 thread(s) 10132375 ns 10132625 ns 1.00
batchedmm(128, Bsize=512)/forward/CPU/1 thread(s) 27659333 ns 27847083 ns 0.99
batchedmm(128, Bsize=512)/forward/GPU/CUDA 534508 ns 531501 ns 1.01
batchedmm(128, Bsize=512)/forward/GPU/AMDGPU 392324 ns 400094 ns 0.98
batchedmm(128, Bsize=512)/zygote/CPU/2 thread(s) 45855833 ns 45837667 ns 1.00
batchedmm(128, Bsize=512)/zygote/CPU/4 thread(s) 33506395.5 ns 33412125 ns 1.00
batchedmm(128, Bsize=512)/zygote/CPU/8 thread(s) 33525958 ns 33550792 ns 1.00
batchedmm(128, Bsize=512)/zygote/CPU/1 thread(s) 85233208 ns 85694750 ns 0.99
batchedmm(128, Bsize=512)/zygote/GPU/CUDA 2804828.5 ns 2655274 ns 1.06
batchedmm(128, Bsize=512)/zygote/GPU/AMDGPU 3316671 ns 3296132 ns 1.01
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 83646 ns 65750 ns 1.27
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 87875 ns 69354 ns 1.27
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 90333 ns 68834 ns 1.31
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 85687.5 ns 67708 ns 1.27
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/CUDA 124763.5 ns 125224.5 ns 1.00
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/oneAPI 3719605 ns 3321446 ns 1.12
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/Metal 1478042 ns 478792 ns 3.09
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/AMDGPU 248002.5 ns 228082 ns 1.09
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 442062 ns 442083 ns 1.00
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 451167 ns 452104 ns 1.00
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 444167 ns 442208 ns 1.00
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 441479 ns 444791 ns 0.99
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 747087 ns 744155 ns 1.00
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/oneAPI 26627384 ns 26781484 ns 0.99
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/Metal 7697145.5 ns 7548250 ns 1.02
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/AMDGPU 784227 ns 785568 ns 1.00
batchnorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/2 thread(s) 1750 ns 500 ns 3.50
batchnorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/4 thread(s) 1875 ns 584 ns 3.21
batchnorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/8 thread(s) 2000 ns 583 ns 3.43
batchnorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/1 thread(s) 1750 ns 541 ns 3.23
batchnorm(2, act=relu, affine=false)(32 x 32)/forward/GPU/CUDA 38564 ns 33459 ns 1.15
batchnorm(2, act=relu, affine=false)(32 x 32)/forward/GPU/oneAPI 1190356 ns 1181669 ns 1.01
batchnorm(2, act=relu, affine=false)(32 x 32)/forward/GPU/Metal 469896 ns 266750 ns 1.76
batchnorm(2, act=relu, affine=false)(32 x 32)/forward/GPU/AMDGPU 50030 ns 47690 ns 1.05
batchnorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/2 thread(s) 10250 ns 9104.5 ns 1.13
batchnorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/4 thread(s) 10938 ns 8958 ns 1.22
batchnorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/8 thread(s) 11042 ns 9375 ns 1.18
batchnorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/1 thread(s) 10625 ns 8333 ns 1.28
batchnorm(2, act=relu, affine=false)(32 x 32)/zygote/GPU/CUDA 286642 ns 292729 ns 0.98
batchnorm(2, act=relu, affine=false)(32 x 32)/zygote/GPU/oneAPI 21365525.5 ns 21877451 ns 0.98
batchnorm(2, act=relu, affine=false)(32 x 32)/zygote/GPU/Metal 4815583.5 ns 4421083 ns 1.09
batchnorm(2, act=relu, affine=false)(32 x 32)/zygote/GPU/AMDGPU 389993 ns 376084 ns 1.04
dense(32, bias=false, act=gelu)(32 x 128)/forward/CPU/2 thread(s) 9833 ns 9834 ns 1.00
dense(32, bias=false, act=gelu)(32 x 128)/forward/CPU/4 thread(s) 9834 ns 9792 ns 1.00
dense(32, bias=false, act=gelu)(32 x 128)/forward/CPU/8 thread(s) 9833 ns 9833 ns 1
dense(32, bias=false, act=gelu)(32 x 128)/forward/CPU/1 thread(s) 9792 ns 9834 ns 1.00
dense(32, bias=false, act=gelu)(32 x 128)/forward/GPU/CUDA 23280 ns 23819 ns 0.98
dense(32, bias=false, act=gelu)(32 x 128)/forward/GPU/oneAPI 2038143 ns 1943243 ns 1.05
dense(32, bias=false, act=gelu)(32 x 128)/forward/GPU/Metal 228792 ns 211083 ns 1.08
dense(32, bias=false, act=gelu)(32 x 128)/forward/GPU/AMDGPU 204342 ns 209072 ns 0.98
dense(32, bias=false, act=gelu)(32 x 128)/zygote/CPU/2 thread(s) 49584 ns 45958 ns 1.08
dense(32, bias=false, act=gelu)(32 x 128)/zygote/CPU/4 thread(s) 50542 ns 46375 ns 1.09
dense(32, bias=false, act=gelu)(32 x 128)/zygote/CPU/8 thread(s) 50708 ns 46167 ns 1.10
dense(32, bias=false, act=gelu)(32 x 128)/zygote/CPU/1 thread(s) 49917 ns 45542 ns 1.10
dense(32, bias=false, act=gelu)(32 x 128)/zygote/GPU/CUDA 308151 ns 297740 ns 1.03
dense(32, bias=false, act=gelu)(32 x 128)/zygote/GPU/oneAPI 9437098 ns 13019378 ns 0.72
dense(32, bias=false, act=gelu)(32 x 128)/zygote/GPU/Metal 1545500 ns 1008520.5 ns 1.53
dense(32, bias=false, act=gelu)(32 x 128)/zygote/GPU/AMDGPU 603836 ns 610991 ns 0.99
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 62834 ns 56250 ns 1.12
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 64292 ns 57125 ns 1.13
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 64333 ns 57125 ns 1.13
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 64250 ns 57708.5 ns 1.11
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/CUDA 39886 ns 29558.5 ns 1.35
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/oneAPI 1257152 ns 1212552 ns 1.04
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/Metal 638041.5 ns 345084 ns 1.85
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/AMDGPU 213412 ns 204882 ns 1.04
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 456084 ns 449291.5 ns 1.02
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 488791.5 ns 482958 ns 1.01
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 476146 ns 465791 ns 1.02
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 491750 ns 434625 ns 1.13
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 263616 ns 253081.5 ns 1.04
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/oneAPI 32507864 ns 31946764 ns 1.02
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/Metal 9629125 ns 9299875.5 ns 1.04
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/AMDGPU 891718 ns 887358.5 ns 1.00
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 638875 ns 639500 ns 1.00
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 657062.5 ns 610791 ns 1.08
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 647917 ns 650021 ns 1.00
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 637833 ns 613396 ns 1.04
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/GPU/CUDA 209655 ns 213054.5 ns 0.98
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/GPU/oneAPI 8362863.5 ns 8304459 ns 1.01
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/GPU/Metal 1377917 ns 1377667 ns 1.00
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/GPU/AMDGPU 308858 ns 314248 ns 0.98
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 2231042 ns 2230375 ns 1.00
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 2234709 ns 2241083 ns 1.00
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 2231770.5 ns 2226458 ns 1.00
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 2224542 ns 2044000 ns 1.09
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 969019 ns 1009323.5 ns 0.96
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/oneAPI 48212492 ns 48595808 ns 0.99
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/Metal 7164667 ns 10250250 ns 0.70
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/AMDGPU 1319082 ns 1209503 ns 1.09
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 36750.5 ns 18583 ns 1.98
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 40083 ns 21500 ns 1.86
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 42416 ns 22084 ns 1.92
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 36146 ns 20333 ns 1.78
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/CUDA 131167.5 ns 115629.5 ns 1.13
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/oneAPI 3717220.5 ns 3530676 ns 1.05
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/Metal 1489541.5 ns 529396 ns 2.81
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/AMDGPU 89901 ns 79871 ns 1.13
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 221125 ns 219583.5 ns 1.01
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 231999.5 ns 228750 ns 1.01
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 223062.5 ns 221395.5 ns 1.01
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 220250 ns 219500 ns 1.00
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 745440.5 ns 743488 ns 1.00
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/oneAPI 23316075.5 ns 26086313.5 ns 0.89
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/Metal 7764958 ns 7436521 ns 1.04
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/AMDGPU 549685 ns 556135 ns 0.99
batchnorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/2 thread(s) 6833 ns 500 ns 13.67
batchnorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/4 thread(s) 7208 ns 584 ns 12.34
batchnorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/8 thread(s) 7375 ns 584 ns 12.63
batchnorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/1 thread(s) 6834 ns 500 ns 13.67
batchnorm(2, act=relu, affine=true)(32 x 32)/forward/GPU/CUDA 33512 ns 24005 ns 1.40
batchnorm(2, act=relu, affine=true)(32 x 32)/forward/GPU/oneAPI 1223090 ns 1194343 ns 1.02
batchnorm(2, act=relu, affine=true)(32 x 32)/forward/GPU/Metal 444542 ns 283521 ns 1.57
batchnorm(2, act=relu, affine=true)(32 x 32)/forward/GPU/AMDGPU 57271 ns 47860 ns 1.20
batchnorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/2 thread(s) 15833.5 ns 9979 ns 1.59
batchnorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/4 thread(s) 17146 ns 10542 ns 1.63
batchnorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/8 thread(s) 17041 ns 9687.5 ns 1.76
batchnorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/1 thread(s) 16687.5 ns 9916.5 ns 1.68
batchnorm(2, act=relu, affine=true)(32 x 32)/zygote/GPU/CUDA 282963.5 ns 274665.5 ns 1.03
batchnorm(2, act=relu, affine=true)(32 x 32)/zygote/GPU/oneAPI 24155856 ns 25054245 ns 0.96
batchnorm(2, act=relu, affine=true)(32 x 32)/zygote/GPU/Metal 5994417 ns 4901583 ns 1.22
batchnorm(2, act=relu, affine=true)(32 x 32)/zygote/GPU/AMDGPU 408498.5 ns 403794 ns 1.01
groupnorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/2 thread(s) 9458.5 ns 7750 ns 1.22
groupnorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/4 thread(s) 9167 ns 8541 ns 1.07
groupnorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/8 thread(s) 10416.5 ns 9458 ns 1.10
groupnorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/1 thread(s) 8541.5 ns 10041 ns 0.85
groupnorm(2, act=identity, affine=false)(4 x 32)/forward/GPU/CUDA 120739.5 ns 122963.5 ns 0.98
groupnorm(2, act=identity, affine=false)(4 x 32)/forward/GPU/oneAPI 3418426 ns 3342683 ns 1.02
groupnorm(2, act=identity, affine=false)(4 x 32)/forward/GPU/Metal 888833.5 ns 828959 ns 1.07
groupnorm(2, act=identity, affine=false)(4 x 32)/forward/GPU/AMDGPU 70321 ns 70460 ns 1.00
groupnorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/2 thread(s) 7500 ns 7583 ns 0.99
groupnorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/4 thread(s) 7667 ns 7875 ns 0.97
groupnorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/8 thread(s) 7895.5 ns 7917 ns 1.00
groupnorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/1 thread(s) 7417 ns 7208 ns 1.03
groupnorm(2, act=identity, affine=false)(4 x 32)/zygote/GPU/CUDA 513454.5 ns 521824.5 ns 0.98
groupnorm(2, act=identity, affine=false)(4 x 32)/zygote/GPU/oneAPI 17140740 ns 17096205 ns 1.00
groupnorm(2, act=identity, affine=false)(4 x 32)/zygote/GPU/Metal 3973625 ns 3622437.5 ns 1.10
groupnorm(2, act=identity, affine=false)(4 x 32)/zygote/GPU/AMDGPU 319713 ns 323444 ns 0.99
bias_activation(2, act=gelu)(2 x 128)/forward/CPU/2 thread(s) 9354 ns 1375 ns 6.80
bias_activation(2, act=gelu)(2 x 128)/forward/CPU/4 thread(s) 9542 ns 1708 ns 5.59
bias_activation(2, act=gelu)(2 x 128)/forward/CPU/8 thread(s) 10583 ns 1875 ns 5.64
bias_activation(2, act=gelu)(2 x 128)/forward/CPU/1 thread(s) 9229 ns 1584 ns 5.83
bias_activation(2, act=gelu)(2 x 128)/forward/GPU/CUDA 24142 ns 22394 ns 1.08
bias_activation(2, act=gelu)(2 x 128)/forward/GPU/oneAPI 1195519 ns 1154621 ns 1.04
bias_activation(2, act=gelu)(2 x 128)/forward/GPU/Metal 305208 ns 310833 ns 0.98
bias_activation(2, act=gelu)(2 x 128)/forward/GPU/AMDGPU 190361 ns 190371.5 ns 1.00
bias_activation(2, act=gelu)(2 x 128)/zygote/CPU/2 thread(s) 4145.5 ns 3209 ns 1.29
bias_activation(2, act=gelu)(2 x 128)/zygote/CPU/4 thread(s) 4208 ns 3333 ns 1.26
bias_activation(2, act=gelu)(2 x 128)/zygote/CPU/8 thread(s) 4750 ns 3583 ns 1.33
bias_activation(2, act=gelu)(2 x 128)/zygote/CPU/1 thread(s) 4167 ns 3500 ns 1.19
bias_activation(2, act=gelu)(2 x 128)/zygote/GPU/CUDA 226431.5 ns 224060 ns 1.01
bias_activation(2, act=gelu)(2 x 128)/zygote/GPU/oneAPI 10249888.5 ns 9920013 ns 1.03
bias_activation(2, act=gelu)(2 x 128)/zygote/GPU/Metal 1679312.5 ns 1731417 ns 0.97
bias_activation(2, act=gelu)(2 x 128)/zygote/GPU/AMDGPU 577155 ns 581006 ns 0.99
bias_activation(512, act=tanh)(512 x 128)/forward/CPU/2 thread(s) 155083 ns 145687 ns 1.06
bias_activation(512, act=tanh)(512 x 128)/forward/CPU/4 thread(s) 136375 ns 128584 ns 1.06
bias_activation(512, act=tanh)(512 x 128)/forward/CPU/8 thread(s) 140958 ns 129625 ns 1.09
bias_activation(512, act=tanh)(512 x 128)/forward/CPU/1 thread(s) 232833.5 ns 226167 ns 1.03
bias_activation(512, act=tanh)(512 x 128)/forward/GPU/CUDA 26998 ns 25004 ns 1.08
bias_activation(512, act=tanh)(512 x 128)/forward/GPU/oneAPI 1222721.5 ns 1165561.5 ns 1.05
bias_activation(512, act=tanh)(512 x 128)/forward/GPU/Metal 297875 ns 248959 ns 1.20
bias_activation(512, act=tanh)(512 x 128)/forward/GPU/AMDGPU 42431 ns 40870 ns 1.04
bias_activation(512, act=tanh)(512 x 128)/zygote/CPU/2 thread(s) 144458 ns 143604 ns 1.01
bias_activation(512, act=tanh)(512 x 128)/zygote/CPU/4 thread(s) 127291 ns 130083 ns 0.98
bias_activation(512, act=tanh)(512 x 128)/zygote/CPU/8 thread(s) 112104.5 ns 111208 ns 1.01
bias_activation(512, act=tanh)(512 x 128)/zygote/CPU/1 thread(s) 252250 ns 251937.5 ns 1.00
bias_activation(512, act=tanh)(512 x 128)/zygote/GPU/CUDA 219049 ns 224391 ns 0.98
bias_activation(512, act=tanh)(512 x 128)/zygote/GPU/oneAPI 10976395 ns 10232573 ns 1.07
bias_activation(512, act=tanh)(512 x 128)/zygote/GPU/Metal 2074312 ns 1955250 ns 1.06
bias_activation(512, act=tanh)(512 x 128)/zygote/GPU/AMDGPU 265923 ns 267492 ns 0.99
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 8583 ns 7208 ns 1.19
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 7292 ns 6083 ns 1.20
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 7292 ns 6000 ns 1.22
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 11333 ns 10458 ns 1.08
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/CUDA 38422 ns 34049 ns 1.13
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/oneAPI 1208855 ns 1180224 ns 1.02
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/Metal 374313 ns 325584 ns 1.15
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/AMDGPU 51281 ns 50630 ns 1.01
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 221417 ns 219688 ns 1.01
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 229791 ns 237125 ns 0.97
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 230458.5 ns 228500 ns 1.01
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 214041.5 ns 212875 ns 1.01
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 259283 ns 270641 ns 0.96
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/oneAPI 27623036 ns 29882407 ns 0.92
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/Metal 8241896 ns 8193250 ns 1.01
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/AMDGPU 592306 ns 592361 ns 1.00
layernorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/2 thread(s) 15479 ns 14125 ns 1.10
layernorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/4 thread(s) 15375 ns 15291.5 ns 1.01
layernorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/8 thread(s) 17458 ns 16792 ns 1.04
layernorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/1 thread(s) 15542 ns 16000 ns 0.97
layernorm(2, act=gelu, affine=false)(32 x 32)/forward/GPU/CUDA 137835 ns 143262 ns 0.96
layernorm(2, act=gelu, affine=false)(32 x 32)/forward/GPU/oneAPI 5606741 ns 5352196.5 ns 1.05
layernorm(2, act=gelu, affine=false)(32 x 32)/forward/GPU/Metal 778728.5 ns 756916.5 ns 1.03
layernorm(2, act=gelu, affine=false)(32 x 32)/forward/GPU/AMDGPU 231852 ns 233592 ns 0.99
layernorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/2 thread(s) 23417 ns 23895.5 ns 0.98
layernorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/4 thread(s) 23791 ns 24041.5 ns 0.99
layernorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/8 thread(s) 24000 ns 23542 ns 1.02
layernorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/1 thread(s) 23937 ns 23667 ns 1.01
layernorm(2, act=gelu, affine=false)(32 x 32)/zygote/GPU/CUDA 858271 ns 888831 ns 0.97
layernorm(2, act=gelu, affine=false)(32 x 32)/zygote/GPU/oneAPI 38888220 ns 38279760.5 ns 1.02
layernorm(2, act=gelu, affine=false)(32 x 32)/zygote/GPU/Metal 5635500 ns 5301166.5 ns 1.06
layernorm(2, act=gelu, affine=false)(32 x 32)/zygote/GPU/AMDGPU 677086 ns 679602 ns 1.00
groupnorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/2 thread(s) 26604 ns 8875 ns 3.00
groupnorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/4 thread(s) 28250 ns 9250 ns 3.05
groupnorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/8 thread(s) 31333 ns 11313 ns 2.77
groupnorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/1 thread(s) 26812.5 ns 9834 ns 2.73
groupnorm(2, act=identity, affine=true)(32 x 32)/forward/GPU/CUDA 137010 ns 126441 ns 1.08
groupnorm(2, act=identity, affine=true)(32 x 32)/forward/GPU/oneAPI 3519222 ns 3425975 ns 1.03
groupnorm(2, act=identity, affine=true)(32 x 32)/forward/GPU/Metal 925417 ns 886021 ns 1.04
groupnorm(2, act=identity, affine=true)(32 x 32)/forward/GPU/AMDGPU 82411 ns 73581 ns 1.12
groupnorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/2 thread(s) 14792 ns 14000 ns 1.06
groupnorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/4 thread(s) 15708 ns 14166.5 ns 1.11
groupnorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/8 thread(s) 16000 ns 14541 ns 1.10
groupnorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/1 thread(s) 15645.5 ns 13875 ns 1.13
groupnorm(2, act=identity, affine=true)(32 x 32)/zygote/GPU/CUDA 668142 ns 686454 ns 0.97
groupnorm(2, act=identity, affine=true)(32 x 32)/zygote/GPU/oneAPI 20824375 ns 21159530.5 ns 0.98
groupnorm(2, act=identity, affine=true)(32 x 32)/zygote/GPU/Metal 5325770.5 ns 5057854 ns 1.05
groupnorm(2, act=identity, affine=true)(32 x 32)/zygote/GPU/AMDGPU 366524 ns 368623 ns 0.99
groupnorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/2 thread(s) 9312.5 ns 6833 ns 1.36
groupnorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/4 thread(s) 9416 ns 9645.5 ns 0.98
groupnorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/8 thread(s) 10583 ns 10959 ns 0.97
groupnorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/1 thread(s) 9542 ns 9125 ns 1.05
groupnorm(2, act=identity, affine=false)(32 x 32)/forward/GPU/CUDA 121280 ns 125289 ns 0.97
groupnorm(2, act=identity, affine=false)(32 x 32)/forward/GPU/oneAPI 3386034.5 ns 3340336.5 ns 1.01
groupnorm(2, act=identity, affine=false)(32 x 32)/forward/GPU/Metal 932375 ns 858667 ns 1.09
groupnorm(2, act=identity, affine=false)(32 x 32)/forward/GPU/AMDGPU 72561 ns 73441 ns 0.99
groupnorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/2 thread(s) 12354 ns 12750 ns 0.97
groupnorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/4 thread(s) 13000 ns 12875 ns 1.01
groupnorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/8 thread(s) 13042 ns 12959 ns 1.01
groupnorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/1 thread(s) 12458.5 ns 12584 ns 0.99
groupnorm(2, act=identity, affine=false)(32 x 32)/zygote/GPU/CUDA 545614 ns 568824 ns 0.96
groupnorm(2, act=identity, affine=false)(32 x 32)/zygote/GPU/oneAPI 19231212.5 ns 20335817 ns 0.95
groupnorm(2, act=identity, affine=false)(32 x 32)/zygote/GPU/Metal 4752396 ns 4008167 ns 1.19
groupnorm(2, act=identity, affine=false)(32 x 32)/zygote/GPU/AMDGPU 340553.5 ns 341833 ns 1.00
batchedmm(2, Bsize=128)/forward/CPU/2 thread(s) 26958 ns 26604 ns 1.01
batchedmm(2, Bsize=128)/forward/CPU/4 thread(s) 34792 ns 35042 ns 0.99
batchedmm(2, Bsize=128)/forward/CPU/8 thread(s) 32041.5 ns 31437.5 ns 1.02
batchedmm(2, Bsize=128)/forward/CPU/1 thread(s) 1958.5 ns 1958 ns 1.00
batchedmm(2, Bsize=128)/forward/GPU/CUDA 16169 ns 16488 ns 0.98
batchedmm(2, Bsize=128)/forward/GPU/AMDGPU 80481 ns 80881 ns 1.00
batchedmm(2, Bsize=128)/zygote/CPU/2 thread(s) 6042 ns 5354 ns 1.13
batchedmm(2, Bsize=128)/zygote/CPU/4 thread(s) 6208 ns 5271 ns 1.18
batchedmm(2, Bsize=128)/zygote/CPU/8 thread(s) 6520.5 ns 5375 ns 1.21
batchedmm(2, Bsize=128)/zygote/CPU/1 thread(s) 6834 ns 6417 ns 1.06
batchedmm(2, Bsize=128)/zygote/GPU/CUDA 141884.5 ns 144829.5 ns 0.98
batchedmm(2, Bsize=128)/zygote/GPU/AMDGPU 371004 ns 371354 ns 1.00
batchnorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/2 thread(s) 6458 ns 250 ns 25.83
batchnorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/4 thread(s) 6834 ns 417 ns 16.39
batchnorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/8 thread(s) 6875 ns 375 ns 18.33
batchnorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/1 thread(s) 6375 ns 334 ns 19.09
batchnorm(2, act=identity, affine=true)(4 x 32)/forward/GPU/CUDA 34623 ns 26201 ns 1.32
batchnorm(2, act=identity, affine=true)(4 x 32)/forward/GPU/oneAPI 1293519 ns 1213684 ns 1.07
batchnorm(2, act=identity, affine=true)(4 x 32)/forward/GPU/Metal 457312.5 ns 435084 ns 1.05
batchnorm(2, act=identity, affine=true)(4 x 32)/forward/GPU/AMDGPU 56171 ns 47131 ns 1.19
batchnorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/2 thread(s) 12916 ns 6417 ns 2.01
batchnorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/4 thread(s) 13791 ns 6666 ns 2.07
batchnorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/8 thread(s) 14084 ns 6708 ns 2.10
batchnorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/1 thread(s) 13042 ns 6541 ns 1.99
batchnorm(2, act=identity, affine=true)(4 x 32)/zygote/GPU/CUDA 198569 ns 192082.5 ns 1.03
batchnorm(2, act=identity, affine=true)(4 x 32)/zygote/GPU/oneAPI 24424265 ns 23595307 ns 1.04
batchnorm(2, act=identity, affine=true)(4 x 32)/zygote/GPU/Metal 5453125 ns 4957208 ns 1.10
batchnorm(2, act=identity, affine=true)(4 x 32)/zygote/GPU/AMDGPU 396759 ns 388663.5 ns 1.02
batchnorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/2 thread(s) 8292 ns 1917 ns 4.33
batchnorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/4 thread(s) 8625 ns 2000 ns 4.31
batchnorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/8 thread(s) 8833 ns 2042 ns 4.33
batchnorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/1 thread(s) 8333 ns 1959 ns 4.25
batchnorm(2, act=gelu, affine=true)(32 x 32)/forward/GPU/CUDA 35748 ns 26999 ns 1.32
batchnorm(2, act=gelu, affine=true)(32 x 32)/forward/GPU/oneAPI 1189931 ns 1208214.5 ns 0.98
batchnorm(2, act=gelu, affine=true)(32 x 32)/forward/GPU/Metal 324084 ns 281958 ns 1.15
batchnorm(2, act=gelu, affine=true)(32 x 32)/forward/GPU/AMDGPU 215022 ns 206222 ns 1.04
batchnorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/2 thread(s) 22770.5 ns 16312.5 ns 1.40
batchnorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/4 thread(s) 23812.5 ns 17020.5 ns 1.40
batchnorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/8 thread(s) 24291.5 ns 16562.5 ns 1.47
batchnorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/1 thread(s) 22458 ns 16437.5 ns 1.37
batchnorm(2, act=gelu, affine=true)(32 x 32)/zygote/GPU/CUDA 284982 ns 281291 ns 1.01
batchnorm(2, act=gelu, affine=true)(32 x 32)/zygote/GPU/oneAPI 26514332 ns 25314200 ns 1.05
batchnorm(2, act=gelu, affine=true)(32 x 32)/zygote/GPU/Metal 5718333 ns 5387167 ns 1.06
batchnorm(2, act=gelu, affine=true)(32 x 32)/zygote/GPU/AMDGPU 709637 ns 705642 ns 1.01
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 149125 ns 148250 ns 1.01
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 155917 ns 175104 ns 0.89
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 152500 ns 154500 ns 0.99
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 148250 ns 148375 ns 1.00
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/CUDA 200827 ns 210020 ns 0.96
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/oneAPI 7732253 ns 7920169 ns 0.98
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/Metal 1424250.5 ns 1553375 ns 0.92
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/AMDGPU 214342 ns 236022 ns 0.91
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 1322854.5 ns 1326125 ns 1.00
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 1324334 ns 1317625 ns 1.01
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 1306187.5 ns 1267583 ns 1.03
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 1319750 ns 1330208 ns 0.99
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 894838 ns 941055 ns 0.95
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/oneAPI 47138720.5 ns 46042204 ns 1.02
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/Metal 6451042 ns 9797270.5 ns 0.66
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/AMDGPU 1104625 ns 1107606 ns 1.00
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 25541.5 ns 23542 ns 1.08
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 25166 ns 25167 ns 1.00
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 27666 ns 28437.5 ns 0.97
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 24084 ns 24917 ns 0.97
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/CUDA 236708 ns 241297.5 ns 0.98
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/oneAPI 7576228.5 ns 7644187.5 ns 0.99
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/Metal 1207792 ns 558625 ns 2.16
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/AMDGPU 114481 ns 114946.5 ns 1.00
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 117291.5 ns 174646 ns 0.67
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 119125.5 ns 167916 ns 0.71
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 119021 ns 119708.5 ns 0.99
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 129000 ns 126750 ns 1.02
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 1066520 ns 1108737 ns 0.96
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/oneAPI 44035334 ns 45003191 ns 0.98
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/Metal 6154750 ns 5870834 ns 1.05
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/AMDGPU 614935 ns 610886 ns 1.01
batchnorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/2 thread(s) 6417 ns 250 ns 25.67
batchnorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/4 thread(s) 6750 ns 417 ns 16.19
batchnorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/8 thread(s) 6875 ns 375 ns 18.33
batchnorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/1 thread(s) 6458 ns 250 ns 25.83
batchnorm(2, act=relu, affine=true)(4 x 32)/forward/GPU/CUDA 32046 ns 23373.5 ns 1.37
batchnorm(2, act=relu, affine=true)(4 x 32)/forward/GPU/oneAPI 1188985 ns 1207385.5 ns 0.98
batchnorm(2, act=relu, affine=true)(4 x 32)/forward/GPU/Metal 304791.5 ns 274541 ns 1.11
batchnorm(2, act=relu, affine=true)(4 x 32)/forward/GPU/AMDGPU 56421 ns 47321 ns 1.19
batchnorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/2 thread(s) 12958 ns 6458 ns 2.01
batchnorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/4 thread(s) 13958 ns 6708 ns 2.08
batchnorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/8 thread(s) 14104 ns 6625 ns 2.13
batchnorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/1 thread(s) 12979.5 ns 6521 ns 1.99
batchnorm(2, act=relu, affine=true)(4 x 32)/zygote/GPU/CUDA 219681.5 ns 207930.5 ns 1.06
batchnorm(2, act=relu, affine=true)(4 x 32)/zygote/GPU/oneAPI 24127200 ns 24020738 ns 1.00
batchnorm(2, act=relu, affine=true)(4 x 32)/zygote/GPU/Metal 5367125 ns 5321979 ns 1.01
batchnorm(2, act=relu, affine=true)(4 x 32)/zygote/GPU/AMDGPU 404804 ns 394454 ns 1.03
layernorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/2 thread(s) 6042 ns 5125 ns 1.18
layernorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/4 thread(s) 6958 ns 6000 ns 1.16
layernorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/8 thread(s) 8000 ns 7375 ns 1.08
layernorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/1 thread(s) 5812.5 ns 5500 ns 1.06
layernorm(2, act=gelu, affine=true)(4 x 32)/forward/GPU/CUDA 143745 ns 148415.5 ns 0.97
layernorm(2, act=gelu, affine=true)(4 x 32)/forward/GPU/oneAPI 5574883 ns 5743209.5 ns 0.97
layernorm(2, act=gelu, affine=true)(4 x 32)/forward/GPU/Metal 721833 ns 438042 ns 1.65
layernorm(2, act=gelu, affine=true)(4 x 32)/forward/GPU/AMDGPU 232722 ns 233753 ns 1.00
layernorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/2 thread(s) 9875 ns 9708.5 ns 1.02
layernorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/4 thread(s) 10417 ns 10500 ns 0.99
layernorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/8 thread(s) 10375 ns 10292 ns 1.01
layernorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/1 thread(s) 10083.5 ns 10000 ns 1.01
layernorm(2, act=gelu, affine=true)(4 x 32)/zygote/GPU/CUDA 893962 ns 921993 ns 0.97
layernorm(2, act=gelu, affine=true)(4 x 32)/zygote/GPU/oneAPI 40305685.5 ns 40800221 ns 0.99
layernorm(2, act=gelu, affine=true)(4 x 32)/zygote/GPU/Metal 6022625 ns 5516833 ns 1.09
layernorm(2, act=gelu, affine=true)(4 x 32)/zygote/GPU/AMDGPU 667866 ns 673881.5 ns 0.99
dense(2, bias=false, act=gelu)(2 x 128)/forward/CPU/2 thread(s) 666 ns 625 ns 1.07
dense(2, bias=false, act=gelu)(2 x 128)/forward/CPU/4 thread(s) 666 ns 625 ns 1.07
dense(2, bias=false, act=gelu)(2 x 128)/forward/CPU/8 thread(s) 666 ns 666 ns 1
dense(2, bias=false, act=gelu)(2 x 128)/forward/CPU/1 thread(s) 667 ns 667 ns 1
dense(2, bias=false, act=gelu)(2 x 128)/forward/GPU/CUDA 22221.5 ns 22961 ns 0.97
dense(2, bias=false, act=gelu)(2 x 128)/forward/GPU/oneAPI 2080787.5 ns 2040345 ns 1.02
dense(2, bias=false, act=gelu)(2 x 128)/forward/GPU/Metal 253958 ns 205708 ns 1.23
dense(2, bias=false, act=gelu)(2 x 128)/forward/GPU/AMDGPU 206192 ns 207722.5 ns 0.99
dense(2, bias=false, act=gelu)(2 x 128)/zygote/CPU/2 thread(s) 7958 ns 4625 ns 1.72
dense(2, bias=false, act=gelu)(2 x 128)/zygote/CPU/4 thread(s) 8833 ns 4958 ns 1.78
dense(2, bias=false, act=gelu)(2 x 128)/zygote/CPU/8 thread(s) 8875 ns 4792 ns 1.85
dense(2, bias=false, act=gelu)(2 x 128)/zygote/CPU/1 thread(s) 8041 ns 4625 ns 1.74
dense(2, bias=false, act=gelu)(2 x 128)/zygote/GPU/CUDA 238671.5 ns 232829.5 ns 1.03
dense(2, bias=false, act=gelu)(2 x 128)/zygote/GPU/oneAPI 10055023.5 ns 11262701.5 ns 0.89
dense(2, bias=false, act=gelu)(2 x 128)/zygote/GPU/Metal 1611250 ns 1643083.5 ns 0.98
dense(2, bias=false, act=gelu)(2 x 128)/zygote/GPU/AMDGPU 575715 ns 580356 ns 0.99
groupnorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/2 thread(s) 24208 ns 8166 ns 2.96
groupnorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/4 thread(s) 26562.5 ns 8250 ns 3.22
groupnorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/8 thread(s) 29458 ns 9458 ns 3.11
groupnorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/1 thread(s) 25313 ns 8979.5 ns 2.82
groupnorm(2, act=identity, affine=true)(4 x 32)/forward/GPU/CUDA 134686.5 ns 124075.5 ns 1.09
groupnorm(2, act=identity, affine=true)(4 x 32)/forward/GPU/oneAPI 3666858.5 ns 3484097 ns 1.05
groupnorm(2, act=identity, affine=true)(4 x 32)/forward/GPU/Metal 819479.5 ns 848979 ns 0.97
groupnorm(2, act=identity, affine=true)(4 x 32)/forward/GPU/AMDGPU 82871 ns 73621 ns 1.13
groupnorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/2 thread(s) 9833 ns 8396 ns 1.17
groupnorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/4 thread(s) 10750 ns 8584 ns 1.25
groupnorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/8 thread(s) 10584 ns 9084 ns 1.17
groupnorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/1 thread(s) 9959 ns 8334 ns 1.19
groupnorm(2, act=identity, affine=true)(4 x 32)/zygote/GPU/CUDA 592874 ns 601403 ns 0.99
groupnorm(2, act=identity, affine=true)(4 x 32)/zygote/GPU/oneAPI 20598079.5 ns 21381887.5 ns 0.96
groupnorm(2, act=identity, affine=true)(4 x 32)/zygote/GPU/Metal 4586229.5 ns 4049604 ns 1.13
groupnorm(2, act=identity, affine=true)(4 x 32)/zygote/GPU/AMDGPU 342583 ns 345603 ns 0.99
batchedmm(128, Bsize=4)/forward/CPU/2 thread(s) 125959 ns 123354 ns 1.02
batchedmm(128, Bsize=4)/forward/CPU/4 thread(s) 129958 ns 130833 ns 0.99
batchedmm(128, Bsize=4)/forward/CPU/8 thread(s) 130021 ns 130292 ns 1.00
batchedmm(128, Bsize=4)/forward/CPU/1 thread(s) 181187.5 ns 183083 ns 0.99
batchedmm(128, Bsize=4)/forward/GPU/CUDA 45830 ns 46276 ns 0.99
batchedmm(128, Bsize=4)/forward/GPU/AMDGPU 105671 ns 100861 ns 1.05
batchedmm(128, Bsize=4)/zygote/CPU/2 thread(s) 325125 ns 331291 ns 0.98
batchedmm(128, Bsize=4)/zygote/CPU/4 thread(s) 323667 ns 336312.5 ns 0.96
batchedmm(128, Bsize=4)/zygote/CPU/8 thread(s) 316417 ns 332416.5 ns 0.95
batchedmm(128, Bsize=4)/zygote/CPU/1 thread(s) 616792 ns 584792 ns 1.05
batchedmm(128, Bsize=4)/zygote/GPU/CUDA 194713 ns 195249 ns 1.00
batchedmm(128, Bsize=4)/zygote/GPU/AMDGPU 508449.5 ns 504285 ns 1.01
dense(512, bias=true, act=identity)(512 x 128)/forward/CPU/2 thread(s) 400583 ns 396500 ns 1.01
dense(512, bias=true, act=identity)(512 x 128)/forward/CPU/4 thread(s) 290666 ns 287958 ns 1.01
dense(512, bias=true, act=identity)(512 x 128)/forward/CPU/8 thread(s) 291292 ns 288167 ns 1.01
dense(512, bias=true, act=identity)(512 x 128)/forward/CPU/1 thread(s) 759541 ns 756292 ns 1.00
dense(512, bias=true, act=identity)(512 x 128)/forward/GPU/CUDA 51490 ns 43813 ns 1.18
dense(512, bias=true, act=identity)(512 x 128)/forward/GPU/oneAPI 1452694 ns 1397680 ns 1.04
dense(512, bias=true, act=identity)(512 x 128)/forward/GPU/Metal 458875 ns 359646 ns 1.28
dense(512, bias=true, act=identity)(512 x 128)/forward/GPU/AMDGPU 84931 ns 81271 ns 1.05
dense(512, bias=true, act=identity)(512 x 128)/zygote/CPU/2 thread(s) 1458459 ns 1447584 ns 1.01
dense(512, bias=true, act=identity)(512 x 128)/zygote/CPU/4 thread(s) 1140687.5 ns 1133917 ns 1.01
dense(512, bias=true, act=identity)(512 x 128)/zygote/CPU/8 thread(s) 1149666.5 ns 1135166.5 ns 1.01
dense(512, bias=true, act=identity)(512 x 128)/zygote/CPU/1 thread(s) 2451791 ns 2356062 ns 1.04
dense(512, bias=true, act=identity)(512 x 128)/zygote/GPU/CUDA 274619 ns 251976 ns 1.09
dense(512, bias=true, act=identity)(512 x 128)/zygote/GPU/oneAPI 10187838 ns 10628240 ns 0.96
dense(512, bias=true, act=identity)(512 x 128)/zygote/GPU/Metal 1914208 ns 1770646 ns 1.08
dense(512, bias=true, act=identity)(512 x 128)/zygote/GPU/AMDGPU 358283 ns 350644 ns 1.02
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 633666 ns 641750 ns 0.99
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 663666.5 ns 660333 ns 1.01
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 645687.5 ns 656625 ns 0.98
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 632541 ns 541646 ns 1.17
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/CUDA 200663 ns 206977 ns 0.97
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/oneAPI 8382472 ns 8394592 ns 1.00
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/Metal 1352979.5 ns 1331770.5 ns 1.02
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/AMDGPU 307532.5 ns 313564 ns 0.98
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 2467667 ns 2445250 ns 1.01
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 2454750 ns 2456229 ns 1.00
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 2454500 ns 2446833.5 ns 1.00
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 2451167 ns 2483750 ns 0.99
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 984218.5 ns 1018661.5 ns 0.97
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/oneAPI 50225251.5 ns 53769994.5 ns 0.93
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/Metal 7766292 ns 9019125 ns 0.86
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/AMDGPU 1380642 ns 1436974 ns 0.96
batchedmm(2, Bsize=32)/forward/CPU/2 thread(s) 32292 ns 28875 ns 1.12
batchedmm(2, Bsize=32)/forward/CPU/4 thread(s) 36875 ns 36438 ns 1.01
batchedmm(2, Bsize=32)/forward/CPU/8 thread(s) 34000 ns 34354 ns 0.99
batchedmm(2, Bsize=32)/forward/CPU/1 thread(s) 958.5 ns 833 ns 1.15
batchedmm(2, Bsize=32)/forward/GPU/CUDA 15278.5 ns 15679 ns 0.97
batchedmm(2, Bsize=32)/forward/GPU/AMDGPU 78690.5 ns 79081 ns 1.00
batchedmm(2, Bsize=32)/zygote/CPU/2 thread(s) 3792 ns 3125 ns 1.21
batchedmm(2, Bsize=32)/zygote/CPU/4 thread(s) 4333 ns 3333 ns 1.30
batchedmm(2, Bsize=32)/zygote/CPU/8 thread(s) 4583.5 ns 3542 ns 1.29
batchedmm(2, Bsize=32)/zygote/CPU/1 thread(s) 4124.5 ns 3042 ns 1.36
batchedmm(2, Bsize=32)/zygote/GPU/CUDA 140987 ns 141592 ns 1.00
batchedmm(2, Bsize=32)/zygote/GPU/AMDGPU 336043 ns 340828.5 ns 0.99
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 413209 ns 404000 ns 1.02
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 415792 ns 408458 ns 1.02
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 416395.5 ns 407958 ns 1.02
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 427145.5 ns 420750 ns 1.02
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/CUDA 54475 ns 44015 ns 1.24
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/oneAPI 1394063.5 ns 1346061 ns 1.04
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/Metal 1198125 ns 1099750 ns 1.09
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/AMDGPU 250702 ns 240182 ns 1.04
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 3877833 ns 3854416 ns 1.01
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 3995771 ns 3977416.5 ns 1.00
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 3886792 ns 3995708.5 ns 0.97
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 3754728.5 ns 3786812.5 ns 0.99
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 255856 ns 247915 ns 1.03
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/oneAPI 37405566.5 ns 38628061.5 ns 0.97
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/Metal 11943833 ns 11941666 ns 1.00
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/AMDGPU 1432843 ns 1249207.5 ns 1.15
dense(32, bias=false, act=identity)(32 x 128)/forward/CPU/2 thread(s) 3958 ns 4000 ns 0.99
dense(32, bias=false, act=identity)(32 x 128)/forward/CPU/4 thread(s) 3917 ns 3917 ns 1
dense(32, bias=false, act=identity)(32 x 128)/forward/CPU/8 thread(s) 3917 ns 3917 ns 1
dense(32, bias=false, act=identity)(32 x 128)/forward/CPU/1 thread(s) 3917 ns 3917 ns 1
dense(32, bias=false, act=identity)(32 x 128)/forward/GPU/CUDA 33843 ns 34055 ns 0.99
dense(32, bias=false, act=identity)(32 x 128)/forward/GPU/oneAPI 1197554 ns 1242873 ns 0.96
dense(32, bias=false, act=identity)(32 x 128)/forward/GPU/Metal 177584 ns 160875 ns 1.10
dense(32, bias=false, act=identity)(32 x 128)/forward/GPU/AMDGPU 37790.5 ns 38220 ns 0.99
dense(32, bias=false, act=identity)(32 x 128)/zygote/CPU/2 thread(s) 19500 ns 15625 ns 1.25
dense(32, bias=false, act=identity)(32 x 128)/zygote/CPU/4 thread(s) 20083 ns 15958 ns 1.26
dense(32, bias=false, act=identity)(32 x 128)/zygote/CPU/8 thread(s) 20375 ns 15958 ns 1.28
dense(32, bias=false, act=identity)(32 x 128)/zygote/CPU/1 thread(s) 19708 ns 15625 ns 1.26
dense(32, bias=false, act=identity)(32 x 128)/zygote/GPU/CUDA 265715 ns 257530 ns 1.03
dense(32, bias=false, act=identity)(32 x 128)/zygote/GPU/oneAPI 10200910.5 ns 8798187 ns 1.16
dense(32, bias=false, act=identity)(32 x 128)/zygote/GPU/Metal 870334 ns 839395.5 ns 1.04
dense(32, bias=false, act=identity)(32 x 128)/zygote/GPU/AMDGPU 178112 ns 167922 ns 1.06
dense(512, bias=false, act=relu)(512 x 128)/forward/CPU/2 thread(s) 404500 ns 403667 ns 1.00
dense(512, bias=false, act=relu)(512 x 128)/forward/CPU/4 thread(s) 296167 ns 295750 ns 1.00
dense(512, bias=false, act=relu)(512 x 128)/forward/CPU/8 thread(s) 295667 ns 295750 ns 1.00
dense(512, bias=false, act=relu)(512 x 128)/forward/CPU/1 thread(s) 760375 ns 760166 ns 1.00
dense(512, bias=false, act=relu)(512 x 128)/forward/GPU/CUDA 112966 ns 113514 ns 1.00
dense(512, bias=false, act=relu)(512 x 128)/forward/GPU/oneAPI 1017004.5 ns 1017055 ns 1.00
dense(512, bias=false, act=relu)(512 x 128)/forward/GPU/Metal 439666 ns 326291.5 ns 1.35
dense(512, bias=false, act=relu)(512 x 128)/forward/GPU/AMDGPU 87800 ns 87391 ns 1.00
dense(512, bias=false, act=relu)(512 x 128)/zygote/CPU/2 thread(s) 1477375 ns 1472208 ns 1.00
dense(512, bias=false, act=relu)(512 x 128)/zygote/CPU/4 thread(s) 1147125 ns 1161500 ns 0.99
dense(512, bias=false, act=relu)(512 x 128)/zygote/CPU/8 thread(s) 1158208 ns 1160625 ns 1.00
dense(512, bias=false, act=relu)(512 x 128)/zygote/CPU/1 thread(s) 2470770.5 ns 2378291 ns 1.04
dense(512, bias=false, act=relu)(512 x 128)/zygote/GPU/CUDA 253070 ns 245391 ns 1.03
dense(512, bias=false, act=relu)(512 x 128)/zygote/GPU/oneAPI 8892806 ns 10232371 ns 0.87
dense(512, bias=false, act=relu)(512 x 128)/zygote/GPU/Metal 1857708 ns 1858625 ns 1.00
dense(512, bias=false, act=relu)(512 x 128)/zygote/GPU/AMDGPU 354333 ns 356813 ns 0.99
batchnorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/2 thread(s) 6708 ns 500 ns 13.42
batchnorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/4 thread(s) 7125 ns 583 ns 12.22
batchnorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/8 thread(s) 7125 ns 583 ns 12.22
batchnorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/1 thread(s) 6666 ns 500 ns 13.33
batchnorm(2, act=gelu, affine=true)(4 x 32)/forward/GPU/CUDA 34849 ns 26329.5 ns 1.32
batchnorm(2, act=gelu, affine=true)(4 x 32)/forward/GPU/oneAPI 1195460 ns 1165109.5 ns 1.03
batchnorm(2, act=gelu, affine=true)(4 x 32)/forward/GPU/Metal 444292 ns 458750 ns 0.97
batchnorm(2, act=gelu, affine=true)(4 x 32)/forward/GPU/AMDGPU 216212 ns 207592 ns 1.04
batchnorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/2 thread(s) 14000 ns 7458 ns 1.88
batchnorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/4 thread(s) 15125 ns 7958 ns 1.90
batchnorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/8 thread(s) 15584 ns 7833 ns 1.99
batchnorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/1 thread(s) 14250 ns 7500 ns 1.90
batchnorm(2, act=gelu, affine=true)(4 x 32)/zygote/GPU/CUDA 223556.5 ns 220362.5 ns 1.01
batchnorm(2, act=gelu, affine=true)(4 x 32)/zygote/GPU/oneAPI 27158587 ns 24956286.5 ns 1.09
batchnorm(2, act=gelu, affine=true)(4 x 32)/zygote/GPU/Metal 5345375.5 ns 4949916.5 ns 1.08
batchnorm(2, act=gelu, affine=true)(4 x 32)/zygote/GPU/AMDGPU 696921.5 ns 695677 ns 1.00
batchedmm(128, Bsize=32)/forward/CPU/2 thread(s) 832708 ns 824979 ns 1.01
batchedmm(128, Bsize=32)/forward/CPU/4 thread(s) 618166 ns 619166 ns 1.00
batchedmm(128, Bsize=32)/forward/CPU/8 thread(s) 611542 ns 619291 ns 0.99
batchedmm(128, Bsize=32)/forward/CPU/1 thread(s) 1540812.5 ns 1521750 ns 1.01
batchedmm(128, Bsize=32)/forward/GPU/CUDA 130337.5 ns 130530.5 ns 1.00
batchedmm(128, Bsize=32)/forward/GPU/AMDGPU 224742 ns 228943 ns 0.98
batchedmm(128, Bsize=32)/zygote/CPU/2 thread(s) 2662417 ns 2673291.5 ns 1.00
batchedmm(128, Bsize=32)/zygote/CPU/4 thread(s) 2007708 ns 2003917 ns 1.00
batchedmm(128, Bsize=32)/zygote/CPU/8 thread(s) 2003084 ns 2004458 ns 1.00
batchedmm(128, Bsize=32)/zygote/CPU/1 thread(s) 4932771 ns 4938271 ns 1.00
batchedmm(128, Bsize=32)/zygote/GPU/CUDA 261909.5 ns 246670.5 ns 1.06
batchedmm(128, Bsize=32)/zygote/GPU/AMDGPU 835813 ns 761778 ns 1.10
batchnorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/2 thread(s) 1375 ns 291 ns 4.73
batchnorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/4 thread(s) 1542 ns 375 ns 4.11
batchnorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/8 thread(s) 1583 ns 333 ns 4.75
batchnorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/1 thread(s) 1375 ns 250 ns 5.50
batchnorm(2, act=relu, affine=false)(4 x 32)/forward/GPU/CUDA 36888 ns 32758 ns 1.13
batchnorm(2, act=relu, affine=false)(4 x 32)/forward/GPU/oneAPI 1237057 ns 1196400 ns 1.03
batchnorm(2, act=relu, affine=false)(4 x 32)/forward/GPU/Metal 366667 ns 263500 ns 1.39
batchnorm(2, act=relu, affine=false)(4 x 32)/forward/GPU/AMDGPU 49661 ns 46921 ns 1.06
batchnorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/2 thread(s) 7687.5 ns 6542 ns 1.18
batchnorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/4 thread(s) 8542 ns 6833 ns 1.25
batchnorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/8 thread(s) 8291 ns 6667 ns 1.24
batchnorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/1 thread(s) 7958 ns 6333 ns 1.26
batchnorm(2, act=relu, affine=false)(4 x 32)/zygote/GPU/CUDA 219381 ns 229162.5 ns 0.96
batchnorm(2, act=relu, affine=false)(4 x 32)/zygote/GPU/oneAPI 20912774 ns 21326390 ns 0.98
batchnorm(2, act=relu, affine=false)(4 x 32)/zygote/GPU/Metal 4917833 ns 4918333 ns 1.00
batchnorm(2, act=relu, affine=false)(4 x 32)/zygote/GPU/AMDGPU 375813 ns 360398.5 ns 1.04
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 2401916.5 ns 2389042 ns 1.01
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 2401583 ns 2375416 ns 1.01
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 2379416 ns 2399208 ns 0.99
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 2371833 ns 2395167 ns 0.99
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/CUDA 198341.5 ns 205752 ns 0.96
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/oneAPI 8113331.5 ns 7986200 ns 1.02
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/Metal 2274958 ns 1428354 ns 1.59
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/AMDGPU 374084 ns 375378.5 ns 1.00
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 4636458 ns 4650833 ns 1.00
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 4653166.5 ns 4663624.5 ns 1.00
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 4641125 ns 4666416.5 ns 0.99
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 4652750 ns 4657125 ns 1.00
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 889968 ns 922860 ns 0.96
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/oneAPI 46625557 ns 50907571 ns 0.92
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/Metal 6404438 ns 6979416.5 ns 0.92
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/AMDGPU 1356447.5 ns 1386483.5 ns 0.98
bias_activation(512, act=relu)(512 x 128)/forward/CPU/2 thread(s) 17208.5 ns 13458.5 ns 1.28
bias_activation(512, act=relu)(512 x 128)/forward/CPU/4 thread(s) 14583 ns 7333 ns 1.99
bias_activation(512, act=relu)(512 x 128)/forward/CPU/8 thread(s) 16313 ns 7708 ns 2.12
bias_activation(512, act=relu)(512 x 128)/forward/CPU/1 thread(s) 21229 ns 6416.5 ns 3.31
bias_activation(512, act=relu)(512 x 128)/forward/GPU/CUDA 25470 ns 23918 ns 1.06
bias_activation(512, act=relu)(512 x 128)/forward/GPU/oneAPI 1159611 ns 1244282 ns 0.93
bias_activation(512, act=relu)(512 x 128)/forward/GPU/Metal 267750 ns 235958 ns 1.13
bias_activation(512, act=relu)(512 x 128)/forward/GPU/AMDGPU 42811 ns 40260 ns 1.06
bias_activation(512, act=relu)(512 x 128)/zygote/CPU/2 thread(s) 45146 ns 46271 ns 0.98
bias_activation(512, act=relu)(512 x 128)/zygote/CPU/4 thread(s) 49833 ns 63375 ns 0.79
bias_activation(512, act=relu)(512 x 128)/zygote/CPU/8 thread(s) 34417 ns 52500 ns 0.66
bias_activation(512, act=relu)(512 x 128)/zygote/CPU/1 thread(s) 73000.5 ns 33708.5 ns 2.17
bias_activation(512, act=relu)(512 x 128)/zygote/GPU/CUDA 218060 ns 220952 ns 0.99
bias_activation(512, act=relu)(512 x 128)/zygote/GPU/oneAPI 10466633 ns 10877336.5 ns 0.96
bias_activation(512, act=relu)(512 x 128)/zygote/GPU/Metal 2129250 ns 1059416 ns 2.01
bias_activation(512, act=relu)(512 x 128)/zygote/GPU/AMDGPU 268402 ns 264808 ns 1.01
batchedmm(2, Bsize=512)/forward/CPU/2 thread(s) 20459 ns 20208.5 ns 1.01
batchedmm(2, Bsize=512)/forward/CPU/4 thread(s) 26208 ns 25708 ns 1.02
batchedmm(2, Bsize=512)/forward/CPU/8 thread(s) 25292 ns 24770.5 ns 1.02
batchedmm(2, Bsize=512)/forward/CPU/1 thread(s) 5333.5 ns 5291 ns 1.01
batchedmm(2, Bsize=512)/forward/GPU/CUDA 16594 ns 17145 ns 0.97
batchedmm(2, Bsize=512)/forward/GPU/AMDGPU 83491 ns 83681 ns 1.00
batchedmm(2, Bsize=512)/zygote/CPU/2 thread(s) 12541 ns 12646 ns 0.99
batchedmm(2, Bsize=512)/zygote/CPU/4 thread(s) 11375 ns 10645.5 ns 1.07
batchedmm(2, Bsize=512)/zygote/CPU/8 thread(s) 11625 ns 10500 ns 1.11
batchedmm(2, Bsize=512)/zygote/CPU/1 thread(s) 19084 ns 18146 ns 1.05
batchedmm(2, Bsize=512)/zygote/GPU/CUDA 227944.5 ns 230722.5 ns 0.99
batchedmm(2, Bsize=512)/zygote/GPU/AMDGPU 370203 ns 371984 ns 1.00
dense(512, bias=true, act=relu)(512 x 128)/forward/CPU/2 thread(s) 409416 ns 405208 ns 1.01
dense(512, bias=true, act=relu)(512 x 128)/forward/CPU/4 thread(s) 299958 ns 297166 ns 1.01
dense(512, bias=true, act=relu)(512 x 128)/forward/CPU/8 thread(s) 300250 ns 297541 ns 1.01
dense(512, bias=true, act=relu)(512 x 128)/forward/CPU/1 thread(s) 765750 ns 762459 ns 1.00
dense(512, bias=true, act=relu)(512 x 128)/forward/GPU/CUDA 53976 ns 46892 ns 1.15
dense(512, bias=true, act=relu)(512 x 128)/forward/GPU/oneAPI 1356509 ns 1423487.5 ns 0.95
dense(512, bias=true, act=relu)(512 x 128)/forward/GPU/Metal 442125 ns 335000 ns 1.32
dense(512, bias=true, act=relu)(512 x 128)/forward/GPU/AMDGPU 94470.5 ns 88571 ns 1.07
dense(512, bias=true, act=relu)(512 x 128)/zygote/CPU/2 thread(s) 1489667 ns 1475875 ns 1.01
dense(512, bias=true, act=relu)(512 x 128)/zygote/CPU/4 thread(s) 1171812 ns 1169208 ns 1.00
dense(512, bias=true, act=relu)(512 x 128)/zygote/CPU/8 thread(s) 1175459 ns 1166834 ns 1.01
dense(512, bias=true, act=relu)(512 x 128)/zygote/CPU/1 thread(s) 2480500 ns 2378771 ns 1.04
dense(512, bias=true, act=relu)(512 x 128)/zygote/GPU/CUDA 311892 ns 287503 ns 1.08
dense(512, bias=true, act=relu)(512 x 128)/zygote/GPU/oneAPI 13648236 ns 12647035 ns 1.08
dense(512, bias=true, act=relu)(512 x 128)/zygote/GPU/Metal 2072208.5 ns 2003291.5 ns 1.03
dense(512, bias=true, act=relu)(512 x 128)/zygote/GPU/AMDGPU 370933 ns 380444 ns 0.98
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 435250 ns 432000 ns 1.01
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 438084 ns 436541 ns 1.00
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 437333 ns 436708 ns 1.00
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 448333 ns 448208 ns 1.00
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/CUDA 61295 ns 54845 ns 1.12
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/oneAPI 1050666 ns 1004553 ns 1.05
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/Metal 1135104 ns 1035833 ns 1.10
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/AMDGPU 237222 ns 234772.5 ns 1.01
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 3895917 ns 3891459 ns 1.00
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 4001312.5 ns 4027292 ns 0.99
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 3913375.5 ns 4026478.5 ns 0.97
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 3807916.5 ns 3684083 ns 1.03
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 261286 ns 268195 ns 0.97
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/oneAPI 29924983 ns 32271096.5 ns 0.93
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/Metal 9972333 ns 10269354.5 ns 0.97
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/AMDGPU 1208741 ns 1382008.5 ns 0.87
dense(32, bias=true, act=gelu)(32 x 128)/forward/CPU/2 thread(s) 11000 ns 8750 ns 1.26
dense(32, bias=true, act=gelu)(32 x 128)/forward/CPU/4 thread(s) 10292 ns 7667 ns 1.34
dense(32, bias=true, act=gelu)(32 x 128)/forward/CPU/8 thread(s) 10334 ns 7667 ns 1.35
dense(32, bias=true, act=gelu)(32 x 128)/forward/CPU/1 thread(s) 14625 ns 12417 ns 1.18
dense(32, bias=true, act=gelu)(32 x 128)/forward/GPU/CUDA 30723 ns 24204 ns 1.27
dense(32, bias=true, act=gelu)(32 x 128)/forward/GPU/oneAPI 2169986 ns 2100905 ns 1.03
dense(32, bias=true, act=gelu)(32 x 128)/forward/GPU/Metal 233208.5 ns 211416 ns 1.10
dense(32, bias=true, act=gelu)(32 x 128)/forward/GPU/AMDGPU 215396.5 ns 209352 ns 1.03
dense(32, bias=true, act=gelu)(32 x 128)/zygote/CPU/2 thread(s) 52791 ns 45042 ns 1.17
dense(32, bias=true, act=gelu)(32 x 128)/zygote/CPU/4 thread(s) 53583 ns 45791 ns 1.17
dense(32, bias=true, act=gelu)(32 x 128)/zygote/CPU/8 thread(s) 54083.5 ns 45208 ns 1.20
dense(32, bias=true, act=gelu)(32 x 128)/zygote/CPU/1 thread(s) 53125 ns 44959 ns 1.18
dense(32, bias=true, act=gelu)(32 x 128)/zygote/GPU/CUDA 366013 ns 348332 ns 1.05
dense(32, bias=true, act=gelu)(32 x 128)/zygote/GPU/oneAPI 13310261 ns 12300844.5 ns 1.08
dense(32, bias=true, act=gelu)(32 x 128)/zygote/GPU/Metal 1891437.5 ns 1700187.5 ns 1.11
dense(32, bias=true, act=gelu)(32 x 128)/zygote/GPU/AMDGPU 643336 ns 655376 ns 0.98
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 94209 ns 121916.5 ns 0.77
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 90833 ns 144917 ns 0.63
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 85958 ns 88625 ns 0.97
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 126167 ns 105229.5 ns 1.20
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/CUDA 190399.5 ns 189408.5 ns 1.01
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/oneAPI 5795468 ns 5999999 ns 0.97
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/Metal 1996458 ns 1936000 ns 1.03
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/AMDGPU 221047 ns 220412 ns 1.00
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 2017500 ns 2017208 ns 1.00
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 2011417 ns 2018750 ns 1.00
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 1801333.5 ns 2014000 ns 0.89
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 1978875 ns 2017500 ns 0.98
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 531205 ns 544732 ns 0.98
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/oneAPI 28343892 ns 27836425 ns 1.02
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/Metal 9357625 ns 9082333.5 ns 1.03
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/AMDGPU 1089565 ns 961460 ns 1.13

This comment was automatically generated by workflow using github-action-benchmark.

Please sign in to comment.