Skip to content
This repository has been archived by the owner on Nov 4, 2024. It is now read-only.

Commit

Permalink
fix: rollback custom gelu implementation
Browse files Browse the repository at this point in the history
  • Loading branch information
avik-pal committed Sep 25, 2024
1 parent 350b7c7 commit 6aad052
Show file tree
Hide file tree
Showing 2 changed files with 1 addition and 39 deletions.
2 changes: 1 addition & 1 deletion Project.toml
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
name = "LuxLib"
uuid = "82251201-b29d-42c6-8e01-566dec8acb11"
authors = ["Avik Pal <[email protected]> and contributors"]
version = "1.3.0"
version = "1.3.1"

[deps]
ArrayInterface = "4fba245c-0d91-5ea0-9b3e-6abc04ee57a9"
Expand Down
38 changes: 0 additions & 38 deletions src/impl/activation.jl
Original file line number Diff line number Diff line change
Expand Up @@ -153,7 +153,6 @@ CRC.@non_differentiable select_fastest_activation(::Any...)
module SLEEFActivations

using ChainRulesCore: ChainRulesCore
using EnzymeCore: EnzymeCore, EnzymeRules
using NNlib: NNlib
using SLEEFPirates: SLEEFPirates

Expand All @@ -164,32 +163,16 @@ const CRC = ChainRulesCore
# Scalar activations. Most forward directly to SLEEFPirates; `logsigmoid`, `swish`
# and `lisht` are composed from the other definitions in this list.
function sigmoid_fast(x::Number)
    return SLEEFPirates.sigmoid_fast(x)
end

function softplus(x::Number)
    return SLEEFPirates.softplus(x)
end

function logsigmoid(x::Number)
    # Expressed through `softplus`: logsigmoid(x) = -softplus(-x).
    return -softplus(-x)
end

function gelu(x::Number)
    return SLEEFPirates.gelu(x)
end

function swish(x::Number)
    gate = sigmoid_fast(x)
    return Base.FastMath.mul_fast(x, gate)
end

function lisht(x::Number)
    squashed = tanh_fast(x)
    return Base.FastMath.mul_fast(x, squashed)
end

function tanh(x::Number)
    return SLEEFPirates.tanh(x)
end

function tanh_fast(x::Number)
    return SLEEFPirates.tanh_fast(x)
end

# Scale factors used by `∇gelu`: λ = √(2 / π) and 2λ = √(8 / π).
const gelu_λ = sqrt(2 / π)
const gelu_2λ = sqrt(8 / π)  # == 2 * gelu_λ

"""
    ∇gelu(x::Number)

Return the derivative with respect to `x` of

    f(x) = x * σ(2λ * x * (1 + α * x^2)),    λ = √(2 / π),  α = 0.044715,

where `σ` is `sigmoid_fast`. Writing `Ω = σ(2λ * x * (1 + α * x^2))` and using
`σ' = σ * (1 - σ)`, the result is

    f'(x) = Ω + x * Ω * (1 - Ω) * 2λ * (1 + 3α * x^2).

All constants are converted to the type of `x` via `oftype`.

NOTE(review): this is presumably the derivative of the tanh-approximated GELU that
`gelu` evaluates; confirm against the `gelu` implementation in use.
"""
function ∇gelu(x::Number)
    α = oftype(x, 0.044715)
    α2 = oftype(x, 0.08943)   # 2α
    λλ = oftype(x, gelu_2λ)   # 2λ
    x2 = Base.FastMath.mul_fast(x, x)
    t = muladd(x2, α, one(x))             # 1 + α * x^2
    Ω = sigmoid_fast(λλ * x * t)
    dσ = conj(Ω * (1 - Ω))                # derivative of the sigmoid at its argument
    # t + 2α * x^2 == 1 + 3α * x^2, so this evaluates Ω + x * dσ * 2λ * (1 + 3α * x^2).
    return muladd(dσ * λλ * muladd(x2, α2, t), x, Ω)
end

for (f, dfdx) in [
#! format: off
(:sigmoid_fast, :(conj(Base.FastMath.mul_fast(Ω, Base.FastMath.sub_fast(1, Ω))))),
(:softplus, :(sigmoid_fast(x))),
(:logsigmoid, :(sigmoid_fast(-x))),
(:gelu, :(∇gelu(x))),
(:swish, :(Base.FastMath.add_fast(Ω, Base.FastMath.mul_fast(sigmoid_fast(x), Base.FastMath.sub_fast(1, Ω))))),
(:lisht, :(Base.FastMath.add_fast(x, Base.FastMath.mul_fast(tanh_fast(x), Base.FastMath.sub_fast(1, Ω))))),
(:tanh, :(conj(Base.FastMath.sub_fast(1, Base.FastMath.mul_fast(Ω, Ω))))),
Expand All @@ -210,26 +193,6 @@ for (f, dfdx) in [
end
end

# Enzyme works for all of these except `gelu`.
# See https://github.com/EnzymeAD/Enzyme.jl/issues/1671
function EnzymeRules.augmented_primal(
        config::EnzymeRules.RevConfigWidth{1}, f::EnzymeCore.Const{typeof(gelu)},
        ::Type{<:EnzymeCore.Active}, input::EnzymeCore.Active{<:Number})
    # Evaluate `gelu` only when the configuration asks for the primal value.
    y = if EnzymeRules.needs_primal(config)
        f.val(input.val)
    else
        nothing
    end
    # The remaining two slots of the augmented return are left empty (`nothing`).
    return EnzymeRules.AugmentedReturn(y, nothing, nothing)
end

function EnzymeRules.reverse(
        ::EnzymeRules.RevConfigWidth{1}, ::EnzymeCore.Const{typeof(gelu)},
        upstream::EnzymeCore.Active, ::Nothing, input::EnzymeCore.Active{<:Number})
    # Chain rule: scale the incoming adjoint by the local derivative of `gelu`.
    grad = upstream.val * ∇gelu(input.val)
    # One-element tuple: a single entry for the only argument, `input`.
    return (grad,)
end

function EnzymeRules.forward(::EnzymeRules.FwdConfig, ::EnzymeCore.Const{typeof(gelu)},
        ::Type{<:EnzymeCore.Duplicated}, input::EnzymeCore.Duplicated{<:Number})
    # Primal output and its tangent (input tangent times the local derivative).
    primal = gelu(input.val)
    tangent = input.dval * ∇gelu(input.val)
    return EnzymeCore.Duplicated(primal, tangent)
end

# Generic fallback: for any element type without a more specific method, return `f`
# unchanged.
function fast_act(f::F, ::Type{T}) where {F, T}
    return f
end

# `Float32` is forwarded to the single-argument `fast_act(f)`.
function fast_act(f::F, ::Type{Float32}) where {F}
    return fast_act(f)
end

Expand All @@ -238,7 +201,6 @@ for (fbase, ffast) in [
(NNlib.sigmoid_fast, sigmoid_fast),
(NNlib.softplus, softplus),
(NNlib.logsigmoid, logsigmoid),
(NNlib.gelu, gelu),
(NNlib.swish, swish),
(NNlib.lisht, lisht),
(Base.tanh, tanh),
Expand Down

3 comments on commit 6aad052

@avik-pal
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@JuliaRegistrator
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Registration pull request created: JuliaRegistries/General/116008

Tip: Release Notes

Did you know you can add release notes too? Just add markdown formatted text underneath the comment after the text
"Release notes:" and it will be added to the registry PR, and if TagBot is installed it will also be added to the
release that TagBot creates. i.e.

@JuliaRegistrator register

Release notes:

## Breaking changes

- blah

To add them here just re-invoke and the PR will be updated.

Tagging

After the above pull request is merged, it is recommended that a tag is created on this repository for the registered package version.

This will be done automatically if the Julia TagBot GitHub Action is installed, or can be done manually through the github interface, or via:

git tag -a v1.3.1 -m "<description of version>" 6aad0523a7e122c4a08d93aaf2c7eabce6524d34
git push origin v1.3.1

@github-actions
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

LuxLib Benchmarks

Benchmark suite Current: 6aad052 Previous: 350b7c7 Ratio
layernorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/2 thread(s) 5291 ns 7625 ns 0.69
layernorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/4 thread(s) 7375 ns 7333 ns 1.01
layernorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/8 thread(s) 7687 ns 7437 ns 1.03
layernorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/1 thread(s) 6958 ns 5500 ns 1.27
layernorm(2, act=gelu, affine=false)(4 x 32)/forward/GPU/CUDA 111876 ns 88183 ns 1.27
layernorm(2, act=gelu, affine=false)(4 x 32)/forward/GPU/oneAPI 2746993 ns 2389684 ns 1.15
layernorm(2, act=gelu, affine=false)(4 x 32)/forward/GPU/AMDGPU 414534 ns 405334 ns 1.02
layernorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/2 thread(s) 10041.5 ns 9916.5 ns 1.01
layernorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/4 thread(s) 10125 ns 9542 ns 1.06
layernorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/8 thread(s) 10167 ns 9792 ns 1.04
layernorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/1 thread(s) 10000.5 ns 10000 ns 1.00
layernorm(2, act=gelu, affine=false)(4 x 32)/zygote/GPU/CUDA 497187 ns 383362 ns 1.30
layernorm(2, act=gelu, affine=false)(4 x 32)/zygote/GPU/oneAPI 17740695 ns 17679354 ns 1.00
layernorm(2, act=gelu, affine=false)(4 x 32)/zygote/GPU/AMDGPU 664206 ns 677366 ns 0.98
bias_activation(32, act=relu)(32 x 128)/forward/CPU/2 thread(s) 1479.5 ns 2334 ns 0.63
bias_activation(32, act=relu)(32 x 128)/forward/CPU/4 thread(s) 1459 ns 1500 ns 0.97
bias_activation(32, act=relu)(32 x 128)/forward/CPU/8 thread(s) 1875 ns 1688 ns 1.11
bias_activation(32, act=relu)(32 x 128)/forward/CPU/1 thread(s) 1583.5 ns 1729.5 ns 0.92
bias_activation(32, act=relu)(32 x 128)/forward/GPU/CUDA 19698 ns 14281 ns 1.38
bias_activation(32, act=relu)(32 x 128)/forward/GPU/oneAPI 1364290 ns 1297688 ns 1.05
bias_activation(32, act=relu)(32 x 128)/forward/GPU/AMDGPU 31630 ns 30200 ns 1.05
bias_activation(32, act=relu)(32 x 128)/zygote/CPU/2 thread(s) 4083 ns 4271 ns 0.96
bias_activation(32, act=relu)(32 x 128)/zygote/CPU/4 thread(s) 4416 ns 4458 ns 0.99
bias_activation(32, act=relu)(32 x 128)/zygote/CPU/8 thread(s) 4125 ns 3750 ns 1.10
bias_activation(32, act=relu)(32 x 128)/zygote/CPU/1 thread(s) 3291 ns 3917 ns 0.84
bias_activation(32, act=relu)(32 x 128)/zygote/GPU/CUDA 130509 ns 106099.5 ns 1.23
bias_activation(32, act=relu)(32 x 128)/zygote/GPU/oneAPI 9003854 ns 9298154.5 ns 0.97
bias_activation(32, act=relu)(32 x 128)/zygote/GPU/AMDGPU 149371 ns 144956.5 ns 1.03
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 57958 ns 57333 ns 1.01
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 46167 ns 46750 ns 0.99
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 46542 ns 46250 ns 1.01
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 82541 ns 83708 ns 0.99
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/CUDA 36502 ns 30588.5 ns 1.19
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/oneAPI 564405 ns 572856.5 ns 0.99
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/AMDGPU 81146 ns 77970 ns 1.04
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 2037625 ns 2018916 ns 1.01
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 2078416 ns 2087937.5 ns 1.00
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 2083625 ns 2087229 ns 1.00
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 2000875 ns 1997063 ns 1.00
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 216924 ns 182309 ns 1.19
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/oneAPI 7524779 ns 7656207 ns 0.98
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/AMDGPU 1725786 ns 1482305 ns 1.16
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 152667 ns 146584 ns 1.04
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 168375 ns 174667 ns 0.96
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 152437.5 ns 149333.5 ns 1.02
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 193708 ns 178791.5 ns 1.08
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/GPU/CUDA 167125 ns 167232 ns 1.00
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/GPU/oneAPI 7312313 ns 9038666 ns 0.81
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/GPU/AMDGPU 213517 ns 197432 ns 1.08
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 1113104.5 ns 1107750.5 ns 1.00
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 1116334 ns 1114208 ns 1.00
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 1115000 ns 1117604.5 ns 1.00
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 1106770.5 ns 1114000.5 ns 0.99
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 628256 ns 537253 ns 1.17
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/oneAPI 32195104 ns 35616369 ns 0.90
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/AMDGPU 1026645 ns 1026475 ns 1.00
layernorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/2 thread(s) 5166 ns 5291 ns 0.98
layernorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/4 thread(s) 4792 ns 4645.5 ns 1.03
layernorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/8 thread(s) 5917 ns 5541 ns 1.07
layernorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/1 thread(s) 4542 ns 4166 ns 1.09
layernorm(2, act=relu, affine=true)(4 x 32)/forward/GPU/CUDA 82840 ns 60281 ns 1.37
layernorm(2, act=relu, affine=true)(4 x 32)/forward/GPU/oneAPI 5343488 ns 5328970.5 ns 1.00
layernorm(2, act=relu, affine=true)(4 x 32)/forward/GPU/AMDGPU 67740 ns 70560 ns 0.96
layernorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/2 thread(s) 8708 ns 8562.5 ns 1.02
layernorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/4 thread(s) 8500 ns 8459 ns 1.00
layernorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/8 thread(s) 8708.5 ns 9166.5 ns 0.95
layernorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/1 thread(s) 8542 ns 8750 ns 0.98
layernorm(2, act=relu, affine=true)(4 x 32)/zygote/GPU/CUDA 548688 ns 414715.5 ns 1.32
layernorm(2, act=relu, affine=true)(4 x 32)/zygote/GPU/oneAPI 33264338 ns 33923657 ns 0.98
layernorm(2, act=relu, affine=true)(4 x 32)/zygote/GPU/AMDGPU 384004 ns 387834 ns 0.99
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 17709 ns 17792 ns 1.00
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 18625 ns 17708 ns 1.05
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 21375 ns 21500 ns 0.99
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 19583.5 ns 17208.5 ns 1.14
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/GPU/CUDA 61770.5 ns 60282.5 ns 1.02
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/GPU/oneAPI 3180292.5 ns 3008486 ns 1.06
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/GPU/AMDGPU 75881 ns 75721 ns 1.00
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 212208 ns 212333 ns 1.00
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 219208.5 ns 212458 ns 1.03
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 214875 ns 213521 ns 1.01
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 219958 ns 222750 ns 0.99
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 324445 ns 291687 ns 1.11
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/oneAPI 13687318.5 ns 14295306 ns 0.96
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/AMDGPU 466224 ns 471954.5 ns 0.99
bias_activation(2, act=relu)(2 x 128)/forward/CPU/2 thread(s) 625 ns 583 ns 1.07
bias_activation(2, act=relu)(2 x 128)/forward/CPU/4 thread(s) 625 ns 583 ns 1.07
bias_activation(2, act=relu)(2 x 128)/forward/CPU/8 thread(s) 958 ns 792 ns 1.21
bias_activation(2, act=relu)(2 x 128)/forward/CPU/1 thread(s) 667 ns 583.5 ns 1.14
bias_activation(2, act=relu)(2 x 128)/forward/GPU/CUDA 18677 ns 13225 ns 1.41
bias_activation(2, act=relu)(2 x 128)/forward/GPU/oneAPI 1223151.5 ns 1210151 ns 1.01
bias_activation(2, act=relu)(2 x 128)/forward/GPU/AMDGPU 31400 ns 30961 ns 1.01
bias_activation(2, act=relu)(2 x 128)/zygote/CPU/2 thread(s) 1416.5 ns 1541 ns 0.92
bias_activation(2, act=relu)(2 x 128)/zygote/CPU/4 thread(s) 1417 ns 1542 ns 0.92
bias_activation(2, act=relu)(2 x 128)/zygote/CPU/8 thread(s) 1583 ns 1542 ns 1.03
bias_activation(2, act=relu)(2 x 128)/zygote/CPU/1 thread(s) 1416 ns 1416.5 ns 1.00
bias_activation(2, act=relu)(2 x 128)/zygote/GPU/CUDA 114301 ns 92964 ns 1.23
bias_activation(2, act=relu)(2 x 128)/zygote/GPU/oneAPI 8986516.5 ns 9171879 ns 0.98
bias_activation(2, act=relu)(2 x 128)/zygote/GPU/AMDGPU 135771 ns 134891 ns 1.01
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 7292 ns 7375 ns 0.99
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 6125 ns 6125 ns 1
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 6125 ns 6167 ns 0.99
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 9958 ns 10125 ns 0.98
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/CUDA 23537.5 ns 18616 ns 1.26
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/oneAPI 1250837 ns 1243379.5 ns 1.01
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/AMDGPU 47255.5 ns 46921 ns 1.01
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 220688 ns 263062 ns 0.84
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 235896 ns 240459 ns 0.98
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 229416 ns 228792 ns 1.00
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 255458.5 ns 237750 ns 1.07
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 180772.5 ns 154023 ns 1.17
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/oneAPI 30816816.5 ns 32407548 ns 0.95
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/AMDGPU 642475 ns 637591 ns 1.01
dense(32, bias=false, act=relu)(32 x 128)/forward/CPU/2 thread(s) 4042 ns 4083 ns 0.99
dense(32, bias=false, act=relu)(32 x 128)/forward/CPU/4 thread(s) 4084 ns 4125 ns 0.99
dense(32, bias=false, act=relu)(32 x 128)/forward/CPU/8 thread(s) 4166 ns 4167 ns 1.00
dense(32, bias=false, act=relu)(32 x 128)/forward/CPU/1 thread(s) 4083 ns 4083 ns 1
dense(32, bias=false, act=relu)(32 x 128)/forward/GPU/CUDA 22833 ns 20561 ns 1.11
dense(32, bias=false, act=relu)(32 x 128)/forward/GPU/oneAPI 2018204 ns 2115667 ns 0.95
dense(32, bias=false, act=relu)(32 x 128)/forward/GPU/AMDGPU 46910 ns 46550 ns 1.01
dense(32, bias=false, act=relu)(32 x 128)/zygote/CPU/2 thread(s) 16541 ns 17125 ns 0.97
dense(32, bias=false, act=relu)(32 x 128)/zygote/CPU/4 thread(s) 16834 ns 16750 ns 1.01
dense(32, bias=false, act=relu)(32 x 128)/zygote/CPU/8 thread(s) 17084 ns 16958 ns 1.01
dense(32, bias=false, act=relu)(32 x 128)/zygote/CPU/1 thread(s) 16833 ns 16416 ns 1.03
dense(32, bias=false, act=relu)(32 x 128)/zygote/GPU/CUDA 182565 ns 174545.5 ns 1.05
dense(32, bias=false, act=relu)(32 x 128)/zygote/GPU/oneAPI 10544759 ns 10156857.5 ns 1.04
dense(32, bias=false, act=relu)(32 x 128)/zygote/GPU/AMDGPU 171221 ns 173982 ns 0.98
dense(512, bias=false, act=gelu)(512 x 128)/forward/CPU/2 thread(s) 493041 ns 509375 ns 0.97
dense(512, bias=false, act=gelu)(512 x 128)/forward/CPU/4 thread(s) 385667 ns 405541 ns 0.95
dense(512, bias=false, act=gelu)(512 x 128)/forward/CPU/8 thread(s) 386125 ns 404292 ns 0.96
dense(512, bias=false, act=gelu)(512 x 128)/forward/CPU/1 thread(s) 847250 ns 864750 ns 0.98
dense(512, bias=false, act=gelu)(512 x 128)/forward/GPU/CUDA 112997 ns 117562 ns 0.96
dense(512, bias=false, act=gelu)(512 x 128)/forward/GPU/oneAPI 408156.5 ns 397557 ns 1.03
dense(512, bias=false, act=gelu)(512 x 128)/forward/GPU/AMDGPU 242212 ns 240702 ns 1.01
dense(512, bias=false, act=gelu)(512 x 128)/zygote/CPU/2 thread(s) 2093437.5 ns 2318458 ns 0.90
dense(512, bias=false, act=gelu)(512 x 128)/zygote/CPU/4 thread(s) 1861958 ns 2034500 ns 0.92
dense(512, bias=false, act=gelu)(512 x 128)/zygote/CPU/8 thread(s) 1876833 ns 2032084 ns 0.92
dense(512, bias=false, act=gelu)(512 x 128)/zygote/CPU/1 thread(s) 3143021 ns 3191167 ns 0.98
dense(512, bias=false, act=gelu)(512 x 128)/zygote/GPU/CUDA 228687 ns 202548 ns 1.13
dense(512, bias=false, act=gelu)(512 x 128)/zygote/GPU/oneAPI 10334254.5 ns 11415659 ns 0.91
dense(512, bias=false, act=gelu)(512 x 128)/zygote/GPU/AMDGPU 743867 ns 739097 ns 1.01
layernorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/2 thread(s) 6167 ns 5979.5 ns 1.03
layernorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/4 thread(s) 6625 ns 6312.5 ns 1.05
layernorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/8 thread(s) 8333.5 ns 8542 ns 0.98
layernorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/1 thread(s) 7375 ns 6542 ns 1.13
layernorm(2, act=relu, affine=false)(32 x 32)/forward/GPU/CUDA 83073.5 ns 84957.5 ns 0.98
layernorm(2, act=relu, affine=false)(32 x 32)/forward/GPU/oneAPI 5613807.5 ns 5409712 ns 1.04
layernorm(2, act=relu, affine=false)(32 x 32)/forward/GPU/AMDGPU 65621 ns 66831 ns 0.98
layernorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/2 thread(s) 11042 ns 11937.5 ns 0.92
layernorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/4 thread(s) 10958.5 ns 11541.5 ns 0.95
layernorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/8 thread(s) 11645.5 ns 11604 ns 1.00
layernorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/1 thread(s) 12812.5 ns 10583 ns 1.21
layernorm(2, act=relu, affine=false)(32 x 32)/zygote/GPU/CUDA 595390 ns 561493 ns 1.06
layernorm(2, act=relu, affine=false)(32 x 32)/zygote/GPU/oneAPI 37940094 ns 37617116 ns 1.01
layernorm(2, act=relu, affine=false)(32 x 32)/zygote/GPU/AMDGPU 408370.5 ns 405534 ns 1.01
dense(2, bias=true, act=relu)(2 x 128)/forward/CPU/2 thread(s) 541 ns 500 ns 1.08
dense(2, bias=true, act=relu)(2 x 128)/forward/CPU/4 thread(s) 500 ns 500 ns 1
dense(2, bias=true, act=relu)(2 x 128)/forward/CPU/8 thread(s) 542 ns 542 ns 1
dense(2, bias=true, act=relu)(2 x 128)/forward/CPU/1 thread(s) 500 ns 500 ns 1
dense(2, bias=true, act=relu)(2 x 128)/forward/GPU/CUDA 23168 ns 20286 ns 1.14
dense(2, bias=true, act=relu)(2 x 128)/forward/GPU/oneAPI 2210796 ns 2161771 ns 1.02
dense(2, bias=true, act=relu)(2 x 128)/forward/GPU/AMDGPU 46950 ns 51190 ns 0.92
dense(2, bias=true, act=relu)(2 x 128)/zygote/CPU/2 thread(s) 2084 ns 2084 ns 1
dense(2, bias=true, act=relu)(2 x 128)/zygote/CPU/4 thread(s) 2125 ns 2084 ns 1.02
dense(2, bias=true, act=relu)(2 x 128)/zygote/CPU/8 thread(s) 2209 ns 2208 ns 1.00
dense(2, bias=true, act=relu)(2 x 128)/zygote/CPU/1 thread(s) 2084 ns 2125 ns 0.98
dense(2, bias=true, act=relu)(2 x 128)/zygote/GPU/CUDA 213006.5 ns 223022 ns 0.96
dense(2, bias=true, act=relu)(2 x 128)/zygote/GPU/oneAPI 11081491 ns 10990252.5 ns 1.01
dense(2, bias=true, act=relu)(2 x 128)/zygote/GPU/AMDGPU 181582.5 ns 182361 ns 1.00
groupnorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/2 thread(s) 8834 ns 8625 ns 1.02
groupnorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/4 thread(s) 8834 ns 9520.5 ns 0.93
groupnorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/8 thread(s) 10021.5 ns 9334 ns 1.07
groupnorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/1 thread(s) 8500 ns 7917 ns 1.07
groupnorm(2, act=relu, affine=true)(32 x 32)/forward/GPU/CUDA 99705.5 ns 108611 ns 0.92
groupnorm(2, act=relu, affine=true)(32 x 32)/forward/GPU/oneAPI 3198646 ns 3137439.5 ns 1.02
groupnorm(2, act=relu, affine=true)(32 x 32)/forward/GPU/AMDGPU 72221 ns 74611 ns 0.97
groupnorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/2 thread(s) 18834 ns 18395.5 ns 1.02
groupnorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/4 thread(s) 17479 ns 16917 ns 1.03
groupnorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/8 thread(s) 18458 ns 18854 ns 0.98
groupnorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/1 thread(s) 18895.5 ns 18396 ns 1.03
groupnorm(2, act=relu, affine=true)(32 x 32)/zygote/GPU/CUDA 566743 ns 518312 ns 1.09
groupnorm(2, act=relu, affine=true)(32 x 32)/zygote/GPU/oneAPI 18116750 ns 16860013 ns 1.07
groupnorm(2, act=relu, affine=true)(32 x 32)/zygote/GPU/AMDGPU 377315 ns 380934 ns 0.99
batchnorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/2 thread(s) 500 ns 458 ns 1.09
batchnorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/4 thread(s) 500 ns 500 ns 1
batchnorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/8 thread(s) 584 ns 708 ns 0.82
batchnorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/1 thread(s) 541 ns 500 ns 1.08
batchnorm(2, act=identity, affine=false)(32 x 32)/forward/GPU/CUDA 33362 ns 27063 ns 1.23
batchnorm(2, act=identity, affine=false)(32 x 32)/forward/GPU/oneAPI 1254721 ns 1178178 ns 1.06
batchnorm(2, act=identity, affine=false)(32 x 32)/forward/GPU/AMDGPU 46210 ns 46160 ns 1.00
batchnorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/2 thread(s) 8916.5 ns 8500 ns 1.05
batchnorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/4 thread(s) 9479.5 ns 9020.5 ns 1.05
batchnorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/8 thread(s) 9667 ns 9208.5 ns 1.05
batchnorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/1 thread(s) 9250 ns 8937.5 ns 1.03
batchnorm(2, act=identity, affine=false)(32 x 32)/zygote/GPU/CUDA 255341 ns 166677 ns 1.53
batchnorm(2, act=identity, affine=false)(32 x 32)/zygote/GPU/oneAPI 18499322 ns 18801518 ns 0.98
batchnorm(2, act=identity, affine=false)(32 x 32)/zygote/GPU/AMDGPU 366854.5 ns 371663 ns 0.99
dense(512, bias=false, act=identity)(512 x 128)/forward/CPU/2 thread(s) 398042 ns 397208 ns 1.00
dense(512, bias=false, act=identity)(512 x 128)/forward/CPU/4 thread(s) 288250 ns 288208.5 ns 1.00
dense(512, bias=false, act=identity)(512 x 128)/forward/CPU/8 thread(s) 288042 ns 288000 ns 1.00
dense(512, bias=false, act=identity)(512 x 128)/forward/CPU/1 thread(s) 755958 ns 756583 ns 1.00
dense(512, bias=false, act=identity)(512 x 128)/forward/GPU/CUDA 112430 ns 110755 ns 1.02
dense(512, bias=false, act=identity)(512 x 128)/forward/GPU/oneAPI 338275 ns 333813 ns 1.01
dense(512, bias=false, act=identity)(512 x 128)/forward/GPU/AMDGPU 74831 ns 75971 ns 0.98
dense(512, bias=false, act=identity)(512 x 128)/zygote/CPU/2 thread(s) 1408834 ns 1448374.5 ns 0.97
dense(512, bias=false, act=identity)(512 x 128)/zygote/CPU/4 thread(s) 1134937.5 ns 1133083 ns 1.00
dense(512, bias=false, act=identity)(512 x 128)/zygote/CPU/8 thread(s) 1133167 ns 1131833 ns 1.00
dense(512, bias=false, act=identity)(512 x 128)/zygote/CPU/1 thread(s) 2438875 ns 2357875 ns 1.03
dense(512, bias=false, act=identity)(512 x 128)/zygote/GPU/CUDA 198896 ns 177520.5 ns 1.12
dense(512, bias=false, act=identity)(512 x 128)/zygote/GPU/oneAPI 10071273 ns 10029153 ns 1.00
dense(512, bias=false, act=identity)(512 x 128)/zygote/GPU/AMDGPU 320874 ns 322173 ns 1.00
layernorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/2 thread(s) 7270.5 ns 7291.5 ns 1.00
layernorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/4 thread(s) 7583 ns 6875 ns 1.10
layernorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/8 thread(s) 8854.5 ns 8666 ns 1.02
layernorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/1 thread(s) 6917 ns 7208 ns 0.96
layernorm(2, act=relu, affine=true)(32 x 32)/forward/GPU/CUDA 136778.5 ns 110478 ns 1.24
layernorm(2, act=relu, affine=true)(32 x 32)/forward/GPU/oneAPI 5388548.5 ns 5505252 ns 0.98
layernorm(2, act=relu, affine=true)(32 x 32)/forward/GPU/AMDGPU 66211 ns 65640 ns 1.01
layernorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/2 thread(s) 14833.5 ns 12145.5 ns 1.22
layernorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/4 thread(s) 15791 ns 14167 ns 1.11
layernorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/8 thread(s) 14229.5 ns 13792 ns 1.03
layernorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/1 thread(s) 14709 ns 14729 ns 1.00
layernorm(2, act=relu, affine=true)(32 x 32)/zygote/GPU/CUDA 897273 ns 664318.5 ns 1.35
layernorm(2, act=relu, affine=true)(32 x 32)/zygote/GPU/oneAPI 42742507.5 ns 42216111.5 ns 1.01
layernorm(2, act=relu, affine=true)(32 x 32)/zygote/GPU/AMDGPU 425150 ns 426745 ns 1.00
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 27562.5 ns 24770.5 ns 1.11
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 25583 ns 28375 ns 0.90
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 30228.5 ns 30459 ns 0.99
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 28854 ns 25729.5 ns 1.12
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/GPU/CUDA 185009 ns 167386 ns 1.11
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/GPU/oneAPI 7764877.5 ns 7615563 ns 1.02
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/GPU/AMDGPU 115321 ns 113401 ns 1.02
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 106417 ns 151292 ns 0.70
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 151645.5 ns 151187.5 ns 1.00
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 153166 ns 153583 ns 1.00
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 150583 ns 143875 ns 1.05
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 996849 ns 857621 ns 1.16
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/oneAPI 42717639 ns 44631154 ns 0.96
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/AMDGPU 587287 ns 587816 ns 1.00
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 74375 ns 79833 ns 0.93
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 75999.5 ns 85583.5 ns 0.89
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 80375 ns 80437 ns 1.00
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 87000 ns 73583 ns 1.18
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/CUDA 189182 ns 168427.5 ns 1.12
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/oneAPI 7755648 ns 7736056 ns 1.00
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/AMDGPU 128191.5 ns 129412 ns 0.99
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 295291 ns 285333 ns 1.03
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 319708 ns 300667 ns 1.06
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 247791.5 ns 300791.5 ns 0.82
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 273792 ns 222625 ns 1.23
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 1010749 ns 971830 ns 1.04
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/oneAPI 41903716 ns 41332252 ns 1.01
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/AMDGPU 697424 ns 696216 ns 1.00
layernorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/2 thread(s) 13500 ns 17000 ns 0.79
layernorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/4 thread(s) 13209 ns 16833 ns 0.78
layernorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/8 thread(s) 14167 ns 17125 ns 0.83
layernorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/1 thread(s) 13125 ns 16542 ns 0.79
layernorm(2, act=gelu, affine=true)(32 x 32)/forward/GPU/CUDA 136045 ns 112981 ns 1.20
layernorm(2, act=gelu, affine=true)(32 x 32)/forward/GPU/oneAPI 5580516 ns 5793916 ns 0.96
layernorm(2, act=gelu, affine=true)(32 x 32)/forward/GPU/AMDGPU 233743 ns 231572 ns 1.01
layernorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/2 thread(s) 26604 ns 28083.5 ns 0.95
layernorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/4 thread(s) 26187.5 ns 26500 ns 0.99
layernorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/8 thread(s) 26625 ns 28083.5 ns 0.95
layernorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/1 thread(s) 28208.5 ns 27187.5 ns 1.04
layernorm(2, act=gelu, affine=true)(32 x 32)/zygote/GPU/CUDA 900814 ns 696173.5 ns 1.29
layernorm(2, act=gelu, affine=true)(32 x 32)/zygote/GPU/oneAPI 41017058 ns 41169551 ns 1.00
layernorm(2, act=gelu, affine=true)(32 x 32)/zygote/GPU/AMDGPU 690428 ns 689617 ns 1.00
groupnorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/2 thread(s) 12000 ns 10292 ns 1.17
groupnorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/4 thread(s) 11896 ns 11333.5 ns 1.05
groupnorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/8 thread(s) 12459 ns 11750 ns 1.06
groupnorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/1 thread(s) 10833 ns 10250 ns 1.06
groupnorm(2, act=gelu, affine=true)(32 x 32)/forward/GPU/CUDA 117378.5 ns 111360 ns 1.05
groupnorm(2, act=gelu, affine=true)(32 x 32)/forward/GPU/oneAPI 3507903 ns 3372766 ns 1.04
groupnorm(2, act=gelu, affine=true)(32 x 32)/forward/GPU/AMDGPU 236503 ns 235923 ns 1.00
groupnorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/2 thread(s) 22417 ns 23687.5 ns 0.95
groupnorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/4 thread(s) 22958 ns 21375 ns 1.07
groupnorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/8 thread(s) 23875 ns 22583 ns 1.06
groupnorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/1 thread(s) 22583 ns 22375 ns 1.01
groupnorm(2, act=gelu, affine=true)(32 x 32)/zygote/GPU/CUDA 660570 ns 554045 ns 1.19
groupnorm(2, act=gelu, affine=true)(32 x 32)/zygote/GPU/oneAPI 20618992 ns 22400526.5 ns 0.92
groupnorm(2, act=gelu, affine=true)(32 x 32)/zygote/GPU/AMDGPU 675828 ns 674936 ns 1.00
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 64666 ns 63875 ns 1.01
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 67083.5 ns 65292 ns 1.03
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 66167 ns 66458 ns 1.00
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 66875 ns 62667 ns 1.07
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/CUDA 100086 ns 96846 ns 1.03
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/oneAPI 3307399.5 ns 3400257 ns 0.97
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/AMDGPU 234107.5 ns 235422 ns 0.99
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 465000 ns 437167 ns 1.06
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 466167 ns 485500 ns 0.96
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 468625 ns 486250 ns 0.96
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 503833 ns 442291 ns 1.14
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 483663 ns 440935 ns 1.10
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/oneAPI 21199224 ns 20393573 ns 1.04
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/AMDGPU 709238 ns 716017 ns 0.99
layernorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/2 thread(s) 7562.5 ns 7208 ns 1.05
layernorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/4 thread(s) 8083 ns 7250 ns 1.11
layernorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/8 thread(s) 8250 ns 8646 ns 0.95
layernorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/1 thread(s) 7083.5 ns 6812.5 ns 1.04
layernorm(2, act=identity, affine=true)(32 x 32)/forward/GPU/CUDA 134375 ns 113059.5 ns 1.19
layernorm(2, act=identity, affine=true)(32 x 32)/forward/GPU/oneAPI 5976128 ns 5983032 ns 1.00
layernorm(2, act=identity, affine=true)(32 x 32)/forward/GPU/AMDGPU 65651 ns 64461 ns 1.02
layernorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/2 thread(s) 14041.5 ns 11875 ns 1.18
layernorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/4 thread(s) 13125 ns 13583 ns 0.97
layernorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/8 thread(s) 14479 ns 14854.5 ns 0.97
layernorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/1 thread(s) 14625 ns 14750 ns 0.99
layernorm(2, act=identity, affine=true)(32 x 32)/zygote/GPU/CUDA 872555 ns 670072 ns 1.30
layernorm(2, act=identity, affine=true)(32 x 32)/zygote/GPU/oneAPI 40293875 ns 40018921 ns 1.01
layernorm(2, act=identity, affine=true)(32 x 32)/zygote/GPU/AMDGPU 400284 ns 400084 ns 1.00
batchedmm(512, Bsize=4)/forward/CPU/2 thread(s) 6157812.5 ns 6149145.5 ns 1.00
batchedmm(512, Bsize=4)/forward/CPU/4 thread(s) 6375333.5 ns 6373791 ns 1.00
batchedmm(512, Bsize=4)/forward/CPU/8 thread(s) 6376917 ns 6369958 ns 1.00
batchedmm(512, Bsize=4)/forward/CPU/1 thread(s) 11913125 ns 11914917 ns 1.00
batchedmm(512, Bsize=4)/forward/GPU/CUDA 346601.5 ns 348199 ns 1.00
batchedmm(512, Bsize=4)/forward/GPU/oneAPI 53593217 ns 55221895 ns 0.97
batchedmm(512, Bsize=4)/forward/GPU/AMDGPU 320474 ns 318854 ns 1.01
batchedmm(512, Bsize=4)/zygote/CPU/2 thread(s) 19110896 ns 19112395.5 ns 1.00
batchedmm(512, Bsize=4)/zygote/CPU/4 thread(s) 19977396 ns 19954875 ns 1.00
batchedmm(512, Bsize=4)/zygote/CPU/8 thread(s) 19903104 ns 19933333 ns 1.00
batchedmm(512, Bsize=4)/zygote/CPU/1 thread(s) 36496187.5 ns 36546937.5 ns 1.00
batchedmm(512, Bsize=4)/zygote/GPU/CUDA 1012562 ns 1032394 ns 0.98
batchedmm(512, Bsize=4)/zygote/GPU/oneAPI 77852170.5 ns 78448314.5 ns 0.99
batchedmm(512, Bsize=4)/zygote/GPU/AMDGPU 1157544 ns 1157393 ns 1.00
dense(2, bias=true, act=gelu)(2 x 128)/forward/CPU/2 thread(s) 1000 ns 958 ns 1.04
dense(2, bias=true, act=gelu)(2 x 128)/forward/CPU/4 thread(s) 1000 ns 958 ns 1.04
dense(2, bias=true, act=gelu)(2 x 128)/forward/CPU/8 thread(s) 1042 ns 1000 ns 1.04
dense(2, bias=true, act=gelu)(2 x 128)/forward/CPU/1 thread(s) 958 ns 958 ns 1
dense(2, bias=true, act=gelu)(2 x 128)/forward/GPU/CUDA 22944 ns 20220 ns 1.13
dense(2, bias=true, act=gelu)(2 x 128)/forward/GPU/oneAPI 2044697 ns 2011379 ns 1.02
dense(2, bias=true, act=gelu)(2 x 128)/forward/GPU/AMDGPU 207642 ns 207432 ns 1.00
dense(2, bias=true, act=gelu)(2 x 128)/zygote/CPU/2 thread(s) 3917 ns 3667 ns 1.07
dense(2, bias=true, act=gelu)(2 x 128)/zygote/CPU/4 thread(s) 4000 ns 3667 ns 1.09
dense(2, bias=true, act=gelu)(2 x 128)/zygote/CPU/8 thread(s) 4042 ns 3750 ns 1.08
dense(2, bias=true, act=gelu)(2 x 128)/zygote/CPU/1 thread(s) 4000 ns 3625 ns 1.10
dense(2, bias=true, act=gelu)(2 x 128)/zygote/GPU/CUDA 269119 ns 242662 ns 1.11
dense(2, bias=true, act=gelu)(2 x 128)/zygote/GPU/oneAPI 11661739 ns 11613706.5 ns 1.00
dense(2, bias=true, act=gelu)(2 x 128)/zygote/GPU/AMDGPU 625997 ns 625907 ns 1.00
groupnorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/2 thread(s) 8437.5 ns 7229.5 ns 1.17
groupnorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/4 thread(s) 8895.5 ns 8208 ns 1.08
groupnorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/8 thread(s) 9562 ns 9063 ns 1.06
groupnorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/1 thread(s) 8375 ns 7833 ns 1.07
groupnorm(2, act=relu, affine=true)(4 x 32)/forward/GPU/CUDA 113535 ns 110132.5 ns 1.03
groupnorm(2, act=relu, affine=true)(4 x 32)/forward/GPU/oneAPI 3443497.5 ns 3376276 ns 1.02
groupnorm(2, act=relu, affine=true)(4 x 32)/forward/GPU/AMDGPU 68271 ns 72491 ns 0.94
groupnorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/2 thread(s) 11896 ns 11792 ns 1.01
groupnorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/4 thread(s) 12021 ns 11708 ns 1.03
groupnorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/8 thread(s) 12792 ns 12833 ns 1.00
groupnorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/1 thread(s) 12583 ns 12042 ns 1.04
groupnorm(2, act=relu, affine=true)(4 x 32)/zygote/GPU/CUDA 597497 ns 533463.5 ns 1.12
groupnorm(2, act=relu, affine=true)(4 x 32)/zygote/GPU/oneAPI 21602127 ns 22224767.5 ns 0.97
groupnorm(2, act=relu, affine=true)(4 x 32)/zygote/GPU/AMDGPU 354444 ns 357164 ns 0.99
dense(2, bias=false, act=relu)(2 x 128)/forward/CPU/2 thread(s) 291 ns 250 ns 1.16
dense(2, bias=false, act=relu)(2 x 128)/forward/CPU/4 thread(s) 292 ns 333 ns 0.88
dense(2, bias=false, act=relu)(2 x 128)/forward/CPU/8 thread(s) 333 ns 292 ns 1.14
dense(2, bias=false, act=relu)(2 x 128)/forward/CPU/1 thread(s) 292 ns 292 ns 1
dense(2, bias=false, act=relu)(2 x 128)/forward/GPU/CUDA 22361 ns 20014 ns 1.12
dense(2, bias=false, act=relu)(2 x 128)/forward/GPU/oneAPI 1916584 ns 2044805 ns 0.94
dense(2, bias=false, act=relu)(2 x 128)/forward/GPU/AMDGPU 46890 ns 46611 ns 1.01
dense(2, bias=false, act=relu)(2 x 128)/zygote/CPU/2 thread(s) 3000 ns 3167 ns 0.95
dense(2, bias=false, act=relu)(2 x 128)/zygote/CPU/4 thread(s) 2958 ns 2875 ns 1.03
dense(2, bias=false, act=relu)(2 x 128)/zygote/CPU/8 thread(s) 3333 ns 3083 ns 1.08
dense(2, bias=false, act=relu)(2 x 128)/zygote/CPU/1 thread(s) 2917 ns 2834 ns 1.03
dense(2, bias=false, act=relu)(2 x 128)/zygote/GPU/CUDA 193738 ns 168487.5 ns 1.15
dense(2, bias=false, act=relu)(2 x 128)/zygote/GPU/oneAPI 9462333 ns 9185467 ns 1.03
dense(2, bias=false, act=relu)(2 x 128)/zygote/GPU/AMDGPU 156212 ns 163482 ns 0.96
groupnorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/2 thread(s) 10583 ns 11500 ns 0.92
groupnorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/4 thread(s) 11875 ns 11292 ns 1.05
groupnorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/8 thread(s) 13083.5 ns 13562.5 ns 0.96
groupnorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/1 thread(s) 12062.5 ns 9687.5 ns 1.25
groupnorm(2, act=gelu, affine=false)(32 x 32)/forward/GPU/CUDA 113976.5 ns 110742.5 ns 1.03
groupnorm(2, act=gelu, affine=false)(32 x 32)/forward/GPU/oneAPI 3275659 ns 3318937 ns 0.99
groupnorm(2, act=gelu, affine=false)(32 x 32)/forward/GPU/AMDGPU 236063 ns 234383 ns 1.01
groupnorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/2 thread(s) 21833.5 ns 22041.5 ns 0.99
groupnorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/4 thread(s) 22145.5 ns 21312.5 ns 1.04
groupnorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/8 thread(s) 23875 ns 22292 ns 1.07
groupnorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/1 thread(s) 22333 ns 21375.5 ns 1.04
groupnorm(2, act=gelu, affine=false)(32 x 32)/zygote/GPU/CUDA 547934 ns 445412.5 ns 1.23
groupnorm(2, act=gelu, affine=false)(32 x 32)/zygote/GPU/oneAPI 20491745 ns 20307385 ns 1.01
groupnorm(2, act=gelu, affine=false)(32 x 32)/zygote/GPU/AMDGPU 654438 ns 648033 ns 1.01
dense(32, bias=true, act=relu)(32 x 128)/forward/CPU/2 thread(s) 4375 ns 4375 ns 1
dense(32, bias=true, act=relu)(32 x 128)/forward/CPU/4 thread(s) 4458 ns 4417 ns 1.01
dense(32, bias=true, act=relu)(32 x 128)/forward/CPU/8 thread(s) 4417 ns 4417 ns 1
dense(32, bias=true, act=relu)(32 x 128)/forward/CPU/1 thread(s) 4417 ns 4375 ns 1.01
dense(32, bias=true, act=relu)(32 x 128)/forward/GPU/CUDA 23860 ns 21103 ns 1.13
dense(32, bias=true, act=relu)(32 x 128)/forward/GPU/oneAPI 2144860.5 ns 2254531 ns 0.95
dense(32, bias=true, act=relu)(32 x 128)/forward/GPU/AMDGPU 49061 ns 47271 ns 1.04
dense(32, bias=true, act=relu)(32 x 128)/zygote/CPU/2 thread(s) 16375 ns 16542 ns 0.99
dense(32, bias=true, act=relu)(32 x 128)/zygote/CPU/4 thread(s) 16666 ns 16458 ns 1.01
dense(32, bias=true, act=relu)(32 x 128)/zygote/CPU/8 thread(s) 16666 ns 16667 ns 1.00
dense(32, bias=true, act=relu)(32 x 128)/zygote/CPU/1 thread(s) 16541 ns 16542 ns 1.00
dense(32, bias=true, act=relu)(32 x 128)/zygote/GPU/CUDA 316685 ns 292441 ns 1.08
dense(32, bias=true, act=relu)(32 x 128)/zygote/GPU/oneAPI 12062386.5 ns 12584045 ns 0.96
dense(32, bias=true, act=relu)(32 x 128)/zygote/GPU/AMDGPU 209243 ns 206702.5 ns 1.01
batchnorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/2 thread(s) 2000 ns 2041 ns 0.98
batchnorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/4 thread(s) 2084 ns 2042 ns 1.02
batchnorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/8 thread(s) 2209 ns 2083 ns 1.06
batchnorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/1 thread(s) 2208 ns 1916 ns 1.15
batchnorm(2, act=gelu, affine=false)(32 x 32)/forward/GPU/CUDA 34477 ns 27885 ns 1.24
batchnorm(2, act=gelu, affine=false)(32 x 32)/forward/GPU/oneAPI 1229094 ns 1248055 ns 0.98
batchnorm(2, act=gelu, affine=false)(32 x 32)/forward/GPU/AMDGPU 203202 ns 203262 ns 1.00
batchnorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/2 thread(s) 18604 ns 16833 ns 1.11
batchnorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/4 thread(s) 18708 ns 18250 ns 1.03
batchnorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/8 thread(s) 18833.5 ns 18125 ns 1.04
batchnorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/1 thread(s) 21208.5 ns 17667 ns 1.20
batchnorm(2, act=gelu, affine=false)(32 x 32)/zygote/GPU/CUDA 282309 ns 178504 ns 1.58
batchnorm(2, act=gelu, affine=false)(32 x 32)/zygote/GPU/oneAPI 21098361 ns 21525405 ns 0.98
batchnorm(2, act=gelu, affine=false)(32 x 32)/zygote/GPU/AMDGPU 686013 ns 684992 ns 1.00
batchedmm(16, Bsize=512)/forward/CPU/2 thread(s) 59292 ns 59104 ns 1.00
batchedmm(16, Bsize=512)/forward/CPU/4 thread(s) 64917 ns 65041 ns 1.00
batchedmm(16, Bsize=512)/forward/CPU/8 thread(s) 66458 ns 66583.5 ns 1.00
batchedmm(16, Bsize=512)/forward/CPU/1 thread(s) 51625 ns 51125 ns 1.01
batchedmm(16, Bsize=512)/forward/GPU/CUDA 66488 ns 71334 ns 0.93
batchedmm(16, Bsize=512)/forward/GPU/oneAPI 88258686 ns 89279199 ns 0.99
batchedmm(16, Bsize=512)/forward/GPU/AMDGPU 118491 ns 118362 ns 1.00
batchedmm(16, Bsize=512)/zygote/CPU/2 thread(s) 175916.5 ns 163062.5 ns 1.08
batchedmm(16, Bsize=512)/zygote/CPU/4 thread(s) 153479 ns 151271 ns 1.01
batchedmm(16, Bsize=512)/zygote/CPU/8 thread(s) 160333.5 ns 157250 ns 1.02
batchedmm(16, Bsize=512)/zygote/CPU/1 thread(s) 224542 ns 313146 ns 0.72
batchedmm(16, Bsize=512)/zygote/GPU/CUDA 208290.5 ns 195169 ns 1.07
batchedmm(16, Bsize=512)/zygote/GPU/oneAPI 149475929.5 ns 151578490.5 ns 0.99
batchedmm(16, Bsize=512)/zygote/GPU/AMDGPU 608982 ns 624817 ns 0.97
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 81083 ns 82145.5 ns 0.99
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 83270.5 ns 82749.5 ns 1.01
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 124833.5 ns 86667 ns 1.44
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 85395.5 ns 85000 ns 1.00
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/GPU/CUDA 192029 ns 186525 ns 1.03
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/GPU/oneAPI 5900244 ns 5756836 ns 1.02
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/GPU/AMDGPU 202972.5 ns 205352 ns 0.99
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 1881145.5 ns 1808020.5 ns 1.04
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 1912667 ns 1915916.5 ns 1.00
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 1916083 ns 1905270.5 ns 1.01
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 1849250 ns 1911375 ns 0.97
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 499932 ns 475978 ns 1.05
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/oneAPI 26802673 ns 27045542 ns 0.99
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/AMDGPU 1066872 ns 1069182 ns 1.00
dense(2, bias=true, act=identity)(2 x 128)/forward/CPU/2 thread(s) 292 ns 292 ns 1
dense(2, bias=true, act=identity)(2 x 128)/forward/CPU/4 thread(s) 292 ns 291 ns 1.00
dense(2, bias=true, act=identity)(2 x 128)/forward/CPU/8 thread(s) 292 ns 292 ns 1
dense(2, bias=true, act=identity)(2 x 128)/forward/CPU/1 thread(s) 292 ns 291 ns 1.00
dense(2, bias=true, act=identity)(2 x 128)/forward/GPU/CUDA 21422.5 ns 18638 ns 1.15
dense(2, bias=true, act=identity)(2 x 128)/forward/GPU/oneAPI 2063314.5 ns 2108817.5 ns 0.98
dense(2, bias=true, act=identity)(2 x 128)/forward/GPU/AMDGPU 41850 ns 42830 ns 0.98
dense(2, bias=true, act=identity)(2 x 128)/zygote/CPU/2 thread(s) 1792 ns 1792 ns 1
dense(2, bias=true, act=identity)(2 x 128)/zygote/CPU/4 thread(s) 1834 ns 1792 ns 1.02
dense(2, bias=true, act=identity)(2 x 128)/zygote/CPU/8 thread(s) 1834 ns 1833 ns 1.00
dense(2, bias=true, act=identity)(2 x 128)/zygote/CPU/1 thread(s) 1875 ns 1791 ns 1.05
dense(2, bias=true, act=identity)(2 x 128)/zygote/GPU/CUDA 241279.5 ns 225657 ns 1.07
dense(2, bias=true, act=identity)(2 x 128)/zygote/GPU/oneAPI 9975087 ns 9833710 ns 1.01
dense(2, bias=true, act=identity)(2 x 128)/zygote/GPU/AMDGPU 180262 ns 182527.5 ns 0.99
groupnorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/2 thread(s) 8166 ns 8375 ns 0.98
groupnorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/4 thread(s) 10292 ns 9125 ns 1.13
groupnorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/8 thread(s) 11208 ns 11083 ns 1.01
groupnorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/1 thread(s) 11042 ns 8041 ns 1.37
groupnorm(2, act=gelu, affine=false)(4 x 32)/forward/GPU/CUDA 113299.5 ns 108185.5 ns 1.05
groupnorm(2, act=gelu, affine=false)(4 x 32)/forward/GPU/oneAPI 3500381.5 ns 3365841 ns 1.04
groupnorm(2, act=gelu, affine=false)(4 x 32)/forward/GPU/AMDGPU 233333 ns 232582 ns 1.00
groupnorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/2 thread(s) 9917 ns 10167 ns 0.98
groupnorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/4 thread(s) 9834 ns 9542 ns 1.03
groupnorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/8 thread(s) 11458 ns 10417 ns 1.10
groupnorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/1 thread(s) 10417 ns 9291 ns 1.12
groupnorm(2, act=gelu, affine=false)(4 x 32)/zygote/GPU/CUDA 484445 ns 420282.5 ns 1.15
groupnorm(2, act=gelu, affine=false)(4 x 32)/zygote/GPU/oneAPI 18749564 ns 20467429 ns 0.92
groupnorm(2, act=gelu, affine=false)(4 x 32)/zygote/GPU/AMDGPU 627157 ns 629687 ns 1.00
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 58375 ns 57916 ns 1.01
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 47209 ns 46583 ns 1.01
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 46833 ns 46458 ns 1.01
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 82625 ns 83583 ns 0.99
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/GPU/CUDA 38276 ns 32500 ns 1.18
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/GPU/oneAPI 1341940 ns 1374457 ns 0.98
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/GPU/AMDGPU 75211 ns 72281 ns 1.04
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 1836770.5 ns 1911000 ns 0.96
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 1985937.5 ns 1970187.5 ns 1.01
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 1978479 ns 1937771 ns 1.02
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 1854291.5 ns 1899667 ns 0.98
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 209126 ns 176646 ns 1.18
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/oneAPI 33357124 ns 33503348 ns 1.00
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/AMDGPU 1011361 ns 1152023 ns 0.88
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 267437.5 ns 418084 ns 0.64
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 270417 ns 417375 ns 0.65
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 270625 ns 427542 ns 0.63
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 268604.5 ns 420250 ns 0.64
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/CUDA 193011.5 ns 173254.5 ns 1.11
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/oneAPI 7986425 ns 7736703 ns 1.03
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/AMDGPU 282544 ns 280773 ns 1.01
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 588125 ns 671833.5 ns 0.88
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 688229.5 ns 766666.5 ns 0.90
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 688292 ns 684542 ns 1.01
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 593500 ns 731041.5 ns 0.81
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 985216 ns 887279 ns 1.11
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/oneAPI 43272459 ns 46741128 ns 0.93
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/AMDGPU 911561 ns 905534.5 ns 1.01
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 2205542 ns 3464375 ns 0.64
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 2194083.5 ns 3437833 ns 0.64
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 2213708 ns 3397500 ns 0.65
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 2176167 ns 3449958 ns 0.63
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/CUDA 153511 ns 148014 ns 1.04
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/oneAPI 8157200 ns 8945738 ns 0.91
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/AMDGPU 445380 ns 441160 ns 1.01
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 5508979.5 ns 6193666.5 ns 0.89
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 5521979 ns 6178645.5 ns 0.89
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 5474458 ns 6207958 ns 0.88
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 5531895.5 ns 6230917 ns 0.89
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 925959.5 ns 821729 ns 1.13
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/oneAPI 50527002 ns 51511265 ns 0.98
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/AMDGPU 1539832.5 ns 1636158 ns 0.94
dense(512, bias=true, act=gelu)(512 x 128)/forward/CPU/2 thread(s) 478666 ns 473083.5 ns 1.01
dense(512, bias=true, act=gelu)(512 x 128)/forward/CPU/4 thread(s) 346145.5 ns 342041.5 ns 1.01
dense(512, bias=true, act=gelu)(512 x 128)/forward/CPU/8 thread(s) 346083 ns 341500 ns 1.01
dense(512, bias=true, act=gelu)(512 x 128)/forward/CPU/1 thread(s) 909333 ns 902375 ns 1.01
dense(512, bias=true, act=gelu)(512 x 128)/forward/GPU/CUDA 46203 ns 42882 ns 1.08
dense(512, bias=true, act=gelu)(512 x 128)/forward/GPU/oneAPI 382606 ns 400566 ns 0.96
dense(512, bias=true, act=gelu)(512 x 128)/forward/GPU/AMDGPU 242913 ns 241152 ns 1.01
dense(512, bias=true, act=gelu)(512 x 128)/zygote/CPU/2 thread(s) 2111749.5 ns 2324750 ns 0.91
dense(512, bias=true, act=gelu)(512 x 128)/zygote/CPU/4 thread(s) 1861166.5 ns 2038541.5 ns 0.91
dense(512, bias=true, act=gelu)(512 x 128)/zygote/CPU/8 thread(s) 1866541 ns 2032354 ns 0.92
dense(512, bias=true, act=gelu)(512 x 128)/zygote/CPU/1 thread(s) 3130375 ns 3197000 ns 0.98
dense(512, bias=true, act=gelu)(512 x 128)/zygote/GPU/CUDA 258500 ns 202331 ns 1.28
dense(512, bias=true, act=gelu)(512 x 128)/zygote/GPU/oneAPI 15052922 ns 12642725 ns 1.19
dense(512, bias=true, act=gelu)(512 x 128)/zygote/GPU/AMDGPU 773039 ns 763338 ns 1.01
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 58125 ns 57520.5 ns 1.01
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 46334 ns 46395.5 ns 1.00
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 46167 ns 45959 ns 1.00
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 82542 ns 83250 ns 0.99
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/CUDA 27952 ns 23227 ns 1.20
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/oneAPI 1310631 ns 1432334 ns 0.92
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/AMDGPU 73681 ns 75651 ns 0.97
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 2039458 ns 2029625 ns 1.00
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 2089729.5 ns 2079979 ns 1.00
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 2087020.5 ns 2070791 ns 1.01
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 1978124.5 ns 2000354 ns 0.99
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 221951 ns 191580 ns 1.16
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/oneAPI 36802380 ns 35959863 ns 1.02
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/AMDGPU 1041362 ns 1041881.5 ns 1.00
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 58417 ns 57291 ns 1.02
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 46958 ns 46645.5 ns 1.01
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 46417 ns 46625 ns 1.00
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 82334 ns 83334 ns 0.99
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/GPU/CUDA 47697.5 ns 40746 ns 1.17
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/GPU/oneAPI 816463 ns 810264 ns 1.01
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/GPU/AMDGPU 71371 ns 80396 ns 0.89
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 1926479.5 ns 1890166 ns 1.02
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 1973250 ns 1976042 ns 1.00
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 1973167 ns 1971667 ns 1.00
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 1898833 ns 1895583 ns 1.00
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 228428 ns 198522 ns 1.15
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/oneAPI 17513790 ns 17337732 ns 1.01
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/AMDGPU 1026836.5 ns 936080 ns 1.10
batchnorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/2 thread(s) 292 ns 291 ns 1.00
batchnorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/4 thread(s) 333 ns 291 ns 1.14
batchnorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/8 thread(s) 416 ns 416 ns 1
batchnorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/1 thread(s) 334 ns 333 ns 1.00
batchnorm(2, act=identity, affine=false)(4 x 32)/forward/GPU/CUDA 33167 ns 25307.5 ns 1.31
batchnorm(2, act=identity, affine=false)(4 x 32)/forward/GPU/oneAPI 1174385.5 ns 1259521 ns 0.93
batchnorm(2, act=identity, affine=false)(4 x 32)/forward/GPU/AMDGPU 48501 ns 46650 ns 1.04
batchnorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/2 thread(s) 6416 ns 6562.5 ns 0.98
batchnorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/4 thread(s) 6834 ns 6917 ns 0.99
batchnorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/8 thread(s) 7250 ns 7292 ns 0.99
batchnorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/1 thread(s) 6333 ns 6834 ns 0.93
batchnorm(2, act=identity, affine=false)(4 x 32)/zygote/GPU/CUDA 199581 ns 168328.5 ns 1.19
batchnorm(2, act=identity, affine=false)(4 x 32)/zygote/GPU/oneAPI 19880225 ns 20601648 ns 0.96
batchnorm(2, act=identity, affine=false)(4 x 32)/zygote/GPU/AMDGPU 363764 ns 371864 ns 0.98
dense(2, bias=false, act=identity)(2 x 128)/forward/CPU/2 thread(s) 250 ns 250 ns 1
dense(2, bias=false, act=identity)(2 x 128)/forward/CPU/4 thread(s) 291 ns 291 ns 1
dense(2, bias=false, act=identity)(2 x 128)/forward/CPU/8 thread(s) 292 ns 291 ns 1.00
dense(2, bias=false, act=identity)(2 x 128)/forward/CPU/1 thread(s) 291 ns 291 ns 1
dense(2, bias=false, act=identity)(2 x 128)/forward/GPU/CUDA 32517 ns 30302 ns 1.07
dense(2, bias=false, act=identity)(2 x 128)/forward/GPU/oneAPI 1265101 ns 1177600.5 ns 1.07
dense(2, bias=false, act=identity)(2 x 128)/forward/GPU/AMDGPU 37771 ns 37815.5 ns 1.00
dense(2, bias=false, act=identity)(2 x 128)/zygote/CPU/2 thread(s) 3417 ns 3542 ns 0.96
dense(2, bias=false, act=identity)(2 x 128)/zygote/CPU/4 thread(s) 3375 ns 2833 ns 1.19
dense(2, bias=false, act=identity)(2 x 128)/zygote/CPU/8 thread(s) 3167 ns 3250 ns 0.97
dense(2, bias=false, act=identity)(2 x 128)/zygote/CPU/1 thread(s) 2792 ns 2959 ns 0.94
dense(2, bias=false, act=identity)(2 x 128)/zygote/GPU/CUDA 182053 ns 169119 ns 1.08
dense(2, bias=false, act=identity)(2 x 128)/zygote/GPU/oneAPI 9212477.5 ns 7614831 ns 1.21
dense(2, bias=false, act=identity)(2 x 128)/zygote/GPU/AMDGPU 158127 ns 152811 ns 1.03
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 460687.5 ns 450021 ns 1.02
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 478208.5 ns 441041 ns 1.08
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 500000 ns 425041.5 ns 1.18
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 470937 ns 422292 ns 1.12
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/CUDA 134071 ns 130746.5 ns 1.03
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/oneAPI 5855749 ns 6115924 ns 0.96
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/AMDGPU 366189 ns 366698.5 ns 1.00
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 4078667 ns 3801375 ns 1.07
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 4067771 ns 3799958 ns 1.07
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 4080625 ns 3805000 ns 1.07
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 4056354 ns 3829062.5 ns 1.06
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 664164.5 ns 640512 ns 1.04
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/oneAPI 31731318 ns 35444962 ns 0.90
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/AMDGPU 1467136 ns 1468321 ns 1.00
batchedmm(512, Bsize=32)/forward/CPU/2 thread(s) 49955792 ns 49831750 ns 1.00
batchedmm(512, Bsize=32)/forward/CPU/4 thread(s) 35488958 ns 35529708 ns 1.00
batchedmm(512, Bsize=32)/forward/CPU/8 thread(s) 35531584 ns 35490875 ns 1.00
batchedmm(512, Bsize=32)/forward/CPU/1 thread(s) 97090437.5 ns 97095125 ns 1.00
batchedmm(512, Bsize=32)/forward/GPU/CUDA 1601101.5 ns 1612269 ns 0.99
batchedmm(512, Bsize=32)/forward/GPU/oneAPI 55729446 ns 56680008 ns 0.98
batchedmm(512, Bsize=32)/forward/GPU/AMDGPU 1044391.5 ns 1041171 ns 1.00
batchedmm(512, Bsize=32)/zygote/CPU/2 thread(s) 154677542 ns 154466500.5 ns 1.00
batchedmm(512, Bsize=32)/zygote/CPU/4 thread(s) 112413020.5 ns 112376375 ns 1.00
batchedmm(512, Bsize=32)/zygote/CPU/8 thread(s) 112347584 ns 112311958 ns 1.00
batchedmm(512, Bsize=32)/zygote/CPU/1 thread(s) 295444937.5 ns 295244375 ns 1.00
batchedmm(512, Bsize=32)/zygote/GPU/CUDA 6489665.5 ns 6476168 ns 1.00
batchedmm(512, Bsize=32)/zygote/GPU/oneAPI 124609188 ns 174388525 ns 0.71
batchedmm(512, Bsize=32)/zygote/GPU/AMDGPU 5586056.5 ns 5549710 ns 1.01
bias_activation(32, act=tanh)(32 x 128)/forward/CPU/2 thread(s) 19520.5 ns 16979 ns 1.15
bias_activation(32, act=tanh)(32 x 128)/forward/CPU/4 thread(s) 17458 ns 19562.5 ns 0.89
bias_activation(32, act=tanh)(32 x 128)/forward/CPU/8 thread(s) 17417 ns 17188 ns 1.01
bias_activation(32, act=tanh)(32 x 128)/forward/CPU/1 thread(s) 15750 ns 15020.5 ns 1.05
bias_activation(32, act=tanh)(32 x 128)/forward/GPU/CUDA 19821 ns 14071 ns 1.41
bias_activation(32, act=tanh)(32 x 128)/forward/GPU/oneAPI 1180885 ns 1254861 ns 0.94
bias_activation(32, act=tanh)(32 x 128)/forward/GPU/AMDGPU 26420 ns 25910 ns 1.02
bias_activation(32, act=tanh)(32 x 128)/zygote/CPU/2 thread(s) 10959 ns 10520.5 ns 1.04
bias_activation(32, act=tanh)(32 x 128)/zygote/CPU/4 thread(s) 9125.5 ns 8709 ns 1.05
bias_activation(32, act=tanh)(32 x 128)/zygote/CPU/8 thread(s) 9084 ns 8917 ns 1.02
bias_activation(32, act=tanh)(32 x 128)/zygote/CPU/1 thread(s) 17167 ns 17479 ns 0.98
bias_activation(32, act=tanh)(32 x 128)/zygote/GPU/CUDA 242736.5 ns 209068 ns 1.16
bias_activation(32, act=tanh)(32 x 128)/zygote/GPU/oneAPI 10064346.5 ns 10230351.5 ns 0.98
bias_activation(32, act=tanh)(32 x 128)/zygote/GPU/AMDGPU 149096.5 ns 148622 ns 1.00
groupnorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/2 thread(s) 8208.5 ns 7750 ns 1.06
groupnorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/4 thread(s) 10687.5 ns 7854.5 ns 1.36
groupnorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/8 thread(s) 10604.5 ns 10334 ns 1.03
groupnorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/1 thread(s) 8959 ns 7583 ns 1.18
groupnorm(2, act=gelu, affine=true)(4 x 32)/forward/GPU/CUDA 116211.5 ns 111568.5 ns 1.04
groupnorm(2, act=gelu, affine=true)(4 x 32)/forward/GPU/oneAPI 3615906 ns 3718095.5 ns 0.97
groupnorm(2, act=gelu, affine=true)(4 x 32)/forward/GPU/AMDGPU 234342 ns 237553 ns 0.99
groupnorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/2 thread(s) 10292 ns 11541.5 ns 0.89
groupnorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/4 thread(s) 10209 ns 9687.5 ns 1.05
groupnorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/8 thread(s) 11292 ns 10708 ns 1.05
groupnorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/1 thread(s) 9437.5 ns 10709 ns 0.88
groupnorm(2, act=gelu, affine=true)(4 x 32)/zygote/GPU/CUDA 575593 ns 501739 ns 1.15
groupnorm(2, act=gelu, affine=true)(4 x 32)/zygote/GPU/oneAPI 22955140 ns 23065545 ns 1.00
groupnorm(2, act=gelu, affine=true)(4 x 32)/zygote/GPU/AMDGPU 652487 ns 655677 ns 1.00
groupnorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/2 thread(s) 9250 ns 8770.5 ns 1.05
groupnorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/4 thread(s) 9875 ns 9750 ns 1.01
groupnorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/8 thread(s) 11041.5 ns 10583 ns 1.04
groupnorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/1 thread(s) 10333.5 ns 8750 ns 1.18
groupnorm(2, act=relu, affine=false)(32 x 32)/forward/GPU/CUDA 113252 ns 53968 ns 2.10
groupnorm(2, act=relu, affine=false)(32 x 32)/forward/GPU/oneAPI 3518835.5 ns 3498205 ns 1.01
groupnorm(2, act=relu, affine=false)(32 x 32)/forward/GPU/AMDGPU 72611 ns 72631 ns 1.00
groupnorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/2 thread(s) 16542 ns 13459 ns 1.23
groupnorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/4 thread(s) 15833 ns 15479 ns 1.02
groupnorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/8 thread(s) 15750 ns 19209 ns 0.82
groupnorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/1 thread(s) 16708 ns 14125 ns 1.18
groupnorm(2, act=relu, affine=false)(32 x 32)/zygote/GPU/CUDA 548922 ns 250540 ns 2.19
groupnorm(2, act=relu, affine=false)(32 x 32)/zygote/GPU/oneAPI 19847936.5 ns 20620278 ns 0.96
groupnorm(2, act=relu, affine=false)(32 x 32)/zygote/GPU/AMDGPU 343724 ns 346043 ns 0.99
batchnorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/2 thread(s) 500 ns 459 ns 1.09
batchnorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/4 thread(s) 584 ns 500 ns 1.17
batchnorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/8 thread(s) 625 ns 583 ns 1.07
batchnorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/1 thread(s) 584 ns 458 ns 1.28
batchnorm(2, act=gelu, affine=false)(4 x 32)/forward/GPU/CUDA 33202 ns 26861 ns 1.24
batchnorm(2, act=gelu, affine=false)(4 x 32)/forward/GPU/oneAPI 1238450.5 ns 1254571 ns 0.99
batchnorm(2, act=gelu, affine=false)(4 x 32)/forward/GPU/AMDGPU 204092 ns 204762 ns 1.00
batchnorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/2 thread(s) 8916 ns 7208.5 ns 1.24
batchnorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/4 thread(s) 9292 ns 9000 ns 1.03
batchnorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/8 thread(s) 9958 ns 9125 ns 1.09
batchnorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/1 thread(s) 12292 ns 8166 ns 1.51
batchnorm(2, act=gelu, affine=false)(4 x 32)/zygote/GPU/CUDA 220226.5 ns 147122.5 ns 1.50
batchnorm(2, act=gelu, affine=false)(4 x 32)/zygote/GPU/oneAPI 21905010 ns 22634021 ns 0.97
batchnorm(2, act=gelu, affine=false)(4 x 32)/zygote/GPU/AMDGPU 657387 ns 659287 ns 1.00
bias_activation(32, act=gelu)(32 x 128)/forward/CPU/2 thread(s) 17625 ns 15416 ns 1.14
bias_activation(32, act=gelu)(32 x 128)/forward/CPU/4 thread(s) 15958 ns 16625 ns 0.96
bias_activation(32, act=gelu)(32 x 128)/forward/CPU/8 thread(s) 15209 ns 14917 ns 1.02
bias_activation(32, act=gelu)(32 x 128)/forward/CPU/1 thread(s) 11291 ns 11291 ns 1
bias_activation(32, act=gelu)(32 x 128)/forward/GPU/CUDA 19970 ns 13973 ns 1.43
bias_activation(32, act=gelu)(32 x 128)/forward/GPU/oneAPI 1162812 ns 1108916 ns 1.05
bias_activation(32, act=gelu)(32 x 128)/forward/GPU/AMDGPU 188782 ns 186562 ns 1.01
bias_activation(32, act=gelu)(32 x 128)/zygote/CPU/2 thread(s) 35458 ns 32000 ns 1.11
bias_activation(32, act=gelu)(32 x 128)/zygote/CPU/4 thread(s) 35562 ns 32000 ns 1.11
bias_activation(32, act=gelu)(32 x 128)/zygote/CPU/8 thread(s) 35645.5 ns 31958 ns 1.12
bias_activation(32, act=gelu)(32 x 128)/zygote/CPU/1 thread(s) 35542 ns 32167 ns 1.10
bias_activation(32, act=gelu)(32 x 128)/zygote/GPU/CUDA 255892 ns 109160 ns 2.34
bias_activation(32, act=gelu)(32 x 128)/zygote/GPU/oneAPI 10845224.5 ns 11487029 ns 0.94
bias_activation(32, act=gelu)(32 x 128)/zygote/GPU/AMDGPU 591957 ns 588817 ns 1.01
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 448958 ns 492875 ns 0.91
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 453750 ns 442125 ns 1.03
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 492166 ns 444958 ns 1.11
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 453875 ns 440604 ns 1.03
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/CUDA 193846 ns 188096.5 ns 1.03
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/oneAPI 6007739 ns 5891615 ns 1.02
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/AMDGPU 367744 ns 369779 ns 0.99
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 4069208 ns 3834584 ns 1.06
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 4054291.5 ns 3827292 ns 1.06
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 4049270.5 ns 3817250 ns 1.06
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 4057500 ns 3836104.5 ns 1.06
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 505408 ns 382999 ns 1.32
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/oneAPI 37330396 ns 28452071 ns 1.31
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/AMDGPU 1362695 ns 1355634 ns 1.01
batchedmm(512, Bsize=512)/forward/CPU/2 thread(s) 779601166 ns 831622791.5 ns 0.94
batchedmm(512, Bsize=512)/forward/CPU/4 thread(s) 542496166 ns 544951167 ns 1.00
batchedmm(512, Bsize=512)/forward/CPU/8 thread(s) 539989666 ns 544430500 ns 0.99
batchedmm(512, Bsize=512)/forward/CPU/1 thread(s) 1569938708 ns 1552948271 ns 1.01
batchedmm(512, Bsize=512)/forward/GPU/CUDA 22536712.5 ns 22763244.5 ns 0.99
batchedmm(512, Bsize=512)/forward/GPU/oneAPI 187859757.5 ns 185795205 ns 1.01
batchedmm(512, Bsize=512)/forward/GPU/AMDGPU 14732780 ns 15420059 ns 0.96
batchedmm(512, Bsize=512)/zygote/CPU/2 thread(s) 2505560125 ns 3888050458 ns 0.64
batchedmm(512, Bsize=512)/zygote/CPU/4 thread(s) 1783555333 ns 3211667750 ns 0.56
batchedmm(512, Bsize=512)/zygote/CPU/8 thread(s) 1792629375 ns 1819585250 ns 0.99
batchedmm(512, Bsize=512)/zygote/CPU/1 thread(s) 5216869375 ns 4769468292 ns 1.09
batchedmm(512, Bsize=512)/zygote/GPU/CUDA 118336848 ns 118595684 ns 1.00
batchedmm(512, Bsize=512)/zygote/GPU/oneAPI 935397218 ns 1039230192 ns 0.90
batchedmm(512, Bsize=512)/zygote/GPU/AMDGPU 88936600 ns 88183228 ns 1.01
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 78854.5 ns 75333.5 ns 1.05
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 76791 ns 77458 ns 0.99
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 79000 ns 78584 ns 1.01
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 79354 ns 76292 ns 1.04
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/GPU/CUDA 190682.5 ns 93335 ns 2.04
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/GPU/oneAPI 5473671 ns 6083372 ns 0.90
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/GPU/AMDGPU 108351 ns 120232 ns 0.90
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 294125 ns 279333 ns 1.05
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 289958 ns 194937.5 ns 1.49
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 261417 ns 234771 ns 1.11
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 238520.5 ns 194125 ns 1.23
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 986968 ns 451188 ns 2.19
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/oneAPI 46526863 ns 46239896 ns 1.01
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/AMDGPU 636237 ns 657366.5 ns 0.97
batchedmm(512, Bsize=128)/forward/CPU/2 thread(s) 199699479 ns 199509499.5 ns 1.00
batchedmm(512, Bsize=128)/forward/CPU/4 thread(s) 139060584 ns 139162834 ns 1.00
batchedmm(512, Bsize=128)/forward/CPU/8 thread(s) 139030750 ns 138977625 ns 1.00
batchedmm(512, Bsize=128)/forward/CPU/1 thread(s) 388620875 ns 388989959 ns 1.00
batchedmm(512, Bsize=128)/forward/GPU/CUDA 5814292 ns 5833602 ns 1.00
batchedmm(512, Bsize=128)/forward/GPU/oneAPI 80005938 ns 79568180.5 ns 1.01
batchedmm(512, Bsize=128)/forward/GPU/AMDGPU 3574368 ns 3573358 ns 1.00
batchedmm(512, Bsize=128)/zygote/CPU/2 thread(s) 621021958 ns 619161479.5 ns 1.00
batchedmm(512, Bsize=128)/zygote/CPU/4 thread(s) 439183125 ns 440796833 ns 1.00
batchedmm(512, Bsize=128)/zygote/CPU/8 thread(s) 439329875 ns 439294646 ns 1.00
batchedmm(512, Bsize=128)/zygote/CPU/1 thread(s) 1194801708 ns 1189363000 ns 1.00
batchedmm(512, Bsize=128)/zygote/GPU/CUDA 26594102 ns 26219564 ns 1.01
batchedmm(512, Bsize=128)/zygote/GPU/oneAPI 295168041 ns 283162239 ns 1.04
batchedmm(512, Bsize=128)/zygote/GPU/AMDGPU 22131978 ns 21927537.5 ns 1.01
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 7291 ns 7417 ns 0.98
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 6291 ns 6083.5 ns 1.03
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 6125 ns 6208 ns 0.99
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 9959 ns 10042 ns 0.99
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/GPU/CUDA 26445 ns 21654 ns 1.22
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/GPU/oneAPI 1270170 ns 1302067 ns 0.98
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/GPU/AMDGPU 48281 ns 48161 ns 1.00
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 216209 ns 212583 ns 1.02
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 220416.5 ns 228396 ns 0.97
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 222625 ns 222250 ns 1.00
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 219125 ns 213166.5 ns 1.03
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 214078.5 ns 136607.5 ns 1.57
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/oneAPI 29452909 ns 29564519.5 ns 1.00
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/AMDGPU 522765 ns 524845 ns 1.00
groupnorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/2 thread(s) 9416.5 ns 9833.5 ns 0.96
groupnorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/4 thread(s) 9041 ns 7979 ns 1.13
groupnorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/8 thread(s) 9833.5 ns 10250 ns 0.96
groupnorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/1 thread(s) 10791.5 ns 7978.5 ns 1.35
groupnorm(2, act=relu, affine=false)(4 x 32)/forward/GPU/CUDA 110026 ns 51011 ns 2.16
groupnorm(2, act=relu, affine=false)(4 x 32)/forward/GPU/oneAPI 3375913.5 ns 3317085 ns 1.02
groupnorm(2, act=relu, affine=false)(4 x 32)/forward/GPU/AMDGPU 72150 ns 69811 ns 1.03
groupnorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/2 thread(s) 10854.5 ns 7333.5 ns 1.48
groupnorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/4 thread(s) 7750 ns 9625 ns 0.81
groupnorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/8 thread(s) 9708 ns 13562.5 ns 0.72
groupnorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/1 thread(s) 11250 ns 8250 ns 1.36
groupnorm(2, act=relu, affine=false)(4 x 32)/zygote/GPU/CUDA 484552.5 ns 242632 ns 2.00
groupnorm(2, act=relu, affine=false)(4 x 32)/zygote/GPU/oneAPI 18934737 ns 19151322 ns 0.99
groupnorm(2, act=relu, affine=false)(4 x 32)/zygote/GPU/AMDGPU 313639 ns 316738.5 ns 0.99
batchnorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/2 thread(s) 459 ns 416 ns 1.10
batchnorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/4 thread(s) 500 ns 708 ns 0.71
batchnorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/8 thread(s) 542 ns 500 ns 1.08
batchnorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/1 thread(s) 667 ns 500 ns 1.33
batchnorm(2, act=identity, affine=true)(32 x 32)/forward/GPU/CUDA 24574 ns 19752 ns 1.24
batchnorm(2, act=identity, affine=true)(32 x 32)/forward/GPU/oneAPI 1221655.5 ns 1203125.5 ns 1.02
batchnorm(2, act=identity, affine=true)(32 x 32)/forward/GPU/AMDGPU 46721 ns 46481 ns 1.01
batchnorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/2 thread(s) 12292 ns 8625 ns 1.43
batchnorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/4 thread(s) 8896 ns 10249.5 ns 0.87
batchnorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/8 thread(s) 10583 ns 9500 ns 1.11
batchnorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/1 thread(s) 13666 ns 10292 ns 1.33
batchnorm(2, act=identity, affine=true)(32 x 32)/zygote/GPU/CUDA 243553 ns 119755.5 ns 2.03
batchnorm(2, act=identity, affine=true)(32 x 32)/zygote/GPU/oneAPI 23025269 ns 25677237 ns 0.90
batchnorm(2, act=identity, affine=true)(32 x 32)/zygote/GPU/AMDGPU 386704 ns 388684 ns 0.99
bias_activation(512, act=gelu)(512 x 128)/forward/CPU/2 thread(s) 111083 ns 105959 ns 1.05
bias_activation(512, act=gelu)(512 x 128)/forward/CPU/4 thread(s) 102541.5 ns 98500 ns 1.04
bias_activation(512, act=gelu)(512 x 128)/forward/CPU/8 thread(s) 103792 ns 101021 ns 1.03
bias_activation(512, act=gelu)(512 x 128)/forward/CPU/1 thread(s) 155083.5 ns 146271 ns 1.06
bias_activation(512, act=gelu)(512 x 128)/forward/GPU/CUDA 22624 ns 16996 ns 1.33
bias_activation(512, act=gelu)(512 x 128)/forward/GPU/oneAPI 791164 ns 756914 ns 1.05
bias_activation(512, act=gelu)(512 x 128)/forward/GPU/AMDGPU 191512 ns 190327 ns 1.01
bias_activation(512, act=gelu)(512 x 128)/zygote/CPU/2 thread(s) 567500 ns 478333 ns 1.19
bias_activation(512, act=gelu)(512 x 128)/zygote/CPU/4 thread(s) 573417 ns 509583 ns 1.13
bias_activation(512, act=gelu)(512 x 128)/zygote/CPU/8 thread(s) 549583.5 ns 478459 ns 1.15
bias_activation(512, act=gelu)(512 x 128)/zygote/CPU/1 thread(s) 537292 ns 478458.5 ns 1.12
bias_activation(512, act=gelu)(512 x 128)/zygote/GPU/CUDA 213930 ns 113991 ns 1.88
bias_activation(512, act=gelu)(512 x 128)/zygote/GPU/oneAPI 11700863.5 ns 12514796 ns 0.93
bias_activation(512, act=gelu)(512 x 128)/zygote/GPU/AMDGPU 608337 ns 604977 ns 1.01
batchedmm(16, Bsize=32)/forward/CPU/2 thread(s) 5750 ns 5375 ns 1.07
batchedmm(16, Bsize=32)/forward/CPU/4 thread(s) 5167 ns 5333 ns 0.97
batchedmm(16, Bsize=32)/forward/CPU/8 thread(s) 7667 ns 7208 ns 1.06
batchedmm(16, Bsize=32)/forward/CPU/1 thread(s) 4563 ns 6729 ns 0.68
batchedmm(16, Bsize=32)/forward/GPU/CUDA 16559 ns 15434 ns 1.07
batchedmm(16, Bsize=32)/forward/GPU/oneAPI 73950991 ns 73679048 ns 1.00
batchedmm(16, Bsize=32)/forward/GPU/AMDGPU 80275.5 ns 79381 ns 1.01
batchedmm(16, Bsize=32)/zygote/CPU/2 thread(s) 11958 ns 12375 ns 0.97
batchedmm(16, Bsize=32)/zygote/CPU/4 thread(s) 10750 ns 11000 ns 0.98
batchedmm(16, Bsize=32)/zygote/CPU/8 thread(s) 11583 ns 10875 ns 1.07
batchedmm(16, Bsize=32)/zygote/CPU/1 thread(s) 18167 ns 16625 ns 1.09
batchedmm(16, Bsize=32)/zygote/GPU/CUDA 203546 ns 108121 ns 1.88
batchedmm(16, Bsize=32)/zygote/GPU/oneAPI 98437217 ns 100453387 ns 0.98
batchedmm(16, Bsize=32)/zygote/GPU/AMDGPU 367244 ns 364504 ns 1.01
batchedmm(16, Bsize=128)/forward/CPU/2 thread(s) 38958 ns 39375 ns 0.99
batchedmm(16, Bsize=128)/forward/CPU/4 thread(s) 51125 ns 51917 ns 0.98
batchedmm(16, Bsize=128)/forward/CPU/8 thread(s) 52458 ns 52770.5 ns 0.99
batchedmm(16, Bsize=128)/forward/CPU/1 thread(s) 13770.5 ns 13604 ns 1.01
batchedmm(16, Bsize=128)/forward/GPU/CUDA 20666.5 ns 20011 ns 1.03
batchedmm(16, Bsize=128)/forward/GPU/oneAPI 77382892 ns 79258230 ns 0.98
batchedmm(16, Bsize=128)/forward/GPU/AMDGPU 87361 ns 85481 ns 1.02
batchedmm(16, Bsize=128)/zygote/CPU/2 thread(s) 36416 ns 36271 ns 1.00
batchedmm(16, Bsize=128)/zygote/CPU/4 thread(s) 30770.5 ns 35313 ns 0.87
batchedmm(16, Bsize=128)/zygote/CPU/8 thread(s) 35250 ns 31291.5 ns 1.13
batchedmm(16, Bsize=128)/zygote/CPU/1 thread(s) 58812.5 ns 57750 ns 1.02
batchedmm(16, Bsize=128)/zygote/GPU/CUDA 180756 ns 121997.5 ns 1.48
batchedmm(16, Bsize=128)/zygote/GPU/oneAPI 110794943 ns 113144013 ns 0.98
batchedmm(16, Bsize=128)/zygote/GPU/AMDGPU 408754 ns 410244.5 ns 1.00
bias_activation(2, act=tanh)(2 x 128)/forward/CPU/2 thread(s) 1750 ns 1584 ns 1.10
bias_activation(2, act=tanh)(2 x 128)/forward/CPU/4 thread(s) 1875 ns 1750 ns 1.07
bias_activation(2, act=tanh)(2 x 128)/forward/CPU/8 thread(s) 2125 ns 2250 ns 0.94
bias_activation(2, act=tanh)(2 x 128)/forward/CPU/1 thread(s) 1833 ns 1687.5 ns 1.09
bias_activation(2, act=tanh)(2 x 128)/forward/GPU/CUDA 19320 ns 13818 ns 1.40
bias_activation(2, act=tanh)(2 x 128)/forward/GPU/oneAPI 1202099 ns 1224877 ns 0.98
bias_activation(2, act=tanh)(2 x 128)/forward/GPU/AMDGPU 33080 ns 32640 ns 1.01
bias_activation(2, act=tanh)(2 x 128)/zygote/CPU/2 thread(s) 2395.5 ns 2166 ns 1.11
bias_activation(2, act=tanh)(2 x 128)/zygote/CPU/4 thread(s) 2333 ns 2292 ns 1.02
bias_activation(2, act=tanh)(2 x 128)/zygote/CPU/8 thread(s) 2375 ns 2417 ns 0.98
bias_activation(2, act=tanh)(2 x 128)/zygote/CPU/1 thread(s) 2375 ns 2250 ns 1.06
bias_activation(2, act=tanh)(2 x 128)/zygote/GPU/CUDA 193868.5 ns 89827 ns 2.16
bias_activation(2, act=tanh)(2 x 128)/zygote/GPU/oneAPI 9197836 ns 9149897 ns 1.01
bias_activation(2, act=tanh)(2 x 128)/zygote/GPU/AMDGPU 137016.5 ns 136461 ns 1.00
layernorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/2 thread(s) 5292 ns 5666.5 ns 0.93
layernorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/4 thread(s) 5750 ns 4896 ns 1.17
layernorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/8 thread(s) 6208 ns 6333.5 ns 0.98
layernorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/1 thread(s) 5958.5 ns 5375 ns 1.11
layernorm(2, act=identity, affine=true)(4 x 32)/forward/GPU/CUDA 139204.5 ns 59437.5 ns 2.34
layernorm(2, act=identity, affine=true)(4 x 32)/forward/GPU/oneAPI 5728892 ns 5810721.5 ns 0.99
layernorm(2, act=identity, affine=true)(4 x 32)/forward/GPU/AMDGPU 69071 ns 68755.5 ns 1.00
layernorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/2 thread(s) 8667 ns 8167 ns 1.06
layernorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/4 thread(s) 8625 ns 8542 ns 1.01
layernorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/8 thread(s) 8792 ns 8500 ns 1.03
layernorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/1 thread(s) 9145.5 ns 9042 ns 1.01
layernorm(2, act=identity, affine=true)(4 x 32)/zygote/GPU/CUDA 809144 ns 383098.5 ns 2.11
layernorm(2, act=identity, affine=true)(4 x 32)/zygote/GPU/oneAPI 39925856 ns 38586019 ns 1.03
layernorm(2, act=identity, affine=true)(4 x 32)/zygote/GPU/AMDGPU 387074 ns 387674 ns 1.00
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 55125 ns 56708 ns 0.97
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 55958 ns 57666 ns 0.97
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 56042 ns 57625 ns 0.97
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 56208 ns 58250 ns 0.96
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/CUDA 35813.5 ns 30235 ns 1.18
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/oneAPI 1246242 ns 1254024 ns 0.99
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/AMDGPU 202752 ns 204092 ns 0.99
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 489125 ns 448000 ns 1.09
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 532541.5 ns 472083.5 ns 1.13
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 505645.5 ns 465125 ns 1.09
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 470521 ns 436541.5 ns 1.08
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 253767 ns 170026 ns 1.49
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/oneAPI 26667416 ns 28109365.5 ns 0.95
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/AMDGPU 833929 ns 826388 ns 1.01
batchedmm(128, Bsize=128)/forward/CPU/2 thread(s) 3319083.5 ns 3312500 ns 1.00
batchedmm(128, Bsize=128)/forward/CPU/4 thread(s) 2337292 ns 2340084 ns 1.00
batchedmm(128, Bsize=128)/forward/CPU/8 thread(s) 2337917 ns 2339583.5 ns 1.00
batchedmm(128, Bsize=128)/forward/CPU/1 thread(s) 6313500 ns 6318792 ns 1.00
batchedmm(128, Bsize=128)/forward/GPU/CUDA 204383 ns 204725 ns 1.00
batchedmm(128, Bsize=128)/forward/GPU/oneAPI 80623182 ns 83409682 ns 0.97
batchedmm(128, Bsize=128)/forward/GPU/AMDGPU 213737 ns 240632 ns 0.89
batchedmm(128, Bsize=128)/zygote/CPU/2 thread(s) 11497229 ns 11441604 ns 1.00
batchedmm(128, Bsize=128)/zygote/CPU/4 thread(s) 8328208.5 ns 8301208 ns 1.00
batchedmm(128, Bsize=128)/zygote/CPU/8 thread(s) 8338541.5 ns 8329792 ns 1.00
batchedmm(128, Bsize=128)/zygote/CPU/1 thread(s) 21078124.5 ns 21184729.5 ns 0.99
batchedmm(128, Bsize=128)/zygote/GPU/CUDA 737191.5 ns 760406.5 ns 0.97
batchedmm(128, Bsize=128)/zygote/GPU/oneAPI 126245472 ns 125395684.5 ns 1.01
batchedmm(128, Bsize=128)/zygote/GPU/AMDGPU 1058001 ns 1063686 ns 0.99
layernorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/2 thread(s) 4750 ns 5666 ns 0.84
layernorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/4 thread(s) 6875 ns 5604.5 ns 1.23
layernorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/8 thread(s) 6874.5 ns 6438 ns 1.07
layernorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/1 thread(s) 6791.5 ns 6312.5 ns 1.08
layernorm(2, act=identity, affine=false)(4 x 32)/forward/GPU/CUDA 130168 ns 57453 ns 2.27
layernorm(2, act=identity, affine=false)(4 x 32)/forward/GPU/oneAPI 5745155 ns 5296827 ns 1.08
layernorm(2, act=identity, affine=false)(4 x 32)/forward/GPU/AMDGPU 56791 ns 56241 ns 1.01
layernorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/2 thread(s) 7333 ns 7125 ns 1.03
layernorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/4 thread(s) 7312.5 ns 7333 ns 1.00
layernorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/8 thread(s) 9042 ns 7250 ns 1.25
layernorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/1 thread(s) 7375 ns 8292 ns 0.89
layernorm(2, act=identity, affine=false)(4 x 32)/zygote/GPU/CUDA 712471 ns 367190 ns 1.94
layernorm(2, act=identity, affine=false)(4 x 32)/zygote/GPU/oneAPI 35913527 ns 35508394 ns 1.01
layernorm(2, act=identity, affine=false)(4 x 32)/zygote/GPU/AMDGPU 368228.5 ns 362159 ns 1.02
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 150042 ns 140708 ns 1.07
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 93750 ns 123917 ns 0.76
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 126666 ns 100667 ns 1.26
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 97708 ns 104958 ns 0.93
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/CUDA 148678 ns 127546.5 ns 1.17
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/oneAPI 5748457.5 ns 6179687.5 ns 0.93
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/AMDGPU 203522 ns 206197 ns 0.99
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 2036375 ns 1992625 ns 1.02
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 2027000.5 ns 2016083.5 ns 1.01
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 2032104 ns 2019875 ns 1.01
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 2023625 ns 2026687.5 ns 1.00
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 663877 ns 432468 ns 1.54
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/oneAPI 33751272 ns 33529611.5 ns 1.01
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/AMDGPU 1110211 ns 1184812.5 ns 0.94
batchedmm(2, Bsize=4)/forward/CPU/2 thread(s) 34208 ns 32208.5 ns 1.06
batchedmm(2, Bsize=4)/forward/CPU/4 thread(s) 36458 ns 37167 ns 0.98
batchedmm(2, Bsize=4)/forward/CPU/8 thread(s) 36083 ns 35833 ns 1.01
batchedmm(2, Bsize=4)/forward/CPU/1 thread(s) 708 ns 583 ns 1.21
batchedmm(2, Bsize=4)/forward/GPU/CUDA 15530 ns 13995 ns 1.11
batchedmm(2, Bsize=4)/forward/GPU/oneAPI 73822262.5 ns 74212471 ns 0.99
batchedmm(2, Bsize=4)/forward/GPU/AMDGPU 78911 ns 79370 ns 0.99
batchedmm(2, Bsize=4)/zygote/CPU/2 thread(s) 2542 ns 2645.5 ns 0.96
batchedmm(2, Bsize=4)/zygote/CPU/4 thread(s) 2833.5 ns 2750 ns 1.03
batchedmm(2, Bsize=4)/zygote/CPU/8 thread(s) 3500 ns 3020.5 ns 1.16
batchedmm(2, Bsize=4)/zygote/CPU/1 thread(s) 2209 ns 2333 ns 0.95
batchedmm(2, Bsize=4)/zygote/GPU/CUDA 136004.5 ns 92140 ns 1.48
batchedmm(2, Bsize=4)/zygote/GPU/oneAPI 93721653 ns 94219800 ns 0.99
batchedmm(2, Bsize=4)/zygote/GPU/AMDGPU 339263 ns 341683 ns 0.99
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 7250 ns 7167 ns 1.01
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 6084 ns 6000 ns 1.01
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 6083 ns 6083 ns 1
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 10125 ns 10000 ns 1.01
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/GPU/CUDA 35188 ns 29283 ns 1.20
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/GPU/oneAPI 1207919 ns 1222925.5 ns 0.99
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/GPU/AMDGPU 48090 ns 48131 ns 1.00
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 244041.5 ns 248271 ns 0.98
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 227416.5 ns 221125 ns 1.03
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 224625 ns 221042 ns 1.02
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 216417 ns 216625 ns 1.00
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 236066 ns 158765.5 ns 1.49
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/oneAPI 28254417 ns 26714332 ns 1.06
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/AMDGPU 573176 ns 569975.5 ns 1.01
dense(32, bias=true, act=identity)(32 x 128)/forward/CPU/2 thread(s) 3917 ns 3959 ns 0.99
dense(32, bias=true, act=identity)(32 x 128)/forward/CPU/4 thread(s) 3958 ns 4000 ns 0.99
dense(32, bias=true, act=identity)(32 x 128)/forward/CPU/8 thread(s) 3958 ns 3958 ns 1
dense(32, bias=true, act=identity)(32 x 128)/forward/CPU/1 thread(s) 3917 ns 3917 ns 1
dense(32, bias=true, act=identity)(32 x 128)/forward/GPU/CUDA 21615 ns 18821 ns 1.15
dense(32, bias=true, act=identity)(32 x 128)/forward/GPU/oneAPI 2072507 ns 2189549 ns 0.95
dense(32, bias=true, act=identity)(32 x 128)/forward/GPU/AMDGPU 42031 ns 41970 ns 1.00
dense(32, bias=true, act=identity)(32 x 128)/zygote/CPU/2 thread(s) 14666 ns 14958 ns 0.98
dense(32, bias=true, act=identity)(32 x 128)/zygote/CPU/4 thread(s) 14958 ns 15125 ns 0.99
dense(32, bias=true, act=identity)(32 x 128)/zygote/CPU/8 thread(s) 14916.5 ns 14917 ns 1.00
dense(32, bias=true, act=identity)(32 x 128)/zygote/CPU/1 thread(s) 14917 ns 14666 ns 1.02
dense(32, bias=true, act=identity)(32 x 128)/zygote/GPU/CUDA 297040 ns 163909.5 ns 1.81
dense(32, bias=true, act=identity)(32 x 128)/zygote/GPU/oneAPI 11259133 ns 11534435 ns 0.98
dense(32, bias=true, act=identity)(32 x 128)/zygote/GPU/AMDGPU 188487 ns 192582 ns 0.98
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 120896 ns 146416 ns 0.83
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 103687.5 ns 103750 ns 1.00
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 130792 ns 103791 ns 1.26
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 100583 ns 100208 ns 1.00
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/GPU/CUDA 149147 ns 127104 ns 1.17
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/GPU/oneAPI 6201158 ns 6092161.5 ns 1.02
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/GPU/AMDGPU 204362 ns 207422.5 ns 0.99
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 1925625 ns 1791000 ns 1.08
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 1922584 ns 1909958 ns 1.01
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 1924687.5 ns 1910875 ns 1.01
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 1918000 ns 1922250 ns 1.00
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 656144 ns 418877 ns 1.57
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/oneAPI 29883253.5 ns 29586225 ns 1.01
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/AMDGPU 1218242.5 ns 1089381 ns 1.12
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 19042 ns 17291 ns 1.10
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 21375 ns 22583 ns 0.95
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 20375 ns 21062.5 ns 0.97
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 18625 ns 17417 ns 1.07
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/CUDA 102936.5 ns 61422.5 ns 1.68
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/oneAPI 3227536 ns 3492174 ns 0.92
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/AMDGPU 80560.5 ns 80420 ns 1.00
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 216541.5 ns 216354.5 ns 1.00
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 239771 ns 256145.5 ns 0.94
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 223709 ns 216500 ns 1.03
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 247021 ns 219146 ns 1.13
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 493608 ns 272581 ns 1.81
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/oneAPI 19781802 ns 19535498.5 ns 1.01
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/AMDGPU 479335 ns 477435 ns 1.00
batchedmm(16, Bsize=4)/forward/CPU/2 thread(s) 24792 ns 26813 ns 0.92
batchedmm(16, Bsize=4)/forward/CPU/4 thread(s) 30625 ns 31333 ns 0.98
batchedmm(16, Bsize=4)/forward/CPU/8 thread(s) 29334 ns 28812.5 ns 1.02
batchedmm(16, Bsize=4)/forward/CPU/1 thread(s) 1291 ns 1312 ns 0.98
batchedmm(16, Bsize=4)/forward/GPU/CUDA 15803 ns 14764 ns 1.07
batchedmm(16, Bsize=4)/forward/GPU/oneAPI 73776659 ns 75193108 ns 0.98
batchedmm(16, Bsize=4)/forward/GPU/AMDGPU 81571 ns 81295.5 ns 1.00
batchedmm(16, Bsize=4)/zygote/CPU/2 thread(s) 4770.5 ns 5000 ns 0.95
batchedmm(16, Bsize=4)/zygote/CPU/4 thread(s) 5125 ns 4833.5 ns 1.06
batchedmm(16, Bsize=4)/zygote/CPU/8 thread(s) 5396 ns 5083.5 ns 1.06
batchedmm(16, Bsize=4)/zygote/CPU/1 thread(s) 4500 ns 4854 ns 0.93
batchedmm(16, Bsize=4)/zygote/GPU/CUDA 200310 ns 110219 ns 1.82
batchedmm(16, Bsize=4)/zygote/GPU/oneAPI 93984240 ns 96235287 ns 0.98
batchedmm(16, Bsize=4)/zygote/GPU/AMDGPU 379744 ns 380423.5 ns 1.00
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 226292 ns 304917 ns 0.74
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 223000 ns 306417 ns 0.73
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 224167 ns 307500 ns 0.73
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 224042 ns 306312 ns 0.73
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/CUDA 218225 ns 96559 ns 2.26
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/oneAPI 7741481.5 ns 8040746 ns 0.96
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/AMDGPU 274597.5 ns 273553 ns 1.00
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 510000 ns 534959 ns 0.95
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 507812.5 ns 578875 ns 0.88
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 554604 ns 532250 ns 1.04
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 496958 ns 532292 ns 0.93
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 1023666.5 ns 478700.5 ns 2.14
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/oneAPI 43947953 ns 45273096.5 ns 0.97
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/AMDGPU 871339 ns 854594 ns 1.02
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 21084 ns 18833 ns 1.12
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 19896 ns 21500 ns 0.93
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 21792 ns 21500 ns 1.01
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 20375 ns 18729 ns 1.09
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/GPU/CUDA 110346 ns 61059 ns 1.81
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/GPU/oneAPI 3666139 ns 3648054 ns 1.00
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/GPU/AMDGPU 79030 ns 79701 ns 0.99
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 212625 ns 225292 ns 0.94
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 220750 ns 215459 ns 1.02
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 216750 ns 214416 ns 1.01
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 215292 ns 215625 ns 1.00
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 706350 ns 315781.5 ns 2.24
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/oneAPI 24785967 ns 25685453.5 ns 0.96
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/AMDGPU 536195 ns 536640.5 ns 1.00
layernorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/2 thread(s) 6292 ns 6875 ns 0.92
layernorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/4 thread(s) 6500 ns 6729 ns 0.97
layernorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/8 thread(s) 8667 ns 7875.5 ns 1.10
layernorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/1 thread(s) 7083 ns 6187 ns 1.14
layernorm(2, act=identity, affine=false)(32 x 32)/forward/GPU/CUDA 133065.5 ns 59473 ns 2.24
layernorm(2, act=identity, affine=false)(32 x 32)/forward/GPU/oneAPI 5772071 ns 5742399 ns 1.01
layernorm(2, act=identity, affine=false)(32 x 32)/forward/GPU/AMDGPU 65491 ns 65660 ns 1.00
layernorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/2 thread(s) 10416 ns 9875 ns 1.05
layernorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/4 thread(s) 9750 ns 10541.5 ns 0.92
layernorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/8 thread(s) 10291 ns 10542 ns 0.98
layernorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/1 thread(s) 10959 ns 11395.5 ns 0.96
layernorm(2, act=identity, affine=false)(32 x 32)/zygote/GPU/CUDA 769680.5 ns 375474.5 ns 2.05
layernorm(2, act=identity, affine=false)(32 x 32)/zygote/GPU/oneAPI 39162885 ns 37560344 ns 1.04
layernorm(2, act=identity, affine=false)(32 x 32)/zygote/GPU/AMDGPU 386734 ns 385404 ns 1.00
layernorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/2 thread(s) 5333 ns 4958 ns 1.08
layernorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/4 thread(s) 5792 ns 5792 ns 1
layernorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/8 thread(s) 7416 ns 6937 ns 1.07
layernorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/1 thread(s) 6833 ns 4813 ns 1.42
layernorm(2, act=relu, affine=false)(4 x 32)/forward/GPU/CUDA 135171 ns 59336 ns 2.28
layernorm(2, act=relu, affine=false)(4 x 32)/forward/GPU/oneAPI 5588783 ns 5881412.5 ns 0.95
layernorm(2, act=relu, affine=false)(4 x 32)/forward/GPU/AMDGPU 68351 ns 66901 ns 1.02
layernorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/2 thread(s) 7666 ns 7333 ns 1.05
layernorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/4 thread(s) 7375 ns 7292 ns 1.01
layernorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/8 thread(s) 7646 ns 7625 ns 1.00
layernorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/1 thread(s) 7458 ns 7917 ns 0.94
layernorm(2, act=relu, affine=false)(4 x 32)/zygote/GPU/CUDA 735742.5 ns 400389 ns 1.84
layernorm(2, act=relu, affine=false)(4 x 32)/zygote/GPU/oneAPI 40095077.5 ns 41438719.5 ns 0.97
layernorm(2, act=relu, affine=false)(4 x 32)/zygote/GPU/AMDGPU 390854 ns 390804 ns 1.00
batchedmm(128, Bsize=512)/forward/CPU/2 thread(s) 14524375 ns 14514708 ns 1.00
batchedmm(128, Bsize=512)/forward/CPU/4 thread(s) 10107916 ns 10142334 ns 1.00
batchedmm(128, Bsize=512)/forward/CPU/8 thread(s) 10121667 ns 10128041 ns 1.00
batchedmm(128, Bsize=512)/forward/CPU/1 thread(s) 27752459 ns 27891250 ns 1.00
batchedmm(128, Bsize=512)/forward/GPU/CUDA 528552 ns 532579.5 ns 0.99
batchedmm(128, Bsize=512)/forward/GPU/oneAPI 98194699 ns 99192089 ns 0.99
batchedmm(128, Bsize=512)/forward/GPU/AMDGPU 402574 ns 394344 ns 1.02
batchedmm(128, Bsize=512)/zygote/CPU/2 thread(s) 46533666.5 ns 46256625 ns 1.01
batchedmm(128, Bsize=512)/zygote/CPU/4 thread(s) 33493083 ns 33475978.5 ns 1.00
batchedmm(128, Bsize=512)/zygote/CPU/8 thread(s) 33509291 ns 33502666 ns 1.00
batchedmm(128, Bsize=512)/zygote/CPU/1 thread(s) 85143125 ns 85530791 ns 1.00
batchedmm(128, Bsize=512)/zygote/GPU/CUDA 2860259.5 ns 3411776.5 ns 0.84
batchedmm(128, Bsize=512)/zygote/GPU/oneAPI 194954753.5 ns 197868624 ns 0.99
batchedmm(128, Bsize=512)/zygote/GPU/AMDGPU 3304313.5 ns 3281874 ns 1.01
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 67916 ns 66792 ns 1.02
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 66833.5 ns 67791 ns 0.99
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 69271 ns 69583 ns 1.00
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 70875 ns 66542 ns 1.07
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/CUDA 101002.5 ns 63585 ns 1.59
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/oneAPI 3722652 ns 3635639 ns 1.02
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/AMDGPU 234933 ns 238943 ns 0.98
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 480708.5 ns 482792 ns 1.00
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 482250 ns 490208.5 ns 0.98
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 478395.5 ns 443416 ns 1.08
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 469167 ns 443250 ns 1.06
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 703449 ns 333625.5 ns 2.11
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/oneAPI 28256303 ns 27824814.5 ns 1.02
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/AMDGPU 791863.5 ns 796928.5 ns 0.99
batchnorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/2 thread(s) 500 ns 500 ns 1
batchnorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/4 thread(s) 583 ns 584 ns 1.00
batchnorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/8 thread(s) 625 ns 584 ns 1.07
batchnorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/1 thread(s) 584 ns 541 ns 1.08
batchnorm(2, act=relu, affine=false)(32 x 32)/forward/GPU/CUDA 32037 ns 26261 ns 1.22
batchnorm(2, act=relu, affine=false)(32 x 32)/forward/GPU/oneAPI 1235389 ns 1201042 ns 1.03
batchnorm(2, act=relu, affine=false)(32 x 32)/forward/GPU/AMDGPU 49701 ns 46591 ns 1.07
batchnorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/2 thread(s) 8959 ns 9624.5 ns 0.93
batchnorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/4 thread(s) 8667 ns 9458 ns 0.92
batchnorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/8 thread(s) 9562.5 ns 9042 ns 1.06
batchnorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/1 thread(s) 8250 ns 15042 ns 0.55
batchnorm(2, act=relu, affine=false)(32 x 32)/zygote/GPU/CUDA 277434 ns 154087.5 ns 1.80
batchnorm(2, act=relu, affine=false)(32 x 32)/zygote/GPU/oneAPI 22231771.5 ns 22365155 ns 0.99
batchnorm(2, act=relu, affine=false)(32 x 32)/zygote/GPU/AMDGPU 376664 ns 376374 ns 1.00
dense(32, bias=false, act=gelu)(32 x 128)/forward/CPU/2 thread(s) 9625 ns 9792 ns 0.98
dense(32, bias=false, act=gelu)(32 x 128)/forward/CPU/4 thread(s) 9708 ns 9875 ns 0.98
dense(32, bias=false, act=gelu)(32 x 128)/forward/CPU/8 thread(s) 9667 ns 9834 ns 0.98
dense(32, bias=false, act=gelu)(32 x 128)/forward/CPU/1 thread(s) 9625 ns 9792 ns 0.98
dense(32, bias=false, act=gelu)(32 x 128)/forward/GPU/CUDA 23071 ns 21245 ns 1.09
dense(32, bias=false, act=gelu)(32 x 128)/forward/GPU/oneAPI 2068828 ns 2109275 ns 0.98
dense(32, bias=false, act=gelu)(32 x 128)/forward/GPU/AMDGPU 210172 ns 207407 ns 1.01
dense(32, bias=false, act=gelu)(32 x 128)/zygote/CPU/2 thread(s) 50291 ns 45834 ns 1.10
dense(32, bias=false, act=gelu)(32 x 128)/zygote/CPU/4 thread(s) 50417 ns 46083 ns 1.09
dense(32, bias=false, act=gelu)(32 x 128)/zygote/CPU/8 thread(s) 50875 ns 48125 ns 1.06
dense(32, bias=false, act=gelu)(32 x 128)/zygote/CPU/1 thread(s) 50583 ns 46000 ns 1.10
dense(32, bias=false, act=gelu)(32 x 128)/zygote/GPU/CUDA 274332 ns 181985 ns 1.51
dense(32, bias=false, act=gelu)(32 x 128)/zygote/GPU/oneAPI 12829198 ns 12501764 ns 1.03
dense(32, bias=false, act=gelu)(32 x 128)/zygote/GPU/AMDGPU 609897 ns 599026 ns 1.02
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 55125 ns 56292 ns 0.98
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 55875 ns 57208 ns 0.98
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 55917 ns 57083 ns 0.98
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 55917 ns 57875 ns 0.97
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/CUDA 27736 ns 22599 ns 1.23
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/oneAPI 1172928.5 ns 1231700.5 ns 0.95
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/AMDGPU 203487.5 ns 210062.5 ns 0.97
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 530167 ns 491041.5 ns 1.08
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 505208 ns 503250 ns 1.00
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 508854.5 ns 465875 ns 1.09
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 467249.5 ns 440959 ns 1.06
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 234697 ns 153666 ns 1.53
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/oneAPI 33825831.5 ns 33436886 ns 1.01
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/AMDGPU 884779 ns 880644 ns 1.00
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 652145.5 ns 646396 ns 1.01
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 645666 ns 656479 ns 0.98
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 651250 ns 592854.5 ns 1.10
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 641645.5 ns 616145.5 ns 1.04
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/GPU/CUDA 186754 ns 128259.5 ns 1.46
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/GPU/oneAPI 8762924.5 ns 8403444.5 ns 1.04
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/GPU/AMDGPU 301613 ns 302363 ns 1.00
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 2247709 ns 2232145.5 ns 1.01
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 2260312.5 ns 2230708 ns 1.01
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 2234500 ns 2231875 ns 1.00
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 2220417 ns 2259375 ns 0.98
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 904539 ns 617840 ns 1.46
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/oneAPI 49644380 ns 50658009 ns 0.98
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/AMDGPU 1208692 ns 1318863 ns 0.92
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 21125 ns 22542 ns 0.94
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 23708.5 ns 19458 ns 1.22
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 22229 ns 22583 ns 0.98
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 22083 ns 19458 ns 1.13
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/CUDA 106895 ns 64266 ns 1.66
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/oneAPI 3512219 ns 3671624.5 ns 0.96
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/AMDGPU 79091 ns 79151 ns 1.00
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 265500 ns 224000 ns 1.19
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 222041 ns 254083 ns 0.87
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 232667 ns 221708 ns 1.05
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 258208 ns 220750 ns 1.17
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 700202.5 ns 347077 ns 2.02
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/oneAPI 26916118 ns 25817148 ns 1.04
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/AMDGPU 555335 ns 554175.5 ns 1.00
batchnorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/2 thread(s) 500 ns 541 ns 0.92
batchnorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/4 thread(s) 583 ns 583 ns 1
batchnorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/8 thread(s) 625 ns 584 ns 1.07
batchnorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/1 thread(s) 583 ns 500 ns 1.17
batchnorm(2, act=relu, affine=true)(32 x 32)/forward/GPU/CUDA 22789 ns 18626 ns 1.22
batchnorm(2, act=relu, affine=true)(32 x 32)/forward/GPU/oneAPI 1272891.5 ns 1230354 ns 1.03
batchnorm(2, act=relu, affine=true)(32 x 32)/forward/GPU/AMDGPU 48001 ns 48171 ns 1.00
batchnorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/2 thread(s) 9083 ns 8917 ns 1.02
batchnorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/4 thread(s) 9083 ns 9875 ns 0.92
batchnorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/8 thread(s) 9938 ns 9812.5 ns 1.01
batchnorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/1 thread(s) 9875 ns 9270.5 ns 1.07
batchnorm(2, act=relu, affine=true)(32 x 32)/zygote/GPU/CUDA 259037 ns 136039.5 ns 1.90
batchnorm(2, act=relu, affine=true)(32 x 32)/zygote/GPU/oneAPI 24963258.5 ns 29131754 ns 0.86
batchnorm(2, act=relu, affine=true)(32 x 32)/zygote/GPU/AMDGPU 395944 ns 399044 ns 0.99
groupnorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/2 thread(s) 7875 ns 8042 ns 0.98
groupnorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/4 thread(s) 10062.5 ns 9312.5 ns 1.08
groupnorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/8 thread(s) 10520.5 ns 11292 ns 0.93
groupnorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/1 thread(s) 10583 ns 8334 ns 1.27
groupnorm(2, act=identity, affine=false)(4 x 32)/forward/GPU/CUDA 113441 ns 57821.5 ns 1.96
groupnorm(2, act=identity, affine=false)(4 x 32)/forward/GPU/oneAPI 3336239 ns 3436196.5 ns 0.97
groupnorm(2, act=identity, affine=false)(4 x 32)/forward/GPU/AMDGPU 70210 ns 69710.5 ns 1.01
groupnorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/2 thread(s) 7833.5 ns 7125 ns 1.10
groupnorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/4 thread(s) 7625 ns 7791 ns 0.98
groupnorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/8 thread(s) 7959 ns 7875 ns 1.01
groupnorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/1 thread(s) 7708 ns 7417 ns 1.04
groupnorm(2, act=identity, affine=false)(4 x 32)/zygote/GPU/CUDA 472332 ns 255977.5 ns 1.85
groupnorm(2, act=identity, affine=false)(4 x 32)/zygote/GPU/oneAPI 17695066.5 ns 18497059 ns 0.96
groupnorm(2, act=identity, affine=false)(4 x 32)/zygote/GPU/AMDGPU 319678 ns 319743 ns 1.00
bias_activation(2, act=gelu)(2 x 128)/forward/CPU/2 thread(s) 1625 ns 1375 ns 1.18
bias_activation(2, act=gelu)(2 x 128)/forward/CPU/4 thread(s) 1916 ns 1645.5 ns 1.16
bias_activation(2, act=gelu)(2 x 128)/forward/CPU/8 thread(s) 2020.5 ns 1917 ns 1.05
bias_activation(2, act=gelu)(2 x 128)/forward/CPU/1 thread(s) 1583 ns 1500 ns 1.06
bias_activation(2, act=gelu)(2 x 128)/forward/GPU/CUDA 19708 ns 13693.5 ns 1.44
bias_activation(2, act=gelu)(2 x 128)/forward/GPU/oneAPI 1164130 ns 1186814 ns 0.98
bias_activation(2, act=gelu)(2 x 128)/forward/GPU/AMDGPU 189672 ns 188882 ns 1.00
bias_activation(2, act=gelu)(2 x 128)/zygote/CPU/2 thread(s) 3584 ns 3291 ns 1.09
bias_activation(2, act=gelu)(2 x 128)/zygote/CPU/4 thread(s) 3750 ns 3479.5 ns 1.08
bias_activation(2, act=gelu)(2 x 128)/zygote/CPU/8 thread(s) 3792 ns 3625 ns 1.05
bias_activation(2, act=gelu)(2 x 128)/zygote/CPU/1 thread(s) 3584 ns 3291 ns 1.09
bias_activation(2, act=gelu)(2 x 128)/zygote/GPU/CUDA 208603 ns 104102.5 ns 2.00
bias_activation(2, act=gelu)(2 x 128)/zygote/GPU/oneAPI 10416766 ns 10382640 ns 1.00
bias_activation(2, act=gelu)(2 x 128)/zygote/GPU/AMDGPU 583536 ns 575736 ns 1.01
bias_activation(512, act=tanh)(512 x 128)/forward/CPU/2 thread(s) 148916.5 ns 146979 ns 1.01
bias_activation(512, act=tanh)(512 x 128)/forward/CPU/4 thread(s) 127916 ns 129042 ns 0.99
bias_activation(512, act=tanh)(512 x 128)/forward/CPU/8 thread(s) 130229 ns 129875 ns 1.00
bias_activation(512, act=tanh)(512 x 128)/forward/CPU/1 thread(s) 225208 ns 226000 ns 1.00
bias_activation(512, act=tanh)(512 x 128)/forward/GPU/CUDA 22520 ns 17312 ns 1.30
bias_activation(512, act=tanh)(512 x 128)/forward/GPU/oneAPI 1193772.5 ns 1216751.5 ns 0.98
bias_activation(512, act=tanh)(512 x 128)/forward/GPU/AMDGPU 40501 ns 39935.5 ns 1.01
bias_activation(512, act=tanh)(512 x 128)/zygote/CPU/2 thread(s) 160729.5 ns 159771 ns 1.01
bias_activation(512, act=tanh)(512 x 128)/zygote/CPU/4 thread(s) 123458 ns 110521 ns 1.12
bias_activation(512, act=tanh)(512 x 128)/zygote/CPU/8 thread(s) 114792 ns 136250 ns 0.84
bias_activation(512, act=tanh)(512 x 128)/zygote/CPU/1 thread(s) 264249.5 ns 251666.5 ns 1.05
bias_activation(512, act=tanh)(512 x 128)/zygote/GPU/CUDA 208808 ns 118480.5 ns 1.76
bias_activation(512, act=tanh)(512 x 128)/zygote/GPU/oneAPI 10974999 ns 10669966 ns 1.03
bias_activation(512, act=tanh)(512 x 128)/zygote/GPU/AMDGPU 268837.5 ns 265838 ns 1.01
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 7333 ns 7292 ns 1.01
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 5959 ns 6041 ns 0.99
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 5959 ns 6042 ns 0.99
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 10042 ns 10250 ns 0.98
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/CUDA 32455 ns 26774 ns 1.21
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/oneAPI 1162105 ns 1208039.5 ns 0.96
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/AMDGPU 50660 ns 48681 ns 1.04
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 231563 ns 219937.5 ns 1.05
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 235208 ns 227375 ns 1.03
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 235250 ns 228667 ns 1.03
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 214541.5 ns 212729.5 ns 1.01
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 252456 ns 177762.5 ns 1.42
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/oneAPI 25967285 ns 28372083 ns 0.92
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/AMDGPU 597316 ns 589856 ns 1.01
layernorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/2 thread(s) 12542 ns 15958 ns 0.79
layernorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/4 thread(s) 12833 ns 16208.5 ns 0.79
layernorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/8 thread(s) 14188 ns 16687.5 ns 0.85
layernorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/1 thread(s) 13625 ns 14792 ns 0.92
layernorm(2, act=gelu, affine=false)(32 x 32)/forward/GPU/CUDA 131390 ns 63622 ns 2.07
layernorm(2, act=gelu, affine=false)(32 x 32)/forward/GPU/oneAPI 5550478 ns 5760147.5 ns 0.96
layernorm(2, act=gelu, affine=false)(32 x 32)/forward/GPU/AMDGPU 233213 ns 227543 ns 1.02
layernorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/2 thread(s) 23666 ns 23916 ns 0.99
layernorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/4 thread(s) 23229.5 ns 24500 ns 0.95
layernorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/8 thread(s) 24625 ns 23458 ns 1.05
layernorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/1 thread(s) 23834 ns 23000 ns 1.04
layernorm(2, act=gelu, affine=false)(32 x 32)/zygote/GPU/CUDA 815266 ns 431176 ns 1.89
layernorm(2, act=gelu, affine=false)(32 x 32)/zygote/GPU/oneAPI 40999180 ns 42796325.5 ns 0.96
layernorm(2, act=gelu, affine=false)(32 x 32)/zygote/GPU/AMDGPU 686982 ns 675657 ns 1.02
groupnorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/2 thread(s) 8667 ns 9167 ns 0.95
groupnorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/4 thread(s) 10208 ns 9834 ns 1.04
groupnorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/8 thread(s) 10729 ns 11021 ns 0.97
groupnorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/1 thread(s) 9250 ns 8729.5 ns 1.06
groupnorm(2, act=identity, affine=true)(32 x 32)/forward/GPU/CUDA 116988 ns 64004 ns 1.83
groupnorm(2, act=identity, affine=true)(32 x 32)/forward/GPU/oneAPI 3575157.5 ns 3525023 ns 1.01
groupnorm(2, act=identity, affine=true)(32 x 32)/forward/GPU/AMDGPU 73990 ns 73491 ns 1.01
groupnorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/2 thread(s) 14459 ns 14292 ns 1.01
groupnorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/4 thread(s) 13833 ns 13729 ns 1.01
groupnorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/8 thread(s) 14395.5 ns 14208 ns 1.01
groupnorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/1 thread(s) 14250 ns 13459 ns 1.06
groupnorm(2, act=identity, affine=true)(32 x 32)/zygote/GPU/CUDA 625118 ns 323073 ns 1.93
groupnorm(2, act=identity, affine=true)(32 x 32)/zygote/GPU/oneAPI 21155361 ns 21480471 ns 0.98
groupnorm(2, act=identity, affine=true)(32 x 32)/zygote/GPU/AMDGPU 365734 ns 371404 ns 0.98
groupnorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/2 thread(s) 8959 ns 8083 ns 1.11
groupnorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/4 thread(s) 9709 ns 10416.5 ns 0.93
groupnorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/8 thread(s) 10854.5 ns 10937.5 ns 0.99
groupnorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/1 thread(s) 10291 ns 9333 ns 1.10
groupnorm(2, act=identity, affine=false)(32 x 32)/forward/GPU/CUDA 116936.5 ns 66250 ns 1.77
groupnorm(2, act=identity, affine=false)(32 x 32)/forward/GPU/oneAPI 3357605 ns 3712952 ns 0.90
groupnorm(2, act=identity, affine=false)(32 x 32)/forward/GPU/AMDGPU 73371 ns 74871 ns 0.98
groupnorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/2 thread(s) 12458 ns 12708 ns 0.98
groupnorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/4 thread(s) 12270.5 ns 13020.5 ns 0.94
groupnorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/8 thread(s) 12937.5 ns 13333.5 ns 0.97
groupnorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/1 thread(s) 13291 ns 12417 ns 1.07
groupnorm(2, act=identity, affine=false)(32 x 32)/zygote/GPU/CUDA 515797 ns 286792 ns 1.80
groupnorm(2, act=identity, affine=false)(32 x 32)/zygote/GPU/oneAPI 18556708 ns 19725639 ns 0.94
groupnorm(2, act=identity, affine=false)(32 x 32)/zygote/GPU/AMDGPU 340984 ns 340593.5 ns 1.00
batchedmm(2, Bsize=128)/forward/CPU/2 thread(s) 27562 ns 29166 ns 0.95
batchedmm(2, Bsize=128)/forward/CPU/4 thread(s) 33833.5 ns 34604 ns 0.98
batchedmm(2, Bsize=128)/forward/CPU/8 thread(s) 31542 ns 32229.5 ns 0.98
batchedmm(2, Bsize=128)/forward/CPU/1 thread(s) 1750 ns 1750 ns 1
batchedmm(2, Bsize=128)/forward/GPU/CUDA 16227 ns 15001 ns 1.08
batchedmm(2, Bsize=128)/forward/GPU/oneAPI 78590127 ns 78965877 ns 1.00
batchedmm(2, Bsize=128)/forward/GPU/AMDGPU 81231 ns 86890 ns 0.93
batchedmm(2, Bsize=128)/zygote/CPU/2 thread(s) 5291.5 ns 5125 ns 1.03
batchedmm(2, Bsize=128)/zygote/CPU/4 thread(s) 4979.5 ns 5062.5 ns 0.98
batchedmm(2, Bsize=128)/zygote/CPU/8 thread(s) 5416 ns 5167 ns 1.05
batchedmm(2, Bsize=128)/zygote/CPU/1 thread(s) 6458 ns 6292 ns 1.03
batchedmm(2, Bsize=128)/zygote/GPU/CUDA 135144 ns 99425.5 ns 1.36
batchedmm(2, Bsize=128)/zygote/GPU/oneAPI 109913428 ns 110379934 ns 1.00
batchedmm(2, Bsize=128)/zygote/GPU/AMDGPU 370274 ns 383544 ns 0.97
batchnorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/2 thread(s) 250 ns 292 ns 0.86
batchnorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/4 thread(s) 375 ns 334 ns 1.12
batchnorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/8 thread(s) 375 ns 375 ns 1
batchnorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/1 thread(s) 375 ns 292 ns 1.28
batchnorm(2, act=identity, affine=true)(4 x 32)/forward/GPU/CUDA 23995 ns 19905 ns 1.21
batchnorm(2, act=identity, affine=true)(4 x 32)/forward/GPU/oneAPI 1260687.5 ns 1150337 ns 1.10
batchnorm(2, act=identity, affine=true)(4 x 32)/forward/GPU/AMDGPU 47300 ns 48921 ns 0.97
batchnorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/2 thread(s) 6417 ns 6292 ns 1.02
batchnorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/4 thread(s) 6541 ns 6458 ns 1.01
batchnorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/8 thread(s) 6584 ns 6708 ns 0.98
batchnorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/1 thread(s) 6708 ns 6208 ns 1.08
batchnorm(2, act=identity, affine=true)(4 x 32)/zygote/GPU/CUDA 180494.5 ns 127212 ns 1.42
batchnorm(2, act=identity, affine=true)(4 x 32)/zygote/GPU/oneAPI 23985458.5 ns 23911059 ns 1.00
batchnorm(2, act=identity, affine=true)(4 x 32)/zygote/GPU/AMDGPU 386289 ns 394834 ns 0.98
batchnorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/2 thread(s) 2000 ns 1958 ns 1.02
batchnorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/4 thread(s) 2084 ns 2041 ns 1.02
batchnorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/8 thread(s) 2166 ns 2042 ns 1.06
batchnorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/1 thread(s) 2125 ns 1958 ns 1.09
batchnorm(2, act=gelu, affine=true)(32 x 32)/forward/GPU/CUDA 25421 ns 20510 ns 1.24
batchnorm(2, act=gelu, affine=true)(32 x 32)/forward/GPU/oneAPI 1185083 ns 1241051 ns 0.95
batchnorm(2, act=gelu, affine=true)(32 x 32)/forward/GPU/AMDGPU 206112 ns 210527 ns 0.98
batchnorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/2 thread(s) 16896 ns 16896 ns 1
batchnorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/4 thread(s) 17000 ns 17125 ns 0.99
batchnorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/8 thread(s) 18333 ns 16750 ns 1.09
batchnorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/1 thread(s) 17292 ns 15875 ns 1.09
batchnorm(2, act=gelu, affine=true)(32 x 32)/zygote/GPU/CUDA 264717 ns 143434 ns 1.85
batchnorm(2, act=gelu, affine=true)(32 x 32)/zygote/GPU/oneAPI 25260741 ns 25814251 ns 0.98
batchnorm(2, act=gelu, affine=true)(32 x 32)/zygote/GPU/AMDGPU 702657 ns 704697 ns 1.00
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 166125 ns 174541 ns 0.95
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 177603.5 ns 147000 ns 1.21
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 148958 ns 152688 ns 0.98
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 148917 ns 150916 ns 0.99
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/CUDA 187074 ns 165022.5 ns 1.13
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/oneAPI 7946915 ns 7825451.5 ns 1.02
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/AMDGPU 226902 ns 226202.5 ns 1.00
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 1327854.5 ns 1318770.5 ns 1.01
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 1318125 ns 1320292 ns 1.00
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 1326521.5 ns 1329500 ns 1.00
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 1295625 ns 1333791.5 ns 0.97
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 844331.5 ns 605687 ns 1.39
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/oneAPI 47016714 ns 46439481 ns 1.01
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/AMDGPU 1001545 ns 1061786 ns 0.94
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 32583 ns 25084 ns 1.30
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 26000 ns 25042 ns 1.04
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 26541.5 ns 27854 ns 0.95
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 26124.5 ns 24749.5 ns 1.06
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/CUDA 226484 ns 119756.5 ns 1.89
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/oneAPI 7837953 ns 7727314 ns 1.01
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/AMDGPU 115201 ns 116991 ns 0.98
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 131875 ns 131479 ns 1.00
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 152250 ns 171708 ns 0.89
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 153750 ns 127521 ns 1.21
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 131625 ns 117479 ns 1.12
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 970881 ns 551436.5 ns 1.76
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/oneAPI 45298380 ns 45901726 ns 0.99
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/AMDGPU 614061 ns 610436 ns 1.01
batchnorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/2 thread(s) 250 ns 291 ns 0.86
batchnorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/4 thread(s) 334 ns 375 ns 0.89
batchnorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/8 thread(s) 375 ns 375 ns 1
batchnorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/1 thread(s) 334 ns 292 ns 1.14
batchnorm(2, act=relu, affine=true)(4 x 32)/forward/GPU/CUDA 22483 ns 17730.5 ns 1.27
batchnorm(2, act=relu, affine=true)(4 x 32)/forward/GPU/oneAPI 1212860.5 ns 1203450 ns 1.01
batchnorm(2, act=relu, affine=true)(4 x 32)/forward/GPU/AMDGPU 49500 ns 48751 ns 1.02
batchnorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/2 thread(s) 6459 ns 6416.5 ns 1.01
batchnorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/4 thread(s) 6417 ns 6542 ns 0.98
batchnorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/8 thread(s) 6750 ns 6833 ns 0.99
batchnorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/1 thread(s) 6563 ns 6167 ns 1.06
batchnorm(2, act=relu, affine=true)(4 x 32)/zygote/GPU/CUDA 197111 ns 136341 ns 1.45
batchnorm(2, act=relu, affine=true)(4 x 32)/zygote/GPU/oneAPI 25776994 ns 25006525.5 ns 1.03
batchnorm(2, act=relu, affine=true)(4 x 32)/zygote/GPU/AMDGPU 390483 ns 393949 ns 0.99
layernorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/2 thread(s) 5750 ns 6666 ns 0.86
layernorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/4 thread(s) 6458 ns 6625 ns 0.97
layernorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/8 thread(s) 6979 ns 6917 ns 1.01
layernorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/1 thread(s) 5333 ns 5666 ns 0.94
layernorm(2, act=gelu, affine=true)(4 x 32)/forward/GPU/CUDA 136326 ns 72501 ns 1.88
layernorm(2, act=gelu, affine=true)(4 x 32)/forward/GPU/oneAPI 5775885 ns 5996889 ns 0.96
layernorm(2, act=gelu, affine=true)(4 x 32)/forward/GPU/AMDGPU 234652 ns 233483 ns 1.01
layernorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/2 thread(s) 10000 ns 9917 ns 1.01
layernorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/4 thread(s) 10334 ns 10062.5 ns 1.03
layernorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/8 thread(s) 10500 ns 10250 ns 1.02
layernorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/1 thread(s) 10125 ns 9875 ns 1.03
layernorm(2, act=gelu, affine=true)(4 x 32)/zygote/GPU/CUDA 837095 ns 493431 ns 1.70
layernorm(2, act=gelu, affine=true)(4 x 32)/zygote/GPU/oneAPI 40912011 ns 41270237 ns 0.99
layernorm(2, act=gelu, affine=true)(4 x 32)/zygote/GPU/AMDGPU 671137 ns 675326 ns 0.99
dense(2, bias=false, act=gelu)(2 x 128)/forward/CPU/2 thread(s) 667 ns 666 ns 1.00
dense(2, bias=false, act=gelu)(2 x 128)/forward/CPU/4 thread(s) 667 ns 667 ns 1
dense(2, bias=false, act=gelu)(2 x 128)/forward/CPU/8 thread(s) 750 ns 667 ns 1.12
dense(2, bias=false, act=gelu)(2 x 128)/forward/CPU/1 thread(s) 708 ns 667 ns 1.06
dense(2, bias=false, act=gelu)(2 x 128)/forward/GPU/CUDA 22523 ns 20029 ns 1.12
dense(2, bias=false, act=gelu)(2 x 128)/forward/GPU/oneAPI 2081039 ns 2098576 ns 0.99
dense(2, bias=false, act=gelu)(2 x 128)/forward/GPU/AMDGPU 208153 ns 207872.5 ns 1.00
dense(2, bias=false, act=gelu)(2 x 128)/zygote/CPU/2 thread(s) 4833 ns 4542 ns 1.06
dense(2, bias=false, act=gelu)(2 x 128)/zygote/CPU/4 thread(s) 4917 ns 4625 ns 1.06
dense(2, bias=false, act=gelu)(2 x 128)/zygote/CPU/8 thread(s) 5250 ns 4791 ns 1.10
dense(2, bias=false, act=gelu)(2 x 128)/zygote/CPU/1 thread(s) 4875 ns 4625 ns 1.05
dense(2, bias=false, act=gelu)(2 x 128)/zygote/GPU/CUDA 217286 ns 167220.5 ns 1.30
dense(2, bias=false, act=gelu)(2 x 128)/zygote/GPU/oneAPI 10433598 ns 9409031.5 ns 1.11
dense(2, bias=false, act=gelu)(2 x 128)/zygote/GPU/AMDGPU 579966 ns 577916 ns 1.00
groupnorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/2 thread(s) 7709 ns 7854 ns 0.98
groupnorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/4 thread(s) 9042 ns 8875 ns 1.02
groupnorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/8 thread(s) 10083.5 ns 9750 ns 1.03
groupnorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/1 thread(s) 9125 ns 7334 ns 1.24
groupnorm(2, act=identity, affine=true)(4 x 32)/forward/GPU/CUDA 114799.5 ns 72767.5 ns 1.58
groupnorm(2, act=identity, affine=true)(4 x 32)/forward/GPU/oneAPI 3549639 ns 3713250 ns 0.96
groupnorm(2, act=identity, affine=true)(4 x 32)/forward/GPU/AMDGPU 74946 ns 77435.5 ns 0.97
groupnorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/2 thread(s) 9125 ns 8167 ns 1.12
groupnorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/4 thread(s) 8459 ns 8354.5 ns 1.01
groupnorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/8 thread(s) 8958 ns 9041 ns 0.99
groupnorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/1 thread(s) 8542 ns 8417 ns 1.01
groupnorm(2, act=identity, affine=true)(4 x 32)/zygote/GPU/CUDA 551169 ns 372607.5 ns 1.48
groupnorm(2, act=identity, affine=true)(4 x 32)/zygote/GPU/oneAPI 20846570.5 ns 21133871 ns 0.99
groupnorm(2, act=identity, affine=true)(4 x 32)/zygote/GPU/AMDGPU 344893 ns 345814 ns 1.00
batchedmm(128, Bsize=4)/forward/CPU/2 thread(s) 126604.5 ns 125395.5 ns 1.01
batchedmm(128, Bsize=4)/forward/CPU/4 thread(s) 129541 ns 129042 ns 1.00
batchedmm(128, Bsize=4)/forward/CPU/8 thread(s) 130458 ns 129959 ns 1.00
batchedmm(128, Bsize=4)/forward/CPU/1 thread(s) 182896 ns 180916 ns 1.01
batchedmm(128, Bsize=4)/forward/GPU/CUDA 46147 ns 44539 ns 1.04
batchedmm(128, Bsize=4)/forward/GPU/oneAPI 73869604 ns 75228887 ns 0.98
batchedmm(128, Bsize=4)/forward/GPU/AMDGPU 104461 ns 100291 ns 1.04
batchedmm(128, Bsize=4)/zygote/CPU/2 thread(s) 341208 ns 310917 ns 1.10
batchedmm(128, Bsize=4)/zygote/CPU/4 thread(s) 327416 ns 313833 ns 1.04
batchedmm(128, Bsize=4)/zygote/CPU/8 thread(s) 345562.5 ns 324083.5 ns 1.07
batchedmm(128, Bsize=4)/zygote/CPU/1 thread(s) 569312.5 ns 600354 ns 0.95
batchedmm(128, Bsize=4)/zygote/GPU/CUDA 183705 ns 150808 ns 1.22
batchedmm(128, Bsize=4)/zygote/GPU/oneAPI 92315110 ns 91943409 ns 1.00
batchedmm(128, Bsize=4)/zygote/GPU/AMDGPU 502435 ns 502450 ns 1.00
dense(512, bias=true, act=identity)(512 x 128)/forward/CPU/2 thread(s) 399333 ns 396750 ns 1.01
dense(512, bias=true, act=identity)(512 x 128)/forward/CPU/4 thread(s) 288167 ns 288145.5 ns 1.00
dense(512, bias=true, act=identity)(512 x 128)/forward/CPU/8 thread(s) 288020.5 ns 287583 ns 1.00
dense(512, bias=true, act=identity)(512 x 128)/forward/CPU/1 thread(s) 755875 ns 756625 ns 1.00
dense(512, bias=true, act=identity)(512 x 128)/forward/GPU/CUDA 43522.5 ns 40964 ns 1.06
dense(512, bias=true, act=identity)(512 x 128)/forward/GPU/oneAPI 1347689.5 ns 1391370 ns 0.97
dense(512, bias=true, act=identity)(512 x 128)/forward/GPU/AMDGPU 80731 ns 81511 ns 0.99
dense(512, bias=true, act=identity)(512 x 128)/zygote/CPU/2 thread(s) 1404291.5 ns 1449583.5 ns 0.97
dense(512, bias=true, act=identity)(512 x 128)/zygote/CPU/4 thread(s) 1136208 ns 1136667 ns 1.00
dense(512, bias=true, act=identity)(512 x 128)/zygote/CPU/8 thread(s) 1136375 ns 1134771 ns 1.00
dense(512, bias=true, act=identity)(512 x 128)/zygote/CPU/1 thread(s) 2442125 ns 2361041.5 ns 1.03
dense(512, bias=true, act=identity)(512 x 128)/zygote/GPU/CUDA 242542 ns 207930 ns 1.17
dense(512, bias=true, act=identity)(512 x 128)/zygote/GPU/oneAPI 9970984 ns 10356148 ns 0.96
dense(512, bias=true, act=identity)(512 x 128)/zygote/GPU/AMDGPU 353108.5 ns 355144 ns 0.99
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 643667 ns 647292 ns 0.99
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 649416 ns 578500 ns 1.12
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 646791.5 ns 639416 ns 1.01
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 640271.5 ns 656333 ns 0.98
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/CUDA 184288 ns 154081 ns 1.20
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/oneAPI 8427619.5 ns 8771052.5 ns 0.96
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/AMDGPU 303113 ns 306073.5 ns 0.99
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 2480000 ns 2453625 ns 1.01
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 2441417 ns 2424291 ns 1.01
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 2445375 ns 2442542 ns 1.00
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 2435667 ns 2470583 ns 0.99
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 927220.5 ns 767152.5 ns 1.21
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/oneAPI 53936788.5 ns 52532777 ns 1.03
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/AMDGPU 1316013 ns 1399204.5 ns 0.94
batchedmm(2, Bsize=32)/forward/CPU/2 thread(s) 33875 ns 32604 ns 1.04
batchedmm(2, Bsize=32)/forward/CPU/4 thread(s) 35271 ns 36937.5 ns 0.95
batchedmm(2, Bsize=32)/forward/CPU/8 thread(s) 34937.5 ns 34542 ns 1.01
batchedmm(2, Bsize=32)/forward/CPU/1 thread(s) 917 ns 917 ns 1
batchedmm(2, Bsize=32)/forward/GPU/CUDA 15816 ns 14042 ns 1.13
batchedmm(2, Bsize=32)/forward/GPU/oneAPI 76295890 ns 78232811.5 ns 0.98
batchedmm(2, Bsize=32)/forward/GPU/AMDGPU 79581 ns 79530 ns 1.00
batchedmm(2, Bsize=32)/zygote/CPU/2 thread(s) 3209 ns 3000 ns 1.07
batchedmm(2, Bsize=32)/zygote/CPU/4 thread(s) 3291 ns 3084 ns 1.07
batchedmm(2, Bsize=32)/zygote/CPU/8 thread(s) 3417 ns 3333 ns 1.03
batchedmm(2, Bsize=32)/zygote/CPU/1 thread(s) 3042 ns 3000 ns 1.01
batchedmm(2, Bsize=32)/zygote/GPU/CUDA 134276.5 ns 100283 ns 1.34
batchedmm(2, Bsize=32)/zygote/GPU/oneAPI 97067229 ns 96545751 ns 1.01
batchedmm(2, Bsize=32)/zygote/GPU/AMDGPU 337123 ns 337334 ns 1.00
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 437667 ns 405958 ns 1.08
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 437833 ns 408209 ns 1.07
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 438458.5 ns 407958 ns 1.07
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 447416.5 ns 421459 ns 1.06
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/CUDA 42161.5 ns 36148.5 ns 1.17
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/oneAPI 1435529 ns 1554049.5 ns 0.92
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/AMDGPU 241817.5 ns 238757.5 ns 1.01
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 4145167 ns 3868375 ns 1.07
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 4268333 ns 3988562.5 ns 1.07
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 4271604 ns 3992667 ns 1.07
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 4025417 ns 3776708.5 ns 1.07
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 230700.5 ns 193888 ns 1.19
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/oneAPI 36716035 ns 37305285.5 ns 0.98
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/AMDGPU 1427924 ns 1433244 ns 1.00
dense(32, bias=false, act=identity)(32 x 128)/forward/CPU/2 thread(s) 3875 ns 3917 ns 0.99
dense(32, bias=false, act=identity)(32 x 128)/forward/CPU/4 thread(s) 3917 ns 3917 ns 1
dense(32, bias=false, act=identity)(32 x 128)/forward/CPU/8 thread(s) 3958 ns 3917 ns 1.01
dense(32, bias=false, act=identity)(32 x 128)/forward/CPU/1 thread(s) 3875 ns 3917 ns 0.99
dense(32, bias=false, act=identity)(32 x 128)/forward/GPU/CUDA 34754 ns 32924.5 ns 1.06
dense(32, bias=false, act=identity)(32 x 128)/forward/GPU/oneAPI 1243353 ns 1242082 ns 1.00
dense(32, bias=false, act=identity)(32 x 128)/forward/GPU/AMDGPU 40071 ns 37990 ns 1.05
dense(32, bias=false, act=identity)(32 x 128)/zygote/CPU/2 thread(s) 15458 ns 15708 ns 0.98
dense(32, bias=false, act=identity)(32 x 128)/zygote/CPU/4 thread(s) 16042 ns 15750 ns 1.02
dense(32, bias=false, act=identity)(32 x 128)/zygote/CPU/8 thread(s) 15875 ns 15958 ns 0.99
dense(32, bias=false, act=identity)(32 x 128)/zygote/CPU/1 thread(s) 15625 ns 15500 ns 1.01
dense(32, bias=false, act=identity)(32 x 128)/zygote/GPU/CUDA 252802 ns 189381 ns 1.33
dense(32, bias=false, act=identity)(32 x 128)/zygote/GPU/oneAPI 8940938 ns 9458441 ns 0.95
dense(32, bias=false, act=identity)(32 x 128)/zygote/GPU/AMDGPU 171532 ns 169642 ns 1.01
dense(512, bias=false, act=relu)(512 x 128)/forward/CPU/2 thread(s) 404167 ns 404708 ns 1.00
dense(512, bias=false, act=relu)(512 x 128)/forward/CPU/4 thread(s) 295916 ns 295750 ns 1.00
dense(512, bias=false, act=relu)(512 x 128)/forward/CPU/8 thread(s) 296417 ns 295958 ns 1.00
dense(512, bias=false, act=relu)(512 x 128)/forward/CPU/1 thread(s) 760709 ns 761125 ns 1.00
dense(512, bias=false, act=relu)(512 x 128)/forward/GPU/CUDA 113187 ns 117898 ns 0.96
dense(512, bias=false, act=relu)(512 x 128)/forward/GPU/oneAPI 1044690.5 ns 1045095 ns 1.00
dense(512, bias=false, act=relu)(512 x 128)/forward/GPU/AMDGPU 89211 ns 87241 ns 1.02
dense(512, bias=false, act=relu)(512 x 128)/zygote/CPU/2 thread(s) 1444875 ns 1478500 ns 0.98
dense(512, bias=false, act=relu)(512 x 128)/zygote/CPU/4 thread(s) 1158416 ns 1159645.5 ns 1.00
dense(512, bias=false, act=relu)(512 x 128)/zygote/CPU/8 thread(s) 1158333 ns 1158042 ns 1.00
dense(512, bias=false, act=relu)(512 x 128)/zygote/CPU/1 thread(s) 2464875 ns 2384583 ns 1.03
dense(512, bias=false, act=relu)(512 x 128)/zygote/GPU/CUDA 231034 ns 189114 ns 1.22
dense(512, bias=false, act=relu)(512 x 128)/zygote/GPU/oneAPI 10580994 ns 9516529.5 ns 1.11
dense(512, bias=false, act=relu)(512 x 128)/zygote/GPU/AMDGPU 352438 ns 351188 ns 1.00
batchnorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/2 thread(s) 583 ns 500 ns 1.17
batchnorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/4 thread(s) 625 ns 583 ns 1.07
batchnorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/8 thread(s) 583 ns 583 ns 1
batchnorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/1 thread(s) 625 ns 500 ns 1.25
batchnorm(2, act=gelu, affine=true)(4 x 32)/forward/GPU/CUDA 24556 ns 18799 ns 1.31
batchnorm(2, act=gelu, affine=true)(4 x 32)/forward/GPU/oneAPI 1215077.5 ns 1188091 ns 1.02
batchnorm(2, act=gelu, affine=true)(4 x 32)/forward/GPU/AMDGPU 207412 ns 205912 ns 1.01
batchnorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/2 thread(s) 7542 ns 7292 ns 1.03
batchnorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/4 thread(s) 7916 ns 7583 ns 1.04
batchnorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/8 thread(s) 7958.5 ns 7917 ns 1.01
batchnorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/1 thread(s) 7708 ns 7375 ns 1.05
batchnorm(2, act=gelu, affine=true)(4 x 32)/zygote/GPU/CUDA 202724.5 ns 141427.5 ns 1.43
batchnorm(2, act=gelu, affine=true)(4 x 32)/zygote/GPU/oneAPI 26299646 ns 26708173 ns 0.98
batchnorm(2, act=gelu, affine=true)(4 x 32)/zygote/GPU/AMDGPU 691927 ns 683937 ns 1.01
batchedmm(128, Bsize=32)/forward/CPU/2 thread(s) 833708.5 ns 833042 ns 1.00
batchedmm(128, Bsize=32)/forward/CPU/4 thread(s) 617667 ns 621208 ns 0.99
batchedmm(128, Bsize=32)/forward/CPU/8 thread(s) 620250 ns 621791 ns 1.00
batchedmm(128, Bsize=32)/forward/CPU/1 thread(s) 1558000 ns 1550917 ns 1.00
batchedmm(128, Bsize=32)/forward/GPU/CUDA 134627 ns 134056.5 ns 1.00
batchedmm(128, Bsize=32)/forward/GPU/oneAPI 75767504.5 ns 77301649 ns 0.98
batchedmm(128, Bsize=32)/forward/GPU/AMDGPU 232042 ns 227902 ns 1.02
batchedmm(128, Bsize=32)/zygote/CPU/2 thread(s) 2690520.5 ns 2692167 ns 1.00
batchedmm(128, Bsize=32)/zygote/CPU/4 thread(s) 2001666.5 ns 1995500 ns 1.00
batchedmm(128, Bsize=32)/zygote/CPU/8 thread(s) 2002375 ns 2004812.5 ns 1.00
batchedmm(128, Bsize=32)/zygote/CPU/1 thread(s) 4923459 ns 4935000 ns 1.00
batchedmm(128, Bsize=32)/zygote/GPU/CUDA 232967 ns 249781 ns 0.93
batchedmm(128, Bsize=32)/zygote/GPU/oneAPI 99203033 ns 100408887.5 ns 0.99
batchedmm(128, Bsize=32)/zygote/GPU/AMDGPU 768327.5 ns 840633.5 ns 0.91
batchnorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/2 thread(s) 250 ns 291 ns 0.86
batchnorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/4 thread(s) 334 ns 375 ns 0.89
batchnorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/8 thread(s) 375 ns 375 ns 1
batchnorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/1 thread(s) 375 ns 292 ns 1.28
batchnorm(2, act=relu, affine=false)(4 x 32)/forward/GPU/CUDA 31737 ns 25355 ns 1.25
batchnorm(2, act=relu, affine=false)(4 x 32)/forward/GPU/oneAPI 1097642 ns 1311449 ns 0.84
batchnorm(2, act=relu, affine=false)(4 x 32)/forward/GPU/AMDGPU 46990 ns 46990 ns 1
batchnorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/2 thread(s) 6250 ns 6209 ns 1.01
batchnorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/4 thread(s) 6334 ns 6708.5 ns 0.94
batchnorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/8 thread(s) 6667 ns 6542 ns 1.02
batchnorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/1 thread(s) 6500 ns 6125 ns 1.06
batchnorm(2, act=relu, affine=false)(4 x 32)/zygote/GPU/CUDA 216848.5 ns 157347 ns 1.38
batchnorm(2, act=relu, affine=false)(4 x 32)/zygote/GPU/oneAPI 22868710.5 ns 21691879 ns 1.05
batchnorm(2, act=relu, affine=false)(4 x 32)/zygote/GPU/AMDGPU 363084 ns 365484 ns 0.99
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 1756083 ns 2366042 ns 0.74
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 1773708.5 ns 2395500 ns 0.74
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 1731875 ns 2374083 ns 0.73
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 1723709 ns 2382167 ns 0.72
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/CUDA 185580 ns 170643 ns 1.09
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/oneAPI 8097715 ns 8487051 ns 0.95
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/AMDGPU 375774 ns 374764 ns 1.00
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 4363834 ns 4646208 ns 0.94
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 4360063 ns 4643687 ns 0.94
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 4378875 ns 4660250 ns 0.94
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 4369520.5 ns 4569374.5 ns 0.96
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 829356 ns 714837 ns 1.16
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/oneAPI 48033448.5 ns 50175411.5 ns 0.96
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/AMDGPU 1396403.5 ns 1351724 ns 1.03
bias_activation(512, act=relu)(512 x 128)/forward/CPU/2 thread(s) 7146 ns 7208 ns 0.99
bias_activation(512, act=relu)(512 x 128)/forward/CPU/4 thread(s) 9833 ns 7084 ns 1.39
bias_activation(512, act=relu)(512 x 128)/forward/CPU/8 thread(s) 7250 ns 7208 ns 1.01
bias_activation(512, act=relu)(512 x 128)/forward/CPU/1 thread(s) 6875 ns 6833 ns 1.01
bias_activation(512, act=relu)(512 x 128)/forward/GPU/CUDA 21835 ns 16063.5 ns 1.36
bias_activation(512, act=relu)(512 x 128)/forward/GPU/oneAPI 1202854 ns 1173405 ns 1.03
bias_activation(512, act=relu)(512 x 128)/forward/GPU/AMDGPU 40090.5 ns 39030 ns 1.03
bias_activation(512, act=relu)(512 x 128)/zygote/CPU/2 thread(s) 68125 ns 63792 ns 1.07
bias_activation(512, act=relu)(512 x 128)/zygote/CPU/4 thread(s) 66458.5 ns 32833 ns 2.02
bias_activation(512, act=relu)(512 x 128)/zygote/CPU/8 thread(s) 51312.5 ns 45917 ns 1.12
bias_activation(512, act=relu)(512 x 128)/zygote/CPU/1 thread(s) 32958.5 ns 45229.5 ns 0.73
bias_activation(512, act=relu)(512 x 128)/zygote/GPU/CUDA 205180 ns 163785 ns 1.25
bias_activation(512, act=relu)(512 x 128)/zygote/GPU/oneAPI 10713432 ns 10469728.5 ns 1.02
bias_activation(512, act=relu)(512 x 128)/zygote/GPU/AMDGPU 269342.5 ns 262653 ns 1.03
batchedmm(2, Bsize=512)/forward/CPU/2 thread(s) 22083.5 ns 20584 ns 1.07
batchedmm(2, Bsize=512)/forward/CPU/4 thread(s) 25042 ns 26208 ns 0.96
batchedmm(2, Bsize=512)/forward/CPU/8 thread(s) 24666.5 ns 23542 ns 1.05
batchedmm(2, Bsize=512)/forward/CPU/1 thread(s) 5583 ns 5125 ns 1.09
batchedmm(2, Bsize=512)/forward/GPU/CUDA 17692 ns 16017 ns 1.10
batchedmm(2, Bsize=512)/forward/GPU/oneAPI 89463574 ns 90340662 ns 0.99
batchedmm(2, Bsize=512)/forward/GPU/AMDGPU 84500.5 ns 84110 ns 1.00
batchedmm(2, Bsize=512)/zygote/CPU/2 thread(s) 12041 ns 11791 ns 1.02
batchedmm(2, Bsize=512)/zygote/CPU/4 thread(s) 10167 ns 10229.5 ns 0.99
batchedmm(2, Bsize=512)/zygote/CPU/8 thread(s) 10584 ns 10625 ns 1.00
batchedmm(2, Bsize=512)/zygote/CPU/1 thread(s) 17770.5 ns 17895.5 ns 0.99
batchedmm(2, Bsize=512)/zygote/GPU/CUDA 217435 ns 159148 ns 1.37
batchedmm(2, Bsize=512)/zygote/GPU/oneAPI 145607100.5 ns 149555538 ns 0.97
batchedmm(2, Bsize=512)/zygote/GPU/AMDGPU 372684 ns 367264 ns 1.01
dense(512, bias=true, act=relu)(512 x 128)/forward/CPU/2 thread(s) 406209 ns 406500 ns 1.00
dense(512, bias=true, act=relu)(512 x 128)/forward/CPU/4 thread(s) 297375 ns 297583 ns 1.00
dense(512, bias=true, act=relu)(512 x 128)/forward/CPU/8 thread(s) 297291 ns 297250 ns 1.00
dense(512, bias=true, act=relu)(512 x 128)/forward/CPU/1 thread(s) 762584 ns 762791 ns 1.00
dense(512, bias=true, act=relu)(512 x 128)/forward/GPU/CUDA 46433 ns 43249 ns 1.07
dense(512, bias=true, act=relu)(512 x 128)/forward/GPU/oneAPI 1403980.5 ns 1362482 ns 1.03
dense(512, bias=true, act=relu)(512 x 128)/forward/GPU/AMDGPU 89281 ns 87411 ns 1.02
dense(512, bias=true, act=relu)(512 x 128)/zygote/CPU/2 thread(s) 1428979 ns 1484125.5 ns 0.96
dense(512, bias=true, act=relu)(512 x 128)/zygote/CPU/4 thread(s) 1164271 ns 1167542 ns 1.00
dense(512, bias=true, act=relu)(512 x 128)/zygote/CPU/8 thread(s) 1168292 ns 1161667 ns 1.01
dense(512, bias=true, act=relu)(512 x 128)/zygote/CPU/1 thread(s) 2470833 ns 2387604.5 ns 1.03
dense(512, bias=true, act=relu)(512 x 128)/zygote/GPU/CUDA 271835 ns 213476 ns 1.27
dense(512, bias=true, act=relu)(512 x 128)/zygote/GPU/oneAPI 11893591 ns 13925589 ns 0.85
dense(512, bias=true, act=relu)(512 x 128)/zygote/GPU/AMDGPU 378099 ns 377604 ns 1.00
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 437000 ns 433583 ns 1.01
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 440041 ns 436917 ns 1.01
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 438959 ns 436666 ns 1.01
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 449417 ns 448291 ns 1.00
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/CUDA 53469 ns 45983 ns 1.16
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/oneAPI 1006988 ns 1048211.5 ns 0.96
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/AMDGPU 234822 ns 234192 ns 1.00
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 4132333 ns 3894625 ns 1.06
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 4262646 ns 4022709 ns 1.06
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 4266645.5 ns 4024624.5 ns 1.06
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 4029729 ns 3801916.5 ns 1.06
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 251074 ns 210260 ns 1.19
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/oneAPI 31753545 ns 32692776 ns 0.97
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/AMDGPU 1374018.5 ns 1361238 ns 1.01
dense(32, bias=true, act=gelu)(32 x 128)/forward/CPU/2 thread(s) 9542 ns 8709 ns 1.10
dense(32, bias=true, act=gelu)(32 x 128)/forward/CPU/4 thread(s) 8167 ns 7667 ns 1.07
dense(32, bias=true, act=gelu)(32 x 128)/forward/CPU/8 thread(s) 8167 ns 7667 ns 1.07
dense(32, bias=true, act=gelu)(32 x 128)/forward/CPU/1 thread(s) 13417 ns 12375 ns 1.08
dense(32, bias=true, act=gelu)(32 x 128)/forward/GPU/CUDA 23409 ns 20402 ns 1.15
dense(32, bias=true, act=gelu)(32 x 128)/forward/GPU/oneAPI 2209102 ns 2188548.5 ns 1.01
dense(32, bias=true, act=gelu)(32 x 128)/forward/GPU/AMDGPU 211472 ns 210772 ns 1.00
dense(32, bias=true, act=gelu)(32 x 128)/zygote/CPU/2 thread(s) 49709 ns 45041 ns 1.10
dense(32, bias=true, act=gelu)(32 x 128)/zygote/CPU/4 thread(s) 49667 ns 45208 ns 1.10
dense(32, bias=true, act=gelu)(32 x 128)/zygote/CPU/8 thread(s) 50250 ns 45208 ns 1.11
dense(32, bias=true, act=gelu)(32 x 128)/zygote/CPU/1 thread(s) 49792 ns 44708 ns 1.11
dense(32, bias=true, act=gelu)(32 x 128)/zygote/GPU/CUDA 333916 ns 253192 ns 1.32
dense(32, bias=true, act=gelu)(32 x 128)/zygote/GPU/oneAPI 10942581.5 ns 14008146.5 ns 0.78
dense(32, bias=true, act=gelu)(32 x 128)/zygote/GPU/AMDGPU 658106.5 ns 653917 ns 1.01
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 84687.5 ns 82979 ns 1.02
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 90459 ns 126104.5 ns 0.72
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 85792 ns 86229.5 ns 0.99
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 84021 ns 84875 ns 0.99
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/CUDA 191047 ns 184626 ns 1.03
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/oneAPI 5785931 ns 6066708 ns 0.95
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/AMDGPU 222432 ns 219662 ns 1.01
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 2027833 ns 2017833 ns 1.00
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 2014979.5 ns 2016000 ns 1.00
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 2016229.5 ns 2006375 ns 1.00
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 2015812.5 ns 2025083 ns 1.00
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 505179 ns 496955.5 ns 1.02
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/oneAPI 28452120 ns 27423881 ns 1.04
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/AMDGPU 1086300 ns 1040810 ns 1.04

This comment was automatically generated by workflow using github-action-benchmark.

Please sign in to comment.