Skip to content
This repository has been archived by the owner on Nov 4, 2024. It is now read-only.

Commit

Permalink
fix: urgent patch for reactant breakage
Browse files Browse the repository at this point in the history
  • Loading branch information
avik-pal committed Oct 4, 2024
1 parent 6aad052 commit e6dd65c
Show file tree
Hide file tree
Showing 3 changed files with 5 additions and 5 deletions.
4 changes: 2 additions & 2 deletions Project.toml
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
name = "LuxLib"
uuid = "82251201-b29d-42c6-8e01-566dec8acb11"
authors = ["Avik Pal <[email protected]> and contributors"]
version = "1.3.1"
version = "1.3.2"

[deps]
ArrayInterface = "4fba245c-0d91-5ea0-9b3e-6abc04ee57a9"
Expand Down Expand Up @@ -71,7 +71,7 @@ LinearAlgebra = "1.10"
LoopVectorization = "0.12.171"
LuxCore = "1"
MKL = "0.7"
MLDataDevices = "1.1.1"
MLDataDevices = "1.2"
Markdown = "1.10"
NNlib = "0.9.24"
Octavian = "0.3.28"
Expand Down
2 changes: 1 addition & 1 deletion src/impl/Impl.jl
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ using Random: Random, AbstractRNG, rand!
using Statistics: Statistics, mean, var

using LuxCore: LuxCore
using MLDataDevices: get_device_type, CPUDevice, AMDGPUDevice, CUDADevice,
using MLDataDevices: get_device_type, CPUDevice, AMDGPUDevice, CUDADevice, XLADevice,
AbstractGPUDevice, AbstractDevice
using NNlib: NNlib, ConvDims

Expand Down
4 changes: 2 additions & 2 deletions src/impl/conv.jl
Original file line number Diff line number Diff line change
Expand Up @@ -74,8 +74,8 @@ end

conv(x, weight, cdims::ConvDims) = conv(get_device_type((x, weight)), x, weight, cdims)

function conv(
::Type{<:Union{CPUDevice, CUDADevice, AMDGPUDevice}}, x′, weight′, cdims::ConvDims)
function conv(::Type{<:Union{CPUDevice, CUDADevice, AMDGPUDevice, XLADevice}},
x′, weight′, cdims::ConvDims)
x, weight = get_conv_input_weight(x′, weight′)
return NNlib.conv(x, weight, cdims)
end
Expand Down

3 comments on commit e6dd65c

@avik-pal
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@JuliaRegistrator
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Registration pull request created: JuliaRegistries/General/116561

Tip: Release Notes

Did you know you can add release notes too? Just add Markdown-formatted text underneath the comment after the text
"Release notes:" and it will be added to the registry PR; if TagBot is installed, it will also be added to the
release that TagBot creates. For example:

@JuliaRegistrator register

Release notes:

## Breaking changes

- blah

To add them here, just re-invoke the command and the PR will be updated.

Tagging

After the above pull request is merged, it is recommended that a tag is created on this repository for the registered package version.

This will be done automatically if the Julia TagBot GitHub Action is installed, or can be done manually through the GitHub interface, or via:

git tag -a v1.3.2 -m "<description of version>" e6dd65cfbc2313c7a8584fc26af6fc1ceb9bc31d
git push origin v1.3.2

@github-actions
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

LuxLib Benchmarks

Benchmark suite Current: e6dd65c Previous: 6aad052 Ratio
layernorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/2 thread(s) 6104.5 ns 5291 ns 1.15
layernorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/4 thread(s) 6125 ns 7375 ns 0.83
layernorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/8 thread(s) 7166 ns 7687 ns 0.93
layernorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/1 thread(s) 6042 ns 6958 ns 0.87
layernorm(2, act=gelu, affine=false)(4 x 32)/forward/GPU/CUDA 105660 ns 111876 ns 0.94
layernorm(2, act=gelu, affine=false)(4 x 32)/forward/GPU/oneAPI 2718405 ns 2746993 ns 0.99
layernorm(2, act=gelu, affine=false)(4 x 32)/forward/GPU/AMDGPU 401954 ns 414534 ns 0.97
layernorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/2 thread(s) 9979 ns 10041.5 ns 0.99
layernorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/4 thread(s) 10000 ns 10125 ns 0.99
layernorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/8 thread(s) 10125 ns 10167 ns 1.00
layernorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/1 thread(s) 10063 ns 10000.5 ns 1.01
layernorm(2, act=gelu, affine=false)(4 x 32)/zygote/GPU/CUDA 495391 ns 497187 ns 1.00
layernorm(2, act=gelu, affine=false)(4 x 32)/zygote/GPU/oneAPI 16604973 ns 17740695 ns 0.94
layernorm(2, act=gelu, affine=false)(4 x 32)/zygote/GPU/AMDGPU 682487 ns 664206 ns 1.03
bias_activation(32, act=relu)(32 x 128)/forward/CPU/2 thread(s) 1812 ns 1479.5 ns 1.22
bias_activation(32, act=relu)(32 x 128)/forward/CPU/4 thread(s) 1708 ns 1459 ns 1.17
bias_activation(32, act=relu)(32 x 128)/forward/CPU/8 thread(s) 1667 ns 1875 ns 0.89
bias_activation(32, act=relu)(32 x 128)/forward/CPU/1 thread(s) 2104 ns 1583.5 ns 1.33
bias_activation(32, act=relu)(32 x 128)/forward/GPU/CUDA 20067 ns 19698 ns 1.02
bias_activation(32, act=relu)(32 x 128)/forward/GPU/oneAPI 1353411.5 ns 1364290 ns 0.99
bias_activation(32, act=relu)(32 x 128)/forward/GPU/AMDGPU 31000 ns 31630 ns 0.98
bias_activation(32, act=relu)(32 x 128)/zygote/CPU/2 thread(s) 4041 ns 4083 ns 0.99
bias_activation(32, act=relu)(32 x 128)/zygote/CPU/4 thread(s) 3625 ns 4416 ns 0.82
bias_activation(32, act=relu)(32 x 128)/zygote/CPU/8 thread(s) 4542 ns 4125 ns 1.10
bias_activation(32, act=relu)(32 x 128)/zygote/CPU/1 thread(s) 4250.5 ns 3291 ns 1.29
bias_activation(32, act=relu)(32 x 128)/zygote/GPU/CUDA 133056 ns 130509 ns 1.02
bias_activation(32, act=relu)(32 x 128)/zygote/GPU/oneAPI 8127591.5 ns 9003854 ns 0.90
bias_activation(32, act=relu)(32 x 128)/zygote/GPU/AMDGPU 146031 ns 149371 ns 0.98
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 58042 ns 57958 ns 1.00
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 39959 ns 46167 ns 0.87
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 39792 ns 46542 ns 0.85
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 83333 ns 82541 ns 1.01
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/CUDA 36918.5 ns 36502 ns 1.01
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/oneAPI 567954 ns 564405 ns 1.01
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/AMDGPU 76900 ns 81146 ns 0.95
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 2030417 ns 2037625 ns 1.00
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 2081666.5 ns 2078416 ns 1.00
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 2084437 ns 2083625 ns 1.00
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 2002333 ns 2000875 ns 1.00
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 220443 ns 216924 ns 1.02
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/oneAPI 7915184 ns 7524779 ns 1.05
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/AMDGPU 1433294 ns 1725786 ns 0.83
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 146500 ns 152667 ns 0.96
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 164208.5 ns 168375 ns 0.98
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 150937.5 ns 152437.5 ns 0.99
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 189709 ns 193708 ns 0.98
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/GPU/CUDA 166381.5 ns 167125 ns 1.00
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/GPU/oneAPI 8085655 ns 7312313 ns 1.11
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/GPU/AMDGPU 187972 ns 213517 ns 0.88
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 1113437 ns 1113104.5 ns 1.00
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 1109375 ns 1116334 ns 0.99
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 1117083.5 ns 1115000 ns 1.00
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 1112084 ns 1106770.5 ns 1.00
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 646028 ns 628256 ns 1.03
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/oneAPI 34072899 ns 32195104 ns 1.06
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/AMDGPU 1026270 ns 1026645 ns 1.00
layernorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/2 thread(s) 6250.5 ns 5166 ns 1.21
layernorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/4 thread(s) 4917 ns 4792 ns 1.03
layernorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/8 thread(s) 5562.5 ns 5917 ns 0.94
layernorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/1 thread(s) 4708 ns 4542 ns 1.04
layernorm(2, act=relu, affine=true)(4 x 32)/forward/GPU/CUDA 82687 ns 82840 ns 1.00
layernorm(2, act=relu, affine=true)(4 x 32)/forward/GPU/oneAPI 5327028.5 ns 5343488 ns 1.00
layernorm(2, act=relu, affine=true)(4 x 32)/forward/GPU/AMDGPU 59005.5 ns 67740 ns 0.87
layernorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/2 thread(s) 8958 ns 8708 ns 1.03
layernorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/4 thread(s) 8833 ns 8500 ns 1.04
layernorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/8 thread(s) 9167 ns 8708.5 ns 1.05
layernorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/1 thread(s) 8875 ns 8542 ns 1.04
layernorm(2, act=relu, affine=true)(4 x 32)/zygote/GPU/CUDA 554954 ns 548688 ns 1.01
layernorm(2, act=relu, affine=true)(4 x 32)/zygote/GPU/oneAPI 33304840 ns 33264338 ns 1.00
layernorm(2, act=relu, affine=true)(4 x 32)/zygote/GPU/AMDGPU 384224 ns 384004 ns 1.00
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 18208 ns 17709 ns 1.03
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 22250 ns 18625 ns 1.19
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 20500 ns 21375 ns 0.96
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 17833.5 ns 19583.5 ns 0.91
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/GPU/CUDA 62129 ns 61770.5 ns 1.01
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/GPU/oneAPI 3094668.5 ns 3180292.5 ns 0.97
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/GPU/AMDGPU 77001 ns 75881 ns 1.01
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 234334 ns 212208 ns 1.10
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 229500 ns 219208.5 ns 1.05
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 224000 ns 214875 ns 1.04
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 219041.5 ns 219958 ns 1.00
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 329979.5 ns 324445 ns 1.02
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/oneAPI 12487876 ns 13687318.5 ns 0.91
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/AMDGPU 465894 ns 466224 ns 1.00
bias_activation(2, act=relu)(2 x 128)/forward/CPU/2 thread(s) 584 ns 625 ns 0.93
bias_activation(2, act=relu)(2 x 128)/forward/CPU/4 thread(s) 708 ns 625 ns 1.13
bias_activation(2, act=relu)(2 x 128)/forward/CPU/8 thread(s) 750 ns 958 ns 0.78
bias_activation(2, act=relu)(2 x 128)/forward/CPU/1 thread(s) 645.5 ns 667 ns 0.97
bias_activation(2, act=relu)(2 x 128)/forward/GPU/CUDA 19107 ns 18677 ns 1.02
bias_activation(2, act=relu)(2 x 128)/forward/GPU/oneAPI 1159471 ns 1223151.5 ns 0.95
bias_activation(2, act=relu)(2 x 128)/forward/GPU/AMDGPU 32171 ns 31400 ns 1.02
bias_activation(2, act=relu)(2 x 128)/zygote/CPU/2 thread(s) 1458 ns 1416.5 ns 1.03
bias_activation(2, act=relu)(2 x 128)/zygote/CPU/4 thread(s) 1334 ns 1417 ns 0.94
bias_activation(2, act=relu)(2 x 128)/zygote/CPU/8 thread(s) 1542 ns 1583 ns 0.97
bias_activation(2, act=relu)(2 x 128)/zygote/CPU/1 thread(s) 1375 ns 1416 ns 0.97
bias_activation(2, act=relu)(2 x 128)/zygote/GPU/CUDA 114910.5 ns 114301 ns 1.01
bias_activation(2, act=relu)(2 x 128)/zygote/GPU/oneAPI 8672258 ns 8986516.5 ns 0.97
bias_activation(2, act=relu)(2 x 128)/zygote/GPU/AMDGPU 124841 ns 135771 ns 0.92
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 7417 ns 7292 ns 1.02
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 5354.5 ns 6125 ns 0.87
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 5458 ns 6125 ns 0.89
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 10042 ns 9958 ns 1.01
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/CUDA 23654 ns 23537.5 ns 1.00
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/oneAPI 1263771.5 ns 1250837 ns 1.01
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/AMDGPU 48941 ns 47255.5 ns 1.04
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 256833 ns 220688 ns 1.16
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 269917 ns 235896 ns 1.14
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 269000 ns 229416 ns 1.17
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 213417 ns 255458.5 ns 0.84
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 184585 ns 180772.5 ns 1.02
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/oneAPI 32350348 ns 30816816.5 ns 1.05
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/AMDGPU 588346 ns 642475 ns 0.92
dense(32, bias=false, act=relu)(32 x 128)/forward/CPU/2 thread(s) 4084 ns 4042 ns 1.01
dense(32, bias=false, act=relu)(32 x 128)/forward/CPU/4 thread(s) 4084 ns 4084 ns 1
dense(32, bias=false, act=relu)(32 x 128)/forward/CPU/8 thread(s) 4125 ns 4166 ns 0.99
dense(32, bias=false, act=relu)(32 x 128)/forward/CPU/1 thread(s) 4083 ns 4083 ns 1
dense(32, bias=false, act=relu)(32 x 128)/forward/GPU/CUDA 23536 ns 22833 ns 1.03
dense(32, bias=false, act=relu)(32 x 128)/forward/GPU/oneAPI 2170016 ns 2018204 ns 1.08
dense(32, bias=false, act=relu)(32 x 128)/forward/GPU/AMDGPU 47570 ns 46910 ns 1.01
dense(32, bias=false, act=relu)(32 x 128)/zygote/CPU/2 thread(s) 16500 ns 16541 ns 1.00
dense(32, bias=false, act=relu)(32 x 128)/zygote/CPU/4 thread(s) 16667 ns 16834 ns 0.99
dense(32, bias=false, act=relu)(32 x 128)/zygote/CPU/8 thread(s) 17042 ns 17084 ns 1.00
dense(32, bias=false, act=relu)(32 x 128)/zygote/CPU/1 thread(s) 16500 ns 16833 ns 0.98
dense(32, bias=false, act=relu)(32 x 128)/zygote/GPU/CUDA 185621 ns 182565 ns 1.02
dense(32, bias=false, act=relu)(32 x 128)/zygote/GPU/oneAPI 9999690 ns 10544759 ns 0.95
dense(32, bias=false, act=relu)(32 x 128)/zygote/GPU/AMDGPU 171902 ns 171221 ns 1.00
dense(512, bias=false, act=gelu)(512 x 128)/forward/CPU/2 thread(s) 493500 ns 493041 ns 1.00
dense(512, bias=false, act=gelu)(512 x 128)/forward/CPU/4 thread(s) 313000 ns 385667 ns 0.81
dense(512, bias=false, act=gelu)(512 x 128)/forward/CPU/8 thread(s) 312583 ns 386125 ns 0.81
dense(512, bias=false, act=gelu)(512 x 128)/forward/CPU/1 thread(s) 847333 ns 847250 ns 1.00
dense(512, bias=false, act=gelu)(512 x 128)/forward/GPU/CUDA 113322 ns 112997 ns 1.00
dense(512, bias=false, act=gelu)(512 x 128)/forward/GPU/oneAPI 396951 ns 408156.5 ns 0.97
dense(512, bias=false, act=gelu)(512 x 128)/forward/GPU/AMDGPU 242543 ns 242212 ns 1.00
dense(512, bias=false, act=gelu)(512 x 128)/zygote/CPU/2 thread(s) 2121250 ns 2093437.5 ns 1.01
dense(512, bias=false, act=gelu)(512 x 128)/zygote/CPU/4 thread(s) 1582666 ns 1861958 ns 0.85
dense(512, bias=false, act=gelu)(512 x 128)/zygote/CPU/8 thread(s) 1584000 ns 1876833 ns 0.84
dense(512, bias=false, act=gelu)(512 x 128)/zygote/CPU/1 thread(s) 3043250.5 ns 3143021 ns 0.97
dense(512, bias=false, act=gelu)(512 x 128)/zygote/GPU/CUDA 230454 ns 228687 ns 1.01
dense(512, bias=false, act=gelu)(512 x 128)/zygote/GPU/oneAPI 10440097.5 ns 10334254.5 ns 1.01
dense(512, bias=false, act=gelu)(512 x 128)/zygote/GPU/AMDGPU 746137 ns 743867 ns 1.00
layernorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/2 thread(s) 7000.5 ns 6167 ns 1.14
layernorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/4 thread(s) 6479.5 ns 6625 ns 0.98
layernorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/8 thread(s) 6708 ns 8333.5 ns 0.80
layernorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/1 thread(s) 6458 ns 7375 ns 0.88
layernorm(2, act=relu, affine=false)(32 x 32)/forward/GPU/CUDA 83715.5 ns 83073.5 ns 1.01
layernorm(2, act=relu, affine=false)(32 x 32)/forward/GPU/oneAPI 5295782.5 ns 5613807.5 ns 0.94
layernorm(2, act=relu, affine=false)(32 x 32)/forward/GPU/AMDGPU 59480 ns 65621 ns 0.91
layernorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/2 thread(s) 12396 ns 11042 ns 1.12
layernorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/4 thread(s) 11500 ns 10958.5 ns 1.05
layernorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/8 thread(s) 12104.5 ns 11645.5 ns 1.04
layernorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/1 thread(s) 11333.5 ns 12812.5 ns 0.88
layernorm(2, act=relu, affine=false)(32 x 32)/zygote/GPU/CUDA 600141.5 ns 595390 ns 1.01
layernorm(2, act=relu, affine=false)(32 x 32)/zygote/GPU/oneAPI 39003334 ns 37940094 ns 1.03
layernorm(2, act=relu, affine=false)(32 x 32)/zygote/GPU/AMDGPU 410324 ns 408370.5 ns 1.00
dense(2, bias=true, act=relu)(2 x 128)/forward/CPU/2 thread(s) 542 ns 541 ns 1.00
dense(2, bias=true, act=relu)(2 x 128)/forward/CPU/4 thread(s) 541 ns 500 ns 1.08
dense(2, bias=true, act=relu)(2 x 128)/forward/CPU/8 thread(s) 541 ns 542 ns 1.00
dense(2, bias=true, act=relu)(2 x 128)/forward/CPU/1 thread(s) 500 ns 500 ns 1
dense(2, bias=true, act=relu)(2 x 128)/forward/GPU/CUDA 23331 ns 23168 ns 1.01
dense(2, bias=true, act=relu)(2 x 128)/forward/GPU/oneAPI 2292659.5 ns 2210796 ns 1.04
dense(2, bias=true, act=relu)(2 x 128)/forward/GPU/AMDGPU 51010 ns 46950 ns 1.09
dense(2, bias=true, act=relu)(2 x 128)/zygote/CPU/2 thread(s) 2125 ns 2084 ns 1.02
dense(2, bias=true, act=relu)(2 x 128)/zygote/CPU/4 thread(s) 2084 ns 2125 ns 0.98
dense(2, bias=true, act=relu)(2 x 128)/zygote/CPU/8 thread(s) 2167 ns 2209 ns 0.98
dense(2, bias=true, act=relu)(2 x 128)/zygote/CPU/1 thread(s) 2166 ns 2084 ns 1.04
dense(2, bias=true, act=relu)(2 x 128)/zygote/GPU/CUDA 233774 ns 213006.5 ns 1.10
dense(2, bias=true, act=relu)(2 x 128)/zygote/GPU/oneAPI 11024589.5 ns 11081491 ns 0.99
dense(2, bias=true, act=relu)(2 x 128)/zygote/GPU/AMDGPU 182892 ns 181582.5 ns 1.01
groupnorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/2 thread(s) 8417 ns 8834 ns 0.95
groupnorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/4 thread(s) 9563 ns 8834 ns 1.08
groupnorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/8 thread(s) 10021 ns 10021.5 ns 1.00
groupnorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/1 thread(s) 8583 ns 8500 ns 1.01
groupnorm(2, act=relu, affine=true)(32 x 32)/forward/GPU/CUDA 110268 ns 99705.5 ns 1.11
groupnorm(2, act=relu, affine=true)(32 x 32)/forward/GPU/oneAPI 3046756 ns 3198646 ns 0.95
groupnorm(2, act=relu, affine=true)(32 x 32)/forward/GPU/AMDGPU 71861 ns 72221 ns 1.00
groupnorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/2 thread(s) 18042 ns 18834 ns 0.96
groupnorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/4 thread(s) 18416.5 ns 17479 ns 1.05
groupnorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/8 thread(s) 19083.5 ns 18458 ns 1.03
groupnorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/1 thread(s) 18187.5 ns 18895.5 ns 0.96
groupnorm(2, act=relu, affine=true)(32 x 32)/zygote/GPU/CUDA 612118 ns 566743 ns 1.08
groupnorm(2, act=relu, affine=true)(32 x 32)/zygote/GPU/oneAPI 16020202 ns 18116750 ns 0.88
groupnorm(2, act=relu, affine=true)(32 x 32)/zygote/GPU/AMDGPU 379663 ns 377315 ns 1.01
batchnorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/2 thread(s) 500 ns 500 ns 1
batchnorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/4 thread(s) 542 ns 500 ns 1.08
batchnorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/8 thread(s) 583 ns 584 ns 1.00
batchnorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/1 thread(s) 500 ns 541 ns 0.92
batchnorm(2, act=identity, affine=false)(32 x 32)/forward/GPU/CUDA 34018 ns 33362 ns 1.02
batchnorm(2, act=identity, affine=false)(32 x 32)/forward/GPU/oneAPI 1157894 ns 1254721 ns 0.92
batchnorm(2, act=identity, affine=false)(32 x 32)/forward/GPU/AMDGPU 48210 ns 46210 ns 1.04
batchnorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/2 thread(s) 9000 ns 8916.5 ns 1.01
batchnorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/4 thread(s) 9250 ns 9479.5 ns 0.98
batchnorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/8 thread(s) 9541.5 ns 9667 ns 0.99
batchnorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/1 thread(s) 9187.5 ns 9250 ns 0.99
batchnorm(2, act=identity, affine=false)(32 x 32)/zygote/GPU/CUDA 263691 ns 255341 ns 1.03
batchnorm(2, act=identity, affine=false)(32 x 32)/zygote/GPU/oneAPI 17973778.5 ns 18499322 ns 0.97
batchnorm(2, act=identity, affine=false)(32 x 32)/zygote/GPU/AMDGPU 363818.5 ns 366854.5 ns 0.99
dense(512, bias=false, act=identity)(512 x 128)/forward/CPU/2 thread(s) 399291 ns 398042 ns 1.00
dense(512, bias=false, act=identity)(512 x 128)/forward/CPU/4 thread(s) 215375 ns 288250 ns 0.75
dense(512, bias=false, act=identity)(512 x 128)/forward/CPU/8 thread(s) 215291 ns 288042 ns 0.75
dense(512, bias=false, act=identity)(512 x 128)/forward/CPU/1 thread(s) 756375 ns 755958 ns 1.00
dense(512, bias=false, act=identity)(512 x 128)/forward/GPU/CUDA 111229 ns 112430 ns 0.99
dense(512, bias=false, act=identity)(512 x 128)/forward/GPU/oneAPI 340432 ns 338275 ns 1.01
dense(512, bias=false, act=identity)(512 x 128)/forward/GPU/AMDGPU 74750 ns 74831 ns 1.00
dense(512, bias=false, act=identity)(512 x 128)/zygote/CPU/2 thread(s) 1397958 ns 1408834 ns 0.99
dense(512, bias=false, act=identity)(512 x 128)/zygote/CPU/4 thread(s) 860270.5 ns 1134937.5 ns 0.76
dense(512, bias=false, act=identity)(512 x 128)/zygote/CPU/8 thread(s) 859500 ns 1133167 ns 0.76
dense(512, bias=false, act=identity)(512 x 128)/zygote/CPU/1 thread(s) 2356875 ns 2438875 ns 0.97
dense(512, bias=false, act=identity)(512 x 128)/zygote/GPU/CUDA 199160 ns 198896 ns 1.00
dense(512, bias=false, act=identity)(512 x 128)/zygote/GPU/oneAPI 10127007.5 ns 10071273 ns 1.01
dense(512, bias=false, act=identity)(512 x 128)/zygote/GPU/AMDGPU 325203 ns 320874 ns 1.01
layernorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/2 thread(s) 7458.5 ns 7270.5 ns 1.03
layernorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/4 thread(s) 7583 ns 7583 ns 1
layernorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/8 thread(s) 8250 ns 8854.5 ns 0.93
layernorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/1 thread(s) 7188 ns 6917 ns 1.04
layernorm(2, act=relu, affine=true)(32 x 32)/forward/GPU/CUDA 138757.5 ns 136778.5 ns 1.01
layernorm(2, act=relu, affine=true)(32 x 32)/forward/GPU/oneAPI 5504110 ns 5388548.5 ns 1.02
layernorm(2, act=relu, affine=true)(32 x 32)/forward/GPU/AMDGPU 59831 ns 66211 ns 0.90
layernorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/2 thread(s) 12708.5 ns 14833.5 ns 0.86
layernorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/4 thread(s) 16250 ns 15791 ns 1.03
layernorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/8 thread(s) 16708 ns 14229.5 ns 1.17
layernorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/1 thread(s) 12250 ns 14709 ns 0.83
layernorm(2, act=relu, affine=true)(32 x 32)/zygote/GPU/CUDA 903568 ns 897273 ns 1.01
layernorm(2, act=relu, affine=true)(32 x 32)/zygote/GPU/oneAPI 41185117.5 ns 42742507.5 ns 0.96
layernorm(2, act=relu, affine=true)(32 x 32)/zygote/GPU/AMDGPU 426569.5 ns 425150 ns 1.00
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 25146 ns 27562.5 ns 0.91
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 29875 ns 25583 ns 1.17
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 29563 ns 30228.5 ns 0.98
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 28708 ns 28854 ns 0.99
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/GPU/CUDA 186563 ns 185009 ns 1.01
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/GPU/oneAPI 7437253 ns 7764877.5 ns 0.96
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/GPU/AMDGPU 112512 ns 115321 ns 0.98
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 158917 ns 106417 ns 1.49
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 155729 ns 151645.5 ns 1.03
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 147416.5 ns 153166 ns 0.96
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 143875 ns 150583 ns 0.96
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 1016648 ns 996849 ns 1.02
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/oneAPI 42077608 ns 42717639 ns 0.99
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/AMDGPU 580615 ns 587287 ns 0.99
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 74583 ns 74375 ns 1.00
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 75291 ns 75999.5 ns 0.99
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 84145.5 ns 80375 ns 1.05
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 80750 ns 87000 ns 0.93
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/CUDA 192007 ns 189182 ns 1.01
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/oneAPI 7668059 ns 7755648 ns 0.99
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/AMDGPU 121601 ns 128191.5 ns 0.95
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 303292 ns 295291 ns 1.03
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 318458 ns 319708 ns 1.00
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 310583.5 ns 247791.5 ns 1.25
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 286500 ns 273792 ns 1.05
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 1028367 ns 1010749 ns 1.02
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/oneAPI 39874880 ns 41903716 ns 0.95
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/AMDGPU 694997 ns 697424 ns 1.00
layernorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/2 thread(s) 13208 ns 13500 ns 0.98
layernorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/4 thread(s) 13209 ns 13209 ns 1
layernorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/8 thread(s) 14416.5 ns 14167 ns 1.02
layernorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/1 thread(s) 12583 ns 13125 ns 0.96
layernorm(2, act=gelu, affine=true)(32 x 32)/forward/GPU/CUDA 137690 ns 136045 ns 1.01
layernorm(2, act=gelu, affine=true)(32 x 32)/forward/GPU/oneAPI 5469791 ns 5580516 ns 0.98
layernorm(2, act=gelu, affine=true)(32 x 32)/forward/GPU/AMDGPU 235293 ns 233743 ns 1.01
layernorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/2 thread(s) 25916.5 ns 26604 ns 0.97
layernorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/4 thread(s) 26042 ns 26187.5 ns 0.99
layernorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/8 thread(s) 27125 ns 26625 ns 1.02
layernorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/1 thread(s) 27750 ns 28208.5 ns 0.98
layernorm(2, act=gelu, affine=true)(32 x 32)/zygote/GPU/CUDA 917440.5 ns 900814 ns 1.02
layernorm(2, act=gelu, affine=true)(32 x 32)/zygote/GPU/oneAPI 41204925.5 ns 41017058 ns 1.00
layernorm(2, act=gelu, affine=true)(32 x 32)/zygote/GPU/AMDGPU 677137 ns 690428 ns 0.98
groupnorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/2 thread(s) 11021.5 ns 12000 ns 0.92
groupnorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/4 thread(s) 12104 ns 11896 ns 1.02
groupnorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/8 thread(s) 12667 ns 12459 ns 1.02
groupnorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/1 thread(s) 11084 ns 10833 ns 1.02
groupnorm(2, act=gelu, affine=true)(32 x 32)/forward/GPU/CUDA 118805.5 ns 117378.5 ns 1.01
groupnorm(2, act=gelu, affine=true)(32 x 32)/forward/GPU/oneAPI 3655788 ns 3507903 ns 1.04
groupnorm(2, act=gelu, affine=true)(32 x 32)/forward/GPU/AMDGPU 238257.5 ns 236503 ns 1.01
groupnorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/2 thread(s) 22625 ns 22417 ns 1.01
groupnorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/4 thread(s) 23354.5 ns 22958 ns 1.02
groupnorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/8 thread(s) 23500 ns 23875 ns 0.98
groupnorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/1 thread(s) 23125 ns 22583 ns 1.02
groupnorm(2, act=gelu, affine=true)(32 x 32)/zygote/GPU/CUDA 678428 ns 660570 ns 1.03
groupnorm(2, act=gelu, affine=true)(32 x 32)/zygote/GPU/oneAPI 21333771 ns 20618992 ns 1.03
groupnorm(2, act=gelu, affine=true)(32 x 32)/zygote/GPU/AMDGPU 679757 ns 675828 ns 1.01
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 66333 ns 64666 ns 1.03
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 64583.5 ns 67083.5 ns 0.96
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 68500 ns 66167 ns 1.04
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 64792 ns 66875 ns 0.97
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/CUDA 101302 ns 100086 ns 1.01
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/oneAPI 3333715 ns 3307399.5 ns 1.01
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/AMDGPU 234893 ns 234107.5 ns 1.00
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 486625 ns 465000 ns 1.05
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 486083 ns 466167 ns 1.04
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 478646 ns 468625 ns 1.02
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 464625 ns 503833 ns 0.92
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 490708 ns 483663 ns 1.01
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/oneAPI 20883033 ns 21199224 ns 0.99
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/AMDGPU 709767 ns 709238 ns 1.00
layernorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/2 thread(s) 7562.5 ns 7562.5 ns 1
layernorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/4 thread(s) 7875 ns 8083 ns 0.97
layernorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/8 thread(s) 8500 ns 8250 ns 1.03
layernorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/1 thread(s) 7292 ns 7083.5 ns 1.03
layernorm(2, act=identity, affine=true)(32 x 32)/forward/GPU/CUDA 136584.5 ns 134375 ns 1.02
layernorm(2, act=identity, affine=true)(32 x 32)/forward/GPU/oneAPI 5472736.5 ns 5976128 ns 0.92
layernorm(2, act=identity, affine=true)(32 x 32)/forward/GPU/AMDGPU 57580 ns 65651 ns 0.88
layernorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/2 thread(s) 14459 ns 14041.5 ns 1.03
layernorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/4 thread(s) 14417 ns 13125 ns 1.10
layernorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/8 thread(s) 14625 ns 14479 ns 1.01
layernorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/1 thread(s) 16625 ns 14625 ns 1.14
layernorm(2, act=identity, affine=true)(32 x 32)/zygote/GPU/CUDA 882666 ns 872555 ns 1.01
layernorm(2, act=identity, affine=true)(32 x 32)/zygote/GPU/oneAPI 38063609 ns 40293875 ns 0.94
layernorm(2, act=identity, affine=true)(32 x 32)/zygote/GPU/AMDGPU 396884 ns 400284 ns 0.99
batchedmm(512, Bsize=4)/forward/CPU/2 thread(s) 6159458 ns 6157812.5 ns 1.00
batchedmm(512, Bsize=4)/forward/CPU/4 thread(s) 3225666 ns 6375333.5 ns 0.51
batchedmm(512, Bsize=4)/forward/CPU/8 thread(s) 3225333 ns 6376917 ns 0.51
batchedmm(512, Bsize=4)/forward/CPU/1 thread(s) 11918958 ns 11913125 ns 1.00
batchedmm(512, Bsize=4)/forward/GPU/CUDA 345241.5 ns 346601.5 ns 1.00
batchedmm(512, Bsize=4)/forward/GPU/oneAPI 49786188 ns 53593217 ns 0.93
batchedmm(512, Bsize=4)/forward/GPU/AMDGPU 301508 ns 320474 ns 0.94
batchedmm(512, Bsize=4)/zygote/CPU/2 thread(s) 19144854.5 ns 19110896 ns 1.00
batchedmm(512, Bsize=4)/zygote/CPU/4 thread(s) 11111958.5 ns 19977396 ns 0.56
batchedmm(512, Bsize=4)/zygote/CPU/8 thread(s) 11126458 ns 19903104 ns 0.56
batchedmm(512, Bsize=4)/zygote/CPU/1 thread(s) 36537562.5 ns 36496187.5 ns 1.00
batchedmm(512, Bsize=4)/zygote/GPU/CUDA 1009913 ns 1012562 ns 1.00
batchedmm(512, Bsize=4)/zygote/GPU/oneAPI 79258291 ns 77852170.5 ns 1.02
batchedmm(512, Bsize=4)/zygote/GPU/AMDGPU 1164436.5 ns 1157544 ns 1.01
dense(2, bias=true, act=gelu)(2 x 128)/forward/CPU/2 thread(s) 1083 ns 1000 ns 1.08
dense(2, bias=true, act=gelu)(2 x 128)/forward/CPU/4 thread(s) 1125 ns 1000 ns 1.13
dense(2, bias=true, act=gelu)(2 x 128)/forward/CPU/8 thread(s) 1042 ns 1042 ns 1
dense(2, bias=true, act=gelu)(2 x 128)/forward/CPU/1 thread(s) 1041 ns 958 ns 1.09
dense(2, bias=true, act=gelu)(2 x 128)/forward/GPU/CUDA 23469 ns 22944 ns 1.02
dense(2, bias=true, act=gelu)(2 x 128)/forward/GPU/oneAPI 2016774.5 ns 2044697 ns 0.99
dense(2, bias=true, act=gelu)(2 x 128)/forward/GPU/AMDGPU 209702 ns 207642 ns 1.01
dense(2, bias=true, act=gelu)(2 x 128)/zygote/CPU/2 thread(s) 4000 ns 3917 ns 1.02
dense(2, bias=true, act=gelu)(2 x 128)/zygote/CPU/4 thread(s) 4000 ns 4000 ns 1
dense(2, bias=true, act=gelu)(2 x 128)/zygote/CPU/8 thread(s) 4000 ns 4042 ns 0.99
dense(2, bias=true, act=gelu)(2 x 128)/zygote/CPU/1 thread(s) 4000 ns 4000 ns 1
dense(2, bias=true, act=gelu)(2 x 128)/zygote/GPU/CUDA 270402 ns 269119 ns 1.00
dense(2, bias=true, act=gelu)(2 x 128)/zygote/GPU/oneAPI 12755226.5 ns 11661739 ns 1.09
dense(2, bias=true, act=gelu)(2 x 128)/zygote/GPU/AMDGPU 624936 ns 625997 ns 1.00
groupnorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/2 thread(s) 7896 ns 8437.5 ns 0.94
groupnorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/4 thread(s) 7624.5 ns 8895.5 ns 0.86
groupnorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/8 thread(s) 9041 ns 9562 ns 0.95
groupnorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/1 thread(s) 8792 ns 8375 ns 1.05
groupnorm(2, act=relu, affine=true)(4 x 32)/forward/GPU/CUDA 116551 ns 113535 ns 1.03
groupnorm(2, act=relu, affine=true)(4 x 32)/forward/GPU/oneAPI 3381124 ns 3443497.5 ns 0.98
groupnorm(2, act=relu, affine=true)(4 x 32)/forward/GPU/AMDGPU 67301 ns 68271 ns 0.99
groupnorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/2 thread(s) 12375 ns 11896 ns 1.04
groupnorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/4 thread(s) 12354.5 ns 12021 ns 1.03
groupnorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/8 thread(s) 13458 ns 12792 ns 1.05
groupnorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/1 thread(s) 11521 ns 12583 ns 0.92
groupnorm(2, act=relu, affine=true)(4 x 32)/zygote/GPU/CUDA 608379 ns 597497 ns 1.02
groupnorm(2, act=relu, affine=true)(4 x 32)/zygote/GPU/oneAPI 23562453 ns 21602127 ns 1.09
groupnorm(2, act=relu, affine=true)(4 x 32)/zygote/GPU/AMDGPU 355544 ns 354444 ns 1.00
dense(2, bias=false, act=relu)(2 x 128)/forward/CPU/2 thread(s) 375 ns 291 ns 1.29
dense(2, bias=false, act=relu)(2 x 128)/forward/CPU/4 thread(s) 333 ns 292 ns 1.14
dense(2, bias=false, act=relu)(2 x 128)/forward/CPU/8 thread(s) 333 ns 333 ns 1
dense(2, bias=false, act=relu)(2 x 128)/forward/CPU/1 thread(s) 333 ns 292 ns 1.14
dense(2, bias=false, act=relu)(2 x 128)/forward/GPU/CUDA 22683 ns 22361 ns 1.01
dense(2, bias=false, act=relu)(2 x 128)/forward/GPU/oneAPI 2066903.5 ns 1916584 ns 1.08
dense(2, bias=false, act=relu)(2 x 128)/forward/GPU/AMDGPU 48621 ns 46890 ns 1.04
dense(2, bias=false, act=relu)(2 x 128)/zygote/CPU/2 thread(s) 2917 ns 3000 ns 0.97
dense(2, bias=false, act=relu)(2 x 128)/zygote/CPU/4 thread(s) 3000 ns 2958 ns 1.01
dense(2, bias=false, act=relu)(2 x 128)/zygote/CPU/8 thread(s) 3458 ns 3333 ns 1.04
dense(2, bias=false, act=relu)(2 x 128)/zygote/CPU/1 thread(s) 2917 ns 2917 ns 1
dense(2, bias=false, act=relu)(2 x 128)/zygote/GPU/CUDA 194883.5 ns 193738 ns 1.01
dense(2, bias=false, act=relu)(2 x 128)/zygote/GPU/oneAPI 9192914 ns 9462333 ns 0.97
dense(2, bias=false, act=relu)(2 x 128)/zygote/GPU/AMDGPU 160881 ns 156212 ns 1.03
groupnorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/2 thread(s) 11833 ns 10583 ns 1.12
groupnorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/4 thread(s) 11771 ns 11875 ns 0.99
groupnorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/8 thread(s) 12666 ns 13083.5 ns 0.97
groupnorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/1 thread(s) 11708 ns 12062.5 ns 0.97
groupnorm(2, act=gelu, affine=false)(32 x 32)/forward/GPU/CUDA 114987 ns 113976.5 ns 1.01
groupnorm(2, act=gelu, affine=false)(32 x 32)/forward/GPU/oneAPI 3433103 ns 3275659 ns 1.05
groupnorm(2, act=gelu, affine=false)(32 x 32)/forward/GPU/AMDGPU 237082 ns 236063 ns 1.00
groupnorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/2 thread(s) 22270.5 ns 21833.5 ns 1.02
groupnorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/4 thread(s) 23625 ns 22145.5 ns 1.07
groupnorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/8 thread(s) 23145.5 ns 23875 ns 0.97
groupnorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/1 thread(s) 22417 ns 22333 ns 1.00
groupnorm(2, act=gelu, affine=false)(32 x 32)/zygote/GPU/CUDA 559620 ns 547934 ns 1.02
groupnorm(2, act=gelu, affine=false)(32 x 32)/zygote/GPU/oneAPI 22188886 ns 20491745 ns 1.08
groupnorm(2, act=gelu, affine=false)(32 x 32)/zygote/GPU/AMDGPU 657467.5 ns 654438 ns 1.00
dense(32, bias=true, act=relu)(32 x 128)/forward/CPU/2 thread(s) 4417 ns 4375 ns 1.01
dense(32, bias=true, act=relu)(32 x 128)/forward/CPU/4 thread(s) 4417 ns 4458 ns 0.99
dense(32, bias=true, act=relu)(32 x 128)/forward/CPU/8 thread(s) 4375 ns 4417 ns 0.99
dense(32, bias=true, act=relu)(32 x 128)/forward/CPU/1 thread(s) 4375 ns 4417 ns 0.99
dense(32, bias=true, act=relu)(32 x 128)/forward/GPU/CUDA 23954 ns 23860 ns 1.00
dense(32, bias=true, act=relu)(32 x 128)/forward/GPU/oneAPI 2173917 ns 2144860.5 ns 1.01
dense(32, bias=true, act=relu)(32 x 128)/forward/GPU/AMDGPU 47821 ns 49061 ns 0.97
dense(32, bias=true, act=relu)(32 x 128)/zygote/CPU/2 thread(s) 16375 ns 16375 ns 1
dense(32, bias=true, act=relu)(32 x 128)/zygote/CPU/4 thread(s) 16375 ns 16666 ns 0.98
dense(32, bias=true, act=relu)(32 x 128)/zygote/CPU/8 thread(s) 16500 ns 16666 ns 0.99
dense(32, bias=true, act=relu)(32 x 128)/zygote/CPU/1 thread(s) 16250 ns 16541 ns 0.98
dense(32, bias=true, act=relu)(32 x 128)/zygote/GPU/CUDA 319321 ns 316685 ns 1.01
dense(32, bias=true, act=relu)(32 x 128)/zygote/GPU/oneAPI 12527061 ns 12062386.5 ns 1.04
dense(32, bias=true, act=relu)(32 x 128)/zygote/GPU/AMDGPU 205182 ns 209243 ns 0.98
batchnorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/2 thread(s) 2209 ns 2000 ns 1.10
batchnorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/4 thread(s) 2208 ns 2084 ns 1.06
batchnorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/8 thread(s) 2209 ns 2209 ns 1
batchnorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/1 thread(s) 2084 ns 2208 ns 0.94
batchnorm(2, act=gelu, affine=false)(32 x 32)/forward/GPU/CUDA 34739 ns 34477 ns 1.01
batchnorm(2, act=gelu, affine=false)(32 x 32)/forward/GPU/oneAPI 1189190 ns 1229094 ns 0.97
batchnorm(2, act=gelu, affine=false)(32 x 32)/forward/GPU/AMDGPU 207283 ns 203202 ns 1.02
batchnorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/2 thread(s) 17729.5 ns 18604 ns 0.95
batchnorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/4 thread(s) 19291.5 ns 18708 ns 1.03
batchnorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/8 thread(s) 19125 ns 18833.5 ns 1.02
batchnorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/1 thread(s) 17500 ns 21208.5 ns 0.83
batchnorm(2, act=gelu, affine=false)(32 x 32)/zygote/GPU/CUDA 284503 ns 282309 ns 1.01
batchnorm(2, act=gelu, affine=false)(32 x 32)/zygote/GPU/oneAPI 23993645 ns 21098361 ns 1.14
batchnorm(2, act=gelu, affine=false)(32 x 32)/zygote/GPU/AMDGPU 683047 ns 686013 ns 1.00
batchedmm(16, Bsize=512)/forward/CPU/2 thread(s) 58771 ns 59292 ns 0.99
batchedmm(16, Bsize=512)/forward/CPU/4 thread(s) 61500 ns 64917 ns 0.95
batchedmm(16, Bsize=512)/forward/CPU/8 thread(s) 62167 ns 66458 ns 0.94
batchedmm(16, Bsize=512)/forward/CPU/1 thread(s) 51041 ns 51625 ns 0.99
batchedmm(16, Bsize=512)/forward/GPU/CUDA 66683 ns 66488 ns 1.00
batchedmm(16, Bsize=512)/forward/GPU/oneAPI 87104215 ns 88258686 ns 0.99
batchedmm(16, Bsize=512)/forward/GPU/AMDGPU 96771 ns 118491 ns 0.82
batchedmm(16, Bsize=512)/zygote/CPU/2 thread(s) 189875 ns 175916.5 ns 1.08
batchedmm(16, Bsize=512)/zygote/CPU/4 thread(s) 148499.5 ns 153479 ns 0.97
batchedmm(16, Bsize=512)/zygote/CPU/8 thread(s) 141104 ns 160333.5 ns 0.88
batchedmm(16, Bsize=512)/zygote/CPU/1 thread(s) 271312 ns 224542 ns 1.21
batchedmm(16, Bsize=512)/zygote/GPU/CUDA 208001 ns 208290.5 ns 1.00
batchedmm(16, Bsize=512)/zygote/GPU/oneAPI 151028820 ns 149475929.5 ns 1.01
batchedmm(16, Bsize=512)/zygote/GPU/AMDGPU 556366 ns 608982 ns 0.91
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 83188 ns 81083 ns 1.03
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 116270.5 ns 83270.5 ns 1.40
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 87667 ns 124833.5 ns 0.70
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 88791 ns 85395.5 ns 1.04
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/GPU/CUDA 190555.5 ns 192029 ns 0.99
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/GPU/oneAPI 5413890 ns 5900244 ns 0.92
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/GPU/AMDGPU 168726.5 ns 202972.5 ns 0.83
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 1885521 ns 1881145.5 ns 1.00
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 1906833 ns 1912667 ns 1.00
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 1922167 ns 1916083 ns 1.00
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 1922208.5 ns 1849250 ns 1.04
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 505315 ns 499932 ns 1.01
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/oneAPI 25531763 ns 26802673 ns 0.95
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/AMDGPU 918625.5 ns 1066872 ns 0.86
dense(2, bias=true, act=identity)(2 x 128)/forward/CPU/2 thread(s) 292 ns 292 ns 1
dense(2, bias=true, act=identity)(2 x 128)/forward/CPU/4 thread(s) 292 ns 292 ns 1
dense(2, bias=true, act=identity)(2 x 128)/forward/CPU/8 thread(s) 333 ns 292 ns 1.14
dense(2, bias=true, act=identity)(2 x 128)/forward/CPU/1 thread(s) 333 ns 292 ns 1.14
dense(2, bias=true, act=identity)(2 x 128)/forward/GPU/CUDA 21748.5 ns 21422.5 ns 1.02
dense(2, bias=true, act=identity)(2 x 128)/forward/GPU/oneAPI 2173850 ns 2063314.5 ns 1.05
dense(2, bias=true, act=identity)(2 x 128)/forward/GPU/AMDGPU 40920 ns 41850 ns 0.98
dense(2, bias=true, act=identity)(2 x 128)/zygote/CPU/2 thread(s) 1834 ns 1792 ns 1.02
dense(2, bias=true, act=identity)(2 x 128)/zygote/CPU/4 thread(s) 1834 ns 1834 ns 1
dense(2, bias=true, act=identity)(2 x 128)/zygote/CPU/8 thread(s) 1833 ns 1834 ns 1.00
dense(2, bias=true, act=identity)(2 x 128)/zygote/CPU/1 thread(s) 1792 ns 1875 ns 0.96
dense(2, bias=true, act=identity)(2 x 128)/zygote/GPU/CUDA 243459 ns 241279.5 ns 1.01
dense(2, bias=true, act=identity)(2 x 128)/zygote/GPU/oneAPI 10131447 ns 9975087 ns 1.02
dense(2, bias=true, act=identity)(2 x 128)/zygote/GPU/AMDGPU 176522 ns 180262 ns 0.98
groupnorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/2 thread(s) 11042 ns 8166 ns 1.35
groupnorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/4 thread(s) 9834 ns 10292 ns 0.96
groupnorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/8 thread(s) 11166.5 ns 11208 ns 1.00
groupnorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/1 thread(s) 9417 ns 11042 ns 0.85
groupnorm(2, act=gelu, affine=false)(4 x 32)/forward/GPU/CUDA 115799 ns 113299.5 ns 1.02
groupnorm(2, act=gelu, affine=false)(4 x 32)/forward/GPU/oneAPI 3304623 ns 3500381.5 ns 0.94
groupnorm(2, act=gelu, affine=false)(4 x 32)/forward/GPU/AMDGPU 235862 ns 233333 ns 1.01
groupnorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/2 thread(s) 9916 ns 9917 ns 1.00
groupnorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/4 thread(s) 11000 ns 9834 ns 1.12
groupnorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/8 thread(s) 10437.5 ns 11458 ns 0.91
groupnorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/1 thread(s) 9625 ns 10417 ns 0.92
groupnorm(2, act=gelu, affine=false)(4 x 32)/zygote/GPU/CUDA 492386 ns 484445 ns 1.02
groupnorm(2, act=gelu, affine=false)(4 x 32)/zygote/GPU/oneAPI 19770322 ns 18749564 ns 1.05
groupnorm(2, act=gelu, affine=false)(4 x 32)/zygote/GPU/AMDGPU 634956.5 ns 627157 ns 1.01
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 58666 ns 58375 ns 1.00
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 39500 ns 47209 ns 0.84
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 39333 ns 46833 ns 0.84
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 83750 ns 82625 ns 1.01
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/GPU/CUDA 38435 ns 38276 ns 1.00
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/GPU/oneAPI 1328229 ns 1341940 ns 0.99
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/GPU/AMDGPU 79261 ns 75211 ns 1.05
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 1932333.5 ns 1836770.5 ns 1.05
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 1949916 ns 1985937.5 ns 0.98
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 1971250 ns 1978479 ns 1.00
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 1900375 ns 1854291.5 ns 1.02
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 211772 ns 209126 ns 1.01
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/oneAPI 38871327.5 ns 33357124 ns 1.17
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/AMDGPU 1010796 ns 1011361 ns 1.00
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 276583 ns 267437.5 ns 1.03
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 268541 ns 270417 ns 0.99
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 270583.5 ns 270625 ns 1.00
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 269542 ns 268604.5 ns 1.00
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/CUDA 196349 ns 193011.5 ns 1.02
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/oneAPI 7271576 ns 7986425 ns 0.91
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/AMDGPU 281833 ns 282544 ns 1.00
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 662208 ns 588125 ns 1.13
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 709250 ns 688229.5 ns 1.03
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 685042 ns 688292 ns 1.00
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 690770.5 ns 593500 ns 1.16
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 994716 ns 985216 ns 1.01
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/oneAPI 45571380 ns 43272459 ns 1.05
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/AMDGPU 902690 ns 911561 ns 0.99
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 2181125 ns 2205542 ns 0.99
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 2197167 ns 2194083.5 ns 1.00
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 2214166 ns 2213708 ns 1.00
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 2217666 ns 2176167 ns 1.02
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/CUDA 156988.5 ns 153511 ns 1.02
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/oneAPI 8339035 ns 8157200 ns 1.02
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/AMDGPU 421825 ns 445380 ns 0.95
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 5477291.5 ns 5508979.5 ns 0.99
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 5530250 ns 5521979 ns 1.00
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 5519334 ns 5474458 ns 1.01
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 5543313 ns 5531895.5 ns 1.00
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 938151 ns 925959.5 ns 1.01
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/oneAPI 53140221 ns 50527002 ns 1.05
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/AMDGPU 1722729 ns 1539832.5 ns 1.12
dense(512, bias=true, act=gelu)(512 x 128)/forward/CPU/2 thread(s) 478167 ns 478666 ns 1.00
dense(512, bias=true, act=gelu)(512 x 128)/forward/CPU/4 thread(s) 257208 ns 346145.5 ns 0.74
dense(512, bias=true, act=gelu)(512 x 128)/forward/CPU/8 thread(s) 257292 ns 346083 ns 0.74
dense(512, bias=true, act=gelu)(512 x 128)/forward/CPU/1 thread(s) 908750 ns 909333 ns 1.00
dense(512, bias=true, act=gelu)(512 x 128)/forward/GPU/CUDA 46532.5 ns 46203 ns 1.01
dense(512, bias=true, act=gelu)(512 x 128)/forward/GPU/oneAPI 824635.5 ns 382606 ns 2.16
dense(512, bias=true, act=gelu)(512 x 128)/forward/GPU/AMDGPU 246353 ns 242913 ns 1.01
dense(512, bias=true, act=gelu)(512 x 128)/zygote/CPU/2 thread(s) 2133375 ns 2111749.5 ns 1.01
dense(512, bias=true, act=gelu)(512 x 128)/zygote/CPU/4 thread(s) 1588083 ns 1861166.5 ns 0.85
dense(512, bias=true, act=gelu)(512 x 128)/zygote/CPU/8 thread(s) 1587417 ns 1866541 ns 0.85
dense(512, bias=true, act=gelu)(512 x 128)/zygote/CPU/1 thread(s) 3041125 ns 3130375 ns 0.97
dense(512, bias=true, act=gelu)(512 x 128)/zygote/GPU/CUDA 256675 ns 258500 ns 0.99
dense(512, bias=true, act=gelu)(512 x 128)/zygote/GPU/oneAPI 12946074 ns 15052922 ns 0.86
dense(512, bias=true, act=gelu)(512 x 128)/zygote/GPU/AMDGPU 775668 ns 773039 ns 1.00
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 58000 ns 58125 ns 1.00
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 39625 ns 46334 ns 0.86
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 39375 ns 46167 ns 0.85
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 83500 ns 82542 ns 1.01
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/CUDA 27930.5 ns 27952 ns 1.00
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/oneAPI 1395751 ns 1310631 ns 1.06
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/AMDGPU 73260 ns 73681 ns 0.99
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 2017271 ns 2039458 ns 0.99
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 2083062.5 ns 2089729.5 ns 1.00
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 2080584 ns 2087020.5 ns 1.00
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 1994312.5 ns 1978124.5 ns 1.01
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 224353 ns 221951 ns 1.01
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/oneAPI 36423844 ns 36802380 ns 0.99
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/AMDGPU 1036751 ns 1041362 ns 1.00
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 58292 ns 58417 ns 1.00
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 39917 ns 46958 ns 0.85
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 39750 ns 46417 ns 0.86
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 83458 ns 82334 ns 1.01
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/GPU/CUDA 48290 ns 47697.5 ns 1.01
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/GPU/oneAPI 796160 ns 816463 ns 0.98
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/GPU/AMDGPU 69781 ns 71371 ns 0.98
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 1920208 ns 1926479.5 ns 1.00
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 1966666.5 ns 1973250 ns 1.00
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 1956354.5 ns 1973167 ns 0.99
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 1892750 ns 1898833 ns 1.00
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 231868 ns 228428 ns 1.02
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/oneAPI 17847168 ns 17513790 ns 1.02
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/AMDGPU 917180 ns 1026836.5 ns 0.89
batchnorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/2 thread(s) 333 ns 292 ns 1.14
batchnorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/4 thread(s) 375 ns 333 ns 1.13
batchnorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/8 thread(s) 375 ns 416 ns 0.90
batchnorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/1 thread(s) 292 ns 334 ns 0.87
batchnorm(2, act=identity, affine=false)(4 x 32)/forward/GPU/CUDA 33423 ns 33167 ns 1.01
batchnorm(2, act=identity, affine=false)(4 x 32)/forward/GPU/oneAPI 1183579 ns 1174385.5 ns 1.01
batchnorm(2, act=identity, affine=false)(4 x 32)/forward/GPU/AMDGPU 47961 ns 48501 ns 0.99
batchnorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/2 thread(s) 6750 ns 6416 ns 1.05
batchnorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/4 thread(s) 6625 ns 6834 ns 0.97
batchnorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/8 thread(s) 6916 ns 7250 ns 0.95
batchnorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/1 thread(s) 6542 ns 6333 ns 1.03
batchnorm(2, act=identity, affine=false)(4 x 32)/zygote/GPU/CUDA 205663 ns 199581 ns 1.03
batchnorm(2, act=identity, affine=false)(4 x 32)/zygote/GPU/oneAPI 20110108 ns 19880225 ns 1.01
batchnorm(2, act=identity, affine=false)(4 x 32)/zygote/GPU/AMDGPU 364303.5 ns 363764 ns 1.00
dense(2, bias=false, act=identity)(2 x 128)/forward/CPU/2 thread(s) 333 ns 250 ns 1.33
dense(2, bias=false, act=identity)(2 x 128)/forward/CPU/4 thread(s) 292 ns 291 ns 1.00
dense(2, bias=false, act=identity)(2 x 128)/forward/CPU/8 thread(s) 292 ns 292 ns 1
dense(2, bias=false, act=identity)(2 x 128)/forward/CPU/1 thread(s) 250 ns 291 ns 0.86
dense(2, bias=false, act=identity)(2 x 128)/forward/GPU/CUDA 31975 ns 32517 ns 0.98
dense(2, bias=false, act=identity)(2 x 128)/forward/GPU/oneAPI 1205451 ns 1265101 ns 0.95
dense(2, bias=false, act=identity)(2 x 128)/forward/GPU/AMDGPU 40370 ns 37771 ns 1.07
dense(2, bias=false, act=identity)(2 x 128)/zygote/CPU/2 thread(s) 3667 ns 3417 ns 1.07
dense(2, bias=false, act=identity)(2 x 128)/zygote/CPU/4 thread(s) 3625 ns 3375 ns 1.07
dense(2, bias=false, act=identity)(2 x 128)/zygote/CPU/8 thread(s) 3209 ns 3167 ns 1.01
dense(2, bias=false, act=identity)(2 x 128)/zygote/CPU/1 thread(s) 3250 ns 2792 ns 1.16
dense(2, bias=false, act=identity)(2 x 128)/zygote/GPU/CUDA 182875 ns 182053 ns 1.00
dense(2, bias=false, act=identity)(2 x 128)/zygote/GPU/oneAPI 7414501 ns 9212477.5 ns 0.80
dense(2, bias=false, act=identity)(2 x 128)/zygote/GPU/AMDGPU 146242 ns 158127 ns 0.92
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 468625 ns 460687.5 ns 1.02
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 492396 ns 478208.5 ns 1.03
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 470250 ns 500000 ns 0.94
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 466354 ns 470937 ns 0.99
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/CUDA 134348 ns 134071 ns 1.00
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/oneAPI 5645574.5 ns 5855749 ns 0.96
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/AMDGPU 349229 ns 366189 ns 0.95
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 4091499.5 ns 4078667 ns 1.00
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 4078417 ns 4067771 ns 1.00
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 4081499.5 ns 4080625 ns 1.00
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 4051646 ns 4056354 ns 1.00
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 673570.5 ns 664164.5 ns 1.01
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/oneAPI 33417417 ns 31731318 ns 1.05
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/AMDGPU 1482381 ns 1467136 ns 1.01
batchedmm(512, Bsize=32)/forward/CPU/2 thread(s) 49972812 ns 49955792 ns 1.00
batchedmm(512, Bsize=32)/forward/CPU/4 thread(s) 26026291 ns 35488958 ns 0.73
batchedmm(512, Bsize=32)/forward/CPU/8 thread(s) 25991500 ns 35531584 ns 0.73
batchedmm(512, Bsize=32)/forward/CPU/1 thread(s) 97072458 ns 97090437.5 ns 1.00
batchedmm(512, Bsize=32)/forward/GPU/CUDA 1599973.5 ns 1601101.5 ns 1.00
batchedmm(512, Bsize=32)/forward/GPU/oneAPI 56493457 ns 55729446 ns 1.01
batchedmm(512, Bsize=32)/forward/GPU/AMDGPU 1057326.5 ns 1044391.5 ns 1.01
batchedmm(512, Bsize=32)/zygote/CPU/2 thread(s) 154932104.5 ns 154677542 ns 1.00
batchedmm(512, Bsize=32)/zygote/CPU/4 thread(s) 89308062.5 ns 112413020.5 ns 0.79
batchedmm(512, Bsize=32)/zygote/CPU/8 thread(s) 88895875 ns 112347584 ns 0.79
batchedmm(512, Bsize=32)/zygote/CPU/1 thread(s) 295925812.5 ns 295444937.5 ns 1.00
batchedmm(512, Bsize=32)/zygote/GPU/CUDA 6475879 ns 6489665.5 ns 1.00
batchedmm(512, Bsize=32)/zygote/GPU/oneAPI 126118434 ns 124609188 ns 1.01
batchedmm(512, Bsize=32)/zygote/GPU/AMDGPU 5578679 ns 5586056.5 ns 1.00
bias_activation(32, act=tanh)(32 x 128)/forward/CPU/2 thread(s) 18917 ns 19520.5 ns 0.97
bias_activation(32, act=tanh)(32 x 128)/forward/CPU/4 thread(s) 16000 ns 17458 ns 0.92
bias_activation(32, act=tanh)(32 x 128)/forward/CPU/8 thread(s) 13708 ns 17417 ns 0.79
bias_activation(32, act=tanh)(32 x 128)/forward/CPU/1 thread(s) 16437.5 ns 15750 ns 1.04
bias_activation(32, act=tanh)(32 x 128)/forward/GPU/CUDA 19926 ns 19821 ns 1.01
bias_activation(32, act=tanh)(32 x 128)/forward/GPU/oneAPI 1208215.5 ns 1180885 ns 1.02
bias_activation(32, act=tanh)(32 x 128)/forward/GPU/AMDGPU 27550 ns 26420 ns 1.04
bias_activation(32, act=tanh)(32 x 128)/zygote/CPU/2 thread(s) 10937 ns 10959 ns 1.00
bias_activation(32, act=tanh)(32 x 128)/zygote/CPU/4 thread(s) 7770.5 ns 9125.5 ns 0.85
bias_activation(32, act=tanh)(32 x 128)/zygote/CPU/8 thread(s) 7708 ns 9084 ns 0.85
bias_activation(32, act=tanh)(32 x 128)/zygote/CPU/1 thread(s) 17291 ns 17167 ns 1.01
bias_activation(32, act=tanh)(32 x 128)/zygote/GPU/CUDA 243495.5 ns 242736.5 ns 1.00
bias_activation(32, act=tanh)(32 x 128)/zygote/GPU/oneAPI 9761127 ns 10064346.5 ns 0.97
bias_activation(32, act=tanh)(32 x 128)/zygote/GPU/AMDGPU 147112 ns 149096.5 ns 0.99
groupnorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/2 thread(s) 8750 ns 8208.5 ns 1.07
groupnorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/4 thread(s) 9708.5 ns 10687.5 ns 0.91
groupnorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/8 thread(s) 10667 ns 10604.5 ns 1.01
groupnorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/1 thread(s) 8646 ns 8959 ns 0.97
groupnorm(2, act=gelu, affine=true)(4 x 32)/forward/GPU/CUDA 119480.5 ns 116211.5 ns 1.03
groupnorm(2, act=gelu, affine=true)(4 x 32)/forward/GPU/oneAPI 3419715 ns 3615906 ns 0.95
groupnorm(2, act=gelu, affine=true)(4 x 32)/forward/GPU/AMDGPU 237342 ns 234342 ns 1.01
groupnorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/2 thread(s) 10312.5 ns 10292 ns 1.00
groupnorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/4 thread(s) 11041 ns 10209 ns 1.08
groupnorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/8 thread(s) 10667 ns 11292 ns 0.94
groupnorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/1 thread(s) 10770.5 ns 9437.5 ns 1.14
groupnorm(2, act=gelu, affine=true)(4 x 32)/zygote/GPU/CUDA 585828 ns 575593 ns 1.02
groupnorm(2, act=gelu, affine=true)(4 x 32)/zygote/GPU/oneAPI 23041802 ns 22955140 ns 1.00
groupnorm(2, act=gelu, affine=true)(4 x 32)/zygote/GPU/AMDGPU 655982 ns 652487 ns 1.01
groupnorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/2 thread(s) 10020.5 ns 9250 ns 1.08
groupnorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/4 thread(s) 9333 ns 9875 ns 0.95
groupnorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/8 thread(s) 10396 ns 11041.5 ns 0.94
groupnorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/1 thread(s) 9500 ns 10333.5 ns 0.92
groupnorm(2, act=relu, affine=false)(32 x 32)/forward/GPU/CUDA 115334.5 ns 113252 ns 1.02
groupnorm(2, act=relu, affine=false)(32 x 32)/forward/GPU/oneAPI 3516489.5 ns 3518835.5 ns 1.00
groupnorm(2, act=relu, affine=false)(32 x 32)/forward/GPU/AMDGPU 70430.5 ns 72611 ns 0.97
groupnorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/2 thread(s) 15292 ns 16542 ns 0.92
groupnorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/4 thread(s) 17375 ns 15833 ns 1.10
groupnorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/8 thread(s) 15542 ns 15750 ns 0.99
groupnorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/1 thread(s) 16250 ns 16708 ns 0.97
groupnorm(2, act=relu, affine=false)(32 x 32)/zygote/GPU/CUDA 558960.5 ns 548922 ns 1.02
groupnorm(2, act=relu, affine=false)(32 x 32)/zygote/GPU/oneAPI 19523647 ns 19847936.5 ns 0.98
groupnorm(2, act=relu, affine=false)(32 x 32)/zygote/GPU/AMDGPU 346234 ns 343724 ns 1.01
batchnorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/2 thread(s) 625 ns 500 ns 1.25
batchnorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/4 thread(s) 584 ns 584 ns 1
batchnorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/8 thread(s) 625 ns 625 ns 1
batchnorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/1 thread(s) 583 ns 584 ns 1.00
batchnorm(2, act=gelu, affine=false)(4 x 32)/forward/GPU/CUDA 33420.5 ns 33202 ns 1.01
batchnorm(2, act=gelu, affine=false)(4 x 32)/forward/GPU/oneAPI 1147462 ns 1238450.5 ns 0.93
batchnorm(2, act=gelu, affine=false)(4 x 32)/forward/GPU/AMDGPU 208233 ns 204092 ns 1.02
batchnorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/2 thread(s) 8875 ns 8916 ns 1.00
batchnorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/4 thread(s) 8917 ns 9292 ns 0.96
batchnorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/8 thread(s) 9375 ns 9958 ns 0.94
batchnorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/1 thread(s) 8125 ns 12292 ns 0.66
batchnorm(2, act=gelu, affine=false)(4 x 32)/zygote/GPU/CUDA 223663.5 ns 220226.5 ns 1.02
batchnorm(2, act=gelu, affine=false)(4 x 32)/zygote/GPU/oneAPI 20525508 ns 21905010 ns 0.94
batchnorm(2, act=gelu, affine=false)(4 x 32)/zygote/GPU/AMDGPU 660067.5 ns 657387 ns 1.00
bias_activation(32, act=gelu)(32 x 128)/forward/CPU/2 thread(s) 15833 ns 17625 ns 0.90
bias_activation(32, act=gelu)(32 x 128)/forward/CPU/4 thread(s) 14958 ns 15958 ns 0.94
bias_activation(32, act=gelu)(32 x 128)/forward/CPU/8 thread(s) 13166.5 ns 15209 ns 0.87
bias_activation(32, act=gelu)(32 x 128)/forward/CPU/1 thread(s) 12042 ns 11291 ns 1.07
bias_activation(32, act=gelu)(32 x 128)/forward/GPU/CUDA 20351 ns 19970 ns 1.02
bias_activation(32, act=gelu)(32 x 128)/forward/GPU/oneAPI 1180410 ns 1162812 ns 1.02
bias_activation(32, act=gelu)(32 x 128)/forward/GPU/AMDGPU 188642 ns 188782 ns 1.00
bias_activation(32, act=gelu)(32 x 128)/zygote/CPU/2 thread(s) 35334 ns 35458 ns 1.00
bias_activation(32, act=gelu)(32 x 128)/zygote/CPU/4 thread(s) 35396 ns 35562 ns 1.00
bias_activation(32, act=gelu)(32 x 128)/zygote/CPU/8 thread(s) 35354.5 ns 35645.5 ns 0.99
bias_activation(32, act=gelu)(32 x 128)/zygote/CPU/1 thread(s) 35459 ns 35542 ns 1.00
bias_activation(32, act=gelu)(32 x 128)/zygote/GPU/CUDA 258908.5 ns 255892 ns 1.01
bias_activation(32, act=gelu)(32 x 128)/zygote/GPU/oneAPI 11202018 ns 10845224.5 ns 1.03
bias_activation(32, act=gelu)(32 x 128)/zygote/GPU/AMDGPU 593676 ns 591957 ns 1.00
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 453584 ns 448958 ns 1.01
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 448854.5 ns 453750 ns 0.99
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 458979 ns 492166 ns 0.93
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 463708 ns 453875 ns 1.02
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/CUDA 194627 ns 193846 ns 1.00
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/oneAPI 5838406 ns 6007739 ns 0.97
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/AMDGPU 361629 ns 367744 ns 0.98
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 4069291 ns 4069208 ns 1.00
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 4057666 ns 4054291.5 ns 1.00
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 4066166.5 ns 4049270.5 ns 1.00
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 4041000 ns 4057500 ns 1.00
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 509044 ns 505408 ns 1.01
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/oneAPI 28858353.5 ns 37330396 ns 0.77
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/AMDGPU 1369935 ns 1362695 ns 1.01
batchedmm(512, Bsize=512)/forward/CPU/2 thread(s) 786136291 ns 779601166 ns 1.01
batchedmm(512, Bsize=512)/forward/CPU/4 thread(s) 416023146 ns 542496166 ns 0.77
batchedmm(512, Bsize=512)/forward/CPU/8 thread(s) 416822792 ns 539989666 ns 0.77
batchedmm(512, Bsize=512)/forward/CPU/1 thread(s) 1513689687.5 ns 1569938708 ns 0.96
batchedmm(512, Bsize=512)/forward/GPU/CUDA 22552578.5 ns 22536712.5 ns 1.00
batchedmm(512, Bsize=512)/forward/GPU/oneAPI 184723934 ns 187859757.5 ns 0.98
batchedmm(512, Bsize=512)/forward/GPU/AMDGPU 14622705 ns 14732780 ns 0.99
batchedmm(512, Bsize=512)/zygote/CPU/2 thread(s) 2527797917 ns 2505560125 ns 1.01
batchedmm(512, Bsize=512)/zygote/CPU/4 thread(s) 1507508250 ns 1783555333 ns 0.85
batchedmm(512, Bsize=512)/zygote/CPU/8 thread(s) 1513719042 ns 1792629375 ns 0.84
batchedmm(512, Bsize=512)/zygote/CPU/1 thread(s) 4744640792 ns 5216869375 ns 0.91
batchedmm(512, Bsize=512)/zygote/GPU/CUDA 119636395 ns 118336848 ns 1.01
batchedmm(512, Bsize=512)/zygote/GPU/oneAPI 977281580 ns 935397218 ns 1.04
batchedmm(512, Bsize=512)/zygote/GPU/AMDGPU 87882829 ns 88936600 ns 0.99
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 78083.5 ns 78854.5 ns 0.99
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 79375 ns 76791 ns 1.03
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 79292 ns 79000 ns 1.00
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 77417 ns 79354 ns 0.98
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/GPU/CUDA 195081 ns 190682.5 ns 1.02
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/GPU/oneAPI 5838980 ns 5473671 ns 1.07
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/GPU/AMDGPU 106236.5 ns 108351 ns 0.98
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 291584 ns 294125 ns 0.99
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 232333.5 ns 289958 ns 0.80
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 275646 ns 261417 ns 1.05
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 268875 ns 238520.5 ns 1.13
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 999623 ns 986968 ns 1.01
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/oneAPI 45105321.5 ns 46526863 ns 0.97
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/AMDGPU 637827 ns 636237 ns 1.00
batchedmm(512, Bsize=128)/forward/CPU/2 thread(s) 199983542 ns 199699479 ns 1.00
batchedmm(512, Bsize=128)/forward/CPU/4 thread(s) 103920208 ns 139060584 ns 0.75
batchedmm(512, Bsize=128)/forward/CPU/8 thread(s) 103978083 ns 139030750 ns 0.75
batchedmm(512, Bsize=128)/forward/CPU/1 thread(s) 389299042 ns 388620875 ns 1.00
batchedmm(512, Bsize=128)/forward/GPU/CUDA 5843844.5 ns 5814292 ns 1.01
batchedmm(512, Bsize=128)/forward/GPU/oneAPI 83097022 ns 80005938 ns 1.04
batchedmm(512, Bsize=128)/forward/GPU/AMDGPU 3606828 ns 3574368 ns 1.01
batchedmm(512, Bsize=128)/zygote/CPU/2 thread(s) 620238542 ns 621021958 ns 1.00
batchedmm(512, Bsize=128)/zygote/CPU/4 thread(s) 353393416.5 ns 439183125 ns 0.80
batchedmm(512, Bsize=128)/zygote/CPU/8 thread(s) 352881646 ns 439329875 ns 0.80
batchedmm(512, Bsize=128)/zygote/CPU/1 thread(s) 1193561791 ns 1194801708 ns 1.00
batchedmm(512, Bsize=128)/zygote/GPU/CUDA 26518526 ns 26594102 ns 1.00
batchedmm(512, Bsize=128)/zygote/GPU/oneAPI 284158390 ns 295168041 ns 0.96
batchedmm(512, Bsize=128)/zygote/GPU/AMDGPU 22094133 ns 22131978 ns 1.00
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 7250 ns 7291 ns 0.99
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 5375 ns 6291 ns 0.85
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 5375 ns 6125 ns 0.88
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 9875 ns 9959 ns 0.99
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/GPU/CUDA 26733.5 ns 26445 ns 1.01
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/GPU/oneAPI 1222239 ns 1270170 ns 0.96
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/GPU/AMDGPU 46490 ns 48281 ns 0.96
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 220979 ns 216209 ns 1.02
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 224417 ns 220416.5 ns 1.02
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 223500 ns 222625 ns 1.00
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 207583 ns 219125 ns 0.95
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 215495 ns 214078.5 ns 1.01
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/oneAPI 20243976 ns 29452909 ns 0.69
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/AMDGPU 519876 ns 522765 ns 0.99
groupnorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/2 thread(s) 10312.5 ns 9416.5 ns 1.10
groupnorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/4 thread(s) 9479 ns 9041 ns 1.05
groupnorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/8 thread(s) 9895.5 ns 9833.5 ns 1.01
groupnorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/1 thread(s) 9937.5 ns 10791.5 ns 0.92
groupnorm(2, act=relu, affine=false)(4 x 32)/forward/GPU/CUDA 113347 ns 110026 ns 1.03
groupnorm(2, act=relu, affine=false)(4 x 32)/forward/GPU/oneAPI 3285929 ns 3375913.5 ns 0.97
groupnorm(2, act=relu, affine=false)(4 x 32)/forward/GPU/AMDGPU 71090 ns 72150 ns 0.99
groupnorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/2 thread(s) 9604 ns 10854.5 ns 0.88
groupnorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/4 thread(s) 11437.5 ns 7750 ns 1.48
groupnorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/8 thread(s) 10042 ns 9708 ns 1.03
groupnorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/1 thread(s) 10145.5 ns 11250 ns 0.90
groupnorm(2, act=relu, affine=false)(4 x 32)/zygote/GPU/CUDA 491382 ns 484552.5 ns 1.01
groupnorm(2, act=relu, affine=false)(4 x 32)/zygote/GPU/oneAPI 19298925 ns 18934737 ns 1.02
groupnorm(2, act=relu, affine=false)(4 x 32)/zygote/GPU/AMDGPU 314464 ns 313639 ns 1.00
batchnorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/2 thread(s) 708 ns 459 ns 1.54
batchnorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/4 thread(s) 709 ns 500 ns 1.42
batchnorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/8 thread(s) 583 ns 542 ns 1.08
batchnorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/1 thread(s) 542 ns 667 ns 0.81
batchnorm(2, act=identity, affine=true)(32 x 32)/forward/GPU/CUDA 24930.5 ns 24574 ns 1.01
batchnorm(2, act=identity, affine=true)(32 x 32)/forward/GPU/oneAPI 1200126 ns 1221655.5 ns 0.98
batchnorm(2, act=identity, affine=true)(32 x 32)/forward/GPU/AMDGPU 48911 ns 46721 ns 1.05
batchnorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/2 thread(s) 12375 ns 12292 ns 1.01
batchnorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/4 thread(s) 14958 ns 8896 ns 1.68
batchnorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/8 thread(s) 9000 ns 10583 ns 0.85
batchnorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/1 thread(s) 9666 ns 13666 ns 0.71
batchnorm(2, act=identity, affine=true)(32 x 32)/zygote/GPU/CUDA 246496 ns 243553 ns 1.01
batchnorm(2, act=identity, affine=true)(32 x 32)/zygote/GPU/oneAPI 26158764.5 ns 23025269 ns 1.14
batchnorm(2, act=identity, affine=true)(32 x 32)/zygote/GPU/AMDGPU 386995 ns 386704 ns 1.00
bias_activation(512, act=gelu)(512 x 128)/forward/CPU/2 thread(s) 110750 ns 111083 ns 1.00
bias_activation(512, act=gelu)(512 x 128)/forward/CPU/4 thread(s) 90417 ns 102541.5 ns 0.88
bias_activation(512, act=gelu)(512 x 128)/forward/CPU/8 thread(s) 88125 ns 103792 ns 0.85
bias_activation(512, act=gelu)(512 x 128)/forward/CPU/1 thread(s) 155146 ns 155083.5 ns 1.00
bias_activation(512, act=gelu)(512 x 128)/forward/GPU/CUDA 23300 ns 22624 ns 1.03
bias_activation(512, act=gelu)(512 x 128)/forward/GPU/oneAPI 807719.5 ns 791164 ns 1.02
bias_activation(512, act=gelu)(512 x 128)/forward/GPU/AMDGPU 190702 ns 191512 ns 1.00
bias_activation(512, act=gelu)(512 x 128)/zygote/CPU/2 thread(s) 534625 ns 567500 ns 0.94
bias_activation(512, act=gelu)(512 x 128)/zygote/CPU/4 thread(s) 562249.5 ns 573417 ns 0.98
bias_activation(512, act=gelu)(512 x 128)/zygote/CPU/8 thread(s) 542812.5 ns 549583.5 ns 0.99
bias_activation(512, act=gelu)(512 x 128)/zygote/CPU/1 thread(s) 535250 ns 537292 ns 1.00
bias_activation(512, act=gelu)(512 x 128)/zygote/GPU/CUDA 217557.5 ns 213930 ns 1.02
bias_activation(512, act=gelu)(512 x 128)/zygote/GPU/oneAPI 11916876 ns 11700863.5 ns 1.02
bias_activation(512, act=gelu)(512 x 128)/zygote/GPU/AMDGPU 610017 ns 608337 ns 1.00
batchedmm(16, Bsize=32)/forward/CPU/2 thread(s) 5375 ns 5750 ns 0.93
batchedmm(16, Bsize=32)/forward/CPU/4 thread(s) 6709 ns 5167 ns 1.30
batchedmm(16, Bsize=32)/forward/CPU/8 thread(s) 7375 ns 7667 ns 0.96
batchedmm(16, Bsize=32)/forward/CPU/1 thread(s) 6520.5 ns 4563 ns 1.43
batchedmm(16, Bsize=32)/forward/GPU/CUDA 17156 ns 16559 ns 1.04
batchedmm(16, Bsize=32)/forward/GPU/oneAPI 73303180 ns 73950991 ns 0.99
batchedmm(16, Bsize=32)/forward/GPU/AMDGPU 71171 ns 80275.5 ns 0.89
batchedmm(16, Bsize=32)/zygote/CPU/2 thread(s) 12833 ns 11958 ns 1.07
batchedmm(16, Bsize=32)/zygote/CPU/4 thread(s) 11375 ns 10750 ns 1.06
batchedmm(16, Bsize=32)/zygote/CPU/8 thread(s) 10145.5 ns 11583 ns 0.88
batchedmm(16, Bsize=32)/zygote/CPU/1 thread(s) 16708.5 ns 18167 ns 0.92
batchedmm(16, Bsize=32)/zygote/GPU/CUDA 204040 ns 203546 ns 1.00
batchedmm(16, Bsize=32)/zygote/GPU/oneAPI 100040684 ns 98437217 ns 1.02
batchedmm(16, Bsize=32)/zygote/GPU/AMDGPU 364443 ns 367244 ns 0.99
batchedmm(16, Bsize=128)/forward/CPU/2 thread(s) 38834 ns 38958 ns 1.00
batchedmm(16, Bsize=128)/forward/CPU/4 thread(s) 50542 ns 51125 ns 0.99
batchedmm(16, Bsize=128)/forward/CPU/8 thread(s) 51417 ns 52458 ns 0.98
batchedmm(16, Bsize=128)/forward/CPU/1 thread(s) 13854.5 ns 13770.5 ns 1.01
batchedmm(16, Bsize=128)/forward/GPU/CUDA 21940 ns 20666.5 ns 1.06
batchedmm(16, Bsize=128)/forward/GPU/oneAPI 77344300 ns 77382892 ns 1.00
batchedmm(16, Bsize=128)/forward/GPU/AMDGPU 84996 ns 87361 ns 0.97
batchedmm(16, Bsize=128)/zygote/CPU/2 thread(s) 36917 ns 36416 ns 1.01
batchedmm(16, Bsize=128)/zygote/CPU/4 thread(s) 31042 ns 30770.5 ns 1.01
batchedmm(16, Bsize=128)/zygote/CPU/8 thread(s) 28125 ns 35250 ns 0.80
batchedmm(16, Bsize=128)/zygote/CPU/1 thread(s) 77979.5 ns 58812.5 ns 1.33
batchedmm(16, Bsize=128)/zygote/GPU/CUDA 180753 ns 180756 ns 1.00
batchedmm(16, Bsize=128)/zygote/GPU/oneAPI 114626839.5 ns 110794943 ns 1.03
batchedmm(16, Bsize=128)/zygote/GPU/AMDGPU 397599 ns 408754 ns 0.97
bias_activation(2, act=tanh)(2 x 128)/forward/CPU/2 thread(s) 1854.5 ns 1750 ns 1.06
bias_activation(2, act=tanh)(2 x 128)/forward/CPU/4 thread(s) 1958 ns 1875 ns 1.04
bias_activation(2, act=tanh)(2 x 128)/forward/CPU/8 thread(s) 2209 ns 2125 ns 1.04
bias_activation(2, act=tanh)(2 x 128)/forward/CPU/1 thread(s) 1666.5 ns 1833 ns 0.91
bias_activation(2, act=tanh)(2 x 128)/forward/GPU/CUDA 19375 ns 19320 ns 1.00
bias_activation(2, act=tanh)(2 x 128)/forward/GPU/oneAPI 1256628 ns 1202099 ns 1.05
bias_activation(2, act=tanh)(2 x 128)/forward/GPU/AMDGPU 27490 ns 33080 ns 0.83
bias_activation(2, act=tanh)(2 x 128)/zygote/CPU/2 thread(s) 2208 ns 2395.5 ns 0.92
bias_activation(2, act=tanh)(2 x 128)/zygote/CPU/4 thread(s) 2167 ns 2333 ns 0.93
bias_activation(2, act=tanh)(2 x 128)/zygote/CPU/8 thread(s) 2416 ns 2375 ns 1.02
bias_activation(2, act=tanh)(2 x 128)/zygote/CPU/1 thread(s) 2125 ns 2375 ns 0.89
bias_activation(2, act=tanh)(2 x 128)/zygote/GPU/CUDA 194356 ns 193868.5 ns 1.00
bias_activation(2, act=tanh)(2 x 128)/zygote/GPU/oneAPI 9435961.5 ns 9197836 ns 1.03
bias_activation(2, act=tanh)(2 x 128)/zygote/GPU/AMDGPU 136311 ns 137016.5 ns 0.99
layernorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/2 thread(s) 5166.5 ns 5292 ns 0.98
layernorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/4 thread(s) 5520.5 ns 5750 ns 0.96
layernorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/8 thread(s) 6396 ns 6208 ns 1.03
layernorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/1 thread(s) 5187.5 ns 5958.5 ns 0.87
layernorm(2, act=identity, affine=true)(4 x 32)/forward/GPU/CUDA 140899.5 ns 139204.5 ns 1.01
layernorm(2, act=identity, affine=true)(4 x 32)/forward/GPU/oneAPI 5837469 ns 5728892 ns 1.02
layernorm(2, act=identity, affine=true)(4 x 32)/forward/GPU/AMDGPU 57270 ns 69071 ns 0.83
layernorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/2 thread(s) 9020.5 ns 8667 ns 1.04
layernorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/4 thread(s) 9437.5 ns 8625 ns 1.09
layernorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/8 thread(s) 8583 ns 8792 ns 0.98
layernorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/1 thread(s) 8417 ns 9145.5 ns 0.92
layernorm(2, act=identity, affine=true)(4 x 32)/zygote/GPU/CUDA 815402.5 ns 809144 ns 1.01
layernorm(2, act=identity, affine=true)(4 x 32)/zygote/GPU/oneAPI 41500822.5 ns 39925856 ns 1.04
layernorm(2, act=identity, affine=true)(4 x 32)/zygote/GPU/AMDGPU 388544 ns 387074 ns 1.00
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 55083 ns 55125 ns 1.00
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 54292 ns 55958 ns 0.97
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 54375 ns 56042 ns 0.97
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 56417 ns 56208 ns 1.00
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/CUDA 36794 ns 35813.5 ns 1.03
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/oneAPI 1165062 ns 1246242 ns 0.93
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/AMDGPU 206892 ns 202752 ns 1.02
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 478792 ns 489125 ns 0.98
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 535375 ns 532541.5 ns 1.01
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 496937 ns 505645.5 ns 0.98
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 474395.5 ns 470521 ns 1.01
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 257604 ns 253767 ns 1.02
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/oneAPI 27299962 ns 26667416 ns 1.02
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/AMDGPU 810628 ns 833929 ns 0.97
batchedmm(128, Bsize=128)/forward/CPU/2 thread(s) 3331771 ns 3319083.5 ns 1.00
batchedmm(128, Bsize=128)/forward/CPU/4 thread(s) 1763000 ns 2337292 ns 0.75
batchedmm(128, Bsize=128)/forward/CPU/8 thread(s) 1769417 ns 2337917 ns 0.76
batchedmm(128, Bsize=128)/forward/CPU/1 thread(s) 6317646 ns 6313500 ns 1.00
batchedmm(128, Bsize=128)/forward/GPU/CUDA 204848.5 ns 204383 ns 1.00
batchedmm(128, Bsize=128)/forward/GPU/oneAPI 81259709 ns 80623182 ns 1.01
batchedmm(128, Bsize=128)/forward/GPU/AMDGPU 209783 ns 213737 ns 0.98
batchedmm(128, Bsize=128)/zygote/CPU/2 thread(s) 11521375.5 ns 11497229 ns 1.00
batchedmm(128, Bsize=128)/zygote/CPU/4 thread(s) 6550500 ns 8328208.5 ns 0.79
batchedmm(128, Bsize=128)/zygote/CPU/8 thread(s) 6561792 ns 8338541.5 ns 0.79
batchedmm(128, Bsize=128)/zygote/CPU/1 thread(s) 21242604 ns 21078124.5 ns 1.01
batchedmm(128, Bsize=128)/zygote/GPU/CUDA 741852 ns 737191.5 ns 1.01
batchedmm(128, Bsize=128)/zygote/GPU/oneAPI 122682886.5 ns 126245472 ns 0.97
batchedmm(128, Bsize=128)/zygote/GPU/AMDGPU 1060031 ns 1058001 ns 1.00
layernorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/2 thread(s) 6292 ns 4750 ns 1.32
layernorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/4 thread(s) 5666 ns 6875 ns 0.82
layernorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/8 thread(s) 7042 ns 6874.5 ns 1.02
layernorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/1 thread(s) 5209 ns 6791.5 ns 0.77
layernorm(2, act=identity, affine=false)(4 x 32)/forward/GPU/CUDA 132073.5 ns 130168 ns 1.01
layernorm(2, act=identity, affine=false)(4 x 32)/forward/GPU/oneAPI 5592627 ns 5745155 ns 0.97
layernorm(2, act=identity, affine=false)(4 x 32)/forward/GPU/AMDGPU 54021 ns 56791 ns 0.95
layernorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/2 thread(s) 10375 ns 7333 ns 1.41
layernorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/4 thread(s) 9584 ns 7312.5 ns 1.31
layernorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/8 thread(s) 7417 ns 9042 ns 0.82
layernorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/1 thread(s) 7667 ns 7375 ns 1.04
layernorm(2, act=identity, affine=false)(4 x 32)/zygote/GPU/CUDA 718413.5 ns 712471 ns 1.01
layernorm(2, act=identity, affine=false)(4 x 32)/zygote/GPU/oneAPI 35185062 ns 35913527 ns 0.98
layernorm(2, act=identity, affine=false)(4 x 32)/zygote/GPU/AMDGPU 375894 ns 368228.5 ns 1.02
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 144542 ns 150042 ns 0.96
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 124479.5 ns 93750 ns 1.33
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 101625 ns 126666 ns 0.80
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 150583 ns 97708 ns 1.54
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/CUDA 148583.5 ns 148678 ns 1.00
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/oneAPI 6180627 ns 5748457.5 ns 1.08
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/AMDGPU 182281 ns 203522 ns 0.90
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 2030666.5 ns 2036375 ns 1.00
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 2034833.5 ns 2027000.5 ns 1.00
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 2034166.5 ns 2032104 ns 1.00
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 2024125 ns 2023625 ns 1.00
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 674148 ns 663877 ns 1.02
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/oneAPI 31102723.5 ns 33751272 ns 0.92
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/AMDGPU 1114502 ns 1110211 ns 1.00
batchedmm(2, Bsize=4)/forward/CPU/2 thread(s) 32917 ns 34208 ns 0.96
batchedmm(2, Bsize=4)/forward/CPU/4 thread(s) 35208 ns 36458 ns 0.97
batchedmm(2, Bsize=4)/forward/CPU/8 thread(s) 33334 ns 36083 ns 0.92
batchedmm(2, Bsize=4)/forward/CPU/1 thread(s) 645.5 ns 708 ns 0.91
batchedmm(2, Bsize=4)/forward/GPU/CUDA 15722 ns 15530 ns 1.01
batchedmm(2, Bsize=4)/forward/GPU/oneAPI 72749276 ns 73822262.5 ns 0.99
batchedmm(2, Bsize=4)/forward/GPU/AMDGPU 79041 ns 78911 ns 1.00
batchedmm(2, Bsize=4)/zygote/CPU/2 thread(s) 3208 ns 2542 ns 1.26
batchedmm(2, Bsize=4)/zygote/CPU/4 thread(s) 3958 ns 2833.5 ns 1.40
batchedmm(2, Bsize=4)/zygote/CPU/8 thread(s) 3084 ns 3500 ns 0.88
batchedmm(2, Bsize=4)/zygote/CPU/1 thread(s) 2333 ns 2209 ns 1.06
batchedmm(2, Bsize=4)/zygote/GPU/CUDA 136962.5 ns 136004.5 ns 1.01
batchedmm(2, Bsize=4)/zygote/GPU/oneAPI 93442257 ns 93721653 ns 1.00
batchedmm(2, Bsize=4)/zygote/GPU/AMDGPU 340914 ns 339263 ns 1.00
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 7292 ns 7250 ns 1.01
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 5417 ns 6084 ns 0.89
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 5333 ns 6083 ns 0.88
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 10208 ns 10125 ns 1.01
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/GPU/CUDA 35974 ns 35188 ns 1.02
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/GPU/oneAPI 1275721 ns 1207919 ns 1.06
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/GPU/AMDGPU 50280 ns 48090 ns 1.05
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 215209 ns 244041.5 ns 0.88
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 228896 ns 227416.5 ns 1.01
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 220729.5 ns 224625 ns 0.98
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 205917 ns 216417 ns 0.95
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 240303 ns 236066 ns 1.02
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/oneAPI 26463549 ns 28254417 ns 0.94
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/AMDGPU 519340 ns 573176 ns 0.91
dense(32, bias=true, act=identity)(32 x 128)/forward/CPU/2 thread(s) 3917 ns 3917 ns 1
dense(32, bias=true, act=identity)(32 x 128)/forward/CPU/4 thread(s) 3958 ns 3958 ns 1
dense(32, bias=true, act=identity)(32 x 128)/forward/CPU/8 thread(s) 3958 ns 3958 ns 1
dense(32, bias=true, act=identity)(32 x 128)/forward/CPU/1 thread(s) 3958 ns 3917 ns 1.01
dense(32, bias=true, act=identity)(32 x 128)/forward/GPU/CUDA 21966 ns 21615 ns 1.02
dense(32, bias=true, act=identity)(32 x 128)/forward/GPU/oneAPI 2150661 ns 2072507 ns 1.04
dense(32, bias=true, act=identity)(32 x 128)/forward/GPU/AMDGPU 42521 ns 42031 ns 1.01
dense(32, bias=true, act=identity)(32 x 128)/zygote/CPU/2 thread(s) 14709 ns 14666 ns 1.00
dense(32, bias=true, act=identity)(32 x 128)/zygote/CPU/4 thread(s) 14792 ns 14958 ns 0.99
dense(32, bias=true, act=identity)(32 x 128)/zygote/CPU/8 thread(s) 14834 ns 14916.5 ns 0.99
dense(32, bias=true, act=identity)(32 x 128)/zygote/CPU/1 thread(s) 14708 ns 14917 ns 0.99
dense(32, bias=true, act=identity)(32 x 128)/zygote/GPU/CUDA 299460 ns 297040 ns 1.01
dense(32, bias=true, act=identity)(32 x 128)/zygote/GPU/oneAPI 11041214 ns 11259133 ns 0.98
dense(32, bias=true, act=identity)(32 x 128)/zygote/GPU/AMDGPU 188891.5 ns 188487 ns 1.00
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 128584 ns 120896 ns 1.06
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 128208 ns 103687.5 ns 1.24
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 106604 ns 130792 ns 0.82
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 119354 ns 100583 ns 1.19
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/GPU/CUDA 132553 ns 149147 ns 0.89
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/GPU/oneAPI 5647153 ns 6201158 ns 0.91
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/GPU/AMDGPU 183902 ns 204362 ns 0.90
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 1924833.5 ns 1925625 ns 1.00
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 1932167 ns 1922584 ns 1.00
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 1926479 ns 1924687.5 ns 1.00
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 1925542 ns 1918000 ns 1.00
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 662628 ns 656144 ns 1.01
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/oneAPI 31206115.5 ns 29883253.5 ns 1.04
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/AMDGPU 1065881 ns 1218242.5 ns 0.87
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 17958 ns 19042 ns 0.94
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 18625 ns 21375 ns 0.87
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 20812 ns 20375 ns 1.02
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 19584 ns 18625 ns 1.05
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/CUDA 104706.5 ns 102936.5 ns 1.02
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/oneAPI 3361468 ns 3227536 ns 1.04
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/AMDGPU 81176 ns 80560.5 ns 1.01
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 217417 ns 216541.5 ns 1.00
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 265209 ns 239771 ns 1.11
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 222291 ns 223709 ns 0.99
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 222917 ns 247021 ns 0.90
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 497576 ns 493608 ns 1.01
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/oneAPI 19289828.5 ns 19781802 ns 0.98
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/AMDGPU 466715 ns 479335 ns 0.97
batchedmm(16, Bsize=4)/forward/CPU/2 thread(s) 24687 ns 24792 ns 1.00
batchedmm(16, Bsize=4)/forward/CPU/4 thread(s) 29083 ns 30625 ns 0.95
batchedmm(16, Bsize=4)/forward/CPU/8 thread(s) 27250 ns 29334 ns 0.93
batchedmm(16, Bsize=4)/forward/CPU/1 thread(s) 1417 ns 1291 ns 1.10
batchedmm(16, Bsize=4)/forward/GPU/CUDA 16449.5 ns 15803 ns 1.04
batchedmm(16, Bsize=4)/forward/GPU/oneAPI 73049342 ns 73776659 ns 0.99
batchedmm(16, Bsize=4)/forward/GPU/AMDGPU 80571 ns 81571 ns 0.99
batchedmm(16, Bsize=4)/zygote/CPU/2 thread(s) 4729.5 ns 4770.5 ns 0.99
batchedmm(16, Bsize=4)/zygote/CPU/4 thread(s) 5917 ns 5125 ns 1.15
batchedmm(16, Bsize=4)/zygote/CPU/8 thread(s) 5459 ns 5396 ns 1.01
batchedmm(16, Bsize=4)/zygote/CPU/1 thread(s) 4875 ns 4500 ns 1.08
batchedmm(16, Bsize=4)/zygote/GPU/CUDA 201398 ns 200310 ns 1.01
batchedmm(16, Bsize=4)/zygote/GPU/oneAPI 95411769 ns 93984240 ns 1.02
batchedmm(16, Bsize=4)/zygote/GPU/AMDGPU 373024 ns 379744 ns 0.98
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 223084 ns 226292 ns 0.99
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 223479.5 ns 223000 ns 1.00
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 225458.5 ns 224167 ns 1.01
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 222541 ns 224042 ns 0.99
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/CUDA 220423 ns 218225 ns 1.01
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/oneAPI 7765235 ns 7741481.5 ns 1.00
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/AMDGPU 274373 ns 274597.5 ns 1.00
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 497687.5 ns 510000 ns 0.98
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 497958 ns 507812.5 ns 0.98
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 501646 ns 554604 ns 0.90
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 507125 ns 496958 ns 1.02
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 1033721 ns 1023666.5 ns 1.01
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/oneAPI 42343394 ns 43947953 ns 0.96
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/AMDGPU 858214 ns 871339 ns 0.98
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 20625 ns 21084 ns 0.98
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 22500 ns 19896 ns 1.13
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 21791 ns 21792 ns 1.00
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 20042 ns 20375 ns 0.98
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/GPU/CUDA 112240 ns 110346 ns 1.02
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/GPU/oneAPI 3510862 ns 3666139 ns 0.96
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/GPU/AMDGPU 77390 ns 79030 ns 0.98
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 213084 ns 212625 ns 1.00
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 218104.5 ns 220750 ns 0.99
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 219292 ns 216750 ns 1.01
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 217125 ns 215292 ns 1.01
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 716111 ns 706350 ns 1.01
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/oneAPI 24489616 ns 24785967 ns 0.99
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/AMDGPU 532795 ns 536195 ns 0.99
layernorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/2 thread(s) 6708 ns 6292 ns 1.07
layernorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/4 thread(s) 7416 ns 6500 ns 1.14
layernorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/8 thread(s) 8166 ns 8667 ns 0.94
layernorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/1 thread(s) 6791 ns 7083 ns 0.96
layernorm(2, act=identity, affine=false)(32 x 32)/forward/GPU/CUDA 133925.5 ns 133065.5 ns 1.01
layernorm(2, act=identity, affine=false)(32 x 32)/forward/GPU/oneAPI 5628647.5 ns 5772071 ns 0.98
layernorm(2, act=identity, affine=false)(32 x 32)/forward/GPU/AMDGPU 65140 ns 65491 ns 0.99
layernorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/2 thread(s) 9709 ns 10416 ns 0.93
layernorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/4 thread(s) 12458 ns 9750 ns 1.28
layernorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/8 thread(s) 11125 ns 10291 ns 1.08
layernorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/1 thread(s) 10583 ns 10959 ns 0.97
layernorm(2, act=identity, affine=false)(32 x 32)/zygote/GPU/CUDA 779907 ns 769680.5 ns 1.01
layernorm(2, act=identity, affine=false)(32 x 32)/zygote/GPU/oneAPI 38144455 ns 39162885 ns 0.97
layernorm(2, act=identity, affine=false)(32 x 32)/zygote/GPU/AMDGPU 379434 ns 386734 ns 0.98
layernorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/2 thread(s) 7250 ns 5333 ns 1.36
layernorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/4 thread(s) 5250 ns 5792 ns 0.91
layernorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/8 thread(s) 6834 ns 7416 ns 0.92
layernorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/1 thread(s) 4917 ns 6833 ns 0.72
layernorm(2, act=relu, affine=false)(4 x 32)/forward/GPU/CUDA 135559.5 ns 135171 ns 1.00
layernorm(2, act=relu, affine=false)(4 x 32)/forward/GPU/oneAPI 5624998 ns 5588783 ns 1.01
layernorm(2, act=relu, affine=false)(4 x 32)/forward/GPU/AMDGPU 56400 ns 68351 ns 0.83
layernorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/2 thread(s) 7542 ns 7666 ns 0.98
layernorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/4 thread(s) 7792 ns 7375 ns 1.06
layernorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/8 thread(s) 7875 ns 7646 ns 1.03
layernorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/1 thread(s) 7625 ns 7458 ns 1.02
layernorm(2, act=relu, affine=false)(4 x 32)/zygote/GPU/CUDA 742169 ns 735742.5 ns 1.01
layernorm(2, act=relu, affine=false)(4 x 32)/zygote/GPU/oneAPI 39269500.5 ns 40095077.5 ns 0.98
layernorm(2, act=relu, affine=false)(4 x 32)/zygote/GPU/AMDGPU 389854 ns 390854 ns 1.00
batchedmm(128, Bsize=512)/forward/CPU/2 thread(s) 14503334 ns 14524375 ns 1.00
batchedmm(128, Bsize=512)/forward/CPU/4 thread(s) 7723249.5 ns 10107916 ns 0.76
batchedmm(128, Bsize=512)/forward/CPU/8 thread(s) 7705416.5 ns 10121667 ns 0.76
batchedmm(128, Bsize=512)/forward/CPU/1 thread(s) 27810125 ns 27752459 ns 1.00
batchedmm(128, Bsize=512)/forward/GPU/CUDA 535378 ns 528552 ns 1.01
batchedmm(128, Bsize=512)/forward/GPU/oneAPI 95953628 ns 98194699 ns 0.98
batchedmm(128, Bsize=512)/forward/GPU/AMDGPU 390439 ns 402574 ns 0.97
batchedmm(128, Bsize=512)/zygote/CPU/2 thread(s) 46519500 ns 46533666.5 ns 1.00
batchedmm(128, Bsize=512)/zygote/CPU/4 thread(s) 26614709 ns 33493083 ns 0.79
batchedmm(128, Bsize=512)/zygote/CPU/8 thread(s) 26530062.5 ns 33509291 ns 0.79
batchedmm(128, Bsize=512)/zygote/CPU/1 thread(s) 85657500 ns 85143125 ns 1.01
batchedmm(128, Bsize=512)/zygote/GPU/CUDA 2847450.5 ns 2860259.5 ns 1.00
batchedmm(128, Bsize=512)/zygote/GPU/oneAPI 193094633 ns 194954753.5 ns 0.99
batchedmm(128, Bsize=512)/zygote/GPU/AMDGPU 3284834 ns 3304313.5 ns 0.99
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 68958 ns 67916 ns 1.02
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 69084 ns 66833.5 ns 1.03
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 68500 ns 69271 ns 0.99
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 68166 ns 70875 ns 0.96
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/CUDA 104098 ns 101002.5 ns 1.03
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/oneAPI 3588750.5 ns 3722652 ns 0.96
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/AMDGPU 232172 ns 234933 ns 0.99
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 480417 ns 480708.5 ns 1.00
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 475791 ns 482250 ns 0.99
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 474812.5 ns 478395.5 ns 0.99
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 481041.5 ns 469167 ns 1.03
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 714971 ns 703449 ns 1.02
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/oneAPI 27636818 ns 28256303 ns 0.98
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/AMDGPU 793828 ns 791863.5 ns 1.00
batchnorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/2 thread(s) 583 ns 500 ns 1.17
batchnorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/4 thread(s) 750 ns 583 ns 1.29
batchnorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/8 thread(s) 625 ns 625 ns 1
batchnorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/1 thread(s) 625 ns 584 ns 1.07
batchnorm(2, act=relu, affine=false)(32 x 32)/forward/GPU/CUDA 32749 ns 32037 ns 1.02
batchnorm(2, act=relu, affine=false)(32 x 32)/forward/GPU/oneAPI 1171475 ns 1235389 ns 0.95
batchnorm(2, act=relu, affine=false)(32 x 32)/forward/GPU/AMDGPU 49671 ns 49701 ns 1.00
batchnorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/2 thread(s) 9875 ns 8959 ns 1.10
batchnorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/4 thread(s) 9875 ns 8667 ns 1.14
batchnorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/8 thread(s) 9375 ns 9562.5 ns 0.98
batchnorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/1 thread(s) 9208 ns 8250 ns 1.12
batchnorm(2, act=relu, affine=false)(32 x 32)/zygote/GPU/CUDA 282467 ns 277434 ns 1.02
batchnorm(2, act=relu, affine=false)(32 x 32)/zygote/GPU/oneAPI 21597763 ns 22231771.5 ns 0.97
batchnorm(2, act=relu, affine=false)(32 x 32)/zygote/GPU/AMDGPU 373314 ns 376664 ns 0.99
dense(32, bias=false, act=gelu)(32 x 128)/forward/CPU/2 thread(s) 9708 ns 9625 ns 1.01
dense(32, bias=false, act=gelu)(32 x 128)/forward/CPU/4 thread(s) 9708 ns 9708 ns 1
dense(32, bias=false, act=gelu)(32 x 128)/forward/CPU/8 thread(s) 9625 ns 9667 ns 1.00
dense(32, bias=false, act=gelu)(32 x 128)/forward/CPU/1 thread(s) 9666 ns 9625 ns 1.00
dense(32, bias=false, act=gelu)(32 x 128)/forward/GPU/CUDA 23485 ns 23071 ns 1.02
dense(32, bias=false, act=gelu)(32 x 128)/forward/GPU/oneAPI 2051864 ns 2068828 ns 0.99
dense(32, bias=false, act=gelu)(32 x 128)/forward/GPU/AMDGPU 211472 ns 210172 ns 1.01
dense(32, bias=false, act=gelu)(32 x 128)/zygote/CPU/2 thread(s) 50208 ns 50291 ns 1.00
dense(32, bias=false, act=gelu)(32 x 128)/zygote/CPU/4 thread(s) 50042 ns 50417 ns 0.99
dense(32, bias=false, act=gelu)(32 x 128)/zygote/CPU/8 thread(s) 50709 ns 50875 ns 1.00
dense(32, bias=false, act=gelu)(32 x 128)/zygote/CPU/1 thread(s) 50209 ns 50583 ns 0.99
dense(32, bias=false, act=gelu)(32 x 128)/zygote/GPU/CUDA 277646 ns 274332 ns 1.01
dense(32, bias=false, act=gelu)(32 x 128)/zygote/GPU/oneAPI 11587139 ns 12829198 ns 0.90
dense(32, bias=false, act=gelu)(32 x 128)/zygote/GPU/AMDGPU 614117 ns 609897 ns 1.01
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 55291 ns 55125 ns 1.00
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 54458 ns 55875 ns 0.97
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 54334 ns 55917 ns 0.97
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 56458 ns 55917 ns 1.01
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/CUDA 28038.5 ns 27736 ns 1.01
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/oneAPI 1177501 ns 1172928.5 ns 1.00
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/AMDGPU 206412 ns 203487.5 ns 1.01
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 479020.5 ns 530167 ns 0.90
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 525042 ns 505208 ns 1.04
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 499937 ns 508854.5 ns 0.98
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 462667 ns 467249.5 ns 0.99
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 240355 ns 234697 ns 1.02
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/oneAPI 33623204.5 ns 33825831.5 ns 0.99
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/AMDGPU 838988 ns 884779 ns 0.95
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 609500 ns 652145.5 ns 0.93
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 661417 ns 645666 ns 1.02
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 659375 ns 651250 ns 1.01
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 653812.5 ns 641645.5 ns 1.02
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/GPU/CUDA 192690.5 ns 186754 ns 1.03
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/GPU/oneAPI 8134102 ns 8762924.5 ns 0.93
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/GPU/AMDGPU 262482 ns 301613 ns 0.87
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 2226104 ns 2247709 ns 0.99
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 2247458 ns 2260312.5 ns 0.99
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 2238104 ns 2234500 ns 1.00
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 2244458.5 ns 2220417 ns 1.01
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 927304 ns 904539 ns 1.03
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/oneAPI 49609819 ns 49644380 ns 1.00
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/AMDGPU 1364114 ns 1208692 ns 1.13
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 20208 ns 21125 ns 0.96
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 22354.5 ns 23708.5 ns 0.94
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 22167 ns 22229 ns 1.00
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 19375 ns 22083 ns 0.88
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/CUDA 109169 ns 106895 ns 1.02
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/oneAPI 3430083 ns 3512219 ns 0.98
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/AMDGPU 77150.5 ns 79091 ns 0.98
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 222958 ns 265500 ns 0.84
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 220604.5 ns 222041 ns 0.99
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 227521 ns 232667 ns 0.98
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 225417 ns 258208 ns 0.87
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 712641 ns 700202.5 ns 1.02
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/oneAPI 26229825 ns 26916118 ns 0.97
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/AMDGPU 558770.5 ns 555335 ns 1.01
batchnorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/2 thread(s) 542 ns 500 ns 1.08
batchnorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/4 thread(s) 583 ns 583 ns 1
batchnorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/8 thread(s) 625 ns 625 ns 1
batchnorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/1 thread(s) 584 ns 583 ns 1.00
batchnorm(2, act=relu, affine=true)(32 x 32)/forward/GPU/CUDA 23081 ns 22789 ns 1.01
batchnorm(2, act=relu, affine=true)(32 x 32)/forward/GPU/oneAPI 1191881 ns 1272891.5 ns 0.94
batchnorm(2, act=relu, affine=true)(32 x 32)/forward/GPU/AMDGPU 48321 ns 48001 ns 1.01
batchnorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/2 thread(s) 9208.5 ns 9083 ns 1.01
batchnorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/4 thread(s) 9250 ns 9083 ns 1.02
batchnorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/8 thread(s) 10666 ns 9938 ns 1.07
batchnorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/1 thread(s) 9791.5 ns 9875 ns 0.99
batchnorm(2, act=relu, affine=true)(32 x 32)/zygote/GPU/CUDA 263338 ns 259037 ns 1.02
batchnorm(2, act=relu, affine=true)(32 x 32)/zygote/GPU/oneAPI 25373530 ns 24963258.5 ns 1.02
batchnorm(2, act=relu, affine=true)(32 x 32)/zygote/GPU/AMDGPU 399114 ns 395944 ns 1.01
groupnorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/2 thread(s) 10500 ns 7875 ns 1.33
groupnorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/4 thread(s) 8770.5 ns 10062.5 ns 0.87
groupnorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/8 thread(s) 10499.5 ns 10520.5 ns 1.00
groupnorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/1 thread(s) 10083 ns 10583 ns 0.95
groupnorm(2, act=identity, affine=false)(4 x 32)/forward/GPU/CUDA 115864 ns 113441 ns 1.02
groupnorm(2, act=identity, affine=false)(4 x 32)/forward/GPU/oneAPI 3318539 ns 3336239 ns 0.99
groupnorm(2, act=identity, affine=false)(4 x 32)/forward/GPU/AMDGPU 68530 ns 70210 ns 0.98
groupnorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/2 thread(s) 7917 ns 7833.5 ns 1.01
groupnorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/4 thread(s) 7750 ns 7625 ns 1.02
groupnorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/8 thread(s) 8125 ns 7959 ns 1.02
groupnorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/1 thread(s) 7875 ns 7708 ns 1.02
groupnorm(2, act=identity, affine=false)(4 x 32)/zygote/GPU/CUDA 487126 ns 472332 ns 1.03
groupnorm(2, act=identity, affine=false)(4 x 32)/zygote/GPU/oneAPI 16968946 ns 17695066.5 ns 0.96
groupnorm(2, act=identity, affine=false)(4 x 32)/zygote/GPU/AMDGPU 322433 ns 319678 ns 1.01
bias_activation(2, act=gelu)(2 x 128)/forward/CPU/2 thread(s) 1708 ns 1625 ns 1.05
bias_activation(2, act=gelu)(2 x 128)/forward/CPU/4 thread(s) 1667 ns 1916 ns 0.87
bias_activation(2, act=gelu)(2 x 128)/forward/CPU/8 thread(s) 2125 ns 2020.5 ns 1.05
bias_activation(2, act=gelu)(2 x 128)/forward/CPU/1 thread(s) 1541 ns 1583 ns 0.97
bias_activation(2, act=gelu)(2 x 128)/forward/GPU/CUDA 19744 ns 19708 ns 1.00
bias_activation(2, act=gelu)(2 x 128)/forward/GPU/oneAPI 1156033 ns 1164130 ns 0.99
bias_activation(2, act=gelu)(2 x 128)/forward/GPU/AMDGPU 191542 ns 189672 ns 1.01
bias_activation(2, act=gelu)(2 x 128)/zygote/CPU/2 thread(s) 3584 ns 3584 ns 1
bias_activation(2, act=gelu)(2 x 128)/zygote/CPU/4 thread(s) 3708.5 ns 3750 ns 0.99
bias_activation(2, act=gelu)(2 x 128)/zygote/CPU/8 thread(s) 3937.5 ns 3792 ns 1.04
bias_activation(2, act=gelu)(2 x 128)/zygote/CPU/1 thread(s) 3625 ns 3584 ns 1.01
bias_activation(2, act=gelu)(2 x 128)/zygote/GPU/CUDA 212174.5 ns 208603 ns 1.02
bias_activation(2, act=gelu)(2 x 128)/zygote/GPU/oneAPI 10575153 ns 10416766 ns 1.02
bias_activation(2, act=gelu)(2 x 128)/zygote/GPU/AMDGPU 580786 ns 583536 ns 1.00
bias_activation(512, act=tanh)(512 x 128)/forward/CPU/2 thread(s) 147562.5 ns 148916.5 ns 0.99
bias_activation(512, act=tanh)(512 x 128)/forward/CPU/4 thread(s) 106562 ns 127916 ns 0.83
bias_activation(512, act=tanh)(512 x 128)/forward/CPU/8 thread(s) 107333 ns 130229 ns 0.82
bias_activation(512, act=tanh)(512 x 128)/forward/CPU/1 thread(s) 225583 ns 225208 ns 1.00
bias_activation(512, act=tanh)(512 x 128)/forward/GPU/CUDA 23301 ns 22520 ns 1.03
bias_activation(512, act=tanh)(512 x 128)/forward/GPU/oneAPI 1161742 ns 1193772.5 ns 0.97
bias_activation(512, act=tanh)(512 x 128)/forward/GPU/AMDGPU 34030 ns 40501 ns 0.84
bias_activation(512, act=tanh)(512 x 128)/zygote/CPU/2 thread(s) 160417 ns 160729.5 ns 1.00
bias_activation(512, act=tanh)(512 x 128)/zygote/CPU/4 thread(s) 87959 ns 123458 ns 0.71
bias_activation(512, act=tanh)(512 x 128)/zygote/CPU/8 thread(s) 100250 ns 114792 ns 0.87
bias_activation(512, act=tanh)(512 x 128)/zygote/CPU/1 thread(s) 252167 ns 264249.5 ns 0.95
bias_activation(512, act=tanh)(512 x 128)/zygote/GPU/CUDA 211748 ns 208808 ns 1.01
bias_activation(512, act=tanh)(512 x 128)/zygote/GPU/oneAPI 10753069 ns 10974999 ns 0.98
bias_activation(512, act=tanh)(512 x 128)/zygote/GPU/AMDGPU 214182 ns 268837.5 ns 0.80
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 7291 ns 7333 ns 0.99
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 5333 ns 5959 ns 0.89
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 5250 ns 5959 ns 0.88
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 10417 ns 10042 ns 1.04
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/CUDA 33560.5 ns 32455 ns 1.03
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/oneAPI 1264867.5 ns 1162105 ns 1.09
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/AMDGPU 50310 ns 50660 ns 0.99
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 253958.5 ns 231563 ns 1.10
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 253021.5 ns 235208 ns 1.08
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 235708 ns 235250 ns 1.00
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 212792 ns 214541.5 ns 0.99
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 260417 ns 252456 ns 1.03
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/oneAPI 27289933 ns 25967285 ns 1.05
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/AMDGPU 524496 ns 597316 ns 0.88
layernorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/2 thread(s) 12375 ns 12542 ns 0.99
layernorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/4 thread(s) 12583 ns 12833 ns 0.98
layernorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/8 thread(s) 13896 ns 14188 ns 0.98
layernorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/1 thread(s) 12792 ns 13625 ns 0.94
layernorm(2, act=gelu, affine=false)(32 x 32)/forward/GPU/CUDA 134512.5 ns 131390 ns 1.02
layernorm(2, act=gelu, affine=false)(32 x 32)/forward/GPU/oneAPI 5610287 ns 5550478 ns 1.01
layernorm(2, act=gelu, affine=false)(32 x 32)/forward/GPU/AMDGPU 235902 ns 233213 ns 1.01
layernorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/2 thread(s) 23959 ns 23666 ns 1.01
layernorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/4 thread(s) 24479.5 ns 23229.5 ns 1.05
layernorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/8 thread(s) 25291 ns 24625 ns 1.03
layernorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/1 thread(s) 24583 ns 23834 ns 1.03
layernorm(2, act=gelu, affine=false)(32 x 32)/zygote/GPU/CUDA 831522 ns 815266 ns 1.02
layernorm(2, act=gelu, affine=false)(32 x 32)/zygote/GPU/oneAPI 40443315 ns 40999180 ns 0.99
layernorm(2, act=gelu, affine=false)(32 x 32)/zygote/GPU/AMDGPU 684542 ns 686982 ns 1.00
groupnorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/2 thread(s) 9708 ns 8667 ns 1.12
groupnorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/4 thread(s) 9917 ns 10208 ns 0.97
groupnorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/8 thread(s) 11625 ns 10729 ns 1.08
groupnorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/1 thread(s) 9209 ns 9250 ns 1.00
groupnorm(2, act=identity, affine=true)(32 x 32)/forward/GPU/CUDA 120339 ns 116988 ns 1.03
groupnorm(2, act=identity, affine=true)(32 x 32)/forward/GPU/oneAPI 3402718 ns 3575157.5 ns 0.95
groupnorm(2, act=identity, affine=true)(32 x 32)/forward/GPU/AMDGPU 72241 ns 73990 ns 0.98
groupnorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/2 thread(s) 13750 ns 14459 ns 0.95
groupnorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/4 thread(s) 14187.5 ns 13833 ns 1.03
groupnorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/8 thread(s) 15083.5 ns 14395.5 ns 1.05
groupnorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/1 thread(s) 14084 ns 14250 ns 0.99
groupnorm(2, act=identity, affine=true)(32 x 32)/zygote/GPU/CUDA 638601 ns 625118 ns 1.02
groupnorm(2, act=identity, affine=true)(32 x 32)/zygote/GPU/oneAPI 21632295 ns 21155361 ns 1.02
groupnorm(2, act=identity, affine=true)(32 x 32)/zygote/GPU/AMDGPU 363914 ns 365734 ns 1.00
groupnorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/2 thread(s) 9208.5 ns 8959 ns 1.03
groupnorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/4 thread(s) 10000.5 ns 9709 ns 1.03
groupnorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/8 thread(s) 11166 ns 10854.5 ns 1.03
groupnorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/1 thread(s) 10167 ns 10291 ns 0.99
groupnorm(2, act=identity, affine=false)(32 x 32)/forward/GPU/CUDA 118694 ns 116936.5 ns 1.02
groupnorm(2, act=identity, affine=false)(32 x 32)/forward/GPU/oneAPI 3312719 ns 3357605 ns 0.99
groupnorm(2, act=identity, affine=false)(32 x 32)/forward/GPU/AMDGPU 72320 ns 73371 ns 0.99
groupnorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/2 thread(s) 13208.5 ns 12458 ns 1.06
groupnorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/4 thread(s) 13020.5 ns 12270.5 ns 1.06
groupnorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/8 thread(s) 13396 ns 12937.5 ns 1.04
groupnorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/1 thread(s) 12292 ns 13291 ns 0.92
groupnorm(2, act=identity, affine=false)(32 x 32)/zygote/GPU/CUDA 529419 ns 515797 ns 1.03
groupnorm(2, act=identity, affine=false)(32 x 32)/zygote/GPU/oneAPI 19314666 ns 18556708 ns 1.04
groupnorm(2, act=identity, affine=false)(32 x 32)/zygote/GPU/AMDGPU 342414 ns 340984 ns 1.00
batchedmm(2, Bsize=128)/forward/CPU/2 thread(s) 30416.5 ns 27562 ns 1.10
batchedmm(2, Bsize=128)/forward/CPU/4 thread(s) 33666.5 ns 33833.5 ns 1.00
batchedmm(2, Bsize=128)/forward/CPU/8 thread(s) 30542 ns 31542 ns 0.97
batchedmm(2, Bsize=128)/forward/CPU/1 thread(s) 1917 ns 1750 ns 1.10
batchedmm(2, Bsize=128)/forward/GPU/CUDA 16576 ns 16227 ns 1.02
batchedmm(2, Bsize=128)/forward/GPU/oneAPI 80979304 ns 78590127 ns 1.03
batchedmm(2, Bsize=128)/forward/GPU/AMDGPU 77461 ns 81231 ns 0.95
batchedmm(2, Bsize=128)/zygote/CPU/2 thread(s) 5291.5 ns 5291.5 ns 1
batchedmm(2, Bsize=128)/zygote/CPU/4 thread(s) 4896 ns 4979.5 ns 0.98
batchedmm(2, Bsize=128)/zygote/CPU/8 thread(s) 5291.5 ns 5416 ns 0.98
batchedmm(2, Bsize=128)/zygote/CPU/1 thread(s) 6417 ns 6458 ns 0.99
batchedmm(2, Bsize=128)/zygote/GPU/CUDA 137601 ns 135144 ns 1.02
batchedmm(2, Bsize=128)/zygote/GPU/oneAPI 111031730 ns 109913428 ns 1.01
batchedmm(2, Bsize=128)/zygote/GPU/AMDGPU 379919 ns 370274 ns 1.03
batchnorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/2 thread(s) 375 ns 250 ns 1.50
batchnorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/4 thread(s) 375 ns 375 ns 1
batchnorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/8 thread(s) 375 ns 375 ns 1
batchnorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/1 thread(s) 334 ns 375 ns 0.89
batchnorm(2, act=identity, affine=true)(4 x 32)/forward/GPU/CUDA 24898 ns 23995 ns 1.04
batchnorm(2, act=identity, affine=true)(4 x 32)/forward/GPU/oneAPI 1219018.5 ns 1260687.5 ns 0.97
batchnorm(2, act=identity, affine=true)(4 x 32)/forward/GPU/AMDGPU 49280 ns 47300 ns 1.04
batchnorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/2 thread(s) 6750 ns 6417 ns 1.05
batchnorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/4 thread(s) 6500 ns 6541 ns 0.99
batchnorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/8 thread(s) 6916.5 ns 6584 ns 1.05
batchnorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/1 thread(s) 6667 ns 6708 ns 0.99
batchnorm(2, act=identity, affine=true)(4 x 32)/zygote/GPU/CUDA 184245 ns 180494.5 ns 1.02
batchnorm(2, act=identity, affine=true)(4 x 32)/zygote/GPU/oneAPI 22834153.5 ns 23985458.5 ns 0.95
batchnorm(2, act=identity, affine=true)(4 x 32)/zygote/GPU/AMDGPU 386844 ns 386289 ns 1.00
batchnorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/2 thread(s) 2125 ns 2000 ns 1.06
batchnorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/4 thread(s) 2167 ns 2084 ns 1.04
batchnorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/8 thread(s) 2084 ns 2166 ns 0.96
batchnorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/1 thread(s) 2083 ns 2125 ns 0.98
batchnorm(2, act=gelu, affine=true)(32 x 32)/forward/GPU/CUDA 25661 ns 25421 ns 1.01
batchnorm(2, act=gelu, affine=true)(32 x 32)/forward/GPU/oneAPI 1228045 ns 1185083 ns 1.04
batchnorm(2, act=gelu, affine=true)(32 x 32)/forward/GPU/AMDGPU 208752 ns 206112 ns 1.01
batchnorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/2 thread(s) 17250 ns 16896 ns 1.02
batchnorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/4 thread(s) 17292 ns 17000 ns 1.02
batchnorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/8 thread(s) 18584 ns 18333 ns 1.01
batchnorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/1 thread(s) 18416.5 ns 17292 ns 1.07
batchnorm(2, act=gelu, affine=true)(32 x 32)/zygote/GPU/CUDA 269097.5 ns 264717 ns 1.02
batchnorm(2, act=gelu, affine=true)(32 x 32)/zygote/GPU/oneAPI 26227590 ns 25260741 ns 1.04
batchnorm(2, act=gelu, affine=true)(32 x 32)/zygote/GPU/AMDGPU 693937 ns 702657 ns 0.99
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 150875 ns 166125 ns 0.91
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 177416.5 ns 177603.5 ns 1.00
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 153625 ns 148958 ns 1.03
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 157791 ns 148917 ns 1.06
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/CUDA 191062 ns 187074 ns 1.02
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/oneAPI 7761294 ns 7946915 ns 0.98
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/AMDGPU 174992 ns 226902 ns 0.77
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 1338521 ns 1327854.5 ns 1.01
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 1328479 ns 1318125 ns 1.01
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 1328250 ns 1326521.5 ns 1.00
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 1330083.5 ns 1295625 ns 1.03
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 866603 ns 844331.5 ns 1.03
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/oneAPI 46656075.5 ns 47016714 ns 0.99
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/AMDGPU 1114201.5 ns 1001545 ns 1.11
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 26208.5 ns 32583 ns 0.80
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 29479.5 ns 26000 ns 1.13
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 27062.5 ns 26541.5 ns 1.02
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 24833 ns 26124.5 ns 0.95
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/CUDA 228889.5 ns 226484 ns 1.01
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/oneAPI 7347865 ns 7837953 ns 0.94
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/AMDGPU 116211 ns 115201 ns 1.01
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 117584 ns 131875 ns 0.89
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 140791 ns 152250 ns 0.92
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 126021 ns 153750 ns 0.82
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 119916.5 ns 131625 ns 0.91
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 992184 ns 970881 ns 1.02
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/oneAPI 46564017 ns 45298380 ns 1.03
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/AMDGPU 594546 ns 614061 ns 0.97
batchnorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/2 thread(s) 334 ns 250 ns 1.34
batchnorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/4 thread(s) 334 ns 334 ns 1
batchnorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/8 thread(s) 375 ns 375 ns 1
batchnorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/1 thread(s) 375 ns 334 ns 1.12
batchnorm(2, act=relu, affine=true)(4 x 32)/forward/GPU/CUDA 23038 ns 22483 ns 1.02
batchnorm(2, act=relu, affine=true)(4 x 32)/forward/GPU/oneAPI 1228991.5 ns 1212860.5 ns 1.01
batchnorm(2, act=relu, affine=true)(4 x 32)/forward/GPU/AMDGPU 49341 ns 49500 ns 1.00
batchnorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/2 thread(s) 6833 ns 6459 ns 1.06
batchnorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/4 thread(s) 6604 ns 6417 ns 1.03
batchnorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/8 thread(s) 7042 ns 6750 ns 1.04
batchnorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/1 thread(s) 6791 ns 6563 ns 1.03
batchnorm(2, act=relu, affine=true)(4 x 32)/zygote/GPU/CUDA 200303.5 ns 197111 ns 1.02
batchnorm(2, act=relu, affine=true)(4 x 32)/zygote/GPU/oneAPI 25134864 ns 25776994 ns 0.98
batchnorm(2, act=relu, affine=true)(4 x 32)/zygote/GPU/AMDGPU 388994 ns 390483 ns 1.00
layernorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/2 thread(s) 6375 ns 5750 ns 1.11
layernorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/4 thread(s) 5875 ns 6458 ns 0.91
layernorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/8 thread(s) 7812.5 ns 6979 ns 1.12
layernorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/1 thread(s) 6458 ns 5333 ns 1.21
layernorm(2, act=gelu, affine=true)(4 x 32)/forward/GPU/CUDA 139406.5 ns 136326 ns 1.02
layernorm(2, act=gelu, affine=true)(4 x 32)/forward/GPU/oneAPI 5798964 ns 5775885 ns 1.00
layernorm(2, act=gelu, affine=true)(4 x 32)/forward/GPU/AMDGPU 235513 ns 234652 ns 1.00
layernorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/2 thread(s) 10083.5 ns 10000 ns 1.01
layernorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/4 thread(s) 10167 ns 10334 ns 0.98
layernorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/8 thread(s) 10417 ns 10500 ns 0.99
layernorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/1 thread(s) 9959 ns 10125 ns 0.98
layernorm(2, act=gelu, affine=true)(4 x 32)/zygote/GPU/CUDA 853447 ns 837095 ns 1.02
layernorm(2, act=gelu, affine=true)(4 x 32)/zygote/GPU/oneAPI 40259621 ns 40912011 ns 0.98
layernorm(2, act=gelu, affine=true)(4 x 32)/zygote/GPU/AMDGPU 676147 ns 671137 ns 1.01
dense(2, bias=false, act=gelu)(2 x 128)/forward/CPU/2 thread(s) 750 ns 667 ns 1.12
dense(2, bias=false, act=gelu)(2 x 128)/forward/CPU/4 thread(s) 750 ns 667 ns 1.12
dense(2, bias=false, act=gelu)(2 x 128)/forward/CPU/8 thread(s) 667 ns 750 ns 0.89
dense(2, bias=false, act=gelu)(2 x 128)/forward/CPU/1 thread(s) 750 ns 708 ns 1.06
dense(2, bias=false, act=gelu)(2 x 128)/forward/GPU/CUDA 23007 ns 22523 ns 1.02
dense(2, bias=false, act=gelu)(2 x 128)/forward/GPU/oneAPI 2046355.5 ns 2081039 ns 0.98
dense(2, bias=false, act=gelu)(2 x 128)/forward/GPU/AMDGPU 209722 ns 208153 ns 1.01
dense(2, bias=false, act=gelu)(2 x 128)/zygote/CPU/2 thread(s) 4958 ns 4833 ns 1.03
dense(2, bias=false, act=gelu)(2 x 128)/zygote/CPU/4 thread(s) 5000 ns 4917 ns 1.02
dense(2, bias=false, act=gelu)(2 x 128)/zygote/CPU/8 thread(s) 5125 ns 5250 ns 0.98
dense(2, bias=false, act=gelu)(2 x 128)/zygote/CPU/1 thread(s) 4917 ns 4875 ns 1.01
dense(2, bias=false, act=gelu)(2 x 128)/zygote/GPU/CUDA 221201.5 ns 217286 ns 1.02
dense(2, bias=false, act=gelu)(2 x 128)/zygote/GPU/oneAPI 9546335 ns 10433598 ns 0.91
dense(2, bias=false, act=gelu)(2 x 128)/zygote/GPU/AMDGPU 585401 ns 579966 ns 1.01
groupnorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/2 thread(s) 8708 ns 7709 ns 1.13
groupnorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/4 thread(s) 8833.5 ns 9042 ns 0.98
groupnorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/8 thread(s) 9812.5 ns 10083.5 ns 0.97
groupnorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/1 thread(s) 8625 ns 9125 ns 0.95
groupnorm(2, act=identity, affine=true)(4 x 32)/forward/GPU/CUDA 118248.5 ns 114799.5 ns 1.03
groupnorm(2, act=identity, affine=true)(4 x 32)/forward/GPU/oneAPI 3665770 ns 3549639 ns 1.03
groupnorm(2, act=identity, affine=true)(4 x 32)/forward/GPU/AMDGPU 71271 ns 74946 ns 0.95
groupnorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/2 thread(s) 8959 ns 9125 ns 0.98
groupnorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/4 thread(s) 9041.5 ns 8459 ns 1.07
groupnorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/8 thread(s) 9333.5 ns 8958 ns 1.04
groupnorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/1 thread(s) 8687.5 ns 8542 ns 1.02
groupnorm(2, act=identity, affine=true)(4 x 32)/zygote/GPU/CUDA 566922 ns 551169 ns 1.03
groupnorm(2, act=identity, affine=true)(4 x 32)/zygote/GPU/oneAPI 21117090.5 ns 20846570.5 ns 1.01
groupnorm(2, act=identity, affine=true)(4 x 32)/zygote/GPU/AMDGPU 343484 ns 344893 ns 1.00
batchedmm(128, Bsize=4)/forward/CPU/2 thread(s) 126584 ns 126604.5 ns 1.00
batchedmm(128, Bsize=4)/forward/CPU/4 thread(s) 96271 ns 129541 ns 0.74
batchedmm(128, Bsize=4)/forward/CPU/8 thread(s) 96479.5 ns 130458 ns 0.74
batchedmm(128, Bsize=4)/forward/CPU/1 thread(s) 183375 ns 182896 ns 1.00
batchedmm(128, Bsize=4)/forward/GPU/CUDA 46672 ns 46147 ns 1.01
batchedmm(128, Bsize=4)/forward/GPU/oneAPI 72329738.5 ns 73869604 ns 0.98
batchedmm(128, Bsize=4)/forward/GPU/AMDGPU 99821 ns 104461 ns 0.96
batchedmm(128, Bsize=4)/zygote/CPU/2 thread(s) 330333 ns 341208 ns 0.97
batchedmm(128, Bsize=4)/zygote/CPU/4 thread(s) 166292 ns 327416 ns 0.51
batchedmm(128, Bsize=4)/zygote/CPU/8 thread(s) 170250 ns 345562.5 ns 0.49
batchedmm(128, Bsize=4)/zygote/CPU/1 thread(s) 572041.5 ns 569312.5 ns 1.00
batchedmm(128, Bsize=4)/zygote/GPU/CUDA 187343 ns 183705 ns 1.02
batchedmm(128, Bsize=4)/zygote/GPU/oneAPI 93991840 ns 92315110 ns 1.02
batchedmm(128, Bsize=4)/zygote/GPU/AMDGPU 487975 ns 502435 ns 0.97
dense(512, bias=true, act=identity)(512 x 128)/forward/CPU/2 thread(s) 398958 ns 399333 ns 1.00
dense(512, bias=true, act=identity)(512 x 128)/forward/CPU/4 thread(s) 215334 ns 288167 ns 0.75
dense(512, bias=true, act=identity)(512 x 128)/forward/CPU/8 thread(s) 215041 ns 288020.5 ns 0.75
dense(512, bias=true, act=identity)(512 x 128)/forward/CPU/1 thread(s) 753500 ns 755875 ns 1.00
dense(512, bias=true, act=identity)(512 x 128)/forward/GPU/CUDA 43980 ns 43522.5 ns 1.01
dense(512, bias=true, act=identity)(512 x 128)/forward/GPU/oneAPI 1471446 ns 1347689.5 ns 1.09
dense(512, bias=true, act=identity)(512 x 128)/forward/GPU/AMDGPU 81451 ns 80731 ns 1.01
dense(512, bias=true, act=identity)(512 x 128)/zygote/CPU/2 thread(s) 1401520.5 ns 1404291.5 ns 1.00
dense(512, bias=true, act=identity)(512 x 128)/zygote/CPU/4 thread(s) 862917 ns 1136208 ns 0.76
dense(512, bias=true, act=identity)(512 x 128)/zygote/CPU/8 thread(s) 861417 ns 1136375 ns 0.76
dense(512, bias=true, act=identity)(512 x 128)/zygote/CPU/1 thread(s) 2361042 ns 2442125 ns 0.97
dense(512, bias=true, act=identity)(512 x 128)/zygote/GPU/CUDA 253211 ns 242542 ns 1.04
dense(512, bias=true, act=identity)(512 x 128)/zygote/GPU/oneAPI 10756472 ns 9970984 ns 1.08
dense(512, bias=true, act=identity)(512 x 128)/zygote/GPU/AMDGPU 349378.5 ns 353108.5 ns 0.99
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 651917 ns 643667 ns 1.01
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 658334 ns 649416 ns 1.01
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 662479 ns 646791.5 ns 1.02
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 579395.5 ns 640271.5 ns 0.90
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/CUDA 189789 ns 184288 ns 1.03
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/oneAPI 8425868.5 ns 8427619.5 ns 1.00
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/AMDGPU 261218 ns 303113 ns 0.86
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 2487416 ns 2480000 ns 1.00
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 2468708 ns 2441417 ns 1.01
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 2451333 ns 2445375 ns 1.00
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 2415666 ns 2435667 ns 0.99
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 951768.5 ns 927220.5 ns 1.03
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/oneAPI 52278704 ns 53936788.5 ns 0.97
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/AMDGPU 1454255 ns 1316013 ns 1.11
batchedmm(2, Bsize=32)/forward/CPU/2 thread(s) 33000 ns 33875 ns 0.97
batchedmm(2, Bsize=32)/forward/CPU/4 thread(s) 36083.5 ns 35271 ns 1.02
batchedmm(2, Bsize=32)/forward/CPU/8 thread(s) 32167 ns 34937.5 ns 0.92
batchedmm(2, Bsize=32)/forward/CPU/1 thread(s) 1041.5 ns 917 ns 1.14
batchedmm(2, Bsize=32)/forward/GPU/CUDA 16094 ns 15816 ns 1.02
batchedmm(2, Bsize=32)/forward/GPU/oneAPI 73434519 ns 76295890 ns 0.96
batchedmm(2, Bsize=32)/forward/GPU/AMDGPU 77491 ns 79581 ns 0.97
batchedmm(2, Bsize=32)/zygote/CPU/2 thread(s) 3187 ns 3209 ns 0.99
batchedmm(2, Bsize=32)/zygote/CPU/4 thread(s) 3208 ns 3291 ns 0.97
batchedmm(2, Bsize=32)/zygote/CPU/8 thread(s) 3417 ns 3417 ns 1
batchedmm(2, Bsize=32)/zygote/CPU/1 thread(s) 3209 ns 3042 ns 1.05
batchedmm(2, Bsize=32)/zygote/GPU/CUDA 136515 ns 134276.5 ns 1.02
batchedmm(2, Bsize=32)/zygote/GPU/oneAPI 97956919.5 ns 97067229 ns 1.01
batchedmm(2, Bsize=32)/zygote/GPU/AMDGPU 349978 ns 337123 ns 1.04
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 437166.5 ns 437667 ns 1.00
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 433083 ns 437833 ns 0.99
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 434750 ns 438458.5 ns 0.99
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 449916 ns 447416.5 ns 1.01
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/CUDA 42836 ns 42161.5 ns 1.02
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/oneAPI 1439530.5 ns 1435529 ns 1.00
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/AMDGPU 238823 ns 241817.5 ns 0.99
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 4154959 ns 4145167 ns 1.00
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 4268667 ns 4268333 ns 1.00
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 4254625 ns 4271604 ns 1.00
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 4048000 ns 4025417 ns 1.01
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 236422 ns 230700.5 ns 1.02
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/oneAPI 38227667 ns 36716035 ns 1.04
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/AMDGPU 1232498 ns 1427924 ns 0.86
dense(32, bias=false, act=identity)(32 x 128)/forward/CPU/2 thread(s) 3959 ns 3875 ns 1.02
dense(32, bias=false, act=identity)(32 x 128)/forward/CPU/4 thread(s) 3958 ns 3917 ns 1.01
dense(32, bias=false, act=identity)(32 x 128)/forward/CPU/8 thread(s) 3916 ns 3958 ns 0.99
dense(32, bias=false, act=identity)(32 x 128)/forward/CPU/1 thread(s) 3917 ns 3875 ns 1.01
dense(32, bias=false, act=identity)(32 x 128)/forward/GPU/CUDA 34298 ns 34754 ns 0.99
dense(32, bias=false, act=identity)(32 x 128)/forward/GPU/oneAPI 1231913.5 ns 1243353 ns 0.99
dense(32, bias=false, act=identity)(32 x 128)/forward/GPU/AMDGPU 40891 ns 40071 ns 1.02
dense(32, bias=false, act=identity)(32 x 128)/zygote/CPU/2 thread(s) 15583 ns 15458 ns 1.01
dense(32, bias=false, act=identity)(32 x 128)/zygote/CPU/4 thread(s) 15666 ns 16042 ns 0.98
dense(32, bias=false, act=identity)(32 x 128)/zygote/CPU/8 thread(s) 15708 ns 15875 ns 0.99
dense(32, bias=false, act=identity)(32 x 128)/zygote/CPU/1 thread(s) 15459 ns 15625 ns 0.99
dense(32, bias=false, act=identity)(32 x 128)/zygote/GPU/CUDA 255323 ns 252802 ns 1.01
dense(32, bias=false, act=identity)(32 x 128)/zygote/GPU/oneAPI 8849424 ns 8940938 ns 0.99
dense(32, bias=false, act=identity)(32 x 128)/zygote/GPU/AMDGPU 170142 ns 171532 ns 0.99
dense(512, bias=false, act=relu)(512 x 128)/forward/CPU/2 thread(s) 403708 ns 404167 ns 1.00
dense(512, bias=false, act=relu)(512 x 128)/forward/CPU/4 thread(s) 221167 ns 295916 ns 0.75
dense(512, bias=false, act=relu)(512 x 128)/forward/CPU/8 thread(s) 220959 ns 296417 ns 0.75
dense(512, bias=false, act=relu)(512 x 128)/forward/CPU/1 thread(s) 756709 ns 760709 ns 0.99
dense(512, bias=false, act=relu)(512 x 128)/forward/GPU/CUDA 113380 ns 113187 ns 1.00
dense(512, bias=false, act=relu)(512 x 128)/forward/GPU/oneAPI 1019116 ns 1044690.5 ns 0.98
dense(512, bias=false, act=relu)(512 x 128)/forward/GPU/AMDGPU 89671 ns 89211 ns 1.01
dense(512, bias=false, act=relu)(512 x 128)/zygote/CPU/2 thread(s) 1430083 ns 1444875 ns 0.99
dense(512, bias=false, act=relu)(512 x 128)/zygote/CPU/4 thread(s) 886645.5 ns 1158416 ns 0.77
dense(512, bias=false, act=relu)(512 x 128)/zygote/CPU/8 thread(s) 879208.5 ns 1158333 ns 0.76
dense(512, bias=false, act=relu)(512 x 128)/zygote/CPU/1 thread(s) 2383084 ns 2464875 ns 0.97
dense(512, bias=false, act=relu)(512 x 128)/zygote/GPU/CUDA 238474 ns 231034 ns 1.03
dense(512, bias=false, act=relu)(512 x 128)/zygote/GPU/oneAPI 11515696 ns 10580994 ns 1.09
dense(512, bias=false, act=relu)(512 x 128)/zygote/GPU/AMDGPU 354939 ns 352438 ns 1.01
batchnorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/2 thread(s) 625 ns 583 ns 1.07
batchnorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/4 thread(s) 584 ns 625 ns 0.93
batchnorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/8 thread(s) 625 ns 583 ns 1.07
batchnorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/1 thread(s) 584 ns 625 ns 0.93
batchnorm(2, act=gelu, affine=true)(4 x 32)/forward/GPU/CUDA 24737 ns 24556 ns 1.01
batchnorm(2, act=gelu, affine=true)(4 x 32)/forward/GPU/oneAPI 1220602 ns 1215077.5 ns 1.00
batchnorm(2, act=gelu, affine=true)(4 x 32)/forward/GPU/AMDGPU 210152 ns 207412 ns 1.01
batchnorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/2 thread(s) 8042 ns 7542 ns 1.07
batchnorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/4 thread(s) 7750 ns 7916 ns 0.98
batchnorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/8 thread(s) 8020.5 ns 7958.5 ns 1.01
batchnorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/1 thread(s) 8084 ns 7708 ns 1.05
batchnorm(2, act=gelu, affine=true)(4 x 32)/zygote/GPU/CUDA 206918.5 ns 202724.5 ns 1.02
batchnorm(2, act=gelu, affine=true)(4 x 32)/zygote/GPU/oneAPI 26025510 ns 26299646 ns 0.99
batchnorm(2, act=gelu, affine=true)(4 x 32)/zygote/GPU/AMDGPU 691747 ns 691927 ns 1.00
batchedmm(128, Bsize=32)/forward/CPU/2 thread(s) 829437 ns 833708.5 ns 0.99
batchedmm(128, Bsize=32)/forward/CPU/4 thread(s) 466125 ns 617667 ns 0.75
batchedmm(128, Bsize=32)/forward/CPU/8 thread(s) 467854 ns 620250 ns 0.75
batchedmm(128, Bsize=32)/forward/CPU/1 thread(s) 1548750 ns 1558000 ns 0.99
batchedmm(128, Bsize=32)/forward/GPU/CUDA 130261 ns 134627 ns 0.97
batchedmm(128, Bsize=32)/forward/GPU/oneAPI 75506190.5 ns 75767504.5 ns 1.00
batchedmm(128, Bsize=32)/forward/GPU/AMDGPU 166677 ns 232042 ns 0.72
batchedmm(128, Bsize=32)/zygote/CPU/2 thread(s) 2692000 ns 2690520.5 ns 1.00
batchedmm(128, Bsize=32)/zygote/CPU/4 thread(s) 1529979 ns 2001666.5 ns 0.76
batchedmm(128, Bsize=32)/zygote/CPU/8 thread(s) 1534291.5 ns 2002375 ns 0.77
batchedmm(128, Bsize=32)/zygote/CPU/1 thread(s) 4940020.5 ns 4923459 ns 1.00
batchedmm(128, Bsize=32)/zygote/GPU/CUDA 232798.5 ns 232967 ns 1.00
batchedmm(128, Bsize=32)/zygote/GPU/oneAPI 103506946 ns 99203033 ns 1.04
batchedmm(128, Bsize=32)/zygote/GPU/AMDGPU 770132.5 ns 768327.5 ns 1.00
batchnorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/2 thread(s) 375 ns 250 ns 1.50
batchnorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/4 thread(s) 375 ns 334 ns 1.12
batchnorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/8 thread(s) 375 ns 375 ns 1
batchnorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/1 thread(s) 375 ns 375 ns 1
batchnorm(2, act=relu, affine=false)(4 x 32)/forward/GPU/CUDA 32356 ns 31737 ns 1.02
batchnorm(2, act=relu, affine=false)(4 x 32)/forward/GPU/oneAPI 1255980.5 ns 1097642 ns 1.14
batchnorm(2, act=relu, affine=false)(4 x 32)/forward/GPU/AMDGPU 48991 ns 46990 ns 1.04
batchnorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/2 thread(s) 6583 ns 6250 ns 1.05
batchnorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/4 thread(s) 6625 ns 6334 ns 1.05
batchnorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/8 thread(s) 6708 ns 6667 ns 1.01
batchnorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/1 thread(s) 6625 ns 6500 ns 1.02
batchnorm(2, act=relu, affine=false)(4 x 32)/zygote/GPU/CUDA 227984 ns 216848.5 ns 1.05
batchnorm(2, act=relu, affine=false)(4 x 32)/zygote/GPU/oneAPI 21076541 ns 22868710.5 ns 0.92
batchnorm(2, act=relu, affine=false)(4 x 32)/zygote/GPU/AMDGPU 356278.5 ns 363084 ns 0.98
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 1758084 ns 1756083 ns 1.00
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 1756792 ns 1773708.5 ns 0.99
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 1737458 ns 1731875 ns 1.00
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 1733750 ns 1723709 ns 1.01
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/CUDA 188495 ns 185580 ns 1.02
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/oneAPI 8152359 ns 8097715 ns 1.01
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/AMDGPU 357369 ns 375774 ns 0.95
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 4372937 ns 4363834 ns 1.00
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 4370667 ns 4360063 ns 1.00
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 4369375 ns 4378875 ns 1.00
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 4362583.5 ns 4369520.5 ns 1.00
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 853700 ns 829356 ns 1.03
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/oneAPI 48123461 ns 48033448.5 ns 1.00
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/AMDGPU 1252878 ns 1396403.5 ns 0.90
bias_activation(512, act=relu)(512 x 128)/forward/CPU/2 thread(s) 6792 ns 7146 ns 0.95
bias_activation(512, act=relu)(512 x 128)/forward/CPU/4 thread(s) 7209 ns 9833 ns 0.73
bias_activation(512, act=relu)(512 x 128)/forward/CPU/8 thread(s) 7333 ns 7250 ns 1.01
bias_activation(512, act=relu)(512 x 128)/forward/CPU/1 thread(s) 7312.5 ns 6875 ns 1.06
bias_activation(512, act=relu)(512 x 128)/forward/GPU/CUDA 22968 ns 21835 ns 1.05
bias_activation(512, act=relu)(512 x 128)/forward/GPU/oneAPI 1178755.5 ns 1202854 ns 0.98
bias_activation(512, act=relu)(512 x 128)/forward/GPU/AMDGPU 37681 ns 40090.5 ns 0.94
bias_activation(512, act=relu)(512 x 128)/zygote/CPU/2 thread(s) 48354 ns 68125 ns 0.71
bias_activation(512, act=relu)(512 x 128)/zygote/CPU/4 thread(s) 69083 ns 66458.5 ns 1.04
bias_activation(512, act=relu)(512 x 128)/zygote/CPU/8 thread(s) 33542 ns 51312.5 ns 0.65
bias_activation(512, act=relu)(512 x 128)/zygote/CPU/1 thread(s) 44979 ns 32958.5 ns 1.36
bias_activation(512, act=relu)(512 x 128)/zygote/GPU/CUDA 210612 ns 205180 ns 1.03
bias_activation(512, act=relu)(512 x 128)/zygote/GPU/oneAPI 10579631 ns 10713432 ns 0.99
bias_activation(512, act=relu)(512 x 128)/zygote/GPU/AMDGPU 235022 ns 269342.5 ns 0.87
batchedmm(2, Bsize=512)/forward/CPU/2 thread(s) 21334 ns 22083.5 ns 0.97
batchedmm(2, Bsize=512)/forward/CPU/4 thread(s) 24750 ns 25042 ns 0.99
batchedmm(2, Bsize=512)/forward/CPU/8 thread(s) 22583.5 ns 24666.5 ns 0.92
batchedmm(2, Bsize=512)/forward/CPU/1 thread(s) 5417 ns 5583 ns 0.97
batchedmm(2, Bsize=512)/forward/GPU/CUDA 18352 ns 17692 ns 1.04
batchedmm(2, Bsize=512)/forward/GPU/oneAPI 88615648 ns 89463574 ns 0.99
batchedmm(2, Bsize=512)/forward/GPU/AMDGPU 90001 ns 84500.5 ns 1.07
batchedmm(2, Bsize=512)/zygote/CPU/2 thread(s) 12187 ns 12041 ns 1.01
batchedmm(2, Bsize=512)/zygote/CPU/4 thread(s) 9250 ns 10167 ns 0.91
batchedmm(2, Bsize=512)/zygote/CPU/8 thread(s) 9625 ns 10584 ns 0.91
batchedmm(2, Bsize=512)/zygote/CPU/1 thread(s) 18375 ns 17770.5 ns 1.03
batchedmm(2, Bsize=512)/zygote/GPU/CUDA 219960 ns 217435 ns 1.01
batchedmm(2, Bsize=512)/zygote/GPU/oneAPI 148558454.5 ns 145607100.5 ns 1.02
batchedmm(2, Bsize=512)/zygote/GPU/AMDGPU 383514 ns 372684 ns 1.03
dense(512, bias=true, act=relu)(512 x 128)/forward/CPU/2 thread(s) 407000 ns 406209 ns 1.00
dense(512, bias=true, act=relu)(512 x 128)/forward/CPU/4 thread(s) 223500 ns 297375 ns 0.75
dense(512, bias=true, act=relu)(512 x 128)/forward/CPU/8 thread(s) 223250 ns 297291 ns 0.75
dense(512, bias=true, act=relu)(512 x 128)/forward/CPU/1 thread(s) 762333 ns 762584 ns 1.00
dense(512, bias=true, act=relu)(512 x 128)/forward/GPU/CUDA 47174.5 ns 46433 ns 1.02
dense(512, bias=true, act=relu)(512 x 128)/forward/GPU/oneAPI 1364423 ns 1403980.5 ns 0.97
dense(512, bias=true, act=relu)(512 x 128)/forward/GPU/AMDGPU 90560 ns 89281 ns 1.01
dense(512, bias=true, act=relu)(512 x 128)/zygote/CPU/2 thread(s) 1429042 ns 1428979 ns 1.00
dense(512, bias=true, act=relu)(512 x 128)/zygote/CPU/4 thread(s) 893625 ns 1164271 ns 0.77
dense(512, bias=true, act=relu)(512 x 128)/zygote/CPU/8 thread(s) 893041 ns 1168292 ns 0.76
dense(512, bias=true, act=relu)(512 x 128)/zygote/CPU/1 thread(s) 2387667 ns 2470833 ns 0.97
dense(512, bias=true, act=relu)(512 x 128)/zygote/GPU/CUDA 278164 ns 271835 ns 1.02
dense(512, bias=true, act=relu)(512 x 128)/zygote/GPU/oneAPI 14883883 ns 11893591 ns 1.25
dense(512, bias=true, act=relu)(512 x 128)/zygote/GPU/AMDGPU 378859 ns 378099 ns 1.00
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 435708 ns 437000 ns 1.00
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 431625 ns 440041 ns 0.98
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 432333 ns 438959 ns 0.98
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 450291 ns 449417 ns 1.00
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/CUDA 54012 ns 53469 ns 1.01
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/oneAPI 1020178 ns 1006988 ns 1.01
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/AMDGPU 238112 ns 234822 ns 1.01
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 4144125 ns 4132333 ns 1.00
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 4245667 ns 4262646 ns 1.00
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 4258583 ns 4266645.5 ns 1.00
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 4033625 ns 4029729 ns 1.00
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 257888 ns 251074 ns 1.03
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/oneAPI 32096567.5 ns 31753545 ns 1.01
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/AMDGPU 1222232 ns 1374018.5 ns 0.89
dense(32, bias=true, act=gelu)(32 x 128)/forward/CPU/2 thread(s) 9459 ns 9542 ns 0.99
dense(32, bias=true, act=gelu)(32 x 128)/forward/CPU/4 thread(s) 7250 ns 8167 ns 0.89
dense(32, bias=true, act=gelu)(32 x 128)/forward/CPU/8 thread(s) 7250 ns 8167 ns 0.89
dense(32, bias=true, act=gelu)(32 x 128)/forward/CPU/1 thread(s) 13458 ns 13417 ns 1.00
dense(32, bias=true, act=gelu)(32 x 128)/forward/GPU/CUDA 24527 ns 23409 ns 1.05
dense(32, bias=true, act=gelu)(32 x 128)/forward/GPU/oneAPI 2168792 ns 2209102 ns 0.98
dense(32, bias=true, act=gelu)(32 x 128)/forward/GPU/AMDGPU 211892 ns 211472 ns 1.00
dense(32, bias=true, act=gelu)(32 x 128)/zygote/CPU/2 thread(s) 49500 ns 49709 ns 1.00
dense(32, bias=true, act=gelu)(32 x 128)/zygote/CPU/4 thread(s) 49708 ns 49667 ns 1.00
dense(32, bias=true, act=gelu)(32 x 128)/zygote/CPU/8 thread(s) 49417 ns 50250 ns 0.98
dense(32, bias=true, act=gelu)(32 x 128)/zygote/CPU/1 thread(s) 49208.5 ns 49792 ns 0.99
dense(32, bias=true, act=gelu)(32 x 128)/zygote/GPU/CUDA 339671 ns 333916 ns 1.02
dense(32, bias=true, act=gelu)(32 x 128)/zygote/GPU/oneAPI 12414783 ns 10942581.5 ns 1.13
dense(32, bias=true, act=gelu)(32 x 128)/zygote/GPU/AMDGPU 654987 ns 658106.5 ns 1.00
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 125000 ns 84687.5 ns 1.48
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 89417 ns 90459 ns 0.99
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 86583 ns 85792 ns 1.01
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 120666.5 ns 84021 ns 1.44
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/CUDA 191941.5 ns 191047 ns 1.00
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/oneAPI 5724606 ns 5785931 ns 0.99
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/AMDGPU 200372 ns 222432 ns 0.90
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 2022250 ns 2027833 ns 1.00
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 2017666.5 ns 2014979.5 ns 1.00
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 2024042 ns 2016229.5 ns 1.00
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 2020812.5 ns 2015812.5 ns 1.00
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 516999 ns 505179 ns 1.02
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/oneAPI 28747125 ns 28452120 ns 1.01
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/AMDGPU 1090611 ns 1086300 ns 1.00

This comment was automatically generated by workflow using github-action-benchmark.

Please sign in to comment.