Skip to content

Commit

Permalink
ci: use sources for docs (#1100)
Browse files Browse the repository at this point in the history
  • Loading branch information
avik-pal authored Nov 22, 2024
1 parent 132619c commit fb901ea
Show file tree
Hide file tree
Showing 2 changed files with 11 additions and 8 deletions.
11 changes: 3 additions & 8 deletions .buildkite/documentation.yml
Original file line number Diff line number Diff line change
@@ -7,7 +7,7 @@ steps:
parallelism: 4
plugins:
- JuliaCI/julia#v1:
-version: "1.10"
+version: "1"
command: julia --code-coverage=user --color=yes --project=docs --threads=auto docs/tutorials.jl
env:
TUTORIAL_BACKEND_GROUP: "CUDA"
@@ -28,7 +28,7 @@ steps:
parallelism: 4
plugins:
- JuliaCI/julia#v1:
-version: "1.10"
+version: "1"
command: julia --code-coverage=user --color=yes --project=docs --threads=auto docs/tutorials.jl
env:
TUTORIAL_BACKEND_GROUP: "CPU"
@@ -50,7 +50,7 @@ steps:
- "tutorial-build-cpu"
plugins:
- JuliaCI/julia#v1:
-version: "1.10"
+version: "1"
- JuliaCI/julia-coverage#v1:
codecov: true
dirs:
@@ -69,11 +69,6 @@ steps:
julia --code-coverage=user --color=yes --project=docs -e '
println("--- :julia: Instantiating project")
using Pkg
-dev_pkgs = Pkg.PackageSpec[]
-for pkg in ("lib/LuxLib", "lib/LuxCore", "lib/MLDataDevices", "lib/LuxTestUtils", "lib/WeightInitializers", ".")
-push!(dev_pkgs, Pkg.PackageSpec(path=pkg));
-end
-Pkg.develop(dev_pkgs)
Pkg.instantiate()
println("+++ :julia: Building documentation")
include("docs/make.jl")'
Expand Down
8 changes: 8 additions & 0 deletions docs/Project.toml
Original file line number Diff line number Diff line change
@@ -62,3 +62,11 @@ StaticArrays = "1"
WeightInitializers = "1"
Zygote = "0.6.70"
julia = "1.10"

+[sources]
+Lux = { path = "../" }
+LuxLib = { path = "../lib/LuxLib" }
+LuxCore = { path = "../lib/LuxCore" }
+MLDataDevices = { path = "../lib/MLDataDevices" }
+LuxTestUtils = { path = "../lib/LuxTestUtils" }
+WeightInitializers = { path = "../lib/WeightInitializers" }

1 comment on commit fb901ea

@github-actions
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Lux Benchmarks

Benchmark suite Current: fb901ea Previous: 132619c Ratio
layernorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/2 thread(s) 4125 ns 3875 ns 1.06
layernorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/4 thread(s) 4083.5 ns 4208 ns 0.97
layernorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/8 thread(s) 5167 ns 5250 ns 0.98
layernorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/1 thread(s) 4250 ns 4333 ns 0.98
layernorm(2, act=gelu, affine=false)(4 x 32)/forward/GPU/CUDA 60836 ns 61892.5 ns 0.98
layernorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/2 thread(s) 10458 ns 10542 ns 0.99
layernorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/4 thread(s) 10208.5 ns 10209 ns 1.00
layernorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/8 thread(s) 10333 ns 10459 ns 0.99
layernorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/1 thread(s) 10292 ns 10417 ns 0.99
layernorm(2, act=gelu, affine=false)(4 x 32)/zygote/GPU/CUDA 426426 ns 433097 ns 0.98
bias_activation(32, act=relu)(32 x 128)/forward/CPU/2 thread(s) 1000 ns 1084 ns 0.92
bias_activation(32, act=relu)(32 x 128)/forward/CPU/4 thread(s) 1291 ns 1291 ns 1
bias_activation(32, act=relu)(32 x 128)/forward/CPU/8 thread(s) 1437.5 ns 1292 ns 1.11
bias_activation(32, act=relu)(32 x 128)/forward/CPU/1 thread(s) 1208 ns 1209 ns 1.00
bias_activation(32, act=relu)(32 x 128)/forward/GPU/CUDA 17928 ns 18531 ns 0.97
bias_activation(32, act=relu)(32 x 128)/zygote/CPU/2 thread(s) 4125 ns 4167 ns 0.99
bias_activation(32, act=relu)(32 x 128)/zygote/CPU/4 thread(s) 4084 ns 3917 ns 1.04
bias_activation(32, act=relu)(32 x 128)/zygote/CPU/8 thread(s) 4167 ns 4250 ns 0.98
bias_activation(32, act=relu)(32 x 128)/zygote/CPU/1 thread(s) 3958 ns 4083 ns 0.97
bias_activation(32, act=relu)(32 x 128)/zygote/GPU/CUDA 109688.5 ns 111975 ns 0.98
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 57625 ns 57583 ns 1.00
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 38333 ns 46292 ns 0.83
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 46792 ns 38042 ns 1.23
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 81167 ns 83125 ns 0.98
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/CUDA 37191 ns 37370 ns 1.00
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 2025916.5 ns 2031625 ns 1.00
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 2084833.5 ns 2085958 ns 1.00
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 2091333 ns 2088333.5 ns 1.00
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 1993604 ns 2005041 ns 0.99
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 194623 ns 198108 ns 0.98
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 144416 ns 143750 ns 1.00
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 147520.5 ns 146063 ns 1.01
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 144062.5 ns 145209 ns 0.99
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 144041 ns 144583.5 ns 1.00
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/GPU/CUDA 165620 ns 166112.5 ns 1.00
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 1116375.5 ns 1118042 ns 1.00
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 1135458 ns 1114250 ns 1.02
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 1116021 ns 1153000 ns 0.97
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 1117250 ns 1068770.5 ns 1.05
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 525200 ns 533468 ns 0.98
layernorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/2 thread(s) 3583 ns 3584 ns 1.00
layernorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/4 thread(s) 3416 ns 3750 ns 0.91
layernorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/8 thread(s) 4417 ns 4417 ns 1
layernorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/1 thread(s) 3750 ns 3958 ns 0.95
layernorm(2, act=relu, affine=true)(4 x 32)/forward/GPU/CUDA 67680 ns 72081 ns 0.94
layernorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/2 thread(s) 9083 ns 9000 ns 1.01
layernorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/4 thread(s) 9042 ns 8542 ns 1.06
layernorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/8 thread(s) 9291 ns 9041 ns 1.03
layernorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/1 thread(s) 8750 ns 8916 ns 0.98
layernorm(2, act=relu, affine=true)(4 x 32)/zygote/GPU/CUDA 488913 ns 503190.5 ns 0.97
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 16583.5 ns 15000 ns 1.11
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 15000 ns 15250 ns 0.98
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 16937.5 ns 16708 ns 1.01
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 14521 ns 15542 ns 0.93
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/GPU/CUDA 55104 ns 55903 ns 0.99
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 215166.5 ns 214187.5 ns 1.00
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 213375 ns 213604.5 ns 1.00
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 212833 ns 215395.5 ns 0.99
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 213208 ns 212917 ns 1.00
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 272083 ns 278881 ns 0.98
bias_activation(2, act=relu)(2 x 128)/forward/CPU/2 thread(s) 542 ns 500 ns 1.08
bias_activation(2, act=relu)(2 x 128)/forward/CPU/4 thread(s) 625 ns 542 ns 1.15
bias_activation(2, act=relu)(2 x 128)/forward/CPU/8 thread(s) 687.5 ns 750 ns 0.92
bias_activation(2, act=relu)(2 x 128)/forward/CPU/1 thread(s) 542 ns 583 ns 0.93
bias_activation(2, act=relu)(2 x 128)/forward/GPU/CUDA 17338 ns 17733 ns 0.98
bias_activation(2, act=relu)(2 x 128)/zygote/CPU/2 thread(s) 1583 ns 1625 ns 0.97
bias_activation(2, act=relu)(2 x 128)/zygote/CPU/4 thread(s) 1666 ns 1500 ns 1.11
bias_activation(2, act=relu)(2 x 128)/zygote/CPU/8 thread(s) 1708 ns 1625 ns 1.05
bias_activation(2, act=relu)(2 x 128)/zygote/CPU/1 thread(s) 1625 ns 1583 ns 1.03
bias_activation(2, act=relu)(2 x 128)/zygote/GPU/CUDA 102756.5 ns 105125.5 ns 0.98
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 7083 ns 7250 ns 0.98
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 5292 ns 5833 ns 0.91
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 5875 ns 5250 ns 1.12
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 10083 ns 10084 ns 1.00
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/CUDA 23408 ns 24106 ns 0.97
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 221750 ns 220750 ns 1.00
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 231917 ns 228084 ns 1.02
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 228875 ns 230459 ns 0.99
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 214167 ns 213708.5 ns 1.00
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 169815.5 ns 169707.5 ns 1.00
dense(32, bias=false, act=relu)(32 x 128)/forward/CPU/2 thread(s) 3875 ns 3875 ns 1
dense(32, bias=false, act=relu)(32 x 128)/forward/CPU/4 thread(s) 3917 ns 3917 ns 1
dense(32, bias=false, act=relu)(32 x 128)/forward/CPU/8 thread(s) 3917 ns 3917 ns 1
dense(32, bias=false, act=relu)(32 x 128)/forward/CPU/1 thread(s) 3917 ns 3875 ns 1.01
dense(32, bias=false, act=relu)(32 x 128)/forward/GPU/CUDA 23411 ns 23637 ns 0.99
dense(32, bias=false, act=relu)(32 x 128)/zygote/CPU/2 thread(s) 16583.5 ns 16708 ns 0.99
dense(32, bias=false, act=relu)(32 x 128)/zygote/CPU/4 thread(s) 16459 ns 16834 ns 0.98
dense(32, bias=false, act=relu)(32 x 128)/zygote/CPU/8 thread(s) 16709 ns 16875 ns 0.99
dense(32, bias=false, act=relu)(32 x 128)/zygote/CPU/1 thread(s) 16791 ns 16625 ns 1.01
dense(32, bias=false, act=relu)(32 x 128)/zygote/GPU/CUDA 162393 ns 161602 ns 1.00
dense(512, bias=false, act=gelu)(512 x 128)/forward/CPU/2 thread(s) 569208 ns 578416.5 ns 0.98
dense(512, bias=false, act=gelu)(512 x 128)/forward/CPU/4 thread(s) 569667 ns 569958 ns 1.00
dense(512, bias=false, act=gelu)(512 x 128)/forward/CPU/8 thread(s) 570125 ns 579292 ns 0.98
dense(512, bias=false, act=gelu)(512 x 128)/forward/CPU/1 thread(s) 578750 ns 578291 ns 1.00
dense(512, bias=false, act=gelu)(512 x 128)/forward/GPU/CUDA 113197 ns 113009 ns 1.00
dense(512, bias=false, act=gelu)(512 x 128)/zygote/CPU/2 thread(s) 1418708 ns 1417979.5 ns 1.00
dense(512, bias=false, act=gelu)(512 x 128)/zygote/CPU/4 thread(s) 1421583 ns 1419167 ns 1.00
dense(512, bias=false, act=gelu)(512 x 128)/zygote/CPU/8 thread(s) 1420834 ns 1424875 ns 1.00
dense(512, bias=false, act=gelu)(512 x 128)/zygote/CPU/1 thread(s) 1432291 ns 1426416 ns 1.00
dense(512, bias=false, act=gelu)(512 x 128)/zygote/GPU/CUDA 211123.5 ns 210883 ns 1.00
lenet(28, 28, 1, 64)/forward/CPU/2 thread(s) 1076625 ns 1067000 ns 1.01
lenet(28, 28, 1, 64)/forward/CPU/4 thread(s) 938625 ns 958417 ns 0.98
lenet(28, 28, 1, 64)/forward/CPU/8 thread(s) 1353166 ns 1336917 ns 1.01
lenet(28, 28, 1, 64)/forward/CPU/1 thread(s) 1298500 ns 1304396 ns 1.00
lenet(28, 28, 1, 64)/forward/GPU/CUDA 277930.5 ns 271759 ns 1.02
lenet(28, 28, 1, 64)/zygote/CPU/2 thread(s) 5845333 ns 5795104.5 ns 1.01
lenet(28, 28, 1, 64)/zygote/CPU/4 thread(s) 4593146 ns 4601125 ns 1.00
lenet(28, 28, 1, 64)/zygote/CPU/8 thread(s) 4960354 ns 4929084 ns 1.01
lenet(28, 28, 1, 64)/zygote/CPU/1 thread(s) 5524145.5 ns 5750083 ns 0.96
lenet(28, 28, 1, 64)/zygote/GPU/CUDA 1090079 ns 1068932 ns 1.02
dense(2, bias=true, act=relu)(2 x 128)/forward/CPU/2 thread(s) 500 ns 500 ns 1
dense(2, bias=true, act=relu)(2 x 128)/forward/CPU/4 thread(s) 542 ns 500 ns 1.08
dense(2, bias=true, act=relu)(2 x 128)/forward/CPU/8 thread(s) 542 ns 542 ns 1
dense(2, bias=true, act=relu)(2 x 128)/forward/CPU/1 thread(s) 500 ns 542 ns 0.92
dense(2, bias=true, act=relu)(2 x 128)/forward/GPU/CUDA 23601.5 ns 23274 ns 1.01
dense(2, bias=true, act=relu)(2 x 128)/zygote/CPU/2 thread(s) 2083 ns 2125 ns 0.98
dense(2, bias=true, act=relu)(2 x 128)/zygote/CPU/4 thread(s) 2125 ns 2166 ns 0.98
dense(2, bias=true, act=relu)(2 x 128)/zygote/CPU/8 thread(s) 2209 ns 2167 ns 1.02
dense(2, bias=true, act=relu)(2 x 128)/zygote/CPU/1 thread(s) 2083 ns 2208 ns 0.94
dense(2, bias=true, act=relu)(2 x 128)/zygote/GPU/CUDA 169946.5 ns 171283 ns 0.99
layernorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/2 thread(s) 3666 ns 4333 ns 0.85
layernorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/4 thread(s) 4417 ns 4125 ns 1.07
layernorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/8 thread(s) 4709 ns 5083 ns 0.93
layernorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/1 thread(s) 4500 ns 4292 ns 1.05
layernorm(2, act=relu, affine=false)(32 x 32)/forward/GPU/CUDA 65407 ns 66130 ns 0.99
layernorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/2 thread(s) 10834 ns 11625 ns 0.93
layernorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/4 thread(s) 11292 ns 11458 ns 0.99
layernorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/8 thread(s) 11667 ns 12458 ns 0.94
layernorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/1 thread(s) 10958 ns 11709 ns 0.94
layernorm(2, act=relu, affine=false)(32 x 32)/zygote/GPU/CUDA 453534 ns 452684.5 ns 1.00
groupnorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/2 thread(s) 6167 ns 6375 ns 0.97
groupnorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/4 thread(s) 7479.5 ns 6959 ns 1.07
groupnorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/8 thread(s) 8500 ns 8229.5 ns 1.03
groupnorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/1 thread(s) 6375 ns 6916 ns 0.92
groupnorm(2, act=relu, affine=true)(32 x 32)/forward/GPU/CUDA 52550.5 ns 52019 ns 1.01
groupnorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/2 thread(s) 16583 ns 16875 ns 0.98
groupnorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/4 thread(s) 17500 ns 17000 ns 1.03
groupnorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/8 thread(s) 19833 ns 18166 ns 1.09
groupnorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/1 thread(s) 16625 ns 17542 ns 0.95
groupnorm(2, act=relu, affine=true)(32 x 32)/zygote/GPU/CUDA 303262 ns 301500.5 ns 1.01
batchnorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/2 thread(s) 542 ns 542 ns 1
batchnorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/4 thread(s) 667 ns 542 ns 1.23
batchnorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/8 thread(s) 625 ns 666 ns 0.94
batchnorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/1 thread(s) 542 ns 667 ns 0.81
batchnorm(2, act=identity, affine=false)(32 x 32)/forward/GPU/CUDA 31843 ns 32512 ns 0.98
batchnorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/2 thread(s) 8542 ns 8500 ns 1.00
batchnorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/4 thread(s) 8875 ns 8750 ns 1.01
batchnorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/8 thread(s) 9250 ns 9500 ns 0.97
batchnorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/1 thread(s) 8208 ns 8959 ns 0.92
batchnorm(2, act=identity, affine=false)(32 x 32)/zygote/GPU/CUDA 159642 ns 157915 ns 1.01
dense(512, bias=false, act=identity)(512 x 128)/forward/CPU/2 thread(s) 64792 ns 64542 ns 1.00
dense(512, bias=false, act=identity)(512 x 128)/forward/CPU/4 thread(s) 64542 ns 64625 ns 1.00
dense(512, bias=false, act=identity)(512 x 128)/forward/CPU/8 thread(s) 64542 ns 64750 ns 1.00
dense(512, bias=false, act=identity)(512 x 128)/forward/CPU/1 thread(s) 64375 ns 64875 ns 0.99
dense(512, bias=false, act=identity)(512 x 128)/forward/GPU/CUDA 111120 ns 111658.5 ns 1.00
dense(512, bias=false, act=identity)(512 x 128)/zygote/CPU/2 thread(s) 280042 ns 279708 ns 1.00
dense(512, bias=false, act=identity)(512 x 128)/zygote/CPU/4 thread(s) 291791 ns 283750 ns 1.03
dense(512, bias=false, act=identity)(512 x 128)/zygote/CPU/8 thread(s) 279250 ns 293250 ns 0.95
dense(512, bias=false, act=identity)(512 x 128)/zygote/CPU/1 thread(s) 277208 ns 284521 ns 0.97
dense(512, bias=false, act=identity)(512 x 128)/zygote/GPU/CUDA 184735.5 ns 185586.5 ns 1.00
mlp7layer_bn(gelu)(32 x 256)/forward/CPU/2 thread(s) 3278875 ns 3282500 ns 1.00
mlp7layer_bn(gelu)(32 x 256)/forward/CPU/4 thread(s) 2813375 ns 3076875 ns 0.91
mlp7layer_bn(gelu)(32 x 256)/forward/CPU/8 thread(s) 3029687.5 ns 2795834 ns 1.08
mlp7layer_bn(gelu)(32 x 256)/forward/CPU/1 thread(s) 3938209 ns 4063541.5 ns 0.97
mlp7layer_bn(gelu)(32 x 256)/forward/GPU/CUDA 578907.5 ns 567714 ns 1.02
mlp7layer_bn(gelu)(32 x 256)/zygote/CPU/2 thread(s) 7620083 ns 7638583 ns 1.00
mlp7layer_bn(gelu)(32 x 256)/zygote/CPU/4 thread(s) 7352417 ns 7366000 ns 1.00
mlp7layer_bn(gelu)(32 x 256)/zygote/CPU/8 thread(s) 7457271 ns 7289042 ns 1.02
mlp7layer_bn(gelu)(32 x 256)/zygote/CPU/1 thread(s) 8189500 ns 8172916 ns 1.00
mlp7layer_bn(gelu)(32 x 256)/zygote/GPU/CUDA 1328385 ns 1335450 ns 0.99
mlp7layer_bn(gelu)(32 x 256)/enzyme/CPU/2 thread(s) 17561125 ns 17555833 ns 1.00
mlp7layer_bn(gelu)(32 x 256)/enzyme/CPU/4 thread(s) 17648625 ns 17413291.5 ns 1.01
mlp7layer_bn(gelu)(32 x 256)/enzyme/CPU/8 thread(s) 17534459 ns 17640417 ns 0.99
mlp7layer_bn(gelu)(32 x 256)/enzyme/CPU/1 thread(s) 14095167 ns 14085667 ns 1.00
Conv((3, 3), 32 => 32, relu)(64 x 64 x 32 x 128)/forward/CPU/2 thread(s) 23588417 ns 23644667 ns 1.00
Conv((3, 3), 32 => 32, relu)(64 x 64 x 32 x 128)/forward/CPU/4 thread(s) 44459541 ns 33391375 ns 1.33
Conv((3, 3), 32 => 32, relu)(64 x 64 x 32 x 128)/forward/CPU/8 thread(s) 37064416.5 ns 40912708 ns 0.91
Conv((3, 3), 32 => 32, relu)(64 x 64 x 32 x 128)/forward/CPU/1 thread(s) 34977333.5 ns 35048479 ns 1.00
Conv((3, 3), 32 => 32, relu)(64 x 64 x 32 x 128)/forward/GPU/CUDA 1845684 ns 1855237.5 ns 0.99
Conv((3, 3), 32 => 32, relu)(64 x 64 x 32 x 128)/zygote/CPU/2 thread(s) 189659041 ns 189754584 ns 1.00
Conv((3, 3), 32 => 32, relu)(64 x 64 x 32 x 128)/zygote/CPU/4 thread(s) 250146875 ns 232353000 ns 1.08
Conv((3, 3), 32 => 32, relu)(64 x 64 x 32 x 128)/zygote/CPU/8 thread(s) 193409375 ns 201284750 ns 0.96
Conv((3, 3), 32 => 32, relu)(64 x 64 x 32 x 128)/zygote/CPU/1 thread(s) 434181959 ns 435226125 ns 1.00
Conv((3, 3), 32 => 32, relu)(64 x 64 x 32 x 128)/zygote/GPU/CUDA 18049039.5 ns 13860033 ns 1.30
Conv((3, 3), 32 => 32, relu)(64 x 64 x 32 x 128)/enzyme/CPU/2 thread(s) 290672125 ns 290571042 ns 1.00
Conv((3, 3), 32 => 32, relu)(64 x 64 x 32 x 128)/enzyme/CPU/4 thread(s) 356317062.5 ns 334832916 ns 1.06
Conv((3, 3), 32 => 32, relu)(64 x 64 x 32 x 128)/enzyme/CPU/8 thread(s) 296289666.5 ns 303703583 ns 0.98
Conv((3, 3), 32 => 32, relu)(64 x 64 x 32 x 128)/enzyme/CPU/1 thread(s) 392800437.5 ns 393811604 ns 1.00
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 22875 ns 21541 ns 1.06
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 22938 ns 22375 ns 1.03
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 24562.5 ns 23354 ns 1.05
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 24416 ns 24500 ns 1.00
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/GPU/CUDA 96194.5 ns 95582 ns 1.01
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 103875 ns 103250 ns 1.01
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 103416 ns 115312.5 ns 0.90
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 104292 ns 104625 ns 1.00
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 103125.5 ns 102667 ns 1.00
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 506291.5 ns 503695.5 ns 1.01
layernorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/2 thread(s) 5917 ns 5750 ns 1.03
layernorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/4 thread(s) 6000 ns 5791 ns 1.04
layernorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/8 thread(s) 6584 ns 7666 ns 0.86
layernorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/1 thread(s) 6209 ns 6250 ns 0.99
layernorm(2, act=relu, affine=true)(32 x 32)/forward/GPU/CUDA 68552.5 ns 68642 ns 1.00
layernorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/2 thread(s) 15166.5 ns 14875 ns 1.02
layernorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/4 thread(s) 15500 ns 14625 ns 1.06
layernorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/8 thread(s) 15542 ns 16250 ns 0.96
layernorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/1 thread(s) 14958 ns 14833 ns 1.01
layernorm(2, act=relu, affine=true)(32 x 32)/zygote/GPU/CUDA 480464 ns 478112.5 ns 1.00
Conv((3, 3), 4 => 4, identity)(64 x 64 x 4 x 128)/forward/CPU/2 thread(s) 2996875 ns 3019792 ns 0.99
Conv((3, 3), 4 => 4, identity)(64 x 64 x 4 x 128)/forward/CPU/4 thread(s) 2072750 ns 2069896 ns 1.00
Conv((3, 3), 4 => 4, identity)(64 x 64 x 4 x 128)/forward/CPU/8 thread(s) 2257667 ns 2279000 ns 0.99
Conv((3, 3), 4 => 4, identity)(64 x 64 x 4 x 128)/forward/CPU/1 thread(s) 4838583 ns 4750917 ns 1.02
Conv((3, 3), 4 => 4, identity)(64 x 64 x 4 x 128)/forward/GPU/CUDA 584192 ns 583001 ns 1.00
Conv((3, 3), 4 => 4, identity)(64 x 64 x 4 x 128)/zygote/CPU/2 thread(s) 23549437 ns 23604770.5 ns 1.00
Conv((3, 3), 4 => 4, identity)(64 x 64 x 4 x 128)/zygote/CPU/4 thread(s) 18342167 ns 18003875 ns 1.02
Conv((3, 3), 4 => 4, identity)(64 x 64 x 4 x 128)/zygote/CPU/8 thread(s) 17896791 ns 18293125 ns 0.98
Conv((3, 3), 4 => 4, identity)(64 x 64 x 4 x 128)/zygote/CPU/1 thread(s) 35570625 ns 35919729.5 ns 0.99
Conv((3, 3), 4 => 4, identity)(64 x 64 x 4 x 128)/zygote/GPU/CUDA 2764116 ns 3106744 ns 0.89
Conv((3, 3), 4 => 4, identity)(64 x 64 x 4 x 128)/enzyme/CPU/2 thread(s) 33587937.5 ns 33297687 ns 1.01
Conv((3, 3), 4 => 4, identity)(64 x 64 x 4 x 128)/enzyme/CPU/4 thread(s) 28029333 ns 27474958 ns 1.02
Conv((3, 3), 4 => 4, identity)(64 x 64 x 4 x 128)/enzyme/CPU/8 thread(s) 28377209 ns 29070229.5 ns 0.98
Conv((3, 3), 4 => 4, identity)(64 x 64 x 4 x 128)/enzyme/CPU/1 thread(s) 41334187.5 ns 41830959 ns 0.99
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 75479 ns 73396 ns 1.03
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 73958.5 ns 75125 ns 0.98
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 74125 ns 74875 ns 0.99
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 72166 ns 72959 ns 0.99
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/CUDA 104339 ns 103514 ns 1.01
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 203458.5 ns 274208 ns 0.74
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 280916.5 ns 205959 ns 1.36
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 209583 ns 255333 ns 0.82
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 216291.5 ns 296916 ns 0.73
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 562778.5 ns 554316 ns 1.02
layernorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/2 thread(s) 11708 ns 11167 ns 1.05
layernorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/4 thread(s) 12833 ns 11875 ns 1.08
layernorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/8 thread(s) 13042 ns 13458 ns 0.97
layernorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/1 thread(s) 11917 ns 12458 ns 0.96
layernorm(2, act=gelu, affine=true)(32 x 32)/forward/GPU/CUDA 72705 ns 72256.5 ns 1.01
layernorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/2 thread(s) 26645.5 ns 26583.5 ns 1.00
layernorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/4 thread(s) 26458 ns 26833 ns 0.99
layernorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/8 thread(s) 27458 ns 28084 ns 0.98
layernorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/1 thread(s) 26792 ns 26708 ns 1.00
layernorm(2, act=gelu, affine=true)(32 x 32)/zygote/GPU/CUDA 488247 ns 483481.5 ns 1.01
groupnorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/2 thread(s) 12000 ns 11520.5 ns 1.04
groupnorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/4 thread(s) 13750 ns 13041 ns 1.05
groupnorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/8 thread(s) 14000 ns 13750 ns 1.02
groupnorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/1 thread(s) 12500 ns 12875 ns 0.97
groupnorm(2, act=gelu, affine=true)(32 x 32)/forward/GPU/CUDA 55166 ns 52959.5 ns 1.04
groupnorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/2 thread(s) 25583 ns 25500 ns 1.00
groupnorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/4 thread(s) 26416 ns 25542 ns 1.03
groupnorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/8 thread(s) 26375 ns 26375 ns 1
groupnorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/1 thread(s) 28167 ns 26542 ns 1.06
groupnorm(2, act=gelu, affine=true)(32 x 32)/zygote/GPU/CUDA 313572.5 ns 310926 ns 1.01
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 181541.5 ns 179125 ns 1.01
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 181104 ns 182625 ns 0.99
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 181895.5 ns 183958 ns 0.99
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 181916 ns 182416 ns 1.00
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/CUDA 59339.5 ns 58111 ns 1.02
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 612417 ns 582958 ns 1.05
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 590459 ns 583209 ns 1.01
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 583541 ns 610042 ns 0.96
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 582416 ns 582000 ns 1.00
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 294347 ns 286370 ns 1.03
layernorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/2 thread(s) 5854.5 ns 5729.5 ns 1.02
layernorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/4 thread(s) 7000 ns 6334 ns 1.11
layernorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/8 thread(s) 7167 ns 7500 ns 0.96
layernorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/1 thread(s) 6042 ns 6083 ns 0.99
layernorm(2, act=identity, affine=true)(32 x 32)/forward/GPU/CUDA 72861 ns 71136.5 ns 1.02
layernorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/2 thread(s) 14208.5 ns 14167 ns 1.00
layernorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/4 thread(s) 14333 ns 14500 ns 0.99
layernorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/8 thread(s) 15084 ns 15667 ns 0.96
layernorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/1 thread(s) 14208 ns 14667 ns 0.97
layernorm(2, act=identity, affine=true)(32 x 32)/zygote/GPU/CUDA 476457 ns 468005 ns 1.02
batchedmm(512, Bsize=4)/forward/CPU/2 thread(s) 1198334 ns 1186749.5 ns 1.01
batchedmm(512, Bsize=4)/forward/CPU/4 thread(s) 1236458 ns 1247334 ns 0.99
batchedmm(512, Bsize=4)/forward/CPU/8 thread(s) 1270167 ns 1282666.5 ns 0.99
batchedmm(512, Bsize=4)/forward/CPU/1 thread(s) 1009834 ns 841729 ns 1.20
batchedmm(512, Bsize=4)/forward/GPU/CUDA 301349 ns 301667 ns 1.00
batchedmm(512, Bsize=4)/zygote/CPU/2 thread(s) 4121104 ns 4101771 ns 1.00
batchedmm(512, Bsize=4)/zygote/CPU/4 thread(s) 4571459 ns 4417458 ns 1.03
batchedmm(512, Bsize=4)/zygote/CPU/8 thread(s) 4583146 ns 4790916 ns 0.96
batchedmm(512, Bsize=4)/zygote/CPU/1 thread(s) 3708333 ns 3731833.5 ns 0.99
batchedmm(512, Bsize=4)/zygote/GPU/CUDA 1054428 ns 1043818 ns 1.01
dense(2, bias=true, act=gelu)(2 x 128)/forward/CPU/2 thread(s) 1833 ns 1792 ns 1.02
dense(2, bias=true, act=gelu)(2 x 128)/forward/CPU/4 thread(s) 1875 ns 1792 ns 1.05
dense(2, bias=true, act=gelu)(2 x 128)/forward/CPU/8 thread(s) 1875 ns 1875 ns 1
dense(2, bias=true, act=gelu)(2 x 128)/forward/CPU/1 thread(s) 1875 ns 1834 ns 1.02
dense(2, bias=true, act=gelu)(2 x 128)/forward/GPU/CUDA 24401 ns 23460 ns 1.04
dense(2, bias=true, act=gelu)(2 x 128)/zygote/CPU/2 thread(s) 4792 ns 4875 ns 0.98
dense(2, bias=true, act=gelu)(2 x 128)/zygote/CPU/4 thread(s) 4875 ns 4834 ns 1.01
dense(2, bias=true, act=gelu)(2 x 128)/zygote/CPU/8 thread(s) 5042 ns 4917 ns 1.03
dense(2, bias=true, act=gelu)(2 x 128)/zygote/CPU/1 thread(s) 4875 ns 4958 ns 0.98
dense(2, bias=true, act=gelu)(2 x 128)/zygote/GPU/CUDA 192852.5 ns 189873 ns 1.02
groupnorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/2 thread(s) 5916.5 ns 5792 ns 1.02
groupnorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/4 thread(s) 6625 ns 6125 ns 1.08
groupnorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/8 thread(s) 7625 ns 7187.5 ns 1.06
groupnorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/1 thread(s) 5916 ns 6208 ns 0.95
groupnorm(2, act=relu, affine=true)(4 x 32)/forward/GPU/CUDA 57663 ns 55970.5 ns 1.03
groupnorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/2 thread(s) 10562.5 ns 10625 ns 0.99
groupnorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/4 thread(s) 11417 ns 11083 ns 1.03
groupnorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/8 thread(s) 12083 ns 11584 ns 1.04
groupnorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/1 thread(s) 10459 ns 11500 ns 0.91
groupnorm(2, act=relu, affine=true)(4 x 32)/zygote/GPU/CUDA 339260 ns 332298.5 ns 1.02
dense(2, bias=false, act=relu)(2 x 128)/forward/CPU/2 thread(s) 333 ns 292 ns 1.14
dense(2, bias=false, act=relu)(2 x 128)/forward/CPU/4 thread(s) 375 ns 292 ns 1.28
dense(2, bias=false, act=relu)(2 x 128)/forward/CPU/8 thread(s) 375 ns 375 ns 1
dense(2, bias=false, act=relu)(2 x 128)/forward/CPU/1 thread(s) 292 ns 292 ns 1
dense(2, bias=false, act=relu)(2 x 128)/forward/GPU/CUDA 23460 ns 22660 ns 1.04
dense(2, bias=false, act=relu)(2 x 128)/zygote/CPU/2 thread(s) 2791 ns 2708 ns 1.03
dense(2, bias=false, act=relu)(2 x 128)/zygote/CPU/4 thread(s) 2792 ns 2750 ns 1.02
dense(2, bias=false, act=relu)(2 x 128)/zygote/CPU/8 thread(s) 2709 ns 3000 ns 0.90
dense(2, bias=false, act=relu)(2 x 128)/zygote/CPU/1 thread(s) 2791 ns 2709 ns 1.03
dense(2, bias=false, act=relu)(2 x 128)/zygote/GPU/CUDA 162941.5 ns 159360 ns 1.02
groupnorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/2 thread(s) 11542 ns 11292 ns 1.02
groupnorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/4 thread(s) 12209 ns 11792 ns 1.04
groupnorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/8 thread(s) 13875 ns 13250 ns 1.05
groupnorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/1 thread(s) 11583 ns 12229.5 ns 0.95
groupnorm(2, act=gelu, affine=false)(32 x 32)/forward/GPU/CUDA 59011.5 ns 57130.5 ns 1.03
groupnorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/2 thread(s) 24375 ns 24708 ns 0.99
groupnorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/4 thread(s) 24583 ns 24167 ns 1.02
groupnorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/8 thread(s) 25208 ns 25854 ns 0.98
groupnorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/1 thread(s) 24792 ns 24916.5 ns 1.00
groupnorm(2, act=gelu, affine=false)(32 x 32)/zygote/GPU/CUDA 303188 ns 300198 ns 1.01
dense(32, bias=true, act=relu)(32 x 128)/forward/CPU/2 thread(s) 4167 ns 4208 ns 0.99
dense(32, bias=true, act=relu)(32 x 128)/forward/CPU/4 thread(s) 4208 ns 4125 ns 1.02
dense(32, bias=true, act=relu)(32 x 128)/forward/CPU/8 thread(s) 4208 ns 4250 ns 0.99
dense(32, bias=true, act=relu)(32 x 128)/forward/CPU/1 thread(s) 4208 ns 4208 ns 1
dense(32, bias=true, act=relu)(32 x 128)/forward/GPU/CUDA 25111 ns 24574 ns 1.02
dense(32, bias=true, act=relu)(32 x 128)/zygote/CPU/2 thread(s) 16042 ns 16166 ns 0.99
dense(32, bias=true, act=relu)(32 x 128)/zygote/CPU/4 thread(s) 15917 ns 16000 ns 0.99
dense(32, bias=true, act=relu)(32 x 128)/zygote/CPU/8 thread(s) 16291 ns 16042 ns 1.02
dense(32, bias=true, act=relu)(32 x 128)/zygote/CPU/1 thread(s) 16291 ns 16375 ns 0.99
dense(32, bias=true, act=relu)(32 x 128)/zygote/GPU/CUDA 202144.5 ns 201392 ns 1.00
batchnorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/2 thread(s) 5875 ns 5750 ns 1.02
batchnorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/4 thread(s) 5833 ns 5750 ns 1.01
batchnorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/8 thread(s) 5916 ns 5875 ns 1.01
batchnorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/1 thread(s) 5750 ns 5916 ns 0.97
batchnorm(2, act=gelu, affine=false)(32 x 32)/forward/GPU/CUDA 34056 ns 33153 ns 1.03
batchnorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/2 thread(s) 20520.5 ns 20333 ns 1.01
batchnorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/4 thread(s) 21000 ns 20792 ns 1.01
batchnorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/8 thread(s) 21167 ns 20917 ns 1.01
batchnorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/1 thread(s) 21333 ns 21375 ns 1.00
batchnorm(2, act=gelu, affine=false)(32 x 32)/zygote/GPU/CUDA 179609.5 ns 175780 ns 1.02
batchedmm(16, Bsize=512)/forward/CPU/2 thread(s) 425458.5 ns 417417 ns 1.02
batchedmm(16, Bsize=512)/forward/CPU/4 thread(s) 364854.5 ns 378854.5 ns 0.96
batchedmm(16, Bsize=512)/forward/CPU/8 thread(s) 482520.5 ns 487270.5 ns 0.99
batchedmm(16, Bsize=512)/forward/CPU/1 thread(s) 103125 ns 103917 ns 0.99
batchedmm(16, Bsize=512)/forward/GPU/CUDA 67737 ns 66399.5 ns 1.02
batchedmm(16, Bsize=512)/zygote/CPU/2 thread(s) 906625 ns 877583 ns 1.03
batchedmm(16, Bsize=512)/zygote/CPU/4 thread(s) 982042 ns 949562.5 ns 1.03
batchedmm(16, Bsize=512)/zygote/CPU/8 thread(s) 1181333 ns 1206625 ns 0.98
batchedmm(16, Bsize=512)/zygote/CPU/1 thread(s) 377458 ns 469167 ns 0.80
batchedmm(16, Bsize=512)/zygote/GPU/CUDA 194135 ns 191112 ns 1.02
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 81333 ns 85417 ns 0.95
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 82041 ns 81083 ns 1.01
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 84291 ns 84625 ns 1.00
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 81813 ns 85417 ns 0.96
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/GPU/CUDA 194522 ns 193239.5 ns 1.01
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 1927625 ns 1913750 ns 1.01
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 1941000 ns 1913542 ns 1.01
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 1930917 ns 1943083.5 ns 0.99
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 1842062 ns 1906896 ns 0.97
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 390656 ns 406558 ns 0.96
dense(2, bias=true, act=identity)(2 x 128)/forward/CPU/2 thread(s) 292 ns 292 ns 1
dense(2, bias=true, act=identity)(2 x 128)/forward/CPU/4 thread(s) 292 ns 291 ns 1.00
dense(2, bias=true, act=identity)(2 x 128)/forward/CPU/8 thread(s) 333 ns 333 ns 1
dense(2, bias=true, act=identity)(2 x 128)/forward/CPU/1 thread(s) 333 ns 292 ns 1.14
dense(2, bias=true, act=identity)(2 x 128)/forward/GPU/CUDA 22388 ns 22047.5 ns 1.02
dense(2, bias=true, act=identity)(2 x 128)/zygote/CPU/2 thread(s) 1792 ns 1792 ns 1
dense(2, bias=true, act=identity)(2 x 128)/zygote/CPU/4 thread(s) 1792 ns 1875 ns 0.96
dense(2, bias=true, act=identity)(2 x 128)/zygote/CPU/8 thread(s) 1834 ns 1834 ns 1
dense(2, bias=true, act=identity)(2 x 128)/zygote/CPU/1 thread(s) 1833 ns 1875 ns 0.98
dense(2, bias=true, act=identity)(2 x 128)/zygote/GPU/CUDA 171479 ns 171306.5 ns 1.00
groupnorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/2 thread(s) 6542 ns 6209 ns 1.05
groupnorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/4 thread(s) 7083.5 ns 6625 ns 1.07
groupnorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/8 thread(s) 8020.5 ns 8542 ns 0.94
groupnorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/1 thread(s) 6500 ns 7125 ns 0.91
groupnorm(2, act=gelu, affine=false)(4 x 32)/forward/GPU/CUDA 60274 ns 60422 ns 1.00
groupnorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/2 thread(s) 8917 ns 9000 ns 0.99
groupnorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/4 thread(s) 9417 ns 8958 ns 1.05
groupnorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/8 thread(s) 9916 ns 9584 ns 1.03
groupnorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/1 thread(s) 9208 ns 9416 ns 0.98
groupnorm(2, act=gelu, affine=false)(4 x 32)/zygote/GPU/CUDA 311149 ns 313100.5 ns 0.99
Conv((3, 3), 64 => 64, relu)(64 x 64 x 64 x 128)/forward/CPU/2 thread(s) 120884833.5 ns 119013624.5 ns 1.02
Conv((3, 3), 64 => 64, relu)(64 x 64 x 64 x 128)/forward/CPU/4 thread(s) 181722750 ns 174073709 ns 1.04
Conv((3, 3), 64 => 64, relu)(64 x 64 x 64 x 128)/forward/CPU/8 thread(s) 148231625 ns 154836458 ns 0.96
Conv((3, 3), 64 => 64, relu)(64 x 64 x 64 x 128)/forward/CPU/1 thread(s) 108144417 ns 106465208 ns 1.02
Conv((3, 3), 64 => 64, relu)(64 x 64 x 64 x 128)/forward/GPU/CUDA 5478841 ns 5473107.5 ns 1.00
Conv((3, 3), 64 => 64, relu)(64 x 64 x 64 x 128)/zygote/CPU/2 thread(s) 615355583.5 ns 615549000 ns 1.00
Conv((3, 3), 64 => 64, relu)(64 x 64 x 64 x 128)/zygote/CPU/4 thread(s) 581447666.5 ns 555627500 ns 1.05
Conv((3, 3), 64 => 64, relu)(64 x 64 x 64 x 128)/zygote/CPU/8 thread(s) 451634708.5 ns 469486625 ns 0.96
Conv((3, 3), 64 => 64, relu)(64 x 64 x 64 x 128)/zygote/CPU/1 thread(s) 757933250.5 ns 758488604 ns 1.00
Conv((3, 3), 64 => 64, relu)(64 x 64 x 64 x 128)/zygote/GPU/CUDA 34994190 ns 34956527 ns 1.00
Conv((3, 3), 64 => 64, relu)(64 x 64 x 64 x 128)/enzyme/CPU/2 thread(s) 649420209 ns 650955333 ns 1.00
Conv((3, 3), 64 => 64, relu)(64 x 64 x 64 x 128)/enzyme/CPU/4 thread(s) 687787021 ns 665997520.5 ns 1.03
Conv((3, 3), 64 => 64, relu)(64 x 64 x 64 x 128)/enzyme/CPU/8 thread(s) 584232000.5 ns 596311875 ns 0.98
Conv((3, 3), 64 => 64, relu)(64 x 64 x 64 x 128)/enzyme/CPU/1 thread(s) 744942000 ns 746344250 ns 1.00
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 59500 ns 59041 ns 1.01
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 39125 ns 47750 ns 0.82
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 48020.5 ns 39041 ns 1.23
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 83458 ns 84708.5 ns 0.99
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/GPU/CUDA 38331 ns 36941 ns 1.04
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 1946625 ns 1922166 ns 1.01
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 1985458 ns 1978041 ns 1.00
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 1983521 ns 1990167 ns 1.00
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 1887334 ns 1920167 ns 0.98
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 176268 ns 173728 ns 1.01
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 265750 ns 282041.5 ns 0.94
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 268104.5 ns 266458 ns 1.01
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 269291.5 ns 273853.5 ns 0.98
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 265125 ns 270333 ns 0.98
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/CUDA 125359 ns 135453.5 ns 0.93
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 690208 ns 674666 ns 1.02
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 658417 ns 684354 ns 0.96
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 603125 ns 676145.5 ns 0.89
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 594458 ns 596375 ns 1.00
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 701612 ns 752272.5 ns 0.93
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 2169417 ns 2253417 ns 0.96
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 2237833 ns 2217895.5 ns 1.01
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 2188625 ns 2190479 ns 1.00
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 2203000 ns 2202416.5 ns 1.00
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/CUDA 133751 ns 133169 ns 1.00
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 5513083.5 ns 5479500 ns 1.01
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 5572520.5 ns 5506916 ns 1.01
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 5508208 ns 5588312.5 ns 0.99
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 5485271 ns 5564021 ns 0.99
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 720574 ns 794371.5 ns 0.91
dense(512, bias=true, act=gelu)(512 x 128)/forward/CPU/2 thread(s) 638458 ns 646958 ns 0.99
dense(512, bias=true, act=gelu)(512 x 128)/forward/CPU/4 thread(s) 640250 ns 656500 ns 0.98
dense(512, bias=true, act=gelu)(512 x 128)/forward/CPU/8 thread(s) 640416 ns 640416 ns 1
dense(512, bias=true, act=gelu)(512 x 128)/forward/CPU/1 thread(s) 642666.5 ns 657291 ns 0.98
dense(512, bias=true, act=gelu)(512 x 128)/forward/GPU/CUDA 46893.5 ns 47817 ns 0.98
dense(512, bias=true, act=gelu)(512 x 128)/zygote/CPU/2 thread(s) 1824209 ns 1822375 ns 1.00
dense(512, bias=true, act=gelu)(512 x 128)/zygote/CPU/4 thread(s) 1666417 ns 1719708 ns 0.97
dense(512, bias=true, act=gelu)(512 x 128)/zygote/CPU/8 thread(s) 1728208 ns 1665541 ns 1.04
dense(512, bias=true, act=gelu)(512 x 128)/zygote/CPU/1 thread(s) 2102708 ns 2108083 ns 1.00
dense(512, bias=true, act=gelu)(512 x 128)/zygote/GPU/CUDA 220656.5 ns 227850 ns 0.97
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 58500 ns 58458 ns 1.00
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 38584 ns 45083 ns 0.86
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 46208 ns 38041 ns 1.21
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 83042 ns 84958 ns 0.98
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/CUDA 28530.5 ns 28842 ns 0.99
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 2056084 ns 2030375 ns 1.01
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 2102729.5 ns 2084312.5 ns 1.01
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 2102270.5 ns 1787459 ns 1.18
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 1992792 ns 2014583.5 ns 0.99
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 189031.5 ns 192397.5 ns 0.98
Conv((3, 3), 4 => 4, gelu)(64 x 64 x 4 x 128)/forward/CPU/2 thread(s) 13396167 ns 13382625 ns 1.00
Conv((3, 3), 4 => 4, gelu)(64 x 64 x 4 x 128)/forward/CPU/4 thread(s) 12488625 ns 12433458.5 ns 1.00
Conv((3, 3), 4 => 4, gelu)(64 x 64 x 4 x 128)/forward/CPU/8 thread(s) 12567208 ns 12571375 ns 1.00
Conv((3, 3), 4 => 4, gelu)(64 x 64 x 4 x 128)/forward/CPU/1 thread(s) 14924083 ns 15143562.5 ns 0.99
Conv((3, 3), 4 => 4, gelu)(64 x 64 x 4 x 128)/forward/GPU/CUDA 512412.5 ns 514602 ns 1.00
Conv((3, 3), 4 => 4, gelu)(64 x 64 x 4 x 128)/zygote/CPU/2 thread(s) 47267416.5 ns 47546916 ns 0.99
Conv((3, 3), 4 => 4, gelu)(64 x 64 x 4 x 128)/zygote/CPU/4 thread(s) 42078000 ns 41875708 ns 1.00
Conv((3, 3), 4 => 4, gelu)(64 x 64 x 4 x 128)/zygote/CPU/8 thread(s) 40824125 ns 41161020.5 ns 0.99
Conv((3, 3), 4 => 4, gelu)(64 x 64 x 4 x 128)/zygote/CPU/1 thread(s) 58451854 ns 58396167 ns 1.00
Conv((3, 3), 4 => 4, gelu)(64 x 64 x 4 x 128)/zygote/GPU/CUDA 2895350 ns 3251545 ns 0.89
Conv((3, 3), 4 => 4, gelu)(64 x 64 x 4 x 128)/enzyme/CPU/2 thread(s) 74360062.5 ns 75047125 ns 0.99
Conv((3, 3), 4 => 4, gelu)(64 x 64 x 4 x 128)/enzyme/CPU/4 thread(s) 91413375 ns 67897459 ns 1.35
Conv((3, 3), 4 => 4, gelu)(64 x 64 x 4 x 128)/enzyme/CPU/8 thread(s) 90659959 ns 90940166.5 ns 1.00
Conv((3, 3), 4 => 4, gelu)(64 x 64 x 4 x 128)/enzyme/CPU/1 thread(s) 76716041 ns 99460667 ns 0.77
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 59208 ns 58750 ns 1.01
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 38833 ns 46875 ns 0.83
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 47125 ns 38333 ns 1.23
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 78625 ns 80334 ns 0.98
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/GPU/CUDA 48139.5 ns 46475 ns 1.04
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 1938145.5 ns 1921416 ns 1.01
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 1984167 ns 1976416 ns 1.00
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 1977812.5 ns 1721708.5 ns 1.15
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 1877083 ns 1905000 ns 0.99
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 195830.5 ns 190253.5 ns 1.03
batchnorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/2 thread(s) 292 ns 333 ns 0.88
batchnorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/4 thread(s) 375 ns 333 ns 1.13
batchnorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/8 thread(s) 375 ns 375 ns 1
batchnorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/1 thread(s) 333 ns 417 ns 0.80
batchnorm(2, act=identity, affine=false)(4 x 32)/forward/GPU/CUDA 32688 ns 31709.5 ns 1.03
batchnorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/2 thread(s) 6083 ns 6125 ns 0.99
batchnorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/4 thread(s) 6334 ns 6208 ns 1.02
batchnorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/8 thread(s) 6666 ns 6583 ns 1.01
batchnorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/1 thread(s) 6000 ns 6854.5 ns 0.88
batchnorm(2, act=identity, affine=false)(4 x 32)/zygote/GPU/CUDA 173538 ns 176344 ns 0.98
dense(2, bias=false, act=identity)(2 x 128)/forward/CPU/2 thread(s) 292 ns 250 ns 1.17
dense(2, bias=false, act=identity)(2 x 128)/forward/CPU/4 thread(s) 333 ns 291 ns 1.14
dense(2, bias=false, act=identity)(2 x 128)/forward/CPU/8 thread(s) 292 ns 333 ns 0.88
dense(2, bias=false, act=identity)(2 x 128)/forward/CPU/1 thread(s) 250 ns 292 ns 0.86
dense(2, bias=false, act=identity)(2 x 128)/forward/GPU/CUDA 32105 ns 31144 ns 1.03
dense(2, bias=false, act=identity)(2 x 128)/zygote/CPU/2 thread(s) 2584 ns 2625 ns 0.98
dense(2, bias=false, act=identity)(2 x 128)/zygote/CPU/4 thread(s) 2792 ns 2625 ns 1.06
dense(2, bias=false, act=identity)(2 x 128)/zygote/CPU/8 thread(s) 2791 ns 2833 ns 0.99
dense(2, bias=false, act=identity)(2 x 128)/zygote/CPU/1 thread(s) 2584 ns 2750 ns 0.94
dense(2, bias=false, act=identity)(2 x 128)/zygote/GPU/CUDA 160748.5 ns 164923.5 ns 0.97
Conv((3, 3), 64 => 64, gelu)(64 x 64 x 64 x 128)/forward/CPU/2 thread(s) 287049250 ns 285479083.5 ns 1.01
Conv((3, 3), 64 => 64, gelu)(64 x 64 x 64 x 128)/forward/CPU/4 thread(s) 347795687.5 ns 340672292 ns 1.02
Conv((3, 3), 64 => 64, gelu)(64 x 64 x 64 x 128)/forward/CPU/8 thread(s) 314367979.5 ns 320528833.5 ns 0.98
Conv((3, 3), 64 => 64, gelu)(64 x 64 x 64 x 128)/forward/CPU/1 thread(s) 271524458 ns 267627833 ns 1.01
Conv((3, 3), 64 => 64, gelu)(64 x 64 x 64 x 128)/forward/GPU/CUDA 7120410.5 ns 7061953.5 ns 1.01
Conv((3, 3), 64 => 64, gelu)(64 x 64 x 64 x 128)/zygote/CPU/2 thread(s) 1003307875 ns 1000752000 ns 1.00
Conv((3, 3), 64 => 64, gelu)(64 x 64 x 64 x 128)/zygote/CPU/4 thread(s) 964885125 ns 941508917 ns 1.02
Conv((3, 3), 64 => 64, gelu)(64 x 64 x 64 x 128)/zygote/CPU/8 thread(s) 835293000 ns 849741542 ns 0.98
Conv((3, 3), 64 => 64, gelu)(64 x 64 x 64 x 128)/zygote/CPU/1 thread(s) 1152976875 ns 1162624583 ns 0.99
Conv((3, 3), 64 => 64, gelu)(64 x 64 x 64 x 128)/zygote/GPU/CUDA 34058870 ns 33972568.5 ns 1.00
Conv((3, 3), 64 => 64, gelu)(64 x 64 x 64 x 128)/enzyme/CPU/2 thread(s) 1312833396 ns 1314224145.5 ns 1.00
Conv((3, 3), 64 => 64, gelu)(64 x 64 x 64 x 128)/enzyme/CPU/4 thread(s) 1706336084 ns 1312834041.5 ns 1.30
Conv((3, 3), 64 => 64, gelu)(64 x 64 x 64 x 128)/enzyme/CPU/8 thread(s) 1599191959 ns 1621294583 ns 0.99
Conv((3, 3), 64 => 64, gelu)(64 x 64 x 64 x 128)/enzyme/CPU/1 thread(s) 1309056604.5 ns 1681368042 ns 0.78
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 1408791 ns 1461562.5 ns 0.96
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 1452791.5 ns 1416958 ns 1.03
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 1449625 ns 1414750 ns 1.02
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 1407209 ns 1412375 ns 1.00
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/CUDA 128282.5 ns 127713.5 ns 1.00
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 5034917 ns 5020125 ns 1.00
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 5065916.5 ns 5027042 ns 1.01
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 5035937.5 ns 4740833 ns 1.06
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 5012729 ns 5044042 ns 0.99
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 483777.5 ns 510137 ns 0.95
vgg16(32, 32, 3, 32)/forward/CPU/2 thread(s) 171224875 ns 171071812.5 ns 1.00
vgg16(32, 32, 3, 32)/forward/CPU/4 thread(s) 167755167 ns 126739625 ns 1.32
vgg16(32, 32, 3, 32)/forward/CPU/8 thread(s) 128923708 ns 146147041 ns 0.88
vgg16(32, 32, 3, 32)/forward/CPU/1 thread(s) 154904187 ns 168329334 ns 0.92
vgg16(32, 32, 3, 32)/forward/GPU/CUDA 4889428.5 ns 4881506 ns 1.00
vgg16(32, 32, 3, 32)/zygote/CPU/2 thread(s) 621337542 ns 622612209 ns 1.00
vgg16(32, 32, 3, 32)/zygote/CPU/4 thread(s) 581831583 ns 538980667 ns 1.08
vgg16(32, 32, 3, 32)/zygote/CPU/8 thread(s) 460212833 ns 504257334 ns 0.91
vgg16(32, 32, 3, 32)/zygote/CPU/1 thread(s) 643084792 ns 656863250 ns 0.98
vgg16(32, 32, 3, 32)/zygote/GPU/CUDA 16318390 ns 16684647 ns 0.98
batchedmm(512, Bsize=32)/forward/CPU/2 thread(s) 8919875 ns 8964583 ns 1.00
batchedmm(512, Bsize=32)/forward/CPU/4 thread(s) 9050687.5 ns 8900333 ns 1.02
batchedmm(512, Bsize=32)/forward/CPU/8 thread(s) 7921583 ns 7993333 ns 0.99
batchedmm(512, Bsize=32)/forward/CPU/1 thread(s) 9747084 ns 9790312.5 ns 1.00
batchedmm(512, Bsize=32)/forward/GPU/CUDA 1600463.5 ns 1594468.5 ns 1.00
batchedmm(512, Bsize=32)/zygote/CPU/2 thread(s) 36566209 ns 36115750.5 ns 1.01
batchedmm(512, Bsize=32)/zygote/CPU/4 thread(s) 38511167 ns 36971083.5 ns 1.04
batchedmm(512, Bsize=32)/zygote/CPU/8 thread(s) 33595375 ns 34444208 ns 0.98
batchedmm(512, Bsize=32)/zygote/CPU/1 thread(s) 37796583 ns 37794834 ns 1.00
batchedmm(512, Bsize=32)/zygote/GPU/CUDA 6471792 ns 6465190.5 ns 1.00
bias_activation(32, act=tanh)(32 x 128)/forward/CPU/2 thread(s) 47291 ns 47292 ns 1.00
bias_activation(32, act=tanh)(32 x 128)/forward/CPU/4 thread(s) 47479.5 ns 47542 ns 1.00
bias_activation(32, act=tanh)(32 x 128)/forward/CPU/8 thread(s) 47729.5 ns 47584 ns 1.00
bias_activation(32, act=tanh)(32 x 128)/forward/CPU/1 thread(s) 47334 ns 47500 ns 1.00
bias_activation(32, act=tanh)(32 x 128)/forward/GPU/CUDA 18559 ns 18793 ns 0.99
bias_activation(32, act=tanh)(32 x 128)/zygote/CPU/2 thread(s) 50416 ns 50291.5 ns 1.00
bias_activation(32, act=tanh)(32 x 128)/zygote/CPU/4 thread(s) 50417 ns 50417 ns 1
bias_activation(32, act=tanh)(32 x 128)/zygote/CPU/8 thread(s) 50417 ns 50833 ns 0.99
bias_activation(32, act=tanh)(32 x 128)/zygote/CPU/1 thread(s) 50375 ns 50750 ns 0.99
bias_activation(32, act=tanh)(32 x 128)/zygote/GPU/CUDA 167009.5 ns 231220 ns 0.72
groupnorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/2 thread(s) 6459 ns 6291 ns 1.03
groupnorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/4 thread(s) 7770.5 ns 7084 ns 1.10
groupnorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/8 thread(s) 8041 ns 7792 ns 1.03
groupnorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/1 thread(s) 7000 ns 7542 ns 0.93
groupnorm(2, act=gelu, affine=true)(4 x 32)/forward/GPU/CUDA 76373.5 ns 106604.5 ns 0.72
groupnorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/2 thread(s) 10000 ns 10209 ns 0.98
groupnorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/4 thread(s) 10458 ns 9833 ns 1.06
groupnorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/8 thread(s) 10250 ns 10270.5 ns 1.00
groupnorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/1 thread(s) 10084 ns 10459 ns 0.96
groupnorm(2, act=gelu, affine=true)(4 x 32)/zygote/GPU/CUDA 456260 ns 619990 ns 0.74
groupnorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/2 thread(s) 5708 ns 5792 ns 0.99
groupnorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/4 thread(s) 6708 ns 6416 ns 1.05
groupnorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/8 thread(s) 7458 ns 7958 ns 0.94
groupnorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/1 thread(s) 5917 ns 6042 ns 0.98
groupnorm(2, act=relu, affine=false)(32 x 32)/forward/GPU/CUDA 91945.5 ns 121725 ns 0.76
groupnorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/2 thread(s) 12917 ns 13375 ns 0.97
groupnorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/4 thread(s) 13625 ns 13000 ns 1.05
groupnorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/8 thread(s) 13416 ns 13584 ns 0.99
groupnorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/1 thread(s) 13292 ns 13375 ns 0.99
groupnorm(2, act=relu, affine=false)(32 x 32)/zygote/GPU/CUDA 417439.5 ns 528027 ns 0.79
batchnorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/2 thread(s) 958 ns 1000 ns 0.96
batchnorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/4 thread(s) 1042 ns 959 ns 1.09
batchnorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/8 thread(s) 1083 ns 1083 ns 1
batchnorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/1 thread(s) 1042 ns 1125 ns 0.93
batchnorm(2, act=gelu, affine=false)(4 x 32)/forward/GPU/CUDA 32442 ns 31705 ns 1.02
batchnorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/2 thread(s) 7542 ns 7792 ns 0.97
batchnorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/4 thread(s) 7875 ns 7667 ns 1.03
batchnorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/8 thread(s) 8291 ns 8209 ns 1.01
batchnorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/1 thread(s) 7834 ns 8666 ns 0.90
batchnorm(2, act=gelu, affine=false)(4 x 32)/zygote/GPU/CUDA 192614 ns 204125.5 ns 0.94
bias_activation(32, act=gelu)(32 x 128)/forward/CPU/2 thread(s) 23250 ns 23000 ns 1.01
bias_activation(32, act=gelu)(32 x 128)/forward/CPU/4 thread(s) 23250 ns 23084 ns 1.01
bias_activation(32, act=gelu)(32 x 128)/forward/CPU/8 thread(s) 23416 ns 23584 ns 0.99
bias_activation(32, act=gelu)(32 x 128)/forward/CPU/1 thread(s) 23292 ns 23500 ns 0.99
bias_activation(32, act=gelu)(32 x 128)/forward/GPU/CUDA 18706.5 ns 18461 ns 1.01
bias_activation(32, act=gelu)(32 x 128)/zygote/CPU/2 thread(s) 52417 ns 52458 ns 1.00
bias_activation(32, act=gelu)(32 x 128)/zygote/CPU/4 thread(s) 52625 ns 52291 ns 1.01
bias_activation(32, act=gelu)(32 x 128)/zygote/CPU/8 thread(s) 52959 ns 52791 ns 1.00
bias_activation(32, act=gelu)(32 x 128)/zygote/CPU/1 thread(s) 52875 ns 52458 ns 1.01
bias_activation(32, act=gelu)(32 x 128)/zygote/GPU/CUDA 226057.5 ns 286087.5 ns 0.79
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 1403937.5 ns 1397209 ns 1.00
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 1409291.5 ns 1395917 ns 1.01
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 1405208 ns 1400209 ns 1.00
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 1402896 ns 1398500 ns 1.00
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/CUDA 196688.5 ns 195540.5 ns 1.01
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 5027625 ns 5008458.5 ns 1.00
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 5036500.5 ns 5018750 ns 1.00
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 5008875 ns 4722750 ns 1.06
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 5003083.5 ns 4703042 ns 1.06
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 565308 ns 626852.5 ns 0.90
Conv((3, 3), 4 => 4, relu)(64 x 64 x 4 x 128)/forward/CPU/2 thread(s) 3058166 ns 3063416 ns 1.00
Conv((3, 3), 4 => 4, relu)(64 x 64 x 4 x 128)/forward/CPU/4 thread(s) 2060229 ns 2063875 ns 1.00
Conv((3, 3), 4 => 4, relu)(64 x 64 x 4 x 128)/forward/CPU/8 thread(s) 2301833 ns 2311417 ns 1.00
Conv((3, 3), 4 => 4, relu)(64 x 64 x 4 x 128)/forward/CPU/1 thread(s) 4897625 ns 4823500 ns 1.02
Conv((3, 3), 4 => 4, relu)(64 x 64 x 4 x 128)/forward/GPU/CUDA 586278 ns 580360 ns 1.01
Conv((3, 3), 4 => 4, relu)(64 x 64 x 4 x 128)/zygote/CPU/2 thread(s) 24473708.5 ns 24332959 ns 1.01
Conv((3, 3), 4 => 4, relu)(64 x 64 x 4 x 128)/zygote/CPU/4 thread(s) 19098958 ns 18875458 ns 1.01
Conv((3, 3), 4 => 4, relu)(64 x 64 x 4 x 128)/zygote/CPU/8 thread(s) 18981042 ns 18989334 ns 1.00
Conv((3, 3), 4 => 4, relu)(64 x 64 x 4 x 128)/zygote/CPU/1 thread(s) 37019125 ns 36748479.5 ns 1.01
Conv((3, 3), 4 => 4, relu)(64 x 64 x 4 x 128)/zygote/GPU/CUDA 2831934 ns 3188758 ns 0.89
Conv((3, 3), 4 => 4, relu)(64 x 64 x 4 x 128)/enzyme/CPU/2 thread(s) 34098417 ns 34048562.5 ns 1.00
Conv((3, 3), 4 => 4, relu)(64 x 64 x 4 x 128)/enzyme/CPU/4 thread(s) 28724166.5 ns 28257854 ns 1.02
Conv((3, 3), 4 => 4, relu)(64 x 64 x 4 x 128)/enzyme/CPU/8 thread(s) 28239458 ns 28468541.5 ns 0.99
Conv((3, 3), 4 => 4, relu)(64 x 64 x 4 x 128)/enzyme/CPU/1 thread(s) 41378063 ns 41851021 ns 0.99
batchedmm(512, Bsize=512)/forward/CPU/2 thread(s) 146235958 ns 144123292 ns 1.01
batchedmm(512, Bsize=512)/forward/CPU/4 thread(s) 147965500 ns 147912291 ns 1.00
batchedmm(512, Bsize=512)/forward/CPU/8 thread(s) 127304667 ns 128219729 ns 0.99
batchedmm(512, Bsize=512)/forward/CPU/1 thread(s) 172673353.5 ns 175666645.5 ns 0.98
batchedmm(512, Bsize=512)/forward/GPU/CUDA 22564119 ns 22797470 ns 0.99
batchedmm(512, Bsize=512)/zygote/CPU/2 thread(s) 1235304437.5 ns 1274551333 ns 0.97
batchedmm(512, Bsize=512)/zygote/CPU/4 thread(s) 869077229.5 ns 1209986250 ns 0.72
batchedmm(512, Bsize=512)/zygote/CPU/8 thread(s) 769904041 ns 717258459 ns 1.07
batchedmm(512, Bsize=512)/zygote/CPU/1 thread(s) 666199333 ns 669341542 ns 1.00
batchedmm(512, Bsize=512)/zygote/GPU/CUDA 118146881 ns 118134658 ns 1.00
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 73812 ns 75042 ns 0.98
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 73875 ns 73833 ns 1.00
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 75687.5 ns 75813 ns 1.00
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 76416 ns 74125 ns 1.03
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/GPU/CUDA 208579 ns 248024.5 ns 0.84
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 295500 ns 202750 ns 1.46
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 193958 ns 283250 ns 0.68
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 287395.5 ns 194000 ns 1.48
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 282729 ns 189583 ns 1.49
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 1165959 ns 1272660.5 ns 0.92
batchedmm(512, Bsize=128)/forward/CPU/2 thread(s) 35776083 ns 35542000 ns 1.01
batchedmm(512, Bsize=128)/forward/CPU/4 thread(s) 36529041 ns 36428479 ns 1.00
batchedmm(512, Bsize=128)/forward/CPU/8 thread(s) 32581292 ns 32734792 ns 1.00
batchedmm(512, Bsize=128)/forward/CPU/1 thread(s) 40338396 ns 40941958 ns 0.99
batchedmm(512, Bsize=128)/forward/GPU/CUDA 5849817 ns 5852888 ns 1.00
batchedmm(512, Bsize=128)/zygote/CPU/2 thread(s) 148302541 ns 147574354 ns 1.00
batchedmm(512, Bsize=128)/zygote/CPU/4 thread(s) 158881084 ns 154842271 ns 1.03
batchedmm(512, Bsize=128)/zygote/CPU/8 thread(s) 138956354.5 ns 142249771 ns 0.98
batchedmm(512, Bsize=128)/zygote/CPU/1 thread(s) 284123584 ns 285430916 ns 1.00
batchedmm(512, Bsize=128)/zygote/GPU/CUDA 34596502 ns 34907859 ns 0.99
Conv((3, 3), 64 => 64, identity)(64 x 64 x 64 x 128)/forward/CPU/2 thread(s) 120211625 ns 119543458.5 ns 1.01
Conv((3, 3), 64 => 64, identity)(64 x 64 x 64 x 128)/forward/CPU/4 thread(s) 182136458 ns 173916625 ns 1.05
Conv((3, 3), 64 => 64, identity)(64 x 64 x 64 x 128)/forward/CPU/8 thread(s) 148062084 ns 155928584 ns 0.95
Conv((3, 3), 64 => 64, identity)(64 x 64 x 64 x 128)/forward/CPU/1 thread(s) 105814875 ns 103545938 ns 1.02
Conv((3, 3), 64 => 64, identity)(64 x 64 x 64 x 128)/forward/GPU/CUDA 5475710.5 ns 5470774 ns 1.00
Conv((3, 3), 64 => 64, identity)(64 x 64 x 64 x 128)/zygote/CPU/2 thread(s) 469150645.5 ns 471171395.5 ns 1.00
Conv((3, 3), 64 => 64, identity)(64 x 64 x 64 x 128)/zygote/CPU/4 thread(s) 486184250 ns 467366000 ns 1.04
Conv((3, 3), 64 => 64, identity)(64 x 64 x 64 x 128)/zygote/CPU/8 thread(s) 437949792 ns 456719729 ns 0.96
Conv((3, 3), 64 => 64, identity)(64 x 64 x 64 x 128)/zygote/CPU/1 thread(s) 739059333 ns 738831458 ns 1.00
Conv((3, 3), 64 => 64, identity)(64 x 64 x 64 x 128)/zygote/GPU/CUDA 32333012 ns 32277660 ns 1.00
Conv((3, 3), 64 => 64, identity)(64 x 64 x 64 x 128)/enzyme/CPU/2 thread(s) 712730687.5 ns 709159062 ns 1.01
Conv((3, 3), 64 => 64, identity)(64 x 64 x 64 x 128)/enzyme/CPU/4 thread(s) 678064125 ns 654555208.5 ns 1.04
Conv((3, 3), 64 => 64, identity)(64 x 64 x 64 x 128)/enzyme/CPU/8 thread(s) 570651646 ns 585803354.5 ns 0.97
Conv((3, 3), 64 => 64, identity)(64 x 64 x 64 x 128)/enzyme/CPU/1 thread(s) 732192500 ns 726547959 ns 1.01
mlp7layer_bn(relu)(32 x 256)/forward/CPU/2 thread(s) 1338854 ns 1242646 ns 1.08
mlp7layer_bn(relu)(32 x 256)/forward/CPU/4 thread(s) 764333 ns 968625.5 ns 0.79
mlp7layer_bn(relu)(32 x 256)/forward/CPU/8 thread(s) 971166 ns 674709 ns 1.44
mlp7layer_bn(relu)(32 x 256)/forward/CPU/1 thread(s) 2047291 ns 1941770.5 ns 1.05
mlp7layer_bn(relu)(32 x 256)/forward/GPU/CUDA 582645.5 ns 569058 ns 1.02
mlp7layer_bn(relu)(32 x 256)/zygote/CPU/2 thread(s) 2995792 ns 2969916 ns 1.01
mlp7layer_bn(relu)(32 x 256)/zygote/CPU/4 thread(s) 2516000 ns 2603708 ns 0.97
mlp7layer_bn(relu)(32 x 256)/zygote/CPU/8 thread(s) 2623541.5 ns 1985166.5 ns 1.32
mlp7layer_bn(relu)(32 x 256)/zygote/CPU/1 thread(s) 3683208 ns 3729625 ns 0.99
mlp7layer_bn(relu)(32 x 256)/zygote/GPU/CUDA 1752698 ns 1762089 ns 0.99
mlp7layer_bn(relu)(32 x 256)/enzyme/CPU/2 thread(s) 5821709 ns 5801458 ns 1.00
mlp7layer_bn(relu)(32 x 256)/enzyme/CPU/4 thread(s) 5892750 ns 5780958 ns 1.02
mlp7layer_bn(relu)(32 x 256)/enzyme/CPU/8 thread(s) 5806979 ns 5645834 ns 1.03
mlp7layer_bn(relu)(32 x 256)/enzyme/CPU/1 thread(s) 2887229 ns 2921042 ns 0.99
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 7500 ns 7250 ns 1.03
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 5333 ns 5958 ns 0.90
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 6042 ns 5333 ns 1.13
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 10041 ns 10083 ns 1.00
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/GPU/CUDA 25775 ns 25119 ns 1.03
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 225958.5 ns 215750 ns 1.05
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 220750 ns 258458 ns 0.85
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 220625 ns 221291.5 ns 1.00
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 206167 ns 207146 ns 1.00
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 259112 ns 264756 ns 0.98
vgg16(32, 32, 3, 64)/forward/CPU/2 thread(s) 308668791.5 ns 308377104 ns 1.00
vgg16(32, 32, 3, 64)/forward/CPU/4 thread(s) 282575646 ns 231656291 ns 1.22
vgg16(32, 32, 3, 64)/forward/CPU/8 thread(s) 199775042 ns 224042396 ns 0.89
vgg16(32, 32, 3, 64)/forward/CPU/1 thread(s) 309205458 ns 307881333 ns 1.00
vgg16(32, 32, 3, 64)/forward/GPU/CUDA 7688394 ns 7678620 ns 1.00
vgg16(32, 32, 3, 64)/zygote/CPU/2 thread(s) 1093080750 ns 1097604312.5 ns 1.00
vgg16(32, 32, 3, 64)/zygote/CPU/4 thread(s) 1075916375 ns 920148521 ns 1.17
vgg16(32, 32, 3, 64)/zygote/CPU/8 thread(s) 810723875 ns 858485833.5 ns 0.94
vgg16(32, 32, 3, 64)/zygote/CPU/1 thread(s) 1146255478.5 ns 1150798750 ns 1.00
vgg16(32, 32, 3, 64)/zygote/GPU/CUDA 26478179 ns 26497955 ns 1.00
groupnorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/2 thread(s) 5042 ns 4958.5 ns 1.02
groupnorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/4 thread(s) 6250 ns 5583 ns 1.12
groupnorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/8 thread(s) 6584 ns 6916.5 ns 0.95
groupnorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/1 thread(s) 5458 ns 5541 ns 0.99
groupnorm(2, act=relu, affine=false)(4 x 32)/forward/GPU/CUDA 170923.5 ns 171524 ns 1.00
groupnorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/2 thread(s) 7333 ns 7542 ns 0.97
groupnorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/4 thread(s) 7416 ns 6750 ns 1.10
groupnorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/8 thread(s) 7417 ns 7458 ns 0.99
groupnorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/1 thread(s) 7041 ns 7875 ns 0.89
groupnorm(2, act=relu, affine=false)(4 x 32)/zygote/GPU/CUDA 648059.5 ns 670577.5 ns 0.97
batchnorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/2 thread(s) 625 ns 541 ns 1.16
batchnorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/4 thread(s) 583 ns 541 ns 1.08
batchnorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/8 thread(s) 625 ns 625 ns 1
batchnorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/1 thread(s) 541 ns 625 ns 0.87
batchnorm(2, act=identity, affine=true)(32 x 32)/forward/GPU/CUDA 24468 ns 23778 ns 1.03
batchnorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/2 thread(s) 9333 ns 8708 ns 1.07
batchnorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/4 thread(s) 9000 ns 8541.5 ns 1.05
batchnorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/8 thread(s) 9729.5 ns 9458 ns 1.03
batchnorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/1 thread(s) 8792 ns 9541.5 ns 0.92
batchnorm(2, act=identity, affine=true)(32 x 32)/zygote/GPU/CUDA 223281 ns 233071 ns 0.96
bias_activation(512, act=gelu)(512 x 128)/forward/CPU/2 thread(s) 351708 ns 353250 ns 1.00
bias_activation(512, act=gelu)(512 x 128)/forward/CPU/4 thread(s) 352583 ns 353208 ns 1.00
bias_activation(512, act=gelu)(512 x 128)/forward/CPU/8 thread(s) 352708 ns 352667 ns 1.00
bias_activation(512, act=gelu)(512 x 128)/forward/CPU/1 thread(s) 351416.5 ns 352125 ns 1.00
bias_activation(512, act=gelu)(512 x 128)/forward/GPU/CUDA 21843 ns 21348 ns 1.02
bias_activation(512, act=gelu)(512 x 128)/zygote/CPU/2 thread(s) 811563 ns 822333 ns 0.99
bias_activation(512, act=gelu)(512 x 128)/zygote/CPU/4 thread(s) 793583.5 ns 774854 ns 1.02
bias_activation(512, act=gelu)(512 x 128)/zygote/CPU/8 thread(s) 812375 ns 777042 ns 1.05
bias_activation(512, act=gelu)(512 x 128)/zygote/CPU/1 thread(s) 804291 ns 825999.5 ns 0.97
bias_activation(512, act=gelu)(512 x 128)/zygote/GPU/CUDA 279114.5 ns 286748 ns 0.97
batchedmm(16, Bsize=32)/forward/CPU/2 thread(s) 338875 ns 336833 ns 1.01
batchedmm(16, Bsize=32)/forward/CPU/4 thread(s) 321459 ns 335917 ns 0.96
batchedmm(16, Bsize=32)/forward/CPU/8 thread(s) 450271 ns 445708 ns 1.01
batchedmm(16, Bsize=32)/forward/CPU/1 thread(s) 10750 ns 10917 ns 0.98
batchedmm(16, Bsize=32)/forward/GPU/CUDA 18538 ns 17559 ns 1.06
batchedmm(16, Bsize=32)/zygote/CPU/2 thread(s) 712021 ns 713499.5 ns 1.00
batchedmm(16, Bsize=32)/zygote/CPU/4 thread(s) 730333 ns 730834 ns 1.00
batchedmm(16, Bsize=32)/zygote/CPU/8 thread(s) 1002270.5 ns 1027167 ns 0.98
batchedmm(16, Bsize=32)/zygote/CPU/1 thread(s) 26708 ns 26500 ns 1.01
batchedmm(16, Bsize=32)/zygote/GPU/CUDA 261073.5 ns 260521.5 ns 1.00
batchedmm(16, Bsize=128)/forward/CPU/2 thread(s) 381875 ns 371375 ns 1.03
batchedmm(16, Bsize=128)/forward/CPU/4 thread(s) 326167 ns 346250 ns 0.94
batchedmm(16, Bsize=128)/forward/CPU/8 thread(s) 443625 ns 445812.5 ns 1.00
batchedmm(16, Bsize=128)/forward/CPU/1 thread(s) 30417 ns 30479 ns 1.00
batchedmm(16, Bsize=128)/forward/GPU/CUDA 23393 ns 22136 ns 1.06
batchedmm(16, Bsize=128)/zygote/CPU/2 thread(s) 731937.5 ns 734062.5 ns 1.00
batchedmm(16, Bsize=128)/zygote/CPU/4 thread(s) 784187.5 ns 773750.5 ns 1.01
batchedmm(16, Bsize=128)/zygote/CPU/8 thread(s) 1027875 ns 1061729 ns 0.97
batchedmm(16, Bsize=128)/zygote/CPU/1 thread(s) 89584 ns 98521 ns 0.91
batchedmm(16, Bsize=128)/zygote/GPU/CUDA 220484 ns 220018.5 ns 1.00
bias_activation(2, act=tanh)(2 x 128)/forward/CPU/2 thread(s) 3375 ns 3375 ns 1
bias_activation(2, act=tanh)(2 x 128)/forward/CPU/4 thread(s) 3708 ns 3542 ns 1.05
bias_activation(2, act=tanh)(2 x 128)/forward/CPU/8 thread(s) 3833 ns 3687.5 ns 1.04
bias_activation(2, act=tanh)(2 x 128)/forward/CPU/1 thread(s) 3458 ns 3583 ns 0.97
bias_activation(2, act=tanh)(2 x 128)/forward/GPU/CUDA 17892 ns 17780 ns 1.01
bias_activation(2, act=tanh)(2 x 128)/zygote/CPU/2 thread(s) 4292 ns 4125 ns 1.04
bias_activation(2, act=tanh)(2 x 128)/zygote/CPU/4 thread(s) 4250 ns 4167 ns 1.02
bias_activation(2, act=tanh)(2 x 128)/zygote/CPU/8 thread(s) 4333 ns 4375 ns 0.99
bias_activation(2, act=tanh)(2 x 128)/zygote/CPU/1 thread(s) 4417 ns 4500 ns 0.98
bias_activation(2, act=tanh)(2 x 128)/zygote/GPU/CUDA 288266.5 ns 258504 ns 1.12
layernorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/2 thread(s) 4083 ns 3750 ns 1.09
layernorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/4 thread(s) 4062.5 ns 3500 ns 1.16
layernorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/8 thread(s) 4334 ns 4917 ns 0.88
layernorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/1 thread(s) 3833 ns 4083 ns 0.94
layernorm(2, act=identity, affine=true)(4 x 32)/forward/GPU/CUDA 243078.5 ns 200777 ns 1.21
layernorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/2 thread(s) 8417 ns 8417 ns 1
layernorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/4 thread(s) 8208 ns 8000 ns 1.03
layernorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/8 thread(s) 8583 ns 8625 ns 1.00
layernorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/1 thread(s) 8500 ns 8604.5 ns 0.99
layernorm(2, act=identity, affine=true)(4 x 32)/zygote/GPU/CUDA 1294141 ns 1183716 ns 1.09
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 203583 ns 205708 ns 0.99
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 209750 ns 210125 ns 1.00
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 209750 ns 210375 ns 1.00
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 199542 ns 200375 ns 1.00
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/CUDA 35748 ns 34375 ns 1.04
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 610959 ns 650916 ns 0.94
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 629979 ns 666959 ns 0.94
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 632042 ns 624167 ns 1.01
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 624312.5 ns 632458 ns 0.99
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 366873 ns 343648 ns 1.07
batchedmm(128, Bsize=128)/forward/CPU/2 thread(s) 1020270.5 ns 1000479 ns 1.02
batchedmm(128, Bsize=128)/forward/CPU/4 thread(s) 1019375 ns 1007958 ns 1.01
batchedmm(128, Bsize=128)/forward/CPU/8 thread(s) 956541 ns 974396 ns 0.98
batchedmm(128, Bsize=128)/forward/CPU/1 thread(s) 862917 ns 894770.5 ns 0.96
batchedmm(128, Bsize=128)/forward/GPU/CUDA 208035 ns 207021.5 ns 1.00
batchedmm(128, Bsize=128)/zygote/CPU/2 thread(s) 4555583 ns 4512146 ns 1.01
batchedmm(128, Bsize=128)/zygote/CPU/4 thread(s) 4847250 ns 4708729.5 ns 1.03
batchedmm(128, Bsize=128)/zygote/CPU/8 thread(s) 4461541 ns 4609875 ns 0.97
batchedmm(128, Bsize=128)/zygote/CPU/1 thread(s) 5174375 ns 5171208.5 ns 1.00
batchedmm(128, Bsize=128)/zygote/GPU/CUDA 927061 ns 947853.5 ns 0.98
layernorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/2 thread(s) 4042 ns 3333 ns 1.21
layernorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/4 thread(s) 3500 ns 3083 ns 1.14
layernorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/8 thread(s) 4250 ns 4333 ns 0.98
layernorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/1 thread(s) 3375 ns 3917 ns 0.86
layernorm(2, act=identity, affine=false)(4 x 32)/forward/GPU/CUDA 241039.5 ns 218377.5 ns 1.10
layernorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/2 thread(s) 7500 ns 7375 ns 1.02
layernorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/4 thread(s) 7062.5 ns 6833 ns 1.03
layernorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/8 thread(s) 7333 ns 7458 ns 0.98
layernorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/1 thread(s) 6916 ns 7459 ns 0.93
layernorm(2, act=identity, affine=false)(4 x 32)/zygote/GPU/CUDA 1063926.5 ns 1012916 ns 1.05
Conv((3, 3), 2 => 2, identity)(64 x 64 x 2 x 128)/forward/CPU/2 thread(s) 1524958 ns 1641584 ns 0.93
Conv((3, 3), 2 => 2, identity)(64 x 64 x 2 x 128)/forward/CPU/4 thread(s) 1178854.5 ns 1193979 ns 0.99
Conv((3, 3), 2 => 2, identity)(64 x 64 x 2 x 128)/forward/CPU/8 thread(s) 1368709 ns 1342687.5 ns 1.02
Conv((3, 3), 2 => 2, identity)(64 x 64 x 2 x 128)/forward/CPU/1 thread(s) 2362167 ns 2486625.5 ns 0.95
Conv((3, 3), 2 => 2, identity)(64 x 64 x 2 x 128)/forward/GPU/CUDA 218600.5 ns 214048 ns 1.02
Conv((3, 3), 2 => 2, identity)(64 x 64 x 2 x 128)/zygote/CPU/2 thread(s) 12347875 ns 12366291.5 ns 1.00
Conv((3, 3), 2 => 2, identity)(64 x 64 x 2 x 128)/zygote/CPU/4 thread(s) 9603708 ns 9556958 ns 1.00
Conv((3, 3), 2 => 2, identity)(64 x 64 x 2 x 128)/zygote/CPU/8 thread(s) 9285208.5 ns 9332500 ns 0.99
Conv((3, 3), 2 => 2, identity)(64 x 64 x 2 x 128)/zygote/CPU/1 thread(s) 17994500 ns 18065166.5 ns 1.00
Conv((3, 3), 2 => 2, identity)(64 x 64 x 2 x 128)/zygote/GPU/CUDA 1959865.5 ns 1946882 ns 1.01
Conv((3, 3), 2 => 2, identity)(64 x 64 x 2 x 128)/enzyme/CPU/2 thread(s) 17343125 ns 17346750 ns 1.00
Conv((3, 3), 2 => 2, identity)(64 x 64 x 2 x 128)/enzyme/CPU/4 thread(s) 14424146 ns 14347000 ns 1.01
Conv((3, 3), 2 => 2, identity)(64 x 64 x 2 x 128)/enzyme/CPU/8 thread(s) 14365583 ns 14486917 ns 0.99
Conv((3, 3), 2 => 2, identity)(64 x 64 x 2 x 128)/enzyme/CPU/1 thread(s) 21176708 ns 21148167 ns 1.00
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 90520.5 ns 134750 ns 0.67
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 90208 ns 88584 ns 1.02
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 94500 ns 92042 ns 1.03
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 133292 ns 89042 ns 1.50
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/CUDA 126385 ns 126624 ns 1.00
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 2059229.5 ns 2031958 ns 1.01
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 2014083.5 ns 2023083.5 ns 1.00
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 2030292 ns 1756000 ns 1.16
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 2020416.5 ns 2029583 ns 1.00
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 1061374.5 ns 1029084 ns 1.03
batchedmm(2, Bsize=4)/forward/CPU/2 thread(s) 2375 ns 1750 ns 1.36
batchedmm(2, Bsize=4)/forward/CPU/4 thread(s) 1834 ns 2833 ns 0.65
batchedmm(2, Bsize=4)/forward/CPU/8 thread(s) 3542 ns 2458 ns 1.44
batchedmm(2, Bsize=4)/forward/CPU/1 thread(s) 2167 ns 2166.5 ns 1.00
batchedmm(2, Bsize=4)/forward/GPU/CUDA 16672 ns 16055 ns 1.04
batchedmm(2, Bsize=4)/zygote/CPU/2 thread(s) 2541 ns 2583 ns 0.98
batchedmm(2, Bsize=4)/zygote/CPU/4 thread(s) 2917 ns 2500 ns 1.17
batchedmm(2, Bsize=4)/zygote/CPU/8 thread(s) 2750 ns 2750 ns 1
batchedmm(2, Bsize=4)/zygote/CPU/1 thread(s) 2792 ns 2750 ns 1.02
batchedmm(2, Bsize=4)/zygote/GPU/CUDA 197485.5 ns 191618 ns 1.03
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 7333 ns 7416 ns 0.99
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 5416 ns 5917 ns 0.92
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 5958 ns 5125 ns 1.16
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 9916 ns 10166 ns 0.98
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/GPU/CUDA 34400.5 ns 33917 ns 1.01
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 213812.5 ns 226396.5 ns 0.94
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 221000 ns 222521 ns 0.99
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 231917 ns 221584 ns 1.05
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 208604 ns 207458 ns 1.01
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 352524 ns 311723.5 ns 1.13
dense(32, bias=true, act=identity)(32 x 128)/forward/CPU/2 thread(s) 3708 ns 3708 ns 1
dense(32, bias=true, act=identity)(32 x 128)/forward/CPU/4 thread(s) 3750 ns 3667 ns 1.02
dense(32, bias=true, act=identity)(32 x 128)/forward/CPU/8 thread(s) 3708 ns 3750 ns 0.99
dense(32, bias=true, act=identity)(32 x 128)/forward/CPU/1 thread(s) 3708 ns 3667 ns 1.01
dense(32, bias=true, act=identity)(32 x 128)/forward/GPU/CUDA 22677 ns 22860 ns 0.99
dense(32, bias=true, act=identity)(32 x 128)/zygote/CPU/2 thread(s) 14416 ns 14458 ns 1.00
dense(32, bias=true, act=identity)(32 x 128)/zygote/CPU/4 thread(s) 14125 ns 14291 ns 0.99
dense(32, bias=true, act=identity)(32 x 128)/zygote/CPU/8 thread(s) 14500 ns 14250 ns 1.02
dense(32, bias=true, act=identity)(32 x 128)/zygote/CPU/1 thread(s) 14417 ns 14667 ns 0.98
dense(32, bias=true, act=identity)(32 x 128)/zygote/GPU/CUDA 511650.5 ns 472859.5 ns 1.08
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 93854 ns 137417 ns 0.68
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 97145.5 ns 96458.5 ns 1.01
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 98417 ns 95833 ns 1.03
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 140083 ns 93125 ns 1.50
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/GPU/CUDA 125784 ns 125940 ns 1.00
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 1964729 ns 1921458.5 ns 1.02
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 1938562.5 ns 1918166.5 ns 1.01
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 1927041.5 ns 1817687.5 ns 1.06
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 1920667 ns 1914458 ns 1.00
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 1039090 ns 951464 ns 1.09
lenet(28, 28, 1, 32)/forward/CPU/2 thread(s) 877500 ns 869042 ns 1.01
lenet(28, 28, 1, 32)/forward/CPU/4 thread(s) 800812.5 ns 815167 ns 0.98
lenet(28, 28, 1, 32)/forward/CPU/8 thread(s) 1223937 ns 1175833 ns 1.04
lenet(28, 28, 1, 32)/forward/CPU/1 thread(s) 969958 ns 967562.5 ns 1.00
lenet(28, 28, 1, 32)/forward/GPU/CUDA 285567 ns 276671 ns 1.03
lenet(28, 28, 1, 32)/zygote/CPU/2 thread(s) 2803854 ns 2830583 ns 0.99
lenet(28, 28, 1, 32)/zygote/CPU/4 thread(s) 2511750 ns 2508062.5 ns 1.00
lenet(28, 28, 1, 32)/zygote/CPU/8 thread(s) 3356541.5 ns 3332875 ns 1.01
lenet(28, 28, 1, 32)/zygote/CPU/1 thread(s) 3428708 ns 3328000 ns 1.03
lenet(28, 28, 1, 32)/zygote/GPU/CUDA 1675606 ns 1576106.5 ns 1.06
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 15958 ns 16000 ns 1.00
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 16562.5 ns 15625 ns 1.06
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 17041.5 ns 16458 ns 1.04
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 17375 ns 16417 ns 1.06
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/CUDA 145484 ns 143900.5 ns 1.01
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 223104 ns 255875.5 ns 0.87
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 222896 ns 254271 ns 0.88
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 226708 ns 216250 ns 1.05
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 253167 ns 258021 ns 0.98
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 664599 ns 637843.5 ns 1.04
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 221146 ns 220792 ns 1.00
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 221500 ns 220667 ns 1.00
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 221666.5 ns 221208 ns 1.00
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 221042 ns 222208.5 ns 0.99
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/CUDA 276464 ns 270997 ns 1.02
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 551791 ns 504458 ns 1.09
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 505375 ns 507416.5 ns 1.00
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 509750 ns 499833.5 ns 1.02
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 508666.5 ns 498875.5 ns 1.02
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 1493627 ns 1304306.5 ns 1.15
batchedmm(16, Bsize=4)/forward/CPU/2 thread(s) 4000 ns 3459 ns 1.16
batchedmm(16, Bsize=4)/forward/CPU/4 thread(s) 4104.5 ns 3854.5 ns 1.06
batchedmm(16, Bsize=4)/forward/CPU/8 thread(s) 4667 ns 5375 ns 0.87
batchedmm(16, Bsize=4)/forward/CPU/1 thread(s) 4042 ns 4042 ns 1
batchedmm(16, Bsize=4)/forward/GPU/CUDA 17326 ns 16660 ns 1.04
batchedmm(16, Bsize=4)/zygote/CPU/2 thread(s) 7042 ns 7166 ns 0.98
batchedmm(16, Bsize=4)/zygote/CPU/4 thread(s) 7417 ns 6458 ns 1.15
batchedmm(16, Bsize=4)/zygote/CPU/8 thread(s) 7250 ns 7209 ns 1.01
batchedmm(16, Bsize=4)/zygote/CPU/1 thread(s) 7458 ns 7541.5 ns 0.99
batchedmm(16, Bsize=4)/zygote/GPU/CUDA 198652.5 ns 194930.5 ns 1.02
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 17875 ns 17666 ns 1.01
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 18333 ns 17125 ns 1.07
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 19750 ns 19729 ns 1.00
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 17146 ns 18000 ns 0.95
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/GPU/CUDA 230076 ns 146357.5 ns 1.57
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 219250 ns 244562 ns 0.90
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 216020.5 ns 237417 ns 0.91
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 212500 ns 214500 ns 0.99
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 212479.5 ns 225208 ns 0.94
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 1050719 ns 894981 ns 1.17
layernorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/2 thread(s) 4500 ns 4416 ns 1.02
layernorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/4 thread(s) 4583 ns 3917 ns 1.17
layernorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/8 thread(s) 4667 ns 5334 ns 0.87
layernorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/1 thread(s) 4583 ns 4833 ns 0.95
layernorm(2, act=identity, affine=false)(32 x 32)/forward/GPU/CUDA 252077 ns 187684 ns 1.34
layernorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/2 thread(s) 10833 ns 10500 ns 1.03
layernorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/4 thread(s) 10500 ns 9708 ns 1.08
layernorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/8 thread(s) 10250 ns 11167 ns 0.92
layernorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/1 thread(s) 10250 ns 11250 ns 0.91
layernorm(2, act=identity, affine=false)(32 x 32)/zygote/GPU/CUDA 1102570 ns 1024651 ns 1.08
layernorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/2 thread(s) 3312.5 ns 3209 ns 1.03
layernorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/4 thread(s) 3708 ns 3250 ns 1.14
layernorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/8 thread(s) 3959 ns 4687.5 ns 0.84
layernorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/1 thread(s) 3125 ns 3791 ns 0.82
layernorm(2, act=relu, affine=false)(4 x 32)/forward/GPU/CUDA 243703 ns 218725.5 ns 1.11
layernorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/2 thread(s) 7229.5 ns 7833 ns 0.92
layernorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/4 thread(s) 7333 ns 7291 ns 1.01
layernorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/8 thread(s) 7417 ns 7625 ns 0.97
layernorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/1 thread(s) 7209 ns 7917 ns 0.91
layernorm(2, act=relu, affine=false)(4 x 32)/zygote/GPU/CUDA 1111590.5 ns 1043721.5 ns 1.07
Conv((3, 3), 32 => 32, identity)(64 x 64 x 32 x 128)/forward/CPU/2 thread(s) 23487541.5 ns 23437104.5 ns 1.00
Conv((3, 3), 32 => 32, identity)(64 x 64 x 32 x 128)/forward/CPU/4 thread(s) 43971125 ns 35045979.5 ns 1.25
Conv((3, 3), 32 => 32, identity)(64 x 64 x 32 x 128)/forward/CPU/8 thread(s) 37463166.5 ns 41490500 ns 0.90
Conv((3, 3), 32 => 32, identity)(64 x 64 x 32 x 128)/forward/CPU/1 thread(s) 34877416 ns 34913479 ns 1.00
Conv((3, 3), 32 => 32, identity)(64 x 64 x 32 x 128)/forward/GPU/CUDA 1842834.5 ns 2126334.5 ns 0.87
Conv((3, 3), 32 => 32, identity)(64 x 64 x 32 x 128)/zygote/CPU/2 thread(s) 184200958 ns 184798459 ns 1.00
Conv((3, 3), 32 => 32, identity)(64 x 64 x 32 x 128)/zygote/CPU/4 thread(s) 173422437.5 ns 159330000 ns 1.09
Conv((3, 3), 32 => 32, identity)(64 x 64 x 32 x 128)/zygote/CPU/8 thread(s) 146460271 ns 151477459 ns 0.97
Conv((3, 3), 32 => 32, identity)(64 x 64 x 32 x 128)/zygote/CPU/1 thread(s) 410950833 ns 411547250 ns 1.00
Conv((3, 3), 32 => 32, identity)(64 x 64 x 32 x 128)/zygote/GPU/CUDA 16526176 ns 16524151 ns 1.00
Conv((3, 3), 32 => 32, identity)(64 x 64 x 32 x 128)/enzyme/CPU/2 thread(s) 425975000 ns 427197208 ns 1.00
Conv((3, 3), 32 => 32, identity)(64 x 64 x 32 x 128)/enzyme/CPU/4 thread(s) 259298209 ns 252723645.5 ns 1.03
Conv((3, 3), 32 => 32, identity)(64 x 64 x 32 x 128)/enzyme/CPU/8 thread(s) 296349208.5 ns 305721250 ns 0.97
Conv((3, 3), 32 => 32, identity)(64 x 64 x 32 x 128)/enzyme/CPU/1 thread(s) 479307000 ns 481095166 ns 1.00
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 183167 ns 182854.5 ns 1.00
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 183917 ns 182791.5 ns 1.01
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 185291.5 ns 185292 ns 1.00
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 183708.5 ns 185750 ns 0.99
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/CUDA 232992 ns 173677.5 ns 1.34
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 588709 ns 629833 ns 0.93
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 595709 ns 631375 ns 0.94
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 596042 ns 590542 ns 1.01
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 597500 ns 630770.5 ns 0.95
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 1113560 ns 1010062 ns 1.10
batchedmm(128, Bsize=512)/forward/CPU/2 thread(s) 4043292 ns 3848041.5 ns 1.05
batchedmm(128, Bsize=512)/forward/CPU/4 thread(s) 4012396 ns 4009000 ns 1.00
batchedmm(128, Bsize=512)/forward/CPU/8 thread(s) 3557000 ns 3525583 ns 1.01
batchedmm(128, Bsize=512)/forward/CPU/1 thread(s) 4569124.5 ns 4614917 ns 0.99
batchedmm(128, Bsize=512)/forward/GPU/CUDA 531536 ns 536882 ns 0.99
batchedmm(128, Bsize=512)/zygote/CPU/2 thread(s) 17494562.5 ns 17371917 ns 1.01
batchedmm(128, Bsize=512)/zygote/CPU/4 thread(s) 18560917 ns 17740624.5 ns 1.05
batchedmm(128, Bsize=512)/zygote/CPU/8 thread(s) 16622646 ns 16856312.5 ns 0.99
batchedmm(128, Bsize=512)/zygote/CPU/1 thread(s) 20213416.5 ns 20403334 ns 0.99
batchedmm(128, Bsize=512)/zygote/GPU/CUDA 2619803.5 ns 2613028 ns 1.00
batchnorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/2 thread(s) 625 ns 500 ns 1.25
batchnorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/4 thread(s) 625 ns 500 ns 1.25
batchnorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/8 thread(s) 584 ns 625 ns 0.93
batchnorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/1 thread(s) 542 ns 667 ns 0.81
batchnorm(2, act=relu, affine=false)(32 x 32)/forward/GPU/CUDA 32024.5 ns 31917 ns 1.00
batchnorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/2 thread(s) 9334 ns 9334 ns 1
batchnorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/4 thread(s) 9291 ns 8708 ns 1.07
batchnorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/8 thread(s) 9666.5 ns 9875 ns 0.98
batchnorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/1 thread(s) 9000 ns 9417 ns 0.96
batchnorm(2, act=relu, affine=false)(32 x 32)/zygote/GPU/CUDA 264542.5 ns 260614 ns 1.02
vgg16(32, 32, 3, 128)/forward/CPU/2 thread(s) 496971791 ns 503086958 ns 0.99
vgg16(32, 32, 3, 128)/forward/CPU/4 thread(s) 509285541 ns 424620083.5 ns 1.20
vgg16(32, 32, 3, 128)/forward/CPU/8 thread(s) 421912146 ns 462339520.5 ns 0.91
vgg16(32, 32, 3, 128)/forward/CPU/1 thread(s) 672227417 ns 673052062 ns 1.00
vgg16(32, 32, 3, 128)/forward/GPU/CUDA 12489793.5 ns 12478664.5 ns 1.00
vgg16(32, 32, 3, 128)/zygote/CPU/2 thread(s) 1883911021 ns 1872018104.5 ns 1.01
vgg16(32, 32, 3, 128)/zygote/CPU/4 thread(s) 1668824291 ns 1625413500 ns 1.03
vgg16(32, 32, 3, 128)/zygote/CPU/8 thread(s) 1489797958.5 ns 1546440125 ns 0.96
vgg16(32, 32, 3, 128)/zygote/CPU/1 thread(s) 2201017208.5 ns 2200566458.5 ns 1.00
vgg16(32, 32, 3, 128)/zygote/GPU/CUDA 49197806.5 ns 49139909 ns 1.00
Conv((3, 3), 2 => 2, relu)(64 x 64 x 2 x 128)/forward/CPU/2 thread(s) 1600645.5 ns 1647791.5 ns 0.97
Conv((3, 3), 2 => 2, relu)(64 x 64 x 2 x 128)/forward/CPU/4 thread(s) 1172708 ns 1202542 ns 0.98
Conv((3, 3), 2 => 2, relu)(64 x 64 x 2 x 128)/forward/CPU/8 thread(s) 1388125 ns 1365999.5 ns 1.02
Conv((3, 3), 2 => 2, relu)(64 x 64 x 2 x 128)/forward/CPU/1 thread(s) 2344958.5 ns 2393042 ns 0.98
Conv((3, 3), 2 => 2, relu)(64 x 64 x 2 x 128)/forward/GPU/CUDA 218458 ns 215162 ns 1.02
Conv((3, 3), 2 => 2, relu)(64 x 64 x 2 x 128)/zygote/CPU/2 thread(s) 12685750 ns 12703083.5 ns 1.00
Conv((3, 3), 2 => 2, relu)(64 x 64 x 2 x 128)/zygote/CPU/4 thread(s) 9976000 ns 9880000 ns 1.01
Conv((3, 3), 2 => 2, relu)(64 x 64 x 2 x 128)/zygote/CPU/8 thread(s) 9656709 ns 9761146 ns 0.99
Conv((3, 3), 2 => 2, relu)(64 x 64 x 2 x 128)/zygote/CPU/1 thread(s) 18427396 ns 18559417 ns 0.99
Conv((3, 3), 2 => 2, relu)(64 x 64 x 2 x 128)/zygote/GPU/CUDA 2044469 ns 2005712 ns 1.02
Conv((3, 3), 2 => 2, relu)(64 x 64 x 2 x 128)/enzyme/CPU/2 thread(s) 17712834 ns 17693854 ns 1.00
Conv((3, 3), 2 => 2, relu)(64 x 64 x 2 x 128)/enzyme/CPU/4 thread(s) 14779375 ns 14669187.5 ns 1.01
Conv((3, 3), 2 => 2, relu)(64 x 64 x 2 x 128)/enzyme/CPU/8 thread(s) 14604916 ns 14767500 ns 0.99
Conv((3, 3), 2 => 2, relu)(64 x 64 x 2 x 128)/enzyme/CPU/1 thread(s) 21383042 ns 21469542 ns 1.00
dense(32, bias=false, act=gelu)(32 x 128)/forward/CPU/2 thread(s) 26250 ns 26250 ns 1
dense(32, bias=false, act=gelu)(32 x 128)/forward/CPU/4 thread(s) 26250 ns 26208 ns 1.00
dense(32, bias=false, act=gelu)(32 x 128)/forward/CPU/8 thread(s) 26208 ns 26292 ns 1.00
dense(32, bias=false, act=gelu)(32 x 128)/forward/CPU/1 thread(s) 26208 ns 26292 ns 1.00
dense(32, bias=false, act=gelu)(32 x 128)/forward/GPU/CUDA 24118 ns 23799 ns 1.01
dense(32, bias=false, act=gelu)(32 x 128)/zygote/CPU/2 thread(s) 67000 ns 66666 ns 1.01
dense(32, bias=false, act=gelu)(32 x 128)/zygote/CPU/4 thread(s) 66833 ns 66750 ns 1.00
dense(32, bias=false, act=gelu)(32 x 128)/zygote/CPU/8 thread(s) 67500 ns 67209 ns 1.00
dense(32, bias=false, act=gelu)(32 x 128)/zygote/CPU/1 thread(s) 66834 ns 67500 ns 0.99
dense(32, bias=false, act=gelu)(32 x 128)/zygote/GPU/CUDA 410737.5 ns 380551.5 ns 1.08
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 203917 ns 203917 ns 1
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 208625 ns 209750 ns 0.99
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 209084 ns 210000 ns 1.00
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 199500 ns 199958 ns 1.00
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/CUDA 27195 ns 25800 ns 1.05
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 625958.5 ns 648229.5 ns 0.97
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 629916 ns 661271 ns 0.95
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 632125 ns 622750 ns 1.02
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 600062.5 ns 586375 ns 1.02
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 358637.5 ns 308724.5 ns 1.16
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 658417 ns 600291 ns 1.10
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 641625 ns 594125 ns 1.08
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 647542 ns 544666 ns 1.19
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 666291.5 ns 652208 ns 1.02
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/GPU/CUDA 132681.5 ns 131751 ns 1.01
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 2274708 ns 2235000 ns 1.02
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 2300125 ns 2235625 ns 1.03
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 2238125 ns 2300854 ns 0.97
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 2241291 ns 2253125 ns 0.99
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 1242340 ns 1127758 ns 1.10
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 18020.5 ns 17541 ns 1.03
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 18292 ns 16958 ns 1.08
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 20250 ns 19917 ns 1.02
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 17500 ns 17958 ns 0.97
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/CUDA 146876.5 ns 145385 ns 1.01
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 231458 ns 261583 ns 0.88
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 227333.5 ns 260812.5 ns 0.87
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 227500 ns 220937.5 ns 1.03
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 229792 ns 230896 ns 1.00
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 1067171 ns 982925 ns 1.09
batchnorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/2 thread(s) 667 ns 542 ns 1.23
batchnorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/4 thread(s) 625 ns 542 ns 1.15
batchnorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/8 thread(s) 667 ns 625 ns 1.07
batchnorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/1 thread(s) 542 ns 667 ns 0.81
batchnorm(2, act=relu, affine=true)(32 x 32)/forward/GPU/CUDA 23878 ns 23015 ns 1.04
batchnorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/2 thread(s) 9833 ns 9479.5 ns 1.04
batchnorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/4 thread(s) 9875 ns 9042 ns 1.09
batchnorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/8 thread(s) 10000 ns 10292 ns 0.97
batchnorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/1 thread(s) 9541 ns 9625 ns 0.99
batchnorm(2, act=relu, affine=true)(32 x 32)/zygote/GPU/CUDA 263281 ns 257388 ns 1.02
groupnorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/2 thread(s) 5687.5 ns 5458 ns 1.04
groupnorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/4 thread(s) 6208 ns 5417 ns 1.15
groupnorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/8 thread(s) 7125 ns 6625 ns 1.08
groupnorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/1 thread(s) 5417 ns 6083 ns 0.89
groupnorm(2, act=identity, affine=false)(4 x 32)/forward/GPU/CUDA 235834 ns 233603.5 ns 1.01
groupnorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/2 thread(s) 7250 ns 7083 ns 1.02
groupnorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/4 thread(s) 8042 ns 7041 ns 1.14
groupnorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/8 thread(s) 7541.5 ns 7833 ns 0.96
groupnorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/1 thread(s) 6979.5 ns 7375 ns 0.95
groupnorm(2, act=identity, affine=false)(4 x 32)/zygote/GPU/CUDA 811982.5 ns 800650 ns 1.01
bias_activation(2, act=gelu)(2 x 128)/forward/CPU/2 thread(s) 2125 ns 2000 ns 1.06
bias_activation(2, act=gelu)(2 x 128)/forward/CPU/4 thread(s) 2312.5 ns 2125 ns 1.09
bias_activation(2, act=gelu)(2 x 128)/forward/CPU/8 thread(s) 2500 ns 2458 ns 1.02
bias_activation(2, act=gelu)(2 x 128)/forward/CPU/1 thread(s) 2125 ns 2459 ns 0.86
bias_activation(2, act=gelu)(2 x 128)/forward/GPU/CUDA 18261 ns 17988 ns 1.02
bias_activation(2, act=gelu)(2 x 128)/zygote/CPU/2 thread(s) 6375 ns 6500 ns 0.98
bias_activation(2, act=gelu)(2 x 128)/zygote/CPU/4 thread(s) 6520.5 ns 6291 ns 1.04
bias_activation(2, act=gelu)(2 x 128)/zygote/CPU/8 thread(s) 6708 ns 6708 ns 1
bias_activation(2, act=gelu)(2 x 128)/zygote/CPU/1 thread(s) 6375 ns 6542 ns 0.97
bias_activation(2, act=gelu)(2 x 128)/zygote/GPU/CUDA 336632.5 ns 330671 ns 1.02
bias_activation(512, act=tanh)(512 x 128)/forward/CPU/2 thread(s) 749209 ns 749709 ns 1.00
bias_activation(512, act=tanh)(512 x 128)/forward/CPU/4 thread(s) 748895.5 ns 747104 ns 1.00
bias_activation(512, act=tanh)(512 x 128)/forward/CPU/8 thread(s) 749542 ns 749208 ns 1.00
bias_activation(512, act=tanh)(512 x 128)/forward/CPU/1 thread(s) 754083 ns 751791.5 ns 1.00
bias_activation(512, act=tanh)(512 x 128)/forward/GPU/CUDA 21329 ns 21045 ns 1.01
bias_activation(512, act=tanh)(512 x 128)/zygote/CPU/2 thread(s) 818750 ns 791000 ns 1.04
bias_activation(512, act=tanh)(512 x 128)/zygote/CPU/4 thread(s) 788167 ns 791062.5 ns 1.00
bias_activation(512, act=tanh)(512 x 128)/zygote/CPU/8 thread(s) 791584 ns 775875 ns 1.02
bias_activation(512, act=tanh)(512 x 128)/zygote/CPU/1 thread(s) 790584 ns 775250 ns 1.02
bias_activation(512, act=tanh)(512 x 128)/zygote/GPU/CUDA 299791 ns 294695 ns 1.02
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 7500 ns 7208 ns 1.04
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 5334 ns 5958 ns 0.90
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 5916 ns 5291 ns 1.12
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 10208 ns 10208 ns 1
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/CUDA 33718 ns 32534 ns 1.04
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 256167 ns 233291 ns 1.10
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 235520.5 ns 267375 ns 0.88
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 240500 ns 227812.5 ns 1.06
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 250875 ns 213583 ns 1.17
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 365654 ns 361573 ns 1.01
layernorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/2 thread(s) 10312.5 ns 10020.5 ns 1.03
layernorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/4 thread(s) 10416 ns 10042 ns 1.04
layernorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/8 thread(s) 10812.5 ns 11625 ns 0.93
layernorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/1 thread(s) 10166.5 ns 10208 ns 1.00
layernorm(2, act=gelu, affine=false)(32 x 32)/forward/GPU/CUDA 245731 ns 248981.5 ns 0.99
layernorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/2 thread(s) 25083 ns 26791 ns 0.94
layernorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/4 thread(s) 24667 ns 24292 ns 1.02
layernorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/8 thread(s) 24125 ns 24750 ns 0.97
layernorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/1 thread(s) 24500 ns 25000 ns 0.98
layernorm(2, act=gelu, affine=false)(32 x 32)/zygote/GPU/CUDA 1139764 ns 1132389 ns 1.01
Conv((3, 3), 32 => 32, gelu)(64 x 64 x 32 x 128)/forward/CPU/2 thread(s) 106439229 ns 107227250 ns 0.99
Conv((3, 3), 32 => 32, gelu)(64 x 64 x 32 x 128)/forward/CPU/4 thread(s) 127176500 ns 117058791.5 ns 1.09
Conv((3, 3), 32 => 32, gelu)(64 x 64 x 32 x 128)/forward/CPU/8 thread(s) 120453645.5 ns 124034229 ns 0.97
Conv((3, 3), 32 => 32, gelu)(64 x 64 x 32 x 128)/forward/CPU/1 thread(s) 117602312.5 ns 117545541.5 ns 1.00
Conv((3, 3), 32 => 32, gelu)(64 x 64 x 32 x 128)/forward/GPU/CUDA 2646453 ns 2659866 ns 0.99
Conv((3, 3), 32 => 32, gelu)(64 x 64 x 32 x 128)/zygote/CPU/2 thread(s) 394264417 ns 393155000 ns 1.00
Conv((3, 3), 32 => 32, gelu)(64 x 64 x 32 x 128)/zygote/CPU/4 thread(s) 380211666 ns 366597250 ns 1.04
Conv((3, 3), 32 => 32, gelu)(64 x 64 x 32 x 128)/zygote/CPU/8 thread(s) 421708312.5 ns 357674666 ns 1.18
Conv((3, 3), 32 => 32, gelu)(64 x 64 x 32 x 128)/zygote/CPU/1 thread(s) 479818917 ns 490403667 ns 0.98
Conv((3, 3), 32 => 32, gelu)(64 x 64 x 32 x 128)/zygote/GPU/CUDA 15158878 ns 15157994 ns 1.00
Conv((3, 3), 32 => 32, gelu)(64 x 64 x 32 x 128)/enzyme/CPU/2 thread(s) 756832624.5 ns 758865499.5 ns 1.00
Conv((3, 3), 32 => 32, gelu)(64 x 64 x 32 x 128)/enzyme/CPU/4 thread(s) 775894292 ns 580033084 ns 1.34
Conv((3, 3), 32 => 32, gelu)(64 x 64 x 32 x 128)/enzyme/CPU/8 thread(s) 748243271.5 ns 748265062.5 ns 1.00
Conv((3, 3), 32 => 32, gelu)(64 x 64 x 32 x 128)/enzyme/CPU/1 thread(s) 761933208.5 ns 948608916.5 ns 0.80
groupnorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/2 thread(s) 7145.5 ns 6916.5 ns 1.03
groupnorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/4 thread(s) 7834 ns 7000 ns 1.12
groupnorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/8 thread(s) 9541 ns 8042 ns 1.19
groupnorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/1 thread(s) 7417 ns 7625 ns 0.97
groupnorm(2, act=identity, affine=true)(32 x 32)/forward/GPU/CUDA 241749 ns 242461.5 ns 1.00
groupnorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/2 thread(s) 14291.5 ns 14084 ns 1.01
groupnorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/4 thread(s) 14166 ns 13500 ns 1.05
groupnorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/8 thread(s) 14167 ns 14208 ns 1.00
groupnorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/1 thread(s) 13708 ns 14333 ns 0.96
groupnorm(2, act=identity, affine=true)(32 x 32)/zygote/GPU/CUDA 1098247 ns 1085062 ns 1.01
groupnorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/2 thread(s) 6042 ns 5541 ns 1.09
groupnorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/4 thread(s) 6750 ns 6563 ns 1.03
groupnorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/8 thread(s) 7083 ns 7666 ns 0.92
groupnorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/1 thread(s) 5834 ns 6291 ns 0.93
groupnorm(2, act=identity, affine=false)(32 x 32)/forward/GPU/CUDA 240471.5 ns 235371.5 ns 1.02
groupnorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/2 thread(s) 12667 ns 12542 ns 1.01
groupnorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/4 thread(s) 13333 ns 12104.5 ns 1.10
groupnorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/8 thread(s) 13354.5 ns 13042 ns 1.02
groupnorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/1 thread(s) 12334 ns 12750 ns 0.97
groupnorm(2, act=identity, affine=false)(32 x 32)/zygote/GPU/CUDA 800476.5 ns 793450.5 ns 1.01
batchedmm(2, Bsize=128)/forward/CPU/2 thread(s) 5333 ns 5125 ns 1.04
batchedmm(2, Bsize=128)/forward/CPU/4 thread(s) 5875 ns 5750 ns 1.02
batchedmm(2, Bsize=128)/forward/CPU/8 thread(s) 6000 ns 6333 ns 0.95
batchedmm(2, Bsize=128)/forward/CPU/1 thread(s) 5500 ns 5625 ns 0.98
batchedmm(2, Bsize=128)/forward/GPU/CUDA 17559 ns 16571 ns 1.06
batchedmm(2, Bsize=128)/zygote/CPU/2 thread(s) 15459 ns 15792 ns 0.98
batchedmm(2, Bsize=128)/zygote/CPU/4 thread(s) 15437.5 ns 15417 ns 1.00
batchedmm(2, Bsize=128)/zygote/CPU/8 thread(s) 15667 ns 15625 ns 1.00
batchedmm(2, Bsize=128)/zygote/CPU/1 thread(s) 15750 ns 15750 ns 1
batchedmm(2, Bsize=128)/zygote/GPU/CUDA 202574 ns 200110.5 ns 1.01
batchnorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/2 thread(s) 417 ns 292 ns 1.43
batchnorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/4 thread(s) 333 ns 292 ns 1.14
batchnorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/8 thread(s) 416 ns 416 ns 1
batchnorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/1 thread(s) 292 ns 417 ns 0.70
batchnorm(2, act=identity, affine=true)(4 x 32)/forward/GPU/CUDA 24102 ns 23594.5 ns 1.02
batchnorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/2 thread(s) 6333 ns 5959 ns 1.06
batchnorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/4 thread(s) 6209 ns 6083 ns 1.02
batchnorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/8 thread(s) 6750 ns 6666 ns 1.01
batchnorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/1 thread(s) 6333 ns 6834 ns 0.93
batchnorm(2, act=identity, affine=true)(4 x 32)/zygote/GPU/CUDA 242831.5 ns 242427.5 ns 1.00
batchnorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/2 thread(s) 5916 ns 5833 ns 1.01
batchnorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/4 thread(s) 5917 ns 5834 ns 1.01
batchnorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/8 thread(s) 5958 ns 6000 ns 0.99
batchnorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/1 thread(s) 5792 ns 6041 ns 0.96
batchnorm(2, act=gelu, affine=true)(32 x 32)/forward/GPU/CUDA 25033 ns 24342.5 ns 1.03
batchnorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/2 thread(s) 21375 ns 20875 ns 1.02
batchnorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/4 thread(s) 21125 ns 21042 ns 1.00
batchnorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/8 thread(s) 21375 ns 21666 ns 0.99
batchnorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/1 thread(s) 21020.5 ns 21875 ns 0.96
batchnorm(2, act=gelu, affine=true)(32 x 32)/zygote/GPU/CUDA 267836 ns 262727.5 ns 1.02
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 144833 ns 185833 ns 0.78
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 145250 ns 144916.5 ns 1.00
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 150083.5 ns 146875 ns 1.02
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 188375 ns 144416.5 ns 1.30
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/CUDA 168310 ns 167734 ns 1.00
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 1351833 ns 1323750 ns 1.02
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 1369333 ns 1312209 ns 1.04
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 1322041 ns 1332875 ns 0.99
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 1327250 ns 1333770.5 ns 1.00
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 1368007 ns 1339118 ns 1.02
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 23042 ns 24041.5 ns 0.96
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 24041 ns 22312.5 ns 1.08
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 24917 ns 24833 ns 1.00
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 21833 ns 24667 ns 0.89
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/CUDA 356401.5 ns 351890.5 ns 1.01
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 126958 ns 170708 ns 0.74
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 120333 ns 177875 ns 0.68
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 180250 ns 118625 ns 1.52
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 180749.5 ns 120020.5 ns 1.51
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 1484885 ns 1461877 ns 1.02
batchnorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/2 thread(s) 375 ns 292 ns 1.28
batchnorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/4 thread(s) 333 ns 292 ns 1.14
batchnorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/8 thread(s) 375 ns 375 ns 1
batchnorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/1 thread(s) 292 ns 416 ns 0.70
batchnorm(2, act=relu, affine=true)(4 x 32)/forward/GPU/CUDA 23370 ns 22590 ns 1.03
batchnorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/2 thread(s) 6479.5 ns 6250 ns 1.04
batchnorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/4 thread(s) 6416 ns 6250 ns 1.03
batchnorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/8 thread(s) 7042 ns 6750 ns 1.04
batchnorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/1 thread(s) 6333 ns 6583 ns 0.96
batchnorm(2, act=relu, affine=true)(4 x 32)/zygote/GPU/CUDA 260419.5 ns 255552.5 ns 1.02
layernorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/2 thread(s) 4333 ns 4291 ns 1.01
layernorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/4 thread(s) 5041.5 ns 4417 ns 1.14
layernorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/8 thread(s) 5459 ns 5708 ns 0.96
layernorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/1 thread(s) 4583.5 ns 5292 ns 0.87
layernorm(2, act=gelu, affine=true)(4 x 32)/forward/GPU/CUDA 255220 ns 256272 ns 1.00
layernorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/2 thread(s) 10166.5 ns 10042 ns 1.01
layernorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/4 thread(s) 10167 ns 9833 ns 1.03
layernorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/8 thread(s) 10250 ns 10417 ns 0.98
layernorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/1 thread(s) 10208 ns 10333 ns 0.99
layernorm(2, act=gelu, affine=true)(4 x 32)/zygote/GPU/CUDA 1368092 ns 1354208 ns 1.01
dense(2, bias=false, act=gelu)(2 x 128)/forward/CPU/2 thread(s) 1625 ns 1583 ns 1.03
dense(2, bias=false, act=gelu)(2 x 128)/forward/CPU/4 thread(s) 1666 ns 1625 ns 1.03
dense(2, bias=false, act=gelu)(2 x 128)/forward/CPU/8 thread(s) 1625 ns 1666 ns 0.98
dense(2, bias=false, act=gelu)(2 x 128)/forward/CPU/1 thread(s) 1583 ns 1625 ns 0.97
dense(2, bias=false, act=gelu)(2 x 128)/forward/GPU/CUDA 23227 ns 22798 ns 1.02
dense(2, bias=false, act=gelu)(2 x 128)/zygote/CPU/2 thread(s) 5750 ns 5833 ns 0.99
dense(2, bias=false, act=gelu)(2 x 128)/zygote/CPU/4 thread(s) 5750 ns 5709 ns 1.01
dense(2, bias=false, act=gelu)(2 x 128)/zygote/CPU/8 thread(s) 5750 ns 6000 ns 0.96
dense(2, bias=false, act=gelu)(2 x 128)/zygote/CPU/1 thread(s) 5583 ns 5916 ns 0.94
dense(2, bias=false, act=gelu)(2 x 128)/zygote/GPU/CUDA 278026 ns 274328 ns 1.01
Conv((3, 3), 2 => 2, gelu)(64 x 64 x 2 x 128)/forward/CPU/2 thread(s) 6781854.5 ns 6866624.5 ns 0.99
Conv((3, 3), 2 => 2, gelu)(64 x 64 x 2 x 128)/forward/CPU/4 thread(s) 6363854.5 ns 6433708 ns 0.99
Conv((3, 3), 2 => 2, gelu)(64 x 64 x 2 x 128)/forward/CPU/8 thread(s) 6534166 ns 6554499.5 ns 1.00
Conv((3, 3), 2 => 2, gelu)(64 x 64 x 2 x 128)/forward/CPU/1 thread(s) 7654958.5 ns 7548875 ns 1.01
Conv((3, 3), 2 => 2, gelu)(64 x 64 x 2 x 128)/forward/GPU/CUDA 216771 ns 213149 ns 1.02
Conv((3, 3), 2 => 2, gelu)(64 x 64 x 2 x 128)/zygote/CPU/2 thread(s) 24093667 ns 24100417 ns 1.00
Conv((3, 3), 2 => 2, gelu)(64 x 64 x 2 x 128)/zygote/CPU/4 thread(s) 21335604 ns 21294521 ns 1.00
Conv((3, 3), 2 => 2, gelu)(64 x 64 x 2 x 128)/zygote/CPU/8 thread(s) 21037958 ns 21070125 ns 1.00
Conv((3, 3), 2 => 2, gelu)(64 x 64 x 2 x 128)/zygote/CPU/1 thread(s) 29730292 ns 29826667 ns 1.00
Conv((3, 3), 2 => 2, gelu)(64 x 64 x 2 x 128)/zygote/GPU/CUDA 2100300 ns 2116806 ns 0.99
Conv((3, 3), 2 => 2, gelu)(64 x 64 x 2 x 128)/enzyme/CPU/2 thread(s) 37311042 ns 37336834 ns 1.00
Conv((3, 3), 2 => 2, gelu)(64 x 64 x 2 x 128)/enzyme/CPU/4 thread(s) 45649479 ns 34197292 ns 1.33
Conv((3, 3), 2 => 2, gelu)(64 x 64 x 2 x 128)/enzyme/CPU/8 thread(s) 45692458 ns 45794042 ns 1.00
Conv((3, 3), 2 => 2, gelu)(64 x 64 x 2 x 128)/enzyme/CPU/1 thread(s) 38098959 ns 49624208 ns 0.77
groupnorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/2 thread(s) 5520.5 ns 5750 ns 0.96
groupnorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/4 thread(s) 6708.5 ns 5625 ns 1.19
groupnorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/8 thread(s) 7250 ns 6791 ns 1.07
groupnorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/1 thread(s) 6125 ns 6667 ns 0.92
groupnorm(2, act=identity, affine=true)(4 x 32)/forward/GPU/CUDA 240533.5 ns 236202.5 ns 1.02
groupnorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/2 thread(s) 8083 ns 8084 ns 1.00
groupnorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/4 thread(s) 9083 ns 7875 ns 1.15
groupnorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/8 thread(s) 8417 ns 8667 ns 0.97
groupnorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/1 thread(s) 8250 ns 9167 ns 0.90
groupnorm(2, act=identity, affine=true)(4 x 32)/zygote/GPU/CUDA 1077102 ns 1060405 ns 1.02
lenet(28, 28, 1, 128)/forward/CPU/2 thread(s) 1489187.5 ns 1553542 ns 0.96
lenet(28, 28, 1, 128)/forward/CPU/4 thread(s) 1236771 ns 1263041.5 ns 0.98
lenet(28, 28, 1, 128)/forward/CPU/8 thread(s) 1617916 ns 1622041 ns 1.00
lenet(28, 28, 1, 128)/forward/CPU/1 thread(s) 2170020.5 ns 2175916 ns 1.00
lenet(28, 28, 1, 128)/forward/GPU/CUDA 282849 ns 272178 ns 1.04
lenet(28, 28, 1, 128)/zygote/CPU/2 thread(s) 7909229.5 ns 7902375 ns 1.00
lenet(28, 28, 1, 128)/zygote/CPU/4 thread(s) 6634750 ns 6258292 ns 1.06
lenet(28, 28, 1, 128)/zygote/CPU/8 thread(s) 7161708 ns 7165958 ns 1.00
lenet(28, 28, 1, 128)/zygote/CPU/1 thread(s) 10483708.5 ns 10478104.5 ns 1.00
lenet(28, 28, 1, 128)/zygote/GPU/CUDA 1903700.5 ns 1852121.5 ns 1.03
batchedmm(128, Bsize=4)/forward/CPU/2 thread(s) 367625 ns 361584 ns 1.02
batchedmm(128, Bsize=4)/forward/CPU/4 thread(s) 349896 ns 370750 ns 0.94
batchedmm(128, Bsize=4)/forward/CPU/8 thread(s) 453917 ns 456417 ns 0.99
batchedmm(128, Bsize=4)/forward/CPU/1 thread(s) 24459 ns 24999.5 ns 0.98
batchedmm(128, Bsize=4)/forward/GPU/CUDA 43502 ns 46439.5 ns 0.94
batchedmm(128, Bsize=4)/zygote/CPU/2 thread(s) 727167 ns 738895.5 ns 0.98
batchedmm(128, Bsize=4)/zygote/CPU/4 thread(s) 803167 ns 809958 ns 0.99
batchedmm(128, Bsize=4)/zygote/CPU/8 thread(s) 1057604 ns 1082542 ns 0.98
batchedmm(128, Bsize=4)/zygote/CPU/1 thread(s) 121792 ns 76708 ns 1.59
batchedmm(128, Bsize=4)/zygote/GPU/CUDA 307546.5 ns 301861.5 ns 1.02
dense(512, bias=true, act=identity)(512 x 128)/forward/CPU/2 thread(s) 397583 ns 397459 ns 1.00
dense(512, bias=true, act=identity)(512 x 128)/forward/CPU/4 thread(s) 213333 ns 288084 ns 0.74
dense(512, bias=true, act=identity)(512 x 128)/forward/CPU/8 thread(s) 288209 ns 212208 ns 1.36
dense(512, bias=true, act=identity)(512 x 128)/forward/CPU/1 thread(s) 751125 ns 755209 ns 0.99
dense(512, bias=true, act=identity)(512 x 128)/forward/GPU/CUDA 44141 ns 43701 ns 1.01
dense(512, bias=true, act=identity)(512 x 128)/zygote/CPU/2 thread(s) 675500 ns 665625 ns 1.01
dense(512, bias=true, act=identity)(512 x 128)/zygote/CPU/4 thread(s) 475667 ns 530417 ns 0.90
dense(512, bias=true, act=identity)(512 x 128)/zygote/CPU/8 thread(s) 531375 ns 473750 ns 1.12
dense(512, bias=true, act=identity)(512 x 128)/zygote/CPU/1 thread(s) 972666.5 ns 974458 ns 1.00
dense(512, bias=true, act=identity)(512 x 128)/zygote/GPU/CUDA 191213 ns 189749 ns 1.01
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 658208.5 ns 649583 ns 1.01
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 643834 ns 641833 ns 1.00
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 655125 ns 545458.5 ns 1.20
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 681792 ns 653167 ns 1.04
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/CUDA 132164.5 ns 131877 ns 1.00
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 2526833 ns 2454834 ns 1.03
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 2530541 ns 2460271 ns 1.03
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 2451667 ns 2500666 ns 0.98
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 2454146 ns 2518479 ns 0.97
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 1206173 ns 1202049 ns 1.00
batchedmm(2, Bsize=32)/forward/CPU/2 thread(s) 2604 ns 3000 ns 0.87
batchedmm(2, Bsize=32)/forward/CPU/4 thread(s) 2459 ns 3500 ns 0.70
batchedmm(2, Bsize=32)/forward/CPU/8 thread(s) 4375 ns 3500 ns 1.25
batchedmm(2, Bsize=32)/forward/CPU/1 thread(s) 2583 ns 2708 ns 0.95
batchedmm(2, Bsize=32)/forward/GPU/CUDA 16766 ns 15904 ns 1.05
batchedmm(2, Bsize=32)/zygote/CPU/2 thread(s) 5333 ns 5375 ns 0.99
batchedmm(2, Bsize=32)/zygote/CPU/4 thread(s) 5542 ns 5292 ns 1.05
batchedmm(2, Bsize=32)/zygote/CPU/8 thread(s) 5583 ns 5666 ns 0.99
batchedmm(2, Bsize=32)/zygote/CPU/1 thread(s) 5542 ns 5750 ns 0.96
batchedmm(2, Bsize=32)/zygote/GPU/CUDA 199467 ns 196388 ns 1.02
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 1459833 ns 1465625 ns 1.00
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 1490334 ns 1502708 ns 0.99
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 1497791 ns 1496875 ns 1.00
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 1439750 ns 1444792 ns 1.00
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/CUDA 41167 ns 40558 ns 1.02
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 5155562 ns 5125396 ns 1.01
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 5314187.5 ns 5286583 ns 1.01
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 5282833 ns 5312375 ns 0.99
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 4979791 ns 4974792 ns 1.00
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 198405.5 ns 195790.5 ns 1.01
dense(32, bias=false, act=identity)(32 x 128)/forward/CPU/2 thread(s) 3708 ns 3708 ns 1
dense(32, bias=false, act=identity)(32 x 128)/forward/CPU/4 thread(s) 3708 ns 3708 ns 1
dense(32, bias=false, act=identity)(32 x 128)/forward/CPU/8 thread(s) 3708 ns 3709 ns 1.00
dense(32, bias=false, act=identity)(32 x 128)/forward/CPU/1 thread(s) 3750 ns 3708 ns 1.01
dense(32, bias=false, act=identity)(32 x 128)/forward/GPU/CUDA 33352 ns 32748 ns 1.02
dense(32, bias=false, act=identity)(32 x 128)/zygote/CPU/2 thread(s) 15208 ns 15083 ns 1.01
dense(32, bias=false, act=identity)(32 x 128)/zygote/CPU/4 thread(s) 15000 ns 15083 ns 0.99
dense(32, bias=false, act=identity)(32 x 128)/zygote/CPU/8 thread(s) 15209 ns 15167 ns 1.00
dense(32, bias=false, act=identity)(32 x 128)/zygote/CPU/1 thread(s) 15291 ns 15375 ns 0.99
dense(32, bias=false, act=identity)(32 x 128)/zygote/GPU/CUDA 379437.5 ns 375651.5 ns 1.01
dense(512, bias=false, act=relu)(512 x 128)/forward/CPU/2 thread(s) 71625 ns 71125 ns 1.01
dense(512, bias=false, act=relu)(512 x 128)/forward/CPU/4 thread(s) 71416 ns 71167 ns 1.00
dense(512, bias=false, act=relu)(512 x 128)/forward/CPU/8 thread(s) 71208 ns 71208 ns 1
dense(512, bias=false, act=relu)(512 x 128)/forward/CPU/1 thread(s) 71083 ns 71083 ns 1
dense(512, bias=false, act=relu)(512 x 128)/forward/GPU/CUDA 113188.5 ns 112958 ns 1.00
dense(512, bias=false, act=relu)(512 x 128)/zygote/CPU/2 thread(s) 321770.5 ns 323791 ns 0.99
dense(512, bias=false, act=relu)(512 x 128)/zygote/CPU/4 thread(s) 330770.5 ns 320458 ns 1.03
dense(512, bias=false, act=relu)(512 x 128)/zygote/CPU/8 thread(s) 319333 ns 326875 ns 0.98
dense(512, bias=false, act=relu)(512 x 128)/zygote/CPU/1 thread(s) 326458 ns 323000 ns 1.01
dense(512, bias=false, act=relu)(512 x 128)/zygote/GPU/CUDA 194877 ns 193747 ns 1.01
batchnorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/2 thread(s) 1000 ns 1000 ns 1
batchnorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/4 thread(s) 1083 ns 958 ns 1.13
batchnorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/8 thread(s) 1083 ns 1042 ns 1.04
batchnorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/1 thread(s) 959 ns 1084 ns 0.88
batchnorm(2, act=gelu, affine=true)(4 x 32)/forward/GPU/CUDA 23702 ns 23358 ns 1.01
batchnorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/2 thread(s) 7917 ns 7875 ns 1.01
batchnorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/4 thread(s) 8125 ns 7834 ns 1.04
batchnorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/8 thread(s) 8125 ns 8458 ns 0.96
batchnorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/1 thread(s) 7916 ns 8833 ns 0.90
batchnorm(2, act=gelu, affine=true)(4 x 32)/zygote/GPU/CUDA 263485 ns 259209 ns 1.02
batchedmm(128, Bsize=32)/forward/CPU/2 thread(s) 497624.5 ns 505375 ns 0.98
batchedmm(128, Bsize=32)/forward/CPU/4 thread(s) 471604 ns 484292 ns 0.97
batchedmm(128, Bsize=32)/forward/CPU/8 thread(s) 563708 ns 564542 ns 1.00
batchedmm(128, Bsize=32)/forward/CPU/1 thread(s) 218208 ns 215062.5 ns 1.01
batchedmm(128, Bsize=32)/forward/GPU/CUDA 129739 ns 128754 ns 1.01
batchedmm(128, Bsize=32)/zygote/CPU/2 thread(s) 1355292 ns 1371334 ns 0.99
batchedmm(128, Bsize=32)/zygote/CPU/4 thread(s) 1470187.5 ns 1393812.5 ns 1.05
batchedmm(128, Bsize=32)/zygote/CPU/8 thread(s) 1719583.5 ns 1732333 ns 0.99
batchedmm(128, Bsize=32)/zygote/CPU/1 thread(s) 867375 ns 870083.5 ns 1.00
batchedmm(128, Bsize=32)/zygote/GPU/CUDA 275487 ns 276302 ns 1.00
batchnorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/2 thread(s) 417 ns 333 ns 1.25
batchnorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/4 thread(s) 375 ns 292 ns 1.28
batchnorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/8 thread(s) 375 ns 375 ns 1
batchnorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/1 thread(s) 333 ns 375 ns 0.89
batchnorm(2, act=relu, affine=false)(4 x 32)/forward/GPU/CUDA 31436 ns 31400 ns 1.00
batchnorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/2 thread(s) 6208 ns 6167 ns 1.01
batchnorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/4 thread(s) 6333 ns 6000 ns 1.06
batchnorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/8 thread(s) 6458 ns 6500 ns 0.99
batchnorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/1 thread(s) 6333 ns 6958 ns 0.91
batchnorm(2, act=relu, affine=false)(4 x 32)/zygote/GPU/CUDA 262275 ns 263074.5 ns 1.00
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 1727063 ns 1767042 ns 0.98
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 1729458 ns 1725208 ns 1.00
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 1725417 ns 1727292 ns 1.00
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 1768875 ns 1726271 ns 1.02
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/CUDA 168537 ns 168554 ns 1.00
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 4367874.5 ns 4357521 ns 1.00
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 4385375 ns 4359541 ns 1.01
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 4367104 ns 4379875 ns 1.00
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 4357459 ns 4377583 ns 1.00
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 1262273 ns 1157059 ns 1.09
bias_activation(512, act=relu)(512 x 128)/forward/CPU/2 thread(s) 6708 ns 6666 ns 1.01
bias_activation(512, act=relu)(512 x 128)/forward/CPU/4 thread(s) 6541 ns 6666 ns 0.98
bias_activation(512, act=relu)(512 x 128)/forward/CPU/8 thread(s) 7000 ns 6916 ns 1.01
bias_activation(512, act=relu)(512 x 128)/forward/CPU/1 thread(s) 6875 ns 7041.5 ns 0.98
bias_activation(512, act=relu)(512 x 128)/forward/GPU/CUDA 20525 ns 20567 ns 1.00
bias_activation(512, act=relu)(512 x 128)/zygote/CPU/2 thread(s) 33063 ns 32834 ns 1.01
bias_activation(512, act=relu)(512 x 128)/zygote/CPU/4 thread(s) 33083 ns 51229.5 ns 0.65
bias_activation(512, act=relu)(512 x 128)/zygote/CPU/8 thread(s) 48041.5 ns 33541.5 ns 1.43
bias_activation(512, act=relu)(512 x 128)/zygote/CPU/1 thread(s) 53792 ns 51062.5 ns 1.05
bias_activation(512, act=relu)(512 x 128)/zygote/GPU/CUDA 291536.5 ns 209739.5 ns 1.39
batchedmm(2, Bsize=512)/forward/CPU/2 thread(s) 17333.5 ns 17250 ns 1.00
batchedmm(2, Bsize=512)/forward/CPU/4 thread(s) 17792 ns 17812.5 ns 1.00
batchedmm(2, Bsize=512)/forward/CPU/8 thread(s) 18209 ns 18292 ns 1.00
batchedmm(2, Bsize=512)/forward/CPU/1 thread(s) 17666 ns 17708 ns 1.00
batchedmm(2, Bsize=512)/forward/GPU/CUDA 18396 ns 17907 ns 1.03
batchedmm(2, Bsize=512)/zygote/CPU/2 thread(s) 53209 ns 53208 ns 1.00
batchedmm(2, Bsize=512)/zygote/CPU/4 thread(s) 53417 ns 52959 ns 1.01
batchedmm(2, Bsize=512)/zygote/CPU/8 thread(s) 53292 ns 53541 ns 1.00
batchedmm(2, Bsize=512)/zygote/CPU/1 thread(s) 53375 ns 53291 ns 1.00
batchedmm(2, Bsize=512)/zygote/GPU/CUDA 338706.5 ns 344400 ns 0.98
dense(512, bias=true, act=relu)(512 x 128)/forward/CPU/2 thread(s) 75500 ns 75333 ns 1.00
dense(512, bias=true, act=relu)(512 x 128)/forward/CPU/4 thread(s) 75417 ns 74959 ns 1.01
dense(512, bias=true, act=relu)(512 x 128)/forward/CPU/8 thread(s) 75292 ns 75292 ns 1
dense(512, bias=true, act=relu)(512 x 128)/forward/CPU/1 thread(s) 75292 ns 75000 ns 1.00
dense(512, bias=true, act=relu)(512 x 128)/forward/GPU/CUDA 46489 ns 47022 ns 0.99
dense(512, bias=true, act=relu)(512 x 128)/zygote/CPU/2 thread(s) 329084 ns 325292 ns 1.01
dense(512, bias=true, act=relu)(512 x 128)/zygote/CPU/4 thread(s) 336667 ns 324417 ns 1.04
dense(512, bias=true, act=relu)(512 x 128)/zygote/CPU/8 thread(s) 328958 ns 343042 ns 0.96
dense(512, bias=true, act=relu)(512 x 128)/zygote/CPU/1 thread(s) 323917 ns 327084 ns 0.99
dense(512, bias=true, act=relu)(512 x 128)/zygote/GPU/CUDA 209091.5 ns 210359 ns 0.99
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 1486166 ns 1488333 ns 1.00
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 1517709 ns 1527917 ns 0.99
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 1525792 ns 1521042 ns 1.00
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 1464375 ns 1466167 ns 1.00
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/CUDA 52406 ns 51138 ns 1.02
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 5153729.5 ns 5120375 ns 1.01
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 5303250 ns 5285750 ns 1.00
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 5257500 ns 5309459 ns 0.99
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 4990145.5 ns 4973917 ns 1.00
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 203681 ns 202631 ns 1.01
dense(32, bias=true, act=gelu)(32 x 128)/forward/CPU/2 thread(s) 28250 ns 28167 ns 1.00
dense(32, bias=true, act=gelu)(32 x 128)/forward/CPU/4 thread(s) 28208 ns 28125 ns 1.00
dense(32, bias=true, act=gelu)(32 x 128)/forward/CPU/8 thread(s) 28250 ns 28208 ns 1.00
dense(32, bias=true, act=gelu)(32 x 128)/forward/CPU/1 thread(s) 28375 ns 28209 ns 1.01
dense(32, bias=true, act=gelu)(32 x 128)/forward/GPU/CUDA 24536 ns 24478 ns 1.00
dense(32, bias=true, act=gelu)(32 x 128)/zygote/CPU/2 thread(s) 66708 ns 66208 ns 1.01
dense(32, bias=true, act=gelu)(32 x 128)/zygote/CPU/4 thread(s) 66125 ns 66167 ns 1.00
dense(32, bias=true, act=gelu)(32 x 128)/zygote/CPU/8 thread(s) 66250 ns 66250 ns 1
dense(32, bias=true, act=gelu)(32 x 128)/zygote/CPU/1 thread(s) 66416 ns 66959 ns 0.99
dense(32, bias=true, act=gelu)(32 x 128)/zygote/GPU/CUDA 535849 ns 533201 ns 1.00
mlp7layer_bn(tanh)(32 x 256)/forward/CPU/2 thread(s) 1468041 ns 1463833 ns 1.00
mlp7layer_bn(tanh)(32 x 256)/forward/CPU/4 thread(s) 912854 ns 1144583 ns 0.80
mlp7layer_bn(tanh)(32 x 256)/forward/CPU/8 thread(s) 1130187.5 ns 832188 ns 1.36
mlp7layer_bn(tanh)(32 x 256)/forward/CPU/1 thread(s) 2251604 ns 2217792 ns 1.02
mlp7layer_bn(tanh)(32 x 256)/forward/GPU/CUDA 583084 ns 576305 ns 1.01
mlp7layer_bn(tanh)(32 x 256)/zygote/CPU/2 thread(s) 3113959 ns 3077958.5 ns 1.01
mlp7layer_bn(tanh)(32 x 256)/zygote/CPU/4 thread(s) 2660771 ns 2733167 ns 0.97
mlp7layer_bn(tanh)(32 x 256)/zygote/CPU/8 thread(s) 2734000 ns 2620334 ns 1.04
mlp7layer_bn(tanh)(32 x 256)/zygote/CPU/1 thread(s) 3802646 ns 3782000 ns 1.01
mlp7layer_bn(tanh)(32 x 256)/zygote/GPU/CUDA 2002672 ns 2001343 ns 1.00
mlp7layer_bn(tanh)(32 x 256)/enzyme/CPU/2 thread(s) 7929500 ns 7887749.5 ns 1.01
mlp7layer_bn(tanh)(32 x 256)/enzyme/CPU/4 thread(s) 8011167 ns 7887771 ns 1.02
mlp7layer_bn(tanh)(32 x 256)/enzyme/CPU/8 thread(s) 7911791.5 ns 7989000 ns 0.99
mlp7layer_bn(tanh)(32 x 256)/enzyme/CPU/1 thread(s) 4826833 ns 4832458 ns 1.00
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 81437.5 ns 134958 ns 0.60
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 83395.5 ns 78917 ns 1.06
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 84437.5 ns 82625 ns 1.02
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 136500 ns 81250 ns 1.68
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/CUDA 193251.5 ns 193237.5 ns 1.00
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 2033479 ns 2017354.5 ns 1.01
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 2014584 ns 2006750 ns 1.00
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 2016000 ns 2041167 ns 0.99
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 2013958 ns 2018875 ns 1.00
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 792396 ns 797402 ns 0.99

This comment was automatically generated by workflow using github-action-benchmark.

Please sign in to comment.