Skip to content

Commit

Permalink
chore: use [sources] in Project.toml (#1090)
Browse files Browse the repository at this point in the history
  • Loading branch information
avik-pal authored Nov 17, 2024
1 parent cf8fd61 commit 2331c99
Show file tree
Hide file tree
Showing 15 changed files with 101 additions and 138 deletions.
26 changes: 8 additions & 18 deletions .buildkite/testing.yml
Original file line number Diff line number Diff line change
Expand Up @@ -25,11 +25,6 @@ steps:
command: |
julia --color=yes --code-coverage=user --depwarn=yes --project=. -e '
import Pkg;
dev_pkgs = Pkg.PackageSpec[];
for pkg in ("lib/LuxCore", "lib/MLDataDevices", "lib/WeightInitializers", "lib/LuxLib",)
push!(dev_pkgs, Pkg.PackageSpec(path=pkg));
end
Pkg.develop(dev_pkgs);
Pkg.Registry.update();
Pkg.instantiate();
Pkg.activate("test");
Expand All @@ -39,10 +34,10 @@ steps:
end
Pkg.develop(dev_pkgs);
Pkg.instantiate();'
julia --color=yes --code-coverage=user --depwarn=yes --project=test -e '
import Pkg, Lux;
dir = dirname(pathof(Lux));
include(joinpath(dir, "../test/runtests.jl"))'
julia --color=yes --code-coverage=user --depwarn=yes --project=test -e '
import Pkg, Lux;
dir = dirname(pathof(Lux));
include(joinpath(dir, "../test/runtests.jl"))'
env:
BACKEND_GROUP: "CUDA"
LUX_TEST_GROUP: "{{matrix.group}}"
Expand Down Expand Up @@ -79,11 +74,6 @@ steps:
command: |
julia --color=yes --code-coverage=user --depwarn=yes --project=. -e '
import Pkg;
dev_pkgs = Pkg.PackageSpec[];
for pkg in ("lib/LuxCore", "lib/MLDataDevices", "lib/WeightInitializers", "lib/LuxLib",)
push!(dev_pkgs, Pkg.PackageSpec(path=pkg));
end
Pkg.develop(dev_pkgs);
Pkg.Registry.update();
Pkg.instantiate();
Pkg.activate("test");
Expand All @@ -93,10 +83,10 @@ steps:
end
Pkg.develop(dev_pkgs);
Pkg.instantiate();'
julia --color=yes --code-coverage=user --depwarn=yes --project=test -e '
import Pkg, Lux;
dir = dirname(pathof(Lux));
include(joinpath(dir, "../test/runtests.jl"))'
julia --color=yes --code-coverage=user --depwarn=yes --project=test -e '
import Pkg, Lux;
dir = dirname(pathof(Lux));
include(joinpath(dir, "../test/runtests.jl"))'
env:
BACKEND_GROUP: "AMDGPU"
agents:
Expand Down
10 changes: 0 additions & 10 deletions .buildkite/testing_luxlib.yml
Original file line number Diff line number Diff line change
Expand Up @@ -19,11 +19,6 @@ steps:
julia --color=yes --code-coverage=user --depwarn=yes --project=lib/LuxLib -e '
import Pkg;
Pkg.Registry.update();
dev_pkgs = Pkg.PackageSpec[];
for pkg in ("lib/LuxCore", "lib/MLDataDevices")
push!(dev_pkgs, Pkg.PackageSpec(path=pkg));
end;
Pkg.develop(dev_pkgs);
Pkg.instantiate();
Pkg.activate("lib/LuxLib/test");
dev_pkgs = Pkg.PackageSpec[];
Expand Down Expand Up @@ -67,11 +62,6 @@ steps:
julia --color=yes --code-coverage=user --depwarn=yes --project=lib/LuxLib -e '
import Pkg;
Pkg.Registry.update();
dev_pkgs = Pkg.PackageSpec[];
for pkg in ("lib/LuxCore", "lib/MLDataDevices")
push!(dev_pkgs, Pkg.PackageSpec(path=pkg));
end;
Pkg.develop(dev_pkgs);
Pkg.instantiate();
Pkg.activate("lib/LuxLib/test");
dev_pkgs = Pkg.PackageSpec[];
Expand Down
5 changes: 0 additions & 5 deletions .buildkite/testing_luxtestutils.yml
Original file line number Diff line number Diff line change
Expand Up @@ -15,11 +15,6 @@ steps:
julia --color=yes --code-coverage=user --depwarn=yes --project=lib/LuxTestUtils -e '
import Pkg;
Pkg.Registry.update();
dev_pkgs = Pkg.PackageSpec[];
for pkg in ("lib/MLDataDevices",)
push!(dev_pkgs, Pkg.PackageSpec(path=pkg));
end;
Pkg.develop(dev_pkgs);
Pkg.instantiate();
Pkg.test(; coverage="user")'
agents:
Expand Down
32 changes: 10 additions & 22 deletions .github/workflows/CI.yml
Original file line number Diff line number Diff line change
Expand Up @@ -44,10 +44,10 @@ jobs:
- "misc"
- "reactant"
include:
- version: "1.10"
- version: "1"
os: "macos-latest"
test_group: "all"
- version: "1.10"
- version: "1"
os: "windows-latest"
test_group: "all"
steps:
Expand All @@ -65,24 +65,18 @@ jobs:
${{ runner.os }}-test-${{ env.cache-name }}-
${{ runner.os }}-test-
${{ runner.os }}-
- name: "Install Dependencies"
- uses: julia-actions/julia-buildpkg@v1
- name: "Dev Test Dependencies"
run: |
import Pkg
dev_pkgs = Pkg.PackageSpec[]
for pkg in ("lib/LuxCore", "lib/MLDataDevices", "lib/WeightInitializers", "lib/LuxLib",)
push!(dev_pkgs, Pkg.PackageSpec(path=pkg))
end
Pkg.develop(dev_pkgs)
Pkg.Registry.update()
Pkg.instantiate()
Pkg.activate("test")
dev_pkgs = Pkg.PackageSpec[]
for pkg in ("lib/LuxTestUtils", "lib/LuxLib", "lib/MLDataDevices", "lib/LuxCore", ".")
push!(dev_pkgs, Pkg.PackageSpec(path=pkg))
end
Pkg.develop(dev_pkgs)
Pkg.instantiate()
shell: julia --color=yes --code-coverage=user --depwarn=yes --project=. {0}
shell: julia --color=yes --code-coverage=user --depwarn=yes --project=test {0}
- name: "Run Tests"
run: |
import Pkg, Lux
Expand All @@ -99,7 +93,7 @@ jobs:
files: lcov.info
token: ${{ secrets.CODECOV_TOKEN }}
verbose: true
fail_ci_if_error: true
fail_ci_if_error: false

downgrade:
if: ${{ !contains(github.event.head_commit.message, '[skip tests]') && github.base_ref == github.event.repository.default_branch }}
Expand All @@ -112,24 +106,18 @@ jobs:
- uses: julia-actions/julia-downgrade-compat@v1
with:
skip: "LuxCore,MLDataDevices,WeightInitializers,LuxLib"
- name: "Install Dependencies"
- uses: julia-actions/julia-buildpkg@v1
- name: "Dev Test Dependencies"
run: |
import Pkg
dev_pkgs = Pkg.PackageSpec[]
for pkg in ("lib/LuxCore", "lib/MLDataDevices", "lib/WeightInitializers", "lib/LuxLib",)
push!(dev_pkgs, Pkg.PackageSpec(path=pkg))
end
Pkg.develop(dev_pkgs)
Pkg.Registry.update()
Pkg.instantiate()
Pkg.activate("test")
dev_pkgs = Pkg.PackageSpec[]
for pkg in ("lib/LuxTestUtils", "lib/LuxLib", "lib/MLDataDevices", "lib/LuxCore", ".")
push!(dev_pkgs, Pkg.PackageSpec(path=pkg))
end
Pkg.develop(dev_pkgs)
Pkg.instantiate()
shell: julia --color=yes --code-coverage=user --depwarn=yes --project=. {0}
shell: julia --color=yes --code-coverage=user --depwarn=yes --project=test {0}
- name: "Run Tests"
run: |
import Pkg, Lux
Expand All @@ -144,7 +132,7 @@ jobs:
files: lcov.info
token: ${{ secrets.CODECOV_TOKEN }}
verbose: true
fail_ci_if_error: true
fail_ci_if_error: false

env:
BACKEND_GROUP: "CPU"
2 changes: 1 addition & 1 deletion .github/workflows/CIPreRelease.yml
Original file line number Diff line number Diff line change
Expand Up @@ -92,4 +92,4 @@ jobs:
files: lcov.info
token: ${{ secrets.CODECOV_TOKEN }}
verbose: true
fail_ci_if_error: true
fail_ci_if_error: false
18 changes: 10 additions & 8 deletions .github/workflows/CI_LuxCUDA.yml
Original file line number Diff line number Diff line change
Expand Up @@ -39,11 +39,12 @@ jobs:
${{ runner.os }}-test-${{ env.cache-name }}-
${{ runner.os }}-test-
${{ runner.os }}-
- name: "Install Dependencies and Run Tests"
- uses: julia-actions/julia-buildpkg@v1
with:
project: "lib/LuxCUDA"
- name: "Run Tests"
run: |
import Pkg
Pkg.Registry.update()
Pkg.instantiate()
Pkg.test(; coverage="user")
shell: julia --color=yes --code-coverage=user --depwarn=yes --project=lib/LuxCUDA {0}
- uses: julia-actions/julia-processcoverage@v1
Expand All @@ -54,7 +55,7 @@ jobs:
files: lcov.info
token: ${{ secrets.CODECOV_TOKEN }}
verbose: true
fail_ci_if_error: true
fail_ci_if_error: false

downgrade:
if: ${{ !contains(github.event.head_commit.message, '[skip tests]') && github.base_ref == github.event.repository.default_branch }}
Expand All @@ -65,11 +66,12 @@ jobs:
with:
version: "1.10"
- uses: julia-actions/julia-downgrade-compat@v1
- name: "Install Dependencies and Run Tests"
- uses: julia-actions/julia-buildpkg@v1
with:
project: "lib/LuxCUDA"
- name: "Run Tests"
run: |
import Pkg
Pkg.Registry.update()
Pkg.instantiate()
Pkg.test(; coverage="user")
shell: julia --color=yes --code-coverage=user --depwarn=yes --project=lib/LuxCUDA {0}
- uses: julia-actions/julia-processcoverage@v1
Expand All @@ -80,7 +82,7 @@ jobs:
files: lcov.info
token: ${{ secrets.CODECOV_TOKEN }}
verbose: true
fail_ci_if_error: true
fail_ci_if_error: false

env:
BACKEND_GROUP: "CPU"
24 changes: 12 additions & 12 deletions .github/workflows/CI_LuxCore.yml
Original file line number Diff line number Diff line change
Expand Up @@ -43,18 +43,18 @@ jobs:
${{ runner.os }}-test-${{ env.cache-name }}-
${{ runner.os }}-test-
${{ runner.os }}-
- name: "Install Dependencies"
- uses: julia-actions/julia-buildpkg@v1
with:
project: "lib/LuxCore"
- name: "Dev Test Dependencies"
run: |
import Pkg
Pkg.Registry.update()
Pkg.instantiate()
Pkg.activate("lib/LuxCore/test")
dev_pkgs = Pkg.PackageSpec[]
for pkg in ("lib/LuxCore", "lib/MLDataDevices")
push!(dev_pkgs, Pkg.PackageSpec(path=pkg))
end
Pkg.develop(dev_pkgs)
shell: julia --color=yes --code-coverage=user --depwarn=yes --project=lib/LuxCore {0}
shell: julia --color=yes --code-coverage=user --depwarn=yes --project=lib/LuxCore/test {0}
- name: "Run Tests"
run: |
import Pkg, LuxCore
Expand All @@ -69,7 +69,7 @@ jobs:
files: lcov.info
token: ${{ secrets.CODECOV_TOKEN }}
verbose: true
fail_ci_if_error: true
fail_ci_if_error: false

downgrade:
if: ${{ !contains(github.event.head_commit.message, '[skip tests]') && github.base_ref == github.event.repository.default_branch }}
Expand All @@ -84,18 +84,18 @@ jobs:
with:
version: ${{ matrix.version }}
- uses: julia-actions/julia-downgrade-compat@v1
- name: "Install Dependencies"
- uses: julia-actions/julia-buildpkg@v1
with:
project: "lib/LuxCore"
- name: "Dev Test Dependencies"
run: |
import Pkg
Pkg.Registry.update()
Pkg.instantiate()
Pkg.activate("lib/LuxCore/test")
dev_pkgs = Pkg.PackageSpec[]
for pkg in ("lib/LuxCore", "lib/MLDataDevices")
push!(dev_pkgs, Pkg.PackageSpec(path=pkg))
end
Pkg.develop(dev_pkgs)
shell: julia --color=yes --code-coverage=user --depwarn=yes --project=lib/LuxCore {0}
shell: julia --color=yes --code-coverage=user --depwarn=yes --project=lib/LuxCore/test {0}
- name: "Run Tests"
run: |
import Pkg, LuxCore
Expand All @@ -110,7 +110,7 @@ jobs:
files: lcov.info
token: ${{ secrets.CODECOV_TOKEN }}
verbose: true
fail_ci_if_error: true
fail_ci_if_error: false

env:
BACKEND_GROUP: "CPU"
Loading

1 comment on commit 2331c99

@github-actions
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Lux Benchmarks

Benchmark suite Current: 2331c99 Previous: 3986545 Ratio
layernorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/2 thread(s) 4125 ns 3792 ns 1.09
layernorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/4 thread(s) 4292 ns 4084 ns 1.05
layernorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/8 thread(s) 4875 ns 4834 ns 1.01
layernorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/1 thread(s) 4188 ns 3959 ns 1.06
layernorm(2, act=gelu, affine=false)(4 x 32)/forward/GPU/CUDA 61773 ns 61509.5 ns 1.00
layernorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/2 thread(s) 10375 ns 10500 ns 0.99
layernorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/4 thread(s) 10250 ns 10541 ns 0.97
layernorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/8 thread(s) 10709 ns 10250 ns 1.04
layernorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/1 thread(s) 10584 ns 10250 ns 1.03
layernorm(2, act=gelu, affine=false)(4 x 32)/zygote/GPU/CUDA 433806 ns 431498.5 ns 1.01
bias_activation(32, act=relu)(32 x 128)/forward/CPU/2 thread(s) 1209 ns 1062.5 ns 1.14
bias_activation(32, act=relu)(32 x 128)/forward/CPU/4 thread(s) 1208 ns 1167 ns 1.04
bias_activation(32, act=relu)(32 x 128)/forward/CPU/8 thread(s) 1334 ns 1417 ns 0.94
bias_activation(32, act=relu)(32 x 128)/forward/CPU/1 thread(s) 1333 ns 1208 ns 1.10
bias_activation(32, act=relu)(32 x 128)/forward/GPU/CUDA 18632 ns 18573 ns 1.00
bias_activation(32, act=relu)(32 x 128)/zygote/CPU/2 thread(s) 3958 ns 4000 ns 0.99
bias_activation(32, act=relu)(32 x 128)/zygote/CPU/4 thread(s) 3770.5 ns 4000 ns 0.94
bias_activation(32, act=relu)(32 x 128)/zygote/CPU/8 thread(s) 4250 ns 4209 ns 1.01
bias_activation(32, act=relu)(32 x 128)/zygote/CPU/1 thread(s) 3750 ns 3750 ns 1
bias_activation(32, act=relu)(32 x 128)/zygote/GPU/CUDA 111653 ns 111184 ns 1.00
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 57167 ns 57750 ns 0.99
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 46708 ns 38542 ns 1.21
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 47042 ns 46583 ns 1.01
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 85000 ns 82208 ns 1.03
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/CUDA 37778 ns 37503.5 ns 1.01
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 2021166.5 ns 2037645.5 ns 0.99
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 2091833 ns 2095625 ns 1.00
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 2090417 ns 1844375 ns 1.13
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 2037250 ns 2001375 ns 1.02
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 197839 ns 196039 ns 1.01
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 144125 ns 145583 ns 0.99
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 143687.5 ns 143584 ns 1.00
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 145875 ns 146458 ns 1.00
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 144542 ns 145000 ns 1.00
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/GPU/CUDA 166264.5 ns 168190 ns 0.99
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 815917 ns 1114291 ns 0.73
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 1110583 ns 1150292 ns 0.97
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 1128458 ns 805500 ns 1.40
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 1161791.5 ns 1122750 ns 1.03
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 531966.5 ns 526921 ns 1.01
layernorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/2 thread(s) 3834 ns 3292 ns 1.16
layernorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/4 thread(s) 3667 ns 3666 ns 1.00
layernorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/8 thread(s) 4208 ns 4167 ns 1.01
layernorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/1 thread(s) 3875 ns 3500 ns 1.11
layernorm(2, act=relu, affine=true)(4 x 32)/forward/GPU/CUDA 72027 ns 72235.5 ns 1.00
layernorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/2 thread(s) 9666 ns 10125 ns 0.95
layernorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/4 thread(s) 9208 ns 8375 ns 1.10
layernorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/8 thread(s) 9667 ns 8792 ns 1.10
layernorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/1 thread(s) 8791 ns 8833 ns 1.00
layernorm(2, act=relu, affine=true)(4 x 32)/zygote/GPU/CUDA 495388.5 ns 480020 ns 1.03
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 17250 ns 14875 ns 1.16
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 15292 ns 15000 ns 1.02
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 17750 ns 17520.5 ns 1.01
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 14875 ns 14583 ns 1.02
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/GPU/CUDA 54800 ns 53914 ns 1.02
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 213334 ns 214792 ns 0.99
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 213667 ns 214875 ns 0.99
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 215625 ns 214750 ns 1.00
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 213125 ns 226813 ns 0.94
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 273384.5 ns 272785 ns 1.00
bias_activation(2, act=relu)(2 x 128)/forward/CPU/2 thread(s) 625 ns 625 ns 1
bias_activation(2, act=relu)(2 x 128)/forward/CPU/4 thread(s) 500 ns 625 ns 0.80
bias_activation(2, act=relu)(2 x 128)/forward/CPU/8 thread(s) 834 ns 917 ns 0.91
bias_activation(2, act=relu)(2 x 128)/forward/CPU/1 thread(s) 542 ns 459 ns 1.18
bias_activation(2, act=relu)(2 x 128)/forward/GPU/CUDA 17538 ns 17774 ns 0.99
bias_activation(2, act=relu)(2 x 128)/zygote/CPU/2 thread(s) 1459 ns 1792 ns 0.81
bias_activation(2, act=relu)(2 x 128)/zygote/CPU/4 thread(s) 1625 ns 1417 ns 1.15
bias_activation(2, act=relu)(2 x 128)/zygote/CPU/8 thread(s) 1541 ns 1709 ns 0.90
bias_activation(2, act=relu)(2 x 128)/zygote/CPU/1 thread(s) 1584 ns 1417 ns 1.12
bias_activation(2, act=relu)(2 x 128)/zygote/GPU/CUDA 101749 ns 102929.5 ns 0.99
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 6625 ns 7167 ns 0.92
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 5833 ns 5250 ns 1.11
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 6000 ns 6000 ns 1
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 10541 ns 10000 ns 1.05
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/CUDA 23308 ns 23666 ns 0.98
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 230042 ns 225187.5 ns 1.02
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 228000 ns 237479.5 ns 0.96
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 229917 ns 229334 ns 1.00
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 215459 ns 226709 ns 0.95
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 167869.5 ns 168739 ns 0.99
dense(32, bias=false, act=relu)(32 x 128)/forward/CPU/2 thread(s) 3875 ns 3875 ns 1
dense(32, bias=false, act=relu)(32 x 128)/forward/CPU/4 thread(s) 3917 ns 3959 ns 0.99
dense(32, bias=false, act=relu)(32 x 128)/forward/CPU/8 thread(s) 3917 ns 3875 ns 1.01
dense(32, bias=false, act=relu)(32 x 128)/forward/CPU/1 thread(s) 3958 ns 3917 ns 1.01
dense(32, bias=false, act=relu)(32 x 128)/forward/GPU/CUDA 23769 ns 23839 ns 1.00
dense(32, bias=false, act=relu)(32 x 128)/zygote/CPU/2 thread(s) 16625 ns 16792 ns 0.99
dense(32, bias=false, act=relu)(32 x 128)/zygote/CPU/4 thread(s) 16645.5 ns 16833 ns 0.99
dense(32, bias=false, act=relu)(32 x 128)/zygote/CPU/8 thread(s) 16916 ns 16958 ns 1.00
dense(32, bias=false, act=relu)(32 x 128)/zygote/CPU/1 thread(s) 16542 ns 16750 ns 0.99
dense(32, bias=false, act=relu)(32 x 128)/zygote/GPU/CUDA 160993.5 ns 161365 ns 1.00
dense(512, bias=false, act=gelu)(512 x 128)/forward/CPU/2 thread(s) 583542 ns 571458 ns 1.02
dense(512, bias=false, act=gelu)(512 x 128)/forward/CPU/4 thread(s) 582166 ns 576000 ns 1.01
dense(512, bias=false, act=gelu)(512 x 128)/forward/CPU/8 thread(s) 573083 ns 574041 ns 1.00
dense(512, bias=false, act=gelu)(512 x 128)/forward/CPU/1 thread(s) 578334 ns 571458 ns 1.01
dense(512, bias=false, act=gelu)(512 x 128)/forward/GPU/CUDA 112908 ns 113559.5 ns 0.99
dense(512, bias=false, act=gelu)(512 x 128)/zygote/CPU/2 thread(s) 1416417 ns 1425375 ns 0.99
dense(512, bias=false, act=gelu)(512 x 128)/zygote/CPU/4 thread(s) 1413563 ns 1418875 ns 1.00
dense(512, bias=false, act=gelu)(512 x 128)/zygote/CPU/8 thread(s) 1420000 ns 1418958 ns 1.00
dense(512, bias=false, act=gelu)(512 x 128)/zygote/CPU/1 thread(s) 1427041.5 ns 1422750 ns 1.00
dense(512, bias=false, act=gelu)(512 x 128)/zygote/GPU/CUDA 209512.5 ns 210833 ns 0.99
lenet(28, 28, 1, 64)/forward/CPU/2 thread(s) 1074937.5 ns 1076645.5 ns 1.00
lenet(28, 28, 1, 64)/forward/CPU/4 thread(s) 961625 ns 934291 ns 1.03
lenet(28, 28, 1, 64)/forward/CPU/8 thread(s) 1349604 ns 1340187.5 ns 1.01
lenet(28, 28, 1, 64)/forward/CPU/1 thread(s) 1275750 ns 1294270.5 ns 0.99
lenet(28, 28, 1, 64)/forward/GPU/CUDA 272786 ns 271656 ns 1.00
lenet(28, 28, 1, 64)/zygote/CPU/2 thread(s) 5988250 ns 5796417 ns 1.03
lenet(28, 28, 1, 64)/zygote/CPU/4 thread(s) 4453229 ns 4651792 ns 0.96
lenet(28, 28, 1, 64)/zygote/CPU/8 thread(s) 4954875 ns 4918209 ns 1.01
lenet(28, 28, 1, 64)/zygote/CPU/1 thread(s) 5751250 ns 5515938 ns 1.04
lenet(28, 28, 1, 64)/zygote/GPU/CUDA 1067705 ns 1071316.5 ns 1.00
dense(2, bias=true, act=relu)(2 x 128)/forward/CPU/2 thread(s) 500 ns 542 ns 0.92
dense(2, bias=true, act=relu)(2 x 128)/forward/CPU/4 thread(s) 541 ns 583 ns 0.93
dense(2, bias=true, act=relu)(2 x 128)/forward/CPU/8 thread(s) 583 ns 500 ns 1.17
dense(2, bias=true, act=relu)(2 x 128)/forward/CPU/1 thread(s) 583 ns 500 ns 1.17
dense(2, bias=true, act=relu)(2 x 128)/forward/GPU/CUDA 23552 ns 23948.5 ns 0.98
dense(2, bias=true, act=relu)(2 x 128)/zygote/CPU/2 thread(s) 2125 ns 2167 ns 0.98
dense(2, bias=true, act=relu)(2 x 128)/zygote/CPU/4 thread(s) 2125 ns 2209 ns 0.96
dense(2, bias=true, act=relu)(2 x 128)/zygote/CPU/8 thread(s) 2167 ns 2125 ns 1.02
dense(2, bias=true, act=relu)(2 x 128)/zygote/CPU/1 thread(s) 2125 ns 2125 ns 1
dense(2, bias=true, act=relu)(2 x 128)/zygote/GPU/CUDA 171901 ns 169153 ns 1.02
layernorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/2 thread(s) 4208.5 ns 3625 ns 1.16
layernorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/4 thread(s) 4417 ns 4084 ns 1.08
layernorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/8 thread(s) 5042 ns 4687.5 ns 1.08
layernorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/1 thread(s) 4166 ns 3709 ns 1.12
layernorm(2, act=relu, affine=false)(32 x 32)/forward/GPU/CUDA 65093 ns 66303.5 ns 0.98
layernorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/2 thread(s) 11292 ns 11270.5 ns 1.00
layernorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/4 thread(s) 11292 ns 11417 ns 0.99
layernorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/8 thread(s) 11875 ns 11625 ns 1.02
layernorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/1 thread(s) 11417 ns 10667 ns 1.07
layernorm(2, act=relu, affine=false)(32 x 32)/zygote/GPU/CUDA 448429 ns 456550 ns 0.98
groupnorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/2 thread(s) 7020.5 ns 6312.5 ns 1.11
groupnorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/4 thread(s) 7041 ns 6770.5 ns 1.04
groupnorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/8 thread(s) 7625 ns 7792 ns 0.98
groupnorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/1 thread(s) 6500 ns 7083 ns 0.92
groupnorm(2, act=relu, affine=true)(32 x 32)/forward/GPU/CUDA 52253 ns 52528 ns 0.99
groupnorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/2 thread(s) 16979.5 ns 18375 ns 0.92
groupnorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/4 thread(s) 17833 ns 17833 ns 1
groupnorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/8 thread(s) 18875 ns 17791 ns 1.06
groupnorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/1 thread(s) 16875 ns 16833 ns 1.00
groupnorm(2, act=relu, affine=true)(32 x 32)/zygote/GPU/CUDA 301549.5 ns 301396 ns 1.00
batchnorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/2 thread(s) 584 ns 625 ns 0.93
batchnorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/4 thread(s) 583 ns 625 ns 0.93
batchnorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/8 thread(s) 625 ns 542 ns 1.15
batchnorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/1 thread(s) 542 ns 584 ns 0.93
batchnorm(2, act=identity, affine=false)(32 x 32)/forward/GPU/CUDA 32680 ns 32972 ns 0.99
batchnorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/2 thread(s) 8750 ns 9020.5 ns 0.97
batchnorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/4 thread(s) 8834 ns 8459 ns 1.04
batchnorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/8 thread(s) 9625 ns 9041 ns 1.06
batchnorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/1 thread(s) 8667 ns 8708 ns 1.00
batchnorm(2, act=identity, affine=false)(32 x 32)/zygote/GPU/CUDA 156693 ns 159042.5 ns 0.99
dense(512, bias=false, act=identity)(512 x 128)/forward/CPU/2 thread(s) 64125 ns 64542 ns 0.99
dense(512, bias=false, act=identity)(512 x 128)/forward/CPU/4 thread(s) 64291 ns 64895.5 ns 0.99
dense(512, bias=false, act=identity)(512 x 128)/forward/CPU/8 thread(s) 64458 ns 64292 ns 1.00
dense(512, bias=false, act=identity)(512 x 128)/forward/CPU/1 thread(s) 64584 ns 64542 ns 1.00
dense(512, bias=false, act=identity)(512 x 128)/forward/GPU/CUDA 111163 ns 110877 ns 1.00
dense(512, bias=false, act=identity)(512 x 128)/zygote/CPU/2 thread(s) 280625 ns 284875 ns 0.99
dense(512, bias=false, act=identity)(512 x 128)/zygote/CPU/4 thread(s) 274250 ns 297937.5 ns 0.92
dense(512, bias=false, act=identity)(512 x 128)/zygote/CPU/8 thread(s) 278083 ns 282333 ns 0.98
dense(512, bias=false, act=identity)(512 x 128)/zygote/CPU/1 thread(s) 289292 ns 274104.5 ns 1.06
dense(512, bias=false, act=identity)(512 x 128)/zygote/GPU/CUDA 184761.5 ns 184904.5 ns 1.00
mlp7layer_bn(gelu)(32 x 256)/forward/CPU/2 thread(s) 3374250 ns 3295541 ns 1.02
mlp7layer_bn(gelu)(32 x 256)/forward/CPU/4 thread(s) 3022020.5 ns 2811062.5 ns 1.08
mlp7layer_bn(gelu)(32 x 256)/forward/CPU/8 thread(s) 3033167 ns 3016125 ns 1.01
mlp7layer_bn(gelu)(32 x 256)/forward/CPU/1 thread(s) 4059271.5 ns 3935209 ns 1.03
mlp7layer_bn(gelu)(32 x 256)/forward/GPU/CUDA 577014 ns 572132 ns 1.01
mlp7layer_bn(gelu)(32 x 256)/zygote/CPU/2 thread(s) 7622583.5 ns 7478250 ns 1.02
mlp7layer_bn(gelu)(32 x 256)/zygote/CPU/4 thread(s) 7400875 ns 7348937.5 ns 1.01
mlp7layer_bn(gelu)(32 x 256)/zygote/CPU/8 thread(s) 7463083 ns 7339479.5 ns 1.02
mlp7layer_bn(gelu)(32 x 256)/zygote/CPU/1 thread(s) 8222208 ns 8212959 ns 1.00
mlp7layer_bn(gelu)(32 x 256)/zygote/GPU/CUDA 1350413 ns 1367334 ns 0.99
mlp7layer_bn(gelu)(32 x 256)/enzyme/CPU/2 thread(s) 18744750 ns 18775625 ns 1.00
mlp7layer_bn(gelu)(32 x 256)/enzyme/CPU/4 thread(s) 19149375 ns 19121334 ns 1.00
mlp7layer_bn(gelu)(32 x 256)/enzyme/CPU/8 thread(s) 19037709 ns 19108667 ns 1.00
mlp7layer_bn(gelu)(32 x 256)/enzyme/CPU/1 thread(s) 15854917 ns 15653542 ns 1.01
Conv((3, 3), 32 => 32, relu)(64 x 64 x 32 x 128)/forward/CPU/2 thread(s) 23424208 ns 23560250 ns 0.99
Conv((3, 3), 32 => 32, relu)(64 x 64 x 32 x 128)/forward/CPU/4 thread(s) 33648791 ns 42472875 ns 0.79
Conv((3, 3), 32 => 32, relu)(64 x 64 x 32 x 128)/forward/CPU/8 thread(s) 37255625 ns 37127771 ns 1.00
Conv((3, 3), 32 => 32, relu)(64 x 64 x 32 x 128)/forward/CPU/1 thread(s) 35462146 ns 34865500 ns 1.02
Conv((3, 3), 32 => 32, relu)(64 x 64 x 32 x 128)/forward/GPU/CUDA 1854361 ns 1862818 ns 1.00
Conv((3, 3), 32 => 32, relu)(64 x 64 x 32 x 128)/zygote/CPU/2 thread(s) 189507459 ns 188025167 ns 1.01
Conv((3, 3), 32 => 32, relu)(64 x 64 x 32 x 128)/zygote/CPU/4 thread(s) 163150563 ns 176960479.5 ns 0.92
Conv((3, 3), 32 => 32, relu)(64 x 64 x 32 x 128)/zygote/CPU/8 thread(s) 151759708 ns 152823708 ns 0.99
Conv((3, 3), 32 => 32, relu)(64 x 64 x 32 x 128)/zygote/CPU/1 thread(s) 449307375 ns 441336000 ns 1.02
Conv((3, 3), 32 => 32, relu)(64 x 64 x 32 x 128)/zygote/GPU/CUDA 13915090 ns 13912250 ns 1.00
Conv((3, 3), 32 => 32, relu)(64 x 64 x 32 x 128)/enzyme/CPU/2 thread(s) 290474792 ns 290589750 ns 1.00
Conv((3, 3), 32 => 32, relu)(64 x 64 x 32 x 128)/enzyme/CPU/4 thread(s) 338390437.5 ns 276449542 ns 1.22
Conv((3, 3), 32 => 32, relu)(64 x 64 x 32 x 128)/enzyme/CPU/8 thread(s) 298728666 ns 296753875 ns 1.01
Conv((3, 3), 32 => 32, relu)(64 x 64 x 32 x 128)/enzyme/CPU/1 thread(s) 400176437.5 ns 333259041 ns 1.20
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 24666 ns 22875 ns 1.08
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 23062.5 ns 23333 ns 0.99
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 25125 ns 24125 ns 1.04
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 21833 ns 23542 ns 0.93
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/GPU/CUDA 95619.5 ns 98041.5 ns 0.98
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 103041 ns 103625 ns 0.99
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 103750 ns 135834 ns 0.76
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 104584 ns 105084 ns 1.00
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 104146 ns 103250 ns 1.01
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 500114.5 ns 518052 ns 0.97
layernorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/2 thread(s) 6042 ns 6209 ns 0.97
layernorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/4 thread(s) 6500 ns 6500 ns 1
layernorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/8 thread(s) 6667 ns 7041.5 ns 0.95
layernorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/1 thread(s) 5958 ns 5959 ns 1.00
layernorm(2, act=relu, affine=true)(32 x 32)/forward/GPU/CUDA 68217 ns 70884 ns 0.96
layernorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/2 thread(s) 14833 ns 15084 ns 0.98
layernorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/4 thread(s) 16208 ns 15708 ns 1.03
layernorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/8 thread(s) 16542 ns 16250 ns 1.02
layernorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/1 thread(s) 15541.5 ns 14770.5 ns 1.05
layernorm(2, act=relu, affine=true)(32 x 32)/zygote/GPU/CUDA 474515 ns 492747 ns 0.96
Conv((3, 3), 4 => 4, identity)(64 x 64 x 4 x 128)/forward/CPU/2 thread(s) 3028583 ns 3001020.5 ns 1.01
Conv((3, 3), 4 => 4, identity)(64 x 64 x 4 x 128)/forward/CPU/4 thread(s) 2072250 ns 2085333 ns 0.99
Conv((3, 3), 4 => 4, identity)(64 x 64 x 4 x 128)/forward/CPU/8 thread(s) 2258958 ns 2274000 ns 0.99
Conv((3, 3), 4 => 4, identity)(64 x 64 x 4 x 128)/forward/CPU/1 thread(s) 4727250 ns 4550083 ns 1.04
Conv((3, 3), 4 => 4, identity)(64 x 64 x 4 x 128)/forward/GPU/CUDA 581996.5 ns 589071 ns 0.99
Conv((3, 3), 4 => 4, identity)(64 x 64 x 4 x 128)/zygote/CPU/2 thread(s) 23485750 ns 23511750 ns 1.00
Conv((3, 3), 4 => 4, identity)(64 x 64 x 4 x 128)/zygote/CPU/4 thread(s) 18074583 ns 18279542 ns 0.99
Conv((3, 3), 4 => 4, identity)(64 x 64 x 4 x 128)/zygote/CPU/8 thread(s) 17953667 ns 16979209 ns 1.06
Conv((3, 3), 4 => 4, identity)(64 x 64 x 4 x 128)/zygote/CPU/1 thread(s) 36188354.5 ns 35598583 ns 1.02
Conv((3, 3), 4 => 4, identity)(64 x 64 x 4 x 128)/zygote/GPU/CUDA 3102669 ns 3111231 ns 1.00
Conv((3, 3), 4 => 4, identity)(64 x 64 x 4 x 128)/enzyme/CPU/2 thread(s) 33313750 ns 33266500 ns 1.00
Conv((3, 3), 4 => 4, identity)(64 x 64 x 4 x 128)/enzyme/CPU/4 thread(s) 27588229.5 ns 28064750 ns 0.98
Conv((3, 3), 4 => 4, identity)(64 x 64 x 4 x 128)/enzyme/CPU/8 thread(s) 27385167 ns 27365500 ns 1.00
Conv((3, 3), 4 => 4, identity)(64 x 64 x 4 x 128)/enzyme/CPU/1 thread(s) 42266896 ns 41824541.5 ns 1.01
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 72125 ns 71750 ns 1.01
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 75625 ns 74021 ns 1.02
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 75209 ns 74875 ns 1.00
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 72313 ns 73458 ns 0.98
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/CUDA 102770.5 ns 104698 ns 0.98
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 217709 ns 314125.5 ns 0.69
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 264292 ns 212229 ns 1.25
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 208812 ns 323000 ns 0.65
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 216750 ns 218042 ns 0.99
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 548643 ns 559024 ns 0.98
layernorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/2 thread(s) 11834 ns 11625 ns 1.02
layernorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/4 thread(s) 13750 ns 12292 ns 1.12
layernorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/8 thread(s) 12208 ns 12500 ns 0.98
layernorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/1 thread(s) 11791.5 ns 11875 ns 0.99
layernorm(2, act=gelu, affine=true)(32 x 32)/forward/GPU/CUDA 71431.5 ns 73943 ns 0.97
layernorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/2 thread(s) 26500 ns 26583 ns 1.00
layernorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/4 thread(s) 27375 ns 26667 ns 1.03
layernorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/8 thread(s) 28000 ns 27708 ns 1.01
layernorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/1 thread(s) 27167 ns 26666 ns 1.02
layernorm(2, act=gelu, affine=true)(32 x 32)/zygote/GPU/CUDA 474755 ns 493150 ns 0.96
groupnorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/2 thread(s) 12292 ns 12208 ns 1.01
groupnorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/4 thread(s) 13250 ns 12896 ns 1.03
groupnorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/8 thread(s) 13625 ns 13916 ns 0.98
groupnorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/1 thread(s) 12625 ns 12500 ns 1.01
groupnorm(2, act=gelu, affine=true)(32 x 32)/forward/GPU/CUDA 53420 ns 54608 ns 0.98
groupnorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/2 thread(s) 25708 ns 26125 ns 0.98
groupnorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/4 thread(s) 26084 ns 26000 ns 1.00
groupnorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/8 thread(s) 26375 ns 25916.5 ns 1.02
groupnorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/1 thread(s) 26209 ns 26000 ns 1.01
groupnorm(2, act=gelu, affine=true)(32 x 32)/zygote/GPU/CUDA 305780 ns 315887.5 ns 0.97
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 181833 ns 179208 ns 1.01
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 182750 ns 183145.5 ns 1.00
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 182000 ns 183166 ns 0.99
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 179750 ns 180125 ns 1.00
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/CUDA 56584 ns 58575 ns 0.97
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 582667 ns 582958.5 ns 1.00
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 589020.5 ns 596541.5 ns 0.99
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 585562.5 ns 583833 ns 1.00
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 582875 ns 582834 ns 1.00
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 286509.5 ns 294599.5 ns 0.97
layernorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/2 thread(s) 5958 ns 6292 ns 0.95
layernorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/4 thread(s) 7000 ns 6459 ns 1.08
layernorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/8 thread(s) 6917 ns 6750 ns 1.02
layernorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/1 thread(s) 6167 ns 6041 ns 1.02
layernorm(2, act=identity, affine=true)(32 x 32)/forward/GPU/CUDA 71314 ns 72806 ns 0.98
layernorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/2 thread(s) 14041.5 ns 14542 ns 0.97
layernorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/4 thread(s) 15042 ns 13333 ns 1.13
layernorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/8 thread(s) 15334 ns 15667 ns 0.98
layernorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/1 thread(s) 15042 ns 14333 ns 1.05
layernorm(2, act=identity, affine=true)(32 x 32)/zygote/GPU/CUDA 465404.5 ns 482192.5 ns 0.97
batchedmm(512, Bsize=4)/forward/CPU/2 thread(s) 1163666 ns 1177728.5 ns 0.99
batchedmm(512, Bsize=4)/forward/CPU/4 thread(s) 1608417 ns 1356208.5 ns 1.19
batchedmm(512, Bsize=4)/forward/CPU/8 thread(s) 1245958 ns 1250750 ns 1.00
batchedmm(512, Bsize=4)/forward/CPU/1 thread(s) 1315062.5 ns 1317541 ns 1.00
batchedmm(512, Bsize=4)/forward/GPU/CUDA 301860.5 ns 301448 ns 1.00
batchedmm(512, Bsize=4)/zygote/CPU/2 thread(s) 4119833.5 ns 4117688 ns 1.00
batchedmm(512, Bsize=4)/zygote/CPU/4 thread(s) 4367812.5 ns 4491417 ns 0.97
batchedmm(512, Bsize=4)/zygote/CPU/8 thread(s) 4633625 ns 4696854.5 ns 0.99
batchedmm(512, Bsize=4)/zygote/CPU/1 thread(s) 4681521 ns 4452542 ns 1.05
batchedmm(512, Bsize=4)/zygote/GPU/CUDA 1040008 ns 1051206.5 ns 0.99
dense(2, bias=true, act=gelu)(2 x 128)/forward/CPU/2 thread(s) 1875 ns 1875 ns 1
dense(2, bias=true, act=gelu)(2 x 128)/forward/CPU/4 thread(s) 1875 ns 1875 ns 1
dense(2, bias=true, act=gelu)(2 x 128)/forward/CPU/8 thread(s) 1875 ns 1833 ns 1.02
dense(2, bias=true, act=gelu)(2 x 128)/forward/CPU/1 thread(s) 1916 ns 1875 ns 1.02
dense(2, bias=true, act=gelu)(2 x 128)/forward/GPU/CUDA 23628.5 ns 24165 ns 0.98
dense(2, bias=true, act=gelu)(2 x 128)/zygote/CPU/2 thread(s) 4875 ns 5000 ns 0.97
dense(2, bias=true, act=gelu)(2 x 128)/zygote/CPU/4 thread(s) 4875 ns 4958 ns 0.98
dense(2, bias=true, act=gelu)(2 x 128)/zygote/CPU/8 thread(s) 4917 ns 4917 ns 1
dense(2, bias=true, act=gelu)(2 x 128)/zygote/CPU/1 thread(s) 4875 ns 4875 ns 1
dense(2, bias=true, act=gelu)(2 x 128)/zygote/GPU/CUDA 188198 ns 194564.5 ns 0.97
groupnorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/2 thread(s) 5959 ns 6041 ns 0.99
groupnorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/4 thread(s) 6333 ns 6000 ns 1.06
groupnorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/8 thread(s) 6584 ns 6145.5 ns 1.07
groupnorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/1 thread(s) 5625 ns 5958 ns 0.94
groupnorm(2, act=relu, affine=true)(4 x 32)/forward/GPU/CUDA 55698 ns 57313.5 ns 0.97
groupnorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/2 thread(s) 10958 ns 11979.5 ns 0.91
groupnorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/4 thread(s) 11875 ns 11854.5 ns 1.00
groupnorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/8 thread(s) 11667 ns 11042 ns 1.06
groupnorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/1 thread(s) 11041.5 ns 11292 ns 0.98
groupnorm(2, act=relu, affine=true)(4 x 32)/zygote/GPU/CUDA 330993.5 ns 342366 ns 0.97
dense(2, bias=false, act=relu)(2 x 128)/forward/CPU/2 thread(s) 292 ns 333 ns 0.88
dense(2, bias=false, act=relu)(2 x 128)/forward/CPU/4 thread(s) 375 ns 333 ns 1.13
dense(2, bias=false, act=relu)(2 x 128)/forward/CPU/8 thread(s) 334 ns 333 ns 1.00
dense(2, bias=false, act=relu)(2 x 128)/forward/CPU/1 thread(s) 375 ns 333 ns 1.13
dense(2, bias=false, act=relu)(2 x 128)/forward/GPU/CUDA 23016 ns 23004 ns 1.00
dense(2, bias=false, act=relu)(2 x 128)/zygote/CPU/2 thread(s) 2791 ns 3000 ns 0.93
dense(2, bias=false, act=relu)(2 x 128)/zygote/CPU/4 thread(s) 2750 ns 2750 ns 1
dense(2, bias=false, act=relu)(2 x 128)/zygote/CPU/8 thread(s) 3083 ns 3000 ns 1.03
dense(2, bias=false, act=relu)(2 x 128)/zygote/CPU/1 thread(s) 2792 ns 2750 ns 1.02
dense(2, bias=false, act=relu)(2 x 128)/zygote/GPU/CUDA 158081 ns 159207 ns 0.99
groupnorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/2 thread(s) 12000 ns 11583 ns 1.04
groupnorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/4 thread(s) 12292 ns 11292 ns 1.09
groupnorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/8 thread(s) 12979 ns 13437.5 ns 0.97
groupnorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/1 thread(s) 11500 ns 11708.5 ns 0.98
groupnorm(2, act=gelu, affine=false)(32 x 32)/forward/GPU/CUDA 56764.5 ns 57286.5 ns 0.99
groupnorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/2 thread(s) 25250 ns 25312.5 ns 1.00
groupnorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/4 thread(s) 25292 ns 25083 ns 1.01
groupnorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/8 thread(s) 25542 ns 25334 ns 1.01
groupnorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/1 thread(s) 25125 ns 25167 ns 1.00
groupnorm(2, act=gelu, affine=false)(32 x 32)/zygote/GPU/CUDA 293131 ns 296722 ns 0.99
dense(32, bias=true, act=relu)(32 x 128)/forward/CPU/2 thread(s) 4167 ns 4208 ns 0.99
dense(32, bias=true, act=relu)(32 x 128)/forward/CPU/4 thread(s) 4209 ns 4208 ns 1.00
dense(32, bias=true, act=relu)(32 x 128)/forward/CPU/8 thread(s) 4250 ns 4167 ns 1.02
dense(32, bias=true, act=relu)(32 x 128)/forward/CPU/1 thread(s) 4250 ns 4167 ns 1.02
dense(32, bias=true, act=relu)(32 x 128)/forward/GPU/CUDA 24851 ns 25099 ns 0.99
dense(32, bias=true, act=relu)(32 x 128)/zygote/CPU/2 thread(s) 16084 ns 16125 ns 1.00
dense(32, bias=true, act=relu)(32 x 128)/zygote/CPU/4 thread(s) 16084 ns 16041 ns 1.00
dense(32, bias=true, act=relu)(32 x 128)/zygote/CPU/8 thread(s) 16250 ns 16166 ns 1.01
dense(32, bias=true, act=relu)(32 x 128)/zygote/CPU/1 thread(s) 16125 ns 16042 ns 1.01
dense(32, bias=true, act=relu)(32 x 128)/zygote/GPU/CUDA 193865.5 ns 199370.5 ns 0.97
batchnorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/2 thread(s) 5875 ns 5833 ns 1.01
batchnorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/4 thread(s) 5833 ns 5833 ns 1
batchnorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/8 thread(s) 5833 ns 5792 ns 1.01
batchnorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/1 thread(s) 5833 ns 5833 ns 1
batchnorm(2, act=gelu, affine=false)(32 x 32)/forward/GPU/CUDA 33648.5 ns 33986 ns 0.99
batchnorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/2 thread(s) 20937.5 ns 21083 ns 0.99
batchnorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/4 thread(s) 20875 ns 21125 ns 0.99
batchnorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/8 thread(s) 21375 ns 21208 ns 1.01
batchnorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/1 thread(s) 20833 ns 20667 ns 1.01
batchnorm(2, act=gelu, affine=false)(32 x 32)/zygote/GPU/CUDA 175295.5 ns 176941.5 ns 0.99
batchedmm(16, Bsize=512)/forward/CPU/2 thread(s) 405354.5 ns 396792 ns 1.02
batchedmm(16, Bsize=512)/forward/CPU/4 thread(s) 383146 ns 354313 ns 1.08
batchedmm(16, Bsize=512)/forward/CPU/8 thread(s) 487375 ns 489167 ns 1.00
batchedmm(16, Bsize=512)/forward/CPU/1 thread(s) 505333 ns 521584 ns 0.97
batchedmm(16, Bsize=512)/forward/GPU/CUDA 67095 ns 66831 ns 1.00
batchedmm(16, Bsize=512)/zygote/CPU/2 thread(s) 921500 ns 1005417 ns 0.92
batchedmm(16, Bsize=512)/zygote/CPU/4 thread(s) 879833.5 ns 876583 ns 1.00
batchedmm(16, Bsize=512)/zygote/CPU/8 thread(s) 1239500 ns 1235667 ns 1.00
batchedmm(16, Bsize=512)/zygote/CPU/1 thread(s) 1413875 ns 1420854 ns 1.00
batchedmm(16, Bsize=512)/zygote/GPU/CUDA 190914 ns 191762.5 ns 1.00
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 80792 ns 80250 ns 1.01
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 80625 ns 80209 ns 1.01
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 82416.5 ns 84167 ns 0.98
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 82208.5 ns 81125 ns 1.01
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/GPU/CUDA 193084 ns 193433 ns 1.00
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 1921166 ns 1916083 ns 1.00
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 1923375 ns 1933854 ns 0.99
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 1702792 ns 1917917 ns 0.89
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 1942625 ns 1923708.5 ns 1.01
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 397267 ns 409629 ns 0.97
dense(2, bias=true, act=identity)(2 x 128)/forward/CPU/2 thread(s) 292 ns 333 ns 0.88
dense(2, bias=true, act=identity)(2 x 128)/forward/CPU/4 thread(s) 292 ns 292 ns 1
dense(2, bias=true, act=identity)(2 x 128)/forward/CPU/8 thread(s) 292 ns 292 ns 1
dense(2, bias=true, act=identity)(2 x 128)/forward/CPU/1 thread(s) 292 ns 333 ns 0.88
dense(2, bias=true, act=identity)(2 x 128)/forward/GPU/CUDA 22298 ns 22197 ns 1.00
dense(2, bias=true, act=identity)(2 x 128)/zygote/CPU/2 thread(s) 1792 ns 1834 ns 0.98
dense(2, bias=true, act=identity)(2 x 128)/zygote/CPU/4 thread(s) 1792 ns 1875 ns 0.96
dense(2, bias=true, act=identity)(2 x 128)/zygote/CPU/8 thread(s) 1875 ns 1834 ns 1.02
dense(2, bias=true, act=identity)(2 x 128)/zygote/CPU/1 thread(s) 1792 ns 1833 ns 0.98
dense(2, bias=true, act=identity)(2 x 128)/zygote/GPU/CUDA 171128.5 ns 170854.5 ns 1.00
groupnorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/2 thread(s) 6750 ns 6791 ns 0.99
groupnorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/4 thread(s) 7125 ns 6417 ns 1.11
groupnorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/8 thread(s) 7750 ns 7375 ns 1.05
groupnorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/1 thread(s) 6583 ns 6959 ns 0.95
groupnorm(2, act=gelu, affine=false)(4 x 32)/forward/GPU/CUDA 60207.5 ns 61202 ns 0.98
groupnorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/2 thread(s) 9334 ns 9291.5 ns 1.00
groupnorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/4 thread(s) 9458 ns 9166.5 ns 1.03
groupnorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/8 thread(s) 9458 ns 9375 ns 1.01
groupnorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/1 thread(s) 9500 ns 9334 ns 1.02
groupnorm(2, act=gelu, affine=false)(4 x 32)/zygote/GPU/CUDA 309332.5 ns 313492.5 ns 0.99
Conv((3, 3), 64 => 64, relu)(64 x 64 x 64 x 128)/forward/CPU/2 thread(s) 118908083 ns 120748834 ns 0.98
Conv((3, 3), 64 => 64, relu)(64 x 64 x 64 x 128)/forward/CPU/4 thread(s) 173905459 ns 181703729 ns 0.96
Conv((3, 3), 64 => 64, relu)(64 x 64 x 64 x 128)/forward/CPU/8 thread(s) 148147000 ns 148437750 ns 1.00
Conv((3, 3), 64 => 64, relu)(64 x 64 x 64 x 128)/forward/CPU/1 thread(s) 104063562 ns 104851584 ns 0.99
Conv((3, 3), 64 => 64, relu)(64 x 64 x 64 x 128)/forward/GPU/CUDA 5483006 ns 5474996 ns 1.00
Conv((3, 3), 64 => 64, relu)(64 x 64 x 64 x 128)/zygote/CPU/2 thread(s) 615077271 ns 616853125 ns 1.00
Conv((3, 3), 64 => 64, relu)(64 x 64 x 64 x 128)/zygote/CPU/4 thread(s) 556251208 ns 579539270.5 ns 0.96
Conv((3, 3), 64 => 64, relu)(64 x 64 x 64 x 128)/zygote/CPU/8 thread(s) 456191166.5 ns 451846854.5 ns 1.01
Conv((3, 3), 64 => 64, relu)(64 x 64 x 64 x 128)/zygote/CPU/1 thread(s) 775264354 ns 757165312.5 ns 1.02
Conv((3, 3), 64 => 64, relu)(64 x 64 x 64 x 128)/zygote/GPU/CUDA 38217009 ns 34944567 ns 1.09
Conv((3, 3), 64 => 64, relu)(64 x 64 x 64 x 128)/enzyme/CPU/2 thread(s) 651954834 ns 649889209 ns 1.00
Conv((3, 3), 64 => 64, relu)(64 x 64 x 64 x 128)/enzyme/CPU/4 thread(s) 668816521 ns 688661771 ns 0.97
Conv((3, 3), 64 => 64, relu)(64 x 64 x 64 x 128)/enzyme/CPU/8 thread(s) 584471208 ns 592710229 ns 0.99
Conv((3, 3), 64 => 64, relu)(64 x 64 x 64 x 128)/enzyme/CPU/1 thread(s) 743364500 ns 741917708 ns 1.00
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 59041 ns 59750 ns 0.99
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 47167 ns 38959 ns 1.21
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 48042 ns 48000 ns 1.00
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 85604.5 ns 83416 ns 1.03
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/GPU/CUDA 38577 ns 37459 ns 1.03
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 1921792 ns 1922792 ns 1.00
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 1983375 ns 1985083 ns 1.00
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 1974021 ns 1978104 ns 1.00
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 1888041.5 ns 1893917 ns 1.00
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 177270 ns 174160 ns 1.02
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 267667 ns 290625 ns 0.92
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 269500 ns 266708 ns 1.01
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 269000 ns 271521 ns 0.99
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 265375 ns 268167 ns 0.99
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/CUDA 129439 ns 132776.5 ns 0.97
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 602875 ns 657229.5 ns 0.92
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 667625 ns 681187.5 ns 0.98
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 589104 ns 691583 ns 0.85
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 696166.5 ns 597417 ns 1.17
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 698695 ns 713916 ns 0.98
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 2214416 ns 2243937 ns 0.99
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 2132916.5 ns 2191895.5 ns 0.97
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 2099687.5 ns 2213542 ns 0.95
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 2218542 ns 2180437.5 ns 1.02
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/CUDA 135139.5 ns 133381 ns 1.01
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 5496500 ns 5496875 ns 1.00
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 5493084 ns 5583292 ns 0.98
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 5512750 ns 5498250 ns 1.00
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 5608375 ns 5492750.5 ns 1.02
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 786813 ns 753967 ns 1.04
dense(512, bias=true, act=gelu)(512 x 128)/forward/CPU/2 thread(s) 645084 ns 636833 ns 1.01
dense(512, bias=true, act=gelu)(512 x 128)/forward/CPU/4 thread(s) 646042 ns 644417 ns 1.00
dense(512, bias=true, act=gelu)(512 x 128)/forward/CPU/8 thread(s) 643042 ns 645333 ns 1.00
dense(512, bias=true, act=gelu)(512 x 128)/forward/CPU/1 thread(s) 645042 ns 637292 ns 1.01
dense(512, bias=true, act=gelu)(512 x 128)/forward/GPU/CUDA 47537 ns 46993.5 ns 1.01
dense(512, bias=true, act=gelu)(512 x 128)/zygote/CPU/2 thread(s) 1818666 ns 1826042 ns 1.00
dense(512, bias=true, act=gelu)(512 x 128)/zygote/CPU/4 thread(s) 1720625 ns 1667083 ns 1.03
dense(512, bias=true, act=gelu)(512 x 128)/zygote/CPU/8 thread(s) 1727375 ns 1726542 ns 1.00
dense(512, bias=true, act=gelu)(512 x 128)/zygote/CPU/1 thread(s) 2097625 ns 2105854.5 ns 1.00
dense(512, bias=true, act=gelu)(512 x 128)/zygote/GPU/CUDA 225809.5 ns 222295 ns 1.02
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 58458 ns 58500 ns 1.00
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 46958 ns 38708 ns 1.21
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 47500 ns 47250 ns 1.01
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 85709 ns 84292 ns 1.02
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/CUDA 29149.5 ns 28598 ns 1.02
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 2024312 ns 2031041 ns 1.00
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 2089792 ns 2099020.5 ns 1.00
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 2079417 ns 2091916.5 ns 0.99
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 2030812.5 ns 1856417 ns 1.09
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 192873 ns 190652 ns 1.01
Conv((3, 3), 4 => 4, gelu)(64 x 64 x 4 x 128)/forward/CPU/2 thread(s) 13367875 ns 13391395.5 ns 1.00
Conv((3, 3), 4 => 4, gelu)(64 x 64 x 4 x 128)/forward/CPU/4 thread(s) 12448375 ns 12453250 ns 1.00
Conv((3, 3), 4 => 4, gelu)(64 x 64 x 4 x 128)/forward/CPU/8 thread(s) 12498688 ns 12557375.5 ns 1.00
Conv((3, 3), 4 => 4, gelu)(64 x 64 x 4 x 128)/forward/CPU/1 thread(s) 15196500 ns 15140541 ns 1.00
Conv((3, 3), 4 => 4, gelu)(64 x 64 x 4 x 128)/forward/GPU/CUDA 515450 ns 514312 ns 1.00
Conv((3, 3), 4 => 4, gelu)(64 x 64 x 4 x 128)/zygote/CPU/2 thread(s) 47301125 ns 47481750 ns 1.00
Conv((3, 3), 4 => 4, gelu)(64 x 64 x 4 x 128)/zygote/CPU/4 thread(s) 41737208 ns 41986250 ns 0.99
Conv((3, 3), 4 => 4, gelu)(64 x 64 x 4 x 128)/zygote/CPU/8 thread(s) 41031917 ns 40944792 ns 1.00
Conv((3, 3), 4 => 4, gelu)(64 x 64 x 4 x 128)/zygote/CPU/1 thread(s) 59054000 ns 57945917 ns 1.02
Conv((3, 3), 4 => 4, gelu)(64 x 64 x 4 x 128)/zygote/GPU/CUDA 3246636.5 ns 3259544 ns 1.00
Conv((3, 3), 4 => 4, gelu)(64 x 64 x 4 x 128)/enzyme/CPU/2 thread(s) 73864187.5 ns 96867229.5 ns 0.76
Conv((3, 3), 4 => 4, gelu)(64 x 64 x 4 x 128)/enzyme/CPU/4 thread(s) 90734875 ns 91436187.5 ns 0.99
Conv((3, 3), 4 => 4, gelu)(64 x 64 x 4 x 128)/enzyme/CPU/8 thread(s) 90710083 ns 90591917 ns 1.00
Conv((3, 3), 4 => 4, gelu)(64 x 64 x 4 x 128)/enzyme/CPU/1 thread(s) 99247604 ns 76381625 ns 1.30
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 58667 ns 59083.5 ns 0.99
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 47292 ns 38750 ns 1.22
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 47625 ns 47417 ns 1.00
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 85416.5 ns 84000 ns 1.02
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/GPU/CUDA 47961 ns 46955 ns 1.02
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 1915542 ns 1925125 ns 1.00
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 1967250 ns 1979250 ns 0.99
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 1778666.5 ns 1970729.5 ns 0.90
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 1904791 ns 1897750 ns 1.00
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 195659 ns 191790.5 ns 1.02
batchnorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/2 thread(s) 375 ns 375 ns 1
batchnorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/4 thread(s) 333 ns 375 ns 0.89
batchnorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/8 thread(s) 375 ns 333 ns 1.13
batchnorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/1 thread(s) 333 ns 292 ns 1.14
batchnorm(2, act=identity, affine=false)(4 x 32)/forward/GPU/CUDA 32740 ns 32566 ns 1.01
batchnorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/2 thread(s) 6167 ns 6417 ns 0.96
batchnorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/4 thread(s) 6000 ns 6458 ns 0.93
batchnorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/8 thread(s) 6625 ns 6459 ns 1.03
batchnorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/1 thread(s) 6042 ns 6083 ns 0.99
batchnorm(2, act=identity, affine=false)(4 x 32)/zygote/GPU/CUDA 176130 ns 174123.5 ns 1.01
dense(2, bias=false, act=identity)(2 x 128)/forward/CPU/2 thread(s) 291 ns 292 ns 1.00
dense(2, bias=false, act=identity)(2 x 128)/forward/CPU/4 thread(s) 292 ns 292 ns 1
dense(2, bias=false, act=identity)(2 x 128)/forward/CPU/8 thread(s) 292 ns 292 ns 1
dense(2, bias=false, act=identity)(2 x 128)/forward/CPU/1 thread(s) 333 ns 250 ns 1.33
dense(2, bias=false, act=identity)(2 x 128)/forward/GPU/CUDA 31946 ns 31409 ns 1.02
dense(2, bias=false, act=identity)(2 x 128)/zygote/CPU/2 thread(s) 2625 ns 2833 ns 0.93
dense(2, bias=false, act=identity)(2 x 128)/zygote/CPU/4 thread(s) 2792 ns 2791 ns 1.00
dense(2, bias=false, act=identity)(2 x 128)/zygote/CPU/8 thread(s) 2916 ns 2834 ns 1.03
dense(2, bias=false, act=identity)(2 x 128)/zygote/CPU/1 thread(s) 2625 ns 2583 ns 1.02
dense(2, bias=false, act=identity)(2 x 128)/zygote/GPU/CUDA 164970 ns 161269 ns 1.02
Conv((3, 3), 64 => 64, gelu)(64 x 64 x 64 x 128)/forward/CPU/2 thread(s) 286577604 ns 286258979.5 ns 1.00
Conv((3, 3), 64 => 64, gelu)(64 x 64 x 64 x 128)/forward/CPU/4 thread(s) 339468333 ns 346927270.5 ns 0.98
Conv((3, 3), 64 => 64, gelu)(64 x 64 x 64 x 128)/forward/CPU/8 thread(s) 314095271 ns 313997291.5 ns 1.00
Conv((3, 3), 64 => 64, gelu)(64 x 64 x 64 x 128)/forward/CPU/1 thread(s) 270924375 ns 270108416 ns 1.00
Conv((3, 3), 64 => 64, gelu)(64 x 64 x 64 x 128)/forward/GPU/CUDA 7117527 ns 7104986 ns 1.00
Conv((3, 3), 64 => 64, gelu)(64 x 64 x 64 x 128)/zygote/CPU/2 thread(s) 1001221667 ns 998016667 ns 1.00
Conv((3, 3), 64 => 64, gelu)(64 x 64 x 64 x 128)/zygote/CPU/4 thread(s) 939877583 ns 959348209 ns 0.98
Conv((3, 3), 64 => 64, gelu)(64 x 64 x 64 x 128)/zygote/CPU/8 thread(s) 851361917 ns 851652541.5 ns 1.00
Conv((3, 3), 64 => 64, gelu)(64 x 64 x 64 x 128)/zygote/CPU/1 thread(s) 1176703208 ns 1162498166 ns 1.01
Conv((3, 3), 64 => 64, gelu)(64 x 64 x 64 x 128)/zygote/GPU/CUDA 33887966 ns 33999768 ns 1.00
Conv((3, 3), 64 => 64, gelu)(64 x 64 x 64 x 128)/enzyme/CPU/2 thread(s) 1311845770.5 ns 1672427541 ns 0.78
Conv((3, 3), 64 => 64, gelu)(64 x 64 x 64 x 128)/enzyme/CPU/4 thread(s) 1679371125 ns 1705785000 ns 0.98
Conv((3, 3), 64 => 64, gelu)(64 x 64 x 64 x 128)/enzyme/CPU/8 thread(s) 1604290334 ns 1631619209 ns 0.98
Conv((3, 3), 64 => 64, gelu)(64 x 64 x 64 x 128)/enzyme/CPU/1 thread(s) 1668435000 ns 1314128542 ns 1.27
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 1415333.5 ns 1406813 ns 1.01
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 1417520.5 ns 1416875 ns 1.00
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 1416104 ns 1459625 ns 0.97
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 1420146 ns 1407750 ns 1.01
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/CUDA 128175 ns 127789 ns 1.00
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 5010542 ns 5022896 ns 1.00
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 5020291.5 ns 5051333 ns 0.99
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 5037500 ns 5029542 ns 1.00
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 5047042 ns 5031875 ns 1.00
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 595594 ns 559312.5 ns 1.06
vgg16(32, 32, 3, 32)/forward/CPU/2 thread(s) 175229188 ns 169600250 ns 1.03
vgg16(32, 32, 3, 32)/forward/CPU/4 thread(s) 123461167 ns 180340396 ns 0.68
vgg16(32, 32, 3, 32)/forward/CPU/8 thread(s) 127594250 ns 130036124.5 ns 0.98
vgg16(32, 32, 3, 32)/forward/CPU/1 thread(s) 154552916.5 ns 169790708.5 ns 0.91
vgg16(32, 32, 3, 32)/forward/GPU/CUDA 4884050 ns 5056885.5 ns 0.97
vgg16(32, 32, 3, 32)/zygote/CPU/2 thread(s) 667971584 ns 669854958 ns 1.00
vgg16(32, 32, 3, 32)/zygote/CPU/4 thread(s) 641402625 ns 604244667 ns 1.06
vgg16(32, 32, 3, 32)/zygote/CPU/8 thread(s) 501342541 ns 501867209 ns 1.00
vgg16(32, 32, 3, 32)/zygote/CPU/1 thread(s) 657859875 ns 684062709 ns 0.96
vgg16(32, 32, 3, 32)/zygote/GPU/CUDA 15872908 ns 16520518 ns 0.96
batchedmm(512, Bsize=32)/forward/CPU/2 thread(s) 8987479.5 ns 8950666 ns 1.00
batchedmm(512, Bsize=32)/forward/CPU/4 thread(s) 8781270.5 ns 8876958.5 ns 0.99
batchedmm(512, Bsize=32)/forward/CPU/8 thread(s) 7857729 ns 7849458.5 ns 1.00
batchedmm(512, Bsize=32)/forward/CPU/1 thread(s) 10412374.5 ns 10185417 ns 1.02
batchedmm(512, Bsize=32)/forward/GPU/CUDA 1592095 ns 1594436 ns 1.00
batchedmm(512, Bsize=32)/zygote/CPU/2 thread(s) 36150584 ns 36026541.5 ns 1.00
batchedmm(512, Bsize=32)/zygote/CPU/4 thread(s) 36797500 ns 38047792 ns 0.97
batchedmm(512, Bsize=32)/zygote/CPU/8 thread(s) 33192666.5 ns 33343417 ns 1.00
batchedmm(512, Bsize=32)/zygote/CPU/1 thread(s) 40244625 ns 38792000 ns 1.04
batchedmm(512, Bsize=32)/zygote/GPU/CUDA 6455577 ns 6457988 ns 1.00
bias_activation(32, act=tanh)(32 x 128)/forward/CPU/2 thread(s) 47417 ns 47417 ns 1
bias_activation(32, act=tanh)(32 x 128)/forward/CPU/4 thread(s) 47584 ns 47375 ns 1.00
bias_activation(32, act=tanh)(32 x 128)/forward/CPU/8 thread(s) 47583 ns 47584 ns 1.00
bias_activation(32, act=tanh)(32 x 128)/forward/CPU/1 thread(s) 47333 ns 47333 ns 1
bias_activation(32, act=tanh)(32 x 128)/forward/GPU/CUDA 18534 ns 18535 ns 1.00
bias_activation(32, act=tanh)(32 x 128)/zygote/CPU/2 thread(s) 52833.5 ns 50291 ns 1.05
bias_activation(32, act=tanh)(32 x 128)/zygote/CPU/4 thread(s) 50375 ns 50375 ns 1
bias_activation(32, act=tanh)(32 x 128)/zygote/CPU/8 thread(s) 50666 ns 50417 ns 1.00
bias_activation(32, act=tanh)(32 x 128)/zygote/CPU/1 thread(s) 50250 ns 50083 ns 1.00
bias_activation(32, act=tanh)(32 x 128)/zygote/GPU/CUDA 202850 ns 191873 ns 1.06
groupnorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/2 thread(s) 7459 ns 6458 ns 1.16
groupnorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/4 thread(s) 7417 ns 6917 ns 1.07
groupnorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/8 thread(s) 7312.5 ns 7750 ns 0.94
groupnorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/1 thread(s) 7458.5 ns 6958 ns 1.07
groupnorm(2, act=gelu, affine=true)(4 x 32)/forward/GPU/CUDA 98661 ns 91345 ns 1.08
groupnorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/2 thread(s) 9792 ns 10458 ns 0.94
groupnorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/4 thread(s) 10125 ns 9916 ns 1.02
groupnorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/8 thread(s) 10542 ns 10084 ns 1.05
groupnorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/1 thread(s) 10250 ns 10208 ns 1.00
groupnorm(2, act=gelu, affine=true)(4 x 32)/zygote/GPU/CUDA 555252.5 ns 527140.5 ns 1.05
groupnorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/2 thread(s) 6750 ns 5625 ns 1.20
groupnorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/4 thread(s) 6042 ns 5917 ns 1.02
groupnorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/8 thread(s) 7208.5 ns 6958 ns 1.04
groupnorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/1 thread(s) 6542 ns 5750 ns 1.14
groupnorm(2, act=relu, affine=false)(32 x 32)/forward/GPU/CUDA 104446.5 ns 120543 ns 0.87
groupnorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/2 thread(s) 13125 ns 13583 ns 0.97
groupnorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/4 thread(s) 12917 ns 13354.5 ns 0.97
groupnorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/8 thread(s) 13292 ns 13458 ns 0.99
groupnorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/1 thread(s) 13083 ns 13000 ns 1.01
groupnorm(2, act=relu, affine=false)(32 x 32)/zygote/GPU/CUDA 478181 ns 537999 ns 0.89
batchnorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/2 thread(s) 1125 ns 1083 ns 1.04
batchnorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/4 thread(s) 1042 ns 1083 ns 0.96
batchnorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/8 thread(s) 1083 ns 1083 ns 1
batchnorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/1 thread(s) 1083 ns 1042 ns 1.04
batchnorm(2, act=gelu, affine=false)(4 x 32)/forward/GPU/CUDA 32701 ns 32473 ns 1.01
batchnorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/2 thread(s) 8375 ns 7917 ns 1.06
batchnorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/4 thread(s) 8125 ns 7917 ns 1.03
batchnorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/8 thread(s) 8125 ns 7959 ns 1.02
batchnorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/1 thread(s) 8083 ns 8167 ns 0.99
batchnorm(2, act=gelu, affine=false)(4 x 32)/zygote/GPU/CUDA 206369.5 ns 206314.5 ns 1.00
bias_activation(32, act=gelu)(32 x 128)/forward/CPU/2 thread(s) 23417 ns 23437.5 ns 1.00
bias_activation(32, act=gelu)(32 x 128)/forward/CPU/4 thread(s) 23500 ns 23167 ns 1.01
bias_activation(32, act=gelu)(32 x 128)/forward/CPU/8 thread(s) 23416 ns 23584 ns 0.99
bias_activation(32, act=gelu)(32 x 128)/forward/CPU/1 thread(s) 23333 ns 23542 ns 0.99
bias_activation(32, act=gelu)(32 x 128)/forward/GPU/CUDA 18592 ns 18671 ns 1.00
bias_activation(32, act=gelu)(32 x 128)/zygote/CPU/2 thread(s) 52750 ns 52458 ns 1.01
bias_activation(32, act=gelu)(32 x 128)/zygote/CPU/4 thread(s) 54709 ns 52541 ns 1.04
bias_activation(32, act=gelu)(32 x 128)/zygote/CPU/8 thread(s) 52917 ns 53458 ns 0.99
bias_activation(32, act=gelu)(32 x 128)/zygote/CPU/1 thread(s) 52917 ns 52062.5 ns 1.02
bias_activation(32, act=gelu)(32 x 128)/zygote/GPU/CUDA 283991 ns 291832.5 ns 0.97
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 1399417 ns 1458937 ns 0.96
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 1396395.5 ns 1401583 ns 1.00
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 1396833 ns 1403833.5 ns 1.00
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 1449874.5 ns 1459708.5 ns 0.99
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/CUDA 196187 ns 195968 ns 1.00
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 5003208 ns 5008771 ns 1.00
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 5005375 ns 5044104 ns 0.99
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 5023834 ns 5017250 ns 1.00
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 5050167 ns 5011916 ns 1.01
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 585941 ns 599687 ns 0.98
Conv((3, 3), 4 => 4, relu)(64 x 64 x 4 x 128)/forward/CPU/2 thread(s) 3039563 ns 3061000 ns 0.99
Conv((3, 3), 4 => 4, relu)(64 x 64 x 4 x 128)/forward/CPU/4 thread(s) 2072875 ns 2086750 ns 0.99
Conv((3, 3), 4 => 4, relu)(64 x 64 x 4 x 128)/forward/CPU/8 thread(s) 2275208 ns 2304917 ns 0.99
Conv((3, 3), 4 => 4, relu)(64 x 64 x 4 x 128)/forward/CPU/1 thread(s) 4856479 ns 4539041 ns 1.07
Conv((3, 3), 4 => 4, relu)(64 x 64 x 4 x 128)/forward/GPU/CUDA 583070 ns 581670 ns 1.00
Conv((3, 3), 4 => 4, relu)(64 x 64 x 4 x 128)/zygote/CPU/2 thread(s) 24354562.5 ns 24376958 ns 1.00
Conv((3, 3), 4 => 4, relu)(64 x 64 x 4 x 128)/zygote/CPU/4 thread(s) 18867354 ns 19122667 ns 0.99
Conv((3, 3), 4 => 4, relu)(64 x 64 x 4 x 128)/zygote/CPU/8 thread(s) 18817521 ns 19181062.5 ns 0.98
Conv((3, 3), 4 => 4, relu)(64 x 64 x 4 x 128)/zygote/CPU/1 thread(s) 37413770.5 ns 36163041 ns 1.03
Conv((3, 3), 4 => 4, relu)(64 x 64 x 4 x 128)/zygote/GPU/CUDA 3176919 ns 3185287.5 ns 1.00
Conv((3, 3), 4 => 4, relu)(64 x 64 x 4 x 128)/enzyme/CPU/2 thread(s) 33990500 ns 34039875 ns 1.00
Conv((3, 3), 4 => 4, relu)(64 x 64 x 4 x 128)/enzyme/CPU/4 thread(s) 28382208.5 ns 28717291.5 ns 0.99
Conv((3, 3), 4 => 4, relu)(64 x 64 x 4 x 128)/enzyme/CPU/8 thread(s) 28070021 ns 28156000 ns 1.00
Conv((3, 3), 4 => 4, relu)(64 x 64 x 4 x 128)/enzyme/CPU/1 thread(s) 42353875 ns 41614584 ns 1.02
batchedmm(512, Bsize=512)/forward/CPU/2 thread(s) 144782125 ns 144831583 ns 1.00
batchedmm(512, Bsize=512)/forward/CPU/4 thread(s) 142800542 ns 143542708 ns 0.99
batchedmm(512, Bsize=512)/forward/CPU/8 thread(s) 123809687.5 ns 124983229.5 ns 0.99
batchedmm(512, Bsize=512)/forward/CPU/1 thread(s) 168891563 ns 173618479 ns 0.97
batchedmm(512, Bsize=512)/forward/GPU/CUDA 22773536 ns 22558463 ns 1.01
batchedmm(512, Bsize=512)/zygote/CPU/2 thread(s) 1277305063 ns 1247182979 ns 1.02
batchedmm(512, Bsize=512)/zygote/CPU/4 thread(s) 1180173271 ns 836595146 ns 1.41
batchedmm(512, Bsize=512)/zygote/CPU/8 thread(s) 757990666 ns 738893583 ns 1.03
batchedmm(512, Bsize=512)/zygote/CPU/1 thread(s) 688381500 ns 672803125 ns 1.02
batchedmm(512, Bsize=512)/zygote/GPU/CUDA 118470004 ns 118329511 ns 1.00
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 75042 ns 84666 ns 0.89
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 73625 ns 73666 ns 1.00
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 77166 ns 76146 ns 1.01
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 74708 ns 75688 ns 0.99
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/GPU/CUDA 220284.5 ns 240753.5 ns 0.91
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 285750 ns 287042 ns 1.00
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 191208 ns 212354 ns 0.90
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 192209 ns 296854 ns 0.65
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 286417 ns 284250 ns 1.01
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 1195118 ns 1238105 ns 0.97
batchedmm(512, Bsize=128)/forward/CPU/2 thread(s) 35568917 ns 35497979 ns 1.00
batchedmm(512, Bsize=128)/forward/CPU/4 thread(s) 35278833 ns 35870917 ns 0.98
batchedmm(512, Bsize=128)/forward/CPU/8 thread(s) 32149729 ns 32110833 ns 1.00
batchedmm(512, Bsize=128)/forward/CPU/1 thread(s) 41733750 ns 40961896 ns 1.02
batchedmm(512, Bsize=128)/forward/GPU/CUDA 5841675.5 ns 5843453.5 ns 1.00
batchedmm(512, Bsize=128)/zygote/CPU/2 thread(s) 148531084 ns 149169500 ns 1.00
batchedmm(512, Bsize=128)/zygote/CPU/4 thread(s) 153045542 ns 155980437.5 ns 0.98
batchedmm(512, Bsize=128)/zygote/CPU/8 thread(s) 136231750 ns 134845625 ns 1.01
batchedmm(512, Bsize=128)/zygote/CPU/1 thread(s) 228329854.5 ns 287434667 ns 0.79
batchedmm(512, Bsize=128)/zygote/GPU/CUDA 34864707.5 ns 34879809 ns 1.00
Conv((3, 3), 64 => 64, identity)(64 x 64 x 64 x 128)/forward/CPU/2 thread(s) 119094187.5 ns 121767709 ns 0.98
Conv((3, 3), 64 => 64, identity)(64 x 64 x 64 x 128)/forward/CPU/4 thread(s) 174236667 ns 181613625 ns 0.96
Conv((3, 3), 64 => 64, identity)(64 x 64 x 64 x 128)/forward/CPU/8 thread(s) 147985917 ns 148039291 ns 1.00
Conv((3, 3), 64 => 64, identity)(64 x 64 x 64 x 128)/forward/CPU/1 thread(s) 107449375 ns 104612333.5 ns 1.03
Conv((3, 3), 64 => 64, identity)(64 x 64 x 64 x 128)/forward/GPU/CUDA 5482351 ns 5485164 ns 1.00
Conv((3, 3), 64 => 64, identity)(64 x 64 x 64 x 128)/zygote/CPU/2 thread(s) 467600417 ns 472118833 ns 0.99
Conv((3, 3), 64 => 64, identity)(64 x 64 x 64 x 128)/zygote/CPU/4 thread(s) 465577292 ns 486130458.5 ns 0.96
Conv((3, 3), 64 => 64, identity)(64 x 64 x 64 x 128)/zygote/CPU/8 thread(s) 438034750 ns 440650208 ns 0.99
Conv((3, 3), 64 => 64, identity)(64 x 64 x 64 x 128)/zygote/CPU/1 thread(s) 759816229.5 ns 746192375 ns 1.02
Conv((3, 3), 64 => 64, identity)(64 x 64 x 64 x 128)/zygote/GPU/CUDA 35154520.5 ns 32245076 ns 1.09
Conv((3, 3), 64 => 64, identity)(64 x 64 x 64 x 128)/enzyme/CPU/2 thread(s) 709358854.5 ns 643396416 ns 1.10
Conv((3, 3), 64 => 64, identity)(64 x 64 x 64 x 128)/enzyme/CPU/4 thread(s) 655624271 ns 675303249.5 ns 0.97
Conv((3, 3), 64 => 64, identity)(64 x 64 x 64 x 128)/enzyme/CPU/8 thread(s) 571617791 ns 575492166 ns 0.99
Conv((3, 3), 64 => 64, identity)(64 x 64 x 64 x 128)/enzyme/CPU/1 thread(s) 869387791 ns 856961334 ns 1.01
mlp7layer_bn(relu)(32 x 256)/forward/CPU/2 thread(s) 1327250.5 ns 1312541 ns 1.01
mlp7layer_bn(relu)(32 x 256)/forward/CPU/4 thread(s) 905875 ns 677667 ns 1.34
mlp7layer_bn(relu)(32 x 256)/forward/CPU/8 thread(s) 907750 ns 963459 ns 0.94
mlp7layer_bn(relu)(32 x 256)/forward/CPU/1 thread(s) 2079042 ns 2093375 ns 0.99
mlp7layer_bn(relu)(32 x 256)/forward/GPU/CUDA 578714.5 ns 580070.5 ns 1.00
mlp7layer_bn(relu)(32 x 256)/zygote/CPU/2 thread(s) 2967333.5 ns 2966541.5 ns 1.00
mlp7layer_bn(relu)(32 x 256)/zygote/CPU/4 thread(s) 2631479.5 ns 2496854 ns 1.05
mlp7layer_bn(relu)(32 x 256)/zygote/CPU/8 thread(s) 2620896 ns 2623959 ns 1.00
mlp7layer_bn(relu)(32 x 256)/zygote/CPU/1 thread(s) 3771729 ns 3704083 ns 1.02
mlp7layer_bn(relu)(32 x 256)/zygote/GPU/CUDA 1755565 ns 1730505 ns 1.01
mlp7layer_bn(relu)(32 x 256)/enzyme/CPU/2 thread(s) 6610917 ns 6656375 ns 0.99
mlp7layer_bn(relu)(32 x 256)/enzyme/CPU/4 thread(s) 6496875 ns 6477624.5 ns 1.00
mlp7layer_bn(relu)(32 x 256)/enzyme/CPU/8 thread(s) 6497437.5 ns 6431167 ns 1.01
mlp7layer_bn(relu)(32 x 256)/enzyme/CPU/1 thread(s) 4521833 ns 4450479.5 ns 1.02
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 7208 ns 7375 ns 0.98
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 6125 ns 5417 ns 1.13
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 6084 ns 6084 ns 1
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 10542 ns 9917 ns 1.06
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/GPU/CUDA 25575 ns 25252 ns 1.01
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 212875 ns 212583 ns 1.00
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 229500 ns 229770.5 ns 1.00
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 221187.5 ns 220500 ns 1.00
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 246625 ns 206083 ns 1.20
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 261769.5 ns 251646.5 ns 1.04
vgg16(32, 32, 3, 64)/forward/CPU/2 thread(s) 313730896 ns 301644020.5 ns 1.04
vgg16(32, 32, 3, 64)/forward/CPU/4 thread(s) 222537125 ns 280942354.5 ns 0.79
vgg16(32, 32, 3, 64)/forward/CPU/8 thread(s) 194707917 ns 189363792 ns 1.03
vgg16(32, 32, 3, 64)/forward/CPU/1 thread(s) 313279354 ns 305392479 ns 1.03
vgg16(32, 32, 3, 64)/forward/GPU/CUDA 7673155 ns 7676597 ns 1.00
vgg16(32, 32, 3, 64)/zygote/CPU/2 thread(s) 1080950395.5 ns 1087372208.5 ns 0.99
vgg16(32, 32, 3, 64)/zygote/CPU/4 thread(s) 899873458 ns 980974208 ns 0.92
vgg16(32, 32, 3, 64)/zygote/CPU/8 thread(s) 834690333 ns 865965209 ns 0.96
vgg16(32, 32, 3, 64)/zygote/CPU/1 thread(s) 1180116917 ns 1158600916.5 ns 1.02
vgg16(32, 32, 3, 64)/zygote/GPU/CUDA 26459206.5 ns 26533591 ns 1.00
groupnorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/2 thread(s) 5875 ns 5354.5 ns 1.10
groupnorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/4 thread(s) 5417 ns 5375 ns 1.01
groupnorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/8 thread(s) 6250 ns 6917 ns 0.90
groupnorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/1 thread(s) 6084 ns 4958 ns 1.23
groupnorm(2, act=relu, affine=false)(4 x 32)/forward/GPU/CUDA 162725 ns 146657 ns 1.11
groupnorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/2 thread(s) 7375 ns 7395.5 ns 1.00
groupnorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/4 thread(s) 7084 ns 7375 ns 0.96
groupnorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/8 thread(s) 7750 ns 7250 ns 1.07
groupnorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/1 thread(s) 7625 ns 7250 ns 1.05
groupnorm(2, act=relu, affine=false)(4 x 32)/zygote/GPU/CUDA 624677.5 ns 596011.5 ns 1.05
batchnorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/2 thread(s) 625 ns 584 ns 1.07
batchnorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/4 thread(s) 666 ns 625 ns 1.07
batchnorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/8 thread(s) 667 ns 625 ns 1.07
batchnorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/1 thread(s) 583 ns 542 ns 1.08
batchnorm(2, act=identity, affine=true)(32 x 32)/forward/GPU/CUDA 23758 ns 24031 ns 0.99
batchnorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/2 thread(s) 9542 ns 8917 ns 1.07
batchnorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/4 thread(s) 9291 ns 9708 ns 0.96
batchnorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/8 thread(s) 9584 ns 9583 ns 1.00
batchnorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/1 thread(s) 9209 ns 8833 ns 1.04
batchnorm(2, act=identity, affine=true)(32 x 32)/zygote/GPU/CUDA 225738 ns 216620.5 ns 1.04
bias_activation(512, act=gelu)(512 x 128)/forward/CPU/2 thread(s) 352000 ns 353333 ns 1.00
bias_activation(512, act=gelu)(512 x 128)/forward/CPU/4 thread(s) 352042 ns 352041 ns 1.00
bias_activation(512, act=gelu)(512 x 128)/forward/CPU/8 thread(s) 354604.5 ns 352666.5 ns 1.01
bias_activation(512, act=gelu)(512 x 128)/forward/CPU/1 thread(s) 353833 ns 352417 ns 1.00
bias_activation(512, act=gelu)(512 x 128)/forward/GPU/CUDA 21344 ns 21463 ns 0.99
bias_activation(512, act=gelu)(512 x 128)/zygote/CPU/2 thread(s) 822291 ns 820625 ns 1.00
bias_activation(512, act=gelu)(512 x 128)/zygote/CPU/4 thread(s) 812479 ns 828917 ns 0.98
bias_activation(512, act=gelu)(512 x 128)/zygote/CPU/8 thread(s) 824250 ns 774875 ns 1.06
bias_activation(512, act=gelu)(512 x 128)/zygote/CPU/1 thread(s) 831958 ns 778729 ns 1.07
bias_activation(512, act=gelu)(512 x 128)/zygote/GPU/CUDA 304872 ns 269469 ns 1.13
batchedmm(16, Bsize=32)/forward/CPU/2 thread(s) 337167 ns 337187.5 ns 1.00
batchedmm(16, Bsize=32)/forward/CPU/4 thread(s) 343334 ns 313687.5 ns 1.09
batchedmm(16, Bsize=32)/forward/CPU/8 thread(s) 446875 ns 444709 ns 1.00
batchedmm(16, Bsize=32)/forward/CPU/1 thread(s) 316354.5 ns 334500 ns 0.95
batchedmm(16, Bsize=32)/forward/GPU/CUDA 18389 ns 17922 ns 1.03
batchedmm(16, Bsize=32)/zygote/CPU/2 thread(s) 695521 ns 689958 ns 1.01
batchedmm(16, Bsize=32)/zygote/CPU/4 thread(s) 750792 ns 746333 ns 1.01
batchedmm(16, Bsize=32)/zygote/CPU/8 thread(s) 1026833 ns 1025042 ns 1.00
batchedmm(16, Bsize=32)/zygote/CPU/1 thread(s) 688999.5 ns 694854.5 ns 0.99
batchedmm(16, Bsize=32)/zygote/GPU/CUDA 282579.5 ns 242950 ns 1.16
batchedmm(16, Bsize=128)/forward/CPU/2 thread(s) 356667 ns 351417 ns 1.01
batchedmm(16, Bsize=128)/forward/CPU/4 thread(s) 354500 ns 327270.5 ns 1.08
batchedmm(16, Bsize=128)/forward/CPU/8 thread(s) 421500 ns 414729.5 ns 1.02
batchedmm(16, Bsize=128)/forward/CPU/1 thread(s) 347042 ns 371750 ns 0.93
batchedmm(16, Bsize=128)/forward/GPU/CUDA 22715 ns 22559 ns 1.01
batchedmm(16, Bsize=128)/zygote/CPU/2 thread(s) 754229 ns 747208 ns 1.01
batchedmm(16, Bsize=128)/zygote/CPU/4 thread(s) 753792 ns 749416 ns 1.01
batchedmm(16, Bsize=128)/zygote/CPU/8 thread(s) 1072417 ns 1069374.5 ns 1.00
batchedmm(16, Bsize=128)/zygote/CPU/1 thread(s) 823125 ns 815937.5 ns 1.01
batchedmm(16, Bsize=128)/zygote/GPU/CUDA 256204.5 ns 224503 ns 1.14
bias_activation(2, act=tanh)(2 x 128)/forward/CPU/2 thread(s) 3583 ns 3708 ns 0.97
bias_activation(2, act=tanh)(2 x 128)/forward/CPU/4 thread(s) 3500 ns 3625 ns 0.97
bias_activation(2, act=tanh)(2 x 128)/forward/CPU/8 thread(s) 3708 ns 3750 ns 0.99
bias_activation(2, act=tanh)(2 x 128)/forward/CPU/1 thread(s) 3667 ns 3291 ns 1.11
bias_activation(2, act=tanh)(2 x 128)/forward/GPU/CUDA 17612 ns 17855 ns 0.99
bias_activation(2, act=tanh)(2 x 128)/zygote/CPU/2 thread(s) 4208 ns 4208 ns 1
bias_activation(2, act=tanh)(2 x 128)/zygote/CPU/4 thread(s) 4500 ns 4208 ns 1.07
bias_activation(2, act=tanh)(2 x 128)/zygote/CPU/8 thread(s) 4292 ns 4333 ns 0.99
bias_activation(2, act=tanh)(2 x 128)/zygote/CPU/1 thread(s) 4417 ns 4208 ns 1.05
bias_activation(2, act=tanh)(2 x 128)/zygote/GPU/CUDA 280326.5 ns 248489.5 ns 1.13
layernorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/2 thread(s) 4209 ns 3708 ns 1.14
layernorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/4 thread(s) 4333 ns 4167 ns 1.04
layernorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/8 thread(s) 4291 ns 4791 ns 0.90
layernorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/1 thread(s) 4125 ns 3792 ns 1.09
layernorm(2, act=identity, affine=true)(4 x 32)/forward/GPU/CUDA 232867.5 ns 203806 ns 1.14
layernorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/2 thread(s) 8291 ns 8667 ns 0.96
layernorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/4 thread(s) 8500 ns 8250 ns 1.03
layernorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/8 thread(s) 8562.5 ns 8458 ns 1.01
layernorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/1 thread(s) 8667 ns 8667 ns 1
layernorm(2, act=identity, affine=true)(4 x 32)/zygote/GPU/CUDA 1214158.5 ns 1166315.5 ns 1.04
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 203792 ns 204875 ns 0.99
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 211375 ns 209750 ns 1.01
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 209083 ns 209834 ns 1.00
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 202541 ns 200000 ns 1.01
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/CUDA 34629 ns 34893 ns 0.99
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 645167 ns 602917 ns 1.07
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 623895.5 ns 628833 ns 0.99
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 630084 ns 621584 ns 1.01
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 633833 ns 592041 ns 1.07
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 349768 ns 321942.5 ns 1.09
batchedmm(128, Bsize=128)/forward/CPU/2 thread(s) 972062.5 ns 978791 ns 0.99
batchedmm(128, Bsize=128)/forward/CPU/4 thread(s) 937916.5 ns 937250.5 ns 1.00
batchedmm(128, Bsize=128)/forward/CPU/8 thread(s) 960125 ns 960250 ns 1.00
batchedmm(128, Bsize=128)/forward/CPU/1 thread(s) 1319708 ns 1307271 ns 1.01
batchedmm(128, Bsize=128)/forward/GPU/CUDA 208475 ns 207418 ns 1.01
batchedmm(128, Bsize=128)/zygote/CPU/2 thread(s) 4500166 ns 4504084 ns 1.00
batchedmm(128, Bsize=128)/zygote/CPU/4 thread(s) 4475687.5 ns 4619604.5 ns 0.97
batchedmm(128, Bsize=128)/zygote/CPU/8 thread(s) 4308250 ns 4294917 ns 1.00
batchedmm(128, Bsize=128)/zygote/CPU/1 thread(s) 6508250 ns 6229292 ns 1.04
batchedmm(128, Bsize=128)/zygote/GPU/CUDA 944786.5 ns 936037 ns 1.01
layernorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/2 thread(s) 4084 ns 3354 ns 1.22
layernorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/4 thread(s) 3750 ns 3583 ns 1.05
layernorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/8 thread(s) 4083 ns 4417 ns 0.92
layernorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/1 thread(s) 3542 ns 3333 ns 1.06
layernorm(2, act=identity, affine=false)(4 x 32)/forward/GPU/CUDA 226002.5 ns 196464 ns 1.15
layernorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/2 thread(s) 7542 ns 7334 ns 1.03
layernorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/4 thread(s) 7625 ns 7417 ns 1.03
layernorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/8 thread(s) 7625 ns 7291 ns 1.05
layernorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/1 thread(s) 7334 ns 6917 ns 1.06
layernorm(2, act=identity, affine=false)(4 x 32)/zygote/GPU/CUDA 1008436 ns 985634 ns 1.02
Conv((3, 3), 2 => 2, identity)(64 x 64 x 2 x 128)/forward/CPU/2 thread(s) 1647479.5 ns 1640792 ns 1.00
Conv((3, 3), 2 => 2, identity)(64 x 64 x 2 x 128)/forward/CPU/4 thread(s) 1203104.5 ns 1171541.5 ns 1.03
Conv((3, 3), 2 => 2, identity)(64 x 64 x 2 x 128)/forward/CPU/8 thread(s) 1378125 ns 1327125 ns 1.04
Conv((3, 3), 2 => 2, identity)(64 x 64 x 2 x 128)/forward/CPU/1 thread(s) 2472896 ns 2384666 ns 1.04
Conv((3, 3), 2 => 2, identity)(64 x 64 x 2 x 128)/forward/GPU/CUDA 213582 ns 216205.5 ns 0.99
Conv((3, 3), 2 => 2, identity)(64 x 64 x 2 x 128)/zygote/CPU/2 thread(s) 12309291 ns 12345499.5 ns 1.00
Conv((3, 3), 2 => 2, identity)(64 x 64 x 2 x 128)/zygote/CPU/4 thread(s) 9565666 ns 9603042 ns 1.00
Conv((3, 3), 2 => 2, identity)(64 x 64 x 2 x 128)/zygote/CPU/8 thread(s) 9280334 ns 9259895.5 ns 1.00
Conv((3, 3), 2 => 2, identity)(64 x 64 x 2 x 128)/zygote/CPU/1 thread(s) 18216500 ns 18032958.5 ns 1.01
Conv((3, 3), 2 => 2, identity)(64 x 64 x 2 x 128)/zygote/GPU/CUDA 1940596 ns 1950941 ns 0.99
Conv((3, 3), 2 => 2, identity)(64 x 64 x 2 x 128)/enzyme/CPU/2 thread(s) 17356917 ns 17348083 ns 1.00
Conv((3, 3), 2 => 2, identity)(64 x 64 x 2 x 128)/enzyme/CPU/4 thread(s) 14358625 ns 14444583.5 ns 0.99
Conv((3, 3), 2 => 2, identity)(64 x 64 x 2 x 128)/enzyme/CPU/8 thread(s) 14329312.5 ns 14302167 ns 1.00
Conv((3, 3), 2 => 2, identity)(64 x 64 x 2 x 128)/enzyme/CPU/1 thread(s) 21175541 ns 21057645.5 ns 1.01
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 133834 ns 87666.5 ns 1.53
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 90000 ns 89562 ns 1.00
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 93687 ns 90292 ns 1.04
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 90750 ns 88875 ns 1.02
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/CUDA 125997 ns 126565 ns 1.00
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 2019458 ns 2024000 ns 1.00
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 2029375 ns 2030958.5 ns 1.00
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 2029667 ns 1707583 ns 1.19
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 2049458 ns 2030042 ns 1.01
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 1042357 ns 999913 ns 1.04
batchedmm(2, Bsize=4)/forward/CPU/2 thread(s) 347333 ns 343750 ns 1.01
batchedmm(2, Bsize=4)/forward/CPU/4 thread(s) 349250 ns 326145.5 ns 1.07
batchedmm(2, Bsize=4)/forward/CPU/8 thread(s) 394583 ns 396833 ns 0.99
batchedmm(2, Bsize=4)/forward/CPU/1 thread(s) 293978.5 ns 309896 ns 0.95
batchedmm(2, Bsize=4)/forward/GPU/CUDA 16455.5 ns 16654 ns 0.99
batchedmm(2, Bsize=4)/zygote/CPU/2 thread(s) 709041 ns 702666 ns 1.01
batchedmm(2, Bsize=4)/zygote/CPU/4 thread(s) 741583.5 ns 733666 ns 1.01
batchedmm(2, Bsize=4)/zygote/CPU/8 thread(s) 1022875 ns 1020166 ns 1.00
batchedmm(2, Bsize=4)/zygote/CPU/1 thread(s) 644791 ns 652500 ns 0.99
batchedmm(2, Bsize=4)/zygote/GPU/CUDA 197069.5 ns 190386.5 ns 1.04
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 7250 ns 7416 ns 0.98
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 5875 ns 5291 ns 1.11
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 6041 ns 6000 ns 1.01
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 10583 ns 10041 ns 1.05
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/GPU/CUDA 34401 ns 34743 ns 0.99
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 224416.5 ns 224334 ns 1.00
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 220375 ns 229333 ns 0.96
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 231250 ns 220959 ns 1.05
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 236834 ns 206292 ns 1.15
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 318034 ns 296926 ns 1.07
dense(32, bias=true, act=identity)(32 x 128)/forward/CPU/2 thread(s) 3708 ns 3750 ns 0.99
dense(32, bias=true, act=identity)(32 x 128)/forward/CPU/4 thread(s) 3708 ns 3792 ns 0.98
dense(32, bias=true, act=identity)(32 x 128)/forward/CPU/8 thread(s) 3709 ns 3667 ns 1.01
dense(32, bias=true, act=identity)(32 x 128)/forward/CPU/1 thread(s) 3709 ns 3708 ns 1.00
dense(32, bias=true, act=identity)(32 x 128)/forward/GPU/CUDA 23219 ns 23083 ns 1.01
dense(32, bias=true, act=identity)(32 x 128)/zygote/CPU/2 thread(s) 14375 ns 14416 ns 1.00
dense(32, bias=true, act=identity)(32 x 128)/zygote/CPU/4 thread(s) 14375 ns 14209 ns 1.01
dense(32, bias=true, act=identity)(32 x 128)/zygote/CPU/8 thread(s) 14417 ns 14292 ns 1.01
dense(32, bias=true, act=identity)(32 x 128)/zygote/CPU/1 thread(s) 14167 ns 14458 ns 0.98
dense(32, bias=true, act=identity)(32 x 128)/zygote/GPU/CUDA 484400.5 ns 448235 ns 1.08
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 97417 ns 92854 ns 1.05
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 94042 ns 99583 ns 0.94
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 97959 ns 94542 ns 1.04
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 95500 ns 96042 ns 0.99
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/GPU/CUDA 125837 ns 125978 ns 1.00
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 1920250 ns 1920562.5 ns 1.00
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 1649417 ns 1914937.5 ns 0.86
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 1923437 ns 1653792 ns 1.16
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 1953916 ns 1928541 ns 1.01
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 974936 ns 893203 ns 1.09
lenet(28, 28, 1, 32)/forward/CPU/2 thread(s) 879729.5 ns 878750 ns 1.00
lenet(28, 28, 1, 32)/forward/CPU/4 thread(s) 832708 ns 800021 ns 1.04
lenet(28, 28, 1, 32)/forward/CPU/8 thread(s) 1229562.5 ns 1221729 ns 1.01
lenet(28, 28, 1, 32)/forward/CPU/1 thread(s) 939583 ns 963792 ns 0.97
lenet(28, 28, 1, 32)/forward/GPU/CUDA 281248 ns 277692.5 ns 1.01
lenet(28, 28, 1, 32)/zygote/CPU/2 thread(s) 2831145.5 ns 2824834 ns 1.00
lenet(28, 28, 1, 32)/zygote/CPU/4 thread(s) 2527396 ns 2464958 ns 1.03
lenet(28, 28, 1, 32)/zygote/CPU/8 thread(s) 3353354.5 ns 3323271 ns 1.01
lenet(28, 28, 1, 32)/zygote/CPU/1 thread(s) 3411104.5 ns 3398958 ns 1.00
lenet(28, 28, 1, 32)/zygote/GPU/CUDA 1661947.5 ns 1565101.5 ns 1.06
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 14854.5 ns 17667 ns 0.84
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 15583 ns 15458.5 ns 1.01
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 18792 ns 17250.5 ns 1.09
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 16000 ns 14645.5 ns 1.09
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/CUDA 144462 ns 142432.5 ns 1.01
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 255958 ns 218209 ns 1.17
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 215583.5 ns 222958.5 ns 0.97
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 257583 ns 216334 ns 1.19
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 262500 ns 215062.5 ns 1.22
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 650445 ns 637432 ns 1.02
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 221375 ns 221145.5 ns 1.00
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 220792 ns 222375 ns 0.99
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 223083 ns 220917 ns 1.01
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 220646 ns 220333 ns 1.00
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/CUDA 273454.5 ns 280530 ns 0.97
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 559542 ns 510354 ns 1.10
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 510542 ns 499375 ns 1.02
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 507813 ns 500021 ns 1.02
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 535208.5 ns 507041 ns 1.06
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 1396532 ns 1281236 ns 1.09
batchedmm(16, Bsize=4)/forward/CPU/2 thread(s) 328770.5 ns 332250 ns 0.99
batchedmm(16, Bsize=4)/forward/CPU/4 thread(s) 336937 ns 316000 ns 1.07
batchedmm(16, Bsize=4)/forward/CPU/8 thread(s) 370500 ns 364333 ns 1.02
batchedmm(16, Bsize=4)/forward/CPU/1 thread(s) 299625 ns 323834 ns 0.93
batchedmm(16, Bsize=4)/forward/GPU/CUDA 17616 ns 17441 ns 1.01
batchedmm(16, Bsize=4)/zygote/CPU/2 thread(s) 711834 ns 715833.5 ns 0.99
batchedmm(16, Bsize=4)/zygote/CPU/4 thread(s) 732166.5 ns 735083 ns 1.00
batchedmm(16, Bsize=4)/zygote/CPU/8 thread(s) 1024479.5 ns 1022959 ns 1.00
batchedmm(16, Bsize=4)/zygote/CPU/1 thread(s) 657917 ns 667041 ns 0.99
batchedmm(16, Bsize=4)/zygote/GPU/CUDA 200486.5 ns 193588.5 ns 1.04
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 18520.5 ns 18666 ns 0.99
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 19083 ns 17375 ns 1.10
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 19625 ns 19167 ns 1.02
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 18396 ns 17083.5 ns 1.08
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/GPU/CUDA 147224 ns 147781 ns 1.00
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 213625 ns 212542 ns 1.01
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 221875 ns 214146 ns 1.04
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 221812.5 ns 213834 ns 1.04
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 237333 ns 211354.5 ns 1.12
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 951211 ns 877964 ns 1.08
layernorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/2 thread(s) 4583 ns 4083 ns 1.12
layernorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/4 thread(s) 4417 ns 4291.5 ns 1.03
layernorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/8 thread(s) 4917 ns 5375 ns 0.91
layernorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/1 thread(s) 4437.5 ns 3958 ns 1.12
layernorm(2, act=identity, affine=false)(32 x 32)/forward/GPU/CUDA 239868.5 ns 169898 ns 1.41
layernorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/2 thread(s) 10625 ns 10834 ns 0.98
layernorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/4 thread(s) 10500 ns 10542 ns 1.00
layernorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/8 thread(s) 10958 ns 10583 ns 1.04
layernorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/1 thread(s) 10625 ns 10459 ns 1.02
layernorm(2, act=identity, affine=false)(32 x 32)/zygote/GPU/CUDA 1112681.5 ns 993411.5 ns 1.12
layernorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/2 thread(s) 3791 ns 3417 ns 1.11
layernorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/4 thread(s) 3541 ns 3167 ns 1.12
layernorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/8 thread(s) 4229.5 ns 4375 ns 0.97
layernorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/1 thread(s) 3833 ns 3062.5 ns 1.25
layernorm(2, act=relu, affine=false)(4 x 32)/forward/GPU/CUDA 252769 ns 203556.5 ns 1.24
layernorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/2 thread(s) 7334 ns 7791 ns 0.94
layernorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/4 thread(s) 7792 ns 7458 ns 1.04
layernorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/8 thread(s) 7916 ns 7250 ns 1.09
layernorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/1 thread(s) 7437.5 ns 7541 ns 0.99
layernorm(2, act=relu, affine=false)(4 x 32)/zygote/GPU/CUDA 1116124.5 ns 1041955 ns 1.07
Conv((3, 3), 32 => 32, identity)(64 x 64 x 32 x 128)/forward/CPU/2 thread(s) 23341875 ns 23557729 ns 0.99
Conv((3, 3), 32 => 32, identity)(64 x 64 x 32 x 128)/forward/CPU/4 thread(s) 34053354.5 ns 43140979 ns 0.79
Conv((3, 3), 32 => 32, identity)(64 x 64 x 32 x 128)/forward/CPU/8 thread(s) 37482854.5 ns 37880833 ns 0.99
Conv((3, 3), 32 => 32, identity)(64 x 64 x 32 x 128)/forward/CPU/1 thread(s) 35456625 ns 34954917 ns 1.01
Conv((3, 3), 32 => 32, identity)(64 x 64 x 32 x 128)/forward/GPU/CUDA 1845777.5 ns 1859678 ns 0.99
Conv((3, 3), 32 => 32, identity)(64 x 64 x 32 x 128)/zygote/CPU/2 thread(s) 184378291 ns 184630708 ns 1.00
Conv((3, 3), 32 => 32, identity)(64 x 64 x 32 x 128)/zygote/CPU/4 thread(s) 158584667 ns 172192624.5 ns 0.92
Conv((3, 3), 32 => 32, identity)(64 x 64 x 32 x 128)/zygote/CPU/8 thread(s) 146193479 ns 146314396 ns 1.00
Conv((3, 3), 32 => 32, identity)(64 x 64 x 32 x 128)/zygote/CPU/1 thread(s) 422496166.5 ns 415449708 ns 1.02
Conv((3, 3), 32 => 32, identity)(64 x 64 x 32 x 128)/zygote/GPU/CUDA 16510255 ns 16494786 ns 1.00
Conv((3, 3), 32 => 32, identity)(64 x 64 x 32 x 128)/enzyme/CPU/2 thread(s) 426674167 ns 428781042 ns 1.00
Conv((3, 3), 32 => 32, identity)(64 x 64 x 32 x 128)/enzyme/CPU/4 thread(s) 253893875 ns 259710791 ns 0.98
Conv((3, 3), 32 => 32, identity)(64 x 64 x 32 x 128)/enzyme/CPU/8 thread(s) 232875895.5 ns 231751208 ns 1.00
Conv((3, 3), 32 => 32, identity)(64 x 64 x 32 x 128)/enzyme/CPU/1 thread(s) 494805750 ns 484878833 ns 1.02
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 184500 ns 183625 ns 1.00
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 183458 ns 183375 ns 1.00
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 185583 ns 184417 ns 1.01
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 183416.5 ns 182667 ns 1.00
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/CUDA 231684 ns 177771.5 ns 1.30
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 599042 ns 590604 ns 1.01
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 586312.5 ns 588083 ns 1.00
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 636833 ns 586792 ns 1.09
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 641125 ns 586958 ns 1.09
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 1087543.5 ns 1015783.5 ns 1.07
batchedmm(128, Bsize=512)/forward/CPU/2 thread(s) 3842645.5 ns 3860917 ns 1.00
batchedmm(128, Bsize=512)/forward/CPU/4 thread(s) 3643229 ns 3732375 ns 0.98
batchedmm(128, Bsize=512)/forward/CPU/8 thread(s) 3509333 ns 3478062.5 ns 1.01
batchedmm(128, Bsize=512)/forward/CPU/1 thread(s) 5524187.5 ns 5358854.5 ns 1.03
batchedmm(128, Bsize=512)/forward/GPU/CUDA 534809 ns 533317.5 ns 1.00
batchedmm(128, Bsize=512)/zygote/CPU/2 thread(s) 17462833 ns 17452375 ns 1.00
batchedmm(128, Bsize=512)/zygote/CPU/4 thread(s) 17328500.5 ns 17779209 ns 0.97
batchedmm(128, Bsize=512)/zygote/CPU/8 thread(s) 16632083 ns 16551750 ns 1.00
batchedmm(128, Bsize=512)/zygote/CPU/1 thread(s) 23474479.5 ns 22184000 ns 1.06
batchedmm(128, Bsize=512)/zygote/GPU/CUDA 2613903 ns 2614491.5 ns 1.00
batchnorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/2 thread(s) 500 ns 625 ns 0.80
batchnorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/4 thread(s) 584 ns 584 ns 1
batchnorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/8 thread(s) 666 ns 625 ns 1.07
batchnorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/1 thread(s) 542 ns 583 ns 0.93
batchnorm(2, act=relu, affine=false)(32 x 32)/forward/GPU/CUDA 32551 ns 32765 ns 0.99
batchnorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/2 thread(s) 9375 ns 9625 ns 0.97
batchnorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/4 thread(s) 8625 ns 9542 ns 0.90
batchnorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/8 thread(s) 9792 ns 9625 ns 1.02
batchnorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/1 thread(s) 9354.5 ns 8917 ns 1.05
batchnorm(2, act=relu, affine=false)(32 x 32)/zygote/GPU/CUDA 264963 ns 263711.5 ns 1.00
vgg16(32, 32, 3, 128)/forward/CPU/2 thread(s) 500529917 ns 501494042 ns 1.00
vgg16(32, 32, 3, 128)/forward/CPU/4 thread(s) 429131021 ns 411555459 ns 1.04
vgg16(32, 32, 3, 128)/forward/CPU/8 thread(s) 390085458 ns 374781084 ns 1.04
vgg16(32, 32, 3, 128)/forward/CPU/1 thread(s) 680776812.5 ns 672198042 ns 1.01
vgg16(32, 32, 3, 128)/forward/GPU/CUDA 12474289.5 ns 12477100 ns 1.00
vgg16(32, 32, 3, 128)/zygote/CPU/2 thread(s) 2050021916.5 ns 2044775145.5 ns 1.00
vgg16(32, 32, 3, 128)/zygote/CPU/4 thread(s) 1635602292 ns 1660536667 ns 0.98
vgg16(32, 32, 3, 128)/zygote/CPU/8 thread(s) 1501725478.5 ns 1495631604 ns 1.00
vgg16(32, 32, 3, 128)/zygote/CPU/1 thread(s) 2237822875 ns 2221523375 ns 1.01
vgg16(32, 32, 3, 128)/zygote/GPU/CUDA 49165291 ns 49258137.5 ns 1.00
Conv((3, 3), 2 => 2, relu)(64 x 64 x 2 x 128)/forward/CPU/2 thread(s) 1648791.5 ns 1643291 ns 1.00
Conv((3, 3), 2 => 2, relu)(64 x 64 x 2 x 128)/forward/CPU/4 thread(s) 1195792 ns 1172917 ns 1.02
Conv((3, 3), 2 => 2, relu)(64 x 64 x 2 x 128)/forward/CPU/8 thread(s) 1379625 ns 1391041.5 ns 0.99
Conv((3, 3), 2 => 2, relu)(64 x 64 x 2 x 128)/forward/CPU/1 thread(s) 2436187.5 ns 2338333 ns 1.04
Conv((3, 3), 2 => 2, relu)(64 x 64 x 2 x 128)/forward/GPU/CUDA 215012 ns 215612.5 ns 1.00
Conv((3, 3), 2 => 2, relu)(64 x 64 x 2 x 128)/zygote/CPU/2 thread(s) 12725833.5 ns 12698542 ns 1.00
Conv((3, 3), 2 => 2, relu)(64 x 64 x 2 x 128)/zygote/CPU/4 thread(s) 9944041.5 ns 9998999.5 ns 0.99
Conv((3, 3), 2 => 2, relu)(64 x 64 x 2 x 128)/zygote/CPU/8 thread(s) 9667395.5 ns 9717041 ns 0.99
Conv((3, 3), 2 => 2, relu)(64 x 64 x 2 x 128)/zygote/CPU/1 thread(s) 18594104.5 ns 18433792 ns 1.01
Conv((3, 3), 2 => 2, relu)(64 x 64 x 2 x 128)/zygote/GPU/CUDA 2038696 ns 2039696 ns 1.00
Conv((3, 3), 2 => 2, relu)(64 x 64 x 2 x 128)/enzyme/CPU/2 thread(s) 17722000 ns 17679687.5 ns 1.00
Conv((3, 3), 2 => 2, relu)(64 x 64 x 2 x 128)/enzyme/CPU/4 thread(s) 14694125 ns 14770854.5 ns 0.99
Conv((3, 3), 2 => 2, relu)(64 x 64 x 2 x 128)/enzyme/CPU/8 thread(s) 14557833 ns 14602583.5 ns 1.00
Conv((3, 3), 2 => 2, relu)(64 x 64 x 2 x 128)/enzyme/CPU/1 thread(s) 21533833 ns 21327625 ns 1.01
dense(32, bias=false, act=gelu)(32 x 128)/forward/CPU/2 thread(s) 26250 ns 26292 ns 1.00
dense(32, bias=false, act=gelu)(32 x 128)/forward/CPU/4 thread(s) 26250 ns 26291 ns 1.00
dense(32, bias=false, act=gelu)(32 x 128)/forward/CPU/8 thread(s) 26750 ns 26250 ns 1.02
dense(32, bias=false, act=gelu)(32 x 128)/forward/CPU/1 thread(s) 26292 ns 26208 ns 1.00
dense(32, bias=false, act=gelu)(32 x 128)/forward/GPU/CUDA 23955 ns 24225 ns 0.99
dense(32, bias=false, act=gelu)(32 x 128)/zygote/CPU/2 thread(s) 66833 ns 67250 ns 0.99
dense(32, bias=false, act=gelu)(32 x 128)/zygote/CPU/4 thread(s) 66625 ns 66834 ns 1.00
dense(32, bias=false, act=gelu)(32 x 128)/zygote/CPU/8 thread(s) 66958 ns 68166 ns 0.98
dense(32, bias=false, act=gelu)(32 x 128)/zygote/CPU/1 thread(s) 66833 ns 66792 ns 1.00
dense(32, bias=false, act=gelu)(32 x 128)/zygote/GPU/CUDA 403690.5 ns 378162.5 ns 1.07
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 202791 ns 203125 ns 1.00
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 209375 ns 208500 ns 1.00
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 209667 ns 208666 ns 1.00
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 200708 ns 200125 ns 1.00
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/CUDA 26177 ns 26005 ns 1.01
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 612146 ns 646625 ns 0.95
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 622334 ns 628813 ns 0.99
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 680520.5 ns 669895.5 ns 1.02
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 634750 ns 580791.5 ns 1.09
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 350618 ns 311381 ns 1.13
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 650500 ns 651667 ns 1.00
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 542145.5 ns 638666 ns 0.85
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 634666 ns 647417 ns 0.98
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 679459 ns 653083.5 ns 1.04
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/GPU/CUDA 131917 ns 131397 ns 1.00
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 2229542 ns 2243375 ns 0.99
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 2231250 ns 2314937.5 ns 0.96
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 2251687.5 ns 2249625 ns 1.00
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 2330333 ns 2235375 ns 1.04
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 1238942 ns 1114755 ns 1.11
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 16854 ns 18291 ns 0.92
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 19500 ns 17500 ns 1.11
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 19791.5 ns 20917 ns 0.95
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 17750 ns 18292 ns 0.97
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/CUDA 144506 ns 143094 ns 1.01
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 230625 ns 223500 ns 1.03
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 260583 ns 226042 ns 1.15
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 261125 ns 262917 ns 0.99
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 265583.5 ns 230125 ns 1.15
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 1064679 ns 943015 ns 1.13
batchnorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/2 thread(s) 625 ns 625 ns 1
batchnorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/4 thread(s) 625 ns 625 ns 1
batchnorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/8 thread(s) 625 ns 666 ns 0.94
batchnorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/1 thread(s) 542 ns 583 ns 0.93
batchnorm(2, act=relu, affine=true)(32 x 32)/forward/GPU/CUDA 23448 ns 23380 ns 1.00
batchnorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/2 thread(s) 10125 ns 10104.5 ns 1.00
batchnorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/4 thread(s) 9792 ns 10166 ns 0.96
batchnorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/8 thread(s) 10000 ns 10000 ns 1
batchnorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/1 thread(s) 9979 ns 9583 ns 1.04
batchnorm(2, act=relu, affine=true)(32 x 32)/zygote/GPU/CUDA 257505.5 ns 254915.5 ns 1.01
groupnorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/2 thread(s) 6125 ns 5084 ns 1.20
groupnorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/4 thread(s) 5625 ns 5375 ns 1.05
groupnorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/8 thread(s) 6666 ns 6791 ns 0.98
groupnorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/1 thread(s) 6084 ns 5250 ns 1.16
groupnorm(2, act=identity, affine=false)(4 x 32)/forward/GPU/CUDA 233944.5 ns 190346.5 ns 1.23
groupnorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/2 thread(s) 7416 ns 7250 ns 1.02
groupnorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/4 thread(s) 7334 ns 7125 ns 1.03
groupnorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/8 thread(s) 7834 ns 7250 ns 1.08
groupnorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/1 thread(s) 7417 ns 7083 ns 1.05
groupnorm(2, act=identity, affine=false)(4 x 32)/zygote/GPU/CUDA 800597 ns 735734 ns 1.09
bias_activation(2, act=gelu)(2 x 128)/forward/CPU/2 thread(s) 2209 ns 2167 ns 1.02
bias_activation(2, act=gelu)(2 x 128)/forward/CPU/4 thread(s) 2292 ns 2208 ns 1.04
bias_activation(2, act=gelu)(2 x 128)/forward/CPU/8 thread(s) 2208 ns 2209 ns 1.00
bias_activation(2, act=gelu)(2 x 128)/forward/CPU/1 thread(s) 2250 ns 2417 ns 0.93
bias_activation(2, act=gelu)(2 x 128)/forward/GPU/CUDA 17989 ns 18111 ns 0.99
bias_activation(2, act=gelu)(2 x 128)/zygote/CPU/2 thread(s) 6541.5 ns 6750 ns 0.97
bias_activation(2, act=gelu)(2 x 128)/zygote/CPU/4 thread(s) 6542 ns 6375 ns 1.03
bias_activation(2, act=gelu)(2 x 128)/zygote/CPU/8 thread(s) 7125 ns 6625 ns 1.08
bias_activation(2, act=gelu)(2 x 128)/zygote/CPU/1 thread(s) 6750 ns 6625 ns 1.02
bias_activation(2, act=gelu)(2 x 128)/zygote/GPU/CUDA 330052 ns 306022.5 ns 1.08
bias_activation(512, act=tanh)(512 x 128)/forward/CPU/2 thread(s) 751958.5 ns 751583.5 ns 1.00
bias_activation(512, act=tanh)(512 x 128)/forward/CPU/4 thread(s) 746604.5 ns 748875 ns 1.00
bias_activation(512, act=tanh)(512 x 128)/forward/CPU/8 thread(s) 749167 ns 746812.5 ns 1.00
bias_activation(512, act=tanh)(512 x 128)/forward/CPU/1 thread(s) 748959 ns 748500 ns 1.00
bias_activation(512, act=tanh)(512 x 128)/forward/GPU/CUDA 21090 ns 21064 ns 1.00
bias_activation(512, act=tanh)(512 x 128)/zygote/CPU/2 thread(s) 791292 ns 791834 ns 1.00
bias_activation(512, act=tanh)(512 x 128)/zygote/CPU/4 thread(s) 792333 ns 788667 ns 1.00
bias_activation(512, act=tanh)(512 x 128)/zygote/CPU/8 thread(s) 773291 ns 786646.5 ns 0.98
bias_activation(512, act=tanh)(512 x 128)/zygote/CPU/1 thread(s) 792291.5 ns 792479 ns 1.00
bias_activation(512, act=tanh)(512 x 128)/zygote/GPU/CUDA 299003.5 ns 294710 ns 1.01
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 7291 ns 7417 ns 0.98
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 5917 ns 5208 ns 1.14
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 6083 ns 6000 ns 1.01
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 10791 ns 10084 ns 1.07
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/CUDA 33088.5 ns 33108.5 ns 1.00
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 233333 ns 228645.5 ns 1.02
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 229479 ns 231416 ns 0.99
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 269542 ns 271625 ns 0.99
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 220958 ns 225958 ns 0.98
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 359587 ns 351410 ns 1.02
layernorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/2 thread(s) 10625 ns 10292 ns 1.03
layernorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/4 thread(s) 10375 ns 10084 ns 1.03
layernorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/8 thread(s) 10958 ns 11166 ns 0.98
layernorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/1 thread(s) 10959 ns 10000 ns 1.10
layernorm(2, act=gelu, affine=false)(32 x 32)/forward/GPU/CUDA 249563.5 ns 209596.5 ns 1.19
layernorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/2 thread(s) 25042 ns 24709 ns 1.01
layernorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/4 thread(s) 24625 ns 24333 ns 1.01
layernorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/8 thread(s) 25375 ns 24291 ns 1.04
layernorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/1 thread(s) 25250 ns 24437.5 ns 1.03
layernorm(2, act=gelu, affine=false)(32 x 32)/zygote/GPU/CUDA 1114585 ns 1037550 ns 1.07
Conv((3, 3), 32 => 32, gelu)(64 x 64 x 32 x 128)/forward/CPU/2 thread(s) 106488708 ns 107199542 ns 0.99
Conv((3, 3), 32 => 32, gelu)(64 x 64 x 32 x 128)/forward/CPU/4 thread(s) 117008645.5 ns 126347334 ns 0.93
Conv((3, 3), 32 => 32, gelu)(64 x 64 x 32 x 128)/forward/CPU/8 thread(s) 120350584 ns 120468625 ns 1.00
Conv((3, 3), 32 => 32, gelu)(64 x 64 x 32 x 128)/forward/CPU/1 thread(s) 118085396 ns 117762042 ns 1.00
Conv((3, 3), 32 => 32, gelu)(64 x 64 x 32 x 128)/forward/GPU/CUDA 2661446 ns 2637816 ns 1.01
Conv((3, 3), 32 => 32, gelu)(64 x 64 x 32 x 128)/zygote/CPU/2 thread(s) 393399750 ns 393813416 ns 1.00
Conv((3, 3), 32 => 32, gelu)(64 x 64 x 32 x 128)/zygote/CPU/4 thread(s) 368428125 ns 380007916 ns 0.97
Conv((3, 3), 32 => 32, gelu)(64 x 64 x 32 x 128)/zygote/CPU/8 thread(s) 359138458 ns 355873375 ns 1.01
Conv((3, 3), 32 => 32, gelu)(64 x 64 x 32 x 128)/zygote/CPU/1 thread(s) 486814000 ns 484550250 ns 1.00
Conv((3, 3), 32 => 32, gelu)(64 x 64 x 32 x 128)/zygote/GPU/CUDA 15211152 ns 15152772.5 ns 1.00
Conv((3, 3), 32 => 32, gelu)(64 x 64 x 32 x 128)/enzyme/CPU/2 thread(s) 759103375 ns 939763875 ns 0.81
Conv((3, 3), 32 => 32, gelu)(64 x 64 x 32 x 128)/enzyme/CPU/4 thread(s) 755373708 ns 777743792 ns 0.97
Conv((3, 3), 32 => 32, gelu)(64 x 64 x 32 x 128)/enzyme/CPU/8 thread(s) 744752604 ns 745742833 ns 1.00
Conv((3, 3), 32 => 32, gelu)(64 x 64 x 32 x 128)/enzyme/CPU/1 thread(s) 959286729.5 ns 767071771.5 ns 1.25
groupnorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/2 thread(s) 6896 ns 7167 ns 0.96
groupnorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/4 thread(s) 7791 ns 6833 ns 1.14
groupnorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/8 thread(s) 8250 ns 8458 ns 0.98
groupnorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/1 thread(s) 7583 ns 7562.5 ns 1.00
groupnorm(2, act=identity, affine=true)(32 x 32)/forward/GPU/CUDA 240721 ns 228024 ns 1.06
groupnorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/2 thread(s) 14458 ns 14250 ns 1.01
groupnorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/4 thread(s) 14208.5 ns 14042 ns 1.01
groupnorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/8 thread(s) 14750 ns 13875 ns 1.06
groupnorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/1 thread(s) 14312.5 ns 13333 ns 1.07
groupnorm(2, act=identity, affine=true)(32 x 32)/zygote/GPU/CUDA 1072384 ns 1000779 ns 1.07
groupnorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/2 thread(s) 6292 ns 6167 ns 1.02
groupnorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/4 thread(s) 6125 ns 6125 ns 1
groupnorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/8 thread(s) 7291 ns 8250 ns 0.88
groupnorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/1 thread(s) 6458 ns 5604.5 ns 1.15
groupnorm(2, act=identity, affine=false)(32 x 32)/forward/GPU/CUDA 234548 ns 214266.5 ns 1.09
groupnorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/2 thread(s) 12584 ns 12417 ns 1.01
groupnorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/4 thread(s) 12625 ns 12542 ns 1.01
groupnorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/8 thread(s) 12959 ns 12875 ns 1.01
groupnorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/1 thread(s) 12583 ns 12541 ns 1.00
groupnorm(2, act=identity, affine=false)(32 x 32)/zygote/GPU/CUDA 784420 ns 724930 ns 1.08
batchedmm(2, Bsize=128)/forward/CPU/2 thread(s) 347708 ns 349208 ns 1.00
batchedmm(2, Bsize=128)/forward/CPU/4 thread(s) 386916.5 ns 326145.5 ns 1.19
batchedmm(2, Bsize=128)/forward/CPU/8 thread(s) 398834 ns 393333 ns 1.01
batchedmm(2, Bsize=128)/forward/CPU/1 thread(s) 292375 ns 314271 ns 0.93
batchedmm(2, Bsize=128)/forward/GPU/CUDA 16947 ns 17228 ns 0.98
batchedmm(2, Bsize=128)/zygote/CPU/2 thread(s) 708249.5 ns 706500 ns 1.00
batchedmm(2, Bsize=128)/zygote/CPU/4 thread(s) 746000 ns 739437.5 ns 1.01
batchedmm(2, Bsize=128)/zygote/CPU/8 thread(s) 1025229 ns 1020354 ns 1.00
batchedmm(2, Bsize=128)/zygote/CPU/1 thread(s) 652416.5 ns 658541 ns 0.99
batchedmm(2, Bsize=128)/zygote/GPU/CUDA 199954 ns 198297 ns 1.01
batchnorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/2 thread(s) 375 ns 375 ns 1
batchnorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/4 thread(s) 416 ns 375 ns 1.11
batchnorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/8 thread(s) 375 ns 375 ns 1
batchnorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/1 thread(s) 333 ns 292 ns 1.14
batchnorm(2, act=identity, affine=true)(4 x 32)/forward/GPU/CUDA 23200 ns 23935.5 ns 0.97
batchnorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/2 thread(s) 6458 ns 6500 ns 0.99
batchnorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/4 thread(s) 6417 ns 6584 ns 0.97
batchnorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/8 thread(s) 6750 ns 6584 ns 1.03
batchnorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/1 thread(s) 6542 ns 6250 ns 1.05
batchnorm(2, act=identity, affine=true)(4 x 32)/zygote/GPU/CUDA 238715 ns 240134 ns 0.99
batchnorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/2 thread(s) 5958 ns 5875 ns 1.01
batchnorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/4 thread(s) 5917 ns 5917 ns 1
batchnorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/8 thread(s) 6000 ns 5917 ns 1.01
batchnorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/1 thread(s) 5917 ns 5834 ns 1.01
batchnorm(2, act=gelu, affine=true)(32 x 32)/forward/GPU/CUDA 24219 ns 24721 ns 0.98
batchnorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/2 thread(s) 21250 ns 21500 ns 0.99
batchnorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/4 thread(s) 20875 ns 21333 ns 0.98
batchnorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/8 thread(s) 21417 ns 21292 ns 1.01
batchnorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/1 thread(s) 21896 ns 21208 ns 1.03
batchnorm(2, act=gelu, affine=true)(32 x 32)/zygote/GPU/CUDA 261648 ns 262379.5 ns 1.00
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 145458 ns 144229.5 ns 1.01
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 147521 ns 144042 ns 1.02
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 147770.5 ns 147292 ns 1.00
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 147458.5 ns 145833 ns 1.01
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/CUDA 167051 ns 167351 ns 1.00
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 1322500 ns 1320395.5 ns 1.00
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 1320041 ns 1358771 ns 0.97
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 1325833 ns 1324084 ns 1.00
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 1391458 ns 1329333.5 ns 1.05
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 1346456 ns 1268788 ns 1.06
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 22250 ns 24083 ns 0.92
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 24750 ns 22375 ns 1.11
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 23416 ns 25104.5 ns 0.93
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 22396 ns 21917 ns 1.02
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/CUDA 353387 ns 280502 ns 1.26
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 178709 ns 131646 ns 1.36
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 118687.5 ns 121334 ns 0.98
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 127459 ns 177687.5 ns 0.72
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 134041.5 ns 130209 ns 1.03
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 1464281 ns 1380349 ns 1.06
batchnorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/2 thread(s) 375 ns 416 ns 0.90
batchnorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/4 thread(s) 375 ns 375 ns 1
batchnorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/8 thread(s) 375 ns 416 ns 0.90
batchnorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/1 thread(s) 333 ns 292 ns 1.14
batchnorm(2, act=relu, affine=true)(4 x 32)/forward/GPU/CUDA 22942 ns 23199 ns 0.99
batchnorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/2 thread(s) 6459 ns 6708 ns 0.96
batchnorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/4 thread(s) 6500 ns 7083 ns 0.92
batchnorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/8 thread(s) 6666 ns 6708 ns 0.99
batchnorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/1 thread(s) 6750 ns 6083 ns 1.11
batchnorm(2, act=relu, affine=true)(4 x 32)/zygote/GPU/CUDA 255510 ns 258254.5 ns 0.99
layernorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/2 thread(s) 5000 ns 5042 ns 0.99
layernorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/4 thread(s) 4729.5 ns 4500 ns 1.05
layernorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/8 thread(s) 5292 ns 4917 ns 1.08
layernorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/1 thread(s) 4709 ns 4917 ns 0.96
layernorm(2, act=gelu, affine=true)(4 x 32)/forward/GPU/CUDA 256450 ns 243109 ns 1.05
layernorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/2 thread(s) 10042 ns 10375 ns 0.97
layernorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/4 thread(s) 10167 ns 10042 ns 1.01
layernorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/8 thread(s) 10417 ns 10125 ns 1.03
layernorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/1 thread(s) 10125 ns 10167 ns 1.00
layernorm(2, act=gelu, affine=true)(4 x 32)/zygote/GPU/CUDA 1348843.5 ns 1338362 ns 1.01
dense(2, bias=false, act=gelu)(2 x 128)/forward/CPU/2 thread(s) 1584 ns 1667 ns 0.95
dense(2, bias=false, act=gelu)(2 x 128)/forward/CPU/4 thread(s) 1625 ns 1625 ns 1
dense(2, bias=false, act=gelu)(2 x 128)/forward/CPU/8 thread(s) 1667 ns 1625 ns 1.03
dense(2, bias=false, act=gelu)(2 x 128)/forward/CPU/1 thread(s) 1667 ns 1542 ns 1.08
dense(2, bias=false, act=gelu)(2 x 128)/forward/GPU/CUDA 22876 ns 23629 ns 0.97
dense(2, bias=false, act=gelu)(2 x 128)/zygote/CPU/2 thread(s) 5625 ns 5875 ns 0.96
dense(2, bias=false, act=gelu)(2 x 128)/zygote/CPU/4 thread(s) 5625 ns 5666 ns 0.99
dense(2, bias=false, act=gelu)(2 x 128)/zygote/CPU/8 thread(s) 6041 ns 5958 ns 1.01
dense(2, bias=false, act=gelu)(2 x 128)/zygote/CPU/1 thread(s) 5708 ns 5625 ns 1.01
dense(2, bias=false, act=gelu)(2 x 128)/zygote/GPU/CUDA 272214.5 ns 278503 ns 0.98
Conv((3, 3), 2 => 2, gelu)(64 x 64 x 2 x 128)/forward/CPU/2 thread(s) 6888875 ns 6825854.5 ns 1.01
Conv((3, 3), 2 => 2, gelu)(64 x 64 x 2 x 128)/forward/CPU/4 thread(s) 6384792 ns 6429125 ns 0.99
Conv((3, 3), 2 => 2, gelu)(64 x 64 x 2 x 128)/forward/CPU/8 thread(s) 6514708.5 ns 6541187.5 ns 1.00
Conv((3, 3), 2 => 2, gelu)(64 x 64 x 2 x 128)/forward/CPU/1 thread(s) 7555583 ns 7656375 ns 0.99
Conv((3, 3), 2 => 2, gelu)(64 x 64 x 2 x 128)/forward/GPU/CUDA 214320 ns 215102 ns 1.00
Conv((3, 3), 2 => 2, gelu)(64 x 64 x 2 x 128)/zygote/CPU/2 thread(s) 24087271 ns 24080834 ns 1.00
Conv((3, 3), 2 => 2, gelu)(64 x 64 x 2 x 128)/zygote/CPU/4 thread(s) 21278062.5 ns 21338208 ns 1.00
Conv((3, 3), 2 => 2, gelu)(64 x 64 x 2 x 128)/zygote/CPU/8 thread(s) 21040583 ns 21079333 ns 1.00
Conv((3, 3), 2 => 2, gelu)(64 x 64 x 2 x 128)/zygote/CPU/1 thread(s) 29921333 ns 29660375 ns 1.01
Conv((3, 3), 2 => 2, gelu)(64 x 64 x 2 x 128)/zygote/GPU/CUDA 2106395 ns 2111008 ns 1.00
Conv((3, 3), 2 => 2, gelu)(64 x 64 x 2 x 128)/enzyme/CPU/2 thread(s) 37396292 ns 48564000 ns 0.77
Conv((3, 3), 2 => 2, gelu)(64 x 64 x 2 x 128)/enzyme/CPU/4 thread(s) 45619104.5 ns 45595770.5 ns 1.00
Conv((3, 3), 2 => 2, gelu)(64 x 64 x 2 x 128)/enzyme/CPU/8 thread(s) 45717854 ns 45721854 ns 1.00
Conv((3, 3), 2 => 2, gelu)(64 x 64 x 2 x 128)/enzyme/CPU/1 thread(s) 49514208 ns 38038271 ns 1.30
groupnorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/2 thread(s) 6208 ns 5687.5 ns 1.09
groupnorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/4 thread(s) 6333 ns 6041 ns 1.05
groupnorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/8 thread(s) 6459 ns 6917 ns 0.93
groupnorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/1 thread(s) 6083 ns 5375 ns 1.13
groupnorm(2, act=identity, affine=true)(4 x 32)/forward/GPU/CUDA 236136 ns 239823 ns 0.98
groupnorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/2 thread(s) 9125 ns 8291 ns 1.10
groupnorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/4 thread(s) 8666 ns 8500 ns 1.02
groupnorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/8 thread(s) 8416 ns 8750 ns 0.96
groupnorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/1 thread(s) 8583 ns 8750 ns 0.98
groupnorm(2, act=identity, affine=true)(4 x 32)/zygote/GPU/CUDA 1059780 ns 1069933 ns 0.99
lenet(28, 28, 1, 128)/forward/CPU/2 thread(s) 1497208 ns 1555021 ns 0.96
lenet(28, 28, 1, 128)/forward/CPU/4 thread(s) 1271146 ns 1235375.5 ns 1.03
lenet(28, 28, 1, 128)/forward/CPU/8 thread(s) 1623333 ns 1618375 ns 1.00
lenet(28, 28, 1, 128)/forward/CPU/1 thread(s) 2143312.5 ns 2095209 ns 1.02
lenet(28, 28, 1, 128)/forward/GPU/CUDA 273613.5 ns 285020 ns 0.96
lenet(28, 28, 1, 128)/zygote/CPU/2 thread(s) 7900125 ns 7898542 ns 1.00
lenet(28, 28, 1, 128)/zygote/CPU/4 thread(s) 6605479 ns 6630645.5 ns 1.00
lenet(28, 28, 1, 128)/zygote/CPU/8 thread(s) 7156416.5 ns 7200958 ns 0.99
lenet(28, 28, 1, 128)/zygote/CPU/1 thread(s) 10528062.5 ns 10372854.5 ns 1.01
lenet(28, 28, 1, 128)/zygote/GPU/CUDA 1850752 ns 1904820 ns 0.97
batchedmm(128, Bsize=4)/forward/CPU/2 thread(s) 343000 ns 342000 ns 1.00
batchedmm(128, Bsize=4)/forward/CPU/4 thread(s) 349166.5 ns 323833 ns 1.08
batchedmm(128, Bsize=4)/forward/CPU/8 thread(s) 383250 ns 382208 ns 1.00
batchedmm(128, Bsize=4)/forward/CPU/1 thread(s) 325438 ns 342042 ns 0.95
batchedmm(128, Bsize=4)/forward/GPU/CUDA 46572 ns 43080 ns 1.08
batchedmm(128, Bsize=4)/zygote/CPU/2 thread(s) 746124.5 ns 725958 ns 1.03
batchedmm(128, Bsize=4)/zygote/CPU/4 thread(s) 795499.5 ns 782938 ns 1.02
batchedmm(128, Bsize=4)/zygote/CPU/8 thread(s) 1076208.5 ns 1067750 ns 1.01
batchedmm(128, Bsize=4)/zygote/CPU/1 thread(s) 753291.5 ns 737041.5 ns 1.02
batchedmm(128, Bsize=4)/zygote/GPU/CUDA 309766 ns 314201.5 ns 0.99
dense(512, bias=true, act=identity)(512 x 128)/forward/CPU/2 thread(s) 397375 ns 397583 ns 1.00
dense(512, bias=true, act=identity)(512 x 128)/forward/CPU/4 thread(s) 287916 ns 211916 ns 1.36
dense(512, bias=true, act=identity)(512 x 128)/forward/CPU/8 thread(s) 288000 ns 288208 ns 1.00
dense(512, bias=true, act=identity)(512 x 128)/forward/CPU/1 thread(s) 749125 ns 750834 ns 1.00
dense(512, bias=true, act=identity)(512 x 128)/forward/GPU/CUDA 44192 ns 44587.5 ns 0.99
dense(512, bias=true, act=identity)(512 x 128)/zygote/CPU/2 thread(s) 666145.5 ns 670500 ns 0.99
dense(512, bias=true, act=identity)(512 x 128)/zygote/CPU/4 thread(s) 531062.5 ns 470708 ns 1.13
dense(512, bias=true, act=identity)(512 x 128)/zygote/CPU/8 thread(s) 529625 ns 531792 ns 1.00
dense(512, bias=true, act=identity)(512 x 128)/zygote/CPU/1 thread(s) 975062.5 ns 974083 ns 1.00
dense(512, bias=true, act=identity)(512 x 128)/zygote/GPU/CUDA 188202 ns 192970 ns 0.98
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 646708 ns 651646 ns 0.99
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 543166.5 ns 644458.5 ns 0.84
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 654229 ns 659271 ns 0.99
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 659479 ns 645333 ns 1.02
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/CUDA 132313.5 ns 132814 ns 1.00
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 2450208 ns 2440750 ns 1.00
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 2447833 ns 2525916.5 ns 0.97
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 2404020.5 ns 2439124.5 ns 0.99
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 2562667 ns 2464750 ns 1.04
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 1598744 ns 1349058.5 ns 1.19
batchedmm(2, Bsize=32)/forward/CPU/2 thread(s) 347208 ns 344292 ns 1.01
batchedmm(2, Bsize=32)/forward/CPU/4 thread(s) 347542 ns 326104 ns 1.07
batchedmm(2, Bsize=32)/forward/CPU/8 thread(s) 400125 ns 393875 ns 1.02
batchedmm(2, Bsize=32)/forward/CPU/1 thread(s) 291604 ns 312896 ns 0.93
batchedmm(2, Bsize=32)/forward/GPU/CUDA 16522 ns 16925 ns 0.98
batchedmm(2, Bsize=32)/zygote/CPU/2 thread(s) 706875 ns 709938 ns 1.00
batchedmm(2, Bsize=32)/zygote/CPU/4 thread(s) 734333 ns 739917 ns 0.99
batchedmm(2, Bsize=32)/zygote/CPU/8 thread(s) 1028542 ns 1021708 ns 1.01
batchedmm(2, Bsize=32)/zygote/CPU/1 thread(s) 647750 ns 650083.5 ns 1.00
batchedmm(2, Bsize=32)/zygote/GPU/CUDA 199294.5 ns 202873.5 ns 0.98
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 1458584 ns 1458625 ns 1.00
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 1498042 ns 1490666 ns 1.00
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 1499666 ns 1498417 ns 1.00
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 1444167 ns 1436416 ns 1.01
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/CUDA 40454 ns 41016 ns 0.99
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 5120438 ns 5105458 ns 1.00
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 5292292 ns 5294583 ns 1.00
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 5286000 ns 5292167 ns 1.00
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 5017937.5 ns 5007208 ns 1.00
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 195965.5 ns 201135.5 ns 0.97
dense(32, bias=false, act=identity)(32 x 128)/forward/CPU/2 thread(s) 3667 ns 3708 ns 0.99
dense(32, bias=false, act=identity)(32 x 128)/forward/CPU/4 thread(s) 3667 ns 3750 ns 0.98
dense(32, bias=false, act=identity)(32 x 128)/forward/CPU/8 thread(s) 3709 ns 3708 ns 1.00
dense(32, bias=false, act=identity)(32 x 128)/forward/CPU/1 thread(s) 3709 ns 3667 ns 1.01
dense(32, bias=false, act=identity)(32 x 128)/forward/GPU/CUDA 32802 ns 33479.5 ns 0.98
dense(32, bias=false, act=identity)(32 x 128)/zygote/CPU/2 thread(s) 15125 ns 15292 ns 0.99
dense(32, bias=false, act=identity)(32 x 128)/zygote/CPU/4 thread(s) 15292 ns 15125 ns 1.01
dense(32, bias=false, act=identity)(32 x 128)/zygote/CPU/8 thread(s) 15417 ns 15291 ns 1.01
dense(32, bias=false, act=identity)(32 x 128)/zygote/CPU/1 thread(s) 14875 ns 15042 ns 0.99
dense(32, bias=false, act=identity)(32 x 128)/zygote/GPU/CUDA 372915.5 ns 381756.5 ns 0.98
dense(512, bias=false, act=relu)(512 x 128)/forward/CPU/2 thread(s) 70917 ns 71209 ns 1.00
dense(512, bias=false, act=relu)(512 x 128)/forward/CPU/4 thread(s) 71250 ns 71250 ns 1
dense(512, bias=false, act=relu)(512 x 128)/forward/CPU/8 thread(s) 70916 ns 71125 ns 1.00
dense(512, bias=false, act=relu)(512 x 128)/forward/CPU/1 thread(s) 71375 ns 70062.5 ns 1.02
dense(512, bias=false, act=relu)(512 x 128)/forward/GPU/CUDA 112608 ns 114111 ns 0.99
dense(512, bias=false, act=relu)(512 x 128)/zygote/CPU/2 thread(s) 317750 ns 318250 ns 1.00
dense(512, bias=false, act=relu)(512 x 128)/zygote/CPU/4 thread(s) 318417 ns 329625 ns 0.97
dense(512, bias=false, act=relu)(512 x 128)/zygote/CPU/8 thread(s) 318375 ns 318708 ns 1.00
dense(512, bias=false, act=relu)(512 x 128)/zygote/CPU/1 thread(s) 327667 ns 317958 ns 1.03
dense(512, bias=false, act=relu)(512 x 128)/zygote/GPU/CUDA 192232 ns 197229.5 ns 0.97
batchnorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/2 thread(s) 1000 ns 1083 ns 0.92
batchnorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/4 thread(s) 1083 ns 1083 ns 1
batchnorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/8 thread(s) 1083 ns 1083 ns 1
batchnorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/1 thread(s) 1083 ns 1000 ns 1.08
batchnorm(2, act=gelu, affine=true)(4 x 32)/forward/GPU/CUDA 23208 ns 24163 ns 0.96
batchnorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/2 thread(s) 8000 ns 8167 ns 0.98
batchnorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/4 thread(s) 8042 ns 8041 ns 1.00
batchnorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/8 thread(s) 8250 ns 8667 ns 0.95
batchnorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/1 thread(s) 8250 ns 7625 ns 1.08
batchnorm(2, act=gelu, affine=true)(4 x 32)/zygote/GPU/CUDA 259321 ns 264271.5 ns 0.98
batchedmm(128, Bsize=32)/forward/CPU/2 thread(s) 468417 ns 464166.5 ns 1.01
batchedmm(128, Bsize=32)/forward/CPU/4 thread(s) 479458 ns 448167 ns 1.07
batchedmm(128, Bsize=32)/forward/CPU/8 thread(s) 555416 ns 553459 ns 1.00
batchedmm(128, Bsize=32)/forward/CPU/1 thread(s) 544792 ns 548917 ns 0.99
batchedmm(128, Bsize=32)/forward/GPU/CUDA 128776.5 ns 129241.5 ns 1.00
batchedmm(128, Bsize=32)/zygote/CPU/2 thread(s) 1386166.5 ns 1380229 ns 1.00
batchedmm(128, Bsize=32)/zygote/CPU/4 thread(s) 1391187.5 ns 1393229 ns 1.00
batchedmm(128, Bsize=32)/zygote/CPU/8 thread(s) 1623687.5 ns 1619541 ns 1.00
batchedmm(128, Bsize=32)/zygote/CPU/1 thread(s) 1644333.5 ns 1590270.5 ns 1.03
batchedmm(128, Bsize=32)/zygote/GPU/CUDA 275740 ns 277974 ns 0.99
batchnorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/2 thread(s) 333 ns 416 ns 0.80
batchnorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/4 thread(s) 375 ns 375 ns 1
batchnorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/8 thread(s) 416 ns 375 ns 1.11
batchnorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/1 thread(s) 333 ns 333 ns 1
batchnorm(2, act=relu, affine=false)(4 x 32)/forward/GPU/CUDA 31924 ns 32417 ns 0.98
batchnorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/2 thread(s) 5958 ns 6375 ns 0.93
batchnorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/4 thread(s) 6167 ns 6500 ns 0.95
batchnorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/8 thread(s) 6459 ns 6542 ns 0.99
batchnorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/1 thread(s) 6166 ns 5958 ns 1.03
batchnorm(2, act=relu, affine=false)(4 x 32)/zygote/GPU/CUDA 262594 ns 267135 ns 0.98
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 1733625 ns 1723834 ns 1.01
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 1722729.5 ns 1731042 ns 1.00
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 1729958 ns 1722458 ns 1.00
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 1727000 ns 1727375 ns 1.00
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/CUDA 168805 ns 168945.5 ns 1.00
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 4353667 ns 4366646 ns 1.00
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 4366916.5 ns 4396958.5 ns 0.99
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 4362042 ns 4374416.5 ns 1.00
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 4429395.5 ns 4349500 ns 1.02
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 1264129.5 ns 1192401 ns 1.06
bias_activation(512, act=relu)(512 x 128)/forward/CPU/2 thread(s) 6959 ns 6750 ns 1.03
bias_activation(512, act=relu)(512 x 128)/forward/CPU/4 thread(s) 6708 ns 6541 ns 1.03
bias_activation(512, act=relu)(512 x 128)/forward/CPU/8 thread(s) 7000 ns 7292 ns 0.96
bias_activation(512, act=relu)(512 x 128)/forward/CPU/1 thread(s) 6833 ns 6542 ns 1.04
bias_activation(512, act=relu)(512 x 128)/forward/GPU/CUDA 20795 ns 20406 ns 1.02
bias_activation(512, act=relu)(512 x 128)/zygote/CPU/2 thread(s) 51500 ns 81771 ns 0.63
bias_activation(512, act=relu)(512 x 128)/zygote/CPU/4 thread(s) 38042 ns 49083 ns 0.78
bias_activation(512, act=relu)(512 x 128)/zygote/CPU/8 thread(s) 47209 ns 72271 ns 0.65
bias_activation(512, act=relu)(512 x 128)/zygote/CPU/1 thread(s) 48666.5 ns 51334 ns 0.95
bias_activation(512, act=relu)(512 x 128)/zygote/GPU/CUDA 295172.5 ns 213340.5 ns 1.38
batchedmm(2, Bsize=512)/forward/CPU/2 thread(s) 355084 ns 354167 ns 1.00
batchedmm(2, Bsize=512)/forward/CPU/4 thread(s) 350583 ns 329541.5 ns 1.06
batchedmm(2, Bsize=512)/forward/CPU/8 thread(s) 423208.5 ns 401083 ns 1.06
batchedmm(2, Bsize=512)/forward/CPU/1 thread(s) 295000 ns 321771 ns 0.92
batchedmm(2, Bsize=512)/forward/GPU/CUDA 18329 ns 18865 ns 0.97
batchedmm(2, Bsize=512)/zygote/CPU/2 thread(s) 718562.5 ns 722646.5 ns 0.99
batchedmm(2, Bsize=512)/zygote/CPU/4 thread(s) 744125 ns 740500 ns 1.00
batchedmm(2, Bsize=512)/zygote/CPU/8 thread(s) 1031500 ns 1030625 ns 1.00
batchedmm(2, Bsize=512)/zygote/CPU/1 thread(s) 672625 ns 673875 ns 1.00
batchedmm(2, Bsize=512)/zygote/GPU/CUDA 347666.5 ns 350549.5 ns 0.99
dense(512, bias=true, act=relu)(512 x 128)/forward/CPU/2 thread(s) 75042 ns 75250 ns 1.00
dense(512, bias=true, act=relu)(512 x 128)/forward/CPU/4 thread(s) 75250 ns 75250 ns 1
dense(512, bias=true, act=relu)(512 x 128)/forward/CPU/8 thread(s) 75333 ns 75458 ns 1.00
dense(512, bias=true, act=relu)(512 x 128)/forward/CPU/1 thread(s) 75584 ns 75042 ns 1.01
dense(512, bias=true, act=relu)(512 x 128)/forward/GPU/CUDA 46603 ns 47823 ns 0.97
dense(512, bias=true, act=relu)(512 x 128)/zygote/CPU/2 thread(s) 324708 ns 324625 ns 1.00
dense(512, bias=true, act=relu)(512 x 128)/zygote/CPU/4 thread(s) 327334 ns 341667 ns 0.96
dense(512, bias=true, act=relu)(512 x 128)/zygote/CPU/8 thread(s) 324375 ns 324250 ns 1.00
dense(512, bias=true, act=relu)(512 x 128)/zygote/CPU/1 thread(s) 334062.5 ns 330833 ns 1.01
dense(512, bias=true, act=relu)(512 x 128)/zygote/GPU/CUDA 207370 ns 216202 ns 0.96
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 1485208 ns 1485500 ns 1.00
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 1526250 ns 1517334 ns 1.01
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 1526625 ns 1526000 ns 1.00
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 1467250 ns 1463167 ns 1.00
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/CUDA 51906 ns 53576 ns 0.97
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 5116396.5 ns 5124354.5 ns 1.00
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 5284312.5 ns 5278542 ns 1.00
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 5277167 ns 5287917 ns 1.00
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 5025562.5 ns 4986958 ns 1.01
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 203896.5 ns 209445 ns 0.97
dense(32, bias=true, act=gelu)(32 x 128)/forward/CPU/2 thread(s) 28333 ns 28250 ns 1.00
dense(32, bias=true, act=gelu)(32 x 128)/forward/CPU/4 thread(s) 28333 ns 28250 ns 1.00
dense(32, bias=true, act=gelu)(32 x 128)/forward/CPU/8 thread(s) 28625 ns 28208 ns 1.01
dense(32, bias=true, act=gelu)(32 x 128)/forward/CPU/1 thread(s) 28334 ns 28291 ns 1.00
dense(32, bias=true, act=gelu)(32 x 128)/forward/GPU/CUDA 24422 ns 25452 ns 0.96
dense(32, bias=true, act=gelu)(32 x 128)/zygote/CPU/2 thread(s) 66250 ns 66333 ns 1.00
dense(32, bias=true, act=gelu)(32 x 128)/zygote/CPU/4 thread(s) 66250 ns 66250 ns 1
dense(32, bias=true, act=gelu)(32 x 128)/zygote/CPU/8 thread(s) 66250 ns 66250 ns 1
dense(32, bias=true, act=gelu)(32 x 128)/zygote/CPU/1 thread(s) 66375 ns 66333 ns 1.00
dense(32, bias=true, act=gelu)(32 x 128)/zygote/GPU/CUDA 519781.5 ns 539628 ns 0.96
mlp7layer_bn(tanh)(32 x 256)/forward/CPU/2 thread(s) 1501250 ns 1483687.5 ns 1.01
mlp7layer_bn(tanh)(32 x 256)/forward/CPU/4 thread(s) 1125791 ns 859791.5 ns 1.31
mlp7layer_bn(tanh)(32 x 256)/forward/CPU/8 thread(s) 1125104.5 ns 1143208 ns 0.98
mlp7layer_bn(tanh)(32 x 256)/forward/CPU/1 thread(s) 2259459 ns 2247229.5 ns 1.01
mlp7layer_bn(tanh)(32 x 256)/forward/GPU/CUDA 571991 ns 585407 ns 0.98
mlp7layer_bn(tanh)(32 x 256)/zygote/CPU/2 thread(s) 3070000 ns 3085000 ns 1.00
mlp7layer_bn(tanh)(32 x 256)/zygote/CPU/4 thread(s) 2775000 ns 2591208 ns 1.07
mlp7layer_bn(tanh)(32 x 256)/zygote/CPU/8 thread(s) 2736500 ns 2737895.5 ns 1.00
mlp7layer_bn(tanh)(32 x 256)/zygote/CPU/1 thread(s) 3899292 ns 3816250 ns 1.02
mlp7layer_bn(tanh)(32 x 256)/zygote/GPU/CUDA 2055229 ns 2035890 ns 1.01
mlp7layer_bn(tanh)(32 x 256)/enzyme/CPU/2 thread(s) 8838896 ns 8818187.5 ns 1.00
mlp7layer_bn(tanh)(32 x 256)/enzyme/CPU/4 thread(s) 8809083.5 ns 8953500 ns 0.98
mlp7layer_bn(tanh)(32 x 256)/enzyme/CPU/8 thread(s) 8782709 ns 8776854 ns 1.00
mlp7layer_bn(tanh)(32 x 256)/enzyme/CPU/1 thread(s) 6483958.5 ns 6365041 ns 1.02
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 80583 ns 80791 ns 1.00
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 81334 ns 79875 ns 1.02
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 83645.5 ns 82792 ns 1.01
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 136500 ns 80708 ns 1.69
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/CUDA 192157 ns 194256.5 ns 0.99
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 2012958 ns 2013375 ns 1.00
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 2009583 ns 1748958 ns 1.15
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 2015916.5 ns 2018500 ns 1.00
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 2051000 ns 2022750 ns 1.01
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 803108 ns 809328 ns 0.99

This comment was automatically generated by workflow using github-action-benchmark.

Please sign in to comment.