-
Notifications
You must be signed in to change notification settings - Fork 62
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
refactor: make `LossFunctions` an optional dep (#976)
* refactor: make LossFunctions an optional dep
* feat: add custom derivative fast paths
* test: more tests got fixed
- Loading branch information
Showing 5 changed files with 180 additions and 57 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,7 +1,7 @@ | ||
name = "Lux" | ||
uuid = "b2108857-7c20-44ae-9111-449ecde12c47" | ||
authors = ["Avik Pal <[email protected]> and contributors"] | ||
version = "1.1.0" | ||
version = "1.2.0-DEV" | ||
|
||
[deps] | ||
ADTypes = "47edcb42-4c32-4615-8424-f2b9edc5f35b" | ||
|
@@ -18,7 +18,6 @@ ForwardDiff = "f6369f11-7733-5829-9624-2563aa707210" | |
Functors = "d9f16b24-f501-4c13-a1f2-28368ffc5196" | ||
GPUArraysCore = "46192b85-c4d5-4398-a991-12ede77f4527" | ||
LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" | ||
LossFunctions = "30fc2ffe-d236-52d8-8643-a9d8f7c094a7" | ||
LuxCore = "bb33d45b-7691-41d6-9220-0943567d0623" | ||
LuxLib = "82251201-b29d-42c6-8e01-566dec8acb11" | ||
MLDataDevices = "7e8f7934-dd98-4c1a-8fe8-92b47a384d40" | ||
|
@@ -43,6 +42,7 @@ ComponentArrays = "b0b7db55-cfe3-40fc-9ded-d10e2dbeff66" | |
Enzyme = "7da242da-08ed-463a-9acd-ee780be4f1d9" | ||
Flux = "587475ba-b771-5e3f-ad9e-33799f191a9c" | ||
FunctionWrappers = "069b7b12-0de2-55c6-9aab-29f3d0a68a2e" | ||
LossFunctions = "30fc2ffe-d236-52d8-8643-a9d8f7c094a7" | ||
MLUtils = "f1d291b0-491e-4a28-83b9-f70985020b54" | ||
MPI = "da04e1cc-30fd-572f-bb4f-1f8673147195" | ||
NCCL = "3fe64909-d7a1-4096-9b7d-7a0f12cf0f6b" | ||
|
@@ -55,6 +55,7 @@ Zygote = "e88e6eb3-aa80-5325-afca-941959d7151f" | |
LuxComponentArraysExt = "ComponentArrays" | ||
LuxEnzymeExt = "Enzyme" | ||
LuxFluxExt = "Flux" | ||
LuxLossFunctionsExt = "LossFunctions" | ||
LuxMLUtilsExt = "MLUtils" | ||
LuxMPIExt = "MPI" | ||
LuxMPINCCLExt = ["CUDA", "MPI", "NCCL"] | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,71 @@ | ||
module LuxLossFunctionsExt | ||
|
||
using ArrayInterface: fast_scalar_indexing | ||
using ChainRulesCore: ChainRulesCore, NoTangent, @thunk | ||
using EnzymeCore: EnzymeCore, EnzymeRules | ||
using FastClosures: @closure | ||
using LossFunctions: LossFunctions | ||
using Statistics: mean | ||
|
||
using Lux: Lux, LossFunctionImpl | ||
|
||
const CRC = ChainRulesCore | ||
|
||
# Mean aggregation for array inputs: compute the fused `sum` aggregation first,
# then divide by the number of elements in `x`.
function LossFunctionImpl.fused_agg(
        ::typeof(mean), lfn::LossFunctions.Traits.Loss,
        x::AbstractArray, y::AbstractArray)
    total = LossFunctionImpl.fused_agg(sum, lfn, x, y)
    return total / length(x)
end
|
||
# Scalar inputs: there is only one pair to aggregate, so the result is simply
# the loss evaluated at `(x, y)`.
LossFunctionImpl.fused_agg(
    ::typeof(sum), lfn::LossFunctions.Traits.Loss, x::Number, y::Number) = lfn(x, y)
# Sum aggregation for array inputs. When both arrays report fast scalar
# indexing, call `sum(lfn, x, y)` directly; otherwise broadcast the loss over
# the inputs and reduce the resulting array.
function LossFunctionImpl.fused_agg(
        ::typeof(sum), lfn::LossFunctions.Traits.Loss,
        x::AbstractArray, y::AbstractArray)
    if fast_scalar_indexing(x) && fast_scalar_indexing(y)
        return sum(lfn, x, y)
    end
    return sum(lfn.(x, y))
end
|
||
# Reverse rule for `fused_agg(sum, lfn, x, y)`.
#
# Returns the primal value together with a pullback. The pullback yields one
# tangent per primal argument (`fused_agg`, `sum`, `lfn`, `x`, `y`); only `x`
# receives a non-trivial tangent, computed lazily as the elementwise
# `LossFunctions.deriv` scaled by the incoming cotangent `Δ`.
function CRC.rrule(
        ::CRC.RuleConfig{>:CRC.HasReverseMode},
        ::typeof(LossFunctionImpl.fused_agg), ::typeof(sum),
        lfn::LossFunctions.Traits.Loss, x, y)
    loss = LossFunctionImpl.fused_agg(sum, lfn, x, y)

    fused_agg_pullback = @closure Δ -> begin
        ∂∅ = NoTangent()
        # Deferred so the derivative is only evaluated if the tangent is used.
        ∂x = @thunk LossFunctions.deriv.(Ref(lfn), x, y) .* Δ
        return ∂∅, ∂∅, ∂∅, ∂x, ∂∅
    end

    return loss, fused_agg_pullback
end
|
||
# Forward sweep of the Enzyme reverse-mode rule for `fused_agg(sum, lfn, x, y)`.
#
# Evaluates the primal only when the configuration asks for it, and stores
# copies of `x` / `y` on the tape when the configuration flags consulted for
# them (positions 4 and 5 of `overwritten(cfg)`) are set. Entries that are not
# flagged are stored as `nothing`.
function EnzymeRules.augmented_primal(
        cfg::EnzymeRules.RevConfigWidth{1},
        func::EnzymeCore.Const{typeof(LossFunctionImpl.fused_agg)},
        ::Type{<:EnzymeCore.Active}, agg_f::EnzymeCore.Const{typeof(sum)},
        lfn::EnzymeCore.Const{<:LossFunctions.Traits.Loss},
        x::EnzymeCore.Annotation{<:AbstractArray}, y::EnzymeCore.Const)
    primal = if EnzymeRules.needs_primal(cfg)
        func.val(agg_f.val, lfn.val, x.val, y.val)
    else
        nothing
    end

    clobbered = EnzymeRules.overwritten(cfg)
    cache_x = clobbered[4] ? copy(x.val) : nothing
    cache_y = clobbered[5] ? copy(y.val) : nothing
    tape = (cache_x, cache_y)

    # No shadow is returned for the scalar result.
    return EnzymeRules.AugmentedReturn(primal, nothing, tape)
end
|
||
# Reverse sweep of the Enzyme rule for `fused_agg(sum, lfn, x, y)`.
#
# `(cache_x, cache_y)` is the tape produced by `augmented_primal`. Entries that
# were not cached there (the corresponding `overwritten` flag is false) are
# taken from the live argument values instead.
#
# Unless `x` is annotated `Const`, the elementwise loss derivative scaled by the
# incoming scalar cotangent `dret.val` is *accumulated* into `x.dval`. Enzyme's
# custom-rule convention is to add to shadows rather than assign: assigning
# would discard any gradient already stored in `x.dval` by other uses of `x`.
#
# Returns one `nothing` per primal argument (`agg_f`, `lfn`, `x`, `y`).
function EnzymeRules.reverse(
        cfg::EnzymeRules.RevConfigWidth{1},
        ::EnzymeCore.Const{typeof(LossFunctionImpl.fused_agg)},
        dret::EnzymeCore.Active, (cache_x, cache_y), agg_f::EnzymeCore.Const{typeof(sum)},
        lfn::EnzymeCore.Const{<:LossFunctions.Traits.Loss},
        x::EnzymeCore.Annotation{<:AbstractArray}, y::EnzymeCore.Const)
    EnzymeRules.overwritten(cfg)[4] || (cache_x = x.val)
    EnzymeRules.overwritten(cfg)[5] || (cache_y = y.val)

    if !(x isa EnzymeCore.Const)
        # Accumulate (`+=`), never overwrite, the shadow of `x`.
        @. x.dval += LossFunctions.deriv(lfn.val, cache_x, cache_y) * dret.val
    end

    return ntuple(Returns(nothing), 4)
end
|
||
end |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
77eb5fb
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Lux Benchmarks
Dense(512 => 512, identity)(512 x 128)/forward/CPU/2 thread(s)
414958
ns411750
ns1.01
Dense(512 => 512, identity)(512 x 128)/forward/CPU/4 thread(s)
322541
ns322271
ns1.00
Dense(512 => 512, identity)(512 x 128)/forward/CPU/8 thread(s)
323167
ns323042
ns1.00
Dense(512 => 512, identity)(512 x 128)/forward/CPU/1 thread(s)
739562.5
ns749375
ns0.99
Dense(512 => 512, identity)(512 x 128)/forward/GPU/CUDA
44543
ns43905
ns1.01
Dense(512 => 512, identity)(512 x 128)/zygote/CPU/2 thread(s)
1335729
ns1306583
ns1.02
Dense(512 => 512, identity)(512 x 128)/zygote/CPU/4 thread(s)
485000
ns465625
ns1.04
Dense(512 => 512, identity)(512 x 128)/zygote/CPU/8 thread(s)
14073833
ns13617333
ns1.03
Dense(512 => 512, identity)(512 x 128)/zygote/CPU/1 thread(s)
2211312.5
ns2245750
ns0.98
Dense(512 => 512, identity)(512 x 128)/zygote/GPU/CUDA
194175
ns192831
ns1.01
Dense(512 => 512, identity)(512 x 128)/enzyme/CPU/2 thread(s)
1374959
ns1394875
ns0.99
Dense(512 => 512, identity)(512 x 128)/enzyme/CPU/4 thread(s)
596188
ns634729.5
ns0.94
Dense(512 => 512, identity)(512 x 128)/enzyme/CPU/8 thread(s)
13290875.5
ns14050875
ns0.95
Dense(512 => 512, identity)(512 x 128)/enzyme/CPU/1 thread(s)
2199270.5
ns2238000
ns0.98
Conv((3, 3), 2 => 2, identity)(64 x 64 x 2 x 128)/forward/CPU/2 thread(s)
1665666
ns1661542
ns1.00
Conv((3, 3), 2 => 2, identity)(64 x 64 x 2 x 128)/forward/CPU/4 thread(s)
1186833.5
ns1196103.5
ns0.99
Conv((3, 3), 2 => 2, identity)(64 x 64 x 2 x 128)/forward/CPU/8 thread(s)
1536854.5
ns1534187.5
ns1.00
Conv((3, 3), 2 => 2, identity)(64 x 64 x 2 x 128)/forward/CPU/1 thread(s)
2912062.5
ns3005667
ns0.97
Conv((3, 3), 2 => 2, identity)(64 x 64 x 2 x 128)/forward/GPU/CUDA
213313
ns209529
ns1.02
Conv((3, 3), 2 => 2, identity)(64 x 64 x 2 x 128)/zygote/CPU/2 thread(s)
12145187.5
ns12111521
ns1.00
Conv((3, 3), 2 => 2, identity)(64 x 64 x 2 x 128)/zygote/CPU/4 thread(s)
9486083
ns9554687
ns0.99
Conv((3, 3), 2 => 2, identity)(64 x 64 x 2 x 128)/zygote/CPU/8 thread(s)
9213083
ns9247000
ns1.00
Conv((3, 3), 2 => 2, identity)(64 x 64 x 2 x 128)/zygote/CPU/1 thread(s)
18563708
ns18626583
ns1.00
Conv((3, 3), 2 => 2, identity)(64 x 64 x 2 x 128)/zygote/GPU/CUDA
1921274.5
ns1910271
ns1.01
Conv((3, 3), 2 => 2, identity)(64 x 64 x 2 x 128)/enzyme/CPU/2 thread(s)
17291000
ns17307250
ns1.00
Conv((3, 3), 2 => 2, identity)(64 x 64 x 2 x 128)/enzyme/CPU/4 thread(s)
14310062.5
ns14377958
ns1.00
Conv((3, 3), 2 => 2, identity)(64 x 64 x 2 x 128)/enzyme/CPU/8 thread(s)
14535333
ns14526875
ns1.00
Conv((3, 3), 2 => 2, identity)(64 x 64 x 2 x 128)/enzyme/CPU/1 thread(s)
21812208
ns21836458.5
ns1.00
Conv((3, 3), 64 => 64, relu)(64 x 64 x 64 x 128)/forward/CPU/2 thread(s)
250754270.5
ns250439041.5
ns1.00
Conv((3, 3), 64 => 64, relu)(64 x 64 x 64 x 128)/forward/CPU/4 thread(s)
174424541
ns174592521
ns1.00
Conv((3, 3), 64 => 64, relu)(64 x 64 x 64 x 128)/forward/CPU/8 thread(s)
115532521
ns115955208.5
ns1.00
Conv((3, 3), 64 => 64, relu)(64 x 64 x 64 x 128)/forward/CPU/1 thread(s)
446573667
ns447243084
ns1.00
Conv((3, 3), 64 => 64, relu)(64 x 64 x 64 x 128)/forward/GPU/CUDA
5489738
ns5470843
ns1.00
Conv((3, 3), 64 => 64, relu)(64 x 64 x 64 x 128)/zygote/CPU/2 thread(s)
1222307709
ns1228722500
ns0.99
Conv((3, 3), 64 => 64, relu)(64 x 64 x 64 x 128)/zygote/CPU/4 thread(s)
543403209
ns543561875
ns1.00
Conv((3, 3), 64 => 64, relu)(64 x 64 x 64 x 128)/zygote/CPU/8 thread(s)
832977124.5
ns830623396.5
ns1.00
Conv((3, 3), 64 => 64, relu)(64 x 64 x 64 x 128)/zygote/CPU/1 thread(s)
1653507000
ns1628878000
ns1.02
Conv((3, 3), 64 => 64, relu)(64 x 64 x 64 x 128)/zygote/GPU/CUDA
34972271
ns38000637
ns0.92
Conv((3, 3), 64 => 64, relu)(64 x 64 x 64 x 128)/enzyme/CPU/2 thread(s)
1142743917
ns1136994583
ns1.01
Conv((3, 3), 64 => 64, relu)(64 x 64 x 64 x 128)/enzyme/CPU/4 thread(s)
686139667
ns679379084
ns1.01
Conv((3, 3), 64 => 64, relu)(64 x 64 x 64 x 128)/enzyme/CPU/8 thread(s)
1325824667
ns1328113771
ns1.00
Conv((3, 3), 64 => 64, relu)(64 x 64 x 64 x 128)/enzyme/CPU/1 thread(s)
1748793708.5
ns1733752146
ns1.01
lenet(28, 28, 1, 32)/forward/CPU/2 thread(s)
1120083
ns1103375
ns1.02
lenet(28, 28, 1, 32)/forward/CPU/4 thread(s)
820062
ns823209
ns1.00
lenet(28, 28, 1, 32)/forward/CPU/8 thread(s)
3738667
ns3578479
ns1.04
lenet(28, 28, 1, 32)/forward/CPU/1 thread(s)
785458.5
ns786500
ns1.00
lenet(28, 28, 1, 32)/forward/GPU/CUDA
280004
ns266091.5
ns1.05
lenet(28, 28, 1, 32)/zygote/CPU/2 thread(s)
2992209
ns2986021
ns1.00
lenet(28, 28, 1, 32)/zygote/CPU/4 thread(s)
2457292
ns2426000
ns1.01
lenet(28, 28, 1, 32)/zygote/CPU/8 thread(s)
10152708
ns10461250
ns0.97
lenet(28, 28, 1, 32)/zygote/CPU/1 thread(s)
3200812.5
ns3150042
ns1.02
lenet(28, 28, 1, 32)/zygote/GPU/CUDA
1093024
ns1055864
ns1.04
Conv((3, 3), 2 => 2, gelu)(64 x 64 x 2 x 128)/forward/CPU/2 thread(s)
2350792
ns2335042
ns1.01
Conv((3, 3), 2 => 2, gelu)(64 x 64 x 2 x 128)/forward/CPU/4 thread(s)
1546500
ns1537708
ns1.01
Conv((3, 3), 2 => 2, gelu)(64 x 64 x 2 x 128)/forward/CPU/8 thread(s)
1703875
ns1740000
ns0.98
Conv((3, 3), 2 => 2, gelu)(64 x 64 x 2 x 128)/forward/CPU/1 thread(s)
4314041
ns4348437.5
ns0.99
Conv((3, 3), 2 => 2, gelu)(64 x 64 x 2 x 128)/forward/GPU/CUDA
214178
ns212286
ns1.01
Conv((3, 3), 2 => 2, gelu)(64 x 64 x 2 x 128)/zygote/CPU/2 thread(s)
20293542
ns20266645.5
ns1.00
Conv((3, 3), 2 => 2, gelu)(64 x 64 x 2 x 128)/zygote/CPU/4 thread(s)
17667520.5
ns17701209
ns1.00
Conv((3, 3), 2 => 2, gelu)(64 x 64 x 2 x 128)/zygote/CPU/8 thread(s)
18215687.5
ns17495416
ns1.04
Conv((3, 3), 2 => 2, gelu)(64 x 64 x 2 x 128)/zygote/CPU/1 thread(s)
26742770.5
ns26797000
ns1.00
Conv((3, 3), 2 => 2, gelu)(64 x 64 x 2 x 128)/zygote/GPU/CUDA
1989179
ns1973706
ns1.01
Conv((3, 3), 2 => 2, gelu)(64 x 64 x 2 x 128)/enzyme/CPU/2 thread(s)
44338833
ns44317750
ns1.00
Conv((3, 3), 2 => 2, gelu)(64 x 64 x 2 x 128)/enzyme/CPU/4 thread(s)
29803125
ns42027646
ns0.71
Conv((3, 3), 2 => 2, gelu)(64 x 64 x 2 x 128)/enzyme/CPU/8 thread(s)
41253750.5
ns41325000
ns1.00
Conv((3, 3), 2 => 2, gelu)(64 x 64 x 2 x 128)/enzyme/CPU/1 thread(s)
49627062.5
ns47734917
ns1.04
Conv((3, 3), 4 => 4, gelu)(64 x 64 x 4 x 128)/forward/CPU/2 thread(s)
4666292
ns4664854
ns1.00
Conv((3, 3), 4 => 4, gelu)(64 x 64 x 4 x 128)/forward/CPU/4 thread(s)
2867729.5
ns2868521.5
ns1.00
Conv((3, 3), 4 => 4, gelu)(64 x 64 x 4 x 128)/forward/CPU/8 thread(s)
3027042
ns3015958
ns1.00
Conv((3, 3), 4 => 4, gelu)(64 x 64 x 4 x 128)/forward/CPU/1 thread(s)
8637375
ns8658937.5
ns1.00
Conv((3, 3), 4 => 4, gelu)(64 x 64 x 4 x 128)/forward/GPU/CUDA
512379
ns516555
ns0.99
Conv((3, 3), 4 => 4, gelu)(64 x 64 x 4 x 128)/zygote/CPU/2 thread(s)
40744375
ns40579000.5
ns1.00
Conv((3, 3), 4 => 4, gelu)(64 x 64 x 4 x 128)/zygote/CPU/4 thread(s)
34861000
ns34830104
ns1.00
Conv((3, 3), 4 => 4, gelu)(64 x 64 x 4 x 128)/zygote/CPU/8 thread(s)
34130125
ns34148292
ns1.00
Conv((3, 3), 4 => 4, gelu)(64 x 64 x 4 x 128)/zygote/CPU/1 thread(s)
53656458
ns53661812
ns1.00
Conv((3, 3), 4 => 4, gelu)(64 x 64 x 4 x 128)/zygote/GPU/CUDA
3039460
ns2969951
ns1.02
Conv((3, 3), 4 => 4, gelu)(64 x 64 x 4 x 128)/enzyme/CPU/2 thread(s)
109854709
ns109640958
ns1.00
Conv((3, 3), 4 => 4, gelu)(64 x 64 x 4 x 128)/enzyme/CPU/4 thread(s)
60211500
ns84133666
ns0.72
Conv((3, 3), 4 => 4, gelu)(64 x 64 x 4 x 128)/enzyme/CPU/8 thread(s)
244742208.5
ns255828791
ns0.96
Conv((3, 3), 4 => 4, gelu)(64 x 64 x 4 x 128)/enzyme/CPU/1 thread(s)
100222291.5
ns96388416
ns1.04
Conv((3, 3), 64 => 64, gelu)(64 x 64 x 64 x 128)/forward/CPU/2 thread(s)
270538750
ns270215792
ns1.00
Conv((3, 3), 64 => 64, gelu)(64 x 64 x 64 x 128)/forward/CPU/4 thread(s)
187102791.5
ns186630271
ns1.00
Conv((3, 3), 64 => 64, gelu)(64 x 64 x 64 x 128)/forward/CPU/8 thread(s)
128131083.5
ns128172709
ns1.00
Conv((3, 3), 64 => 64, gelu)(64 x 64 x 64 x 128)/forward/CPU/1 thread(s)
496544584
ns489605542
ns1.01
Conv((3, 3), 64 => 64, gelu)(64 x 64 x 64 x 128)/forward/GPU/CUDA
7095920
ns7104246
ns1.00
Conv((3, 3), 64 => 64, gelu)(64 x 64 x 64 x 128)/zygote/CPU/2 thread(s)
1493043979
ns1502664042
ns0.99
Conv((3, 3), 64 => 64, gelu)(64 x 64 x 64 x 128)/zygote/CPU/4 thread(s)
820794750
ns821183792
ns1.00
Conv((3, 3), 64 => 64, gelu)(64 x 64 x 64 x 128)/zygote/CPU/8 thread(s)
1089880791.5
ns1092397958.5
ns1.00
Conv((3, 3), 64 => 64, gelu)(64 x 64 x 64 x 128)/zygote/CPU/1 thread(s)
2057983854
ns2032173187.5
ns1.01
Conv((3, 3), 64 => 64, gelu)(64 x 64 x 64 x 128)/zygote/GPU/CUDA
33958491
ns33798333
ns1.00
Conv((3, 3), 64 => 64, gelu)(64 x 64 x 64 x 128)/enzyme/CPU/2 thread(s)
2031772896
ns2027767896
ns1.00
Conv((3, 3), 64 => 64, gelu)(64 x 64 x 64 x 128)/enzyme/CPU/4 thread(s)
1169902125
ns1563910958
ns0.75
Conv((3, 3), 64 => 64, gelu)(64 x 64 x 64 x 128)/enzyme/CPU/8 thread(s)
2031263062.5
ns2210346833.5
ns0.92
Conv((3, 3), 64 => 64, gelu)(64 x 64 x 64 x 128)/enzyme/CPU/1 thread(s)
2621950583
ns2560629834
ns1.02
lenet(28, 28, 1, 128)/forward/CPU/2 thread(s)
2080791
ns2006833
ns1.04
lenet(28, 28, 1, 128)/forward/CPU/4 thread(s)
1266458.5
ns1257333
ns1.01
lenet(28, 28, 1, 128)/forward/CPU/8 thread(s)
7120666.5
ns7451041.5
ns0.96
lenet(28, 28, 1, 128)/forward/CPU/1 thread(s)
2469583
ns2470458
ns1.00
lenet(28, 28, 1, 128)/forward/GPU/CUDA
271494.5
ns275531
ns0.99
lenet(28, 28, 1, 128)/zygote/CPU/2 thread(s)
9645292
ns9463416
ns1.02
lenet(28, 28, 1, 128)/zygote/CPU/4 thread(s)
6578875
ns6552500
ns1.00
lenet(28, 28, 1, 128)/zygote/CPU/8 thread(s)
23723729.5
ns25529541
ns0.93
lenet(28, 28, 1, 128)/zygote/CPU/1 thread(s)
11743000
ns11734125
ns1.00
lenet(28, 28, 1, 128)/zygote/GPU/CUDA
1108810
ns1130415
ns0.98
vgg16(32, 32, 3, 32)/forward/CPU/2 thread(s)
378620500
ns380676854.5
ns0.99
vgg16(32, 32, 3, 32)/forward/CPU/4 thread(s)
148222625
ns145328000
ns1.02
vgg16(32, 32, 3, 32)/forward/CPU/8 thread(s)
232625416.5
ns243564083
ns0.96
vgg16(32, 32, 3, 32)/forward/CPU/1 thread(s)
452981958.5
ns452336354.5
ns1.00
vgg16(32, 32, 3, 32)/forward/GPU/CUDA
4877366.5
ns4879283
ns1.00
vgg16(32, 32, 3, 32)/zygote/CPU/2 thread(s)
1151789959
ns1156932333
ns1.00
vgg16(32, 32, 3, 32)/zygote/CPU/4 thread(s)
608267042
ns487570458
ns1.25
vgg16(32, 32, 3, 32)/zygote/CPU/8 thread(s)
958784875
ns973572458
ns0.98
vgg16(32, 32, 3, 32)/zygote/CPU/1 thread(s)
1398102250
ns1399439834
ns1.00
vgg16(32, 32, 3, 32)/zygote/GPU/CUDA
17573518
ns16976929
ns1.04
lenet(28, 28, 1, 64)/forward/CPU/2 thread(s)
1044000
ns1062687.5
ns0.98
lenet(28, 28, 1, 64)/forward/CPU/4 thread(s)
966666.5
ns971124.5
ns1.00
lenet(28, 28, 1, 64)/forward/CPU/8 thread(s)
5483500
ns6269583
ns0.87
lenet(28, 28, 1, 64)/forward/CPU/1 thread(s)
1369000
ns1393375
ns0.98
lenet(28, 28, 1, 64)/forward/GPU/CUDA
277684.5
ns277704.5
ns1.00
lenet(28, 28, 1, 64)/zygote/CPU/2 thread(s)
6395583
ns6494541.5
ns0.98
lenet(28, 28, 1, 64)/zygote/CPU/4 thread(s)
4649000
ns4635437.5
ns1.00
lenet(28, 28, 1, 64)/zygote/CPU/8 thread(s)
18457937.5
ns19450479
ns0.95
lenet(28, 28, 1, 64)/zygote/CPU/1 thread(s)
6087375
ns6080229
ns1.00
lenet(28, 28, 1, 64)/zygote/GPU/CUDA
1153126
ns1148981
ns1.00
Conv((3, 3), 32 => 32, identity)(64 x 64 x 32 x 128)/forward/CPU/2 thread(s)
70616187.5
ns70442208
ns1.00
Conv((3, 3), 32 => 32, identity)(64 x 64 x 32 x 128)/forward/CPU/4 thread(s)
34338895.5
ns35305229
ns0.97
Conv((3, 3), 32 => 32, identity)(64 x 64 x 32 x 128)/forward/CPU/8 thread(s)
39546146
ns39532604
ns1.00
Conv((3, 3), 32 => 32, identity)(64 x 64 x 32 x 128)/forward/CPU/1 thread(s)
132480333
ns132574604
ns1.00
Conv((3, 3), 32 => 32, identity)(64 x 64 x 32 x 128)/forward/GPU/CUDA
1837859.5
ns1848251
ns0.99
Conv((3, 3), 32 => 32, identity)(64 x 64 x 32 x 128)/zygote/CPU/2 thread(s)
354650895.5
ns356785937.5
ns0.99
Conv((3, 3), 32 => 32, identity)(64 x 64 x 32 x 128)/zygote/CPU/4 thread(s)
158687854
ns159371854
ns1.00
Conv((3, 3), 32 => 32, identity)(64 x 64 x 32 x 128)/zygote/CPU/8 thread(s)
253835396.5
ns254893688
ns1.00
Conv((3, 3), 32 => 32, identity)(64 x 64 x 32 x 128)/zygote/CPU/1 thread(s)
535065979.5
ns535009020.5
ns1.00
Conv((3, 3), 32 => 32, identity)(64 x 64 x 32 x 128)/zygote/GPU/CUDA
16493787
ns16489529.5
ns1.00
Conv((3, 3), 32 => 32, identity)(64 x 64 x 32 x 128)/enzyme/CPU/2 thread(s)
394789208
ns395707667
ns1.00
Conv((3, 3), 32 => 32, identity)(64 x 64 x 32 x 128)/enzyme/CPU/4 thread(s)
245506292
ns245564417
ns1.00
Conv((3, 3), 32 => 32, identity)(64 x 64 x 32 x 128)/enzyme/CPU/8 thread(s)
682534167
ns652089584
ns1.05
Conv((3, 3), 32 => 32, identity)(64 x 64 x 32 x 128)/enzyme/CPU/1 thread(s)
711436834
ns712574333
ns1.00
vgg16(32, 32, 3, 128)/forward/CPU/2 thread(s)
1186860667
ns1191762375
ns1.00
vgg16(32, 32, 3, 128)/forward/CPU/4 thread(s)
435266750
ns434009729.5
ns1.00
vgg16(32, 32, 3, 128)/forward/CPU/8 thread(s)
628427791
ns631038834
ns1.00
vgg16(32, 32, 3, 128)/forward/CPU/1 thread(s)
1780484229
ns1771033395.5
ns1.01
vgg16(32, 32, 3, 128)/forward/GPU/CUDA
12477417
ns12471861
ns1.00
vgg16(32, 32, 3, 128)/zygote/CPU/2 thread(s)
3652621271
ns3670803208.5
ns1.00
vgg16(32, 32, 3, 128)/zygote/CPU/4 thread(s)
1639329875
ns1633483458
ns1.00
vgg16(32, 32, 3, 128)/zygote/CPU/8 thread(s)
2709465041
ns2737701958
ns0.99
vgg16(32, 32, 3, 128)/zygote/CPU/1 thread(s)
5075123916
ns5038709417
ns1.01
vgg16(32, 32, 3, 128)/zygote/GPU/CUDA
49797376
ns49641386
ns1.00
Conv((3, 3), 4 => 4, relu)(64 x 64 x 4 x 128)/forward/CPU/2 thread(s)
3423958
ns3412146
ns1.00
Conv((3, 3), 4 => 4, relu)(64 x 64 x 4 x 128)/forward/CPU/4 thread(s)
2097249.5
ns2094750
ns1.00
Conv((3, 3), 4 => 4, relu)(64 x 64 x 4 x 128)/forward/CPU/8 thread(s)
2534854
ns2533833.5
ns1.00
Conv((3, 3), 4 => 4, relu)(64 x 64 x 4 x 128)/forward/CPU/1 thread(s)
6018625
ns6034292
ns1.00
Conv((3, 3), 4 => 4, relu)(64 x 64 x 4 x 128)/forward/GPU/CUDA
580639
ns586721
ns0.99
Conv((3, 3), 4 => 4, relu)(64 x 64 x 4 x 128)/zygote/CPU/2 thread(s)
25949208
ns26096750.5
ns0.99
Conv((3, 3), 4 => 4, relu)(64 x 64 x 4 x 128)/zygote/CPU/4 thread(s)
20274833
ns20315791.5
ns1.00
Conv((3, 3), 4 => 4, relu)(64 x 64 x 4 x 128)/zygote/CPU/8 thread(s)
19561271
ns19312917
ns1.01
Conv((3, 3), 4 => 4, relu)(64 x 64 x 4 x 128)/zygote/CPU/1 thread(s)
39224583
ns39366625
ns1.00
Conv((3, 3), 4 => 4, relu)(64 x 64 x 4 x 128)/zygote/GPU/CUDA
2980196
ns2989473.5
ns1.00
Conv((3, 3), 4 => 4, relu)(64 x 64 x 4 x 128)/enzyme/CPU/2 thread(s)
55399562.5
ns54095229
ns1.02
Conv((3, 3), 4 => 4, relu)(64 x 64 x 4 x 128)/enzyme/CPU/4 thread(s)
28378292
ns28393083
ns1.00
Conv((3, 3), 4 => 4, relu)(64 x 64 x 4 x 128)/enzyme/CPU/8 thread(s)
172128937.5
ns177757792
ns0.97
Conv((3, 3), 4 => 4, relu)(64 x 64 x 4 x 128)/enzyme/CPU/1 thread(s)
45636375
ns45278750
ns1.01
Conv((3, 3), 2 => 2, relu)(64 x 64 x 2 x 128)/forward/CPU/2 thread(s)
1783542
ns1778208
ns1.00
Conv((3, 3), 2 => 2, relu)(64 x 64 x 2 x 128)/forward/CPU/4 thread(s)
1197959
ns1204708
ns0.99
Conv((3, 3), 2 => 2, relu)(64 x 64 x 2 x 128)/forward/CPU/8 thread(s)
1577021.5
ns1564000
ns1.01
Conv((3, 3), 2 => 2, relu)(64 x 64 x 2 x 128)/forward/CPU/1 thread(s)
3027083.5
ns3038771
ns1.00
Conv((3, 3), 2 => 2, relu)(64 x 64 x 2 x 128)/forward/GPU/CUDA
218302
ns217944
ns1.00
Conv((3, 3), 2 => 2, relu)(64 x 64 x 2 x 128)/zygote/CPU/2 thread(s)
12541854.5
ns12531437.5
ns1.00
Conv((3, 3), 2 => 2, relu)(64 x 64 x 2 x 128)/zygote/CPU/4 thread(s)
9966458
ns9964292
ns1.00
Conv((3, 3), 2 => 2, relu)(64 x 64 x 2 x 128)/zygote/CPU/8 thread(s)
9641875
ns9707042
ns0.99
Conv((3, 3), 2 => 2, relu)(64 x 64 x 2 x 128)/zygote/CPU/1 thread(s)
18982208
ns18974500
ns1.00
Conv((3, 3), 2 => 2, relu)(64 x 64 x 2 x 128)/zygote/GPU/CUDA
1943759
ns1963028.5
ns0.99
Conv((3, 3), 2 => 2, relu)(64 x 64 x 2 x 128)/enzyme/CPU/2 thread(s)
17642208
ns17644270.5
ns1.00
Conv((3, 3), 2 => 2, relu)(64 x 64 x 2 x 128)/enzyme/CPU/4 thread(s)
14745479
ns14745500
ns1.00
Conv((3, 3), 2 => 2, relu)(64 x 64 x 2 x 128)/enzyme/CPU/8 thread(s)
14622083
ns14639333
ns1.00
Conv((3, 3), 2 => 2, relu)(64 x 64 x 2 x 128)/enzyme/CPU/1 thread(s)
22196749.5
ns22173792
ns1.00
Conv((3, 3), 32 => 32, relu)(64 x 64 x 32 x 128)/forward/CPU/2 thread(s)
70503791.5
ns70409562
ns1.00
Conv((3, 3), 32 => 32, relu)(64 x 64 x 32 x 128)/forward/CPU/4 thread(s)
34154375
ns34786542
ns0.98
Conv((3, 3), 32 => 32, relu)(64 x 64 x 32 x 128)/forward/CPU/8 thread(s)
39724625.5
ns39571499.5
ns1.00
Conv((3, 3), 32 => 32, relu)(64 x 64 x 32 x 128)/forward/CPU/1 thread(s)
133426312.5
ns132610521
ns1.01
Conv((3, 3), 32 => 32, relu)(64 x 64 x 32 x 128)/forward/GPU/CUDA
1855504
ns1837717
ns1.01
Conv((3, 3), 32 => 32, relu)(64 x 64 x 32 x 128)/zygote/CPU/2 thread(s)
357508542
ns360588187.5
ns0.99
Conv((3, 3), 32 => 32, relu)(64 x 64 x 32 x 128)/zygote/CPU/4 thread(s)
236762959
ns237608334
ns1.00
Conv((3, 3), 32 => 32, relu)(64 x 64 x 32 x 128)/zygote/CPU/8 thread(s)
305563667
ns299913354
ns1.02
Conv((3, 3), 32 => 32, relu)(64 x 64 x 32 x 128)/zygote/CPU/1 thread(s)
731068500
ns725805833
ns1.01
Conv((3, 3), 32 => 32, relu)(64 x 64 x 32 x 128)/zygote/GPU/CUDA
13898567
ns13956738
ns1.00
Conv((3, 3), 32 => 32, relu)(64 x 64 x 32 x 128)/enzyme/CPU/2 thread(s)
418493479
ns418949812.5
ns1.00
Conv((3, 3), 32 => 32, relu)(64 x 64 x 32 x 128)/enzyme/CPU/4 thread(s)
253429500
ns251360792
ns1.01
Conv((3, 3), 32 => 32, relu)(64 x 64 x 32 x 128)/enzyme/CPU/8 thread(s)
696829083.5
ns712732021
ns0.98
Conv((3, 3), 32 => 32, relu)(64 x 64 x 32 x 128)/enzyme/CPU/1 thread(s)
717012750
ns717284542
ns1.00
mlp7layer_bn(gelu)(32 x 256)/forward/CPU/2 thread(s)
1657812.5
ns1912041.5
ns0.87
mlp7layer_bn(gelu)(32 x 256)/forward/CPU/4 thread(s)
1559500
ns1579125
ns0.99
mlp7layer_bn(gelu)(32 x 256)/forward/CPU/8 thread(s)
1547979
ns1549791.5
ns1.00
mlp7layer_bn(gelu)(32 x 256)/forward/CPU/1 thread(s)
2615500
ns2657625
ns0.98
mlp7layer_bn(gelu)(32 x 256)/forward/GPU/CUDA
579410
ns573525
ns1.01
mlp7layer_bn(gelu)(32 x 256)/zygote/CPU/2 thread(s)
8948521
ns9220000
ns0.97
mlp7layer_bn(gelu)(32 x 256)/zygote/CPU/4 thread(s)
5918125
ns5936166
ns1.00
mlp7layer_bn(gelu)(32 x 256)/zygote/CPU/8 thread(s)
30404791
ns31895937.5
ns0.95
mlp7layer_bn(gelu)(32 x 256)/zygote/CPU/1 thread(s)
10061562
ns10214937.5
ns0.98
mlp7layer_bn(gelu)(32 x 256)/zygote/GPU/CUDA
1389319
ns1399984.5
ns0.99
mlp7layer_bn(gelu)(32 x 256)/enzyme/CPU/2 thread(s)
22293791
ns22182333.5
ns1.01
mlp7layer_bn(gelu)(32 x 256)/enzyme/CPU/4 thread(s)
19118125
ns19138291.5
ns1.00
mlp7layer_bn(gelu)(32 x 256)/enzyme/CPU/8 thread(s)
50278833
ns52527562.5
ns0.96
mlp7layer_bn(gelu)(32 x 256)/enzyme/CPU/1 thread(s)
19441208.5
ns18888042
ns1.03
Dense(512 => 512, relu)(512 x 128)/forward/CPU/2 thread(s)
687479
ns791291.5
ns0.87
Dense(512 => 512, relu)(512 x 128)/forward/CPU/4 thread(s)
71083
ns69958.5
ns1.02
Dense(512 => 512, relu)(512 x 128)/forward/CPU/8 thread(s)
1021000
ns997167
ns1.02
Dense(512 => 512, relu)(512 x 128)/forward/CPU/1 thread(s)
725458.5
ns724499.5
ns1.00
Dense(512 => 512, relu)(512 x 128)/forward/GPU/CUDA
48336
ns48324
ns1.00
Dense(512 => 512, relu)(512 x 128)/zygote/CPU/2 thread(s)
1568500
ns1508042
ns1.04
Dense(512 => 512, relu)(512 x 128)/zygote/CPU/4 thread(s)
284021
ns320291
ns0.89
Dense(512 => 512, relu)(512 x 128)/zygote/CPU/8 thread(s)
1426229
ns1445145.5
ns0.99
Dense(512 => 512, relu)(512 x 128)/zygote/CPU/1 thread(s)
2289417
ns2258458.5
ns1.01
Dense(512 => 512, relu)(512 x 128)/zygote/GPU/CUDA
213525.5
ns216350
ns0.99
Dense(512 => 512, relu)(512 x 128)/enzyme/CPU/2 thread(s)
1518000
ns1537083
ns0.99
Dense(512 => 512, relu)(512 x 128)/enzyme/CPU/4 thread(s)
446709
ns428792
ns1.04
Dense(512 => 512, relu)(512 x 128)/enzyme/CPU/8 thread(s)
1398833.5
ns1444584
ns0.97
Dense(512 => 512, relu)(512 x 128)/enzyme/CPU/1 thread(s)
2227979
ns2250333
ns0.99
Conv((3, 3), 4 => 4, identity)(64 x 64 x 4 x 128)/forward/CPU/2 thread(s)
3424312.5
ns3421750
ns1.00
Conv((3, 3), 4 => 4, identity)(64 x 64 x 4 x 128)/forward/CPU/4 thread(s)
2076145.5
ns2084312.5
ns1.00
Conv((3, 3), 4 => 4, identity)(64 x 64 x 4 x 128)/forward/CPU/8 thread(s)
2517375
ns2519375.5
ns1.00
Conv((3, 3), 4 => 4, identity)(64 x 64 x 4 x 128)/forward/CPU/1 thread(s)
6002625
ns6015021
ns1.00
Conv((3, 3), 4 => 4, identity)(64 x 64 x 4 x 128)/forward/GPU/CUDA
580319
ns584297
ns0.99
Conv((3, 3), 4 => 4, identity)(64 x 64 x 4 x 128)/zygote/CPU/2 thread(s)
24064958.5
ns24071521.5
ns1.00
Conv((3, 3), 4 => 4, identity)(64 x 64 x 4 x 128)/zygote/CPU/4 thread(s)
18099937.5
ns18050833
ns1.00
Conv((3, 3), 4 => 4, identity)(64 x 64 x 4 x 128)/zygote/CPU/8 thread(s)
17179812.5
ns17227375
ns1.00
Conv((3, 3), 4 => 4, identity)(64 x 64 x 4 x 128)/zygote/CPU/1 thread(s)
37498749.5
ns37583145.5
ns1.00
Conv((3, 3), 4 => 4, identity)(64 x 64 x 4 x 128)/zygote/GPU/CUDA
2895115
ns2895440
ns1.00
Conv((3, 3), 4 => 4, identity)(64 x 64 x 4 x 128)/enzyme/CPU/2 thread(s)
53787500
ns52599188
ns1.02
Conv((3, 3), 4 => 4, identity)(64 x 64 x 4 x 128)/enzyme/CPU/4 thread(s)
27724333.5
ns27644250
ns1.00
Conv((3, 3), 4 => 4, identity)(64 x 64 x 4 x 128)/enzyme/CPU/8 thread(s)
165600125
ns170611917
ns0.97
Conv((3, 3), 4 => 4, identity)(64 x 64 x 4 x 128)/enzyme/CPU/1 thread(s)
44506604
ns44514250
ns1.00
Conv((3, 3), 64 => 64, identity)(64 x 64 x 64 x 128)/forward/CPU/2 thread(s)
250628729
ns250102292
ns1.00
Conv((3, 3), 64 => 64, identity)(64 x 64 x 64 x 128)/forward/CPU/4 thread(s)
174546375
ns174510104
ns1.00
Conv((3, 3), 64 => 64, identity)(64 x 64 x 64 x 128)/forward/CPU/8 thread(s)
115593979.5
ns115645729
ns1.00
Conv((3, 3), 64 => 64, identity)(64 x 64 x 64 x 128)/forward/CPU/1 thread(s)
447286479
ns448140124.5
ns1.00
Conv((3, 3), 64 => 64, identity)(64 x 64 x 64 x 128)/forward/GPU/CUDA
5483866.5
ns5446378
ns1.01
Conv((3, 3), 64 => 64, identity)(64 x 64 x 64 x 128)/zygote/CPU/2 thread(s)
1100854958
ns1105120833
ns1.00
Conv((3, 3), 64 => 64, identity)(64 x 64 x 64 x 128)/zygote/CPU/4 thread(s)
467966979
ns467780729.5
ns1.00
Conv((3, 3), 64 => 64, identity)(64 x 64 x 64 x 128)/zygote/CPU/8 thread(s)
825353979.5
ns825455520.5
ns1.00
Conv((3, 3), 64 => 64, identity)(64 x 64 x 64 x 128)/zygote/CPU/1 thread(s)
1759896083
ns1753431125
ns1.00
Conv((3, 3), 64 => 64, identity)(64 x 64 x 64 x 128)/zygote/GPU/CUDA
32267946
ns35149612
ns0.92
Conv((3, 3), 64 => 64, identity)(64 x 64 x 64 x 128)/enzyme/CPU/2 thread(s)
1018635708.5
ns1021983312.5
ns1.00
Conv((3, 3), 64 => 64, identity)(64 x 64 x 64 x 128)/enzyme/CPU/4 thread(s)
665400833
ns662517187.5
ns1.00
Conv((3, 3), 64 => 64, identity)(64 x 64 x 64 x 128)/enzyme/CPU/8 thread(s)
1204699750
ns1286071167
ns0.94
Conv((3, 3), 64 => 64, identity)(64 x 64 x 64 x 128)/enzyme/CPU/1 thread(s)
1733389813
ns1721665437.5
ns1.01
mlp7layer_bn(relu)(32 x 256)/forward/CPU/2 thread(s)
1226521
ns1312041
ns0.93
mlp7layer_bn(relu)(32 x 256)/forward/CPU/4 thread(s)
961167
ns928625
ns1.04
mlp7layer_bn(relu)(32 x 256)/forward/CPU/8 thread(s)
918083
ns903208
ns1.02
mlp7layer_bn(relu)(32 x 256)/forward/CPU/1 thread(s)
2051083.5
ns2032416
ns1.01
mlp7layer_bn(relu)(32 x 256)/forward/GPU/CUDA
578415
ns575428
ns1.01
mlp7layer_bn(relu)(32 x 256)/zygote/CPU/2 thread(s)
5660417
ns5922771
ns0.96
mlp7layer_bn(relu)(32 x 256)/zygote/CPU/4 thread(s)
2618917
ns2615500
ns1.00
mlp7layer_bn(relu)(32 x 256)/zygote/CPU/8 thread(s)
23019583
ns24427083.5
ns0.94
mlp7layer_bn(relu)(32 x 256)/zygote/CPU/1 thread(s)
7086333
ns7104916.5
ns1.00
mlp7layer_bn(relu)(32 x 256)/zygote/GPU/CUDA
1349133
ns1363516
ns0.99
mlp7layer_bn(relu)(32 x 256)/enzyme/CPU/2 thread(s)
9707250
ns9705958.5
ns1.00
mlp7layer_bn(relu)(32 x 256)/enzyme/CPU/4 thread(s)
6502604.5
ns6499000
ns1.00
mlp7layer_bn(relu)(32 x 256)/enzyme/CPU/8 thread(s)
30901167
ns31929750
ns0.97
mlp7layer_bn(relu)(32 x 256)/enzyme/CPU/1 thread(s)
7612917
ns7614042
ns1.00
Dense(128 => 128, gelu)(128 x 128)/forward/CPU/2 thread(s)
383916
ns483291
ns0.79
Dense(128 => 128, gelu)(128 x 128)/forward/CPU/4 thread(s)
31791
ns31750
ns1.00
Dense(128 => 128, gelu)(128 x 128)/forward/CPU/8 thread(s)
2087375
ns1795375
ns1.16
Dense(128 => 128, gelu)(128 x 128)/forward/CPU/1 thread(s)
91083
ns91542
ns0.99
Dense(128 => 128, gelu)(128 x 128)/forward/GPU/CUDA
28712
ns28996
ns0.99
Dense(128 => 128, gelu)(128 x 128)/zygote/CPU/2 thread(s)
406208
ns392958
ns1.03
Dense(128 => 128, gelu)(128 x 128)/zygote/CPU/4 thread(s)
175875
ns175542
ns1.00
Dense(128 => 128, gelu)(128 x 128)/zygote/CPU/8 thread(s)
4346812.5
ns4708417
ns0.92
Dense(128 => 128, gelu)(128 x 128)/zygote/CPU/1 thread(s)
272959
ns273000
ns1.00
Dense(128 => 128, gelu)(128 x 128)/zygote/GPU/CUDA
216512
ns224707.5
ns0.96
Dense(128 => 128, gelu)(128 x 128)/enzyme/CPU/2 thread(s)
678958
ns666333
ns1.02
Dense(128 => 128, gelu)(128 x 128)/enzyme/CPU/4 thread(s)
442584
ns442250
ns1.00
Dense(128 => 128, gelu)(128 x 128)/enzyme/CPU/8 thread(s)
4679166
ns4499167
ns1.04
Dense(128 => 128, gelu)(128 x 128)/enzyme/CPU/1 thread(s)
543542
ns510979.5
ns1.06
Dense(128 => 128, relu)(128 x 128)/forward/CPU/2 thread(s)
329792
ns430437.5
ns0.77
Dense(128 => 128, relu)(128 x 128)/forward/CPU/4 thread(s)
13125
ns13583
ns0.97
Dense(128 => 128, relu)(128 x 128)/forward/CPU/8 thread(s)
603208
ns709208
ns0.85
Dense(128 => 128, relu)(128 x 128)/forward/CPU/1 thread(s)
54708
ns52584
ns1.04
Dense(128 => 128, relu)(128 x 128)/forward/GPU/CUDA
27935
ns29296
ns0.95
Dense(128 => 128, relu)(128 x 128)/zygote/CPU/2 thread(s)
354458
ns337250
ns1.05
Dense(128 => 128, relu)(128 x 128)/zygote/CPU/4 thread(s)
25792
ns26375
ns0.98
Dense(128 => 128, relu)(128 x 128)/zygote/CPU/8 thread(s)
719333
ns484812.5
ns1.48
Dense(128 => 128, relu)(128 x 128)/zygote/CPU/1 thread(s)
151792
ns151333
ns1.00
Dense(128 => 128, relu)(128 x 128)/zygote/GPU/CUDA
206354.5
ns213308.5
ns0.97
Dense(128 => 128, relu)(128 x 128)/enzyme/CPU/2 thread(s)
370041
ns352521
ns1.05
Dense(128 => 128, relu)(128 x 128)/enzyme/CPU/4 thread(s)
45958
ns45792
ns1.00
Dense(128 => 128, relu)(128 x 128)/enzyme/CPU/8 thread(s)
469708
ns487125
ns0.96
Dense(128 => 128, relu)(128 x 128)/enzyme/CPU/1 thread(s)
151167
ns151000
ns1.00
vgg16(32, 32, 3, 64)/forward/CPU/2 thread(s)
600144917
ns603223875
ns0.99
vgg16(32, 32, 3, 64)/forward/CPU/4 thread(s)
239512791.5
ns239241354
ns1.00
vgg16(32, 32, 3, 64)/forward/CPU/8 thread(s)
368675770.5
ns377713896
ns0.98
vgg16(32, 32, 3, 64)/forward/CPU/1 thread(s)
878611917
ns872019458
ns1.01
vgg16(32, 32, 3, 64)/forward/GPU/CUDA
7673480
ns7676104.5
ns1.00
vgg16(32, 32, 3, 64)/zygote/CPU/2 thread(s)
2001190937.5
ns2005520125
ns1.00
vgg16(32, 32, 3, 64)/zygote/CPU/4 thread(s)
950953500.5
ns947653916.5
ns1.00
vgg16(32, 32, 3, 64)/zygote/CPU/8 thread(s)
1611271729.5
ns1551514604.5
ns1.04
vgg16(32, 32, 3, 64)/zygote/CPU/1 thread(s)
2652863625
ns2653038416
ns1.00
vgg16(32, 32, 3, 64)/zygote/GPU/CUDA
27099744
ns27180094
ns1.00
Dense(512 => 512, gelu)(512 x 128)/forward/CPU/2 thread(s)
532625
ns525604
ns1.01
Dense(512 => 512, gelu)(512 x 128)/forward/CPU/4 thread(s)
175791
ns168333
ns1.04
Dense(512 => 512, gelu)(512 x 128)/forward/CPU/8 thread(s)
1765937
ns1740625
ns1.01
Dense(512 => 512, gelu)(512 x 128)/forward/CPU/1 thread(s)
873854
ns875541
ns1.00
Dense(512 => 512, gelu)(512 x 128)/forward/GPU/CUDA
48010
ns47837
ns1.00
Dense(512 => 512, gelu)(512 x 128)/zygote/CPU/2 thread(s)
1862104.5
ns1943750
ns0.96
Dense(512 => 512, gelu)(512 x 128)/zygote/CPU/4 thread(s)
1105875
ns1100208
ns1.01
Dense(512 => 512, gelu)(512 x 128)/zygote/CPU/8 thread(s)
14941916
ns14661875
ns1.02
Dense(512 => 512, gelu)(512 x 128)/zygote/CPU/1 thread(s)
2753625
ns2836709
ns0.97
Dense(512 => 512, gelu)(512 x 128)/zygote/GPU/CUDA
222255
ns232330
ns0.96
Dense(512 => 512, gelu)(512 x 128)/enzyme/CPU/2 thread(s)
2909834
ns2974229
ns0.98
Dense(512 => 512, gelu)(512 x 128)/enzyme/CPU/4 thread(s)
2231271
ns2208583.5
ns1.01
Dense(512 => 512, gelu)(512 x 128)/enzyme/CPU/8 thread(s)
15268500
ns15024229.5
ns1.02
Dense(512 => 512, gelu)(512 x 128)/enzyme/CPU/1 thread(s)
3879541.5
ns3751750
ns1.03
mlp7layer_bn(tanh)(32 x 256)/forward/CPU/2 thread(s)
1471375
ns1602291.5
ns0.92
mlp7layer_bn(tanh)(32 x 256)/forward/CPU/4 thread(s)
1256917
ns1221084
ns1.03
mlp7layer_bn(tanh)(32 x 256)/forward/CPU/8 thread(s)
1257021
ns1264750
ns0.99
mlp7layer_bn(tanh)(32 x 256)/forward/CPU/1 thread(s)
2211792
ns2362750
ns0.94
mlp7layer_bn(tanh)(32 x 256)/forward/GPU/CUDA
567447
ns576709
ns0.98
mlp7layer_bn(tanh)(32 x 256)/zygote/CPU/2 thread(s)
5894166
ns5931125
ns0.99
mlp7layer_bn(tanh)(32 x 256)/zygote/CPU/4 thread(s)
2856229.5
ns2866334
ns1.00
mlp7layer_bn(tanh)(32 x 256)/zygote/CPU/8 thread(s)
24506146
ns25035834
ns0.98
mlp7layer_bn(tanh)(32 x 256)/zygote/CPU/1 thread(s)
7294791
ns6650208
ns1.10
mlp7layer_bn(tanh)(32 x 256)/zygote/GPU/CUDA
1317261
ns1379411
ns0.95
mlp7layer_bn(tanh)(32 x 256)/enzyme/CPU/2 thread(s)
11665375
ns11605146
ns1.01
mlp7layer_bn(tanh)(32 x 256)/enzyme/CPU/4 thread(s)
8766624.5
ns8767458
ns1.00
mlp7layer_bn(tanh)(32 x 256)/enzyme/CPU/8 thread(s)
34961958
ns35255000
ns0.99
mlp7layer_bn(tanh)(32 x 256)/enzyme/CPU/1 thread(s)
9529500
ns9570000.5
ns1.00
Dense(16 => 16, relu)(16 x 128)/forward/CPU/2 thread(s)
2959
ns2541
ns1.16
Dense(16 => 16, relu)(16 x 128)/forward/CPU/4 thread(s)
2542
ns2292
ns1.11
Dense(16 => 16, relu)(16 x 128)/forward/CPU/8 thread(s)
2959
ns3000
ns0.99
Dense(16 => 16, relu)(16 x 128)/forward/CPU/1 thread(s)
2520.5
ns2333
ns1.08
Dense(16 => 16, relu)(16 x 128)/forward/GPU/CUDA
24587
ns25379.5
ns0.97
Dense(16 => 16, relu)(16 x 128)/zygote/CPU/2 thread(s)
7167
ns7125
ns1.01
Dense(16 => 16, relu)(16 x 128)/zygote/CPU/4 thread(s)
7084
ns7083
ns1.00
Dense(16 => 16, relu)(16 x 128)/zygote/CPU/8 thread(s)
7458
ns7375
ns1.01
Dense(16 => 16, relu)(16 x 128)/zygote/CPU/1 thread(s)
7167
ns7270.5
ns0.99
Dense(16 => 16, relu)(16 x 128)/zygote/GPU/CUDA
185970
ns193729.5
ns0.96
Dense(16 => 16, relu)(16 x 128)/enzyme/CPU/2 thread(s)
8208
ns8334
ns0.98
Dense(16 => 16, relu)(16 x 128)/enzyme/CPU/4 thread(s)
8375
ns8500
ns0.99
Dense(16 => 16, relu)(16 x 128)/enzyme/CPU/8 thread(s)
8416
ns8417
ns1.00
Dense(16 => 16, relu)(16 x 128)/enzyme/CPU/1 thread(s)
6000
ns6084
ns0.99
Dense(16 => 16, gelu)(16 x 128)/forward/CPU/2 thread(s)
10208
ns10375.5
ns0.98
Dense(16 => 16, gelu)(16 x 128)/forward/CPU/4 thread(s)
14937.5
ns14916
ns1.00
Dense(16 => 16, gelu)(16 x 128)/forward/CPU/8 thread(s)
10916
ns11854
ns0.92
Dense(16 => 16, gelu)(16 x 128)/forward/CPU/1 thread(s)
8833
ns7625
ns1.16
Dense(16 => 16, gelu)(16 x 128)/forward/GPU/CUDA
24920
ns25646
ns0.97
Dense(16 => 16, gelu)(16 x 128)/zygote/CPU/2 thread(s)
21542
ns21708
ns0.99
Dense(16 => 16, gelu)(16 x 128)/zygote/CPU/4 thread(s)
21625
ns21500
ns1.01
Dense(16 => 16, gelu)(16 x 128)/zygote/CPU/8 thread(s)
21916
ns21750
ns1.01
Dense(16 => 16, gelu)(16 x 128)/zygote/CPU/1 thread(s)
21916.5
ns21875
ns1.00
Dense(16 => 16, gelu)(16 x 128)/zygote/GPU/CUDA
195496
ns203851
ns0.96
Dense(16 => 16, gelu)(16 x 128)/enzyme/CPU/2 thread(s)
53500
ns53417
ns1.00
Dense(16 => 16, gelu)(16 x 128)/enzyme/CPU/4 thread(s)
56875
ns56583.5
ns1.01
Dense(16 => 16, gelu)(16 x 128)/enzyme/CPU/8 thread(s)
53625
ns53583.5
ns1.00
Dense(16 => 16, gelu)(16 x 128)/enzyme/CPU/1 thread(s)
55083
ns51333
ns1.07
Dense(128 => 128, identity)(128 x 128)/forward/CPU/2 thread(s)
28541
ns26895.5
ns1.06
Dense(128 => 128, identity)(128 x 128)/forward/CPU/4 thread(s)
28437.5
ns28333.5
ns1.00
Dense(128 => 128, identity)(128 x 128)/forward/CPU/8 thread(s)
28250
ns29000
ns0.97
Dense(128 => 128, identity)(128 x 128)/forward/CPU/1 thread(s)
46083
ns48291
ns0.95
Dense(128 => 128, identity)(128 x 128)/forward/GPU/CUDA
25773
ns26739
ns0.96
Dense(128 => 128, identity)(128 x 128)/zygote/CPU/2 thread(s)
224458
ns220875
ns1.02
Dense(128 => 128, identity)(128 x 128)/zygote/CPU/4 thread(s)
44229.5
ns44583
ns0.99
Dense(128 => 128, identity)(128 x 128)/zygote/CPU/8 thread(s)
4410250
ns4132667
ns1.07
Dense(128 => 128, identity)(128 x 128)/zygote/CPU/1 thread(s)
145625
ns145458
ns1.00
Dense(128 => 128, identity)(128 x 128)/zygote/GPU/CUDA
167043
ns172310
ns0.97
Dense(128 => 128, identity)(128 x 128)/enzyme/CPU/2 thread(s)
242125
ns237312.5
ns1.02
Dense(128 => 128, identity)(128 x 128)/enzyme/CPU/4 thread(s)
68875
ns68625
ns1.00
Dense(128 => 128, identity)(128 x 128)/enzyme/CPU/8 thread(s)
4299458
ns4360708
ns0.99
Dense(128 => 128, identity)(128 x 128)/enzyme/CPU/1 thread(s)
145667
ns145917
ns1.00
Dense(16 => 16, identity)(16 x 128)/forward/CPU/2 thread(s)
2125
ns2292
ns0.93
Dense(16 => 16, identity)(16 x 128)/forward/CPU/4 thread(s)
1750
ns1750
ns1
Dense(16 => 16, identity)(16 x 128)/forward/CPU/8 thread(s)
2583
ns2166
ns1.19
Dense(16 => 16, identity)(16 x 128)/forward/CPU/1 thread(s)
1917
ns1520.5
ns1.26
Dense(16 => 16, identity)(16 x 128)/forward/GPU/CUDA
22918.5
ns23935
ns0.96
Dense(16 => 16, identity)(16 x 128)/zygote/CPU/2 thread(s)
5250
ns5125
ns1.02
Dense(16 => 16, identity)(16 x 128)/zygote/CPU/4 thread(s)
5292
ns5042
ns1.05
Dense(16 => 16, identity)(16 x 128)/zygote/CPU/8 thread(s)
5417
ns5458
ns0.99
Dense(16 => 16, identity)(16 x 128)/zygote/CPU/1 thread(s)
5250
ns5084
ns1.03
Dense(16 => 16, identity)(16 x 128)/zygote/GPU/CUDA
171129
ns176841
ns0.97
Dense(16 => 16, identity)(16 x 128)/enzyme/CPU/2 thread(s)
7583
ns7292
ns1.04
Dense(16 => 16, identity)(16 x 128)/enzyme/CPU/4 thread(s)
8125
ns8166
ns0.99
Dense(16 => 16, identity)(16 x 128)/enzyme/CPU/8 thread(s)
7500
ns7541
ns0.99
Dense(16 => 16, identity)(16 x 128)/enzyme/CPU/1 thread(s)
5125
ns5167
ns0.99
Conv((3, 3), 32 => 32, gelu)(64 x 64 x 32 x 128)/forward/CPU/2 thread(s)
81032541
ns80940833
ns1.00
Conv((3, 3), 32 => 32, gelu)(64 x 64 x 32 x 128)/forward/CPU/4 thread(s)
39920458
ns41092709
ns0.97
Conv((3, 3), 32 => 32, gelu)(64 x 64 x 32 x 128)/forward/CPU/8 thread(s)
45590917
ns45570541
ns1.00
Conv((3, 3), 32 => 32, gelu)(64 x 64 x 32 x 128)/forward/CPU/1 thread(s)
153513167
ns153559792
ns1.00
Conv((3, 3), 32 => 32, gelu)(64 x 64 x 32 x 128)/forward/GPU/CUDA
2660470
ns2660311
ns1.00
Conv((3, 3), 32 => 32, gelu)(64 x 64 x 32 x 128)/zygote/CPU/2 thread(s)
675206709
ns621714834
ns1.09
Conv((3, 3), 32 => 32, gelu)(64 x 64 x 32 x 128)/zygote/CPU/4 thread(s)
319221521
ns421739375
ns0.76
Conv((3, 3), 32 => 32, gelu)(64 x 64 x 32 x 128)/zygote/CPU/8 thread(s)
412689584
ns414510667
ns1.00
Conv((3, 3), 32 => 32, gelu)(64 x 64 x 32 x 128)/zygote/CPU/1 thread(s)
704326792
ns697568292
ns1.01
Conv((3, 3), 32 => 32, gelu)(64 x 64 x 32 x 128)/zygote/GPU/CUDA
15217384
ns15148414
ns1.00
Conv((3, 3), 32 => 32, gelu)(64 x 64 x 32 x 128)/enzyme/CPU/2 thread(s)
875714312.5
ns872377937.5
ns1.00
Conv((3, 3), 32 => 32, gelu)(64 x 64 x 32 x 128)/enzyme/CPU/4 thread(s)
502738834
ns706482291.5
ns0.71
Conv((3, 3), 32 => 32, gelu)(64 x 64 x 32 x 128)/enzyme/CPU/8 thread(s)
1160733354
ns1162546146
ns1.00
Conv((3, 3), 32 => 32, gelu)(64 x 64 x 32 x 128)/enzyme/CPU/1 thread(s)
1210583500
ns1175739375
ns1.03
This comment was automatically generated by workflow using github-action-benchmark.