-
Notifications
You must be signed in to change notification settings - Fork 62
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
feat: update imagenet training script
- Loading branch information
Showing
10 changed files
with
515 additions
and
49,871 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,53 @@ | ||
# [deps] | ||
# AMDGPU = "21141c5a-9bdb-4563-92ae-f87d6854732e" | ||
# Augmentor = "02898b10-1f73-11ea-317c-6393d7073e15" | ||
# Boltz = "4544d5e4-abc5-4dea-817f-29e4c205d9c8" | ||
# Configurations = "5218b696-f38b-4ac9-8b61-a12ec717816d" | ||
# Dates = "ade2ca70-3891-5945-98fb-dc099432e06a" | ||
# FLoops = "cc61a311-1640-44b5-9fba-1b764f453329" | ||
# FileIO = "5789e2e9-d7fb-5bc7-8068-2c6fae9b9549" | ||
# Format = "1fa38f19-a742-5d3f-a2b9-30dd87b9d5f8" | ||
# Functors = "d9f16b24-f501-4c13-a1f2-28368ffc5196" | ||
# Images = "916415d5-f1e6-5110-898d-aaa5f9f070e0" | ||
# JLD2 = "033835bb-8acc-5ee8-8aae-3f567f8a3819" | ||
# JpegTurbo = "b835a17e-a41a-41e7-81f0-2f016b05efe0" | ||
# Lux = "b2108857-7c20-44ae-9111-449ecde12c47" | ||
# LuxCUDA = "d0bbae9a-e099-4d5b-a835-1c6931763bda" | ||
# MLUtils = "f1d291b0-491e-4a28-83b9-f70985020b54" | ||
# MPI = "da04e1cc-30fd-572f-bb4f-1f8673147195" | ||
# Metalhead = "dbeba491-748d-5e0e-a39e-b530a07fa0cc" | ||
# NCCL = "3fe64909-d7a1-4096-9b7d-7a0f12cf0f6b" | ||
# OneHotArrays = "0b1bfda6-eb8a-41d2-88d8-f5af5cad476f" | ||
# Optimisers = "3bd65402-5787-11e9-1adc-39752487f4e2" | ||
# ParameterSchedulers = "d7d3b36b-41b8-4d0d-a2bf-768c6151755e" | ||
# Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" | ||
# Setfield = "efcf1570-3423-57d1-acb7-fd33fddbac46" | ||
# SimpleConfig = "f2d95530-262a-480f-aff0-1c0431e662a7" | ||
# Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2" | ||
# Zygote = "e88e6eb3-aa80-5325-afca-941959d7151f" | ||
|
||
# [compat] | ||
# AMDGPU = "1" | ||
# Augmentor = "0.6" | ||
# Boltz = "1" | ||
# Configurations = "0.17" | ||
# FLoops = "0.2" | ||
# FileIO = "1.16" | ||
# Format = "1.3" | ||
# Functors = "0.4" | ||
# Images = "0.26" | ||
# JLD2 = "0.4.46, 0.5" | ||
# JpegTurbo = "0.1" | ||
# Lux = "1" | ||
# LuxCUDA = "0.3" | ||
# MLUtils = "0.4" | ||
# MPI = "0.20.19" | ||
# Metalhead = "0.9" | ||
# NCCL = "0.1.1" | ||
# OneHotArrays = "0.2" | ||
# Optimisers = "0.3" | ||
# ParameterSchedulers = "0.4" | ||
# Setfield = "1" | ||
# SimpleConfig = "0.1" | ||
# Statistics = "1" | ||
# Zygote = "0.6" |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,53 +1,25 @@ | ||
[deps] | ||
AMDGPU = "21141c5a-9bdb-4563-92ae-f87d6854732e" | ||
Augmentor = "02898b10-1f73-11ea-317c-6393d7073e15" | ||
Boltz = "4544d5e4-abc5-4dea-817f-29e4c205d9c8" | ||
Configurations = "5218b696-f38b-4ac9-8b61-a12ec717816d" | ||
Comonicon = "863f3e99-da2a-4334-8734-de3dacbe5542" | ||
DataAugmentation = "88a5189c-e7ff-4f85-ac6b-e6158070f02e" | ||
Dates = "ade2ca70-3891-5945-98fb-dc099432e06a" | ||
FLoops = "cc61a311-1640-44b5-9fba-1b764f453329" | ||
FileIO = "5789e2e9-d7fb-5bc7-8068-2c6fae9b9549" | ||
Format = "1fa38f19-a742-5d3f-a2b9-30dd87b9d5f8" | ||
Functors = "d9f16b24-f501-4c13-a1f2-28368ffc5196" | ||
Images = "916415d5-f1e6-5110-898d-aaa5f9f070e0" | ||
ImageIO = "82e4d734-157c-48bb-816b-45c225c6df19" | ||
ImageMagick = "6218d12a-5da1-5696-b52f-db25d2ecc6d1" | ||
JLD2 = "033835bb-8acc-5ee8-8aae-3f567f8a3819" | ||
JpegTurbo = "b835a17e-a41a-41e7-81f0-2f016b05efe0" | ||
Lux = "b2108857-7c20-44ae-9111-449ecde12c47" | ||
LuxCUDA = "d0bbae9a-e099-4d5b-a835-1c6931763bda" | ||
MLDataDevices = "7e8f7934-dd98-4c1a-8fe8-92b47a384d40" | ||
MLUtils = "f1d291b0-491e-4a28-83b9-f70985020b54" | ||
MPI = "da04e1cc-30fd-572f-bb4f-1f8673147195" | ||
Metalhead = "dbeba491-748d-5e0e-a39e-b530a07fa0cc" | ||
NCCL = "3fe64909-d7a1-4096-9b7d-7a0f12cf0f6b" | ||
OneHotArrays = "0b1bfda6-eb8a-41d2-88d8-f5af5cad476f" | ||
Optimisers = "3bd65402-5787-11e9-1adc-39752487f4e2" | ||
ParameterSchedulers = "d7d3b36b-41b8-4d0d-a2bf-768c6151755e" | ||
Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" | ||
Setfield = "efcf1570-3423-57d1-acb7-fd33fddbac46" | ||
SimpleConfig = "f2d95530-262a-480f-aff0-1c0431e662a7" | ||
Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2" | ||
Zygote = "e88e6eb3-aa80-5325-afca-941959d7151f" | ||
|
||
[compat] | ||
AMDGPU = "1" | ||
Augmentor = "0.6" | ||
Boltz = "0.1, 0.2, 0.3" | ||
Configurations = "0.17" | ||
FLoops = "0.2" | ||
FileIO = "1.16" | ||
Format = "1.3" | ||
Functors = "0.4" | ||
Images = "0.26" | ||
JLD2 = "0.4.46, 0.5" | ||
JpegTurbo = "0.1" | ||
Lux = "1" | ||
LuxCUDA = "0.3" | ||
MLUtils = "0.4" | ||
MPI = "0.20.19" | ||
Metalhead = "0.9" | ||
NCCL = "0.1.1" | ||
OneHotArrays = "0.2" | ||
Optimisers = "0.3" | ||
ParameterSchedulers = "0.4" | ||
Setfield = "1" | ||
SimpleConfig = "0.1" | ||
Statistics = "1" | ||
Zygote = "0.6" | ||
[extras] | ||
CUDA_Runtime_jll = "76a88914-d11a-5bdc-97e0-2f5a05c973a2" |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file was deleted.
Oops, something went wrong.
Oops, something went wrong.
1d064ec
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Lux Benchmarks
Dense(512 => 512, identity)(512 x 128)/forward/CPU/2 thread(s)
412937.5
ns411895.5
ns1.00
Dense(512 => 512, identity)(512 x 128)/forward/CPU/4 thread(s)
322667
ns322166
ns1.00
Dense(512 => 512, identity)(512 x 128)/forward/CPU/8 thread(s)
323104.5
ns322583
ns1.00
Dense(512 => 512, identity)(512 x 128)/forward/CPU/1 thread(s)
739750
ns741833.5
ns1.00
Dense(512 => 512, identity)(512 x 128)/forward/GPU/CUDA
43577
ns43398
ns1.00
Dense(512 => 512, identity)(512 x 128)/zygote/CPU/2 thread(s)
1320395.5
ns1330145.5
ns0.99
Dense(512 => 512, identity)(512 x 128)/zygote/CPU/4 thread(s)
2436708
ns2429292
ns1.00
Dense(512 => 512, identity)(512 x 128)/zygote/CPU/8 thread(s)
13630167
ns14164208
ns0.96
Dense(512 => 512, identity)(512 x 128)/zygote/CPU/1 thread(s)
2195250
ns2195542
ns1.00
Dense(512 => 512, identity)(512 x 128)/zygote/GPU/CUDA
203168.5
ns204689
ns0.99
Dense(512 => 512, identity)(512 x 128)/enzyme/CPU/2 thread(s)
1394666
ns1414708
ns0.99
Dense(512 => 512, identity)(512 x 128)/enzyme/CPU/4 thread(s)
2614271
ns903146
ns2.89
Dense(512 => 512, identity)(512 x 128)/enzyme/CPU/8 thread(s)
13809542
ns1562333
ns8.84
Dense(512 => 512, identity)(512 x 128)/enzyme/CPU/1 thread(s)
2256125
ns2259500
ns1.00
Conv((3, 3), 2 => 2, identity)(64 x 64 x 2 x 128)/forward/CPU/2 thread(s)
1655084
ns1774625
ns0.93
Conv((3, 3), 2 => 2, identity)(64 x 64 x 2 x 128)/forward/CPU/4 thread(s)
1103916
ns1098875
ns1.00
Conv((3, 3), 2 => 2, identity)(64 x 64 x 2 x 128)/forward/CPU/8 thread(s)
1549791
ns1541229.5
ns1.01
Conv((3, 3), 2 => 2, identity)(64 x 64 x 2 x 128)/forward/CPU/1 thread(s)
2999729.5
ns2958833
ns1.01
Conv((3, 3), 2 => 2, identity)(64 x 64 x 2 x 128)/forward/GPU/CUDA
207221
ns207635.5
ns1.00
Conv((3, 3), 2 => 2, identity)(64 x 64 x 2 x 128)/zygote/CPU/2 thread(s)
12143833.5
ns12148354.5
ns1.00
Conv((3, 3), 2 => 2, identity)(64 x 64 x 2 x 128)/zygote/CPU/4 thread(s)
8785708
ns8822458
ns1.00
Conv((3, 3), 2 => 2, identity)(64 x 64 x 2 x 128)/zygote/CPU/8 thread(s)
9239167
ns9188791
ns1.01
Conv((3, 3), 2 => 2, identity)(64 x 64 x 2 x 128)/zygote/CPU/1 thread(s)
18591208
ns18597916
ns1.00
Conv((3, 3), 2 => 2, identity)(64 x 64 x 2 x 128)/zygote/GPU/CUDA
1485675
ns1492767
ns1.00
Conv((3, 3), 2 => 2, identity)(64 x 64 x 2 x 128)/enzyme/CPU/2 thread(s)
17317333
ns17261541
ns1.00
Conv((3, 3), 2 => 2, identity)(64 x 64 x 2 x 128)/enzyme/CPU/4 thread(s)
13967208
ns13975000
ns1.00
Conv((3, 3), 2 => 2, identity)(64 x 64 x 2 x 128)/enzyme/CPU/8 thread(s)
14514354
ns14512917
ns1.00
Conv((3, 3), 2 => 2, identity)(64 x 64 x 2 x 128)/enzyme/CPU/1 thread(s)
21818416
ns21852791
ns1.00
Conv((3, 3), 64 => 64, relu)(64 x 64 x 64 x 128)/forward/CPU/2 thread(s)
250042270.5
ns250272292
ns1.00
Conv((3, 3), 64 => 64, relu)(64 x 64 x 64 x 128)/forward/CPU/4 thread(s)
148555750
ns148297917
ns1.00
Conv((3, 3), 64 => 64, relu)(64 x 64 x 64 x 128)/forward/CPU/8 thread(s)
115889000
ns115932000
ns1.00
Conv((3, 3), 64 => 64, relu)(64 x 64 x 64 x 128)/forward/CPU/1 thread(s)
447187584
ns447763458
ns1.00
Conv((3, 3), 64 => 64, relu)(64 x 64 x 64 x 128)/forward/GPU/CUDA
5452362
ns5475849
ns1.00
Conv((3, 3), 64 => 64, relu)(64 x 64 x 64 x 128)/zygote/CPU/2 thread(s)
1224923291
ns1224211792
ns1.00
Conv((3, 3), 64 => 64, relu)(64 x 64 x 64 x 128)/zygote/CPU/4 thread(s)
928030208
ns930629834
ns1.00
Conv((3, 3), 64 => 64, relu)(64 x 64 x 64 x 128)/zygote/CPU/8 thread(s)
825911895.5
ns831800229
ns0.99
Conv((3, 3), 64 => 64, relu)(64 x 64 x 64 x 128)/zygote/CPU/1 thread(s)
1633435667
ns1632010250
ns1.00
Conv((3, 3), 64 => 64, relu)(64 x 64 x 64 x 128)/zygote/GPU/CUDA
31214910.5
ns31656181.5
ns0.99
Conv((3, 3), 64 => 64, relu)(64 x 64 x 64 x 128)/enzyme/CPU/2 thread(s)
1134846000
ns1135741875
ns1.00
Conv((3, 3), 64 => 64, relu)(64 x 64 x 64 x 128)/enzyme/CPU/4 thread(s)
982157791.5
ns989475979.5
ns0.99
Conv((3, 3), 64 => 64, relu)(64 x 64 x 64 x 128)/enzyme/CPU/8 thread(s)
1328335541.5
ns1306310687.5
ns1.02
Conv((3, 3), 64 => 64, relu)(64 x 64 x 64 x 128)/enzyme/CPU/1 thread(s)
1734630541.5
ns1731093166.5
ns1.00
lenet(28, 28, 1, 32)/forward/CPU/2 thread(s)
1097854
ns1096084
ns1.00
lenet(28, 28, 1, 32)/forward/CPU/4 thread(s)
1625083.5
ns1621125
ns1.00
lenet(28, 28, 1, 32)/forward/CPU/8 thread(s)
3841334
ns3736208
ns1.03
lenet(28, 28, 1, 32)/forward/CPU/1 thread(s)
778042
ns780896
ns1.00
lenet(28, 28, 1, 32)/forward/GPU/CUDA
263538
ns264547.5
ns1.00
lenet(28, 28, 1, 32)/zygote/CPU/2 thread(s)
2979417
ns2990250
ns1.00
lenet(28, 28, 1, 32)/zygote/CPU/4 thread(s)
4119104.5
ns4101000
ns1.00
lenet(28, 28, 1, 32)/zygote/CPU/8 thread(s)
11207896
ns11120083
ns1.01
lenet(28, 28, 1, 32)/zygote/CPU/1 thread(s)
3132750
ns3243937
ns0.97
lenet(28, 28, 1, 32)/zygote/GPU/CUDA
1091322.5
ns1096407
ns1.00
Conv((3, 3), 2 => 2, gelu)(64 x 64 x 2 x 128)/forward/CPU/2 thread(s)
2334729
ns2316625
ns1.01
Conv((3, 3), 2 => 2, gelu)(64 x 64 x 2 x 128)/forward/CPU/4 thread(s)
1437000
ns1427312.5
ns1.01
Conv((3, 3), 2 => 2, gelu)(64 x 64 x 2 x 128)/forward/CPU/8 thread(s)
1665458.5
ns1669042
ns1.00
Conv((3, 3), 2 => 2, gelu)(64 x 64 x 2 x 128)/forward/CPU/1 thread(s)
4198334
ns4212291
ns1.00
Conv((3, 3), 2 => 2, gelu)(64 x 64 x 2 x 128)/forward/GPU/CUDA
207913
ns208250.5
ns1.00
Conv((3, 3), 2 => 2, gelu)(64 x 64 x 2 x 128)/zygote/CPU/2 thread(s)
19383125
ns19418541
ns1.00
Conv((3, 3), 2 => 2, gelu)(64 x 64 x 2 x 128)/zygote/CPU/4 thread(s)
16092916.5
ns16073458
ns1.00
Conv((3, 3), 2 => 2, gelu)(64 x 64 x 2 x 128)/zygote/CPU/8 thread(s)
17269063
ns17186250
ns1.00
Conv((3, 3), 2 => 2, gelu)(64 x 64 x 2 x 128)/zygote/CPU/1 thread(s)
25856687.5
ns25916083
ns1.00
Conv((3, 3), 2 => 2, gelu)(64 x 64 x 2 x 128)/zygote/GPU/CUDA
1585334
ns1595514
ns0.99
Conv((3, 3), 2 => 2, gelu)(64 x 64 x 2 x 128)/enzyme/CPU/2 thread(s)
34322667
ns34080500.5
ns1.01
Conv((3, 3), 2 => 2, gelu)(64 x 64 x 2 x 128)/enzyme/CPU/4 thread(s)
30864666.5
ns30855979
ns1.00
Conv((3, 3), 2 => 2, gelu)(64 x 64 x 2 x 128)/enzyme/CPU/8 thread(s)
31132250
ns31220895.5
ns1.00
Conv((3, 3), 2 => 2, gelu)(64 x 64 x 2 x 128)/enzyme/CPU/1 thread(s)
36963875
ns36553708
ns1.01
Conv((3, 3), 4 => 4, gelu)(64 x 64 x 4 x 128)/forward/CPU/2 thread(s)
4524500
ns4532459
ns1.00
Conv((3, 3), 4 => 4, gelu)(64 x 64 x 4 x 128)/forward/CPU/4 thread(s)
2779000
ns2782125
ns1.00
Conv((3, 3), 4 => 4, gelu)(64 x 64 x 4 x 128)/forward/CPU/8 thread(s)
2902854
ns2915416
ns1.00
Conv((3, 3), 4 => 4, gelu)(64 x 64 x 4 x 128)/forward/CPU/1 thread(s)
8387541.5
ns8380354
ns1.00
Conv((3, 3), 4 => 4, gelu)(64 x 64 x 4 x 128)/forward/GPU/CUDA
420101
ns423404
ns0.99
Conv((3, 3), 4 => 4, gelu)(64 x 64 x 4 x 128)/zygote/CPU/2 thread(s)
38904229
ns39032375
ns1.00
Conv((3, 3), 4 => 4, gelu)(64 x 64 x 4 x 128)/zygote/CPU/4 thread(s)
32105979
ns32155750
ns1.00
Conv((3, 3), 4 => 4, gelu)(64 x 64 x 4 x 128)/zygote/CPU/8 thread(s)
32346959
ns32313208
ns1.00
Conv((3, 3), 4 => 4, gelu)(64 x 64 x 4 x 128)/zygote/CPU/1 thread(s)
51945541
ns51945292
ns1.00
Conv((3, 3), 4 => 4, gelu)(64 x 64 x 4 x 128)/zygote/GPU/CUDA
2624775
ns2634739
ns1.00
Conv((3, 3), 4 => 4, gelu)(64 x 64 x 4 x 128)/enzyme/CPU/2 thread(s)
88746333.5
ns88517062.5
ns1.00
Conv((3, 3), 4 => 4, gelu)(64 x 64 x 4 x 128)/enzyme/CPU/4 thread(s)
114006959
ns114525875
ns1.00
Conv((3, 3), 4 => 4, gelu)(64 x 64 x 4 x 128)/enzyme/CPU/8 thread(s)
224259542
ns223799104.5
ns1.00
Conv((3, 3), 4 => 4, gelu)(64 x 64 x 4 x 128)/enzyme/CPU/1 thread(s)
74608375
ns74994375
ns0.99
Conv((3, 3), 64 => 64, gelu)(64 x 64 x 64 x 128)/forward/CPU/2 thread(s)
267333208
ns268673709
ns1.00
Conv((3, 3), 64 => 64, gelu)(64 x 64 x 64 x 128)/forward/CPU/4 thread(s)
159214292
ns159281875
ns1.00
Conv((3, 3), 64 => 64, gelu)(64 x 64 x 64 x 128)/forward/CPU/8 thread(s)
126745542
ns126759937.5
ns1.00
Conv((3, 3), 64 => 64, gelu)(64 x 64 x 64 x 128)/forward/CPU/1 thread(s)
487494166
ns485627875
ns1.00
Conv((3, 3), 64 => 64, gelu)(64 x 64 x 64 x 128)/forward/GPU/CUDA
7012704
ns6994649
ns1.00
Conv((3, 3), 64 => 64, gelu)(64 x 64 x 64 x 128)/zygote/CPU/2 thread(s)
1472344083.5
ns1470701416.5
ns1.00
Conv((3, 3), 64 => 64, gelu)(64 x 64 x 64 x 128)/zygote/CPU/4 thread(s)
1138687375
ns1172207375
ns0.97
Conv((3, 3), 64 => 64, gelu)(64 x 64 x 64 x 128)/zygote/CPU/8 thread(s)
1071038854
ns1063985125
ns1.01
Conv((3, 3), 64 => 64, gelu)(64 x 64 x 64 x 128)/zygote/CPU/1 thread(s)
2002947479.5
ns2005181229
ns1.00
Conv((3, 3), 64 => 64, gelu)(64 x 64 x 64 x 128)/zygote/GPU/CUDA
34854968.5
ns34667442
ns1.01
Conv((3, 3), 64 => 64, gelu)(64 x 64 x 64 x 128)/enzyme/CPU/2 thread(s)
1712616292
ns1715584583
ns1.00
Conv((3, 3), 64 => 64, gelu)(64 x 64 x 64 x 128)/enzyme/CPU/4 thread(s)
1536070562.5
ns1547782708
ns0.99
Conv((3, 3), 64 => 64, gelu)(64 x 64 x 64 x 128)/enzyme/CPU/8 thread(s)
1863636167
ns1856138292
ns1.00
Conv((3, 3), 64 => 64, gelu)(64 x 64 x 64 x 128)/enzyme/CPU/1 thread(s)
2213962958
ns2207623729.5
ns1.00
lenet(28, 28, 1, 128)/forward/CPU/2 thread(s)
2080042
ns2030167
ns1.02
lenet(28, 28, 1, 128)/forward/CPU/4 thread(s)
2936917
ns2979333
ns0.99
lenet(28, 28, 1, 128)/forward/CPU/8 thread(s)
8042334
ns8195625
ns0.98
lenet(28, 28, 1, 128)/forward/CPU/1 thread(s)
2431104
ns2492167
ns0.98
lenet(28, 28, 1, 128)/forward/GPU/CUDA
278095
ns275800.5
ns1.01
lenet(28, 28, 1, 128)/zygote/CPU/2 thread(s)
9677209
ns9616458.5
ns1.01
lenet(28, 28, 1, 128)/zygote/CPU/4 thread(s)
12036500
ns12040541
ns1.00
lenet(28, 28, 1, 128)/zygote/CPU/8 thread(s)
24751583.5
ns24281208
ns1.02
lenet(28, 28, 1, 128)/zygote/CPU/1 thread(s)
11606292
ns11743000
ns0.99
lenet(28, 28, 1, 128)/zygote/GPU/CUDA
1201527
ns1191646.5
ns1.01
vgg16(32, 32, 3, 32)/forward/CPU/2 thread(s)
379827708
ns381538333
ns1.00
vgg16(32, 32, 3, 32)/forward/CPU/4 thread(s)
286677271
ns284123771.5
ns1.01
vgg16(32, 32, 3, 32)/forward/CPU/8 thread(s)
240261834
ns239280375
ns1.00
vgg16(32, 32, 3, 32)/forward/CPU/1 thread(s)
451256520.5
ns452582312.5
ns1.00
vgg16(32, 32, 3, 32)/forward/GPU/CUDA
4858918
ns4856388
ns1.00
vgg16(32, 32, 3, 32)/zygote/CPU/2 thread(s)
1157780125
ns1153618084
ns1.00
vgg16(32, 32, 3, 32)/zygote/CPU/4 thread(s)
905233917
ns934081083
ns0.97
vgg16(32, 32, 3, 32)/zygote/CPU/8 thread(s)
987524666
ns921635667
ns1.07
vgg16(32, 32, 3, 32)/zygote/CPU/1 thread(s)
1579543625
ns1402852084
ns1.13
vgg16(32, 32, 3, 32)/zygote/GPU/CUDA
17849892
ns17820477
ns1.00
lenet(28, 28, 1, 64)/forward/CPU/2 thread(s)
1058583.5
ns1049334
ns1.01
lenet(28, 28, 1, 64)/forward/CPU/4 thread(s)
1671187.5
ns2035667
ns0.82
lenet(28, 28, 1, 64)/forward/CPU/8 thread(s)
5011708
ns5341791
ns0.94
lenet(28, 28, 1, 64)/forward/CPU/1 thread(s)
1300437.5
ns1401833
ns0.93
lenet(28, 28, 1, 64)/forward/GPU/CUDA
274747.5
ns273502.5
ns1.00
lenet(28, 28, 1, 64)/zygote/CPU/2 thread(s)
6254041
ns6484854.5
ns0.96
lenet(28, 28, 1, 64)/zygote/CPU/4 thread(s)
13149791.5
ns12404917
ns1.06
lenet(28, 28, 1, 64)/zygote/CPU/8 thread(s)
18860833
ns19763708
ns0.95
lenet(28, 28, 1, 64)/zygote/CPU/1 thread(s)
5852333
ns6066500
ns0.96
lenet(28, 28, 1, 64)/zygote/GPU/CUDA
1238555
ns1262508
ns0.98
Conv((3, 3), 32 => 32, identity)(64 x 64 x 32 x 128)/forward/CPU/2 thread(s)
70498000
ns70493208
ns1.00
Conv((3, 3), 32 => 32, identity)(64 x 64 x 32 x 128)/forward/CPU/4 thread(s)
43638250
ns43600229
ns1.00
Conv((3, 3), 32 => 32, identity)(64 x 64 x 32 x 128)/forward/CPU/8 thread(s)
39557666
ns39622541
ns1.00
Conv((3, 3), 32 => 32, identity)(64 x 64 x 32 x 128)/forward/CPU/1 thread(s)
132574187
ns132698750
ns1.00
Conv((3, 3), 32 => 32, identity)(64 x 64 x 32 x 128)/forward/GPU/CUDA
1944256
ns1869551
ns1.04
Conv((3, 3), 32 => 32, identity)(64 x 64 x 32 x 128)/zygote/CPU/2 thread(s)
356301646
ns356454771
ns1.00
Conv((3, 3), 32 => 32, identity)(64 x 64 x 32 x 128)/zygote/CPU/4 thread(s)
269549208
ns270112625
ns1.00
Conv((3, 3), 32 => 32, identity)(64 x 64 x 32 x 128)/zygote/CPU/8 thread(s)
253732875
ns254625084
ns1.00
Conv((3, 3), 32 => 32, identity)(64 x 64 x 32 x 128)/zygote/CPU/1 thread(s)
534920187.5
ns534884312.5
ns1.00
Conv((3, 3), 32 => 32, identity)(64 x 64 x 32 x 128)/zygote/GPU/CUDA
12320196.5
ns12284081
ns1.00
Conv((3, 3), 32 => 32, identity)(64 x 64 x 32 x 128)/enzyme/CPU/2 thread(s)
395172666
ns394586709
ns1.00
Conv((3, 3), 32 => 32, identity)(64 x 64 x 32 x 128)/enzyme/CPU/4 thread(s)
377158375
ns408404250
ns0.92
Conv((3, 3), 32 => 32, identity)(64 x 64 x 32 x 128)/enzyme/CPU/8 thread(s)
657754625
ns678487209
ns0.97
Conv((3, 3), 32 => 32, identity)(64 x 64 x 32 x 128)/enzyme/CPU/1 thread(s)
709829333
ns710785791
ns1.00
vgg16(32, 32, 3, 128)/forward/CPU/2 thread(s)
1189792833
ns1185827084
ns1.00
vgg16(32, 32, 3, 128)/forward/CPU/4 thread(s)
691561166.5
ns695281125
ns0.99
vgg16(32, 32, 3, 128)/forward/CPU/8 thread(s)
626986833
ns626587375
ns1.00
vgg16(32, 32, 3, 128)/forward/CPU/1 thread(s)
1860884792
ns1770240250.5
ns1.05
vgg16(32, 32, 3, 128)/forward/GPU/CUDA
12309151
ns12316473
ns1.00
vgg16(32, 32, 3, 128)/zygote/CPU/2 thread(s)
3633655916
ns3682510624.5
ns0.99
vgg16(32, 32, 3, 128)/zygote/CPU/4 thread(s)
2828990458
ns2820068709
ns1.00
vgg16(32, 32, 3, 128)/zygote/CPU/8 thread(s)
2702591209
ns2719913792
ns0.99
vgg16(32, 32, 3, 128)/zygote/CPU/1 thread(s)
5056811500
ns5042856917
ns1.00
vgg16(32, 32, 3, 128)/zygote/GPU/CUDA
49201169.5
ns49349519
ns1.00
Conv((3, 3), 4 => 4, relu)(64 x 64 x 4 x 128)/forward/CPU/2 thread(s)
3425625
ns3418708
ns1.00
Conv((3, 3), 4 => 4, relu)(64 x 64 x 4 x 128)/forward/CPU/4 thread(s)
2072958.5
ns2082958
ns1.00
Conv((3, 3), 4 => 4, relu)(64 x 64 x 4 x 128)/forward/CPU/8 thread(s)
2525541
ns2540500
ns0.99
Conv((3, 3), 4 => 4, relu)(64 x 64 x 4 x 128)/forward/CPU/1 thread(s)
6028666.5
ns6009666
ns1.00
Conv((3, 3), 4 => 4, relu)(64 x 64 x 4 x 128)/forward/GPU/CUDA
322034
ns326984.5
ns0.98
Conv((3, 3), 4 => 4, relu)(64 x 64 x 4 x 128)/zygote/CPU/2 thread(s)
25910625
ns26222333
ns0.99
Conv((3, 3), 4 => 4, relu)(64 x 64 x 4 x 128)/zygote/CPU/4 thread(s)
18853584
ns18914875
ns1.00
Conv((3, 3), 4 => 4, relu)(64 x 64 x 4 x 128)/zygote/CPU/8 thread(s)
19458875
ns19338354.5
ns1.01
Conv((3, 3), 4 => 4, relu)(64 x 64 x 4 x 128)/zygote/CPU/1 thread(s)
39298645.5
ns39331500
ns1.00
Conv((3, 3), 4 => 4, relu)(64 x 64 x 4 x 128)/zygote/GPU/CUDA
2474706.5
ns2462554
ns1.00
Conv((3, 3), 4 => 4, relu)(64 x 64 x 4 x 128)/enzyme/CPU/2 thread(s)
54292500
ns56235458
ns0.97
Conv((3, 3), 4 => 4, relu)(64 x 64 x 4 x 128)/enzyme/CPU/4 thread(s)
81331292
ns80835708
ns1.01
Conv((3, 3), 4 => 4, relu)(64 x 64 x 4 x 128)/enzyme/CPU/8 thread(s)
170565562
ns169939541.5
ns1.00
Conv((3, 3), 4 => 4, relu)(64 x 64 x 4 x 128)/enzyme/CPU/1 thread(s)
45567333
ns45337209
ns1.01
Conv((3, 3), 2 => 2, relu)(64 x 64 x 2 x 128)/forward/CPU/2 thread(s)
1782916
ns1784584
ns1.00
Conv((3, 3), 2 => 2, relu)(64 x 64 x 2 x 128)/forward/CPU/4 thread(s)
1103709
ns1103604.5
ns1.00
Conv((3, 3), 2 => 2, relu)(64 x 64 x 2 x 128)/forward/CPU/8 thread(s)
1548917
ns1566667
ns0.99
Conv((3, 3), 2 => 2, relu)(64 x 64 x 2 x 128)/forward/CPU/1 thread(s)
3027375
ns3032666
ns1.00
Conv((3, 3), 2 => 2, relu)(64 x 64 x 2 x 128)/forward/GPU/CUDA
210691.5
ns213488
ns0.99
Conv((3, 3), 2 => 2, relu)(64 x 64 x 2 x 128)/zygote/CPU/2 thread(s)
12525854
ns12541520.5
ns1.00
Conv((3, 3), 2 => 2, relu)(64 x 64 x 2 x 128)/zygote/CPU/4 thread(s)
9206541.5
ns9203729.5
ns1.00
Conv((3, 3), 2 => 2, relu)(64 x 64 x 2 x 128)/zygote/CPU/8 thread(s)
9628792
ns9646562.5
ns1.00
Conv((3, 3), 2 => 2, relu)(64 x 64 x 2 x 128)/zygote/CPU/1 thread(s)
19005604.5
ns18999125
ns1.00
Conv((3, 3), 2 => 2, relu)(64 x 64 x 2 x 128)/zygote/GPU/CUDA
1537547
ns1533148
ns1.00
Conv((3, 3), 2 => 2, relu)(64 x 64 x 2 x 128)/enzyme/CPU/2 thread(s)
17655854
ns17653167
ns1.00
Conv((3, 3), 2 => 2, relu)(64 x 64 x 2 x 128)/enzyme/CPU/4 thread(s)
14331645.5
ns14324854.5
ns1.00
Conv((3, 3), 2 => 2, relu)(64 x 64 x 2 x 128)/enzyme/CPU/8 thread(s)
14600583
ns14581208.5
ns1.00
Conv((3, 3), 2 => 2, relu)(64 x 64 x 2 x 128)/enzyme/CPU/1 thread(s)
22163250
ns22196729.5
ns1.00
Conv((3, 3), 32 => 32, relu)(64 x 64 x 32 x 128)/forward/CPU/2 thread(s)
70499459
ns70444166.5
ns1.00
Conv((3, 3), 32 => 32, relu)(64 x 64 x 32 x 128)/forward/CPU/4 thread(s)
43573833
ns43349417
ns1.01
Conv((3, 3), 32 => 32, relu)(64 x 64 x 32 x 128)/forward/CPU/8 thread(s)
39479542
ns39635792
ns1.00
Conv((3, 3), 32 => 32, relu)(64 x 64 x 32 x 128)/forward/CPU/1 thread(s)
132481104.5
ns132857146
ns1.00
Conv((3, 3), 32 => 32, relu)(64 x 64 x 32 x 128)/forward/GPU/CUDA
1867593
ns1867040.5
ns1.00
Conv((3, 3), 32 => 32, relu)(64 x 64 x 32 x 128)/zygote/CPU/2 thread(s)
360531229
ns361132208
ns1.00
Conv((3, 3), 32 => 32, relu)(64 x 64 x 32 x 128)/zygote/CPU/4 thread(s)
345233354
ns345972771
ns1.00
Conv((3, 3), 32 => 32, relu)(64 x 64 x 32 x 128)/zygote/CPU/8 thread(s)
303345083
ns304480083
ns1.00
Conv((3, 3), 32 => 32, relu)(64 x 64 x 32 x 128)/zygote/CPU/1 thread(s)
722647875
ns724393166
ns1.00
Conv((3, 3), 32 => 32, relu)(64 x 64 x 32 x 128)/zygote/GPU/CUDA
13388759.5
ns13316474.5
ns1.01
Conv((3, 3), 32 => 32, relu)(64 x 64 x 32 x 128)/enzyme/CPU/2 thread(s)
418893124.5
ns419098563
ns1.00
Conv((3, 3), 32 => 32, relu)(64 x 64 x 32 x 128)/enzyme/CPU/4 thread(s)
418550083
ns427616083
ns0.98
Conv((3, 3), 32 => 32, relu)(64 x 64 x 32 x 128)/enzyme/CPU/8 thread(s)
733622021
ns709012833
ns1.03
Conv((3, 3), 32 => 32, relu)(64 x 64 x 32 x 128)/enzyme/CPU/1 thread(s)
714074250
ns714833666
ns1.00
mlp7layer_bn(gelu)(32 x 256)/forward/CPU/2 thread(s)
1662791
ns1667417
ns1.00
mlp7layer_bn(gelu)(32 x 256)/forward/CPU/4 thread(s)
1326395.5
ns1348521
ns0.98
mlp7layer_bn(gelu)(32 x 256)/forward/CPU/8 thread(s)
1266458.5
ns1328125
ns0.95
mlp7layer_bn(gelu)(32 x 256)/forward/CPU/1 thread(s)
2293875
ns2425812.5
ns0.95
mlp7layer_bn(gelu)(32 x 256)/forward/GPU/CUDA
584223
ns582318.5
ns1.00
mlp7layer_bn(gelu)(32 x 256)/zygote/CPU/2 thread(s)
8911021
ns9008145.5
ns0.99
mlp7layer_bn(gelu)(32 x 256)/zygote/CPU/4 thread(s)
12871250
ns12958667
ns0.99
mlp7layer_bn(gelu)(32 x 256)/zygote/CPU/8 thread(s)
31057917
ns31132854
ns1.00
mlp7layer_bn(gelu)(32 x 256)/zygote/CPU/1 thread(s)
9825729.5
ns9869291.5
ns1.00
mlp7layer_bn(gelu)(32 x 256)/zygote/GPU/CUDA
1434469
ns1441009.5
ns1.00
mlp7layer_bn(gelu)(32 x 256)/enzyme/CPU/2 thread(s)
16503167
ns18356416
ns0.90
mlp7layer_bn(gelu)(32 x 256)/enzyme/CPU/4 thread(s)
20919875
ns17371542
ns1.20
mlp7layer_bn(gelu)(32 x 256)/enzyme/CPU/8 thread(s)
44942437.5
ns29956292
ns1.50
mlp7layer_bn(gelu)(32 x 256)/enzyme/CPU/1 thread(s)
13103167
ns14108833.5
ns0.93
Dense(512 => 512, relu)(512 x 128)/forward/CPU/2 thread(s)
789458
ns696500
ns1.13
Dense(512 => 512, relu)(512 x 128)/forward/CPU/4 thread(s)
538437.5
ns500812.5
ns1.08
Dense(512 => 512, relu)(512 x 128)/forward/CPU/8 thread(s)
1024041.5
ns1033458
ns0.99
Dense(512 => 512, relu)(512 x 128)/forward/CPU/1 thread(s)
725041
ns725500
ns1.00
Dense(512 => 512, relu)(512 x 128)/forward/GPU/CUDA
47144.5
ns47684
ns0.99
Dense(512 => 512, relu)(512 x 128)/zygote/CPU/2 thread(s)
1463416
ns1577042
ns0.93
Dense(512 => 512, relu)(512 x 128)/zygote/CPU/4 thread(s)
1040312
ns1051000
ns0.99
Dense(512 => 512, relu)(512 x 128)/zygote/CPU/8 thread(s)
1411187.5
ns1370125
ns1.03
Dense(512 => 512, relu)(512 x 128)/zygote/CPU/1 thread(s)
2257916
ns2303687
ns0.98
Dense(512 => 512, relu)(512 x 128)/zygote/GPU/CUDA
234270.5
ns238125.5
ns0.98
Dense(512 => 512, relu)(512 x 128)/enzyme/CPU/2 thread(s)
1530583
ns1558687.5
ns0.98
Dense(512 => 512, relu)(512 x 128)/enzyme/CPU/4 thread(s)
1024209
ns1045916.5
ns0.98
Dense(512 => 512, relu)(512 x 128)/enzyme/CPU/8 thread(s)
1524333
ns1461916
ns1.04
Dense(512 => 512, relu)(512 x 128)/enzyme/CPU/1 thread(s)
2201771
ns2228229
ns0.99
Conv((3, 3), 4 => 4, identity)(64 x 64 x 4 x 128)/forward/CPU/2 thread(s)
3406354
ns3409125
ns1.00
Conv((3, 3), 4 => 4, identity)(64 x 64 x 4 x 128)/forward/CPU/4 thread(s)
2052854.5
ns2066270.5
ns0.99
Conv((3, 3), 4 => 4, identity)(64 x 64 x 4 x 128)/forward/CPU/8 thread(s)
2507959
ns2525500
ns0.99
Conv((3, 3), 4 => 4, identity)(64 x 64 x 4 x 128)/forward/CPU/1 thread(s)
6001333
ns6013791
ns1.00
Conv((3, 3), 4 => 4, identity)(64 x 64 x 4 x 128)/forward/GPU/CUDA
287105.5
ns289794
ns0.99
Conv((3, 3), 4 => 4, identity)(64 x 64 x 4 x 128)/zygote/CPU/2 thread(s)
24055375
ns24055187.5
ns1.00
Conv((3, 3), 4 => 4, identity)(64 x 64 x 4 x 128)/zygote/CPU/4 thread(s)
17211646
ns17202000
ns1.00
Conv((3, 3), 4 => 4, identity)(64 x 64 x 4 x 128)/zygote/CPU/8 thread(s)
17114333
ns17133562.5
ns1.00
Conv((3, 3), 4 => 4, identity)(64 x 64 x 4 x 128)/zygote/CPU/1 thread(s)
37572333
ns37566146
ns1.00
Conv((3, 3), 4 => 4, identity)(64 x 64 x 4 x 128)/zygote/GPU/CUDA
2401548.5
ns2407993
ns1.00
Conv((3, 3), 4 => 4, identity)(64 x 64 x 4 x 128)/enzyme/CPU/2 thread(s)
52614646
ns54216541.5
ns0.97
Conv((3, 3), 4 => 4, identity)(64 x 64 x 4 x 128)/enzyme/CPU/4 thread(s)
82221646
ns83772249.5
ns0.98
Conv((3, 3), 4 => 4, identity)(64 x 64 x 4 x 128)/enzyme/CPU/8 thread(s)
169582250
ns166696854
ns1.02
Conv((3, 3), 4 => 4, identity)(64 x 64 x 4 x 128)/enzyme/CPU/1 thread(s)
44570125
ns44480500
ns1.00
Conv((3, 3), 64 => 64, identity)(64 x 64 x 64 x 128)/forward/CPU/2 thread(s)
250290791.5
ns250124083
ns1.00
Conv((3, 3), 64 => 64, identity)(64 x 64 x 64 x 128)/forward/CPU/4 thread(s)
148276667
ns148262625
ns1.00
Conv((3, 3), 64 => 64, identity)(64 x 64 x 64 x 128)/forward/CPU/8 thread(s)
115710770.5
ns115870833.5
ns1.00
Conv((3, 3), 64 => 64, identity)(64 x 64 x 64 x 128)/forward/CPU/1 thread(s)
447663770.5
ns448008979
ns1.00
Conv((3, 3), 64 => 64, identity)(64 x 64 x 64 x 128)/forward/GPU/CUDA
5443484
ns5471191
ns0.99
Conv((3, 3), 64 => 64, identity)(64 x 64 x 64 x 128)/zygote/CPU/2 thread(s)
1105632500
ns1103626125
ns1.00
Conv((3, 3), 64 => 64, identity)(64 x 64 x 64 x 128)/zygote/CPU/4 thread(s)
854893979
ns857579145.5
ns1.00
Conv((3, 3), 64 => 64, identity)(64 x 64 x 64 x 128)/zygote/CPU/8 thread(s)
827018271
ns827690125
ns1.00
Conv((3, 3), 64 => 64, identity)(64 x 64 x 64 x 128)/zygote/CPU/1 thread(s)
1767047166
ns1753891459
ns1.01
Conv((3, 3), 64 => 64, identity)(64 x 64 x 64 x 128)/zygote/GPU/CUDA
28898282.5
ns29184619
ns0.99
Conv((3, 3), 64 => 64, identity)(64 x 64 x 64 x 128)/enzyme/CPU/2 thread(s)
1021345729.5
ns1020122687.5
ns1.00
Conv((3, 3), 64 => 64, identity)(64 x 64 x 64 x 128)/enzyme/CPU/4 thread(s)
974787791
ns963543750
ns1.01
Conv((3, 3), 64 => 64, identity)(64 x 64 x 64 x 128)/enzyme/CPU/8 thread(s)
1329964041.5
ns1321081709
ns1.01
Conv((3, 3), 64 => 64, identity)(64 x 64 x 64 x 128)/enzyme/CPU/1 thread(s)
1731428604.5
ns1722584666.5
ns1.01
mlp7layer_bn(relu)(32 x 256)/forward/CPU/2 thread(s)
1243062.5
ns1339041
ns0.93
mlp7layer_bn(relu)(32 x 256)/forward/CPU/4 thread(s)
955375
ns970521
ns0.98
mlp7layer_bn(relu)(32 x 256)/forward/CPU/8 thread(s)
906875
ns971250
ns0.93
mlp7layer_bn(relu)(32 x 256)/forward/CPU/1 thread(s)
2048500
ns1954229
ns1.05
mlp7layer_bn(relu)(32 x 256)/forward/GPU/CUDA
563206.5
ns575255
ns0.98
mlp7layer_bn(relu)(32 x 256)/zygote/CPU/2 thread(s)
5919958
ns6044667
ns0.98
mlp7layer_bn(relu)(32 x 256)/zygote/CPU/4 thread(s)
6419604
ns6367750
ns1.01
mlp7layer_bn(relu)(32 x 256)/zygote/CPU/8 thread(s)
23873812
ns25225645.5
ns0.95
mlp7layer_bn(relu)(32 x 256)/zygote/CPU/1 thread(s)
7097854.5
ns7126291.5
ns1.00
mlp7layer_bn(relu)(32 x 256)/zygote/GPU/CUDA
1364575
ns1413961
ns0.97
mlp7layer_bn(relu)(32 x 256)/enzyme/CPU/2 thread(s)
9591542
ns11543792
ns0.83
mlp7layer_bn(relu)(32 x 256)/enzyme/CPU/4 thread(s)
13052166.5
ns9916958
ns1.32
mlp7layer_bn(relu)(32 x 256)/enzyme/CPU/8 thread(s)
31360875
ns18299375
ns1.71
mlp7layer_bn(relu)(32 x 256)/enzyme/CPU/1 thread(s)
7260167
ns8684625
ns0.84
Dense(128 => 128, gelu)(128 x 128)/forward/CPU/2 thread(s)
481625
ns389458
ns1.24
Dense(128 => 128, gelu)(128 x 128)/forward/CPU/4 thread(s)
443500
ns359500
ns1.23
Dense(128 => 128, gelu)(128 x 128)/forward/CPU/8 thread(s)
1999500
ns2193750
ns0.91
Dense(128 => 128, gelu)(128 x 128)/forward/CPU/1 thread(s)
87833
ns88042
ns1.00
Dense(128 => 128, gelu)(128 x 128)/forward/GPU/CUDA
27760
ns28321
ns0.98
Dense(128 => 128, gelu)(128 x 128)/zygote/CPU/2 thread(s)
377333
ns396895.5
ns0.95
Dense(128 => 128, gelu)(128 x 128)/zygote/CPU/4 thread(s)
439500
ns440458
ns1.00
Dense(128 => 128, gelu)(128 x 128)/zygote/CPU/8 thread(s)
4505250
ns4619792
ns0.98
Dense(128 => 128, gelu)(128 x 128)/zygote/CPU/1 thread(s)
258291
ns265375
ns0.97
Dense(128 => 128, gelu)(128 x 128)/zygote/GPU/CUDA
219213
ns227034
ns0.97
Dense(128 => 128, gelu)(128 x 128)/enzyme/CPU/2 thread(s)
408166.5
ns429500
ns0.95
Dense(128 => 128, gelu)(128 x 128)/enzyme/CPU/4 thread(s)
470125
ns470792
ns1.00
Dense(128 => 128, gelu)(128 x 128)/enzyme/CPU/8 thread(s)
4495000
ns4863667
ns0.92
Dense(128 => 128, gelu)(128 x 128)/enzyme/CPU/1 thread(s)
271000
ns271041.5
ns1.00
Dense(128 => 128, relu)(128 x 128)/forward/CPU/2 thread(s)
427792
ns333667
ns1.28
Dense(128 => 128, relu)(128 x 128)/forward/CPU/4 thread(s)
376896
ns294437
ns1.28
Dense(128 => 128, relu)(128 x 128)/forward/CPU/8 thread(s)
733562.5
ns740833.5
ns0.99
Dense(128 => 128, relu)(128 x 128)/forward/CPU/1 thread(s)
52417
ns52854.5
ns0.99
Dense(128 => 128, relu)(128 x 128)/forward/GPU/CUDA
28102
ns28719
ns0.98
Dense(128 => 128, relu)(128 x 128)/zygote/CPU/2 thread(s)
336500
ns366250
ns0.92
Dense(128 => 128, relu)(128 x 128)/zygote/CPU/4 thread(s)
333854
ns335625
ns0.99
Dense(128 => 128, relu)(128 x 128)/zygote/CPU/8 thread(s)
419229.5
ns813354
ns0.52
Dense(128 => 128, relu)(128 x 128)/zygote/CPU/1 thread(s)
151562.5
ns155312
ns0.98
Dense(128 => 128, relu)(128 x 128)/zygote/GPU/CUDA
204281.5
ns212500
ns0.96
Dense(128 => 128, relu)(128 x 128)/enzyme/CPU/2 thread(s)
351375
ns377292
ns0.93
Dense(128 => 128, relu)(128 x 128)/enzyme/CPU/4 thread(s)
348375
ns351833
ns0.99
Dense(128 => 128, relu)(128 x 128)/enzyme/CPU/8 thread(s)
899625
ns908146
ns0.99
Dense(128 => 128, relu)(128 x 128)/enzyme/CPU/1 thread(s)
150667
ns150959
ns1.00
vgg16(32, 32, 3, 64)/forward/CPU/2 thread(s)
603094958
ns602630000
ns1.00
vgg16(32, 32, 3, 64)/forward/CPU/4 thread(s)
428615062.5
ns425446812.5
ns1.01
vgg16(32, 32, 3, 64)/forward/CPU/8 thread(s)
384740459
ns373310250
ns1.03
vgg16(32, 32, 3, 64)/forward/CPU/1 thread(s)
873854208.5
ns874089917
ns1.00
vgg16(32, 32, 3, 64)/forward/GPU/CUDA
7027277
ns7031776
ns1.00
vgg16(32, 32, 3, 64)/zygote/CPU/2 thread(s)
2002711146
ns2002661937.5
ns1.00
vgg16(32, 32, 3, 64)/zygote/CPU/4 thread(s)
1606403375
ns1606159333.5
ns1.00
vgg16(32, 32, 3, 64)/zygote/CPU/8 thread(s)
1551092146
ns1596550354
ns0.97
vgg16(32, 32, 3, 64)/zygote/CPU/1 thread(s)
2631708917
ns2634530667
ns1.00
vgg16(32, 32, 3, 64)/zygote/GPU/CUDA
26123824
ns25871825
ns1.01
Dense(512 => 512, gelu)(512 x 128)/forward/CPU/2 thread(s)
521396
ns520500
ns1.00
Dense(512 => 512, gelu)(512 x 128)/forward/CPU/4 thread(s)
431875
ns438417
ns0.99
Dense(512 => 512, gelu)(512 x 128)/forward/CPU/8 thread(s)
1926416
ns1781667
ns1.08
Dense(512 => 512, gelu)(512 x 128)/forward/CPU/1 thread(s)
866417
ns875958
ns0.99
Dense(512 => 512, gelu)(512 x 128)/forward/GPU/CUDA
47024
ns47831
ns0.98
Dense(512 => 512, gelu)(512 x 128)/zygote/CPU/2 thread(s)
1855270.5
ns1845875
ns1.01
Dense(512 => 512, gelu)(512 x 128)/zygote/CPU/4 thread(s)
2793583
ns2803167
ns1.00
Dense(512 => 512, gelu)(512 x 128)/zygote/CPU/8 thread(s)
14609250
ns14421478.5
ns1.01
Dense(512 => 512, gelu)(512 x 128)/zygote/CPU/1 thread(s)
2648521
ns2762500
ns0.96
Dense(512 => 512, gelu)(512 x 128)/zygote/GPU/CUDA
246347
ns254912
ns0.97
Dense(512 => 512, gelu)(512 x 128)/enzyme/CPU/2 thread(s)
1974875
ns1952312.5
ns1.01
Dense(512 => 512, gelu)(512 x 128)/enzyme/CPU/4 thread(s)
5038917
ns5063375
ns1.00
Dense(512 => 512, gelu)(512 x 128)/enzyme/CPU/8 thread(s)
15177854.5
ns14644042
ns1.04
Dense(512 => 512, gelu)(512 x 128)/enzyme/CPU/1 thread(s)
2744270.5
ns2791250
ns0.98
mlp7layer_bn(tanh)(32 x 256)/forward/CPU/2 thread(s)
1512729
ns1556042
ns0.97
mlp7layer_bn(tanh)(32 x 256)/forward/CPU/4 thread(s)
1178292
ns1242250
ns0.95
mlp7layer_bn(tanh)(32 x 256)/forward/CPU/8 thread(s)
1180084
ns1188250
ns0.99
mlp7layer_bn(tanh)(32 x 256)/forward/CPU/1 thread(s)
2300375
ns2351542
ns0.98
mlp7layer_bn(tanh)(32 x 256)/forward/GPU/CUDA
589242.5
ns587337
ns1.00
mlp7layer_bn(tanh)(32 x 256)/zygote/CPU/2 thread(s)
5245791
ns5962500.5
ns0.88
mlp7layer_bn(tanh)(32 x 256)/zygote/CPU/4 thread(s)
4733604
ns4728083
ns1.00
mlp7layer_bn(tanh)(32 x 256)/zygote/CPU/8 thread(s)
24184833
ns25706291.5
ns0.94
mlp7layer_bn(tanh)(32 x 256)/zygote/CPU/1 thread(s)
7316583
ns7342917
ns1.00
mlp7layer_bn(tanh)(32 x 256)/zygote/GPU/CUDA
1392514
ns1398774.5
ns1.00
mlp7layer_bn(tanh)(32 x 256)/enzyme/CPU/2 thread(s)
11607209
ns13281166.5
ns0.87
mlp7layer_bn(tanh)(32 x 256)/enzyme/CPU/4 thread(s)
16305271
ns11246833
ns1.45
mlp7layer_bn(tanh)(32 x 256)/enzyme/CPU/8 thread(s)
35977250
ns20840520.5
ns1.73
mlp7layer_bn(tanh)(32 x 256)/enzyme/CPU/1 thread(s)
9550875
ns10611542
ns0.90
Dense(16 => 16, relu)(16 x 128)/forward/CPU/2 thread(s)
2333
ns2770.5
ns0.84
Dense(16 => 16, relu)(16 x 128)/forward/CPU/4 thread(s)
2542
ns4875
ns0.52
Dense(16 => 16, relu)(16 x 128)/forward/CPU/8 thread(s)
3083
ns2875
ns1.07
Dense(16 => 16, relu)(16 x 128)/forward/CPU/1 thread(s)
2458
ns2500
ns0.98
Dense(16 => 16, relu)(16 x 128)/forward/GPU/CUDA
25059
ns25042
ns1.00
Dense(16 => 16, relu)(16 x 128)/zygote/CPU/2 thread(s)
7417
ns7084
ns1.05
Dense(16 => 16, relu)(16 x 128)/zygote/CPU/4 thread(s)
7042
ns7166
ns0.98
Dense(16 => 16, relu)(16 x 128)/zygote/CPU/8 thread(s)
7209
ns7333
ns0.98
Dense(16 => 16, relu)(16 x 128)/zygote/CPU/1 thread(s)
7333
ns7208
ns1.02
Dense(16 => 16, relu)(16 x 128)/zygote/GPU/CUDA
214253
ns216255
ns0.99
Dense(16 => 16, relu)(16 x 128)/enzyme/CPU/2 thread(s)
8333
ns8500
ns0.98
Dense(16 => 16, relu)(16 x 128)/enzyme/CPU/4 thread(s)
8083
ns8125
ns0.99
Dense(16 => 16, relu)(16 x 128)/enzyme/CPU/8 thread(s)
8437.5
ns8291
ns1.02
Dense(16 => 16, relu)(16 x 128)/enzyme/CPU/1 thread(s)
6125
ns5958
ns1.03
Dense(16 => 16, gelu)(16 x 128)/forward/CPU/2 thread(s)
10667
ns11416
ns0.93
Dense(16 => 16, gelu)(16 x 128)/forward/CPU/4 thread(s)
13791
ns14208
ns0.97
Dense(16 => 16, gelu)(16 x 128)/forward/CPU/8 thread(s)
11208
ns10958
ns1.02
Dense(16 => 16, gelu)(16 x 128)/forward/CPU/1 thread(s)
7375
ns7167
ns1.03
Dense(16 => 16, gelu)(16 x 128)/forward/GPU/CUDA
25157
ns25461
ns0.99
Dense(16 => 16, gelu)(16 x 128)/zygote/CPU/2 thread(s)
20062.5
ns19792
ns1.01
Dense(16 => 16, gelu)(16 x 128)/zygote/CPU/4 thread(s)
19833
ns20084
ns0.99
Dense(16 => 16, gelu)(16 x 128)/zygote/CPU/8 thread(s)
20041
ns20041
ns1
Dense(16 => 16, gelu)(16 x 128)/zygote/CPU/1 thread(s)
20000
ns19875
ns1.01
Dense(16 => 16, gelu)(16 x 128)/zygote/GPU/CUDA
235669
ns236659.5
ns1.00
Dense(16 => 16, gelu)(16 x 128)/enzyme/CPU/2 thread(s)
23562.5
ns23500
ns1.00
Dense(16 => 16, gelu)(16 x 128)/enzyme/CPU/4 thread(s)
23417
ns23500
ns1.00
Dense(16 => 16, gelu)(16 x 128)/enzyme/CPU/8 thread(s)
23625
ns23791
ns0.99
Dense(16 => 16, gelu)(16 x 128)/enzyme/CPU/1 thread(s)
21458
ns21375
ns1.00
Dense(128 => 128, identity)(128 x 128)/forward/CPU/2 thread(s)
28584
ns28479.5
ns1.00
Dense(128 => 128, identity)(128 x 128)/forward/CPU/4 thread(s)
28916
ns28917
ns1.00
Dense(128 => 128, identity)(128 x 128)/forward/CPU/8 thread(s)
29417
ns28625
ns1.03
Dense(128 => 128, identity)(128 x 128)/forward/CPU/1 thread(s)
46000
ns46083
ns1.00
Dense(128 => 128, identity)(128 x 128)/forward/GPU/CUDA
26406
ns26494
ns1.00
Dense(128 => 128, identity)(128 x 128)/zygote/CPU/2 thread(s)
221667
ns222354.5
ns1.00
Dense(128 => 128, identity)(128 x 128)/zygote/CPU/4 thread(s)
278604.5
ns275541.5
ns1.01
Dense(128 => 128, identity)(128 x 128)/zygote/CPU/8 thread(s)
4081750
ns4186187.5
ns0.98
Dense(128 => 128, identity)(128 x 128)/zygote/CPU/1 thread(s)
145833
ns145834
ns1.00
Dense(128 => 128, identity)(128 x 128)/zygote/GPU/CUDA
208494.5
ns209501
ns1.00
Dense(128 => 128, identity)(128 x 128)/enzyme/CPU/2 thread(s)
237333
ns333708
ns0.71
Dense(128 => 128, identity)(128 x 128)/enzyme/CPU/4 thread(s)
295625
ns313334
ns0.94
Dense(128 => 128, identity)(128 x 128)/enzyme/CPU/8 thread(s)
4027625
ns519937.5
ns7.75
Dense(128 => 128, identity)(128 x 128)/enzyme/CPU/1 thread(s)
145875
ns161041
ns0.91
Dense(16 => 16, identity)(16 x 128)/forward/CPU/2 thread(s)
2083.5
ns1729.5
ns1.20
Dense(16 => 16, identity)(16 x 128)/forward/CPU/4 thread(s)
1917
ns4584
ns0.42
Dense(16 => 16, identity)(16 x 128)/forward/CPU/8 thread(s)
2458
ns2417
ns1.02
Dense(16 => 16, identity)(16 x 128)/forward/CPU/1 thread(s)
1791
ns2084
ns0.86
Dense(16 => 16, identity)(16 x 128)/forward/GPU/CUDA
23206
ns23313.5
ns1.00
Dense(16 => 16, identity)(16 x 128)/zygote/CPU/2 thread(s)
5333
ns5500
ns0.97
Dense(16 => 16, identity)(16 x 128)/zygote/CPU/4 thread(s)
5167
ns5250
ns0.98
Dense(16 => 16, identity)(16 x 128)/zygote/CPU/8 thread(s)
5375
ns5209
ns1.03
Dense(16 => 16, identity)(16 x 128)/zygote/CPU/1 thread(s)
5250
ns5209
ns1.01
Dense(16 => 16, identity)(16 x 128)/zygote/GPU/CUDA
238219
ns243931.5
ns0.98
Dense(16 => 16, identity)(16 x 128)/enzyme/CPU/2 thread(s)
7292
ns11250
ns0.65
Dense(16 => 16, identity)(16 x 128)/enzyme/CPU/4 thread(s)
7291
ns11291
ns0.65
Dense(16 => 16, identity)(16 x 128)/enzyme/CPU/8 thread(s)
7542
ns11417
ns0.66
Dense(16 => 16, identity)(16 x 128)/enzyme/CPU/1 thread(s)
5333
ns6917
ns0.77
Conv((3, 3), 32 => 32, gelu)(64 x 64 x 32 x 128)/forward/CPU/2 thread(s)
79904000
ns79906375
ns1.00
Conv((3, 3), 32 => 32, gelu)(64 x 64 x 32 x 128)/forward/CPU/4 thread(s)
49166750
ns49046500
ns1.00
Conv((3, 3), 32 => 32, gelu)(64 x 64 x 32 x 128)/forward/CPU/8 thread(s)
44974542
ns44994417
ns1.00
Conv((3, 3), 32 => 32, gelu)(64 x 64 x 32 x 128)/forward/CPU/1 thread(s)
151504667
ns151607583
ns1.00
Conv((3, 3), 32 => 32, gelu)(64 x 64 x 32 x 128)/forward/GPU/CUDA
2718498
ns2724784.5
ns1.00
Conv((3, 3), 32 => 32, gelu)(64 x 64 x 32 x 128)/zygote/CPU/2 thread(s)
496218625
ns664456250
ns0.75
Conv((3, 3), 32 => 32, gelu)(64 x 64 x 32 x 128)/zygote/CPU/4 thread(s)
410097125
ns410418958
ns1.00
Conv((3, 3), 32 => 32, gelu)(64 x 64 x 32 x 128)/zygote/CPU/8 thread(s)
397607667
ns401447083
ns0.99
Conv((3, 3), 32 => 32, gelu)(64 x 64 x 32 x 128)/zygote/CPU/1 thread(s)
684031750
ns682646167
ns1.00
Conv((3, 3), 32 => 32, gelu)(64 x 64 x 32 x 128)/zygote/GPU/CUDA
14583158
ns14573408
ns1.00
Conv((3, 3), 32 => 32, gelu)(64 x 64 x 32 x 128)/enzyme/CPU/2 thread(s)
709703166.5
ns711193604.5
ns1.00
Conv((3, 3), 32 => 32, gelu)(64 x 64 x 32 x 128)/enzyme/CPU/4 thread(s)
675407250
ns688899625
ns0.98
Conv((3, 3), 32 => 32, gelu)(64 x 64 x 32 x 128)/enzyme/CPU/8 thread(s)
1001028958
ns1017383917
ns0.98
Conv((3, 3), 32 => 32, gelu)(64 x 64 x 32 x 128)/enzyme/CPU/1 thread(s)
995697250
ns998024084
ns1.00
This comment was automatically generated by workflow using github-action-benchmark.