Skip to content

Commit

Permalink
feat: update imagenet training script
Browse files Browse the repository at this point in the history
  • Loading branch information
avik-pal committed Sep 15, 2024
1 parent 5200f58 commit 1d064ec
Show file tree
Hide file tree
Showing 10 changed files with 515 additions and 49,871 deletions.
8 changes: 0 additions & 8 deletions docs/src/ecosystem.md
Original file line number Diff line number Diff line change
Expand Up @@ -162,14 +162,6 @@ const autodiff = [
];
const dataload = [
{
avatar: 'https://github.com/evizero.png',
name: 'Augmentor.jl',
desc: 'Data augmentation for machine learning',
links: [
{ icon: 'github', link: 'https://github.com/evizero/Augmentor.jl' }
]
},
{
avatar: 'https://github.com/JuliaML.png',
name: 'MLUtils.jl',
Expand Down
53 changes: 53 additions & 0 deletions examples/ImageNet/Project copy.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
# [deps]
# AMDGPU = "21141c5a-9bdb-4563-92ae-f87d6854732e"
# Augmentor = "02898b10-1f73-11ea-317c-6393d7073e15"
# Boltz = "4544d5e4-abc5-4dea-817f-29e4c205d9c8"
# Configurations = "5218b696-f38b-4ac9-8b61-a12ec717816d"
# Dates = "ade2ca70-3891-5945-98fb-dc099432e06a"
# FLoops = "cc61a311-1640-44b5-9fba-1b764f453329"
# FileIO = "5789e2e9-d7fb-5bc7-8068-2c6fae9b9549"
# Format = "1fa38f19-a742-5d3f-a2b9-30dd87b9d5f8"
# Functors = "d9f16b24-f501-4c13-a1f2-28368ffc5196"
# Images = "916415d5-f1e6-5110-898d-aaa5f9f070e0"
# JLD2 = "033835bb-8acc-5ee8-8aae-3f567f8a3819"
# JpegTurbo = "b835a17e-a41a-41e7-81f0-2f016b05efe0"
# Lux = "b2108857-7c20-44ae-9111-449ecde12c47"
# LuxCUDA = "d0bbae9a-e099-4d5b-a835-1c6931763bda"
# MLUtils = "f1d291b0-491e-4a28-83b9-f70985020b54"
# MPI = "da04e1cc-30fd-572f-bb4f-1f8673147195"
# Metalhead = "dbeba491-748d-5e0e-a39e-b530a07fa0cc"
# NCCL = "3fe64909-d7a1-4096-9b7d-7a0f12cf0f6b"
# OneHotArrays = "0b1bfda6-eb8a-41d2-88d8-f5af5cad476f"
# Optimisers = "3bd65402-5787-11e9-1adc-39752487f4e2"
# ParameterSchedulers = "d7d3b36b-41b8-4d0d-a2bf-768c6151755e"
# Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
# Setfield = "efcf1570-3423-57d1-acb7-fd33fddbac46"
# SimpleConfig = "f2d95530-262a-480f-aff0-1c0431e662a7"
# Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"
# Zygote = "e88e6eb3-aa80-5325-afca-941959d7151f"

# [compat]
# AMDGPU = "1"
# Augmentor = "0.6"
# Boltz = "1"
# Configurations = "0.17"
# FLoops = "0.2"
# FileIO = "1.16"
# Format = "1.3"
# Functors = "0.4"
# Images = "0.26"
# JLD2 = "0.4.46, 0.5"
# JpegTurbo = "0.1"
# Lux = "1"
# LuxCUDA = "0.3"
# MLUtils = "0.4"
# MPI = "0.20.19"
# Metalhead = "0.9"
# NCCL = "0.1.1"
# OneHotArrays = "0.2"
# Optimisers = "0.3"
# ParameterSchedulers = "0.4"
# Setfield = "1"
# SimpleConfig = "0.1"
# Statistics = "1"
# Zygote = "0.6"
42 changes: 7 additions & 35 deletions examples/ImageNet/Project.toml
Original file line number Diff line number Diff line change
@@ -1,53 +1,25 @@
[deps]
AMDGPU = "21141c5a-9bdb-4563-92ae-f87d6854732e"
Augmentor = "02898b10-1f73-11ea-317c-6393d7073e15"
Boltz = "4544d5e4-abc5-4dea-817f-29e4c205d9c8"
Configurations = "5218b696-f38b-4ac9-8b61-a12ec717816d"
Comonicon = "863f3e99-da2a-4334-8734-de3dacbe5542"
DataAugmentation = "88a5189c-e7ff-4f85-ac6b-e6158070f02e"
Dates = "ade2ca70-3891-5945-98fb-dc099432e06a"
FLoops = "cc61a311-1640-44b5-9fba-1b764f453329"
FileIO = "5789e2e9-d7fb-5bc7-8068-2c6fae9b9549"
Format = "1fa38f19-a742-5d3f-a2b9-30dd87b9d5f8"
Functors = "d9f16b24-f501-4c13-a1f2-28368ffc5196"
Images = "916415d5-f1e6-5110-898d-aaa5f9f070e0"
ImageIO = "82e4d734-157c-48bb-816b-45c225c6df19"
ImageMagick = "6218d12a-5da1-5696-b52f-db25d2ecc6d1"
JLD2 = "033835bb-8acc-5ee8-8aae-3f567f8a3819"
JpegTurbo = "b835a17e-a41a-41e7-81f0-2f016b05efe0"
Lux = "b2108857-7c20-44ae-9111-449ecde12c47"
LuxCUDA = "d0bbae9a-e099-4d5b-a835-1c6931763bda"
MLDataDevices = "7e8f7934-dd98-4c1a-8fe8-92b47a384d40"
MLUtils = "f1d291b0-491e-4a28-83b9-f70985020b54"
MPI = "da04e1cc-30fd-572f-bb4f-1f8673147195"
Metalhead = "dbeba491-748d-5e0e-a39e-b530a07fa0cc"
NCCL = "3fe64909-d7a1-4096-9b7d-7a0f12cf0f6b"
OneHotArrays = "0b1bfda6-eb8a-41d2-88d8-f5af5cad476f"
Optimisers = "3bd65402-5787-11e9-1adc-39752487f4e2"
ParameterSchedulers = "d7d3b36b-41b8-4d0d-a2bf-768c6151755e"
Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
Setfield = "efcf1570-3423-57d1-acb7-fd33fddbac46"
SimpleConfig = "f2d95530-262a-480f-aff0-1c0431e662a7"
Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"
Zygote = "e88e6eb3-aa80-5325-afca-941959d7151f"

[compat]
AMDGPU = "1"
Augmentor = "0.6"
Boltz = "0.1, 0.2, 0.3"
Configurations = "0.17"
FLoops = "0.2"
FileIO = "1.16"
Format = "1.3"
Functors = "0.4"
Images = "0.26"
JLD2 = "0.4.46, 0.5"
JpegTurbo = "0.1"
Lux = "1"
LuxCUDA = "0.3"
MLUtils = "0.4"
MPI = "0.20.19"
Metalhead = "0.9"
NCCL = "0.1.1"
OneHotArrays = "0.2"
Optimisers = "0.3"
ParameterSchedulers = "0.4"
Setfield = "1"
SimpleConfig = "0.1"
Statistics = "1"
Zygote = "0.6"
[extras]
CUDA_Runtime_jll = "76a88914-d11a-5bdc-97e0-2f5a05c973a2"
163 changes: 67 additions & 96 deletions examples/ImageNet/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -14,20 +14,27 @@ the ImageNet dataset.
## Training

To train a model, run `main.jl` with the necessary parameters. See
[Boltz documentation](https://lux.csail.mit.edu/dev/api/Domain_Specific_Modeling/Boltz) for
the model configuration.
[Boltz documentation](https://luxdl.github.io/Boltz.jl/stable/) for the model configuration.

```bash
julia --project=examples/ImageNet -t auto examples/ImageNet/main.jl\
--cfg.dataset.data_root=/home/avik-pal/data/ImageNet/\
--cfg.dataset.train_batchsize=256 --cfg.dataset.eval_batchsize=256\
--cfg.optimizer.learning_rate=0.5

julia --project=examples/ImageNet -t auto examples/ImageNet/main.jl\
--cfg.model.name=alexnet --cfg.model.arch=alexnet\
--cfg.dataset.data_root=/home/avik-pal/data/ImageNet/\
--cfg.dataset.train_batchsize=256 --cfg.dataset.eval_batchsize=256\
--cfg.optimizer.learning_rate=0.01
julia --startup-file=no --project=examples/ImageNet -t auto examples/ImageNet/main.jl \
--model-name="VGG" \
--depth=19 \
--train-batchsize=256 \
--val-batchsize=256 \
--optimizer-kind="sgd" \
--learning-rate=0.01 \
--base-path="/home/avik-pal/data/ImageNet/"


julia --startup-file=no --project=examples/ImageNet -t auto examples/ImageNet/main.jl \
--model-name="ViT" \
--model-kind="tiny" \
--train-batchsize=256 \
--val-batchsize=256 \
--optimizer-kind="sgd" \
--learning-rate=0.01 \
--base-path="/home/avik-pal/data/ImageNet/"
```

## Distributed Data Parallel Training
Expand All @@ -37,93 +44,57 @@ If your system has functional NCCL we will use it for all CUDA communications. O
will use MPI for all communications.

```bash
mpiexecjl -np 4 julia --project=examples/ImageNet -t auto examples/ImageNet/main.jl\
--cfg.dataset.data_root=/home/avik-pal/data/ImageNet/\
--cfg.dataset.train_batchsize=256 --cfg.dataset.eval_batchsize=256\
--cfg.optimizer.learning_rate=0.01
mpiexecjl -np 4 julia --startup-file=no --project=examples/ImageNet -t auto \
examples/ImageNet/main.jl \
--model-name="ViT" \
--model-kind="tiny" \
--train-batchsize=256 \
--val-batchsize=256 \
--optimizer-kind="sgd" \
--learning-rate=0.01 \
--base-path="/home/avik-pal/data/ImageNet/"
```

## Usage

```bash
usage: main.jl [--cfg.seed CFG.SEED] [--cfg.model.name CFG.MODEL.NAME]
[--cfg.model.arch CFG.MODEL.ARCH]
[--cfg.model.pretrained CFG.MODEL.PRETRAINED]
[--cfg.optimizer.name CFG.OPTIMIZER.NAME]
[--cfg.optimizer.learning_rate CFG.OPTIMIZER.LEARNING_RATE]
[--cfg.optimizer.nesterov CFG.OPTIMIZER.NESTEROV]
[--cfg.optimizer.momentum CFG.OPTIMIZER.MOMENTUM]
[--cfg.optimizer.weight_decay CFG.OPTIMIZER.WEIGHT_DECAY]
[--cfg.optimizer.scheduler.name CFG.OPTIMIZER.SCHEDULER.NAME]
[--cfg.optimizer.scheduler.cycle_length CFG.OPTIMIZER.SCHEDULER.CYCLE_LENGTH]
[--cfg.optimizer.scheduler.damp_factor CFG.OPTIMIZER.SCHEDULER.DAMP_FACTOR]
[--cfg.optimizer.scheduler.lr_step CFG.OPTIMIZER.SCHEDULER.LR_STEP]
[--cfg.optimizer.scheduler.lr_step_decay CFG.OPTIMIZER.SCHEDULER.LR_STEP_DECAY]
[--cfg.train.total_steps CFG.TRAIN.TOTAL_STEPS]
[--cfg.train.evaluate_every CFG.TRAIN.EVALUATE_EVERY]
[--cfg.train.resume CFG.TRAIN.RESUME]
[--cfg.train.evaluate CFG.TRAIN.EVALUATE]
[--cfg.train.checkpoint_dir CFG.TRAIN.CHECKPOINT_DIR]
[--cfg.train.log_dir CFG.TRAIN.LOG_DIR]
[--cfg.train.expt_subdir CFG.TRAIN.EXPT_SUBDIR]
[--cfg.train.expt_id CFG.TRAIN.EXPT_ID]
[--cfg.train.print_frequency CFG.TRAIN.PRINT_FREQUENCY]
[--cfg.dataset.data_root CFG.DATASET.DATA_ROOT]
[--cfg.dataset.eval_batchsize CFG.DATASET.EVAL_BATCHSIZE]
[--cfg.dataset.train_batchsize CFG.DATASET.TRAIN_BATCHSIZE]
[-h]

optional arguments:
--cfg.seed CFG.SEED (type: Int64, default: 12345)
--cfg.model.name CFG.MODEL.NAME
(default: "resnet")
--cfg.model.arch CFG.MODEL.ARCH
(default: "resnet18")
--cfg.model.pretrained CFG.MODEL.PRETRAINED
(type: Bool, default: false)
--cfg.optimizer.name CFG.OPTIMIZER.NAME
(default: "adam")
--cfg.optimizer.learning_rate CFG.OPTIMIZER.LEARNING_RATE
(type: Float32, default: 0.01)
--cfg.optimizer.nesterov CFG.OPTIMIZER.NESTEROV
(type: Bool, default: false)
--cfg.optimizer.momentum CFG.OPTIMIZER.MOMENTUM
(type: Float32, default: 0.0)
--cfg.optimizer.weight_decay CFG.OPTIMIZER.WEIGHT_DECAY
(type: Float32, default: 0.0)
--cfg.optimizer.scheduler.name CFG.OPTIMIZER.SCHEDULER.NAME
(default: "step")
--cfg.optimizer.scheduler.cycle_length CFG.OPTIMIZER.SCHEDULER.CYCLE_LENGTH
(type: Int64, default: 50000)
--cfg.optimizer.scheduler.damp_factor CFG.OPTIMIZER.SCHEDULER.DAMP_FACTOR
(type: Float32, default: 1.2)
--cfg.optimizer.scheduler.lr_step CFG.OPTIMIZER.SCHEDULER.LR_STEP
(type: Vector{Int64}, default: [100000, 250000, 500000])
--cfg.optimizer.scheduler.lr_step_decay CFG.OPTIMIZER.SCHEDULER.LR_STEP_DECAY
(type: Float32, default: 0.1)
--cfg.train.total_steps CFG.TRAIN.TOTAL_STEPS
(type: Int64, default: 800000)
--cfg.train.evaluate_every CFG.TRAIN.EVALUATE_EVERY
(type: Int64, default: 10000)
--cfg.train.resume CFG.TRAIN.RESUME
(default: "")
--cfg.train.evaluate CFG.TRAIN.EVALUATE
(type: Bool, default: false)
--cfg.train.checkpoint_dir CFG.TRAIN.CHECKPOINT_DIR
(default: "checkpoints")
--cfg.train.log_dir CFG.TRAIN.LOG_DIR
(default: "logs")
--cfg.train.expt_subdir CFG.TRAIN.EXPT_SUBDIR
(default: "")
--cfg.train.expt_id CFG.TRAIN.EXPT_ID
(default: "")
--cfg.train.print_frequency CFG.TRAIN.PRINT_FREQUENCY
(type: Int64, default: 100)
--cfg.dataset.data_root CFG.DATASET.DATA_ROOT
(default: "")
--cfg.dataset.eval_batchsize CFG.DATASET.EVAL_BATCHSIZE
(type: Int64, default: 64)
--cfg.dataset.train_batchsize CFG.DATASET.TRAIN_BATCHSIZE
(type: Int64, default: 64)
-h, --help show this help message and exit
main

Usage

main [options] [flags]

Options

--seed <0::Integer>
--model-name <String>
--model-kind <nokind::String>
--depth <-1::Int>
--base-path <::String>
--train-batchsize <64::Int>
--val-batchsize <64::Int>
--image-size <-1::Int>
--optimizer-kind <sgd::String>
--learning-rate <0.01::Float32>
--momentum <0.0::Float32>
--weight-decay <0.0::Float32>
--scheduler-kind <step::String>
--cycle-length <50000::Int>
--damp-factor <1.2::Float32>
--lr-step-decay <0.1::Float32>
--lr-step <[100000...::Vector{Int64}>
--expt-id <::String>
--expt-subdir <#= /home...::String>
--resume <::String>
--total-steps <800000::Int>
--evaluate-every <10000::Integer>
--print-frequency <100::Integer>

Flags

--pretrained
--nesterov
--evaluate
-h, --help Print this help message.
--version Print version.
```
48 changes: 0 additions & 48 deletions examples/ImageNet/config.jl

This file was deleted.

Loading

1 comment on commit 1d064ec

@github-actions
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Lux Benchmarks

Benchmark suite Current: 1d064ec Previous: 5200f58 Ratio
Dense(512 => 512, identity)(512 x 128)/forward/CPU/2 thread(s) 412937.5 ns 411895.5 ns 1.00
Dense(512 => 512, identity)(512 x 128)/forward/CPU/4 thread(s) 322667 ns 322166 ns 1.00
Dense(512 => 512, identity)(512 x 128)/forward/CPU/8 thread(s) 323104.5 ns 322583 ns 1.00
Dense(512 => 512, identity)(512 x 128)/forward/CPU/1 thread(s) 739750 ns 741833.5 ns 1.00
Dense(512 => 512, identity)(512 x 128)/forward/GPU/CUDA 43577 ns 43398 ns 1.00
Dense(512 => 512, identity)(512 x 128)/zygote/CPU/2 thread(s) 1320395.5 ns 1330145.5 ns 0.99
Dense(512 => 512, identity)(512 x 128)/zygote/CPU/4 thread(s) 2436708 ns 2429292 ns 1.00
Dense(512 => 512, identity)(512 x 128)/zygote/CPU/8 thread(s) 13630167 ns 14164208 ns 0.96
Dense(512 => 512, identity)(512 x 128)/zygote/CPU/1 thread(s) 2195250 ns 2195542 ns 1.00
Dense(512 => 512, identity)(512 x 128)/zygote/GPU/CUDA 203168.5 ns 204689 ns 0.99
Dense(512 => 512, identity)(512 x 128)/enzyme/CPU/2 thread(s) 1394666 ns 1414708 ns 0.99
Dense(512 => 512, identity)(512 x 128)/enzyme/CPU/4 thread(s) 2614271 ns 903146 ns 2.89
Dense(512 => 512, identity)(512 x 128)/enzyme/CPU/8 thread(s) 13809542 ns 1562333 ns 8.84
Dense(512 => 512, identity)(512 x 128)/enzyme/CPU/1 thread(s) 2256125 ns 2259500 ns 1.00
Conv((3, 3), 2 => 2, identity)(64 x 64 x 2 x 128)/forward/CPU/2 thread(s) 1655084 ns 1774625 ns 0.93
Conv((3, 3), 2 => 2, identity)(64 x 64 x 2 x 128)/forward/CPU/4 thread(s) 1103916 ns 1098875 ns 1.00
Conv((3, 3), 2 => 2, identity)(64 x 64 x 2 x 128)/forward/CPU/8 thread(s) 1549791 ns 1541229.5 ns 1.01
Conv((3, 3), 2 => 2, identity)(64 x 64 x 2 x 128)/forward/CPU/1 thread(s) 2999729.5 ns 2958833 ns 1.01
Conv((3, 3), 2 => 2, identity)(64 x 64 x 2 x 128)/forward/GPU/CUDA 207221 ns 207635.5 ns 1.00
Conv((3, 3), 2 => 2, identity)(64 x 64 x 2 x 128)/zygote/CPU/2 thread(s) 12143833.5 ns 12148354.5 ns 1.00
Conv((3, 3), 2 => 2, identity)(64 x 64 x 2 x 128)/zygote/CPU/4 thread(s) 8785708 ns 8822458 ns 1.00
Conv((3, 3), 2 => 2, identity)(64 x 64 x 2 x 128)/zygote/CPU/8 thread(s) 9239167 ns 9188791 ns 1.01
Conv((3, 3), 2 => 2, identity)(64 x 64 x 2 x 128)/zygote/CPU/1 thread(s) 18591208 ns 18597916 ns 1.00
Conv((3, 3), 2 => 2, identity)(64 x 64 x 2 x 128)/zygote/GPU/CUDA 1485675 ns 1492767 ns 1.00
Conv((3, 3), 2 => 2, identity)(64 x 64 x 2 x 128)/enzyme/CPU/2 thread(s) 17317333 ns 17261541 ns 1.00
Conv((3, 3), 2 => 2, identity)(64 x 64 x 2 x 128)/enzyme/CPU/4 thread(s) 13967208 ns 13975000 ns 1.00
Conv((3, 3), 2 => 2, identity)(64 x 64 x 2 x 128)/enzyme/CPU/8 thread(s) 14514354 ns 14512917 ns 1.00
Conv((3, 3), 2 => 2, identity)(64 x 64 x 2 x 128)/enzyme/CPU/1 thread(s) 21818416 ns 21852791 ns 1.00
Conv((3, 3), 64 => 64, relu)(64 x 64 x 64 x 128)/forward/CPU/2 thread(s) 250042270.5 ns 250272292 ns 1.00
Conv((3, 3), 64 => 64, relu)(64 x 64 x 64 x 128)/forward/CPU/4 thread(s) 148555750 ns 148297917 ns 1.00
Conv((3, 3), 64 => 64, relu)(64 x 64 x 64 x 128)/forward/CPU/8 thread(s) 115889000 ns 115932000 ns 1.00
Conv((3, 3), 64 => 64, relu)(64 x 64 x 64 x 128)/forward/CPU/1 thread(s) 447187584 ns 447763458 ns 1.00
Conv((3, 3), 64 => 64, relu)(64 x 64 x 64 x 128)/forward/GPU/CUDA 5452362 ns 5475849 ns 1.00
Conv((3, 3), 64 => 64, relu)(64 x 64 x 64 x 128)/zygote/CPU/2 thread(s) 1224923291 ns 1224211792 ns 1.00
Conv((3, 3), 64 => 64, relu)(64 x 64 x 64 x 128)/zygote/CPU/4 thread(s) 928030208 ns 930629834 ns 1.00
Conv((3, 3), 64 => 64, relu)(64 x 64 x 64 x 128)/zygote/CPU/8 thread(s) 825911895.5 ns 831800229 ns 0.99
Conv((3, 3), 64 => 64, relu)(64 x 64 x 64 x 128)/zygote/CPU/1 thread(s) 1633435667 ns 1632010250 ns 1.00
Conv((3, 3), 64 => 64, relu)(64 x 64 x 64 x 128)/zygote/GPU/CUDA 31214910.5 ns 31656181.5 ns 0.99
Conv((3, 3), 64 => 64, relu)(64 x 64 x 64 x 128)/enzyme/CPU/2 thread(s) 1134846000 ns 1135741875 ns 1.00
Conv((3, 3), 64 => 64, relu)(64 x 64 x 64 x 128)/enzyme/CPU/4 thread(s) 982157791.5 ns 989475979.5 ns 0.99
Conv((3, 3), 64 => 64, relu)(64 x 64 x 64 x 128)/enzyme/CPU/8 thread(s) 1328335541.5 ns 1306310687.5 ns 1.02
Conv((3, 3), 64 => 64, relu)(64 x 64 x 64 x 128)/enzyme/CPU/1 thread(s) 1734630541.5 ns 1731093166.5 ns 1.00
lenet(28, 28, 1, 32)/forward/CPU/2 thread(s) 1097854 ns 1096084 ns 1.00
lenet(28, 28, 1, 32)/forward/CPU/4 thread(s) 1625083.5 ns 1621125 ns 1.00
lenet(28, 28, 1, 32)/forward/CPU/8 thread(s) 3841334 ns 3736208 ns 1.03
lenet(28, 28, 1, 32)/forward/CPU/1 thread(s) 778042 ns 780896 ns 1.00
lenet(28, 28, 1, 32)/forward/GPU/CUDA 263538 ns 264547.5 ns 1.00
lenet(28, 28, 1, 32)/zygote/CPU/2 thread(s) 2979417 ns 2990250 ns 1.00
lenet(28, 28, 1, 32)/zygote/CPU/4 thread(s) 4119104.5 ns 4101000 ns 1.00
lenet(28, 28, 1, 32)/zygote/CPU/8 thread(s) 11207896 ns 11120083 ns 1.01
lenet(28, 28, 1, 32)/zygote/CPU/1 thread(s) 3132750 ns 3243937 ns 0.97
lenet(28, 28, 1, 32)/zygote/GPU/CUDA 1091322.5 ns 1096407 ns 1.00
Conv((3, 3), 2 => 2, gelu)(64 x 64 x 2 x 128)/forward/CPU/2 thread(s) 2334729 ns 2316625 ns 1.01
Conv((3, 3), 2 => 2, gelu)(64 x 64 x 2 x 128)/forward/CPU/4 thread(s) 1437000 ns 1427312.5 ns 1.01
Conv((3, 3), 2 => 2, gelu)(64 x 64 x 2 x 128)/forward/CPU/8 thread(s) 1665458.5 ns 1669042 ns 1.00
Conv((3, 3), 2 => 2, gelu)(64 x 64 x 2 x 128)/forward/CPU/1 thread(s) 4198334 ns 4212291 ns 1.00
Conv((3, 3), 2 => 2, gelu)(64 x 64 x 2 x 128)/forward/GPU/CUDA 207913 ns 208250.5 ns 1.00
Conv((3, 3), 2 => 2, gelu)(64 x 64 x 2 x 128)/zygote/CPU/2 thread(s) 19383125 ns 19418541 ns 1.00
Conv((3, 3), 2 => 2, gelu)(64 x 64 x 2 x 128)/zygote/CPU/4 thread(s) 16092916.5 ns 16073458 ns 1.00
Conv((3, 3), 2 => 2, gelu)(64 x 64 x 2 x 128)/zygote/CPU/8 thread(s) 17269063 ns 17186250 ns 1.00
Conv((3, 3), 2 => 2, gelu)(64 x 64 x 2 x 128)/zygote/CPU/1 thread(s) 25856687.5 ns 25916083 ns 1.00
Conv((3, 3), 2 => 2, gelu)(64 x 64 x 2 x 128)/zygote/GPU/CUDA 1585334 ns 1595514 ns 0.99
Conv((3, 3), 2 => 2, gelu)(64 x 64 x 2 x 128)/enzyme/CPU/2 thread(s) 34322667 ns 34080500.5 ns 1.01
Conv((3, 3), 2 => 2, gelu)(64 x 64 x 2 x 128)/enzyme/CPU/4 thread(s) 30864666.5 ns 30855979 ns 1.00
Conv((3, 3), 2 => 2, gelu)(64 x 64 x 2 x 128)/enzyme/CPU/8 thread(s) 31132250 ns 31220895.5 ns 1.00
Conv((3, 3), 2 => 2, gelu)(64 x 64 x 2 x 128)/enzyme/CPU/1 thread(s) 36963875 ns 36553708 ns 1.01
Conv((3, 3), 4 => 4, gelu)(64 x 64 x 4 x 128)/forward/CPU/2 thread(s) 4524500 ns 4532459 ns 1.00
Conv((3, 3), 4 => 4, gelu)(64 x 64 x 4 x 128)/forward/CPU/4 thread(s) 2779000 ns 2782125 ns 1.00
Conv((3, 3), 4 => 4, gelu)(64 x 64 x 4 x 128)/forward/CPU/8 thread(s) 2902854 ns 2915416 ns 1.00
Conv((3, 3), 4 => 4, gelu)(64 x 64 x 4 x 128)/forward/CPU/1 thread(s) 8387541.5 ns 8380354 ns 1.00
Conv((3, 3), 4 => 4, gelu)(64 x 64 x 4 x 128)/forward/GPU/CUDA 420101 ns 423404 ns 0.99
Conv((3, 3), 4 => 4, gelu)(64 x 64 x 4 x 128)/zygote/CPU/2 thread(s) 38904229 ns 39032375 ns 1.00
Conv((3, 3), 4 => 4, gelu)(64 x 64 x 4 x 128)/zygote/CPU/4 thread(s) 32105979 ns 32155750 ns 1.00
Conv((3, 3), 4 => 4, gelu)(64 x 64 x 4 x 128)/zygote/CPU/8 thread(s) 32346959 ns 32313208 ns 1.00
Conv((3, 3), 4 => 4, gelu)(64 x 64 x 4 x 128)/zygote/CPU/1 thread(s) 51945541 ns 51945292 ns 1.00
Conv((3, 3), 4 => 4, gelu)(64 x 64 x 4 x 128)/zygote/GPU/CUDA 2624775 ns 2634739 ns 1.00
Conv((3, 3), 4 => 4, gelu)(64 x 64 x 4 x 128)/enzyme/CPU/2 thread(s) 88746333.5 ns 88517062.5 ns 1.00
Conv((3, 3), 4 => 4, gelu)(64 x 64 x 4 x 128)/enzyme/CPU/4 thread(s) 114006959 ns 114525875 ns 1.00
Conv((3, 3), 4 => 4, gelu)(64 x 64 x 4 x 128)/enzyme/CPU/8 thread(s) 224259542 ns 223799104.5 ns 1.00
Conv((3, 3), 4 => 4, gelu)(64 x 64 x 4 x 128)/enzyme/CPU/1 thread(s) 74608375 ns 74994375 ns 0.99
Conv((3, 3), 64 => 64, gelu)(64 x 64 x 64 x 128)/forward/CPU/2 thread(s) 267333208 ns 268673709 ns 1.00
Conv((3, 3), 64 => 64, gelu)(64 x 64 x 64 x 128)/forward/CPU/4 thread(s) 159214292 ns 159281875 ns 1.00
Conv((3, 3), 64 => 64, gelu)(64 x 64 x 64 x 128)/forward/CPU/8 thread(s) 126745542 ns 126759937.5 ns 1.00
Conv((3, 3), 64 => 64, gelu)(64 x 64 x 64 x 128)/forward/CPU/1 thread(s) 487494166 ns 485627875 ns 1.00
Conv((3, 3), 64 => 64, gelu)(64 x 64 x 64 x 128)/forward/GPU/CUDA 7012704 ns 6994649 ns 1.00
Conv((3, 3), 64 => 64, gelu)(64 x 64 x 64 x 128)/zygote/CPU/2 thread(s) 1472344083.5 ns 1470701416.5 ns 1.00
Conv((3, 3), 64 => 64, gelu)(64 x 64 x 64 x 128)/zygote/CPU/4 thread(s) 1138687375 ns 1172207375 ns 0.97
Conv((3, 3), 64 => 64, gelu)(64 x 64 x 64 x 128)/zygote/CPU/8 thread(s) 1071038854 ns 1063985125 ns 1.01
Conv((3, 3), 64 => 64, gelu)(64 x 64 x 64 x 128)/zygote/CPU/1 thread(s) 2002947479.5 ns 2005181229 ns 1.00
Conv((3, 3), 64 => 64, gelu)(64 x 64 x 64 x 128)/zygote/GPU/CUDA 34854968.5 ns 34667442 ns 1.01
Conv((3, 3), 64 => 64, gelu)(64 x 64 x 64 x 128)/enzyme/CPU/2 thread(s) 1712616292 ns 1715584583 ns 1.00
Conv((3, 3), 64 => 64, gelu)(64 x 64 x 64 x 128)/enzyme/CPU/4 thread(s) 1536070562.5 ns 1547782708 ns 0.99
Conv((3, 3), 64 => 64, gelu)(64 x 64 x 64 x 128)/enzyme/CPU/8 thread(s) 1863636167 ns 1856138292 ns 1.00
Conv((3, 3), 64 => 64, gelu)(64 x 64 x 64 x 128)/enzyme/CPU/1 thread(s) 2213962958 ns 2207623729.5 ns 1.00
lenet(28, 28, 1, 128)/forward/CPU/2 thread(s) 2080042 ns 2030167 ns 1.02
lenet(28, 28, 1, 128)/forward/CPU/4 thread(s) 2936917 ns 2979333 ns 0.99
lenet(28, 28, 1, 128)/forward/CPU/8 thread(s) 8042334 ns 8195625 ns 0.98
lenet(28, 28, 1, 128)/forward/CPU/1 thread(s) 2431104 ns 2492167 ns 0.98
lenet(28, 28, 1, 128)/forward/GPU/CUDA 278095 ns 275800.5 ns 1.01
lenet(28, 28, 1, 128)/zygote/CPU/2 thread(s) 9677209 ns 9616458.5 ns 1.01
lenet(28, 28, 1, 128)/zygote/CPU/4 thread(s) 12036500 ns 12040541 ns 1.00
lenet(28, 28, 1, 128)/zygote/CPU/8 thread(s) 24751583.5 ns 24281208 ns 1.02
lenet(28, 28, 1, 128)/zygote/CPU/1 thread(s) 11606292 ns 11743000 ns 0.99
lenet(28, 28, 1, 128)/zygote/GPU/CUDA 1201527 ns 1191646.5 ns 1.01
vgg16(32, 32, 3, 32)/forward/CPU/2 thread(s) 379827708 ns 381538333 ns 1.00
vgg16(32, 32, 3, 32)/forward/CPU/4 thread(s) 286677271 ns 284123771.5 ns 1.01
vgg16(32, 32, 3, 32)/forward/CPU/8 thread(s) 240261834 ns 239280375 ns 1.00
vgg16(32, 32, 3, 32)/forward/CPU/1 thread(s) 451256520.5 ns 452582312.5 ns 1.00
vgg16(32, 32, 3, 32)/forward/GPU/CUDA 4858918 ns 4856388 ns 1.00
vgg16(32, 32, 3, 32)/zygote/CPU/2 thread(s) 1157780125 ns 1153618084 ns 1.00
vgg16(32, 32, 3, 32)/zygote/CPU/4 thread(s) 905233917 ns 934081083 ns 0.97
vgg16(32, 32, 3, 32)/zygote/CPU/8 thread(s) 987524666 ns 921635667 ns 1.07
vgg16(32, 32, 3, 32)/zygote/CPU/1 thread(s) 1579543625 ns 1402852084 ns 1.13
vgg16(32, 32, 3, 32)/zygote/GPU/CUDA 17849892 ns 17820477 ns 1.00
lenet(28, 28, 1, 64)/forward/CPU/2 thread(s) 1058583.5 ns 1049334 ns 1.01
lenet(28, 28, 1, 64)/forward/CPU/4 thread(s) 1671187.5 ns 2035667 ns 0.82
lenet(28, 28, 1, 64)/forward/CPU/8 thread(s) 5011708 ns 5341791 ns 0.94
lenet(28, 28, 1, 64)/forward/CPU/1 thread(s) 1300437.5 ns 1401833 ns 0.93
lenet(28, 28, 1, 64)/forward/GPU/CUDA 274747.5 ns 273502.5 ns 1.00
lenet(28, 28, 1, 64)/zygote/CPU/2 thread(s) 6254041 ns 6484854.5 ns 0.96
lenet(28, 28, 1, 64)/zygote/CPU/4 thread(s) 13149791.5 ns 12404917 ns 1.06
lenet(28, 28, 1, 64)/zygote/CPU/8 thread(s) 18860833 ns 19763708 ns 0.95
lenet(28, 28, 1, 64)/zygote/CPU/1 thread(s) 5852333 ns 6066500 ns 0.96
lenet(28, 28, 1, 64)/zygote/GPU/CUDA 1238555 ns 1262508 ns 0.98
Conv((3, 3), 32 => 32, identity)(64 x 64 x 32 x 128)/forward/CPU/2 thread(s) 70498000 ns 70493208 ns 1.00
Conv((3, 3), 32 => 32, identity)(64 x 64 x 32 x 128)/forward/CPU/4 thread(s) 43638250 ns 43600229 ns 1.00
Conv((3, 3), 32 => 32, identity)(64 x 64 x 32 x 128)/forward/CPU/8 thread(s) 39557666 ns 39622541 ns 1.00
Conv((3, 3), 32 => 32, identity)(64 x 64 x 32 x 128)/forward/CPU/1 thread(s) 132574187 ns 132698750 ns 1.00
Conv((3, 3), 32 => 32, identity)(64 x 64 x 32 x 128)/forward/GPU/CUDA 1944256 ns 1869551 ns 1.04
Conv((3, 3), 32 => 32, identity)(64 x 64 x 32 x 128)/zygote/CPU/2 thread(s) 356301646 ns 356454771 ns 1.00
Conv((3, 3), 32 => 32, identity)(64 x 64 x 32 x 128)/zygote/CPU/4 thread(s) 269549208 ns 270112625 ns 1.00
Conv((3, 3), 32 => 32, identity)(64 x 64 x 32 x 128)/zygote/CPU/8 thread(s) 253732875 ns 254625084 ns 1.00
Conv((3, 3), 32 => 32, identity)(64 x 64 x 32 x 128)/zygote/CPU/1 thread(s) 534920187.5 ns 534884312.5 ns 1.00
Conv((3, 3), 32 => 32, identity)(64 x 64 x 32 x 128)/zygote/GPU/CUDA 12320196.5 ns 12284081 ns 1.00
Conv((3, 3), 32 => 32, identity)(64 x 64 x 32 x 128)/enzyme/CPU/2 thread(s) 395172666 ns 394586709 ns 1.00
Conv((3, 3), 32 => 32, identity)(64 x 64 x 32 x 128)/enzyme/CPU/4 thread(s) 377158375 ns 408404250 ns 0.92
Conv((3, 3), 32 => 32, identity)(64 x 64 x 32 x 128)/enzyme/CPU/8 thread(s) 657754625 ns 678487209 ns 0.97
Conv((3, 3), 32 => 32, identity)(64 x 64 x 32 x 128)/enzyme/CPU/1 thread(s) 709829333 ns 710785791 ns 1.00
vgg16(32, 32, 3, 128)/forward/CPU/2 thread(s) 1189792833 ns 1185827084 ns 1.00
vgg16(32, 32, 3, 128)/forward/CPU/4 thread(s) 691561166.5 ns 695281125 ns 0.99
vgg16(32, 32, 3, 128)/forward/CPU/8 thread(s) 626986833 ns 626587375 ns 1.00
vgg16(32, 32, 3, 128)/forward/CPU/1 thread(s) 1860884792 ns 1770240250.5 ns 1.05
vgg16(32, 32, 3, 128)/forward/GPU/CUDA 12309151 ns 12316473 ns 1.00
vgg16(32, 32, 3, 128)/zygote/CPU/2 thread(s) 3633655916 ns 3682510624.5 ns 0.99
vgg16(32, 32, 3, 128)/zygote/CPU/4 thread(s) 2828990458 ns 2820068709 ns 1.00
vgg16(32, 32, 3, 128)/zygote/CPU/8 thread(s) 2702591209 ns 2719913792 ns 0.99
vgg16(32, 32, 3, 128)/zygote/CPU/1 thread(s) 5056811500 ns 5042856917 ns 1.00
vgg16(32, 32, 3, 128)/zygote/GPU/CUDA 49201169.5 ns 49349519 ns 1.00
Conv((3, 3), 4 => 4, relu)(64 x 64 x 4 x 128)/forward/CPU/2 thread(s) 3425625 ns 3418708 ns 1.00
Conv((3, 3), 4 => 4, relu)(64 x 64 x 4 x 128)/forward/CPU/4 thread(s) 2072958.5 ns 2082958 ns 1.00
Conv((3, 3), 4 => 4, relu)(64 x 64 x 4 x 128)/forward/CPU/8 thread(s) 2525541 ns 2540500 ns 0.99
Conv((3, 3), 4 => 4, relu)(64 x 64 x 4 x 128)/forward/CPU/1 thread(s) 6028666.5 ns 6009666 ns 1.00
Conv((3, 3), 4 => 4, relu)(64 x 64 x 4 x 128)/forward/GPU/CUDA 322034 ns 326984.5 ns 0.98
Conv((3, 3), 4 => 4, relu)(64 x 64 x 4 x 128)/zygote/CPU/2 thread(s) 25910625 ns 26222333 ns 0.99
Conv((3, 3), 4 => 4, relu)(64 x 64 x 4 x 128)/zygote/CPU/4 thread(s) 18853584 ns 18914875 ns 1.00
Conv((3, 3), 4 => 4, relu)(64 x 64 x 4 x 128)/zygote/CPU/8 thread(s) 19458875 ns 19338354.5 ns 1.01
Conv((3, 3), 4 => 4, relu)(64 x 64 x 4 x 128)/zygote/CPU/1 thread(s) 39298645.5 ns 39331500 ns 1.00
Conv((3, 3), 4 => 4, relu)(64 x 64 x 4 x 128)/zygote/GPU/CUDA 2474706.5 ns 2462554 ns 1.00
Conv((3, 3), 4 => 4, relu)(64 x 64 x 4 x 128)/enzyme/CPU/2 thread(s) 54292500 ns 56235458 ns 0.97
Conv((3, 3), 4 => 4, relu)(64 x 64 x 4 x 128)/enzyme/CPU/4 thread(s) 81331292 ns 80835708 ns 1.01
Conv((3, 3), 4 => 4, relu)(64 x 64 x 4 x 128)/enzyme/CPU/8 thread(s) 170565562 ns 169939541.5 ns 1.00
Conv((3, 3), 4 => 4, relu)(64 x 64 x 4 x 128)/enzyme/CPU/1 thread(s) 45567333 ns 45337209 ns 1.01
Conv((3, 3), 2 => 2, relu)(64 x 64 x 2 x 128)/forward/CPU/2 thread(s) 1782916 ns 1784584 ns 1.00
Conv((3, 3), 2 => 2, relu)(64 x 64 x 2 x 128)/forward/CPU/4 thread(s) 1103709 ns 1103604.5 ns 1.00
Conv((3, 3), 2 => 2, relu)(64 x 64 x 2 x 128)/forward/CPU/8 thread(s) 1548917 ns 1566667 ns 0.99
Conv((3, 3), 2 => 2, relu)(64 x 64 x 2 x 128)/forward/CPU/1 thread(s) 3027375 ns 3032666 ns 1.00
Conv((3, 3), 2 => 2, relu)(64 x 64 x 2 x 128)/forward/GPU/CUDA 210691.5 ns 213488 ns 0.99
Conv((3, 3), 2 => 2, relu)(64 x 64 x 2 x 128)/zygote/CPU/2 thread(s) 12525854 ns 12541520.5 ns 1.00
Conv((3, 3), 2 => 2, relu)(64 x 64 x 2 x 128)/zygote/CPU/4 thread(s) 9206541.5 ns 9203729.5 ns 1.00
Conv((3, 3), 2 => 2, relu)(64 x 64 x 2 x 128)/zygote/CPU/8 thread(s) 9628792 ns 9646562.5 ns 1.00
Conv((3, 3), 2 => 2, relu)(64 x 64 x 2 x 128)/zygote/CPU/1 thread(s) 19005604.5 ns 18999125 ns 1.00
Conv((3, 3), 2 => 2, relu)(64 x 64 x 2 x 128)/zygote/GPU/CUDA 1537547 ns 1533148 ns 1.00
Conv((3, 3), 2 => 2, relu)(64 x 64 x 2 x 128)/enzyme/CPU/2 thread(s) 17655854 ns 17653167 ns 1.00
Conv((3, 3), 2 => 2, relu)(64 x 64 x 2 x 128)/enzyme/CPU/4 thread(s) 14331645.5 ns 14324854.5 ns 1.00
Conv((3, 3), 2 => 2, relu)(64 x 64 x 2 x 128)/enzyme/CPU/8 thread(s) 14600583 ns 14581208.5 ns 1.00
Conv((3, 3), 2 => 2, relu)(64 x 64 x 2 x 128)/enzyme/CPU/1 thread(s) 22163250 ns 22196729.5 ns 1.00
Conv((3, 3), 32 => 32, relu)(64 x 64 x 32 x 128)/forward/CPU/2 thread(s) 70499459 ns 70444166.5 ns 1.00
Conv((3, 3), 32 => 32, relu)(64 x 64 x 32 x 128)/forward/CPU/4 thread(s) 43573833 ns 43349417 ns 1.01
Conv((3, 3), 32 => 32, relu)(64 x 64 x 32 x 128)/forward/CPU/8 thread(s) 39479542 ns 39635792 ns 1.00
Conv((3, 3), 32 => 32, relu)(64 x 64 x 32 x 128)/forward/CPU/1 thread(s) 132481104.5 ns 132857146 ns 1.00
Conv((3, 3), 32 => 32, relu)(64 x 64 x 32 x 128)/forward/GPU/CUDA 1867593 ns 1867040.5 ns 1.00
Conv((3, 3), 32 => 32, relu)(64 x 64 x 32 x 128)/zygote/CPU/2 thread(s) 360531229 ns 361132208 ns 1.00
Conv((3, 3), 32 => 32, relu)(64 x 64 x 32 x 128)/zygote/CPU/4 thread(s) 345233354 ns 345972771 ns 1.00
Conv((3, 3), 32 => 32, relu)(64 x 64 x 32 x 128)/zygote/CPU/8 thread(s) 303345083 ns 304480083 ns 1.00
Conv((3, 3), 32 => 32, relu)(64 x 64 x 32 x 128)/zygote/CPU/1 thread(s) 722647875 ns 724393166 ns 1.00
Conv((3, 3), 32 => 32, relu)(64 x 64 x 32 x 128)/zygote/GPU/CUDA 13388759.5 ns 13316474.5 ns 1.01
Conv((3, 3), 32 => 32, relu)(64 x 64 x 32 x 128)/enzyme/CPU/2 thread(s) 418893124.5 ns 419098563 ns 1.00
Conv((3, 3), 32 => 32, relu)(64 x 64 x 32 x 128)/enzyme/CPU/4 thread(s) 418550083 ns 427616083 ns 0.98
Conv((3, 3), 32 => 32, relu)(64 x 64 x 32 x 128)/enzyme/CPU/8 thread(s) 733622021 ns 709012833 ns 1.03
Conv((3, 3), 32 => 32, relu)(64 x 64 x 32 x 128)/enzyme/CPU/1 thread(s) 714074250 ns 714833666 ns 1.00
mlp7layer_bn(gelu)(32 x 256)/forward/CPU/2 thread(s) 1662791 ns 1667417 ns 1.00
mlp7layer_bn(gelu)(32 x 256)/forward/CPU/4 thread(s) 1326395.5 ns 1348521 ns 0.98
mlp7layer_bn(gelu)(32 x 256)/forward/CPU/8 thread(s) 1266458.5 ns 1328125 ns 0.95
mlp7layer_bn(gelu)(32 x 256)/forward/CPU/1 thread(s) 2293875 ns 2425812.5 ns 0.95
mlp7layer_bn(gelu)(32 x 256)/forward/GPU/CUDA 584223 ns 582318.5 ns 1.00
mlp7layer_bn(gelu)(32 x 256)/zygote/CPU/2 thread(s) 8911021 ns 9008145.5 ns 0.99
mlp7layer_bn(gelu)(32 x 256)/zygote/CPU/4 thread(s) 12871250 ns 12958667 ns 0.99
mlp7layer_bn(gelu)(32 x 256)/zygote/CPU/8 thread(s) 31057917 ns 31132854 ns 1.00
mlp7layer_bn(gelu)(32 x 256)/zygote/CPU/1 thread(s) 9825729.5 ns 9869291.5 ns 1.00
mlp7layer_bn(gelu)(32 x 256)/zygote/GPU/CUDA 1434469 ns 1441009.5 ns 1.00
mlp7layer_bn(gelu)(32 x 256)/enzyme/CPU/2 thread(s) 16503167 ns 18356416 ns 0.90
mlp7layer_bn(gelu)(32 x 256)/enzyme/CPU/4 thread(s) 20919875 ns 17371542 ns 1.20
mlp7layer_bn(gelu)(32 x 256)/enzyme/CPU/8 thread(s) 44942437.5 ns 29956292 ns 1.50
mlp7layer_bn(gelu)(32 x 256)/enzyme/CPU/1 thread(s) 13103167 ns 14108833.5 ns 0.93
Dense(512 => 512, relu)(512 x 128)/forward/CPU/2 thread(s) 789458 ns 696500 ns 1.13
Dense(512 => 512, relu)(512 x 128)/forward/CPU/4 thread(s) 538437.5 ns 500812.5 ns 1.08
Dense(512 => 512, relu)(512 x 128)/forward/CPU/8 thread(s) 1024041.5 ns 1033458 ns 0.99
Dense(512 => 512, relu)(512 x 128)/forward/CPU/1 thread(s) 725041 ns 725500 ns 1.00
Dense(512 => 512, relu)(512 x 128)/forward/GPU/CUDA 47144.5 ns 47684 ns 0.99
Dense(512 => 512, relu)(512 x 128)/zygote/CPU/2 thread(s) 1463416 ns 1577042 ns 0.93
Dense(512 => 512, relu)(512 x 128)/zygote/CPU/4 thread(s) 1040312 ns 1051000 ns 0.99
Dense(512 => 512, relu)(512 x 128)/zygote/CPU/8 thread(s) 1411187.5 ns 1370125 ns 1.03
Dense(512 => 512, relu)(512 x 128)/zygote/CPU/1 thread(s) 2257916 ns 2303687 ns 0.98
Dense(512 => 512, relu)(512 x 128)/zygote/GPU/CUDA 234270.5 ns 238125.5 ns 0.98
Dense(512 => 512, relu)(512 x 128)/enzyme/CPU/2 thread(s) 1530583 ns 1558687.5 ns 0.98
Dense(512 => 512, relu)(512 x 128)/enzyme/CPU/4 thread(s) 1024209 ns 1045916.5 ns 0.98
Dense(512 => 512, relu)(512 x 128)/enzyme/CPU/8 thread(s) 1524333 ns 1461916 ns 1.04
Dense(512 => 512, relu)(512 x 128)/enzyme/CPU/1 thread(s) 2201771 ns 2228229 ns 0.99
Conv((3, 3), 4 => 4, identity)(64 x 64 x 4 x 128)/forward/CPU/2 thread(s) 3406354 ns 3409125 ns 1.00
Conv((3, 3), 4 => 4, identity)(64 x 64 x 4 x 128)/forward/CPU/4 thread(s) 2052854.5 ns 2066270.5 ns 0.99
Conv((3, 3), 4 => 4, identity)(64 x 64 x 4 x 128)/forward/CPU/8 thread(s) 2507959 ns 2525500 ns 0.99
Conv((3, 3), 4 => 4, identity)(64 x 64 x 4 x 128)/forward/CPU/1 thread(s) 6001333 ns 6013791 ns 1.00
Conv((3, 3), 4 => 4, identity)(64 x 64 x 4 x 128)/forward/GPU/CUDA 287105.5 ns 289794 ns 0.99
Conv((3, 3), 4 => 4, identity)(64 x 64 x 4 x 128)/zygote/CPU/2 thread(s) 24055375 ns 24055187.5 ns 1.00
Conv((3, 3), 4 => 4, identity)(64 x 64 x 4 x 128)/zygote/CPU/4 thread(s) 17211646 ns 17202000 ns 1.00
Conv((3, 3), 4 => 4, identity)(64 x 64 x 4 x 128)/zygote/CPU/8 thread(s) 17114333 ns 17133562.5 ns 1.00
Conv((3, 3), 4 => 4, identity)(64 x 64 x 4 x 128)/zygote/CPU/1 thread(s) 37572333 ns 37566146 ns 1.00
Conv((3, 3), 4 => 4, identity)(64 x 64 x 4 x 128)/zygote/GPU/CUDA 2401548.5 ns 2407993 ns 1.00
Conv((3, 3), 4 => 4, identity)(64 x 64 x 4 x 128)/enzyme/CPU/2 thread(s) 52614646 ns 54216541.5 ns 0.97
Conv((3, 3), 4 => 4, identity)(64 x 64 x 4 x 128)/enzyme/CPU/4 thread(s) 82221646 ns 83772249.5 ns 0.98
Conv((3, 3), 4 => 4, identity)(64 x 64 x 4 x 128)/enzyme/CPU/8 thread(s) 169582250 ns 166696854 ns 1.02
Conv((3, 3), 4 => 4, identity)(64 x 64 x 4 x 128)/enzyme/CPU/1 thread(s) 44570125 ns 44480500 ns 1.00
Conv((3, 3), 64 => 64, identity)(64 x 64 x 64 x 128)/forward/CPU/2 thread(s) 250290791.5 ns 250124083 ns 1.00
Conv((3, 3), 64 => 64, identity)(64 x 64 x 64 x 128)/forward/CPU/4 thread(s) 148276667 ns 148262625 ns 1.00
Conv((3, 3), 64 => 64, identity)(64 x 64 x 64 x 128)/forward/CPU/8 thread(s) 115710770.5 ns 115870833.5 ns 1.00
Conv((3, 3), 64 => 64, identity)(64 x 64 x 64 x 128)/forward/CPU/1 thread(s) 447663770.5 ns 448008979 ns 1.00
Conv((3, 3), 64 => 64, identity)(64 x 64 x 64 x 128)/forward/GPU/CUDA 5443484 ns 5471191 ns 0.99
Conv((3, 3), 64 => 64, identity)(64 x 64 x 64 x 128)/zygote/CPU/2 thread(s) 1105632500 ns 1103626125 ns 1.00
Conv((3, 3), 64 => 64, identity)(64 x 64 x 64 x 128)/zygote/CPU/4 thread(s) 854893979 ns 857579145.5 ns 1.00
Conv((3, 3), 64 => 64, identity)(64 x 64 x 64 x 128)/zygote/CPU/8 thread(s) 827018271 ns 827690125 ns 1.00
Conv((3, 3), 64 => 64, identity)(64 x 64 x 64 x 128)/zygote/CPU/1 thread(s) 1767047166 ns 1753891459 ns 1.01
Conv((3, 3), 64 => 64, identity)(64 x 64 x 64 x 128)/zygote/GPU/CUDA 28898282.5 ns 29184619 ns 0.99
Conv((3, 3), 64 => 64, identity)(64 x 64 x 64 x 128)/enzyme/CPU/2 thread(s) 1021345729.5 ns 1020122687.5 ns 1.00
Conv((3, 3), 64 => 64, identity)(64 x 64 x 64 x 128)/enzyme/CPU/4 thread(s) 974787791 ns 963543750 ns 1.01
Conv((3, 3), 64 => 64, identity)(64 x 64 x 64 x 128)/enzyme/CPU/8 thread(s) 1329964041.5 ns 1321081709 ns 1.01
Conv((3, 3), 64 => 64, identity)(64 x 64 x 64 x 128)/enzyme/CPU/1 thread(s) 1731428604.5 ns 1722584666.5 ns 1.01
mlp7layer_bn(relu)(32 x 256)/forward/CPU/2 thread(s) 1243062.5 ns 1339041 ns 0.93
mlp7layer_bn(relu)(32 x 256)/forward/CPU/4 thread(s) 955375 ns 970521 ns 0.98
mlp7layer_bn(relu)(32 x 256)/forward/CPU/8 thread(s) 906875 ns 971250 ns 0.93
mlp7layer_bn(relu)(32 x 256)/forward/CPU/1 thread(s) 2048500 ns 1954229 ns 1.05
mlp7layer_bn(relu)(32 x 256)/forward/GPU/CUDA 563206.5 ns 575255 ns 0.98
mlp7layer_bn(relu)(32 x 256)/zygote/CPU/2 thread(s) 5919958 ns 6044667 ns 0.98
mlp7layer_bn(relu)(32 x 256)/zygote/CPU/4 thread(s) 6419604 ns 6367750 ns 1.01
mlp7layer_bn(relu)(32 x 256)/zygote/CPU/8 thread(s) 23873812 ns 25225645.5 ns 0.95
mlp7layer_bn(relu)(32 x 256)/zygote/CPU/1 thread(s) 7097854.5 ns 7126291.5 ns 1.00
mlp7layer_bn(relu)(32 x 256)/zygote/GPU/CUDA 1364575 ns 1413961 ns 0.97
mlp7layer_bn(relu)(32 x 256)/enzyme/CPU/2 thread(s) 9591542 ns 11543792 ns 0.83
mlp7layer_bn(relu)(32 x 256)/enzyme/CPU/4 thread(s) 13052166.5 ns 9916958 ns 1.32
mlp7layer_bn(relu)(32 x 256)/enzyme/CPU/8 thread(s) 31360875 ns 18299375 ns 1.71
mlp7layer_bn(relu)(32 x 256)/enzyme/CPU/1 thread(s) 7260167 ns 8684625 ns 0.84
Dense(128 => 128, gelu)(128 x 128)/forward/CPU/2 thread(s) 481625 ns 389458 ns 1.24
Dense(128 => 128, gelu)(128 x 128)/forward/CPU/4 thread(s) 443500 ns 359500 ns 1.23
Dense(128 => 128, gelu)(128 x 128)/forward/CPU/8 thread(s) 1999500 ns 2193750 ns 0.91
Dense(128 => 128, gelu)(128 x 128)/forward/CPU/1 thread(s) 87833 ns 88042 ns 1.00
Dense(128 => 128, gelu)(128 x 128)/forward/GPU/CUDA 27760 ns 28321 ns 0.98
Dense(128 => 128, gelu)(128 x 128)/zygote/CPU/2 thread(s) 377333 ns 396895.5 ns 0.95
Dense(128 => 128, gelu)(128 x 128)/zygote/CPU/4 thread(s) 439500 ns 440458 ns 1.00
Dense(128 => 128, gelu)(128 x 128)/zygote/CPU/8 thread(s) 4505250 ns 4619792 ns 0.98
Dense(128 => 128, gelu)(128 x 128)/zygote/CPU/1 thread(s) 258291 ns 265375 ns 0.97
Dense(128 => 128, gelu)(128 x 128)/zygote/GPU/CUDA 219213 ns 227034 ns 0.97
Dense(128 => 128, gelu)(128 x 128)/enzyme/CPU/2 thread(s) 408166.5 ns 429500 ns 0.95
Dense(128 => 128, gelu)(128 x 128)/enzyme/CPU/4 thread(s) 470125 ns 470792 ns 1.00
Dense(128 => 128, gelu)(128 x 128)/enzyme/CPU/8 thread(s) 4495000 ns 4863667 ns 0.92
Dense(128 => 128, gelu)(128 x 128)/enzyme/CPU/1 thread(s) 271000 ns 271041.5 ns 1.00
Dense(128 => 128, relu)(128 x 128)/forward/CPU/2 thread(s) 427792 ns 333667 ns 1.28
Dense(128 => 128, relu)(128 x 128)/forward/CPU/4 thread(s) 376896 ns 294437 ns 1.28
Dense(128 => 128, relu)(128 x 128)/forward/CPU/8 thread(s) 733562.5 ns 740833.5 ns 0.99
Dense(128 => 128, relu)(128 x 128)/forward/CPU/1 thread(s) 52417 ns 52854.5 ns 0.99
Dense(128 => 128, relu)(128 x 128)/forward/GPU/CUDA 28102 ns 28719 ns 0.98
Dense(128 => 128, relu)(128 x 128)/zygote/CPU/2 thread(s) 336500 ns 366250 ns 0.92
Dense(128 => 128, relu)(128 x 128)/zygote/CPU/4 thread(s) 333854 ns 335625 ns 0.99
Dense(128 => 128, relu)(128 x 128)/zygote/CPU/8 thread(s) 419229.5 ns 813354 ns 0.52
Dense(128 => 128, relu)(128 x 128)/zygote/CPU/1 thread(s) 151562.5 ns 155312 ns 0.98
Dense(128 => 128, relu)(128 x 128)/zygote/GPU/CUDA 204281.5 ns 212500 ns 0.96
Dense(128 => 128, relu)(128 x 128)/enzyme/CPU/2 thread(s) 351375 ns 377292 ns 0.93
Dense(128 => 128, relu)(128 x 128)/enzyme/CPU/4 thread(s) 348375 ns 351833 ns 0.99
Dense(128 => 128, relu)(128 x 128)/enzyme/CPU/8 thread(s) 899625 ns 908146 ns 0.99
Dense(128 => 128, relu)(128 x 128)/enzyme/CPU/1 thread(s) 150667 ns 150959 ns 1.00
vgg16(32, 32, 3, 64)/forward/CPU/2 thread(s) 603094958 ns 602630000 ns 1.00
vgg16(32, 32, 3, 64)/forward/CPU/4 thread(s) 428615062.5 ns 425446812.5 ns 1.01
vgg16(32, 32, 3, 64)/forward/CPU/8 thread(s) 384740459 ns 373310250 ns 1.03
vgg16(32, 32, 3, 64)/forward/CPU/1 thread(s) 873854208.5 ns 874089917 ns 1.00
vgg16(32, 32, 3, 64)/forward/GPU/CUDA 7027277 ns 7031776 ns 1.00
vgg16(32, 32, 3, 64)/zygote/CPU/2 thread(s) 2002711146 ns 2002661937.5 ns 1.00
vgg16(32, 32, 3, 64)/zygote/CPU/4 thread(s) 1606403375 ns 1606159333.5 ns 1.00
vgg16(32, 32, 3, 64)/zygote/CPU/8 thread(s) 1551092146 ns 1596550354 ns 0.97
vgg16(32, 32, 3, 64)/zygote/CPU/1 thread(s) 2631708917 ns 2634530667 ns 1.00
vgg16(32, 32, 3, 64)/zygote/GPU/CUDA 26123824 ns 25871825 ns 1.01
Dense(512 => 512, gelu)(512 x 128)/forward/CPU/2 thread(s) 521396 ns 520500 ns 1.00
Dense(512 => 512, gelu)(512 x 128)/forward/CPU/4 thread(s) 431875 ns 438417 ns 0.99
Dense(512 => 512, gelu)(512 x 128)/forward/CPU/8 thread(s) 1926416 ns 1781667 ns 1.08
Dense(512 => 512, gelu)(512 x 128)/forward/CPU/1 thread(s) 866417 ns 875958 ns 0.99
Dense(512 => 512, gelu)(512 x 128)/forward/GPU/CUDA 47024 ns 47831 ns 0.98
Dense(512 => 512, gelu)(512 x 128)/zygote/CPU/2 thread(s) 1855270.5 ns 1845875 ns 1.01
Dense(512 => 512, gelu)(512 x 128)/zygote/CPU/4 thread(s) 2793583 ns 2803167 ns 1.00
Dense(512 => 512, gelu)(512 x 128)/zygote/CPU/8 thread(s) 14609250 ns 14421478.5 ns 1.01
Dense(512 => 512, gelu)(512 x 128)/zygote/CPU/1 thread(s) 2648521 ns 2762500 ns 0.96
Dense(512 => 512, gelu)(512 x 128)/zygote/GPU/CUDA 246347 ns 254912 ns 0.97
Dense(512 => 512, gelu)(512 x 128)/enzyme/CPU/2 thread(s) 1974875 ns 1952312.5 ns 1.01
Dense(512 => 512, gelu)(512 x 128)/enzyme/CPU/4 thread(s) 5038917 ns 5063375 ns 1.00
Dense(512 => 512, gelu)(512 x 128)/enzyme/CPU/8 thread(s) 15177854.5 ns 14644042 ns 1.04
Dense(512 => 512, gelu)(512 x 128)/enzyme/CPU/1 thread(s) 2744270.5 ns 2791250 ns 0.98
mlp7layer_bn(tanh)(32 x 256)/forward/CPU/2 thread(s) 1512729 ns 1556042 ns 0.97
mlp7layer_bn(tanh)(32 x 256)/forward/CPU/4 thread(s) 1178292 ns 1242250 ns 0.95
mlp7layer_bn(tanh)(32 x 256)/forward/CPU/8 thread(s) 1180084 ns 1188250 ns 0.99
mlp7layer_bn(tanh)(32 x 256)/forward/CPU/1 thread(s) 2300375 ns 2351542 ns 0.98
mlp7layer_bn(tanh)(32 x 256)/forward/GPU/CUDA 589242.5 ns 587337 ns 1.00
mlp7layer_bn(tanh)(32 x 256)/zygote/CPU/2 thread(s) 5245791 ns 5962500.5 ns 0.88
mlp7layer_bn(tanh)(32 x 256)/zygote/CPU/4 thread(s) 4733604 ns 4728083 ns 1.00
mlp7layer_bn(tanh)(32 x 256)/zygote/CPU/8 thread(s) 24184833 ns 25706291.5 ns 0.94
mlp7layer_bn(tanh)(32 x 256)/zygote/CPU/1 thread(s) 7316583 ns 7342917 ns 1.00
mlp7layer_bn(tanh)(32 x 256)/zygote/GPU/CUDA 1392514 ns 1398774.5 ns 1.00
mlp7layer_bn(tanh)(32 x 256)/enzyme/CPU/2 thread(s) 11607209 ns 13281166.5 ns 0.87
mlp7layer_bn(tanh)(32 x 256)/enzyme/CPU/4 thread(s) 16305271 ns 11246833 ns 1.45
mlp7layer_bn(tanh)(32 x 256)/enzyme/CPU/8 thread(s) 35977250 ns 20840520.5 ns 1.73
mlp7layer_bn(tanh)(32 x 256)/enzyme/CPU/1 thread(s) 9550875 ns 10611542 ns 0.90
Dense(16 => 16, relu)(16 x 128)/forward/CPU/2 thread(s) 2333 ns 2770.5 ns 0.84
Dense(16 => 16, relu)(16 x 128)/forward/CPU/4 thread(s) 2542 ns 4875 ns 0.52
Dense(16 => 16, relu)(16 x 128)/forward/CPU/8 thread(s) 3083 ns 2875 ns 1.07
Dense(16 => 16, relu)(16 x 128)/forward/CPU/1 thread(s) 2458 ns 2500 ns 0.98
Dense(16 => 16, relu)(16 x 128)/forward/GPU/CUDA 25059 ns 25042 ns 1.00
Dense(16 => 16, relu)(16 x 128)/zygote/CPU/2 thread(s) 7417 ns 7084 ns 1.05
Dense(16 => 16, relu)(16 x 128)/zygote/CPU/4 thread(s) 7042 ns 7166 ns 0.98
Dense(16 => 16, relu)(16 x 128)/zygote/CPU/8 thread(s) 7209 ns 7333 ns 0.98
Dense(16 => 16, relu)(16 x 128)/zygote/CPU/1 thread(s) 7333 ns 7208 ns 1.02
Dense(16 => 16, relu)(16 x 128)/zygote/GPU/CUDA 214253 ns 216255 ns 0.99
Dense(16 => 16, relu)(16 x 128)/enzyme/CPU/2 thread(s) 8333 ns 8500 ns 0.98
Dense(16 => 16, relu)(16 x 128)/enzyme/CPU/4 thread(s) 8083 ns 8125 ns 0.99
Dense(16 => 16, relu)(16 x 128)/enzyme/CPU/8 thread(s) 8437.5 ns 8291 ns 1.02
Dense(16 => 16, relu)(16 x 128)/enzyme/CPU/1 thread(s) 6125 ns 5958 ns 1.03
Dense(16 => 16, gelu)(16 x 128)/forward/CPU/2 thread(s) 10667 ns 11416 ns 0.93
Dense(16 => 16, gelu)(16 x 128)/forward/CPU/4 thread(s) 13791 ns 14208 ns 0.97
Dense(16 => 16, gelu)(16 x 128)/forward/CPU/8 thread(s) 11208 ns 10958 ns 1.02
Dense(16 => 16, gelu)(16 x 128)/forward/CPU/1 thread(s) 7375 ns 7167 ns 1.03
Dense(16 => 16, gelu)(16 x 128)/forward/GPU/CUDA 25157 ns 25461 ns 0.99
Dense(16 => 16, gelu)(16 x 128)/zygote/CPU/2 thread(s) 20062.5 ns 19792 ns 1.01
Dense(16 => 16, gelu)(16 x 128)/zygote/CPU/4 thread(s) 19833 ns 20084 ns 0.99
Dense(16 => 16, gelu)(16 x 128)/zygote/CPU/8 thread(s) 20041 ns 20041 ns 1
Dense(16 => 16, gelu)(16 x 128)/zygote/CPU/1 thread(s) 20000 ns 19875 ns 1.01
Dense(16 => 16, gelu)(16 x 128)/zygote/GPU/CUDA 235669 ns 236659.5 ns 1.00
Dense(16 => 16, gelu)(16 x 128)/enzyme/CPU/2 thread(s) 23562.5 ns 23500 ns 1.00
Dense(16 => 16, gelu)(16 x 128)/enzyme/CPU/4 thread(s) 23417 ns 23500 ns 1.00
Dense(16 => 16, gelu)(16 x 128)/enzyme/CPU/8 thread(s) 23625 ns 23791 ns 0.99
Dense(16 => 16, gelu)(16 x 128)/enzyme/CPU/1 thread(s) 21458 ns 21375 ns 1.00
Dense(128 => 128, identity)(128 x 128)/forward/CPU/2 thread(s) 28584 ns 28479.5 ns 1.00
Dense(128 => 128, identity)(128 x 128)/forward/CPU/4 thread(s) 28916 ns 28917 ns 1.00
Dense(128 => 128, identity)(128 x 128)/forward/CPU/8 thread(s) 29417 ns 28625 ns 1.03
Dense(128 => 128, identity)(128 x 128)/forward/CPU/1 thread(s) 46000 ns 46083 ns 1.00
Dense(128 => 128, identity)(128 x 128)/forward/GPU/CUDA 26406 ns 26494 ns 1.00
Dense(128 => 128, identity)(128 x 128)/zygote/CPU/2 thread(s) 221667 ns 222354.5 ns 1.00
Dense(128 => 128, identity)(128 x 128)/zygote/CPU/4 thread(s) 278604.5 ns 275541.5 ns 1.01
Dense(128 => 128, identity)(128 x 128)/zygote/CPU/8 thread(s) 4081750 ns 4186187.5 ns 0.98
Dense(128 => 128, identity)(128 x 128)/zygote/CPU/1 thread(s) 145833 ns 145834 ns 1.00
Dense(128 => 128, identity)(128 x 128)/zygote/GPU/CUDA 208494.5 ns 209501 ns 1.00
Dense(128 => 128, identity)(128 x 128)/enzyme/CPU/2 thread(s) 237333 ns 333708 ns 0.71
Dense(128 => 128, identity)(128 x 128)/enzyme/CPU/4 thread(s) 295625 ns 313334 ns 0.94
Dense(128 => 128, identity)(128 x 128)/enzyme/CPU/8 thread(s) 4027625 ns 519937.5 ns 7.75
Dense(128 => 128, identity)(128 x 128)/enzyme/CPU/1 thread(s) 145875 ns 161041 ns 0.91
Dense(16 => 16, identity)(16 x 128)/forward/CPU/2 thread(s) 2083.5 ns 1729.5 ns 1.20
Dense(16 => 16, identity)(16 x 128)/forward/CPU/4 thread(s) 1917 ns 4584 ns 0.42
Dense(16 => 16, identity)(16 x 128)/forward/CPU/8 thread(s) 2458 ns 2417 ns 1.02
Dense(16 => 16, identity)(16 x 128)/forward/CPU/1 thread(s) 1791 ns 2084 ns 0.86
Dense(16 => 16, identity)(16 x 128)/forward/GPU/CUDA 23206 ns 23313.5 ns 1.00
Dense(16 => 16, identity)(16 x 128)/zygote/CPU/2 thread(s) 5333 ns 5500 ns 0.97
Dense(16 => 16, identity)(16 x 128)/zygote/CPU/4 thread(s) 5167 ns 5250 ns 0.98
Dense(16 => 16, identity)(16 x 128)/zygote/CPU/8 thread(s) 5375 ns 5209 ns 1.03
Dense(16 => 16, identity)(16 x 128)/zygote/CPU/1 thread(s) 5250 ns 5209 ns 1.01
Dense(16 => 16, identity)(16 x 128)/zygote/GPU/CUDA 238219 ns 243931.5 ns 0.98
Dense(16 => 16, identity)(16 x 128)/enzyme/CPU/2 thread(s) 7292 ns 11250 ns 0.65
Dense(16 => 16, identity)(16 x 128)/enzyme/CPU/4 thread(s) 7291 ns 11291 ns 0.65
Dense(16 => 16, identity)(16 x 128)/enzyme/CPU/8 thread(s) 7542 ns 11417 ns 0.66
Dense(16 => 16, identity)(16 x 128)/enzyme/CPU/1 thread(s) 5333 ns 6917 ns 0.77
Conv((3, 3), 32 => 32, gelu)(64 x 64 x 32 x 128)/forward/CPU/2 thread(s) 79904000 ns 79906375 ns 1.00
Conv((3, 3), 32 => 32, gelu)(64 x 64 x 32 x 128)/forward/CPU/4 thread(s) 49166750 ns 49046500 ns 1.00
Conv((3, 3), 32 => 32, gelu)(64 x 64 x 32 x 128)/forward/CPU/8 thread(s) 44974542 ns 44994417 ns 1.00
Conv((3, 3), 32 => 32, gelu)(64 x 64 x 32 x 128)/forward/CPU/1 thread(s) 151504667 ns 151607583 ns 1.00
Conv((3, 3), 32 => 32, gelu)(64 x 64 x 32 x 128)/forward/GPU/CUDA 2718498 ns 2724784.5 ns 1.00
Conv((3, 3), 32 => 32, gelu)(64 x 64 x 32 x 128)/zygote/CPU/2 thread(s) 496218625 ns 664456250 ns 0.75
Conv((3, 3), 32 => 32, gelu)(64 x 64 x 32 x 128)/zygote/CPU/4 thread(s) 410097125 ns 410418958 ns 1.00
Conv((3, 3), 32 => 32, gelu)(64 x 64 x 32 x 128)/zygote/CPU/8 thread(s) 397607667 ns 401447083 ns 0.99
Conv((3, 3), 32 => 32, gelu)(64 x 64 x 32 x 128)/zygote/CPU/1 thread(s) 684031750 ns 682646167 ns 1.00
Conv((3, 3), 32 => 32, gelu)(64 x 64 x 32 x 128)/zygote/GPU/CUDA 14583158 ns 14573408 ns 1.00
Conv((3, 3), 32 => 32, gelu)(64 x 64 x 32 x 128)/enzyme/CPU/2 thread(s) 709703166.5 ns 711193604.5 ns 1.00
Conv((3, 3), 32 => 32, gelu)(64 x 64 x 32 x 128)/enzyme/CPU/4 thread(s) 675407250 ns 688899625 ns 0.98
Conv((3, 3), 32 => 32, gelu)(64 x 64 x 32 x 128)/enzyme/CPU/8 thread(s) 1001028958 ns 1017383917 ns 0.98
Conv((3, 3), 32 => 32, gelu)(64 x 64 x 32 x 128)/enzyme/CPU/1 thread(s) 995697250 ns 998024084 ns 1.00

This comment was automatically generated by workflow using github-action-benchmark.

Please sign in to comment.