From 87d8c27bbbc90b3809245ac438ebdf98ba811511 Mon Sep 17 00:00:00 2001 From: github-action-benchmark Date: Fri, 18 Oct 2024 18:55:37 +0000 Subject: [PATCH] add LuxLib Benchmarks (julia) benchmark result for 98a2d7ad69cba4a97a848d8e0e4f7419c543fda2 --- benchmarks/data.js | 8306 +++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 8305 insertions(+), 1 deletion(-) diff --git a/benchmarks/data.js b/benchmarks/data.js index 95ab5267..bc2db9ec 100644 --- a/benchmarks/data.js +++ b/benchmarks/data.js @@ -1,5 +1,5 @@ window.BENCHMARK_DATA = { - "lastUpdate": 1729120928925, + "lastUpdate": 1729277737004, "repoUrl": "https://github.com/LuxDL/LuxLib.jl", "entries": { "LuxLib Benchmarks": [ @@ -377306,6 +377306,8310 @@ window.BENCHMARK_DATA = { "extra": "gctime=0\nmemory=74264\nallocs=1456\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" } ] + }, + { + "commit": { + "author": { + "email": "avikpal@mit.edu", + "name": "Avik Pal", + "username": "avik-pal" + }, + "committer": { + "email": "noreply@github.com", + "name": "GitHub", + "username": "web-flow" + }, + "distinct": true, + "id": "98a2d7ad69cba4a97a848d8e0e4f7419c543fda2", + "message": "refactor: move `JuliaSIMD` deps to extensions (#175)\n\n* fix: remove LV.vmap! usage\r\n\r\n* fix: remove LV handling for bias_activation\r\n\r\n* fix: remove LV usage in dropout\r\n\r\n* refactor: move LV and octavian behind an extension\r\n\r\n* docs: add docs for loading packages\r\n\r\n* refactor: move SLEEFPirates to an ext\r\n\r\n* fix: enzyme rules for batched matmul\r\n\r\n* fix: patch more enzyme issues\r\n\r\n* feat: add a preference to disable loop vectorization\r\n\r\n* fix: incorrect dispatch called\r\n\r\n* fix: enzyme segfault bypass", + "timestamp": "2024-10-18T14:04:57-04:00", + "tree_id": "ee2e6117dbe3adfe91957fc9a8e5cd3e20c2df52", + "url": "https://github.com/LuxDL/LuxLib.jl/commit/98a2d7ad69cba4a97a848d8e0e4f7419c543fda2" + }, + "date": 1729277734832, + "tool": "julia", + "benches": [ + { + "name": "layernorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/2 thread(s)", + "value": 6417, + "unit": "ns", + "extra": "gctime=0\nmemory=1632\nallocs=13\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/4 thread(s)", + "value": 6041, + "unit": "ns", + "extra": "gctime=0\nmemory=1632\nallocs=13\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/8 thread(s)", + "value": 7167, + "unit": "ns", + "extra": "gctime=0\nmemory=1632\nallocs=13\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/1 thread(s)", + "value": 5292, + "unit": "ns", + "extra": "gctime=0\nmemory=1632\nallocs=13\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=gelu, affine=false)(4 x 32)/forward/GPU/CUDA", + "value": 103542, + "unit": "ns", + "extra": "gctime=0\nmemory=9952\nallocs=395\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=gelu, affine=false)(4 x 32)/forward/GPU/AMDGPU", + "value": 637131, + "unit": "ns", + "extra": "gctime=0\nmemory=17664\nallocs=533\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/2 thread(s)", + "value": 10166.5, + "unit": "ns", + "extra": "gctime=0\nmemory=15936\nallocs=102\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/4 thread(s)", + "value": 9958, + "unit": "ns", + "extra": "gctime=0\nmemory=15936\nallocs=102\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/8 thread(s)", + "value": 10291.5, + "unit": "ns", + "extra": "gctime=0\nmemory=15936\nallocs=102\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/1 thread(s)", + "value": 9979.5, + "unit": "ns", + "extra": "gctime=0\nmemory=15936\nallocs=102\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=gelu, affine=false)(4 x 32)/zygote/GPU/CUDA", + "value": 494284, + "unit": "ns", + "extra": "gctime=0\nmemory=66016\nallocs=2465\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=gelu, affine=false)(4 x 32)/zygote/GPU/AMDGPU", + "value": 719725, + "unit": "ns", + "extra": "gctime=0\nmemory=55376\nallocs=1185\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "bias_activation(32, act=relu)(32 x 128)/forward/CPU/2 thread(s)", + "value": 1583, + "unit": "ns", + "extra": "gctime=0\nmemory=16832\nallocs=5\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "bias_activation(32, act=relu)(32 x 128)/forward/CPU/4 thread(s)", + "value": 1542, + "unit": "ns", + "extra": "gctime=0\nmemory=16832\nallocs=5\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "bias_activation(32, act=relu)(32 x 128)/forward/CPU/8 thread(s)", + "value": 1666, + "unit": "ns", + "extra": "gctime=0\nmemory=16832\nallocs=5\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "bias_activation(32, act=relu)(32 x 128)/forward/CPU/1 thread(s)", + "value": 1500, + "unit": "ns", + "extra": "gctime=0\nmemory=16832\nallocs=5\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "bias_activation(32, act=relu)(32 x 128)/forward/GPU/CUDA", + "value": 20684, + "unit": "ns", + "extra": "gctime=0\nmemory=2144\nallocs=92\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "bias_activation(32, act=relu)(32 x 128)/forward/GPU/AMDGPU", + "value": 33302, + "unit": "ns", + "extra": "gctime=0\nmemory=1584\nallocs=51\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "bias_activation(32, act=relu)(32 x 128)/zygote/CPU/2 thread(s)", + "value": 3812.5, + "unit": "ns", + "extra": "gctime=0\nmemory=50496\nallocs=13\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "bias_activation(32, act=relu)(32 x 128)/zygote/CPU/4 thread(s)", + "value": 4125, + "unit": "ns", + "extra": "gctime=0\nmemory=50496\nallocs=13\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "bias_activation(32, act=relu)(32 x 128)/zygote/CPU/8 thread(s)", + "value": 4250, + "unit": "ns", + "extra": "gctime=0\nmemory=50496\nallocs=13\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "bias_activation(32, act=relu)(32 x 128)/zygote/CPU/1 thread(s)", + "value": 4334, + "unit": "ns", + "extra": "gctime=0\nmemory=50496\nallocs=13\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "bias_activation(32, act=relu)(32 x 128)/zygote/GPU/CUDA", + "value": 134278.5, + "unit": "ns", + "extra": "gctime=0\nmemory=14376\nallocs=530\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "bias_activation(32, act=relu)(32 x 128)/zygote/GPU/AMDGPU", + "value": 143062.5, + "unit": "ns", + "extra": "gctime=0\nmemory=16168\nallocs=374\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s)", + "value": 58000, + "unit": "ns", + "extra": "gctime=0\nmemory=1049824\nallocs=20\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s)", + "value": 46417, + "unit": "ns", + "extra": "gctime=0\nmemory=1049824\nallocs=20\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s)", + "value": 46875, + "unit": "ns", + "extra": "gctime=0\nmemory=1049824\nallocs=20\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s)", + "value": 83750, + "unit": "ns", + "extra": "gctime=0\nmemory=1049824\nallocs=20\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/CUDA", + "value": 37449, + "unit": "ns", + "extra": "gctime=0\nmemory=3712\nallocs=128\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/AMDGPU", + "value": 70883, + "unit": "ns", + "extra": "gctime=0\nmemory=7056\nallocs=206\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s)", + "value": 2037500, + "unit": "ns", + "extra": "gctime=0\nmemory=7356096\nallocs=107\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s)", + "value": 2083416.5, + "unit": "ns", + "extra": "gctime=0\nmemory=7356096\nallocs=107\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s)", + "value": 2090916.5, + "unit": "ns", + "extra": "gctime=0\nmemory=7356096\nallocs=107\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s)", + "value": 1996979.5, + "unit": "ns", + "extra": "gctime=0\nmemory=7356096\nallocs=107\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/CUDA", + "value": 220080, + "unit": "ns", + "extra": "gctime=0\nmemory=22712\nallocs=712\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/AMDGPU", + "value": 1213928, + "unit": "ns", + "extra": "gctime=0\nmemory=90056\nallocs=2122\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s)", + "value": 173708, + "unit": "ns", + "extra": "gctime=0\nmemory=1050128\nallocs=23\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s)", + "value": 146625, + "unit": "ns", + "extra": "gctime=0\nmemory=1050128\nallocs=23\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s)", + "value": 165062.5, + "unit": "ns", + "extra": "gctime=0\nmemory=1050128\nallocs=23\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s)", + "value": 172000, + "unit": "ns", + "extra": "gctime=0\nmemory=1050128\nallocs=23\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/GPU/CUDA", + "value": 167869.5, + "unit": "ns", + "extra": "gctime=0\nmemory=13152\nallocs=479\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/GPU/AMDGPU", + "value": 196051.5, + "unit": "ns", + "extra": "gctime=0\nmemory=19056\nallocs=443\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s)", + "value": 1113854.5, + "unit": "ns", + "extra": "gctime=0\nmemory=8401280\nallocs=142\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s)", + "value": 1110541, + "unit": "ns", + "extra": "gctime=0\nmemory=8401280\nallocs=142\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s)", + "value": 1118667, + "unit": "ns", + "extra": "gctime=0\nmemory=8401280\nallocs=142\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s)", + "value": 1124479.5, + "unit": "ns", + "extra": "gctime=0\nmemory=8401280\nallocs=142\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/CUDA", + "value": 644177, + "unit": "ns", + "extra": "gctime=0\nmemory=70912\nallocs=2543\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/AMDGPU", + "value": 899376, + "unit": "ns", + "extra": "gctime=0\nmemory=74736\nallocs=1515\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/2 thread(s)", + "value": 5333, + "unit": "ns", + "extra": "gctime=0\nmemory=2400\nallocs=13\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/4 thread(s)", + "value": 4875, + "unit": "ns", + "extra": "gctime=0\nmemory=2400\nallocs=13\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/8 thread(s)", + "value": 6750, + "unit": "ns", + "extra": "gctime=0\nmemory=2400\nallocs=13\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/1 thread(s)", + "value": 4416, + "unit": "ns", + "extra": "gctime=0\nmemory=2400\nallocs=13\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=relu, affine=true)(4 x 32)/forward/GPU/CUDA", + "value": 83066, + "unit": "ns", + "extra": "gctime=0\nmemory=10912\nallocs=455\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=relu, affine=true)(4 x 32)/forward/GPU/AMDGPU", + "value": 64020, + "unit": "ns", + "extra": "gctime=0\nmemory=10704\nallocs=258\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/2 thread(s)", + "value": 8584, + "unit": "ns", + "extra": "gctime=0\nmemory=17872\nallocs=88\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/4 thread(s)", + "value": 8750, + "unit": "ns", + "extra": "gctime=0\nmemory=17872\nallocs=88\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/8 thread(s)", + "value": 8875, + "unit": "ns", + "extra": "gctime=0\nmemory=17872\nallocs=88\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/1 thread(s)", + "value": 8584, + "unit": "ns", + "extra": "gctime=0\nmemory=17872\nallocs=88\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=relu, affine=true)(4 x 32)/zygote/GPU/CUDA", + "value": 552192.5, + "unit": "ns", + "extra": "gctime=0\nmemory=67008\nallocs=2588\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=relu, affine=true)(4 x 32)/zygote/GPU/AMDGPU", + "value": 372446, + "unit": "ns", + "extra": "gctime=0\nmemory=56992\nallocs=1211\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s)", + "value": 17229.5, + "unit": "ns", + "extra": "gctime=0\nmemory=134288\nallocs=39\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s)", + "value": 17250, + "unit": "ns", + "extra": "gctime=0\nmemory=134288\nallocs=39\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s)", + "value": 21542, + "unit": "ns", + "extra": "gctime=0\nmemory=134288\nallocs=39\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s)", + "value": 17208.5, + "unit": "ns", + "extra": "gctime=0\nmemory=134288\nallocs=39\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/GPU/CUDA", + "value": 63166, + "unit": "ns", + "extra": "gctime=0\nmemory=9920\nallocs=246\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/GPU/AMDGPU", + "value": 79573.5, + "unit": "ns", + "extra": "gctime=0\nmemory=16256\nallocs=309\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s)", + "value": 220583, + "unit": "ns", + "extra": "gctime=0\nmemory=813456\nallocs=127\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s)", + "value": 218875, + "unit": "ns", + "extra": "gctime=0\nmemory=813456\nallocs=127\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s)", + "value": 223125, + "unit": "ns", + "extra": "gctime=0\nmemory=813456\nallocs=127\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s)", + "value": 219625, + "unit": "ns", + "extra": "gctime=0\nmemory=813456\nallocs=127\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/CUDA", + "value": 329089, + "unit": "ns", + "extra": "gctime=0\nmemory=57056\nallocs=1259\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/AMDGPU", + "value": 423777, + "unit": "ns", + "extra": "gctime=0\nmemory=67296\nallocs=1097\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "bias_activation(2, act=relu)(2 x 128)/forward/CPU/2 thread(s)", + "value": 583, + "unit": "ns", + "extra": "gctime=0\nmemory=1280\nallocs=5\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "bias_activation(2, act=relu)(2 x 128)/forward/CPU/4 thread(s)", + "value": 625, + "unit": "ns", + "extra": "gctime=0\nmemory=1280\nallocs=5\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "bias_activation(2, act=relu)(2 x 128)/forward/CPU/8 thread(s)", + "value": 833, + "unit": "ns", + "extra": "gctime=0\nmemory=1280\nallocs=5\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "bias_activation(2, act=relu)(2 x 128)/forward/CPU/1 thread(s)", + "value": 834, + "unit": "ns", + "extra": "gctime=0\nmemory=1280\nallocs=5\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "bias_activation(2, act=relu)(2 x 128)/forward/GPU/CUDA", + "value": 19066, + "unit": "ns", + "extra": "gctime=0\nmemory=2144\nallocs=92\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "bias_activation(2, act=relu)(2 x 128)/forward/GPU/AMDGPU", + "value": 27311, + "unit": "ns", + "extra": "gctime=0\nmemory=1824\nallocs=66\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "bias_activation(2, act=relu)(2 x 128)/zygote/CPU/2 thread(s)", + "value": 1417, + "unit": "ns", + "extra": "gctime=0\nmemory=3712\nallocs=13\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "bias_activation(2, act=relu)(2 x 128)/zygote/CPU/4 thread(s)", + "value": 1417, + "unit": "ns", + "extra": "gctime=0\nmemory=3712\nallocs=13\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "bias_activation(2, act=relu)(2 x 128)/zygote/CPU/8 thread(s)", + "value": 1583, + "unit": "ns", + "extra": "gctime=0\nmemory=3712\nallocs=13\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "bias_activation(2, act=relu)(2 x 128)/zygote/CPU/1 thread(s)", + "value": 1375, + "unit": "ns", + "extra": "gctime=0\nmemory=3712\nallocs=13\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "bias_activation(2, act=relu)(2 x 128)/zygote/GPU/CUDA", + "value": 116071.5, + "unit": "ns", + "extra": "gctime=0\nmemory=12792\nallocs=482\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "bias_activation(2, act=relu)(2 x 128)/zygote/GPU/AMDGPU", + "value": 118732, + "unit": "ns", + "extra": "gctime=0\nmemory=12920\nallocs=286\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s)", + "value": 7375, + "unit": "ns", + "extra": "gctime=0\nmemory=132096\nallocs=20\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s)", + "value": 6000, + "unit": "ns", + "extra": "gctime=0\nmemory=132096\nallocs=20\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s)", + "value": 6083, + "unit": "ns", + "extra": "gctime=0\nmemory=132096\nallocs=20\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s)", + "value": 10334, + "unit": "ns", + "extra": "gctime=0\nmemory=132096\nallocs=20\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/CUDA", + "value": 24482, + "unit": "ns", + "extra": "gctime=0\nmemory=2800\nallocs=95\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/AMDGPU", + "value": 52122, + "unit": "ns", + "extra": "gctime=0\nmemory=6816\nallocs=181\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s)", + "value": 229541.5, + "unit": "ns", + "extra": "gctime=0\nmemory=932800\nallocs=109\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s)", + "value": 268417, + "unit": "ns", + "extra": "gctime=0\nmemory=932800\nallocs=109\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s)", + "value": 241500, + "unit": "ns", + "extra": "gctime=0\nmemory=932800\nallocs=109\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s)", + "value": 251250, + "unit": "ns", + "extra": "gctime=0\nmemory=932800\nallocs=109\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/CUDA", + "value": 189293, + "unit": "ns", + "extra": "gctime=0\nmemory=21400\nallocs=648\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/AMDGPU", + "value": 588480, + "unit": "ns", + "extra": "gctime=0\nmemory=78632\nallocs=1798\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(32, bias=false, act=relu)(32 x 128)/forward/CPU/2 thread(s)", + "value": 3917, + "unit": "ns", + "extra": "gctime=0\nmemory=16640\nallocs=1\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(32, bias=false, act=relu)(32 x 128)/forward/CPU/4 thread(s)", + "value": 3917, + "unit": "ns", + "extra": "gctime=0\nmemory=16640\nallocs=1\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(32, bias=false, act=relu)(32 x 128)/forward/CPU/8 thread(s)", + "value": 3958, + "unit": "ns", + "extra": "gctime=0\nmemory=16640\nallocs=1\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(32, bias=false, act=relu)(32 x 128)/forward/CPU/1 thread(s)", + "value": 4042, + "unit": "ns", + "extra": "gctime=0\nmemory=16640\nallocs=1\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(32, bias=false, act=relu)(32 x 128)/forward/GPU/CUDA", + "value": 23660.5, + "unit": "ns", + "extra": "gctime=0\nmemory=960\nallocs=41\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(32, bias=false, act=relu)(32 x 128)/forward/GPU/AMDGPU", + "value": 43502, + "unit": "ns", + "extra": "gctime=0\nmemory=1552\nallocs=52\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(32, bias=false, act=relu)(32 x 128)/zygote/CPU/2 thread(s)", + "value": 16833, + "unit": "ns", + "extra": "gctime=0\nmemory=71920\nallocs=26\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(32, bias=false, act=relu)(32 x 128)/zygote/CPU/4 thread(s)", + "value": 16834, + "unit": "ns", + "extra": "gctime=0\nmemory=71920\nallocs=26\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(32, bias=false, act=relu)(32 x 128)/zygote/CPU/8 thread(s)", + "value": 16959, + "unit": "ns", + "extra": "gctime=0\nmemory=71920\nallocs=26\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(32, bias=false, act=relu)(32 x 128)/zygote/CPU/1 thread(s)", + "value": 16666, + "unit": "ns", + "extra": "gctime=0\nmemory=71920\nallocs=26\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(32, bias=false, act=relu)(32 x 128)/zygote/GPU/CUDA", + "value": 188039, + "unit": "ns", + "extra": "gctime=0\nmemory=13864\nallocs=504\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(32, bias=false, act=relu)(32 x 128)/zygote/GPU/AMDGPU", + "value": 166010.5, + "unit": "ns", + "extra": "gctime=0\nmemory=13208\nallocs=311\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(512, bias=false, act=gelu)(512 x 128)/forward/CPU/2 thread(s)", + "value": 929291, + "unit": "ns", + "extra": "gctime=0\nmemory=262192\nallocs=2\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(512, bias=false, act=gelu)(512 x 128)/forward/CPU/4 thread(s)", + "value": 838708, + "unit": "ns", + "extra": "gctime=0\nmemory=262192\nallocs=2\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(512, bias=false, act=gelu)(512 x 128)/forward/CPU/8 thread(s)", + "value": 841584, + "unit": "ns", + "extra": "gctime=0\nmemory=262192\nallocs=2\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(512, bias=false, act=gelu)(512 x 128)/forward/CPU/1 thread(s)", + "value": 1269208, + "unit": "ns", + "extra": "gctime=0\nmemory=262192\nallocs=2\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(512, bias=false, act=gelu)(512 x 128)/forward/GPU/CUDA", + "value": 113941, + "unit": "ns", + "extra": "gctime=0\nmemory=960\nallocs=41\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(512, bias=false, act=gelu)(512 x 128)/forward/GPU/AMDGPU", + "value": 396441, + "unit": "ns", + "extra": "gctime=0\nmemory=8240\nallocs=328\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(512, bias=false, act=gelu)(512 x 128)/zygote/CPU/2 thread(s)", + "value": 2610729.5, + "unit": "ns", + "extra": "gctime=0\nmemory=2360672\nallocs=33\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(512, bias=false, act=gelu)(512 x 128)/zygote/CPU/4 thread(s)", + "value": 2330541.5, + "unit": "ns", + "extra": "gctime=0\nmemory=2360672\nallocs=33\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(512, bias=false, act=gelu)(512 x 128)/zygote/CPU/8 thread(s)", + "value": 2324458, + "unit": "ns", + "extra": "gctime=0\nmemory=2360672\nallocs=33\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(512, bias=false, act=gelu)(512 x 128)/zygote/CPU/1 thread(s)", + "value": 3478334, + "unit": "ns", + "extra": "gctime=0\nmemory=2360672\nallocs=33\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(512, bias=false, act=gelu)(512 x 128)/zygote/GPU/CUDA", + "value": 232093, + "unit": "ns", + "extra": "gctime=0\nmemory=16520\nallocs=621\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(512, bias=false, act=gelu)(512 x 128)/zygote/GPU/AMDGPU", + "value": 630643.5, + "unit": "ns", + "extra": "gctime=0\nmemory=15560\nallocs=427\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/2 thread(s)", + "value": 6000, + "unit": "ns", + "extra": "gctime=0\nmemory=5408\nallocs=13\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/4 thread(s)", + "value": 7042, + "unit": "ns", + "extra": "gctime=0\nmemory=5408\nallocs=13\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/8 thread(s)", + "value": 7333.5, + "unit": "ns", + "extra": "gctime=0\nmemory=5408\nallocs=13\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/1 thread(s)", + "value": 6584, + "unit": "ns", + "extra": "gctime=0\nmemory=5408\nallocs=13\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=relu, affine=false)(32 x 32)/forward/GPU/CUDA", + "value": 82915, + "unit": "ns", + "extra": "gctime=0\nmemory=9952\nallocs=395\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=relu, affine=false)(32 x 32)/forward/GPU/AMDGPU", + "value": 62131.5, + "unit": "ns", + "extra": "gctime=0\nmemory=10928\nallocs=272\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/2 thread(s)", + "value": 11875, + "unit": "ns", + "extra": "gctime=0\nmemory=51632\nallocs=87\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/4 thread(s)", + "value": 11417, + "unit": "ns", + "extra": "gctime=0\nmemory=51632\nallocs=87\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/8 thread(s)", + "value": 12417, + "unit": "ns", + "extra": "gctime=0\nmemory=51632\nallocs=87\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/1 thread(s)", + "value": 9813, + "unit": "ns", + "extra": "gctime=0\nmemory=51632\nallocs=87\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=relu, affine=false)(32 x 32)/zygote/GPU/CUDA", + "value": 585345.5, + "unit": "ns", + "extra": "gctime=0\nmemory=64240\nallocs=2453\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=relu, affine=false)(32 x 32)/zygote/GPU/AMDGPU", + "value": 388046, + "unit": "ns", + "extra": "gctime=0\nmemory=54368\nallocs=1165\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(2, bias=true, act=relu)(2 x 128)/forward/CPU/2 thread(s)", + "value": 500, + "unit": "ns", + "extra": "gctime=0\nmemory=1280\nallocs=5\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(2, bias=true, act=relu)(2 x 128)/forward/CPU/4 thread(s)", + "value": 542, + "unit": "ns", + "extra": "gctime=0\nmemory=1280\nallocs=5\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(2, bias=true, act=relu)(2 x 128)/forward/CPU/8 thread(s)", + "value": 542, + "unit": "ns", + "extra": "gctime=0\nmemory=1280\nallocs=5\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(2, bias=true, act=relu)(2 x 128)/forward/CPU/1 thread(s)", + "value": 500, + "unit": "ns", + "extra": "gctime=0\nmemory=1280\nallocs=5\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(2, bias=true, act=relu)(2 x 128)/forward/GPU/CUDA", + "value": 23179.5, + "unit": "ns", + "extra": "gctime=0\nmemory=1056\nallocs=47\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(2, bias=true, act=relu)(2 x 128)/forward/GPU/AMDGPU", + "value": 41949, + "unit": "ns", + "extra": "gctime=0\nmemory=1760\nallocs=62\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(2, bias=true, act=relu)(2 x 128)/zygote/CPU/2 thread(s)", + "value": 2083, + "unit": "ns", + "extra": "gctime=0\nmemory=4880\nallocs=15\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(2, bias=true, act=relu)(2 x 128)/zygote/CPU/4 thread(s)", + "value": 2250, + "unit": "ns", + "extra": "gctime=0\nmemory=4880\nallocs=15\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(2, bias=true, act=relu)(2 x 128)/zygote/CPU/8 thread(s)", + "value": 2167, + "unit": "ns", + "extra": "gctime=0\nmemory=4880\nallocs=15\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(2, bias=true, act=relu)(2 x 128)/zygote/CPU/1 thread(s)", + "value": 2083, + "unit": "ns", + "extra": "gctime=0\nmemory=4880\nallocs=15\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(2, bias=true, act=relu)(2 x 128)/zygote/GPU/CUDA", + "value": 226220, + "unit": "ns", + "extra": "gctime=0\nmemory=14008\nallocs=513\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(2, bias=true, act=relu)(2 x 128)/zygote/GPU/AMDGPU", + "value": 166171, + "unit": "ns", + "extra": "gctime=0\nmemory=14344\nallocs=339\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/2 thread(s)", + "value": 8583, + "unit": "ns", + "extra": "gctime=0\nmemory=7344\nallocs=36\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/4 thread(s)", + "value": 8542, + "unit": "ns", + "extra": "gctime=0\nmemory=7344\nallocs=36\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/8 thread(s)", + "value": 10709, + "unit": "ns", + "extra": "gctime=0\nmemory=7344\nallocs=36\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/1 thread(s)", + "value": 8833, + "unit": "ns", + "extra": "gctime=0\nmemory=7344\nallocs=36\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=relu, affine=true)(32 x 32)/forward/GPU/CUDA", + "value": 100758, + "unit": "ns", + "extra": "gctime=0\nmemory=10256\nallocs=270\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=relu, affine=true)(32 x 32)/forward/GPU/AMDGPU", + "value": 72575, + "unit": "ns", + "extra": "gctime=0\nmemory=15584\nallocs=317\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/2 thread(s)", + "value": 17228.5, + "unit": "ns", + "extra": "gctime=0\nmemory=60944\nallocs=124\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/4 thread(s)", + "value": 18583, + "unit": "ns", + "extra": "gctime=0\nmemory=60944\nallocs=124\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/8 thread(s)", + "value": 18500, + "unit": "ns", + "extra": "gctime=0\nmemory=60944\nallocs=124\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/1 thread(s)", + "value": 17750, + "unit": "ns", + "extra": "gctime=0\nmemory=60944\nallocs=124\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=relu, affine=true)(32 x 32)/zygote/GPU/CUDA", + "value": 582511, + "unit": "ns", + "extra": "gctime=0\nmemory=65864\nallocs=1471\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=relu, affine=true)(32 x 32)/zygote/GPU/AMDGPU", + "value": 371318.5, + "unit": "ns", + "extra": "gctime=0\nmemory=77080\nallocs=1252\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/2 thread(s)", + "value": 459, + "unit": "ns", + "extra": "gctime=0\nmemory=5440\nallocs=19\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/4 thread(s)", + "value": 625, + "unit": "ns", + "extra": "gctime=0\nmemory=5440\nallocs=19\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/8 thread(s)", + "value": 583, + "unit": "ns", + "extra": "gctime=0\nmemory=5440\nallocs=19\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/1 thread(s)", + "value": 500, + "unit": "ns", + "extra": "gctime=0\nmemory=5440\nallocs=19\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=identity, affine=false)(32 x 32)/forward/GPU/CUDA", + "value": 34079, + "unit": "ns", + "extra": "gctime=0\nmemory=4688\nallocs=156\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=identity, affine=false)(32 x 32)/forward/GPU/AMDGPU", + "value": 44423, + "unit": "ns", + "extra": "gctime=0\nmemory=6128\nallocs=151\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/2 thread(s)", + "value": 9479, + "unit": "ns", + "extra": "gctime=0\nmemory=40176\nallocs=86\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/4 thread(s)", + "value": 9750, + "unit": "ns", + "extra": "gctime=0\nmemory=40176\nallocs=86\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/8 thread(s)", + "value": 10333, + "unit": "ns", + "extra": "gctime=0\nmemory=40176\nallocs=86\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/1 thread(s)", + "value": 9562.5, + "unit": "ns", + "extra": "gctime=0\nmemory=40176\nallocs=86\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=identity, affine=false)(32 x 32)/zygote/GPU/CUDA", + "value": 262881, + "unit": "ns", + "extra": "gctime=0\nmemory=21712\nallocs=664\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=identity, affine=false)(32 x 32)/zygote/GPU/AMDGPU", + "value": 351422, + "unit": "ns", + "extra": "gctime=0\nmemory=48464\nallocs=1171\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(512, bias=false, act=identity)(512 x 128)/forward/CPU/2 thread(s)", + "value": 396583, + "unit": "ns", + "extra": "gctime=0\nmemory=262192\nallocs=2\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(512, bias=false, act=identity)(512 x 128)/forward/CPU/4 thread(s)", + "value": 288042, + "unit": "ns", + "extra": "gctime=0\nmemory=262192\nallocs=2\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(512, bias=false, act=identity)(512 x 128)/forward/CPU/8 thread(s)", + "value": 287666, + "unit": "ns", + "extra": "gctime=0\nmemory=262192\nallocs=2\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(512, bias=false, act=identity)(512 x 128)/forward/CPU/1 thread(s)", + "value": 756167, + "unit": "ns", + "extra": "gctime=0\nmemory=262192\nallocs=2\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(512, bias=false, act=identity)(512 x 128)/forward/GPU/CUDA", + "value": 112987, + "unit": "ns", + "extra": "gctime=0\nmemory=1168\nallocs=48\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(512, bias=false, act=identity)(512 x 128)/forward/GPU/AMDGPU", + "value": 77780.5, + "unit": "ns", + "extra": "gctime=0\nmemory=1600\nallocs=91\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(512, bias=false, act=identity)(512 x 128)/zygote/CPU/2 thread(s)", + "value": 1455709, + "unit": "ns", + "extra": "gctime=0\nmemory=1836144\nallocs=29\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(512, bias=false, act=identity)(512 x 128)/zygote/CPU/4 thread(s)", + "value": 1130291, + "unit": "ns", + "extra": "gctime=0\nmemory=1836144\nallocs=29\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(512, bias=false, act=identity)(512 x 128)/zygote/CPU/8 thread(s)", + "value": 1133250, + "unit": "ns", + "extra": "gctime=0\nmemory=1836144\nallocs=29\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(512, bias=false, act=identity)(512 x 128)/zygote/CPU/1 thread(s)", + "value": 2358000, + "unit": "ns", + "extra": "gctime=0\nmemory=1836144\nallocs=29\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(512, bias=false, act=identity)(512 x 128)/zygote/GPU/CUDA", + "value": 202802, + "unit": "ns", + "extra": "gctime=0\nmemory=12240\nallocs=440\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(512, bias=false, act=identity)(512 x 128)/zygote/GPU/AMDGPU", + "value": 268682, + "unit": "ns", + "extra": "gctime=0\nmemory=12720\nallocs=366\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/2 thread(s)", + "value": 7354.5, + "unit": "ns", + "extra": "gctime=0\nmemory=13728\nallocs=13\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/4 thread(s)", + "value": 8000, + "unit": "ns", + "extra": "gctime=0\nmemory=13728\nallocs=13\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/8 thread(s)", + "value": 8687.5, + "unit": "ns", + "extra": "gctime=0\nmemory=13728\nallocs=13\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/1 thread(s)", + "value": 7750, + "unit": "ns", + "extra": "gctime=0\nmemory=13728\nallocs=13\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=relu, affine=true)(32 x 32)/forward/GPU/CUDA", + "value": 137305, + "unit": "ns", + "extra": "gctime=0\nmemory=10912\nallocs=455\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=relu, affine=true)(32 x 32)/forward/GPU/AMDGPU", + "value": 64461, + "unit": "ns", + "extra": "gctime=0\nmemory=10320\nallocs=234\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/2 thread(s)", + "value": 12812.5, + "unit": "ns", + "extra": "gctime=0\nmemory=89840\nallocs=88\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/4 thread(s)", + "value": 15041.5, + "unit": "ns", + "extra": "gctime=0\nmemory=89840\nallocs=88\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/8 thread(s)", + "value": 15353.5, + "unit": "ns", + "extra": "gctime=0\nmemory=89840\nallocs=88\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/1 thread(s)", + "value": 12333.5, + "unit": "ns", + "extra": "gctime=0\nmemory=89840\nallocs=88\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=relu, affine=true)(32 x 32)/zygote/GPU/CUDA", + "value": 906003, + "unit": "ns", + "extra": "gctime=0\nmemory=68592\nallocs=2636\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=relu, affine=true)(32 x 32)/zygote/GPU/AMDGPU", + "value": 413373, + "unit": "ns", + "extra": "gctime=0\nmemory=59472\nallocs=1255\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s)", + "value": 26000, + "unit": "ns", + "extra": "gctime=0\nmemory=132624\nallocs=23\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s)", + "value": 27562.5, + "unit": "ns", + "extra": "gctime=0\nmemory=132624\nallocs=23\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s)", + "value": 27042, + "unit": "ns", + "extra": "gctime=0\nmemory=132624\nallocs=23\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s)", + "value": 26021, + "unit": "ns", + "extra": "gctime=0\nmemory=132624\nallocs=23\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/GPU/CUDA", + "value": 186382.5, + "unit": "ns", + "extra": "gctime=0\nmemory=13152\nallocs=479\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/GPU/AMDGPU", + "value": 146484, + "unit": "ns", + "extra": "gctime=0\nmemory=18384\nallocs=401\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s)", + "value": 146500, + "unit": "ns", + "extra": "gctime=0\nmemory=1061248\nallocs=142\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s)", + "value": 157750, + "unit": "ns", + "extra": "gctime=0\nmemory=1061248\nallocs=142\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s)", + "value": 129416, + "unit": "ns", + "extra": "gctime=0\nmemory=1061248\nallocs=142\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s)", + "value": 155812.5, + "unit": "ns", + "extra": "gctime=0\nmemory=1061248\nallocs=142\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/CUDA", + "value": 1016426, + "unit": "ns", + "extra": "gctime=0\nmemory=70912\nallocs=2543\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/AMDGPU", + "value": 551090, + "unit": "ns", + "extra": "gctime=0\nmemory=71040\nallocs=1431\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s)", + "value": 84667, + "unit": "ns", + "extra": "gctime=0\nmemory=394480\nallocs=25\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s)", + "value": 80167, + "unit": "ns", + "extra": "gctime=0\nmemory=394480\nallocs=25\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s)", + "value": 78063, + "unit": "ns", + "extra": "gctime=0\nmemory=394480\nallocs=25\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s)", + "value": 80521, + "unit": "ns", + "extra": "gctime=0\nmemory=394480\nallocs=25\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/CUDA", + "value": 190829, + "unit": "ns", + "extra": "gctime=0\nmemory=14112\nallocs=539\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/AMDGPU", + "value": 124858.5, + "unit": "ns", + "extra": "gctime=0\nmemory=18288\nallocs=395\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s)", + "value": 219479, + "unit": "ns", + "extra": "gctime=0\nmemory=2509296\nallocs=149\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s)", + "value": 281750, + "unit": "ns", + "extra": "gctime=0\nmemory=2509296\nallocs=149\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s)", + "value": 278146, + "unit": "ns", + "extra": "gctime=0\nmemory=2509296\nallocs=149\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s)", + "value": 320791.5, + "unit": "ns", + "extra": "gctime=0\nmemory=2509296\nallocs=149\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/CUDA", + "value": 1021778, + "unit": "ns", + "extra": "gctime=0\nmemory=75744\nallocs=2816\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/AMDGPU", + "value": 643542, + "unit": "ns", + "extra": "gctime=0\nmemory=76656\nallocs=1565\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/2 thread(s)", + "value": 13125, + "unit": "ns", + "extra": "gctime=0\nmemory=13728\nallocs=13\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/4 thread(s)", + "value": 13666.5, + "unit": "ns", + "extra": "gctime=0\nmemory=13728\nallocs=13\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/8 thread(s)", + "value": 14041.5, + "unit": "ns", + "extra": "gctime=0\nmemory=13728\nallocs=13\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/1 thread(s)", + "value": 13459, + "unit": "ns", + "extra": "gctime=0\nmemory=13728\nallocs=13\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=gelu, affine=true)(32 x 32)/forward/GPU/CUDA", + "value": 136741.5, + "unit": "ns", + "extra": "gctime=0\nmemory=10912\nallocs=455\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=gelu, affine=true)(32 x 32)/forward/GPU/AMDGPU", + "value": 226473, + "unit": "ns", + "extra": "gctime=0\nmemory=13632\nallocs=441\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/2 thread(s)", + "value": 27083.5, + "unit": "ns", + "extra": "gctime=0\nmemory=89840\nallocs=88\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/4 thread(s)", + "value": 26125, + "unit": "ns", + "extra": "gctime=0\nmemory=89840\nallocs=88\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/8 thread(s)", + "value": 27833.5, + "unit": "ns", + "extra": "gctime=0\nmemory=89840\nallocs=88\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/1 thread(s)", + "value": 26604.5, + "unit": "ns", + "extra": "gctime=0\nmemory=89840\nallocs=88\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=gelu, affine=true)(32 x 32)/zygote/GPU/CUDA", + "value": 919419, + "unit": "ns", + "extra": "gctime=0\nmemory=69488\nallocs=2670\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=gelu, affine=true)(32 x 32)/zygote/GPU/AMDGPU", + "value": 633979.5, + "unit": "ns", + "extra": "gctime=0\nmemory=60240\nallocs=1281\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/2 thread(s)", + "value": 14000, + "unit": "ns", + "extra": "gctime=0\nmemory=7344\nallocs=36\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/4 thread(s)", + "value": 14708.5, + "unit": "ns", + "extra": "gctime=0\nmemory=7344\nallocs=36\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/8 thread(s)", + "value": 17583.5, + "unit": "ns", + "extra": "gctime=0\nmemory=7344\nallocs=36\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/1 thread(s)", + "value": 14792, + "unit": "ns", + "extra": "gctime=0\nmemory=7344\nallocs=36\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=gelu, affine=true)(32 x 32)/forward/GPU/CUDA", + "value": 119245, + "unit": "ns", + "extra": "gctime=0\nmemory=10256\nallocs=270\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=gelu, affine=true)(32 x 32)/forward/GPU/AMDGPU", + "value": 233827, + "unit": "ns", + "extra": "gctime=0\nmemory=16192\nallocs=355\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/2 thread(s)", + "value": 26875, + "unit": "ns", + "extra": "gctime=0\nmemory=52368\nallocs=104\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/4 thread(s)", + "value": 25958.5, + "unit": "ns", + "extra": "gctime=0\nmemory=52368\nallocs=104\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/8 thread(s)", + "value": 26583, + "unit": "ns", + "extra": "gctime=0\nmemory=52368\nallocs=104\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/1 thread(s)", + "value": 26541, + "unit": "ns", + "extra": "gctime=0\nmemory=52368\nallocs=104\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=gelu, affine=true)(32 x 32)/zygote/GPU/CUDA", + "value": 676576, + "unit": "ns", + "extra": "gctime=0\nmemory=54152\nallocs=1507\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=gelu, affine=true)(32 x 32)/zygote/GPU/AMDGPU", + "value": 589361.5, + "unit": "ns", + "extra": "gctime=0\nmemory=64616\nallocs=1262\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s)", + "value": 182375, + "unit": "ns", + "extra": "gctime=0\nmemory=134288\nallocs=39\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s)", + "value": 183208, + "unit": "ns", + "extra": "gctime=0\nmemory=134288\nallocs=39\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s)", + "value": 185583, + "unit": "ns", + "extra": "gctime=0\nmemory=134288\nallocs=39\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s)", + "value": 183459, + "unit": "ns", + "extra": "gctime=0\nmemory=134288\nallocs=39\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/CUDA", + "value": 102955, + "unit": "ns", + "extra": "gctime=0\nmemory=9920\nallocs=246\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/AMDGPU", + "value": 232900.5, + "unit": "ns", + "extra": "gctime=0\nmemory=19440\nallocs=508\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s)", + "value": 583500, + "unit": "ns", + "extra": "gctime=0\nmemory=1064816\nallocs=111\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s)", + "value": 595083, + "unit": "ns", + "extra": "gctime=0\nmemory=1064816\nallocs=111\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s)", + "value": 597520.5, + "unit": "ns", + "extra": "gctime=0\nmemory=1064816\nallocs=111\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s)", + "value": 624167, + "unit": "ns", + "extra": "gctime=0\nmemory=1064816\nallocs=111\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/CUDA", + "value": 493717.5, + "unit": "ns", + "extra": "gctime=0\nmemory=48824\nallocs=1372\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/AMDGPU", + "value": 657463, + "unit": "ns", + "extra": "gctime=0\nmemory=58280\nallocs=1154\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/2 thread(s)", + "value": 6750, + "unit": "ns", + "extra": "gctime=0\nmemory=13728\nallocs=13\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/4 thread(s)", + "value": 7645.5, + "unit": "ns", + "extra": "gctime=0\nmemory=13728\nallocs=13\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/8 thread(s)", + "value": 8167, + "unit": "ns", + "extra": "gctime=0\nmemory=13728\nallocs=13\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/1 thread(s)", + "value": 7542, + "unit": "ns", + "extra": "gctime=0\nmemory=13728\nallocs=13\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=identity, affine=true)(32 x 32)/forward/GPU/CUDA", + "value": 135360, + "unit": "ns", + "extra": "gctime=0\nmemory=10912\nallocs=455\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=identity, affine=true)(32 x 32)/forward/GPU/AMDGPU", + "value": 62767, + "unit": "ns", + "extra": "gctime=0\nmemory=10896\nallocs=270\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/2 thread(s)", + "value": 15375, + "unit": "ns", + "extra": "gctime=0\nmemory=81136\nallocs=86\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/4 thread(s)", + "value": 14917, + "unit": "ns", + "extra": "gctime=0\nmemory=81136\nallocs=86\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/8 thread(s)", + "value": 16187.5, + "unit": "ns", + "extra": "gctime=0\nmemory=81136\nallocs=86\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/1 thread(s)", + "value": 15292, + "unit": "ns", + "extra": "gctime=0\nmemory=81136\nallocs=86\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=identity, affine=true)(32 x 32)/zygote/GPU/CUDA", + "value": 885601, + "unit": "ns", + "extra": "gctime=0\nmemory=65680\nallocs=2520\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=identity, affine=true)(32 x 32)/zygote/GPU/AMDGPU", + "value": 392428, + "unit": "ns", + "extra": "gctime=0\nmemory=57360\nallocs=1201\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(512, Bsize=4)/forward/CPU/2 thread(s)", + "value": 6153416.5, + "unit": "ns", + "extra": "gctime=0\nmemory=4194368\nallocs=2\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(512, Bsize=4)/forward/CPU/4 thread(s)", + "value": 6381624.5, + "unit": "ns", + "extra": "gctime=0\nmemory=4194368\nallocs=2\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(512, Bsize=4)/forward/CPU/8 thread(s)", + "value": 6371521, + "unit": "ns", + "extra": "gctime=0\nmemory=4194368\nallocs=2\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(512, Bsize=4)/forward/CPU/1 thread(s)", + "value": 11926500, + "unit": "ns", + "extra": "gctime=0\nmemory=4194368\nallocs=2\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(512, Bsize=4)/forward/GPU/CUDA", + "value": 346494, + "unit": "ns", + "extra": "gctime=0\nmemory=512\nallocs=23\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(512, Bsize=4)/forward/GPU/AMDGPU", + "value": 392843, + "unit": "ns", + "extra": "gctime=0\nmemory=5184\nallocs=276\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(512, Bsize=4)/zygote/CPU/2 thread(s)", + "value": 19117208.5, + "unit": "ns", + "extra": "gctime=0\nmemory=16778320\nallocs=31\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(512, Bsize=4)/zygote/CPU/4 thread(s)", + "value": 19977084, + "unit": "ns", + "extra": "gctime=0\nmemory=16778320\nallocs=31\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(512, Bsize=4)/zygote/CPU/8 thread(s)", + "value": 19957021, + "unit": "ns", + "extra": "gctime=0\nmemory=16778320\nallocs=31\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(512, Bsize=4)/zygote/CPU/1 thread(s)", + "value": 36558729, + "unit": "ns", + "extra": "gctime=0\nmemory=16778128\nallocs=29\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(512, Bsize=4)/zygote/GPU/CUDA", + "value": 1005649, + "unit": "ns", + "extra": "gctime=0\nmemory=10128\nallocs=369\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(512, Bsize=4)/zygote/GPU/AMDGPU", + "value": 1105996, + "unit": "ns", + "extra": "gctime=0\nmemory=20528\nallocs=604\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(2, bias=true, act=gelu)(2 x 128)/forward/CPU/2 thread(s)", + "value": 1750, + "unit": "ns", + "extra": "gctime=0\nmemory=1280\nallocs=5\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(2, bias=true, act=gelu)(2 x 128)/forward/CPU/4 thread(s)", + "value": 1834, + "unit": "ns", + "extra": "gctime=0\nmemory=1280\nallocs=5\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(2, bias=true, act=gelu)(2 x 128)/forward/CPU/8 thread(s)", + "value": 1833, + "unit": "ns", + "extra": "gctime=0\nmemory=1280\nallocs=5\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(2, bias=true, act=gelu)(2 x 128)/forward/CPU/1 thread(s)", + "value": 1834, + "unit": "ns", + "extra": "gctime=0\nmemory=1280\nallocs=5\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(2, bias=true, act=gelu)(2 x 128)/forward/GPU/CUDA", + "value": 23503, + "unit": "ns", + "extra": "gctime=0\nmemory=1056\nallocs=47\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(2, bias=true, act=gelu)(2 x 128)/forward/GPU/AMDGPU", + "value": 197739, + "unit": "ns", + "extra": "gctime=0\nmemory=3808\nallocs=190\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(2, bias=true, act=gelu)(2 x 128)/zygote/CPU/2 thread(s)", + "value": 4834, + "unit": "ns", + "extra": "gctime=0\nmemory=5776\nallocs=12\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(2, bias=true, act=gelu)(2 x 128)/zygote/CPU/4 thread(s)", + "value": 4958, + "unit": "ns", + "extra": "gctime=0\nmemory=5776\nallocs=12\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(2, bias=true, act=gelu)(2 x 128)/zygote/CPU/8 thread(s)", + "value": 4917, + "unit": "ns", + "extra": "gctime=0\nmemory=5776\nallocs=12\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(2, bias=true, act=gelu)(2 x 128)/zygote/CPU/1 thread(s)", + "value": 4916, + "unit": "ns", + "extra": "gctime=0\nmemory=5776\nallocs=12\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(2, bias=true, act=gelu)(2 x 128)/zygote/GPU/CUDA", + "value": 276337.5, + "unit": "ns", + "extra": "gctime=0\nmemory=16136\nallocs=603\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(2, bias=true, act=gelu)(2 x 128)/zygote/GPU/AMDGPU", + "value": 502208, + "unit": "ns", + "extra": "gctime=0\nmemory=16456\nallocs=425\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/2 thread(s)", + "value": 8062.5, + "unit": "ns", + "extra": "gctime=0\nmemory=3616\nallocs=36\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/4 thread(s)", + "value": 8416, + "unit": "ns", + "extra": "gctime=0\nmemory=3616\nallocs=36\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/8 thread(s)", + "value": 9459, + "unit": "ns", + "extra": "gctime=0\nmemory=3616\nallocs=36\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/1 thread(s)", + "value": 8145.5, + "unit": "ns", + "extra": "gctime=0\nmemory=3616\nallocs=36\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=relu, affine=true)(4 x 32)/forward/GPU/CUDA", + "value": 115989, + "unit": "ns", + "extra": "gctime=0\nmemory=10128\nallocs=262\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=relu, affine=true)(4 x 32)/forward/GPU/AMDGPU", + "value": 71584, + "unit": "ns", + "extra": "gctime=0\nmemory=15344\nallocs=302\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/2 thread(s)", + "value": 11562.5, + "unit": "ns", + "extra": "gctime=0\nmemory=34432\nallocs=124\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/4 thread(s)", + "value": 12438, + "unit": "ns", + "extra": "gctime=0\nmemory=34432\nallocs=124\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/8 thread(s)", + "value": 12541, + "unit": "ns", + "extra": "gctime=0\nmemory=34432\nallocs=124\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/1 thread(s)", + "value": 12875, + "unit": "ns", + "extra": "gctime=0\nmemory=34432\nallocs=124\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=relu, affine=true)(4 x 32)/zygote/GPU/CUDA", + "value": 604320, + "unit": "ns", + "extra": "gctime=0\nmemory=64024\nallocs=1407\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=relu, affine=true)(4 x 32)/zygote/GPU/AMDGPU", + "value": 353160, + "unit": "ns", + "extra": "gctime=0\nmemory=73688\nallocs=1155\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(2, bias=false, act=relu)(2 x 128)/forward/CPU/2 thread(s)", + "value": 250, + "unit": "ns", + "extra": "gctime=0\nmemory=1088\nallocs=1\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(2, bias=false, act=relu)(2 x 128)/forward/CPU/4 thread(s)", + "value": 333, + "unit": "ns", + "extra": "gctime=0\nmemory=1088\nallocs=1\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(2, bias=false, act=relu)(2 x 128)/forward/CPU/8 thread(s)", + "value": 333, + "unit": "ns", + "extra": "gctime=0\nmemory=1088\nallocs=1\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(2, bias=false, act=relu)(2 x 128)/forward/CPU/1 thread(s)", + "value": 292, + "unit": "ns", + "extra": "gctime=0\nmemory=1088\nallocs=1\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(2, bias=false, act=relu)(2 x 128)/forward/GPU/CUDA", + "value": 22648, + "unit": "ns", + "extra": "gctime=0\nmemory=960\nallocs=41\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(2, bias=false, act=relu)(2 x 128)/forward/GPU/AMDGPU", + "value": 43592, + "unit": "ns", + "extra": "gctime=0\nmemory=1216\nallocs=31\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(2, bias=false, act=relu)(2 x 128)/zygote/CPU/2 thread(s)", + "value": 2917, + "unit": "ns", + "extra": "gctime=0\nmemory=5440\nallocs=26\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(2, bias=false, act=relu)(2 x 128)/zygote/CPU/4 thread(s)", + "value": 2917, + "unit": "ns", + "extra": "gctime=0\nmemory=5440\nallocs=26\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(2, bias=false, act=relu)(2 x 128)/zygote/CPU/8 thread(s)", + "value": 3041, + "unit": "ns", + "extra": "gctime=0\nmemory=5440\nallocs=26\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(2, bias=false, act=relu)(2 x 128)/zygote/CPU/1 thread(s)", + "value": 3000, + "unit": "ns", + "extra": "gctime=0\nmemory=5440\nallocs=26\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(2, bias=false, act=relu)(2 x 128)/zygote/GPU/CUDA", + "value": 197848, + "unit": "ns", + "extra": "gctime=0\nmemory=12280\nallocs=456\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(2, bias=false, act=relu)(2 x 128)/zygote/GPU/AMDGPU", + "value": 146363.5, + "unit": "ns", + "extra": "gctime=0\nmemory=10136\nallocs=234\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/2 thread(s)", + "value": 14604, + "unit": "ns", + "extra": "gctime=0\nmemory=7088\nallocs=32\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/4 thread(s)", + "value": 15458.5, + "unit": "ns", + "extra": "gctime=0\nmemory=7088\nallocs=32\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/8 thread(s)", + "value": 15896, + "unit": "ns", + "extra": "gctime=0\nmemory=7088\nallocs=32\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/1 thread(s)", + "value": 15000.5, + "unit": "ns", + "extra": "gctime=0\nmemory=7088\nallocs=32\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=gelu, affine=false)(32 x 32)/forward/GPU/CUDA", + "value": 117481, + "unit": "ns", + "extra": "gctime=0\nmemory=9568\nallocs=246\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=gelu, affine=false)(32 x 32)/forward/GPU/AMDGPU", + "value": 236802, + "unit": "ns", + "extra": "gctime=0\nmemory=18304\nallocs=512\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/2 thread(s)", + "value": 26500, + "unit": "ns", + "extra": "gctime=0\nmemory=48912\nallocs=92\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/4 thread(s)", + "value": 25625, + "unit": "ns", + "extra": "gctime=0\nmemory=48912\nallocs=92\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/8 thread(s)", + "value": 26041.5, + "unit": "ns", + "extra": "gctime=0\nmemory=48912\nallocs=92\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/1 thread(s)", + "value": 25958, + "unit": "ns", + "extra": "gctime=0\nmemory=48912\nallocs=92\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=gelu, affine=false)(32 x 32)/zygote/GPU/CUDA", + "value": 561217, + "unit": "ns", + "extra": "gctime=0\nmemory=47496\nallocs=1372\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=gelu, affine=false)(32 x 32)/zygote/GPU/AMDGPU", + "value": 566814, + "unit": "ns", + "extra": "gctime=0\nmemory=55000\nallocs=1105\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(32, bias=true, act=relu)(32 x 128)/forward/CPU/2 thread(s)", + "value": 4291, + "unit": "ns", + "extra": "gctime=0\nmemory=16832\nallocs=5\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(32, bias=true, act=relu)(32 x 128)/forward/CPU/4 thread(s)", + "value": 4209, + "unit": "ns", + "extra": "gctime=0\nmemory=16832\nallocs=5\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(32, bias=true, act=relu)(32 x 128)/forward/CPU/8 thread(s)", + "value": 4208, + "unit": "ns", + "extra": "gctime=0\nmemory=16832\nallocs=5\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(32, bias=true, act=relu)(32 x 128)/forward/CPU/1 thread(s)", + "value": 4375, + "unit": "ns", + "extra": "gctime=0\nmemory=16832\nallocs=5\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(32, bias=true, act=relu)(32 x 128)/forward/GPU/CUDA", + "value": 24363, + "unit": "ns", + "extra": "gctime=0\nmemory=1056\nallocs=47\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(32, bias=true, act=relu)(32 x 128)/forward/GPU/AMDGPU", + "value": 44754, + "unit": "ns", + "extra": "gctime=0\nmemory=1488\nallocs=45\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(32, bias=true, act=relu)(32 x 128)/zygote/CPU/2 thread(s)", + "value": 16250, + "unit": "ns", + "extra": "gctime=0\nmemory=71488\nallocs=15\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(32, bias=true, act=relu)(32 x 128)/zygote/CPU/4 thread(s)", + "value": 16125, + "unit": "ns", + "extra": "gctime=0\nmemory=71488\nallocs=15\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(32, bias=true, act=relu)(32 x 128)/zygote/CPU/8 thread(s)", + "value": 16292, + "unit": "ns", + "extra": "gctime=0\nmemory=71488\nallocs=15\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(32, bias=true, act=relu)(32 x 128)/zygote/CPU/1 thread(s)", + "value": 16416, + "unit": "ns", + "extra": "gctime=0\nmemory=71488\nallocs=15\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(32, bias=true, act=relu)(32 x 128)/zygote/GPU/CUDA", + "value": 321227, + "unit": "ns", + "extra": "gctime=0\nmemory=15592\nallocs=561\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(32, bias=true, act=relu)(32 x 128)/zygote/GPU/AMDGPU", + "value": 190786, + "unit": "ns", + "extra": "gctime=0\nmemory=17352\nallocs=412\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/2 thread(s)", + "value": 5916, + "unit": "ns", + "extra": "gctime=0\nmemory=5440\nallocs=19\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/4 thread(s)", + "value": 5875, + "unit": "ns", + "extra": "gctime=0\nmemory=5440\nallocs=19\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/8 thread(s)", + "value": 5792, + "unit": "ns", + "extra": "gctime=0\nmemory=5440\nallocs=19\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/1 thread(s)", + "value": 5750, + "unit": "ns", + "extra": "gctime=0\nmemory=5440\nallocs=19\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=gelu, affine=false)(32 x 32)/forward/GPU/CUDA", + "value": 34700.5, + "unit": "ns", + "extra": "gctime=0\nmemory=4688\nallocs=156\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=gelu, affine=false)(32 x 32)/forward/GPU/AMDGPU", + "value": 200434, + "unit": "ns", + "extra": "gctime=0\nmemory=7520\nallocs=238\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/2 thread(s)", + "value": 22292, + "unit": "ns", + "extra": "gctime=0\nmemory=48880\nallocs=88\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/4 thread(s)", + "value": 21292, + "unit": "ns", + "extra": "gctime=0\nmemory=48880\nallocs=88\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/8 thread(s)", + "value": 21792, + "unit": "ns", + "extra": "gctime=0\nmemory=48880\nallocs=88\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/1 thread(s)", + "value": 22208, + "unit": "ns", + "extra": "gctime=0\nmemory=48880\nallocs=88\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=gelu, affine=false)(32 x 32)/zygote/GPU/CUDA", + "value": 283315.5, + "unit": "ns", + "extra": "gctime=0\nmemory=24952\nallocs=810\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=gelu, affine=false)(32 x 32)/zygote/GPU/AMDGPU", + "value": 598489, + "unit": "ns", + "extra": "gctime=0\nmemory=50008\nallocs=1214\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(16, Bsize=512)/forward/CPU/2 thread(s)", + "value": 59729, + "unit": "ns", + "extra": "gctime=0\nmemory=524352\nallocs=2\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(16, Bsize=512)/forward/CPU/4 thread(s)", + "value": 64229, + "unit": "ns", + "extra": "gctime=0\nmemory=524352\nallocs=2\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(16, Bsize=512)/forward/CPU/8 thread(s)", + "value": 66833, + "unit": "ns", + "extra": "gctime=0\nmemory=524352\nallocs=2\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(16, Bsize=512)/forward/CPU/1 thread(s)", + "value": 50958, + "unit": "ns", + "extra": "gctime=0\nmemory=524352\nallocs=2\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(16, Bsize=512)/forward/GPU/CUDA", + "value": 66908, + "unit": "ns", + "extra": "gctime=0\nmemory=512\nallocs=23\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(16, Bsize=512)/forward/GPU/AMDGPU", + "value": 115781, + "unit": "ns", + "extra": "gctime=0\nmemory=13904\nallocs=47\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(16, Bsize=512)/zygote/CPU/2 thread(s)", + "value": 198937.5, + "unit": "ns", + "extra": "gctime=0\nmemory=2098256\nallocs=31\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(16, Bsize=512)/zygote/CPU/4 thread(s)", + "value": 144625, + "unit": "ns", + "extra": "gctime=0\nmemory=2098256\nallocs=31\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(16, Bsize=512)/zygote/CPU/8 thread(s)", + "value": 167291.5, + "unit": "ns", + "extra": "gctime=0\nmemory=2098256\nallocs=31\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(16, Bsize=512)/zygote/CPU/1 thread(s)", + "value": 303249.5, + "unit": "ns", + "extra": "gctime=0\nmemory=2098064\nallocs=29\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(16, Bsize=512)/zygote/GPU/CUDA", + "value": 208882.5, + "unit": "ns", + "extra": "gctime=0\nmemory=10112\nallocs=368\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(16, Bsize=512)/zygote/GPU/AMDGPU", + "value": 529218, + "unit": "ns", + "extra": "gctime=0\nmemory=55504\nallocs=468\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s)", + "value": 84291, + "unit": "ns", + "extra": "gctime=0\nmemory=1051792\nallocs=39\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s)", + "value": 83875, + "unit": "ns", + "extra": "gctime=0\nmemory=1051792\nallocs=39\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s)", + "value": 88125, + "unit": "ns", + "extra": "gctime=0\nmemory=1051792\nallocs=39\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s)", + "value": 81562.5, + "unit": "ns", + "extra": "gctime=0\nmemory=1051792\nallocs=39\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/GPU/CUDA", + "value": 193291, + "unit": "ns", + "extra": "gctime=0\nmemory=9936\nallocs=247\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/GPU/AMDGPU", + "value": 182771, + "unit": "ns", + "extra": "gctime=0\nmemory=24512\nallocs=531\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s)", + "value": 1875250, + "unit": "ns", + "extra": "gctime=0\nmemory=6318480\nallocs=127\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s)", + "value": 1914792, + "unit": "ns", + "extra": "gctime=0\nmemory=6318480\nallocs=127\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s)", + "value": 1928375, + "unit": "ns", + "extra": "gctime=0\nmemory=6318480\nallocs=127\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s)", + "value": 1916625, + "unit": "ns", + "extra": "gctime=0\nmemory=6318480\nallocs=127\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/CUDA", + "value": 505449, + "unit": "ns", + "extra": "gctime=0\nmemory=57088\nallocs=1261\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/AMDGPU", + "value": 857542, + "unit": "ns", + "extra": "gctime=0\nmemory=85024\nallocs=1498\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(2, bias=true, act=identity)(2 x 128)/forward/CPU/2 thread(s)", + "value": 292, + "unit": "ns", + "extra": "gctime=0\nmemory=1088\nallocs=1\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(2, bias=true, act=identity)(2 x 128)/forward/CPU/4 thread(s)", + "value": 292, + "unit": "ns", + "extra": "gctime=0\nmemory=1088\nallocs=1\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(2, bias=true, act=identity)(2 x 128)/forward/CPU/8 thread(s)", + "value": 292, + "unit": "ns", + "extra": "gctime=0\nmemory=1088\nallocs=1\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(2, bias=true, act=identity)(2 x 128)/forward/CPU/1 thread(s)", + "value": 333, + "unit": "ns", + "extra": "gctime=0\nmemory=1088\nallocs=1\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(2, bias=true, act=identity)(2 x 128)/forward/GPU/CUDA", + "value": 21535, + "unit": "ns", + "extra": "gctime=0\nmemory=1056\nallocs=47\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(2, bias=true, act=identity)(2 x 128)/forward/GPU/AMDGPU", + "value": 36788, + "unit": "ns", + "extra": "gctime=0\nmemory=1760\nallocs=65\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(2, bias=true, act=identity)(2 x 128)/zygote/CPU/2 thread(s)", + "value": 1833, + "unit": "ns", + "extra": "gctime=0\nmemory=3584\nallocs=9\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(2, bias=true, act=identity)(2 x 128)/zygote/CPU/4 thread(s)", + "value": 1875, + "unit": "ns", + "extra": "gctime=0\nmemory=3584\nallocs=9\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(2, bias=true, act=identity)(2 x 128)/zygote/CPU/8 thread(s)", + "value": 1834, + "unit": "ns", + "extra": "gctime=0\nmemory=3584\nallocs=9\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(2, bias=true, act=identity)(2 x 128)/zygote/CPU/1 thread(s)", + "value": 1834, + "unit": "ns", + "extra": "gctime=0\nmemory=3584\nallocs=9\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(2, bias=true, act=identity)(2 x 128)/zygote/GPU/CUDA", + "value": 243998, + "unit": "ns", + "extra": "gctime=0\nmemory=11856\nallocs=422\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(2, bias=true, act=identity)(2 x 128)/zygote/GPU/AMDGPU", + "value": 166221, + "unit": "ns", + "extra": "gctime=0\nmemory=12960\nallocs=287\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/2 thread(s)", + "value": 11229, + "unit": "ns", + "extra": "gctime=0\nmemory=3360\nallocs=32\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/4 thread(s)", + "value": 9791.5, + "unit": "ns", + "extra": "gctime=0\nmemory=3360\nallocs=32\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/8 thread(s)", + "value": 11125, + "unit": "ns", + "extra": "gctime=0\nmemory=3360\nallocs=32\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/1 thread(s)", + "value": 10479.5, + "unit": "ns", + "extra": "gctime=0\nmemory=3360\nallocs=32\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=gelu, affine=false)(4 x 32)/forward/GPU/CUDA", + "value": 114440.5, + "unit": "ns", + "extra": "gctime=0\nmemory=9440\nallocs=238\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=gelu, affine=false)(4 x 32)/forward/GPU/AMDGPU", + "value": 233386, + "unit": "ns", + "extra": "gctime=0\nmemory=17760\nallocs=478\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/2 thread(s)", + "value": 10458, + "unit": "ns", + "extra": "gctime=0\nmemory=18896\nallocs=92\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/4 thread(s)", + "value": 10250, + "unit": "ns", + "extra": "gctime=0\nmemory=18896\nallocs=92\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/8 thread(s)", + "value": 9917, + "unit": "ns", + "extra": "gctime=0\nmemory=18896\nallocs=92\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/1 thread(s)", + "value": 10145.5, + "unit": "ns", + "extra": "gctime=0\nmemory=18896\nallocs=92\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=gelu, affine=false)(4 x 32)/zygote/GPU/CUDA", + "value": 491014, + "unit": "ns", + "extra": "gctime=0\nmemory=45656\nallocs=1308\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=gelu, affine=false)(4 x 32)/zygote/GPU/AMDGPU", + "value": 561274, + "unit": "ns", + "extra": "gctime=0\nmemory=51672\nallocs=1024\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s)", + "value": 58375, + "unit": "ns", + "extra": "gctime=0\nmemory=1049824\nallocs=20\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s)", + "value": 46917, + "unit": "ns", + "extra": "gctime=0\nmemory=1049824\nallocs=20\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s)", + "value": 46625, + "unit": "ns", + "extra": "gctime=0\nmemory=1049824\nallocs=20\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s)", + "value": 83708, + "unit": "ns", + "extra": "gctime=0\nmemory=1049824\nallocs=20\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/GPU/CUDA", + "value": 38960, + "unit": "ns", + "extra": "gctime=0\nmemory=3376\nallocs=109\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/GPU/AMDGPU", + "value": 72876, + "unit": "ns", + "extra": "gctime=0\nmemory=6672\nallocs=172\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s)", + "value": 1897625, + "unit": "ns", + "extra": "gctime=0\nmemory=6307920\nallocs=107\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s)", + "value": 1964750, + "unit": "ns", + "extra": "gctime=0\nmemory=6307920\nallocs=107\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s)", + "value": 1985854, + "unit": "ns", + "extra": "gctime=0\nmemory=6307920\nallocs=107\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s)", + "value": 1899833, + "unit": "ns", + "extra": "gctime=0\nmemory=6307920\nallocs=107\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/CUDA", + "value": 212091, + "unit": "ns", + "extra": "gctime=0\nmemory=19888\nallocs=571\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/AMDGPU", + "value": 994598, + "unit": "ns", + "extra": "gctime=0\nmemory=82016\nallocs=1907\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s)", + "value": 266354, + "unit": "ns", + "extra": "gctime=0\nmemory=394480\nallocs=25\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s)", + "value": 269729, + "unit": "ns", + "extra": "gctime=0\nmemory=394480\nallocs=25\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s)", + "value": 271041.5, + "unit": "ns", + "extra": "gctime=0\nmemory=394480\nallocs=25\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s)", + "value": 268271, + "unit": "ns", + "extra": "gctime=0\nmemory=394480\nallocs=25\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/CUDA", + "value": 193629.5, + "unit": "ns", + "extra": "gctime=0\nmemory=14112\nallocs=539\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/AMDGPU", + "value": 271156, + "unit": "ns", + "extra": "gctime=0\nmemory=19504\nallocs=471\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s)", + "value": 693917, + "unit": "ns", + "extra": "gctime=0\nmemory=2509296\nallocs=149\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s)", + "value": 692541, + "unit": "ns", + "extra": "gctime=0\nmemory=2509296\nallocs=149\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s)", + "value": 687708, + "unit": "ns", + "extra": "gctime=0\nmemory=2509296\nallocs=149\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s)", + "value": 593833, + "unit": "ns", + "extra": "gctime=0\nmemory=2509296\nallocs=149\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/CUDA", + "value": 991006, + "unit": "ns", + "extra": "gctime=0\nmemory=76640\nallocs=2850\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/AMDGPU", + "value": 863163, + "unit": "ns", + "extra": "gctime=0\nmemory=76912\nallocs=1563\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s)", + "value": 2180687.5, + "unit": "ns", + "extra": "gctime=0\nmemory=3146992\nallocs=25\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s)", + "value": 2214917, + "unit": "ns", + "extra": "gctime=0\nmemory=3146992\nallocs=25\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s)", + "value": 2212041, + "unit": "ns", + "extra": "gctime=0\nmemory=3146992\nallocs=25\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s)", + "value": 2208479, + "unit": "ns", + "extra": "gctime=0\nmemory=3146992\nallocs=25\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/CUDA", + "value": 154859, + "unit": "ns", + "extra": "gctime=0\nmemory=14112\nallocs=539\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/AMDGPU", + "value": 451844.5, + "unit": "ns", + "extra": "gctime=0\nmemory=18688\nallocs=420\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s)", + "value": 5453666, + "unit": "ns", + "extra": "gctime=0\nmemory=19998832\nallocs=151\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s)", + "value": 5518208, + "unit": "ns", + "extra": "gctime=0\nmemory=19998832\nallocs=151\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s)", + "value": 5522375, + "unit": "ns", + "extra": "gctime=0\nmemory=19998832\nallocs=151\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s)", + "value": 5522209, + "unit": "ns", + "extra": "gctime=0\nmemory=19998832\nallocs=151\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/CUDA", + "value": 930442, + "unit": "ns", + "extra": "gctime=0\nmemory=76640\nallocs=2850\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/AMDGPU", + "value": 1495900, + "unit": "ns", + "extra": "gctime=0\nmemory=80368\nallocs=1632\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(512, bias=true, act=gelu)(512 x 128)/forward/CPU/2 thread(s)", + "value": 999875, + "unit": "ns", + "extra": "gctime=0\nmemory=262384\nallocs=6\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(512, bias=true, act=gelu)(512 x 128)/forward/CPU/4 thread(s)", + "value": 913333, + "unit": "ns", + "extra": "gctime=0\nmemory=262384\nallocs=6\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(512, bias=true, act=gelu)(512 x 128)/forward/CPU/8 thread(s)", + "value": 912895.5, + "unit": "ns", + "extra": "gctime=0\nmemory=262384\nallocs=6\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(512, bias=true, act=gelu)(512 x 128)/forward/CPU/1 thread(s)", + "value": 1334562.5, + "unit": "ns", + "extra": "gctime=0\nmemory=262384\nallocs=6\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(512, bias=true, act=gelu)(512 x 128)/forward/GPU/CUDA", + "value": 46425, + "unit": "ns", + "extra": "gctime=0\nmemory=1056\nallocs=47\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(512, bias=true, act=gelu)(512 x 128)/forward/GPU/AMDGPU", + "value": 399125, + "unit": "ns", + "extra": "gctime=0\nmemory=8320\nallocs=330\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(512, bias=true, act=gelu)(512 x 128)/zygote/CPU/2 thread(s)", + "value": 2620166, + "unit": "ns", + "extra": "gctime=0\nmemory=2362080\nallocs=18\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(512, bias=true, act=gelu)(512 x 128)/zygote/CPU/4 thread(s)", + "value": 2328541, + "unit": "ns", + "extra": "gctime=0\nmemory=2362080\nallocs=18\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(512, bias=true, act=gelu)(512 x 128)/zygote/CPU/8 thread(s)", + "value": 2329395.5, + "unit": "ns", + "extra": "gctime=0\nmemory=2362080\nallocs=18\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(512, bias=true, act=gelu)(512 x 128)/zygote/CPU/1 thread(s)", + "value": 3468667, + "unit": "ns", + "extra": "gctime=0\nmemory=2362080\nallocs=18\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(512, bias=true, act=gelu)(512 x 128)/zygote/GPU/CUDA", + "value": 247327, + "unit": "ns", + "extra": "gctime=0\nmemory=18040\nallocs=671\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(512, bias=true, act=gelu)(512 x 128)/zygote/GPU/AMDGPU", + "value": 658089, + "unit": "ns", + "extra": "gctime=0\nmemory=19768\nallocs=517\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s)", + "value": 58083, + "unit": "ns", + "extra": "gctime=0\nmemory=1049824\nallocs=20\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s)", + "value": 46625, + "unit": "ns", + "extra": "gctime=0\nmemory=1049824\nallocs=20\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s)", + "value": 46542, + "unit": "ns", + "extra": "gctime=0\nmemory=1049824\nallocs=20\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s)", + "value": 84000, + "unit": "ns", + "extra": "gctime=0\nmemory=1049824\nallocs=20\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/CUDA", + "value": 29007, + "unit": "ns", + "extra": "gctime=0\nmemory=2800\nallocs=95\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/AMDGPU", + "value": 73392, + "unit": "ns", + "extra": "gctime=0\nmemory=7056\nallocs=196\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s)", + "value": 2036000, + "unit": "ns", + "extra": "gctime=0\nmemory=7356560\nallocs=109\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s)", + "value": 2096916, + "unit": "ns", + "extra": "gctime=0\nmemory=7356560\nallocs=109\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s)", + "value": 2092208, + "unit": "ns", + "extra": "gctime=0\nmemory=7356560\nallocs=109\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s)", + "value": 1992542, + "unit": "ns", + "extra": "gctime=0\nmemory=7356560\nallocs=109\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/CUDA", + "value": 225482, + "unit": "ns", + "extra": "gctime=0\nmemory=21400\nallocs=648\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/AMDGPU", + "value": 1028937.5, + "unit": "ns", + "extra": "gctime=0\nmemory=83816\nallocs=1975\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s)", + "value": 58417, + "unit": "ns", + "extra": "gctime=0\nmemory=1049824\nallocs=20\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s)", + "value": 47208, + "unit": "ns", + "extra": "gctime=0\nmemory=1049824\nallocs=20\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s)", + "value": 47375, + "unit": "ns", + "extra": "gctime=0\nmemory=1049824\nallocs=20\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s)", + "value": 83541, + "unit": "ns", + "extra": "gctime=0\nmemory=1049824\nallocs=20\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/GPU/CUDA", + "value": 48550, + "unit": "ns", + "extra": "gctime=0\nmemory=4288\nallocs=142\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/GPU/AMDGPU", + "value": 71593.5, + "unit": "ns", + "extra": "gctime=0\nmemory=6784\nallocs=189\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s)", + "value": 1926354.5, + "unit": "ns", + "extra": "gctime=0\nmemory=6307456\nallocs=105\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s)", + "value": 1987291, + "unit": "ns", + "extra": "gctime=0\nmemory=6307456\nallocs=105\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s)", + "value": 1972375, + "unit": "ns", + "extra": "gctime=0\nmemory=6307456\nallocs=105\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s)", + "value": 1890375, + "unit": "ns", + "extra": "gctime=0\nmemory=6307456\nallocs=105\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/CUDA", + "value": 231977, + "unit": "ns", + "extra": "gctime=0\nmemory=21200\nallocs=635\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/AMDGPU", + "value": 931260, + "unit": "ns", + "extra": "gctime=0\nmemory=79456\nallocs=1646\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/2 thread(s)", + "value": 292, + "unit": "ns", + "extra": "gctime=0\nmemory=1440\nallocs=19\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/4 thread(s)", + "value": 375, + "unit": "ns", + "extra": "gctime=0\nmemory=1440\nallocs=19\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/8 thread(s)", + "value": 375, + "unit": "ns", + "extra": "gctime=0\nmemory=1440\nallocs=19\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/1 thread(s)", + "value": 333, + "unit": "ns", + "extra": "gctime=0\nmemory=1440\nallocs=19\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=identity, affine=false)(4 x 32)/forward/GPU/CUDA", + "value": 33752, + "unit": "ns", + "extra": "gctime=0\nmemory=4688\nallocs=156\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=identity, affine=false)(4 x 32)/forward/GPU/AMDGPU", + "value": 44343, + "unit": "ns", + "extra": "gctime=0\nmemory=6416\nallocs=169\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/2 thread(s)", + "value": 6542, + "unit": "ns", + "extra": "gctime=0\nmemory=16512\nallocs=86\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/4 thread(s)", + "value": 7187.5, + "unit": "ns", + "extra": "gctime=0\nmemory=16512\nallocs=86\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/8 thread(s)", + "value": 7625, + "unit": "ns", + "extra": "gctime=0\nmemory=16512\nallocs=86\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/1 thread(s)", + "value": 6209, + "unit": "ns", + "extra": "gctime=0\nmemory=16512\nallocs=86\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=identity, affine=false)(4 x 32)/zygote/GPU/CUDA", + "value": 203191.5, + "unit": "ns", + "extra": "gctime=0\nmemory=20128\nallocs=616\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=identity, affine=false)(4 x 32)/zygote/GPU/AMDGPU", + "value": 350064, + "unit": "ns", + "extra": "gctime=0\nmemory=46048\nallocs=1135\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(2, bias=false, act=identity)(2 x 128)/forward/CPU/2 thread(s)", + "value": 250, + "unit": "ns", + "extra": "gctime=0\nmemory=1088\nallocs=1\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(2, bias=false, act=identity)(2 x 128)/forward/CPU/4 thread(s)", + "value": 292, + "unit": "ns", + "extra": "gctime=0\nmemory=1088\nallocs=1\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(2, bias=false, act=identity)(2 x 128)/forward/CPU/8 thread(s)", + "value": 292, + "unit": "ns", + "extra": "gctime=0\nmemory=1088\nallocs=1\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(2, bias=false, act=identity)(2 x 128)/forward/CPU/1 thread(s)", + "value": 292, + "unit": "ns", + "extra": "gctime=0\nmemory=1088\nallocs=1\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(2, bias=false, act=identity)(2 x 128)/forward/GPU/CUDA", + "value": 32755, + "unit": "ns", + "extra": "gctime=0\nmemory=1008\nallocs=38\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(2, bias=false, act=identity)(2 x 128)/forward/GPU/AMDGPU", + "value": 36558, + "unit": "ns", + "extra": "gctime=0\nmemory=1040\nallocs=56\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(2, bias=false, act=identity)(2 x 128)/zygote/CPU/2 thread(s)", + "value": 3375, + "unit": "ns", + "extra": "gctime=0\nmemory=4288\nallocs=25\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(2, bias=false, act=identity)(2 x 128)/zygote/CPU/4 thread(s)", + "value": 3333, + "unit": "ns", + "extra": "gctime=0\nmemory=4288\nallocs=25\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(2, bias=false, act=identity)(2 x 128)/zygote/CPU/8 thread(s)", + "value": 3000, + "unit": "ns", + "extra": "gctime=0\nmemory=4288\nallocs=25\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(2, bias=false, act=identity)(2 x 128)/zygote/CPU/1 thread(s)", + "value": 3208, + "unit": "ns", + "extra": "gctime=0\nmemory=4288\nallocs=25\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(2, bias=false, act=identity)(2 x 128)/zygote/GPU/CUDA", + "value": 185298.5, + "unit": "ns", + "extra": "gctime=0\nmemory=10176\nallocs=362\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(2, bias=false, act=identity)(2 x 128)/zygote/GPU/AMDGPU", + "value": 144480, + "unit": "ns", + "extra": "gctime=0\nmemory=8704\nallocs=230\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s)", + "value": 1465479.5, + "unit": "ns", + "extra": "gctime=0\nmemory=1052048\nallocs=43\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s)", + "value": 1410667, + "unit": "ns", + "extra": "gctime=0\nmemory=1052048\nallocs=43\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s)", + "value": 1427770.5, + "unit": "ns", + "extra": "gctime=0\nmemory=1052048\nallocs=43\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s)", + "value": 1410417, + "unit": "ns", + "extra": "gctime=0\nmemory=1052048\nallocs=43\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/CUDA", + "value": 136084, + "unit": "ns", + "extra": "gctime=0\nmemory=10624\nallocs=271\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/AMDGPU", + "value": 354201, + "unit": "ns", + "extra": "gctime=0\nmemory=30528\nallocs=744\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s)", + "value": 5012687.5, + "unit": "ns", + "extra": "gctime=0\nmemory=8408304\nallocs=123\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s)", + "value": 5023959, + "unit": "ns", + "extra": "gctime=0\nmemory=8408304\nallocs=123\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s)", + "value": 5034167, + "unit": "ns", + "extra": "gctime=0\nmemory=8408304\nallocs=123\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s)", + "value": 5021667, + "unit": "ns", + "extra": "gctime=0\nmemory=8408304\nallocs=123\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/CUDA", + "value": 673868, + "unit": "ns", + "extra": "gctime=0\nmemory=58744\nallocs=1605\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/AMDGPU", + "value": 1145811, + "unit": "ns", + "extra": "gctime=0\nmemory=92520\nallocs=1865\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(512, Bsize=32)/forward/CPU/2 thread(s)", + "value": 49876625, + "unit": "ns", + "extra": "gctime=0\nmemory=33554496\nallocs=2\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(512, Bsize=32)/forward/CPU/4 thread(s)", + "value": 35509791, + "unit": "ns", + "extra": "gctime=0\nmemory=33554496\nallocs=2\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(512, Bsize=32)/forward/CPU/8 thread(s)", + "value": 35514916, + "unit": "ns", + "extra": "gctime=0\nmemory=33554496\nallocs=2\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(512, Bsize=32)/forward/CPU/1 thread(s)", + "value": 97103375, + "unit": "ns", + "extra": "gctime=0\nmemory=33554496\nallocs=2\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(512, Bsize=32)/forward/GPU/CUDA", + "value": 1608361, + "unit": "ns", + "extra": "gctime=0\nmemory=528\nallocs=24\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(512, Bsize=32)/forward/GPU/AMDGPU", + "value": 1576726, + "unit": "ns", + "extra": "gctime=0\nmemory=9328\nallocs=348\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(512, Bsize=32)/zygote/CPU/2 thread(s)", + "value": 154443875, + "unit": "ns", + "extra": "gctime=0\nmemory=134218832\nallocs=31\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(512, Bsize=32)/zygote/CPU/4 thread(s)", + "value": 112320833.5, + "unit": "ns", + "extra": "gctime=0\nmemory=134218832\nallocs=31\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(512, Bsize=32)/zygote/CPU/8 thread(s)", + "value": 112445042, + "unit": "ns", + "extra": "gctime=0\nmemory=134218832\nallocs=31\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(512, Bsize=32)/zygote/CPU/1 thread(s)", + "value": 296071750, + "unit": "ns", + "extra": "gctime=0\nmemory=134218640\nallocs=29\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(512, Bsize=32)/zygote/GPU/CUDA", + "value": 6483041.5, + "unit": "ns", + "extra": "gctime=0\nmemory=10144\nallocs=370\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(512, Bsize=32)/zygote/GPU/AMDGPU", + "value": 6222525, + "unit": "ns", + "extra": "gctime=0\nmemory=27072\nallocs=736\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "bias_activation(32, act=tanh)(32 x 128)/forward/CPU/2 thread(s)", + "value": 48042, + "unit": "ns", + "extra": "gctime=0\nmemory=16832\nallocs=5\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "bias_activation(32, act=tanh)(32 x 128)/forward/CPU/4 thread(s)", + "value": 47667, + "unit": "ns", + "extra": "gctime=0\nmemory=16832\nallocs=5\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "bias_activation(32, act=tanh)(32 x 128)/forward/CPU/8 thread(s)", + "value": 47916, + "unit": "ns", + "extra": "gctime=0\nmemory=16832\nallocs=5\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "bias_activation(32, act=tanh)(32 x 128)/forward/CPU/1 thread(s)", + "value": 47583, + "unit": "ns", + "extra": "gctime=0\nmemory=16832\nallocs=5\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "bias_activation(32, act=tanh)(32 x 128)/forward/GPU/CUDA", + "value": 19626, + "unit": "ns", + "extra": "gctime=0\nmemory=2144\nallocs=92\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "bias_activation(32, act=tanh)(32 x 128)/forward/GPU/AMDGPU", + "value": 28463, + "unit": "ns", + "extra": "gctime=0\nmemory=1760\nallocs=62\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "bias_activation(32, act=tanh)(32 x 128)/zygote/CPU/2 thread(s)", + "value": 50583.5, + "unit": "ns", + "extra": "gctime=0\nmemory=50496\nallocs=13\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "bias_activation(32, act=tanh)(32 x 128)/zygote/CPU/4 thread(s)", + "value": 50167, + "unit": "ns", + "extra": "gctime=0\nmemory=50496\nallocs=13\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "bias_activation(32, act=tanh)(32 x 128)/zygote/CPU/8 thread(s)", + "value": 51000, + "unit": "ns", + "extra": "gctime=0\nmemory=50496\nallocs=13\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "bias_activation(32, act=tanh)(32 x 128)/zygote/CPU/1 thread(s)", + "value": 50667, + "unit": "ns", + "extra": "gctime=0\nmemory=50496\nallocs=13\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "bias_activation(32, act=tanh)(32 x 128)/zygote/GPU/CUDA", + "value": 245482, + "unit": "ns", + "extra": "gctime=0\nmemory=14376\nallocs=530\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "bias_activation(32, act=tanh)(32 x 128)/zygote/GPU/AMDGPU", + "value": 140773, + "unit": "ns", + "extra": "gctime=0\nmemory=16264\nallocs=380\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/2 thread(s)", + "value": 8667, + "unit": "ns", + "extra": "gctime=0\nmemory=3616\nallocs=36\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/4 thread(s)", + "value": 8750, + "unit": "ns", + "extra": "gctime=0\nmemory=3616\nallocs=36\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/8 thread(s)", + "value": 11167, + "unit": "ns", + "extra": "gctime=0\nmemory=3616\nallocs=36\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/1 thread(s)", + "value": 9666.5, + "unit": "ns", + "extra": "gctime=0\nmemory=3616\nallocs=36\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=gelu, affine=true)(4 x 32)/forward/GPU/CUDA", + "value": 118847, + "unit": "ns", + "extra": "gctime=0\nmemory=10128\nallocs=262\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=gelu, affine=true)(4 x 32)/forward/GPU/AMDGPU", + "value": 237489, + "unit": "ns", + "extra": "gctime=0\nmemory=18096\nallocs=474\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/2 thread(s)", + "value": 10791, + "unit": "ns", + "extra": "gctime=0\nmemory=22128\nallocs=104\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/4 thread(s)", + "value": 10458, + "unit": "ns", + "extra": "gctime=0\nmemory=22128\nallocs=104\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/8 thread(s)", + "value": 10333, + "unit": "ns", + "extra": "gctime=0\nmemory=22128\nallocs=104\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/1 thread(s)", + "value": 10709, + "unit": "ns", + "extra": "gctime=0\nmemory=22128\nallocs=104\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=gelu, affine=true)(4 x 32)/zygote/GPU/CUDA", + "value": 584310, + "unit": "ns", + "extra": "gctime=0\nmemory=52312\nallocs=1443\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=gelu, affine=true)(4 x 32)/zygote/GPU/AMDGPU", + "value": 572469, + "unit": "ns", + "extra": "gctime=0\nmemory=61160\nallocs=1161\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/2 thread(s)", + "value": 9125, + "unit": "ns", + "extra": "gctime=0\nmemory=7088\nallocs=32\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/4 thread(s)", + "value": 9896, + "unit": "ns", + "extra": "gctime=0\nmemory=7088\nallocs=32\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/8 thread(s)", + "value": 10667, + "unit": "ns", + "extra": "gctime=0\nmemory=7088\nallocs=32\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/1 thread(s)", + "value": 9292, + "unit": "ns", + "extra": "gctime=0\nmemory=7088\nallocs=32\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=relu, affine=false)(32 x 32)/forward/GPU/CUDA", + "value": 115727.5, + "unit": "ns", + "extra": "gctime=0\nmemory=9568\nallocs=246\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=relu, affine=false)(32 x 32)/forward/GPU/AMDGPU", + "value": 73908, + "unit": "ns", + "extra": "gctime=0\nmemory=15120\nallocs=313\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/2 thread(s)", + "value": 13874.5, + "unit": "ns", + "extra": "gctime=0\nmemory=44512\nallocs=90\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/4 thread(s)", + "value": 13750, + "unit": "ns", + "extra": "gctime=0\nmemory=44512\nallocs=90\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/8 thread(s)", + "value": 14333, + "unit": "ns", + "extra": "gctime=0\nmemory=44512\nallocs=90\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/1 thread(s)", + "value": 14375.5, + "unit": "ns", + "extra": "gctime=0\nmemory=44512\nallocs=90\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=relu, affine=false)(32 x 32)/zygote/GPU/CUDA", + "value": 559680.5, + "unit": "ns", + "extra": "gctime=0\nmemory=45896\nallocs=1303\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=relu, affine=false)(32 x 32)/zygote/GPU/AMDGPU", + "value": 337060, + "unit": "ns", + "extra": "gctime=0\nmemory=53864\nallocs=1056\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/2 thread(s)", + "value": 959, + "unit": "ns", + "extra": "gctime=0\nmemory=1440\nallocs=19\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/4 thread(s)", + "value": 1042, + "unit": "ns", + "extra": "gctime=0\nmemory=1440\nallocs=19\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/8 thread(s)", + "value": 1042, + "unit": "ns", + "extra": "gctime=0\nmemory=1440\nallocs=19\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/1 thread(s)", + "value": 1083, + "unit": "ns", + "extra": "gctime=0\nmemory=1440\nallocs=19\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=gelu, affine=false)(4 x 32)/forward/GPU/CUDA", + "value": 33675, + "unit": "ns", + "extra": "gctime=0\nmemory=4688\nallocs=156\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=gelu, affine=false)(4 x 32)/forward/GPU/AMDGPU", + "value": 206546, + "unit": "ns", + "extra": "gctime=0\nmemory=8064\nallocs=272\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/2 thread(s)", + "value": 8917, + "unit": "ns", + "extra": "gctime=0\nmemory=17664\nallocs=88\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/4 thread(s)", + "value": 8437.5, + "unit": "ns", + "extra": "gctime=0\nmemory=17664\nallocs=88\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/8 thread(s)", + "value": 8791, + "unit": "ns", + "extra": "gctime=0\nmemory=17664\nallocs=88\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/1 thread(s)", + "value": 9250, + "unit": "ns", + "extra": "gctime=0\nmemory=17664\nallocs=88\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=gelu, affine=false)(4 x 32)/zygote/GPU/CUDA", + "value": 225862.5, + "unit": "ns", + "extra": "gctime=0\nmemory=23368\nallocs=762\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=gelu, affine=false)(4 x 32)/zygote/GPU/AMDGPU", + "value": 576667, + "unit": "ns", + "extra": "gctime=0\nmemory=47224\nallocs=1155\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "bias_activation(32, act=gelu)(32 x 128)/forward/CPU/2 thread(s)", + "value": 23667, + "unit": "ns", + "extra": "gctime=0\nmemory=16832\nallocs=5\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "bias_activation(32, act=gelu)(32 x 128)/forward/CPU/4 thread(s)", + "value": 23292, + "unit": "ns", + "extra": "gctime=0\nmemory=16832\nallocs=5\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "bias_activation(32, act=gelu)(32 x 128)/forward/CPU/8 thread(s)", + "value": 23813, + "unit": "ns", + "extra": "gctime=0\nmemory=16832\nallocs=5\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "bias_activation(32, act=gelu)(32 x 128)/forward/CPU/1 thread(s)", + "value": 23666, + "unit": "ns", + "extra": "gctime=0\nmemory=16832\nallocs=5\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "bias_activation(32, act=gelu)(32 x 128)/forward/GPU/CUDA", + "value": 20529, + "unit": "ns", + "extra": "gctime=0\nmemory=2144\nallocs=92\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "bias_activation(32, act=gelu)(32 x 128)/forward/GPU/AMDGPU", + "value": 187811, + "unit": "ns", + "extra": "gctime=0\nmemory=4000\nallocs=202\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "bias_activation(32, act=gelu)(32 x 128)/zygote/CPU/2 thread(s)", + "value": 53583.5, + "unit": "ns", + "extra": "gctime=0\nmemory=68064\nallocs=34\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "bias_activation(32, act=gelu)(32 x 128)/zygote/CPU/4 thread(s)", + "value": 52145.5, + "unit": "ns", + "extra": "gctime=0\nmemory=68064\nallocs=34\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "bias_activation(32, act=gelu)(32 x 128)/zygote/CPU/8 thread(s)", + "value": 53584, + "unit": "ns", + "extra": "gctime=0\nmemory=68064\nallocs=34\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "bias_activation(32, act=gelu)(32 x 128)/zygote/CPU/1 thread(s)", + "value": 53667, + "unit": "ns", + "extra": "gctime=0\nmemory=68064\nallocs=34\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "bias_activation(32, act=gelu)(32 x 128)/zygote/GPU/CUDA", + "value": 260507, + "unit": "ns", + "extra": "gctime=0\nmemory=16504\nallocs=620\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "bias_activation(32, act=gelu)(32 x 128)/zygote/GPU/AMDGPU", + "value": 549086, + "unit": "ns", + "extra": "gctime=0\nmemory=17624\nallocs=416\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s)", + "value": 1444541.5, + "unit": "ns", + "extra": "gctime=0\nmemory=1051792\nallocs=39\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s)", + "value": 1445459, + "unit": "ns", + "extra": "gctime=0\nmemory=1051792\nallocs=39\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s)", + "value": 1414666.5, + "unit": "ns", + "extra": "gctime=0\nmemory=1051792\nallocs=39\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s)", + "value": 1401396, + "unit": "ns", + "extra": "gctime=0\nmemory=1051792\nallocs=39\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/CUDA", + "value": 195236, + "unit": "ns", + "extra": "gctime=0\nmemory=9936\nallocs=247\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/AMDGPU", + "value": 321861, + "unit": "ns", + "extra": "gctime=0\nmemory=30112\nallocs=739\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s)", + "value": 5007208, + "unit": "ns", + "extra": "gctime=0\nmemory=8404848\nallocs=111\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s)", + "value": 5006958, + "unit": "ns", + "extra": "gctime=0\nmemory=8404848\nallocs=111\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s)", + "value": 5015812.5, + "unit": "ns", + "extra": "gctime=0\nmemory=8404848\nallocs=111\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s)", + "value": 5020500, + "unit": "ns", + "extra": "gctime=0\nmemory=8404848\nallocs=111\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/CUDA", + "value": 510108, + "unit": "ns", + "extra": "gctime=0\nmemory=48856\nallocs=1374\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/AMDGPU", + "value": 1117899, + "unit": "ns", + "extra": "gctime=0\nmemory=76072\nallocs=1559\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(512, Bsize=512)/forward/CPU/2 thread(s)", + "value": 828285625, + "unit": "ns", + "extra": "gctime=48649916.5\nmemory=536870976\nallocs=2\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(512, Bsize=512)/forward/CPU/4 thread(s)", + "value": 541921375, + "unit": "ns", + "extra": "gctime=415459\nmemory=536870976\nallocs=2\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(512, Bsize=512)/forward/CPU/8 thread(s)", + "value": 542359625, + "unit": "ns", + "extra": "gctime=434292\nmemory=536870976\nallocs=2\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(512, Bsize=512)/forward/CPU/1 thread(s)", + "value": 1558200021, + "unit": "ns", + "extra": "gctime=49318645.5\nmemory=536870976\nallocs=2\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(512, Bsize=512)/forward/GPU/CUDA", + "value": 22535776.5, + "unit": "ns", + "extra": "gctime=0\nmemory=528\nallocs=24\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(512, Bsize=512)/forward/GPU/AMDGPU", + "value": 12173703, + "unit": "ns", + "extra": "gctime=0\nmemory=20992\nallocs=348\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(512, Bsize=512)/zygote/CPU/2 thread(s)", + "value": 3903695416, + "unit": "ns", + "extra": "gctime=83559833\nmemory=2147484752\nallocs=31\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(512, Bsize=512)/zygote/CPU/4 thread(s)", + "value": 1771980416, + "unit": "ns", + "extra": "gctime=30872292\nmemory=2147484752\nallocs=31\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(512, Bsize=512)/zygote/CPU/8 thread(s)", + "value": 1773568584, + "unit": "ns", + "extra": "gctime=31709208\nmemory=2147484752\nallocs=31\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(512, Bsize=512)/zygote/CPU/1 thread(s)", + "value": 5228367459, + "unit": "ns", + "extra": "gctime=51047999\nmemory=2147484560\nallocs=29\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(512, Bsize=512)/zygote/GPU/CUDA", + "value": 119027931, + "unit": "ns", + "extra": "gctime=0\nmemory=10144\nallocs=370\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(512, Bsize=512)/zygote/GPU/AMDGPU", + "value": 68450588, + "unit": "ns", + "extra": "gctime=0\nmemory=65536\nallocs=806\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s)", + "value": 75916.5, + "unit": "ns", + "extra": "gctime=0\nmemory=394480\nallocs=25\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s)", + "value": 87437.5, + "unit": "ns", + "extra": "gctime=0\nmemory=394480\nallocs=25\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s)", + "value": 84417, + "unit": "ns", + "extra": "gctime=0\nmemory=394480\nallocs=25\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s)", + "value": 81083, + "unit": "ns", + "extra": "gctime=0\nmemory=394480\nallocs=25\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/GPU/CUDA", + "value": 192111.5, + "unit": "ns", + "extra": "gctime=0\nmemory=14112\nallocs=539\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/GPU/AMDGPU", + "value": 126607, + "unit": "ns", + "extra": "gctime=0\nmemory=18320\nallocs=397\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s)", + "value": 282646, + "unit": "ns", + "extra": "gctime=0\nmemory=2246800\nallocs=143\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s)", + "value": 283042, + "unit": "ns", + "extra": "gctime=0\nmemory=2246800\nallocs=143\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s)", + "value": 236875, + "unit": "ns", + "extra": "gctime=0\nmemory=2246800\nallocs=143\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s)", + "value": 276458, + "unit": "ns", + "extra": "gctime=0\nmemory=2246800\nallocs=143\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/CUDA", + "value": 995625, + "unit": "ns", + "extra": "gctime=0\nmemory=72960\nallocs=2700\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/AMDGPU", + "value": 612404, + "unit": "ns", + "extra": "gctime=0\nmemory=74544\nallocs=1507\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(512, Bsize=128)/forward/CPU/2 thread(s)", + "value": 199947208.5, + "unit": "ns", + "extra": "gctime=0\nmemory=134217792\nallocs=2\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(512, Bsize=128)/forward/CPU/4 thread(s)", + "value": 139420500, + "unit": "ns", + "extra": "gctime=0\nmemory=134217792\nallocs=2\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(512, Bsize=128)/forward/CPU/8 thread(s)", + "value": 138954958, + "unit": "ns", + "extra": "gctime=0\nmemory=134217792\nallocs=2\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(512, Bsize=128)/forward/CPU/1 thread(s)", + "value": 389188834, + "unit": "ns", + "extra": "gctime=0\nmemory=134217792\nallocs=2\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(512, Bsize=128)/forward/GPU/CUDA", + "value": 5832800, + "unit": "ns", + "extra": "gctime=0\nmemory=528\nallocs=24\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(512, Bsize=128)/forward/GPU/AMDGPU", + "value": 2958637.5, + "unit": "ns", + "extra": "gctime=0\nmemory=11584\nallocs=348\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(512, Bsize=128)/zygote/CPU/2 thread(s)", + "value": 618298396, + "unit": "ns", + "extra": "gctime=2689896.5\nmemory=536872016\nallocs=31\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(512, Bsize=128)/zygote/CPU/4 thread(s)", + "value": 439277916, + "unit": "ns", + "extra": "gctime=2681000\nmemory=536872016\nallocs=31\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(512, Bsize=128)/zygote/CPU/8 thread(s)", + "value": 439303895.5, + "unit": "ns", + "extra": "gctime=2648792\nmemory=536872016\nallocs=31\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(512, Bsize=128)/zygote/CPU/1 thread(s)", + "value": 1200068000, + "unit": "ns", + "extra": "gctime=3297084\nmemory=536871824\nallocs=29\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(512, Bsize=128)/zygote/GPU/CUDA", + "value": 26614249.5, + "unit": "ns", + "extra": "gctime=0\nmemory=10144\nallocs=370\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(512, Bsize=128)/zygote/GPU/AMDGPU", + "value": 16011697.5, + "unit": "ns", + "extra": "gctime=0\nmemory=37312\nallocs=806\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s)", + "value": 7417, + "unit": "ns", + "extra": "gctime=0\nmemory=132096\nallocs=20\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s)", + "value": 6125, + "unit": "ns", + "extra": "gctime=0\nmemory=132096\nallocs=20\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s)", + "value": 6125, + "unit": "ns", + "extra": "gctime=0\nmemory=132096\nallocs=20\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s)", + "value": 10125, + "unit": "ns", + "extra": "gctime=0\nmemory=132096\nallocs=20\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/GPU/CUDA", + "value": 26885, + "unit": "ns", + "extra": "gctime=0\nmemory=3376\nallocs=109\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/GPU/AMDGPU", + "value": 54341, + "unit": "ns", + "extra": "gctime=0\nmemory=6624\nallocs=169\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s)", + "value": 214083, + "unit": "ns", + "extra": "gctime=0\nmemory=801664\nallocs=107\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s)", + "value": 232833, + "unit": "ns", + "extra": "gctime=0\nmemory=801664\nallocs=107\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s)", + "value": 230000, + "unit": "ns", + "extra": "gctime=0\nmemory=801664\nallocs=107\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s)", + "value": 207709, + "unit": "ns", + "extra": "gctime=0\nmemory=801664\nallocs=107\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/CUDA", + "value": 215596, + "unit": "ns", + "extra": "gctime=0\nmemory=19888\nallocs=571\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/AMDGPU", + "value": 546726.5, + "unit": "ns", + "extra": "gctime=0\nmemory=76944\nallocs=1737\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/2 thread(s)", + "value": 7417, + "unit": "ns", + "extra": "gctime=0\nmemory=3360\nallocs=32\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/4 thread(s)", + "value": 8875.5, + "unit": "ns", + "extra": "gctime=0\nmemory=3360\nallocs=32\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/8 thread(s)", + "value": 10750, + "unit": "ns", + "extra": "gctime=0\nmemory=3360\nallocs=32\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/1 thread(s)", + "value": 10459, + "unit": "ns", + "extra": "gctime=0\nmemory=3360\nallocs=32\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=relu, affine=false)(4 x 32)/forward/GPU/CUDA", + "value": 111291, + "unit": "ns", + "extra": "gctime=0\nmemory=9440\nallocs=238\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=relu, affine=false)(4 x 32)/forward/GPU/AMDGPU", + "value": 72956, + "unit": "ns", + "extra": "gctime=0\nmemory=15056\nallocs=309\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/2 thread(s)", + "value": 7792, + "unit": "ns", + "extra": "gctime=0\nmemory=18224\nallocs=90\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/4 thread(s)", + "value": 7833.5, + "unit": "ns", + "extra": "gctime=0\nmemory=18224\nallocs=90\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/8 thread(s)", + "value": 8125, + "unit": "ns", + "extra": "gctime=0\nmemory=18224\nallocs=90\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/1 thread(s)", + "value": 8375, + "unit": "ns", + "extra": "gctime=0\nmemory=18224\nallocs=90\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=relu, affine=false)(4 x 32)/zygote/GPU/CUDA", + "value": 492517.5, + "unit": "ns", + "extra": "gctime=0\nmemory=44056\nallocs=1239\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=relu, affine=false)(4 x 32)/zygote/GPU/AMDGPU", + "value": 322723, + "unit": "ns", + "extra": "gctime=0\nmemory=50728\nallocs=975\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/2 thread(s)", + "value": 417, + "unit": "ns", + "extra": "gctime=0\nmemory=5440\nallocs=19\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/4 thread(s)", + "value": 500, + "unit": "ns", + "extra": "gctime=0\nmemory=5440\nallocs=19\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/8 thread(s)", + "value": 459, + "unit": "ns", + "extra": "gctime=0\nmemory=5440\nallocs=19\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/1 thread(s)", + "value": 583, + "unit": "ns", + "extra": "gctime=0\nmemory=5440\nallocs=19\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=identity, affine=true)(32 x 32)/forward/GPU/CUDA", + "value": 25272, + "unit": "ns", + "extra": "gctime=0\nmemory=3648\nallocs=118\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=identity, affine=true)(32 x 32)/forward/GPU/AMDGPU", + "value": 45194, + "unit": "ns", + "extra": "gctime=0\nmemory=6080\nallocs=138\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/2 thread(s)", + "value": 9646, + "unit": "ns", + "extra": "gctime=0\nmemory=40640\nallocs=88\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/4 thread(s)", + "value": 9541, + "unit": "ns", + "extra": "gctime=0\nmemory=40640\nallocs=88\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/8 thread(s)", + "value": 11104, + "unit": "ns", + "extra": "gctime=0\nmemory=40640\nallocs=88\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/1 thread(s)", + "value": 10333, + "unit": "ns", + "extra": "gctime=0\nmemory=40640\nallocs=88\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=identity, affine=true)(32 x 32)/zygote/GPU/CUDA", + "value": 247083, + "unit": "ns", + "extra": "gctime=0\nmemory=20736\nallocs=600\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=identity, affine=true)(32 x 32)/zygote/GPU/AMDGPU", + "value": 383457, + "unit": "ns", + "extra": "gctime=0\nmemory=54960\nallocs=1329\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "bias_activation(512, act=gelu)(512 x 128)/forward/CPU/2 thread(s)", + "value": 351000, + "unit": "ns", + "extra": "gctime=0\nmemory=262384\nallocs=6\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "bias_activation(512, act=gelu)(512 x 128)/forward/CPU/4 thread(s)", + "value": 354459, + "unit": "ns", + "extra": "gctime=0\nmemory=262384\nallocs=6\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "bias_activation(512, act=gelu)(512 x 128)/forward/CPU/8 thread(s)", + "value": 352250, + "unit": "ns", + "extra": "gctime=0\nmemory=262384\nallocs=6\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "bias_activation(512, act=gelu)(512 x 128)/forward/CPU/1 thread(s)", + "value": 351625, + "unit": "ns", + "extra": "gctime=0\nmemory=262384\nallocs=6\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "bias_activation(512, act=gelu)(512 x 128)/forward/GPU/CUDA", + "value": 23168, + "unit": "ns", + "extra": "gctime=0\nmemory=2144\nallocs=92\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "bias_activation(512, act=gelu)(512 x 128)/forward/GPU/AMDGPU", + "value": 198701, + "unit": "ns", + "extra": "gctime=0\nmemory=4704\nallocs=246\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "bias_activation(512, act=gelu)(512 x 128)/zygote/CPU/2 thread(s)", + "value": 826000, + "unit": "ns", + "extra": "gctime=0\nmemory=1052384\nallocs=38\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "bias_activation(512, act=gelu)(512 x 128)/zygote/CPU/4 thread(s)", + "value": 820458, + "unit": "ns", + "extra": "gctime=0\nmemory=1052384\nallocs=38\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "bias_activation(512, act=gelu)(512 x 128)/zygote/CPU/8 thread(s)", + "value": 822083.5, + "unit": "ns", + "extra": "gctime=0\nmemory=1052384\nallocs=38\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "bias_activation(512, act=gelu)(512 x 128)/zygote/CPU/1 thread(s)", + "value": 827750, + "unit": "ns", + "extra": "gctime=0\nmemory=1052384\nallocs=38\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "bias_activation(512, act=gelu)(512 x 128)/zygote/GPU/CUDA", + "value": 214195.5, + "unit": "ns", + "extra": "gctime=0\nmemory=16504\nallocs=620\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "bias_activation(512, act=gelu)(512 x 128)/zygote/GPU/AMDGPU", + "value": 578901, + "unit": "ns", + "extra": "gctime=0\nmemory=19000\nallocs=502\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(16, Bsize=32)/forward/CPU/2 thread(s)", + "value": 5229.5, + "unit": "ns", + "extra": "gctime=0\nmemory=32832\nallocs=2\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(16, Bsize=32)/forward/CPU/4 thread(s)", + "value": 5875, + "unit": "ns", + "extra": "gctime=0\nmemory=32832\nallocs=2\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(16, Bsize=32)/forward/CPU/8 thread(s)", + "value": 6958.5, + "unit": "ns", + "extra": "gctime=0\nmemory=32832\nallocs=2\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(16, Bsize=32)/forward/CPU/1 thread(s)", + "value": 4667, + "unit": "ns", + "extra": "gctime=0\nmemory=32832\nallocs=2\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(16, Bsize=32)/forward/GPU/CUDA", + "value": 17091, + "unit": "ns", + "extra": "gctime=0\nmemory=512\nallocs=23\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(16, Bsize=32)/forward/GPU/AMDGPU", + "value": 74219, + "unit": "ns", + "extra": "gctime=0\nmemory=2800\nallocs=82\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(16, Bsize=32)/zygote/CPU/2 thread(s)", + "value": 13458.5, + "unit": "ns", + "extra": "gctime=0\nmemory=132176\nallocs=31\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(16, Bsize=32)/zygote/CPU/4 thread(s)", + "value": 10625, + "unit": "ns", + "extra": "gctime=0\nmemory=132176\nallocs=31\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(16, Bsize=32)/zygote/CPU/8 thread(s)", + "value": 13041, + "unit": "ns", + "extra": "gctime=0\nmemory=132176\nallocs=31\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(16, Bsize=32)/zygote/CPU/1 thread(s)", + "value": 18542, + "unit": "ns", + "extra": "gctime=0\nmemory=131984\nallocs=29\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(16, Bsize=32)/zygote/GPU/CUDA", + "value": 202239.5, + "unit": "ns", + "extra": "gctime=0\nmemory=10112\nallocs=368\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(16, Bsize=32)/zygote/GPU/AMDGPU", + "value": 330217, + "unit": "ns", + "extra": "gctime=0\nmemory=17120\nallocs=389\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(16, Bsize=128)/forward/CPU/2 thread(s)", + "value": 39833.5, + "unit": "ns", + "extra": "gctime=0\nmemory=131136\nallocs=2\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(16, Bsize=128)/forward/CPU/4 thread(s)", + "value": 51209, + "unit": "ns", + "extra": "gctime=0\nmemory=131136\nallocs=2\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(16, Bsize=128)/forward/CPU/8 thread(s)", + "value": 52458.5, + "unit": "ns", + "extra": "gctime=0\nmemory=131136\nallocs=2\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(16, Bsize=128)/forward/CPU/1 thread(s)", + "value": 13459, + "unit": "ns", + "extra": "gctime=0\nmemory=131136\nallocs=2\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(16, Bsize=128)/forward/GPU/CUDA", + "value": 19993, + "unit": "ns", + "extra": "gctime=0\nmemory=512\nallocs=23\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(16, Bsize=128)/forward/GPU/AMDGPU", + "value": 99666.5, + "unit": "ns", + "extra": "gctime=0\nmemory=5088\nallocs=84\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(16, Bsize=128)/zygote/CPU/2 thread(s)", + "value": 38229.5, + "unit": "ns", + "extra": "gctime=0\nmemory=525392\nallocs=31\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(16, Bsize=128)/zygote/CPU/4 thread(s)", + "value": 35125, + "unit": "ns", + "extra": "gctime=0\nmemory=525392\nallocs=31\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(16, Bsize=128)/zygote/CPU/8 thread(s)", + "value": 34187.5, + "unit": "ns", + "extra": "gctime=0\nmemory=525392\nallocs=31\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(16, Bsize=128)/zygote/CPU/1 thread(s)", + "value": 59417, + "unit": "ns", + "extra": "gctime=0\nmemory=525200\nallocs=29\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(16, Bsize=128)/zygote/GPU/CUDA", + "value": 178995.5, + "unit": "ns", + "extra": "gctime=0\nmemory=10112\nallocs=368\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(16, Bsize=128)/zygote/GPU/AMDGPU", + "value": 362888, + "unit": "ns", + "extra": "gctime=0\nmemory=23792\nallocs=381\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "bias_activation(2, act=tanh)(2 x 128)/forward/CPU/2 thread(s)", + "value": 3500, + "unit": "ns", + "extra": "gctime=0\nmemory=1280\nallocs=5\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "bias_activation(2, act=tanh)(2 x 128)/forward/CPU/4 thread(s)", + "value": 3667, + "unit": "ns", + "extra": "gctime=0\nmemory=1280\nallocs=5\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "bias_activation(2, act=tanh)(2 x 128)/forward/CPU/8 thread(s)", + "value": 3833, + "unit": "ns", + "extra": "gctime=0\nmemory=1280\nallocs=5\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "bias_activation(2, act=tanh)(2 x 128)/forward/CPU/1 thread(s)", + "value": 3709, + "unit": "ns", + "extra": "gctime=0\nmemory=1280\nallocs=5\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "bias_activation(2, act=tanh)(2 x 128)/forward/GPU/CUDA", + "value": 19015, + "unit": "ns", + "extra": "gctime=0\nmemory=2144\nallocs=92\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "bias_activation(2, act=tanh)(2 x 128)/forward/GPU/AMDGPU", + "value": 29645, + "unit": "ns", + "extra": "gctime=0\nmemory=1824\nallocs=66\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "bias_activation(2, act=tanh)(2 x 128)/zygote/CPU/2 thread(s)", + "value": 4291, + "unit": "ns", + "extra": "gctime=0\nmemory=3712\nallocs=13\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "bias_activation(2, act=tanh)(2 x 128)/zygote/CPU/4 thread(s)", + "value": 4500, + "unit": "ns", + "extra": "gctime=0\nmemory=3712\nallocs=13\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "bias_activation(2, act=tanh)(2 x 128)/zygote/CPU/8 thread(s)", + "value": 4458, + "unit": "ns", + "extra": "gctime=0\nmemory=3712\nallocs=13\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "bias_activation(2, act=tanh)(2 x 128)/zygote/CPU/1 thread(s)", + "value": 4292, + "unit": "ns", + "extra": "gctime=0\nmemory=3712\nallocs=13\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "bias_activation(2, act=tanh)(2 x 128)/zygote/GPU/CUDA", + "value": 194611, + "unit": "ns", + "extra": "gctime=0\nmemory=12792\nallocs=482\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "bias_activation(2, act=tanh)(2 x 128)/zygote/GPU/AMDGPU", + "value": 126757, + "unit": "ns", + "extra": "gctime=0\nmemory=12872\nallocs=283\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/2 thread(s)", + "value": 5916, + "unit": "ns", + "extra": "gctime=0\nmemory=2400\nallocs=13\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/4 thread(s)", + "value": 5062.5, + "unit": "ns", + "extra": "gctime=0\nmemory=2400\nallocs=13\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/8 thread(s)", + "value": 6375, + "unit": "ns", + "extra": "gctime=0\nmemory=2400\nallocs=13\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/1 thread(s)", + "value": 4625, + "unit": "ns", + "extra": "gctime=0\nmemory=2400\nallocs=13\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=identity, affine=true)(4 x 32)/forward/GPU/CUDA", + "value": 138395, + "unit": "ns", + "extra": "gctime=0\nmemory=10912\nallocs=455\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=identity, affine=true)(4 x 32)/forward/GPU/AMDGPU", + "value": 65944, + "unit": "ns", + "extra": "gctime=0\nmemory=10928\nallocs=272\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/2 thread(s)", + "value": 9625, + "unit": "ns", + "extra": "gctime=0\nmemory=16720\nallocs=86\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/4 thread(s)", + "value": 8500, + "unit": "ns", + "extra": "gctime=0\nmemory=16720\nallocs=86\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/8 thread(s)", + "value": 9333, + "unit": "ns", + "extra": "gctime=0\nmemory=16720\nallocs=86\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/1 thread(s)", + "value": 10666, + "unit": "ns", + "extra": "gctime=0\nmemory=16720\nallocs=86\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=identity, affine=true)(4 x 32)/zygote/GPU/CUDA", + "value": 807046.5, + "unit": "ns", + "extra": "gctime=0\nmemory=64096\nallocs=2472\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=identity, affine=true)(4 x 32)/zygote/GPU/AMDGPU", + "value": 378457, + "unit": "ns", + "extra": "gctime=0\nmemory=55024\nallocs=1170\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s)", + "value": 207583, + "unit": "ns", + "extra": "gctime=0\nmemory=132096\nallocs=20\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s)", + "value": 209042, + "unit": "ns", + "extra": "gctime=0\nmemory=132096\nallocs=20\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s)", + "value": 213208, + "unit": "ns", + "extra": "gctime=0\nmemory=132096\nallocs=20\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s)", + "value": 204125, + "unit": "ns", + "extra": "gctime=0\nmemory=132096\nallocs=20\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/CUDA", + "value": 35332, + "unit": "ns", + "extra": "gctime=0\nmemory=4288\nallocs=142\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/AMDGPU", + "value": 203930.5, + "unit": "ns", + "extra": "gctime=0\nmemory=8784\nallocs=314\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s)", + "value": 603500, + "unit": "ns", + "extra": "gctime=0\nmemory=1063696\nallocs=109\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s)", + "value": 623479.5, + "unit": "ns", + "extra": "gctime=0\nmemory=1063696\nallocs=109\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s)", + "value": 658604.5, + "unit": "ns", + "extra": "gctime=0\nmemory=1063696\nallocs=109\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s)", + "value": 586375, + "unit": "ns", + "extra": "gctime=0\nmemory=1063696\nallocs=109\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/CUDA", + "value": 254148, + "unit": "ns", + "extra": "gctime=0\nmemory=24552\nallocs=781\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/AMDGPU", + "value": 767213, + "unit": "ns", + "extra": "gctime=0\nmemory=77160\nallocs=1596\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(128, Bsize=128)/forward/CPU/2 thread(s)", + "value": 3324167, + "unit": "ns", + "extra": "gctime=0\nmemory=8388672\nallocs=2\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(128, Bsize=128)/forward/CPU/4 thread(s)", + "value": 2328667, + "unit": "ns", + "extra": "gctime=0\nmemory=8388672\nallocs=2\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(128, Bsize=128)/forward/CPU/8 thread(s)", + "value": 2334417, + "unit": "ns", + "extra": "gctime=0\nmemory=8388672\nallocs=2\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(128, Bsize=128)/forward/CPU/1 thread(s)", + "value": 6324542, + "unit": "ns", + "extra": "gctime=0\nmemory=8388672\nallocs=2\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(128, Bsize=128)/forward/GPU/CUDA", + "value": 206559, + "unit": "ns", + "extra": "gctime=0\nmemory=512\nallocs=23\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(128, Bsize=128)/forward/GPU/AMDGPU", + "value": 377105, + "unit": "ns", + "extra": "gctime=0\nmemory=8016\nallocs=267\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(128, Bsize=128)/zygote/CPU/2 thread(s)", + "value": 11496208.5, + "unit": "ns", + "extra": "gctime=0\nmemory=33555536\nallocs=31\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(128, Bsize=128)/zygote/CPU/4 thread(s)", + "value": 8303562.5, + "unit": "ns", + "extra": "gctime=0\nmemory=33555536\nallocs=31\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(128, Bsize=128)/zygote/CPU/8 thread(s)", + "value": 8348416.5, + "unit": "ns", + "extra": "gctime=0\nmemory=33555536\nallocs=31\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(128, Bsize=128)/zygote/CPU/1 thread(s)", + "value": 21193020.5, + "unit": "ns", + "extra": "gctime=0\nmemory=33555344\nallocs=29\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(128, Bsize=128)/zygote/GPU/CUDA", + "value": 736080.5, + "unit": "ns", + "extra": "gctime=0\nmemory=10112\nallocs=368\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(128, Bsize=128)/zygote/GPU/AMDGPU", + "value": 2044820.5, + "unit": "ns", + "extra": "gctime=0\nmemory=30752\nallocs=667\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/2 thread(s)", + "value": 3917, + "unit": "ns", + "extra": "gctime=0\nmemory=1632\nallocs=13\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/4 thread(s)", + "value": 5292, + "unit": "ns", + "extra": "gctime=0\nmemory=1632\nallocs=13\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/8 thread(s)", + "value": 6292, + "unit": "ns", + "extra": "gctime=0\nmemory=1632\nallocs=13\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/1 thread(s)", + "value": 7125, + "unit": "ns", + "extra": "gctime=0\nmemory=1632\nallocs=13\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=identity, affine=false)(4 x 32)/forward/GPU/CUDA", + "value": 129442, + "unit": "ns", + "extra": "gctime=0\nmemory=9952\nallocs=395\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=identity, affine=false)(4 x 32)/forward/GPU/AMDGPU", + "value": 57067, + "unit": "ns", + "extra": "gctime=0\nmemory=10480\nallocs=244\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/2 thread(s)", + "value": 8500, + "unit": "ns", + "extra": "gctime=0\nmemory=12560\nallocs=85\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/4 thread(s)", + "value": 7375, + "unit": "ns", + "extra": "gctime=0\nmemory=12560\nallocs=85\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/8 thread(s)", + "value": 7833, + "unit": "ns", + "extra": "gctime=0\nmemory=12560\nallocs=85\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/1 thread(s)", + "value": 8291.5, + "unit": "ns", + "extra": "gctime=0\nmemory=12560\nallocs=85\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=identity, affine=false)(4 x 32)/zygote/GPU/CUDA", + "value": 711410, + "unit": "ns", + "extra": "gctime=0\nmemory=59776\nallocs=2289\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=identity, affine=false)(4 x 32)/zygote/GPU/AMDGPU", + "value": 364581, + "unit": "ns", + "extra": "gctime=0\nmemory=49776\nallocs=1069\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s)", + "value": 117312.5, + "unit": "ns", + "extra": "gctime=0\nmemory=1052048\nallocs=43\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s)", + "value": 101437.5, + "unit": "ns", + "extra": "gctime=0\nmemory=1052048\nallocs=43\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s)", + "value": 102687.5, + "unit": "ns", + "extra": "gctime=0\nmemory=1052048\nallocs=43\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s)", + "value": 98458.5, + "unit": "ns", + "extra": "gctime=0\nmemory=1052048\nallocs=43\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/CUDA", + "value": 149616, + "unit": "ns", + "extra": "gctime=0\nmemory=10624\nallocs=271\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/AMDGPU", + "value": 210473, + "unit": "ns", + "extra": "gctime=0\nmemory=24768\nallocs=522\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s)", + "value": 2008250, + "unit": "ns", + "extra": "gctime=0\nmemory=7359616\nallocs=120\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s)", + "value": 2022459, + "unit": "ns", + "extra": "gctime=0\nmemory=7359616\nallocs=120\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s)", + "value": 2039937.5, + "unit": "ns", + "extra": "gctime=0\nmemory=7359616\nallocs=120\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s)", + "value": 2036625, + "unit": "ns", + "extra": "gctime=0\nmemory=7359616\nallocs=120\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/CUDA", + "value": 661994.5, + "unit": "ns", + "extra": "gctime=0\nmemory=57144\nallocs=1536\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/AMDGPU", + "value": 963831, + "unit": "ns", + "extra": "gctime=0\nmemory=91080\nallocs=1785\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(2, Bsize=4)/forward/CPU/2 thread(s)", + "value": 33416, + "unit": "ns", + "extra": "gctime=0\nmemory=128\nallocs=1\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(2, Bsize=4)/forward/CPU/4 thread(s)", + "value": 35459, + "unit": "ns", + "extra": "gctime=0\nmemory=128\nallocs=1\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(2, Bsize=4)/forward/CPU/8 thread(s)", + "value": 34709, + "unit": "ns", + "extra": "gctime=0\nmemory=128\nallocs=1\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(2, Bsize=4)/forward/CPU/1 thread(s)", + "value": 750, + "unit": "ns", + "extra": "gctime=0\nmemory=128\nallocs=1\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(2, Bsize=4)/forward/GPU/CUDA", + "value": 15265, + "unit": "ns", + "extra": "gctime=0\nmemory=512\nallocs=23\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(2, Bsize=4)/forward/GPU/AMDGPU", + "value": 78737, + "unit": "ns", + "extra": "gctime=0\nmemory=1808\nallocs=65\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(2, Bsize=4)/zygote/CPU/2 thread(s)", + "value": 3959, + "unit": "ns", + "extra": "gctime=0\nmemory=1360\nallocs=27\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(2, Bsize=4)/zygote/CPU/4 thread(s)", + "value": 2917, + "unit": "ns", + "extra": "gctime=0\nmemory=1360\nallocs=27\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(2, Bsize=4)/zygote/CPU/8 thread(s)", + "value": 4708, + "unit": "ns", + "extra": "gctime=0\nmemory=1360\nallocs=27\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(2, Bsize=4)/zygote/CPU/1 thread(s)", + "value": 3666, + "unit": "ns", + "extra": "gctime=0\nmemory=1168\nallocs=25\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(2, Bsize=4)/zygote/GPU/CUDA", + "value": 136137.5, + "unit": "ns", + "extra": "gctime=0\nmemory=8496\nallocs=319\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(2, Bsize=4)/zygote/GPU/AMDGPU", + "value": 321796.5, + "unit": "ns", + "extra": "gctime=0\nmemory=11792\nallocs=320\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s)", + "value": 7250, + "unit": "ns", + "extra": "gctime=0\nmemory=132096\nallocs=20\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s)", + "value": 6042, + "unit": "ns", + "extra": "gctime=0\nmemory=132096\nallocs=20\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s)", + "value": 6083, + "unit": "ns", + "extra": "gctime=0\nmemory=132096\nallocs=20\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s)", + "value": 10042, + "unit": "ns", + "extra": "gctime=0\nmemory=132096\nallocs=20\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/GPU/CUDA", + "value": 34970, + "unit": "ns", + "extra": "gctime=0\nmemory=4288\nallocs=142\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/GPU/AMDGPU", + "value": 56516, + "unit": "ns", + "extra": "gctime=0\nmemory=6336\nallocs=161\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s)", + "value": 221584, + "unit": "ns", + "extra": "gctime=0\nmemory=801424\nallocs=105\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s)", + "value": 220959, + "unit": "ns", + "extra": "gctime=0\nmemory=801424\nallocs=105\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s)", + "value": 234583, + "unit": "ns", + "extra": "gctime=0\nmemory=801424\nallocs=105\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s)", + "value": 207333, + "unit": "ns", + "extra": "gctime=0\nmemory=801424\nallocs=105\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/CUDA", + "value": 237194, + "unit": "ns", + "extra": "gctime=0\nmemory=21200\nallocs=635\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/AMDGPU", + "value": 540189, + "unit": "ns", + "extra": "gctime=0\nmemory=74352\nallocs=1474\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(32, bias=true, act=identity)(32 x 128)/forward/CPU/2 thread(s)", + "value": 3750, + "unit": "ns", + "extra": "gctime=0\nmemory=16640\nallocs=1\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(32, bias=true, act=identity)(32 x 128)/forward/CPU/4 thread(s)", + "value": 3750, + "unit": "ns", + "extra": "gctime=0\nmemory=16640\nallocs=1\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(32, bias=true, act=identity)(32 x 128)/forward/CPU/8 thread(s)", + "value": 3833, + "unit": "ns", + "extra": "gctime=0\nmemory=16640\nallocs=1\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(32, bias=true, act=identity)(32 x 128)/forward/CPU/1 thread(s)", + "value": 3958, + "unit": "ns", + "extra": "gctime=0\nmemory=16640\nallocs=1\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(32, bias=true, act=identity)(32 x 128)/forward/GPU/CUDA", + "value": 21681, + "unit": "ns", + "extra": "gctime=0\nmemory=1056\nallocs=47\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(32, bias=true, act=identity)(32 x 128)/forward/GPU/AMDGPU", + "value": 39383, + "unit": "ns", + "extra": "gctime=0\nmemory=1648\nallocs=58\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(32, bias=true, act=identity)(32 x 128)/zygote/CPU/2 thread(s)", + "value": 14458, + "unit": "ns", + "extra": "gctime=0\nmemory=54640\nallocs=9\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(32, bias=true, act=identity)(32 x 128)/zygote/CPU/4 thread(s)", + "value": 14458, + "unit": "ns", + "extra": "gctime=0\nmemory=54640\nallocs=9\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(32, bias=true, act=identity)(32 x 128)/zygote/CPU/8 thread(s)", + "value": 14541, + "unit": "ns", + "extra": "gctime=0\nmemory=54640\nallocs=9\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(32, bias=true, act=identity)(32 x 128)/zygote/CPU/1 thread(s)", + "value": 14625, + "unit": "ns", + "extra": "gctime=0\nmemory=54640\nallocs=9\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(32, bias=true, act=identity)(32 x 128)/zygote/GPU/CUDA", + "value": 297631.5, + "unit": "ns", + "extra": "gctime=0\nmemory=13440\nallocs=470\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(32, bias=true, act=identity)(32 x 128)/zygote/GPU/AMDGPU", + "value": 190215, + "unit": "ns", + "extra": "gctime=0\nmemory=15616\nallocs=356\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s)", + "value": 129834, + "unit": "ns", + "extra": "gctime=0\nmemory=1052048\nallocs=43\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s)", + "value": 118271, + "unit": "ns", + "extra": "gctime=0\nmemory=1052048\nallocs=43\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s)", + "value": 106750, + "unit": "ns", + "extra": "gctime=0\nmemory=1052048\nallocs=43\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s)", + "value": 101666.5, + "unit": "ns", + "extra": "gctime=0\nmemory=1052048\nallocs=43\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/GPU/CUDA", + "value": 150106, + "unit": "ns", + "extra": "gctime=0\nmemory=10624\nallocs=271\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/GPU/AMDGPU", + "value": 241781, + "unit": "ns", + "extra": "gctime=0\nmemory=24656\nallocs=515\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s)", + "value": 1921708.5, + "unit": "ns", + "extra": "gctime=0\nmemory=6310928\nallocs=117\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s)", + "value": 1924583, + "unit": "ns", + "extra": "gctime=0\nmemory=6310928\nallocs=117\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s)", + "value": 1932000, + "unit": "ns", + "extra": "gctime=0\nmemory=6310928\nallocs=117\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s)", + "value": 1922750, + "unit": "ns", + "extra": "gctime=0\nmemory=6310928\nallocs=117\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/CUDA", + "value": 653385, + "unit": "ns", + "extra": "gctime=0\nmemory=55632\nallocs=1459\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/AMDGPU", + "value": 928325, + "unit": "ns", + "extra": "gctime=0\nmemory=90528\nallocs=1796\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s)", + "value": 18875, + "unit": "ns", + "extra": "gctime=0\nmemory=134288\nallocs=39\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s)", + "value": 17292, + "unit": "ns", + "extra": "gctime=0\nmemory=134288\nallocs=39\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s)", + "value": 20937, + "unit": "ns", + "extra": "gctime=0\nmemory=134288\nallocs=39\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s)", + "value": 18459, + "unit": "ns", + "extra": "gctime=0\nmemory=134288\nallocs=39\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/CUDA", + "value": 104073.5, + "unit": "ns", + "extra": "gctime=0\nmemory=9920\nallocs=246\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/AMDGPU", + "value": 91301, + "unit": "ns", + "extra": "gctime=0\nmemory=16544\nallocs=327\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s)", + "value": 239083.5, + "unit": "ns", + "extra": "gctime=0\nmemory=933632\nallocs=108\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s)", + "value": 224791, + "unit": "ns", + "extra": "gctime=0\nmemory=933632\nallocs=108\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s)", + "value": 224958.5, + "unit": "ns", + "extra": "gctime=0\nmemory=933632\nallocs=108\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s)", + "value": 218500, + "unit": "ns", + "extra": "gctime=0\nmemory=933632\nallocs=108\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/CUDA", + "value": 493640.5, + "unit": "ns", + "extra": "gctime=0\nmemory=47224\nallocs=1303\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/AMDGPU", + "value": 439080, + "unit": "ns", + "extra": "gctime=0\nmemory=56536\nallocs=1055\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(16, Bsize=4)/forward/CPU/2 thread(s)", + "value": 26166, + "unit": "ns", + "extra": "gctime=0\nmemory=4352\nallocs=1\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(16, Bsize=4)/forward/CPU/4 thread(s)", + "value": 29167, + "unit": "ns", + "extra": "gctime=0\nmemory=4352\nallocs=1\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(16, Bsize=4)/forward/CPU/8 thread(s)", + "value": 28958, + "unit": "ns", + "extra": "gctime=0\nmemory=4352\nallocs=1\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(16, Bsize=4)/forward/CPU/1 thread(s)", + "value": 1416, + "unit": "ns", + "extra": "gctime=0\nmemory=4352\nallocs=1\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(16, Bsize=4)/forward/GPU/CUDA", + "value": 15781, + "unit": "ns", + "extra": "gctime=0\nmemory=512\nallocs=23\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(16, Bsize=4)/forward/GPU/AMDGPU", + "value": 72756, + "unit": "ns", + "extra": "gctime=0\nmemory=2160\nallocs=87\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(16, Bsize=4)/zygote/CPU/2 thread(s)", + "value": 6208, + "unit": "ns", + "extra": "gctime=0\nmemory=18256\nallocs=27\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(16, Bsize=4)/zygote/CPU/4 thread(s)", + "value": 5041, + "unit": "ns", + "extra": "gctime=0\nmemory=18256\nallocs=27\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(16, Bsize=4)/zygote/CPU/8 thread(s)", + "value": 6875, + "unit": "ns", + "extra": "gctime=0\nmemory=18256\nallocs=27\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(16, Bsize=4)/zygote/CPU/1 thread(s)", + "value": 6417, + "unit": "ns", + "extra": "gctime=0\nmemory=18064\nallocs=25\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(16, Bsize=4)/zygote/GPU/CUDA", + "value": 199155.5, + "unit": "ns", + "extra": "gctime=0\nmemory=10112\nallocs=368\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(16, Bsize=4)/zygote/GPU/AMDGPU", + "value": 324216, + "unit": "ns", + "extra": "gctime=0\nmemory=15280\nallocs=409\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s)", + "value": 221875, + "unit": "ns", + "extra": "gctime=0\nmemory=132624\nallocs=23\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s)", + "value": 223375, + "unit": "ns", + "extra": "gctime=0\nmemory=132624\nallocs=23\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s)", + "value": 225375, + "unit": "ns", + "extra": "gctime=0\nmemory=132624\nallocs=23\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s)", + "value": 223542, + "unit": "ns", + "extra": "gctime=0\nmemory=132624\nallocs=23\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/CUDA", + "value": 216803, + "unit": "ns", + "extra": "gctime=0\nmemory=13152\nallocs=479\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/AMDGPU", + "value": 267771, + "unit": "ns", + "extra": "gctime=0\nmemory=19536\nallocs=473\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s)", + "value": 508542, + "unit": "ns", + "extra": "gctime=0\nmemory=1321456\nallocs=133\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s)", + "value": 511042, + "unit": "ns", + "extra": "gctime=0\nmemory=1321456\nallocs=133\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s)", + "value": 509500, + "unit": "ns", + "extra": "gctime=0\nmemory=1321456\nallocs=133\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s)", + "value": 557354, + "unit": "ns", + "extra": "gctime=0\nmemory=1321456\nallocs=133\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/CUDA", + "value": 1017707.5, + "unit": "ns", + "extra": "gctime=0\nmemory=72096\nallocs=2667\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/AMDGPU", + "value": 811461, + "unit": "ns", + "extra": "gctime=0\nmemory=70768\nallocs=1452\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s)", + "value": 19104, + "unit": "ns", + "extra": "gctime=0\nmemory=134544\nallocs=43\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s)", + "value": 19584, + "unit": "ns", + "extra": "gctime=0\nmemory=134544\nallocs=43\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s)", + "value": 22063, + "unit": "ns", + "extra": "gctime=0\nmemory=134544\nallocs=43\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s)", + "value": 19792, + "unit": "ns", + "extra": "gctime=0\nmemory=134544\nallocs=43\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/GPU/CUDA", + "value": 111072, + "unit": "ns", + "extra": "gctime=0\nmemory=10608\nallocs=270\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/GPU/AMDGPU", + "value": 90009, + "unit": "ns", + "extra": "gctime=0\nmemory=17184\nallocs=342\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s)", + "value": 221854, + "unit": "ns", + "extra": "gctime=0\nmemory=805680\nallocs=117\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s)", + "value": 220250, + "unit": "ns", + "extra": "gctime=0\nmemory=805680\nallocs=117\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s)", + "value": 218166.5, + "unit": "ns", + "extra": "gctime=0\nmemory=805680\nallocs=117\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s)", + "value": 220146, + "unit": "ns", + "extra": "gctime=0\nmemory=805680\nallocs=117\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/CUDA", + "value": 700847.5, + "unit": "ns", + "extra": "gctime=0\nmemory=55600\nallocs=1457\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/AMDGPU", + "value": 494855, + "unit": "ns", + "extra": "gctime=0\nmemory=71920\nallocs=1340\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/2 thread(s)", + "value": 6292, + "unit": "ns", + "extra": "gctime=0\nmemory=5408\nallocs=13\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/4 thread(s)", + "value": 7000, + "unit": "ns", + "extra": "gctime=0\nmemory=5408\nallocs=13\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/8 thread(s)", + "value": 7375, + "unit": "ns", + "extra": "gctime=0\nmemory=5408\nallocs=13\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/1 thread(s)", + "value": 6834, + "unit": "ns", + "extra": "gctime=0\nmemory=5408\nallocs=13\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=identity, affine=false)(32 x 32)/forward/GPU/CUDA", + "value": 130925, + "unit": "ns", + "extra": "gctime=0\nmemory=9952\nallocs=395\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=identity, affine=false)(32 x 32)/forward/GPU/AMDGPU", + "value": 63498, + "unit": "ns", + "extra": "gctime=0\nmemory=10864\nallocs=268\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/2 thread(s)", + "value": 11041.5, + "unit": "ns", + "extra": "gctime=0\nmemory=42768\nallocs=85\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/4 thread(s)", + "value": 9959, + "unit": "ns", + "extra": "gctime=0\nmemory=42768\nallocs=85\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/8 thread(s)", + "value": 10895.5, + "unit": "ns", + "extra": "gctime=0\nmemory=42768\nallocs=85\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/1 thread(s)", + "value": 10459, + "unit": "ns", + "extra": "gctime=0\nmemory=42768\nallocs=85\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=identity, affine=false)(32 x 32)/zygote/GPU/CUDA", + "value": 770540.5, + "unit": "ns", + "extra": "gctime=0\nmemory=61360\nallocs=2337\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=identity, affine=false)(32 x 32)/zygote/GPU/AMDGPU", + "value": 375452, + "unit": "ns", + "extra": "gctime=0\nmemory=52624\nallocs=1132\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/2 thread(s)", + "value": 4104, + "unit": "ns", + "extra": "gctime=0\nmemory=1632\nallocs=13\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/4 thread(s)", + "value": 7041, + "unit": "ns", + "extra": "gctime=0\nmemory=1632\nallocs=13\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/8 thread(s)", + "value": 7166, + "unit": "ns", + "extra": "gctime=0\nmemory=1632\nallocs=13\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/1 thread(s)", + "value": 6166, + "unit": "ns", + "extra": "gctime=0\nmemory=1632\nallocs=13\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=relu, affine=false)(4 x 32)/forward/GPU/CUDA", + "value": 131485.5, + "unit": "ns", + "extra": "gctime=0\nmemory=9952\nallocs=395\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=relu, affine=false)(4 x 32)/forward/GPU/AMDGPU", + "value": 62607, + "unit": "ns", + "extra": "gctime=0\nmemory=10912\nallocs=271\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/2 thread(s)", + "value": 7416.5, + "unit": "ns", + "extra": "gctime=0\nmemory=13872\nallocs=87\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/4 thread(s)", + "value": 7750, + "unit": "ns", + "extra": "gctime=0\nmemory=13872\nallocs=87\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/8 thread(s)", + "value": 8125, + "unit": "ns", + "extra": "gctime=0\nmemory=13872\nallocs=87\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/1 thread(s)", + "value": 8083, + "unit": "ns", + "extra": "gctime=0\nmemory=13872\nallocs=87\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=relu, affine=false)(4 x 32)/zygote/GPU/CUDA", + "value": 737449, + "unit": "ns", + "extra": "gctime=0\nmemory=62656\nallocs=2405\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=relu, affine=false)(4 x 32)/zygote/GPU/AMDGPU", + "value": 380902, + "unit": "ns", + "extra": "gctime=0\nmemory=51680\nallocs=1112\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(128, Bsize=512)/forward/CPU/2 thread(s)", + "value": 14481917, + "unit": "ns", + "extra": "gctime=0\nmemory=33554496\nallocs=2\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(128, Bsize=512)/forward/CPU/4 thread(s)", + "value": 10107542, + "unit": "ns", + "extra": "gctime=0\nmemory=33554496\nallocs=2\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(128, Bsize=512)/forward/CPU/8 thread(s)", + "value": 10094750, + "unit": "ns", + "extra": "gctime=0\nmemory=33554496\nallocs=2\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(128, Bsize=512)/forward/CPU/1 thread(s)", + "value": 27859959, + "unit": "ns", + "extra": "gctime=0\nmemory=33554496\nallocs=2\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(128, Bsize=512)/forward/GPU/CUDA", + "value": 533975, + "unit": "ns", + "extra": "gctime=0\nmemory=528\nallocs=24\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(128, Bsize=512)/forward/GPU/AMDGPU", + "value": 867906.5, + "unit": "ns", + "extra": "gctime=0\nmemory=20992\nallocs=348\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(128, Bsize=512)/zygote/CPU/2 thread(s)", + "value": 46387667, + "unit": "ns", + "extra": "gctime=0\nmemory=134218832\nallocs=31\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(128, Bsize=512)/zygote/CPU/4 thread(s)", + "value": 33363354, + "unit": "ns", + "extra": "gctime=0\nmemory=134218832\nallocs=31\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(128, Bsize=512)/zygote/CPU/8 thread(s)", + "value": 33478875, + "unit": "ns", + "extra": "gctime=0\nmemory=134218832\nallocs=31\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(128, Bsize=512)/zygote/CPU/1 thread(s)", + "value": 85752792, + "unit": "ns", + "extra": "gctime=0\nmemory=134218640\nallocs=29\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(128, Bsize=512)/zygote/GPU/CUDA", + "value": 2651799, + "unit": "ns", + "extra": "gctime=0\nmemory=10144\nallocs=370\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(128, Bsize=512)/zygote/GPU/AMDGPU", + "value": 5191497.5, + "unit": "ns", + "extra": "gctime=0\nmemory=62064\nallocs=736\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s)", + "value": 185208.5, + "unit": "ns", + "extra": "gctime=0\nmemory=134544\nallocs=43\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s)", + "value": 185916, + "unit": "ns", + "extra": "gctime=0\nmemory=134544\nallocs=43\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s)", + "value": 188604, + "unit": "ns", + "extra": "gctime=0\nmemory=134544\nallocs=43\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s)", + "value": 187271, + "unit": "ns", + "extra": "gctime=0\nmemory=134544\nallocs=43\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/CUDA", + "value": 117719.5, + "unit": "ns", + "extra": "gctime=0\nmemory=10608\nallocs=270\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/AMDGPU", + "value": 236051, + "unit": "ns", + "extra": "gctime=0\nmemory=19232\nallocs=470\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s)", + "value": 634875, + "unit": "ns", + "extra": "gctime=0\nmemory=1068048\nallocs=123\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s)", + "value": 627937.5, + "unit": "ns", + "extra": "gctime=0\nmemory=1068048\nallocs=123\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s)", + "value": 601166, + "unit": "ns", + "extra": "gctime=0\nmemory=1068048\nallocs=123\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s)", + "value": 587625, + "unit": "ns", + "extra": "gctime=0\nmemory=1068048\nallocs=123\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/CUDA", + "value": 694993, + "unit": "ns", + "extra": "gctime=0\nmemory=58712\nallocs=1603\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/AMDGPU", + "value": 698169.5, + "unit": "ns", + "extra": "gctime=0\nmemory=73544\nallocs=1386\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/2 thread(s)", + "value": 541, + "unit": "ns", + "extra": "gctime=0\nmemory=5440\nallocs=19\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/4 thread(s)", + "value": 625, + "unit": "ns", + "extra": "gctime=0\nmemory=5440\nallocs=19\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/8 thread(s)", + "value": 584, + "unit": "ns", + "extra": "gctime=0\nmemory=5440\nallocs=19\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/1 thread(s)", + "value": 584, + "unit": "ns", + "extra": "gctime=0\nmemory=5440\nallocs=19\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=relu, affine=false)(32 x 32)/forward/GPU/CUDA", + "value": 31826, + "unit": "ns", + "extra": "gctime=0\nmemory=4112\nallocs=142\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=relu, affine=false)(32 x 32)/forward/GPU/AMDGPU", + "value": 48104.5, + "unit": "ns", + "extra": "gctime=0\nmemory=6480\nallocs=173\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/2 thread(s)", + "value": 9541, + "unit": "ns", + "extra": "gctime=0\nmemory=44528\nallocs=87\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/4 thread(s)", + "value": 9687.5, + "unit": "ns", + "extra": "gctime=0\nmemory=44528\nallocs=87\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/8 thread(s)", + "value": 10542, + "unit": "ns", + "extra": "gctime=0\nmemory=44528\nallocs=87\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/1 thread(s)", + "value": 10938, + "unit": "ns", + "extra": "gctime=0\nmemory=44528\nallocs=87\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=relu, affine=false)(32 x 32)/zygote/GPU/CUDA", + "value": 276120, + "unit": "ns", + "extra": "gctime=0\nmemory=23208\nallocs=741\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=relu, affine=false)(32 x 32)/zygote/GPU/AMDGPU", + "value": 371078, + "unit": "ns", + "extra": "gctime=0\nmemory=49640\nallocs=1200\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(32, bias=false, act=gelu)(32 x 128)/forward/CPU/2 thread(s)", + "value": 26250, + "unit": "ns", + "extra": "gctime=0\nmemory=16640\nallocs=1\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(32, bias=false, act=gelu)(32 x 128)/forward/CPU/4 thread(s)", + "value": 26333, + "unit": "ns", + "extra": "gctime=0\nmemory=16640\nallocs=1\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(32, bias=false, act=gelu)(32 x 128)/forward/CPU/8 thread(s)", + "value": 26583, + "unit": "ns", + "extra": "gctime=0\nmemory=16640\nallocs=1\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(32, bias=false, act=gelu)(32 x 128)/forward/CPU/1 thread(s)", + "value": 26458, + "unit": "ns", + "extra": "gctime=0\nmemory=16640\nallocs=1\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(32, bias=false, act=gelu)(32 x 128)/forward/GPU/CUDA", + "value": 22942, + "unit": "ns", + "extra": "gctime=0\nmemory=960\nallocs=41\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(32, bias=false, act=gelu)(32 x 128)/forward/GPU/AMDGPU", + "value": 206526, + "unit": "ns", + "extra": "gctime=0\nmemory=2928\nallocs=138\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(32, bias=false, act=gelu)(32 x 128)/zygote/CPU/2 thread(s)", + "value": 67125, + "unit": "ns", + "extra": "gctime=0\nmemory=88640\nallocs=27\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(32, bias=false, act=gelu)(32 x 128)/zygote/CPU/4 thread(s)", + "value": 67333, + "unit": "ns", + "extra": "gctime=0\nmemory=88640\nallocs=27\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(32, bias=false, act=gelu)(32 x 128)/zygote/CPU/8 thread(s)", + "value": 68792, + "unit": "ns", + "extra": "gctime=0\nmemory=88640\nallocs=27\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(32, bias=false, act=gelu)(32 x 128)/zygote/CPU/1 thread(s)", + "value": 66875, + "unit": "ns", + "extra": "gctime=0\nmemory=88640\nallocs=27\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(32, bias=false, act=gelu)(32 x 128)/zygote/GPU/CUDA", + "value": 273858, + "unit": "ns", + "extra": "gctime=0\nmemory=16040\nallocs=591\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(32, bias=false, act=gelu)(32 x 128)/zygote/GPU/AMDGPU", + "value": 554115, + "unit": "ns", + "extra": "gctime=0\nmemory=14184\nallocs=359\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s)", + "value": 207166, + "unit": "ns", + "extra": "gctime=0\nmemory=132096\nallocs=20\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s)", + "value": 211667, + "unit": "ns", + "extra": "gctime=0\nmemory=132096\nallocs=20\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s)", + "value": 211167, + "unit": "ns", + "extra": "gctime=0\nmemory=132096\nallocs=20\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s)", + "value": 202875, + "unit": "ns", + "extra": "gctime=0\nmemory=132096\nallocs=20\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/CUDA", + "value": 27563, + "unit": "ns", + "extra": "gctime=0\nmemory=3376\nallocs=109\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/AMDGPU", + "value": 206546, + "unit": "ns", + "extra": "gctime=0\nmemory=8336\nallocs=276\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s)", + "value": 609937.5, + "unit": "ns", + "extra": "gctime=0\nmemory=1063936\nallocs=111\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s)", + "value": 669750, + "unit": "ns", + "extra": "gctime=0\nmemory=1063936\nallocs=111\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s)", + "value": 664812.5, + "unit": "ns", + "extra": "gctime=0\nmemory=1063936\nallocs=111\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s)", + "value": 609042, + "unit": "ns", + "extra": "gctime=0\nmemory=1063936\nallocs=111\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/CUDA", + "value": 233231.5, + "unit": "ns", + "extra": "gctime=0\nmemory=23240\nallocs=717\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/AMDGPU", + "value": 798562, + "unit": "ns", + "extra": "gctime=0\nmemory=79464\nallocs=1841\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s)", + "value": 664875, + "unit": "ns", + "extra": "gctime=0\nmemory=3146992\nallocs=25\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s)", + "value": 636687.5, + "unit": "ns", + "extra": "gctime=0\nmemory=3146992\nallocs=25\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s)", + "value": 648791.5, + "unit": "ns", + "extra": "gctime=0\nmemory=3146992\nallocs=25\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s)", + "value": 629792, + "unit": "ns", + "extra": "gctime=0\nmemory=3146992\nallocs=25\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/GPU/CUDA", + "value": 185894.5, + "unit": "ns", + "extra": "gctime=0\nmemory=14112\nallocs=539\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/GPU/AMDGPU", + "value": 349393, + "unit": "ns", + "extra": "gctime=0\nmemory=19472\nallocs=469\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s)", + "value": 2244229, + "unit": "ns", + "extra": "gctime=0\nmemory=17901328\nallocs=145\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s)", + "value": 2225354, + "unit": "ns", + "extra": "gctime=0\nmemory=17901328\nallocs=145\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s)", + "value": 2256708, + "unit": "ns", + "extra": "gctime=0\nmemory=17901328\nallocs=145\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s)", + "value": 2271792, + "unit": "ns", + "extra": "gctime=0\nmemory=17901328\nallocs=145\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/CUDA", + "value": 900927, + "unit": "ns", + "extra": "gctime=0\nmemory=72960\nallocs=2700\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/AMDGPU", + "value": 1235829, + "unit": "ns", + "extra": "gctime=0\nmemory=78432\nallocs=1595\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s)", + "value": 19333, + "unit": "ns", + "extra": "gctime=0\nmemory=134544\nallocs=43\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s)", + "value": 21166.5, + "unit": "ns", + "extra": "gctime=0\nmemory=134544\nallocs=43\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s)", + "value": 22375, + "unit": "ns", + "extra": "gctime=0\nmemory=134544\nallocs=43\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s)", + "value": 19958, + "unit": "ns", + "extra": "gctime=0\nmemory=134544\nallocs=43\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/CUDA", + "value": 106770.5, + "unit": "ns", + "extra": "gctime=0\nmemory=10608\nallocs=270\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/AMDGPU", + "value": 89387, + "unit": "ns", + "extra": "gctime=0\nmemory=17120\nallocs=338\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s)", + "value": 227250, + "unit": "ns", + "extra": "gctime=0\nmemory=936864\nallocs=120\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s)", + "value": 262312.5, + "unit": "ns", + "extra": "gctime=0\nmemory=936864\nallocs=120\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s)", + "value": 231250, + "unit": "ns", + "extra": "gctime=0\nmemory=936864\nallocs=120\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s)", + "value": 222770.5, + "unit": "ns", + "extra": "gctime=0\nmemory=936864\nallocs=120\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/CUDA", + "value": 700957, + "unit": "ns", + "extra": "gctime=0\nmemory=57112\nallocs=1534\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/AMDGPU", + "value": 516550, + "unit": "ns", + "extra": "gctime=0\nmemory=73304\nallocs=1381\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/2 thread(s)", + "value": 500, + "unit": "ns", + "extra": "gctime=0\nmemory=5440\nallocs=19\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/4 thread(s)", + "value": 584, + "unit": "ns", + "extra": "gctime=0\nmemory=5440\nallocs=19\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/8 thread(s)", + "value": 584, + "unit": "ns", + "extra": "gctime=0\nmemory=5440\nallocs=19\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/1 thread(s)", + "value": 584, + "unit": "ns", + "extra": "gctime=0\nmemory=5440\nallocs=19\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=relu, affine=true)(32 x 32)/forward/GPU/CUDA", + "value": 22928, + "unit": "ns", + "extra": "gctime=0\nmemory=3072\nallocs=104\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=relu, affine=true)(32 x 32)/forward/GPU/AMDGPU", + "value": 44243, + "unit": "ns", + "extra": "gctime=0\nmemory=6512\nallocs=165\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/2 thread(s)", + "value": 9583, + "unit": "ns", + "extra": "gctime=0\nmemory=44992\nallocs=89\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/4 thread(s)", + "value": 9958.5, + "unit": "ns", + "extra": "gctime=0\nmemory=44992\nallocs=89\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/8 thread(s)", + "value": 13229.5, + "unit": "ns", + "extra": "gctime=0\nmemory=44992\nallocs=89\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/1 thread(s)", + "value": 10875, + "unit": "ns", + "extra": "gctime=0\nmemory=44992\nallocs=89\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=relu, affine=true)(32 x 32)/zygote/GPU/CUDA", + "value": 258192, + "unit": "ns", + "extra": "gctime=0\nmemory=22232\nallocs=677\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=relu, affine=true)(32 x 32)/zygote/GPU/AMDGPU", + "value": 395479, + "unit": "ns", + "extra": "gctime=0\nmemory=56168\nallocs=1360\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/2 thread(s)", + "value": 8062.5, + "unit": "ns", + "extra": "gctime=0\nmemory=3360\nallocs=32\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/4 thread(s)", + "value": 9208, + "unit": "ns", + "extra": "gctime=0\nmemory=3360\nallocs=32\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/8 thread(s)", + "value": 10459, + "unit": "ns", + "extra": "gctime=0\nmemory=3360\nallocs=32\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/1 thread(s)", + "value": 8333, + "unit": "ns", + "extra": "gctime=0\nmemory=3360\nallocs=32\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=identity, affine=false)(4 x 32)/forward/GPU/CUDA", + "value": 112863.5, + "unit": "ns", + "extra": "gctime=0\nmemory=9440\nallocs=238\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=identity, affine=false)(4 x 32)/forward/GPU/AMDGPU", + "value": 72315, + "unit": "ns", + "extra": "gctime=0\nmemory=15472\nallocs=335\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/2 thread(s)", + "value": 7500, + "unit": "ns", + "extra": "gctime=0\nmemory=17552\nallocs=88\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/4 thread(s)", + "value": 7750, + "unit": "ns", + "extra": "gctime=0\nmemory=17552\nallocs=88\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/8 thread(s)", + "value": 14875, + "unit": "ns", + "extra": "gctime=0\nmemory=17552\nallocs=88\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/1 thread(s)", + "value": 8917, + "unit": "ns", + "extra": "gctime=0\nmemory=17552\nallocs=88\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=identity, affine=false)(4 x 32)/zygote/GPU/CUDA", + "value": 472419, + "unit": "ns", + "extra": "gctime=0\nmemory=42544\nallocs=1162\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=identity, affine=false)(4 x 32)/zygote/GPU/AMDGPU", + "value": 321811, + "unit": "ns", + "extra": "gctime=0\nmemory=49520\nallocs=941\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "bias_activation(2, act=gelu)(2 x 128)/forward/CPU/2 thread(s)", + "value": 1979.5, + "unit": "ns", + "extra": "gctime=0\nmemory=1280\nallocs=5\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "bias_activation(2, act=gelu)(2 x 128)/forward/CPU/4 thread(s)", + "value": 2500, + "unit": "ns", + "extra": "gctime=0\nmemory=1280\nallocs=5\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "bias_activation(2, act=gelu)(2 x 128)/forward/CPU/8 thread(s)", + "value": 2542, + "unit": "ns", + "extra": "gctime=0\nmemory=1280\nallocs=5\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "bias_activation(2, act=gelu)(2 x 128)/forward/CPU/1 thread(s)", + "value": 2416, + "unit": "ns", + "extra": "gctime=0\nmemory=1280\nallocs=5\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "bias_activation(2, act=gelu)(2 x 128)/forward/GPU/CUDA", + "value": 19845, + "unit": "ns", + "extra": "gctime=0\nmemory=2144\nallocs=92\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "bias_activation(2, act=gelu)(2 x 128)/forward/GPU/AMDGPU", + "value": 191508, + "unit": "ns", + "extra": "gctime=0\nmemory=3616\nallocs=178\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "bias_activation(2, act=gelu)(2 x 128)/zygote/CPU/2 thread(s)", + "value": 6666, + "unit": "ns", + "extra": "gctime=0\nmemory=5728\nallocs=34\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "bias_activation(2, act=gelu)(2 x 128)/zygote/CPU/4 thread(s)", + "value": 6459, + "unit": "ns", + "extra": "gctime=0\nmemory=5728\nallocs=34\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "bias_activation(2, act=gelu)(2 x 128)/zygote/CPU/8 thread(s)", + "value": 7292, + "unit": "ns", + "extra": "gctime=0\nmemory=5728\nallocs=34\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "bias_activation(2, act=gelu)(2 x 128)/zygote/CPU/1 thread(s)", + "value": 7292, + "unit": "ns", + "extra": "gctime=0\nmemory=5728\nallocs=34\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "bias_activation(2, act=gelu)(2 x 128)/zygote/GPU/CUDA", + "value": 208409, + "unit": "ns", + "extra": "gctime=0\nmemory=14920\nallocs=572\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "bias_activation(2, act=gelu)(2 x 128)/zygote/GPU/AMDGPU", + "value": 543621, + "unit": "ns", + "extra": "gctime=0\nmemory=14232\nallocs=319\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "bias_activation(512, act=tanh)(512 x 128)/forward/CPU/2 thread(s)", + "value": 754167, + "unit": "ns", + "extra": "gctime=0\nmemory=262384\nallocs=6\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "bias_activation(512, act=tanh)(512 x 128)/forward/CPU/4 thread(s)", + "value": 751000, + "unit": "ns", + "extra": "gctime=0\nmemory=262384\nallocs=6\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "bias_activation(512, act=tanh)(512 x 128)/forward/CPU/8 thread(s)", + "value": 749375, + "unit": "ns", + "extra": "gctime=0\nmemory=262384\nallocs=6\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "bias_activation(512, act=tanh)(512 x 128)/forward/CPU/1 thread(s)", + "value": 747104, + "unit": "ns", + "extra": "gctime=0\nmemory=262384\nallocs=6\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "bias_activation(512, act=tanh)(512 x 128)/forward/GPU/CUDA", + "value": 22303, + "unit": "ns", + "extra": "gctime=0\nmemory=2144\nallocs=92\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "bias_activation(512, act=tanh)(512 x 128)/forward/GPU/AMDGPU", + "value": 47829, + "unit": "ns", + "extra": "gctime=0\nmemory=1808\nallocs=65\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "bias_activation(512, act=tanh)(512 x 128)/zygote/CPU/2 thread(s)", + "value": 792250, + "unit": "ns", + "extra": "gctime=0\nmemory=789264\nallocs=16\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "bias_activation(512, act=tanh)(512 x 128)/zygote/CPU/4 thread(s)", + "value": 811750, + "unit": "ns", + "extra": "gctime=0\nmemory=789264\nallocs=16\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "bias_activation(512, act=tanh)(512 x 128)/zygote/CPU/8 thread(s)", + "value": 789500, + "unit": "ns", + "extra": "gctime=0\nmemory=789264\nallocs=16\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "bias_activation(512, act=tanh)(512 x 128)/zygote/CPU/1 thread(s)", + "value": 794229.5, + "unit": "ns", + "extra": "gctime=0\nmemory=789264\nallocs=16\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "bias_activation(512, act=tanh)(512 x 128)/zygote/GPU/CUDA", + "value": 206590.5, + "unit": "ns", + "extra": "gctime=0\nmemory=14376\nallocs=530\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "bias_activation(512, act=tanh)(512 x 128)/zygote/GPU/AMDGPU", + "value": 233541, + "unit": "ns", + "extra": "gctime=0\nmemory=16232\nallocs=378\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s)", + "value": 7250, + "unit": "ns", + "extra": "gctime=0\nmemory=132096\nallocs=20\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s)", + "value": 5917, + "unit": "ns", + "extra": "gctime=0\nmemory=132096\nallocs=20\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s)", + "value": 6000, + "unit": "ns", + "extra": "gctime=0\nmemory=132096\nallocs=20\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s)", + "value": 10209, + "unit": "ns", + "extra": "gctime=0\nmemory=132096\nallocs=20\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/CUDA", + "value": 32976, + "unit": "ns", + "extra": "gctime=0\nmemory=3712\nallocs=128\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/AMDGPU", + "value": 57267, + "unit": "ns", + "extra": "gctime=0\nmemory=6448\nallocs=168\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s)", + "value": 228458.5, + "unit": "ns", + "extra": "gctime=0\nmemory=932560\nallocs=107\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s)", + "value": 269270.5, + "unit": "ns", + "extra": "gctime=0\nmemory=932560\nallocs=107\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s)", + "value": 235021, + "unit": "ns", + "extra": "gctime=0\nmemory=932560\nallocs=107\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s)", + "value": 213146, + "unit": "ns", + "extra": "gctime=0\nmemory=932560\nallocs=107\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/CUDA", + "value": 254662, + "unit": "ns", + "extra": "gctime=0\nmemory=22712\nallocs=712\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/AMDGPU", + "value": 552652, + "unit": "ns", + "extra": "gctime=0\nmemory=75992\nallocs=1532\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/2 thread(s)", + "value": 12417, + "unit": "ns", + "extra": "gctime=0\nmemory=5408\nallocs=13\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/4 thread(s)", + "value": 13250, + "unit": "ns", + "extra": "gctime=0\nmemory=5408\nallocs=13\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/8 thread(s)", + "value": 14458, + "unit": "ns", + "extra": "gctime=0\nmemory=5408\nallocs=13\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/1 thread(s)", + "value": 13000, + "unit": "ns", + "extra": "gctime=0\nmemory=5408\nallocs=13\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=gelu, affine=false)(32 x 32)/forward/GPU/CUDA", + "value": 131273.5, + "unit": "ns", + "extra": "gctime=0\nmemory=9952\nallocs=395\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=gelu, affine=false)(32 x 32)/forward/GPU/AMDGPU", + "value": 231363, + "unit": "ns", + "extra": "gctime=0\nmemory=12288\nallocs=357\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/2 thread(s)", + "value": 24854.5, + "unit": "ns", + "extra": "gctime=0\nmemory=53696\nallocs=102\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/4 thread(s)", + "value": 24916, + "unit": "ns", + "extra": "gctime=0\nmemory=53696\nallocs=102\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/8 thread(s)", + "value": 25542, + "unit": "ns", + "extra": "gctime=0\nmemory=53696\nallocs=102\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/1 thread(s)", + "value": 24458, + "unit": "ns", + "extra": "gctime=0\nmemory=53696\nallocs=102\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=gelu, affine=false)(32 x 32)/zygote/GPU/CUDA", + "value": 813324, + "unit": "ns", + "extra": "gctime=0\nmemory=67600\nallocs=2513\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=gelu, affine=false)(32 x 32)/zygote/GPU/AMDGPU", + "value": 634495, + "unit": "ns", + "extra": "gctime=0\nmemory=56992\nallocs=1171\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/2 thread(s)", + "value": 8875, + "unit": "ns", + "extra": "gctime=0\nmemory=7344\nallocs=36\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/4 thread(s)", + "value": 9958, + "unit": "ns", + "extra": "gctime=0\nmemory=7344\nallocs=36\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/8 thread(s)", + "value": 11167, + "unit": "ns", + "extra": "gctime=0\nmemory=7344\nallocs=36\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/1 thread(s)", + "value": 9542, + "unit": "ns", + "extra": "gctime=0\nmemory=7344\nallocs=36\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=identity, affine=true)(32 x 32)/forward/GPU/CUDA", + "value": 116553, + "unit": "ns", + "extra": "gctime=0\nmemory=10256\nallocs=270\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=identity, affine=true)(32 x 32)/forward/GPU/AMDGPU", + "value": 74930, + "unit": "ns", + "extra": "gctime=0\nmemory=16032\nallocs=345\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/2 thread(s)", + "value": 13770.5, + "unit": "ns", + "extra": "gctime=0\nmemory=43568\nallocs=100\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/4 thread(s)", + "value": 14917, + "unit": "ns", + "extra": "gctime=0\nmemory=43568\nallocs=100\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/8 thread(s)", + "value": 15916, + "unit": "ns", + "extra": "gctime=0\nmemory=43568\nallocs=100\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/1 thread(s)", + "value": 16437.5, + "unit": "ns", + "extra": "gctime=0\nmemory=43568\nallocs=100\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=identity, affine=true)(32 x 32)/zygote/GPU/CUDA", + "value": 621843, + "unit": "ns", + "extra": "gctime=0\nmemory=51040\nallocs=1361\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=identity, affine=true)(32 x 32)/zygote/GPU/AMDGPU", + "value": 356836, + "unit": "ns", + "extra": "gctime=0\nmemory=62272\nallocs=1171\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/2 thread(s)", + "value": 9145.5, + "unit": "ns", + "extra": "gctime=0\nmemory=7088\nallocs=32\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/4 thread(s)", + "value": 9354, + "unit": "ns", + "extra": "gctime=0\nmemory=7088\nallocs=32\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/8 thread(s)", + "value": 10750, + "unit": "ns", + "extra": "gctime=0\nmemory=7088\nallocs=32\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/1 thread(s)", + "value": 10125, + "unit": "ns", + "extra": "gctime=0\nmemory=7088\nallocs=32\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=identity, affine=false)(32 x 32)/forward/GPU/CUDA", + "value": 116468, + "unit": "ns", + "extra": "gctime=0\nmemory=9568\nallocs=246\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=identity, affine=false)(32 x 32)/forward/GPU/AMDGPU", + "value": 74383.5, + "unit": "ns", + "extra": "gctime=0\nmemory=15184\nallocs=317\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/2 thread(s)", + "value": 12916, + "unit": "ns", + "extra": "gctime=0\nmemory=40112\nallocs=88\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/4 thread(s)", + "value": 12959, + "unit": "ns", + "extra": "gctime=0\nmemory=40112\nallocs=88\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/8 thread(s)", + "value": 20541, + "unit": "ns", + "extra": "gctime=0\nmemory=40112\nallocs=88\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/1 thread(s)", + "value": 14500, + "unit": "ns", + "extra": "gctime=0\nmemory=40112\nallocs=88\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=identity, affine=false)(32 x 32)/zygote/GPU/CUDA", + "value": 515709, + "unit": "ns", + "extra": "gctime=0\nmemory=44384\nallocs=1226\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=identity, affine=false)(32 x 32)/zygote/GPU/AMDGPU", + "value": 328534, + "unit": "ns", + "extra": "gctime=0\nmemory=52816\nallocs=1036\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(2, Bsize=128)/forward/CPU/2 thread(s)", + "value": 31062, + "unit": "ns", + "extra": "gctime=0\nmemory=2304\nallocs=1\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(2, Bsize=128)/forward/CPU/4 thread(s)", + "value": 33146, + "unit": "ns", + "extra": "gctime=0\nmemory=2304\nallocs=1\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(2, Bsize=128)/forward/CPU/8 thread(s)", + "value": 30750, + "unit": "ns", + "extra": "gctime=0\nmemory=2304\nallocs=1\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(2, Bsize=128)/forward/CPU/1 thread(s)", + "value": 1833, + "unit": "ns", + "extra": "gctime=0\nmemory=2304\nallocs=1\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(2, Bsize=128)/forward/GPU/CUDA", + "value": 16169, + "unit": "ns", + "extra": "gctime=0\nmemory=512\nallocs=23\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(2, Bsize=128)/forward/GPU/AMDGPU", + "value": 77564, + "unit": "ns", + "extra": "gctime=0\nmemory=5152\nallocs=88\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(2, Bsize=128)/zygote/CPU/2 thread(s)", + "value": 5562.5, + "unit": "ns", + "extra": "gctime=0\nmemory=10064\nallocs=27\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(2, Bsize=128)/zygote/CPU/4 thread(s)", + "value": 5312.5, + "unit": "ns", + "extra": "gctime=0\nmemory=10064\nallocs=27\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(2, Bsize=128)/zygote/CPU/8 thread(s)", + "value": 7208, + "unit": "ns", + "extra": "gctime=0\nmemory=10064\nallocs=27\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(2, Bsize=128)/zygote/CPU/1 thread(s)", + "value": 7834, + "unit": "ns", + "extra": "gctime=0\nmemory=9872\nallocs=25\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(2, Bsize=128)/zygote/GPU/CUDA", + "value": 134922, + "unit": "ns", + "extra": "gctime=0\nmemory=8496\nallocs=319\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(2, Bsize=128)/zygote/GPU/AMDGPU", + "value": 340125, + "unit": "ns", + "extra": "gctime=0\nmemory=23760\nallocs=363\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/2 thread(s)", + "value": 292, + "unit": "ns", + "extra": "gctime=0\nmemory=1440\nallocs=19\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/4 thread(s)", + "value": 375, + "unit": "ns", + "extra": "gctime=0\nmemory=1440\nallocs=19\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/8 thread(s)", + "value": 375, + "unit": "ns", + "extra": "gctime=0\nmemory=1440\nallocs=19\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/1 thread(s)", + "value": 375, + "unit": "ns", + "extra": "gctime=0\nmemory=1440\nallocs=19\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=identity, affine=true)(4 x 32)/forward/GPU/CUDA", + "value": 24307, + "unit": "ns", + "extra": "gctime=0\nmemory=3648\nallocs=118\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=identity, affine=true)(4 x 32)/forward/GPU/AMDGPU", + "value": 45845, + "unit": "ns", + "extra": "gctime=0\nmemory=6624\nallocs=172\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/2 thread(s)", + "value": 6166.5, + "unit": "ns", + "extra": "gctime=0\nmemory=16752\nallocs=88\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/4 thread(s)", + "value": 6708, + "unit": "ns", + "extra": "gctime=0\nmemory=16752\nallocs=88\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/8 thread(s)", + "value": 8167, + "unit": "ns", + "extra": "gctime=0\nmemory=16752\nallocs=88\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/1 thread(s)", + "value": 7083, + "unit": "ns", + "extra": "gctime=0\nmemory=16752\nallocs=88\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=identity, affine=true)(4 x 32)/zygote/GPU/CUDA", + "value": 179926.5, + "unit": "ns", + "extra": "gctime=0\nmemory=19152\nallocs=552\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=identity, affine=true)(4 x 32)/zygote/GPU/AMDGPU", + "value": 372385.5, + "unit": "ns", + "extra": "gctime=0\nmemory=52192\nallocs=1271\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/2 thread(s)", + "value": 5834, + "unit": "ns", + "extra": "gctime=0\nmemory=5440\nallocs=19\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/4 thread(s)", + "value": 5833, + "unit": "ns", + "extra": "gctime=0\nmemory=5440\nallocs=19\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/8 thread(s)", + "value": 5875, + "unit": "ns", + "extra": "gctime=0\nmemory=5440\nallocs=19\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/1 thread(s)", + "value": 5958, + "unit": "ns", + "extra": "gctime=0\nmemory=5440\nallocs=19\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=gelu, affine=true)(32 x 32)/forward/GPU/CUDA", + "value": 25187, + "unit": "ns", + "extra": "gctime=0\nmemory=3648\nallocs=118\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=gelu, affine=true)(32 x 32)/forward/GPU/AMDGPU", + "value": 201636, + "unit": "ns", + "extra": "gctime=0\nmemory=7408\nallocs=221\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/2 thread(s)", + "value": 21041, + "unit": "ns", + "extra": "gctime=0\nmemory=49344\nallocs=90\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/4 thread(s)", + "value": 21709, + "unit": "ns", + "extra": "gctime=0\nmemory=49344\nallocs=90\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/8 thread(s)", + "value": 23458, + "unit": "ns", + "extra": "gctime=0\nmemory=49344\nallocs=90\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/1 thread(s)", + "value": 26125, + "unit": "ns", + "extra": "gctime=0\nmemory=49344\nallocs=90\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=gelu, affine=true)(32 x 32)/zygote/GPU/CUDA", + "value": 262884, + "unit": "ns", + "extra": "gctime=0\nmemory=23816\nallocs=746\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=gelu, affine=true)(32 x 32)/zygote/GPU/AMDGPU", + "value": 615780.5, + "unit": "ns", + "extra": "gctime=0\nmemory=56616\nallocs=1379\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s)", + "value": 192083.5, + "unit": "ns", + "extra": "gctime=0\nmemory=1050128\nallocs=23\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s)", + "value": 158917, + "unit": "ns", + "extra": "gctime=0\nmemory=1050128\nallocs=23\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s)", + "value": 154416.5, + "unit": "ns", + "extra": "gctime=0\nmemory=1050128\nallocs=23\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s)", + "value": 146417, + "unit": "ns", + "extra": "gctime=0\nmemory=1050128\nallocs=23\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/CUDA", + "value": 184640, + "unit": "ns", + "extra": "gctime=0\nmemory=13152\nallocs=479\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/AMDGPU", + "value": 215472.5, + "unit": "ns", + "extra": "gctime=0\nmemory=19168\nallocs=450\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s)", + "value": 1319792, + "unit": "ns", + "extra": "gctime=0\nmemory=10496496\nallocs=133\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s)", + "value": 1328249.5, + "unit": "ns", + "extra": "gctime=0\nmemory=10496496\nallocs=133\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s)", + "value": 1347250, + "unit": "ns", + "extra": "gctime=0\nmemory=10496496\nallocs=133\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s)", + "value": 1337000, + "unit": "ns", + "extra": "gctime=0\nmemory=10496496\nallocs=133\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/CUDA", + "value": 844907, + "unit": "ns", + "extra": "gctime=0\nmemory=71200\nallocs=2633\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/AMDGPU", + "value": 1041340, + "unit": "ns", + "extra": "gctime=0\nmemory=74448\nallocs=1553\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s)", + "value": 24292, + "unit": "ns", + "extra": "gctime=0\nmemory=132624\nallocs=23\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s)", + "value": 24916, + "unit": "ns", + "extra": "gctime=0\nmemory=132624\nallocs=23\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s)", + "value": 28000, + "unit": "ns", + "extra": "gctime=0\nmemory=132624\nallocs=23\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s)", + "value": 24833.5, + "unit": "ns", + "extra": "gctime=0\nmemory=132624\nallocs=23\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/CUDA", + "value": 224694.5, + "unit": "ns", + "extra": "gctime=0\nmemory=13152\nallocs=479\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/AMDGPU", + "value": 130334, + "unit": "ns", + "extra": "gctime=0\nmemory=18336\nallocs=398\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s)", + "value": 117583, + "unit": "ns", + "extra": "gctime=0\nmemory=1321456\nallocs=133\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s)", + "value": 131375, + "unit": "ns", + "extra": "gctime=0\nmemory=1321456\nallocs=133\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s)", + "value": 160499.5, + "unit": "ns", + "extra": "gctime=0\nmemory=1321456\nallocs=133\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s)", + "value": 164750, + "unit": "ns", + "extra": "gctime=0\nmemory=1321456\nallocs=133\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/CUDA", + "value": 967206, + "unit": "ns", + "extra": "gctime=0\nmemory=71200\nallocs=2633\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/AMDGPU", + "value": 585053, + "unit": "ns", + "extra": "gctime=0\nmemory=70752\nallocs=1469\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/2 thread(s)", + "value": 250, + "unit": "ns", + "extra": "gctime=0\nmemory=1440\nallocs=19\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/4 thread(s)", + "value": 334, + "unit": "ns", + "extra": "gctime=0\nmemory=1440\nallocs=19\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/8 thread(s)", + "value": 375, + "unit": "ns", + "extra": "gctime=0\nmemory=1440\nallocs=19\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/1 thread(s)", + "value": 375, + "unit": "ns", + "extra": "gctime=0\nmemory=1440\nallocs=19\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=relu, affine=true)(4 x 32)/forward/GPU/CUDA", + "value": 22932, + "unit": "ns", + "extra": "gctime=0\nmemory=3072\nallocs=104\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=relu, affine=true)(4 x 32)/forward/GPU/AMDGPU", + "value": 47870, + "unit": "ns", + "extra": "gctime=0\nmemory=6032\nallocs=135\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/2 thread(s)", + "value": 6292, + "unit": "ns", + "extra": "gctime=0\nmemory=17328\nallocs=89\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/4 thread(s)", + "value": 6833, + "unit": "ns", + "extra": "gctime=0\nmemory=17328\nallocs=89\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/8 thread(s)", + "value": 9416, + "unit": "ns", + "extra": "gctime=0\nmemory=17328\nallocs=89\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/1 thread(s)", + "value": 7500, + "unit": "ns", + "extra": "gctime=0\nmemory=17328\nallocs=89\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=relu, affine=true)(4 x 32)/zygote/GPU/CUDA", + "value": 196587.5, + "unit": "ns", + "extra": "gctime=0\nmemory=20648\nallocs=629\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=relu, affine=true)(4 x 32)/zygote/GPU/AMDGPU", + "value": 380031, + "unit": "ns", + "extra": "gctime=0\nmemory=52712\nallocs=1259\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/2 thread(s)", + "value": 5875, + "unit": "ns", + "extra": "gctime=0\nmemory=2400\nallocs=13\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/4 thread(s)", + "value": 6292, + "unit": "ns", + "extra": "gctime=0\nmemory=2400\nallocs=13\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/8 thread(s)", + "value": 7187.5, + "unit": "ns", + "extra": "gctime=0\nmemory=2400\nallocs=13\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/1 thread(s)", + "value": 6562, + "unit": "ns", + "extra": "gctime=0\nmemory=2400\nallocs=13\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=gelu, affine=true)(4 x 32)/forward/GPU/CUDA", + "value": 134586, + "unit": "ns", + "extra": "gctime=0\nmemory=10912\nallocs=455\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=gelu, affine=true)(4 x 32)/forward/GPU/AMDGPU", + "value": 230170, + "unit": "ns", + "extra": "gctime=0\nmemory=10848\nallocs=267\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/2 thread(s)", + "value": 9833, + "unit": "ns", + "extra": "gctime=0\nmemory=17872\nallocs=88\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/4 thread(s)", + "value": 10000, + "unit": "ns", + "extra": "gctime=0\nmemory=17872\nallocs=88\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/8 thread(s)", + "value": 11187.5, + "unit": "ns", + "extra": "gctime=0\nmemory=17872\nallocs=88\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/1 thread(s)", + "value": 11083, + "unit": "ns", + "extra": "gctime=0\nmemory=17872\nallocs=88\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=gelu, affine=true)(4 x 32)/zygote/GPU/CUDA", + "value": 840176, + "unit": "ns", + "extra": "gctime=0\nmemory=67904\nallocs=2622\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=gelu, affine=true)(4 x 32)/zygote/GPU/AMDGPU", + "value": 631290, + "unit": "ns", + "extra": "gctime=0\nmemory=57296\nallocs=1212\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(2, bias=false, act=gelu)(2 x 128)/forward/CPU/2 thread(s)", + "value": 1542, + "unit": "ns", + "extra": "gctime=0\nmemory=1088\nallocs=1\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(2, bias=false, act=gelu)(2 x 128)/forward/CPU/4 thread(s)", + "value": 1625, + "unit": "ns", + "extra": "gctime=0\nmemory=1088\nallocs=1\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(2, bias=false, act=gelu)(2 x 128)/forward/CPU/8 thread(s)", + "value": 1625, + "unit": "ns", + "extra": "gctime=0\nmemory=1088\nallocs=1\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(2, bias=false, act=gelu)(2 x 128)/forward/CPU/1 thread(s)", + "value": 1625, + "unit": "ns", + "extra": "gctime=0\nmemory=1088\nallocs=1\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(2, bias=false, act=gelu)(2 x 128)/forward/GPU/CUDA", + "value": 22272, + "unit": "ns", + "extra": "gctime=0\nmemory=960\nallocs=41\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(2, bias=false, act=gelu)(2 x 128)/forward/GPU/AMDGPU", + "value": 204933, + "unit": "ns", + "extra": "gctime=0\nmemory=4256\nallocs=221\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(2, bias=false, act=gelu)(2 x 128)/zygote/CPU/2 thread(s)", + "value": 5750, + "unit": "ns", + "extra": "gctime=0\nmemory=6608\nallocs=27\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(2, bias=false, act=gelu)(2 x 128)/zygote/CPU/4 thread(s)", + "value": 6125, + "unit": "ns", + "extra": "gctime=0\nmemory=6608\nallocs=27\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(2, bias=false, act=gelu)(2 x 128)/zygote/CPU/8 thread(s)", + "value": 6417, + "unit": "ns", + "extra": "gctime=0\nmemory=6608\nallocs=27\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(2, bias=false, act=gelu)(2 x 128)/zygote/CPU/1 thread(s)", + "value": 5875, + "unit": "ns", + "extra": "gctime=0\nmemory=6608\nallocs=27\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(2, bias=false, act=gelu)(2 x 128)/zygote/GPU/CUDA", + "value": 216977, + "unit": "ns", + "extra": "gctime=0\nmemory=14456\nallocs=543\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(2, bias=false, act=gelu)(2 x 128)/zygote/GPU/AMDGPU", + "value": 491814.5, + "unit": "ns", + "extra": "gctime=0\nmemory=11112\nallocs=282\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/2 thread(s)", + "value": 8250, + "unit": "ns", + "extra": "gctime=0\nmemory=3616\nallocs=36\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/4 thread(s)", + "value": 8562.5, + "unit": "ns", + "extra": "gctime=0\nmemory=3616\nallocs=36\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/8 thread(s)", + "value": 9895.5, + "unit": "ns", + "extra": "gctime=0\nmemory=3616\nallocs=36\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/1 thread(s)", + "value": 9209, + "unit": "ns", + "extra": "gctime=0\nmemory=3616\nallocs=36\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=identity, affine=true)(4 x 32)/forward/GPU/CUDA", + "value": 115063, + "unit": "ns", + "extra": "gctime=0\nmemory=10128\nallocs=262\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=identity, affine=true)(4 x 32)/forward/GPU/AMDGPU", + "value": 73999, + "unit": "ns", + "extra": "gctime=0\nmemory=15808\nallocs=331\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/2 thread(s)", + "value": 8167, + "unit": "ns", + "extra": "gctime=0\nmemory=20784\nallocs=100\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/4 thread(s)", + "value": 9250, + "unit": "ns", + "extra": "gctime=0\nmemory=20784\nallocs=100\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/8 thread(s)", + "value": 9833.5, + "unit": "ns", + "extra": "gctime=0\nmemory=20784\nallocs=100\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/1 thread(s)", + "value": 10333, + "unit": "ns", + "extra": "gctime=0\nmemory=20784\nallocs=100\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=identity, affine=true)(4 x 32)/zygote/GPU/CUDA", + "value": 548589, + "unit": "ns", + "extra": "gctime=0\nmemory=49200\nallocs=1297\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=identity, affine=true)(4 x 32)/zygote/GPU/AMDGPU", + "value": 340367, + "unit": "ns", + "extra": "gctime=0\nmemory=59024\nallocs=1083\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(128, Bsize=4)/forward/CPU/2 thread(s)", + "value": 127271, + "unit": "ns", + "extra": "gctime=0\nmemory=262208\nallocs=2\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(128, Bsize=4)/forward/CPU/4 thread(s)", + "value": 128750, + "unit": "ns", + "extra": "gctime=0\nmemory=262208\nallocs=2\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(128, Bsize=4)/forward/CPU/8 thread(s)", + "value": 131062, + "unit": "ns", + "extra": "gctime=0\nmemory=262208\nallocs=2\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(128, Bsize=4)/forward/CPU/1 thread(s)", + "value": 181979.5, + "unit": "ns", + "extra": "gctime=0\nmemory=262208\nallocs=2\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(128, Bsize=4)/forward/GPU/CUDA", + "value": 46303.5, + "unit": "ns", + "extra": "gctime=0\nmemory=512\nallocs=23\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(128, Bsize=4)/forward/GPU/AMDGPU", + "value": 102121, + "unit": "ns", + "extra": "gctime=0\nmemory=2080\nallocs=82\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(128, Bsize=4)/zygote/CPU/2 thread(s)", + "value": 338125, + "unit": "ns", + "extra": "gctime=0\nmemory=1049680\nallocs=31\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(128, Bsize=4)/zygote/CPU/4 thread(s)", + "value": 339792, + "unit": "ns", + "extra": "gctime=0\nmemory=1049680\nallocs=31\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(128, Bsize=4)/zygote/CPU/8 thread(s)", + "value": 346083, + "unit": "ns", + "extra": "gctime=0\nmemory=1049680\nallocs=31\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(128, Bsize=4)/zygote/CPU/1 thread(s)", + "value": 595417, + "unit": "ns", + "extra": "gctime=0\nmemory=1049488\nallocs=29\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(128, Bsize=4)/zygote/GPU/CUDA", + "value": 181951, + "unit": "ns", + "extra": "gctime=0\nmemory=10112\nallocs=368\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(128, Bsize=4)/zygote/GPU/AMDGPU", + "value": 410627.5, + "unit": "ns", + "extra": "gctime=0\nmemory=14720\nallocs=374\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(512, bias=true, act=identity)(512 x 128)/forward/CPU/2 thread(s)", + "value": 397708, + "unit": "ns", + "extra": "gctime=0\nmemory=262192\nallocs=2\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(512, bias=true, act=identity)(512 x 128)/forward/CPU/4 thread(s)", + "value": 288375, + "unit": "ns", + "extra": "gctime=0\nmemory=262192\nallocs=2\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(512, bias=true, act=identity)(512 x 128)/forward/CPU/8 thread(s)", + "value": 287937.5, + "unit": "ns", + "extra": "gctime=0\nmemory=262192\nallocs=2\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(512, bias=true, act=identity)(512 x 128)/forward/CPU/1 thread(s)", + "value": 756708, + "unit": "ns", + "extra": "gctime=0\nmemory=262192\nallocs=2\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(512, bias=true, act=identity)(512 x 128)/forward/GPU/CUDA", + "value": 43092, + "unit": "ns", + "extra": "gctime=0\nmemory=1056\nallocs=47\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(512, bias=true, act=identity)(512 x 128)/forward/GPU/AMDGPU", + "value": 85671, + "unit": "ns", + "extra": "gctime=0\nmemory=2656\nallocs=121\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(512, bias=true, act=identity)(512 x 128)/zygote/CPU/2 thread(s)", + "value": 1456291.5, + "unit": "ns", + "extra": "gctime=0\nmemory=1837680\nallocs=13\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(512, bias=true, act=identity)(512 x 128)/zygote/CPU/4 thread(s)", + "value": 1133125, + "unit": "ns", + "extra": "gctime=0\nmemory=1837680\nallocs=13\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(512, bias=true, act=identity)(512 x 128)/zygote/CPU/8 thread(s)", + "value": 1127937.5, + "unit": "ns", + "extra": "gctime=0\nmemory=1837680\nallocs=13\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(512, bias=true, act=identity)(512 x 128)/zygote/CPU/1 thread(s)", + "value": 2360208, + "unit": "ns", + "extra": "gctime=0\nmemory=1837680\nallocs=13\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(512, bias=true, act=identity)(512 x 128)/zygote/GPU/CUDA", + "value": 248595.5, + "unit": "ns", + "extra": "gctime=0\nmemory=13760\nallocs=490\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(512, bias=true, act=identity)(512 x 128)/zygote/GPU/AMDGPU", + "value": 266317, + "unit": "ns", + "extra": "gctime=0\nmemory=16896\nallocs=436\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s)", + "value": 643479.5, + "unit": "ns", + "extra": "gctime=0\nmemory=3146992\nallocs=25\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s)", + "value": 654166, + "unit": "ns", + "extra": "gctime=0\nmemory=3146992\nallocs=25\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s)", + "value": 652750, + "unit": "ns", + "extra": "gctime=0\nmemory=3146992\nallocs=25\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s)", + "value": 650625, + "unit": "ns", + "extra": "gctime=0\nmemory=3146992\nallocs=25\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/CUDA", + "value": 172424.5, + "unit": "ns", + "extra": "gctime=0\nmemory=14112\nallocs=539\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/AMDGPU", + "value": 315089, + "unit": "ns", + "extra": "gctime=0\nmemory=19136\nallocs=448\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s)", + "value": 2449417, + "unit": "ns", + "extra": "gctime=0\nmemory=19998832\nallocs=151\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s)", + "value": 2455020.5, + "unit": "ns", + "extra": "gctime=0\nmemory=19998832\nallocs=151\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s)", + "value": 2465625, + "unit": "ns", + "extra": "gctime=0\nmemory=19998832\nallocs=151\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s)", + "value": 2469208.5, + "unit": "ns", + "extra": "gctime=0\nmemory=19998832\nallocs=151\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/CUDA", + "value": 922065, + "unit": "ns", + "extra": "gctime=0\nmemory=75744\nallocs=2816\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/AMDGPU", + "value": 1363193.5, + "unit": "ns", + "extra": "gctime=0\nmemory=80608\nallocs=1665\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(2, Bsize=32)/forward/CPU/2 thread(s)", + "value": 32917, + "unit": "ns", + "extra": "gctime=0\nmemory=576\nallocs=1\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(2, Bsize=32)/forward/CPU/4 thread(s)", + "value": 35374.5, + "unit": "ns", + "extra": "gctime=0\nmemory=576\nallocs=1\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(2, Bsize=32)/forward/CPU/8 thread(s)", + "value": 34417, + "unit": "ns", + "extra": "gctime=0\nmemory=576\nallocs=1\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(2, Bsize=32)/forward/CPU/1 thread(s)", + "value": 1000, + "unit": "ns", + "extra": "gctime=0\nmemory=576\nallocs=1\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(2, Bsize=32)/forward/GPU/CUDA", + "value": 15534, + "unit": "ns", + "extra": "gctime=0\nmemory=512\nallocs=23\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(2, Bsize=32)/forward/GPU/AMDGPU", + "value": 78366, + "unit": "ns", + "extra": "gctime=0\nmemory=2272\nallocs=49\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(2, Bsize=32)/zygote/CPU/2 thread(s)", + "value": 2937.5, + "unit": "ns", + "extra": "gctime=0\nmemory=3152\nallocs=27\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(2, Bsize=32)/zygote/CPU/4 thread(s)", + "value": 3375, + "unit": "ns", + "extra": "gctime=0\nmemory=3152\nallocs=27\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(2, Bsize=32)/zygote/CPU/8 thread(s)", + "value": 5208, + "unit": "ns", + "extra": "gctime=0\nmemory=3152\nallocs=27\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(2, Bsize=32)/zygote/CPU/1 thread(s)", + "value": 4625, + "unit": "ns", + "extra": "gctime=0\nmemory=2960\nallocs=25\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(2, Bsize=32)/zygote/GPU/CUDA", + "value": 133935.5, + "unit": "ns", + "extra": "gctime=0\nmemory=8496\nallocs=319\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(2, Bsize=32)/zygote/GPU/AMDGPU", + "value": 318886, + "unit": "ns", + "extra": "gctime=0\nmemory=13600\nallocs=298\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s)", + "value": 1464209, + "unit": "ns", + "extra": "gctime=0\nmemory=1049824\nallocs=20\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s)", + "value": 1500333, + "unit": "ns", + "extra": "gctime=0\nmemory=1049824\nallocs=20\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s)", + "value": 1501333, + "unit": "ns", + "extra": "gctime=0\nmemory=1049824\nallocs=20\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s)", + "value": 1442563, + "unit": "ns", + "extra": "gctime=0\nmemory=1049824\nallocs=20\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/CUDA", + "value": 41738, + "unit": "ns", + "extra": "gctime=0\nmemory=3376\nallocs=109\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/AMDGPU", + "value": 318625, + "unit": "ns", + "extra": "gctime=0\nmemory=13008\nallocs=430\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s)", + "value": 5128625, + "unit": "ns", + "extra": "gctime=0\nmemory=8405200\nallocs=111\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s)", + "value": 5291041, + "unit": "ns", + "extra": "gctime=0\nmemory=8405200\nallocs=111\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s)", + "value": 5297084, + "unit": "ns", + "extra": "gctime=0\nmemory=8405200\nallocs=111\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s)", + "value": 4998791.5, + "unit": "ns", + "extra": "gctime=0\nmemory=8405200\nallocs=111\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/CUDA", + "value": 230499.5, + "unit": "ns", + "extra": "gctime=0\nmemory=23240\nallocs=717\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/AMDGPU", + "value": 1198280, + "unit": "ns", + "extra": "gctime=0\nmemory=84648\nallocs=2018\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(32, bias=false, act=identity)(32 x 128)/forward/CPU/2 thread(s)", + "value": 3709, + "unit": "ns", + "extra": "gctime=0\nmemory=16640\nallocs=1\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(32, bias=false, act=identity)(32 x 128)/forward/CPU/4 thread(s)", + "value": 3750, + "unit": "ns", + "extra": "gctime=0\nmemory=16640\nallocs=1\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(32, bias=false, act=identity)(32 x 128)/forward/CPU/8 thread(s)", + "value": 3750, + "unit": "ns", + "extra": "gctime=0\nmemory=16640\nallocs=1\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(32, bias=false, act=identity)(32 x 128)/forward/CPU/1 thread(s)", + "value": 3916, + "unit": "ns", + "extra": "gctime=0\nmemory=16640\nallocs=1\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(32, bias=false, act=identity)(32 x 128)/forward/GPU/CUDA", + "value": 33583, + "unit": "ns", + "extra": "gctime=0\nmemory=1008\nallocs=38\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(32, bias=false, act=identity)(32 x 128)/forward/GPU/AMDGPU", + "value": 36778.5, + "unit": "ns", + "extra": "gctime=0\nmemory=928\nallocs=49\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(32, bias=false, act=identity)(32 x 128)/zygote/CPU/2 thread(s)", + "value": 15417, + "unit": "ns", + "extra": "gctime=0\nmemory=55216\nallocs=25\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(32, bias=false, act=identity)(32 x 128)/zygote/CPU/4 thread(s)", + "value": 15500, + "unit": "ns", + "extra": "gctime=0\nmemory=55216\nallocs=25\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(32, bias=false, act=identity)(32 x 128)/zygote/CPU/8 thread(s)", + "value": 15791, + "unit": "ns", + "extra": "gctime=0\nmemory=55216\nallocs=25\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(32, bias=false, act=identity)(32 x 128)/zygote/CPU/1 thread(s)", + "value": 16000, + "unit": "ns", + "extra": "gctime=0\nmemory=55216\nallocs=25\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(32, bias=false, act=identity)(32 x 128)/zygote/GPU/CUDA", + "value": 252278, + "unit": "ns", + "extra": "gctime=0\nmemory=11760\nallocs=410\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(32, bias=false, act=identity)(32 x 128)/zygote/GPU/AMDGPU", + "value": 161662, + "unit": "ns", + "extra": "gctime=0\nmemory=11632\nallocs=298\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(512, bias=false, act=relu)(512 x 128)/forward/CPU/2 thread(s)", + "value": 404625, + "unit": "ns", + "extra": "gctime=0\nmemory=262192\nallocs=2\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(512, bias=false, act=relu)(512 x 128)/forward/CPU/4 thread(s)", + "value": 296000, + "unit": "ns", + "extra": "gctime=0\nmemory=262192\nallocs=2\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(512, bias=false, act=relu)(512 x 128)/forward/CPU/8 thread(s)", + "value": 295916, + "unit": "ns", + "extra": "gctime=0\nmemory=262192\nallocs=2\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(512, bias=false, act=relu)(512 x 128)/forward/CPU/1 thread(s)", + "value": 760625, + "unit": "ns", + "extra": "gctime=0\nmemory=262192\nallocs=2\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(512, bias=false, act=relu)(512 x 128)/forward/GPU/CUDA", + "value": 113161.5, + "unit": "ns", + "extra": "gctime=0\nmemory=960\nallocs=41\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(512, bias=false, act=relu)(512 x 128)/forward/GPU/AMDGPU", + "value": 95859, + "unit": "ns", + "extra": "gctime=0\nmemory=2240\nallocs=95\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(512, bias=false, act=relu)(512 x 128)/zygote/CPU/2 thread(s)", + "value": 1479249.5, + "unit": "ns", + "extra": "gctime=0\nmemory=2098400\nallocs=31\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(512, bias=false, act=relu)(512 x 128)/zygote/CPU/4 thread(s)", + "value": 1158584, + "unit": "ns", + "extra": "gctime=0\nmemory=2098400\nallocs=31\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(512, bias=false, act=relu)(512 x 128)/zygote/CPU/8 thread(s)", + "value": 1160500, + "unit": "ns", + "extra": "gctime=0\nmemory=2098400\nallocs=31\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(512, bias=false, act=relu)(512 x 128)/zygote/CPU/1 thread(s)", + "value": 2383354, + "unit": "ns", + "extra": "gctime=0\nmemory=2098400\nallocs=31\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(512, bias=false, act=relu)(512 x 128)/zygote/GPU/CUDA", + "value": 228888, + "unit": "ns", + "extra": "gctime=0\nmemory=14184\nallocs=524\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(512, bias=false, act=relu)(512 x 128)/zygote/GPU/AMDGPU", + "value": 265922, + "unit": "ns", + "extra": "gctime=0\nmemory=14728\nallocs=406\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/2 thread(s)", + "value": 958, + "unit": "ns", + "extra": "gctime=0\nmemory=1440\nallocs=19\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/4 thread(s)", + "value": 1042, + "unit": "ns", + "extra": "gctime=0\nmemory=1440\nallocs=19\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/8 thread(s)", + "value": 1042, + "unit": "ns", + "extra": "gctime=0\nmemory=1440\nallocs=19\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/1 thread(s)", + "value": 1083, + "unit": "ns", + "extra": "gctime=0\nmemory=1440\nallocs=19\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=gelu, affine=true)(4 x 32)/forward/GPU/CUDA", + "value": 24404, + "unit": "ns", + "extra": "gctime=0\nmemory=3648\nallocs=118\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=gelu, affine=true)(4 x 32)/forward/GPU/AMDGPU", + "value": 207859, + "unit": "ns", + "extra": "gctime=0\nmemory=9120\nallocs=328\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/2 thread(s)", + "value": 7917, + "unit": "ns", + "extra": "gctime=0\nmemory=17904\nallocs=90\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/4 thread(s)", + "value": 8542, + "unit": "ns", + "extra": "gctime=0\nmemory=17904\nallocs=90\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/8 thread(s)", + "value": 9917, + "unit": "ns", + "extra": "gctime=0\nmemory=17904\nallocs=90\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/1 thread(s)", + "value": 12895.5, + "unit": "ns", + "extra": "gctime=0\nmemory=17904\nallocs=90\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=gelu, affine=true)(4 x 32)/zygote/GPU/CUDA", + "value": 202191, + "unit": "ns", + "extra": "gctime=0\nmemory=22232\nallocs=698\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=gelu, affine=true)(4 x 32)/zygote/GPU/AMDGPU", + "value": 620871, + "unit": "ns", + "extra": "gctime=0\nmemory=53672\nallocs=1310\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(128, Bsize=32)/forward/CPU/2 thread(s)", + "value": 835834, + "unit": "ns", + "extra": "gctime=0\nmemory=2097216\nallocs=2\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(128, Bsize=32)/forward/CPU/4 thread(s)", + "value": 615542, + "unit": "ns", + "extra": "gctime=0\nmemory=2097216\nallocs=2\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(128, Bsize=32)/forward/CPU/8 thread(s)", + "value": 617791.5, + "unit": "ns", + "extra": "gctime=0\nmemory=2097216\nallocs=2\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(128, Bsize=32)/forward/CPU/1 thread(s)", + "value": 1549375, + "unit": "ns", + "extra": "gctime=0\nmemory=2097216\nallocs=2\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(128, Bsize=32)/forward/GPU/CUDA", + "value": 130350.5, + "unit": "ns", + "extra": "gctime=0\nmemory=512\nallocs=23\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(128, Bsize=32)/forward/GPU/AMDGPU", + "value": 215532, + "unit": "ns", + "extra": "gctime=0\nmemory=3408\nallocs=120\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(128, Bsize=32)/zygote/CPU/2 thread(s)", + "value": 2690375, + "unit": "ns", + "extra": "gctime=0\nmemory=8389712\nallocs=31\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(128, Bsize=32)/zygote/CPU/4 thread(s)", + "value": 2000479.5, + "unit": "ns", + "extra": "gctime=0\nmemory=8389712\nallocs=31\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(128, Bsize=32)/zygote/CPU/8 thread(s)", + "value": 2007416.5, + "unit": "ns", + "extra": "gctime=0\nmemory=8389712\nallocs=31\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(128, Bsize=32)/zygote/CPU/1 thread(s)", + "value": 4941104, + "unit": "ns", + "extra": "gctime=0\nmemory=8389520\nallocs=29\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(128, Bsize=32)/zygote/GPU/CUDA", + "value": 232712, + "unit": "ns", + "extra": "gctime=0\nmemory=10112\nallocs=368\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(128, Bsize=32)/zygote/GPU/AMDGPU", + "value": 872871.5, + "unit": "ns", + "extra": "gctime=0\nmemory=21216\nallocs=512\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/2 thread(s)", + "value": 291, + "unit": "ns", + "extra": "gctime=0\nmemory=1440\nallocs=19\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/4 thread(s)", + "value": 375, + "unit": "ns", + "extra": "gctime=0\nmemory=1440\nallocs=19\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/8 thread(s)", + "value": 375, + "unit": "ns", + "extra": "gctime=0\nmemory=1440\nallocs=19\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/1 thread(s)", + "value": 375, + "unit": "ns", + "extra": "gctime=0\nmemory=1440\nallocs=19\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=relu, affine=false)(4 x 32)/forward/GPU/CUDA", + "value": 31625, + "unit": "ns", + "extra": "gctime=0\nmemory=4112\nallocs=142\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=relu, affine=false)(4 x 32)/forward/GPU/AMDGPU", + "value": 47950, + "unit": "ns", + "extra": "gctime=0\nmemory=6000\nallocs=143\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/2 thread(s)", + "value": 6084, + "unit": "ns", + "extra": "gctime=0\nmemory=17088\nallocs=87\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/4 thread(s)", + "value": 6708, + "unit": "ns", + "extra": "gctime=0\nmemory=17088\nallocs=87\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/8 thread(s)", + "value": 7666, + "unit": "ns", + "extra": "gctime=0\nmemory=17088\nallocs=87\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/1 thread(s)", + "value": 8083, + "unit": "ns", + "extra": "gctime=0\nmemory=17088\nallocs=87\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=relu, affine=false)(4 x 32)/zygote/GPU/CUDA", + "value": 221856.5, + "unit": "ns", + "extra": "gctime=0\nmemory=21624\nallocs=693\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=relu, affine=false)(4 x 32)/zygote/GPU/AMDGPU", + "value": 352319, + "unit": "ns", + "extra": "gctime=0\nmemory=46360\nallocs=1110\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s)", + "value": 1741791.5, + "unit": "ns", + "extra": "gctime=0\nmemory=1050128\nallocs=23\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s)", + "value": 1752167, + "unit": "ns", + "extra": "gctime=0\nmemory=1050128\nallocs=23\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s)", + "value": 1739042, + "unit": "ns", + "extra": "gctime=0\nmemory=1050128\nallocs=23\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s)", + "value": 1719916, + "unit": "ns", + "extra": "gctime=0\nmemory=1050128\nallocs=23\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/CUDA", + "value": 183055.5, + "unit": "ns", + "extra": "gctime=0\nmemory=13152\nallocs=479\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/AMDGPU", + "value": 415606.5, + "unit": "ns", + "extra": "gctime=0\nmemory=24896\nallocs=666\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s)", + "value": 4361125, + "unit": "ns", + "extra": "gctime=0\nmemory=10496496\nallocs=133\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s)", + "value": 4365916.5, + "unit": "ns", + "extra": "gctime=0\nmemory=10496496\nallocs=133\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s)", + "value": 4399333, + "unit": "ns", + "extra": "gctime=0\nmemory=10496496\nallocs=133\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s)", + "value": 4394333, + "unit": "ns", + "extra": "gctime=0\nmemory=10496496\nallocs=133\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/CUDA", + "value": 827645.5, + "unit": "ns", + "extra": "gctime=0\nmemory=72096\nallocs=2667\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/AMDGPU", + "value": 1239667.5, + "unit": "ns", + "extra": "gctime=0\nmemory=74640\nallocs=1547\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "bias_activation(512, act=relu)(512 x 128)/forward/CPU/2 thread(s)", + "value": 7083, + "unit": "ns", + "extra": "gctime=0\nmemory=262384\nallocs=6\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "bias_activation(512, act=relu)(512 x 128)/forward/CPU/4 thread(s)", + "value": 7395.5, + "unit": "ns", + "extra": "gctime=0\nmemory=262384\nallocs=6\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "bias_activation(512, act=relu)(512 x 128)/forward/CPU/8 thread(s)", + "value": 7041, + "unit": "ns", + "extra": "gctime=0\nmemory=262384\nallocs=6\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "bias_activation(512, act=relu)(512 x 128)/forward/CPU/1 thread(s)", + "value": 6854.5, + "unit": "ns", + "extra": "gctime=0\nmemory=262384\nallocs=6\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "bias_activation(512, act=relu)(512 x 128)/forward/GPU/CUDA", + "value": 22223.5, + "unit": "ns", + "extra": "gctime=0\nmemory=2144\nallocs=92\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "bias_activation(512, act=relu)(512 x 128)/forward/GPU/AMDGPU", + "value": 47178, + "unit": "ns", + "extra": "gctime=0\nmemory=1808\nallocs=65\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "bias_activation(512, act=relu)(512 x 128)/zygote/CPU/2 thread(s)", + "value": 45292, + "unit": "ns", + "extra": "gctime=0\nmemory=789264\nallocs=16\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "bias_activation(512, act=relu)(512 x 128)/zygote/CPU/4 thread(s)", + "value": 51167, + "unit": "ns", + "extra": "gctime=0\nmemory=789264\nallocs=16\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "bias_activation(512, act=relu)(512 x 128)/zygote/CPU/8 thread(s)", + "value": 49250, + "unit": "ns", + "extra": "gctime=0\nmemory=789264\nallocs=16\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "bias_activation(512, act=relu)(512 x 128)/zygote/CPU/1 thread(s)", + "value": 49437, + "unit": "ns", + "extra": "gctime=0\nmemory=789264\nallocs=16\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "bias_activation(512, act=relu)(512 x 128)/zygote/GPU/CUDA", + "value": 204846, + "unit": "ns", + "extra": "gctime=0\nmemory=14376\nallocs=530\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "bias_activation(512, act=relu)(512 x 128)/zygote/GPU/AMDGPU", + "value": 235841, + "unit": "ns", + "extra": "gctime=0\nmemory=15976\nallocs=362\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(2, Bsize=512)/forward/CPU/2 thread(s)", + "value": 22125, + "unit": "ns", + "extra": "gctime=0\nmemory=8448\nallocs=1\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(2, Bsize=512)/forward/CPU/4 thread(s)", + "value": 25125, + "unit": "ns", + "extra": "gctime=0\nmemory=8448\nallocs=1\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(2, Bsize=512)/forward/CPU/8 thread(s)", + "value": 24833, + "unit": "ns", + "extra": "gctime=0\nmemory=8448\nallocs=1\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(2, Bsize=512)/forward/CPU/1 thread(s)", + "value": 5458.5, + "unit": "ns", + "extra": "gctime=0\nmemory=8448\nallocs=1\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(2, Bsize=512)/forward/GPU/CUDA", + "value": 17859, + "unit": "ns", + "extra": "gctime=0\nmemory=512\nallocs=23\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(2, Bsize=512)/forward/GPU/AMDGPU", + "value": 82154, + "unit": "ns", + "extra": "gctime=0\nmemory=14432\nallocs=80\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(2, Bsize=512)/zygote/CPU/2 thread(s)", + "value": 11792, + "unit": "ns", + "extra": "gctime=0\nmemory=34640\nallocs=27\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(2, Bsize=512)/zygote/CPU/4 thread(s)", + "value": 10750, + "unit": "ns", + "extra": "gctime=0\nmemory=34640\nallocs=27\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(2, Bsize=512)/zygote/CPU/8 thread(s)", + "value": 12583, + "unit": "ns", + "extra": "gctime=0\nmemory=34640\nallocs=27\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(2, Bsize=512)/zygote/CPU/1 thread(s)", + "value": 19708.5, + "unit": "ns", + "extra": "gctime=0\nmemory=34448\nallocs=25\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(2, Bsize=512)/zygote/GPU/CUDA", + "value": 216235, + "unit": "ns", + "extra": "gctime=0\nmemory=10112\nallocs=368\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(2, Bsize=512)/zygote/GPU/AMDGPU", + "value": 331099, + "unit": "ns", + "extra": "gctime=0\nmemory=52368\nallocs=404\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(512, bias=true, act=relu)(512 x 128)/forward/CPU/2 thread(s)", + "value": 406250, + "unit": "ns", + "extra": "gctime=0\nmemory=262384\nallocs=6\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(512, bias=true, act=relu)(512 x 128)/forward/CPU/4 thread(s)", + "value": 297333, + "unit": "ns", + "extra": "gctime=0\nmemory=262384\nallocs=6\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(512, bias=true, act=relu)(512 x 128)/forward/CPU/8 thread(s)", + "value": 296833.5, + "unit": "ns", + "extra": "gctime=0\nmemory=262384\nallocs=6\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(512, bias=true, act=relu)(512 x 128)/forward/CPU/1 thread(s)", + "value": 762833, + "unit": "ns", + "extra": "gctime=0\nmemory=262384\nallocs=6\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(512, bias=true, act=relu)(512 x 128)/forward/GPU/CUDA", + "value": 46303.5, + "unit": "ns", + "extra": "gctime=0\nmemory=1056\nallocs=47\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(512, bias=true, act=relu)(512 x 128)/forward/GPU/AMDGPU", + "value": 97252, + "unit": "ns", + "extra": "gctime=0\nmemory=2608\nallocs=115\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(512, bias=true, act=relu)(512 x 128)/zygote/CPU/2 thread(s)", + "value": 1477458, + "unit": "ns", + "extra": "gctime=0\nmemory=2100080\nallocs=20\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(512, bias=true, act=relu)(512 x 128)/zygote/CPU/4 thread(s)", + "value": 1164395.5, + "unit": "ns", + "extra": "gctime=0\nmemory=2100080\nallocs=20\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(512, bias=true, act=relu)(512 x 128)/zygote/CPU/8 thread(s)", + "value": 1164416, + "unit": "ns", + "extra": "gctime=0\nmemory=2100080\nallocs=20\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(512, bias=true, act=relu)(512 x 128)/zygote/CPU/1 thread(s)", + "value": 2386333, + "unit": "ns", + "extra": "gctime=0\nmemory=2100080\nallocs=20\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(512, bias=true, act=relu)(512 x 128)/zygote/GPU/CUDA", + "value": 268961, + "unit": "ns", + "extra": "gctime=0\nmemory=15912\nallocs=581\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(512, bias=true, act=relu)(512 x 128)/zygote/GPU/AMDGPU", + "value": 282959, + "unit": "ns", + "extra": "gctime=0\nmemory=18376\nallocs=476\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s)", + "value": 1488416, + "unit": "ns", + "extra": "gctime=0\nmemory=1049824\nallocs=20\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s)", + "value": 1526958, + "unit": "ns", + "extra": "gctime=0\nmemory=1049824\nallocs=20\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s)", + "value": 1529250, + "unit": "ns", + "extra": "gctime=0\nmemory=1049824\nallocs=20\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s)", + "value": 1466395.5, + "unit": "ns", + "extra": "gctime=0\nmemory=1049824\nallocs=20\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/CUDA", + "value": 52650, + "unit": "ns", + "extra": "gctime=0\nmemory=4288\nallocs=142\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/AMDGPU", + "value": 326982, + "unit": "ns", + "extra": "gctime=0\nmemory=12896\nallocs=429\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s)", + "value": 5119459, + "unit": "ns", + "extra": "gctime=0\nmemory=8404736\nallocs=109\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s)", + "value": 5285084, + "unit": "ns", + "extra": "gctime=0\nmemory=8404736\nallocs=109\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s)", + "value": 5297709, + "unit": "ns", + "extra": "gctime=0\nmemory=8404736\nallocs=109\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s)", + "value": 4955208, + "unit": "ns", + "extra": "gctime=0\nmemory=8404736\nallocs=109\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/CUDA", + "value": 250192, + "unit": "ns", + "extra": "gctime=0\nmemory=24552\nallocs=781\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/AMDGPU", + "value": 1186136, + "unit": "ns", + "extra": "gctime=0\nmemory=82744\nallocs=1780\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(32, bias=true, act=gelu)(32 x 128)/forward/CPU/2 thread(s)", + "value": 28292, + "unit": "ns", + "extra": "gctime=0\nmemory=16832\nallocs=5\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(32, bias=true, act=gelu)(32 x 128)/forward/CPU/4 thread(s)", + "value": 28292, + "unit": "ns", + "extra": "gctime=0\nmemory=16832\nallocs=5\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(32, bias=true, act=gelu)(32 x 128)/forward/CPU/8 thread(s)", + "value": 28333, + "unit": "ns", + "extra": "gctime=0\nmemory=16832\nallocs=5\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(32, bias=true, act=gelu)(32 x 128)/forward/CPU/1 thread(s)", + "value": 28417, + "unit": "ns", + "extra": "gctime=0\nmemory=16832\nallocs=5\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(32, bias=true, act=gelu)(32 x 128)/forward/GPU/CUDA", + "value": 23514.5, + "unit": "ns", + "extra": "gctime=0\nmemory=1056\nallocs=47\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(32, bias=true, act=gelu)(32 x 128)/forward/GPU/AMDGPU", + "value": 207227, + "unit": "ns", + "extra": "gctime=0\nmemory=2976\nallocs=138\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(32, bias=true, act=gelu)(32 x 128)/zygote/CPU/2 thread(s)", + "value": 66542, + "unit": "ns", + "extra": "gctime=0\nmemory=87936\nallocs=12\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(32, bias=true, act=gelu)(32 x 128)/zygote/CPU/4 thread(s)", + "value": 66750, + "unit": "ns", + "extra": "gctime=0\nmemory=87936\nallocs=12\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(32, bias=true, act=gelu)(32 x 128)/zygote/CPU/8 thread(s)", + "value": 66500, + "unit": "ns", + "extra": "gctime=0\nmemory=87936\nallocs=12\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(32, bias=true, act=gelu)(32 x 128)/zygote/CPU/1 thread(s)", + "value": 66208, + "unit": "ns", + "extra": "gctime=0\nmemory=87936\nallocs=12\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(32, bias=true, act=gelu)(32 x 128)/zygote/GPU/CUDA", + "value": 333506.5, + "unit": "ns", + "extra": "gctime=0\nmemory=17720\nallocs=651\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(32, bias=true, act=gelu)(32 x 128)/zygote/GPU/AMDGPU", + "value": 576948.5, + "unit": "ns", + "extra": "gctime=0\nmemory=18632\nallocs=446\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s)", + "value": 124875, + "unit": "ns", + "extra": "gctime=0\nmemory=1051792\nallocs=39\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s)", + "value": 81875, + "unit": "ns", + "extra": "gctime=0\nmemory=1051792\nallocs=39\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s)", + "value": 89166, + "unit": "ns", + "extra": "gctime=0\nmemory=1051792\nallocs=39\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s)", + "value": 86750, + "unit": "ns", + "extra": "gctime=0\nmemory=1051792\nallocs=39\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/CUDA", + "value": 191648, + "unit": "ns", + "extra": "gctime=0\nmemory=9936\nallocs=247\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/AMDGPU", + "value": 233116, + "unit": "ns", + "extra": "gctime=0\nmemory=23984\nallocs=498\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s)", + "value": 2025145.5, + "unit": "ns", + "extra": "gctime=0\nmemory=7356160\nallocs=108\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s)", + "value": 2021978.5, + "unit": "ns", + "extra": "gctime=0\nmemory=7356160\nallocs=108\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s)", + "value": 2030542, + "unit": "ns", + "extra": "gctime=0\nmemory=7356160\nallocs=108\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s)", + "value": 1995125, + "unit": "ns", + "extra": "gctime=0\nmemory=7356160\nallocs=108\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/CUDA", + "value": 506195, + "unit": "ns", + "extra": "gctime=0\nmemory=47256\nallocs=1305\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/AMDGPU", + "value": 881973, + "unit": "ns", + "extra": "gctime=0\nmemory=75240\nallocs=1517\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + } + ] } ] }