diff --git a/__tmp_kernel_none_f0_c0_r0_g0.cu b/__tmp_kernel_none_f0_c0_r0_g0.cu index 8aecce536f2..a07dbca87ff 100644 --- a/__tmp_kernel_none_f0_c0_r0_g0.cu +++ b/__tmp_kernel_none_f0_c0_r0_g0.cu @@ -11522,6 +11522,8 @@ __global__ void nvfuser_none_f0_c0_r0_g0(Tensor<__half, 3, 3> T0, Tensor<__half, "n"(1), "n"(1) ); + asm volatile("wgmma.commit_group.sync.aligned;\n"); + asm volatile("wgmma.wait_group.sync.aligned %0;\n"::"n"(0LL):"memory"); mbarrier::arrive(toSmem((&T7[((i31 % 4) + 4LL)]))); } } diff --git a/nsys-log.txt b/nsys-log.txt index a67c33f1a26..d75ba57a86e 100644 --- a/nsys-log.txt +++ b/nsys-log.txt @@ -1,213 +1,215 @@ -# $NVFUSER_DUMP="cuda_to_file" nsys nvprof ./bin/test_matmul --gtest_filter=HopperMatmulTest.HSH_NT_128BSwizzle + # $NVFUSER_EXTERNAL_SRC="__tmp_kernel_none_f0_c0_r0_g0.cu" nsys nvprof ./bin/test_matmul --gtest_filter=*HSH*Sw* WARNING: test_matmul and any of its children processes will be profiled. Collecting data... Running main() from /opt/pytorch/nvfuser/third_party/googletest/googletest/src/gtest_main.cc -Note: Google Test filter = HopperMatmulTest.HSH_NT_128BSwizzle -[==========] Running 1 test from 1 test suite. +Note: Google Test filter = *HSH*Sw* +[==========] Running 2 tests from 1 test suite. [----------] Global test environment set-up. -[----------] 1 test from HopperMatmulTest +[----------] 2 tests from HopperMatmulTest [ RUN ] HopperMatmulTest.HSH_NT_128BSwizzle -PRINTING: __tmp_kernel_none_f0_c0_r0_g0.cu -[ OK ] HopperMatmulTest.HSH_NT_128BSwizzle (1366 ms) -[----------] 1 test from HopperMatmulTest (1366 ms total) +--------> Compiling external CUDA code: __tmp_kernel_none_f0_c0_r0_g0.cu +[ OK ] HopperMatmulTest.HSH_NT_128BSwizzle (1337 ms) +[ RUN ] HopperMatmulTest.HSH_NT_128BSwizzle_NoBroadcasts +[ OK ] HopperMatmulTest.HSH_NT_128BSwizzle_NoBroadcasts (28 ms) +[----------] 2 tests from HopperMatmulTest (1365 ms total) [----------] Global test environment tear-down -[==========] 1 test from 1 test suite ran. (1367 ms total) -[ PASSED ] 1 test. -Generating '/tmp/nsys-report-bf35.qdstrm' -[1/7] [========================100%] report1.nsys-rep -[2/7] [========================100%] report1.sqlite +[==========] 2 tests from 1 test suite ran. (1365 ms total) +[ PASSED ] 2 tests. +Generating '/tmp/nsys-report-e88d.qdstrm' +[1/7] [========================100%] report2.nsys-rep +[2/7] [========================100%] report2.sqlite [3/7] Executing 'nvtx_sum' stats report Time (%) Total Time (ns) Instances Avg (ns) Med (ns) Min (ns) Max (ns) StdDev (ns) Style Range -------- --------------- --------- ----------- ----------- --------- --------- ----------- ------- ------------------------------------------------------------------- - 33.8 346690728 1 346690728.0 346690728.0 346690728 346690728 0.0 PushPop :KernelExecutor::compileFusion + 32.7 343466183 1 343466183.0 343466183.0 343466183 343466183 0.0 PushPop :KernelExecutor::compileFusion - 30.5 313685389 1 313685389.0 313685389.0 313685389 313685389 0.0 PushPop :executor_utils::NVRTC + 29.7 311338794 1 311338794.0 311338794.0 311338794 311338794 0.0 PushPop :executor_utils::NVRTC - 30.5 313357655 1 313357655.0 313357655.0 313357655 313357655 0.0 PushPop :executor_utils::Nvrtc::CompileProgram + 29.6 310986030 1 310986030.0 310986030.0 310986030 310986030 0.0 PushPop :executor_utils::Nvrtc::CompileProgram - 1.1 11663565 1 11663565.0 11663565.0 11663565 11663565 0.0 PushPop :GpuLower::Lower::IndexLowering::getIndexedExprs - 0.9 9601713 1 9601713.0 9601713.0 9601713 9601713 0.0 PushPop :GpuLower::lower + 1.9 20457621 2 10228810.5 10228810.5 8952161 11505460 1805455.0 PushPop :GpuLower::Lower::IndexLowering::getIndexedExprs + 1.6 16899198 2 8449599.0 8449599.0 7291110 9608088 1638350.9 PushPop :GpuLower::lower - 0.6 5855399 1 5855399.0 5855399.0 5855399 5855399 0.0 PushPop :ConditionalFromPredicateModifier::ConditionalFromPredicateModifier - 0.4 4498029 1 4498029.0 4498029.0 4498029 4498029 0.0 PushPop :KernelExecutor::runFusion + 1.0 10171530 2 5085765.0 5085765.0 4604694 5566836 680337.1 PushPop :ConditionalFromPredicateModifier::ConditionalFromPredicateModifier + 0.4 4481599 1 4481599.0 4481599.0 4481599 4481599 0.0 PushPop :KernelExecutor::runFusion - 0.4 4266882 1 4266882.0 4266882.0 4266882 4266882 0.0 PushPop :fusion_executor::allocations::allocateOutputs - 0.4 4259360 1 4259360.0 4259360.0 4259360 4259360 0.0 PushPop :fusion_executor::allocations::allocateTensor - 0.1 1465079 1 1465079.0 1465079.0 1465079 1465079 0.0 PushPop :GpuLower::Lower::replaceSymbolicSizes + 0.4 4237254 1 4237254.0 4237254.0 4237254 4237254 0.0 PushPop :fusion_executor::allocations::allocateOutputs + 0.4 4229005 1 4229005.0 4229005.0 4229005 4229005 0.0 PushPop :fusion_executor::allocations::allocateOutput + 0.3 2653101 2 1326550.5 1326550.5 1196642 1456459 183718.4 PushPop :GpuLower::Lower::replaceSymbolicSizes - 0.1 1281591 2 640795.5 640795.5 608888 672703 45124.0 PushPop :Index::getCpAsyncBulkGmemIndex + 0.2 2534978 4 633744.5 630173.0 609878 664754 24990.5 PushPop :Index::getCpAsyncBulkGmemIndex - 0.1 721652 1 721652.0 721652.0 721652 721652 0.0 PushPop :generateCudaKernel + 0.1 1125986 2 562993.0 562993.0 539743 586243 32880.5 PushPop :GpuLower::Lower::UnswitchPredicate::get - 0.1 676934 7 96704.9 4875.0 453 598347 222062.7 PushPop :IrContainer clear + 0.1 1114181 2 557090.5 557090.5 533163 581018 33838.6 PushPop :GpuLower::Lower::UnswitchPredicate::openLoop + 0.1 1112078 2 556039.0 556039.0 532119 579959 33828.0 PushPop :GpuLower::Lower::UnswitchPredicate::openIte + 0.1 1108261 2 554130.5 554130.5 530260 578001 33758.0 PushPop :GpuLower::Lower::UnswitchPredicate::predicateOn + 0.1 1101083 14 78648.8 4082.0 304 674441 188049.8 PushPop :IrContainer clear - 0.1 649428 20 32471.4 28826.0 6961 57475 13460.2 PushPop :transform_replay.cpp::getMatchedLeafPosWithoutReplayCasP - 0.1 562594 1 562594.0 562594.0 562594 562594 0.0 PushPop :GpuLower::Lower::UnswitchPredicate::get + 0.1 1041587 31 33599.6 33208.0 6625 60624 15277.4 PushPop :transform_replay.cpp::getMatchedLeafPosWithoutReplayCasP + 0.1 928289 4 232072.3 233951.0 220559 239828 8631.6 PushPop :GpuLower::Lower::Index::getConsumerStridedIndices + 0.1 874656 2 437328.0 437328.0 411366 463290 36715.8 PushPop :GpuLower::Lower::Index::getReferenceRootPredicates + 0.1 866722 2 433361.0 433361.0 432637 434085 1023.9 PushPop :GpuLower::Lower::Index::getProducerStridedIndices + 0.1 712942 1 712942.0 712942.0 712942 712942 0.0 PushPop :generateCudaKernel - 0.1 555928 1 555928.0 555928.0 555928 555928 0.0 PushPop :GpuLower::Lower::UnswitchPredicate::openLoop - 0.1 554544 1 554544.0 554544.0 554544 554544 0.0 PushPop :GpuLower::Lower::UnswitchPredicate::openIte - 0.1 552128 1 552128.0 552128.0 552128 552128 0.0 PushPop :GpuLower::Lower::UnswitchPredicate::predicateOn - 0.0 474897 410 1158.3 536.5 233 23945 1879.5 PushPop :ExpressionEvaluator::evaluate + 0.1 678239 666 1018.4 489.0 237 30898 1738.6 PushPop :ExpressionEvaluator::evaluate - 0.0 466710 2 233355.0 233355.0 224148 242562 13020.7 PushPop :GpuLower::Lower::Index::getConsumerStridedIndices - 0.0 431526 1 431526.0 431526.0 431526 431526 0.0 PushPop :GpuLower::Lower::Index::getReferenceRootPredicates - 0.0 422979 1 422979.0 422979.0 422979 422979 0.0 PushPop :GpuLower::Lower::Index::getProducerStridedIndices - 0.0 325974 1 325974.0 325974.0 325974 325974 0.0 PushPop :reuseMemoryAllocations + 0.1 592159 10 59215.9 62651.0 43208 72813 11055.3 PushPop :GpuLower::Lower::updateIndexCompute - 0.0 301718 5 60343.6 65254.0 41861 75235 13530.5 PushPop :GpuLower::Lower::updateIndexCompute + 0.1 577720 2 288860.0 288860.0 257658 320062 44126.3 PushPop :reuseMemoryAllocations - 0.0 283658 5 56731.6 42030.0 34163 118951 35252.8 PushPop :TransformReplay::replayPasC + 0.0 494795 2 247397.5 247397.5 214646 280149 46317.6 PushPop :GpuLower::Lower::validateIr - 0.0 269600 1 269600.0 269600.0 269600 269600 0.0 PushPop :GpuLower::Lower::validateIr + 0.0 460877 2 230438.5 230438.5 199740 261137 43414.2 PushPop :GpuLower::Lower::validateIterDomainUse - 0.0 251157 1 251157.0 251157.0 251157 251157 0.0 PushPop :GpuLower::Lower::validateIterDomainUse + 0.0 452243 2 226121.5 226121.5 198711 253532 38764.3 PushPop :GpuLower::Lower::insertAllocations - 0.0 221892 1 221892.0 221892.0 221892 221892 0.0 PushPop :GpuLower::Lower::getGlobalConsumerIndex + 0.0 446439 2 223219.5 223219.5 218241 228198 7040.7 PushPop :GpuLower::Lower::getGlobalConsumerIndex - 0.0 221542 7 31648.9 26959.0 9633 53309 18265.3 PushPop :TransformReplay::replayCasP + 0.0 432446 14 30889.0 29784.0 7059 55488 17595.0 PushPop :TransformReplay::replayCasP - 0.0 212673 1 212673.0 212673.0 212673 212673 0.0 PushPop :IrContainer copy + 0.0 396414 6 66069.0 43606.5 32902 119150 40782.5 PushPop :TransformReplay::replayPasC - 0.0 208919 1 208919.0 208919.0 208919 208919 0.0 PushPop :GpuLower::Lower::insertAllocations + 0.0 318378 2 159189.0 159189.0 118653 199725 57326.6 PushPop :IrContainer copy - 0.0 145336 1 145336.0 145336.0 145336 145336 0.0 PushPop :Fusion copy + 0.0 271928 2 135964.0 135964.0 131193 140735 6747.2 PushPop :GpuLower::Lower::LoopNestGenerator::loweredExprs + 0.0 262254 2 131127.0 131127.0 119825 142429 15983.4 PushPop :Fusion copy - 0.0 140574 1 140574.0 140574.0 140574 140574 0.0 PushPop :executor_utils::NvrtcCreateProgram + 0.0 199177 2 99588.5 99588.5 73941 125236 36271.0 PushPop :GpuLower::Lower::insertGridSerializationSyncs + 0.0 187884 2 93942.0 93942.0 91438 96446 3541.2 PushPop :SyncMap::SyncMap - 0.0 137294 1 137294.0 137294.0 137294 137294 0.0 PushPop :KernelExecutor::initializeExecutorEntry + 0.0 154969 1 154969.0 154969.0 154969 154969 0.0 PushPop :executor_utils::NvrtcCreateProgram - 0.0 131736 1 131736.0 131736.0 131736 131736 0.0 PushPop :GpuLower::Lower::LoopNestGenerator::loweredExprs - 0.0 115538 1 115538.0 115538.0 115538 115538 0.0 PushPop :executor_utils::Nvrtc::LoadPTX + 0.0 144104 1 144104.0 144104.0 144104 144104 0.0 PushPop :KernelExecutor::initializeExecutorEntry - 0.0 93821 1 93821.0 93821.0 93821 93821 0.0 PushPop :SyncMap::SyncMap + 0.0 139616 2 69808.0 69808.0 295 139321 98306.2 PushPop :GpuLower::Lower::insertMagicZero - 0.0 82446 6 13741.0 9194.0 4811 41149 13565.9 PushPop :Fusion::resetTvUses + 0.0 136901 11 12445.5 8678.0 5218 39605 10016.5 PushPop :Fusion::resetTvUses - 0.0 82075 1 82075.0 82075.0 82075 82075 0.0 PushPop :PrecomputedValues::PrecomputedValues + 0.0 123774 2 61887.0 61887.0 58381 65393 4958.2 PushPop :GpuLower::Lower::ThreadPredicateMap - 0.0 72991 1 72991.0 72991.0 72991 72991 0.0 PushPop :GpuLower::Lower::insertGridSerializationSyncs - 0.0 66913 1 66913.0 66913.0 66913 66913 0.0 PushPop :GpuLower::Lower::insertWarAsyncWait + 0.0 122096 1 122096.0 122096.0 122096 122096 0.0 PushPop :executor_utils::Nvrtc::LoadPTX - 0.0 65913 4 16478.3 13254.0 12330 27075 7108.3 PushPop :transform_replay.cpp::getMatchedLeafPosWithoutReplayPasC - 0.0 64205 1 64205.0 64205.0 64205 64205 0.0 PushPop :GpuLower::Lower::ThreadPredicateMap + 0.0 114327 2 57163.5 57163.5 55852 58475 1854.7 PushPop :GpuLower::Lower::validateVectorize - 0.0 62985 1 62985.0 62985.0 62985 62985 0.0 PushPop :KernelExecutor::computeArgs + 0.0 105938 2 52969.0 52969.0 40345 65593 17853.0 PushPop :GpuLower::Lower::insertWarAsyncWait - 0.0 61459 7 8779.9 5682.0 193 25171 9143.0 PushPop :getKernelArgument + 0.0 104082 2 52041.0 52041.0 48173 55909 5470.2 PushPop :GpuLower::Lower::UnrollPass::runPass - 0.0 59083 1 59083.0 59083.0 59083 59083 0.0 PushPop :GpuLower::Lower::validateVectorize + 0.0 91495 136 672.8 293.5 145 13705 1941.3 PushPop :getTerminatingOutputs - 0.0 52447 2 26223.5 26223.5 18958 33489 10275.0 PushPop :KernelExecutor::computeLaunchParams + 0.0 83563 1 83563.0 83563.0 83563 83563 0.0 PushPop :PrecomputedValues::PrecomputedValues - 0.0 49045 70 700.6 340.0 146 11229 1773.2 PushPop :getTerminatingOutputs + 0.0 77295 2 38647.5 38647.5 36253 41042 3386.3 PushPop :GpuLower::Lower::insertRawThreadSynchronization + 0.0 69627 1 69627.0 69627.0 69627 69627 0.0 PushPop :KernelExecutor::computeArgs - 0.0 47589 1 47589.0 47589.0 47589 47589 0.0 PushPop :GpuLower::Lower::UnrollPass::runPass + 0.0 67771 7 9681.6 6302.0 230 32563 11309.6 PushPop :getKernelArgument - 0.0 38321 1 38321.0 38321.0 38321 38321 0.0 PushPop :GpuLower::Lower::insertRawThreadSynchronization - 0.0 28726 1 28726.0 28726.0 28726 28726 0.0 PushPop :Kernel::analyze + 0.0 63331 4 15832.8 13954.5 12056 23366 5101.3 PushPop :transform_replay.cpp::getMatchedLeafPosWithoutReplayPasC + 0.0 60078 2 30039.0 30039.0 29511 30567 746.7 PushPop :Kernel::analyze - 0.0 26032 1 26032.0 26032.0 26032 26032 0.0 PushPop :KernelExecutor::supported + 0.0 52046 2 26023.0 26023.0 17902 34144 11484.8 PushPop :KernelExecutor::computeLaunchParams - 0.0 22062 1 22062.0 22062.0 22062 22062 0.0 PushPop :PrecomputedValues::bindInputs + 0.0 37945 2 18972.5 18972.5 18798 19147 246.8 PushPop :GpuLower::Lower::insertWarThreadSynchronization + 0.0 34591 2 17295.5 17295.5 16185 18406 1570.5 PushPop :GpuLower::Lower::vectorizeWelford - 0.0 20239 8 2529.9 3005.0 699 3766 1211.4 PushPop :KernelExecutor::ParallelBindingResolution - 0.0 18622 1 18622.0 18622.0 18622 18622 0.0 PushPop :GpuLower::Lower::insertWarThreadSynchronization - 0.0 17332 1 17332.0 17332.0 17332 17332 0.0 PushPop :GpuLower::Lower::processMisalignedVectorization - 0.0 16848 1 16848.0 16848.0 16848 16848 0.0 PushPop :GpuLower::Lower::vectorizeWelford + 0.0 32710 2 16355.0 16355.0 14733 17977 2293.9 PushPop :GpuLower::Lower::processMisalignedVectorization + 0.0 31231 2 15615.5 15615.5 14057 17174 2204.1 PushPop :GpuLower::Lower::MisalignedVectorizationModifier + 0.0 26313 1 26313.0 26313.0 26313 26313 0.0 PushPop :KernelExecutor::supported - 0.0 16278 1 16278.0 16278.0 16278 16278 0.0 PushPop :GpuLower::Lower::MisalignedVectorizationModifier - 0.0 14582 2 7291.0 7291.0 3400 11182 5502.7 PushPop :executor_utils::bindInputs + 0.0 24134 1 24134.0 24134.0 24134 24134 0.0 PushPop :PrecomputedValues::bindInputs - 0.0 12959 1 12959.0 12959.0 12959 12959 0.0 PushPop :KernelExecutor::runFusion::execute_kernel - 0.0 9355 1 9355.0 9355.0 9355 9355 0.0 PushPop :ExecutorRunFusion::cuLaunchKernel + 0.0 19110 8 2388.8 2784.5 736 3925 1180.6 PushPop :KernelExecutor::ParallelBindingResolution + 0.0 14907 10 1490.7 1142.5 789 2751 742.9 PushPop :GpuLower::Lower::ThreadPredicateMap::updateBitSet + 0.0 13881 2 6940.5 6940.5 3439 10442 4951.9 PushPop :executor_utils::bindInputs - 0.0 7470 5 1494.0 1195.0 907 2535 680.7 PushPop :GpuLower::Lower::ThreadPredicateMap::updateBitSet - 0.0 5905 2 2952.5 2952.5 835 5070 2994.6 PushPop :PrecomputedValues::Evaluate + 0.0 12801 1 12801.0 12801.0 12801 12801 0.0 PushPop :KernelExecutor::runFusion::execute_kernel + 0.0 8843 1 8843.0 8843.0 8843 8843 0.0 PushPop :ExecutorRunFusion::cuLaunchKernel - 0.0 4285 1 4285.0 4285.0 4285 4285 0.0 PushPop :fusion_executor::allocations::getOutbufferInfo - 0.0 3910 12 325.8 275.5 187 710 141.0 PushPop :GpuLower::Lower::IndexCompute::IndexCompute - 0.0 3487 1 3487.0 3487.0 3487 3487 0.0 PushPop :fusion_executor::allocations::getBufferInfo - 0.0 3042 2 1521.0 1521.0 368 2674 1630.6 PushPop :PrecomputedValuess::Validate + 0.0 8121 24 338.4 326.5 163 733 143.5 PushPop :GpuLower::Lower::IndexCompute::IndexCompute + 0.0 4893 1 4893.0 4893.0 4893 4893 0.0 PushPop :fusion_executor::allocations::getOutbufferInfo + 0.0 4835 2 2417.5 2417.5 2346 2489 101.1 PushPop :fusion_executor::allocations::computeSharedMemory + 0.0 4621 2 2310.5 2310.5 929 3692 1953.7 PushPop :PrecomputedValues::Evaluate - 0.0 2939 2 1469.5 1469.5 1387 1552 116.7 PushPop :fusion_executor::allocations::computeSharedMemory - 0.0 2372 1 2372.0 2372.0 2372 2372 0.0 PushPop :KernelExecutor::validateVectorizedTensors - 0.0 2280 1 2280.0 2280.0 2280 2280 0.0 PushPop :KernelExecutor::recomputeArgs + 0.0 4327 8 540.9 488.0 280 1012 246.1 PushPop :GpuLower::Lower::getInlinePredicate - 0.0 2218 1 2218.0 2218.0 2218 2218 0.0 PushPop :fusion_executor::allocations::inferShapeOfOutput - 0.0 1812 1 1812.0 1812.0 1812 1812 0.0 PushPop :executor_utils::NvrtcDestroyProgram + 0.0 3938 1 3938.0 3938.0 3938 3938 0.0 PushPop :fusion_executor::allocations::getBufferInfo + 0.0 3036 1 3036.0 3036.0 3036 3036 0.0 PushPop :KernelExecutor::validateVectorizedTensors + 0.0 2753 1 2753.0 2753.0 2753 2753 0.0 PushPop :executor_utils::NvrtcDestroyProgram - 0.0 1515 3 505.0 503.0 408 604 98.0 PushPop :GpuLower::Lower::getInlinePredicate + 0.0 2575 1 2575.0 2575.0 2575 2575 0.0 PushPop :fusion_executor::allocations::inferShapeOfOutput + 0.0 2453 1 2453.0 2453.0 2453 2453 0.0 PushPop :KernelExecutor::recomputeArgs - 0.0 579 1 579.0 579.0 579 579 0.0 PushPop :KernelExecutor::runFusion::intermediates + 0.0 1885 2 942.5 942.5 452 1433 693.7 PushPop :PrecomputedValuess::Validate - 0.0 530 1 530.0 530.0 530 530 0.0 PushPop :fusion_executor::allocations::inferShape + 0.0 676 1 676.0 676.0 676 676 0.0 PushPop :fusion_executor::allocations::inferShape - 0.0 520 1 520.0 520.0 520 520 0.0 PushPop :GpuLower::Lower::insertMagicZero + 0.0 645 1 645.0 645.0 645 645 0.0 PushPop :KernelExecutor::runFusion::intermediates - 0.0 371 1 371.0 371.0 371 371 0.0 PushPop :KernelExecutor::getIntermediateBufferInfo + 0.0 265 1 265.0 265.0 265 265 0.0 PushPop :KernelExecutor::getIntermediateBufferInfo [4/7] Executing 'cuda_api_sum' stats report Time (%) Total Time (ns) Num Calls Avg (ns) Med (ns) Min (ns) Max (ns) StdDev (ns) Name -------- --------------- --------- --------- --------- -------- -------- ----------- ------------------------------------------------ - 66.6 60848371 17 3579315.9 1358264.0 3404 12968043 4456513.8 cudaLaunchKernel - 26.5 24224471 15 1614964.7 1135046.0 23751 3518519 1143699.1 cuLibraryLoadData - 3.1 2818835 1 2818835.0 2818835.0 2818835 2818835 0.0 cudaGetDeviceProperties_v2_v12000 - 1.3 1152285 1 1152285.0 1152285.0 1152285 1152285 0.0 cudaFree - 1.3 1148664 10 114866.4 129926.5 2424 170853 50599.6 cudaMalloc - 0.8 761842 1 761842.0 761842.0 761842 761842 0.0 cudaHostAlloc - 0.1 131534 838 157.0 109.0 54 869 103.4 cuGetProcAddress_v2 - 0.1 85445 1 85445.0 85445.0 85445 85445 0.0 cuModuleLoadDataEx - 0.1 69924 1 69924.0 69924.0 69924 69924 0.0 cuLaunchKernelEx - 0.1 56223 1 56223.0 56223.0 56223 56223 0.0 cuModuleUnload - 0.0 42609 15 2840.6 3359.0 337 6672 1733.9 cuLibraryGetKernel - 0.0 17938 11 1630.7 1189.0 197 7041 1932.6 cudaStreamIsCapturing_v10000 - 0.0 13875 1 13875.0 13875.0 13875 13875 0.0 cudaMemcpyAsync - 0.0 8862 1 8862.0 8862.0 8862 8862 0.0 cudaStreamSynchronize - 0.0 7985 18 443.6 253.0 181 1624 459.7 cudaEventCreateWithFlags - 0.0 7060 18 392.2 302.0 284 1768 344.6 cudaOccupancyMaxActiveClusters_v11070 - 0.0 6934 1 6934.0 6934.0 6934 6934 0.0 cudaMemsetAsync - 0.0 6317 1 6317.0 6317.0 6317 6317 0.0 cuLaunchKernel - 0.0 4201 3 1400.3 1292.0 1058 1851 407.4 cuInit - 0.0 1632 1 1632.0 1632.0 1632 1632 0.0 cudaOccupancyAvailableDynamicSMemPerBlock_v10200 - 0.0 1457 2 728.5 728.5 100 1357 888.8 cuTensorMapEncodeTiled - 0.0 1457 2 728.5 728.5 261 1196 661.1 cuKernelSetAttribute - 0.0 1413 3 471.0 109.0 86 1218 647.0 cuModuleGetLoadingMode - 0.0 862 2 431.0 431.0 341 521 127.3 cudaGetDriverEntryPoint_v11030 - 0.0 482 2 241.0 241.0 108 374 188.1 cuKernelGetAttribute + 66.4 58554397 17 3444376.3 1283744.0 3659 12453354 4284796.7 cudaLaunchKernel + 26.4 23317496 15 1554499.7 1117762.0 24213 3296505 1072246.5 cuLibraryLoadData + 3.1 2775075 1 2775075.0 2775075.0 2775075 2775075 0.0 cudaGetDeviceProperties_v2_v12000 + 1.4 1252978 1 1252978.0 1252978.0 1252978 1252978 0.0 cudaFree + 1.2 1098580 10 109858.0 127937.0 2202 166283 48955.1 cudaMalloc + 0.8 718882 1 718882.0 718882.0 718882 718882 0.0 cudaHostAlloc + 0.2 144198 838 172.1 121.0 56 779 111.8 cuGetProcAddress_v2 + 0.1 87046 1 87046.0 87046.0 87046 87046 0.0 cuModuleLoadDataEx + 0.1 68550 1 68550.0 68550.0 68550 68550 0.0 cuLaunchKernelEx + 0.1 57299 15 3819.9 4573.0 319 8646 2383.1 cuLibraryGetKernel + 0.1 47273 1 47273.0 47273.0 47273 47273 0.0 cuModuleUnload + 0.0 21030 19 1106.8 458.0 188 7121 1618.8 cudaStreamIsCapturing_v10000 + 0.0 13711 1 13711.0 13711.0 13711 13711 0.0 cudaMemcpyAsync + 0.0 9193 1 9193.0 9193.0 9193 9193 0.0 cudaStreamSynchronize + 0.0 7214 18 400.8 302.5 291 1948 386.8 cudaOccupancyMaxActiveClusters_v11070 + 0.0 6888 1 6888.0 6888.0 6888 6888 0.0 cudaMemsetAsync + 0.0 6299 18 349.9 267.0 188 1823 370.7 cudaEventCreateWithFlags + 0.0 5850 1 5850.0 5850.0 5850 5850 0.0 cuLaunchKernel + 0.0 3416 3 1138.7 1090.0 840 1486 325.7 cuInit + 0.0 1882 1 1882.0 1882.0 1882 1882 0.0 cudaOccupancyAvailableDynamicSMemPerBlock_v10200 + 0.0 1447 2 723.5 723.5 125 1322 846.4 cuTensorMapEncodeTiled + 0.0 1314 2 657.0 657.0 164 1150 697.2 cuKernelSetAttribute + 0.0 1163 3 387.7 111.0 91 961 496.6 cuModuleGetLoadingMode + 0.0 1038 2 519.0 519.0 285 753 330.9 cudaGetDriverEntryPoint_v11030 + 0.0 511 2 255.5 255.5 76 435 253.9 cuKernelGetAttribute [5/7] Executing 'cuda_gpu_kern_sum' stats report Time (%) Total Time (ns) Instances Avg (ns) Med (ns) Min (ns) Max (ns) StdDev (ns) Name -------- --------------- --------- -------- -------- -------- -------- ----------- ---------------------------------------------------------------------------------------------------- - 33.8 135999 1 135999.0 135999.0 135999 135999 0.0 ::nvfuser_none_f0_c0_r0_g0(::Tensor<::__half, (int)3, (int)3>, … - 22.6 91103 1 91103.0 91103.0 91103 91103 0.0 nvjet_hsh_128x256_64x4_2x1_v_bz_coopA_NTN - - 20.1 80672 2 40336.0 40336.0 40256 40416 113.1 void at::native::::distribution_elementwise_grid_stride_kernel, std::arra… - 3.3 13472 2 6736.0 6736.0 6656 6816 113.1 void at::native::vectorized_elementwise_kernel<(int)4, at::native::BinaryFunctor, std:… - 2.1 8512 1 8512.0 8512.0 8512 8512 0.0 void at::native::reduce_kernel<(int)512, (int)1, at::native::ReduceOp::CompareFunctor… - 1.6 6528 1 6528.0 6528.0 6528 6528 0.0 void at::native::vectorized_elementwise_kernel<(int)4, at::native::AUnaryFunctor, std::arr… - 1.0 4128 1 4128.0 4128.0 4128 4128 0.0 void at::native::vectorized_elementwise_kernel<(int)4, at::native::AUnaryFunctor::nvfuser_none_f0_c0_r0_g0(::Tensor<::__half, (int)3, (int)3>, … + 22.5 90239 1 90239.0 90239.0 90239 90239 0.0 nvjet_hsh_128x256_64x4_2x1_v_bz_coopA_NTN + + 20.1 80543 2 40271.5 40271.5 40223 40320 68.6 void at::native::::distribution_elementwise_grid_stride_kernel, std::arra… + 3.3 13184 2 6592.0 6592.0 6432 6752 226.3 void at::native::vectorized_elementwise_kernel<(int)4, at::native::BinaryFunctor, std:… + 2.2 8736 1 8736.0 8736.0 8736 8736 0.0 void at::native::reduce_kernel<(int)512, (int)1, at::native::ReduceOp::CompareFunctor… + 1.6 6464 1 6464.0 6464.0 6464 6464 0.0 void at::native::vectorized_elementwise_kernel<(int)4, at::native::AUnaryFunctor, std::arr… [6/7] Executing 'cuda_gpu_mem_time_sum' stats report Time (%) Total Time (ns) Count Avg (ns) Med (ns) Min (ns) Max (ns) StdDev (ns) Operation -------- --------------- ----- -------- -------- -------- -------- ----------- ---------------------------- - 72.6 2880 1 2880.0 2880.0 2880 2880 0.0 [CUDA memcpy Device-to-Host] - 27.4 1088 1 1088.0 1088.0 1088 1088 0.0 [CUDA memset] + 73.8 2976 1 2976.0 2976.0 2976 2976 0.0 [CUDA memcpy Device-to-Host] + 26.2 1056 1 1056.0 1056.0 1056 1056 0.0 [CUDA memset] [7/7] Executing 'cuda_gpu_mem_size_sum' stats report @@ -217,5 +219,5 @@ Generating '/tmp/nsys-report-bf35.qdstrm' 0.000 1 0.000 0.000 0.000 0.000 0.000 [CUDA memcpy Device-to-Host] Generated: - /opt/pytorch/nvfuser/report1.nsys-rep - /opt/pytorch/nvfuser/report1.sqlite + /opt/pytorch/nvfuser/report2.nsys-rep + /opt/pytorch/nvfuser/report2.sqlite diff --git a/report2.nsys-rep b/report2.nsys-rep new file mode 100644 index 00000000000..a07083b50f5 Binary files /dev/null and b/report2.nsys-rep differ diff --git a/report2.sqlite b/report2.sqlite new file mode 100644 index 00000000000..bab69540a92 Binary files /dev/null and b/report2.sqlite differ