From 77a8db70727e9b6b47e2fbf7b6c153cce439285e Mon Sep 17 00:00:00 2001 From: Jingyue Wu Date: Thu, 7 Nov 2024 10:47:41 -0800 Subject: [PATCH] Redo #3326. The change was accidentally reverted in https://github.com/NVIDIA/Fuser/pull/3349. --- tests/cpp/test_allocation_domain.cpp | 144 ++++++++++++--------------- 1 file changed, 61 insertions(+), 83 deletions(-) diff --git a/tests/cpp/test_allocation_domain.cpp b/tests/cpp/test_allocation_domain.cpp index aca1adea75a..759960c722e 100644 --- a/tests/cpp/test_allocation_domain.cpp +++ b/tests/cpp/test_allocation_domain.cpp @@ -29,8 +29,7 @@ using ::testing::ElementsAre; // A global->shared->global copy kernel, shared memory allocated transposed to // avoid bank conflict. TEST_F(AllocationDomainTest, TransposedIntermediate) { - auto fusion_ptr = std::make_unique(); - Fusion& fusion = *fusion_ptr.get(); + Fusion fusion; FusionGuard fg(&fusion); auto tv0 = makeContigConcreteTensor({32, 32}); @@ -59,16 +58,15 @@ TEST_F(AllocationDomainTest, TransposedIntermediate) { at::Tensor t0 = at::randn({32, 32}, options); KernelExecutor ke; - ke.compile(fusion_ptr.get(), {t0}); - auto cg_outputs = ke.run({t0}); + ke.compileFusion(&fusion, {t0}); + auto cg_outputs = ke.runFusion({t0}); testValidate(&fusion, cg_outputs, {t0}, __LINE__, __FILE__); } // A global->global copy kernel converting NCHW memory format into NHWC, with a // 4d allocation domain in output. TEST_F(AllocationDomainTest, NCHW4d_To_NHWC4d) { - auto fusion_ptr = std::make_unique(); - Fusion& fusion = *fusion_ptr.get(); + Fusion fusion; FusionGuard fg(&fusion); auto tv0 = makeContigTensor(4); @@ -97,7 +95,7 @@ TEST_F(AllocationDomainTest, NCHW4d_To_NHWC4d) { at::Tensor t0 = at::randn({n, c, h, w}, options); KernelExecutor ke; - ke.compile(fusion_ptr.get(), {t0}); + ke.compileFusion(&fusion, {t0}); auto cg_outputs = ke.run({t0}); @@ -109,8 +107,7 @@ TEST_F(AllocationDomainTest, NCHW4d_To_NHWC4d) { // A global->global copy kernel converting NCHW memory format into NHWC, with a // 1d allocation domain in output. TEST_F(AllocationDomainTest, NCHW4d_To_NHWC1d) { - auto fusion_ptr = std::make_unique(); - Fusion& fusion = *fusion_ptr.get(); + Fusion fusion; FusionGuard fg(&fusion); auto tv0 = makeContigTensor(4); @@ -136,7 +133,7 @@ TEST_F(AllocationDomainTest, NCHW4d_To_NHWC1d) { at::Tensor t0 = at::randn({n, c, h, w}, options); KernelExecutor ke; - ke.compile(fusion_ptr.get(), {t0}); + ke.compileFusion(&fusion, {t0}); auto cg_outputs = ke.run({t0}); @@ -148,8 +145,7 @@ TEST_F(AllocationDomainTest, NCHW4d_To_NHWC1d) { // A global->global copy kernel converting NCHW memory format into NHWC, with a // 2d allocation domain in output. TEST_F(AllocationDomainTest, NCHW4d_To_NHWC2d) { - auto fusion_ptr = std::make_unique(); - Fusion& fusion = *fusion_ptr.get(); + Fusion fusion; FusionGuard fg(&fusion); auto tv0 = makeContigTensor(4); @@ -176,7 +172,7 @@ TEST_F(AllocationDomainTest, NCHW4d_To_NHWC2d) { at::Tensor t0 = at::randn({n, c, h, w}, options); KernelExecutor ke; - ke.compile(fusion_ptr.get(), {t0}); + ke.compileFusion(&fusion, {t0}); auto cg_outputs = ke.run({t0}); @@ -188,8 +184,7 @@ TEST_F(AllocationDomainTest, NCHW4d_To_NHWC2d) { // Reshape and transpose a 3d tensor into an NHWC tensor with a 3d allocation // domain in fusion output. TEST_F(AllocationDomainTest, Tensor3d_To_NHWC3d) { - auto fusion_ptr = std::make_unique(); - Fusion& fusion = *fusion_ptr.get(); + Fusion fusion; FusionGuard fg(&fusion); int n1 = 31, n2 = 29, h = 64, w = 104, c = 21; @@ -223,7 +218,7 @@ TEST_F(AllocationDomainTest, Tensor3d_To_NHWC3d) { at::Tensor t0 = at::randn({n1, n2, h * w * c}, options); KernelExecutor ke; - ke.compile(fusion_ptr.get(), {t0}); + ke.compileFusion(&fusion, {t0}); auto cg_outputs = ke.run({t0}); @@ -242,8 +237,7 @@ TEST_F(AllocationDomainTest, Tensor3d_To_NHWC3d) { // output. The allocation domain is on both the producer and the consumer side // of the rFactor domain. TEST_F(AllocationDomainTest, Tensor3d_To_NHWC4d_FwdBwd) { - auto fusion_ptr = std::make_unique(); - Fusion& fusion = *fusion_ptr.get(); + Fusion fusion; FusionGuard fg(&fusion); int n1 = 31, n2 = 29, h = 64, w = 104, c = 21; @@ -283,7 +277,7 @@ TEST_F(AllocationDomainTest, Tensor3d_To_NHWC4d_FwdBwd) { at::Tensor t0 = at::randn({n1, n2, c * h * w}, options); KernelExecutor ke; - ke.compile(fusion_ptr.get(), {t0}); + ke.compileFusion(&fusion, {t0}); auto cg_outputs = ke.run({t0}); @@ -301,8 +295,7 @@ TEST_F(AllocationDomainTest, Tensor3d_To_NHWC4d_FwdBwd) { // A global->global copy kernel where both inputs and outputs are NHWC memory // format TEST_F(AllocationDomainTest, NHWC4d_To_NHWC4d) { - auto fusion_ptr = std::make_unique(); - Fusion& fusion = *fusion_ptr.get(); + Fusion fusion; FusionGuard fg(&fusion); auto tv0 = makeContigTensor(4); @@ -339,7 +332,7 @@ TEST_F(AllocationDomainTest, NHWC4d_To_NHWC4d) { t0_wrong_format.as_strided({n, c, h, w}, {h * w * c, 1, w * c, c}); KernelExecutor ke; - ke.compile(fusion_ptr.get(), {t0}); + ke.compileFusion(&fusion, {t0}); EXPECT_THAT( [&]() { ke.run({t0_wrong_format}); }, @@ -356,8 +349,7 @@ TEST_F(AllocationDomainTest, NHWC4d_To_NHWC4d) { // A global->global copy kernel where both inputs are NHWC memory format. The // allocation domain view the input as a 1d tensor. TEST_F(AllocationDomainTest, NHWC1d_To_NHWC4d) { - auto fusion_ptr = std::make_unique(); - Fusion& fusion = *fusion_ptr.get(); + Fusion fusion; FusionGuard fg(&fusion); int n = 31, h = 64, w = 103, c = 21; @@ -398,7 +390,7 @@ TEST_F(AllocationDomainTest, NHWC1d_To_NHWC4d) { t0_wrong_format.as_strided({n, c, h, w}, {h * w * c, 1, w * c, c}); KernelExecutor ke; - ke.compile(fusion_ptr.get(), {t0}); + ke.compileFusion(&fusion, {t0}); EXPECT_THAT( [&]() { ke.run({t0_wrong_format}); }, @@ -415,8 +407,7 @@ TEST_F(AllocationDomainTest, NHWC1d_To_NHWC4d) { // A global->global copy kernel where both inputs are NHWC memory format. The // allocation domain of the output view the output as a 1d tensor. TEST_F(AllocationDomainTest, NHWC4d_To_NHWC1d) { - auto fusion_ptr = std::make_unique(); - Fusion& fusion = *fusion_ptr.get(); + Fusion fusion; FusionGuard fg(&fusion); int n = 31, h = 64, w = 103, c = 21; @@ -454,7 +445,7 @@ TEST_F(AllocationDomainTest, NHWC4d_To_NHWC1d) { t0_wrong_format.as_strided({n, c, h, w}, {h * w * c, 1, w * c, c}); KernelExecutor ke; - ke.compile(fusion_ptr.get(), {t0}); + ke.compileFusion(&fusion, {t0}); EXPECT_THAT( [&]() { ke.run({t0_wrong_format}); }, @@ -471,8 +462,7 @@ TEST_F(AllocationDomainTest, NHWC4d_To_NHWC1d) { // A global->global copy kernel where both inputs are NHWC memory format. The // allocation domain view both the input and the output as a 1d tensors. TEST_F(AllocationDomainTest, NHWC1d_To_NHWC1d) { - auto fusion_ptr = std::make_unique(); - Fusion& fusion = *fusion_ptr.get(); + Fusion fusion; FusionGuard fg(&fusion); int n = 31, h = 64, w = 103, c = 21; @@ -515,7 +505,7 @@ TEST_F(AllocationDomainTest, NHWC1d_To_NHWC1d) { t0_wrong_format.as_strided({n, c, h, w}, {h * w * c, 1, w * c, c}); KernelExecutor ke; - ke.compile(fusion_ptr.get(), {t0}); + ke.compileFusion(&fusion, {t0}); EXPECT_THAT( [&]() { ke.run({t0_wrong_format}); }, @@ -533,8 +523,7 @@ TEST_F(AllocationDomainTest, NHWC1d_To_NHWC1d) { // allocation domain view the input as a 2d tensor of shape [N*H/8, 8*W*C], and // view the output as a 2d tensor of shape [N*H*W*C/4, 4] TEST_F(AllocationDomainTest, NHWC2d_To_NHWC2d) { - auto fusion_ptr = std::make_unique(); - Fusion& fusion = *fusion_ptr.get(); + Fusion fusion; FusionGuard fg(&fusion); int n = 31, h = 64, w = 103, c = 21; @@ -583,7 +572,7 @@ TEST_F(AllocationDomainTest, NHWC2d_To_NHWC2d) { t0_wrong_format.as_strided({n, c, h, w}, {h * w * c, 1, w * c, c}); KernelExecutor ke; - ke.compile(fusion_ptr.get(), {t0}); + ke.compileFusion(&fusion, {t0}); EXPECT_THAT( [&]() { ke.run({t0_wrong_format}); }, @@ -599,8 +588,7 @@ TEST_F(AllocationDomainTest, NHWC2d_To_NHWC2d) { // Similar to NHWC4d_To_NHWC4d, but does a cacheBefore TEST_F(AllocationDomainTest, NHWC4d_To_NHWC4d_cacheBefore) { - auto fusion_ptr = std::make_unique(); - Fusion& fusion = *fusion_ptr.get(); + Fusion fusion; FusionGuard fg(&fusion); auto tv0 = makeContigTensor(4); @@ -648,7 +636,7 @@ TEST_F(AllocationDomainTest, NHWC4d_To_NHWC4d_cacheBefore) { t0_wrong_format.as_strided({n, c, h, w}, {h * w * c, 1, w * c, c}); KernelExecutor ke; - ke.compile(fusion_ptr.get(), {t0}); + ke.compileFusion(&fusion, {t0}); EXPECT_THAT( [&]() { ke.run({t0_wrong_format}); }, @@ -664,8 +652,7 @@ TEST_F(AllocationDomainTest, NHWC4d_To_NHWC4d_cacheBefore) { // Similar to NHWC2d_To_NHWC2d, but does a cacheBefore TEST_F(AllocationDomainTest, NHWC2d_To_NHWC2d_cacheBefore) { - auto fusion_ptr = std::make_unique(); - Fusion& fusion = *fusion_ptr.get(); + Fusion fusion; FusionGuard fg(&fusion); int n = 31, h = 64, w = 103, c = 21; @@ -725,7 +712,7 @@ TEST_F(AllocationDomainTest, NHWC2d_To_NHWC2d_cacheBefore) { t0_wrong_format.as_strided({n, c, h, w}, {h * w * c, 1, w * c, c}); KernelExecutor ke; - ke.compile(fusion_ptr.get(), {t0}); + ke.compileFusion(&fusion, {t0}); EXPECT_THAT( [&]() { ke.run({t0_wrong_format}); }, @@ -741,8 +728,7 @@ TEST_F(AllocationDomainTest, NHWC2d_To_NHWC2d_cacheBefore) { // Similar to NHWC4d_To_NHWC4d, but does a cacheAfter TEST_F(AllocationDomainTest, NHWC4d_To_NHWC4d_cacheAfter) { - auto fusion_ptr = std::make_unique(); - Fusion& fusion = *fusion_ptr.get(); + Fusion fusion; FusionGuard fg(&fusion); auto tv0 = makeContigTensor(4); @@ -790,7 +776,7 @@ TEST_F(AllocationDomainTest, NHWC4d_To_NHWC4d_cacheAfter) { t0_wrong_format.as_strided({n, c, h, w}, {h * w * c, 1, w * c, c}); KernelExecutor ke; - ke.compile(fusion_ptr.get(), {t0}); + ke.compileFusion(&fusion, {t0}); EXPECT_THAT( [&]() { ke.run({t0_wrong_format}); }, @@ -808,8 +794,7 @@ TEST_F(AllocationDomainTest, NHWC4d_To_NHWC4d_cacheAfter) { // allocation tensor to be between rFactor domain and loop domain, which is not // the case for NHWC2d_To_NHWC2d TEST_F(AllocationDomainTest, NHWC2d_To_NHWC2d_cacheAfter) { - auto fusion_ptr = std::make_unique(); - Fusion& fusion = *fusion_ptr.get(); + Fusion fusion; FusionGuard fg(&fusion); int n = 31, h = 64, w = 103, c = 21; @@ -861,7 +846,7 @@ TEST_F(AllocationDomainTest, NHWC2d_To_NHWC2d_cacheAfter) { t0_wrong_format.as_strided({n, c, h, w}, {h * w * c, 1, w * c, c}); KernelExecutor ke; - ke.compile(fusion_ptr.get(), {t0}); + ke.compileFusion(&fusion, {t0}); EXPECT_THAT( [&]() { ke.run({t0_wrong_format}); }, @@ -877,8 +862,7 @@ TEST_F(AllocationDomainTest, NHWC2d_To_NHWC2d_cacheAfter) { // Similar to NHWC4d_To_NHWC4d, but does a cacheFork TEST_F(AllocationDomainTest, NHWC4d_To_NHWC4d_cacheFork) { - auto fusion_ptr = std::make_unique(); - Fusion& fusion = *fusion_ptr.get(); + Fusion fusion; FusionGuard fg(&fusion); auto tv0 = makeContigTensor(4); @@ -933,7 +917,7 @@ TEST_F(AllocationDomainTest, NHWC4d_To_NHWC4d_cacheFork) { t0_wrong_format.as_strided({n, c, h, w}, {h * w * c, 1, w * c, c}); KernelExecutor ke; - ke.compile(fusion_ptr.get(), {t0}); + ke.compileFusion(&fusion, {t0}); EXPECT_THAT( [&]() { ke.run({t0_wrong_format}); }, @@ -949,8 +933,7 @@ TEST_F(AllocationDomainTest, NHWC4d_To_NHWC4d_cacheFork) { // Similar to NHWC2d_To_NHWC2d, but does a cacheFork TEST_F(AllocationDomainTest, NHWC2d_To_NHWC2d_cacheFork) { - auto fusion_ptr = std::make_unique(); - Fusion& fusion = *fusion_ptr.get(); + Fusion fusion; FusionGuard fg(&fusion); int n = 31, h = 64, w = 103, c = 21; @@ -1023,7 +1006,7 @@ TEST_F(AllocationDomainTest, NHWC2d_To_NHWC2d_cacheFork) { t0_wrong_format.as_strided({n, c, h, w}, {h * w * c, 1, w * c, c}); KernelExecutor ke; - ke.compile(fusion_ptr.get(), {t0}); + ke.compileFusion(&fusion, {t0}); EXPECT_THAT( [&]() { ke.run({t0_wrong_format}); }, @@ -1038,30 +1021,29 @@ TEST_F(AllocationDomainTest, NHWC2d_To_NHWC2d_cacheFork) { } TEST_F(AllocationDomainTest, VectorizationIssue902) { - auto fusion_ptr = std::make_unique(); - auto& fusion = *fusion_ptr; - FusionGuard fg(fusion_ptr.get()); + auto fusion = std::make_unique(); + FusionGuard fg(fusion.get()); const std::vector shape({16, 16, 512, 64}); auto tv0 = makeContigTensor(4); - fusion.addInput(tv0); + fusion->addInput(tv0); auto tv1 = set(tv0); - fusion.addOutput(tv1); + fusion->addOutput(tv1); - std::vector aloc_domain; - aloc_domain.push_back(tv1->axis(0)); - aloc_domain.push_back(tv1->axis(2)); - aloc_domain.push_back(tv1->axis(3)); - aloc_domain.push_back(tv1->axis(1)); - tv1->setAllocationDomain(aloc_domain, true); + std::vector alloc_domain; + alloc_domain.push_back(tv1->axis(0)); + alloc_domain.push_back(tv1->axis(2)); + alloc_domain.push_back(tv1->axis(3)); + alloc_domain.push_back(tv1->axis(1)); + tv1->setAllocationDomain(alloc_domain, true); auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); auto t0 = at::randn(shape, options); std::vector aten_inputs({t0}); - FusionExecutorCache executor_cache(std::move(fusion_ptr)); + FusionExecutorCache executor_cache(std::move(fusion)); auto cg_outputs = executor_cache.runFusionWithInputs(aten_inputs); ASSERT_TRUE(cg_outputs[0].equal(t0)); @@ -1101,9 +1083,8 @@ TEST_F(AllocationDomainTest, TransposeMatrix) { } TEST_F(AllocationDomainTest, ContiguityIssue1021) { - std::unique_ptr fusion_ptr = std::make_unique(); - Fusion* fusion = fusion_ptr.get(); - FusionGuard fg(fusion); + auto fusion = std::make_unique(); + FusionGuard fg(fusion.get()); auto tv0 = TensorViewBuilder() .ndims(2) @@ -1119,17 +1100,16 @@ TEST_F(AllocationDomainTest, ContiguityIssue1021) { auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::Tensor t0 = at::randn({8, 8}, options).as_strided({4, 8}, {1, 8}); - FusionExecutorCache executor_cache(std::move(fusion_ptr)); + FusionExecutorCache executor_cache(std::move(fusion)); auto outputs = executor_cache.runFusionWithInputs({t0}); auto t1 = t0.add(5.0); - testValidate(fusion, outputs, {t0}, __LINE__, __FILE__); + testValidate(executor_cache.fusion(), outputs, {t0}, __LINE__, __FILE__); } TEST_F(AllocationDomainTest, ContiguityForBroadcast) { - std::unique_ptr fusion_ptr = std::make_unique(); - Fusion* fusion = fusion_ptr.get(); - FusionGuard fg(fusion); + auto fusion = std::make_unique(); + FusionGuard fg(fusion.get()); auto tv0 = TensorViewBuilder() .ndims(2) @@ -1145,17 +1125,16 @@ TEST_F(AllocationDomainTest, ContiguityForBroadcast) { auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::Tensor t0 = at::randn({1, 1}, options).as_strided({1, 1}, {0, 3}); - FusionExecutorCache executor_cache(std::move(fusion_ptr)); + FusionExecutorCache executor_cache(std::move(fusion)); auto outputs = executor_cache.runFusionWithInputs({t0}); auto t1 = t0.add(5.0); - testValidate(fusion, outputs, {t0}, __LINE__, __FILE__); + testValidate(executor_cache.fusion(), outputs, {t0}, __LINE__, __FILE__); } TEST_F(AllocationDomainTest, ContiguityForExplicitBroadcast) { - std::unique_ptr fusion_ptr = std::make_unique(); - Fusion* fusion = fusion_ptr.get(); - FusionGuard fg(fusion); + auto fusion = std::make_unique(); + FusionGuard fg(fusion.get()); auto tv0 = TensorViewBuilder() .ndims(3) @@ -1172,11 +1151,11 @@ TEST_F(AllocationDomainTest, ContiguityForExplicitBroadcast) { auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::Tensor t0 = at::randn({4, 8}, options).as_strided({3, 8, 4}, {0, 1, 8}); - FusionExecutorCache executor_cache(std::move(fusion_ptr)); + FusionExecutorCache executor_cache(std::move(fusion)); auto outputs = executor_cache.runFusionWithInputs({t0}); auto t1 = t0.add(5.0); - testValidate(fusion, outputs, {t0}, __LINE__, __FILE__); + testValidate(executor_cache.fusion(), outputs, {t0}, __LINE__, __FILE__); } // Test that allocation domain can be used to vectorize overlapping tensors, @@ -1189,8 +1168,7 @@ TEST_F(AllocationDomainTest, ContiguityForExplicitBroadcast) { // automatically supports all kinds of use cases, even those that we don't have // an active plan to support on). TEST_F(AllocationDomainTest, VectorizeOverlappingTensor) { - auto fusion_ptr = std::make_unique(); - Fusion& fusion = *fusion_ptr.get(); + Fusion fusion; FusionGuard fg(&fusion); auto tv0 = makeContigTensor(3); @@ -1226,8 +1204,8 @@ TEST_F(AllocationDomainTest, VectorizeOverlappingTensor) { at::randn({4 * 5 * 7}).cuda().as_strided({4, 5, 7}, {7, 4, 1}); KernelExecutor ke; - ke.compile(fusion_ptr.get(), {t0}); - auto cg_outputs = ke.run({t0}); + ke.compileFusion(&fusion, {t0}); + auto cg_outputs = ke.runFusion({t0}); testValidate(&fusion, cg_outputs, {t0}, __LINE__, __FILE__); }