From 77a8db70727e9b6b47e2fbf7b6c153cce439285e Mon Sep 17 00:00:00 2001
From: Jingyue Wu <wujingyue@gmail.com>
Date: Thu, 7 Nov 2024 10:47:41 -0800
Subject: [PATCH] Redo #3326.

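This re-applies #3326, which was accidentally reverted in
https://github.com/NVIDIA/Fuser/pull/3349.

Summary of the re-applied change to tests/cpp/test_allocation_domain.cpp:

* Tests that drive a KernelExecutor directly now declare a plain
  `Fusion fusion;` instead of allocating a `std::unique_ptr<Fusion>` and
  taking a reference to it.
* Tests that hand the fusion to FusionExecutorCache keep the unique_ptr
  (named `fusion`), and validate against `executor_cache.fusion()` instead
  of a raw pointer captured before the `std::move`.
* The local `aloc_domain` in VectorizationIssue902 is renamed to
  `alloc_domain`.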
---
 tests/cpp/test_allocation_domain.cpp | 144 ++++++++++++---------------
 1 file changed, 61 insertions(+), 83 deletions(-)
diff --git a/tests/cpp/test_allocation_domain.cpp b/tests/cpp/test_allocation_domain.cpp
index aca1adea75a..759960c722e 100644
--- a/tests/cpp/test_allocation_domain.cpp
+++ b/tests/cpp/test_allocation_domain.cpp
@@ -29,8 +29,7 @@ using ::testing::ElementsAre;
 // A global->shared->global copy kernel, shared memory allocated transposed to
 // avoid bank conflict.
 TEST_F(AllocationDomainTest, TransposedIntermediate) {
-  auto fusion_ptr = std::make_unique<Fusion>();
-  Fusion& fusion = *fusion_ptr.get();
+  Fusion fusion;
   FusionGuard fg(&fusion);
 
   auto tv0 = makeContigConcreteTensor({32, 32});
@@ -59,16 +58,15 @@ TEST_F(AllocationDomainTest, TransposedIntermediate) {
   at::Tensor t0 = at::randn({32, 32}, options);
 
   KernelExecutor ke;
-  ke.compile(fusion_ptr.get(), {t0});
-  auto cg_outputs = ke.run({t0});
+  ke.compileFusion(&fusion, {t0});
+  auto cg_outputs = ke.runFusion({t0});
   testValidate(&fusion, cg_outputs, {t0}, __LINE__, __FILE__);
 }
 
 // A global->global copy kernel converting NCHW memory format into NHWC, with a
 // 4d allocation domain in output.
 TEST_F(AllocationDomainTest, NCHW4d_To_NHWC4d) {
-  auto fusion_ptr = std::make_unique<Fusion>();
-  Fusion& fusion = *fusion_ptr.get();
+  Fusion fusion;
   FusionGuard fg(&fusion);
 
   auto tv0 = makeContigTensor(4);
@@ -97,7 +95,7 @@ TEST_F(AllocationDomainTest, NCHW4d_To_NHWC4d) {
   at::Tensor t0 = at::randn({n, c, h, w}, options);
 
   KernelExecutor ke;
-  ke.compile(fusion_ptr.get(), {t0});
+  ke.compileFusion(&fusion, {t0});
 
   auto cg_outputs = ke.run({t0});
 
@@ -109,8 +107,7 @@ TEST_F(AllocationDomainTest, NCHW4d_To_NHWC4d) {
 // A global->global copy kernel converting NCHW memory format into NHWC, with a
 // 1d allocation domain in output.
 TEST_F(AllocationDomainTest, NCHW4d_To_NHWC1d) {
-  auto fusion_ptr = std::make_unique<Fusion>();
-  Fusion& fusion = *fusion_ptr.get();
+  Fusion fusion;
   FusionGuard fg(&fusion);
 
   auto tv0 = makeContigTensor(4);
@@ -136,7 +133,7 @@ TEST_F(AllocationDomainTest, NCHW4d_To_NHWC1d) {
   at::Tensor t0 = at::randn({n, c, h, w}, options);
 
   KernelExecutor ke;
-  ke.compile(fusion_ptr.get(), {t0});
+  ke.compileFusion(&fusion, {t0});
 
   auto cg_outputs = ke.run({t0});
 
@@ -148,8 +145,7 @@ TEST_F(AllocationDomainTest, NCHW4d_To_NHWC1d) {
 // A global->global copy kernel converting NCHW memory format into NHWC, with a
 // 2d allocation domain in output.
 TEST_F(AllocationDomainTest, NCHW4d_To_NHWC2d) {
-  auto fusion_ptr = std::make_unique<Fusion>();
-  Fusion& fusion = *fusion_ptr.get();
+  Fusion fusion;
   FusionGuard fg(&fusion);
 
   auto tv0 = makeContigTensor(4);
@@ -176,7 +172,7 @@ TEST_F(AllocationDomainTest, NCHW4d_To_NHWC2d) {
   at::Tensor t0 = at::randn({n, c, h, w}, options);
 
   KernelExecutor ke;
-  ke.compile(fusion_ptr.get(), {t0});
+  ke.compileFusion(&fusion, {t0});
 
   auto cg_outputs = ke.run({t0});
 
@@ -188,8 +184,7 @@ TEST_F(AllocationDomainTest, NCHW4d_To_NHWC2d) {
 // Reshape and transpose a 3d tensor into an NHWC tensor with a 3d allocation
 // domain in fusion output.
 TEST_F(AllocationDomainTest, Tensor3d_To_NHWC3d) {
-  auto fusion_ptr = std::make_unique<Fusion>();
-  Fusion& fusion = *fusion_ptr.get();
+  Fusion fusion;
   FusionGuard fg(&fusion);
 
   int n1 = 31, n2 = 29, h = 64, w = 104, c = 21;
@@ -223,7 +218,7 @@ TEST_F(AllocationDomainTest, Tensor3d_To_NHWC3d) {
   at::Tensor t0 = at::randn({n1, n2, h * w * c}, options);
 
   KernelExecutor ke;
-  ke.compile(fusion_ptr.get(), {t0});
+  ke.compileFusion(&fusion, {t0});
 
   auto cg_outputs = ke.run({t0});
 
@@ -242,8 +237,7 @@ TEST_F(AllocationDomainTest, Tensor3d_To_NHWC3d) {
 // output. The allocation domain is on both the producer and the consumer side
 // of the rFactor domain.
 TEST_F(AllocationDomainTest, Tensor3d_To_NHWC4d_FwdBwd) {
-  auto fusion_ptr = std::make_unique<Fusion>();
-  Fusion& fusion = *fusion_ptr.get();
+  Fusion fusion;
   FusionGuard fg(&fusion);
 
   int n1 = 31, n2 = 29, h = 64, w = 104, c = 21;
@@ -283,7 +277,7 @@ TEST_F(AllocationDomainTest, Tensor3d_To_NHWC4d_FwdBwd) {
   at::Tensor t0 = at::randn({n1, n2, c * h * w}, options);
 
   KernelExecutor ke;
-  ke.compile(fusion_ptr.get(), {t0});
+  ke.compileFusion(&fusion, {t0});
 
   auto cg_outputs = ke.run({t0});
 
@@ -301,8 +295,7 @@ TEST_F(AllocationDomainTest, Tensor3d_To_NHWC4d_FwdBwd) {
 // A global->global copy kernel where both inputs and outputs are NHWC memory
 // format
 TEST_F(AllocationDomainTest, NHWC4d_To_NHWC4d) {
-  auto fusion_ptr = std::make_unique<Fusion>();
-  Fusion& fusion = *fusion_ptr.get();
+  Fusion fusion;
   FusionGuard fg(&fusion);
 
   auto tv0 = makeContigTensor(4);
@@ -339,7 +332,7 @@ TEST_F(AllocationDomainTest, NHWC4d_To_NHWC4d) {
       t0_wrong_format.as_strided({n, c, h, w}, {h * w * c, 1, w * c, c});
 
   KernelExecutor ke;
-  ke.compile(fusion_ptr.get(), {t0});
+  ke.compileFusion(&fusion, {t0});
 
   EXPECT_THAT(
       [&]() { ke.run({t0_wrong_format}); },
@@ -356,8 +349,7 @@ TEST_F(AllocationDomainTest, NHWC4d_To_NHWC4d) {
 // A global->global copy kernel where both inputs are NHWC memory format. The
 // allocation domain view the input as a 1d tensor.
 TEST_F(AllocationDomainTest, NHWC1d_To_NHWC4d) {
-  auto fusion_ptr = std::make_unique<Fusion>();
-  Fusion& fusion = *fusion_ptr.get();
+  Fusion fusion;
   FusionGuard fg(&fusion);
 
   int n = 31, h = 64, w = 103, c = 21;
@@ -398,7 +390,7 @@ TEST_F(AllocationDomainTest, NHWC1d_To_NHWC4d) {
       t0_wrong_format.as_strided({n, c, h, w}, {h * w * c, 1, w * c, c});
 
   KernelExecutor ke;
-  ke.compile(fusion_ptr.get(), {t0});
+  ke.compileFusion(&fusion, {t0});
 
   EXPECT_THAT(
       [&]() { ke.run({t0_wrong_format}); },
@@ -415,8 +407,7 @@ TEST_F(AllocationDomainTest, NHWC1d_To_NHWC4d) {
 // A global->global copy kernel where both inputs are NHWC memory format. The
 // allocation domain of the output view the output as a 1d tensor.
 TEST_F(AllocationDomainTest, NHWC4d_To_NHWC1d) {
-  auto fusion_ptr = std::make_unique<Fusion>();
-  Fusion& fusion = *fusion_ptr.get();
+  Fusion fusion;
   FusionGuard fg(&fusion);
 
   int n = 31, h = 64, w = 103, c = 21;
@@ -454,7 +445,7 @@ TEST_F(AllocationDomainTest, NHWC4d_To_NHWC1d) {
       t0_wrong_format.as_strided({n, c, h, w}, {h * w * c, 1, w * c, c});
 
   KernelExecutor ke;
-  ke.compile(fusion_ptr.get(), {t0});
+  ke.compileFusion(&fusion, {t0});
 
   EXPECT_THAT(
       [&]() { ke.run({t0_wrong_format}); },
@@ -471,8 +462,7 @@ TEST_F(AllocationDomainTest, NHWC4d_To_NHWC1d) {
 // A global->global copy kernel where both inputs are NHWC memory format. The
 // allocation domain view both the input and the output as a 1d tensors.
 TEST_F(AllocationDomainTest, NHWC1d_To_NHWC1d) {
-  auto fusion_ptr = std::make_unique<Fusion>();
-  Fusion& fusion = *fusion_ptr.get();
+  Fusion fusion;
   FusionGuard fg(&fusion);
 
   int n = 31, h = 64, w = 103, c = 21;
@@ -515,7 +505,7 @@ TEST_F(AllocationDomainTest, NHWC1d_To_NHWC1d) {
       t0_wrong_format.as_strided({n, c, h, w}, {h * w * c, 1, w * c, c});
 
   KernelExecutor ke;
-  ke.compile(fusion_ptr.get(), {t0});
+  ke.compileFusion(&fusion, {t0});
 
   EXPECT_THAT(
       [&]() { ke.run({t0_wrong_format}); },
@@ -533,8 +523,7 @@ TEST_F(AllocationDomainTest, NHWC1d_To_NHWC1d) {
 // allocation domain view the input as a 2d tensor of shape [N*H/8, 8*W*C], and
 // view the output as a 2d tensor of shape [N*H*W*C/4, 4]
 TEST_F(AllocationDomainTest, NHWC2d_To_NHWC2d) {
-  auto fusion_ptr = std::make_unique<Fusion>();
-  Fusion& fusion = *fusion_ptr.get();
+  Fusion fusion;
   FusionGuard fg(&fusion);
 
   int n = 31, h = 64, w = 103, c = 21;
@@ -583,7 +572,7 @@ TEST_F(AllocationDomainTest, NHWC2d_To_NHWC2d) {
       t0_wrong_format.as_strided({n, c, h, w}, {h * w * c, 1, w * c, c});
 
   KernelExecutor ke;
-  ke.compile(fusion_ptr.get(), {t0});
+  ke.compileFusion(&fusion, {t0});
 
   EXPECT_THAT(
       [&]() { ke.run({t0_wrong_format}); },
@@ -599,8 +588,7 @@ TEST_F(AllocationDomainTest, NHWC2d_To_NHWC2d) {
 
 // Similar to NHWC4d_To_NHWC4d, but does a cacheBefore
 TEST_F(AllocationDomainTest, NHWC4d_To_NHWC4d_cacheBefore) {
-  auto fusion_ptr = std::make_unique<Fusion>();
-  Fusion& fusion = *fusion_ptr.get();
+  Fusion fusion;
   FusionGuard fg(&fusion);
 
   auto tv0 = makeContigTensor(4);
@@ -648,7 +636,7 @@ TEST_F(AllocationDomainTest, NHWC4d_To_NHWC4d_cacheBefore) {
       t0_wrong_format.as_strided({n, c, h, w}, {h * w * c, 1, w * c, c});
 
   KernelExecutor ke;
-  ke.compile(fusion_ptr.get(), {t0});
+  ke.compileFusion(&fusion, {t0});
 
   EXPECT_THAT(
       [&]() { ke.run({t0_wrong_format}); },
@@ -664,8 +652,7 @@ TEST_F(AllocationDomainTest, NHWC4d_To_NHWC4d_cacheBefore) {
 
 // Similar to NHWC2d_To_NHWC2d, but does a cacheBefore
 TEST_F(AllocationDomainTest, NHWC2d_To_NHWC2d_cacheBefore) {
-  auto fusion_ptr = std::make_unique<Fusion>();
-  Fusion& fusion = *fusion_ptr.get();
+  Fusion fusion;
   FusionGuard fg(&fusion);
 
   int n = 31, h = 64, w = 103, c = 21;
@@ -725,7 +712,7 @@ TEST_F(AllocationDomainTest, NHWC2d_To_NHWC2d_cacheBefore) {
       t0_wrong_format.as_strided({n, c, h, w}, {h * w * c, 1, w * c, c});
 
   KernelExecutor ke;
-  ke.compile(fusion_ptr.get(), {t0});
+  ke.compileFusion(&fusion, {t0});
 
   EXPECT_THAT(
       [&]() { ke.run({t0_wrong_format}); },
@@ -741,8 +728,7 @@ TEST_F(AllocationDomainTest, NHWC2d_To_NHWC2d_cacheBefore) {
 
 // Similar to NHWC4d_To_NHWC4d, but does a cacheAfter
 TEST_F(AllocationDomainTest, NHWC4d_To_NHWC4d_cacheAfter) {
-  auto fusion_ptr = std::make_unique<Fusion>();
-  Fusion& fusion = *fusion_ptr.get();
+  Fusion fusion;
   FusionGuard fg(&fusion);
 
   auto tv0 = makeContigTensor(4);
@@ -790,7 +776,7 @@ TEST_F(AllocationDomainTest, NHWC4d_To_NHWC4d_cacheAfter) {
       t0_wrong_format.as_strided({n, c, h, w}, {h * w * c, 1, w * c, c});
 
   KernelExecutor ke;
-  ke.compile(fusion_ptr.get(), {t0});
+  ke.compileFusion(&fusion, {t0});
 
   EXPECT_THAT(
       [&]() { ke.run({t0_wrong_format}); },
@@ -808,8 +794,7 @@ TEST_F(AllocationDomainTest, NHWC4d_To_NHWC4d_cacheAfter) {
 // allocation tensor to be between rFactor domain and loop domain, which is not
 // the case for NHWC2d_To_NHWC2d
 TEST_F(AllocationDomainTest, NHWC2d_To_NHWC2d_cacheAfter) {
-  auto fusion_ptr = std::make_unique<Fusion>();
-  Fusion& fusion = *fusion_ptr.get();
+  Fusion fusion;
   FusionGuard fg(&fusion);
 
   int n = 31, h = 64, w = 103, c = 21;
@@ -861,7 +846,7 @@ TEST_F(AllocationDomainTest, NHWC2d_To_NHWC2d_cacheAfter) {
       t0_wrong_format.as_strided({n, c, h, w}, {h * w * c, 1, w * c, c});
 
   KernelExecutor ke;
-  ke.compile(fusion_ptr.get(), {t0});
+  ke.compileFusion(&fusion, {t0});
 
   EXPECT_THAT(
       [&]() { ke.run({t0_wrong_format}); },
@@ -877,8 +862,7 @@ TEST_F(AllocationDomainTest, NHWC2d_To_NHWC2d_cacheAfter) {
 
 // Similar to NHWC4d_To_NHWC4d, but does a cacheFork
 TEST_F(AllocationDomainTest, NHWC4d_To_NHWC4d_cacheFork) {
-  auto fusion_ptr = std::make_unique<Fusion>();
-  Fusion& fusion = *fusion_ptr.get();
+  Fusion fusion;
   FusionGuard fg(&fusion);
 
   auto tv0 = makeContigTensor(4);
@@ -933,7 +917,7 @@ TEST_F(AllocationDomainTest, NHWC4d_To_NHWC4d_cacheFork) {
       t0_wrong_format.as_strided({n, c, h, w}, {h * w * c, 1, w * c, c});
 
   KernelExecutor ke;
-  ke.compile(fusion_ptr.get(), {t0});
+  ke.compileFusion(&fusion, {t0});
 
   EXPECT_THAT(
       [&]() { ke.run({t0_wrong_format}); },
@@ -949,8 +933,7 @@ TEST_F(AllocationDomainTest, NHWC4d_To_NHWC4d_cacheFork) {
 
 // Similar to NHWC2d_To_NHWC2d, but does a cacheFork
 TEST_F(AllocationDomainTest, NHWC2d_To_NHWC2d_cacheFork) {
-  auto fusion_ptr = std::make_unique<Fusion>();
-  Fusion& fusion = *fusion_ptr.get();
+  Fusion fusion;
   FusionGuard fg(&fusion);
 
   int n = 31, h = 64, w = 103, c = 21;
@@ -1023,7 +1006,7 @@ TEST_F(AllocationDomainTest, NHWC2d_To_NHWC2d_cacheFork) {
       t0_wrong_format.as_strided({n, c, h, w}, {h * w * c, 1, w * c, c});
 
   KernelExecutor ke;
-  ke.compile(fusion_ptr.get(), {t0});
+  ke.compileFusion(&fusion, {t0});
 
   EXPECT_THAT(
       [&]() { ke.run({t0_wrong_format}); },
@@ -1038,30 +1021,29 @@ TEST_F(AllocationDomainTest, NHWC2d_To_NHWC2d_cacheFork) {
 }
 
 TEST_F(AllocationDomainTest, VectorizationIssue902) {
-  auto fusion_ptr = std::make_unique<Fusion>();
-  auto& fusion = *fusion_ptr;
-  FusionGuard fg(fusion_ptr.get());
+  auto fusion = std::make_unique<Fusion>();
+  FusionGuard fg(fusion.get());
 
   const std::vector<int64_t> shape({16, 16, 512, 64});
 
   auto tv0 = makeContigTensor(4);
-  fusion.addInput(tv0);
+  fusion->addInput(tv0);
 
   auto tv1 = set(tv0);
-  fusion.addOutput(tv1);
+  fusion->addOutput(tv1);
 
-  std::vector<nvfuser::IterDomain*> aloc_domain;
-  aloc_domain.push_back(tv1->axis(0));
-  aloc_domain.push_back(tv1->axis(2));
-  aloc_domain.push_back(tv1->axis(3));
-  aloc_domain.push_back(tv1->axis(1));
-  tv1->setAllocationDomain(aloc_domain, true);
+  std::vector<nvfuser::IterDomain*> alloc_domain;
+  alloc_domain.push_back(tv1->axis(0));
+  alloc_domain.push_back(tv1->axis(2));
+  alloc_domain.push_back(tv1->axis(3));
+  alloc_domain.push_back(tv1->axis(1));
+  tv1->setAllocationDomain(alloc_domain, true);
 
   auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
   auto t0 = at::randn(shape, options);
   std::vector<c10::IValue> aten_inputs({t0});
 
-  FusionExecutorCache executor_cache(std::move(fusion_ptr));
+  FusionExecutorCache executor_cache(std::move(fusion));
   auto cg_outputs = executor_cache.runFusionWithInputs(aten_inputs);
 
   ASSERT_TRUE(cg_outputs[0].equal(t0));
@@ -1101,9 +1083,8 @@ TEST_F(AllocationDomainTest, TransposeMatrix) {
 }
 
 TEST_F(AllocationDomainTest, ContiguityIssue1021) {
-  std::unique_ptr<Fusion> fusion_ptr = std::make_unique<Fusion>();
-  Fusion* fusion = fusion_ptr.get();
-  FusionGuard fg(fusion);
+  auto fusion = std::make_unique<Fusion>();
+  FusionGuard fg(fusion.get());
 
   auto tv0 = TensorViewBuilder()
                  .ndims(2)
@@ -1119,17 +1100,16 @@ TEST_F(AllocationDomainTest, ContiguityIssue1021) {
 
   auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
   at::Tensor t0 = at::randn({8, 8}, options).as_strided({4, 8}, {1, 8});
-  FusionExecutorCache executor_cache(std::move(fusion_ptr));
+  FusionExecutorCache executor_cache(std::move(fusion));
   auto outputs = executor_cache.runFusionWithInputs({t0});
 
   auto t1 = t0.add(5.0);
-  testValidate(fusion, outputs, {t0}, __LINE__, __FILE__);
+  testValidate(executor_cache.fusion(), outputs, {t0}, __LINE__, __FILE__);
 }
 
 TEST_F(AllocationDomainTest, ContiguityForBroadcast) {
-  std::unique_ptr<Fusion> fusion_ptr = std::make_unique<Fusion>();
-  Fusion* fusion = fusion_ptr.get();
-  FusionGuard fg(fusion);
+  auto fusion = std::make_unique<Fusion>();
+  FusionGuard fg(fusion.get());
 
   auto tv0 = TensorViewBuilder()
                  .ndims(2)
@@ -1145,17 +1125,16 @@ TEST_F(AllocationDomainTest, ContiguityForBroadcast) {
 
   auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
   at::Tensor t0 = at::randn({1, 1}, options).as_strided({1, 1}, {0, 3});
-  FusionExecutorCache executor_cache(std::move(fusion_ptr));
+  FusionExecutorCache executor_cache(std::move(fusion));
   auto outputs = executor_cache.runFusionWithInputs({t0});
 
   auto t1 = t0.add(5.0);
-  testValidate(fusion, outputs, {t0}, __LINE__, __FILE__);
+  testValidate(executor_cache.fusion(), outputs, {t0}, __LINE__, __FILE__);
 }
 
 TEST_F(AllocationDomainTest, ContiguityForExplicitBroadcast) {
-  std::unique_ptr<Fusion> fusion_ptr = std::make_unique<Fusion>();
-  Fusion* fusion = fusion_ptr.get();
-  FusionGuard fg(fusion);
+  auto fusion = std::make_unique<Fusion>();
+  FusionGuard fg(fusion.get());
 
   auto tv0 = TensorViewBuilder()
                  .ndims(3)
@@ -1172,11 +1151,11 @@ TEST_F(AllocationDomainTest, ContiguityForExplicitBroadcast) {
 
   auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
   at::Tensor t0 = at::randn({4, 8}, options).as_strided({3, 8, 4}, {0, 1, 8});
-  FusionExecutorCache executor_cache(std::move(fusion_ptr));
+  FusionExecutorCache executor_cache(std::move(fusion));
   auto outputs = executor_cache.runFusionWithInputs({t0});
 
   auto t1 = t0.add(5.0);
-  testValidate(fusion, outputs, {t0}, __LINE__, __FILE__);
+  testValidate(executor_cache.fusion(), outputs, {t0}, __LINE__, __FILE__);
 }
 
 // Test that allocation domain can be used to vectorize overlapping tensors,
@@ -1189,8 +1168,7 @@ TEST_F(AllocationDomainTest, ContiguityForExplicitBroadcast) {
 // automatically supports all kinds of use cases, even those that we don't have
 // an active plan to support on).
 TEST_F(AllocationDomainTest, VectorizeOverlappingTensor) {
-  auto fusion_ptr = std::make_unique<Fusion>();
-  Fusion& fusion = *fusion_ptr.get();
+  Fusion fusion;
   FusionGuard fg(&fusion);
 
   auto tv0 = makeContigTensor(3);
@@ -1226,8 +1204,8 @@ TEST_F(AllocationDomainTest, VectorizeOverlappingTensor) {
       at::randn({4 * 5 * 7}).cuda().as_strided({4, 5, 7}, {7, 4, 1});
 
   KernelExecutor ke;
-  ke.compile(fusion_ptr.get(), {t0});
-  auto cg_outputs = ke.run({t0});
+  ke.compileFusion(&fusion, {t0});
+  auto cg_outputs = ke.runFusion({t0});
 
   testValidate(&fusion, cg_outputs, {t0}, __LINE__, __FILE__);
 }