Skip to content

Commit

Permalink
Add back the unit tests from #2405.
Browse files Browse the repository at this point in the history
  • Loading branch information
wujingyue committed Jul 26, 2024
1 parent 8bb3067 commit f5d57fe
Show file tree
Hide file tree
Showing 2 changed files with 97 additions and 14 deletions.
52 changes: 39 additions & 13 deletions tests/cpp/test_alias.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1269,24 +1269,15 @@ TEST_F(AliasTest, ReuseBuffer_AliasAcrossSegments) {
at::Tensor t1 = at::randn({65}, options);
at::Tensor t2 = at::randn({128, 65}, options);

FusionExecutorCache executor_cache(std::move(fusion));
FusionExecutorCache fec(std::move(fusion));
// Make a copy of `t0` because `t0` will be in-place updated.
at::Tensor original_t0 = t0.clone();
std::vector<at::Tensor> outputs =
executor_cache.runFusionWithInputs({t0, t1, t2});
std::vector<at::Tensor> outputs = fec.runFusionWithInputs({t0, t1, t2});
testValidate(
executor_cache.fusion(),
outputs,
{original_t0, t1, t2},
__LINE__,
__FILE__);
fec.fusion(), outputs, {original_t0, t1, t2}, __LINE__, __FILE__);

EXPECT_EQ(
executor_cache.getMostRecentKernelRuntime()
->fusionSegments()
->groups()
.size(),
2)
fec.getMostRecentKernelRuntime()->fusionSegments()->groups().size(), 2)
<< "segmentation didn't happen as expected";

auto t3 = original_t0.add(1.0);
Expand Down Expand Up @@ -1635,4 +1626,39 @@ TEST_F(AliasTest, QKVSplitBackprop) {
EXPECT_TRUE(out_tensors[2].is_alias_of(out_tensors[1]));
}

// Regression test for issue #2375. The fusion ends with a reshape back to
// the flat {N, C, H, W} layout; the segmenter is expected to peel that
// trailing op into a NoOp segment and schedule the normalization itself as
// a single inner-persistent kernel.
TEST_F(AliasTest, Bookend_Issue2375) {
  auto fusion = std::make_unique<Fusion>();
  FusionGuard fg(fusion.get());

  // Grouped layout {N, G, C/G, H, W} for the input, flat {N, C, H, W} for
  // the output.
  constexpr int64_t n = 2, c = 128, h = 16, w = 16, g = 32;
  const DataType dtype = DataType::Half;
  const std::vector<int64_t> input_shape = {n, g, c / g, h, w};
  const std::vector<int64_t> output_shape = {n, c, h, w};

  auto in_half = makeContigTensor(input_shape.size(), dtype);
  fusion->addInput(in_half);
  auto in_float = castOp(DataType::Float, in_half);
  auto staged = set(in_float);
  // Reduce the three innermost dims of each group, then normalize by the
  // broadcast sum.
  auto group_sum = sum(staged, {-1, -2, -3});
  auto group_sum_bcast = broadcast(group_sum, {false, false, true, true, true});
  auto normalized = div(staged, group_sum_bcast);
  auto out_half = castOp(dtype, normalized);
  // The trailing "bookend" reshape back to the flat layout.
  auto out = reshape(out_half, input_shape, output_shape);
  fusion->addOutput(out);

  auto options =
      at::TensorOptions().dtype(data_type_to_aten(dtype)).device(at::kCUDA, 0);
  auto t0 = at::randn(input_shape, options);

  FusionExecutorCache executor_cache(std::move(fusion));
  auto out_tensors = executor_cache.runFusionWithInputs({t0});
  testValidate(executor_cache.fusion(), out_tensors, {t0}, __LINE__, __FILE__);

  // Exactly two segments: the no-op reshape and the persistent
  // normalization kernel.
  EXPECT_THAT(
      executor_cache.getMostRecentKernelRuntime()->fusionSegments()->groups(),
      UnorderedElementsAre(
          HeuristicIs(ScheduleHeuristic::NoOp),
          HeuristicIs(ScheduleHeuristic::InnerPersistent)));
}

} // namespace nvfuser
59 changes: 58 additions & 1 deletion tests/cpp/test_gpu_view.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,7 @@

namespace nvfuser {

using namespace at::indexing;
using testing::UnorderedElementsAre;

using GpuViewTest = NVFuserTest;

Expand Down Expand Up @@ -2380,6 +2380,63 @@ TEST_F(GpuViewTest, SplitMergePointwiseSplitMerge) {
testValidate(executor_cache.fusion(), {cg_outputs}, {t0}, __LINE__, __FILE__);
}

// Group norm written the "original" way: a reshape into groups, a
// normalization over each group, and a reshape back, wrapped by pointwise
// ops (cast in, scale/bias/cast out). Currently segmented into 2 kernels:
// pointwise and reduction.
TEST_F(GpuViewTest, GroupNormOriginal) {
  auto fusion = std::make_unique<Fusion>();
  FusionGuard fg(fusion.get());
  const int64_t N = 2, C = 128, H = 16, W = 16, G = 32;
  const std::vector<int64_t> input_shape = {N, C, H, W};
  const std::vector<int64_t> group_shape = {N, G, C / G, H, W};
  // Weight and bias are per-channel.
  const std::vector<int64_t> input_shape_wb = {C};
  const DataType dtype = DataType::Half;
  auto tv0 = makeContigTensor(input_shape.size(), dtype);
  auto tv1 = makeContigTensor(input_shape_wb.size(), DataType::Float);
  auto tv2 = makeContigTensor(input_shape_wb.size(), DataType::Float);
  fusion->addInput(tv0);
  fusion->addInput(tv1);
  fusion->addInput(tv2);
  // pointwise ops, e.g. cast
  auto tv3 = castOp(DataType::Float, tv0);
  // reshape from {N, C, H, W} to {N, G, C / G, H, W}
  auto tv4 = reshape(tv3, input_shape, group_shape);
  // normalization: reduce the {C / G, H, W} dims of each group and divide
  // by the broadcast sum
  auto tv5 = sum(tv4, {-1, -2, -3});
  auto tv6 = broadcast(tv5, {false, false, true, true, true});
  auto tv7 = div(tv4, tv6);
  // reshape back to {N, C, H, W}
  auto tv8 = reshape(tv7, group_shape, input_shape);
  // pointwise ops, e.g. scale, bias, cast
  auto tv9 = broadcast(tv1, {true, false, true, true});
  auto tv10 = broadcast(tv2, {true, false, true, true});
  auto tv11 = mul(tv8, tv9);
  auto tv12 = add(tv11, tv10);
  auto tv13 = castOp(dtype, tv12);
  fusion->addOutput(tv13);

  auto options =
      at::TensorOptions().dtype(data_type_to_aten(dtype)).device(at::kCUDA, 0);
  auto options_wb = at::TensorOptions()
                        .dtype(data_type_to_aten(DataType::Float))
                        .device(at::kCUDA, 0);
  auto t0 = at::randn(input_shape, options);
  auto tw = at::randn(input_shape_wb, options_wb);
  auto tb = at::randn(input_shape_wb, options_wb);

  FusionExecutorCache executor_cache(std::move(fusion));
  auto cg_outputs = executor_cache.runFusionWithInputs({t0, tw, tb});
  // should expect 1 after adding a pre-segment pass to move reshape to input
  // and output.
  EXPECT_THAT(
      executor_cache.getMostRecentKernelRuntime()->fusionSegments()->groups(),
      UnorderedElementsAre(
          HeuristicIs(ScheduleHeuristic::PointWise),
          HeuristicIs(ScheduleHeuristic::Reduction)));

  testValidate(
      executor_cache.fusion(), cg_outputs, {t0, tw, tb}, __LINE__, __FILE__);
}

using ReductionAxes = std::vector<int64_t>;
class ViewReductionTest : public NVFuserFixtureParamTest<ReductionAxes> {};

Expand Down

0 comments on commit f5d57fe

Please sign in to comment.