avoid treating pointer to bool as bool when handling kir::asm in codegen (#3274)

**Issue:** #3273
The original failure happened in the test
`DistributedTransformerTest.Backward/__bfloat` when shared memory
persistence is used with async copy (after #3217, not yet merged).
The cause is that a pointer to bool was treated as bool when handling
kir::asm in codegen.

**Fix:** If the value is a pointer, return the pointer type, not the type it points to.

**Results:** Added a unit test; the error is fixed.
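For readers skimming the diff below, here is a minimal, standalone sketch of the distinction the fix draws. The types in this sketch (`Val`, `TensorIndex`, `DataType::SMemAddress`) are simplified stand-ins for the nvFuser classes of the same names, not the real API; the actual fix is the `getTypeOrIndexType` lambda added to `csrc/codegen.cpp` below.

```cpp
#include <cassert>

// Stand-in dtype enum; SMemAddress plays the role of a pointer type.
enum class DataType { Bool, Int32, SMemAddress };
bool isPointerType(DataType t) { return t == DataType::SMemAddress; }

// Stand-in for kir::Val.
struct Val {
  explicit Val(DataType t) : dtype_(t) {}
  virtual ~Val() = default;
  DataType dtype() const { return dtype_; }
 private:
  DataType dtype_;
};

// Stand-in for kir::TensorIndex: an element type plus an address (index) expression.
struct TensorIndex : Val {
  TensorIndex(DataType element_type, Val* index) : Val(element_type), index_(index) {}
  Val* index() const { return index_; }
 private:
  Val* index_;
};

// Mirrors the helper added in this commit: if the operand is a TensorIndex
// whose index is a pointer, report the pointer type, not the element type.
DataType getTypeOrIndexType(Val* value) {
  if (auto* ti = dynamic_cast<TensorIndex*>(value)) {
    if (isPointerType(ti->index()->dtype())) {
      return ti->index()->dtype();
    }
  }
  return value->dtype();
}

int main() {
  Val smem_address(DataType::SMemAddress);
  TensorIndex bool_tensor_operand(DataType::Bool, &smem_address);

  // Old check: dtype() == Bool, so the pointer operand was wrongly treated
  // as a boolean predicate in the generated PTX.
  assert(bool_tensor_operand.dtype() == DataType::Bool);

  // New check: the operand is classified by its pointer index type instead.
  assert(getTypeOrIndexType(&bool_tensor_operand) == DataType::SMemAddress);
  return 0;
}
```

With the old check (`val->dtype() == DataType::Bool`), the shared-memory address operand of a bool tensor would be given a `setp` predicate in the generated PTX; classifying the operand by its index's pointer type avoids that.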
liqiangxl authored Oct 26, 2024
1 parent 3118fdf commit ff75845
Showing 2 changed files with 75 additions and 8 deletions.
28 changes: 20 additions & 8 deletions csrc/codegen.cpp
@@ -3077,6 +3077,15 @@ class CudaKernelGenerator : private kir::ConstIrVisitor {
indent();
}

auto getTypeOrIndexType = [](Val* value) {
if (auto ti = dynamic_cast<kir::TensorIndex*>(value)) {
if (isPointerType(ti->index()->dtype())) {
return ti->index()->dtype();
}
}
return value->dtype();
};

if (asm_->hasBooleanInput()) {
code_ << "\"{\\n\"\n";
int64_t boolean_counter = 0;
@@ -3085,14 +3094,16 @@ class CudaKernelGenerator : private kir::ConstIrVisitor {
&asm_->outputs(), &asm_->inputs()};
for (const auto* io : outputs_and_inputs) {
for (auto val : *io) {
if (val->dtype() == DataType::Bool) {
// don't treat pointer to bool as bool
auto val_dtype = getTypeOrIndexType(val);
if (val_dtype == DataType::Bool) {
indent() << "\" .reg .pred p" << boolean_counter << "; \\n\"\n";
indent() << "\" setp.ne.b32 p" << boolean_counter << ", %"
<< counter << ", 0;\\n\"\n";
boolean_counter++;
}
if (std::holds_alternative<ArrayType>(val->dtype().type)) {
counter += (int64_t)std::get<ArrayType>(val->dtype().type).size;
if (std::holds_alternative<ArrayType>(val_dtype.type)) {
counter += (int64_t)std::get<ArrayType>(val_dtype.type).size;
} else {
counter++;
}
@@ -3139,9 +3150,10 @@ class CudaKernelGenerator : private kir::ConstIrVisitor {
next_line();
}
first = false;
if (std::holds_alternative<ArrayType>(register_->dtype().type)) {
for (auto i : c10::irange(
std::get<ArrayType>(register_->dtype().type).size)) {
auto reg_dtype = getTypeOrIndexType(register_);
if (std::holds_alternative<ArrayType>(reg_dtype.type)) {
for (auto i :
c10::irange(std::get<ArrayType>(reg_dtype.type).size)) {
if (i > 0) {
next_line();
}
@@ -3151,11 +3163,11 @@ class CudaKernelGenerator : private kir::ConstIrVisitor {
}
} else {
code_ << "\"" << constraint << "\"(";
if (register_->dtype() == DataType::Bool) {
if (reg_dtype == DataType::Bool) {
code_ << "(uint32_t)(";
}
code_ << gen(register_);
if (register_->dtype() == DataType::Bool) {
if (reg_dtype == DataType::Bool) {
code_ << ")";
}
code_ << ")";
55 changes: 55 additions & 0 deletions tests/cpp/test_gpu3.cpp
@@ -8819,6 +8819,61 @@ TEST_F(NVFuserTest, RAWSync) {
"Producer is required to be in Global or Shared Memory based on parallelization strategy. RAW flags: (threadIdx.x)")));
}

// Test `DistributedTransformerTest.Backward/__bfloat` has a bool-type tensor;
// if it is copied to shared memory using async copy, it triggers the bug
// described in https://github.com/NVIDIA/Fuser/issues/3273.
// This test checks that a pointer to bool is not treated as data type bool
// when generating PTX code for kir::Asm, e.g. async copy.
TEST_F(NVFuserTest, CpAsyncDataTypeBool) {
NVFUSER_TEST_CUDA_ARCH_GUARD(8, 0);
Fusion fusion;
FusionGuard fg(&fusion);
auto dtype = DataType::Bool;
int m = 33, n = 128;
auto tv0 = makeContigConcreteTensor({m, n}, dtype);
fusion.addInput(tv0);
auto tv1 = set(tv0);
tv1->setMemoryType(MemoryType::Shared);
tv1->definition()->as<LoadStoreOp>()->setOpType(LoadStoreOpType::CpAsync);
tv1->definition()->as<LoadStoreOp>()->setCacheOp(CacheOp::Unspecified);
auto tv2 = castOp(DataType::Int32, tv1);
fusion.addOutput(tv2);

for (auto tv : {tv0, tv1, tv2}) {
tv->split(1, 4);
}
for (auto tv : {tv0, tv1, tv2}) {
tv->axis(0)->parallelize(ParallelType::BIDx);
tv->axis(1)->parallelize(ParallelType::TIDx);
}
tv1->axis(2)->parallelize(ParallelType::Vectorize);

inlineMost();

// randn doesn't support bool, ones is used instead
auto at_dtype = data_type_to_aten(dtype);
auto options = at::TensorOptions().dtype(at_dtype).device(at::kCUDA, 0);
at::Tensor t0 = at::ones({m, n}, options);

// Expected asm code is:
// asm volatile(
// "{\n"
// " .reg .pred p0; \n"
// " setp.ne.b32 p0, %3, 0;\n"
// " cp.async.ca.shared.global [%0], [%1], %2, p0;\n"
// "}\n"
// :
// :"r"((uint32_t)((toSmem(T1) + i0))),
// "l"(((T0.data + i0) + i1)),
// "n"(4LL),
// "r"((uint32_t)((!b3)))
// );
// If not lowered correctly, compilation of the generated kernel fails.
FusionExecutor fe;
fe.compileFusion(&fusion, {t0});
auto cg_outputs = fe.runFusion({t0});
testValidate(&fusion, cg_outputs, {t0}, __LINE__, __FILE__);
}
// Test file size should be up to 10K LoC. Create a new file for more tests.

} // namespace nvfuser
