avoid treating pointer to bool as bool when handling kir::asm in codegen (#3274)

**Issue:** #3273
The original failure happened in the test
`DistributedTransformerTest.Backward/__bfloat` when shared memory
persistence is used with async copy (after #3217, not yet merged).
The cause is that a pointer to bool was treated as bool when handling
kir::asm in codegen.

**Fix:** If the value is a pointer, return the pointer type, not the type it points to.

**Results:** Added a unit test; the error is fixed.
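For readers skimming the diff below, here is a minimal, standalone sketch of the distinction the fix draws. The types in this sketch (`Val`, `TensorIndex`, `DataType::SMemAddress`) are simplified stand-ins for the nvFuser classes of the same names, not the real API; the actual fix is the `getTypeOrIndexType` lambda added to `csrc/codegen.cpp` below.

```cpp
#include <cassert>

// Stand-in dtype enum; SMemAddress plays the role of a pointer type.
enum class DataType { Bool, Int32, SMemAddress };
bool isPointerType(DataType t) { return t == DataType::SMemAddress; }

// Stand-in for kir::Val.
struct Val {
  explicit Val(DataType t) : dtype_(t) {}
  virtual ~Val() = default;
  DataType dtype() const { return dtype_; }
 private:
  DataType dtype_;
};

// Stand-in for kir::TensorIndex: an element type plus an address (index) expression.
struct TensorIndex : Val {
  TensorIndex(DataType element_type, Val* index) : Val(element_type), index_(index) {}
  Val* index() const { return index_; }
 private:
  Val* index_;
};

// Mirrors the helper added in this commit: if the operand is a TensorIndex
// whose index is a pointer, report the pointer type, not the element type.
DataType getTypeOrIndexType(Val* value) {
  if (auto* ti = dynamic_cast<TensorIndex*>(value)) {
    if (isPointerType(ti->index()->dtype())) {
      return ti->index()->dtype();
    }
  }
  return value->dtype();
}

int main() {
  Val smem_address(DataType::SMemAddress);
  TensorIndex bool_tensor_operand(DataType::Bool, &smem_address);

  // Old check: dtype() == Bool, so the pointer operand was wrongly treated
  // as a boolean predicate in the generated PTX.
  assert(bool_tensor_operand.dtype() == DataType::Bool);

  // New check: the operand is classified by its pointer index type instead.
  assert(getTypeOrIndexType(&bool_tensor_operand) == DataType::SMemAddress);
  return 0;
}
```

With the old check (`val->dtype() == DataType::Bool`), the shared-memory address operand of a bool tensor would be given a `setp` predicate in the generated PTX; classifying the operand by its index's pointer type avoids that.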
liqiangxl authored Oct 26, 2024
1 parent 3118fdf commit ff75845
Showing 2 changed files with 75 additions and 8 deletions.
28 changes: 20 additions & 8 deletions csrc/codegen.cpp
@@ -3077,6 +3077,15 @@ class CudaKernelGenerator : private kir::ConstIrVisitor {
indent();
}

auto getTypeOrIndexType = [](Val* value) {
if (auto ti = dynamic_cast<kir::TensorIndex*>(value)) {
if (isPointerType(ti->index()->dtype())) {
return ti->index()->dtype();
}
}
return value->dtype();
};

if (asm_->hasBooleanInput()) {
code_ << "\"{\\n\"\n";
int64_t boolean_counter = 0;
@@ -3085,14 +3094,16 @@ class CudaKernelGenerator : private kir::ConstIrVisitor {
&asm_->outputs(), &asm_->inputs()};
for (const auto* io : outputs_and_inputs) {
for (auto val : *io) {
if (val->dtype() == DataType::Bool) {
// don't treat pointer to bool as bool
auto val_dtype = getTypeOrIndexType(val);
if (val_dtype == DataType::Bool) {
indent() << "\" .reg .pred p" << boolean_counter << "; \\n\"\n";
indent() << "\" setp.ne.b32 p" << boolean_counter << ", %"
<< counter << ", 0;\\n\"\n";
boolean_counter++;
}
if (std::holds_alternative<ArrayType>(val->dtype().type)) {
counter += (int64_t)std::get<ArrayType>(val->dtype().type).size;
if (std::holds_alternative<ArrayType>(val_dtype.type)) {
counter += (int64_t)std::get<ArrayType>(val_dtype.type).size;
} else {
counter++;
}
@@ -3139,9 +3150,10 @@ class CudaKernelGenerator : private kir::ConstIrVisitor {
next_line();
}
first = false;
if (std::holds_alternative<ArrayType>(register_->dtype().type)) {
for (auto i : c10::irange(
std::get<ArrayType>(register_->dtype().type).size)) {
auto reg_dtype = getTypeOrIndexType(register_);
if (std::holds_alternative<ArrayType>(reg_dtype.type)) {
for (auto i :
c10::irange(std::get<ArrayType>(reg_dtype.type).size)) {
if (i > 0) {
next_line();
}
@@ -3151,11 +3163,11 @@ class CudaKernelGenerator : private kir::ConstIrVisitor {
}
} else {
code_ << "\"" << constraint << "\"(";
if (register_->dtype() == DataType::Bool) {
if (reg_dtype == DataType::Bool) {
code_ << "(uint32_t)(";
}
code_ << gen(register_);
if (register_->dtype() == DataType::Bool) {
if (reg_dtype == DataType::Bool) {
code_ << ")";
}
code_ << ")";
55 changes: 55 additions & 0 deletions tests/cpp/test_gpu3.cpp
@@ -8819,6 +8819,61 @@ TEST_F(NVFuserTest, RAWSync) {
"Producer is required to be in Global or Shared Memory based on parallelization strategy. RAW flags: (threadIdx.x)")));
}

// Test `DistributedTransformerTest.Backward/__bfloat` has a bool-type tensor;
// if it is copied to shared memory using async copy, it triggers the bug
// described in https://github.com/NVIDIA/Fuser/issues/3273.
// This test checks that a pointer to bool is not treated as data type bool
// when generating PTX code for kir::Asm, e.g. async copy.
TEST_F(NVFuserTest, CpAsyncDataTypeBool) {
NVFUSER_TEST_CUDA_ARCH_GUARD(8, 0);
Fusion fusion;
FusionGuard fg(&fusion);
auto dtype = DataType::Bool;
int m = 33, n = 128;
auto tv0 = makeContigConcreteTensor({m, n}, dtype);
fusion.addInput(tv0);
auto tv1 = set(tv0);
tv1->setMemoryType(MemoryType::Shared);
tv1->definition()->as<LoadStoreOp>()->setOpType(LoadStoreOpType::CpAsync);
tv1->definition()->as<LoadStoreOp>()->setCacheOp(CacheOp::Unspecified);
auto tv2 = castOp(DataType::Int32, tv1);
fusion.addOutput(tv2);

for (auto tv : {tv0, tv1, tv2}) {
tv->split(1, 4);
}
for (auto tv : {tv0, tv1, tv2}) {
tv->axis(0)->parallelize(ParallelType::BIDx);
tv->axis(1)->parallelize(ParallelType::TIDx);
}
tv1->axis(2)->parallelize(ParallelType::Vectorize);

inlineMost();

// randn doesn't support bool, ones is used instead
auto at_dtype = data_type_to_aten(dtype);
auto options = at::TensorOptions().dtype(at_dtype).device(at::kCUDA, 0);
at::Tensor t0 = at::ones({m, n}, options);

// Expected asm code is:
// asm volatile(
// "{\n"
// " .reg .pred p0; \n"
// " setp.ne.b32 p0, %3, 0;\n"
// " cp.async.ca.shared.global [%0], [%1], %2, p0;\n"
// "}\n"
// :
// :"r"((uint32_t)((toSmem(T1) + i0))),
// "l"(((T0.data + i0) + i1)),
// "n"(4LL),
// "r"((uint32_t)((!b3)))
// );
// If not lowered correctly, compilation of the generated kernel fails.
FusionExecutor fe;
fe.compileFusion(&fusion, {t0});
auto cg_outputs = fe.runFusion({t0});
testValidate(&fusion, cg_outputs, {t0}, __LINE__, __FILE__);
}
// Test file size should be up to 10K LoC. Create a new file for more tests.

} // namespace nvfuser
