diff --git a/csrc/device_lower/validation.cpp b/csrc/device_lower/validation.cpp
index 1afa860b0d2..5d040527646 100644
--- a/csrc/device_lower/validation.cpp
+++ b/csrc/device_lower/validation.cpp
@@ -554,9 +554,13 @@ class VectorizeValidator : public OptInDispatch {
   auto ldst = dynamic_cast<LoadStoreOp*>(tv->definition());
   bool is_ldmatrix_trans =
       ldst != nullptr && mma_utils::isLdMatrixTranspose(ldst);
-  if (!is_ldmatrix_trans) {
+  if (!is_ldmatrix_trans && name.compare("consumer") != 0) {
     // ldmatrix.trans is a hardware transpose instruction that can do
     // "vectorized" read from discontiguous memory
+    // We don't think allocation domain of consumer is used in allocation. We
+    // skip it in validation here. Note that this assert was hit for
+    // vectorized pad, because we do not propagate allocation domain for
+    // PadOp. See: https://github.com/NVIDIA/Fuser/pull/3439
     NVF_CHECK(
         last_alloc_dim == vec_alloc_id,
         "Vectorized dim for ",
diff --git a/tests/cpp/test_pointwise.cpp b/tests/cpp/test_pointwise.cpp
index c4684adbb6e..bb1c6bd7bfb 100644
--- a/tests/cpp/test_pointwise.cpp
+++ b/tests/cpp/test_pointwise.cpp
@@ -730,4 +730,47 @@ INSTANTIATE_TEST_SUITE_P(
     ss << "_outer_unroll_" << std::get<2>(info.param);
     return sanitizeTestName(ss.str());
   });
+
+TEST_F(PointwiseTest, VectorizePadLoweringPermuted) {
+  // Pointwise scheduler applies permutation to restore contiguous memory
+  // access on reference TV. Vectorization validation requires vectorized
+  // operations to preserve the allocation domain of their inputs. This
+  // test checks that PadOp propagates the allocation domain properly.
+  auto fusion_ptr = std::make_unique<Fusion>();
+  auto& fusion = *fusion_ptr;
+  FusionGuard fg(fusion_ptr.get());
+
+  // input is permuted
+  auto tv0 = TensorViewBuilder()
+                 .shape({1024, 1024})
+                 .dtype(DataType::Float)
+                 .contiguity(true)
+                 .strideOrder({0, 1})
+                 .build();
+  fusion.addInput(tv0);
+  auto tv1 = pad(tv0, {IrBuilder::create<Val>(4L), IrBuilder::create<Val>(4L)});
+  auto tv2 = relu(tv1);
+  fusion.addOutput(tv2);
+  // output is permuted
+  tv2->setAllocationDomain({tv2->axis(1), tv2->axis(0)}, true);
+
+  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+  auto t0 =
+      at::randn({1024 * 1024}, options).as_strided({1024, 1024}, {1, 1024});
+  std::vector<c10::IValue> aten_inputs({t0});
+
+  auto cg_outputs =
+      scheduleAndRun(&fusion, SchedulerType::PointWise, aten_inputs).outputs;
+  // check that we vectorize 4
+  bool found_vectorize = false;
+  for (auto id : fusion.outputs().at(0)->as<TensorView>()->getLoopDomain()) {
+    if (id->getParallelType() == ParallelType::Vectorize) {
+      EXPECT_EQ(id->extent()->evaluate(), 4);
+      found_vectorize = true;
+      break;
+    }
+  }
+  EXPECT_TRUE(found_vectorize);
+  testValidate(&fusion, cg_outputs, aten_inputs, __LINE__, __FILE__);
+}
 } // namespace nvfuser