diff --git a/csrc/ir/interface_nodes.h b/csrc/ir/interface_nodes.h
index f32211a166d..260af932cda 100644
--- a/csrc/ir/interface_nodes.h
+++ b/csrc/ir/interface_nodes.h
@@ -482,10 +482,16 @@ class NVF_API TensorView : public Val {
   //!
   //! @param op_type: memory operator to use for the inserted op between
   //!   the data tensor and the cache tensor
+  //! @param cache_op: cache operator, see enum class CacheOp
+  //! @param propagate_allocation_domain: replay allocation domain on cached
+  //!   load
+  //! @param cached_uses: if empty, cache all uses; otherwise, only try to cache
+  //!   uses in cached_uses.
   TensorView* cacheAfter(
       LoadStoreOpType op_type = LoadStoreOpType::Set,
       CacheOp cache_op = CacheOp::Unspecified,
-      bool propagate_allocation_domain = true);
+      bool propagate_allocation_domain = true,
+      std::vector<Expr*> cached_uses = {});
 
   // For a fusion output with other uses, we want to avoid writing to global
   // memory and then reading the output again. We write to global memory
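The new cached_uses parameter only redirects the listed use expressions to the cache; passing an empty vector (the default) keeps the old behavior of caching every use. A minimal caller-side sketch, assuming a hypothetical input tv whose pad consumers should keep reading global memory (illustration only, not code from this change):

    std::vector<Expr*> uses_to_cache;
    for (Expr* use : tv->uses()) {
      if (!use->isA<PadOp>()) { // let pads keep reading global memory
        uses_to_cache.push_back(use);
      }
    }
    if (!uses_to_cache.empty()) {
      // Only the exprs listed in cached_uses are rewired to read the cache;
      // any use left out (here, the PadOp) still consumes tv directly.
      tv->cacheAfter(
          LoadStoreOpType::Set,
          CacheOp::Unspecified,
          /*propagate_allocation_domain=*/true,
          /*cached_uses=*/uses_to_cache);
    }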
diff --git a/csrc/preseg_passes/move_pad.cpp b/csrc/preseg_passes/move_pad.cpp
index bb277e62566..e2773ac6ea2 100644
--- a/csrc/preseg_passes/move_pad.cpp
+++ b/csrc/preseg_passes/move_pad.cpp
@@ -287,7 +287,10 @@ TensorView* replayConcretePad(
 
   auto* new_out = IrBuilder::create<TensorView>(
       IrBuilder::create<TensorDomain>(
-          merged_root_ids, merged_logical_ids, merged_logical_ids),
+          merged_root_ids,
+          merged_logical_ids,
+          merged_logical_ids,
+          TensorDomain::getContiguityFilledWith(merged_logical_ids, true)),
       pad_tv->getDataType().value());
   IrBuilder::create<PadOp>(
       new_out,
diff --git a/csrc/scheduler/utils.cpp b/csrc/scheduler/utils.cpp
index 260f5813be7..667ea7f40f2 100644
--- a/csrc/scheduler/utils.cpp
+++ b/csrc/scheduler/utils.cpp
@@ -1198,12 +1198,36 @@ std::vector<TensorView*> cacheInputs(Fusion* fusion, bool unroll) {
   for (auto tv : in_tvs) {
     if (tv->uses().empty() || ir_utils::isTorchGatherLookupTv(tv) ||
         ir_utils::isIndexSelectLookupTv(tv) ||
-        ir_utils::isTvUsedByOpsOfType<SliceOp, SelectOp, PadOp>(tv)) {
+        ir_utils::isTvUsedByOpsOfType<SliceOp, SelectOp>(tv)) {
       // Right now, tensors that are input to the slice, select, and pad ops
       // can't be cached as they must be in global memory.
       continue;
     }
-    auto cached_tv = tv->cacheAfter();
+
+    // TODO: might need to reverse this when scheduler handles pad directly
+    // Do not insert a cache for pad as vectorization needs to be
+    // done directly.
+    //
+    // Note that this means that if an input is padded and also is
+    // used without padding, it will be read twice, once for pad and
+    // once more for caching load. It would make sense to use the PTX
+    // caching load instructions.
+    std::vector<Expr*> cached_uses;
+    for (auto use : tv->uses()) {
+      if (!use->isA<PadOp>()) {
+        cached_uses.push_back(use);
+      }
+    }
+
+    if (cached_uses.empty()) {
+      continue;
+    }
+
+    auto cached_tv = tv->cacheAfter(
+        /*op_type=*/LoadStoreOpType::Set,
+        /*cache_op=*/CacheOp::Unspecified,
+        /*propagate_allocation_domain=*/true,
+        /*cached_uses=*/cached_uses);
     cached_inputs.emplace_back(cached_tv);
   }
   return cached_inputs;
@@ -1290,12 +1314,7 @@ IterDomain* projectIdToRoot(
     } else if (expr->isA<Resize>()) {
       auto resize = expr->as<Resize>();
       if (resize->out() == projected_id) {
-        // We do not allow vectorization with resize at this moment
-        if (vectorize_pass) {
-          projected_id = nullptr;
-        } else {
-          projected_id = resize->in();
-        }
+        projected_id = resize->in();
       }
     } else {
       NVF_THROW("Didn't recognize the iterdomain expression: ", expr);
     }
@@ -1350,12 +1369,7 @@ IterDomain* projectIdToRFactor(
     } else if (expr->isA<Resize>()) {
       auto resize = expr->as<Resize>();
       if (resize->in() == projected_id) {
-        // We do not allow vectorization wit resize at this moment
-        if (vectorize_pass) {
-          projected_id = nullptr;
-        } else {
-          projected_id = resize->out();
-        }
+        projected_id = resize->out();
       }
     } else {
       NVF_THROW("Didn't recognize the iterdomain expression: ", expr);
     }
@@ -1549,12 +1563,6 @@ std::vector<TensorView*> getInputsOutputsWithInnerDim(
   // scheduler prefer to use output instead of input as reference tensor.
   for (auto output_tv :
        ir_utils::filterByType<TensorView>(reference_tv->fusion()->outputs())) {
-    // At this moment, vectorization through resize is not
-    // supported. This is not required currently as we always insert
-    // cacheBefore, but just in case.
-    if (ir_utils::hasResizedRfactor(output_tv)) {
-      continue;
-    }
     if (hasInnerDim(output_tv, vectorizable_dims, vectorize_pass)) {
       vectorizable_tensors.push_back(output_tv);
     }
@@ -1569,19 +1577,11 @@ std::vector<TensorView*> getInputsOutputsWithInnerDim(
       continue;
     }
 
-    auto expr_resizes = [](Expr* e) -> bool {
-      return std::any_of(
-          e->outputs().begin(), e->outputs().end(), [](Val* out) -> bool {
-            if (auto* out_tv = dynamic_cast<TensorView*>(out)) {
-              return ir_utils::hasResizedRfactor(out_tv);
-            }
-            return false;
-          });
-    };
-
-    // At this moment, vectorization through resize is not supported
-    if (std::any_of(
-            input_tv->uses().begin(), input_tv->uses().end(), expr_resizes)) {
+    // Slice op is explicitly not enabled for vectorized load.
+    if (std::all_of(
+            input_tv->uses().begin(),
+            input_tv->uses().end(),
+            [](Expr* e) -> bool { return e->isA<SliceOp>(); })) {
       continue;
     }
 
@@ -2385,7 +2385,7 @@ bool revertUseOfInputCache(
 
 void prepareForMemoryTypePromotion(Fusion* fusion) {
   auto non_pwise_pairs = getNonPointwiseProducerConsumerPairs(fusion);
 
-  // Inserting a copy of each proucer. If a tensor shows up as a
+  // Inserting a copy of each producer. If a tensor shows up as a
   // producer for multiple consumers, only insert one
   // copy and share it with all the consumers.
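With this change, an input that feeds both a pad and a regular pointwise op gets exactly one cache, read only by the non-pad use. A hedged sketch of the resulting structure (hypothetical fusion; tensor names are illustrative):

    Fusion fusion;
    FusionGuard fg(&fusion);
    TensorView* tv0 = makeContigTensor(1);
    fusion.addInput(tv0);
    // tv0 feeds both a pad and an add
    TensorView* tv1 =
        pad(tv0, {IrBuilder::create<Val>(4L), IrBuilder::create<Val>(4L)});
    TensorView* tv2 = add(tv0, tv0);
    fusion.addOutput(tv1);
    fusion.addOutput(tv2);
    // After scheduler_utils::cacheInputs(&fusion, /*unroll=*/true), only the
    // add use reads the cache; the pad still loads tv0 from global memory,
    // which is the "read twice" case called out in the comment above:
    //   tv0 (global) -> PadOp ----------------------------> tv1
    //   tv0 (global) -> LoadStoreOp -> tv0_cache -> add ---> tv2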
diff --git a/csrc/scheduler/vectorize_helper.cpp b/csrc/scheduler/vectorize_helper.cpp
index 26fa824ad36..3dcad2b497d 100644
--- a/csrc/scheduler/vectorize_helper.cpp
+++ b/csrc/scheduler/vectorize_helper.cpp
@@ -55,6 +55,30 @@ Val* ContiguousInnerDimensionsMapper::isFullyProjected(IterDomain* id) {
       getProjectedExtent(id), commonOrConstExtent(ca_map_, id));
 }
 
+void ContiguousInnerDimensionsMapper::initializeResizeInfo(Fusion* fusion) {
+  auto exprs = fusion->exprs();
+  for (auto* pad_op : ir_utils::filterByType<PadOp>(exprs)) {
+    if (!pad_op->out()->isA<TensorView>()) {
+      continue;
+    }
+
+    auto* out_tv = pad_op->out()->as<TensorView>();
+
+    auto consumer_exprs = StmtSort::getExprsBetween(
+        {out_tv->getMaybeRootDomain().begin(),
+         out_tv->getMaybeRootDomain().end()},
+        {out_tv->getLogicalDomain().begin(), out_tv->getLogicalDomain().end()});
+
+    // NOTE: if we can assume that PadOp is always on inputs, then we can skip
+    // to the innermost resize instead.
+    auto resize_ops = ir_utils::filterByType<Resize>(consumer_exprs);
+    std::copy(
+        resize_ops.begin(),
+        resize_ops.end(),
+        std::inserter(resize_in_pad_, resize_in_pad_.end()));
+  }
+}
+
 ContiguousInnerDimensionsMapper::ContiguousInnerDimensionsMapper(
     TensorView* reference,
     const std::vector<IterDomain*>& ids,
@@ -67,6 +91,9 @@ ContiguousInnerDimensionsMapper::ContiguousInnerDimensionsMapper(
       ca_map_(std::move(ca_map)),
       divisible_splits_(divisible_splits) {
   FusionGuard fg(reference->fusion());
+
+  initializeResizeInfo(reference->fusion());
+
   // Exclude reduction IDs if the reference is a fusion input as they
   // don't manifest at all in the fusion. This simplifies the
   // analysis in getContigMergeOfInnerSize, which only looks at
@@ -365,9 +392,51 @@ std::vector<IterDomain*> ContiguousInnerDimensionsMapper::projectId(
     distributePE(merge_or_split);
   };
 
-  auto clear_left_of = [&frontier](IterDomain* id) {
-    auto it = std::find(frontier.begin(), frontier.end(), id);
-    if (it != frontier.end()) {
+  auto propagateResize = [&frontier, this](Resize* resize_op, bool p2c) {
+    IterDomain* id_from = p2c ? resize_op->in() : resize_op->out();
+    IterDomain* id_to = p2c ? resize_op->out() : resize_op->in();
+
+    auto it = std::find(frontier.begin(), frontier.end(), id_from);
+    if (it == frontier.end()) {
+      return;
+    }
+
+    auto pos = std::distance(frontier.begin(), it);
+    if (resize_in_pad_.count(resize_op) != 0) {
+      // resize created by PadOp.
+
+      // project the resize op to the frontier.
+      frontier[pos] = id_to;
+      // clear left of the resize, since those ids are no longer contiguous.
+      frontier.erase(frontier.begin(), it);
+
+      if (recording_) {
+        // TODO: support negative resize extent.
+        //
+        // Limit current support to only positive resize extents for now. So we
+        // only consider the pad extent, which becomes the real buffer on the
+        // output. Hence we take the GCD of the padded extents and the extent
+        // of id_from. Note that since we are taking the GCD here, using
+        // id_from or id_to should not make a difference.
+        auto consumer_factor = getProjectedExtent(id_from);
+        auto comp = [](Val* factor, Val* extent) {
+          return SimplifyingIrBuilder::whereExpr(
+              SimplifyingIrBuilder::eqExpr(
+                  extent, extent->container()->zeroVal()),
+              factor,
+              // for extent < 0, we take max(1, extent). Because of the gcd,
+              // this effectively excludes the resize id from vectorization.
+              SimplifyingIrBuilder::gcdExpr(
+                  factor,
+                  SimplifyingIrBuilder::maxExpr(
+                      extent->container()->oneVal(), extent)));
+        };
+        consumer_factor = comp(consumer_factor, resize_op->leftExpand());
+        consumer_factor = comp(consumer_factor, resize_op->rightExpand());
+        addProjectedExtent(id_to, consumer_factor);
+      }
+    } else {
+      // unsupported resize.
       frontier.erase(frontier.begin(), it + 1);
     }
   };
@@ -391,8 +460,7 @@ std::vector<IterDomain*> ContiguousInnerDimensionsMapper::projectId(
     } else if (Merge* merge = dynamic_cast<Merge*>(expr)) {
       propagateDistribute(merge);
     } else if (Resize* resize = dynamic_cast<Resize*>(expr)) {
-      // Cannot vectorize through resize
-      clear_left_of(resize->out());
+      propagateResize(resize, false);
     } else {
       // TODO: I wonder if we should just remove all inputs instead of erroring.
       // Seems that would be safe.
@@ -415,8 +483,7 @@ std::vector<IterDomain*> ContiguousInnerDimensionsMapper::projectId(
     } else if (Split* split = dynamic_cast<Split*>(expr)) {
       propagateDistribute(split);
     } else if (Resize* resize = dynamic_cast<Resize*>(expr)) {
-      // Cannot vectorize through resize
-      clear_left_of(resize->in());
+      propagateResize(resize, true);
     } else {
       // TODO: I wonder if we should just remove all inputs instead of erroring.
       // Seems that would be safe.
diff --git a/csrc/scheduler/vectorize_helper.h b/csrc/scheduler/vectorize_helper.h
index 193df7786c9..d5c8e26c406 100644
--- a/csrc/scheduler/vectorize_helper.h
+++ b/csrc/scheduler/vectorize_helper.h
@@ -31,7 +31,7 @@ namespace vectorize_helper {
 
 // Projects IterDomains through the fusion starting at provided reference. IDs
 // in the reference are expected to be "contiguous", simply means dimensions
-// that the iter domains are consecutive and next to eachother in the
+// that the iter domains are consecutive and next to each other in the
 // reference. This property is not enforced, but mapping can have some
 // unpredictable results if they are not. The reason we want contiguity here
 // is this class is primarily used for vectorization analysis. Domains may be
@@ -78,7 +78,7 @@ namespace vectorize_helper {
 // tv1[2*3, 5, 7*11] = view(tv0)
 // with tv1 and [2*3, 7*11] as the reference and ids. tv0's 2 and 11 dim are
 // easily identified as being mapped. The 3*5*7 dimension however, is
-// partially mapped on the left and right side.  Since this class is intended to
+// partially mapped on the left and right side. Since this class is intended to
 // line up "inner dimensions" of tensors through out the graph for the purpose
 // of unrolling and vectorization, it only tracks partial dimensions as they are
 // on the right hand side of iteration domains. For example in the last case we
@@ -289,6 +289,9 @@ class NVF_API ContiguousInnerDimensionsMapper
   void propagateP2C(TensorView* from, TensorView* to) final;
   void propagateSibling(TensorView* from, TensorView* to) final;
 
+  // traverse the fusion to mark the origin of each Resize
+  void initializeResizeInfo(Fusion* fusion);
+
   // Initialized to false, series of compute... calls will be performed to find
   // the spanning tree. Then propagate... calls will call the compute... calls.
   // recording_ starts as false, and stays that way during the first series of
@@ -308,6 +311,9 @@ class NVF_API ContiguousInnerDimensionsMapper
       tv_infos_;
 
   std::unordered_map<IterDomain*, Val*> projected_extent_;
+
+  //! stores all Resize* exprs that were introduced by a PadOp
+  std::unordered_set<Resize*> resize_in_pad_;
 };
 
 // logical_reorder_map is provided to assume reference_tv will be reordered per
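Per pad extent, the comp lambda above evaluates where(extent == 0, factor, gcd(factor, max(1, extent))). A self-contained sketch using plain integers instead of nvFuser Vals (assuming a single padded inner id whose projected extent starts at 1024*1024, matching the new tests further below):

    #include <algorithm>
    #include <cstdint>
    #include <iostream>
    #include <numeric>
    #include <utility>

    // Mirrors the comp lambda: a zero pad extent leaves the projected factor
    // untouched; a positive extent clamps it via gcd; a negative extent becomes
    // max(1, extent) == 1, so the gcd drops this id's factor to 1.
    int64_t applyPadExtent(int64_t factor, int64_t pad_extent) {
      if (pad_extent == 0) {
        return factor;
      }
      return std::gcd(factor, std::max<int64_t>(1, pad_extent));
    }

    int main() {
      const int64_t projected = 1024 * 1024; // projected extent of the inner id
      const std::pair<int64_t, int64_t> pads[] = {{4, 4}, {2, 2}, {0, 2}, {-4, 4}};
      for (auto [left, right] : pads) {
        // {4, 4} -> 4, {2, 2} -> 2, {0, 2} -> 2, {-4, 4} -> 1 for this id alone
        // (the negative-extent test below still vectorizes by 2 through the
        // untouched innermost dimension)
        std::cout << applyPadExtent(applyPadExtent(projected, left), right)
                  << "\n";
      }
      return 0;
    }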
diff --git a/csrc/tensor_view.cpp b/csrc/tensor_view.cpp
index b85c4947b1c..afab5f5a598 100644
--- a/csrc/tensor_view.cpp
+++ b/csrc/tensor_view.cpp
@@ -1169,15 +1169,37 @@ TensorView* TensorView::cacheFork() {
 TensorView* TensorView::cacheAfter(
     LoadStoreOpType op_type,
     CacheOp cache_op,
-    bool propagate_allocation_domain) {
+    bool propagate_allocation_domain,
+    std::vector<Expr*> cached_uses) {
   NVF_ERROR(
       !container()->isA<kir::Kernel>(),
       "Function invalid for kernel container.");
   FusionGuard fg(fusion());
 
+  if (!cached_uses.empty()) {
+    std::unordered_set<Expr*> unique_uses = fusion()->unordered_uses(this);
+    for (auto use : cached_uses) {
+      NVF_ERROR(
+          unique_uses.count(use),
+          "cached_uses is not among the uses of the TensorView");
+    }
+  } else {
+    // avoid non-determinism and ensure uniqueness
+    std::unordered_set<Expr*> unique_uses;
+    auto this_uses = uses();
+    cached_uses.reserve(this_uses.size());
+    for (Expr* use : this_uses) {
+      NVF_ERROR(
+          unique_uses.count(use) == 0,
+          "detected duplicated entries in TensorView::uses()");
+      cached_uses.push_back(use);
+      unique_uses.insert(use);
+    }
+  }
+
   // Get all the uses for this Tensorview
   NVF_CHECK(
-      !uses().empty(),
+      !cached_uses.empty(),
       "Error adding cacheAfter ",
       this,
       " we restrict using cacheAfter on tensors that have no further uses.");
@@ -1188,18 +1210,19 @@ TensorView* TensorView::cacheAfter(
       !hasComputeAt(),
       "Caching computed-at tensors is not allowed. Apply caching before computeAt.");
 
-  bool is_allowed_op =
-      !ir_utils::isTvUsedByOpsOfType<SliceOp, SelectOp, PadOp>(this) &&
-      !ir_utils::isIndexSelectLookupTv(this);
-  NVF_CHECK(
-      is_allowed_op,
-      "Right now, caching tensors that are input to the select/slice/pad ops are not allowed as they must be in global memory.")
+  // disallow cache on ops where we require data to remain in global memory.
+  for (auto use : cached_uses) {
+    NVF_ERROR(
+        !(use->isOneOf<SliceOp, SelectOp, PadOp>()) &&
+            !(use->isA<IndexSelectOp>() && use->input(0) == this),
+        "Right now, caching tensors that are input to the select/slice/pad ops are not allowed as they must be in global memory.");
+  }
 
   // It also did additional transformation when this tensor is an
   // input and the outputs of its consumers have computeAt. Make sure
   // we no longer rely on that behavior.
   if (isFusionInput()) {
-    for (const auto& expr : uses()) {
+    for (const auto& expr : cached_uses) {
       for (TensorView* output :
            ir_utils::filterByType<TensorView>(expr->outputs())) {
         NVF_CHECK(
@@ -1242,7 +1265,7 @@ TensorView* TensorView::cacheAfter(
   // After: This TV -> [Set Op] -> New CA TV -> [Use Op] -> Next TV
 
   // Expr* consumer_uses =
-  for (auto expr : fusion()->unordered_uses(this)) {
+  for (auto expr : cached_uses) {
     ir_utils::replaceValInExprInputs(expr, this, consumer);
   }
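The new tests below all repeat the same check: scan the loop domain of the pad output (or of the cached relu input) and compare the extent of the IterDomain parallelized with ParallelType::Vectorize. A hypothetical helper that captures that loop, shown only to make the repeated EXPECT blocks easier to follow (not part of this change):

    // Return the vectorized loop IterDomain of tv, or nullptr if no loop id is
    // parallelized with ParallelType::Vectorize.
    IterDomain* getVectorizedLoopId(TensorView* tv) {
      for (IterDomain* id : tv->getLoopDomain()) {
        if (id->getParallelType() == ParallelType::Vectorize) {
          return id;
        }
      }
      return nullptr;
    }

    // Usage mirroring the checks below:
    //   auto* vec_id = getVectorizedLoopId(pad_out);
    //   ASSERT_NE(vec_id, nullptr);
    //   EXPECT_EQ(vec_id->extent()->evaluate(), 4);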
diff --git a/tests/cpp/test_resize.cpp b/tests/cpp/test_resize.cpp
index ed02e67ee5f..09558e05cf2 100644
--- a/tests/cpp/test_resize.cpp
+++ b/tests/cpp/test_resize.cpp
@@ -2660,7 +2660,7 @@ TEST_F(ResizeTest, Slice1DVectorizeManual1) {
   const int64_t slice_offset = 4;
   const std::vector<int64_t> shape({1024L * 1024L});
 
-  // Using a concrete tensor to avoid dynamic reshape
+  // Using a concrete tensor to avoid dynamic resize
   auto tv0 = makeContigConcreteTensor(shape);
   fusion.addInput(tv0);
 
@@ -3439,9 +3439,8 @@ TEST_F(ResizeTest, SqueezeSlicedExpand) {
       __FILE__);
 }
 
-// Vectorization through resize is not supported yet. Make sure
-// vectorization is disabled.
-TEST_F(ResizeTest, AvoidVectorization) {
+// Vectorization through pad is supported now!
+TEST_F(ResizeTest, PadVectorization) {
   Fusion fusion;
   FusionGuard fg(&fusion);
 
@@ -3471,9 +3470,8 @@
-  // Make sure tv1 is not vectorized, i.e., no loop IterDomains are vectorized.
+  // Make sure tv1 is vectorized, i.e., at least one loop IterDomain is
+  // vectorized.
   EXPECT_THAT(
       tv1->getLoopDomain(),
-      Each(
-          Property(&IterDomain::getParallelType, Not(ParallelType::Vectorize))))
-      << "Unexpected vectorization: " << tv1;
+      Contains(
+          Property(&IterDomain::getParallelType, ParallelType::Vectorize)))
+      << "Failed to vectorize: " << tv1;
 
   // Make sure tv2 should be vectorized, i.e., at least one loop IterDomain is
   // vectorized.
@@ -4093,7 +4091,6 @@ TEST_F(ResizeTest, VectorizeWhereLowering) {
   FusionGuard fg(fusion_ptr.get());
 
   const std::vector<int64_t> shape({1024L * 1024L});
-
   // Note: nvfuser currently only supports vectorization with a single
   // TensorView input.
   auto s0 = IrBuilder::create<Val>(DataType::Bool);
@@ -4123,4 +4120,302 @@ TEST_F(ResizeTest, VectorizeWhereLowering) {
 
   ASSERT_TRUE(t0.equal(cg_outputs[0]));
 }
 
+TEST_F(ResizeTest, VectorizeFactorFour) {
+  Fusion fusion;
+  FusionGuard fg(&fusion);
+
+  const std::vector<int64_t> shape({1024L * 1024L});
+
+  // Using a concrete tensor to avoid dynamic resize
+  auto tv0 = makeContigConcreteTensor(shape);
+  fusion.addInput(tv0);
+
+  auto tv1 = pad(tv0, {IrBuilder::create<Val>(4L), IrBuilder::create<Val>(4L)});
+  fusion.addOutput(tv1);
+
+  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+  auto t0 = at::randn(shape, options);
+  std::vector<c10::IValue> aten_inputs({t0});
+  auto cg_outputs =
+      scheduleAndRun(&fusion, SchedulerType::PointWise, aten_inputs).outputs;
+
+  // check that we vectorize 4
+  bool found_vectorize = false;
+  auto exprs = fusion.exprs();
+  auto pad_ops = ir_utils::filterByType<PadOp>(exprs).vector();
+  EXPECT_EQ(pad_ops.size(), 1);
+  EXPECT_TRUE(pad_ops.at(0)->out()->isA<TensorView>());
+  for (auto id : pad_ops.at(0)->out()->as<TensorView>()->getLoopDomain()) {
+    if (id->getParallelType() == ParallelType::Vectorize) {
+      EXPECT_EQ(id->extent()->evaluate(), 4);
+      found_vectorize = true;
+      break;
+    }
+  }
+  EXPECT_TRUE(found_vectorize);
+
+  testValidate(&fusion, cg_outputs, aten_inputs, __LINE__, __FILE__);
+}
+
+// This test checks that the pad extent is used to limit the vectorization
+// factor.
+TEST_F(ResizeTest, VectorizeFactorTwo) {
+  Fusion fusion;
+  FusionGuard fg(&fusion);
+
+  const std::vector<int64_t> shape({1024L * 1024L});
+
+  // Using a concrete tensor to avoid dynamic resize
+  auto tv0 = makeContigConcreteTensor(shape);
+  fusion.addInput(tv0);
+
+  // pad extent would restrict vectorization factor
+  auto tv1 = pad(tv0, {IrBuilder::create<Val>(2L), IrBuilder::create<Val>(2L)});
+  fusion.addOutput(tv1);
+
+  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+  auto t0 = at::randn(shape, options);
+  std::vector<c10::IValue> aten_inputs({t0});
+  auto cg_outputs =
+      scheduleAndRun(&fusion, SchedulerType::PointWise, aten_inputs).outputs;
+
+  // check that we vectorize 2
+  bool found_vectorize = false;
+  auto exprs = fusion.exprs();
+  auto pad_ops = ir_utils::filterByType<PadOp>(exprs).vector();
+  EXPECT_EQ(pad_ops.size(), 1);
+  EXPECT_TRUE(pad_ops.at(0)->out()->isA<TensorView>());
+  for (auto id : pad_ops.at(0)->out()->as<TensorView>()->getLoopDomain()) {
+    if (id->getParallelType() == ParallelType::Vectorize) {
+      EXPECT_EQ(id->extent()->evaluate(), 2);
+      found_vectorize = true;
+      break;
+    }
+  }
+  EXPECT_TRUE(found_vectorize);
+
+  testValidate(&fusion, cg_outputs, aten_inputs, __LINE__, __FILE__);
+}
+
+// This test checks that a pad with an extent of 0 on one side still limits the
+// vectorization factor by the non-zero extent.
+TEST_F(ResizeTest, VectorizeFactorTwoPadZeroExtent) {
+  Fusion fusion;
+  FusionGuard fg(&fusion);
+
+  const std::vector<int64_t> shape({1024L * 1024L});
+
+  // Using a concrete tensor to avoid dynamic resize
+  auto tv0 = makeContigConcreteTensor(shape);
+  fusion.addInput(tv0);
+
+  // pad extent would restrict vectorization factor
+  auto tv1 = pad(tv0, {IrBuilder::create<Val>(0L), IrBuilder::create<Val>(2L)});
+  fusion.addOutput(tv1);
+
+  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+  auto t0 = at::randn(shape, options);
+  std::vector<c10::IValue> aten_inputs({t0});
+  auto cg_outputs =
+      scheduleAndRun(&fusion, SchedulerType::PointWise, aten_inputs).outputs;
+
+  // check that we vectorize 2
+  bool found_vectorize = false;
+  auto exprs = fusion.exprs();
+  auto pad_ops = ir_utils::filterByType<PadOp>(exprs).vector();
+  EXPECT_EQ(pad_ops.size(), 1);
+  EXPECT_TRUE(pad_ops.at(0)->out()->isA<TensorView>());
+  for (auto id : pad_ops.at(0)->out()->as<TensorView>()->getLoopDomain()) {
+    if (id->getParallelType() == ParallelType::Vectorize) {
+      EXPECT_EQ(id->extent()->evaluate(), 2);
+      found_vectorize = true;
+      break;
+    }
+  }
+  EXPECT_TRUE(found_vectorize);
+
+  testValidate(&fusion, cg_outputs, aten_inputs, __LINE__, __FILE__);
+}
+
+TEST_F(ResizeTest, VectorizePadNonInnermost) {
+  Fusion fusion;
+  FusionGuard fg(&fusion);
+
+  const std::vector<int64_t> shape({1024L, 1024L, 2L});
+
+  // Using a concrete tensor to avoid dynamic resize
+  auto tv0 = makeContigConcreteTensor(shape);
+  fusion.addInput(tv0);
+
+  auto tv1 =
+      pad(tv0,
+          {IrBuilder::create<Val>(0L),
+           IrBuilder::create<Val>(0L),
+           IrBuilder::create<Val>(4L),
+           IrBuilder::create<Val>(4L),
+           IrBuilder::create<Val>(0L),
+           IrBuilder::create<Val>(0L)});
+  fusion.addOutput(tv1);
+
+  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+  auto t0 = at::randn(shape, options);
+  std::vector<c10::IValue> aten_inputs({t0});
+  auto cg_outputs =
+      scheduleAndRun(&fusion, SchedulerType::PointWise, aten_inputs).outputs;
+
+  // check that we vectorize 4
+  bool found_vectorize = false;
+  auto exprs = fusion.exprs();
+  auto pad_ops = ir_utils::filterByType<PadOp>(exprs).vector();
+  EXPECT_EQ(pad_ops.size(), 1);
+  EXPECT_TRUE(pad_ops.at(0)->out()->isA<TensorView>());
+  for (auto id : pad_ops.at(0)->out()->as<TensorView>()->getLoopDomain()) {
+    if (id->getParallelType() == ParallelType::Vectorize) {
+      EXPECT_EQ(id->extent()->evaluate(), 4);
+      found_vectorize = true;
+      break;
+    }
+  }
+  EXPECT_TRUE(found_vectorize);
+
+  testValidate(&fusion, cg_outputs, aten_inputs, __LINE__, __FILE__);
+}
+
+// padding with a negative extent should prevent us from considering the resize
+// id for vectorization, so the example below should only have a vectorization
+// factor of 2
+TEST_F(ResizeTest, VectorizePadNonInnermostNegativeExtent) {
+  Fusion fusion;
+  FusionGuard fg(&fusion);
+
+  const std::vector<int64_t> shape({1024L, 1024L, 2L});
+
+  // Using a concrete tensor to avoid dynamic resize
+  auto tv0 = makeContigConcreteTensor(shape);
+  fusion.addInput(tv0);
+
+  auto tv1 =
+      pad(tv0,
+          {IrBuilder::create<Val>(0L),
+           IrBuilder::create<Val>(0L),
+           IrBuilder::create<Val>(-4L),
+           IrBuilder::create<Val>(4L),
+           IrBuilder::create<Val>(0L),
+           IrBuilder::create<Val>(0L)});
+  fusion.addOutput(tv1);
+
+  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+  auto t0 = at::randn(shape, options);
+  std::vector<c10::IValue> aten_inputs({t0});
+  auto cg_outputs =
+      scheduleAndRun(&fusion, SchedulerType::PointWise, aten_inputs).outputs;
+
+  // check that we vectorize 2
+  bool found_vectorize = false;
+  auto exprs = fusion.exprs();
+  auto pad_ops = ir_utils::filterByType<PadOp>(exprs).vector();
+  EXPECT_EQ(pad_ops.size(), 1);
+  EXPECT_TRUE(pad_ops.at(0)->out()->isA<TensorView>());
+  for (auto id : pad_ops.at(0)->out()->as<TensorView>()->getLoopDomain()) {
+    if (id->getParallelType() == ParallelType::Vectorize) {
+      EXPECT_EQ(id->extent()->evaluate(), 2);
+      found_vectorize = true;
+      break;
+    }
+  }
+  EXPECT_TRUE(found_vectorize);
+
+  testValidate(&fusion, cg_outputs, aten_inputs, __LINE__, __FILE__);
+}
+
+TEST_F(ResizeTest, PadAndCacheUses) {
+  Fusion fusion;
+  FusionGuard fg(&fusion);
+
+  const std::vector<int64_t> shape({1024L * 1024L});
+
+  // Using a concrete tensor to avoid dynamic resize
+  auto tv0 = makeContigConcreteTensor(shape);
+  fusion.addInput(tv0);
+
+  auto tv1 = pad(tv0, {IrBuilder::create<Val>(4L), IrBuilder::create<Val>(4L)});
+  fusion.addOutput(tv1);
+  auto tv2 = relu(tv0);
+  fusion.addOutput(tv2);
+
+  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+  auto t0 = at::randn(shape, options);
+  std::vector<c10::IValue> aten_inputs({t0});
+  auto cg_outputs =
+      scheduleAndRun(&fusion, SchedulerType::PointWise, aten_inputs).outputs;
+
+  // check that the pad vectorizes by 4
+  bool found_vectorize = false;
+  auto exprs = fusion.exprs();
+  auto pad_ops = ir_utils::filterByType<PadOp>(exprs).vector();
+  EXPECT_EQ(pad_ops.size(), 1);
+  EXPECT_TRUE(pad_ops.at(0)->out()->isA<TensorView>());
+  for (auto id : pad_ops.at(0)->out()->as<TensorView>()->getLoopDomain()) {
+    if (id->getParallelType() == ParallelType::Vectorize) {
+      EXPECT_EQ(id->extent()->evaluate(), 4);
+      found_vectorize = true;
+      break;
+    }
+  }
+  EXPECT_TRUE(found_vectorize);
+
+  // check that the relu vectorizes by 4
+  found_vectorize = false;
+  auto uops = ir_utils::filterByType<UnaryOp>(exprs).vector();
+  EXPECT_EQ(uops.size(), 1);
+  EXPECT_TRUE(uops.at(0)->in()->isA<TensorView>());
+  for (auto id : uops.at(0)->in()->as<TensorView>()->getLoopDomain()) {
+    if (id->getParallelType() == ParallelType::Vectorize) {
+      EXPECT_EQ(id->extent()->evaluate(), 4);
+      found_vectorize = true;
+      break;
+    }
+  }
+  EXPECT_TRUE(found_vectorize);
+
+  testValidate(&fusion, cg_outputs, aten_inputs, __LINE__, __FILE__);
+}
+
+// we cannot yet test this one, as the pad in the middle causes segmentation.
+// This test checks that propagation of the vectorization factor is not stopped
+// by padding on a non-innermost dimension, when the pad operation isn't the
+// vectorized operation.
+// TEST_F(ResizeTest, PropagatePadNonInnermost) {
+//   Fusion fusion;
+//   FusionGuard fg(&fusion);
+//
+//   const std::vector<int64_t> shape({1024L, 1024L, 2L});
+//
+//   // Using a concrete tensor to avoid dynamic resize
+//   auto tv0 = makeContigConcreteTensor(shape);
+//   fusion.addInput(tv0);
+//   auto tv1 = relu(tv0);
+//   auto tv2 =
+//       pad(tv1,
+//           {IrBuilder::create<Val>(0L),
+//            IrBuilder::create<Val>(0L),
+//            IrBuilder::create<Val>(3L),
+//            IrBuilder::create<Val>(3L),
+//            IrBuilder::create<Val>(0L),
+//            IrBuilder::create<Val>(0L)});
+//   fusion.addOutput(tv2);
+//
+//   auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+//   auto t0 = at::randn(shape, options);
+//   std::vector<c10::IValue> aten_inputs({t0});
+//   auto cg_outputs =
+//       scheduleAndRun(&fusion, SchedulerType::PointWise, aten_inputs).outputs;
+//
+//   auto ref = at::pad(t0.relu(), {0, 0, 3, 3, 0, 0});
+//
+//   NVF_CHECK(ref.equal(cg_outputs[0]));
+//   // TODO: check vectorization factor
+// }
+
 } // namespace nvfuser