From 2fb5539c886458f33e4008c67d3b77f96c4e1c21 Mon Sep 17 00:00:00 2001
From: jjsjann123
Date: Tue, 12 Nov 2024 20:54:37 -0800
Subject: [PATCH] Adding resize(PadOp) vectorization analysis (#3321)

Adding **conditional** support for resize in vectorization analysis. This PR
allows vectorized loads on `PadOp` directly without using a cache load, which
improves the performance of the generated kernel.

What's in this PR:

1. Add a propagation rule for resize in vectorization analysis. The rule works
   as follows:
   i. For a supported resize:
      a) project the resize op to the frontier and clear
         `(frontier.begin(), resize_position)`;
      b) add the projected extent of the new resize op as the gcd of the
         projected extent of `id_from`, `resize_op->leftExpand()`, and
         `resize_op->rightExpand()` (see the sketch after the diff).
   ii. For an unsupported resize: clear `[frontier.begin(), resize_position]`;
       no behavior change.
2. Update `TensorView::cacheAfter` to opt in a set of uses to cache while
   leaving other uses unchanged. This is necessary when an input is used by a
   `PadOp` as well as by other operations that rely on a cached load for
   vectorization (see the usage sketch after the diff).

Follow-up to #3261. Part of the work to support RoPE performance.
[design doc](https://docs.google.com/document/d/1tafRMNIXMmHlIGAiNlaPkYp6mZAzJ2Rh_NtARHbmYNA/edit?disco=AAABYEnV_ZY):

---------

Co-authored-by: Naoya Maruyama
---
 csrc/ir/interface_nodes.h           |   8 +-
 csrc/preseg_passes/move_pad.cpp     |   5 +-
 csrc/scheduler/utils.cpp            |  68 +++---
 csrc/scheduler/vectorize_helper.cpp |  81 +++++++-
 csrc/scheduler/vectorize_helper.h   |  10 +-
 csrc/tensor_view.cpp                |  43 +++-
 tests/cpp/test_resize.cpp           | 311 +++++++++++++++++++++++++++-
 7 files changed, 463 insertions(+), 63 deletions(-)

diff --git a/csrc/ir/interface_nodes.h b/csrc/ir/interface_nodes.h
index f32211a166d..260af932cda 100644
--- a/csrc/ir/interface_nodes.h
+++ b/csrc/ir/interface_nodes.h
@@ -482,10 +482,16 @@ class NVF_API TensorView : public Val {
   //!
   //! @param op_type: memory operator to use for the inserted op between
   //! the the data tensor and the cache tensor
+  //! @param cache_op: cache operator, see enum class CacheOp
+  //! @param propagate_allocation_domain: replay allocation domain on cached
+  //! load
+  //! @param cached_uses: if empty, cache all uses; otherwise, only try to cache
+  //! uses in cached_uses.
   TensorView* cacheAfter(
       LoadStoreOpType op_type = LoadStoreOpType::Set,
       CacheOp cache_op = CacheOp::Unspecified,
-      bool propagate_allocation_domain = true);
+      bool propagate_allocation_domain = true,
+      std::vector<Expr*> cached_uses = {});
 
   // For a fusion output with other uses, we want to avoid writing to global
   // memory and then reading the output again.
We write to global memory diff --git a/csrc/preseg_passes/move_pad.cpp b/csrc/preseg_passes/move_pad.cpp index bb277e62566..e2773ac6ea2 100644 --- a/csrc/preseg_passes/move_pad.cpp +++ b/csrc/preseg_passes/move_pad.cpp @@ -287,7 +287,10 @@ TensorView* replayConcretePad( auto* new_out = IrBuilder::create( IrBuilder::create( - merged_root_ids, merged_logical_ids, merged_logical_ids), + merged_root_ids, + merged_logical_ids, + merged_logical_ids, + TensorDomain::getContiguityFilledWith(merged_logical_ids, true)), pad_tv->getDataType().value()); IrBuilder::create( new_out, diff --git a/csrc/scheduler/utils.cpp b/csrc/scheduler/utils.cpp index 260f5813be7..667ea7f40f2 100644 --- a/csrc/scheduler/utils.cpp +++ b/csrc/scheduler/utils.cpp @@ -1198,12 +1198,36 @@ std::vector cacheInputs(Fusion* fusion, bool unroll) { for (auto tv : in_tvs) { if (tv->uses().empty() || ir_utils::isTorchGatherLookupTv(tv) || ir_utils::isIndexSelectLookupTv(tv) || - ir_utils::isTvUsedByOpsOfType(tv)) { + ir_utils::isTvUsedByOpsOfType(tv)) { // Right now, tensors that are input to the slice, select, and pad ops // can't be cached as they must be in global memory. continue; } - auto cached_tv = tv->cacheAfter(); + + // TODO: might need to reverse this when scheduler handles pad directly + // Do not insert a cache for pad as vectorization needs to be + // done directly. + // + // Note that this means that if an input is padded and also is + // used without padding, it will be read twice, once for pad and + // once more for caching load. It would make sense to use the PTX + // caching load instructions. + std::vector cached_uses; + for (auto use : tv->uses()) { + if (!use->isA()) { + cached_uses.push_back(use); + } + } + + if (cached_uses.empty()) { + continue; + } + + auto cached_tv = tv->cacheAfter( + /*op_type=*/LoadStoreOpType::Set, + /*cache_op=*/CacheOp::Unspecified, + /*propagate_allocation_domain=*/true, + /*cached_uses=*/cached_uses); cached_inputs.emplace_back(cached_tv); } return cached_inputs; @@ -1290,12 +1314,7 @@ IterDomain* projectIdToRoot( } else if (expr->isA()) { auto resize = expr->as(); if (resize->out() == projected_id) { - // We do not allow vectorization with resize at this moment - if (vectorize_pass) { - projected_id = nullptr; - } else { - projected_id = resize->in(); - } + projected_id = resize->in(); } } else { NVF_THROW("Didn't recognize the iterdomain expression: ", expr); @@ -1350,12 +1369,7 @@ IterDomain* projectIdToRFactor( } else if (expr->isA()) { auto resize = expr->as(); if (resize->in() == projected_id) { - // We do not allow vectorization wit resize at this moment - if (vectorize_pass) { - projected_id = nullptr; - } else { - projected_id = resize->out(); - } + projected_id = resize->out(); } } else { NVF_THROW("Didn't recognize the iterdomain expression: ", expr); @@ -1549,12 +1563,6 @@ std::vector getInputsOutputsWithInnerDim( // scheduler prefer to use output instead of input as reference tensor. for (auto output_tv : ir_utils::filterByType(reference_tv->fusion()->outputs())) { - // At this moment, vectorization through resize is not - // supported. This is not required currently as we always insert - // cacheBefore, but just in case. 
- if (ir_utils::hasResizedRfactor(output_tv)) { - continue; - } if (hasInnerDim(output_tv, vectorizable_dims, vectorize_pass)) { vectorizable_tensors.push_back(output_tv); } @@ -1569,19 +1577,11 @@ std::vector getInputsOutputsWithInnerDim( continue; } - auto expr_resizes = [](Expr* e) -> bool { - return std::any_of( - e->outputs().begin(), e->outputs().end(), [](Val* out) -> bool { - if (auto* out_tv = dynamic_cast(out)) { - return ir_utils::hasResizedRfactor(out_tv); - } - return false; - }); - }; - - // At this moment, vectorization through resize is not supported - if (std::any_of( - input_tv->uses().begin(), input_tv->uses().end(), expr_resizes)) { + // Slice op is explicitly not enabled for vectorized load. + if (std::all_of( + input_tv->uses().begin(), + input_tv->uses().end(), + [](Expr* e) -> bool { return e->isA(); })) { continue; } @@ -2385,7 +2385,7 @@ bool revertUseOfInputCache( void prepareForMemoryTypePromotion(Fusion* fusion) { auto non_pwise_pairs = getNonPointwiseProducerConsumerPairs(fusion); - // Inserting a copy of each proucer. If a tensor shows up as a + // Inserting a copy of each producer. If a tensor shows up as a // producer for multiple consumers, only insert one // copy and share it with all the consumers. diff --git a/csrc/scheduler/vectorize_helper.cpp b/csrc/scheduler/vectorize_helper.cpp index 26fa824ad36..3dcad2b497d 100644 --- a/csrc/scheduler/vectorize_helper.cpp +++ b/csrc/scheduler/vectorize_helper.cpp @@ -55,6 +55,30 @@ Val* ContiguousInnerDimensionsMapper::isFullyProjected(IterDomain* id) { getProjectedExtent(id), commonOrConstExtent(ca_map_, id)); } +void ContiguousInnerDimensionsMapper::initializeResizeInfo(Fusion* fusion) { + auto exprs = fusion->exprs(); + for (auto* pad_op : ir_utils::filterByType(exprs)) { + if (!pad_op->out()->isA()) { + continue; + } + + auto* out_tv = pad_op->out()->as(); + + auto consumer_exprs = StmtSort::getExprsBetween( + {out_tv->getMaybeRootDomain().begin(), + out_tv->getMaybeRootDomain().end()}, + {out_tv->getLogicalDomain().begin(), out_tv->getLogicalDomain().end()}); + + // NOTE: if we can assume that PadOp is always on inputs, then we can skip + // to innermost resize instead. + auto resize_ops = ir_utils::filterByType(consumer_exprs); + std::copy( + resize_ops.begin(), + resize_ops.end(), + std::inserter(resize_in_pad_, resize_in_pad_.end())); + } +} + ContiguousInnerDimensionsMapper::ContiguousInnerDimensionsMapper( TensorView* reference, const std::vector& ids, @@ -67,6 +91,9 @@ ContiguousInnerDimensionsMapper::ContiguousInnerDimensionsMapper( ca_map_(std::move(ca_map)), divisible_splits_(divisible_splits) { FusionGuard fg(reference->fusion()); + + initializeResizeInfo(reference->fusion()); + // Exclude reduction IDs if the reference is a fusion input as they // don't manifest at all in the fusion. This simplifies the // analysis in getContigMergeOfInnerSize, which only looks at @@ -365,9 +392,51 @@ std::vector ContiguousInnerDimensionsMapper::projectId( distributePE(merge_or_split); }; - auto clear_left_of = [&frontier](IterDomain* id) { - auto it = std::find(frontier.begin(), frontier.end(), id); - if (it != frontier.end()) { + auto propagateResize = [&frontier, this](Resize* resize_op, bool p2c) { + IterDomain* id_from = p2c ? resize_op->in() : resize_op->out(); + IterDomain* id_to = p2c ? 
resize_op->out() : resize_op->in(); + + auto it = std::find(frontier.begin(), frontier.end(), id_from); + if (it == frontier.end()) { + return; + } + + auto pos = std::distance(frontier.begin(), it); + if (resize_in_pad_.count(resize_op) != 0) { + // resize created by PadOp. + + // project resize op to frontier. + frontier[pos] = id_to; + // clear left of resize, since those are no long contiguous. + frontier.erase(frontier.begin(), it); + + if (recording_) { + // TODO: support negative resize extent. + // + // Limit current support to only positive resize extent for now. So we + // only consider the pad_extent, which becomes the real buffer on + // output. Hence we do GCD among padded extent as well as extent of the + // id_from. Note since we are taking the GCD here, I don't think using + // id_from or id_to makes a difference. + auto consumer_factor = getProjectedExtent(id_from); + auto comp = [](Val* factor, Val* extent) { + return SimplifyingIrBuilder::whereExpr( + SimplifyingIrBuilder::eqExpr( + extent, extent->container()->zeroVal()), + factor, + // for extent < 0, we'll take max(1, extent). Because of the gcd, + // This is effectively excluding the resize id from vectorization. + SimplifyingIrBuilder::gcdExpr( + factor, + SimplifyingIrBuilder::maxExpr( + extent->container()->oneVal(), extent))); + }; + consumer_factor = comp(consumer_factor, resize_op->leftExpand()); + consumer_factor = comp(consumer_factor, resize_op->rightExpand()); + addProjectedExtent(id_to, consumer_factor); + } + } else { + // unsupproted resize. frontier.erase(frontier.begin(), it + 1); } }; @@ -391,8 +460,7 @@ std::vector ContiguousInnerDimensionsMapper::projectId( } else if (Merge* merge = dynamic_cast(expr)) { propagateDistribute(merge); } else if (Resize* resize = dynamic_cast(expr)) { - // Cannot vectorize through resize - clear_left_of(resize->out()); + propagateResize(resize, false); } else { // TODO: I wonder if we should just remove all inputs instead of erroring. // Seems that would be safe. @@ -415,8 +483,7 @@ std::vector ContiguousInnerDimensionsMapper::projectId( } else if (Split* split = dynamic_cast(expr)) { propagateDistribute(split); } else if (Resize* resize = dynamic_cast(expr)) { - // Cannot vectorize through resize - clear_left_of(resize->in()); + propagateResize(resize, true); } else { // TODO: I wonder if we should just remove all inputs instead of erroring. // Seems that would be safe. diff --git a/csrc/scheduler/vectorize_helper.h b/csrc/scheduler/vectorize_helper.h index 193df7786c9..d5c8e26c406 100644 --- a/csrc/scheduler/vectorize_helper.h +++ b/csrc/scheduler/vectorize_helper.h @@ -31,7 +31,7 @@ namespace vectorize_helper { // Projects IterDomains through the fusion starting at provided reference. IDs // in the reference are expected to be "contiguous", simply means dimensions -// that the iter domains are consecutive and next to eachother in the +// that the iter domains are consecutive and next to each other in the // reference. This property is not enforced, but mapping can have some // unpredictbale results if they are not. The reason we want contiguity here // is this class is primarily used for vectorization analysis. Domains may be @@ -78,7 +78,7 @@ namespace vectorize_helper { // tv1[2*3, 5, 7*11] = view(tv0) // with tv1 and [2*3, 7*11] as the reference and ids. tv0's 2 and 11 dim are // easily identified as being mapped. The 3*5*7 dimension however, is -// partially mapped on the left and right side. 
Since this class is intended to +// partially mapped on the left and right side. Since this class is intended to // line up "inner dimensions" of tensors through out the graph for the purpose // of unrolling and vectorization, it only tracks partial dimensions as they are // on the right hand side of iteration domains. For example in the last case we @@ -289,6 +289,9 @@ class NVF_API ContiguousInnerDimensionsMapper void propagateP2C(TensorView* from, TensorView* to) final; void propagateSibling(TensorView* from, TensorView* to) final; + // traverse fusion to mark the origin of Resize + void initializeResizeInfo(Fusion* fusion); + // Initialized to false, series of compute... calls will be performed to find // the spanning tree. Then propagate... calls will call the compute... calls. // recording_ starts as false, and stays that way during the first series of @@ -308,6 +311,9 @@ class NVF_API ContiguousInnerDimensionsMapper tv_infos_; std::unordered_map projected_extent_; + + //! stores all Resize* op that's added from PadOp* + std::unordered_set resize_in_pad_; }; // logical_reorder_map is provided to assume reference_tv will be reordered per diff --git a/csrc/tensor_view.cpp b/csrc/tensor_view.cpp index b85c4947b1c..afab5f5a598 100644 --- a/csrc/tensor_view.cpp +++ b/csrc/tensor_view.cpp @@ -1169,15 +1169,37 @@ TensorView* TensorView::cacheFork() { TensorView* TensorView::cacheAfter( LoadStoreOpType op_type, CacheOp cache_op, - bool propagate_allocation_domain) { + bool propagate_allocation_domain, + std::vector cached_uses) { NVF_ERROR( !container()->isA(), "Function invalid for kernel container."); FusionGuard fg(fusion()); + if (!cached_uses.empty()) { + std::unordered_set unique_uses = fusion()->unordered_uses(this); + for (auto use : cached_uses) { + NVF_ERROR( + unique_uses.count(use), + "cached_uses is not among the use of the TensorView"); + } + } else { + // avoid non-determinism and ensure unique + std::unordered_set unique_uses; + auto this_uses = uses(); + cached_uses.reserve(this_uses.size()); + for (Expr* use : this_uses) { + NVF_ERROR( + unique_uses.count(use) == 0, + "detect duplicated entries in TensorView::uses()"); + cached_uses.push_back(use); + unique_uses.insert(use); + } + } + // Get all the uses for this Tensorview NVF_CHECK( - !uses().empty(), + !cached_uses.empty(), "Error adding cacheAfter ", this, " we restrict using cacheAfter on tensors that have no further uses."); @@ -1188,18 +1210,19 @@ TensorView* TensorView::cacheAfter( !hasComputeAt(), "Caching computed-at tensors is not allowed. Apply caching before computeAt."); - bool is_allowed_op = - !ir_utils::isTvUsedByOpsOfType(this) && - !ir_utils::isIndexSelectLookupTv(this); - NVF_CHECK( - is_allowed_op, - "Right now, caching tensors that are input to the select/slice/pad ops are not allowed as they must be in global memory.") + // disallow cache on operation where we require data remain in global memory. + for (auto use : cached_uses) { + NVF_ERROR( + !(use->isOneOf()) && + !(use->isA() && use->input(0) == this), + "Right now, caching tensors that are input to the select/slice/pad ops are not allowed as they must be in global memory."); + } // It also did additional transformation when this tensor is an // input and the outputs of its consumers have computeAt. Make sure // we no longer rely on that behavior. 
if (isFusionInput()) { - for (const auto& expr : uses()) { + for (const auto& expr : cached_uses) { for (TensorView* output : ir_utils::filterByType(expr->outputs())) { NVF_CHECK( @@ -1242,7 +1265,7 @@ TensorView* TensorView::cacheAfter( // After: This TV -> [Set Op] -> New CA TV -> [Use Op] -> Next TV // Expr* consumer_uses = - for (auto expr : fusion()->unordered_uses(this)) { + for (auto expr : cached_uses) { ir_utils::replaceValInExprInputs(expr, this, consumer); } diff --git a/tests/cpp/test_resize.cpp b/tests/cpp/test_resize.cpp index ed02e67ee5f..09558e05cf2 100644 --- a/tests/cpp/test_resize.cpp +++ b/tests/cpp/test_resize.cpp @@ -2660,7 +2660,7 @@ TEST_F(ResizeTest, Slice1DVectorizeManual1) { const int64_t slice_offset = 4; const std::vector shape({1024L * 1024L}); - // Using a concrete tensor to avoid dynamic reshape + // Using a concrete tensor to avoid dynamic resize auto tv0 = makeContigConcreteTensor(shape); fusion.addInput(tv0); @@ -3439,9 +3439,8 @@ TEST_F(ResizeTest, SqueezeSlicedExpand) { __FILE__); } -// Vectorization through resize is not supported yet. Make sure -// vectorization is disabled. -TEST_F(ResizeTest, AvoidVectorization) { +// Vectorization through pad is supported now! +TEST_F(ResizeTest, PadVectorization) { Fusion fusion; FusionGuard fg(&fusion); @@ -3471,9 +3470,8 @@ TEST_F(ResizeTest, AvoidVectorization) { // Make sure tv1 is not vectorized, i.e., no loop IterDomains are vectorized. EXPECT_THAT( tv1->getLoopDomain(), - Each( - Property(&IterDomain::getParallelType, Not(ParallelType::Vectorize)))) - << "Unexpected vectorization: " << tv1; + Contains(Property(&IterDomain::getParallelType, ParallelType::Vectorize))) + << "Failed to vectorize: " << tv1; // Make sure tv2 should be vectorized, i.e., at least one loop IterDomain is // vectorized. @@ -4093,7 +4091,6 @@ TEST_F(ResizeTest, VectorizeWhereLowering) { FusionGuard fg(fusion_ptr.get()); const std::vector shape({1024L * 1024L}); - // Note: nvfuser currently only supports vectorization with a single // TensorView input. auto s0 = IrBuilder::create(DataType::Bool); @@ -4123,4 +4120,302 @@ TEST_F(ResizeTest, VectorizeWhereLowering) { ASSERT_TRUE(t0.equal(cg_outputs[0])); } +TEST_F(ResizeTest, VectorizeFactorFour) { + Fusion fusion; + FusionGuard fg(&fusion); + + const std::vector shape({1024L * 1024L}); + + // Using a concrete tensor to avoid dynamic resize + auto tv0 = makeContigConcreteTensor(shape); + fusion.addInput(tv0); + + auto tv1 = pad(tv0, {IrBuilder::create(4L), IrBuilder::create(4L)}); + fusion.addOutput(tv1); + + auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); + auto t0 = at::randn(shape, options); + std::vector aten_inputs({t0}); + auto cg_outputs = + scheduleAndRun(&fusion, SchedulerType::PointWise, aten_inputs).outputs; + + // check that we vectorize 4 + bool found_vectorize = false; + auto exprs = fusion.exprs(); + auto pad_ops = ir_utils::filterByType(exprs).vector(); + EXPECT_EQ(pad_ops.size(), 1); + EXPECT_TRUE(pad_ops.at(0)->out()->isA()); + for (auto id : pad_ops.at(0)->out()->as()->getLoopDomain()) { + if (id->getParallelType() == ParallelType::Vectorize) { + EXPECT_EQ(id->extent()->evaluate(), 4); + found_vectorize = true; + break; + } + } + EXPECT_TRUE(found_vectorize); + + testValidate(&fusion, cg_outputs, aten_inputs, __LINE__, __FILE__); +} + +// This test is to check that the pad extent is used to limit the vectorization +// factor. 
+TEST_F(ResizeTest, VectorizeFactorTwo) { + Fusion fusion; + FusionGuard fg(&fusion); + + const std::vector shape({1024L * 1024L}); + + // Using a concrete tensor to avoid dynamic resize + auto tv0 = makeContigConcreteTensor(shape); + fusion.addInput(tv0); + + // pad extent would restrict vectorization factor + auto tv1 = pad(tv0, {IrBuilder::create(2L), IrBuilder::create(2L)}); + fusion.addOutput(tv1); + + auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); + auto t0 = at::randn(shape, options); + std::vector aten_inputs({t0}); + auto cg_outputs = + scheduleAndRun(&fusion, SchedulerType::PointWise, aten_inputs).outputs; + + // check that we vectorize 2 + bool found_vectorize = false; + auto exprs = fusion.exprs(); + auto pad_ops = ir_utils::filterByType(exprs).vector(); + EXPECT_EQ(pad_ops.size(), 1); + EXPECT_TRUE(pad_ops.at(0)->out()->isA()); + for (auto id : pad_ops.at(0)->out()->as()->getLoopDomain()) { + if (id->getParallelType() == ParallelType::Vectorize) { + EXPECT_EQ(id->extent()->evaluate(), 2); + found_vectorize = true; + break; + } + } + EXPECT_TRUE(found_vectorize); + + testValidate(&fusion, cg_outputs, aten_inputs, __LINE__, __FILE__); +} + +// This test is to check that the pad with 0-extent +TEST_F(ResizeTest, VectorizeFactorTwoPadZeroExtent) { + Fusion fusion; + FusionGuard fg(&fusion); + + const std::vector shape({1024L * 1024L}); + + // Using a concrete tensor to avoid dynamic resize + auto tv0 = makeContigConcreteTensor(shape); + fusion.addInput(tv0); + + // pad extent would restrict vectorization factor + auto tv1 = pad(tv0, {IrBuilder::create(0L), IrBuilder::create(2L)}); + fusion.addOutput(tv1); + + auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); + auto t0 = at::randn(shape, options); + std::vector aten_inputs({t0}); + auto cg_outputs = + scheduleAndRun(&fusion, SchedulerType::PointWise, aten_inputs).outputs; + + // check that we vectorize 2 + bool found_vectorize = false; + auto exprs = fusion.exprs(); + auto pad_ops = ir_utils::filterByType(exprs).vector(); + EXPECT_EQ(pad_ops.size(), 1); + EXPECT_TRUE(pad_ops.at(0)->out()->isA()); + for (auto id : pad_ops.at(0)->out()->as()->getLoopDomain()) { + if (id->getParallelType() == ParallelType::Vectorize) { + EXPECT_EQ(id->extent()->evaluate(), 2); + found_vectorize = true; + break; + } + } + EXPECT_TRUE(found_vectorize); + + testValidate(&fusion, cg_outputs, aten_inputs, __LINE__, __FILE__); +} + +TEST_F(ResizeTest, VectorizePadNonInnermost) { + Fusion fusion; + FusionGuard fg(&fusion); + + const std::vector shape({1024L, 1024L, 2L}); + + // Using a concrete tensor to avoid dynamic resize + auto tv0 = makeContigConcreteTensor(shape); + fusion.addInput(tv0); + + auto tv1 = + pad(tv0, + {IrBuilder::create(0L), + IrBuilder::create(0L), + IrBuilder::create(4L), + IrBuilder::create(4L), + IrBuilder::create(0L), + IrBuilder::create(0L)}); + fusion.addOutput(tv1); + + auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); + auto t0 = at::randn(shape, options); + std::vector aten_inputs({t0}); + auto cg_outputs = + scheduleAndRun(&fusion, SchedulerType::PointWise, aten_inputs).outputs; + + // check that we vectorize 4 + bool found_vectorize = false; + auto exprs = fusion.exprs(); + auto pad_ops = ir_utils::filterByType(exprs).vector(); + EXPECT_EQ(pad_ops.size(), 1); + EXPECT_TRUE(pad_ops.at(0)->out()->isA()); + for (auto id : pad_ops.at(0)->out()->as()->getLoopDomain()) { + if (id->getParallelType() == ParallelType::Vectorize) { + 
EXPECT_EQ(id->extent()->evaluate(), 4); + found_vectorize = true; + break; + } + } + EXPECT_TRUE(found_vectorize); + + testValidate(&fusion, cg_outputs, aten_inputs, __LINE__, __FILE__); +} + +// padding with negative extent should prevent us considering the resize id for +// vectorization. So the example below should only have a vectorization factor +// of 2 +TEST_F(ResizeTest, VectorizePadNonInnermostNegativeExtent) { + Fusion fusion; + FusionGuard fg(&fusion); + + const std::vector shape({1024L, 1024L, 2L}); + + // Using a concrete tensor to avoid dynamic resize + auto tv0 = makeContigConcreteTensor(shape); + fusion.addInput(tv0); + + auto tv1 = + pad(tv0, + {IrBuilder::create(0L), + IrBuilder::create(0L), + IrBuilder::create(-4L), + IrBuilder::create(4L), + IrBuilder::create(0L), + IrBuilder::create(0L)}); + fusion.addOutput(tv1); + + auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); + auto t0 = at::randn(shape, options); + std::vector aten_inputs({t0}); + auto cg_outputs = + scheduleAndRun(&fusion, SchedulerType::PointWise, aten_inputs).outputs; + + // check that we vectorize 4 + bool found_vectorize = false; + auto exprs = fusion.exprs(); + auto pad_ops = ir_utils::filterByType(exprs).vector(); + EXPECT_EQ(pad_ops.size(), 1); + EXPECT_TRUE(pad_ops.at(0)->out()->isA()); + for (auto id : pad_ops.at(0)->out()->as()->getLoopDomain()) { + if (id->getParallelType() == ParallelType::Vectorize) { + EXPECT_EQ(id->extent()->evaluate(), 2); + found_vectorize = true; + break; + } + } + EXPECT_TRUE(found_vectorize); + + testValidate(&fusion, cg_outputs, aten_inputs, __LINE__, __FILE__); +} + +TEST_F(ResizeTest, PadAndCacheUses) { + Fusion fusion; + FusionGuard fg(&fusion); + + const std::vector shape({1024L * 1024L}); + + // Using a concrete tensor to avoid dynamic resize + auto tv0 = makeContigConcreteTensor(shape); + fusion.addInput(tv0); + + auto tv1 = pad(tv0, {IrBuilder::create(4L), IrBuilder::create(4L)}); + fusion.addOutput(tv1); + auto tv2 = relu(tv0); + fusion.addOutput(tv2); + + auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); + auto t0 = at::randn(shape, options); + std::vector aten_inputs({t0}); + auto cg_outputs = + scheduleAndRun(&fusion, SchedulerType::PointWise, aten_inputs).outputs; + + // check that pad vectorize 4 + bool found_vectorize = false; + auto exprs = fusion.exprs(); + auto pad_ops = ir_utils::filterByType(exprs).vector(); + EXPECT_EQ(pad_ops.size(), 1); + EXPECT_TRUE(pad_ops.at(0)->out()->isA()); + for (auto id : pad_ops.at(0)->out()->as()->getLoopDomain()) { + if (id->getParallelType() == ParallelType::Vectorize) { + EXPECT_EQ(id->extent()->evaluate(), 4); + found_vectorize = true; + break; + } + } + EXPECT_TRUE(found_vectorize); + + // check that relu vectorize 4 + found_vectorize = false; + auto uops = ir_utils::filterByType(exprs).vector(); + EXPECT_EQ(uops.size(), 1); + EXPECT_TRUE(uops.at(0)->in()->isA()); + for (auto id : uops.at(0)->in()->as()->getLoopDomain()) { + if (id->getParallelType() == ParallelType::Vectorize) { + EXPECT_EQ(id->extent()->evaluate(), 4); + found_vectorize = true; + break; + } + } + EXPECT_TRUE(found_vectorize); + + testValidate(&fusion, cg_outputs, aten_inputs, __LINE__, __FILE__); +} + +// we cannot yet test this one, as pad in the middle causes segmentation +// This test checks that the propagation vectorization factor is not stopped by +// padding on non-innermost dimension, when the pad operation isn't the +// vectorized operation. 
TEST_F(ResizeTest, PropagatePadNonInnermost) { +// Fusion fusion; +// FusionGuard fg(&fusion); +// +// const std::vector shape({1024L, 1024L, 2L}); +// +// // Using a concrete tensor to avoid dynamic resize +// auto tv0 = makeContigConcreteTensor(shape); +// fusion.addInput(tv0); +// auto tv1 = relu(tv0); +// auto tv2 = +// pad(tv1, +// {IrBuilder::create(0L), +// IrBuilder::create(0L), +// IrBuilder::create(3L), +// IrBuilder::create(3L), +// IrBuilder::create(0L), +// IrBuilder::create(0L)}); +// fusion.addOutput(tv2); +// +// auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); +// auto t0 = at::randn(shape, options); +// std::vector aten_inputs({t0}); +// auto cg_outputs = +// scheduleAndRun(&fusion, SchedulerType::PointWise, aten_inputs).outputs; +// +// FusionExecutorCache executor_cache(std::move(fusion_ptr)); +// auto cg_outputs = executor_cache.runFusionWithInputs(aten_inputs); +// +// auto ref = at::pad(t0.relu(), {0, 0, 4, 4, 0, 0}); +// +// NVF_CHECK(ref.equal(cg_outputs[0])); +// // TODO: check vectorization factor +// } + } // namespace nvfuser
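
As a rough, standalone illustration of propagation rule 1.b from the commit message (not nvfuser code; the function name and the use of plain integers are made up for this sketch), the projected extent through a resize created by `PadOp` can be thought of as folding each pad extent into the producer's projected extent with a gcd, where a zero pad extent leaves the factor unchanged and a negative pad extent is clamped to 1, effectively excluding that resize from vectorization:

```cpp
#include <algorithm>
#include <cstdint>
#include <numeric>

// Hypothetical standalone model of the gcd rule for a resize that comes from a
// PadOp. The actual pass builds this with SimplifyingIrBuilder expressions;
// plain integers are used here only for clarity.
int64_t projectedExtentThroughPad(
    int64_t id_from_extent, // projected extent of the producer id
    int64_t left_expand,
    int64_t right_expand) {
  auto fold = [](int64_t factor, int64_t pad_extent) -> int64_t {
    if (pad_extent == 0) {
      // A zero pad extent does not constrain the vectorization factor.
      return factor;
    }
    // Negative extents are clamped to 1, so the gcd collapses the factor and
    // the resize id is effectively excluded from vectorization.
    return std::gcd(factor, std::max<int64_t>(1, pad_extent));
  };
  return fold(fold(id_from_extent, left_expand), right_expand);
}
```

With a producer factor of 4, pads of (4, 4) keep 4, pads of (2, 2) or (0, 2) give 2, and a pad of (-4, 4) gives 1 for that dimension, consistent with the expectations in the `VectorizeFactor*` tests above.

And a hedged sketch of how the new `cached_uses` parameter of `TensorView::cacheAfter` can be used, mirroring the `cacheInputs` change in `csrc/scheduler/utils.cpp` above. The helper name is made up, and the snippet assumes the nvfuser headers that declare `TensorView`, `Expr`, `PadOp`, `LoadStoreOpType`, and `CacheOp`:

```cpp
// Hypothetical helper (not part of this PR): cache every use of an input
// except PadOp uses, so the pad itself can be vectorized directly while the
// remaining consumers still read through a cached load.
TensorView* cacheAllButPadUses(TensorView* tv) {
  std::vector<Expr*> cached_uses;
  for (Expr* use : tv->uses()) {
    if (!use->isA<PadOp>()) {
      cached_uses.push_back(use);
    }
  }
  if (cached_uses.empty()) {
    // The input is only consumed by pads; nothing to cache.
    return nullptr;
  }
  return tv->cacheAfter(
      /*op_type=*/LoadStoreOpType::Set,
      /*cache_op=*/CacheOp::Unspecified,
      /*propagate_allocation_domain=*/true,
      /*cached_uses=*/cached_uses);
}
```

This is the behavior exercised by the `PadAndCacheUses` test above, where the same input feeds both a `pad` and a `relu`.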