diff --git a/csrc/device_lower/validation.cpp b/csrc/device_lower/validation.cpp
index 1afa860b0d2..5d040527646 100644
--- a/csrc/device_lower/validation.cpp
+++ b/csrc/device_lower/validation.cpp
@@ -554,9 +554,13 @@ class VectorizeValidator : public OptInDispatch {
   auto ldst = dynamic_cast<LoadStoreOp*>(tv->definition());
   bool is_ldmatrix_trans =
       ldst != nullptr && mma_utils::isLdMatrixTranspose(ldst);
-  if (!is_ldmatrix_trans) {
+  if (!is_ldmatrix_trans && name.compare("consumer") != 0) {
     // ldmatrix.trans is a hardware transpose instruction that can do
     // "vectorized" read from discontiguous memory
+    // We don't think allocation domain of consumer is used in allocation. We
+    // skip it in validation here. Note that this assert was hit for
+    // vectorized pad, because we do not propagate allocation domain for
+    // PadOp. See: https://github.com/NVIDIA/Fuser/pull/3439
     NVF_CHECK(
         last_alloc_dim == vec_alloc_id,
         "Vectorized dim for ",
diff --git a/tests/cpp/test_pointwise.cpp b/tests/cpp/test_pointwise.cpp
index c4684adbb6e..bb1c6bd7bfb 100644
--- a/tests/cpp/test_pointwise.cpp
+++ b/tests/cpp/test_pointwise.cpp
@@ -730,4 +730,47 @@ INSTANTIATE_TEST_SUITE_P(
     ss << "_outer_unroll_" << std::get<2>(info.param);
     return sanitizeTestName(ss.str());
   });
+
+TEST_F(PointwiseTest, VectorizePadLoweringPermuted) {
+  // Pointwise scheduler applies permutation to restore contiguous memory
+  // access on reference TV. Vectorization validation requires vectorized
+  // operations to preserve the allocation domain of their inputs. This
+  // test checks that PadOp propagates the allocation domain properly.
+  auto fusion_ptr = std::make_unique<Fusion>();
+  auto& fusion = *fusion_ptr;
+  FusionGuard fg(fusion_ptr.get());
+
+  // input is permuted
+  auto tv0 = TensorViewBuilder()
+                 .shape({1024, 1024})
+                 .dtype(DataType::Float)
+                 .contiguity(true)
+                 .strideOrder({0, 1})
+                 .build();
+  fusion.addInput(tv0);
+  auto tv1 = pad(tv0, {IrBuilder::create<Val>(4L), IrBuilder::create<Val>(4L)});
+  auto tv2 = relu(tv1);
+  fusion.addOutput(tv2);
+  // output is permuted
+  tv2->setAllocationDomain({tv2->axis(1), tv2->axis(0)}, true);
+
+  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+  auto t0 =
+      at::randn({1024 * 1024}, options).as_strided({1024, 1024}, {1, 1024});
+  std::vector<c10::IValue> aten_inputs({t0});
+
+  auto cg_outputs =
+      scheduleAndRun(&fusion, SchedulerType::PointWise, aten_inputs).outputs;
+  // check that we vectorize 4
+  bool found_vectorize = false;
+  for (auto id : fusion.outputs().at(0)->as<TensorView>()->getLoopDomain()) {
+    if (id->getParallelType() == ParallelType::Vectorize) {
+      EXPECT_EQ(id->extent()->evaluate(), 4);
+      found_vectorize = true;
+      break;
+    }
+  }
+  EXPECT_TRUE(found_vectorize);
+  testValidate(&fusion, cg_outputs, aten_inputs, __LINE__, __FILE__);
+}
 } // namespace nvfuser