From d74a8bf21151840729c7e485e0f2c4f8ef6ac8d3 Mon Sep 17 00:00:00 2001 From: Christian Sarofeen Date: Fri, 2 Aug 2024 18:32:54 +0000 Subject: [PATCH 1/7] Minor cleanup in executor.cpp - Add NVF_THROW - Add more information in bindInputs error - Add IValue debug string function - Cleanup some stale code --- csrc/exceptions.h | 17 +++++--- csrc/executor.cpp | 94 +++++++++++++++++++--------------------- csrc/executor.h | 8 +++- csrc/executor_utils.cpp | 13 +++++- csrc/kernel_cache.h | 7 --- csrc/utils.cpp | 95 ++++++++++++----------------------------- csrc/utils.h | 7 +-- 7 files changed, 103 insertions(+), 138 deletions(-) diff --git a/csrc/exceptions.h b/csrc/exceptions.h index 8084a1191fb..b472b281b83 100644 --- a/csrc/exceptions.h +++ b/csrc/exceptions.h @@ -253,17 +253,20 @@ inline const char* nvfCheckMsgImpl(const char* /*msg*/, const char* args) { #define STRINGIZE_IMPL(x) #x #define STRINGIZE(x) STRINGIZE_IMPL(x) -#define NVF_ERROR(cond, ...) \ - if ((!(cond))) { \ - nvfuser::nvfErrorFail( \ +#define NVF_THROW(...) \ + nvfuser::nvfErrorFail( \ __FUNCTION__, \ __FILE__, \ static_cast(__LINE__), \ - #cond " INTERNAL ASSERT FAILED at " \ - STRINGIZE(__FILE__) ":" STRINGIZE(__LINE__) \ + " INTERNAL ASSERT FAILED at " \ + STRINGIZE(__FILE__) ":" STRINGIZE(__LINE__) \ ", please report a bug with repro script to NVFuser at " \ - "https://github.com/NVIDIA/Fuser/issues. ", \ - nvfuser::to_str(__VA_ARGS__)); \ + "https://github.com/NVIDIA/Fuser/issues. ", \ + nvfuser::to_str(__VA_ARGS__)); + +#define NVF_ERROR(cond, ...) \ + if ((!(cond))) { \ + NVF_THROW(__VA_ARGS__) \ } #define NVF_CHECK_MSG(cond, type, ...) 
\ diff --git a/csrc/executor.cpp b/csrc/executor.cpp index dc0eac24575..a9768bcb1a0 100644 --- a/csrc/executor.cpp +++ b/csrc/executor.cpp @@ -1353,36 +1353,38 @@ std::vector FusionExecutor:: namespace { +FusionExecutor::GlobalBufferInfo getBufferInfo( + ExpressionEvaluator& expr_eval, + DataType index_dtype, + TensorView* tv) { + FusionExecutor::GlobalBufferInfo info; + info.tv = tv; + std::tie(info.sizes, info.strides) = inferShapeOfOutput(info.tv, expr_eval); + auto dtype = + (info.tv->dtype() == DataType::Index ? index_dtype : info.tv->dtype()); + info.type = data_type_to_aten(dtype); + return info; +} + //! Return information necessary for allocating output tensors. Input //! and output tensors are allowed to alias each other, which is //! specified by the list of int pairs of input and output indices -std::vector getOutputBufferInfo( - const KernelArgumentHolder& args, +std::vector getBufferInfos( ExpressionEvaluator& expr_eval, DataType index_dtype, - const Fusion* fusion) { + const std::vector& fusion_outputs) { FUSER_PERF_SCOPE("FusionExecutor::getOutbufferInfo"); - std::vector outputs; - outputs.reserve(fusion->outputs().size()); - NVF_ERROR( - args.size() == fusion->inputs().size(), - "fusion arguments length does not match runtime arguments."); - for (const auto out_i : c10::irange(fusion->outputs().size())) { - auto out_val = fusion->outputs()[out_i]; + std::vector output_buffer_infos; + output_buffer_infos.reserve(fusion_outputs.size()); + for (const auto out : fusion_outputs) { NVF_ERROR( - out_val->isA(), + out->isA(), "Cannot allocate outputs that are not tensors."); - FusionExecutor::GlobalBufferInfo info; - info.tv = out_val->as(); - std::tie(info.sizes, info.strides) = inferShapeOfOutput(info.tv, expr_eval); - auto dtype = - (info.tv->dtype() == DataType::Index ? 
index_dtype : info.tv->dtype()); - info.type = data_type_to_aten(dtype); - - outputs.emplace_back(info); + output_buffer_infos.emplace_back( + getBufferInfo(expr_eval, index_dtype, out->as())); } - return outputs; + return output_buffer_infos; } } // namespace @@ -1395,7 +1397,7 @@ std::vector allocOutputSpace( auto expr_eval = executor_utils::bindInputs(fusion_inputs, fusion); auto output_info = - getOutputBufferInfo(fusion_inputs, expr_eval, PrimDataType::Int, fusion); + getBufferInfos(expr_eval, PrimDataType::Int, fusion->outputs()); return allocateOutputs(fusion, output_info, device, expr_eval); } @@ -1427,8 +1429,8 @@ KernelArgumentHolder FusionExecutor::inferOutputSizes( auto arg_index_type = args.getSmallestIndexTypeOfArguments(); - KernelArgumentHolder ret; - ret.setDeviceIndex(args.getDeviceIndex()); + KernelArgumentHolder output_tensor_proxies; + output_tensor_proxies.setDeviceIndex(args.getDeviceIndex()); for (Val* output : fusion->outputs()) { NVF_ERROR( @@ -1439,9 +1441,9 @@ KernelArgumentHolder FusionExecutor::inferOutputSizes( const auto dtype = (output_tv->dtype() == DataType::Index) ? 
data_type_to_aten(arg_index_type) : data_type_to_aten(output_tv->dtype()); - ret.pushTensorProxy(sizes, strides, dtype); + output_tensor_proxies.pushTensorProxy(sizes, strides, dtype); } - return ret; + return output_tensor_proxies; } namespace { @@ -1553,15 +1555,6 @@ void dumpKernelArgs( } } -FusionExecutor::GlobalBufferInfo getGlobalBufferAllocationInfo( - const at::Tensor& at_tensor) { - FusionExecutor::GlobalBufferInfo info{ - .sizes = at_tensor.sizes().vec(), - .strides = at_tensor.strides().vec(), - .type = at_tensor.scalar_type()}; - return info; -} - } // namespace void FusionExecutor::initializeExecutorEntry( @@ -1591,13 +1584,16 @@ void FusionExecutor::initializeExecutorEntry( if (outputs.empty()) { output_info = - getOutputBufferInfo(args, expr_eval, index_type, lowered_->kernel()); + getBufferInfos(expr_eval, index_type, lowered_->kernel()->outputs()); } else { // Need to save the information necessary for allocations as // future uses of this ExecutorEntry may not be provided with // allocated outputs for (const auto& output : outputs) { - output_info.emplace_back(getGlobalBufferAllocationInfo(output)); + output_info.emplace_back(FusionExecutor::GlobalBufferInfo{ + .sizes = output.sizes().vec(), + .strides = output.strides().vec(), + .type = output.scalar_type()}); } } @@ -1853,10 +1849,13 @@ void FusionExecutor::resetCompiledKernelProperties() { } std::vector FusionExecutor::evaluateFusionOutputs( - KernelArgumentHolder& args, std::vector outputs, ExpressionEvaluator& expr_eval) { - // TODO: Add relevant profiling code. 
+ FUSER_PERF_SCOPE("FusionExecutor::runFusion::evaluateFusionOutputs"); + NVF_ERROR( + outputs.empty(), + "Fusion executor is using expression evaluator,", + " and expects that the outputs are not populated, which they were."); if (outputs.empty()) { for (const auto& out_val : fusion()->outputs()) { auto out_tensor = @@ -1864,11 +1863,12 @@ std::vector FusionExecutor::evaluateFusionOutputs( outputs.emplace_back(out_tensor); } } - args.push(outputs); return outputs; } namespace { +// Host IR specific function, returns the at::Tensor (ordered list) associated +// with the provided Fusion output tv at::Tensor findBufferForFusionOutput( const std::vector& out_tensors, const Val* fusion_out, @@ -1906,14 +1906,10 @@ std::vector FusionExecutor::runFusion( " provided number of outputs does not match fusion output"); // Bind fusion inputs - ExpressionEvaluator expr_eval; - const auto& inputs = fusion()->inputs(); - for (const auto i : c10::irange(inputs.size())) { - expr_eval.bind(inputs[i], *args[i]); - } + auto expr_eval = executor_utils::bindInputs(args, fusion()); if (isExpressionEvaluated(fusion())) { - outputs = evaluateFusionOutputs(args, outputs, expr_eval); + outputs = evaluateFusionOutputs(outputs, expr_eval); if (isProfilerEnabled()) { auto& sprof = FusionProfiler::segment(group_id_); sprof.stopKernel(); @@ -1924,8 +1920,8 @@ std::vector FusionExecutor::runFusion( if (host_ir_container_ != nullptr) { if (outputs.empty()) { - std::vector output_info = getOutputBufferInfo( - args, expr_eval, PrimDataType::Int, host_ir_container_.get()); + std::vector output_info = getBufferInfos( + expr_eval, PrimDataType::Int, host_ir_container_.get()->outputs()); outputs = allocateOutputs( host_ir_container_.get(), output_info, options_.device, expr_eval); } @@ -2012,7 +2008,7 @@ std::vector FusionExecutor::runFusion( // Skip trivially forwarded outputs because they are just placeholders continue; } - expr_eval.bind(output, *args[inputs.size() + i]); + expr_eval.bind(output, 
*args[kernel()->inputs().size() + i]); } std::vector intermediates; @@ -2066,7 +2062,7 @@ std::vector FusionExecutor::runFusion( intermediates.push_back(intermediate_buffer); expr_eval.bind( kernel()->summary().global_allocations.at(i)->buffer(), - *args[inputs.size() + outputs.size() + i]); + *args[kernel()->inputs().size() + outputs.size() + i]); if (buf_info.is_profile_buffer) { profile_buffer = intermediate_buffer; } diff --git a/csrc/executor.h b/csrc/executor.h index 97710667553..4817c64bb92 100644 --- a/csrc/executor.h +++ b/csrc/executor.h @@ -65,7 +65,7 @@ class FusionExecutor : public NonCopyable { //! Notes: 1. This API should ignore aliased outputs instead of //! pushing scalar int 0 as a place-holder. //! 2. This API does not allocate output in memory, but only returns the - //! inferred output sizes. + //! inferred output sizes. Used in kernel_cache.cpp. KernelArgumentHolder inferOutputSizes( Fusion* fusion, const KernelArgumentHolder& args, @@ -118,10 +118,14 @@ class FusionExecutor : public NonCopyable { //! Computes fusion outputs through expression evaluator. std::vector evaluateFusionOutputs( - KernelArgumentHolder& args, std::vector outputs, ExpressionEvaluator& expr_eval); + // TODO: args shouldn't come in a reference here because we will append the + // outputs to be able to send it to the kernel. For now none of the users are + // reconsuming the args, so it is okay. It isn't done now because changing it + // from a reference makes a call as runFusion({}) ambiguous, and that is used + // in some places in the codebase. NVF_API std::vector runFusion( KernelArgumentHolder& args, const LaunchParams& launch_constraints = LaunchParams(), diff --git a/csrc/executor_utils.cpp b/csrc/executor_utils.cpp index 752bcf73099..b12cf4134c4 100644 --- a/csrc/executor_utils.cpp +++ b/csrc/executor_utils.cpp @@ -705,7 +705,18 @@ ExpressionEvaluator bindInputs( // NOTE: we bind all inputs here, including at::Tensors. 
This means that // expr_eval will create a PolymorphicValue containing *args[i], which means // that at::Tensor's lifetime will be at least as long as that of expr_eval. - expr_eval.bind(inputs[i], *args[i], true); + try { + expr_eval.bind(inputs[i], *args[i], true); + } catch (const nvfError& e) { + std::stringstream ss; + ss << "When trying to run the provided host program," + << " there was an error with the provided input " << i + << ". Provided input was:\n "; + ss << PolymorphicValue_functions::toString(*args[i]); + ss << "\n which does not match the expected input:\n "; + ss << inputs[i]->toString() << "\n"; + NVF_THROW(ss.str()); + } } return expr_eval; diff --git a/csrc/kernel_cache.h b/csrc/kernel_cache.h index 6562cc7fb4a..c4f680133e0 100644 --- a/csrc/kernel_cache.h +++ b/csrc/kernel_cache.h @@ -670,13 +670,6 @@ class FusionExecutorCache { //! Deserialize Fusion Executor Cache using flatbuffers void deserialize(const serde::FusionExecutorCache* buffer, int64_t fusion_id); - //! Allocate the outputs of the Fusion given inputs - //! TODO: re-implement - std::vector allocOutputSpace( - const at::ArrayRef& inputs) { - return runFusionWithInputs(inputs); - } - private: //! evict cached short cut entry in `code_to_fe_lookup_` as well as cached //! 
entry in `FusionExecutor` diff --git a/csrc/utils.cpp b/csrc/utils.cpp index 2c372b95577..c3376148bc0 100644 --- a/csrc/utils.cpp +++ b/csrc/utils.cpp @@ -37,86 +37,47 @@ c10::ThreadPool* getThreadPool() { return &pool; } -C10_DIAGNOSTIC_PUSH_AND_IGNORED_IF_DEFINED("-Wunused-function") -void debugPrint(const c10::TensorTypePtr& type) { - std::stringstream sizes_s; - if (auto sizes = type->symbolic_sizes().sizes()) { - for (const auto& shape_symbol : *sizes) { - if (shape_symbol.is_static()) { - sizes_s << shape_symbol.static_size() << ", "; - } else { - sizes_s << "s(" << *reinterpret_cast(&shape_symbol) - << "), "; - } - } - } else { - sizes_s << "no size available"; - } - debug() << "sizes:" << sizes_s.str() << std::endl; - if (const auto& stride_properties = type->stride_properties().sizes()) { - std::stringstream stride_s; - std::stringstream index_s; - std::stringstream contig_s; - - for (const auto& stride_property : *stride_properties) { - if (stride_property.has_value() && stride_property->stride_.has_value()) { - stride_s << *stride_property->stride_ << ", "; - } else { - stride_s << "?, "; - } - if (stride_property.has_value() && - stride_property->stride_index_.has_value()) { - index_s << *stride_property->stride_index_ << ", "; - } else { - index_s << "?, "; - } - if (stride_property.has_value() && - stride_property->contiguous_.has_value()) { - contig_s << *stride_property->contiguous_ << ", "; - } else { - contig_s << "?, "; - } - } - debug() << "stride: " << stride_s.str() << std::endl; - debug() << "stride index: " << index_s.str() << std::endl; - debug() << "contiguous: " << contig_s.str() << std::endl; - } else { - debug() << "no stride properties available" << std::endl; +std::string debug_str(const c10::IValue& val) { + if (val.isTensor()) { + return debug_str(val.toTensor()); } -} -C10_DIAGNOSTIC_POP() -bool is_zero_dim_tensor(const std::shared_ptr& tensor_type) { - return tensor_type && tensor_type->dim().has_value() && - 
tensor_type->dim().value() == 0; + std::stringstream out_s; + out_s << val.toScalar(); + return out_s.str(); } -bool is_zero_sized_tensor(const std::shared_ptr& tensor_type) { - auto opt_sizes = tensor_type->sizes().concrete_sizes(); - if (opt_sizes.has_value()) { - auto sizes = opt_sizes.value(); - for (const auto& size : sizes) { - if (size == 0) { - return true; +std::string debug_str(const at::Tensor& tensor) { + std::stringstream ss; + ss << "Tensor:"; + ss << " device: " << tensor.device(); + ss << ", dtype: " << tensor.dtype(); + ss << ", shape["; + for (size_t i = 0; i < tensor.sizes().size(); ++i) { + if (i != 0) { + ss << ", "; + } + ss << tensor.size(i); + } + ss << "]"; + + if (!tensor.is_contiguous()) { + ss << ", strides: ["; + for (size_t i = 0; i < tensor.strides().size(); ++i) { + ss << tensor.stride(i); + if (i < tensor.strides().size() - 1) { + ss << ", "; } } + ss << "]"; } - return false; + return ss.str(); } bool is_cpu_scalar(const at::Tensor& tensor) { return tensor.device().is_cpu() && tensor.numel() == 1 && tensor.dim() == 0; } -bool is_cpu_scalar(const c10::TensorType& tensor_type) { - auto opt_device = tensor_type.device(); - auto opt_dim = tensor_type.dim(); - auto opt_numel = tensor_type.numel(); - return opt_device.has_value() && opt_device->is_cpu() && - opt_dim.has_value() && opt_numel.has_value() && opt_dim.value() == 0 && - opt_numel.value() == 1; -} - bool is_meta_scalar(const at::Tensor& tensor) { return tensor.device().is_meta() && tensor.numel() == 1 && tensor.dim() == 0; } diff --git a/csrc/utils.h b/csrc/utils.h index 321214c9857..44fbd4057d0 100644 --- a/csrc/utils.h +++ b/csrc/utils.h @@ -52,13 +52,10 @@ namespace nvfuser { int getNumThreads(); c10::ThreadPool* getThreadPool(); -void debugPrint(const c10::TensorTypePtr& type); - -bool is_zero_dim_tensor(const std::shared_ptr& tensor_type); -bool is_zero_sized_tensor(const std::shared_ptr& tensor_type); +std::string debug_str(const c10::IValue& val); +std::string 
debug_str(const at::Tensor& tensor); bool is_cpu_scalar(const at::Tensor& tensor); -bool is_cpu_scalar(const c10::TensorType& tensor_type); bool is_meta_scalar(const at::Tensor& tensor); From 11df23aab724ecc8394e54ed0643e0556427d749 Mon Sep 17 00:00:00 2001 From: Christian Sarofeen Date: Sat, 3 Aug 2024 18:00:47 -0400 Subject: [PATCH 2/7] Apply suggestions from code review Cleanup printing. Co-authored-by: Jingyue Wu --- csrc/executor.cpp | 2 +- csrc/utils.cpp | 18 ++---------------- 2 files changed, 3 insertions(+), 17 deletions(-) diff --git a/csrc/executor.cpp b/csrc/executor.cpp index a9768bcb1a0..b87edd51f66 100644 --- a/csrc/executor.cpp +++ b/csrc/executor.cpp @@ -1921,7 +1921,7 @@ std::vector FusionExecutor::runFusion( if (host_ir_container_ != nullptr) { if (outputs.empty()) { std::vector output_info = getBufferInfos( - expr_eval, PrimDataType::Int, host_ir_container_.get()->outputs()); + expr_eval, PrimDataType::Int, host_ir_container_->outputs()); outputs = allocateOutputs( host_ir_container_.get(), output_info, options_.device, expr_eval); } diff --git a/csrc/utils.cpp b/csrc/utils.cpp index c3376148bc0..4dcf877c1e9 100644 --- a/csrc/utils.cpp +++ b/csrc/utils.cpp @@ -52,24 +52,10 @@ std::string debug_str(const at::Tensor& tensor) { ss << "Tensor:"; ss << " device: " << tensor.device(); ss << ", dtype: " << tensor.dtype(); - ss << ", shape["; - for (size_t i = 0; i < tensor.sizes().size(); ++i) { - if (i != 0) { - ss << ", "; - } - ss << tensor.size(i); - } - ss << "]"; + ss << ", shape: " << tensor.sizes(); if (!tensor.is_contiguous()) { - ss << ", strides: ["; - for (size_t i = 0; i < tensor.strides().size(); ++i) { - ss << tensor.stride(i); - if (i < tensor.strides().size() - 1) { - ss << ", "; - } - } - ss << "]"; + ss << ", strides: " << tensor.strides(); } return ss.str(); } From 994315260a747d7341ded55a572409cd117f9259 Mon Sep 17 00:00:00 2001 From: Christian Sarofeen Date: Wed, 28 Aug 2024 14:18:06 -0700 Subject: [PATCH 3/7] Improve error 
message in expression evaluator when propagating constant sizes through the exact graph. --- csrc/executor_utils.cpp | 6 +- csrc/expr_evaluator.cpp | 148 ++++++++++++++++++++++-- tests/cpp/test_gpu3.cpp | 4 +- tests/python/opinfo_input_generators.py | 8 +- 4 files changed, 152 insertions(+), 14 deletions(-) diff --git a/csrc/executor_utils.cpp b/csrc/executor_utils.cpp index 0e659d1d11d..cac3ba928c0 100644 --- a/csrc/executor_utils.cpp +++ b/csrc/executor_utils.cpp @@ -724,8 +724,10 @@ ExpressionEvaluator bindInputs( << " there was an error with the provided input " << i << ". Provided input was:\n "; ss << PolymorphicValue_functions::toString(*args[i]); - ss << "\n which does not match the expected input:\n "; - ss << inputs[i]->toString() << "\n"; + ss << "\n Fusion input is:\n "; + ss << inputs[i]->toString(); + ss << "\n Expr eval provided the error:\n\"\"\""; + ss << e.msg() << "\"\"\"\n"; NVF_THROW(ss.str()); } } diff --git a/csrc/expr_evaluator.cpp b/csrc/expr_evaluator.cpp index c59842b2037..81397f634bd 100644 --- a/csrc/expr_evaluator.cpp +++ b/csrc/expr_evaluator.cpp @@ -344,6 +344,143 @@ void ExpressionEvaluator::print() const { debug() << "--------------------\n\n"; } +namespace { +// Error handling for +// ExpressionEvaluator::propagateBoundValuesThroughExactMaps(Fusion* fusion) +void handlePropagateError( + Fusion* fusion, + ExpressionEvaluator* expr_eval, + std::shared_ptr> id_set) { + std::unordered_map id_to_size; + std::set sizes; + + for (const auto id : *id_set) { + auto eval_val = expr_eval->evaluate(id->extent()); + if (eval_val.hasValue()) { + NVF_ERROR( + eval_val.is(), + "Invalid extent value found, while processing ID: ", + id); + id_to_size[id] = eval_val.as(); + sizes.insert(eval_val.as()); + } + } + + std::stringstream err_msg; + err_msg + << "When trying to propagate constant tensor sizes through the graph a conflict was found with " + << sizes.size() + << " different sizes across dimensions that are expected to match.\n"; + + // Track 
which size is associated with which TV and IterDomain + std::unordered_map< + int64_t, + std::vector>> + size_to_info; + + for (auto tv : ir_utils::allTvs(fusion)) { + for (const IterDomain* id : tv->domain()->allIDs()) { + if (auto it = id_to_size.find(id); it != id_to_size.end()) { + size_to_info[it->second].push_back({tv, id}); + } + } + } + + // Check which TensorViews mismatch and check if they're directly related. + // If so, the expression between them may be the problmeatic expression, and + // the error will point to that expression(s). Don't bother to simplify this + // check as it's only runs after an error is detected. + bool found_producer_consumer_issue = false; + // Assume producer/consumer relationship + for (auto [dim_size_1, tv_id_pairs_1] : size_to_info) { + for (auto [dim_size_2, tv_id_pairs_2] : size_to_info) { + if (dim_size_1 <= dim_size_2) { + // N^2 algorithm, only process when one size is less than the other, + // avoids duplicate entries. + continue; + } + + bool tv1_is_consumer = false; + for (auto tv_id_pair_1 : tv_id_pairs_1) { + for (auto tv_id_pair_2 : tv_id_pairs_2) { + // Check for producer-consumer relationship + auto producer_tvs_of_tv1 = + ir_utils::producerTvsOf(tv_id_pair_1.first); + auto producer_tvs_of_tv2 = + ir_utils::producerTvsOf(tv_id_pair_2.first); + if (std::find( + producer_tvs_of_tv1.begin(), + producer_tvs_of_tv1.end(), + tv_id_pair_2.first) != producer_tvs_of_tv1.end()) { + tv1_is_consumer = true; + } else if ( + std::find( + producer_tvs_of_tv2.begin(), + producer_tvs_of_tv2.end(), + tv_id_pair_1.first) == producer_tvs_of_tv2.end()) { + // No relationship found, skip + continue; + } + + Expr* relationship = tv1_is_consumer + ? tv_id_pair_1.first->definition() + : tv_id_pair_2.first->definition(); + + // Found at least one consumer/producer relationship with mismatched + // sizes. + found_producer_consumer_issue = true; + + // Produce error message. 
Normally I'd just use swap but keys in an + // unordered map are const, so doing some juggling with the strings + // instead. + std::stringstream tv1_error; + std::stringstream tv2_error; + tv1_error << " TV: " << tv_id_pair_1.first + << " id: " << tv_id_pair_1.second + << " found size: " << dim_size_1 << "\n"; + tv2_error << " TV: " << tv_id_pair_2.first + << " id: " << tv_id_pair_2.second + << " found size: " << dim_size_2 << "\n"; + err_msg << " For Producer" + << (tv1_is_consumer ? tv2_error.str() : tv1_error.str()); + err_msg << " For Consumer" + << (tv1_is_consumer ? tv1_error.str() : tv2_error.str()); + err_msg + << " With producer-consumer relationship through the expression: " + << relationship << "\n"; + } + } + } + } + + if (found_producer_consumer_issue) { + NVF_THROW(err_msg.str()); + } + + if (size_to_info.size() > 1) { + for (const auto& [size, info_pairs] : size_to_info) { + err_msg << "Size " << size << " found for ID, in TV:\n"; + for (auto info_pair : info_pairs) { + err_msg << " " << info_pair.second << ", " << info_pair.first << "\n"; + } + } + err_msg << "These sizes should all match.\n"; + } else { + std::unordered_map> size_to_ids; + err_msg + << "The following groups of IterDomains should all have the same size, but don't:\n"; + for (const auto& [id, size] : id_to_size) { + size_to_ids[size].push_back(id); + } + for (const auto& [size, ids] : size_to_ids) { + err_msg << " Size: " << size << " found for ids: " << ids << "\n"; + } + } + + NVF_THROW(err_msg.str()); +} +} // namespace + void ExpressionEvaluator::propagateBoundValuesThroughExactMaps(Fusion* fusion) { // We map Symbolic IterDomains here only if their extents match. 
This avoids // mapping between symbolic domains that might concretize to an (Iteration, @@ -358,15 +495,10 @@ void ExpressionEvaluator::propagateBoundValuesThroughExactMaps(Fusion* fusion) { if (eval_val.hasValue()) { NVF_ERROR(eval_val.is(), "Invalid extent value"); int64_t this_size = eval_val.as(); - if (known_size != -1) { - NVF_ERROR( - known_size == this_size, - "Conflicting sizes: ", - known_size, - ", ", - this_size); - } else { + if (known_size == -1) { known_size = this_size; + } else if (known_size != this_size) { + handlePropagateError(fusion, this, set); } } else { unknown_vals.push_back(id->extent()); diff --git a/tests/cpp/test_gpu3.cpp b/tests/cpp/test_gpu3.cpp index 4cc1ac113f4..710cdab9a64 100644 --- a/tests/cpp/test_gpu3.cpp +++ b/tests/cpp/test_gpu3.cpp @@ -3839,8 +3839,8 @@ TEST_F(NVFuserTest, FusionCheckedSymbolicShape_CUDA) { { ASSERT_THAT( [&]() { matched_add(a, c); }, - ::testing::ThrowsMessage( - ::testing::HasSubstr("Conflicting sizes"))); + ::testing::ThrowsMessage(::testing::HasSubstr( + "When trying to propagate constant tensor sizes through the graph a conflict was found with 2 different sizes across dimensions that are expected to match."))); GTEST_SKIP() << "skipping tests on pre-AMPERE GPUs"; } } diff --git a/tests/python/opinfo_input_generators.py b/tests/python/opinfo_input_generators.py index 779f84bfb3b..a3e7551116b 100644 --- a/tests/python/opinfo_input_generators.py +++ b/tests/python/opinfo_input_generators.py @@ -311,7 +311,11 @@ def cat_error_generator(op, dtype=torch.float32, requires_grad: bool = False, ** "Unexpected number of dimensions", ) # All tensors must have same shape except for the cat dimension - shape_mismatch = (([(2, 3), (4, 5)], 0), RuntimeError, "known_size == this_size") + shape_mismatch = ( + ([(2, 3), (4, 5)], 0), + RuntimeError, + "When trying to propagate constant tensor sizes", + ) error_cases = [ empty_input_tensors, @@ -898,7 +902,7 @@ def pad_error_generator( delete_all_pad_width = [-3, 0, 0, 0] 
yield SampleInput( make_arg(input_shape), delete_all_pad_width, make_number(dtype) - ), RuntimeError, "extent_int >= 0" + ), RuntimeError, "Invalid resized domain extent" too_many_pad_width = [1, 1, 1, 1, 1, 1] yield SampleInput( From 97ef12195483ccf80b141b44377f0a1cc65ea9b9 Mon Sep 17 00:00:00 2001 From: Christian Sarofeen Date: Sat, 31 Aug 2024 16:49:12 -0400 Subject: [PATCH 4/7] Apply suggestions from code review Typos Co-authored-by: Jingyue Wu --- csrc/expr_evaluator.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/csrc/expr_evaluator.cpp b/csrc/expr_evaluator.cpp index 81397f634bd..9d01761c974 100644 --- a/csrc/expr_evaluator.cpp +++ b/csrc/expr_evaluator.cpp @@ -387,9 +387,9 @@ void handlePropagateError( } // Check which TensorViews mismatch and check if they're directly related. - // If so, the expression between them may be the problmeatic expression, and - // the error will point to that expression(s). Don't bother to simplify this - // check as it's only runs after an error is detected. + // If so, the expression between them may be the problematic expression, and + // the error will point to that expression(s). Don't bother to speed up this + // check as it only runs after an error is detected. bool found_producer_consumer_issue = false; // Assume producer/consumer relationship for (auto [dim_size_1, tv_id_pairs_1] : size_to_info) { From 9bc2af8fda72454f2486d0f4c094506d05827ea1 Mon Sep 17 00:00:00 2001 From: Christian Sarofeen Date: Mon, 2 Sep 2024 14:13:38 -0700 Subject: [PATCH 5/7] PR Feedback. 
--- csrc/expr_evaluator.cpp | 45 +++++++++++-------------- tests/cpp/test_alias.cpp | 2 ++ tests/python/opinfo_input_generators.py | 2 +- 3 files changed, 22 insertions(+), 27 deletions(-) diff --git a/csrc/expr_evaluator.cpp b/csrc/expr_evaluator.cpp index 441adb6c62a..0832d60a54a 100644 --- a/csrc/expr_evaluator.cpp +++ b/csrc/expr_evaluator.cpp @@ -358,7 +358,7 @@ namespace { void handlePropagateError( Fusion* fusion, ExpressionEvaluator* expr_eval, - std::shared_ptr> id_set) { + const std::shared_ptr>& id_set) { std::unordered_map id_to_size; std::set sizes; @@ -408,31 +408,29 @@ void handlePropagateError( continue; } - bool tv1_is_consumer = false; - for (auto tv_id_pair_1 : tv_id_pairs_1) { - for (auto tv_id_pair_2 : tv_id_pairs_2) { + for (const auto& [tv1, id1] : tv_id_pairs_1) { + for (const auto& [tv2, id2] : tv_id_pairs_2) { + bool tv1_is_consumer = false; + // Check for producer-consumer relationship - auto producer_tvs_of_tv1 = - ir_utils::producerTvsOf(tv_id_pair_1.first); - auto producer_tvs_of_tv2 = - ir_utils::producerTvsOf(tv_id_pair_2.first); + auto producer_tvs_of_tv1 = ir_utils::producerTvsOf(tv1); + auto producer_tvs_of_tv2 = ir_utils::producerTvsOf(tv2); if (std::find( producer_tvs_of_tv1.begin(), producer_tvs_of_tv1.end(), - tv_id_pair_2.first) != producer_tvs_of_tv1.end()) { + tv2) != producer_tvs_of_tv1.end()) { tv1_is_consumer = true; } else if ( std::find( producer_tvs_of_tv2.begin(), producer_tvs_of_tv2.end(), - tv_id_pair_1.first) == producer_tvs_of_tv2.end()) { + tv1) == producer_tvs_of_tv2.end()) { // No relationship found, skip continue; } - Expr* relationship = tv1_is_consumer - ? tv_id_pair_1.first->definition() - : tv_id_pair_2.first->definition(); + Expr* relationship = + tv1_is_consumer ? tv1->definition() : tv2->definition(); // Found at least one consumer/producer relationship with mismatched // sizes. @@ -443,11 +441,9 @@ void handlePropagateError( // instead. 
std::stringstream tv1_error; std::stringstream tv2_error; - tv1_error << " TV: " << tv_id_pair_1.first - << " id: " << tv_id_pair_1.second + tv1_error << " TV: " << tv1 << " id: " << id1 << " found size: " << dim_size_1 << "\n"; - tv2_error << " TV: " << tv_id_pair_2.first - << " id: " << tv_id_pair_2.second + tv2_error << " TV: " << tv2 << " id: " << id2 << " found size: " << dim_size_2 << "\n"; err_msg << " For Producer" << (tv1_is_consumer ? tv2_error.str() : tv1_error.str()); @@ -474,22 +470,19 @@ void handlePropagateError( } err_msg << "These sizes should all match.\n"; } else { - std::unordered_map> size_to_ids; err_msg - << "The following groups of IterDomains should all have the same size, but don't:\n"; - for (const auto& [id, size] : id_to_size) { - size_to_ids[size].push_back(id); - } - for (const auto& [size, ids] : size_to_ids) { - err_msg << " Size: " << size << " found for ids: " << ids << "\n"; - } + << "Something went wrong trying to detect what went wrong!" + << " There should have been ID's in TVs that should match, but don't." + << " Somehow IDs were registered with the exact graph that aren't used in the Fusion." + << std::endl; } NVF_THROW(err_msg.str()); } } // namespace -void ExpressionEvaluator::propagateBoundValuesThroughExactMaps(Fusion* fusion, +void ExpressionEvaluator::propagateBoundValuesThroughExactMaps( + Fusion* fusion, ExactLogicalDomainMap* exact_map) { // We map Symbolic IterDomains here only if their extents match. 
This avoids // mapping between symbolic domains that might concretize to an (Iteration, diff --git a/tests/cpp/test_alias.cpp b/tests/cpp/test_alias.cpp index b68b3678023..0a8a427a6b8 100644 --- a/tests/cpp/test_alias.cpp +++ b/tests/cpp/test_alias.cpp @@ -838,6 +838,7 @@ TEST_F(AliasTest, MergeTwoExpandedBroadcasts) { FusionGuard fg(fusion.get()); TensorView* in = TensorViewBuilder() + .ndims(3) .dtype(DataType::Float) .contiguity({std::nullopt, std::nullopt, std::nullopt}) .shape({4, 5, 6}) @@ -861,6 +862,7 @@ TEST_F(AliasTest, MergeBroadcastsBetweenConcretes) { FusionGuard fg(fusion.get()); TensorView* in = TensorViewBuilder() + .ndims(4) .dtype(DataType::Float) .contiguity({true, std::nullopt, std::nullopt, true}) .shape({2, 3, 5, 7}) diff --git a/tests/python/opinfo_input_generators.py b/tests/python/opinfo_input_generators.py index a3e7551116b..46ec89f40ab 100644 --- a/tests/python/opinfo_input_generators.py +++ b/tests/python/opinfo_input_generators.py @@ -314,7 +314,7 @@ def cat_error_generator(op, dtype=torch.float32, requires_grad: bool = False, ** shape_mismatch = ( ([(2, 3), (4, 5)], 0), RuntimeError, - "When trying to propagate constant tensor sizes", + "Tried to bind to a value", ) error_cases = [ From 1db8e06fa68176b69b23329338e511a1e4606e4e Mon Sep 17 00:00:00 2001 From: Christian Sarofeen Date: Fri, 6 Sep 2024 13:12:21 -0700 Subject: [PATCH 6/7] resolve bad conflict resolution. 
--- tests/cpp/test_alias.cpp | 2 -- 1 file changed, 2 deletions(-) diff --git a/tests/cpp/test_alias.cpp b/tests/cpp/test_alias.cpp index 0a8a427a6b8..b68b3678023 100644 --- a/tests/cpp/test_alias.cpp +++ b/tests/cpp/test_alias.cpp @@ -838,7 +838,6 @@ TEST_F(AliasTest, MergeTwoExpandedBroadcasts) { FusionGuard fg(fusion.get()); TensorView* in = TensorViewBuilder() - .ndims(3) .dtype(DataType::Float) .contiguity({std::nullopt, std::nullopt, std::nullopt}) .shape({4, 5, 6}) @@ -862,7 +861,6 @@ TEST_F(AliasTest, MergeBroadcastsBetweenConcretes) { FusionGuard fg(fusion.get()); TensorView* in = TensorViewBuilder() - .ndims(4) .dtype(DataType::Float) .contiguity({true, std::nullopt, std::nullopt, true}) .shape({2, 3, 5, 7}) From 8f3ff98516b7a5d60cb3edf788ba42d4959677d2 Mon Sep 17 00:00:00 2001 From: Christian Sarofeen Date: Fri, 6 Sep 2024 13:23:58 -0700 Subject: [PATCH 7/7] Minor fix. --- csrc/expr_evaluator.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/csrc/expr_evaluator.cpp b/csrc/expr_evaluator.cpp index 3b7f2b382a0..0fd62022098 100644 --- a/csrc/expr_evaluator.cpp +++ b/csrc/expr_evaluator.cpp @@ -386,7 +386,7 @@ void handlePropagateError( std::vector>> size_to_info; - for (auto tv : ir_utils::allTvs(fusion)) { + for (auto tv : fusion->allTvs()) { for (const IterDomain* id : tv->domain()->allIDs()) { if (auto it = id_to_size.find(id); it != id_to_size.end()) { size_to_info[it->second].push_back({tv, id});