From d74a8bf21151840729c7e485e0f2c4f8ef6ac8d3 Mon Sep 17 00:00:00 2001 From: Christian Sarofeen Date: Fri, 2 Aug 2024 18:32:54 +0000 Subject: [PATCH 1/7] Minor cleanup in executor.cpp - Add NVF_THROW - Add more information in bindInputs error - Add IValue debug string function - Cleanup some stale code --- csrc/exceptions.h | 17 +++++--- csrc/executor.cpp | 94 +++++++++++++++++++--------------------- csrc/executor.h | 8 +++- csrc/executor_utils.cpp | 13 +++++- csrc/kernel_cache.h | 7 --- csrc/utils.cpp | 95 ++++++++++++----------------------------- csrc/utils.h | 7 +-- 7 files changed, 103 insertions(+), 138 deletions(-) diff --git a/csrc/exceptions.h b/csrc/exceptions.h index 8084a1191fb..b472b281b83 100644 --- a/csrc/exceptions.h +++ b/csrc/exceptions.h @@ -253,17 +253,20 @@ inline const char* nvfCheckMsgImpl(const char* /*msg*/, const char* args) { #define STRINGIZE_IMPL(x) #x #define STRINGIZE(x) STRINGIZE_IMPL(x) -#define NVF_ERROR(cond, ...) \ - if ((!(cond))) { \ - nvfuser::nvfErrorFail( \ +#define NVF_THROW(...) \ + nvfuser::nvfErrorFail( \ __FUNCTION__, \ __FILE__, \ static_cast(__LINE__), \ - #cond " INTERNAL ASSERT FAILED at " \ - STRINGIZE(__FILE__) ":" STRINGIZE(__LINE__) \ + " INTERNAL ASSERT FAILED at " \ + STRINGIZE(__FILE__) ":" STRINGIZE(__LINE__) \ ", please report a bug with repro script to NVFuser at " \ - "https://github.com/NVIDIA/Fuser/issues. ", \ - nvfuser::to_str(__VA_ARGS__)); \ + "https://github.com/NVIDIA/Fuser/issues. ", \ + nvfuser::to_str(__VA_ARGS__)); + +#define NVF_ERROR(cond, ...) \ + if ((!(cond))) { \ + NVF_THROW(__VA_ARGS__) \ } #define NVF_CHECK_MSG(cond, type, ...) 
\ diff --git a/csrc/executor.cpp b/csrc/executor.cpp index dc0eac24575..a9768bcb1a0 100644 --- a/csrc/executor.cpp +++ b/csrc/executor.cpp @@ -1353,36 +1353,38 @@ std::vector FusionExecutor:: namespace { +FusionExecutor::GlobalBufferInfo getBufferInfo( + ExpressionEvaluator& expr_eval, + DataType index_dtype, + TensorView* tv) { + FusionExecutor::GlobalBufferInfo info; + info.tv = tv; + std::tie(info.sizes, info.strides) = inferShapeOfOutput(info.tv, expr_eval); + auto dtype = + (info.tv->dtype() == DataType::Index ? index_dtype : info.tv->dtype()); + info.type = data_type_to_aten(dtype); + return info; +} + //! Return information necessary for allocating output tensors. Input //! and output tensors are allowed to alias each other, which is //! specified by the list of int pairs of input and output indices -std::vector getOutputBufferInfo( - const KernelArgumentHolder& args, +std::vector getBufferInfos( ExpressionEvaluator& expr_eval, DataType index_dtype, - const Fusion* fusion) { + const std::vector& fusion_outputs) { FUSER_PERF_SCOPE("FusionExecutor::getOutbufferInfo"); - std::vector outputs; - outputs.reserve(fusion->outputs().size()); - NVF_ERROR( - args.size() == fusion->inputs().size(), - "fusion arguments length does not match runtime arguments."); - for (const auto out_i : c10::irange(fusion->outputs().size())) { - auto out_val = fusion->outputs()[out_i]; + std::vector output_buffer_infos; + output_buffer_infos.reserve(fusion_outputs.size()); + for (const auto out : fusion_outputs) { NVF_ERROR( - out_val->isA(), + out->isA(), "Cannot allocate outputs that are not tensors."); - FusionExecutor::GlobalBufferInfo info; - info.tv = out_val->as(); - std::tie(info.sizes, info.strides) = inferShapeOfOutput(info.tv, expr_eval); - auto dtype = - (info.tv->dtype() == DataType::Index ? 
index_dtype : info.tv->dtype()); - info.type = data_type_to_aten(dtype); - - outputs.emplace_back(info); + output_buffer_infos.emplace_back( + getBufferInfo(expr_eval, index_dtype, out->as())); } - return outputs; + return output_buffer_infos; } } // namespace @@ -1395,7 +1397,7 @@ std::vector allocOutputSpace( auto expr_eval = executor_utils::bindInputs(fusion_inputs, fusion); auto output_info = - getOutputBufferInfo(fusion_inputs, expr_eval, PrimDataType::Int, fusion); + getBufferInfos(expr_eval, PrimDataType::Int, fusion->outputs()); return allocateOutputs(fusion, output_info, device, expr_eval); } @@ -1427,8 +1429,8 @@ KernelArgumentHolder FusionExecutor::inferOutputSizes( auto arg_index_type = args.getSmallestIndexTypeOfArguments(); - KernelArgumentHolder ret; - ret.setDeviceIndex(args.getDeviceIndex()); + KernelArgumentHolder output_tensor_proxies; + output_tensor_proxies.setDeviceIndex(args.getDeviceIndex()); for (Val* output : fusion->outputs()) { NVF_ERROR( @@ -1439,9 +1441,9 @@ KernelArgumentHolder FusionExecutor::inferOutputSizes( const auto dtype = (output_tv->dtype() == DataType::Index) ? 
data_type_to_aten(arg_index_type) : data_type_to_aten(output_tv->dtype()); - ret.pushTensorProxy(sizes, strides, dtype); + output_tensor_proxies.pushTensorProxy(sizes, strides, dtype); } - return ret; + return output_tensor_proxies; } namespace { @@ -1553,15 +1555,6 @@ void dumpKernelArgs( } } -FusionExecutor::GlobalBufferInfo getGlobalBufferAllocationInfo( - const at::Tensor& at_tensor) { - FusionExecutor::GlobalBufferInfo info{ - .sizes = at_tensor.sizes().vec(), - .strides = at_tensor.strides().vec(), - .type = at_tensor.scalar_type()}; - return info; -} - } // namespace void FusionExecutor::initializeExecutorEntry( @@ -1591,13 +1584,16 @@ void FusionExecutor::initializeExecutorEntry( if (outputs.empty()) { output_info = - getOutputBufferInfo(args, expr_eval, index_type, lowered_->kernel()); + getBufferInfos(expr_eval, index_type, lowered_->kernel()->outputs()); } else { // Need to save the information necessary for allocations as // future uses of this ExecutorEntry may not be provided with // allocated outputs for (const auto& output : outputs) { - output_info.emplace_back(getGlobalBufferAllocationInfo(output)); + output_info.emplace_back(FusionExecutor::GlobalBufferInfo{ + .sizes = output.sizes().vec(), + .strides = output.strides().vec(), + .type = output.scalar_type()}); } } @@ -1853,10 +1849,13 @@ void FusionExecutor::resetCompiledKernelProperties() { } std::vector FusionExecutor::evaluateFusionOutputs( - KernelArgumentHolder& args, std::vector outputs, ExpressionEvaluator& expr_eval) { - // TODO: Add relevant profiling code. 
+ FUSER_PERF_SCOPE("FusionExecutor::runFusion::evaluateFusionOutputs"); + NVF_ERROR( + outputs.empty(), + "Fusion executor is using expression evaluator,", + " and expects that the outputs are not populated, which they were."); if (outputs.empty()) { for (const auto& out_val : fusion()->outputs()) { auto out_tensor = @@ -1864,11 +1863,12 @@ std::vector FusionExecutor::evaluateFusionOutputs( outputs.emplace_back(out_tensor); } } - args.push(outputs); return outputs; } namespace { +// Host IR specific function, returns the at::Tensor (ordered list) associated +// with the provided Fusion output tv at::Tensor findBufferForFusionOutput( const std::vector& out_tensors, const Val* fusion_out, @@ -1906,14 +1906,10 @@ std::vector FusionExecutor::runFusion( " provided number of outputs does not match fusion output"); // Bind fusion inputs - ExpressionEvaluator expr_eval; - const auto& inputs = fusion()->inputs(); - for (const auto i : c10::irange(inputs.size())) { - expr_eval.bind(inputs[i], *args[i]); - } + auto expr_eval = executor_utils::bindInputs(args, fusion()); if (isExpressionEvaluated(fusion())) { - outputs = evaluateFusionOutputs(args, outputs, expr_eval); + outputs = evaluateFusionOutputs(outputs, expr_eval); if (isProfilerEnabled()) { auto& sprof = FusionProfiler::segment(group_id_); sprof.stopKernel(); @@ -1924,8 +1920,8 @@ std::vector FusionExecutor::runFusion( if (host_ir_container_ != nullptr) { if (outputs.empty()) { - std::vector output_info = getOutputBufferInfo( - args, expr_eval, PrimDataType::Int, host_ir_container_.get()); + std::vector output_info = getBufferInfos( + expr_eval, PrimDataType::Int, host_ir_container_.get()->outputs()); outputs = allocateOutputs( host_ir_container_.get(), output_info, options_.device, expr_eval); } @@ -2012,7 +2008,7 @@ std::vector FusionExecutor::runFusion( // Skip trivially forwarded outputs because they are just placeholders continue; } - expr_eval.bind(output, *args[inputs.size() + i]); + expr_eval.bind(output, 
*args[kernel()->inputs().size() + i]); } std::vector intermediates; @@ -2066,7 +2062,7 @@ std::vector FusionExecutor::runFusion( intermediates.push_back(intermediate_buffer); expr_eval.bind( kernel()->summary().global_allocations.at(i)->buffer(), - *args[inputs.size() + outputs.size() + i]); + *args[kernel()->inputs().size() + outputs.size() + i]); if (buf_info.is_profile_buffer) { profile_buffer = intermediate_buffer; } diff --git a/csrc/executor.h b/csrc/executor.h index 97710667553..4817c64bb92 100644 --- a/csrc/executor.h +++ b/csrc/executor.h @@ -65,7 +65,7 @@ class FusionExecutor : public NonCopyable { //! Notes: 1. This API should ignore aliased outputs instead of //! pushing scalar int 0 as a place-holder. //! 2. This API does not allocate output in memory, but only returns the - //! inferred output sizes. + //! inferred output sizes. Used in kernel_cache.cpp. KernelArgumentHolder inferOutputSizes( Fusion* fusion, const KernelArgumentHolder& args, @@ -118,10 +118,14 @@ class FusionExecutor : public NonCopyable { //! Computes fusion outputs through expression evaluator. std::vector evaluateFusionOutputs( - KernelArgumentHolder& args, std::vector outputs, ExpressionEvaluator& expr_eval); + // TODO: args shouldn't come in a reference here because we will append the + // outputs to be able to send it to the kernel. For now none of the users are + // reconsuming the args, so it is okay. It isn't done now because changing it + // from a reference makes a call as runFusion({}) ambiguous, and that is used + // in some places in the codebase. NVF_API std::vector runFusion( KernelArgumentHolder& args, const LaunchParams& launch_constraints = LaunchParams(), diff --git a/csrc/executor_utils.cpp b/csrc/executor_utils.cpp index 752bcf73099..b12cf4134c4 100644 --- a/csrc/executor_utils.cpp +++ b/csrc/executor_utils.cpp @@ -705,7 +705,18 @@ ExpressionEvaluator bindInputs( // NOTE: we bind all inputs here, including at::Tensors. 
This means that // expr_eval will create a PolymorphicValue containing *args[i], which means // that at::Tensor's lifetime will be at least as long as that of expr_eval. - expr_eval.bind(inputs[i], *args[i], true); + try { + expr_eval.bind(inputs[i], *args[i], true); + } catch (const nvfError& e) { + std::stringstream ss; + ss << "When trying to run the provided host program," + << " there was an error with the provided input " << i + << ". Provided input was:\n "; + ss << PolymorphicValue_functions::toString(*args[i]); + ss << "\n which does not match the expected input:\n "; + ss << inputs[i]->toString() << "\n"; + NVF_THROW(ss.str()); + } } return expr_eval; diff --git a/csrc/kernel_cache.h b/csrc/kernel_cache.h index 6562cc7fb4a..c4f680133e0 100644 --- a/csrc/kernel_cache.h +++ b/csrc/kernel_cache.h @@ -670,13 +670,6 @@ class FusionExecutorCache { //! Deserialize Fusion Executor Cache using flatbuffers void deserialize(const serde::FusionExecutorCache* buffer, int64_t fusion_id); - //! Allocate the outputs of the Fusion given inputs - //! TODO: re-implement - std::vector allocOutputSpace( - const at::ArrayRef& inputs) { - return runFusionWithInputs(inputs); - } - private: //! evict cached short cut entry in `code_to_fe_lookup_` as well as cached //! 
entry in `FusionExecutor` diff --git a/csrc/utils.cpp b/csrc/utils.cpp index 2c372b95577..c3376148bc0 100644 --- a/csrc/utils.cpp +++ b/csrc/utils.cpp @@ -37,86 +37,47 @@ c10::ThreadPool* getThreadPool() { return &pool; } -C10_DIAGNOSTIC_PUSH_AND_IGNORED_IF_DEFINED("-Wunused-function") -void debugPrint(const c10::TensorTypePtr& type) { - std::stringstream sizes_s; - if (auto sizes = type->symbolic_sizes().sizes()) { - for (const auto& shape_symbol : *sizes) { - if (shape_symbol.is_static()) { - sizes_s << shape_symbol.static_size() << ", "; - } else { - sizes_s << "s(" << *reinterpret_cast(&shape_symbol) - << "), "; - } - } - } else { - sizes_s << "no size available"; - } - debug() << "sizes:" << sizes_s.str() << std::endl; - if (const auto& stride_properties = type->stride_properties().sizes()) { - std::stringstream stride_s; - std::stringstream index_s; - std::stringstream contig_s; - - for (const auto& stride_property : *stride_properties) { - if (stride_property.has_value() && stride_property->stride_.has_value()) { - stride_s << *stride_property->stride_ << ", "; - } else { - stride_s << "?, "; - } - if (stride_property.has_value() && - stride_property->stride_index_.has_value()) { - index_s << *stride_property->stride_index_ << ", "; - } else { - index_s << "?, "; - } - if (stride_property.has_value() && - stride_property->contiguous_.has_value()) { - contig_s << *stride_property->contiguous_ << ", "; - } else { - contig_s << "?, "; - } - } - debug() << "stride: " << stride_s.str() << std::endl; - debug() << "stride index: " << index_s.str() << std::endl; - debug() << "contiguous: " << contig_s.str() << std::endl; - } else { - debug() << "no stride properties available" << std::endl; +std::string debug_str(const c10::IValue& val) { + if (val.isTensor()) { + return debug_str(val.toTensor()); } -} -C10_DIAGNOSTIC_POP() -bool is_zero_dim_tensor(const std::shared_ptr& tensor_type) { - return tensor_type && tensor_type->dim().has_value() && - 
tensor_type->dim().value() == 0; + std::stringstream out_s; + out_s << val.toScalar(); + return out_s.str(); } -bool is_zero_sized_tensor(const std::shared_ptr& tensor_type) { - auto opt_sizes = tensor_type->sizes().concrete_sizes(); - if (opt_sizes.has_value()) { - auto sizes = opt_sizes.value(); - for (const auto& size : sizes) { - if (size == 0) { - return true; +std::string debug_str(const at::Tensor& tensor) { + std::stringstream ss; + ss << "Tensor:"; + ss << " device: " << tensor.device(); + ss << ", dtype: " << tensor.dtype(); + ss << ", shape["; + for (size_t i = 0; i < tensor.sizes().size(); ++i) { + if (i != 0) { + ss << ", "; + } + ss << tensor.size(i); + } + ss << "]"; + + if (!tensor.is_contiguous()) { + ss << ", strides: ["; + for (size_t i = 0; i < tensor.strides().size(); ++i) { + ss << tensor.stride(i); + if (i < tensor.strides().size() - 1) { + ss << ", "; } } + ss << "]"; } - return false; + return ss.str(); } bool is_cpu_scalar(const at::Tensor& tensor) { return tensor.device().is_cpu() && tensor.numel() == 1 && tensor.dim() == 0; } -bool is_cpu_scalar(const c10::TensorType& tensor_type) { - auto opt_device = tensor_type.device(); - auto opt_dim = tensor_type.dim(); - auto opt_numel = tensor_type.numel(); - return opt_device.has_value() && opt_device->is_cpu() && - opt_dim.has_value() && opt_numel.has_value() && opt_dim.value() == 0 && - opt_numel.value() == 1; -} - bool is_meta_scalar(const at::Tensor& tensor) { return tensor.device().is_meta() && tensor.numel() == 1 && tensor.dim() == 0; } diff --git a/csrc/utils.h b/csrc/utils.h index 321214c9857..44fbd4057d0 100644 --- a/csrc/utils.h +++ b/csrc/utils.h @@ -52,13 +52,10 @@ namespace nvfuser { int getNumThreads(); c10::ThreadPool* getThreadPool(); -void debugPrint(const c10::TensorTypePtr& type); - -bool is_zero_dim_tensor(const std::shared_ptr& tensor_type); -bool is_zero_sized_tensor(const std::shared_ptr& tensor_type); +std::string debug_str(const c10::IValue& val); +std::string 
debug_str(const at::Tensor& tensor); bool is_cpu_scalar(const at::Tensor& tensor); -bool is_cpu_scalar(const c10::TensorType& tensor_type); bool is_meta_scalar(const at::Tensor& tensor); From 11df23aab724ecc8394e54ed0643e0556427d749 Mon Sep 17 00:00:00 2001 From: Christian Sarofeen Date: Sat, 3 Aug 2024 18:00:47 -0400 Subject: [PATCH 2/7] Apply suggestions from code review Cleanup printing. Co-authored-by: Jingyue Wu --- csrc/executor.cpp | 2 +- csrc/utils.cpp | 18 ++---------------- 2 files changed, 3 insertions(+), 17 deletions(-) diff --git a/csrc/executor.cpp b/csrc/executor.cpp index a9768bcb1a0..b87edd51f66 100644 --- a/csrc/executor.cpp +++ b/csrc/executor.cpp @@ -1921,7 +1921,7 @@ std::vector FusionExecutor::runFusion( if (host_ir_container_ != nullptr) { if (outputs.empty()) { std::vector output_info = getBufferInfos( - expr_eval, PrimDataType::Int, host_ir_container_.get()->outputs()); + expr_eval, PrimDataType::Int, host_ir_container_->outputs()); outputs = allocateOutputs( host_ir_container_.get(), output_info, options_.device, expr_eval); } diff --git a/csrc/utils.cpp b/csrc/utils.cpp index c3376148bc0..4dcf877c1e9 100644 --- a/csrc/utils.cpp +++ b/csrc/utils.cpp @@ -52,24 +52,10 @@ std::string debug_str(const at::Tensor& tensor) { ss << "Tensor:"; ss << " device: " << tensor.device(); ss << ", dtype: " << tensor.dtype(); - ss << ", shape["; - for (size_t i = 0; i < tensor.sizes().size(); ++i) { - if (i != 0) { - ss << ", "; - } - ss << tensor.size(i); - } - ss << "]"; + ss << ", shape: " << tensor.sizes(); if (!tensor.is_contiguous()) { - ss << ", strides: ["; - for (size_t i = 0; i < tensor.strides().size(); ++i) { - ss << tensor.stride(i); - if (i < tensor.strides().size() - 1) { - ss << ", "; - } - } - ss << "]"; + ss << ", strides: " << tensor.strides(); } return ss.str(); } From 994315260a747d7341ded55a572409cd117f9259 Mon Sep 17 00:00:00 2001 From: Christian Sarofeen Date: Wed, 28 Aug 2024 14:18:06 -0700 Subject: [PATCH 3/7] Improve error 
message in expression evaluator when propagating constant sizes through the exact graph. --- csrc/executor_utils.cpp | 6 +- csrc/expr_evaluator.cpp | 148 ++++++++++++++++++++++-- tests/cpp/test_gpu3.cpp | 4 +- tests/python/opinfo_input_generators.py | 8 +- 4 files changed, 152 insertions(+), 14 deletions(-) diff --git a/csrc/executor_utils.cpp b/csrc/executor_utils.cpp index 0e659d1d11d..cac3ba928c0 100644 --- a/csrc/executor_utils.cpp +++ b/csrc/executor_utils.cpp @@ -724,8 +724,10 @@ ExpressionEvaluator bindInputs( << " there was an error with the provided input " << i << ". Provided input was:\n "; ss << PolymorphicValue_functions::toString(*args[i]); - ss << "\n which does not match the expected input:\n "; - ss << inputs[i]->toString() << "\n"; + ss << "\n Fusion input is:\n "; + ss << inputs[i]->toString(); + ss << "\n Expr eval provided the error:\n\"\"\""; + ss << e.msg() << "\"\"\"\n"; NVF_THROW(ss.str()); } } diff --git a/csrc/expr_evaluator.cpp b/csrc/expr_evaluator.cpp index c59842b2037..81397f634bd 100644 --- a/csrc/expr_evaluator.cpp +++ b/csrc/expr_evaluator.cpp @@ -344,6 +344,143 @@ void ExpressionEvaluator::print() const { debug() << "--------------------\n\n"; } +namespace { +// Error handling for +// ExpressionEvaluator::propagateBoundValuesThroughExactMaps(Fusion* fusion) +void handlePropagateError( + Fusion* fusion, + ExpressionEvaluator* expr_eval, + std::shared_ptr> id_set) { + std::unordered_map id_to_size; + std::set sizes; + + for (const auto id : *id_set) { + auto eval_val = expr_eval->evaluate(id->extent()); + if (eval_val.hasValue()) { + NVF_ERROR( + eval_val.is(), + "Invalid extent value found, while processing ID: ", + id); + id_to_size[id] = eval_val.as(); + sizes.insert(eval_val.as()); + } + } + + std::stringstream err_msg; + err_msg + << "When trying to propagate constant tensor sizes through the graph a conflict was found with " + << sizes.size() + << " different sizes across dimensions that are expected to match.\n"; + + // Track 
which size is associated with which TV and IterDomain + std::unordered_map< + int64_t, + std::vector>> + size_to_info; + + for (auto tv : ir_utils::allTvs(fusion)) { + for (const IterDomain* id : tv->domain()->allIDs()) { + if (auto it = id_to_size.find(id); it != id_to_size.end()) { + size_to_info[it->second].push_back({tv, id}); + } + } + } + + // Check which TensorViews mismatch and check if they're directly related. + // If so, the expression between them may be the problmeatic expression, and + // the error will point to that expression(s). Don't bother to simplify this + // check as it's only runs after an error is detected. + bool found_producer_consumer_issue = false; + // Assume producer/consumer relationship + for (auto [dim_size_1, tv_id_pairs_1] : size_to_info) { + for (auto [dim_size_2, tv_id_pairs_2] : size_to_info) { + if (dim_size_1 <= dim_size_2) { + // N^2 algorithm, only process when one size is less than the other, + // avoids duplicate entries. + continue; + } + + bool tv1_is_consumer = false; + for (auto tv_id_pair_1 : tv_id_pairs_1) { + for (auto tv_id_pair_2 : tv_id_pairs_2) { + // Check for producer-consumer relationship + auto producer_tvs_of_tv1 = + ir_utils::producerTvsOf(tv_id_pair_1.first); + auto producer_tvs_of_tv2 = + ir_utils::producerTvsOf(tv_id_pair_2.first); + if (std::find( + producer_tvs_of_tv1.begin(), + producer_tvs_of_tv1.end(), + tv_id_pair_2.first) != producer_tvs_of_tv1.end()) { + tv1_is_consumer = true; + } else if ( + std::find( + producer_tvs_of_tv2.begin(), + producer_tvs_of_tv2.end(), + tv_id_pair_1.first) == producer_tvs_of_tv2.end()) { + // No relationship found, skip + continue; + } + + Expr* relationship = tv1_is_consumer + ? tv_id_pair_1.first->definition() + : tv_id_pair_2.first->definition(); + + // Found at least one consumer/producer relationship with mismatched + // sizes. + found_producer_consumer_issue = true; + + // Produce error message. 
Normally I'd just use swap but keys in an + // unordered map are const, so doing some juggling with the strings + // instead. + std::stringstream tv1_error; + std::stringstream tv2_error; + tv1_error << " TV: " << tv_id_pair_1.first + << " id: " << tv_id_pair_1.second + << " found size: " << dim_size_1 << "\n"; + tv2_error << " TV: " << tv_id_pair_2.first + << " id: " << tv_id_pair_2.second + << " found size: " << dim_size_2 << "\n"; + err_msg << " For Producer" + << (tv1_is_consumer ? tv2_error.str() : tv1_error.str()); + err_msg << " For Consumer" + << (tv1_is_consumer ? tv1_error.str() : tv2_error.str()); + err_msg + << " With producer-consumer relationship through the expression: " + << relationship << "\n"; + } + } + } + } + + if (found_producer_consumer_issue) { + NVF_THROW(err_msg.str()); + } + + if (size_to_info.size() > 1) { + for (const auto& [size, info_pairs] : size_to_info) { + err_msg << "Size " << size << " found for ID, in TV:\n"; + for (auto info_pair : info_pairs) { + err_msg << " " << info_pair.second << ", " << info_pair.first << "\n"; + } + } + err_msg << "These sizes should all match.\n"; + } else { + std::unordered_map> size_to_ids; + err_msg + << "The following groups of IterDomains should all have the same size, but don't:\n"; + for (const auto& [id, size] : id_to_size) { + size_to_ids[size].push_back(id); + } + for (const auto& [size, ids] : size_to_ids) { + err_msg << " Size: " << size << " found for ids: " << ids << "\n"; + } + } + + NVF_THROW(err_msg.str()); +} +} // namespace + void ExpressionEvaluator::propagateBoundValuesThroughExactMaps(Fusion* fusion) { // We map Symbolic IterDomains here only if their extents match. 
This avoids // mapping between symbolic domains that might concretize to an (Iteration, @@ -358,15 +495,10 @@ void ExpressionEvaluator::propagateBoundValuesThroughExactMaps(Fusion* fusion) { if (eval_val.hasValue()) { NVF_ERROR(eval_val.is(), "Invalid extent value"); int64_t this_size = eval_val.as(); - if (known_size != -1) { - NVF_ERROR( - known_size == this_size, - "Conflicting sizes: ", - known_size, - ", ", - this_size); - } else { + if (known_size == -1) { known_size = this_size; + } else if (known_size != this_size) { + handlePropagateError(fusion, this, set); } } else { unknown_vals.push_back(id->extent()); diff --git a/tests/cpp/test_gpu3.cpp b/tests/cpp/test_gpu3.cpp index 4cc1ac113f4..710cdab9a64 100644 --- a/tests/cpp/test_gpu3.cpp +++ b/tests/cpp/test_gpu3.cpp @@ -3839,8 +3839,8 @@ TEST_F(NVFuserTest, FusionCheckedSymbolicShape_CUDA) { { ASSERT_THAT( [&]() { matched_add(a, c); }, - ::testing::ThrowsMessage( - ::testing::HasSubstr("Conflicting sizes"))); + ::testing::ThrowsMessage(::testing::HasSubstr( + "When trying to propagate constant tensor sizes through the graph a conflict was found with 2 different sizes across dimensions that are expected to match."))); GTEST_SKIP() << "skipping tests on pre-AMPERE GPUs"; } } diff --git a/tests/python/opinfo_input_generators.py b/tests/python/opinfo_input_generators.py index 779f84bfb3b..a3e7551116b 100644 --- a/tests/python/opinfo_input_generators.py +++ b/tests/python/opinfo_input_generators.py @@ -311,7 +311,11 @@ def cat_error_generator(op, dtype=torch.float32, requires_grad: bool = False, ** "Unexpected number of dimensions", ) # All tensors must have same shape except for the cat dimension - shape_mismatch = (([(2, 3), (4, 5)], 0), RuntimeError, "known_size == this_size") + shape_mismatch = ( + ([(2, 3), (4, 5)], 0), + RuntimeError, + "When trying to propagate constant tensor sizes", + ) error_cases = [ empty_input_tensors, @@ -898,7 +902,7 @@ def pad_error_generator( delete_all_pad_width = [-3, 0, 0, 0] 
yield SampleInput( make_arg(input_shape), delete_all_pad_width, make_number(dtype) - ), RuntimeError, "extent_int >= 0" + ), RuntimeError, "Invalid resized domain extent" too_many_pad_width = [1, 1, 1, 1, 1, 1] yield SampleInput( From 97ef12195483ccf80b141b44377f0a1cc65ea9b9 Mon Sep 17 00:00:00 2001 From: Christian Sarofeen Date: Sat, 31 Aug 2024 16:49:12 -0400 Subject: [PATCH 4/7] Apply suggestions from code review Typos Co-authored-by: Jingyue Wu --- csrc/expr_evaluator.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/csrc/expr_evaluator.cpp b/csrc/expr_evaluator.cpp index 81397f634bd..9d01761c974 100644 --- a/csrc/expr_evaluator.cpp +++ b/csrc/expr_evaluator.cpp @@ -387,9 +387,9 @@ void handlePropagateError( } // Check which TensorViews mismatch and check if they're directly related. - // If so, the expression between them may be the problmeatic expression, and - // the error will point to that expression(s). Don't bother to simplify this - // check as it's only runs after an error is detected. + // If so, the expression between them may be the problematic expression, and + // the error will point to that expression(s). Don't bother to speed up this + // check as it only runs after an error is detected. bool found_producer_consumer_issue = false; // Assume producer/consumer relationship for (auto [dim_size_1, tv_id_pairs_1] : size_to_info) { From 9bc2af8fda72454f2486d0f4c094506d05827ea1 Mon Sep 17 00:00:00 2001 From: Christian Sarofeen Date: Mon, 2 Sep 2024 14:13:38 -0700 Subject: [PATCH 5/7] PR Feedback. 
--- csrc/expr_evaluator.cpp | 45 +++++++++++-------------- tests/cpp/test_alias.cpp | 2 ++ tests/python/opinfo_input_generators.py | 2 +- 3 files changed, 22 insertions(+), 27 deletions(-) diff --git a/csrc/expr_evaluator.cpp b/csrc/expr_evaluator.cpp index 441adb6c62a..0832d60a54a 100644 --- a/csrc/expr_evaluator.cpp +++ b/csrc/expr_evaluator.cpp @@ -358,7 +358,7 @@ namespace { void handlePropagateError( Fusion* fusion, ExpressionEvaluator* expr_eval, - std::shared_ptr> id_set) { + const std::shared_ptr>& id_set) { std::unordered_map id_to_size; std::set sizes; @@ -408,31 +408,29 @@ void handlePropagateError( continue; } - bool tv1_is_consumer = false; - for (auto tv_id_pair_1 : tv_id_pairs_1) { - for (auto tv_id_pair_2 : tv_id_pairs_2) { + for (const auto& [tv1, id1] : tv_id_pairs_1) { + for (const auto& [tv2, id2] : tv_id_pairs_2) { + bool tv1_is_consumer = false; + // Check for producer-consumer relationship - auto producer_tvs_of_tv1 = - ir_utils::producerTvsOf(tv_id_pair_1.first); - auto producer_tvs_of_tv2 = - ir_utils::producerTvsOf(tv_id_pair_2.first); + auto producer_tvs_of_tv1 = ir_utils::producerTvsOf(tv1); + auto producer_tvs_of_tv2 = ir_utils::producerTvsOf(tv2); if (std::find( producer_tvs_of_tv1.begin(), producer_tvs_of_tv1.end(), - tv_id_pair_2.first) != producer_tvs_of_tv1.end()) { + tv2) != producer_tvs_of_tv1.end()) { tv1_is_consumer = true; } else if ( std::find( producer_tvs_of_tv2.begin(), producer_tvs_of_tv2.end(), - tv_id_pair_1.first) == producer_tvs_of_tv2.end()) { + tv1) == producer_tvs_of_tv2.end()) { // No relationship found, skip continue; } - Expr* relationship = tv1_is_consumer - ? tv_id_pair_1.first->definition() - : tv_id_pair_2.first->definition(); + Expr* relationship = + tv1_is_consumer ? tv1->definition() : tv2->definition(); // Found at least one consumer/producer relationship with mismatched // sizes. @@ -443,11 +441,9 @@ void handlePropagateError( // instead. 
std::stringstream tv1_error; std::stringstream tv2_error; - tv1_error << " TV: " << tv_id_pair_1.first - << " id: " << tv_id_pair_1.second + tv1_error << " TV: " << tv1 << " id: " << id1 << " found size: " << dim_size_1 << "\n"; - tv2_error << " TV: " << tv_id_pair_2.first - << " id: " << tv_id_pair_2.second + tv2_error << " TV: " << tv2 << " id: " << id2 << " found size: " << dim_size_2 << "\n"; err_msg << " For Producer" << (tv1_is_consumer ? tv2_error.str() : tv1_error.str()); @@ -474,22 +470,19 @@ void handlePropagateError( } err_msg << "These sizes should all match.\n"; } else { - std::unordered_map> size_to_ids; err_msg - << "The following groups of IterDomains should all have the same size, but don't:\n"; - for (const auto& [id, size] : id_to_size) { - size_to_ids[size].push_back(id); - } - for (const auto& [size, ids] : size_to_ids) { - err_msg << " Size: " << size << " found for ids: " << ids << "\n"; - } + << "Something went wrong trying to detect what went wrong!" + << " There should have been ID's in TVs that should match, but don't." + << " Somehow IDs were registered with the exact graph that aren't used in the Fusion." + << std::endl; } NVF_THROW(err_msg.str()); } } // namespace -void ExpressionEvaluator::propagateBoundValuesThroughExactMaps(Fusion* fusion, +void ExpressionEvaluator::propagateBoundValuesThroughExactMaps( + Fusion* fusion, ExactLogicalDomainMap* exact_map) { // We map Symbolic IterDomains here only if their extents match. 
This avoids // mapping between symbolic domains that might concretize to an (Iteration, diff --git a/tests/cpp/test_alias.cpp b/tests/cpp/test_alias.cpp index b68b3678023..0a8a427a6b8 100644 --- a/tests/cpp/test_alias.cpp +++ b/tests/cpp/test_alias.cpp @@ -838,6 +838,7 @@ TEST_F(AliasTest, MergeTwoExpandedBroadcasts) { FusionGuard fg(fusion.get()); TensorView* in = TensorViewBuilder() + .ndims(3) .dtype(DataType::Float) .contiguity({std::nullopt, std::nullopt, std::nullopt}) .shape({4, 5, 6}) @@ -861,6 +862,7 @@ TEST_F(AliasTest, MergeBroadcastsBetweenConcretes) { FusionGuard fg(fusion.get()); TensorView* in = TensorViewBuilder() + .ndims(4) .dtype(DataType::Float) .contiguity({true, std::nullopt, std::nullopt, true}) .shape({2, 3, 5, 7}) diff --git a/tests/python/opinfo_input_generators.py b/tests/python/opinfo_input_generators.py index a3e7551116b..46ec89f40ab 100644 --- a/tests/python/opinfo_input_generators.py +++ b/tests/python/opinfo_input_generators.py @@ -314,7 +314,7 @@ def cat_error_generator(op, dtype=torch.float32, requires_grad: bool = False, ** shape_mismatch = ( ([(2, 3), (4, 5)], 0), RuntimeError, - "When trying to propagate constant tensor sizes", + "Tried to bind to a value", ) error_cases = [ From 1db8e06fa68176b69b23329338e511a1e4606e4e Mon Sep 17 00:00:00 2001 From: Christian Sarofeen Date: Fri, 6 Sep 2024 13:12:21 -0700 Subject: [PATCH 6/7] resolve bad conflict resolution. 
--- tests/cpp/test_alias.cpp | 2 -- 1 file changed, 2 deletions(-) diff --git a/tests/cpp/test_alias.cpp b/tests/cpp/test_alias.cpp index 0a8a427a6b8..b68b3678023 100644 --- a/tests/cpp/test_alias.cpp +++ b/tests/cpp/test_alias.cpp @@ -838,7 +838,6 @@ TEST_F(AliasTest, MergeTwoExpandedBroadcasts) { FusionGuard fg(fusion.get()); TensorView* in = TensorViewBuilder() - .ndims(3) .dtype(DataType::Float) .contiguity({std::nullopt, std::nullopt, std::nullopt}) .shape({4, 5, 6}) @@ -862,7 +861,6 @@ TEST_F(AliasTest, MergeBroadcastsBetweenConcretes) { FusionGuard fg(fusion.get()); TensorView* in = TensorViewBuilder() - .ndims(4) .dtype(DataType::Float) .contiguity({true, std::nullopt, std::nullopt, true}) .shape({2, 3, 5, 7}) From 8f3ff98516b7a5d60cb3edf788ba42d4959677d2 Mon Sep 17 00:00:00 2001 From: Christian Sarofeen Date: Fri, 6 Sep 2024 13:23:58 -0700 Subject: [PATCH 7/7] Minor fix. --- csrc/expr_evaluator.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/csrc/expr_evaluator.cpp b/csrc/expr_evaluator.cpp index 3b7f2b382a0..0fd62022098 100644 --- a/csrc/expr_evaluator.cpp +++ b/csrc/expr_evaluator.cpp @@ -386,7 +386,7 @@ void handlePropagateError( std::vector>> size_to_info; - for (auto tv : ir_utils::allTvs(fusion)) { + for (auto tv : fusion->allTvs()) { for (const IterDomain* id : tv->domain()->allIDs()) { if (auto it = id_to_size.find(id); it != id_to_size.end()) { size_to_info[it->second].push_back({tv, id});