diff --git a/csrc/exceptions.h b/csrc/exceptions.h
index 8084a1191fb..b472b281b83 100644
--- a/csrc/exceptions.h
+++ b/csrc/exceptions.h
@@ -253,17 +253,20 @@ inline const char* nvfCheckMsgImpl(const char* /*msg*/, const char* args) {
 #define STRINGIZE_IMPL(x) #x
 #define STRINGIZE(x) STRINGIZE_IMPL(x)
 
-#define NVF_ERROR(cond, ...)                                           \
-  if ((!(cond))) {                                                     \
-    nvfuser::nvfErrorFail(                                             \
+#define NVF_THROW(...)                                                 \
+  nvfuser::nvfErrorFail(                                               \
       __FUNCTION__,                                                    \
       __FILE__,                                                        \
       static_cast<uint32_t>(__LINE__),                                 \
-      #cond " INTERNAL ASSERT FAILED at "                              \
-          STRINGIZE(__FILE__) ":" STRINGIZE(__LINE__)                  \
+      " INTERNAL ASSERT FAILED at "                                    \
+      STRINGIZE(__FILE__) ":" STRINGIZE(__LINE__)                      \
       ", please report a bug with repro script to NVFuser at "         \
-          "https://github.com/NVIDIA/Fuser/issues. ",                  \
-      nvfuser::to_str(__VA_ARGS__));                                   \
+      "https://github.com/NVIDIA/Fuser/issues. ",                      \
+      nvfuser::to_str(__VA_ARGS__));
+
+#define NVF_ERROR(cond, ...)                                           \
+  if ((!(cond))) {                                                     \
+    NVF_THROW(__VA_ARGS__)                                             \
   }
 
 #define NVF_CHECK_MSG(cond, type, ...) \
diff --git a/csrc/executor.cpp b/csrc/executor.cpp
index f9f8bf7ba5c..ca1d5403891 100644
--- a/csrc/executor.cpp
+++ b/csrc/executor.cpp
@@ -1348,35 +1348,38 @@ std::vector<FusionExecutor::GlobalBufferInfo> FusionExecutor::
 
 namespace {
 
+FusionExecutor::GlobalBufferInfo getBufferInfo(
+    ExpressionEvaluator& expr_eval,
+    DataType index_dtype,
+    TensorView* tv) {
+  FusionExecutor::GlobalBufferInfo info;
+  info.tv = tv;
+  std::tie(info.sizes, info.strides) = inferShapeOfOutput(info.tv, expr_eval);
+  auto dtype =
+      (info.tv->dtype() == DataType::Index ? index_dtype : info.tv->dtype());
+  info.type = data_type_to_aten(dtype);
+  return info;
+}
+
 //! Return information necessary for allocating output tensors. Input
 //! and output tensors are allowed to alias each other, which is
 //! specified by the list of int pairs of input and output indices
-std::vector<FusionExecutor::GlobalBufferInfo> getOutputBufferInfo(
-    const KernelArgumentHolder& args,
+std::vector<FusionExecutor::GlobalBufferInfo> getBufferInfos(
     ExpressionEvaluator& expr_eval,
     DataType index_dtype,
-    const Fusion* fusion) {
-  std::vector<FusionExecutor::GlobalBufferInfo> outputs;
-  outputs.reserve(fusion->outputs().size());
-  NVF_ERROR(
-      args.size() == fusion->inputs().size(),
-      "fusion arguments length does not match runtime arguments.");
-  for (const auto out_i : c10::irange(fusion->outputs().size())) {
-    auto out_val = fusion->outputs()[out_i];
+    const std::vector<Val*>& fusion_outputs) {
+  FUSER_PERF_SCOPE("FusionExecutor::getOutbufferInfo");
+  std::vector<FusionExecutor::GlobalBufferInfo> output_buffer_infos;
+  output_buffer_infos.reserve(fusion_outputs.size());
+  for (const auto out : fusion_outputs) {
     NVF_ERROR(
-        out_val->isA<TensorView>(),
+        out->isA<TensorView>(),
         "Cannot allocate outputs that are not tensors.");
-    FusionExecutor::GlobalBufferInfo info;
-    info.tv = out_val->as<TensorView>();
-    std::tie(info.sizes, info.strides) = inferShapeOfOutput(info.tv, expr_eval);
-    auto dtype =
-        (info.tv->dtype() == DataType::Index ? index_dtype : info.tv->dtype());
-    info.type = data_type_to_aten(dtype);
-
-    outputs.emplace_back(info);
+    output_buffer_infos.emplace_back(
+        getBufferInfo(expr_eval, index_dtype, out->as<TensorView>()));
   }
-  return outputs;
+  return output_buffer_infos;
 }
 
 } // namespace
 
@@ -1389,7 +1392,7 @@ std::vector<at::Tensor> allocOutputSpace(
   auto expr_eval = executor_utils::bindInputs(fusion_inputs, fusion);
 
   auto output_info =
-      getOutputBufferInfo(fusion_inputs, expr_eval, PrimDataType::Int, fusion);
+      getBufferInfos(expr_eval, PrimDataType::Int, fusion->outputs());
 
   return allocateOutputs(fusion, output_info, device, expr_eval);
 }
@@ -1421,8 +1424,8 @@ KernelArgumentHolder FusionExecutor::inferOutputSizes(
 
   auto arg_index_type = args.getSmallestIndexTypeOfArguments();
 
-  KernelArgumentHolder ret;
-  ret.setDeviceIndex(args.getDeviceIndex());
+  KernelArgumentHolder output_tensor_proxies;
+  output_tensor_proxies.setDeviceIndex(args.getDeviceIndex());
 
   for (Val* output : fusion->outputs()) {
     NVF_ERROR(
@@ -1433,9 +1436,9 @@ KernelArgumentHolder FusionExecutor::inferOutputSizes(
     const auto dtype = (output_tv->dtype() == DataType::Index)
         ? data_type_to_aten(arg_index_type)
         : data_type_to_aten(output_tv->dtype());
-    ret.pushTensorProxy(sizes, strides, dtype);
+    output_tensor_proxies.pushTensorProxy(sizes, strides, dtype);
   }
-  return ret;
+  return output_tensor_proxies;
 }
 
 namespace {
@@ -1547,15 +1550,6 @@ void dumpKernelArgs(
   }
 }
 
-FusionExecutor::GlobalBufferInfo getGlobalBufferAllocationInfo(
-    const at::Tensor& at_tensor) {
-  FusionExecutor::GlobalBufferInfo info{
-      .sizes = at_tensor.sizes().vec(),
-      .strides = at_tensor.strides().vec(),
-      .type = at_tensor.scalar_type()};
-  return info;
-}
-
 } // namespace
 
 void FusionExecutor::initializeExecutorEntry(
@@ -1587,13 +1581,16 @@ void FusionExecutor::initializeExecutorEntry(
   if (outputs.empty()) {
     output_info =
-        getOutputBufferInfo(args, expr_eval, index_type, lowered_->kernel());
+        getBufferInfos(expr_eval, index_type, lowered_->kernel()->outputs());
   } else {
     // Need to save the information necessary for allocations as
     // future uses of this ExecutorEntry may not be provided with
     // allocated outputs
     for (const auto& output : outputs) {
-      output_info.emplace_back(getGlobalBufferAllocationInfo(output));
+      output_info.emplace_back(FusionExecutor::GlobalBufferInfo{
+          .sizes = output.sizes().vec(),
+          .strides = output.strides().vec(),
+          .type = output.scalar_type()});
     }
   }
 
@@ -1849,10 +1846,13 @@ void FusionExecutor::resetCompiledKernelProperties() {
 }
 
 std::vector<at::Tensor> FusionExecutor::evaluateFusionOutputs(
-    KernelArgumentHolder& args,
     std::vector<at::Tensor> outputs,
     ExpressionEvaluator& expr_eval) {
-  // TODO: Add relevant profiling code.
+ FUSER_PERF_SCOPE("FusionExecutor::runFusion::evaluateFusionOutputs"); + NVF_ERROR( + outputs.empty(), + "Fusion executor is using expression evaluator,", + " and expects that the outputs are not populated, which they were."); if (outputs.empty()) { for (const auto& out_val : fusion()->outputs()) { auto out_tensor = @@ -1861,11 +1861,12 @@ std::vector FusionExecutor::evaluateFusionOutputs( outputs.emplace_back(out_tensor); } } - args.push(outputs); return outputs; } namespace { +// Host IR specific function, returns the at:Tensor (ordered list) associated +// with the provdied Fusion output tv at::Tensor findBufferForFusionOutput( const std::vector& out_tensors, const Val* fusion_out, @@ -1903,14 +1904,10 @@ std::vector FusionExecutor::runFusion( " provided number of outputs does not match fusion output"); // Bind fusion inputs - ExpressionEvaluator expr_eval; - const auto& inputs = fusion()->inputs(); - for (const auto i : c10::irange(inputs.size())) { - expr_eval.bind(inputs[i], *args[i]); - } + auto expr_eval = executor_utils::bindInputs(args, fusion()); if (isExpressionEvaluated(fusion())) { - outputs = evaluateFusionOutputs(args, outputs, expr_eval); + outputs = evaluateFusionOutputs(outputs, expr_eval); if (isProfilerEnabled()) { auto& sprof = FusionProfiler::segment(group_id_); sprof.stopKernel(); @@ -1921,8 +1918,8 @@ std::vector FusionExecutor::runFusion( if (host_ir_container_ != nullptr) { if (outputs.empty()) { - std::vector output_info = getOutputBufferInfo( - args, expr_eval, PrimDataType::Int, host_ir_container_.get()); + std::vector output_info = getBufferInfos( + expr_eval, PrimDataType::Int, host_ir_container_->outputs()); outputs = allocateOutputs( host_ir_container_.get(), output_info, options_.device, expr_eval); } @@ -2009,7 +2006,7 @@ std::vector FusionExecutor::runFusion( // Skip trivially forwarded outputs because they are just placeholders continue; } - expr_eval.bind(output, *args[inputs.size() + i]); + expr_eval.bind(output, *args[kernel()->inputs().size() + i]); } std::vector intermediates; @@ -2062,7 +2059,7 @@ std::vector FusionExecutor::runFusion( intermediates.push_back(intermediate_buffer); expr_eval.bind( kernel()->summary().global_allocations.at(i)->buffer(), - *args[inputs.size() + outputs.size() + i]); + *args[kernel()->inputs().size() + outputs.size() + i]); if (buf_info.is_profile_buffer) { profile_buffer = intermediate_buffer; } diff --git a/csrc/executor.h b/csrc/executor.h index 97710667553..4817c64bb92 100644 --- a/csrc/executor.h +++ b/csrc/executor.h @@ -65,7 +65,7 @@ class FusionExecutor : public NonCopyable { //! Notes: 1. This API should ignore aliased outputs instead of //! pushing scalar int 0 as a place-holder. //! 2. This API does not allocate output in memory, but only returns the - //! inferred output sizes. + //! inferred output sizes. Used in kernel_cache.cpp. KernelArgumentHolder inferOutputSizes( Fusion* fusion, const KernelArgumentHolder& args, @@ -118,10 +118,14 @@ class FusionExecutor : public NonCopyable { //! Computes fusion outputs through expression evaluator. std::vector evaluateFusionOutputs( - KernelArgumentHolder& args, std::vector outputs, ExpressionEvaluator& expr_eval); + // TODO: args shouldn't come in a reference here because we will append the + // outputs to be able to send it to the kernel. For now none of the users are + // reconsuming the args, so it is okay. 
+  // It isn't changed now because dropping the reference would make a call
+  // like runFusion({}) ambiguous, and such calls are used in some places in
+  // the codebase.
   NVF_API std::vector<at::Tensor> runFusion(
       KernelArgumentHolder& args,
       const LaunchParams& launch_constraints = LaunchParams(),
diff --git a/csrc/executor_utils.cpp b/csrc/executor_utils.cpp
index 740981972dc..cac3ba928c0 100644
--- a/csrc/executor_utils.cpp
+++ b/csrc/executor_utils.cpp
@@ -716,7 +716,20 @@ ExpressionEvaluator bindInputs(
     // NOTE: we bind all inputs here, including at::Tensors. This means that
     // expr_eval will create a PolymorphicValue containing *args[i], which means
     // that at::Tensor's lifetime will be at least as long as that of expr_eval.
-    expr_eval.bind(inputs[i], *args[i], true);
+    try {
+      expr_eval.bind(inputs[i], *args[i], true);
+    } catch (const nvfError& e) {
+      std::stringstream ss;
+      ss << "When trying to run the provided host program,"
+         << " there was an error with the provided input " << i
+         << ". Provided input was:\n  ";
+      ss << PolymorphicValue_functions::toString(*args[i]);
+      ss << "\n  Fusion input is:\n  ";
+      ss << inputs[i]->toString();
+      ss << "\n  Expr eval provided the error:\n\"\"\"";
+      ss << e.msg() << "\"\"\"\n";
+      NVF_THROW(ss.str());
+    }
   }
 
   return expr_eval;
diff --git a/csrc/expr_evaluator.cpp b/csrc/expr_evaluator.cpp
index 764ea83aa50..0fd62022098 100644
--- a/csrc/expr_evaluator.cpp
+++ b/csrc/expr_evaluator.cpp
@@ -352,6 +352,135 @@ void ExpressionEvaluator::print() const {
   debug() << "--------------------\n\n";
 }
 
+namespace {
+// Error handling for
+// ExpressionEvaluator::propagateBoundValuesThroughExactMaps(Fusion* fusion)
+void handlePropagateError(
+    Fusion* fusion,
+    ExpressionEvaluator* expr_eval,
+    const std::shared_ptr<VectorOfUniqueEntries<IterDomain*>>& id_set) {
+  std::unordered_map<const IterDomain*, int64_t> id_to_size;
+  std::set<int64_t> sizes;
+
+  for (const auto id : *id_set) {
+    auto eval_val = expr_eval->evaluate(id->extent());
+    if (eval_val.hasValue()) {
+      NVF_ERROR(
+          eval_val.is<int64_t>(),
+          "Invalid extent value found, while processing ID: ",
+          id);
+      id_to_size[id] = eval_val.as<int64_t>();
+      sizes.insert(eval_val.as<int64_t>());
+    }
+  }
+
+  std::stringstream err_msg;
+  err_msg
+      << "When trying to propagate constant tensor sizes through the graph a conflict was found with "
+      << sizes.size()
+      << " different sizes across dimensions that are expected to match.\n";
+
+  // Track which size is associated with which TV and IterDomain
+  std::unordered_map<
+      int64_t,
+      std::vector<std::pair<TensorView*, const IterDomain*>>>
+      size_to_info;
+
+  for (auto tv : fusion->allTvs()) {
+    for (const IterDomain* id : tv->domain()->allIDs()) {
+      if (auto it = id_to_size.find(id); it != id_to_size.end()) {
+        size_to_info[it->second].push_back({tv, id});
+      }
+    }
+  }
+
+  // Check which TensorViews mismatch and check if they're directly related.
+  // If so, the expression between them may be the problematic expression, and
+  // the error will point to those expressions. Don't bother to speed up this
+  // check as it only runs after an error is detected.
+  bool found_producer_consumer_issue = false;
+  // Assume producer/consumer relationship
+  for (auto [dim_size_1, tv_id_pairs_1] : size_to_info) {
+    for (auto [dim_size_2, tv_id_pairs_2] : size_to_info) {
+      if (dim_size_1 <= dim_size_2) {
+        // N^2 algorithm; only process each pair once (when the first size is
+        // greater than the second) to avoid duplicate entries.
+        continue;
+      }
+
+      for (const auto& [tv1, id1] : tv_id_pairs_1) {
+        for (const auto& [tv2, id2] : tv_id_pairs_2) {
+          bool tv1_is_consumer = false;
+
+          // Check for producer-consumer relationship
+          auto producer_tvs_of_tv1 = ir_utils::producerTvsOf(tv1);
+          auto producer_tvs_of_tv2 = ir_utils::producerTvsOf(tv2);
+          if (std::find(
+                  producer_tvs_of_tv1.begin(),
+                  producer_tvs_of_tv1.end(),
+                  tv2) != producer_tvs_of_tv1.end()) {
+            tv1_is_consumer = true;
+          } else if (
+              std::find(
+                  producer_tvs_of_tv2.begin(),
+                  producer_tvs_of_tv2.end(),
+                  tv1) == producer_tvs_of_tv2.end()) {
+            // No relationship found, skip
+            continue;
+          }
+
+          Expr* relationship =
+              tv1_is_consumer ? tv1->definition() : tv2->definition();
+
+          // Found at least one consumer/producer relationship with mismatched
+          // sizes.
+          found_producer_consumer_issue = true;
+
+          // Produce error message. Normally I'd just use swap but keys in an
+          // unordered map are const, so doing some juggling with the strings
+          // instead.
+          std::stringstream tv1_error;
+          std::stringstream tv2_error;
+          tv1_error << " TV: " << tv1 << " id: " << id1
+                    << " found size: " << dim_size_1 << "\n";
+          tv2_error << " TV: " << tv2 << " id: " << id2
+                    << " found size: " << dim_size_2 << "\n";
+          err_msg << " For Producer"
+                  << (tv1_is_consumer ? tv2_error.str() : tv1_error.str());
+          err_msg << " For Consumer"
+                  << (tv1_is_consumer ? tv1_error.str() : tv2_error.str());
+          err_msg
+              << " With producer-consumer relationship through the expression: "
+              << relationship << "\n";
+        }
+      }
+    }
+  }
+
+  if (found_producer_consumer_issue) {
+    NVF_THROW(err_msg.str());
+  }
+
+  if (size_to_info.size() > 1) {
+    for (const auto& [size, info_pairs] : size_to_info) {
+      err_msg << "Size " << size << " found for ID, in TV:\n";
+      for (auto info_pair : info_pairs) {
+        err_msg << " " << info_pair.second << ", " << info_pair.first << "\n";
+      }
+    }
+    err_msg << "These sizes should all match.\n";
+  } else {
+    err_msg
+        << "Something went wrong trying to detect what went wrong!"
+        << " There should have been IDs in TVs that should match, but don't."
+        << " Somehow IDs were registered with the exact graph that aren't used in the Fusion."
+        << std::endl;
+  }
+
+  NVF_THROW(err_msg.str());
+}
+} // namespace
+
 void ExpressionEvaluator::propagateBoundValuesThroughExactMaps(
     Fusion* fusion,
     ExactLogicalDomainMap* exact_map) {
@@ -373,15 +502,10 @@ void ExpressionEvaluator::propagateBoundValuesThroughExactMaps(
       if (eval_val.hasValue()) {
         NVF_ERROR(eval_val.is<int64_t>(), "Invalid extent value");
         int64_t this_size = eval_val.as<int64_t>();
-        if (known_size != -1) {
-          NVF_ERROR(
-              known_size == this_size,
-              "Conflicting sizes: ",
-              known_size,
-              ", ",
-              this_size);
-        } else {
+        if (known_size == -1) {
           known_size = this_size;
+        } else if (known_size != this_size) {
+          handlePropagateError(fusion, this, set);
         }
       } else {
         unknown_vals.push_back(id->extent());
diff --git a/csrc/kernel_cache.h b/csrc/kernel_cache.h
index 8f41fa9cade..e2fecaacbba 100644
--- a/csrc/kernel_cache.h
+++ b/csrc/kernel_cache.h
@@ -671,13 +671,6 @@ class FusionExecutorCache {
   //! Deserialize Fusion Executor Cache using flatbuffers
   void deserialize(const serde::FusionExecutorCache* buffer, int64_t fusion_id);
 
-  //! Allocate the outputs of the Fusion given inputs
-  //! TODO: re-implement
-  std::vector<at::Tensor> allocOutputSpace(
-      const at::ArrayRef<c10::IValue>& inputs) {
-    return runFusionWithInputs(inputs);
-  }
-
  private:
  //! evict cached short cut entry in `code_to_fe_lookup_` as well as cached
  //! entry in `FusionExecutor`
diff --git a/csrc/utils.cpp b/csrc/utils.cpp
index 2c372b95577..4dcf877c1e9 100644
--- a/csrc/utils.cpp
+++ b/csrc/utils.cpp
@@ -37,86 +37,33 @@ c10::ThreadPool* getThreadPool() {
   return &pool;
 }
 
-C10_DIAGNOSTIC_PUSH_AND_IGNORED_IF_DEFINED("-Wunused-function")
-void debugPrint(const c10::TensorTypePtr& type) {
-  std::stringstream sizes_s;
-  if (auto sizes = type->symbolic_sizes().sizes()) {
-    for (const auto& shape_symbol : *sizes) {
-      if (shape_symbol.is_static()) {
-        sizes_s << shape_symbol.static_size() << ", ";
-      } else {
-        sizes_s << "s(" << *reinterpret_cast<const int64_t*>(&shape_symbol)
-                << "), ";
-      }
-    }
-  } else {
-    sizes_s << "no size available";
-  }
-  debug() << "sizes:" << sizes_s.str() << std::endl;
-  if (const auto& stride_properties = type->stride_properties().sizes()) {
-    std::stringstream stride_s;
-    std::stringstream index_s;
-    std::stringstream contig_s;
-
-    for (const auto& stride_property : *stride_properties) {
-      if (stride_property.has_value() && stride_property->stride_.has_value()) {
-        stride_s << *stride_property->stride_ << ", ";
-      } else {
-        stride_s << "?, ";
-      }
-      if (stride_property.has_value() &&
-          stride_property->stride_index_.has_value()) {
-        index_s << *stride_property->stride_index_ << ", ";
-      } else {
-        index_s << "?, ";
-      }
-      if (stride_property.has_value() &&
-          stride_property->contiguous_.has_value()) {
-        contig_s << *stride_property->contiguous_ << ", ";
-      } else {
-        contig_s << "?, ";
-      }
-    }
-    debug() << "stride: " << stride_s.str() << std::endl;
-    debug() << "stride index: " << index_s.str() << std::endl;
-    debug() << "contiguous: " << contig_s.str() << std::endl;
-  } else {
-    debug() << "no stride properties available" << std::endl;
+std::string debug_str(const c10::IValue& val) {
+  if (val.isTensor()) {
+    return debug_str(val.toTensor());
   }
-}
-C10_DIAGNOSTIC_POP()
-bool is_zero_dim_tensor(const std::shared_ptr<c10::TensorType>& tensor_type) {
-  return tensor_type && tensor_type->dim().has_value() &&
-      tensor_type->dim().value() == 0;
+  std::stringstream out_s;
+  out_s << val.toScalar();
+  return out_s.str();
 }
 
-bool is_zero_sized_tensor(const std::shared_ptr<c10::TensorType>& tensor_type) {
-  auto opt_sizes = tensor_type->sizes().concrete_sizes();
-  if (opt_sizes.has_value()) {
-    auto sizes = opt_sizes.value();
-    for (const auto& size : sizes) {
-      if (size == 0) {
-        return true;
-      }
-    }
+std::string debug_str(const at::Tensor& tensor) {
+  std::stringstream ss;
+  ss << "Tensor:";
+  ss << " device: " << tensor.device();
+  ss << ", dtype: " << tensor.dtype();
+  ss << ", shape: " << tensor.sizes();
+
+  if (!tensor.is_contiguous()) {
+    ss << ", strides: " << tensor.strides();
   }
-  return false;
+  return ss.str();
 }
 
 bool is_cpu_scalar(const at::Tensor& tensor) {
   return tensor.device().is_cpu() && tensor.numel() == 1 && tensor.dim() == 0;
 }
 
-bool is_cpu_scalar(const c10::TensorType& tensor_type) {
-  auto opt_device = tensor_type.device();
-  auto opt_dim = tensor_type.dim();
-  auto opt_numel = tensor_type.numel();
-  return opt_device.has_value() && opt_device->is_cpu() &&
-      opt_dim.has_value() && opt_numel.has_value() && opt_dim.value() == 0 &&
-      opt_numel.value() == 1;
-}
-
 bool is_meta_scalar(const at::Tensor& tensor) {
   return tensor.device().is_meta() && tensor.numel() == 1 && tensor.dim() == 0;
 }
diff --git a/csrc/utils.h b/csrc/utils.h
index 43e71b2be83..97383645d71 100644
--- a/csrc/utils.h
+++ b/csrc/utils.h
@@ -52,13 +52,10 @@ namespace nvfuser {
 int getNumThreads();
 c10::ThreadPool* getThreadPool();
 
-void debugPrint(const c10::TensorTypePtr& type);
-
-bool is_zero_dim_tensor(const std::shared_ptr<c10::TensorType>& tensor_type);
-bool is_zero_sized_tensor(const std::shared_ptr<c10::TensorType>& tensor_type);
+std::string debug_str(const c10::IValue& val);
+std::string debug_str(const at::Tensor& tensor);
 
 bool is_cpu_scalar(const at::Tensor& tensor);
-bool is_cpu_scalar(const c10::TensorType& tensor_type);
 
 bool is_meta_scalar(const at::Tensor& tensor);
diff --git a/tests/cpp/test_gpu3.cpp b/tests/cpp/test_gpu3.cpp
index 29190520e44..725e70f0885 100644
--- a/tests/cpp/test_gpu3.cpp
+++ b/tests/cpp/test_gpu3.cpp
@@ -3839,8 +3839,8 @@ TEST_F(NVFuserTest, FusionCheckedSymbolicShape_CUDA) {
   {
     ASSERT_THAT(
         [&]() { matched_add(a, c); },
-        ::testing::ThrowsMessage<nvfuser::nvfError>(
-            ::testing::HasSubstr("Conflicting sizes")));
+        ::testing::ThrowsMessage<nvfuser::nvfError>(::testing::HasSubstr(
+            "When trying to propagate constant tensor sizes through the graph a conflict was found with 2 different sizes across dimensions that are expected to match.")));
     GTEST_SKIP() << "skipping tests on pre-AMPERE GPUs";
   }
 }
diff --git a/tests/python/opinfo_input_generators.py b/tests/python/opinfo_input_generators.py
index 7dc12e87956..e369c03f9e7 100644
--- a/tests/python/opinfo_input_generators.py
+++ b/tests/python/opinfo_input_generators.py
@@ -898,7 +898,7 @@ def pad_error_generator(
     delete_all_pad_width = [-3, 0, 0, 0]
     yield SampleInput(
         make_arg(input_shape), delete_all_pad_width, make_number(dtype)
-    ), RuntimeError, "extent_int >= 0"
+    ), RuntimeError, "Invalid resized domain extent"
 
     too_many_pad_width = [1, 1, 1, 1, 1, 1]
     yield SampleInput(