Minor cleanup in executor.cpp #2750

Merged Sep 8, 2024 (16 commits)
Changes from all commits
csrc/exceptions.h (17 changes: 10 additions & 7 deletions)

@@ -253,17 +253,20 @@ inline const char* nvfCheckMsgImpl(const char* /*msg*/, const char* args) {
 #define STRINGIZE_IMPL(x) #x
 #define STRINGIZE(x) STRINGIZE_IMPL(x)
 
-#define NVF_ERROR(cond, ...) \
-  if ((!(cond))) { \
-    nvfuser::nvfErrorFail( \
+#define NVF_THROW(...) \
+  nvfuser::nvfErrorFail( \
       __FUNCTION__, \
       __FILE__, \
       static_cast<uint32_t>(__LINE__), \
-        #cond " INTERNAL ASSERT FAILED at " \
-        STRINGIZE(__FILE__) ":" STRINGIZE(__LINE__) \
+      " INTERNAL ASSERT FAILED at " \
+      STRINGIZE(__FILE__) ":" STRINGIZE(__LINE__) \
       ", please report a bug with repro script to NVFuser at " \
-        "https://github.com/NVIDIA/Fuser/issues. ", \
-        nvfuser::to_str(__VA_ARGS__)); \
+      "https://github.com/NVIDIA/Fuser/issues. ", \
+      nvfuser::to_str(__VA_ARGS__));
+
+#define NVF_ERROR(cond, ...) \
+  if ((!(cond))) { \
+    NVF_THROW(__VA_ARGS__) \
   }
 
 #define NVF_CHECK_MSG(cond, type, ...) \

Review comment from the author on the new NVF_THROW macro:
Wanted a throw to be able to get some better error messages in bindInputs.
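The comment above explains the motivation: NVF_ERROR previously both checked the condition and threw, so there was no way to throw unconditionally with the same file/line decoration. A minimal, self-contained sketch of the same delegation pattern follows; the macro names and the plain std::runtime_error are simplified stand-ins, not nvFuser's actual error machinery.

#include <iostream>
#include <stdexcept>
#include <string>

// Illustrative stand-ins for NVF_THROW / NVF_ERROR.
// THROW_DEMO always throws, embedding file and line; ERROR_DEMO guards it
// behind a condition, mirroring how NVF_ERROR now delegates to NVF_THROW.
#define THROW_DEMO(msg)                                        \
  throw std::runtime_error(                                    \
      std::string(__FILE__) + ":" + std::to_string(__LINE__) + \
      ": " + (msg))

#define ERROR_DEMO(cond, msg) \
  if (!(cond)) {              \
    THROW_DEMO(msg);          \
  }

int main() {
  try {
    ERROR_DEMO(2 + 2 == 5, "arithmetic check failed");
  } catch (const std::exception& e) {
    std::cout << e.what() << "\n";  // prints file:line: arithmetic check failed
  }
  return 0;
}

The bindInputs change at the end of this diff uses the new macro exactly this way, rethrowing a binding failure with extra context.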
csrc/executor.cpp (95 changes: 46 additions & 49 deletions)

@@ -1348,35 +1348,38 @@ std::vector<FusionExecutor::GlobalBufferInfo> FusionExecutor::

 namespace {
 
+FusionExecutor::GlobalBufferInfo getBufferInfo(
+    ExpressionEvaluator& expr_eval,
+    DataType index_dtype,
+    TensorView* tv) {
+  FusionExecutor::GlobalBufferInfo info;
+  info.tv = tv;
+  std::tie(info.sizes, info.strides) = inferShapeOfOutput(info.tv, expr_eval);
+  auto dtype =
+      (info.tv->dtype() == DataType::Index ? index_dtype : info.tv->dtype());
+  info.type = data_type_to_aten(dtype);
+  return info;
+}
+
 //! Return information necessary for allocating output tensors. Input
 //! and output tensors are allowed to alias each other, which is
 //! specified by the list of int pairs of input and output indices
-std::vector<FusionExecutor::GlobalBufferInfo> getOutputBufferInfo(
-    const KernelArgumentHolder& args,
+std::vector<FusionExecutor::GlobalBufferInfo> getBufferInfos(
     ExpressionEvaluator& expr_eval,
     DataType index_dtype,
-    const Fusion* fusion) {
-  std::vector<FusionExecutor::GlobalBufferInfo> outputs;
-  outputs.reserve(fusion->outputs().size());
-  NVF_ERROR(
-      args.size() == fusion->inputs().size(),
-      "fusion arguments length does not match runtime arguments.");
-  for (const auto out_i : c10::irange(fusion->outputs().size())) {
-    auto out_val = fusion->outputs()[out_i];
+    const std::vector<Val*>& fusion_outputs) {
+  FUSER_PERF_SCOPE("FusionExecutor::getOutbufferInfo");
+  std::vector<FusionExecutor::GlobalBufferInfo> output_buffer_infos;
+  output_buffer_infos.reserve(fusion_outputs.size());
+  for (const auto out : fusion_outputs) {
     NVF_ERROR(
-        out_val->isA<TensorView>(),
+        out->isA<TensorView>(),
         "Cannot allocate outputs that are not tensors.");
 
-    FusionExecutor::GlobalBufferInfo info;
-    info.tv = out_val->as<TensorView>();
-    std::tie(info.sizes, info.strides) = inferShapeOfOutput(info.tv, expr_eval);
-    auto dtype =
-        (info.tv->dtype() == DataType::Index ? index_dtype : info.tv->dtype());
-    info.type = data_type_to_aten(dtype);
-
-    outputs.emplace_back(info);
+    output_buffer_infos.emplace_back(
+        getBufferInfo(expr_eval, index_dtype, out->as<TensorView>()));
   }
-  return outputs;
+  return output_buffer_infos;
 }
 
 } // namespace
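The refactor above has a common extract-and-generalize shape: a per-tensor helper (getBufferInfo) is pulled out of the loop, and the plural function now takes a plain list of output Vals instead of a whole Fusion*, so one code path can serve fusion outputs, kernel outputs, and host IR container outputs (all three call sites appear later in this diff). A simplified sketch of that shape, using stand-in types rather than the nvFuser API:

#include <cstdint>
#include <vector>

// Illustrative stand-ins for TensorView and GlobalBufferInfo.
struct Tensor { std::vector<int64_t> sizes; };
struct BufferInfo { std::vector<int64_t> sizes; };

// Per-tensor helper, analogous to getBufferInfo(expr_eval, index_dtype, tv).
BufferInfo getBufferInfo(const Tensor& tv) {
  return BufferInfo{tv.sizes};
}

// Plural wrapper, analogous to getBufferInfos: taking a generic list of
// outputs decouples it from any one container type.
std::vector<BufferInfo> getBufferInfos(const std::vector<Tensor>& outputs) {
  std::vector<BufferInfo> infos;
  infos.reserve(outputs.size());
  for (const Tensor& out : outputs) {
    infos.emplace_back(getBufferInfo(out));
  }
  return infos;
}

int main() {
  std::vector<Tensor> outs = {{{2, 3}}, {{4}}};
  return getBufferInfos(outs).size() == 2 ? 0 : 1;  // returns 0
}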
@@ -1389,7 +1392,7 @@ std::vector<at::Tensor> allocOutputSpace(
   auto expr_eval = executor_utils::bindInputs(fusion_inputs, fusion);
 
   auto output_info =
-      getOutputBufferInfo(fusion_inputs, expr_eval, PrimDataType::Int, fusion);
+      getBufferInfos(expr_eval, PrimDataType::Int, fusion->outputs());
 
   return allocateOutputs(fusion, output_info, device, expr_eval);
 }
@@ -1421,8 +1424,8 @@ KernelArgumentHolder FusionExecutor::inferOutputSizes(
 
   auto arg_index_type = args.getSmallestIndexTypeOfArguments();
 
-  KernelArgumentHolder ret;
-  ret.setDeviceIndex(args.getDeviceIndex());
+  KernelArgumentHolder output_tensor_proxies;
+  output_tensor_proxies.setDeviceIndex(args.getDeviceIndex());
 
   for (Val* output : fusion->outputs()) {
     NVF_ERROR(
@@ -1433,9 +1436,9 @@
     const auto dtype = (output_tv->dtype() == DataType::Index)
         ? data_type_to_aten(arg_index_type)
         : data_type_to_aten(output_tv->dtype());
-    ret.pushTensorProxy(sizes, strides, dtype);
+    output_tensor_proxies.pushTensorProxy(sizes, strides, dtype);
   }
-  return ret;
+  return output_tensor_proxies;
 }
 
 namespace {
@@ -1547,15 +1550,6 @@ void dumpKernelArgs(
   }
 }
 
-FusionExecutor::GlobalBufferInfo getGlobalBufferAllocationInfo(
-    const at::Tensor& at_tensor) {
-  FusionExecutor::GlobalBufferInfo info{
-      .sizes = at_tensor.sizes().vec(),
-      .strides = at_tensor.strides().vec(),
-      .type = at_tensor.scalar_type()};
-  return info;
-}
-
 } // namespace
 
 void FusionExecutor::initializeExecutorEntry(
@@ -1587,13 +1581,16 @@ void FusionExecutor::initializeExecutorEntry(
 
   if (outputs.empty()) {
     output_info =
-        getOutputBufferInfo(args, expr_eval, index_type, lowered_->kernel());
+        getBufferInfos(expr_eval, index_type, lowered_->kernel()->outputs());
   } else {
     // Need to save the information necessary for allocations as
     // future uses of this ExecutorEntry may not be provided with
     // allocated outputs
     for (const auto& output : outputs) {
-      output_info.emplace_back(getGlobalBufferAllocationInfo(output));
+      output_info.emplace_back(FusionExecutor::GlobalBufferInfo{
+          .sizes = output.sizes().vec(),
+          .strides = output.strides().vec(),
+          .type = output.scalar_type()});
     }
   }
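The replacement body builds the GlobalBufferInfo in place with C++20 designated initializers, which is what makes the single-use getGlobalBufferAllocationInfo helper (deleted above) unnecessary. A minimal sketch of the construct, with a made-up struct for illustration:

#include <cstdint>
#include <vector>

// Hypothetical struct mirroring the shape of GlobalBufferInfo.
struct Info {
  std::vector<int64_t> sizes;
  std::vector<int64_t> strides;
  int type = 0;
};

int main() {
  // C++20 designated initializers: fields are named at the construction
  // site, so no single-purpose factory function is needed.
  Info info{
      .sizes = {2, 3},
      .strides = {3, 1},
      .type = 1};
  return static_cast<int>(info.sizes.size()) - 2;  // returns 0
}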

@@ -1849,10 +1846,13 @@ void FusionExecutor::resetCompiledKernelProperties() {
 }
 
 std::vector<at::Tensor> FusionExecutor::evaluateFusionOutputs(
-    KernelArgumentHolder& args,
     std::vector<at::Tensor> outputs,
     ExpressionEvaluator& expr_eval) {
-  // TODO: Add relevant profiling code.
+  FUSER_PERF_SCOPE("FusionExecutor::runFusion::evaluateFusionOutputs");
+  NVF_ERROR(
+      outputs.empty(),
+      "Fusion executor is using expression evaluator,",
+      " and expects that the outputs are not populated, which they were.");
   if (outputs.empty()) {
     for (const auto& out_val : fusion()->outputs()) {
       auto out_tensor =
@@ -1861,11 +1861,12 @@
       outputs.emplace_back(out_tensor);
     }
   }
-  args.push(outputs);
   return outputs;
 }
 
 namespace {
+// Host IR specific function; returns the at::Tensor (ordered list) associated
+// with the provided Fusion output tv
 at::Tensor findBufferForFusionOutput(
     const std::vector<at::Tensor>& out_tensors,
     const Val* fusion_out,
@@ -1903,14 +1904,10 @@ std::vector<at::Tensor> FusionExecutor::runFusion(
       " provided number of outputs does not match fusion output");
 
   // Bind fusion inputs
-  ExpressionEvaluator expr_eval;
-  const auto& inputs = fusion()->inputs();
-  for (const auto i : c10::irange(inputs.size())) {
-    expr_eval.bind(inputs[i], *args[i]);
-  }
+  auto expr_eval = executor_utils::bindInputs(args, fusion());
 
   if (isExpressionEvaluated(fusion())) {
-    outputs = evaluateFusionOutputs(args, outputs, expr_eval);
+    outputs = evaluateFusionOutputs(outputs, expr_eval);
     if (isProfilerEnabled()) {
       auto& sprof = FusionProfiler::segment(group_id_);
       sprof.stopKernel();
@@ -1921,8 +1918,8 @@
 
   if (host_ir_container_ != nullptr) {
     if (outputs.empty()) {
-      std::vector<GlobalBufferInfo> output_info = getOutputBufferInfo(
-          args, expr_eval, PrimDataType::Int, host_ir_container_.get());
+      std::vector<GlobalBufferInfo> output_info = getBufferInfos(
+          expr_eval, PrimDataType::Int, host_ir_container_->outputs());
       outputs = allocateOutputs(
           host_ir_container_.get(), output_info, options_.device, expr_eval);
     }
@@ -2009,7 +2006,7 @@
       // Skip trivially forwarded outputs because they are just placeholders
       continue;
     }
-    expr_eval.bind(output, *args[inputs.size() + i]);
+    expr_eval.bind(output, *args[kernel()->inputs().size() + i]);
   }
 
   std::vector<at::Tensor> intermediates;
@@ -2062,7 +2059,7 @@
       intermediates.push_back(intermediate_buffer);
       expr_eval.bind(
           kernel()->summary().global_allocations.at(i)->buffer(),
-          *args[inputs.size() + outputs.size() + i]);
+          *args[kernel()->inputs().size() + outputs.size() + i]);
       if (buf_info.is_profile_buffer) {
         profile_buffer = intermediate_buffer;
       }
csrc/executor.h (8 changes: 6 additions & 2 deletions)

@@ -65,7 +65,7 @@ class FusionExecutor : public NonCopyable {
   //! Notes: 1. This API should ignore aliased outputs instead of
   //! pushing scalar int 0 as a place-holder.
   //! 2. This API does not allocate output in memory, but only returns the
-  //! inferred output sizes.
+  //! inferred output sizes. Used in kernel_cache.cpp.
   KernelArgumentHolder inferOutputSizes(
       Fusion* fusion,
       const KernelArgumentHolder& args,
@@ -118,10 +118,14 @@ class FusionExecutor : public NonCopyable {
 
   //! Computes fusion outputs through expression evaluator.
   std::vector<at::Tensor> evaluateFusionOutputs(
-      KernelArgumentHolder& args,
       std::vector<at::Tensor> outputs,
       ExpressionEvaluator& expr_eval);
 
+  // TODO: args shouldn't come in a reference here because we will append the
+  // outputs to be able to send it to the kernel. For now none of the users are
+  // reconsuming the args, so it is okay. It isn't done now because changing it
+  // from a reference makes a call such as runFusion({}) ambiguous, and that is
+  // used in some places in the codebase.
  NVF_API std::vector<at::Tensor> runFusion(
       KernelArgumentHolder& args,
       const LaunchParams& launch_constraints = LaunchParams(),

Review comment from the author on the TODO:
@jjsjann123 see the TODO here. We don't ever use args after they're updated with the outputs; we always pass the result back as an array of tensors. So it is different behavior from evaluateFusionOutputs, but that seems to be the behavior we should want.

Review comment (Collaborator):
Would be really interesting to see how we can resolve this cleanly, but I don't know any way to do that without changing the call sites to not use a braced-init-list. This gives me a headache: https://en.cppreference.com/w/cpp/language/overload_resolution#Implicit_conversion_sequence_in_list-initialization
I don't know of anything that can help overload resolution here. Wondering if @zasdfgbnm knows any dark magic?
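To see why runFusion({}) would become ambiguous if args stopped being a reference, consider a minimal sketch: two overloads where an empty braced-init-list can list-initialize either parameter type. Both conversions are user-defined, so overload resolution has no basis to prefer one. Holder and run below are stand-ins, not the real KernelArgumentHolder or runFusion signatures.

#include <vector>

// Stand-in for KernelArgumentHolder (illustrative only).
struct Holder {};

// Two hypothetical overloads mirroring the situation described above.
void run(const Holder&) {}            // analogous to a by-value/const-ref args overload
void run(std::vector<int>) {}         // analogous to a tensor-list overload

int main() {
  // run({});  // would not compile: {} can list-initialize both Holder and
  //           // std::vector<int>, so the call is ambiguous.
  run(Holder{});            // OK: spell out the type at the call site
  run(std::vector<int>{});  // OK: spell out the type at the call site
  return 0;
}

With a non-const reference parameter, a braced temporary cannot bind to that overload at all, which is presumably what keeps the existing runFusion({}) call sites unambiguous today.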
csrc/executor_utils.cpp (15 changes: 14 additions & 1 deletion)

@@ -716,7 +716,20 @@ ExpressionEvaluator bindInputs(
     // NOTE: we bind all inputs here, including at::Tensors. This means that
     // expr_eval will create a PolymorphicValue containing *args[i], which means
     // that at::Tensor's lifetime will be at least as long as that of expr_eval.
-    expr_eval.bind(inputs[i], *args[i], true);
+    try {
+      expr_eval.bind(inputs[i], *args[i], true);
+    } catch (const nvfError& e) {
+      std::stringstream ss;
+      ss << "When trying to run the provided host program,"
+         << " there was an error with the provided input " << i
+         << ". Provided input was:\n ";
+      ss << PolymorphicValue_functions::toString(*args[i]);
+      ss << "\n Fusion input is:\n ";
+      ss << inputs[i]->toString();
+      ss << "\n Expr eval provided the error:\n\"\"\"";
+      ss << e.msg() << "\"\"\"\n";
+      NVF_THROW(ss.str());
+    }
   }
 
   return expr_eval;
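The try/catch added above wraps a low-level binding failure with which input failed and what it contained, then rethrows via the new NVF_THROW. The same catch-enrich-rethrow pattern in a self-contained form, with plain standard-library types standing in for nvfError, PolymorphicValue_functions::toString, and NVF_THROW:

#include <iostream>
#include <sstream>
#include <stdexcept>
#include <vector>

// Stand-in for the low-level bind step; throws on a bad input.
void bindOne(double value) {
  if (value < 0) {
    throw std::runtime_error("cannot bind a negative value");
  }
}

// Catch, enrich with context (which input, what it was), and rethrow.
void bindAll(const std::vector<double>& args) {
  for (size_t i = 0; i < args.size(); ++i) {
    try {
      bindOne(args[i]);
    } catch (const std::exception& e) {
      std::stringstream ss;
      ss << "There was an error with the provided input " << i
         << ". Provided input was:\n  " << args[i]
         << "\nOriginal error:\n\"\"\"" << e.what() << "\"\"\"\n";
      throw std::runtime_error(ss.str());
    }
  }
}

int main() {
  try {
    bindAll({1.0, -2.0, 3.0});
  } catch (const std::exception& e) {
    std::cout << e.what();  // mentions input 1 and the original message
  }
  return 0;
}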