Minor cleanup in executor.cpp #2750

Merged Sep 8, 2024 (16 commits). Showing changes from 1 commit.
17 changes: 10 additions & 7 deletions csrc/exceptions.h
@@ -253,17 +253,20 @@ inline const char* nvfCheckMsgImpl(const char* /*msg*/, const char* args) {
 #define STRINGIZE_IMPL(x) #x
 #define STRINGIZE(x) STRINGIZE_IMPL(x)
 
-#define NVF_ERROR(cond, ...) \
-  if ((!(cond))) { \
-    nvfuser::nvfErrorFail( \
+#define NVF_THROW(...) \
+  nvfuser::nvfErrorFail( \
       __FUNCTION__, \
       __FILE__, \
       static_cast<uint32_t>(__LINE__), \
-      #cond " INTERNAL ASSERT FAILED at " \
-      STRINGIZE(__FILE__) ":" STRINGIZE(__LINE__) \
+      " INTERNAL ASSERT FAILED at " \
+      STRINGIZE(__FILE__) ":" STRINGIZE(__LINE__) \
       ", please report a bug with repro script to NVFuser at " \
-      "https://github.com/NVIDIA/Fuser/issues. ", \
-      nvfuser::to_str(__VA_ARGS__)); \
+      "https://github.com/NVIDIA/Fuser/issues. ", \
+      nvfuser::to_str(__VA_ARGS__));
+
+#define NVF_ERROR(cond, ...) \
+  if ((!(cond))) { \
+    NVF_THROW(__VA_ARGS__) \
   }
 
 #define NVF_CHECK_MSG(cond, type, ...) \

Review comment (author), on the new NVF_THROW: Wanted a throw to be able to get some better error messages in bindInputs.
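A minimal usage sketch of how the two macros now relate (not part of the PR; checkRank is a hypothetical helper, and the snippet assumes csrc/exceptions.h is in scope). NVF_ERROR guards a condition, while NVF_THROW fires unconditionally, for paths where the failure has already been detected and a tailored message is wanted, which is exactly how the bindInputs change below uses it:

// Hypothetical example, not from this PR; assumes csrc/exceptions.h.
void checkRank(int64_t expected_rank, int64_t actual_rank) {
  // Guarded assert: only fires when the condition is false.
  NVF_ERROR(expected_rank >= 0, "rank must be non-negative: ", expected_rank);
  if (expected_rank != actual_rank) {
    // Unconditional throw: the failure was already detected above.
    NVF_THROW("rank mismatch: expected ", expected_rank, ", got ", actual_rank);
  }
}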
94 changes: 45 additions & 49 deletions csrc/executor.cpp
@@ -1353,36 +1353,38 @@ std::vector<FusionExecutor::GlobalBufferInfo> FusionExecutor::
 
 namespace {
 
+FusionExecutor::GlobalBufferInfo getBufferInfo(
+    ExpressionEvaluator& expr_eval,
+    DataType index_dtype,
+    TensorView* tv) {
+  FusionExecutor::GlobalBufferInfo info;
+  info.tv = tv;
+  std::tie(info.sizes, info.strides) = inferShapeOfOutput(info.tv, expr_eval);
+  auto dtype =
+      (info.tv->dtype() == DataType::Index ? index_dtype : info.tv->dtype());
+  info.type = data_type_to_aten(dtype);
+  return info;
+}
+
 //! Return information necessary for allocating output tensors. Input
 //! and output tensors are allowed to alias each other, which is
 //! specified by the list of int pairs of input and output indices
-std::vector<FusionExecutor::GlobalBufferInfo> getOutputBufferInfo(
-    const KernelArgumentHolder& args,
+std::vector<FusionExecutor::GlobalBufferInfo> getBufferInfos(
     ExpressionEvaluator& expr_eval,
     DataType index_dtype,
-    const Fusion* fusion) {
+    const std::vector<Val*>& fusion_outputs) {
   FUSER_PERF_SCOPE("FusionExecutor::getOutbufferInfo");
-  std::vector<FusionExecutor::GlobalBufferInfo> outputs;
-  outputs.reserve(fusion->outputs().size());
-  NVF_ERROR(
-      args.size() == fusion->inputs().size(),
-      "fusion arguments length does not match runtime arguments.");
-  for (const auto out_i : c10::irange(fusion->outputs().size())) {
-    auto out_val = fusion->outputs()[out_i];
+  std::vector<FusionExecutor::GlobalBufferInfo> output_buffer_infos;
+  output_buffer_infos.reserve(fusion_outputs.size());
+  for (const auto out : fusion_outputs) {
     NVF_ERROR(
-        out_val->isA<TensorView>(),
+        out->isA<TensorView>(),
         "Cannot allocate outputs that are not tensors.");
-
-    FusionExecutor::GlobalBufferInfo info;
-    info.tv = out_val->as<TensorView>();
-    std::tie(info.sizes, info.strides) = inferShapeOfOutput(info.tv, expr_eval);
-    auto dtype =
-        (info.tv->dtype() == DataType::Index ? index_dtype : info.tv->dtype());
-    info.type = data_type_to_aten(dtype);
-
-    outputs.emplace_back(info);
+    output_buffer_infos.emplace_back(
+        getBufferInfo(expr_eval, index_dtype, out->as<TensorView>()));
   }
-  return outputs;
+  return output_buffer_infos;
 }
 
 } // namespace
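Taking an ExpressionEvaluator plus an explicit list of output Vals, rather than a KernelArgumentHolder and a Fusion*, lets the same helper serve fusion->outputs(), lowered_->kernel()->outputs(), and host_ir_container_.get()->outputs(), as the hunks below show; the per-tensor logic now lives in the single-purpose getBufferInfo.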
@@ -1395,7 +1397,7 @@ std::vector<at::Tensor> allocOutputSpace(
   auto expr_eval = executor_utils::bindInputs(fusion_inputs, fusion);
 
   auto output_info =
-      getOutputBufferInfo(fusion_inputs, expr_eval, PrimDataType::Int, fusion);
+      getBufferInfos(expr_eval, PrimDataType::Int, fusion->outputs());
 
   return allocateOutputs(fusion, output_info, device, expr_eval);
 }
@@ -1427,8 +1429,8 @@ KernelArgumentHolder FusionExecutor::inferOutputSizes(
 
   auto arg_index_type = args.getSmallestIndexTypeOfArguments();
 
-  KernelArgumentHolder ret;
-  ret.setDeviceIndex(args.getDeviceIndex());
+  KernelArgumentHolder output_tensor_proxies;
+  output_tensor_proxies.setDeviceIndex(args.getDeviceIndex());
 
   for (Val* output : fusion->outputs()) {
     NVF_ERROR(
@@ -1439,9 +1441,9 @@ KernelArgumentHolder FusionExecutor::inferOutputSizes(
     const auto dtype = (output_tv->dtype() == DataType::Index)
         ? data_type_to_aten(arg_index_type)
         : data_type_to_aten(output_tv->dtype());
-    ret.pushTensorProxy(sizes, strides, dtype);
+    output_tensor_proxies.pushTensorProxy(sizes, strides, dtype);
   }
-  return ret;
+  return output_tensor_proxies;
 }
 
 namespace {
Expand Down Expand Up @@ -1553,15 +1555,6 @@ void dumpKernelArgs(
}
}

FusionExecutor::GlobalBufferInfo getGlobalBufferAllocationInfo(
const at::Tensor& at_tensor) {
FusionExecutor::GlobalBufferInfo info{
.sizes = at_tensor.sizes().vec(),
.strides = at_tensor.strides().vec(),
.type = at_tensor.scalar_type()};
return info;
}

} // namespace

@@ -1591,13 +1584,16 @@ void FusionExecutor::initializeExecutorEntry(
 
   if (outputs.empty()) {
     output_info =
-        getOutputBufferInfo(args, expr_eval, index_type, lowered_->kernel());
+        getBufferInfos(expr_eval, index_type, lowered_->kernel()->outputs());
   } else {
     // Need to save the information necessary for allocations as
     // future uses of this ExecutorEntry may not be provided with
     // allocated outputs
     for (const auto& output : outputs) {
-      output_info.emplace_back(getGlobalBufferAllocationInfo(output));
+      output_info.emplace_back(FusionExecutor::GlobalBufferInfo{
+          .sizes = output.sizes().vec(),
+          .strides = output.strides().vec(),
+          .type = output.scalar_type()});
     }
   }
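With getGlobalBufferAllocationInfo removed (previous hunk), its only call site now builds the GlobalBufferInfo in place with designated initializers, keeping the captured fields (sizes, strides, scalar type) visible right where they are saved.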

@@ -1853,22 +1849,26 @@ void FusionExecutor::resetCompiledKernelProperties() {
 }
 
 std::vector<at::Tensor> FusionExecutor::evaluateFusionOutputs(
-    KernelArgumentHolder& args,
     std::vector<at::Tensor> outputs,
     ExpressionEvaluator& expr_eval) {
+  // TODO: Add relevant profiling code.
   FUSER_PERF_SCOPE("FusionExecutor::runFusion::evaluateFusionOutputs");
+  NVF_ERROR(
+      outputs.empty(),
+      "Fusion executor is using expression evaluator,",
+      " and expects that the outputs are not populated, which they were.");
   if (outputs.empty()) {
     for (const auto& out_val : fusion()->outputs()) {
       auto out_tensor =
           expr_eval.evaluate(out_val->as<TensorView>()).as<at::Tensor>();
       outputs.emplace_back(out_tensor);
     }
   }
-  args.push(outputs);
   return outputs;
 }
 
 namespace {
+// Host IR specific function, returns the at::Tensor (ordered list) associated
+// with the provided Fusion output tv
 at::Tensor findBufferForFusionOutput(
     const std::vector<at::Tensor>& out_tensors,
     const Val* fusion_out,
@@ -1906,14 +1906,10 @@ std::vector<at::Tensor> FusionExecutor::runFusion(
       " provided number of outputs does not match fusion output");
 
   // Bind fusion inputs
-  ExpressionEvaluator expr_eval;
-  const auto& inputs = fusion()->inputs();
-  for (const auto i : c10::irange(inputs.size())) {
-    expr_eval.bind(inputs[i], *args[i]);
-  }
+  auto expr_eval = executor_utils::bindInputs(args, fusion());
 
   if (isExpressionEvaluated(fusion())) {
-    outputs = evaluateFusionOutputs(args, outputs, expr_eval);
+    outputs = evaluateFusionOutputs(outputs, expr_eval);
     if (isProfilerEnabled()) {
       auto& sprof = FusionProfiler::segment(group_id_);
       sprof.stopKernel();
@@ -1924,8 +1920,8 @@ std::vector<at::Tensor> FusionExecutor::runFusion(
 
   if (host_ir_container_ != nullptr) {
     if (outputs.empty()) {
-      std::vector<GlobalBufferInfo> output_info = getOutputBufferInfo(
-          args, expr_eval, PrimDataType::Int, host_ir_container_.get());
+      std::vector<GlobalBufferInfo> output_info = getBufferInfos(
+          expr_eval, PrimDataType::Int, host_ir_container_.get()->outputs());
       outputs = allocateOutputs(
           host_ir_container_.get(), output_info, options_.device, expr_eval);
     }
@@ -2012,7 +2008,7 @@ std::vector<at::Tensor> FusionExecutor::runFusion(
       // Skip trivially forwarded outputs because they are just placeholders
      continue;
     }
-    expr_eval.bind(output, *args[inputs.size() + i]);
+    expr_eval.bind(output, *args[kernel()->inputs().size() + i]);
   }
 
   std::vector<at::Tensor> intermediates;
@@ -2066,7 +2062,7 @@ std::vector<at::Tensor> FusionExecutor::runFusion(
     intermediates.push_back(intermediate_buffer);
     expr_eval.bind(
         kernel()->summary().global_allocations.at(i)->buffer(),
-        *args[inputs.size() + outputs.size() + i]);
+        *args[kernel()->inputs().size() + outputs.size() + i]);
     if (buf_info.is_profile_buffer) {
       profile_buffer = intermediate_buffer;
     }
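Note the index arithmetic in the last two hunks: since the manual binding loop was replaced by executor_utils::bindInputs, the local inputs alias no longer exists, so argument offsets are computed from kernel()->inputs().size() directly.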
8 changes: 6 additions & 2 deletions csrc/executor.h
@@ -65,7 +65,7 @@ class FusionExecutor : public NonCopyable {
   //! Notes: 1. This API should ignore aliased outputs instead of
   //! pushing scalar int 0 as a place-holder.
   //! 2. This API does not allocate output in memory, but only returns the
-  //! inferred output sizes.
+  //! inferred output sizes. Used in kernel_cache.cpp.
   KernelArgumentHolder inferOutputSizes(
       Fusion* fusion,
       const KernelArgumentHolder& args,
@@ -118,10 +118,14 @@ class FusionExecutor : public NonCopyable {
 
   //! Computes fusion outputs through expression evaluator.
   std::vector<at::Tensor> evaluateFusionOutputs(
-      KernelArgumentHolder& args,
       std::vector<at::Tensor> outputs,
       ExpressionEvaluator& expr_eval);
 
+  // TODO: args shouldn't come in as a reference here because we will append
+  // the outputs to be able to send them to the kernel. For now none of the
+  // users reconsume the args, so it is okay. It isn't done now because
+  // changing it from a reference makes a call such as runFusion({}) ambiguous,
+  // and that is used in some places in the codebase.
   NVF_API std::vector<at::Tensor> runFusion(
       KernelArgumentHolder& args,
       const LaunchParams& launch_constraints = LaunchParams(),

Review comment (author): @jjsjann123 see the TODO here. We don't ever use args after they're updated with the outputs; we always pass the result back as an array of tensors. So it is different behavior from evaluateFusionOutputs, but that seems to be the behavior we should want.

Review comment (reviewer): Would be really interesting to see how we can resolve this cleanly, but I don't know any way to do it without changing the call sites to not use a braced-init-list. This gives me a headache: https://en.cppreference.com/w/cpp/language/overload_resolution#Implicit_conversion_sequence_in_list-initialization. I don't know anything that can help overload resolution here; wondering if @zasdfgbnm knows any dark magic?
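The hazard described in the TODO and the thread above can be reproduced in isolation. A self-contained sketch with stand-in types (Holder is not the real KernelArgumentHolder, and run is not the real runFusion):

#include <vector>

struct Holder {};  // stand-in only, not the real KernelArgumentHolder

void run(Holder&) {}                    // (1) non-const lvalue reference
void run(const std::vector<int>&) {}    // (2)

// void run(Holder) {}                  // (1') pass-by-value variant of (1)

int main() {
  // OK today: the temporary created for {} cannot bind to the non-const
  // reference in (1), so (1) is not viable and (2) is chosen.
  run({});
  // With (1') in place of (1), {} reaches both overloads through equally
  // ranked list-initialization conversion sequences, and run({}) fails to
  // compile as ambiguous. That is why args stays a reference for now.
  return 0;
}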
13 changes: 12 additions & 1 deletion csrc/executor_utils.cpp
@@ -705,7 +705,18 @@ ExpressionEvaluator bindInputs(
     // NOTE: we bind all inputs here, including at::Tensors. This means that
     // expr_eval will create a PolymorphicValue containing *args[i], which means
     // that at::Tensor's lifetime will be at least as long as that of expr_eval.
-    expr_eval.bind(inputs[i], *args[i], true);
+    try {
+      expr_eval.bind(inputs[i], *args[i], true);
+    } catch (const nvfError& e) {
+      std::stringstream ss;
+      ss << "When trying to run the provided host program,"
+         << " there was an error with the provided input " << i
+         << ". Provided input was:\n  ";
+      ss << PolymorphicValue_functions::toString(*args[i]);
+      ss << "\n which does not match the expected input:\n  ";
+      ss << inputs[i]->toString() << "\n";
+      NVF_THROW(ss.str());
+    }
   }
 
   return expr_eval;

Review comment (reviewer): Is e.what() worth keeping in the new exception message? It has the original source code location at least, which I think helps debugging.

Review comment (author): In this instance I didn't think it was particularly helpful, as it's the caller that we would typically want to point at, not the inside of the expression evaluator, which is merely the first place to detect the mismatch. I think we'd actually want the error to be thrown higher, where bindInputs is called, as I expect most instances that fail do so because of bad inputs. WDYT?

Review comment (reviewer): I trust your judgement -- I haven't run into enough errors to have a strong opinion. AFAICT, ExpressionEvaluator::bind is a deep function that can fail at "Could not evaluate metadata expression for ". While I agree with you that most failures come from bad inputs, why an input is bad might not be immediately clear to the user.

Review comment (author): I agree; that's why I lifted the error to the common place where we bind a batch of inputs to the expression evaluator, provided from somewhere not generated by nvFuser (developers in tests, and Thunder in integration).
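For readers outside the codebase, a self-contained sketch of the pattern (std::runtime_error stands in for nvfError and NVF_THROW; bindOne and bindAll are illustrative names only). As in the PR, the rethrow drops the original e.what() and reports the offending input instead:

#include <iostream>
#include <sstream>
#include <stdexcept>
#include <vector>

// Low-level bind: fails with a message about evaluator internals.
void bindOne(size_t /*i*/, double extent) {
  if (extent < 0) {
    throw std::runtime_error("could not evaluate metadata expression");
  }
}

// Caller-level wrapper: catch the low-level failure and rethrow with context
// that names the bad input, mirroring the bindInputs change above.
void bindAll(const std::vector<double>& args) {
  for (size_t i = 0; i < args.size(); ++i) {
    try {
      bindOne(i, args[i]);
    } catch (const std::exception&) {
      std::stringstream ss;
      ss << "there was an error with the provided input " << i
         << ". Provided input was:\n  " << args[i];
      throw std::runtime_error(ss.str());
    }
  }
}

int main() {
  try {
    bindAll({2.0, -1.0});
  } catch (const std::exception& e) {
    std::cerr << e.what() << "\n";  // points at input 1, not at bindOne()
  }
  return 0;
}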
7 changes: 0 additions & 7 deletions csrc/kernel_cache.h
@@ -670,13 +670,6 @@ class FusionExecutorCache {
   //! Deserialize Fusion Executor Cache using flatbuffers
   void deserialize(const serde::FusionExecutorCache* buffer, int64_t fusion_id);
 
-  //! Allocate the outputs of the Fusion given inputs
-  //! TODO: re-implement
-  std::vector<at::Tensor> allocOutputSpace(
-      const at::ArrayRef<c10::IValue>& inputs) {
-    return runFusionWithInputs(inputs);
-  }
-
  private:
   //! evict cached short cut entry in `code_to_fe_lookup_` as well as cached
   //! entry in `FusionExecutor`
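This drops FusionExecutorCache::allocOutputSpace, a TODO-marked stub that only forwarded to runFusionWithInputs; the free function allocOutputSpace in executor.cpp (updated above) is unaffected.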