Commit ceba5c4: Permute the inputs/outputs of runFusionWithInputs.

wujingyue committed Nov 5, 2024 · 1 parent e8a3608

Showing 3 changed files with 88 additions and 49 deletions.
91 changes: 47 additions & 44 deletions csrc/runtime/allocations.cpp
@@ -10,6 +10,7 @@

#include <expr_evaluator.h>
#include <instrumentation.h>
#include <ir/utils.h>
#include <polymorphic_value.h>
#include <runtime/executor_kernel_arg.h>
#include <runtime/executor_utils.h>
@@ -676,50 +677,6 @@ class BackwardTraverseFromAllocToLogical {
}
};

// Start from a tensor whose dimensions are consistent with the allocation
// domain of tv, and apply a sequence of view/permute operations to transform
// it into a tensor whose dimensions are consistent with the logical domain of
// tv. For example, if the logical domain is [I1, I2] and the allocation
// domain is [I2*I1], we allocate as [I2*I1] and then do a
// tensor.view(I2, I1).t() to get a tensor whose semantics is [I1, I2] but
// whose memory is [I2*I1]. As another example, if the logical domain is
// [I1*I2] and the allocation domain is [I1, I2], we allocate as [I1, I2] and
// do a tensor.view(I1*I2) to get a tensor whose semantics is [I1*I2] but
// whose memory is [I1, I2].
at::Tensor transformOutputFromAllocationToLogical(
at::Tensor tensor,
TensorView* tv,
ExpressionEvaluator& ee) {
FUSER_PERF_SCOPE(
"fusion_executor::allocations::transformOutputFromAllocationToLogical");
// Ignore reductions because reduction domains do not exist in the tensor's
// definition.
auto logical = TensorDomain::noReductions(tv->getLogicalDomain());
auto alloc = TensorDomain::noReductions(tv->getMaybeAllocationDomain());
// Traverse all affine transformations from the allocation domain. Because the
// allocation domain can be before or after the logical domain, we need both a
// forward and a backward traversal.
std::list<IterDomain*> frontier(alloc.begin(), alloc.end());
NVF_ERROR(tensor.dim() == (int64_t)frontier.size());
tensor = ForwardTraverseFromAllocToLogical(tensor, ee, frontier)
.run(logical, alloc);
tensor = BackwardTraverseFromAllocToLogical(tensor, ee, frontier)
.run(logical, alloc);
NVF_ERROR(frontier.size() == logical.size());
// Now that all affine transformations are handled, frontier should contain
// the same set of IDs as logical. We still need a final permutation so that
// their orders are also consistent.
std::unordered_map<IterDomain*, int64_t> current_dims;
int64_t counter = 0;
for (auto id : frontier) {
current_dims[id] = counter++;
}
std::vector<int64_t> dims;
dims.reserve(frontier.size());
for (auto id : logical) {
dims.emplace_back(current_dims.at(id));
}
return tensor.permute(dims);
}

} // namespace

@@ -766,4 +723,50 @@ std::pair<std::vector<int64_t>, std::vector<int64_t>> inferShapeOfOutput(
return {meta_tensor.sizes().vec(), meta_tensor.strides().vec()};
}

at::Tensor transformOutputFromAllocationToLogical(
at::Tensor tensor,
TensorView* tv,
ExpressionEvaluator& ee) {
FUSER_PERF_SCOPE("allocations::transformOutputFromAllocationToLogical");
// Ignore reductions because reduction domains do not exist in the tensor's
// definition.
auto logical = TensorDomain::noReductions(tv->getLogicalDomain());
auto alloc = TensorDomain::noReductions(tv->getMaybeAllocationDomain());
// Traverse all affine transformations from the allocation domain. Because the
// allocation domain can be before or after the logical domain, we need both a
// forward and a backward traversal.
std::list<IterDomain*> frontier(alloc.begin(), alloc.end());
NVF_ERROR(tensor.dim() == (int64_t)frontier.size());
tensor = ForwardTraverseFromAllocToLogical(tensor, ee, frontier)
.run(logical, alloc);
tensor = BackwardTraverseFromAllocToLogical(tensor, ee, frontier)
.run(logical, alloc);
NVF_ERROR(frontier.size() == logical.size());
// Now that all affine transformations are handled, frontier should contain
// the same set of IDs as logical. We still need a final permutation so that
// their orders are also consistent.
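// For example, if frontier is {I2, I1} and logical is [I1, I2], then
// current_dims maps I2 -> 0 and I1 -> 1, dims becomes {1, 0}, and the final
// permute swaps the two axes.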
std::unordered_map<IterDomain*, int64_t> current_dims;
int64_t counter = 0;
for (auto id : frontier) {
current_dims[id] = counter++;
}
std::vector<int64_t> dims;
dims.reserve(frontier.size());
for (auto id : logical) {
dims.emplace_back(current_dims.at(id));
}
return tensor.permute(dims);
}

at::Tensor transformFromLogicalToAllocation(at::Tensor tensor, TensorView* tv) {
FUSER_PERF_SCOPE("allocations::transformLogicalToAllocation");
// Ignore reductions because reduction dimensions are not allocated in
// `tensor`.
auto logical = TensorDomain::noReductions(tv->getLogicalDomain());
auto alloc = TensorDomain::noReductions(tv->getMaybeAllocationDomain());

std::vector<int64_t> permutation =
*ir_utils::computePermutation(logical, alloc);
return tensor.permute(permutation);
}

} // namespace nvfuser
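
As a concrete illustration of the new transformFromLogicalToAllocation helper, here is a minimal standalone sketch using plain ATen (the permutation {2, 0, 1} is hypothetical; it is what computePermutation would produce for logical [I1, I2, I3] and allocation [I3, I1, I2]):

#include <ATen/ATen.h>

// Hypothetical standalone analogue of transformFromLogicalToAllocation for
// logical [I1, I2, I3] and allocation [I3, I1, I2]: reorder a logical-layout
// tensor into allocation order using the precomputed permutation {2, 0, 1}.
at::Tensor logicalToAllocation(const at::Tensor& logical_tensor) {
  return logical_tensor.permute({2, 0, 1});
}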
16 changes: 16 additions & 0 deletions csrc/runtime/allocations.h
@@ -85,4 +85,20 @@ std::vector<GlobalBufferInfo> getBufferInfos(
DataType index_dtype,
const std::vector<Val*>& fusion_outputs);

// Start from a tensor whose dimensions are consistent with the allocation
// domain of tv, and apply a sequence of view/permute operations to transform
// it into a tensor whose dimensions are consistent with the logical domain of
// tv. For example, if the logical domain is [I1, I2] and the allocation
// domain is [I2*I1], we allocate as [I2*I1] and then do a
// tensor.view(I2, I1).t() to get a tensor whose semantics is [I1, I2] but
// whose memory is [I2*I1]. As another example, if the logical domain is
// [I1*I2] and the allocation domain is [I1, I2], we allocate as [I1, I2] and
// do a tensor.view(I1*I2) to get a tensor whose semantics is [I1*I2] but
// whose memory is [I1, I2].
at::Tensor transformOutputFromAllocationToLogical(
at::Tensor tensor,
TensorView* tv,
ExpressionEvaluator& ee);

at::Tensor transformFromLogicalToAllocation(at::Tensor tensor, TensorView* tv);

} // namespace nvfuser
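
To make the header comment's first example concrete, here is a minimal ATen-only sketch (assuming extents I1 and I2; no nvFuser machinery involved):

#include <ATen/ATen.h>

// Allocate in allocation order [I2*I1], then view and transpose so the result
// has logical semantics [I1, I2] while its memory stays [I2*I1].
at::Tensor allocateThenViewExample(int64_t I1, int64_t I2) {
  at::Tensor alloc = at::empty({I2 * I1}, at::kFloat);  // allocation domain
  return alloc.view({I2, I1}).t();  // shape [I1, I2], strides [1, I1]
}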
30 changes: 25 additions & 5 deletions csrc/runtime/fusion_executor_cache.cpp
@@ -84,22 +84,30 @@ std::vector<at::Tensor> FusionExecutorCache::runFusionWithInputs(
int seq_id = 0;
// Record kernel input and output tensors so profiler can construct
// the data flow graph
// FIXME: the inputs should be changed to match the allocation domain.
RECORD_FUNCTION(
"run_fused_kernel",
std::vector<c10::IValue>(inputs.begin(), inputs.end()),
seq_id);
auto outputs = kernel_runtime->runWithInputs(args);
RECORD_OUTPUTS(outputs);
NVF_ERROR(fusion->outputs().size() == outputs.size());

// Kernel time measurement is off by default
kernel_runtime->disableKernelTimeMeasurement();

ExpressionEvaluator evaluator;
for (const auto out_index : c10::irange(outputs.size())) {
at::Tensor& output = outputs[out_index];
auto* out = fusion->outputs()[out_index]->as<TensorView>();
output = transformOutputFromAllocationToLogical(output, out, evaluator);
}

// Remove aliased outputs, since those are updated in place by the fusion. It
// is not semantically correct to actually return them as outputs from the
// fusion.
NVF_ERROR(fusion->outputs().size() == outputs.size());
size_t new_size = 0;
for (size_t out_index = 0; out_index < outputs.size(); out_index++) {
for (const auto out_index : c10::irange(outputs.size())) {
Val* out = fusion->outputs()[out_index];
if (!fusion->getOutputAlias(out).hide_output) {
outputs[new_size] = outputs[out_index];
@@ -124,8 +132,20 @@ KernelArgumentHolder FusionExecutorCache::prepareInputs(
std::optional<int8_t> selected_device) {
FUSER_PERF_SCOPE("FusionExecutorCache::prepareInputs");

KernelArgumentHolder args =
KernelArgumentHolder::createKernelArgumentHolder(inputs, selected_device);
std::vector<c10::IValue> inputs_matching_allocation;
inputs_matching_allocation.reserve(inputs.size());
for (const auto i : c10::irange(inputs.size())) {
const auto& input = inputs[i];
if (!input.isTensor()) {
inputs_matching_allocation.push_back(input);
continue;
}
inputs_matching_allocation.push_back(transformFromLogicalToAllocation(
input.toTensor(), fusion_->inputs()[i]->as<TensorView>()));
}

KernelArgumentHolder args = KernelArgumentHolder::createKernelArgumentHolder(
inputs_matching_allocation, selected_device);

// TODO: move InputsIdLookup inside KernelArgumentHolder;
// NOTE: We must ensure that the cache id is in fact unique. Dynamic fusions
Expand All @@ -136,7 +156,7 @@ KernelArgumentHolder FusionExecutorCache::prepareInputs(
// short-circuiting here, resulting in avoidable rebuilds of concretization
// info.
auto id_lookup_ret = inputs_id_lookup_.lookupId(
inputs,
inputs_matching_allocation,
initialInfo().scalarInputsAffectingConcretization(),
args.getDeviceIndex());
if (id_lookup_ret.eviction) {
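Taken together, the two call-site changes form a round trip: prepareInputs permutes tensor inputs from logical order to allocation order, the kernel runs on allocation-ordered tensors, and runFusionWithInputs transforms each output back to logical order. Below is a standalone ATen sketch of that round trip, using a plain 2-D permutation as a stand-in for real allocation domains (all names are illustrative, not the FusionExecutorCache API):

#include <ATen/ATen.h>

// Stand-in for transformFromLogicalToAllocation with logical [I1, I2] and
// allocation [I2, I1].
at::Tensor toAllocation(const at::Tensor& logical) {
  return logical.permute({1, 0});
}

// Stand-in for transformOutputFromAllocationToLogical: the inverse permute.
at::Tensor toLogical(const at::Tensor& alloc) {
  return alloc.permute({1, 0});
}

int main() {
  at::Tensor in = at::randn({3, 4});          // logical order [I1=3, I2=4]
  at::Tensor kernel_in = toAllocation(in);    // the kernel sees [4, 3]
  at::Tensor kernel_out = kernel_in.clone();  // pretend the kernel ran
  at::Tensor out = toLogical(kernel_out);     // the caller sees [3, 4] again
  return out.sizes() == in.sizes() ? 0 : 1;
}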
