Permute the inputs/outputs of runFusionWithInputs. #3342

Draft. Wants to merge 3 commits into base: main.
csrc/dynamic_transform.cpp (6 changes: 0 additions & 6 deletions)
@@ -1048,12 +1048,6 @@ void DynamicTransformConcretizer::mutate(TensorView* tv) {
// check the root to logical transforms to be sure we have concretized any
// intermediate IterDomains.

- // At this point, there should be no expr beyond rfactor root
- NVF_ERROR(
-     tv->getLoopDomain() == tv->getLogicalDomain(),
-     "Invalid tensor: ",
-     tv->toString());

// If it has a root domain, the IterTypes of the logical
// IDs may need to be updated as well. Traverse the rfactor exprs
// and mutate the IterTypes of output IDs if symbolic.
csrc/expr_evaluator.cpp (25 changes: 13 additions & 12 deletions)
@@ -60,7 +60,8 @@ void validateValWithConcreteValue(
concrete_value);
const auto& t = concrete_value.as<at::Tensor>();
auto expect_dim =
- (int64_t)TensorDomain::noReductions(tv->getLogicalDomain()).size();
+ (int64_t)TensorDomain::noReductions(tv->getMaybeAllocationDomain())
+     .size();
NVF_CHECK(
t.dim() == expect_dim,
"Expected ",
@@ -133,21 +134,22 @@ void ExpressionEvaluator::bindTensorDomain(
const TensorView* tv,
const at::Tensor& t,
const bool evaluate_validate) {
- auto logical_domain = TensorDomain::noReductions(tv->getLogicalDomain());
+ auto alloc_domain =
+     TensorDomain::noReductions(tv->getMaybeAllocationDomain());
NVF_ERROR(
- t.dim() == (int64_t)logical_domain.size(),
+ t.dim() == (int64_t)alloc_domain.size(),
"Expected ",
getInputPosString(tv),
tv->toString(),
", to be bound to a tensor of rank ",
- logical_domain.size(),
+ alloc_domain.size(),
", but got a tensor of rank ",
t.dim());
for (auto i : c10::irange(t.dim())) {
- auto id = logical_domain[i];
+ auto id = alloc_domain[i];
if (id->isBroadcast()) {
// DIDs are ignored for broadcast.
- bind_(logical_domain[i]->extent(), 1, evaluate_validate);
+ bind_(alloc_domain[i]->extent(), 1, evaluate_validate);
if (id->hasExpandedExtent()) {
// Verify that t is also expanded
NVF_ERROR(
@@ -164,11 +166,10 @@
t.stride(i),
" in dimension ",
i);
- bind_(
-     logical_domain[i]->expandedExtent(), t.size(i), evaluate_validate);
+ bind_(alloc_domain[i]->expandedExtent(), t.size(i), evaluate_validate);
}
} else {
- if (logical_domain[i]->isDeviceDim()) {
+ if (alloc_domain[i]->isDeviceDim()) {
// Currently we have the restrictions:
// (1) Device parallelized axis extent == DeviceMesh's extent
// (2) Device parallelized axis cannot be split or merged
@@ -191,12 +192,12 @@
getInputPosString(tv),
" has an empty DeviceMesh with DID parallelization")
bind_(
- logical_domain[i]->extent(),
+ alloc_domain[i]->extent(),
static_cast<int64_t>(
- tv->getDeviceMesh().size(logical_domain[i]->getParallelType())),
+ tv->getDeviceMesh().size(alloc_domain[i]->getParallelType())),
evaluate_validate);
} else {
- bind_(logical_domain[i]->extent(), t.size(i), evaluate_validate);
+ bind_(alloc_domain[i]->extent(), t.size(i), evaluate_validate);
}
}
}
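The rewritten binding walks the allocation domain instead of the logical domain, matching the inputs that `prepareInputs` (below) now permutes into allocation order before binding. A minimal ATen-only sketch of that ordering invariant, with made-up extents and domain names:

```cpp
// ATen-only sketch (hypothetical extents) of the invariant the new binding
// code relies on: the bound tensor is laid out in allocation order, so its
// sizes pair up positionally with the allocation IterDomains.
#include <ATen/ATen.h>

int main() {
  // logical:    [I0{4}, I1{8}]
  // allocation: [I1, I0]  (the logical domain, permuted)
  at::Tensor logical_t = at::randn({4, 8});
  at::Tensor alloc_t = logical_t.permute({1, 0}); // sizes {8, 4}

  // Binding walks the allocation domain: position 0 binds I1's extent to
  // alloc_t.size(0) == 8, and position 1 binds I0's extent to 4. Binding the
  // same tensor against the logical domain would instead pair I0 with 8 and
  // fail extent validation.
  return 0;
}
```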
csrc/runtime/allocations.cpp (102 changes: 51 additions & 51 deletions)
@@ -10,6 +10,7 @@

#include <expr_evaluator.h>
#include <instrumentation.h>
+ #include <ir/utils.h>
#include <polymorphic_value.h>
#include <runtime/executor_kernel_arg.h>
#include <runtime/executor_utils.h>
@@ -404,7 +405,7 @@ namespace {

class ForwardTraverseFromAllocToLogical {
at::Tensor tensor_;
- ExpressionEvaluator& ee_;
+ const ExpressionEvaluator& ee_;
std::list<IterDomain*>& frontier_;

// Forward traverse split from allocation to logical. Needs to, for example,
@@ -521,7 +522,7 @@
public:
ForwardTraverseFromAllocToLogical(
at::Tensor tensor,
- ExpressionEvaluator& ee,
+ const ExpressionEvaluator& ee,
std::list<IterDomain*>& frontier)
: tensor_(std::move(tensor)), ee_(ee), frontier_(frontier) {}

@@ -541,7 +542,7 @@
// transformations.
class BackwardTraverseFromAllocToLogical {
at::Tensor tensor_;
- ExpressionEvaluator& ee_;
+ const ExpressionEvaluator& ee_;
std::list<IterDomain*>& frontier_;

// Backward traverse split from allocation to logical. Needs to, for example,
@@ -645,7 +646,7 @@
public:
BackwardTraverseFromAllocToLogical(
at::Tensor tensor,
- ExpressionEvaluator& ee,
+ const ExpressionEvaluator& ee,
std::list<IterDomain*>& frontier)
: tensor_(std::move(tensor)), ee_(ee), frontier_(frontier) {}

@@ -662,49 +663,6 @@
}
};
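Both traversal helpers realize split and merge IterDomain expressions between the allocation and logical domains as tensor reshapes. A hedged ATen-only sketch of the idea with made-up extents; the real classes walk IterDomain expressions and evaluate extents through the ExpressionEvaluator:

```cpp
// ATen-only sketch: a split or merge between domains becomes a view on the
// corresponding tensor dimension(s). Extents are hypothetical.
#include <ATen/ATen.h>

int main() {
  // Split: allocation ID I{6} is split into [I_out{2}, I_in{3}] on the way
  // to the logical domain; the traversal views dimension I{6} as {2, 3}.
  at::Tensor a = at::randn({6});
  at::Tensor a_split = a.view({2, 3});

  // Merge: allocation IDs [I1{2}, I2{3}] are merged into I1*I2{6}; the
  // traversal flattens the two adjacent dimensions with a view.
  at::Tensor b = at::randn({2, 3});
  at::Tensor b_merged = b.view({6});
  return 0;
}
```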

- // Start from a tensor whose dimensions are consistent with the allocation
- // domain of tv, apply a sequence of view/permute to the tensor to transform it
- // into a format whose dimensions are consistent with the logical domain of tv.
- // For example, if the logical domain is [I1, I2], and the allocation domain is
- // [I2*I1], then we will allocate as [I2*I1], then do a tensor.view(I2, I1).t()
- // to get a tensor whose semantics is [I1, I2] but its memory is [I2*I1].
- // Another example, if the logical domain is [I1*I2] and the allocation domain
- // is [I1, I2], then we will allocate as [I1, I2] and do a tensor.view(I1*I2) to
- // get a tensor whose semantics is [I1*I2] but memory is [I1,I2]
- at::Tensor transformFromAllocationToLogical(
- at::Tensor tensor,
- TensorView* tv,
- ExpressionEvaluator& ee) {
- FUSER_PERF_SCOPE("allocations::transformFromAllocationToLogical");
- // Ignore reductions because reductions does not exist in tensor's definition
- auto logical = TensorDomain::noReductions(tv->getLogicalDomain());
- auto alloc = TensorDomain::noReductions(tv->getMaybeAllocationDomain());
- // Traverse all affine transformations from allocation domain. Because
- // allocation domain can be before or after the logical domain, we need both a
- // forward and a backward traverse.
- std::list<IterDomain*> frontier(alloc.begin(), alloc.end());
- NVF_ERROR(tensor.dim() == (int64_t)frontier.size());
- tensor = ForwardTraverseFromAllocToLogical(tensor, ee, frontier)
- .run(logical, alloc);
- tensor = BackwardTraverseFromAllocToLogical(tensor, ee, frontier)
- .run(logical, alloc);
- NVF_ERROR(frontier.size() == logical.size());
- // Now that all affine transformations are handled, and frontiers should
- // contain the same set of IDs as logical. We still need to do a final
- // permutation so that their orders are also consistent.
- std::unordered_map<IterDomain*, int64_t> current_dims;
- int64_t counter = 0;
- for (auto id : frontier) {
- current_dims[id] = counter++;
- }
- std::vector<int64_t> dims;
- dims.reserve(frontier.size());
- for (auto id : logical) {
- dims.emplace_back(current_dims.at(id));
- }
- return tensor.permute(dims);
- }

} // namespace

std::pair<std::vector<int64_t>, std::vector<int64_t>> inferShapeOfOutput(
@@ -748,11 +706,53 @@
c10::TensorOptions().device(c10::Device(c10::DeviceType::Meta));
auto meta_tensor =
at::empty_strided(size_stride.first, size_stride.second, options);
- // TODO(jiej): we should refactor it here, there's no need to use
- // meta_tensor at all, size + stride should be used directly in the
- // `transformFromAllocationToLogical`
meta_tensor = transformFromAllocationToLogical(meta_tensor, tv, expr_eval);
return {meta_tensor.sizes().vec(), meta_tensor.strides().vec()};
}

+ at::Tensor transformFromAllocationToLogical(
+ at::Tensor tensor,
+ TensorView* tv,
+ const ExpressionEvaluator& ee) {
+ FUSER_PERF_SCOPE("allocations::transformFromAllocationToLogical");
+ // Ignore reductions because reduction domains do not exist in the tensor's
+ // definition.
+ auto logical = TensorDomain::noReductions(tv->getLogicalDomain());
+ auto alloc = TensorDomain::noReductions(tv->getMaybeAllocationDomain());
+ // Traverse all affine transformations from the allocation domain. Because
+ // the allocation domain can be before or after the logical domain, we need
+ // both a forward and a backward traverse.
+ std::list<IterDomain*> frontier(alloc.begin(), alloc.end());
+ NVF_ERROR(tensor.dim() == (int64_t)frontier.size());
+ tensor = ForwardTraverseFromAllocToLogical(tensor, ee, frontier)
+ .run(logical, alloc);
+ tensor = BackwardTraverseFromAllocToLogical(tensor, ee, frontier)
+ .run(logical, alloc);
+ NVF_ERROR(frontier.size() == logical.size());
+ // Now that all affine transformations are handled, the frontier should
+ // contain the same set of IDs as the logical domain. We still need a final
+ // permutation so that their orders are also consistent.
+ std::unordered_map<IterDomain*, int64_t> current_dims;
+ int64_t counter = 0;
+ for (auto id : frontier) {
+ current_dims[id] = counter++;
+ }
+ std::vector<int64_t> dims;
+ dims.reserve(frontier.size());
+ for (auto id : logical) {
+ dims.emplace_back(current_dims.at(id));
+ }
+ return tensor.permute(dims);
+ }

+ at::Tensor transformFromLogicalToAllocation(at::Tensor tensor, TensorView* tv) {
+ FUSER_PERF_SCOPE("allocations::transformLogicalToAllocation");
+ // Ignore reductions because reduction dimensions are not allocated in
+ // `tensor`.
+ auto logical = TensorDomain::noReductions(tv->getLogicalDomain());
+ auto alloc = TensorDomain::noReductions(tv->getMaybeAllocationDomain());
+
+ std::vector<int64_t> permutation =
+ *ir_utils::computePermutation(logical, alloc);
+ return tensor.permute(permutation);
+ }

} // namespace nvfuser
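When no splits or merges separate the two domains, the traversals leave the frontier equal to the allocation order, and the two functions above reduce to mutually inverse permutations. A plain C++ sketch of that case, with string labels standing in for IterDomain pointers (hypothetical, illustration only):

```cpp
// Sketch of the permutation logic in the two functions above; string labels
// stand in for IterDomain pointers.
#include <algorithm>
#include <cassert>
#include <cstdint>
#include <string>
#include <vector>

using Domain = std::vector<std::string>;

// Mirrors ir_utils::computePermutation(logical, alloc): perm[i] is the index
// in `logical` of alloc[i], so tensor.permute(perm) turns a logical-ordered
// tensor into an allocation-ordered one.
std::vector<int64_t> logicalToAllocPerm(const Domain& logical, const Domain& alloc) {
  std::vector<int64_t> perm;
  for (const auto& id : alloc) {
    perm.push_back(std::distance(
        logical.begin(), std::find(logical.begin(), logical.end(), id)));
  }
  return perm;
}

// Mirrors the tail of transformFromAllocationToLogical: dims[i] is the index
// in the frontier (here equal to alloc) of logical[i].
std::vector<int64_t> allocToLogicalPerm(const Domain& logical, const Domain& alloc) {
  std::vector<int64_t> dims;
  for (const auto& id : logical) {
    dims.push_back(std::distance(
        alloc.begin(), std::find(alloc.begin(), alloc.end(), id)));
  }
  return dims;
}

int main() {
  Domain logical = {"I0", "I1", "I2"};
  Domain alloc = {"I2", "I0", "I1"};
  assert((logicalToAllocPerm(logical, alloc) == std::vector<int64_t>{2, 0, 1}));
  assert((allocToLogicalPerm(logical, alloc) == std::vector<int64_t>{1, 2, 0}));
  // The two permutations are inverses, which is what lets runFusionWithInputs
  // round-trip tensors between logical and allocation order.
  return 0;
}
```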
csrc/runtime/allocations.h (16 changes: 16 additions & 0 deletions)
@@ -77,4 +77,20 @@ std::vector<GlobalBufferInfo> getBufferInfos(
DataType index_dtype,
const std::vector<Val*>& fusion_outputs);

+ // Start from a tensor whose dimensions are consistent with the allocation
+ // domain of tv, and apply a sequence of view/permute operations to transform
+ // it into a tensor whose dimensions are consistent with the logical domain of
+ // tv. For example, if the logical domain is [I1, I2] and the allocation
+ // domain is [I2*I1], we allocate as [I2*I1] and then do tensor.view(I2, I1).t()
+ // to get a tensor whose semantics is [I1, I2] but whose memory is [I2*I1].
+ // As another example, if the logical domain is [I1*I2] and the allocation
+ // domain is [I1, I2], we allocate as [I1, I2] and do tensor.view(I1*I2) to get
+ // a tensor whose semantics is [I1*I2] but whose memory is [I1, I2].
+ at::Tensor transformFromAllocationToLogical(
+ at::Tensor tensor,
+ TensorView* tv,
+ const ExpressionEvaluator& ee);
+
+ at::Tensor transformFromLogicalToAllocation(at::Tensor tensor, TensorView* tv);

} // namespace nvfuser
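The two examples in the comment on transformFromAllocationToLogical map onto plain ATen as follows. This is an illustration with made-up concrete extents (I1 = 3, I2 = 5), not code from the PR:

```cpp
// ATen-only illustration of the header comment's two examples.
#include <ATen/ATen.h>

int main() {
  // Example 1: logical [I1, I2], allocation [I2*I1].
  at::Tensor a = at::randn({15});        // memory laid out as [I2*I1]
  at::Tensor a_log = a.view({5, 3}).t(); // semantics [I1, I2]: sizes {3, 5},
                                         // strides {1, 3}, memory unchanged

  // Example 2: logical [I1*I2], allocation [I1, I2].
  at::Tensor b = at::randn({3, 5});      // memory laid out as [I1, I2]
  at::Tensor b_log = b.view({15});       // semantics [I1*I2], same memory
  return 0;
}
```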
csrc/runtime/fusion_executor_cache.cpp (29 changes: 24 additions & 5 deletions)
@@ -68,6 +68,7 @@ std::vector<at::Tensor> FusionExecutorCache::runFusionWithInputs(
most_recent_runtime_ = kernel_runtime;

auto fusion = kernel_runtime->fusionSegments()->completeFusion();
+ ExpressionEvaluator evaluator = executor_utils::bindInputs(args, fusion);

// Make sure the forced index type is indeed used
if (forced_index_type.has_value()) {
@@ -79,16 +80,22 @@
}

auto outputs = kernel_runtime->runWithInputs(args);
+ NVF_ERROR(fusion->outputs().size() == outputs.size());

// Kernel time measurement is off by default
kernel_runtime->disableKernelTimeMeasurement();

+ for (const auto out_index : c10::irange(outputs.size())) {
+ at::Tensor& output = outputs[out_index];
+ auto* out = fusion->outputs()[out_index]->as<TensorView>();
+ output = transformFromAllocationToLogical(output, out, evaluator);
+ }

// Remove aliased outputs, since those are updated by the Fusion in place. It
// is not semantically correct to actually return them as outputs from the
// fusion.
- NVF_ERROR(fusion->outputs().size() == outputs.size());
size_t new_size = 0;
- for (size_t out_index = 0; out_index < outputs.size(); out_index++) {
+ for (const auto out_index : c10::irange(outputs.size())) {
Val* out = fusion->outputs()[out_index];
if (!fusion->getOutputAlias(out).hide_output) {
outputs[new_size] = outputs[out_index];
@@ -113,8 +120,20 @@
std::optional<int8_t> selected_device) {
FUSER_PERF_SCOPE("FusionExecutorCache::prepareInputs");

- KernelArgumentHolder args =
-     KernelArgumentHolder::createKernelArgumentHolder(inputs, selected_device);
+ std::vector<c10::IValue> inputs_matching_allocation;
+ inputs_matching_allocation.reserve(inputs.size());
+ for (const auto i : c10::irange(inputs.size())) {
+ const auto& input = inputs[i];
+ if (!input.isTensor()) {
+ inputs_matching_allocation.push_back(input);
+ continue;
+ }
+ inputs_matching_allocation.push_back(transformFromLogicalToAllocation(
+ input.toTensor(), fusion_->inputs()[i]->as<TensorView>()));
+ }
+
+ KernelArgumentHolder args = KernelArgumentHolder::createKernelArgumentHolder(
+ inputs_matching_allocation, selected_device);

// TODO: move InputsIdLookup inside KernelArgumentHolder;
// NOTE: We must ensure that the cache id is in fact unique. Dynamic fusions
@@ -125,7 +144,7 @@
// short-circuiting here, resulting in avoidable rebuilds of concretization
// info.
auto id_lookup_ret = inputs_id_lookup_.lookupId(
- inputs,
+ inputs_matching_allocation,
initialInfo().scalarInputsAffectingConcretization(),
args.getDeviceIndex());
if (id_lookup_ret.eviction) {
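Taken together, `prepareInputs` permutes tensor inputs from logical to allocation order before caching and binding, and `runFusionWithInputs` permutes outputs back to logical order afterwards, so callers deal exclusively in logical order. A minimal end-to-end sketch in the style of the repo's C++ tests; `makeContigConcreteTensor` is assumed from the test utilities, and the extents are made up:

```cpp
// Sketch of the round trip this PR enables. Assumes the nvFuser C++ test
// helper makeContigConcreteTensor (tests/cpp/utils.h); extents hypothetical.
#include <fusion.h>
#include <ops/all_ops.h>
#include <runtime/fusion_executor_cache.h>

using namespace nvfuser;

void allocationDomainRoundTrip() {
  auto fusion = std::make_unique<Fusion>();
  FusionGuard fg(fusion.get());

  TensorView* in = makeContigConcreteTensor({4, 8});
  TensorView* out = set(in);
  fusion->addInput(in);
  fusion->addOutput(out);

  // Store `out` with its axes swapped: allocation = permuted logical.
  out->setAllocationDomain({out->axis(1), out->axis(0)}, /*new_contiguity=*/true);

  FusionExecutorCache fec(std::move(fusion));
  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
  at::Tensor t = at::randn({4, 8}, options);

  // The caller passes and receives logical-order tensors; permutation to and
  // from allocation order happens inside runFusionWithInputs.
  std::vector<at::Tensor> outs = fec.runFusionWithInputs({t});
  // outs[0].sizes() == [4, 8]; its strides reflect the swapped layout.
}
```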