diff --git a/csrc/runtime/allocations.cpp b/csrc/runtime/allocations.cpp
index 2e6fa250c13..0b4557abebe 100644
--- a/csrc/runtime/allocations.cpp
+++ b/csrc/runtime/allocations.cpp
@@ -10,6 +10,7 @@
 #include
 #include
+#include
 #include
 #include
 #include
@@ -676,50 +677,6 @@ class BackwardTraverseFromAllocToLogical {
   }
 };
 
-// Start from a tensor whose dimensions are consistent with the allocation
-// domain of tv, apply a sequence of view/permute to the tensor to transform it
-// into a format whose dimensions are consistent with the logical domain of tv.
-// For example, if the logical domain is [I1, I2], and the allocation domain is
-// [I2*I1], then we will allocate as [I2*I1], then do a tensor.view(I2, I1).t()
-// to get a tensor whose semantics is [I1, I2] but its memory is [I2*I1].
-// Another example, if the logical domain is [I1*I2] and the allocation domain
-// is [I1, I2], then we will allocate as [I1, I2] and do a tensor.view(I1*I2) to
-// get a tensor whose semantics is [I1*I2] but memory is [I1,I2]
-at::Tensor transformOutputFromAllocationToLogical(
-    at::Tensor tensor,
-    TensorView* tv,
-    ExpressionEvaluator& ee) {
-  FUSER_PERF_SCOPE(
-      "fusion_executor::allocations::transformOutputFromAllocationToLogical");
-  // Ignore reductions because reductions does not exist in tensor's definition
-  auto logical = TensorDomain::noReductions(tv->getLogicalDomain());
-  auto alloc = TensorDomain::noReductions(tv->getMaybeAllocationDomain());
-  // Traverse all affine transformations from allocation domain. Because
-  // allocation domain can be before or after the logical domain, we need both a
-  // forward and a backward traverse.
-  std::list<IterDomain*> frontier(alloc.begin(), alloc.end());
-  NVF_ERROR(tensor.dim() == (int64_t)frontier.size());
-  tensor = ForwardTraverseFromAllocToLogical(tensor, ee, frontier)
-               .run(logical, alloc);
-  tensor = BackwardTraverseFromAllocToLogical(tensor, ee, frontier)
-               .run(logical, alloc);
-  NVF_ERROR(frontier.size() == logical.size());
-  // Now that all affine transformations are handled, and frontiers should
-  // contain the same set of IDs as logical. We still need to do a final
-  // permutation so that their orders are also consistent.
-  std::unordered_map<IterDomain*, int64_t> current_dims;
-  int64_t counter = 0;
-  for (auto id : frontier) {
-    current_dims[id] = counter++;
-  }
-  std::vector<int64_t> dims;
-  dims.reserve(frontier.size());
-  for (auto id : logical) {
-    dims.emplace_back(current_dims.at(id));
-  }
-  return tensor.permute(dims);
-}
-
 } // namespace
 
 std::pair<std::vector<int64_t>, std::vector<int64_t>> inferShapeOfOutput(
@@ -766,4 +723,50 @@ std::pair<std::vector<int64_t>, std::vector<int64_t>> inferShapeOfOutput(
   return {meta_tensor.sizes().vec(), meta_tensor.strides().vec()};
 }
 
+at::Tensor transformOutputFromAllocationToLogical(
+    at::Tensor tensor,
+    TensorView* tv,
+    ExpressionEvaluator& ee) {
+  FUSER_PERF_SCOPE("allocations::transformOutputFromAllocationToLogical");
+  // Ignore reductions because reductions does not exist in tensor's definition
+  auto logical = TensorDomain::noReductions(tv->getLogicalDomain());
+  auto alloc = TensorDomain::noReductions(tv->getMaybeAllocationDomain());
+  // Traverse all affine transformations from allocation domain. Because
+  // allocation domain can be before or after the logical domain, we need both a
+  // forward and a backward traverse.
+  std::list<IterDomain*> frontier(alloc.begin(), alloc.end());
+  NVF_ERROR(tensor.dim() == (int64_t)frontier.size());
+  tensor = ForwardTraverseFromAllocToLogical(tensor, ee, frontier)
+               .run(logical, alloc);
+  tensor = BackwardTraverseFromAllocToLogical(tensor, ee, frontier)
+               .run(logical, alloc);
+  NVF_ERROR(frontier.size() == logical.size());
+  // Now that all affine transformations are handled, and frontiers should
+  // contain the same set of IDs as logical. We still need to do a final
+  // permutation so that their orders are also consistent.
+  std::unordered_map<IterDomain*, int64_t> current_dims;
+  int64_t counter = 0;
+  for (auto id : frontier) {
+    current_dims[id] = counter++;
+  }
+  std::vector<int64_t> dims;
+  dims.reserve(frontier.size());
+  for (auto id : logical) {
+    dims.emplace_back(current_dims.at(id));
+  }
+  return tensor.permute(dims);
+}
+
+at::Tensor transformFromLogicalToAllocation(at::Tensor tensor, TensorView* tv) {
+  FUSER_PERF_SCOPE("allocations::transformLogicalToAllocation");
+  // Ignore reductions because reduction dimensions are not allocated in
+  // `tensor`.
+  auto logical = TensorDomain::noReductions(tv->getLogicalDomain());
+  auto alloc = TensorDomain::noReductions(tv->getMaybeAllocationDomain());
+
+  std::vector<int64_t> permutation =
+      *ir_utils::computePermutation(logical, alloc);
+  return tensor.permute(permutation);
+}
+
 } // namespace nvfuser
diff --git a/csrc/runtime/allocations.h b/csrc/runtime/allocations.h
index 294013f4e1a..b2cbea48b59 100644
--- a/csrc/runtime/allocations.h
+++ b/csrc/runtime/allocations.h
@@ -85,4 +85,20 @@ std::vector<GlobalBufferInfo> getBufferInfos(
     DataType index_dtype,
     const std::vector<Val*>& fusion_outputs);
 
+// Start from a tensor whose dimensions are consistent with the allocation
+// domain of tv, apply a sequence of view/permute to the tensor to transform it
+// into a format whose dimensions are consistent with the logical domain of tv.
+// For example, if the logical domain is [I1, I2], and the allocation domain is
+// [I2*I1], then we will allocate as [I2*I1], then do a tensor.view(I2, I1).t()
+// to get a tensor whose semantics is [I1, I2] but its memory is [I2*I1].
+// Another example, if the logical domain is [I1*I2] and the allocation domain
+// is [I1, I2], then we will allocate as [I1, I2] and do a tensor.view(I1*I2) to
+// get a tensor whose semantics is [I1*I2] but memory is [I1,I2]
+at::Tensor transformOutputFromAllocationToLogical(
+    at::Tensor tensor,
+    TensorView* tv,
+    ExpressionEvaluator& ee);
+
+at::Tensor transformFromLogicalToAllocation(at::Tensor tensor, TensorView* tv);
+
 } // namespace nvfuser
diff --git a/csrc/runtime/fusion_executor_cache.cpp b/csrc/runtime/fusion_executor_cache.cpp
index a3b8de148e2..6ce28a7488f 100644
--- a/csrc/runtime/fusion_executor_cache.cpp
+++ b/csrc/runtime/fusion_executor_cache.cpp
@@ -84,22 +84,30 @@ std::vector<at::Tensor> FusionExecutorCache::runFusionWithInputs(
   int seq_id = 0;
   // Record kernel input and output tensors so profiler can construct
   // the data flow graph
+  // FIXME: the inputs should be changed to match allocation.
RECORD_FUNCTION( "run_fused_kernel", std::vector(inputs.begin(), inputs.end()), seq_id); auto outputs = kernel_runtime->runWithInputs(args); RECORD_OUTPUTS(outputs); + NVF_ERROR(fusion->outputs().size() == outputs.size()); // Kernel time measurement is off by default kernel_runtime->disableKernelTimeMeasurement(); + ExpressionEvaluator evaluator; + for (const auto out_index : c10::irange(outputs.size())) { + at::Tensor& output = outputs[out_index]; + auto* out = fusion->outputs()[out_index]->as(); + output = transformOutputFromAllocationToLogical(output, out, evaluator); + } + // Removing aliased outputs, since those are updated by the Fusion. It is not // semantically correct to actually return them as outputs from // fusion. - NVF_ERROR(fusion->outputs().size() == outputs.size()); size_t new_size = 0; - for (size_t out_index = 0; out_index < outputs.size(); out_index++) { + for (const auto out_index : c10::irange(outputs.size())) { Val* out = fusion->outputs()[out_index]; if (!fusion->getOutputAlias(out).hide_output) { outputs[new_size] = outputs[out_index]; @@ -124,8 +132,20 @@ KernelArgumentHolder FusionExecutorCache::prepareInputs( std::optional selected_device) { FUSER_PERF_SCOPE("FusionExecutorCache::prepareInputs"); - KernelArgumentHolder args = - KernelArgumentHolder::createKernelArgumentHolder(inputs, selected_device); + std::vector inputs_matching_allocation; + inputs_matching_allocation.reserve(inputs.size()); + for (const auto i : c10::irange(inputs.size())) { + const auto& input = inputs[i]; + if (!input.isTensor()) { + inputs_matching_allocation.push_back(input); + continue; + } + inputs_matching_allocation.push_back(transformFromLogicalToAllocation( + input.toTensor(), fusion_->inputs()[i]->as())); + } + + KernelArgumentHolder args = KernelArgumentHolder::createKernelArgumentHolder( + inputs_matching_allocation, selected_device); // TODO: move InputsIdLookup inside KernelArgumentHolder; // NOTE: We must ensure that the cache id is in fact unique. Dynamic fusions @@ -136,7 +156,7 @@ KernelArgumentHolder FusionExecutorCache::prepareInputs( // short-circuiting here, resulting in avoidable rebuilds of concretization // info. auto id_lookup_ret = inputs_id_lookup_.lookupId( - inputs, + inputs_matching_allocation, initialInfo().scalarInputsAffectingConcretization(), args.getDeviceIndex()); if (id_lookup_ret.eviction) {