Commit ceba5c4: Permute the inputs/outputs of runFusionWithInputs.

wujingyue committed Nov 5, 2024 · 1 parent e8a3608

Showing 3 changed files with 88 additions and 49 deletions.
91 changes: 47 additions & 44 deletions csrc/runtime/allocations.cpp
@@ -10,6 +10,7 @@

#include <expr_evaluator.h>
#include <instrumentation.h>
#include <ir/utils.h>
#include <polymorphic_value.h>
#include <runtime/executor_kernel_arg.h>
#include <runtime/executor_utils.h>
@@ -676,50 +677,6 @@ class BackwardTraverseFromAllocToLogical {
}
};

// Start from a tensor whose dimensions are consistent with the allocation
// domain of tv, and apply a sequence of view/permute operations to transform
// it into a tensor whose dimensions are consistent with the logical domain of
// tv. For example, if the logical domain is [I1, I2] and the allocation
// domain is [I2*I1], we allocate as [I2*I1] and then do a
// tensor.view(I2, I1).t() to get a tensor whose semantics is [I1, I2] but
// whose memory is [I2*I1]. As another example, if the logical domain is
// [I1*I2] and the allocation domain is [I1, I2], we allocate as [I1, I2] and
// do a tensor.view(I1*I2) to get a tensor whose semantics is [I1*I2] but
// whose memory is [I1, I2].
at::Tensor transformOutputFromAllocationToLogical(
at::Tensor tensor,
TensorView* tv,
ExpressionEvaluator& ee) {
FUSER_PERF_SCOPE(
"fusion_executor::allocations::transformOutputFromAllocationToLogical");
// Ignore reductions because reduction domains do not exist in the tensor's
// definition.
auto logical = TensorDomain::noReductions(tv->getLogicalDomain());
auto alloc = TensorDomain::noReductions(tv->getMaybeAllocationDomain());
// Traverse all affine transformations from the allocation domain. Because the
// allocation domain can be before or after the logical domain, we need both a
// forward and a backward traversal.
std::list<IterDomain*> frontier(alloc.begin(), alloc.end());
NVF_ERROR(tensor.dim() == (int64_t)frontier.size());
tensor = ForwardTraverseFromAllocToLogical(tensor, ee, frontier)
.run(logical, alloc);
tensor = BackwardTraverseFromAllocToLogical(tensor, ee, frontier)
.run(logical, alloc);
NVF_ERROR(frontier.size() == logical.size());
// Now that all affine transformations are handled, frontier should contain
// the same set of IDs as logical. We still need a final permutation so that
// their orders are also consistent.
std::unordered_map<IterDomain*, int64_t> current_dims;
int64_t counter = 0;
for (auto id : frontier) {
current_dims[id] = counter++;
}
std::vector<int64_t> dims;
dims.reserve(frontier.size());
for (auto id : logical) {
dims.emplace_back(current_dims.at(id));
}
return tensor.permute(dims);
}

} // namespace

@@ -766,4 +723,50 @@ std::pair<std::vector<int64_t>, std::vector<int64_t>> inferShapeOfOutput(
return {meta_tensor.sizes().vec(), meta_tensor.strides().vec()};
}

at::Tensor transformOutputFromAllocationToLogical(
at::Tensor tensor,
TensorView* tv,
ExpressionEvaluator& ee) {
FUSER_PERF_SCOPE("allocations::transformOutputFromAllocationToLogical");
// Ignore reductions because reduction domains do not exist in the tensor's
// definition.
auto logical = TensorDomain::noReductions(tv->getLogicalDomain());
auto alloc = TensorDomain::noReductions(tv->getMaybeAllocationDomain());
// Traverse all affine transformations from the allocation domain. Because the
// allocation domain can be before or after the logical domain, we need both a
// forward and a backward traversal.
std::list<IterDomain*> frontier(alloc.begin(), alloc.end());
NVF_ERROR(tensor.dim() == (int64_t)frontier.size());
tensor = ForwardTraverseFromAllocToLogical(tensor, ee, frontier)
.run(logical, alloc);
tensor = BackwardTraverseFromAllocToLogical(tensor, ee, frontier)
.run(logical, alloc);
NVF_ERROR(frontier.size() == logical.size());
// Now that all affine transformations are handled, frontier should contain
// the same set of IDs as logical. We still need a final permutation so that
// their orders are also consistent.
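// For example, if frontier is {I2, I1} and logical is [I1, I2], then
// current_dims maps I2 -> 0 and I1 -> 1, dims becomes {1, 0}, and the final
// permute swaps the two axes.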
std::unordered_map<IterDomain*, int64_t> current_dims;
int64_t counter = 0;
for (auto id : frontier) {
current_dims[id] = counter++;
}
std::vector<int64_t> dims;
dims.reserve(frontier.size());
for (auto id : logical) {
dims.emplace_back(current_dims.at(id));
}
return tensor.permute(dims);
}

at::Tensor transformFromLogicalToAllocation(at::Tensor tensor, TensorView* tv) {
FUSER_PERF_SCOPE("allocations::transformLogicalToAllocation");
// Ignore reductions because reduction dimensions are not allocated in
// `tensor`.
auto logical = TensorDomain::noReductions(tv->getLogicalDomain());
auto alloc = TensorDomain::noReductions(tv->getMaybeAllocationDomain());

std::vector<int64_t> permutation =
*ir_utils::computePermutation(logical, alloc);
return tensor.permute(permutation);
}

} // namespace nvfuser
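
As a concrete illustration of the new transformFromLogicalToAllocation helper, here is a minimal standalone sketch using plain ATen (the permutation {2, 0, 1} is hypothetical; it is what computePermutation would produce for logical [I1, I2, I3] and allocation [I3, I1, I2]):

#include <ATen/ATen.h>

// Hypothetical standalone analogue of transformFromLogicalToAllocation for
// logical [I1, I2, I3] and allocation [I3, I1, I2]: reorder a logical-layout
// tensor into allocation order using the precomputed permutation {2, 0, 1}.
at::Tensor logicalToAllocation(const at::Tensor& logical_tensor) {
  return logical_tensor.permute({2, 0, 1});
}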
16 changes: 16 additions & 0 deletions csrc/runtime/allocations.h
@@ -85,4 +85,20 @@ std::vector<GlobalBufferInfo> getBufferInfos(
DataType index_dtype,
const std::vector<Val*>& fusion_outputs);

// Start from a tensor whose dimensions are consistent with the allocation
// domain of tv, and apply a sequence of view/permute operations to transform
// it into a tensor whose dimensions are consistent with the logical domain of
// tv. For example, if the logical domain is [I1, I2] and the allocation
// domain is [I2*I1], we allocate as [I2*I1] and then do a
// tensor.view(I2, I1).t() to get a tensor whose semantics is [I1, I2] but
// whose memory is [I2*I1]. As another example, if the logical domain is
// [I1*I2] and the allocation domain is [I1, I2], we allocate as [I1, I2] and
// do a tensor.view(I1*I2) to get a tensor whose semantics is [I1*I2] but
// whose memory is [I1, I2].
at::Tensor transformOutputFromAllocationToLogical(
at::Tensor tensor,
TensorView* tv,
ExpressionEvaluator& ee);

at::Tensor transformFromLogicalToAllocation(at::Tensor tensor, TensorView* tv);

} // namespace nvfuser
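
To make the header comment's first example concrete, here is a minimal ATen-only sketch (assuming extents I1 and I2; no nvFuser machinery involved):

#include <ATen/ATen.h>

// Allocate in allocation order [I2*I1], then view and transpose so the result
// has logical semantics [I1, I2] while its memory stays [I2*I1].
at::Tensor allocateThenViewExample(int64_t I1, int64_t I2) {
  at::Tensor alloc = at::empty({I2 * I1}, at::kFloat);  // allocation domain
  return alloc.view({I2, I1}).t();  // shape [I1, I2], strides [1, I1]
}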
30 changes: 25 additions & 5 deletions csrc/runtime/fusion_executor_cache.cpp
@@ -84,22 +84,30 @@ std::vector<at::Tensor> FusionExecutorCache::runFusionWithInputs(
int seq_id = 0;
// Record kernel input and output tensors so profiler can construct
// the data flow graph
// FIXME: the inputs should be changed to match the allocation domain.
RECORD_FUNCTION(
"run_fused_kernel",
std::vector<c10::IValue>(inputs.begin(), inputs.end()),
seq_id);
auto outputs = kernel_runtime->runWithInputs(args);
RECORD_OUTPUTS(outputs);
NVF_ERROR(fusion->outputs().size() == outputs.size());

// Kernel time measurement is off by default
kernel_runtime->disableKernelTimeMeasurement();

ExpressionEvaluator evaluator;
for (const auto out_index : c10::irange(outputs.size())) {
at::Tensor& output = outputs[out_index];
auto* out = fusion->outputs()[out_index]->as<TensorView>();
output = transformOutputFromAllocationToLogical(output, out, evaluator);
}

// Remove aliased outputs, since those are updated in place by the fusion. It
// is not semantically correct to actually return them as outputs from the
// fusion.
NVF_ERROR(fusion->outputs().size() == outputs.size());
size_t new_size = 0;
for (size_t out_index = 0; out_index < outputs.size(); out_index++) {
for (const auto out_index : c10::irange(outputs.size())) {
Val* out = fusion->outputs()[out_index];
if (!fusion->getOutputAlias(out).hide_output) {
outputs[new_size] = outputs[out_index];
@@ -124,8 +132,20 @@ KernelArgumentHolder FusionExecutorCache::prepareInputs(
std::optional<int8_t> selected_device) {
FUSER_PERF_SCOPE("FusionExecutorCache::prepareInputs");

KernelArgumentHolder args =
KernelArgumentHolder::createKernelArgumentHolder(inputs, selected_device);
std::vector<c10::IValue> inputs_matching_allocation;
inputs_matching_allocation.reserve(inputs.size());
for (const auto i : c10::irange(inputs.size())) {
const auto& input = inputs[i];
if (!input.isTensor()) {
inputs_matching_allocation.push_back(input);
continue;
}
inputs_matching_allocation.push_back(transformFromLogicalToAllocation(
input.toTensor(), fusion_->inputs()[i]->as<TensorView>()));
}

KernelArgumentHolder args = KernelArgumentHolder::createKernelArgumentHolder(
inputs_matching_allocation, selected_device);

// TODO: move InputsIdLookup inside KernelArgumentHolder;
// NOTE: We must ensure that the cache id is in fact unique. Dynamic fusions
Expand All @@ -136,7 +156,7 @@ KernelArgumentHolder FusionExecutorCache::prepareInputs(
// short-circuiting here, resulting in avoidable rebuilds of concretization
// info.
auto id_lookup_ret = inputs_id_lookup_.lookupId(
inputs,
inputs_matching_allocation,
initialInfo().scalarInputsAffectingConcretization(),
args.getDeviceIndex());
if (id_lookup_ret.eviction) {
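Taken together, the two call-site changes form a round trip: prepareInputs permutes tensor inputs from logical order to allocation order, the kernel runs on allocation-ordered tensors, and runFusionWithInputs transforms each output back to logical order. Below is a standalone ATen sketch of that round trip, using a plain 2-D permutation as a stand-in for real allocation domains (all names are illustrative, not the FusionExecutorCache API):

#include <ATen/ATen.h>

// Stand-in for transformFromLogicalToAllocation with logical [I1, I2] and
// allocation [I2, I1].
at::Tensor toAllocation(const at::Tensor& logical) {
  return logical.permute({1, 0});
}

// Stand-in for transformOutputFromAllocationToLogical: the inverse permute.
at::Tensor toLogical(const at::Tensor& alloc) {
  return alloc.permute({1, 0});
}

int main() {
  at::Tensor in = at::randn({3, 4});          // logical order [I1=3, I2=4]
  at::Tensor kernel_in = toAllocation(in);    // the kernel sees [4, 3]
  at::Tensor kernel_out = kernel_in.clone();  // pretend the kernel ran
  at::Tensor out = toLogical(kernel_out);     // the caller sees [3, 4] again
  return out.sizes() == in.sizes() ? 0 : 1;
}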
