Skip to content

Commit

Permalink
Profile configurations for InnerOuterPersistent scheduler in python frontend (#3118)
Browse files Browse the repository at this point in the history

# Summary
This PR explores auto-tuning a `LayerNormBackward` fusion using the
`InnerOuterPersistent` scheduler in the python-frontend.

- Create `autotune_persistent.py` to test several parameter
configurations, and then apply a `DecisionTreeRegressor`
- The selected performance metric is `effective_bandwidth_gbs`. The
empirical scheduler selects the configuration that has the highest
predicted `effective_bandwidth_gbs`.

# Key differences from approach for `Pointwise` scheduler
- `vectorize_factor`, `thread_per_block_min`, and `thread_per_block_max`
are specified before running `computeHeuristics`. These settings are
akin to hyper-parameters used to constrain the generated scheduler
parameters.
- Create `SchedulerHyperParameters` as an entry in `HeuristicDataCache`
to specify these constraints when generating scheduler parameters.

# Details
1. Create `struct SchedulerHyperParameters` in `csrc/scheduler/utils.h`
2. Create `HeuristicDataCacheEntry` in
`csrc/scheduler/compile_time_info.h`
3. Modify `computeHeuristics` to use hyper-parameter constraints.
4. Expose `SchedulerHyperParameters` in python frontend.
5. Allow user schedulers to define a `HeuristicDataCache` during
scheduling.

* `SchedulerHyperParameters` contains parameters for `vectorize_factor`,
`unroll_factor`, `threads_per_block_min`, and `threads_per_block_max`.
  • Loading branch information
rdspring1 authored Oct 30, 2024
1 parent 7220207 commit c14d418
Show file tree
Hide file tree
Showing 9 changed files with 590 additions and 18 deletions.
3 changes: 2 additions & 1 deletion csrc/python_frontend/fusion_cache.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -227,7 +227,8 @@ HeuristicParams* UserSchedule::computeHeuristics(SchedulerType scheduler_type) {
NVF_CHECK(
heuristic_params == nullptr,
"Heuristic Scheduler is already defined for this UserSchedule");
heuristic_params = scheduler->computeHeuristics(fusion(), runtime_info_ref);
heuristic_params = scheduler->computeHeuristics(
fusion(), runtime_info_ref, data_cache.get());
return heuristic_params.get();
}

Expand Down
4 changes: 4 additions & 0 deletions csrc/python_frontend/fusion_cache.h
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@

#include <python_frontend/fusion_record.h>
#include <runtime/fusion_executor_cache.h>
#include <scheduler/compile_time_info.h>
#include <scheduler/registry.h>

#include <memory>
Expand All @@ -33,6 +34,9 @@ struct UserSchedule {
//! The parameters for scheduler heuristic.
std::unique_ptr<HeuristicParams> heuristic_params;

//! The compile-time data cache.
std::unique_ptr<HeuristicDataCache> data_cache;

//! Concretized, Scheduled Fusion IR
std::unique_ptr<Fusion> scheduled_fusion;

Expand Down
4 changes: 4 additions & 0 deletions csrc/python_frontend/fusion_definition.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
#include <python_frontend/fusion_definition.h>
#include <python_frontend/translation.h>
#include <runtime/executor_kernel_arg.h>
#include <scheduler/compile_time_info.h>
#include <scheduler/scheduler_types.h>
#include <utils.h>
#include <validator_utils.h>
Expand Down Expand Up @@ -239,6 +240,9 @@ void FusionDefinition::setupSchedule(
user_sched_ = fusionCache()->createUserSchedule(
scheds, inputs, device, overwrite_existing_schedule);

// Create scheduler data cache
user_sched_->data_cache = std::make_unique<HeuristicDataCache>();

// Building a new Fusion container for scheduling with definition such that
// the definition's tensor data members refer to the corresponding IR objects
// needed for scheduling. A simple copy of the container would mean the data
Expand Down
63 changes: 62 additions & 1 deletion csrc/python_frontend/python_bindings.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -23,9 +23,11 @@
#include <python_frontend/python_bindings.h>
#include <python_frontend/translation.h>
#include <runtime/fusion_kernel_runtime.h>
#include <scheduler/compile_time_info.h>
#include <scheduler/registry.h>
#include <scheduler/scheduler_types.h>
#include <scheduler/tools/inlining.h>
#include <scheduler/utils.h>
#include <torch/csrc/jit/python/pybind_utils.h>
#include <transform_replay.h>
#include <iostream>
Expand Down Expand Up @@ -779,6 +781,44 @@ void initNvFuserPythonBindings(PyObject* module) {

defineHeuristicParamBindings(nvfuser);

py::class_<scheduler_utils::SchedulerHyperParameters> hyperparameters(
nvfuser, "SchedulerHyperParameters");
hyperparameters.def(py::init<int64_t, int64_t, int64_t, int64_t>());
hyperparameters.def_property(
"vectorize_factor",
[](scheduler_utils::SchedulerHyperParameters& self) {
return self.vectorize_factor;
},
[](scheduler_utils::SchedulerHyperParameters& self,
int64_t vectorize_factor_) {
self.vectorize_factor = vectorize_factor_;
});
hyperparameters.def_property(
"unroll_factor",
[](scheduler_utils::SchedulerHyperParameters& self) {
return self.unroll_factor;
},
[](scheduler_utils::SchedulerHyperParameters& self,
int64_t unroll_factor_) { self.unroll_factor = unroll_factor_; });
hyperparameters.def_property(
"threads_per_block_min",
[](scheduler_utils::SchedulerHyperParameters& self) {
return self.threads_per_block_min;
},
[](scheduler_utils::SchedulerHyperParameters& self,
int64_t threads_per_block_min_) {
self.threads_per_block_min = threads_per_block_min_;
});
hyperparameters.def_property(
"threads_per_block_max",
[](scheduler_utils::SchedulerHyperParameters& self) {
return self.threads_per_block_max;
},
[](scheduler_utils::SchedulerHyperParameters& self,
int64_t threads_per_block_max_) {
self.threads_per_block_max = threads_per_block_max_;
});

//! KernelProfiles are encapsulated in FusionProfiles where each KP
//! is associated with a segment.
py::class_<KernelProfile> kernel_prof(nvfuser, "KernelProfile");
Expand Down Expand Up @@ -1401,7 +1441,7 @@ void initNvFuserPythonBindings(PyObject* module) {
py::class_<FusionDefinition::Operators> nvf_ops(fusion_def, "Operators");
nvf_ops.def(py::init<FusionDefinition*>());

// ******************** INSERT OP BINDINGS BELOW HERE ********************
// ******************** INSERT OP BINDINGS BELOW HERE ********************
#define OP_PREFIX "Operators."
#define NVFUSER_PYTHON_BINDING_UNARY_OP(op_str, op_name) \
nvf_ops.def( \
Expand Down Expand Up @@ -3822,6 +3862,27 @@ void initNvFuserPythonBindings(PyObject* module) {
return *parameters->as<MatmulParams>();
},
py::return_value_policy::reference);
nvf_sched.def(
"schedule_hyperparameters",
[](FusionDefinition::SchedOperators& self)
-> scheduler_utils::SchedulerHyperParameters& {
NVF_CHECK(
self.validUse(),
"Attempting to use a SchedOperators Op prior to definition!");
UserSchedule* sched = self.fusion_definition->userSchedule();
auto scheduler_hyperparameters_entry = HeuristicDataCacheEntry<
HeuristicCompileTime::SchedulerHyperParameters>(
sched->data_cache.get(), []() {
return std::make_unique<
scheduler_utils::SchedulerHyperParameters>(
/*vectorize_factor=*/1,
/*unroll_factor=*/1,
/*threads_per_block_min=*/1,
/*threads_per_block_max=*/1);
});
return scheduler_hyperparameters_entry.get();
},
py::return_value_policy::reference);
}

void cleanup() {
Expand Down
12 changes: 11 additions & 1 deletion csrc/scheduler/compile_time_info.h
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,8 @@ enum class CompileTimeEntryType {
CAN_SCHEDULE_TRANSPOSE,
CAN_SCHEDULE_MUL_SUM_AS_MMA,
LOGICAL_REORDER_MAP,
VECTORIZATION_BREAK_POINT_OF_RED_PROD
VECTORIZATION_BREAK_POINT_OF_RED_PROD,
SCHEDULE_HYPERPARAMETERS
};

//! Entry type definition class for `DOMAIN_MAP`,
Expand Down Expand Up @@ -203,6 +204,15 @@ class VectorizationBreakPointOfReductionProducer {
CompileTimeEntryType::VECTORIZATION_BREAK_POINT_OF_RED_PROD;
};

//! Entry type definition class for `SCHEDULE_HYPERPARAMETERS`.
//! Stores user-provided hyperparameter constraints
//! (scheduler_utils::SchedulerHyperParameters) that are consumed by
//! SchedulerEntry::computeHeuristics when generating scheduler parameters.
//! Placing them in the HeuristicDataCache lets a user schedule override the
//! defaults a scheduler would otherwise compute for itself.
class SchedulerHyperParameters {
 public:
  //! Payload type stored in the cache for this entry.
  using DataType = scheduler_utils::SchedulerHyperParameters;
  //! Tag used to look this entry up in the HeuristicDataCache.
  static const CompileTimeEntryType EntryType =
      CompileTimeEntryType::SCHEDULE_HYPERPARAMETERS;
};

//! Base abstract class for unified storage in `HeuristicDataCache`,
//! each entry in `HeuristicDataCache` will be a subclass.
class CompileTimeInfoBase : public PolymorphicBase {
Expand Down
75 changes: 60 additions & 15 deletions csrc/scheduler/normalization_inner_outer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -186,7 +186,9 @@ PersistentBufferStorageParams getPersistentBufferStorageParams(
SchedulerRuntimeInfo& runtime_info,
HeuristicDataCache* data_cache,
const std::vector<TensorView*>& reduction_tvs,
const int64_t vectorize_factor) {
const int64_t vectorize_factor,
const int64_t threads_per_block_min,
const int64_t threads_per_block_max) {
FUSER_PERF_SCOPE(
"normalization_inner_outer::getPersistentBufferStorageParams");

Expand Down Expand Up @@ -230,9 +232,7 @@ PersistentBufferStorageParams getPersistentBufferStorageParams(

const auto dev_prop = at::cuda::getCurrentDeviceProperties();
int64_t smem_overhead = scheduler_utils::getSharedMemoryOverheadPerBlock(
fusion,
reduction_tvs,
InnerOuterPersistentKernelScheduler::threads_per_block_max);
fusion, reduction_tvs, threads_per_block_max);
int64_t available_smem =
(int64_t)dev_prop->sharedMemPerMultiprocessor - smem_overhead;
int64_t available_regs = scheduler_utils::register_file_size_56k;
Expand Down Expand Up @@ -281,8 +281,8 @@ PersistentBufferStorageParams getPersistentBufferStorageParams(
tv_buffer_size_regs,
dataTypeSize(current_tv->getDataType().value()),
vectorize_factor,
InnerOuterPersistentKernelScheduler::threads_per_block_min,
InnerOuterPersistentKernelScheduler::threads_per_block_max,
threads_per_block_min,
threads_per_block_max,
dev_prop->warpSize);
buffer_params.smem_buffer_size += tv_buffer_size_smem;

Expand Down Expand Up @@ -332,6 +332,8 @@ std::pair<int64_t, int64_t> getBufferBatchSizeAndThreadsPerBlock(
const int64_t outer_dim_numel,
const int64_t persistent_buffer_size,
const int64_t vectorize_factor,
const int64_t threads_per_block_min,
const int64_t threads_per_block_max,
const int64_t warp_size) {
// if inner_dim_numel <= 1024, we are doing multiple reductions per block
// with a constant batch size of 1 if vectorized. See Step 5 of
Expand Down Expand Up @@ -380,19 +382,16 @@ std::pair<int64_t, int64_t> getBufferBatchSizeAndThreadsPerBlock(
};

const int64_t after_vectorization = inner_dim_numel / vectorize_factor;
const int64_t threads_per_block_min = std::min(
after_vectorization,
InnerOuterPersistentKernelScheduler::threads_per_block_min);
const int64_t threads_per_block_max =
InnerOuterPersistentKernelScheduler::threads_per_block_max;
const int64_t threads_per_block_min_after_vectorization =
std::min(after_vectorization, threads_per_block_min);
const int64_t batch_min = getMinimumBatch();
const int64_t batch_max = getMaximumInnerOuterPersistentBufferBatch();

// Start from the smallest threads_per_block. If the corresponding batch size
// is larger than batch_max, try increase threads per block by a warp until
// the threads_per_block reaches threads_per_block_max or the batch size
// reaches batch_min.
int64_t threads_per_block = threads_per_block_min;
int64_t threads_per_block = threads_per_block_min_after_vectorization;
int64_t inner_batch = ceilDiv(after_vectorization, threads_per_block);
while (inner_batch > batch_max &&
threads_per_block + warp_size <= threads_per_block_max &&
Expand Down Expand Up @@ -432,6 +431,8 @@ std::unique_ptr<ReductionParams> innerOuterPersistentHeuristic(
const int64_t smem_overhead,
const size_t tmp_gmem_dtype_size,
const size_t vectorize_factor,
const int64_t threads_per_block_min,
const int64_t threads_per_block_max,
const bool project_to_input,
const PrimDataType index_type) {
auto rparams = std::make_unique<ReductionParams>(
Expand Down Expand Up @@ -512,6 +513,8 @@ std::unique_ptr<ReductionParams> innerOuterPersistentHeuristic(
outer_dim_numel,
regs_buffer_size,
iop.inner_vect,
threads_per_block_min,
threads_per_block_max,
dev_prop->warpSize);
iop.inner_batch = persistent_batch;

Expand Down Expand Up @@ -743,12 +746,32 @@ std::unique_ptr<ReductionParams> getInnerOuterPersistentHeuristics(
scheduler_utils::persistentBuffers(fusion));
});

auto scheduler_hyperparameters_entry =
HeuristicDataCacheEntry<HeuristicCompileTime::SchedulerHyperParameters>(
data_cache, [&]() {
return std::make_unique<scheduler_utils::SchedulerHyperParameters>(
/*vectorize_factor=*/vectorize_factor,
/*unroll_factor=*/1,
/*threads_per_block_min=*/
InnerOuterPersistentKernelScheduler::threads_per_block_min,
/*threads_per_block_max=*/
InnerOuterPersistentKernelScheduler::threads_per_block_max);
});
scheduler_utils::SchedulerHyperParameters& hp =
scheduler_hyperparameters_entry.get();

auto& persistent_buffer_info = persistent_buffer_info_entry.get();
NVF_ERROR(
!persistent_buffer_info.persistent_buffers.empty(),
"Persistent scheduler requires persistent buffers.");
auto buffer_params = getPersistentBufferStorageParams(
fusion, runtime_info, data_cache, reduction_tvs, vectorize_factor);
fusion,
runtime_info,
data_cache,
reduction_tvs,
hp.vectorize_factor,
hp.threads_per_block_min,
hp.threads_per_block_max);

std::unique_ptr<ReductionParams> rparams = innerOuterPersistentHeuristic(
properties.total_iteration_numel,
Expand All @@ -757,7 +780,9 @@ std::unique_ptr<ReductionParams> getInnerOuterPersistentHeuristics(
buffer_params.smem_buffer_size,
buffer_params.smem_overhead,
max_outer_reduction_dtype_size,
vectorize_factor,
hp.vectorize_factor,
hp.threads_per_block_min,
hp.threads_per_block_max,
buffer_params.project_to_input,
runtime_info.getIndexType());

Expand Down Expand Up @@ -1244,9 +1269,29 @@ bool InnerOuterPersistentKernelScheduler::canScheduleRunTime(
data_cache,
(int)(reduced_tv->nDims() - properties.inner_most_dimension_ndims));

auto scheduler_hyperparameters_entry =
HeuristicDataCacheEntry<HeuristicCompileTime::SchedulerHyperParameters>(
data_cache, [&]() {
return std::make_unique<scheduler_utils::SchedulerHyperParameters>(
/*vectorize_factor=*/vectorize_factor,
/*unroll_factor=*/1,
/*threads_per_block_min=*/
InnerOuterPersistentKernelScheduler::threads_per_block_min,
/*threads_per_block_max=*/
InnerOuterPersistentKernelScheduler::threads_per_block_max);
});
scheduler_utils::SchedulerHyperParameters& hp =
scheduler_hyperparameters_entry.get();

// check if there is enough register and shared memory for persistence
const auto buffer_params = getPersistentBufferStorageParams(
fusion, runtime_info, data_cache, reduction_tvs, vectorize_factor);
fusion,
runtime_info,
data_cache,
reduction_tvs,
hp.vectorize_factor,
hp.threads_per_block_min,
hp.threads_per_block_max);

const int64_t device_multiprocessor_count =
(int64_t)at::cuda::getCurrentDeviceProperties()->multiProcessorCount;
Expand Down
2 changes: 2 additions & 0 deletions csrc/scheduler/registry.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -224,4 +224,6 @@ template class HeuristicDataCacheEntry<
template class HeuristicDataCacheEntry<HeuristicCompileTime::LogicalReorderMap>;
template class HeuristicDataCacheEntry<
HeuristicCompileTime::VectorizationBreakPointOfReductionProducer>;
template class HeuristicDataCacheEntry<
HeuristicCompileTime::SchedulerHyperParameters>;
} // namespace nvfuser
28 changes: 28 additions & 0 deletions csrc/scheduler/utils.h
Original file line number Diff line number Diff line change
Expand Up @@ -173,6 +173,34 @@ inline void parallelizeAllLike(
propagate_padding);
}

// Common hyperparameters used in heuristic scheduler. These hyperparameters
// are passed to SchedulerEntry::computeHeuristics through the
// HeuristicDataCache. These hyperparameters alter the generation of the
// HeuristicParams for the scheduler.
// Common hyperparameters used in heuristic schedulers. These hyperparameters
// are passed to SchedulerEntry::computeHeuristics through the
// HeuristicDataCache and constrain the generation of the HeuristicParams
// for the scheduler (akin to hyper-parameters for auto-tuning).
struct SchedulerHyperParameters {
  // Defaulted default constructor so the in-class member initializers below
  // are actually usable; without it, declaring the 4-argument constructor
  // suppresses the implicit default constructor and the `= 1` defaults are
  // dead code (`SchedulerHyperParameters{}` would not compile).
  SchedulerHyperParameters() = default;

  SchedulerHyperParameters(
      int64_t vectorize_factor_,
      int64_t unroll_factor_,
      int64_t threads_per_block_min_,
      int64_t threads_per_block_max_)
      : vectorize_factor(vectorize_factor_),
        unroll_factor(unroll_factor_),
        threads_per_block_min(threads_per_block_min_),
        threads_per_block_max(threads_per_block_max_) {}

  //! Number of elements to load per vectorized load.
  int64_t vectorize_factor = 1;

  //! Number of iterations to unroll a for-loop.
  int64_t unroll_factor = 1;

  //! Minimum number of threads per block.
  int64_t threads_per_block_min = 1;

  //! Maximum number of threads per block.
  int64_t threads_per_block_max = 1;
};

struct PersistentBufferInfo {
std::vector<TensorView*> persistent_buffers;
std::unordered_set<IterDomain*> unmappable_dims;
Expand Down
Loading

0 comments on commit c14d418

Please sign in to comment.