Skip to content

Commit

Permalink
Profile configurations for InnerOuterPersistent scheduler in python frontend (#3118)
Browse files Browse the repository at this point in the history

# Summary
This PR explores auto-tuning a `LayerNormBackward` fusion using the
`InnerOuterPersistent` scheduler in the python-frontend.

- Create `autotune_persistent.py` to test several parameter
configurations, and then apply a `DecisionTreeRegressor`
- The selected performance metric is `effective_bandwidth_gbs`. The
empirical scheduler selects the configuration that has the highest
predicted `effective_bandwidth_gbs`.

# Key differences from approach for `Pointwise` scheduler
- `vectorize_factor`, `thread_per_block_min`, and `thread_per_block_max`
are specified before running `computeHeuristics`. These settings are
akin to hyper-parameters used to constrain the generated scheduler
parameters.
- Create `SchedulerHyperParameters` as an entry in `HeuristicDataCache`
to specify these constraints when generating scheduler parameters.

# Details
1. Create `struct SchedulerHyperParameters` in `csrc/scheduler/utils.h`
2. Create `HeuristicDataCacheEntry` in
`csrc/scheduler/compile_time_info.h`
3. Modify `computeHeuristics` to use hyper-parameter constraints.
4. Expose `SchedulerHyperParameters` in python frontend.
5. Allow user schedulers to define a `HeuristicDataCache` during
scheduling.

* `SchedulerHyperParameters` contains parameters for `vectorize_factor`,
`unroll_factor`, `threads_per_block_min`, and `threads_per_block_max`.
  • Loading branch information
rdspring1 authored Oct 30, 2024
1 parent 7220207 commit c14d418
Show file tree
Hide file tree
Showing 9 changed files with 590 additions and 18 deletions.
3 changes: 2 additions & 1 deletion csrc/python_frontend/fusion_cache.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -227,7 +227,8 @@ HeuristicParams* UserSchedule::computeHeuristics(SchedulerType scheduler_type) {
NVF_CHECK(
heuristic_params == nullptr,
"Heuristic Scheduler is already defined for this UserSchedule");
heuristic_params = scheduler->computeHeuristics(fusion(), runtime_info_ref);
heuristic_params = scheduler->computeHeuristics(
fusion(), runtime_info_ref, data_cache.get());
return heuristic_params.get();
}

Expand Down
4 changes: 4 additions & 0 deletions csrc/python_frontend/fusion_cache.h
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@

#include <python_frontend/fusion_record.h>
#include <runtime/fusion_executor_cache.h>
#include <scheduler/compile_time_info.h>
#include <scheduler/registry.h>

#include <memory>
Expand All @@ -33,6 +34,9 @@ struct UserSchedule {
//! The parameters for scheduler heuristic.
std::unique_ptr<HeuristicParams> heuristic_params;

//! The compile-time data cache.
std::unique_ptr<HeuristicDataCache> data_cache;

//! Concretized, Scheduled Fusion IR
std::unique_ptr<Fusion> scheduled_fusion;

Expand Down
4 changes: 4 additions & 0 deletions csrc/python_frontend/fusion_definition.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
#include <python_frontend/fusion_definition.h>
#include <python_frontend/translation.h>
#include <runtime/executor_kernel_arg.h>
#include <scheduler/compile_time_info.h>
#include <scheduler/scheduler_types.h>
#include <utils.h>
#include <validator_utils.h>
Expand Down Expand Up @@ -239,6 +240,9 @@ void FusionDefinition::setupSchedule(
user_sched_ = fusionCache()->createUserSchedule(
scheds, inputs, device, overwrite_existing_schedule);

// Create scheduler data cache
user_sched_->data_cache = std::make_unique<HeuristicDataCache>();

// Building a new Fusion container for scheduling with definition such that
// the definition's tensor data members refer to the corresponding IR objects
// needed for scheduling. A simple copy of the container would mean the data
Expand Down
63 changes: 62 additions & 1 deletion csrc/python_frontend/python_bindings.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -23,9 +23,11 @@
#include <python_frontend/python_bindings.h>
#include <python_frontend/translation.h>
#include <runtime/fusion_kernel_runtime.h>
#include <scheduler/compile_time_info.h>
#include <scheduler/registry.h>
#include <scheduler/scheduler_types.h>
#include <scheduler/tools/inlining.h>
#include <scheduler/utils.h>
#include <torch/csrc/jit/python/pybind_utils.h>
#include <transform_replay.h>
#include <iostream>
Expand Down Expand Up @@ -779,6 +781,44 @@ void initNvFuserPythonBindings(PyObject* module) {

defineHeuristicParamBindings(nvfuser);

py::class_<scheduler_utils::SchedulerHyperParameters> hyperparameters(
nvfuser, "SchedulerHyperParameters");
hyperparameters.def(py::init<int64_t, int64_t, int64_t, int64_t>());
hyperparameters.def_property(
"vectorize_factor",
[](scheduler_utils::SchedulerHyperParameters& self) {
return self.vectorize_factor;
},
[](scheduler_utils::SchedulerHyperParameters& self,
int64_t vectorize_factor_) {
self.vectorize_factor = vectorize_factor_;
});
hyperparameters.def_property(
"unroll_factor",
[](scheduler_utils::SchedulerHyperParameters& self) {
return self.unroll_factor;
},
[](scheduler_utils::SchedulerHyperParameters& self,
int64_t unroll_factor_) { self.unroll_factor = unroll_factor_; });
hyperparameters.def_property(
"threads_per_block_min",
[](scheduler_utils::SchedulerHyperParameters& self) {
return self.threads_per_block_min;
},
[](scheduler_utils::SchedulerHyperParameters& self,
int64_t threads_per_block_min_) {
self.threads_per_block_min = threads_per_block_min_;
});
hyperparameters.def_property(
"threads_per_block_max",
[](scheduler_utils::SchedulerHyperParameters& self) {
return self.threads_per_block_max;
},
[](scheduler_utils::SchedulerHyperParameters& self,
int64_t threads_per_block_max_) {
self.threads_per_block_max = threads_per_block_max_;
});

//! KernelProfiles are encapsulated in FusionProfiles where each KP
//! is associated with a segment.
py::class_<KernelProfile> kernel_prof(nvfuser, "KernelProfile");
Expand Down Expand Up @@ -1401,7 +1441,7 @@ void initNvFuserPythonBindings(PyObject* module) {
py::class_<FusionDefinition::Operators> nvf_ops(fusion_def, "Operators");
nvf_ops.def(py::init<FusionDefinition*>());

// ******************** INSERT OP BINDINGS BELOW HERE ********************
// ******************** INSERT OP BINDINGS BELOW HERE ********************
#define OP_PREFIX "Operators."
#define NVFUSER_PYTHON_BINDING_UNARY_OP(op_str, op_name) \
nvf_ops.def( \
Expand Down Expand Up @@ -3822,6 +3862,27 @@ void initNvFuserPythonBindings(PyObject* module) {
return *parameters->as<MatmulParams>();
},
py::return_value_policy::reference);
nvf_sched.def(
"schedule_hyperparameters",
[](FusionDefinition::SchedOperators& self)
-> scheduler_utils::SchedulerHyperParameters& {
NVF_CHECK(
self.validUse(),
"Attempting to use a SchedOperators Op prior to definition!");
UserSchedule* sched = self.fusion_definition->userSchedule();
auto scheduler_hyperparameters_entry = HeuristicDataCacheEntry<
HeuristicCompileTime::SchedulerHyperParameters>(
sched->data_cache.get(), []() {
return std::make_unique<
scheduler_utils::SchedulerHyperParameters>(
/*vectorize_factor=*/1,
/*unroll_factor=*/1,
/*threads_per_block_min=*/1,
/*threads_per_block_max=*/1);
});
return scheduler_hyperparameters_entry.get();
},
py::return_value_policy::reference);
}

void cleanup() {
Expand Down
12 changes: 11 additions & 1 deletion csrc/scheduler/compile_time_info.h
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,8 @@ enum class CompileTimeEntryType {
CAN_SCHEDULE_TRANSPOSE,
CAN_SCHEDULE_MUL_SUM_AS_MMA,
LOGICAL_REORDER_MAP,
VECTORIZATION_BREAK_POINT_OF_RED_PROD
VECTORIZATION_BREAK_POINT_OF_RED_PROD,
SCHEDULE_HYPERPARAMETERS
};

//! Entry type definition class for `DOMAIN_MAP`,
Expand Down Expand Up @@ -203,6 +204,15 @@ class VectorizationBreakPointOfReductionProducer {
CompileTimeEntryType::VECTORIZATION_BREAK_POINT_OF_RED_PROD;
};

//! Entry type definition class for `SCHEDULE_HYPERPARAMETERS`.
//! Stores user-provided hyperparameter constraints
//! (scheduler_utils::SchedulerHyperParameters) that are consumed by
//! SchedulerEntry::computeHeuristics when generating scheduler parameters.
//! Placing them in the HeuristicDataCache lets a user schedule override the
//! defaults a scheduler would otherwise compute for itself.
class SchedulerHyperParameters {
 public:
  //! Payload type stored in the cache for this entry.
  using DataType = scheduler_utils::SchedulerHyperParameters;
  //! Tag used to look this entry up in the HeuristicDataCache.
  static const CompileTimeEntryType EntryType =
      CompileTimeEntryType::SCHEDULE_HYPERPARAMETERS;
};

//! Base abstract class for unified storage in `HeuristicDataCache`,
//! each entry in `HeuristicDataCache` will be a subclass.
class CompileTimeInfoBase : public PolymorphicBase {
Expand Down
75 changes: 60 additions & 15 deletions csrc/scheduler/normalization_inner_outer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -186,7 +186,9 @@ PersistentBufferStorageParams getPersistentBufferStorageParams(
SchedulerRuntimeInfo& runtime_info,
HeuristicDataCache* data_cache,
const std::vector<TensorView*>& reduction_tvs,
const int64_t vectorize_factor) {
const int64_t vectorize_factor,
const int64_t threads_per_block_min,
const int64_t threads_per_block_max) {
FUSER_PERF_SCOPE(
"normalization_inner_outer::getPersistentBufferStorageParams");

Expand Down Expand Up @@ -230,9 +232,7 @@ PersistentBufferStorageParams getPersistentBufferStorageParams(

const auto dev_prop = at::cuda::getCurrentDeviceProperties();
int64_t smem_overhead = scheduler_utils::getSharedMemoryOverheadPerBlock(
fusion,
reduction_tvs,
InnerOuterPersistentKernelScheduler::threads_per_block_max);
fusion, reduction_tvs, threads_per_block_max);
int64_t available_smem =
(int64_t)dev_prop->sharedMemPerMultiprocessor - smem_overhead;
int64_t available_regs = scheduler_utils::register_file_size_56k;
Expand Down Expand Up @@ -281,8 +281,8 @@ PersistentBufferStorageParams getPersistentBufferStorageParams(
tv_buffer_size_regs,
dataTypeSize(current_tv->getDataType().value()),
vectorize_factor,
InnerOuterPersistentKernelScheduler::threads_per_block_min,
InnerOuterPersistentKernelScheduler::threads_per_block_max,
threads_per_block_min,
threads_per_block_max,
dev_prop->warpSize);
buffer_params.smem_buffer_size += tv_buffer_size_smem;

Expand Down Expand Up @@ -332,6 +332,8 @@ std::pair<int64_t, int64_t> getBufferBatchSizeAndThreadsPerBlock(
const int64_t outer_dim_numel,
const int64_t persistent_buffer_size,
const int64_t vectorize_factor,
const int64_t threads_per_block_min,
const int64_t threads_per_block_max,
const int64_t warp_size) {
// if inner_dim_numel <= 1024, we are doing multiple reductions per block
// with a constant batch size of 1 if vectorized. See Step 5 of
Expand Down Expand Up @@ -380,19 +382,16 @@ std::pair<int64_t, int64_t> getBufferBatchSizeAndThreadsPerBlock(
};

const int64_t after_vectorization = inner_dim_numel / vectorize_factor;
const int64_t threads_per_block_min = std::min(
after_vectorization,
InnerOuterPersistentKernelScheduler::threads_per_block_min);
const int64_t threads_per_block_max =
InnerOuterPersistentKernelScheduler::threads_per_block_max;
const int64_t threads_per_block_min_after_vectorization =
std::min(after_vectorization, threads_per_block_min);
const int64_t batch_min = getMinimumBatch();
const int64_t batch_max = getMaximumInnerOuterPersistentBufferBatch();

// Start from the smallest threads_per_block. If the corresponding batch size
// is larger than batch_max, try increase threads per block by a warp until
// the threads_per_block reaches threads_per_block_max or the batch size
// reaches batch_min.
int64_t threads_per_block = threads_per_block_min;
int64_t threads_per_block = threads_per_block_min_after_vectorization;
int64_t inner_batch = ceilDiv(after_vectorization, threads_per_block);
while (inner_batch > batch_max &&
threads_per_block + warp_size <= threads_per_block_max &&
Expand Down Expand Up @@ -432,6 +431,8 @@ std::unique_ptr<ReductionParams> innerOuterPersistentHeuristic(
const int64_t smem_overhead,
const size_t tmp_gmem_dtype_size,
const size_t vectorize_factor,
const int64_t threads_per_block_min,
const int64_t threads_per_block_max,
const bool project_to_input,
const PrimDataType index_type) {
auto rparams = std::make_unique<ReductionParams>(
Expand Down Expand Up @@ -512,6 +513,8 @@ std::unique_ptr<ReductionParams> innerOuterPersistentHeuristic(
outer_dim_numel,
regs_buffer_size,
iop.inner_vect,
threads_per_block_min,
threads_per_block_max,
dev_prop->warpSize);
iop.inner_batch = persistent_batch;

Expand Down Expand Up @@ -743,12 +746,32 @@ std::unique_ptr<ReductionParams> getInnerOuterPersistentHeuristics(
scheduler_utils::persistentBuffers(fusion));
});

auto scheduler_hyperparameters_entry =
HeuristicDataCacheEntry<HeuristicCompileTime::SchedulerHyperParameters>(
data_cache, [&]() {
return std::make_unique<scheduler_utils::SchedulerHyperParameters>(
/*vectorize_factor=*/vectorize_factor,
/*unroll_factor=*/1,
/*threads_per_block_min=*/
InnerOuterPersistentKernelScheduler::threads_per_block_min,
/*threads_per_block_max=*/
InnerOuterPersistentKernelScheduler::threads_per_block_max);
});
scheduler_utils::SchedulerHyperParameters& hp =
scheduler_hyperparameters_entry.get();

auto& persistent_buffer_info = persistent_buffer_info_entry.get();
NVF_ERROR(
!persistent_buffer_info.persistent_buffers.empty(),
"Persistent scheduler requires persistent buffers.");
auto buffer_params = getPersistentBufferStorageParams(
fusion, runtime_info, data_cache, reduction_tvs, vectorize_factor);
fusion,
runtime_info,
data_cache,
reduction_tvs,
hp.vectorize_factor,
hp.threads_per_block_min,
hp.threads_per_block_max);

std::unique_ptr<ReductionParams> rparams = innerOuterPersistentHeuristic(
properties.total_iteration_numel,
Expand All @@ -757,7 +780,9 @@ std::unique_ptr<ReductionParams> getInnerOuterPersistentHeuristics(
buffer_params.smem_buffer_size,
buffer_params.smem_overhead,
max_outer_reduction_dtype_size,
vectorize_factor,
hp.vectorize_factor,
hp.threads_per_block_min,
hp.threads_per_block_max,
buffer_params.project_to_input,
runtime_info.getIndexType());

Expand Down Expand Up @@ -1244,9 +1269,29 @@ bool InnerOuterPersistentKernelScheduler::canScheduleRunTime(
data_cache,
(int)(reduced_tv->nDims() - properties.inner_most_dimension_ndims));

auto scheduler_hyperparameters_entry =
HeuristicDataCacheEntry<HeuristicCompileTime::SchedulerHyperParameters>(
data_cache, [&]() {
return std::make_unique<scheduler_utils::SchedulerHyperParameters>(
/*vectorize_factor=*/vectorize_factor,
/*unroll_factor=*/1,
/*threads_per_block_min=*/
InnerOuterPersistentKernelScheduler::threads_per_block_min,
/*threads_per_block_max=*/
InnerOuterPersistentKernelScheduler::threads_per_block_max);
});
scheduler_utils::SchedulerHyperParameters& hp =
scheduler_hyperparameters_entry.get();

// check if there is enough register and shared memory for persistence
const auto buffer_params = getPersistentBufferStorageParams(
fusion, runtime_info, data_cache, reduction_tvs, vectorize_factor);
fusion,
runtime_info,
data_cache,
reduction_tvs,
hp.vectorize_factor,
hp.threads_per_block_min,
hp.threads_per_block_max);

const int64_t device_multiprocessor_count =
(int64_t)at::cuda::getCurrentDeviceProperties()->multiProcessorCount;
Expand Down
2 changes: 2 additions & 0 deletions csrc/scheduler/registry.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -224,4 +224,6 @@ template class HeuristicDataCacheEntry<
template class HeuristicDataCacheEntry<HeuristicCompileTime::LogicalReorderMap>;
template class HeuristicDataCacheEntry<
HeuristicCompileTime::VectorizationBreakPointOfReductionProducer>;
template class HeuristicDataCacheEntry<
HeuristicCompileTime::SchedulerHyperParameters>;
} // namespace nvfuser
28 changes: 28 additions & 0 deletions csrc/scheduler/utils.h
Original file line number Diff line number Diff line change
Expand Up @@ -173,6 +173,34 @@ inline void parallelizeAllLike(
propagate_padding);
}

// Common hyperparameters used in heuristic scheduler. These hyperparameters
// are passed to SchedulerEntry::computeHeuristics through the
// HeuristicDataCache. These hyperparameters alter the generation of the
// HeuristicParams for the scheduler.
// Common hyperparameters used in heuristic schedulers. These hyperparameters
// are passed to SchedulerEntry::computeHeuristics through the
// HeuristicDataCache and constrain the generation of the HeuristicParams
// for the scheduler (akin to hyper-parameters for auto-tuning).
struct SchedulerHyperParameters {
  // Defaulted default constructor so the in-class member initializers below
  // are actually usable; without it, declaring the 4-argument constructor
  // suppresses the implicit default constructor and the `= 1` defaults are
  // dead code (`SchedulerHyperParameters{}` would not compile).
  SchedulerHyperParameters() = default;

  SchedulerHyperParameters(
      int64_t vectorize_factor_,
      int64_t unroll_factor_,
      int64_t threads_per_block_min_,
      int64_t threads_per_block_max_)
      : vectorize_factor(vectorize_factor_),
        unroll_factor(unroll_factor_),
        threads_per_block_min(threads_per_block_min_),
        threads_per_block_max(threads_per_block_max_) {}

  //! Number of elements to load per vectorized load.
  int64_t vectorize_factor = 1;

  //! Number of iterations to unroll a for-loop.
  int64_t unroll_factor = 1;

  //! Minimum number of threads per block.
  int64_t threads_per_block_min = 1;

  //! Maximum number of threads per block.
  int64_t threads_per_block_max = 1;
};

struct PersistentBufferInfo {
std::vector<TensorView*> persistent_buffers;
std::unordered_set<IterDomain*> unmappable_dims;
Expand Down
Loading

0 comments on commit c14d418

Please sign in to comment.