Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Create dispatch system for executors #3263

Merged
merged 42 commits into from
Nov 13, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
42 commits
Select commit Hold shift + click to select a range
6aa977f
Broke apart executors, tests will fail, need dispatch.
csarofeen Oct 20, 2024
c23c38d
Change name: FusionExecutor -> KernelExecutor
csarofeen Oct 20, 2024
bf835ec
Draft executor dispatch.
csarofeen Oct 20, 2024
3d7f017
Rename fe->ke for benchmarks.
csarofeen Oct 20, 2024
61f7cb5
Fix instrumentation in executors.
csarofeen Oct 20, 2024
48fdaef
Fix build with executor dispatch (still test failures). Make sure Fus…
csarofeen Oct 20, 2024
a44018e
Finish rename FusionExecutor fe -> KernelExecutor ke
csarofeen Oct 21, 2024
0e10199
All but one nvfuser_tests pass.
csarofeen Oct 23, 2024
8ed49a1
No op scheduler generates empty kernels.
csarofeen Oct 24, 2024
b8c3118
Rename HostIrExecutor to HostIrEvaluator, move HostIRExecutor to mult…
csarofeen Oct 25, 2024
21340b4
Rename inputs/outputBytesAccessed to computeBytes
csarofeen Oct 25, 2024
84c2332
In KernelExecutor rename compileFusion and runFusion to compile and run
csarofeen Oct 25, 2024
075c293
Remove executor_abstract.cpp as it wasn't used
csarofeen Oct 25, 2024
9229abd
Revert whitespace changes.
csarofeen Oct 25, 2024
fc5d788
Merge branch 'main' of https://github.com/NVIDIA/Fuser into executor_…
csarofeen Oct 25, 2024
fe05a3a
Fix merge conflicts.
csarofeen Oct 25, 2024
76a100d
Fix matmul tests.
csarofeen Oct 26, 2024
55b24f6
Fix executor in host ir evaluator for host unit, send it to KernelExe…
csarofeen Nov 4, 2024
d372ae2
Merge branch 'main' of https://github.com/NVIDIA/Fuser into executor_…
csarofeen Nov 4, 2024
097674d
Merge fixes.
csarofeen Nov 4, 2024
c917725
clang tidy
csarofeen Nov 4, 2024
cb14bb9
profiler fix
csarofeen Nov 4, 2024
88a58cd
Allow serializing uncompiled KernelExecutor.
csarofeen Nov 4, 2024
d829e41
Merge branch 'main' of https://github.com/NVIDIA/Fuser into executor_…
csarofeen Nov 5, 2024
38cce1e
Merge fixes.
csarofeen Nov 5, 2024
68e2692
Fix serialization errors in executor dispatch (#3353)
rdspring1 Nov 6, 2024
2749521
PR Comments part 1.
csarofeen Nov 6, 2024
198e6d4
PR Comments part 2.
csarofeen Nov 6, 2024
f4acdb7
Merge branch 'main' into executor_dispatch
naoyam Nov 6, 2024
6312789
Merge branch 'main' into executor_dispatch
naoyam Nov 6, 2024
3f2503f
Last PR Comment cleanup.
csarofeen Nov 6, 2024
8290e5c
Merge branch 'main' into executor_dispatch
naoyam Nov 7, 2024
dccaef0
Avoid clang-tidy warning
naoyam Nov 7, 2024
9b1a4e4
Merge branch 'main' into executor_dispatch
naoyam Nov 7, 2024
b33c8e1
cleanup
naoyam Nov 8, 2024
0c8043f
cleanup
naoyam Nov 8, 2024
b1634f3
cleanup
naoyam Nov 8, 2024
6df7ae1
Merge branch 'main' into executor_dispatch
naoyam Nov 8, 2024
6ebc56f
cleanup
naoyam Nov 8, 2024
f10c3af
typo
naoyam Nov 8, 2024
5e24332
Set `fusion_id` and `device_id` of `KernelExecutor` in constructor fo…
rdspring1 Nov 8, 2024
fa68935
Merge branch 'main' into executor_dispatch
naoyam Nov 13, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -203,6 +203,7 @@ list(APPEND NVFUSER_SRCS
${NVFUSER_SRCS_DIR}/rng.cpp
${NVFUSER_SRCS_DIR}/runtime/allocations.cpp
${NVFUSER_SRCS_DIR}/runtime/executor.cpp
${NVFUSER_SRCS_DIR}/runtime/executor_dispatch.cpp
${NVFUSER_SRCS_DIR}/runtime/executor_kernel_arg.cpp
${NVFUSER_SRCS_DIR}/runtime/executor_params.cpp
${NVFUSER_SRCS_DIR}/runtime/executor_utils.cpp
Expand Down
13 changes: 6 additions & 7 deletions benchmarks/cpp/utils.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -192,7 +192,8 @@ int64_t runBenchmarkIterations(

const auto& compile_log = executor_cache->getMostRecentExecutorInfo();
auto params = toString(compile_log.params);
auto lparams = toString(compile_log.fusion_executor->lastLaunchParams());
auto lparams = toString(
compile_log.fusion_executor->as<KernelExecutor>()->lastLaunchParams());
// Only set if not segmented. In the case of segmented fusions,
// this could be confusing as the log would reflect only the last
// segment. Revisit if necessary.
Expand Down Expand Up @@ -223,19 +224,18 @@ int64_t runBenchmarkIterations(

int64_t runBenchmarkIterations(
benchmark::State& benchmark_state,
KernelExecutor* fusion_executor,
KernelExecutor* ke,
std::vector<c10::IValue>& aten_inputs,
const LaunchParams& launch_constraints,
CompileParams compile_params) {
int64_t io_bytes = getSizeOfInputs(aten_inputs);
{
// Warm-up run
auto cg_outputs =
fusion_executor->run(aten_inputs, launch_constraints, compile_params);
auto cg_outputs = ke->run(aten_inputs, launch_constraints, compile_params);
io_bytes += getSizeOfOutputs(cg_outputs);
}

auto lparams = toString(fusion_executor->lastLaunchParams());
auto lparams = toString(ke->lastLaunchParams());
benchmark_state.SetLabel(lparams);

// Sync everything up before we start
Expand All @@ -246,8 +246,7 @@ int64_t runBenchmarkIterations(
clearL2Cache();
FusionProfiler::start();
FusionProfiler::createSegments(1);
auto cg_outputs =
fusion_executor->run(aten_inputs, launch_constraints, compile_params);
auto cg_outputs = ke->run(aten_inputs, launch_constraints, compile_params);
FusionProfiler::stop();
benchmark_state.SetIterationTime(
FusionProfiler::profile().kernel_time_ms / 1000.0);
Expand Down
2 changes: 1 addition & 1 deletion benchmarks/cpp/utils.h
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@ int64_t runBenchmarkIterations(
//! kernel time is added to benchmark_state.
int64_t runBenchmarkIterations(
benchmark::State& benchmark_state,
KernelExecutor* fusion_executor,
KernelExecutor* ke,
std::vector<c10::IValue>& aten_inputs,
const LaunchParams& launch_constraints = LaunchParams(),
CompileParams compile_params = CompileParams());
Expand Down
7 changes: 0 additions & 7 deletions csrc/fusion.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -807,13 +807,6 @@ bool Fusion::hasDynamicTransform() {
return !ir_utils::getTVsWithDynamicTransform(this).empty();
}

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Just moved this function to executor.cpp as it wasn't used anywhere else.

// Reports whether every output of `fusion` has an output alias whose type is
// AllocationType::Evaluate. A fusion with no outputs yields true.
bool isExpressionEvaluated(Fusion* fusion) {
  for (Val* out : fusion->outputs()) {
    // A single output of any other allocation type is enough to answer "no".
    if (fusion->getOutputAlias(out).type != AllocationType::Evaluate) {
      return false;
    }
  }
  return true;
}

namespace {
std::vector<TensorView*> findAllTvs(Fusion* fusion) {
auto used_vals = fusion->usedMathVals();
Expand Down
3 changes: 0 additions & 3 deletions csrc/fusion.h
Original file line number Diff line number Diff line change
Expand Up @@ -506,7 +506,4 @@ void Fusion::manage(std::string key, T data) {
});
}

// Returns true if all fusion outputs are expression evaluated.
bool isExpressionEvaluated(Fusion* fusion);

} // namespace nvfuser
6 changes: 2 additions & 4 deletions csrc/fusion_profiler.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -326,17 +326,15 @@ SegmentProfiler::SegmentProfiler(uint32_t id, bool cupti_disabled)
output_bytes_(0),
kernel_profile_state_(ProfilerState::Ready) {}

void SegmentProfiler::startCompile(int device) {
device_ = device;
void SegmentProfiler::startCompile() {
Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Separated out set device as a separate function. KernelExecutor knows device on compilation since runtime information is needed for it, the other executors set it in run.

compile_timer_.start();
}

// Stops compile_timer_, the host timer that startCompile() starts for this
// segment.
void SegmentProfiler::stopCompile() {
compile_timer_.stop();
}

void SegmentProfiler::startKernel(int device) {
device_ = device;
void SegmentProfiler::startKernel() {
NVF_CHECK(
kernel_profile_state_ == ProfilerState::Ready,
"ProfilerState is not Ready!",
Expand Down
17 changes: 10 additions & 7 deletions csrc/fusion_profiler.h
Original file line number Diff line number Diff line change
Expand Up @@ -186,12 +186,15 @@ class SegmentProfiler {
public:
SegmentProfiler(uint32_t id, bool cupti_disabled);

void startCompile(int device);
void startCompile();
void stopCompile();

void startKernel(int device);
void startKernel();
void stopKernel();

// Records the device index for this segment. The value arrives as int64_t
// and is narrowed to int explicitly with a named cast before being stored
// in device_.
void setDevice(int64_t device) {
  device_ = static_cast<int>(device);
}
void inputBytesAccessed(int64_t bytes);
void outputBytesAccessed(int64_t bytes);

Expand Down Expand Up @@ -219,13 +222,13 @@ class SegmentProfiler {
private:
bool cupti_disabled_;

int device_;
uint32_t segment_id_;
int device_ = -1;
uint32_t segment_id_ = -1;

HostTimer compile_timer_;
int64_t input_bytes_;
int64_t output_bytes_;
std::string scheduler_;
int64_t input_bytes_ = -1;
int64_t output_bytes_ = -1;
std::string scheduler_ = "None";
ProfilerState kernel_profile_state_;
};

Expand Down
Loading
Loading