Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Allow output to alias intermediate tensor, step-2 to solve group norm segmentation issue #2375 #2405

Merged
merged 31 commits into from
Jul 19, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
31 commits
Select commit Hold shift + click to select a range
957e561
reshape noop
liqiangxl Jun 8, 2024
a8b950f
wip
liqiangxl Jun 8, 2024
01ca998
wip
liqiangxl Jun 13, 2024
f4677fa
Merge branch 'llu/group_norm' of https://github.com/nvidia/fuser into…
liqiangxl Jun 13, 2024
6878ed4
Merge branch 'main' into llu/group_norm
liqiangxl Jun 13, 2024
8ff121d
clean
liqiangxl Jun 13, 2024
a11289b
revert alias analysis
liqiangxl Jun 13, 2024
fc88498
extend alias analysis
liqiangxl Jun 13, 2024
307642d
skip AllocationDomainPass
liqiangxl Jun 13, 2024
a5f7bd8
add tests
liqiangxl Jun 13, 2024
9ff6ffc
tests
liqiangxl Jun 13, 2024
8ace48b
Merge branch 'main' into llu/group_norm_step1_alias
liqiangxl Jun 18, 2024
ca19e6f
reshape in reduction/norm scheduler
liqiangxl Jun 18, 2024
a3e993d
Merge branch 'main' into llu/group_norm_step1_alias
liqiangxl Jun 25, 2024
d50e790
clean
liqiangxl Jun 25, 2024
7175220
only keep alias for output view intermediate
liqiangxl Jun 25, 2024
526d915
merge main
liqiangxl Jul 15, 2024
7a131d4
output alias intermediate
liqiangxl Jul 15, 2024
4e412c1
Merge branch 'main' into llu/group_norm_step1_alias
liqiangxl Jul 15, 2024
6ff7f1d
Merge branch 'main' into llu/group_norm_step1_alias
liqiangxl Jul 16, 2024
f8a24a7
resolve conflict
liqiangxl Jul 17, 2024
14b5f69
Update csrc/alias_analysis.h
liqiangxl Jul 17, 2024
2c80957
Update csrc/scheduler/no_op.cpp
liqiangxl Jul 17, 2024
87e8cf8
Update tests/cpp/test_gpu_view.cpp
liqiangxl Jul 17, 2024
ca2d25a
trivial fix
liqiangxl Jul 17, 2024
cc0fc7a
move check to finalize
liqiangxl Jul 17, 2024
2e2bc41
merge main
liqiangxl Jul 17, 2024
9cec99e
Merge branch 'main' into llu/group_norm_step1_alias
liqiangxl Jul 17, 2024
b23cf8b
Merge branch 'main' into llu/group_norm_step1_alias
liqiangxl Jul 18, 2024
c9a7c58
clang tidy
liqiangxl Jul 18, 2024
a622530
merge main
liqiangxl Jul 19, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
73 changes: 65 additions & 8 deletions csrc/alias_analysis.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
#include <vector>

#include <alias_analysis.h>
#include <compute_at_map.h>
#include <dispatch.h>
#include <fusion.h>
#include <ir/interface_nodes.h>
Expand All @@ -17,6 +18,8 @@
#include <ir/utils.h>
#include <linked_hash_map.h>
#include <logical_domain_map.h>
#include <scheduler/reduction_utils.h>
#include <scheduler/registry_utils.h>

namespace nvfuser {

Expand Down Expand Up @@ -425,18 +428,69 @@ bool okToRelayout(
: tv->getMaybeAllocationDomain());
return new_layout.isCompliantWith({allocation, tv->getContiguity()});
}

// Returns true if an output tv's definition op may cause segmentation.
// Only a view op interfering with a reduction is considered.
// TODO: other ops?
bool outputInterferingReduction(Fusion* fusion) {
  // At least one output must be defined by a view op. Guard against outputs
  // with no definition (e.g. an input forwarded directly as an output), whose
  // definition() is nullptr.
  const auto& output_tvs =
      ir_utils::filterByType<TensorView>(fusion->outputs());
  if (std::none_of(
          output_tvs.begin(), output_tvs.end(), [](TensorView* out_tv) {
            return out_tv->definition() != nullptr &&
                out_tv->definition()->isA<ViewOp>();
          })) {
    return false;
  }

  // The fusion must have reduction tvs; otherwise a view can't interfere
  // with a reduction.
  const auto& reduction_tvs = scheduler_utils::getReductionTvs(fusion);
  if (reduction_tvs.empty()) {
    return false;
  }

  // Treat a required forward view replay as interfering.
  ComputeAtMap ca_map(fusion);
  if (registry_utils::requiresForwardViewReplay(fusion, ca_map)) {
    return true;
  }

  // Check if the view op interferes with the representative reduction tv.
  auto ref_redu_tv =
      reduction_scheduler_utils::getRepresentativeReductionTv(reduction_tvs);
  if (registry_utils::reductionInterferingView(fusion, ca_map, ref_redu_tv)) {
    return true;
  }

  return false;
}

// Whether the upward alias walk should stop at this expression.
// Currently only view ops act as stopping points, and only when
// `stop_at_view` is enabled.
// TODO: other ops?
bool isOpsToStop(const Expr* expr, bool stop_at_view) {
  if (!stop_at_view) {
    return false;
  }
  return expr->isA<ViewOp>();
}

} // namespace

void AliasAnalysisResult::finalize(
const bool can_override_empty_allocation_domain) {
Fusion* fusion,
const bool can_override_empty_allocation_domain,
const bool may_alias_intermediate) {
// If allow output to alias intermediate, then we don't need to walk up
// the chain to find an input or output root. It changes the last reshape
// in group norm to a no-op.
bool stop_at_view =
may_alias_intermediate && outputInterferingReduction(fusion);
for (auto [alias, source_and_layout] : alias_to_source_) {
auto [root, preferred_layout] = source_and_layout;
// Walks up the `alias_to_source_` chain.
while (root != nullptr && !root->isFusionInput() &&
!root->isFusionOutput()) {
const auto i = alias_to_source_.find(root);
root = (i == alias_to_source_.end() ? nullptr : i->second.first);
if (!isOpsToStop(alias->definition(), stop_at_view)) {
while (root != nullptr && !root->isFusionInput() &&
!root->isFusionOutput()) {
const auto i = alias_to_source_.find(root);
root = (i == alias_to_source_.end() ? nullptr : i->second.first);
}
}

if (root == nullptr) {
continue;
}
Expand Down Expand Up @@ -487,7 +541,8 @@ std::string AliasAnalysisResult::toString(const int indent_size) const {

AliasAnalysisResult findAliases(
Fusion* fusion,
const bool can_override_empty_allocation_domain) {
const bool can_override_empty_allocation_domain,
const bool may_alias_intermediate) {
AliasAnalysisResult analysis;
AliasFinder finder(analysis);
// Fusion::exprs() computes and returns topological order.
Expand All @@ -499,7 +554,9 @@ AliasAnalysisResult findAliases(
// results).
finder.dispatch(expr);
}
analysis.finalize(can_override_empty_allocation_domain);

analysis.finalize(
fusion, can_override_empty_allocation_domain, may_alias_intermediate);
return analysis;
}

Expand Down
8 changes: 6 additions & 2 deletions csrc/alias_analysis.h
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,10 @@ class AliasAnalysisResult {
// Computes transitive aliases and caches them in `alias_to_root_`.
// See `findAliases` for the meaning of
// `can_override_empty_allocation_domain`.
void finalize(bool can_override_empty_allocation_domain);
// NOTE(review): parameter renamed from `can_alias_intermediate` to
// `may_alias_intermediate` so the declaration matches the definition in
// alias_analysis.cpp and the corresponding `findAliases` parameter.
// Declaration-side parameter names don't affect callers.
void finalize(
    Fusion* fusion,
    bool can_override_empty_allocation_domain,
    bool may_alias_intermediate);

// Returns the preferred layout. If `alias` is not in `alias_to_source_`,
// returns the `TensorView`'s initial layout.
Expand Down Expand Up @@ -113,6 +116,7 @@ class AliasAnalysisResult {
// Fusion::aliasOutputToInput to mark aliases.
AliasAnalysisResult findAliases(
Fusion* fusion,
bool can_override_empty_allocation_domain = true);
bool can_override_empty_allocation_domain = true,
bool may_alias_intermediate = false);

} // namespace nvfuser
6 changes: 4 additions & 2 deletions csrc/preseg_passes/mark_aliases_prepare.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -15,8 +15,10 @@
namespace nvfuser::preseg_passes {

void MarkAliasesPreparePass::runPass(Fusion* fusion) {
const AliasAnalysisResult analysis =
findAliases(fusion, /*can_override_empty_allocation_domain=*/true);
const AliasAnalysisResult analysis = findAliases(
fusion,
/*can_override_empty_allocation_domain=*/true,
/*may_alias_intermediate=*/true);
if (isDebugDumpEnabled(DebugDumpOption::PreSegmenterLogging)) {
debug() << "Alias analysis result:" << std::endl;
debug() << analysis.toString(/*indent_size=*/1) << std::endl;
Expand Down
6 changes: 4 additions & 2 deletions csrc/scheduler/mark_aliases.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -25,8 +25,10 @@ void markAliases(Fusion* fusion) {
fusion->printMath();
}

const AliasAnalysisResult analysis =
findAliases(fusion, /*can_override_empty_allocation_domain=*/false);
const AliasAnalysisResult analysis = findAliases(
fusion,
/*can_override_empty_allocation_domain=*/false,
/*may_alias_intermediate=*/false);
if (isDebugDumpEnabled(DebugDumpOption::SchedulerVerbose)) {
vlog("Alias analysis result:\n", analysis.toString(/*indent_size=*/1));
}
Expand Down
6 changes: 4 additions & 2 deletions csrc/scheduler/no_op.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -32,8 +32,10 @@ NoOpScheduler::NoOpScheduler(

namespace {
bool allOutputsArePointerArithmetics(Fusion* fusion) {
const AliasAnalysisResult analysis =
findAliases(fusion, /*can_override_empty_allocation_domain=*/false);
const AliasAnalysisResult analysis = findAliases(
fusion,
/*can_override_empty_allocation_domain=*/false,
/*may_alias_intermediate=*/false);
auto out_tvs = ir_utils::filterByType<TensorView>(fusion->outputs());
return std::all_of(
out_tvs.begin(), out_tvs.end(), [&analysis](TensorView* out) {
Expand Down
15 changes: 15 additions & 0 deletions csrc/scheduler/reduction_utils.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -937,5 +937,20 @@ std::ostream& operator<<(std::ostream& os, ReductionType reduction_type) {
return os;
}

TensorView* getRepresentativeReductionTv(
    const std::vector<TensorView*>& reduction_tvs) {
  // Remember the first inner (fastest-dim) and the first outer reduction tv
  // encountered while scanning the whole list.
  TensorView* first_inner = nullptr;
  TensorView* first_outer = nullptr;
  for (TensorView* tv : reduction_tvs) {
    TensorView*& slot =
        scheduler_utils::isFastestDimReduction(tv) ? first_inner : first_outer;
    if (slot == nullptr) {
      slot = tv;
    }
  }
  // Prefer the inner reduction tv (used for inner and innerOuter reduction);
  // otherwise fall back to the outer one. nullptr if the input is empty.
  return first_inner != nullptr ? first_inner : first_outer;
}
} // namespace reduction_scheduler_utils
} // namespace nvfuser
8 changes: 8 additions & 0 deletions csrc/scheduler/reduction_utils.h
Original file line number Diff line number Diff line change
Expand Up @@ -105,5 +105,13 @@ std::string toString(ReductionType reduction_type);
ReductionType getReductionType(Fusion* fusion);
ReductionType getReductionType(const std::vector<TensorView*>& reduction_tvs);

//! Get the representative reduction tv from the given reduction tvs.
//! If there are no reduction tvs, return nullptr.
//! If there are only inner reduction tvs, return the first inner reduction tv.
//! If there are only outer reduction tvs, return the first outer reduction tv.
//! If there are both inner and outer reduction tvs, return the first inner
//! reduction tv.
TensorView* getRepresentativeReductionTv(
const std::vector<TensorView*>& reduction_tvs);
} // namespace reduction_scheduler_utils
} // namespace nvfuser
119 changes: 118 additions & 1 deletion tests/cpp/test_gpu_view.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -47,9 +47,9 @@
#include <iostream>

namespace nvfuser {
using testing::Contains;

using namespace at::indexing;

using GpuViewTest = NVFuserTest;

TEST_F(GpuViewTest, FusionViewDtypeSameSizeOutput) {
Expand Down Expand Up @@ -2379,4 +2379,121 @@ TEST_F(GpuViewTest, SplitMergePointwiseSplitMerge) {
testValidate(executor_cache.fusion(), {cg_outputs}, {t0}, __LINE__, __FILE__);
}

namespace {
// gmock matcher: succeeds when the argument's heuristic() equals the
// expected `heuristic` value. Used to count segment groups by scheduler.
MATCHER_P(HeuristicIs, heuristic, "") {
  return arg->heuristic() == heuristic;
}
} // namespace

// Original group norm pattern: currently segmented into 3 kernels
// (pointwise, normalization, pointwise).
// NOTE(review): removed the unused local `group_shape_wb` — it was defined
// but never referenced anywhere in this test.
TEST_F(GpuViewTest, GroupNormOriginal) {
  auto fusion = std::make_unique<Fusion>();
  FusionGuard fg(fusion.get());
  // N = batch, C = channels, H/W = spatial dims, G = number of groups.
  const int64_t N = 2, C = 128, H = 16, W = 16, G = 32;
  const std::vector<int64_t> input_shape = {N, C, H, W};
  const std::vector<int64_t> group_shape = {N, G, C / G, H, W};
  const std::vector<int64_t> input_shape_wb = {C};
  DataType dtype = DataType::Half;
  auto tv0 = makeContigTensor(input_shape.size(), dtype);
  auto tv1 = makeContigTensor(input_shape_wb.size(), DataType::Float);
  auto tv2 = makeContigTensor(input_shape_wb.size(), DataType::Float);
  fusion->addInput(tv0);
  fusion->addInput(tv1);
  fusion->addInput(tv2);
  // pointwise ops, e.g. cast
  auto tv3 = castOp(DataType::Float, tv0);
  // reshape from {N, C, H, W} to {N, G, C / G, H, W}
  auto tv4 = reshape(tv3, input_shape, group_shape);
  // normalization: reduce over the last three dims of the grouped shape
  auto tv5 = sum(tv4, {-1, -2, -3});
  auto tv6 = broadcast(tv5, {false, false, true, true, true});
  auto tv7 = div(tv4, tv6);
  // reshape back to {N, C, H, W}
  auto tv8 = reshape(tv7, group_shape, input_shape);
  // pointwise ops, e.g. scale, bias, cast
  auto tv9 = broadcast(tv1, {true, false, true, true});
  auto tv10 = broadcast(tv2, {true, false, true, true});
  auto tv11 = mul(tv8, tv9);
  auto tv12 = add(tv11, tv10);
  auto tv13 = castOp(dtype, tv12);
  fusion->addOutput(tv13);

  auto options =
      at::TensorOptions().dtype(data_type_to_aten(dtype)).device(at::kCUDA, 0);
  auto options_wb = at::TensorOptions()
                        .dtype(data_type_to_aten(DataType::Float))
                        .device(at::kCUDA, 0);
  auto t0 = at::randn(input_shape, options);
  auto tw = at::randn(input_shape_wb, options_wb);
  auto tb = at::randn(input_shape_wb, options_wb);
  // ATen reference computation mirroring the fusion above.
  auto t1 = t0.reshape(group_shape).to(at::kFloat);
  auto t2 = t1.sum({-1, -2, -3}).unsqueeze(-1).unsqueeze(-1).unsqueeze(-1);
  auto t3 = t1 / t2;
  auto t4 = t3.reshape(input_shape);
  auto t5 = tw.unsqueeze(0).unsqueeze(-1).unsqueeze(-1);
  auto t6 = tb.unsqueeze(0).unsqueeze(-1).unsqueeze(-1);
  auto t7 = t4.mul(t5).add(t6);

  FusionExecutorCache executor_cache(std::move(fusion));
  auto cg_outputs = executor_cache.runFusionWithInputs({t0, tw, tb});
  // should expect 1 after adding a pre-segment pass to move reshape to input
  // and output.
  auto seg_groups =
      executor_cache.getMostRecentKernelRuntime()->fusionSegments()->groups();

  EXPECT_THAT(
      seg_groups, Contains(HeuristicIs(ScheduleHeuristic::PointWise)).Times(2));
  EXPECT_THAT(
      seg_groups,
      Contains(HeuristicIs(ScheduleHeuristic::InnerPersistent)).Times(1));

  testValidate(
      executor_cache.fusion(),
      cg_outputs,
      {t0, tw, tb},
      {t7},
      __LINE__,
      __FILE__);
}

// An output produced by a trailing reshape can alias the intermediate
// tensor: expect one NoOp segment plus one InnerPersistent segment.
TEST_F(GpuViewTest, OutputAliasIntermediate) {
  auto fusion = std::make_unique<Fusion>();
  FusionGuard fg(fusion.get());
  const int64_t N = 2, C = 128, H = 16, W = 16, G = 32;
  const std::vector<int64_t> input_shape = {N, G, C / G, H, W};
  const std::vector<int64_t> output_shape = {N, C, H, W};
  DataType dtype = DataType::Half;

  // Fusion definition: cast -> copy -> normalize -> cast back -> reshape.
  auto in_tv = makeContigTensor(input_shape.size(), dtype);
  fusion->addInput(in_tv);
  auto in_float = castOp(DataType::Float, in_tv);
  auto cached = set(in_float);
  auto group_sum = sum(cached, {-1, -2, -3});
  auto sum_bcast = broadcast(group_sum, {false, false, true, true, true});
  auto normalized = div(cached, sum_bcast);
  auto out_half = castOp(dtype, normalized);
  auto out_reshaped = reshape(out_half, input_shape, output_shape);
  fusion->addOutput(out_reshaped);

  auto options =
      at::TensorOptions().dtype(data_type_to_aten(dtype)).device(at::kCUDA, 0);
  auto at_in = at::randn(input_shape, options);
  // ATen reference computation mirroring the fusion above.
  auto at_float = at_in.to(at::kFloat);
  auto at_sum =
      at_float.sum({-1, -2, -3}).unsqueeze(-1).unsqueeze(-1).unsqueeze(-1);
  auto at_norm = at_float / at_sum;
  auto at_ref = at_norm.reshape(output_shape);

  FusionExecutorCache executor_cache(std::move(fusion));
  auto cg_outputs = executor_cache.runFusionWithInputs({at_in});
  const std::vector<SegmentedGroup*>& seg_groups =
      executor_cache.getMostRecentKernelRuntime()->fusionSegments()->groups();
  EXPECT_THAT(
      seg_groups, Contains(HeuristicIs(ScheduleHeuristic::NoOp)).Times(1));
  EXPECT_THAT(
      seg_groups,
      Contains(HeuristicIs(ScheduleHeuristic::InnerPersistent)).Times(1));
  testValidate(
      executor_cache.fusion(),
      cg_outputs,
      {at_in},
      {at_ref},
      __LINE__,
      __FILE__);
}

} // namespace nvfuser
Loading