Skip to content

Commit

Permalink
Revert "Allow output to alias intermediate tensor, step-2 to solve gr…
Browse files Browse the repository at this point in the history
…oup norm segmentation issue #2375 (#2405)"

This reverts commit 15bdf9f.
  • Loading branch information
wujingyue committed Jul 21, 2024
1 parent 59a951f commit 7259833
Show file tree
Hide file tree
Showing 8 changed files with 18 additions and 204 deletions.
73 changes: 8 additions & 65 deletions csrc/alias_analysis.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,6 @@
#include <vector>

#include <alias_analysis.h>
#include <compute_at_map.h>
#include <dispatch.h>
#include <fusion.h>
#include <ir/interface_nodes.h>
Expand All @@ -18,8 +17,6 @@
#include <ir/utils.h>
#include <linked_hash_map.h>
#include <logical_domain_map.h>
#include <scheduler/reduction_utils.h>
#include <scheduler/registry_utils.h>

namespace nvfuser {

Expand Down Expand Up @@ -428,69 +425,18 @@ bool okToRelayout(
: tv->getMaybeAllocationDomain());
return new_layout.isCompliantWith({allocation, tv->getContiguity()});
}

// Returns true if some output tv's defining op may cause segmentation.
// Currently only a view (reshape) op interfering with a reduction is
// considered; returns false when there is no view-defined output, no
// reduction, or the view provably does not interfere.
// TODO: other ops?
bool outputInterferingReduction(Fusion* fusion) {
  // At least one output must be defined by a view op.
  const auto& output_tvs =
      ir_utils::filterByType<TensorView>(fusion->outputs());
  if (std::none_of(
          output_tvs.begin(), output_tvs.end(), [](TensorView* out_tv) {
            // An output that is forwarded directly from a fusion input has
            // no defining expression; guard against dereferencing nullptr.
            Expr* def = out_tv->definition();
            return def != nullptr && def->isA<ViewOp>();
          })) {
    return false;
  }

  // The fusion must contain reduction tvs for interference to matter.
  const auto& reduction_tvs = scheduler_utils::getReductionTvs(fusion);
  if (reduction_tvs.empty()) {
    return false;
  }

  // A view that requires forward replay already forces segmentation.
  ComputeAtMap ca_map(fusion);
  if (registry_utils::requiresForwardViewReplay(fusion, ca_map)) {
    return true;
  }

  // Check whether the view op interferes with a representative reduction.
  auto ref_redu_tv =
      reduction_scheduler_utils::getRepresentativeReductionTv(reduction_tvs);
  if (registry_utils::reductionInterferingView(fusion, ca_map, ref_redu_tv)) {
    return true;
  }

  return false;
}

// Decides whether the alias-source walk should stop at `expr`.
// Only a view (reshape) op is treated as a stopping point, and only when
// the caller opted in via `stop_at_view`.
// TODO: other ops?
bool isOpsToStop(const Expr* expr, bool stop_at_view) {
  if (!stop_at_view) {
    return false;
  }
  return expr->isA<ViewOp>();
}

} // namespace

void AliasAnalysisResult::finalize(
Fusion* fusion,
const bool can_override_empty_allocation_domain,
const bool may_alias_intermediate) {
// If allow output to alias intermediate, then we don't need to walk up
// the chain to find an input or output root. It changes the last reshape
// in group norm to a no-op.
bool stop_at_view =
may_alias_intermediate && outputInterferingReduction(fusion);
const bool can_override_empty_allocation_domain) {
for (auto [alias, source_and_layout] : alias_to_source_) {
auto [root, preferred_layout] = source_and_layout;
if (!isOpsToStop(alias->definition(), stop_at_view)) {
while (root != nullptr && !root->isFusionInput() &&
!root->isFusionOutput()) {
const auto i = alias_to_source_.find(root);
root = (i == alias_to_source_.end() ? nullptr : i->second.first);
}
// Walks up the `alias_to_source_` chain.
while (root != nullptr && !root->isFusionInput() &&
!root->isFusionOutput()) {
const auto i = alias_to_source_.find(root);
root = (i == alias_to_source_.end() ? nullptr : i->second.first);
}

if (root == nullptr) {
continue;
}
Expand Down Expand Up @@ -541,8 +487,7 @@ std::string AliasAnalysisResult::toString(const int indent_size) const {

AliasAnalysisResult findAliases(
Fusion* fusion,
const bool can_override_empty_allocation_domain,
const bool may_alias_intermediate) {
const bool can_override_empty_allocation_domain) {
AliasAnalysisResult analysis;
AliasFinder finder(analysis);
// Fusion::exprs() computes and returns topological order.
Expand All @@ -554,9 +499,7 @@ AliasAnalysisResult findAliases(
// results).
finder.dispatch(expr);
}

analysis.finalize(
fusion, can_override_empty_allocation_domain, may_alias_intermediate);
analysis.finalize(can_override_empty_allocation_domain);
return analysis;
}

Expand Down
8 changes: 2 additions & 6 deletions csrc/alias_analysis.h
Original file line number Diff line number Diff line change
Expand Up @@ -59,10 +59,7 @@ class AliasAnalysisResult {
// Computes transitive aliases and caches them in `alias_to_root_`.
// See `findAliases` for the meaning of
// `can_override_empty_allocation_domain`.
void finalize(
Fusion* fusion,
bool can_override_empty_allocation_domain,
bool can_alias_intermediate);
void finalize(bool can_override_empty_allocation_domain);

// Returns the preferred layout. If `alias` is not in `alias_to_source_`,
// returns the `TensorView`'s initial layout.
Expand Down Expand Up @@ -116,7 +113,6 @@ class AliasAnalysisResult {
// Fusion::aliasOutputToInput to mark aliases.
AliasAnalysisResult findAliases(
Fusion* fusion,
bool can_override_empty_allocation_domain = true,
bool may_alias_intermediate = false);
bool can_override_empty_allocation_domain = true);

} // namespace nvfuser
6 changes: 2 additions & 4 deletions csrc/preseg_passes/mark_aliases_prepare.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -15,10 +15,8 @@
namespace nvfuser::preseg_passes {

void MarkAliasesPreparePass::runPass(Fusion* fusion) {
const AliasAnalysisResult analysis = findAliases(
fusion,
/*can_override_empty_allocation_domain=*/true,
/*may_alias_intermediate=*/true);
const AliasAnalysisResult analysis =
findAliases(fusion, /*can_override_empty_allocation_domain=*/true);
if (isDebugDumpEnabled(DebugDumpOption::PreSegmenterLogging)) {
debug() << "Alias analysis result:" << std::endl;
debug() << analysis.toString(/*indent_size=*/1) << std::endl;
Expand Down
6 changes: 2 additions & 4 deletions csrc/scheduler/mark_aliases.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -25,10 +25,8 @@ void markAliases(Fusion* fusion) {
fusion->printMath();
}

const AliasAnalysisResult analysis = findAliases(
fusion,
/*can_override_empty_allocation_domain=*/false,
/*may_alias_intermediate=*/false);
const AliasAnalysisResult analysis =
findAliases(fusion, /*can_override_empty_allocation_domain=*/false);
if (isDebugDumpEnabled(DebugDumpOption::SchedulerVerbose)) {
vlog("Alias analysis result:\n", analysis.toString(/*indent_size=*/1));
}
Expand Down
6 changes: 2 additions & 4 deletions csrc/scheduler/no_op.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -32,10 +32,8 @@ NoOpScheduler::NoOpScheduler(

namespace {
bool allOutputsArePointerArithmetics(Fusion* fusion) {
const AliasAnalysisResult analysis = findAliases(
fusion,
/*can_override_empty_allocation_domain=*/false,
/*may_alias_intermediate=*/false);
const AliasAnalysisResult analysis =
findAliases(fusion, /*can_override_empty_allocation_domain=*/false);
auto out_tvs = ir_utils::filterByType<TensorView>(fusion->outputs());
return std::all_of(
out_tvs.begin(), out_tvs.end(), [&analysis, fusion](TensorView* out) {
Expand Down
15 changes: 0 additions & 15 deletions csrc/scheduler/reduction_utils.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -937,20 +937,5 @@ std::ostream& operator<<(std::ostream& os, ReductionType reduction_type) {
return os;
}

TensorView* getRepresentativeReductionTv(
    const std::vector<TensorView*>& reduction_tvs) {
  // Remember the first reduction tv of each kind: innermost-dim
  // ("inner") reductions vs. all others ("outer").
  TensorView* first_inner = nullptr;
  TensorView* first_outer = nullptr;
  for (TensorView* tv : reduction_tvs) {
    TensorView*& slot = scheduler_utils::isFastestDimReduction(tv)
        ? first_inner
        : first_outer;
    if (slot == nullptr) {
      slot = tv;
    }
  }
  // Inner reductions take precedence: they represent both inner and
  // inner-outer schedules. Fall back to the first outer reduction,
  // which is nullptr when `reduction_tvs` is empty.
  return first_inner != nullptr ? first_inner : first_outer;
}
} // namespace reduction_scheduler_utils
} // namespace nvfuser
8 changes: 0 additions & 8 deletions csrc/scheduler/reduction_utils.h
Original file line number Diff line number Diff line change
Expand Up @@ -105,13 +105,5 @@ std::string toString(ReductionType reduction_type);
ReductionType getReductionType(Fusion* fusion);
ReductionType getReductionType(const std::vector<TensorView*>& reduction_tvs);

//! Get the representative reduction tv from the given reduction tvs.
//! If there are no reduction tvs, return nullptr.
//! If there are only inner reduction tvs, return the first inner reduction tv.
//! If there are only outer reduction tvs, return the first outer reduction tv.
//! If there are both inner and outer reduction tvs, return the first inner
//! reduction tv.
TensorView* getRepresentativeReductionTv(
const std::vector<TensorView*>& reduction_tvs);
} // namespace reduction_scheduler_utils
} // namespace nvfuser
100 changes: 2 additions & 98 deletions tests/cpp/test_gpu_view.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -47,10 +47,9 @@
#include <iostream>

namespace nvfuser {
using testing::Contains;
using testing::UnorderedElementsAre;

using namespace at::indexing;

using GpuViewTest = NVFuserTest;

TEST_F(GpuViewTest, FusionViewDtypeSameSizeOutput) {
Expand Down Expand Up @@ -2380,102 +2379,6 @@ TEST_F(GpuViewTest, SplitMergePointwiseSplitMerge) {
testValidate(executor_cache.fusion(), {cg_outputs}, {t0}, __LINE__, __FILE__);
}

// segmented into 2 kernels: pointwise and reduction
// Builds a group-norm-like fusion (cast -> reshape to groups -> reduce ->
// normalize -> reshape back -> scale/bias/cast) and checks that it is
// currently segmented into exactly one pointwise and one reduction kernel.
TEST_F(GpuViewTest, GroupNormOriginal) {
  auto fusion = std::make_unique<Fusion>();
  FusionGuard fg(fusion.get());
  // N batches, C channels split into G groups of C / G, over an HxW plane.
  const int64_t N = 2, C = 128, H = 16, W = 16, G = 32;
  const std::vector<int64_t> input_shape = {N, C, H, W};
  const std::vector<int64_t> group_shape = {N, G, C / G, H, W};
  const std::vector<int64_t> input_shape_wb = {C};
  // NOTE(review): group_shape_wb is declared but never used below — confirm
  // whether it was meant for a grouped weight/bias reshape.
  const std::vector<int64_t> group_shape_wb = {G, C / G};
  DataType dtype = DataType::Half;
  // tv0: input tensor; tv1/tv2: per-channel weight and bias in fp32.
  auto tv0 = makeContigTensor(input_shape.size(), dtype);
  auto tv1 = makeContigTensor(input_shape_wb.size(), DataType::Float);
  auto tv2 = makeContigTensor(input_shape_wb.size(), DataType::Float);
  fusion->addInput(tv0);
  fusion->addInput(tv1);
  fusion->addInput(tv2);
  // pointwise ops, e.g. cast
  auto tv3 = castOp(DataType::Float, tv0);
  // reshape from {N, C, H, W} to {N, G, C / G, H, W}
  auto tv4 = reshape(tv3, input_shape, group_shape);
  // normalization
  auto tv5 = sum(tv4, {-1, -2, -3});
  auto tv6 = broadcast(tv5, {false, false, true, true, true});
  auto tv7 = div(tv4, tv6);
  // reshape back to {N, C, H, W}
  auto tv8 = reshape(tv7, group_shape, input_shape);
  // pointwise ops, e.g. scale, bias, cast
  auto tv9 = broadcast(tv1, {true, false, true, true});
  auto tv10 = broadcast(tv2, {true, false, true, true});
  auto tv11 = mul(tv8, tv9);
  auto tv12 = add(tv11, tv10);
  auto tv13 = castOp(dtype, tv12);
  fusion->addOutput(tv13);

  auto options =
      at::TensorOptions().dtype(data_type_to_aten(dtype)).device(at::kCUDA, 0);
  auto options_wb = at::TensorOptions()
                        .dtype(data_type_to_aten(DataType::Float))
                        .device(at::kCUDA, 0);
  auto t0 = at::randn(input_shape, options);
  auto tw = at::randn(input_shape_wb, options_wb);
  auto tb = at::randn(input_shape_wb, options_wb);

  FusionExecutorCache executor_cache(std::move(fusion));
  auto cg_outputs = executor_cache.runFusionWithInputs({t0, tw, tb});
  // should expect 1 after adding a pre-segment pass to move reshape to input
  // and output.
  EXPECT_THAT(
      executor_cache.getMostRecentKernelRuntime()->fusionSegments()->groups(),
      UnorderedElementsAre(
          HeuristicIs(ScheduleHeuristic::PointWise),
          HeuristicIs(ScheduleHeuristic::Reduction)));

  testValidate(
      executor_cache.fusion(), cg_outputs, {t0, tw, tb}, __LINE__, __FILE__);
}

// Checks that a reshape-defined output can alias an intermediate tensor:
// the final reshape should lower to a NoOp segment while the normalization
// runs as a single InnerPersistent kernel.
TEST_F(GpuViewTest, OutputAliasIntermediate) {
  auto fusion = std::make_unique<Fusion>();
  FusionGuard fg(fusion.get());
  const int64_t N = 2, C = 128, H = 16, W = 16, G = 32;
  // Input already in grouped layout; output reshaped back to {N, C, H, W}.
  const std::vector<int64_t> input_shape = {N, G, C / G, H, W};
  const std::vector<int64_t> output_shape = {N, C, H, W};
  DataType dtype = DataType::Half;
  auto tv0 = makeContigTensor(input_shape.size(), dtype);
  fusion->addInput(tv0);
  // Normalize over the innermost three dims, then reshape for the output.
  auto tv1 = castOp(DataType::Float, tv0);
  auto tv2 = set(tv1);
  auto tv3 = sum(tv2, {-1, -2, -3});
  auto tv4 = broadcast(tv3, {false, false, true, true, true});
  auto tv5 = div(tv2, tv4);
  auto tv6 = castOp(dtype, tv5);
  auto tv7 = reshape(tv6, input_shape, output_shape);
  fusion->addOutput(tv7);

  auto options =
      at::TensorOptions().dtype(data_type_to_aten(dtype)).device(at::kCUDA, 0);
  auto t0 = at::randn(input_shape, options);
  // Reference computed with ATen for validation below.
  auto t1 = t0.to(at::kFloat);
  auto t2 = t1.sum({-1, -2, -3}).unsqueeze(-1).unsqueeze(-1).unsqueeze(-1);
  auto t3 = t1 / t2;
  auto t4 = t3.reshape(output_shape);

  FusionExecutorCache executor_cache(std::move(fusion));
  auto cg_outputs = executor_cache.runFusionWithInputs({t0});
  // Expect exactly one NoOp segment (the aliasing reshape) and one
  // InnerPersistent segment (the normalization).
  const std::vector<SegmentedGroup*>& seg_groups =
      executor_cache.getMostRecentKernelRuntime()->fusionSegments()->groups();
  EXPECT_THAT(
      seg_groups, Contains(HeuristicIs(ScheduleHeuristic::NoOp)).Times(1));
  EXPECT_THAT(
      seg_groups,
      Contains(HeuristicIs(ScheduleHeuristic::InnerPersistent)).Times(1));
  testValidate(
      executor_cache.fusion(), cg_outputs, {t0}, {t4}, __LINE__, __FILE__);
}

using ReductionAxes = std::vector<int64_t>;
class ViewReductionTest : public NVFuserFixtureParamTest<ReductionAxes> {};

Expand Down Expand Up @@ -2619,4 +2522,5 @@ TEST_F(GpuViewTest, GroupNorm) {
testValidate(
executor_cache.fusion(), cg_outputs, {t0}, {ref}, __LINE__, __FILE__);
}

} // namespace nvfuser

0 comments on commit 7259833

Please sign in to comment.