From e41de38e7c2629123aa35e55f973ccad90ae8808 Mon Sep 17 00:00:00 2001 From: "Shelegov, Maksim" Date: Wed, 20 Sep 2023 13:10:55 +0000 Subject: [PATCH] Split vectors containing dead elements Use LiveElements analysis in GenXLegalization pass to see if there are any dead elements in a vector. If so, split instruction to isolate the dead parts, which will be removed by later passes --- .../lib/GenXCodeGen/GenXLegalization.cpp | 32 ++++++- .../lib/GenXCodeGen/GenXLiveElements.cpp | 15 ++-- .../lib/GenXCodeGen/GenXLiveElements.h | 10 +++ .../test/GenXLegalization/debug_gstore.ll | 2 +- .../test/Legalization/debug-gstore.ll | 2 +- .../test/Legalization/live-elements.ll | 85 +++++++++++++++++++ 6 files changed, 137 insertions(+), 9 deletions(-) create mode 100644 IGC/VectorCompiler/test/Legalization/live-elements.ll diff --git a/IGC/VectorCompiler/lib/GenXCodeGen/GenXLegalization.cpp b/IGC/VectorCompiler/lib/GenXCodeGen/GenXLegalization.cpp index 5d29d3aa9830..eee2b1eea9f4 100644 --- a/IGC/VectorCompiler/lib/GenXCodeGen/GenXLegalization.cpp +++ b/IGC/VectorCompiler/lib/GenXCodeGen/GenXLegalization.cpp @@ -152,6 +152,7 @@ SPDX-License-Identifier: MIT #include "GenXAlignmentInfo.h" #include "GenXBaling.h" #include "GenXIntrinsics.h" +#include "GenXLiveElements.h" #include "GenXSubtarget.h" #include "GenXTargetMachine.h" #include "GenXUtil.h" @@ -213,6 +214,7 @@ struct LegalPredSize { class GenXLegalization : public FunctionPass { enum { DETERMINEWIDTH_UNBALE = 0, DETERMINEWIDTH_NO_SPLIT = 256 }; GenXBaling *Baling = nullptr; + GenXFuncLiveElements *LE = nullptr; const GenXSubtarget *ST = nullptr; DominatorTree *DT = nullptr; ScalarEvolution *SE = nullptr; @@ -406,6 +408,7 @@ void initializeGenXLegalizationPass(PassRegistry &); INITIALIZE_PASS_BEGIN(GenXLegalization, "GenXLegalization", "GenXLegalization", false, false) INITIALIZE_PASS_DEPENDENCY(GenXFuncBaling) +INITIALIZE_PASS_DEPENDENCY(GenXFuncLiveElements) INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass) INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) INITIALIZE_PASS_END(GenXLegalization, "GenXLegalization", "GenXLegalization", @@ -418,6 +421,7 @@ FunctionPass *llvm::createGenXLegalizationPass() { void GenXLegalization::getAnalysisUsage(AnalysisUsage &AU) const { AU.addRequired(); + AU.addRequired(); AU.addRequired(); AU.addRequired(); AU.addRequired(); @@ -430,6 +434,7 @@ void GenXLegalization::getAnalysisUsage(AnalysisUsage &AU) const { */ bool GenXLegalization::runOnFunction(Function &F) { Baling = &getAnalysis(); + LE = &getAnalysis(); SE = &getAnalysis().getSE(); ST = &getAnalysis() .getTM() @@ -1141,6 +1146,32 @@ unsigned GenXLegalization::determineWidth(unsigned WholeWidth, // * this legalization pass does not have access to FGs ExecSizeAllowedBits &= 0x1f; + auto *Head = B.getHeadIgnoreGStore(); + + if (WholeWidth > 1) { + Value *Dest = Head->Inst; + if (Head->Info.Type == BaleInfo::WRREGION || + Head->Info.Type == BaleInfo::WRPREDREGION || + Head->Info.Type == BaleInfo::WRPREDPREDREGION) + Dest = Head->Inst->getOperand(1); + auto LiveElems = LE->getLiveElements(Dest); + if (LiveElems.canSplitDead()) { + IGC_ASSERT(LiveElems[0].size() == WholeWidth); + bool StartBit = LiveElems[0][StartIdx]; + unsigned Idx = StartIdx + 1; + while (Idx < LiveElems[0].size() && LiveElems[0][Idx] == StartBit) + Idx++; + unsigned Size = Idx - StartIdx; + unsigned Mask = 0; + while (Size) { + Mask <<= 1; + Mask |= 1; + Size >>= 1; + } + ExecSizeAllowedBits &= Mask; + } + } + unsigned MainInstMinWidth = 1 << countTrailingZeros(ExecSizeAllowedBits, ZB_Undefined); // Determine the vector width that we need to split into. @@ -1148,7 +1179,6 @@ unsigned GenXLegalization::determineWidth(unsigned WholeWidth, unsigned Width = WholeWidth - StartIdx; unsigned PredMinWidth = 1; Value *WrRegionInput = nullptr; - auto Head = B.getHeadIgnoreGStore(); if (Head->Info.Type == BaleInfo::WRREGION) WrRegionInput = Head->Inst->getOperand(GenXIntrinsic::GenXRegion::OldValueOperandNum); diff --git a/IGC/VectorCompiler/lib/GenXCodeGen/GenXLiveElements.cpp b/IGC/VectorCompiler/lib/GenXCodeGen/GenXLiveElements.cpp index 068181fc4960..362a86fb3500 100644 --- a/IGC/VectorCompiler/lib/GenXCodeGen/GenXLiveElements.cpp +++ b/IGC/VectorCompiler/lib/GenXCodeGen/GenXLiveElements.cpp @@ -262,9 +262,6 @@ LiveElements LiveElementsAnalysis::getOperandLiveElements( IGC_ASSERT(OperandNo < Inst->getNumOperands()); auto OpTy = Inst->getOperand(OperandNo)->getType(); - if (InstLiveElems.isAllDead() && !Inst->mayHaveSideEffects()) - return LiveElements(OpTy); - if (auto BCI = dyn_cast(Inst)) return getBitCastLiveElements(BCI, InstLiveElems); @@ -299,10 +296,16 @@ LiveElements LiveElementsAnalysis::getOperandLiveElements( if (ID == GenXIntrinsic::genx_addc || ID == GenXIntrinsic::genx_subb) return getTwoDstInstLiveElements(InstLiveElems); - if (isElementWise(Inst)) - return InstLiveElems; + auto OpLiveElems = LiveElements(OpTy, !InstLiveElems.isAllDead() || + Inst->mayHaveSideEffects()); + if (!isElementWise(Inst) || InstLiveElems.size() != OpLiveElems.size()) + return OpLiveElems; + + for (unsigned Idx = 0; Idx < InstLiveElems.size(); Idx++) + if (InstLiveElems[Idx].size() != OpLiveElems[Idx].size()) + return OpLiveElems; - return LiveElements(OpTy, true); + return InstLiveElems; } // isRootInst : check if instruction should be the start point for backward diff --git a/IGC/VectorCompiler/lib/GenXCodeGen/GenXLiveElements.h b/IGC/VectorCompiler/lib/GenXCodeGen/GenXLiveElements.h index ef63cd37068a..d20880efe60a 100644 --- a/IGC/VectorCompiler/lib/GenXCodeGen/GenXLiveElements.h +++ b/IGC/VectorCompiler/lib/GenXCodeGen/GenXLiveElements.h @@ -79,6 +79,16 @@ class LiveElements { }); } + bool canSplitDead() const { + if (size() > 1 || isAllDead() || !isAnyDead()) + return false; + auto Bits = LiveElems[0]; + for (int Idx = Bits.find_first(); Idx <= Bits.find_last(); Idx++) + if (!Bits[Idx]) + return false; + return true; + } + bool operator==(const LiveElements &Rhs) const { return LiveElems == Rhs.LiveElems; } diff --git a/IGC/VectorCompiler/test/GenXLegalization/debug_gstore.ll b/IGC/VectorCompiler/test/GenXLegalization/debug_gstore.ll index 10709772ff72..c07bdec2c8ac 100644 --- a/IGC/VectorCompiler/test/GenXLegalization/debug_gstore.ll +++ b/IGC/VectorCompiler/test/GenXLegalization/debug_gstore.ll @@ -31,7 +31,7 @@ define void @test_transform(<128 x i8>* %a) !dbg !6 { entry: %0 = load <128 x i8>, <128 x i8>* %a, !dbg !12 call void @llvm.dbg.value(metadata <128 x i8> %0, metadata !9, metadata !DIExpression()), !dbg !12 - %1 = call <128 x i8> @llvm.genx.rdregioni.v128i8.v128i8.i16(<128 x i8> %0, i32 1, i32 1, i32 0, i16 16, i32 0), !dbg !13 + %1 = call <128 x i8> @llvm.genx.rdregioni.v128i8.v128i8.i16(<128 x i8> %0, i32 1, i32 1, i32 0, i16 0, i32 0), !dbg !13 call void @llvm.dbg.value(metadata <128 x i8> %1, metadata !9, metadata !DIExpression()), !dbg !13 store <128 x i8> %1, <128 x i8>* @global_vec, !dbg !14 ret void, !dbg !15 diff --git a/IGC/VectorCompiler/test/Legalization/debug-gstore.ll b/IGC/VectorCompiler/test/Legalization/debug-gstore.ll index a329f798459f..0b6bb8a4c1f5 100644 --- a/IGC/VectorCompiler/test/Legalization/debug-gstore.ll +++ b/IGC/VectorCompiler/test/Legalization/debug-gstore.ll @@ -29,7 +29,7 @@ define void @test_transform(<128 x i8>* %a) !dbg !6 { entry: %0 = load <128 x i8>, <128 x i8>* %a, !dbg !12 call void @llvm.dbg.value(metadata <128 x i8> %0, metadata !9, metadata !DIExpression()), !dbg !12 - %1 = call <128 x i8> @llvm.genx.rdregioni.v128i8.v128i8.i16(<128 x i8> %0, i32 1, i32 1, i32 0, i16 16, i32 0), !dbg !13 + %1 = call <128 x i8> @llvm.genx.rdregioni.v128i8.v128i8.i16(<128 x i8> %0, i32 1, i32 1, i32 0, i16 0, i32 0), !dbg !13 call void @llvm.dbg.value(metadata <128 x i8> %1, metadata !11, metadata !DIExpression()), !dbg !13 store <128 x i8> %1, <128 x i8>* @global_vec, !dbg !14 ret void, !dbg !15 diff --git a/IGC/VectorCompiler/test/Legalization/live-elements.ll b/IGC/VectorCompiler/test/Legalization/live-elements.ll new file mode 100644 index 000000000000..8e289d76eae3 --- /dev/null +++ b/IGC/VectorCompiler/test/Legalization/live-elements.ll @@ -0,0 +1,85 @@ +;=========================== begin_copyright_notice ============================ +; +; Copyright (C) 2023 Intel Corporation +; +; SPDX-License-Identifier: MIT +; +;============================ end_copyright_notice ============================= + +; RUN: opt %use_old_pass_manager% -GenXLegalization -march=genx64 -mcpu=Gen9 -mtriple=spir64-unknown-unknown -S < %s | FileCheck %s + +declare <16 x i32> @llvm.genx.oword.ld.v16i32(i32, i32, i32) + +declare <8 x i32> @llvm.genx.rdregioni.v8i32.v16i32.i16(<16 x i32>, i32, i32, i32, i16, i32) + +declare void @llvm.genx.oword.st.v8i32(i32, i32, <8 x i32>) + +; CHECK-LABEL: @test1 +; CHECK: [[LOAD:%[^ ]+]] = tail call <16 x i32> @llvm.genx.oword.ld.v16i32(i32 0, i32 1, i32 0) +; CHECK-NEXT: [[LOAD_SPLIT0:%[^ ]+]] = call <8 x i32> @llvm.genx.rdregioni.v8i32.v16i32.i16(<16 x i32> [[LOAD]], i32 8, i32 8, i32 1, i16 0, i32 undef) +; CHECK-NEXT: [[ADD_SPLIT0:%[^ ]+]] = add <8 x i32> [[LOAD_SPLIT0]], +; CHECK-NEXT: [[ADD_SPLIT0_JOIN0:%[^ ]+]] = call <16 x i32> @llvm.genx.wrregioni.v16i32.v8i32.i16.i1(<16 x i32> undef, <8 x i32> [[ADD_SPLIT0]], i32 0, i32 8, i32 1, i16 0, i32 undef, i1 true) +; CHECK-NEXT: [[LOAD_SPLIT8:%[^ ]+]] = call <8 x i32> @llvm.genx.rdregioni.v8i32.v16i32.i16(<16 x i32> [[LOAD]], i32 8, i32 8, i32 1, i16 32, i32 undef) +; CHECK-NEXT: [[ADD_SPLIT8:%[^ ]+]] = add <8 x i32> [[LOAD_SPLIT8]], +; CHECK-NEXT: [[ADD_SPLIT8_JOIN8:%[^ ]+]] = call <16 x i32> @llvm.genx.wrregioni.v16i32.v8i32.i16.i1(<16 x i32> [[ADD_SPLIT0_JOIN0]], <8 x i32> [[ADD_SPLIT8]], i32 0, i32 8, i32 1, i16 32, i32 undef, i1 true) +; CHECK-NEXT: [[RDREGION:%[^ ]+]] = tail call <8 x i32> @llvm.genx.rdregioni.v8i32.v16i32.i16(<16 x i32> [[ADD_SPLIT8_JOIN8]], i32 0, i32 8, i32 1, i16 0, i32 undef) +; CHECK-NEXT: tail call void @llvm.genx.oword.st.v8i32(i32 2, i32 0, <8 x i32> [[RDREGION]]) +define void @test1() { + %load = tail call <16 x i32> @llvm.genx.oword.ld.v16i32(i32 0, i32 1, i32 0) + %add = add <16 x i32> %load, + %rdregion = tail call <8 x i32> @llvm.genx.rdregioni.v8i32.v16i32.i16(<16 x i32> %add, i32 0, i32 8, i32 1, i16 0, i32 undef) + tail call void @llvm.genx.oword.st.v8i32(i32 2, i32 0, <8 x i32> %rdregion) + ret void +} + +; CHECK-LABEL: @test2 +; CHECK: [[LOAD:%[^ ]+]] = tail call <16 x i32> @llvm.genx.oword.ld.v16i32(i32 0, i32 1, i32 0) +; CHECK-NEXT: [[LOAD_SPLIT0:%[^ ]+]] = call <4 x i32> @llvm.genx.rdregioni.v4i32.v16i32.i16(<16 x i32> [[LOAD]], i32 4, i32 4, i32 1, i16 0, i32 undef) +; CHECK-NEXT: [[ADD_SPLIT0:%[^ ]+]] = add <4 x i32> [[LOAD_SPLIT0]], +; CHECK-NEXT: [[ADD_SPLIT0_JOIN0:%[^ ]+]] = call <16 x i32> @llvm.genx.wrregioni.v16i32.v4i32.i16.i1(<16 x i32> undef, <4 x i32> [[ADD_SPLIT0]], i32 0, i32 4, i32 1, i16 0, i32 undef, i1 true) +; CHECK-NEXT: [[LOAD_SPLIT4:%[^ ]+]] = call <8 x i32> @llvm.genx.rdregioni.v8i32.v16i32.i16(<16 x i32> [[LOAD]], i32 8, i32 8, i32 1, i16 16, i32 undef) +; CHECK-NEXT: [[ADD_SPLIT4:%[^ ]+]] = add <8 x i32> [[LOAD_SPLIT4]], +; CHECK-NEXT: [[ADD_SPLIT4_JOIN4:%[^ ]+]] = call <16 x i32> @llvm.genx.wrregioni.v16i32.v8i32.i16.i1(<16 x i32> [[ADD_SPLIT0_JOIN0]], <8 x i32> [[ADD_SPLIT4]], i32 0, i32 8, i32 1, i16 16, i32 undef, i1 true) +; CHECK-NEXT: [[LOAD_SPLIT12:%[^ ]+]] = call <4 x i32> @llvm.genx.rdregioni.v4i32.v16i32.i16(<16 x i32> [[LOAD]], i32 4, i32 4, i32 1, i16 48, i32 undef) +; CHECK-NEXT: [[ADD_SPLIT12:%[^ ]+]] = add <4 x i32> [[LOAD_SPLIT12]], +; CHECK-NEXT: [[ADD_SPLIT12_JOIN12:%[^ ]+]] = call <16 x i32> @llvm.genx.wrregioni.v16i32.v4i32.i16.i1(<16 x i32> [[ADD_SPLIT4_JOIN4]], <4 x i32> [[ADD_SPLIT12]], i32 0, i32 4, i32 1, i16 48, i32 undef, i1 true) +; CHECK-NEXT: [[RDREGION:%[^ ]+]] = tail call <8 x i32> @llvm.genx.rdregioni.v8i32.v16i32.i16(<16 x i32> [[ADD_SPLIT12_JOIN12]], i32 0, i32 8, i32 1, i16 16, i32 undef) +; CHECK-NEXT: tail call void @llvm.genx.oword.st.v8i32(i32 2, i32 0, <8 x i32> [[RDREGION]]) +define void @test2() { + %load = tail call <16 x i32> @llvm.genx.oword.ld.v16i32(i32 0, i32 1, i32 0) + %add = add <16 x i32> %load, + %rdregion = tail call <8 x i32> @llvm.genx.rdregioni.v8i32.v16i32.i16(<16 x i32> %add, i32 0, i32 8, i32 1, i16 16, i32 undef) + tail call void @llvm.genx.oword.st.v8i32(i32 2, i32 0, <8 x i32> %rdregion) + ret void +} + +; CHECK-LABEL: @test3 +; CHECK: [[LOAD:%[^ ]+]] = tail call <16 x i32> @llvm.genx.oword.ld.v16i32(i32 0, i32 1, i32 0) +; CHECK-NEXT: [[LOAD_SPLIT0:%[^ ]+]] = call <8 x i32> @llvm.genx.rdregioni.v8i32.v16i32.i16(<16 x i32> [[LOAD]], i32 8, i32 8, i32 1, i16 0, i32 undef) +; CHECK-NEXT: [[ADD_SPLIT0:%[^ ]+]] = add <8 x i32> [[LOAD_SPLIT0]], +; CHECK-NEXT: [[ADD_SPLIT0_JOIN0:%[^ ]+]] = call <16 x i32> @llvm.genx.wrregioni.v16i32.v8i32.i16.i1(<16 x i32> undef, <8 x i32> [[ADD_SPLIT0]], i32 0, i32 8, i32 1, i16 0, i32 undef, i1 true) +; CHECK-NEXT: [[LOAD_SPLIT8:%[^ ]+]] = call <8 x i32> @llvm.genx.rdregioni.v8i32.v16i32.i16(<16 x i32> [[LOAD]], i32 8, i32 8, i32 1, i16 32, i32 undef) +; CHECK-NEXT: [[ADD_SPLIT8:%[^ ]+]] = add <8 x i32> [[LOAD_SPLIT8]], +; CHECK-NEXT: [[ADD_SPLIT8_JOIN8:%[^ ]+]] = call <16 x i32> @llvm.genx.wrregioni.v16i32.v8i32.i16.i1(<16 x i32> [[ADD_SPLIT0_JOIN0]], <8 x i32> [[ADD_SPLIT8]], i32 0, i32 8, i32 1, i16 32, i32 undef, i1 true) +; CHECK-NEXT: [[RDREGION:%[^ ]+]] = tail call <8 x i32> @llvm.genx.rdregioni.v8i32.v16i32.i16(<16 x i32> [[ADD_SPLIT8_JOIN8]], i32 0, i32 8, i32 1, i16 32, i32 undef) +; CHECK-NEXT: tail call void @llvm.genx.oword.st.v8i32(i32 2, i32 0, <8 x i32> [[RDREGION]]) +define void @test3() { + %load = tail call <16 x i32> @llvm.genx.oword.ld.v16i32(i32 0, i32 1, i32 0) + %add = add <16 x i32> %load, + %rdregion = tail call <8 x i32> @llvm.genx.rdregioni.v8i32.v16i32.i16(<16 x i32> %add, i32 0, i32 8, i32 1, i16 32, i32 undef) + tail call void @llvm.genx.oword.st.v8i32(i32 2, i32 0, <8 x i32> %rdregion) + ret void +} + +; CHECK-LABEL: @test4 +; CHECK: [[LOAD:%[^ ]+]] = tail call <16 x i32> @llvm.genx.oword.ld.v16i32(i32 0, i32 1, i32 0) +; CHECK-NEXT: [[ADD:%[^ ]+]] = add <16 x i32> [[LOAD]], +; CHECK-NEXT: [[RDREGION:%[^ ]+]] = tail call <8 x i32> @llvm.genx.rdregioni.v8i32.v16i32.i16(<16 x i32> [[ADD]], i32 0, i32 8, i32 2, i16 0, i32 undef) +; CHECK-NEXT: tail call void @llvm.genx.oword.st.v8i32(i32 2, i32 0, <8 x i32> [[RDREGION]]) +define void @test4() { + %load = tail call <16 x i32> @llvm.genx.oword.ld.v16i32(i32 0, i32 1, i32 0) + %add = add <16 x i32> %load, + %rdregion = tail call <8 x i32> @llvm.genx.rdregioni.v8i32.v16i32.i16(<16 x i32> %add, i32 0, i32 8, i32 2, i16 0, i32 undef) + tail call void @llvm.genx.oword.st.v8i32(i32 2, i32 0, <8 x i32> %rdregion) + ret void +}