Skip to content

Commit

Permalink
Split vectors containing dead elements
Browse files Browse the repository at this point in the history
Use the LiveElements analysis in the GenXLegalization pass to check whether a
vector contains any dead elements. If it does, split the instruction to isolate
the dead parts, which are then removed by later passes.
  • Loading branch information
mshelego authored and igcbot committed Sep 26, 2023
1 parent e2454bc commit 6c461d2
Show file tree
Hide file tree
Showing 6 changed files with 143 additions and 9 deletions.
35 changes: 34 additions & 1 deletion IGC/VectorCompiler/lib/GenXCodeGen/GenXLegalization.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -152,6 +152,7 @@ SPDX-License-Identifier: MIT
#include "GenXAlignmentInfo.h"
#include "GenXBaling.h"
#include "GenXIntrinsics.h"
#include "GenXLiveElements.h"
#include "GenXSubtarget.h"
#include "GenXTargetMachine.h"
#include "GenXUtil.h"
Expand Down Expand Up @@ -213,6 +214,7 @@ struct LegalPredSize {
class GenXLegalization : public FunctionPass {
enum { DETERMINEWIDTH_UNBALE = 0, DETERMINEWIDTH_NO_SPLIT = 256 };
GenXBaling *Baling = nullptr;
GenXFuncLiveElements *LE = nullptr;
const GenXSubtarget *ST = nullptr;
DominatorTree *DT = nullptr;
ScalarEvolution *SE = nullptr;
Expand Down Expand Up @@ -406,6 +408,7 @@ void initializeGenXLegalizationPass(PassRegistry &);
INITIALIZE_PASS_BEGIN(GenXLegalization, "GenXLegalization", "GenXLegalization",
false, false)
INITIALIZE_PASS_DEPENDENCY(GenXFuncBaling)
INITIALIZE_PASS_DEPENDENCY(GenXFuncLiveElements)
INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
INITIALIZE_PASS_END(GenXLegalization, "GenXLegalization", "GenXLegalization",
Expand All @@ -418,6 +421,7 @@ FunctionPass *llvm::createGenXLegalizationPass() {

void GenXLegalization::getAnalysisUsage(AnalysisUsage &AU) const {
AU.addRequired<GenXFuncBaling>();
AU.addRequired<GenXFuncLiveElements>();
AU.addRequired<ScalarEvolutionWrapperPass>();
AU.addRequired<TargetPassConfig>();
AU.addRequired<DominatorTreeWrapperPass>();
Expand All @@ -430,6 +434,7 @@ void GenXLegalization::getAnalysisUsage(AnalysisUsage &AU) const {
*/
bool GenXLegalization::runOnFunction(Function &F) {
Baling = &getAnalysis<GenXFuncBaling>();
LE = &getAnalysis<GenXFuncLiveElements>();
SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
ST = &getAnalysis<TargetPassConfig>()
.getTM<GenXTargetMachine>()
Expand Down Expand Up @@ -1141,14 +1146,42 @@ unsigned GenXLegalization::determineWidth(unsigned WholeWidth,
// * this legalization pass does not have access to FGs
ExecSizeAllowedBits &= 0x1f;

auto *Head = B.getHeadIgnoreGStore();
unsigned MainInstMinWidth =
1 << countTrailingZeros(ExecSizeAllowedBits, ZB_Undefined);

if (WholeWidth > 1 && MainInstMinWidth == 1) {
Value *Dest = Head->Inst;
if (Head->Info.Type == BaleInfo::WRREGION ||
Head->Info.Type == BaleInfo::WRPREDREGION ||
Head->Info.Type == BaleInfo::WRPREDPREDREGION)
Dest = Head->Inst->getOperand(1);
auto LiveElems = LE->getLiveElements(Dest);
if (LiveElems.canSplitDead()) {
IGC_ASSERT(LiveElems[0].size() == WholeWidth);
bool StartBit = LiveElems[0][StartIdx];
unsigned Idx = StartIdx + 1;
while (Idx < LiveElems[0].size() && LiveElems[0][Idx] == StartBit)
Idx++;
unsigned Size = Idx - StartIdx;
unsigned Mask = 1;
while (Mask < Size) {
Mask <<= 1;
Mask |= 1;
}
if (StartBit && 2 * Size != Mask + 1) {
Mask <<= 1;
Mask |= 1;
}
ExecSizeAllowedBits &= Mask;
}
}

// Determine the vector width that we need to split into.
bool IsReadSameVector = false;
unsigned Width = WholeWidth - StartIdx;
unsigned PredMinWidth = 1;
Value *WrRegionInput = nullptr;
auto Head = B.getHeadIgnoreGStore();
if (Head->Info.Type == BaleInfo::WRREGION)
WrRegionInput =
Head->Inst->getOperand(GenXIntrinsic::GenXRegion::OldValueOperandNum);
Expand Down
15 changes: 9 additions & 6 deletions IGC/VectorCompiler/lib/GenXCodeGen/GenXLiveElements.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -262,9 +262,6 @@ LiveElements LiveElementsAnalysis::getOperandLiveElements(
IGC_ASSERT(OperandNo < Inst->getNumOperands());
auto OpTy = Inst->getOperand(OperandNo)->getType();

if (InstLiveElems.isAllDead() && !Inst->mayHaveSideEffects())
return LiveElements(OpTy);

if (auto BCI = dyn_cast<BitCastInst>(Inst))
return getBitCastLiveElements(BCI, InstLiveElems);

Expand Down Expand Up @@ -299,10 +296,16 @@ LiveElements LiveElementsAnalysis::getOperandLiveElements(
if (ID == GenXIntrinsic::genx_addc || ID == GenXIntrinsic::genx_subb)
return getTwoDstInstLiveElements(InstLiveElems);

if (isElementWise(Inst))
return InstLiveElems;
auto OpLiveElems = LiveElements(OpTy, !InstLiveElems.isAllDead() ||
Inst->mayHaveSideEffects());
if (!isElementWise(Inst) || InstLiveElems.size() != OpLiveElems.size())
return OpLiveElems;

for (unsigned Idx = 0; Idx < InstLiveElems.size(); Idx++)
if (InstLiveElems[Idx].size() != OpLiveElems[Idx].size())
return OpLiveElems;

return LiveElements(OpTy, true);
return InstLiveElems;
}

// isRootInst : check if instruction should be the start point for backward
Expand Down
13 changes: 13 additions & 0 deletions IGC/VectorCompiler/lib/GenXCodeGen/GenXLiveElements.h
Original file line number Diff line number Diff line change
Expand Up @@ -79,6 +79,19 @@ class LiveElements {
});
}

// canSplitDead : check whether the dead elements can be isolated by splitting
//
// Returns true only when all of the following hold:
//  * size() is at most 1, so only LiveElems[0] is inspected;
//  * the value is neither entirely dead nor entirely live;
//  * the index of the first set bit in LiveElems[0] is a multiple of 4;
//  * every bit between the first and the last set bit is set, i.e. the set
//    bits form a single contiguous run.
// NOTE(review): set bits presumably mark live elements, and the reason for the
// multiple-of-4 restriction is not visible from here - confirm both.
bool canSplitDead() const {
  if (size() > 1 || isAllDead() || !isAnyDead())
    return false;
  // Inspect the bit vector in place rather than copying it.
  const auto &Bits = LiveElems[0];
  const int First = Bits.find_first();
  if (First % 4)
    return false;
  // The last set bit does not move while scanning, so look it up only once.
  const int Last = Bits.find_last();
  for (int Idx = First; Idx <= Last; ++Idx)
    if (!Bits[Idx])
      return false;
  return true;
}

bool operator==(const LiveElements &Rhs) const {
return LiveElems == Rhs.LiveElems;
}
Expand Down
2 changes: 1 addition & 1 deletion IGC/VectorCompiler/test/GenXLegalization/debug_gstore.ll
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ define void @test_transform(<128 x i8>* %a) !dbg !6 {
entry:
%0 = load <128 x i8>, <128 x i8>* %a, !dbg !12
call void @llvm.dbg.value(metadata <128 x i8> %0, metadata !9, metadata !DIExpression()), !dbg !12
%1 = call <128 x i8> @llvm.genx.rdregioni.v128i8.v128i8.i16(<128 x i8> %0, i32 1, i32 1, i32 0, i16 16, i32 0), !dbg !13
%1 = call <128 x i8> @llvm.genx.rdregioni.v128i8.v128i8.i16(<128 x i8> %0, i32 1, i32 1, i32 0, i16 0, i32 0), !dbg !13
call void @llvm.dbg.value(metadata <128 x i8> %1, metadata !9, metadata !DIExpression()), !dbg !13
store <128 x i8> %1, <128 x i8>* @global_vec, !dbg !14
ret void, !dbg !15
Expand Down
2 changes: 1 addition & 1 deletion IGC/VectorCompiler/test/Legalization/debug-gstore.ll
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ define void @test_transform(<128 x i8>* %a) !dbg !6 {
entry:
%0 = load <128 x i8>, <128 x i8>* %a, !dbg !12
call void @llvm.dbg.value(metadata <128 x i8> %0, metadata !9, metadata !DIExpression()), !dbg !12
%1 = call <128 x i8> @llvm.genx.rdregioni.v128i8.v128i8.i16(<128 x i8> %0, i32 1, i32 1, i32 0, i16 16, i32 0), !dbg !13
%1 = call <128 x i8> @llvm.genx.rdregioni.v128i8.v128i8.i16(<128 x i8> %0, i32 1, i32 1, i32 0, i16 0, i32 0), !dbg !13
call void @llvm.dbg.value(metadata <128 x i8> %1, metadata !11, metadata !DIExpression()), !dbg !13
store <128 x i8> %1, <128 x i8>* @global_vec, !dbg !14
ret void, !dbg !15
Expand Down
85 changes: 85 additions & 0 deletions IGC/VectorCompiler/test/Legalization/live-elements.ll
Original file line number Diff line number Diff line change
@@ -0,0 +1,85 @@
;=========================== begin_copyright_notice ============================
;
; Copyright (C) 2023 Intel Corporation
;
; SPDX-License-Identifier: MIT
;
;============================ end_copyright_notice =============================

; RUN: opt %use_old_pass_manager% -GenXLegalization -march=genx64 -mcpu=Gen9 -mtriple=spir64-unknown-unknown -S < %s | FileCheck %s

declare <16 x i32> @llvm.genx.oword.ld.v16i32(i32, i32, i32)

declare <8 x i32> @llvm.genx.rdregioni.v8i32.v16i32.i16(<16 x i32>, i32, i32, i32, i16, i32)

declare void @llvm.genx.oword.st.v8i32(i32, i32, <8 x i32>)

; CHECK-LABEL: @test1
; CHECK: [[LOAD:%[^ ]+]] = tail call <16 x i32> @llvm.genx.oword.ld.v16i32(i32 0, i32 1, i32 0)
; CHECK-NEXT: [[LOAD_SPLIT0:%[^ ]+]] = call <8 x i32> @llvm.genx.rdregioni.v8i32.v16i32.i16(<16 x i32> [[LOAD]], i32 8, i32 8, i32 1, i16 0, i32 undef)
; CHECK-NEXT: [[ADD_SPLIT0:%[^ ]+]] = add <8 x i32> [[LOAD_SPLIT0]], <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
; CHECK-NEXT: [[ADD_SPLIT0_JOIN0:%[^ ]+]] = call <16 x i32> @llvm.genx.wrregioni.v16i32.v8i32.i16.i1(<16 x i32> undef, <8 x i32> [[ADD_SPLIT0]], i32 0, i32 8, i32 1, i16 0, i32 undef, i1 true)
; CHECK-NEXT: [[LOAD_SPLIT8:%[^ ]+]] = call <8 x i32> @llvm.genx.rdregioni.v8i32.v16i32.i16(<16 x i32> [[LOAD]], i32 8, i32 8, i32 1, i16 32, i32 undef)
; CHECK-NEXT: [[ADD_SPLIT8:%[^ ]+]] = add <8 x i32> [[LOAD_SPLIT8]], <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
; CHECK-NEXT: [[ADD_SPLIT8_JOIN8:%[^ ]+]] = call <16 x i32> @llvm.genx.wrregioni.v16i32.v8i32.i16.i1(<16 x i32> [[ADD_SPLIT0_JOIN0]], <8 x i32> [[ADD_SPLIT8]], i32 0, i32 8, i32 1, i16 32, i32 undef, i1 true)
; CHECK-NEXT: [[RDREGION:%[^ ]+]] = tail call <8 x i32> @llvm.genx.rdregioni.v8i32.v16i32.i16(<16 x i32> [[ADD_SPLIT8_JOIN8]], i32 0, i32 8, i32 1, i16 0, i32 undef)
; CHECK-NEXT: tail call void @llvm.genx.oword.st.v8i32(i32 2, i32 0, <8 x i32> [[RDREGION]])
; test1: only a contiguous 8-element region of the 16-wide add is consumed
; (rdregion with width 8, stride 1, offset 0). The FileCheck expectations
; above require the add to be legalized as two <8 x i32> pieces, at offsets
; 0 and 32.
define void @test1() {
%load = tail call <16 x i32> @llvm.genx.oword.ld.v16i32(i32 0, i32 1, i32 0)
%add = add <16 x i32> %load, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
; The only use of %add: 8 consecutive elements starting at offset 0.
%rdregion = tail call <8 x i32> @llvm.genx.rdregioni.v8i32.v16i32.i16(<16 x i32> %add, i32 0, i32 8, i32 1, i16 0, i32 undef)
tail call void @llvm.genx.oword.st.v8i32(i32 2, i32 0, <8 x i32> %rdregion)
ret void
}

; CHECK-LABEL: @test2
; CHECK: [[LOAD:%[^ ]+]] = tail call <16 x i32> @llvm.genx.oword.ld.v16i32(i32 0, i32 1, i32 0)
; CHECK-NEXT: [[LOAD_SPLIT0:%[^ ]+]] = call <4 x i32> @llvm.genx.rdregioni.v4i32.v16i32.i16(<16 x i32> [[LOAD]], i32 4, i32 4, i32 1, i16 0, i32 undef)
; CHECK-NEXT: [[ADD_SPLIT0:%[^ ]+]] = add <4 x i32> [[LOAD_SPLIT0]], <i32 1, i32 1, i32 1, i32 1>
; CHECK-NEXT: [[ADD_SPLIT0_JOIN0:%[^ ]+]] = call <16 x i32> @llvm.genx.wrregioni.v16i32.v4i32.i16.i1(<16 x i32> undef, <4 x i32> [[ADD_SPLIT0]], i32 0, i32 4, i32 1, i16 0, i32 undef, i1 true)
; CHECK-NEXT: [[LOAD_SPLIT4:%[^ ]+]] = call <8 x i32> @llvm.genx.rdregioni.v8i32.v16i32.i16(<16 x i32> [[LOAD]], i32 8, i32 8, i32 1, i16 16, i32 undef)
; CHECK-NEXT: [[ADD_SPLIT4:%[^ ]+]] = add <8 x i32> [[LOAD_SPLIT4]], <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
; CHECK-NEXT: [[ADD_SPLIT4_JOIN4:%[^ ]+]] = call <16 x i32> @llvm.genx.wrregioni.v16i32.v8i32.i16.i1(<16 x i32> [[ADD_SPLIT0_JOIN0]], <8 x i32> [[ADD_SPLIT4]], i32 0, i32 8, i32 1, i16 16, i32 undef, i1 true)
; CHECK-NEXT: [[LOAD_SPLIT12:%[^ ]+]] = call <4 x i32> @llvm.genx.rdregioni.v4i32.v16i32.i16(<16 x i32> [[LOAD]], i32 4, i32 4, i32 1, i16 48, i32 undef)
; CHECK-NEXT: [[ADD_SPLIT12:%[^ ]+]] = add <4 x i32> [[LOAD_SPLIT12]], <i32 1, i32 1, i32 1, i32 1>
; CHECK-NEXT: [[ADD_SPLIT12_JOIN12:%[^ ]+]] = call <16 x i32> @llvm.genx.wrregioni.v16i32.v4i32.i16.i1(<16 x i32> [[ADD_SPLIT4_JOIN4]], <4 x i32> [[ADD_SPLIT12]], i32 0, i32 4, i32 1, i16 48, i32 undef, i1 true)
; CHECK-NEXT: [[RDREGION:%[^ ]+]] = tail call <8 x i32> @llvm.genx.rdregioni.v8i32.v16i32.i16(<16 x i32> [[ADD_SPLIT12_JOIN12]], i32 0, i32 8, i32 1, i16 16, i32 undef)
; CHECK-NEXT: tail call void @llvm.genx.oword.st.v8i32(i32 2, i32 0, <8 x i32> [[RDREGION]])
; test2: same shape as test1, but the consumed 8-element region starts at
; offset 16 (presumably a byte offset, i.e. element 4 of the i32 vector).
; The FileCheck expectations above require a 4 / 8 / 4 split of the add, at
; offsets 0, 16 and 48, so the used middle part is separate from both ends.
define void @test2() {
%load = tail call <16 x i32> @llvm.genx.oword.ld.v16i32(i32 0, i32 1, i32 0)
%add = add <16 x i32> %load, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
; The only use of %add: 8 consecutive elements starting at offset 16.
%rdregion = tail call <8 x i32> @llvm.genx.rdregioni.v8i32.v16i32.i16(<16 x i32> %add, i32 0, i32 8, i32 1, i16 16, i32 undef)
tail call void @llvm.genx.oword.st.v8i32(i32 2, i32 0, <8 x i32> %rdregion)
ret void
}

; CHECK-LABEL: @test3
; CHECK: [[LOAD:%[^ ]+]] = tail call <16 x i32> @llvm.genx.oword.ld.v16i32(i32 0, i32 1, i32 0)
; CHECK-NEXT: [[LOAD_SPLIT0:%[^ ]+]] = call <8 x i32> @llvm.genx.rdregioni.v8i32.v16i32.i16(<16 x i32> [[LOAD]], i32 8, i32 8, i32 1, i16 0, i32 undef)
; CHECK-NEXT: [[ADD_SPLIT0:%[^ ]+]] = add <8 x i32> [[LOAD_SPLIT0]], <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
; CHECK-NEXT: [[ADD_SPLIT0_JOIN0:%[^ ]+]] = call <16 x i32> @llvm.genx.wrregioni.v16i32.v8i32.i16.i1(<16 x i32> undef, <8 x i32> [[ADD_SPLIT0]], i32 0, i32 8, i32 1, i16 0, i32 undef, i1 true)
; CHECK-NEXT: [[LOAD_SPLIT8:%[^ ]+]] = call <8 x i32> @llvm.genx.rdregioni.v8i32.v16i32.i16(<16 x i32> [[LOAD]], i32 8, i32 8, i32 1, i16 32, i32 undef)
; CHECK-NEXT: [[ADD_SPLIT8:%[^ ]+]] = add <8 x i32> [[LOAD_SPLIT8]], <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
; CHECK-NEXT: [[ADD_SPLIT8_JOIN8:%[^ ]+]] = call <16 x i32> @llvm.genx.wrregioni.v16i32.v8i32.i16.i1(<16 x i32> [[ADD_SPLIT0_JOIN0]], <8 x i32> [[ADD_SPLIT8]], i32 0, i32 8, i32 1, i16 32, i32 undef, i1 true)
; CHECK-NEXT: [[RDREGION:%[^ ]+]] = tail call <8 x i32> @llvm.genx.rdregioni.v8i32.v16i32.i16(<16 x i32> [[ADD_SPLIT8_JOIN8]], i32 0, i32 8, i32 1, i16 32, i32 undef)
; CHECK-NEXT: tail call void @llvm.genx.oword.st.v8i32(i32 2, i32 0, <8 x i32> [[RDREGION]])
; test3: the consumed 8-element region starts at offset 32 (presumably a byte
; offset, i.e. element 8), so only the upper part of the add is used. The
; FileCheck expectations above require two <8 x i32> pieces, at offsets 0
; and 32.
define void @test3() {
%load = tail call <16 x i32> @llvm.genx.oword.ld.v16i32(i32 0, i32 1, i32 0)
%add = add <16 x i32> %load, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
; The only use of %add: 8 consecutive elements starting at offset 32.
%rdregion = tail call <8 x i32> @llvm.genx.rdregioni.v8i32.v16i32.i16(<16 x i32> %add, i32 0, i32 8, i32 1, i16 32, i32 undef)
tail call void @llvm.genx.oword.st.v8i32(i32 2, i32 0, <8 x i32> %rdregion)
ret void
}

; CHECK-LABEL: @test4
; CHECK: [[LOAD:%[^ ]+]] = tail call <16 x i32> @llvm.genx.oword.ld.v16i32(i32 0, i32 1, i32 0)
; CHECK-NEXT: [[ADD:%[^ ]+]] = add <16 x i32> [[LOAD]], <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
; CHECK-NEXT: [[RDREGION:%[^ ]+]] = tail call <8 x i32> @llvm.genx.rdregioni.v8i32.v16i32.i16(<16 x i32> [[ADD]], i32 0, i32 8, i32 2, i16 0, i32 undef)
; CHECK-NEXT: tail call void @llvm.genx.oword.st.v8i32(i32 2, i32 0, <8 x i32> [[RDREGION]])
; test4: negative case. The rdregion uses stride 2, so the consumed elements
; of the add are not one contiguous run. The FileCheck expectations above
; require the <16 x i32> add to be left as a single unsplit instruction.
define void @test4() {
%load = tail call <16 x i32> @llvm.genx.oword.ld.v16i32(i32 0, i32 1, i32 0)
%add = add <16 x i32> %load, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
; The only use of %add: 8 elements taken with stride 2 starting at offset 0.
%rdregion = tail call <8 x i32> @llvm.genx.rdregioni.v8i32.v16i32.i16(<16 x i32> %add, i32 0, i32 8, i32 2, i16 0, i32 undef)
tail call void @llvm.genx.oword.st.v8i32(i32 2, i32 0, <8 x i32> %rdregion)
ret void
}

0 comments on commit 6c461d2

Please sign in to comment.