Skip to content

Commit

Permalink
Support allocas outside of entry block
Browse files Browse the repository at this point in the history
Although allocas in basic blocks other than the entry one can't be included in
the function's prologue, they can be implemented as standalone accesses
to the stack pointer.
  • Loading branch information
mshelego authored and igcbot committed Sep 26, 2023
1 parent 49ca28d commit 53edf48
Show file tree
Hide file tree
Showing 4 changed files with 110 additions and 18 deletions.
37 changes: 30 additions & 7 deletions IGC/VectorCompiler/lib/GenXCodeGen/GenXPrologEpilogInsertion.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -662,18 +662,41 @@ GenXPrologEpilogInsertion::generateStackCallProlog(Function &F,
// Handles a single alloca instruction.
//
// * An alloca in the function's entry block is only recorded in Allocas (and
//   HasVLA is set when its array size is not a ConstantInt); it is not
//   rewritten here.
// * An alloca in any other block is lowered in place: the predefined stack
//   pointer (PREDEFINED_FE_SP) is rounded up to the alloca's alignment and
//   then advanced by the allocation size, and every use of the alloca is
//   redirected to an inttoptr of the value bound to OrigSP. The alloca itself
//   is erased afterwards.
//
// NOTE(review): this listing is a diff rendering without +/- markers. The
// IGC_ASSERT_MESSAGE and the unconditional push_back / HasVLA lines near the
// top look like the removed pre-change lines that the if/else below replaces
// (which is also why the braces do not balance as shown) — confirm against
// the repository before relying on them.
void GenXPrologEpilogInsertion::visitAllocaInst(AllocaInst &AI) {
IGC_ASSERT(!AI.isUsedWithInAlloca());
const BasicBlock *Parent = AI.getParent();
// Parent->getParent()->front() is the first (entry) basic block of the
// function that contains the alloca.
IGC_ASSERT_MESSAGE(Parent == &Parent->getParent()->front(),
"Allocas outside of entry block are not supported");
Allocas.push_back(&AI);
if (!isa<ConstantInt>(AI.getArraySize())) {
HasVLA = true;
if (Parent == &Parent->getParent()->front()) {
// Entry-block alloca: just collect it; a non-constant array size marks the
// function as having a variable-length array.
Allocas.push_back(&AI);
if (!isa<ConstantInt>(AI.getArraySize()))
HasVLA = true;
} else {
// Non-entry-block alloca: emit the stack pointer updates immediately before
// the alloca instruction.
IRBuilder<> IRB(&AI);
unsigned Alignment = getAllocaAlignment(&AI);
// Align SP upwards: SP = (SP + Alignment - 1) & ~(Alignment - 1).
// NOTE(review): Alignment is a 32-bit unsigned, so ~(Alignment - 1) is a
// 32-bit mask; if the helper zero-extends it to the 64-bit SP width, the And
// also clears the upper 32 bits of SP — confirm this is intended.
createBinOpPredefReg(PreDefined_Vars::PREDEFINED_FE_SP, IRB,
Instruction::Add, Alignment - 1);
createBinOpPredefReg(PreDefined_Vars::PREDEFINED_FE_SP, IRB,
Instruction::And, ~(Alignment - 1));
// Allocation size in bytes as an i64 value.
Value *AllocaSize = nullptr;
if (isa<ConstantInt>(AI.getArraySize())) {
// Constant array size: the whole allocation size is known at compile time.
AllocaSize = IRB.getInt64(
divideCeil(*AI.getAllocationSizeInBits(*DL), genx::ByteBits));
} else {
// Dynamic array size: element size in bytes times the zero-extended
// operand 0 of the alloca, computed at run time.
unsigned ElementSize = llvm::divideCeil(
DL->getTypeAllocSizeInBits(AI.getAllocatedType()), genx::ByteBits);
AllocaSize =
IRB.CreateMul(IRB.getInt64(ElementSize),
IRB.CreateZExt(AI.getOperand(0), IRB.getInt64Ty()));
}
// Advance SP by the allocation size. OrigSP is presumably the SP value from
// before the addition, i.e. the start address of the new allocation — the
// meaning of the trailing `true` argument is defined by createBinOpPredefReg,
// which is not visible here.
auto [OrigSP, _] =
createBinOpPredefReg(PreDefined_Vars::PREDEFINED_FE_SP, IRB,
Instruction::Add, AllocaSize, true);
// Turn the integer address into a pointer of the alloca's type (keeping the
// alloca's name) and substitute it for the alloca.
auto *AllocaAddr = IRB.CreateIntToPtr(OrigSP, AI.getType(), AI.getName());
AI.replaceAllUsesWith(AllocaAddr);
AI.eraseFromParent();
}
}

void GenXPrologEpilogInsertion::emitPrivateMemoryAllocations() {
LLVM_DEBUG(dbgs() << "In emitPrivateMemoryAllocations\n");
if (Allocas.empty()) {
LLVM_DEBUG(dbgs() << "no alloca instructions in the function\n");
LLVM_DEBUG(dbgs() << "no alloca instructions in the entry basic block\n");
return;
}

Expand Down Expand Up @@ -729,7 +752,7 @@ void GenXPrologEpilogInsertion::emitPrivateMemoryAllocations() {
IRB.CreateAdd(AllocaSize, IRB.getInt64(Alignment - 1)), true);
}
createBinOpPredefReg(PreDefined_Vars::PREDEFINED_FE_SP, IRB,
Instruction::And, ~(Alignment - 1), false);
Instruction::And, ~(Alignment - 1));
} else {
unsigned AllocaSize =
llvm::divideCeil(*AI->getAllocationSizeInBits(*DL), genx::ByteBits);
Expand Down
10 changes: 4 additions & 6 deletions IGC/VectorCompiler/lib/GenXCodeGen/GenXStackUsage.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -105,7 +105,7 @@ class StackAnalysis : public InstVisitor<StackAnalysis> {
uint64_t m_UsedSz{0};
alignment_t m_RequiredAlign{0};
bool m_HasIndirect{false};
bool m_HasVLA{false};
bool m_HasNonStatic{false};
Function *m_pHeavyFunction{nullptr};
ProcessingState m_ProcessingFlag{ProcessingState::NotStarted};
};
Expand Down Expand Up @@ -133,13 +133,11 @@ class StackAnalysis : public InstVisitor<StackAnalysis> {
// Collects all allocas and updates the stack usage of each function
void StackAnalysis::visitAllocaInst(AllocaInst &AI) {
IGC_ASSERT(!AI.isUsedWithInAlloca());
const BasicBlock *Parent = AI.getParent();
IGC_ASSERT_MESSAGE(Parent == &Parent->getParent()->front(), "Allocas outside of entry block are not supported");

auto &CurFuncState = m_ProcessedFs[AI.getFunction()];

if (!isa<ConstantInt>(AI.getArraySize())) {
CurFuncState.m_HasVLA = true;
if (!AI.isStaticAlloca()) {
CurFuncState.m_HasNonStatic = true;
return;
}

Expand Down Expand Up @@ -179,7 +177,7 @@ StackAnalysis::checkFunction(Function &F) {

// Can't predict stack usage if there are indirect calls
// or non-static allocas (variable length arrays or allocas
// outside of the entry block)
if (StateOfF.m_HasIndirect || StateOfF.m_HasVLA)
if (StateOfF.m_HasIndirect || StateOfF.m_HasNonStatic)
return None;

// if function is stack call, we do not know stack usage
Expand Down
62 changes: 60 additions & 2 deletions IGC/VectorCompiler/test/StackCalls/vla.ll
Original file line number Diff line number Diff line change
Expand Up @@ -14,8 +14,12 @@ target triple = "spir64-unknown-unknown"

%struct = type { i8, float, i8 }

; CHECK-LABEL: test
define internal spir_func void @test(i32 %n1, i8 %n2)#0 {
declare i8* @llvm.stacksave()

declare void @llvm.stackrestore(i8*)

define internal spir_func void @test(i32 %n1, i8 %n2, i32 %n3, i32 %n4)#0 {
; CHECK-LABEL: entry
entry:
; CHECK: [[A3_0:[^ ]+]] = zext i8 %n2 to i64
; CHECK-NEXT: [[A3_1:[^ ]+]] = mul i64 1, [[A3_0]]
Expand Down Expand Up @@ -66,6 +70,60 @@ entry:
%a2 = alloca %struct, align 32
; CHECK: [[A3:[^ ]+]] = inttoptr i64 [[A3_6]] to i8*
%a3 = alloca i8, i8 %n2, align 64
br label %loop

; CHECK-LABEL: loop
loop:
%i = phi i32 [ 0, %entry ], [ %i.next, %loop ]
; CHECK: [[STACK:[^ ]+]] = tail call i8* @llvm.stacksave()
%stack = tail call i8* @llvm.stacksave()
%i.next = add i32 %i, 1
; CHECK: [[A4_0:[^ ]+]] = call <1 x i64> @llvm.genx.read.predef.reg.v1i64.i64(i32 10, i64 undef)
; CHECK-NEXT: [[A4_1:[^ ]+]] = call i64 @llvm.genx.rdregioni.i64.v1i64.i16(<1 x i64> [[A4_0]], i32 0, i32 1, i32 1, i16 0, i32 undef)
; CHECK-NEXT: [[A4_2:[^ ]+]] = add i64 [[A4_1]], 3
; CHECK-NEXT: [[A4_3:[^ ]+]] = call i64 @llvm.genx.wrregioni.i64.i64.i16.i1(i64 undef, i64 [[A4_2]], i32 0, i32 1, i32 1, i16 0, i32 undef, i1 true)
; CHECK-NEXT: [[A4_4:[^ ]+]] = call i64 @llvm.genx.write.predef.reg.i64.i64(i32 10, i64 [[A4_3]])
; CHECK-NEXT: [[A4_5:[^ ]+]] = call <1 x i64> @llvm.genx.read.predef.reg.v1i64.i64(i32 10, i64 undef)
; CHECK-NEXT: [[A4_6:[^ ]+]] = call i64 @llvm.genx.rdregioni.i64.v1i64.i16(<1 x i64> [[A4_5]], i32 0, i32 1, i32 1, i16 0, i32 undef)
; CHECK-NEXT: [[A4_7:[^ ]+]] = and i64 [[A4_6]], 4294967292
; CHECK-NEXT: [[A4_8:[^ ]+]] = call i64 @llvm.genx.wrregioni.i64.i64.i16.i1(i64 undef, i64 [[A4_7]], i32 0, i32 1, i32 1, i16 0, i32 undef, i1 true)
; CHECK-NEXT: [[A4_9:[^ ]+]] = call i64 @llvm.genx.write.predef.reg.i64.i64(i32 10, i64 [[A4_8]])
; CHECK-NEXT: [[A4_10:[^ ]+]] = zext i32 %n4 to i64
; CHECK-NEXT: [[A4_11:[^ ]+]] = mul i64 4, [[A4_10]]
; CHECK-NEXT: [[A4_12:[^ ]+]] = call <1 x i64> @llvm.genx.read.predef.reg.v1i64.i64(i32 10, i64 undef)
; CHECK-NEXT: [[A4_13:[^ ]+]] = call <1 x i64> @llvm.genx.rdregioni.v1i64.v1i64.i16(<1 x i64> [[A4_12]], i32 0, i32 1, i32 1, i16 0, i32 undef)
; CHECK-NEXT: [[A4_14:[^ ]+]] = call <1 x i64> @llvm.genx.wrregioni.v1i64.v1i64.i16.i1(<1 x i64> undef, <1 x i64> [[A4_13]], i32 0, i32 1, i32 1, i16 0, i32 undef, i1 true)
; CHECK-NEXT: [[A4_15:[^ ]+]] = call i64 @llvm.genx.rdregioni.i64.v1i64.i16(<1 x i64> [[A4_14]], i32 0, i32 1, i32 1, i16 0, i32 undef)
; CHECK-NEXT: [[A4_16:[^ ]+]] = add i64 [[A4_15]], [[A4_11]]
; CHECK-NEXT: [[A4_17:[^ ]+]] = call i64 @llvm.genx.wrregioni.i64.i64.i16.i1(i64 undef, i64 [[A4_16]], i32 0, i32 1, i32 1, i16 0, i32 undef, i1 true)
; CHECK-NEXT: [[A4_18:[^ ]+]] = call i64 @llvm.genx.write.predef.reg.i64.i64(i32 10, i64 [[A4_17]])
; CHECK-NEXT: [[A4:[^ ]+]] = inttoptr i64 [[A4_15]] to i32*
%a4 = alloca i32, i32 %n4
; CHECK: [[A5_0:[^ ]+]] = call <1 x i64> @llvm.genx.read.predef.reg.v1i64.i64(i32 10, i64 undef)
; CHECK-NEXT: [[A5_1:[^ ]+]] = call i64 @llvm.genx.rdregioni.i64.v1i64.i16(<1 x i64> [[A5_0]], i32 0, i32 1, i32 1, i16 0, i32 undef)
; CHECK-NEXT: [[A5_2:[^ ]+]] = add i64 [[A5_1]], 31
; CHECK-NEXT: [[A5_3:[^ ]+]] = call i64 @llvm.genx.wrregioni.i64.i64.i16.i1(i64 undef, i64 [[A5_2]], i32 0, i32 1, i32 1, i16 0, i32 undef, i1 true)
; CHECK-NEXT: [[A5_4:[^ ]+]] = call i64 @llvm.genx.write.predef.reg.i64.i64(i32 10, i64 [[A5_3]])
; CHECK-NEXT: [[A5_5:[^ ]+]] = call <1 x i64> @llvm.genx.read.predef.reg.v1i64.i64(i32 10, i64 undef)
; CHECK-NEXT: [[A5_6:[^ ]+]] = call i64 @llvm.genx.rdregioni.i64.v1i64.i16(<1 x i64> [[A5_5]], i32 0, i32 1, i32 1, i16 0, i32 undef)
; CHECK-NEXT: [[A5_7:[^ ]+]] = and i64 [[A5_6]], 4294967264
; CHECK-NEXT: [[A5_8:[^ ]+]] = call i64 @llvm.genx.wrregioni.i64.i64.i16.i1(i64 undef, i64 [[A5_7]], i32 0, i32 1, i32 1, i16 0, i32 undef, i1 true)
; CHECK-NEXT: [[A5_9:[^ ]+]] = call i64 @llvm.genx.write.predef.reg.i64.i64(i32 10, i64 [[A5_8]])
; CHECK-NEXT: [[A5_10:[^ ]+]] = call <1 x i64> @llvm.genx.read.predef.reg.v1i64.i64(i32 10, i64 undef)
; CHECK-NEXT: [[A5_11:[^ ]+]] = call <1 x i64> @llvm.genx.rdregioni.v1i64.v1i64.i16(<1 x i64> [[A5_10]], i32 0, i32 1, i32 1, i16 0, i32 undef)
; CHECK-NEXT: [[A5_12:[^ ]+]] = call <1 x i64> @llvm.genx.wrregioni.v1i64.v1i64.i16.i1(<1 x i64> undef, <1 x i64> [[A5_11]], i32 0, i32 1, i32 1, i16 0, i32 undef, i1 true)
; CHECK-NEXT: [[A5_13:[^ ]+]] = call i64 @llvm.genx.rdregioni.i64.v1i64.i16(<1 x i64> [[A5_12]], i32 0, i32 1, i32 1, i16 0, i32 undef)
; CHECK-NEXT: [[A5_14:[^ ]+]] = add i64 [[A5_13]], 12
; CHECK-NEXT: [[A5_15:[^ ]+]] = call i64 @llvm.genx.wrregioni.i64.i64.i16.i1(i64 undef, i64 [[A5_14]], i32 0, i32 1, i32 1, i16 0, i32 undef, i1 true)
; CHECK-NEXT: [[A5_16:[^ ]+]] = call i64 @llvm.genx.write.predef.reg.i64.i64(i32 10, i64 [[A5_15]])
; CHECK-NEXT: [[A5:[^ ]+]] = inttoptr i64 [[A5_13]] to %struct*
%a5 = alloca %struct, align 32
%cond = icmp slt i32 %i.next, %n3
; CHECK: tail call void @llvm.stackrestore(i8* [[STACK]])
tail call void @llvm.stackrestore(i8* %stack)
br i1 %cond, label %loop, label %exit

exit:
ret void
}

Expand Down
19 changes: 16 additions & 3 deletions IGC/VectorCompiler/test/StackUsage/stack_vla.ll
Original file line number Diff line number Diff line change
Expand Up @@ -12,13 +12,26 @@

target datalayout = "e-p:64:64-i64:64-n8:16:32:64"

define dllexport spir_kernel void @main(i32 %n) #0 {
%ptr = alloca i32, i32 %n, align 4
; CHECK-NOT: "VC.Stack.Amount"

define dllexport spir_kernel void @test1(i32 %n) #0 {
%ptr = alloca i32, i32 %n, align 4
ret void
}

; test2: a constant-size alloca that is placed outside of the entry block.
; Together with test1 (dynamic array size) it covers the second kind of
; alloca for which this test expects no stack amount to be reported.
define dllexport spir_kernel void @test2() #0 {
entry:
br label %body

; The alloca below lives in %body rather than in %entry.
body:
%ptr = alloca i32, align 4
ret void
}

attributes #0 = { "CMGenxMain" }

!genx.kernel.internal = !{!0}
!0 = !{void (i32)* @main}
!0 = !{void (i32)* @test1}

!genx.kernel.internal = !{!1}
!1 = !{void ()* @test2}

0 comments on commit 53edf48

Please sign in to comment.