From 53edf4857985d7359c917bc2af6bcbf76b422a0e Mon Sep 17 00:00:00 2001 From: "Shelegov, Maksim" Date: Tue, 26 Sep 2023 15:10:10 +0000 Subject: [PATCH] Support allocas outside of entry block Although allocas in basic blocks other than the entry one can't be included in the function's prologue, they can be implemented as standalone accesses to the stack pointer --- .../GenXCodeGen/GenXPrologEpilogInsertion.cpp | 37 ++++++++--- .../lib/GenXCodeGen/GenXStackUsage.cpp | 10 ++- IGC/VectorCompiler/test/StackCalls/vla.ll | 62 ++++++++++++++++++- .../test/StackUsage/stack_vla.ll | 19 +++++- 4 files changed, 110 insertions(+), 18 deletions(-) diff --git a/IGC/VectorCompiler/lib/GenXCodeGen/GenXPrologEpilogInsertion.cpp b/IGC/VectorCompiler/lib/GenXCodeGen/GenXPrologEpilogInsertion.cpp index 607f2573b948..fa0d782fcc4b 100644 --- a/IGC/VectorCompiler/lib/GenXCodeGen/GenXPrologEpilogInsertion.cpp +++ b/IGC/VectorCompiler/lib/GenXCodeGen/GenXPrologEpilogInsertion.cpp @@ -662,18 +662,41 @@ GenXPrologEpilogInsertion::generateStackCallProlog(Function &F, void GenXPrologEpilogInsertion::visitAllocaInst(AllocaInst &AI) { IGC_ASSERT(!AI.isUsedWithInAlloca()); const BasicBlock *Parent = AI.getParent(); - IGC_ASSERT_MESSAGE(Parent == &Parent->getParent()->front(), - "Allocas outside of entry block are not supported"); - Allocas.push_back(&AI); - if (!isa(AI.getArraySize())) { - HasVLA = true; + if (Parent == &Parent->getParent()->front()) { + Allocas.push_back(&AI); + if (!isa(AI.getArraySize())) + HasVLA = true; + } else { + IRBuilder<> IRB(&AI); + unsigned Alignment = getAllocaAlignment(&AI); + createBinOpPredefReg(PreDefined_Vars::PREDEFINED_FE_SP, IRB, + Instruction::Add, Alignment - 1); + createBinOpPredefReg(PreDefined_Vars::PREDEFINED_FE_SP, IRB, + Instruction::And, ~(Alignment - 1)); + Value *AllocaSize = nullptr; + if (isa(AI.getArraySize())) { + AllocaSize = IRB.getInt64( + divideCeil(*AI.getAllocationSizeInBits(*DL), genx::ByteBits)); + } else { + unsigned ElementSize = 
llvm::divideCeil( + DL->getTypeAllocSizeInBits(AI.getAllocatedType()), genx::ByteBits); + AllocaSize = + IRB.CreateMul(IRB.getInt64(ElementSize), + IRB.CreateZExt(AI.getOperand(0), IRB.getInt64Ty())); + } + auto [OrigSP, _] = + createBinOpPredefReg(PreDefined_Vars::PREDEFINED_FE_SP, IRB, + Instruction::Add, AllocaSize, true); + auto *AllocaAddr = IRB.CreateIntToPtr(OrigSP, AI.getType(), AI.getName()); + AI.replaceAllUsesWith(AllocaAddr); + AI.eraseFromParent(); } } void GenXPrologEpilogInsertion::emitPrivateMemoryAllocations() { LLVM_DEBUG(dbgs() << "In emitPrivateMemoryAllocations\n"); if (Allocas.empty()) { - LLVM_DEBUG(dbgs() << "no alloca instructions in the function\n"); + LLVM_DEBUG(dbgs() << "no alloca instructions in the entry basic block\n"); return; } @@ -729,7 +752,7 @@ void GenXPrologEpilogInsertion::emitPrivateMemoryAllocations() { IRB.CreateAdd(AllocaSize, IRB.getInt64(Alignment - 1)), true); } createBinOpPredefReg(PreDefined_Vars::PREDEFINED_FE_SP, IRB, - Instruction::And, ~(Alignment - 1), false); + Instruction::And, ~(Alignment - 1)); } else { unsigned AllocaSize = llvm::divideCeil(*AI->getAllocationSizeInBits(*DL), genx::ByteBits); diff --git a/IGC/VectorCompiler/lib/GenXCodeGen/GenXStackUsage.cpp b/IGC/VectorCompiler/lib/GenXCodeGen/GenXStackUsage.cpp index 39591d5bcdbd..ed9e0a7cb104 100644 --- a/IGC/VectorCompiler/lib/GenXCodeGen/GenXStackUsage.cpp +++ b/IGC/VectorCompiler/lib/GenXCodeGen/GenXStackUsage.cpp @@ -105,7 +105,7 @@ class StackAnalysis : public InstVisitor { uint64_t m_UsedSz{0}; alignment_t m_RequiredAlign{0}; bool m_HasIndirect{false}; - bool m_HasVLA{false}; + bool m_HasNonStatic{false}; Function *m_pHeavyFunction{nullptr}; ProcessingState m_ProcessingFlag{ProcessingState::NotStarted}; }; @@ -133,13 +133,11 @@ class StackAnalysis : public InstVisitor { // Collect all allocas and updates stack usage of each function void StackAnalysis::visitAllocaInst(AllocaInst &AI) { IGC_ASSERT(!AI.isUsedWithInAlloca()); - const BasicBlock *Parent 
= AI.getParent(); - IGC_ASSERT_MESSAGE(Parent == &Parent->getParent()->front(), "Allocas outside of entry block are not supported"); auto &CurFuncState = m_ProcessedFs[AI.getFunction()]; - if (!isa(AI.getArraySize())) { - CurFuncState.m_HasVLA = true; + if (!AI.isStaticAlloca()) { + CurFuncState.m_HasNonStatic = true; return; } @@ -179,7 +177,7 @@ StackAnalysis::checkFunction(Function &F) { // Can't predict stack usage if there are indirect calls // or variable length arrays - if (StateOfF.m_HasIndirect || StateOfF.m_HasVLA) + if (StateOfF.m_HasIndirect || StateOfF.m_HasNonStatic) return None; // if function is stack call, we do not know stack usage diff --git a/IGC/VectorCompiler/test/StackCalls/vla.ll b/IGC/VectorCompiler/test/StackCalls/vla.ll index a3336b493a86..d24a3e363727 100644 --- a/IGC/VectorCompiler/test/StackCalls/vla.ll +++ b/IGC/VectorCompiler/test/StackCalls/vla.ll @@ -14,8 +14,12 @@ target triple = "spir64-unknown-unknown" %struct = type { i8, float, i8 } -; CHECK-LABEL: test -define internal spir_func void @test(i32 %n1, i8 %n2)#0 { +declare i8* @llvm.stacksave() + +declare void @llvm.stackrestore(i8*) + +define internal spir_func void @test(i32 %n1, i8 %n2, i32 %n3, i32 %n4)#0 { +; CHECK-LABEL: entry entry: ; CHECK: [[A3_0:[^ ]+]] = zext i8 %n2 to i64 ; CHECK-NEXT: [[A3_1:[^ ]+]] = mul i64 1, [[A3_0]] @@ -66,6 +70,60 @@ entry: %a2 = alloca %struct, align 32 ; CHECK: [[A3:[^ ]+]] = inttoptr i64 [[A3_6]] to i8* %a3 = alloca i8, i8 %n2, align 64 + br label %loop + +; CHECK-LABEL: loop +loop: + %i = phi i32 [ 0, %entry ], [ %i.next, %loop ] +; CHECK: [[STACK:[^ ]+]] = tail call i8* @llvm.stacksave() + %stack = tail call i8* @llvm.stacksave() + %i.next = add i32 %i, 1 +; CHECK: [[A4_0:[^ ]+]] = call <1 x i64> @llvm.genx.read.predef.reg.v1i64.i64(i32 10, i64 undef) +; CHECK-NEXT: [[A4_1:[^ ]+]] = call i64 @llvm.genx.rdregioni.i64.v1i64.i16(<1 x i64> [[A4_0]], i32 0, i32 1, i32 1, i16 0, i32 undef) +; CHECK-NEXT: [[A4_2:[^ ]+]] = add i64 [[A4_1]], 3 +; 
CHECK-NEXT: [[A4_3:[^ ]+]] = call i64 @llvm.genx.wrregioni.i64.i64.i16.i1(i64 undef, i64 [[A4_2]], i32 0, i32 1, i32 1, i16 0, i32 undef, i1 true) +; CHECK-NEXT: [[A4_4:[^ ]+]] = call i64 @llvm.genx.write.predef.reg.i64.i64(i32 10, i64 [[A4_3]]) +; CHECK-NEXT: [[A4_5:[^ ]+]] = call <1 x i64> @llvm.genx.read.predef.reg.v1i64.i64(i32 10, i64 undef) +; CHECK-NEXT: [[A4_6:[^ ]+]] = call i64 @llvm.genx.rdregioni.i64.v1i64.i16(<1 x i64> [[A4_5]], i32 0, i32 1, i32 1, i16 0, i32 undef) +; CHECK-NEXT: [[A4_7:[^ ]+]] = and i64 [[A4_6]], 4294967292 +; CHECK-NEXT: [[A4_8:[^ ]+]] = call i64 @llvm.genx.wrregioni.i64.i64.i16.i1(i64 undef, i64 [[A4_7]], i32 0, i32 1, i32 1, i16 0, i32 undef, i1 true) +; CHECK-NEXT: [[A4_9:[^ ]+]] = call i64 @llvm.genx.write.predef.reg.i64.i64(i32 10, i64 [[A4_8]]) +; CHECK-NEXT: [[A4_10:[^ ]+]] = zext i32 %n4 to i64 +; CHECK-NEXT: [[A4_11:[^ ]+]] = mul i64 4, [[A4_10]] +; CHECK-NEXT: [[A4_12:[^ ]+]] = call <1 x i64> @llvm.genx.read.predef.reg.v1i64.i64(i32 10, i64 undef) +; CHECK-NEXT: [[A4_13:[^ ]+]] = call <1 x i64> @llvm.genx.rdregioni.v1i64.v1i64.i16(<1 x i64> [[A4_12]], i32 0, i32 1, i32 1, i16 0, i32 undef) +; CHECK-NEXT: [[A4_14:[^ ]+]] = call <1 x i64> @llvm.genx.wrregioni.v1i64.v1i64.i16.i1(<1 x i64> undef, <1 x i64> [[A4_13]], i32 0, i32 1, i32 1, i16 0, i32 undef, i1 true) +; CHECK-NEXT: [[A4_15:[^ ]+]] = call i64 @llvm.genx.rdregioni.i64.v1i64.i16(<1 x i64> [[A4_14]], i32 0, i32 1, i32 1, i16 0, i32 undef) +; CHECK-NEXT: [[A4_16:[^ ]+]] = add i64 [[A4_15]], [[A4_11]] +; CHECK-NEXT: [[A4_17:[^ ]+]] = call i64 @llvm.genx.wrregioni.i64.i64.i16.i1(i64 undef, i64 [[A4_16]], i32 0, i32 1, i32 1, i16 0, i32 undef, i1 true) +; CHECK-NEXT: [[A4_18:[^ ]+]] = call i64 @llvm.genx.write.predef.reg.i64.i64(i32 10, i64 [[A4_17]]) +; CHECK-NEXT: [[A4:[^ ]+]] = inttoptr i64 [[A4_15]] to i32* + %a4 = alloca i32, i32 %n4 +; CHECK: [[A5_0:[^ ]+]] = call <1 x i64> @llvm.genx.read.predef.reg.v1i64.i64(i32 10, i64 undef) +; CHECK-NEXT: [[A5_1:[^ ]+]] = call 
i64 @llvm.genx.rdregioni.i64.v1i64.i16(<1 x i64> [[A5_0]], i32 0, i32 1, i32 1, i16 0, i32 undef) +; CHECK-NEXT: [[A5_2:[^ ]+]] = add i64 [[A5_1]], 31 +; CHECK-NEXT: [[A5_3:[^ ]+]] = call i64 @llvm.genx.wrregioni.i64.i64.i16.i1(i64 undef, i64 [[A5_2]], i32 0, i32 1, i32 1, i16 0, i32 undef, i1 true) +; CHECK-NEXT: [[A5_4:[^ ]+]] = call i64 @llvm.genx.write.predef.reg.i64.i64(i32 10, i64 [[A5_3]]) +; CHECK-NEXT: [[A5_5:[^ ]+]] = call <1 x i64> @llvm.genx.read.predef.reg.v1i64.i64(i32 10, i64 undef) +; CHECK-NEXT: [[A5_6:[^ ]+]] = call i64 @llvm.genx.rdregioni.i64.v1i64.i16(<1 x i64> [[A5_5]], i32 0, i32 1, i32 1, i16 0, i32 undef) +; CHECK-NEXT: [[A5_7:[^ ]+]] = and i64 [[A5_6]], 4294967264 +; CHECK-NEXT: [[A5_8:[^ ]+]] = call i64 @llvm.genx.wrregioni.i64.i64.i16.i1(i64 undef, i64 [[A5_7]], i32 0, i32 1, i32 1, i16 0, i32 undef, i1 true) +; CHECK-NEXT: [[A5_9:[^ ]+]] = call i64 @llvm.genx.write.predef.reg.i64.i64(i32 10, i64 [[A5_8]]) +; CHECK-NEXT: [[A5_10:[^ ]+]] = call <1 x i64> @llvm.genx.read.predef.reg.v1i64.i64(i32 10, i64 undef) +; CHECK-NEXT: [[A5_11:[^ ]+]] = call <1 x i64> @llvm.genx.rdregioni.v1i64.v1i64.i16(<1 x i64> [[A5_10]], i32 0, i32 1, i32 1, i16 0, i32 undef) +; CHECK-NEXT: [[A5_12:[^ ]+]] = call <1 x i64> @llvm.genx.wrregioni.v1i64.v1i64.i16.i1(<1 x i64> undef, <1 x i64> [[A5_11]], i32 0, i32 1, i32 1, i16 0, i32 undef, i1 true) +; CHECK-NEXT: [[A5_13:[^ ]+]] = call i64 @llvm.genx.rdregioni.i64.v1i64.i16(<1 x i64> [[A5_12]], i32 0, i32 1, i32 1, i16 0, i32 undef) +; CHECK-NEXT: [[A5_14:[^ ]+]] = add i64 [[A5_13]], 12 +; CHECK-NEXT: [[A5_15:[^ ]+]] = call i64 @llvm.genx.wrregioni.i64.i64.i16.i1(i64 undef, i64 [[A5_14]], i32 0, i32 1, i32 1, i16 0, i32 undef, i1 true) +; CHECK-NEXT: [[A5_16:[^ ]+]] = call i64 @llvm.genx.write.predef.reg.i64.i64(i32 10, i64 [[A5_15]]) +; CHECK-NEXT: [[A5:[^ ]+]] = inttoptr i64 [[A5_13]] to %struct* + %a5 = alloca %struct, align 32 + %cond = icmp slt i32 %i.next, %n3 +; CHECK: tail call void @llvm.stackrestore(i8* 
[[STACK]]) + tail call void @llvm.stackrestore(i8* %stack) + br i1 %cond, label %loop, label %exit + +exit: ret void } diff --git a/IGC/VectorCompiler/test/StackUsage/stack_vla.ll b/IGC/VectorCompiler/test/StackUsage/stack_vla.ll index 53e34e4412bd..da2bbdc5b050 100644 --- a/IGC/VectorCompiler/test/StackUsage/stack_vla.ll +++ b/IGC/VectorCompiler/test/StackUsage/stack_vla.ll @@ -12,13 +12,26 @@ target datalayout = "e-p:64:64-i64:64-n8:16:32:64" -define dllexport spir_kernel void @main(i32 %n) #0 { - %ptr = alloca i32, i32 %n, align 4 ; CHECK-NOT: "VC.Stack.Amount" + +define dllexport spir_kernel void @test1(i32 %n) #0 { + %ptr = alloca i32, i32 %n, align 4 + ret void +} + +define dllexport spir_kernel void @test2() #0 { +entry: + br label %body + +body: + %ptr = alloca i32, align 4 ret void } attributes #0 = { "CMGenxMain" } !genx.kernel.internal = !{!0} -!0 = !{void (i32)* @main} +!0 = !{void (i32)* @test1} + +!genx.kernel.internal = !{!1} +!1 = !{void ()* @test2}