From 53edf4857985d7359c917bc2af6bcbf76b422a0e Mon Sep 17 00:00:00 2001
From: "Shelegov, Maksim" <maksim.shelegov@intel.com>
Date: Tue, 26 Sep 2023 15:10:10 +0000
Subject: [PATCH]  Support allocas outside of entry block

Although allocas in basic blocks other than the entry block can't be
included in the function's prologue, they can be implemented as
standalone accesses to the stack pointer.
---
 .../GenXCodeGen/GenXPrologEpilogInsertion.cpp | 37 ++++++++---
 .../lib/GenXCodeGen/GenXStackUsage.cpp        | 10 ++-
 IGC/VectorCompiler/test/StackCalls/vla.ll     | 62 ++++++++++++++++++-
 .../test/StackUsage/stack_vla.ll              | 19 +++++-
 4 files changed, 110 insertions(+), 18 deletions(-)
diff --git a/IGC/VectorCompiler/lib/GenXCodeGen/GenXPrologEpilogInsertion.cpp b/IGC/VectorCompiler/lib/GenXCodeGen/GenXPrologEpilogInsertion.cpp
index 607f2573b948..fa0d782fcc4b 100644
--- a/IGC/VectorCompiler/lib/GenXCodeGen/GenXPrologEpilogInsertion.cpp
+++ b/IGC/VectorCompiler/lib/GenXCodeGen/GenXPrologEpilogInsertion.cpp
@@ -662,18 +662,41 @@ GenXPrologEpilogInsertion::generateStackCallProlog(Function &F,
 void GenXPrologEpilogInsertion::visitAllocaInst(AllocaInst &AI) {
   IGC_ASSERT(!AI.isUsedWithInAlloca());
   const BasicBlock *Parent = AI.getParent();
-  IGC_ASSERT_MESSAGE(Parent == &Parent->getParent()->front(),
-                     "Allocas outside of entry block are not supported");
-  Allocas.push_back(&AI);
-  if (!isa<ConstantInt>(AI.getArraySize())) {
-    HasVLA = true;
+  if (Parent == &Parent->getParent()->front()) {
+    Allocas.push_back(&AI);
+    if (!isa<ConstantInt>(AI.getArraySize()))
+      HasVLA = true;
+  } else {
+    IRBuilder<> IRB(&AI);
+    unsigned Alignment = getAllocaAlignment(&AI);
+    createBinOpPredefReg(PreDefined_Vars::PREDEFINED_FE_SP, IRB,
+                         Instruction::Add, Alignment - 1);
+    createBinOpPredefReg(PreDefined_Vars::PREDEFINED_FE_SP, IRB,
+                         Instruction::And, ~(Alignment - 1));
+    Value *AllocaSize = nullptr;
+    if (isa<ConstantInt>(AI.getArraySize())) {
+      AllocaSize = IRB.getInt64(
+          divideCeil(*AI.getAllocationSizeInBits(*DL), genx::ByteBits));
+    } else {
+      unsigned ElementSize = llvm::divideCeil(
+          DL->getTypeAllocSizeInBits(AI.getAllocatedType()), genx::ByteBits);
+      AllocaSize =
+          IRB.CreateMul(IRB.getInt64(ElementSize),
+                        IRB.CreateZExt(AI.getOperand(0), IRB.getInt64Ty()));
+    }
+    auto [OrigSP, _] =
+        createBinOpPredefReg(PreDefined_Vars::PREDEFINED_FE_SP, IRB,
+                             Instruction::Add, AllocaSize, true);
+    auto *AllocaAddr = IRB.CreateIntToPtr(OrigSP, AI.getType(), AI.getName());
+    AI.replaceAllUsesWith(AllocaAddr);
+    AI.eraseFromParent();
   }
 }
 
 void GenXPrologEpilogInsertion::emitPrivateMemoryAllocations() {
   LLVM_DEBUG(dbgs() << "In emitPrivateMemoryAllocations\n");
   if (Allocas.empty()) {
-    LLVM_DEBUG(dbgs() << "no alloca instructions in the function\n");
+    LLVM_DEBUG(dbgs() << "no alloca instructions in the entry basic block\n");
     return;
   }
 
@@ -729,7 +752,7 @@ void GenXPrologEpilogInsertion::emitPrivateMemoryAllocations() {
             IRB.CreateAdd(AllocaSize, IRB.getInt64(Alignment - 1)), true);
       }
       createBinOpPredefReg(PreDefined_Vars::PREDEFINED_FE_SP, IRB,
-                           Instruction::And, ~(Alignment - 1), false);
+                           Instruction::And, ~(Alignment - 1));
     } else {
       unsigned AllocaSize =
           llvm::divideCeil(*AI->getAllocationSizeInBits(*DL), genx::ByteBits);
diff --git a/IGC/VectorCompiler/lib/GenXCodeGen/GenXStackUsage.cpp b/IGC/VectorCompiler/lib/GenXCodeGen/GenXStackUsage.cpp
index 39591d5bcdbd..ed9e0a7cb104 100644
--- a/IGC/VectorCompiler/lib/GenXCodeGen/GenXStackUsage.cpp
+++ b/IGC/VectorCompiler/lib/GenXCodeGen/GenXStackUsage.cpp
@@ -105,7 +105,7 @@ class StackAnalysis : public InstVisitor<StackAnalysis> {
     uint64_t m_UsedSz{0};
     alignment_t m_RequiredAlign{0};
     bool m_HasIndirect{false};
-    bool m_HasVLA{false};
+    bool m_HasNonStatic{false};
     Function *m_pHeavyFunction{nullptr};
     ProcessingState m_ProcessingFlag{ProcessingState::NotStarted};
   };
@@ -133,13 +133,11 @@ class StackAnalysis : public InstVisitor<StackAnalysis> {
 // Collect all allocas and updates stack usage of each function
 void StackAnalysis::visitAllocaInst(AllocaInst &AI) {
   IGC_ASSERT(!AI.isUsedWithInAlloca());
-  const BasicBlock *Parent = AI.getParent();
-  IGC_ASSERT_MESSAGE(Parent == &Parent->getParent()->front(), "Allocas outside of entry block are not supported");
 
   auto &CurFuncState = m_ProcessedFs[AI.getFunction()];
 
-  if (!isa<ConstantInt>(AI.getArraySize())) {
-    CurFuncState.m_HasVLA = true;
+  if (!AI.isStaticAlloca()) {
+    CurFuncState.m_HasNonStatic = true;
     return;
   }
 
@@ -179,7 +177,7 @@ StackAnalysis::checkFunction(Function &F) {
 
   // Can't predict stack usage if there are indirect calls
   // or variable length arrays
-  if (StateOfF.m_HasIndirect || StateOfF.m_HasVLA)
+  if (StateOfF.m_HasIndirect || StateOfF.m_HasNonStatic)
     return None;
 
   // if function is stack call, we do not know stack usage
diff --git a/IGC/VectorCompiler/test/StackCalls/vla.ll b/IGC/VectorCompiler/test/StackCalls/vla.ll
index a3336b493a86..d24a3e363727 100644
--- a/IGC/VectorCompiler/test/StackCalls/vla.ll
+++ b/IGC/VectorCompiler/test/StackCalls/vla.ll
@@ -14,8 +14,12 @@ target triple = "spir64-unknown-unknown"
 
 %struct = type { i8, float, i8 }
 
-; CHECK-LABEL: test
-define internal spir_func void @test(i32 %n1, i8 %n2)#0 {
+declare i8* @llvm.stacksave()
+
+declare void @llvm.stackrestore(i8*)
+
+define internal spir_func void @test(i32 %n1, i8 %n2, i32 %n3, i32 %n4)#0 {
+; CHECK-LABEL: entry
 entry:
 ; CHECK:       [[A3_0:[^ ]+]] = zext i8 %n2 to i64
 ; CHECK-NEXT:  [[A3_1:[^ ]+]] = mul i64 1, [[A3_0]]
@@ -66,6 +70,60 @@ entry:
   %a2 = alloca %struct, align 32
 ; CHECK: [[A3:[^ ]+]] = inttoptr i64 [[A3_6]] to i8*
   %a3 = alloca i8, i8 %n2, align 64
+  br label %loop
+
+; CHECK-LABEL: loop
+loop:
+  %i = phi i32 [ 0, %entry ], [ %i.next, %loop ]
+; CHECK: [[STACK:[^ ]+]] = tail call i8* @llvm.stacksave()
+  %stack = tail call i8* @llvm.stacksave()
+  %i.next = add i32 %i, 1
+; CHECK:       [[A4_0:[^ ]+]] = call <1 x i64> @llvm.genx.read.predef.reg.v1i64.i64(i32 10, i64 undef)
+; CHECK-NEXT:  [[A4_1:[^ ]+]] = call i64 @llvm.genx.rdregioni.i64.v1i64.i16(<1 x i64> [[A4_0]], i32 0, i32 1, i32 1, i16 0, i32 undef)
+; CHECK-NEXT:  [[A4_2:[^ ]+]] = add i64 [[A4_1]], 3
+; CHECK-NEXT:  [[A4_3:[^ ]+]] = call i64 @llvm.genx.wrregioni.i64.i64.i16.i1(i64 undef, i64 [[A4_2]], i32 0, i32 1, i32 1, i16 0, i32 undef, i1 true)
+; CHECK-NEXT:  [[A4_4:[^ ]+]] = call i64 @llvm.genx.write.predef.reg.i64.i64(i32 10, i64 [[A4_3]])
+; CHECK-NEXT:  [[A4_5:[^ ]+]] = call <1 x i64> @llvm.genx.read.predef.reg.v1i64.i64(i32 10, i64 undef)
+; CHECK-NEXT:  [[A4_6:[^ ]+]] = call i64 @llvm.genx.rdregioni.i64.v1i64.i16(<1 x i64> [[A4_5]], i32 0, i32 1, i32 1, i16 0, i32 undef)
+; CHECK-NEXT:  [[A4_7:[^ ]+]] = and i64 [[A4_6]], 4294967292
+; CHECK-NEXT:  [[A4_8:[^ ]+]] = call i64 @llvm.genx.wrregioni.i64.i64.i16.i1(i64 undef, i64 [[A4_7]], i32 0, i32 1, i32 1, i16 0, i32 undef, i1 true)
+; CHECK-NEXT:  [[A4_9:[^ ]+]] = call i64 @llvm.genx.write.predef.reg.i64.i64(i32 10, i64 [[A4_8]])
+; CHECK-NEXT: [[A4_10:[^ ]+]] = zext i32 %n4 to i64
+; CHECK-NEXT: [[A4_11:[^ ]+]] = mul i64 4, [[A4_10]]
+; CHECK-NEXT: [[A4_12:[^ ]+]] = call <1 x i64> @llvm.genx.read.predef.reg.v1i64.i64(i32 10, i64 undef)
+; CHECK-NEXT: [[A4_13:[^ ]+]] = call <1 x i64> @llvm.genx.rdregioni.v1i64.v1i64.i16(<1 x i64> [[A4_12]], i32 0, i32 1, i32 1, i16 0, i32 undef)
+; CHECK-NEXT: [[A4_14:[^ ]+]] = call <1 x i64> @llvm.genx.wrregioni.v1i64.v1i64.i16.i1(<1 x i64> undef, <1 x i64> [[A4_13]], i32 0, i32 1, i32 1, i16 0, i32 undef, i1 true)
+; CHECK-NEXT: [[A4_15:[^ ]+]] = call i64 @llvm.genx.rdregioni.i64.v1i64.i16(<1 x i64> [[A4_14]], i32 0, i32 1, i32 1, i16 0, i32 undef)
+; CHECK-NEXT: [[A4_16:[^ ]+]] = add i64 [[A4_15]], [[A4_11]]
+; CHECK-NEXT: [[A4_17:[^ ]+]] = call i64 @llvm.genx.wrregioni.i64.i64.i16.i1(i64 undef, i64 [[A4_16]], i32 0, i32 1, i32 1, i16 0, i32 undef, i1 true)
+; CHECK-NEXT: [[A4_18:[^ ]+]] = call i64 @llvm.genx.write.predef.reg.i64.i64(i32 10, i64 [[A4_17]])
+; CHECK-NEXT: [[A4:[^ ]+]] = inttoptr i64 [[A4_15]] to i32*
+  %a4 = alloca i32, i32 %n4
+; CHECK:       [[A5_0:[^ ]+]] = call <1 x i64> @llvm.genx.read.predef.reg.v1i64.i64(i32 10, i64 undef)
+; CHECK-NEXT:  [[A5_1:[^ ]+]] = call i64 @llvm.genx.rdregioni.i64.v1i64.i16(<1 x i64> [[A5_0]], i32 0, i32 1, i32 1, i16 0, i32 undef)
+; CHECK-NEXT:  [[A5_2:[^ ]+]] = add i64 [[A5_1]], 31
+; CHECK-NEXT:  [[A5_3:[^ ]+]] = call i64 @llvm.genx.wrregioni.i64.i64.i16.i1(i64 undef, i64 [[A5_2]], i32 0, i32 1, i32 1, i16 0, i32 undef, i1 true)
+; CHECK-NEXT:  [[A5_4:[^ ]+]] = call i64 @llvm.genx.write.predef.reg.i64.i64(i32 10, i64 [[A5_3]])
+; CHECK-NEXT:  [[A5_5:[^ ]+]] = call <1 x i64> @llvm.genx.read.predef.reg.v1i64.i64(i32 10, i64 undef)
+; CHECK-NEXT:  [[A5_6:[^ ]+]] = call i64 @llvm.genx.rdregioni.i64.v1i64.i16(<1 x i64> [[A5_5]], i32 0, i32 1, i32 1, i16 0, i32 undef)
+; CHECK-NEXT:  [[A5_7:[^ ]+]] = and i64 [[A5_6]], 4294967264
+; CHECK-NEXT:  [[A5_8:[^ ]+]] = call i64 @llvm.genx.wrregioni.i64.i64.i16.i1(i64 undef, i64 [[A5_7]], i32 0, i32 1, i32 1, i16 0, i32 undef, i1 true)
+; CHECK-NEXT:  [[A5_9:[^ ]+]] = call i64 @llvm.genx.write.predef.reg.i64.i64(i32 10, i64 [[A5_8]])
+; CHECK-NEXT: [[A5_10:[^ ]+]] = call <1 x i64> @llvm.genx.read.predef.reg.v1i64.i64(i32 10, i64 undef)
+; CHECK-NEXT: [[A5_11:[^ ]+]] = call <1 x i64> @llvm.genx.rdregioni.v1i64.v1i64.i16(<1 x i64> [[A5_10]], i32 0, i32 1, i32 1, i16 0, i32 undef)
+; CHECK-NEXT: [[A5_12:[^ ]+]] = call <1 x i64> @llvm.genx.wrregioni.v1i64.v1i64.i16.i1(<1 x i64> undef, <1 x i64> [[A5_11]], i32 0, i32 1, i32 1, i16 0, i32 undef, i1 true)
+; CHECK-NEXT: [[A5_13:[^ ]+]] = call i64 @llvm.genx.rdregioni.i64.v1i64.i16(<1 x i64> [[A5_12]], i32 0, i32 1, i32 1, i16 0, i32 undef)
+; CHECK-NEXT: [[A5_14:[^ ]+]] = add i64 [[A5_13]], 12
+; CHECK-NEXT: [[A5_15:[^ ]+]] = call i64 @llvm.genx.wrregioni.i64.i64.i16.i1(i64 undef, i64 [[A5_14]], i32 0, i32 1, i32 1, i16 0, i32 undef, i1 true)
+; CHECK-NEXT: [[A5_16:[^ ]+]] = call i64 @llvm.genx.write.predef.reg.i64.i64(i32 10, i64 [[A5_15]])
+; CHECK-NEXT: [[A5:[^ ]+]] = inttoptr i64 [[A5_13]] to %struct*
+  %a5 = alloca %struct, align 32
+  %cond = icmp slt i32 %i.next, %n3
+; CHECK: tail call void @llvm.stackrestore(i8* [[STACK]])
+  tail call void @llvm.stackrestore(i8* %stack)
+  br i1 %cond, label %loop, label %exit
+
+exit:
   ret void
 }
 
diff --git a/IGC/VectorCompiler/test/StackUsage/stack_vla.ll b/IGC/VectorCompiler/test/StackUsage/stack_vla.ll
index 53e34e4412bd..da2bbdc5b050 100644
--- a/IGC/VectorCompiler/test/StackUsage/stack_vla.ll
+++ b/IGC/VectorCompiler/test/StackUsage/stack_vla.ll
@@ -12,13 +12,26 @@
 
 target datalayout = "e-p:64:64-i64:64-n8:16:32:64"
 
-define dllexport spir_kernel void @main(i32 %n) #0 {
-  %ptr = alloca i32, i32 %n, align 4
 ; CHECK-NOT: "VC.Stack.Amount"
+
+define dllexport spir_kernel void @test1(i32 %n) #0 {
+  %ptr = alloca i32, i32 %n, align 4
+  ret void
+}
+
+define dllexport spir_kernel void @test2() #0 {
+entry:
+  br label %body
+
+body:
+  %ptr = alloca i32, align 4
   ret void
 }
 
 attributes #0 = { "CMGenxMain" }
 
 !genx.kernel.internal = !{!0}
-!0 = !{void (i32)* @main}
+!0 = !{void (i32)* @test1}
+
+!genx.kernel.internal = !{!1}
+!1 = !{void ()* @test2}