From e982c19f3ab86befcd381d94a2ed549b98615b73 Mon Sep 17 00:00:00 2001 From: "Dmitrichenko, Aleksei" Date: Mon, 16 Dec 2024 20:06:57 +0000 Subject: [PATCH] Add more aggressive late rescheduling Add a more aggressive late rescheduling phase to the CodeLoopSinking pass. This phase runs after loop sinking when sinking alone is not enough, unlike the existing "early" rescheduling, which aims to avoid sinking when possible. Currently this more aggressive sinking puts the instructions in between DPASes in a long DPAS sequence, but tries to split the sequence only at points where the adjacent DPASes don't share any common parameters. This commit also adds an option to disable the max sinking heuristic in the presence of 2d block reads. --- IGC/Compiler/CISACodeGen/CodeSinking.cpp | 266 ++++++++++++------ IGC/Compiler/CISACodeGen/CodeSinking.hpp | 2 +- .../2d-blockload-loopsink-sch-heur.ll | 4 +- IGC/common/igc_flags.h | 7 +- 4 files changed, 191 insertions(+), 88 deletions(-) diff --git a/IGC/Compiler/CISACodeGen/CodeSinking.cpp b/IGC/Compiler/CISACodeGen/CodeSinking.cpp index 8a156699891b..75a1b9fb53cf 100644 --- a/IGC/Compiler/CISACodeGen/CodeSinking.cpp +++ b/IGC/Compiler/CISACodeGen/CodeSinking.cpp @@ -243,6 +243,22 @@ namespace IGC { return std::count_if(llvm::inst_begin(F), llvm::inst_end(F), [](const auto& I){ return !isDbgIntrinsic(&I); }); } + static bool isDPAS(Value *V) + { + GenIntrinsicInst *Intr = dyn_cast(V); + if (!Intr) + return false; + switch (Intr->getIntrinsicID()) + { + case GenISAIntrinsic::GenISA_dpas: + case GenISAIntrinsic::GenISA_sub_group_dpas: + return true; + default: + break; + } + return false; + }; + /// ===================== /// /// Non-loop code sinking /// /// ===================== /// @@ -796,6 +812,7 @@ namespace IGC { // Sink in the loop if loop preheader's potential to sink covers at least 20% of registers delta // between grf number and max estimated pressure in the loop #define LOOPSINK_PREHEADER_IMPACT_THRESHOLD 0.2 +#define 
LOOPSINK_RESCHEDULE_ITERATIONS 5 // Helper functions for loop sink debug dumps #define PrintDump(Level, Contents) if (IGC_IS_FLAG_ENABLED(DumpLoopSink) && (Level <= IGC_GET_FLAG_VALUE(LoopSinkDumpLevel))) {*LogStream << Contents;} @@ -1022,7 +1039,7 @@ namespace IGC { if (is2dBlockRead(&I)) { PrintDump(VerbosityLevel::Low, ">> Loop has 2D block reads. Enabling loads rescheduling and sinking.\n"); - return LoopSinkMode::FullSink; + return IGC_IS_FLAG_ENABLED(LoopSinkForce2dBlockReadsMaxSink) ? LoopSinkMode::FullSink : LoopSinkMode::SinkWhileRegpressureIsHigh; } } } @@ -1265,7 +1282,37 @@ namespace IGC { return LeafInstToCandidate; }; + auto rescheduleCandidates = [&] (BasicBlock *BB, CandidateVec &SinkedCandidates, InstToCandidateMap &CurrentInstToCandidate, const int MaxLocalSchedulingIterations, bool Aggressive = false) + { + bool Changed = false; + + CandidatePtrVec SinkedCandidatesPtrs; + for (auto CI = SinkedCandidates.begin(), CE = SinkedCandidates.end(); CI != CE; CI++) + { + Candidate *C = CI->get(); + if (C->TgtBB == BB) + SinkedCandidatesPtrs.push_back(C); + } + + // Sinking the candidates that don't use other candidates iteratively + // Should end with break, using max number of iterations (MaxLocalSchedulingIterations) just to avoid an infinite loop + for (int i = 0; i < MaxLocalSchedulingIterations; i++) + { + PrintDump(VerbosityLevel::Medium, "Local scheduling iteration " << i << "...\n"); + InstToCandidateMap LeafCurrentInstToCandidate = getLeafInstToCandidateMap(BB, SinkedCandidatesPtrs, CurrentInstToCandidate); + if (LeafCurrentInstToCandidate.empty()) + { + PrintDump(VerbosityLevel::Medium, "No more candidates to schedule in this block.\n"); + break; + } + Changed |= localSink(BB, LeafCurrentInstToCandidate, Aggressive); + } + + return Changed; + }; + bool ReschedulingIteration = IGC_IS_FLAG_ENABLED(LoopSinkEnableLoadsRescheduling); + bool LateReschedulingIteration = false; auto createSimpleCandidates = [&]( InstSet &SkipInstructions, @@ -1362,6 
+1409,8 @@ namespace IGC { InstSet SkipInstructions; + int SinkIterations = 0; + do { CurrentSinkCandidates.clear(); @@ -1430,9 +1479,10 @@ namespace IGC { } } } - else + else if (!LateReschedulingIteration) { PrintDump(VerbosityLevel::Low, "Starting sinking iteration...\n"); + SinkIterations++; for (auto &Pair : InstToCandidate) SkipInstructions.insert(Pair.first); @@ -1446,48 +1496,57 @@ namespace IGC { // Create simple (1-instr) candidates for sinking by traversing the preheader once createSimpleCandidates(SkipInstructions, CurrentSinkCandidates); } - - // Make decisions for "MaybeSink" candidates - CandidateVec ToSink = refineLoopSinkCandidates(CurrentSinkCandidates, LoadChains, L); + else + { + PrintDump(VerbosityLevel::Low, "Late rescheduling iteration...\n"); + } // Sink the beneficial instructions bool IterChanged = false; - for (auto &C : ToSink) + IterChanged |= LateReschedulingIteration; + + if (!LateReschedulingIteration) { - if (C->Worthiness == LoopSinkWorthiness::Sink || C->Worthiness == LoopSinkWorthiness::IntraLoopSink) + // Make decisions for "MaybeSink" candidates + CandidateVec ToSink = refineLoopSinkCandidates(CurrentSinkCandidates, LoadChains, L); + + for (auto &C : ToSink) { - IGC_ASSERT(C->size() > 0); + if (C->Worthiness == LoopSinkWorthiness::Sink || C->Worthiness == LoopSinkWorthiness::IntraLoopSink) + { + IGC_ASSERT(C->size() > 0); - SinkedCandidates.push_back(std::move(C)); - Candidate *SC = SinkedCandidates.back().get(); + SinkedCandidates.push_back(std::move(C)); + Candidate *SC = SinkedCandidates.back().get(); - bool SinkFromPH = SC->Worthiness == LoopSinkWorthiness::Sink; - Instruction *InsertPoint = SinkFromPH ? - &*SC->TgtBB->getFirstInsertionPt() : SC->first()->getNextNode(); + bool SinkFromPH = SC->Worthiness == LoopSinkWorthiness::Sink; + Instruction *InsertPoint = SinkFromPH ? + &*SC->TgtBB->getFirstInsertionPt() : SC->first()->getNextNode(); - for (Instruction *I : *SC) { - PrintDump(VerbosityLevel::Medium, (SinkFromPH ? 
"Sinking instruction:\n" : "Scheduling instruction for local sink:\n")); - PrintInstructionDump(VerbosityLevel::Medium, I); + for (Instruction *I : *SC) { + PrintDump(VerbosityLevel::Medium, (SinkFromPH ? "Sinking instruction:\n" : "Scheduling instruction for local sink:\n")); + PrintInstructionDump(VerbosityLevel::Medium, I); - CurrentInstToCandidate[I] = SC; - InstToCandidate[I] = SC; + CurrentInstToCandidate[I] = SC; + InstToCandidate[I] = SC; - I->moveBefore(InsertPoint); - InsertPoint = I; + I->moveBefore(InsertPoint); + InsertPoint = I; - if (SinkFromPH) - { - if (isAllowedLoad(I) || isLoadChain(I, LoadChains)) - LoadChains.insert(I); + if (SinkFromPH) + { + if (isAllowedLoad(I) || isLoadChain(I, LoadChains)) + LoadChains.insert(I); + } } - } - UndoBlkSet.insert(SC->UndoPos->getParent()); - LocalBlkSet.insert(SC->TgtBB); + UndoBlkSet.insert(SC->UndoPos->getParent()); + LocalBlkSet.insert(SC->TgtBB); - PrintDump(VerbosityLevel::Medium, "\n"); - IterChanged = true; + PrintDump(VerbosityLevel::Medium, "\n"); + IterChanged = true; + } } } @@ -1506,37 +1565,29 @@ namespace IGC { uint SinkedSizeInBytes = RPE->estimateSizeInBytes(InstsSet, *F, SIMD, WI); uint SinkedSizeInRegs = RPE->bytesToRegisters(SinkedSizeInBytes); + if (LateReschedulingIteration) + { + for (auto &C : SinkedCandidates) + { + LocalBlkSet.insert(C->TgtBB); + } + } + // Invoke localSink() to move def to its first use if (LocalBlkSet.size() > 0) { - CandidatePtrVec SinkedCandidatesPtrs; for (auto BI = LocalBlkSet.begin(), BE = LocalBlkSet.end(); BI != BE; BI++) { BasicBlock *BB = *BI; if (ReschedulingIteration) { - SinkedCandidatesPtrs.clear(); - for (auto CI = SinkedCandidates.begin(), CE = SinkedCandidates.end(); CI != CE; CI++) - { - Candidate *C = CI->get(); - if (C->TgtBB == BB) - SinkedCandidatesPtrs.push_back(C); - } - // Sinking the candidates that don't use other candidates iteratively - // Should end with break, setting max number of iterations just to avoid an infinite loop - const int 
MaxLocalSchedulingIterations = 5; - for (int i = 0; i < MaxLocalSchedulingIterations; i++) - { - PrintDump(VerbosityLevel::Medium, "Local scheduling iteration " << i << "...\n"); - InstToCandidateMap LeafCurrentInstToCandidate = getLeafInstToCandidateMap(BB, SinkedCandidatesPtrs, CurrentInstToCandidate); - if (LeafCurrentInstToCandidate.empty()) - { - PrintDump(VerbosityLevel::Medium, "No more candidates to schedule in this block.\n"); - break; - } - localSink(BB, LeafCurrentInstToCandidate); - } + rescheduleCandidates(BB, SinkedCandidates, CurrentInstToCandidate, LOOPSINK_RESCHEDULE_ITERATIONS); + } + else if (LateReschedulingIteration) + { + InstToCandidateMap InstToCandidateCopy = InstToCandidate; + rescheduleCandidates(BB, SinkedCandidates, InstToCandidateCopy, LOOPSINK_RESCHEDULE_ITERATIONS + SinkIterations, true); } else // sinking iteration { @@ -1546,21 +1597,25 @@ namespace IGC { LocalBlkSet.clear(); } - if (MaxLoopPressure - SinkedSizeInRegs > NeededRegpressure) - { - // Heuristic to save recalculation of liveness - // The size of the candidates set is not enough to reach the needed regpressure - PrintDump(VerbosityLevel::Low, "Running one more iteration without recalculating liveness...\n"); - RecomputeMaxLoopPressure = true; - ReschedulingIteration = false; - continue; - } + if (!LateReschedulingIteration) // do one more sinking iteration only if it's a sinking iteration + if (MaxLoopPressure - SinkedSizeInRegs > NeededRegpressure) + { + // Heuristic to save recalculation of liveness + // The size of the candidates set is not enough to reach the needed regpressure + PrintDump(VerbosityLevel::Low, "Running one more iteration without recalculating liveness...\n"); + RecomputeMaxLoopPressure = true; + ReschedulingIteration = false; + continue; + } rerunLiveness(); MaxLoopPressure = getMaxRegCountForLoop(L); RecomputeMaxLoopPressure = false; PrintDump(VerbosityLevel::Low, "New max loop pressure = " << MaxLoopPressure << "\n"); + if 
(LateReschedulingIteration) + break; + if ((MaxLoopPressure < NeededRegpressure) && (Mode == LoopSinkMode::SinkWhileRegpressureIsHigh)) { @@ -1577,13 +1632,19 @@ namespace IGC { } } } - else if (!ReschedulingIteration) + else if (!ReschedulingIteration) // sinking iteration { if (!AllowLoadSinking && IGC_IS_FLAG_ENABLED(EnableLoadsLoopSink)) { PrintDump(VerbosityLevel::Low, "Allowing loads...\n"); AllowLoadSinking = true; } + else if (!AchievedNeededRegpressure && + Mode == LoopSinkMode::SinkWhileRegpressureIsHigh && + IGC_IS_FLAG_ENABLED(LoopSinkEnableLateRescheduling)) + { + LateReschedulingIteration = true; + } else { PrintDump(VerbosityLevel::Low, "Nothing to sink, finished.\n"); @@ -2824,23 +2885,57 @@ namespace IGC { // Sink to the use within basic block bool CodeLoopSinking::localSink( BasicBlock *BB, - InstToCandidateMap &InstToCandidate + InstToCandidateMap &InstToCandidate, + bool Aggressive ) { - auto isPartOfUnsplittableGroup = [](Instruction *Inst) + auto isPartOfUnsplittableGroup = [&](Instruction *Inst) { - if (GenIntrinsicInst *Intr = dyn_cast(Inst)) + auto haveCommonParameter = [](Instruction *Inst, Instruction *PrevInst) { - switch (Intr->getIntrinsicID()) + for (unsigned i = 0; i < Inst->getNumOperands(); ++i) { - case GenISAIntrinsic::GenISA_dpas: - case GenISAIntrinsic::GenISA_sub_group_dpas: - if (IGC_IS_FLAG_ENABLED(LoopSinkAvoidSplittingDPAS)) - return true; - default: - break; + for (unsigned j = 0; j < PrevInst->getNumOperands(); ++j) + { + Instruction *OpI = dyn_cast(Inst->getOperand(i)); + Instruction *OpPI = dyn_cast(PrevInst->getOperand(j)); + if (OpI && OpPI && OpI == OpPI) + return true; + } } + return false; + }; + + if (IGC_IS_FLAG_ENABLED(LoopSinkAvoidSplittingDPAS) && isDPAS(Inst)) + { + if (!Aggressive) + return true; + + // Aggressive local scheduling allows to sink in between DPASes + // But we place only between DPAS instructions that don't have common parameters + // (heuristic) + PrintDump(VerbosityLevel::High, "Checking 
DPAS:\n"); + PrintInstructionDump(VerbosityLevel::High, Inst); + + Instruction *PrevInst = Inst->getPrevNode(); + if (!PrevInst || !isDPAS(PrevInst)) + { + if (PrevInst) + { + PrintDump(VerbosityLevel::High, "Previous instruction is not DPAS:\n"); + PrintInstructionDump(VerbosityLevel::High, PrevInst); + } + return false; + } + + PrintDump(VerbosityLevel::High, "Checking previous DPAS:\n"); + PrintInstructionDump(VerbosityLevel::High, PrevInst); + + bool HCP = haveCommonParameter(Inst, PrevInst); + PrintDump(VerbosityLevel::High, "Have common parameter: " << HCP << "\n"); + return HCP; } + return false; }; @@ -2851,7 +2946,10 @@ namespace IGC { bool BreakAfterGroup = isPartOfUnsplittableGroup(StartInsertPoint); if (!BreakAfterGroup && !isAllowedLoad(InstToMove)) + { + PrintDump(VerbosityLevel::High, "Not part of unsplittable group and not a load. Place immediately.\n"); return StartInsertPoint; + } int Cnt = is2dBlockRead(InstToMove) ? IGC_GET_FLAG_VALUE(CodeSinking2dLoadSchedulingInstr) : IGC_GET_FLAG_VALUE(CodeSinkingLoadSchedulingInstr); @@ -2866,16 +2964,6 @@ namespace IGC { [InstToMove](auto &U) {return llvm::cast(&U) == InstToMove;})) break; - if (isPartOfUnsplittableGroup(I)) - { - BreakAfterGroup = true; - InsertPoint = I; - I = I->getPrevNode(); - continue; - } - else if (BreakAfterGroup) - break; - if (I->mayWriteToMemory()) { // At this point of the program we might have lost some information @@ -2888,11 +2976,21 @@ namespace IGC { } } - if (--Cnt <= 0) - break; - InsertPoint = I; I = I->getPrevNode(); + + if (isPartOfUnsplittableGroup(InsertPoint)) + { + BreakAfterGroup = true; + continue; + } + else + { + if (BreakAfterGroup) + break; + else if (--Cnt <= 0) + break; + } } return InsertPoint; }; diff --git a/IGC/Compiler/CISACodeGen/CodeSinking.hpp b/IGC/Compiler/CISACodeGen/CodeSinking.hpp index 827a7bec537f..1d0cdfea4116 100644 --- a/IGC/Compiler/CISACodeGen/CodeSinking.hpp +++ b/IGC/Compiler/CISACodeGen/CodeSinking.hpp @@ -207,7 +207,7 @@ namespace 
IGC { bool loopSink(llvm::Function& F); bool loopSink(llvm::Loop* LoopWithPressure, LoopSinkMode Mode); - bool localSink(llvm::BasicBlock* BB, InstToCandidateMap& InstToCandidate); + bool localSink(llvm::BasicBlock* BB, InstToCandidateMap& InstToCandidate, bool Aggressive=false); /// candidates creation bool tryCreateShufflePatternCandidates( diff --git a/IGC/Compiler/tests/CodeSinking/LoopSinking/2d-blockload-loopsink-sch-heur.ll b/IGC/Compiler/tests/CodeSinking/LoopSinking/2d-blockload-loopsink-sch-heur.ll index d19b8b66a127..3cdc8cf74809 100644 --- a/IGC/Compiler/tests/CodeSinking/LoopSinking/2d-blockload-loopsink-sch-heur.ll +++ b/IGC/Compiler/tests/CodeSinking/LoopSinking/2d-blockload-loopsink-sch-heur.ll @@ -6,7 +6,7 @@ ; ;============================ end_copyright_notice ============================= ; REQUIRES: regkeys, llvm-14-plus -; RUN: igc_opt --opaque-pointers --regkey CodeSinkingLoadSchedulingInstr=4 --regkey CodeSinking2dLoadSchedulingInstr=3 --regkey LoopSinkMinSave=1 --regkey LoopSinkEnable2dBlockReads=1 --regkey ForceLoopSink=1 --regkey CodeLoopSinkingMinSize=10 --regkey LoopSinkDisableRollback=1 %enable-basic-aa% --igc-code-loop-sinking -S %s 2>&1 | FileCheck %s +; RUN: igc_opt --opaque-pointers --regkey CodeSinkingLoadSchedulingInstr=3 --regkey CodeSinking2dLoadSchedulingInstr=2 --regkey LoopSinkMinSave=1 --regkey LoopSinkEnable2dBlockReads=1 --regkey ForceLoopSink=1 --regkey CodeLoopSinkingMinSize=10 --regkey LoopSinkDisableRollback=1 %enable-basic-aa% --igc-code-loop-sinking -S %s 2>&1 | FileCheck %s define void @sink1(ptr addrspace(1) %in0, ptr addrspace(1) noalias %out0, i32 %count, i32 %offsetIn0, <8 x i32> %r0) { ; CHECK-LABEL: @sink1( @@ -84,3 +84,5 @@ declare <32 x i16> @llvm.genx.GenISA.LSC2DBlockRead.v32i16(i64, i32, i32, i32, i attributes #0 = { argmemonly nounwind readonly willreturn } !igc.functions = !{} + + diff --git a/IGC/common/igc_flags.h b/IGC/common/igc_flags.h index 65d42a8f9b85..44cee53ec075 100644 --- 
a/IGC/common/igc_flags.h +++ b/IGC/common/igc_flags.h @@ -149,8 +149,8 @@ DECLARE_IGC_REGKEY(bool, PrepopulateLoadChainLoopSink, true, "Check the loop fo DECLARE_IGC_REGKEY(bool, EnableLoadChainLoopSink, true, "Allow sinking of load address calculation when the load was sinked to the loop, even if the needed regpressure is achieved (only single use instructions)", false) DECLARE_IGC_REGKEY(DWORD, LoopSinkRegpressureMargin, 10, "Sink into the loop until the pressure becomes less than #grf-margin", false) DECLARE_IGC_REGKEY(DWORD, CodeLoopSinkingMinSize, 100, "Don't sink in the loop if the number of instructions in the kernel is less", false) -DECLARE_IGC_REGKEY(DWORD, CodeSinkingLoadSchedulingInstr, 20, "Instructions number to step to schedule loads in advance before the load use to cover latency. 1 to insert it immediately before use", false) -DECLARE_IGC_REGKEY(DWORD, CodeSinking2dLoadSchedulingInstr, 5, "Instructions number to step to schedule 2d loads in advance before the load use to cover latency. 1 to insert it immediately before use", false) +DECLARE_IGC_REGKEY(DWORD, CodeSinkingLoadSchedulingInstr, 20, "Instructions number to step to schedule loads in advance before the load use to cover latency. 0 to insert it immediately before use", false) +DECLARE_IGC_REGKEY(DWORD, CodeSinking2dLoadSchedulingInstr, 5, "Instructions number to step to schedule 2d loads in advance before the load use to cover latency. 
0 to insert it immediately before use", false) DECLARE_IGC_REGKEY(DWORD, LoopSinkMinSaveUniform, 6, "If loop sink can have save more scalar (uniform) values than this Minimum, do it; otherwise, skip", false) DECLARE_IGC_REGKEY(DWORD, LoopSinkMinSave, 1, "If loop sink can have save more 32-bit values than this Minimum, do it; otherwise, skip", false) DECLARE_IGC_REGKEY(DWORD, LoopSinkThresholdDelta, 30, "Do loop sink If the estimated register pressure is higher than this + #avaialble registers", false) @@ -162,6 +162,9 @@ DECLARE_IGC_REGKEY(bool, LoopSinkEnableVectorShuffle, true, "Allow sinking of DECLARE_IGC_REGKEY(bool, LoopSinkForceRollback, false, "Rollback every loop sinking change (for debug purposes only)", false) DECLARE_IGC_REGKEY(bool, LoopSinkDisableRollback, false, "Disable loopsink rollback completely (even in case of increased regpressure)", false) DECLARE_IGC_REGKEY(bool, LoopSinkAvoidSplittingDPAS, true, "Sink before the whole DPAS sequence if the first use of the sinked instruction is not the first DPAS", false) +DECLARE_IGC_REGKEY(bool, LoopSinkForce2dBlockReadsMaxSink, true, "Sink as much as possible in presence of 2d block loads", false) +DECLARE_IGC_REGKEY(bool, LoopSinkEnableLateRescheduling, false, "Schedule more aggressively in the end if the needed regpressure is still not achieved", false) + DECLARE_IGC_REGKEY(bool, EnableLoopHoistConstant, false, "Enables pass to check for specific loop patterns where variables are constant across all but the last iteration, and hoist them out of the loop.", false) DECLARE_IGC_REGKEY(bool, DisableCodeHoisting, false, "Setting this to 1/true adds a compiler switch to disable code-hoisting", false)