From 81f2331ccefe52fc93387421ffa3bb7b7e11e266 Mon Sep 17 00:00:00 2001 From: Bill Nell Date: Fri, 20 Oct 2017 12:11:34 -0700 Subject: [PATCH] [BOLT] Improve ICP for virtual method calls and jump tables using value profiling. Summary: Use value profiling data to remove the method pointer loads from vtables when doing ICP at virtual function and jump table callsites. The basic process is the following: 1. Work backwards from the callsite to find the most recent def of the call register. 2. Work back from the call register def to find the instruction where the vtable is loaded. 3. Find out if there is any value profiling data associated with the vtable load. If so, record all these addresses as potential vtables + method offsets. 4. Since the addresses extracted by #3 will be vtable + method offset, we need to figure out the method offset in order to determine the actual vtable base address. At this point I virtually execute all the instructions that occur between #3 and #2 that touch the method pointer register. The result of this execution should be the method offset. 5. Fetch the actual method address from the appropriate data section containing the vtable using the computed method offset. Make sure that this address maps to an actual function symbol. 6. Try to associate a vtable pointer with each target address in SymTargets. If every target has a vtable, then this is almost certainly a virtual method callsite. 7. Use the vtable address when generating the promoted call code. It's basically the same as regular ICP code except that the compare is against the vtable and not the method pointer. Additionally, the instructions to load up the method are dumped into the cold call block. For jump tables, the basic idea is the same. I use the memory profiling data to find the hottest slots in the jumptable and then use that information to compute the indices of the hottest entries. 
We can then compare the index register to the hot index values and avoid the load from the jump table. Note: I'm assuming the whole call is in a single BB. According to @rafaelauler, this isn't always the case on ARM. This also isn't always the case on X86 either. If there are non-trivial arguments that are passed by value, there could be branches in between the setup and the call. I'm going to leave fixing this until later since it makes things a bit more complicated. I've also fixed a bug where ICP was introducing a conditional tail call. I made sure that SCTC fixes these up afterwards. I have no idea why I made it introduce a CTC in the first place. (cherry picked from FBD6120768) --- bolt/BinaryBasicBlock.h | 8 + bolt/BinaryContext.cpp | 98 +++- bolt/BinaryContext.h | 37 +- bolt/BinaryFunction.cpp | 80 ++- bolt/BinaryFunction.h | 36 +- bolt/DataAggregator.cpp | 4 +- bolt/DataReader.cpp | 3 +- bolt/DataReader.h | 3 + bolt/Passes/BinaryPasses.cpp | 14 + bolt/Passes/FrameAnalysis.cpp | 4 +- bolt/Passes/IndirectCallPromotion.cpp | 768 ++++++++++++++++++++++---- bolt/Passes/IndirectCallPromotion.h | 44 +- bolt/RewriteInstance.cpp | 63 +-- 13 files changed, 1000 insertions(+), 162 deletions(-) diff --git a/bolt/BinaryBasicBlock.h b/bolt/BinaryBasicBlock.h index f42a4ceb241a..49949c9263c0 100644 --- a/bolt/BinaryBasicBlock.h +++ b/bolt/BinaryBasicBlock.h @@ -642,6 +642,14 @@ class BinaryBasicBlock { return Instructions.erase(II); } + /// Erase instructions in the specified range. 
+ template + void eraseInstructions(ItrType Begin, ItrType End) { + while (End > Begin) { + eraseInstruction(*--End); + } + } + /// Erase all instructions void clear() { Instructions.clear(); diff --git a/bolt/BinaryContext.cpp b/bolt/BinaryContext.cpp index c11f96375b09..e4bba784961a 100644 --- a/bolt/BinaryContext.cpp +++ b/bolt/BinaryContext.cpp @@ -36,8 +36,28 @@ PrintDebugInfo("print-debug-info", cl::Hidden, cl::cat(BoltCategory)); +static cl::opt +PrintRelocations("print-relocations", + cl::desc("print relocations when printing functions"), + cl::Hidden, + cl::cat(BoltCategory)); + +static cl::opt +PrintMemData("print-mem-data", + cl::desc("print memory data annotations when printing functions"), + cl::Hidden, + cl::cat(BoltCategory)); + } // namespace opts +namespace llvm { +namespace bolt { +extern void check_error(std::error_code EC, StringRef Message); +} +} + +Triple::ArchType Relocation::Arch; + BinaryContext::~BinaryContext() { } MCObjectWriter *BinaryContext::createObjectWriter(raw_pwrite_stream &OS) { @@ -326,7 +346,9 @@ void BinaryContext::printInstruction(raw_ostream &OS, const MCInst &Instruction, uint64_t Offset, const BinaryFunction* Function, - bool printMCInst) const { + bool PrintMCInst, + bool PrintMemData, + bool PrintRelocations) const { if (MIA->isEHLabel(Instruction)) { OS << " EH_LABEL: " << *MIA->getTargetSymbol(Instruction) << '\n'; return; @@ -392,24 +414,58 @@ void BinaryContext::printInstruction(raw_ostream &OS, } } - auto *MD = Function ? DR.getFuncMemData(Function->getNames()) : nullptr; - if (MD) { - bool DidPrint = false; - for (auto &MI : MD->getMemInfoRange(Offset)) { - OS << (DidPrint ? 
", " : " # Loads: "); - OS << MI.Addr << "/" << MI.Count; - DidPrint = true; + if ((opts::PrintMemData || PrintMemData) && Function) { + const auto *MD = Function->getMemData(); + const auto MemDataOffset = + MIA->tryGetAnnotationAs(Instruction, "MemDataOffset"); + if (MD && MemDataOffset) { + bool DidPrint = false; + for (auto &MI : MD->getMemInfoRange(MemDataOffset.get())) { + OS << (DidPrint ? ", " : " # Loads: "); + OS << MI.Addr << "/" << MI.Count; + DidPrint = true; + } } } + if ((opts::PrintRelocations || PrintRelocations) && Function) { + const auto Size = computeCodeSize(&Instruction, &Instruction + 1); + Function->printRelocations(OS, Offset, Size); + } + OS << "\n"; - if (printMCInst) { + if (PrintMCInst) { Instruction.dump_pretty(OS, InstPrinter.get()); OS << "\n"; } } +ErrorOr> +BinaryContext::getFunctionData(const BinaryFunction &Function) const { + auto Section = Function.getSection(); + assert(Section.getAddress() <= Function.getAddress() && + Section.getAddress() + Section.getSize() + >= Function.getAddress() + Function.getSize() && + "wrong section for function"); + + if (!Section.isText() || Section.isVirtual() || !Section.getSize()) { + return std::make_error_code(std::errc::bad_address); + } + + StringRef SectionContents; + check_error(Section.getContents(SectionContents), + "cannot get section contents"); + + assert(SectionContents.size() == Section.getSize() && + "section size mismatch"); + + // Function offset from the section start. 
+ auto FunctionOffset = Function.getAddress() - Section.getAddress(); + auto *Bytes = reinterpret_cast(SectionContents.data()); + return ArrayRef(Bytes + FunctionOffset, Function.getSize()); +} + ErrorOr BinaryContext::getSectionForAddress(uint64_t Address) const{ auto SI = AllocatableSections.upper_bound(Address); if (SI != AllocatableSections.begin()) { @@ -640,3 +696,27 @@ size_t Relocation::emit(MCStreamer *Streamer) const { } return Size; } + +#define ELF_RELOC(name, value) #name, + +void Relocation::print(raw_ostream &OS) const { + static const char *X86RelocNames[] = { +#include "llvm/Support/ELFRelocs/x86_64.def" + }; + static const char *AArch64RelocNames[] = { +#include "llvm/Support/ELFRelocs/AArch64.def" + }; + if (Arch == Triple::aarch64) + OS << AArch64RelocNames[Type]; + else + OS << X86RelocNames[Type]; + OS << ", 0x" << Twine::utohexstr(Offset); + if (Symbol) { + OS << ", " << Symbol->getName(); + } + if (int64_t(Addend) < 0) + OS << ", -0x" << Twine::utohexstr(-int64_t(Addend)); + else + OS << ", 0x" << Twine::utohexstr(Addend); + OS << ", 0x" << Twine::utohexstr(Value); +} diff --git a/bolt/BinaryContext.h b/bolt/BinaryContext.h index b01aa0236e20..3cc4f1442738 100644 --- a/bolt/BinaryContext.h +++ b/bolt/BinaryContext.h @@ -55,6 +55,7 @@ class DataReader; /// Relocation class. struct Relocation { + static Triple::ArchType Arch; /// for printing, set by BinaryContext ctor. uint64_t Offset; mutable MCSymbol *Symbol; /// mutable to allow modification by emitter. uint64_t Type; @@ -78,6 +79,9 @@ struct Relocation { /// Emit relocation at a current \p Streamer' position. The caller is /// responsible for setting the position correctly. size_t emit(MCStreamer *Streamer) const; + + /// Print a relocation to \p OS. + void print(raw_ostream &OS) const; }; /// Relocation ordering by offset. 
@@ -85,6 +89,11 @@ inline bool operator<(const Relocation &A, const Relocation &B) { return A.Offset < B.Offset; } +inline raw_ostream &operator<<(raw_ostream &OS, const Relocation &Rel) { + Rel.print(OS); + return OS; +} + class BinaryContext { BinaryContext() = delete; @@ -199,7 +208,9 @@ class BinaryContext { MIA(std::move(MIA)), MRI(std::move(MRI)), DisAsm(std::move(DisAsm)), - DR(DR) {} + DR(DR) { + Relocation::Arch = this->TheTriple->getArch(); + } ~BinaryContext(); @@ -215,6 +226,15 @@ class BinaryContext { /// global symbol was registered at the location. MCSymbol *getGlobalSymbolAtAddress(uint64_t Address) const; + /// Find the address of the global symbol with the given \p Name. + /// return an error if no such symbol exists. + ErrorOr getAddressForGlobalSymbol(StringRef Name) const { + auto Itr = GlobalSymbols.find(Name); + if (Itr != GlobalSymbols.end()) + return Itr->second; + return std::make_error_code(std::errc::bad_address); + } + /// Return MCSymbol for the given \p Name or nullptr if no /// global symbol with that name exists. MCSymbol *getGlobalSymbolByName(const std::string &Name) const; @@ -222,6 +242,10 @@ class BinaryContext { /// Print the global symbol table. void printGlobalSymbols(raw_ostream& OS) const; + /// Get the raw bytes for a given function. + ErrorOr> + getFunctionData(const BinaryFunction &Function) const; + /// Return (allocatable) section containing the given \p Address. ErrorOr getSectionForAddress(uint64_t Address) const; @@ -340,7 +364,9 @@ class BinaryContext { const MCInst &Instruction, uint64_t Offset = 0, const BinaryFunction *Function = nullptr, - bool printMCInst = false) const; + bool PrintMCInst = false, + bool PrintMemData = false, + bool PrintRelocations = false) const; /// Print a range of instructions. 
template @@ -349,9 +375,12 @@ class BinaryContext { Itr End, uint64_t Offset = 0, const BinaryFunction *Function = nullptr, - bool printMCInst = false) const { + bool PrintMCInst = false, + bool PrintMemData = false, + bool PrintRelocations = false) const { while (Begin != End) { - printInstruction(OS, *Begin, Offset, Function, printMCInst); + printInstruction(OS, *Begin, Offset, Function, PrintMCInst, + PrintMemData, PrintRelocations); Offset += computeCodeSize(Begin, Begin + 1); ++Begin; } diff --git a/bolt/BinaryFunction.cpp b/bolt/BinaryFunction.cpp index cfac6193d5de..3852f7fd1415 100644 --- a/bolt/BinaryFunction.cpp +++ b/bolt/BinaryFunction.cpp @@ -30,6 +30,7 @@ #include "llvm/Support/Debug.h" #include "llvm/Support/GraphWriter.h" #include "llvm/Support/raw_ostream.h" +#include "llvm/Support/Regex.h" #include #include #include @@ -137,8 +138,16 @@ PrintOnly("print-only", cl::Hidden, cl::cat(BoltCategory)); +static cl::list +PrintOnlyRegex("print-only-regex", + cl::CommaSeparated, + cl::desc("list of function regexes to print"), + cl::value_desc("func1,func2,func3,..."), + cl::Hidden, + cl::cat(BoltCategory)); + bool shouldPrint(const BinaryFunction &Function) { - if (PrintOnly.empty()) + if (PrintOnly.empty() && PrintOnlyRegex.empty()) return true; for (auto &Name : opts::PrintOnly) { @@ -147,6 +156,12 @@ bool shouldPrint(const BinaryFunction &Function) { } } + for (auto &Name : opts::PrintOnlyRegex) { + if (Function.hasNameRegex(Name)) { + return true; + } + } + return false; } @@ -160,6 +175,11 @@ constexpr unsigned BinaryFunction::MinAlign; namespace { +template +bool emptyRange(const R &Range) { + return Range.begin() == Range.end(); +} + /// Gets debug line information for the instruction located at the given /// address in the original binary. 
The SMLoc's pointer is used /// to point to this information, which is represented by a @@ -227,6 +247,14 @@ bool DynoStats::lessThan(const DynoStats &Other, uint64_t BinaryFunction::Count = 0; +bool BinaryFunction::hasNameRegex(const std::string &NameRegex) const { + Regex MatchName(NameRegex); + for (auto &Name : Names) + if (MatchName.match(Name)) + return true; + return false; +} + BinaryBasicBlock * BinaryFunction::getBasicBlockContainingOffset(uint64_t Offset) { if (Offset > Size) @@ -558,6 +586,31 @@ void BinaryFunction::print(raw_ostream &OS, std::string Annotation, OS << "End of Function \"" << *this << "\"\n\n"; } +void BinaryFunction::printRelocations(raw_ostream &OS, + uint64_t Offset, + uint64_t Size) const { + const char* Sep = " # Relocs: "; + + auto RI = Relocations.lower_bound(Offset); + while (RI != Relocations.end() && RI->first < Offset + Size) { + OS << Sep << "(R: " << RI->second << ")"; + Sep = ", "; + ++RI; + } + + RI = MoveRelocations.lower_bound(Offset); + while (RI != MoveRelocations.end() && RI->first < Offset + Size) { + OS << Sep << "(M: " << RI->second << ")"; + Sep = ", "; + ++RI; + } + + auto PI = PCRelativeRelocationOffsets.lower_bound(Offset); + if (PI != PCRelativeRelocationOffsets.end() && *PI < Offset + Size) { + OS << Sep << "(pcrel)"; + } +} + IndirectBranchType BinaryFunction::processIndirectBranch(MCInst &Instruction, unsigned Size, uint64_t Offset) { @@ -566,7 +619,7 @@ IndirectBranchType BinaryFunction::processIndirectBranch(MCInst &Instruction, // An instruction referencing memory used by jump instruction (directly or // via register). This location could be an array of function pointers // in case of indirect tail call, or a jump table. - const MCInst *MemLocInstr; + MCInst *MemLocInstr; // Address of the table referenced by MemLocInstr. Could be either an // array of function pointers, or a jump table. 
@@ -834,6 +887,8 @@ void BinaryFunction::disassemble(ArrayRef FunctionData) { DWARFUnitLineTable ULT = getDWARFUnitLineTable(); + matchProfileMemData(); + // Insert a label at the beginning of the function. This will be our first // basic block. Labels[0] = Ctx->createTempSymbol("BB0", false); @@ -1181,6 +1236,10 @@ void BinaryFunction::disassemble(ArrayRef FunctionData) { findDebugLineInformationForInstructionAt(AbsoluteInstrAddr, ULT)); } + if (MemData && !emptyRange(MemData->getMemInfoRange(Offset))) { + MIA->addAnnotation(Ctx.get(), Instruction, "MemDataOffset", Offset); + } + addInstruction(Offset, std::move(Instruction)); } @@ -1892,6 +1951,23 @@ bool BinaryFunction::fetchProfileForOtherEntryPoints() { return Updated; } +void BinaryFunction::matchProfileMemData() { + const auto AllMemData = BC.DR.getFuncMemDataRegex(getNames()); + for (auto *NewMemData : AllMemData) { + // Prevent functions from sharing the same profile. + if (NewMemData->Used) + continue; + + if (MemData) + MemData->Used = false; + + // Update function profile data with the new set. + MemData = NewMemData; + MemData->Used = true; + break; + } +} + void BinaryFunction::matchProfileData() { // This functionality is available for LBR-mode only // TODO: Implement evaluateProfileData() for samples, checking whether diff --git a/bolt/BinaryFunction.h b/bolt/BinaryFunction.h index 01a981c73813..c803f4c849e5 100644 --- a/bolt/BinaryFunction.h +++ b/bolt/BinaryFunction.h @@ -296,6 +296,9 @@ class BinaryFunction { /// Profile data for branches. FuncBranchData *BranchData{nullptr}; + /// Profile data for memory loads. + FuncMemData *MemData{nullptr}; + /// Profile match ratio for BranchData. float ProfileMatchRatio{0.0f}; @@ -453,7 +456,7 @@ class BinaryFunction { LabelsMapType Labels; /// Temporary holder of instructions before CFG is constructed. - /// Map offset in the function to MCInst. + /// Map offset in the function to MCInst index. 
using InstrMapType = std::map; InstrMapType InstructionOffsets; std::vector Instructions; @@ -1014,6 +1017,10 @@ class BinaryFunction { return false; } + /// Check if (possibly one out of many) function name matches the given + /// regex. + bool hasNameRegex(const std::string &NameRegex) const; + /// Return a vector of all possible names for the function. const std::vector &getNames() const { return Names; @@ -1455,6 +1462,10 @@ class BinaryFunction { void print(raw_ostream &OS, std::string Annotation = "", bool PrintInstructions = true) const; + /// Print all relocations between \p Offset and \p Offset + \p Size in + /// this function. + void printRelocations(raw_ostream &OS, uint64_t Offset, uint64_t Size) const; + /// Return true if function has a profile, even if the profile does not /// match CFG 100%. bool hasProfile() const { @@ -1821,6 +1832,10 @@ class BinaryFunction { /// blocks. void matchProfileData(); + /// Find the best matching memory data profile for a function before the + /// creation of basic blocks. + void matchProfileMemData(); + /// Check how closely the profile data matches the function and set /// Return accuracy (ranging from 0.0 to 1.0) of matching. float evaluateProfileData(const FuncBranchData &BranchData); @@ -1831,15 +1846,34 @@ class BinaryFunction { return BranchData; } + /// Return profile data associated with this function, or nullptr if the + /// function has no associated profile. FuncBranchData *getBranchData() { return BranchData; } + /// Return memory profile data associated with this function, or nullptr + /// if the function has no associated profile. + const FuncMemData *getMemData() const { + return MemData; + } + + /// Return memory profile data associated with this function, or nullptr + /// if the function has no associated profile. 
+ FuncMemData *getMemData() { + return MemData; + } + /// Updates profile data associated with this function void setBranchData(FuncBranchData *Data) { BranchData = Data; } + /// Updates the memory profile data associated with this function + void setMemData(FuncMemData *Data) { + MemData = Data; + } + /// Walks the list of basic blocks filling in missing information about /// edge frequency for fall-throughs. /// diff --git a/bolt/DataAggregator.cpp b/bolt/DataAggregator.cpp index 8912657f2305..dbce1e4ec465 100644 --- a/bolt/DataAggregator.cpp +++ b/bolt/DataAggregator.cpp @@ -821,7 +821,9 @@ std::error_code DataAggregator::parseMemEvents() { }); if (Func) { - FuncsToMemEvents[FuncName].update(FuncLoc, AddrLoc); + auto *MemData = &FuncsToMemEvents[FuncName]; + Func->setMemData(MemData); + MemData->update(FuncLoc, AddrLoc); DEBUG(dbgs() << "Mem event: " << FuncLoc << " = " << AddrLoc << "\n"); } } diff --git a/bolt/DataReader.cpp b/bolt/DataReader.cpp index 7a0eb57d3ef3..c58919b2a360 100644 --- a/bolt/DataReader.cpp +++ b/bolt/DataReader.cpp @@ -306,7 +306,8 @@ void MemInfo::print(raw_ostream &OS) const { iterator_range FuncMemData::getMemInfoRange(uint64_t Offset) const { - assert(std::is_sorted(Data.begin(), Data.end())); + // Commented out because it can be expensive. + //assert(std::is_sorted(Data.begin(), Data.end())); struct Compare { bool operator()(const MemInfo &MI, const uint64_t Val) const { return MI.Offset.Offset < Val; diff --git a/bolt/DataReader.h b/bolt/DataReader.h index a0623e61aa41..852e6f177417 100644 --- a/bolt/DataReader.h +++ b/bolt/DataReader.h @@ -222,6 +222,9 @@ struct FuncMemData { StringRef Name; ContainerTy Data; + /// Indicate if the data was used. + bool Used{false}; + DenseMap> EventIndex; /// Find all the memory events originating at Offset. 
diff --git a/bolt/Passes/BinaryPasses.cpp b/bolt/Passes/BinaryPasses.cpp index 3ad608edab32..8dbaba51922d 100644 --- a/bolt/Passes/BinaryPasses.cpp +++ b/bolt/Passes/BinaryPasses.cpp @@ -194,6 +194,7 @@ Peepholes("peepholes", cl::desc("enable peephole optimizations"), cl::value_desc("opt1,opt2,opt3,..."), cl::values( + clEnumValN(PEEP_NONE, "none", "disable peepholes"), clEnumValN(PEEP_SHORTEN, "shorten", "perform instruction shortening"), clEnumValN(PEEP_DOUBLE_JUMPS, "double-jumps", "remove double jumps when able"), @@ -566,6 +567,13 @@ void FinalizeFunctions::runOnFunctions( auto &Function = It.second; const auto ShouldOptimize = shouldOptimize(Function); + // Strip all annotations. + for (auto &BB : Function) { + for (auto &Inst : BB) { + BC.MIA->removeAllAnnotations(Inst); + } + } + // Always fix functions in relocation mode. if (!opts::Relocs && !ShouldOptimize) continue; @@ -632,12 +640,18 @@ uint64_t fixDoubleJumps(BinaryContext &BC, // We must patch up any existing branch instructions to match up // with the new successor. auto *Ctx = BC.Ctx.get(); + assert((CondBranch || (!CondBranch && Pred->succ_size() == 1)) && + "Predecessor block has inconsistent number of successors"); if (CondBranch && BC.MIA->getTargetSymbol(*CondBranch) == BB.getLabel()) { BC.MIA->replaceBranchTarget(*CondBranch, Succ->getLabel(), Ctx); } else if (UncondBranch && BC.MIA->getTargetSymbol(*UncondBranch) == BB.getLabel()) { BC.MIA->replaceBranchTarget(*UncondBranch, Succ->getLabel(), Ctx); + } else if (!UncondBranch) { + assert(Function.getBasicBlockAfter(Pred, false) != Succ && + "Don't add an explicit jump to a fallthrough block."); + Pred->addBranchInstruction(Succ); } } else { // Succ will be null in the tail call case. 
In this case we diff --git a/bolt/Passes/FrameAnalysis.cpp b/bolt/Passes/FrameAnalysis.cpp index 9418eecad3b6..a4e157192775 100644 --- a/bolt/Passes/FrameAnalysis.cpp +++ b/bolt/Passes/FrameAnalysis.cpp @@ -534,12 +534,12 @@ FrameAnalysis::FrameAnalysis(BinaryContext &BC, } void FrameAnalysis::printStats() { - outs() << "BOLT-INFO FRAME ANALYSIS: " << NumFunctionsNotOptimized + outs() << "BOLT-INFO: FRAME ANALYSIS: " << NumFunctionsNotOptimized << " function(s) " << format("(%.1lf%% dyn cov)", (100.0 * CountFunctionsNotOptimized / CountDenominator)) << " were not optimized.\n" - << "BOLT-INFO FRAME ANALYSIS: " << NumFunctionsFailedRestoreFI + << "BOLT-INFO: FRAME ANALYSIS: " << NumFunctionsFailedRestoreFI << " function(s) " << format("(%.1lf%% dyn cov)", (100.0 * CountFunctionsFailedRestoreFI / CountDenominator)) diff --git a/bolt/Passes/IndirectCallPromotion.cpp b/bolt/Passes/IndirectCallPromotion.cpp index 8f3e3986cf00..25f1a678c30a 100644 --- a/bolt/Passes/IndirectCallPromotion.cpp +++ b/bolt/Passes/IndirectCallPromotion.cpp @@ -12,8 +12,10 @@ #include "IndirectCallPromotion.h" #include "DataflowInfoManager.h" #include "llvm/Support/Options.h" +#include #define DEBUG_TYPE "ICP" +#define DEBUG_VERBOSE(Level, X) if (opts::Verbosity >= (Level)) { X; } using namespace llvm; using namespace bolt; @@ -74,6 +76,52 @@ IndirectCallPromotionTopN( cl::ZeroOrMore, cl::cat(BoltOptCategory)); +static cl::opt +IndirectCallPromotionCallsTopN( + "indirect-call-promotion-calls-topn", + cl::desc("number of targets to consider when doing indirect " + "call promotion on calls"), + cl::init(0), + cl::ZeroOrMore, + cl::cat(BoltOptCategory)); + +static cl::opt +IndirectCallPromotionJumpTablesTopN( + "indirect-call-promotion-jump-tables-topn", + cl::desc("number of targets to consider when doing indirect " + "call promotion on jump tables"), + cl::init(0), + cl::ZeroOrMore, + cl::cat(BoltOptCategory)); + +static cl::opt +EliminateLoads( + "icp-eliminate-loads", + cl::desc("enable load 
elimination using memory profiling data when " + "performing ICP"), + cl::init(true), + cl::ZeroOrMore, + cl::cat(BoltOptCategory)); + +static cl::opt +ICPAlwaysOn( + "icp-always-on", + cl::desc("enable ICP for all eligible callsites"), + cl::init(false), + cl::Hidden, + cl::ZeroOrMore, + cl::cat(BoltOptCategory)); + +static cl::opt +ICPTopCallsites( + "icp-top-callsites", + cl::desc("only optimize calls that contribute to this percentage of all " + "indirect calls"), + cl::init(0), + cl::Hidden, + cl::ZeroOrMore, + cl::cat(BoltOptCategory)); + static cl::list ICPFuncsList("icp-funcs", cl::CommaSeparated, @@ -194,6 +242,11 @@ IndirectCallPromotion::getCallTargets( Targets.erase(Result, Targets.end()); } else { + // Don't try to optimize PC relative indirect calls. + if (Inst.getOperand(0).isReg() && + Inst.getOperand(0).getReg() == BC.MRI->getProgramCounter()) { + return Targets; + } const auto *BranchData = BF.getBranchData(); assert(BranchData && "expected initialized branch data"); auto Offset = BC.MIA->getAnnotationAs(Inst, "Offset"); @@ -247,37 +300,364 @@ IndirectCallPromotion::getCallTargets( return Targets; } -std::vector> +IndirectCallPromotion::JumpTableInfoType +IndirectCallPromotion::maybeGetHotJumpTableTargets( + BinaryContext &BC, + BinaryFunction &Function, + BinaryBasicBlock *BB, + MCInst &CallInst, + MCInst *&TargetFetchInst, + const BinaryFunction::JumpTable *JT, + const std::vector &Targets +) const { + const auto *MemData = Function.getMemData(); + JumpTableInfoType HotTargets; + + assert(JT && "Can't get jump table addrs for non-jump tables."); + + if (!MemData || !opts::EliminateLoads) + return JumpTableInfoType(); + + MCInst *MemLocInstr; + MCInst *PCRelBaseOut; + unsigned BaseReg, IndexReg; + int64_t DispValue; + const MCExpr *DispExpr; + MutableArrayRef Insts(&BB->front(), &CallInst); + const auto Type = BC.MIA->analyzeIndirectBranch(CallInst, + Insts, + BC.AsmInfo->getPointerSize(), + MemLocInstr, + BaseReg, + IndexReg, + DispValue, + 
DispExpr, + PCRelBaseOut); + + assert(MemLocInstr && "There should always be a load for jump tables"); + if (!MemLocInstr) + return JumpTableInfoType(); + + DEBUG({ + dbgs() << "BOLT-INFO: ICP attempting to find memory profiling data for " + << "jump table in " << Function << " at @ " + << (&CallInst - &BB->front()) << "\n" + << "BOLT-INFO: ICP target fetch instructions:\n"; + BC.printInstruction(dbgs(), *MemLocInstr, 0, &Function); + if (MemLocInstr != &CallInst) { + BC.printInstruction(dbgs(), CallInst, 0, &Function); + } + }); + + DEBUG_VERBOSE(1, { + dbgs() << "Jmp info: Type = " << (unsigned)Type << ", " + << "BaseReg = " << BC.MRI->getName(BaseReg) << ", " + << "IndexReg = " << BC.MRI->getName(IndexReg) << ", " + << "DispValue = " << Twine::utohexstr(DispValue) << ", " + << "DispExpr = " << DispExpr << ", " + << "MemLocInstr = "; + BC.printInstruction(dbgs(), *MemLocInstr, 0, &Function); + dbgs() << "\n"; + }); + + ++TotalIndexBasedCandidates; + + // Try to get value profiling data for the method load instruction. 
+ auto DataOffset = BC.MIA->tryGetAnnotationAs(*MemLocInstr, + "MemDataOffset"); + + if (!DataOffset) { + DEBUG_VERBOSE(1, dbgs() << "BOLT-INFO: ICP no memory profiling data found\n"); + return JumpTableInfoType(); + } + + uint64_t ArrayStart; + if (DispExpr) { + auto SI = BC.GlobalSymbols.find(DispExpr->getSymbol().getName()); + assert(SI != BC.GlobalSymbols.end() && "global symbol needs a value"); + ArrayStart = SI->second; + } else { + ArrayStart = static_cast(DispValue); + } + + if (BaseReg == BC.MRI->getProgramCounter()) { + auto FunctionData = BC.getFunctionData(Function); + const uint64_t Address = Function.getAddress() + DataOffset.get(); + MCInst OrigJmp; + uint64_t Size; + assert(FunctionData); + auto Success = BC.DisAsm->getInstruction(OrigJmp, + Size, + *FunctionData, + Address, + nulls(), + nulls()); + assert(Success && "Must be able to disassmble original jump instruction"); + ArrayStart += Address + Size; + } + + for (const auto &MI : MemData->getMemInfoRange(DataOffset.get())) { + size_t Index; + if (MI.Addr.Offset % JT->EntrySize != 0) // ignore bogus data + continue; + + if (MI.Addr.IsSymbol) { + // Deal with bad/stale data + if (MI.Addr.Name != (std::string("JUMP_TABLEat0x") + + Twine::utohexstr(JT->Address).str()) && + MI.Addr.Name != (std::string("JUMP_TABLEat0x") + + Twine::utohexstr(ArrayStart).str())) + continue; + Index = MI.Addr.Offset / JT->EntrySize; + } else { + Index = (MI.Addr.Offset - ArrayStart) / JT->EntrySize; + } + + // If Index is out of range it probably means the memory profiling data is + // wrong for this instruction, bail out. 
+ if (Index >= JT->getSize()) + continue; + + assert(std::accumulate(Targets.begin(), + Targets.end(), + false, + [Index](bool Found, const Callsite &CS) { + return (Found || + std::find(CS.JTIndex.begin(), + CS.JTIndex.end(), + Index) != CS.JTIndex.end()); + }) && + "hot indices must be referred to by at least one callsite"); + + HotTargets.emplace_back(std::make_pair(MI.Count, Index)); + } + + // Sort with highest counts first. + std::sort(HotTargets.rbegin(), HotTargets.rend()); + + DEBUG({ + dbgs() << "BOLT-INFO: ICP jump table hot targets:\n"; + for (const auto &Target : HotTargets) { + dbgs() << "BOLT-INFO: Idx = " << Target.second << ", " + << "Count = " << Target.first << "\n"; + } + }); + + BC.MIA->getOrCreateAnnotationAs(BC.Ctx.get(), + CallInst, + "JTIndexReg") = IndexReg; + + TargetFetchInst = MemLocInstr; + + return HotTargets; +} + +IndirectCallPromotion::SymTargetsType IndirectCallPromotion::findCallTargetSymbols( BinaryContext &BC, - const std::vector &Targets, - const size_t N + std::vector &Targets, + const size_t N, + BinaryFunction &Function, + BinaryBasicBlock *BB, + MCInst &CallInst, + MCInst *&TargetFetchInst ) const { - std::vector> SymTargets; - - size_t TgtIdx = 0; - for (size_t I = 0; I < N; ++TgtIdx) { - assert(Targets[TgtIdx].To.IsSymbol && "All ICP targets must be to known symbols"); - if (Targets[TgtIdx].JTIndex.empty()) { - SymTargets.push_back(std::make_pair(Targets[TgtIdx].To.Sym, 0)); - ++I; + const auto *JT = Function.getJumpTable(CallInst); + SymTargetsType SymTargets; + + if (JT) { + std::vector NewTargets; + std::set ToDelete; + + auto findTargetSymbol = + [&](uint64_t Index, const std::vector &Targets) -> MCSymbol * { + size_t Idx = 0; + for (const auto &CS : Targets) { + assert(CS.To.IsSymbol && "All ICP targets must be to known symbols"); + assert(!CS.JTIndex.empty()); + if (std::find(CS.JTIndex.begin(), CS.JTIndex.end(), Index) != + CS.JTIndex.end()) { + ToDelete.insert(Idx); + NewTargets.push_back(CS); + // Since we know 
the hot index, delete the rest. + NewTargets.back().JTIndex.clear(); + NewTargets.back().JTIndex.push_back(Index); + return CS.To.Sym; + } + ++Idx; + } + return nullptr; + }; + + auto HotTargets = maybeGetHotJumpTableTargets(BC, + Function, + BB, + CallInst, + TargetFetchInst, + JT, + Targets); + + if (!HotTargets.empty()) { + HotTargets.resize(std::min(N, HotTargets.size())); + for (const auto &HT : HotTargets) { + auto *Sym = findTargetSymbol(HT.second, Targets); + assert(Sym); + SymTargets.push_back(std::make_pair(Sym, HT.second)); + } + for (size_t I = 0; I < Targets.size(); ++I) { + if (ToDelete.count(I) == 0) + NewTargets.push_back(Targets[I]); + } + std::swap(NewTargets, Targets); } else { - for (auto Idx : Targets[TgtIdx].JTIndex) { - SymTargets.push_back(std::make_pair(Targets[TgtIdx].To.Sym, Idx)); - ++I; + for (size_t I = 0, TgtIdx = 0; I < N; ++TgtIdx) { + assert(Targets[TgtIdx].To.IsSymbol && + "All ICP targets must be to known symbols"); + assert(!Targets[TgtIdx].JTIndex.empty() && + "Jump tables must have indices"); + for (auto Idx : Targets[TgtIdx].JTIndex) { + SymTargets.push_back(std::make_pair(Targets[TgtIdx].To.Sym, Idx)); + ++I; + } } } + } else { + for (size_t I = 0; I < N; ++I) { + assert(Targets[I].To.IsSymbol && + "All ICP targets must be to known symbols"); + assert(Targets[I].JTIndex.empty() && + "Can't have jump table indices for non-jump tables"); + SymTargets.push_back(std::make_pair(Targets[I].To.Sym, 0)); + } } return SymTargets; } +IndirectCallPromotion::MethodInfoType +IndirectCallPromotion::maybeGetVtableAddrs( + BinaryContext &BC, + BinaryFunction &Function, + BinaryBasicBlock *BB, + MCInst &Inst, + const SymTargetsType &SymTargets +) const { + const auto *MemData = Function.getMemData(); + std::vector VtableAddrs; + std::vector MethodFetchInsns; + unsigned VtableReg, MethodReg; + uint64_t MethodOffset; + + assert(!Function.getJumpTable(Inst) && + "Can't get vtable addrs for jump tables."); + + if (!MemData || 
!opts::EliminateLoads) + return MethodInfoType(); + + MutableArrayRef Insts(&BB->front(), &Inst + 1); + if (!BC.MIA->analyzeVirtualMethodCall(Insts, + MethodFetchInsns, + VtableReg, + MethodReg, + MethodOffset)) { + DEBUG_VERBOSE(1, dbgs() << "BOLT-INFO: ICP unable to analyze method call in " + << Function << " at @ " << (&Inst - &BB->front()) + << "\n"); + return MethodInfoType(); + } + + ++TotalMethodLoadEliminationCandidates; + + DEBUG_VERBOSE(1, + dbgs() << "BOLT-INFO: ICP found virtual method call in " + << Function << " at @ " << (&Inst - &BB->front()) << "\n"; + dbgs() << "BOLT-INFO: ICP method fetch instructions:\n"; + for (auto *Inst : MethodFetchInsns) { + BC.printInstruction(dbgs(), *Inst, 0, &Function); + } + if (MethodFetchInsns.back() != &Inst) { + BC.printInstruction(dbgs(), Inst, 0, &Function); + } + ); + + // Try to get value profiling data for the method load instruction. + auto DataOffset = BC.MIA->tryGetAnnotationAs(*MethodFetchInsns.back(), + "MemDataOffset"); + + if (!DataOffset) { + DEBUG_VERBOSE(1, dbgs() << "BOLT-INFO: ICP no memory profiling data found\n"); + return MethodInfoType(); + } + + // Find the vtable that each method belongs to. + std::map MethodToVtable; + + for (auto &MI : MemData->getMemInfoRange(DataOffset.get())) { + ErrorOr Address = MI.Addr.IsSymbol + ? BC.getAddressForGlobalSymbol(MI.Addr.Name) + : MI.Addr.Offset; + + // Ignore bogus data. 
+ if (!Address) + continue; + + if (MI.Addr.IsSymbol) + Address = Address.get() + MI.Addr.Offset; + + const auto VtableBase = Address.get() - MethodOffset; + + DEBUG_VERBOSE(1, dbgs() << "BOLT-INFO: ICP vtable = " + << Twine::utohexstr(VtableBase) + << "+" << MethodOffset << "/" << MI.Count + << "\n"); + + if (auto MethodAddr = BC.extractPointerAtAddress(Address.get())) { + auto *MethodSym = BC.getGlobalSymbolAtAddress(MethodAddr.get()); + MethodToVtable[MethodSym] = VtableBase; + DEBUG_VERBOSE(1, + const auto *Method = BC.getFunctionForSymbol(MethodSym); + dbgs() << "BOLT-INFO: ICP found method = " + << Twine::utohexstr(MethodAddr.get()) << "/" + << (Method ? Method->getPrintName() : "") << "\n"; + ); + } + } + + // Find the vtable for each target symbol. + for (size_t I = 0; I < SymTargets.size(); ++I) { + auto Itr = MethodToVtable.find(SymTargets[I].first); + if (Itr != MethodToVtable.end()) { + VtableAddrs.push_back(Itr->second); + } else { + // Give up if we can't find the vtable for a method. 
+ DEBUG_VERBOSE(1, dbgs() << "BOLT-INFO: ICP can't find vtable for " + << SymTargets[I].first->getName() << "\n"); + return MethodInfoType(); + } + } + + // Make sure the vtable reg is not clobbered by the argument passing code + if (VtableReg != MethodReg) { + for (auto *CurInst = MethodFetchInsns.front(); CurInst < &Inst; ++CurInst) { + const auto &InstrInfo = BC.MII->get(CurInst->getOpcode()); + if (InstrInfo.hasDefOfPhysReg(*CurInst, VtableReg, *BC.MRI)) { + return MethodInfoType(); + } + } + } + + return MethodInfoType(VtableAddrs, MethodFetchInsns); +} + std::vector> -IndirectCallPromotion::rewriteCall(BinaryContext &BC, - BinaryFunction &Function, - BinaryBasicBlock *IndCallBlock, - const MCInst &CallInst, - MCInstrAnalysis::ICPdata &&ICPcode) const { +IndirectCallPromotion::rewriteCall( + BinaryContext &BC, + BinaryFunction &Function, + BinaryBasicBlock *IndCallBlock, + const MCInst &CallInst, + MCInstrAnalysis::ICPdata &&ICPcode, + const std::vector &MethodFetchInsns +) const { // Create new basic blocks with correct code in each one first. std::vector> NewBBs; const bool IsTailCallOrJT = (BC.MIA->isTailCall(CallInst) || @@ -289,7 +669,7 @@ IndirectCallPromotion::rewriteCall(BinaryContext &BC, // Remember any pseudo instructions following a tail call. These // must be preserved and moved to the original block. 
std::vector TailInsts; - const auto *TailInst= &CallInst; + const auto *TailInst = &CallInst; if (IsTailCallOrJT) { while (TailInst + 1 < &(*IndCallBlock->end()) && BC.MII->get((TailInst + 1)->getOpcode()).isPseudo()) { @@ -299,7 +679,16 @@ IndirectCallPromotion::rewriteCall(BinaryContext &BC, auto MovedInst = IndCallBlock->splitInstructions(&CallInst); - IndCallBlock->replaceInstruction(&CallInst, ICPcode.front().second); + IndCallBlock->eraseInstructions(MethodFetchInsns.begin(), + MethodFetchInsns.end()); + if (IndCallBlock->empty() || + (!MethodFetchInsns.empty() && MethodFetchInsns.back() == &CallInst)) { + IndCallBlock->addInstructions(ICPcode.front().second.begin(), + ICPcode.front().second.end()); + } else { + IndCallBlock->replaceInstruction(&IndCallBlock->back(), + ICPcode.front().second); + } IndCallBlock->addInstructions(TailInsts.begin(), TailInsts.end()); for (auto Itr = ICPcode.begin() + 1; Itr != ICPcode.end(); ++Itr) { @@ -319,8 +708,6 @@ IndirectCallPromotion::rewriteCall(BinaryContext &BC, // the merge block. 
if (!IsTailCallOrJT) { NewBBs.back()->addInstructions(MovedInst.begin(), MovedInst.end()); - } else { - // assert(MovedInst.empty()); empty or just CFI } return NewBBs; @@ -381,7 +768,8 @@ BinaryBasicBlock *IndirectCallPromotion::fixCFG( std::vector BBI; for (auto Itr = Targets.begin(); Itr != Targets.end(); ++Itr) { const auto BranchPct = (double)Itr->Branches / TotalIndirectBranches; - const auto MispredPct = (double)Itr->Mispreds / TotalIndirectMispreds; + const auto MispredPct = + (double)Itr->Mispreds / std::max(TotalIndirectMispreds, 1ul); if (Itr->JTIndex.empty()) { BBI.push_back(BinaryBranchInfo{uint64_t(TotalCount * BranchPct), uint64_t(TotalMispreds * MispredPct)}); @@ -402,10 +790,8 @@ BinaryBasicBlock *IndirectCallPromotion::fixCFG( ++BI; }; - if (IsTailCall || IsJumpTable) { - if (IsJumpTable) { - moveSuccessors(IndCallBlock, NewBBs.back().get()); - } + if (IsJumpTable) { + moveSuccessors(IndCallBlock, NewBBs.back().get()); std::vector SymTargets; for (size_t I = 0; I < Targets.size(); ++I) { @@ -421,10 +807,8 @@ BinaryBasicBlock *IndirectCallPromotion::fixCFG( // Fix up successors and execution counts. 
updateCurrentBranchInfo(); - if (IsJumpTable) { - auto *Succ = Function.getBasicBlockForLabel(SymTargets[0]); - IndCallBlock->addSuccessor(Succ, BBI[0]); // cond branch - } + auto *Succ = Function.getBasicBlockForLabel(SymTargets[0]); + IndCallBlock->addSuccessor(Succ, BBI[0]); // cond branch IndCallBlock->addSuccessor(NewBBs[0].get(), TotalCount); // fallthru branch for (size_t I = 0; I < NewBBs.size() - 1; ++I) { @@ -432,39 +816,40 @@ BinaryBasicBlock *IndirectCallPromotion::fixCFG( TotalCount <= uint64_t(TotalIndirectBranches)); uint64_t ExecCount = BBI[I+1].Count; updateCurrentBranchInfo(); - if (IsJumpTable) { - auto *Succ = Function.getBasicBlockForLabel(SymTargets[I+1]); - NewBBs[I]->addSuccessor(Succ, BBI[I+1]); - } + auto *Succ = Function.getBasicBlockForLabel(SymTargets[I+1]); + NewBBs[I]->addSuccessor(Succ, BBI[I+1]); NewBBs[I]->addSuccessor(NewBBs[I+1].get(), TotalCount); // fallthru ExecCount += TotalCount; NewBBs[I]->setCanOutline(IndCallBlock->canOutline()); NewBBs[I]->setIsCold(IndCallBlock->isCold()); NewBBs[I]->setExecutionCount(ExecCount); } - } else { assert(NewBBs.size() >= 2); assert(NewBBs.size() % 2 == 1 || IndCallBlock->succ_empty()); - assert(NewBBs.size() % 2 == 1); - - MergeBlock = NewBBs.back().get(); + assert(NewBBs.size() % 2 == 1 || IsTailCall); - moveSuccessors(IndCallBlock, MergeBlock); + if (!IsTailCall) { + MergeBlock = NewBBs.back().get(); + moveSuccessors(IndCallBlock, MergeBlock); + } // Fix up successors and execution counts. updateCurrentBranchInfo(); IndCallBlock->addSuccessor(NewBBs[1].get(), TotalCount); // cond branch IndCallBlock->addSuccessor(NewBBs[0].get(), BBI[0]); // uncond branch - for (size_t I = 0; I < NewBBs.size() - 2; ++I) { + const size_t Adj = IsTailCall ? 
1 : 2; + for (size_t I = 0; I < NewBBs.size() - Adj; ++I) { assert(TotalCount <= IndCallBlock->getExecutionCount() || TotalCount <= uint64_t(TotalIndirectBranches)); uint64_t ExecCount = BBI[(I+1)/2].Count; NewBBs[I]->setCanOutline(IndCallBlock->canOutline()); NewBBs[I]->setIsCold(IndCallBlock->isCold()); if (I % 2 == 0) { - NewBBs[I]->addSuccessor(MergeBlock, BBI[(I+1)/2].Count); // uncond + if (MergeBlock) { + NewBBs[I]->addSuccessor(MergeBlock, BBI[(I+1)/2].Count); // uncond + } } else { assert(I + 2 < NewBBs.size()); updateCurrentBranchInfo(); @@ -475,16 +860,18 @@ BinaryBasicBlock *IndirectCallPromotion::fixCFG( NewBBs[I]->setExecutionCount(ExecCount); } - // Arrange for the MergeBlock to be the fallthrough for the first - // promoted call block. - MergeBlock->setCanOutline(IndCallBlock->canOutline()); - MergeBlock->setIsCold(IndCallBlock->isCold()); - std::unique_ptr MBPtr; - std::swap(MBPtr, NewBBs.back()); - NewBBs.pop_back(); - NewBBs.emplace(NewBBs.begin() + 1, std::move(MBPtr)); - // TODO: is COUNT_FALLTHROUGH_EDGE the right thing here? - NewBBs.back()->addSuccessor(MergeBlock, TotalCount); // uncond branch + if (MergeBlock) { + // Arrange for the MergeBlock to be the fallthrough for the first + // promoted call block. + MergeBlock->setCanOutline(IndCallBlock->canOutline()); + MergeBlock->setIsCold(IndCallBlock->isCold()); + std::unique_ptr MBPtr; + std::swap(MBPtr, NewBBs.back()); + NewBBs.pop_back(); + NewBBs.emplace(NewBBs.begin() + 1, std::move(MBPtr)); + // TODO: is COUNT_FALLTHROUGH_EDGE the right thing here? + NewBBs.back()->addSuccessor(MergeBlock, TotalCount); // uncond branch + } } // cold call block @@ -507,7 +894,7 @@ IndirectCallPromotion::canPromoteCallsite(const BinaryBasicBlock *BB, uint64_t NumCalls) { const bool IsJumpTable = BB->getFunction()->getJumpTable(Inst); - // If we have no targets (or no calls), skip this callsite. + // If we have no targets (or no calls), skip this callsite. 
if (Targets.empty() || !NumCalls) { if (opts::Verbosity >= 1) { const auto InstIdx = &Inst - &(*BB->begin()); @@ -519,15 +906,21 @@ IndirectCallPromotion::canPromoteCallsite(const BinaryBasicBlock *BB, return 0; } - const auto TrialN = std::min(size_t(opts::IndirectCallPromotionTopN), - Targets.size()); + size_t TopN = opts::IndirectCallPromotionTopN; + if (IsJumpTable) { + if (opts::IndirectCallPromotionJumpTablesTopN != 0) + TopN = opts::IndirectCallPromotionJumpTablesTopN; + } else if (opts::IndirectCallPromotionCallsTopN != 0) { + TopN = opts::IndirectCallPromotionCallsTopN; + } + const auto TrialN = std::min(TopN, Targets.size()); - if (!opts::ICPFuncsList.empty()) { - for (auto &Name : opts::ICPFuncsList) { - if (BB->getFunction()->hasName(Name)) - return TrialN; - } - return 0; + if (opts::ICPAlwaysOn) + return TrialN; + + if (opts::ICPTopCallsites > 0) { + auto &BC = BB->getFunction()->getBinaryContext(); + return BC.MIA->hasAnnotation(Inst, "DoICP") ? TrialN : 0; } // Pick the top N targets. @@ -606,29 +999,37 @@ IndirectCallPromotion::canPromoteCallsite(const BinaryBasicBlock *BB, // Don't check misprediction frequency for jump tables -- we don't really // care as long as we are saving loads from the jump table. - if (IsJumpTable && !opts::ICPJumpTablesByTarget) - return N; - - // Compute the misprediction frequency of the top N call targets. If - // this frequency is less than the threshold, we should skip ICP at - // this callsite. 
- const double TopNMispredictFrequency = - (100.0 * TotalMispredictsTopN) / NumCalls; - - if (TopNMispredictFrequency < - opts::IndirectCallPromotionMispredictThreshold) { - if (opts::Verbosity >= 1) { - const auto InstIdx = &Inst - &(*BB->begin()); - outs() << "BOLT-INFO: ICP failed in " << *BB->getFunction() << " @ " - << InstIdx << " in " << BB->getName() << ", calls = " - << NumCalls << ", top N mispredict frequency " - << format("%.1f", TopNMispredictFrequency) << "% < " - << opts::IndirectCallPromotionMispredictThreshold << "%\n"; + if (!IsJumpTable || opts::ICPJumpTablesByTarget) { + // Compute the misprediction frequency of the top N call targets. If + // this frequency is less than the threshold, we should skip ICP at + // this callsite. + const double TopNMispredictFrequency = + (100.0 * TotalMispredictsTopN) / NumCalls; + + if (TopNMispredictFrequency < + opts::IndirectCallPromotionMispredictThreshold) { + if (opts::Verbosity >= 1) { + const auto InstIdx = &Inst - &(*BB->begin()); + outs() << "BOLT-INFO: ICP failed in " << *BB->getFunction() << " @ " + << InstIdx << " in " << BB->getName() << ", calls = " + << NumCalls << ", top N mispredict frequency " + << format("%.1f", TopNMispredictFrequency) << "% < " + << opts::IndirectCallPromotionMispredictThreshold << "%\n"; + } + return 0; } - return 0; } } + // Filter functions that can have ICP applied (for debugging) + if (!opts::ICPFuncsList.empty()) { + for (auto &Name : opts::ICPFuncsList) { + if (BB->getFunction()->hasName(Name)) + return N; + } + return 0; + } + return N; } @@ -662,6 +1063,11 @@ IndirectCallPromotion::printCallsiteInfo(const BinaryBasicBlock *BB, << ", mispreds = " << Targets[I].Mispreds << ", taken freq = " << format("%.1f", Frequency) << "%" << ", mis. freq = " << format("%.1f", MisFrequency) << "%"; + bool First = true; + for (auto JTIndex : Targets[I].JTIndex) { + outs() << (First ? 
", indices = " : ", ") << JTIndex; + First = false; + } } outs() << "\n"; @@ -679,6 +1085,13 @@ void IndirectCallPromotion::runOnFunctions( if (opts::IndirectCallPromotion == ICP_NONE) return; + const bool OptimizeCalls = + (opts::IndirectCallPromotion == ICP_CALLS || + opts::IndirectCallPromotion == ICP_ALL); + const bool OptimizeJumpTables = + (opts::IndirectCallPromotion == ICP_JUMP_TABLES || + opts::IndirectCallPromotion == ICP_ALL); + std::unique_ptr RA; std::unique_ptr CG; if (opts::IndirectCallPromotion >= ICP_JUMP_TABLES) { @@ -686,6 +1099,111 @@ void IndirectCallPromotion::runOnFunctions( RA.reset(new RegAnalysis(BC, BFs, *CG)); } + DEBUG_VERBOSE(2, { + for (auto &BFIt : BFs) { + auto &Function = BFIt.second; + const auto *MemData = Function.getMemData(); + bool DidPrintFunc = false; + uint64_t Offset = 0; + + if (!MemData || !Function.isSimple() || !opts::shouldProcess(Function)) + continue; + + for (auto &BB : Function) { + bool PrintBB = false; + for (auto &Inst : BB) { + if (auto Mem = + BC.MIA->tryGetAnnotationAs(Inst, "MemDataOffset")) { + for (auto &MI : MemData->getMemInfoRange(Mem.get())) { + if (MI.Addr.IsSymbol) { + PrintBB = true; + break; + } + if (auto Section = BC.getSectionForAddress(MI.Addr.Offset)) { + PrintBB = true; + break; + } + } + } + } + if (PrintBB && !DidPrintFunc) { + dbgs() << "\nNon-heap/stack memory data found in " + << Function << ":\n"; + DidPrintFunc = true; + } + Offset = BC.printInstructions(PrintBB ? dbgs() : nulls(), + BB.begin(), + BB.end(), + Offset, + &Function); + } + } + }); + + // If icp-top-callsites is enabled, compute the total number of indirect + // calls and then optimize the hottest callsites that contribute to that + // total. + if (opts::ICPTopCallsites > 0) { + using IndirectCallsite = std::pair; + std::vector IndirectCalls; + size_t TotalIndirectCalls = 0; + + // Find all the indirect callsites. 
+ for (auto &BFIt : BFs) { + auto &Function = BFIt.second; + + if (!Function.isSimple() || + !opts::shouldProcess(Function) || + !Function.getBranchData()) + continue; + + const bool HasLayout = !Function.layout_empty(); + + for (auto &BB : Function) { + if (HasLayout && Function.isSplit() && BB.isCold()) + continue; + + for (auto &Inst : BB) { + if ((BC.MIA->isIndirectCall(Inst) && OptimizeCalls) || + (Function.getJumpTable(Inst) && OptimizeJumpTables)) { + IndirectCalls.push_back(std::make_pair(&BB, &Inst)); + TotalIndirectCalls += BB.getKnownExecutionCount(); + } + } + } + } + + // Sort callsites by execution count. + std::sort(IndirectCalls.begin(), + IndirectCalls.end(), + [](const IndirectCallsite &A, const IndirectCallsite &B) { + const auto CountA = A.first->getKnownExecutionCount(); + const auto CountB = B.first->getKnownExecutionCount(); + return CountA > CountB; + }); + + // Find callsites that contribute to the top "opts::ICPTopCallsites"% + // number of calls. + const float TopPerc = opts::ICPTopCallsites / 100.0f; + int64_t MaxCalls = TotalIndirectCalls * TopPerc; + size_t Num = 0; + for (auto &IC : IndirectCalls) { + if (MaxCalls <= 0) + break; + MaxCalls -= IC.first->getKnownExecutionCount(); + ++Num; + } + outs() << "BOLT-INFO: ICP Total indirect calls = " << TotalIndirectCalls + << ", " << Num << " calls cover " << opts::ICPTopCallsites << "% " + << "of all indirect calls\n"; + + // Mark sites to optimize with "DoICP" annotation. 
+ for (size_t I = 0; I < Num; ++I) { + auto &Inst = *IndirectCalls[I].second; + BC.MIA->addAnnotation(BC.Ctx.get(), Inst, "DoICP", true); + } + } + for (auto &BFIt : BFs) { auto &Function = BFIt.second; @@ -728,12 +1246,6 @@ void IndirectCallPromotion::runOnFunctions( const bool HasBranchData = Function.getBranchData() && BC.MIA->hasAnnotation(Inst, "Offset"); const bool IsJumpTable = Function.getJumpTable(Inst); - const bool OptimizeCalls = - (opts::IndirectCallPromotion == ICP_CALLS || - opts::IndirectCallPromotion == ICP_ALL); - const bool OptimizeJumpTables = - (opts::IndirectCallPromotion == ICP_JUMP_TABLES || - opts::IndirectCallPromotion == ICP_ALL); if (!((HasBranchData && !IsJumpTable && OptimizeCalls) || (IsJumpTable && OptimizeJumpTables))) @@ -750,7 +1262,7 @@ void IndirectCallPromotion::runOnFunctions( else ++TotalIndirectCallsites; - const auto Targets = getCallTargets(Function, Inst); + auto Targets = getCallTargets(Function, Inst); // Compute the total number of calls from this particular callsite. uint64_t NumCalls = 0; @@ -764,15 +1276,18 @@ void IndirectCallPromotion::runOnFunctions( // If FLAGS regs is alive after this jmp site, do not try // promoting because we will clobber FLAGS. - if (IsJumpTable && (*Info.getLivenessAnalysis().getStateBefore( - Inst))[BC.MIA->getFlagsReg()]) { - if (opts::Verbosity >= 1) { - outs() << "BOLT-INFO: ICP failed in " << Function << " @ " - << InstIdx << " in " << BB->getName() - << ", calls = " << NumCalls - << ", cannot clobber flags reg.\n"; + if (IsJumpTable) { + auto State = Info.getLivenessAnalysis().getStateBefore(Inst); + if (!State || (State && (*State)[BC.MIA->getFlagsReg()])) { + if (opts::Verbosity >= 1) { + outs() << "BOLT-INFO: ICP failed in " << Function << " @ " + << InstIdx << " in " << BB->getName() + << ", calls = " << NumCalls + << (State ? ", cannot clobber flags reg.\n" + : ", no liveness data available.\n"); + } + continue; } - continue; } // Should this callsite be optimized? 
Return the number of targets @@ -788,9 +1303,17 @@ void IndirectCallPromotion::runOnFunctions( } // Find MCSymbols or absolute addresses for each call target. - const auto SymTargets = findCallTargetSymbols(BC, Targets, N); + MCInst *TargetFetchInst = nullptr; + const auto SymTargets = findCallTargetSymbols(BC, + Targets, + N, + Function, + BB, + Inst, + TargetFetchInst); // If we can't resolve any of the target symbols, punt on this callsite. + // TODO: can this ever happen? if (SymTargets.size() < N) { const auto LastTarget = SymTargets.size(); if (opts::Verbosity >= 1) { @@ -803,12 +1326,33 @@ void IndirectCallPromotion::runOnFunctions( continue; } + MethodInfoType MethodInfo; + + if (!IsJumpTable) { + MethodInfo = maybeGetVtableAddrs(BC, + Function, + BB, + Inst, + SymTargets); + TotalMethodLoadsEliminated += MethodInfo.first.empty() ? 0 : 1; + DEBUG(dbgs() << "BOLT-INFO: ICP " + << (!MethodInfo.first.empty() ? "found" : "did not find") + << " vtables for all methods.\n"); + } else if (TargetFetchInst) { + ++TotalIndexBasedJumps; + MethodInfo.second.push_back(TargetFetchInst); + } + // Generate new promoted call code for this callsite. auto ICPcode = (IsJumpTable && !opts::ICPJumpTablesByTarget) - ? BC.MIA->jumpTablePromotion(Inst, SymTargets, BC.Ctx.get()) + ? BC.MIA->jumpTablePromotion(Inst, + SymTargets, + MethodInfo.second, + BC.Ctx.get()) : BC.MIA->indirectCallPromotion( - Inst, SymTargets, opts::ICPOldCodeSequence, BC.Ctx.get()); + Inst, SymTargets, MethodInfo.first, MethodInfo.second, + opts::ICPOldCodeSequence, BC.Ctx.get()); if (ICPcode.empty()) { if (opts::Verbosity >= 1) { @@ -836,7 +1380,12 @@ void IndirectCallPromotion::runOnFunctions( }); // Rewrite the CFG with the newly generated ICP code. - auto NewBBs = rewriteCall(BC, Function, BB, Inst, std::move(ICPcode)); + auto NewBBs = rewriteCall(BC, + Function, + BB, + Inst, + std::move(ICPcode), + MethodInfo.second); // Fix the CFG after inserting the new basic blocks. 
auto MergeBlock = fixCFG(BC, Function, BB, IsTailCall, IsJumpTable, @@ -889,14 +1438,31 @@ void IndirectCallPromotion::runOnFunctions( << format("%.1f", (100.0 * TotalOptimizedIndirectCallsites) / std::max(TotalIndirectCallsites, 1ul)) << "%\n" + << "BOLT-INFO: ICP number of method load elimination candidates = " + << TotalMethodLoadEliminationCandidates + << "\n" + << "BOLT-INFO: ICP percentage of method calls candidates that have " + "loads eliminated = " + << format("%.1f", (100.0 * TotalMethodLoadsEliminated) / + std::max(TotalMethodLoadEliminationCandidates, 1ul)) + << "%\n" << "BOLT-INFO: ICP percentage of indirect branches that are " "optimized = " << format("%.1f", (100.0 * TotalNumFrequentJmps) / std::max(TotalIndirectJmps, 1ul)) << "%\n" - << "BOLT-INFO: ICP percentage of jump table callsites that are optimized = " + << "BOLT-INFO: ICP percentage of jump table callsites that are " + << "optimized = " << format("%.1f", (100.0 * TotalOptimizedJumpTableCallsites) / std::max(TotalJumpTableCallsites, 1ul)) + << "%\n" + << "BOLT-INFO: ICP number of jump table callsites that can use hot " + << "indices = " << TotalIndexBasedCandidates + << "\n" + << "BOLT-INFO: ICP percentage of jump table callsites that use hot " + "indices = " + << format("%.1f", (100.0 * TotalIndexBasedJumps) / + std::max(TotalIndexBasedCandidates, 1ul)) << "%\n"; } diff --git a/bolt/Passes/IndirectCallPromotion.h b/bolt/Passes/IndirectCallPromotion.h index 43dc6183f00d..cd49933fbe30 100644 --- a/bolt/Passes/IndirectCallPromotion.h +++ b/bolt/Passes/IndirectCallPromotion.h @@ -99,6 +99,9 @@ namespace bolt { /// class IndirectCallPromotion : public BinaryFunctionPass { using BasicBlocksVector = std::vector>; + using MethodInfoType = std::pair, std::vector>; + using JumpTableInfoType = std::vector>; + using SymTargetsType = std::vector>; struct Location { bool IsSymbol{false}; MCSymbol *Sym{nullptr}; @@ -153,6 +156,12 @@ class IndirectCallPromotion : public BinaryFunctionPass { // (a fraction 
of TotalIndirectCallsites) uint64_t TotalOptimizedIndirectCallsites{0}; + // Total number of method callsites that can have loads eliminated. + mutable uint64_t TotalMethodLoadEliminationCandidates{0}; + + // Total number of method callsites that had loads eliminated. + uint64_t TotalMethodLoadsEliminated{0}; + // Total number of jump table callsites that are optimized by ICP. uint64_t TotalOptimizedJumpTableCallsites{0}; @@ -164,6 +173,12 @@ class IndirectCallPromotion : public BinaryFunctionPass { // (a fraction of TotalCalls) uint64_t TotalNumFrequentJmps{0}; + // Total number of jump table sites that can use hot indices. + mutable uint64_t TotalIndexBasedCandidates{0}; + + // Total number of jump table sites that use hot indices. + uint64_t TotalIndexBasedJumps{0}; + std::vector getCallTargets(BinaryFunction &BF, const MCInst &Inst) const; @@ -178,17 +193,36 @@ class IndirectCallPromotion : public BinaryFunctionPass { const size_t N, uint64_t NumCalls) const; - std::vector> - findCallTargetSymbols(BinaryContext &BC, - const std::vector &Targets, - const size_t N) const; + JumpTableInfoType + maybeGetHotJumpTableTargets(BinaryContext &BC, + BinaryFunction &Function, + BinaryBasicBlock *BB, + MCInst &Inst, + MCInst *&TargetFetchInst, + const BinaryFunction::JumpTable *JT, + const std::vector &Targets) const; + + SymTargetsType findCallTargetSymbols(BinaryContext &BC, + std::vector &Targets, + const size_t N, + BinaryFunction &Function, + BinaryBasicBlock *BB, + MCInst &Inst, + MCInst *&TargetFetchInst) const; + + MethodInfoType maybeGetVtableAddrs(BinaryContext &BC, + BinaryFunction &Function, + BinaryBasicBlock *BB, + MCInst &Inst, + const SymTargetsType &SymTargets) const; std::vector> rewriteCall(BinaryContext &BC, BinaryFunction &Function, BinaryBasicBlock *IndCallBlock, const MCInst &CallInst, - MCInstrAnalysis::ICPdata &&ICPcode) const; + MCInstrAnalysis::ICPdata &&ICPcode, + const std::vector &MethodFetchInsns) const; BinaryBasicBlock *fixCFG(BinaryContext 
&BC, BinaryFunction &Function, diff --git a/bolt/RewriteInstance.cpp b/bolt/RewriteInstance.cpp index 8d67ac160443..fd6c148702da 100644 --- a/bolt/RewriteInstance.cpp +++ b/bolt/RewriteInstance.cpp @@ -319,8 +319,12 @@ IgnoreBuildID("ignore-build-id", // Check against lists of functions from options if we should // optimize the function with a given name. bool shouldProcess(const BinaryFunction &Function) { - if (opts::MaxFunctions && Function.getFunctionNumber() > opts::MaxFunctions) - return false; + if (opts::MaxFunctions && Function.getFunctionNumber() >= opts::MaxFunctions) { + if (Function.getFunctionNumber() == opts::MaxFunctions) + dbgs() << "BOLT-INFO: processing ending on " << Function << "\n"; + else + return false; + } auto populateFunctionNames = [](cl::opt &FunctionNamesFile, cl::list &FunctionNames) { @@ -400,21 +404,22 @@ const std::string RewriteInstance::BOLTSecPrefix = ".bolt"; namespace llvm { namespace bolt { extern const char *BoltRevision; -} -} -static void report_error(StringRef Message, std::error_code EC) { +void report_error(StringRef Message, std::error_code EC) { assert(EC); errs() << "BOLT-ERROR: '" << Message << "': " << EC.message() << ".\n"; exit(1); } -static void check_error(std::error_code EC, StringRef Message) { +void check_error(std::error_code EC, StringRef Message) { if (!EC) return; report_error(Message, EC); } +} +} + uint8_t *ExecutableFileMemoryManager::allocateSection(intptr_t Size, unsigned Alignment, unsigned SectionID, @@ -1900,12 +1905,15 @@ void RewriteInstance::readProfileData() { for (auto &BFI : BinaryFunctions) { auto &Function = BFI.second; - auto *FuncData = BC->DR.getFuncBranchData(Function.getNames()); - if (!FuncData) - continue; - Function.BranchData = FuncData; - Function.ExecutionCount = FuncData->ExecutionCount; - FuncData->Used = true; + if (auto *MemData = BC->DR.getFuncMemData(Function.getNames())) { + Function.MemData = MemData; + MemData->Used = true; + } + if (auto *FuncData = 
BC->DR.getFuncBranchData(Function.getNames())) { + Function.BranchData = FuncData; + Function.ExecutionCount = FuncData->ExecutionCount; + FuncData->Used = true; + } } } @@ -1923,12 +1931,9 @@ void RewriteInstance::disassembleFunctions() { continue; } - SectionRef Section = Function.getSection(); - assert(Section.getAddress() <= Function.getAddress() && - Section.getAddress() + Section.getSize() - >= Function.getAddress() + Function.getSize() && - "wrong section for function"); - if (!Section.isText() || Section.isVirtual() || !Section.getSize()) { + auto FunctionData = BC->getFunctionData(Function); + + if (!FunctionData) { // When could it happen? errs() << "BOLT-ERROR: corresponding section is non-executable or " << "empty for function " << Function << '\n'; @@ -1941,26 +1946,12 @@ void RewriteInstance::disassembleFunctions() { continue; } - StringRef SectionContents; - check_error(Section.getContents(SectionContents), - "cannot get section contents"); - - assert(SectionContents.size() == Section.getSize() && - "section size mismatch"); - - // Function offset from the section start. - auto FunctionOffset = Function.getAddress() - Section.getAddress(); - // Offset of the function in the file. - Function.setFileOffset( - SectionContents.data() - InputFile->getData().data() + FunctionOffset); - - ArrayRef FunctionData( - reinterpret_cast - (SectionContents.data()) + FunctionOffset, - Function.getSize()); + auto *FileBegin = + reinterpret_cast(InputFile->getData().data()); + Function.setFileOffset(FunctionData->begin() - FileBegin); - Function.disassemble(FunctionData); + Function.disassemble(*FunctionData); if (!Function.isSimple() && opts::Relocs) { errs() << "BOLT-ERROR: function " << Function << " cannot be properly "