From 4c60993406bb21af421c96e3ea5c53e58464981e Mon Sep 17 00:00:00 2001 From: Akira Saitoh Date: Thu, 7 Sep 2023 11:51:00 +0900 Subject: [PATCH] AArch64: Use lastITable cache for interface call dispatching This commit extends the existing interface call dispatching code of AArch64 codegen to exploit cached lastITable field in J9Class. buildInterfaceCall function is updated to generate the code for lastITable cache lookup after the code for checking the second PIC slot. Until the second slot is populated, lastITable cache lookup always fails to avoid filling PIC slots with infrequent call targets. See OpenJ9 #8390 for more details. After the second slot is filled, the lastITable cache lookup is enabled by patching the b.al (branch always) instruction in the lookup code to b.ne (branch if not equal). Recompilation code is also updated to properly handle the new instruction sequence generated by updated buildInterfaceCall function. Resolves https://github.com/eclipse-openj9/openj9/issues/8400 Signed-off-by: Akira Saitoh --- .../aarch64/codegen/ARM64PrivateLinkage.cpp | 346 +++++++++++++++--- .../compiler/aarch64/codegen/CallSnippet.cpp | 1 + .../compiler/aarch64/codegen/CallSnippet.hpp | 14 +- .../compiler/aarch64/runtime/PicBuilder.spp | 31 +- runtime/compiler/runtime/Trampoline.cpp | 127 +++++-- 5 files changed, 417 insertions(+), 102 deletions(-) diff --git a/runtime/compiler/aarch64/codegen/ARM64PrivateLinkage.cpp b/runtime/compiler/aarch64/codegen/ARM64PrivateLinkage.cpp index 5d2a02dd290..f19e95eb9d2 100644 --- a/runtime/compiler/aarch64/codegen/ARM64PrivateLinkage.cpp +++ b/runtime/compiler/aarch64/codegen/ARM64PrivateLinkage.cpp @@ -1665,51 +1665,141 @@ static void buildVirtualCall(TR::CodeGenerator *cg, TR::Node *callNode, TR::Regi /** * @brief Generates instruction sequence for interface call * - * @param[in] cg: code generator - * @param[in] callNode: node for the interface call - * @param[in] vftReg: vft register - * @param[in] tmpReg: temporary register - 
* @param[in] tmp2Reg: temporary register - * @param[in] ifcSnippet: interface call snippet - * @param[in] regMapForGC: register map for GC + * @param[in] cg: code generator + * @param[in] callNode: node for the interface call + * @param[in] vftReg: vft register + * @param[in] x9Reg: temporary register + * @param[in] x10Reg: temporary register + * @param[in] x11Reg: temporary register + * @param[in] useLastITableCache: if true, use last iTable cache + * @param[in] ifcSnippet: interface call snippet + * @param[in] regMapForGC: register map for GC */ -static void buildInterfaceCall(TR::CodeGenerator *cg, TR::Node *callNode, TR::Register *vftReg, TR::Register *tmpReg, TR::Register *tmp2Reg, TR::ARM64InterfaceCallSnippet *ifcSnippet, uint32_t regMapForGC) +static void buildInterfaceCall(TR::CodeGenerator *cg, TR::Node *callNode, TR::Register *vftReg, TR::Register *x9Reg, TR::Register *x10Reg, TR::Register *x11Reg, + bool useLastITableCache, TR::ARM64InterfaceCallSnippet *ifcSnippet, uint32_t regMapForGC) { /* * Generating following instruction sequence. * Recompilation is dependent on this instruction sequence. * Please do not modify without changing recompilation code. 
* - * ldrx tmpReg, L_firstClassCacheSlot - * cmpx vftReg, tmpReg - * ldrx tmpReg, L_firstBranchAddressCacheSlot - * beq hitLabel - * ldrx tmpReg, L_secondClassCacheSlot - * cmpx vftReg, tmpReg - * bne snippetLabel - * ldrx tmpReg, L_secondBranchAddressCacheSlot - * hitLabel: - * blr tmpReg - * doneLabel: + * if useLastITableCache is false: + * if debug counters are disabled: + * ldrx x10Reg, L_firstClassCacheSlot + * cmpx vftReg, x10Reg + * ldrx x10Reg, L_firstBranchAddressCacheSlot + * beq hitLabel + * ldrx x10Reg, L_secondClassCacheSlot + * cmpx vftReg, x10Reg + * bne snippetLabel + * ldrx x10Reg, L_secondBranchAddressCacheSlot + * hitLabel: + * blr x10Reg + * doneLabel: + * + * if debug counters are enabled: + * ldrx x10Reg, L_firstClassCacheSlot + * cmpx vftReg, x10Reg + * bne slot1MissedLabel + * ; debug counter, trashes x9 and x11Reg + * ldrx x10Reg, L_firstBranchAddressCacheSlot + * b hitLabel + * slot1MissedLabel: + * beq hitLabel + * ldrx x10Reg, L_secondClassCacheSlot + * cmpx vftReg, x10Reg + * bne slot2MissedLabel + * ; debug counter, trashes x9 and x11Reg + * b slot2DoneLabel + * slot2MissedLabel: + * ; debug counter, trashes x9 and x11Reg + * slot2DoneLabel: + * cmpx vftReg, x10Reg + * bne snippetLabel + * ldrx x10Reg, L_secondBranchAddressCacheSlot + * hitLabel: + * blr x10Reg + * doneLabel: + * + * if useLastITableCache is true: + * if debug counters are disabled: + * ldrx x10Reg, L_firstClassCacheSlot + * cmpx vftReg, x10Reg + * ldrx x10Reg, L_firstBranchAddressCacheSlot + * beq hitLabel + * ldrx x10Reg, L_secondClassCacheSlot + * cmpx vftReg, x10Reg + * ldrx x10Reg, L_secondBranchAddressCacheSlot + * beq hitLabel + * ldr x10Reg, [vftReg, lastITableOffset] ; cached iTable + * ldrx x9, L_interfaceClassSlot ; actual interfaceClass + * ldr x11Reg, [x10Reg, interfaceClassOffset]; interfaceClass in lastITable + * cmpx x9, x11Reg + * bal snippetLabel ; will be patched to bne + * mov w9, sizeof(J9Class) + * ldr x11Reg, [x10Reg, iTableOffset] ; load 
vTableOffset + * sub x9, x9, x11Reg ; icallVMprJavaSendPatchupVirtual expects x9 to hold vTable index + * ldr x10Reg, [vftReg, x9] + * hitLabel: + * blr x10Reg + * doneLabel: + * + * if debug counters are enabled: + * ldrx x10Reg, L_firstClassCacheSlot + * cmpx vftReg, x10Reg + * bne slot1MissedLabel + * ; debug counter, trashes x9 and x11Reg + * ldrx x10Reg, L_firstBranchAddressCacheSlot + * b hitLabel + * slot1MissedLabel: + * ldrx x10Reg, L_secondClassCacheSlot + * cmpx vftReg, x10Reg + * bne slot2DoneLabel + * ; debug counter, trashes x9 and x11Reg + * slot2DoneLabel: + * cmpx vftReg, x10Reg + * ldrx x10Reg, L_secondBranchAddressCacheSlot + * beq hitLabel + * ldr x10Reg, [vftReg, lastITableOffset] ; cached iTable + * ldrx x9, L_interfaceClassSlot ; actual interfaceClass + * ldr x11Reg, [x10Reg, interfaceClassOffset]; interfaceClass in lastITable + * cmpx x9, x11Reg + * bne lastITableMissedLabel + * ; debug counter, trashes x9 and x11Reg + * cmpx x9, x9 ; to set Z flag + * b lastITableDoneLabel + * lastITableMissedLabel: + * ; debug counter, trashes x9 and x11Reg + * cmp x10Reg, #0 ; to unset Z flag + * lastITableDoneLabel: + * bal snippetLabel ; will be patched to bne + * mov w9, sizeof(J9Class) + * ldr x11Reg, [x10Reg, iTableOffset] ; load vTableOffset + * sub x9, x9, x11Reg ; icallVMprJavaSendPatchupVirtual expects x9 to hold vTable index + * ldr x10Reg, [vftReg, x9] + * hitLabel: + * blr x10Reg + * doneLabel: */ TR::LabelSymbol *ifcSnippetLabel = ifcSnippet->getSnippetLabel(); TR::LabelSymbol *firstClassCacheSlotLabel = ifcSnippet->getFirstClassCacheSlotLabel(); - generateTrg1ImmSymInstruction(cg, TR::InstOpCode::ldrx, callNode, tmpReg, 0, firstClassCacheSlotLabel); + generateTrg1ImmSymInstruction(cg, TR::InstOpCode::ldrx, callNode, x10Reg, 0, firstClassCacheSlotLabel); TR::LabelSymbol *hitLabel = generateLabelSymbol(cg); - generateCompareInstruction(cg, callNode, vftReg, tmpReg, true); + generateCompareInstruction(cg, callNode, vftReg, x10Reg, true); 
TR::LabelSymbol *firstBranchAddressCacheSlotLabel = ifcSnippet->getFirstBranchAddressCacheSlotLabel(); TR::Compilation *comp = cg->comp(); TR_Debug *debugObj = cg->getDebug(); TR_ARM64ScratchRegisterManager *srm = NULL; bool isDebugCounterGenerated = false; - if (comp->getOptions()->enableDebugCounters()) + const bool enableDebugCounters = comp->getOptions()->enableDebugCounters(); + if (enableDebugCounters) { srm = cg->generateScratchRegisterManager(2); - srm->donateScratchRegister(tmpReg); - srm->donateScratchRegister(tmp2Reg); + srm->donateScratchRegister(x9Reg); + srm->donateScratchRegister(x11Reg); TR::Instruction *prevCursor = cg->getAppendInstruction(); /* Record if slot 1 hit */ TR::Instruction *cursor = cg->generateDebugCounter(TR::DebugCounter::debugCounterName(comp, "cg.callInterface/(%s)/%s/%d/%d/dynamicPIC/slot1", @@ -1723,7 +1813,7 @@ static void buildInterfaceCall(TR::CodeGenerator *cg, TR::Node *callNode, TR::Re /* Debug counter was generated. Generating instructions before debug counter instructions. 
*/ TR::LabelSymbol *slot1MissedLabel = generateLabelSymbol(cg); TR::Instruction *branchToSlot1MissedLabelInstr = generateConditionalBranchInstruction(cg, TR::InstOpCode::b_cond, callNode, slot1MissedLabel, TR::CC_NE, prevCursor); - generateTrg1ImmSymInstruction(cg, TR::InstOpCode::ldrx, callNode, tmpReg, 0, firstBranchAddressCacheSlotLabel); + generateTrg1ImmSymInstruction(cg, TR::InstOpCode::ldrx, callNode, x10Reg, 0, firstBranchAddressCacheSlotLabel); TR::Instruction *branchToHitLabelInstr = generateLabelInstruction(cg, TR::InstOpCode::b, callNode, hitLabel); TR::Instruction *slot1MissedLabelInstr = generateLabelInstruction(cg, TR::InstOpCode::label, callNode, slot1MissedLabel); if (debugObj) @@ -1736,7 +1826,7 @@ static void buildInterfaceCall(TR::CodeGenerator *cg, TR::Node *callNode, TR::Re } if (!isDebugCounterGenerated) { - generateTrg1ImmSymInstruction(cg, TR::InstOpCode::ldrx, callNode, tmpReg, 0, firstBranchAddressCacheSlotLabel); + generateTrg1ImmSymInstruction(cg, TR::InstOpCode::ldrx, callNode, x10Reg, 0, firstBranchAddressCacheSlotLabel); TR::Instruction *branchToHitLabelInstr = generateConditionalBranchInstruction(cg, TR::InstOpCode::b_cond, callNode, hitLabel, TR::CC_EQ); if (debugObj) { @@ -1746,12 +1836,10 @@ static void buildInterfaceCall(TR::CodeGenerator *cg, TR::Node *callNode, TR::Re TR::LabelSymbol *secondClassCacheSlotLabel = ifcSnippet->getSecondClassCacheSlotLabel(); - generateTrg1ImmSymInstruction(cg, TR::InstOpCode::ldrx, callNode, tmpReg, 0, secondClassCacheSlotLabel); + generateTrg1ImmSymInstruction(cg, TR::InstOpCode::ldrx, callNode, x10Reg, 0, secondClassCacheSlotLabel); - if (comp->getOptions()->enableDebugCounters()) + if (enableDebugCounters) { - TR::LabelSymbol *slot2MissedLabel = generateLabelSymbol(cg); - TR::LabelSymbol *slot2DoneLabel = generateLabelSymbol(cg); TR::Instruction *prevCursor1 = cg->getAppendInstruction(); /* Record if slot 2 hit */ TR::Instruction *cursor1 = 
cg->generateDebugCounter(TR::DebugCounter::debugCounterName(comp, "cg.callInterface/(%s)/%s/%d/%d/dynamicPIC/slot2", @@ -1759,51 +1847,158 @@ static void buildInterfaceCall(TR::CodeGenerator *cg, TR::Node *callNode, TR::Re comp->getHotnessName(), callNode->getByteCodeInfo().getCallerIndex(), callNode->getByteCodeInfo().getByteCodeIndex()), *srm); - TR::Instruction *prevCursor2 = cg->getAppendInstruction(); - /* Record if slot 2 missed */ - TR::Instruction *cursor2 = cg->generateDebugCounter(TR::DebugCounter::debugCounterName(comp, "cg.callInterface/(%s)/%s/%d/%d/dynamicPIC/cachemiss", + + if (!useLastITableCache) + { + TR::Instruction *prevCursor2 = cg->getAppendInstruction(); + /* Record if slot 2 missed */ + TR::Instruction *cursor2 = cg->generateDebugCounter(TR::DebugCounter::debugCounterName(comp, "cg.callInterface/(%s)/%s/%d/%d/dynamicPIC/cachemiss", comp->signature(), comp->getHotnessName(), callNode->getByteCodeInfo().getCallerIndex(), callNode->getByteCodeInfo().getByteCodeIndex()), *srm); - if ((prevCursor1 != cursor1) || (prevCursor2 != cursor2)) + if ((prevCursor1 != cursor1) || (prevCursor2 != cursor2)) + { + TR::LabelSymbol *slot2MissedLabel = generateLabelSymbol(cg); + TR::LabelSymbol *slot2DoneLabel = generateLabelSymbol(cg); + /* Debug counter was generated. Generating instructions before debug counter instructions recording hit for second cache slot. */ + TR::Instruction *cursor = generateCompareInstruction(cg, callNode, vftReg, x10Reg, true, prevCursor1); + TR::Instruction *branchToSlot2MissedLabelInstr = generateConditionalBranchInstruction(cg, TR::InstOpCode::b_cond, callNode, slot2MissedLabel, TR::CC_NE, cursor); + + /* Generating instructions before debug counter instructions recording cache miss. 
*/ + cursor = generateLabelInstruction(cg, TR::InstOpCode::b, callNode, slot2DoneLabel, prevCursor2); + TR::Instruction *slot2MissedLabelInstr = generateLabelInstruction(cg, TR::InstOpCode::label, callNode, slot2MissedLabel, cursor); + + /* Generating instructions after debug counter instructions. */ + TR::Instruction *slot2DoneLabelInstr = generateLabelInstruction(cg, TR::InstOpCode::label, callNode, slot2DoneLabel); + if (debugObj) + { + debugObj->addInstructionComment(branchToSlot2MissedLabelInstr, "Jumps to slot2MissedLabel"); + debugObj->addInstructionComment(cursor, "Jumps to slot2DoneLabel"); + debugObj->addInstructionComment(slot2MissedLabelInstr, "slot2MissedLabel"); + debugObj->addInstructionComment(slot2DoneLabelInstr, "slot2DoneLabel"); + } + } + } + else { - /* Debug counter was generated. Generating instructions before debug counter instructions recording hit for second cache slot. */ - TR::Instruction *cursor = generateCompareInstruction(cg, callNode, vftReg, tmpReg, true, prevCursor1); - TR::Instruction *branchToSlot2MissedLabelInstr = generateConditionalBranchInstruction(cg, TR::InstOpCode::b_cond, callNode, slot2MissedLabel, TR::CC_NE, cursor); + if (prevCursor1 != cursor1) + { + TR::LabelSymbol *slot2DoneLabel = generateLabelSymbol(cg); + /* Debug counter was generated. Generating instructions before debug counter instructions recording hit for second cache slot. */ + TR::Instruction *cursor = generateCompareInstruction(cg, callNode, vftReg, x10Reg, true, prevCursor1); + TR::Instruction *branchToSlot2DoneLabelInstr = generateConditionalBranchInstruction(cg, TR::InstOpCode::b_cond, callNode, slot2DoneLabel, TR::CC_NE, cursor); + /* Generating instructions after debug counter instructions. 
*/ + TR::Instruction *slot2DoneLabelInstr = generateLabelInstruction(cg, TR::InstOpCode::label, callNode, slot2DoneLabel); + if (debugObj) + { + debugObj->addInstructionComment(branchToSlot2DoneLabelInstr, "Jumps to slot2DoneLabel"); + debugObj->addInstructionComment(slot2DoneLabelInstr, "slot2DoneLabel"); + } + } + } + } - /* Generating instructions before debug counter instructions recording cache miss. */ - cursor = generateLabelInstruction(cg, TR::InstOpCode::b, callNode, slot2DoneLabel, prevCursor2); - TR::Instruction *slot2MissedLabelInstr = generateLabelInstruction(cg, TR::InstOpCode::label, callNode, slot2MissedLabel, cursor); + generateCompareInstruction(cg, callNode, vftReg, x10Reg, true); - /* Generating instructions after debug counter instructions. */ - TR::Instruction *slot2DoneLabelInstr = generateLabelInstruction(cg, TR::InstOpCode::label, callNode, slot2DoneLabel); - generateTrg1ImmSymInstruction(cg, TR::InstOpCode::ldrx, callNode, tmpReg, 0, secondClassCacheSlotLabel); - if (debugObj) + TR::SymbolReference *methodSymRef = callNode->getSymbolReference(); + TR_ResolvedMethod *owningMethod = methodSymRef->getOwningMethod(comp); + uintptr_t itableIndex; + TR_OpaqueClassBlock *interfaceClassOfMethod = owningMethod->getResolvedInterfaceMethod(methodSymRef->getCPIndex(), &itableIndex); + + TR::Instruction *gcPoint; + if (useLastITableCache && (interfaceClassOfMethod != NULL)) + { + TR_J9VMBase *fej9 = cg->fej9(); + + TR::LabelSymbol *secondBranchAddressCacheSlotLabel = ifcSnippet->getSecondBranchAddressCacheSlotLabel(); + generateTrg1ImmSymInstruction(cg, TR::InstOpCode::ldrx, callNode, x10Reg, 0, secondBranchAddressCacheSlotLabel); + generateConditionalBranchInstruction(cg, TR::InstOpCode::b_cond, callNode, hitLabel, TR::CC_EQ); + + generateTrg1MemInstruction(cg, TR::InstOpCode::ldrimmx, callNode, x10Reg, TR::MemoryReference::createWithDisplacement(cg, vftReg, fej9->getOffsetOfLastITableFromClassField())); + TR::LabelSymbol *interfacedClassSlotLabel = 
ifcSnippet->getInterfaceClassSlotLabel(); + generateTrg1ImmSymInstruction(cg, TR::InstOpCode::ldrx, callNode, x9Reg, 0, interfacedClassSlotLabel); + generateTrg1MemInstruction(cg, TR::InstOpCode::ldrimmx, callNode, x11Reg, TR::MemoryReference::createWithDisplacement(cg, x10Reg, fej9->getOffsetOfInterfaceClassFromITableField())); + generateCompareInstruction(cg, callNode, x9Reg, x11Reg, true); + + if (enableDebugCounters) + { + TR::Instruction *prevCursor1 = cg->getAppendInstruction(); + /* Record if lastITable cache hit */ + TR::Instruction *cursor1 = cg->generateDebugCounter(TR::DebugCounter::debugCounterName(comp, "cg.callInterface/(%s)/%s/%d/%d/dynamicPIC/lastITable", + comp->signature(), + comp->getHotnessName(), + callNode->getByteCodeInfo().getCallerIndex(), + callNode->getByteCodeInfo().getByteCodeIndex()), *srm); + TR::Instruction *prevCursor2 = cg->getAppendInstruction(); + /* Record if lastITable cache missed */ + TR::Instruction *cursor2 = cg->generateDebugCounter(TR::DebugCounter::debugCounterName(comp, "cg.callInterface/(%s)/%s/%d/%d/dynamicPIC/cachemiss", + comp->signature(), + comp->getHotnessName(), + callNode->getByteCodeInfo().getCallerIndex(), + callNode->getByteCodeInfo().getByteCodeIndex()), *srm); + if ((prevCursor1 != cursor1) || (prevCursor2 != cursor2)) { - debugObj->addInstructionComment(branchToSlot2MissedLabelInstr, "Jumps to slot2MissedLabel"); - debugObj->addInstructionComment(cursor, "Jumps to slot2DoneLabel"); - debugObj->addInstructionComment(slot2MissedLabelInstr, "slot2MissedLabel"); - debugObj->addInstructionComment(slot2DoneLabelInstr, "slot2DoneLabel"); + TR::LabelSymbol *lastITableMissedLabel = generateLabelSymbol(cg); + TR::LabelSymbol *lastITableDoneLabel = generateLabelSymbol(cg); + /* Debug counter was generated. Generating instructions before debug counter instructions recording hit for lastITable cache. 
*/ + TR::Instruction *branchToLastITableMissedLabelInstr = generateConditionalBranchInstruction(cg, TR::InstOpCode::b_cond, callNode, lastITableMissedLabel, TR::CC_NE, prevCursor1); + + /* Generating instructions before debug counter instructions recording cache miss. */ + TR::Instruction *cmpInstr1 = generateCompareInstruction(cg, callNode, x9Reg, x9Reg, true, prevCursor2); /* to set Z flag */ + TR::Instruction *cursor = generateLabelInstruction(cg, TR::InstOpCode::b, callNode, lastITableDoneLabel, cmpInstr1); + TR::Instruction *lastITableMissedLabelInstr = generateLabelInstruction(cg, TR::InstOpCode::label, callNode, lastITableMissedLabel, cursor); + + /* Generating instructions after debug counter instructions. */ + generateCompareImmInstruction(cg, callNode, x10Reg, 0, true); /* to unset Z flag */ + TR::Instruction *lastITableDoneLabelInstr = generateLabelInstruction(cg, TR::InstOpCode::label, callNode, lastITableDoneLabel); + if (debugObj) + { + debugObj->addInstructionComment(branchToLastITableMissedLabelInstr, "Jumps to lastITableMissedLabel"); + debugObj->addInstructionComment(cursor, "Jumps to lastITableDoneLabel"); + debugObj->addInstructionComment(lastITableMissedLabelInstr, "lastITable2MissedLabel"); + debugObj->addInstructionComment(lastITableDoneLabelInstr, "lastITableDoneLabel"); + } } } - srm->stopUsingRegisters(); + + /* This conditional branch instruction with "always" condition code will be patched to b.ne instruction after second cache slot is filled. */ + gcPoint = generateConditionalBranchInstruction(cg, TR::InstOpCode::b_cond, callNode, ifcSnippetLabel, TR::CC_AL); + loadConstant32(cg, callNode, fej9->getITableEntryJitVTableOffset(), x9Reg); + generateTrg1MemInstruction(cg, TR::InstOpCode::ldrimmx, callNode, x11Reg, TR::MemoryReference::createWithDisplacement(cg, x10Reg, fej9->convertITableIndexToOffset(itableIndex))); + /* PicBuilder.spp checks this instruction. It needs to be 'sub x9, x9, x11'. 
*/ + generateTrg1Src2Instruction(cg, TR::InstOpCode::subx, callNode, x9Reg, x9Reg, x11Reg); + generateTrg1MemInstruction(cg, TR::InstOpCode::ldroffx, callNode, x10Reg, TR::MemoryReference::createWithIndexReg(cg, vftReg, x9Reg)); + if (debugObj) + { + debugObj->addInstructionComment(gcPoint, "Jumps to Snippet. Will be patched to b.ne"); + } } + else + { + gcPoint = generateConditionalBranchInstruction(cg, TR::InstOpCode::b_cond, callNode, ifcSnippetLabel, TR::CC_NE); + TR::LabelSymbol *secondBranchAddressCacheSlotLabel = ifcSnippet->getSecondBranchAddressCacheSlotLabel(); - generateCompareInstruction(cg, callNode, vftReg, tmpReg, true); - TR::Instruction *gcPoint = generateConditionalBranchInstruction(cg, TR::InstOpCode::b_cond, callNode, ifcSnippetLabel, TR::CC_NE); + generateTrg1ImmSymInstruction(cg, TR::InstOpCode::ldrx, callNode, x10Reg, 0, secondBranchAddressCacheSlotLabel); + if (debugObj) + { + debugObj->addInstructionComment(gcPoint, "Jumps to snippet"); + } + } gcPoint->ARM64NeedsGCMap(cg, regMapForGC); - TR::LabelSymbol *secondBranchAddressCacheSlotLabel = ifcSnippet->getSecondBranchAddressCacheSlotLabel(); - generateTrg1ImmSymInstruction(cg, TR::InstOpCode::ldrx, callNode, tmpReg, 0, secondBranchAddressCacheSlotLabel); + if (enableDebugCounters) + { + srm->stopUsingRegisters(); + } TR::Instruction *hitLabelInstr = generateLabelInstruction(cg, TR::InstOpCode::label, callNode, hitLabel); if (debugObj) { - debugObj->addInstructionComment(gcPoint, "Jumps to snippet"); debugObj->addInstructionComment(hitLabelInstr, "hitLabel"); } - gcPoint = generateRegBranchInstruction(cg, TR::InstOpCode::blr, callNode, tmpReg); + gcPoint = generateRegBranchInstruction(cg, TR::InstOpCode::blr, callNode, x10Reg); gcPoint->ARM64NeedsGCMap(cg, regMapForGC); + } static TR::Register *evaluateUpToVftChild(TR::Node *callNode, TR::CodeGenerator *cg) @@ -1826,6 +2021,7 @@ void J9::ARM64::PrivateLinkage::buildVirtualDispatch(TR::Node *callNode, TR::Register *x0 = 
dependencies->searchPreConditionRegister(TR::RealRegister::x0); TR::Register *x9 = dependencies->searchPreConditionRegister(TR::RealRegister::x9); TR::Register *x10 = dependencies->searchPreConditionRegister(TR::RealRegister::x10); + TR::Register *x11 = dependencies->searchPreConditionRegister(TR::RealRegister::x11); TR::SymbolReference *methodSymRef = callNode->getSymbolReference(); TR::MethodSymbol *methodSymbol = methodSymRef->getSymbol()->castToMethodSymbol(); @@ -2045,6 +2241,8 @@ void J9::ARM64::PrivateLinkage::buildVirtualDispatch(TR::Node *callNode, } } + bool useLastITableCache = !comp()->getOption(TR_DisableLastITableCache); + // Profile-driven virtual and interface calls // // If the top value dominates everything else, generate a single static @@ -2066,6 +2264,32 @@ void J9::ARM64::PrivateLinkage::buildVirtualDispatch(TR::Node *callNode, ListIterator i(&values); J9::ARM64PICItem *pic = i.getFirst(); + if (useLastITableCache && methodSymbol->isInterface()) + { + // Find the class pointer to the interface class if it is already loaded. + // + TR::Method *interfaceMethod = methodSymbol->getMethod(); + int32_t len = interfaceMethod->classNameLength(); + char *s = TR::Compiler->cls.classNameToSignature(interfaceMethod->classNameChars(), len, comp()); + auto interfaceClassOfMethod = fej9->getClassFromSignature(s, len, methodSymRef->getOwningMethod(comp())); + int32_t numStaticPICSlots = (pic->_frequency > MAX_PROFILED_CALL_FREQUENCY) ? 
1 : values.getSize(); + + // Disable lastITable logic if all the implementers can fit into the pic slots during non-startup state + if (interfaceClassOfMethod && comp()->getPersistentInfo()->getJitState() != STARTUP_STATE) + { + int32_t numPICSlots = numStaticPICSlots + 2; + TR_ResolvedMethod **implArray = new (comp()->trStackMemory()) TR_ResolvedMethod *[numPICSlots+1]; + TR_PersistentCHTable *chTable = comp()->getPersistentInfo()->getPersistentCHTable(); + int32_t cpIndex = methodSymRef->getCPIndex(); + int32_t numImplementers = chTable->findnInterfaceImplementers(interfaceClassOfMethod, numPICSlots+1, implArray, cpIndex, methodSymRef->getOwningMethod(comp()), comp()); + if (numImplementers <= numPICSlots) + { + useLastITableCache = false; + if (comp()->getOption(TR_TraceCG)) + traceMsg(comp(),"Found %d implementers for call to %s, can be fit into %d pic slots, disabling lastITable cache\n", numImplementers, methodSymbol->getMethod()->signature(comp()->trMemory()), numPICSlots); + } + } + } // If this value is dominant, optimize exclusively for it if (pic->_frequency > MAX_PROFILED_CALL_FREQUENCY) { @@ -2100,11 +2324,13 @@ void J9::ARM64::PrivateLinkage::buildVirtualDispatch(TR::Node *callNode, TR::LabelSymbol *firstBranchAddressCacheSlotLabel = generateLabelSymbol(cg()); TR::LabelSymbol *secondClassCacheSlotLabel = generateLabelSymbol(cg()); TR::LabelSymbol *secondBranchAddressCacheSlotLabel = generateLabelSymbol(cg()); + TR::LabelSymbol *interfaceClassSlotLabel = generateLabelSymbol(cg()); TR::ARM64InterfaceCallSnippet *ifcSnippet = new (trHeapMemory()) TR::ARM64InterfaceCallSnippet(cg(), callNode, ifcSnippetLabel, argSize, doneOOLLabel, firstClassCacheSlotLabel, secondClassCacheSlotLabel, - firstBranchAddressCacheSlotLabel, secondBranchAddressCacheSlotLabel, static_cast(thunk)); + firstBranchAddressCacheSlotLabel, secondBranchAddressCacheSlotLabel, + interfaceClassSlotLabel, static_cast(thunk)); cg()->addSnippet(ifcSnippet); - buildInterfaceCall(cg(), 
callNode, vftReg, x9, x10, ifcSnippet, regMapForGC); + buildInterfaceCall(cg(), callNode, vftReg, x9, x10, x11, useLastITableCache, ifcSnippet, regMapForGC); } else { @@ -2153,12 +2379,14 @@ void J9::ARM64::PrivateLinkage::buildVirtualDispatch(TR::Node *callNode, TR::LabelSymbol *firstBranchAddressCacheSlotLabel = generateLabelSymbol(cg()); TR::LabelSymbol *secondClassCacheSlotLabel = generateLabelSymbol(cg()); TR::LabelSymbol *secondBranchAddressCacheSlotLabel = generateLabelSymbol(cg()); + TR::LabelSymbol *interfaceClassSlotLabel = generateLabelSymbol(cg()); TR::ARM64InterfaceCallSnippet *ifcSnippet = new (trHeapMemory()) - TR::ARM64InterfaceCallSnippet(cg(), callNode, ifcSnippetLabel, argSize, doneLabel, firstClassCacheSlotLabel, firstBranchAddressCacheSlotLabel, secondClassCacheSlotLabel, secondBranchAddressCacheSlotLabel, static_cast(thunk)); + TR::ARM64InterfaceCallSnippet(cg(), callNode, ifcSnippetLabel, argSize, doneLabel, firstClassCacheSlotLabel, firstBranchAddressCacheSlotLabel, + secondClassCacheSlotLabel, secondBranchAddressCacheSlotLabel, interfaceClassSlotLabel, static_cast(thunk)); cg()->addSnippet(ifcSnippet); - buildInterfaceCall(cg(), callNode, vftReg, x9, x10, ifcSnippet, regMapForGC); + buildInterfaceCall(cg(), callNode, vftReg, x9, x10, x11, useLastITableCache, ifcSnippet, regMapForGC); } else { diff --git a/runtime/compiler/aarch64/codegen/CallSnippet.cpp b/runtime/compiler/aarch64/codegen/CallSnippet.cpp index 0c780ad63c6..0d80e7dc854 100644 --- a/runtime/compiler/aarch64/codegen/CallSnippet.cpp +++ b/runtime/compiler/aarch64/codegen/CallSnippet.cpp @@ -811,6 +811,7 @@ uint8_t *TR::ARM64InterfaceCallSnippet::emitSnippetBody() cursor += sizeof(intptr_t); // 2 slots for resolved values (interface class and iTable index) + _interfaceClassSlotLabel->setCodeLocation(cursor); *reinterpret_cast(cursor) = 0; cursor += sizeof(intptr_t); *reinterpret_cast(cursor) = 0; diff --git a/runtime/compiler/aarch64/codegen/CallSnippet.hpp 
b/runtime/compiler/aarch64/codegen/CallSnippet.hpp index 33e3ee96d94..776e60a435b 100644 --- a/runtime/compiler/aarch64/codegen/CallSnippet.hpp +++ b/runtime/compiler/aarch64/codegen/CallSnippet.hpp @@ -134,23 +134,28 @@ class ARM64InterfaceCallSnippet : public TR::ARM64VirtualSnippet TR::LabelSymbol *_firstBranchAddressCacheSlotLabel; TR::LabelSymbol *_secondClassCacheSlotLabel; TR::LabelSymbol *_secondBranchAddressCacheSlotLabel; + TR::LabelSymbol *_interfaceClassSlotLabel; public: ARM64InterfaceCallSnippet(TR::CodeGenerator *cg, TR::Node *c, TR::LabelSymbol *lab, int32_t s, TR::LabelSymbol *retl, TR::LabelSymbol *firstClassCacheSlotLabel, TR::LabelSymbol *firstBranchAddressCacheSlotLabel, - TR::LabelSymbol *secondClassCacheSlotLabel, TR::LabelSymbol *secondBranchAddressCacheSlotLabel) + TR::LabelSymbol *secondClassCacheSlotLabel, TR::LabelSymbol *secondBranchAddressCacheSlotLabel, + TR::LabelSymbol *interfaceClassSlotLabel) : TR::ARM64VirtualSnippet(cg, c, lab, s, retl), thunkAddress(NULL), _firstClassCacheSlotLabel(firstClassCacheSlotLabel), _firstBranchAddressCacheSlotLabel(firstBranchAddressCacheSlotLabel), - _secondClassCacheSlotLabel(secondClassCacheSlotLabel), _secondBranchAddressCacheSlotLabel(secondBranchAddressCacheSlotLabel) + _secondClassCacheSlotLabel(secondClassCacheSlotLabel), _secondBranchAddressCacheSlotLabel(secondBranchAddressCacheSlotLabel), + _interfaceClassSlotLabel(interfaceClassSlotLabel) { } ARM64InterfaceCallSnippet(TR::CodeGenerator *cg, TR::Node *c, TR::LabelSymbol *lab, int32_t s, TR::LabelSymbol *retl, TR::LabelSymbol *firstClassCacheSlotLabel, TR::LabelSymbol *firstBranchAddressCacheSlotLabel, - TR::LabelSymbol *secondClassCacheSlotLabel, TR::LabelSymbol *secondBranchAddressCacheSlotLabel, uint8_t *thunkPtr) + TR::LabelSymbol *secondClassCacheSlotLabel, TR::LabelSymbol *secondBranchAddressCacheSlotLabel, + TR::LabelSymbol *interfaceClassSlotLabel, uint8_t *thunkPtr) : TR::ARM64VirtualSnippet(cg, c, lab, s, retl), 
thunkAddress(thunkPtr), _firstClassCacheSlotLabel(firstClassCacheSlotLabel), _firstBranchAddressCacheSlotLabel(firstBranchAddressCacheSlotLabel), - _secondClassCacheSlotLabel(secondClassCacheSlotLabel), _secondBranchAddressCacheSlotLabel(secondBranchAddressCacheSlotLabel) + _secondClassCacheSlotLabel(secondClassCacheSlotLabel), _secondBranchAddressCacheSlotLabel(secondBranchAddressCacheSlotLabel), + _interfaceClassSlotLabel(interfaceClassSlotLabel) { } @@ -158,6 +163,7 @@ class ARM64InterfaceCallSnippet : public TR::ARM64VirtualSnippet TR::LabelSymbol *getFirstBranchAddressCacheSlotLabel() { return _firstBranchAddressCacheSlotLabel; } TR::LabelSymbol *getSecondClassCacheSlotLabel() { return _secondClassCacheSlotLabel; } TR::LabelSymbol *getSecondBranchAddressCacheSlotLabel() { return _secondBranchAddressCacheSlotLabel; } + TR::LabelSymbol *getInterfaceClassSlotLabel() { return _interfaceClassSlotLabel; } virtual Kind getKind() { return IsInterfaceCall; } virtual uint8_t *emitSnippetBody(); diff --git a/runtime/compiler/aarch64/runtime/PicBuilder.spp b/runtime/compiler/aarch64/runtime/PicBuilder.spp index 605e5f4c027..eca4d36150f 100644 --- a/runtime/compiler/aarch64/runtime/PicBuilder.spp +++ b/runtime/compiler/aarch64/runtime/PicBuilder.spp @@ -119,6 +119,9 @@ .set J9TR_ICSnippet_SecondTarget, 64 .set J9TR_ICSnippet_J2IThunk, 72 + .set J9TR_ICCodeCacheRA_Sub, -12 + .set J9TR_ICCodeCacheRA_BCond, -24 + // Unresolved virtual call snippet .set J9TR_UVCSnippet_codeCacheReturnAddress, 0 @@ -1033,11 +1036,37 @@ L_tryToCompleteSlot2: ldr x3, [x1, J9TR_J9Class_classLoader] ldr x4, [x2, J9TR_J9Class_classLoader] cmp x3, x4 // Same classLoader? 
- beq L_commonJitDispatch + beq L_patchBranchInstIfLastITableCacheIsUsed // Skip pic registration mov x6, #J9TR_ICSnippet_SecondClass // slot2 class offset bl L_picRegistration + +L_patchBranchInstIfLastITableCacheIsUsed: + ldr w1, [x10, J9TR_ICCodeCacheRA_Sub] // Load instruction at 12 bytes before code cache RA + ldr w2, const_InstructionLastITableCache + cmp w1, w2 + bne L_commonJitDispatch // If lastITableCache check is not generated, goto L_commonJitDispatch + ldr w1, [x10, J9TR_ICCodeCacheRA_BCond] // Load instruction at 24 bytes before code cache RA + lsr w2, w1, #24 + cmp w2, 0x54 // Test if the bit 24-31 is 0x54 (b.cond) + and w2, w1, #0x1f + ccmp w2, #0xe, #1, eq // Test if condition code is AL and bit 4 is 0 + bne L_commonJitDispatch + + bfxil w1, wzr, #0, #4 // Clear condition code + orr w1, w1, #1 // Set condition code to NE + START_MODIFYING_CODE + str w1, [x10, J9TR_ICCodeCacheRA_BCond] // Update b.al to b.ne + FINISH_MODIFYING_CODE + mov x8, x0 // Preserve x0 (vtable offset) in x8 + add x0, x10, #J9TR_ICCodeCacheRA_BCond // Address of B.cond instruction + mov x1, #4 // 1 instruction to flush + bl flushICache + mov x0, x8 b L_commonJitDispatch +const_InstructionLastITableCache: + sub x9, x9, x11 // The instruction to be compared + L_exitTryToCompleteSlot2: FINISH_MODIFYING_CODE b L_commonJitDispatch diff --git a/runtime/compiler/runtime/Trampoline.cpp b/runtime/compiler/runtime/Trampoline.cpp index b04b1d4b746..ac409eb173f 100644 --- a/runtime/compiler/runtime/Trampoline.cpp +++ b/runtime/compiler/runtime/Trampoline.cpp @@ -1006,59 +1006,110 @@ static bool isInterfaceCallSite(uint8_t *callSite, intptr_t& addrOfFirstClassSlo { /* * Following instruction sequence is used for interface call. - * We can assume tmpReg is x9. + * We can assume tmpReg is x10. 
 * Searching for the last 4 instructions
 *
- * ldrx tmpReg, L_firstClassCacheSlot
- * cmpx vftReg, tmpReg
- * ldrx tmpReg, L_firstBranchAddressCacheSlot
- * beq hitLabel
- * ldrx tmpReg, L_secondClassCacheSlot
- * cmpx vftReg, tmpReg
- * bne snippetLabel
- * ldrx tmpReg, L_secondBranchAddressCacheSlot
- * hitLabel:
- * blr tmpReg
- * doneLabel:
+ * If the instruction before `blr tmpReg` is `ldr tmpReg, label`,
+ * then the lastITable cache is not used.
+ * In that case, we expect the below instructions before the call site.
+ * We obtain the address of the secondBranchAddressCacheSlot from `ldrx` instruction.
+ *
+ * cmpx vftReg, tmpReg
+ * bne snippetLabel
+ * ldrx tmpReg, L_secondBranchAddressCacheSlot
+ * hitLabel:
+ * blr tmpReg
+ * doneLabel:
+ *
+ * If the instruction before `blr tmpReg` is `ldr tmpReg, [vftReg, x9]`,
+ * then the lastITable cache is used.
+ * In that case, we expect the below instructions before the call site.
+ * We get the address of the interface call snippet from `b.al snippetLabel` instruction.
+ *
+ * b.al snippetLabel ; probably already patched to b.ne
+ * mov w9, sizeof(J9Class)
+ * ldr tmp2Reg, [tmpReg, iTableOffset] ; load vTableOffset
+ * sub x9, x9, tmp2Reg ; icallVMprJavaSendPatchupVirtual expects x9 to hold vTable index
+ * ldr tmpReg, [vftReg, x9]
+ * hitLabel:
+ * blr tmpReg
+ * doneLabel:
 */
   int32_t blrInstr = *reinterpret_cast<int32_t *>(callSite);
-   /* Check if the instruction at the callSite is 'blr x9' */
-   if (blrInstr != 0xd63f0120)
+   /* Check if the instruction at the callSite is 'blr x10' */
+   if (blrInstr != 0xd63f0140)
      {
      return false;
      }
-   int32_t ldrInst = *reinterpret_cast<int32_t *>(callSite - ARM64_INSTRUCTION_LENGTH);
-   /* Check if the instruction before blr is 'ldrx x9, label' */
-   if ((ldrInst & 0xff00001f) != 0x58000009)
+   intptr_t ldrInstAddr = reinterpret_cast<intptr_t>(callSite) - ARM64_INSTRUCTION_LENGTH;
+   int32_t ldrInst = *reinterpret_cast<int32_t *>(ldrInstAddr);
+   /* Check if the instruction before blr is 'ldrx x10, label' */
+   if ((ldrInst & 0xff00001f) == 0x5800000a)
      {
-      return false;
+      /* distance is encoded in bit 5-23 */
+      int64_t distance = ((ldrInst << 8) >> 13) * 4;
+      intptr_t secondBranchAddressSlotAddr = ldrInstAddr + distance;
+      // The layout of the cache slots is as follows:
+      // +---------+---------------+---------+---------------+
+      // | class1 |method address1| class2 |method address2|
+      // +---------+---------------+---------+---------------+
+      addrOfFirstClassSlot = secondBranchAddressSlotAddr - sizeof(intptr_t) * 3;
+
+      int32_t bneInst = *reinterpret_cast<int32_t *>(ldrInstAddr - ARM64_INSTRUCTION_LENGTH);
+      /* Check if the instruction before ldr is 'bne' */
+      if ((bneInst & 0xff00001f) != 0x54000001)
+         {
+         return false;
+         }
+
+      int32_t cmpInst = *reinterpret_cast<int32_t *>(ldrInstAddr - ARM64_INSTRUCTION_LENGTH * 2);
+      /* Check if the instruction before bne is 'cmp vftReg, x10' */
+      if ((cmpInst & 0xfffffc1f) != 0xeb0a001f)
+         {
+         return false;
+         }
+
+      return true;
      }
-   /* distance is encoded in bit 5-23 */
-   int64_t distance = ((ldrInst << 8) >> 13) * 4;
-   intptr_t
secondBranchAddressSlotAddr = reinterpret_cast<intptr_t>(callSite) - ARM64_INSTRUCTION_LENGTH + distance;
-   // The layout of the cache slots is as follows:
-   // +---------+---------------+---------+---------------+
-   // | class1 |method address1| class2 |method address2|
-   // +---------+---------------+---------+---------------+
-   addrOfFirstClassSlot = secondBranchAddressSlotAddr - sizeof(intptr_t) * 3;
-
-   int32_t bneInst = *reinterpret_cast<int32_t *>(callSite - ARM64_INSTRUCTION_LENGTH * 2);
-   /* Check if the instruction before ldr is 'bne' */
-   if ((bneInst & 0xff00001f) != 0x54000001)
+   else if ((ldrInst & 0xfffffc1f) == 0xf869680a) /* Check if lastITable cache sequence is generated. The instruction before blr should be `ldr x10, [vftReg, x9]` */
      {
-      return false;
-      }
+      int32_t subInst = *reinterpret_cast<int32_t *>(callSite - ARM64_INSTRUCTION_LENGTH * 2);
+      /* Check if the instruction before ldr is `sub x9, x9, x11` */
+      if (subInst != 0xcb0b0129)
+         {
+         return false;
+         }
+      ldrInst = *reinterpret_cast<int32_t *>(callSite - ARM64_INSTRUCTION_LENGTH * 3);
+      /* Check if the instruction before sub is `ldr x11, [x10, #offset]` */
+      if ((ldrInst & 0xffc003ff) != 0xf940014b)
+         {
+         return false;
+         }
+      int32_t movInst = *reinterpret_cast<int32_t *>(callSite - ARM64_INSTRUCTION_LENGTH * 4);
+      /* Check if the instruction before ldr is `mov w9, sizeof(J9Class)` */
+      if ((movInst & 0xffe0001f) != 0x52800009)
+         {
+         return false;
+         }
+      intptr_t bcondInstAddr = reinterpret_cast<intptr_t>(callSite - ARM64_INSTRUCTION_LENGTH * 5);
+      int32_t bcondInst = *reinterpret_cast<int32_t *>(bcondInstAddr);
+      /* check if the instruction before mov is `b.cond snippetLabel` */
+      if ((bcondInst & 0xff000010) != 0x54000000)
+         {
+         return false;
+         }
+      /* distance is encoded in bit 5-23 */
+      int64_t distance = ((bcondInst << 8) >> 13) * 4;
+      /* offset of the first class slot in interface call snippet */
+      static const int64_t firstClassSlotOffset = 44;
+      addrOfFirstClassSlot = bcondInstAddr + distance + firstClassSlotOffset;
-   int32_t cmpInst = 
*reinterpret_cast<int32_t *>(callSite - ARM64_INSTRUCTION_LENGTH * 3);
-   /* Check if the instruction before bne is 'cmp vftReg, x9' */
-   if ((cmpInst & 0xfffffc1f) != 0xeb09001f)
-      {
-      return false;
+      return true;
      }
-   return true;
+   return false;
   }

 bool arm64CodePatching(void *callee, void *callSite, void *currentPC, void *currentTramp, void *newAddrOfCallee, void *extra)