diff --git a/runtime/compiler/aarch64/codegen/ARM64PrivateLinkage.cpp b/runtime/compiler/aarch64/codegen/ARM64PrivateLinkage.cpp index 5d2a02dd290..f19e95eb9d2 100644 --- a/runtime/compiler/aarch64/codegen/ARM64PrivateLinkage.cpp +++ b/runtime/compiler/aarch64/codegen/ARM64PrivateLinkage.cpp @@ -1665,51 +1665,141 @@ static void buildVirtualCall(TR::CodeGenerator *cg, TR::Node *callNode, TR::Regi /** * @brief Generates instruction sequence for interface call * - * @param[in] cg: code generator - * @param[in] callNode: node for the interface call - * @param[in] vftReg: vft register - * @param[in] tmpReg: temporary register - * @param[in] tmp2Reg: temporary register - * @param[in] ifcSnippet: interface call snippet - * @param[in] regMapForGC: register map for GC + * @param[in] cg: code generator + * @param[in] callNode: node for the interface call + * @param[in] vftReg: vft register + * @param[in] x9Reg: temporary register + * @param[in] x10Reg: temporary register + * @param[in] x11Reg: temporary register + * @param[in] useLastITableCache: if true, use last iTable cache + * @param[in] ifcSnippet: interface call snippet + * @param[in] regMapForGC: register map for GC */ -static void buildInterfaceCall(TR::CodeGenerator *cg, TR::Node *callNode, TR::Register *vftReg, TR::Register *tmpReg, TR::Register *tmp2Reg, TR::ARM64InterfaceCallSnippet *ifcSnippet, uint32_t regMapForGC) +static void buildInterfaceCall(TR::CodeGenerator *cg, TR::Node *callNode, TR::Register *vftReg, TR::Register *x9Reg, TR::Register *x10Reg, TR::Register *x11Reg, + bool useLastITableCache, TR::ARM64InterfaceCallSnippet *ifcSnippet, uint32_t regMapForGC) { /* * Generating following instruction sequence. * Recompilation is dependent on this instruction sequence. * Please do not modify without changing recompilation code. 
 * - * ldrx tmpReg, L_firstClassCacheSlot - * cmpx vftReg, tmpReg - * ldrx tmpReg, L_firstBranchAddressCacheSlot - * beq hitLabel - * ldrx tmpReg, L_secondClassCacheSlot - * cmpx vftReg, tmpReg - * bne snippetLabel - * ldrx tmpReg, L_secondBranchAddressCacheSlot - * hitLabel: - * blr tmpReg - * doneLabel: + * if useLastITableCache is false: + * if debug counters are disabled: + * ldrx x10Reg, L_firstClassCacheSlot + * cmpx vftReg, x10Reg + * ldrx x10Reg, L_firstBranchAddressCacheSlot + * beq hitLabel + * ldrx x10Reg, L_secondClassCacheSlot + * cmpx vftReg, x10Reg + * bne snippetLabel + * ldrx x10Reg, L_secondBranchAddressCacheSlot + * hitLabel: + * blr x10Reg + * doneLabel: + * + * if debug counters are enabled: + * ldrx x10Reg, L_firstClassCacheSlot + * cmpx vftReg, x10Reg + * bne slot1MissedLabel + * ; debug counter, trashes x9 and x11Reg + * ldrx x10Reg, L_firstBranchAddressCacheSlot + * b hitLabel + * slot1MissedLabel: + * ; fall through (flags are NE here) + * ldrx x10Reg, L_secondClassCacheSlot + * cmpx vftReg, x10Reg + * bne slot2MissedLabel + * ; debug counter, trashes x9 and x11Reg + * b slot2DoneLabel + * slot2MissedLabel: + * ; debug counter, trashes x9 and x11Reg + * slot2DoneLabel: + * cmpx vftReg, x10Reg + * bne snippetLabel + * ldrx x10Reg, L_secondBranchAddressCacheSlot + * hitLabel: + * blr x10Reg + * doneLabel: + * + * if useLastITableCache is true: + * if debug counters are disabled: + * ldrx x10Reg, L_firstClassCacheSlot + * cmpx vftReg, x10Reg + * ldrx x10Reg, L_firstBranchAddressCacheSlot + * beq hitLabel + * ldrx x10Reg, L_secondClassCacheSlot + * cmpx vftReg, x10Reg + * ldrx x10Reg, L_secondBranchAddressCacheSlot + * beq hitLabel + * ldr x10Reg, [vftReg, lastITableOffset] ; cached iTable + * ldrx x9, L_interfaceClassSlot ; actual interfaceClass + * ldr x11Reg, [x10Reg, interfaceClassOffset]; interfaceClass in lastITable + * cmpx x9, x11Reg + * bal snippetLabel ; will be patched to bne + * mov w9, sizeof(J9Class) + * ldr x11Reg, [x10Reg, iTableOffset] ; load 
vTableOffset + * sub x9, x9, x11Reg ; icallVMprJavaSendPatchupVirtual expects x9 to hold vTable index + * ldr x10Reg, [vftReg, x9] + * hitLabel: + * blr x10Reg + * doneLabel: + * + * if debug counters are enabled: + * ldrx x10Reg, L_firstClassCacheSlot + * cmpx vftReg, x10Reg + * bne slot1MissedLabel + * ; debug counter, trashes x9 and x11Reg + * ldrx x10Reg, L_firstBranchAddressCacheSlot + * b hitLabel + * slot1MissedLabel: + * ldrx x10Reg, L_secondClassCacheSlot + * cmpx vftReg, x10Reg + * bne slot2DoneLabel + * ; debug counter, trashes x9 and x11Reg + * slot2DoneLabel: + * cmpx vftReg, x10Reg + * ldrx x10Reg, L_secondBranchAddressCacheSlot + * beq hitLabel + * ldr x10Reg, [vftReg, lastITableOffset] ; cached iTable + * ldrx x9, L_interfaceClassSlot ; actual interfaceClass + * ldr x11Reg, [x10Reg, interfaceClassOffset]; interfaceClass in lastITable + * cmpx x9, x11Reg + * bne lastITableMissedLabel + * ; debug counter, trashes x9 and x11Reg + * cmpx x9, x9 ; to set Z flag + * b lastITableDoneLabel + * lastITableMissedLabel: + * ; debug counter, trashes x9 and x11Reg + * cmp x10Reg, #0 ; to unset Z flag + * lastITableDoneLabel: + * bal snippetLabel ; will be patched to bne + * mov w9, sizeof(J9Class) + * ldr x11Reg, [x10Reg, iTableOffset] ; load vTableOffset + * sub x9, x9, x11Reg ; icallVMprJavaSendPatchupVirtual expects x9 to hold vTable index + * ldr x10Reg, [vftReg, x9] + * hitLabel: + * blr x10Reg + * doneLabel: */ TR::LabelSymbol *ifcSnippetLabel = ifcSnippet->getSnippetLabel(); TR::LabelSymbol *firstClassCacheSlotLabel = ifcSnippet->getFirstClassCacheSlotLabel(); - generateTrg1ImmSymInstruction(cg, TR::InstOpCode::ldrx, callNode, tmpReg, 0, firstClassCacheSlotLabel); + generateTrg1ImmSymInstruction(cg, TR::InstOpCode::ldrx, callNode, x10Reg, 0, firstClassCacheSlotLabel); TR::LabelSymbol *hitLabel = generateLabelSymbol(cg); - generateCompareInstruction(cg, callNode, vftReg, tmpReg, true); + generateCompareInstruction(cg, callNode, vftReg, x10Reg, true); 
TR::LabelSymbol *firstBranchAddressCacheSlotLabel = ifcSnippet->getFirstBranchAddressCacheSlotLabel(); TR::Compilation *comp = cg->comp(); TR_Debug *debugObj = cg->getDebug(); TR_ARM64ScratchRegisterManager *srm = NULL; bool isDebugCounterGenerated = false; - if (comp->getOptions()->enableDebugCounters()) + const bool enableDebugCounters = comp->getOptions()->enableDebugCounters(); + if (enableDebugCounters) { srm = cg->generateScratchRegisterManager(2); - srm->donateScratchRegister(tmpReg); - srm->donateScratchRegister(tmp2Reg); + srm->donateScratchRegister(x9Reg); + srm->donateScratchRegister(x11Reg); TR::Instruction *prevCursor = cg->getAppendInstruction(); /* Record if slot 1 hit */ TR::Instruction *cursor = cg->generateDebugCounter(TR::DebugCounter::debugCounterName(comp, "cg.callInterface/(%s)/%s/%d/%d/dynamicPIC/slot1", @@ -1723,7 +1813,7 @@ static void buildInterfaceCall(TR::CodeGenerator *cg, TR::Node *callNode, TR::Re /* Debug counter was generated. Generating instructions before debug counter instructions. 
*/ TR::LabelSymbol *slot1MissedLabel = generateLabelSymbol(cg); TR::Instruction *branchToSlot1MissedLabelInstr = generateConditionalBranchInstruction(cg, TR::InstOpCode::b_cond, callNode, slot1MissedLabel, TR::CC_NE, prevCursor); - generateTrg1ImmSymInstruction(cg, TR::InstOpCode::ldrx, callNode, tmpReg, 0, firstBranchAddressCacheSlotLabel); + generateTrg1ImmSymInstruction(cg, TR::InstOpCode::ldrx, callNode, x10Reg, 0, firstBranchAddressCacheSlotLabel); TR::Instruction *branchToHitLabelInstr = generateLabelInstruction(cg, TR::InstOpCode::b, callNode, hitLabel); TR::Instruction *slot1MissedLabelInstr = generateLabelInstruction(cg, TR::InstOpCode::label, callNode, slot1MissedLabel); if (debugObj) @@ -1736,7 +1826,7 @@ static void buildInterfaceCall(TR::CodeGenerator *cg, TR::Node *callNode, TR::Re } if (!isDebugCounterGenerated) { - generateTrg1ImmSymInstruction(cg, TR::InstOpCode::ldrx, callNode, tmpReg, 0, firstBranchAddressCacheSlotLabel); + generateTrg1ImmSymInstruction(cg, TR::InstOpCode::ldrx, callNode, x10Reg, 0, firstBranchAddressCacheSlotLabel); TR::Instruction *branchToHitLabelInstr = generateConditionalBranchInstruction(cg, TR::InstOpCode::b_cond, callNode, hitLabel, TR::CC_EQ); if (debugObj) { @@ -1746,12 +1836,10 @@ static void buildInterfaceCall(TR::CodeGenerator *cg, TR::Node *callNode, TR::Re TR::LabelSymbol *secondClassCacheSlotLabel = ifcSnippet->getSecondClassCacheSlotLabel(); - generateTrg1ImmSymInstruction(cg, TR::InstOpCode::ldrx, callNode, tmpReg, 0, secondClassCacheSlotLabel); + generateTrg1ImmSymInstruction(cg, TR::InstOpCode::ldrx, callNode, x10Reg, 0, secondClassCacheSlotLabel); - if (comp->getOptions()->enableDebugCounters()) + if (enableDebugCounters) { - TR::LabelSymbol *slot2MissedLabel = generateLabelSymbol(cg); - TR::LabelSymbol *slot2DoneLabel = generateLabelSymbol(cg); TR::Instruction *prevCursor1 = cg->getAppendInstruction(); /* Record if slot 2 hit */ TR::Instruction *cursor1 = 
cg->generateDebugCounter(TR::DebugCounter::debugCounterName(comp, "cg.callInterface/(%s)/%s/%d/%d/dynamicPIC/slot2", @@ -1759,51 +1847,158 @@ static void buildInterfaceCall(TR::CodeGenerator *cg, TR::Node *callNode, TR::Re comp->getHotnessName(), callNode->getByteCodeInfo().getCallerIndex(), callNode->getByteCodeInfo().getByteCodeIndex()), *srm); - TR::Instruction *prevCursor2 = cg->getAppendInstruction(); - /* Record if slot 2 missed */ - TR::Instruction *cursor2 = cg->generateDebugCounter(TR::DebugCounter::debugCounterName(comp, "cg.callInterface/(%s)/%s/%d/%d/dynamicPIC/cachemiss", + + if (!useLastITableCache) + { + TR::Instruction *prevCursor2 = cg->getAppendInstruction(); + /* Record if slot 2 missed */ + TR::Instruction *cursor2 = cg->generateDebugCounter(TR::DebugCounter::debugCounterName(comp, "cg.callInterface/(%s)/%s/%d/%d/dynamicPIC/cachemiss", comp->signature(), comp->getHotnessName(), callNode->getByteCodeInfo().getCallerIndex(), callNode->getByteCodeInfo().getByteCodeIndex()), *srm); - if ((prevCursor1 != cursor1) || (prevCursor2 != cursor2)) + if ((prevCursor1 != cursor1) || (prevCursor2 != cursor2)) + { + TR::LabelSymbol *slot2MissedLabel = generateLabelSymbol(cg); + TR::LabelSymbol *slot2DoneLabel = generateLabelSymbol(cg); + /* Debug counter was generated. Generating instructions before debug counter instructions recording hit for second cache slot. */ + TR::Instruction *cursor = generateCompareInstruction(cg, callNode, vftReg, x10Reg, true, prevCursor1); + TR::Instruction *branchToSlot2MissedLabelInstr = generateConditionalBranchInstruction(cg, TR::InstOpCode::b_cond, callNode, slot2MissedLabel, TR::CC_NE, cursor); + + /* Generating instructions before debug counter instructions recording cache miss. 
*/ + cursor = generateLabelInstruction(cg, TR::InstOpCode::b, callNode, slot2DoneLabel, prevCursor2); + TR::Instruction *slot2MissedLabelInstr = generateLabelInstruction(cg, TR::InstOpCode::label, callNode, slot2MissedLabel, cursor); + + /* Generating instructions after debug counter instructions. */ + TR::Instruction *slot2DoneLabelInstr = generateLabelInstruction(cg, TR::InstOpCode::label, callNode, slot2DoneLabel); + if (debugObj) + { + debugObj->addInstructionComment(branchToSlot2MissedLabelInstr, "Jumps to slot2MissedLabel"); + debugObj->addInstructionComment(cursor, "Jumps to slot2DoneLabel"); + debugObj->addInstructionComment(slot2MissedLabelInstr, "slot2MissedLabel"); + debugObj->addInstructionComment(slot2DoneLabelInstr, "slot2DoneLabel"); + } + } + } + else { - /* Debug counter was generated. Generating instructions before debug counter instructions recording hit for second cache slot. */ - TR::Instruction *cursor = generateCompareInstruction(cg, callNode, vftReg, tmpReg, true, prevCursor1); - TR::Instruction *branchToSlot2MissedLabelInstr = generateConditionalBranchInstruction(cg, TR::InstOpCode::b_cond, callNode, slot2MissedLabel, TR::CC_NE, cursor); + if (prevCursor1 != cursor1) + { + TR::LabelSymbol *slot2DoneLabel = generateLabelSymbol(cg); + /* Debug counter was generated. Generating instructions before debug counter instructions recording hit for second cache slot. */ + TR::Instruction *cursor = generateCompareInstruction(cg, callNode, vftReg, x10Reg, true, prevCursor1); + TR::Instruction *branchToSlot2DoneLabelInstr = generateConditionalBranchInstruction(cg, TR::InstOpCode::b_cond, callNode, slot2DoneLabel, TR::CC_NE, cursor); + /* Generating instructions after debug counter instructions. 
*/ + TR::Instruction *slot2DoneLabelInstr = generateLabelInstruction(cg, TR::InstOpCode::label, callNode, slot2DoneLabel); + if (debugObj) + { + debugObj->addInstructionComment(branchToSlot2DoneLabelInstr, "Jumps to slot2DoneLabel"); + debugObj->addInstructionComment(slot2DoneLabelInstr, "slot2DoneLabel"); + } + } + } + } - /* Generating instructions before debug counter instructions recording cache miss. */ - cursor = generateLabelInstruction(cg, TR::InstOpCode::b, callNode, slot2DoneLabel, prevCursor2); - TR::Instruction *slot2MissedLabelInstr = generateLabelInstruction(cg, TR::InstOpCode::label, callNode, slot2MissedLabel, cursor); + generateCompareInstruction(cg, callNode, vftReg, x10Reg, true); - /* Generating instructions after debug counter instructions. */ - TR::Instruction *slot2DoneLabelInstr = generateLabelInstruction(cg, TR::InstOpCode::label, callNode, slot2DoneLabel); - generateTrg1ImmSymInstruction(cg, TR::InstOpCode::ldrx, callNode, tmpReg, 0, secondClassCacheSlotLabel); - if (debugObj) + TR::SymbolReference *methodSymRef = callNode->getSymbolReference(); + TR_ResolvedMethod *owningMethod = methodSymRef->getOwningMethod(comp); + uintptr_t itableIndex; + TR_OpaqueClassBlock *interfaceClassOfMethod = owningMethod->getResolvedInterfaceMethod(methodSymRef->getCPIndex(), &itableIndex); + + TR::Instruction *gcPoint; + if (useLastITableCache && (interfaceClassOfMethod != NULL)) + { + TR_J9VMBase *fej9 = cg->fej9(); + + TR::LabelSymbol *secondBranchAddressCacheSlotLabel = ifcSnippet->getSecondBranchAddressCacheSlotLabel(); + generateTrg1ImmSymInstruction(cg, TR::InstOpCode::ldrx, callNode, x10Reg, 0, secondBranchAddressCacheSlotLabel); + generateConditionalBranchInstruction(cg, TR::InstOpCode::b_cond, callNode, hitLabel, TR::CC_EQ); + + generateTrg1MemInstruction(cg, TR::InstOpCode::ldrimmx, callNode, x10Reg, TR::MemoryReference::createWithDisplacement(cg, vftReg, fej9->getOffsetOfLastITableFromClassField())); + TR::LabelSymbol *interfacedClassSlotLabel = 
ifcSnippet->getInterfaceClassSlotLabel(); + generateTrg1ImmSymInstruction(cg, TR::InstOpCode::ldrx, callNode, x9Reg, 0, interfacedClassSlotLabel); + generateTrg1MemInstruction(cg, TR::InstOpCode::ldrimmx, callNode, x11Reg, TR::MemoryReference::createWithDisplacement(cg, x10Reg, fej9->getOffsetOfInterfaceClassFromITableField())); + generateCompareInstruction(cg, callNode, x9Reg, x11Reg, true); + + if (enableDebugCounters) + { + TR::Instruction *prevCursor1 = cg->getAppendInstruction(); + /* Record if lastITable cache hit */ + TR::Instruction *cursor1 = cg->generateDebugCounter(TR::DebugCounter::debugCounterName(comp, "cg.callInterface/(%s)/%s/%d/%d/dynamicPIC/lastITable", + comp->signature(), + comp->getHotnessName(), + callNode->getByteCodeInfo().getCallerIndex(), + callNode->getByteCodeInfo().getByteCodeIndex()), *srm); + TR::Instruction *prevCursor2 = cg->getAppendInstruction(); + /* Record if lastITable cache missed */ + TR::Instruction *cursor2 = cg->generateDebugCounter(TR::DebugCounter::debugCounterName(comp, "cg.callInterface/(%s)/%s/%d/%d/dynamicPIC/cachemiss", + comp->signature(), + comp->getHotnessName(), + callNode->getByteCodeInfo().getCallerIndex(), + callNode->getByteCodeInfo().getByteCodeIndex()), *srm); + if ((prevCursor1 != cursor1) || (prevCursor2 != cursor2)) { - debugObj->addInstructionComment(branchToSlot2MissedLabelInstr, "Jumps to slot2MissedLabel"); - debugObj->addInstructionComment(cursor, "Jumps to slot2DoneLabel"); - debugObj->addInstructionComment(slot2MissedLabelInstr, "slot2MissedLabel"); - debugObj->addInstructionComment(slot2DoneLabelInstr, "slot2DoneLabel"); + TR::LabelSymbol *lastITableMissedLabel = generateLabelSymbol(cg); + TR::LabelSymbol *lastITableDoneLabel = generateLabelSymbol(cg); + /* Debug counter was generated. Generating instructions before debug counter instructions recording hit for lastITable cache. 
*/ + TR::Instruction *branchToLastITableMissedLabelInstr = generateConditionalBranchInstruction(cg, TR::InstOpCode::b_cond, callNode, lastITableMissedLabel, TR::CC_NE, prevCursor1); + + /* Generating instructions before debug counter instructions recording cache miss. */ + TR::Instruction *cmpInstr1 = generateCompareInstruction(cg, callNode, x9Reg, x9Reg, true, prevCursor2); /* to set Z flag */ + TR::Instruction *cursor = generateLabelInstruction(cg, TR::InstOpCode::b, callNode, lastITableDoneLabel, cmpInstr1); + TR::Instruction *lastITableMissedLabelInstr = generateLabelInstruction(cg, TR::InstOpCode::label, callNode, lastITableMissedLabel, cursor); + + /* Generating instructions after debug counter instructions. */ + generateCompareImmInstruction(cg, callNode, x10Reg, 0, true); /* to unset Z flag */ + TR::Instruction *lastITableDoneLabelInstr = generateLabelInstruction(cg, TR::InstOpCode::label, callNode, lastITableDoneLabel); + if (debugObj) + { + debugObj->addInstructionComment(branchToLastITableMissedLabelInstr, "Jumps to lastITableMissedLabel"); + debugObj->addInstructionComment(cursor, "Jumps to lastITableDoneLabel"); + debugObj->addInstructionComment(lastITableMissedLabelInstr, "lastITableMissedLabel"); + debugObj->addInstructionComment(lastITableDoneLabelInstr, "lastITableDoneLabel"); + } + } + } - srm->stopUsingRegisters(); + + /* This conditional branch instruction with "always" condition code will be patched to b.ne instruction after second cache slot is filled. */ + gcPoint = generateConditionalBranchInstruction(cg, TR::InstOpCode::b_cond, callNode, ifcSnippetLabel, TR::CC_AL); + loadConstant32(cg, callNode, fej9->getITableEntryJitVTableOffset(), x9Reg); + generateTrg1MemInstruction(cg, TR::InstOpCode::ldrimmx, callNode, x11Reg, TR::MemoryReference::createWithDisplacement(cg, x10Reg, fej9->convertITableIndexToOffset(itableIndex))); + /* PicBuilder.spp checks this instruction. It needs to be 'sub x9, x9, x11'. 
*/ + generateTrg1Src2Instruction(cg, TR::InstOpCode::subx, callNode, x9Reg, x9Reg, x11Reg); + generateTrg1MemInstruction(cg, TR::InstOpCode::ldroffx, callNode, x10Reg, TR::MemoryReference::createWithIndexReg(cg, vftReg, x9Reg)); + if (debugObj) + { + debugObj->addInstructionComment(gcPoint, "Jumps to Snippet. Will be patched to b.ne"); + } } + else + { + gcPoint = generateConditionalBranchInstruction(cg, TR::InstOpCode::b_cond, callNode, ifcSnippetLabel, TR::CC_NE); + TR::LabelSymbol *secondBranchAddressCacheSlotLabel = ifcSnippet->getSecondBranchAddressCacheSlotLabel(); - generateCompareInstruction(cg, callNode, vftReg, tmpReg, true); - TR::Instruction *gcPoint = generateConditionalBranchInstruction(cg, TR::InstOpCode::b_cond, callNode, ifcSnippetLabel, TR::CC_NE); + generateTrg1ImmSymInstruction(cg, TR::InstOpCode::ldrx, callNode, x10Reg, 0, secondBranchAddressCacheSlotLabel); + if (debugObj) + { + debugObj->addInstructionComment(gcPoint, "Jumps to snippet"); + } + } gcPoint->ARM64NeedsGCMap(cg, regMapForGC); - TR::LabelSymbol *secondBranchAddressCacheSlotLabel = ifcSnippet->getSecondBranchAddressCacheSlotLabel(); - generateTrg1ImmSymInstruction(cg, TR::InstOpCode::ldrx, callNode, tmpReg, 0, secondBranchAddressCacheSlotLabel); + if (enableDebugCounters) + { + srm->stopUsingRegisters(); + } TR::Instruction *hitLabelInstr = generateLabelInstruction(cg, TR::InstOpCode::label, callNode, hitLabel); if (debugObj) { - debugObj->addInstructionComment(gcPoint, "Jumps to snippet"); debugObj->addInstructionComment(hitLabelInstr, "hitLabel"); } - gcPoint = generateRegBranchInstruction(cg, TR::InstOpCode::blr, callNode, tmpReg); + gcPoint = generateRegBranchInstruction(cg, TR::InstOpCode::blr, callNode, x10Reg); gcPoint->ARM64NeedsGCMap(cg, regMapForGC); + } static TR::Register *evaluateUpToVftChild(TR::Node *callNode, TR::CodeGenerator *cg) @@ -1826,6 +2021,7 @@ void J9::ARM64::PrivateLinkage::buildVirtualDispatch(TR::Node *callNode, TR::Register *x0 = 
dependencies->searchPreConditionRegister(TR::RealRegister::x0); TR::Register *x9 = dependencies->searchPreConditionRegister(TR::RealRegister::x9); TR::Register *x10 = dependencies->searchPreConditionRegister(TR::RealRegister::x10); + TR::Register *x11 = dependencies->searchPreConditionRegister(TR::RealRegister::x11); TR::SymbolReference *methodSymRef = callNode->getSymbolReference(); TR::MethodSymbol *methodSymbol = methodSymRef->getSymbol()->castToMethodSymbol(); @@ -2045,6 +2241,8 @@ void J9::ARM64::PrivateLinkage::buildVirtualDispatch(TR::Node *callNode, } } + bool useLastITableCache = !comp()->getOption(TR_DisableLastITableCache); + // Profile-driven virtual and interface calls // // If the top value dominates everything else, generate a single static @@ -2066,6 +2264,32 @@ void J9::ARM64::PrivateLinkage::buildVirtualDispatch(TR::Node *callNode, ListIterator i(&values); J9::ARM64PICItem *pic = i.getFirst(); + if (useLastITableCache && methodSymbol->isInterface()) + { + // Find the class pointer to the interface class if it is already loaded. + // + TR::Method *interfaceMethod = methodSymbol->getMethod(); + int32_t len = interfaceMethod->classNameLength(); + char *s = TR::Compiler->cls.classNameToSignature(interfaceMethod->classNameChars(), len, comp()); + auto interfaceClassOfMethod = fej9->getClassFromSignature(s, len, methodSymRef->getOwningMethod(comp())); + int32_t numStaticPICSlots = (pic->_frequency > MAX_PROFILED_CALL_FREQUENCY) ? 
1 : values.getSize(); + + // Disable lastITable logic if all the implementers can fit into the pic slots during non-startup state + if (interfaceClassOfMethod && comp()->getPersistentInfo()->getJitState() != STARTUP_STATE) + { + int32_t numPICSlots = numStaticPICSlots + 2; + TR_ResolvedMethod **implArray = new (comp()->trStackMemory()) TR_ResolvedMethod *[numPICSlots+1]; + TR_PersistentCHTable *chTable = comp()->getPersistentInfo()->getPersistentCHTable(); + int32_t cpIndex = methodSymRef->getCPIndex(); + int32_t numImplementers = chTable->findnInterfaceImplementers(interfaceClassOfMethod, numPICSlots+1, implArray, cpIndex, methodSymRef->getOwningMethod(comp()), comp()); + if (numImplementers <= numPICSlots) + { + useLastITableCache = false; + if (comp()->getOption(TR_TraceCG)) + traceMsg(comp(),"Found %d implementers for call to %s, can be fit into %d pic slots, disabling lastITable cache\n", numImplementers, methodSymbol->getMethod()->signature(comp()->trMemory()), numPICSlots); + } + } + } // If this value is dominant, optimize exclusively for it if (pic->_frequency > MAX_PROFILED_CALL_FREQUENCY) { @@ -2100,11 +2324,13 @@ void J9::ARM64::PrivateLinkage::buildVirtualDispatch(TR::Node *callNode, TR::LabelSymbol *firstBranchAddressCacheSlotLabel = generateLabelSymbol(cg()); TR::LabelSymbol *secondClassCacheSlotLabel = generateLabelSymbol(cg()); TR::LabelSymbol *secondBranchAddressCacheSlotLabel = generateLabelSymbol(cg()); + TR::LabelSymbol *interfaceClassSlotLabel = generateLabelSymbol(cg()); TR::ARM64InterfaceCallSnippet *ifcSnippet = new (trHeapMemory()) TR::ARM64InterfaceCallSnippet(cg(), callNode, ifcSnippetLabel, argSize, doneOOLLabel, firstClassCacheSlotLabel, secondClassCacheSlotLabel, - firstBranchAddressCacheSlotLabel, secondBranchAddressCacheSlotLabel, static_cast(thunk)); + firstBranchAddressCacheSlotLabel, secondBranchAddressCacheSlotLabel, + interfaceClassSlotLabel, static_cast(thunk)); cg()->addSnippet(ifcSnippet); - buildInterfaceCall(cg(), 
callNode, vftReg, x9, x10, ifcSnippet, regMapForGC); + buildInterfaceCall(cg(), callNode, vftReg, x9, x10, x11, useLastITableCache, ifcSnippet, regMapForGC); } else { @@ -2153,12 +2379,14 @@ void J9::ARM64::PrivateLinkage::buildVirtualDispatch(TR::Node *callNode, TR::LabelSymbol *firstBranchAddressCacheSlotLabel = generateLabelSymbol(cg()); TR::LabelSymbol *secondClassCacheSlotLabel = generateLabelSymbol(cg()); TR::LabelSymbol *secondBranchAddressCacheSlotLabel = generateLabelSymbol(cg()); + TR::LabelSymbol *interfaceClassSlotLabel = generateLabelSymbol(cg()); TR::ARM64InterfaceCallSnippet *ifcSnippet = new (trHeapMemory()) - TR::ARM64InterfaceCallSnippet(cg(), callNode, ifcSnippetLabel, argSize, doneLabel, firstClassCacheSlotLabel, firstBranchAddressCacheSlotLabel, secondClassCacheSlotLabel, secondBranchAddressCacheSlotLabel, static_cast(thunk)); + TR::ARM64InterfaceCallSnippet(cg(), callNode, ifcSnippetLabel, argSize, doneLabel, firstClassCacheSlotLabel, firstBranchAddressCacheSlotLabel, + secondClassCacheSlotLabel, secondBranchAddressCacheSlotLabel, interfaceClassSlotLabel, static_cast(thunk)); cg()->addSnippet(ifcSnippet); - buildInterfaceCall(cg(), callNode, vftReg, x9, x10, ifcSnippet, regMapForGC); + buildInterfaceCall(cg(), callNode, vftReg, x9, x10, x11, useLastITableCache, ifcSnippet, regMapForGC); } else { diff --git a/runtime/compiler/aarch64/codegen/CallSnippet.cpp b/runtime/compiler/aarch64/codegen/CallSnippet.cpp index 0c780ad63c6..0d80e7dc854 100644 --- a/runtime/compiler/aarch64/codegen/CallSnippet.cpp +++ b/runtime/compiler/aarch64/codegen/CallSnippet.cpp @@ -811,6 +811,7 @@ uint8_t *TR::ARM64InterfaceCallSnippet::emitSnippetBody() cursor += sizeof(intptr_t); // 2 slots for resolved values (interface class and iTable index) + _interfaceClassSlotLabel->setCodeLocation(cursor); *reinterpret_cast(cursor) = 0; cursor += sizeof(intptr_t); *reinterpret_cast(cursor) = 0; diff --git a/runtime/compiler/aarch64/codegen/CallSnippet.hpp 
b/runtime/compiler/aarch64/codegen/CallSnippet.hpp index 33e3ee96d94..776e60a435b 100644 --- a/runtime/compiler/aarch64/codegen/CallSnippet.hpp +++ b/runtime/compiler/aarch64/codegen/CallSnippet.hpp @@ -134,23 +134,28 @@ class ARM64InterfaceCallSnippet : public TR::ARM64VirtualSnippet TR::LabelSymbol *_firstBranchAddressCacheSlotLabel; TR::LabelSymbol *_secondClassCacheSlotLabel; TR::LabelSymbol *_secondBranchAddressCacheSlotLabel; + TR::LabelSymbol *_interfaceClassSlotLabel; public: ARM64InterfaceCallSnippet(TR::CodeGenerator *cg, TR::Node *c, TR::LabelSymbol *lab, int32_t s, TR::LabelSymbol *retl, TR::LabelSymbol *firstClassCacheSlotLabel, TR::LabelSymbol *firstBranchAddressCacheSlotLabel, - TR::LabelSymbol *secondClassCacheSlotLabel, TR::LabelSymbol *secondBranchAddressCacheSlotLabel) + TR::LabelSymbol *secondClassCacheSlotLabel, TR::LabelSymbol *secondBranchAddressCacheSlotLabel, + TR::LabelSymbol *interfaceClassSlotLabel) : TR::ARM64VirtualSnippet(cg, c, lab, s, retl), thunkAddress(NULL), _firstClassCacheSlotLabel(firstClassCacheSlotLabel), _firstBranchAddressCacheSlotLabel(firstBranchAddressCacheSlotLabel), - _secondClassCacheSlotLabel(secondClassCacheSlotLabel), _secondBranchAddressCacheSlotLabel(secondBranchAddressCacheSlotLabel) + _secondClassCacheSlotLabel(secondClassCacheSlotLabel), _secondBranchAddressCacheSlotLabel(secondBranchAddressCacheSlotLabel), + _interfaceClassSlotLabel(interfaceClassSlotLabel) { } ARM64InterfaceCallSnippet(TR::CodeGenerator *cg, TR::Node *c, TR::LabelSymbol *lab, int32_t s, TR::LabelSymbol *retl, TR::LabelSymbol *firstClassCacheSlotLabel, TR::LabelSymbol *firstBranchAddressCacheSlotLabel, - TR::LabelSymbol *secondClassCacheSlotLabel, TR::LabelSymbol *secondBranchAddressCacheSlotLabel, uint8_t *thunkPtr) + TR::LabelSymbol *secondClassCacheSlotLabel, TR::LabelSymbol *secondBranchAddressCacheSlotLabel, + TR::LabelSymbol *interfaceClassSlotLabel, uint8_t *thunkPtr) : TR::ARM64VirtualSnippet(cg, c, lab, s, retl), 
thunkAddress(thunkPtr), _firstClassCacheSlotLabel(firstClassCacheSlotLabel), _firstBranchAddressCacheSlotLabel(firstBranchAddressCacheSlotLabel), - _secondClassCacheSlotLabel(secondClassCacheSlotLabel), _secondBranchAddressCacheSlotLabel(secondBranchAddressCacheSlotLabel) + _secondClassCacheSlotLabel(secondClassCacheSlotLabel), _secondBranchAddressCacheSlotLabel(secondBranchAddressCacheSlotLabel), + _interfaceClassSlotLabel(interfaceClassSlotLabel) { } @@ -158,6 +163,7 @@ class ARM64InterfaceCallSnippet : public TR::ARM64VirtualSnippet TR::LabelSymbol *getFirstBranchAddressCacheSlotLabel() { return _firstBranchAddressCacheSlotLabel; } TR::LabelSymbol *getSecondClassCacheSlotLabel() { return _secondClassCacheSlotLabel; } TR::LabelSymbol *getSecondBranchAddressCacheSlotLabel() { return _secondBranchAddressCacheSlotLabel; } + TR::LabelSymbol *getInterfaceClassSlotLabel() { return _interfaceClassSlotLabel; } virtual Kind getKind() { return IsInterfaceCall; } virtual uint8_t *emitSnippetBody(); diff --git a/runtime/compiler/aarch64/runtime/PicBuilder.spp b/runtime/compiler/aarch64/runtime/PicBuilder.spp index 605e5f4c027..eca4d36150f 100644 --- a/runtime/compiler/aarch64/runtime/PicBuilder.spp +++ b/runtime/compiler/aarch64/runtime/PicBuilder.spp @@ -119,6 +119,9 @@ .set J9TR_ICSnippet_SecondTarget, 64 .set J9TR_ICSnippet_J2IThunk, 72 + .set J9TR_ICCodeCacheRA_Sub, -12 + .set J9TR_ICCodeCacheRA_BCond, -24 + // Unresolved virtual call snippet .set J9TR_UVCSnippet_codeCacheReturnAddress, 0 @@ -1033,11 +1036,37 @@ L_tryToCompleteSlot2: ldr x3, [x1, J9TR_J9Class_classLoader] ldr x4, [x2, J9TR_J9Class_classLoader] cmp x3, x4 // Same classLoader? 
- beq L_commonJitDispatch + beq L_patchBranchInstIfLastITableCacheIsUsed // Skip pic registration mov x6, #J9TR_ICSnippet_SecondClass // slot2 class offset bl L_picRegistration + +L_patchBranchInstIfLastITableCacheIsUsed: + ldr w1, [x10, J9TR_ICCodeCacheRA_Sub] // Load instruction at 12 bytes before code cache RA + ldr w2, const_InstructionLastITableCache + cmp w1, w2 + bne L_commonJitDispatch // If lastITableCache check is not generated, goto L_commonJitDispatch + ldr w1, [x10, J9TR_ICCodeCacheRA_BCond] // Load instruction at 24 bytes before code cache RA + lsr w2, w1, #24 + cmp w2, 0x54 // Test if the bit 24-31 is 0x54 (b.cond) + and w2, w1, #0x1f + ccmp w2, #0xe, #1, eq // Test if condition code is AL and bit 4 is 0 + bne L_commonJitDispatch + + bfxil w1, wzr, #0, #4 // Clear condition code + orr w1, w1, #1 // Set condition code to NE + START_MODIFYING_CODE + str w1, [x10, J9TR_ICCodeCacheRA_BCond] // Update b.al to b.ne + FINISH_MODIFYING_CODE + mov x8, x0 // Preserve x0 (vtable offset) in x8 + add x0, x10, #J9TR_ICCodeCacheRA_BCond // Address of B.cond instruction + mov x1, #4 // 1 instruction to flush + bl flushICache + mov x0, x8 b L_commonJitDispatch +const_InstructionLastITableCache: + sub x9, x9, x11 // The instruction to be compared + L_exitTryToCompleteSlot2: FINISH_MODIFYING_CODE b L_commonJitDispatch diff --git a/runtime/compiler/runtime/Trampoline.cpp b/runtime/compiler/runtime/Trampoline.cpp index b04b1d4b746..ac409eb173f 100644 --- a/runtime/compiler/runtime/Trampoline.cpp +++ b/runtime/compiler/runtime/Trampoline.cpp @@ -1006,59 +1006,110 @@ static bool isInterfaceCallSite(uint8_t *callSite, intptr_t& addrOfFirstClassSlo { /* * Following instruction sequence is used for interface call. - * We can assume tmpReg is x9. + * We can assume tmpReg is x10. 
* Searching for the last 4 instructions * - * ldrx tmpReg, L_firstClassCacheSlot - * cmpx vftReg, tmpReg - * ldrx tmpReg, L_firstBranchAddressCacheSlot - * beq hitLabel - * ldrx tmpReg, L_secondClassCacheSlot - * cmpx vftReg, tmpReg - * bne snippetLabel - * ldrx tmpReg, L_secondBranchAddressCacheSlot - * hitLabel: - * blr tmpReg - * doneLabel: + * If the instruction before `blr tmpReg` is `ldr tmpReg, label`, + * then the lastITable cache is not used. + * In that case, we expect the below instructions before the call site. + * We obtain the address of the secondBranchAddressCacheSlot from `ldrx` instruction. + * + * cmpx vftReg, tmpReg + * bne snippetLabel + * ldrx tmpReg, L_secondBranchAddressCacheSlot + * hitLabel: + * blr tmpReg + * doneLabel: + * + * If the instruction before `blr tmpReg` is `ldr tmpReg, [vftReg, x9]`, + * then the lastITable cache is used. + * In that case, we expect the below instructions before the call site. + * We get the address of the interface call snippet from `bal snippetLabel` instruction. 
+ * + * bal snippetLabel ; probably already patched to bne + * mov w9, sizeof(J9Class) + * ldr tmp2Reg, [tmpReg, iTableOffset] ; load vTableOffset + * sub x9, x9, tmp2Reg ; icallVMprJavaSendPatchupVirtual expects x9 to hold vTable index + * ldr tmpReg, [vftReg, x9] + * hitLabel: + * blr tmpReg + * doneLabel: + */ int32_t blrInstr = *reinterpret_cast<int32_t *>(callSite); - /* Check if the instruction at the callSite is 'blr x9' */ - if (blrInstr != 0xd63f0120) + /* Check if the instruction at the callSite is 'blr x10' */ + if (blrInstr != 0xd63f0140) { return false; } - int32_t ldrInst = *reinterpret_cast<int32_t *>(callSite - ARM64_INSTRUCTION_LENGTH); - /* Check if the instruction before blr is 'ldrx x9, label' */ - if ((ldrInst & 0xff00001f) != 0x58000009) + intptr_t ldrInstAddr = reinterpret_cast<intptr_t>(callSite) - ARM64_INSTRUCTION_LENGTH; + int32_t ldrInst = *reinterpret_cast<int32_t *>(ldrInstAddr); + /* Check if the instruction before blr is 'ldrx x10, label' */ + if ((ldrInst & 0xff00001f) == 0x5800000a) { - return false; + /* distance is encoded in bit 5-23 */ + int64_t distance = ((ldrInst << 8) >> 13) * 4; + intptr_t secondBranchAddressSlotAddr = ldrInstAddr + distance; + // The layout of the cache slots is as follows: + // +---------+---------------+---------+---------------+ + // | class1 |method address1| class2 |method address2| + // +---------+---------------+---------+---------------+ + addrOfFirstClassSlot = secondBranchAddressSlotAddr - sizeof(intptr_t) * 3; + + int32_t bneInst = *reinterpret_cast<int32_t *>(ldrInstAddr - ARM64_INSTRUCTION_LENGTH); + /* Check if the instruction before ldr is 'bne' */ + if ((bneInst & 0xff00001f) != 0x54000001) + { + return false; + } + + int32_t cmpInst = *reinterpret_cast<int32_t *>(ldrInstAddr - ARM64_INSTRUCTION_LENGTH * 2); + /* Check if the instruction before bne is 'cmp vftReg, x10' */ + if ((cmpInst & 0xfffffc1f) != 0xeb0a001f) + { + return false; + } + + return true; } - /* distance is encoded in bit 5-23 */ - int64_t distance = ((ldrInst << 8) >> 13) * 4; - intptr_t
secondBranchAddressSlotAddr = reinterpret_cast<intptr_t>(callSite) - ARM64_INSTRUCTION_LENGTH + distance; - // The layout of the cache slots is as follows: - // +---------+---------------+---------+---------------+ - // | class1 |method address1| class2 |method address2| - // +---------+---------------+---------+---------------+ - addrOfFirstClassSlot = secondBranchAddressSlotAddr - sizeof(intptr_t) * 3; - - int32_t bneInst = *reinterpret_cast<int32_t *>(callSite - ARM64_INSTRUCTION_LENGTH * 2); - /* Check if the instruction before ldr is 'bne' */ - if ((bneInst & 0xff00001f) != 0x54000001) + else if ((ldrInst & 0xfffffc1f) == 0xf869680a) /* Check if lastITable cache sequence is generated. The instruction before blr should be `ldr x10, [vftReg, x9]` */ { - return false; - } + int32_t subInst = *reinterpret_cast<int32_t *>(callSite - ARM64_INSTRUCTION_LENGTH * 2); + /* Check if the instruction before ldr is `sub x9, x9, x11` */ + if (subInst != 0xcb0b0129) + { + return false; + } + ldrInst = *reinterpret_cast<int32_t *>(callSite - ARM64_INSTRUCTION_LENGTH * 3); + /* Check if the instruction before sub is `ldr x11, [x10, #offset]` */ + if ((ldrInst & 0xffc003ff) != 0xf940014b) + { + return false; + } + int32_t movInst = *reinterpret_cast<int32_t *>(callSite - ARM64_INSTRUCTION_LENGTH * 4); + /* Check if the instruction before ldr is `mov w9, sizeof(J9Class)` */ + if ((movInst & 0xffe0001f) != 0x52800009) + { + return false; + } + intptr_t bcondInstAddr = reinterpret_cast<intptr_t>(callSite - ARM64_INSTRUCTION_LENGTH * 5); + int32_t bcondInst = *reinterpret_cast<int32_t *>(bcondInstAddr); + /* check if the instruction before mov is `b.cond snippetLabel` */ + if ((bcondInst & 0xff000010) != 0x54000000) + { + return false; + } + /* distance is encoded in bit 5-23 */ + int64_t distance = ((bcondInst << 8) >> 13) * 4; + /* offset of the first class slot in interface call snippet */ + static const int64_t firstClassSlotOffset = 44; + addrOfFirstClassSlot = bcondInstAddr + distance + firstClassSlotOffset; - int32_t cmpInst =
*reinterpret_cast<int32_t *>(callSite - ARM64_INSTRUCTION_LENGTH * 3); - /* Check if the instruction before bne is 'cmp vftReg, x9' */ - if ((cmpInst & 0xfffffc1f) != 0xeb09001f) - { - return false; + return true; } - return true; + return false; } bool arm64CodePatching(void *callee, void *callSite, void *currentPC, void *currentTramp, void *newAddrOfCallee, void *extra)