From dddcde59112fe3b25adb5a8550201743934beb7d Mon Sep 17 00:00:00 2001 From: Florian Hahn Date: Wed, 25 Sep 2024 12:18:39 +0100 Subject: [PATCH] [VPlan] Remove loop region in optimizeForVFAndUF. --- .../Transforms/Vectorize/LoopVectorize.cpp | 75 ++++--- llvm/lib/Transforms/Vectorize/VPlan.cpp | 143 ++++++------- llvm/lib/Transforms/Vectorize/VPlan.h | 5 +- .../lib/Transforms/Vectorize/VPlanRecipes.cpp | 9 +- .../Transforms/Vectorize/VPlanTransforms.cpp | 44 +++- .../LoopVectorize/AArch64/call-costs.ll | 17 +- .../LoopVectorize/RISCV/low-trip-count.ll | 195 ++++++++---------- .../LoopVectorize/RISCV/short-trip-count.ll | 46 ++--- .../truncate-to-minimal-bitwidth-cost.ll | 27 +-- ...demanding-all-lanes-and-first-lane-only.ll | 58 +++--- .../LoopVectorize/X86/constant-fold.ll | 18 +- .../Transforms/LoopVectorize/X86/pr34438.ll | 26 +-- .../LoopVectorize/first-order-recurrence.ll | 33 +-- .../vector-loop-backedge-elimination.ll | 4 +- .../version-stride-with-integer-casts.ll | 15 +- 15 files changed, 339 insertions(+), 376 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index b0831451ebdf0e..2b049f502c294d 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -2956,6 +2956,9 @@ void InnerLoopVectorizer::fixVectorizedLoop(VPTransformState &State, for (const auto &KV : Plan.getLiveOuts()) KV.second->fixPhi(Plan, State); + if (!isa(State.Plan->getEntry()->getSingleSuccessor())) + return; + for (Instruction *PI : PredicatedInstructions) sinkScalarOperands(&*PI); @@ -7499,7 +7502,8 @@ LoopVectorizationPlanner::executePlan( LLVM_DEBUG(BestVPlan.dump()); // Perform the actual loop transformation. - VPTransformState State(BestVF, BestUF, LI, DT, ILV.Builder, &ILV, &BestVPlan); + VPTransformState State(BestVF, BestUF, LI, DT, ILV.Builder, &ILV, &BestVPlan, + Legal->getWidestInductionType()); // 0. Generate SCEV-dependent code into the preheader, including TripCount, // before making any changes to the CFG. @@ -7561,14 +7565,15 @@ LoopVectorizationPlanner::executePlan( // 2.5 Collect reduction resume values. DenseMap ReductionResumeValues; - auto *ExitVPBB = - cast(BestVPlan.getVectorLoopRegion()->getSingleSuccessor()); - if (IsEpilogueVectorization) + if (IsEpilogueVectorization) { + auto *ExitVPBB = cast( + BestVPlan.getVectorLoopRegion()->getSingleSuccessor()); for (VPRecipeBase &R : *ExitVPBB) { updateAndCollectMergePhiForReductionForEpilogueVectorization( dyn_cast(&R), ReductionResumeValues, State, OrigLoop, State.CFG.VPBB2IRBB[ExitVPBB], ExpandedSCEVs); } + } // 2.6. Maintain Loop Hints // Keep all loop hints from the original loop on the vector loop (we'll @@ -7579,24 +7584,26 @@ LoopVectorizationPlanner::executePlan( makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll, LLVMLoopVectorizeFollowupVectorized}); - VPBasicBlock *HeaderVPBB = - BestVPlan.getVectorLoopRegion()->getEntryBasicBlock(); - Loop *L = LI->getLoopFor(State.CFG.VPBB2IRBB[HeaderVPBB]); - if (VectorizedLoopID) - L->setLoopID(*VectorizedLoopID); - else { - // Keep all loop hints from the original loop on the vector loop (we'll - // replace the vectorizer-specific hints below). 
- if (MDNode *LID = OrigLoop->getLoopID()) - L->setLoopID(LID); - - LoopVectorizeHints Hints(L, true, *ORE); - Hints.setAlreadyVectorized(); + if (auto *R = + dyn_cast(BestVPlan.getEntry()->getSingleSuccessor())) { + VPBasicBlock *HeaderVPBB = R->getEntryBasicBlock(); + Loop *L = LI->getLoopFor(State.CFG.VPBB2IRBB[HeaderVPBB]); + if (VectorizedLoopID) + L->setLoopID(*VectorizedLoopID); + else { + // Keep all loop hints from the original loop on the vector loop (we'll + // replace the vectorizer-specific hints below). + if (MDNode *LID = OrigLoop->getLoopID()) + L->setLoopID(LID); + + LoopVectorizeHints Hints(L, true, *ORE); + Hints.setAlreadyVectorized(); + } + TargetTransformInfo::UnrollingPreferences UP; + TTI.getUnrollingPreferences(L, *PSE.getSE(), UP, ORE); + if (!UP.UnrollVectorizedLoop || CanonicalIVStartValue) + addRuntimeUnrollDisableMetaData(L); } - TargetTransformInfo::UnrollingPreferences UP; - TTI.getUnrollingPreferences(L, *PSE.getSE(), UP, ORE); - if (!UP.UnrollVectorizedLoop || CanonicalIVStartValue) - addRuntimeUnrollDisableMetaData(L); // 3. Fix the vectorized code: take care of header phi's, live-outs, // predication, updating analyses. @@ -7605,15 +7612,20 @@ LoopVectorizationPlanner::executePlan( ILV.printDebugTracesAtEnd(); // 4. Adjust branch weight of the branch in the middle block. - auto *MiddleTerm = - cast(State.CFG.VPBB2IRBB[ExitVPBB]->getTerminator()); - if (MiddleTerm->isConditional() && - hasBranchWeightMD(*OrigLoop->getLoopLatch()->getTerminator())) { - // Assume that `Count % VectorTripCount` is equally distributed. - unsigned TripCount = BestVPlan.getUF() * State.VF.getKnownMinValue(); - assert(TripCount > 0 && "trip count should not be zero"); - const uint32_t Weights[] = {1, TripCount - 1}; - setBranchWeights(*MiddleTerm, Weights, /*IsExpected=*/false); + if (auto *R = + dyn_cast(BestVPlan.getEntry()->getSingleSuccessor())) { + auto *ExitVPBB = cast(R->getSingleSuccessor()); + + auto *MiddleTerm = + cast(State.CFG.VPBB2IRBB[ExitVPBB]->getTerminator()); + if (MiddleTerm->isConditional() && + hasBranchWeightMD(*OrigLoop->getLoopLatch()->getTerminator())) { + // Assume that `Count % VectorTripCount` is equally distributed. 
+ unsigned TripCount = BestVPlan.getUF() * State.VF.getKnownMinValue(); + assert(TripCount > 0 && "trip count should not be zero"); + const uint32_t Weights[] = {1, TripCount - 1}; + setBranchWeights(*MiddleTerm, Weights, /*IsExpected=*/false); + } } return {State.ExpandedSCEVs, ReductionResumeValues}; @@ -9423,7 +9435,8 @@ void VPDerivedIVRecipe::execute(VPTransformState &State) { State.Builder, CanonicalIV, getStartValue()->getLiveInIRValue(), Step, Kind, cast_if_present(FPBinOp)); DerivedIV->setName("offset.idx"); - assert(DerivedIV != CanonicalIV && "IV didn't need transforming?"); + assert((isa(CanonicalIV) || DerivedIV != CanonicalIV) && + "IV didn't need transforming?"); State.set(this, DerivedIV, VPLane(0)); } diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp index 6ddbfcf0ecfe58..624aa9c6469498 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp @@ -224,9 +224,10 @@ VPBasicBlock::iterator VPBasicBlock::getFirstNonPhi() { VPTransformState::VPTransformState(ElementCount VF, unsigned UF, LoopInfo *LI, DominatorTree *DT, IRBuilderBase &Builder, - InnerLoopVectorizer *ILV, VPlan *Plan) + InnerLoopVectorizer *ILV, VPlan *Plan, + Type *CanonicalIVTy) : VF(VF), CFG(DT), LI(LI), Builder(Builder), ILV(ILV), Plan(Plan), - LVer(nullptr), TypeAnalysis(Plan->getCanonicalIV()->getScalarType()) {} + LVer(nullptr), TypeAnalysis(CanonicalIVTy) {} Value *VPTransformState::get(VPValue *Def, const VPLane &Lane) { if (Def->isLiveIn()) @@ -275,8 +276,8 @@ Value *VPTransformState::get(VPValue *Def, bool NeedsScalar) { // Place the code for broadcasting invariant variables in the new preheader. IRBuilder<>::InsertPointGuard Guard(Builder); if (SafeToHoist) { - BasicBlock *LoopVectorPreHeader = CFG.VPBB2IRBB[cast( - Plan->getVectorLoopRegion()->getSinglePredecessor())]; + BasicBlock *LoopVectorPreHeader = + CFG.VPBB2IRBB[cast(Plan->getEntry())]; if (LoopVectorPreHeader) Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator()); } @@ -417,6 +418,12 @@ VPBasicBlock::createEmptyBasicBlock(VPTransformState::CFGState &CFG) { PrevBB->getParent(), CFG.ExitBB); LLVM_DEBUG(dbgs() << "LV: created " << NewBB->getName() << '\n'); + connectToPredecessors(NewBB, CFG); + return NewBB; +} + +void VPBasicBlock::connectToPredecessors(BasicBlock *NewBB, + VPTransformState::CFGState &CFG) { // Hook up the new basic block to its predecessors. 
 for (VPBlockBase *PredVPBlock : getHierarchicalPredecessors()) {
   VPBasicBlock *PredVPBB = PredVPBlock->getExitingBasicBlock();
@@ -447,38 +454,14 @@ VPBasicBlock::createEmptyBasicBlock(VPTransformState::CFGState &CFG) {
     }
     CFG.DTU.applyUpdates({{DominatorTree::Insert, PredBB, NewBB}});
   }
-  return NewBB;
 }
-
 void VPIRBasicBlock::execute(VPTransformState *State) {
   assert(getHierarchicalSuccessors().size() <= 2 &&
          "VPIRBasicBlock can have at most two successors at the moment!");
   State->Builder.SetInsertPoint(getIRBasicBlock()->getTerminator());
   executeRecipes(State, getIRBasicBlock());
-  if (getSingleSuccessor()) {
-    assert(isa<UnreachableInst>(getIRBasicBlock()->getTerminator()));
-    auto *Br = State->Builder.CreateBr(getIRBasicBlock());
-    Br->setOperand(0, nullptr);
-    getIRBasicBlock()->getTerminator()->eraseFromParent();
-  }
-
-  for (VPBlockBase *PredVPBlock : getHierarchicalPredecessors()) {
-    VPBasicBlock *PredVPBB = PredVPBlock->getExitingBasicBlock();
-    BasicBlock *PredBB = State->CFG.VPBB2IRBB[PredVPBB];
-    assert(PredBB && "Predecessor basic-block not found building successor.");
-    LLVM_DEBUG(dbgs() << "LV: draw edge from" << PredBB->getName() << '\n');
-    auto *PredBBTerminator = PredBB->getTerminator();
-    auto *TermBr = cast<BranchInst>(PredBBTerminator);
-    // Set each forward successor here when it is created, excluding
-    // backedges. A backward successor is set when the branch is created.
-    const auto &PredVPSuccessors = PredVPBB->getHierarchicalSuccessors();
-    unsigned idx = PredVPSuccessors.front() == this ? 0 : 1;
-    assert(!TermBr->getSuccessor(idx) &&
-           "Trying to reset an existing successor block.");
-    TermBr->setSuccessor(idx, IRBB);
-    State->CFG.DTU.applyUpdates({{DominatorTree::Insert, PredBB, IRBB}});
-  }
+  connectToPredecessors(getIRBasicBlock(), State->CFG);
 }
 
 void VPBasicBlock::execute(VPTransformState *State) {
@@ -954,7 +937,6 @@ void VPlan::prepareToExecute(Value *TripCountV, Value *VectorTripCountV,
   IRBuilder<> Builder(State.CFG.PrevBB->getTerminator());
 
   // FIXME: Model VF * UF computation completely in VPlan.
-  assert(VFxUF.getNumUsers() && "VFxUF expected to always have users");
   unsigned UF = getUF();
   if (VF.getNumUsers()) {
     Value *RuntimeVF = getRuntimeVF(Builder, TCTy, State.VF);
@@ -1026,8 +1008,13 @@ void VPlan::execute(VPTransformState *State) {
   // skeleton creation, so we can only create the VPIRBasicBlocks now during
   // VPlan execution rather than earlier during VPlan construction.
   BasicBlock *MiddleBB = State->CFG.ExitBB;
-  VPBasicBlock *MiddleVPBB =
-      cast<VPBasicBlock>(getVectorLoopRegion()->getSingleSuccessor());
+  VPBlockBase *Leaf = nullptr;
+  for (VPBlockBase *VPB : vp_depth_first_shallow(getEntry()))
+    if (VPB->getNumSuccessors() == 0) {
+      Leaf = VPB;
+      break;
+    }
+  VPBasicBlock *MiddleVPBB = cast<VPBasicBlock>(Leaf->getSinglePredecessor());
   // Find the VPBB for the scalar preheader, relying on the current structure
   // when creating the middle block and its successors: if there's a single
   // predecessor, it must be the scalar preheader. Otherwise, the second
@@ -1055,53 +1042,59 @@ void VPlan::execute(VPTransformState *State) {
   for (VPBlockBase *Block : vp_depth_first_shallow(Entry))
     Block->execute(State);
 
-  VPBasicBlock *LatchVPBB = getVectorLoopRegion()->getExitingBasicBlock();
-  BasicBlock *VectorLatchBB = State->CFG.VPBB2IRBB[LatchVPBB];
-
-  // Fix the latch value of canonical, reduction and first-order recurrences
-  // phis in the vector loop.
-  VPBasicBlock *Header = getVectorLoopRegion()->getEntryBasicBlock();
-  for (VPRecipeBase &R : Header->phis()) {
-    // Skip phi-like recipes that generate their backedege values themselves.
-    if (isa<VPWidenPHIRecipe>(&R))
-      continue;
-
-    if (isa<VPWidenIntOrFpInductionRecipe>(&R) ||
-        isa<VPWidenPointerInductionRecipe>(&R)) {
-      PHINode *Phi = nullptr;
-      if (isa<VPWidenIntOrFpInductionRecipe>(&R)) {
-        Phi = cast<PHINode>(State->get(R.getVPSingleValue()));
-      } else {
-        auto *WidenPhi = cast<VPWidenPointerInductionRecipe>(&R);
-        assert(!WidenPhi->onlyScalarsGenerated(State->VF.isScalable()) &&
-               "recipe generating only scalars should have been replaced");
-        auto *GEP = cast<GetElementPtrInst>(State->get(WidenPhi));
-        Phi = cast<PHINode>(GEP->getPointerOperand());
-      }
-
-      Phi->setIncomingBlock(1, VectorLatchBB);
+  if (auto *LoopRegion =
+          dyn_cast<VPRegionBlock>(getEntry()->getSingleSuccessor())) {
+    VPBasicBlock *LatchVPBB = LoopRegion->getExitingBasicBlock();
+    BasicBlock *VectorLatchBB = State->CFG.VPBB2IRBB[LatchVPBB];
+
+    // Fix the latch value of canonical, reduction and first-order recurrences
+    // phis in the vector loop.
+    VPBasicBlock *Header = LoopRegion->getEntryBasicBlock();
+    for (VPRecipeBase &R : Header->phis()) {
+      // Skip phi-like recipes that generate their backedge values themselves.
+      if (isa<VPWidenPHIRecipe>(&R))
+        continue;
 
-      // Move the last step to the end of the latch block. This ensures
-      // consistent placement of all induction updates.
-      Instruction *Inc = cast<Instruction>(Phi->getIncomingValue(1));
-      Inc->moveBefore(VectorLatchBB->getTerminator()->getPrevNode());
+      if (isa<VPWidenIntOrFpInductionRecipe>(&R) ||
+          isa<VPWidenPointerInductionRecipe>(&R)) {
+        PHINode *Phi = nullptr;
+        if (isa<VPWidenIntOrFpInductionRecipe>(&R)) {
+          Phi = cast<PHINode>(State->get(R.getVPSingleValue()));
+        } else {
+          auto *WidenPhi = cast<VPWidenPointerInductionRecipe>(&R);
+          assert(!WidenPhi->onlyScalarsGenerated(State->VF.isScalable()) &&
+                 "recipe generating only scalars should have been replaced");
+          auto *GEP = cast<GetElementPtrInst>(State->get(WidenPhi));
+          Phi = cast<PHINode>(GEP->getPointerOperand());
+        }
+
+        Phi->setIncomingBlock(1, VectorLatchBB);
+
+        // Move the last step to the end of the latch block. This ensures
+        // consistent placement of all induction updates.
+        Instruction *Inc = cast<Instruction>(Phi->getIncomingValue(1));
+        Inc->moveBefore(VectorLatchBB->getTerminator()->getPrevNode());
+
+        // Use the steps for the last part as backedge value for the induction.
+        if (auto *IV = dyn_cast<VPWidenIntOrFpInductionRecipe>(&R))
+          Inc->setOperand(0, State->get(IV->getLastUnrolledPartOperand()));
+        continue;
+      }
 
-      // Use the steps for the last part as backedge value for the induction.
-      if (auto *IV = dyn_cast<VPWidenIntOrFpInductionRecipe>(&R))
-        Inc->setOperand(0, State->get(IV->getLastUnrolledPartOperand()));
-      continue;
+      // For canonical IV, first-order recurrences and in-order reduction phis,
+      // only a single part is generated, which provides the last part from the
+      // previous iteration. For non-ordered reductions all UF parts are
+      // generated.
+ auto *PhiR = cast(&R); + bool NeedsScalar = + isa(PhiR) || + (isa(PhiR) && + cast(PhiR)->isInLoop()); + Value *Phi = State->get(PhiR, NeedsScalar); + Value *Val = State->get(PhiR->getBackedgeValue(), NeedsScalar); + cast(Phi)->addIncoming(Val, VectorLatchBB); } - - auto *PhiR = cast(&R); - bool NeedsScalar = - isa(PhiR) || - (isa(PhiR) && - cast(PhiR)->isInLoop()); - Value *Phi = State->get(PhiR, NeedsScalar); - Value *Val = State->get(PhiR->getBackedgeValue(), NeedsScalar); - cast(Phi)->addIncoming(Val, VectorLatchBB); } - State->CFG.DTU.flush(); assert(State->CFG.DTU.getDomTree().verify( DominatorTree::VerificationLevel::Fast) && diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h index bbcfaf9e19cd0c..006c43990bc37f 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.h +++ b/llvm/lib/Transforms/Vectorize/VPlan.h @@ -236,7 +236,7 @@ class VPLane { struct VPTransformState { VPTransformState(ElementCount VF, unsigned UF, LoopInfo *LI, DominatorTree *DT, IRBuilderBase &Builder, - InnerLoopVectorizer *ILV, VPlan *Plan); + InnerLoopVectorizer *ILV, VPlan *Plan, Type *CanonicalIVTy); /// The chosen Vectorization Factor of the loop being vectorized. ElementCount VF; @@ -3237,6 +3237,8 @@ class VPBasicBlock : public VPBlockBase { protected: /// Execute the recipes in the IR basic block \p BB. void executeRecipes(VPTransformState *State, BasicBlock *BB); + void connectToPredecessors(BasicBlock *NewBB, + VPTransformState::CFGState &CFG); private: /// Create an IR BasicBlock to hold the output instructions generated by this @@ -3354,6 +3356,7 @@ class VPRegionBlock : public VPBlockBase { assert(!isReplicator() && "should only get pre-header of loop regions"); return getSinglePredecessor()->getExitingBasicBlock(); } + void clearEntry() { Entry = nullptr; } /// An indicator whether this region is to generate multiple replicated /// instances of output IR corresponding to its VPBlockBases. diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp index dacba152611c19..8a28149dd867ba 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp @@ -199,8 +199,9 @@ bool VPRecipeBase::mayHaveSideEffects() const { void VPLiveOut::fixPhi(VPlan &Plan, VPTransformState &State) { VPValue *ExitValue = getOperand(0); - VPBasicBlock *MiddleVPBB = - cast(Plan.getVectorLoopRegion()->getSingleSuccessor()); + auto *Region = dyn_cast(Plan.getEntry()->getSingleSuccessor()); + VPBasicBlock *MiddleVPBB = dyn_cast_or_null( + Region ? Region->getSingleSuccessor() : nullptr); VPRecipeBase *ExitingRecipe = ExitValue->getDefiningRecipe(); auto *ExitingVPBB = ExitingRecipe ? ExitingRecipe->getParent() : nullptr; // Values leaving the vector loop reach live out phi's in the exiting block @@ -2066,7 +2067,9 @@ void VPBranchOnMaskRecipe::execute(VPTransformState &State) { // Replace the temporary unreachable terminator with a new conditional branch, // whose two destinations will be set later when they are created. 
   auto *CurrentTerminator = State.CFG.PrevBB->getTerminator();
-  assert(isa<UnreachableInst>(CurrentTerminator) &&
+  assert((isa<UnreachableInst>(CurrentTerminator) ||
+          (isa<BranchInst>(CurrentTerminator) &&
+           !CurrentTerminator->getOperand(0))) &&
          "Expected to replace unreachable terminator with conditional branch.");
   auto *CondBr = BranchInst::Create(State.CFG.PrevBB, nullptr, ConditionBit);
   CondBr->setSuccessor(0, nullptr);
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index a878613c4ba483..6e1e5cd7b89d7c 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -695,16 +695,46 @@ void VPlanTransforms::optimizeForVFAndUF(VPlan &Plan, ElementCount BestVF,
       !SE.isKnownPredicate(CmpInst::ICMP_ULE, TripCount, C))
     return;
 
-  LLVMContext &Ctx = SE.getContext();
-  auto *BOC =
-      new VPInstruction(VPInstruction::BranchOnCond,
-                        {Plan.getOrAddLiveIn(ConstantInt::getTrue(Ctx))});
-
   SmallVector<VPValue *> PossiblyDead(Term->operands());
   Term->eraseFromParent();
+  VPBasicBlock *Header =
+      cast<VPBasicBlock>(Plan.getVectorLoopRegion()->getEntry());
+  if (all_of(Header->phis(), [](VPRecipeBase &R) {
+        return !isa(&R);
+      })) {
+    for (VPRecipeBase &R : make_early_inc_range(Header->phis())) {
+      auto *P = cast<VPHeaderPHIRecipe>(&R);
+      P->replaceAllUsesWith(P->getStartValue());
+      P->eraseFromParent();
+    }
+
+    VPBlockBase *Preheader = Plan.getVectorLoopRegion()->getSinglePredecessor();
+    auto HeaderSuccs = to_vector(Header->getSuccessors());
+    VPBasicBlock *Exiting =
+        cast<VPBasicBlock>(Plan.getVectorLoopRegion()->getExiting());
+
+    auto *LoopRegion = Plan.getVectorLoopRegion();
+    VPBlockBase *Middle = LoopRegion->getSingleSuccessor();
+    VPBlockUtils::disconnectBlocks(Preheader, LoopRegion);
+    VPBlockUtils::disconnectBlocks(LoopRegion, Middle);
+
+    Header->setParent(nullptr);
+    Exiting->setParent(nullptr);
+    VPBlockUtils::connectBlocks(Preheader, Header);
+
+    VPBlockUtils::connectBlocks(Exiting, Middle);
+    LoopRegion->clearEntry();
+    delete LoopRegion;
+  } else {
+    LLVMContext &Ctx = SE.getContext();
+    auto *BOC =
+        new VPInstruction(VPInstruction::BranchOnCond,
+                          {Plan.getOrAddLiveIn(ConstantInt::getTrue(Ctx))});
+
+    ExitingVPBB->appendRecipe(BOC);
+  }
   for (VPValue *Op : PossiblyDead)
     recursivelyDeleteDeadRecipes(Op);
-  ExitingVPBB->appendRecipe(BOC);
   Plan.setVF(BestVF);
   Plan.setUF(BestUF);
   // TODO: Further simplifications are possible
@@ -713,7 +743,7 @@ void VPlanTransforms::optimizeForVFAndUF(VPlan &Plan, ElementCount BestVF,
 }
 
 /// Sink users of \p FOR after the recipe defining the previous value \p
 /// Previous of the recurrence. \returns true if all users of \p FOR could be
 /// re-arranged as needed or false if it is not possible.
static bool sinkRecurrenceUsersAfterPrevious(VPFirstOrderRecurrencePHIRecipe *FOR, diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/call-costs.ll b/llvm/test/Transforms/LoopVectorize/AArch64/call-costs.ll index dcb8ba73661667..47886b28439ddb 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/call-costs.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/call-costs.ll @@ -78,17 +78,13 @@ define void @powi_call(ptr %P) { ; CHECK-NEXT: [[ENTRY:.*]]: ; CHECK-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] ; CHECK: [[VECTOR_PH]]: -; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] -; CHECK: [[VECTOR_BODY]]: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 -; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds double, ptr [[P]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds double, ptr [[P]], i64 0 ; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds double, ptr [[TMP1]], i32 0 ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x double>, ptr [[TMP2]], align 8 ; CHECK-NEXT: [[TMP3:%.*]] = call <2 x double> @llvm.powi.v2f64.i32(<2 x double> [[WIDE_LOAD]], i32 3) -; CHECK-NEXT: store <2 x double> [[TMP3]], ptr [[TMP2]], align 8 -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 -; CHECK-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds double, ptr [[TMP1]], i32 0 +; CHECK-NEXT: store <2 x double> [[TMP3]], ptr [[TMP4]], align 8 +; CHECK-NEXT: br label %[[MIDDLE_BLOCK:.*]] ; CHECK: [[MIDDLE_BLOCK]]: ; CHECK-NEXT: br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]] ; CHECK: [[SCALAR_PH]]: @@ -102,7 +98,7 @@ define void @powi_call(ptr %P) { ; CHECK-NEXT: store double [[POWI]], ptr [[GEP]], align 8 ; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 ; CHECK-NEXT: [[EC:%.*]] = icmp eq i64 [[IV]], 1 -; CHECK-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP5:![0-9]+]] +; CHECK-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP4:![0-9]+]] ; CHECK: [[EXIT]]: ; CHECK-NEXT: ret void ; @@ -233,6 +229,5 @@ declare i64 @llvm.fshl.i64(i64, i64, i64) ; CHECK: [[META1]] = !{!"llvm.loop.isvectorized", i32 1} ; CHECK: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"} ; CHECK: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]} -; CHECK: [[LOOP4]] = distinct !{[[LOOP4]], [[META1]], [[META2]]} -; CHECK: [[LOOP5]] = distinct !{[[LOOP5]], [[META2]], [[META1]]} +; CHECK: [[LOOP4]] = distinct !{[[LOOP4]], [[META2]], [[META1]]} ;. 
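With the loop region now optional, VPlan::execute can no longer fetch the middle block as the region's single successor; instead it walks the plan's top-level blocks to the first successor-less leaf and takes that leaf's single predecessor. Below is a minimal standalone sketch of that lookup; Block and findMiddleBlock are illustrative mock types, not LLVM's real VPBlockBase API:

    #include <cassert>
    #include <vector>

    // Mock stand-in for VPBlockBase; illustration only.
    struct Block {
      std::vector<Block *> Preds, Succs;
    };

    // Mirrors the new lookup in VPlan::execute: depth-first walk from the
    // plan entry to the first block without successors (the exit), whose
    // single predecessor is the middle block -- whether or not a vector
    // loop region still exists in between.
    Block *findMiddleBlock(Block *Entry) {
      std::vector<Block *> Worklist{Entry};
      while (!Worklist.empty()) {
        Block *B = Worklist.back();
        Worklist.pop_back();
        if (B->Succs.empty()) {
          assert(B->Preds.size() == 1 && "exit expected to have one pred");
          return B->Preds.front();
        }
        for (Block *S : B->Succs)
          Worklist.push_back(S);
      }
      return nullptr; // malformed plan: no leaf reachable
    }
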
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/low-trip-count.ll b/llvm/test/Transforms/LoopVectorize/RISCV/low-trip-count.ll index ec50b0cac03829..13ebfa82f1da62 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/low-trip-count.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/low-trip-count.ll @@ -49,28 +49,25 @@ define void @trip3_i8(ptr noalias nocapture noundef %dst, ptr noalias nocapture ; CHECK: vector.ph: ; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 2 -; CHECK-NEXT: [[TMP4:%.*]] = sub i64 [[TMP1]], 1 -; CHECK-NEXT: [[N_RND_UP:%.*]] = add i64 3, [[TMP4]] +; CHECK-NEXT: [[TMP2:%.*]] = sub i64 [[TMP1]], 1 +; CHECK-NEXT: [[N_RND_UP:%.*]] = add i64 3, [[TMP2]] ; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]] ; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] -; CHECK-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 2 -; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] -; CHECK: vector.body: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP7:%.*]] = add i64 [[INDEX]], 0 -; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[TMP7]], i64 3) -; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, ptr [[SRC:%.*]], i64 [[TMP7]] +; CHECK-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP4:%.*]] = mul i64 [[TMP3]], 2 +; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call @llvm.get.active.lane.mask.nxv2i1.i64(i64 0, i64 3) +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[SRC:%.*]], i64 0 +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr [[TMP5]], i32 0 +; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv2i8.p0(ptr [[TMP6]], i32 1, [[ACTIVE_LANE_MASK]], poison) +; CHECK-NEXT: [[TMP7:%.*]] = shl [[WIDE_MASKED_LOAD]], shufflevector ( insertelement ( poison, i8 1, i64 0), poison, zeroinitializer) +; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, ptr [[DST:%.*]], i64 0 ; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, ptr [[TMP8]], i32 0 -; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv2i8.p0(ptr [[TMP9]], i32 1, [[ACTIVE_LANE_MASK]], poison) -; CHECK-NEXT: [[TMP10:%.*]] = shl [[WIDE_MASKED_LOAD]], shufflevector ( insertelement ( poison, i8 1, i64 0), poison, zeroinitializer) -; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[DST:%.*]], i64 [[TMP7]] -; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[TMP11]], i32 0 -; CHECK-NEXT: [[WIDE_MASKED_LOAD1:%.*]] = call @llvm.masked.load.nxv2i8.p0(ptr [[TMP12]], i32 1, [[ACTIVE_LANE_MASK]], poison) -; CHECK-NEXT: [[TMP13:%.*]] = add [[TMP10]], [[WIDE_MASKED_LOAD1]] -; CHECK-NEXT: call void @llvm.masked.store.nxv2i8.p0( [[TMP13]], ptr [[TMP12]], i32 1, [[ACTIVE_LANE_MASK]]) -; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP6]] -; CHECK-NEXT: br i1 true, label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK-NEXT: [[WIDE_MASKED_LOAD1:%.*]] = call @llvm.masked.load.nxv2i8.p0(ptr [[TMP9]], i32 1, [[ACTIVE_LANE_MASK]], poison) +; CHECK-NEXT: [[TMP10:%.*]] = add [[TMP7]], [[WIDE_MASKED_LOAD1]] +; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[TMP8]], i32 0 +; CHECK-NEXT: call void @llvm.masked.store.nxv2i8.p0( [[TMP10]], ptr [[TMP11]], i32 1, [[ACTIVE_LANE_MASK]]) +; CHECK-NEXT: [[INDEX_NEXT:%.*]] = add i64 0, [[TMP4]] +; CHECK-NEXT: br label 
[[MIDDLE_BLOCK:%.*]] ; CHECK: middle.block: ; CHECK-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: @@ -79,15 +76,15 @@ define void @trip3_i8(ptr noalias nocapture noundef %dst, ptr noalias nocapture ; CHECK: for.body: ; CHECK-NEXT: [[I_08:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INC:%.*]], [[FOR_BODY]] ] ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 [[I_08]] -; CHECK-NEXT: [[TMP14:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 -; CHECK-NEXT: [[MUL:%.*]] = shl i8 [[TMP14]], 1 +; CHECK-NEXT: [[TMP12:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 +; CHECK-NEXT: [[MUL:%.*]] = shl i8 [[TMP12]], 1 ; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[DST]], i64 [[I_08]] -; CHECK-NEXT: [[TMP15:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1 -; CHECK-NEXT: [[ADD:%.*]] = add i8 [[MUL]], [[TMP15]] +; CHECK-NEXT: [[TMP13:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1 +; CHECK-NEXT: [[ADD:%.*]] = add i8 [[MUL]], [[TMP13]] ; CHECK-NEXT: store i8 [[ADD]], ptr [[ARRAYIDX1]], align 1 ; CHECK-NEXT: [[INC]] = add nuw nsw i64 [[I_08]], 1 ; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], 3 -; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] +; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: for.end: ; CHECK-NEXT: ret void ; @@ -118,28 +115,25 @@ define void @trip5_i8(ptr noalias nocapture noundef %dst, ptr noalias nocapture ; CHECK: vector.ph: ; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4 -; CHECK-NEXT: [[TMP4:%.*]] = sub i64 [[TMP1]], 1 -; CHECK-NEXT: [[N_RND_UP:%.*]] = add i64 5, [[TMP4]] +; CHECK-NEXT: [[TMP2:%.*]] = sub i64 [[TMP1]], 1 +; CHECK-NEXT: [[N_RND_UP:%.*]] = add i64 5, [[TMP2]] ; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]] ; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] -; CHECK-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 4 -; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] -; CHECK: vector.body: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP7:%.*]] = add i64 [[INDEX]], 0 -; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[TMP7]], i64 5) -; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, ptr [[SRC:%.*]], i64 [[TMP7]] +; CHECK-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP4:%.*]] = mul i64 [[TMP3]], 4 +; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 5) +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[SRC:%.*]], i64 0 +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr [[TMP5]], i32 0 +; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv4i8.p0(ptr [[TMP6]], i32 1, [[ACTIVE_LANE_MASK]], poison) +; CHECK-NEXT: [[TMP7:%.*]] = shl [[WIDE_MASKED_LOAD]], shufflevector ( insertelement ( poison, i8 1, i64 0), poison, zeroinitializer) +; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, ptr [[DST:%.*]], i64 0 ; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, ptr [[TMP8]], i32 0 -; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv4i8.p0(ptr [[TMP9]], i32 1, [[ACTIVE_LANE_MASK]], poison) -; CHECK-NEXT: [[TMP10:%.*]] = shl [[WIDE_MASKED_LOAD]], shufflevector ( insertelement ( poison, i8 1, i64 0), poison, 
zeroinitializer) -; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[DST:%.*]], i64 [[TMP7]] -; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[TMP11]], i32 0 -; CHECK-NEXT: [[WIDE_MASKED_LOAD1:%.*]] = call @llvm.masked.load.nxv4i8.p0(ptr [[TMP12]], i32 1, [[ACTIVE_LANE_MASK]], poison) -; CHECK-NEXT: [[TMP13:%.*]] = add [[TMP10]], [[WIDE_MASKED_LOAD1]] -; CHECK-NEXT: call void @llvm.masked.store.nxv4i8.p0( [[TMP13]], ptr [[TMP12]], i32 1, [[ACTIVE_LANE_MASK]]) -; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP6]] -; CHECK-NEXT: br i1 true, label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; CHECK-NEXT: [[WIDE_MASKED_LOAD1:%.*]] = call @llvm.masked.load.nxv4i8.p0(ptr [[TMP9]], i32 1, [[ACTIVE_LANE_MASK]], poison) +; CHECK-NEXT: [[TMP10:%.*]] = add [[TMP7]], [[WIDE_MASKED_LOAD1]] +; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[TMP8]], i32 0 +; CHECK-NEXT: call void @llvm.masked.store.nxv4i8.p0( [[TMP10]], ptr [[TMP11]], i32 1, [[ACTIVE_LANE_MASK]]) +; CHECK-NEXT: [[INDEX_NEXT:%.*]] = add i64 0, [[TMP4]] +; CHECK-NEXT: br label [[MIDDLE_BLOCK:%.*]] ; CHECK: middle.block: ; CHECK-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: @@ -148,15 +142,15 @@ define void @trip5_i8(ptr noalias nocapture noundef %dst, ptr noalias nocapture ; CHECK: for.body: ; CHECK-NEXT: [[I_08:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INC:%.*]], [[FOR_BODY]] ] ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 [[I_08]] -; CHECK-NEXT: [[TMP14:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 -; CHECK-NEXT: [[MUL:%.*]] = shl i8 [[TMP14]], 1 +; CHECK-NEXT: [[TMP12:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 +; CHECK-NEXT: [[MUL:%.*]] = shl i8 [[TMP12]], 1 ; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[DST]], i64 [[I_08]] -; CHECK-NEXT: [[TMP15:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1 -; CHECK-NEXT: [[ADD:%.*]] = add i8 [[MUL]], [[TMP15]] +; CHECK-NEXT: [[TMP13:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1 +; CHECK-NEXT: [[ADD:%.*]] = add i8 [[MUL]], [[TMP13]] ; CHECK-NEXT: store i8 [[ADD]], ptr [[ARRAYIDX1]], align 1 ; CHECK-NEXT: [[INC]] = add nuw nsw i64 [[I_08]], 1 ; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], 5 -; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] +; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] ; CHECK: for.end: ; CHECK-NEXT: ret void ; @@ -187,28 +181,25 @@ define void @trip8_i8(ptr noalias nocapture noundef %dst, ptr noalias nocapture ; CHECK: vector.ph: ; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4 -; CHECK-NEXT: [[TMP4:%.*]] = sub i64 [[TMP1]], 1 -; CHECK-NEXT: [[N_RND_UP:%.*]] = add i64 8, [[TMP4]] +; CHECK-NEXT: [[TMP2:%.*]] = sub i64 [[TMP1]], 1 +; CHECK-NEXT: [[N_RND_UP:%.*]] = add i64 8, [[TMP2]] ; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]] ; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] -; CHECK-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 4 -; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] -; CHECK: vector.body: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP7:%.*]] = add i64 [[INDEX]], 0 -; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[TMP7]], i64 8) -; 
CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, ptr [[SRC:%.*]], i64 [[TMP7]] +; CHECK-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP4:%.*]] = mul i64 [[TMP3]], 4 +; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 8) +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[SRC:%.*]], i64 0 +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr [[TMP5]], i32 0 +; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv4i8.p0(ptr [[TMP6]], i32 1, [[ACTIVE_LANE_MASK]], poison) +; CHECK-NEXT: [[TMP7:%.*]] = shl [[WIDE_MASKED_LOAD]], shufflevector ( insertelement ( poison, i8 1, i64 0), poison, zeroinitializer) +; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, ptr [[DST:%.*]], i64 0 ; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, ptr [[TMP8]], i32 0 -; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv4i8.p0(ptr [[TMP9]], i32 1, [[ACTIVE_LANE_MASK]], poison) -; CHECK-NEXT: [[TMP10:%.*]] = shl [[WIDE_MASKED_LOAD]], shufflevector ( insertelement ( poison, i8 1, i64 0), poison, zeroinitializer) -; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[DST:%.*]], i64 [[TMP7]] -; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[TMP11]], i32 0 -; CHECK-NEXT: [[WIDE_MASKED_LOAD1:%.*]] = call @llvm.masked.load.nxv4i8.p0(ptr [[TMP12]], i32 1, [[ACTIVE_LANE_MASK]], poison) -; CHECK-NEXT: [[TMP13:%.*]] = add [[TMP10]], [[WIDE_MASKED_LOAD1]] -; CHECK-NEXT: call void @llvm.masked.store.nxv4i8.p0( [[TMP13]], ptr [[TMP12]], i32 1, [[ACTIVE_LANE_MASK]]) -; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP6]] -; CHECK-NEXT: br i1 true, label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; CHECK-NEXT: [[WIDE_MASKED_LOAD1:%.*]] = call @llvm.masked.load.nxv4i8.p0(ptr [[TMP9]], i32 1, [[ACTIVE_LANE_MASK]], poison) +; CHECK-NEXT: [[TMP10:%.*]] = add [[TMP7]], [[WIDE_MASKED_LOAD1]] +; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[TMP8]], i32 0 +; CHECK-NEXT: call void @llvm.masked.store.nxv4i8.p0( [[TMP10]], ptr [[TMP11]], i32 1, [[ACTIVE_LANE_MASK]]) +; CHECK-NEXT: [[INDEX_NEXT:%.*]] = add i64 0, [[TMP4]] +; CHECK-NEXT: br label [[MIDDLE_BLOCK:%.*]] ; CHECK: middle.block: ; CHECK-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: @@ -217,15 +208,15 @@ define void @trip8_i8(ptr noalias nocapture noundef %dst, ptr noalias nocapture ; CHECK: for.body: ; CHECK-NEXT: [[I_08:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INC:%.*]], [[FOR_BODY]] ] ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 [[I_08]] -; CHECK-NEXT: [[TMP14:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 -; CHECK-NEXT: [[MUL:%.*]] = shl i8 [[TMP14]], 1 +; CHECK-NEXT: [[TMP12:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 +; CHECK-NEXT: [[MUL:%.*]] = shl i8 [[TMP12]], 1 ; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[DST]], i64 [[I_08]] -; CHECK-NEXT: [[TMP15:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1 -; CHECK-NEXT: [[ADD:%.*]] = add i8 [[MUL]], [[TMP15]] +; CHECK-NEXT: [[TMP13:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1 +; CHECK-NEXT: [[ADD:%.*]] = add i8 [[MUL]], [[TMP13]] ; CHECK-NEXT: store i8 [[ADD]], ptr [[ARRAYIDX1]], align 1 ; CHECK-NEXT: [[INC]] = add nuw nsw i64 [[I_08]], 1 ; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], 8 -; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] +; CHECK-NEXT: br i1 [[EXITCOND_NOT]], 
label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; CHECK: for.end: ; CHECK-NEXT: ret void ; @@ -254,21 +245,17 @@ define void @trip16_i8(ptr noalias nocapture noundef %dst, ptr noalias nocapture ; CHECK-NEXT: entry: ; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: -; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] -; CHECK: vector.body: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 -; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[SRC:%.*]], i64 [[TMP0]] -; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 0 -; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP2]], align 1 -; CHECK-NEXT: [[TMP3:%.*]] = shl <16 x i8> [[WIDE_LOAD]], -; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[DST:%.*]], i64 [[TMP0]] -; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[TMP4]], i32 0 -; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <16 x i8>, ptr [[TMP5]], align 1 -; CHECK-NEXT: [[TMP6:%.*]] = add <16 x i8> [[TMP3]], [[WIDE_LOAD1]] -; CHECK-NEXT: store <16 x i8> [[TMP6]], ptr [[TMP5]], align 1 -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 -; CHECK-NEXT: br i1 true, label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] +; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i8, ptr [[SRC:%.*]], i64 0 +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP1]], align 1 +; CHECK-NEXT: [[TMP2:%.*]] = shl <16 x i8> [[WIDE_LOAD]], +; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[DST:%.*]], i64 0 +; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[TMP3]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <16 x i8>, ptr [[TMP4]], align 1 +; CHECK-NEXT: [[TMP5:%.*]] = add <16 x i8> [[TMP2]], [[WIDE_LOAD1]] +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr [[TMP3]], i32 0 +; CHECK-NEXT: store <16 x i8> [[TMP5]], ptr [[TMP6]], align 1 +; CHECK-NEXT: br label [[MIDDLE_BLOCK:%.*]] ; CHECK: middle.block: ; CHECK-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: @@ -285,7 +272,7 @@ define void @trip16_i8(ptr noalias nocapture noundef %dst, ptr noalias nocapture ; CHECK-NEXT: store i8 [[ADD]], ptr [[ARRAYIDX1]], align 1 ; CHECK-NEXT: [[INC]] = add nuw nsw i64 [[I_08]], 1 ; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], 16 -; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]] +; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] ; CHECK: for.end: ; CHECK-NEXT: ret void ; @@ -315,21 +302,17 @@ define void @trip32_i8(ptr noalias nocapture noundef %dst, ptr noalias nocapture ; CHECK-NEXT: entry: ; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: -; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] -; CHECK: vector.body: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 -; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[SRC:%.*]], i64 [[TMP0]] -; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 0 -; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <32 x i8>, ptr [[TMP2]], align 1 -; CHECK-NEXT: [[TMP3:%.*]] = shl <32 x i8> [[WIDE_LOAD]], -; CHECK-NEXT: 
[[TMP4:%.*]] = getelementptr inbounds i8, ptr [[DST:%.*]], i64 [[TMP0]] -; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[TMP4]], i32 0 -; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <32 x i8>, ptr [[TMP5]], align 1 -; CHECK-NEXT: [[TMP6:%.*]] = add <32 x i8> [[TMP3]], [[WIDE_LOAD1]] -; CHECK-NEXT: store <32 x i8> [[TMP6]], ptr [[TMP5]], align 1 -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32 -; CHECK-NEXT: br i1 true, label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] +; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i8, ptr [[SRC:%.*]], i64 0 +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <32 x i8>, ptr [[TMP1]], align 1 +; CHECK-NEXT: [[TMP2:%.*]] = shl <32 x i8> [[WIDE_LOAD]], +; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[DST:%.*]], i64 0 +; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[TMP3]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <32 x i8>, ptr [[TMP4]], align 1 +; CHECK-NEXT: [[TMP5:%.*]] = add <32 x i8> [[TMP2]], [[WIDE_LOAD1]] +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr [[TMP3]], i32 0 +; CHECK-NEXT: store <32 x i8> [[TMP5]], ptr [[TMP6]], align 1 +; CHECK-NEXT: br label [[MIDDLE_BLOCK:%.*]] ; CHECK: middle.block: ; CHECK-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: @@ -346,7 +329,7 @@ define void @trip32_i8(ptr noalias nocapture noundef %dst, ptr noalias nocapture ; CHECK-NEXT: store i8 [[ADD]], ptr [[ARRAYIDX1]], align 1 ; CHECK-NEXT: [[INC]] = add nuw nsw i64 [[I_08]], 1 ; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], 32 -; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]] +; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] ; CHECK: for.end: ; CHECK-NEXT: ret void ; @@ -390,7 +373,7 @@ define void @trip24_i8(ptr noalias nocapture noundef %dst, ptr noalias nocapture ; CHECK-NEXT: store <8 x i8> [[TMP6]], ptr [[TMP5]], align 1 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 ; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], 24 -; CHECK-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: @@ -407,7 +390,7 @@ define void @trip24_i8(ptr noalias nocapture noundef %dst, ptr noalias nocapture ; CHECK-NEXT: store i8 [[ADD]], ptr [[ARRAYIDX1]], align 1 ; CHECK-NEXT: [[INC]] = add nuw nsw i64 [[I_08]], 1 ; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], 24 -; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]] +; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] ; CHECK: for.end: ; CHECK-NEXT: ret void ; diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/short-trip-count.ll b/llvm/test/Transforms/LoopVectorize/RISCV/short-trip-count.ll index bb716d78ca4119..b926633ea9a3df 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/short-trip-count.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/short-trip-count.ll @@ -12,18 +12,15 @@ define void @small_trip_count_min_vlen_128(ptr nocapture %a) nounwind vscale_ran ; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[N_RND_UP]], 
[[TMP0]] ; CHECK-NEXT: [[N_VEC:%.*]] = sub i32 [[N_RND_UP]], [[N_MOD_VF]] ; CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.vscale.i32() -; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] -; CHECK: vector.body: -; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP3:%.*]] = add i32 [[INDEX]], 0 -; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call @llvm.get.active.lane.mask.nxv1i1.i32(i32 [[TMP3]], i32 4) -; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i32 [[TMP3]] -; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 0 -; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv1i32.p0(ptr [[TMP5]], i32 4, [[ACTIVE_LANE_MASK]], poison) -; CHECK-NEXT: [[TMP6:%.*]] = add nsw [[WIDE_MASKED_LOAD]], shufflevector ( insertelement ( poison, i32 1, i64 0), poison, zeroinitializer) -; CHECK-NEXT: call void @llvm.masked.store.nxv1i32.p0( [[TMP6]], ptr [[TMP5]], i32 4, [[ACTIVE_LANE_MASK]]) -; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], [[TMP2]] -; CHECK-NEXT: br i1 true, label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call @llvm.get.active.lane.mask.nxv1i1.i32(i32 0, i32 4) +; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i32 0 +; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[TMP3]], i32 0 +; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv1i32.p0(ptr [[TMP4]], i32 4, [[ACTIVE_LANE_MASK]], poison) +; CHECK-NEXT: [[TMP5:%.*]] = add nsw [[WIDE_MASKED_LOAD]], shufflevector ( insertelement ( poison, i32 1, i64 0), poison, zeroinitializer) +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[TMP3]], i32 0 +; CHECK-NEXT: call void @llvm.masked.store.nxv1i32.p0( [[TMP5]], ptr [[TMP6]], i32 4, [[ACTIVE_LANE_MASK]]) +; CHECK-NEXT: [[INDEX_NEXT:%.*]] = add i32 0, [[TMP2]] +; CHECK-NEXT: br label [[MIDDLE_BLOCK:%.*]] ; CHECK: middle.block: ; CHECK-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: @@ -37,7 +34,7 @@ define void @small_trip_count_min_vlen_128(ptr nocapture %a) nounwind vscale_ran ; CHECK-NEXT: store i32 [[ADD]], ptr [[GEP]], align 4 ; CHECK-NEXT: [[IV_NEXT]] = add i32 [[IV]], 1 ; CHECK-NEXT: [[COND:%.*]] = icmp eq i32 [[IV]], 3 -; CHECK-NEXT: br i1 [[COND]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP3:![0-9]+]] +; CHECK-NEXT: br i1 [[COND]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: exit: ; CHECK-NEXT: ret void ; @@ -71,18 +68,15 @@ define void @small_trip_count_min_vlen_32(ptr nocapture %a) nounwind vscale_rang ; CHECK-NEXT: [[N_VEC:%.*]] = sub i32 [[N_RND_UP]], [[N_MOD_VF]] ; CHECK-NEXT: [[TMP3:%.*]] = call i32 @llvm.vscale.i32() ; CHECK-NEXT: [[TMP4:%.*]] = mul i32 [[TMP3]], 4 -; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] -; CHECK: vector.body: -; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP5:%.*]] = add i32 [[INDEX]], 0 -; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i32(i32 [[TMP5]], i32 4) -; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i32 [[TMP5]] -; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[TMP6]], i32 0 -; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv4i32.p0(ptr [[TMP7]], i32 4, [[ACTIVE_LANE_MASK]], poison) -; CHECK-NEXT: [[TMP8:%.*]] = add nsw [[WIDE_MASKED_LOAD]], shufflevector ( insertelement ( poison, i32 1, 
i64 0), poison, zeroinitializer) -; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0( [[TMP8]], ptr [[TMP7]], i32 4, [[ACTIVE_LANE_MASK]]) -; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], [[TMP4]] -; CHECK-NEXT: br i1 true, label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i32(i32 0, i32 4) +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i32 0 +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[TMP5]], i32 0 +; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv4i32.p0(ptr [[TMP6]], i32 4, [[ACTIVE_LANE_MASK]], poison) +; CHECK-NEXT: [[TMP7:%.*]] = add nsw [[WIDE_MASKED_LOAD]], shufflevector ( insertelement ( poison, i32 1, i64 0), poison, zeroinitializer) +; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[TMP5]], i32 0 +; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0( [[TMP7]], ptr [[TMP8]], i32 4, [[ACTIVE_LANE_MASK]]) +; CHECK-NEXT: [[INDEX_NEXT:%.*]] = add i32 0, [[TMP4]] +; CHECK-NEXT: br label [[MIDDLE_BLOCK:%.*]] ; CHECK: middle.block: ; CHECK-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: @@ -96,7 +90,7 @@ define void @small_trip_count_min_vlen_32(ptr nocapture %a) nounwind vscale_rang ; CHECK-NEXT: store i32 [[ADD]], ptr [[GEP]], align 4 ; CHECK-NEXT: [[IV_NEXT]] = add i32 [[IV]], 1 ; CHECK-NEXT: [[COND:%.*]] = icmp eq i32 [[IV]], 3 -; CHECK-NEXT: br i1 [[COND]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP5:![0-9]+]] +; CHECK-NEXT: br i1 [[COND]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP3:![0-9]+]] ; CHECK: exit: ; CHECK-NEXT: ret void ; diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/truncate-to-minimal-bitwidth-cost.ll b/llvm/test/Transforms/LoopVectorize/RISCV/truncate-to-minimal-bitwidth-cost.ll index fa13cd8f19ae56..6eface402f6df6 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/truncate-to-minimal-bitwidth-cost.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/truncate-to-minimal-bitwidth-cost.ll @@ -163,20 +163,12 @@ define void @truncate_to_i1_used_by_branch(i8 %x, ptr %dst) #0 { ; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i8> [[BROADCAST_SPLATINSERT]], <2 x i8> poison, <2 x i32> zeroinitializer ; CHECK-NEXT: [[TMP0:%.*]] = trunc <2 x i8> [[BROADCAST_SPLAT]] to <2 x i1> ; CHECK-NEXT: [[TMP2:%.*]] = or <2 x i1> , [[TMP0]] +; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <2 x i1> @llvm.get.active.lane.mask.v2i1.i32(i32 0, i32 2) +; CHECK-NEXT: [[TMP3:%.*]] = select <2 x i1> [[ACTIVE_LANE_MASK]], <2 x i1> [[TMP2]], <2 x i1> zeroinitializer ; CHECK-NEXT: [[BROADCAST_SPLATINSERT3:%.*]] = insertelement <2 x ptr> poison, ptr [[DST]], i64 0 ; CHECK-NEXT: [[BROADCAST_SPLAT4:%.*]] = shufflevector <2 x ptr> [[BROADCAST_SPLATINSERT3]], <2 x ptr> poison, <2 x i32> zeroinitializer -; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] -; CHECK: [[VECTOR_BODY]]: -; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] -; CHECK-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <2 x i32> poison, i32 [[INDEX]], i64 0 -; CHECK-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <2 x i32> [[BROADCAST_SPLATINSERT1]], <2 x i32> poison, <2 x i32> zeroinitializer -; CHECK-NEXT: [[VEC_IV:%.*]] = add <2 x i32> [[BROADCAST_SPLAT2]], -; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i32> [[VEC_IV]], i32 0 -; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <2 x i1> @llvm.get.active.lane.mask.v2i1.i32(i32 [[TMP1]], i32 2) 
-; CHECK-NEXT: [[TMP3:%.*]] = select <2 x i1> [[ACTIVE_LANE_MASK]], <2 x i1> [[TMP2]], <2 x i1> zeroinitializer ; CHECK-NEXT: call void @llvm.masked.scatter.v2i8.v2p0(<2 x i8> zeroinitializer, <2 x ptr> [[BROADCAST_SPLAT4]], i32 1, <2 x i1> [[TMP3]]) -; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 2 -; CHECK-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; CHECK-NEXT: br label %[[MIDDLE_BLOCK:.*]] ; CHECK: [[MIDDLE_BLOCK]]: ; CHECK-NEXT: br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]] ; CHECK: [[SCALAR_PH]]: @@ -194,7 +186,7 @@ define void @truncate_to_i1_used_by_branch(i8 %x, ptr %dst) #0 { ; CHECK-NEXT: [[ADD]] = add i8 [[F_039]], 1 ; CHECK-NEXT: [[CONV:%.*]] = sext i8 [[F_039]] to i32 ; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[CONV]], 1 -; CHECK-NEXT: br i1 [[CMP]], label %[[LOOP_HEADER]], label %[[EXIT]], !llvm.loop [[LOOP7:![0-9]+]] +; CHECK-NEXT: br i1 [[CMP]], label %[[LOOP_HEADER]], label %[[EXIT]], !llvm.loop [[LOOP6:![0-9]+]] ; CHECK: [[EXIT]]: ; CHECK-NEXT: ret void ; @@ -293,7 +285,7 @@ define void @icmp_only_first_op_truncated(ptr noalias %dst, i32 %x, i64 %N, i64 ; CHECK-NEXT: call void @llvm.masked.scatter.nxv2f64.nxv2p0( [[WIDE_MASKED_GATHER]], [[BROADCAST_SPLAT6]], i32 8, [[TMP8]]) ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP6]] ; CHECK-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP11]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP11]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] ; CHECK: [[MIDDLE_BLOCK]]: ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]] @@ -314,7 +306,7 @@ define void @icmp_only_first_op_truncated(ptr noalias %dst, i32 %x, i64 %N, i64 ; CHECK: [[LOOP_LATCH]]: ; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 ; CHECK-NEXT: [[EC:%.*]] = icmp eq i64 [[IV]], [[V]] -; CHECK-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP_HEADER]], !llvm.loop [[LOOP9:![0-9]+]] +; CHECK-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP_HEADER]], !llvm.loop [[LOOP8:![0-9]+]] ; CHECK: [[EXIT]]: ; CHECK-NEXT: ret void ; @@ -354,8 +346,7 @@ attributes #1 = { "target-features"="+64bit,+v" } ; CHECK: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]} ; CHECK: [[LOOP4]] = distinct !{[[LOOP4]], [[META1]], [[META2]]} ; CHECK: [[LOOP5]] = distinct !{[[LOOP5]], [[META2]], [[META1]]} -; CHECK: [[LOOP6]] = distinct !{[[LOOP6]], [[META1]], [[META2]]} -; CHECK: [[LOOP7]] = distinct !{[[LOOP7]], [[META2]], [[META1]]} -; CHECK: [[LOOP8]] = distinct !{[[LOOP8]], [[META1]], [[META2]]} -; CHECK: [[LOOP9]] = distinct !{[[LOOP9]], [[META2]], [[META1]]} +; CHECK: [[LOOP6]] = distinct !{[[LOOP6]], [[META2]], [[META1]]} +; CHECK: [[LOOP7]] = distinct !{[[LOOP7]], [[META1]], [[META2]]} +; CHECK: [[LOOP8]] = distinct !{[[LOOP8]], [[META2]], [[META1]]} ;. 
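The region-dissolution branch added to optimizeForVFAndUF amounts to rewiring four CFG edges and deleting the region wrapper. A simplified standalone sketch follows; Node, connect, and disconnect are hypothetical stand-ins for VPBlockBase and VPBlockUtils::connectBlocks/disconnectBlocks, not the real API:

    #include <algorithm>
    #include <vector>

    // Mock stand-in for VPBlockBase; illustration only.
    struct Node {
      std::vector<Node *> Preds, Succs;
    };

    static void disconnect(Node *From, Node *To) {
      From->Succs.erase(std::find(From->Succs.begin(), From->Succs.end(), To));
      To->Preds.erase(std::find(To->Preds.begin(), To->Preds.end(), From));
    }

    static void connect(Node *From, Node *To) {
      From->Succs.push_back(To);
      To->Preds.push_back(From);
    }

    // preheader -> region -> middle becomes preheader -> header ... and
    // exiting -> middle; the caller then deletes the now-unreferenced
    // region wrapper, as the patch does via clearEntry() + delete.
    void dissolveRegion(Node *Preheader, Node *Region, Node *Header,
                        Node *Exiting, Node *Middle) {
      disconnect(Preheader, Region);
      disconnect(Region, Middle);
      connect(Preheader, Header);
      connect(Exiting, Middle);
    }
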
diff --git a/llvm/test/Transforms/LoopVectorize/SystemZ/scalar-steps-with-users-demanding-all-lanes-and-first-lane-only.ll b/llvm/test/Transforms/LoopVectorize/SystemZ/scalar-steps-with-users-demanding-all-lanes-and-first-lane-only.ll
index fcf1ba072a62cb..c9529703b1ad60 100644
--- a/llvm/test/Transforms/LoopVectorize/SystemZ/scalar-steps-with-users-demanding-all-lanes-and-first-lane-only.ll
+++ b/llvm/test/Transforms/LoopVectorize/SystemZ/scalar-steps-with-users-demanding-all-lanes-and-first-lane-only.ll
@@ -14,34 +14,26 @@ define void @test_scalar_iv_steps_used_by_replicate_and_first_lane_only_vpinst(p
 ; CHECK-NEXT: [[ENTRY:.*]]:
 ; CHECK-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
 ; CHECK: [[VECTOR_PH]]:
-; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
-; CHECK: [[VECTOR_BODY]]:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[PRED_STORE_CONTINUE6:.*]] ]
-; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 1
-; CHECK-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 2
-; CHECK-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 3
-; CHECK-NEXT: [[TMP4:%.*]] = mul nsw i64 [[TMP0]], 4
-; CHECK-NEXT: [[TMP5:%.*]] = mul nsw i64 [[TMP1]], 4
-; CHECK-NEXT: [[TMP6:%.*]] = mul nsw i64 [[TMP2]], 4
-; CHECK-NEXT: [[TMP7:%.*]] = mul nsw i64 [[TMP3]], 4
+; CHECK-NEXT: [[TMP4:%.*]] = mul nsw i64 0, 4
+; CHECK-NEXT: [[TMP1:%.*]] = mul nsw i64 0, 4
+; CHECK-NEXT: [[TMP2:%.*]] = mul nsw i64 0, 4
+; CHECK-NEXT: [[TMP3:%.*]] = mul nsw i64 0, 4
 ; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, ptr [[SRC_1]], i64 [[TMP4]]
-; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, ptr [[SRC_1]], i64 [[TMP5]]
-; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, ptr [[SRC_1]], i64 [[TMP6]]
-; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[SRC_1]], i64 [[TMP7]]
+; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[SRC_1]], i64 [[TMP4]]
+; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr [[SRC_1]], i64 [[TMP4]]
+; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[SRC_1]], i64 [[TMP4]]
 ; CHECK-NEXT: [[TMP12:%.*]] = load i8, ptr [[TMP8]], align 1
-; CHECK-NEXT: [[TMP13:%.*]] = load i8, ptr [[TMP9]], align 1
-; CHECK-NEXT: [[TMP14:%.*]] = load i8, ptr [[TMP10]], align 1
-; CHECK-NEXT: [[TMP15:%.*]] = load i8, ptr [[TMP11]], align 1
-; CHECK-NEXT: [[TMP16:%.*]] = insertelement <4 x i8> poison, i8 [[TMP12]], i32 0
-; CHECK-NEXT: [[TMP17:%.*]] = insertelement <4 x i8> [[TMP16]], i8 [[TMP13]], i32 1
-; CHECK-NEXT: [[TMP18:%.*]] = insertelement <4 x i8> [[TMP17]], i8 [[TMP14]], i32 2
-; CHECK-NEXT: [[TMP19:%.*]] = insertelement <4 x i8> [[TMP18]], i8 [[TMP15]], i32 3
+; CHECK-NEXT: [[TMP9:%.*]] = load i8, ptr [[TMP8]], align 1
+; CHECK-NEXT: [[TMP10:%.*]] = load i8, ptr [[TMP8]], align 1
+; CHECK-NEXT: [[TMP11:%.*]] = load i8, ptr [[TMP8]], align 1
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i8> poison, i8 [[TMP12]], i64 0
+; CHECK-NEXT: [[TMP19:%.*]] = shufflevector <4 x i8> [[BROADCAST_SPLATINSERT]], <4 x i8> poison, <4 x i32> zeroinitializer
 ; CHECK-NEXT: [[TMP20:%.*]] = icmp eq <4 x i8> [[TMP19]], zeroinitializer
-; CHECK-NEXT: [[TMP21:%.*]] = add i64 [[TMP0]], 4
-; CHECK-NEXT: [[TMP22:%.*]] = getelementptr [8 x i32], ptr @src, i64 0, i64 [[TMP21]]
+; CHECK-NEXT: [[TMP22:%.*]] = getelementptr [8 x i32], ptr @src, i64 0, i64 4
 ; CHECK-NEXT: [[TMP23:%.*]] = getelementptr i32, ptr [[TMP22]], i32 0
 ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP23]], align 4
+; CHECK-NEXT: br label %[[PRED_STORE_ENTRY:.*]]
+; CHECK: [[PRED_STORE_ENTRY]]:
 ; CHECK-NEXT: [[TMP24:%.*]] = extractelement <4 x i1> [[TMP20]], i32 0
 ; CHECK-NEXT: br i1 [[TMP24]], label %[[PRED_STORE_IF:.*]], label %[[PRED_STORE_CONTINUE:.*]]
 ; CHECK: [[PRED_STORE_IF]]:
@@ -52,26 +44,27 @@ define void @test_scalar_iv_steps_used_by_replicate_and_first_lane_only_vpinst(p
 ; CHECK-NEXT: [[TMP26:%.*]] = extractelement <4 x i1> [[TMP20]], i32 1
 ; CHECK-NEXT: br i1 [[TMP26]], label %[[PRED_STORE_IF1:.*]], label %[[PRED_STORE_CONTINUE2:.*]]
 ; CHECK: [[PRED_STORE_IF1]]:
-; CHECK-NEXT: [[TMP27:%.*]] = extractelement <4 x i32> [[WIDE_LOAD]], i32 1
+; CHECK-NEXT: [[TMP27:%.*]] = extractelement <4 x i32> [[WIDE_LOAD]], i32 0
 ; CHECK-NEXT: store i32 [[TMP27]], ptr [[DST]], align 4
 ; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE2]]
 ; CHECK: [[PRED_STORE_CONTINUE2]]:
 ; CHECK-NEXT: [[TMP28:%.*]] = extractelement <4 x i1> [[TMP20]], i32 2
 ; CHECK-NEXT: br i1 [[TMP28]], label %[[PRED_STORE_IF3:.*]], label %[[PRED_STORE_CONTINUE4:.*]]
 ; CHECK: [[PRED_STORE_IF3]]:
-; CHECK-NEXT: [[TMP29:%.*]] = extractelement <4 x i32> [[WIDE_LOAD]], i32 2
+; CHECK-NEXT: [[TMP29:%.*]] = extractelement <4 x i32> [[WIDE_LOAD]], i32 0
 ; CHECK-NEXT: store i32 [[TMP29]], ptr [[DST]], align 4
 ; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE4]]
 ; CHECK: [[PRED_STORE_CONTINUE4]]:
 ; CHECK-NEXT: [[TMP30:%.*]] = extractelement <4 x i1> [[TMP20]], i32 3
-; CHECK-NEXT: br i1 [[TMP30]], label %[[PRED_STORE_IF5:.*]], label %[[PRED_STORE_CONTINUE6]]
+; CHECK-NEXT: br i1 [[TMP30]], label %[[PRED_STORE_IF5:.*]], label %[[PRED_STORE_CONTINUE6:.*]]
 ; CHECK: [[PRED_STORE_IF5]]:
-; CHECK-NEXT: [[TMP31:%.*]] = extractelement <4 x i32> [[WIDE_LOAD]], i32 3
+; CHECK-NEXT: [[TMP31:%.*]] = extractelement <4 x i32> [[WIDE_LOAD]], i32 0
 ; CHECK-NEXT: store i32 [[TMP31]], ptr [[DST]], align 4
 ; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE6]]
 ; CHECK: [[PRED_STORE_CONTINUE6]]:
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK-NEXT: br label %[[THEN_0:.*]]
+; CHECK: [[THEN_0]]:
+; CHECK-NEXT: br label %[[MIDDLE_BLOCK:.*]]
 ; CHECK: [[MIDDLE_BLOCK]]:
 ; CHECK-NEXT: br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]]
 ; CHECK: [[SCALAR_PH]]:
@@ -93,7 +86,7 @@ define void @test_scalar_iv_steps_used_by_replicate_and_first_lane_only_vpinst(p
 ; CHECK: [[LOOP_LATCH]]:
 ; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
 ; CHECK-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], 4
-; CHECK-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP_HEADER]], !llvm.loop [[LOOP3:![0-9]+]]
+; CHECK-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP_HEADER]], !llvm.loop [[LOOP0:![0-9]+]]
 ; CHECK: [[EXIT]]:
 ; CHECK-NEXT: ret void
 ;
@@ -125,7 +118,6 @@ exit:
 }
 ;.
 ; CHECK: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
-; CHECK: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
-; CHECK: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"}
-; CHECK: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]}
+; CHECK: [[META1]] = !{!"llvm.loop.unroll.runtime.disable"}
+; CHECK: [[META2]] = !{!"llvm.loop.isvectorized", i32 1}
 ;.
diff --git a/llvm/test/Transforms/LoopVectorize/X86/constant-fold.ll b/llvm/test/Transforms/LoopVectorize/X86/constant-fold.ll
index 61cae9c1b3f5dc..1e6e38632701ff 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/constant-fold.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/constant-fold.ll
@@ -15,17 +15,11 @@ define void @f1() {
 ; CHECK-NEXT: bb1:
 ; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; CHECK: vector.ph:
-; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
-; CHECK: vector.body:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[OFFSET_IDX:%.*]] = trunc i32 [[INDEX]] to i16
-; CHECK-NEXT: [[TMP0:%.*]] = add i16 [[OFFSET_IDX]], 0
-; CHECK-NEXT: [[TMP1:%.*]] = sext i16 [[TMP0]] to i64
-; CHECK-NEXT: [[TMP2:%.*]] = getelementptr [2 x ptr], ptr @b, i16 0, i64 [[TMP1]]
-; CHECK-NEXT: [[TMP3:%.*]] = getelementptr ptr, ptr [[TMP2]], i32 0
-; CHECK-NEXT: store <2 x ptr> , ptr [[TMP3]], align 8
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2
-; CHECK-NEXT: br i1 true, label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK-NEXT: [[TMP0:%.*]] = sext i16 0 to i64
+; CHECK-NEXT: [[TMP1:%.*]] = getelementptr [2 x ptr], ptr @b, i16 0, i64 [[TMP0]]
+; CHECK-NEXT: [[TMP2:%.*]] = getelementptr ptr, ptr [[TMP1]], i32 0
+; CHECK-NEXT: store <2 x ptr> , ptr [[TMP2]], align 8
+; CHECK-NEXT: br label [[MIDDLE_BLOCK:%.*]]
 ; CHECK: middle.block:
 ; CHECK-NEXT: br i1 true, label [[BB3:%.*]], label [[SCALAR_PH]]
 ; CHECK: scalar.ph:
@@ -40,7 +34,7 @@ define void @f1() {
 ; CHECK-NEXT: store ptr [[_TMP2]], ptr [[_TMP7]], align 8
 ; CHECK-NEXT: [[_TMP9]] = add nsw i16 [[C_1_0]], 1
 ; CHECK-NEXT: [[_TMP11:%.*]] = icmp slt i16 [[_TMP9]], 2
-; CHECK-NEXT: br i1 [[_TMP11]], label [[BB2]], label [[BB3]], !llvm.loop [[LOOP2:![0-9]+]]
+; CHECK-NEXT: br i1 [[_TMP11]], label [[BB2]], label [[BB3]], !llvm.loop [[LOOP0:![0-9]+]]
 ; CHECK: bb3:
 ; CHECK-NEXT: ret void
 ;
diff --git a/llvm/test/Transforms/LoopVectorize/X86/pr34438.ll b/llvm/test/Transforms/LoopVectorize/X86/pr34438.ll
index cc60359af2f8c5..47662d155b5453 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/pr34438.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/pr34438.ll
@@ -14,20 +14,16 @@ define void @small_tc(ptr noalias nocapture %A, ptr noalias nocapture readonly %
 ; CHECK-NEXT: entry:
 ; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; CHECK: vector.ph:
-; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
-; CHECK: vector.body:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds float, ptr [[B:%.*]], i64 [[TMP0]]
-; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <8 x float>, ptr [[TMP2]], align 4, !llvm.access.group [[ACC_GRP0:![0-9]+]]
-; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i64 [[TMP0]]
-; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, ptr [[TMP3]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <8 x float>, ptr [[TMP4]], align 4, !llvm.access.group [[ACC_GRP0]]
-; CHECK-NEXT: [[TMP5:%.*]] = fadd fast <8 x float> [[WIDE_LOAD]], [[WIDE_LOAD1]]
-; CHECK-NEXT: store <8 x float> [[TMP5]], ptr [[TMP4]], align 4, !llvm.access.group [[ACC_GRP0]]
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
-; CHECK-NEXT: br i1 true, label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP1:![0-9]+]]
+; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds float, ptr [[B:%.*]], i64 0
+; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds float, ptr [[TMP0]], i32 0
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <8 x float>, ptr [[TMP1]], align 4, !llvm.access.group [[ACC_GRP0:![0-9]+]]
+; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i64 0
+; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds float, ptr [[TMP2]], i32 0
+; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <8 x float>, ptr [[TMP3]], align 4, !llvm.access.group [[ACC_GRP0]]
+; CHECK-NEXT: [[TMP4:%.*]] = fadd fast <8 x float> [[WIDE_LOAD]], [[WIDE_LOAD1]]
+; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds float, ptr [[TMP2]], i32 0
+; CHECK-NEXT: store <8 x float> [[TMP4]], ptr [[TMP5]], align 4, !llvm.access.group [[ACC_GRP0]]
+; CHECK-NEXT: br label [[MIDDLE_BLOCK:%.*]]
 ; CHECK: middle.block:
 ; CHECK-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
 ; CHECK: scalar.ph:
@@ -43,7 +39,7 @@ define void @small_tc(ptr noalias nocapture %A, ptr noalias nocapture readonly %
 ; CHECK-NEXT: store float [[ADD]], ptr [[ARRAYIDX2]], align 4, !llvm.access.group [[ACC_GRP0]]
 ; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
 ; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 8
-; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
+; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP1:![0-9]+]]
 ; CHECK: for.end:
 ; CHECK-NEXT: ret void
 ;
diff --git a/llvm/test/Transforms/LoopVectorize/first-order-recurrence.ll b/llvm/test/Transforms/LoopVectorize/first-order-recurrence.ll
index c4e3e0b8c5a369..cd91eb9757c0a9 100644
--- a/llvm/test/Transforms/LoopVectorize/first-order-recurrence.ll
+++ b/llvm/test/Transforms/LoopVectorize/first-order-recurrence.ll
@@ -3705,15 +3705,8 @@ define i32 @recurence_uniform_load(ptr %src, ptr noalias %dst) {
 ; UNROLL-NO-IC-NEXT: entry:
 ; UNROLL-NO-IC-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; UNROLL-NO-IC: vector.ph:
-; UNROLL-NO-IC-NEXT: br label [[VECTOR_BODY:%.*]]
-; UNROLL-NO-IC: vector.body:
-; UNROLL-NO-IC-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; UNROLL-NO-IC-NEXT: [[VECTOR_RECUR:%.*]] = phi <4 x i32> [ , [[VECTOR_PH]] ], [ [[BROADCAST_SPLAT:%.*]], [[VECTOR_BODY]] ]
 ; UNROLL-NO-IC-NEXT: [[TMP0:%.*]] = load i32, ptr [[SRC:%.*]], align 4
-; UNROLL-NO-IC-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[TMP0]], i64 0
-; UNROLL-NO-IC-NEXT: [[BROADCAST_SPLAT]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer
-; UNROLL-NO-IC-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 8
-; UNROLL-NO-IC-NEXT: br i1 true, label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP38:![0-9]+]]
+; UNROLL-NO-IC-NEXT: br label [[MIDDLE_BLOCK:%.*]]
 ; UNROLL-NO-IC: middle.block:
 ; UNROLL-NO-IC-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
 ; UNROLL-NO-IC: scalar.ph:
@@ -3726,7 +3719,7 @@ define i32 @recurence_uniform_load(ptr %src, ptr noalias %dst) {
 ; UNROLL-NO-IC-NEXT: [[ADD]] = add i64 [[PHI]], 1
 ; UNROLL-NO-IC-NEXT: [[LOAD]] = load i32, ptr [[SRC]], align 4
 ; UNROLL-NO-IC-NEXT: [[ICMP:%.*]] = icmp ult i64 [[PHI]], 1
-; UNROLL-NO-IC-NEXT: br i1 [[ICMP]], label [[LOOP]], label [[EXIT]], !llvm.loop [[LOOP39:![0-9]+]]
+; UNROLL-NO-IC-NEXT: br i1 [[ICMP]], label [[LOOP]], label [[EXIT]], !llvm.loop [[LOOP38:![0-9]+]]
 ; UNROLL-NO-IC: exit:
 ; UNROLL-NO-IC-NEXT: ret i32 0
 ;
@@ -3734,13 +3727,8 @@ define i32 @recurence_uniform_load(ptr %src, ptr noalias %dst) {
 ; UNROLL-NO-VF-NEXT: entry:
 ; UNROLL-NO-VF-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; UNROLL-NO-VF: vector.ph:
-; UNROLL-NO-VF-NEXT: br label [[VECTOR_BODY:%.*]]
-; UNROLL-NO-VF: vector.body:
-; UNROLL-NO-VF-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; UNROLL-NO-VF-NEXT: [[VECTOR_RECUR:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[TMP0:%.*]], [[VECTOR_BODY]] ]
-; UNROLL-NO-VF-NEXT: [[TMP0]] = load i32, ptr [[SRC:%.*]], align 4
-; UNROLL-NO-VF-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
-; UNROLL-NO-VF-NEXT: br i1 true, label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP38:![0-9]+]]
+; UNROLL-NO-VF-NEXT: [[TMP0:%.*]] = load i32, ptr [[SRC:%.*]], align 4
+; UNROLL-NO-VF-NEXT: br label [[MIDDLE_BLOCK:%.*]]
 ; UNROLL-NO-VF: middle.block:
 ; UNROLL-NO-VF-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
 ; UNROLL-NO-VF: scalar.ph:
@@ -3753,7 +3741,7 @@ define i32 @recurence_uniform_load(ptr %src, ptr noalias %dst) {
 ; UNROLL-NO-VF-NEXT: [[ADD]] = add i64 [[PHI]], 1
 ; UNROLL-NO-VF-NEXT: [[LOAD]] = load i32, ptr [[SRC]], align 4
 ; UNROLL-NO-VF-NEXT: [[ICMP:%.*]] = icmp ult i64 [[PHI]], 1
-; UNROLL-NO-VF-NEXT: br i1 [[ICMP]], label [[LOOP]], label [[EXIT]], !llvm.loop [[LOOP39:![0-9]+]]
+; UNROLL-NO-VF-NEXT: br i1 [[ICMP]], label [[LOOP]], label [[EXIT]], !llvm.loop [[LOOP38:![0-9]+]]
 ; UNROLL-NO-VF: exit:
 ; UNROLL-NO-VF-NEXT: ret i32 0
 ;
@@ -3761,15 +3749,8 @@ define i32 @recurence_uniform_load(ptr %src, ptr noalias %dst) {
 ; SINK-AFTER-NEXT: entry:
 ; SINK-AFTER-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; SINK-AFTER: vector.ph:
-; SINK-AFTER-NEXT: br label [[VECTOR_BODY:%.*]]
-; SINK-AFTER: vector.body:
-; SINK-AFTER-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; SINK-AFTER-NEXT: [[VECTOR_RECUR:%.*]] = phi <4 x i32> [ , [[VECTOR_PH]] ], [ [[BROADCAST_SPLAT:%.*]], [[VECTOR_BODY]] ]
 ; SINK-AFTER-NEXT: [[TMP0:%.*]] = load i32, ptr [[SRC:%.*]], align 4
-; SINK-AFTER-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[TMP0]], i64 0
-; SINK-AFTER-NEXT: [[BROADCAST_SPLAT]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer
-; SINK-AFTER-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 4
-; SINK-AFTER-NEXT: br i1 true, label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP38:![0-9]+]]
+; SINK-AFTER-NEXT: br label [[MIDDLE_BLOCK:%.*]]
 ; SINK-AFTER: middle.block:
 ; SINK-AFTER-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
 ; SINK-AFTER: scalar.ph:
@@ -3782,7 +3763,7 @@ define i32 @recurence_uniform_load(ptr %src, ptr noalias %dst) {
 ; SINK-AFTER-NEXT: [[ADD]] = add i64 [[PHI]], 1
 ; SINK-AFTER-NEXT: [[LOAD]] = load i32, ptr [[SRC]], align 4
 ; SINK-AFTER-NEXT: [[ICMP:%.*]] = icmp ult i64 [[PHI]], 1
-; SINK-AFTER-NEXT: br i1 [[ICMP]], label [[LOOP]], label [[EXIT]], !llvm.loop [[LOOP39:![0-9]+]]
+; SINK-AFTER-NEXT: br i1 [[ICMP]], label [[LOOP]], label [[EXIT]], !llvm.loop [[LOOP38:![0-9]+]]
 ; SINK-AFTER: exit:
 ; SINK-AFTER-NEXT: ret i32 0
 ;
diff --git a/llvm/test/Transforms/LoopVectorize/vector-loop-backedge-elimination.ll b/llvm/test/Transforms/LoopVectorize/vector-loop-backedge-elimination.ll
index fd75177c0d1062..c144cf0ca5c52f 100644
--- a/llvm/test/Transforms/LoopVectorize/vector-loop-backedge-elimination.ll
+++ b/llvm/test/Transforms/LoopVectorize/vector-loop-backedge-elimination.ll
@@ -11,9 +11,9 @@ define void @test_tc_less_than_16(ptr %A, i64 %N) {
 ; VF8UF1: [[CMP:%.+]] = icmp eq i64 %index.next, %n.vec
 ; VF8UF1-NEXT: br i1 [[CMP]], label %middle.block, label %vector.body
 ;
-; VF8UF2: br i1 true, label %middle.block, label %vector.body
+; VF8UF2: br label %middle.block
 ;
-; VF16UF1: br i1 true, label %middle.block, label %vector.body
+; VF16UF1: br label %middle.block
 ;
 entry:
   %and = and i64 %N, 15
diff --git a/llvm/test/Transforms/LoopVectorize/version-stride-with-integer-casts.ll b/llvm/test/Transforms/LoopVectorize/version-stride-with-integer-casts.ll
index b3ec3e8f0f3c63..010a7f3b73a215 100644
--- a/llvm/test/Transforms/LoopVectorize/version-stride-with-integer-casts.ll
+++ b/llvm/test/Transforms/LoopVectorize/version-stride-with-integer-casts.ll
@@ -498,18 +498,14 @@ define void @sext_of_i1_stride(i1 %g, ptr %dst) mustprogress {
 ; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP2]], 4
 ; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP2]], [[N_MOD_VF]]
 ; CHECK-NEXT: [[IND_END:%.*]] = mul i64 [[N_VEC]], [[G_64]]
-; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
-; CHECK: vector.body:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[OFFSET_IDX:%.*]] = mul i64 [[INDEX]], [[G_64]]
 ; CHECK-NEXT: [[TMP3:%.*]] = mul i64 0, [[G_64]]
-; CHECK-NEXT: [[TMP4:%.*]] = add i64 [[OFFSET_IDX]], [[TMP3]]
+; CHECK-NEXT: [[TMP8:%.*]] = mul i64 0, [[G_64]]
+; CHECK-NEXT: [[TMP4:%.*]] = add i64 [[TMP3]], [[TMP8]]
 ; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i16, ptr [[DST]], i64 [[TMP4]]
 ; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i16, ptr [[TMP5]], i32 0
 ; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i16, ptr [[TMP6]], i32 -3
 ; CHECK-NEXT: store <4 x i16> , ptr [[TMP7]], align 2
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT: br i1 true, label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
+; CHECK-NEXT: br label [[MIDDLE_BLOCK:%.*]]
 ; CHECK: middle.block:
 ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP2]], [[N_VEC]]
 ; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
@@ -522,7 +518,7 @@ define void @sext_of_i1_stride(i1 %g, ptr %dst) mustprogress {
 ; CHECK-NEXT: store i16 [[G_16]], ptr [[GEP]], align 2
 ; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], [[G_64]]
 ; CHECK-NEXT: [[CMP:%.*]] = icmp ult i64 [[IV_NEXT]], 16
-; CHECK-NEXT: br i1 [[CMP]], label [[LOOP]], label [[EXIT]], !llvm.loop [[LOOP15:![0-9]+]]
+; CHECK-NEXT: br i1 [[CMP]], label [[LOOP]], label [[EXIT]], !llvm.loop [[LOOP14:![0-9]+]]
 ; CHECK: exit:
 ; CHECK-NEXT: ret void
 ;
@@ -559,6 +555,5 @@ exit:
 ; CHECK: [[LOOP11]] = distinct !{[[LOOP11]], [[META1]]}
 ; CHECK: [[LOOP12]] = distinct !{[[LOOP12]], [[META1]], [[META2]]}
 ; CHECK: [[LOOP13]] = distinct !{[[LOOP13]], [[META1]]}
-; CHECK: [[LOOP14]] = distinct !{[[LOOP14]], [[META1]], [[META2]]}
-; CHECK: [[LOOP15]] = distinct !{[[LOOP15]], [[META1]]}
+; CHECK: [[LOOP14]] = distinct !{[[LOOP14]], [[META1]]}
 ;.