Skip to content

Commit

Permalink
Addressing suggestions
Browse files Browse the repository at this point in the history
* Fixing comments
* Adding more tests
* Remove cmp latch presence requirements
  • Loading branch information
igogo-x86 committed Nov 15, 2024
1 parent 3a4555b commit fbb939a
Show file tree
Hide file tree
Showing 2 changed files with 114 additions and 41 deletions.
42 changes: 19 additions & 23 deletions llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2648,29 +2648,21 @@ static Value *getExpandedStep(const InductionDescriptor &ID,
return I->second;
}

/// Knowing that loop \p L would be fully unrolled after vectorisation, add
/// instructions that will get simplified and thus should not have any cost to
/// \p InstsToIgnore
static void AddFullyUnrolledInstructionsToIgnore(
/// Knowing that loop \p L executes a single vector iteration, add instructions
/// that will get simplified and thus should not have any cost to \p
/// InstsToIgnore.
static void addFullyUnrolledInstructionsToIgnore(
Loop *L, const LoopVectorizationLegality::InductionList &IL,
SmallPtrSetImpl<Instruction *> &InstsToIgnore) {
auto *Cmp = L->getLatchCmpInst();
if (!Cmp)
return;
InstsToIgnore.insert(Cmp);
if (Cmp)
InstsToIgnore.insert(Cmp);
for (const auto &[IV, IndDesc] : IL) {
// Get next iteration value of the induction variable
// Get next iteration value of the induction variable.
Instruction *IVInst =
cast<Instruction>(IV->getIncomingValueForBlock(L->getLoopLatch()));
bool IsSimplifiedAway = true;
// Check that this value used only to exit the loop
for (auto *UIV : IVInst->users()) {
if (UIV != IV && UIV != Cmp) {
IsSimplifiedAway = false;
break;
}
}
if (IsSimplifiedAway)
if (all_of(IVInst->users(),
[&](const User *U) { return U == IV || U == Cmp; }))
InstsToIgnore.insert(IVInst);
}
}
Expand Down Expand Up @@ -5561,12 +5553,13 @@ InstructionCost LoopVectorizationCostModel::computePredInstDiscount(
InstructionCost LoopVectorizationCostModel::expectedCost(ElementCount VF) {
InstructionCost Cost;

// If with the given fixed width VF loop gets fully unrolled, ignore the costs
// of comparison and induction instructions, as they'll get simplified away
// If the vector loop gets executed exactly once with the given VF, ignore the
// costs of comparison and induction instructions, as they'll get simplified
// away.
SmallPtrSet<Instruction *, 2> ValuesToIgnoreForVF;
auto TC = PSE.getSE()->getSmallConstantTripCount(TheLoop);
if (VF.isFixed() && TC == VF.getFixedValue())
AddFullyUnrolledInstructionsToIgnore(TheLoop, Legal->getInductionVars(),
addFullyUnrolledInstructionsToIgnore(TheLoop, Legal->getInductionVars(),
ValuesToIgnoreForVF);

// For each block.
Expand Down Expand Up @@ -7259,11 +7252,14 @@ LoopVectorizationPlanner::precomputeCosts(VPlan &Plan, ElementCount VF,
IVInsts.push_back(CI);
}

// If with the given VF loop gets fully unrolled, ignore the costs of
// comparison and induction instructions, as they'll get simplified away
// If the vector loop gets executed exactly once with the given VF, ignore
// the costs of comparison and induction instructions, as they'll get
// simplified away.
// TODO: Remove this code after stepping away from the legacy cost model and
// adding code to simplify VPlans before calculating their costs.
auto TC = PSE.getSE()->getSmallConstantTripCount(OrigLoop);
if (VF.isFixed() && TC == VF.getFixedValue())
AddFullyUnrolledInstructionsToIgnore(OrigLoop, Legal->getInductionVars(),
addFullyUnrolledInstructionsToIgnore(OrigLoop, Legal->getInductionVars(),
CostCtx.SkipCostComputation);

for (Instruction *IVInst : IVInsts) {
Expand Down
113 changes: 95 additions & 18 deletions llvm/test/Transforms/LoopVectorize/AArch64/fully-unrolled-cost.ll
Original file line number Diff line number Diff line change
@@ -1,38 +1,115 @@
; REQUIRES: asserts
; RUN: opt < %s -mcpu=neoverse-v2 -passes=loop-vectorize -debug-only=loop-vectorize -disable-output -S 2>&1 | FileCheck %s
; RUN: opt < %s -mcpu=neoverse-v2 -passes=loop-vectorize -debug-only=loop-vectorize -disable-output 2>&1 | FileCheck %s

target triple="aarch64--linux-gnu"

; This test shows that comparison and next iteration IV have zero cost if the
; vector loop gets executed exactly once with the given VF.
define i64 @test(ptr %a, ptr %b) #0 {
; CHECK: LV: Checking a loop in 'test'
; CHECK: LV: Found an estimated cost of 1 for VF 8 For instruction: %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
; CHECK: LV: Found an estimated cost of 1 for VF 8 For instruction: %exitcond.not = icmp eq i64 %indvars.iv.next, 16
; CHECK: LV: Vector loop of width 8 costs: 3.
; CHECK-NOT: LV: Found an estimated cost of 1 for VF 16 For instruction: %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
; CHECK-NOT: LV: Found an estimated cost of 1 for VF 16 For instruction: %exitcond.not = icmp eq i64 %indvars.iv.next, 16
; CHECK: LV: Vector loop of width 16 costs: 3.
; CHECK: Cost of 1 for VF 8: induction instruction %i.iv.next = add nuw nsw i64 %i.iv, 1
; CHECK-NEXT: Cost of 0 for VF 8: induction instruction %i.iv = phi i64 [ 0, %entry ], [ %i.iv.next, %for.body ]
; CHECK-NEXT: Cost of 1 for VF 8: exit condition instruction %exitcond.not = icmp eq i64 %i.iv.next, 16
; CHECK-NEXT: Cost of 0 for VF 8: EMIT vp<%2> = CANONICAL-INDUCTION ir<0>, vp<%index.next>
; CHECK: Cost for VF 8: 26
; CHECK-NEXT: Cost of 0 for VF 16: induction instruction %i.iv = phi i64 [ 0, %entry ], [ %i.iv.next, %for.body ]
; CHECK-NEXT: Cost of 0 for VF 16: EMIT vp<%2> = CANONICAL-INDUCTION ir<0>, vp<%index.next>
; CHECK: Cost for VF 16: 48
; CHECK: LV: Selecting VF: 16
entry:
br label %for.body

for.cond.cleanup: ; preds = %for.body
%add.lcssa = phi i64 [ %add, %for.body ]
ret i64 %add.lcssa
exit: ; preds = %for.body
ret i64 %add

for.body: ; preds = %entry, %for.body
%indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
%sum.09 = phi i64 [ 0, %entry ], [ %add, %for.body ]
%arrayidx = getelementptr inbounds i8, ptr %a, i64 %indvars.iv
%i.iv = phi i64 [ 0, %entry ], [ %i.iv.next, %for.body ]
%sum = phi i64 [ 0, %entry ], [ %add, %for.body ]
%arrayidx = getelementptr inbounds i8, ptr %a, i64 %i.iv
%0 = load i8, ptr %arrayidx, align 1
%conv = zext i8 %0 to i64
%arrayidx2 = getelementptr inbounds i8, ptr %b, i64 %indvars.iv
%arrayidx2 = getelementptr inbounds i8, ptr %b, i64 %i.iv
%1 = load i8, ptr %arrayidx2, align 1
%conv3 = zext i8 %1 to i64
%mul = mul nuw nsw i64 %conv3, %conv
%add = add i64 %mul, %sum.09
%indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
%exitcond.not = icmp eq i64 %indvars.iv.next, 16
br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
%add = add i64 %mul, %sum
%i.iv.next = add nuw nsw i64 %i.iv, 1
%exitcond.not = icmp eq i64 %i.iv.next, 16
br i1 %exitcond.not, label %exit, label %for.body
}

; Same as above, but the next-iteration IV value has extra users, and thus its cost is not zero.
define i64 @test_external_iv_user(ptr %a, ptr %b) #0 {
; Expectations: for VF 16 the IV increment %i.iv.next is still listed with a
; cost of 1, while no exit-condition cost line is expected between the
; "Cost for VF 8" summary and the VF 16 CANONICAL-INDUCTION line.
; CHECK: LV: Checking a loop in 'test_external_iv_user'
; CHECK: Cost of 1 for VF 8: induction instruction %i.iv.next = add nuw nsw i64 %i.iv, 1
; CHECK-NEXT: Cost of 0 for VF 8: induction instruction %i.iv = phi i64 [ 0, %entry ], [ %i.iv.next, %for.body ]
; CHECK-NEXT: Cost of 1 for VF 8: exit condition instruction %exitcond.not = icmp eq i64 %i.iv.next, 16
; CHECK-NEXT: Cost of 0 for VF 8: EMIT vp<%2> = CANONICAL-INDUCTION ir<0>, vp<%index.next>
; CHECK: Cost for VF 8: 26
; CHECK-NEXT: Cost of 1 for VF 16: induction instruction %i.iv.next = add nuw nsw i64 %i.iv, 1
; CHECK-NEXT: Cost of 0 for VF 16: induction instruction %i.iv = phi i64 [ 0, %entry ], [ %i.iv.next, %for.body ]
; CHECK-NEXT: Cost of 0 for VF 16: EMIT vp<%2> = CANONICAL-INDUCTION ir<0>, vp<%index.next>
; CHECK: Cost for VF 16: 49
; CHECK: LV: Selecting VF: vscale x 2
entry:
br label %for.body

; Reduction loop: %sum accumulates zext(a[i]) * zext(b[i + 1]). The IV starts
; at 0, steps by 1, and the loop exits once %i.iv.next equals 16.
for.body: ; preds = %entry, %for.body
%i.iv = phi i64 [ 0, %entry ], [ %i.iv.next, %for.body ]
%sum = phi i64 [ 0, %entry ], [ %add, %for.body ]
%arrayidx = getelementptr inbounds nuw i8, ptr %a, i64 %i.iv
%0 = load i8, ptr %arrayidx, align 1
%conv = zext i8 %0 to i64
%i.iv.next = add nuw nsw i64 %i.iv, 1
; %i.iv.next indexes %b here, so it has a user other than the %i.iv phi and
; the exit compare below.
%arrayidx2 = getelementptr inbounds nuw i8, ptr %b, i64 %i.iv.next
%1 = load i8, ptr %arrayidx2, align 1
%conv3 = zext i8 %1 to i64
%mul = mul nuw nsw i64 %conv3, %conv
%add = add i64 %sum, %mul
%exitcond.not = icmp eq i64 %i.iv.next, 16
br i1 %exitcond.not, label %exit, label %for.body

exit: ; preds = %for.body
ret i64 %add
}

; Same as the first test, but with two IVs, neither of which has extra users. Both IV increments have zero cost when VF equals the number of iterations.
define i64 @test_two_ivs(ptr %a, ptr %b, i64 %start) #0 {
; Expectations: for VF 8 both IV increments and the exit compare are listed
; with a cost of 1; for VF 16 only the two IV phis (cost 0) are expected
; before the CANONICAL-INDUCTION line.
; CHECK: LV: Checking a loop in 'test_two_ivs'
; CHECK: Cost of 1 for VF 8: induction instruction %i.iv.next = add nuw nsw i64 %i.iv, 1
; CHECK-NEXT: Cost of 0 for VF 8: induction instruction %i.iv = phi i64 [ 0, %entry ], [ %i.iv.next, %for.body ]
; CHECK-NEXT: Cost of 1 for VF 8: induction instruction %j.iv.next = add nuw nsw i64 %j.iv, 1
; CHECK-NEXT: Cost of 0 for VF 8: induction instruction %j.iv = phi i64 [ %start, %entry ], [ %j.iv.next, %for.body ]
; CHECK-NEXT: Cost of 1 for VF 8: exit condition instruction %exitcond.not = icmp eq i64 %i.iv.next, 16
; CHECK-NEXT: Cost of 0 for VF 8: EMIT vp<%2> = CANONICAL-INDUCTION ir<0>, vp<%index.next>
; CHECK: Cost for VF 8: 27
; CHECK-NEXT: Cost of 0 for VF 16: induction instruction %i.iv = phi i64 [ 0, %entry ], [ %i.iv.next, %for.body ]
; CHECK-NEXT: Cost of 0 for VF 16: induction instruction %j.iv = phi i64 [ %start, %entry ], [ %j.iv.next, %for.body ]
; CHECK-NEXT: Cost of 0 for VF 16: EMIT vp<%2> = CANONICAL-INDUCTION ir<0>, vp<%index.next>
; CHECK: Cost for VF 16: 48
; CHECK: LV: Selecting VF: 16
entry:
br label %for.body

exit: ; preds = %for.body
ret i64 %add

; Reduction loop: %sum accumulates zext(a[i]) * zext(b[j]), where %i.iv starts
; at 0 and %j.iv starts at %start; both step by 1 per iteration.
for.body: ; preds = %entry, %for.body
%i.iv = phi i64 [ 0, %entry ], [ %i.iv.next, %for.body ]
%j.iv = phi i64 [ %start, %entry ], [ %j.iv.next, %for.body ]
%sum = phi i64 [ 0, %entry ], [ %add, %for.body ]
%arrayidx = getelementptr inbounds i8, ptr %a, i64 %i.iv
%0 = load i8, ptr %arrayidx, align 1
%conv = zext i8 %0 to i64
%arrayidx2 = getelementptr inbounds i8, ptr %b, i64 %j.iv
%1 = load i8, ptr %arrayidx2, align 1
%conv3 = zext i8 %1 to i64
%mul = mul nuw nsw i64 %conv3, %conv
%add = add i64 %mul, %sum
; %i.iv.next is used only by the %i.iv phi and the exit compare; %j.iv.next is
; used only by the %j.iv phi.
%i.iv.next = add nuw nsw i64 %i.iv, 1
%j.iv.next = add nuw nsw i64 %j.iv, 1
; Only %i.iv controls the exit: the loop leaves once %i.iv.next equals 16.
%exitcond.not = icmp eq i64 %i.iv.next, 16
br i1 %exitcond.not, label %exit, label %for.body
}

attributes #0 = { vscale_range(1, 16) "target-features"="+sve" }

0 comments on commit fbb939a

Please sign in to comment.