[SLP][NFC]Extract main part of GetGEPCostDiff to a function, NFC.
alexey-bataev committed Feb 6, 2024
1 parent e5638c5 commit 36e8db7
Showing 1 changed file with 80 additions and 70 deletions.
llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -6954,6 +6954,82 @@ getShuffleCost(const TargetTransformInfo &TTI, TTI::ShuffleKind Kind,
   return TTI.getShuffleCost(Kind, Tp, Mask, CostKind, Index, SubTp, Args);
 }

+/// Calculate the scalar and the vector costs from vectorizing a set of GEPs.
+static std::pair<InstructionCost, InstructionCost>
+getGEPCosts(const TargetTransformInfo &TTI, ArrayRef<Value *> Ptrs,
+            Value *BasePtr, unsigned Opcode, TTI::TargetCostKind CostKind,
+            Type *ScalarTy, VectorType *VecTy) {
+  InstructionCost ScalarCost = 0;
+  InstructionCost VecCost = 0;
+  // Here we differentiate two cases: (1) when Ptrs represent a regular
+  // vectorization tree node (as they are pointer arguments of scattered
+  // loads) or (2) when Ptrs are the arguments of loads or stores being
+  // vectorized as a plain wide unit-stride load/store, since all the
+  // loads/stores are known to be from/to adjacent locations.
+  if (Opcode == Instruction::Load || Opcode == Instruction::Store) {
+    // Case 2: estimate the pointer-related costs when vectorizing to a wide
+    // load/store.
+    // The scalar cost is estimated as a set of pointers with a known
+    // relationship between them.
+    // For the vector code we will use BasePtr as the argument of the wide
+    // load/store, but we also need to account for all the instructions that
+    // are going to stay in the vectorized code due to uses outside of these
+    // scalar loads/stores.
+    ScalarCost = TTI.getPointersChainCost(
+        Ptrs, BasePtr, TTI::PointersChainInfo::getUnitStride(), ScalarTy,
+        CostKind);
+
+    SmallVector<const Value *> PtrsRetainedInVecCode;
+    for (Value *V : Ptrs) {
+      if (V == BasePtr) {
+        PtrsRetainedInVecCode.push_back(V);
+        continue;
+      }
+      auto *Ptr = dyn_cast<GetElementPtrInst>(V);
+      // For simplicity, assume Ptr stays in the vectorized code if it's not a
+      // GEP instruction. We don't care, since its cost is considered free.
+      // TODO: We should check for any uses outside of the vectorizable tree
+      // rather than just a single use.
+      if (!Ptr || !Ptr->hasOneUse())
+        PtrsRetainedInVecCode.push_back(V);
+    }
+
+    if (PtrsRetainedInVecCode.size() == Ptrs.size()) {
+      // If all pointers stay in the vectorized code, then we don't gain any
+      // savings on them.
+      return std::make_pair(TTI::TCC_Free, TTI::TCC_Free);
+    }
+    VecCost = TTI.getPointersChainCost(PtrsRetainedInVecCode, BasePtr,
+                                       TTI::PointersChainInfo::getKnownStride(),
+                                       VecTy, CostKind);
+  } else {
+    // Case 1: Ptrs are the arguments of loads that we are going to transform
+    // into a masked gather load intrinsic.
+    // All the scalar GEPs will be removed as a result of vectorization.
+    // For any external uses of some lanes, extractelement instructions will
+    // be generated (their cost is estimated separately).
+    TTI::PointersChainInfo PtrsInfo =
+        all_of(Ptrs,
+               [](const Value *V) {
+                 auto *Ptr = dyn_cast<GetElementPtrInst>(V);
+                 return Ptr && !Ptr->hasAllConstantIndices();
+               })
+            ? TTI::PointersChainInfo::getUnknownStride()
+            : TTI::PointersChainInfo::getKnownStride();
+
+    ScalarCost =
+        TTI.getPointersChainCost(Ptrs, BasePtr, PtrsInfo, ScalarTy, CostKind);
+    if (auto *BaseGEP = dyn_cast<GEPOperator>(BasePtr)) {
+      SmallVector<const Value *> Indices(BaseGEP->indices());
+      VecCost = TTI.getGEPCost(BaseGEP->getSourceElementType(),
+                               BaseGEP->getPointerOperand(), Indices, VecTy,
+                               CostKind);
+    }
+  }
+
+  return std::make_pair(ScalarCost, VecCost);
+}
+
 /// Merges shuffle masks and emits final shuffle instruction, if required. It
 /// supports shuffling of 2 input vectors. It implements lazy shuffles emission,
 /// when the actual shuffle instruction is generated only if this is actually
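
For orientation, here is a minimal standalone sketch of the Case 2 bookkeeping described above (hypothetical stand-in types and cost numbers, not LLVM code): the scalar side pays for every pointer in the chain, while the vector side is charged only for pointers retained by uses outside the vectorized loads/stores, with an early "free" exit when nothing is saved.

#include <cstdio>
#include <utility>
#include <vector>

using Cost = long; // stand-in for llvm::InstructionCost

struct Ptr {
  bool IsBase = false;         // feeds the wide load/store, always retained
  bool HasExternalUse = false; // kept alive by uses outside the tree
};

// Returns (scalar cost, vector cost) for the address computations.
static std::pair<Cost, Cost> getAddrCosts(const std::vector<Ptr> &Ptrs) {
  Cost Scalar = static_cast<Cost>(Ptrs.size()); // one address per lane
  std::vector<const Ptr *> Retained;
  for (const Ptr &P : Ptrs)
    if (P.IsBase || P.HasExternalUse)
      Retained.push_back(&P);
  if (Retained.size() == Ptrs.size())
    return {0, 0}; // every pointer survives vectorization: no savings
  return {Scalar, static_cast<Cost>(Retained.size())};
}

int main() {
  std::vector<Ptr> Ptrs(4); // four adjacent lanes, only the base retained
  Ptrs[0].IsBase = true;
  auto [Scalar, Vec] = getAddrCosts(Ptrs);
  std::printf("scalar=%ld vec=%ld diff=%ld\n", Scalar, Vec, Vec - Scalar);
}
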
@@ -7917,78 +7993,12 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
   // Calculate the cost difference from vectorizing a set of GEPs.
   // A negative value means vectorizing is profitable.
   auto GetGEPCostDiff = [=](ArrayRef<Value *> Ptrs, Value *BasePtr) {
-    InstructionCost ScalarCost = 0;
-    InstructionCost VecCost = 0;
-    // Here we differentiate two cases: (1) when Ptrs represent a regular
-    // vectorization tree node (as they are pointer arguments of scattered
-    // loads) or (2) when Ptrs are the arguments of loads or stores being
-    // vectorized as a plain wide unit-stride load/store, since all the
-    // loads/stores are known to be from/to adjacent locations.
-    assert(E->State == TreeEntry::Vectorize &&
-           "Entry state expected to be Vectorize here.");
-    if (isa<LoadInst, StoreInst>(VL0)) {
-      // Case 2: estimate the pointer-related costs when vectorizing to a
-      // wide load/store.
-      // The scalar cost is estimated as a set of pointers with a known
-      // relationship between them.
-      // For the vector code we will use BasePtr as the argument of the wide
-      // load/store, but we also need to account for all the instructions
-      // that are going to stay in the vectorized code due to uses outside of
-      // these scalar loads/stores.
-      ScalarCost = TTI->getPointersChainCost(
-          Ptrs, BasePtr, TTI::PointersChainInfo::getUnitStride(), ScalarTy,
-          CostKind);
-
-      SmallVector<const Value *> PtrsRetainedInVecCode;
-      for (Value *V : Ptrs) {
-        if (V == BasePtr) {
-          PtrsRetainedInVecCode.push_back(V);
-          continue;
-        }
-        auto *Ptr = dyn_cast<GetElementPtrInst>(V);
-        // For simplicity, assume Ptr stays in the vectorized code if it's
-        // not a GEP instruction. We don't care, since its cost is considered
-        // free.
-        // TODO: We should check for any uses outside of the vectorizable
-        // tree rather than just a single use.
-        if (!Ptr || !Ptr->hasOneUse())
-          PtrsRetainedInVecCode.push_back(V);
-      }
-
-      if (PtrsRetainedInVecCode.size() == Ptrs.size()) {
-        // If all pointers stay in the vectorized code, then we don't gain
-        // any savings on them.
-        LLVM_DEBUG(dumpTreeCosts(E, 0, ScalarCost, ScalarCost,
-                                 "Calculated GEPs cost for Tree"));
-        return InstructionCost{TTI::TCC_Free};
-      }
-      VecCost = TTI->getPointersChainCost(
-          PtrsRetainedInVecCode, BasePtr,
-          TTI::PointersChainInfo::getKnownStride(), VecTy, CostKind);
-    } else {
-      // Case 1: Ptrs are the arguments of loads that we are going to
-      // transform into a masked gather load intrinsic.
-      // All the scalar GEPs will be removed as a result of vectorization.
-      // For any external uses of some lanes, extractelement instructions
-      // will be generated (their cost is estimated separately).
-      TTI::PointersChainInfo PtrsInfo =
-          all_of(Ptrs,
-                 [](const Value *V) {
-                   auto *Ptr = dyn_cast<GetElementPtrInst>(V);
-                   return Ptr && !Ptr->hasAllConstantIndices();
-                 })
-              ? TTI::PointersChainInfo::getUnknownStride()
-              : TTI::PointersChainInfo::getKnownStride();
-
-      ScalarCost = TTI->getPointersChainCost(Ptrs, BasePtr, PtrsInfo,
-                                             ScalarTy, CostKind);
-      if (auto *BaseGEP = dyn_cast<GEPOperator>(BasePtr)) {
-        SmallVector<const Value *> Indices(BaseGEP->indices());
-        VecCost = TTI->getGEPCost(BaseGEP->getSourceElementType(),
-                                  BaseGEP->getPointerOperand(), Indices,
-                                  VecTy, CostKind);
-      }
-    }
-
+    InstructionCost ScalarCost = 0;
+    InstructionCost VecCost = 0;
+    std::tie(ScalarCost, VecCost) = getGEPCosts(
+        *TTI, Ptrs, BasePtr, E->getOpcode(), CostKind, ScalarTy, VecTy);
     LLVM_DEBUG(dumpTreeCosts(E, 0, VecCost, ScalarCost,
                              "Calculated GEPs cost for Tree"));
