/// Save vectorization decision \p W and \p Cost taken by the cost model for
/// instruction \p I and vector width \p VF.
void setWideningDecision(Instruction *I, ElementCount VF, InstWidening W,
- unsigned Cost) {
+ InstructionCost Cost) {
assert(VF.isVector() && "Expected VF >=2");
WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
}
/// Save vectorization decision \p W and \p Cost taken by the cost model for
/// interleaving group \p Grp and vector width \p VF.
void setWideningDecision(const InterleaveGroup<Instruction> *Grp,
- ElementCount VF, InstWidening W, unsigned Cost) {
+ ElementCount VF, InstWidening W,
+ InstructionCost Cost) {
assert(VF.isVector() && "Expected VF >=2");
/// Broadcast this decision to all instructions inside the group.
/// But the cost will be assigned to one instruction only.
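// Illustrative sketch (the body is elided from this hunk; this is an
// assumption, not the patch's code): the broadcast plausibly walks the
// group's members and stores (W, Cost) for the insert position only, with
// (W, 0) for every other member, so the group's cost is counted exactly
// once when per-instruction decisions are summed.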
/// Return the vectorization cost for the given instruction \p I and vector
/// width \p VF.
- unsigned getWideningCost(Instruction *I, ElementCount VF) {
+ InstructionCost getWideningCost(Instruction *I, ElementCount VF) {
assert(VF.isVector() && "Expected VF >=2");
std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF);
assert(WideningDecisions.find(InstOnVF) != WideningDecisions.end() &&
/// Estimate cost of an intrinsic call instruction \p CI if it were
/// vectorized with factor \p VF. Return the cost of the instruction,
/// including scalarization overhead if it's needed.
- unsigned getVectorIntrinsicCost(CallInst *CI, ElementCount VF);
+ InstructionCost getVectorIntrinsicCost(CallInst *CI, ElementCount VF);
/// Estimate cost of a call instruction \p CI if it were vectorized with
/// factor \p VF. Return the cost of the instruction, including scalarization
/// overhead if it's needed. The flag NeedToScalarize shows if the call needs
/// to be scalarized, i.e. either a vector version isn't available or it is
/// too expensive.
- unsigned getVectorCallCost(CallInst *CI, ElementCount VF,
- bool &NeedToScalarize);
+ InstructionCost getVectorCallCost(CallInst *CI, ElementCount VF,
+ bool &NeedToScalarize);
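// Illustrative use of the out-parameter (hypothetical caller, not part of
// this patch):
//   bool NeedToScalarize = false;
//   InstructionCost C = getVectorCallCost(CI, VF, NeedToScalarize);
//   // NeedToScalarize == true means no vector version was cheap enough,
//   // so C is the scalarized cost: VF scalar calls plus pack/unpack.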
/// Invalidates decisions already taken by the cost model.
void invalidateCostModelingDecisions() {
Type *&VectorTy);
/// Calculate vectorization cost of memory instruction \p I.
- unsigned getMemoryInstructionCost(Instruction *I, ElementCount VF);
+ InstructionCost getMemoryInstructionCost(Instruction *I, ElementCount VF);
/// The cost computation for a scalarized memory instruction.
- unsigned getMemInstScalarizationCost(Instruction *I, ElementCount VF);
+ InstructionCost getMemInstScalarizationCost(Instruction *I, ElementCount VF);
/// The cost computation for an interleaving group of memory instructions.
- unsigned getInterleaveGroupCost(Instruction *I, ElementCount VF);
+ InstructionCost getInterleaveGroupCost(Instruction *I, ElementCount VF);
/// The cost computation for a Gather/Scatter instruction.
- unsigned getGatherScatterCost(Instruction *I, ElementCount VF);
+ InstructionCost getGatherScatterCost(Instruction *I, ElementCount VF);
/// The cost computation for widening instruction \p I with consecutive
/// memory access.
- unsigned getConsecutiveMemOpCost(Instruction *I, ElementCount VF);
+ InstructionCost getConsecutiveMemOpCost(Instruction *I, ElementCount VF);
/// The cost calculation for a Load/Store instruction \p I with a uniform
/// pointer:
/// Load: scalar load + broadcast.
/// Store: scalar store + (loop invariant value stored? 0 : extract of last
/// element)
- unsigned getUniformMemOpCost(Instruction *I, ElementCount VF);
+ InstructionCost getUniformMemOpCost(Instruction *I, ElementCount VF);
/// Estimate the overhead of scalarizing an instruction. This is a
/// convenience wrapper for the type-based getScalarizationOverhead API.
- unsigned getScalarizationOverhead(Instruction *I, ElementCount VF);
+ InstructionCost getScalarizationOverhead(Instruction *I, ElementCount VF);
/// Returns whether the instruction is a load or store and will be emitted
/// as a vector operation.
/// Keeps the cost model's vectorization decision and cost for instructions.
/// Right now it is used for memory instructions only.
using DecisionList = DenseMap<std::pair<Instruction *, ElementCount>,
- std::pair<InstWidening, unsigned>>;
+ std::pair<InstWidening, InstructionCost>>;
DecisionList WideningDecisions;
}
}
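// Illustrative sketch, not the real llvm/Support/InstructionCost.h: a
// standalone model of the semantics this migration relies on, under the
// assumption that a cost wraps a *signed* value plus a validity state and
// that an invalid cost compares greater than every valid one, so existing
// "pick the cheaper option" logic keeps working unchanged.
#include <cassert>

namespace sketch {
struct InstructionCost {
  enum CostState { Valid, Invalid };
  int Value = 0;
  CostState State = Valid;

  InstructionCost(int V = 0) : Value(V) {}
  static InstructionCost getInvalid() {
    InstructionCost C;
    C.State = Invalid;
    return C;
  }
  bool isValid() const { return State == Valid; }

  // Valid sorts before Invalid, so an unknown cost loses every
  // "cheaper than" contest instead of accidentally winning it.
  bool operator<(const InstructionCost &RHS) const {
    if (State != RHS.State)
      return State < RHS.State;
    return Value < RHS.Value;
  }
  bool operator<=(const InstructionCost &RHS) const { return !(RHS < *this); }
};
} // namespace sketch

int main() {
  sketch::InstructionCost Cheap(4);
  sketch::InstructionCost Unknown = sketch::InstructionCost::getInvalid();
  assert(Cheap < Unknown && "a valid cost always beats an invalid one");
  assert(Cheap.isValid() && !Unknown.isValid());
}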
-unsigned LoopVectorizationCostModel::getVectorCallCost(CallInst *CI,
- ElementCount VF,
- bool &NeedToScalarize) {
+InstructionCost
+LoopVectorizationCostModel::getVectorCallCost(CallInst *CI, ElementCount VF,
+ bool &NeedToScalarize) {
assert(!VF.isScalable() && "scalable vectors not yet supported.");
Function *F = CI->getCalledFunction();
Type *ScalarRetTy = CI->getType();
// to be vectors, so we need to extract individual elements from there,
// execute VF scalar calls, and then gather the result into the vector return
// value.
- unsigned ScalarCallCost = TTI.getCallInstrCost(F, ScalarRetTy, ScalarTys,
- TTI::TCK_RecipThroughput);
+ InstructionCost ScalarCallCost =
+ TTI.getCallInstrCost(F, ScalarRetTy, ScalarTys, TTI::TCK_RecipThroughput);
if (VF.isScalar())
return ScalarCallCost;
// Compute costs of unpacking argument values for the scalar calls and
// packing the return values to a vector.
- unsigned ScalarizationCost = getScalarizationOverhead(CI, VF);
+ InstructionCost ScalarizationCost = getScalarizationOverhead(CI, VF);
- unsigned Cost = ScalarCallCost * VF.getKnownMinValue() + ScalarizationCost;
+ InstructionCost Cost =
+ ScalarCallCost * VF.getKnownMinValue() + ScalarizationCost;
// If we can't emit a vector call for this function, then the currently found
// cost is the cost we need to return.
return Cost;
// If the corresponding vector cost is cheaper, return its cost.
- unsigned VectorCallCost = TTI.getCallInstrCost(nullptr, RetTy, Tys,
- TTI::TCK_RecipThroughput);
+ InstructionCost VectorCallCost =
+ TTI.getCallInstrCost(nullptr, RetTy, Tys, TTI::TCK_RecipThroughput);
if (VectorCallCost < Cost) {
NeedToScalarize = false;
- return VectorCallCost;
+ Cost = VectorCallCost;
}
return Cost;
}
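// Illustrative sketch with made-up numbers (no real target figures): the
// decision getVectorCallCost encodes. Scalarizing costs VF scalar calls plus
// pack/unpack overhead; if a vector library call is cheaper, it wins and
// NeedToScalarize flips to false.
#include <cassert>

int main() {
  int ScalarCallCost = 10;   // one scalar call (hypothetical)
  int ScalarizationCost = 6; // extract arguments + build the vector result
  int VF = 4;                // vectorization factor
  int VectorCallCost = 24;   // hypothetical vector-library call cost

  bool NeedToScalarize = true;
  int Cost = ScalarCallCost * VF + ScalarizationCost; // 46
  if (VectorCallCost < Cost) {
    NeedToScalarize = false;
    Cost = VectorCallCost;
  }
  assert(Cost == 24 && !NeedToScalarize);
}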
-unsigned LoopVectorizationCostModel::getVectorIntrinsicCost(CallInst *CI,
- ElementCount VF) {
+InstructionCost
+LoopVectorizationCostModel::getVectorIntrinsicCost(CallInst *CI,
+ ElementCount VF) {
Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
assert(ID && "Expected intrinsic call!");
// version of the instruction.
// Is it beneficial to perform the intrinsic call compared to the lib call?
bool NeedToScalarize = false;
- unsigned CallCost = Cost->getVectorCallCost(CI, VF, NeedToScalarize);
- bool UseVectorIntrinsic =
- ID && Cost->getVectorIntrinsicCost(CI, VF) <= CallCost;
+ InstructionCost CallCost = Cost->getVectorCallCost(CI, VF, NeedToScalarize);
+ InstructionCost IntrinsicCost = ID ? Cost->getVectorIntrinsicCost(CI, VF) : 0;
+ bool UseVectorIntrinsic = ID && IntrinsicCost <= CallCost;
assert((UseVectorIntrinsic || !NeedToScalarize) &&
"Instruction should be scalarized elsewhere.");
+ assert(IntrinsicCost.isValid() && CallCost.isValid() &&
+ "Cannot have invalid costs while widening");
for (unsigned Part = 0; Part < UF; ++Part) {
SmallVector<Value *, 4> Args;
Legal->hasStride(I->getOperand(1));
}
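// Illustrative sketch of the guarded-cost pattern above (made-up numbers):
// when a call has no vector intrinsic ID, the intrinsic cost is never
// queried, and the 0 placeholder is safe because the `ID &&` short-circuit
// keeps it out of the decision entirely.
#include <cassert>

int queryIntrinsicCost() { return 8; } // stand-in for the cost-model query

int main() {
  unsigned ID = 0; // no vector intrinsic exists for this call
  int CallCost = 12;
  int IntrinsicCost = ID ? queryIntrinsicCost() : 0;
  bool UseVectorIntrinsic = ID && IntrinsicCost <= CallCost;
  assert(!UseVectorIntrinsic && "the placeholder 0 never wins without an ID");
}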
-unsigned
+InstructionCost
LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I,
ElementCount VF) {
assert(VF.isVector() &&
const SCEV *PtrSCEV = getAddressAccessSCEV(Ptr, Legal, PSE, TheLoop);
// Get the cost of the scalar memory instruction and address computation.
- unsigned Cost =
+ InstructionCost Cost =
VF.getKnownMinValue() * TTI.getAddressComputationCost(PtrTy, SE, PtrSCEV);
// Don't pass *I here, since it is scalar but will actually be part of a
return Cost;
}
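// Illustrative sketch (made-up numbers): the shape of the scalarized memory
// cost computed above. Address computation and the scalar memory op are each
// paid once per lane, i.e. VF.getKnownMinValue() times, plus insert/extract
// scalarization overhead.
#include <cassert>

int main() {
  int VF = 4;
  int AddrCost = 1;    // hypothetical per-lane address computation
  int ScalarMemOp = 2; // hypothetical per-lane load/store
  int Overhead = 3;    // hypothetical insert/extract overhead

  int Cost = VF * AddrCost;
  Cost += VF * ScalarMemOp;
  Cost += Overhead;
  assert(Cost == 15);
}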
-unsigned LoopVectorizationCostModel::getConsecutiveMemOpCost(Instruction *I,
- ElementCount VF) {
+InstructionCost
+LoopVectorizationCostModel::getConsecutiveMemOpCost(Instruction *I,
+ ElementCount VF) {
Type *ValTy = getMemInstValueType(I);
auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
Value *Ptr = getLoadStorePointerOperand(I);
assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) &&
"Stride should be 1 or -1 for consecutive memory access");
const Align Alignment = getLoadStoreAlignment(I);
- unsigned Cost = 0;
+ InstructionCost Cost = 0;
if (Legal->isMaskRequired(I))
Cost += TTI.getMaskedMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS,
CostKind);
return Cost;
}
-unsigned LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I,
- ElementCount VF) {
+InstructionCost
+LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I,
+ ElementCount VF) {
assert(Legal->isUniformMemOp(*I));
Type *ValTy = getMemInstValueType(I);
VF.getKnownMinValue() - 1));
}
-unsigned LoopVectorizationCostModel::getGatherScatterCost(Instruction *I,
- ElementCount VF) {
+InstructionCost
+LoopVectorizationCostModel::getGatherScatterCost(Instruction *I,
+ ElementCount VF) {
Type *ValTy = getMemInstValueType(I);
auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
const Align Alignment = getLoadStoreAlignment(I);
TargetTransformInfo::TCK_RecipThroughput, I);
}
-unsigned LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I,
- ElementCount VF) {
+InstructionCost
+LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I,
+ ElementCount VF) {
Type *ValTy = getMemInstValueType(I);
auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
unsigned AS = getLoadStoreAddressSpace(I);
// Calculate the cost of the whole interleaved group.
bool UseMaskForGaps =
Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed();
- unsigned Cost = TTI.getInterleavedMemoryOpCost(
+ InstructionCost Cost = TTI.getInterleavedMemoryOpCost(
I->getOpcode(), WideVecTy, Group->getFactor(), Indices, Group->getAlign(),
AS, TTI::TCK_RecipThroughput, Legal->isMaskRequired(I), UseMaskForGaps);
return Cost;
}
-unsigned LoopVectorizationCostModel::getMemoryInstructionCost(Instruction *I,
- ElementCount VF) {
+InstructionCost
+LoopVectorizationCostModel::getMemoryInstructionCost(Instruction *I,
+ ElementCount VF) {
// Calculate scalar cost only. Vectorization cost should be ready at this
// moment.
if (VF.isScalar()) {
return VectorizationCostTy(C, TypeNotScalarized);
}
-unsigned LoopVectorizationCostModel::getScalarizationOverhead(Instruction *I,
- ElementCount VF) {
+InstructionCost
+LoopVectorizationCostModel::getScalarizationOverhead(Instruction *I,
+ ElementCount VF) {
assert(!VF.isScalable() &&
"cannot compute scalarization overhead for scalable vectorization");
if (VF.isScalar())
return 0;
- unsigned Cost = 0;
+ InstructionCost Cost = 0;
Type *RetTy = ToVectorTy(I->getType(), VF);
if (!RetTy->isVoidTy() &&
(!isa<LoadInst>(I) || !TTI.supportsEfficientVectorElementLoadStore()))
// relying on instcombine to remove them.
// Load: Scalar load + broadcast
// Store: Scalar store + isLoopInvariantStoreValue ? 0 : extract
- unsigned Cost = getUniformMemOpCost(&I, VF);
+ InstructionCost Cost = getUniformMemOpCost(&I, VF);
setWideningDecision(&I, VF, CM_Scalarize, Cost);
continue;
}
// We assume that widening is the best solution when possible.
if (memoryInstructionCanBeWidened(&I, VF)) {
- unsigned Cost = getConsecutiveMemOpCost(&I, VF);
+ InstructionCost Cost = getConsecutiveMemOpCost(&I, VF);
int ConsecutiveStride =
Legal->isConsecutivePtr(getLoadStorePointerOperand(&I));
assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) &&
}
// Choose between Interleaving, Gather/Scatter or Scalarization.
- unsigned InterleaveCost = std::numeric_limits<unsigned>::max();
+ InstructionCost InterleaveCost = std::numeric_limits<int>::max();
unsigned NumAccesses = 1;
if (isAccessInterleaved(&I)) {
auto Group = getInterleavedAccessGroup(&I);
InterleaveCost = getInterleaveGroupCost(&I, VF);
}
- unsigned GatherScatterCost =
+ InstructionCost GatherScatterCost =
isLegalGatherOrScatter(&I)
? getGatherScatterCost(&I, VF) * NumAccesses
- : std::numeric_limits<unsigned>::max();
+ : std::numeric_limits<int>::max();
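// Editorial note (an assumption about InstructionCost internals, not stated
// in this hunk): the sentinel changes from unsigned::max() to int::max()
// because the wrapped cost type is signed; unsigned::max() would convert to
// -1 and make the "impossible" option look free.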
- unsigned ScalarizationCost =
+ InstructionCost ScalarizationCost =
getMemInstScalarizationCost(&I, VF) * NumAccesses;
// Choose the better solution for the current VF, write down this decision,
// and use it during vectorization.
- unsigned Cost;
+ InstructionCost Cost;
InstWidening Decision;
if (InterleaveCost <= GatherScatterCost &&
InterleaveCost < ScalarizationCost) {
// probability of executing the predicated block. If the instruction is not
// predicated, we fall through to the next case.
if (VF.isVector() && isScalarWithPredication(I)) {
- unsigned Cost = 0;
+ InstructionCost Cost = 0;
// These instructions have a non-void type, so account for the phi nodes
// that we will create. This cost is likely to be zero. The phi node
case Instruction::Call: {
bool NeedToScalarize;
CallInst *CI = cast<CallInst>(I);
- unsigned CallCost = getVectorCallCost(CI, VF, NeedToScalarize);
- if (getVectorIntrinsicIDForCall(CI, TLI))
- return std::min(CallCost, getVectorIntrinsicCost(CI, VF));
+ InstructionCost CallCost = getVectorCallCost(CI, VF, NeedToScalarize);
+ if (getVectorIntrinsicIDForCall(CI, TLI)) {
+ InstructionCost IntrinsicCost = getVectorIntrinsicCost(CI, VF);
+ return std::min(CallCost, IntrinsicCost);
+ }
return CallCost;
}
case Instruction::ExtractValue:
// version of the instruction.
// Is it beneficial to perform the intrinsic call compared to the lib call?
bool NeedToScalarize = false;
- unsigned CallCost = CM.getVectorCallCost(CI, VF, NeedToScalarize);
- bool UseVectorIntrinsic =
- ID && CM.getVectorIntrinsicCost(CI, VF) <= CallCost;
+ InstructionCost CallCost = CM.getVectorCallCost(CI, VF, NeedToScalarize);
+ InstructionCost IntrinsicCost = ID ? CM.getVectorIntrinsicCost(CI, VF) : 0;
+ bool UseVectorIntrinsic = ID && IntrinsicCost <= CallCost;
+ assert(IntrinsicCost.isValid() && CallCost.isValid() &&
+ "Cannot have invalid costs while widening");
return UseVectorIntrinsic || !NeedToScalarize;
};