namespace llvm {
+// Loop vectorization cost-model hints how the scalar epilogue loop should be
+// lowered.
+enum ScalarEpilogueLowering {
+ CM_ScalarEpilogueAllowed,
+ CM_ScalarEpilogueNotAllowedOptSize,
+ CM_ScalarEpilogueNotAllowedLowTripLoop
+};
+
/// LoopVectorizationCostModel - estimates the expected speedups due to
/// vectorization.
/// In many cases vectorization is not profitable. This can happen because of
/// different operations.
class LoopVectorizationCostModel {
public:
- LoopVectorizationCostModel(Loop *L, PredicatedScalarEvolution &PSE,
+ LoopVectorizationCostModel(ScalarEpilogueLowering SEL, Loop *L,
+ PredicatedScalarEvolution &PSE,
LoopInfo *LI, LoopVectorizationLegality *Legal,
const TargetTransformInfo &TTI,
const TargetLibraryInfo *TLI, DemandedBits *DB,
OptimizationRemarkEmitter *ORE, const Function *F,
const LoopVectorizeHints *Hints,
InterleavedAccessInfo &IAI)
- : TheLoop(L), PSE(PSE), LI(LI), Legal(Legal), TTI(TTI), TLI(TLI), DB(DB),
- AC(AC), ORE(ORE), TheFunction(F), Hints(Hints), InterleaveInfo(IAI) {}
+ : ScalarEpilogueStatus(SEL), TheLoop(L), PSE(PSE),
+ LI(LI), Legal(Legal), TTI(TTI), TLI(TLI), DB(DB), AC(AC), ORE(ORE),
+ TheFunction(F), Hints(Hints), InterleaveInfo(IAI) {}
/// \return An upper bound for the vectorization factor, or None if
/// vectorization and interleaving should be avoided up front.
- Optional<unsigned> computeMaxVF(bool OptForSize);
+ Optional<unsigned> computeMaxVF();
/// \return The most profitable vectorization factor and the cost of that VF.
/// This method checks every power of two up to MaxVF. If UserVF is not ZERO
/// If interleave count has been specified by metadata it will be returned.
/// Otherwise, the interleave count is computed and returned. VF and LoopCost
/// are the selected vectorization factor and the cost of the selected VF.
- unsigned selectInterleaveCount(bool OptForSize, unsigned VF,
- unsigned LoopCost);
+ unsigned selectInterleaveCount(unsigned VF, unsigned LoopCost);
/// Memory access instruction may be vectorized in more than one way.
/// Form of instruction after vectorization depends on cost.
/// to handle accesses with gaps, and there is nothing preventing us from
/// creating a scalar epilogue.
bool requiresScalarEpilogue() const {
- return IsScalarEpilogueAllowed && InterleaveInfo.requiresScalarEpilogue();
+ return isScalarEpilogueAllowed() && InterleaveInfo.requiresScalarEpilogue();
}
- /// Returns true if a scalar epilogue is not allowed due to optsize.
- bool isScalarEpilogueAllowed() const { return IsScalarEpilogueAllowed; }
+ /// Returns true if a scalar epilogue is not allowed due to optsize or a
+ /// loop hint annotation.
+ bool isScalarEpilogueAllowed() const {
+ return ScalarEpilogueStatus == CM_ScalarEpilogueAllowed;
+ }
/// Returns true if all loop blocks should be masked to fold tail loop.
bool foldTailByMasking() const { return FoldTailByMasking; }
/// \return An upper bound for the vectorization factor, larger than zero.
/// One is returned if vectorization should best be avoided due to cost.
- unsigned computeFeasibleMaxVF(bool OptForSize, unsigned ConstTripCount);
+ unsigned computeFeasibleMaxVF(unsigned ConstTripCount);
/// The vectorization cost is a combination of the cost itself and a boolean
/// indicating whether any of the contributing operations will actually
SmallPtrSet<BasicBlock *, 4> PredicatedBBsAfterVectorization;
/// Records whether it is allowed to have the original scalar loop execute at
- /// least once. This may be needed as a fallback loop in case runtime
+ /// least once. This may be needed as a fallback loop in case runtime
/// aliasing/dependence checks fail, or to handle the tail/remainder
/// iterations when the trip count is unknown or doesn't divide by the VF,
/// or as a peel-loop to handle gaps in interleave-groups.
/// Under optsize and when the trip count is very small we don't allow any
/// iterations to execute in the scalar loop.
- bool IsScalarEpilogueAllowed = true;
+ ScalarEpilogueLowering ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
/// All blocks of loop are to be masked to fold tail of scalar iterations.
bool FoldTailByMasking = false;
// Check if masking is required.
// A Group may need masking for one of two reasons: it resides in a block that
// needs predication, or it was decided to use masking to deal with gaps.
- bool PredicatedAccessRequiresMasking =
+ bool PredicatedAccessRequiresMasking =
Legal->blockNeedsPredication(I->getParent()) && Legal->isMaskRequired(I);
- bool AccessWithGapsRequiresMasking =
- Group->requiresScalarEpilogue() && !IsScalarEpilogueAllowed;
+ bool AccessWithGapsRequiresMasking =
+ Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed();
if (!PredicatedAccessRequiresMasking && !AccessWithGapsRequiresMasking)
return true;
Uniforms[VF].insert(Worklist.begin(), Worklist.end());
}
-Optional<unsigned> LoopVectorizationCostModel::computeMaxVF(bool OptForSize) {
+Optional<unsigned> LoopVectorizationCostModel::computeMaxVF() {
if (Legal->getRuntimePointerChecking()->Need && TTI.hasBranchDivergence()) {
// TODO: It may by useful to do since it's still likely to be dynamically
// uniform if the target can skip.
}
unsigned TC = PSE.getSE()->getSmallConstantTripCount(TheLoop);
- if (!OptForSize) // Remaining checks deal with scalar loop when OptForSize.
- return computeFeasibleMaxVF(OptForSize, TC);
+ if (isScalarEpilogueAllowed())
+ return computeFeasibleMaxVF(TC);
+
+ LLVM_DEBUG(dbgs() << "LV: Not allowing scalar epilogue.\n" <<
+ "LV: Performing code size checks.\n");
if (Legal->getRuntimePointerChecking()->Need) {
ORE->emit(createMissedAnalysis("CantVersionLoopWithOptForSize")
// Record that scalar epilogue is not allowed.
LLVM_DEBUG(dbgs() << "LV: Not allowing scalar epilogue due to -Os/-Oz.\n");
- IsScalarEpilogueAllowed = !OptForSize;
-
// We don't create an epilogue when optimizing for size.
// Invalidate interleave groups that require an epilogue if we can't mask
// the interleave-group.
- if (!useMaskedInterleavedAccesses(TTI))
+ if (!useMaskedInterleavedAccesses(TTI))
InterleaveInfo.invalidateGroupsRequiringScalarEpilogue();
- unsigned MaxVF = computeFeasibleMaxVF(OptForSize, TC);
+ unsigned MaxVF = computeFeasibleMaxVF(TC);
if (TC > 0 && TC % MaxVF == 0) {
LLVM_DEBUG(dbgs() << "LV: No tail will remain for any chosen VF.\n");
}
unsigned
-LoopVectorizationCostModel::computeFeasibleMaxVF(bool OptForSize,
- unsigned ConstTripCount) {
+LoopVectorizationCostModel::computeFeasibleMaxVF(unsigned ConstTripCount) {
MinBWs = computeMinimumValueSizes(TheLoop->getBlocks(), *DB, &TTI);
unsigned SmallestType, WidestType;
std::tie(SmallestType, WidestType) = getSmallestAndWidestTypes();
}
unsigned MaxVF = MaxVectorSize;
- if (TTI.shouldMaximizeVectorBandwidth(OptForSize) ||
- (MaximizeBandwidth && !OptForSize)) {
+ if (TTI.shouldMaximizeVectorBandwidth(!isScalarEpilogueAllowed()) ||
+ (MaximizeBandwidth && isScalarEpilogueAllowed())) {
// Collect all viable vectorization factors larger than the default MaxVF
// (i.e. MaxVectorSize).
SmallVector<unsigned, 8> VFs;
return {MinWidth, MaxWidth};
}
-unsigned LoopVectorizationCostModel::selectInterleaveCount(bool OptForSize,
- unsigned VF,
+unsigned LoopVectorizationCostModel::selectInterleaveCount(unsigned VF,
unsigned LoopCost) {
// -- The interleave heuristics --
// We interleave the loop in order to expose ILP and reduce the loop overhead.
// 3. We don't interleave if we think that we will spill registers to memory
// due to the increased register pressure.
- // When we optimize for size, we don't interleave.
- if (OptForSize)
+ if (!isScalarEpilogueAllowed())
return 1;
// We used the distance for the interleave count.
}
// Calculate the cost of the whole interleaved group.
- bool UseMaskForGaps =
- Group->requiresScalarEpilogue() && !IsScalarEpilogueAllowed;
+ bool UseMaskForGaps =
+ Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed();
unsigned Cost = TTI.getInterleavedMemoryOpCost(
I->getOpcode(), WideVecTy, Group->getFactor(), Indices,
Group->getAlignment(), AS, Legal->isMaskRequired(I), UseMaskForGaps);
}
VectorizationFactor
-LoopVectorizationPlanner::planInVPlanNativePath(bool OptForSize,
- unsigned UserVF) {
+LoopVectorizationPlanner::planInVPlanNativePath(unsigned UserVF) {
unsigned VF = UserVF;
// Outer loop handling: They may require CFG and instruction level
// transformations before even evaluating whether vectorization is profitable.
return VectorizationFactor::Disabled();
}
-Optional<VectorizationFactor> LoopVectorizationPlanner::plan(bool OptForSize,
- unsigned UserVF) {
+Optional<VectorizationFactor> LoopVectorizationPlanner::plan(unsigned UserVF) {
assert(OrigLoop->empty() && "Inner loop expected.");
- Optional<unsigned> MaybeMaxVF = CM.computeMaxVF(OptForSize);
+ Optional<unsigned> MaybeMaxVF = CM.computeMaxVF();
if (!MaybeMaxVF) // Cases that should not to be vectorized nor interleaved.
return None;
assert(EnableVPlanNativePath && "VPlan-native path is disabled.");
Function *F = L->getHeader()->getParent();
InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL->getLAI());
- LoopVectorizationCostModel CM(L, PSE, LI, LVL, *TTI, TLI, DB, AC, ORE, F,
- &Hints, IAI);
+
+ ScalarEpilogueLowering SEL = CM_ScalarEpilogueAllowed;
+ if (Hints.getForce() != LoopVectorizeHints::FK_Enabled &&
+ (F->hasOptSize() ||
+ llvm::shouldOptimizeForSize(L->getHeader(), PSI, BFI)))
+ SEL = CM_ScalarEpilogueNotAllowedOptSize;
+
+ LoopVectorizationCostModel CM(SEL, L, PSE, LI, LVL, *TTI, TLI,
+ DB, AC, ORE, F, &Hints, IAI);
// Use the planner for outer loop vectorization.
// TODO: CM is not used at this point inside the planner. Turn CM into an
// optional argument if we don't need it in the future.
// Get user vectorization factor.
const unsigned UserVF = Hints.getWidth();
- // Check the function attributes and profiles to find out if this function
- // should be optimized for size.
- bool OptForSize =
- Hints.getForce() != LoopVectorizeHints::FK_Enabled &&
- (F->hasOptSize() ||
- llvm::shouldOptimizeForSize(L->getHeader(), PSI, BFI));
-
// Plan how to best vectorize, return the best VF and its cost.
- const VectorizationFactor VF = LVP.planInVPlanNativePath(OptForSize, UserVF);
+ const VectorizationFactor VF = LVP.planInVPlanNativePath(UserVF);
// If we are stress testing VPlan builds, do not attempt to generate vector
// code. Masked vector code generation support will follow soon.
// Check the function attributes and profiles to find out if this function
// should be optimized for size.
- bool OptForSize =
- Hints.getForce() != LoopVectorizeHints::FK_Enabled &&
+ ScalarEpilogueLowering SEL = CM_ScalarEpilogueAllowed;
+ if (Hints.getForce() != LoopVectorizeHints::FK_Enabled &&
(F->hasOptSize() ||
- llvm::shouldOptimizeForSize(L->getHeader(), PSI, BFI));
+ llvm::shouldOptimizeForSize(L->getHeader(), PSI, BFI)))
+ SEL = CM_ScalarEpilogueNotAllowedOptSize;
// Entrance to the VPlan-native vectorization path. Outer loops are processed
// here. They may require CFG and instruction level transformations before
// Loops with a very small trip count are considered for vectorization
// under OptForSize, thereby making sure the cost of their loop body is
// dominant, free of runtime guards and scalar iteration overheads.
- OptForSize = true;
+ SEL = CM_ScalarEpilogueNotAllowedLowTripLoop;
}
}
}
// Use the cost model.
- LoopVectorizationCostModel CM(L, PSE, LI, &LVL, *TTI, TLI, DB, AC, ORE, F,
- &Hints, IAI);
+ LoopVectorizationCostModel CM(SEL, L, PSE, LI, &LVL, *TTI, TLI,
+ DB, AC, ORE, F, &Hints, IAI);
CM.collectValuesToIgnore();
// Use the planner for vectorization.
unsigned UserVF = Hints.getWidth();
// Plan how to best vectorize, return the best VF and its cost.
- Optional<VectorizationFactor> MaybeVF = LVP.plan(OptForSize, UserVF);
+ Optional<VectorizationFactor> MaybeVF = LVP.plan(UserVF);
VectorizationFactor VF = VectorizationFactor::Disabled();
unsigned IC = 1;
if (MaybeVF) {
VF = *MaybeVF;
// Select the interleave count.
- IC = CM.selectInterleaveCount(OptForSize, VF.Width, VF.Cost);
+ IC = CM.selectInterleaveCount(VF.Width, VF.Cost);
}
// Identify the diagnostic messages that should be produced.