[LV] Scalar Epilogue Lowering. NFC.

author Sjoerd Meijer <sjoerd.meijer@arm.com>

Thu, 25 Jul 2019 08:06:02 +0000 (08:06 +0000)

committer Sjoerd Meijer <sjoerd.meijer@arm.com>

Thu, 25 Jul 2019 08:06:02 +0000 (08:06 +0000)
author Sjoerd Meijer <sjoerd.meijer@arm.com>
Thu, 25 Jul 2019 08:06:02 +0000 (08:06 +0000)
committer Sjoerd Meijer <sjoerd.meijer@arm.com>
Thu, 25 Jul 2019 08:06:02 +0000 (08:06 +0000)
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h

index 97077cc..a5e85f2 100644 (file)
--- a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
@@ -228,11 +228,11 @@ public:
  
    /// Plan how to best vectorize, return the best VF and its cost, or None if
    /// vectorization and interleaving should be avoided up front.
-  Optional<VectorizationFactor> plan(bool OptForSize, unsigned UserVF);
+  Optional<VectorizationFactor> plan(unsigned UserVF);
  
    /// Use the VPlan-native path to plan how to best vectorize, return the best
    /// VF and its cost.
-  VectorizationFactor planInVPlanNativePath(bool OptForSize, unsigned UserVF);
+  VectorizationFactor planInVPlanNativePath(unsigned UserVF);
  
    /// Finalize the best decision and dispose of all other VPlans.
    void setBestPlan(unsigned VF, unsigned UF);
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

index 46265e3..62b9ba8 100644 (file)
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -836,6 +836,14 @@ void InnerLoopVectorizer::addMetadata(ArrayRef<Value *> To,
  
  namespace llvm {
  
+// Loop vectorization cost-model hints how the scalar epilogue loop should be
+// lowered.
+enum ScalarEpilogueLowering {
+  CM_ScalarEpilogueAllowed,
+  CM_ScalarEpilogueNotAllowedOptSize,
+  CM_ScalarEpilogueNotAllowedLowTripLoop
+};
+
  /// LoopVectorizationCostModel - estimates the expected speedups due to
  /// vectorization.
  /// In many cases vectorization is not profitable. This can happen because of
@@ -845,7 +853,8 @@ namespace llvm {
  /// different operations.
  class LoopVectorizationCostModel {
  public:
-  LoopVectorizationCostModel(Loop *L, PredicatedScalarEvolution &PSE,
+  LoopVectorizationCostModel(ScalarEpilogueLowering SEL, Loop *L,
+                             PredicatedScalarEvolution &PSE,
                               LoopInfo *LI, LoopVectorizationLegality *Legal,
                               const TargetTransformInfo &TTI,
                               const TargetLibraryInfo *TLI, DemandedBits *DB,
@@ -853,12 +862,13 @@ public:
                               OptimizationRemarkEmitter *ORE, const Function *F,
                               const LoopVectorizeHints *Hints,
                               InterleavedAccessInfo &IAI)
-      : TheLoop(L), PSE(PSE), LI(LI), Legal(Legal), TTI(TTI), TLI(TLI), DB(DB),
-    AC(AC), ORE(ORE), TheFunction(F), Hints(Hints), InterleaveInfo(IAI) {}
+      : ScalarEpilogueStatus(SEL), TheLoop(L), PSE(PSE),
+    LI(LI), Legal(Legal), TTI(TTI), TLI(TLI), DB(DB), AC(AC), ORE(ORE),
+    TheFunction(F), Hints(Hints), InterleaveInfo(IAI) {}
  
    /// \return An upper bound for the vectorization factor, or None if
    /// vectorization and interleaving should be avoided up front.
-  Optional<unsigned> computeMaxVF(bool OptForSize);
+  Optional<unsigned> computeMaxVF();
  
    /// \return The most profitable vectorization factor and the cost of that VF.
    /// This method checks every power of two up to MaxVF. If UserVF is not ZERO
@@ -881,8 +891,7 @@ public:
    /// If interleave count has been specified by metadata it will be returned.
    /// Otherwise, the interleave count is computed and returned. VF and LoopCost
    /// are the selected vectorization factor and the cost of the selected VF.
-  unsigned selectInterleaveCount(bool OptForSize, unsigned VF,
-                                 unsigned LoopCost);
+  unsigned selectInterleaveCount(unsigned VF, unsigned LoopCost);
  
    /// Memory access instruction may be vectorized in more than one way.
    /// Form of instruction after vectorization depends on cost.
@@ -1157,11 +1166,14 @@ public:
    /// to handle accesses with gaps, and there is nothing preventing us from
    /// creating a scalar epilogue.
    bool requiresScalarEpilogue() const {
-    return IsScalarEpilogueAllowed && InterleaveInfo.requiresScalarEpilogue();
+    return isScalarEpilogueAllowed() && InterleaveInfo.requiresScalarEpilogue();
    }
  
-  /// Returns true if a scalar epilogue is not allowed due to optsize.
-  bool isScalarEpilogueAllowed() const { return IsScalarEpilogueAllowed; }
+  /// Returns true if a scalar epilogue is not allowed due to optsize or a
+  /// loop hint annotation.
+  bool isScalarEpilogueAllowed() const {
+    return ScalarEpilogueStatus == CM_ScalarEpilogueAllowed;
+  }
  
    /// Returns true if all loop blocks should be masked to fold tail loop.
    bool foldTailByMasking() const { return FoldTailByMasking; }
@@ -1187,7 +1199,7 @@ private:
  
    /// \return An upper bound for the vectorization factor, larger than zero.
    /// One is returned if vectorization should best be avoided due to cost.
-  unsigned computeFeasibleMaxVF(bool OptForSize, unsigned ConstTripCount);
+  unsigned computeFeasibleMaxVF(unsigned ConstTripCount);
  
    /// The vectorization cost is a combination of the cost itself and a boolean
    /// indicating whether any of the contributing operations will actually
@@ -1270,13 +1282,13 @@ private:
    SmallPtrSet<BasicBlock *, 4> PredicatedBBsAfterVectorization;
  
    /// Records whether it is allowed to have the original scalar loop execute at
-  /// least once. This may be needed as a fallback loop in case runtime 
+  /// least once. This may be needed as a fallback loop in case runtime
    /// aliasing/dependence checks fail, or to handle the tail/remainder
    /// iterations when the trip count is unknown or doesn't divide by the VF,
    /// or as a peel-loop to handle gaps in interleave-groups.
    /// Under optsize and when the trip count is very small we don't allow any
    /// iterations to execute in the scalar loop.
-  bool IsScalarEpilogueAllowed = true;
+  ScalarEpilogueLowering ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
  
    /// All blocks of loop are to be masked to fold tail of scalar iterations.
    bool FoldTailByMasking = false;
@@ -4452,10 +4464,10 @@ bool LoopVectorizationCostModel::interleavedAccessCanBeWidened(Instruction *I,
    // Check if masking is required.
    // A Group may need masking for one of two reasons: it resides in a block that
    // needs predication, or it was decided to use masking to deal with gaps.
-  bool PredicatedAccessRequiresMasking = 
+  bool PredicatedAccessRequiresMasking =
        Legal->blockNeedsPredication(I->getParent()) && Legal->isMaskRequired(I);
-  bool AccessWithGapsRequiresMasking = 
-      Group->requiresScalarEpilogue() && !IsScalarEpilogueAllowed;
+  bool AccessWithGapsRequiresMasking =
+      Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed();
    if (!PredicatedAccessRequiresMasking && !AccessWithGapsRequiresMasking)
      return true;
  
@@ -4675,7 +4687,7 @@ void LoopVectorizationCostModel::collectLoopUniforms(unsigned VF) {
    Uniforms[VF].insert(Worklist.begin(), Worklist.end());
  }
  
-Optional<unsigned> LoopVectorizationCostModel::computeMaxVF(bool OptForSize) {
+Optional<unsigned> LoopVectorizationCostModel::computeMaxVF() {
    if (Legal->getRuntimePointerChecking()->Need && TTI.hasBranchDivergence()) {
      // TODO: It may by useful to do since it's still likely to be dynamically
      // uniform if the target can skip.
@@ -4690,8 +4702,11 @@ Optional<unsigned> LoopVectorizationCostModel::computeMaxVF(bool OptForSize) {
    }
  
    unsigned TC = PSE.getSE()->getSmallConstantTripCount(TheLoop);
-  if (!OptForSize) // Remaining checks deal with scalar loop when OptForSize.
-    return computeFeasibleMaxVF(OptForSize, TC);
+  if (isScalarEpilogueAllowed())
+    return computeFeasibleMaxVF(TC);
+
+  LLVM_DEBUG(dbgs() << "LV: Not allowing scalar epilogue.\n" <<
+                       "LV: Performing code size checks.\n");
  
    if (Legal->getRuntimePointerChecking()->Need) {
      ORE->emit(createMissedAnalysis("CantVersionLoopWithOptForSize")
@@ -4740,15 +4755,13 @@ Optional<unsigned> LoopVectorizationCostModel::computeMaxVF(bool OptForSize) {
    // Record that scalar epilogue is not allowed.
    LLVM_DEBUG(dbgs() << "LV: Not allowing scalar epilogue due to -Os/-Oz.\n");
  
-  IsScalarEpilogueAllowed = !OptForSize;
-
    // We don't create an epilogue when optimizing for size.
    // Invalidate interleave groups that require an epilogue if we can't mask
    // the interleave-group.
-  if (!useMaskedInterleavedAccesses(TTI)) 
+  if (!useMaskedInterleavedAccesses(TTI))
      InterleaveInfo.invalidateGroupsRequiringScalarEpilogue();
  
-  unsigned MaxVF = computeFeasibleMaxVF(OptForSize, TC);
+  unsigned MaxVF = computeFeasibleMaxVF(TC);
  
    if (TC > 0 && TC % MaxVF == 0) {
      LLVM_DEBUG(dbgs() << "LV: No tail will remain for any chosen VF.\n");
@@ -4779,8 +4792,7 @@ Optional<unsigned> LoopVectorizationCostModel::computeMaxVF(bool OptForSize) {
  }
  
  unsigned
-LoopVectorizationCostModel::computeFeasibleMaxVF(bool OptForSize,
-                                                 unsigned ConstTripCount) {
+LoopVectorizationCostModel::computeFeasibleMaxVF(unsigned ConstTripCount) {
    MinBWs = computeMinimumValueSizes(TheLoop->getBlocks(), *DB, &TTI);
    unsigned SmallestType, WidestType;
    std::tie(SmallestType, WidestType) = getSmallestAndWidestTypes();
@@ -4818,8 +4830,8 @@ LoopVectorizationCostModel::computeFeasibleMaxVF(bool OptForSize,
    }
  
    unsigned MaxVF = MaxVectorSize;
-  if (TTI.shouldMaximizeVectorBandwidth(OptForSize) ||
-      (MaximizeBandwidth && !OptForSize)) {
+  if (TTI.shouldMaximizeVectorBandwidth(!isScalarEpilogueAllowed()) ||
+      (MaximizeBandwidth && isScalarEpilogueAllowed())) {
      // Collect all viable vectorization factors larger than the default MaxVF
      // (i.e. MaxVectorSize).
      SmallVector<unsigned, 8> VFs;
@@ -4958,8 +4970,7 @@ LoopVectorizationCostModel::getSmallestAndWidestTypes() {
    return {MinWidth, MaxWidth};
  }
  
-unsigned LoopVectorizationCostModel::selectInterleaveCount(bool OptForSize,
-                                                           unsigned VF,
+unsigned LoopVectorizationCostModel::selectInterleaveCount(unsigned VF,
                                                             unsigned LoopCost) {
    // -- The interleave heuristics --
    // We interleave the loop in order to expose ILP and reduce the loop overhead.
@@ -4975,8 +4986,7 @@ unsigned LoopVectorizationCostModel::selectInterleaveCount(bool OptForSize,
    // 3. We don't interleave if we think that we will spill registers to memory
    // due to the increased register pressure.
  
-  // When we optimize for size, we don't interleave.
-  if (OptForSize)
+  if (!isScalarEpilogueAllowed())
      return 1;
  
    // We used the distance for the interleave count.
@@ -5626,8 +5636,8 @@ unsigned LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I,
    }
  
    // Calculate the cost of the whole interleaved group.
-  bool UseMaskForGaps = 
-      Group->requiresScalarEpilogue() && !IsScalarEpilogueAllowed;
+  bool UseMaskForGaps =
+      Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed();
    unsigned Cost = TTI.getInterleavedMemoryOpCost(
        I->getOpcode(), WideVecTy, Group->getFactor(), Indices,
        Group->getAlignment(), AS, Legal->isMaskRequired(I), UseMaskForGaps);
@@ -6167,8 +6177,7 @@ static unsigned determineVPlanVF(const unsigned WidestVectorRegBits,
  }
  
  VectorizationFactor
-LoopVectorizationPlanner::planInVPlanNativePath(bool OptForSize,
-                                                unsigned UserVF) {
+LoopVectorizationPlanner::planInVPlanNativePath(unsigned UserVF) {
    unsigned VF = UserVF;
    // Outer loop handling: They may require CFG and instruction level
    // transformations before even evaluating whether vectorization is profitable.
@@ -6207,10 +6216,9 @@ LoopVectorizationPlanner::planInVPlanNativePath(bool OptForSize,
    return VectorizationFactor::Disabled();
  }
  
-Optional<VectorizationFactor> LoopVectorizationPlanner::plan(bool OptForSize,
-                                                             unsigned UserVF) {
+Optional<VectorizationFactor> LoopVectorizationPlanner::plan(unsigned UserVF) {
    assert(OrigLoop->empty() && "Inner loop expected.");
-  Optional<unsigned> MaybeMaxVF = CM.computeMaxVF(OptForSize);
+  Optional<unsigned> MaybeMaxVF = CM.computeMaxVF();
    if (!MaybeMaxVF) // Cases that should not to be vectorized nor interleaved.
      return None;
  
@@ -7213,8 +7221,15 @@ static bool processLoopInVPlanNativePath(
    assert(EnableVPlanNativePath && "VPlan-native path is disabled.");
    Function *F = L->getHeader()->getParent();
    InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL->getLAI());
-  LoopVectorizationCostModel CM(L, PSE, LI, LVL, *TTI, TLI, DB, AC, ORE, F,
-                                &Hints, IAI);
+
+  ScalarEpilogueLowering SEL = CM_ScalarEpilogueAllowed;
+  if (Hints.getForce() != LoopVectorizeHints::FK_Enabled &&
+      (F->hasOptSize() ||
+       llvm::shouldOptimizeForSize(L->getHeader(), PSI, BFI)))
+    SEL = CM_ScalarEpilogueNotAllowedOptSize;
+
+  LoopVectorizationCostModel CM(SEL, L, PSE, LI, LVL, *TTI, TLI,
+                                DB, AC, ORE, F, &Hints, IAI);
    // Use the planner for outer loop vectorization.
    // TODO: CM is not used at this point inside the planner. Turn CM into an
    // optional argument if we don't need it in the future.
@@ -7223,15 +7238,8 @@ static bool processLoopInVPlanNativePath(
    // Get user vectorization factor.
    const unsigned UserVF = Hints.getWidth();
  
-  // Check the function attributes and profiles to find out if this function
-  // should be optimized for size.
-  bool OptForSize =
-      Hints.getForce() != LoopVectorizeHints::FK_Enabled &&
-      (F->hasOptSize() ||
-       llvm::shouldOptimizeForSize(L->getHeader(), PSI, BFI));
-
    // Plan how to best vectorize, return the best VF and its cost.
-  const VectorizationFactor VF = LVP.planInVPlanNativePath(OptForSize, UserVF);
+  const VectorizationFactor VF = LVP.planInVPlanNativePath(UserVF);
  
    // If we are stress testing VPlan builds, do not attempt to generate vector
    // code. Masked vector code generation support will follow soon.
@@ -7310,10 +7318,11 @@ bool LoopVectorizePass::processLoop(Loop *L) {
  
    // Check the function attributes and profiles to find out if this function
    // should be optimized for size.
-  bool OptForSize =
-      Hints.getForce() != LoopVectorizeHints::FK_Enabled &&
+  ScalarEpilogueLowering SEL = CM_ScalarEpilogueAllowed;
+  if (Hints.getForce() != LoopVectorizeHints::FK_Enabled &&
        (F->hasOptSize() ||
-       llvm::shouldOptimizeForSize(L->getHeader(), PSI, BFI));
+       llvm::shouldOptimizeForSize(L->getHeader(), PSI, BFI)))
+    SEL = CM_ScalarEpilogueNotAllowedOptSize;
  
    // Entrance to the VPlan-native vectorization path. Outer loops are processed
    // here. They may require CFG and instruction level transformations before
@@ -7365,7 +7374,7 @@ bool LoopVectorizePass::processLoop(Loop *L) {
        // Loops with a very small trip count are considered for vectorization
        // under OptForSize, thereby making sure the cost of their loop body is
        // dominant, free of runtime guards and scalar iteration overheads.
-      OptForSize = true;
+      SEL = CM_ScalarEpilogueNotAllowedLowTripLoop;
      }
    }
  
@@ -7411,8 +7420,8 @@ bool LoopVectorizePass::processLoop(Loop *L) {
    }
  
    // Use the cost model.
-  LoopVectorizationCostModel CM(L, PSE, LI, &LVL, *TTI, TLI, DB, AC, ORE, F,
-                                &Hints, IAI);
+  LoopVectorizationCostModel CM(SEL, L, PSE, LI, &LVL, *TTI, TLI,
+                                DB, AC, ORE, F, &Hints, IAI);
    CM.collectValuesToIgnore();
  
    // Use the planner for vectorization.
@@ -7422,7 +7431,7 @@ bool LoopVectorizePass::processLoop(Loop *L) {
    unsigned UserVF = Hints.getWidth();
  
    // Plan how to best vectorize, return the best VF and its cost.
-  Optional<VectorizationFactor> MaybeVF = LVP.plan(OptForSize, UserVF);
+  Optional<VectorizationFactor> MaybeVF = LVP.plan(UserVF);
  
    VectorizationFactor VF = VectorizationFactor::Disabled();
    unsigned IC = 1;
@@ -7431,7 +7440,7 @@ bool LoopVectorizePass::processLoop(Loop *L) {
    if (MaybeVF) {
      VF = *MaybeVF;
      // Select the interleave count.
-    IC = CM.selectInterleaveCount(OptForSize, VF.Width, VF.Cost);
+    IC = CM.selectInterleaveCount(VF.Width, VF.Cost);
    }
  
    // Identify the diagnostic messages that should be produced.
author	Sjoerd Meijer <sjoerd.meijer@arm.com>
	Thu, 25 Jul 2019 08:06:02 +0000 (08:06 +0000)
committer	Sjoerd Meijer <sjoerd.meijer@arm.com>
	Thu, 25 Jul 2019 08:06:02 +0000 (08:06 +0000)
llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h		patch \| blob \| history
llvm/lib/Transforms/Vectorize/LoopVectorize.cpp		patch \| blob \| history