// Loop vectorization cost-model hints that indicate how the scalar epilogue
// loop should be lowered.
enum ScalarEpilogueLowering {
+
+ // The default: allowing scalar epilogues.
CM_ScalarEpilogueAllowed,
+
+ // Vectorization with OptForSize: don't allow epilogues.
CM_ScalarEpilogueNotAllowedOptSize,
- CM_ScalarEpilogueNotAllowedLowTripLoop
+
+ // A special case of vectorization with OptForSize: loops with a very small
+ // trip count are considered for vectorization under OptForSize, thereby
+ // making sure the cost of their loop body is dominant, free of runtime
+ // guards and scalar iteration overheads.
+ CM_ScalarEpilogueNotAllowedLowTripLoop,
+
+ // Loop hint predicate indicating an epilogue is undesired.
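+ // This can be requested per loop, e.g. with
+ // "#pragma clang loop vectorize_predicate(enable)", which attaches the
+ // "llvm.loop.vectorize.predicate.enable" metadata read by
+ // Hints.getPredicate(); see the tests at the end of this patch.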
+ CM_ScalarEpilogueNotNeededPredicatePragma
};
/// LoopVectorizationCostModel - estimates the expected speedups due to
/// vectorization.
class LoopVectorizationCostModel {
public:
LoopVectorizationCostModel(ScalarEpilogueLowering SEL, Loop *L,
- PredicatedScalarEvolution &PSE,
- LoopInfo *LI, LoopVectorizationLegality *Legal,
+ PredicatedScalarEvolution &PSE, LoopInfo *LI,
+ LoopVectorizationLegality *Legal,
const TargetTransformInfo &TTI,
const TargetLibraryInfo *TLI, DemandedBits *DB,
AssumptionCache *AC,
OptimizationRemarkEmitter *ORE, const Function *F,
const LoopVectorizeHints *Hints,
InterleavedAccessInfo &IAI)
- : ScalarEpilogueStatus(SEL), TheLoop(L), PSE(PSE),
- LI(LI), Legal(Legal), TTI(TTI), TLI(TLI), DB(DB), AC(AC), ORE(ORE),
- TheFunction(F), Hints(Hints), InterleaveInfo(IAI) {}
+ : ScalarEpilogueStatus(SEL), TheLoop(L), PSE(PSE), LI(LI), Legal(Legal),
+ TTI(TTI), TLI(TLI), DB(DB), AC(AC), ORE(ORE), TheFunction(F),
+ Hints(Hints), InterleaveInfo(IAI) {}
/// \return An upper bound for the vectorization factor, or None if
/// vectorization and interleaving should be avoided up front.
Optional<unsigned> computeMaxVF();
+ /// \return True if runtime checks are required for vectorization, and false
+ /// otherwise.
+ bool runtimeChecksRequired();
+
/// \return The most profitable vectorization factor and the cost of that VF.
/// This method checks every power of two up to MaxVF. If UserVF is not ZERO
  /// then this vectorization factor will be selected if vectorization is
  /// possible.
Uniforms[VF].insert(Worklist.begin(), Worklist.end());
}
-Optional<unsigned> LoopVectorizationCostModel::computeMaxVF() {
- if (Legal->getRuntimePointerChecking()->Need && TTI.hasBranchDivergence()) {
- // TODO: It may by useful to do since it's still likely to be dynamically
- // uniform if the target can skip.
- LLVM_DEBUG(
- dbgs() << "LV: Not inserting runtime ptr check for divergent target");
-
- ORE->emit(
- createMissedAnalysis("CantVersionLoopWithDivergentTarget")
- << "runtime pointer checks needed. Not enabled for divergent target");
-
- return None;
- }
-
- unsigned TC = PSE.getSE()->getSmallConstantTripCount(TheLoop);
- if (isScalarEpilogueAllowed())
- return computeFeasibleMaxVF(TC);
-
- LLVM_DEBUG(dbgs() << "LV: Not allowing scalar epilogue.\n" <<
- "LV: Performing code size checks.\n");
+bool LoopVectorizationCostModel::runtimeChecksRequired() {
+ LLVM_DEBUG(dbgs() << "LV: Performing code size checks.\n");
if (Legal->getRuntimePointerChecking()->Need) {
    ORE->emit(createMissedAnalysis("CantVersionLoopWithOptForSize")
              << "runtime pointer checks needed. Enable vectorization of this "
                 "loop with '#pragma clang loop vectorize(enable)' when "
                 "compiling with -Os/-Oz");
LLVM_DEBUG(
dbgs()
<< "LV: Aborting. Runtime ptr check is required with -Os/-Oz.\n");
- return None;
+ return true;
}
if (!PSE.getUnionPredicate().getPredicates().empty()) {
LLVM_DEBUG(
dbgs()
<< "LV: Aborting. Runtime SCEV check is required with -Os/-Oz.\n");
- return None;
+ return true;
}
  // FIXME: Avoid specializing for stride==1 instead of bailing out.
  if (!Legal->getLAI()->getSymbolicStrides().empty()) {
LLVM_DEBUG(
dbgs()
<< "LV: Aborting. Runtime stride check is required with -Os/-Oz.\n");
+ return true;
+ }
+
+ return false;
+}
+
+Optional<unsigned> LoopVectorizationCostModel::computeMaxVF() {
+ if (Legal->getRuntimePointerChecking()->Need && TTI.hasBranchDivergence()) {
+ // TODO: It may be useful to do this, since the check is still likely to
+ // be dynamically uniform if the target can skip it.
+ LLVM_DEBUG(
+ dbgs() << "LV: Not inserting runtime ptr check for divergent target");
+
+ ORE->emit(
+ createMissedAnalysis("CantVersionLoopWithDivergentTarget")
+ << "runtime pointer checks needed. Not enabled for divergent target");
+
return None;
}
- // If we optimize the program for size, avoid creating the tail loop.
+ unsigned TC = PSE.getSE()->getSmallConstantTripCount(TheLoop);
LLVM_DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n');
-
if (TC == 1) {
ORE->emit(createMissedAnalysis("SingleIterationLoop")
<< "loop trip count is one, irrelevant for vectorization");
return None;
}
- // Record that scalar epilogue is not allowed.
- LLVM_DEBUG(dbgs() << "LV: Not allowing scalar epilogue due to -Os/-Oz.\n");
+ switch (ScalarEpilogueStatus) {
+ default:
+ return None;
+ case CM_ScalarEpilogueAllowed:
+ return computeFeasibleMaxVF(TC);
+ case CM_ScalarEpilogueNotNeededPredicatePragma:
+ LLVM_DEBUG(
+ dbgs() << "LV: vector predicate hint found.\n"
+ << "LV: Not allowing scalar epilogue, creating predicated "
+ << "vector loop.\n");
+ break;
+ case CM_ScalarEpilogueNotAllowedLowTripLoop:
+ // Fall through, as a low-trip-count loop is a special case of OptForSize.
+ case CM_ScalarEpilogueNotAllowedOptSize:
+ if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedOptSize)
+ LLVM_DEBUG(
+ dbgs() << "LV: Not allowing scalar epilogue due to -Os/-Oz.\n");
+ else
+ LLVM_DEBUG(dbgs() << "LV: Not allowing scalar epilogue due to low trip "
+ << "count.\n");
+
+ // Bail out if runtime checks are required; they are undesirable when
+ // optimizing for size.
+ if (runtimeChecksRequired())
+ return None;
+ break;
+ }
+
+ // Now try tail folding: fold the remainder iterations into the vector loop
+ // by masking, instead of creating a scalar epilogue to execute them.
- // We don't create an epilogue when optimizing for size.
// Invalidate interleave groups that require an epilogue if we can't mask
// the interleave-group.
if (!useMaskedInterleavedAccesses(TTI))
InterleaveInfo.invalidateGroupsRequiringScalarEpilogue();
unsigned MaxVF = computeFeasibleMaxVF(TC);
-
if (TC > 0 && TC % MaxVF == 0) {
+ // Accept MaxVF if we do not have a tail.
LLVM_DEBUG(dbgs() << "LV: No tail will remain for any chosen VF.\n");
return MaxVF;
}
State.ILV->vectorizeMemoryInstruction(&Instr, &MaskValues);
}
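+// Decide how the scalar epilogue of a loop should be lowered: an epilogue is
+// disallowed when optimizing for size (unless vectorization was explicitly
+// forced), and a predicated vector loop is preferred when the loop carries
+// the vectorize_predicate hint.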
+static ScalarEpilogueLowering
+getScalarEpilogueLowering(Function *F, Loop *L, LoopVectorizeHints &Hints,
+ ProfileSummaryInfo *PSI, BlockFrequencyInfo *BFI) {
+ ScalarEpilogueLowering SEL = CM_ScalarEpilogueAllowed;
+ if (Hints.getForce() != LoopVectorizeHints::FK_Enabled &&
+ (F->hasOptSize() ||
+ llvm::shouldOptimizeForSize(L->getHeader(), PSI, BFI)))
+ SEL = CM_ScalarEpilogueNotAllowedOptSize;
+ else if (Hints.getPredicate())
+ SEL = CM_ScalarEpilogueNotNeededPredicatePragma;
+
+ return SEL;
+}
+
// Process the loop in the VPlan-native vectorization path. This path builds
// VPlan upfront in the vectorization pipeline, which allows applying
// VPlan-to-VPlan transformations from the very beginning without modifying the
// input IR.
assert(EnableVPlanNativePath && "VPlan-native path is disabled.");
Function *F = L->getHeader()->getParent();
InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL->getLAI());
+ ScalarEpilogueLowering SEL = getScalarEpilogueLowering(F, L, Hints, PSI, BFI);
- ScalarEpilogueLowering SEL = CM_ScalarEpilogueAllowed;
- if (Hints.getForce() != LoopVectorizeHints::FK_Enabled &&
- (F->hasOptSize() ||
- llvm::shouldOptimizeForSize(L->getHeader(), PSI, BFI)))
- SEL = CM_ScalarEpilogueNotAllowedOptSize;
-
- LoopVectorizationCostModel CM(SEL, L, PSE, LI, LVL, *TTI, TLI,
- DB, AC, ORE, F, &Hints, IAI);
+ LoopVectorizationCostModel CM(SEL, L, PSE, LI, LVL, *TTI, TLI, DB, AC, ORE, F,
+ &Hints, IAI);
// Use the planner for outer loop vectorization.
// TODO: CM is not used at this point inside the planner. Turn CM into an
// optional argument if we don't need it in the future.
// Check the function attributes and profiles to find out if this function
// should be optimized for size.
- ScalarEpilogueLowering SEL = CM_ScalarEpilogueAllowed;
- if (Hints.getForce() != LoopVectorizeHints::FK_Enabled &&
- (F->hasOptSize() ||
- llvm::shouldOptimizeForSize(L->getHeader(), PSI, BFI)))
- SEL = CM_ScalarEpilogueNotAllowedOptSize;
+ ScalarEpilogueLowering SEL = getScalarEpilogueLowering(F, L, Hints, PSI, BFI);
// Entrance to the VPlan-native vectorization path. Outer loops are processed
// here. They may require CFG and instruction level transformations before
LLVM_DEBUG(dbgs() << " But vectorizing was explicitly forced.\n");
else {
LLVM_DEBUG(dbgs() << "\n");
- // Loops with a very small trip count are considered for vectorization
- // under OptForSize, thereby making sure the cost of their loop body is
- // dominant, free of runtime guards and scalar iteration overheads.
SEL = CM_ScalarEpilogueNotAllowedLowTripLoop;
}
}
}
// Use the cost model.
- LoopVectorizationCostModel CM(SEL, L, PSE, LI, &LVL, *TTI, TLI,
- DB, AC, ORE, F, &Hints, IAI);
+ LoopVectorizationCostModel CM(SEL, L, PSE, LI, &LVL, *TTI, TLI, DB, AC, ORE,
+ F, &Hints, IAI);
CM.collectValuesToIgnore();
// Use the planner for vectorization.
--- /dev/null
+; RUN: opt < %s -loop-vectorize -S | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
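+; Both functions below implement the same loop, roughly equivalent to this C
+; source (shown for illustration only; the IR below is what is tested):
+;
+;   void f(int *A, int *B, int *C) {
+;     for (int i = 0; i < 430; i++)
+;       A[i] = B[i] + C[i];
+;   }
+;
+; They differ only in their llvm.loop.vectorize.predicate.enable metadata:
+; 'i1 true' requests folding the tail into a masked vector body, 'i1 false'
+; disables it.
+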
+define dso_local void @tail_folding_enabled(i32* noalias nocapture %A, i32* noalias nocapture readonly %B, i32* noalias nocapture readonly %C) local_unnamed_addr #0 {
+; CHECK-LABEL: tail_folding_enabled(
+; CHECK: vector.body:
+; CHECK: %wide.masked.load = call <8 x i32> @llvm.masked.load.v8i32.p0v8i32(
+; CHECK: %wide.masked.load1 = call <8 x i32> @llvm.masked.load.v8i32.p0v8i32(
+; CHECK: %8 = add nsw <8 x i32> %wide.masked.load1, %wide.masked.load
+; CHECK: call void @llvm.masked.store.v8i32.p0v8i32(
+; CHECK: %index.next = add i64 %index, 8
+; CHECK: %12 = icmp eq i64 %index.next, 432
+; CHECK: br i1 %12, label %middle.block, label %vector.body, !llvm.loop !0
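+;
+; With tail folding, the 430 iterations are executed as ceil(430/8) = 54
+; masked vector iterations, hence the 54 * 8 = 432 bound checked above.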
+
+entry:
+ br label %for.body
+
+for.cond.cleanup:
+ ret void
+
+for.body:
+ %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+ %arrayidx = getelementptr inbounds i32, i32* %B, i64 %indvars.iv
+ %0 = load i32, i32* %arrayidx, align 4
+ %arrayidx2 = getelementptr inbounds i32, i32* %C, i64 %indvars.iv
+ %1 = load i32, i32* %arrayidx2, align 4
+ %add = add nsw i32 %1, %0
+ %arrayidx4 = getelementptr inbounds i32, i32* %A, i64 %indvars.iv
+ store i32 %add, i32* %arrayidx4, align 4
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %exitcond = icmp eq i64 %indvars.iv.next, 430
+ br i1 %exitcond, label %for.cond.cleanup, label %for.body, !llvm.loop !6
+}
+
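+; With the predicate hint set to false, the tail is not folded: no masked
+; loads or stores are emitted, and the remainder iterations are left to a
+; scalar epilogue.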
+define dso_local void @tail_folding_disabled(i32* noalias nocapture %A, i32* noalias nocapture readonly %B, i32* noalias nocapture readonly %C) local_unnamed_addr #0 {
+; CHECK-LABEL: tail_folding_disabled(
+; CHECK: vector.body:
+; CHECK-NOT: @llvm.masked.load.v8i32.p0v8i32(
+; CHECK-NOT: @llvm.masked.store.v8i32.p0v8i32(
+; CHECK: br i1 %44, label {{.*}}, label %vector.body
+entry:
+ br label %for.body
+
+for.cond.cleanup:
+ ret void
+
+for.body:
+ %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+ %arrayidx = getelementptr inbounds i32, i32* %B, i64 %indvars.iv
+ %0 = load i32, i32* %arrayidx, align 4
+ %arrayidx2 = getelementptr inbounds i32, i32* %C, i64 %indvars.iv
+ %1 = load i32, i32* %arrayidx2, align 4
+ %add = add nsw i32 %1, %0
+ %arrayidx4 = getelementptr inbounds i32, i32* %A, i64 %indvars.iv
+ store i32 %add, i32* %arrayidx4, align 4
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %exitcond = icmp eq i64 %indvars.iv.next, 430
+ br i1 %exitcond, label %for.cond.cleanup, label %for.body, !llvm.loop !10
+}
+
+; CHECK: !0 = distinct !{!0, !1}
+; CHECK-NEXT: !1 = !{!"llvm.loop.isvectorized", i32 1}
+; CHECK-NEXT: !2 = distinct !{!2, !3, !1}
+; CHECK-NEXT: !3 = !{!"llvm.loop.unroll.runtime.disable"}
+; CHECK-NEXT: !4 = distinct !{!4, !1}
+; CHECK-NEXT: !5 = distinct !{!5, !3, !1}
+
+attributes #0 = { nounwind optsize uwtable "target-cpu"="core-avx2" "target-features"="+avx,+avx2" }
+
+!6 = distinct !{!6, !7, !8}
+!7 = !{!"llvm.loop.vectorize.predicate.enable", i1 true}
+!8 = !{!"llvm.loop.vectorize.enable", i1 true}
+
+!10 = distinct !{!10, !11, !12}
+!11 = !{!"llvm.loop.vectorize.predicate.enable", i1 false}
+!12 = !{!"llvm.loop.vectorize.enable", i1 true}