[LV] Fallback strategies if tail-folding fails

author Sjoerd Meijer <sjoerd.meijer@arm.com>

Wed, 26 Aug 2020 15:55:25 +0000 (16:55 +0100)

committer Sjoerd Meijer <sjoerd.meijer@arm.com>

Wed, 26 Aug 2020 15:55:25 +0000 (16:55 +0100)
author Sjoerd Meijer <sjoerd.meijer@arm.com>
Wed, 26 Aug 2020 15:55:25 +0000 (16:55 +0100)
committer Sjoerd Meijer <sjoerd.meijer@arm.com>
Wed, 26 Aug 2020 15:55:25 +0000 (16:55 +0100)
diff --git a/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h b/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h

index 7235aa5..46d1071 100644 (file)
--- a/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h
+++ b/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h
@@ -230,6 +230,7 @@ public:
  
    /// Return true if we can vectorize this loop while folding its tail by
    /// masking, and mark all respective loads/stores for masking.
+  /// This object's state is only modified iff this function returns true.
    bool prepareToFoldTailByMasking();
  
    /// Returns the primary induction variable.
@@ -370,8 +371,14 @@ private:
    /// its original trip-count, under a proper guard, which should be preserved.
    /// \p SafePtrs is a list of addresses that are known to be legal and we know
    /// that we can read from them without segfault.
+  /// \p MaskedOp is a list of instructions that have to be transformed into
+  /// calls to the appropriate masked intrinsic when the loop is vectorized.
+  /// \p ConditionalAssumes is a list of assume instructions in predicated
+  /// blocks that must be dropped if the CFG gets flattened.
    bool blockCanBePredicated(BasicBlock *BB, SmallPtrSetImpl<Value *> &SafePtrs,
-                            bool PreserveGuards = false);
+                            SmallPtrSetImpl<const Instruction *> &MaskedOp,
+                            SmallPtrSetImpl<Instruction *> &ConditionalAssumes,
+                            bool PreserveGuards = false) const;
  
    /// Updates the vectorization state by adding \p Phi to the inductions list.
    /// This can set \p Phi as the main induction of the loop if \p Phi is a
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp

index e2a95d6..157620c 100644 (file)
--- a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
@@ -919,7 +919,10 @@ bool LoopVectorizationLegality::blockNeedsPredication(BasicBlock *BB) {
  }
  
  bool LoopVectorizationLegality::blockCanBePredicated(
-    BasicBlock *BB, SmallPtrSetImpl<Value *> &SafePtrs, bool PreserveGuards) {
+    BasicBlock *BB, SmallPtrSetImpl<Value *> &SafePtrs,
+    SmallPtrSetImpl<const Instruction *> &MaskedOp,
+    SmallPtrSetImpl<Instruction *> &ConditionalAssumes,
+    bool PreserveGuards) const {
    const bool IsAnnotatedParallel = TheLoop->isAnnotatedParallel();
  
    for (Instruction &I : *BB) {
@@ -1026,7 +1029,8 @@ bool LoopVectorizationLegality::canVectorizeWithIfConvert() {
  
      // We must be able to predicate all blocks that need to be predicated.
      if (blockNeedsPredication(BB)) {
-      if (!blockCanBePredicated(BB, SafePointers)) {
+      if (!blockCanBePredicated(BB, SafePointers, MaskedOp,
+                                ConditionalAssumes)) {
          reportVectorizationFailure(
              "Control flow cannot be substituted for a select",
              "control flow cannot be substituted for a select",
@@ -1253,10 +1257,10 @@ bool LoopVectorizationLegality::prepareToFoldTailByMasking() {
        Instruction *UI = cast<Instruction>(U);
        if (TheLoop->contains(UI))
          continue;
-      reportVectorizationFailure(
-          "Cannot fold tail by masking, loop has an outside user for",
-          "Cannot fold tail by masking in the presence of live outs.",
-          "LiveOutFoldingTailByMasking", ORE, TheLoop, UI);
+      LLVM_DEBUG(
+          dbgs()
+          << "LV: Cannot fold tail by masking, loop has an outside user for "
+          << *UI << "\n");
        return false;
      }
    }
@@ -1264,20 +1268,26 @@ bool LoopVectorizationLegality::prepareToFoldTailByMasking() {
    // The list of pointers that we can safely read and write to remains empty.
    SmallPtrSet<Value *, 8> SafePointers;
  
+  SmallPtrSet<const Instruction *, 8> TmpMaskedOp;
+  SmallPtrSet<Instruction *, 8> TmpConditionalAssumes;
+
    // Check and mark all blocks for predication, including those that ordinarily
    // do not need predication such as the header block.
    for (BasicBlock *BB : TheLoop->blocks()) {
-    if (!blockCanBePredicated(BB, SafePointers, /* MaskAllLoads= */ true)) {
-      reportVectorizationFailure(
-          "Cannot fold tail by masking as required",
-          "control flow cannot be substituted for a select",
-          "NoCFGForSelect", ORE, TheLoop,
-          BB->getTerminator());
+    if (!blockCanBePredicated(BB, SafePointers, TmpMaskedOp,
+                              TmpConditionalAssumes,
+                              /* MaskAllLoads= */ true)) {
+      LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking as requested.\n");
        return false;
      }
    }
  
    LLVM_DEBUG(dbgs() << "LV: can fold tail by masking.\n");
+
+  MaskedOp.insert(TmpMaskedOp.begin(), TmpMaskedOp.end());
+  ConditionalAssumes.insert(TmpConditionalAssumes.begin(),
+                            TmpConditionalAssumes.end());
+
    return true;
  }
  
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

index 884b157..b76629c 100644 (file)
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -178,13 +178,36 @@ static cl::opt<unsigned> TinyTripCountVectorThreshold(
               "value are vectorized only if no scalar iteration overheads "
               "are incurred."));
  
-// Indicates that an epilogue is undesired, predication is preferred.
-// This means that the vectorizer will try to fold the loop-tail (epilogue)
-// into the loop and predicate the loop body accordingly.
-static cl::opt<bool> PreferPredicateOverEpilog(
-    "prefer-predicate-over-epilog", cl::init(false), cl::Hidden,
-    cl::desc("Indicate that an epilogue is undesired, predication should be "
-             "used instead."));
+// Option prefer-predicate-over-epilogue indicates that an epilogue is undesired,
+// that predication is preferred, and this lists all options. I.e., the
+// vectorizer will try to fold the tail-loop (epilogue) into the vector body
+// and predicate the instructions accordingly. If tail-folding fails, there are
+// different fallback strategies depending on these values:
+namespace PreferPredicateTy {
+  enum Option {
+    ScalarEpilogue = 0,
+    PredicateElseScalarEpilogue,
+    PredicateOrDontVectorize
+  };
+}
+
+static cl::opt<PreferPredicateTy::Option> PreferPredicateOverEpilogue(
+    "prefer-predicate-over-epilogue",
+    cl::init(PreferPredicateTy::ScalarEpilogue),
+    cl::Hidden,
+    cl::desc("Tail-folding and predication preferences over creating a scalar "
+             "epilogue loop."),
+    cl::values(clEnumValN(PreferPredicateTy::ScalarEpilogue,
+                         "scalar-epilogue",
+                         "Don't tail-predicate loops, create scalar epilogue"),
+              clEnumValN(PreferPredicateTy::PredicateElseScalarEpilogue,
+                         "predicate-else-scalar-epilogue",
+                         "prefer tail-folding, create scalar epilogue if tail "
+                         "folding fails."),
+              clEnumValN(PreferPredicateTy::PredicateOrDontVectorize,
+                         "predicate-dont-vectorize",
+                         "prefers tail-folding, don't attempt vectorization if "
+                         "tail-folding fails.")));
  
  static cl::opt<bool> MaximizeBandwidth(
      "vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden,
@@ -196,7 +219,7 @@ static cl::opt<bool> EnableInterleavedMemAccesses(
      cl::desc("Enable vectorization on interleaved memory accesses in a loop"));
  
  /// An interleave-group may need masking if it resides in a block that needs
-/// predication, or in order to mask away gaps. 
+/// predication, or in order to mask away gaps.
  static cl::opt<bool> EnableMaskedInterleavedMemAccesses(
      "enable-masked-interleaved-mem-accesses", cl::init(false), cl::Hidden,
      cl::desc("Enable vectorization on masked interleaved memory accesses in a loop"));
@@ -5241,6 +5264,19 @@ Optional<unsigned> LoopVectorizationCostModel::computeMaxVF(unsigned UserVF,
      return MaxVF;
    }
  
+  // If there was a tail-folding hint/switch, but we can't fold the tail by
+  // masking, fallback to a vectorization with a scalar epilogue.
+  if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) {
+    if (PreferPredicateOverEpilogue == PreferPredicateTy::PredicateOrDontVectorize) {
+      LLVM_DEBUG(dbgs() << "LV: Can't fold tail by masking: don't vectorize\n");
+      return None;
+    }
+    LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a "
+                         "scalar epilogue instead.\n");
+    ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
+    return MaxVF;
+  }
+
    if (TC == 0) {
      reportVectorizationFailure(
          "Unable to calculate the loop count due to complex control flow",
@@ -8055,8 +8091,8 @@ static ScalarEpilogueLowering getScalarEpilogueLowering(
                            Hints.getForce() != LoopVectorizeHints::FK_Enabled))
      return CM_ScalarEpilogueNotAllowedOptSize;
  
-  bool PredicateOptDisabled = PreferPredicateOverEpilog.getNumOccurrences() &&
-                              !PreferPredicateOverEpilog;
+  bool PredicateOptDisabled = PreferPredicateOverEpilogue.getNumOccurrences() &&
+                              !PreferPredicateOverEpilogue;
  
    // 2) Next, if disabling predication is requested on the command line, honour
    // this and request a scalar epilogue.
@@ -8065,8 +8101,8 @@ static ScalarEpilogueLowering getScalarEpilogueLowering(
  
    // 3) and 4) look if enabling predication is requested on the command line,
    // with a loop hint, or if the TTI hook indicates this is profitable, request
-  // predication .
-  if (PreferPredicateOverEpilog ||
+  // predication.
+  if (PreferPredicateOverEpilogue ||
        Hints.getPredicate() == LoopVectorizeHints::FK_Enabled ||
        (TTI->preferPredicateOverEpilogue(L, LI, *SE, *AC, TLI, DT,
                                          LVL.getLAI()) &&
diff --git a/llvm/test/Transforms/LoopVectorize/ARM/prefer-tail-loop-folding.ll b/llvm/test/Transforms/LoopVectorize/ARM/prefer-tail-loop-folding.ll

index f4961e4..dc93a13 100644 (file)
--- a/llvm/test/Transforms/LoopVectorize/ARM/prefer-tail-loop-folding.ll
+++ b/llvm/test/Transforms/LoopVectorize/ARM/prefer-tail-loop-folding.ll
@@ -31,13 +31,13 @@
  ; RUN:   FileCheck %s -check-prefixes=CHECK,PREFER-FOLDING
  
  ; RUN: opt -mtriple=thumbv8.1m.main-arm-eabihf -mattr=+mve.fp \
-; RUN:   -prefer-predicate-over-epilog=false \
+; RUN:   -prefer-predicate-over-epilogue=scalar-epilogue \
  ; RUN:   -tail-predication=enabled -loop-vectorize \
  ; RUN:   -enable-arm-maskedldst=true -S < %s | \
  ; RUN:   FileCheck %s -check-prefixes=CHECK,NO-FOLDING
  
  ; RUN: opt -mtriple=thumbv8.1m.main-arm-eabihf -mattr=+mve.fp \
-; RUN:   -prefer-predicate-over-epilog=true \
+; RUN:   -prefer-predicate-over-epilogue=predicate-dont-vectorize \
  ; RUN:   -tail-predication=enabled -loop-vectorize \
  ; RUN:   -enable-arm-maskedldst=true -S < %s | \
  ; RUN:   FileCheck %s -check-prefixes=CHECK,FOLDING-OPT
diff --git a/llvm/test/Transforms/LoopVectorize/ARM/tail-folding-counting-down.ll b/llvm/test/Transforms/LoopVectorize/ARM/tail-folding-counting-down.ll

index 493c540..cc260b0 100644 (file)
--- a/llvm/test/Transforms/LoopVectorize/ARM/tail-folding-counting-down.ll
+++ b/llvm/test/Transforms/LoopVectorize/ARM/tail-folding-counting-down.ll
@@ -1,5 +1,6 @@
  ; RUN: opt < %s -loop-vectorize -S | FileCheck %s --check-prefixes=COMMON,DEFAULT
-; RUN: opt < %s -loop-vectorize -tail-predication=enabled -prefer-predicate-over-epilog -S | FileCheck %s --check-prefixes=COMMON,CHECK-TF,CHECK-PREFER
+; RUN: opt < %s -loop-vectorize -tail-predication=enabled -prefer-predicate-over-epilogue=predicate-dont-vectorize -S | FileCheck %s --check-prefixes=COMMON,CHECK-TF,CHECK-PREFER
+; RUN: opt < %s -loop-vectorize -tail-predication=enabled -prefer-predicate-over-epilogue=predicate-else-scalar-epilogue -S | FileCheck %s --check-prefixes=COMMON,CHECK-TF,CHECK-PREFER
  ; RUN: opt < %s -loop-vectorize -tail-predication=enabled -S | FileCheck %s --check-prefixes=COMMON,CHECK-TF,CHECK-ENABLE-TP
  
  target datalayout = "e-m:e-p:32:32-Fi8-i64:64-v128:64:128-a:0:32-n32-S64"
diff --git a/llvm/test/Transforms/LoopVectorize/ARM/tail-folding-prefer-flag.ll b/llvm/test/Transforms/LoopVectorize/ARM/tail-folding-prefer-flag.ll

index f6eacf2..605d82c 100644 (file)
--- a/llvm/test/Transforms/LoopVectorize/ARM/tail-folding-prefer-flag.ll
+++ b/llvm/test/Transforms/LoopVectorize/ARM/tail-folding-prefer-flag.ll
@@ -1,8 +1,9 @@
  ; RUN: opt -mtriple=thumbv8.1m.main-arm-eabihf -mattr=+mve.fp -loop-vectorize -tail-predication=enabled -S < %s | \
  ; RUN:  FileCheck %s -check-prefix=CHECK
  
-; RUN: opt -mtriple=thumbv8.1m.main-arm-eabihf -mattr=+mve.fp -loop-vectorize -tail-predication=enabled -prefer-predicate-over-epilog -S < %s | \
-; RUN:   FileCheck -check-prefix=PREDFLAG %s
+; RUN: opt -mtriple=thumbv8.1m.main-arm-eabihf -mattr=+mve.fp -loop-vectorize -tail-predication=enabled \
+; RUN:     -prefer-predicate-over-epilogue=predicate-dont-vectorize -S < %s | \
+; RUN:     FileCheck -check-prefix=PREDFLAG %s
  
  ; This test has a loop hint "predicate.predicate" set to false, so shouldn't
  ; get tail-folded, except with -prefer-predicate-over-epilog which then
diff --git a/llvm/test/Transforms/LoopVectorize/ARM/tail-folding-reduces-vf.ll b/llvm/test/Transforms/LoopVectorize/ARM/tail-folding-reduces-vf.ll

index 58eca2c..f08e648 100644 (file)
--- a/llvm/test/Transforms/LoopVectorize/ARM/tail-folding-reduces-vf.ll
+++ b/llvm/test/Transforms/LoopVectorize/ARM/tail-folding-reduces-vf.ll
@@ -1,5 +1,5 @@
  ; RUN: opt < %s -mattr=+mve,+mve.fp -loop-vectorize -S | FileCheck %s --check-prefixes=DEFAULT
-; RUN: opt < %s -mattr=+mve,+mve.fp -loop-vectorize -prefer-predicate-over-epilog -S | FileCheck %s --check-prefixes=TAILPRED
+; RUN: opt < %s -mattr=+mve,+mve.fp -loop-vectorize -prefer-predicate-over-epilogue=predicate-else-scalar-epilogue -S | FileCheck %s --check-prefixes=TAILPRED
  
  target datalayout = "e-m:e-p:32:32-Fi8-i64:64-v128:64:128-a:0:32-n32-S64"
  target triple = "thumbv8.1m.main-arm-none-eabi"
diff --git a/llvm/test/Transforms/LoopVectorize/ARM/tail-folding-scalar-epilogue-fallback.ll b/llvm/test/Transforms/LoopVectorize/ARM/tail-folding-scalar-epilogue-fallback.ll

new file mode 100644 (file)

index 0000000..7c24d75
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/ARM/tail-folding-scalar-epilogue-fallback.ll
@@ -0,0 +1,78 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -S -loop-vectorize -mattr=+armv8.1-m.main,+mve.fp -tail-predication=disabled< %s | FileCheck %s
+; RUN: opt -S -loop-vectorize -mattr=+armv8.1-m.main,+mve.fp -tail-predication=enabled < %s | FileCheck %s
+
+; This test should produce the same result (vectorized loop + scalar epilogue) with
+; default options and when MVE Tail Predication is enabled, as this loop's tail cannot be folded
+; by masking due to an outside user of %incdec.ptr in %end.
+
+target datalayout = "e-m:e-p:32:32-Fi8-i64:64-v128:64:128-a:0:32-n32-S64"
+target triple = "thumbv8.1m.main-arm-unknown-eabihf"
+
+define void @outside_user_blocks_tail_folding(i8* nocapture readonly %ptr, i32 %size, i8** %pos) {
+; CHECK-LABEL: @outside_user_blocks_tail_folding(
+; CHECK-NEXT:  header:
+; CHECK-NEXT:    [[PTR0:%.*]] = load i8*, i8** [[POS:%.*]], align 4
+; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[SIZE:%.*]], 16
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK:       vector.ph:
+; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i32 [[SIZE]], 16
+; CHECK-NEXT:    [[N_VEC:%.*]] = sub i32 [[SIZE]], [[N_MOD_VF]]
+; CHECK-NEXT:    [[IND_END:%.*]] = sub i32 [[SIZE]], [[N_VEC]]
+; CHECK-NEXT:    [[IND_END2:%.*]] = getelementptr i8, i8* [[PTR:%.*]], i32 [[N_VEC]]
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[OFFSET_IDX:%.*]] = sub i32 [[SIZE]], [[INDEX]]
+; CHECK-NEXT:    [[TMP0:%.*]] = add i32 [[OFFSET_IDX]], 0
+; CHECK-NEXT:    [[TMP1:%.*]] = add i32 [[INDEX]], 0
+; CHECK-NEXT:    [[NEXT_GEP:%.*]] = getelementptr i8, i8* [[PTR]], i32 [[TMP1]]
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i8, i8* [[NEXT_GEP]], i32 1
+; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i8, i8* [[TMP2]], i32 0
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast i8* [[TMP3]] to <16 x i8>*
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <16 x i8>, <16 x i8>* [[TMP4]], align 1
+; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr i8, i8* [[NEXT_GEP]], i32 0
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast i8* [[TMP5]] to <16 x i8>*
+; CHECK-NEXT:    store <16 x i8> [[WIDE_LOAD]], <16 x i8>* [[TMP6]], align 1
+; CHECK-NEXT:    [[INDEX_NEXT]] = add i32 [[INDEX]], 16
+; CHECK-NEXT:    [[TMP7:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !0
+; CHECK:       middle.block:
+; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i32 [[SIZE]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[CMP_N]], label [[END:%.*]], label [[SCALAR_PH]]
+; CHECK:       scalar.ph:
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i32 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[SIZE]], [[HEADER:%.*]] ]
+; CHECK-NEXT:    [[BC_RESUME_VAL1:%.*]] = phi i8* [ [[IND_END2]], [[MIDDLE_BLOCK]] ], [ [[PTR]], [[HEADER]] ]
+; CHECK-NEXT:    br label [[BODY:%.*]]
+; CHECK:       body:
+; CHECK-NEXT:    [[DEC66:%.*]] = phi i32 [ [[DEC:%.*]], [[BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
+; CHECK-NEXT:    [[BUFF:%.*]] = phi i8* [ [[INCDEC_PTR:%.*]], [[BODY]] ], [ [[BC_RESUME_VAL1]], [[SCALAR_PH]] ]
+; CHECK-NEXT:    [[INCDEC_PTR]] = getelementptr inbounds i8, i8* [[BUFF]], i32 1
+; CHECK-NEXT:    [[DEC]] = add nsw i32 [[DEC66]], -1
+; CHECK-NEXT:    [[TMP8:%.*]] = load i8, i8* [[INCDEC_PTR]], align 1
+; CHECK-NEXT:    store i8 [[TMP8]], i8* [[BUFF]], align 1
+; CHECK-NEXT:    [[TOBOOL11:%.*]] = icmp eq i32 [[DEC]], 0
+; CHECK-NEXT:    br i1 [[TOBOOL11]], label [[END]], label [[BODY]], !llvm.loop !2
+; CHECK:       end:
+; CHECK-NEXT:    [[INCDEC_PTR_LCSSA:%.*]] = phi i8* [ [[INCDEC_PTR]], [[BODY]] ], [ [[IND_END2]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    store i8* [[INCDEC_PTR_LCSSA]], i8** [[POS]], align 4
+; CHECK-NEXT:    ret void
+;
+header:
+  %ptr0 = load i8*, i8** %pos, align 4
+  br label %body
+
+body:
+  %dec66 = phi i32 [ %dec, %body ], [ %size, %header ]
+  %buff = phi i8* [ %incdec.ptr, %body ], [ %ptr, %header ]
+  %incdec.ptr = getelementptr inbounds i8, i8* %buff, i32 1
+  %dec = add nsw i32 %dec66, -1
+  %0 = load i8, i8* %incdec.ptr, align 1
+  store i8 %0, i8* %buff, align 1
+  %tobool11 = icmp eq i32 %dec, 0
+  br i1 %tobool11, label %end, label %body
+
+end:
+  store i8* %incdec.ptr, i8** %pos, align 4
+  ret void
+}
diff --git a/llvm/test/Transforms/LoopVectorize/Hexagon/invalidate-cm-after-invalidating-interleavegroups.ll b/llvm/test/Transforms/LoopVectorize/Hexagon/invalidate-cm-after-invalidating-interleavegroups.ll

index 564f5cc..62e7f80 100644 (file)
--- a/llvm/test/Transforms/LoopVectorize/Hexagon/invalidate-cm-after-invalidating-interleavegroups.ll
+++ b/llvm/test/Transforms/LoopVectorize/Hexagon/invalidate-cm-after-invalidating-interleavegroups.ll
@@ -1,4 +1,4 @@
-; RUN: opt -loop-vectorize -hexagon-autohvx=1 -force-vector-width=64 -prefer-predicate-over-epilog -S %s | FileCheck %s
+; RUN: opt -loop-vectorize -hexagon-autohvx=1 -force-vector-width=64 -prefer-predicate-over-epilogue=predicate-dont-vectorize -S %s | FileCheck %s
  
  target datalayout = "e-m:e-p:32:32:32-a:0-n16:32-i64:64:64-i32:32:32-i16:16:16-i1:8:8-f32:32:32-f64:64:64-v32:32:32-v64:64:64-v512:512:512-v1024:1024:1024-v2048:2048:2048"
  target triple = "hexagon"
diff --git a/llvm/test/Transforms/LoopVectorize/X86/tail_loop_folding.ll b/llvm/test/Transforms/LoopVectorize/X86/tail_loop_folding.ll

index 8852513..f405b87 100644 (file)
--- a/llvm/test/Transforms/LoopVectorize/X86/tail_loop_folding.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/tail_loop_folding.ll
@@ -1,6 +1,6 @@
  ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
  ; RUN: opt < %s -loop-vectorize -S | FileCheck %s
-; RUN: opt < %s -loop-vectorize -prefer-predicate-over-epilog -S | FileCheck %s
+; RUN: opt < %s -loop-vectorize -prefer-predicate-over-epilogue=predicate-dont-vectorize -S | FileCheck %s
  
  target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
  target triple = "x86_64-unknown-linux-gnu"
diff --git a/llvm/test/Transforms/LoopVectorize/memdep-fold-tail.ll b/llvm/test/Transforms/LoopVectorize/memdep-fold-tail.ll

index 4fe0d12..8e007a6 100644 (file)
--- a/llvm/test/Transforms/LoopVectorize/memdep-fold-tail.ll
+++ b/llvm/test/Transforms/LoopVectorize/memdep-fold-tail.ll
@@ -1,5 +1,5 @@
  ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt < %s -loop-vectorize -vectorize-num-stores-pred=2 -prefer-predicate-over-epilog -S | FileCheck %s
+; RUN: opt < %s -loop-vectorize -vectorize-num-stores-pred=2 -prefer-predicate-over-epilogue=predicate-dont-vectorize -S | FileCheck %s
  
  target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
  
diff --git a/llvm/test/Transforms/LoopVectorize/pr46525-expander-insertpoint.ll b/llvm/test/Transforms/LoopVectorize/pr46525-expander-insertpoint.ll

index 98f394e..f7acb2f 100644 (file)
--- a/llvm/test/Transforms/LoopVectorize/pr46525-expander-insertpoint.ll
+++ b/llvm/test/Transforms/LoopVectorize/pr46525-expander-insertpoint.ll
@@ -1,5 +1,5 @@
  ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt -loop-vectorize -force-vector-width=2 -S -prefer-predicate-over-epilog %s | FileCheck %s
+; RUN: opt -loop-vectorize -force-vector-width=2 -S -prefer-predicate-over-epilogue=predicate-dont-vectorize %s | FileCheck %s
  
  
  ; Test case for PR46525. There are two candidates to pick for
diff --git a/llvm/test/Transforms/LoopVectorize/reduction-predselect.ll b/llvm/test/Transforms/LoopVectorize/reduction-predselect.ll

index cb29395..b8a4d16 100644 (file)
--- a/llvm/test/Transforms/LoopVectorize/reduction-predselect.ll
+++ b/llvm/test/Transforms/LoopVectorize/reduction-predselect.ll
@@ -1,5 +1,5 @@
  ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt < %s -loop-vectorize -force-vector-width=4 -force-vector-interleave=1 -prefer-predicate-over-epilog -prefer-predicated-reduction-select -force-reduction-intrinsics -dce -instcombine -S | FileCheck %s
+; RUN: opt < %s -loop-vectorize -force-vector-width=4 -force-vector-interleave=1 -prefer-predicate-over-epilogue=predicate-else-scalar-epilogue -prefer-predicated-reduction-select -force-reduction-intrinsics -dce -instcombine -S | FileCheck %s
  
  target datalayout = "e-m:e-p:32:32-Fi8-i64:64-v128:64:128-a:0:32-n32-S64"
  
diff --git a/llvm/test/Transforms/LoopVectorize/tail-folding-counting-down.ll b/llvm/test/Transforms/LoopVectorize/tail-folding-counting-down.ll

index a090b15..4601903 100644 (file)
--- a/llvm/test/Transforms/LoopVectorize/tail-folding-counting-down.ll
+++ b/llvm/test/Transforms/LoopVectorize/tail-folding-counting-down.ll
@@ -1,4 +1,4 @@
-; RUN: opt < %s -loop-vectorize -prefer-predicate-over-epilog -force-vector-width=4 -S | FileCheck %s
+; RUN: opt < %s -loop-vectorize -prefer-predicate-over-epilogue=predicate-dont-vectorize -force-vector-width=4 -S | FileCheck %s
  
  ; Check that a counting-down loop which has no primary induction variable
  ; is vectorized with preferred predication.
diff --git a/llvm/test/Transforms/LoopVectorize/use-scalar-epilogue-if-tp-fails.ll b/llvm/test/Transforms/LoopVectorize/use-scalar-epilogue-if-tp-fails.ll

new file mode 100644 (file)

index 0000000..289f5b6
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/use-scalar-epilogue-if-tp-fails.ll
@@ -0,0 +1,153 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -S -loop-vectorize -prefer-predicate-over-epilogue=predicate-else-scalar-epilogue < %s | FileCheck %s
+; RUN: opt -S -loop-vectorize < %s | FileCheck %s
+
+; This tests should produce the same result as with default options, and when tail folding
+; is preferred, because the vectorizer can't fold the tail by masking (due to an
+; outside user of %incdec.ptr in %end) and should fallback to a scalar epilogue.
+;
+; The first test (@basic_loop) simply relies on the command-line switches.
+; The second test (@metadata) specificies its tail-folding preference via metadata.
+; Both tests should always generate a scalar epilogue.
+
+target datalayout = "e-m:e-p:32:32-Fi8-i64:64-v128:64:128-a:0:32-n32-S64"
+
+define void @basic_loop(i8* nocapture readonly %ptr, i32 %size, i8** %pos) {
+; CHECK-LABEL: @basic_loop(
+; CHECK-NEXT:  header:
+; CHECK-NEXT:    [[PTR0:%.*]] = load i8*, i8** [[POS:%.*]], align 4
+; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[SIZE:%.*]], 4
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK:       vector.ph:
+; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i32 [[SIZE]], 4
+; CHECK-NEXT:    [[N_VEC:%.*]] = sub i32 [[SIZE]], [[N_MOD_VF]]
+; CHECK-NEXT:    [[IND_END:%.*]] = sub i32 [[SIZE]], [[N_VEC]]
+; CHECK-NEXT:    [[IND_END2:%.*]] = getelementptr i8, i8* [[PTR:%.*]], i32 [[N_VEC]]
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[OFFSET_IDX:%.*]] = sub i32 [[SIZE]], [[INDEX]]
+; CHECK-NEXT:    [[TMP0:%.*]] = add i32 [[OFFSET_IDX]], 0
+; CHECK-NEXT:    [[TMP1:%.*]] = add i32 [[INDEX]], 0
+; CHECK-NEXT:    [[NEXT_GEP:%.*]] = getelementptr i8, i8* [[PTR]], i32 [[TMP1]]
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i8, i8* [[NEXT_GEP]], i32 1
+; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i8, i8* [[TMP2]], i32 0
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast i8* [[TMP3]] to <4 x i8>*
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i8>, <4 x i8>* [[TMP4]], align 1
+; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr i8, i8* [[NEXT_GEP]], i32 0
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast i8* [[TMP5]] to <4 x i8>*
+; CHECK-NEXT:    store <4 x i8> [[WIDE_LOAD]], <4 x i8>* [[TMP6]], align 1
+; CHECK-NEXT:    [[INDEX_NEXT]] = add i32 [[INDEX]], 4
+; CHECK-NEXT:    [[TMP7:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !0
+; CHECK:       middle.block:
+; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i32 [[SIZE]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[CMP_N]], label [[END:%.*]], label [[SCALAR_PH]]
+; CHECK:       scalar.ph:
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i32 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[SIZE]], [[HEADER:%.*]] ]
+; CHECK-NEXT:    [[BC_RESUME_VAL1:%.*]] = phi i8* [ [[IND_END2]], [[MIDDLE_BLOCK]] ], [ [[PTR]], [[HEADER]] ]
+; CHECK-NEXT:    br label [[BODY:%.*]]
+; CHECK:       body:
+; CHECK-NEXT:    [[DEC66:%.*]] = phi i32 [ [[DEC:%.*]], [[BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
+; CHECK-NEXT:    [[BUFF:%.*]] = phi i8* [ [[INCDEC_PTR:%.*]], [[BODY]] ], [ [[BC_RESUME_VAL1]], [[SCALAR_PH]] ]
+; CHECK-NEXT:    [[INCDEC_PTR]] = getelementptr inbounds i8, i8* [[BUFF]], i32 1
+; CHECK-NEXT:    [[DEC]] = add nsw i32 [[DEC66]], -1
+; CHECK-NEXT:    [[TMP8:%.*]] = load i8, i8* [[INCDEC_PTR]], align 1
+; CHECK-NEXT:    store i8 [[TMP8]], i8* [[BUFF]], align 1
+; CHECK-NEXT:    [[TOBOOL11:%.*]] = icmp eq i32 [[DEC]], 0
+; CHECK-NEXT:    br i1 [[TOBOOL11]], label [[END]], label [[BODY]], !llvm.loop !2
+; CHECK:       end:
+; CHECK-NEXT:    [[INCDEC_PTR_LCSSA:%.*]] = phi i8* [ [[INCDEC_PTR]], [[BODY]] ], [ [[IND_END2]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    store i8* [[INCDEC_PTR_LCSSA]], i8** [[POS]], align 4
+; CHECK-NEXT:    ret void
+;
+header:
+  %ptr0 = load i8*, i8** %pos, align 4
+  br label %body
+
+body:
+  %dec66 = phi i32 [ %dec, %body ], [ %size, %header ]
+  %buff = phi i8* [ %incdec.ptr, %body ], [ %ptr, %header ]
+  %incdec.ptr = getelementptr inbounds i8, i8* %buff, i32 1
+  %dec = add nsw i32 %dec66, -1
+  %0 = load i8, i8* %incdec.ptr, align 1
+  store i8 %0, i8* %buff, align 1
+  %tobool11 = icmp eq i32 %dec, 0
+  br i1 %tobool11, label %end, label %body
+
+end:
+  store i8* %incdec.ptr, i8** %pos, align 4
+  ret void
+}
+
+define void @metadata(i8* nocapture readonly %ptr, i32 %size, i8** %pos) {
+; CHECK-LABEL: @metadata(
+; CHECK-NEXT:  header:
+; CHECK-NEXT:    [[PTR0:%.*]] = load i8*, i8** [[POS:%.*]], align 4
+; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[SIZE:%.*]], 4
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK:       vector.ph:
+; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i32 [[SIZE]], 4
+; CHECK-NEXT:    [[N_VEC:%.*]] = sub i32 [[SIZE]], [[N_MOD_VF]]
+; CHECK-NEXT:    [[IND_END:%.*]] = sub i32 [[SIZE]], [[N_VEC]]
+; CHECK-NEXT:    [[IND_END2:%.*]] = getelementptr i8, i8* [[PTR:%.*]], i32 [[N_VEC]]
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[OFFSET_IDX:%.*]] = sub i32 [[SIZE]], [[INDEX]]
+; CHECK-NEXT:    [[TMP0:%.*]] = add i32 [[OFFSET_IDX]], 0
+; CHECK-NEXT:    [[TMP1:%.*]] = add i32 [[INDEX]], 0
+; CHECK-NEXT:    [[NEXT_GEP:%.*]] = getelementptr i8, i8* [[PTR]], i32 [[TMP1]]
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i8, i8* [[NEXT_GEP]], i32 1
+; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i8, i8* [[TMP2]], i32 0
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast i8* [[TMP3]] to <4 x i8>*
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i8>, <4 x i8>* [[TMP4]], align 1
+; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr i8, i8* [[NEXT_GEP]], i32 0
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast i8* [[TMP5]] to <4 x i8>*
+; CHECK-NEXT:    store <4 x i8> [[WIDE_LOAD]], <4 x i8>* [[TMP6]], align 1
+; CHECK-NEXT:    [[INDEX_NEXT]] = add i32 [[INDEX]], 4
+; CHECK-NEXT:    [[TMP7:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !4
+; CHECK:       middle.block:
+; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i32 [[SIZE]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[CMP_N]], label [[END:%.*]], label [[SCALAR_PH]]
+; CHECK:       scalar.ph:
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i32 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[SIZE]], [[HEADER:%.*]] ]
+; CHECK-NEXT:    [[BC_RESUME_VAL1:%.*]] = phi i8* [ [[IND_END2]], [[MIDDLE_BLOCK]] ], [ [[PTR]], [[HEADER]] ]
+; CHECK-NEXT:    br label [[BODY:%.*]]
+; CHECK:       body:
+; CHECK-NEXT:    [[DEC66:%.*]] = phi i32 [ [[DEC:%.*]], [[BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
+; CHECK-NEXT:    [[BUFF:%.*]] = phi i8* [ [[INCDEC_PTR:%.*]], [[BODY]] ], [ [[BC_RESUME_VAL1]], [[SCALAR_PH]] ]
+; CHECK-NEXT:    [[INCDEC_PTR]] = getelementptr inbounds i8, i8* [[BUFF]], i32 1
+; CHECK-NEXT:    [[DEC]] = add nsw i32 [[DEC66]], -1
+; CHECK-NEXT:    [[TMP8:%.*]] = load i8, i8* [[INCDEC_PTR]], align 1
+; CHECK-NEXT:    store i8 [[TMP8]], i8* [[BUFF]], align 1
+; CHECK-NEXT:    [[TOBOOL11:%.*]] = icmp eq i32 [[DEC]], 0
+; CHECK-NEXT:    br i1 [[TOBOOL11]], label [[END]], label [[BODY]], !llvm.loop !5
+; CHECK:       end:
+; CHECK-NEXT:    [[INCDEC_PTR_LCSSA:%.*]] = phi i8* [ [[INCDEC_PTR]], [[BODY]] ], [ [[IND_END2]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    store i8* [[INCDEC_PTR_LCSSA]], i8** [[POS]], align 4
+; CHECK-NEXT:    ret void
+;
+header:
+  %ptr0 = load i8*, i8** %pos, align 4
+  br label %body
+
+body:
+  %dec66 = phi i32 [ %dec, %body ], [ %size, %header ]
+  %buff = phi i8* [ %incdec.ptr, %body ], [ %ptr, %header ]
+  %incdec.ptr = getelementptr inbounds i8, i8* %buff, i32 1
+  %dec = add nsw i32 %dec66, -1
+  %0 = load i8, i8* %incdec.ptr, align 1
+  store i8 %0, i8* %buff, align 1
+  %tobool11 = icmp eq i32 %dec, 0
+  br i1 %tobool11, label %end, label %body, !llvm.loop !1
+
+end:
+  store i8* %incdec.ptr, i8** %pos, align 4
+  ret void
+}
+
+!1 = distinct !{!1, !2, !3}
+!2 = !{!"llvm.loop.vectorize.predicate.enable", i1 true}
+!3 = !{!"llvm.loop.vectorize.enable", i1 true}
author	Sjoerd Meijer <sjoerd.meijer@arm.com>
	Wed, 26 Aug 2020 15:55:25 +0000 (16:55 +0100)
committer	Sjoerd Meijer <sjoerd.meijer@arm.com>
	Wed, 26 Aug 2020 15:55:25 +0000 (16:55 +0100)
llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h		patch \| blob \| history
llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp		patch \| blob \| history
llvm/lib/Transforms/Vectorize/LoopVectorize.cpp		patch \| blob \| history
llvm/test/Transforms/LoopVectorize/ARM/prefer-tail-loop-folding.ll		patch \| blob \| history
llvm/test/Transforms/LoopVectorize/ARM/tail-folding-counting-down.ll		patch \| blob \| history
llvm/test/Transforms/LoopVectorize/ARM/tail-folding-prefer-flag.ll		patch \| blob \| history
llvm/test/Transforms/LoopVectorize/ARM/tail-folding-reduces-vf.ll		patch \| blob \| history
llvm/test/Transforms/LoopVectorize/ARM/tail-folding-scalar-epilogue-fallback.ll	[new file with mode: 0644]	patch \| blob
llvm/test/Transforms/LoopVectorize/Hexagon/invalidate-cm-after-invalidating-interleavegroups.ll		patch \| blob \| history
llvm/test/Transforms/LoopVectorize/X86/tail_loop_folding.ll		patch \| blob \| history
llvm/test/Transforms/LoopVectorize/memdep-fold-tail.ll		patch \| blob \| history
llvm/test/Transforms/LoopVectorize/pr46525-expander-insertpoint.ll		patch \| blob \| history
llvm/test/Transforms/LoopVectorize/reduction-predselect.ll		patch \| blob \| history
llvm/test/Transforms/LoopVectorize/tail-folding-counting-down.ll		patch \| blob \| history
llvm/test/Transforms/LoopVectorize/use-scalar-epilogue-if-tp-fails.ll	[new file with mode: 0644]	patch \| blob