[LV] Epilogue Vectorization with Optimal Control Flow

author Bardia Mahjour <bmahjour@ca.ibm.com>

Tue, 1 Dec 2020 16:57:16 +0000 (11:57 -0500)

committer Bardia Mahjour <bmahjour@ca.ibm.com>

Tue, 1 Dec 2020 17:04:29 +0000 (12:04 -0500)
author Bardia Mahjour <bmahjour@ca.ibm.com>
Tue, 1 Dec 2020 16:57:16 +0000 (11:57 -0500)
committer Bardia Mahjour <bmahjour@ca.ibm.com>
Tue, 1 Dec 2020 17:04:29 +0000 (12:04 -0500)
diff --git a/llvm/docs/Vectorizers.rst b/llvm/docs/Vectorizers.rst

index 702090447c69292bb25fb7b0539845652dab2605..4ea3246c65f68ec35aeeba627e0845de949ae0f2 100644 (file)
--- a/llvm/docs/Vectorizers.rst
+++ b/llvm/docs/Vectorizers.rst
@@ -370,6 +370,25 @@ to be used simultaneously.
  The Loop Vectorizer uses a cost model to decide when it is profitable to unroll loops.
  The decision to unroll the loop depends on the register pressure and the generated code size. 
  
+Epilogue Vectorization
+^^^^^^^^^^^^^^^^^^^^^^
+
+When vectorizing a loop, often a scalar remainder (epilogue) loop is necessary
+to execute tail iterations of the loop if the loop trip count is unknown or it
+does not evenly divide the vectorization and unroll factors. When the
+vectorization and unroll factors are large, it's possible for loops with smaller
+trip counts to end up spending most of their time in the scalar (rather than
+the vector) code. In order to address this issue, the inner loop vectorizer is
+enhanced with a feature that allows it to vectorize epilogue loops with a
+vectorization and unroll factor combination that makes it more likely for small
+trip count loops to still execute in vectorized code. The diagram below shows
+the CFG for a typical epilogue vectorized loop with runtime checks. As
+illustrated the control flow is structured in a way that avoids duplicating the
+runtime pointer checks and optimizes the path length for loops that have very
+small trip counts.
+
+.. image:: epilogue-vectorization-cfg.png
+
  Performance
  -----------
  
diff --git a/llvm/docs/epilogue-vectorization-cfg.png b/llvm/docs/epilogue-vectorization-cfg.png

new file mode 100644 (file)

index 0000000..1dbb269

Binary files /dev/null and b/llvm/docs/epilogue-vectorization-cfg.png differ
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h

index ec88bebe684d7276bf29eb7b8a87bb98aa5f1219..a61a1454aee1d1b36420d20ae6f19ff32b53585b 100644 (file)
--- a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
@@ -184,6 +184,10 @@ struct VectorizationFactor {
    bool operator==(const VectorizationFactor &rhs) const {
      return Width == rhs.Width && Cost == rhs.Cost;
    }
+
+  bool operator!=(const VectorizationFactor &rhs) const {
+    return !(*this == rhs);
+  }
  };
  
  /// Planner drives the vectorization process after having passed
@@ -265,6 +269,18 @@ public:
        O << *Plan;
    }
  
+  /// Look through the existing plans and return true if we have one with all
+  /// the vectorization factors in question.
+  bool hasPlanWithVFs(const ArrayRef<ElementCount> VFs) const {
+    return any_of(VPlans, [&](const VPlanPtr &Plan) {
+      return all_of(VFs, [&](const ElementCount &VF) {
+        if (Plan->hasVF(VF))
+          return true;
+        return false;
+      });
+    });
+  }
+
    /// Test a \p Predicate on a \p Range of VF's. Return the value of applying
    /// \p Predicate on Range.Start, possibly decreasing Range.End such that the
    /// returned value holds for the entire \p Range.
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

index d87938bb14640d7bbfa4d9321afc1a7c443d228b..347d030ef28eb458d2b35fba06caab60753a9e2c 100644 (file)
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -156,6 +156,7 @@ using namespace llvm;
  
  #define LV_NAME "loop-vectorize"
  #define DEBUG_TYPE LV_NAME
+static const char *VerboseDebug = DEBUG_TYPE "-verbose";
  
  /// @{
  /// Metadata attribute names
@@ -169,6 +170,22 @@ static const char *const LLVMLoopVectorizeFollowupEpilogue =
  
  STATISTIC(LoopsVectorized, "Number of loops vectorized");
  STATISTIC(LoopsAnalyzed, "Number of loops analyzed for vectorization");
+STATISTIC(LoopsEpilogueVectorized, "Number of epilogues vectorized");
+
+static cl::opt<bool> EnableEpilogueVectorization(
+    "enable-epilogue-vectorization", cl::init(false), cl::Hidden,
+    cl::desc("Enable vectorization of epilogue loops."));
+
+static cl::opt<unsigned> EpilogueVectorizationForceVF(
+    "epilogue-vectorization-force-VF", cl::init(1), cl::Hidden,
+    cl::desc("When epilogue vectorization is enabled, and a value greater than "
+             "1 is specified, forces the given VF for all applicable epilogue "
+             "loops."));
+
+static cl::opt<unsigned> EpilogueVectorizationMinVF(
+    "epilogue-vectorization-minimum-VF", cl::init(16), cl::Hidden,
+    cl::desc("Only loops with vectorization factor equal to or larger than "
+             "the specified value are considered for epilogue vectorization."));
  
  /// Loops with a known constant trip count below this number are vectorized only
  /// if no scalar iteration overheads are incurred.
@@ -458,7 +475,9 @@ public:
    /// is generated around the vectorized (and scalar epilogue) loops consisting
    /// of various checks and bypasses. Return the pre-header block of the new
    /// loop.
-  BasicBlock *createVectorizedLoopSkeleton();
+  /// In the case of epilogue vectorization, this function is overriden to
+  /// handle the more complex control flow around the loops.
+  virtual BasicBlock *createVectorizedLoopSkeleton();
  
    /// Widen a single instruction within the innermost loop.
    void widenInstruction(Instruction &I, VPValue *Def, VPUser &Operands,
@@ -721,7 +740,13 @@ protected:
    /// Create new phi nodes for the induction variables to resume iteration count
    /// in the scalar epilogue, from where the vectorized loop left off (given by
    /// \p VectorTripCount).
-  void createInductionResumeValues(Loop *L, Value *VectorTripCount);
+  /// In cases where the loop skeleton is more complicated (eg. epilogue
+  /// vectorization) and the resume values can come from an additional bypass
+  /// block, the \p AdditionalBypass pair provides information about the bypass
+  /// block and the end value on the edge from bypass to this loop.
+  void createInductionResumeValues(
+      Loop *L, Value *VectorTripCount,
+      std::pair<BasicBlock *, Value *> AdditionalBypass = {nullptr, nullptr});
  
    /// Complete the loop skeleton by adding debug MDs, creating appropriate
    /// conditional branches in the middle block, preparing the builder and
@@ -747,6 +772,11 @@ protected:
    /// vector of instructions.
    void addMetadata(ArrayRef<Value *> To, Instruction *From);
  
+  /// Allow subclasses to override and print debug traces before/after vplan
+  /// execution, when trace information is requested.
+  virtual void printDebugTracesAtStart(){};
+  virtual void printDebugTracesAtEnd(){};
+
    /// The original loop.
    Loop *OrigLoop;
  
@@ -886,6 +916,128 @@ private:
    Value *reverseVector(Value *Vec) override;
  };
  
+/// Encapsulate information regarding vectorization of a loop and its epilogue.
+/// This information is meant to be updated and used across two stages of
+/// epilogue vectorization.
+struct EpilogueLoopVectorizationInfo {
+  ElementCount MainLoopVF = ElementCount::getFixed(0);
+  unsigned MainLoopUF = 0;
+  ElementCount EpilogueVF = ElementCount::getFixed(0);
+  unsigned EpilogueUF = 0;
+  BasicBlock *MainLoopIterationCountCheck = nullptr;
+  BasicBlock *EpilogueIterationCountCheck = nullptr;
+  BasicBlock *SCEVSafetyCheck = nullptr;
+  BasicBlock *MemSafetyCheck = nullptr;
+  Value *TripCount = nullptr;
+  Value *VectorTripCount = nullptr;
+
+  EpilogueLoopVectorizationInfo(unsigned MVF, unsigned MUF, unsigned EVF,
+                                unsigned EUF)
+      : MainLoopVF(ElementCount::getFixed(MVF)), MainLoopUF(MUF),
+        EpilogueVF(ElementCount::getFixed(EVF)), EpilogueUF(EUF) {
+    assert(EUF == 1 &&
+           "A high UF for the epilogue loop is likely not beneficial.");
+  }
+};
+
+/// An extension of the inner loop vectorizer that creates a skeleton for a
+/// vectorized loop that has its epilogue (residual) also vectorized.
+/// The idea is to run the vplan on a given loop twice, firstly to setup the
+/// skeleton and vectorize the main loop, and secondly to complete the skeleton
+/// from the first step and vectorize the epilogue.  This is achieved by
+/// deriving two concrete strategy classes from this base class and invoking
+/// them in succession from the loop vectorizer planner.
+class InnerLoopAndEpilogueVectorizer : public InnerLoopVectorizer {
+public:
+  InnerLoopAndEpilogueVectorizer(
+      Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
+      DominatorTree *DT, const TargetLibraryInfo *TLI,
+      const TargetTransformInfo *TTI, AssumptionCache *AC,
+      OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
+      LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
+      BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI)
+      : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
+                            EPI.MainLoopVF, EPI.MainLoopUF, LVL, CM, BFI, PSI),
+        EPI(EPI) {}
+
+  // Override this function to handle the more complex control flow around the
+  // three loops.
+  BasicBlock *createVectorizedLoopSkeleton() final override {
+    return createEpilogueVectorizedLoopSkeleton();
+  }
+
+  /// The interface for creating a vectorized skeleton using one of two
+  /// different strategies, each corresponding to one execution of the vplan
+  /// as described above.
+  virtual BasicBlock *createEpilogueVectorizedLoopSkeleton() = 0;
+
+  /// Holds and updates state information required to vectorize the main loop
+  /// and its epilogue in two separate passes. This setup helps us avoid
+  /// regenerating and recomputing runtime safety checks. It also helps us to
+  /// shorten the iteration-count-check path length for the cases where the
+  /// iteration count of the loop is so small that the main vector loop is
+  /// completely skipped.
+  EpilogueLoopVectorizationInfo &EPI;
+};
+
+/// A specialized derived class of inner loop vectorizer that performs
+/// vectorization of *main* loops in the process of vectorizing loops and their
+/// epilogues.
+class EpilogueVectorizerMainLoop : public InnerLoopAndEpilogueVectorizer {
+public:
+  EpilogueVectorizerMainLoop(
+      Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
+      DominatorTree *DT, const TargetLibraryInfo *TLI,
+      const TargetTransformInfo *TTI, AssumptionCache *AC,
+      OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
+      LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
+      BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI)
+      : InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
+                                       EPI, LVL, CM, BFI, PSI) {}
+  /// Implements the interface for creating a vectorized skeleton using the
+  /// *main loop* strategy (ie the first pass of vplan execution).
+  BasicBlock *createEpilogueVectorizedLoopSkeleton() final override;
+
+protected:
+  /// Emits an iteration count bypass check once for the main loop (when \p
+  /// ForEpilogue is false) and once for the epilogue loop (when \p
+  /// ForEpilogue is true).
+  BasicBlock *emitMinimumIterationCountCheck(Loop *L, BasicBlock *Bypass,
+                                             bool ForEpilogue);
+  void printDebugTracesAtStart() override;
+  void printDebugTracesAtEnd() override;
+};
+
+// A specialized derived class of inner loop vectorizer that performs
+// vectorization of *epilogue* loops in the process of vectorizing loops and
+// their epilogues.
+class EpilogueVectorizerEpilogueLoop : public InnerLoopAndEpilogueVectorizer {
+public:
+  EpilogueVectorizerEpilogueLoop(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
+                    LoopInfo *LI, DominatorTree *DT,
+                    const TargetLibraryInfo *TLI,
+                    const TargetTransformInfo *TTI, AssumptionCache *AC,
+                    OptimizationRemarkEmitter *ORE,
+                    EpilogueLoopVectorizationInfo &EPI,
+                    LoopVectorizationLegality *LVL,
+                    llvm::LoopVectorizationCostModel *CM,
+                    BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI)
+      : InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
+                                       EPI, LVL, CM, BFI, PSI) {}
+  /// Implements the interface for creating a vectorized skeleton using the
+  /// *epilogue loop* strategy (ie the second pass of vplan execution).
+  BasicBlock *createEpilogueVectorizedLoopSkeleton() final override;
+
+protected:
+  /// Emits an iteration count bypass check after the main vector loop has
+  /// finished to see if there are any iterations left to execute by either
+  /// the vector epilogue or the scalar epilogue.
+  BasicBlock *emitMinimumVectorEpilogueIterCountCheck(Loop *L,
+                                                      BasicBlock *Bypass,
+                                                      BasicBlock *Insert);
+  void printDebugTracesAtStart() override;
+  void printDebugTracesAtEnd() override;
+};
  } // end namespace llvm
  
  /// Look for a meaningful debug location on the instruction or it's
@@ -1077,6 +1229,9 @@ public:
    /// then this vectorization factor will be selected if vectorization is
    /// possible.
    VectorizationFactor selectVectorizationFactor(ElementCount MaxVF);
+  VectorizationFactor
+  selectEpilogueVectorizationFactor(const ElementCount MaxVF,
+                                    const LoopVectorizationPlanner &LVP);
  
    /// Setup cost-based decisions for user vectorization factor.
    void selectUserVectorizationFactor(ElementCount UserVF) {
@@ -1610,6 +1765,16 @@ private:
          Ops, [this, VF](Value *V) { return this->needsExtract(V, VF); }));
    }
  
+  /// Determines if we have the infrastructure to vectorize loop \p L and its
+  /// epilogue, assuming the main loop is vectorized by \p VF.
+  bool isCandidateForEpilogueVectorization(const Loop &L,
+                                           const ElementCount VF) const;
+
+  /// Returns true if epilogue vectorization is considered profitable, and
+  /// false otherwise.
+  /// \p VF is the vectorization factor chosen for the original loop.
+  bool isEpilogueVectorizationProfitable(const ElementCount VF) const;
+
  public:
    /// The loop that we evaluate.
    Loop *TheLoop;
@@ -1652,6 +1817,9 @@ public:
  
    /// Values to ignore in the cost model when VF > 1.
    SmallPtrSet<const Value *, 16> VecValuesToIgnore;
+
+  /// Profitable vector factors.
+  SmallVector<VectorizationFactor, 8> ProfitableVFs;
  };
  
  } // end namespace llvm
@@ -3139,9 +3307,13 @@ Loop *InnerLoopVectorizer::createVectorLoopSkeleton(StringRef Prefix) {
    return Lp;
  }
  
-void InnerLoopVectorizer::createInductionResumeValues(Loop *L,
-                                                      Value *VectorTripCount) {
+void InnerLoopVectorizer::createInductionResumeValues(
+    Loop *L, Value *VectorTripCount,
+    std::pair<BasicBlock *, Value *> AdditionalBypass) {
    assert(VectorTripCount && L && "Expected valid arguments");
+  assert(((AdditionalBypass.first && AdditionalBypass.second) ||
+          (!AdditionalBypass.first && !AdditionalBypass.second)) &&
+         "Inconsistent information about additional bypass.");
    // We are going to resume the execution of the scalar loop.
    // Go over all of the induction variables that we found and fix the
    // PHIs that are left in the scalar version of the loop.
@@ -3160,6 +3332,7 @@ void InnerLoopVectorizer::createInductionResumeValues(Loop *L,
      // Copy original phi DL over to the new one.
      BCResumeVal->setDebugLoc(OrigPhi->getDebugLoc());
      Value *&EndValue = IVEndValues[OrigPhi];
+    Value *EndValueFromAdditionalBypass = AdditionalBypass.second;
      if (OrigPhi == OldInduction) {
        // We know what the end value is.
        EndValue = VectorTripCount;
@@ -3172,8 +3345,19 @@ void InnerLoopVectorizer::createInductionResumeValues(Loop *L,
        const DataLayout &DL = LoopScalarBody->getModule()->getDataLayout();
        EndValue = emitTransformedIndex(B, CRD, PSE.getSE(), DL, II);
        EndValue->setName("ind.end");
-    }
  
+      // Compute the end value for the additional bypass (if applicable).
+      if (AdditionalBypass.first) {
+        B.SetInsertPoint(&(*AdditionalBypass.first->getFirstInsertionPt()));
+        CastOp = CastInst::getCastOpcode(AdditionalBypass.second, true,
+                                         StepType, true);
+        CRD =
+            B.CreateCast(CastOp, AdditionalBypass.second, StepType, "cast.crd");
+        EndValueFromAdditionalBypass =
+            emitTransformedIndex(B, CRD, PSE.getSE(), DL, II);
+        EndValueFromAdditionalBypass->setName("ind.end");
+      }
+    }
      // The new PHI merges the original incoming value, in case of a bypass,
      // or the value at the end of the vectorized loop.
      BCResumeVal->addIncoming(EndValue, LoopMiddleBlock);
@@ -3183,6 +3367,11 @@ void InnerLoopVectorizer::createInductionResumeValues(Loop *L,
      // value.
      for (BasicBlock *BB : LoopBypassBlocks)
        BCResumeVal->addIncoming(II.getStartValue(), BB);
+
+    if (AdditionalBypass.first)
+      BCResumeVal->setIncomingValueForBlock(AdditionalBypass.first,
+                                            EndValueFromAdditionalBypass);
+
      OrigPhi->setIncomingValueForBlock(LoopScalarPreHeader, BCResumeVal);
    }
  }
@@ -5495,6 +5684,13 @@ LoopVectorizationCostModel::selectVectorizationFactor(ElementCount MaxVF) {
                   << " because it will not generate any vector instructions.\n");
        continue;
      }
+
+    // If profitable add it to ProfitableVF list.
+    if (VectorCost < ScalarCost) {
+      ProfitableVFs.push_back(VectorizationFactor(
+          {ElementCount::getFixed(i), (unsigned)VectorCost}));
+    }
+
      if (VectorCost < Cost) {
        Cost = VectorCost;
        Width = i;
@@ -5518,6 +5714,117 @@ LoopVectorizationCostModel::selectVectorizationFactor(ElementCount MaxVF) {
    return Factor;
  }
  
+bool LoopVectorizationCostModel::isCandidateForEpilogueVectorization(
+    const Loop &L, ElementCount VF) const {
+  // Cross iteration phis such as reductions need special handling and are
+  // currently unsupported.
+  if (any_of(L.getHeader()->phis(), [&](PHINode &Phi) {
+        return Legal->isFirstOrderRecurrence(&Phi) ||
+               Legal->isReductionVariable(&Phi);
+      }))
+    return false;
+
+  // Phis with uses outside of the loop require special handling and are
+  // currently unsupported.
+  for (auto &Entry : Legal->getInductionVars()) {
+    // Look for uses of the value of the induction at the last iteration.
+    Value *PostInc = Entry.first->getIncomingValueForBlock(L.getLoopLatch());
+    for (User *U : PostInc->users())
+      if (!L.contains(cast<Instruction>(U)))
+        return false;
+    // Look for uses of penultimate value of the induction.
+    for (User *U : Entry.first->users())
+      if (!L.contains(cast<Instruction>(U)))
+        return false;
+  }
+
+  // Induction variables that are widened require special handling that is
+  // currently not supported.
+  if (any_of(Legal->getInductionVars(), [&](auto &Entry) {
+        return !(isScalarAfterVectorization(Entry.first, VF) ||
+                 isProfitableToScalarize(Entry.first, VF));
+      }))
+    return false;
+
+  return true;
+}
+
+bool LoopVectorizationCostModel::isEpilogueVectorizationProfitable(
+    const ElementCount VF) const {
+  // FIXME: We need a much better cost-model to take different parameters such
+  // as register pressure, code size increase and cost of extra branches into
+  // account. For now we apply a very crude heuristic and only consider loops
+  // with vectorization factors larger than a certain value.
+  // We also consider epilogue vectorization unprofitable for targets that don't
+  // consider interleaving beneficial (eg. MVE).
+  if (TTI.getMaxInterleaveFactor(VF.getKnownMinValue()) <= 1)
+    return false;
+  if (VF.getFixedValue() >= EpilogueVectorizationMinVF)
+    return true;
+  return false;
+}
+
+VectorizationFactor
+LoopVectorizationCostModel::selectEpilogueVectorizationFactor(
+    const ElementCount MainLoopVF, const LoopVectorizationPlanner &LVP) {
+  VectorizationFactor Result = VectorizationFactor::Disabled();
+  if (!EnableEpilogueVectorization) {
+    LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization is disabled.\n";);
+    return Result;
+  }
+
+  if (!isScalarEpilogueAllowed()) {
+    LLVM_DEBUG(
+        dbgs() << "LEV: Unable to vectorize epilogue because no epilogue is "
+                  "allowed.\n";);
+    return Result;
+  }
+
+  // Not really a cost consideration, but check for unsupported cases here to
+  // simplify the logic.
+  if (!isCandidateForEpilogueVectorization(*TheLoop, MainLoopVF)) {
+    LLVM_DEBUG(
+        dbgs() << "LEV: Unable to vectorize epilogue because the loop is "
+                  "not a supported candidate.\n";);
+    return Result;
+  }
+
+  if (EpilogueVectorizationForceVF > 1) {
+    LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization factor is forced.\n";);
+    if (LVP.hasPlanWithVFs(
+            {MainLoopVF, ElementCount::getFixed(EpilogueVectorizationForceVF)}))
+      return {ElementCount::getFixed(EpilogueVectorizationForceVF), 0};
+    else {
+      LLVM_DEBUG(
+          dbgs()
+              << "LEV: Epilogue vectorization forced factor is not viable.\n";);
+      return Result;
+    }
+  }
+
+  if (TheLoop->getHeader()->getParent()->hasOptSize() ||
+      TheLoop->getHeader()->getParent()->hasMinSize()) {
+    LLVM_DEBUG(
+        dbgs()
+            << "LEV: Epilogue vectorization skipped due to opt for size.\n";);
+    return Result;
+  }
+
+  if (!isEpilogueVectorizationProfitable(MainLoopVF))
+    return Result;
+
+  for (auto &NextVF : ProfitableVFs)
+    if (ElementCount::isKnownLT(NextVF.Width, MainLoopVF) &&
+        (Result.Width.getFixedValue() == 1 || NextVF.Cost < Result.Cost) &&
+        LVP.hasPlanWithVFs({MainLoopVF, NextVF.Width}))
+      Result = NextVF;
+
+  if (Result != VectorizationFactor::Disabled())
+    LLVM_DEBUG(dbgs() << "LEV: Vectorizing epilogue loop with VF = "
+                      << Result.Width.getFixedValue() << "\n";);
+  return Result;
+}
+
  std::pair<unsigned, unsigned>
  LoopVectorizationCostModel::getSmallestAndWidestTypes() {
    unsigned MinWidth = -1U;
@@ -7122,6 +7429,8 @@ void LoopVectorizationPlanner::executePlan(InnerLoopVectorizer &ILV,
    State.TripCount = ILV.getOrCreateTripCount(nullptr);
    State.CanonicalIV = ILV.Induction;
  
+  ILV.printDebugTracesAtStart();
+
    //===------------------------------------------------===//
    //
    // Notice: any optimization or new instruction that go
@@ -7137,6 +7446,8 @@ void LoopVectorizationPlanner::executePlan(InnerLoopVectorizer &ILV,
    // 3. Fix the vectorized code: take care of header phi's, live-outs,
    //    predication, updating analyses.
    ILV.fixVectorizedLoop();
+
+  ILV.printDebugTracesAtEnd();
  }
  
  void LoopVectorizationPlanner::collectTriviallyDeadInstructions(
@@ -7243,6 +7554,276 @@ static void AddRuntimeUnrollDisableMetaData(Loop *L) {
    }
  }
  
+//===--------------------------------------------------------------------===//
+// EpilogueVectorizerMainLoop
+//===--------------------------------------------------------------------===//
+
+/// This function is partially responsible for generating the control flow
+/// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization.
+BasicBlock *EpilogueVectorizerMainLoop::createEpilogueVectorizedLoopSkeleton() {
+  MDNode *OrigLoopID = OrigLoop->getLoopID();
+  Loop *Lp = createVectorLoopSkeleton("");
+
+  // Generate the code to check the minimum iteration count of the vector
+  // epilogue (see below).
+  EPI.EpilogueIterationCountCheck =
+      emitMinimumIterationCountCheck(Lp, LoopScalarPreHeader, true);
+  EPI.EpilogueIterationCountCheck->setName("iter.check");
+
+  // Generate the code to check any assumptions that we've made for SCEV
+  // expressions.
+  BasicBlock *SavedPreHeader = LoopVectorPreHeader;
+  emitSCEVChecks(Lp, LoopScalarPreHeader);
+
+  // If a safety check was generated save it.
+  if (SavedPreHeader != LoopVectorPreHeader)
+    EPI.SCEVSafetyCheck = SavedPreHeader;
+
+  // Generate the code that checks at runtime if arrays overlap. We put the
+  // checks into a separate block to make the more common case of few elements
+  // faster.
+  SavedPreHeader = LoopVectorPreHeader;
+  emitMemRuntimeChecks(Lp, LoopScalarPreHeader);
+
+  // If a safety check was generated save/overwite it.
+  if (SavedPreHeader != LoopVectorPreHeader)
+    EPI.MemSafetyCheck = SavedPreHeader;
+
+  // Generate the iteration count check for the main loop, *after* the check
+  // for the epilogue loop, so that the path-length is shorter for the case
+  // that goes directly through the vector epilogue. The longer-path length for
+  // the main loop is compensated for, by the gain from vectorizing the larger
+  // trip count. Note: the branch will get updated later on when we vectorize
+  // the epilogue.
+  EPI.MainLoopIterationCountCheck =
+      emitMinimumIterationCountCheck(Lp, LoopScalarPreHeader, false);
+
+  // Generate the induction variable.
+  OldInduction = Legal->getPrimaryInduction();
+  Type *IdxTy = Legal->getWidestInductionType();
+  Value *StartIdx = ConstantInt::get(IdxTy, 0);
+  Constant *Step = ConstantInt::get(IdxTy, VF.getKnownMinValue() * UF);
+  Value *CountRoundDown = getOrCreateVectorTripCount(Lp);
+  EPI.VectorTripCount = CountRoundDown;
+  Induction =
+      createInductionVariable(Lp, StartIdx, CountRoundDown, Step,
+                              getDebugLocFromInstOrOperands(OldInduction));
+
+  // Skip induction resume value creation here because they will be created in
+  // the second pass. If we created them here, they wouldn't be used anyway,
+  // because the vplan in the second pass still contains the inductions from the
+  // original loop.
+
+  return completeLoopSkeleton(Lp, OrigLoopID);
+}
+
+void EpilogueVectorizerMainLoop::printDebugTracesAtStart() {
+  LLVM_DEBUG({
+    dbgs() << "Create Skeleton for epilogue vectorized loop (first pass)\n"
+           << "Main Loop VF:" << EPI.MainLoopVF.getKnownMinValue()
+           << ", Main Loop UF:" << EPI.MainLoopUF
+           << ", Epilogue Loop VF:" << EPI.EpilogueVF.getKnownMinValue()
+           << ", Epilogue Loop UF:" << EPI.EpilogueUF << "\n";
+  });
+}
+
+void EpilogueVectorizerMainLoop::printDebugTracesAtEnd() {
+  DEBUG_WITH_TYPE(VerboseDebug, {
+    dbgs() << "intermediate fn:\n" << *Induction->getFunction() << "\n";
+  });
+}
+
+BasicBlock *EpilogueVectorizerMainLoop::emitMinimumIterationCountCheck(
+    Loop *L, BasicBlock *Bypass, bool ForEpilogue) {
+  assert(L && "Expected valid Loop.");
+  assert(Bypass && "Expected valid bypass basic block.");
+  unsigned VFactor =
+      ForEpilogue ? EPI.EpilogueVF.getKnownMinValue() : VF.getKnownMinValue();
+  unsigned UFactor = ForEpilogue ? EPI.EpilogueUF : UF;
+  Value *Count = getOrCreateTripCount(L);
+  // Reuse existing vector loop preheader for TC checks.
+  // Note that new preheader block is generated for vector loop.
+  BasicBlock *const TCCheckBlock = LoopVectorPreHeader;
+  IRBuilder<> Builder(TCCheckBlock->getTerminator());
+
+  // Generate code to check if the loop's trip count is less than VF * UF of the
+  // main vector loop.
+  auto P =
+      Cost->requiresScalarEpilogue() ? ICmpInst::ICMP_ULE : ICmpInst::ICMP_ULT;
+
+  Value *CheckMinIters = Builder.CreateICmp(
+      P, Count, ConstantInt::get(Count->getType(), VFactor * UFactor),
+      "min.iters.check");
+
+  if (!ForEpilogue)
+    TCCheckBlock->setName("vector.main.loop.iter.check");
+
+  // Create new preheader for vector loop.
+  LoopVectorPreHeader = SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(),
+                                   DT, LI, nullptr, "vector.ph");
+
+  if (ForEpilogue) {
+    assert(DT->properlyDominates(DT->getNode(TCCheckBlock),
+                                 DT->getNode(Bypass)->getIDom()) &&
+           "TC check is expected to dominate Bypass");
+
+    // Update dominator for Bypass & LoopExit.
+    DT->changeImmediateDominator(Bypass, TCCheckBlock);
+    DT->changeImmediateDominator(LoopExitBlock, TCCheckBlock);
+
+    LoopBypassBlocks.push_back(TCCheckBlock);
+
+    // Save the trip count so we don't have to regenerate it in the
+    // vec.epilog.iter.check. This is safe to do because the trip count
+    // generated here dominates the vector epilog iter check.
+    EPI.TripCount = Count;
+  }
+
+  ReplaceInstWithInst(
+      TCCheckBlock->getTerminator(),
+      BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters));
+
+  return TCCheckBlock;
+}
+
+//===--------------------------------------------------------------------===//
+// EpilogueVectorizerEpilogueLoop
+//===--------------------------------------------------------------------===//
+
+/// This function is partially responsible for generating the control flow
+/// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization.
+BasicBlock *
+EpilogueVectorizerEpilogueLoop::createEpilogueVectorizedLoopSkeleton() {
+  MDNode *OrigLoopID = OrigLoop->getLoopID();
+  Loop *Lp = createVectorLoopSkeleton("vec.epilog.");
+
+  // Now, compare the remaining count and if there aren't enough iterations to
+  // execute the vectorized epilogue skip to the scalar part.
+  BasicBlock *VecEpilogueIterationCountCheck = LoopVectorPreHeader;
+  VecEpilogueIterationCountCheck->setName("vec.epilog.iter.check");
+  LoopVectorPreHeader =
+      SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT,
+                 LI, nullptr, "vec.epilog.ph");
+  emitMinimumVectorEpilogueIterCountCheck(Lp, LoopScalarPreHeader,
+                                          VecEpilogueIterationCountCheck);
+
+  // Adjust the control flow taking the state info from the main loop
+  // vectorization into account.
+  assert(EPI.MainLoopIterationCountCheck && EPI.EpilogueIterationCountCheck &&
+         "expected this to be saved from the previous pass.");
+  EPI.MainLoopIterationCountCheck->getTerminator()->replaceUsesOfWith(
+      VecEpilogueIterationCountCheck, LoopVectorPreHeader);
+
+  DT->changeImmediateDominator(LoopVectorPreHeader,
+                               EPI.MainLoopIterationCountCheck);
+
+  EPI.EpilogueIterationCountCheck->getTerminator()->replaceUsesOfWith(
+      VecEpilogueIterationCountCheck, LoopScalarPreHeader);
+
+  if (EPI.SCEVSafetyCheck)
+    EPI.SCEVSafetyCheck->getTerminator()->replaceUsesOfWith(
+        VecEpilogueIterationCountCheck, LoopScalarPreHeader);
+  if (EPI.MemSafetyCheck)
+    EPI.MemSafetyCheck->getTerminator()->replaceUsesOfWith(
+        VecEpilogueIterationCountCheck, LoopScalarPreHeader);
+
+  DT->changeImmediateDominator(
+      VecEpilogueIterationCountCheck,
+      VecEpilogueIterationCountCheck->getSinglePredecessor());
+
+  DT->changeImmediateDominator(LoopScalarPreHeader,
+                               EPI.EpilogueIterationCountCheck);
+  DT->changeImmediateDominator(LoopExitBlock, EPI.EpilogueIterationCountCheck);
+
+  // Keep track of bypass blocks, as they feed start values to the induction
+  // phis in the scalar loop preheader.
+  if (EPI.SCEVSafetyCheck)
+    LoopBypassBlocks.push_back(EPI.SCEVSafetyCheck);
+  if (EPI.MemSafetyCheck)
+    LoopBypassBlocks.push_back(EPI.MemSafetyCheck);
+  LoopBypassBlocks.push_back(EPI.EpilogueIterationCountCheck);
+
+  // Generate a resume induction for the vector epilogue and put it in the
+  // vector epilogue preheader
+  Type *IdxTy = Legal->getWidestInductionType();
+  PHINode *EPResumeVal = PHINode::Create(IdxTy, 2, "vec.epilog.resume.val",
+                                         LoopVectorPreHeader->getFirstNonPHI());
+  EPResumeVal->addIncoming(EPI.VectorTripCount, VecEpilogueIterationCountCheck);
+  EPResumeVal->addIncoming(ConstantInt::get(IdxTy, 0),
+                           EPI.MainLoopIterationCountCheck);
+
+  // Generate the induction variable.
+  OldInduction = Legal->getPrimaryInduction();
+  Value *CountRoundDown = getOrCreateVectorTripCount(Lp);
+  Constant *Step = ConstantInt::get(IdxTy, VF.getKnownMinValue() * UF);
+  Value *StartIdx = EPResumeVal;
+  Induction =
+      createInductionVariable(Lp, StartIdx, CountRoundDown, Step,
+                              getDebugLocFromInstOrOperands(OldInduction));
+
+  // Generate induction resume values. These variables save the new starting
+  // indexes for the scalar loop. They are used to test if there are any tail
+  // iterations left once the vector loop has completed.
+  // Note that when the vectorized epilogue is skipped due to iteration count
+  // check, then the resume value for the induction variable comes from
+  // the trip count of the main vector loop, hence passing the AdditionalBypass
+  // argument.
+  createInductionResumeValues(Lp, CountRoundDown,
+                              {VecEpilogueIterationCountCheck,
+                               EPI.VectorTripCount} /* AdditionalBypass */);
+
+  AddRuntimeUnrollDisableMetaData(Lp);
+  return completeLoopSkeleton(Lp, OrigLoopID);
+}
+
+BasicBlock *
+EpilogueVectorizerEpilogueLoop::emitMinimumVectorEpilogueIterCountCheck(
+    Loop *L, BasicBlock *Bypass, BasicBlock *Insert) {
+
+  assert(EPI.TripCount &&
+         "Expected trip count to have been safed in the first pass.");
+  assert(!isa<Instruction>(EPI.TripCount) ||
+         DT->dominates(cast<Instruction>(EPI.TripCount)->getParent(), Insert) &&
+             "saved trip count does not dominate insertion point.");
+  Value *TC = EPI.TripCount;
+  IRBuilder<> Builder(Insert->getTerminator());
+  Value *Count = Builder.CreateSub(TC, EPI.VectorTripCount, "n.vec.remaining");
+
+  // Generate code to check if the loop's trip count is less than VF * UF of the
+  // vector epilogue loop.
+  auto P =
+      Cost->requiresScalarEpilogue() ? ICmpInst::ICMP_ULE : ICmpInst::ICMP_ULT;
+
+  Value *CheckMinIters = Builder.CreateICmp(
+      P, Count,
+      ConstantInt::get(Count->getType(),
+                       EPI.EpilogueVF.getKnownMinValue() * EPI.EpilogueUF),
+      "min.epilog.iters.check");
+
+  ReplaceInstWithInst(
+      Insert->getTerminator(),
+      BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters));
+
+  LoopBypassBlocks.push_back(Insert);
+  return Insert;
+}
+
+void EpilogueVectorizerEpilogueLoop::printDebugTracesAtStart() {
+  LLVM_DEBUG({
+    dbgs() << "Create Skeleton for epilogue vectorized loop (second pass)\n"
+           << "Main Loop VF:" << EPI.MainLoopVF.getKnownMinValue()
+           << ", Main Loop UF:" << EPI.MainLoopUF
+           << ", Epilogue Loop VF:" << EPI.EpilogueVF.getKnownMinValue()
+           << ", Epilogue Loop UF:" << EPI.EpilogueUF << "\n";
+  });
+}
+
+void EpilogueVectorizerEpilogueLoop::printDebugTracesAtEnd() {
+  DEBUG_WITH_TYPE(VerboseDebug, {
+    dbgs() << "final fn:\n" << *Induction->getFunction() << "\n";
+  });
+}
+
  bool LoopVectorizationPlanner::getDecisionAndClampRange(
      const std::function<bool(ElementCount)> &Predicate, VFRange &Range) {
    assert(!Range.isEmpty() && "Trying to test an empty VF range.");
@@ -8608,16 +9189,51 @@ bool LoopVectorizePass::processLoop(Loop *L) {
      });
    } else {
      // If we decided that it is *legal* to vectorize the loop, then do it.
-    InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, IC,
-                           &LVL, &CM, BFI, PSI);
-    LVP.executePlan(LB, DT);
-    ++LoopsVectorized;
-
-    // Add metadata to disable runtime unrolling a scalar loop when there are
-    // no runtime checks about strides and memory. A scalar loop that is
-    // rarely used is not worth unrolling.
-    if (!LB.areSafetyChecksAdded())
-      DisableRuntimeUnroll = true;
+
+    // Consider vectorizing the epilogue too if it's profitable.
+    VectorizationFactor EpilogueVF =
+      CM.selectEpilogueVectorizationFactor(VF.Width, LVP);
+    if (EpilogueVF.Width.isVector()) {
+
+      // The first pass vectorizes the main loop and creates a scalar epilogue
+      // to be vectorized by executing the plan (potentially with a different
+      // factor) again shortly afterwards.
+      EpilogueLoopVectorizationInfo EPI(VF.Width.getKnownMinValue(), IC,
+                                        EpilogueVF.Width.getKnownMinValue(), 1);
+      EpilogueVectorizerMainLoop MainILV(L, PSE, LI, DT, TLI, TTI, AC, ORE, EPI,
+                                         &LVL, &CM, BFI, PSI);
+
+      LVP.setBestPlan(EPI.MainLoopVF, EPI.MainLoopUF);
+      LVP.executePlan(MainILV, DT);
+      ++LoopsVectorized;
+
+      simplifyLoop(L, DT, LI, SE, AC, nullptr, false /* PreserveLCSSA */);
+      formLCSSARecursively(*L, *DT, LI, SE);
+
+      // Second pass vectorizes the epilogue and adjusts the control flow
+      // edges from the first pass.
+      LVP.setBestPlan(EPI.EpilogueVF, EPI.EpilogueUF);
+      EPI.MainLoopVF = EPI.EpilogueVF;
+      EPI.MainLoopUF = EPI.EpilogueUF;
+      EpilogueVectorizerEpilogueLoop EpilogILV(L, PSE, LI, DT, TLI, TTI, AC,
+                                               ORE, EPI, &LVL, &CM, BFI, PSI);
+      LVP.executePlan(EpilogILV, DT);
+      ++LoopsEpilogueVectorized;
+
+      if (!MainILV.areSafetyChecksAdded())
+        DisableRuntimeUnroll = true;
+    } else {
+      InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, IC,
+                             &LVL, &CM, BFI, PSI);
+      LVP.executePlan(LB, DT);
+      ++LoopsVectorized;
+
+      // Add metadata to disable runtime unrolling a scalar loop when there are
+      // no runtime checks about strides and memory. A scalar loop that is
+      // rarely used is not worth unrolling.
+      if (!LB.areSafetyChecksAdded())
+        DisableRuntimeUnroll = true;
+    }
  
      // Report the vectorization decision.
      ORE->emit([&]() {
diff --git a/llvm/test/Transforms/LoopVectorize/PowerPC/optimal-epilog-vectorization-profitability.ll b/llvm/test/Transforms/LoopVectorize/PowerPC/optimal-epilog-vectorization-profitability.ll

new file mode 100644 (file)

index 0000000..7cb5f34
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/PowerPC/optimal-epilog-vectorization-profitability.ll
@@ -0,0 +1,133 @@
+; RUN: opt < %s -passes='loop-vectorize' -enable-epilogue-vectorization -S | FileCheck %s
+
+; TODO: For now test for the `-epilogue-vectorization-minimum-VF` option. In
+; the future we need to replace this with a more meaningful test of the
+; epilogue vectorization cost-model.
+; RUN: opt < %s -passes='loop-vectorize' -enable-epilogue-vectorization -epilogue-vectorization-minimum-VF=4 -S | FileCheck %s --check-prefix=CHECK-MIN-4
+; RUN: opt < %s -passes='loop-vectorize' -enable-epilogue-vectorization -S | FileCheck %s --check-prefix=CHECK-MIN-D
+
+target datalayout = "e-m:e-i64:64-n32:64"
+target triple = "powerpc64le-unknown-linux-gnu"
+
+; Do not vectorize epilogues for loops with minsize attribute
+; CHECK-LABLE: @f1
+; CHECK-NOT: vector.main.loop.iter.check
+; CHECK-NOT: vec.epilog.iter.check
+; CHECK-NOT: vec.epilog.ph
+; CHECK-NOT: vec.epilog.vector.body
+; CHECK-NOT: vec.epilog.middle.block
+
+define dso_local void @f1(float* noalias %aa, float* noalias %bb, float* noalias %cc, i32 signext %N) #0 {
+entry:
+  %cmp1 = icmp sgt i32 %N, 0
+  br i1 %cmp1, label %for.body.preheader, label %for.end
+
+for.body.preheader:                               ; preds = %entry
+  %wide.trip.count = zext i32 %N to i64
+  br label %for.body
+
+for.body:                                         ; preds = %for.body.preheader, %for.body
+  %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
+  %arrayidx = getelementptr inbounds float, float* %bb, i64 %indvars.iv
+  %0 = load float, float* %arrayidx, align 4
+  %arrayidx2 = getelementptr inbounds float, float* %cc, i64 %indvars.iv
+  %1 = load float, float* %arrayidx2, align 4
+  %add = fadd fast float %0, %1
+  %arrayidx4 = getelementptr inbounds float, float* %aa, i64 %indvars.iv
+  store float %add, float* %arrayidx4, align 4
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond = icmp ne i64 %indvars.iv.next, %wide.trip.count
+  br i1 %exitcond, label %for.body, label %for.end.loopexit
+
+for.end.loopexit:                                 ; preds = %for.body
+  br label %for.end
+
+for.end:                                          ; preds = %for.end.loopexit, %entry
+  ret void
+}
+
+; Do not vectorize epilogues for loops with optsize attribute
+; CHECK-LABLE: @f2
+; CHECK-NOT: vector.main.loop.iter.check
+; CHECK-NOT: vec.epilog.iter.check
+; CHECK-NOT: vec.epilog.ph
+; CHECK-NOT: vec.epilog.vector.body
+; CHECK-NOT: vec.epilog.middle.block
+
+define dso_local void @f2(float* noalias %aa, float* noalias %bb, float* noalias %cc, i32 signext %N) #1 {
+entry:
+  %cmp1 = icmp sgt i32 %N, 0
+  br i1 %cmp1, label %for.body.preheader, label %for.end
+
+for.body.preheader:                               ; preds = %entry
+  %wide.trip.count = zext i32 %N to i64
+  br label %for.body
+
+for.body:                                         ; preds = %for.body.preheader, %for.body
+  %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
+  %arrayidx = getelementptr inbounds float, float* %bb, i64 %indvars.iv
+  %0 = load float, float* %arrayidx, align 4
+  %arrayidx2 = getelementptr inbounds float, float* %cc, i64 %indvars.iv
+  %1 = load float, float* %arrayidx2, align 4
+  %add = fadd fast float %0, %1
+  %arrayidx4 = getelementptr inbounds float, float* %aa, i64 %indvars.iv
+  store float %add, float* %arrayidx4, align 4
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond = icmp ne i64 %indvars.iv.next, %wide.trip.count
+  br i1 %exitcond, label %for.body, label %for.end.loopexit
+
+for.end.loopexit:                                 ; preds = %for.body
+  br label %for.end
+
+for.end:                                          ; preds = %for.end.loopexit, %entry
+  ret void
+}
+
+; Do not vectorize the epilogue for loops with VF less than the default -epilogue-vectorization-minimum-VF of 16.
+; CHECK-MIN-D-LABLE: @f3
+; CHECK-MIN-D-NOT: vector.main.loop.iter.check
+; CHECK-MIN-D-NOT: vec.epilog.iter.check
+; CHECK-MIN-D-NOT: vec.epilog.ph
+; CHECK-MIN-D-NOT: vec.epilog.vector.body
+; CHECK-MIN-D-NOT: vec.epilog.middle.block
+
+; Specify a smaller minimum VF (via `-epilogue-vectorization-minimum-VF=4`) and
+; make sure the epilogue gets vectorized in that case.
+; CHECK-MIN-D-LABLE: @f3
+; CHECK-MIN-4: vector.main.loop.iter.check
+; CHECK-MIN-4: vec.epilog.iter.check
+; CHECK-MIN-4: vec.epilog.ph
+; CHECK-MIN-4: vec.epilog.vector.body
+; CHECK-MIN-4: vec.epilog.middle.block
+
+define dso_local void @f3(float* noalias %aa, float* noalias %bb, float* noalias %cc, i32 signext %N) {
+entry:
+  %cmp1 = icmp sgt i32 %N, 0
+  br i1 %cmp1, label %for.body.preheader, label %for.end
+
+for.body.preheader:                               ; preds = %entry
+  %wide.trip.count = zext i32 %N to i64
+  br label %for.body
+
+for.body:                                         ; preds = %for.body.preheader, %for.body
+  %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
+  %arrayidx = getelementptr inbounds float, float* %bb, i64 %indvars.iv
+  %0 = load float, float* %arrayidx, align 4
+  %arrayidx2 = getelementptr inbounds float, float* %cc, i64 %indvars.iv
+  %1 = load float, float* %arrayidx2, align 4
+  %add = fadd fast float %0, %1
+  %arrayidx4 = getelementptr inbounds float, float* %aa, i64 %indvars.iv
+  store float %add, float* %arrayidx4, align 4
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond = icmp ne i64 %indvars.iv.next, %wide.trip.count
+  br i1 %exitcond, label %for.body, label %for.end.loopexit
+
+for.end.loopexit:                                 ; preds = %for.body
+  br label %for.end
+
+for.end:                                          ; preds = %for.end.loopexit, %entry
+  ret void
+}
+
+attributes #0 = { minsize }
+attributes #1 = { optsize }
+\ No newline at end of file
diff --git a/llvm/test/Transforms/LoopVectorize/PowerPC/optimal-epilog-vectorization.ll b/llvm/test/Transforms/LoopVectorize/PowerPC/optimal-epilog-vectorization.ll

new file mode 100644 (file)

index 0000000..82f73e2
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/PowerPC/optimal-epilog-vectorization.ll
@@ -0,0 +1,593 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -passes='loop-vectorize' -enable-epilogue-vectorization -epilogue-vectorization-force-VF=2 -S | FileCheck %s --check-prefix VF-TWO-CHECK
+; RUN: opt < %s -passes='loop-vectorize' -enable-epilogue-vectorization -epilogue-vectorization-force-VF=4 -S | FileCheck %s --check-prefix VF-FOUR-CHECK
+
+target datalayout = "e-m:e-i64:64-n32:64"
+target triple = "powerpc64le-unknown-linux-gnu"
+
+; Function Attrs: nounwind
+define dso_local void @f1(float* noalias %aa, float* noalias %bb, float* noalias %cc, i32 signext %N) #0 {
+; VF-TWO-CHECK-LABEL: @f1(
+; VF-TWO-CHECK-NEXT:  entry:
+; VF-TWO-CHECK-NEXT:    [[AA1:%.*]] = bitcast float* [[AA:%.*]] to i8*
+; VF-TWO-CHECK-NEXT:    [[BB3:%.*]] = bitcast float* [[BB:%.*]] to i8*
+; VF-TWO-CHECK-NEXT:    [[CC6:%.*]] = bitcast float* [[CC:%.*]] to i8*
+; VF-TWO-CHECK-NEXT:    [[CMP1:%.*]] = icmp sgt i32 [[N:%.*]], 0
+; VF-TWO-CHECK-NEXT:    br i1 [[CMP1]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_END:%.*]]
+; VF-TWO-CHECK:       iter.check:
+; VF-TWO-CHECK-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
+; VF-TWO-CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], 2
+; VF-TWO-CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]]
+; VF-TWO-CHECK:       vector.memcheck:
+; VF-TWO-CHECK-NEXT:    [[SCEVGEP:%.*]] = getelementptr float, float* [[AA]], i64 [[WIDE_TRIP_COUNT]]
+; VF-TWO-CHECK-NEXT:    [[SCEVGEP2:%.*]] = bitcast float* [[SCEVGEP]] to i8*
+; VF-TWO-CHECK-NEXT:    [[SCEVGEP4:%.*]] = getelementptr float, float* [[BB]], i64 [[WIDE_TRIP_COUNT]]
+; VF-TWO-CHECK-NEXT:    [[SCEVGEP45:%.*]] = bitcast float* [[SCEVGEP4]] to i8*
+; VF-TWO-CHECK-NEXT:    [[SCEVGEP7:%.*]] = getelementptr float, float* [[CC]], i64 [[WIDE_TRIP_COUNT]]
+; VF-TWO-CHECK-NEXT:    [[SCEVGEP78:%.*]] = bitcast float* [[SCEVGEP7]] to i8*
+; VF-TWO-CHECK-NEXT:    [[BOUND0:%.*]] = icmp ult i8* [[AA1]], [[SCEVGEP45]]
+; VF-TWO-CHECK-NEXT:    [[BOUND1:%.*]] = icmp ult i8* [[BB3]], [[SCEVGEP2]]
+; VF-TWO-CHECK-NEXT:    [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
+; VF-TWO-CHECK-NEXT:    [[BOUND09:%.*]] = icmp ult i8* [[AA1]], [[SCEVGEP78]]
+; VF-TWO-CHECK-NEXT:    [[BOUND110:%.*]] = icmp ult i8* [[CC6]], [[SCEVGEP2]]
+; VF-TWO-CHECK-NEXT:    [[FOUND_CONFLICT11:%.*]] = and i1 [[BOUND09]], [[BOUND110]]
+; VF-TWO-CHECK-NEXT:    [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT11]]
+; VF-TWO-CHECK-NEXT:    [[MEMCHECK_CONFLICT:%.*]] = and i1 [[CONFLICT_RDX]], true
+; VF-TWO-CHECK-NEXT:    br i1 [[MEMCHECK_CONFLICT]], label [[SCALAR_PH]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]]
+; VF-TWO-CHECK:       vector.main.loop.iter.check:
+; VF-TWO-CHECK-NEXT:    [[MIN_ITERS_CHECK12:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], 48
+; VF-TWO-CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK12]], label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH:%.*]]
+; VF-TWO-CHECK:       vector.ph:
+; VF-TWO-CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], 48
+; VF-TWO-CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
+; VF-TWO-CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; VF-TWO-CHECK:       vector.body:
+; VF-TWO-CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; VF-TWO-CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
+; VF-TWO-CHECK-NEXT:    [[TMP1:%.*]] = add i64 [[INDEX]], 4
+; VF-TWO-CHECK-NEXT:    [[TMP2:%.*]] = add i64 [[INDEX]], 8
+; VF-TWO-CHECK-NEXT:    [[TMP3:%.*]] = add i64 [[INDEX]], 12
+; VF-TWO-CHECK-NEXT:    [[TMP4:%.*]] = add i64 [[INDEX]], 16
+; VF-TWO-CHECK-NEXT:    [[TMP5:%.*]] = add i64 [[INDEX]], 20
+; VF-TWO-CHECK-NEXT:    [[TMP6:%.*]] = add i64 [[INDEX]], 24
+; VF-TWO-CHECK-NEXT:    [[TMP7:%.*]] = add i64 [[INDEX]], 28
+; VF-TWO-CHECK-NEXT:    [[TMP8:%.*]] = add i64 [[INDEX]], 32
+; VF-TWO-CHECK-NEXT:    [[TMP9:%.*]] = add i64 [[INDEX]], 36
+; VF-TWO-CHECK-NEXT:    [[TMP10:%.*]] = add i64 [[INDEX]], 40
+; VF-TWO-CHECK-NEXT:    [[TMP11:%.*]] = add i64 [[INDEX]], 44
+; VF-TWO-CHECK-NEXT:    [[TMP12:%.*]] = getelementptr inbounds float, float* [[BB]], i64 [[TMP0]]
+; VF-TWO-CHECK-NEXT:    [[TMP13:%.*]] = getelementptr inbounds float, float* [[BB]], i64 [[TMP1]]
+; VF-TWO-CHECK-NEXT:    [[TMP14:%.*]] = getelementptr inbounds float, float* [[BB]], i64 [[TMP2]]
+; VF-TWO-CHECK-NEXT:    [[TMP15:%.*]] = getelementptr inbounds float, float* [[BB]], i64 [[TMP3]]
+; VF-TWO-CHECK-NEXT:    [[TMP16:%.*]] = getelementptr inbounds float, float* [[BB]], i64 [[TMP4]]
+; VF-TWO-CHECK-NEXT:    [[TMP17:%.*]] = getelementptr inbounds float, float* [[BB]], i64 [[TMP5]]
+; VF-TWO-CHECK-NEXT:    [[TMP18:%.*]] = getelementptr inbounds float, float* [[BB]], i64 [[TMP6]]
+; VF-TWO-CHECK-NEXT:    [[TMP19:%.*]] = getelementptr inbounds float, float* [[BB]], i64 [[TMP7]]
+; VF-TWO-CHECK-NEXT:    [[TMP20:%.*]] = getelementptr inbounds float, float* [[BB]], i64 [[TMP8]]
+; VF-TWO-CHECK-NEXT:    [[TMP21:%.*]] = getelementptr inbounds float, float* [[BB]], i64 [[TMP9]]
+; VF-TWO-CHECK-NEXT:    [[TMP22:%.*]] = getelementptr inbounds float, float* [[BB]], i64 [[TMP10]]
+; VF-TWO-CHECK-NEXT:    [[TMP23:%.*]] = getelementptr inbounds float, float* [[BB]], i64 [[TMP11]]
+; VF-TWO-CHECK-NEXT:    [[TMP24:%.*]] = getelementptr inbounds float, float* [[TMP12]], i32 0
+; VF-TWO-CHECK-NEXT:    [[TMP25:%.*]] = bitcast float* [[TMP24]] to <4 x float>*
+; VF-TWO-CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x float>, <4 x float>* [[TMP25]], align 4
+; VF-TWO-CHECK-NEXT:    [[TMP26:%.*]] = getelementptr inbounds float, float* [[TMP12]], i32 4
+; VF-TWO-CHECK-NEXT:    [[TMP27:%.*]] = bitcast float* [[TMP26]] to <4 x float>*
+; VF-TWO-CHECK-NEXT:    [[WIDE_LOAD13:%.*]] = load <4 x float>, <4 x float>* [[TMP27]], align 4
+; VF-TWO-CHECK-NEXT:    [[TMP28:%.*]] = getelementptr inbounds float, float* [[TMP12]], i32 8
+; VF-TWO-CHECK-NEXT:    [[TMP29:%.*]] = bitcast float* [[TMP28]] to <4 x float>*
+; VF-TWO-CHECK-NEXT:    [[WIDE_LOAD14:%.*]] = load <4 x float>, <4 x float>* [[TMP29]], align 4
+; VF-TWO-CHECK-NEXT:    [[TMP30:%.*]] = getelementptr inbounds float, float* [[TMP12]], i32 12
+; VF-TWO-CHECK-NEXT:    [[TMP31:%.*]] = bitcast float* [[TMP30]] to <4 x float>*
+; VF-TWO-CHECK-NEXT:    [[WIDE_LOAD15:%.*]] = load <4 x float>, <4 x float>* [[TMP31]], align 4
+; VF-TWO-CHECK-NEXT:    [[TMP32:%.*]] = getelementptr inbounds float, float* [[TMP12]], i32 16
+; VF-TWO-CHECK-NEXT:    [[TMP33:%.*]] = bitcast float* [[TMP32]] to <4 x float>*
+; VF-TWO-CHECK-NEXT:    [[WIDE_LOAD16:%.*]] = load <4 x float>, <4 x float>* [[TMP33]], align 4
+; VF-TWO-CHECK-NEXT:    [[TMP34:%.*]] = getelementptr inbounds float, float* [[TMP12]], i32 20
+; VF-TWO-CHECK-NEXT:    [[TMP35:%.*]] = bitcast float* [[TMP34]] to <4 x float>*
+; VF-TWO-CHECK-NEXT:    [[WIDE_LOAD17:%.*]] = load <4 x float>, <4 x float>* [[TMP35]], align 4
+; VF-TWO-CHECK-NEXT:    [[TMP36:%.*]] = getelementptr inbounds float, float* [[TMP12]], i32 24
+; VF-TWO-CHECK-NEXT:    [[TMP37:%.*]] = bitcast float* [[TMP36]] to <4 x float>*
+; VF-TWO-CHECK-NEXT:    [[WIDE_LOAD18:%.*]] = load <4 x float>, <4 x float>* [[TMP37]], align 4
+; VF-TWO-CHECK-NEXT:    [[TMP38:%.*]] = getelementptr inbounds float, float* [[TMP12]], i32 28
+; VF-TWO-CHECK-NEXT:    [[TMP39:%.*]] = bitcast float* [[TMP38]] to <4 x float>*
+; VF-TWO-CHECK-NEXT:    [[WIDE_LOAD19:%.*]] = load <4 x float>, <4 x float>* [[TMP39]], align 4
+; VF-TWO-CHECK-NEXT:    [[TMP40:%.*]] = getelementptr inbounds float, float* [[TMP12]], i32 32
+; VF-TWO-CHECK-NEXT:    [[TMP41:%.*]] = bitcast float* [[TMP40]] to <4 x float>*
+; VF-TWO-CHECK-NEXT:    [[WIDE_LOAD20:%.*]] = load <4 x float>, <4 x float>* [[TMP41]], align 4
+; VF-TWO-CHECK-NEXT:    [[TMP42:%.*]] = getelementptr inbounds float, float* [[TMP12]], i32 36
+; VF-TWO-CHECK-NEXT:    [[TMP43:%.*]] = bitcast float* [[TMP42]] to <4 x float>*
+; VF-TWO-CHECK-NEXT:    [[WIDE_LOAD21:%.*]] = load <4 x float>, <4 x float>* [[TMP43]], align 4
+; VF-TWO-CHECK-NEXT:    [[TMP44:%.*]] = getelementptr inbounds float, float* [[TMP12]], i32 40
+; VF-TWO-CHECK-NEXT:    [[TMP45:%.*]] = bitcast float* [[TMP44]] to <4 x float>*
+; VF-TWO-CHECK-NEXT:    [[WIDE_LOAD22:%.*]] = load <4 x float>, <4 x float>* [[TMP45]], align 4
+; VF-TWO-CHECK-NEXT:    [[TMP46:%.*]] = getelementptr inbounds float, float* [[TMP12]], i32 44
+; VF-TWO-CHECK-NEXT:    [[TMP47:%.*]] = bitcast float* [[TMP46]] to <4 x float>*
+; VF-TWO-CHECK-NEXT:    [[WIDE_LOAD23:%.*]] = load <4 x float>, <4 x float>* [[TMP47]], align 4
+; VF-TWO-CHECK-NEXT:    [[TMP48:%.*]] = getelementptr inbounds float, float* [[CC]], i64 [[TMP0]]
+; VF-TWO-CHECK-NEXT:    [[TMP49:%.*]] = getelementptr inbounds float, float* [[CC]], i64 [[TMP1]]
+; VF-TWO-CHECK-NEXT:    [[TMP50:%.*]] = getelementptr inbounds float, float* [[CC]], i64 [[TMP2]]
+; VF-TWO-CHECK-NEXT:    [[TMP51:%.*]] = getelementptr inbounds float, float* [[CC]], i64 [[TMP3]]
+; VF-TWO-CHECK-NEXT:    [[TMP52:%.*]] = getelementptr inbounds float, float* [[CC]], i64 [[TMP4]]
+; VF-TWO-CHECK-NEXT:    [[TMP53:%.*]] = getelementptr inbounds float, float* [[CC]], i64 [[TMP5]]
+; VF-TWO-CHECK-NEXT:    [[TMP54:%.*]] = getelementptr inbounds float, float* [[CC]], i64 [[TMP6]]
+; VF-TWO-CHECK-NEXT:    [[TMP55:%.*]] = getelementptr inbounds float, float* [[CC]], i64 [[TMP7]]
+; VF-TWO-CHECK-NEXT:    [[TMP56:%.*]] = getelementptr inbounds float, float* [[CC]], i64 [[TMP8]]
+; VF-TWO-CHECK-NEXT:    [[TMP57:%.*]] = getelementptr inbounds float, float* [[CC]], i64 [[TMP9]]
+; VF-TWO-CHECK-NEXT:    [[TMP58:%.*]] = getelementptr inbounds float, float* [[CC]], i64 [[TMP10]]
+; VF-TWO-CHECK-NEXT:    [[TMP59:%.*]] = getelementptr inbounds float, float* [[CC]], i64 [[TMP11]]
+; VF-TWO-CHECK-NEXT:    [[TMP60:%.*]] = getelementptr inbounds float, float* [[TMP48]], i32 0
+; VF-TWO-CHECK-NEXT:    [[TMP61:%.*]] = bitcast float* [[TMP60]] to <4 x float>*
+; VF-TWO-CHECK-NEXT:    [[WIDE_LOAD24:%.*]] = load <4 x float>, <4 x float>* [[TMP61]], align 4
+; VF-TWO-CHECK-NEXT:    [[TMP62:%.*]] = getelementptr inbounds float, float* [[TMP48]], i32 4
+; VF-TWO-CHECK-NEXT:    [[TMP63:%.*]] = bitcast float* [[TMP62]] to <4 x float>*
+; VF-TWO-CHECK-NEXT:    [[WIDE_LOAD25:%.*]] = load <4 x float>, <4 x float>* [[TMP63]], align 4
+; VF-TWO-CHECK-NEXT:    [[TMP64:%.*]] = getelementptr inbounds float, float* [[TMP48]], i32 8
+; VF-TWO-CHECK-NEXT:    [[TMP65:%.*]] = bitcast float* [[TMP64]] to <4 x float>*
+; VF-TWO-CHECK-NEXT:    [[WIDE_LOAD26:%.*]] = load <4 x float>, <4 x float>* [[TMP65]], align 4
+; VF-TWO-CHECK-NEXT:    [[TMP66:%.*]] = getelementptr inbounds float, float* [[TMP48]], i32 12
+; VF-TWO-CHECK-NEXT:    [[TMP67:%.*]] = bitcast float* [[TMP66]] to <4 x float>*
+; VF-TWO-CHECK-NEXT:    [[WIDE_LOAD27:%.*]] = load <4 x float>, <4 x float>* [[TMP67]], align 4
+; VF-TWO-CHECK-NEXT:    [[TMP68:%.*]] = getelementptr inbounds float, float* [[TMP48]], i32 16
+; VF-TWO-CHECK-NEXT:    [[TMP69:%.*]] = bitcast float* [[TMP68]] to <4 x float>*
+; VF-TWO-CHECK-NEXT:    [[WIDE_LOAD28:%.*]] = load <4 x float>, <4 x float>* [[TMP69]], align 4
+; VF-TWO-CHECK-NEXT:    [[TMP70:%.*]] = getelementptr inbounds float, float* [[TMP48]], i32 20
+; VF-TWO-CHECK-NEXT:    [[TMP71:%.*]] = bitcast float* [[TMP70]] to <4 x float>*
+; VF-TWO-CHECK-NEXT:    [[WIDE_LOAD29:%.*]] = load <4 x float>, <4 x float>* [[TMP71]], align 4
+; VF-TWO-CHECK-NEXT:    [[TMP72:%.*]] = getelementptr inbounds float, float* [[TMP48]], i32 24
+; VF-TWO-CHECK-NEXT:    [[TMP73:%.*]] = bitcast float* [[TMP72]] to <4 x float>*
+; VF-TWO-CHECK-NEXT:    [[WIDE_LOAD30:%.*]] = load <4 x float>, <4 x float>* [[TMP73]], align 4
+; VF-TWO-CHECK-NEXT:    [[TMP74:%.*]] = getelementptr inbounds float, float* [[TMP48]], i32 28
+; VF-TWO-CHECK-NEXT:    [[TMP75:%.*]] = bitcast float* [[TMP74]] to <4 x float>*
+; VF-TWO-CHECK-NEXT:    [[WIDE_LOAD31:%.*]] = load <4 x float>, <4 x float>* [[TMP75]], align 4
+; VF-TWO-CHECK-NEXT:    [[TMP76:%.*]] = getelementptr inbounds float, float* [[TMP48]], i32 32
+; VF-TWO-CHECK-NEXT:    [[TMP77:%.*]] = bitcast float* [[TMP76]] to <4 x float>*
+; VF-TWO-CHECK-NEXT:    [[WIDE_LOAD32:%.*]] = load <4 x float>, <4 x float>* [[TMP77]], align 4
+; VF-TWO-CHECK-NEXT:    [[TMP78:%.*]] = getelementptr inbounds float, float* [[TMP48]], i32 36
+; VF-TWO-CHECK-NEXT:    [[TMP79:%.*]] = bitcast float* [[TMP78]] to <4 x float>*
+; VF-TWO-CHECK-NEXT:    [[WIDE_LOAD33:%.*]] = load <4 x float>, <4 x float>* [[TMP79]], align 4
+; VF-TWO-CHECK-NEXT:    [[TMP80:%.*]] = getelementptr inbounds float, float* [[TMP48]], i32 40
+; VF-TWO-CHECK-NEXT:    [[TMP81:%.*]] = bitcast float* [[TMP80]] to <4 x float>*
+; VF-TWO-CHECK-NEXT:    [[WIDE_LOAD34:%.*]] = load <4 x float>, <4 x float>* [[TMP81]], align 4
+; VF-TWO-CHECK-NEXT:    [[TMP82:%.*]] = getelementptr inbounds float, float* [[TMP48]], i32 44
+; VF-TWO-CHECK-NEXT:    [[TMP83:%.*]] = bitcast float* [[TMP82]] to <4 x float>*
+; VF-TWO-CHECK-NEXT:    [[WIDE_LOAD35:%.*]] = load <4 x float>, <4 x float>* [[TMP83]], align 4
+; VF-TWO-CHECK-NEXT:    [[TMP84:%.*]] = fadd fast <4 x float> [[WIDE_LOAD]], [[WIDE_LOAD24]]
+; VF-TWO-CHECK-NEXT:    [[TMP85:%.*]] = fadd fast <4 x float> [[WIDE_LOAD13]], [[WIDE_LOAD25]]
+; VF-TWO-CHECK-NEXT:    [[TMP86:%.*]] = fadd fast <4 x float> [[WIDE_LOAD14]], [[WIDE_LOAD26]]
+; VF-TWO-CHECK-NEXT:    [[TMP87:%.*]] = fadd fast <4 x float> [[WIDE_LOAD15]], [[WIDE_LOAD27]]
+; VF-TWO-CHECK-NEXT:    [[TMP88:%.*]] = fadd fast <4 x float> [[WIDE_LOAD16]], [[WIDE_LOAD28]]
+; VF-TWO-CHECK-NEXT:    [[TMP89:%.*]] = fadd fast <4 x float> [[WIDE_LOAD17]], [[WIDE_LOAD29]]
+; VF-TWO-CHECK-NEXT:    [[TMP90:%.*]] = fadd fast <4 x float> [[WIDE_LOAD18]], [[WIDE_LOAD30]]
+; VF-TWO-CHECK-NEXT:    [[TMP91:%.*]] = fadd fast <4 x float> [[WIDE_LOAD19]], [[WIDE_LOAD31]]
+; VF-TWO-CHECK-NEXT:    [[TMP92:%.*]] = fadd fast <4 x float> [[WIDE_LOAD20]], [[WIDE_LOAD32]]
+; VF-TWO-CHECK-NEXT:    [[TMP93:%.*]] = fadd fast <4 x float> [[WIDE_LOAD21]], [[WIDE_LOAD33]]
+; VF-TWO-CHECK-NEXT:    [[TMP94:%.*]] = fadd fast <4 x float> [[WIDE_LOAD22]], [[WIDE_LOAD34]]
+; VF-TWO-CHECK-NEXT:    [[TMP95:%.*]] = fadd fast <4 x float> [[WIDE_LOAD23]], [[WIDE_LOAD35]]
+; VF-TWO-CHECK-NEXT:    [[TMP96:%.*]] = getelementptr inbounds float, float* [[AA]], i64 [[TMP0]]
+; VF-TWO-CHECK-NEXT:    [[TMP97:%.*]] = getelementptr inbounds float, float* [[AA]], i64 [[TMP1]]
+; VF-TWO-CHECK-NEXT:    [[TMP98:%.*]] = getelementptr inbounds float, float* [[AA]], i64 [[TMP2]]
+; VF-TWO-CHECK-NEXT:    [[TMP99:%.*]] = getelementptr inbounds float, float* [[AA]], i64 [[TMP3]]
+; VF-TWO-CHECK-NEXT:    [[TMP100:%.*]] = getelementptr inbounds float, float* [[AA]], i64 [[TMP4]]
+; VF-TWO-CHECK-NEXT:    [[TMP101:%.*]] = getelementptr inbounds float, float* [[AA]], i64 [[TMP5]]
+; VF-TWO-CHECK-NEXT:    [[TMP102:%.*]] = getelementptr inbounds float, float* [[AA]], i64 [[TMP6]]
+; VF-TWO-CHECK-NEXT:    [[TMP103:%.*]] = getelementptr inbounds float, float* [[AA]], i64 [[TMP7]]
+; VF-TWO-CHECK-NEXT:    [[TMP104:%.*]] = getelementptr inbounds float, float* [[AA]], i64 [[TMP8]]
+; VF-TWO-CHECK-NEXT:    [[TMP105:%.*]] = getelementptr inbounds float, float* [[AA]], i64 [[TMP9]]
+; VF-TWO-CHECK-NEXT:    [[TMP106:%.*]] = getelementptr inbounds float, float* [[AA]], i64 [[TMP10]]
+; VF-TWO-CHECK-NEXT:    [[TMP107:%.*]] = getelementptr inbounds float, float* [[AA]], i64 [[TMP11]]
+; VF-TWO-CHECK-NEXT:    [[TMP108:%.*]] = getelementptr inbounds float, float* [[TMP96]], i32 0
+; VF-TWO-CHECK-NEXT:    [[TMP109:%.*]] = bitcast float* [[TMP108]] to <4 x float>*
+; VF-TWO-CHECK-NEXT:    store <4 x float> [[TMP84]], <4 x float>* [[TMP109]], align 4
+; VF-TWO-CHECK-NEXT:    [[TMP110:%.*]] = getelementptr inbounds float, float* [[TMP96]], i32 4
+; VF-TWO-CHECK-NEXT:    [[TMP111:%.*]] = bitcast float* [[TMP110]] to <4 x float>*
+; VF-TWO-CHECK-NEXT:    store <4 x float> [[TMP85]], <4 x float>* [[TMP111]], align 4
+; VF-TWO-CHECK-NEXT:    [[TMP112:%.*]] = getelementptr inbounds float, float* [[TMP96]], i32 8
+; VF-TWO-CHECK-NEXT:    [[TMP113:%.*]] = bitcast float* [[TMP112]] to <4 x float>*
+; VF-TWO-CHECK-NEXT:    store <4 x float> [[TMP86]], <4 x float>* [[TMP113]], align 4
+; VF-TWO-CHECK-NEXT:    [[TMP114:%.*]] = getelementptr inbounds float, float* [[TMP96]], i32 12
+; VF-TWO-CHECK-NEXT:    [[TMP115:%.*]] = bitcast float* [[TMP114]] to <4 x float>*
+; VF-TWO-CHECK-NEXT:    store <4 x float> [[TMP87]], <4 x float>* [[TMP115]], align 4
+; VF-TWO-CHECK-NEXT:    [[TMP116:%.*]] = getelementptr inbounds float, float* [[TMP96]], i32 16
+; VF-TWO-CHECK-NEXT:    [[TMP117:%.*]] = bitcast float* [[TMP116]] to <4 x float>*
+; VF-TWO-CHECK-NEXT:    store <4 x float> [[TMP88]], <4 x float>* [[TMP117]], align 4
+; VF-TWO-CHECK-NEXT:    [[TMP118:%.*]] = getelementptr inbounds float, float* [[TMP96]], i32 20
+; VF-TWO-CHECK-NEXT:    [[TMP119:%.*]] = bitcast float* [[TMP118]] to <4 x float>*
+; VF-TWO-CHECK-NEXT:    store <4 x float> [[TMP89]], <4 x float>* [[TMP119]], align 4
+; VF-TWO-CHECK-NEXT:    [[TMP120:%.*]] = getelementptr inbounds float, float* [[TMP96]], i32 24
+; VF-TWO-CHECK-NEXT:    [[TMP121:%.*]] = bitcast float* [[TMP120]] to <4 x float>*
+; VF-TWO-CHECK-NEXT:    store <4 x float> [[TMP90]], <4 x float>* [[TMP121]], align 4
+; VF-TWO-CHECK-NEXT:    [[TMP122:%.*]] = getelementptr inbounds float, float* [[TMP96]], i32 28
+; VF-TWO-CHECK-NEXT:    [[TMP123:%.*]] = bitcast float* [[TMP122]] to <4 x float>*
+; VF-TWO-CHECK-NEXT:    store <4 x float> [[TMP91]], <4 x float>* [[TMP123]], align 4
+; VF-TWO-CHECK-NEXT:    [[TMP124:%.*]] = getelementptr inbounds float, float* [[TMP96]], i32 32
+; VF-TWO-CHECK-NEXT:    [[TMP125:%.*]] = bitcast float* [[TMP124]] to <4 x float>*
+; VF-TWO-CHECK-NEXT:    store <4 x float> [[TMP92]], <4 x float>* [[TMP125]], align 4
+; VF-TWO-CHECK-NEXT:    [[TMP126:%.*]] = getelementptr inbounds float, float* [[TMP96]], i32 36
+; VF-TWO-CHECK-NEXT:    [[TMP127:%.*]] = bitcast float* [[TMP126]] to <4 x float>*
+; VF-TWO-CHECK-NEXT:    store <4 x float> [[TMP93]], <4 x float>* [[TMP127]], align 4
+; VF-TWO-CHECK-NEXT:    [[TMP128:%.*]] = getelementptr inbounds float, float* [[TMP96]], i32 40
+; VF-TWO-CHECK-NEXT:    [[TMP129:%.*]] = bitcast float* [[TMP128]] to <4 x float>*
+; VF-TWO-CHECK-NEXT:    store <4 x float> [[TMP94]], <4 x float>* [[TMP129]], align 4
+; VF-TWO-CHECK-NEXT:    [[TMP130:%.*]] = getelementptr inbounds float, float* [[TMP96]], i32 44
+; VF-TWO-CHECK-NEXT:    [[TMP131:%.*]] = bitcast float* [[TMP130]] to <4 x float>*
+; VF-TWO-CHECK-NEXT:    store <4 x float> [[TMP95]], <4 x float>* [[TMP131]], align 4
+; VF-TWO-CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 48
+; VF-TWO-CHECK-NEXT:    [[TMP132:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; VF-TWO-CHECK-NEXT:    br i1 [[TMP132]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOPID_MV:!.*]]
+; VF-TWO-CHECK:       middle.block:
+; VF-TWO-CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
+; VF-TWO-CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]]
+; VF-TWO-CHECK:       vec.epilog.iter.check:
+; VF-TWO-CHECK-NEXT:    [[N_VEC_REMAINING:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
+; VF-TWO-CHECK-NEXT:    [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], 2
+; VF-TWO-CHECK-NEXT:    br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[SCALAR_PH]], label [[VEC_EPILOG_PH]]
+; VF-TWO-CHECK:       vec.epilog.ph:
+; VF-TWO-CHECK-NEXT:    [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
+; VF-TWO-CHECK-NEXT:    [[N_MOD_VF36:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], 2
+; VF-TWO-CHECK-NEXT:    [[N_VEC37:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF36]]
+; VF-TWO-CHECK-NEXT:    br label [[VEC_EPILOG_BODY:%.*]]
+; VF-TWO-CHECK:       vec.epilog.vector.body:
+; VF-TWO-CHECK-NEXT:    [[INDEX38:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT39:%.*]], [[VEC_EPILOG_BODY]] ]
+; VF-TWO-CHECK-NEXT:    [[TMP133:%.*]] = add i64 [[INDEX38]], 0
+; VF-TWO-CHECK-NEXT:    [[TMP134:%.*]] = getelementptr inbounds float, float* [[BB]], i64 [[TMP133]]
+; VF-TWO-CHECK-NEXT:    [[TMP135:%.*]] = getelementptr inbounds float, float* [[TMP134]], i32 0
+; VF-TWO-CHECK-NEXT:    [[TMP136:%.*]] = bitcast float* [[TMP135]] to <2 x float>*
+; VF-TWO-CHECK-NEXT:    [[WIDE_LOAD41:%.*]] = load <2 x float>, <2 x float>* [[TMP136]], align 4
+; VF-TWO-CHECK-NEXT:    [[TMP137:%.*]] = getelementptr inbounds float, float* [[CC]], i64 [[TMP133]]
+; VF-TWO-CHECK-NEXT:    [[TMP138:%.*]] = getelementptr inbounds float, float* [[TMP137]], i32 0
+; VF-TWO-CHECK-NEXT:    [[TMP139:%.*]] = bitcast float* [[TMP138]] to <2 x float>*
+; VF-TWO-CHECK-NEXT:    [[WIDE_LOAD42:%.*]] = load <2 x float>, <2 x float>* [[TMP139]], align 4
+; VF-TWO-CHECK-NEXT:    [[TMP140:%.*]] = fadd fast <2 x float> [[WIDE_LOAD41]], [[WIDE_LOAD42]]
+; VF-TWO-CHECK-NEXT:    [[TMP141:%.*]] = getelementptr inbounds float, float* [[AA]], i64 [[TMP133]]
+; VF-TWO-CHECK-NEXT:    [[TMP142:%.*]] = getelementptr inbounds float, float* [[TMP141]], i32 0
+; VF-TWO-CHECK-NEXT:    [[TMP143:%.*]] = bitcast float* [[TMP142]] to <2 x float>*
+; VF-TWO-CHECK-NEXT:    store <2 x float> [[TMP140]], <2 x float>* [[TMP143]], align 4
+; VF-TWO-CHECK-NEXT:    [[INDEX_NEXT39]] = add i64 [[INDEX38]], 2
+; VF-TWO-CHECK-NEXT:    [[TMP144:%.*]] = icmp eq i64 [[INDEX_NEXT39]], [[N_VEC37]]
+; VF-TWO-CHECK-NEXT:    br i1 [[TMP144]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_BODY]], !llvm.loop [[LOOPID_EV:!.*]]
+; VF-TWO-CHECK:       vec.epilog.middle.block:
+; VF-TWO-CHECK-NEXT:    [[CMP_N40:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC37]]
+; VF-TWO-CHECK-NEXT:    br i1 [[CMP_N40]], label [[FOR_END_LOOPEXIT_LOOPEXIT:%.*]], label [[SCALAR_PH]]
+; VF-TWO-CHECK:       scalar.ph:
+; VF-TWO-CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC37]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MEMCHECK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
+; VF-TWO-CHECK-NEXT:    br label [[FOR_BODY:%.*]]
+; VF-TWO-CHECK:       for.body:
+; VF-TWO-CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; VF-TWO-CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[BB]], i64 [[INDVARS_IV]]
+; VF-TWO-CHECK-NEXT:    [[TMP145:%.*]] = load float, float* [[ARRAYIDX]], align 4
+; VF-TWO-CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds float, float* [[CC]], i64 [[INDVARS_IV]]
+; VF-TWO-CHECK-NEXT:    [[TMP146:%.*]] = load float, float* [[ARRAYIDX2]], align 4
+; VF-TWO-CHECK-NEXT:    [[ADD:%.*]] = fadd fast float [[TMP145]], [[TMP146]]
+; VF-TWO-CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds float, float* [[AA]], i64 [[INDVARS_IV]]
+; VF-TWO-CHECK-NEXT:    store float [[ADD]], float* [[ARRAYIDX4]], align 4
+; VF-TWO-CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; VF-TWO-CHECK-NEXT:    [[EXITCOND:%.*]] = icmp ne i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
+; VF-TWO-CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_BODY]], label [[FOR_END_LOOPEXIT_LOOPEXIT]], !llvm.loop [[LOOPID_MS:!.*]]
+; VF-TWO-CHECK:       for.end.loopexit.loopexit:
+; VF-TWO-CHECK-NEXT:    br label [[FOR_END_LOOPEXIT]]
+; VF-TWO-CHECK:       for.end.loopexit:
+; VF-TWO-CHECK-NEXT:    br label [[FOR_END]]
+; VF-TWO-CHECK:       for.end:
+; VF-TWO-CHECK-NEXT:    ret void
+;
+; VF-TWO-CHECK-DAG: [[LOOPID_MV]] = distinct !{[[LOOPID_MV]], [[LOOPID_DISABLE_VECT:!.*]]}
+; VF-TWO-CHECK-DAG: [[LOOPID_EV]] = distinct !{[[LOOPID_EV]], [[LOOPID_DISABLE_UNROLL:!.*]], [[LOOPID_DISABLE_VECT:!.*]]}
+; VF-TWO-CHECK-DAG: [[LOOPID_DISABLE_VECT]] = [[DISABLE_VECT_STR:!{!"llvm.loop.isvectorized".*}.*]]
+; VF-TWO-CHECK-DAG: [[LOOPID_DISABLE_UNROLL]] = [[DISABLE_UNROLL_STR:!{!"llvm.loop.unroll.runtime.disable"}.*]]
+
+
+entry:
+  %cmp1 = icmp sgt i32 %N, 0
+  br i1 %cmp1, label %for.body.preheader, label %for.end
+
+for.body.preheader:                               ; preds = %entry
+  %wide.trip.count = zext i32 %N to i64
+  br label %for.body
+
+for.body:                                         ; preds = %for.body.preheader, %for.body
+  %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
+  %arrayidx = getelementptr inbounds float, float* %bb, i64 %indvars.iv
+  %0 = load float, float* %arrayidx, align 4
+  %arrayidx2 = getelementptr inbounds float, float* %cc, i64 %indvars.iv
+  %1 = load float, float* %arrayidx2, align 4
+  %add = fadd fast float %0, %1
+  %arrayidx4 = getelementptr inbounds float, float* %aa, i64 %indvars.iv
+  store float %add, float* %arrayidx4, align 4
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond = icmp ne i64 %indvars.iv.next, %wide.trip.count
+  br i1 %exitcond, label %for.body, label %for.end.loopexit
+
+for.end.loopexit:                                 ; preds = %for.body
+  br label %for.end
+
+for.end:                                          ; preds = %for.end.loopexit, %entry
+  ret void
+}
+
+define dso_local signext i32 @f2(float* noalias %A, float* noalias %B, i32 signext %n) #0 {
+; VF-FOUR-CHECK-LABEL: @f2(
+; VF-FOUR-CHECK-NEXT:  entry:
+; VF-FOUR-CHECK-NEXT:    [[A1:%.*]] = bitcast float* [[A:%.*]] to i8*
+; VF-FOUR-CHECK-NEXT:    [[CMP1:%.*]] = icmp sgt i32 [[N:%.*]], 1
+; VF-FOUR-CHECK-NEXT:    br i1 [[CMP1]], label [[ITER_CHECK:%.*]], label [[FOR_END:%.*]]
+; VF-FOUR-CHECK:       iter.check:
+; VF-FOUR-CHECK-NEXT:    [[TMP0:%.*]] = add i32 [[N]], -1
+; VF-FOUR-CHECK-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[TMP0]] to i64
+; VF-FOUR-CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], 4
+; VF-FOUR-CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH:%.*]], label [[VECTOR_SCEVCHECK:%.*]]
+; VF-FOUR-CHECK:       vector.scevcheck:
+; VF-FOUR-CHECK-NEXT:    [[TMP1:%.*]] = add nsw i64 [[WIDE_TRIP_COUNT]], -1
+; VF-FOUR-CHECK-NEXT:    [[TMP2:%.*]] = trunc i64 [[TMP1]] to i32
+; VF-FOUR-CHECK-NEXT:    [[MUL:%.*]] = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 1, i32 [[TMP2]])
+; VF-FOUR-CHECK-NEXT:    [[MUL_RESULT:%.*]] = extractvalue { i32, i1 } [[MUL]], 0
+; VF-FOUR-CHECK-NEXT:    [[MUL_OVERFLOW:%.*]] = extractvalue { i32, i1 } [[MUL]], 1
+; VF-FOUR-CHECK-NEXT:    [[TMP3:%.*]] = add i32 [[TMP0]], [[MUL_RESULT]]
+; VF-FOUR-CHECK-NEXT:    [[TMP4:%.*]] = sub i32 [[TMP0]], [[MUL_RESULT]]
+; VF-FOUR-CHECK-NEXT:    [[TMP5:%.*]] = icmp sgt i32 [[TMP4]], [[TMP0]]
+; VF-FOUR-CHECK-NEXT:    [[TMP6:%.*]] = icmp slt i32 [[TMP3]], [[TMP0]]
+; VF-FOUR-CHECK-NEXT:    [[TMP7:%.*]] = select i1 true, i1 [[TMP5]], i1 [[TMP6]]
+; VF-FOUR-CHECK-NEXT:    [[TMP8:%.*]] = icmp ugt i64 [[TMP1]], 4294967295
+; VF-FOUR-CHECK-NEXT:    [[TMP9:%.*]] = or i1 [[TMP7]], [[TMP8]]
+; VF-FOUR-CHECK-NEXT:    [[TMP10:%.*]] = or i1 [[TMP9]], [[MUL_OVERFLOW]]
+; VF-FOUR-CHECK-NEXT:    [[TMP11:%.*]] = or i1 false, [[TMP10]]
+; VF-FOUR-CHECK-NEXT:    br i1 [[TMP11]], label [[VEC_EPILOG_SCALAR_PH]], label [[VECTOR_MEM_CHECK:%.*]]
+; VF-FOUR-CHECK:       vector.memcheck:
+; VF-FOUR-CHECK-NEXT:    [[SCEVGEP:%.*]] = getelementptr float, float* [[A]], i64 [[WIDE_TRIP_COUNT]]
+; VF-FOUR-CHECK-NEXT:    [[SCEVGEP2:%.*]] = bitcast float* [[SCEVGEP]] to i8*
+; VF-FOUR-CHECK-NEXT:    [[TMP12:%.*]] = sext i32 [[TMP0]] to i64
+; VF-FOUR-CHECK-NEXT:    [[TMP13:%.*]] = add i64 [[TMP12]], 1
+; VF-FOUR-CHECK-NEXT:    [[TMP14:%.*]] = sub i64 [[TMP13]], [[WIDE_TRIP_COUNT]]
+; VF-FOUR-CHECK-NEXT:    [[SCEVGEP3:%.*]] = getelementptr float, float* [[B:%.*]], i64 [[TMP14]]
+; VF-FOUR-CHECK-NEXT:    [[SCEVGEP34:%.*]] = bitcast float* [[SCEVGEP3]] to i8*
+; VF-FOUR-CHECK-NEXT:    [[TMP15:%.*]] = add nsw i64 [[TMP12]], 1
+; VF-FOUR-CHECK-NEXT:    [[SCEVGEP5:%.*]] = getelementptr float, float* [[B]], i64 [[TMP15]]
+; VF-FOUR-CHECK-NEXT:    [[SCEVGEP56:%.*]] = bitcast float* [[SCEVGEP5]] to i8*
+; VF-FOUR-CHECK-NEXT:    [[BOUND0:%.*]] = icmp ult i8* [[A1]], [[SCEVGEP56]]
+; VF-FOUR-CHECK-NEXT:    [[BOUND1:%.*]] = icmp ult i8* [[SCEVGEP34]], [[SCEVGEP2]]
+; VF-FOUR-CHECK-NEXT:    [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
+; VF-FOUR-CHECK-NEXT:    [[MEMCHECK_CONFLICT:%.*]] = and i1 [[FOUND_CONFLICT]], true
+; VF-FOUR-CHECK-NEXT:    br i1 [[MEMCHECK_CONFLICT]], label [[VEC_EPILOG_SCALAR_PH]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]]
+; VF-FOUR-CHECK:       vector.main.loop.iter.check:
+; VF-FOUR-CHECK-NEXT:    [[MIN_ITERS_CHECK7:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], 32
+; VF-FOUR-CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK7]], label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH:%.*]]
+; VF-FOUR-CHECK:       vector.ph:
+; VF-FOUR-CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], 32
+; VF-FOUR-CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
+; VF-FOUR-CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; VF-FOUR-CHECK:       vector.body:
+; VF-FOUR-CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; VF-FOUR-CHECK-NEXT:    [[TMP16:%.*]] = add i64 [[INDEX]], 0
+; VF-FOUR-CHECK-NEXT:    [[TMP17:%.*]] = add i64 [[INDEX]], 4
+; VF-FOUR-CHECK-NEXT:    [[TMP18:%.*]] = add i64 [[INDEX]], 8
+; VF-FOUR-CHECK-NEXT:    [[TMP19:%.*]] = add i64 [[INDEX]], 12
+; VF-FOUR-CHECK-NEXT:    [[TMP20:%.*]] = add i64 [[INDEX]], 16
+; VF-FOUR-CHECK-NEXT:    [[TMP21:%.*]] = add i64 [[INDEX]], 20
+; VF-FOUR-CHECK-NEXT:    [[TMP22:%.*]] = add i64 [[INDEX]], 24
+; VF-FOUR-CHECK-NEXT:    [[TMP23:%.*]] = add i64 [[INDEX]], 28
+; VF-FOUR-CHECK-NEXT:    [[OFFSET_IDX:%.*]] = trunc i64 [[INDEX]] to i32
+; VF-FOUR-CHECK-NEXT:    [[TMP24:%.*]] = add i32 [[OFFSET_IDX]], 0
+; VF-FOUR-CHECK-NEXT:    [[TMP25:%.*]] = add i32 [[OFFSET_IDX]], 4
+; VF-FOUR-CHECK-NEXT:    [[TMP26:%.*]] = add i32 [[OFFSET_IDX]], 8
+; VF-FOUR-CHECK-NEXT:    [[TMP27:%.*]] = add i32 [[OFFSET_IDX]], 12
+; VF-FOUR-CHECK-NEXT:    [[TMP28:%.*]] = add i32 [[OFFSET_IDX]], 16
+; VF-FOUR-CHECK-NEXT:    [[TMP29:%.*]] = add i32 [[OFFSET_IDX]], 20
+; VF-FOUR-CHECK-NEXT:    [[TMP30:%.*]] = add i32 [[OFFSET_IDX]], 24
+; VF-FOUR-CHECK-NEXT:    [[TMP31:%.*]] = add i32 [[OFFSET_IDX]], 28
+; VF-FOUR-CHECK-NEXT:    [[TMP32:%.*]] = xor i32 [[TMP24]], -1
+; VF-FOUR-CHECK-NEXT:    [[TMP33:%.*]] = xor i32 [[TMP25]], -1
+; VF-FOUR-CHECK-NEXT:    [[TMP34:%.*]] = xor i32 [[TMP26]], -1
+; VF-FOUR-CHECK-NEXT:    [[TMP35:%.*]] = xor i32 [[TMP27]], -1
+; VF-FOUR-CHECK-NEXT:    [[TMP36:%.*]] = xor i32 [[TMP28]], -1
+; VF-FOUR-CHECK-NEXT:    [[TMP37:%.*]] = xor i32 [[TMP29]], -1
+; VF-FOUR-CHECK-NEXT:    [[TMP38:%.*]] = xor i32 [[TMP30]], -1
+; VF-FOUR-CHECK-NEXT:    [[TMP39:%.*]] = xor i32 [[TMP31]], -1
+; VF-FOUR-CHECK-NEXT:    [[TMP40:%.*]] = add i32 [[TMP32]], [[N]]
+; VF-FOUR-CHECK-NEXT:    [[TMP41:%.*]] = add i32 [[TMP33]], [[N]]
+; VF-FOUR-CHECK-NEXT:    [[TMP42:%.*]] = add i32 [[TMP34]], [[N]]
+; VF-FOUR-CHECK-NEXT:    [[TMP43:%.*]] = add i32 [[TMP35]], [[N]]
+; VF-FOUR-CHECK-NEXT:    [[TMP44:%.*]] = add i32 [[TMP36]], [[N]]
+; VF-FOUR-CHECK-NEXT:    [[TMP45:%.*]] = add i32 [[TMP37]], [[N]]
+; VF-FOUR-CHECK-NEXT:    [[TMP46:%.*]] = add i32 [[TMP38]], [[N]]
+; VF-FOUR-CHECK-NEXT:    [[TMP47:%.*]] = add i32 [[TMP39]], [[N]]
+; VF-FOUR-CHECK-NEXT:    [[TMP48:%.*]] = sext i32 [[TMP40]] to i64
+; VF-FOUR-CHECK-NEXT:    [[TMP49:%.*]] = sext i32 [[TMP41]] to i64
+; VF-FOUR-CHECK-NEXT:    [[TMP50:%.*]] = sext i32 [[TMP42]] to i64
+; VF-FOUR-CHECK-NEXT:    [[TMP51:%.*]] = sext i32 [[TMP43]] to i64
+; VF-FOUR-CHECK-NEXT:    [[TMP52:%.*]] = sext i32 [[TMP44]] to i64
+; VF-FOUR-CHECK-NEXT:    [[TMP53:%.*]] = sext i32 [[TMP45]] to i64
+; VF-FOUR-CHECK-NEXT:    [[TMP54:%.*]] = sext i32 [[TMP46]] to i64
+; VF-FOUR-CHECK-NEXT:    [[TMP55:%.*]] = sext i32 [[TMP47]] to i64
+; VF-FOUR-CHECK-NEXT:    [[TMP56:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[TMP48]]
+; VF-FOUR-CHECK-NEXT:    [[TMP57:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[TMP49]]
+; VF-FOUR-CHECK-NEXT:    [[TMP58:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[TMP50]]
+; VF-FOUR-CHECK-NEXT:    [[TMP59:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[TMP51]]
+; VF-FOUR-CHECK-NEXT:    [[TMP60:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[TMP52]]
+; VF-FOUR-CHECK-NEXT:    [[TMP61:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[TMP53]]
+; VF-FOUR-CHECK-NEXT:    [[TMP62:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[TMP54]]
+; VF-FOUR-CHECK-NEXT:    [[TMP63:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[TMP55]]
+; VF-FOUR-CHECK-NEXT:    [[TMP64:%.*]] = getelementptr inbounds float, float* [[TMP56]], i32 0
+; VF-FOUR-CHECK-NEXT:    [[TMP65:%.*]] = getelementptr inbounds float, float* [[TMP64]], i32 -3
+; VF-FOUR-CHECK-NEXT:    [[TMP66:%.*]] = bitcast float* [[TMP65]] to <4 x float>*
+; VF-FOUR-CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x float>, <4 x float>* [[TMP66]], align 4
+; VF-FOUR-CHECK-NEXT:    [[REVERSE:%.*]] = shufflevector <4 x float> [[WIDE_LOAD]], <4 x float> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; VF-FOUR-CHECK-NEXT:    [[TMP67:%.*]] = getelementptr inbounds float, float* [[TMP56]], i32 -4
+; VF-FOUR-CHECK-NEXT:    [[TMP68:%.*]] = getelementptr inbounds float, float* [[TMP67]], i32 -3
+; VF-FOUR-CHECK-NEXT:    [[TMP69:%.*]] = bitcast float* [[TMP68]] to <4 x float>*
+; VF-FOUR-CHECK-NEXT:    [[WIDE_LOAD8:%.*]] = load <4 x float>, <4 x float>* [[TMP69]], align 4
+; VF-FOUR-CHECK-NEXT:    [[REVERSE9:%.*]] = shufflevector <4 x float> [[WIDE_LOAD8]], <4 x float> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; VF-FOUR-CHECK-NEXT:    [[TMP70:%.*]] = getelementptr inbounds float, float* [[TMP56]], i32 -8
+; VF-FOUR-CHECK-NEXT:    [[TMP71:%.*]] = getelementptr inbounds float, float* [[TMP70]], i32 -3
+; VF-FOUR-CHECK-NEXT:    [[TMP72:%.*]] = bitcast float* [[TMP71]] to <4 x float>*
+; VF-FOUR-CHECK-NEXT:    [[WIDE_LOAD10:%.*]] = load <4 x float>, <4 x float>* [[TMP72]], align 4
+; VF-FOUR-CHECK-NEXT:    [[REVERSE11:%.*]] = shufflevector <4 x float> [[WIDE_LOAD10]], <4 x float> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; VF-FOUR-CHECK-NEXT:    [[TMP73:%.*]] = getelementptr inbounds float, float* [[TMP56]], i32 -12
+; VF-FOUR-CHECK-NEXT:    [[TMP74:%.*]] = getelementptr inbounds float, float* [[TMP73]], i32 -3
+; VF-FOUR-CHECK-NEXT:    [[TMP75:%.*]] = bitcast float* [[TMP74]] to <4 x float>*
+; VF-FOUR-CHECK-NEXT:    [[WIDE_LOAD12:%.*]] = load <4 x float>, <4 x float>* [[TMP75]], align 4
+; VF-FOUR-CHECK-NEXT:    [[REVERSE13:%.*]] = shufflevector <4 x float> [[WIDE_LOAD12]], <4 x float> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; VF-FOUR-CHECK-NEXT:    [[TMP76:%.*]] = getelementptr inbounds float, float* [[TMP56]], i32 -16
+; VF-FOUR-CHECK-NEXT:    [[TMP77:%.*]] = getelementptr inbounds float, float* [[TMP76]], i32 -3
+; VF-FOUR-CHECK-NEXT:    [[TMP78:%.*]] = bitcast float* [[TMP77]] to <4 x float>*
+; VF-FOUR-CHECK-NEXT:    [[WIDE_LOAD14:%.*]] = load <4 x float>, <4 x float>* [[TMP78]], align 4
+; VF-FOUR-CHECK-NEXT:    [[REVERSE15:%.*]] = shufflevector <4 x float> [[WIDE_LOAD14]], <4 x float> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; VF-FOUR-CHECK-NEXT:    [[TMP79:%.*]] = getelementptr inbounds float, float* [[TMP56]], i32 -20
+; VF-FOUR-CHECK-NEXT:    [[TMP80:%.*]] = getelementptr inbounds float, float* [[TMP79]], i32 -3
+; VF-FOUR-CHECK-NEXT:    [[TMP81:%.*]] = bitcast float* [[TMP80]] to <4 x float>*
+; VF-FOUR-CHECK-NEXT:    [[WIDE_LOAD16:%.*]] = load <4 x float>, <4 x float>* [[TMP81]], align 4
+; VF-FOUR-CHECK-NEXT:    [[REVERSE17:%.*]] = shufflevector <4 x float> [[WIDE_LOAD16]], <4 x float> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; VF-FOUR-CHECK-NEXT:    [[TMP82:%.*]] = getelementptr inbounds float, float* [[TMP56]], i32 -24
+; VF-FOUR-CHECK-NEXT:    [[TMP83:%.*]] = getelementptr inbounds float, float* [[TMP82]], i32 -3
+; VF-FOUR-CHECK-NEXT:    [[TMP84:%.*]] = bitcast float* [[TMP83]] to <4 x float>*
+; VF-FOUR-CHECK-NEXT:    [[WIDE_LOAD18:%.*]] = load <4 x float>, <4 x float>* [[TMP84]], align 4
+; VF-FOUR-CHECK-NEXT:    [[REVERSE19:%.*]] = shufflevector <4 x float> [[WIDE_LOAD18]], <4 x float> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; VF-FOUR-CHECK-NEXT:    [[TMP85:%.*]] = getelementptr inbounds float, float* [[TMP56]], i32 -28
+; VF-FOUR-CHECK-NEXT:    [[TMP86:%.*]] = getelementptr inbounds float, float* [[TMP85]], i32 -3
+; VF-FOUR-CHECK-NEXT:    [[TMP87:%.*]] = bitcast float* [[TMP86]] to <4 x float>*
+; VF-FOUR-CHECK-NEXT:    [[WIDE_LOAD20:%.*]] = load <4 x float>, <4 x float>* [[TMP87]], align 4
+; VF-FOUR-CHECK-NEXT:    [[REVERSE21:%.*]] = shufflevector <4 x float> [[WIDE_LOAD20]], <4 x float> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; VF-FOUR-CHECK-NEXT:    [[TMP88:%.*]] = fadd fast <4 x float> [[REVERSE]], <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>
+; VF-FOUR-CHECK-NEXT:    [[TMP89:%.*]] = fadd fast <4 x float> [[REVERSE9]], <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>
+; VF-FOUR-CHECK-NEXT:    [[TMP90:%.*]] = fadd fast <4 x float> [[REVERSE11]], <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>
+; VF-FOUR-CHECK-NEXT:    [[TMP91:%.*]] = fadd fast <4 x float> [[REVERSE13]], <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>
+; VF-FOUR-CHECK-NEXT:    [[TMP92:%.*]] = fadd fast <4 x float> [[REVERSE15]], <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>
+; VF-FOUR-CHECK-NEXT:    [[TMP93:%.*]] = fadd fast <4 x float> [[REVERSE17]], <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>
+; VF-FOUR-CHECK-NEXT:    [[TMP94:%.*]] = fadd fast <4 x float> [[REVERSE19]], <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>
+; VF-FOUR-CHECK-NEXT:    [[TMP95:%.*]] = fadd fast <4 x float> [[REVERSE21]], <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>
+; VF-FOUR-CHECK-NEXT:    [[TMP96:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP16]]
+; VF-FOUR-CHECK-NEXT:    [[TMP97:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP17]]
+; VF-FOUR-CHECK-NEXT:    [[TMP98:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP18]]
+; VF-FOUR-CHECK-NEXT:    [[TMP99:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP19]]
+; VF-FOUR-CHECK-NEXT:    [[TMP100:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP20]]
+; VF-FOUR-CHECK-NEXT:    [[TMP101:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP21]]
+; VF-FOUR-CHECK-NEXT:    [[TMP102:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP22]]
+; VF-FOUR-CHECK-NEXT:    [[TMP103:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP23]]
+; VF-FOUR-CHECK-NEXT:    [[TMP104:%.*]] = getelementptr inbounds float, float* [[TMP96]], i32 0
+; VF-FOUR-CHECK-NEXT:    [[TMP105:%.*]] = bitcast float* [[TMP104]] to <4 x float>*
+; VF-FOUR-CHECK-NEXT:    store <4 x float> [[TMP88]], <4 x float>* [[TMP105]], align 4
+; VF-FOUR-CHECK-NEXT:    [[TMP106:%.*]] = getelementptr inbounds float, float* [[TMP96]], i32 4
+; VF-FOUR-CHECK-NEXT:    [[TMP107:%.*]] = bitcast float* [[TMP106]] to <4 x float>*
+; VF-FOUR-CHECK-NEXT:    store <4 x float> [[TMP89]], <4 x float>* [[TMP107]], align 4
+; VF-FOUR-CHECK-NEXT:    [[TMP108:%.*]] = getelementptr inbounds float, float* [[TMP96]], i32 8
+; VF-FOUR-CHECK-NEXT:    [[TMP109:%.*]] = bitcast float* [[TMP108]] to <4 x float>*
+; VF-FOUR-CHECK-NEXT:    store <4 x float> [[TMP90]], <4 x float>* [[TMP109]], align 4
+; VF-FOUR-CHECK-NEXT:    [[TMP110:%.*]] = getelementptr inbounds float, float* [[TMP96]], i32 12
+; VF-FOUR-CHECK-NEXT:    [[TMP111:%.*]] = bitcast float* [[TMP110]] to <4 x float>*
+; VF-FOUR-CHECK-NEXT:    store <4 x float> [[TMP91]], <4 x float>* [[TMP111]], align 4
+; VF-FOUR-CHECK-NEXT:    [[TMP112:%.*]] = getelementptr inbounds float, float* [[TMP96]], i32 16
+; VF-FOUR-CHECK-NEXT:    [[TMP113:%.*]] = bitcast float* [[TMP112]] to <4 x float>*
+; VF-FOUR-CHECK-NEXT:    store <4 x float> [[TMP92]], <4 x float>* [[TMP113]], align 4
+; VF-FOUR-CHECK-NEXT:    [[TMP114:%.*]] = getelementptr inbounds float, float* [[TMP96]], i32 20
+; VF-FOUR-CHECK-NEXT:    [[TMP115:%.*]] = bitcast float* [[TMP114]] to <4 x float>*
+; VF-FOUR-CHECK-NEXT:    store <4 x float> [[TMP93]], <4 x float>* [[TMP115]], align 4
+; VF-FOUR-CHECK-NEXT:    [[TMP116:%.*]] = getelementptr inbounds float, float* [[TMP96]], i32 24
+; VF-FOUR-CHECK-NEXT:    [[TMP117:%.*]] = bitcast float* [[TMP116]] to <4 x float>*
+; VF-FOUR-CHECK-NEXT:    store <4 x float> [[TMP94]], <4 x float>* [[TMP117]], align 4
+; VF-FOUR-CHECK-NEXT:    [[TMP118:%.*]] = getelementptr inbounds float, float* [[TMP96]], i32 28
+; VF-FOUR-CHECK-NEXT:    [[TMP119:%.*]] = bitcast float* [[TMP118]] to <4 x float>*
+; VF-FOUR-CHECK-NEXT:    store <4 x float> [[TMP95]], <4 x float>* [[TMP119]], align 4
+; VF-FOUR-CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 32
+; VF-FOUR-CHECK-NEXT:    [[TMP120:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; VF-FOUR-CHECK-NEXT:    br i1 [[TMP120]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOPID_MV_CM:!.*]]
+; VF-FOUR-CHECK:       middle.block:
+; VF-FOUR-CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
+; VF-FOUR-CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]]
+; VF-FOUR-CHECK:       vec.epilog.iter.check:
+; VF-FOUR-CHECK-NEXT:    [[IND_END27:%.*]] = trunc i64 [[N_VEC]] to i32
+; VF-FOUR-CHECK-NEXT:    [[N_VEC_REMAINING:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
+; VF-FOUR-CHECK-NEXT:    [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], 4
+; VF-FOUR-CHECK-NEXT:    br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]]
+; VF-FOUR-CHECK:       vec.epilog.ph:
+; VF-FOUR-CHECK-NEXT:    [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
+; VF-FOUR-CHECK-NEXT:    [[N_MOD_VF22:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], 4
+; VF-FOUR-CHECK-NEXT:    [[N_VEC23:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF22]]
+; VF-FOUR-CHECK-NEXT:    [[IND_END:%.*]] = trunc i64 [[N_VEC23]] to i32
+; VF-FOUR-CHECK-NEXT:    br label [[VEC_EPILOG_VECTOR_BODY:%.*]]
+; VF-FOUR-CHECK:       vec.epilog.vector.body:
+; VF-FOUR-CHECK-NEXT:    [[INDEX24:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT25:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
+; VF-FOUR-CHECK-NEXT:    [[TMP121:%.*]] = add i64 [[INDEX24]], 0
+; VF-FOUR-CHECK-NEXT:    [[OFFSET_IDX29:%.*]] = trunc i64 [[INDEX24]] to i32
+; VF-FOUR-CHECK-NEXT:    [[TMP122:%.*]] = add i32 [[OFFSET_IDX29]], 0
+; VF-FOUR-CHECK-NEXT:    [[TMP123:%.*]] = xor i32 [[TMP122]], -1
+; VF-FOUR-CHECK-NEXT:    [[TMP124:%.*]] = add i32 [[TMP123]], [[N]]
+; VF-FOUR-CHECK-NEXT:    [[TMP125:%.*]] = sext i32 [[TMP124]] to i64
+; VF-FOUR-CHECK-NEXT:    [[TMP126:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[TMP125]]
+; VF-FOUR-CHECK-NEXT:    [[TMP127:%.*]] = getelementptr inbounds float, float* [[TMP126]], i32 0
+; VF-FOUR-CHECK-NEXT:    [[TMP128:%.*]] = getelementptr inbounds float, float* [[TMP127]], i32 -3
+; VF-FOUR-CHECK-NEXT:    [[TMP129:%.*]] = bitcast float* [[TMP128]] to <4 x float>*
+; VF-FOUR-CHECK-NEXT:    [[WIDE_LOAD30:%.*]] = load <4 x float>, <4 x float>* [[TMP129]], align 4
+; VF-FOUR-CHECK-NEXT:    [[REVERSE31:%.*]] = shufflevector <4 x float> [[WIDE_LOAD30]], <4 x float> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; VF-FOUR-CHECK-NEXT:    [[TMP130:%.*]] = fadd fast <4 x float> [[REVERSE31]], <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>
+; VF-FOUR-CHECK-NEXT:    [[TMP131:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP121]]
+; VF-FOUR-CHECK-NEXT:    [[TMP132:%.*]] = getelementptr inbounds float, float* [[TMP131]], i32 0
+; VF-FOUR-CHECK-NEXT:    [[TMP133:%.*]] = bitcast float* [[TMP132]] to <4 x float>*
+; VF-FOUR-CHECK-NEXT:    store <4 x float> [[TMP130]], <4 x float>* [[TMP133]], align 4
+; VF-FOUR-CHECK-NEXT:    [[INDEX_NEXT25]] = add i64 [[INDEX24]], 4
+; VF-FOUR-CHECK-NEXT:    [[TMP134:%.*]] = icmp eq i64 [[INDEX_NEXT25]], [[N_VEC23]]
+; VF-FOUR-CHECK-NEXT:    br i1 [[TMP134]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOPID_EV_CM:!.*]]
+; VF-FOUR-CHECK:       vec.epilog.middle.block:
+; VF-FOUR-CHECK-NEXT:    [[CMP_N28:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC23]]
+; VF-FOUR-CHECK-NEXT:    br i1 [[CMP_N28]], label [[FOR_END_LOOPEXIT_LOOPEXIT:%.*]], label [[VEC_EPILOG_SCALAR_PH]]
+; VF-FOUR-CHECK:       vec.epilog.scalar.ph:
+; VF-FOUR-CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC23]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_SCEVCHECK]] ], [ 0, [[VECTOR_MEM_CHECK]] ], [ 0, [[ITER_CHECK]] ]
+; VF-FOUR-CHECK-NEXT:    [[BC_RESUME_VAL26:%.*]] = phi i32 [ [[IND_END]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[IND_END27]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_SCEVCHECK]] ], [ 0, [[VECTOR_MEM_CHECK]] ], [ 0, [[ITER_CHECK]] ]
+; VF-FOUR-CHECK-NEXT:    br label [[FOR_BODY:%.*]]
+; VF-FOUR-CHECK:       for.body:
+; VF-FOUR-CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; VF-FOUR-CHECK-NEXT:    [[I_014:%.*]] = phi i32 [ [[BC_RESUME_VAL26]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[INC:%.*]], [[FOR_BODY]] ]
+; VF-FOUR-CHECK-NEXT:    [[TMP135:%.*]] = xor i32 [[I_014]], -1
+; VF-FOUR-CHECK-NEXT:    [[SUB2:%.*]] = add i32 [[TMP135]], [[N]]
+; VF-FOUR-CHECK-NEXT:    [[IDXPROM:%.*]] = sext i32 [[SUB2]] to i64
+; VF-FOUR-CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[IDXPROM]]
+; VF-FOUR-CHECK-NEXT:    [[TMP136:%.*]] = load float, float* [[ARRAYIDX]], align 4
+; VF-FOUR-CHECK-NEXT:    [[CONV3:%.*]] = fadd fast float [[TMP136]], 1.000000e+00
+; VF-FOUR-CHECK-NEXT:    [[ARRAYIDX5:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[INDVARS_IV]]
+; VF-FOUR-CHECK-NEXT:    store float [[CONV3]], float* [[ARRAYIDX5]], align 4
+; VF-FOUR-CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; VF-FOUR-CHECK-NEXT:    [[INC]] = add nuw nsw i32 [[I_014]], 1
+; VF-FOUR-CHECK-NEXT:    [[EXITCOND:%.*]] = icmp ne i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
+; VF-FOUR-CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_BODY]], label [[FOR_END_LOOPEXIT_LOOPEXIT]], !llvm.loop [[LOOPID_MS_CM:!.*]]
+; VF-FOUR-CHECK:       for.end.loopexit.loopexit:
+; VF-FOUR-CHECK-NEXT:    br label [[FOR_END_LOOPEXIT]]
+; VF-FOUR-CHECK:       for.end.loopexit:
+; VF-FOUR-CHECK-NEXT:    br label [[FOR_END]]
+; VF-FOUR-CHECK:       for.end:
+; VF-FOUR-CHECK-NEXT:    ret i32 0
+
+; VF-FOUR-CHECK-DAG: [[LOOPID_MV_CM]] = distinct !{[[LOOPID_MV_CM]], [[LOOPID_DISABLE_VECT_CM:!.*]]}
+; VF-FOUR-CHECK-DAG: [[LOOPID_EV_CM]] = distinct !{[[LOOPID_EV_CM]], [[LOOPID_DISABLE_UNROLL_CM:!.*]], [[LOOPID_DISABLE_VECT_CM:!.*]]}
+; VF-FOUR-CHECK-DAG: [[LOOPID_DISABLE_VECT_CM]] = [[DISABLE_VECT_STR_CM:!{!"llvm.loop.isvectorized".*}.*]]
+; VF-FOUR-CHECK-DAG: [[LOOPID_DISABLE_UNROLL_CM]] = [[DISABLE_UNROLL_STR_CM:!{!"llvm.loop.unroll.runtime.disable"}.*]]
+
+entry:
+  %cmp1 = icmp sgt i32 %n, 1
+  br i1 %cmp1, label %for.body.preheader, label %for.end
+
+for.body.preheader:                               ; preds = %entry
+  %0 = add i32 %n, -1
+  %wide.trip.count = zext i32 %0 to i64
+  br label %for.body
+
+for.body:                                         ; preds = %for.body.preheader, %for.body
+  %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
+  %i.014 = phi i32 [ 0, %for.body.preheader ], [ %inc, %for.body ]
+  %1 = xor i32 %i.014, -1
+  %sub2 = add i32 %1, %n
+  %idxprom = sext i32 %sub2 to i64
+  %arrayidx = getelementptr inbounds float, float* %B, i64 %idxprom
+  %2 = load float, float* %arrayidx, align 4
+  %conv3 = fadd fast float %2, 1.000000e+00
+  %arrayidx5 = getelementptr inbounds float, float* %A, i64 %indvars.iv
+  store float %conv3, float* %arrayidx5, align 4
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %inc = add nuw nsw i32 %i.014, 1
+  %exitcond = icmp ne i64 %indvars.iv.next, %wide.trip.count
+  br i1 %exitcond, label %for.body, label %for.end.loopexit
+
+for.end.loopexit:                                 ; preds = %for.body
+  br label %for.end
+
+for.end:                                          ; preds = %for.end.loopexit, %entry
+  ret i32 0
+}
+
+attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="true" "no-jump-tables"="false" "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="ppc64le" "target-features"="+altivec,+bpermd,+crypto,+direct-move,+extdiv,+htm,+power8-vector,+vsx,-power9-vector,-spe" "unsafe-fp-math"="true" "use-soft-float"="false" }
diff --git a/llvm/test/Transforms/LoopVectorize/X86/invariant-load-gather.ll b/llvm/test/Transforms/LoopVectorize/X86/invariant-load-gather.ll

index cf6cc1356e0c466b29c6e15c67fbaa2800e6642c..ed352559b5bff28ca89eb1e545ebd98038283563 100644 (file)
--- a/llvm/test/Transforms/LoopVectorize/X86/invariant-load-gather.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/invariant-load-gather.ll
@@ -25,22 +25,22 @@ define i32 @inv_load_conditional(i32* %a, i64 %n, i32* %b, i32 %k) {
  ; CHECK-NEXT:    br i1 [[FOUND_CONFLICT]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
  ; CHECK:       vector.ph:
  ; CHECK-NEXT:    [[N_VEC:%.*]] = and i64 [[SMAX]], 9223372036854775792
-; CHECK-NEXT:    [[BROADCAST_SPLATINSERT5:%.*]] = insertelement <16 x i32*> undef, i32* [[A]], i32 0
-; CHECK-NEXT:    [[BROADCAST_SPLAT6:%.*]] = shufflevector <16 x i32*> [[BROADCAST_SPLATINSERT5]], <16 x i32*> undef, <16 x i32> zeroinitializer
-; CHECK-NEXT:    [[BROADCAST_SPLATINSERT7:%.*]] = insertelement <16 x i32> undef, i32 [[NTRUNC]], i32 0
-; CHECK-NEXT:    [[BROADCAST_SPLAT8:%.*]] = shufflevector <16 x i32> [[BROADCAST_SPLATINSERT7]], <16 x i32> undef, <16 x i32> zeroinitializer
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <16 x i32*> undef, i32* [[A]], i32 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <16 x i32*> [[BROADCAST_SPLATINSERT]], <16 x i32*> undef, <16 x i32> zeroinitializer
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT5:%.*]] = insertelement <16 x i32> undef, i32 [[NTRUNC]], i32 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT6:%.*]] = shufflevector <16 x i32> [[BROADCAST_SPLATINSERT5]], <16 x i32> undef, <16 x i32> zeroinitializer
  ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
  ; CHECK:       vector.body:
  ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
  ; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[INDEX]]
-; CHECK-NEXT:    [[TMP3:%.*]] = icmp ne <16 x i32*> [[BROADCAST_SPLAT6]], zeroinitializer
+; CHECK-NEXT:    [[TMP3:%.*]] = icmp ne <16 x i32*> [[BROADCAST_SPLAT]], zeroinitializer
  ; CHECK-NEXT:    [[TMP4:%.*]] = bitcast i32* [[TMP2]] to <16 x i32>*
-; CHECK-NEXT:    store <16 x i32> [[BROADCAST_SPLAT8]], <16 x i32>* [[TMP4]], align 4, !alias.scope !0, !noalias !3
+; CHECK-NEXT:    store <16 x i32> [[BROADCAST_SPLAT6]], <16 x i32>* [[TMP4]], align 4, !alias.scope !0, !noalias !3
  ; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 16
  ; CHECK-NEXT:    [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !5
+; CHECK-NEXT:    br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP5:!llvm.loop !.*]]
  ; CHECK:       middle.block:
-; CHECK-NEXT:    [[WIDE_MASKED_GATHER:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[BROADCAST_SPLAT6]], i32 4, <16 x i1> [[TMP3]], <16 x i32> undef), !alias.scope !3
+; CHECK-NEXT:    [[WIDE_MASKED_GATHER:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[BROADCAST_SPLAT]], i32 4, <16 x i1> [[TMP3]], <16 x i32> undef), !alias.scope !3
  ; CHECK-NEXT:    [[PREDPHI:%.*]] = select <16 x i1> [[TMP3]], <16 x i32> [[WIDE_MASKED_GATHER]], <16 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 1>
  ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[SMAX]], [[N_VEC]]
  ; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <16 x i32> [[PREDPHI]], i32 15
@@ -51,9 +51,9 @@ define i32 @inv_load_conditional(i32* %a, i64 %n, i32* %b, i32 %k) {
  ; CHECK:       for.body:
  ; CHECK-NEXT:    [[I:%.*]] = phi i64 [ [[I_NEXT:%.*]], [[LATCH:%.*]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
  ; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[I]]
-; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i32* [[A]], null
+; CHECK-NEXT:    [[CMP_NOT:%.*]] = icmp eq i32* [[A]], null
  ; CHECK-NEXT:    store i32 [[NTRUNC]], i32* [[TMP1]], align 4
-; CHECK-NEXT:    br i1 [[CMP]], label [[LATCH]], label [[COND_LOAD:%.*]]
+; CHECK-NEXT:    br i1 [[CMP_NOT]], label [[LATCH]], label [[COND_LOAD:%.*]]
  ; CHECK:       cond_load:
  ; CHECK-NEXT:    [[ALOAD:%.*]] = load i32, i32* [[A]], align 4
  ; CHECK-NEXT:    br label [[LATCH]]
@@ -61,7 +61,7 @@ define i32 @inv_load_conditional(i32* %a, i64 %n, i32* %b, i32 %k) {
  ; CHECK-NEXT:    [[A_LCSSA:%.*]] = phi i32 [ [[ALOAD]], [[COND_LOAD]] ], [ 1, [[FOR_BODY]] ]
  ; CHECK-NEXT:    [[I_NEXT]] = add nuw nsw i64 [[I]], 1
  ; CHECK-NEXT:    [[COND:%.*]] = icmp slt i64 [[I_NEXT]], [[N]]
-; CHECK-NEXT:    br i1 [[COND]], label [[FOR_BODY]], label [[FOR_END]], !llvm.loop !7
+; CHECK-NEXT:    br i1 [[COND]], label [[FOR_BODY]], label [[FOR_END]], [[LOOP7:!llvm.loop !.*]]
  ; CHECK:       for.end:
  ; CHECK-NEXT:    [[A_LCSSA_LCSSA:%.*]] = phi i32 [ [[A_LCSSA]], [[LATCH]] ], [ [[TMP6]], [[MIDDLE_BLOCK]] ]
  ; CHECK-NEXT:    ret i32 [[A_LCSSA_LCSSA]]
diff --git a/llvm/test/Transforms/LoopVectorize/X86/invariant-store-vectorization.ll b/llvm/test/Transforms/LoopVectorize/X86/invariant-store-vectorization.ll

index be1b7c2d7ae8bea0d4e086786aa946fc23ef9dd7..1a6f59316a448ad58edd4f3ff5e116807f4aea00 100644 (file)
--- a/llvm/test/Transforms/LoopVectorize/X86/invariant-store-vectorization.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/invariant-store-vectorization.ll
@@ -54,7 +54,7 @@ define i32 @inv_val_store_to_inv_address_with_reduction(i32* %a, i64 %n, i32* %b
  ; CHECK-NEXT:    store i32 [[NTRUNC]], i32* [[A]], align 4, !alias.scope !3, !noalias !0
  ; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 64
  ; CHECK-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !5
+; CHECK-NEXT:    br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP5:!llvm.loop !.*]]
  ; CHECK:       middle.block:
  ; CHECK-NEXT:    [[BIN_RDX:%.*]] = add <16 x i32> [[TMP11]], [[TMP10]]
  ; CHECK-NEXT:    [[BIN_RDX11:%.*]] = add <16 x i32> [[TMP12]], [[BIN_RDX]]
@@ -75,7 +75,7 @@ define i32 @inv_val_store_to_inv_address_with_reduction(i32* %a, i64 %n, i32* %b
  ; CHECK-NEXT:    store i32 [[NTRUNC]], i32* [[A]], align 4
  ; CHECK-NEXT:    [[I_NEXT]] = add nuw nsw i64 [[I]], 1
  ; CHECK-NEXT:    [[COND:%.*]] = icmp slt i64 [[I_NEXT]], [[N]]
-; CHECK-NEXT:    br i1 [[COND]], label [[FOR_BODY]], label [[FOR_END]], !llvm.loop !7
+; CHECK-NEXT:    br i1 [[COND]], label [[FOR_BODY]], label [[FOR_END]], [[LOOP7:!llvm.loop !.*]]
  ; CHECK:       for.end:
  ; CHECK-NEXT:    [[T4:%.*]] = phi i32 [ [[T3]], [[FOR_BODY]] ], [ [[TMP15]], [[MIDDLE_BLOCK]] ]
  ; CHECK-NEXT:    ret i32 [[T4]]
@@ -141,7 +141,7 @@ define void @inv_val_store_to_inv_address_conditional(i32* %a, i64 %n, i32* %b,
  ; CHECK-NEXT:    call void @llvm.masked.scatter.v16i32.v16p0i32(<16 x i32> [[BROADCAST_SPLAT6]], <16 x i32*> [[BROADCAST_SPLAT8]], i32 4, <16 x i1> [[TMP4]]), !alias.scope !11
  ; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 16
  ; CHECK-NEXT:    [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !13
+; CHECK-NEXT:    br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP13:!llvm.loop !.*]]
  ; CHECK:       middle.block:
  ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[SMAX]], [[N_VEC]]
  ; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
@@ -161,7 +161,7 @@ define void @inv_val_store_to_inv_address_conditional(i32* %a, i64 %n, i32* %b,
  ; CHECK:       latch:
  ; CHECK-NEXT:    [[I_NEXT]] = add nuw nsw i64 [[I]], 1
  ; CHECK-NEXT:    [[COND:%.*]] = icmp slt i64 [[I_NEXT]], [[N]]
-; CHECK-NEXT:    br i1 [[COND]], label [[FOR_BODY]], label [[FOR_END]], !llvm.loop !14
+; CHECK-NEXT:    br i1 [[COND]], label [[FOR_BODY]], label [[FOR_END]], [[LOOP14:!llvm.loop !.*]]
  ; CHECK:       for.end:
  ; CHECK-NEXT:    ret void
  ;
@@ -242,7 +242,7 @@ define void @variant_val_store_to_inv_address_conditional(i32* %a, i64 %n, i32*
  ; CHECK-NEXT:    call void @llvm.masked.scatter.v16i32.v16p0i32(<16 x i32> [[WIDE_MASKED_LOAD]], <16 x i32*> [[BROADCAST_SPLAT19]], i32 4, <16 x i1> [[TMP4]]), !alias.scope !22, !noalias !21
  ; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 16
  ; CHECK-NEXT:    [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !23
+; CHECK-NEXT:    br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP23:!llvm.loop !.*]]
  ; CHECK:       middle.block:
  ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[SMAX]], [[N_VEC]]
  ; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
@@ -264,7 +264,7 @@ define void @variant_val_store_to_inv_address_conditional(i32* %a, i64 %n, i32*
  ; CHECK:       latch:
  ; CHECK-NEXT:    [[I_NEXT]] = add nuw nsw i64 [[I]], 1
  ; CHECK-NEXT:    [[COND:%.*]] = icmp slt i64 [[I_NEXT]], [[N]]
-; CHECK-NEXT:    br i1 [[COND]], label [[FOR_BODY]], label [[FOR_END]], !llvm.loop !24
+; CHECK-NEXT:    br i1 [[COND]], label [[FOR_BODY]], label [[FOR_END]], [[LOOP24:!llvm.loop !.*]]
  ; CHECK:       for.end:
  ; CHECK-NEXT:    ret void
  ;
diff --git a/llvm/test/Transforms/LoopVectorize/X86/masked_load_store.ll b/llvm/test/Transforms/LoopVectorize/X86/masked_load_store.ll

index 8a4d46c0fa6bc5701ae4eaaa997a35f9bd99df8d..0eab7618bbe2048563e7aa9ec3a69c2cea1d9cd5 100644 (file)
--- a/llvm/test/Transforms/LoopVectorize/X86/masked_load_store.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/masked_load_store.ll
@@ -61,7 +61,7 @@ define void @foo1(i32* nocapture %A, i32* nocapture readonly %B, i32* nocapture
  ; AVX1-NEXT:    call void @llvm.masked.store.v8i32.p0v8i32(<8 x i32> [[TMP8]], <8 x i32>* [[TMP11]], i32 4, <8 x i1> [[TMP4]]), !alias.scope !5, !noalias !7
  ; AVX1-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 8
  ; AVX1-NEXT:    [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], 10000
-; AVX1-NEXT:    br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !8
+; AVX1-NEXT:    br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP8:!llvm.loop !.*]]
  ; AVX1:       middle.block:
  ; AVX1-NEXT:    [[CMP_N:%.*]] = icmp eq i64 10000, 10000
  ; AVX1-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
@@ -84,7 +84,7 @@ define void @foo1(i32* nocapture %A, i32* nocapture readonly %B, i32* nocapture
  ; AVX1:       for.inc:
  ; AVX1-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
  ; AVX1-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 10000
-; AVX1-NEXT:    br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop !10
+; AVX1-NEXT:    br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], [[LOOP10:!llvm.loop !.*]]
  ; AVX1:       for.end:
  ; AVX1-NEXT:    ret void
  ;
@@ -176,7 +176,7 @@ define void @foo1(i32* nocapture %A, i32* nocapture readonly %B, i32* nocapture
  ; AVX2-NEXT:    call void @llvm.masked.store.v8i32.p0v8i32(<8 x i32> [[TMP35]], <8 x i32>* [[TMP47]], i32 4, <8 x i1> [[TMP19]]), !alias.scope !5, !noalias !7
  ; AVX2-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 32
  ; AVX2-NEXT:    [[TMP48:%.*]] = icmp eq i64 [[INDEX_NEXT]], 9984
-; AVX2-NEXT:    br i1 [[TMP48]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !8
+; AVX2-NEXT:    br i1 [[TMP48]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP8:!llvm.loop !.*]]
  ; AVX2:       middle.block:
  ; AVX2-NEXT:    [[CMP_N:%.*]] = icmp eq i64 10000, 9984
  ; AVX2-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
@@ -199,7 +199,7 @@ define void @foo1(i32* nocapture %A, i32* nocapture readonly %B, i32* nocapture
  ; AVX2:       for.inc:
  ; AVX2-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
  ; AVX2-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 10000
-; AVX2-NEXT:    br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop !10
+; AVX2-NEXT:    br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], [[LOOP10:!llvm.loop !.*]]
  ; AVX2:       for.end:
  ; AVX2-NEXT:    ret void
  ;
@@ -291,7 +291,7 @@ define void @foo1(i32* nocapture %A, i32* nocapture readonly %B, i32* nocapture
  ; AVX512-NEXT:    call void @llvm.masked.store.v16i32.p0v16i32(<16 x i32> [[TMP35]], <16 x i32>* [[TMP47]], i32 4, <16 x i1> [[TMP19]]), !alias.scope !5, !noalias !7
  ; AVX512-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 64
  ; AVX512-NEXT:    [[TMP48:%.*]] = icmp eq i64 [[INDEX_NEXT]], 9984
-; AVX512-NEXT:    br i1 [[TMP48]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !8
+; AVX512-NEXT:    br i1 [[TMP48]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP8:!llvm.loop !.*]]
  ; AVX512:       middle.block:
  ; AVX512-NEXT:    [[CMP_N:%.*]] = icmp eq i64 10000, 9984
  ; AVX512-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
@@ -314,7 +314,7 @@ define void @foo1(i32* nocapture %A, i32* nocapture readonly %B, i32* nocapture
  ; AVX512:       for.inc:
  ; AVX512-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
  ; AVX512-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 10000
-; AVX512-NEXT:    br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop !10
+; AVX512-NEXT:    br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], [[LOOP10:!llvm.loop !.*]]
  ; AVX512:       for.end:
  ; AVX512-NEXT:    ret void
  ;
@@ -391,7 +391,7 @@ define void @foo1_addrspace1(i32 addrspace(1)* nocapture %A, i32 addrspace(1)* n
  ; AVX1-NEXT:    call void @llvm.masked.store.v8i32.p1v8i32(<8 x i32> [[TMP8]], <8 x i32> addrspace(1)* [[TMP11]], i32 4, <8 x i1> [[TMP4]]), !alias.scope !16, !noalias !18
  ; AVX1-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 8
  ; AVX1-NEXT:    [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], 10000
-; AVX1-NEXT:    br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !19
+; AVX1-NEXT:    br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP19:!llvm.loop !.*]]
  ; AVX1:       middle.block:
  ; AVX1-NEXT:    [[CMP_N:%.*]] = icmp eq i64 10000, 10000
  ; AVX1-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
@@ -414,7 +414,7 @@ define void @foo1_addrspace1(i32 addrspace(1)* nocapture %A, i32 addrspace(1)* n
  ; AVX1:       for.inc:
  ; AVX1-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
  ; AVX1-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 10000
-; AVX1-NEXT:    br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop !20
+; AVX1-NEXT:    br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], [[LOOP20:!llvm.loop !.*]]
  ; AVX1:       for.end:
  ; AVX1-NEXT:    ret void
  ;
@@ -506,7 +506,7 @@ define void @foo1_addrspace1(i32 addrspace(1)* nocapture %A, i32 addrspace(1)* n
  ; AVX2-NEXT:    call void @llvm.masked.store.v8i32.p1v8i32(<8 x i32> [[TMP35]], <8 x i32> addrspace(1)* [[TMP47]], i32 4, <8 x i1> [[TMP19]]), !alias.scope !16, !noalias !18
  ; AVX2-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 32
  ; AVX2-NEXT:    [[TMP48:%.*]] = icmp eq i64 [[INDEX_NEXT]], 9984
-; AVX2-NEXT:    br i1 [[TMP48]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !19
+; AVX2-NEXT:    br i1 [[TMP48]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP19:!llvm.loop !.*]]
  ; AVX2:       middle.block:
  ; AVX2-NEXT:    [[CMP_N:%.*]] = icmp eq i64 10000, 9984
  ; AVX2-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
@@ -529,7 +529,7 @@ define void @foo1_addrspace1(i32 addrspace(1)* nocapture %A, i32 addrspace(1)* n
  ; AVX2:       for.inc:
  ; AVX2-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
  ; AVX2-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 10000
-; AVX2-NEXT:    br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop !20
+; AVX2-NEXT:    br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], [[LOOP20:!llvm.loop !.*]]
  ; AVX2:       for.end:
  ; AVX2-NEXT:    ret void
  ;
@@ -621,7 +621,7 @@ define void @foo1_addrspace1(i32 addrspace(1)* nocapture %A, i32 addrspace(1)* n
  ; AVX512-NEXT:    call void @llvm.masked.store.v16i32.p1v16i32(<16 x i32> [[TMP35]], <16 x i32> addrspace(1)* [[TMP47]], i32 4, <16 x i1> [[TMP19]]), !alias.scope !16, !noalias !18
  ; AVX512-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 64
  ; AVX512-NEXT:    [[TMP48:%.*]] = icmp eq i64 [[INDEX_NEXT]], 9984
-; AVX512-NEXT:    br i1 [[TMP48]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !19
+; AVX512-NEXT:    br i1 [[TMP48]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP19:!llvm.loop !.*]]
  ; AVX512:       middle.block:
  ; AVX512-NEXT:    [[CMP_N:%.*]] = icmp eq i64 10000, 9984
  ; AVX512-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
@@ -644,7 +644,7 @@ define void @foo1_addrspace1(i32 addrspace(1)* nocapture %A, i32 addrspace(1)* n
  ; AVX512:       for.inc:
  ; AVX512-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
  ; AVX512-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 10000
-; AVX512-NEXT:    br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop !20
+; AVX512-NEXT:    br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], [[LOOP20:!llvm.loop !.*]]
  ; AVX512:       for.end:
  ; AVX512-NEXT:    ret void
  ;
@@ -731,7 +731,7 @@ define void @foo2(float* nocapture %A, float* nocapture readonly %B, i32* nocapt
  ; AVX1-NEXT:    call void @llvm.masked.store.v8f32.p0v8f32(<8 x float> [[TMP9]], <8 x float>* [[TMP12]], i32 4, <8 x i1> [[TMP4]]), !alias.scope !26, !noalias !28
  ; AVX1-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 8
  ; AVX1-NEXT:    [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], 10000
-; AVX1-NEXT:    br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !29
+; AVX1-NEXT:    br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP29:!llvm.loop !.*]]
  ; AVX1:       middle.block:
  ; AVX1-NEXT:    [[CMP_N:%.*]] = icmp eq i64 10000, 10000
  ; AVX1-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
@@ -755,7 +755,7 @@ define void @foo2(float* nocapture %A, float* nocapture readonly %B, i32* nocapt
  ; AVX1:       for.inc:
  ; AVX1-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
  ; AVX1-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 10000
-; AVX1-NEXT:    br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop !30
+; AVX1-NEXT:    br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], [[LOOP30:!llvm.loop !.*]]
  ; AVX1:       for.end:
  ; AVX1-NEXT:    ret void
  ;
@@ -851,7 +851,7 @@ define void @foo2(float* nocapture %A, float* nocapture readonly %B, i32* nocapt
  ; AVX2-NEXT:    call void @llvm.masked.store.v8f32.p0v8f32(<8 x float> [[TMP39]], <8 x float>* [[TMP51]], i32 4, <8 x i1> [[TMP19]]), !alias.scope !26, !noalias !28
  ; AVX2-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 32
  ; AVX2-NEXT:    [[TMP52:%.*]] = icmp eq i64 [[INDEX_NEXT]], 9984
-; AVX2-NEXT:    br i1 [[TMP52]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !29
+; AVX2-NEXT:    br i1 [[TMP52]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP29:!llvm.loop !.*]]
  ; AVX2:       middle.block:
  ; AVX2-NEXT:    [[CMP_N:%.*]] = icmp eq i64 10000, 9984
  ; AVX2-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
@@ -875,7 +875,7 @@ define void @foo2(float* nocapture %A, float* nocapture readonly %B, i32* nocapt
  ; AVX2:       for.inc:
  ; AVX2-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
  ; AVX2-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 10000
-; AVX2-NEXT:    br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop !30
+; AVX2-NEXT:    br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], [[LOOP30:!llvm.loop !.*]]
  ; AVX2:       for.end:
  ; AVX2-NEXT:    ret void
  ;
@@ -971,7 +971,7 @@ define void @foo2(float* nocapture %A, float* nocapture readonly %B, i32* nocapt
  ; AVX512-NEXT:    call void @llvm.masked.store.v16f32.p0v16f32(<16 x float> [[TMP39]], <16 x float>* [[TMP51]], i32 4, <16 x i1> [[TMP19]]), !alias.scope !26, !noalias !28
  ; AVX512-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 64
  ; AVX512-NEXT:    [[TMP52:%.*]] = icmp eq i64 [[INDEX_NEXT]], 9984
-; AVX512-NEXT:    br i1 [[TMP52]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !29
+; AVX512-NEXT:    br i1 [[TMP52]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP29:!llvm.loop !.*]]
  ; AVX512:       middle.block:
  ; AVX512-NEXT:    [[CMP_N:%.*]] = icmp eq i64 10000, 9984
  ; AVX512-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
@@ -995,7 +995,7 @@ define void @foo2(float* nocapture %A, float* nocapture readonly %B, i32* nocapt
  ; AVX512:       for.inc:
  ; AVX512-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
  ; AVX512-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 10000
-; AVX512-NEXT:    br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop !30
+; AVX512-NEXT:    br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], [[LOOP30:!llvm.loop !.*]]
  ; AVX512:       for.end:
  ; AVX512-NEXT:    ret void
  ;
@@ -1131,7 +1131,7 @@ define void @foo3(double* nocapture %A, double* nocapture readonly %B, i32* noca
  ; AVX-NEXT:    call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> [[TMP39]], <4 x double>* [[TMP51]], i32 8, <4 x i1> [[TMP19]]), !alias.scope !36, !noalias !38
  ; AVX-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 16
  ; AVX-NEXT:    [[TMP52:%.*]] = icmp eq i64 [[INDEX_NEXT]], 10000
-; AVX-NEXT:    br i1 [[TMP52]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !39
+; AVX-NEXT:    br i1 [[TMP52]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP39:!llvm.loop !.*]]
  ; AVX:       middle.block:
  ; AVX-NEXT:    [[CMP_N:%.*]] = icmp eq i64 10000, 10000
  ; AVX-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
@@ -1155,7 +1155,7 @@ define void @foo3(double* nocapture %A, double* nocapture readonly %B, i32* noca
  ; AVX:       for.inc:
  ; AVX-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
  ; AVX-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 10000
-; AVX-NEXT:    br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop !40
+; AVX-NEXT:    br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], [[LOOP40:!llvm.loop !.*]]
  ; AVX:       for.end:
  ; AVX-NEXT:    ret void
  ;
@@ -1251,7 +1251,7 @@ define void @foo3(double* nocapture %A, double* nocapture readonly %B, i32* noca
  ; AVX512-NEXT:    call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> [[TMP39]], <8 x double>* [[TMP51]], i32 8, <8 x i1> [[TMP19]]), !alias.scope !36, !noalias !38
  ; AVX512-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 32
  ; AVX512-NEXT:    [[TMP52:%.*]] = icmp eq i64 [[INDEX_NEXT]], 9984
-; AVX512-NEXT:    br i1 [[TMP52]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !39
+; AVX512-NEXT:    br i1 [[TMP52]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP39:!llvm.loop !.*]]
  ; AVX512:       middle.block:
  ; AVX512-NEXT:    [[CMP_N:%.*]] = icmp eq i64 10000, 9984
  ; AVX512-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
@@ -1275,7 +1275,7 @@ define void @foo3(double* nocapture %A, double* nocapture readonly %B, i32* noca
  ; AVX512:       for.inc:
  ; AVX512-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
  ; AVX512-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 10000
-; AVX512-NEXT:    br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop !40
+; AVX512-NEXT:    br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], [[LOOP40:!llvm.loop !.*]]
  ; AVX512:       for.end:
  ; AVX512-NEXT:    ret void
  ;
@@ -1384,7 +1384,7 @@ define void @foo4(double* nocapture %A, double* nocapture readonly %B, i32* noca
  ; AVX512-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 8
  ; AVX512-NEXT:    [[VEC_IND_NEXT]] = add <8 x i64> [[VEC_IND]], <i64 128, i64 128, i64 128, i64 128, i64 128, i64 128, i64 128, i64 128>
  ; AVX512-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], 624
-; AVX512-NEXT:    br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !49
+; AVX512-NEXT:    br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP49:!llvm.loop !.*]]
  ; AVX512:       middle.block:
  ; AVX512-NEXT:    [[CMP_N:%.*]] = icmp eq i64 625, 624
  ; AVX512-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
@@ -1409,7 +1409,7 @@ define void @foo4(double* nocapture %A, double* nocapture readonly %B, i32* noca
  ; AVX512:       for.inc:
  ; AVX512-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 16
  ; AVX512-NEXT:    [[CMP:%.*]] = icmp ult i64 [[INDVARS_IV_NEXT]], 10000
-; AVX512-NEXT:    br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_END]], !llvm.loop !50
+; AVX512-NEXT:    br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_END]], [[LOOP50:!llvm.loop !.*]]
  ; AVX512:       for.end:
  ; AVX512-NEXT:    ret void
  ;
@@ -1664,7 +1664,7 @@ define void @foo6(double* nocapture readonly %in, double* nocapture %out, i32 %s
  ; AVX2-NEXT:    call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> [[REVERSE35]], <4 x double>* [[TMP59]], i32 8, <4 x i1> [[REVERSE26]]), !alias.scope !46, !noalias !48
  ; AVX2-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 16
  ; AVX2-NEXT:    [[TMP60:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4096
-; AVX2-NEXT:    br i1 [[TMP60]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !49
+; AVX2-NEXT:    br i1 [[TMP60]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP49:!llvm.loop !.*]]
  ; AVX2:       middle.block:
  ; AVX2-NEXT:    [[CMP_N:%.*]] = icmp eq i64 4096, 4096
  ; AVX2-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
@@ -1687,7 +1687,7 @@ define void @foo6(double* nocapture readonly %in, double* nocapture %out, i32 %s
  ; AVX2:       for.inc:
  ; AVX2-NEXT:    [[INDVARS_IV_NEXT]] = add nsw i64 [[INDVARS_IV]], -1
  ; AVX2-NEXT:    [[CMP:%.*]] = icmp eq i64 [[INDVARS_IV]], 0
-; AVX2-NEXT:    br i1 [[CMP]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop !50
+; AVX2-NEXT:    br i1 [[CMP]], label [[FOR_END]], label [[FOR_BODY]], [[LOOP50:!llvm.loop !.*]]
  ; AVX2:       for.end:
  ; AVX2-NEXT:    ret void
  ;
@@ -1808,7 +1808,7 @@ define void @foo6(double* nocapture readonly %in, double* nocapture %out, i32 %s
  ; AVX512-NEXT:    call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> [[REVERSE35]], <8 x double>* [[TMP59]], i32 8, <8 x i1> [[REVERSE26]]), !alias.scope !56, !noalias !58
  ; AVX512-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 32
  ; AVX512-NEXT:    [[TMP60:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4096
-; AVX512-NEXT:    br i1 [[TMP60]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !59
+; AVX512-NEXT:    br i1 [[TMP60]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP59:!llvm.loop !.*]]
  ; AVX512:       middle.block:
  ; AVX512-NEXT:    [[CMP_N:%.*]] = icmp eq i64 4096, 4096
  ; AVX512-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
@@ -1831,7 +1831,7 @@ define void @foo6(double* nocapture readonly %in, double* nocapture %out, i32 %s
  ; AVX512:       for.inc:
  ; AVX512-NEXT:    [[INDVARS_IV_NEXT]] = add nsw i64 [[INDVARS_IV]], -1
  ; AVX512-NEXT:    [[CMP:%.*]] = icmp eq i64 [[INDVARS_IV]], 0
-; AVX512-NEXT:    br i1 [[CMP]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop !60
+; AVX512-NEXT:    br i1 [[CMP]], label [[FOR_END]], label [[FOR_BODY]], [[LOOP60:!llvm.loop !.*]]
  ; AVX512:       for.end:
  ; AVX512-NEXT:    ret void
  ;
@@ -1963,7 +1963,7 @@ define void @foo7(double* noalias nocapture %out, double** noalias nocapture rea
  ; AVX1-NEXT:    call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>, <4 x double>* [[TMP63]], i32 8, <4 x i1> [[TMP55]])
  ; AVX1-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 16
  ; AVX1-NEXT:    [[TMP64:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; AVX1-NEXT:    br i1 [[TMP64]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !41
+; AVX1-NEXT:    br i1 [[TMP64]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP41:!llvm.loop !.*]]
  ; AVX1:       middle.block:
  ; AVX1-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
  ; AVX1-NEXT:    br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT:%.*]], label [[SCALAR_PH]]
@@ -1989,7 +1989,7 @@ define void @foo7(double* noalias nocapture %out, double** noalias nocapture rea
  ; AVX1:       for.inc:
  ; AVX1-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
  ; AVX1-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
-; AVX1-NEXT:    br i1 [[EXITCOND]], label [[FOR_END_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop !42
+; AVX1-NEXT:    br i1 [[EXITCOND]], label [[FOR_END_LOOPEXIT]], label [[FOR_BODY]], [[LOOP42:!llvm.loop !.*]]
  ; AVX1:       for.end.loopexit:
  ; AVX1-NEXT:    br label [[FOR_END]]
  ; AVX1:       for.end:
@@ -2087,7 +2087,7 @@ define void @foo7(double* noalias nocapture %out, double** noalias nocapture rea
  ; AVX2-NEXT:    call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>, <4 x double>* [[TMP63]], i32 8, <4 x i1> [[TMP55]])
  ; AVX2-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 16
  ; AVX2-NEXT:    [[TMP64:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; AVX2-NEXT:    br i1 [[TMP64]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !51
+; AVX2-NEXT:    br i1 [[TMP64]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP51:!llvm.loop !.*]]
  ; AVX2:       middle.block:
  ; AVX2-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
  ; AVX2-NEXT:    br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT:%.*]], label [[SCALAR_PH]]
@@ -2113,7 +2113,7 @@ define void @foo7(double* noalias nocapture %out, double** noalias nocapture rea
  ; AVX2:       for.inc:
  ; AVX2-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
  ; AVX2-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
-; AVX2-NEXT:    br i1 [[EXITCOND]], label [[FOR_END_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop !52
+; AVX2-NEXT:    br i1 [[EXITCOND]], label [[FOR_END_LOOPEXIT]], label [[FOR_BODY]], [[LOOP52:!llvm.loop !.*]]
  ; AVX2:       for.end.loopexit:
  ; AVX2-NEXT:    br label [[FOR_END]]
  ; AVX2:       for.end:
@@ -2211,7 +2211,7 @@ define void @foo7(double* noalias nocapture %out, double** noalias nocapture rea
  ; AVX512-NEXT:    call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>, <8 x double>* [[TMP63]], i32 8, <8 x i1> [[TMP55]])
  ; AVX512-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 32
  ; AVX512-NEXT:    [[TMP64:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; AVX512-NEXT:    br i1 [[TMP64]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !61
+; AVX512-NEXT:    br i1 [[TMP64]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP61:!llvm.loop !.*]]
  ; AVX512:       middle.block:
  ; AVX512-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
  ; AVX512-NEXT:    br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT:%.*]], label [[SCALAR_PH]]
@@ -2237,7 +2237,7 @@ define void @foo7(double* noalias nocapture %out, double** noalias nocapture rea
  ; AVX512:       for.inc:
  ; AVX512-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
  ; AVX512-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
-; AVX512-NEXT:    br i1 [[EXITCOND]], label [[FOR_END_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop !62
+; AVX512-NEXT:    br i1 [[EXITCOND]], label [[FOR_END_LOOPEXIT]], label [[FOR_BODY]], [[LOOP62:!llvm.loop !.*]]
  ; AVX512:       for.end.loopexit:
  ; AVX512-NEXT:    br label [[FOR_END]]
  ; AVX512:       for.end:
@@ -2380,7 +2380,7 @@ define void @foo8(double* noalias nocapture %out, i32 ()** noalias nocapture rea
  ; AVX1-NEXT:    call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>, <4 x double>* [[TMP63]], i32 8, <4 x i1> [[TMP55]])
  ; AVX1-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 16
  ; AVX1-NEXT:    [[TMP64:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; AVX1-NEXT:    br i1 [[TMP64]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !44
+; AVX1-NEXT:    br i1 [[TMP64]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP44:!llvm.loop !.*]]
  ; AVX1:       middle.block:
  ; AVX1-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
  ; AVX1-NEXT:    br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT:%.*]], label [[SCALAR_PH]]
@@ -2406,7 +2406,7 @@ define void @foo8(double* noalias nocapture %out, i32 ()** noalias nocapture rea
  ; AVX1:       for.inc:
  ; AVX1-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
  ; AVX1-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
-; AVX1-NEXT:    br i1 [[EXITCOND]], label [[FOR_END_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop !45
+; AVX1-NEXT:    br i1 [[EXITCOND]], label [[FOR_END_LOOPEXIT]], label [[FOR_BODY]], [[LOOP45:!llvm.loop !.*]]
  ; AVX1:       for.end.loopexit:
  ; AVX1-NEXT:    br label [[FOR_END]]
  ; AVX1:       for.end:
@@ -2504,7 +2504,7 @@ define void @foo8(double* noalias nocapture %out, i32 ()** noalias nocapture rea
  ; AVX2-NEXT:    call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>, <4 x double>* [[TMP63]], i32 8, <4 x i1> [[TMP55]])
  ; AVX2-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 16
  ; AVX2-NEXT:    [[TMP64:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; AVX2-NEXT:    br i1 [[TMP64]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !54
+; AVX2-NEXT:    br i1 [[TMP64]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP54:!llvm.loop !.*]]
  ; AVX2:       middle.block:
  ; AVX2-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
  ; AVX2-NEXT:    br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT:%.*]], label [[SCALAR_PH]]
@@ -2530,7 +2530,7 @@ define void @foo8(double* noalias nocapture %out, i32 ()** noalias nocapture rea
  ; AVX2:       for.inc:
  ; AVX2-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
  ; AVX2-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
-; AVX2-NEXT:    br i1 [[EXITCOND]], label [[FOR_END_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop !55
+; AVX2-NEXT:    br i1 [[EXITCOND]], label [[FOR_END_LOOPEXIT]], label [[FOR_BODY]], [[LOOP55:!llvm.loop !.*]]
  ; AVX2:       for.end.loopexit:
  ; AVX2-NEXT:    br label [[FOR_END]]
  ; AVX2:       for.end:
@@ -2628,7 +2628,7 @@ define void @foo8(double* noalias nocapture %out, i32 ()** noalias nocapture rea
  ; AVX512-NEXT:    call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>, <8 x double>* [[TMP63]], i32 8, <8 x i1> [[TMP55]])
  ; AVX512-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 32
  ; AVX512-NEXT:    [[TMP64:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; AVX512-NEXT:    br i1 [[TMP64]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !64
+; AVX512-NEXT:    br i1 [[TMP64]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP64:!llvm.loop !.*]]
  ; AVX512:       middle.block:
  ; AVX512-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
  ; AVX512-NEXT:    br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT:%.*]], label [[SCALAR_PH]]
@@ -2654,7 +2654,7 @@ define void @foo8(double* noalias nocapture %out, i32 ()** noalias nocapture rea
  ; AVX512:       for.inc:
  ; AVX512-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
  ; AVX512-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
-; AVX512-NEXT:    br i1 [[EXITCOND]], label [[FOR_END_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop !65
+; AVX512-NEXT:    br i1 [[EXITCOND]], label [[FOR_END_LOOPEXIT]], label [[FOR_BODY]], [[LOOP65:!llvm.loop !.*]]
  ; AVX512:       for.end.loopexit:
  ; AVX512-NEXT:    br label [[FOR_END]]
  ; AVX512:       for.end:
diff --git a/llvm/test/Transforms/LoopVectorize/optimal-epilog-vectorization-limitations.ll b/llvm/test/Transforms/LoopVectorize/optimal-epilog-vectorization-limitations.ll

new file mode 100644 (file)

index 0000000..e4d62a6
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/optimal-epilog-vectorization-limitations.ll
@@ -0,0 +1,100 @@
+; RUN: opt < %s  -passes='loop-vectorize' -force-vector-width=2 -enable-epilogue-vectorization -epilogue-vectorization-force-VF=2 --debug-only=loop-vectorize -S 2>&1 | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-n32:64-v256:256:256-v512:512:512"
+
+; Currently we cannot handle reduction loops.
+; CHECK: LV: Checking a loop in "f1"
+; CHECK: LEV: Unable to vectorize epilogue because the loop is not a supported candidate.
+
+define signext i32 @f1(i8* noalias %A, i32 signext %n) {
+entry:
+  %cmp1 = icmp sgt i32 %n, 0
+  br i1 %cmp1, label %for.body.preheader, label %for.end
+
+for.body.preheader:                               ; preds = %entry
+  %wide.trip.count = zext i32 %n to i64
+  br label %for.body
+
+for.body:                                         ; preds = %for.body.preheader, %for.body
+  %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
+  %sum.02 = phi i32 [ %add, %for.body ], [ 0, %for.body.preheader ]
+  %arrayidx = getelementptr inbounds i8, i8* %A, i64 %indvars.iv
+  %0 = load i8, i8* %arrayidx, align 1
+  %conv = zext i8 %0 to i32
+  %add = add nuw nsw i32 %sum.02, %conv
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond = icmp ne i64 %indvars.iv.next, %wide.trip.count
+  br i1 %exitcond, label %for.body, label %for.end.loopexit
+
+for.end.loopexit:                                 ; preds = %for.body
+  %add.lcssa = phi i32 [ %add, %for.body ]
+  br label %for.end
+
+for.end:                                          ; preds = %for.end.loopexit, %entry
+  %sum.0.lcssa = phi i32 [ 0, %entry ], [ %add.lcssa, %for.end.loopexit ]
+  ret i32 %sum.0.lcssa
+}
+
+; Currently we cannot handle live-out variables that are recurrences.
+; CHECK: LV: Checking a loop in "f2"
+; CHECK: LEV: Unable to vectorize epilogue because the loop is not a supported candidate.
+
+define signext i32 @f2(i8* noalias %A, i32 signext %n) {
+entry:
+  %cmp1 = icmp sgt i32 %n, 0
+  br i1 %cmp1, label %for.body.preheader, label %for.end
+
+for.body.preheader:                               ; preds = %entry
+  %wide.trip.count = zext i32 %n to i64
+  br label %for.body
+
+for.body:                                         ; preds = %for.body.preheader, %for.body
+  %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
+  %arrayidx = getelementptr inbounds i8, i8* %A, i64 %indvars.iv
+  %0 = load i8, i8* %arrayidx, align 1
+  %add = add i8 %0, 1
+  %arrayidx3 = getelementptr inbounds i8, i8* %A, i64 %indvars.iv
+  store i8 %add, i8* %arrayidx3, align 1
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond = icmp ne i64 %indvars.iv.next, %wide.trip.count
+  br i1 %exitcond, label %for.body, label %for.end.loopexit
+
+for.end.loopexit:                                 ; preds = %for.body
+  %inc.lcssa.wide = phi i64 [ %indvars.iv.next, %for.body ]
+  %1 = trunc i64 %inc.lcssa.wide to i32
+  br label %for.end
+
+for.end:                                          ; preds = %for.end.loopexit, %entry
+  %i.0.lcssa = phi i32 [ 0, %entry ], [ %1, %for.end.loopexit ]
+  ret i32 %i.0.lcssa
+}
+
+; Currently we cannot handle widended/truncated inductions.
+; CHECK: LV: Checking a loop in "f3"
+; CHECK: LEV: Unable to vectorize epilogue because the loop is not a supported candidate.
+
+define void @f3(i8* noalias %A, i32 signext %n) {
+entry:
+  %cmp1 = icmp sgt i32 %n, 0
+  br i1 %cmp1, label %for.body.preheader, label %for.end
+
+for.body.preheader:                               ; preds = %entry
+  %wide.trip.count = zext i32 %n to i64
+  br label %for.body
+
+for.body:                                         ; preds = %for.body.preheader, %for.body
+  %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
+  %0 = trunc i64 %indvars.iv to i32
+  %conv = trunc i32 %0 to i8
+  %arrayidx = getelementptr inbounds i8, i8* %A, i64 %indvars.iv
+  store i8 %conv, i8* %arrayidx, align 1
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond = icmp ne i64 %indvars.iv.next, %wide.trip.count
+  br i1 %exitcond, label %for.body, label %for.end.loopexit
+
+for.end.loopexit:                                 ; preds = %for.body
+  br label %for.end
+
+for.end:                                          ; preds = %for.end.loopexit, %entry
+  ret void
+}
diff --git a/llvm/test/Transforms/LoopVectorize/optimal-epilog-vectorization-liveout.ll b/llvm/test/Transforms/LoopVectorize/optimal-epilog-vectorization-liveout.ll

new file mode 100644 (file)

index 0000000..243d3e9
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/optimal-epilog-vectorization-liveout.ll
@@ -0,0 +1,125 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+
+; To test epilogue-vectorization we need to make sure that the vectorizer actually vectorizes the loop.
+; Without a target triple this becomes difficult, unless we force vectorization through user hints.
+; Currently user provided vectorization hints prevent epilogue vectorization unless the forced
+; VF is the same as the epilogue vectorization VF. To make these tests target independent we'll use a
+; trick where both VFs are forced to be the same value.
+; RUN: opt < %s -passes='loop-vectorize' -enable-epilogue-vectorization -force-vector-width=2 -epilogue-vectorization-force-VF=2 -S | FileCheck %s --check-prefix VF-TWO-CHECK
+
+target datalayout = "e-m:e-i64:64-n32:64"
+
+; Some limited forms of live-outs (non-reduction, non-recurrences) are supported.
+define signext i32 @f1(i32* noalias %A, i32* noalias %B, i32 signext %n) {
+; VF-TWO-CHECK-LABEL: @f1(
+; VF-TWO-CHECK-NEXT:  entry:
+; VF-TWO-CHECK-NEXT:    [[CMP1:%.*]] = icmp sgt i32 [[N:%.*]], 0
+; VF-TWO-CHECK-NEXT:    br i1 [[CMP1]], label [[ITER_CHECK:%.*]], label [[FOR_END:%.*]]
+; VF-TWO-CHECK:       iter.check:
+; VF-TWO-CHECK-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
+; VF-TWO-CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], 2
+; VF-TWO-CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH:%.*]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]]
+; VF-TWO-CHECK:       vector.main.loop.iter.check:
+; VF-TWO-CHECK-NEXT:    [[MIN_ITERS_CHECK1:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], 2
+; VF-TWO-CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK1]], label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH:%.*]]
+; VF-TWO-CHECK:       vector.ph:
+; VF-TWO-CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], 2
+; VF-TWO-CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
+; VF-TWO-CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; VF-TWO-CHECK:       vector.body:
+; VF-TWO-CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; VF-TWO-CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
+; VF-TWO-CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[TMP0]]
+; VF-TWO-CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i32 0
+; VF-TWO-CHECK-NEXT:    [[TMP3:%.*]] = bitcast i32* [[TMP2]] to <2 x i32>*
+; VF-TWO-CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <2 x i32>, <2 x i32>* [[TMP3]], align 4
+; VF-TWO-CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 [[TMP0]]
+; VF-TWO-CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[TMP4]], i32 0
+; VF-TWO-CHECK-NEXT:    [[TMP6:%.*]] = bitcast i32* [[TMP5]] to <2 x i32>*
+; VF-TWO-CHECK-NEXT:    [[WIDE_LOAD2:%.*]] = load <2 x i32>, <2 x i32>* [[TMP6]], align 4
+; VF-TWO-CHECK-NEXT:    [[TMP7:%.*]] = add nsw <2 x i32> [[WIDE_LOAD]], [[WIDE_LOAD2]]
+; VF-TWO-CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 2
+; VF-TWO-CHECK-NEXT:    [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; VF-TWO-CHECK-NEXT:    br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP0:!llvm.loop !.*]]
+; VF-TWO-CHECK:       middle.block:
+; VF-TWO-CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
+; VF-TWO-CHECK-NEXT:    [[TMP9:%.*]] = extractelement <2 x i32> [[TMP7]], i32 1
+; VF-TWO-CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]]
+; VF-TWO-CHECK:       vec.epilog.iter.check:
+; VF-TWO-CHECK-NEXT:    [[N_VEC_REMAINING:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
+; VF-TWO-CHECK-NEXT:    [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], 2
+; VF-TWO-CHECK-NEXT:    br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]]
+; VF-TWO-CHECK:       vec.epilog.ph:
+; VF-TWO-CHECK-NEXT:    [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
+; VF-TWO-CHECK-NEXT:    [[N_MOD_VF4:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], 2
+; VF-TWO-CHECK-NEXT:    [[N_VEC5:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF4]]
+; VF-TWO-CHECK-NEXT:    br label [[VEC_EPILOG_VECTOR_BODY:%.*]]
+; VF-TWO-CHECK:       vec.epilog.vector.body:
+; VF-TWO-CHECK-NEXT:    [[INDEX6:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT7:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
+; VF-TWO-CHECK-NEXT:    [[TMP10:%.*]] = add i64 [[INDEX6]], 0
+; VF-TWO-CHECK-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP10]]
+; VF-TWO-CHECK-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i32, i32* [[TMP11]], i32 0
+; VF-TWO-CHECK-NEXT:    [[TMP13:%.*]] = bitcast i32* [[TMP12]] to <2 x i32>*
+; VF-TWO-CHECK-NEXT:    [[WIDE_LOAD9:%.*]] = load <2 x i32>, <2 x i32>* [[TMP13]], align 4
+; VF-TWO-CHECK-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[TMP10]]
+; VF-TWO-CHECK-NEXT:    [[TMP15:%.*]] = getelementptr inbounds i32, i32* [[TMP14]], i32 0
+; VF-TWO-CHECK-NEXT:    [[TMP16:%.*]] = bitcast i32* [[TMP15]] to <2 x i32>*
+; VF-TWO-CHECK-NEXT:    [[WIDE_LOAD10:%.*]] = load <2 x i32>, <2 x i32>* [[TMP16]], align 4
+; VF-TWO-CHECK-NEXT:    [[TMP17:%.*]] = add nsw <2 x i32> [[WIDE_LOAD9]], [[WIDE_LOAD10]]
+; VF-TWO-CHECK-NEXT:    [[INDEX_NEXT7]] = add i64 [[INDEX6]], 2
+; VF-TWO-CHECK-NEXT:    [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT7]], [[N_VEC5]]
+; VF-TWO-CHECK-NEXT:    br i1 [[TMP18]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], [[LOOP2:!llvm.loop !.*]]
+; VF-TWO-CHECK:       vec.epilog.middle.block:
+; VF-TWO-CHECK-NEXT:    [[CMP_N8:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC5]]
+; VF-TWO-CHECK-NEXT:    [[TMP19:%.*]] = extractelement <2 x i32> [[TMP17]], i32 1
+; VF-TWO-CHECK-NEXT:    br i1 [[CMP_N8]], label [[FOR_END_LOOPEXIT_LOOPEXIT:%.*]], label [[VEC_EPILOG_SCALAR_PH]]
+; VF-TWO-CHECK:       vec.epilog.scalar.ph:
+; VF-TWO-CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC5]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[ITER_CHECK]] ]
+; VF-TWO-CHECK-NEXT:    br label [[FOR_BODY:%.*]]
+; VF-TWO-CHECK:       for.body:
+; VF-TWO-CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; VF-TWO-CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[INDVARS_IV]]
+; VF-TWO-CHECK-NEXT:    [[TMP20:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
+; VF-TWO-CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[INDVARS_IV]]
+; VF-TWO-CHECK-NEXT:    [[TMP21:%.*]] = load i32, i32* [[ARRAYIDX2]], align 4
+; VF-TWO-CHECK-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP20]], [[TMP21]]
+; VF-TWO-CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; VF-TWO-CHECK-NEXT:    [[EXITCOND:%.*]] = icmp ne i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
+; VF-TWO-CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_BODY]], label [[FOR_END_LOOPEXIT_LOOPEXIT]], [[LOOP4:!llvm.loop !.*]]
+; VF-TWO-CHECK:       for.end.loopexit.loopexit:
+; VF-TWO-CHECK-NEXT:    [[ADD_LCSSA3:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP19]], [[VEC_EPILOG_MIDDLE_BLOCK]] ]
+; VF-TWO-CHECK-NEXT:    br label [[FOR_END_LOOPEXIT]]
+; VF-TWO-CHECK:       for.end.loopexit:
+; VF-TWO-CHECK-NEXT:    [[ADD_LCSSA:%.*]] = phi i32 [ [[TMP9]], [[MIDDLE_BLOCK]] ], [ [[ADD_LCSSA3]], [[FOR_END_LOOPEXIT_LOOPEXIT]] ]
+; VF-TWO-CHECK-NEXT:    br label [[FOR_END]]
+; VF-TWO-CHECK:       for.end:
+; VF-TWO-CHECK-NEXT:    [[RES_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[ADD_LCSSA]], [[FOR_END_LOOPEXIT]] ]
+; VF-TWO-CHECK-NEXT:    ret i32 [[RES_0_LCSSA]]
+;
+entry:
+  %cmp1 = icmp sgt i32 %n, 0
+  br i1 %cmp1, label %for.body.preheader, label %for.end
+
+for.body.preheader:                               ; preds = %entry
+  %wide.trip.count = zext i32 %n to i64
+  br label %for.body
+
+for.body:                                         ; preds = %for.body.preheader, %for.body
+  %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
+  %arrayidx = getelementptr inbounds i32, i32* %A, i64 %indvars.iv
+  %0 = load i32, i32* %arrayidx, align 4
+  %arrayidx2 = getelementptr inbounds i32, i32* %B, i64 %indvars.iv
+  %1 = load i32, i32* %arrayidx2, align 4
+  %add = add nsw i32 %0, %1
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond = icmp ne i64 %indvars.iv.next, %wide.trip.count
+  br i1 %exitcond, label %for.body, label %for.end.loopexit
+
+for.end.loopexit:                                 ; preds = %for.body
+  %add.lcssa = phi i32 [ %add, %for.body ]
+  br label %for.end
+
+for.end:                                          ; preds = %for.end.loopexit, %entry
+  %res.0.lcssa = phi i32 [ 0, %entry ], [ %add.lcssa, %for.end.loopexit ]
+  ret i32 %res.0.lcssa
+}
diff --git a/llvm/test/Transforms/LoopVectorize/optimal-epilog-vectorization.ll b/llvm/test/Transforms/LoopVectorize/optimal-epilog-vectorization.ll

new file mode 100644 (file)

index 0000000..8f5ffb1
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/optimal-epilog-vectorization.ll
@@ -0,0 +1,402 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+
+; To test epilogue-vectorization we need to make sure that the vectorizer actually vectorizes the loop.
+; Without a target triple this becomes difficult, unless we force vectorization through user hints.
+; Currently user provided vectorization hints prevent epilogue vectorization unless the forced
+; VF is the same as the epilogue vectorization VF. To make these tests target independent we'll use a
+; trick where both VFs are forced to be the same value. Mismatching VFs are tested in target specific tests.
+; RUN: opt -passes='loop-vectorize' -force-vector-width=4 -enable-epilogue-vectorization -epilogue-vectorization-force-VF=4 -S %s | FileCheck %s
+
+; Some simpler cases are found profitable even without triple or user hints.
+; RUN: opt -passes='loop-vectorize' -enable-epilogue-vectorization -epilogue-vectorization-force-VF=2 -S %s | FileCheck --check-prefix=CHECK-PROFITABLE-BY-DEFAULT %s
+
+target datalayout = "e-m:e-i64:64-n32:64-v128:128:128"
+
+define dso_local void @f1(float* noalias %aa, float* noalias %bb, float* noalias %cc, i32 signext %N) {
+; CHECK-LABEL: @f1(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[AA1:%.*]] = bitcast float* [[AA:%.*]] to i8*
+; CHECK-NEXT:    [[BB3:%.*]] = bitcast float* [[BB:%.*]] to i8*
+; CHECK-NEXT:    [[CC6:%.*]] = bitcast float* [[CC:%.*]] to i8*
+; CHECK-NEXT:    [[CMP1:%.*]] = icmp sgt i32 [[N:%.*]], 0
+; CHECK-NEXT:    br i1 [[CMP1]], label [[ITER_CHECK:%.*]], label [[FOR_END:%.*]]
+; CHECK:       iter.check:
+; CHECK-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
+; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], 4
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]]
+; CHECK:       vector.memcheck:
+; CHECK-NEXT:    [[SCEVGEP:%.*]] = getelementptr float, float* [[AA]], i64 [[WIDE_TRIP_COUNT]]
+; CHECK-NEXT:    [[SCEVGEP2:%.*]] = bitcast float* [[SCEVGEP]] to i8*
+; CHECK-NEXT:    [[SCEVGEP4:%.*]] = getelementptr float, float* [[BB]], i64 [[WIDE_TRIP_COUNT]]
+; CHECK-NEXT:    [[SCEVGEP45:%.*]] = bitcast float* [[SCEVGEP4]] to i8*
+; CHECK-NEXT:    [[SCEVGEP7:%.*]] = getelementptr float, float* [[CC]], i64 [[WIDE_TRIP_COUNT]]
+; CHECK-NEXT:    [[SCEVGEP78:%.*]] = bitcast float* [[SCEVGEP7]] to i8*
+; CHECK-NEXT:    [[BOUND0:%.*]] = icmp ult i8* [[AA1]], [[SCEVGEP45]]
+; CHECK-NEXT:    [[BOUND1:%.*]] = icmp ult i8* [[BB3]], [[SCEVGEP2]]
+; CHECK-NEXT:    [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
+; CHECK-NEXT:    [[BOUND09:%.*]] = icmp ult i8* [[AA1]], [[SCEVGEP78]]
+; CHECK-NEXT:    [[BOUND110:%.*]] = icmp ult i8* [[CC6]], [[SCEVGEP2]]
+; CHECK-NEXT:    [[FOUND_CONFLICT11:%.*]] = and i1 [[BOUND09]], [[BOUND110]]
+; CHECK-NEXT:    [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT11]]
+; CHECK-NEXT:    [[MEMCHECK_CONFLICT:%.*]] = and i1 [[CONFLICT_RDX]], true
+; CHECK-NEXT:    br i1 [[MEMCHECK_CONFLICT]], label [[VEC_EPILOG_SCALAR_PH]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]]
+; CHECK:       vector.main.loop.iter.check:
+; CHECK-NEXT:    [[MIN_ITERS_CHECK12:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], 4
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK12]], label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK:       vector.ph:
+; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], 4
+; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds float, float* [[BB]], i64 [[TMP0]]
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds float, float* [[TMP1]], i32 0
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast float* [[TMP2]] to <4 x float>*
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x float>, <4 x float>* [[TMP3]], align 4, !alias.scope !0
+; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds float, float* [[CC]], i64 [[TMP0]]
+; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds float, float* [[TMP4]], i32 0
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast float* [[TMP5]] to <4 x float>*
+; CHECK-NEXT:    [[WIDE_LOAD13:%.*]] = load <4 x float>, <4 x float>* [[TMP6]], align 4, !alias.scope !3
+; CHECK-NEXT:    [[TMP7:%.*]] = fadd fast <4 x float> [[WIDE_LOAD]], [[WIDE_LOAD13]]
+; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds float, float* [[AA]], i64 [[TMP0]]
+; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds float, float* [[TMP8]], i32 0
+; CHECK-NEXT:    [[TMP10:%.*]] = bitcast float* [[TMP9]] to <4 x float>*
+; CHECK-NEXT:    store <4 x float> [[TMP7]], <4 x float>* [[TMP10]], align 4, !alias.scope !5, !noalias !7
+; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 4
+; CHECK-NEXT:    [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP8:!llvm.loop !.*]]
+; CHECK:       middle.block:
+; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]]
+; CHECK:       vec.epilog.iter.check:
+; CHECK-NEXT:    [[N_VEC_REMAINING:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
+; CHECK-NEXT:    [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], 4
+; CHECK-NEXT:    br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]]
+; CHECK:       vec.epilog.ph:
+; CHECK-NEXT:    [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
+; CHECK-NEXT:    [[N_MOD_VF14:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], 4
+; CHECK-NEXT:    [[N_VEC15:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF14]]
+; CHECK-NEXT:    br label [[VEC_EPILOG_VECTOR_BODY:%.*]]
+; CHECK:       vec.epilog.vector.body:
+; CHECK-NEXT:    [[INDEX16:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT17:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP12:%.*]] = add i64 [[INDEX16]], 0
+; CHECK-NEXT:    [[TMP13:%.*]] = getelementptr inbounds float, float* [[BB]], i64 [[TMP12]]
+; CHECK-NEXT:    [[TMP14:%.*]] = getelementptr inbounds float, float* [[TMP13]], i32 0
+; CHECK-NEXT:    [[TMP15:%.*]] = bitcast float* [[TMP14]] to <4 x float>*
+; CHECK-NEXT:    [[WIDE_LOAD19:%.*]] = load <4 x float>, <4 x float>* [[TMP15]], align 4
+; CHECK-NEXT:    [[TMP16:%.*]] = getelementptr inbounds float, float* [[CC]], i64 [[TMP12]]
+; CHECK-NEXT:    [[TMP17:%.*]] = getelementptr inbounds float, float* [[TMP16]], i32 0
+; CHECK-NEXT:    [[TMP18:%.*]] = bitcast float* [[TMP17]] to <4 x float>*
+; CHECK-NEXT:    [[WIDE_LOAD20:%.*]] = load <4 x float>, <4 x float>* [[TMP18]], align 4
+; CHECK-NEXT:    [[TMP19:%.*]] = fadd fast <4 x float> [[WIDE_LOAD19]], [[WIDE_LOAD20]]
+; CHECK-NEXT:    [[TMP20:%.*]] = getelementptr inbounds float, float* [[AA]], i64 [[TMP12]]
+; CHECK-NEXT:    [[TMP21:%.*]] = getelementptr inbounds float, float* [[TMP20]], i32 0
+; CHECK-NEXT:    [[TMP22:%.*]] = bitcast float* [[TMP21]] to <4 x float>*
+; CHECK-NEXT:    store <4 x float> [[TMP19]], <4 x float>* [[TMP22]], align 4
+; CHECK-NEXT:    [[INDEX_NEXT17]] = add i64 [[INDEX16]], 4
+; CHECK-NEXT:    [[TMP23:%.*]] = icmp eq i64 [[INDEX_NEXT17]], [[N_VEC15]]
+; CHECK-NEXT:    br i1 [[TMP23]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], [[LOOP10:!llvm.loop !.*]]
+; CHECK:       vec.epilog.middle.block:
+; CHECK-NEXT:    [[CMP_N18:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC15]]
+; CHECK-NEXT:    br i1 [[CMP_N18]], label [[FOR_END_LOOPEXIT_LOOPEXIT:%.*]], label [[VEC_EPILOG_SCALAR_PH]]
+; CHECK:       vec.epilog.scalar.ph:
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC15]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MEMCHECK]] ], [ 0, [[ITER_CHECK]] ]
+; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
+; CHECK:       for.body:
+; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[BB]], i64 [[INDVARS_IV]]
+; CHECK-NEXT:    [[TMP24:%.*]] = load float, float* [[ARRAYIDX]], align 4
+; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds float, float* [[CC]], i64 [[INDVARS_IV]]
+; CHECK-NEXT:    [[TMP25:%.*]] = load float, float* [[ARRAYIDX2]], align 4
+; CHECK-NEXT:    [[ADD:%.*]] = fadd fast float [[TMP24]], [[TMP25]]
+; CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds float, float* [[AA]], i64 [[INDVARS_IV]]
+; CHECK-NEXT:    store float [[ADD]], float* [[ARRAYIDX4]], align 4
+; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp ne i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
+; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_BODY]], label [[FOR_END_LOOPEXIT_LOOPEXIT]], [[LOOP12:!llvm.loop !.*]]
+; CHECK:       for.end.loopexit.loopexit:
+; CHECK-NEXT:    br label [[FOR_END_LOOPEXIT]]
+; CHECK:       for.end.loopexit:
+; CHECK-NEXT:    br label [[FOR_END]]
+; CHECK:       for.end:
+; CHECK-NEXT:    ret void
+;
+entry:
+  %cmp1 = icmp sgt i32 %N, 0
+  br i1 %cmp1, label %for.body.preheader, label %for.end
+
+for.body.preheader:                               ; preds = %entry
+  %wide.trip.count = zext i32 %N to i64
+  br label %for.body
+
+for.body:                                         ; preds = %for.body.preheader, %for.body
+  %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
+  %arrayidx = getelementptr inbounds float, float* %bb, i64 %indvars.iv
+  %0 = load float, float* %arrayidx, align 4
+  %arrayidx2 = getelementptr inbounds float, float* %cc, i64 %indvars.iv
+  %1 = load float, float* %arrayidx2, align 4
+  %add = fadd fast float %0, %1
+  %arrayidx4 = getelementptr inbounds float, float* %aa, i64 %indvars.iv
+  store float %add, float* %arrayidx4, align 4
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond = icmp ne i64 %indvars.iv.next, %wide.trip.count
+  br i1 %exitcond, label %for.body, label %for.end.loopexit
+
+for.end.loopexit:                                 ; preds = %for.body
+  br label %for.end
+
+for.end:                                          ; preds = %for.end.loopexit, %entry
+  ret void
+}
+
+define dso_local signext i32 @f2(float* noalias %A, float* noalias %B, i32 signext %n) {
+; CHECK-LABEL: @f2(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[A1:%.*]] = bitcast float* [[A:%.*]] to i8*
+; CHECK-NEXT:    [[CMP1:%.*]] = icmp sgt i32 [[N:%.*]], 1
+; CHECK-NEXT:    br i1 [[CMP1]], label [[ITER_CHECK:%.*]], label [[FOR_END:%.*]]
+; CHECK:       iter.check:
+; CHECK-NEXT:    [[TMP0:%.*]] = add i32 [[N]], -1
+; CHECK-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[TMP0]] to i64
+; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], 4
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH:%.*]], label [[VECTOR_SCEVCHECK:%.*]]
+; CHECK:       vector.scevcheck:
+; CHECK-NEXT:    [[TMP1:%.*]] = add nsw i64 [[WIDE_TRIP_COUNT]], -1
+; CHECK-NEXT:    [[TMP2:%.*]] = trunc i64 [[TMP1]] to i32
+; CHECK-NEXT:    [[MUL:%.*]] = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 1, i32 [[TMP2]])
+; CHECK-NEXT:    [[MUL_RESULT:%.*]] = extractvalue { i32, i1 } [[MUL]], 0
+; CHECK-NEXT:    [[MUL_OVERFLOW:%.*]] = extractvalue { i32, i1 } [[MUL]], 1
+; CHECK-NEXT:    [[TMP3:%.*]] = add i32 [[TMP0]], [[MUL_RESULT]]
+; CHECK-NEXT:    [[TMP4:%.*]] = sub i32 [[TMP0]], [[MUL_RESULT]]
+; CHECK-NEXT:    [[TMP5:%.*]] = icmp sgt i32 [[TMP4]], [[TMP0]]
+; CHECK-NEXT:    [[TMP6:%.*]] = icmp slt i32 [[TMP3]], [[TMP0]]
+; CHECK-NEXT:    [[TMP7:%.*]] = select i1 true, i1 [[TMP5]], i1 [[TMP6]]
+; CHECK-NEXT:    [[TMP8:%.*]] = icmp ugt i64 [[TMP1]], 4294967295
+; CHECK-NEXT:    [[TMP9:%.*]] = or i1 [[TMP7]], [[TMP8]]
+; CHECK-NEXT:    [[TMP10:%.*]] = or i1 [[TMP9]], [[MUL_OVERFLOW]]
+; CHECK-NEXT:    [[TMP11:%.*]] = or i1 false, [[TMP10]]
+; CHECK-NEXT:    br i1 [[TMP11]], label [[VEC_EPILOG_SCALAR_PH]], label [[VECTOR_MEMCHECK:%.*]]
+; CHECK:       vector.memcheck:
+; CHECK-NEXT:    [[SCEVGEP:%.*]] = getelementptr float, float* [[A]], i64 [[WIDE_TRIP_COUNT]]
+; CHECK-NEXT:    [[SCEVGEP2:%.*]] = bitcast float* [[SCEVGEP]] to i8*
+; CHECK-NEXT:    [[TMP12:%.*]] = sext i32 [[TMP0]] to i64
+; CHECK-NEXT:    [[TMP13:%.*]] = add i64 [[TMP12]], 1
+; CHECK-NEXT:    [[TMP14:%.*]] = sub i64 [[TMP13]], [[WIDE_TRIP_COUNT]]
+; CHECK-NEXT:    [[SCEVGEP3:%.*]] = getelementptr float, float* [[B:%.*]], i64 [[TMP14]]
+; CHECK-NEXT:    [[SCEVGEP34:%.*]] = bitcast float* [[SCEVGEP3]] to i8*
+; CHECK-NEXT:    [[TMP15:%.*]] = add nsw i64 [[TMP12]], 1
+; CHECK-NEXT:    [[SCEVGEP5:%.*]] = getelementptr float, float* [[B]], i64 [[TMP15]]
+; CHECK-NEXT:    [[SCEVGEP56:%.*]] = bitcast float* [[SCEVGEP5]] to i8*
+; CHECK-NEXT:    [[BOUND0:%.*]] = icmp ult i8* [[A1]], [[SCEVGEP56]]
+; CHECK-NEXT:    [[BOUND1:%.*]] = icmp ult i8* [[SCEVGEP34]], [[SCEVGEP2]]
+; CHECK-NEXT:    [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
+; CHECK-NEXT:    [[MEMCHECK_CONFLICT:%.*]] = and i1 [[FOUND_CONFLICT]], true
+; CHECK-NEXT:    br i1 [[MEMCHECK_CONFLICT]], label [[VEC_EPILOG_SCALAR_PH]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]]
+; CHECK:       vector.main.loop.iter.check:
+; CHECK-NEXT:    [[MIN_ITERS_CHECK7:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], 4
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK7]], label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK:       vector.ph:
+; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], 4
+; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP16:%.*]] = add i64 [[INDEX]], 0
+; CHECK-NEXT:    [[OFFSET_IDX:%.*]] = trunc i64 [[INDEX]] to i32
+; CHECK-NEXT:    [[TMP17:%.*]] = add i32 [[OFFSET_IDX]], 0
+; CHECK-NEXT:    [[TMP18:%.*]] = xor i32 [[TMP17]], -1
+; CHECK-NEXT:    [[TMP19:%.*]] = add i32 [[TMP18]], [[N]]
+; CHECK-NEXT:    [[TMP20:%.*]] = sext i32 [[TMP19]] to i64
+; CHECK-NEXT:    [[TMP21:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[TMP20]]
+; CHECK-NEXT:    [[TMP22:%.*]] = getelementptr inbounds float, float* [[TMP21]], i32 0
+; CHECK-NEXT:    [[TMP23:%.*]] = getelementptr inbounds float, float* [[TMP22]], i32 -3
+; CHECK-NEXT:    [[TMP24:%.*]] = bitcast float* [[TMP23]] to <4 x float>*
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x float>, <4 x float>* [[TMP24]], align 4, !alias.scope !13
+; CHECK-NEXT:    [[REVERSE:%.*]] = shufflevector <4 x float> [[WIDE_LOAD]], <4 x float> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; CHECK-NEXT:    [[TMP25:%.*]] = fadd fast <4 x float> [[REVERSE]], <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>
+; CHECK-NEXT:    [[TMP26:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP16]]
+; CHECK-NEXT:    [[TMP27:%.*]] = getelementptr inbounds float, float* [[TMP26]], i32 0
+; CHECK-NEXT:    [[TMP28:%.*]] = bitcast float* [[TMP27]] to <4 x float>*
+; CHECK-NEXT:    store <4 x float> [[TMP25]], <4 x float>* [[TMP28]], align 4, !alias.scope !16, !noalias !13
+; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 4
+; CHECK-NEXT:    [[TMP29:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP29]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP18:!llvm.loop !.*]]
+; CHECK:       middle.block:
+; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]]
+; CHECK:       vec.epilog.iter.check:
+; CHECK-NEXT:    [[IND_END13:%.*]] = trunc i64 [[N_VEC]] to i32
+; CHECK-NEXT:    [[N_VEC_REMAINING:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
+; CHECK-NEXT:    [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], 4
+; CHECK-NEXT:    br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]]
+; CHECK:       vec.epilog.ph:
+; CHECK-NEXT:    [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
+; CHECK-NEXT:    [[N_MOD_VF8:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], 4
+; CHECK-NEXT:    [[N_VEC9:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF8]]
+; CHECK-NEXT:    [[IND_END:%.*]] = trunc i64 [[N_VEC9]] to i32
+; CHECK-NEXT:    br label [[VEC_EPILOG_VECTOR_BODY:%.*]]
+; CHECK:       vec.epilog.vector.body:
+; CHECK-NEXT:    [[INDEX10:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT11:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP30:%.*]] = add i64 [[INDEX10]], 0
+; CHECK-NEXT:    [[OFFSET_IDX15:%.*]] = trunc i64 [[INDEX10]] to i32
+; CHECK-NEXT:    [[TMP31:%.*]] = add i32 [[OFFSET_IDX15]], 0
+; CHECK-NEXT:    [[TMP32:%.*]] = xor i32 [[TMP31]], -1
+; CHECK-NEXT:    [[TMP33:%.*]] = add i32 [[TMP32]], [[N]]
+; CHECK-NEXT:    [[TMP34:%.*]] = sext i32 [[TMP33]] to i64
+; CHECK-NEXT:    [[TMP35:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[TMP34]]
+; CHECK-NEXT:    [[TMP36:%.*]] = getelementptr inbounds float, float* [[TMP35]], i32 0
+; CHECK-NEXT:    [[TMP37:%.*]] = getelementptr inbounds float, float* [[TMP36]], i32 -3
+; CHECK-NEXT:    [[TMP38:%.*]] = bitcast float* [[TMP37]] to <4 x float>*
+; CHECK-NEXT:    [[WIDE_LOAD16:%.*]] = load <4 x float>, <4 x float>* [[TMP38]], align 4
+; CHECK-NEXT:    [[REVERSE17:%.*]] = shufflevector <4 x float> [[WIDE_LOAD16]], <4 x float> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; CHECK-NEXT:    [[TMP39:%.*]] = fadd fast <4 x float> [[REVERSE17]], <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>
+; CHECK-NEXT:    [[TMP40:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP30]]
+; CHECK-NEXT:    [[TMP41:%.*]] = getelementptr inbounds float, float* [[TMP40]], i32 0
+; CHECK-NEXT:    [[TMP42:%.*]] = bitcast float* [[TMP41]] to <4 x float>*
+; CHECK-NEXT:    store <4 x float> [[TMP39]], <4 x float>* [[TMP42]], align 4
+; CHECK-NEXT:    [[INDEX_NEXT11]] = add i64 [[INDEX10]], 4
+; CHECK-NEXT:    [[TMP43:%.*]] = icmp eq i64 [[INDEX_NEXT11]], [[N_VEC9]]
+; CHECK-NEXT:    br i1 [[TMP43]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], [[LOOP19:!llvm.loop !.*]]
+; CHECK:       vec.epilog.middle.block:
+; CHECK-NEXT:    [[CMP_N14:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC9]]
+; CHECK-NEXT:    br i1 [[CMP_N14]], label [[FOR_END_LOOPEXIT_LOOPEXIT:%.*]], label [[VEC_EPILOG_SCALAR_PH]]
+; CHECK:       vec.epilog.scalar.ph:
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC9]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_SCEVCHECK]] ], [ 0, [[VECTOR_MEMCHECK]] ], [ 0, [[ITER_CHECK]] ]
+; CHECK-NEXT:    [[BC_RESUME_VAL12:%.*]] = phi i32 [ [[IND_END]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[IND_END13]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_SCEVCHECK]] ], [ 0, [[VECTOR_MEMCHECK]] ], [ 0, [[ITER_CHECK]] ]
+; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
+; CHECK:       for.body:
+; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT:    [[I_014:%.*]] = phi i32 [ [[BC_RESUME_VAL12]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[INC:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT:    [[TMP44:%.*]] = xor i32 [[I_014]], -1
+; CHECK-NEXT:    [[SUB2:%.*]] = add i32 [[TMP44]], [[N]]
+; CHECK-NEXT:    [[IDXPROM:%.*]] = sext i32 [[SUB2]] to i64
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[IDXPROM]]
+; CHECK-NEXT:    [[TMP45:%.*]] = load float, float* [[ARRAYIDX]], align 4
+; CHECK-NEXT:    [[CONV3:%.*]] = fadd fast float [[TMP45]], 1.000000e+00
+; CHECK-NEXT:    [[ARRAYIDX5:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[INDVARS_IV]]
+; CHECK-NEXT:    store float [[CONV3]], float* [[ARRAYIDX5]], align 4
+; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; CHECK-NEXT:    [[INC]] = add nuw nsw i32 [[I_014]], 1
+; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp ne i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
+; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_BODY]], label [[FOR_END_LOOPEXIT_LOOPEXIT]], [[LOOP20:!llvm.loop !.*]]
+; CHECK:       for.end.loopexit.loopexit:
+; CHECK-NEXT:    br label [[FOR_END_LOOPEXIT]]
+; CHECK:       for.end.loopexit:
+; CHECK-NEXT:    br label [[FOR_END]]
+; CHECK:       for.end:
+; CHECK-NEXT:    ret i32 0
+;
+entry:
+  %cmp1 = icmp sgt i32 %n, 1
+  br i1 %cmp1, label %for.body.preheader, label %for.end
+
+for.body.preheader:                               ; preds = %entry
+  %0 = add i32 %n, -1
+  %wide.trip.count = zext i32 %0 to i64
+  br label %for.body
+
+for.body:                                         ; preds = %for.body.preheader, %for.body
+  %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
+  %i.014 = phi i32 [ 0, %for.body.preheader ], [ %inc, %for.body ]
+  %1 = xor i32 %i.014, -1
+  %sub2 = add i32 %1, %n
+  %idxprom = sext i32 %sub2 to i64
+  %arrayidx = getelementptr inbounds float, float* %B, i64 %idxprom
+  %2 = load float, float* %arrayidx, align 4
+  %conv3 = fadd fast float %2, 1.000000e+00
+  %arrayidx5 = getelementptr inbounds float, float* %A, i64 %indvars.iv
+  store float %conv3, float* %arrayidx5, align 4
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %inc = add nuw nsw i32 %i.014, 1
+  %exitcond = icmp ne i64 %indvars.iv.next, %wide.trip.count
+  br i1 %exitcond, label %for.body, label %for.end.loopexit
+
+for.end.loopexit:                                 ; preds = %for.body
+  br label %for.end
+
+for.end:                                          ; preds = %for.end.loopexit, %entry
+  ret i32 0
+}
+
+define void @f3(i8* noalias %A, i64 %n) {
+; CHECK-PROFITABLE-BY-DEFAULT-LABEL: @f3(
+; CHECK-PROFITABLE-BY-DEFAULT-NEXT:  iter.check:
+; CHECK-PROFITABLE-BY-DEFAULT-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], 2
+; CHECK-PROFITABLE-BY-DEFAULT-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH:%.*]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]]
+; CHECK-PROFITABLE-BY-DEFAULT:       vector.main.loop.iter.check:
+; CHECK-PROFITABLE-BY-DEFAULT-NEXT:    [[MIN_ITERS_CHECK1:%.*]] = icmp ult i64 [[N]], 4
+; CHECK-PROFITABLE-BY-DEFAULT-NEXT:    br i1 [[MIN_ITERS_CHECK1]], label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK-PROFITABLE-BY-DEFAULT:       vector.ph:
+; CHECK-PROFITABLE-BY-DEFAULT-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], 4
+; CHECK-PROFITABLE-BY-DEFAULT-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; CHECK-PROFITABLE-BY-DEFAULT-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK-PROFITABLE-BY-DEFAULT:       vector.body:
+; CHECK-PROFITABLE-BY-DEFAULT-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-PROFITABLE-BY-DEFAULT-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
+; CHECK-PROFITABLE-BY-DEFAULT-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i8, i8* [[A:%.*]], i64 [[TMP0]]
+; CHECK-PROFITABLE-BY-DEFAULT-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i8, i8* [[TMP1]], i32 0
+; CHECK-PROFITABLE-BY-DEFAULT-NEXT:    [[TMP3:%.*]] = bitcast i8* [[TMP2]] to <4 x i8>*
+; CHECK-PROFITABLE-BY-DEFAULT-NEXT:    store <4 x i8> <i8 1, i8 1, i8 1, i8 1>, <4 x i8>* [[TMP3]], align 1
+; CHECK-PROFITABLE-BY-DEFAULT-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 4
+; CHECK-PROFITABLE-BY-DEFAULT-NEXT:    [[TMP4:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-PROFITABLE-BY-DEFAULT-NEXT:    br i1 [[TMP4]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP0:!llvm.loop !.*]]
+; CHECK-PROFITABLE-BY-DEFAULT:       middle.block:
+; CHECK-PROFITABLE-BY-DEFAULT-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; CHECK-PROFITABLE-BY-DEFAULT-NEXT:    br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]]
+; CHECK-PROFITABLE-BY-DEFAULT:       vec.epilog.iter.check:
+; CHECK-PROFITABLE-BY-DEFAULT-NEXT:    [[N_VEC_REMAINING:%.*]] = sub i64 [[N]], [[N_VEC]]
+; CHECK-PROFITABLE-BY-DEFAULT-NEXT:    [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], 2
+; CHECK-PROFITABLE-BY-DEFAULT-NEXT:    br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]]
+; CHECK-PROFITABLE-BY-DEFAULT:       vec.epilog.ph:
+; CHECK-PROFITABLE-BY-DEFAULT-NEXT:    [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
+; CHECK-PROFITABLE-BY-DEFAULT-NEXT:    [[N_MOD_VF2:%.*]] = urem i64 [[N]], 2
+; CHECK-PROFITABLE-BY-DEFAULT-NEXT:    [[N_VEC3:%.*]] = sub i64 [[N]], [[N_MOD_VF2]]
+; CHECK-PROFITABLE-BY-DEFAULT-NEXT:    br label [[VEC_EPILOG_VECTOR_BODY:%.*]]
+; CHECK-PROFITABLE-BY-DEFAULT:       vec.epilog.vector.body:
+; CHECK-PROFITABLE-BY-DEFAULT-NEXT:    [[INDEX4:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT5:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
+; CHECK-PROFITABLE-BY-DEFAULT-NEXT:    [[TMP5:%.*]] = add i64 [[INDEX4]], 0
+; CHECK-PROFITABLE-BY-DEFAULT-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i8, i8* [[A]], i64 [[TMP5]]
+; CHECK-PROFITABLE-BY-DEFAULT-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i8, i8* [[TMP6]], i32 0
+; CHECK-PROFITABLE-BY-DEFAULT-NEXT:    [[TMP8:%.*]] = bitcast i8* [[TMP7]] to <2 x i8>*
+; CHECK-PROFITABLE-BY-DEFAULT-NEXT:    store <2 x i8> <i8 1, i8 1>, <2 x i8>* [[TMP8]], align 1
+; CHECK-PROFITABLE-BY-DEFAULT-NEXT:    [[INDEX_NEXT5]] = add i64 [[INDEX4]], 2
+; CHECK-PROFITABLE-BY-DEFAULT-NEXT:    [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT5]], [[N_VEC3]]
+; CHECK-PROFITABLE-BY-DEFAULT-NEXT:    br i1 [[TMP9]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], [[LOOP2:!llvm.loop !.*]]
+; CHECK-PROFITABLE-BY-DEFAULT:       vec.epilog.middle.block:
+; CHECK-PROFITABLE-BY-DEFAULT-NEXT:    [[CMP_N6:%.*]] = icmp eq i64 [[N]], [[N_VEC3]]
+; CHECK-PROFITABLE-BY-DEFAULT-NEXT:    br i1 [[CMP_N6]], label [[FOR_END_LOOPEXIT_LOOPEXIT:%.*]], label [[VEC_EPILOG_SCALAR_PH]]
+; CHECK-PROFITABLE-BY-DEFAULT:       vec.epilog.scalar.ph:
+; CHECK-PROFITABLE-BY-DEFAULT-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC3]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[ITER_CHECK:%.*]] ]
+; CHECK-PROFITABLE-BY-DEFAULT-NEXT:    br label [[FOR_BODY:%.*]]
+; CHECK-PROFITABLE-BY-DEFAULT:       for.body:
+; CHECK-PROFITABLE-BY-DEFAULT-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
+; CHECK-PROFITABLE-BY-DEFAULT-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, i8* [[A]], i64 [[IV]]
+; CHECK-PROFITABLE-BY-DEFAULT-NEXT:    store i8 1, i8* [[ARRAYIDX]], align 1
+; CHECK-PROFITABLE-BY-DEFAULT-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; CHECK-PROFITABLE-BY-DEFAULT-NEXT:    [[EXITCOND:%.*]] = icmp ne i64 [[IV_NEXT]], [[N]]
+; CHECK-PROFITABLE-BY-DEFAULT-NEXT:    br i1 [[EXITCOND]], label [[FOR_BODY]], label [[FOR_END_LOOPEXIT_LOOPEXIT]], [[LOOP4:!llvm.loop !.*]]
+; CHECK-PROFITABLE-BY-DEFAULT:       for.end.loopexit.loopexit:
+; CHECK-PROFITABLE-BY-DEFAULT-NEXT:    br label [[FOR_END_LOOPEXIT]]
+; CHECK-PROFITABLE-BY-DEFAULT:       for.end.loopexit:
+; CHECK-PROFITABLE-BY-DEFAULT-NEXT:    br label [[FOR_END:%.*]]
+; CHECK-PROFITABLE-BY-DEFAULT:       for.end:
+; CHECK-PROFITABLE-BY-DEFAULT-NEXT:    ret void
+;
+entry:
+  br label %for.body
+
+for.body:                                         ; preds = %for.body.preheader, %for.body
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+  %arrayidx = getelementptr inbounds i8, i8* %A, i64 %iv
+  store i8 1, i8* %arrayidx, align 1
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond = icmp ne i64 %iv.next, %n
+  br i1 %exitcond, label %for.body, label %for.end.loopexit
+
+for.end.loopexit:                                 ; preds = %for.body
+  br label %for.end
+
+for.end:                                          ; preds = %for.end.loopexit, %entry
+  ret void
+}
author	Bardia Mahjour <bmahjour@ca.ibm.com>
	Tue, 1 Dec 2020 16:57:16 +0000 (11:57 -0500)
committer	Bardia Mahjour <bmahjour@ca.ibm.com>
	Tue, 1 Dec 2020 17:04:29 +0000 (12:04 -0500)
llvm/docs/Vectorizers.rst		patch \| blob \| history
llvm/docs/epilogue-vectorization-cfg.png	[new file with mode: 0644]	patch \| blob
llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h		patch \| blob \| history
llvm/lib/Transforms/Vectorize/LoopVectorize.cpp		patch \| blob \| history
llvm/test/Transforms/LoopVectorize/PowerPC/optimal-epilog-vectorization-profitability.ll	[new file with mode: 0644]	patch \| blob
llvm/test/Transforms/LoopVectorize/PowerPC/optimal-epilog-vectorization.ll	[new file with mode: 0644]	patch \| blob
llvm/test/Transforms/LoopVectorize/X86/invariant-load-gather.ll		patch \| blob \| history
llvm/test/Transforms/LoopVectorize/X86/invariant-store-vectorization.ll		patch \| blob \| history
llvm/test/Transforms/LoopVectorize/X86/masked_load_store.ll		patch \| blob \| history
llvm/test/Transforms/LoopVectorize/optimal-epilog-vectorization-limitations.ll	[new file with mode: 0644]	patch \| blob
llvm/test/Transforms/LoopVectorize/optimal-epilog-vectorization-liveout.ll	[new file with mode: 0644]	patch \| blob
llvm/test/Transforms/LoopVectorize/optimal-epilog-vectorization.ll	[new file with mode: 0644]	patch \| blob