[LoopVectorize] Permit vectorisation of more select(cmp(), X, Y) reduction patterns

author David Sherwood <david.sherwood@arm.com>

Wed, 4 Aug 2021 07:10:51 +0000 (08:10 +0100)

committer David Sherwood <david.sherwood@arm.com>

Mon, 11 Oct 2021 08:41:38 +0000 (09:41 +0100)
author David Sherwood <david.sherwood@arm.com>
Wed, 4 Aug 2021 07:10:51 +0000 (08:10 +0100)
committer David Sherwood <david.sherwood@arm.com>
Mon, 11 Oct 2021 08:41:38 +0000 (09:41 +0100)
diff --git a/llvm/include/llvm/Analysis/IVDescriptors.h b/llvm/include/llvm/Analysis/IVDescriptors.h

index 59ad0a3..c26dbc4 100644 (file)
--- a/llvm/include/llvm/Analysis/IVDescriptors.h
+++ b/llvm/include/llvm/Analysis/IVDescriptors.h
@@ -36,20 +36,24 @@ class DominatorTree;
  
  /// These are the kinds of recurrences that we support.
  enum class RecurKind {
-  None,   ///< Not a recurrence.
-  Add,    ///< Sum of integers.
-  Mul,    ///< Product of integers.
-  Or,     ///< Bitwise or logical OR of integers.
-  And,    ///< Bitwise or logical AND of integers.
-  Xor,    ///< Bitwise or logical XOR of integers.
-  SMin,   ///< Signed integer min implemented in terms of select(cmp()).
-  SMax,   ///< Signed integer max implemented in terms of select(cmp()).
-  UMin,   ///< Unisgned integer min implemented in terms of select(cmp()).
-  UMax,   ///< Unsigned integer max implemented in terms of select(cmp()).
-  FAdd,   ///< Sum of floats.
-  FMul,   ///< Product of floats.
-  FMin,   ///< FP min implemented in terms of select(cmp()).
-  FMax    ///< FP max implemented in terms of select(cmp()).
+  None,       ///< Not a recurrence.
+  Add,        ///< Sum of integers.
+  Mul,        ///< Product of integers.
+  Or,         ///< Bitwise or logical OR of integers.
+  And,        ///< Bitwise or logical AND of integers.
+  Xor,        ///< Bitwise or logical XOR of integers.
+  SMin,       ///< Signed integer min implemented in terms of select(cmp()).
+  SMax,       ///< Signed integer max implemented in terms of select(cmp()).
+  UMin,       ///< Unsigned integer min implemented in terms of select(cmp()).
+  UMax,       ///< Unsigned integer max implemented in terms of select(cmp()).
+  FAdd,       ///< Sum of floats.
+  FMul,       ///< Product of floats.
+  FMin,       ///< FP min implemented in terms of select(cmp()).
+  FMax,       ///< FP max implemented in terms of select(cmp()).
+  SelectICmp, ///< Integer select(icmp(),x,y) where one of (x,y) is loop
+              ///< invariant
+  SelectFCmp  ///< Integer select(fcmp(),x,y) where one of (x,y) is loop
+              ///< invariant
  };
  
  /// The RecurrenceDescriptor is used to identify recurrences variables in a
@@ -112,12 +116,14 @@ public:
    };
  
    /// Returns a struct describing if the instruction 'I' can be a recurrence
-  /// variable of type 'Kind'. If the recurrence is a min/max pattern of
-  /// select(icmp()) this function advances the instruction pointer 'I' from the
-  /// compare instruction to the select instruction and stores this pointer in
-  /// 'PatternLastInst' member of the returned struct.
-  static InstDesc isRecurrenceInstr(Instruction *I, RecurKind Kind,
-                                    InstDesc &Prev, FastMathFlags FuncFMF);
+  /// variable of type 'Kind' for a Loop \p L and reduction PHI \p Phi.
+  /// If the recurrence is a min/max pattern of select(icmp()) this function
+  /// advances the instruction pointer 'I' from the compare instruction to the
+  /// select instruction and stores this pointer in 'PatternLastInst' member of
+  /// the returned struct.
+  static InstDesc isRecurrenceInstr(Loop *L, PHINode *Phi, Instruction *I,
+                                    RecurKind Kind, InstDesc &Prev,
+                                    FastMathFlags FuncFMF);
  
    /// Returns true if instruction I has multiple uses in Insts
    static bool hasMultipleUsesOf(Instruction *I,
@@ -135,13 +141,21 @@ public:
    static InstDesc isMinMaxPattern(Instruction *I, RecurKind Kind,
                                    const InstDesc &Prev);
  
+  /// Returns a struct describing whether the instruction is either a
+  ///   Select(ICmp(A, B), X, Y), or
+  ///   Select(FCmp(A, B), X, Y)
+  /// where one of (X, Y) is invariant in \p Loop and the other is \p OrigPhi.
+  /// If \p I is the single-use compare feeding a select, the result refers
+  /// to that select and carries the recurrence kind of \p Prev.
+  static InstDesc isSelectCmpPattern(Loop *Loop, PHINode *OrigPhi,
+                                     Instruction *I, InstDesc &Prev);
+
    /// Returns a struct describing if the instruction is a
    /// Select(FCmp(X, Y), (Z = X op PHINode), PHINode) instruction pattern.
    static InstDesc isConditionalRdxPattern(RecurKind Kind, Instruction *I);
  
    /// Returns identity corresponding to the RecurrenceKind.
-  static Constant *getRecurrenceIdentity(RecurKind K, Type *Tp,
-                                         FastMathFlags FMF);
+  Value *getRecurrenceIdentity(RecurKind K, Type *Tp, FastMathFlags FMF);
  
    /// Returns the opcode corresponding to the RecurrenceKind.
    static unsigned getOpcode(RecurKind Kind);
@@ -221,6 +235,12 @@ public:
      return isIntMinMaxRecurrenceKind(Kind) || isFPMinMaxRecurrenceKind(Kind);
    }
  
+  /// Returns true if \p Kind is a select-cmp kind (SelectICmp or SelectFCmp),
+  /// i.e. select(cmp(),x,y) where one of (x,y) is loop invariant.
+  static bool isSelectCmpRecurrenceKind(RecurKind Kind) {
+    return Kind == RecurKind::SelectICmp || Kind == RecurKind::SelectFCmp;
+  }
+
    /// Returns the type of the recurrence. This type can be narrower than the
    /// actual type of the Phi if the recurrence has been type-promoted.
    Type *getRecurrenceType() const { return RecurrenceType; }
diff --git a/llvm/include/llvm/Transforms/Utils/LoopUtils.h b/llvm/include/llvm/Transforms/Utils/LoopUtils.h

index 22316eb..eaf72a3 100644 (file)
--- a/llvm/include/llvm/Transforms/Utils/LoopUtils.h
+++ b/llvm/include/llvm/Transforms/Utils/LoopUtils.h
@@ -351,6 +351,15 @@ bool canSinkOrHoistInst(Instruction &I, AAResults *AA, DominatorTree *DT,
  /// Returns the comparison predicate used when expanding a min/max reduction.
  CmpInst::Predicate getMinMaxReductionPredicate(RecurKind RK);
  
+/// See RecurrenceDescriptor::isSelectCmpPattern for a description of the
+/// pattern we are trying to match. In this pattern we are only ever selecting
+/// between two values: 1) an initial PHI start value, and 2) a loop invariant
+/// value. This function compares \p Left against \p StartVal, i.e. 1): lanes
+/// of \p Left that differ from it are kept and all other lanes are taken from
+/// \p Right. \p RK is not used by the current implementation.
+Value *createSelectCmpOp(IRBuilderBase &Builder, Value *StartVal, RecurKind RK,
+                         Value *Left, Value *Right);
+
  /// Returns a Min/Max operation corresponding to MinMaxRecurrenceKind.
  /// The Builder's fast-math-flags must be set to propagate the expected values.
  Value *createMinMaxOp(IRBuilderBase &Builder, RecurKind RK, Value *Left,
@@ -378,12 +387,22 @@ Value *createSimpleTargetReduction(IRBuilderBase &B,
                                     RecurKind RdxKind,
                                     ArrayRef<Value *> RedOps = None);
  
+/// Create a target reduction of the given vector \p Src for a select-cmp
+/// reduction (RecurKind::SelectICmp or RecurKind::SelectFCmp) described by
+/// \p Desc, using the select user of \p OrigPhi to find the selected value.
+Value *createSelectCmpTargetReduction(IRBuilderBase &B,
+                                      const TargetTransformInfo *TTI,
+                                      Value *Src,
+                                      const RecurrenceDescriptor &Desc,
+                                      PHINode *OrigPhi);
+
  /// Create a generic target reduction using a recurrence descriptor \p Desc
  /// The target is queried to determine if intrinsics or shuffle sequences are
  /// required to implement the reduction.
  /// Fast-math-flags are propagated using the RecurrenceDescriptor.
  Value *createTargetReduction(IRBuilderBase &B, const TargetTransformInfo *TTI,
-                             const RecurrenceDescriptor &Desc, Value *Src);
+                             const RecurrenceDescriptor &Desc, Value *Src,
+                             PHINode *OrigPhi = nullptr);
  
  /// Create an ordered reduction intrinsic using the given recurrence
  /// descriptor \p Desc.
diff --git a/llvm/lib/Analysis/IVDescriptors.cpp b/llvm/lib/Analysis/IVDescriptors.cpp

index c04083c..287e601 100644 (file)
--- a/llvm/lib/Analysis/IVDescriptors.cpp
+++ b/llvm/lib/Analysis/IVDescriptors.cpp
@@ -62,6 +62,8 @@ bool RecurrenceDescriptor::isIntegerRecurrenceKind(RecurKind Kind) {
    case RecurKind::SMin:
    case RecurKind::UMax:
    case RecurKind::UMin:
+  case RecurKind::SelectICmp:
+  case RecurKind::SelectFCmp:
      return true;
    }
    return false;
@@ -327,7 +329,8 @@ bool RecurrenceDescriptor::AddReductionVar(PHINode *Phi, RecurKind Kind,
      // the starting value (the Phi or an AND instruction if the Phi has been
      // type-promoted).
      if (Cur != Start) {
-      ReduxDesc = isRecurrenceInstr(Cur, Kind, ReduxDesc, FuncFMF);
+      ReduxDesc =
+          isRecurrenceInstr(TheLoop, Phi, Cur, Kind, ReduxDesc, FuncFMF);
        if (!ReduxDesc.isRecurrence())
          return false;
        // FIXME: FMF is allowed on phi, but propagation is not handled correctly.
@@ -360,6 +363,7 @@ bool RecurrenceDescriptor::AddReductionVar(PHINode *Phi, RecurKind Kind,
  
      // A reduction operation must only have one use of the reduction value.
      if (!IsAPhi && !IsASelect && !isMinMaxRecurrenceKind(Kind) &&
+        !isSelectCmpRecurrenceKind(Kind) &&
          hasMultipleUsesOf(Cur, VisitedInsts, 1))
        return false;
  
@@ -367,10 +371,10 @@ bool RecurrenceDescriptor::AddReductionVar(PHINode *Phi, RecurKind Kind,
      if (IsAPhi && Cur != Phi && !areAllUsesIn(Cur, VisitedInsts))
        return false;
  
-    if (isIntMinMaxRecurrenceKind(Kind) &&
+    if ((isIntMinMaxRecurrenceKind(Kind) || Kind == RecurKind::SelectICmp) &&
          (isa<ICmpInst>(Cur) || isa<SelectInst>(Cur)))
        ++NumCmpSelectPatternInst;
-    if (isFPMinMaxRecurrenceKind(Kind) &&
+    if ((isFPMinMaxRecurrenceKind(Kind) || Kind == RecurKind::SelectFCmp) &&
          (isa<FCmpInst>(Cur) || isa<SelectInst>(Cur)))
        ++NumCmpSelectPatternInst;
  
@@ -423,8 +427,9 @@ bool RecurrenceDescriptor::AddReductionVar(PHINode *Phi, RecurKind Kind,
                   ((!isa<FCmpInst>(UI) && !isa<ICmpInst>(UI) &&
                     !isa<SelectInst>(UI)) ||
                    (!isConditionalRdxPattern(Kind, UI).isRecurrence() &&
-                   !isMinMaxPattern(UI, Kind, IgnoredVal)
-                        .isRecurrence())))
+                   !isSelectCmpPattern(TheLoop, Phi, UI, IgnoredVal)
+                        .isRecurrence() &&
+                   !isMinMaxPattern(UI, Kind, IgnoredVal).isRecurrence())))
          return false;
  
        // Remember that we completed the cycle.
@@ -442,6 +447,9 @@ bool RecurrenceDescriptor::AddReductionVar(PHINode *Phi, RecurKind Kind,
        NumCmpSelectPatternInst != 0)
      return false;
  
+  if (isSelectCmpRecurrenceKind(Kind) && NumCmpSelectPatternInst != 1)
+    return false;
+
    if (!FoundStartPHI || !FoundReduxOp || !ExitInstruction)
      return false;
  
@@ -508,6 +516,63 @@ bool RecurrenceDescriptor::AddReductionVar(PHINode *Phi, RecurKind Kind,
    return true;
  }
  
+// We are looking for loops that do something like this:
+//   int r = 0;
+//   for (int i = 0; i < n; i++) {
+//     if (src[i] > 3)
+//       r = 3;
+//   }
+// where the reduction value (r) only has two states, in this example 0 or 3.
+// The generated LLVM IR for this type of loop will be like this:
+//   for.body:
+//     %r = phi i32 [ %spec.select, %for.body ], [ 0, %entry ]
+//     ...
+//     %cmp = icmp sgt i32 %5, 3
+//     %spec.select = select i1 %cmp, i32 3, i32 %r
+//     ...
+// In general we can support vectorization of loops where 'r' flips between
+// any two values, constant or not, provided they are loop invariant. The only
+// thing we actually care about at the end of the loop is whether or not any
+// lane in the selected vector is different from the start value. The final
+// across-vector reduction after the loop simply involves choosing the start
+// value if nothing changed (0 in the example above) or the other selected
+// value (3 in the example above).
+RecurrenceDescriptor::InstDesc
+RecurrenceDescriptor::isSelectCmpPattern(Loop *Loop, PHINode *OrigPhi,
+                                         Instruction *I, InstDesc &Prev) {
+  // The select(cmp(),x,y) pair is handled as one unit: when I is a single-use
+  // cmp used by a select, return that select with the kind recorded in Prev.
+  CmpInst::Predicate Pred;
+  if (match(I, m_OneUse(m_Cmp(Pred, m_Value(), m_Value())))) {
+    if (auto *Select = dyn_cast<SelectInst>(*I->user_begin()))
+      return InstDesc(Select, Prev.getRecKind());
+  }
+
+  // Only match a select whose condition is a cmp with a single use.
+  if (!match(I, m_Select(m_OneUse(m_Cmp(Pred, m_Value(), m_Value())), m_Value(),
+                         m_Value())))
+    return InstDesc(false, I);
+
+  SelectInst *SI = cast<SelectInst>(I);
+  Value *NonPhi = nullptr; // The select operand that is not OrigPhi.
+
+  if (OrigPhi == dyn_cast<PHINode>(SI->getTrueValue()))
+    NonPhi = SI->getFalseValue();
+  else if (OrigPhi == dyn_cast<PHINode>(SI->getFalseValue()))
+    NonPhi = SI->getTrueValue();
+  else
+    return InstDesc(false, I); // Neither operand is the reduction PHI.
+
+  // We are looking for selects of the form:
+  //   select(cmp(), phi, loop_invariant) or
+  //   select(cmp(), loop_invariant, phi)
+  if (!Loop->isLoopInvariant(NonPhi))
+    return InstDesc(false, I);
+
+  return InstDesc(I, isa<ICmpInst>(I->getOperand(0)) ? RecurKind::SelectICmp
+                                                     : RecurKind::SelectFCmp);
+}
+
  RecurrenceDescriptor::InstDesc
  RecurrenceDescriptor::isMinMaxPattern(Instruction *I, RecurKind Kind,
                                        const InstDesc &Prev) {
@@ -602,7 +667,8 @@ RecurrenceDescriptor::isConditionalRdxPattern(RecurKind Kind, Instruction *I) {
  }
  
  RecurrenceDescriptor::InstDesc
-RecurrenceDescriptor::isRecurrenceInstr(Instruction *I, RecurKind Kind,
+RecurrenceDescriptor::isRecurrenceInstr(Loop *L, PHINode *OrigPhi,
+                                        Instruction *I, RecurKind Kind,
                                          InstDesc &Prev, FastMathFlags FuncFMF) {
    assert(Prev.getRecKind() == RecurKind::None || Prev.getRecKind() == Kind);
    switch (I->getOpcode()) {
@@ -636,6 +702,8 @@ RecurrenceDescriptor::isRecurrenceInstr(Instruction *I, RecurKind Kind,
    case Instruction::FCmp:
    case Instruction::ICmp:
    case Instruction::Call:
+    if (isSelectCmpRecurrenceKind(Kind))
+      return isSelectCmpPattern(L, OrigPhi, I, Prev);
      if (isIntMinMaxRecurrenceKind(Kind) ||
          (((FuncFMF.noNaNs() && FuncFMF.noSignedZeros()) ||
            (isa<FPMathOperator>(I) && I->hasNoNaNs() &&
@@ -664,7 +732,6 @@ bool RecurrenceDescriptor::isReductionPHI(PHINode *Phi, Loop *TheLoop,
                                            RecurrenceDescriptor &RedDes,
                                            DemandedBits *DB, AssumptionCache *AC,
                                            DominatorTree *DT) {
-
    BasicBlock *Header = TheLoop->getHeader();
    Function &F = *Header->getParent();
    FastMathFlags FMF;
@@ -709,6 +776,12 @@ bool RecurrenceDescriptor::isReductionPHI(PHINode *Phi, Loop *TheLoop,
      LLVM_DEBUG(dbgs() << "Found a UMIN reduction PHI." << *Phi << "\n");
      return true;
    }
+  if (AddReductionVar(Phi, RecurKind::SelectICmp, TheLoop, FMF, RedDes, DB, AC,
+                      DT)) {
+    LLVM_DEBUG(dbgs() << "Found an integer conditional select reduction PHI."
+                      << *Phi << "\n");
+    return true;
+  }
    if (AddReductionVar(Phi, RecurKind::FMul, TheLoop, FMF, RedDes, DB, AC, DT)) {
      LLVM_DEBUG(dbgs() << "Found an FMult reduction PHI." << *Phi << "\n");
      return true;
@@ -725,6 +798,12 @@ bool RecurrenceDescriptor::isReductionPHI(PHINode *Phi, Loop *TheLoop,
      LLVM_DEBUG(dbgs() << "Found a float MIN reduction PHI." << *Phi << "\n");
      return true;
    }
+  if (AddReductionVar(Phi, RecurKind::SelectFCmp, TheLoop, FMF, RedDes, DB, AC,
+                      DT)) { // select(fcmp(),x,y), one of (x,y) loop invariant.
+    LLVM_DEBUG(dbgs() << "Found a float conditional select reduction PHI."
+                      << *Phi << "\n");
+    return true;
+  }
    // Not a reduction of known type.
    return false;
  }
@@ -831,8 +910,8 @@ bool RecurrenceDescriptor::isFirstOrderRecurrence(
  
  /// This function returns the identity element (or neutral element) for
  /// the operation K.
-Constant *RecurrenceDescriptor::getRecurrenceIdentity(RecurKind K, Type *Tp,
-                                                      FastMathFlags FMF) {
+Value *RecurrenceDescriptor::getRecurrenceIdentity(RecurKind K, Type *Tp,
+                                                   FastMathFlags FMF) {
    switch (K) {
    case RecurKind::Xor:
    case RecurKind::Add:
@@ -872,6 +951,10 @@ Constant *RecurrenceDescriptor::getRecurrenceIdentity(RecurKind K, Type *Tp,
      return ConstantFP::getInfinity(Tp, true);
    case RecurKind::FMax:
      return ConstantFP::getInfinity(Tp, false);
+  case RecurKind::SelectICmp:
+  case RecurKind::SelectFCmp:
+    return getRecurrenceStartValue();
+    break;
    default:
      llvm_unreachable("Unknown recurrence kind");
    }
@@ -897,9 +980,11 @@ unsigned RecurrenceDescriptor::getOpcode(RecurKind Kind) {
    case RecurKind::SMin:
    case RecurKind::UMax:
    case RecurKind::UMin:
+  case RecurKind::SelectICmp:
      return Instruction::ICmp;
    case RecurKind::FMax:
    case RecurKind::FMin:
+  case RecurKind::SelectFCmp:
      return Instruction::FCmp;
    default:
      llvm_unreachable("Unknown recurrence operation");
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp

index 3b965c8..0c9cba2 100644 (file)
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -1981,6 +1981,8 @@ bool AArch64TTIImpl::isLegalToVectorizeReduction(
    case RecurKind::UMax:
    case RecurKind::FMin:
    case RecurKind::FMax:
+  case RecurKind::SelectICmp:
+  case RecurKind::SelectFCmp:
      return true;
    default:
      return false;
diff --git a/llvm/lib/Transforms/Utils/LoopUtils.cpp b/llvm/lib/Transforms/Utils/LoopUtils.cpp

index 78d756a..7896d55 100644 (file)
--- a/llvm/lib/Transforms/Utils/LoopUtils.cpp
+++ b/llvm/lib/Transforms/Utils/LoopUtils.cpp
@@ -908,6 +908,15 @@ CmpInst::Predicate llvm::getMinMaxReductionPredicate(RecurKind RK) {
    }
  }
  
+Value *llvm::createSelectCmpOp(IRBuilderBase &Builder, Value *StartVal,
+                               RecurKind RK, Value *Left, Value *Right) {
+  if (auto VTy = dyn_cast<VectorType>(Left->getType())) // Splat for vectors.
+    StartVal = Builder.CreateVectorSplat(VTy->getElementCount(), StartVal);
+  Value *Cmp = // True where Left differs from the start value.
+      Builder.CreateCmp(CmpInst::ICMP_NE, Left, StartVal, "rdx.select.cmp");
+  return Builder.CreateSelect(Cmp, Left, Right, "rdx.select");
+}
+
  Value *llvm::createMinMaxOp(IRBuilderBase &Builder, RecurKind RK, Value *Left,
                              Value *Right) {
    CmpInst::Predicate Pred = getMinMaxReductionPredicate(RK);
@@ -988,6 +997,46 @@ Value *llvm::getShuffleReduction(IRBuilderBase &Builder, Value *Src,
    return Builder.CreateExtractElement(TmpVec, Builder.getInt32(0));
  }
  
+Value *llvm::createSelectCmpTargetReduction(IRBuilderBase &Builder,
+                                            const TargetTransformInfo *TTI,
+                                            Value *Src,
+                                            const RecurrenceDescriptor &Desc,
+                                            PHINode *OrigPhi) {
+  assert(RecurrenceDescriptor::isSelectCmpRecurrenceKind(
+             Desc.getRecurrenceKind()) &&
+         "Unexpected reduction kind");
+  Value *InitVal = Desc.getRecurrenceStartValue();
+  Value *NewVal = nullptr;
+
+  // First use the original phi to determine the new value we're trying to
+  // select from in the loop. The first select user found is used.
+  SelectInst *SI = nullptr;
+  for (auto *U : OrigPhi->users()) {
+    if ((SI = dyn_cast<SelectInst>(U)))
+      break;
+  }
+  assert(SI && "One user of the original phi should be a select");
+
+  if (SI->getTrueValue() == OrigPhi)
+    NewVal = SI->getFalseValue();
+  else {
+    assert(SI->getFalseValue() == OrigPhi &&
+           "At least one input to the select should be the original Phi");
+    NewVal = SI->getTrueValue();
+  }
+
+  // Create a splat vector with the initial (start) value and compare this to
+  // the vector we want to reduce.
+  ElementCount EC = cast<VectorType>(Src->getType())->getElementCount();
+  Value *Right = Builder.CreateVectorSplat(EC, InitVal);
+  Value *Cmp =
+      Builder.CreateCmp(CmpInst::ICMP_NE, Src, Right, "rdx.select.cmp");
+
+  // If any predicate is true it means that we want to select the new value.
+  Cmp = Builder.CreateOrReduce(Cmp);
+  return Builder.CreateSelect(Cmp, NewVal, InitVal, "rdx.select");
+}
+
  Value *llvm::createSimpleTargetReduction(IRBuilderBase &Builder,
                                           const TargetTransformInfo *TTI,
                                           Value *Src, RecurKind RdxKind,
@@ -1028,14 +1077,19 @@ Value *llvm::createSimpleTargetReduction(IRBuilderBase &Builder,
  
  Value *llvm::createTargetReduction(IRBuilderBase &B,
                                     const TargetTransformInfo *TTI,
-                                   const RecurrenceDescriptor &Desc,
-                                   Value *Src) {
+                                   const RecurrenceDescriptor &Desc, Value *Src,
+                                   PHINode *OrigPhi) {
    // TODO: Support in-order reductions based on the recurrence descriptor.
    // All ops in the reduction inherit fast-math-flags from the recurrence
    // descriptor.
    IRBuilderBase::FastMathFlagGuard FMFGuard(B);
    B.setFastMathFlags(Desc.getFastMathFlags());
-  return createSimpleTargetReduction(B, TTI, Src, Desc.getRecurrenceKind());
+
+  RecurKind RK = Desc.getRecurrenceKind();
+  if (RecurrenceDescriptor::isSelectCmpRecurrenceKind(RK))
+    return createSelectCmpTargetReduction(B, TTI, Src, Desc, OrigPhi);
+
+  return createSimpleTargetReduction(B, TTI, Src, RK);
  }
  
  Value *llvm::createOrderedReduction(IRBuilderBase &B,
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

index 8a2f495..d6b4dce 100644 (file)
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -4414,9 +4414,11 @@ void InnerLoopVectorizer::fixReduction(VPReductionPHIRecipe *PhiR,
        if (Op != Instruction::ICmp && Op != Instruction::FCmp) {
          ReducedPartRdx = Builder.CreateBinOp(
              (Instruction::BinaryOps)Op, RdxPart, ReducedPartRdx, "bin.rdx");
-      } else {
+      } else if (RecurrenceDescriptor::isSelectCmpRecurrenceKind(RK))
+        ReducedPartRdx = createSelectCmpOp(Builder, ReductionStartValue, RK,
+                                           ReducedPartRdx, RdxPart);
+      else
          ReducedPartRdx = createMinMaxOp(Builder, RK, ReducedPartRdx, RdxPart);
-      }
      }
    }
  
@@ -4424,7 +4426,7 @@ void InnerLoopVectorizer::fixReduction(VPReductionPHIRecipe *PhiR,
    // target reduction in the loop using a Reduction recipe.
    if (VF.isVector() && !PhiR->isInLoop()) {
      ReducedPartRdx =
-        createTargetReduction(Builder, TTI, RdxDesc, ReducedPartRdx);
+        createTargetReduction(Builder, TTI, RdxDesc, ReducedPartRdx, OrigPhi);
      // If the reduction can be performed in a smaller type, we need to extend
      // the reduction to the wider type before we branch to the original loop.
      if (PhiTy != RdxDesc.getRecurrenceType())
@@ -6518,6 +6520,22 @@ unsigned LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF,
      unsigned StoresIC = IC / (NumStores ? NumStores : 1);
      unsigned LoadsIC = IC / (NumLoads ? NumLoads : 1);
  
+    // There is little point in interleaving for reductions containing selects
+    // and compares when VF=1 since it may just create more overhead than it's
+    // worth for loops with small trip counts. This is because we still have to
+    // do the final reduction after the loop.
+    bool HasSelectCmpReductions =
+        HasReductions &&
+        any_of(Legal->getReductionVars(), [&](auto &Reduction) -> bool {
+          const RecurrenceDescriptor &RdxDesc = Reduction.second;
+          return RecurrenceDescriptor::isSelectCmpRecurrenceKind(
+              RdxDesc.getRecurrenceKind());
+        });
+    if (HasSelectCmpReductions) {
+      LLVM_DEBUG(dbgs() << "LV: Not interleaving select-cmp reductions.\n");
+      return 1;
+    }
+
      // If we have a scalar reduction (vector reductions are already dealt with
      // by this point), we can increase the critical path length if the loop
      // we're interleaving is inside another loop. For tree-wise reductions
@@ -9243,6 +9261,8 @@ VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes(
        RecipeBuilder.recordRecipeOf(R);
        // For min/max reducitons, where we have a pair of icmp/select, we also
        // need to record the ICmp recipe, so it can be removed later.
+      assert(!RecurrenceDescriptor::isSelectCmpRecurrenceKind(Kind) &&
+             "Only min/max recurrences allowed for inloop reductions");
        if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind))
          RecipeBuilder.recordRecipeOf(cast<Instruction>(R->getOperand(0)));
      }
@@ -9566,6 +9586,8 @@ void LoopVectorizationPlanner::adjustRecipesForReductions(
  
        VPValue *ChainOp = Plan->getVPValue(Chain);
        unsigned FirstOpId;
+      assert(!RecurrenceDescriptor::isSelectCmpRecurrenceKind(Kind) &&
+             "Only min/max recurrences allowed for inloop reductions");
        if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) {
          assert(isa<VPWidenSelectRecipe>(WidenRecipe) &&
                 "Expected to replace a VPWidenSelectSC");
@@ -9738,10 +9760,10 @@ void VPReductionRecipe::execute(VPTransformState &State) {
      if (VPValue *Cond = getCondOp()) {
        Value *NewCond = State.get(Cond, Part);
        VectorType *VecTy = cast<VectorType>(NewVecOp->getType());
-      Constant *Iden = RecurrenceDescriptor::getRecurrenceIdentity(
+      Value *Iden = RdxDesc->getRecurrenceIdentity(
            Kind, VecTy->getElementType(), RdxDesc->getFastMathFlags());
-      Constant *IdenVec =
-          ConstantVector::getSplat(VecTy->getElementCount(), Iden);
+      Value *IdenVec =
+          State.Builder.CreateVectorSplat(VecTy->getElementCount(), Iden);
        Value *Select = State.Builder.CreateSelect(NewCond, NewVecOp, IdenVec);
        NewVecOp = Select;
      }
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp

index fa71b1c..d5bb948 100644 (file)
--- a/llvm/lib/Transforms/Vectorize/VPlan.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp
@@ -1336,7 +1336,8 @@ void VPReductionPHIRecipe::execute(VPTransformState &State) {
  
    Value *Iden = nullptr;
    RecurKind RK = RdxDesc.getRecurrenceKind();
-  if (RecurrenceDescriptor::isMinMaxRecurrenceKind(RK)) {
+  if (RecurrenceDescriptor::isMinMaxRecurrenceKind(RK) ||
+      RecurrenceDescriptor::isSelectCmpRecurrenceKind(RK)) {
      // MinMax reduction have the start value as their identify.
      if (ScalarPHI) {
        Iden = StartV;
@@ -1347,12 +1348,11 @@ void VPReductionPHIRecipe::execute(VPTransformState &State) {
            Builder.CreateVectorSplat(State.VF, StartV, "minmax.ident");
      }
    } else {
-    Constant *IdenC = RecurrenceDescriptor::getRecurrenceIdentity(
-        RK, VecTy->getScalarType(), RdxDesc.getFastMathFlags());
-    Iden = IdenC;
+    Iden = RdxDesc.getRecurrenceIdentity(RK, VecTy->getScalarType(),
+                                         RdxDesc.getFastMathFlags());
  
      if (!ScalarPHI) {
-      Iden = ConstantVector::getSplat(State.VF, IdenC);
+      Iden = Builder.CreateVectorSplat(State.VF, Iden);
        IRBuilderBase::InsertPointGuard IPBuilder(Builder);
        Builder.SetInsertPoint(State.CFG.VectorPreHeader->getTerminator());
        Constant *Zero = Builder.getInt32(0);
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-select-cmp.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-select-cmp.ll

new file mode 100644 (file)

index 0000000..a8285cf
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-select-cmp.ll
@@ -0,0 +1,204 @@
+; RUN: opt -loop-vectorize -scalable-vectorization=preferred -force-vector-interleave=1 -force-vector-width=4 -S < %s | FileCheck %s --check-prefix=CHECK-VF4IC1
+; RUN: opt -loop-vectorize -scalable-vectorization=preferred -force-vector-interleave=4 -force-vector-width=4 -S < %s | FileCheck %s --check-prefix=CHECK-VF4IC4
+
+target triple = "aarch64-linux-gnu"
+
+define i32 @select_const_i32_from_icmp(i32* nocapture readonly %v, i64 %n) #0 {
+; CHECK-VF4IC1-LABEL: @select_const_i32_from_icmp
+; CHECK-VF4IC1:      vector.body:
+; CHECK-VF4IC1:        [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 3, i32 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer), %vector.ph ], [ [[VEC_SEL:%.*]], %vector.body ]
+; CHECK-VF4IC1:        [[VEC_LOAD:%.*]] = load <vscale x 4 x i32>
+; CHECK-VF4IC1-NEXT:   [[VEC_ICMP:%.*]] = icmp eq <vscale x 4 x i32> [[VEC_LOAD]], shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 3, i32 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
+; CHECK-VF4IC1-NEXT:   [[VEC_SEL]] = select <vscale x 4 x i1> [[VEC_ICMP]], <vscale x 4 x i32> [[VEC_PHI]], <vscale x 4 x i32> shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 7, i32 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
+; CHECK-VF4IC1:      middle.block:
+; CHECK-VF4IC1-NEXT:   [[FIN_ICMP:%.*]] = icmp ne <vscale x 4 x i32> [[VEC_SEL]], shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 3, i32 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
+; CHECK-VF4IC1-NEXT:   [[OR_RDX:%.*]] = call i1 @llvm.vector.reduce.or.nxv4i1(<vscale x 4 x i1> [[FIN_ICMP]])
+; CHECK-VF4IC1-NEXT:   {{.*}} = select i1 [[OR_RDX]], i32 7, i32 3
+
+; CHECK-VF4IC4-LABEL: @select_const_i32_from_icmp
+; CHECK-VF4IC4:      vector.body:
+; CHECK-VF4IC4:        [[VEC_PHI1:%.*]] = phi <vscale x 4 x i32> [ shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 3, i32 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer), %vector.ph ], [ [[VEC_SEL1:%.*]], %vector.body ]
+; CHECK-VF4IC4:        [[VEC_PHI2:%.*]] = phi <vscale x 4 x i32> [ shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 3, i32 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer), %vector.ph ], [ [[VEC_SEL2:%.*]], %vector.body ]
+; CHECK-VF4IC4:        [[VEC_PHI3:%.*]] = phi <vscale x 4 x i32> [ shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 3, i32 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer), %vector.ph ], [ [[VEC_SEL3:%.*]], %vector.body ]
+; CHECK-VF4IC4:        [[VEC_PHI4:%.*]] = phi <vscale x 4 x i32> [ shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 3, i32 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer), %vector.ph ], [ [[VEC_SEL4:%.*]], %vector.body ]
+; CHECK-VF4IC4:        [[VEC_ICMP1:%.*]] = icmp eq <vscale x 4 x i32> {{.*}}, shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 3, i32 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
+; CHECK-VF4IC4-NEXT:   [[VEC_ICMP2:%.*]] = icmp eq <vscale x 4 x i32> {{.*}}, shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 3, i32 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
+; CHECK-VF4IC4-NEXT:   [[VEC_ICMP3:%.*]] = icmp eq <vscale x 4 x i32> {{.*}}, shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 3, i32 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
+; CHECK-VF4IC4-NEXT:   [[VEC_ICMP4:%.*]] = icmp eq <vscale x 4 x i32> {{.*}}, shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 3, i32 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
+; CHECK-VF4IC4-NEXT:   [[VEC_SEL1]] = select <vscale x 4 x i1> [[VEC_ICMP1]], <vscale x 4 x i32> [[VEC_PHI1]], <vscale x 4 x i32> shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 7, i32 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
+; CHECK-VF4IC4-NEXT:   [[VEC_SEL2]] = select <vscale x 4 x i1> [[VEC_ICMP2]], <vscale x 4 x i32> [[VEC_PHI2]], <vscale x 4 x i32> shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 7, i32 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
+; CHECK-VF4IC4-NEXT:   [[VEC_SEL3]] = select <vscale x 4 x i1> [[VEC_ICMP3]], <vscale x 4 x i32> [[VEC_PHI3]], <vscale x 4 x i32> shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 7, i32 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
+; CHECK-VF4IC4-NEXT:   [[VEC_SEL4]] = select <vscale x 4 x i1> [[VEC_ICMP4]], <vscale x 4 x i32> [[VEC_PHI4]], <vscale x 4 x i32> shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 7, i32 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
+; CHECK-VF4IC4:      middle.block:
+; CHECK-VF4IC4-NEXT:   [[VEC_ICMP5:%.*]] = icmp ne <vscale x 4 x i32> [[VEC_SEL1]], shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 3, i32 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
+; CHECK-VF4IC4-NEXT:   [[VEC_SEL5:%.*]] = select <vscale x 4 x i1> [[VEC_ICMP5]], <vscale x 4 x i32> [[VEC_SEL1]], <vscale x 4 x i32> [[VEC_SEL2]]
+; CHECK-VF4IC4-NEXT:   [[VEC_ICMP6:%.*]] = icmp ne <vscale x 4 x i32> [[VEC_SEL5]], shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 3, i32 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
+; CHECK-VF4IC4-NEXT:   [[VEC_SEL6:%.*]] = select <vscale x 4 x i1> [[VEC_ICMP6]], <vscale x 4 x i32> [[VEC_SEL5]], <vscale x 4 x i32> [[VEC_SEL3]]
+; CHECK-VF4IC4-NEXT:   [[VEC_ICMP7:%.*]] = icmp ne <vscale x 4 x i32> [[VEC_SEL6]], shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 3, i32 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
+; CHECK-VF4IC4-NEXT:   [[VEC_SEL7:%.*]] = select <vscale x 4 x i1> [[VEC_ICMP7]], <vscale x 4 x i32> [[VEC_SEL6]], <vscale x 4 x i32> [[VEC_SEL4]]
+; CHECK-VF4IC4-NEXT:   [[FIN_ICMP:%.*]] = icmp ne <vscale x 4 x i32> [[VEC_SEL7]], shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 3, i32 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
+; CHECK-VF4IC4-NEXT:   [[OR_RDX:%.*]] = call i1 @llvm.vector.reduce.or.nxv4i1(<vscale x 4 x i1> [[FIN_ICMP]])
+; CHECK-VF4IC4-NEXT:   {{.*}} = select i1 [[OR_RDX]], i32 7, i32 3
+entry:
+  br label %for.body
+
+for.body:                                      ; preds = %entry, %for.body
+  %0 = phi i64 [ 0, %entry ], [ %6, %for.body ]
+  %1 = phi i32 [ 3, %entry ], [ %5, %for.body ]
+  %2 = getelementptr inbounds i32, i32* %v, i64 %0
+  %3 = load i32, i32* %2, align 4
+  %4 = icmp eq i32 %3, 3
+  %5 = select i1 %4, i32 %1, i32 7
+  %6 = add nuw nsw i64 %0, 1
+  %7 = icmp eq i64 %6, %n
+  br i1 %7, label %exit, label %for.body, !llvm.loop !0
+
+exit:                                     ; preds = %for.body
+  ret i32 %5
+}
+
+define i32 @select_i32_from_icmp(i32* nocapture readonly %v, i32 %a, i32 %b, i64 %n) #0 {
+; CHECK-VF4IC1-LABEL: @select_i32_from_icmp
+; CHECK-VF4IC1:      vector.ph:
+; CHECK-VF4IC1:        [[TMP1:%.*]] = insertelement <vscale x 4 x i32> poison, i32 %a, i32 0
+; CHECK-VF4IC1-NEXT:   [[SPLAT_OF_A:%.*]] = shufflevector <vscale x 4 x i32> [[TMP1]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
+; CHECK-VF4IC1-NEXT:   [[TMP2:%.*]] = insertelement <vscale x 4 x i32> poison, i32 %b, i32 0
+; CHECK-VF4IC1-NEXT:   [[SPLAT_OF_B:%.*]] = shufflevector <vscale x 4 x i32> [[TMP2]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
+; CHECK-VF4IC1:      vector.body:
+; CHECK-VF4IC1:        [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ [[SPLAT_OF_A]], %vector.ph ], [ [[VEC_SEL:%.*]], %vector.body ]
+; CHECK-VF4IC1:        [[VEC_LOAD:%.*]] = load <vscale x 4 x i32>
+; CHECK-VF4IC1-NEXT:   [[VEC_ICMP:%.*]] = icmp eq <vscale x 4 x i32> [[VEC_LOAD]], shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 3, i32 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
+; CHECK-VF4IC1-NEXT:   [[VEC_SEL]] = select <vscale x 4 x i1> [[VEC_ICMP]], <vscale x 4 x i32> [[VEC_PHI]], <vscale x 4 x i32> [[SPLAT_OF_B]]
+; CHECK-VF4IC1:      middle.block:
+; CHECK-VF4IC1-NEXT:   [[FIN_INS:%.*]] = insertelement <vscale x 4 x i32> poison, i32 %a, i32 0
+; CHECK-VF4IC1-NEXT:   [[FIN_SPLAT:%.*]] = shufflevector <vscale x 4 x i32> [[FIN_INS]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
+; CHECK-VF4IC1-NEXT:   [[FIN_CMP:%.*]] = icmp ne <vscale x 4 x i32> [[VEC_SEL]], [[FIN_SPLAT]]
+; CHECK-VF4IC1-NEXT:   [[OR_RDX:%.*]] = call i1 @llvm.vector.reduce.or.nxv4i1(<vscale x 4 x i1> [[FIN_CMP]])
+; CHECK-VF4IC1-NEXT:   {{.*}} = select i1 [[OR_RDX]], i32 %b, i32 %a
+
+; CHECK-VF4IC4-LABEL: @select_i32_from_icmp
+; CHECK-VF4IC4:      vector.body:
+entry:
+  br label %for.body
+
+for.body:                                      ; preds = %entry, %for.body
+  %0 = phi i64 [ 0, %entry ], [ %6, %for.body ]
+  %1 = phi i32 [ %a, %entry ], [ %5, %for.body ]
+  %2 = getelementptr inbounds i32, i32* %v, i64 %0
+  %3 = load i32, i32* %2, align 4
+  %4 = icmp eq i32 %3, 3
+  %5 = select i1 %4, i32 %1, i32 %b
+  %6 = add nuw nsw i64 %0, 1
+  %7 = icmp eq i64 %6, %n
+  br i1 %7, label %exit, label %for.body, !llvm.loop !0
+
+exit:                                     ; preds = %for.body
+  ret i32 %5
+}
+
+define i32 @select_const_i32_from_fcmp(float* nocapture readonly %v, i64 %n) #0 {
+; CHECK-VF4IC1-LABEL: @select_const_i32_from_fcmp
+; CHECK-VF4IC1:      vector.body:
+; CHECK-VF4IC1:        [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 2, i32 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer), %vector.ph ], [ [[VEC_SEL:%.*]], %vector.body ]
+; CHECK-VF4IC1:        [[VEC_LOAD:%.*]] = load <vscale x 4 x float>
+; CHECK-VF4IC1-NEXT:   [[VEC_ICMP:%.*]] = fcmp fast ueq <vscale x 4 x float> [[VEC_LOAD]], shufflevector (<vscale x 4 x float> insertelement (<vscale x 4 x float> poison, float 3.000000e+00, i32 0), <vscale x 4 x float> poison, <vscale x 4 x i32> zeroinitializer)
+; CHECK-VF4IC1-NEXT:   [[VEC_SEL]] = select <vscale x 4 x i1> [[VEC_ICMP]], <vscale x 4 x i32> [[VEC_PHI]], <vscale x 4 x i32> shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 1, i32 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
+; CHECK-VF4IC1:      middle.block:
+; CHECK-VF4IC1-NEXT:   [[FIN_ICMP:%.*]] = icmp ne <vscale x 4 x i32> [[VEC_SEL]], shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 2, i32 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
+; CHECK-VF4IC1-NEXT:   [[OR_RDX:%.*]] = call i1 @llvm.vector.reduce.or.nxv4i1(<vscale x 4 x i1> [[FIN_ICMP]])
+; CHECK-VF4IC1-NEXT:   {{.*}} = select i1 [[OR_RDX]], i32 1, i32 2
+
+; CHECK-VF4IC4-LABEL: @select_const_i32_from_fcmp
+; CHECK-VF4IC4:      vector.body:
+entry:
+  br label %for.body
+
+for.body:                                      ; preds = %entry, %for.body
+  %0 = phi i64 [ 0, %entry ], [ %6, %for.body ]
+  %1 = phi i32 [ 2, %entry ], [ %5, %for.body ]
+  %2 = getelementptr inbounds float, float* %v, i64 %0
+  %3 = load float, float* %2, align 4
+  %4 = fcmp fast ueq float %3, 3.0
+  %5 = select i1 %4, i32 %1, i32 1
+  %6 = add nuw nsw i64 %0, 1
+  %7 = icmp eq i64 %6, %n
+  br i1 %7, label %exit, label %for.body, !llvm.loop !0
+
+exit:                                     ; preds = %for.body
+  ret i32 %5
+}
+
+define float @select_const_f32_from_icmp(i32* nocapture readonly %v, i64 %n) #0 {
+; CHECK-VF4IC1-LABEL: @select_const_f32_from_icmp
+; CHECK-VF4IC1-NOT: vector.body
+; CHECK-VF4IC4-LABEL: @select_const_f32_from_icmp
+; CHECK-VF4IC4-NOT: vector.body
+entry:
+  br label %for.body
+
+for.body:                                      ; preds = %entry, %for.body
+  %0 = phi i64 [ 0, %entry ], [ %6, %for.body ]
+  %1 = phi fast float [ 3.0, %entry ], [ %5, %for.body ]
+  %2 = getelementptr inbounds i32, i32* %v, i64 %0
+  %3 = load i32, i32* %2, align 4
+  %4 = icmp eq i32 %3, 3
+  %5 = select fast i1 %4, float %1, float 7.0
+  %6 = add nuw nsw i64 %0, 1
+  %7 = icmp eq i64 %6, %n
+  br i1 %7, label %exit, label %for.body, !llvm.loop !0
+
+exit:                                     ; preds = %for.body
+  ret float %5
+}
+
+define i32 @pred_select_const_i32_from_icmp(i32* noalias nocapture readonly %src1, i32* noalias nocapture readonly %src2, i64 %n) #0 {
+; CHECK-VF4IC1-LABEL: @pred_select_const_i32_from_icmp
+; CHECK-VF4IC1:      vector.body:
+; CHECK-VF4IC1:        [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 0, i32 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer), %vector.ph ], [ [[VEC_SEL:%.*]], %vector.body ]
+; CHECK-VF4IC1:        [[VEC_LOAD:%.*]] = load <vscale x 4 x i32>
+; CHECK-VF4IC1:        [[MASK:%.*]] = icmp sgt <vscale x 4 x i32> [[VEC_LOAD]], shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 35, i32 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
+; CHECK-VF4IC1:        [[MASKED_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0nxv4i32(<vscale x 4 x i32>* {{%.*}}, i32 4, <vscale x 4 x i1> [[MASK]], <vscale x 4 x i32> poison)
+; CHECK-VF4IC1-NEXT:   [[VEC_ICMP:%.*]] = icmp eq <vscale x 4 x i32> [[MASKED_LOAD]], shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 2, i32 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
+; CHECK-VF4IC1-NEXT:   [[VEC_SEL_TMP:%.*]] = select <vscale x 4 x i1> [[VEC_ICMP]], <vscale x 4 x i32> shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 1, i32 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer), <vscale x 4 x i32> [[VEC_PHI]]
+; CHECK-VF4IC1:        [[VEC_SEL:%.*]] = select <vscale x 4 x i1> [[MASK]], <vscale x 4 x i32> [[VEC_SEL_TMP]], <vscale x 4 x i32> [[VEC_PHI]]
+; CHECK-VF4IC1:      middle.block:
+; CHECK-VF4IC1-NEXT:   [[FIN_ICMP:%.*]] = icmp ne <vscale x 4 x i32> [[VEC_SEL]], shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 0, i32 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
+; CHECK-VF4IC1-NEXT:   [[OR_RDX:%.*]] = call i1 @llvm.vector.reduce.or.nxv4i1(<vscale x 4 x i1> [[FIN_ICMP]])
+; CHECK-VF4IC1-NEXT:   {{.*}} = select i1 [[OR_RDX]], i32 1, i32 0
+
+; CHECK-VF4IC4-LABEL: @pred_select_const_i32_from_icmp
+; CHECK-VF4IC4:      vector.body:
+entry:
+  br label %for.body
+
+for.body:                                         ; preds = %entry, %for.inc
+  %i.013 = phi i64 [ %inc, %for.inc ], [ 0, %entry ]
+  %r.012 = phi i32 [ %r.1, %for.inc ], [ 0, %entry ]
+  %arrayidx = getelementptr inbounds i32, i32* %src1, i64 %i.013
+  %0 = load i32, i32* %arrayidx, align 4
+  %cmp1 = icmp sgt i32 %0, 35
+  br i1 %cmp1, label %if.then, label %for.inc
+
+if.then:                                          ; preds = %for.body
+  %arrayidx2 = getelementptr inbounds i32, i32* %src2, i64 %i.013
+  %1 = load i32, i32* %arrayidx2, align 4
+  %cmp3 = icmp eq i32 %1, 2
+  %spec.select = select i1 %cmp3, i32 1, i32 %r.012
+  br label %for.inc
+
+for.inc:                                          ; preds = %if.then, %for.body
+  %r.1 = phi i32 [ %r.012, %for.body ], [ %spec.select, %if.then ]
+  %inc = add nuw nsw i64 %i.013, 1
+  %exitcond.not = icmp eq i64 %inc, %n
+  br i1 %exitcond.not, label %for.end.loopexit, label %for.body, !llvm.loop !0
+
+for.end.loopexit:                                 ; preds = %for.inc
+  %r.1.lcssa = phi i32 [ %r.1, %for.inc ]
+  ret i32 %r.1.lcssa
+}
+
+
+attributes #0 = { "target-features"="+sve" }
+
+!0 = distinct !{!0, !1}
+!1 = !{!"llvm.loop.vectorize.scalable.enable", i1 true}
diff --git a/llvm/test/Transforms/LoopVectorize/scalable-reduction-inloop.ll b/llvm/test/Transforms/LoopVectorize/scalable-reduction-inloop.ll

index 9d0aa5e..9fcebe0 100644 (file)
--- a/llvm/test/Transforms/LoopVectorize/scalable-reduction-inloop.ll
+++ b/llvm/test/Transforms/LoopVectorize/scalable-reduction-inloop.ll
@@ -7,8 +7,8 @@ define i8 @reduction_add_trunc(i8* noalias nocapture %A) {
  ; CHECK-LABEL: @reduction_add_trunc(
  ; CHECK:       vector.body:
  ; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, %vector.ph ], [ [[INDEX_NEXT:%.*]], %vector.body ]
-; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <vscale x 8 x i32> [ insertelement (<vscale x 8 x i32> zeroinitializer, i32 255, i32 0), %vector.ph ], [ [[TMP34:%.*]], %vector.body ]
-; CHECK-NEXT:    [[VEC_PHI1:%.*]] = phi <vscale x 8 x i32> [ zeroinitializer, %vector.ph ], [ [[TMP36:%.*]], %vector.body ]
+; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <vscale x 8 x i32> [ insertelement (<vscale x 8 x i32> shufflevector (<vscale x 8 x i32> insertelement (<vscale x 8 x i32> poison, i32 0, i32 0), <vscale x 8 x i32> poison, <vscale x 8 x i32> zeroinitializer), i32 255, i32 0), %vector.ph ], [ [[TMP34:%.*]], %vector.body ]
+; CHECK-NEXT:    [[VEC_PHI1:%.*]] = phi <vscale x 8 x i32> [ shufflevector (<vscale x 8 x i32> insertelement (<vscale x 8 x i32> poison, i32 0, i32 0), <vscale x 8 x i32> poison, <vscale x 8 x i32> zeroinitializer), %vector.ph ], [ [[TMP36:%.*]], %vector.body ]
  ; CHECK:         [[TMP14:%.*]] = and <vscale x 8 x i32> [[VEC_PHI]], shufflevector (<vscale x 8 x i32> insertelement (<vscale x 8 x i32> poison, i32 255, i32 0), <vscale x 8 x i32> poison, <vscale x 8 x i32> zeroinitializer)
  ; CHECK-NEXT:    [[TMP15:%.*]] = and <vscale x 8 x i32> [[VEC_PHI1]], shufflevector (<vscale x 8 x i32> insertelement (<vscale x 8 x i32> poison, i32 255, i32 0), <vscale x 8 x i32> poison, <vscale x 8 x i32> zeroinitializer)
  ; CHECK:         [[WIDE_LOAD:%.*]] = load <vscale x 8 x i8>, <vscale x 8 x i8>*
diff --git a/llvm/test/Transforms/LoopVectorize/select-cmp-predicated.ll b/llvm/test/Transforms/LoopVectorize/select-cmp-predicated.ll

new file mode 100644 (file)

index 0000000..4e64e94
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/select-cmp-predicated.ll
@@ -0,0 +1,143 @@
+; RUN: opt -loop-vectorize -force-vector-interleave=1 -force-vector-width=2 -S < %s | FileCheck %s --check-prefix=CHECK-VF2IC1
+; RUN: opt -loop-vectorize -force-vector-interleave=2 -force-vector-width=1 -S < %s | FileCheck %s --check-prefix=CHECK-VF1IC2
+
+define i32 @pred_select_const_i32_from_icmp(i32* noalias nocapture readonly %src1, i32* noalias nocapture readonly %src2, i64 %n) {
+; CHECK-VF2IC1-LABEL: @pred_select_const_i32_from_icmp(
+; CHECK-VF2IC1:       vector.body:
+; CHECK-VF2IC1:         [[VEC_PHI:%.*]] = phi <2 x i32> [ zeroinitializer, %vector.ph ], [ [[PREDPHI:%.*]], %pred.load.continue2 ]
+; CHECK-VF2IC1:         [[WIDE_LOAD:%.*]] = load <2 x i32>, <2 x i32>* {{%.*}}, align 4
+; CHECK-VF2IC1-NEXT:    [[TMP4:%.*]] = icmp sgt <2 x i32> [[WIDE_LOAD]], <i32 35, i32 35>
+; CHECK-VF2IC1-NEXT:    [[TMP5:%.*]] = extractelement <2 x i1> [[TMP4]], i32 0
+; CHECK-VF2IC1-NEXT:    br i1 [[TMP5]], label %pred.load.if, label %pred.load.continue
+; CHECK-VF2IC1:       pred.load.if:
+; CHECK-VF2IC1-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[SRC2:%.*]], i64 {{%.*}}
+; CHECK-VF2IC1-NEXT:    [[TMP7:%.*]] = load i32, i32* [[TMP6]], align 4
+; CHECK-VF2IC1-NEXT:    [[TMP8:%.*]] = insertelement <2 x i32> poison, i32 [[TMP7]], i32 0
+; CHECK-VF2IC1-NEXT:    br label %pred.load.continue
+; CHECK-VF2IC1:       pred.load.continue:
+; CHECK-VF2IC1-NEXT:    [[TMP9:%.*]] = phi <2 x i32> [ poison, %vector.body ], [ [[TMP8]], %pred.load.if ]
+; CHECK-VF2IC1-NEXT:    [[TMP10:%.*]] = extractelement <2 x i1> [[TMP4]], i32 1
+; CHECK-VF2IC1-NEXT:    br i1 [[TMP10]], label %pred.load.if1, label %pred.load.continue2
+; CHECK-VF2IC1:       pred.load.if1:
+; CHECK-VF2IC1:         [[TMP12:%.*]] = getelementptr inbounds i32, i32* [[SRC2]], i64 {{%.*}}
+; CHECK-VF2IC1-NEXT:    [[TMP13:%.*]] = load i32, i32* [[TMP12]], align 4
+; CHECK-VF2IC1-NEXT:    [[TMP14:%.*]] = insertelement <2 x i32> [[TMP9]], i32 [[TMP13]], i32 1
+; CHECK-VF2IC1-NEXT:    br label %pred.load.continue2
+; CHECK-VF2IC1:       pred.load.continue2:
+; CHECK-VF2IC1-NEXT:    [[TMP15:%.*]] = phi <2 x i32> [ [[TMP9]], %pred.load.continue ], [ [[TMP14]], %pred.load.if1 ]
+; CHECK-VF2IC1-NEXT:    [[TMP16:%.*]] = icmp eq <2 x i32> [[TMP15]], <i32 2, i32 2>
+; CHECK-VF2IC1-NEXT:    [[TMP17:%.*]] = select <2 x i1> [[TMP16]], <2 x i32> <i32 1, i32 1>, <2 x i32> [[VEC_PHI]]
+; CHECK-VF2IC1-NEXT:    [[TMP18:%.*]] = xor <2 x i1> [[TMP4]], <i1 true, i1 true>
+; CHECK-VF2IC1-NEXT:    [[PREDPHI]] = select <2 x i1> [[TMP4]], <2 x i32> [[TMP17]], <2 x i32> [[VEC_PHI]]
+; CHECK-VF2IC1:         br i1 {{%.*}}, label %middle.block, label %vector.body
+; CHECK-VF2IC1:       middle.block:
+; CHECK-VF2IC1-NEXT:    [[RDX_SELECT_CMP:%.*]] = icmp ne <2 x i32> [[PREDPHI]], zeroinitializer
+; CHECK-VF2IC1-NEXT:    [[TMP20:%.*]] = call i1 @llvm.vector.reduce.or.v2i1(<2 x i1> [[RDX_SELECT_CMP]])
+; CHECK-VF2IC1-NEXT:    [[RDX_SELECT:%.*]] = select i1 [[TMP20]], i32 1, i32 0
+; CHECK-VF2IC1:       scalar.ph:
+; CHECK-VF2IC1:         [[BC_RESUME_VAL:%.*]] = phi i64 [ {{%.*}}, %middle.block ], [ 0, %entry ]
+; CHECK-VF2IC1-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, %entry ], [ [[RDX_SELECT]], %middle.block ]
+; CHECK-VF2IC1-NEXT:    br label %for.body
+; CHECK-VF2IC1:       for.body:
+; CHECK-VF2IC1:         [[R_012:%.*]] = phi i32 [ [[R_1:%.*]], %for.inc ], [ [[BC_MERGE_RDX]], %scalar.ph ]
+; CHECK-VF2IC1:         [[TMP21:%.*]] = load i32, i32* {{%.*}}, align 4
+; CHECK-VF2IC1-NEXT:    [[CMP1:%.*]] = icmp sgt i32 [[TMP21]], 35
+; CHECK-VF2IC1-NEXT:    br i1 [[CMP1]], label %if.then, label %for.inc
+; CHECK-VF2IC1:       if.then:
+; CHECK-VF2IC1:         [[TMP22:%.*]] = load i32, i32* {{%.*}}, align 4
+; CHECK-VF2IC1-NEXT:    [[CMP3:%.*]] = icmp eq i32 [[TMP22]], 2
+; CHECK-VF2IC1-NEXT:    [[SPEC_SELECT:%.*]] = select i1 [[CMP3]], i32 1, i32 [[R_012]]
+; CHECK-VF2IC1-NEXT:    br label %for.inc
+; CHECK-VF2IC1:       for.inc:
+; CHECK-VF2IC1-NEXT:    [[R_1]] = phi i32 [ [[R_012]], %for.body ], [ [[SPEC_SELECT]], %if.then ]
+; CHECK-VF2IC1:       for.end.loopexit:
+; CHECK-VF2IC1-NEXT:    [[R_1_LCSSA:%.*]] = phi i32 [ [[R_1]], %for.inc ], [ [[RDX_SELECT]], %middle.block ]
+; CHECK-VF2IC1-NEXT:    ret i32 [[R_1_LCSSA]]
+;
+; CHECK-VF1IC2-LABEL: @pred_select_const_i32_from_icmp(
+; CHECK-VF1IC2:       vector.body:
+; CHECK-VF1IC2:         [[VEC_PHI:%.*]] = phi i32 [ 0, %vector.ph ], [ [[PREDPHI:%.*]], %pred.load.continue4 ]
+; CHECK-VF1IC2-NEXT:    [[VEC_PHI2:%.*]] = phi i32 [ 0, %vector.ph ], [ [[PREDPHI5:%.*]], %pred.load.continue4 ]
+; CHECK-VF1IC2:         [[TMP0:%.*]] = getelementptr inbounds i32, i32* [[SRC1:%.*]], i64 {{%.*}}
+; CHECK-VF1IC2-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i32, i32* [[SRC1]], i64 {{%.*}}
+; CHECK-VF1IC2-NEXT:    [[TMP2:%.*]] = load i32, i32* [[TMP0]], align 4
+; CHECK-VF1IC2-NEXT:    [[TMP3:%.*]] = load i32, i32* [[TMP1]], align 4
+; CHECK-VF1IC2-NEXT:    [[TMP4:%.*]] = icmp sgt i32 [[TMP2]], 35
+; CHECK-VF1IC2-NEXT:    [[TMP5:%.*]] = icmp sgt i32 [[TMP3]], 35
+; CHECK-VF1IC2-NEXT:    br i1 [[TMP4]], label %pred.load.if, label %pred.load.continue
+; CHECK-VF1IC2:       pred.load.if:
+; CHECK-VF1IC2-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[SRC2:%.*]], i64 {{%.*}}
+; CHECK-VF1IC2-NEXT:    [[TMP7:%.*]] = load i32, i32* [[TMP6]], align 4
+; CHECK-VF1IC2-NEXT:    br label %pred.load.continue
+; CHECK-VF1IC2:       pred.load.continue:
+; CHECK-VF1IC2-NEXT:    [[TMP8:%.*]] = phi i32 [ poison, %vector.body ], [ [[TMP7]], %pred.load.if ]
+; CHECK-VF1IC2-NEXT:    br i1 [[TMP5]], label %pred.load.if3, label %pred.load.continue4
+; CHECK-VF1IC2:       pred.load.if3:
+; CHECK-VF1IC2-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i32, i32* [[SRC2]], i64 {{%.*}}
+; CHECK-VF1IC2-NEXT:    [[TMP10:%.*]] = load i32, i32* [[TMP9]], align 4
+; CHECK-VF1IC2-NEXT:    br label %pred.load.continue4
+; CHECK-VF1IC2:       pred.load.continue4:
+; CHECK-VF1IC2-NEXT:    [[TMP11:%.*]] = phi i32 [ poison, %pred.load.continue ], [ [[TMP10]], %pred.load.if3 ]
+; CHECK-VF1IC2-NEXT:    [[TMP12:%.*]] = icmp eq i32 [[TMP8]], 2
+; CHECK-VF1IC2-NEXT:    [[TMP13:%.*]] = icmp eq i32 [[TMP11]], 2
+; CHECK-VF1IC2-NEXT:    [[TMP14:%.*]] = select i1 [[TMP12]], i32 1, i32 [[VEC_PHI]]
+; CHECK-VF1IC2-NEXT:    [[TMP15:%.*]] = select i1 [[TMP13]], i32 1, i32 [[VEC_PHI2]]
+; CHECK-VF1IC2-NEXT:    [[TMP16:%.*]] = xor i1 [[TMP4]], true
+; CHECK-VF1IC2-NEXT:    [[TMP17:%.*]] = xor i1 [[TMP5]], true
+; CHECK-VF1IC2-NEXT:    [[PREDPHI]] = select i1 [[TMP4]], i32 [[TMP14]], i32 [[VEC_PHI]]
+; CHECK-VF1IC2-NEXT:    [[PREDPHI5]] = select i1 [[TMP5]], i32 [[TMP15]], i32 [[VEC_PHI2]]
+; CHECK-VF1IC2:         br i1 {{%.*}}, label %middle.block, label %vector.body
+; CHECK-VF1IC2:       middle.block:
+; CHECK-VF1IC2-NEXT:    [[RDX_SELECT_CMP:%.*]] = icmp ne i32 [[PREDPHI]], 0
+; CHECK-VF1IC2-NEXT:    [[RDX_SELECT:%.*]] = select i1 [[RDX_SELECT_CMP]], i32 [[PREDPHI]], i32 [[PREDPHI5]]
+; CHECK-VF1IC2:         br i1 {{%.*}}, label %for.end.loopexit, label %scalar.ph
+; CHECK-VF1IC2:       scalar.ph:
+; CHECK-VF1IC2-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ {{%.*}}, %middle.block ], [ 0, %entry ]
+; CHECK-VF1IC2-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, %entry ], [ [[RDX_SELECT]], %middle.block ]
+; CHECK-VF1IC2-NEXT:    br label %for.body
+; CHECK-VF1IC2:       for.body:
+; CHECK-VF1IC2-NEXT:    [[I_013:%.*]] = phi i64 [ [[INC:%.*]], %for.inc ], [ [[BC_RESUME_VAL]], %scalar.ph ]
+; CHECK-VF1IC2-NEXT:    [[R_012:%.*]] = phi i32 [ [[R_1:%.*]], %for.inc ], [ [[BC_MERGE_RDX]], %scalar.ph ]
+; CHECK-VF1IC2:         [[TMP19:%.*]] = load i32, i32* {{%.*}}, align 4
+; CHECK-VF1IC2-NEXT:    [[CMP1:%.*]] = icmp sgt i32 [[TMP19]], 35
+; CHECK-VF1IC2-NEXT:    br i1 [[CMP1]], label [[IF_THEN:%.*]], label %for.inc
+; CHECK-VF1IC2:       if.then:
+; CHECK-VF1IC2:         [[TMP20:%.*]] = load i32, i32* {{%.*}}, align 4
+; CHECK-VF1IC2-NEXT:    [[CMP3:%.*]] = icmp eq i32 [[TMP20]], 2
+; CHECK-VF1IC2-NEXT:    [[SPEC_SELECT:%.*]] = select i1 [[CMP3]], i32 1, i32 [[R_012]]
+; CHECK-VF1IC2-NEXT:    br label %for.inc
+; CHECK-VF1IC2:       for.inc:
+; CHECK-VF1IC2-NEXT:    [[R_1]] = phi i32 [ [[R_012]], %for.body ], [ [[SPEC_SELECT]], %if.then ]
+; CHECK-VF1IC2:         br i1 {{%.*}}, label %for.end.loopexit, label %for.body
+; CHECK-VF1IC2:       for.end.loopexit:
+; CHECK-VF1IC2-NEXT:    [[R_1_LCSSA:%.*]] = phi i32 [ [[R_1]], %for.inc ], [ [[RDX_SELECT]], %middle.block ]
+; CHECK-VF1IC2-NEXT:    ret i32 [[R_1_LCSSA]]
+;
+entry:
+  br label %for.body
+
+for.body:                                         ; preds = %entry, %for.inc
+  %i.013 = phi i64 [ %inc, %for.inc ], [ 0, %entry ]
+  %r.012 = phi i32 [ %r.1, %for.inc ], [ 0, %entry ]
+  %arrayidx = getelementptr inbounds i32, i32* %src1, i64 %i.013
+  %0 = load i32, i32* %arrayidx, align 4
+  %cmp1 = icmp sgt i32 %0, 35
+  br i1 %cmp1, label %if.then, label %for.inc
+
+if.then:                                          ; preds = %for.body
+  %arrayidx2 = getelementptr inbounds i32, i32* %src2, i64 %i.013
+  %1 = load i32, i32* %arrayidx2, align 4
+  %cmp3 = icmp eq i32 %1, 2
+  %spec.select = select i1 %cmp3, i32 1, i32 %r.012
+  br label %for.inc
+
+for.inc:                                          ; preds = %if.then, %for.body
+  %r.1 = phi i32 [ %r.012, %for.body ], [ %spec.select, %if.then ]
+  %inc = add nuw nsw i64 %i.013, 1
+  %exitcond.not = icmp eq i64 %inc, %n
+  br i1 %exitcond.not, label %for.end.loopexit, label %for.body
+
+for.end.loopexit:                                 ; preds = %for.inc
+  %r.1.lcssa = phi i32 [ %r.1, %for.inc ]
+  ret i32 %r.1.lcssa
+}
diff --git a/llvm/test/Transforms/LoopVectorize/select-cmp.ll b/llvm/test/Transforms/LoopVectorize/select-cmp.ll

new file mode 100644 (file)

index 0000000..f4f129e
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/select-cmp.ll
@@ -0,0 +1,345 @@
+; RUN: opt -loop-vectorize -force-vector-interleave=1 -force-vector-width=4 -S < %s | FileCheck %s --check-prefix=CHECK-VF4IC1 --check-prefix=CHECK
+; RUN: opt -loop-vectorize -force-vector-interleave=4 -force-vector-width=4 -S < %s | FileCheck %s --check-prefix=CHECK-VF4IC4 --check-prefix=CHECK
+; RUN: opt -loop-vectorize -force-vector-interleave=4 -force-vector-width=1 -S < %s | FileCheck %s --check-prefix=CHECK-VF1IC4 --check-prefix=CHECK
+
+; select(icmp, phi, 7) reduction: the reduction PHI is the select's true value.
+define i32 @select_const_i32_from_icmp(i32* nocapture readonly %v, i64 %n) {
+; CHECK-LABEL: @select_const_i32_from_icmp
+; CHECK-VF4IC1:      vector.body:
+; CHECK-VF4IC1:        [[VEC_PHI:%.*]] = phi <4 x i32> [ <i32 3, i32 3, i32 3, i32 3>, %vector.ph ], [ [[VEC_SEL:%.*]], %vector.body ]
+; CHECK-VF4IC1:        [[VEC_LOAD:%.*]] = load <4 x i32>
+; CHECK-VF4IC1-NEXT:   [[VEC_ICMP:%.*]] = icmp eq <4 x i32> [[VEC_LOAD]], <i32 3, i32 3, i32 3, i32 3>
+; CHECK-VF4IC1-NEXT:   [[VEC_SEL]] = select <4 x i1> [[VEC_ICMP]], <4 x i32> [[VEC_PHI]], <4 x i32> <i32 7, i32 7, i32 7, i32 7>
+; CHECK-VF4IC1:      middle.block:
+; CHECK-VF4IC1-NEXT:   [[FIN_ICMP:%.*]] = icmp ne <4 x i32> [[VEC_SEL]], <i32 3, i32 3, i32 3, i32 3>
+; CHECK-VF4IC1-NEXT:   [[OR_RDX:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[FIN_ICMP]])
+; CHECK-VF4IC1-NEXT:   {{.*}} = select i1 [[OR_RDX]], i32 7, i32 3
+
+; CHECK-VF4IC4:      vector.body:
+; CHECK-VF4IC4:        [[VEC_PHI1:%.*]] = phi <4 x i32> [ <i32 3, i32 3, i32 3, i32 3>, %vector.ph ], [ [[VEC_SEL1:%.*]], %vector.body ]
+; CHECK-VF4IC4-NEXT:   [[VEC_PHI2:%.*]] = phi <4 x i32> [ <i32 3, i32 3, i32 3, i32 3>, %vector.ph ], [ [[VEC_SEL2:%.*]], %vector.body ]
+; CHECK-VF4IC4-NEXT:   [[VEC_PHI3:%.*]] = phi <4 x i32> [ <i32 3, i32 3, i32 3, i32 3>, %vector.ph ], [ [[VEC_SEL3:%.*]], %vector.body ]
+; CHECK-VF4IC4-NEXT:   [[VEC_PHI4:%.*]] = phi <4 x i32> [ <i32 3, i32 3, i32 3, i32 3>, %vector.ph ], [ [[VEC_SEL4:%.*]], %vector.body ]
+; CHECK-VF4IC4:        [[VEC_ICMP1:%.*]] = icmp eq <4 x i32> {{.*}}, <i32 3, i32 3, i32 3, i32 3>
+; CHECK-VF4IC4-NEXT:   [[VEC_ICMP2:%.*]] = icmp eq <4 x i32> {{.*}}, <i32 3, i32 3, i32 3, i32 3>
+; CHECK-VF4IC4-NEXT:   [[VEC_ICMP3:%.*]] = icmp eq <4 x i32> {{.*}}, <i32 3, i32 3, i32 3, i32 3>
+; CHECK-VF4IC4-NEXT:   [[VEC_ICMP4:%.*]] = icmp eq <4 x i32> {{.*}}, <i32 3, i32 3, i32 3, i32 3>
+; CHECK-VF4IC4-NEXT:   [[VEC_SEL1]] = select <4 x i1> [[VEC_ICMP1]], <4 x i32> [[VEC_PHI1]], <4 x i32> <i32 7, i32 7, i32 7, i32 7>
+; CHECK-VF4IC4-NEXT:   [[VEC_SEL2]] = select <4 x i1> [[VEC_ICMP2]], <4 x i32> [[VEC_PHI2]], <4 x i32> <i32 7, i32 7, i32 7, i32 7>
+; CHECK-VF4IC4-NEXT:   [[VEC_SEL3]] = select <4 x i1> [[VEC_ICMP3]], <4 x i32> [[VEC_PHI3]], <4 x i32> <i32 7, i32 7, i32 7, i32 7>
+; CHECK-VF4IC4-NEXT:   [[VEC_SEL4]] = select <4 x i1> [[VEC_ICMP4]], <4 x i32> [[VEC_PHI4]], <4 x i32> <i32 7, i32 7, i32 7, i32 7>
+; CHECK-VF4IC4:      middle.block:
+; CHECK-VF4IC4-NEXT:   [[VEC_ICMP5:%.*]] = icmp ne <4 x i32> [[VEC_SEL1]], <i32 3, i32 3, i32 3, i32 3>
+; CHECK-VF4IC4-NEXT:   [[VEC_SEL5:%.*]] = select <4 x i1> [[VEC_ICMP5]], <4 x i32> [[VEC_SEL1]], <4 x i32> [[VEC_SEL2]]
+; CHECK-VF4IC4-NEXT:   [[VEC_ICMP6:%.*]] = icmp ne <4 x i32> [[VEC_SEL5]], <i32 3, i32 3, i32 3, i32 3>
+; CHECK-VF4IC4-NEXT:   [[VEC_SEL6:%.*]] = select <4 x i1> [[VEC_ICMP6]], <4 x i32> [[VEC_SEL5]], <4 x i32> [[VEC_SEL3]]
+; CHECK-VF4IC4-NEXT:   [[VEC_ICMP7:%.*]] = icmp ne <4 x i32> [[VEC_SEL6]], <i32 3, i32 3, i32 3, i32 3>
+; CHECK-VF4IC4-NEXT:   [[VEC_SEL_FIN:%.*]] = select <4 x i1> [[VEC_ICMP7]], <4 x i32> [[VEC_SEL6]], <4 x i32> [[VEC_SEL4]]
+; CHECK-VF4IC4-NEXT:   [[FIN_ICMP:%.*]] = icmp ne <4 x i32> [[VEC_SEL_FIN]], <i32 3, i32 3, i32 3, i32 3>
+; CHECK-VF4IC4-NEXT:   [[OR_RDX:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[FIN_ICMP]])
+; CHECK-VF4IC4-NEXT:   {{.*}} = select i1 [[OR_RDX]], i32 7, i32 3
+
+; CHECK-VF1IC4:      vector.body:
+; CHECK-VF1IC4:        [[VEC_PHI1:%.*]] = phi i32 [ 3, %vector.ph ], [ [[VEC_SEL1:%.*]], %vector.body ]
+; CHECK-VF1IC4-NEXT:   [[VEC_PHI2:%.*]] = phi i32 [ 3, %vector.ph ], [ [[VEC_SEL2:%.*]], %vector.body ]
+; CHECK-VF1IC4-NEXT:   [[VEC_PHI3:%.*]] = phi i32 [ 3, %vector.ph ], [ [[VEC_SEL3:%.*]], %vector.body ]
+; CHECK-VF1IC4-NEXT:   [[VEC_PHI4:%.*]] = phi i32 [ 3, %vector.ph ], [ [[VEC_SEL4:%.*]], %vector.body ]
+; CHECK-VF1IC4:        [[VEC_LOAD1:%.*]] = load i32
+; CHECK-VF1IC4-NEXT:   [[VEC_LOAD2:%.*]] = load i32
+; CHECK-VF1IC4-NEXT:   [[VEC_LOAD3:%.*]] = load i32
+; CHECK-VF1IC4-NEXT:   [[VEC_LOAD4:%.*]] = load i32
+; CHECK-VF1IC4-NEXT:   [[VEC_ICMP1:%.*]] = icmp eq i32 [[VEC_LOAD1]], 3
+; CHECK-VF1IC4-NEXT:   [[VEC_ICMP2:%.*]] = icmp eq i32 [[VEC_LOAD2]], 3
+; CHECK-VF1IC4-NEXT:   [[VEC_ICMP3:%.*]] = icmp eq i32 [[VEC_LOAD3]], 3
+; CHECK-VF1IC4-NEXT:   [[VEC_ICMP4:%.*]] = icmp eq i32 [[VEC_LOAD4]], 3
+; CHECK-VF1IC4-NEXT:   [[VEC_SEL1]] = select i1 [[VEC_ICMP1]], i32 [[VEC_PHI1]], i32 7
+; CHECK-VF1IC4-NEXT:   [[VEC_SEL2]] = select i1 [[VEC_ICMP2]], i32 [[VEC_PHI2]], i32 7
+; CHECK-VF1IC4-NEXT:   [[VEC_SEL3]] = select i1 [[VEC_ICMP3]], i32 [[VEC_PHI3]], i32 7
+; CHECK-VF1IC4-NEXT:   [[VEC_SEL4]] = select i1 [[VEC_ICMP4]], i32 [[VEC_PHI4]], i32 7
+; CHECK-VF1IC4:      middle.block:
+; CHECK-VF1IC4-NEXT:   [[VEC_ICMP5:%.*]] = icmp ne i32 [[VEC_SEL1]], 3
+; CHECK-VF1IC4-NEXT:   [[VEC_SEL5:%.*]] = select i1 [[VEC_ICMP5]], i32 [[VEC_SEL1]], i32 [[VEC_SEL2]]
+; CHECK-VF1IC4-NEXT:   [[VEC_ICMP6:%.*]] = icmp ne i32 [[VEC_SEL5]], 3
+; CHECK-VF1IC4-NEXT:   [[VEC_SEL6:%.*]] = select i1 [[VEC_ICMP6]], i32 [[VEC_SEL5]], i32 [[VEC_SEL3]]
+; CHECK-VF1IC4-NEXT:   [[VEC_ICMP7:%.*]] = icmp ne i32 [[VEC_SEL6]], 3
+; CHECK-VF1IC4-NEXT:   {{.*}} = select i1 [[VEC_ICMP7]], i32 [[VEC_SEL6]], i32 [[VEC_SEL4]]
+
+entry:
+  br label %for.body
+
+for.body:                                      ; preds = %entry, %for.body
+  %0 = phi i64 [ 0, %entry ], [ %6, %for.body ]
+  %1 = phi i32 [ 3, %entry ], [ %5, %for.body ]
+  %2 = getelementptr inbounds i32, i32* %v, i64 %0
+  %3 = load i32, i32* %2, align 4
+  %4 = icmp eq i32 %3, 3
+  %5 = select i1 %4, i32 %1, i32 7
+  %6 = add nuw nsw i64 %0, 1
+  %7 = icmp eq i64 %6, %n
+  br i1 %7, label %exit, label %for.body
+
+exit:                                     ; preds = %for.body
+  ret i32 %5
+}
+
+; select(icmp, 7, phi) reduction: the reduction PHI is the select's false value.
+define i32 @select_const_i32_from_icmp2(i32* nocapture readonly %v, i64 %n) {
+; CHECK-LABEL: @select_const_i32_from_icmp2
+; CHECK-VF4IC1:      vector.body:
+; CHECK-VF4IC1:        [[VEC_PHI:%.*]] = phi <4 x i32> [ <i32 3, i32 3, i32 3, i32 3>, %vector.ph ], [ [[VEC_SEL:%.*]], %vector.body ]
+; CHECK-VF4IC1:        [[VEC_LOAD:%.*]] = load <4 x i32>
+; CHECK-VF4IC1-NEXT:   [[VEC_ICMP:%.*]] = icmp eq <4 x i32> [[VEC_LOAD]], <i32 3, i32 3, i32 3, i32 3>
+; CHECK-VF4IC1-NEXT:   [[VEC_SEL]] = select <4 x i1> [[VEC_ICMP]], <4 x i32> <i32 7, i32 7, i32 7, i32 7>, <4 x i32> [[VEC_PHI]]
+; CHECK-VF4IC1:      middle.block:
+; CHECK-VF4IC1-NEXT:   [[FIN_ICMP:%.*]] = icmp ne <4 x i32> [[VEC_SEL]], <i32 3, i32 3, i32 3, i32 3>
+; CHECK-VF4IC1-NEXT:   [[OR_RDX:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[FIN_ICMP]])
+; CHECK-VF4IC1-NEXT:   {{.*}} = select i1 [[OR_RDX]], i32 7, i32 3
+
+entry:
+  br label %for.body
+
+for.body:                                      ; preds = %entry, %for.body
+  %0 = phi i64 [ 0, %entry ], [ %6, %for.body ]
+  %1 = phi i32 [ 3, %entry ], [ %5, %for.body ]
+  %2 = getelementptr inbounds i32, i32* %v, i64 %0
+  %3 = load i32, i32* %2, align 4
+  %4 = icmp eq i32 %3, 3
+  %5 = select i1 %4, i32 7, i32 %1
+  %6 = add nuw nsw i64 %0, 1
+  %7 = icmp eq i64 %6, %n
+  br i1 %7, label %exit, label %for.body
+
+exit:                                     ; preds = %for.body
+  ret i32 %5
+}
+
+; select(icmp, phi, %b) reduction whose start value %a and %b are arguments.
+define i32 @select_i32_from_icmp(i32* nocapture readonly %v, i32 %a, i32 %b, i64 %n) {
+; CHECK-LABEL: @select_i32_from_icmp
+; CHECK-VF4IC1:      vector.ph:
+; CHECK-VF4IC1:        [[TMP1:%.*]] = insertelement <4 x i32> poison, i32 %a, i32 0
+; CHECK-VF4IC1-NEXT:   [[SPLAT_OF_A:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> poison, <4 x i32> zeroinitializer
+; CHECK-VF4IC1-NEXT:   [[TMP2:%.*]] = insertelement <4 x i32> poison, i32 %b, i32 0
+; CHECK-VF4IC1-NEXT:   [[SPLAT_OF_B:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> poison, <4 x i32> zeroinitializer
+; CHECK-VF4IC1:      vector.body:
+; CHECK-VF4IC1:        [[VEC_PHI:%.*]] = phi <4 x i32> [ [[SPLAT_OF_A]], %vector.ph ], [ [[VEC_SEL:%.*]], %vector.body ]
+; CHECK-VF4IC1:        [[VEC_LOAD:%.*]] = load <4 x i32>
+; CHECK-VF4IC1-NEXT:   [[VEC_ICMP:%.*]] = icmp eq <4 x i32> [[VEC_LOAD]], <i32 3, i32 3, i32 3, i32 3>
+; CHECK-VF4IC1-NEXT:   [[VEC_SEL]] = select <4 x i1> [[VEC_ICMP]], <4 x i32> [[VEC_PHI]], <4 x i32> [[SPLAT_OF_B]]
+; CHECK-VF4IC1:      middle.block:
+; CHECK-VF4IC1-NEXT:   [[FIN_INS:%.*]] = insertelement <4 x i32> poison, i32 %a, i32 0
+; CHECK-VF4IC1-NEXT:   [[FIN_SPLAT:%.*]] = shufflevector <4 x i32> [[FIN_INS]], <4 x i32> poison, <4 x i32> zeroinitializer
+; CHECK-VF4IC1-NEXT:   [[FIN_CMP:%.*]] = icmp ne <4 x i32> [[VEC_SEL]], [[FIN_SPLAT]]
+; CHECK-VF4IC1-NEXT:   [[OR_RDX:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[FIN_CMP]])
+; CHECK-VF4IC1-NEXT:   {{.*}} = select i1 [[OR_RDX]], i32 %b, i32 %a
+entry:
+  br label %for.body
+
+for.body:                                      ; preds = %entry, %for.body
+  %0 = phi i64 [ 0, %entry ], [ %6, %for.body ]
+  %1 = phi i32 [ %a, %entry ], [ %5, %for.body ]
+  %2 = getelementptr inbounds i32, i32* %v, i64 %0
+  %3 = load i32, i32* %2, align 4
+  %4 = icmp eq i32 %3, 3
+  %5 = select i1 %4, i32 %1, i32 %b
+  %6 = add nuw nsw i64 %0, 1
+  %7 = icmp eq i64 %6, %n
+  br i1 %7, label %exit, label %for.body
+
+exit:                                     ; preds = %for.body
+  ret i32 %5
+}
+
+; The select condition comes from an fcmp that carries the fast-math flag.
+define i32 @select_const_i32_from_fcmp_fast(float* nocapture readonly %v, i64 %n) {
+; CHECK-LABEL: @select_const_i32_from_fcmp_fast
+; CHECK-VF4IC1:      vector.body:
+; CHECK-VF4IC1:        [[VEC_PHI:%.*]] = phi <4 x i32> [ <i32 2, i32 2, i32 2, i32 2>, %vector.ph ], [ [[VEC_SEL:%.*]], %vector.body ]
+; CHECK-VF4IC1:        [[VEC_LOAD:%.*]] = load <4 x float>
+; CHECK-VF4IC1-NEXT:   [[VEC_FCMP:%.*]] = fcmp fast ueq <4 x float> [[VEC_LOAD]], <float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00>
+; CHECK-VF4IC1-NEXT:   [[VEC_SEL]] = select <4 x i1> [[VEC_FCMP]], <4 x i32> [[VEC_PHI]], <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+; CHECK-VF4IC1:      middle.block:
+; CHECK-VF4IC1-NEXT:   [[FIN_ICMP:%.*]] = icmp ne <4 x i32> [[VEC_SEL]], <i32 2, i32 2, i32 2, i32 2>
+; CHECK-VF4IC1-NEXT:   [[OR_RDX:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[FIN_ICMP]])
+; CHECK-VF4IC1-NEXT:   {{.*}} = select i1 [[OR_RDX]], i32 1, i32 2
+entry:
+  br label %for.body
+
+for.body:                                      ; preds = %entry, %for.body
+  %0 = phi i64 [ 0, %entry ], [ %6, %for.body ]
+  %1 = phi i32 [ 2, %entry ], [ %5, %for.body ]
+  %2 = getelementptr inbounds float, float* %v, i64 %0
+  %3 = load float, float* %2, align 4
+  %4 = fcmp fast ueq float %3, 3.0
+  %5 = select i1 %4, i32 %1, i32 1
+  %6 = add nuw nsw i64 %0, 1
+  %7 = icmp eq i64 %6, %n
+  br i1 %7, label %exit, label %for.body
+
+exit:                                     ; preds = %for.body
+  ret i32 %5
+}
+
+; The select condition comes from an fcmp that carries no fast-math flags.
+define i32 @select_const_i32_from_fcmp(float* nocapture readonly %v, i64 %n) {
+; CHECK-LABEL: @select_const_i32_from_fcmp
+; CHECK-VF4IC1:      vector.body:
+; CHECK-VF4IC1:        [[VEC_PHI:%.*]] = phi <4 x i32> [ <i32 2, i32 2, i32 2, i32 2>, %vector.ph ], [ [[VEC_SEL:%.*]], %vector.body ]
+; CHECK-VF4IC1:        [[VEC_LOAD:%.*]] = load <4 x float>
+; CHECK-VF4IC1-NEXT:   [[VEC_FCMP:%.*]] = fcmp ueq <4 x float> [[VEC_LOAD]], <float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00>
+; CHECK-VF4IC1-NEXT:   [[VEC_SEL]] = select <4 x i1> [[VEC_FCMP]], <4 x i32> [[VEC_PHI]], <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+; CHECK-VF4IC1:      middle.block:
+; CHECK-VF4IC1-NEXT:   [[FIN_ICMP:%.*]] = icmp ne <4 x i32> [[VEC_SEL]], <i32 2, i32 2, i32 2, i32 2>
+; CHECK-VF4IC1-NEXT:   [[OR_RDX:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[FIN_ICMP]])
+; CHECK-VF4IC1-NEXT:   {{.*}} = select i1 [[OR_RDX]], i32 1, i32 2
+entry:
+  br label %for.body
+
+for.body:                                      ; preds = %entry, %for.body
+  %0 = phi i64 [ 0, %entry ], [ %6, %for.body ]
+  %1 = phi i32 [ 2, %entry ], [ %5, %for.body ]
+  %2 = getelementptr inbounds float, float* %v, i64 %0
+  %3 = load float, float* %2, align 4
+  %4 = fcmp ueq float %3, 3.0
+  %5 = select i1 %4, i32 %1, i32 1
+  %6 = add nuw nsw i64 %0, 1
+  %7 = icmp eq i64 %6, %n
+  br i1 %7, label %exit, label %for.body
+
+exit:                                     ; preds = %for.body
+  ret i32 %5
+}
+
+; The icmp compares the reduction PHI itself, which is also the select's true value.
+define i32 @select_i32_from_icmp_same_inputs(i32 %a, i32 %b, i64 %n) {
+; CHECK-LABEL: @select_i32_from_icmp_same_inputs
+; CHECK-VF4IC1:      vector.ph:
+; CHECK-VF4IC1:        [[TMP1:%.*]] = insertelement <4 x i32> poison, i32 %a, i32 0
+; CHECK-VF4IC1-NEXT:   [[SPLAT_OF_A:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> poison, <4 x i32> zeroinitializer
+; CHECK-VF4IC1-NEXT:   [[TMP2:%.*]] = insertelement <4 x i32> poison, i32 %b, i32 0
+; CHECK-VF4IC1-NEXT:   [[SPLAT_OF_B:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> poison, <4 x i32> zeroinitializer
+; CHECK-VF4IC1:      vector.body:
+; CHECK-VF4IC1:        [[VEC_PHI:%.*]] = phi <4 x i32> [ [[SPLAT_OF_A]], %vector.ph ], [ [[VEC_SEL:%.*]], %vector.body ]
+; CHECK-VF4IC1:        [[VEC_ICMP:%.*]] = icmp eq <4 x i32> [[VEC_PHI]], <i32 3, i32 3, i32 3, i32 3>
+; CHECK-VF4IC1-NEXT:   [[VEC_SEL]] = select <4 x i1> [[VEC_ICMP]], <4 x i32> [[VEC_PHI]], <4 x i32> [[SPLAT_OF_B]]
+; CHECK-VF4IC1:      middle.block:
+; CHECK-VF4IC1-NEXT:   [[FIN_INS:%.*]] = insertelement <4 x i32> poison, i32 %a, i32 0
+; CHECK-VF4IC1-NEXT:   [[FIN_SPLAT:%.*]] = shufflevector <4 x i32> [[FIN_INS]], <4 x i32> poison, <4 x i32> zeroinitializer
+; CHECK-VF4IC1-NEXT:   [[FIN_CMP:%.*]] = icmp ne <4 x i32> [[VEC_SEL]], [[FIN_SPLAT]]
+; CHECK-VF4IC1-NEXT:   [[OR_RDX:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[FIN_CMP]])
+; CHECK-VF4IC1-NEXT:   {{.*}} = select i1 [[OR_RDX]], i32 %b, i32 %a
+entry:
+  br label %for.body
+
+for.body:                                      ; preds = %entry, %for.body
+  %0 = phi i64 [ 0, %entry ], [ %4, %for.body ]
+  %1 = phi i32 [ %a, %entry ], [ %3, %for.body ]
+  %2 = icmp eq i32 %1, 3
+  %3 = select i1 %2, i32 %1, i32 %b
+  %4 = add nuw nsw i64 %0, 1
+  %5 = icmp eq i64 %4, %n
+  br i1 %5, label %exit, label %for.body
+
+exit:                                     ; preds = %for.body
+  ret i32 %3
+}
+
+
+; Negative tests
+
+; We don't support FP (here float) reduction variables at the moment.
+define float @select_const_f32_from_icmp(i32* nocapture readonly %v, i64 %n) {
+; CHECK-LABEL: @select_const_f32_from_icmp
+; CHECK-NOT: vector.body
+entry:
+  br label %for.body
+
+for.body:                                      ; preds = %entry, %for.body
+  %0 = phi i64 [ 0, %entry ], [ %6, %for.body ]
+  %1 = phi fast float [ 3.0, %entry ], [ %5, %for.body ]
+  %2 = getelementptr inbounds i32, i32* %v, i64 %0
+  %3 = load i32, i32* %2, align 4
+  %4 = icmp eq i32 %3, 3
+  %5 = select fast i1 %4, float %1, float 7.0
+  %6 = add nuw nsw i64 %0, 1
+  %7 = icmp eq i64 %6, %n
+  br i1 %7, label %exit, label %for.body
+
+exit:                                     ; preds = %for.body
+  ret float %5
+}
+
+; We don't support select/cmp reduction patterns where there is more than one
+; use of the icmp/fcmp. Here the icmp result %5 feeds both the select and a
+; zext whose value is stored after the loop.
+define i32 @select_const_i32_from_icmp_mul_use(i32* nocapture readonly %v1, i32* %v2, i64 %n) {
+; CHECK-LABEL: @select_const_i32_from_icmp_mul_use
+; CHECK-NOT: vector.body
+entry:
+  br label %for.body
+
+for.body:                                      ; preds = %entry, %for.body
+  %0 = phi i64 [ 0, %entry ], [ %8, %for.body ]
+  %1 = phi i32 [ 3, %entry ], [ %6, %for.body ]
+  %2 = phi i32 [ 0, %entry ], [ %7, %for.body ]
+  %3 = getelementptr inbounds i32, i32* %v1, i64 %0
+  %4 = load i32, i32* %3, align 4
+  %5 = icmp eq i32 %4, 3
+  %6 = select i1 %5, i32 %1, i32 7
+  %7 = zext i1 %5 to i32
+  %8 = add nuw nsw i64 %0, 1
+  %9 = icmp eq i64 %8, %n
+  br i1 %9, label %exit, label %for.body
+
+exit:                                     ; preds = %for.body
+  store i32 %7, i32* %v2, align 4
+  ret i32 %6
+}
+
+; We don't support selecting loop-variant values. Here the select's false value
+; %5 is loaded from %v2 on every iteration.
+define i32 @select_variant_i32_from_icmp(i32* nocapture readonly %v1, i32* nocapture readonly %v2, i64 %n) {
+; CHECK-LABEL: @select_variant_i32_from_icmp
+; CHECK-NOT: vector.body
+entry:
+  br label %for.body
+
+for.body:                                      ; preds = %entry, %for.body
+  %0 = phi i64 [ 0, %entry ], [ %8, %for.body ]
+  %1 = phi i32 [ 3, %entry ], [ %7, %for.body ]
+  %2 = getelementptr inbounds i32, i32* %v1, i64 %0
+  %3 = load i32, i32* %2, align 4
+  %4 = getelementptr inbounds i32, i32* %v2, i64 %0
+  %5 = load i32, i32* %4, align 4
+  %6 = icmp eq i32 %3, 3
+  %7 = select i1 %6, i32 %1, i32 %5
+  %8 = add nuw nsw i64 %0, 1
+  %9 = icmp eq i64 %8, %n
+  br i1 %9, label %exit, label %for.body
+
+exit:                                     ; preds = %for.body
+  ret i32 %7
+}
+
+; We only support selects where one of the select operands is the reduction
+; PHI itself. In the example below the icmp uses the reduction PHI %1, but the
+; select chooses between the induction variable %0 and %b, so the reduction
+; PHI is not an operand of the select.
+define i32 @select_i32_from_icmp_non_redux_phi(i32 %a, i32 %b, i32 %n) {
+; CHECK-LABEL: @select_i32_from_icmp_non_redux_phi
+; CHECK-NOT: vector.body
+entry:
+  br label %for.body
+
+for.body:                                      ; preds = %entry, %for.body
+  %0 = phi i32 [ 0, %entry ], [ %4, %for.body ]
+  %1 = phi i32 [ %a, %entry ], [ %3, %for.body ]
+  %2 = icmp eq i32 %1, 3
+  %3 = select i1 %2, i32 %0, i32 %b
+  %4 = add nuw nsw i32 %0, 1
+  %5 = icmp eq i32 %4, %n
+  br i1 %5, label %exit, label %for.body
+
+exit:                                     ; preds = %for.body
+  ret i32 %3
+}
author	David Sherwood <david.sherwood@arm.com>
	Wed, 4 Aug 2021 07:10:51 +0000 (08:10 +0100)
committer	David Sherwood <david.sherwood@arm.com>
	Mon, 11 Oct 2021 08:41:38 +0000 (09:41 +0100)
llvm/include/llvm/Analysis/IVDescriptors.h		patch \| blob \| history
llvm/include/llvm/Transforms/Utils/LoopUtils.h		patch \| blob \| history
llvm/lib/Analysis/IVDescriptors.cpp		patch \| blob \| history
llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp		patch \| blob \| history
llvm/lib/Transforms/Utils/LoopUtils.cpp		patch \| blob \| history
llvm/lib/Transforms/Vectorize/LoopVectorize.cpp		patch \| blob \| history
llvm/lib/Transforms/Vectorize/VPlan.cpp		patch \| blob \| history
llvm/test/Transforms/LoopVectorize/AArch64/sve-select-cmp.ll	[new file with mode: 0644]	patch \| blob
llvm/test/Transforms/LoopVectorize/scalable-reduction-inloop.ll		patch \| blob \| history
llvm/test/Transforms/LoopVectorize/select-cmp-predicated.ll	[new file with mode: 0644]	patch \| blob
llvm/test/Transforms/LoopVectorize/select-cmp.ll	[new file with mode: 0644]	patch \| blob