[SLP]Do not reduce repeated values, use scalar red ops instead.

author Alexey Bataev <a.bataev@outlook.com>

Fri, 17 Jun 2022 17:15:23 +0000 (10:15 -0700)

committer Alexey Bataev <a.bataev@outlook.com>

Fri, 17 Feb 2023 15:19:35 +0000 (07:19 -0800)
author Alexey Bataev <a.bataev@outlook.com>
Fri, 17 Jun 2022 17:15:23 +0000 (10:15 -0700)
committer Alexey Bataev <a.bataev@outlook.com>
Fri, 17 Feb 2023 15:19:35 +0000 (07:19 -0800)
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp

index f1aa795..92ca124 100644 (file)
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -126,6 +126,13 @@ static cl::opt<bool> ShouldStartVectorizeHorAtStore(
      cl::desc(
          "Attempt to vectorize horizontal reductions feeding into a store"));
  
+// NOTE: If AllowHorRdxIdenityOptimization is true, the optimization will run
+// even if we match a reduction but do not vectorize in the end.
+static cl::opt<bool> AllowHorRdxIdenityOptimization(
+    "slp-optimize-identity-hor-reduction-ops", cl::init(true), cl::Hidden,
+    cl::desc("Allow optimization of original scalar identity operations on "
+             "matched horizontal reductions."));
+
  static cl::opt<int>
  MaxVectorRegSizeOption("slp-max-reg-size", cl::init(128), cl::Hidden,
      cl::desc("Attempt to vectorize for this register size in bits"));
@@ -11856,6 +11863,9 @@ class HorizontalReduction {
    WeakTrackingVH ReductionRoot;
    /// The type of reduction operation.
    RecurKind RdxKind;
+  /// Checks if the optimization of original scalar identity operations on
+  /// matched horizontal reductions is enabled and allowed.
+  bool IsSupportedHorRdxIdentityOp = false;
  
    static bool isCmpSelMinMax(Instruction *I) {
      return match(I, m_Select(m_Cmp(), m_Value(), m_Value())) &&
@@ -12231,7 +12241,7 @@ public:
          if (!EdgeInst || getRdxKind(EdgeInst) != RdxKind ||
              IsCmpSelMinMax != isCmpSelMinMax(EdgeInst) ||
              !hasRequiredNumberOfUses(IsCmpSelMinMax, EdgeInst) ||
-            !isVectorizable(getRdxKind(EdgeInst), EdgeInst)) {
+            !isVectorizable(RdxKind, EdgeInst)) {
            PossibleReducedVals.push_back(EdgeVal);
            continue;
          }
@@ -12406,14 +12416,18 @@ public:
      // If there are a sufficient number of reduction values, reduce
      // to a nearby power-of-2. We can safely generate oversized
      // vectors and rely on the backend to split them to legal sizes.
-    size_t NumReducedVals =
+    unsigned NumReducedVals =
          std::accumulate(ReducedVals.begin(), ReducedVals.end(), 0,
-                        [](size_t Num, ArrayRef<Value *> Vals) {
+                        [](unsigned Num, ArrayRef<Value *> Vals) -> unsigned {
                            if (!isGoodForReduction(Vals))
                              return Num;
                            return Num + Vals.size();
                          });
-    if (NumReducedVals < ReductionLimit) {
+    if (NumReducedVals < ReductionLimit &&
+        (!AllowHorRdxIdenityOptimization ||
+         all_of(ReducedVals, [](ArrayRef<Value *> RedV) {
+           return RedV.size() < 2 || !allConstant(RedV) || !isSplat(RedV);
+         }))) {
        for (ReductionOpsType &RdxOps : ReductionOps)
          for (Value *RdxOp : RdxOps)
            V.analyzedReductionRoot(cast<Instruction>(RdxOp));
@@ -12447,6 +12461,18 @@ public:
        return cast<Instruction>(ScalarCond);
      };
  
+    // Return new VectorizedTree, based on previous value.
+    auto GetNewVectorizedTree = [&](Value *VectorizedTree, Value *Res) {
+      if (VectorizedTree) {
+        // Update the final value in the reduction.
+        Builder.SetCurrentDebugLocation(
+            cast<Instruction>(ReductionOps.front().front())->getDebugLoc());
+        return createOp(Builder, RdxKind, VectorizedTree, Res, "op.rdx",
+                        ReductionOps);
+      }
+      // Initialize the final value in the reduction.
+      return Res;
+    };
      // The reduction root is used as the insertion point for new instructions,
      // so set it as externally used to prevent it from being deleted.
      ExternallyUsedValues[ReductionRoot];
@@ -12518,10 +12544,73 @@ public:
            }
          }
        }
+
+      // Emit code for constant values.
+      if (AllowHorRdxIdenityOptimization && Candidates.size() > 1 &&
+          allConstant(Candidates)) {
+        Value *Res = Candidates.front();
+        ++VectorizedVals.try_emplace(Candidates.front(), 0).first->getSecond();
+        for (Value *V : ArrayRef(Candidates).drop_front()) {
+          Res = createOp(Builder, RdxKind, Res, V, "const.rdx", ReductionOps);
+          ++VectorizedVals.try_emplace(V, 0).first->getSecond();
+        }
+        VectorizedTree = GetNewVectorizedTree(VectorizedTree, Res);
+        continue;
+      }
+
        unsigned NumReducedVals = Candidates.size();
-      if (NumReducedVals < ReductionLimit)
+      if (NumReducedVals < ReductionLimit &&
+          (NumReducedVals < 2 || !AllowHorRdxIdenityOptimization ||
+           !isSplat(Candidates)))
          continue;
  
+      // Check if we support repeated scalar values processing (optimization of
+      // original scalar identity operations on matched horizontal reductions).
+      IsSupportedHorRdxIdentityOp =
+          AllowHorRdxIdenityOptimization && RdxKind != RecurKind::Mul &&
+          RdxKind != RecurKind::FMul && RdxKind != RecurKind::FMulAdd;
+      // Gather same values.
+      MapVector<Value *, unsigned> SameValuesCounter;
+      if (IsSupportedHorRdxIdentityOp)
+        for (Value *V : Candidates)
+          ++SameValuesCounter.insert(std::make_pair(V, 0)).first->second;
+      // Used to check if the reduced values used same number of times. In this
+      // case the compiler may produce better code. E.g. if reduced values are
+      // aabbccdd (8 x values), then the first node of the tree will have a node
+      // for 4 x abcd + shuffle <4 x abcd>, <0, 0, 1, 1, 2, 2, 3, 3>.
+      // Plus, the final reduction will be performed on <8 x aabbccdd>.
+      // Instead compiler may build <4 x abcd> tree immediately, + reduction (4
+      // x abcd) * 2.
+      // Currently it only handles add/fadd/xor. and/or/min/max do not require
+      // this analysis, other operations may require an extra estimation of
+      // the profitability.
+      bool SameScaleFactor = false;
+      bool OptReusedScalars = IsSupportedHorRdxIdentityOp &&
+                              SameValuesCounter.size() != Candidates.size();
+      if (OptReusedScalars) {
+        SameScaleFactor =
+            (RdxKind == RecurKind::Add || RdxKind == RecurKind::FAdd ||
+             RdxKind == RecurKind::Xor) &&
+            all_of(drop_begin(SameValuesCounter),
+                   [&SameValuesCounter](const std::pair<Value *, unsigned> &P) {
+                     return P.second == SameValuesCounter.front().second;
+                   });
+        Candidates.resize(SameValuesCounter.size());
+        transform(SameValuesCounter, Candidates.begin(),
+                  [](const auto &P) { return P.first; });
+        NumReducedVals = Candidates.size();
+        // Have a reduction of the same element.
+        if (NumReducedVals == 1) {
+          Value *OrigV = TrackedToOrig.find(Candidates.front())->second;
+          unsigned Cnt = SameValuesCounter.lookup(OrigV);
+          Value *RedVal =
+              emitScaleForReusedOps(Candidates.front(), Builder, Cnt);
+          VectorizedTree = GetNewVectorizedTree(VectorizedTree, RedVal);
+          VectorizedVals.try_emplace(OrigV, Cnt);
+          continue;
+        }
+      }
+
        unsigned MaxVecRegSize = V.getMaxVecRegSize();
        unsigned EltSize = V.getVectorElementSize(Candidates[0]);
        unsigned MaxElts =
@@ -12551,6 +12640,7 @@ public:
          ReduxWidth /= 2;
          return IsAnyRedOpGathered;
        };
+      bool AnyVectorized = false;
        while (Pos < NumReducedVals - ReduxWidth + 1 &&
               ReduxWidth >= ReductionLimit) {
          // Dependency in tree of the reduction ops - drop this attempt, try
@@ -12603,34 +12693,24 @@ public:
                         LocalExternallyUsedValues[TrackedVals[V]];
                     });
          }
-        // Number of uses of the candidates in the vector of values.
-        SmallDenseMap<Value *, unsigned> NumUses(Candidates.size());
-        for (unsigned Cnt = 0; Cnt < Pos; ++Cnt) {
-          Value *V = Candidates[Cnt];
-          ++NumUses.try_emplace(V, 0).first->getSecond();
-        }
-        for (unsigned Cnt = Pos + ReduxWidth; Cnt < NumReducedVals; ++Cnt) {
-          Value *V = Candidates[Cnt];
-          ++NumUses.try_emplace(V, 0).first->getSecond();
+        if (!IsSupportedHorRdxIdentityOp) {
+          // Number of uses of the candidates in the vector of values.
+          assert(SameValuesCounter.empty() &&
+                 "Reused values counter map is not empty");
+          for (unsigned Cnt = 0; Cnt < NumReducedVals; ++Cnt) {
+            if (Cnt >= Pos && Cnt < Pos + ReduxWidth)
+              continue;
+            Value *V = Candidates[Cnt];
+            Value *OrigV = TrackedToOrig.find(V)->second;
+            ++SameValuesCounter[OrigV];
+          }
          }
          SmallPtrSet<Value *, 4> VLScalars(VL.begin(), VL.end());
          // Gather externally used values.
          SmallPtrSet<Value *, 4> Visited;
-        for (unsigned Cnt = 0; Cnt < Pos; ++Cnt) {
-          Value *RdxVal = Candidates[Cnt];
-          if (!Visited.insert(RdxVal).second)
-            continue;
-          // Check if the scalar was vectorized as part of the vectorization
-          // tree but not the top node.
-          if (!VLScalars.contains(RdxVal) && V.isVectorized(RdxVal)) {
-            LocalExternallyUsedValues[RdxVal];
+        for (unsigned Cnt = 0; Cnt < NumReducedVals; ++Cnt) {
+          if (Cnt >= Pos && Cnt < Pos + ReduxWidth)
              continue;
-          }
-          unsigned NumOps = VectorizedVals.lookup(RdxVal) + NumUses[RdxVal];
-          if (NumOps != ReducedValsToOps.find(RdxVal)->second.size())
-            LocalExternallyUsedValues[RdxVal];
-        }
-        for (unsigned Cnt = Pos + ReduxWidth; Cnt < NumReducedVals; ++Cnt) {
            Value *RdxVal = Candidates[Cnt];
            if (!Visited.insert(RdxVal).second)
              continue;
@@ -12640,10 +12720,15 @@ public:
              LocalExternallyUsedValues[RdxVal];
              continue;
            }
-          unsigned NumOps = VectorizedVals.lookup(RdxVal) + NumUses[RdxVal];
+          Value *OrigV = TrackedToOrig.find(RdxVal)->second;
+          unsigned NumOps =
+              VectorizedVals.lookup(RdxVal) + SameValuesCounter[OrigV];
            if (NumOps != ReducedValsToOps.find(RdxVal)->second.size())
              LocalExternallyUsedValues[RdxVal];
          }
+        // Do not need the list of reused scalars in regular mode anymore.
+        if (!IsSupportedHorRdxIdentityOp)
+          SameValuesCounter.clear();
          for (Value *RdxVal : VL)
            if (RequiredExtract.contains(RdxVal))
              LocalExternallyUsedValues[RdxVal];
@@ -12727,29 +12812,47 @@ public:
          if (isBoolLogicOp(RdxRootInst))
            VectorizedRoot = Builder.CreateFreeze(VectorizedRoot);
  
+        // Emit code to correctly handle reused reduced values, if required.
+        if (OptReusedScalars && !SameScaleFactor) {
+          VectorizedRoot = emitReusedOps(VectorizedRoot, Builder, VL,
+                                         SameValuesCounter, TrackedToOrig);
+        }
+
          Value *ReducedSubTree =
              emitReduction(VectorizedRoot, Builder, ReduxWidth, TTI);
  
-        if (!VectorizedTree) {
-          // Initialize the final value in the reduction.
-          VectorizedTree = ReducedSubTree;
-        } else {
-          // Update the final value in the reduction.
-          Builder.SetCurrentDebugLocation(
-              cast<Instruction>(ReductionOps.front().front())->getDebugLoc());
-          VectorizedTree = createOp(Builder, RdxKind, VectorizedTree,
-                                    ReducedSubTree, "op.rdx", ReductionOps);
-        }
+        // Improved analysis for add/fadd/xor reductions with same scale factor
+        // for all operands of reductions. We can emit scalar ops for them
+        // instead.
+        if (OptReusedScalars && SameScaleFactor)
+          ReducedSubTree = emitScaleForReusedOps(
+              ReducedSubTree, Builder, SameValuesCounter.front().second);
+
+        VectorizedTree = GetNewVectorizedTree(VectorizedTree, ReducedSubTree);
          // Count vectorized reduced values to exclude them from final reduction.
          for (Value *RdxVal : VL) {
-          ++VectorizedVals.try_emplace(TrackedToOrig.find(RdxVal)->second, 0)
-                .first->getSecond();
+          Value *OrigV = TrackedToOrig.find(RdxVal)->second;
+          if (IsSupportedHorRdxIdentityOp) {
+            VectorizedVals.try_emplace(OrigV, SameValuesCounter[OrigV]);
+            continue;
+          }
+          ++VectorizedVals.try_emplace(OrigV, 0).first->getSecond();
            if (!V.isVectorized(RdxVal))
              RequiredExtract.insert(RdxVal);
          }
          Pos += ReduxWidth;
          Start = Pos;
          ReduxWidth = llvm::bit_floor(NumReducedVals - Pos);
+        AnyVectorized = true;
+      }
+      if (OptReusedScalars && !AnyVectorized) {
+        for (const std::pair<Value *, unsigned> &P : SameValuesCounter) {
+          Value *RedVal = emitScaleForReusedOps(P.first, Builder, P.second);
+          VectorizedTree = GetNewVectorizedTree(VectorizedTree, RedVal);
+          Value *OrigV = TrackedToOrig.find(P.first)->second;
+          VectorizedVals.try_emplace(OrigV, P.second);
+        }
+        continue;
        }
      }
      if (VectorizedTree) {
@@ -12977,8 +13080,144 @@ private:
      ++NumVectorInstructions;
      return createSimpleTargetReduction(Builder, TTI, VectorizedValue, RdxKind);
    }
-};
  
+  /// Emits optimized code for unique scalar value reused \p Cnt times.
+  Value *emitScaleForReusedOps(Value *VectorizedValue, IRBuilderBase &Builder,
+                               unsigned Cnt) {
+    assert(IsSupportedHorRdxIdentityOp &&
+           "The optimization of matched scalar identity horizontal reductions "
+           "must be supported.");
+    switch (RdxKind) {
+    case RecurKind::Add: {
+      // res = mul vv, n
+      Value *Scale = ConstantInt::get(VectorizedValue->getType(), Cnt);
+      LLVM_DEBUG(dbgs() << "SLP: Add (to-mul) " << Cnt << "of "
+                        << VectorizedValue << ". (HorRdx)\n");
+      return Builder.CreateMul(VectorizedValue, Scale);
+    }
+    case RecurKind::Xor: {
+      // res = n % 2 ? 0 : vv
+      LLVM_DEBUG(dbgs() << "SLP: Xor " << Cnt << "of " << VectorizedValue
+                        << ". (HorRdx)\n");
+      if (Cnt % 2 == 0)
+        return Constant::getNullValue(VectorizedValue->getType());
+      return VectorizedValue;
+    }
+    case RecurKind::FAdd: {
+      // res = fmul v, n
+      Value *Scale = ConstantFP::get(VectorizedValue->getType(), Cnt);
+      LLVM_DEBUG(dbgs() << "SLP: FAdd (to-fmul) " << Cnt << "of "
+                        << VectorizedValue << ". (HorRdx)\n");
+      return Builder.CreateFMul(VectorizedValue, Scale);
+    }
+    case RecurKind::And:
+    case RecurKind::Or:
+    case RecurKind::SMax:
+    case RecurKind::SMin:
+    case RecurKind::UMax:
+    case RecurKind::UMin:
+    case RecurKind::FMax:
+    case RecurKind::FMin:
+      // res = vv
+      return VectorizedValue;
+    case RecurKind::Mul:
+    case RecurKind::FMul:
+    case RecurKind::FMulAdd:
+    case RecurKind::SelectICmp:
+    case RecurKind::SelectFCmp:
+    case RecurKind::None:
+      llvm_unreachable("Unexpected reduction kind for repeated scalar.");
+    }
+    return nullptr;
+  }
+
+  /// Emits actual operation for the scalar identity values, found during
+  /// horizontal reduction analysis.
+  Value *emitReusedOps(Value *VectorizedValue, IRBuilderBase &Builder,
+                       ArrayRef<Value *> VL,
+                       const MapVector<Value *, unsigned> &SameValuesCounter,
+                       const DenseMap<Value *, Value *> &TrackedToOrig) {
+    assert(IsSupportedHorRdxIdentityOp &&
+           "The optimization of matched scalar identity horizontal reductions "
+           "must be supported.");
+    switch (RdxKind) {
+    case RecurKind::Add: {
+      // root = mul prev_root, <1, 1, n, 1>
+      SmallVector<Constant *> Vals;
+      for (Value *V : VL) {
+        unsigned Cnt = SameValuesCounter.lookup(TrackedToOrig.find(V)->second);
+        Vals.push_back(ConstantInt::get(V->getType(), Cnt, /*IsSigned=*/false));
+      }
+      auto *Scale = ConstantVector::get(Vals);
+      LLVM_DEBUG(dbgs() << "SLP: Add (to-mul) " << Scale << "of "
+                        << VectorizedValue << ". (HorRdx)\n");
+      return Builder.CreateMul(VectorizedValue, Scale);
+    }
+    case RecurKind::And:
+    case RecurKind::Or:
+      // No need for multiple or/and(s).
+      LLVM_DEBUG(dbgs() << "SLP: And/or of same " << VectorizedValue
+                        << ". (HorRdx)\n");
+      return VectorizedValue;
+    case RecurKind::SMax:
+    case RecurKind::SMin:
+    case RecurKind::UMax:
+    case RecurKind::UMin:
+    case RecurKind::FMax:
+    case RecurKind::FMin:
+      // No need for multiple min/max(s) of the same value.
+      LLVM_DEBUG(dbgs() << "SLP: Max/min of same " << VectorizedValue
+                        << ". (HorRdx)\n");
+      return VectorizedValue;
+    case RecurKind::Xor: {
+      // Replace values with even number of repeats with 0, since
+      // x xor x = 0.
+      // root = shuffle prev_root, zeroinitalizer, <0, 1, 2, vf, 4, vf, 5, 6,
+      // 7>, if elements 4th and 6th elements have even number of repeats.
+      SmallVector<int> Mask(
+          cast<FixedVectorType>(VectorizedValue->getType())->getNumElements(),
+          UndefMaskElem);
+      std::iota(Mask.begin(), Mask.end(), 0);
+      bool NeedShuffle = false;
+      for (unsigned I = 0, VF = VL.size(); I < VF; ++I) {
+        Value *V = VL[I];
+        unsigned Cnt = SameValuesCounter.lookup(TrackedToOrig.find(V)->second);
+        if (Cnt % 2 == 0) {
+          Mask[I] = VF;
+          NeedShuffle = true;
+        }
+      }
+      LLVM_DEBUG(dbgs() << "SLP: Xor <"; for (int I
+                                              : Mask) dbgs()
+                                         << I << " ";
+                 dbgs() << "> of " << VectorizedValue << ". (HorRdx)\n");
+      if (NeedShuffle)
+        VectorizedValue = Builder.CreateShuffleVector(
+            VectorizedValue,
+            ConstantVector::getNullValue(VectorizedValue->getType()), Mask);
+      return VectorizedValue;
+    }
+    case RecurKind::FAdd: {
+      // root = fmul prev_root, <1.0, 1.0, n.0, 1.0>
+      SmallVector<Constant *> Vals;
+      for (Value *V : VL) {
+        unsigned Cnt = SameValuesCounter.lookup(TrackedToOrig.find(V)->second);
+        Vals.push_back(ConstantFP::get(V->getType(), Cnt));
+      }
+      auto *Scale = ConstantVector::get(Vals);
+      return Builder.CreateFMul(VectorizedValue, Scale);
+    }
+    case RecurKind::Mul:
+    case RecurKind::FMul:
+    case RecurKind::FMulAdd:
+    case RecurKind::SelectICmp:
+    case RecurKind::SelectFCmp:
+    case RecurKind::None:
+      llvm_unreachable("Unexpected reduction kind for reused scalars.");
+    }
+    return nullptr;
+  }
+};
  } // end anonymous namespace
  
  static std::optional<unsigned> getAggregateSize(Instruction *InsertInst) {
diff --git a/llvm/test/Transforms/PhaseOrdering/X86/scalarization-inseltpoison.ll b/llvm/test/Transforms/PhaseOrdering/X86/scalarization-inseltpoison.ll

index 90adfa3..d36da8d 100644 (file)
--- a/llvm/test/Transforms/PhaseOrdering/X86/scalarization-inseltpoison.ll
+++ b/llvm/test/Transforms/PhaseOrdering/X86/scalarization-inseltpoison.ll
@@ -18,16 +18,15 @@ define <4 x i32> @square(<4 x i32> %num, i32 %y, i32 %x, i32 %h, i32 %k, i32 %w,
  ; CHECK-NEXT:    [[MUL13:%.*]] = mul nsw i32 [[W:%.*]], 53
  ; CHECK-NEXT:    [[DIV17:%.*]] = sdiv i32 [[X:%.*]], 820
  ; CHECK-NEXT:    [[MUL21:%.*]] = shl nsw i32 [[U:%.*]], 2
-; CHECK-NEXT:    [[DOTSCALAR:%.*]] = add i32 [[Y:%.*]], 1
-; CHECK-NEXT:    [[DOTSCALAR1:%.*]] = add i32 [[DOTSCALAR]], [[DIV17]]
-; CHECK-NEXT:    [[DOTSCALAR2:%.*]] = add i32 [[DOTSCALAR1]], [[MUL5]]
-; CHECK-NEXT:    [[DOTSCALAR3:%.*]] = add i32 [[DOTSCALAR2]], [[DIV]]
-; CHECK-NEXT:    [[DOTSCALAR4:%.*]] = add i32 [[DOTSCALAR3]], [[MUL13]]
-; CHECK-NEXT:    [[DOTSCALAR5:%.*]] = add i32 [[DOTSCALAR4]], [[MUL]]
-; CHECK-NEXT:    [[DOTSCALAR6:%.*]] = add i32 [[DOTSCALAR5]], [[DIV9]]
-; CHECK-NEXT:    [[DOTSCALAR7:%.*]] = add i32 [[DOTSCALAR6]], [[MUL21]]
-; CHECK-NEXT:    [[DOTSCALAR8:%.*]] = add i32 [[DOTSCALAR7]], 317425
-; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <4 x i32> poison, i32 [[DOTSCALAR8]], i64 0
+; CHECK-NEXT:    [[OP_RDX:%.*]] = add nsw i32 [[DIV17]], 317426
+; CHECK-NEXT:    [[OP_RDX9:%.*]] = add nsw i32 [[DIV]], [[DIV9]]
+; CHECK-NEXT:    [[OP_RDX10:%.*]] = add i32 [[MUL5]], [[MUL13]]
+; CHECK-NEXT:    [[OP_RDX11:%.*]] = add i32 [[MUL]], [[MUL21]]
+; CHECK-NEXT:    [[OP_RDX12:%.*]] = add i32 [[OP_RDX]], [[OP_RDX9]]
+; CHECK-NEXT:    [[OP_RDX13:%.*]] = add i32 [[OP_RDX10]], [[OP_RDX11]]
+; CHECK-NEXT:    [[OP_RDX14:%.*]] = add i32 [[OP_RDX12]], [[OP_RDX13]]
+; CHECK-NEXT:    [[OP_RDX15:%.*]] = add i32 [[OP_RDX14]], [[Y:%.*]]
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <4 x i32> poison, i32 [[OP_RDX15]], i64 0
  ; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> poison, <4 x i32> zeroinitializer
  ; CHECK-NEXT:    [[ADD29:%.*]] = add <4 x i32> [[TMP2]], [[NUM:%.*]]
  ; CHECK-NEXT:    ret <4 x i32> [[ADD29]]
diff --git a/llvm/test/Transforms/PhaseOrdering/X86/scalarization.ll b/llvm/test/Transforms/PhaseOrdering/X86/scalarization.ll

index 0b40774..dae3b59 100644 (file)
--- a/llvm/test/Transforms/PhaseOrdering/X86/scalarization.ll
+++ b/llvm/test/Transforms/PhaseOrdering/X86/scalarization.ll
@@ -18,16 +18,15 @@ define <4 x i32> @square(<4 x i32> %num, i32 %y, i32 %x, i32 %h, i32 %k, i32 %w,
  ; CHECK-NEXT:    [[MUL13:%.*]] = mul nsw i32 [[W:%.*]], 53
  ; CHECK-NEXT:    [[DIV17:%.*]] = sdiv i32 [[X:%.*]], 820
  ; CHECK-NEXT:    [[MUL21:%.*]] = shl nsw i32 [[U:%.*]], 2
-; CHECK-NEXT:    [[DOTSCALAR:%.*]] = add i32 [[Y:%.*]], 1
-; CHECK-NEXT:    [[DOTSCALAR1:%.*]] = add i32 [[DOTSCALAR]], [[DIV17]]
-; CHECK-NEXT:    [[DOTSCALAR2:%.*]] = add i32 [[DOTSCALAR1]], [[MUL5]]
-; CHECK-NEXT:    [[DOTSCALAR3:%.*]] = add i32 [[DOTSCALAR2]], [[DIV]]
-; CHECK-NEXT:    [[DOTSCALAR4:%.*]] = add i32 [[DOTSCALAR3]], [[MUL13]]
-; CHECK-NEXT:    [[DOTSCALAR5:%.*]] = add i32 [[DOTSCALAR4]], [[MUL]]
-; CHECK-NEXT:    [[DOTSCALAR6:%.*]] = add i32 [[DOTSCALAR5]], [[DIV9]]
-; CHECK-NEXT:    [[DOTSCALAR7:%.*]] = add i32 [[DOTSCALAR6]], [[MUL21]]
-; CHECK-NEXT:    [[DOTSCALAR8:%.*]] = add i32 [[DOTSCALAR7]], 317425
-; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <4 x i32> <i32 undef, i32 poison, i32 poison, i32 poison>, i32 [[DOTSCALAR8]], i64 0
+; CHECK-NEXT:    [[OP_RDX:%.*]] = add nsw i32 [[DIV17]], 317426
+; CHECK-NEXT:    [[OP_RDX9:%.*]] = add nsw i32 [[DIV]], [[DIV9]]
+; CHECK-NEXT:    [[OP_RDX10:%.*]] = add i32 [[MUL5]], [[MUL13]]
+; CHECK-NEXT:    [[OP_RDX11:%.*]] = add i32 [[MUL]], [[MUL21]]
+; CHECK-NEXT:    [[OP_RDX12:%.*]] = add i32 [[OP_RDX]], [[OP_RDX9]]
+; CHECK-NEXT:    [[OP_RDX13:%.*]] = add i32 [[OP_RDX10]], [[OP_RDX11]]
+; CHECK-NEXT:    [[OP_RDX14:%.*]] = add i32 [[OP_RDX12]], [[OP_RDX13]]
+; CHECK-NEXT:    [[OP_RDX15:%.*]] = add i32 [[OP_RDX14]], [[Y:%.*]]
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <4 x i32> <i32 undef, i32 poison, i32 poison, i32 poison>, i32 [[OP_RDX15]], i64 0
  ; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> poison, <4 x i32> zeroinitializer
  ; CHECK-NEXT:    [[ADD29:%.*]] = add <4 x i32> [[TMP2]], [[NUM:%.*]]
  ; CHECK-NEXT:    ret <4 x i32> [[ADD29]]
diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/buildvector-reduce.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/buildvector-reduce.ll

index 32d8461..2c41780 100644 (file)
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/buildvector-reduce.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/buildvector-reduce.ll
@@ -1,18 +1,28 @@
  ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
  ; RUN: opt -S -passes=slp-vectorizer < %s -mtriple=arm64-apple-macosx | FileCheck %s
+; RUN: opt -S -passes=slp-vectorizer < %s -mtriple=arm64-apple-macosx -slp-optimize-identity-hor-reduction-ops=false | FileCheck %s --check-prefix=NO-IDENTITY
  
  define i8 @test() {
  ; CHECK-LABEL: @test(
  ; CHECK-NEXT:  entry:
  ; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
  ; CHECK:       for.body:
-; CHECK-NEXT:    [[SUM:%.*]] = phi i32 [ [[TMP1:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT:    [[SUM:%.*]] = phi i32 [ [[TMP0:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY:%.*]] ]
  ; CHECK-NEXT:    [[CALL278:%.*]] = call i32 @fn(i32 [[SUM]])
-; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <8 x i32> poison, i32 [[CALL278]], i32 0
-; CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <8 x i32> [[TMP0]], <8 x i32> poison, <8 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP1]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[SHUFFLE]])
+; CHECK-NEXT:    [[TMP0]] = mul i32 [[CALL278]], 8
  ; CHECK-NEXT:    br label [[FOR_BODY]]
  ;
+; NO-IDENTITY-LABEL: @test(
+; NO-IDENTITY-NEXT:  entry:
+; NO-IDENTITY-NEXT:    br label [[FOR_BODY:%.*]]
+; NO-IDENTITY:       for.body:
+; NO-IDENTITY-NEXT:    [[SUM:%.*]] = phi i32 [ [[TMP2:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY:%.*]] ]
+; NO-IDENTITY-NEXT:    [[CALL278:%.*]] = call i32 @fn(i32 [[SUM]])
+; NO-IDENTITY-NEXT:    [[TMP0:%.*]] = insertelement <8 x i32> poison, i32 [[CALL278]], i32 0
+; NO-IDENTITY-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i32> [[TMP0]], <8 x i32> poison, <8 x i32> zeroinitializer
+; NO-IDENTITY-NEXT:    [[TMP2]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[TMP1]])
+; NO-IDENTITY-NEXT:    br label [[FOR_BODY]]
+;
  entry:
    br label %for.body
  
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/PR39774.ll b/llvm/test/Transforms/SLPVectorizer/X86/PR39774.ll

index 8400b0b..a1babad 100644 (file)
--- a/llvm/test/Transforms/SLPVectorizer/X86/PR39774.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/PR39774.ll
@@ -5,56 +5,36 @@
  define void @Test(i32) {
  ; CHECK-LABEL: @Test(
  ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <8 x i32> poison, i32 [[TMP0:%.*]], i32 0
-; CHECK-NEXT:    [[SHUFFLE8:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> poison, <8 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <16 x i32> poison, i32 [[TMP0]], i32 0
-; CHECK-NEXT:    [[SHUFFLE7:%.*]] = shufflevector <16 x i32> [[TMP2]], <16 x i32> poison, <16 x i32> zeroinitializer
  ; CHECK-NEXT:    br label [[LOOP:%.*]]
  ; CHECK:       loop:
-; CHECK-NEXT:    [[TMP3:%.*]] = phi <2 x i32> [ [[TMP13:%.*]], [[LOOP]] ], [ zeroinitializer, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[TMP3]], <2 x i32> poison, <8 x i32> <i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
-; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <8 x i32> [[SHUFFLE]], i32 1
-; CHECK-NEXT:    [[TMP5:%.*]] = add <8 x i32> [[SHUFFLE]], <i32 0, i32 55, i32 285, i32 1240, i32 1496, i32 8555, i32 12529, i32 13685>
-; CHECK-NEXT:    [[TMP6:%.*]] = call i32 @llvm.vector.reduce.and.v16i32(<16 x i32> [[SHUFFLE7]])
-; CHECK-NEXT:    [[TMP7:%.*]] = call i32 @llvm.vector.reduce.and.v8i32(<8 x i32> [[SHUFFLE8]])
-; CHECK-NEXT:    [[OP_RDX:%.*]] = and i32 [[TMP6]], [[TMP7]]
-; CHECK-NEXT:    [[TMP8:%.*]] = call i32 @llvm.vector.reduce.and.v8i32(<8 x i32> [[TMP5]])
-; CHECK-NEXT:    [[OP_RDX1:%.*]] = and i32 [[OP_RDX]], [[TMP8]]
-; CHECK-NEXT:    [[OP_RDX2:%.*]] = and i32 [[OP_RDX1]], [[TMP0]]
-; CHECK-NEXT:    [[OP_RDX3:%.*]] = and i32 [[TMP0]], [[TMP0]]
-; CHECK-NEXT:    [[OP_RDX4:%.*]] = and i32 [[OP_RDX2]], [[OP_RDX3]]
-; CHECK-NEXT:    [[TMP9:%.*]] = insertelement <2 x i32> <i32 poison, i32 14910>, i32 [[OP_RDX4]], i32 0
-; CHECK-NEXT:    [[TMP10:%.*]] = insertelement <2 x i32> poison, i32 [[TMP4]], i32 0
-; CHECK-NEXT:    [[SHUFFLE6:%.*]] = shufflevector <2 x i32> [[TMP10]], <2 x i32> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP11:%.*]] = and <2 x i32> [[TMP9]], [[SHUFFLE6]]
-; CHECK-NEXT:    [[TMP12:%.*]] = add <2 x i32> [[TMP9]], [[SHUFFLE6]]
-; CHECK-NEXT:    [[TMP13]] = shufflevector <2 x i32> [[TMP11]], <2 x i32> [[TMP12]], <2 x i32> <i32 0, i32 3>
+; CHECK-NEXT:    [[TMP1:%.*]] = phi <2 x i32> [ [[TMP11:%.*]], [[LOOP]] ], [ zeroinitializer, [[ENTRY:%.*]] ]
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> poison, <8 x i32> <i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <8 x i32> [[TMP2]], i32 1
+; CHECK-NEXT:    [[TMP4:%.*]] = add <8 x i32> [[TMP2]], <i32 0, i32 55, i32 285, i32 1240, i32 1496, i32 8555, i32 12529, i32 13685>
+; CHECK-NEXT:    [[TMP5:%.*]] = call i32 @llvm.vector.reduce.and.v8i32(<8 x i32> [[TMP4]])
+; CHECK-NEXT:    [[OP_RDX:%.*]] = and i32 [[TMP0:%.*]], [[TMP5]]
+; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <2 x i32> <i32 poison, i32 14910>, i32 [[OP_RDX]], i32 0
+; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <2 x i32> poison, i32 [[TMP3]], i32 0
+; CHECK-NEXT:    [[TMP8:%.*]] = shufflevector <2 x i32> [[TMP7]], <2 x i32> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP9:%.*]] = and <2 x i32> [[TMP6]], [[TMP8]]
+; CHECK-NEXT:    [[TMP10:%.*]] = add <2 x i32> [[TMP6]], [[TMP8]]
+; CHECK-NEXT:    [[TMP11]] = shufflevector <2 x i32> [[TMP9]], <2 x i32> [[TMP10]], <2 x i32> <i32 0, i32 3>
  ; CHECK-NEXT:    br label [[LOOP]]
  ;
  ; FORCE_REDUCTION-LABEL: @Test(
  ; FORCE_REDUCTION-NEXT:  entry:
-; FORCE_REDUCTION-NEXT:    [[TMP1:%.*]] = insertelement <8 x i32> poison, i32 [[TMP0:%.*]], i32 0
-; FORCE_REDUCTION-NEXT:    [[SHUFFLE7:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> poison, <8 x i32> zeroinitializer
-; FORCE_REDUCTION-NEXT:    [[TMP2:%.*]] = insertelement <16 x i32> poison, i32 [[TMP0]], i32 0
-; FORCE_REDUCTION-NEXT:    [[SHUFFLE6:%.*]] = shufflevector <16 x i32> [[TMP2]], <16 x i32> poison, <16 x i32> zeroinitializer
  ; FORCE_REDUCTION-NEXT:    br label [[LOOP:%.*]]
  ; FORCE_REDUCTION:       loop:
-; FORCE_REDUCTION-NEXT:    [[TMP3:%.*]] = phi <2 x i32> [ [[TMP10:%.*]], [[LOOP]] ], [ zeroinitializer, [[ENTRY:%.*]] ]
-; FORCE_REDUCTION-NEXT:    [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[TMP3]], <2 x i32> poison, <8 x i32> <i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
-; FORCE_REDUCTION-NEXT:    [[TMP4:%.*]] = extractelement <8 x i32> [[SHUFFLE]], i32 1
-; FORCE_REDUCTION-NEXT:    [[TMP5:%.*]] = add <8 x i32> [[SHUFFLE]], <i32 0, i32 55, i32 285, i32 1240, i32 1496, i32 8555, i32 12529, i32 13685>
-; FORCE_REDUCTION-NEXT:    [[TMP6:%.*]] = call i32 @llvm.vector.reduce.and.v16i32(<16 x i32> [[SHUFFLE6]])
-; FORCE_REDUCTION-NEXT:    [[TMP7:%.*]] = call i32 @llvm.vector.reduce.and.v8i32(<8 x i32> [[SHUFFLE7]])
-; FORCE_REDUCTION-NEXT:    [[OP_RDX:%.*]] = and i32 [[TMP6]], [[TMP7]]
-; FORCE_REDUCTION-NEXT:    [[TMP8:%.*]] = call i32 @llvm.vector.reduce.and.v8i32(<8 x i32> [[TMP5]])
-; FORCE_REDUCTION-NEXT:    [[OP_RDX1:%.*]] = and i32 [[OP_RDX]], [[TMP8]]
-; FORCE_REDUCTION-NEXT:    [[OP_RDX2:%.*]] = and i32 [[OP_RDX1]], [[TMP0]]
-; FORCE_REDUCTION-NEXT:    [[OP_RDX3:%.*]] = and i32 [[TMP0]], [[TMP0]]
-; FORCE_REDUCTION-NEXT:    [[OP_RDX4:%.*]] = and i32 [[OP_RDX2]], [[OP_RDX3]]
-; FORCE_REDUCTION-NEXT:    [[OP_RDX5:%.*]] = and i32 [[OP_RDX4]], [[TMP4]]
-; FORCE_REDUCTION-NEXT:    [[VAL_43:%.*]] = add i32 [[TMP4]], 14910
-; FORCE_REDUCTION-NEXT:    [[TMP9:%.*]] = insertelement <2 x i32> poison, i32 [[OP_RDX5]], i32 0
-; FORCE_REDUCTION-NEXT:    [[TMP10]] = insertelement <2 x i32> [[TMP9]], i32 [[VAL_43]], i32 1
+; FORCE_REDUCTION-NEXT:    [[TMP1:%.*]] = phi <2 x i32> [ [[TMP7:%.*]], [[LOOP]] ], [ zeroinitializer, [[ENTRY:%.*]] ]
+; FORCE_REDUCTION-NEXT:    [[TMP2:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> poison, <8 x i32> <i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+; FORCE_REDUCTION-NEXT:    [[TMP3:%.*]] = extractelement <8 x i32> [[TMP2]], i32 1
+; FORCE_REDUCTION-NEXT:    [[TMP4:%.*]] = add <8 x i32> [[TMP2]], <i32 0, i32 55, i32 285, i32 1240, i32 1496, i32 8555, i32 12529, i32 13685>
+; FORCE_REDUCTION-NEXT:    [[TMP5:%.*]] = call i32 @llvm.vector.reduce.and.v8i32(<8 x i32> [[TMP4]])
+; FORCE_REDUCTION-NEXT:    [[OP_RDX:%.*]] = and i32 [[TMP0:%.*]], [[TMP5]]
+; FORCE_REDUCTION-NEXT:    [[OP_RDX1:%.*]] = and i32 [[OP_RDX]], [[TMP3]]
+; FORCE_REDUCTION-NEXT:    [[VAL_43:%.*]] = add i32 [[TMP3]], 14910
+; FORCE_REDUCTION-NEXT:    [[TMP6:%.*]] = insertelement <2 x i32> poison, i32 [[OP_RDX1]], i32 0
+; FORCE_REDUCTION-NEXT:    [[TMP7]] = insertelement <2 x i32> [[TMP6]], i32 [[VAL_43]], i32 1
  ; FORCE_REDUCTION-NEXT:    br label [[LOOP]]
  ;
  entry:
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/buildvector_splat_extractvalue.ll b/llvm/test/Transforms/SLPVectorizer/X86/buildvector_splat_extractvalue.ll

index a5130cc..953d8b6 100644 (file)
--- a/llvm/test/Transforms/SLPVectorizer/X86/buildvector_splat_extractvalue.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/buildvector_splat_extractvalue.ll
@@ -5,14 +5,8 @@ define float @test() {
  ; CHECK-LABEL: @test(
  ; CHECK-NEXT:  entry:
  ; CHECK-NEXT:    [[DOTOBIT1683:%.*]] = extractvalue { i64, i1 } zeroinitializer, 1
-; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <8 x i1> poison, i1 [[DOTOBIT1683]], i32 0
-; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i1> [[TMP0]], <8 x i1> poison, <8 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP2:%.*]] = call i1 @llvm.vector.reduce.or.v16i1(<16 x i1> zeroinitializer)
-; CHECK-NEXT:    [[TMP3:%.*]] = call i1 @llvm.vector.reduce.or.v8i1(<8 x i1> [[TMP1]])
-; CHECK-NEXT:    [[OP_RDX:%.*]] = or i1 [[TMP2]], [[TMP3]]
-; CHECK-NEXT:    [[OP_RDX1:%.*]] = or i1 [[OP_RDX]], false
-; CHECK-NEXT:    [[OP_RDX2:%.*]] = or i1 [[OP_RDX1]], false
-; CHECK-NEXT:    br i1 [[OP_RDX2]], label [[EXIT1:%.*]], label [[EXIT2:%.*]]
+; CHECK-NEXT:    [[OP_RDX:%.*]] = or i1 false, [[DOTOBIT1683]]
+; CHECK-NEXT:    br i1 [[OP_RDX]], label [[EXIT1:%.*]], label [[EXIT2:%.*]]
  ; CHECK:       exit2:
  ; CHECK-NEXT:    ret float 0.000000e+00
  ; CHECK:       exit1:
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/crash_reordering_undefs.ll b/llvm/test/Transforms/SLPVectorizer/X86/crash_reordering_undefs.ll

index 8992218..c5f72f2 100644 (file)
--- a/llvm/test/Transforms/SLPVectorizer/X86/crash_reordering_undefs.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/crash_reordering_undefs.ll
@@ -14,13 +14,11 @@ define i32 @crash_reordering_undefs() {
  ; CHECK-NEXT:    [[OR1:%.*]] = or i64 undef, undef
  ; CHECK-NEXT:    [[CMP3:%.*]] = icmp eq i64 undef, [[OR1]]
  ; CHECK-NEXT:    [[ADD9:%.*]] = select i1 [[CMP3]], i32 65536, i32 65537
-; CHECK-NEXT:    [[TMP0:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> undef)
-; CHECK-NEXT:    [[OP_RDX:%.*]] = add i32 [[TMP0]], undef
-; CHECK-NEXT:    [[OP_RDX1:%.*]] = add i32 [[ADD0]], [[ADD2]]
-; CHECK-NEXT:    [[OP_RDX2:%.*]] = add i32 [[ADD4]], [[ADD9]]
-; CHECK-NEXT:    [[OP_RDX3:%.*]] = add i32 [[OP_RDX]], [[OP_RDX1]]
-; CHECK-NEXT:    [[OP_RDX4:%.*]] = add i32 [[OP_RDX3]], [[OP_RDX2]]
-; CHECK-NEXT:    ret i32 [[OP_RDX4]]
+; CHECK-NEXT:    [[OP_RDX:%.*]] = add i32 undef, [[ADD0]]
+; CHECK-NEXT:    [[OP_RDX1:%.*]] = add i32 [[ADD2]], [[ADD4]]
+; CHECK-NEXT:    [[OP_RDX2:%.*]] = add i32 [[OP_RDX]], [[OP_RDX1]]
+; CHECK-NEXT:    [[OP_RDX3:%.*]] = add i32 [[OP_RDX2]], [[ADD9]]
+; CHECK-NEXT:    ret i32 [[OP_RDX3]]
  ;
  entry:
    %or0 = or i64 undef, undef
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/extract-scalar-from-undef.ll b/llvm/test/Transforms/SLPVectorizer/X86/extract-scalar-from-undef.ll

index c3b9876..5d9c6b6 100644 (file)
--- a/llvm/test/Transforms/SLPVectorizer/X86/extract-scalar-from-undef.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/extract-scalar-from-undef.ll
@@ -6,16 +6,18 @@ define i64 @foo(i32 %tmp7) {
  ; CHECK-NEXT:  bb:
  ; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <4 x i32> <i32 0, i32 0, i32 poison, i32 0>, i32 [[TMP7:%.*]], i32 2
  ; CHECK-NEXT:    [[TMP1:%.*]] = sub <4 x i32> [[TMP0]], zeroinitializer
-; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> poison, <8 x i32> <i32 0, i32 1, i32 undef, i32 2, i32 3, i32 undef, i32 undef, i32 undef>
-; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <8 x i32> [[TMP2]], i32 undef, i32 6
-; CHECK-NEXT:    [[TMP4:%.*]] = sub nsw <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 undef, i32 0, i32 undef, i32 0>, [[TMP3]]
-; CHECK-NEXT:    [[TMP5:%.*]] = add nsw <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 undef, i32 0, i32 undef, i32 0>, [[TMP3]]
-; CHECK-NEXT:    [[TMP6:%.*]] = shufflevector <8 x i32> [[TMP4]], <8 x i32> [[TMP5]], <8 x i32> <i32 0, i32 9, i32 2, i32 3, i32 12, i32 13, i32 6, i32 7>
-; CHECK-NEXT:    [[TMP7:%.*]] = add <8 x i32> zeroinitializer, [[TMP6]]
-; CHECK-NEXT:    [[TMP8:%.*]] = xor <8 x i32> [[TMP7]], zeroinitializer
-; CHECK-NEXT:    [[TMP9:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[TMP8]])
-; CHECK-NEXT:    [[TMP10:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> zeroinitializer)
-; CHECK-NEXT:    [[OP_RDX:%.*]] = add i32 [[TMP9]], [[TMP10]]
+; CHECK-NEXT:    [[TMP24:%.*]] = sub i32 undef, 0
+; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 poison, i32 poison, i32 undef, i32 0>, i32 [[TMP24]], i32 4
+; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <8 x i32> [[TMP2]], i32 0, i32 5
+; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> poison, <8 x i32> <i32 0, i32 1, i32 undef, i32 2, i32 3, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <8 x i32> [[TMP4]], i32 [[TMP24]], i32 6
+; CHECK-NEXT:    [[TMP6:%.*]] = sub nsw <8 x i32> [[TMP3]], [[TMP5]]
+; CHECK-NEXT:    [[TMP7:%.*]] = add nsw <8 x i32> [[TMP3]], [[TMP5]]
+; CHECK-NEXT:    [[TMP8:%.*]] = shufflevector <8 x i32> [[TMP6]], <8 x i32> [[TMP7]], <8 x i32> <i32 0, i32 9, i32 2, i32 3, i32 12, i32 13, i32 6, i32 7>
+; CHECK-NEXT:    [[TMP9:%.*]] = add <8 x i32> zeroinitializer, [[TMP8]]
+; CHECK-NEXT:    [[TMP10:%.*]] = xor <8 x i32> [[TMP9]], zeroinitializer
+; CHECK-NEXT:    [[TMP11:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[TMP10]])
+; CHECK-NEXT:    [[OP_RDX:%.*]] = add i32 [[TMP11]], 0
  ; CHECK-NEXT:    [[TMP64:%.*]] = zext i32 [[OP_RDX]] to i64
  ; CHECK-NEXT:    ret i64 [[TMP64]]
  ;
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/float-min-max.ll b/llvm/test/Transforms/SLPVectorizer/X86/float-min-max.ll

index 79c7b05..b4ebf5a 100644 (file)
--- a/llvm/test/Transforms/SLPVectorizer/X86/float-min-max.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/float-min-max.ll
@@ -14,12 +14,11 @@ define i1 @test(ptr %p1, ptr %p2, ptr %p3, i1 %c) #0 {
  ; CHECK-NEXT:    [[UMIN33:%.*]] = select i1 [[L1]], ptr [[SCEVGEP31]], ptr [[P2]]
  ; CHECK-NEXT:    [[BOUND0:%.*]] = icmp ugt ptr [[P3:%.*]], [[UMIN]]
  ; CHECK-NEXT:    [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[C:%.*]]
-; CHECK-NEXT:    [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[C]]
  ; CHECK-NEXT:    [[BOUND042:%.*]] = icmp ugt ptr [[P3]], [[UMIN33]]
  ; CHECK-NEXT:    [[FOUND_CONFLICT44:%.*]] = and i1 [[BOUND042]], [[C]]
-; CHECK-NEXT:    [[CONFLICT_RDX45:%.*]] = or i1 [[CONFLICT_RDX]], [[FOUND_CONFLICT44]]
-; CHECK-NEXT:    [[CONFLICT_RDX49:%.*]] = or i1 [[CONFLICT_RDX45]], [[C]]
-; CHECK-NEXT:    ret i1 [[CONFLICT_RDX49]]
+; CHECK-NEXT:    [[OP_RDX:%.*]] = or i1 [[C]], [[FOUND_CONFLICT]]
+; CHECK-NEXT:    [[OP_RDX1:%.*]] = or i1 [[OP_RDX]], [[FOUND_CONFLICT44]]
+; CHECK-NEXT:    ret i1 [[OP_RDX1]]
  ;
    %l0 = icmp ult ptr %p2, %p1
    %umin = select i1 %l0, ptr %p2, ptr %p1
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/gather-extractelements-different-bbs.ll b/llvm/test/Transforms/SLPVectorizer/X86/gather-extractelements-different-bbs.ll

index d0fe649..482334e 100644 (file)
--- a/llvm/test/Transforms/SLPVectorizer/X86/gather-extractelements-different-bbs.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/gather-extractelements-different-bbs.ll
@@ -13,30 +13,25 @@ define i32 @foo(i32 %a) {
  ; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <4 x i32> [[SHUFFLE]], i32 3
  ; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <2 x i32> poison, i32 [[TMP2]], i32 0
  ; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <2 x i32> [[TMP4]], i32 [[TMP3]], i32 1
-; CHECK-NEXT:    [[SHUFFLE15:%.*]] = shufflevector <2 x i32> [[TMP5]], <2 x i32> poison, <4 x i32> <i32 0, i32 0, i32 0, i32 1>
-; CHECK-NEXT:    [[TMP6:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[SHUFFLE15]])
-; CHECK-NEXT:    [[OP_RDX16:%.*]] = add i32 [[TMP6]], 0
-; CHECK-NEXT:    [[OP_RDX17:%.*]] = add i32 [[OP_RDX16]], 0
+; CHECK-NEXT:    [[TMP6:%.*]] = mul <2 x i32> [[TMP5]], <i32 3, i32 1>
+; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <2 x i32> [[TMP6]], i32 0
+; CHECK-NEXT:    [[TMP8:%.*]] = extractelement <2 x i32> [[TMP6]], i32 1
+; CHECK-NEXT:    [[OP_RDX11:%.*]] = add i32 [[TMP7]], [[TMP8]]
+; CHECK-NEXT:    [[OP_RDX12:%.*]] = add i32 [[OP_RDX11]], 0
  ; CHECK-NEXT:    br label [[BB3:%.*]]
  ; CHECK:       bb2:
  ; CHECK-NEXT:    br label [[BB3]]
  ; CHECK:       bb3:
-; CHECK-NEXT:    [[P1:%.*]] = phi i32 [ [[OP_RDX17]], [[BB1]] ], [ 0, [[BB2:%.*]] ]
+; CHECK-NEXT:    [[P1:%.*]] = phi i32 [ [[OP_RDX12]], [[BB1]] ], [ 0, [[BB2:%.*]] ]
  ; CHECK-NEXT:    ret i32 0
  ; CHECK:       bb4:
-; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <4 x i32> poison, i32 [[TMP2]], i32 0
-; CHECK-NEXT:    [[SHUFFLE10:%.*]] = shufflevector <4 x i32> [[TMP7]], <4 x i32> poison, <4 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP8:%.*]] = add <4 x i32> [[SHUFFLE]], [[SHUFFLE10]]
-; CHECK-NEXT:    [[TMP9:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> zeroinitializer)
-; CHECK-NEXT:    [[TMP10:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP8]])
-; CHECK-NEXT:    [[TMP11:%.*]] = insertelement <2 x i32> poison, i32 [[TMP10]], i32 0
-; CHECK-NEXT:    [[TMP12:%.*]] = insertelement <2 x i32> [[TMP11]], i32 [[TMP9]], i32 1
-; CHECK-NEXT:    [[TMP13:%.*]] = add <2 x i32> [[TMP12]], zeroinitializer
-; CHECK-NEXT:    [[TMP14:%.*]] = extractelement <2 x i32> [[TMP13]], i32 0
-; CHECK-NEXT:    [[TMP15:%.*]] = extractelement <2 x i32> [[TMP13]], i32 1
-; CHECK-NEXT:    [[OP_RDX13:%.*]] = add i32 [[TMP14]], [[TMP15]]
-; CHECK-NEXT:    [[OP_RDX14:%.*]] = add i32 [[OP_RDX13]], [[TMP2]]
-; CHECK-NEXT:    ret i32 [[OP_RDX14]]
+; CHECK-NEXT:    [[TMP9:%.*]] = insertelement <4 x i32> poison, i32 [[TMP2]], i32 0
+; CHECK-NEXT:    [[SHUFFLE8:%.*]] = shufflevector <4 x i32> [[TMP9]], <4 x i32> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP10:%.*]] = add <4 x i32> [[SHUFFLE]], [[SHUFFLE8]]
+; CHECK-NEXT:    [[TMP11:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP10]])
+; CHECK-NEXT:    [[OP_RDX9:%.*]] = add i32 [[TMP11]], 0
+; CHECK-NEXT:    [[OP_RDX10:%.*]] = add i32 [[OP_RDX9]], [[TMP2]]
+; CHECK-NEXT:    ret i32 [[OP_RDX10]]
  ; CHECK:       bb5:
  ; CHECK-NEXT:    br label [[BB4:%.*]]
  ;
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/horizontal-list.ll b/llvm/test/Transforms/SLPVectorizer/X86/horizontal-list.ll

index 6ab1f86..e3dc675 100644 (file)
--- a/llvm/test/Transforms/SLPVectorizer/X86/horizontal-list.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/horizontal-list.ll
@@ -16,12 +16,12 @@ define float @baz() {
  ; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x float>, ptr @arr, align 16
  ; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x float>, ptr @arr1, align 16
  ; CHECK-NEXT:    [[TMP3:%.*]] = fmul fast <4 x float> [[TMP2]], [[TMP1]]
-; CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <4 x float> [[TMP3]], <4 x float> poison, <8 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3>
-; CHECK-NEXT:    [[TMP4:%.*]] = call fast float @llvm.vector.reduce.fadd.v8f32(float -0.000000e+00, <8 x float> [[SHUFFLE]])
-; CHECK-NEXT:    [[OP_RDX:%.*]] = fadd fast float [[TMP4]], [[CONV]]
-; CHECK-NEXT:    [[OP_RDX1:%.*]] = fadd fast float [[OP_RDX]], [[CONV]]
-; CHECK-NEXT:    store float [[OP_RDX1]], ptr @res, align 4
-; CHECK-NEXT:    ret float [[OP_RDX1]]
+; CHECK-NEXT:    [[TMP4:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float -0.000000e+00, <4 x float> [[TMP3]])
+; CHECK-NEXT:    [[TMP5:%.*]] = fmul fast float [[TMP4]], 2.000000e+00
+; CHECK-NEXT:    [[TMP6:%.*]] = fmul fast float [[CONV]], 2.000000e+00
+; CHECK-NEXT:    [[OP_RDX:%.*]] = fadd fast float [[TMP5]], [[TMP6]]
+; CHECK-NEXT:    store float [[OP_RDX]], ptr @res, align 4
+; CHECK-NEXT:    ret float [[OP_RDX]]
  ;
  ; THRESHOLD-LABEL: @baz(
  ; THRESHOLD-NEXT:  entry:
@@ -31,12 +31,15 @@ define float @baz() {
  ; THRESHOLD-NEXT:    [[TMP1:%.*]] = load <4 x float>, ptr @arr, align 16
  ; THRESHOLD-NEXT:    [[TMP2:%.*]] = load <4 x float>, ptr @arr1, align 16
  ; THRESHOLD-NEXT:    [[TMP3:%.*]] = fmul fast <4 x float> [[TMP2]], [[TMP1]]
-; THRESHOLD-NEXT:    [[SHUFFLE:%.*]] = shufflevector <4 x float> [[TMP3]], <4 x float> poison, <8 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3>
-; THRESHOLD-NEXT:    [[TMP4:%.*]] = call fast float @llvm.vector.reduce.fadd.v8f32(float -0.000000e+00, <8 x float> [[SHUFFLE]])
-; THRESHOLD-NEXT:    [[OP_RDX:%.*]] = fadd fast float [[TMP4]], [[CONV]]
-; THRESHOLD-NEXT:    [[OP_RDX1:%.*]] = fadd fast float [[OP_RDX]], [[CONV]]
-; THRESHOLD-NEXT:    store float [[OP_RDX1]], ptr @res, align 4
-; THRESHOLD-NEXT:    ret float [[OP_RDX1]]
+; THRESHOLD-NEXT:    [[TMP4:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float -0.000000e+00, <4 x float> [[TMP3]])
+; THRESHOLD-NEXT:    [[TMP5:%.*]] = insertelement <2 x float> poison, float [[TMP4]], i32 0
+; THRESHOLD-NEXT:    [[TMP6:%.*]] = insertelement <2 x float> [[TMP5]], float [[CONV]], i32 1
+; THRESHOLD-NEXT:    [[TMP7:%.*]] = fmul fast <2 x float> [[TMP6]], <float 2.000000e+00, float 2.000000e+00>
+; THRESHOLD-NEXT:    [[TMP8:%.*]] = extractelement <2 x float> [[TMP7]], i32 0
+; THRESHOLD-NEXT:    [[TMP9:%.*]] = extractelement <2 x float> [[TMP7]], i32 1
+; THRESHOLD-NEXT:    [[OP_RDX:%.*]] = fadd fast float [[TMP8]], [[TMP9]]
+; THRESHOLD-NEXT:    store float [[OP_RDX]], ptr @res, align 4
+; THRESHOLD-NEXT:    ret float [[OP_RDX]]
  ;
  entry:
    %0 = load i32, ptr @n, align 4
@@ -315,22 +318,22 @@ entry:
  define float @f(ptr nocapture readonly %x) {
  ; CHECK-LABEL: @f(
  ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP1:%.*]] = load <32 x float>, ptr [[X:%.*]], align 4
+; CHECK-NEXT:    [[TMP0:%.*]] = load <32 x float>, ptr [[X:%.*]], align 4
  ; CHECK-NEXT:    [[ARRAYIDX_32:%.*]] = getelementptr inbounds float, ptr [[X]], i64 32
-; CHECK-NEXT:    [[TMP3:%.*]] = load <16 x float>, ptr [[ARRAYIDX_32]], align 4
-; CHECK-NEXT:    [[TMP4:%.*]] = call fast float @llvm.vector.reduce.fadd.v32f32(float -0.000000e+00, <32 x float> [[TMP1]])
-; CHECK-NEXT:    [[TMP5:%.*]] = call fast float @llvm.vector.reduce.fadd.v16f32(float -0.000000e+00, <16 x float> [[TMP3]])
-; CHECK-NEXT:    [[OP_RDX:%.*]] = fadd fast float [[TMP4]], [[TMP5]]
+; CHECK-NEXT:    [[TMP1:%.*]] = load <16 x float>, ptr [[ARRAYIDX_32]], align 4
+; CHECK-NEXT:    [[TMP2:%.*]] = call fast float @llvm.vector.reduce.fadd.v32f32(float -0.000000e+00, <32 x float> [[TMP0]])
+; CHECK-NEXT:    [[TMP3:%.*]] = call fast float @llvm.vector.reduce.fadd.v16f32(float -0.000000e+00, <16 x float> [[TMP1]])
+; CHECK-NEXT:    [[OP_RDX:%.*]] = fadd fast float [[TMP2]], [[TMP3]]
  ; CHECK-NEXT:    ret float [[OP_RDX]]
  ;
  ; THRESHOLD-LABEL: @f(
  ; THRESHOLD-NEXT:  entry:
-; THRESHOLD-NEXT:    [[TMP1:%.*]] = load <32 x float>, ptr [[X:%.*]], align 4
+; THRESHOLD-NEXT:    [[TMP0:%.*]] = load <32 x float>, ptr [[X:%.*]], align 4
  ; THRESHOLD-NEXT:    [[ARRAYIDX_32:%.*]] = getelementptr inbounds float, ptr [[X]], i64 32
-; THRESHOLD-NEXT:    [[TMP3:%.*]] = load <16 x float>, ptr [[ARRAYIDX_32]], align 4
-; THRESHOLD-NEXT:    [[TMP4:%.*]] = call fast float @llvm.vector.reduce.fadd.v32f32(float -0.000000e+00, <32 x float> [[TMP1]])
-; THRESHOLD-NEXT:    [[TMP5:%.*]] = call fast float @llvm.vector.reduce.fadd.v16f32(float -0.000000e+00, <16 x float> [[TMP3]])
-; THRESHOLD-NEXT:    [[OP_RDX:%.*]] = fadd fast float [[TMP4]], [[TMP5]]
+; THRESHOLD-NEXT:    [[TMP1:%.*]] = load <16 x float>, ptr [[ARRAYIDX_32]], align 4
+; THRESHOLD-NEXT:    [[TMP2:%.*]] = call fast float @llvm.vector.reduce.fadd.v32f32(float -0.000000e+00, <32 x float> [[TMP0]])
+; THRESHOLD-NEXT:    [[TMP3:%.*]] = call fast float @llvm.vector.reduce.fadd.v16f32(float -0.000000e+00, <16 x float> [[TMP1]])
+; THRESHOLD-NEXT:    [[OP_RDX:%.*]] = fadd fast float [[TMP2]], [[TMP3]]
  ; THRESHOLD-NEXT:    ret float [[OP_RDX]]
  ;
    entry:
@@ -484,18 +487,18 @@ define float @f1(ptr nocapture readonly %x, i32 %a, i32 %b) {
  ; CHECK-NEXT:  entry:
  ; CHECK-NEXT:    [[REM:%.*]] = srem i32 [[A:%.*]], [[B:%.*]]
  ; CHECK-NEXT:    [[CONV:%.*]] = sitofp i32 [[REM]] to float
-; CHECK-NEXT:    [[TMP1:%.*]] = load <32 x float>, ptr [[X:%.*]], align 4
-; CHECK-NEXT:    [[TMP2:%.*]] = call fast float @llvm.vector.reduce.fadd.v32f32(float -0.000000e+00, <32 x float> [[TMP1]])
-; CHECK-NEXT:    [[OP_RDX:%.*]] = fadd fast float [[TMP2]], [[CONV]]
+; CHECK-NEXT:    [[TMP0:%.*]] = load <32 x float>, ptr [[X:%.*]], align 4
+; CHECK-NEXT:    [[TMP1:%.*]] = call fast float @llvm.vector.reduce.fadd.v32f32(float -0.000000e+00, <32 x float> [[TMP0]])
+; CHECK-NEXT:    [[OP_RDX:%.*]] = fadd fast float [[TMP1]], [[CONV]]
  ; CHECK-NEXT:    ret float [[OP_RDX]]
  ;
  ; THRESHOLD-LABEL: @f1(
  ; THRESHOLD-NEXT:  entry:
  ; THRESHOLD-NEXT:    [[REM:%.*]] = srem i32 [[A:%.*]], [[B:%.*]]
  ; THRESHOLD-NEXT:    [[CONV:%.*]] = sitofp i32 [[REM]] to float
-; THRESHOLD-NEXT:    [[TMP1:%.*]] = load <32 x float>, ptr [[X:%.*]], align 4
-; THRESHOLD-NEXT:    [[TMP2:%.*]] = call fast float @llvm.vector.reduce.fadd.v32f32(float -0.000000e+00, <32 x float> [[TMP1]])
-; THRESHOLD-NEXT:    [[OP_RDX:%.*]] = fadd fast float [[TMP2]], [[CONV]]
+; THRESHOLD-NEXT:    [[TMP0:%.*]] = load <32 x float>, ptr [[X:%.*]], align 4
+; THRESHOLD-NEXT:    [[TMP1:%.*]] = call fast float @llvm.vector.reduce.fadd.v32f32(float -0.000000e+00, <32 x float> [[TMP0]])
+; THRESHOLD-NEXT:    [[OP_RDX:%.*]] = fadd fast float [[TMP1]], [[CONV]]
  ; THRESHOLD-NEXT:    ret float [[OP_RDX]]
  ;
    entry:
@@ -603,43 +606,43 @@ define float @loadadd31(ptr nocapture readonly %x) {
  ; CHECK-LABEL: @loadadd31(
  ; CHECK-NEXT:  entry:
  ; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[X:%.*]], i64 1
-; CHECK-NEXT:    [[TMP1:%.*]] = load <16 x float>, ptr [[ARRAYIDX]], align 4
+; CHECK-NEXT:    [[TMP0:%.*]] = load <16 x float>, ptr [[ARRAYIDX]], align 4
  ; CHECK-NEXT:    [[ARRAYIDX_16:%.*]] = getelementptr inbounds float, ptr [[X]], i64 17
-; CHECK-NEXT:    [[TMP3:%.*]] = load <8 x float>, ptr [[ARRAYIDX_16]], align 4
+; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x float>, ptr [[ARRAYIDX_16]], align 4
  ; CHECK-NEXT:    [[ARRAYIDX_24:%.*]] = getelementptr inbounds float, ptr [[X]], i64 25
-; CHECK-NEXT:    [[TMP5:%.*]] = load <4 x float>, ptr [[ARRAYIDX_24]], align 4
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x float>, ptr [[ARRAYIDX_24]], align 4
  ; CHECK-NEXT:    [[ARRAYIDX_28:%.*]] = getelementptr inbounds float, ptr [[X]], i64 29
-; CHECK-NEXT:    [[TMP6:%.*]] = load float, ptr [[ARRAYIDX_28]], align 4
+; CHECK-NEXT:    [[TMP3:%.*]] = load float, ptr [[ARRAYIDX_28]], align 4
  ; CHECK-NEXT:    [[ARRAYIDX_29:%.*]] = getelementptr inbounds float, ptr [[X]], i64 30
-; CHECK-NEXT:    [[TMP7:%.*]] = load float, ptr [[ARRAYIDX_29]], align 4
-; CHECK-NEXT:    [[TMP8:%.*]] = call fast float @llvm.vector.reduce.fadd.v16f32(float -0.000000e+00, <16 x float> [[TMP1]])
-; CHECK-NEXT:    [[TMP9:%.*]] = call fast float @llvm.vector.reduce.fadd.v8f32(float -0.000000e+00, <8 x float> [[TMP3]])
-; CHECK-NEXT:    [[OP_RDX:%.*]] = fadd fast float [[TMP8]], [[TMP9]]
-; CHECK-NEXT:    [[TMP10:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float -0.000000e+00, <4 x float> [[TMP5]])
-; CHECK-NEXT:    [[OP_RDX1:%.*]] = fadd fast float [[OP_RDX]], [[TMP10]]
-; CHECK-NEXT:    [[OP_RDX2:%.*]] = fadd fast float [[OP_RDX1]], [[TMP6]]
-; CHECK-NEXT:    [[OP_RDX3:%.*]] = fadd fast float [[OP_RDX2]], [[TMP7]]
+; CHECK-NEXT:    [[TMP4:%.*]] = load float, ptr [[ARRAYIDX_29]], align 4
+; CHECK-NEXT:    [[TMP5:%.*]] = call fast float @llvm.vector.reduce.fadd.v16f32(float -0.000000e+00, <16 x float> [[TMP0]])
+; CHECK-NEXT:    [[TMP6:%.*]] = call fast float @llvm.vector.reduce.fadd.v8f32(float -0.000000e+00, <8 x float> [[TMP1]])
+; CHECK-NEXT:    [[OP_RDX:%.*]] = fadd fast float [[TMP5]], [[TMP6]]
+; CHECK-NEXT:    [[TMP7:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float -0.000000e+00, <4 x float> [[TMP2]])
+; CHECK-NEXT:    [[OP_RDX1:%.*]] = fadd fast float [[OP_RDX]], [[TMP7]]
+; CHECK-NEXT:    [[OP_RDX2:%.*]] = fadd fast float [[OP_RDX1]], [[TMP3]]
+; CHECK-NEXT:    [[OP_RDX3:%.*]] = fadd fast float [[OP_RDX2]], [[TMP4]]
  ; CHECK-NEXT:    ret float [[OP_RDX3]]
  ;
  ; THRESHOLD-LABEL: @loadadd31(
  ; THRESHOLD-NEXT:  entry:
  ; THRESHOLD-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[X:%.*]], i64 1
-; THRESHOLD-NEXT:    [[TMP1:%.*]] = load <16 x float>, ptr [[ARRAYIDX]], align 4
+; THRESHOLD-NEXT:    [[TMP0:%.*]] = load <16 x float>, ptr [[ARRAYIDX]], align 4
  ; THRESHOLD-NEXT:    [[ARRAYIDX_16:%.*]] = getelementptr inbounds float, ptr [[X]], i64 17
-; THRESHOLD-NEXT:    [[TMP3:%.*]] = load <8 x float>, ptr [[ARRAYIDX_16]], align 4
+; THRESHOLD-NEXT:    [[TMP1:%.*]] = load <8 x float>, ptr [[ARRAYIDX_16]], align 4
  ; THRESHOLD-NEXT:    [[ARRAYIDX_24:%.*]] = getelementptr inbounds float, ptr [[X]], i64 25
-; THRESHOLD-NEXT:    [[TMP5:%.*]] = load <4 x float>, ptr [[ARRAYIDX_24]], align 4
+; THRESHOLD-NEXT:    [[TMP2:%.*]] = load <4 x float>, ptr [[ARRAYIDX_24]], align 4
  ; THRESHOLD-NEXT:    [[ARRAYIDX_28:%.*]] = getelementptr inbounds float, ptr [[X]], i64 29
-; THRESHOLD-NEXT:    [[TMP6:%.*]] = load float, ptr [[ARRAYIDX_28]], align 4
+; THRESHOLD-NEXT:    [[TMP3:%.*]] = load float, ptr [[ARRAYIDX_28]], align 4
  ; THRESHOLD-NEXT:    [[ARRAYIDX_29:%.*]] = getelementptr inbounds float, ptr [[X]], i64 30
-; THRESHOLD-NEXT:    [[TMP7:%.*]] = load float, ptr [[ARRAYIDX_29]], align 4
-; THRESHOLD-NEXT:    [[TMP8:%.*]] = call fast float @llvm.vector.reduce.fadd.v16f32(float -0.000000e+00, <16 x float> [[TMP1]])
-; THRESHOLD-NEXT:    [[TMP9:%.*]] = call fast float @llvm.vector.reduce.fadd.v8f32(float -0.000000e+00, <8 x float> [[TMP3]])
-; THRESHOLD-NEXT:    [[OP_RDX:%.*]] = fadd fast float [[TMP8]], [[TMP9]]
-; THRESHOLD-NEXT:    [[TMP10:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float -0.000000e+00, <4 x float> [[TMP5]])
-; THRESHOLD-NEXT:    [[OP_RDX1:%.*]] = fadd fast float [[OP_RDX]], [[TMP10]]
-; THRESHOLD-NEXT:    [[OP_RDX2:%.*]] = fadd fast float [[OP_RDX1]], [[TMP6]]
-; THRESHOLD-NEXT:    [[OP_RDX3:%.*]] = fadd fast float [[OP_RDX2]], [[TMP7]]
+; THRESHOLD-NEXT:    [[TMP4:%.*]] = load float, ptr [[ARRAYIDX_29]], align 4
+; THRESHOLD-NEXT:    [[TMP5:%.*]] = call fast float @llvm.vector.reduce.fadd.v16f32(float -0.000000e+00, <16 x float> [[TMP0]])
+; THRESHOLD-NEXT:    [[TMP6:%.*]] = call fast float @llvm.vector.reduce.fadd.v8f32(float -0.000000e+00, <8 x float> [[TMP1]])
+; THRESHOLD-NEXT:    [[OP_RDX:%.*]] = fadd fast float [[TMP5]], [[TMP6]]
+; THRESHOLD-NEXT:    [[TMP7:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float -0.000000e+00, <4 x float> [[TMP2]])
+; THRESHOLD-NEXT:    [[OP_RDX1:%.*]] = fadd fast float [[OP_RDX]], [[TMP7]]
+; THRESHOLD-NEXT:    [[OP_RDX2:%.*]] = fadd fast float [[OP_RDX1]], [[TMP3]]
+; THRESHOLD-NEXT:    [[OP_RDX3:%.*]] = fadd fast float [[OP_RDX2]], [[TMP4]]
  ; THRESHOLD-NEXT:    ret float [[OP_RDX3]]
  ;
    entry:
@@ -740,27 +743,23 @@ define float @extra_args(ptr nocapture readonly %x, i32 %a, i32 %b) {
  ; CHECK-NEXT:  entry:
  ; CHECK-NEXT:    [[MUL:%.*]] = mul nsw i32 [[B:%.*]], [[A:%.*]]
  ; CHECK-NEXT:    [[CONV:%.*]] = sitofp i32 [[MUL]] to float
-; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x float>, ptr [[X:%.*]], align 4
-; CHECK-NEXT:    [[TMP2:%.*]] = call fast float @llvm.vector.reduce.fadd.v8f32(float -0.000000e+00, <8 x float> [[TMP1]])
-; CHECK-NEXT:    [[OP_RDX:%.*]] = fadd fast float [[TMP2]], [[CONV]]
-; CHECK-NEXT:    [[OP_RDX1:%.*]] = fadd fast float [[CONV]], 3.000000e+00
-; CHECK-NEXT:    [[OP_RDX2:%.*]] = fadd fast float [[OP_RDX]], [[OP_RDX1]]
-; CHECK-NEXT:    ret float [[OP_RDX2]]
+; CHECK-NEXT:    [[TMP0:%.*]] = load <8 x float>, ptr [[X:%.*]], align 4
+; CHECK-NEXT:    [[TMP1:%.*]] = call fast float @llvm.vector.reduce.fadd.v8f32(float -0.000000e+00, <8 x float> [[TMP0]])
+; CHECK-NEXT:    [[TMP2:%.*]] = fmul fast float [[CONV]], 2.000000e+00
+; CHECK-NEXT:    [[OP_RDX:%.*]] = fadd fast float [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    [[OP_RDX1:%.*]] = fadd fast float [[OP_RDX]], 3.000000e+00
+; CHECK-NEXT:    ret float [[OP_RDX1]]
  ;
  ; THRESHOLD-LABEL: @extra_args(
  ; THRESHOLD-NEXT:  entry:
  ; THRESHOLD-NEXT:    [[MUL:%.*]] = mul nsw i32 [[B:%.*]], [[A:%.*]]
  ; THRESHOLD-NEXT:    [[CONV:%.*]] = sitofp i32 [[MUL]] to float
-; THRESHOLD-NEXT:    [[TMP1:%.*]] = load <8 x float>, ptr [[X:%.*]], align 4
-; THRESHOLD-NEXT:    [[TMP2:%.*]] = call fast float @llvm.vector.reduce.fadd.v8f32(float -0.000000e+00, <8 x float> [[TMP1]])
-; THRESHOLD-NEXT:    [[TMP3:%.*]] = insertelement <2 x float> <float poison, float 3.000000e+00>, float [[TMP2]], i32 0
-; THRESHOLD-NEXT:    [[TMP4:%.*]] = insertelement <2 x float> poison, float [[CONV]], i32 0
-; THRESHOLD-NEXT:    [[SHUFFLE:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <2 x i32> zeroinitializer
-; THRESHOLD-NEXT:    [[TMP5:%.*]] = fadd fast <2 x float> [[TMP3]], [[SHUFFLE]]
-; THRESHOLD-NEXT:    [[TMP6:%.*]] = extractelement <2 x float> [[TMP5]], i32 0
-; THRESHOLD-NEXT:    [[TMP7:%.*]] = extractelement <2 x float> [[TMP5]], i32 1
-; THRESHOLD-NEXT:    [[OP_RDX2:%.*]] = fadd fast float [[TMP6]], [[TMP7]]
-; THRESHOLD-NEXT:    ret float [[OP_RDX2]]
+; THRESHOLD-NEXT:    [[TMP0:%.*]] = load <8 x float>, ptr [[X:%.*]], align 4
+; THRESHOLD-NEXT:    [[TMP1:%.*]] = call fast float @llvm.vector.reduce.fadd.v8f32(float -0.000000e+00, <8 x float> [[TMP0]])
+; THRESHOLD-NEXT:    [[TMP2:%.*]] = fmul fast float [[CONV]], 2.000000e+00
+; THRESHOLD-NEXT:    [[OP_RDX:%.*]] = fadd fast float [[TMP1]], [[TMP2]]
+; THRESHOLD-NEXT:    [[OP_RDX1:%.*]] = fadd fast float [[OP_RDX]], 3.000000e+00
+; THRESHOLD-NEXT:    ret float [[OP_RDX1]]
  ;
    entry:
    %mul = mul nsw i32 %b, %a
@@ -798,29 +797,28 @@ define float @extra_args_same_several_times(ptr nocapture readonly %x, i32 %a, i
  ; CHECK-NEXT:  entry:
  ; CHECK-NEXT:    [[MUL:%.*]] = mul nsw i32 [[B:%.*]], [[A:%.*]]
  ; CHECK-NEXT:    [[CONV:%.*]] = sitofp i32 [[MUL]] to float
-; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x float>, ptr [[X:%.*]], align 4
-; CHECK-NEXT:    [[TMP2:%.*]] = call fast float @llvm.vector.reduce.fadd.v8f32(float -0.000000e+00, <8 x float> [[TMP1]])
-; CHECK-NEXT:    [[OP_RDX:%.*]] = fadd fast float [[TMP2]], 5.000000e+00
-; CHECK-NEXT:    [[OP_RDX1:%.*]] = fadd fast float [[CONV]], [[CONV]]
-; CHECK-NEXT:    [[OP_RDX2:%.*]] = fadd fast float [[OP_RDX]], 8.000000e+00
-; CHECK-NEXT:    [[OP_RDX3:%.*]] = fadd fast float [[OP_RDX2]], [[OP_RDX1]]
-; CHECK-NEXT:    ret float [[OP_RDX3]]
+; CHECK-NEXT:    [[TMP0:%.*]] = load <8 x float>, ptr [[X:%.*]], align 4
+; CHECK-NEXT:    [[TMP1:%.*]] = call fast float @llvm.vector.reduce.fadd.v8f32(float -0.000000e+00, <8 x float> [[TMP0]])
+; CHECK-NEXT:    [[OP_RDX:%.*]] = fadd fast float [[TMP1]], 1.300000e+01
+; CHECK-NEXT:    [[TMP2:%.*]] = fmul fast float [[CONV]], 2.000000e+00
+; CHECK-NEXT:    [[OP_RDX1:%.*]] = fadd fast float [[OP_RDX]], [[TMP2]]
+; CHECK-NEXT:    ret float [[OP_RDX1]]
  ;
  ; THRESHOLD-LABEL: @extra_args_same_several_times(
  ; THRESHOLD-NEXT:  entry:
  ; THRESHOLD-NEXT:    [[MUL:%.*]] = mul nsw i32 [[B:%.*]], [[A:%.*]]
  ; THRESHOLD-NEXT:    [[CONV:%.*]] = sitofp i32 [[MUL]] to float
-; THRESHOLD-NEXT:    [[TMP1:%.*]] = load <8 x float>, ptr [[X:%.*]], align 4
-; THRESHOLD-NEXT:    [[TMP2:%.*]] = call fast float @llvm.vector.reduce.fadd.v8f32(float -0.000000e+00, <8 x float> [[TMP1]])
-; THRESHOLD-NEXT:    [[OP_RDX:%.*]] = fadd fast float [[TMP2]], 5.000000e+00
-; THRESHOLD-NEXT:    [[TMP3:%.*]] = insertelement <2 x float> poison, float [[OP_RDX]], i32 0
-; THRESHOLD-NEXT:    [[TMP4:%.*]] = insertelement <2 x float> [[TMP3]], float [[CONV]], i32 1
-; THRESHOLD-NEXT:    [[TMP5:%.*]] = insertelement <2 x float> <float 8.000000e+00, float poison>, float [[CONV]], i32 1
-; THRESHOLD-NEXT:    [[TMP6:%.*]] = fadd fast <2 x float> [[TMP4]], [[TMP5]]
+; THRESHOLD-NEXT:    [[TMP0:%.*]] = load <8 x float>, ptr [[X:%.*]], align 4
+; THRESHOLD-NEXT:    [[TMP1:%.*]] = call fast float @llvm.vector.reduce.fadd.v8f32(float -0.000000e+00, <8 x float> [[TMP0]])
+; THRESHOLD-NEXT:    [[TMP2:%.*]] = insertelement <2 x float> poison, float [[TMP1]], i32 0
+; THRESHOLD-NEXT:    [[TMP3:%.*]] = insertelement <2 x float> [[TMP2]], float [[CONV]], i32 1
+; THRESHOLD-NEXT:    [[TMP4:%.*]] = fadd fast <2 x float> [[TMP3]], <float 1.300000e+01, float 2.000000e+00>
+; THRESHOLD-NEXT:    [[TMP5:%.*]] = fmul fast <2 x float> [[TMP3]], <float 1.300000e+01, float 2.000000e+00>
+; THRESHOLD-NEXT:    [[TMP6:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> [[TMP5]], <2 x i32> <i32 0, i32 3>
  ; THRESHOLD-NEXT:    [[TMP7:%.*]] = extractelement <2 x float> [[TMP6]], i32 0
  ; THRESHOLD-NEXT:    [[TMP8:%.*]] = extractelement <2 x float> [[TMP6]], i32 1
-; THRESHOLD-NEXT:    [[OP_RDX3:%.*]] = fadd fast float [[TMP7]], [[TMP8]]
-; THRESHOLD-NEXT:    ret float [[OP_RDX3]]
+; THRESHOLD-NEXT:    [[OP_RDX1:%.*]] = fadd fast float [[TMP7]], [[TMP8]]
+; THRESHOLD-NEXT:    ret float [[OP_RDX1]]
  ;
    entry:
    %mul = mul nsw i32 %b, %a
@@ -861,31 +859,26 @@ define float @extra_args_no_replace(ptr nocapture readonly %x, i32 %a, i32 %b, i
  ; CHECK-NEXT:    [[MUL:%.*]] = mul nsw i32 [[B:%.*]], [[A:%.*]]
  ; CHECK-NEXT:    [[CONV:%.*]] = sitofp i32 [[MUL]] to float
  ; CHECK-NEXT:    [[CONVC:%.*]] = sitofp i32 [[C:%.*]] to float
-; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x float>, ptr [[X:%.*]], align 4
-; CHECK-NEXT:    [[TMP2:%.*]] = call fast float @llvm.vector.reduce.fadd.v8f32(float -0.000000e+00, <8 x float> [[TMP1]])
-; CHECK-NEXT:    [[OP_RDX:%.*]] = fadd fast float [[TMP2]], [[CONV]]
-; CHECK-NEXT:    [[OP_RDX1:%.*]] = fadd fast float [[CONV]], [[CONVC]]
-; CHECK-NEXT:    [[OP_RDX2:%.*]] = fadd fast float [[OP_RDX]], [[OP_RDX1]]
-; CHECK-NEXT:    [[OP_RDX3:%.*]] = fadd fast float [[OP_RDX2]], 3.000000e+00
-; CHECK-NEXT:    ret float [[OP_RDX3]]
+; CHECK-NEXT:    [[TMP0:%.*]] = load <8 x float>, ptr [[X:%.*]], align 4
+; CHECK-NEXT:    [[TMP1:%.*]] = call fast float @llvm.vector.reduce.fadd.v8f32(float -0.000000e+00, <8 x float> [[TMP0]])
+; CHECK-NEXT:    [[TMP2:%.*]] = fmul fast float [[CONV]], 2.000000e+00
+; CHECK-NEXT:    [[OP_RDX:%.*]] = fadd fast float [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    [[OP_RDX1:%.*]] = fadd fast float [[OP_RDX]], [[CONVC]]
+; CHECK-NEXT:    [[OP_RDX2:%.*]] = fadd fast float [[OP_RDX1]], 3.000000e+00
+; CHECK-NEXT:    ret float [[OP_RDX2]]
  ;
  ; THRESHOLD-LABEL: @extra_args_no_replace(
  ; THRESHOLD-NEXT:  entry:
  ; THRESHOLD-NEXT:    [[MUL:%.*]] = mul nsw i32 [[B:%.*]], [[A:%.*]]
  ; THRESHOLD-NEXT:    [[CONV:%.*]] = sitofp i32 [[MUL]] to float
  ; THRESHOLD-NEXT:    [[CONVC:%.*]] = sitofp i32 [[C:%.*]] to float
-; THRESHOLD-NEXT:    [[TMP1:%.*]] = load <8 x float>, ptr [[X:%.*]], align 4
-; THRESHOLD-NEXT:    [[TMP2:%.*]] = call fast float @llvm.vector.reduce.fadd.v8f32(float -0.000000e+00, <8 x float> [[TMP1]])
-; THRESHOLD-NEXT:    [[TMP3:%.*]] = insertelement <2 x float> poison, float [[TMP2]], i32 0
-; THRESHOLD-NEXT:    [[TMP4:%.*]] = insertelement <2 x float> [[TMP3]], float [[CONVC]], i32 1
-; THRESHOLD-NEXT:    [[TMP5:%.*]] = insertelement <2 x float> poison, float [[CONV]], i32 0
-; THRESHOLD-NEXT:    [[SHUFFLE:%.*]] = shufflevector <2 x float> [[TMP5]], <2 x float> poison, <2 x i32> zeroinitializer
-; THRESHOLD-NEXT:    [[TMP6:%.*]] = fadd fast <2 x float> [[TMP4]], [[SHUFFLE]]
-; THRESHOLD-NEXT:    [[TMP7:%.*]] = extractelement <2 x float> [[TMP6]], i32 0
-; THRESHOLD-NEXT:    [[TMP8:%.*]] = extractelement <2 x float> [[TMP6]], i32 1
-; THRESHOLD-NEXT:    [[OP_RDX2:%.*]] = fadd fast float [[TMP7]], [[TMP8]]
-; THRESHOLD-NEXT:    [[OP_RDX3:%.*]] = fadd fast float [[OP_RDX2]], 3.000000e+00
-; THRESHOLD-NEXT:    ret float [[OP_RDX3]]
+; THRESHOLD-NEXT:    [[TMP0:%.*]] = load <8 x float>, ptr [[X:%.*]], align 4
+; THRESHOLD-NEXT:    [[TMP1:%.*]] = call fast float @llvm.vector.reduce.fadd.v8f32(float -0.000000e+00, <8 x float> [[TMP0]])
+; THRESHOLD-NEXT:    [[TMP2:%.*]] = fmul fast float [[CONV]], 2.000000e+00
+; THRESHOLD-NEXT:    [[OP_RDX:%.*]] = fadd fast float [[TMP1]], [[TMP2]]
+; THRESHOLD-NEXT:    [[OP_RDX1:%.*]] = fadd fast float [[OP_RDX]], [[CONVC]]
+; THRESHOLD-NEXT:    [[OP_RDX2:%.*]] = fadd fast float [[OP_RDX1]], 3.000000e+00
+; THRESHOLD-NEXT:    ret float [[OP_RDX2]]
  ;
    entry:
    %mul = mul nsw i32 %b, %a
@@ -976,32 +969,32 @@ define i32 @wobble(i32 %arg, i32 %bar) {
  ; CHECK-LABEL: @wobble(
  ; CHECK-NEXT:  bb:
  ; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <4 x i32> poison, i32 [[ARG:%.*]], i32 0
-; CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[TMP0]], <4 x i32> poison, <4 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <4 x i32> poison, i32 [[BAR:%.*]], i32 0
-; CHECK-NEXT:    [[SHUFFLE1:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> poison, <4 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP2:%.*]] = xor <4 x i32> [[SHUFFLE]], [[SHUFFLE1]]
-; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <4 x i32> [[TMP2]], i32 3
-; CHECK-NEXT:    [[TMP4:%.*]] = icmp eq <4 x i32> [[TMP2]], zeroinitializer
-; CHECK-NEXT:    [[TMP5:%.*]] = sext <4 x i1> [[TMP4]] to <4 x i32>
-; CHECK-NEXT:    [[TMP6:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP5]])
-; CHECK-NEXT:    [[OP_RDX:%.*]] = add i32 [[TMP6]], [[TMP3]]
-; CHECK-NEXT:    [[OP_RDX2:%.*]] = add i32 [[OP_RDX]], [[ARG]]
-; CHECK-NEXT:    ret i32 [[OP_RDX2]]
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i32> [[TMP0]], <4 x i32> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <4 x i32> poison, i32 [[BAR:%.*]], i32 0
+; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP4:%.*]] = xor <4 x i32> [[TMP1]], [[TMP3]]
+; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <4 x i32> [[TMP4]], i32 3
+; CHECK-NEXT:    [[TMP6:%.*]] = icmp eq <4 x i32> [[TMP4]], zeroinitializer
+; CHECK-NEXT:    [[TMP7:%.*]] = sext <4 x i1> [[TMP6]] to <4 x i32>
+; CHECK-NEXT:    [[TMP8:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP7]])
+; CHECK-NEXT:    [[OP_RDX:%.*]] = add i32 [[TMP8]], [[TMP5]]
+; CHECK-NEXT:    [[OP_RDX1:%.*]] = add i32 [[OP_RDX]], [[ARG]]
+; CHECK-NEXT:    ret i32 [[OP_RDX1]]
  ;
  ; THRESHOLD-LABEL: @wobble(
  ; THRESHOLD-NEXT:  bb:
  ; THRESHOLD-NEXT:    [[TMP0:%.*]] = insertelement <4 x i32> poison, i32 [[ARG:%.*]], i32 0
-; THRESHOLD-NEXT:    [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[TMP0]], <4 x i32> poison, <4 x i32> zeroinitializer
-; THRESHOLD-NEXT:    [[TMP1:%.*]] = insertelement <4 x i32> poison, i32 [[BAR:%.*]], i32 0
-; THRESHOLD-NEXT:    [[SHUFFLE1:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> poison, <4 x i32> zeroinitializer
-; THRESHOLD-NEXT:    [[TMP2:%.*]] = xor <4 x i32> [[SHUFFLE]], [[SHUFFLE1]]
-; THRESHOLD-NEXT:    [[TMP3:%.*]] = extractelement <4 x i32> [[TMP2]], i32 3
-; THRESHOLD-NEXT:    [[TMP4:%.*]] = icmp eq <4 x i32> [[TMP2]], zeroinitializer
-; THRESHOLD-NEXT:    [[TMP5:%.*]] = sext <4 x i1> [[TMP4]] to <4 x i32>
-; THRESHOLD-NEXT:    [[TMP6:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP5]])
-; THRESHOLD-NEXT:    [[OP_RDX:%.*]] = add i32 [[TMP6]], [[TMP3]]
-; THRESHOLD-NEXT:    [[OP_RDX2:%.*]] = add i32 [[OP_RDX]], [[ARG]]
-; THRESHOLD-NEXT:    ret i32 [[OP_RDX2]]
+; THRESHOLD-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i32> [[TMP0]], <4 x i32> poison, <4 x i32> zeroinitializer
+; THRESHOLD-NEXT:    [[TMP2:%.*]] = insertelement <4 x i32> poison, i32 [[BAR:%.*]], i32 0
+; THRESHOLD-NEXT:    [[TMP3:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> poison, <4 x i32> zeroinitializer
+; THRESHOLD-NEXT:    [[TMP4:%.*]] = xor <4 x i32> [[TMP1]], [[TMP3]]
+; THRESHOLD-NEXT:    [[TMP5:%.*]] = extractelement <4 x i32> [[TMP4]], i32 3
+; THRESHOLD-NEXT:    [[TMP6:%.*]] = icmp eq <4 x i32> [[TMP4]], zeroinitializer
+; THRESHOLD-NEXT:    [[TMP7:%.*]] = sext <4 x i1> [[TMP6]] to <4 x i32>
+; THRESHOLD-NEXT:    [[TMP8:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP7]])
+; THRESHOLD-NEXT:    [[OP_RDX:%.*]] = add i32 [[TMP8]], [[TMP5]]
+; THRESHOLD-NEXT:    [[OP_RDX1:%.*]] = add i32 [[OP_RDX]], [[ARG]]
+; THRESHOLD-NEXT:    ret i32 [[OP_RDX1]]
  ;
    bb:
    %x1 = xor i32 %arg, %bar
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/malformed_phis.ll b/llvm/test/Transforms/SLPVectorizer/X86/malformed_phis.ll

index b9f1a42..6cf4a23 100644 (file)
--- a/llvm/test/Transforms/SLPVectorizer/X86/malformed_phis.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/malformed_phis.ll
@@ -62,13 +62,9 @@ define void @test_2(ptr addrspace(1) %arg, i32 %arg1) #0 {
  ; CHECK:       bb2:
  ; CHECK-NEXT:    [[TMP:%.*]] = phi i32 [ undef, [[BB:%.*]] ], [ undef, [[BB2]] ]
  ; CHECK-NEXT:    [[TMP3:%.*]] = phi i32 [ 0, [[BB]] ], [ undef, [[BB2]] ]
-; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <8 x i32> poison, i32 [[TMP]], i32 0
-; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i32> [[TMP0]], <8 x i32> poison, <8 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP2:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> undef)
-; CHECK-NEXT:    [[TMP3:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[TMP1]])
-; CHECK-NEXT:    [[OP_RDX:%.*]] = add i32 [[TMP2]], [[TMP3]]
-; CHECK-NEXT:    [[OP_RDX1:%.*]] = add i32 [[OP_RDX]], undef
-; CHECK-NEXT:    call void @use(i32 [[OP_RDX1]])
+; CHECK-NEXT:    [[TMP0:%.*]] = mul i32 [[TMP]], 8
+; CHECK-NEXT:    [[OP_RDX:%.*]] = add i32 undef, [[TMP0]]
+; CHECK-NEXT:    call void @use(i32 [[OP_RDX]])
  ; CHECK-NEXT:    br label [[BB2]]
  ;
  bb:
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/reduced-gathered-vectorized.ll b/llvm/test/Transforms/SLPVectorizer/X86/reduced-gathered-vectorized.ll

index cb4e3bd..9d52dfa 100644 (file)
--- a/llvm/test/Transforms/SLPVectorizer/X86/reduced-gathered-vectorized.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/reduced-gathered-vectorized.ll
@@ -6,48 +6,21 @@ define i16 @test() {
  ; CHECK-NEXT:  entry:
  ; CHECK-NEXT:    [[A:%.*]] = getelementptr [1000 x i64], ptr null, i64 0, i64 5
  ; CHECK-NEXT:    [[A1:%.*]] = getelementptr [1000 x i64], ptr null, i64 0, i64 6
-; CHECK-NEXT:    [[A2:%.*]] = getelementptr [1000 x i64], ptr null, i64 0, i64 7
-; CHECK-NEXT:    [[A3:%.*]] = getelementptr [1000 x i64], ptr null, i64 0, i64 8
  ; CHECK-NEXT:    br label [[WHILE:%.*]]
  ; CHECK:       while:
-; CHECK-NEXT:    [[PH:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[OP_RDX12:%.*]], [[WHILE]] ]
+; CHECK-NEXT:    [[PH:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[OP_RDX25:%.*]], [[WHILE]] ]
  ; CHECK-NEXT:    [[TMP0:%.*]] = load i64, ptr null, align 8
-; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i64>, ptr [[A2]], align 8
-; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <2 x i64> [[TMP1]], <2 x i64> poison, <4 x i32> <i32 0, i32 0, i32 0, i32 1>
-; CHECK-NEXT:    [[TMP3:%.*]] = load i64, ptr null, align 8
-; CHECK-NEXT:    [[TMP4:%.*]] = load <2 x i64>, ptr [[A]], align 8
-; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <2 x i64> [[TMP4]], <2 x i64> poison, <4 x i32> <i32 1, i32 1, i32 1, i32 0>
-; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <16 x i64> poison, i64 [[TMP3]], i32 0
-; CHECK-NEXT:    [[TMP7:%.*]] = shufflevector <4 x i64> [[TMP2]], <4 x i64> poison, <16 x i32> <i32 0, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; CHECK-NEXT:    [[TMP8:%.*]] = shufflevector <2 x i64> [[TMP1]], <2 x i64> poison, <16 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; CHECK-NEXT:    [[TMP9:%.*]] = shufflevector <16 x i64> [[TMP6]], <16 x i64> [[TMP8]], <16 x i32> <i32 0, i32 16, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; CHECK-NEXT:    [[TMP10:%.*]] = shufflevector <4 x i64> [[TMP2]], <4 x i64> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; CHECK-NEXT:    [[TMP11:%.*]] = shufflevector <16 x i64> [[TMP9]], <16 x i64> [[TMP10]], <16 x i32> <i32 0, i32 1, i32 16, i32 17, i32 18, i32 19, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; CHECK-NEXT:    [[TMP12:%.*]] = shufflevector <4 x i64> [[TMP2]], <4 x i64> poison, <16 x i32> <i32 undef, i32 undef, i32 undef, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; CHECK-NEXT:    [[TMP13:%.*]] = shufflevector <16 x i64> [[TMP11]], <16 x i64> [[TMP8]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 17, i32 17, i32 17, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; CHECK-NEXT:    [[TMP14:%.*]] = insertelement <16 x i64> [[TMP13]], i64 [[TMP0]], i32 9
-; CHECK-NEXT:    [[TMP15:%.*]] = insertelement <16 x i64> [[TMP14]], i64 [[TMP0]], i32 10
-; CHECK-NEXT:    [[TMP16:%.*]] = insertelement <16 x i64> [[TMP15]], i64 [[TMP0]], i32 11
-; CHECK-NEXT:    [[TMP17:%.*]] = shufflevector <4 x i64> [[TMP5]], <4 x i64> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; CHECK-NEXT:    [[TMP18:%.*]] = shufflevector <16 x i64> [[TMP16]], <16 x i64> [[TMP17]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 18, i32 19>
-; CHECK-NEXT:    [[TMP19:%.*]] = load i64, ptr [[A1]], align 16
-; CHECK-NEXT:    [[TMP20:%.*]] = load i64, ptr [[A2]], align 8
-; CHECK-NEXT:    [[TMP21:%.*]] = load i64, ptr [[A3]], align 16
-; CHECK-NEXT:    [[TMP22:%.*]] = call i64 @llvm.vector.reduce.xor.v16i64(<16 x i64> [[TMP18]])
-; CHECK-NEXT:    [[OP_RDX:%.*]] = xor i64 [[TMP22]], [[TMP3]]
-; CHECK-NEXT:    [[OP_RDX1:%.*]] = xor i64 [[TMP3]], [[TMP3]]
-; CHECK-NEXT:    [[TMP23:%.*]] = extractelement <4 x i64> [[TMP5]], i32 3
-; CHECK-NEXT:    [[OP_RDX2:%.*]] = xor i64 [[TMP3]], [[TMP23]]
-; CHECK-NEXT:    [[OP_RDX3:%.*]] = xor i64 [[TMP23]], [[TMP19]]
-; CHECK-NEXT:    [[OP_RDX4:%.*]] = xor i64 [[TMP19]], [[TMP19]]
-; CHECK-NEXT:    [[OP_RDX5:%.*]] = xor i64 [[TMP20]], [[TMP20]]
-; CHECK-NEXT:    [[OP_RDX6:%.*]] = xor i64 [[TMP21]], [[TMP21]]
-; CHECK-NEXT:    [[OP_RDX7:%.*]] = xor i64 [[OP_RDX]], [[OP_RDX1]]
-; CHECK-NEXT:    [[OP_RDX8:%.*]] = xor i64 [[OP_RDX2]], [[OP_RDX3]]
-; CHECK-NEXT:    [[OP_RDX9:%.*]] = xor i64 [[OP_RDX4]], [[OP_RDX5]]
-; CHECK-NEXT:    [[OP_RDX10:%.*]] = xor i64 [[OP_RDX7]], [[OP_RDX8]]
-; CHECK-NEXT:    [[OP_RDX11:%.*]] = xor i64 [[OP_RDX9]], [[OP_RDX6]]
-; CHECK-NEXT:    [[OP_RDX12]] = xor i64 [[OP_RDX10]], [[OP_RDX11]]
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr null, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x i64>, ptr [[A]], align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <2 x i64>, ptr [[A1]], align 16
+; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <2 x i64> [[TMP2]], <2 x i64> poison, <4 x i32> <i32 1, i32 0, i32 undef, i32 undef>
+; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <2 x i64> [[TMP3]], <2 x i64> poison, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
+; CHECK-NEXT:    [[TMP6:%.*]] = shufflevector <4 x i64> [[TMP4]], <4 x i64> [[TMP5]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+; CHECK-NEXT:    [[TMP7:%.*]] = shufflevector <4 x i64> [[TMP6]], <4 x i64> zeroinitializer, <4 x i32> <i32 0, i32 1, i32 2, i32 4>
+; CHECK-NEXT:    [[TMP8:%.*]] = call i64 @llvm.vector.reduce.xor.v4i64(<4 x i64> [[TMP7]])
+; CHECK-NEXT:    [[OP_RDX23:%.*]] = xor i64 0, [[TMP1]]
+; CHECK-NEXT:    [[OP_RDX24:%.*]] = xor i64 [[TMP0]], [[TMP8]]
+; CHECK-NEXT:    [[OP_RDX25]] = xor i64 [[OP_RDX23]], [[OP_RDX24]]
  ; CHECK-NEXT:    br label [[WHILE]]
  ;
  entry:
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/reduction-value-in-tree.ll b/llvm/test/Transforms/SLPVectorizer/X86/reduction-value-in-tree.ll

index b349c3a..254525c 100644 (file)
--- a/llvm/test/Transforms/SLPVectorizer/X86/reduction-value-in-tree.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/reduction-value-in-tree.ll
@@ -6,11 +6,7 @@ define void @test() {
  ; CHECK-NEXT:    br i1 false, label [[PH:%.*]], label [[EXIT:%.*]]
  ; CHECK:       ph:
  ; CHECK-NEXT:    [[TMP0:%.*]] = call i8 @llvm.vector.reduce.and.v8i8(<8 x i8> zeroinitializer)
-; CHECK-NEXT:    [[TMP1:%.*]] = call i8 @llvm.vector.reduce.and.v4i8(<4 x i8> zeroinitializer)
-; CHECK-NEXT:    [[OP_RDX:%.*]] = and i8 [[TMP0]], [[TMP1]]
-; CHECK-NEXT:    [[TMP2:%.*]] = call i8 @llvm.vector.reduce.and.v8i8(<8 x i8> zeroinitializer)
-; CHECK-NEXT:    [[OP_RDX1:%.*]] = and i8 [[OP_RDX]], [[TMP2]]
-; CHECK-NEXT:    [[OP_RDX2:%.*]] = and i8 [[OP_RDX1]], 0
+; CHECK-NEXT:    [[OP_RDX2:%.*]] = and i8 0, [[TMP0]]
  ; CHECK-NEXT:    br label [[EXIT]]
  ; CHECK:       exit:
  ; CHECK-NEXT:    [[PHI:%.*]] = phi i8 [ [[OP_RDX2]], [[PH]] ], [ 0, [[BB:%.*]] ]
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/reorder_repeated_ops.ll b/llvm/test/Transforms/SLPVectorizer/X86/reorder_repeated_ops.ll

index 264b0d9..8c8c644 100644 (file)
--- a/llvm/test/Transforms/SLPVectorizer/X86/reorder_repeated_ops.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/reorder_repeated_ops.ll
@@ -15,8 +15,8 @@ define void @hoge() {
  ; CHECK-NEXT:    [[TMP1:%.*]] = sext <2 x i16> [[TMP0]] to <2 x i32>
  ; CHECK-NEXT:    [[TMP2:%.*]] = sub nsw <2 x i32> <i32 63, i32 undef>, [[TMP1]]
  ; CHECK-NEXT:    [[TMP3:%.*]] = sub <2 x i32> [[TMP2]], undef
-; CHECK-NEXT:    [[SHUFFLE4:%.*]] = shufflevector <2 x i32> [[TMP3]], <2 x i32> poison, <4 x i32> <i32 1, i32 0, i32 1, i32 1>
-; CHECK-NEXT:    [[TMP4:%.*]] = add <4 x i32> [[SHUFFLE4]], <i32 15, i32 undef, i32 31, i32 47>
+; CHECK-NEXT:    [[SHUFFLE2:%.*]] = shufflevector <2 x i32> [[TMP3]], <2 x i32> poison, <4 x i32> <i32 1, i32 0, i32 1, i32 1>
+; CHECK-NEXT:    [[TMP4:%.*]] = add <4 x i32> [[SHUFFLE2]], <i32 15, i32 undef, i32 31, i32 47>
  ; CHECK-NEXT:    [[TMP5:%.*]] = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> [[TMP4]])
  ; CHECK-NEXT:    [[T19:%.*]] = select i1 undef, i32 [[TMP5]], i32 undef
  ; CHECK-NEXT:    [[T20:%.*]] = icmp sgt i32 [[T19]], 63
@@ -24,13 +24,10 @@ define void @hoge() {
  ; CHECK-NEXT:    [[TMP7:%.*]] = sub <2 x i32> [[TMP6]], undef
  ; CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[TMP7]], <2 x i32> poison, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
  ; CHECK-NEXT:    [[TMP8:%.*]] = add nsw <4 x i32> [[SHUFFLE]], <i32 -17, i32 -33, i32 -33, i32 -49>
-; CHECK-NEXT:    [[TMP9:%.*]] = call i32 @llvm.vector.reduce.smin.v4i32(<4 x i32> undef)
-; CHECK-NEXT:    [[TMP10:%.*]] = call i32 @llvm.vector.reduce.smin.v4i32(<4 x i32> [[TMP8]])
-; CHECK-NEXT:    [[OP_RDX:%.*]] = icmp slt i32 [[TMP9]], [[TMP10]]
-; CHECK-NEXT:    [[OP_RDX1:%.*]] = select i1 [[OP_RDX]], i32 [[TMP9]], i32 [[TMP10]]
-; CHECK-NEXT:    [[OP_RDX2:%.*]] = icmp slt i32 [[OP_RDX1]], undef
-; CHECK-NEXT:    [[OP_RDX3:%.*]] = select i1 [[OP_RDX2]], i32 [[OP_RDX1]], i32 undef
-; CHECK-NEXT:    [[T45:%.*]] = icmp sgt i32 undef, [[OP_RDX3]]
+; CHECK-NEXT:    [[TMP9:%.*]] = call i32 @llvm.vector.reduce.smin.v4i32(<4 x i32> [[TMP8]])
+; CHECK-NEXT:    [[OP_RDX:%.*]] = icmp slt i32 undef, [[TMP9]]
+; CHECK-NEXT:    [[OP_RDX1:%.*]] = select i1 [[OP_RDX]], i32 undef, i32 [[TMP9]]
+; CHECK-NEXT:    [[T45:%.*]] = icmp sgt i32 undef, [[OP_RDX1]]
  ; CHECK-NEXT:    unreachable
  ;
  bb:
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/revectorized_rdx_crash.ll b/llvm/test/Transforms/SLPVectorizer/X86/revectorized_rdx_crash.ll

index f719cb5..edd1a2a 100644 (file)
--- a/llvm/test/Transforms/SLPVectorizer/X86/revectorized_rdx_crash.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/revectorized_rdx_crash.ll
@@ -19,23 +19,19 @@ define void @test() {
  ; CHECK:       for.cond.preheader:
  ; CHECK-NEXT:    [[I:%.*]] = getelementptr inbounds [100 x i32], ptr undef, i64 0, i64 2
  ; CHECK-NEXT:    [[I1:%.*]] = getelementptr inbounds [100 x i32], ptr undef, i64 0, i64 3
-; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i32>, ptr [[I]], align 8
-; CHECK-NEXT:    [[TMP2:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP1]])
-; CHECK-NEXT:    [[OP_RDX7:%.*]] = add i32 [[TMP2]], undef
-; CHECK-NEXT:    [[OP_RDX8:%.*]] = add i32 [[OP_RDX7]], undef
-; CHECK-NEXT:    [[TMP4:%.*]] = load <4 x i32>, ptr [[I1]], align 4
-; CHECK-NEXT:    [[TMP5:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP4]])
-; CHECK-NEXT:    [[OP_RDX5:%.*]] = add i32 [[TMP5]], undef
-; CHECK-NEXT:    [[OP_RDX6:%.*]] = add i32 [[OP_RDX5]], undef
-; CHECK-NEXT:    [[TMP6:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> undef)
-; CHECK-NEXT:    [[OP_RDX:%.*]] = add i32 [[TMP6]], undef
-; CHECK-NEXT:    [[OP_RDX1:%.*]] = add i32 [[OP_RDX8]], [[OP_RDX8]]
-; CHECK-NEXT:    [[OP_RDX2:%.*]] = add i32 [[OP_RDX6]], [[OP_RDX6]]
-; CHECK-NEXT:    [[OP_RDX3:%.*]] = add i32 [[OP_RDX]], [[OP_RDX1]]
-; CHECK-NEXT:    [[OP_RDX4:%.*]] = add i32 [[OP_RDX3]], [[OP_RDX2]]
+; CHECK-NEXT:    [[TMP0:%.*]] = load <4 x i32>, ptr [[I]], align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP0]])
+; CHECK-NEXT:    [[OP_RDX3:%.*]] = add i32 [[TMP1]], undef
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i32>, ptr [[I1]], align 4
+; CHECK-NEXT:    [[TMP3:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP2]])
+; CHECK-NEXT:    [[OP_RDX2:%.*]] = add i32 [[TMP3]], undef
+; CHECK-NEXT:    [[TMP4:%.*]] = mul i32 [[OP_RDX3]], 2
+; CHECK-NEXT:    [[OP_RDX:%.*]] = add i32 undef, [[TMP4]]
+; CHECK-NEXT:    [[TMP5:%.*]] = mul i32 [[OP_RDX2]], 2
+; CHECK-NEXT:    [[OP_RDX1:%.*]] = add i32 [[OP_RDX]], [[TMP5]]
  ; CHECK-NEXT:    br label [[IF_END]]
  ; CHECK:       if.end:
-; CHECK-NEXT:    [[R:%.*]] = phi i32 [ [[OP_RDX4]], [[FOR_COND_PREHEADER]] ], [ undef, [[ENTRY:%.*]] ]
+; CHECK-NEXT:    [[R:%.*]] = phi i32 [ [[OP_RDX1]], [[FOR_COND_PREHEADER]] ], [ undef, [[ENTRY:%.*]] ]
  ; CHECK-NEXT:    ret void
  ;
  entry:
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/scalarization-overhead.ll b/llvm/test/Transforms/SLPVectorizer/X86/scalarization-overhead.ll

index c855583..e146a0a 100644 (file)
--- a/llvm/test/Transforms/SLPVectorizer/X86/scalarization-overhead.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/scalarization-overhead.ll
@@ -6,24 +6,13 @@
  define i16 @D134605() {
  ; CHECK-LABEL: @D134605(
  ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[ARRAYIDX81:%.*]] = getelementptr inbounds [32 x i16], ptr poison, i16 0, i16 3
-; CHECK-NEXT:    [[TMP0:%.*]] = load i16, ptr [[ARRAYIDX81]], align 1
-; CHECK-NEXT:    [[TMP1:%.*]] = load i16, ptr poison, align 1
-; CHECK-NEXT:    [[ARRAYIDX101:%.*]] = getelementptr inbounds [32 x i16], ptr poison, i16 0, i16 1
-; CHECK-NEXT:    [[TMP2:%.*]] = load i16, ptr [[ARRAYIDX101]], align 1
-; CHECK-NEXT:    [[ARRAYIDX107:%.*]] = getelementptr inbounds [32 x i16], ptr poison, i16 0, i16 2
-; CHECK-NEXT:    [[TMP3:%.*]] = load i16, ptr [[ARRAYIDX107]], align 1
-; CHECK-NEXT:    [[REASS_ADD:%.*]] = add i16 poison, [[TMP0]]
-; CHECK-NEXT:    [[ADD116:%.*]] = add i16 [[TMP1]], [[TMP0]]
-; CHECK-NEXT:    [[ADD122:%.*]] = add i16 [[ADD116]], [[TMP2]]
-; CHECK-NEXT:    [[ADD124:%.*]] = add i16 [[ADD122]], [[TMP3]]
-; CHECK-NEXT:    [[ADD125:%.*]] = add i16 [[ADD124]], poison
-; CHECK-NEXT:    [[FACTOR2531:%.*]] = add i16 [[TMP3]], [[ADD125]]
-; CHECK-NEXT:    [[ADD14332:%.*]] = add i16 [[FACTOR2531]], [[TMP2]]
-; CHECK-NEXT:    [[ADD14933:%.*]] = add i16 [[ADD14332]], [[TMP1]]
-; CHECK-NEXT:    [[ADD15534:%.*]] = add i16 [[ADD14933]], [[TMP0]]
-; CHECK-NEXT:    [[ADD15935:%.*]] = add i16 [[ADD15534]], poison
-; CHECK-NEXT:    [[REASS_MUL24:%.*]] = shl i16 [[ADD15935]], 2
+; CHECK-NEXT:    [[TMP0:%.*]] = load <4 x i16>, ptr poison, align 1
+; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <4 x i16> [[TMP0]], i32 3
+; CHECK-NEXT:    [[REASS_ADD:%.*]] = add i16 poison, [[TMP1]]
+; CHECK-NEXT:    [[TMP2:%.*]] = call i16 @llvm.vector.reduce.add.v4i16(<4 x i16> [[TMP0]])
+; CHECK-NEXT:    [[TMP3:%.*]] = mul i16 [[TMP2]], 2
+; CHECK-NEXT:    [[OP_RDX:%.*]] = add i16 [[TMP3]], poison
+; CHECK-NEXT:    [[REASS_MUL24:%.*]] = shl i16 [[OP_RDX]], 2
  ; CHECK-NEXT:    [[CALL:%.*]] = call i16 @check_i16(i16 noundef 1, i16 noundef [[REASS_MUL24]], i16 noundef 5120)
  ; CHECK-NEXT:    unreachable
  ;
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/slp-schedule-use-order.ll b/llvm/test/Transforms/SLPVectorizer/X86/slp-schedule-use-order.ll

index 99fd681..de4358c 100644 (file)
--- a/llvm/test/Transforms/SLPVectorizer/X86/slp-schedule-use-order.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/slp-schedule-use-order.ll
@@ -6,16 +6,14 @@ define void @test() {
  ; CHECK-NEXT:  entry:
  ; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
  ; CHECK:       for.body:
-; CHECK-NEXT:    [[TMP0:%.*]] = phi <2 x i1> [ [[TMP8:%.*]], [[TMP1:%.*]] ], [ zeroinitializer, [[ENTRY:%.*]] ]
+; CHECK-NEXT:    [[TMP0:%.*]] = phi <2 x i1> [ [[TMP6:%.*]], [[TMP1:%.*]] ], [ zeroinitializer, [[ENTRY:%.*]] ]
  ; CHECK-NEXT:    br label [[TMP1]]
  ; CHECK:       1:
  ; CHECK-NEXT:    [[TMP2:%.*]] = zext <2 x i1> [[TMP0]] to <2 x i8>
-; CHECK-NEXT:    [[TMP3:%.*]] = or i8 0, 0
-; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <2 x i8> <i8 poison, i8 0>, i8 [[TMP3]], i32 0
-; CHECK-NEXT:    [[TMP5:%.*]] = and <2 x i8> [[TMP4]], [[TMP2]]
-; CHECK-NEXT:    [[TMP6:%.*]] = or <2 x i8> [[TMP5]], zeroinitializer
-; CHECK-NEXT:    [[TMP7:%.*]] = icmp ne <2 x i8> [[TMP6]], zeroinitializer
-; CHECK-NEXT:    [[TMP8]] = and <2 x i1> [[TMP7]], zeroinitializer
+; CHECK-NEXT:    [[TMP3:%.*]] = and <2 x i8> zeroinitializer, [[TMP2]]
+; CHECK-NEXT:    [[TMP4:%.*]] = or <2 x i8> [[TMP3]], zeroinitializer
+; CHECK-NEXT:    [[TMP5:%.*]] = icmp ne <2 x i8> [[TMP4]], zeroinitializer
+; CHECK-NEXT:    [[TMP6]] = and <2 x i1> [[TMP5]], zeroinitializer
  ; CHECK-NEXT:    br label [[FOR_BODY]]
  ;
  entry:
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/undef_vect.ll b/llvm/test/Transforms/SLPVectorizer/X86/undef_vect.ll

index 16f0209..431f95d 100644 (file)
--- a/llvm/test/Transforms/SLPVectorizer/X86/undef_vect.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/undef_vect.ll
@@ -11,9 +11,7 @@ define void @_Z2azv() local_unnamed_addr {
  ; CHECK-NEXT:    [[TMP2:%.*]] = call i32 @llvm.vector.reduce.smax.v8i32(<8 x i32> [[TMP1]])
  ; CHECK-NEXT:    [[OP_RDX:%.*]] = icmp sgt i32 [[TMP2]], undef
  ; CHECK-NEXT:    [[OP_RDX1:%.*]] = select i1 [[OP_RDX]], i32 [[TMP2]], i32 undef
-; CHECK-NEXT:    [[OP_RDX2:%.*]] = icmp sgt i32 [[OP_RDX1]], undef
-; CHECK-NEXT:    [[OP_RDX3:%.*]] = select i1 [[OP_RDX2]], i32 [[OP_RDX1]], i32 undef
-; CHECK-NEXT:    [[DOTSROA_SPECULATED_9:%.*]] = select i1 undef, i32 undef, i32 [[OP_RDX3]]
+; CHECK-NEXT:    [[DOTSROA_SPECULATED_9:%.*]] = select i1 undef, i32 undef, i32 [[OP_RDX1]]
  ; CHECK-NEXT:    [[CMP_I1_10:%.*]] = icmp slt i32 [[DOTSROA_SPECULATED_9]], undef
  ; CHECK-NEXT:    ret void
  ;
author	Alexey Bataev <a.bataev@outlook.com>
	Fri, 17 Jun 2022 17:15:23 +0000 (10:15 -0700)
committer	Alexey Bataev <a.bataev@outlook.com>
	Fri, 17 Feb 2023 15:19:35 +0000 (07:19 -0800)
llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp		patch \| blob \| history
llvm/test/Transforms/PhaseOrdering/X86/scalarization-inseltpoison.ll		patch \| blob \| history
llvm/test/Transforms/PhaseOrdering/X86/scalarization.ll		patch \| blob \| history
llvm/test/Transforms/SLPVectorizer/AArch64/buildvector-reduce.ll		patch \| blob \| history
llvm/test/Transforms/SLPVectorizer/X86/PR39774.ll		patch \| blob \| history
llvm/test/Transforms/SLPVectorizer/X86/buildvector_splat_extractvalue.ll		patch \| blob \| history
llvm/test/Transforms/SLPVectorizer/X86/crash_reordering_undefs.ll		patch \| blob \| history
llvm/test/Transforms/SLPVectorizer/X86/extract-scalar-from-undef.ll		patch \| blob \| history
llvm/test/Transforms/SLPVectorizer/X86/float-min-max.ll		patch \| blob \| history
llvm/test/Transforms/SLPVectorizer/X86/gather-extractelements-different-bbs.ll		patch \| blob \| history
llvm/test/Transforms/SLPVectorizer/X86/horizontal-list.ll		patch \| blob \| history
llvm/test/Transforms/SLPVectorizer/X86/malformed_phis.ll		patch \| blob \| history
llvm/test/Transforms/SLPVectorizer/X86/reduced-gathered-vectorized.ll		patch \| blob \| history
llvm/test/Transforms/SLPVectorizer/X86/reduction-value-in-tree.ll		patch \| blob \| history
llvm/test/Transforms/SLPVectorizer/X86/reorder_repeated_ops.ll		patch \| blob \| history
llvm/test/Transforms/SLPVectorizer/X86/revectorized_rdx_crash.ll		patch \| blob \| history
llvm/test/Transforms/SLPVectorizer/X86/scalarization-overhead.ll		patch \| blob \| history
llvm/test/Transforms/SLPVectorizer/X86/slp-schedule-use-order.ll		patch \| blob \| history
llvm/test/Transforms/SLPVectorizer/X86/undef_vect.ll		patch \| blob \| history