bool NoInfs : 1;
bool NoSignedZeros : 1;
bool AllowReciprocal : 1;
- bool VectorReduction : 1;
bool AllowContract : 1;
bool ApproximateFuncs : 1;
bool AllowReassociation : 1;
SDNodeFlags()
: AnyDefined(false), NoUnsignedWrap(false), NoSignedWrap(false),
Exact(false), NoNaNs(false), NoInfs(false),
- NoSignedZeros(false), AllowReciprocal(false), VectorReduction(false),
+ NoSignedZeros(false), AllowReciprocal(false),
AllowContract(false), ApproximateFuncs(false),
AllowReassociation(false), NoFPExcept(false) {}
setDefined();
AllowReciprocal = b;
}
- void setVectorReduction(bool b) {
- setDefined();
- VectorReduction = b;
- }
void setAllowContract(bool b) {
setDefined();
AllowContract = b;
bool hasNoInfs() const { return NoInfs; }
bool hasNoSignedZeros() const { return NoSignedZeros; }
bool hasAllowReciprocal() const { return AllowReciprocal; }
- bool hasVectorReduction() const { return VectorReduction; }
bool hasAllowContract() const { return AllowContract; }
bool hasApproximateFuncs() const { return ApproximateFuncs; }
bool hasAllowReassociation() const { return AllowReassociation; }
NoInfs &= Flags.NoInfs;
NoSignedZeros &= Flags.NoSignedZeros;
AllowReciprocal &= Flags.AllowReciprocal;
- VectorReduction &= Flags.VectorReduction;
AllowContract &= Flags.AllowContract;
ApproximateFuncs &= Flags.ApproximateFuncs;
AllowReassociation &= Flags.AllowReassociation;
if (N0.getOpcode() != Opc)
return SDValue();
- // Don't reassociate reductions.
- if (N0->getFlags().hasVectorReduction())
- return SDValue();
-
if (DAG.isConstantIntBuildVectorOrConstantInt(N0.getOperand(1))) {
if (DAG.isConstantIntBuildVectorOrConstantInt(N1)) {
// Reassociate: (op (op x, c1), c2) -> (op x, (op c1, c2))
SDValue DAGCombiner::reassociateOps(unsigned Opc, const SDLoc &DL, SDValue N0,
SDValue N1, SDNodeFlags Flags) {
assert(TLI.isCommutativeBinOp(Opc) && "Operation not commutative.");
- // Don't reassociate reductions.
- if (Flags.hasVectorReduction())
- return SDValue();
// Floating-point reassociation is not allowed without loose FP math.
if (N0.getValueType().isFloatingPoint() ||
visitBinary(I, ISD::FSUB);
}
-/// Checks if the given instruction performs a vector reduction, in which case
-/// we have the freedom to alter the elements in the result as long as the
-/// reduction of them stays unchanged.
-static bool isVectorReductionOp(const User *I) {
- const Instruction *Inst = dyn_cast<Instruction>(I);
- if (!Inst || !Inst->getType()->isVectorTy())
- return false;
-
- auto OpCode = Inst->getOpcode();
- switch (OpCode) {
- case Instruction::Add:
- case Instruction::Mul:
- case Instruction::And:
- case Instruction::Or:
- case Instruction::Xor:
- break;
- case Instruction::FAdd:
- case Instruction::FMul:
- if (const FPMathOperator *FPOp = dyn_cast<const FPMathOperator>(Inst))
- if (FPOp->getFastMathFlags().isFast())
- break;
- LLVM_FALLTHROUGH;
- default:
- return false;
- }
-
- unsigned ElemNum = Inst->getType()->getVectorNumElements();
- // Ensure the reduction size is a power of 2.
- if (!isPowerOf2_32(ElemNum))
- return false;
-
- unsigned ElemNumToReduce = ElemNum;
-
- // Do DFS search on the def-use chain from the given instruction. We only
- // allow four kinds of operations during the search until we reach the
- // instruction that extracts the first element from the vector:
- //
- // 1. The reduction operation of the same opcode as the given instruction.
- //
- // 2. PHI node.
- //
- // 3. ShuffleVector instruction together with a reduction operation that
- // does a partial reduction.
- //
- // 4. ExtractElement that extracts the first element from the vector, and we
- // stop searching the def-use chain here.
- //
- // 3 & 4 above perform a reduction on all elements of the vector. We push defs
- // from 1-3 to the stack to continue the DFS. The given instruction is not
- // a reduction operation if we meet any other instructions other than those
- // listed above.
-
- SmallVector<const User *, 16> UsersToVisit{Inst};
- SmallPtrSet<const User *, 16> Visited;
- bool ReduxExtracted = false;
-
- while (!UsersToVisit.empty()) {
- auto User = UsersToVisit.back();
- UsersToVisit.pop_back();
- if (!Visited.insert(User).second)
- continue;
-
- for (const auto *U : User->users()) {
- auto Inst = dyn_cast<Instruction>(U);
- if (!Inst)
- return false;
-
- if (Inst->getOpcode() == OpCode || isa<PHINode>(U)) {
- if (const FPMathOperator *FPOp = dyn_cast<const FPMathOperator>(Inst))
- if (!isa<PHINode>(FPOp) && !FPOp->getFastMathFlags().isFast())
- return false;
- UsersToVisit.push_back(U);
- } else if (const ShuffleVectorInst *ShufInst =
- dyn_cast<ShuffleVectorInst>(U)) {
- // Detect the following pattern: A ShuffleVector instruction together
- // with a reduction that do partial reduction on the first and second
- // ElemNumToReduce / 2 elements, and store the result in
- // ElemNumToReduce / 2 elements in another vector.
-
- unsigned ResultElements = ShufInst->getType()->getVectorNumElements();
- if (ResultElements < ElemNum)
- return false;
-
- if (ElemNumToReduce == 1)
- return false;
- if (!isa<UndefValue>(U->getOperand(1)))
- return false;
- for (unsigned i = 0; i < ElemNumToReduce / 2; ++i)
- if (ShufInst->getMaskValue(i) != int(i + ElemNumToReduce / 2))
- return false;
- for (unsigned i = ElemNumToReduce / 2; i < ElemNum; ++i)
- if (ShufInst->getMaskValue(i) != -1)
- return false;
-
- // There is only one user of this ShuffleVector instruction, which
- // must be a reduction operation.
- if (!U->hasOneUse())
- return false;
-
- auto U2 = dyn_cast<Instruction>(*U->user_begin());
- if (!U2 || U2->getOpcode() != OpCode)
- return false;
-
- // Check operands of the reduction operation.
- if ((U2->getOperand(0) == U->getOperand(0) && U2->getOperand(1) == U) ||
- (U2->getOperand(1) == U->getOperand(0) && U2->getOperand(0) == U)) {
- UsersToVisit.push_back(U2);
- ElemNumToReduce /= 2;
- } else
- return false;
- } else if (isa<ExtractElementInst>(U)) {
- // At this moment we should have reduced all elements in the vector.
- if (ElemNumToReduce != 1)
- return false;
-
- const ConstantInt *Val = dyn_cast<ConstantInt>(U->getOperand(1));
- if (!Val || !Val->isZero())
- return false;
-
- ReduxExtracted = true;
- } else
- return false;
- }
- }
- return ReduxExtracted;
-}
-
void SelectionDAGBuilder::visitBinary(const User &I, unsigned Opcode) {
SDNodeFlags Flags;
if (auto *ExactOp = dyn_cast<PossiblyExactOperator>(&I)) {
Flags.setExact(ExactOp->isExact());
}
- if (isVectorReductionOp(&I)) {
- Flags.setVectorReduction(true);
- LLVM_DEBUG(dbgs() << "Detected a reduction operation:" << I << "\n");
-
- // If no flags are set we will propagate the incoming flags, if any flags
- // are set, we will intersect them with the incoming flag and so we need to
- // copy the FMF flags here.
- if (auto *FPOp = dyn_cast<FPMathOperator>(&I)) {
- Flags.copyFMF(*FPOp);
- }
- }
SDValue Op1 = getValue(I.getOperand(0));
SDValue Op2 = getValue(I.getOperand(1));
if (getFlags().hasAllowReassociation())
OS << " reassoc";
- if (getFlags().hasVectorReduction())
- OS << " vector-reduction";
-
if (getFlags().hasNoFPExcept())
OS << " nofpexcept";
X86MacroFusion.cpp
X86OptimizeLEAs.cpp
X86PadShortFunction.cpp
+ X86PartialReduction.cpp
X86RegisterBankInfo.cpp
X86RegisterInfo.cpp
X86RetpolineThunks.cpp
/// fp exceptions when strict-fp enabled.
FunctionPass *createX86InsertX87waitPass();
+/// This pass optimizes arithmetic whose result is only used by a reduction
+/// sequence, and which is therefore safe to reassociate in interesting ways.
+FunctionPass *createX86PartialReductionPass();
+
InstructionSelector *createX86InstructionSelector(const X86TargetMachine &TM,
X86Subtarget &,
X86RegisterBankInfo &);
void initializeX86ExpandPseudoPass(PassRegistry &);
void initializeX86FlagsCopyLoweringPassPass(PassRegistry &);
void initializeX86OptimizeLEAPassPass(PassRegistry &);
+void initializeX86PartialReductionPass(PassRegistry &);
void initializeX86SpeculativeLoadHardeningPassPass(PassRegistry &);
namespace X86AS {
DAG.getConstant(0, DL, VT), Cmp1.getValue(1));
}
-static SDValue combineLoopMAddPattern(SDNode *N, SelectionDAG &DAG,
- const X86Subtarget &Subtarget) {
- if (!Subtarget.hasSSE2())
- return SDValue();
-
- EVT VT = N->getValueType(0);
-
- // If the vector size is less than 128, or greater than the supported RegSize,
- // do not use PMADD.
- if (!VT.isVector() || VT.getVectorNumElements() < 8)
- return SDValue();
-
- SDValue Op0 = N->getOperand(0);
- SDValue Op1 = N->getOperand(1);
-
- auto UsePMADDWD = [&](SDValue Op) {
- ShrinkMode Mode;
- return Op.getOpcode() == ISD::MUL &&
- canReduceVMulWidth(Op.getNode(), DAG, Mode) &&
- Mode != ShrinkMode::MULU16 &&
- (!Subtarget.hasSSE41() ||
- (Op->isOnlyUserOf(Op.getOperand(0).getNode()) &&
- Op->isOnlyUserOf(Op.getOperand(1).getNode())));
- };
-
- SDValue MulOp, OtherOp;
- if (UsePMADDWD(Op0)) {
- MulOp = Op0;
- OtherOp = Op1;
- } else if (UsePMADDWD(Op1)) {
- MulOp = Op1;
- OtherOp = Op0;
- } else
- return SDValue();
-
- SDLoc DL(N);
- EVT ReducedVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16,
- VT.getVectorNumElements());
- EVT MAddVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
- VT.getVectorNumElements() / 2);
-
- // Shrink the operands of mul.
- SDValue N0 = DAG.getNode(ISD::TRUNCATE, DL, ReducedVT, MulOp->getOperand(0));
- SDValue N1 = DAG.getNode(ISD::TRUNCATE, DL, ReducedVT, MulOp->getOperand(1));
-
- // Madd vector size is half of the original vector size
- auto PMADDWDBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
- ArrayRef<SDValue> Ops) {
- MVT OpVT = MVT::getVectorVT(MVT::i32, Ops[0].getValueSizeInBits() / 32);
- return DAG.getNode(X86ISD::VPMADDWD, DL, OpVT, Ops);
- };
- SDValue Madd = SplitOpsAndApply(DAG, Subtarget, DL, MAddVT, { N0, N1 },
- PMADDWDBuilder);
- // Fill the rest of the output with 0
- SDValue Zero = DAG.getConstant(0, DL, Madd.getSimpleValueType());
- SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Madd, Zero);
-
- // Preserve the reduction flag on the ADD. We may need to revisit for the
- // other operand.
- SDNodeFlags Flags;
- Flags.setVectorReduction(true);
- return DAG.getNode(ISD::ADD, DL, VT, Concat, OtherOp, Flags);
-}
-
-static SDValue combineLoopSADPattern(SDNode *N, SelectionDAG &DAG,
- const X86Subtarget &Subtarget) {
- if (!Subtarget.hasSSE2())
- return SDValue();
-
- SDLoc DL(N);
- EVT VT = N->getValueType(0);
-
- // TODO: There's nothing special about i32, any integer type above i16 should
- // work just as well.
- if (!VT.isVector() || !isPowerOf2_32(VT.getVectorNumElements()) ||
- VT.getVectorElementType() != MVT::i32)
- return SDValue();
-
- // We know N is a reduction add. To match SAD, we need one of the operands to
- // be an ABS.
- SDValue AbsOp = N->getOperand(0);
- SDValue OtherOp = N->getOperand(1);
- if (AbsOp.getOpcode() != ISD::ABS)
- std::swap(AbsOp, OtherOp);
- if (AbsOp.getOpcode() != ISD::ABS)
- return SDValue();
-
- // Check whether we have an abs-diff pattern feeding into the select.
- SDValue SadOp0, SadOp1;
- if(!detectZextAbsDiff(AbsOp, SadOp0, SadOp1))
- return SDValue();
-
- // SAD pattern detected. Now build a SAD instruction and an addition for
- // reduction. Note that the number of elements of the result of SAD is less
- // than the number of elements of its input. Therefore, we could only update
- // part of elements in the reduction vector.
- SDValue Sad = createPSADBW(DAG, SadOp0, SadOp1, DL, Subtarget);
-
- // The output of PSADBW is a vector of i64.
- // We need to turn the vector of i64 into a vector of i32.
- // If the reduction vector is at least as wide as the psadbw result, just
- // bitcast. If it's narrower which can only occur for v2i32, bits 127:16 of
- // the PSADBW will be zero.
- MVT ResVT = MVT::getVectorVT(MVT::i32, Sad.getValueSizeInBits() / 32);
- Sad = DAG.getNode(ISD::BITCAST, DL, ResVT, Sad);
-
- if (VT.getSizeInBits() > ResVT.getSizeInBits()) {
- // Fill the upper elements with zero to match the add width.
- assert(VT.getSizeInBits() % ResVT.getSizeInBits() == 0 && "Unexpected VTs");
- unsigned NumConcats = VT.getSizeInBits() / ResVT.getSizeInBits();
- SmallVector<SDValue, 4> Ops(NumConcats, DAG.getConstant(0, DL, ResVT));
- Ops[0] = Sad;
- Sad = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Ops);
- } else if (VT.getSizeInBits() < ResVT.getSizeInBits()) {
- Sad = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Sad,
- DAG.getIntPtrConstant(0, DL));
- }
-
- // Preserve the reduction flag on the ADD. We may need to revisit for the
- // other operand.
- SDNodeFlags Flags;
- Flags.setVectorReduction(true);
- return DAG.getNode(ISD::ADD, DL, VT, Sad, OtherOp, Flags);
-}
-
static SDValue matchPMADDWD(SelectionDAG &DAG, SDValue Op0, SDValue Op1,
const SDLoc &DL, EVT VT,
const X86Subtarget &Subtarget) {
Mode == ShrinkMode::MULU16)
return SDValue();
+ EVT TruncVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16,
+ VT.getVectorNumElements() * 2);
+ SDValue N0 = DAG.getNode(ISD::TRUNCATE, DL, TruncVT, Mul.getOperand(0));
+ SDValue N1 = DAG.getNode(ISD::TRUNCATE, DL, TruncVT, Mul.getOperand(1));
+
auto PMADDBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
ArrayRef<SDValue> Ops) {
- // Shrink by adding truncate nodes and let DAGCombine fold with the
- // sources.
EVT InVT = Ops[0].getValueType();
- assert(InVT.getScalarType() == MVT::i32 &&
- "Unexpected scalar element type");
assert(InVT == Ops[1].getValueType() && "Operands' types mismatch");
EVT ResVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
InVT.getVectorNumElements() / 2);
- EVT TruncVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16,
- InVT.getVectorNumElements());
- return DAG.getNode(X86ISD::VPMADDWD, DL, ResVT,
- DAG.getNode(ISD::TRUNCATE, DL, TruncVT, Ops[0]),
- DAG.getNode(ISD::TRUNCATE, DL, TruncVT, Ops[1]));
+ return DAG.getNode(X86ISD::VPMADDWD, DL, ResVT, Ops[0], Ops[1]);
};
- return SplitOpsAndApply(DAG, Subtarget, DL, VT,
- { Mul.getOperand(0), Mul.getOperand(1) },
- PMADDBuilder);
+ return SplitOpsAndApply(DAG, Subtarget, DL, VT, { N0, N1 }, PMADDBuilder);
}
// Attempt to turn this pattern into PMADDWD.
-// (mul (add (sext (build_vector)), (sext (build_vector))),
-// (add (sext (build_vector)), (sext (build_vector)))
+// (add (mul (sext (build_vector)), (sext (build_vector))),
+// (mul (sext (build_vector)), (sext (build_vector)))
static SDValue matchPMADDWD_2(SelectionDAG &DAG, SDValue N0, SDValue N1,
const SDLoc &DL, EVT VT,
const X86Subtarget &Subtarget) {
static SDValue combineAdd(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
const X86Subtarget &Subtarget) {
- const SDNodeFlags Flags = N->getFlags();
- if (Flags.hasVectorReduction()) {
- if (SDValue Sad = combineLoopSADPattern(N, DAG, Subtarget))
- return Sad;
- if (SDValue MAdd = combineLoopMAddPattern(N, DAG, Subtarget))
- return MAdd;
- }
EVT VT = N->getValueType(0);
SDValue Op0 = N->getOperand(0);
SDValue Op1 = N->getOperand(1);
--- /dev/null
+//===-- X86PartialReduction.cpp -------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass looks for add instructions used by a horizontal reduction to see
+// if we might be able to use pmaddwd or psadbw. Some cases of this require
+// cross basic block knowledge and can't be done in SelectionDAG.
+//
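+// A minimal sketch of the kind of IR this pass rewrites (names and types
+// are illustrative):
+//
+//   %mul = mul <8 x i32> %a, %b          ; operands sign extended from i16
+//   %sum = add <8 x i32> %mul, %vec.phi  ; only used by a reduction tree
+//   ...
+//   %res = extractelement <4 x i32> %rdx, i32 0
+//
+// Since only the final scalar matters, the lanes of %mul may be
+// reassociated, e.g. split into even/odd lane sums that SelectionDAG can
+// match to pmaddwd.
+//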
+//===----------------------------------------------------------------------===//
+
+#include "X86.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/CodeGen/TargetPassConfig.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicsX86.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/Operator.h"
+#include "llvm/Pass.h"
+#include "X86TargetMachine.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "x86-partial-reduction"
+
+namespace {
+
+class X86PartialReduction : public FunctionPass {
+ const DataLayout *DL;
+ const X86Subtarget *ST;
+
+public:
+ static char ID; // Pass identification, replacement for typeid.
+
+ X86PartialReduction() : FunctionPass(ID) { }
+
+ bool runOnFunction(Function &Fn) override;
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesCFG();
+ }
+
+ StringRef getPassName() const override {
+ return "X86 Partial Reduction";
+ }
+
+private:
+ bool tryMAddPattern(BinaryOperator *BO);
+ bool tryMAddReplacement(Value *Op, BinaryOperator *Add);
+
+ bool trySADPattern(BinaryOperator *BO);
+ bool trySADReplacement(Value *Op, BinaryOperator *Add);
+};
+} // end anonymous namespace
+
+FunctionPass *llvm::createX86PartialReductionPass() {
+ return new X86PartialReduction();
+}
+
+char X86PartialReduction::ID = 0;
+
+INITIALIZE_PASS(X86PartialReduction, DEBUG_TYPE,
+ "X86 Partial Reduction", false, false)
+
+static bool isVectorReductionOp(const BinaryOperator &BO) {
+ if (!BO.getType()->isVectorTy())
+ return false;
+
+ unsigned Opcode = BO.getOpcode();
+
+ switch (Opcode) {
+ case Instruction::Add:
+ case Instruction::Mul:
+ case Instruction::And:
+ case Instruction::Or:
+ case Instruction::Xor:
+ break;
+ case Instruction::FAdd:
+ case Instruction::FMul:
+ if (auto *FPOp = dyn_cast<FPMathOperator>(&BO))
+ if (FPOp->getFastMathFlags().isFast())
+ break;
+ LLVM_FALLTHROUGH;
+ default:
+ return false;
+ }
+
+ unsigned ElemNum = BO.getType()->getVectorNumElements();
+ // Ensure the reduction size is a power of 2.
+ if (!isPowerOf2_32(ElemNum))
+ return false;
+
+ unsigned ElemNumToReduce = ElemNum;
+
+ // Do DFS search on the def-use chain from the given instruction. We only
+ // allow four kinds of operations during the search until we reach the
+ // instruction that extracts the first element from the vector:
+ //
+ // 1. The reduction operation of the same opcode as the given instruction.
+ //
+ // 2. PHI node.
+ //
+ // 3. ShuffleVector instruction together with a reduction operation that
+ // does a partial reduction.
+ //
+ // 4. ExtractElement that extracts the first element from the vector, and we
+ // stop searching the def-use chain here.
+ //
+ // 3 & 4 above perform a reduction on all elements of the vector. We push defs
+ // from 1-3 to the stack to continue the DFS. The given instruction is not
+ // a reduction operation if we meet any other instructions other than those
+ // listed above.
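+  //
+  // For example (a hypothetical <4 x i32> case), the def-use chain
+  //   BO -> shufflevector <2,3,undef,undef> / add   (reduces 4 -> 2 lanes)
+  //      -> shufflevector <1,undef,undef,undef> / add (reduces 2 -> 1 lane)
+  //      -> extractelement i32 0
+  // matches rules 3 and 4 above and is accepted as a reduction.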
+
+ SmallVector<const User *, 16> UsersToVisit{&BO};
+ SmallPtrSet<const User *, 16> Visited;
+ bool ReduxExtracted = false;
+
+ while (!UsersToVisit.empty()) {
+ auto User = UsersToVisit.back();
+ UsersToVisit.pop_back();
+ if (!Visited.insert(User).second)
+ continue;
+
+ for (const auto *U : User->users()) {
+ auto *Inst = dyn_cast<Instruction>(U);
+ if (!Inst)
+ return false;
+
+ if (Inst->getOpcode() == Opcode || isa<PHINode>(U)) {
+ if (auto *FPOp = dyn_cast<FPMathOperator>(Inst))
+ if (!isa<PHINode>(FPOp) && !FPOp->getFastMathFlags().isFast())
+ return false;
+ UsersToVisit.push_back(U);
+ } else if (auto *ShufInst = dyn_cast<ShuffleVectorInst>(U)) {
+        // Detect the following pattern: a ShuffleVector instruction together
+        // with a reduction that does a partial reduction on the first and
+        // second ElemNumToReduce / 2 elements, and stores the result in
+        // ElemNumToReduce / 2 elements of another vector.
+
+ unsigned ResultElements = ShufInst->getType()->getVectorNumElements();
+ if (ResultElements < ElemNum)
+ return false;
+
+ if (ElemNumToReduce == 1)
+ return false;
+ if (!isa<UndefValue>(U->getOperand(1)))
+ return false;
+ for (unsigned i = 0; i < ElemNumToReduce / 2; ++i)
+ if (ShufInst->getMaskValue(i) != int(i + ElemNumToReduce / 2))
+ return false;
+ for (unsigned i = ElemNumToReduce / 2; i < ElemNum; ++i)
+ if (ShufInst->getMaskValue(i) != -1)
+ return false;
+
+ // There is only one user of this ShuffleVector instruction, which
+ // must be a reduction operation.
+ if (!U->hasOneUse())
+ return false;
+
+ auto *U2 = dyn_cast<BinaryOperator>(*U->user_begin());
+ if (!U2 || U2->getOpcode() != Opcode)
+ return false;
+
+ // Check operands of the reduction operation.
+ if ((U2->getOperand(0) == U->getOperand(0) && U2->getOperand(1) == U) ||
+ (U2->getOperand(1) == U->getOperand(0) && U2->getOperand(0) == U)) {
+ UsersToVisit.push_back(U2);
+ ElemNumToReduce /= 2;
+ } else
+ return false;
+ } else if (isa<ExtractElementInst>(U)) {
+ // At this moment we should have reduced all elements in the vector.
+ if (ElemNumToReduce != 1)
+ return false;
+
+ auto *Val = dyn_cast<ConstantInt>(U->getOperand(1));
+ if (!Val || !Val->isZero())
+ return false;
+
+ ReduxExtracted = true;
+ } else
+ return false;
+ }
+ }
+ return ReduxExtracted;
+}
+
+bool X86PartialReduction::tryMAddReplacement(Value *Op, BinaryOperator *Add) {
+ BasicBlock *BB = Add->getParent();
+
+ auto *BO = dyn_cast<BinaryOperator>(Op);
+ if (!BO || BO->getOpcode() != Instruction::Mul || !BO->hasOneUse() ||
+ BO->getParent() != BB)
+ return false;
+
+ Value *LHS = BO->getOperand(0);
+ Value *RHS = BO->getOperand(1);
+
+  // LHS and RHS should only be used once, or twice if they are the same
+  // value. Only check this when SSE4.1 is enabled and we have zext/sext
+  // instructions, otherwise we use punpck to emulate zero extend in stages.
+  // The trunc we need to do likely won't introduce new instructions in that
+  // case.
+ if (ST->hasSSE41()) {
+ if (LHS == RHS) {
+ if (!isa<Constant>(LHS) && !LHS->hasNUses(2))
+ return false;
+ } else {
+ if (!isa<Constant>(LHS) && !LHS->hasOneUse())
+ return false;
+ if (!isa<Constant>(RHS) && !RHS->hasOneUse())
+ return false;
+ }
+ }
+
+ auto canShrinkOp = [&](Value *Op) {
+ if (isa<Constant>(Op) && ComputeNumSignBits(Op, *DL, 0, nullptr, BO) > 16)
+ return true;
+ if (auto *Cast = dyn_cast<CastInst>(Op)) {
+ if (Cast->getParent() == BB &&
+ (Cast->getOpcode() == Instruction::SExt ||
+ Cast->getOpcode() == Instruction::ZExt) &&
+ ComputeNumSignBits(Op, *DL, 0, nullptr, BO) > 16)
+ return true;
+ }
+
+ return false;
+ };
+
+ // Both Ops need to be shrinkable.
+ if (!canShrinkOp(LHS) && !canShrinkOp(RHS))
+ return false;
+
+ IRBuilder<> Builder(Add);
+
+ Type *MulTy = Op->getType();
+ unsigned NumElts = MulTy->getVectorNumElements();
+
+ // Extract even elements and odd elements and add them together. This will
+ // be pattern matched by SelectionDAG to pmaddwd. This instruction will be
+ // half the original width.
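+  // E.g. with NumElts == 8, EvenMask = <0,2,4,6> and OddMask = <1,3,5,7>;
+  // adding the two shuffles sums adjacent lane pairs, which is the pmaddwd
+  // shape once the inputs have been narrowed to i16.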
+ SmallVector<uint32_t, 16> EvenMask(NumElts / 2);
+ SmallVector<uint32_t, 16> OddMask(NumElts / 2);
+ for (int i = 0, e = NumElts / 2; i != e; ++i) {
+ EvenMask[i] = i * 2;
+ OddMask[i] = i * 2 + 1;
+ }
+ Value *EvenElts = Builder.CreateShuffleVector(BO, BO, EvenMask);
+ Value *OddElts = Builder.CreateShuffleVector(BO, BO, OddMask);
+ Value *MAdd = Builder.CreateAdd(EvenElts, OddElts);
+
+ // Concatenate zeroes to extend back to the original type.
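+  // E.g. with NumElts == 8, MAdd is <4 x i32>, so ConcatMask = <0,..,7>
+  // selects MAdd's four lanes followed by four zero lanes.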
+ SmallVector<uint32_t, 32> ConcatMask(NumElts);
+ std::iota(ConcatMask.begin(), ConcatMask.end(), 0);
+ Value *Zero = Constant::getNullValue(MAdd->getType());
+ Value *Concat = Builder.CreateShuffleVector(MAdd, Zero, ConcatMask);
+
+  // Replace the use of the mul in the original Add with the pmaddwd and zeroes.
+ Add->replaceUsesOfWith(BO, Concat);
+ Add->setHasNoSignedWrap(false);
+ Add->setHasNoUnsignedWrap(false);
+
+ return true;
+}
+
+// Try to replace the operands of this add with pmaddwd patterns.
+bool X86PartialReduction::tryMAddPattern(BinaryOperator *BO) {
+ if (!ST->hasSSE2())
+ return false;
+
+ // Need at least 8 elements.
+ if (BO->getType()->getVectorNumElements() < 8)
+ return false;
+
+ // Element type should be i32.
+ if (!BO->getType()->getVectorElementType()->isIntegerTy(32))
+ return false;
+
+ bool Changed = false;
+ Changed |= tryMAddReplacement(BO->getOperand(0), BO);
+ Changed |= tryMAddReplacement(BO->getOperand(1), BO);
+ return Changed;
+}
+
+bool X86PartialReduction::trySADReplacement(Value *Op, BinaryOperator *Add) {
+ // Operand should be a select.
+ auto *SI = dyn_cast<SelectInst>(Op);
+ if (!SI)
+ return false;
+
+ // Select needs to implement absolute value.
+ Value *LHS, *RHS;
+ auto SPR = matchSelectPattern(SI, LHS, RHS);
+ if (SPR.Flavor != SPF_ABS)
+ return false;
+
+ // Need a subtract of two values.
+ auto *Sub = dyn_cast<BinaryOperator>(LHS);
+ if (!Sub || Sub->getOpcode() != Instruction::Sub)
+ return false;
+
+ // Look for zero extend from i8.
+ auto getZeroExtendedVal = [](Value *Op) -> Value * {
+ if (auto *ZExt = dyn_cast<ZExtInst>(Op))
+ if (ZExt->getOperand(0)->getType()->getVectorElementType()->isIntegerTy(8))
+ return ZExt->getOperand(0);
+
+ return nullptr;
+ };
+
+ // Both operands of the subtract should be extends from vXi8.
+ Value *Op0 = getZeroExtendedVal(Sub->getOperand(0));
+ Value *Op1 = getZeroExtendedVal(Sub->getOperand(1));
+ if (!Op0 || !Op1)
+ return false;
+
+ IRBuilder<> Builder(Add);
+
+ Type *OpTy = Op->getType();
+ unsigned NumElts = OpTy->getVectorNumElements();
+
+ unsigned IntrinsicNumElts;
+ Intrinsic::ID IID;
+ if (ST->hasBWI() && NumElts >= 64) {
+ IID = Intrinsic::x86_avx512_psad_bw_512;
+ IntrinsicNumElts = 64;
+ } else if (ST->hasAVX2() && NumElts >= 32) {
+ IID = Intrinsic::x86_avx2_psad_bw;
+ IntrinsicNumElts = 32;
+ } else {
+ IID = Intrinsic::x86_sse2_psad_bw;
+ IntrinsicNumElts = 16;
+ }
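+  // E.g. a v32i8 input selects the 256-bit psadbw on AVX2, while a v64i8
+  // input without AVX512BW is split into two 256-bit calls below.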
+
+ Function *PSADBWFn = Intrinsic::getDeclaration(Add->getModule(), IID);
+
+ if (NumElts < 16) {
+ // Pad input with zeroes.
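+    // E.g. with NumElts == 4 the mask is <0,1,2,3,4,5,6,7,4,5,6,7,4,5,6,7>,
+    // i.e. the real lanes followed by (repeated) zero lanes from the second
+    // shuffle operand.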
+ SmallVector<uint32_t, 32> ConcatMask(16);
+ for (unsigned i = 0; i != NumElts; ++i)
+ ConcatMask[i] = i;
+ for (unsigned i = NumElts; i != 16; ++i)
+ ConcatMask[i] = (i % NumElts) + NumElts;
+
+ Value *Zero = Constant::getNullValue(Op0->getType());
+ Op0 = Builder.CreateShuffleVector(Op0, Zero, ConcatMask);
+ Op1 = Builder.CreateShuffleVector(Op1, Zero, ConcatMask);
+ NumElts = 16;
+ }
+
+  // The intrinsics produce vXi64 and need to be cast to vXi32.
+ Type *I32Ty = VectorType::get(Builder.getInt32Ty(), IntrinsicNumElts / 4);
+
+ assert(NumElts % IntrinsicNumElts == 0 && "Unexpected number of elements!");
+ unsigned NumSplits = NumElts / IntrinsicNumElts;
+
+ // First collect the pieces we need.
+ SmallVector<Value *, 4> Ops(NumSplits);
+ for (unsigned i = 0; i != NumSplits; ++i) {
+ SmallVector<uint32_t, 64> ExtractMask(IntrinsicNumElts);
+ std::iota(ExtractMask.begin(), ExtractMask.end(), i * IntrinsicNumElts);
+ Value *ExtractOp0 = Builder.CreateShuffleVector(Op0, Op0, ExtractMask);
+    Value *ExtractOp1 = Builder.CreateShuffleVector(Op1, Op1, ExtractMask);
+ Ops[i] = Builder.CreateCall(PSADBWFn, {ExtractOp0, ExtractOp1});
+ Ops[i] = Builder.CreateBitCast(Ops[i], I32Ty);
+ }
+
+ assert(isPowerOf2_32(NumSplits) && "Expected power of 2 splits");
+ unsigned Stages = Log2_32(NumSplits);
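+  // Pairwise concatenate the per-split results back into one wide vector,
+  // e.g. with NumSplits == 4: stage 1 joins pieces (0,1) and (2,3), stage 2
+  // joins the two halves.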
+ for (unsigned s = Stages; s > 0; --s) {
+ unsigned NumConcatElts = Ops[0]->getType()->getVectorNumElements() * 2;
+ for (unsigned i = 0; i != 1 << (s - 1); ++i) {
+ SmallVector<uint32_t, 64> ConcatMask(NumConcatElts);
+ std::iota(ConcatMask.begin(), ConcatMask.end(), 0);
+ Ops[i] = Builder.CreateShuffleVector(Ops[i*2], Ops[i*2+1], ConcatMask);
+ }
+ }
+
+  // At this point the final value should be in Ops[0]. Now we need to adjust
+  // it back to the original type.
+ NumElts = OpTy->getVectorNumElements();
+ if (NumElts == 2) {
+ // Extract down to 2 elements.
+ Ops[0] = Builder.CreateShuffleVector(Ops[0], Ops[0], {0, 1});
+ } else if (NumElts >= 8) {
+ SmallVector<uint32_t, 32> ConcatMask(NumElts);
+ unsigned SubElts = Ops[0]->getType()->getVectorNumElements();
+ for (unsigned i = 0; i != SubElts; ++i)
+ ConcatMask[i] = i;
+ for (unsigned i = SubElts; i != NumElts; ++i)
+ ConcatMask[i] = (i % SubElts) + SubElts;
+
+ Value *Zero = Constant::getNullValue(Ops[0]->getType());
+ Ops[0] = Builder.CreateShuffleVector(Ops[0], Zero, ConcatMask);
+ }
+
+  // Replace the uses of Op in Add with the new sequence.
+ Add->replaceUsesOfWith(Op, Ops[0]);
+ Add->setHasNoSignedWrap(false);
+ Add->setHasNoUnsignedWrap(false);
+
+  return true;
+}
+
+bool X86PartialReduction::trySADPattern(BinaryOperator *BO) {
+ if (!ST->hasSSE2())
+ return false;
+
+ // TODO: There's nothing special about i32, any integer type above i16 should
+ // work just as well.
+ if (!BO->getType()->getVectorElementType()->isIntegerTy(32))
+ return false;
+
+ bool Changed = false;
+ Changed |= trySADReplacement(BO->getOperand(0), BO);
+ Changed |= trySADReplacement(BO->getOperand(1), BO);
+ return Changed;
+}
+
+bool X86PartialReduction::runOnFunction(Function &F) {
+ if (skipFunction(F))
+ return false;
+
+ auto *TPC = getAnalysisIfAvailable<TargetPassConfig>();
+ if (!TPC)
+ return false;
+
+ auto &TM = TPC->getTM<X86TargetMachine>();
+ ST = TM.getSubtargetImpl(F);
+
+ DL = &F.getParent()->getDataLayout();
+
+ bool MadeChange = false;
+ for (auto &BB : F) {
+ for (auto &I : BB) {
+ auto *BO = dyn_cast<BinaryOperator>(&I);
+ if (!BO)
+ continue;
+
+ if (!isVectorReductionOp(*BO))
+ continue;
+
+ if (BO->getOpcode() == Instruction::Add) {
+ if (tryMAddPattern(BO)) {
+ MadeChange = true;
+ continue;
+ }
+ if (trySADPattern(BO)) {
+ MadeChange = true;
+ continue;
+ }
+ }
+ }
+ }
+
+ return MadeChange;
+}
initializeX86FlagsCopyLoweringPassPass(PR);
initializeX86CondBrFoldingPassPass(PR);
initializeX86OptimizeLEAPassPass(PR);
+ initializeX86PartialReductionPass(PR);
}
static std::unique_ptr<TargetLoweringObjectFile> createTLOF(const Triple &TT) {
TargetPassConfig::addIRPasses();
- if (TM->getOptLevel() != CodeGenOpt::None)
+ if (TM->getOptLevel() != CodeGenOpt::None) {
addPass(createInterleavedAccessPass());
+ addPass(createX86PartialReductionPass());
+ }
// Add passes that handle indirect branch removal and insertion of a retpoline
// thunk. These will be a no-op unless a function subtarget has the retpoline
+++ /dev/null
-; RUN: llc < %s -debug-only=isel -o /dev/null 2>&1 | FileCheck %s
-; REQUIRES: asserts
-
-@a = global [1024 x i32] zeroinitializer, align 16
-
-define i32 @reduce_add() {
-; CHECK-LABEL: reduce_add
-; CHECK: Detected a reduction operation: {{.*}} add
-; CHECK: Detected a reduction operation: {{.*}} add
-; CHECK: Detected a reduction operation: {{.*}} add
-; CHECK: Detected a reduction operation: {{.*}} add
-; CHECK: Detected a reduction operation: {{.*}} add
-; CHECK: Detected a reduction operation: {{.*}} add
-; CHECK: Detected a reduction operation: {{.*}} add
-; CHECK: Detected a reduction operation: {{.*}} add
-; CHECK: Detected a reduction operation: {{.*}} add
-; CHECK: Detected a reduction operation: {{.*}} add
-; CHECK: Detected a reduction operation: {{.*}} add
-
-min.iters.checked:
- br label %vector.body
-
-vector.body:
- %index = phi i64 [ 0, %min.iters.checked ], [ %index.next.4, %vector.body ]
- %vec.phi = phi <4 x i32> [ zeroinitializer, %min.iters.checked ], [ %28, %vector.body ]
- %vec.phi4 = phi <4 x i32> [ zeroinitializer, %min.iters.checked ], [ %29, %vector.body ]
- %0 = getelementptr inbounds [1024 x i32], [1024 x i32]* @a, i64 0, i64 %index
- %1 = bitcast i32* %0 to <4 x i32>*
- %wide.load = load <4 x i32>, <4 x i32>* %1, align 16
- %2 = getelementptr i32, i32* %0, i64 4
- %3 = bitcast i32* %2 to <4 x i32>*
- %wide.load5 = load <4 x i32>, <4 x i32>* %3, align 16
- %4 = add nsw <4 x i32> %wide.load, %vec.phi
- %5 = add nsw <4 x i32> %wide.load5, %vec.phi4
- %index.next = add nuw nsw i64 %index, 8
- %6 = getelementptr inbounds [1024 x i32], [1024 x i32]* @a, i64 0, i64 %index.next
- %7 = bitcast i32* %6 to <4 x i32>*
- %wide.load.1 = load <4 x i32>, <4 x i32>* %7, align 16
- %8 = getelementptr i32, i32* %6, i64 4
- %9 = bitcast i32* %8 to <4 x i32>*
- %wide.load5.1 = load <4 x i32>, <4 x i32>* %9, align 16
- %10 = add nsw <4 x i32> %wide.load.1, %4
- %11 = add nsw <4 x i32> %wide.load5.1, %5
- %index.next.1 = add nsw i64 %index, 16
- %12 = getelementptr inbounds [1024 x i32], [1024 x i32]* @a, i64 0, i64 %index.next.1
- %13 = bitcast i32* %12 to <4 x i32>*
- %wide.load.2 = load <4 x i32>, <4 x i32>* %13, align 16
- %14 = getelementptr i32, i32* %12, i64 4
- %15 = bitcast i32* %14 to <4 x i32>*
- %wide.load5.2 = load <4 x i32>, <4 x i32>* %15, align 16
- %16 = add nsw <4 x i32> %wide.load.2, %10
- %17 = add nsw <4 x i32> %wide.load5.2, %11
- %index.next.2 = add nsw i64 %index, 24
- %18 = getelementptr inbounds [1024 x i32], [1024 x i32]* @a, i64 0, i64 %index.next.2
- %19 = bitcast i32* %18 to <4 x i32>*
- %wide.load.3 = load <4 x i32>, <4 x i32>* %19, align 16
- %20 = getelementptr i32, i32* %18, i64 4
- %21 = bitcast i32* %20 to <4 x i32>*
- %wide.load5.3 = load <4 x i32>, <4 x i32>* %21, align 16
- %22 = add nsw <4 x i32> %wide.load.3, %16
- %23 = add nsw <4 x i32> %wide.load5.3, %17
- %index.next.3 = add nsw i64 %index, 32
- %24 = getelementptr inbounds [1024 x i32], [1024 x i32]* @a, i64 0, i64 %index.next.3
- %25 = bitcast i32* %24 to <4 x i32>*
- %wide.load.4 = load <4 x i32>, <4 x i32>* %25, align 16
- %26 = getelementptr i32, i32* %24, i64 4
- %27 = bitcast i32* %26 to <4 x i32>*
- %wide.load5.4 = load <4 x i32>, <4 x i32>* %27, align 16
- %28 = add nsw <4 x i32> %wide.load.4, %22
- %29 = add nsw <4 x i32> %wide.load5.4, %23
- %index.next.4 = add nsw i64 %index, 40
- %30 = icmp eq i64 %index.next.4, 1000
- br i1 %30, label %middle.block, label %vector.body
-
-middle.block:
- %.lcssa10 = phi <4 x i32> [ %29, %vector.body ]
- %.lcssa = phi <4 x i32> [ %28, %vector.body ]
- %bin.rdx = add <4 x i32> %.lcssa10, %.lcssa
- %rdx.shuf = shufflevector <4 x i32> %bin.rdx, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
- %bin.rdx6 = add <4 x i32> %bin.rdx, %rdx.shuf
- %rdx.shuf7 = shufflevector <4 x i32> %bin.rdx6, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
- %bin.rdx8 = add <4 x i32> %bin.rdx6, %rdx.shuf7
- %31 = extractelement <4 x i32> %bin.rdx8, i32 0
- ret i32 %31
-}
-
-define i32 @reduce_and() {
-; CHECK-LABEL: reduce_and
-; CHECK: Detected a reduction operation: {{.*}} and
-; CHECK: Detected a reduction operation: {{.*}} and
-; CHECK: Detected a reduction operation: {{.*}} and
-; CHECK: Detected a reduction operation: {{.*}} and
-; CHECK: Detected a reduction operation: {{.*}} and
-; CHECK: Detected a reduction operation: {{.*}} and
-; CHECK: Detected a reduction operation: {{.*}} and
-; CHECK: Detected a reduction operation: {{.*}} and
-; CHECK: Detected a reduction operation: {{.*}} and
-
-entry:
- br label %vector.body
-
-vector.body:
- %lsr.iv = phi i64 [ %lsr.iv.next, %vector.body ], [ -4096, %entry ]
- %vec.phi = phi <4 x i32> [ <i32 -1, i32 -1, i32 -1, i32 -1>, %entry ], [ %6, %vector.body ]
- %vec.phi9 = phi <4 x i32> [ <i32 -1, i32 -1, i32 -1, i32 -1>, %entry ], [ %7, %vector.body ]
- %uglygep33 = getelementptr i8, i8* bitcast ([1024 x i32]* @a to i8*), i64 %lsr.iv
- %uglygep3334 = bitcast i8* %uglygep33 to <4 x i32>*
- %scevgep35 = getelementptr <4 x i32>, <4 x i32>* %uglygep3334, i64 256
- %wide.load = load <4 x i32>, <4 x i32>* %scevgep35, align 16
- %scevgep36 = getelementptr <4 x i32>, <4 x i32>* %uglygep3334, i64 257
- %wide.load10 = load <4 x i32>, <4 x i32>* %scevgep36, align 16
- %0 = and <4 x i32> %wide.load, %vec.phi
- %1 = and <4 x i32> %wide.load10, %vec.phi9
- %uglygep30 = getelementptr i8, i8* bitcast ([1024 x i32]* @a to i8*), i64 %lsr.iv
- %uglygep3031 = bitcast i8* %uglygep30 to <4 x i32>*
- %scevgep32 = getelementptr <4 x i32>, <4 x i32>* %uglygep3031, i64 258
- %wide.load.1 = load <4 x i32>, <4 x i32>* %scevgep32, align 16
- %uglygep27 = getelementptr i8, i8* bitcast ([1024 x i32]* @a to i8*), i64 %lsr.iv
- %uglygep2728 = bitcast i8* %uglygep27 to <4 x i32>*
- %scevgep29 = getelementptr <4 x i32>, <4 x i32>* %uglygep2728, i64 259
- %wide.load10.1 = load <4 x i32>, <4 x i32>* %scevgep29, align 16
- %2 = and <4 x i32> %wide.load.1, %0
- %3 = and <4 x i32> %wide.load10.1, %1
- %uglygep24 = getelementptr i8, i8* bitcast ([1024 x i32]* @a to i8*), i64 %lsr.iv
- %uglygep2425 = bitcast i8* %uglygep24 to <4 x i32>*
- %scevgep26 = getelementptr <4 x i32>, <4 x i32>* %uglygep2425, i64 260
- %wide.load.2 = load <4 x i32>, <4 x i32>* %scevgep26, align 16
- %uglygep21 = getelementptr i8, i8* bitcast ([1024 x i32]* @a to i8*), i64 %lsr.iv
- %uglygep2122 = bitcast i8* %uglygep21 to <4 x i32>*
- %scevgep23 = getelementptr <4 x i32>, <4 x i32>* %uglygep2122, i64 261
- %wide.load10.2 = load <4 x i32>, <4 x i32>* %scevgep23, align 16
- %4 = and <4 x i32> %wide.load.2, %2
- %5 = and <4 x i32> %wide.load10.2, %3
- %uglygep18 = getelementptr i8, i8* bitcast ([1024 x i32]* @a to i8*), i64 %lsr.iv
- %uglygep1819 = bitcast i8* %uglygep18 to <4 x i32>*
- %scevgep20 = getelementptr <4 x i32>, <4 x i32>* %uglygep1819, i64 262
- %wide.load.3 = load <4 x i32>, <4 x i32>* %scevgep20, align 16
- %uglygep = getelementptr i8, i8* bitcast ([1024 x i32]* @a to i8*), i64 %lsr.iv
- %uglygep17 = bitcast i8* %uglygep to <4 x i32>*
- %scevgep = getelementptr <4 x i32>, <4 x i32>* %uglygep17, i64 263
- %wide.load10.3 = load <4 x i32>, <4 x i32>* %scevgep, align 16
- %6 = and <4 x i32> %wide.load.3, %4
- %7 = and <4 x i32> %wide.load10.3, %5
- %lsr.iv.next = add nsw i64 %lsr.iv, 128
- %8 = icmp eq i64 %lsr.iv.next, 0
- br i1 %8, label %middle.block, label %vector.body
-
-middle.block:
- %bin.rdx = and <4 x i32> %7, %6
- %rdx.shuf = shufflevector <4 x i32> %bin.rdx, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
- %bin.rdx11 = and <4 x i32> %bin.rdx, %rdx.shuf
- %rdx.shuf12 = shufflevector <4 x i32> %bin.rdx11, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
- %bin.rdx13 = and <4 x i32> %bin.rdx11, %rdx.shuf12
- %9 = extractelement <4 x i32> %bin.rdx13, i32 0
- ret i32 %9
-}
-
-define float @reduce_add_float(float* nocapture readonly %a) {
-; CHECK-LABEL: reduce_add_float
-; CHECK: Detected a reduction operation: {{.*}} fadd fast
-; CHECK: Detected a reduction operation: {{.*}} fadd fast
-; CHECK: Detected a reduction operation: {{.*}} fadd fast
-; CHECK: Detected a reduction operation: {{.*}} fadd fast
-; CHECK: Detected a reduction operation: {{.*}} fadd fast
-; CHECK: Detected a reduction operation: {{.*}} fadd fast
-; CHECK: Detected a reduction operation: {{.*}} fadd fast
-; CHECK: Detected a reduction operation: {{.*}} fadd fast
-; CHECK: Detected a reduction operation: {{.*}} fadd fast
-; CHECK: Detected a reduction operation: {{.*}} fadd fast
-; CHECK: Detected a reduction operation: {{.*}} fadd fast
-;
-entry:
- br label %vector.body
-
-vector.body:
- %index = phi i64 [ 0, %entry ], [ %index.next.4, %vector.body ]
- %vec.phi = phi <4 x float> [ zeroinitializer, %entry ], [ %28, %vector.body ]
- %vec.phi9 = phi <4 x float> [ zeroinitializer, %entry ], [ %29, %vector.body ]
- %0 = getelementptr inbounds float, float* %a, i64 %index
- %1 = bitcast float* %0 to <4 x float>*
- %wide.load = load <4 x float>, <4 x float>* %1, align 4
- %2 = getelementptr float, float* %0, i64 4
- %3 = bitcast float* %2 to <4 x float>*
- %wide.load10 = load <4 x float>, <4 x float>* %3, align 4
- %4 = fadd fast <4 x float> %wide.load, %vec.phi
- %5 = fadd fast <4 x float> %wide.load10, %vec.phi9
- %index.next = add nuw nsw i64 %index, 8
- %6 = getelementptr inbounds float, float* %a, i64 %index.next
- %7 = bitcast float* %6 to <4 x float>*
- %wide.load.1 = load <4 x float>, <4 x float>* %7, align 4
- %8 = getelementptr float, float* %6, i64 4
- %9 = bitcast float* %8 to <4 x float>*
- %wide.load10.1 = load <4 x float>, <4 x float>* %9, align 4
- %10 = fadd fast <4 x float> %wide.load.1, %4
- %11 = fadd fast <4 x float> %wide.load10.1, %5
- %index.next.1 = add nsw i64 %index, 16
- %12 = getelementptr inbounds float, float* %a, i64 %index.next.1
- %13 = bitcast float* %12 to <4 x float>*
- %wide.load.2 = load <4 x float>, <4 x float>* %13, align 4
- %14 = getelementptr float, float* %12, i64 4
- %15 = bitcast float* %14 to <4 x float>*
- %wide.load10.2 = load <4 x float>, <4 x float>* %15, align 4
- %16 = fadd fast <4 x float> %wide.load.2, %10
- %17 = fadd fast <4 x float> %wide.load10.2, %11
- %index.next.2 = add nsw i64 %index, 24
- %18 = getelementptr inbounds float, float* %a, i64 %index.next.2
- %19 = bitcast float* %18 to <4 x float>*
- %wide.load.3 = load <4 x float>, <4 x float>* %19, align 4
- %20 = getelementptr float, float* %18, i64 4
- %21 = bitcast float* %20 to <4 x float>*
- %wide.load10.3 = load <4 x float>, <4 x float>* %21, align 4
- %22 = fadd fast <4 x float> %wide.load.3, %16
- %23 = fadd fast <4 x float> %wide.load10.3, %17
- %index.next.3 = add nsw i64 %index, 32
- %24 = getelementptr inbounds float, float* %a, i64 %index.next.3
- %25 = bitcast float* %24 to <4 x float>*
- %wide.load.4 = load <4 x float>, <4 x float>* %25, align 4
- %26 = getelementptr float, float* %24, i64 4
- %27 = bitcast float* %26 to <4 x float>*
- %wide.load10.4 = load <4 x float>, <4 x float>* %27, align 4
- %28 = fadd fast <4 x float> %wide.load.4, %22
- %29 = fadd fast <4 x float> %wide.load10.4, %23
- %index.next.4 = add nsw i64 %index, 40
- %30 = icmp eq i64 %index.next.4, 1000
- br i1 %30, label %middle.block, label %vector.body
-
-middle.block:
- %.lcssa15 = phi <4 x float> [ %29, %vector.body ]
- %.lcssa = phi <4 x float> [ %28, %vector.body ]
- %bin.rdx = fadd fast <4 x float> %.lcssa15, %.lcssa
- %rdx.shuf = shufflevector <4 x float> %bin.rdx, <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
- %bin.rdx11 = fadd fast <4 x float> %bin.rdx, %rdx.shuf
- %rdx.shuf12 = shufflevector <4 x float> %bin.rdx11, <4 x float> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
- %bin.rdx13 = fadd fast <4 x float> %bin.rdx11, %rdx.shuf12
- %31 = extractelement <4 x float> %bin.rdx13, i32 0
- ret float %31
-}
; CHECK-NEXT: Expand reduction intrinsics
; CHECK-NEXT: Dominator Tree Construction
; CHECK-NEXT: Interleaved Access Pass
+; CHECK-NEXT: X86 Partial Reduction
; CHECK-NEXT: Expand indirectbr instructions
; CHECK-NEXT: Dominator Tree Construction
; CHECK-NEXT: Natural Loop Information
; AVX1-NEXT: # =>This Inner Loop Header: Depth=1
; AVX1-NEXT: vmovdqu (%rsi,%rcx,2), %xmm2
; AVX1-NEXT: vmovdqu 16(%rsi,%rcx,2), %xmm3
+; AVX1-NEXT: vpmaddwd (%rdi,%rcx,2), %xmm2, %xmm2
; AVX1-NEXT: vpmaddwd 16(%rdi,%rcx,2), %xmm3, %xmm3
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4
; AVX1-NEXT: vpaddd %xmm4, %xmm3, %xmm3
-; AVX1-NEXT: vpmaddwd (%rdi,%rcx,2), %xmm2, %xmm2
; AVX1-NEXT: vpaddd %xmm1, %xmm2, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1
; AVX1-NEXT: addq $16, %rcx
; AVX1-NEXT: vmovdqu 16(%rsi,%rcx,2), %xmm4
; AVX1-NEXT: vmovdqu 32(%rsi,%rcx,2), %xmm5
; AVX1-NEXT: vmovdqu 48(%rsi,%rcx,2), %xmm6
+; AVX1-NEXT: vpmaddwd (%rdi,%rcx,2), %xmm3, %xmm3
+; AVX1-NEXT: vpmaddwd 16(%rdi,%rcx,2), %xmm4, %xmm4
+; AVX1-NEXT: vpmaddwd 32(%rdi,%rcx,2), %xmm5, %xmm5
; AVX1-NEXT: vpmaddwd 48(%rdi,%rcx,2), %xmm6, %xmm6
; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm7
; AVX1-NEXT: vpaddd %xmm7, %xmm6, %xmm6
-; AVX1-NEXT: vpmaddwd 32(%rdi,%rcx,2), %xmm5, %xmm5
; AVX1-NEXT: vpaddd %xmm2, %xmm5, %xmm2
; AVX1-NEXT: vinsertf128 $1, %xmm6, %ymm2, %ymm2
-; AVX1-NEXT: vpmaddwd 16(%rdi,%rcx,2), %xmm4, %xmm4
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm5
; AVX1-NEXT: vpaddd %xmm5, %xmm4, %xmm4
-; AVX1-NEXT: vpmaddwd (%rdi,%rcx,2), %xmm3, %xmm3
; AVX1-NEXT: vpaddd %xmm1, %xmm3, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm1
; AVX1-NEXT: addq $16, %rcx
; AVX2-NEXT: # =>This Inner Loop Header: Depth=1
; AVX2-NEXT: vmovdqu (%rsi,%rcx,2), %ymm3
; AVX2-NEXT: vmovdqu 32(%rsi,%rcx,2), %ymm4
-; AVX2-NEXT: vpmaddwd 32(%rdi,%rcx,2), %ymm4, %ymm4
-; AVX2-NEXT: vpaddd %ymm2, %ymm4, %ymm2
; AVX2-NEXT: vpmaddwd (%rdi,%rcx,2), %ymm3, %ymm3
; AVX2-NEXT: vpaddd %ymm1, %ymm3, %ymm1
+; AVX2-NEXT: vpmaddwd 32(%rdi,%rcx,2), %ymm4, %ymm3
+; AVX2-NEXT: vpaddd %ymm2, %ymm3, %ymm2
; AVX2-NEXT: addq $16, %rcx
; AVX2-NEXT: cmpq %rcx, %rax
; AVX2-NEXT: jne .LBB3_1
; SSE2-NEXT: # =>This Inner Loop Header: Depth=1
; SSE2-NEXT: movdqu (%rdi,%rcx), %xmm3
; SSE2-NEXT: movdqu (%rsi,%rcx), %xmm4
-; SSE2-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm3[8],xmm5[9],xmm3[9],xmm5[10],xmm3[10],xmm5[11],xmm3[11],xmm5[12],xmm3[12],xmm5[13],xmm3[13],xmm5[14],xmm3[14],xmm5[15],xmm3[15]
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3],xmm5[4],xmm3[4],xmm5[5],xmm3[5],xmm5[6],xmm3[6],xmm5[7],xmm3[7]
; SSE2-NEXT: psraw $8, %xmm5
-; SSE2-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm4[8],xmm6[9],xmm4[9],xmm6[10],xmm4[10],xmm6[11],xmm4[11],xmm6[12],xmm4[12],xmm6[13],xmm4[13],xmm6[14],xmm4[14],xmm6[15],xmm4[15]
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm4[0],xmm6[1],xmm4[1],xmm6[2],xmm4[2],xmm6[3],xmm4[3],xmm6[4],xmm4[4],xmm6[5],xmm4[5],xmm6[6],xmm4[6],xmm6[7],xmm4[7]
; SSE2-NEXT: psraw $8, %xmm6
; SSE2-NEXT: pmaddwd %xmm5, %xmm6
-; SSE2-NEXT: paddd %xmm6, %xmm1
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE2-NEXT: paddd %xmm6, %xmm2
+; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; SSE2-NEXT: psraw $8, %xmm3
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; SSE2-NEXT: psraw $8, %xmm4
; SSE2-NEXT: pmaddwd %xmm3, %xmm4
-; SSE2-NEXT: paddd %xmm4, %xmm2
+; SSE2-NEXT: paddd %xmm4, %xmm1
; SSE2-NEXT: addq $16, %rcx
; SSE2-NEXT: cmpq %rcx, %rax
; SSE2-NEXT: jne .LBB6_1
; AVX1-NEXT: .p2align 4, 0x90
; AVX1-NEXT: .LBB6_1: # %vector.body
; AVX1-NEXT: # =>This Inner Loop Header: Depth=1
-; AVX1-NEXT: vpmovsxbw (%rdi,%rcx), %xmm2
-; AVX1-NEXT: vpmovsxbw 8(%rdi,%rcx), %xmm3
-; AVX1-NEXT: vpmovsxbw (%rsi,%rcx), %xmm4
-; AVX1-NEXT: vpmaddwd %xmm2, %xmm4, %xmm2
+; AVX1-NEXT: vpmovsxbw 8(%rdi,%rcx), %xmm2
+; AVX1-NEXT: vpmovsxbw (%rdi,%rcx), %xmm3
; AVX1-NEXT: vpmovsxbw 8(%rsi,%rcx), %xmm4
+; AVX1-NEXT: vpmaddwd %xmm2, %xmm4, %xmm2
+; AVX1-NEXT: vpmovsxbw (%rsi,%rcx), %xmm4
; AVX1-NEXT: vpmaddwd %xmm3, %xmm4, %xmm3
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4
-; AVX1-NEXT: vpaddd %xmm4, %xmm3, %xmm3
-; AVX1-NEXT: vpaddd %xmm1, %xmm2, %xmm1
-; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1
+; AVX1-NEXT: vpaddd %xmm4, %xmm2, %xmm2
+; AVX1-NEXT: vpaddd %xmm1, %xmm3, %xmm1
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
; AVX1-NEXT: addq $16, %rcx
; AVX1-NEXT: cmpq %rcx, %rax
; AVX1-NEXT: jne .LBB6_1
; SSE2-NEXT: .p2align 4, 0x90
; SSE2-NEXT: .LBB7_1: # %vector.body
; SSE2-NEXT: # =>This Inner Loop Header: Depth=1
-; SSE2-NEXT: movdqu (%rdi,%rcx), %xmm10
-; SSE2-NEXT: movdqu 16(%rdi,%rcx), %xmm7
-; SSE2-NEXT: movdqu (%rsi,%rcx), %xmm9
-; SSE2-NEXT: movdqu 16(%rsi,%rcx), %xmm0
-; SSE2-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm7[8],xmm5[9],xmm7[9],xmm5[10],xmm7[10],xmm5[11],xmm7[11],xmm5[12],xmm7[12],xmm5[13],xmm7[13],xmm5[14],xmm7[14],xmm5[15],xmm7[15]
+; SSE2-NEXT: movdqu (%rdi,%rcx), %xmm7
+; SSE2-NEXT: movdqu 16(%rdi,%rcx), %xmm10
+; SSE2-NEXT: movdqu (%rsi,%rcx), %xmm0
+; SSE2-NEXT: movdqu 16(%rsi,%rcx), %xmm9
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm7[0],xmm5[1],xmm7[1],xmm5[2],xmm7[2],xmm5[3],xmm7[3],xmm5[4],xmm7[4],xmm5[5],xmm7[5],xmm5[6],xmm7[6],xmm5[7],xmm7[7]
; SSE2-NEXT: psraw $8, %xmm5
-; SSE2-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm0[8],xmm6[9],xmm0[9],xmm6[10],xmm0[10],xmm6[11],xmm0[11],xmm6[12],xmm0[12],xmm6[13],xmm0[13],xmm6[14],xmm0[14],xmm6[15],xmm0[15]
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1],xmm6[2],xmm0[2],xmm6[3],xmm0[3],xmm6[4],xmm0[4],xmm6[5],xmm0[5],xmm6[6],xmm0[6],xmm6[7],xmm0[7]
; SSE2-NEXT: psraw $8, %xmm6
; SSE2-NEXT: pmaddwd %xmm5, %xmm6
-; SSE2-NEXT: paddd %xmm6, %xmm3
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm7[0],xmm5[1],xmm7[1],xmm5[2],xmm7[2],xmm5[3],xmm7[3],xmm5[4],xmm7[4],xmm5[5],xmm7[5],xmm5[6],xmm7[6],xmm5[7],xmm7[7]
+; SSE2-NEXT: paddd %xmm6, %xmm2
+; SSE2-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm7[8],xmm5[9],xmm7[9],xmm5[10],xmm7[10],xmm5[11],xmm7[11],xmm5[12],xmm7[12],xmm5[13],xmm7[13],xmm5[14],xmm7[14],xmm5[15],xmm7[15]
; SSE2-NEXT: psraw $8, %xmm5
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE2-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; SSE2-NEXT: psraw $8, %xmm0
; SSE2-NEXT: pmaddwd %xmm5, %xmm0
-; SSE2-NEXT: paddd %xmm0, %xmm1
-; SSE2-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm10[8],xmm0[9],xmm10[9],xmm0[10],xmm10[10],xmm0[11],xmm10[11],xmm0[12],xmm10[12],xmm0[13],xmm10[13],xmm0[14],xmm10[14],xmm0[15],xmm10[15]
+; SSE2-NEXT: paddd %xmm0, %xmm4
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm10[0],xmm0[1],xmm10[1],xmm0[2],xmm10[2],xmm0[3],xmm10[3],xmm0[4],xmm10[4],xmm0[5],xmm10[5],xmm0[6],xmm10[6],xmm0[7],xmm10[7]
; SSE2-NEXT: psraw $8, %xmm0
-; SSE2-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm9[8],xmm5[9],xmm9[9],xmm5[10],xmm9[10],xmm5[11],xmm9[11],xmm5[12],xmm9[12],xmm5[13],xmm9[13],xmm5[14],xmm9[14],xmm5[15],xmm9[15]
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm9[0],xmm5[1],xmm9[1],xmm5[2],xmm9[2],xmm5[3],xmm9[3],xmm5[4],xmm9[4],xmm5[5],xmm9[5],xmm5[6],xmm9[6],xmm5[7],xmm9[7]
; SSE2-NEXT: psraw $8, %xmm5
; SSE2-NEXT: pmaddwd %xmm0, %xmm5
-; SSE2-NEXT: paddd %xmm5, %xmm4
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm10[0],xmm0[1],xmm10[1],xmm0[2],xmm10[2],xmm0[3],xmm10[3],xmm0[4],xmm10[4],xmm0[5],xmm10[5],xmm0[6],xmm10[6],xmm0[7],xmm10[7]
+; SSE2-NEXT: paddd %xmm5, %xmm1
+; SSE2-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm10[8],xmm0[9],xmm10[9],xmm0[10],xmm10[10],xmm0[11],xmm10[11],xmm0[12],xmm10[12],xmm0[13],xmm10[13],xmm0[14],xmm10[14],xmm0[15],xmm10[15]
; SSE2-NEXT: psraw $8, %xmm0
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm9[0],xmm5[1],xmm9[1],xmm5[2],xmm9[2],xmm5[3],xmm9[3],xmm5[4],xmm9[4],xmm5[5],xmm9[5],xmm5[6],xmm9[6],xmm5[7],xmm9[7]
+; SSE2-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm9[8],xmm5[9],xmm9[9],xmm5[10],xmm9[10],xmm5[11],xmm9[11],xmm5[12],xmm9[12],xmm5[13],xmm9[13],xmm5[14],xmm9[14],xmm5[15],xmm9[15]
; SSE2-NEXT: psraw $8, %xmm5
; SSE2-NEXT: pmaddwd %xmm0, %xmm5
-; SSE2-NEXT: paddd %xmm5, %xmm2
+; SSE2-NEXT: paddd %xmm5, %xmm3
; SSE2-NEXT: addq $32, %rcx
; SSE2-NEXT: cmpq %rcx, %rax
; SSE2-NEXT: jne .LBB7_1
; AVX1-NEXT: .p2align 4, 0x90
; AVX1-NEXT: .LBB7_1: # %vector.body
; AVX1-NEXT: # =>This Inner Loop Header: Depth=1
-; AVX1-NEXT: vpmovsxbw (%rdi,%rcx), %xmm3
-; AVX1-NEXT: vpmovsxbw 8(%rdi,%rcx), %xmm4
-; AVX1-NEXT: vpmovsxbw 16(%rdi,%rcx), %xmm5
-; AVX1-NEXT: vpmovsxbw 24(%rdi,%rcx), %xmm6
-; AVX1-NEXT: vpmovsxbw (%rsi,%rcx), %xmm7
+; AVX1-NEXT: vpmovsxbw 24(%rdi,%rcx), %xmm3
+; AVX1-NEXT: vpmovsxbw 16(%rdi,%rcx), %xmm4
+; AVX1-NEXT: vpmovsxbw 8(%rdi,%rcx), %xmm5
+; AVX1-NEXT: vpmovsxbw (%rdi,%rcx), %xmm6
+; AVX1-NEXT: vpmovsxbw 24(%rsi,%rcx), %xmm7
; AVX1-NEXT: vpmaddwd %xmm3, %xmm7, %xmm3
-; AVX1-NEXT: vpmovsxbw 8(%rsi,%rcx), %xmm7
-; AVX1-NEXT: vpmaddwd %xmm4, %xmm7, %xmm4
; AVX1-NEXT: vpmovsxbw 16(%rsi,%rcx), %xmm7
+; AVX1-NEXT: vpmaddwd %xmm4, %xmm7, %xmm4
+; AVX1-NEXT: vpmovsxbw 8(%rsi,%rcx), %xmm7
; AVX1-NEXT: vpmaddwd %xmm5, %xmm7, %xmm5
-; AVX1-NEXT: vpmovsxbw 24(%rsi,%rcx), %xmm7
+; AVX1-NEXT: vpmovsxbw (%rsi,%rcx), %xmm7
; AVX1-NEXT: vpmaddwd %xmm6, %xmm7, %xmm6
; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm7
-; AVX1-NEXT: vpaddd %xmm7, %xmm6, %xmm6
-; AVX1-NEXT: vpaddd %xmm2, %xmm5, %xmm2
-; AVX1-NEXT: vinsertf128 $1, %xmm6, %ymm2, %ymm2
-; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm5
-; AVX1-NEXT: vpaddd %xmm5, %xmm4, %xmm4
-; AVX1-NEXT: vpaddd %xmm1, %xmm3, %xmm1
-; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm1
+; AVX1-NEXT: vpaddd %xmm7, %xmm3, %xmm3
+; AVX1-NEXT: vpaddd %xmm2, %xmm4, %xmm2
+; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
+; AVX1-NEXT: vpaddd %xmm3, %xmm5, %xmm3
+; AVX1-NEXT: vpaddd %xmm1, %xmm6, %xmm1
+; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1
; AVX1-NEXT: addq $32, %rcx
; AVX1-NEXT: cmpq %rcx, %rax
; AVX1-NEXT: jne .LBB7_1
; AVX2-NEXT: .p2align 4, 0x90
; AVX2-NEXT: .LBB7_1: # %vector.body
; AVX2-NEXT: # =>This Inner Loop Header: Depth=1
-; AVX2-NEXT: vpmovsxbw (%rdi,%rcx), %ymm3
-; AVX2-NEXT: vpmovsxbw 16(%rdi,%rcx), %ymm4
-; AVX2-NEXT: vpmovsxbw (%rsi,%rcx), %ymm5
+; AVX2-NEXT: vpmovsxbw 16(%rdi,%rcx), %ymm3
+; AVX2-NEXT: vpmovsxbw (%rdi,%rcx), %ymm4
+; AVX2-NEXT: vpmovsxbw 16(%rsi,%rcx), %ymm5
; AVX2-NEXT: vpmaddwd %ymm3, %ymm5, %ymm3
-; AVX2-NEXT: vpaddd %ymm1, %ymm3, %ymm1
-; AVX2-NEXT: vpmovsxbw 16(%rsi,%rcx), %ymm3
-; AVX2-NEXT: vpmaddwd %ymm4, %ymm3, %ymm3
; AVX2-NEXT: vpaddd %ymm2, %ymm3, %ymm2
+; AVX2-NEXT: vpmovsxbw (%rsi,%rcx), %ymm3
+; AVX2-NEXT: vpmaddwd %ymm4, %ymm3, %ymm3
+; AVX2-NEXT: vpaddd %ymm1, %ymm3, %ymm1
; AVX2-NEXT: addq $32, %rcx
; AVX2-NEXT: cmpq %rcx, %rax
; AVX2-NEXT: jne .LBB7_1
;
; AVX1-LABEL: pmaddwd_16:
; AVX1: # %bb.0:
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
-; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
-; AVX1-NEXT: vpmaddwd %xmm3, %xmm2, %xmm2
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
+; AVX1-NEXT: vpmaddwd %xmm2, %xmm3, %xmm2
; AVX1-NEXT: vpmaddwd %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX1-LABEL: pmaddwd_32:
; AVX1: # %bb.0:
-; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4
+; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm4
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5
-; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm6
-; AVX1-NEXT: vpmaddwd %xmm6, %xmm4, %xmm4
-; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm6
-; AVX1-NEXT: vpmaddwd %xmm6, %xmm5, %xmm5
+; AVX1-NEXT: vpmaddwd %xmm4, %xmm5, %xmm4
; AVX1-NEXT: vpmaddwd %xmm2, %xmm0, %xmm0
-; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm0
+; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0
+; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm2
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4
+; AVX1-NEXT: vpmaddwd %xmm2, %xmm4, %xmm2
; AVX1-NEXT: vpmaddwd %xmm3, %xmm1, %xmm1
-; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm1
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
; AVX1-NEXT: retq
;
; AVX2-LABEL: pmaddwd_32:
;
; AVX512F-LABEL: pmaddwd_32:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm2
-; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm3
-; AVX512F-NEXT: vpmaddwd %ymm3, %ymm2, %ymm2
+; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm2
+; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm3
+; AVX512F-NEXT: vpmaddwd %ymm2, %ymm3, %ymm2
; AVX512F-NEXT: vpmaddwd %ymm1, %ymm0, %ymm0
; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
; AVX512F-NEXT: retq
;
; AVX1-LABEL: jumbled_indices8:
; AVX1: # %bb.0:
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
-; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
-; AVX1-NEXT: vpmaddwd %xmm3, %xmm2, %xmm2
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
+; AVX1-NEXT: vpmaddwd %xmm2, %xmm3, %xmm2
; AVX1-NEXT: vpmaddwd %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX1-LABEL: jumbled_indices16:
; AVX1: # %bb.0:
-; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4
+; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm4
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5
-; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm6
-; AVX1-NEXT: vpmaddwd %xmm6, %xmm4, %xmm4
-; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm6
-; AVX1-NEXT: vpmaddwd %xmm6, %xmm5, %xmm5
+; AVX1-NEXT: vpmaddwd %xmm4, %xmm5, %xmm4
; AVX1-NEXT: vpmaddwd %xmm2, %xmm0, %xmm0
-; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm0
+; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0
+; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm2
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4
+; AVX1-NEXT: vpmaddwd %xmm2, %xmm4, %xmm2
; AVX1-NEXT: vpmaddwd %xmm3, %xmm1, %xmm1
-; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm1
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
; AVX1-NEXT: retq
;
; AVX2-LABEL: jumbled_indices16:
;
; AVX512F-LABEL: jumbled_indices16:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm2
-; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm3
-; AVX512F-NEXT: vpmaddwd %ymm3, %ymm2, %ymm2
+; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm2
+; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm3
+; AVX512F-NEXT: vpmaddwd %ymm2, %ymm3, %ymm2
; AVX512F-NEXT: vpmaddwd %ymm1, %ymm0, %ymm0
; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
; AVX512F-NEXT: retq
;
; AVX1-LABEL: jumbled_indices32:
; AVX1: # %bb.0:
-; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm8
-; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm9
-; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm10
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm11
-; AVX1-NEXT: vextractf128 $1, %ymm7, %xmm12
-; AVX1-NEXT: vpmaddwd %xmm12, %xmm8, %xmm8
-; AVX1-NEXT: vextractf128 $1, %ymm6, %xmm12
-; AVX1-NEXT: vpmaddwd %xmm12, %xmm9, %xmm9
-; AVX1-NEXT: vextractf128 $1, %ymm5, %xmm12
-; AVX1-NEXT: vpmaddwd %xmm12, %xmm10, %xmm10
-; AVX1-NEXT: vextractf128 $1, %ymm4, %xmm12
-; AVX1-NEXT: vpmaddwd %xmm12, %xmm11, %xmm11
+; AVX1-NEXT: vextractf128 $1, %ymm4, %xmm8
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm9
+; AVX1-NEXT: vpmaddwd %xmm8, %xmm9, %xmm8
; AVX1-NEXT: vpmaddwd %xmm4, %xmm0, %xmm0
-; AVX1-NEXT: vinsertf128 $1, %xmm11, %ymm0, %ymm0
+; AVX1-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm0
+; AVX1-NEXT: vextractf128 $1, %ymm5, %xmm8
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4
+; AVX1-NEXT: vpmaddwd %xmm4, %xmm8, %xmm4
; AVX1-NEXT: vpmaddwd %xmm5, %xmm1, %xmm1
-; AVX1-NEXT: vinsertf128 $1, %xmm10, %ymm1, %ymm1
+; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm1
+; AVX1-NEXT: vextractf128 $1, %ymm6, %xmm4
+; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm5
+; AVX1-NEXT: vpmaddwd %xmm4, %xmm5, %xmm4
; AVX1-NEXT: vpmaddwd %xmm6, %xmm2, %xmm2
-; AVX1-NEXT: vinsertf128 $1, %xmm9, %ymm2, %ymm2
+; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2
+; AVX1-NEXT: vextractf128 $1, %ymm7, %xmm4
+; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm5
+; AVX1-NEXT: vpmaddwd %xmm4, %xmm5, %xmm4
; AVX1-NEXT: vpmaddwd %xmm7, %xmm3, %xmm3
-; AVX1-NEXT: vinsertf128 $1, %xmm8, %ymm3, %ymm3
+; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3
; AVX1-NEXT: retq
;
; AVX2-LABEL: jumbled_indices32:
; AVX-NEXT: vmovdqu (%rdx), %xmm1
; AVX-NEXT: vpmaddwd (%rcx), %xmm1, %xmm1
; AVX-NEXT: vpmaddwd (%rsi), %xmm0, %xmm0
-; AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vpaddd %xmm0, %xmm1, %xmm0
; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; SSE2-NEXT: movdqu (%r8), %xmm0
; SSE2-NEXT: movdqu (%r9), %xmm3
; SSE2-NEXT: pmaddwd %xmm0, %xmm3
-; SSE2-NEXT: paddd %xmm2, %xmm3
+; SSE2-NEXT: paddd %xmm1, %xmm3
; SSE2-NEXT: movdqu (%rax), %xmm0
-; SSE2-NEXT: movdqu (%r10), %xmm2
-; SSE2-NEXT: pmaddwd %xmm0, %xmm2
-; SSE2-NEXT: paddd %xmm3, %xmm2
-; SSE2-NEXT: paddd %xmm1, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
-; SSE2-NEXT: paddd %xmm2, %xmm0
+; SSE2-NEXT: movdqu (%r10), %xmm1
+; SSE2-NEXT: pmaddwd %xmm0, %xmm1
+; SSE2-NEXT: paddd %xmm3, %xmm1
+; SSE2-NEXT: paddd %xmm2, %xmm1
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; SSE2-NEXT: paddd %xmm1, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; SSE2-NEXT: paddd %xmm0, %xmm1
; SSE2-NEXT: movd %xmm1, %eax
; AVX-NEXT: vpmaddwd (%rsi), %xmm0, %xmm0
; AVX-NEXT: vmovdqu (%r8), %xmm2
; AVX-NEXT: vpmaddwd (%r9), %xmm2, %xmm2
-; AVX-NEXT: vpaddd %xmm2, %xmm1, %xmm1
+; AVX-NEXT: vpaddd %xmm2, %xmm0, %xmm0
; AVX-NEXT: vmovdqu (%rax), %xmm2
; AVX-NEXT: vpmaddwd (%r10), %xmm2, %xmm2
-; AVX-NEXT: vpaddd %xmm2, %xmm1, %xmm1
-; AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vpaddd %xmm2, %xmm0, %xmm0
+; AVX-NEXT: vpaddd %xmm0, %xmm1, %xmm0
; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; CHECK-NEXT: .p2align 4, 0x90
; CHECK-NEXT: .LBB8_1: # %vector.body
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: vpmovsxbw (%rdi,%rcx), %ymm3
-; CHECK-NEXT: vpmovsxbw 16(%rdi,%rcx), %ymm4
-; CHECK-NEXT: vpmovsxbw (%rsi,%rcx), %ymm5
+; CHECK-NEXT: vpmovsxbw 16(%rdi,%rcx), %ymm3
+; CHECK-NEXT: vpmovsxbw (%rdi,%rcx), %ymm4
+; CHECK-NEXT: vpmovsxbw 16(%rsi,%rcx), %ymm5
; CHECK-NEXT: vpmaddwd %ymm3, %ymm5, %ymm3
-; CHECK-NEXT: vpaddd %ymm1, %ymm3, %ymm1
-; CHECK-NEXT: vpmovsxbw 16(%rsi,%rcx), %ymm3
-; CHECK-NEXT: vpmaddwd %ymm4, %ymm3, %ymm3
; CHECK-NEXT: vpaddd %ymm2, %ymm3, %ymm2
+; CHECK-NEXT: vpmovsxbw (%rsi,%rcx), %ymm3
+; CHECK-NEXT: vpmaddwd %ymm4, %ymm3, %ymm3
+; CHECK-NEXT: vpaddd %ymm1, %ymm3, %ymm1
; CHECK-NEXT: addq $32, %rcx
; CHECK-NEXT: cmpq %rcx, %rax
; CHECK-NEXT: jne .LBB8_1
; SSE2-NEXT: .p2align 4, 0x90
; SSE2-NEXT: .LBB1_1: # %vector.body
; SSE2-NEXT: # =>This Inner Loop Header: Depth=1
-; SSE2-NEXT: movdqa a+1040(%rax), %xmm3
-; SSE2-NEXT: psadbw b+1040(%rax), %xmm3
-; SSE2-NEXT: paddd %xmm3, %xmm1
; SSE2-NEXT: movdqa a+1024(%rax), %xmm3
; SSE2-NEXT: psadbw b+1024(%rax), %xmm3
; SSE2-NEXT: paddd %xmm3, %xmm2
+; SSE2-NEXT: movdqa a+1040(%rax), %xmm3
+; SSE2-NEXT: psadbw b+1040(%rax), %xmm3
+; SSE2-NEXT: paddd %xmm3, %xmm1
; SSE2-NEXT: addq $4, %rax
; SSE2-NEXT: jne .LBB1_1
; SSE2-NEXT: # %bb.2: # %middle.block
; AVX1-NEXT: .p2align 4, 0x90
; AVX1-NEXT: .LBB1_1: # %vector.body
; AVX1-NEXT: # =>This Inner Loop Header: Depth=1
-; AVX1-NEXT: vmovdqa a+1040(%rax), %xmm2
-; AVX1-NEXT: vpsadbw b+1040(%rax), %xmm2, %xmm2
-; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
-; AVX1-NEXT: vpaddd %xmm3, %xmm2, %xmm2
-; AVX1-NEXT: vmovdqa a+1024(%rax), %xmm3
-; AVX1-NEXT: vpsadbw b+1024(%rax), %xmm3, %xmm3
-; AVX1-NEXT: vpaddd %xmm1, %xmm3, %xmm1
-; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
+; AVX1-NEXT: vmovdqa a+1024(%rax), %xmm2
+; AVX1-NEXT: vpsadbw b+1024(%rax), %xmm2, %xmm2
+; AVX1-NEXT: vmovdqa a+1040(%rax), %xmm3
+; AVX1-NEXT: vpsadbw b+1040(%rax), %xmm3, %xmm3
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4
+; AVX1-NEXT: vpaddd %xmm4, %xmm3, %xmm3
+; AVX1-NEXT: vpaddd %xmm1, %xmm2, %xmm1
+; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1
; AVX1-NEXT: addq $4, %rax
; AVX1-NEXT: jne .LBB1_1
; AVX1-NEXT: # %bb.2: # %middle.block
; SSE2-NEXT: .p2align 4, 0x90
; SSE2-NEXT: .LBB2_1: # %vector.body
; SSE2-NEXT: # =>This Inner Loop Header: Depth=1
-; SSE2-NEXT: movdqa a+1056(%rax), %xmm5
-; SSE2-NEXT: psadbw b+1056(%rax), %xmm5
-; SSE2-NEXT: paddd %xmm5, %xmm2
-; SSE2-NEXT: movdqa a+1040(%rax), %xmm5
-; SSE2-NEXT: psadbw b+1040(%rax), %xmm5
-; SSE2-NEXT: paddd %xmm5, %xmm3
; SSE2-NEXT: movdqa a+1024(%rax), %xmm5
; SSE2-NEXT: psadbw b+1024(%rax), %xmm5
; SSE2-NEXT: paddd %xmm5, %xmm0
+; SSE2-NEXT: movdqa a+1040(%rax), %xmm5
+; SSE2-NEXT: psadbw b+1040(%rax), %xmm5
+; SSE2-NEXT: paddd %xmm5, %xmm3
+; SSE2-NEXT: movdqa a+1056(%rax), %xmm5
+; SSE2-NEXT: psadbw b+1056(%rax), %xmm5
+; SSE2-NEXT: paddd %xmm5, %xmm2
; SSE2-NEXT: movdqa a+1072(%rax), %xmm5
; SSE2-NEXT: psadbw b+1072(%rax), %xmm5
; SSE2-NEXT: paddd %xmm5, %xmm1
; AVX1-NEXT: .p2align 4, 0x90
; AVX1-NEXT: .LBB2_1: # %vector.body
; AVX1-NEXT: # =>This Inner Loop Header: Depth=1
-; AVX1-NEXT: vmovdqa a+1072(%rax), %xmm3
-; AVX1-NEXT: vpsadbw b+1072(%rax), %xmm3, %xmm3
-; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm4
-; AVX1-NEXT: vpaddd %xmm4, %xmm3, %xmm3
-; AVX1-NEXT: vmovdqa a+1056(%rax), %xmm4
-; AVX1-NEXT: vpsadbw b+1056(%rax), %xmm4, %xmm4
-; AVX1-NEXT: vpaddd %xmm2, %xmm4, %xmm2
-; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
-; AVX1-NEXT: vmovdqa a+1040(%rax), %xmm3
-; AVX1-NEXT: vpsadbw b+1040(%rax), %xmm3, %xmm3
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
-; AVX1-NEXT: vpaddd %xmm4, %xmm3, %xmm3
-; AVX1-NEXT: vmovdqa a+1024(%rax), %xmm4
-; AVX1-NEXT: vpsadbw b+1024(%rax), %xmm4, %xmm4
-; AVX1-NEXT: vpaddd %xmm0, %xmm4, %xmm0
-; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0
+; AVX1-NEXT: vmovdqa a+1024(%rax), %xmm3
+; AVX1-NEXT: vpsadbw b+1024(%rax), %xmm3, %xmm3
+; AVX1-NEXT: vmovdqa a+1040(%rax), %xmm4
+; AVX1-NEXT: vpsadbw b+1040(%rax), %xmm4, %xmm4
+; AVX1-NEXT: vmovdqa a+1056(%rax), %xmm5
+; AVX1-NEXT: vpsadbw b+1056(%rax), %xmm5, %xmm5
+; AVX1-NEXT: vmovdqa a+1072(%rax), %xmm6
+; AVX1-NEXT: vpsadbw b+1072(%rax), %xmm6, %xmm6
+; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm7
+; AVX1-NEXT: vpaddd %xmm7, %xmm6, %xmm6
+; AVX1-NEXT: vpaddd %xmm2, %xmm5, %xmm2
+; AVX1-NEXT: vinsertf128 $1, %xmm6, %ymm2, %ymm2
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5
+; AVX1-NEXT: vpaddd %xmm5, %xmm4, %xmm4
+; AVX1-NEXT: vpaddd %xmm0, %xmm3, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0
; AVX1-NEXT: addq $4, %rax
; AVX1-NEXT: jne .LBB2_1
; AVX1-NEXT: # %bb.2: # %middle.block
; AVX2-NEXT: .p2align 4, 0x90
; AVX2-NEXT: .LBB2_1: # %vector.body
; AVX2-NEXT: # =>This Inner Loop Header: Depth=1
-; AVX2-NEXT: vmovdqa a+1056(%rax), %ymm3
-; AVX2-NEXT: vpsadbw b+1056(%rax), %ymm3, %ymm3
-; AVX2-NEXT: vpaddd %ymm2, %ymm3, %ymm2
; AVX2-NEXT: vmovdqa a+1024(%rax), %ymm3
; AVX2-NEXT: vpsadbw b+1024(%rax), %ymm3, %ymm3
; AVX2-NEXT: vpaddd %ymm1, %ymm3, %ymm1
+; AVX2-NEXT: vmovdqa a+1056(%rax), %ymm3
+; AVX2-NEXT: vpsadbw b+1056(%rax), %ymm3, %ymm3
+; AVX2-NEXT: vpaddd %ymm2, %ymm3, %ymm2
; AVX2-NEXT: addq $4, %rax
; AVX2-NEXT: jne .LBB2_1
; AVX2-NEXT: # %bb.2: # %middle.block
; AVX512F-NEXT: .p2align 4, 0x90
; AVX512F-NEXT: .LBB2_1: # %vector.body
; AVX512F-NEXT: # =>This Inner Loop Header: Depth=1
-; AVX512F-NEXT: vmovdqa a+1056(%rax), %ymm2
-; AVX512F-NEXT: vpsadbw b+1056(%rax), %ymm2, %ymm2
-; AVX512F-NEXT: vmovdqa a+1024(%rax), %ymm3
-; AVX512F-NEXT: vpsadbw b+1024(%rax), %ymm3, %ymm3
-; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2
+; AVX512F-NEXT: vmovdqa a+1024(%rax), %ymm2
+; AVX512F-NEXT: vpsadbw b+1024(%rax), %ymm2, %ymm2
+; AVX512F-NEXT: vmovdqa a+1056(%rax), %ymm3
+; AVX512F-NEXT: vpsadbw b+1056(%rax), %ymm3, %ymm3
+; AVX512F-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2
; AVX512F-NEXT: vpaddd %zmm1, %zmm2, %zmm1
; AVX512F-NEXT: addq $4, %rax
; AVX512F-NEXT: jne .LBB2_1
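The sad_* hunks here exercise the psadbw (sum of absolute differences) matcher. As a rough sketch of the IR shape involved, narrowed to <4 x i8> lanes for brevity (hypothetical function name; illustrative only, not code from this patch): the lanes are zero-extended, their absolute difference is formed with a compare+select, and the vector is collapsed with log2(n) shuffle+add steps:

define i32 @sad_sketch(<4 x i8> %a, <4 x i8> %b) {
  ; widen so the subtraction cannot wrap, then take |a - b| per lane
  %za  = zext <4 x i8> %a to <4 x i32>
  %zb  = zext <4 x i8> %b to <4 x i32>
  %sub = sub <4 x i32> %za, %zb
  %neg = sub <4 x i32> zeroinitializer, %sub
  %cmp = icmp sgt <4 x i32> %sub, zeroinitializer
  %abs = select <4 x i1> %cmp, <4 x i32> %sub, <4 x i32> %neg
  ; horizontal reduction: two shuffle+add steps, then extract lane 0
  %s1  = shufflevector <4 x i32> %abs, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
  %a1  = add <4 x i32> %abs, %s1
  %s2  = shufflevector <4 x i32> %a1, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
  %a2  = add <4 x i32> %a1, %s2
  %r   = extractelement <4 x i32> %a2, i32 0
  ret i32 %r
}
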
; SSE2-NEXT: # =>This Inner Loop Header: Depth=1
; SSE2-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero
; SSE2-NEXT: movd {{.*#+}} xmm3 = mem[0],zero,zero,zero
-; SSE2-NEXT: pand %xmm1, %xmm3
; SSE2-NEXT: pand %xmm1, %xmm2
-; SSE2-NEXT: psadbw %xmm3, %xmm2
-; SSE2-NEXT: paddd %xmm2, %xmm0
+; SSE2-NEXT: pand %xmm1, %xmm3
+; SSE2-NEXT: psadbw %xmm2, %xmm3
+; SSE2-NEXT: paddd %xmm3, %xmm0
; SSE2-NEXT: addq $4, %rax
; SSE2-NEXT: jne .LBB3_1
; SSE2-NEXT: # %bb.2: # %middle.block
; AVX-NEXT: # =>This Inner Loop Header: Depth=1
; AVX-NEXT: vmovd {{.*#+}} xmm2 = mem[0],zero,zero,zero
; AVX-NEXT: vmovd {{.*#+}} xmm3 = mem[0],zero,zero,zero
-; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm0[1,2,3,4,5,6,7]
; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm0[1,2,3,4,5,6,7]
+; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm0[1,2,3,4,5,6,7]
; AVX-NEXT: vpsadbw %xmm3, %xmm2, %xmm2
; AVX-NEXT: vpaddd %xmm1, %xmm2, %xmm1
; AVX-NEXT: addq $4, %rax
; AVX-NEXT: # =>This Inner Loop Header: Depth=1
; AVX-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; AVX-NEXT: vmovd {{.*#+}} xmm2 = mem[0],zero,zero,zero
-; AVX-NEXT: vpsadbw %xmm1, %xmm2, %xmm1
+; AVX-NEXT: vpsadbw %xmm2, %xmm1, %xmm1
; AVX-NEXT: vpaddd %xmm0, %xmm1, %xmm0
; AVX-NEXT: addq $4, %rax
; AVX-NEXT: jne .LBB4_1
; SSE2-NEXT: movdqu (%rdi), %xmm0
; SSE2-NEXT: movdqu (%rsi), %xmm1
; SSE2-NEXT: psadbw %xmm0, %xmm1
+; SSE2-NEXT: movdqu (%rdx), %xmm0
+; SSE2-NEXT: movdqu (%rcx), %xmm2
+; SSE2-NEXT: psadbw %xmm0, %xmm2
; SSE2-NEXT: movl $1, %eax
; SSE2-NEXT: movd %eax, %xmm0
-; SSE2-NEXT: movdqu (%rdx), %xmm2
-; SSE2-NEXT: movdqu (%rcx), %xmm3
-; SSE2-NEXT: psadbw %xmm2, %xmm3
-; SSE2-NEXT: paddd %xmm0, %xmm3
-; SSE2-NEXT: paddd %xmm1, %xmm3
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[2,3,0,1]
-; SSE2-NEXT: paddd %xmm3, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; SSE2-NEXT: paddd %xmm2, %xmm0
+; SSE2-NEXT: paddd %xmm1, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; SSE2-NEXT: paddd %xmm0, %xmm1
-; SSE2-NEXT: movd %xmm1, %eax
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; SSE2-NEXT: paddd %xmm1, %xmm0
+; SSE2-NEXT: movd %xmm0, %eax
; SSE2-NEXT: retq
;
-; AVX1-LABEL: sad_unroll_nonzero_initial:
-; AVX1: # %bb.0: # %bb
-; AVX1-NEXT: vmovdqu (%rdi), %xmm0
-; AVX1-NEXT: vpsadbw (%rsi), %xmm0, %xmm0
-; AVX1-NEXT: vmovdqu (%rdx), %xmm1
-; AVX1-NEXT: vpsadbw (%rcx), %xmm1, %xmm1
-; AVX1-NEXT: movl $1, %eax
-; AVX1-NEXT: vmovd %eax, %xmm2
-; AVX1-NEXT: vpaddd %xmm2, %xmm1, %xmm1
-; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vmovd %xmm0, %eax
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: sad_unroll_nonzero_initial:
-; AVX2: # %bb.0: # %bb
-; AVX2-NEXT: vmovdqu (%rdi), %xmm0
-; AVX2-NEXT: vpsadbw (%rsi), %xmm0, %xmm0
-; AVX2-NEXT: movl $1, %eax
-; AVX2-NEXT: vmovd %eax, %xmm1
-; AVX2-NEXT: vmovdqu (%rdx), %xmm2
-; AVX2-NEXT: vpsadbw (%rcx), %xmm2, %xmm2
-; AVX2-NEXT: vpaddd %xmm2, %xmm1, %xmm1
-; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vmovd %xmm0, %eax
-; AVX2-NEXT: retq
-;
-; AVX512-LABEL: sad_unroll_nonzero_initial:
-; AVX512: # %bb.0: # %bb
-; AVX512-NEXT: vmovdqu (%rdi), %xmm0
-; AVX512-NEXT: vpsadbw (%rsi), %xmm0, %xmm0
-; AVX512-NEXT: movl $1, %eax
-; AVX512-NEXT: vmovd %eax, %xmm1
-; AVX512-NEXT: vmovdqu (%rdx), %xmm2
-; AVX512-NEXT: vpsadbw (%rcx), %xmm2, %xmm2
-; AVX512-NEXT: vpaddd %zmm2, %zmm1, %zmm1
-; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0
-; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0
-; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vmovd %xmm0, %eax
-; AVX512-NEXT: vzeroupper
-; AVX512-NEXT: retq
+; AVX-LABEL: sad_unroll_nonzero_initial:
+; AVX: # %bb.0: # %bb
+; AVX-NEXT: vmovdqu (%rdi), %xmm0
+; AVX-NEXT: vpsadbw (%rsi), %xmm0, %xmm0
+; AVX-NEXT: vmovdqu (%rdx), %xmm1
+; AVX-NEXT: vpsadbw (%rcx), %xmm1, %xmm1
+; AVX-NEXT: movl $1, %eax
+; AVX-NEXT: vmovd %eax, %xmm2
+; AVX-NEXT: vpaddd %xmm2, %xmm1, %xmm1
+; AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vmovd %xmm0, %eax
+; AVX-NEXT: retq
bb:
%tmp = load <16 x i8>, <16 x i8>* %arg, align 1
%tmp4 = load <16 x i8>, <16 x i8>* %arg1, align 1
; AVX-NEXT: vmovdqu (%rdx), %xmm1
; AVX-NEXT: vpsadbw (%rcx), %xmm1, %xmm1
; AVX-NEXT: vpsadbw (%rsi), %xmm0, %xmm0
-; AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vpaddd %xmm0, %xmm1, %xmm0
; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]