bool NoInfs : 1;
bool NoSignedZeros : 1;
bool AllowReciprocal : 1;
- bool VectorReduction : 1;
bool AllowContract : 1;
bool ApproximateFuncs : 1;
bool AllowReassociation : 1;
SDNodeFlags()
: AnyDefined(false), NoUnsignedWrap(false), NoSignedWrap(false),
Exact(false), NoNaNs(false), NoInfs(false),
- NoSignedZeros(false), AllowReciprocal(false), VectorReduction(false),
+ NoSignedZeros(false), AllowReciprocal(false),
AllowContract(false), ApproximateFuncs(false),
AllowReassociation(false), NoFPExcept(false) {}
setDefined();
AllowReciprocal = b;
}
- void setVectorReduction(bool b) {
- setDefined();
- VectorReduction = b;
- }
void setAllowContract(bool b) {
setDefined();
AllowContract = b;
bool hasNoInfs() const { return NoInfs; }
bool hasNoSignedZeros() const { return NoSignedZeros; }
bool hasAllowReciprocal() const { return AllowReciprocal; }
- bool hasVectorReduction() const { return VectorReduction; }
bool hasAllowContract() const { return AllowContract; }
bool hasApproximateFuncs() const { return ApproximateFuncs; }
bool hasAllowReassociation() const { return AllowReassociation; }
NoInfs &= Flags.NoInfs;
NoSignedZeros &= Flags.NoSignedZeros;
AllowReciprocal &= Flags.AllowReciprocal;
- VectorReduction &= Flags.VectorReduction;
AllowContract &= Flags.AllowContract;
ApproximateFuncs &= Flags.ApproximateFuncs;
AllowReassociation &= Flags.AllowReassociation;
if (N0.getOpcode() != Opc)
return SDValue();
- // Don't reassociate reductions.
- if (N0->getFlags().hasVectorReduction())
- return SDValue();
-
if (DAG.isConstantIntBuildVectorOrConstantInt(N0.getOperand(1))) {
if (DAG.isConstantIntBuildVectorOrConstantInt(N1)) {
// Reassociate: (op (op x, c1), c2) -> (op x, (op c1, c2))
SDValue DAGCombiner::reassociateOps(unsigned Opc, const SDLoc &DL, SDValue N0,
SDValue N1, SDNodeFlags Flags) {
assert(TLI.isCommutativeBinOp(Opc) && "Operation not commutative.");
- // Don't reassociate reductions.
- if (Flags.hasVectorReduction())
- return SDValue();
// Floating-point reassociation is not allowed without loose FP math.
if (N0.getValueType().isFloatingPoint() ||
visitBinary(I, ISD::FSUB);
}
-/// Checks if the given instruction performs a vector reduction, in which case
-/// we have the freedom to alter the elements in the result as long as the
-/// reduction of them stays unchanged.
-static bool isVectorReductionOp(const User *I) {
- const Instruction *Inst = dyn_cast<Instruction>(I);
- if (!Inst || !Inst->getType()->isVectorTy())
- return false;
-
- auto OpCode = Inst->getOpcode();
- switch (OpCode) {
- case Instruction::Add:
- case Instruction::Mul:
- case Instruction::And:
- case Instruction::Or:
- case Instruction::Xor:
- break;
- case Instruction::FAdd:
- case Instruction::FMul:
- if (const FPMathOperator *FPOp = dyn_cast<const FPMathOperator>(Inst))
- if (FPOp->getFastMathFlags().isFast())
- break;
- LLVM_FALLTHROUGH;
- default:
- return false;
- }
-
- unsigned ElemNum = Inst->getType()->getVectorNumElements();
- // Ensure the reduction size is a power of 2.
- if (!isPowerOf2_32(ElemNum))
- return false;
-
- unsigned ElemNumToReduce = ElemNum;
-
- // Do DFS search on the def-use chain from the given instruction. We only
- // allow four kinds of operations during the search until we reach the
- // instruction that extracts the first element from the vector:
- //
- // 1. The reduction operation of the same opcode as the given instruction.
- //
- // 2. PHI node.
- //
- // 3. ShuffleVector instruction together with a reduction operation that
- // does a partial reduction.
- //
- // 4. ExtractElement that extracts the first element from the vector, and we
- // stop searching the def-use chain here.
- //
- // 3 & 4 above perform a reduction on all elements of the vector. We push defs
- // from 1-3 to the stack to continue the DFS. The given instruction is not
- // a reduction operation if we meet any other instructions other than those
- // listed above.
-
- SmallVector<const User *, 16> UsersToVisit{Inst};
- SmallPtrSet<const User *, 16> Visited;
- bool ReduxExtracted = false;
-
- while (!UsersToVisit.empty()) {
- auto User = UsersToVisit.back();
- UsersToVisit.pop_back();
- if (!Visited.insert(User).second)
- continue;
-
- for (const auto *U : User->users()) {
- auto Inst = dyn_cast<Instruction>(U);
- if (!Inst)
- return false;
-
- if (Inst->getOpcode() == OpCode || isa<PHINode>(U)) {
- if (const FPMathOperator *FPOp = dyn_cast<const FPMathOperator>(Inst))
- if (!isa<PHINode>(FPOp) && !FPOp->getFastMathFlags().isFast())
- return false;
- UsersToVisit.push_back(U);
- } else if (const ShuffleVectorInst *ShufInst =
- dyn_cast<ShuffleVectorInst>(U)) {
- // Detect the following pattern: A ShuffleVector instruction together
- // with a reduction that do partial reduction on the first and second
- // ElemNumToReduce / 2 elements, and store the result in
- // ElemNumToReduce / 2 elements in another vector.
-
- unsigned ResultElements = ShufInst->getType()->getVectorNumElements();
- if (ResultElements < ElemNum)
- return false;
-
- if (ElemNumToReduce == 1)
- return false;
- if (!isa<UndefValue>(U->getOperand(1)))
- return false;
- for (unsigned i = 0; i < ElemNumToReduce / 2; ++i)
- if (ShufInst->getMaskValue(i) != int(i + ElemNumToReduce / 2))
- return false;
- for (unsigned i = ElemNumToReduce / 2; i < ElemNum; ++i)
- if (ShufInst->getMaskValue(i) != -1)
- return false;
-
- // There is only one user of this ShuffleVector instruction, which
- // must be a reduction operation.
- if (!U->hasOneUse())
- return false;
-
- auto U2 = dyn_cast<Instruction>(*U->user_begin());
- if (!U2 || U2->getOpcode() != OpCode)
- return false;
-
- // Check operands of the reduction operation.
- if ((U2->getOperand(0) == U->getOperand(0) && U2->getOperand(1) == U) ||
- (U2->getOperand(1) == U->getOperand(0) && U2->getOperand(0) == U)) {
- UsersToVisit.push_back(U2);
- ElemNumToReduce /= 2;
- } else
- return false;
- } else if (isa<ExtractElementInst>(U)) {
- // At this moment we should have reduced all elements in the vector.
- if (ElemNumToReduce != 1)
- return false;
-
- const ConstantInt *Val = dyn_cast<ConstantInt>(U->getOperand(1));
- if (!Val || !Val->isZero())
- return false;
-
- ReduxExtracted = true;
- } else
- return false;
- }
- }
- return ReduxExtracted;
-}
-
void SelectionDAGBuilder::visitBinary(const User &I, unsigned Opcode) {
SDNodeFlags Flags;
if (auto *ExactOp = dyn_cast<PossiblyExactOperator>(&I)) {
Flags.setExact(ExactOp->isExact());
}
- if (isVectorReductionOp(&I)) {
- Flags.setVectorReduction(true);
- LLVM_DEBUG(dbgs() << "Detected a reduction operation:" << I << "\n");
-
- // If no flags are set we will propagate the incoming flags, if any flags
- // are set, we will intersect them with the incoming flag and so we need to
- // copy the FMF flags here.
- if (auto *FPOp = dyn_cast<FPMathOperator>(&I)) {
- Flags.copyFMF(*FPOp);
- }
- }
SDValue Op1 = getValue(I.getOperand(0));
SDValue Op2 = getValue(I.getOperand(1));
if (getFlags().hasAllowReassociation())
OS << " reassoc";
- if (getFlags().hasVectorReduction())
- OS << " vector-reduction";
-
if (getFlags().hasNoFPExcept())
OS << " nofpexcept";
X86MacroFusion.cpp
X86OptimizeLEAs.cpp
X86PadShortFunction.cpp
+ X86PartialReduction.cpp
X86RegisterBankInfo.cpp
X86RegisterInfo.cpp
X86RetpolineThunks.cpp
/// fp exceptions when strict-fp enabled.
FunctionPass *createX86InsertX87waitPass();
+/// This pass optimizes arithmetic whose result is only used by a reduction
+/// sequence, and which is therefore safe to reassociate in interesting ways.
+FunctionPass *createX86PartialReductionPass();
+
InstructionSelector *createX86InstructionSelector(const X86TargetMachine &TM,
X86Subtarget &,
X86RegisterBankInfo &);
void initializeX86ExpandPseudoPass(PassRegistry &);
void initializeX86FlagsCopyLoweringPassPass(PassRegistry &);
void initializeX86OptimizeLEAPassPass(PassRegistry &);
+void initializeX86PartialReductionPass(PassRegistry &);
void initializeX86SpeculativeLoadHardeningPassPass(PassRegistry &);
namespace X86AS {
DAG.getConstant(0, DL, VT), Cmp1.getValue(1));
}
-static SDValue combineLoopMAddPattern(SDNode *N, SelectionDAG &DAG,
- const X86Subtarget &Subtarget) {
- if (!Subtarget.hasSSE2())
- return SDValue();
-
- EVT VT = N->getValueType(0);
-
- // If the vector size is less than 128, or greater than the supported RegSize,
- // do not use PMADD.
- if (!VT.isVector() || VT.getVectorNumElements() < 8)
- return SDValue();
-
- SDValue Op0 = N->getOperand(0);
- SDValue Op1 = N->getOperand(1);
-
- auto UsePMADDWD = [&](SDValue Op) {
- ShrinkMode Mode;
- return Op.getOpcode() == ISD::MUL &&
- canReduceVMulWidth(Op.getNode(), DAG, Mode) &&
- Mode != ShrinkMode::MULU16 &&
- (!Subtarget.hasSSE41() ||
- (Op->isOnlyUserOf(Op.getOperand(0).getNode()) &&
- Op->isOnlyUserOf(Op.getOperand(1).getNode())));
- };
-
- SDValue MulOp, OtherOp;
- if (UsePMADDWD(Op0)) {
- MulOp = Op0;
- OtherOp = Op1;
- } else if (UsePMADDWD(Op1)) {
- MulOp = Op1;
- OtherOp = Op0;
- } else
- return SDValue();
-
- SDLoc DL(N);
- EVT ReducedVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16,
- VT.getVectorNumElements());
- EVT MAddVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
- VT.getVectorNumElements() / 2);
-
- // Shrink the operands of mul.
- SDValue N0 = DAG.getNode(ISD::TRUNCATE, DL, ReducedVT, MulOp->getOperand(0));
- SDValue N1 = DAG.getNode(ISD::TRUNCATE, DL, ReducedVT, MulOp->getOperand(1));
-
- // Madd vector size is half of the original vector size
- auto PMADDWDBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
- ArrayRef<SDValue> Ops) {
- MVT OpVT = MVT::getVectorVT(MVT::i32, Ops[0].getValueSizeInBits() / 32);
- return DAG.getNode(X86ISD::VPMADDWD, DL, OpVT, Ops);
- };
- SDValue Madd = SplitOpsAndApply(DAG, Subtarget, DL, MAddVT, { N0, N1 },
- PMADDWDBuilder);
- // Fill the rest of the output with 0
- SDValue Zero = DAG.getConstant(0, DL, Madd.getSimpleValueType());
- SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Madd, Zero);
-
- // Preserve the reduction flag on the ADD. We may need to revisit for the
- // other operand.
- SDNodeFlags Flags;
- Flags.setVectorReduction(true);
- return DAG.getNode(ISD::ADD, DL, VT, Concat, OtherOp, Flags);
-}
-
-static SDValue combineLoopSADPattern(SDNode *N, SelectionDAG &DAG,
- const X86Subtarget &Subtarget) {
- if (!Subtarget.hasSSE2())
- return SDValue();
-
- SDLoc DL(N);
- EVT VT = N->getValueType(0);
-
- // TODO: There's nothing special about i32, any integer type above i16 should
- // work just as well.
- if (!VT.isVector() || !isPowerOf2_32(VT.getVectorNumElements()) ||
- VT.getVectorElementType() != MVT::i32)
- return SDValue();
-
- // We know N is a reduction add. To match SAD, we need one of the operands to
- // be an ABS.
- SDValue AbsOp = N->getOperand(0);
- SDValue OtherOp = N->getOperand(1);
- if (AbsOp.getOpcode() != ISD::ABS)
- std::swap(AbsOp, OtherOp);
- if (AbsOp.getOpcode() != ISD::ABS)
- return SDValue();
-
- // Check whether we have an abs-diff pattern feeding into the select.
- SDValue SadOp0, SadOp1;
- if(!detectZextAbsDiff(AbsOp, SadOp0, SadOp1))
- return SDValue();
-
- // SAD pattern detected. Now build a SAD instruction and an addition for
- // reduction. Note that the number of elements of the result of SAD is less
- // than the number of elements of its input. Therefore, we could only update
- // part of elements in the reduction vector.
- SDValue Sad = createPSADBW(DAG, SadOp0, SadOp1, DL, Subtarget);
-
- // The output of PSADBW is a vector of i64.
- // We need to turn the vector of i64 into a vector of i32.
- // If the reduction vector is at least as wide as the psadbw result, just
- // bitcast. If it's narrower which can only occur for v2i32, bits 127:16 of
- // the PSADBW will be zero.
- MVT ResVT = MVT::getVectorVT(MVT::i32, Sad.getValueSizeInBits() / 32);
- Sad = DAG.getNode(ISD::BITCAST, DL, ResVT, Sad);
-
- if (VT.getSizeInBits() > ResVT.getSizeInBits()) {
- // Fill the upper elements with zero to match the add width.
- assert(VT.getSizeInBits() % ResVT.getSizeInBits() == 0 && "Unexpected VTs");
- unsigned NumConcats = VT.getSizeInBits() / ResVT.getSizeInBits();
- SmallVector<SDValue, 4> Ops(NumConcats, DAG.getConstant(0, DL, ResVT));
- Ops[0] = Sad;
- Sad = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Ops);
- } else if (VT.getSizeInBits() < ResVT.getSizeInBits()) {
- Sad = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Sad,
- DAG.getIntPtrConstant(0, DL));
- }
-
- // Preserve the reduction flag on the ADD. We may need to revisit for the
- // other operand.
- SDNodeFlags Flags;
- Flags.setVectorReduction(true);
- return DAG.getNode(ISD::ADD, DL, VT, Sad, OtherOp, Flags);
-}
-
static SDValue matchPMADDWD(SelectionDAG &DAG, SDValue Op0, SDValue Op1,
const SDLoc &DL, EVT VT,
const X86Subtarget &Subtarget) {
Mode == ShrinkMode::MULU16)
return SDValue();
+ EVT TruncVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16,
+ VT.getVectorNumElements() * 2);
+ SDValue N0 = DAG.getNode(ISD::TRUNCATE, DL, TruncVT, Mul.getOperand(0));
+ SDValue N1 = DAG.getNode(ISD::TRUNCATE, DL, TruncVT, Mul.getOperand(1));
+
auto PMADDBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
ArrayRef<SDValue> Ops) {
- // Shrink by adding truncate nodes and let DAGCombine fold with the
- // sources.
EVT InVT = Ops[0].getValueType();
- assert(InVT.getScalarType() == MVT::i32 &&
- "Unexpected scalar element type");
assert(InVT == Ops[1].getValueType() && "Operands' types mismatch");
EVT ResVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
InVT.getVectorNumElements() / 2);
- EVT TruncVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16,
- InVT.getVectorNumElements());
- return DAG.getNode(X86ISD::VPMADDWD, DL, ResVT,
- DAG.getNode(ISD::TRUNCATE, DL, TruncVT, Ops[0]),
- DAG.getNode(ISD::TRUNCATE, DL, TruncVT, Ops[1]));
+ return DAG.getNode(X86ISD::VPMADDWD, DL, ResVT, Ops[0], Ops[1]);
};
- return SplitOpsAndApply(DAG, Subtarget, DL, VT,
- { Mul.getOperand(0), Mul.getOperand(1) },
- PMADDBuilder);
+ return SplitOpsAndApply(DAG, Subtarget, DL, VT, { N0, N1 }, PMADDBuilder);
}
// Attempt to turn this pattern into PMADDWD.
-// (mul (add (sext (build_vector)), (sext (build_vector))),
-// (add (sext (build_vector)), (sext (build_vector)))
+// (add (mul (sext (build_vector)), (sext (build_vector))),
+// (mul (sext (build_vector)), (sext (build_vector)))
static SDValue matchPMADDWD_2(SelectionDAG &DAG, SDValue N0, SDValue N1,
const SDLoc &DL, EVT VT,
const X86Subtarget &Subtarget) {
static SDValue combineAdd(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
const X86Subtarget &Subtarget) {
- const SDNodeFlags Flags = N->getFlags();
- if (Flags.hasVectorReduction()) {
- if (SDValue Sad = combineLoopSADPattern(N, DAG, Subtarget))
- return Sad;
- if (SDValue MAdd = combineLoopMAddPattern(N, DAG, Subtarget))
- return MAdd;
- }
EVT VT = N->getValueType(0);
SDValue Op0 = N->getOperand(0);
SDValue Op1 = N->getOperand(1);
--- /dev/null
+//===-- X86PartialReduction.cpp -------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass looks for add instructions used by a horizontal reduction to see
+// if we might be able to use pmaddwd or psadbw. Some cases of this require
+// cross basic block knowledge and can't be done in SelectionDAG.
+//
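+// A minimal sketch of the kind of IR this pass rewrites (names and types
+// are illustrative):
+//
+//   %mul = mul <8 x i32> %a, %b          ; operands sign extended from i16
+//   %sum = add <8 x i32> %mul, %vec.phi  ; only used by a reduction tree
+//   ...
+//   %res = extractelement <4 x i32> %rdx, i32 0
+//
+// Since only the final scalar matters, the lanes of %mul may be
+// reassociated, e.g. split into even/odd lane sums that SelectionDAG can
+// match to pmaddwd.
+//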
+//===----------------------------------------------------------------------===//
+
+#include "X86.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/CodeGen/TargetPassConfig.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicsX86.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/Operator.h"
+#include "llvm/Pass.h"
+#include "X86TargetMachine.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "x86-partial-reduction"
+
+namespace {
+
+class X86PartialReduction : public FunctionPass {
+ const DataLayout *DL;
+ const X86Subtarget *ST;
+
+public:
+ static char ID; // Pass identification, replacement for typeid.
+
+ X86PartialReduction() : FunctionPass(ID) { }
+
+ bool runOnFunction(Function &Fn) override;
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesCFG();
+ }
+
+ StringRef getPassName() const override {
+ return "X86 Partial Reduction";
+ }
+
+private:
+ bool tryMAddPattern(BinaryOperator *BO);
+ bool tryMAddReplacement(Value *Op, BinaryOperator *Add);
+
+ bool trySADPattern(BinaryOperator *BO);
+ bool trySADReplacement(Value *Op, BinaryOperator *Add);
+};
+} // end anonymous namespace
+
+FunctionPass *llvm::createX86PartialReductionPass() {
+ return new X86PartialReduction();
+}
+
+char X86PartialReduction::ID = 0;
+
+INITIALIZE_PASS(X86PartialReduction, DEBUG_TYPE,
+ "X86 Partial Reduction", false, false)
+
+static bool isVectorReductionOp(const BinaryOperator &BO) {
+ if (!BO.getType()->isVectorTy())
+ return false;
+
+ unsigned Opcode = BO.getOpcode();
+
+ switch (Opcode) {
+ case Instruction::Add:
+ case Instruction::Mul:
+ case Instruction::And:
+ case Instruction::Or:
+ case Instruction::Xor:
+ break;
+ case Instruction::FAdd:
+ case Instruction::FMul:
+ if (auto *FPOp = dyn_cast<FPMathOperator>(&BO))
+ if (FPOp->getFastMathFlags().isFast())
+ break;
+ LLVM_FALLTHROUGH;
+ default:
+ return false;
+ }
+
+ unsigned ElemNum = BO.getType()->getVectorNumElements();
+ // Ensure the reduction size is a power of 2.
+ if (!isPowerOf2_32(ElemNum))
+ return false;
+
+ unsigned ElemNumToReduce = ElemNum;
+
+ // Do DFS search on the def-use chain from the given instruction. We only
+ // allow four kinds of operations during the search until we reach the
+ // instruction that extracts the first element from the vector:
+ //
+ // 1. The reduction operation of the same opcode as the given instruction.
+ //
+ // 2. PHI node.
+ //
+ // 3. ShuffleVector instruction together with a reduction operation that
+ // does a partial reduction.
+ //
+ // 4. ExtractElement that extracts the first element from the vector, and we
+ // stop searching the def-use chain here.
+ //
+ // 3 & 4 above perform a reduction on all elements of the vector. We push defs
+ // from 1-3 to the stack to continue the DFS. The given instruction is not
+ // a reduction operation if we meet any other instructions other than those
+ // listed above.
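+  //
+  // For example (a hypothetical <4 x i32> case), the def-use chain
+  //   BO -> shufflevector <2,3,undef,undef> / add   (reduces 4 -> 2 lanes)
+  //      -> shufflevector <1,undef,undef,undef> / add (reduces 2 -> 1 lane)
+  //      -> extractelement i32 0
+  // matches rules 3 and 4 above and is accepted as a reduction.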
+
+ SmallVector<const User *, 16> UsersToVisit{&BO};
+ SmallPtrSet<const User *, 16> Visited;
+ bool ReduxExtracted = false;
+
+ while (!UsersToVisit.empty()) {
+ auto User = UsersToVisit.back();
+ UsersToVisit.pop_back();
+ if (!Visited.insert(User).second)
+ continue;
+
+ for (const auto *U : User->users()) {
+ auto *Inst = dyn_cast<Instruction>(U);
+ if (!Inst)
+ return false;
+
+ if (Inst->getOpcode() == Opcode || isa<PHINode>(U)) {
+ if (auto *FPOp = dyn_cast<FPMathOperator>(Inst))
+ if (!isa<PHINode>(FPOp) && !FPOp->getFastMathFlags().isFast())
+ return false;
+ UsersToVisit.push_back(U);
+ } else if (auto *ShufInst = dyn_cast<ShuffleVectorInst>(U)) {
+        // Detect the following pattern: a ShuffleVector instruction together
+        // with a reduction that does a partial reduction on the first and
+        // second ElemNumToReduce / 2 elements, and stores the result in
+        // ElemNumToReduce / 2 elements of another vector.
+
+ unsigned ResultElements = ShufInst->getType()->getVectorNumElements();
+ if (ResultElements < ElemNum)
+ return false;
+
+ if (ElemNumToReduce == 1)
+ return false;
+ if (!isa<UndefValue>(U->getOperand(1)))
+ return false;
+ for (unsigned i = 0; i < ElemNumToReduce / 2; ++i)
+ if (ShufInst->getMaskValue(i) != int(i + ElemNumToReduce / 2))
+ return false;
+ for (unsigned i = ElemNumToReduce / 2; i < ElemNum; ++i)
+ if (ShufInst->getMaskValue(i) != -1)
+ return false;
+
+ // There is only one user of this ShuffleVector instruction, which
+ // must be a reduction operation.
+ if (!U->hasOneUse())
+ return false;
+
+ auto *U2 = dyn_cast<BinaryOperator>(*U->user_begin());
+ if (!U2 || U2->getOpcode() != Opcode)
+ return false;
+
+ // Check operands of the reduction operation.
+ if ((U2->getOperand(0) == U->getOperand(0) && U2->getOperand(1) == U) ||
+ (U2->getOperand(1) == U->getOperand(0) && U2->getOperand(0) == U)) {
+ UsersToVisit.push_back(U2);
+ ElemNumToReduce /= 2;
+ } else
+ return false;
+ } else if (isa<ExtractElementInst>(U)) {
+ // At this moment we should have reduced all elements in the vector.
+ if (ElemNumToReduce != 1)
+ return false;
+
+ auto *Val = dyn_cast<ConstantInt>(U->getOperand(1));
+ if (!Val || !Val->isZero())
+ return false;
+
+ ReduxExtracted = true;
+ } else
+ return false;
+ }
+ }
+ return ReduxExtracted;
+}
+
+bool X86PartialReduction::tryMAddReplacement(Value *Op, BinaryOperator *Add) {
+ BasicBlock *BB = Add->getParent();
+
+ auto *BO = dyn_cast<BinaryOperator>(Op);
+ if (!BO || BO->getOpcode() != Instruction::Mul || !BO->hasOneUse() ||
+ BO->getParent() != BB)
+ return false;
+
+ Value *LHS = BO->getOperand(0);
+ Value *RHS = BO->getOperand(1);
+
+  // LHS and RHS should only be used once, or twice if they are the same
+  // value. Only check this when SSE4.1 is enabled and we have zext/sext
+  // instructions, otherwise we use punpck to emulate zero extend in stages.
+  // The trunc we need to do likely won't introduce new instructions in that
+  // case.
+ if (ST->hasSSE41()) {
+ if (LHS == RHS) {
+ if (!isa<Constant>(LHS) && !LHS->hasNUses(2))
+ return false;
+ } else {
+ if (!isa<Constant>(LHS) && !LHS->hasOneUse())
+ return false;
+ if (!isa<Constant>(RHS) && !RHS->hasOneUse())
+ return false;
+ }
+ }
+
+ auto canShrinkOp = [&](Value *Op) {
+ if (isa<Constant>(Op) && ComputeNumSignBits(Op, *DL, 0, nullptr, BO) > 16)
+ return true;
+ if (auto *Cast = dyn_cast<CastInst>(Op)) {
+ if (Cast->getParent() == BB &&
+ (Cast->getOpcode() == Instruction::SExt ||
+ Cast->getOpcode() == Instruction::ZExt) &&
+ ComputeNumSignBits(Op, *DL, 0, nullptr, BO) > 16)
+ return true;
+ }
+
+ return false;
+ };
+
+ // Both Ops need to be shrinkable.
+ if (!canShrinkOp(LHS) && !canShrinkOp(RHS))
+ return false;
+
+ IRBuilder<> Builder(Add);
+
+ Type *MulTy = Op->getType();
+ unsigned NumElts = MulTy->getVectorNumElements();
+
+ // Extract even elements and odd elements and add them together. This will
+ // be pattern matched by SelectionDAG to pmaddwd. This instruction will be
+ // half the original width.
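+  // E.g. with NumElts == 8, EvenMask = <0,2,4,6> and OddMask = <1,3,5,7>;
+  // adding the two shuffles sums adjacent lane pairs, which is the pmaddwd
+  // shape once the inputs have been narrowed to i16.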
+ SmallVector<uint32_t, 16> EvenMask(NumElts / 2);
+ SmallVector<uint32_t, 16> OddMask(NumElts / 2);
+ for (int i = 0, e = NumElts / 2; i != e; ++i) {
+ EvenMask[i] = i * 2;
+ OddMask[i] = i * 2 + 1;
+ }
+ Value *EvenElts = Builder.CreateShuffleVector(BO, BO, EvenMask);
+ Value *OddElts = Builder.CreateShuffleVector(BO, BO, OddMask);
+ Value *MAdd = Builder.CreateAdd(EvenElts, OddElts);
+
+ // Concatenate zeroes to extend back to the original type.
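+  // E.g. with NumElts == 8, MAdd is <4 x i32>, so ConcatMask = <0,..,7>
+  // selects MAdd's four lanes followed by four zero lanes.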
+ SmallVector<uint32_t, 32> ConcatMask(NumElts);
+ std::iota(ConcatMask.begin(), ConcatMask.end(), 0);
+ Value *Zero = Constant::getNullValue(MAdd->getType());
+ Value *Concat = Builder.CreateShuffleVector(MAdd, Zero, ConcatMask);
+
+  // Replace the use of the mul in the original Add with the pmaddwd and zeroes.
+ Add->replaceUsesOfWith(BO, Concat);
+ Add->setHasNoSignedWrap(false);
+ Add->setHasNoUnsignedWrap(false);
+
+ return true;
+}
+
+// Try to replace the operands of this add with pmaddwd patterns.
+bool X86PartialReduction::tryMAddPattern(BinaryOperator *BO) {
+ if (!ST->hasSSE2())
+ return false;
+
+ // Need at least 8 elements.
+ if (BO->getType()->getVectorNumElements() < 8)
+ return false;
+
+ // Element type should be i32.
+ if (!BO->getType()->getVectorElementType()->isIntegerTy(32))
+ return false;
+
+ bool Changed = false;
+ Changed |= tryMAddReplacement(BO->getOperand(0), BO);
+ Changed |= tryMAddReplacement(BO->getOperand(1), BO);
+ return Changed;
+}
+
+bool X86PartialReduction::trySADReplacement(Value *Op, BinaryOperator *Add) {
+ // Operand should be a select.
+ auto *SI = dyn_cast<SelectInst>(Op);
+ if (!SI)
+ return false;
+
+ // Select needs to implement absolute value.
+ Value *LHS, *RHS;
+ auto SPR = matchSelectPattern(SI, LHS, RHS);
+ if (SPR.Flavor != SPF_ABS)
+ return false;
+
+ // Need a subtract of two values.
+ auto *Sub = dyn_cast<BinaryOperator>(LHS);
+ if (!Sub || Sub->getOpcode() != Instruction::Sub)
+ return false;
+
+ // Look for zero extend from i8.
+ auto getZeroExtendedVal = [](Value *Op) -> Value * {
+ if (auto *ZExt = dyn_cast<ZExtInst>(Op))
+ if (ZExt->getOperand(0)->getType()->getVectorElementType()->isIntegerTy(8))
+ return ZExt->getOperand(0);
+
+ return nullptr;
+ };
+
+ // Both operands of the subtract should be extends from vXi8.
+ Value *Op0 = getZeroExtendedVal(Sub->getOperand(0));
+ Value *Op1 = getZeroExtendedVal(Sub->getOperand(1));
+ if (!Op0 || !Op1)
+ return false;
+
+ IRBuilder<> Builder(Add);
+
+ Type *OpTy = Op->getType();
+ unsigned NumElts = OpTy->getVectorNumElements();
+
+ unsigned IntrinsicNumElts;
+ Intrinsic::ID IID;
+ if (ST->hasBWI() && NumElts >= 64) {
+ IID = Intrinsic::x86_avx512_psad_bw_512;
+ IntrinsicNumElts = 64;
+ } else if (ST->hasAVX2() && NumElts >= 32) {
+ IID = Intrinsic::x86_avx2_psad_bw;
+ IntrinsicNumElts = 32;
+ } else {
+ IID = Intrinsic::x86_sse2_psad_bw;
+ IntrinsicNumElts = 16;
+ }
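+  // E.g. a v32i8 input selects the 256-bit psadbw on AVX2, while a v64i8
+  // input without AVX512BW is split into two 256-bit calls below.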
+
+ Function *PSADBWFn = Intrinsic::getDeclaration(Add->getModule(), IID);
+
+ if (NumElts < 16) {
+ // Pad input with zeroes.
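+    // E.g. with NumElts == 4 the mask is <0,1,2,3,4,5,6,7,4,5,6,7,4,5,6,7>,
+    // i.e. the real lanes followed by (repeated) zero lanes from the second
+    // shuffle operand.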
+ SmallVector<uint32_t, 32> ConcatMask(16);
+ for (unsigned i = 0; i != NumElts; ++i)
+ ConcatMask[i] = i;
+ for (unsigned i = NumElts; i != 16; ++i)
+ ConcatMask[i] = (i % NumElts) + NumElts;
+
+ Value *Zero = Constant::getNullValue(Op0->getType());
+ Op0 = Builder.CreateShuffleVector(Op0, Zero, ConcatMask);
+ Op1 = Builder.CreateShuffleVector(Op1, Zero, ConcatMask);
+ NumElts = 16;
+ }
+
+  // The intrinsics produce vXi64 and need to be cast to vXi32.
+ Type *I32Ty = VectorType::get(Builder.getInt32Ty(), IntrinsicNumElts / 4);
+
+ assert(NumElts % IntrinsicNumElts == 0 && "Unexpected number of elements!");
+ unsigned NumSplits = NumElts / IntrinsicNumElts;
+
+ // First collect the pieces we need.
+ SmallVector<Value *, 4> Ops(NumSplits);
+ for (unsigned i = 0; i != NumSplits; ++i) {
+ SmallVector<uint32_t, 64> ExtractMask(IntrinsicNumElts);
+ std::iota(ExtractMask.begin(), ExtractMask.end(), i * IntrinsicNumElts);
+ Value *ExtractOp0 = Builder.CreateShuffleVector(Op0, Op0, ExtractMask);
+    Value *ExtractOp1 = Builder.CreateShuffleVector(Op1, Op1, ExtractMask);
+ Ops[i] = Builder.CreateCall(PSADBWFn, {ExtractOp0, ExtractOp1});
+ Ops[i] = Builder.CreateBitCast(Ops[i], I32Ty);
+ }
+
+ assert(isPowerOf2_32(NumSplits) && "Expected power of 2 splits");
+ unsigned Stages = Log2_32(NumSplits);
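+  // Pairwise concatenate the per-split results back into one wide vector,
+  // e.g. with NumSplits == 4: stage 1 joins pieces (0,1) and (2,3), stage 2
+  // joins the two halves.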
+ for (unsigned s = Stages; s > 0; --s) {
+ unsigned NumConcatElts = Ops[0]->getType()->getVectorNumElements() * 2;
+ for (unsigned i = 0; i != 1 << (s - 1); ++i) {
+ SmallVector<uint32_t, 64> ConcatMask(NumConcatElts);
+ std::iota(ConcatMask.begin(), ConcatMask.end(), 0);
+ Ops[i] = Builder.CreateShuffleVector(Ops[i*2], Ops[i*2+1], ConcatMask);
+ }
+ }
+
+  // At this point the final value should be in Ops[0]. Now we need to adjust
+  // it back to the original type.
+ NumElts = OpTy->getVectorNumElements();
+ if (NumElts == 2) {
+ // Extract down to 2 elements.
+ Ops[0] = Builder.CreateShuffleVector(Ops[0], Ops[0], {0, 1});
+ } else if (NumElts >= 8) {
+ SmallVector<uint32_t, 32> ConcatMask(NumElts);
+ unsigned SubElts = Ops[0]->getType()->getVectorNumElements();
+ for (unsigned i = 0; i != SubElts; ++i)
+ ConcatMask[i] = i;
+ for (unsigned i = SubElts; i != NumElts; ++i)
+ ConcatMask[i] = (i % SubElts) + SubElts;
+
+ Value *Zero = Constant::getNullValue(Ops[0]->getType());
+ Ops[0] = Builder.CreateShuffleVector(Ops[0], Zero, ConcatMask);
+ }
+
+  // Replace the uses of Op in Add with the new sequence.
+ Add->replaceUsesOfWith(Op, Ops[0]);
+ Add->setHasNoSignedWrap(false);
+ Add->setHasNoUnsignedWrap(false);
+
+  return true;
+}
+
+bool X86PartialReduction::trySADPattern(BinaryOperator *BO) {
+ if (!ST->hasSSE2())
+ return false;
+
+ // TODO: There's nothing special about i32, any integer type above i16 should
+ // work just as well.
+ if (!BO->getType()->getVectorElementType()->isIntegerTy(32))
+ return false;
+
+ bool Changed = false;
+ Changed |= trySADReplacement(BO->getOperand(0), BO);
+ Changed |= trySADReplacement(BO->getOperand(1), BO);
+ return Changed;
+}
+
+bool X86PartialReduction::runOnFunction(Function &F) {
+ if (skipFunction(F))
+ return false;
+
+ auto *TPC = getAnalysisIfAvailable<TargetPassConfig>();
+ if (!TPC)
+ return false;
+
+ auto &TM = TPC->getTM<X86TargetMachine>();
+ ST = TM.getSubtargetImpl(F);
+
+ DL = &F.getParent()->getDataLayout();
+
+ bool MadeChange = false;
+ for (auto &BB : F) {
+ for (auto &I : BB) {
+ auto *BO = dyn_cast<BinaryOperator>(&I);
+ if (!BO)
+ continue;
+
+ if (!isVectorReductionOp(*BO))
+ continue;
+
+ if (BO->getOpcode() == Instruction::Add) {
+ if (tryMAddPattern(BO)) {
+ MadeChange = true;
+ continue;
+ }
+ if (trySADPattern(BO)) {
+ MadeChange = true;
+ continue;
+ }
+ }
+ }
+ }
+
+ return MadeChange;
+}
initializeX86FlagsCopyLoweringPassPass(PR);
initializeX86CondBrFoldingPassPass(PR);
initializeX86OptimizeLEAPassPass(PR);
+ initializeX86PartialReductionPass(PR);
}
static std::unique_ptr<TargetLoweringObjectFile> createTLOF(const Triple &TT) {
TargetPassConfig::addIRPasses();
- if (TM->getOptLevel() != CodeGenOpt::None)
+ if (TM->getOptLevel() != CodeGenOpt::None) {
addPass(createInterleavedAccessPass());
+ addPass(createX86PartialReductionPass());
+ }
// Add passes that handle indirect branch removal and insertion of a retpoline
// thunk. These will be a no-op unless a function subtarget has the retpoline
+++ /dev/null
-; RUN: llc < %s -debug-only=isel -o /dev/null 2>&1 | FileCheck %s
-; REQUIRES: asserts
-
-@a = global [1024 x i32] zeroinitializer, align 16
-
-define i32 @reduce_add() {
-; CHECK-LABEL: reduce_add
-; CHECK: Detected a reduction operation: {{.*}} add
-; CHECK: Detected a reduction operation: {{.*}} add
-; CHECK: Detected a reduction operation: {{.*}} add
-; CHECK: Detected a reduction operation: {{.*}} add
-; CHECK: Detected a reduction operation: {{.*}} add
-; CHECK: Detected a reduction operation: {{.*}} add
-; CHECK: Detected a reduction operation: {{.*}} add
-; CHECK: Detected a reduction operation: {{.*}} add
-; CHECK: Detected a reduction operation: {{.*}} add
-; CHECK: Detected a reduction operation: {{.*}} add
-; CHECK: Detected a reduction operation: {{.*}} add
-
-min.iters.checked:
- br label %vector.body
-
-vector.body:
- %index = phi i64 [ 0, %min.iters.checked ], [ %index.next.4, %vector.body ]
- %vec.phi = phi <4 x i32> [ zeroinitializer, %min.iters.checked ], [ %28, %vector.body ]
- %vec.phi4 = phi <4 x i32> [ zeroinitializer, %min.iters.checked ], [ %29, %vector.body ]
- %0 = getelementptr inbounds [1024 x i32], [1024 x i32]* @a, i64 0, i64 %index
- %1 = bitcast i32* %0 to <4 x i32>*
- %wide.load = load <4 x i32>, <4 x i32>* %1, align 16
- %2 = getelementptr i32, i32* %0, i64 4
- %3 = bitcast i32* %2 to <4 x i32>*
- %wide.load5 = load <4 x i32>, <4 x i32>* %3, align 16
- %4 = add nsw <4 x i32> %wide.load, %vec.phi
- %5 = add nsw <4 x i32> %wide.load5, %vec.phi4
- %index.next = add nuw nsw i64 %index, 8
- %6 = getelementptr inbounds [1024 x i32], [1024 x i32]* @a, i64 0, i64 %index.next
- %7 = bitcast i32* %6 to <4 x i32>*
- %wide.load.1 = load <4 x i32>, <4 x i32>* %7, align 16
- %8 = getelementptr i32, i32* %6, i64 4
- %9 = bitcast i32* %8 to <4 x i32>*
- %wide.load5.1 = load <4 x i32>, <4 x i32>* %9, align 16
- %10 = add nsw <4 x i32> %wide.load.1, %4
- %11 = add nsw <4 x i32> %wide.load5.1, %5
- %index.next.1 = add nsw i64 %index, 16
- %12 = getelementptr inbounds [1024 x i32], [1024 x i32]* @a, i64 0, i64 %index.next.1
- %13 = bitcast i32* %12 to <4 x i32>*
- %wide.load.2 = load <4 x i32>, <4 x i32>* %13, align 16
- %14 = getelementptr i32, i32* %12, i64 4
- %15 = bitcast i32* %14 to <4 x i32>*
- %wide.load5.2 = load <4 x i32>, <4 x i32>* %15, align 16
- %16 = add nsw <4 x i32> %wide.load.2, %10
- %17 = add nsw <4 x i32> %wide.load5.2, %11
- %index.next.2 = add nsw i64 %index, 24
- %18 = getelementptr inbounds [1024 x i32], [1024 x i32]* @a, i64 0, i64 %index.next.2
- %19 = bitcast i32* %18 to <4 x i32>*
- %wide.load.3 = load <4 x i32>, <4 x i32>* %19, align 16
- %20 = getelementptr i32, i32* %18, i64 4
- %21 = bitcast i32* %20 to <4 x i32>*
- %wide.load5.3 = load <4 x i32>, <4 x i32>* %21, align 16
- %22 = add nsw <4 x i32> %wide.load.3, %16
- %23 = add nsw <4 x i32> %wide.load5.3, %17
- %index.next.3 = add nsw i64 %index, 32
- %24 = getelementptr inbounds [1024 x i32], [1024 x i32]* @a, i64 0, i64 %index.next.3
- %25 = bitcast i32* %24 to <4 x i32>*
- %wide.load.4 = load <4 x i32>, <4 x i32>* %25, align 16
- %26 = getelementptr i32, i32* %24, i64 4
- %27 = bitcast i32* %26 to <4 x i32>*
- %wide.load5.4 = load <4 x i32>, <4 x i32>* %27, align 16
- %28 = add nsw <4 x i32> %wide.load.4, %22
- %29 = add nsw <4 x i32> %wide.load5.4, %23
- %index.next.4 = add nsw i64 %index, 40
- %30 = icmp eq i64 %index.next.4, 1000
- br i1 %30, label %middle.block, label %vector.body
-
-middle.block:
- %.lcssa10 = phi <4 x i32> [ %29, %vector.body ]
- %.lcssa = phi <4 x i32> [ %28, %vector.body ]
- %bin.rdx = add <4 x i32> %.lcssa10, %.lcssa
- %rdx.shuf = shufflevector <4 x i32> %bin.rdx, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
- %bin.rdx6 = add <4 x i32> %bin.rdx, %rdx.shuf
- %rdx.shuf7 = shufflevector <4 x i32> %bin.rdx6, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
- %bin.rdx8 = add <4 x i32> %bin.rdx6, %rdx.shuf7
- %31 = extractelement <4 x i32> %bin.rdx8, i32 0
- ret i32 %31
-}
-
-define i32 @reduce_and() {
-; CHECK-LABEL: reduce_and
-; CHECK: Detected a reduction operation: {{.*}} and
-; CHECK: Detected a reduction operation: {{.*}} and
-; CHECK: Detected a reduction operation: {{.*}} and
-; CHECK: Detected a reduction operation: {{.*}} and
-; CHECK: Detected a reduction operation: {{.*}} and
-; CHECK: Detected a reduction operation: {{.*}} and
-; CHECK: Detected a reduction operation: {{.*}} and
-; CHECK: Detected a reduction operation: {{.*}} and
-; CHECK: Detected a reduction operation: {{.*}} and
-
-entry:
- br label %vector.body
-
-vector.body:
- %lsr.iv = phi i64 [ %lsr.iv.next, %vector.body ], [ -4096, %entry ]
- %vec.phi = phi <4 x i32> [ <i32 -1, i32 -1, i32 -1, i32 -1>, %entry ], [ %6, %vector.body ]
- %vec.phi9 = phi <4 x i32> [ <i32 -1, i32 -1, i32 -1, i32 -1>, %entry ], [ %7, %vector.body ]
- %uglygep33 = getelementptr i8, i8* bitcast ([1024 x i32]* @a to i8*), i64 %lsr.iv
- %uglygep3334 = bitcast i8* %uglygep33 to <4 x i32>*
- %scevgep35 = getelementptr <4 x i32>, <4 x i32>* %uglygep3334, i64 256
- %wide.load = load <4 x i32>, <4 x i32>* %scevgep35, align 16
- %scevgep36 = getelementptr <4 x i32>, <4 x i32>* %uglygep3334, i64 257
- %wide.load10 = load <4 x i32>, <4 x i32>* %scevgep36, align 16
- %0 = and <4 x i32> %wide.load, %vec.phi
- %1 = and <4 x i32> %wide.load10, %vec.phi9
- %uglygep30 = getelementptr i8, i8* bitcast ([1024 x i32]* @a to i8*), i64 %lsr.iv
- %uglygep3031 = bitcast i8* %uglygep30 to <4 x i32>*
- %scevgep32 = getelementptr <4 x i32>, <4 x i32>* %uglygep3031, i64 258
- %wide.load.1 = load <4 x i32>, <4 x i32>* %scevgep32, align 16
- %uglygep27 = getelementptr i8, i8* bitcast ([1024 x i32]* @a to i8*), i64 %lsr.iv
- %uglygep2728 = bitcast i8* %uglygep27 to <4 x i32>*
- %scevgep29 = getelementptr <4 x i32>, <4 x i32>* %uglygep2728, i64 259
- %wide.load10.1 = load <4 x i32>, <4 x i32>* %scevgep29, align 16
- %2 = and <4 x i32> %wide.load.1, %0
- %3 = and <4 x i32> %wide.load10.1, %1
- %uglygep24 = getelementptr i8, i8* bitcast ([1024 x i32]* @a to i8*), i64 %lsr.iv
- %uglygep2425 = bitcast i8* %uglygep24 to <4 x i32>*
- %scevgep26 = getelementptr <4 x i32>, <4 x i32>* %uglygep2425, i64 260
- %wide.load.2 = load <4 x i32>, <4 x i32>* %scevgep26, align 16
- %uglygep21 = getelementptr i8, i8* bitcast ([1024 x i32]* @a to i8*), i64 %lsr.iv
- %uglygep2122 = bitcast i8* %uglygep21 to <4 x i32>*
- %scevgep23 = getelementptr <4 x i32>, <4 x i32>* %uglygep2122, i64 261
- %wide.load10.2 = load <4 x i32>, <4 x i32>* %scevgep23, align 16
- %4 = and <4 x i32> %wide.load.2, %2
- %5 = and <4 x i32> %wide.load10.2, %3
- %uglygep18 = getelementptr i8, i8* bitcast ([1024 x i32]* @a to i8*), i64 %lsr.iv
- %uglygep1819 = bitcast i8* %uglygep18 to <4 x i32>*
- %scevgep20 = getelementptr <4 x i32>, <4 x i32>* %uglygep1819, i64 262
- %wide.load.3 = load <4 x i32>, <4 x i32>* %scevgep20, align 16
- %uglygep = getelementptr i8, i8* bitcast ([1024 x i32]* @a to i8*), i64 %lsr.iv
- %uglygep17 = bitcast i8* %uglygep to <4 x i32>*
- %scevgep = getelementptr <4 x i32>, <4 x i32>* %uglygep17, i64 263
- %wide.load10.3 = load <4 x i32>, <4 x i32>* %scevgep, align 16
- %6 = and <4 x i32> %wide.load.3, %4
- %7 = and <4 x i32> %wide.load10.3, %5
- %lsr.iv.next = add nsw i64 %lsr.iv, 128
- %8 = icmp eq i64 %lsr.iv.next, 0
- br i1 %8, label %middle.block, label %vector.body
-
-middle.block:
- %bin.rdx = and <4 x i32> %7, %6
- %rdx.shuf = shufflevector <4 x i32> %bin.rdx, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
- %bin.rdx11 = and <4 x i32> %bin.rdx, %rdx.shuf
- %rdx.shuf12 = shufflevector <4 x i32> %bin.rdx11, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
- %bin.rdx13 = and <4 x i32> %bin.rdx11, %rdx.shuf12
- %9 = extractelement <4 x i32> %bin.rdx13, i32 0
- ret i32 %9
-}
-
-define float @reduce_add_float(float* nocapture readonly %a) {
-; CHECK-LABEL: reduce_add_float
-; CHECK: Detected a reduction operation: {{.*}} fadd fast
-; CHECK: Detected a reduction operation: {{.*}} fadd fast
-; CHECK: Detected a reduction operation: {{.*}} fadd fast
-; CHECK: Detected a reduction operation: {{.*}} fadd fast
-; CHECK: Detected a reduction operation: {{.*}} fadd fast
-; CHECK: Detected a reduction operation: {{.*}} fadd fast
-; CHECK: Detected a reduction operation: {{.*}} fadd fast
-; CHECK: Detected a reduction operation: {{.*}} fadd fast
-; CHECK: Detected a reduction operation: {{.*}} fadd fast
-; CHECK: Detected a reduction operation: {{.*}} fadd fast
-; CHECK: Detected a reduction operation: {{.*}} fadd fast
-;
-entry:
- br label %vector.body
-
-vector.body:
- %index = phi i64 [ 0, %entry ], [ %index.next.4, %vector.body ]
- %vec.phi = phi <4 x float> [ zeroinitializer, %entry ], [ %28, %vector.body ]
- %vec.phi9 = phi <4 x float> [ zeroinitializer, %entry ], [ %29, %vector.body ]
- %0 = getelementptr inbounds float, float* %a, i64 %index
- %1 = bitcast float* %0 to <4 x float>*
- %wide.load = load <4 x float>, <4 x float>* %1, align 4
- %2 = getelementptr float, float* %0, i64 4
- %3 = bitcast float* %2 to <4 x float>*
- %wide.load10 = load <4 x float>, <4 x float>* %3, align 4
- %4 = fadd fast <4 x float> %wide.load, %vec.phi
- %5 = fadd fast <4 x float> %wide.load10, %vec.phi9
- %index.next = add nuw nsw i64 %index, 8
- %6 = getelementptr inbounds float, float* %a, i64 %index.next
- %7 = bitcast float* %6 to <4 x float>*
- %wide.load.1 = load <4 x float>, <4 x float>* %7, align 4
- %8 = getelementptr float, float* %6, i64 4
- %9 = bitcast float* %8 to <4 x float>*
- %wide.load10.1 = load <4 x float>, <4 x float>* %9, align 4
- %10 = fadd fast <4 x float> %wide.load.1, %4
- %11 = fadd fast <4 x float> %wide.load10.1, %5
- %index.next.1 = add nsw i64 %index, 16
- %12 = getelementptr inbounds float, float* %a, i64 %index.next.1
- %13 = bitcast float* %12 to <4 x float>*
- %wide.load.2 = load <4 x float>, <4 x float>* %13, align 4
- %14 = getelementptr float, float* %12, i64 4
- %15 = bitcast float* %14 to <4 x float>*
- %wide.load10.2 = load <4 x float>, <4 x float>* %15, align 4
- %16 = fadd fast <4 x float> %wide.load.2, %10
- %17 = fadd fast <4 x float> %wide.load10.2, %11
- %index.next.2 = add nsw i64 %index, 24
- %18 = getelementptr inbounds float, float* %a, i64 %index.next.2
- %19 = bitcast float* %18 to <4 x float>*
- %wide.load.3 = load <4 x float>, <4 x float>* %19, align 4
- %20 = getelementptr float, float* %18, i64 4
- %21 = bitcast float* %20 to <4 x float>*
- %wide.load10.3 = load <4 x float>, <4 x float>* %21, align 4
- %22 = fadd fast <4 x float> %wide.load.3, %16
- %23 = fadd fast <4 x float> %wide.load10.3, %17
- %index.next.3 = add nsw i64 %index, 32
- %24 = getelementptr inbounds float, float* %a, i64 %index.next.3
- %25 = bitcast float* %24 to <4 x float>*
- %wide.load.4 = load <4 x float>, <4 x float>* %25, align 4
- %26 = getelementptr float, float* %24, i64 4
- %27 = bitcast float* %26 to <4 x float>*
- %wide.load10.4 = load <4 x float>, <4 x float>* %27, align 4
- %28 = fadd fast <4 x float> %wide.load.4, %22
- %29 = fadd fast <4 x float> %wide.load10.4, %23
- %index.next.4 = add nsw i64 %index, 40
- %30 = icmp eq i64 %index.next.4, 1000
- br i1 %30, label %middle.block, label %vector.body
-
-middle.block:
- %.lcssa15 = phi <4 x float> [ %29, %vector.body ]
- %.lcssa = phi <4 x float> [ %28, %vector.body ]
- %bin.rdx = fadd fast <4 x float> %.lcssa15, %.lcssa
- %rdx.shuf = shufflevector <4 x float> %bin.rdx, <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
- %bin.rdx11 = fadd fast <4 x float> %bin.rdx, %rdx.shuf
- %rdx.shuf12 = shufflevector <4 x float> %bin.rdx11, <4 x float> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
- %bin.rdx13 = fadd fast <4 x float> %bin.rdx11, %rdx.shuf12
- %31 = extractelement <4 x float> %bin.rdx13, i32 0
- ret float %31
-}
; CHECK-NEXT: Expand reduction intrinsics
; CHECK-NEXT: Dominator Tree Construction
; CHECK-NEXT: Interleaved Access Pass
+; CHECK-NEXT: X86 Partial Reduction
; CHECK-NEXT: Expand indirectbr instructions
; CHECK-NEXT: Dominator Tree Construction
; CHECK-NEXT: Natural Loop Information
; AVX1-NEXT: # =>This Inner Loop Header: Depth=1
; AVX1-NEXT: vmovdqu (%rsi,%rcx,2), %xmm2
; AVX1-NEXT: vmovdqu 16(%rsi,%rcx,2), %xmm3
+; AVX1-NEXT: vpmaddwd (%rdi,%rcx,2), %xmm2, %xmm2
; AVX1-NEXT: vpmaddwd 16(%rdi,%rcx,2), %xmm3, %xmm3
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4
; AVX1-NEXT: vpaddd %xmm4, %xmm3, %xmm3
-; AVX1-NEXT: vpmaddwd (%rdi,%rcx,2), %xmm2, %xmm2
; AVX1-NEXT: vpaddd %xmm1, %xmm2, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1
; AVX1-NEXT: addq $16, %rcx
; AVX1-NEXT: vmovdqu 16(%rsi,%rcx,2), %xmm4
; AVX1-NEXT: vmovdqu 32(%rsi,%rcx,2), %xmm5
; AVX1-NEXT: vmovdqu 48(%rsi,%rcx,2), %xmm6
+; AVX1-NEXT: vpmaddwd (%rdi,%rcx,2), %xmm3, %xmm3
+; AVX1-NEXT: vpmaddwd 16(%rdi,%rcx,2), %xmm4, %xmm4
+; AVX1-NEXT: vpmaddwd 32(%rdi,%rcx,2), %xmm5, %xmm5
; AVX1-NEXT: vpmaddwd 48(%rdi,%rcx,2), %xmm6, %xmm6
; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm7
; AVX1-NEXT: vpaddd %xmm7, %xmm6, %xmm6
-; AVX1-NEXT: vpmaddwd 32(%rdi,%rcx,2), %xmm5, %xmm5
; AVX1-NEXT: vpaddd %xmm2, %xmm5, %xmm2
; AVX1-NEXT: vinsertf128 $1, %xmm6, %ymm2, %ymm2
-; AVX1-NEXT: vpmaddwd 16(%rdi,%rcx,2), %xmm4, %xmm4
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm5
; AVX1-NEXT: vpaddd %xmm5, %xmm4, %xmm4
-; AVX1-NEXT: vpmaddwd (%rdi,%rcx,2), %xmm3, %xmm3
; AVX1-NEXT: vpaddd %xmm1, %xmm3, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm1
; AVX1-NEXT: addq $16, %rcx
; AVX2-NEXT: # =>This Inner Loop Header: Depth=1
; AVX2-NEXT: vmovdqu (%rsi,%rcx,2), %ymm3
; AVX2-NEXT: vmovdqu 32(%rsi,%rcx,2), %ymm4
-; AVX2-NEXT: vpmaddwd 32(%rdi,%rcx,2), %ymm4, %ymm4
-; AVX2-NEXT: vpaddd %ymm2, %ymm4, %ymm2
; AVX2-NEXT: vpmaddwd (%rdi,%rcx,2), %ymm3, %ymm3
; AVX2-NEXT: vpaddd %ymm1, %ymm3, %ymm1
+; AVX2-NEXT: vpmaddwd 32(%rdi,%rcx,2), %ymm4, %ymm3
+; AVX2-NEXT: vpaddd %ymm2, %ymm3, %ymm2
; AVX2-NEXT: addq $16, %rcx
; AVX2-NEXT: cmpq %rcx, %rax
; AVX2-NEXT: jne .LBB3_1
; SSE2-NEXT: # =>This Inner Loop Header: Depth=1
; SSE2-NEXT: movdqu (%rdi,%rcx), %xmm3
; SSE2-NEXT: movdqu (%rsi,%rcx), %xmm4
-; SSE2-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm3[8],xmm5[9],xmm3[9],xmm5[10],xmm3[10],xmm5[11],xmm3[11],xmm5[12],xmm3[12],xmm5[13],xmm3[13],xmm5[14],xmm3[14],xmm5[15],xmm3[15]
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3],xmm5[4],xmm3[4],xmm5[5],xmm3[5],xmm5[6],xmm3[6],xmm5[7],xmm3[7]
; SSE2-NEXT: psraw $8, %xmm5
-; SSE2-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm4[8],xmm6[9],xmm4[9],xmm6[10],xmm4[10],xmm6[11],xmm4[11],xmm6[12],xmm4[12],xmm6[13],xmm4[13],xmm6[14],xmm4[14],xmm6[15],xmm4[15]
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm4[0],xmm6[1],xmm4[1],xmm6[2],xmm4[2],xmm6[3],xmm4[3],xmm6[4],xmm4[4],xmm6[5],xmm4[5],xmm6[6],xmm4[6],xmm6[7],xmm4[7]
; SSE2-NEXT: psraw $8, %xmm6
; SSE2-NEXT: pmaddwd %xmm5, %xmm6
-; SSE2-NEXT: paddd %xmm6, %xmm1
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE2-NEXT: paddd %xmm6, %xmm2
+; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; SSE2-NEXT: psraw $8, %xmm3
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; SSE2-NEXT: psraw $8, %xmm4
; SSE2-NEXT: pmaddwd %xmm3, %xmm4
-; SSE2-NEXT: paddd %xmm4, %xmm2
+; SSE2-NEXT: paddd %xmm4, %xmm1
; SSE2-NEXT: addq $16, %rcx
; SSE2-NEXT: cmpq %rcx, %rax
; SSE2-NEXT: jne .LBB6_1
; AVX1-NEXT: .p2align 4, 0x90
; AVX1-NEXT: .LBB6_1: # %vector.body
; AVX1-NEXT: # =>This Inner Loop Header: Depth=1
-; AVX1-NEXT: vpmovsxbw (%rdi,%rcx), %xmm2
-; AVX1-NEXT: vpmovsxbw 8(%rdi,%rcx), %xmm3
-; AVX1-NEXT: vpmovsxbw (%rsi,%rcx), %xmm4
-; AVX1-NEXT: vpmaddwd %xmm2, %xmm4, %xmm2
+; AVX1-NEXT: vpmovsxbw 8(%rdi,%rcx), %xmm2
+; AVX1-NEXT: vpmovsxbw (%rdi,%rcx), %xmm3
; AVX1-NEXT: vpmovsxbw 8(%rsi,%rcx), %xmm4
+; AVX1-NEXT: vpmaddwd %xmm2, %xmm4, %xmm2
+; AVX1-NEXT: vpmovsxbw (%rsi,%rcx), %xmm4
; AVX1-NEXT: vpmaddwd %xmm3, %xmm4, %xmm3
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4
-; AVX1-NEXT: vpaddd %xmm4, %xmm3, %xmm3
-; AVX1-NEXT: vpaddd %xmm1, %xmm2, %xmm1
-; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1
+; AVX1-NEXT: vpaddd %xmm4, %xmm2, %xmm2
+; AVX1-NEXT: vpaddd %xmm1, %xmm3, %xmm1
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
; AVX1-NEXT: addq $16, %rcx
; AVX1-NEXT: cmpq %rcx, %rax
; AVX1-NEXT: jne .LBB6_1
; SSE2-NEXT: .p2align 4, 0x90
; SSE2-NEXT: .LBB7_1: # %vector.body
; SSE2-NEXT: # =>This Inner Loop Header: Depth=1
-; SSE2-NEXT: movdqu (%rdi,%rcx), %xmm10
-; SSE2-NEXT: movdqu 16(%rdi,%rcx), %xmm7
-; SSE2-NEXT: movdqu (%rsi,%rcx), %xmm9
-; SSE2-NEXT: movdqu 16(%rsi,%rcx), %xmm0
-; SSE2-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm7[8],xmm5[9],xmm7[9],xmm5[10],xmm7[10],xmm5[11],xmm7[11],xmm5[12],xmm7[12],xmm5[13],xmm7[13],xmm5[14],xmm7[14],xmm5[15],xmm7[15]
+; SSE2-NEXT: movdqu (%rdi,%rcx), %xmm7
+; SSE2-NEXT: movdqu 16(%rdi,%rcx), %xmm10
+; SSE2-NEXT: movdqu (%rsi,%rcx), %xmm0
+; SSE2-NEXT: movdqu 16(%rsi,%rcx), %xmm9
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm7[0],xmm5[1],xmm7[1],xmm5[2],xmm7[2],xmm5[3],xmm7[3],xmm5[4],xmm7[4],xmm5[5],xmm7[5],xmm5[6],xmm7[6],xmm5[7],xmm7[7]
; SSE2-NEXT: psraw $8, %xmm5
-; SSE2-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm0[8],xmm6[9],xmm0[9],xmm6[10],xmm0[10],xmm6[11],xmm0[11],xmm6[12],xmm0[12],xmm6[13],xmm0[13],xmm6[14],xmm0[14],xmm6[15],xmm0[15]
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1],xmm6[2],xmm0[2],xmm6[3],xmm0[3],xmm6[4],xmm0[4],xmm6[5],xmm0[5],xmm6[6],xmm0[6],xmm6[7],xmm0[7]
; SSE2-NEXT: psraw $8, %xmm6
; SSE2-NEXT: pmaddwd %xmm5, %xmm6
-; SSE2-NEXT: paddd %xmm6, %xmm3
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm7[0],xmm5[1],xmm7[1],xmm5[2],xmm7[2],xmm5[3],xmm7[3],xmm5[4],xmm7[4],xmm5[5],xmm7[5],xmm5[6],xmm7[6],xmm5[7],xmm7[7]
+; SSE2-NEXT: paddd %xmm6, %xmm2
+; SSE2-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm7[8],xmm5[9],xmm7[9],xmm5[10],xmm7[10],xmm5[11],xmm7[11],xmm5[12],xmm7[12],xmm5[13],xmm7[13],xmm5[14],xmm7[14],xmm5[15],xmm7[15]
; SSE2-NEXT: psraw $8, %xmm5
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE2-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; SSE2-NEXT: psraw $8, %xmm0
; SSE2-NEXT: pmaddwd %xmm5, %xmm0
-; SSE2-NEXT: paddd %xmm0, %xmm1
-; SSE2-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm10[8],xmm0[9],xmm10[9],xmm0[10],xmm10[10],xmm0[11],xmm10[11],xmm0[12],xmm10[12],xmm0[13],xmm10[13],xmm0[14],xmm10[14],xmm0[15],xmm10[15]
+; SSE2-NEXT: paddd %xmm0, %xmm4
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm10[0],xmm0[1],xmm10[1],xmm0[2],xmm10[2],xmm0[3],xmm10[3],xmm0[4],xmm10[4],xmm0[5],xmm10[5],xmm0[6],xmm10[6],xmm0[7],xmm10[7]
; SSE2-NEXT: psraw $8, %xmm0
-; SSE2-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm9[8],xmm5[9],xmm9[9],xmm5[10],xmm9[10],xmm5[11],xmm9[11],xmm5[12],xmm9[12],xmm5[13],xmm9[13],xmm5[14],xmm9[14],xmm5[15],xmm9[15]
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm9[0],xmm5[1],xmm9[1],xmm5[2],xmm9[2],xmm5[3],xmm9[3],xmm5[4],xmm9[4],xmm5[5],xmm9[5],xmm5[6],xmm9[6],xmm5[7],xmm9[7]
; SSE2-NEXT: psraw $8, %xmm5
; SSE2-NEXT: pmaddwd %xmm0, %xmm5
-; SSE2-NEXT: paddd %xmm5, %xmm4
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm10[0],xmm0[1],xmm10[1],xmm0[2],xmm10[2],xmm0[3],xmm10[3],xmm0[4],xmm10[4],xmm0[5],xmm10[5],xmm0[6],xmm10[6],xmm0[7],xmm10[7]
+; SSE2-NEXT: paddd %xmm5, %xmm1
+; SSE2-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm10[8],xmm0[9],xmm10[9],xmm0[10],xmm10[10],xmm0[11],xmm10[11],xmm0[12],xmm10[12],xmm0[13],xmm10[13],xmm0[14],xmm10[14],xmm0[15],xmm10[15]
; SSE2-NEXT: psraw $8, %xmm0
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm9[0],xmm5[1],xmm9[1],xmm5[2],xmm9[2],xmm5[3],xmm9[3],xmm5[4],xmm9[4],xmm5[5],xmm9[5],xmm5[6],xmm9[6],xmm5[7],xmm9[7]
+; SSE2-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm9[8],xmm5[9],xmm9[9],xmm5[10],xmm9[10],xmm5[11],xmm9[11],xmm5[12],xmm9[12],xmm5[13],xmm9[13],xmm5[14],xmm9[14],xmm5[15],xmm9[15]
; SSE2-NEXT: psraw $8, %xmm5
; SSE2-NEXT: pmaddwd %xmm0, %xmm5
-; SSE2-NEXT: paddd %xmm5, %xmm2
+; SSE2-NEXT: paddd %xmm5, %xmm3
; SSE2-NEXT: addq $32, %rcx
; SSE2-NEXT: cmpq %rcx, %rax
; SSE2-NEXT: jne .LBB7_1
; AVX1-NEXT: .p2align 4, 0x90
; AVX1-NEXT: .LBB7_1: # %vector.body
; AVX1-NEXT: # =>This Inner Loop Header: Depth=1
-; AVX1-NEXT: vpmovsxbw (%rdi,%rcx), %xmm3
-; AVX1-NEXT: vpmovsxbw 8(%rdi,%rcx), %xmm4
-; AVX1-NEXT: vpmovsxbw 16(%rdi,%rcx), %xmm5
-; AVX1-NEXT: vpmovsxbw 24(%rdi,%rcx), %xmm6
-; AVX1-NEXT: vpmovsxbw (%rsi,%rcx), %xmm7
+; AVX1-NEXT: vpmovsxbw 24(%rdi,%rcx), %xmm3
+; AVX1-NEXT: vpmovsxbw 16(%rdi,%rcx), %xmm4
+; AVX1-NEXT: vpmovsxbw 8(%rdi,%rcx), %xmm5
+; AVX1-NEXT: vpmovsxbw (%rdi,%rcx), %xmm6
+; AVX1-NEXT: vpmovsxbw 24(%rsi,%rcx), %xmm7
; AVX1-NEXT: vpmaddwd %xmm3, %xmm7, %xmm3
-; AVX1-NEXT: vpmovsxbw 8(%rsi,%rcx), %xmm7
-; AVX1-NEXT: vpmaddwd %xmm4, %xmm7, %xmm4
; AVX1-NEXT: vpmovsxbw 16(%rsi,%rcx), %xmm7
+; AVX1-NEXT: vpmaddwd %xmm4, %xmm7, %xmm4
+; AVX1-NEXT: vpmovsxbw 8(%rsi,%rcx), %xmm7
; AVX1-NEXT: vpmaddwd %xmm5, %xmm7, %xmm5
-; AVX1-NEXT: vpmovsxbw 24(%rsi,%rcx), %xmm7
+; AVX1-NEXT: vpmovsxbw (%rsi,%rcx), %xmm7
; AVX1-NEXT: vpmaddwd %xmm6, %xmm7, %xmm6
; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm7
-; AVX1-NEXT: vpaddd %xmm7, %xmm6, %xmm6
-; AVX1-NEXT: vpaddd %xmm2, %xmm5, %xmm2
-; AVX1-NEXT: vinsertf128 $1, %xmm6, %ymm2, %ymm2
-; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm5
-; AVX1-NEXT: vpaddd %xmm5, %xmm4, %xmm4
-; AVX1-NEXT: vpaddd %xmm1, %xmm3, %xmm1
-; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm1
+; AVX1-NEXT: vpaddd %xmm7, %xmm3, %xmm3
+; AVX1-NEXT: vpaddd %xmm2, %xmm4, %xmm2
+; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
+; AVX1-NEXT: vpaddd %xmm3, %xmm5, %xmm3
+; AVX1-NEXT: vpaddd %xmm1, %xmm6, %xmm1
+; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1
; AVX1-NEXT: addq $32, %rcx
; AVX1-NEXT: cmpq %rcx, %rax
; AVX1-NEXT: jne .LBB7_1
; AVX2-NEXT: .p2align 4, 0x90
; AVX2-NEXT: .LBB7_1: # %vector.body
; AVX2-NEXT: # =>This Inner Loop Header: Depth=1
-; AVX2-NEXT: vpmovsxbw (%rdi,%rcx), %ymm3
-; AVX2-NEXT: vpmovsxbw 16(%rdi,%rcx), %ymm4
-; AVX2-NEXT: vpmovsxbw (%rsi,%rcx), %ymm5
+; AVX2-NEXT: vpmovsxbw 16(%rdi,%rcx), %ymm3
+; AVX2-NEXT: vpmovsxbw (%rdi,%rcx), %ymm4
+; AVX2-NEXT: vpmovsxbw 16(%rsi,%rcx), %ymm5
; AVX2-NEXT: vpmaddwd %ymm3, %ymm5, %ymm3
-; AVX2-NEXT: vpaddd %ymm1, %ymm3, %ymm1
-; AVX2-NEXT: vpmovsxbw 16(%rsi,%rcx), %ymm3
-; AVX2-NEXT: vpmaddwd %ymm4, %ymm3, %ymm3
; AVX2-NEXT: vpaddd %ymm2, %ymm3, %ymm2
+; AVX2-NEXT: vpmovsxbw (%rsi,%rcx), %ymm3
+; AVX2-NEXT: vpmaddwd %ymm4, %ymm3, %ymm3
+; AVX2-NEXT: vpaddd %ymm1, %ymm3, %ymm1
; AVX2-NEXT: addq $32, %rcx
; AVX2-NEXT: cmpq %rcx, %rax
; AVX2-NEXT: jne .LBB7_1
;
; AVX1-LABEL: pmaddwd_16:
; AVX1: # %bb.0:
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
-; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
-; AVX1-NEXT: vpmaddwd %xmm3, %xmm2, %xmm2
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
+; AVX1-NEXT: vpmaddwd %xmm2, %xmm3, %xmm2
; AVX1-NEXT: vpmaddwd %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX1-LABEL: pmaddwd_32:
; AVX1: # %bb.0:
-; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4
+; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm4
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5
-; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm6
-; AVX1-NEXT: vpmaddwd %xmm6, %xmm4, %xmm4
-; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm6
-; AVX1-NEXT: vpmaddwd %xmm6, %xmm5, %xmm5
+; AVX1-NEXT: vpmaddwd %xmm4, %xmm5, %xmm4
; AVX1-NEXT: vpmaddwd %xmm2, %xmm0, %xmm0
-; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm0
+; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0
+; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm2
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4
+; AVX1-NEXT: vpmaddwd %xmm2, %xmm4, %xmm2
; AVX1-NEXT: vpmaddwd %xmm3, %xmm1, %xmm1
-; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm1
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
; AVX1-NEXT: retq
;
; AVX2-LABEL: pmaddwd_32:
;
; AVX512F-LABEL: pmaddwd_32:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm2
-; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm3
-; AVX512F-NEXT: vpmaddwd %ymm3, %ymm2, %ymm2
+; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm2
+; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm3
+; AVX512F-NEXT: vpmaddwd %ymm2, %ymm3, %ymm2
; AVX512F-NEXT: vpmaddwd %ymm1, %ymm0, %ymm0
; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
; AVX512F-NEXT: retq
;
; AVX1-LABEL: jumbled_indices8:
; AVX1: # %bb.0:
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
-; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
-; AVX1-NEXT: vpmaddwd %xmm3, %xmm2, %xmm2
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
+; AVX1-NEXT: vpmaddwd %xmm2, %xmm3, %xmm2
; AVX1-NEXT: vpmaddwd %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX1-LABEL: jumbled_indices16:
; AVX1: # %bb.0:
-; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4
+; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm4
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5
-; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm6
-; AVX1-NEXT: vpmaddwd %xmm6, %xmm4, %xmm4
-; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm6
-; AVX1-NEXT: vpmaddwd %xmm6, %xmm5, %xmm5
+; AVX1-NEXT: vpmaddwd %xmm4, %xmm5, %xmm4
; AVX1-NEXT: vpmaddwd %xmm2, %xmm0, %xmm0
-; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm0
+; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0
+; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm2
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4
+; AVX1-NEXT: vpmaddwd %xmm2, %xmm4, %xmm2
; AVX1-NEXT: vpmaddwd %xmm3, %xmm1, %xmm1
-; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm1
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
; AVX1-NEXT: retq
;
; AVX2-LABEL: jumbled_indices16:
;
; AVX512F-LABEL: jumbled_indices16:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm2
-; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm3
-; AVX512F-NEXT: vpmaddwd %ymm3, %ymm2, %ymm2
+; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm2
+; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm3
+; AVX512F-NEXT: vpmaddwd %ymm2, %ymm3, %ymm2
; AVX512F-NEXT: vpmaddwd %ymm1, %ymm0, %ymm0
; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
; AVX512F-NEXT: retq
;
; AVX1-LABEL: jumbled_indices32:
; AVX1: # %bb.0:
-; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm8
-; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm9
-; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm10
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm11
-; AVX1-NEXT: vextractf128 $1, %ymm7, %xmm12
-; AVX1-NEXT: vpmaddwd %xmm12, %xmm8, %xmm8
-; AVX1-NEXT: vextractf128 $1, %ymm6, %xmm12
-; AVX1-NEXT: vpmaddwd %xmm12, %xmm9, %xmm9
-; AVX1-NEXT: vextractf128 $1, %ymm5, %xmm12
-; AVX1-NEXT: vpmaddwd %xmm12, %xmm10, %xmm10
-; AVX1-NEXT: vextractf128 $1, %ymm4, %xmm12
-; AVX1-NEXT: vpmaddwd %xmm12, %xmm11, %xmm11
+; AVX1-NEXT: vextractf128 $1, %ymm4, %xmm8
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm9
+; AVX1-NEXT: vpmaddwd %xmm8, %xmm9, %xmm8
; AVX1-NEXT: vpmaddwd %xmm4, %xmm0, %xmm0
-; AVX1-NEXT: vinsertf128 $1, %xmm11, %ymm0, %ymm0
+; AVX1-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm0
+; AVX1-NEXT: vextractf128 $1, %ymm5, %xmm8
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4
+; AVX1-NEXT: vpmaddwd %xmm4, %xmm8, %xmm4
; AVX1-NEXT: vpmaddwd %xmm5, %xmm1, %xmm1
-; AVX1-NEXT: vinsertf128 $1, %xmm10, %ymm1, %ymm1
+; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm1
+; AVX1-NEXT: vextractf128 $1, %ymm6, %xmm4
+; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm5
+; AVX1-NEXT: vpmaddwd %xmm4, %xmm5, %xmm4
; AVX1-NEXT: vpmaddwd %xmm6, %xmm2, %xmm2
-; AVX1-NEXT: vinsertf128 $1, %xmm9, %ymm2, %ymm2
+; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2
+; AVX1-NEXT: vextractf128 $1, %ymm7, %xmm4
+; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm5
+; AVX1-NEXT: vpmaddwd %xmm4, %xmm5, %xmm4
; AVX1-NEXT: vpmaddwd %xmm7, %xmm3, %xmm3
-; AVX1-NEXT: vinsertf128 $1, %xmm8, %ymm3, %ymm3
+; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3
; AVX1-NEXT: retq
;
; AVX2-LABEL: jumbled_indices32:
; AVX-NEXT: vmovdqu (%rdx), %xmm1
; AVX-NEXT: vpmaddwd (%rcx), %xmm1, %xmm1
; AVX-NEXT: vpmaddwd (%rsi), %xmm0, %xmm0
-; AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vpaddd %xmm0, %xmm1, %xmm0
; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; SSE2-NEXT: movdqu (%r8), %xmm0
; SSE2-NEXT: movdqu (%r9), %xmm3
; SSE2-NEXT: pmaddwd %xmm0, %xmm3
-; SSE2-NEXT: paddd %xmm2, %xmm3
+; SSE2-NEXT: paddd %xmm1, %xmm3
; SSE2-NEXT: movdqu (%rax), %xmm0
-; SSE2-NEXT: movdqu (%r10), %xmm2
-; SSE2-NEXT: pmaddwd %xmm0, %xmm2
-; SSE2-NEXT: paddd %xmm3, %xmm2
-; SSE2-NEXT: paddd %xmm1, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
-; SSE2-NEXT: paddd %xmm2, %xmm0
+; SSE2-NEXT: movdqu (%r10), %xmm1
+; SSE2-NEXT: pmaddwd %xmm0, %xmm1
+; SSE2-NEXT: paddd %xmm3, %xmm1
+; SSE2-NEXT: paddd %xmm2, %xmm1
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; SSE2-NEXT: paddd %xmm1, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; SSE2-NEXT: paddd %xmm0, %xmm1
; SSE2-NEXT: movd %xmm1, %eax
; AVX-NEXT: vpmaddwd (%rsi), %xmm0, %xmm0
; AVX-NEXT: vmovdqu (%r8), %xmm2
; AVX-NEXT: vpmaddwd (%r9), %xmm2, %xmm2
-; AVX-NEXT: vpaddd %xmm2, %xmm1, %xmm1
+; AVX-NEXT: vpaddd %xmm2, %xmm0, %xmm0
; AVX-NEXT: vmovdqu (%rax), %xmm2
; AVX-NEXT: vpmaddwd (%r10), %xmm2, %xmm2
-; AVX-NEXT: vpaddd %xmm2, %xmm1, %xmm1
-; AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vpaddd %xmm2, %xmm0, %xmm0
+; AVX-NEXT: vpaddd %xmm0, %xmm1, %xmm0
; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; CHECK-NEXT: .p2align 4, 0x90
; CHECK-NEXT: .LBB8_1: # %vector.body
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: vpmovsxbw (%rdi,%rcx), %ymm3
-; CHECK-NEXT: vpmovsxbw 16(%rdi,%rcx), %ymm4
-; CHECK-NEXT: vpmovsxbw (%rsi,%rcx), %ymm5
+; CHECK-NEXT: vpmovsxbw 16(%rdi,%rcx), %ymm3
+; CHECK-NEXT: vpmovsxbw (%rdi,%rcx), %ymm4
+; CHECK-NEXT: vpmovsxbw 16(%rsi,%rcx), %ymm5
; CHECK-NEXT: vpmaddwd %ymm3, %ymm5, %ymm3
-; CHECK-NEXT: vpaddd %ymm1, %ymm3, %ymm1
-; CHECK-NEXT: vpmovsxbw 16(%rsi,%rcx), %ymm3
-; CHECK-NEXT: vpmaddwd %ymm4, %ymm3, %ymm3
; CHECK-NEXT: vpaddd %ymm2, %ymm3, %ymm2
+; CHECK-NEXT: vpmovsxbw (%rsi,%rcx), %ymm3
+; CHECK-NEXT: vpmaddwd %ymm4, %ymm3, %ymm3
+; CHECK-NEXT: vpaddd %ymm1, %ymm3, %ymm1
; CHECK-NEXT: addq $32, %rcx
; CHECK-NEXT: cmpq %rcx, %rax
; CHECK-NEXT: jne .LBB8_1
; SSE2-NEXT: .p2align 4, 0x90
; SSE2-NEXT: .LBB1_1: # %vector.body
; SSE2-NEXT: # =>This Inner Loop Header: Depth=1
-; SSE2-NEXT: movdqa a+1040(%rax), %xmm3
-; SSE2-NEXT: psadbw b+1040(%rax), %xmm3
-; SSE2-NEXT: paddd %xmm3, %xmm1
; SSE2-NEXT: movdqa a+1024(%rax), %xmm3
; SSE2-NEXT: psadbw b+1024(%rax), %xmm3
; SSE2-NEXT: paddd %xmm3, %xmm2
+; SSE2-NEXT: movdqa a+1040(%rax), %xmm3
+; SSE2-NEXT: psadbw b+1040(%rax), %xmm3
+; SSE2-NEXT: paddd %xmm3, %xmm1
; SSE2-NEXT: addq $4, %rax
; SSE2-NEXT: jne .LBB1_1
; SSE2-NEXT: # %bb.2: # %middle.block
; AVX1-NEXT: .p2align 4, 0x90
; AVX1-NEXT: .LBB1_1: # %vector.body
; AVX1-NEXT: # =>This Inner Loop Header: Depth=1
-; AVX1-NEXT: vmovdqa a+1040(%rax), %xmm2
-; AVX1-NEXT: vpsadbw b+1040(%rax), %xmm2, %xmm2
-; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
-; AVX1-NEXT: vpaddd %xmm3, %xmm2, %xmm2
-; AVX1-NEXT: vmovdqa a+1024(%rax), %xmm3
-; AVX1-NEXT: vpsadbw b+1024(%rax), %xmm3, %xmm3
-; AVX1-NEXT: vpaddd %xmm1, %xmm3, %xmm1
-; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
+; AVX1-NEXT: vmovdqa a+1024(%rax), %xmm2
+; AVX1-NEXT: vpsadbw b+1024(%rax), %xmm2, %xmm2
+; AVX1-NEXT: vmovdqa a+1040(%rax), %xmm3
+; AVX1-NEXT: vpsadbw b+1040(%rax), %xmm3, %xmm3
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4
+; AVX1-NEXT: vpaddd %xmm4, %xmm3, %xmm3
+; AVX1-NEXT: vpaddd %xmm1, %xmm2, %xmm1
+; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1
; AVX1-NEXT: addq $4, %rax
; AVX1-NEXT: jne .LBB1_1
; AVX1-NEXT: # %bb.2: # %middle.block
; SSE2-NEXT: .p2align 4, 0x90
; SSE2-NEXT: .LBB2_1: # %vector.body
; SSE2-NEXT: # =>This Inner Loop Header: Depth=1
-; SSE2-NEXT: movdqa a+1056(%rax), %xmm5
-; SSE2-NEXT: psadbw b+1056(%rax), %xmm5
-; SSE2-NEXT: paddd %xmm5, %xmm2
-; SSE2-NEXT: movdqa a+1040(%rax), %xmm5
-; SSE2-NEXT: psadbw b+1040(%rax), %xmm5
-; SSE2-NEXT: paddd %xmm5, %xmm3
; SSE2-NEXT: movdqa a+1024(%rax), %xmm5
; SSE2-NEXT: psadbw b+1024(%rax), %xmm5
; SSE2-NEXT: paddd %xmm5, %xmm0
+; SSE2-NEXT: movdqa a+1040(%rax), %xmm5
+; SSE2-NEXT: psadbw b+1040(%rax), %xmm5
+; SSE2-NEXT: paddd %xmm5, %xmm3
+; SSE2-NEXT: movdqa a+1056(%rax), %xmm5
+; SSE2-NEXT: psadbw b+1056(%rax), %xmm5
+; SSE2-NEXT: paddd %xmm5, %xmm2
; SSE2-NEXT: movdqa a+1072(%rax), %xmm5
; SSE2-NEXT: psadbw b+1072(%rax), %xmm5
; SSE2-NEXT: paddd %xmm5, %xmm1
; AVX1-NEXT: .p2align 4, 0x90
; AVX1-NEXT: .LBB2_1: # %vector.body
; AVX1-NEXT: # =>This Inner Loop Header: Depth=1
-; AVX1-NEXT: vmovdqa a+1072(%rax), %xmm3
-; AVX1-NEXT: vpsadbw b+1072(%rax), %xmm3, %xmm3
-; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm4
-; AVX1-NEXT: vpaddd %xmm4, %xmm3, %xmm3
-; AVX1-NEXT: vmovdqa a+1056(%rax), %xmm4
-; AVX1-NEXT: vpsadbw b+1056(%rax), %xmm4, %xmm4
-; AVX1-NEXT: vpaddd %xmm2, %xmm4, %xmm2
-; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
-; AVX1-NEXT: vmovdqa a+1040(%rax), %xmm3
-; AVX1-NEXT: vpsadbw b+1040(%rax), %xmm3, %xmm3
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
-; AVX1-NEXT: vpaddd %xmm4, %xmm3, %xmm3
-; AVX1-NEXT: vmovdqa a+1024(%rax), %xmm4
-; AVX1-NEXT: vpsadbw b+1024(%rax), %xmm4, %xmm4
-; AVX1-NEXT: vpaddd %xmm0, %xmm4, %xmm0
-; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0
+; AVX1-NEXT: vmovdqa a+1024(%rax), %xmm3
+; AVX1-NEXT: vpsadbw b+1024(%rax), %xmm3, %xmm3
+; AVX1-NEXT: vmovdqa a+1040(%rax), %xmm4
+; AVX1-NEXT: vpsadbw b+1040(%rax), %xmm4, %xmm4
+; AVX1-NEXT: vmovdqa a+1056(%rax), %xmm5
+; AVX1-NEXT: vpsadbw b+1056(%rax), %xmm5, %xmm5
+; AVX1-NEXT: vmovdqa a+1072(%rax), %xmm6
+; AVX1-NEXT: vpsadbw b+1072(%rax), %xmm6, %xmm6
+; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm7
+; AVX1-NEXT: vpaddd %xmm7, %xmm6, %xmm6
+; AVX1-NEXT: vpaddd %xmm2, %xmm5, %xmm2
+; AVX1-NEXT: vinsertf128 $1, %xmm6, %ymm2, %ymm2
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5
+; AVX1-NEXT: vpaddd %xmm5, %xmm4, %xmm4
+; AVX1-NEXT: vpaddd %xmm0, %xmm3, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0
; AVX1-NEXT: addq $4, %rax
; AVX1-NEXT: jne .LBB2_1
; AVX1-NEXT: # %bb.2: # %middle.block
; AVX2-NEXT: .p2align 4, 0x90
; AVX2-NEXT: .LBB2_1: # %vector.body
; AVX2-NEXT: # =>This Inner Loop Header: Depth=1
-; AVX2-NEXT: vmovdqa a+1056(%rax), %ymm3
-; AVX2-NEXT: vpsadbw b+1056(%rax), %ymm3, %ymm3
-; AVX2-NEXT: vpaddd %ymm2, %ymm3, %ymm2
; AVX2-NEXT: vmovdqa a+1024(%rax), %ymm3
; AVX2-NEXT: vpsadbw b+1024(%rax), %ymm3, %ymm3
; AVX2-NEXT: vpaddd %ymm1, %ymm3, %ymm1
+; AVX2-NEXT: vmovdqa a+1056(%rax), %ymm3
+; AVX2-NEXT: vpsadbw b+1056(%rax), %ymm3, %ymm3
+; AVX2-NEXT: vpaddd %ymm2, %ymm3, %ymm2
; AVX2-NEXT: addq $4, %rax
; AVX2-NEXT: jne .LBB2_1
; AVX2-NEXT: # %bb.2: # %middle.block
; AVX512F-NEXT: .p2align 4, 0x90
; AVX512F-NEXT: .LBB2_1: # %vector.body
; AVX512F-NEXT: # =>This Inner Loop Header: Depth=1
-; AVX512F-NEXT: vmovdqa a+1056(%rax), %ymm2
-; AVX512F-NEXT: vpsadbw b+1056(%rax), %ymm2, %ymm2
-; AVX512F-NEXT: vmovdqa a+1024(%rax), %ymm3
-; AVX512F-NEXT: vpsadbw b+1024(%rax), %ymm3, %ymm3
-; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2
+; AVX512F-NEXT: vmovdqa a+1024(%rax), %ymm2
+; AVX512F-NEXT: vpsadbw b+1024(%rax), %ymm2, %ymm2
+; AVX512F-NEXT: vmovdqa a+1056(%rax), %ymm3
+; AVX512F-NEXT: vpsadbw b+1056(%rax), %ymm3, %ymm3
+; AVX512F-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2
; AVX512F-NEXT: vpaddd %zmm1, %zmm2, %zmm1
; AVX512F-NEXT: addq $4, %rax
; AVX512F-NEXT: jne .LBB2_1
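The sad_* hunks here exercise the psadbw (sum of absolute differences) matcher. As a rough sketch of the IR shape involved, narrowed to <4 x i8> lanes for brevity (hypothetical function name; illustrative only, not code from this patch): the lanes are zero-extended, their absolute difference is formed with a compare+select, and the vector is collapsed with log2(n) shuffle+add steps:

define i32 @sad_sketch(<4 x i8> %a, <4 x i8> %b) {
  ; widen so the subtraction cannot wrap, then take |a - b| per lane
  %za  = zext <4 x i8> %a to <4 x i32>
  %zb  = zext <4 x i8> %b to <4 x i32>
  %sub = sub <4 x i32> %za, %zb
  %neg = sub <4 x i32> zeroinitializer, %sub
  %cmp = icmp sgt <4 x i32> %sub, zeroinitializer
  %abs = select <4 x i1> %cmp, <4 x i32> %sub, <4 x i32> %neg
  ; horizontal reduction: two shuffle+add steps, then extract lane 0
  %s1  = shufflevector <4 x i32> %abs, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
  %a1  = add <4 x i32> %abs, %s1
  %s2  = shufflevector <4 x i32> %a1, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
  %a2  = add <4 x i32> %a1, %s2
  %r   = extractelement <4 x i32> %a2, i32 0
  ret i32 %r
}
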
; SSE2-NEXT: # =>This Inner Loop Header: Depth=1
; SSE2-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero
; SSE2-NEXT: movd {{.*#+}} xmm3 = mem[0],zero,zero,zero
-; SSE2-NEXT: pand %xmm1, %xmm3
; SSE2-NEXT: pand %xmm1, %xmm2
-; SSE2-NEXT: psadbw %xmm3, %xmm2
-; SSE2-NEXT: paddd %xmm2, %xmm0
+; SSE2-NEXT: pand %xmm1, %xmm3
+; SSE2-NEXT: psadbw %xmm2, %xmm3
+; SSE2-NEXT: paddd %xmm3, %xmm0
; SSE2-NEXT: addq $4, %rax
; SSE2-NEXT: jne .LBB3_1
; SSE2-NEXT: # %bb.2: # %middle.block
; AVX-NEXT: # =>This Inner Loop Header: Depth=1
; AVX-NEXT: vmovd {{.*#+}} xmm2 = mem[0],zero,zero,zero
; AVX-NEXT: vmovd {{.*#+}} xmm3 = mem[0],zero,zero,zero
-; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm0[1,2,3,4,5,6,7]
; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm0[1,2,3,4,5,6,7]
+; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm0[1,2,3,4,5,6,7]
; AVX-NEXT: vpsadbw %xmm3, %xmm2, %xmm2
; AVX-NEXT: vpaddd %xmm1, %xmm2, %xmm1
; AVX-NEXT: addq $4, %rax
; AVX-NEXT: # =>This Inner Loop Header: Depth=1
; AVX-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; AVX-NEXT: vmovd {{.*#+}} xmm2 = mem[0],zero,zero,zero
-; AVX-NEXT: vpsadbw %xmm1, %xmm2, %xmm1
+; AVX-NEXT: vpsadbw %xmm2, %xmm1, %xmm1
; AVX-NEXT: vpaddd %xmm0, %xmm1, %xmm0
; AVX-NEXT: addq $4, %rax
; AVX-NEXT: jne .LBB4_1
; SSE2-NEXT: movdqu (%rdi), %xmm0
; SSE2-NEXT: movdqu (%rsi), %xmm1
; SSE2-NEXT: psadbw %xmm0, %xmm1
+; SSE2-NEXT: movdqu (%rdx), %xmm0
+; SSE2-NEXT: movdqu (%rcx), %xmm2
+; SSE2-NEXT: psadbw %xmm0, %xmm2
; SSE2-NEXT: movl $1, %eax
; SSE2-NEXT: movd %eax, %xmm0
-; SSE2-NEXT: movdqu (%rdx), %xmm2
-; SSE2-NEXT: movdqu (%rcx), %xmm3
-; SSE2-NEXT: psadbw %xmm2, %xmm3
-; SSE2-NEXT: paddd %xmm0, %xmm3
-; SSE2-NEXT: paddd %xmm1, %xmm3
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[2,3,0,1]
-; SSE2-NEXT: paddd %xmm3, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; SSE2-NEXT: paddd %xmm2, %xmm0
+; SSE2-NEXT: paddd %xmm1, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; SSE2-NEXT: paddd %xmm0, %xmm1
-; SSE2-NEXT: movd %xmm1, %eax
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; SSE2-NEXT: paddd %xmm1, %xmm0
+; SSE2-NEXT: movd %xmm0, %eax
; SSE2-NEXT: retq
;
-; AVX1-LABEL: sad_unroll_nonzero_initial:
-; AVX1: # %bb.0: # %bb
-; AVX1-NEXT: vmovdqu (%rdi), %xmm0
-; AVX1-NEXT: vpsadbw (%rsi), %xmm0, %xmm0
-; AVX1-NEXT: vmovdqu (%rdx), %xmm1
-; AVX1-NEXT: vpsadbw (%rcx), %xmm1, %xmm1
-; AVX1-NEXT: movl $1, %eax
-; AVX1-NEXT: vmovd %eax, %xmm2
-; AVX1-NEXT: vpaddd %xmm2, %xmm1, %xmm1
-; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vmovd %xmm0, %eax
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: sad_unroll_nonzero_initial:
-; AVX2: # %bb.0: # %bb
-; AVX2-NEXT: vmovdqu (%rdi), %xmm0
-; AVX2-NEXT: vpsadbw (%rsi), %xmm0, %xmm0
-; AVX2-NEXT: movl $1, %eax
-; AVX2-NEXT: vmovd %eax, %xmm1
-; AVX2-NEXT: vmovdqu (%rdx), %xmm2
-; AVX2-NEXT: vpsadbw (%rcx), %xmm2, %xmm2
-; AVX2-NEXT: vpaddd %xmm2, %xmm1, %xmm1
-; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vmovd %xmm0, %eax
-; AVX2-NEXT: retq
-;
-; AVX512-LABEL: sad_unroll_nonzero_initial:
-; AVX512: # %bb.0: # %bb
-; AVX512-NEXT: vmovdqu (%rdi), %xmm0
-; AVX512-NEXT: vpsadbw (%rsi), %xmm0, %xmm0
-; AVX512-NEXT: movl $1, %eax
-; AVX512-NEXT: vmovd %eax, %xmm1
-; AVX512-NEXT: vmovdqu (%rdx), %xmm2
-; AVX512-NEXT: vpsadbw (%rcx), %xmm2, %xmm2
-; AVX512-NEXT: vpaddd %zmm2, %zmm1, %zmm1
-; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0
-; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0
-; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vmovd %xmm0, %eax
-; AVX512-NEXT: vzeroupper
-; AVX512-NEXT: retq
+; AVX-LABEL: sad_unroll_nonzero_initial:
+; AVX: # %bb.0: # %bb
+; AVX-NEXT: vmovdqu (%rdi), %xmm0
+; AVX-NEXT: vpsadbw (%rsi), %xmm0, %xmm0
+; AVX-NEXT: vmovdqu (%rdx), %xmm1
+; AVX-NEXT: vpsadbw (%rcx), %xmm1, %xmm1
+; AVX-NEXT: movl $1, %eax
+; AVX-NEXT: vmovd %eax, %xmm2
+; AVX-NEXT: vpaddd %xmm2, %xmm1, %xmm1
+; AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vmovd %xmm0, %eax
+; AVX-NEXT: retq
bb:
%tmp = load <16 x i8>, <16 x i8>* %arg, align 1
%tmp4 = load <16 x i8>, <16 x i8>* %arg1, align 1
; AVX-NEXT: vmovdqu (%rdx), %xmm1
; AVX-NEXT: vpsadbw (%rcx), %xmm1, %xmm1
; AVX-NEXT: vpsadbw (%rsi), %xmm0, %xmm0
-; AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vpaddd %xmm0, %xmm1, %xmm0
; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]