From 9f7d4150b9ec638a048c183c21a355195fdc4942 Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Thu, 26 Mar 2020 11:09:08 -0700 Subject: [PATCH] [X86] Move combineLoopMAddPattern and combineLoopSADPattern to an IR pass before SelectionDAG. These transforms rely on a vector reduction flag on the SDNode set by SelectionDAGBuilder. This flag exists because SelectionDAG can't see across basic blocks, so SelectionDAGBuilder looks across them and saves the info. X86 is currently the only target that uses this flag. By removing the X86 code we can remove the flag and the SelectionDAGBuilder code. This patch adds a dedicated IR pass for X86 that looks across the blocks and transforms the IR into a form that the X86 SelectionDAG can finish. An advantage of this new approach is that we can enhance it to shrink the phi nodes and final reduction tree based on the zeroes that we need to concatenate to bring the partially reduced reduction back up to the original width. Differential Revision: https://reviews.llvm.org/D76649 --- llvm/include/llvm/CodeGen/SelectionDAGNodes.h | 9 +- llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 7 - .../CodeGen/SelectionDAG/SelectionDAGBuilder.cpp | 138 ------- .../CodeGen/SelectionDAG/SelectionDAGDumper.cpp | 3 - llvm/lib/Target/X86/CMakeLists.txt | 1 + llvm/lib/Target/X86/X86.h | 6 + llvm/lib/Target/X86/X86ISelLowering.cpp | 155 +------ llvm/lib/Target/X86/X86PartialReduction.cpp | 460 +++++++++++++++++++++ llvm/lib/Target/X86/X86TargetMachine.cpp | 5 +- llvm/test/CodeGen/Generic/vector-redux.ll | 237 ----------- llvm/test/CodeGen/X86/O3-pipeline.ll | 1 + llvm/test/CodeGen/X86/madd.ll | 218 +++++----- llvm/test/CodeGen/X86/min-legal-vector-width.ll | 12 +- llvm/test/CodeGen/X86/sad.ll | 183 ++++---- 14 files changed, 669 insertions(+), 766 deletions(-) create mode 100644 llvm/lib/Target/X86/X86PartialReduction.cpp delete mode 100644 llvm/test/CodeGen/Generic/vector-redux.ll diff --git a/llvm/include/llvm/CodeGen/SelectionDAGNodes.h b/llvm/include/llvm/CodeGen/SelectionDAGNodes.h index faee8e8..17b6636 100644 --- a/llvm/include/llvm/CodeGen/SelectionDAGNodes.h +++ b/llvm/include/llvm/CodeGen/SelectionDAGNodes.h @@ -368,7 +368,6 @@ private: bool NoInfs : 1; bool NoSignedZeros : 1; bool AllowReciprocal : 1; - bool VectorReduction : 1; bool AllowContract : 1; bool ApproximateFuncs : 1; bool AllowReassociation : 1; @@ -385,7 +384,7 @@ public: SDNodeFlags() : AnyDefined(false), NoUnsignedWrap(false), NoSignedWrap(false), Exact(false), NoNaNs(false), NoInfs(false), - NoSignedZeros(false), AllowReciprocal(false), VectorReduction(false), + NoSignedZeros(false), AllowReciprocal(false), AllowContract(false), ApproximateFuncs(false), AllowReassociation(false), NoFPExcept(false) {} @@ -434,10 +433,6 @@ public: setDefined(); AllowReciprocal = b; } - void setVectorReduction(bool b) { - setDefined(); - VectorReduction = b; - } void setAllowContract(bool b) { setDefined(); AllowContract = b; @@ -463,7 +458,6 @@ public: bool hasNoInfs() const { return NoInfs; } bool hasNoSignedZeros() const { return NoSignedZeros; } bool hasAllowReciprocal() const { return AllowReciprocal; } - bool hasVectorReduction() const { return VectorReduction; } bool hasAllowContract() const { return AllowContract; } bool hasApproximateFuncs() const { return ApproximateFuncs; } bool hasAllowReassociation() const { return AllowReassociation; } @@ -481,7 +475,6 @@ public: NoInfs &= Flags.NoInfs; NoSignedZeros &= Flags.NoSignedZeros; AllowReciprocal &= Flags.AllowReciprocal; - VectorReduction &= Flags.VectorReduction; 
AllowContract &= Flags.AllowContract; ApproximateFuncs &= Flags.ApproximateFuncs; AllowReassociation &= Flags.AllowReassociation; diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index 574a80f..0763a5e 100644 --- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -967,10 +967,6 @@ SDValue DAGCombiner::reassociateOpsCommutative(unsigned Opc, const SDLoc &DL, if (N0.getOpcode() != Opc) return SDValue(); - // Don't reassociate reductions. - if (N0->getFlags().hasVectorReduction()) - return SDValue(); - if (DAG.isConstantIntBuildVectorOrConstantInt(N0.getOperand(1))) { if (DAG.isConstantIntBuildVectorOrConstantInt(N1)) { // Reassociate: (op (op x, c1), c2) -> (op x, (op c1, c2)) @@ -995,9 +991,6 @@ SDValue DAGCombiner::reassociateOpsCommutative(unsigned Opc, const SDLoc &DL, SDValue DAGCombiner::reassociateOps(unsigned Opc, const SDLoc &DL, SDValue N0, SDValue N1, SDNodeFlags Flags) { assert(TLI.isCommutativeBinOp(Opc) && "Operation not commutative."); - // Don't reassociate reductions. - if (Flags.hasVectorReduction()) - return SDValue(); // Floating-point reassociation is not allowed without loose FP math. if (N0.getValueType().isFloatingPoint() || diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp index 1ad8620..d472ef9 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp @@ -2998,133 +2998,6 @@ void SelectionDAGBuilder::visitFSub(const User &I) { visitBinary(I, ISD::FSUB); } -/// Checks if the given instruction performs a vector reduction, in which case -/// we have the freedom to alter the elements in the result as long as the -/// reduction of them stays unchanged. -static bool isVectorReductionOp(const User *I) { - const Instruction *Inst = dyn_cast(I); - if (!Inst || !Inst->getType()->isVectorTy()) - return false; - - auto OpCode = Inst->getOpcode(); - switch (OpCode) { - case Instruction::Add: - case Instruction::Mul: - case Instruction::And: - case Instruction::Or: - case Instruction::Xor: - break; - case Instruction::FAdd: - case Instruction::FMul: - if (const FPMathOperator *FPOp = dyn_cast(Inst)) - if (FPOp->getFastMathFlags().isFast()) - break; - LLVM_FALLTHROUGH; - default: - return false; - } - - unsigned ElemNum = Inst->getType()->getVectorNumElements(); - // Ensure the reduction size is a power of 2. - if (!isPowerOf2_32(ElemNum)) - return false; - - unsigned ElemNumToReduce = ElemNum; - - // Do DFS search on the def-use chain from the given instruction. We only - // allow four kinds of operations during the search until we reach the - // instruction that extracts the first element from the vector: - // - // 1. The reduction operation of the same opcode as the given instruction. - // - // 2. PHI node. - // - // 3. ShuffleVector instruction together with a reduction operation that - // does a partial reduction. - // - // 4. ExtractElement that extracts the first element from the vector, and we - // stop searching the def-use chain here. - // - // 3 & 4 above perform a reduction on all elements of the vector. We push defs - // from 1-3 to the stack to continue the DFS. The given instruction is not - // a reduction operation if we meet any other instructions other than those - // listed above. 
- - SmallVector UsersToVisit{Inst}; - SmallPtrSet Visited; - bool ReduxExtracted = false; - - while (!UsersToVisit.empty()) { - auto User = UsersToVisit.back(); - UsersToVisit.pop_back(); - if (!Visited.insert(User).second) - continue; - - for (const auto *U : User->users()) { - auto Inst = dyn_cast(U); - if (!Inst) - return false; - - if (Inst->getOpcode() == OpCode || isa(U)) { - if (const FPMathOperator *FPOp = dyn_cast(Inst)) - if (!isa(FPOp) && !FPOp->getFastMathFlags().isFast()) - return false; - UsersToVisit.push_back(U); - } else if (const ShuffleVectorInst *ShufInst = - dyn_cast(U)) { - // Detect the following pattern: A ShuffleVector instruction together - // with a reduction that do partial reduction on the first and second - // ElemNumToReduce / 2 elements, and store the result in - // ElemNumToReduce / 2 elements in another vector. - - unsigned ResultElements = ShufInst->getType()->getVectorNumElements(); - if (ResultElements < ElemNum) - return false; - - if (ElemNumToReduce == 1) - return false; - if (!isa(U->getOperand(1))) - return false; - for (unsigned i = 0; i < ElemNumToReduce / 2; ++i) - if (ShufInst->getMaskValue(i) != int(i + ElemNumToReduce / 2)) - return false; - for (unsigned i = ElemNumToReduce / 2; i < ElemNum; ++i) - if (ShufInst->getMaskValue(i) != -1) - return false; - - // There is only one user of this ShuffleVector instruction, which - // must be a reduction operation. - if (!U->hasOneUse()) - return false; - - auto U2 = dyn_cast(*U->user_begin()); - if (!U2 || U2->getOpcode() != OpCode) - return false; - - // Check operands of the reduction operation. - if ((U2->getOperand(0) == U->getOperand(0) && U2->getOperand(1) == U) || - (U2->getOperand(1) == U->getOperand(0) && U2->getOperand(0) == U)) { - UsersToVisit.push_back(U2); - ElemNumToReduce /= 2; - } else - return false; - } else if (isa(U)) { - // At this moment we should have reduced all elements in the vector. - if (ElemNumToReduce != 1) - return false; - - const ConstantInt *Val = dyn_cast(U->getOperand(1)); - if (!Val || !Val->isZero()) - return false; - - ReduxExtracted = true; - } else - return false; - } - } - return ReduxExtracted; -} - void SelectionDAGBuilder::visitUnary(const User &I, unsigned Opcode) { SDNodeFlags Flags; @@ -3143,17 +3016,6 @@ void SelectionDAGBuilder::visitBinary(const User &I, unsigned Opcode) { if (auto *ExactOp = dyn_cast(&I)) { Flags.setExact(ExactOp->isExact()); } - if (isVectorReductionOp(&I)) { - Flags.setVectorReduction(true); - LLVM_DEBUG(dbgs() << "Detected a reduction operation:" << I << "\n"); - - // If no flags are set we will propagate the incoming flags, if any flags - // are set, we will intersect them with the incoming flag and so we need to - // copy the FMF flags here. 
- if (auto *FPOp = dyn_cast(&I)) { - Flags.copyFMF(*FPOp); - } - } SDValue Op1 = getValue(I.getOperand(0)); SDValue Op2 = getValue(I.getOperand(1)); diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp index aca462f..f81d18c 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp @@ -553,9 +553,6 @@ void SDNode::print_details(raw_ostream &OS, const SelectionDAG *G) const { if (getFlags().hasAllowReassociation()) OS << " reassoc"; - if (getFlags().hasVectorReduction()) - OS << " vector-reduction"; - if (getFlags().hasNoFPExcept()) OS << " nofpexcept"; diff --git a/llvm/lib/Target/X86/CMakeLists.txt b/llvm/lib/Target/X86/CMakeLists.txt index 3f0d68c..1542fc1 100644 --- a/llvm/lib/Target/X86/CMakeLists.txt +++ b/llvm/lib/Target/X86/CMakeLists.txt @@ -56,6 +56,7 @@ set(sources X86MacroFusion.cpp X86OptimizeLEAs.cpp X86PadShortFunction.cpp + X86PartialReduction.cpp X86RegisterBankInfo.cpp X86RegisterInfo.cpp X86RetpolineThunks.cpp diff --git a/llvm/lib/Target/X86/X86.h b/llvm/lib/Target/X86/X86.h index c81b349..8c0a13c 100644 --- a/llvm/lib/Target/X86/X86.h +++ b/llvm/lib/Target/X86/X86.h @@ -133,6 +133,11 @@ FunctionPass *createX86InsertPrefetchPass(); /// fp exceptions when strict-fp enabled. FunctionPass *createX86InsertX87waitPass(); +/// This pass optimizes arithmetic based on knowledge that is only used by +/// a reduction sequence and is therefore safe to reassociate in interesting +/// ways. +FunctionPass *createX86PartialReductionPass(); + InstructionSelector *createX86InstructionSelector(const X86TargetMachine &TM, X86Subtarget &, X86RegisterBankInfo &); @@ -154,6 +159,7 @@ void initializeX86ExecutionDomainFixPass(PassRegistry &); void initializeX86ExpandPseudoPass(PassRegistry &); void initializeX86FlagsCopyLoweringPassPass(PassRegistry &); void initializeX86OptimizeLEAPassPass(PassRegistry &); +void initializeX86PartialReductionPass(PassRegistry &); void initializeX86SpeculativeLoadHardeningPassPass(PassRegistry &); namespace X86AS { diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index e40055f..f91d7ff 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -45902,131 +45902,6 @@ static SDValue combineAddOrSubToADCOrSBB(SDNode *N, SelectionDAG &DAG) { DAG.getConstant(0, DL, VT), Cmp1.getValue(1)); } -static SDValue combineLoopMAddPattern(SDNode *N, SelectionDAG &DAG, - const X86Subtarget &Subtarget) { - if (!Subtarget.hasSSE2()) - return SDValue(); - - EVT VT = N->getValueType(0); - - // If the vector size is less than 128, or greater than the supported RegSize, - // do not use PMADD. 
- if (!VT.isVector() || VT.getVectorNumElements() < 8) - return SDValue(); - - SDValue Op0 = N->getOperand(0); - SDValue Op1 = N->getOperand(1); - - auto UsePMADDWD = [&](SDValue Op) { - ShrinkMode Mode; - return Op.getOpcode() == ISD::MUL && - canReduceVMulWidth(Op.getNode(), DAG, Mode) && - Mode != ShrinkMode::MULU16 && - (!Subtarget.hasSSE41() || - (Op->isOnlyUserOf(Op.getOperand(0).getNode()) && - Op->isOnlyUserOf(Op.getOperand(1).getNode()))); - }; - - SDValue MulOp, OtherOp; - if (UsePMADDWD(Op0)) { - MulOp = Op0; - OtherOp = Op1; - } else if (UsePMADDWD(Op1)) { - MulOp = Op1; - OtherOp = Op0; - } else - return SDValue(); - - SDLoc DL(N); - EVT ReducedVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16, - VT.getVectorNumElements()); - EVT MAddVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32, - VT.getVectorNumElements() / 2); - - // Shrink the operands of mul. - SDValue N0 = DAG.getNode(ISD::TRUNCATE, DL, ReducedVT, MulOp->getOperand(0)); - SDValue N1 = DAG.getNode(ISD::TRUNCATE, DL, ReducedVT, MulOp->getOperand(1)); - - // Madd vector size is half of the original vector size - auto PMADDWDBuilder = [](SelectionDAG &DAG, const SDLoc &DL, - ArrayRef Ops) { - MVT OpVT = MVT::getVectorVT(MVT::i32, Ops[0].getValueSizeInBits() / 32); - return DAG.getNode(X86ISD::VPMADDWD, DL, OpVT, Ops); - }; - SDValue Madd = SplitOpsAndApply(DAG, Subtarget, DL, MAddVT, { N0, N1 }, - PMADDWDBuilder); - // Fill the rest of the output with 0 - SDValue Zero = DAG.getConstant(0, DL, Madd.getSimpleValueType()); - SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Madd, Zero); - - // Preserve the reduction flag on the ADD. We may need to revisit for the - // other operand. - SDNodeFlags Flags; - Flags.setVectorReduction(true); - return DAG.getNode(ISD::ADD, DL, VT, Concat, OtherOp, Flags); -} - -static SDValue combineLoopSADPattern(SDNode *N, SelectionDAG &DAG, - const X86Subtarget &Subtarget) { - if (!Subtarget.hasSSE2()) - return SDValue(); - - SDLoc DL(N); - EVT VT = N->getValueType(0); - - // TODO: There's nothing special about i32, any integer type above i16 should - // work just as well. - if (!VT.isVector() || !isPowerOf2_32(VT.getVectorNumElements()) || - VT.getVectorElementType() != MVT::i32) - return SDValue(); - - // We know N is a reduction add. To match SAD, we need one of the operands to - // be an ABS. - SDValue AbsOp = N->getOperand(0); - SDValue OtherOp = N->getOperand(1); - if (AbsOp.getOpcode() != ISD::ABS) - std::swap(AbsOp, OtherOp); - if (AbsOp.getOpcode() != ISD::ABS) - return SDValue(); - - // Check whether we have an abs-diff pattern feeding into the select. - SDValue SadOp0, SadOp1; - if(!detectZextAbsDiff(AbsOp, SadOp0, SadOp1)) - return SDValue(); - - // SAD pattern detected. Now build a SAD instruction and an addition for - // reduction. Note that the number of elements of the result of SAD is less - // than the number of elements of its input. Therefore, we could only update - // part of elements in the reduction vector. - SDValue Sad = createPSADBW(DAG, SadOp0, SadOp1, DL, Subtarget); - - // The output of PSADBW is a vector of i64. - // We need to turn the vector of i64 into a vector of i32. - // If the reduction vector is at least as wide as the psadbw result, just - // bitcast. If it's narrower which can only occur for v2i32, bits 127:16 of - // the PSADBW will be zero. 
- MVT ResVT = MVT::getVectorVT(MVT::i32, Sad.getValueSizeInBits() / 32); - Sad = DAG.getNode(ISD::BITCAST, DL, ResVT, Sad); - - if (VT.getSizeInBits() > ResVT.getSizeInBits()) { - // Fill the upper elements with zero to match the add width. - assert(VT.getSizeInBits() % ResVT.getSizeInBits() == 0 && "Unexpected VTs"); - unsigned NumConcats = VT.getSizeInBits() / ResVT.getSizeInBits(); - SmallVector Ops(NumConcats, DAG.getConstant(0, DL, ResVT)); - Ops[0] = Sad; - Sad = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Ops); - } else if (VT.getSizeInBits() < ResVT.getSizeInBits()) { - Sad = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Sad, - DAG.getIntPtrConstant(0, DL)); - } - - // Preserve the reduction flag on the ADD. We may need to revisit for the - // other operand. - SDNodeFlags Flags; - Flags.setVectorReduction(true); - return DAG.getNode(ISD::ADD, DL, VT, Sad, OtherOp, Flags); -} - static SDValue matchPMADDWD(SelectionDAG &DAG, SDValue Op0, SDValue Op1, const SDLoc &DL, EVT VT, const X86Subtarget &Subtarget) { @@ -46116,30 +45991,25 @@ static SDValue matchPMADDWD(SelectionDAG &DAG, SDValue Op0, SDValue Op1, Mode == ShrinkMode::MULU16) return SDValue(); + EVT TruncVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16, + VT.getVectorNumElements() * 2); + SDValue N0 = DAG.getNode(ISD::TRUNCATE, DL, TruncVT, Mul.getOperand(0)); + SDValue N1 = DAG.getNode(ISD::TRUNCATE, DL, TruncVT, Mul.getOperand(1)); + auto PMADDBuilder = [](SelectionDAG &DAG, const SDLoc &DL, ArrayRef Ops) { - // Shrink by adding truncate nodes and let DAGCombine fold with the - // sources. EVT InVT = Ops[0].getValueType(); - assert(InVT.getScalarType() == MVT::i32 && - "Unexpected scalar element type"); assert(InVT == Ops[1].getValueType() && "Operands' types mismatch"); EVT ResVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32, InVT.getVectorNumElements() / 2); - EVT TruncVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16, - InVT.getVectorNumElements()); - return DAG.getNode(X86ISD::VPMADDWD, DL, ResVT, - DAG.getNode(ISD::TRUNCATE, DL, TruncVT, Ops[0]), - DAG.getNode(ISD::TRUNCATE, DL, TruncVT, Ops[1])); + return DAG.getNode(X86ISD::VPMADDWD, DL, ResVT, Ops[0], Ops[1]); }; - return SplitOpsAndApply(DAG, Subtarget, DL, VT, - { Mul.getOperand(0), Mul.getOperand(1) }, - PMADDBuilder); + return SplitOpsAndApply(DAG, Subtarget, DL, VT, { N0, N1 }, PMADDBuilder); } // Attempt to turn this pattern into PMADDWD. 
-// (mul (add (sext (build_vector)), (sext (build_vector))), -// (add (sext (build_vector)), (sext (build_vector))) +// (add (mul (sext (build_vector)), (sext (build_vector))), +// (mul (sext (build_vector)), (sext (build_vector))) static SDValue matchPMADDWD_2(SelectionDAG &DAG, SDValue N0, SDValue N1, const SDLoc &DL, EVT VT, const X86Subtarget &Subtarget) { @@ -46261,13 +46131,6 @@ static SDValue matchPMADDWD_2(SelectionDAG &DAG, SDValue N0, SDValue N1, static SDValue combineAdd(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget) { - const SDNodeFlags Flags = N->getFlags(); - if (Flags.hasVectorReduction()) { - if (SDValue Sad = combineLoopSADPattern(N, DAG, Subtarget)) - return Sad; - if (SDValue MAdd = combineLoopMAddPattern(N, DAG, Subtarget)) - return MAdd; - } EVT VT = N->getValueType(0); SDValue Op0 = N->getOperand(0); SDValue Op1 = N->getOperand(1); diff --git a/llvm/lib/Target/X86/X86PartialReduction.cpp b/llvm/lib/Target/X86/X86PartialReduction.cpp new file mode 100644 index 0000000..4cd231d --- /dev/null +++ b/llvm/lib/Target/X86/X86PartialReduction.cpp @@ -0,0 +1,460 @@ +//===-- X86PartialReduction.cpp -------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This pass looks for add instructions used by a horizontal reduction to see +// if we might be able to use pmaddwd or psadbw. Some cases of this require +// cross basic block knowledge and can't be done in SelectionDAG. +// +//===----------------------------------------------------------------------===// + +#include "X86.h" +#include "llvm/Analysis/ValueTracking.h" +#include "llvm/CodeGen/TargetPassConfig.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/IntrinsicsX86.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/Operator.h" +#include "llvm/Pass.h" +#include "X86TargetMachine.h" + +using namespace llvm; + +#define DEBUG_TYPE "x86-partial-reduction" + +namespace { + +class X86PartialReduction : public FunctionPass { + const DataLayout *DL; + const X86Subtarget *ST; + +public: + static char ID; // Pass identification, replacement for typeid. 
+ + X86PartialReduction() : FunctionPass(ID) { } + + bool runOnFunction(Function &Fn) override; + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.setPreservesCFG(); + } + + StringRef getPassName() const override { + return "X86 Partial Reduction"; + } + +private: + bool tryMAddPattern(BinaryOperator *BO); + bool tryMAddReplacement(Value *Op, BinaryOperator *Add); + + bool trySADPattern(BinaryOperator *BO); + bool trySADReplacement(Value *Op, BinaryOperator *Add); +}; +} + +FunctionPass *llvm::createX86PartialReductionPass() { + return new X86PartialReduction(); +} + +char X86PartialReduction::ID = 0; + +INITIALIZE_PASS(X86PartialReduction, DEBUG_TYPE, + "X86 Partial Reduction", false, false) + +static bool isVectorReductionOp(const BinaryOperator &BO) { + if (!BO.getType()->isVectorTy()) + return false; + + unsigned Opcode = BO.getOpcode(); + + switch (Opcode) { + case Instruction::Add: + case Instruction::Mul: + case Instruction::And: + case Instruction::Or: + case Instruction::Xor: + break; + case Instruction::FAdd: + case Instruction::FMul: + if (auto *FPOp = dyn_cast(&BO)) + if (FPOp->getFastMathFlags().isFast()) + break; + LLVM_FALLTHROUGH; + default: + return false; + } + + unsigned ElemNum = BO.getType()->getVectorNumElements(); + // Ensure the reduction size is a power of 2. + if (!isPowerOf2_32(ElemNum)) + return false; + + unsigned ElemNumToReduce = ElemNum; + + // Do DFS search on the def-use chain from the given instruction. We only + // allow four kinds of operations during the search until we reach the + // instruction that extracts the first element from the vector: + // + // 1. The reduction operation of the same opcode as the given instruction. + // + // 2. PHI node. + // + // 3. ShuffleVector instruction together with a reduction operation that + // does a partial reduction. + // + // 4. ExtractElement that extracts the first element from the vector, and we + // stop searching the def-use chain here. + // + // 3 & 4 above perform a reduction on all elements of the vector. We push defs + // from 1-3 to the stack to continue the DFS. The given instruction is not + // a reduction operation if we meet any other instructions other than those + // listed above. + + SmallVector UsersToVisit{&BO}; + SmallPtrSet Visited; + bool ReduxExtracted = false; + + while (!UsersToVisit.empty()) { + auto User = UsersToVisit.back(); + UsersToVisit.pop_back(); + if (!Visited.insert(User).second) + continue; + + for (const auto *U : User->users()) { + auto *Inst = dyn_cast(U); + if (!Inst) + return false; + + if (Inst->getOpcode() == Opcode || isa(U)) { + if (auto *FPOp = dyn_cast(Inst)) + if (!isa(FPOp) && !FPOp->getFastMathFlags().isFast()) + return false; + UsersToVisit.push_back(U); + } else if (auto *ShufInst = dyn_cast(U)) { + // Detect the following pattern: A ShuffleVector instruction together + // with a reduction that do partial reduction on the first and second + // ElemNumToReduce / 2 elements, and store the result in + // ElemNumToReduce / 2 elements in another vector. 
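+ // For example, on the first step of reducing an 8-element vector
+ // (ElemNumToReduce == 8), the checks below expect the shuffle mask
+ // <4, 5, 6, 7, undef, undef, undef, undef>: the high half is moved into the
+ // low half and the remaining lanes are undef.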
+ + unsigned ResultElements = ShufInst->getType()->getVectorNumElements(); + if (ResultElements < ElemNum) + return false; + + if (ElemNumToReduce == 1) + return false; + if (!isa(U->getOperand(1))) + return false; + for (unsigned i = 0; i < ElemNumToReduce / 2; ++i) + if (ShufInst->getMaskValue(i) != int(i + ElemNumToReduce / 2)) + return false; + for (unsigned i = ElemNumToReduce / 2; i < ElemNum; ++i) + if (ShufInst->getMaskValue(i) != -1) + return false; + + // There is only one user of this ShuffleVector instruction, which + // must be a reduction operation. + if (!U->hasOneUse()) + return false; + + auto *U2 = dyn_cast(*U->user_begin()); + if (!U2 || U2->getOpcode() != Opcode) + return false; + + // Check operands of the reduction operation. + if ((U2->getOperand(0) == U->getOperand(0) && U2->getOperand(1) == U) || + (U2->getOperand(1) == U->getOperand(0) && U2->getOperand(0) == U)) { + UsersToVisit.push_back(U2); + ElemNumToReduce /= 2; + } else + return false; + } else if (isa(U)) { + // At this moment we should have reduced all elements in the vector. + if (ElemNumToReduce != 1) + return false; + + auto *Val = dyn_cast(U->getOperand(1)); + if (!Val || !Val->isZero()) + return false; + + ReduxExtracted = true; + } else + return false; + } + } + return ReduxExtracted; +} + +bool X86PartialReduction::tryMAddReplacement(Value *Op, BinaryOperator *Add) { + BasicBlock *BB = Add->getParent(); + + auto *BO = dyn_cast(Op); + if (!BO || BO->getOpcode() != Instruction::Mul || !BO->hasOneUse() || + BO->getParent() != BB) + return false; + + Value *LHS = BO->getOperand(0); + Value *RHS = BO->getOperand(1); + + // LHS and RHS should be only used once or if they are the same then only + // used twice. Only check this when SSE4.1 is enabled and we have zext/sext + // instructions, otherwise we use punpck to emulate zero extend in stages. The + // trunc/ we need to do likely won't introduce new instructions in that case. + if (ST->hasSSE41()) { + if (LHS == RHS) { + if (!isa(LHS) && !LHS->hasNUses(2)) + return false; + } else { + if (!isa(LHS) && !LHS->hasOneUse()) + return false; + if (!isa(RHS) && !RHS->hasOneUse()) + return false; + } + } + + auto canShrinkOp = [&](Value *Op) { + if (isa(Op) && ComputeNumSignBits(Op, *DL, 0, nullptr, BO) > 16) + return true; + if (auto *Cast = dyn_cast(Op)) { + if (Cast->getParent() == BB && + (Cast->getOpcode() == Instruction::SExt || + Cast->getOpcode() == Instruction::ZExt) && + ComputeNumSignBits(Op, *DL, 0, nullptr, BO) > 16) + return true; + } + + return false; + }; + + // Both Ops need to be shrinkable. + if (!canShrinkOp(LHS) && !canShrinkOp(RHS)) + return false; + + IRBuilder<> Builder(Add); + + Type *MulTy = Op->getType(); + unsigned NumElts = MulTy->getVectorNumElements(); + + // Extract even elements and odd elements and add them together. This will + // be pattern matched by SelectionDAG to pmaddwd. This instruction will be + // half the original width. + SmallVector EvenMask(NumElts / 2); + SmallVector OddMask(NumElts / 2); + for (int i = 0, e = NumElts / 2; i != e; ++i) { + EvenMask[i] = i * 2; + OddMask[i] = i * 2 + 1; + } + Value *EvenElts = Builder.CreateShuffleVector(BO, BO, EvenMask); + Value *OddElts = Builder.CreateShuffleVector(BO, BO, OddMask); + Value *MAdd = Builder.CreateAdd(EvenElts, OddElts); + + // Concatenate zeroes to extend back to the original type. 
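+ // The pmaddwd result (MAdd) has NumElts / 2 lanes; shuffling it together with
+ // a zero vector of the same width rebuilds a NumElts-lane vector, and the
+ // extra zero lanes do not change the value of the surrounding add reduction.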
+ SmallVector ConcatMask(NumElts); + std::iota(ConcatMask.begin(), ConcatMask.end(), 0); + Value *Zero = Constant::getNullValue(MAdd->getType()); + Value *Concat = Builder.CreateShuffleVector(MAdd, Zero, ConcatMask); + + // Replaces the use of mul in the original Add with the pmaddwd and zeroes. + Add->replaceUsesOfWith(BO, Concat); + Add->setHasNoSignedWrap(false); + Add->setHasNoUnsignedWrap(false); + + return true; +} + +// Try to replace operans of this add with pmaddwd patterns. +bool X86PartialReduction::tryMAddPattern(BinaryOperator *BO) { + if (!ST->hasSSE2()) + return false; + + // Need at least 8 elements. + if (BO->getType()->getVectorNumElements() < 8) + return false; + + // Element type should be i32. + if (!BO->getType()->getVectorElementType()->isIntegerTy(32)) + return false; + + bool Changed = false; + Changed |= tryMAddReplacement(BO->getOperand(0), BO); + Changed |= tryMAddReplacement(BO->getOperand(1), BO); + return Changed; +} + +bool X86PartialReduction::trySADReplacement(Value *Op, BinaryOperator *Add) { + // Operand should be a select. + auto *SI = dyn_cast(Op); + if (!SI) + return false; + + // Select needs to implement absolute value. + Value *LHS, *RHS; + auto SPR = matchSelectPattern(SI, LHS, RHS); + if (SPR.Flavor != SPF_ABS) + return false; + + // Need a subtract of two values. + auto *Sub = dyn_cast(LHS); + if (!Sub || Sub->getOpcode() != Instruction::Sub) + return false; + + // Look for zero extend from i8. + auto getZeroExtendedVal = [](Value *Op) -> Value * { + if (auto *ZExt = dyn_cast(Op)) + if (ZExt->getOperand(0)->getType()->getVectorElementType()->isIntegerTy(8)) + return ZExt->getOperand(0); + + return nullptr; + }; + + // Both operands of the subtract should be extends from vXi8. + Value *Op0 = getZeroExtendedVal(Sub->getOperand(0)); + Value *Op1 = getZeroExtendedVal(Sub->getOperand(1)); + if (!Op0 || !Op1) + return false; + + IRBuilder<> Builder(Add); + + Type *OpTy = Op->getType(); + unsigned NumElts = OpTy->getVectorNumElements(); + + unsigned IntrinsicNumElts; + Intrinsic::ID IID; + if (ST->hasBWI() && NumElts >= 64) { + IID = Intrinsic::x86_avx512_psad_bw_512; + IntrinsicNumElts = 64; + } else if (ST->hasAVX2() && NumElts >= 32) { + IID = Intrinsic::x86_avx2_psad_bw; + IntrinsicNumElts = 32; + } else { + IID = Intrinsic::x86_sse2_psad_bw; + IntrinsicNumElts = 16; + } + + Function *PSADBWFn = Intrinsic::getDeclaration(Add->getModule(), IID); + + if (NumElts < 16) { + // Pad input with zeroes. + SmallVector ConcatMask(16); + for (unsigned i = 0; i != NumElts; ++i) + ConcatMask[i] = i; + for (unsigned i = NumElts; i != 16; ++i) + ConcatMask[i] = (i % NumElts) + NumElts; + + Value *Zero = Constant::getNullValue(Op0->getType()); + Op0 = Builder.CreateShuffleVector(Op0, Zero, ConcatMask); + Op1 = Builder.CreateShuffleVector(Op1, Zero, ConcatMask); + NumElts = 16; + } + + // Intrinsics produce vXi64 and need to be casted to vXi32. + Type *I32Ty = VectorType::get(Builder.getInt32Ty(), IntrinsicNumElts / 4); + + assert(NumElts % IntrinsicNumElts == 0 && "Unexpected number of elements!"); + unsigned NumSplits = NumElts / IntrinsicNumElts; + + // First collect the pieces we need. 
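+ // Split Op0/Op1 into IntrinsicNumElts-wide pieces, emit one psadbw call per
+ // piece, and bitcast each vXi64 result to vXi32 so the pieces can be
+ // concatenated back together by the reduction tree below.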
+ SmallVector Ops(NumSplits); + for (unsigned i = 0; i != NumSplits; ++i) { + SmallVector ExtractMask(IntrinsicNumElts); + std::iota(ExtractMask.begin(), ExtractMask.end(), i * IntrinsicNumElts); + Value *ExtractOp0 = Builder.CreateShuffleVector(Op0, Op0, ExtractMask); + Value *ExtractOp1 = Builder.CreateShuffleVector(Op1, Op0, ExtractMask); + Ops[i] = Builder.CreateCall(PSADBWFn, {ExtractOp0, ExtractOp1}); + Ops[i] = Builder.CreateBitCast(Ops[i], I32Ty); + } + + assert(isPowerOf2_32(NumSplits) && "Expected power of 2 splits"); + unsigned Stages = Log2_32(NumSplits); + for (unsigned s = Stages; s > 0; --s) { + unsigned NumConcatElts = Ops[0]->getType()->getVectorNumElements() * 2; + for (unsigned i = 0; i != 1 << (s - 1); ++i) { + SmallVector ConcatMask(NumConcatElts); + std::iota(ConcatMask.begin(), ConcatMask.end(), 0); + Ops[i] = Builder.CreateShuffleVector(Ops[i*2], Ops[i*2+1], ConcatMask); + } + } + + // At this point the final value should be in Ops[0]. Now we need to adjust + // it to the final original type. + NumElts = OpTy->getVectorNumElements(); + if (NumElts == 2) { + // Extract down to 2 elements. + Ops[0] = Builder.CreateShuffleVector(Ops[0], Ops[0], {0, 1}); + } else if (NumElts >= 8) { + SmallVector ConcatMask(NumElts); + unsigned SubElts = Ops[0]->getType()->getVectorNumElements(); + for (unsigned i = 0; i != SubElts; ++i) + ConcatMask[i] = i; + for (unsigned i = SubElts; i != NumElts; ++i) + ConcatMask[i] = (i % SubElts) + SubElts; + + Value *Zero = Constant::getNullValue(Ops[0]->getType()); + Ops[0] = Builder.CreateShuffleVector(Ops[0], Zero, ConcatMask); + } + + // Replaces the uses of Op in Add with the new sequence. + Add->replaceUsesOfWith(Op, Ops[0]); + Add->setHasNoSignedWrap(false); + Add->setHasNoUnsignedWrap(false); + + return true; +} + +bool X86PartialReduction::trySADPattern(BinaryOperator *BO) { + if (!ST->hasSSE2()) + return false; + + // TODO: There's nothing special about i32, any integer type above i16 should + // work just as well. 
+ if (!BO->getType()->getVectorElementType()->isIntegerTy(32)) + return false; + + bool Changed = false; + Changed |= trySADReplacement(BO->getOperand(0), BO); + Changed |= trySADReplacement(BO->getOperand(1), BO); + return Changed; +} + +bool X86PartialReduction::runOnFunction(Function &F) { + if (skipFunction(F)) + return false; + + auto *TPC = getAnalysisIfAvailable(); + if (!TPC) + return false; + + auto &TM = TPC->getTM(); + ST = TM.getSubtargetImpl(F); + + DL = &F.getParent()->getDataLayout(); + + bool MadeChange = false; + for (auto &BB : F) { + for (auto &I : BB) { + auto *BO = dyn_cast(&I); + if (!BO) + continue; + + if (!isVectorReductionOp(*BO)) + continue; + + if (BO->getOpcode() == Instruction::Add) { + if (tryMAddPattern(BO)) { + MadeChange = true; + continue; + } + if (trySADPattern(BO)) { + MadeChange = true; + continue; + } + } + } + } + + return MadeChange; +} diff --git a/llvm/lib/Target/X86/X86TargetMachine.cpp b/llvm/lib/Target/X86/X86TargetMachine.cpp index dd6b678..d80c82c 100644 --- a/llvm/lib/Target/X86/X86TargetMachine.cpp +++ b/llvm/lib/Target/X86/X86TargetMachine.cpp @@ -84,6 +84,7 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeX86Target() { initializeX86FlagsCopyLoweringPassPass(PR); initializeX86CondBrFoldingPassPass(PR); initializeX86OptimizeLEAPassPass(PR); + initializeX86PartialReductionPass(PR); } static std::unique_ptr createTLOF(const Triple &TT) { @@ -400,8 +401,10 @@ void X86PassConfig::addIRPasses() { TargetPassConfig::addIRPasses(); - if (TM->getOptLevel() != CodeGenOpt::None) + if (TM->getOptLevel() != CodeGenOpt::None) { addPass(createInterleavedAccessPass()); + addPass(createX86PartialReductionPass()); + } // Add passes that handle indirect branch removal and insertion of a retpoline // thunk. These will be a no-op unless a function subtarget has the retpoline diff --git a/llvm/test/CodeGen/Generic/vector-redux.ll b/llvm/test/CodeGen/Generic/vector-redux.ll deleted file mode 100644 index 8efdbf8..0000000 --- a/llvm/test/CodeGen/Generic/vector-redux.ll +++ /dev/null @@ -1,237 +0,0 @@ -; RUN: llc < %s -debug-only=isel -o /dev/null 2>&1 | FileCheck %s -; REQUIRES: asserts - -@a = global [1024 x i32] zeroinitializer, align 16 - -define i32 @reduce_add() { -; CHECK-LABEL: reduce_add -; CHECK: Detected a reduction operation: {{.*}} add -; CHECK: Detected a reduction operation: {{.*}} add -; CHECK: Detected a reduction operation: {{.*}} add -; CHECK: Detected a reduction operation: {{.*}} add -; CHECK: Detected a reduction operation: {{.*}} add -; CHECK: Detected a reduction operation: {{.*}} add -; CHECK: Detected a reduction operation: {{.*}} add -; CHECK: Detected a reduction operation: {{.*}} add -; CHECK: Detected a reduction operation: {{.*}} add -; CHECK: Detected a reduction operation: {{.*}} add -; CHECK: Detected a reduction operation: {{.*}} add - -min.iters.checked: - br label %vector.body - -vector.body: - %index = phi i64 [ 0, %min.iters.checked ], [ %index.next.4, %vector.body ] - %vec.phi = phi <4 x i32> [ zeroinitializer, %min.iters.checked ], [ %28, %vector.body ] - %vec.phi4 = phi <4 x i32> [ zeroinitializer, %min.iters.checked ], [ %29, %vector.body ] - %0 = getelementptr inbounds [1024 x i32], [1024 x i32]* @a, i64 0, i64 %index - %1 = bitcast i32* %0 to <4 x i32>* - %wide.load = load <4 x i32>, <4 x i32>* %1, align 16 - %2 = getelementptr i32, i32* %0, i64 4 - %3 = bitcast i32* %2 to <4 x i32>* - %wide.load5 = load <4 x i32>, <4 x i32>* %3, align 16 - %4 = add nsw <4 x i32> %wide.load, %vec.phi - %5 = add nsw <4 x i32> 
%wide.load5, %vec.phi4 - %index.next = add nuw nsw i64 %index, 8 - %6 = getelementptr inbounds [1024 x i32], [1024 x i32]* @a, i64 0, i64 %index.next - %7 = bitcast i32* %6 to <4 x i32>* - %wide.load.1 = load <4 x i32>, <4 x i32>* %7, align 16 - %8 = getelementptr i32, i32* %6, i64 4 - %9 = bitcast i32* %8 to <4 x i32>* - %wide.load5.1 = load <4 x i32>, <4 x i32>* %9, align 16 - %10 = add nsw <4 x i32> %wide.load.1, %4 - %11 = add nsw <4 x i32> %wide.load5.1, %5 - %index.next.1 = add nsw i64 %index, 16 - %12 = getelementptr inbounds [1024 x i32], [1024 x i32]* @a, i64 0, i64 %index.next.1 - %13 = bitcast i32* %12 to <4 x i32>* - %wide.load.2 = load <4 x i32>, <4 x i32>* %13, align 16 - %14 = getelementptr i32, i32* %12, i64 4 - %15 = bitcast i32* %14 to <4 x i32>* - %wide.load5.2 = load <4 x i32>, <4 x i32>* %15, align 16 - %16 = add nsw <4 x i32> %wide.load.2, %10 - %17 = add nsw <4 x i32> %wide.load5.2, %11 - %index.next.2 = add nsw i64 %index, 24 - %18 = getelementptr inbounds [1024 x i32], [1024 x i32]* @a, i64 0, i64 %index.next.2 - %19 = bitcast i32* %18 to <4 x i32>* - %wide.load.3 = load <4 x i32>, <4 x i32>* %19, align 16 - %20 = getelementptr i32, i32* %18, i64 4 - %21 = bitcast i32* %20 to <4 x i32>* - %wide.load5.3 = load <4 x i32>, <4 x i32>* %21, align 16 - %22 = add nsw <4 x i32> %wide.load.3, %16 - %23 = add nsw <4 x i32> %wide.load5.3, %17 - %index.next.3 = add nsw i64 %index, 32 - %24 = getelementptr inbounds [1024 x i32], [1024 x i32]* @a, i64 0, i64 %index.next.3 - %25 = bitcast i32* %24 to <4 x i32>* - %wide.load.4 = load <4 x i32>, <4 x i32>* %25, align 16 - %26 = getelementptr i32, i32* %24, i64 4 - %27 = bitcast i32* %26 to <4 x i32>* - %wide.load5.4 = load <4 x i32>, <4 x i32>* %27, align 16 - %28 = add nsw <4 x i32> %wide.load.4, %22 - %29 = add nsw <4 x i32> %wide.load5.4, %23 - %index.next.4 = add nsw i64 %index, 40 - %30 = icmp eq i64 %index.next.4, 1000 - br i1 %30, label %middle.block, label %vector.body - -middle.block: - %.lcssa10 = phi <4 x i32> [ %29, %vector.body ] - %.lcssa = phi <4 x i32> [ %28, %vector.body ] - %bin.rdx = add <4 x i32> %.lcssa10, %.lcssa - %rdx.shuf = shufflevector <4 x i32> %bin.rdx, <4 x i32> undef, <4 x i32> - %bin.rdx6 = add <4 x i32> %bin.rdx, %rdx.shuf - %rdx.shuf7 = shufflevector <4 x i32> %bin.rdx6, <4 x i32> undef, <4 x i32> - %bin.rdx8 = add <4 x i32> %bin.rdx6, %rdx.shuf7 - %31 = extractelement <4 x i32> %bin.rdx8, i32 0 - ret i32 %31 -} - -define i32 @reduce_and() { -; CHECK-LABEL: reduce_and -; CHECK: Detected a reduction operation: {{.*}} and -; CHECK: Detected a reduction operation: {{.*}} and -; CHECK: Detected a reduction operation: {{.*}} and -; CHECK: Detected a reduction operation: {{.*}} and -; CHECK: Detected a reduction operation: {{.*}} and -; CHECK: Detected a reduction operation: {{.*}} and -; CHECK: Detected a reduction operation: {{.*}} and -; CHECK: Detected a reduction operation: {{.*}} and -; CHECK: Detected a reduction operation: {{.*}} and - -entry: - br label %vector.body - -vector.body: - %lsr.iv = phi i64 [ %lsr.iv.next, %vector.body ], [ -4096, %entry ] - %vec.phi = phi <4 x i32> [ , %entry ], [ %6, %vector.body ] - %vec.phi9 = phi <4 x i32> [ , %entry ], [ %7, %vector.body ] - %uglygep33 = getelementptr i8, i8* bitcast ([1024 x i32]* @a to i8*), i64 %lsr.iv - %uglygep3334 = bitcast i8* %uglygep33 to <4 x i32>* - %scevgep35 = getelementptr <4 x i32>, <4 x i32>* %uglygep3334, i64 256 - %wide.load = load <4 x i32>, <4 x i32>* %scevgep35, align 16 - %scevgep36 = getelementptr <4 x i32>, <4 x i32>* 
%uglygep3334, i64 257 - %wide.load10 = load <4 x i32>, <4 x i32>* %scevgep36, align 16 - %0 = and <4 x i32> %wide.load, %vec.phi - %1 = and <4 x i32> %wide.load10, %vec.phi9 - %uglygep30 = getelementptr i8, i8* bitcast ([1024 x i32]* @a to i8*), i64 %lsr.iv - %uglygep3031 = bitcast i8* %uglygep30 to <4 x i32>* - %scevgep32 = getelementptr <4 x i32>, <4 x i32>* %uglygep3031, i64 258 - %wide.load.1 = load <4 x i32>, <4 x i32>* %scevgep32, align 16 - %uglygep27 = getelementptr i8, i8* bitcast ([1024 x i32]* @a to i8*), i64 %lsr.iv - %uglygep2728 = bitcast i8* %uglygep27 to <4 x i32>* - %scevgep29 = getelementptr <4 x i32>, <4 x i32>* %uglygep2728, i64 259 - %wide.load10.1 = load <4 x i32>, <4 x i32>* %scevgep29, align 16 - %2 = and <4 x i32> %wide.load.1, %0 - %3 = and <4 x i32> %wide.load10.1, %1 - %uglygep24 = getelementptr i8, i8* bitcast ([1024 x i32]* @a to i8*), i64 %lsr.iv - %uglygep2425 = bitcast i8* %uglygep24 to <4 x i32>* - %scevgep26 = getelementptr <4 x i32>, <4 x i32>* %uglygep2425, i64 260 - %wide.load.2 = load <4 x i32>, <4 x i32>* %scevgep26, align 16 - %uglygep21 = getelementptr i8, i8* bitcast ([1024 x i32]* @a to i8*), i64 %lsr.iv - %uglygep2122 = bitcast i8* %uglygep21 to <4 x i32>* - %scevgep23 = getelementptr <4 x i32>, <4 x i32>* %uglygep2122, i64 261 - %wide.load10.2 = load <4 x i32>, <4 x i32>* %scevgep23, align 16 - %4 = and <4 x i32> %wide.load.2, %2 - %5 = and <4 x i32> %wide.load10.2, %3 - %uglygep18 = getelementptr i8, i8* bitcast ([1024 x i32]* @a to i8*), i64 %lsr.iv - %uglygep1819 = bitcast i8* %uglygep18 to <4 x i32>* - %scevgep20 = getelementptr <4 x i32>, <4 x i32>* %uglygep1819, i64 262 - %wide.load.3 = load <4 x i32>, <4 x i32>* %scevgep20, align 16 - %uglygep = getelementptr i8, i8* bitcast ([1024 x i32]* @a to i8*), i64 %lsr.iv - %uglygep17 = bitcast i8* %uglygep to <4 x i32>* - %scevgep = getelementptr <4 x i32>, <4 x i32>* %uglygep17, i64 263 - %wide.load10.3 = load <4 x i32>, <4 x i32>* %scevgep, align 16 - %6 = and <4 x i32> %wide.load.3, %4 - %7 = and <4 x i32> %wide.load10.3, %5 - %lsr.iv.next = add nsw i64 %lsr.iv, 128 - %8 = icmp eq i64 %lsr.iv.next, 0 - br i1 %8, label %middle.block, label %vector.body - -middle.block: - %bin.rdx = and <4 x i32> %7, %6 - %rdx.shuf = shufflevector <4 x i32> %bin.rdx, <4 x i32> undef, <4 x i32> - %bin.rdx11 = and <4 x i32> %bin.rdx, %rdx.shuf - %rdx.shuf12 = shufflevector <4 x i32> %bin.rdx11, <4 x i32> undef, <4 x i32> - %bin.rdx13 = and <4 x i32> %bin.rdx11, %rdx.shuf12 - %9 = extractelement <4 x i32> %bin.rdx13, i32 0 - ret i32 %9 -} - -define float @reduce_add_float(float* nocapture readonly %a) { -; CHECK-LABEL: reduce_add_float -; CHECK: Detected a reduction operation: {{.*}} fadd fast -; CHECK: Detected a reduction operation: {{.*}} fadd fast -; CHECK: Detected a reduction operation: {{.*}} fadd fast -; CHECK: Detected a reduction operation: {{.*}} fadd fast -; CHECK: Detected a reduction operation: {{.*}} fadd fast -; CHECK: Detected a reduction operation: {{.*}} fadd fast -; CHECK: Detected a reduction operation: {{.*}} fadd fast -; CHECK: Detected a reduction operation: {{.*}} fadd fast -; CHECK: Detected a reduction operation: {{.*}} fadd fast -; CHECK: Detected a reduction operation: {{.*}} fadd fast -; CHECK: Detected a reduction operation: {{.*}} fadd fast -; -entry: - br label %vector.body - -vector.body: - %index = phi i64 [ 0, %entry ], [ %index.next.4, %vector.body ] - %vec.phi = phi <4 x float> [ zeroinitializer, %entry ], [ %28, %vector.body ] - %vec.phi9 = phi <4 x float> [ zeroinitializer, 
%entry ], [ %29, %vector.body ] - %0 = getelementptr inbounds float, float* %a, i64 %index - %1 = bitcast float* %0 to <4 x float>* - %wide.load = load <4 x float>, <4 x float>* %1, align 4 - %2 = getelementptr float, float* %0, i64 4 - %3 = bitcast float* %2 to <4 x float>* - %wide.load10 = load <4 x float>, <4 x float>* %3, align 4 - %4 = fadd fast <4 x float> %wide.load, %vec.phi - %5 = fadd fast <4 x float> %wide.load10, %vec.phi9 - %index.next = add nuw nsw i64 %index, 8 - %6 = getelementptr inbounds float, float* %a, i64 %index.next - %7 = bitcast float* %6 to <4 x float>* - %wide.load.1 = load <4 x float>, <4 x float>* %7, align 4 - %8 = getelementptr float, float* %6, i64 4 - %9 = bitcast float* %8 to <4 x float>* - %wide.load10.1 = load <4 x float>, <4 x float>* %9, align 4 - %10 = fadd fast <4 x float> %wide.load.1, %4 - %11 = fadd fast <4 x float> %wide.load10.1, %5 - %index.next.1 = add nsw i64 %index, 16 - %12 = getelementptr inbounds float, float* %a, i64 %index.next.1 - %13 = bitcast float* %12 to <4 x float>* - %wide.load.2 = load <4 x float>, <4 x float>* %13, align 4 - %14 = getelementptr float, float* %12, i64 4 - %15 = bitcast float* %14 to <4 x float>* - %wide.load10.2 = load <4 x float>, <4 x float>* %15, align 4 - %16 = fadd fast <4 x float> %wide.load.2, %10 - %17 = fadd fast <4 x float> %wide.load10.2, %11 - %index.next.2 = add nsw i64 %index, 24 - %18 = getelementptr inbounds float, float* %a, i64 %index.next.2 - %19 = bitcast float* %18 to <4 x float>* - %wide.load.3 = load <4 x float>, <4 x float>* %19, align 4 - %20 = getelementptr float, float* %18, i64 4 - %21 = bitcast float* %20 to <4 x float>* - %wide.load10.3 = load <4 x float>, <4 x float>* %21, align 4 - %22 = fadd fast <4 x float> %wide.load.3, %16 - %23 = fadd fast <4 x float> %wide.load10.3, %17 - %index.next.3 = add nsw i64 %index, 32 - %24 = getelementptr inbounds float, float* %a, i64 %index.next.3 - %25 = bitcast float* %24 to <4 x float>* - %wide.load.4 = load <4 x float>, <4 x float>* %25, align 4 - %26 = getelementptr float, float* %24, i64 4 - %27 = bitcast float* %26 to <4 x float>* - %wide.load10.4 = load <4 x float>, <4 x float>* %27, align 4 - %28 = fadd fast <4 x float> %wide.load.4, %22 - %29 = fadd fast <4 x float> %wide.load10.4, %23 - %index.next.4 = add nsw i64 %index, 40 - %30 = icmp eq i64 %index.next.4, 1000 - br i1 %30, label %middle.block, label %vector.body - -middle.block: - %.lcssa15 = phi <4 x float> [ %29, %vector.body ] - %.lcssa = phi <4 x float> [ %28, %vector.body ] - %bin.rdx = fadd fast <4 x float> %.lcssa15, %.lcssa - %rdx.shuf = shufflevector <4 x float> %bin.rdx, <4 x float> undef, <4 x i32> - %bin.rdx11 = fadd fast <4 x float> %bin.rdx, %rdx.shuf - %rdx.shuf12 = shufflevector <4 x float> %bin.rdx11, <4 x float> undef, <4 x i32> - %bin.rdx13 = fadd fast <4 x float> %bin.rdx11, %rdx.shuf12 - %31 = extractelement <4 x float> %bin.rdx13, i32 0 - ret float %31 -} diff --git a/llvm/test/CodeGen/X86/O3-pipeline.ll b/llvm/test/CodeGen/X86/O3-pipeline.ll index c72676b..f22c701 100644 --- a/llvm/test/CodeGen/X86/O3-pipeline.ll +++ b/llvm/test/CodeGen/X86/O3-pipeline.ll @@ -51,6 +51,7 @@ ; CHECK-NEXT: Expand reduction intrinsics ; CHECK-NEXT: Dominator Tree Construction ; CHECK-NEXT: Interleaved Access Pass +; CHECK-NEXT: X86 Partial Reduction ; CHECK-NEXT: Expand indirectbr instructions ; CHECK-NEXT: Dominator Tree Construction ; CHECK-NEXT: Natural Loop Information diff --git a/llvm/test/CodeGen/X86/madd.ll b/llvm/test/CodeGen/X86/madd.ll index 43bb8ee..cad6f61 100644 --- 
a/llvm/test/CodeGen/X86/madd.ll +++ b/llvm/test/CodeGen/X86/madd.ll @@ -236,10 +236,10 @@ define i32 @_Z10test_shortPsS_i_512(i16* nocapture readonly, i16* nocapture read ; AVX1-NEXT: # =>This Inner Loop Header: Depth=1 ; AVX1-NEXT: vmovdqu (%rsi,%rcx,2), %xmm2 ; AVX1-NEXT: vmovdqu 16(%rsi,%rcx,2), %xmm3 +; AVX1-NEXT: vpmaddwd (%rdi,%rcx,2), %xmm2, %xmm2 ; AVX1-NEXT: vpmaddwd 16(%rdi,%rcx,2), %xmm3, %xmm3 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4 ; AVX1-NEXT: vpaddd %xmm4, %xmm3, %xmm3 -; AVX1-NEXT: vpmaddwd (%rdi,%rcx,2), %xmm2, %xmm2 ; AVX1-NEXT: vpaddd %xmm1, %xmm2, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 ; AVX1-NEXT: addq $16, %rcx @@ -407,16 +407,16 @@ define i32 @_Z10test_shortPsS_i_1024(i16* nocapture readonly, i16* nocapture rea ; AVX1-NEXT: vmovdqu 16(%rsi,%rcx,2), %xmm4 ; AVX1-NEXT: vmovdqu 32(%rsi,%rcx,2), %xmm5 ; AVX1-NEXT: vmovdqu 48(%rsi,%rcx,2), %xmm6 +; AVX1-NEXT: vpmaddwd (%rdi,%rcx,2), %xmm3, %xmm3 +; AVX1-NEXT: vpmaddwd 16(%rdi,%rcx,2), %xmm4, %xmm4 +; AVX1-NEXT: vpmaddwd 32(%rdi,%rcx,2), %xmm5, %xmm5 ; AVX1-NEXT: vpmaddwd 48(%rdi,%rcx,2), %xmm6, %xmm6 ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm7 ; AVX1-NEXT: vpaddd %xmm7, %xmm6, %xmm6 -; AVX1-NEXT: vpmaddwd 32(%rdi,%rcx,2), %xmm5, %xmm5 ; AVX1-NEXT: vpaddd %xmm2, %xmm5, %xmm2 ; AVX1-NEXT: vinsertf128 $1, %xmm6, %ymm2, %ymm2 -; AVX1-NEXT: vpmaddwd 16(%rdi,%rcx,2), %xmm4, %xmm4 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm5 ; AVX1-NEXT: vpaddd %xmm5, %xmm4, %xmm4 -; AVX1-NEXT: vpmaddwd (%rdi,%rcx,2), %xmm3, %xmm3 ; AVX1-NEXT: vpaddd %xmm1, %xmm3, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm1 ; AVX1-NEXT: addq $16, %rcx @@ -453,10 +453,10 @@ define i32 @_Z10test_shortPsS_i_1024(i16* nocapture readonly, i16* nocapture rea ; AVX2-NEXT: # =>This Inner Loop Header: Depth=1 ; AVX2-NEXT: vmovdqu (%rsi,%rcx,2), %ymm3 ; AVX2-NEXT: vmovdqu 32(%rsi,%rcx,2), %ymm4 -; AVX2-NEXT: vpmaddwd 32(%rdi,%rcx,2), %ymm4, %ymm4 -; AVX2-NEXT: vpaddd %ymm2, %ymm4, %ymm2 ; AVX2-NEXT: vpmaddwd (%rdi,%rcx,2), %ymm3, %ymm3 ; AVX2-NEXT: vpaddd %ymm1, %ymm3, %ymm1 +; AVX2-NEXT: vpmaddwd 32(%rdi,%rcx,2), %ymm4, %ymm3 +; AVX2-NEXT: vpaddd %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: addq $16, %rcx ; AVX2-NEXT: cmpq %rcx, %rax ; AVX2-NEXT: jne .LBB3_1 @@ -779,18 +779,18 @@ define i32 @_Z9test_charPcS_i_512(i8* nocapture readonly, i8* nocapture readonly ; SSE2-NEXT: # =>This Inner Loop Header: Depth=1 ; SSE2-NEXT: movdqu (%rdi,%rcx), %xmm3 ; SSE2-NEXT: movdqu (%rsi,%rcx), %xmm4 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm3[8],xmm5[9],xmm3[9],xmm5[10],xmm3[10],xmm5[11],xmm3[11],xmm5[12],xmm3[12],xmm5[13],xmm3[13],xmm5[14],xmm3[14],xmm5[15],xmm3[15] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3],xmm5[4],xmm3[4],xmm5[5],xmm3[5],xmm5[6],xmm3[6],xmm5[7],xmm3[7] ; SSE2-NEXT: psraw $8, %xmm5 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm4[8],xmm6[9],xmm4[9],xmm6[10],xmm4[10],xmm6[11],xmm4[11],xmm6[12],xmm4[12],xmm6[13],xmm4[13],xmm6[14],xmm4[14],xmm6[15],xmm4[15] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm4[0],xmm6[1],xmm4[1],xmm6[2],xmm4[2],xmm6[3],xmm4[3],xmm6[4],xmm4[4],xmm6[5],xmm4[5],xmm6[6],xmm4[6],xmm6[7],xmm4[7] ; SSE2-NEXT: psraw $8, %xmm6 ; SSE2-NEXT: pmaddwd %xmm5, %xmm6 -; SSE2-NEXT: paddd %xmm6, %xmm1 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE2-NEXT: paddd %xmm6, %xmm2 +; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; SSE2-NEXT: psraw $8, %xmm3 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm4 = 
xmm4[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; SSE2-NEXT: psraw $8, %xmm4 ; SSE2-NEXT: pmaddwd %xmm3, %xmm4 -; SSE2-NEXT: paddd %xmm4, %xmm2 +; SSE2-NEXT: paddd %xmm4, %xmm1 ; SSE2-NEXT: addq $16, %rcx ; SSE2-NEXT: cmpq %rcx, %rax ; SSE2-NEXT: jne .LBB6_1 @@ -814,16 +814,16 @@ define i32 @_Z9test_charPcS_i_512(i8* nocapture readonly, i8* nocapture readonly ; AVX1-NEXT: .p2align 4, 0x90 ; AVX1-NEXT: .LBB6_1: # %vector.body ; AVX1-NEXT: # =>This Inner Loop Header: Depth=1 -; AVX1-NEXT: vpmovsxbw (%rdi,%rcx), %xmm2 -; AVX1-NEXT: vpmovsxbw 8(%rdi,%rcx), %xmm3 -; AVX1-NEXT: vpmovsxbw (%rsi,%rcx), %xmm4 -; AVX1-NEXT: vpmaddwd %xmm2, %xmm4, %xmm2 +; AVX1-NEXT: vpmovsxbw 8(%rdi,%rcx), %xmm2 +; AVX1-NEXT: vpmovsxbw (%rdi,%rcx), %xmm3 ; AVX1-NEXT: vpmovsxbw 8(%rsi,%rcx), %xmm4 +; AVX1-NEXT: vpmaddwd %xmm2, %xmm4, %xmm2 +; AVX1-NEXT: vpmovsxbw (%rsi,%rcx), %xmm4 ; AVX1-NEXT: vpmaddwd %xmm3, %xmm4, %xmm3 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4 -; AVX1-NEXT: vpaddd %xmm4, %xmm3, %xmm3 -; AVX1-NEXT: vpaddd %xmm1, %xmm2, %xmm1 -; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 +; AVX1-NEXT: vpaddd %xmm4, %xmm2, %xmm2 +; AVX1-NEXT: vpaddd %xmm1, %xmm3, %xmm1 +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 ; AVX1-NEXT: addq $16, %rcx ; AVX1-NEXT: cmpq %rcx, %rax ; AVX1-NEXT: jne .LBB6_1 @@ -943,34 +943,34 @@ define i32 @_Z9test_charPcS_i_1024(i8* nocapture readonly, i8* nocapture readonl ; SSE2-NEXT: .p2align 4, 0x90 ; SSE2-NEXT: .LBB7_1: # %vector.body ; SSE2-NEXT: # =>This Inner Loop Header: Depth=1 -; SSE2-NEXT: movdqu (%rdi,%rcx), %xmm10 -; SSE2-NEXT: movdqu 16(%rdi,%rcx), %xmm7 -; SSE2-NEXT: movdqu (%rsi,%rcx), %xmm9 -; SSE2-NEXT: movdqu 16(%rsi,%rcx), %xmm0 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm7[8],xmm5[9],xmm7[9],xmm5[10],xmm7[10],xmm5[11],xmm7[11],xmm5[12],xmm7[12],xmm5[13],xmm7[13],xmm5[14],xmm7[14],xmm5[15],xmm7[15] +; SSE2-NEXT: movdqu (%rdi,%rcx), %xmm7 +; SSE2-NEXT: movdqu 16(%rdi,%rcx), %xmm10 +; SSE2-NEXT: movdqu (%rsi,%rcx), %xmm0 +; SSE2-NEXT: movdqu 16(%rsi,%rcx), %xmm9 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm7[0],xmm5[1],xmm7[1],xmm5[2],xmm7[2],xmm5[3],xmm7[3],xmm5[4],xmm7[4],xmm5[5],xmm7[5],xmm5[6],xmm7[6],xmm5[7],xmm7[7] ; SSE2-NEXT: psraw $8, %xmm5 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm0[8],xmm6[9],xmm0[9],xmm6[10],xmm0[10],xmm6[11],xmm0[11],xmm6[12],xmm0[12],xmm6[13],xmm0[13],xmm6[14],xmm0[14],xmm6[15],xmm0[15] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1],xmm6[2],xmm0[2],xmm6[3],xmm0[3],xmm6[4],xmm0[4],xmm6[5],xmm0[5],xmm6[6],xmm0[6],xmm6[7],xmm0[7] ; SSE2-NEXT: psraw $8, %xmm6 ; SSE2-NEXT: pmaddwd %xmm5, %xmm6 -; SSE2-NEXT: paddd %xmm6, %xmm3 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm7[0],xmm5[1],xmm7[1],xmm5[2],xmm7[2],xmm5[3],xmm7[3],xmm5[4],xmm7[4],xmm5[5],xmm7[5],xmm5[6],xmm7[6],xmm5[7],xmm7[7] +; SSE2-NEXT: paddd %xmm6, %xmm2 +; SSE2-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm7[8],xmm5[9],xmm7[9],xmm5[10],xmm7[10],xmm5[11],xmm7[11],xmm5[12],xmm7[12],xmm5[13],xmm7[13],xmm5[14],xmm7[14],xmm5[15],xmm7[15] ; SSE2-NEXT: psraw $8, %xmm5 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE2-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; SSE2-NEXT: psraw $8, %xmm0 ; SSE2-NEXT: pmaddwd %xmm5, %xmm0 -; SSE2-NEXT: paddd %xmm0, %xmm1 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm0 = 
xmm0[8],xmm10[8],xmm0[9],xmm10[9],xmm0[10],xmm10[10],xmm0[11],xmm10[11],xmm0[12],xmm10[12],xmm0[13],xmm10[13],xmm0[14],xmm10[14],xmm0[15],xmm10[15]
+; SSE2-NEXT: paddd %xmm0, %xmm4
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm10[0],xmm0[1],xmm10[1],xmm0[2],xmm10[2],xmm0[3],xmm10[3],xmm0[4],xmm10[4],xmm0[5],xmm10[5],xmm0[6],xmm10[6],xmm0[7],xmm10[7]
 ; SSE2-NEXT: psraw $8, %xmm0
-; SSE2-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm9[8],xmm5[9],xmm9[9],xmm5[10],xmm9[10],xmm5[11],xmm9[11],xmm5[12],xmm9[12],xmm5[13],xmm9[13],xmm5[14],xmm9[14],xmm5[15],xmm9[15]
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm9[0],xmm5[1],xmm9[1],xmm5[2],xmm9[2],xmm5[3],xmm9[3],xmm5[4],xmm9[4],xmm5[5],xmm9[5],xmm5[6],xmm9[6],xmm5[7],xmm9[7]
 ; SSE2-NEXT: psraw $8, %xmm5
 ; SSE2-NEXT: pmaddwd %xmm0, %xmm5
-; SSE2-NEXT: paddd %xmm5, %xmm4
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm10[0],xmm0[1],xmm10[1],xmm0[2],xmm10[2],xmm0[3],xmm10[3],xmm0[4],xmm10[4],xmm0[5],xmm10[5],xmm0[6],xmm10[6],xmm0[7],xmm10[7]
+; SSE2-NEXT: paddd %xmm5, %xmm1
+; SSE2-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm10[8],xmm0[9],xmm10[9],xmm0[10],xmm10[10],xmm0[11],xmm10[11],xmm0[12],xmm10[12],xmm0[13],xmm10[13],xmm0[14],xmm10[14],xmm0[15],xmm10[15]
 ; SSE2-NEXT: psraw $8, %xmm0
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm9[0],xmm5[1],xmm9[1],xmm5[2],xmm9[2],xmm5[3],xmm9[3],xmm5[4],xmm9[4],xmm5[5],xmm9[5],xmm5[6],xmm9[6],xmm5[7],xmm9[7]
+; SSE2-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm9[8],xmm5[9],xmm9[9],xmm5[10],xmm9[10],xmm5[11],xmm9[11],xmm5[12],xmm9[12],xmm5[13],xmm9[13],xmm5[14],xmm9[14],xmm5[15],xmm9[15]
 ; SSE2-NEXT: psraw $8, %xmm5
 ; SSE2-NEXT: pmaddwd %xmm0, %xmm5
-; SSE2-NEXT: paddd %xmm5, %xmm2
+; SSE2-NEXT: paddd %xmm5, %xmm3
 ; SSE2-NEXT: addq $32, %rcx
 ; SSE2-NEXT: cmpq %rcx, %rax
 ; SSE2-NEXT: jne .LBB7_1
@@ -999,26 +999,26 @@ define i32 @_Z9test_charPcS_i_1024(i8* nocapture readonly, i8* nocapture readonl
 ; AVX1-NEXT: .p2align 4, 0x90
 ; AVX1-NEXT: .LBB7_1: # %vector.body
 ; AVX1-NEXT: # =>This Inner Loop Header: Depth=1
-; AVX1-NEXT: vpmovsxbw (%rdi,%rcx), %xmm3
-; AVX1-NEXT: vpmovsxbw 8(%rdi,%rcx), %xmm4
-; AVX1-NEXT: vpmovsxbw 16(%rdi,%rcx), %xmm5
-; AVX1-NEXT: vpmovsxbw 24(%rdi,%rcx), %xmm6
-; AVX1-NEXT: vpmovsxbw (%rsi,%rcx), %xmm7
+; AVX1-NEXT: vpmovsxbw 24(%rdi,%rcx), %xmm3
+; AVX1-NEXT: vpmovsxbw 16(%rdi,%rcx), %xmm4
+; AVX1-NEXT: vpmovsxbw 8(%rdi,%rcx), %xmm5
+; AVX1-NEXT: vpmovsxbw (%rdi,%rcx), %xmm6
+; AVX1-NEXT: vpmovsxbw 24(%rsi,%rcx), %xmm7
 ; AVX1-NEXT: vpmaddwd %xmm3, %xmm7, %xmm3
-; AVX1-NEXT: vpmovsxbw 8(%rsi,%rcx), %xmm7
-; AVX1-NEXT: vpmaddwd %xmm4, %xmm7, %xmm4
 ; AVX1-NEXT: vpmovsxbw 16(%rsi,%rcx), %xmm7
+; AVX1-NEXT: vpmaddwd %xmm4, %xmm7, %xmm4
+; AVX1-NEXT: vpmovsxbw 8(%rsi,%rcx), %xmm7
 ; AVX1-NEXT: vpmaddwd %xmm5, %xmm7, %xmm5
-; AVX1-NEXT: vpmovsxbw 24(%rsi,%rcx), %xmm7
+; AVX1-NEXT: vpmovsxbw (%rsi,%rcx), %xmm7
 ; AVX1-NEXT: vpmaddwd %xmm6, %xmm7, %xmm6
 ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm7
-; AVX1-NEXT: vpaddd %xmm7, %xmm6, %xmm6
-; AVX1-NEXT: vpaddd %xmm2, %xmm5, %xmm2
-; AVX1-NEXT: vinsertf128 $1, %xmm6, %ymm2, %ymm2
-; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm5
-; AVX1-NEXT: vpaddd %xmm5, %xmm4, %xmm4
-; AVX1-NEXT: vpaddd %xmm1, %xmm3, %xmm1
-; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm1
+; AVX1-NEXT: vpaddd %xmm7, %xmm3, %xmm3
+; AVX1-NEXT: vpaddd %xmm2, %xmm4, %xmm2
+; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
+; AVX1-NEXT: vpaddd %xmm3, %xmm5, %xmm3
+; AVX1-NEXT: vpaddd %xmm1, %xmm6, %xmm1
+; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1
 ; AVX1-NEXT: addq $32, %rcx
 ; AVX1-NEXT: cmpq %rcx, %rax
 ; AVX1-NEXT: jne .LBB7_1
@@ -1051,14 +1051,14 @@ define i32 @_Z9test_charPcS_i_1024(i8* nocapture readonly, i8* nocapture readonl
 ; AVX2-NEXT: .p2align 4, 0x90
 ; AVX2-NEXT: .LBB7_1: # %vector.body
 ; AVX2-NEXT: # =>This Inner Loop Header: Depth=1
-; AVX2-NEXT: vpmovsxbw (%rdi,%rcx), %ymm3
-; AVX2-NEXT: vpmovsxbw 16(%rdi,%rcx), %ymm4
-; AVX2-NEXT: vpmovsxbw (%rsi,%rcx), %ymm5
+; AVX2-NEXT: vpmovsxbw 16(%rdi,%rcx), %ymm3
+; AVX2-NEXT: vpmovsxbw (%rdi,%rcx), %ymm4
+; AVX2-NEXT: vpmovsxbw 16(%rsi,%rcx), %ymm5
 ; AVX2-NEXT: vpmaddwd %ymm3, %ymm5, %ymm3
-; AVX2-NEXT: vpaddd %ymm1, %ymm3, %ymm1
-; AVX2-NEXT: vpmovsxbw 16(%rsi,%rcx), %ymm3
-; AVX2-NEXT: vpmaddwd %ymm4, %ymm3, %ymm3
 ; AVX2-NEXT: vpaddd %ymm2, %ymm3, %ymm2
+; AVX2-NEXT: vpmovsxbw (%rsi,%rcx), %ymm3
+; AVX2-NEXT: vpmaddwd %ymm4, %ymm3, %ymm3
+; AVX2-NEXT: vpaddd %ymm1, %ymm3, %ymm1
 ; AVX2-NEXT: addq $32, %rcx
 ; AVX2-NEXT: cmpq %rcx, %rax
 ; AVX2-NEXT: jne .LBB7_1
@@ -1913,9 +1913,9 @@ define <8 x i32> @pmaddwd_16(<16 x i16> %A, <16 x i16> %B) {
 ;
 ; AVX1-LABEL: pmaddwd_16:
 ; AVX1: # %bb.0:
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
-; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
-; AVX1-NEXT: vpmaddwd %xmm3, %xmm2, %xmm2
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
+; AVX1-NEXT: vpmaddwd %xmm2, %xmm3, %xmm2
 ; AVX1-NEXT: vpmaddwd %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
 ; AVX1-NEXT: retq
@@ -1944,16 +1944,16 @@ define <16 x i32> @pmaddwd_32(<32 x i16> %A, <32 x i16> %B) {
 ;
 ; AVX1-LABEL: pmaddwd_32:
 ; AVX1: # %bb.0:
-; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4
+; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm4
 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5
-; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm6
-; AVX1-NEXT: vpmaddwd %xmm6, %xmm4, %xmm4
-; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm6
-; AVX1-NEXT: vpmaddwd %xmm6, %xmm5, %xmm5
+; AVX1-NEXT: vpmaddwd %xmm4, %xmm5, %xmm4
 ; AVX1-NEXT: vpmaddwd %xmm2, %xmm0, %xmm0
-; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm0
+; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0
+; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm2
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4
+; AVX1-NEXT: vpmaddwd %xmm2, %xmm4, %xmm2
 ; AVX1-NEXT: vpmaddwd %xmm3, %xmm1, %xmm1
-; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm1
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
 ; AVX1-NEXT: retq
 ;
 ; AVX2-LABEL: pmaddwd_32:
@@ -1964,9 +1964,9 @@ define <16 x i32> @pmaddwd_32(<32 x i16> %A, <32 x i16> %B) {
 ;
 ; AVX512F-LABEL: pmaddwd_32:
 ; AVX512F: # %bb.0:
-; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm2
-; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm3
-; AVX512F-NEXT: vpmaddwd %ymm3, %ymm2, %ymm2
+; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm2
+; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm3
+; AVX512F-NEXT: vpmaddwd %ymm2, %ymm3, %ymm2
 ; AVX512F-NEXT: vpmaddwd %ymm1, %ymm0, %ymm0
 ; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
 ; AVX512F-NEXT: retq
@@ -2126,9 +2126,9 @@ define <8 x i32> @jumbled_indices8(<16 x i16> %A, <16 x i16> %B) {
 ;
 ; AVX1-LABEL: jumbled_indices8:
 ; AVX1: # %bb.0:
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
-; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
-; AVX1-NEXT: vpmaddwd %xmm3, %xmm2, %xmm2
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
+; AVX1-NEXT: vpmaddwd %xmm2, %xmm3, %xmm2
 ; AVX1-NEXT: vpmaddwd %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
 ; AVX1-NEXT: retq
@@ -2157,16 +2157,16 @@ define <16 x i32> @jumbled_indices16(<32 x i16> %A, <32 x i16> %B) {
 ;
 ; AVX1-LABEL: jumbled_indices16:
 ; AVX1: # %bb.0:
-; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4
+; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm4
 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5
-; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm6
-; AVX1-NEXT: vpmaddwd %xmm6, %xmm4, %xmm4
-; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm6
-; AVX1-NEXT: vpmaddwd %xmm6, %xmm5, %xmm5
+; AVX1-NEXT: vpmaddwd %xmm4, %xmm5, %xmm4
 ; AVX1-NEXT: vpmaddwd %xmm2, %xmm0, %xmm0
-; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm0
+; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0
+; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm2
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4
+; AVX1-NEXT: vpmaddwd %xmm2, %xmm4, %xmm2
 ; AVX1-NEXT: vpmaddwd %xmm3, %xmm1, %xmm1
-; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm1
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
 ; AVX1-NEXT: retq
 ;
 ; AVX2-LABEL: jumbled_indices16:
@@ -2177,9 +2177,9 @@ define <16 x i32> @jumbled_indices16(<32 x i16> %A, <32 x i16> %B) {
 ;
 ; AVX512F-LABEL: jumbled_indices16:
 ; AVX512F: # %bb.0:
-; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm2
-; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm3
-; AVX512F-NEXT: vpmaddwd %ymm3, %ymm2, %ymm2
+; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm2
+; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm3
+; AVX512F-NEXT: vpmaddwd %ymm2, %ymm3, %ymm2
 ; AVX512F-NEXT: vpmaddwd %ymm1, %ymm0, %ymm0
 ; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
 ; AVX512F-NEXT: retq
@@ -2221,26 +2221,26 @@ define <32 x i32> @jumbled_indices32(<64 x i16> %A, <64 x i16> %B) {
 ;
 ; AVX1-LABEL: jumbled_indices32:
 ; AVX1: # %bb.0:
-; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm8
-; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm9
-; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm10
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm11
-; AVX1-NEXT: vextractf128 $1, %ymm7, %xmm12
-; AVX1-NEXT: vpmaddwd %xmm12, %xmm8, %xmm8
-; AVX1-NEXT: vextractf128 $1, %ymm6, %xmm12
-; AVX1-NEXT: vpmaddwd %xmm12, %xmm9, %xmm9
-; AVX1-NEXT: vextractf128 $1, %ymm5, %xmm12
-; AVX1-NEXT: vpmaddwd %xmm12, %xmm10, %xmm10
-; AVX1-NEXT: vextractf128 $1, %ymm4, %xmm12
-; AVX1-NEXT: vpmaddwd %xmm12, %xmm11, %xmm11
+; AVX1-NEXT: vextractf128 $1, %ymm4, %xmm8
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm9
+; AVX1-NEXT: vpmaddwd %xmm8, %xmm9, %xmm8
 ; AVX1-NEXT: vpmaddwd %xmm4, %xmm0, %xmm0
-; AVX1-NEXT: vinsertf128 $1, %xmm11, %ymm0, %ymm0
+; AVX1-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm0
+; AVX1-NEXT: vextractf128 $1, %ymm5, %xmm8
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4
+; AVX1-NEXT: vpmaddwd %xmm4, %xmm8, %xmm4
 ; AVX1-NEXT: vpmaddwd %xmm5, %xmm1, %xmm1
-; AVX1-NEXT: vinsertf128 $1, %xmm10, %ymm1, %ymm1
+; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm1
+; AVX1-NEXT: vextractf128 $1, %ymm6, %xmm4
+; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm5
+; AVX1-NEXT: vpmaddwd %xmm4, %xmm5, %xmm4
 ; AVX1-NEXT: vpmaddwd %xmm6, %xmm2, %xmm2
-; AVX1-NEXT: vinsertf128 $1, %xmm9, %ymm2, %ymm2
+; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2
+; AVX1-NEXT: vextractf128 $1, %ymm7, %xmm4
+; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm5
+; AVX1-NEXT: vpmaddwd %xmm4, %xmm5, %xmm4
 ; AVX1-NEXT: vpmaddwd %xmm7, %xmm3, %xmm3
-; AVX1-NEXT: vinsertf128 $1, %xmm8, %ymm3, %ymm3
+; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3
 ; AVX1-NEXT: retq
 ;
 ; AVX2-LABEL: jumbled_indices32:
@@ -2656,7 +2656,7 @@ define i32 @madd_double_reduction(<8 x i16>* %arg, <8 x i16>* %arg1, <8 x i16>*
 ; AVX-NEXT: vmovdqu (%rdx), %xmm1
 ; AVX-NEXT: vpmaddwd (%rcx), %xmm1, %xmm1
 ; AVX-NEXT: vpmaddwd (%rsi), %xmm0, %xmm0
-; AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vpaddd %xmm0, %xmm1, %xmm0
 ; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
 ; AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0
 ; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
@@ -2698,14 +2698,14 @@ define i32 @madd_quad_reduction(<8 x i16>* %arg, <8 x i16>* %arg1, <8 x i16>* %a
 ; SSE2-NEXT: movdqu (%r8), %xmm0
 ; SSE2-NEXT: movdqu (%r9), %xmm3
 ; SSE2-NEXT: pmaddwd %xmm0, %xmm3
-; SSE2-NEXT: paddd %xmm2, %xmm3
+; SSE2-NEXT: paddd %xmm1, %xmm3
 ; SSE2-NEXT: movdqu (%rax), %xmm0
-; SSE2-NEXT: movdqu (%r10), %xmm2
-; SSE2-NEXT: pmaddwd %xmm0, %xmm2
-; SSE2-NEXT: paddd %xmm3, %xmm2
-; SSE2-NEXT: paddd %xmm1, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
-; SSE2-NEXT: paddd %xmm2, %xmm0
+; SSE2-NEXT: movdqu (%r10), %xmm1
+; SSE2-NEXT: pmaddwd %xmm0, %xmm1
+; SSE2-NEXT: paddd %xmm3, %xmm1
+; SSE2-NEXT: paddd %xmm2, %xmm1
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; SSE2-NEXT: paddd %xmm1, %xmm0
 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
 ; SSE2-NEXT: paddd %xmm0, %xmm1
 ; SSE2-NEXT: movd %xmm1, %eax
@@ -2721,11 +2721,11 @@ define i32 @madd_quad_reduction(<8 x i16>* %arg, <8 x i16>* %arg1, <8 x i16>* %a
 ; AVX-NEXT: vpmaddwd (%rsi), %xmm0, %xmm0
 ; AVX-NEXT: vmovdqu (%r8), %xmm2
 ; AVX-NEXT: vpmaddwd (%r9), %xmm2, %xmm2
-; AVX-NEXT: vpaddd %xmm2, %xmm1, %xmm1
+; AVX-NEXT: vpaddd %xmm2, %xmm0, %xmm0
 ; AVX-NEXT: vmovdqu (%rax), %xmm2
 ; AVX-NEXT: vpmaddwd (%r10), %xmm2, %xmm2
-; AVX-NEXT: vpaddd %xmm2, %xmm1, %xmm1
-; AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vpaddd %xmm2, %xmm0, %xmm0
+; AVX-NEXT: vpaddd %xmm0, %xmm1, %xmm0
 ; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
 ; AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0
 ; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
diff --git a/llvm/test/CodeGen/X86/min-legal-vector-width.ll b/llvm/test/CodeGen/X86/min-legal-vector-width.ll
index 543da1e..44ed303 100644
--- a/llvm/test/CodeGen/X86/min-legal-vector-width.ll
+++ b/llvm/test/CodeGen/X86/min-legal-vector-width.ll
@@ -180,14 +180,14 @@ define i32 @_Z9test_charPcS_i_256(i8* nocapture readonly, i8* nocapture readonly
 ; CHECK-NEXT: .p2align 4, 0x90
 ; CHECK-NEXT: .LBB8_1: # %vector.body
 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: vpmovsxbw (%rdi,%rcx), %ymm3
-; CHECK-NEXT: vpmovsxbw 16(%rdi,%rcx), %ymm4
-; CHECK-NEXT: vpmovsxbw (%rsi,%rcx), %ymm5
+; CHECK-NEXT: vpmovsxbw 16(%rdi,%rcx), %ymm3
+; CHECK-NEXT: vpmovsxbw (%rdi,%rcx), %ymm4
+; CHECK-NEXT: vpmovsxbw 16(%rsi,%rcx), %ymm5
 ; CHECK-NEXT: vpmaddwd %ymm3, %ymm5, %ymm3
-; CHECK-NEXT: vpaddd %ymm1, %ymm3, %ymm1
-; CHECK-NEXT: vpmovsxbw 16(%rsi,%rcx), %ymm3
-; CHECK-NEXT: vpmaddwd %ymm4, %ymm3, %ymm3
 ; CHECK-NEXT: vpaddd %ymm2, %ymm3, %ymm2
+; CHECK-NEXT: vpmovsxbw (%rsi,%rcx), %ymm3
+; CHECK-NEXT: vpmaddwd %ymm4, %ymm3, %ymm3
+; CHECK-NEXT: vpaddd %ymm1, %ymm3, %ymm1
 ; CHECK-NEXT: addq $32, %rcx
 ; CHECK-NEXT: cmpq %rcx, %rax
 ; CHECK-NEXT: jne .LBB8_1
diff --git a/llvm/test/CodeGen/X86/sad.ll b/llvm/test/CodeGen/X86/sad.ll
index 64845c8..6a74206 100644
--- a/llvm/test/CodeGen/X86/sad.ll
+++ b/llvm/test/CodeGen/X86/sad.ll
@@ -158,12 +158,12 @@ define i32 @sad_32i8() nounwind {
 ; SSE2-NEXT: .p2align 4, 0x90
 ; SSE2-NEXT: .LBB1_1: # %vector.body
 ; SSE2-NEXT: # =>This Inner Loop Header: Depth=1
-; SSE2-NEXT: movdqa a+1040(%rax), %xmm3
-; SSE2-NEXT: psadbw b+1040(%rax), %xmm3
-; SSE2-NEXT: paddd %xmm3, %xmm1
 ; SSE2-NEXT: movdqa a+1024(%rax), %xmm3
 ; SSE2-NEXT: psadbw b+1024(%rax), %xmm3
 ; SSE2-NEXT: paddd %xmm3, %xmm2
+; SSE2-NEXT: movdqa a+1040(%rax), %xmm3
+; SSE2-NEXT: psadbw b+1040(%rax), %xmm3
+; SSE2-NEXT: paddd %xmm3, %xmm1
 ; SSE2-NEXT: addq $4, %rax
 ; SSE2-NEXT: jne .LBB1_1
 ; SSE2-NEXT: # %bb.2: # %middle.block
@@ -188,14 +188,14 @@ define i32 @sad_32i8() nounwind {
 ; AVX1-NEXT: .p2align 4, 0x90
 ; AVX1-NEXT: .LBB1_1: # %vector.body
 ; AVX1-NEXT: # =>This Inner Loop Header: Depth=1
-; AVX1-NEXT: vmovdqa a+1040(%rax), %xmm2
-; AVX1-NEXT: vpsadbw b+1040(%rax), %xmm2, %xmm2
-; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
-; AVX1-NEXT: vpaddd %xmm3, %xmm2, %xmm2
-; AVX1-NEXT: vmovdqa a+1024(%rax), %xmm3
-; AVX1-NEXT: vpsadbw b+1024(%rax), %xmm3, %xmm3
-; AVX1-NEXT: vpaddd %xmm1, %xmm3, %xmm1
-; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
+; AVX1-NEXT: vmovdqa a+1024(%rax), %xmm2
+; AVX1-NEXT: vpsadbw b+1024(%rax), %xmm2, %xmm2
+; AVX1-NEXT: vmovdqa a+1040(%rax), %xmm3
+; AVX1-NEXT: vpsadbw b+1040(%rax), %xmm3, %xmm3
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4
+; AVX1-NEXT: vpaddd %xmm4, %xmm3, %xmm3
+; AVX1-NEXT: vpaddd %xmm1, %xmm2, %xmm1
+; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1
 ; AVX1-NEXT: addq $4, %rax
 ; AVX1-NEXT: jne .LBB1_1
 ; AVX1-NEXT: # %bb.2: # %middle.block
@@ -320,15 +320,15 @@ define i32 @sad_avx64i8() nounwind {
 ; SSE2-NEXT: .p2align 4, 0x90
 ; SSE2-NEXT: .LBB2_1: # %vector.body
 ; SSE2-NEXT: # =>This Inner Loop Header: Depth=1
-; SSE2-NEXT: movdqa a+1056(%rax), %xmm5
-; SSE2-NEXT: psadbw b+1056(%rax), %xmm5
-; SSE2-NEXT: paddd %xmm5, %xmm2
-; SSE2-NEXT: movdqa a+1040(%rax), %xmm5
-; SSE2-NEXT: psadbw b+1040(%rax), %xmm5
-; SSE2-NEXT: paddd %xmm5, %xmm3
 ; SSE2-NEXT: movdqa a+1024(%rax), %xmm5
 ; SSE2-NEXT: psadbw b+1024(%rax), %xmm5
 ; SSE2-NEXT: paddd %xmm5, %xmm0
+; SSE2-NEXT: movdqa a+1040(%rax), %xmm5
+; SSE2-NEXT: psadbw b+1040(%rax), %xmm5
+; SSE2-NEXT: paddd %xmm5, %xmm3
+; SSE2-NEXT: movdqa a+1056(%rax), %xmm5
+; SSE2-NEXT: psadbw b+1056(%rax), %xmm5
+; SSE2-NEXT: paddd %xmm5, %xmm2
 ; SSE2-NEXT: movdqa a+1072(%rax), %xmm5
 ; SSE2-NEXT: psadbw b+1072(%rax), %xmm5
 ; SSE2-NEXT: paddd %xmm5, %xmm1
@@ -364,22 +364,22 @@ define i32 @sad_avx64i8() nounwind {
 ; AVX1-NEXT: .p2align 4, 0x90
 ; AVX1-NEXT: .LBB2_1: # %vector.body
 ; AVX1-NEXT: # =>This Inner Loop Header: Depth=1
-; AVX1-NEXT: vmovdqa a+1072(%rax), %xmm3
-; AVX1-NEXT: vpsadbw b+1072(%rax), %xmm3, %xmm3
-; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm4
-; AVX1-NEXT: vpaddd %xmm4, %xmm3, %xmm3
-; AVX1-NEXT: vmovdqa a+1056(%rax), %xmm4
-; AVX1-NEXT: vpsadbw b+1056(%rax), %xmm4, %xmm4
-; AVX1-NEXT: vpaddd %xmm2, %xmm4, %xmm2
-; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
-; AVX1-NEXT: vmovdqa a+1040(%rax), %xmm3
-; AVX1-NEXT: vpsadbw b+1040(%rax), %xmm3, %xmm3
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
-; AVX1-NEXT: vpaddd %xmm4, %xmm3, %xmm3
-; AVX1-NEXT: vmovdqa a+1024(%rax), %xmm4
-; AVX1-NEXT: vpsadbw b+1024(%rax), %xmm4, %xmm4
-; AVX1-NEXT: vpaddd %xmm0, %xmm4, %xmm0
-; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0
+; AVX1-NEXT: vmovdqa a+1024(%rax), %xmm3
+; AVX1-NEXT: vpsadbw b+1024(%rax), %xmm3, %xmm3
+; AVX1-NEXT: vmovdqa a+1040(%rax), %xmm4
+; AVX1-NEXT: vpsadbw b+1040(%rax), %xmm4, %xmm4
+; AVX1-NEXT: vmovdqa a+1056(%rax), %xmm5
+; AVX1-NEXT: vpsadbw b+1056(%rax), %xmm5, %xmm5
+; AVX1-NEXT: vmovdqa a+1072(%rax), %xmm6
+; AVX1-NEXT: vpsadbw b+1072(%rax), %xmm6, %xmm6
+; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm7
+; AVX1-NEXT: vpaddd %xmm7, %xmm6, %xmm6
+; AVX1-NEXT: vpaddd %xmm2, %xmm5, %xmm2
+; AVX1-NEXT: vinsertf128 $1, %xmm6, %ymm2, %ymm2
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5
+; AVX1-NEXT: vpaddd %xmm5, %xmm4, %xmm4
+; AVX1-NEXT: vpaddd %xmm0, %xmm3, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0
 ; AVX1-NEXT: addq $4, %rax
 ; AVX1-NEXT: jne .LBB2_1
 ; AVX1-NEXT: # %bb.2: # %middle.block
@@ -416,12 +416,12 @@ define i32 @sad_avx64i8() nounwind {
 ; AVX2-NEXT: .p2align 4, 0x90
 ; AVX2-NEXT: .LBB2_1: # %vector.body
 ; AVX2-NEXT: # =>This Inner Loop Header: Depth=1
-; AVX2-NEXT: vmovdqa a+1056(%rax), %ymm3
-; AVX2-NEXT: vpsadbw b+1056(%rax), %ymm3, %ymm3
-; AVX2-NEXT: vpaddd %ymm2, %ymm3, %ymm2
 ; AVX2-NEXT: vmovdqa a+1024(%rax), %ymm3
 ; AVX2-NEXT: vpsadbw b+1024(%rax), %ymm3, %ymm3
 ; AVX2-NEXT: vpaddd %ymm1, %ymm3, %ymm1
+; AVX2-NEXT: vmovdqa a+1056(%rax), %ymm3
+; AVX2-NEXT: vpsadbw b+1056(%rax), %ymm3, %ymm3
+; AVX2-NEXT: vpaddd %ymm2, %ymm3, %ymm2
 ; AVX2-NEXT: addq $4, %rax
 ; AVX2-NEXT: jne .LBB2_1
 ; AVX2-NEXT: # %bb.2: # %middle.block
@@ -449,11 +449,11 @@ define i32 @sad_avx64i8() nounwind {
 ; AVX512F-NEXT: .p2align 4, 0x90
 ; AVX512F-NEXT: .LBB2_1: # %vector.body
 ; AVX512F-NEXT: # =>This Inner Loop Header: Depth=1
-; AVX512F-NEXT: vmovdqa a+1056(%rax), %ymm2
-; AVX512F-NEXT: vpsadbw b+1056(%rax), %ymm2, %ymm2
-; AVX512F-NEXT: vmovdqa a+1024(%rax), %ymm3
-; AVX512F-NEXT: vpsadbw b+1024(%rax), %ymm3, %ymm3
-; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2
+; AVX512F-NEXT: vmovdqa a+1024(%rax), %ymm2
+; AVX512F-NEXT: vpsadbw b+1024(%rax), %ymm2, %ymm2
+; AVX512F-NEXT: vmovdqa a+1056(%rax), %ymm3
+; AVX512F-NEXT: vpsadbw b+1056(%rax), %ymm3, %ymm3
+; AVX512F-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2
 ; AVX512F-NEXT: vpaddd %zmm1, %zmm2, %zmm1
 ; AVX512F-NEXT: addq $4, %rax
 ; AVX512F-NEXT: jne .LBB2_1
@@ -554,10 +554,10 @@ define i32 @sad_2i8() nounwind {
 ; SSE2-NEXT: # =>This Inner Loop Header: Depth=1
 ; SSE2-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero
 ; SSE2-NEXT: movd {{.*#+}} xmm3 = mem[0],zero,zero,zero
-; SSE2-NEXT: pand %xmm1, %xmm3
 ; SSE2-NEXT: pand %xmm1, %xmm2
-; SSE2-NEXT: psadbw %xmm3, %xmm2
-; SSE2-NEXT: paddd %xmm2, %xmm0
+; SSE2-NEXT: pand %xmm1, %xmm3
+; SSE2-NEXT: psadbw %xmm2, %xmm3
+; SSE2-NEXT: paddd %xmm3, %xmm0
 ; SSE2-NEXT: addq $4, %rax
 ; SSE2-NEXT: jne .LBB3_1
 ; SSE2-NEXT: # %bb.2: # %middle.block
@@ -576,8 +576,8 @@ define i32 @sad_2i8() nounwind {
 ; AVX-NEXT: # =>This Inner Loop Header: Depth=1
 ; AVX-NEXT: vmovd {{.*#+}} xmm2 = mem[0],zero,zero,zero
 ; AVX-NEXT: vmovd {{.*#+}} xmm3 = mem[0],zero,zero,zero
-; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm0[1,2,3,4,5,6,7]
 ; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm0[1,2,3,4,5,6,7]
+; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm0[1,2,3,4,5,6,7]
 ; AVX-NEXT: vpsadbw %xmm3, %xmm2, %xmm2
 ; AVX-NEXT: vpaddd %xmm1, %xmm2, %xmm1
 ; AVX-NEXT: addq $4, %rax
@@ -649,7 +649,7 @@ define i32 @sad_4i8() nounwind {
 ; AVX-NEXT: # =>This Inner Loop Header: Depth=1
 ; AVX-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
 ; AVX-NEXT: vmovd {{.*#+}} xmm2 = mem[0],zero,zero,zero
-; AVX-NEXT: vpsadbw %xmm1, %xmm2, %xmm1
+; AVX-NEXT: vpsadbw %xmm2, %xmm1, %xmm1
 ; AVX-NEXT: vpaddd %xmm0, %xmm1, %xmm0
 ; AVX-NEXT: addq $4, %rax
 ; AVX-NEXT: jne .LBB4_1
@@ -987,75 +987,36 @@ define i32 @sad_unroll_nonzero_initial(<16 x i8>* %arg, <16 x i8>* %arg1, <16 x
 ; SSE2-NEXT: movdqu (%rdi), %xmm0
 ; SSE2-NEXT: movdqu (%rsi), %xmm1
 ; SSE2-NEXT: psadbw %xmm0, %xmm1
+; SSE2-NEXT: movdqu (%rdx), %xmm0
+; SSE2-NEXT: movdqu (%rcx), %xmm2
+; SSE2-NEXT: psadbw %xmm0, %xmm2
 ; SSE2-NEXT: movl $1, %eax
 ; SSE2-NEXT: movd %eax, %xmm0
-; SSE2-NEXT: movdqu (%rdx), %xmm2
-; SSE2-NEXT: movdqu (%rcx), %xmm3
-; SSE2-NEXT: psadbw %xmm2, %xmm3
-; SSE2-NEXT: paddd %xmm0, %xmm3
-; SSE2-NEXT: paddd %xmm1, %xmm3
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[2,3,0,1]
-; SSE2-NEXT: paddd %xmm3, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; SSE2-NEXT: paddd %xmm2, %xmm0
+; SSE2-NEXT: paddd %xmm1, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
 ; SSE2-NEXT: paddd %xmm0, %xmm1
-; SSE2-NEXT: movd %xmm1, %eax
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; SSE2-NEXT: paddd %xmm1, %xmm0
+; SSE2-NEXT: movd %xmm0, %eax
 ; SSE2-NEXT: retq
 ;
-; AVX1-LABEL: sad_unroll_nonzero_initial:
-; AVX1: # %bb.0: # %bb
-; AVX1-NEXT: vmovdqu (%rdi), %xmm0
-; AVX1-NEXT: vpsadbw (%rsi), %xmm0, %xmm0
-; AVX1-NEXT: vmovdqu (%rdx), %xmm1
-; AVX1-NEXT: vpsadbw (%rcx), %xmm1, %xmm1
-; AVX1-NEXT: movl $1, %eax
-; AVX1-NEXT: vmovd %eax, %xmm2
-; AVX1-NEXT: vpaddd %xmm2, %xmm1, %xmm1
-; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vmovd %xmm0, %eax
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: sad_unroll_nonzero_initial:
-; AVX2: # %bb.0: # %bb
-; AVX2-NEXT: vmovdqu (%rdi), %xmm0
-; AVX2-NEXT: vpsadbw (%rsi), %xmm0, %xmm0
-; AVX2-NEXT: movl $1, %eax
-; AVX2-NEXT: vmovd %eax, %xmm1
-; AVX2-NEXT: vmovdqu (%rdx), %xmm2
-; AVX2-NEXT: vpsadbw (%rcx), %xmm2, %xmm2
-; AVX2-NEXT: vpaddd %xmm2, %xmm1, %xmm1
-; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vmovd %xmm0, %eax
-; AVX2-NEXT: retq
-;
-; AVX512-LABEL: sad_unroll_nonzero_initial:
-; AVX512: # %bb.0: # %bb
-; AVX512-NEXT: vmovdqu (%rdi), %xmm0
-; AVX512-NEXT: vpsadbw (%rsi), %xmm0, %xmm0
-; AVX512-NEXT: movl $1, %eax
-; AVX512-NEXT: vmovd %eax, %xmm1
-; AVX512-NEXT: vmovdqu (%rdx), %xmm2
-; AVX512-NEXT: vpsadbw (%rcx), %xmm2, %xmm2
-; AVX512-NEXT: vpaddd %zmm2, %zmm1, %zmm1
-; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0
-; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0
-; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vmovd %xmm0, %eax
-; AVX512-NEXT: vzeroupper
-; AVX512-NEXT: retq
+; AVX-LABEL: sad_unroll_nonzero_initial:
+; AVX: # %bb.0: # %bb
+; AVX-NEXT: vmovdqu (%rdi), %xmm0
+; AVX-NEXT: vpsadbw (%rsi), %xmm0, %xmm0
+; AVX-NEXT: vmovdqu (%rdx), %xmm1
+; AVX-NEXT: vpsadbw (%rcx), %xmm1, %xmm1
+; AVX-NEXT: movl $1, %eax
+; AVX-NEXT: vmovd %eax, %xmm2
+; AVX-NEXT: vpaddd %xmm2, %xmm1, %xmm1
+; AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vmovd %xmm0, %eax
+; AVX-NEXT: retq
 bb:
 %tmp = load <16 x i8>, <16 x i8>* %arg, align 1
 %tmp4 = load <16 x i8>, <16 x i8>* %arg1, align 1
@@ -1112,7 +1073,7 @@ define i32 @sad_double_reduction(<16 x i8>* %arg, <16 x i8>* %arg1, <16 x i8>* %
 ; AVX-NEXT: vmovdqu (%rdx), %xmm1
 ; AVX-NEXT: vpsadbw (%rcx), %xmm1, %xmm1
 ; AVX-NEXT: vpsadbw (%rsi), %xmm0, %xmm0
-; AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vpaddd %xmm0, %xmm1, %xmm0
 ; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
 ; AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0
 ; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-- 
2.7.4