From: Simon Pilgrim Date: Tue, 28 Jul 2020 08:52:38 +0000 (+0100) Subject: [X86][SSE] Attempt to match OP(SHUFFLE(X,Y),SHUFFLE(X,Y)) -> SHUFFLE(HOP(X,Y)) X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=182111777b4ec215eeebe8ab5cc2a324e2f055ff;p=platform%2Fupstream%2Fllvm.git [X86][SSE] Attempt to match OP(SHUFFLE(X,Y),SHUFFLE(X,Y)) -> SHUFFLE(HOP(X,Y)) An initial backend patch towards fixing the various poor HADD combines (PR34724, PR41813, PR45747 etc.). This extends isHorizontalBinOp to check if we have per-element horizontal ops (odd+even element pairs), but not in the expected serial order - in which case we build a "post shuffle mask" that we can apply to the HOP result, assuming we have fast-hops/optsize etc. The next step will be to extend the SHUFFLE(HOP(X,Y)) combines as suggested on PR41813 - accepting more post-shuffle masks even on slow-hop targets if we can fold it into another shuffle. Differential Revision: https://reviews.llvm.org/D83789 --- diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index 390a20c..5eadd9c2 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -44371,8 +44371,8 @@ static SDValue combineVEXTRACT_STORE(SDNode *N, SelectionDAG &DAG, /// A horizontal-op B, for some already available A and B, and if so then LHS is /// set to A, RHS to B, and the routine returns 'true'. static bool isHorizontalBinOp(SDValue &LHS, SDValue &RHS, SelectionDAG &DAG, - const X86Subtarget &Subtarget, - bool IsCommutative) { + const X86Subtarget &Subtarget, bool IsCommutative, + SmallVectorImpl &PostShuffleMask) { // If either operand is undef, bail out. The binop should be simplified. if (LHS.isUndef() || RHS.isUndef()) return false; @@ -44465,6 +44465,12 @@ static bool isHorizontalBinOp(SDValue &LHS, SDValue &RHS, SelectionDAG &DAG, RMask.push_back(i); } + // Avoid 128-bit lane crossing if pre-AVX2 and FP (integer will split). + if (!Subtarget.hasAVX2() && VT.isFloatingPoint() && + (isLaneCrossingShuffleMask(128, VT.getScalarSizeInBits(), LMask) || + isLaneCrossingShuffleMask(128, VT.getScalarSizeInBits(), RMask))) + return false; + // If A and B occur in reverse order in RHS, then canonicalize by commuting // RHS operands and shuffle mask. if (A != C) { @@ -44475,6 +44481,9 @@ static bool isHorizontalBinOp(SDValue &LHS, SDValue &RHS, SelectionDAG &DAG, if (!(A == C && B == D)) return false; + PostShuffleMask.clear(); + PostShuffleMask.append(NumElts, SM_SentinelUndef); + // LHS and RHS are now: // LHS = shuffle A, B, LMask // RHS = shuffle A, B, RMask @@ -44483,6 +44492,7 @@ static bool isHorizontalBinOp(SDValue &LHS, SDValue &RHS, SelectionDAG &DAG, // so we just repeat the inner loop if this is a 256-bit op. unsigned Num128BitChunks = VT.getSizeInBits() / 128; unsigned NumEltsPer128BitChunk = NumElts / Num128BitChunks; + unsigned NumEltsPer64BitChunk = NumEltsPer128BitChunk / 2; assert((NumEltsPer128BitChunk % 2 == 0) && "Vector type should have an even number of elements in each lane"); for (unsigned j = 0; j != NumElts; j += NumEltsPer128BitChunk) { @@ -44494,25 +44504,40 @@ static bool isHorizontalBinOp(SDValue &LHS, SDValue &RHS, SelectionDAG &DAG, (!B.getNode() && (LIdx >= (int)NumElts || RIdx >= (int)NumElts))) continue; + // Check that successive odd/even elements are being operated on. If not, + // this is not a horizontal operation. + if (!((RIdx & 1) == 1 && (LIdx + 1) == RIdx) && + !((LIdx & 1) == 1 && (RIdx + 1) == LIdx && IsCommutative)) + return false; + + // Compute the post-shuffle mask index based on where the element + // is stored in the HOP result, and where it needs to be moved to. + int Base = LIdx & ~1u; + int Index = ((Base % NumEltsPer128BitChunk) / 2) + + ((Base % NumElts) & ~(NumEltsPer128BitChunk - 1)); + // The low half of the 128-bit result must choose from A. // The high half of the 128-bit result must choose from B, // unless B is undef. In that case, we are always choosing from A. - unsigned NumEltsPer64BitChunk = NumEltsPer128BitChunk / 2; - unsigned Src = B.getNode() ? i >= NumEltsPer64BitChunk : 0; - - // Check that successive elements are being operated on. If not, this is - // not a horizontal operation. - int Index = 2 * (i % NumEltsPer64BitChunk) + NumElts * Src + j; - if (!(LIdx == Index && RIdx == Index + 1) && - !(IsCommutative && LIdx == Index + 1 && RIdx == Index)) - return false; + if ((B && Base >= (int)NumElts) || (!B && i >= NumEltsPer64BitChunk)) + Index += NumEltsPer64BitChunk; + PostShuffleMask[i + j] = Index; } } LHS = A.getNode() ? A : B; // If A is 'UNDEF', use B for it. RHS = B.getNode() ? B : A; // If B is 'UNDEF', use A for it. - if (!shouldUseHorizontalOp(LHS == RHS && NumShuffles < 2, DAG, Subtarget)) + bool IsIdentityPostShuffle = + isSequentialOrUndefInRange(PostShuffleMask, 0, NumElts, 0); + if (IsIdentityPostShuffle) + PostShuffleMask.clear(); + + // Assume a SingleSource HOP if we only shuffle one input and don't need to + // shuffle the result. + if (!shouldUseHorizontalOp(LHS == RHS && + (NumShuffles < 2 || !IsIdentityPostShuffle), + DAG, Subtarget)) return false; LHS = DAG.getBitcast(VT, LHS); @@ -44531,10 +44556,16 @@ static SDValue combineFaddFsub(SDNode *N, SelectionDAG &DAG, assert((IsFadd || N->getOpcode() == ISD::FSUB) && "Wrong opcode"); // Try to synthesize horizontal add/sub from adds/subs of shuffles. + SmallVector PostShuffleMask; if (((Subtarget.hasSSE3() && (VT == MVT::v4f32 || VT == MVT::v2f64)) || (Subtarget.hasAVX() && (VT == MVT::v8f32 || VT == MVT::v4f64))) && - isHorizontalBinOp(LHS, RHS, DAG, Subtarget, IsFadd)) - return DAG.getNode(HorizOpcode, SDLoc(N), VT, LHS, RHS); + isHorizontalBinOp(LHS, RHS, DAG, Subtarget, IsFadd, PostShuffleMask)) { + SDValue HorizBinOp = DAG.getNode(HorizOpcode, SDLoc(N), VT, LHS, RHS); + if (!PostShuffleMask.empty()) + HorizBinOp = DAG.getVectorShuffle(VT, SDLoc(HorizBinOp), HorizBinOp, + DAG.getUNDEF(VT), PostShuffleMask); + return HorizBinOp; + } // NOTE: isHorizontalBinOp may have changed LHS/RHS variables. @@ -47636,17 +47667,22 @@ static SDValue combineAddOrSubToHADDorHSUB(SDNode *N, SelectionDAG &DAG, bool IsAdd = N->getOpcode() == ISD::ADD; assert((IsAdd || N->getOpcode() == ISD::SUB) && "Wrong opcode"); + SmallVector PostShuffleMask; if ((VT == MVT::v8i16 || VT == MVT::v4i32 || VT == MVT::v16i16 || VT == MVT::v8i32) && Subtarget.hasSSSE3() && - isHorizontalBinOp(Op0, Op1, DAG, Subtarget, IsAdd)) { + isHorizontalBinOp(Op0, Op1, DAG, Subtarget, IsAdd, PostShuffleMask)) { auto HOpBuilder = [IsAdd](SelectionDAG &DAG, const SDLoc &DL, ArrayRef Ops) { - return DAG.getNode(IsAdd ? X86ISD::HADD : X86ISD::HSUB, - DL, Ops[0].getValueType(), Ops); + return DAG.getNode(IsAdd ? X86ISD::HADD : X86ISD::HSUB, DL, + Ops[0].getValueType(), Ops); }; - return SplitOpsAndApply(DAG, Subtarget, SDLoc(N), VT, {Op0, Op1}, - HOpBuilder); + SDValue HorizBinOp = + SplitOpsAndApply(DAG, Subtarget, SDLoc(N), VT, {Op0, Op1}, HOpBuilder); + if (!PostShuffleMask.empty()) + HorizBinOp = DAG.getVectorShuffle(VT, SDLoc(HorizBinOp), HorizBinOp, + DAG.getUNDEF(VT), PostShuffleMask); + return HorizBinOp; } return SDValue(); diff --git a/llvm/test/CodeGen/X86/haddsub-3.ll b/llvm/test/CodeGen/X86/haddsub-3.ll index f603ace..6abba1b 100644 --- a/llvm/test/CodeGen/X86/haddsub-3.ll +++ b/llvm/test/CodeGen/X86/haddsub-3.ll @@ -17,22 +17,46 @@ define float @pr26491(<4 x float> %a0) { ; SSE2-NEXT: addss %xmm1, %xmm0 ; SSE2-NEXT: retq ; -; SSSE3-LABEL: pr26491: -; SSSE3: # %bb.0: -; SSSE3-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] -; SSSE3-NEXT: addps %xmm0, %xmm1 -; SSSE3-NEXT: movaps %xmm1, %xmm0 -; SSSE3-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] -; SSSE3-NEXT: addss %xmm1, %xmm0 -; SSSE3-NEXT: retq +; SSSE3-SLOW-LABEL: pr26491: +; SSSE3-SLOW: # %bb.0: +; SSSE3-SLOW-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] +; SSSE3-SLOW-NEXT: addps %xmm0, %xmm1 +; SSSE3-SLOW-NEXT: movaps %xmm1, %xmm0 +; SSSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] +; SSSE3-SLOW-NEXT: addss %xmm1, %xmm0 +; SSSE3-SLOW-NEXT: retq ; -; AVX-LABEL: pr26491: -; AVX: # %bb.0: -; AVX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] -; AVX-NEXT: vaddps %xmm0, %xmm1, %xmm0 -; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] -; AVX-NEXT: vaddss %xmm0, %xmm1, %xmm0 -; AVX-NEXT: retq +; SSSE3-FAST-LABEL: pr26491: +; SSSE3-FAST: # %bb.0: +; SSSE3-FAST-NEXT: haddps %xmm0, %xmm0 +; SSSE3-FAST-NEXT: movaps %xmm0, %xmm1 +; SSSE3-FAST-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,1],xmm0[2,3] +; SSSE3-FAST-NEXT: addss %xmm0, %xmm1 +; SSSE3-FAST-NEXT: movaps %xmm1, %xmm0 +; SSSE3-FAST-NEXT: retq +; +; AVX1-SLOW-LABEL: pr26491: +; AVX1-SLOW: # %bb.0: +; AVX1-SLOW-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] +; AVX1-SLOW-NEXT: vaddps %xmm0, %xmm1, %xmm0 +; AVX1-SLOW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; AVX1-SLOW-NEXT: vaddss %xmm0, %xmm1, %xmm0 +; AVX1-SLOW-NEXT: retq +; +; AVX1-FAST-LABEL: pr26491: +; AVX1-FAST: # %bb.0: +; AVX1-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0 +; AVX1-FAST-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3] +; AVX1-FAST-NEXT: vaddss %xmm0, %xmm1, %xmm0 +; AVX1-FAST-NEXT: retq +; +; AVX2-LABEL: pr26491: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] +; AVX2-NEXT: vaddps %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; AVX2-NEXT: vaddss %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: retq %1 = shufflevector <4 x float> %a0, <4 x float> undef, <4 x i32> %2 = fadd <4 x float> %1, %a0 %3 = extractelement <4 x float> %2, i32 2 diff --git a/llvm/test/CodeGen/X86/haddsub-4.ll b/llvm/test/CodeGen/X86/haddsub-4.ll index 5c8e9a7..4c1dc71 100644 --- a/llvm/test/CodeGen/X86/haddsub-4.ll +++ b/llvm/test/CodeGen/X86/haddsub-4.ll @@ -9,30 +9,16 @@ define <8 x i16> @hadd_reverse_v8i16(<8 x i16> %a0, <8 x i16> %a1) nounwind { ; SSE-LABEL: hadd_reverse_v8i16: ; SSE: # %bb.0: -; SSE-NEXT: movdqa {{.*#+}} xmm2 = [14,15,10,11,6,7,2,3,14,15,10,11,12,13,14,15] -; SSE-NEXT: movdqa %xmm1, %xmm3 -; SSE-NEXT: pshufb %xmm2, %xmm3 -; SSE-NEXT: movdqa %xmm0, %xmm4 -; SSE-NEXT: pshufb %xmm2, %xmm4 -; SSE-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm3[0] -; SSE-NEXT: movdqa {{.*#+}} xmm2 = [12,13,8,9,4,5,0,1,8,9,12,13,12,13,14,15] -; SSE-NEXT: pshufb %xmm2, %xmm1 -; SSE-NEXT: pshufb %xmm2, %xmm0 -; SSE-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; SSE-NEXT: paddw %xmm4, %xmm0 +; SSE-NEXT: phaddw %xmm1, %xmm0 +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4] ; SSE-NEXT: retq ; ; AVX-LABEL: hadd_reverse_v8i16: ; AVX: # %bb.0: -; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [14,15,10,11,6,7,2,3,14,15,10,11,12,13,14,15] -; AVX-NEXT: vpshufb %xmm2, %xmm1, %xmm3 -; AVX-NEXT: vpshufb %xmm2, %xmm0, %xmm2 -; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0] -; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [12,13,8,9,4,5,0,1,8,9,12,13,12,13,14,15] -; AVX-NEXT: vpshufb %xmm3, %xmm1, %xmm1 -; AVX-NEXT: vpshufb %xmm3, %xmm0, %xmm0 -; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; AVX-NEXT: vpaddw %xmm0, %xmm2, %xmm0 +; AVX-NEXT: vphaddw %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7] +; AVX-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4] ; AVX-NEXT: retq %lhs = shufflevector <8 x i16> %a0, <8 x i16> %a1, <8 x i32> %rhs = shufflevector <8 x i16> %a0, <8 x i16> %a1, <8 x i32> @@ -67,67 +53,34 @@ define <8 x i16> @hadd_reverse2_v8i16(<8 x i16> %a0, <8 x i16> %a1) nounwind { define <16 x i16> @hadd_reverse_v16i16(<16 x i16> %a0, <16 x i16> %a1) nounwind { ; SSE-LABEL: hadd_reverse_v16i16: ; SSE: # %bb.0: -; SSE-NEXT: movdqa %xmm0, %xmm4 -; SSE-NEXT: movdqa {{.*#+}} xmm0 = [14,15,10,11,6,7,2,3,14,15,10,11,12,13,14,15] -; SSE-NEXT: movdqa %xmm2, %xmm5 -; SSE-NEXT: pshufb %xmm0, %xmm5 -; SSE-NEXT: movdqa %xmm4, %xmm6 -; SSE-NEXT: pshufb %xmm0, %xmm6 -; SSE-NEXT: punpcklqdq {{.*#+}} xmm6 = xmm6[0],xmm5[0] -; SSE-NEXT: movdqa %xmm3, %xmm5 -; SSE-NEXT: pshufb %xmm0, %xmm5 -; SSE-NEXT: movdqa %xmm1, %xmm7 -; SSE-NEXT: pshufb %xmm0, %xmm7 -; SSE-NEXT: punpcklqdq {{.*#+}} xmm7 = xmm7[0],xmm5[0] -; SSE-NEXT: movdqa {{.*#+}} xmm0 = [12,13,8,9,4,5,0,1,8,9,12,13,12,13,14,15] -; SSE-NEXT: pshufb %xmm0, %xmm2 -; SSE-NEXT: pshufb %xmm0, %xmm4 -; SSE-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm2[0] -; SSE-NEXT: paddw %xmm6, %xmm4 -; SSE-NEXT: pshufb %xmm0, %xmm3 -; SSE-NEXT: pshufb %xmm0, %xmm1 -; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0] -; SSE-NEXT: paddw %xmm7, %xmm1 -; SSE-NEXT: movdqa %xmm1, %xmm0 -; SSE-NEXT: movdqa %xmm4, %xmm1 +; SSE-NEXT: phaddw %xmm3, %xmm1 +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[3,2,1,0,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm1[0,1,2,3,7,6,5,4] +; SSE-NEXT: phaddw %xmm2, %xmm0 +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,7,6,5,4] +; SSE-NEXT: movdqa %xmm3, %xmm0 ; SSE-NEXT: retq ; ; AVX1-LABEL: hadd_reverse_v16i16: ; AVX1: # %bb.0: ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [14,15,10,11,6,7,2,3,14,15,10,11,12,13,14,15] -; AVX1-NEXT: vpshufb %xmm3, %xmm2, %xmm4 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5 -; AVX1-NEXT: vpshufb %xmm3, %xmm5, %xmm6 -; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm6[0],xmm4[0] -; AVX1-NEXT: vpshufb %xmm3, %xmm1, %xmm6 -; AVX1-NEXT: vpshufb %xmm3, %xmm0, %xmm3 -; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm6[0] -; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [12,13,8,9,4,5,0,1,8,9,12,13,12,13,14,15] -; AVX1-NEXT: vpshufb %xmm6, %xmm2, %xmm2 -; AVX1-NEXT: vpshufb %xmm6, %xmm5, %xmm5 -; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm5[0],xmm2[0] -; AVX1-NEXT: vpaddw %xmm2, %xmm4, %xmm2 -; AVX1-NEXT: vpshufb %xmm6, %xmm1, %xmm1 -; AVX1-NEXT: vpshufb %xmm6, %xmm0, %xmm0 -; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; AVX1-NEXT: vpaddw %xmm0, %xmm3, %xmm0 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 +; AVX1-NEXT: vphaddw %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[3,2,1,0,4,5,6,7] +; AVX1-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,6,5,4] +; AVX1-NEXT: vphaddw %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7] +; AVX1-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4] ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: hadd_reverse_v16i16: ; AVX2: # %bb.0: -; AVX2-NEXT: vpshuflw {{.*#+}} ymm2 = ymm1[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15] -; AVX2-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15] -; AVX2-NEXT: vpshuflw {{.*#+}} ymm3 = ymm0[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15] -; AVX2-NEXT: vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15] -; AVX2-NEXT: vshufps {{.*#+}} ymm2 = ymm3[2,0],ymm2[2,0],ymm3[6,4],ymm2[6,4] -; AVX2-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[2,3,0,1] -; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,12,13,8,9,4,5,0,1,u,u,u,u,u,u,u,u,28,29,24,25,20,21,16,17] -; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[12,13,8,9,4,5,0,1,u,u,u,u,u,u,u,u,28,29,24,25,20,21,16,17,u,u,u,u,u,u,u,u] -; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] +; AVX2-NEXT: vphaddw %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[3,2,1,0,4,5,6,7,11,10,9,8,12,13,14,15] +; AVX2-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,7,6,5,4,8,9,10,11,15,14,13,12] ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1] -; AVX2-NEXT: vpaddw %ymm0, %ymm2, %ymm0 ; AVX2-NEXT: retq %lhs = shufflevector <16 x i16> %a0, <16 x i16> %a1, <16 x i32> %rhs = shufflevector <16 x i16> %a0, <16 x i16> %a1, <16 x i32> @@ -209,21 +162,11 @@ define <8 x double> @hadd_reverse_v8f64(<8 x double> %a0, <8 x double> %a1) noun ; ; AVX2-LABEL: hadd_reverse_v8f64: ; AVX2: # %bb.0: -; AVX2-NEXT: vpermpd {{.*#+}} ymm4 = ymm2[0,3,2,1] -; AVX2-NEXT: vpermpd {{.*#+}} ymm5 = ymm0[0,3,2,1] -; AVX2-NEXT: vunpckhpd {{.*#+}} ymm4 = ymm5[1],ymm4[1],ymm5[3],ymm4[3] -; AVX2-NEXT: vpermpd {{.*#+}} ymm5 = ymm3[0,3,2,1] -; AVX2-NEXT: vpermpd {{.*#+}} ymm6 = ymm1[0,3,2,1] -; AVX2-NEXT: vunpckhpd {{.*#+}} ymm5 = ymm6[1],ymm5[1],ymm6[3],ymm5[3] -; AVX2-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[2,1,0,3] -; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,1,0,3] -; AVX2-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[2],ymm2[2] -; AVX2-NEXT: vaddpd %ymm0, %ymm4, %ymm2 -; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm3[2,1,0,3] -; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,0,3] -; AVX2-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX2-NEXT: vaddpd %ymm0, %ymm5, %ymm0 -; AVX2-NEXT: vmovapd %ymm2, %ymm1 +; AVX2-NEXT: vhaddpd %ymm3, %ymm1, %ymm1 +; AVX2-NEXT: vpermpd {{.*#+}} ymm3 = ymm1[2,3,0,1] +; AVX2-NEXT: vhaddpd %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm0[2,3,0,1] +; AVX2-NEXT: vmovapd %ymm3, %ymm0 ; AVX2-NEXT: retq %lhs = shufflevector <8 x double> %a0, <8 x double> %a1, <8 x i32> %rhs = shufflevector <8 x double> %a0, <8 x double> %a1, <8 x i32> @@ -290,22 +233,14 @@ define <16 x float> @hadd_reverse_v16f32(<16 x float> %a0, <16 x float> %a1) nou ; SSE: # %bb.0: ; SSE-NEXT: movaps %xmm5, %xmm8 ; SSE-NEXT: movaps %xmm1, %xmm5 -; SSE-NEXT: movaps %xmm8, %xmm1 -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,1],xmm4[3,1] -; SSE-NEXT: movaps %xmm5, %xmm9 -; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[3,1],xmm0[3,1] -; SSE-NEXT: movaps %xmm7, %xmm10 -; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[3,1],xmm6[3,1] -; SSE-NEXT: movaps %xmm3, %xmm11 -; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[3,1],xmm2[3,1] -; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[2,0],xmm4[2,0] -; SSE-NEXT: addps %xmm1, %xmm8 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,0],xmm0[2,0] -; SSE-NEXT: addps %xmm9, %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,0],xmm6[2,0] -; SSE-NEXT: addps %xmm10, %xmm7 -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,0],xmm2[2,0] -; SSE-NEXT: addps %xmm11, %xmm3 +; SSE-NEXT: haddps %xmm2, %xmm3 +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,0,3,2] +; SSE-NEXT: haddps %xmm6, %xmm7 +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[1,0,3,2] +; SSE-NEXT: haddps %xmm0, %xmm5 +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,0,3,2] +; SSE-NEXT: haddps %xmm4, %xmm8 +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[1,0,3,2] ; SSE-NEXT: movaps %xmm3, %xmm0 ; SSE-NEXT: movaps %xmm7, %xmm1 ; SSE-NEXT: movaps %xmm5, %xmm2 @@ -316,29 +251,23 @@ define <16 x float> @hadd_reverse_v16f32(<16 x float> %a0, <16 x float> %a1) nou ; AVX1: # %bb.0: ; AVX1-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm0[2,3],ymm2[2,3] ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; AVX1-NEXT: vshufps {{.*#+}} ymm2 = ymm4[3,1],ymm0[3,1],ymm4[7,5],ymm0[7,5] -; AVX1-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm1[2,3],ymm3[2,3] -; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm3 -; AVX1-NEXT: vshufps {{.*#+}} ymm6 = ymm5[3,1],ymm3[3,1],ymm5[7,5],ymm3[7,5] -; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm4[2,0],ymm0[2,0],ymm4[6,4],ymm0[6,4] -; AVX1-NEXT: vaddps %ymm0, %ymm2, %ymm1 -; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm5[2,0],ymm3[2,0],ymm5[6,4],ymm3[6,4] -; AVX1-NEXT: vaddps %ymm0, %ymm6, %ymm0 +; AVX1-NEXT: vhaddps %ymm0, %ymm4, %ymm2 +; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm3[2,3] +; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 +; AVX1-NEXT: vhaddps %ymm1, %ymm0, %ymm0 +; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[1,0,3,2,5,4,7,6] +; AVX1-NEXT: vpermilps {{.*#+}} ymm1 = ymm2[1,0,3,2,5,4,7,6] ; AVX1-NEXT: retq ; ; AVX2-LABEL: hadd_reverse_v16f32: ; AVX2: # %bb.0: -; AVX2-NEXT: vshufps {{.*#+}} ymm4 = ymm0[3,1],ymm2[3,1],ymm0[7,5],ymm2[7,5] -; AVX2-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[2,0,3,1] -; AVX2-NEXT: vshufps {{.*#+}} ymm5 = ymm1[3,1],ymm3[3,1],ymm1[7,5],ymm3[7,5] -; AVX2-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[2,0,3,1] -; AVX2-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm2[2,0],ymm0[6,4],ymm2[6,4] -; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,0,3,1] -; AVX2-NEXT: vaddps %ymm0, %ymm4, %ymm2 -; AVX2-NEXT: vshufps {{.*#+}} ymm0 = ymm1[2,0],ymm3[2,0],ymm1[6,4],ymm3[6,4] -; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,0,3,1] -; AVX2-NEXT: vaddps %ymm0, %ymm5, %ymm0 -; AVX2-NEXT: vmovaps %ymm2, %ymm1 +; AVX2-NEXT: vhaddps %ymm3, %ymm1, %ymm1 +; AVX2-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[1,0,3,2,5,4,7,6] +; AVX2-NEXT: vpermpd {{.*#+}} ymm3 = ymm1[2,0,3,1] +; AVX2-NEXT: vhaddps %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[1,0,3,2,5,4,7,6] +; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm0[2,0,3,1] +; AVX2-NEXT: vmovaps %ymm3, %ymm0 ; AVX2-NEXT: retq %lhs = shufflevector <16 x float> %a0, <16 x float> %a1, <16 x i32> %rhs = shufflevector <16 x float> %a0, <16 x float> %a1, <16 x i32> diff --git a/llvm/test/CodeGen/X86/haddsub-shuf.ll b/llvm/test/CodeGen/X86/haddsub-shuf.ll index 7bedbeb..76ef7af 100644 --- a/llvm/test/CodeGen/X86/haddsub-shuf.ll +++ b/llvm/test/CodeGen/X86/haddsub-shuf.ll @@ -879,77 +879,59 @@ declare <4 x float> @llvm.x86.sse3.hadd.ps(<4 x float>, <4 x float>) define <4 x float> @PR34724_1(<4 x float> %a, <4 x float> %b) { ; SSSE3_SLOW-LABEL: PR34724_1: ; SSSE3_SLOW: # %bb.0: -; SSSE3_SLOW-NEXT: movaps %xmm1, %xmm2 -; SSSE3_SLOW-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,0],xmm0[3,2] -; SSSE3_SLOW-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm0[2,3] ; SSSE3_SLOW-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] -; SSSE3_SLOW-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSSE3_SLOW-NEXT: addps %xmm0, %xmm2 -; SSSE3_SLOW-NEXT: movsldup {{.*#+}} xmm0 = xmm1[0,0,2,2] -; SSSE3_SLOW-NEXT: addps %xmm1, %xmm0 -; SSSE3_SLOW-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,0],xmm2[1,0] -; SSSE3_SLOW-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,0],xmm0[2,0] -; SSSE3_SLOW-NEXT: movaps %xmm2, %xmm0 +; SSSE3_SLOW-NEXT: haddps %xmm1, %xmm0 +; SSSE3_SLOW-NEXT: movsldup {{.*#+}} xmm2 = xmm1[0,0,2,2] +; SSSE3_SLOW-NEXT: addps %xmm1, %xmm2 +; SSSE3_SLOW-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,0],xmm0[2,0] +; SSSE3_SLOW-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm2[2,0] ; SSSE3_SLOW-NEXT: retq ; ; SSSE3_FAST-LABEL: PR34724_1: ; SSSE3_FAST: # %bb.0: -; SSSE3_FAST-NEXT: movaps %xmm1, %xmm2 -; SSSE3_FAST-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,0],xmm0[3,2] -; SSSE3_FAST-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm0[2,3] ; SSSE3_FAST-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] -; SSSE3_FAST-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSSE3_FAST-NEXT: addps %xmm0, %xmm2 +; SSSE3_FAST-NEXT: haddps %xmm1, %xmm0 ; SSSE3_FAST-NEXT: haddps %xmm1, %xmm1 -; SSSE3_FAST-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,0],xmm2[1,0] -; SSSE3_FAST-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,0],xmm1[2,0] -; SSSE3_FAST-NEXT: movaps %xmm2, %xmm0 +; SSSE3_FAST-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,0],xmm0[2,0] +; SSSE3_FAST-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[2,0] ; SSSE3_FAST-NEXT: retq ; ; AVX1_SLOW-LABEL: PR34724_1: ; AVX1_SLOW: # %bb.0: -; AVX1_SLOW-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] -; AVX1_SLOW-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0],xmm1[0],zero,zero -; AVX1_SLOW-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[3],xmm1[1],zero,zero -; AVX1_SLOW-NEXT: vaddps %xmm0, %xmm2, %xmm0 +; AVX1_SLOW-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] +; AVX1_SLOW-NEXT: vhaddps %xmm1, %xmm0, %xmm0 ; AVX1_SLOW-NEXT: vmovsldup {{.*#+}} xmm2 = xmm1[0,0,2,2] ; AVX1_SLOW-NEXT: vaddps %xmm1, %xmm2, %xmm1 -; AVX1_SLOW-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] -; AVX1_SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,1,3] +; AVX1_SLOW-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3] +; AVX1_SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,2,3] ; AVX1_SLOW-NEXT: retq ; ; AVX1_FAST-LABEL: PR34724_1: ; AVX1_FAST: # %bb.0: -; AVX1_FAST-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] -; AVX1_FAST-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0],xmm1[0],zero,zero -; AVX1_FAST-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[3],xmm1[1],zero,zero -; AVX1_FAST-NEXT: vaddps %xmm0, %xmm2, %xmm0 +; AVX1_FAST-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] +; AVX1_FAST-NEXT: vhaddps %xmm1, %xmm0, %xmm0 ; AVX1_FAST-NEXT: vhaddps %xmm1, %xmm1, %xmm1 -; AVX1_FAST-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] -; AVX1_FAST-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,1,3] +; AVX1_FAST-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3] +; AVX1_FAST-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,2,3] ; AVX1_FAST-NEXT: retq ; ; AVX2_SLOW-LABEL: PR34724_1: ; AVX2_SLOW: # %bb.0: -; AVX2_SLOW-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] -; AVX2_SLOW-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0],xmm1[0],zero,zero -; AVX2_SLOW-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[3],xmm1[1],zero,zero -; AVX2_SLOW-NEXT: vaddps %xmm0, %xmm2, %xmm0 +; AVX2_SLOW-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] +; AVX2_SLOW-NEXT: vhaddps %xmm1, %xmm0, %xmm0 ; AVX2_SLOW-NEXT: vmovsldup {{.*#+}} xmm2 = xmm1[0,0,2,2] ; AVX2_SLOW-NEXT: vaddps %xmm1, %xmm2, %xmm1 -; AVX2_SLOW-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] -; AVX2_SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,1,3] +; AVX2_SLOW-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3] +; AVX2_SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,2,3] ; AVX2_SLOW-NEXT: retq ; ; AVX2_FAST-LABEL: PR34724_1: ; AVX2_FAST: # %bb.0: -; AVX2_FAST-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] -; AVX2_FAST-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0],xmm1[0],zero,zero -; AVX2_FAST-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[3],xmm1[1],zero,zero -; AVX2_FAST-NEXT: vaddps %xmm0, %xmm2, %xmm0 +; AVX2_FAST-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] +; AVX2_FAST-NEXT: vhaddps %xmm1, %xmm0, %xmm0 ; AVX2_FAST-NEXT: vhaddps %xmm1, %xmm1, %xmm1 -; AVX2_FAST-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] -; AVX2_FAST-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,1,3] +; AVX2_FAST-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3] +; AVX2_FAST-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,2,3] ; AVX2_FAST-NEXT: retq %t0 = shufflevector <4 x float> %a, <4 x float> %b, <2 x i32> %t1 = shufflevector <4 x float> %a, <4 x float> %b, <2 x i32> @@ -964,78 +946,49 @@ define <4 x float> @PR34724_1(<4 x float> %a, <4 x float> %b) { define <4 x float> @PR34724_2(<4 x float> %a, <4 x float> %b) { ; SSSE3_SLOW-LABEL: PR34724_2: ; SSSE3_SLOW: # %bb.0: -; SSSE3_SLOW-NEXT: movaps %xmm1, %xmm2 -; SSSE3_SLOW-NEXT: movsldup {{.*#+}} xmm3 = xmm1[0,0,2,2] -; SSSE3_SLOW-NEXT: addps %xmm1, %xmm3 -; SSSE3_SLOW-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[2,0] -; SSSE3_SLOW-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[2,3] -; SSSE3_SLOW-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,0],xmm0[3,0] -; SSSE3_SLOW-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm0[2,3] +; SSSE3_SLOW-NEXT: haddps %xmm1, %xmm0 +; SSSE3_SLOW-NEXT: movsldup {{.*#+}} xmm2 = xmm1[0,0,2,2] ; SSSE3_SLOW-NEXT: addps %xmm1, %xmm2 -; SSSE3_SLOW-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,0],xmm2[1,0] -; SSSE3_SLOW-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,0],xmm3[2,0] -; SSSE3_SLOW-NEXT: movaps %xmm2, %xmm0 +; SSSE3_SLOW-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,0],xmm0[2,0] +; SSSE3_SLOW-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,0] ; SSSE3_SLOW-NEXT: retq ; ; SSSE3_FAST-LABEL: PR34724_2: ; SSSE3_FAST: # %bb.0: -; SSSE3_FAST-NEXT: movaps %xmm1, %xmm3 -; SSSE3_FAST-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,0],xmm0[2,0] -; SSSE3_FAST-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,0],xmm0[2,3] -; SSSE3_FAST-NEXT: movaps %xmm1, %xmm2 -; SSSE3_FAST-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,0],xmm0[3,0] -; SSSE3_FAST-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm0[2,3] -; SSSE3_FAST-NEXT: addps %xmm3, %xmm2 +; SSSE3_FAST-NEXT: haddps %xmm1, %xmm0 ; SSSE3_FAST-NEXT: haddps %xmm1, %xmm1 -; SSSE3_FAST-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,0],xmm2[1,0] -; SSSE3_FAST-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,0],xmm1[2,0] -; SSSE3_FAST-NEXT: movaps %xmm2, %xmm0 +; SSSE3_FAST-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,0],xmm0[2,0] +; SSSE3_FAST-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,0] ; SSSE3_FAST-NEXT: retq ; ; AVX1_SLOW-LABEL: PR34724_2: ; AVX1_SLOW: # %bb.0: -; AVX1_SLOW-NEXT: vblendps {{.*#+}} xmm2 = xmm1[0,1],xmm0[2,3] -; AVX1_SLOW-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[2,0,2,3] -; AVX1_SLOW-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[3],xmm1[1],zero,zero -; AVX1_SLOW-NEXT: vaddps %xmm0, %xmm2, %xmm0 +; AVX1_SLOW-NEXT: vhaddps %xmm1, %xmm0, %xmm0 ; AVX1_SLOW-NEXT: vmovsldup {{.*#+}} xmm2 = xmm1[0,0,2,2] ; AVX1_SLOW-NEXT: vaddps %xmm1, %xmm2, %xmm1 -; AVX1_SLOW-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] -; AVX1_SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,1,3] +; AVX1_SLOW-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3] ; AVX1_SLOW-NEXT: retq ; ; AVX1_FAST-LABEL: PR34724_2: ; AVX1_FAST: # %bb.0: -; AVX1_FAST-NEXT: vblendps {{.*#+}} xmm2 = xmm1[0,1],xmm0[2,3] -; AVX1_FAST-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[2,0,2,3] -; AVX1_FAST-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[3],xmm1[1],zero,zero -; AVX1_FAST-NEXT: vaddps %xmm0, %xmm2, %xmm0 +; AVX1_FAST-NEXT: vhaddps %xmm1, %xmm0, %xmm0 ; AVX1_FAST-NEXT: vhaddps %xmm1, %xmm1, %xmm1 -; AVX1_FAST-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] -; AVX1_FAST-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,1,3] +; AVX1_FAST-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3] ; AVX1_FAST-NEXT: retq ; ; AVX2_SLOW-LABEL: PR34724_2: ; AVX2_SLOW: # %bb.0: -; AVX2_SLOW-NEXT: vblendps {{.*#+}} xmm2 = xmm1[0,1],xmm0[2,3] -; AVX2_SLOW-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[2,0,2,3] -; AVX2_SLOW-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[3],xmm1[1],zero,zero -; AVX2_SLOW-NEXT: vaddps %xmm0, %xmm2, %xmm0 +; AVX2_SLOW-NEXT: vhaddps %xmm1, %xmm0, %xmm0 ; AVX2_SLOW-NEXT: vmovsldup {{.*#+}} xmm2 = xmm1[0,0,2,2] ; AVX2_SLOW-NEXT: vaddps %xmm1, %xmm2, %xmm1 -; AVX2_SLOW-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] -; AVX2_SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,1,3] +; AVX2_SLOW-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3] ; AVX2_SLOW-NEXT: retq ; ; AVX2_FAST-LABEL: PR34724_2: ; AVX2_FAST: # %bb.0: -; AVX2_FAST-NEXT: vblendps {{.*#+}} xmm2 = xmm1[0,1],xmm0[2,3] -; AVX2_FAST-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[2,0,2,3] -; AVX2_FAST-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[3],xmm1[1],zero,zero -; AVX2_FAST-NEXT: vaddps %xmm0, %xmm2, %xmm0 +; AVX2_FAST-NEXT: vhaddps %xmm1, %xmm0, %xmm0 ; AVX2_FAST-NEXT: vhaddps %xmm1, %xmm1, %xmm1 -; AVX2_FAST-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] -; AVX2_FAST-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,1,3] +; AVX2_FAST-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3] ; AVX2_FAST-NEXT: retq %t0 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> %t1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> diff --git a/llvm/test/CodeGen/X86/haddsub-undef.ll b/llvm/test/CodeGen/X86/haddsub-undef.ll index f8648f5..ae53f2d 100644 --- a/llvm/test/CodeGen/X86/haddsub-undef.ll +++ b/llvm/test/CodeGen/X86/haddsub-undef.ll @@ -818,12 +818,25 @@ define <4 x double> @PR44694(<4 x double> %0, <4 x double> %1) { ; SSE-FAST-NEXT: movapd %xmm2, %xmm1 ; SSE-FAST-NEXT: retq ; -; AVX-LABEL: PR44694: -; AVX: # %bb.0: -; AVX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] -; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX-NEXT: vhaddpd %ymm0, %ymm1, %ymm0 -; AVX-NEXT: retq +; AVX1-SLOW-LABEL: PR44694: +; AVX1-SLOW: # %bb.0: +; AVX1-SLOW-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] +; AVX1-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 +; AVX1-SLOW-NEXT: vhaddpd %ymm0, %ymm1, %ymm0 +; AVX1-SLOW-NEXT: retq +; +; AVX1-FAST-LABEL: PR44694: +; AVX1-FAST: # %bb.0: +; AVX1-FAST-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] +; AVX1-FAST-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 +; AVX1-FAST-NEXT: vhaddpd %ymm0, %ymm1, %ymm0 +; AVX1-FAST-NEXT: retq +; +; AVX512-LABEL: PR44694: +; AVX512: # %bb.0: +; AVX512-NEXT: vhaddpd %ymm1, %ymm0, %ymm0 +; AVX512-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3] +; AVX512-NEXT: retq %3 = shufflevector <4 x double> %0, <4 x double> %1, <4 x i32> %4 = shufflevector <4 x double> %0, <4 x double> %1, <4 x i32> %5 = fadd <4 x double> %3, %4 @@ -831,20 +844,30 @@ define <4 x double> @PR44694(<4 x double> %0, <4 x double> %1) { } define <4 x float> @PR45747_1(<4 x float> %a, <4 x float> %b) nounwind { -; SSE-LABEL: PR45747_1: -; SSE: # %bb.0: -; SSE-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] -; SSE-NEXT: addps %xmm0, %xmm1 -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2,2,3] -; SSE-NEXT: movaps %xmm1, %xmm0 -; SSE-NEXT: retq +; SSE-SLOW-LABEL: PR45747_1: +; SSE-SLOW: # %bb.0: +; SSE-SLOW-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] +; SSE-SLOW-NEXT: addps %xmm0, %xmm1 +; SSE-SLOW-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2,2,3] +; SSE-SLOW-NEXT: movaps %xmm1, %xmm0 +; SSE-SLOW-NEXT: retq ; -; AVX-LABEL: PR45747_1: -; AVX: # %bb.0: -; AVX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] -; AVX-NEXT: vaddps %xmm0, %xmm1, %xmm0 -; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,2,3] -; AVX-NEXT: retq +; SSE-FAST-LABEL: PR45747_1: +; SSE-FAST: # %bb.0: +; SSE-FAST-NEXT: haddps %xmm0, %xmm0 +; SSE-FAST-NEXT: retq +; +; AVX-SLOW-LABEL: PR45747_1: +; AVX-SLOW: # %bb.0: +; AVX-SLOW-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] +; AVX-SLOW-NEXT: vaddps %xmm0, %xmm1, %xmm0 +; AVX-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,2,3] +; AVX-SLOW-NEXT: retq +; +; AVX-FAST-LABEL: PR45747_1: +; AVX-FAST: # %bb.0: +; AVX-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0 +; AVX-FAST-NEXT: retq %t0 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> %t1 = fadd <4 x float> %t0, %a %shuffle = shufflevector <4 x float> %t1, <4 x float> undef, <4 x i32> @@ -852,19 +875,32 @@ define <4 x float> @PR45747_1(<4 x float> %a, <4 x float> %b) nounwind { } define <4 x float> @PR45747_2(<4 x float> %a, <4 x float> %b) nounwind { -; SSE-LABEL: PR45747_2: -; SSE: # %bb.0: -; SSE-NEXT: movshdup {{.*#+}} xmm0 = xmm1[1,1,3,3] -; SSE-NEXT: addps %xmm1, %xmm0 -; SSE-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] -; SSE-NEXT: retq +; SSE-SLOW-LABEL: PR45747_2: +; SSE-SLOW: # %bb.0: +; SSE-SLOW-NEXT: movshdup {{.*#+}} xmm0 = xmm1[1,1,3,3] +; SSE-SLOW-NEXT: addps %xmm1, %xmm0 +; SSE-SLOW-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] +; SSE-SLOW-NEXT: retq ; -; AVX-LABEL: PR45747_2: -; AVX: # %bb.0: -; AVX-NEXT: vmovshdup {{.*#+}} xmm0 = xmm1[1,1,3,3] -; AVX-NEXT: vaddps %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] -; AVX-NEXT: retq +; SSE-FAST-LABEL: PR45747_2: +; SSE-FAST: # %bb.0: +; SSE-FAST-NEXT: movaps %xmm1, %xmm0 +; SSE-FAST-NEXT: haddps %xmm1, %xmm0 +; SSE-FAST-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1,2,3] +; SSE-FAST-NEXT: retq +; +; AVX-SLOW-LABEL: PR45747_2: +; AVX-SLOW: # %bb.0: +; AVX-SLOW-NEXT: vmovshdup {{.*#+}} xmm0 = xmm1[1,1,3,3] +; AVX-SLOW-NEXT: vaddps %xmm1, %xmm0, %xmm0 +; AVX-SLOW-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] +; AVX-SLOW-NEXT: retq +; +; AVX-FAST-LABEL: PR45747_2: +; AVX-FAST: # %bb.0: +; AVX-FAST-NEXT: vhaddps %xmm1, %xmm1, %xmm0 +; AVX-FAST-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3] +; AVX-FAST-NEXT: retq %t0 = shufflevector <4 x float> %b, <4 x float> undef, <4 x i32> %t1 = fadd <4 x float> %t0, %b %shuffle = shufflevector <4 x float> %t1, <4 x float> undef, <4 x i32> diff --git a/llvm/test/CodeGen/X86/vector-shuffle-256-v4.ll b/llvm/test/CodeGen/X86/vector-shuffle-256-v4.ll index 5195f5f..1fd6191 100644 --- a/llvm/test/CodeGen/X86/vector-shuffle-256-v4.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-256-v4.ll @@ -1766,12 +1766,24 @@ define <4 x i64> @shuffle_v4i64_1z2z(<4 x i64> %a, <4 x i64> %b) { } define <4 x double> @add_v4f64_0246_1357(<4 x double> %a, <4 x double> %b) { -; ALL-LABEL: add_v4f64_0246_1357: -; ALL: # %bb.0: # %entry -; ALL-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm0[2,3],ymm1[2,3] -; ALL-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; ALL-NEXT: vhaddpd %ymm2, %ymm0, %ymm0 -; ALL-NEXT: retq +; AVX1-LABEL: add_v4f64_0246_1357: +; AVX1: # %bb.0: # %entry +; AVX1-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm0[2,3],ymm1[2,3] +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: vhaddpd %ymm2, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: add_v4f64_0246_1357: +; AVX2: # %bb.0: # %entry +; AVX2-NEXT: vhaddpd %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3] +; AVX2-NEXT: retq +; +; AVX512VL-LABEL: add_v4f64_0246_1357: +; AVX512VL: # %bb.0: # %entry +; AVX512VL-NEXT: vhaddpd %ymm1, %ymm0, %ymm0 +; AVX512VL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3] +; AVX512VL-NEXT: retq entry: %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> %shuffle1 = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> @@ -1780,12 +1792,24 @@ entry: } define <4 x double> @add_v4f64_4602_5713(<4 x double> %a, <4 x double> %b) { -; ALL-LABEL: add_v4f64_4602_5713: -; ALL: # %bb.0: # %entry -; ALL-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm1[2,3],ymm0[2,3] -; ALL-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; ALL-NEXT: vhaddpd %ymm2, %ymm0, %ymm0 -; ALL-NEXT: retq +; AVX1-LABEL: add_v4f64_4602_5713: +; AVX1: # %bb.0: # %entry +; AVX1-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm1[2,3],ymm0[2,3] +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: vhaddpd %ymm2, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: add_v4f64_4602_5713: +; AVX2: # %bb.0: # %entry +; AVX2-NEXT: vhaddpd %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[1,3,0,2] +; AVX2-NEXT: retq +; +; AVX512VL-LABEL: add_v4f64_4602_5713: +; AVX512VL: # %bb.0: # %entry +; AVX512VL-NEXT: vhaddpd %ymm1, %ymm0, %ymm0 +; AVX512VL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[1,3,0,2] +; AVX512VL-NEXT: retq entry: %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> %shuffle1 = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> diff --git a/llvm/test/CodeGen/X86/vector-shuffle-256-v8.ll b/llvm/test/CodeGen/X86/vector-shuffle-256-v8.ll index 4798b4b..3077428 100644 --- a/llvm/test/CodeGen/X86/vector-shuffle-256-v8.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-256-v8.ll @@ -3039,32 +3039,11 @@ define <8 x float> @add_v8f32_02468ACE_13579BDF(<8 x float> %a, <8 x float> %b) ; AVX1-NEXT: vhaddps %ymm2, %ymm0, %ymm0 ; AVX1-NEXT: retq ; -; AVX2-LABEL: add_v8f32_02468ACE_13579BDF: -; AVX2: # %bb.0: # %entry -; AVX2-NEXT: vshufps {{.*#+}} ymm2 = ymm0[0,2],ymm1[0,2],ymm0[4,6],ymm1[4,6] -; AVX2-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,2,1,3] -; AVX2-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,3],ymm1[1,3],ymm0[5,7],ymm1[5,7] -; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3] -; AVX2-NEXT: vaddps %ymm0, %ymm2, %ymm0 -; AVX2-NEXT: retq -; -; AVX512VL-SLOW-LABEL: add_v8f32_02468ACE_13579BDF: -; AVX512VL-SLOW: # %bb.0: # %entry -; AVX512VL-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm0[0,2],ymm1[0,2],ymm0[4,6],ymm1[4,6] -; AVX512VL-SLOW-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,2,1,3] -; AVX512VL-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,3],ymm1[1,3],ymm0[5,7],ymm1[5,7] -; AVX512VL-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3] -; AVX512VL-SLOW-NEXT: vaddps %ymm0, %ymm2, %ymm0 -; AVX512VL-SLOW-NEXT: retq -; -; AVX512VL-FAST-LABEL: add_v8f32_02468ACE_13579BDF: -; AVX512VL-FAST: # %bb.0: # %entry -; AVX512VL-FAST-NEXT: vmovaps {{.*#+}} ymm2 = [0,2,4,6,8,10,12,14] -; AVX512VL-FAST-NEXT: vpermi2ps %ymm1, %ymm0, %ymm2 -; AVX512VL-FAST-NEXT: vmovaps {{.*#+}} ymm3 = [1,3,5,7,9,11,13,15] -; AVX512VL-FAST-NEXT: vpermi2ps %ymm1, %ymm0, %ymm3 -; AVX512VL-FAST-NEXT: vaddps %ymm3, %ymm2, %ymm0 -; AVX512VL-FAST-NEXT: retq +; AVX2OR512VL-LABEL: add_v8f32_02468ACE_13579BDF: +; AVX2OR512VL: # %bb.0: # %entry +; AVX2OR512VL-NEXT: vhaddps %ymm1, %ymm0, %ymm0 +; AVX2OR512VL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3] +; AVX2OR512VL-NEXT: retq entry: %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> %shuffle1 = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> @@ -3080,32 +3059,11 @@ define <8 x float> @add_v8f32_8ACE0246_9BDF1357(<8 x float> %a, <8 x float> %b) ; AVX1-NEXT: vhaddps %ymm2, %ymm0, %ymm0 ; AVX1-NEXT: retq ; -; AVX2-LABEL: add_v8f32_8ACE0246_9BDF1357: -; AVX2: # %bb.0: # %entry -; AVX2-NEXT: vshufps {{.*#+}} ymm2 = ymm1[0,2],ymm0[0,2],ymm1[4,6],ymm0[4,6] -; AVX2-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,2,1,3] -; AVX2-NEXT: vshufps {{.*#+}} ymm0 = ymm1[1,3],ymm0[1,3],ymm1[5,7],ymm0[5,7] -; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3] -; AVX2-NEXT: vaddps %ymm0, %ymm2, %ymm0 -; AVX2-NEXT: retq -; -; AVX512VL-SLOW-LABEL: add_v8f32_8ACE0246_9BDF1357: -; AVX512VL-SLOW: # %bb.0: # %entry -; AVX512VL-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm1[0,2],ymm0[0,2],ymm1[4,6],ymm0[4,6] -; AVX512VL-SLOW-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,2,1,3] -; AVX512VL-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm1[1,3],ymm0[1,3],ymm1[5,7],ymm0[5,7] -; AVX512VL-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3] -; AVX512VL-SLOW-NEXT: vaddps %ymm0, %ymm2, %ymm0 -; AVX512VL-SLOW-NEXT: retq -; -; AVX512VL-FAST-LABEL: add_v8f32_8ACE0246_9BDF1357: -; AVX512VL-FAST: # %bb.0: # %entry -; AVX512VL-FAST-NEXT: vmovaps {{.*#+}} ymm2 = [0,2,4,6,8,10,12,14] -; AVX512VL-FAST-NEXT: vpermi2ps %ymm0, %ymm1, %ymm2 -; AVX512VL-FAST-NEXT: vmovaps {{.*#+}} ymm3 = [1,3,5,7,9,11,13,15] -; AVX512VL-FAST-NEXT: vpermi2ps %ymm0, %ymm1, %ymm3 -; AVX512VL-FAST-NEXT: vaddps %ymm3, %ymm2, %ymm0 -; AVX512VL-FAST-NEXT: retq +; AVX2OR512VL-LABEL: add_v8f32_8ACE0246_9BDF1357: +; AVX2OR512VL: # %bb.0: # %entry +; AVX2OR512VL-NEXT: vhaddps %ymm1, %ymm0, %ymm0 +; AVX2OR512VL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[1,3,0,2] +; AVX2OR512VL-NEXT: retq entry: %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> %shuffle1 = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> @@ -3116,45 +3074,21 @@ entry: define <8 x i32> @add_v8i32_02468ACE_13579BDF(<8 x i32> %a, <8 x i32> %b) { ; AVX1-LABEL: add_v8i32_02468ACE_13579BDF: ; AVX1: # %bb.0: # %entry -; AVX1-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm0[2,3],ymm1[2,3] -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX1-NEXT: vshufps {{.*#+}} ymm1 = ymm0[0,2],ymm2[0,2],ymm0[4,6],ymm2[4,6] -; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,3],ymm2[1,3],ymm0[5,7],ymm2[5,7] -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 -; AVX1-NEXT: vpaddd %xmm2, %xmm3, %xmm2 -; AVX1-NEXT: vpaddd %xmm0, %xmm1, %xmm0 -; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX1-NEXT: vphaddd %xmm1, %xmm0, %xmm2 +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm2, %ymm2 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX1-NEXT: vphaddd %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3] +; AVX1-NEXT: vshufpd {{.*#+}} ymm0 = ymm2[0],ymm0[0],ymm2[3],ymm0[3] ; AVX1-NEXT: retq ; -; AVX2-LABEL: add_v8i32_02468ACE_13579BDF: -; AVX2: # %bb.0: # %entry -; AVX2-NEXT: vshufps {{.*#+}} ymm2 = ymm0[0,2],ymm1[0,2],ymm0[4,6],ymm1[4,6] -; AVX2-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,2,1,3] -; AVX2-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,3],ymm1[1,3],ymm0[5,7],ymm1[5,7] -; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3] -; AVX2-NEXT: vpaddd %ymm0, %ymm2, %ymm0 -; AVX2-NEXT: retq -; -; AVX512VL-SLOW-LABEL: add_v8i32_02468ACE_13579BDF: -; AVX512VL-SLOW: # %bb.0: # %entry -; AVX512VL-SLOW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 -; AVX512VL-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm2 -; AVX512VL-SLOW-NEXT: vpmovqd %zmm2, %ymm2 -; AVX512VL-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,3],ymm1[1,3],ymm0[5,7],ymm1[5,7] -; AVX512VL-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3] -; AVX512VL-SLOW-NEXT: vpaddd %ymm0, %ymm2, %ymm0 -; AVX512VL-SLOW-NEXT: retq -; -; AVX512VL-FAST-LABEL: add_v8i32_02468ACE_13579BDF: -; AVX512VL-FAST: # %bb.0: # %entry -; AVX512VL-FAST-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 -; AVX512VL-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm2 -; AVX512VL-FAST-NEXT: vpmovqd %zmm2, %ymm2 -; AVX512VL-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [1,3,5,7,9,11,13,15] -; AVX512VL-FAST-NEXT: vpermi2d %ymm1, %ymm0, %ymm3 -; AVX512VL-FAST-NEXT: vpaddd %ymm3, %ymm2, %ymm0 -; AVX512VL-FAST-NEXT: retq +; AVX2OR512VL-LABEL: add_v8i32_02468ACE_13579BDF: +; AVX2OR512VL: # %bb.0: # %entry +; AVX2OR512VL-NEXT: vphaddd %ymm1, %ymm0, %ymm0 +; AVX2OR512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] +; AVX2OR512VL-NEXT: retq entry: %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> %shuffle1 = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> @@ -3165,45 +3099,21 @@ entry: define <8 x i32> @add_v8i32_8ACE0246_9BDF1357(<8 x i32> %a, <8 x i32> %b) { ; AVX1-LABEL: add_v8i32_8ACE0246_9BDF1357: ; AVX1: # %bb.0: # %entry -; AVX1-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm1[2,3],ymm0[2,3] -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX1-NEXT: vshufps {{.*#+}} ymm1 = ymm0[0,2],ymm2[0,2],ymm0[4,6],ymm2[4,6] -; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,3],ymm2[1,3],ymm0[5,7],ymm2[5,7] -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 -; AVX1-NEXT: vpaddd %xmm2, %xmm3, %xmm2 -; AVX1-NEXT: vpaddd %xmm0, %xmm1, %xmm0 -; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX1-NEXT: vphaddd %xmm1, %xmm0, %xmm2 +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm2, %ymm2 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX1-NEXT: vphaddd %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3] +; AVX1-NEXT: vshufpd {{.*#+}} ymm0 = ymm2[1],ymm0[1],ymm2[2],ymm0[2] ; AVX1-NEXT: retq ; -; AVX2-LABEL: add_v8i32_8ACE0246_9BDF1357: -; AVX2: # %bb.0: # %entry -; AVX2-NEXT: vshufps {{.*#+}} ymm2 = ymm1[0,2],ymm0[0,2],ymm1[4,6],ymm0[4,6] -; AVX2-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,2,1,3] -; AVX2-NEXT: vshufps {{.*#+}} ymm0 = ymm1[1,3],ymm0[1,3],ymm1[5,7],ymm0[5,7] -; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3] -; AVX2-NEXT: vpaddd %ymm0, %ymm2, %ymm0 -; AVX2-NEXT: retq -; -; AVX512VL-SLOW-LABEL: add_v8i32_8ACE0246_9BDF1357: -; AVX512VL-SLOW: # %bb.0: # %entry -; AVX512VL-SLOW-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 -; AVX512VL-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm2 -; AVX512VL-SLOW-NEXT: vpmovqd %zmm2, %ymm2 -; AVX512VL-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm1[1,3],ymm0[1,3],ymm1[5,7],ymm0[5,7] -; AVX512VL-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3] -; AVX512VL-SLOW-NEXT: vpaddd %ymm0, %ymm2, %ymm0 -; AVX512VL-SLOW-NEXT: retq -; -; AVX512VL-FAST-LABEL: add_v8i32_8ACE0246_9BDF1357: -; AVX512VL-FAST: # %bb.0: # %entry -; AVX512VL-FAST-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 -; AVX512VL-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm2 -; AVX512VL-FAST-NEXT: vpmovqd %zmm2, %ymm2 -; AVX512VL-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [1,3,5,7,9,11,13,15] -; AVX512VL-FAST-NEXT: vpermi2d %ymm0, %ymm1, %ymm3 -; AVX512VL-FAST-NEXT: vpaddd %ymm3, %ymm2, %ymm0 -; AVX512VL-FAST-NEXT: retq +; AVX2OR512VL-LABEL: add_v8i32_8ACE0246_9BDF1357: +; AVX2OR512VL: # %bb.0: # %entry +; AVX2OR512VL-NEXT: vphaddd %ymm1, %ymm0, %ymm0 +; AVX2OR512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[1,3,0,2] +; AVX2OR512VL-NEXT: retq entry: %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> %shuffle1 = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32>