From 1ba5c550d418e12a5bdbb884d2f7d94e0efc64ee Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Thu, 18 Mar 2021 15:34:39 +0000 Subject: [PATCH] [DAG] Improve folding (sext_in_reg (*_extend_vector_inreg x)) -> (sext_vector_inreg x) Extend this to support ComputeNumSignBits of the (used) source vector elements so that we can handle more than just the case where we're sext_in_reg from the source element signbit. Noticed while investigating the poor codegen in D98587. --- llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 24 +- llvm/test/CodeGen/X86/vec_saddo.ll | 34 +- llvm/test/CodeGen/X86/vec_smulo.ll | 650 ++++++++++++-------------- llvm/test/CodeGen/X86/vec_ssubo.ll | 34 +- llvm/test/CodeGen/X86/vec_uaddo.ll | 34 +- llvm/test/CodeGen/X86/vec_umulo.ll | 460 +++++++++--------- llvm/test/CodeGen/X86/vec_usubo.ll | 34 +- llvm/test/CodeGen/X86/vsplit-and.ll | 11 +- 8 files changed, 589 insertions(+), 692 deletions(-) diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index 120c7f2..1c063da 100644 --- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -11787,14 +11787,22 @@ SDValue DAGCombiner::visitSIGN_EXTEND_INREG(SDNode *N) { } // fold (sext_in_reg (*_extend_vector_inreg x)) -> (sext_vector_inreg x) - if ((N0.getOpcode() == ISD::ANY_EXTEND_VECTOR_INREG || - N0.getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG || - N0.getOpcode() == ISD::ZERO_EXTEND_VECTOR_INREG) && - N0.getOperand(0).getScalarValueSizeInBits() == ExtVTBits) { - if (!LegalOperations || - TLI.isOperationLegal(ISD::SIGN_EXTEND_VECTOR_INREG, VT)) - return DAG.getNode(ISD::SIGN_EXTEND_VECTOR_INREG, SDLoc(N), VT, - N0.getOperand(0)); + // if x is small enough or if we know that x has more than 1 sign bit and the + // sign_extend_inreg is extending from one of them. + if (N0.getOpcode() == ISD::ANY_EXTEND_VECTOR_INREG || + N0.getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG || + N0.getOpcode() == ISD::ZERO_EXTEND_VECTOR_INREG) { + SDValue N00 = N0.getOperand(0); + unsigned N00Bits = N00.getScalarValueSizeInBits(); + unsigned DstElts = N0.getValueType().getVectorMinNumElements(); + unsigned SrcElts = N00.getValueType().getVectorMinNumElements(); + APInt DemandedSrcElts = APInt::getLowBitsSet(SrcElts, DstElts); + if ((N00Bits == ExtVTBits || + (N00Bits - DAG.ComputeNumSignBits(N00, DemandedSrcElts)) < + ExtVTBits) && + (!LegalOperations || + TLI.isOperationLegal(ISD::SIGN_EXTEND_VECTOR_INREG, VT))) + return DAG.getNode(ISD::SIGN_EXTEND_VECTOR_INREG, SDLoc(N), VT, N00); } // fold (sext_in_reg (zext x)) -> (sext x) diff --git a/llvm/test/CodeGen/X86/vec_saddo.ll b/llvm/test/CodeGen/X86/vec_saddo.ll index ede2019..f07b4db 100644 --- a/llvm/test/CodeGen/X86/vec_saddo.ll +++ b/llvm/test/CodeGen/X86/vec_saddo.ll @@ -539,12 +539,11 @@ define <16 x i32> @saddo_v16i8(<16 x i8> %a0, <16 x i8> %a1, <16 x i8>* %p2) nou ; SSE2-NEXT: pcmpeqb %xmm0, %xmm2 ; SSE2-NEXT: pcmpeqd %xmm3, %xmm3 ; SSE2-NEXT: pxor %xmm2, %xmm3 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3],xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3] +; SSE2-NEXT: psrad $24, %xmm4 ; SSE2-NEXT: movdqa %xmm3, %xmm1 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE2-NEXT: movdqa %xmm1, %xmm4 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0,0,1,1,2,2,3,3] -; SSE2-NEXT: pslld $31, %xmm4 -; SSE2-NEXT: psrad $31, %xmm4 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7] ; SSE2-NEXT: pslld $31, %xmm1 ; SSE2-NEXT: psrad $31, %xmm1 @@ -568,12 +567,11 @@ define <16 x i32> @saddo_v16i8(<16 x i8> %a0, <16 x i8> %a1, <16 x i8>* %p2) nou ; SSSE3-NEXT: pcmpeqb %xmm0, %xmm2 ; SSSE3-NEXT: pcmpeqd %xmm3, %xmm3 ; SSSE3-NEXT: pxor %xmm2, %xmm3 +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3],xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7] +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3] +; SSSE3-NEXT: psrad $24, %xmm4 ; SSSE3-NEXT: movdqa %xmm3, %xmm1 ; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSSE3-NEXT: movdqa %xmm1, %xmm4 -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0,0,1,1,2,2,3,3] -; SSSE3-NEXT: pslld $31, %xmm4 -; SSSE3-NEXT: psrad $31, %xmm4 ; SSSE3-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7] ; SSSE3-NEXT: pslld $31, %xmm1 ; SSSE3-NEXT: psrad $31, %xmm1 @@ -597,9 +595,7 @@ define <16 x i32> @saddo_v16i8(<16 x i8> %a0, <16 x i8> %a1, <16 x i8>* %p2) nou ; SSE41-NEXT: pcmpeqb %xmm0, %xmm2 ; SSE41-NEXT: pcmpeqd %xmm3, %xmm3 ; SSE41-NEXT: pxor %xmm2, %xmm3 -; SSE41-NEXT: pmovzxbd {{.*#+}} xmm4 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero -; SSE41-NEXT: pslld $31, %xmm4 -; SSE41-NEXT: psrad $31, %xmm4 +; SSE41-NEXT: pmovsxbd %xmm3, %xmm4 ; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,1,1,1] ; SSE41-NEXT: pmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero ; SSE41-NEXT: pslld $31, %xmm1 @@ -673,10 +669,8 @@ define <8 x i32> @saddo_v8i16(<8 x i16> %a0, <8 x i16> %a1, <8 x i16>* %p2) noun ; SSE2-NEXT: pcmpeqw %xmm0, %xmm2 ; SSE2-NEXT: pcmpeqd %xmm1, %xmm1 ; SSE2-NEXT: pxor %xmm2, %xmm1 -; SSE2-NEXT: movdqa %xmm1, %xmm2 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3] -; SSE2-NEXT: pslld $31, %xmm2 -; SSE2-NEXT: psrad $31, %xmm2 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; SSE2-NEXT: psrad $16, %xmm2 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7] ; SSE2-NEXT: pslld $31, %xmm1 ; SSE2-NEXT: psrad $31, %xmm1 @@ -692,10 +686,8 @@ define <8 x i32> @saddo_v8i16(<8 x i16> %a0, <8 x i16> %a1, <8 x i16>* %p2) noun ; SSSE3-NEXT: pcmpeqw %xmm0, %xmm2 ; SSSE3-NEXT: pcmpeqd %xmm1, %xmm1 ; SSSE3-NEXT: pxor %xmm2, %xmm1 -; SSSE3-NEXT: movdqa %xmm1, %xmm2 -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3] -; SSSE3-NEXT: pslld $31, %xmm2 -; SSSE3-NEXT: psrad $31, %xmm2 +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; SSSE3-NEXT: psrad $16, %xmm2 ; SSSE3-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7] ; SSSE3-NEXT: pslld $31, %xmm1 ; SSSE3-NEXT: psrad $31, %xmm1 @@ -711,9 +703,7 @@ define <8 x i32> @saddo_v8i16(<8 x i16> %a0, <8 x i16> %a1, <8 x i16>* %p2) noun ; SSE41-NEXT: pcmpeqw %xmm0, %xmm2 ; SSE41-NEXT: pcmpeqd %xmm1, %xmm1 ; SSE41-NEXT: pxor %xmm2, %xmm1 -; SSE41-NEXT: pmovzxwd {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero -; SSE41-NEXT: pslld $31, %xmm2 -; SSE41-NEXT: psrad $31, %xmm2 +; SSE41-NEXT: pmovsxwd %xmm1, %xmm2 ; SSE41-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7] ; SSE41-NEXT: pslld $31, %xmm1 ; SSE41-NEXT: psrad $31, %xmm1 diff --git a/llvm/test/CodeGen/X86/vec_smulo.ll b/llvm/test/CodeGen/X86/vec_smulo.ll index 4fa3367..ed2d784 100644 --- a/llvm/test/CodeGen/X86/vec_smulo.ll +++ b/llvm/test/CodeGen/X86/vec_smulo.ll @@ -1292,12 +1292,11 @@ define <16 x i32> @smulo_v16i8(<16 x i8> %a0, <16 x i8> %a1, <16 x i8>* %p2) nou ; SSE2-NEXT: pcmpeqb %xmm0, %xmm7 ; SSE2-NEXT: pcmpeqd %xmm3, %xmm3 ; SSE2-NEXT: pxor %xmm7, %xmm3 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3],xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] +; SSE2-NEXT: psrad $24, %xmm0 ; SSE2-NEXT: movdqa %xmm3, %xmm1 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE2-NEXT: movdqa %xmm1, %xmm0 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] -; SSE2-NEXT: pslld $31, %xmm0 -; SSE2-NEXT: psrad $31, %xmm0 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7] ; SSE2-NEXT: pslld $31, %xmm1 ; SSE2-NEXT: psrad $31, %xmm1 @@ -1344,12 +1343,11 @@ define <16 x i32> @smulo_v16i8(<16 x i8> %a0, <16 x i8> %a1, <16 x i8>* %p2) nou ; SSSE3-NEXT: pcmpeqb %xmm0, %xmm7 ; SSSE3-NEXT: pcmpeqd %xmm3, %xmm3 ; SSSE3-NEXT: pxor %xmm7, %xmm3 +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3],xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7] +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] +; SSSE3-NEXT: psrad $24, %xmm0 ; SSSE3-NEXT: movdqa %xmm3, %xmm1 ; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSSE3-NEXT: movdqa %xmm1, %xmm0 -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] -; SSSE3-NEXT: pslld $31, %xmm0 -; SSSE3-NEXT: psrad $31, %xmm0 ; SSSE3-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7] ; SSSE3-NEXT: pslld $31, %xmm1 ; SSSE3-NEXT: psrad $31, %xmm1 @@ -1392,9 +1390,7 @@ define <16 x i32> @smulo_v16i8(<16 x i8> %a0, <16 x i8> %a1, <16 x i8>* %p2) nou ; SSE41-NEXT: pcmpeqb %xmm0, %xmm6 ; SSE41-NEXT: pcmpeqd %xmm3, %xmm3 ; SSE41-NEXT: pxor %xmm6, %xmm3 -; SSE41-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero -; SSE41-NEXT: pslld $31, %xmm0 -; SSE41-NEXT: psrad $31, %xmm0 +; SSE41-NEXT: pmovsxbd %xmm3, %xmm0 ; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,1,1,1] ; SSE41-NEXT: pmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero ; SSE41-NEXT: pslld $31, %xmm1 @@ -1527,272 +1523,268 @@ define <32 x i32> @smulo_v32i8(<32 x i8> %a0, <32 x i8> %a1, <32 x i8>* %p2) nou ; SSE2-LABEL: smulo_v32i8: ; SSE2: # %bb.0: ; SSE2-NEXT: movq %rdi, %rax -; SSE2-NEXT: movdqa %xmm3, %xmm10 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm3[8],xmm7[9],xmm3[9],xmm7[10],xmm3[10],xmm7[11],xmm3[11],xmm7[12],xmm3[12],xmm7[13],xmm3[13],xmm7[14],xmm3[14],xmm7[15],xmm3[15] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3],xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7] +; SSE2-NEXT: movdqa %xmm3, %xmm9 +; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm3[8],xmm4[9],xmm3[9],xmm4[10],xmm3[10],xmm4[11],xmm3[11],xmm4[12],xmm3[12],xmm4[13],xmm3[13],xmm4[14],xmm3[14],xmm4[15],xmm3[15] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3],xmm5[4],xmm3[4],xmm5[5],xmm3[5],xmm5[6],xmm3[6],xmm5[7],xmm3[7] ; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; SSE2-NEXT: movdqa %xmm1, %xmm9 +; SSE2-NEXT: movdqa %xmm1, %xmm10 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm1[8],xmm6[9],xmm1[9],xmm6[10],xmm1[10],xmm6[11],xmm1[11],xmm6[12],xmm1[12],xmm6[13],xmm1[13],xmm6[14],xmm1[14],xmm6[15],xmm1[15] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3],xmm5[4],xmm1[4],xmm5[5],xmm1[5],xmm5[6],xmm1[6],xmm5[7],xmm1[7] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm1[0],xmm7[1],xmm1[1],xmm7[2],xmm1[2],xmm7[3],xmm1[3],xmm7[4],xmm1[4],xmm7[5],xmm1[5],xmm7[6],xmm1[6],xmm7[7],xmm1[7] ; SSE2-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; SSE2-NEXT: pmullw %xmm3, %xmm1 ; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [255,255,255,255,255,255,255,255] ; SSE2-NEXT: pand %xmm8, %xmm1 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm10 = xmm10[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSE2-NEXT: punpcklbw {{.*#+}} xmm9 = xmm9[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE2-NEXT: pmullw %xmm10, %xmm9 -; SSE2-NEXT: pand %xmm8, %xmm9 -; SSE2-NEXT: packuswb %xmm1, %xmm9 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm10 = xmm10[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE2-NEXT: pmullw %xmm9, %xmm10 +; SSE2-NEXT: pand %xmm8, %xmm10 +; SSE2-NEXT: packuswb %xmm1, %xmm10 ; SSE2-NEXT: pxor %xmm1, %xmm1 -; SSE2-NEXT: pcmpgtb %xmm9, %xmm1 -; SSE2-NEXT: psraw $8, %xmm7 +; SSE2-NEXT: pcmpgtb %xmm10, %xmm1 +; SSE2-NEXT: psraw $8, %xmm4 ; SSE2-NEXT: psraw $8, %xmm6 -; SSE2-NEXT: pmullw %xmm7, %xmm6 +; SSE2-NEXT: pmullw %xmm4, %xmm6 ; SSE2-NEXT: psrlw $8, %xmm6 -; SSE2-NEXT: psraw $8, %xmm4 ; SSE2-NEXT: psraw $8, %xmm5 -; SSE2-NEXT: pmullw %xmm4, %xmm5 -; SSE2-NEXT: psrlw $8, %xmm5 -; SSE2-NEXT: packuswb %xmm6, %xmm5 -; SSE2-NEXT: pcmpeqb %xmm1, %xmm5 -; SSE2-NEXT: movdqa %xmm2, %xmm10 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm2[8],xmm4[9],xmm2[9],xmm4[10],xmm2[10],xmm4[11],xmm2[11],xmm4[12],xmm2[12],xmm4[13],xmm2[13],xmm4[14],xmm2[14],xmm4[15],xmm2[15] +; SSE2-NEXT: psraw $8, %xmm7 +; SSE2-NEXT: pmullw %xmm5, %xmm7 +; SSE2-NEXT: psrlw $8, %xmm7 +; SSE2-NEXT: packuswb %xmm6, %xmm7 +; SSE2-NEXT: pcmpeqb %xmm1, %xmm7 +; SSE2-NEXT: movdqa %xmm2, %xmm9 +; SSE2-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm2[8],xmm5[9],xmm2[9],xmm5[10],xmm2[10],xmm5[11],xmm2[11],xmm5[12],xmm2[12],xmm5[13],xmm2[13],xmm5[14],xmm2[14],xmm5[15],xmm2[15] ; SSE2-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm2[0],xmm6[1],xmm2[1],xmm6[2],xmm2[2],xmm6[3],xmm2[3],xmm6[4],xmm2[4],xmm6[5],xmm2[5],xmm6[6],xmm2[6],xmm6[7],xmm2[7] -; SSE2-NEXT: movdqa %xmm2, %xmm7 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; SSE2-NEXT: movdqa %xmm0, %xmm2 +; SSE2-NEXT: movdqa %xmm2, %xmm4 +; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; SSE2-NEXT: movdqa %xmm0, %xmm11 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm0[8],xmm3[9],xmm0[9],xmm3[10],xmm0[10],xmm3[11],xmm0[11],xmm3[12],xmm0[12],xmm3[13],xmm0[13],xmm3[14],xmm0[14],xmm3[15],xmm0[15] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] ; SSE2-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; SSE2-NEXT: pmullw %xmm7, %xmm0 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm10 = xmm10[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE2-NEXT: pmullw %xmm10, %xmm2 -; SSE2-NEXT: pxor %xmm7, %xmm7 +; SSE2-NEXT: pmullw %xmm4, %xmm0 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm9 = xmm9[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm11 = xmm11[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE2-NEXT: pmullw %xmm9, %xmm11 +; SSE2-NEXT: pxor %xmm4, %xmm4 ; SSE2-NEXT: pand %xmm8, %xmm0 -; SSE2-NEXT: pand %xmm8, %xmm2 +; SSE2-NEXT: pand %xmm8, %xmm11 ; SSE2-NEXT: pcmpeqd %xmm8, %xmm8 -; SSE2-NEXT: pxor %xmm8, %xmm5 -; SSE2-NEXT: packuswb %xmm0, %xmm2 -; SSE2-NEXT: pcmpgtb %xmm2, %xmm7 -; SSE2-NEXT: psraw $8, %xmm4 +; SSE2-NEXT: pxor %xmm8, %xmm7 +; SSE2-NEXT: packuswb %xmm0, %xmm11 +; SSE2-NEXT: pcmpgtb %xmm11, %xmm4 +; SSE2-NEXT: psraw $8, %xmm5 ; SSE2-NEXT: psraw $8, %xmm3 -; SSE2-NEXT: pmullw %xmm4, %xmm3 +; SSE2-NEXT: pmullw %xmm5, %xmm3 ; SSE2-NEXT: psrlw $8, %xmm3 ; SSE2-NEXT: psraw $8, %xmm6 -; SSE2-NEXT: psraw $8, %xmm1 -; SSE2-NEXT: pmullw %xmm6, %xmm1 -; SSE2-NEXT: psrlw $8, %xmm1 -; SSE2-NEXT: packuswb %xmm3, %xmm1 -; SSE2-NEXT: pcmpeqb %xmm7, %xmm1 -; SSE2-NEXT: pxor %xmm8, %xmm1 -; SSE2-NEXT: movdqa %xmm1, %xmm0 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE2-NEXT: movdqa %xmm0, %xmm8 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0,0,1,1,2,2,3,3] -; SSE2-NEXT: pslld $31, %xmm8 -; SSE2-NEXT: psrad $31, %xmm8 +; SSE2-NEXT: psraw $8, %xmm2 +; SSE2-NEXT: pmullw %xmm6, %xmm2 +; SSE2-NEXT: psrlw $8, %xmm2 +; SSE2-NEXT: packuswb %xmm3, %xmm2 +; SSE2-NEXT: pcmpeqb %xmm4, %xmm2 +; SSE2-NEXT: pxor %xmm8, %xmm2 +; SSE2-NEXT: movdqa %xmm2, %xmm0 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm2[0],xmm8[1],xmm2[1],xmm8[2],xmm2[2],xmm8[3],xmm2[3],xmm8[4],xmm2[4],xmm8[5],xmm2[5],xmm8[6],xmm2[6],xmm8[7],xmm2[7] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4,4,5,5,6,6,7,7] +; SSE2-NEXT: pslld $31, %xmm2 +; SSE2-NEXT: psrad $31, %xmm2 +; SSE2-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; SSE2-NEXT: movdqa %xmm0, %xmm5 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0,0,1,1,2,2,3,3] +; SSE2-NEXT: pslld $31, %xmm5 +; SSE2-NEXT: psrad $31, %xmm5 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7] ; SSE2-NEXT: pslld $31, %xmm0 ; SSE2-NEXT: psrad $31, %xmm0 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; SSE2-NEXT: movdqa %xmm1, %xmm6 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0,0,1,1,2,2,3,3] +; SSE2-NEXT: movdqa %xmm7, %xmm6 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE2-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4,4,5,5,6,6,7,7] ; SSE2-NEXT: pslld $31, %xmm6 ; SSE2-NEXT: psrad $31, %xmm6 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7] -; SSE2-NEXT: pslld $31, %xmm1 -; SSE2-NEXT: psrad $31, %xmm1 -; SSE2-NEXT: movdqa %xmm5, %xmm3 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE2-NEXT: movdqa %xmm3, %xmm7 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0,0,1,1,2,2,3,3] -; SSE2-NEXT: pslld $31, %xmm7 -; SSE2-NEXT: psrad $31, %xmm7 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4,4,5,5,6,6,7,7] -; SSE2-NEXT: pslld $31, %xmm3 -; SSE2-NEXT: psrad $31, %xmm3 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; SSE2-NEXT: movdqa %xmm5, %xmm4 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm9 = xmm9[0],xmm7[0],xmm9[1],xmm7[1],xmm9[2],xmm7[2],xmm9[3],xmm7[3],xmm9[4],xmm7[4],xmm9[5],xmm7[5],xmm9[6],xmm7[6],xmm9[7],xmm7[7] +; SSE2-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; SSE2-NEXT: movdqa %xmm7, %xmm4 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0,0,1,1,2,2,3,3] ; SSE2-NEXT: pslld $31, %xmm4 ; SSE2-NEXT: psrad $31, %xmm4 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4,4,5,5,6,6,7,7] -; SSE2-NEXT: pslld $31, %xmm5 -; SSE2-NEXT: psrad $31, %xmm5 -; SSE2-NEXT: movdqa %xmm9, 16(%rsi) -; SSE2-NEXT: movdqa %xmm2, (%rsi) -; SSE2-NEXT: movdqa %xmm5, 112(%rdi) +; SSE2-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4,4,5,5,6,6,7,7] +; SSE2-NEXT: pslld $31, %xmm7 +; SSE2-NEXT: psrad $31, %xmm7 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm8[0],xmm3[1],xmm8[1],xmm3[2],xmm8[2],xmm3[3],xmm8[3] +; SSE2-NEXT: psrad $24, %xmm3 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm9[0],xmm1[1],xmm9[1],xmm1[2],xmm9[2],xmm1[3],xmm9[3] +; SSE2-NEXT: psrad $24, %xmm1 +; SSE2-NEXT: movdqa %xmm10, 16(%rsi) +; SSE2-NEXT: movdqa %xmm11, (%rsi) +; SSE2-NEXT: movdqa %xmm1, 64(%rdi) +; SSE2-NEXT: movdqa %xmm3, (%rdi) +; SSE2-NEXT: movdqa %xmm7, 112(%rdi) ; SSE2-NEXT: movdqa %xmm4, 96(%rdi) -; SSE2-NEXT: movdqa %xmm3, 80(%rdi) -; SSE2-NEXT: movdqa %xmm7, 64(%rdi) -; SSE2-NEXT: movdqa %xmm1, 48(%rdi) -; SSE2-NEXT: movdqa %xmm6, 32(%rdi) -; SSE2-NEXT: movdqa %xmm0, 16(%rdi) -; SSE2-NEXT: movdqa %xmm8, (%rdi) +; SSE2-NEXT: movdqa %xmm6, 80(%rdi) +; SSE2-NEXT: movdqa %xmm0, 48(%rdi) +; SSE2-NEXT: movdqa %xmm5, 32(%rdi) +; SSE2-NEXT: movdqa %xmm2, 16(%rdi) ; SSE2-NEXT: retq ; ; SSSE3-LABEL: smulo_v32i8: ; SSSE3: # %bb.0: ; SSSE3-NEXT: movq %rdi, %rax -; SSSE3-NEXT: movdqa %xmm3, %xmm10 -; SSSE3-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm3[8],xmm7[9],xmm3[9],xmm7[10],xmm3[10],xmm7[11],xmm3[11],xmm7[12],xmm3[12],xmm7[13],xmm3[13],xmm7[14],xmm3[14],xmm7[15],xmm3[15] -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3],xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7] +; SSSE3-NEXT: movdqa %xmm3, %xmm9 +; SSSE3-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm3[8],xmm4[9],xmm3[9],xmm4[10],xmm3[10],xmm4[11],xmm3[11],xmm4[12],xmm3[12],xmm4[13],xmm3[13],xmm4[14],xmm3[14],xmm4[15],xmm3[15] +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3],xmm5[4],xmm3[4],xmm5[5],xmm3[5],xmm5[6],xmm3[6],xmm5[7],xmm3[7] ; SSSE3-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; SSSE3-NEXT: movdqa %xmm1, %xmm9 +; SSSE3-NEXT: movdqa %xmm1, %xmm10 ; SSSE3-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm1[8],xmm6[9],xmm1[9],xmm6[10],xmm1[10],xmm6[11],xmm1[11],xmm6[12],xmm1[12],xmm6[13],xmm1[13],xmm6[14],xmm1[14],xmm6[15],xmm1[15] -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3],xmm5[4],xmm1[4],xmm5[5],xmm1[5],xmm5[6],xmm1[6],xmm5[7],xmm1[7] +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm1[0],xmm7[1],xmm1[1],xmm7[2],xmm1[2],xmm7[3],xmm1[3],xmm7[4],xmm1[4],xmm7[5],xmm1[5],xmm7[6],xmm1[6],xmm7[7],xmm1[7] ; SSSE3-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; SSSE3-NEXT: pmullw %xmm3, %xmm1 ; SSSE3-NEXT: movdqa {{.*#+}} xmm8 = [255,255,255,255,255,255,255,255] ; SSSE3-NEXT: pand %xmm8, %xmm1 -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm10 = xmm10[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSSE3-NEXT: punpcklbw {{.*#+}} xmm9 = xmm9[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSSE3-NEXT: pmullw %xmm10, %xmm9 -; SSSE3-NEXT: pand %xmm8, %xmm9 -; SSSE3-NEXT: packuswb %xmm1, %xmm9 +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm10 = xmm10[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSSE3-NEXT: pmullw %xmm9, %xmm10 +; SSSE3-NEXT: pand %xmm8, %xmm10 +; SSSE3-NEXT: packuswb %xmm1, %xmm10 ; SSSE3-NEXT: pxor %xmm1, %xmm1 -; SSSE3-NEXT: pcmpgtb %xmm9, %xmm1 -; SSSE3-NEXT: psraw $8, %xmm7 +; SSSE3-NEXT: pcmpgtb %xmm10, %xmm1 +; SSSE3-NEXT: psraw $8, %xmm4 ; SSSE3-NEXT: psraw $8, %xmm6 -; SSSE3-NEXT: pmullw %xmm7, %xmm6 +; SSSE3-NEXT: pmullw %xmm4, %xmm6 ; SSSE3-NEXT: psrlw $8, %xmm6 -; SSSE3-NEXT: psraw $8, %xmm4 ; SSSE3-NEXT: psraw $8, %xmm5 -; SSSE3-NEXT: pmullw %xmm4, %xmm5 -; SSSE3-NEXT: psrlw $8, %xmm5 -; SSSE3-NEXT: packuswb %xmm6, %xmm5 -; SSSE3-NEXT: pcmpeqb %xmm1, %xmm5 -; SSSE3-NEXT: movdqa %xmm2, %xmm10 -; SSSE3-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm2[8],xmm4[9],xmm2[9],xmm4[10],xmm2[10],xmm4[11],xmm2[11],xmm4[12],xmm2[12],xmm4[13],xmm2[13],xmm4[14],xmm2[14],xmm4[15],xmm2[15] +; SSSE3-NEXT: psraw $8, %xmm7 +; SSSE3-NEXT: pmullw %xmm5, %xmm7 +; SSSE3-NEXT: psrlw $8, %xmm7 +; SSSE3-NEXT: packuswb %xmm6, %xmm7 +; SSSE3-NEXT: pcmpeqb %xmm1, %xmm7 +; SSSE3-NEXT: movdqa %xmm2, %xmm9 +; SSSE3-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm2[8],xmm5[9],xmm2[9],xmm5[10],xmm2[10],xmm5[11],xmm2[11],xmm5[12],xmm2[12],xmm5[13],xmm2[13],xmm5[14],xmm2[14],xmm5[15],xmm2[15] ; SSSE3-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm2[0],xmm6[1],xmm2[1],xmm6[2],xmm2[2],xmm6[3],xmm2[3],xmm6[4],xmm2[4],xmm6[5],xmm2[5],xmm6[6],xmm2[6],xmm6[7],xmm2[7] -; SSSE3-NEXT: movdqa %xmm2, %xmm7 -; SSSE3-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; SSSE3-NEXT: movdqa %xmm0, %xmm2 +; SSSE3-NEXT: movdqa %xmm2, %xmm4 +; SSSE3-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; SSSE3-NEXT: movdqa %xmm0, %xmm11 ; SSSE3-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm0[8],xmm3[9],xmm0[9],xmm3[10],xmm0[10],xmm3[11],xmm0[11],xmm3[12],xmm0[12],xmm3[13],xmm0[13],xmm3[14],xmm0[14],xmm3[15],xmm0[15] -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] ; SSSE3-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; SSSE3-NEXT: pmullw %xmm7, %xmm0 -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm10 = xmm10[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSSE3-NEXT: pmullw %xmm10, %xmm2 -; SSSE3-NEXT: pxor %xmm7, %xmm7 +; SSSE3-NEXT: pmullw %xmm4, %xmm0 +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm9 = xmm9[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm11 = xmm11[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSSE3-NEXT: pmullw %xmm9, %xmm11 +; SSSE3-NEXT: pxor %xmm4, %xmm4 ; SSSE3-NEXT: pand %xmm8, %xmm0 -; SSSE3-NEXT: pand %xmm8, %xmm2 +; SSSE3-NEXT: pand %xmm8, %xmm11 ; SSSE3-NEXT: pcmpeqd %xmm8, %xmm8 -; SSSE3-NEXT: pxor %xmm8, %xmm5 -; SSSE3-NEXT: packuswb %xmm0, %xmm2 -; SSSE3-NEXT: pcmpgtb %xmm2, %xmm7 -; SSSE3-NEXT: psraw $8, %xmm4 +; SSSE3-NEXT: pxor %xmm8, %xmm7 +; SSSE3-NEXT: packuswb %xmm0, %xmm11 +; SSSE3-NEXT: pcmpgtb %xmm11, %xmm4 +; SSSE3-NEXT: psraw $8, %xmm5 ; SSSE3-NEXT: psraw $8, %xmm3 -; SSSE3-NEXT: pmullw %xmm4, %xmm3 +; SSSE3-NEXT: pmullw %xmm5, %xmm3 ; SSSE3-NEXT: psrlw $8, %xmm3 ; SSSE3-NEXT: psraw $8, %xmm6 -; SSSE3-NEXT: psraw $8, %xmm1 -; SSSE3-NEXT: pmullw %xmm6, %xmm1 -; SSSE3-NEXT: psrlw $8, %xmm1 -; SSSE3-NEXT: packuswb %xmm3, %xmm1 -; SSSE3-NEXT: pcmpeqb %xmm7, %xmm1 -; SSSE3-NEXT: pxor %xmm8, %xmm1 -; SSSE3-NEXT: movdqa %xmm1, %xmm0 -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSSE3-NEXT: movdqa %xmm0, %xmm8 -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0,0,1,1,2,2,3,3] -; SSSE3-NEXT: pslld $31, %xmm8 -; SSSE3-NEXT: psrad $31, %xmm8 +; SSSE3-NEXT: psraw $8, %xmm2 +; SSSE3-NEXT: pmullw %xmm6, %xmm2 +; SSSE3-NEXT: psrlw $8, %xmm2 +; SSSE3-NEXT: packuswb %xmm3, %xmm2 +; SSSE3-NEXT: pcmpeqb %xmm4, %xmm2 +; SSSE3-NEXT: pxor %xmm8, %xmm2 +; SSSE3-NEXT: movdqa %xmm2, %xmm0 +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm2[0],xmm8[1],xmm2[1],xmm8[2],xmm2[2],xmm8[3],xmm2[3],xmm8[4],xmm2[4],xmm8[5],xmm2[5],xmm8[6],xmm2[6],xmm8[7],xmm2[7] +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSSE3-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4,4,5,5,6,6,7,7] +; SSSE3-NEXT: pslld $31, %xmm2 +; SSSE3-NEXT: psrad $31, %xmm2 +; SSSE3-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; SSSE3-NEXT: movdqa %xmm0, %xmm5 +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0,0,1,1,2,2,3,3] +; SSSE3-NEXT: pslld $31, %xmm5 +; SSSE3-NEXT: psrad $31, %xmm5 ; SSSE3-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7] ; SSSE3-NEXT: pslld $31, %xmm0 ; SSSE3-NEXT: psrad $31, %xmm0 -; SSSE3-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; SSSE3-NEXT: movdqa %xmm1, %xmm6 -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0,0,1,1,2,2,3,3] +; SSSE3-NEXT: movdqa %xmm7, %xmm6 +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSSE3-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4,4,5,5,6,6,7,7] ; SSSE3-NEXT: pslld $31, %xmm6 ; SSSE3-NEXT: psrad $31, %xmm6 -; SSSE3-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7] -; SSSE3-NEXT: pslld $31, %xmm1 -; SSSE3-NEXT: psrad $31, %xmm1 -; SSSE3-NEXT: movdqa %xmm5, %xmm3 -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSSE3-NEXT: movdqa %xmm3, %xmm7 -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0,0,1,1,2,2,3,3] -; SSSE3-NEXT: pslld $31, %xmm7 -; SSSE3-NEXT: psrad $31, %xmm7 -; SSSE3-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4,4,5,5,6,6,7,7] -; SSSE3-NEXT: pslld $31, %xmm3 -; SSSE3-NEXT: psrad $31, %xmm3 -; SSSE3-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; SSSE3-NEXT: movdqa %xmm5, %xmm4 +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm9 = xmm9[0],xmm7[0],xmm9[1],xmm7[1],xmm9[2],xmm7[2],xmm9[3],xmm7[3],xmm9[4],xmm7[4],xmm9[5],xmm7[5],xmm9[6],xmm7[6],xmm9[7],xmm7[7] +; SSSE3-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; SSSE3-NEXT: movdqa %xmm7, %xmm4 ; SSSE3-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0,0,1,1,2,2,3,3] ; SSSE3-NEXT: pslld $31, %xmm4 ; SSSE3-NEXT: psrad $31, %xmm4 -; SSSE3-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4,4,5,5,6,6,7,7] -; SSSE3-NEXT: pslld $31, %xmm5 -; SSSE3-NEXT: psrad $31, %xmm5 -; SSSE3-NEXT: movdqa %xmm9, 16(%rsi) -; SSSE3-NEXT: movdqa %xmm2, (%rsi) -; SSSE3-NEXT: movdqa %xmm5, 112(%rdi) +; SSSE3-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4,4,5,5,6,6,7,7] +; SSSE3-NEXT: pslld $31, %xmm7 +; SSSE3-NEXT: psrad $31, %xmm7 +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm8[0],xmm3[1],xmm8[1],xmm3[2],xmm8[2],xmm3[3],xmm8[3] +; SSSE3-NEXT: psrad $24, %xmm3 +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm9[0],xmm1[1],xmm9[1],xmm1[2],xmm9[2],xmm1[3],xmm9[3] +; SSSE3-NEXT: psrad $24, %xmm1 +; SSSE3-NEXT: movdqa %xmm10, 16(%rsi) +; SSSE3-NEXT: movdqa %xmm11, (%rsi) +; SSSE3-NEXT: movdqa %xmm1, 64(%rdi) +; SSSE3-NEXT: movdqa %xmm3, (%rdi) +; SSSE3-NEXT: movdqa %xmm7, 112(%rdi) ; SSSE3-NEXT: movdqa %xmm4, 96(%rdi) -; SSSE3-NEXT: movdqa %xmm3, 80(%rdi) -; SSSE3-NEXT: movdqa %xmm7, 64(%rdi) -; SSSE3-NEXT: movdqa %xmm1, 48(%rdi) -; SSSE3-NEXT: movdqa %xmm6, 32(%rdi) -; SSSE3-NEXT: movdqa %xmm0, 16(%rdi) -; SSSE3-NEXT: movdqa %xmm8, (%rdi) +; SSSE3-NEXT: movdqa %xmm6, 80(%rdi) +; SSSE3-NEXT: movdqa %xmm0, 48(%rdi) +; SSSE3-NEXT: movdqa %xmm5, 32(%rdi) +; SSSE3-NEXT: movdqa %xmm2, 16(%rdi) ; SSSE3-NEXT: retq ; ; SSE41-LABEL: smulo_v32i8: ; SSE41: # %bb.0: ; SSE41-NEXT: movq %rdi, %rax -; SSE41-NEXT: pmovzxbw {{.*#+}} xmm10 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero -; SSE41-NEXT: pmovsxbw %xmm3, %xmm7 +; SSE41-NEXT: pmovzxbw {{.*#+}} xmm9 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero +; SSE41-NEXT: pmovsxbw %xmm3, %xmm10 ; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm3[2,3,2,3] ; SSE41-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; SSE41-NEXT: pmovzxbw {{.*#+}} xmm9 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero +; SSE41-NEXT: pmovzxbw {{.*#+}} xmm11 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero ; SSE41-NEXT: pmovsxbw %xmm1, %xmm4 -; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm1[2,3,2,3] +; SSE41-NEXT: pshufd {{.*#+}} xmm7 = xmm1[2,3,2,3] ; SSE41-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; SSE41-NEXT: pmullw %xmm3, %xmm1 ; SSE41-NEXT: movdqa {{.*#+}} xmm8 = [255,255,255,255,255,255,255,255] ; SSE41-NEXT: pand %xmm8, %xmm1 -; SSE41-NEXT: pmullw %xmm10, %xmm9 -; SSE41-NEXT: pand %xmm8, %xmm9 -; SSE41-NEXT: packuswb %xmm1, %xmm9 +; SSE41-NEXT: pmullw %xmm9, %xmm11 +; SSE41-NEXT: pand %xmm8, %xmm11 +; SSE41-NEXT: packuswb %xmm1, %xmm11 ; SSE41-NEXT: pxor %xmm1, %xmm1 -; SSE41-NEXT: pcmpgtb %xmm9, %xmm1 -; SSE41-NEXT: pmullw %xmm7, %xmm4 +; SSE41-NEXT: pcmpgtb %xmm11, %xmm1 +; SSE41-NEXT: pmullw %xmm10, %xmm4 ; SSE41-NEXT: psrlw $8, %xmm4 ; SSE41-NEXT: pmovsxbw %xmm5, %xmm3 -; SSE41-NEXT: pmovsxbw %xmm6, %xmm5 +; SSE41-NEXT: pmovsxbw %xmm7, %xmm5 ; SSE41-NEXT: pmullw %xmm3, %xmm5 ; SSE41-NEXT: psrlw $8, %xmm5 ; SSE41-NEXT: packuswb %xmm5, %xmm4 ; SSE41-NEXT: pcmpeqb %xmm1, %xmm4 -; SSE41-NEXT: pmovzxbw {{.*#+}} xmm10 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero -; SSE41-NEXT: pmovsxbw %xmm2, %xmm5 -; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm2[2,3,2,3] -; SSE41-NEXT: movdqa %xmm2, %xmm7 -; SSE41-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; SSE41-NEXT: pmovzxbw {{.*#+}} xmm9 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero +; SSE41-NEXT: pmovsxbw %xmm2, %xmm10 +; SSE41-NEXT: pshufd {{.*#+}} xmm7 = xmm2[2,3,2,3] +; SSE41-NEXT: movdqa %xmm2, %xmm3 +; SSE41-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; SSE41-NEXT: pmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; SSE41-NEXT: pmovsxbw %xmm0, %xmm2 -; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,2,3] +; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm0[2,3,2,3] ; SSE41-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; SSE41-NEXT: pmullw %xmm7, %xmm0 -; SSE41-NEXT: pmullw %xmm10, %xmm1 -; SSE41-NEXT: pxor %xmm7, %xmm7 +; SSE41-NEXT: pmullw %xmm3, %xmm0 +; SSE41-NEXT: pmullw %xmm9, %xmm1 +; SSE41-NEXT: pxor %xmm3, %xmm3 ; SSE41-NEXT: pand %xmm8, %xmm0 ; SSE41-NEXT: pand %xmm8, %xmm1 ; SSE41-NEXT: pcmpeqd %xmm8, %xmm8 ; SSE41-NEXT: pxor %xmm8, %xmm4 ; SSE41-NEXT: packuswb %xmm0, %xmm1 -; SSE41-NEXT: pcmpgtb %xmm1, %xmm7 -; SSE41-NEXT: pmullw %xmm5, %xmm2 +; SSE41-NEXT: pcmpgtb %xmm1, %xmm3 +; SSE41-NEXT: pmullw %xmm10, %xmm2 ; SSE41-NEXT: psrlw $8, %xmm2 -; SSE41-NEXT: pmovsxbw %xmm6, %xmm0 -; SSE41-NEXT: pmovsxbw %xmm3, %xmm3 -; SSE41-NEXT: pmullw %xmm0, %xmm3 -; SSE41-NEXT: psrlw $8, %xmm3 -; SSE41-NEXT: packuswb %xmm3, %xmm2 -; SSE41-NEXT: pcmpeqb %xmm7, %xmm2 +; SSE41-NEXT: pmovsxbw %xmm7, %xmm0 +; SSE41-NEXT: pmovsxbw %xmm5, %xmm5 +; SSE41-NEXT: pmullw %xmm0, %xmm5 +; SSE41-NEXT: psrlw $8, %xmm5 +; SSE41-NEXT: packuswb %xmm5, %xmm2 +; SSE41-NEXT: pcmpeqb %xmm3, %xmm2 ; SSE41-NEXT: pxor %xmm8, %xmm2 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,1,1] ; SSE41-NEXT: pmovzxbd {{.*#+}} xmm8 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero @@ -1803,35 +1795,31 @@ define <32 x i32> @smulo_v32i8(<32 x i8> %a0, <32 x i8> %a1, <32 x i8>* %p2) nou ; SSE41-NEXT: pslld $31, %xmm5 ; SSE41-NEXT: psrad $31, %xmm5 ; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm2[3,3,3,3] -; SSE41-NEXT: pmovzxbd {{.*#+}} xmm6 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero -; SSE41-NEXT: pslld $31, %xmm6 -; SSE41-NEXT: psrad $31, %xmm6 -; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,1,1] -; SSE41-NEXT: pmovzxbd {{.*#+}} xmm7 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero -; SSE41-NEXT: pslld $31, %xmm7 -; SSE41-NEXT: psrad $31, %xmm7 -; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm4[2,3,2,3] ; SSE41-NEXT: pmovzxbd {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero ; SSE41-NEXT: pslld $31, %xmm3 ; SSE41-NEXT: psrad $31, %xmm3 +; SSE41-NEXT: pshufd {{.*#+}} xmm7 = xmm4[1,1,1,1] +; SSE41-NEXT: pmovzxbd {{.*#+}} xmm7 = xmm7[0],zero,zero,zero,xmm7[1],zero,zero,zero,xmm7[2],zero,zero,zero,xmm7[3],zero,zero,zero +; SSE41-NEXT: pslld $31, %xmm7 +; SSE41-NEXT: psrad $31, %xmm7 +; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm4[2,3,2,3] +; SSE41-NEXT: pmovzxbd {{.*#+}} xmm6 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero,xmm6[2],zero,zero,zero,xmm6[3],zero,zero,zero +; SSE41-NEXT: pslld $31, %xmm6 +; SSE41-NEXT: psrad $31, %xmm6 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[3,3,3,3] ; SSE41-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero ; SSE41-NEXT: pslld $31, %xmm0 ; SSE41-NEXT: psrad $31, %xmm0 -; SSE41-NEXT: pmovzxbd {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero -; SSE41-NEXT: pslld $31, %xmm2 -; SSE41-NEXT: psrad $31, %xmm2 -; SSE41-NEXT: pmovzxbd {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero -; SSE41-NEXT: pslld $31, %xmm4 -; SSE41-NEXT: psrad $31, %xmm4 -; SSE41-NEXT: movdqa %xmm9, 16(%rsi) +; SSE41-NEXT: pmovsxbd %xmm2, %xmm2 +; SSE41-NEXT: pmovsxbd %xmm4, %xmm4 +; SSE41-NEXT: movdqa %xmm11, 16(%rsi) ; SSE41-NEXT: movdqa %xmm1, (%rsi) ; SSE41-NEXT: movdqa %xmm4, 64(%rdi) ; SSE41-NEXT: movdqa %xmm2, (%rdi) ; SSE41-NEXT: movdqa %xmm0, 112(%rdi) -; SSE41-NEXT: movdqa %xmm3, 96(%rdi) +; SSE41-NEXT: movdqa %xmm6, 96(%rdi) ; SSE41-NEXT: movdqa %xmm7, 80(%rdi) -; SSE41-NEXT: movdqa %xmm6, 48(%rdi) +; SSE41-NEXT: movdqa %xmm3, 48(%rdi) ; SSE41-NEXT: movdqa %xmm5, 32(%rdi) ; SSE41-NEXT: movdqa %xmm8, 16(%rdi) ; SSE41-NEXT: retq @@ -2151,86 +2139,82 @@ define <64 x i32> @smulo_v64i8(<64 x i8> %a0, <64 x i8> %a1, <64 x i8>* %p2) nou ; SSE2-NEXT: movdqa %xmm11, 32(%rsi) ; SSE2-NEXT: movdqa %xmm6, 16(%rsi) ; SSE2-NEXT: movdqa %xmm12, %xmm3 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm12 = xmm12[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; SSE2-NEXT: movdqa %xmm5, (%rsi) -; SSE2-NEXT: movdqa %xmm12, %xmm4 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm12 = xmm12[0,0,1,1,2,2,3,3] -; SSE2-NEXT: pslld $31, %xmm12 -; SSE2-NEXT: psrad $31, %xmm12 -; SSE2-NEXT: movdqa %xmm12, 224(%rdi) -; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm12[0],xmm4[1],xmm12[1],xmm4[2],xmm12[2],xmm4[3],xmm12[3],xmm4[4],xmm12[4],xmm4[5],xmm12[5],xmm4[6],xmm12[6],xmm4[7],xmm12[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0,0,1,1,2,2,3,3] +; SSE2-NEXT: psrad $24, %xmm4 +; SSE2-NEXT: movdqa %xmm4, 192(%rdi) +; SSE2-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3],xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0,0,1,1,2,2,3,3] +; SSE2-NEXT: psrad $24, %xmm4 +; SSE2-NEXT: movdqa %xmm4, 128(%rdi) +; SSE2-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3],xmm4[4],xmm1[4],xmm4[5],xmm1[5],xmm4[6],xmm1[6],xmm4[7],xmm1[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0,0,1,1,2,2,3,3] +; SSE2-NEXT: psrad $24, %xmm4 +; SSE2-NEXT: movdqa %xmm4, 64(%rdi) +; SSE2-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3],xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0,0,1,1,2,2,3,3] +; SSE2-NEXT: psrad $24, %xmm4 +; SSE2-NEXT: movdqa %xmm4, (%rdi) +; SSE2-NEXT: movdqa %xmm3, %xmm4 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3] +; SSE2-NEXT: pslld $31, %xmm3 +; SSE2-NEXT: psrad $31, %xmm3 +; SSE2-NEXT: movdqa %xmm3, 224(%rdi) ; SSE2-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4,4,5,5,6,6,7,7] ; SSE2-NEXT: pslld $31, %xmm4 ; SSE2-NEXT: psrad $31, %xmm4 ; SSE2-NEXT: movdqa %xmm4, 240(%rdi) +; SSE2-NEXT: movdqa %xmm2, %xmm3 +; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm12 = xmm12[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE2-NEXT: punpckhwd {{.*#+}} xmm12 = xmm12[4,4,5,5,6,6,7,7] +; SSE2-NEXT: pslld $31, %xmm12 +; SSE2-NEXT: psrad $31, %xmm12 +; SSE2-NEXT: movdqa %xmm12, 208(%rdi) ; SSE2-NEXT: movdqa %xmm3, %xmm4 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3] ; SSE2-NEXT: pslld $31, %xmm3 ; SSE2-NEXT: psrad $31, %xmm3 -; SSE2-NEXT: movdqa %xmm3, 192(%rdi) -; SSE2-NEXT: movdqa %xmm2, %xmm3 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; SSE2-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4,4,5,5,6,6,7,7] -; SSE2-NEXT: pslld $31, %xmm4 -; SSE2-NEXT: psrad $31, %xmm4 -; SSE2-NEXT: movdqa %xmm4, 208(%rdi) -; SSE2-NEXT: movdqa %xmm2, %xmm4 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3] -; SSE2-NEXT: pslld $31, %xmm2 -; SSE2-NEXT: psrad $31, %xmm2 -; SSE2-NEXT: movdqa %xmm2, 160(%rdi) -; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE2-NEXT: movdqa %xmm3, 160(%rdi) ; SSE2-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4,4,5,5,6,6,7,7] ; SSE2-NEXT: pslld $31, %xmm4 ; SSE2-NEXT: psrad $31, %xmm4 ; SSE2-NEXT: movdqa %xmm4, 176(%rdi) -; SSE2-NEXT: movdqa %xmm3, %xmm2 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3] -; SSE2-NEXT: pslld $31, %xmm3 -; SSE2-NEXT: psrad $31, %xmm3 -; SSE2-NEXT: movdqa %xmm3, 128(%rdi) ; SSE2-NEXT: movdqa %xmm1, %xmm3 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4,4,5,5,6,6,7,7] ; SSE2-NEXT: pslld $31, %xmm2 ; SSE2-NEXT: psrad $31, %xmm2 ; SSE2-NEXT: movdqa %xmm2, 144(%rdi) -; SSE2-NEXT: movdqa %xmm1, %xmm2 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3] -; SSE2-NEXT: pslld $31, %xmm1 -; SSE2-NEXT: psrad $31, %xmm1 -; SSE2-NEXT: movdqa %xmm1, 96(%rdi) -; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE2-NEXT: movdqa %xmm3, %xmm2 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3] +; SSE2-NEXT: pslld $31, %xmm3 +; SSE2-NEXT: psrad $31, %xmm3 +; SSE2-NEXT: movdqa %xmm3, 96(%rdi) ; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4,4,5,5,6,6,7,7] ; SSE2-NEXT: pslld $31, %xmm2 ; SSE2-NEXT: psrad $31, %xmm2 ; SSE2-NEXT: movdqa %xmm2, 112(%rdi) -; SSE2-NEXT: movdqa %xmm3, %xmm1 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3] -; SSE2-NEXT: pslld $31, %xmm3 -; SSE2-NEXT: psrad $31, %xmm3 -; SSE2-NEXT: movdqa %xmm3, 64(%rdi) ; SSE2-NEXT: movdqa %xmm0, %xmm2 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7] ; SSE2-NEXT: pslld $31, %xmm1 ; SSE2-NEXT: psrad $31, %xmm1 ; SSE2-NEXT: movdqa %xmm1, 80(%rdi) -; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] -; SSE2-NEXT: pslld $31, %xmm0 -; SSE2-NEXT: psrad $31, %xmm0 -; SSE2-NEXT: movdqa %xmm0, 32(%rdi) -; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE2-NEXT: movdqa %xmm2, %xmm1 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3] +; SSE2-NEXT: pslld $31, %xmm2 +; SSE2-NEXT: psrad $31, %xmm2 +; SSE2-NEXT: movdqa %xmm2, 32(%rdi) ; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7] ; SSE2-NEXT: pslld $31, %xmm1 ; SSE2-NEXT: psrad $31, %xmm1 ; SSE2-NEXT: movdqa %xmm1, 48(%rdi) -; SSE2-NEXT: movdqa %xmm2, %xmm0 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3] -; SSE2-NEXT: pslld $31, %xmm2 -; SSE2-NEXT: psrad $31, %xmm2 -; SSE2-NEXT: movdqa %xmm2, (%rdi) +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSE2-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7] ; SSE2-NEXT: pslld $31, %xmm0 ; SSE2-NEXT: psrad $31, %xmm0 @@ -2366,86 +2350,82 @@ define <64 x i32> @smulo_v64i8(<64 x i8> %a0, <64 x i8> %a1, <64 x i8>* %p2) nou ; SSSE3-NEXT: movdqa %xmm11, 32(%rsi) ; SSSE3-NEXT: movdqa %xmm6, 16(%rsi) ; SSSE3-NEXT: movdqa %xmm12, %xmm3 -; SSSE3-NEXT: punpckhbw {{.*#+}} xmm12 = xmm12[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; SSSE3-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; SSSE3-NEXT: movdqa %xmm5, (%rsi) -; SSSE3-NEXT: movdqa %xmm12, %xmm4 -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm12 = xmm12[0,0,1,1,2,2,3,3] -; SSSE3-NEXT: pslld $31, %xmm12 -; SSSE3-NEXT: psrad $31, %xmm12 -; SSSE3-NEXT: movdqa %xmm12, 224(%rdi) -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm12[0],xmm4[1],xmm12[1],xmm4[2],xmm12[2],xmm4[3],xmm12[3],xmm4[4],xmm12[4],xmm4[5],xmm12[5],xmm4[6],xmm12[6],xmm4[7],xmm12[7] +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0,0,1,1,2,2,3,3] +; SSSE3-NEXT: psrad $24, %xmm4 +; SSSE3-NEXT: movdqa %xmm4, 192(%rdi) +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3],xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7] +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0,0,1,1,2,2,3,3] +; SSSE3-NEXT: psrad $24, %xmm4 +; SSSE3-NEXT: movdqa %xmm4, 128(%rdi) +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3],xmm4[4],xmm1[4],xmm4[5],xmm1[5],xmm4[6],xmm1[6],xmm4[7],xmm1[7] +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0,0,1,1,2,2,3,3] +; SSSE3-NEXT: psrad $24, %xmm4 +; SSSE3-NEXT: movdqa %xmm4, 64(%rdi) +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3],xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7] +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0,0,1,1,2,2,3,3] +; SSSE3-NEXT: psrad $24, %xmm4 +; SSSE3-NEXT: movdqa %xmm4, (%rdi) +; SSSE3-NEXT: movdqa %xmm3, %xmm4 +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3] +; SSSE3-NEXT: pslld $31, %xmm3 +; SSSE3-NEXT: psrad $31, %xmm3 +; SSSE3-NEXT: movdqa %xmm3, 224(%rdi) ; SSSE3-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4,4,5,5,6,6,7,7] ; SSSE3-NEXT: pslld $31, %xmm4 ; SSSE3-NEXT: psrad $31, %xmm4 ; SSSE3-NEXT: movdqa %xmm4, 240(%rdi) +; SSSE3-NEXT: movdqa %xmm2, %xmm3 +; SSSE3-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm12 = xmm12[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSSE3-NEXT: punpckhwd {{.*#+}} xmm12 = xmm12[4,4,5,5,6,6,7,7] +; SSSE3-NEXT: pslld $31, %xmm12 +; SSSE3-NEXT: psrad $31, %xmm12 +; SSSE3-NEXT: movdqa %xmm12, 208(%rdi) ; SSSE3-NEXT: movdqa %xmm3, %xmm4 ; SSSE3-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3] ; SSSE3-NEXT: pslld $31, %xmm3 ; SSSE3-NEXT: psrad $31, %xmm3 -; SSSE3-NEXT: movdqa %xmm3, 192(%rdi) -; SSSE3-NEXT: movdqa %xmm2, %xmm3 -; SSSE3-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; SSSE3-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4,4,5,5,6,6,7,7] -; SSSE3-NEXT: pslld $31, %xmm4 -; SSSE3-NEXT: psrad $31, %xmm4 -; SSSE3-NEXT: movdqa %xmm4, 208(%rdi) -; SSSE3-NEXT: movdqa %xmm2, %xmm4 -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3] -; SSSE3-NEXT: pslld $31, %xmm2 -; SSSE3-NEXT: psrad $31, %xmm2 -; SSSE3-NEXT: movdqa %xmm2, 160(%rdi) -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSSE3-NEXT: movdqa %xmm3, 160(%rdi) ; SSSE3-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4,4,5,5,6,6,7,7] ; SSSE3-NEXT: pslld $31, %xmm4 ; SSSE3-NEXT: psrad $31, %xmm4 ; SSSE3-NEXT: movdqa %xmm4, 176(%rdi) -; SSSE3-NEXT: movdqa %xmm3, %xmm2 -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3] -; SSSE3-NEXT: pslld $31, %xmm3 -; SSSE3-NEXT: psrad $31, %xmm3 -; SSSE3-NEXT: movdqa %xmm3, 128(%rdi) ; SSSE3-NEXT: movdqa %xmm1, %xmm3 -; SSSE3-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; SSSE3-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSSE3-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4,4,5,5,6,6,7,7] ; SSSE3-NEXT: pslld $31, %xmm2 ; SSSE3-NEXT: psrad $31, %xmm2 ; SSSE3-NEXT: movdqa %xmm2, 144(%rdi) -; SSSE3-NEXT: movdqa %xmm1, %xmm2 -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3] -; SSSE3-NEXT: pslld $31, %xmm1 -; SSSE3-NEXT: psrad $31, %xmm1 -; SSSE3-NEXT: movdqa %xmm1, 96(%rdi) -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSSE3-NEXT: movdqa %xmm3, %xmm2 +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3] +; SSSE3-NEXT: pslld $31, %xmm3 +; SSSE3-NEXT: psrad $31, %xmm3 +; SSSE3-NEXT: movdqa %xmm3, 96(%rdi) ; SSSE3-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4,4,5,5,6,6,7,7] ; SSSE3-NEXT: pslld $31, %xmm2 ; SSSE3-NEXT: psrad $31, %xmm2 ; SSSE3-NEXT: movdqa %xmm2, 112(%rdi) -; SSSE3-NEXT: movdqa %xmm3, %xmm1 -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3] -; SSSE3-NEXT: pslld $31, %xmm3 -; SSSE3-NEXT: psrad $31, %xmm3 -; SSSE3-NEXT: movdqa %xmm3, 64(%rdi) ; SSSE3-NEXT: movdqa %xmm0, %xmm2 -; SSSE3-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; SSSE3-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSSE3-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7] ; SSSE3-NEXT: pslld $31, %xmm1 ; SSSE3-NEXT: psrad $31, %xmm1 ; SSSE3-NEXT: movdqa %xmm1, 80(%rdi) -; SSSE3-NEXT: movdqa %xmm0, %xmm1 -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] -; SSSE3-NEXT: pslld $31, %xmm0 -; SSSE3-NEXT: psrad $31, %xmm0 -; SSSE3-NEXT: movdqa %xmm0, 32(%rdi) -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSSE3-NEXT: movdqa %xmm2, %xmm1 +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3] +; SSSE3-NEXT: pslld $31, %xmm2 +; SSSE3-NEXT: psrad $31, %xmm2 +; SSSE3-NEXT: movdqa %xmm2, 32(%rdi) ; SSSE3-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7] ; SSSE3-NEXT: pslld $31, %xmm1 ; SSSE3-NEXT: psrad $31, %xmm1 ; SSSE3-NEXT: movdqa %xmm1, 48(%rdi) -; SSSE3-NEXT: movdqa %xmm2, %xmm0 -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3] -; SSSE3-NEXT: pslld $31, %xmm2 -; SSSE3-NEXT: psrad $31, %xmm2 -; SSSE3-NEXT: movdqa %xmm2, (%rdi) +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSSE3-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7] ; SSSE3-NEXT: pslld $31, %xmm0 ; SSSE3-NEXT: psrad $31, %xmm0 @@ -2455,32 +2435,32 @@ define <64 x i32> @smulo_v64i8(<64 x i8> %a0, <64 x i8> %a1, <64 x i8>* %p2) nou ; SSE41-LABEL: smulo_v64i8: ; SSE41: # %bb.0: ; SSE41-NEXT: movq %rdi, %rax -; SSE41-NEXT: pmovzxbw {{.*#+}} xmm8 = xmm7[0],zero,xmm7[1],zero,xmm7[2],zero,xmm7[3],zero,xmm7[4],zero,xmm7[5],zero,xmm7[6],zero,xmm7[7],zero +; SSE41-NEXT: pmovzxbw {{.*#+}} xmm9 = xmm7[0],zero,xmm7[1],zero,xmm7[2],zero,xmm7[3],zero,xmm7[4],zero,xmm7[5],zero,xmm7[6],zero,xmm7[7],zero ; SSE41-NEXT: movdqa %xmm7, %xmm10 ; SSE41-NEXT: punpckhbw {{.*#+}} xmm10 = xmm10[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; SSE41-NEXT: pmovzxbw {{.*#+}} xmm9 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero +; SSE41-NEXT: pmovzxbw {{.*#+}} xmm8 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero ; SSE41-NEXT: movdqa %xmm3, %xmm11 ; SSE41-NEXT: punpckhbw {{.*#+}} xmm11 = xmm11[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; SSE41-NEXT: pmullw %xmm10, %xmm11 ; SSE41-NEXT: pmovsxbw %xmm7, %xmm12 -; SSE41-NEXT: pmullw %xmm8, %xmm9 +; SSE41-NEXT: pmullw %xmm9, %xmm8 ; SSE41-NEXT: movdqa {{.*#+}} xmm10 = [255,255,255,255,255,255,255,255] ; SSE41-NEXT: pand %xmm10, %xmm11 -; SSE41-NEXT: pand %xmm10, %xmm9 -; SSE41-NEXT: packuswb %xmm11, %xmm9 -; SSE41-NEXT: pmovsxbw %xmm3, %xmm8 +; SSE41-NEXT: pand %xmm10, %xmm8 +; SSE41-NEXT: packuswb %xmm11, %xmm8 +; SSE41-NEXT: pmovsxbw %xmm3, %xmm9 ; SSE41-NEXT: pshufd {{.*#+}} xmm11 = xmm7[2,3,2,3] ; SSE41-NEXT: pshufd {{.*#+}} xmm13 = xmm3[2,3,2,3] -; SSE41-NEXT: pmullw %xmm12, %xmm8 +; SSE41-NEXT: pmullw %xmm12, %xmm9 ; SSE41-NEXT: pxor %xmm7, %xmm7 -; SSE41-NEXT: pcmpgtb %xmm9, %xmm7 -; SSE41-NEXT: psrlw $8, %xmm8 +; SSE41-NEXT: pcmpgtb %xmm8, %xmm7 +; SSE41-NEXT: psrlw $8, %xmm9 ; SSE41-NEXT: pmovsxbw %xmm11, %xmm11 ; SSE41-NEXT: pmovsxbw %xmm13, %xmm3 ; SSE41-NEXT: pmullw %xmm11, %xmm3 ; SSE41-NEXT: psrlw $8, %xmm3 -; SSE41-NEXT: packuswb %xmm3, %xmm8 -; SSE41-NEXT: pcmpeqb %xmm7, %xmm8 +; SSE41-NEXT: packuswb %xmm3, %xmm9 +; SSE41-NEXT: pcmpeqb %xmm7, %xmm9 ; SSE41-NEXT: pmovzxbw {{.*#+}} xmm11 = xmm6[0],zero,xmm6[1],zero,xmm6[2],zero,xmm6[3],zero,xmm6[4],zero,xmm6[5],zero,xmm6[6],zero,xmm6[7],zero ; SSE41-NEXT: movdqa %xmm6, %xmm7 ; SSE41-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] @@ -2557,41 +2537,33 @@ define <64 x i32> @smulo_v64i8(<64 x i8> %a0, <64 x i8> %a1, <64 x i8>* %p2) nou ; SSE41-NEXT: pcmpgtb %xmm5, %xmm0 ; SSE41-NEXT: pcmpeqb %xmm0, %xmm1 ; SSE41-NEXT: pcmpeqd %xmm0, %xmm0 -; SSE41-NEXT: pxor %xmm0, %xmm8 +; SSE41-NEXT: pxor %xmm0, %xmm9 ; SSE41-NEXT: pxor %xmm0, %xmm3 ; SSE41-NEXT: pxor %xmm0, %xmm2 ; SSE41-NEXT: pxor %xmm0, %xmm1 -; SSE41-NEXT: movdqa %xmm9, 48(%rsi) +; SSE41-NEXT: movdqa %xmm8, 48(%rsi) ; SSE41-NEXT: movdqa %xmm12, 32(%rsi) ; SSE41-NEXT: movdqa %xmm11, 16(%rsi) ; SSE41-NEXT: movdqa %xmm5, (%rsi) -; SSE41-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm8[0],zero,zero,zero,xmm8[1],zero,zero,zero,xmm8[2],zero,zero,zero,xmm8[3],zero,zero,zero -; SSE41-NEXT: pslld $31, %xmm0 -; SSE41-NEXT: psrad $31, %xmm0 +; SSE41-NEXT: pmovsxbd %xmm9, %xmm0 ; SSE41-NEXT: movdqa %xmm0, 192(%rdi) -; SSE41-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero -; SSE41-NEXT: pslld $31, %xmm0 -; SSE41-NEXT: psrad $31, %xmm0 +; SSE41-NEXT: pmovsxbd %xmm3, %xmm0 ; SSE41-NEXT: movdqa %xmm0, 128(%rdi) -; SSE41-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero -; SSE41-NEXT: pslld $31, %xmm0 -; SSE41-NEXT: psrad $31, %xmm0 +; SSE41-NEXT: pmovsxbd %xmm2, %xmm0 ; SSE41-NEXT: movdqa %xmm0, 64(%rdi) -; SSE41-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero -; SSE41-NEXT: pslld $31, %xmm0 -; SSE41-NEXT: psrad $31, %xmm0 +; SSE41-NEXT: pmovsxbd %xmm1, %xmm0 ; SSE41-NEXT: movdqa %xmm0, (%rdi) -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm8[2,3,2,3] +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm9[2,3,2,3] ; SSE41-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero ; SSE41-NEXT: pslld $31, %xmm0 ; SSE41-NEXT: psrad $31, %xmm0 ; SSE41-NEXT: movdqa %xmm0, 224(%rdi) -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm8[3,3,3,3] +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm9[3,3,3,3] ; SSE41-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero ; SSE41-NEXT: pslld $31, %xmm0 ; SSE41-NEXT: psrad $31, %xmm0 ; SSE41-NEXT: movdqa %xmm0, 240(%rdi) -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm8[1,1,1,1] +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm9[1,1,1,1] ; SSE41-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero ; SSE41-NEXT: pslld $31, %xmm0 ; SSE41-NEXT: psrad $31, %xmm0 @@ -3018,10 +2990,8 @@ define <8 x i32> @smulo_v8i16(<8 x i16> %a0, <8 x i16> %a1, <8 x i16>* %p2) noun ; SSE2-NEXT: pcmpeqw %xmm2, %xmm0 ; SSE2-NEXT: pcmpeqd %xmm1, %xmm1 ; SSE2-NEXT: pxor %xmm0, %xmm1 -; SSE2-NEXT: movdqa %xmm1, %xmm0 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] -; SSE2-NEXT: pslld $31, %xmm0 -; SSE2-NEXT: psrad $31, %xmm0 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; SSE2-NEXT: psrad $16, %xmm0 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7] ; SSE2-NEXT: pslld $31, %xmm1 ; SSE2-NEXT: psrad $31, %xmm1 @@ -3037,10 +3007,8 @@ define <8 x i32> @smulo_v8i16(<8 x i16> %a0, <8 x i16> %a1, <8 x i16>* %p2) noun ; SSSE3-NEXT: pcmpeqw %xmm2, %xmm0 ; SSSE3-NEXT: pcmpeqd %xmm1, %xmm1 ; SSSE3-NEXT: pxor %xmm0, %xmm1 -; SSSE3-NEXT: movdqa %xmm1, %xmm0 -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] -; SSSE3-NEXT: pslld $31, %xmm0 -; SSSE3-NEXT: psrad $31, %xmm0 +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; SSSE3-NEXT: psrad $16, %xmm0 ; SSSE3-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7] ; SSSE3-NEXT: pslld $31, %xmm1 ; SSSE3-NEXT: psrad $31, %xmm1 @@ -3056,9 +3024,7 @@ define <8 x i32> @smulo_v8i16(<8 x i16> %a0, <8 x i16> %a1, <8 x i16>* %p2) noun ; SSE41-NEXT: pcmpeqw %xmm2, %xmm0 ; SSE41-NEXT: pcmpeqd %xmm1, %xmm1 ; SSE41-NEXT: pxor %xmm0, %xmm1 -; SSE41-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero -; SSE41-NEXT: pslld $31, %xmm0 -; SSE41-NEXT: psrad $31, %xmm0 +; SSE41-NEXT: pmovsxwd %xmm1, %xmm0 ; SSE41-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7] ; SSE41-NEXT: pslld $31, %xmm1 ; SSE41-NEXT: psrad $31, %xmm1 diff --git a/llvm/test/CodeGen/X86/vec_ssubo.ll b/llvm/test/CodeGen/X86/vec_ssubo.ll index 20fca27..b591f3b 100644 --- a/llvm/test/CodeGen/X86/vec_ssubo.ll +++ b/llvm/test/CodeGen/X86/vec_ssubo.ll @@ -544,12 +544,11 @@ define <16 x i32> @ssubo_v16i8(<16 x i8> %a0, <16 x i8> %a1, <16 x i8>* %p2) nou ; SSE2-NEXT: pcmpeqb %xmm0, %xmm2 ; SSE2-NEXT: pcmpeqd %xmm3, %xmm3 ; SSE2-NEXT: pxor %xmm2, %xmm3 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3],xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3] +; SSE2-NEXT: psrad $24, %xmm4 ; SSE2-NEXT: movdqa %xmm3, %xmm1 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE2-NEXT: movdqa %xmm1, %xmm4 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0,0,1,1,2,2,3,3] -; SSE2-NEXT: pslld $31, %xmm4 -; SSE2-NEXT: psrad $31, %xmm4 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7] ; SSE2-NEXT: pslld $31, %xmm1 ; SSE2-NEXT: psrad $31, %xmm1 @@ -573,12 +572,11 @@ define <16 x i32> @ssubo_v16i8(<16 x i8> %a0, <16 x i8> %a1, <16 x i8>* %p2) nou ; SSSE3-NEXT: pcmpeqb %xmm0, %xmm2 ; SSSE3-NEXT: pcmpeqd %xmm3, %xmm3 ; SSSE3-NEXT: pxor %xmm2, %xmm3 +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3],xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7] +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3] +; SSSE3-NEXT: psrad $24, %xmm4 ; SSSE3-NEXT: movdqa %xmm3, %xmm1 ; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSSE3-NEXT: movdqa %xmm1, %xmm4 -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0,0,1,1,2,2,3,3] -; SSSE3-NEXT: pslld $31, %xmm4 -; SSSE3-NEXT: psrad $31, %xmm4 ; SSSE3-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7] ; SSSE3-NEXT: pslld $31, %xmm1 ; SSSE3-NEXT: psrad $31, %xmm1 @@ -602,9 +600,7 @@ define <16 x i32> @ssubo_v16i8(<16 x i8> %a0, <16 x i8> %a1, <16 x i8>* %p2) nou ; SSE41-NEXT: pcmpeqb %xmm0, %xmm2 ; SSE41-NEXT: pcmpeqd %xmm3, %xmm3 ; SSE41-NEXT: pxor %xmm2, %xmm3 -; SSE41-NEXT: pmovzxbd {{.*#+}} xmm4 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero -; SSE41-NEXT: pslld $31, %xmm4 -; SSE41-NEXT: psrad $31, %xmm4 +; SSE41-NEXT: pmovsxbd %xmm3, %xmm4 ; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,1,1,1] ; SSE41-NEXT: pmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero ; SSE41-NEXT: pslld $31, %xmm1 @@ -678,10 +674,8 @@ define <8 x i32> @ssubo_v8i16(<8 x i16> %a0, <8 x i16> %a1, <8 x i16>* %p2) noun ; SSE2-NEXT: pcmpeqw %xmm0, %xmm2 ; SSE2-NEXT: pcmpeqd %xmm1, %xmm1 ; SSE2-NEXT: pxor %xmm2, %xmm1 -; SSE2-NEXT: movdqa %xmm1, %xmm2 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3] -; SSE2-NEXT: pslld $31, %xmm2 -; SSE2-NEXT: psrad $31, %xmm2 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; SSE2-NEXT: psrad $16, %xmm2 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7] ; SSE2-NEXT: pslld $31, %xmm1 ; SSE2-NEXT: psrad $31, %xmm1 @@ -697,10 +691,8 @@ define <8 x i32> @ssubo_v8i16(<8 x i16> %a0, <8 x i16> %a1, <8 x i16>* %p2) noun ; SSSE3-NEXT: pcmpeqw %xmm0, %xmm2 ; SSSE3-NEXT: pcmpeqd %xmm1, %xmm1 ; SSSE3-NEXT: pxor %xmm2, %xmm1 -; SSSE3-NEXT: movdqa %xmm1, %xmm2 -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3] -; SSSE3-NEXT: pslld $31, %xmm2 -; SSSE3-NEXT: psrad $31, %xmm2 +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; SSSE3-NEXT: psrad $16, %xmm2 ; SSSE3-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7] ; SSSE3-NEXT: pslld $31, %xmm1 ; SSSE3-NEXT: psrad $31, %xmm1 @@ -716,9 +708,7 @@ define <8 x i32> @ssubo_v8i16(<8 x i16> %a0, <8 x i16> %a1, <8 x i16>* %p2) noun ; SSE41-NEXT: pcmpeqw %xmm0, %xmm2 ; SSE41-NEXT: pcmpeqd %xmm1, %xmm1 ; SSE41-NEXT: pxor %xmm2, %xmm1 -; SSE41-NEXT: pmovzxwd {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero -; SSE41-NEXT: pslld $31, %xmm2 -; SSE41-NEXT: psrad $31, %xmm2 +; SSE41-NEXT: pmovsxwd %xmm1, %xmm2 ; SSE41-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7] ; SSE41-NEXT: pslld $31, %xmm1 ; SSE41-NEXT: psrad $31, %xmm1 diff --git a/llvm/test/CodeGen/X86/vec_uaddo.ll b/llvm/test/CodeGen/X86/vec_uaddo.ll index 7cf566f..836cfaf 100644 --- a/llvm/test/CodeGen/X86/vec_uaddo.ll +++ b/llvm/test/CodeGen/X86/vec_uaddo.ll @@ -626,12 +626,11 @@ define <16 x i32> @uaddo_v16i8(<16 x i8> %a0, <16 x i8> %a1, <16 x i8>* %p2) nou ; SSE2-NEXT: pcmpeqb %xmm1, %xmm0 ; SSE2-NEXT: pcmpeqd %xmm3, %xmm3 ; SSE2-NEXT: pxor %xmm0, %xmm3 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3],xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] +; SSE2-NEXT: psrad $24, %xmm0 ; SSE2-NEXT: movdqa %xmm3, %xmm4 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE2-NEXT: movdqa %xmm4, %xmm0 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] -; SSE2-NEXT: pslld $31, %xmm0 -; SSE2-NEXT: psrad $31, %xmm0 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4,4,5,5,6,6,7,7] ; SSE2-NEXT: pslld $31, %xmm4 ; SSE2-NEXT: psrad $31, %xmm4 @@ -654,12 +653,11 @@ define <16 x i32> @uaddo_v16i8(<16 x i8> %a0, <16 x i8> %a1, <16 x i8>* %p2) nou ; SSSE3-NEXT: pcmpeqb %xmm1, %xmm0 ; SSSE3-NEXT: pcmpeqd %xmm3, %xmm3 ; SSSE3-NEXT: pxor %xmm0, %xmm3 +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3],xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7] +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] +; SSSE3-NEXT: psrad $24, %xmm0 ; SSSE3-NEXT: movdqa %xmm3, %xmm4 ; SSSE3-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSSE3-NEXT: movdqa %xmm4, %xmm0 -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] -; SSSE3-NEXT: pslld $31, %xmm0 -; SSSE3-NEXT: psrad $31, %xmm0 ; SSSE3-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4,4,5,5,6,6,7,7] ; SSSE3-NEXT: pslld $31, %xmm4 ; SSSE3-NEXT: psrad $31, %xmm4 @@ -682,9 +680,7 @@ define <16 x i32> @uaddo_v16i8(<16 x i8> %a0, <16 x i8> %a1, <16 x i8>* %p2) nou ; SSE41-NEXT: pcmpeqb %xmm1, %xmm0 ; SSE41-NEXT: pcmpeqd %xmm3, %xmm3 ; SSE41-NEXT: pxor %xmm0, %xmm3 -; SSE41-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero -; SSE41-NEXT: pslld $31, %xmm0 -; SSE41-NEXT: psrad $31, %xmm0 +; SSE41-NEXT: pmovsxbd %xmm3, %xmm0 ; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,1,1,1] ; SSE41-NEXT: pmovzxbd {{.*#+}} xmm4 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero ; SSE41-NEXT: pslld $31, %xmm4 @@ -757,10 +753,8 @@ define <8 x i32> @uaddo_v8i16(<8 x i16> %a0, <8 x i16> %a1, <8 x i16>* %p2) noun ; SSE2-NEXT: pxor %xmm3, %xmm2 ; SSE2-NEXT: pxor %xmm1, %xmm3 ; SSE2-NEXT: pcmpgtw %xmm3, %xmm2 -; SSE2-NEXT: movdqa %xmm2, %xmm0 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] -; SSE2-NEXT: pslld $31, %xmm0 -; SSE2-NEXT: psrad $31, %xmm0 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; SSE2-NEXT: psrad $16, %xmm0 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4,4,5,5,6,6,7,7] ; SSE2-NEXT: pslld $31, %xmm2 ; SSE2-NEXT: psrad $31, %xmm2 @@ -776,10 +770,8 @@ define <8 x i32> @uaddo_v8i16(<8 x i16> %a0, <8 x i16> %a1, <8 x i16>* %p2) noun ; SSSE3-NEXT: pxor %xmm3, %xmm2 ; SSSE3-NEXT: pxor %xmm1, %xmm3 ; SSSE3-NEXT: pcmpgtw %xmm3, %xmm2 -; SSSE3-NEXT: movdqa %xmm2, %xmm0 -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] -; SSSE3-NEXT: pslld $31, %xmm0 -; SSSE3-NEXT: psrad $31, %xmm0 +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; SSSE3-NEXT: psrad $16, %xmm0 ; SSSE3-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4,4,5,5,6,6,7,7] ; SSSE3-NEXT: pslld $31, %xmm2 ; SSSE3-NEXT: psrad $31, %xmm2 @@ -794,9 +786,7 @@ define <8 x i32> @uaddo_v8i16(<8 x i16> %a0, <8 x i16> %a1, <8 x i16>* %p2) noun ; SSE41-NEXT: pcmpeqw %xmm1, %xmm0 ; SSE41-NEXT: pcmpeqd %xmm2, %xmm2 ; SSE41-NEXT: pxor %xmm0, %xmm2 -; SSE41-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero -; SSE41-NEXT: pslld $31, %xmm0 -; SSE41-NEXT: psrad $31, %xmm0 +; SSE41-NEXT: pmovsxwd %xmm2, %xmm0 ; SSE41-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4,4,5,5,6,6,7,7] ; SSE41-NEXT: pslld $31, %xmm2 ; SSE41-NEXT: psrad $31, %xmm2 diff --git a/llvm/test/CodeGen/X86/vec_umulo.ll b/llvm/test/CodeGen/X86/vec_umulo.ll index 5d29e20..9075b83 100644 --- a/llvm/test/CodeGen/X86/vec_umulo.ll +++ b/llvm/test/CodeGen/X86/vec_umulo.ll @@ -1104,12 +1104,11 @@ define <16 x i32> @umulo_v16i8(<16 x i8> %a0, <16 x i8> %a1, <16 x i8>* %p2) nou ; SSE2-NEXT: pcmpeqb %xmm2, %xmm0 ; SSE2-NEXT: pcmpeqd %xmm3, %xmm3 ; SSE2-NEXT: pxor %xmm0, %xmm3 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3],xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] +; SSE2-NEXT: psrad $24, %xmm0 ; SSE2-NEXT: movdqa %xmm3, %xmm1 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE2-NEXT: movdqa %xmm1, %xmm0 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] -; SSE2-NEXT: pslld $31, %xmm0 -; SSE2-NEXT: psrad $31, %xmm0 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7] ; SSE2-NEXT: pslld $31, %xmm1 ; SSE2-NEXT: psrad $31, %xmm1 @@ -1155,12 +1154,11 @@ define <16 x i32> @umulo_v16i8(<16 x i8> %a0, <16 x i8> %a1, <16 x i8>* %p2) nou ; SSSE3-NEXT: pcmpeqb %xmm2, %xmm0 ; SSSE3-NEXT: pcmpeqd %xmm3, %xmm3 ; SSSE3-NEXT: pxor %xmm0, %xmm3 +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3],xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7] +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] +; SSSE3-NEXT: psrad $24, %xmm0 ; SSSE3-NEXT: movdqa %xmm3, %xmm1 ; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSSE3-NEXT: movdqa %xmm1, %xmm0 -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] -; SSSE3-NEXT: pslld $31, %xmm0 -; SSSE3-NEXT: psrad $31, %xmm0 ; SSSE3-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7] ; SSSE3-NEXT: pslld $31, %xmm1 ; SSSE3-NEXT: psrad $31, %xmm1 @@ -1199,9 +1197,7 @@ define <16 x i32> @umulo_v16i8(<16 x i8> %a0, <16 x i8> %a1, <16 x i8>* %p2) nou ; SSE41-NEXT: pcmpeqb %xmm2, %xmm5 ; SSE41-NEXT: pcmpeqd %xmm3, %xmm3 ; SSE41-NEXT: pxor %xmm5, %xmm3 -; SSE41-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero -; SSE41-NEXT: pslld $31, %xmm0 -; SSE41-NEXT: psrad $31, %xmm0 +; SSE41-NEXT: pmovsxbd %xmm3, %xmm0 ; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,1,1,1] ; SSE41-NEXT: pmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero ; SSE41-NEXT: pslld $31, %xmm1 @@ -1335,11 +1331,11 @@ define <32 x i32> @umulo_v32i8(<32 x i8> %a0, <32 x i8> %a1, <32 x i8>* %p2) nou ; SSE2-NEXT: pand %xmm8, %xmm7 ; SSE2-NEXT: movdqa %xmm3, %xmm6 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE2-NEXT: movdqa %xmm1, %xmm10 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm10 = xmm10[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE2-NEXT: pmullw %xmm6, %xmm10 -; SSE2-NEXT: pand %xmm8, %xmm10 -; SSE2-NEXT: packuswb %xmm7, %xmm10 +; SSE2-NEXT: movdqa %xmm1, %xmm11 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm11 = xmm11[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE2-NEXT: pmullw %xmm6, %xmm11 +; SSE2-NEXT: pand %xmm8, %xmm11 +; SSE2-NEXT: packuswb %xmm7, %xmm11 ; SSE2-NEXT: pxor %xmm6, %xmm6 ; SSE2-NEXT: movdqa %xmm3, %xmm7 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm6[8],xmm7[9],xmm6[9],xmm7[10],xmm6[10],xmm7[11],xmm6[11],xmm7[12],xmm6[12],xmm7[13],xmm6[13],xmm7[14],xmm6[14],xmm7[15],xmm6[15] @@ -1369,49 +1365,47 @@ define <32 x i32> @umulo_v32i8(<32 x i8> %a0, <32 x i8> %a1, <32 x i8>* %p2) nou ; SSE2-NEXT: pcmpeqb %xmm6, %xmm0 ; SSE2-NEXT: pxor %xmm3, %xmm0 ; SSE2-NEXT: movdqa %xmm0, %xmm2 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE2-NEXT: movdqa %xmm2, %xmm3 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3] -; SSE2-NEXT: pslld $31, %xmm3 -; SSE2-NEXT: psrad $31, %xmm3 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4,4,5,5,6,6,7,7] -; SSE2-NEXT: pslld $31, %xmm2 -; SSE2-NEXT: psrad $31, %xmm2 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; SSE2-NEXT: movdqa %xmm0, %xmm6 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0,0,1,1,2,2,3,3] -; SSE2-NEXT: pslld $31, %xmm6 -; SSE2-NEXT: psrad $31, %xmm6 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm0[0],xmm8[1],xmm0[1],xmm8[2],xmm0[2],xmm8[3],xmm0[3],xmm8[4],xmm0[4],xmm8[5],xmm0[5],xmm8[6],xmm0[6],xmm8[7],xmm0[7] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSE2-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7] ; SSE2-NEXT: pslld $31, %xmm0 ; SSE2-NEXT: psrad $31, %xmm0 -; SSE2-NEXT: movdqa %xmm1, %xmm4 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE2-NEXT: movdqa %xmm4, %xmm7 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0,0,1,1,2,2,3,3] -; SSE2-NEXT: pslld $31, %xmm7 -; SSE2-NEXT: psrad $31, %xmm7 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4,4,5,5,6,6,7,7] +; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; SSE2-NEXT: movdqa %xmm2, %xmm4 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0,0,1,1,2,2,3,3] ; SSE2-NEXT: pslld $31, %xmm4 ; SSE2-NEXT: psrad $31, %xmm4 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4,4,5,5,6,6,7,7] +; SSE2-NEXT: pslld $31, %xmm2 +; SSE2-NEXT: psrad $31, %xmm2 +; SSE2-NEXT: movdqa %xmm1, %xmm6 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE2-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4,4,5,5,6,6,7,7] +; SSE2-NEXT: pslld $31, %xmm6 +; SSE2-NEXT: psrad $31, %xmm6 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm10 = xmm10[0],xmm1[0],xmm10[1],xmm1[1],xmm10[2],xmm1[2],xmm10[3],xmm1[3],xmm10[4],xmm1[4],xmm10[5],xmm1[5],xmm10[6],xmm1[6],xmm10[7],xmm1[7] ; SSE2-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; SSE2-NEXT: movdqa %xmm1, %xmm5 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0,0,1,1,2,2,3,3] -; SSE2-NEXT: pslld $31, %xmm5 -; SSE2-NEXT: psrad $31, %xmm5 +; SSE2-NEXT: movdqa %xmm1, %xmm3 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3] +; SSE2-NEXT: pslld $31, %xmm3 +; SSE2-NEXT: psrad $31, %xmm3 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7] ; SSE2-NEXT: pslld $31, %xmm1 ; SSE2-NEXT: psrad $31, %xmm1 -; SSE2-NEXT: movdqa %xmm10, 16(%rsi) +; SSE2-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3] +; SSE2-NEXT: psrad $24, %xmm7 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm10[0],xmm5[1],xmm10[1],xmm5[2],xmm10[2],xmm5[3],xmm10[3] +; SSE2-NEXT: psrad $24, %xmm5 +; SSE2-NEXT: movdqa %xmm11, 16(%rsi) ; SSE2-NEXT: movdqa %xmm9, (%rsi) +; SSE2-NEXT: movdqa %xmm5, 64(%rdi) +; SSE2-NEXT: movdqa %xmm7, (%rdi) ; SSE2-NEXT: movdqa %xmm1, 112(%rdi) -; SSE2-NEXT: movdqa %xmm5, 96(%rdi) -; SSE2-NEXT: movdqa %xmm4, 80(%rdi) -; SSE2-NEXT: movdqa %xmm7, 64(%rdi) -; SSE2-NEXT: movdqa %xmm0, 48(%rdi) -; SSE2-NEXT: movdqa %xmm6, 32(%rdi) -; SSE2-NEXT: movdqa %xmm2, 16(%rdi) -; SSE2-NEXT: movdqa %xmm3, (%rdi) +; SSE2-NEXT: movdqa %xmm3, 96(%rdi) +; SSE2-NEXT: movdqa %xmm6, 80(%rdi) +; SSE2-NEXT: movdqa %xmm2, 48(%rdi) +; SSE2-NEXT: movdqa %xmm4, 32(%rdi) +; SSE2-NEXT: movdqa %xmm0, 16(%rdi) ; SSE2-NEXT: retq ; ; SSSE3-LABEL: umulo_v32i8: @@ -1439,11 +1433,11 @@ define <32 x i32> @umulo_v32i8(<32 x i8> %a0, <32 x i8> %a1, <32 x i8>* %p2) nou ; SSSE3-NEXT: pand %xmm8, %xmm7 ; SSSE3-NEXT: movdqa %xmm3, %xmm6 ; SSSE3-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSSE3-NEXT: movdqa %xmm1, %xmm10 -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm10 = xmm10[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSSE3-NEXT: pmullw %xmm6, %xmm10 -; SSSE3-NEXT: pand %xmm8, %xmm10 -; SSSE3-NEXT: packuswb %xmm7, %xmm10 +; SSSE3-NEXT: movdqa %xmm1, %xmm11 +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm11 = xmm11[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSSE3-NEXT: pmullw %xmm6, %xmm11 +; SSSE3-NEXT: pand %xmm8, %xmm11 +; SSSE3-NEXT: packuswb %xmm7, %xmm11 ; SSSE3-NEXT: pxor %xmm6, %xmm6 ; SSSE3-NEXT: movdqa %xmm3, %xmm7 ; SSSE3-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm6[8],xmm7[9],xmm6[9],xmm7[10],xmm6[10],xmm7[11],xmm6[11],xmm7[12],xmm6[12],xmm7[13],xmm6[13],xmm7[14],xmm6[14],xmm7[15],xmm6[15] @@ -1473,49 +1467,47 @@ define <32 x i32> @umulo_v32i8(<32 x i8> %a0, <32 x i8> %a1, <32 x i8>* %p2) nou ; SSSE3-NEXT: pcmpeqb %xmm6, %xmm0 ; SSSE3-NEXT: pxor %xmm3, %xmm0 ; SSSE3-NEXT: movdqa %xmm0, %xmm2 -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSSE3-NEXT: movdqa %xmm2, %xmm3 -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3] -; SSSE3-NEXT: pslld $31, %xmm3 -; SSSE3-NEXT: psrad $31, %xmm3 -; SSSE3-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4,4,5,5,6,6,7,7] -; SSSE3-NEXT: pslld $31, %xmm2 -; SSSE3-NEXT: psrad $31, %xmm2 -; SSSE3-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; SSSE3-NEXT: movdqa %xmm0, %xmm6 -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0,0,1,1,2,2,3,3] -; SSSE3-NEXT: pslld $31, %xmm6 -; SSSE3-NEXT: psrad $31, %xmm6 +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm0[0],xmm8[1],xmm0[1],xmm8[2],xmm0[2],xmm8[3],xmm0[3],xmm8[4],xmm0[4],xmm8[5],xmm0[5],xmm8[6],xmm0[6],xmm8[7],xmm0[7] +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSSE3-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7] ; SSSE3-NEXT: pslld $31, %xmm0 ; SSSE3-NEXT: psrad $31, %xmm0 -; SSSE3-NEXT: movdqa %xmm1, %xmm4 -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSSE3-NEXT: movdqa %xmm4, %xmm7 -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0,0,1,1,2,2,3,3] -; SSSE3-NEXT: pslld $31, %xmm7 -; SSSE3-NEXT: psrad $31, %xmm7 -; SSSE3-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4,4,5,5,6,6,7,7] +; SSSE3-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; SSSE3-NEXT: movdqa %xmm2, %xmm4 +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0,0,1,1,2,2,3,3] ; SSSE3-NEXT: pslld $31, %xmm4 ; SSSE3-NEXT: psrad $31, %xmm4 +; SSSE3-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4,4,5,5,6,6,7,7] +; SSSE3-NEXT: pslld $31, %xmm2 +; SSSE3-NEXT: psrad $31, %xmm2 +; SSSE3-NEXT: movdqa %xmm1, %xmm6 +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSSE3-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4,4,5,5,6,6,7,7] +; SSSE3-NEXT: pslld $31, %xmm6 +; SSSE3-NEXT: psrad $31, %xmm6 +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm10 = xmm10[0],xmm1[0],xmm10[1],xmm1[1],xmm10[2],xmm1[2],xmm10[3],xmm1[3],xmm10[4],xmm1[4],xmm10[5],xmm1[5],xmm10[6],xmm1[6],xmm10[7],xmm1[7] ; SSSE3-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; SSSE3-NEXT: movdqa %xmm1, %xmm5 -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0,0,1,1,2,2,3,3] -; SSSE3-NEXT: pslld $31, %xmm5 -; SSSE3-NEXT: psrad $31, %xmm5 +; SSSE3-NEXT: movdqa %xmm1, %xmm3 +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3] +; SSSE3-NEXT: pslld $31, %xmm3 +; SSSE3-NEXT: psrad $31, %xmm3 ; SSSE3-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7] ; SSSE3-NEXT: pslld $31, %xmm1 ; SSSE3-NEXT: psrad $31, %xmm1 -; SSSE3-NEXT: movdqa %xmm10, 16(%rsi) +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3] +; SSSE3-NEXT: psrad $24, %xmm7 +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm10[0],xmm5[1],xmm10[1],xmm5[2],xmm10[2],xmm5[3],xmm10[3] +; SSSE3-NEXT: psrad $24, %xmm5 +; SSSE3-NEXT: movdqa %xmm11, 16(%rsi) ; SSSE3-NEXT: movdqa %xmm9, (%rsi) +; SSSE3-NEXT: movdqa %xmm5, 64(%rdi) +; SSSE3-NEXT: movdqa %xmm7, (%rdi) ; SSSE3-NEXT: movdqa %xmm1, 112(%rdi) -; SSSE3-NEXT: movdqa %xmm5, 96(%rdi) -; SSSE3-NEXT: movdqa %xmm4, 80(%rdi) -; SSSE3-NEXT: movdqa %xmm7, 64(%rdi) -; SSSE3-NEXT: movdqa %xmm0, 48(%rdi) -; SSSE3-NEXT: movdqa %xmm6, 32(%rdi) -; SSSE3-NEXT: movdqa %xmm2, 16(%rdi) -; SSSE3-NEXT: movdqa %xmm3, (%rdi) +; SSSE3-NEXT: movdqa %xmm3, 96(%rdi) +; SSSE3-NEXT: movdqa %xmm6, 80(%rdi) +; SSSE3-NEXT: movdqa %xmm2, 48(%rdi) +; SSSE3-NEXT: movdqa %xmm4, 32(%rdi) +; SSSE3-NEXT: movdqa %xmm0, 16(%rdi) ; SSSE3-NEXT: retq ; ; SSE41-LABEL: umulo_v32i8: @@ -1526,42 +1518,42 @@ define <32 x i32> @umulo_v32i8(<32 x i8> %a0, <32 x i8> %a1, <32 x i8>* %p2) nou ; SSE41-NEXT: movdqa %xmm0, %xmm7 ; SSE41-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; SSE41-NEXT: pmullw %xmm4, %xmm7 -; SSE41-NEXT: movdqa {{.*#+}} xmm9 = [255,255,255,255,255,255,255,255] -; SSE41-NEXT: pand %xmm9, %xmm7 +; SSE41-NEXT: movdqa {{.*#+}} xmm10 = [255,255,255,255,255,255,255,255] +; SSE41-NEXT: pand %xmm10, %xmm7 ; SSE41-NEXT: pmovzxbw {{.*#+}} xmm5 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero ; SSE41-NEXT: pmovzxbw {{.*#+}} xmm6 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; SSE41-NEXT: pmullw %xmm5, %xmm6 ; SSE41-NEXT: movdqa %xmm6, %xmm8 -; SSE41-NEXT: pand %xmm9, %xmm8 +; SSE41-NEXT: pand %xmm10, %xmm8 ; SSE41-NEXT: packuswb %xmm7, %xmm8 ; SSE41-NEXT: movdqa %xmm3, %xmm7 ; SSE41-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; SSE41-NEXT: movdqa %xmm1, %xmm5 ; SSE41-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; SSE41-NEXT: pmullw %xmm7, %xmm5 -; SSE41-NEXT: pand %xmm9, %xmm5 -; SSE41-NEXT: pmovzxbw {{.*#+}} xmm4 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero +; SSE41-NEXT: pand %xmm10, %xmm5 +; SSE41-NEXT: pmovzxbw {{.*#+}} xmm9 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero ; SSE41-NEXT: pmovzxbw {{.*#+}} xmm7 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero -; SSE41-NEXT: pmullw %xmm4, %xmm7 -; SSE41-NEXT: pand %xmm7, %xmm9 -; SSE41-NEXT: packuswb %xmm5, %xmm9 -; SSE41-NEXT: pxor %xmm4, %xmm4 -; SSE41-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm4[8],xmm3[9],xmm4[9],xmm3[10],xmm4[10],xmm3[11],xmm4[11],xmm3[12],xmm4[12],xmm3[13],xmm4[13],xmm3[14],xmm4[14],xmm3[15],xmm4[15] -; SSE41-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm4[8],xmm1[9],xmm4[9],xmm1[10],xmm4[10],xmm1[11],xmm4[11],xmm1[12],xmm4[12],xmm1[13],xmm4[13],xmm1[14],xmm4[14],xmm1[15],xmm4[15] +; SSE41-NEXT: pmullw %xmm9, %xmm7 +; SSE41-NEXT: pand %xmm7, %xmm10 +; SSE41-NEXT: packuswb %xmm5, %xmm10 +; SSE41-NEXT: pxor %xmm5, %xmm5 +; SSE41-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm5[8],xmm3[9],xmm5[9],xmm3[10],xmm5[10],xmm3[11],xmm5[11],xmm3[12],xmm5[12],xmm3[13],xmm5[13],xmm3[14],xmm5[14],xmm3[15],xmm5[15] +; SSE41-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm5[8],xmm1[9],xmm5[9],xmm1[10],xmm5[10],xmm1[11],xmm5[11],xmm1[12],xmm5[12],xmm1[13],xmm5[13],xmm1[14],xmm5[14],xmm1[15],xmm5[15] ; SSE41-NEXT: pmullw %xmm3, %xmm1 ; SSE41-NEXT: psrlw $8, %xmm1 ; SSE41-NEXT: psrlw $8, %xmm7 ; SSE41-NEXT: packuswb %xmm1, %xmm7 -; SSE41-NEXT: pcmpeqb %xmm4, %xmm7 +; SSE41-NEXT: pcmpeqb %xmm5, %xmm7 ; SSE41-NEXT: pcmpeqd %xmm1, %xmm1 ; SSE41-NEXT: pxor %xmm1, %xmm7 -; SSE41-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm4[8],xmm2[9],xmm4[9],xmm2[10],xmm4[10],xmm2[11],xmm4[11],xmm2[12],xmm4[12],xmm2[13],xmm4[13],xmm2[14],xmm4[14],xmm2[15],xmm4[15] -; SSE41-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm4[8],xmm0[9],xmm4[9],xmm0[10],xmm4[10],xmm0[11],xmm4[11],xmm0[12],xmm4[12],xmm0[13],xmm4[13],xmm0[14],xmm4[14],xmm0[15],xmm4[15] +; SSE41-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm5[8],xmm2[9],xmm5[9],xmm2[10],xmm5[10],xmm2[11],xmm5[11],xmm2[12],xmm5[12],xmm2[13],xmm5[13],xmm2[14],xmm5[14],xmm2[15],xmm5[15] +; SSE41-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm5[8],xmm0[9],xmm5[9],xmm0[10],xmm5[10],xmm0[11],xmm5[11],xmm0[12],xmm5[12],xmm0[13],xmm5[13],xmm0[14],xmm5[14],xmm0[15],xmm5[15] ; SSE41-NEXT: pmullw %xmm2, %xmm0 ; SSE41-NEXT: psrlw $8, %xmm0 ; SSE41-NEXT: psrlw $8, %xmm6 ; SSE41-NEXT: packuswb %xmm0, %xmm6 -; SSE41-NEXT: pcmpeqb %xmm4, %xmm6 +; SSE41-NEXT: pcmpeqb %xmm5, %xmm6 ; SSE41-NEXT: pxor %xmm1, %xmm6 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[1,1,1,1] ; SSE41-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero @@ -1579,26 +1571,22 @@ define <32 x i32> @umulo_v32i8(<32 x i8> %a0, <32 x i8> %a1, <32 x i8>* %p2) nou ; SSE41-NEXT: pmovzxbd {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero ; SSE41-NEXT: pslld $31, %xmm3 ; SSE41-NEXT: psrad $31, %xmm3 -; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm7[2,3,2,3] -; SSE41-NEXT: pmovzxbd {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero -; SSE41-NEXT: pslld $31, %xmm4 -; SSE41-NEXT: psrad $31, %xmm4 -; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm7[3,3,3,3] +; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm7[2,3,2,3] ; SSE41-NEXT: pmovzxbd {{.*#+}} xmm5 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero,xmm5[2],zero,zero,zero,xmm5[3],zero,zero,zero ; SSE41-NEXT: pslld $31, %xmm5 ; SSE41-NEXT: psrad $31, %xmm5 -; SSE41-NEXT: pmovzxbd {{.*#+}} xmm6 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero,xmm6[2],zero,zero,zero,xmm6[3],zero,zero,zero -; SSE41-NEXT: pslld $31, %xmm6 -; SSE41-NEXT: psrad $31, %xmm6 -; SSE41-NEXT: pmovzxbd {{.*#+}} xmm7 = xmm7[0],zero,zero,zero,xmm7[1],zero,zero,zero,xmm7[2],zero,zero,zero,xmm7[3],zero,zero,zero -; SSE41-NEXT: pslld $31, %xmm7 -; SSE41-NEXT: psrad $31, %xmm7 -; SSE41-NEXT: movdqa %xmm9, 16(%rsi) +; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm7[3,3,3,3] +; SSE41-NEXT: pmovzxbd {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero +; SSE41-NEXT: pslld $31, %xmm4 +; SSE41-NEXT: psrad $31, %xmm4 +; SSE41-NEXT: pmovsxbd %xmm6, %xmm6 +; SSE41-NEXT: pmovsxbd %xmm7, %xmm7 +; SSE41-NEXT: movdqa %xmm10, 16(%rsi) ; SSE41-NEXT: movdqa %xmm8, (%rsi) ; SSE41-NEXT: movdqa %xmm7, 64(%rdi) ; SSE41-NEXT: movdqa %xmm6, (%rdi) -; SSE41-NEXT: movdqa %xmm5, 112(%rdi) -; SSE41-NEXT: movdqa %xmm4, 96(%rdi) +; SSE41-NEXT: movdqa %xmm4, 112(%rdi) +; SSE41-NEXT: movdqa %xmm5, 96(%rdi) ; SSE41-NEXT: movdqa %xmm3, 80(%rdi) ; SSE41-NEXT: movdqa %xmm2, 48(%rdi) ; SSE41-NEXT: movdqa %xmm1, 32(%rdi) @@ -1872,86 +1860,82 @@ define <64 x i32> @umulo_v64i8(<64 x i8> %a0, <64 x i8> %a1, <64 x i8>* %p2) nou ; SSE2-NEXT: movdqa %xmm10, 32(%rsi) ; SSE2-NEXT: movdqa %xmm9, 16(%rsi) ; SSE2-NEXT: movdqa %xmm3, %xmm4 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; SSE2-NEXT: movdqa %xmm8, (%rsi) -; SSE2-NEXT: movdqa %xmm3, %xmm5 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3] -; SSE2-NEXT: pslld $31, %xmm3 -; SSE2-NEXT: psrad $31, %xmm3 -; SSE2-NEXT: movdqa %xmm3, 224(%rdi) -; SSE2-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3],xmm5[4],xmm3[4],xmm5[5],xmm3[5],xmm5[6],xmm3[6],xmm5[7],xmm3[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0,0,1,1,2,2,3,3] +; SSE2-NEXT: psrad $24, %xmm5 +; SSE2-NEXT: movdqa %xmm5, 192(%rdi) +; SSE2-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3],xmm5[4],xmm2[4],xmm5[5],xmm2[5],xmm5[6],xmm2[6],xmm5[7],xmm2[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0,0,1,1,2,2,3,3] +; SSE2-NEXT: psrad $24, %xmm5 +; SSE2-NEXT: movdqa %xmm5, 128(%rdi) +; SSE2-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3],xmm5[4],xmm1[4],xmm5[5],xmm1[5],xmm5[6],xmm1[6],xmm5[7],xmm1[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0,0,1,1,2,2,3,3] +; SSE2-NEXT: psrad $24, %xmm5 +; SSE2-NEXT: movdqa %xmm5, 64(%rdi) +; SSE2-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1],xmm5[2],xmm0[2],xmm5[3],xmm0[3],xmm5[4],xmm0[4],xmm5[5],xmm0[5],xmm5[6],xmm0[6],xmm5[7],xmm0[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0,0,1,1,2,2,3,3] +; SSE2-NEXT: psrad $24, %xmm5 +; SSE2-NEXT: movdqa %xmm5, (%rdi) +; SSE2-NEXT: movdqa %xmm4, %xmm5 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0,0,1,1,2,2,3,3] +; SSE2-NEXT: pslld $31, %xmm4 +; SSE2-NEXT: psrad $31, %xmm4 +; SSE2-NEXT: movdqa %xmm4, 224(%rdi) ; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4,4,5,5,6,6,7,7] ; SSE2-NEXT: pslld $31, %xmm5 ; SSE2-NEXT: psrad $31, %xmm5 ; SSE2-NEXT: movdqa %xmm5, 240(%rdi) -; SSE2-NEXT: movdqa %xmm4, %xmm3 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0,0,1,1,2,2,3,3] -; SSE2-NEXT: pslld $31, %xmm4 -; SSE2-NEXT: psrad $31, %xmm4 -; SSE2-NEXT: movdqa %xmm4, 192(%rdi) ; SSE2-NEXT: movdqa %xmm2, %xmm4 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4,4,5,5,6,6,7,7] ; SSE2-NEXT: pslld $31, %xmm3 ; SSE2-NEXT: psrad $31, %xmm3 ; SSE2-NEXT: movdqa %xmm3, 208(%rdi) -; SSE2-NEXT: movdqa %xmm2, %xmm3 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3] -; SSE2-NEXT: pslld $31, %xmm2 -; SSE2-NEXT: psrad $31, %xmm2 -; SSE2-NEXT: movdqa %xmm2, 160(%rdi) -; SSE2-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE2-NEXT: movdqa %xmm4, %xmm3 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0,0,1,1,2,2,3,3] +; SSE2-NEXT: pslld $31, %xmm4 +; SSE2-NEXT: psrad $31, %xmm4 +; SSE2-NEXT: movdqa %xmm4, 160(%rdi) ; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4,4,5,5,6,6,7,7] ; SSE2-NEXT: pslld $31, %xmm3 ; SSE2-NEXT: psrad $31, %xmm3 ; SSE2-NEXT: movdqa %xmm3, 176(%rdi) -; SSE2-NEXT: movdqa %xmm4, %xmm2 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0,0,1,1,2,2,3,3] -; SSE2-NEXT: pslld $31, %xmm4 -; SSE2-NEXT: psrad $31, %xmm4 -; SSE2-NEXT: movdqa %xmm4, 128(%rdi) ; SSE2-NEXT: movdqa %xmm1, %xmm3 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4,4,5,5,6,6,7,7] ; SSE2-NEXT: pslld $31, %xmm2 ; SSE2-NEXT: psrad $31, %xmm2 ; SSE2-NEXT: movdqa %xmm2, 144(%rdi) -; SSE2-NEXT: movdqa %xmm1, %xmm2 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3] -; SSE2-NEXT: pslld $31, %xmm1 -; SSE2-NEXT: psrad $31, %xmm1 -; SSE2-NEXT: movdqa %xmm1, 96(%rdi) -; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE2-NEXT: movdqa %xmm3, %xmm2 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3] +; SSE2-NEXT: pslld $31, %xmm3 +; SSE2-NEXT: psrad $31, %xmm3 +; SSE2-NEXT: movdqa %xmm3, 96(%rdi) ; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4,4,5,5,6,6,7,7] ; SSE2-NEXT: pslld $31, %xmm2 ; SSE2-NEXT: psrad $31, %xmm2 ; SSE2-NEXT: movdqa %xmm2, 112(%rdi) -; SSE2-NEXT: movdqa %xmm3, %xmm1 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3] -; SSE2-NEXT: pslld $31, %xmm3 -; SSE2-NEXT: psrad $31, %xmm3 -; SSE2-NEXT: movdqa %xmm3, 64(%rdi) ; SSE2-NEXT: movdqa %xmm0, %xmm2 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7] ; SSE2-NEXT: pslld $31, %xmm1 ; SSE2-NEXT: psrad $31, %xmm1 ; SSE2-NEXT: movdqa %xmm1, 80(%rdi) -; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] -; SSE2-NEXT: pslld $31, %xmm0 -; SSE2-NEXT: psrad $31, %xmm0 -; SSE2-NEXT: movdqa %xmm0, 32(%rdi) -; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE2-NEXT: movdqa %xmm2, %xmm1 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3] +; SSE2-NEXT: pslld $31, %xmm2 +; SSE2-NEXT: psrad $31, %xmm2 +; SSE2-NEXT: movdqa %xmm2, 32(%rdi) ; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7] ; SSE2-NEXT: pslld $31, %xmm1 ; SSE2-NEXT: psrad $31, %xmm1 ; SSE2-NEXT: movdqa %xmm1, 48(%rdi) -; SSE2-NEXT: movdqa %xmm2, %xmm0 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3] -; SSE2-NEXT: pslld $31, %xmm2 -; SSE2-NEXT: psrad $31, %xmm2 -; SSE2-NEXT: movdqa %xmm2, (%rdi) +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSE2-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7] ; SSE2-NEXT: pslld $31, %xmm0 ; SSE2-NEXT: psrad $31, %xmm0 @@ -2072,86 +2056,82 @@ define <64 x i32> @umulo_v64i8(<64 x i8> %a0, <64 x i8> %a1, <64 x i8>* %p2) nou ; SSSE3-NEXT: movdqa %xmm10, 32(%rsi) ; SSSE3-NEXT: movdqa %xmm9, 16(%rsi) ; SSSE3-NEXT: movdqa %xmm3, %xmm4 -; SSSE3-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; SSSE3-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; SSSE3-NEXT: movdqa %xmm8, (%rsi) -; SSSE3-NEXT: movdqa %xmm3, %xmm5 -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3] -; SSSE3-NEXT: pslld $31, %xmm3 -; SSSE3-NEXT: psrad $31, %xmm3 -; SSSE3-NEXT: movdqa %xmm3, 224(%rdi) -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3],xmm5[4],xmm3[4],xmm5[5],xmm3[5],xmm5[6],xmm3[6],xmm5[7],xmm3[7] +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0,0,1,1,2,2,3,3] +; SSSE3-NEXT: psrad $24, %xmm5 +; SSSE3-NEXT: movdqa %xmm5, 192(%rdi) +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3],xmm5[4],xmm2[4],xmm5[5],xmm2[5],xmm5[6],xmm2[6],xmm5[7],xmm2[7] +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0,0,1,1,2,2,3,3] +; SSSE3-NEXT: psrad $24, %xmm5 +; SSSE3-NEXT: movdqa %xmm5, 128(%rdi) +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3],xmm5[4],xmm1[4],xmm5[5],xmm1[5],xmm5[6],xmm1[6],xmm5[7],xmm1[7] +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0,0,1,1,2,2,3,3] +; SSSE3-NEXT: psrad $24, %xmm5 +; SSSE3-NEXT: movdqa %xmm5, 64(%rdi) +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1],xmm5[2],xmm0[2],xmm5[3],xmm0[3],xmm5[4],xmm0[4],xmm5[5],xmm0[5],xmm5[6],xmm0[6],xmm5[7],xmm0[7] +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0,0,1,1,2,2,3,3] +; SSSE3-NEXT: psrad $24, %xmm5 +; SSSE3-NEXT: movdqa %xmm5, (%rdi) +; SSSE3-NEXT: movdqa %xmm4, %xmm5 +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0,0,1,1,2,2,3,3] +; SSSE3-NEXT: pslld $31, %xmm4 +; SSSE3-NEXT: psrad $31, %xmm4 +; SSSE3-NEXT: movdqa %xmm4, 224(%rdi) ; SSSE3-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4,4,5,5,6,6,7,7] ; SSSE3-NEXT: pslld $31, %xmm5 ; SSSE3-NEXT: psrad $31, %xmm5 ; SSSE3-NEXT: movdqa %xmm5, 240(%rdi) -; SSSE3-NEXT: movdqa %xmm4, %xmm3 -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0,0,1,1,2,2,3,3] -; SSSE3-NEXT: pslld $31, %xmm4 -; SSSE3-NEXT: psrad $31, %xmm4 -; SSSE3-NEXT: movdqa %xmm4, 192(%rdi) ; SSSE3-NEXT: movdqa %xmm2, %xmm4 -; SSSE3-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; SSSE3-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSSE3-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4,4,5,5,6,6,7,7] ; SSSE3-NEXT: pslld $31, %xmm3 ; SSSE3-NEXT: psrad $31, %xmm3 ; SSSE3-NEXT: movdqa %xmm3, 208(%rdi) -; SSSE3-NEXT: movdqa %xmm2, %xmm3 -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3] -; SSSE3-NEXT: pslld $31, %xmm2 -; SSSE3-NEXT: psrad $31, %xmm2 -; SSSE3-NEXT: movdqa %xmm2, 160(%rdi) -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSSE3-NEXT: movdqa %xmm4, %xmm3 +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0,0,1,1,2,2,3,3] +; SSSE3-NEXT: pslld $31, %xmm4 +; SSSE3-NEXT: psrad $31, %xmm4 +; SSSE3-NEXT: movdqa %xmm4, 160(%rdi) ; SSSE3-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4,4,5,5,6,6,7,7] ; SSSE3-NEXT: pslld $31, %xmm3 ; SSSE3-NEXT: psrad $31, %xmm3 ; SSSE3-NEXT: movdqa %xmm3, 176(%rdi) -; SSSE3-NEXT: movdqa %xmm4, %xmm2 -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0,0,1,1,2,2,3,3] -; SSSE3-NEXT: pslld $31, %xmm4 -; SSSE3-NEXT: psrad $31, %xmm4 -; SSSE3-NEXT: movdqa %xmm4, 128(%rdi) ; SSSE3-NEXT: movdqa %xmm1, %xmm3 -; SSSE3-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; SSSE3-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSSE3-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4,4,5,5,6,6,7,7] ; SSSE3-NEXT: pslld $31, %xmm2 ; SSSE3-NEXT: psrad $31, %xmm2 ; SSSE3-NEXT: movdqa %xmm2, 144(%rdi) -; SSSE3-NEXT: movdqa %xmm1, %xmm2 -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3] -; SSSE3-NEXT: pslld $31, %xmm1 -; SSSE3-NEXT: psrad $31, %xmm1 -; SSSE3-NEXT: movdqa %xmm1, 96(%rdi) -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSSE3-NEXT: movdqa %xmm3, %xmm2 +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3] +; SSSE3-NEXT: pslld $31, %xmm3 +; SSSE3-NEXT: psrad $31, %xmm3 +; SSSE3-NEXT: movdqa %xmm3, 96(%rdi) ; SSSE3-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4,4,5,5,6,6,7,7] ; SSSE3-NEXT: pslld $31, %xmm2 ; SSSE3-NEXT: psrad $31, %xmm2 ; SSSE3-NEXT: movdqa %xmm2, 112(%rdi) -; SSSE3-NEXT: movdqa %xmm3, %xmm1 -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3] -; SSSE3-NEXT: pslld $31, %xmm3 -; SSSE3-NEXT: psrad $31, %xmm3 -; SSSE3-NEXT: movdqa %xmm3, 64(%rdi) ; SSSE3-NEXT: movdqa %xmm0, %xmm2 -; SSSE3-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; SSSE3-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSSE3-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7] ; SSSE3-NEXT: pslld $31, %xmm1 ; SSSE3-NEXT: psrad $31, %xmm1 ; SSSE3-NEXT: movdqa %xmm1, 80(%rdi) -; SSSE3-NEXT: movdqa %xmm0, %xmm1 -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] -; SSSE3-NEXT: pslld $31, %xmm0 -; SSSE3-NEXT: psrad $31, %xmm0 -; SSSE3-NEXT: movdqa %xmm0, 32(%rdi) -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSSE3-NEXT: movdqa %xmm2, %xmm1 +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3] +; SSSE3-NEXT: pslld $31, %xmm2 +; SSSE3-NEXT: psrad $31, %xmm2 +; SSSE3-NEXT: movdqa %xmm2, 32(%rdi) ; SSSE3-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7] ; SSSE3-NEXT: pslld $31, %xmm1 ; SSSE3-NEXT: psrad $31, %xmm1 ; SSSE3-NEXT: movdqa %xmm1, 48(%rdi) -; SSSE3-NEXT: movdqa %xmm2, %xmm0 -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3] -; SSSE3-NEXT: pslld $31, %xmm2 -; SSSE3-NEXT: psrad $31, %xmm2 -; SSSE3-NEXT: movdqa %xmm2, (%rdi) +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSSE3-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7] ; SSSE3-NEXT: pslld $31, %xmm0 ; SSSE3-NEXT: psrad $31, %xmm0 @@ -2200,27 +2180,27 @@ define <64 x i32> @umulo_v64i8(<64 x i8> %a0, <64 x i8> %a1, <64 x i8>* %p2) nou ; SSE41-NEXT: pmovzxbw {{.*#+}} xmm14 = xmm6[0],zero,xmm6[1],zero,xmm6[2],zero,xmm6[3],zero,xmm6[4],zero,xmm6[5],zero,xmm6[6],zero,xmm6[7],zero ; SSE41-NEXT: pmovzxbw {{.*#+}} xmm12 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero ; SSE41-NEXT: pmullw %xmm14, %xmm12 -; SSE41-NEXT: movdqa %xmm12, %xmm15 -; SSE41-NEXT: pand %xmm10, %xmm15 -; SSE41-NEXT: packuswb %xmm0, %xmm15 +; SSE41-NEXT: movdqa %xmm12, %xmm14 +; SSE41-NEXT: pand %xmm10, %xmm14 +; SSE41-NEXT: packuswb %xmm0, %xmm14 ; SSE41-NEXT: movdqa %xmm7, %xmm0 ; SSE41-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; SSE41-NEXT: movdqa %xmm4, %xmm3 ; SSE41-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; SSE41-NEXT: pmullw %xmm0, %xmm3 ; SSE41-NEXT: pmovzxbw {{.*#+}} xmm0 = xmm7[0],zero,xmm7[1],zero,xmm7[2],zero,xmm7[3],zero,xmm7[4],zero,xmm7[5],zero,xmm7[6],zero,xmm7[7],zero -; SSE41-NEXT: pmovzxbw {{.*#+}} xmm14 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero -; SSE41-NEXT: pmullw %xmm0, %xmm14 +; SSE41-NEXT: pmovzxbw {{.*#+}} xmm15 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero +; SSE41-NEXT: pmullw %xmm0, %xmm15 ; SSE41-NEXT: pand %xmm10, %xmm3 -; SSE41-NEXT: pand %xmm14, %xmm10 +; SSE41-NEXT: pand %xmm15, %xmm10 ; SSE41-NEXT: packuswb %xmm3, %xmm10 ; SSE41-NEXT: pxor %xmm0, %xmm0 ; SSE41-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm0[8],xmm7[9],xmm0[9],xmm7[10],xmm0[10],xmm7[11],xmm0[11],xmm7[12],xmm0[12],xmm7[13],xmm0[13],xmm7[14],xmm0[14],xmm7[15],xmm0[15] ; SSE41-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm0[8],xmm4[9],xmm0[9],xmm4[10],xmm0[10],xmm4[11],xmm0[11],xmm4[12],xmm0[12],xmm4[13],xmm0[13],xmm4[14],xmm0[14],xmm4[15],xmm0[15] ; SSE41-NEXT: pmullw %xmm7, %xmm4 ; SSE41-NEXT: psrlw $8, %xmm4 -; SSE41-NEXT: psrlw $8, %xmm14 -; SSE41-NEXT: packuswb %xmm4, %xmm14 +; SSE41-NEXT: psrlw $8, %xmm15 +; SSE41-NEXT: packuswb %xmm4, %xmm15 ; SSE41-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm0[8],xmm6[9],xmm0[9],xmm6[10],xmm0[10],xmm6[11],xmm0[11],xmm6[12],xmm0[12],xmm6[13],xmm0[13],xmm6[14],xmm0[14],xmm6[15],xmm0[15] ; SSE41-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15] ; SSE41-NEXT: pmullw %xmm6, %xmm2 @@ -2239,48 +2219,40 @@ define <64 x i32> @umulo_v64i8(<64 x i8> %a0, <64 x i8> %a1, <64 x i8>* %p2) nou ; SSE41-NEXT: psrlw $8, %xmm5 ; SSE41-NEXT: psrlw $8, %xmm8 ; SSE41-NEXT: packuswb %xmm5, %xmm8 -; SSE41-NEXT: pcmpeqb %xmm0, %xmm14 +; SSE41-NEXT: pcmpeqb %xmm0, %xmm15 ; SSE41-NEXT: pcmpeqb %xmm0, %xmm12 ; SSE41-NEXT: pcmpeqb %xmm0, %xmm9 ; SSE41-NEXT: pcmpeqb %xmm0, %xmm8 ; SSE41-NEXT: pcmpeqd %xmm0, %xmm0 -; SSE41-NEXT: pxor %xmm0, %xmm14 +; SSE41-NEXT: pxor %xmm0, %xmm15 ; SSE41-NEXT: pxor %xmm0, %xmm12 ; SSE41-NEXT: pxor %xmm0, %xmm9 ; SSE41-NEXT: pxor %xmm0, %xmm8 ; SSE41-NEXT: movdqa %xmm10, 48(%rsi) -; SSE41-NEXT: movdqa %xmm15, 32(%rsi) +; SSE41-NEXT: movdqa %xmm14, 32(%rsi) ; SSE41-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE41-NEXT: movaps %xmm0, 16(%rsi) ; SSE41-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE41-NEXT: movaps %xmm0, (%rsi) -; SSE41-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm14[0],zero,zero,zero,xmm14[1],zero,zero,zero,xmm14[2],zero,zero,zero,xmm14[3],zero,zero,zero -; SSE41-NEXT: pslld $31, %xmm0 -; SSE41-NEXT: psrad $31, %xmm0 +; SSE41-NEXT: pmovsxbd %xmm15, %xmm0 ; SSE41-NEXT: movdqa %xmm0, 192(%rdi) -; SSE41-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm12[0],zero,zero,zero,xmm12[1],zero,zero,zero,xmm12[2],zero,zero,zero,xmm12[3],zero,zero,zero -; SSE41-NEXT: pslld $31, %xmm0 -; SSE41-NEXT: psrad $31, %xmm0 +; SSE41-NEXT: pmovsxbd %xmm12, %xmm0 ; SSE41-NEXT: movdqa %xmm0, 128(%rdi) -; SSE41-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm9[0],zero,zero,zero,xmm9[1],zero,zero,zero,xmm9[2],zero,zero,zero,xmm9[3],zero,zero,zero -; SSE41-NEXT: pslld $31, %xmm0 -; SSE41-NEXT: psrad $31, %xmm0 +; SSE41-NEXT: pmovsxbd %xmm9, %xmm0 ; SSE41-NEXT: movdqa %xmm0, 64(%rdi) -; SSE41-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm8[0],zero,zero,zero,xmm8[1],zero,zero,zero,xmm8[2],zero,zero,zero,xmm8[3],zero,zero,zero -; SSE41-NEXT: pslld $31, %xmm0 -; SSE41-NEXT: psrad $31, %xmm0 +; SSE41-NEXT: pmovsxbd %xmm8, %xmm0 ; SSE41-NEXT: movdqa %xmm0, (%rdi) -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm14[2,3,2,3] +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm15[2,3,2,3] ; SSE41-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero ; SSE41-NEXT: pslld $31, %xmm0 ; SSE41-NEXT: psrad $31, %xmm0 ; SSE41-NEXT: movdqa %xmm0, 224(%rdi) -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm14[3,3,3,3] +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm15[3,3,3,3] ; SSE41-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero ; SSE41-NEXT: pslld $31, %xmm0 ; SSE41-NEXT: psrad $31, %xmm0 ; SSE41-NEXT: movdqa %xmm0, 240(%rdi) -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm14[1,1,1,1] +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm15[1,1,1,1] ; SSE41-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero ; SSE41-NEXT: pslld $31, %xmm0 ; SSE41-NEXT: psrad $31, %xmm0 @@ -2655,10 +2627,8 @@ define <8 x i32> @umulo_v8i16(<8 x i16> %a0, <8 x i16> %a1, <8 x i16>* %p2) noun ; SSE2-NEXT: pcmpeqw %xmm0, %xmm3 ; SSE2-NEXT: pcmpeqd %xmm1, %xmm1 ; SSE2-NEXT: pxor %xmm3, %xmm1 -; SSE2-NEXT: movdqa %xmm1, %xmm0 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] -; SSE2-NEXT: pslld $31, %xmm0 -; SSE2-NEXT: psrad $31, %xmm0 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; SSE2-NEXT: psrad $16, %xmm0 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7] ; SSE2-NEXT: pslld $31, %xmm1 ; SSE2-NEXT: psrad $31, %xmm1 @@ -2674,10 +2644,8 @@ define <8 x i32> @umulo_v8i16(<8 x i16> %a0, <8 x i16> %a1, <8 x i16>* %p2) noun ; SSSE3-NEXT: pcmpeqw %xmm0, %xmm3 ; SSSE3-NEXT: pcmpeqd %xmm1, %xmm1 ; SSSE3-NEXT: pxor %xmm3, %xmm1 -; SSSE3-NEXT: movdqa %xmm1, %xmm0 -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] -; SSSE3-NEXT: pslld $31, %xmm0 -; SSSE3-NEXT: psrad $31, %xmm0 +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; SSSE3-NEXT: psrad $16, %xmm0 ; SSSE3-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7] ; SSSE3-NEXT: pslld $31, %xmm1 ; SSSE3-NEXT: psrad $31, %xmm1 @@ -2693,9 +2661,7 @@ define <8 x i32> @umulo_v8i16(<8 x i16> %a0, <8 x i16> %a1, <8 x i16>* %p2) noun ; SSE41-NEXT: pcmpeqw %xmm0, %xmm3 ; SSE41-NEXT: pcmpeqd %xmm1, %xmm1 ; SSE41-NEXT: pxor %xmm3, %xmm1 -; SSE41-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero -; SSE41-NEXT: pslld $31, %xmm0 -; SSE41-NEXT: psrad $31, %xmm0 +; SSE41-NEXT: pmovsxwd %xmm1, %xmm0 ; SSE41-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7] ; SSE41-NEXT: pslld $31, %xmm1 ; SSE41-NEXT: psrad $31, %xmm1 diff --git a/llvm/test/CodeGen/X86/vec_usubo.ll b/llvm/test/CodeGen/X86/vec_usubo.ll index dde6832..4e2906c 100644 --- a/llvm/test/CodeGen/X86/vec_usubo.ll +++ b/llvm/test/CodeGen/X86/vec_usubo.ll @@ -670,12 +670,11 @@ define <16 x i32> @usubo_v16i8(<16 x i8> %a0, <16 x i8> %a1, <16 x i8>* %p2) nou ; SSE2-NEXT: pcmpeqb %xmm4, %xmm0 ; SSE2-NEXT: pcmpeqd %xmm3, %xmm3 ; SSE2-NEXT: pxor %xmm0, %xmm3 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3],xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] +; SSE2-NEXT: psrad $24, %xmm0 ; SSE2-NEXT: movdqa %xmm3, %xmm1 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE2-NEXT: movdqa %xmm1, %xmm0 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] -; SSE2-NEXT: pslld $31, %xmm0 -; SSE2-NEXT: psrad $31, %xmm0 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7] ; SSE2-NEXT: pslld $31, %xmm1 ; SSE2-NEXT: psrad $31, %xmm1 @@ -698,12 +697,11 @@ define <16 x i32> @usubo_v16i8(<16 x i8> %a0, <16 x i8> %a1, <16 x i8>* %p2) nou ; SSSE3-NEXT: pcmpeqb %xmm4, %xmm0 ; SSSE3-NEXT: pcmpeqd %xmm3, %xmm3 ; SSSE3-NEXT: pxor %xmm0, %xmm3 +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3],xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7] +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] +; SSSE3-NEXT: psrad $24, %xmm0 ; SSSE3-NEXT: movdqa %xmm3, %xmm1 ; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSSE3-NEXT: movdqa %xmm1, %xmm0 -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] -; SSSE3-NEXT: pslld $31, %xmm0 -; SSSE3-NEXT: psrad $31, %xmm0 ; SSSE3-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7] ; SSSE3-NEXT: pslld $31, %xmm1 ; SSSE3-NEXT: psrad $31, %xmm1 @@ -726,9 +724,7 @@ define <16 x i32> @usubo_v16i8(<16 x i8> %a0, <16 x i8> %a1, <16 x i8>* %p2) nou ; SSE41-NEXT: pcmpeqb %xmm4, %xmm0 ; SSE41-NEXT: pcmpeqd %xmm3, %xmm3 ; SSE41-NEXT: pxor %xmm0, %xmm3 -; SSE41-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero -; SSE41-NEXT: pslld $31, %xmm0 -; SSE41-NEXT: psrad $31, %xmm0 +; SSE41-NEXT: pmovsxbd %xmm3, %xmm0 ; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,1,1,1] ; SSE41-NEXT: pmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero ; SSE41-NEXT: pslld $31, %xmm1 @@ -800,10 +796,8 @@ define <8 x i32> @usubo_v8i16(<8 x i16> %a0, <8 x i16> %a1, <8 x i16>* %p2) noun ; SSE2-NEXT: psubw %xmm1, %xmm0 ; SSE2-NEXT: pxor %xmm0, %xmm2 ; SSE2-NEXT: pcmpgtw %xmm3, %xmm2 -; SSE2-NEXT: movdqa %xmm2, %xmm1 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3] -; SSE2-NEXT: pslld $31, %xmm1 -; SSE2-NEXT: psrad $31, %xmm1 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; SSE2-NEXT: psrad $16, %xmm1 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4,4,5,5,6,6,7,7] ; SSE2-NEXT: pslld $31, %xmm2 ; SSE2-NEXT: psrad $31, %xmm2 @@ -820,10 +814,8 @@ define <8 x i32> @usubo_v8i16(<8 x i16> %a0, <8 x i16> %a1, <8 x i16>* %p2) noun ; SSSE3-NEXT: psubw %xmm1, %xmm0 ; SSSE3-NEXT: pxor %xmm0, %xmm2 ; SSSE3-NEXT: pcmpgtw %xmm3, %xmm2 -; SSSE3-NEXT: movdqa %xmm2, %xmm1 -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3] -; SSSE3-NEXT: pslld $31, %xmm1 -; SSSE3-NEXT: psrad $31, %xmm1 +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; SSSE3-NEXT: psrad $16, %xmm1 ; SSSE3-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4,4,5,5,6,6,7,7] ; SSSE3-NEXT: pslld $31, %xmm2 ; SSSE3-NEXT: psrad $31, %xmm2 @@ -840,9 +832,7 @@ define <8 x i32> @usubo_v8i16(<8 x i16> %a0, <8 x i16> %a1, <8 x i16>* %p2) noun ; SSE41-NEXT: pcmpeqw %xmm2, %xmm0 ; SSE41-NEXT: pcmpeqd %xmm1, %xmm1 ; SSE41-NEXT: pxor %xmm0, %xmm1 -; SSE41-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero -; SSE41-NEXT: pslld $31, %xmm0 -; SSE41-NEXT: psrad $31, %xmm0 +; SSE41-NEXT: pmovsxwd %xmm1, %xmm0 ; SSE41-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7] ; SSE41-NEXT: pslld $31, %xmm1 ; SSE41-NEXT: psrad $31, %xmm1 diff --git a/llvm/test/CodeGen/X86/vsplit-and.ll b/llvm/test/CodeGen/X86/vsplit-and.ll index aa043ed..e1a99d2 100644 --- a/llvm/test/CodeGen/X86/vsplit-and.ll +++ b/llvm/test/CodeGen/X86/vsplit-and.ll @@ -41,16 +41,13 @@ define void @t2(<3 x i64>* %dst, <3 x i64> %src1, <3 x i64> %src2) nounwind read ; CHECK-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm3[0,2] ; CHECK-NEXT: xorps %xmm0, %xmm1 ; CHECK-NEXT: andnps %xmm1, %xmm2 -; CHECK-NEXT: pmovzxdq {{.*#+}} xmm0 = xmm2[0],zero,xmm2[1],zero +; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,2,3,3] ; CHECK-NEXT: psllq $63, %xmm0 ; CHECK-NEXT: psrad $31, %xmm0 ; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] -; CHECK-NEXT: pshufd {{.*#+}} xmm1 = xmm2[2,2,3,3] -; CHECK-NEXT: psllq $63, %xmm1 -; CHECK-NEXT: psrad $31, %xmm1 -; CHECK-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; CHECK-NEXT: movq %xmm1, 16(%rdi) -; CHECK-NEXT: movdqa %xmm0, (%rdi) +; CHECK-NEXT: pmovsxdq %xmm2, %xmm1 +; CHECK-NEXT: movdqa %xmm1, (%rdi) +; CHECK-NEXT: movq %xmm0, 16(%rdi) ; CHECK-NEXT: retq %cmp1 = icmp ne <3 x i64> %src1, zeroinitializer %cmp2 = icmp ne <3 x i64> %src2, zeroinitializer -- 2.7.4