From e16901844d31d745868403ade45e43d1773c617a Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev@redking.me.uk>
Date: Sun, 4 Aug 2019 12:24:40 +0000
Subject: [PATCH] [X86] SimplifyMultipleUseDemandedBits - Add target shuffle
 support

llvm-svn: 367782
---
 llvm/lib/Target/X86/X86ISelLowering.cpp          |  46 +++++
 llvm/test/CodeGen/X86/vec_smulo.ll               | 230 ++++++++++++-----------
 llvm/test/CodeGen/X86/vector-reduce-mul-widen.ll |  39 ++--
 3 files changed, 177 insertions(+), 138 deletions(-)

diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 634c27b..717abe7 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -34706,7 +34706,10 @@ bool X86TargetLowering::SimplifyDemandedBitsForTargetNode(
 SDValue X86TargetLowering::SimplifyMultipleUseDemandedBitsForTargetNode(
     SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts,
     SelectionDAG &DAG, unsigned Depth) const {
+  int NumElts = DemandedElts.getBitWidth();
   unsigned Opc = Op.getOpcode();
+  EVT VT = Op.getValueType();
+
   switch (Opc) {
   case X86ISD::PINSRB:
   case X86ISD::PINSRW: {
@@ -34721,6 +34724,49 @@ SDValue X86TargetLowering::SimplifyMultipleUseDemandedBitsForTargetNode(
   }
   }
 
+  SmallVector<int, 16> ShuffleMask;
+  SmallVector<SDValue, 2> ShuffleOps;
+  if (VT.isSimple() && VT.isVector() &&
+      resolveTargetShuffleInputs(Op, ShuffleOps, ShuffleMask, DAG, Depth)) {
+    // If all the demanded elts are from one operand and are inline,
+    // then we can use the operand directly.
+    int NumOps = ShuffleOps.size();
+    if (ShuffleMask.size() == (unsigned)NumElts &&
+        llvm::all_of(ShuffleOps, [VT](SDValue V) {
+          return VT.getSizeInBits() == V.getValueSizeInBits();
+        })) {
+
+      // Bitmask that indicates which ops have only been accessed 'inline'.
+      APInt IdentityOp = APInt::getAllOnesValue(NumOps);
+      bool AllUndef = true;
+
+      for (int i = 0; i != NumElts; ++i) {
+        int M = ShuffleMask[i];
+        if (SM_SentinelUndef == M || !DemandedElts[i])
+          continue;
+        AllUndef = false;
+        int Op = M / NumElts;
+        int Index = M % NumElts;
+        if (M < 0 || Index != i) {
+          IdentityOp.clearAllBits();
+          break;
+        }
+        IdentityOp &= APInt::getOneBitSet(NumOps, Op);
+        if (IdentityOp == 0)
+          break;
+      }
+      assert((IdentityOp == 0 || IdentityOp.countPopulation() == 1) &&
+             "Multiple identity shuffles detected");
+
+      if (AllUndef)
+        return DAG.getUNDEF(VT);
+
+      for (int i = 0; i != NumOps; ++i)
+        if (IdentityOp[i])
+          return DAG.getBitcast(VT, ShuffleOps[i]);
+    }
+  }
+
   return TargetLowering::SimplifyMultipleUseDemandedBitsForTargetNode(
       Op, DemandedBits, DemandedElts, DAG, Depth);
 }
diff --git a/llvm/test/CodeGen/X86/vec_smulo.ll b/llvm/test/CodeGen/X86/vec_smulo.ll
index a933fd8..b450446 100644
--- a/llvm/test/CodeGen/X86/vec_smulo.ll
+++ b/llvm/test/CodeGen/X86/vec_smulo.ll
@@ -562,143 +562,145 @@ define <6 x i32> @smulo_v6i32(<6 x i32> %a0, <6 x i32> %a1, <6 x i32>* %p2) noun
 ; SSE2-LABEL: smulo_v6i32:
 ; SSE2:       # %bb.0:
 ; SSE2-NEXT:    movq %rdi, %rax
-; SSE2-NEXT:    movd %r8d, %xmm9
+; SSE2-NEXT:    movd %r8d, %xmm8
 ; SSE2-NEXT:    movd %ecx, %xmm0
-; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1]
-; SSE2-NEXT:    movd %edx, %xmm6
-; SSE2-NEXT:    movd %esi, %xmm5
-; SSE2-NEXT:    punpckldq {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1]
-; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm0[0]
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1]
+; SSE2-NEXT:    movd %edx, %xmm3
+; SSE2-NEXT:    movd %esi, %xmm6
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm6 = xmm6[0],xmm3[0],xmm6[1],xmm3[1]
+; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm6 = xmm6[0],xmm0[0]
 ; SSE2-NEXT:    movd {{.*#+}} xmm10 = mem[0],zero,zero,zero
 ; SSE2-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm10[0],xmm0[1],xmm10[1]
-; SSE2-NEXT:    movd {{.*#+}} xmm7 = mem[0],zero,zero,zero
-; SSE2-NEXT:    movd {{.*#+}} xmm3 = mem[0],zero,zero,zero
-; SSE2-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm7[0],xmm3[1],xmm7[1]
-; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm0[0]
-; SSE2-NEXT:    movd %r9d, %xmm12
-; SSE2-NEXT:    movd {{.*#+}} xmm11 = mem[0],zero,zero,zero
-; SSE2-NEXT:    punpckldq {{.*#+}} xmm12 = xmm12[0],xmm11[0],xmm12[1],xmm11[1]
-; SSE2-NEXT:    movd {{.*#+}} xmm8 = mem[0],zero,zero,zero
+; SSE2-NEXT:    movd {{.*#+}} xmm12 = mem[0],zero,zero,zero
+; SSE2-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm12[0],xmm1[1],xmm12[1]
+; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
+; SSE2-NEXT:    movd %r9d, %xmm13
 ; SSE2-NEXT:    movd {{.*#+}} xmm2 = mem[0],zero,zero,zero
-; SSE2-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm8[0],xmm2[1],xmm8[1]
+; SSE2-NEXT:    movdqa %xmm13, %xmm11
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm11 = xmm11[0],xmm2[0],xmm11[1],xmm2[1]
+; SSE2-NEXT:    movd {{.*#+}} xmm9 = mem[0],zero,zero,zero
+; SSE2-NEXT:    movd {{.*#+}} xmm7 = mem[0],zero,zero,zero
+; SSE2-NEXT:    pmuludq %xmm7, %xmm13
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm7 = xmm7[0],xmm9[0],xmm7[1],xmm9[1]
 ; SSE2-NEXT:    movq {{[0-9]+}}(%rsp), %rcx
 ; SSE2-NEXT:    pxor %xmm4, %xmm4
+; SSE2-NEXT:    pxor %xmm5, %xmm5
+; SSE2-NEXT:    pcmpgtd %xmm1, %xmm5
+; SSE2-NEXT:    pand %xmm6, %xmm5
 ; SSE2-NEXT:    pxor %xmm0, %xmm0
-; SSE2-NEXT:    pcmpgtd %xmm3, %xmm0
-; SSE2-NEXT:    pand %xmm5, %xmm0
-; SSE2-NEXT:    pxor %xmm1, %xmm1
-; SSE2-NEXT:    pcmpgtd %xmm5, %xmm1
-; SSE2-NEXT:    pand %xmm3, %xmm1
-; SSE2-NEXT:    paddd %xmm0, %xmm1
-; SSE2-NEXT:    pmuludq %xmm5, %xmm3
-; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm3[1,3,2,3]
-; SSE2-NEXT:    shufps {{.*#+}} xmm7 = xmm7[0,0],xmm10[0,0]
-; SSE2-NEXT:    shufps {{.*#+}} xmm6 = xmm6[0,0],xmm9[0,0]
-; SSE2-NEXT:    pmuludq %xmm7, %xmm6
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm6[1,3,2,3]
-; SSE2-NEXT:    punpckldq {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1]
-; SSE2-NEXT:    psubd %xmm1, %xmm5
-; SSE2-NEXT:    pshufd {{.*#+}} xmm7 = xmm3[0,2,2,3]
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm6[0,2,2,3]
-; SSE2-NEXT:    punpckldq {{.*#+}} xmm7 = xmm7[0],xmm0[0],xmm7[1],xmm0[1]
-; SSE2-NEXT:    movdqa %xmm7, (%rcx)
-; SSE2-NEXT:    psrad $31, %xmm7
-; SSE2-NEXT:    pcmpeqd %xmm5, %xmm7
-; SSE2-NEXT:    pcmpeqd %xmm0, %xmm0
-; SSE2-NEXT:    pxor %xmm0, %xmm7
-; SSE2-NEXT:    pxor %xmm1, %xmm1
-; SSE2-NEXT:    pcmpgtd %xmm2, %xmm1
-; SSE2-NEXT:    pand %xmm12, %xmm1
-; SSE2-NEXT:    pcmpgtd %xmm12, %xmm4
-; SSE2-NEXT:    pand %xmm2, %xmm4
-; SSE2-NEXT:    paddd %xmm1, %xmm4
-; SSE2-NEXT:    pmuludq %xmm12, %xmm2
-; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[1,3,2,3]
-; SSE2-NEXT:    pmuludq %xmm8, %xmm11
-; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm11[1,3,2,3]
-; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
-; SSE2-NEXT:    psubd %xmm4, %xmm1
+; SSE2-NEXT:    pcmpgtd %xmm6, %xmm0
+; SSE2-NEXT:    pand %xmm1, %xmm0
+; SSE2-NEXT:    paddd %xmm5, %xmm0
+; SSE2-NEXT:    pmuludq %xmm6, %xmm1
+; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm1[1,3,2,3]
+; SSE2-NEXT:    shufps {{.*#+}} xmm12 = xmm12[0,0],xmm10[0,0]
+; SSE2-NEXT:    shufps {{.*#+}} xmm3 = xmm3[0,0],xmm8[0,0]
+; SSE2-NEXT:    pmuludq %xmm12, %xmm3
+; SSE2-NEXT:    pshufd {{.*#+}} xmm6 = xmm3[1,3,2,3]
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1]
+; SSE2-NEXT:    psubd %xmm0, %xmm5
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[0,2,2,3]
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE2-NEXT:    movdqa %xmm0, (%rcx)
+; SSE2-NEXT:    psrad $31, %xmm0
+; SSE2-NEXT:    pcmpeqd %xmm5, %xmm0
+; SSE2-NEXT:    pcmpeqd %xmm1, %xmm1
+; SSE2-NEXT:    pxor %xmm1, %xmm0
+; SSE2-NEXT:    pxor %xmm3, %xmm3
+; SSE2-NEXT:    pcmpgtd %xmm7, %xmm3
+; SSE2-NEXT:    pand %xmm11, %xmm3
+; SSE2-NEXT:    pcmpgtd %xmm11, %xmm4
+; SSE2-NEXT:    pand %xmm7, %xmm4
+; SSE2-NEXT:    paddd %xmm3, %xmm4
+; SSE2-NEXT:    pmuludq %xmm9, %xmm2
+; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm2[1,3,2,3]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm13[1,3,2,3]
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1]
+; SSE2-NEXT:    psubd %xmm4, %xmm5
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
-; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm11[0,2,2,3]
-; SSE2-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
-; SSE2-NEXT:    movq %xmm2, 16(%rcx)
-; SSE2-NEXT:    psrad $31, %xmm2
-; SSE2-NEXT:    pcmpeqd %xmm1, %xmm2
-; SSE2-NEXT:    pxor %xmm0, %xmm2
-; SSE2-NEXT:    movq %xmm2, 16(%rdi)
-; SSE2-NEXT:    movdqa %xmm7, (%rdi)
+; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm13[0,2,2,3]
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
+; SSE2-NEXT:    movq %xmm3, 16(%rcx)
+; SSE2-NEXT:    psrad $31, %xmm3
+; SSE2-NEXT:    pcmpeqd %xmm5, %xmm3
+; SSE2-NEXT:    pxor %xmm1, %xmm3
+; SSE2-NEXT:    movq %xmm3, 16(%rdi)
+; SSE2-NEXT:    movdqa %xmm0, (%rdi)
 ; SSE2-NEXT:    retq
 ;
 ; SSSE3-LABEL: smulo_v6i32:
 ; SSSE3:       # %bb.0:
 ; SSSE3-NEXT:    movq %rdi, %rax
-; SSSE3-NEXT:    movd %r8d, %xmm9
+; SSSE3-NEXT:    movd %r8d, %xmm8
 ; SSSE3-NEXT:    movd %ecx, %xmm0
-; SSSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1]
-; SSSE3-NEXT:    movd %edx, %xmm6
-; SSSE3-NEXT:    movd %esi, %xmm5
-; SSSE3-NEXT:    punpckldq {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1]
-; SSSE3-NEXT:    punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm0[0]
+; SSSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1]
+; SSSE3-NEXT:    movd %edx, %xmm3
+; SSSE3-NEXT:    movd %esi, %xmm6
+; SSSE3-NEXT:    punpckldq {{.*#+}} xmm6 = xmm6[0],xmm3[0],xmm6[1],xmm3[1]
+; SSSE3-NEXT:    punpcklqdq {{.*#+}} xmm6 = xmm6[0],xmm0[0]
 ; SSSE3-NEXT:    movd {{.*#+}} xmm10 = mem[0],zero,zero,zero
 ; SSSE3-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; SSSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm10[0],xmm0[1],xmm10[1]
-; SSSE3-NEXT:    movd {{.*#+}} xmm7 = mem[0],zero,zero,zero
-; SSSE3-NEXT:    movd {{.*#+}} xmm3 = mem[0],zero,zero,zero
-; SSSE3-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm7[0],xmm3[1],xmm7[1]
-; SSSE3-NEXT:    punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm0[0]
-; SSSE3-NEXT:    movd %r9d, %xmm12
-; SSSE3-NEXT:    movd {{.*#+}} xmm11 = mem[0],zero,zero,zero
-; SSSE3-NEXT:    punpckldq {{.*#+}} xmm12 = xmm12[0],xmm11[0],xmm12[1],xmm11[1]
-; SSSE3-NEXT:    movd {{.*#+}} xmm8 = mem[0],zero,zero,zero
+; SSSE3-NEXT:    movd {{.*#+}} xmm12 = mem[0],zero,zero,zero
+; SSSE3-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; SSSE3-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm12[0],xmm1[1],xmm12[1]
+; SSSE3-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
+; SSSE3-NEXT:    movd %r9d, %xmm13
 ; SSSE3-NEXT:    movd {{.*#+}} xmm2 = mem[0],zero,zero,zero
-; SSSE3-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm8[0],xmm2[1],xmm8[1]
+; SSSE3-NEXT:    movdqa %xmm13, %xmm11
+; SSSE3-NEXT:    punpckldq {{.*#+}} xmm11 = xmm11[0],xmm2[0],xmm11[1],xmm2[1]
+; SSSE3-NEXT:    movd {{.*#+}} xmm9 = mem[0],zero,zero,zero
+; SSSE3-NEXT:    movd {{.*#+}} xmm7 = mem[0],zero,zero,zero
+; SSSE3-NEXT:    pmuludq %xmm7, %xmm13
+; SSSE3-NEXT:    punpckldq {{.*#+}} xmm7 = xmm7[0],xmm9[0],xmm7[1],xmm9[1]
 ; SSSE3-NEXT:    movq {{[0-9]+}}(%rsp), %rcx
 ; SSSE3-NEXT:    pxor %xmm4, %xmm4
+; SSSE3-NEXT:    pxor %xmm5, %xmm5
+; SSSE3-NEXT:    pcmpgtd %xmm1, %xmm5
+; SSSE3-NEXT:    pand %xmm6, %xmm5
 ; SSSE3-NEXT:    pxor %xmm0, %xmm0
-; SSSE3-NEXT:    pcmpgtd %xmm3, %xmm0
-; SSSE3-NEXT:    pand %xmm5, %xmm0
-; SSSE3-NEXT:    pxor %xmm1, %xmm1
-; SSSE3-NEXT:    pcmpgtd %xmm5, %xmm1
-; SSSE3-NEXT:    pand %xmm3, %xmm1
-; SSSE3-NEXT:    paddd %xmm0, %xmm1
-; SSSE3-NEXT:    pmuludq %xmm5, %xmm3
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm5 = xmm3[1,3,2,3]
-; SSSE3-NEXT:    shufps {{.*#+}} xmm7 = xmm7[0,0],xmm10[0,0]
-; SSSE3-NEXT:    shufps {{.*#+}} xmm6 = xmm6[0,0],xmm9[0,0]
-; SSSE3-NEXT:    pmuludq %xmm7, %xmm6
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm6[1,3,2,3]
-; SSSE3-NEXT:    punpckldq {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1]
-; SSSE3-NEXT:    psubd %xmm1, %xmm5
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm7 = xmm3[0,2,2,3]
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm6[0,2,2,3]
-; SSSE3-NEXT:    punpckldq {{.*#+}} xmm7 = xmm7[0],xmm0[0],xmm7[1],xmm0[1]
-; SSSE3-NEXT:    movdqa %xmm7, (%rcx)
-; SSSE3-NEXT:    psrad $31, %xmm7
-; SSSE3-NEXT:    pcmpeqd %xmm5, %xmm7
-; SSSE3-NEXT:    pcmpeqd %xmm0, %xmm0
-; SSSE3-NEXT:    pxor %xmm0, %xmm7
-; SSSE3-NEXT:    pxor %xmm1, %xmm1
-; SSSE3-NEXT:    pcmpgtd %xmm2, %xmm1
-; SSSE3-NEXT:    pand %xmm12, %xmm1
-; SSSE3-NEXT:    pcmpgtd %xmm12, %xmm4
-; SSSE3-NEXT:    pand %xmm2, %xmm4
-; SSSE3-NEXT:    paddd %xmm1, %xmm4
-; SSSE3-NEXT:    pmuludq %xmm12, %xmm2
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[1,3,2,3]
-; SSSE3-NEXT:    pmuludq %xmm8, %xmm11
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm3 = xmm11[1,3,2,3]
-; SSSE3-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
-; SSSE3-NEXT:    psubd %xmm4, %xmm1
+; SSSE3-NEXT:    pcmpgtd %xmm6, %xmm0
+; SSSE3-NEXT:    pand %xmm1, %xmm0
+; SSSE3-NEXT:    paddd %xmm5, %xmm0
+; SSSE3-NEXT:    pmuludq %xmm6, %xmm1
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm5 = xmm1[1,3,2,3]
+; SSSE3-NEXT:    shufps {{.*#+}} xmm12 = xmm12[0,0],xmm10[0,0]
+; SSSE3-NEXT:    shufps {{.*#+}} xmm3 = xmm3[0,0],xmm8[0,0]
+; SSSE3-NEXT:    pmuludq %xmm12, %xmm3
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm6 = xmm3[1,3,2,3]
+; SSSE3-NEXT:    punpckldq {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1]
+; SSSE3-NEXT:    psubd %xmm0, %xmm5
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3]
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[0,2,2,3]
+; SSSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSSE3-NEXT:    movdqa %xmm0, (%rcx)
+; SSSE3-NEXT:    psrad $31, %xmm0
+; SSSE3-NEXT:    pcmpeqd %xmm5, %xmm0
+; SSSE3-NEXT:    pcmpeqd %xmm1, %xmm1
+; SSSE3-NEXT:    pxor %xmm1, %xmm0
+; SSSE3-NEXT:    pxor %xmm3, %xmm3
+; SSSE3-NEXT:    pcmpgtd %xmm7, %xmm3
+; SSSE3-NEXT:    pand %xmm11, %xmm3
+; SSSE3-NEXT:    pcmpgtd %xmm11, %xmm4
+; SSSE3-NEXT:    pand %xmm7, %xmm4
+; SSSE3-NEXT:    paddd %xmm3, %xmm4
+; SSSE3-NEXT:    pmuludq %xmm9, %xmm2
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm3 = xmm2[1,3,2,3]
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm5 = xmm13[1,3,2,3]
+; SSSE3-NEXT:    punpckldq {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1]
+; SSSE3-NEXT:    psubd %xmm4, %xmm5
 ; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm3 = xmm11[0,2,2,3]
-; SSSE3-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
-; SSSE3-NEXT:    movq %xmm2, 16(%rcx)
-; SSSE3-NEXT:    psrad $31, %xmm2
-; SSSE3-NEXT:    pcmpeqd %xmm1, %xmm2
-; SSSE3-NEXT:    pxor %xmm0, %xmm2
-; SSSE3-NEXT:    movq %xmm2, 16(%rdi)
-; SSSE3-NEXT:    movdqa %xmm7, (%rdi)
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm3 = xmm13[0,2,2,3]
+; SSSE3-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
+; SSSE3-NEXT:    movq %xmm3, 16(%rcx)
+; SSSE3-NEXT:    psrad $31, %xmm3
+; SSSE3-NEXT:    pcmpeqd %xmm5, %xmm3
+; SSSE3-NEXT:    pxor %xmm1, %xmm3
+; SSSE3-NEXT:    movq %xmm3, 16(%rdi)
+; SSSE3-NEXT:    movdqa %xmm0, (%rdi)
 ; SSSE3-NEXT:    retq
 ;
 ; SSE41-LABEL: smulo_v6i32:
diff --git a/llvm/test/CodeGen/X86/vector-reduce-mul-widen.ll b/llvm/test/CodeGen/X86/vector-reduce-mul-widen.ll
index ce73ad5..a5c060e 100644
--- a/llvm/test/CodeGen/X86/vector-reduce-mul-widen.ll
+++ b/llvm/test/CodeGen/X86/vector-reduce-mul-widen.ll
@@ -1589,9 +1589,8 @@ define i8 @test_v4i8(<4 x i8> %a0) {
 ; SSE41-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
 ; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
 ; SSE41-NEXT:    pmullw %xmm1, %xmm0
-; SSE41-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,u,u,u,u,u,u,u,u,u,u,u,u]
 ; SSE41-NEXT:    movdqa %xmm0, %xmm1
-; SSE41-NEXT:    psrlw $8, %xmm1
+; SSE41-NEXT:    pshufb {{.*#+}} xmm1 = xmm1[2],zero,xmm1[6],zero,xmm1[u],zero,xmm1[u],zero,xmm1[u],zero,xmm1[u],zero,xmm1[u],zero,xmm1[u],zero
 ; SSE41-NEXT:    pmullw %xmm0, %xmm1
 ; SSE41-NEXT:    pextrb $0, %xmm1, %eax
 ; SSE41-NEXT:    # kill: def $al killed $al killed $eax
@@ -1603,8 +1602,7 @@ define i8 @test_v4i8(<4 x i8> %a0) {
 ; AVX-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
 ; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
 ; AVX-NEXT:    vpmullw %xmm0, %xmm1, %xmm0
-; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX-NEXT:    vpsrlw $8, %xmm0, %xmm1
+; AVX-NEXT:    vpshufb {{.*#+}} xmm1 = xmm0[2],zero,xmm0[6],zero,xmm0[u],zero,xmm0[u],zero,xmm0[u],zero,xmm0[u],zero,xmm0[u],zero,xmm0[u],zero
 ; AVX-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
 ; AVX-NEXT:    vpextrb $0, %xmm0, %eax
 ; AVX-NEXT:    # kill: def $al killed $al killed $eax
@@ -1616,8 +1614,7 @@ define i8 @test_v4i8(<4 x i8> %a0) {
 ; AVX512-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
 ; AVX512-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
 ; AVX512-NEXT:    vpmullw %xmm0, %xmm1, %xmm0
-; AVX512-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512-NEXT:    vpsrlw $8, %xmm0, %xmm1
+; AVX512-NEXT:    vpshufb {{.*#+}} xmm1 = xmm0[2],zero,xmm0[6],zero,xmm0[u],zero,xmm0[u],zero,xmm0[u],zero,xmm0[u],zero,xmm0[u],zero,xmm0[u],zero
 ; AVX512-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT:    vpextrb $0, %xmm0, %eax
 ; AVX512-NEXT:    # kill: def $al killed $al killed $eax
@@ -1650,15 +1647,13 @@ define i8 @test_v8i8(<8 x i8> %a0) {
 ; SSE41-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
 ; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
 ; SSE41-NEXT:    pmullw %xmm1, %xmm0
-; SSE41-NEXT:    movdqa {{.*#+}} xmm1 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
-; SSE41-NEXT:    pshufb %xmm1, %xmm0
-; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; SSE41-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
+; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
 ; SSE41-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
 ; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
-; SSE41-NEXT:    pmullw %xmm2, %xmm0
-; SSE41-NEXT:    pshufb %xmm1, %xmm0
+; SSE41-NEXT:    pmullw %xmm1, %xmm0
 ; SSE41-NEXT:    movdqa %xmm0, %xmm1
-; SSE41-NEXT:    psrlw $8, %xmm1
+; SSE41-NEXT:    pshufb {{.*#+}} xmm1 = xmm1[2],zero,xmm1[6],zero,xmm1[10],zero,xmm1[14],zero,xmm1[u],zero,xmm1[u],zero,xmm1[u],zero,xmm1[u],zero
 ; SSE41-NEXT:    pmullw %xmm0, %xmm1
 ; SSE41-NEXT:    pextrb $0, %xmm1, %eax
 ; SSE41-NEXT:    # kill: def $al killed $al killed $eax
@@ -1670,14 +1665,12 @@ define i8 @test_v8i8(<8 x i8> %a0) {
 ; AVX-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
 ; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
 ; AVX-NEXT:    vpmullw %xmm0, %xmm1, %xmm0
-; AVX-NEXT:    vmovdqa {{.*#+}} xmm1 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
-; AVX-NEXT:    vpshufb %xmm1, %xmm0, %xmm0
-; AVX-NEXT:    vpmovzxbw {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
+; AVX-NEXT:    vpmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
 ; AVX-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
 ; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
-; AVX-NEXT:    vpmullw %xmm0, %xmm2, %xmm0
-; AVX-NEXT:    vpshufb %xmm1, %xmm0, %xmm0
-; AVX-NEXT:    vpsrlw $8, %xmm0, %xmm1
+; AVX-NEXT:    vpmullw %xmm0, %xmm1, %xmm0
+; AVX-NEXT:    vpshufb {{.*#+}} xmm1 = xmm0[2],zero,xmm0[6],zero,xmm0[10],zero,xmm0[14],zero,xmm0[u],zero,xmm0[u],zero,xmm0[u],zero,xmm0[u],zero
 ; AVX-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
 ; AVX-NEXT:    vpextrb $0, %xmm0, %eax
 ; AVX-NEXT:    # kill: def $al killed $al killed $eax
@@ -1689,14 +1682,12 @@ define i8 @test_v8i8(<8 x i8> %a0) {
 ; AVX512-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
 ; AVX512-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
 ; AVX512-NEXT:    vpmullw %xmm0, %xmm1, %xmm0
-; AVX512-NEXT:    vmovdqa {{.*#+}} xmm1 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
-; AVX512-NEXT:    vpshufb %xmm1, %xmm0, %xmm0
-; AVX512-NEXT:    vpmovzxbw {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX512-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
+; AVX512-NEXT:    vpmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
 ; AVX512-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
 ; AVX512-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
-; AVX512-NEXT:    vpmullw %xmm0, %xmm2, %xmm0
-; AVX512-NEXT:    vpshufb %xmm1, %xmm0, %xmm0
-; AVX512-NEXT:    vpsrlw $8, %xmm0, %xmm1
+; AVX512-NEXT:    vpmullw %xmm0, %xmm1, %xmm0
+; AVX512-NEXT:    vpshufb {{.*#+}} xmm1 = xmm0[2],zero,xmm0[6],zero,xmm0[10],zero,xmm0[14],zero,xmm0[u],zero,xmm0[u],zero,xmm0[u],zero,xmm0[u],zero
 ; AVX512-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT:    vpextrb $0, %xmm0, %eax
 ; AVX512-NEXT:    # kill: def $al killed $al killed $eax
-- 
2.7.4
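
Note on the new X86ISelLowering.cpp logic above: a target shuffle can be
replaced outright by one of its inputs when every demanded output lane is an
"inline" copy (same lane index) of that single input; undef and undemanded
lanes impose no constraint. What follows is a minimal standalone C++ sketch
of that scan for reading purposes only, not LLVM code: a plain uint64_t
bitmask stands in for APInt, and findIdentityOperand is a hypothetical name.

#include <cassert>
#include <cstdint>
#include <vector>

// LLVM shuffle masks use -1 for "undef" lanes; "zero" lanes are -2, which
// the M < 0 test below also rejects.
constexpr int SentinelUndef = -1;

// Returns the index of the single operand that supplies every demanded lane
// at its original position, or -1 if the shuffle is not such an identity.
// Mask entries are flattened across operands: entry M selects lane
// M % NumElts of operand M / NumElts. Assumes NumOps < 64.
int findIdentityOperand(const std::vector<int> &Mask,
                        const std::vector<bool> &DemandedElts, int NumOps) {
  const int NumElts = static_cast<int>(Mask.size());
  uint64_t IdentityOps = (1ULL << NumOps) - 1; // Candidate operands.
  bool AllUndef = true;
  for (int i = 0; i != NumElts; ++i) {
    const int M = Mask[i];
    if (M == SentinelUndef || !DemandedElts[i])
      continue; // Lane constrains nothing.
    AllUndef = false;
    if (M < 0 || M % NumElts != i)
      return -1; // Zeroed lane, or element moved: not inline.
    IdentityOps &= 1ULL << (M / NumElts); // Keep ops only accessed inline.
    if (IdentityOps == 0)
      return -1; // Demanded lanes come from different operands.
  }
  if (AllUndef)
    return -1; // The caller would fold the whole node to UNDEF instead.
  for (int i = 0; i != NumOps; ++i)
    if (IdentityOps & (1ULL << i))
      return i;
  return -1;
}

int main() {
  // Two-operand shuffle of <4 x i32> with mask {0,1,6,7}: if only lanes 0
  // and 1 are demanded, both are inline copies from operand 0, so the whole
  // shuffle can be replaced by operand 0 (bitcast to the shuffle's type).
  const std::vector<int> Mask = {0, 1, 6, 7};
  const std::vector<bool> Demanded = {true, true, false, false};
  assert(findIdentityOperand(Mask, Demanded, 2) == 0);
  return 0;
}

This single-input fold is what lets SimplifyMultipleUseDemandedBits look
through shuffles when only some lanes are demanded, which is why the test
diffs above rewrite pshufb+psrlw pairs into a single inline pshufb.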