For bitcast nodes from vectors with larger element types, add the ability for SimplifyDemandedVectorElts to call SimplifyDemandedBits by merging the demanded-elements mask into a demanded-bits mask for the wider source elements.
I've raised https://bugs.llvm.org/show_bug.cgi?id=39689 to deal with the few places where SimplifyDemandedBits's lack of vector handling is a problem.
Differential Revision: https://reviews.llvm.org/D54679
llvm-svn: 347301
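
To illustrate the demanded-elts to demanded-bits merge in isolation, here is a minimal standalone C++ sketch (not part of the patch; the helper name is hypothetical, and the little-endian-only assumption mirrors the code added below):

#include "llvm/ADT/APInt.h"
using namespace llvm;

// Collapse a demanded-elements mask for NumElts small elements (each
// EltSizeInBits wide) into a demanded-bits mask for one large source
// element that covers Scale small elements (little-endian layout).
static APInt mergeDemandedEltsToBits(const APInt &DemandedElts,
                                     unsigned NumElts, unsigned Scale,
                                     unsigned EltSizeInBits) {
  unsigned SrcEltSizeInBits = Scale * EltSizeInBits;
  APInt SrcDemandedBits = APInt::getNullValue(SrcEltSizeInBits);
  for (unsigned i = 0; i != NumElts; ++i)
    if (DemandedElts[i]) {
      unsigned Ofs = (i % Scale) * EltSizeInBits;
      SrcDemandedBits.setBits(Ofs, Ofs + EltSizeInBits);
    }
  return SrcDemandedBits;
}

// Example: for a <4 x i32> bitcast from <2 x i64>, Scale = 2; demanding only
// the even i32 lanes marks bits [0,32) of each i64 source element as demanded,
// so SimplifyDemandedBits can ignore the upper halves of the source elements.
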
TLO, Depth + 1))
return true;
+ // Try calling SimplifyDemandedBits, converting demanded elts to the bits
+ // of the large element.
+ // TODO - bigendian once we have test coverage.
+ if (TLO.DAG.getDataLayout().isLittleEndian()) {
+ unsigned SrcEltSizeInBits = SrcVT.getScalarSizeInBits();
+ APInt SrcDemandedBits = APInt::getNullValue(SrcEltSizeInBits);
+ for (unsigned i = 0; i != NumElts; ++i)
+ if (DemandedElts[i]) {
+ unsigned Ofs = (i % Scale) * EltSizeInBits;
+ SrcDemandedBits.setBits(Ofs, Ofs + EltSizeInBits);
+ }
+
+ KnownBits Known;
+ if (SimplifyDemandedBits(Src, SrcDemandedBits, Known, TLO, Depth + 1))
+ return true;
+ }
+
// If the src element is zero/undef then all the output elements that map to
// it will be too - only demanded elements are guaranteed to be correct.
for (unsigned i = 0; i != NumSrcElts; ++i) {
define <4 x float> @knownbits_mask_or_shuffle_uitofp(<4 x i32> %a0) nounwind {
; X32-LABEL: knownbits_mask_or_shuffle_uitofp:
; X32: # %bb.0:
-; X32-NEXT: vpor {{\.LCPI.*}}, %xmm0, %xmm0
-; X32-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; X32-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,6,6]
-; X32-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
-; X32-NEXT: vcvtdq2ps %xmm0, %xmm0
+; X32-NEXT: vmovaps {{.*#+}} xmm0 = [6.5535E+4,6.5535E+4,6.5535E+4,6.5535E+4]
; X32-NEXT: retl
;
; X64-LABEL: knownbits_mask_or_shuffle_uitofp:
; X64: # %bb.0:
-; X64-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0
-; X64-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; X64-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,6,6]
-; X64-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
-; X64-NEXT: vcvtdq2ps %xmm0, %xmm0
+; X64-NEXT: vmovaps {{.*#+}} xmm0 = [6.5535E+4,6.5535E+4,6.5535E+4,6.5535E+4]
; X64-NEXT: retq
%1 = and <4 x i32> %a0, <i32 -1, i32 -1, i32 255, i32 4085>
%2 = or <4 x i32> %1, <i32 65535, i32 65535, i32 65535, i32 65535>
%3 = shufflevector <4 x i32> %2, <4 x i32> undef, <4 x i32> <i32 2, i32 2, i32 3, i32 3>
%4 = uitofp <4 x i32> %3 to <4 x float>
ret <4 x float> %4
}
+; TODO: Fix vpshufd+vpsrlq -> vpshufd/vpermilps
define <2 x double> @signbits_ashr_concat_ashr_extract_sitofp(<2 x i64> %a0, <4 x i64> %a1) nounwind {
; X32-LABEL: signbits_ashr_concat_ashr_extract_sitofp:
; X32: # %bb.0:
;
; X64-LABEL: signbits_ashr_concat_ashr_extract_sitofp:
; X64: # %bb.0:
-; X64-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,3,2,3]
+; X64-NEXT: vpsrlq $32, %xmm0, %xmm0
+; X64-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; X64-NEXT: vcvtdq2pd %xmm0, %xmm0
; X64-NEXT: retq
%1 = ashr <2 x i64> %a0, <i64 16, i64 16>
; X32-LABEL: signbits_ashr_sext_sextinreg_and_extract_sitofp:
; X32: # %bb.0:
; X32-NEXT: pushl %eax
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: vpsrlq $60, %xmm0, %xmm2
; X32-NEXT: vpsrlq $61, %xmm0, %xmm0
; X32-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4,5,6,7]
; X32-NEXT: vmovdqa {{.*#+}} xmm2 = [4,0,0,0,8,0,0,0]
; X32-NEXT: vpxor %xmm2, %xmm0, %xmm0
; X32-NEXT: vpsubq %xmm2, %xmm0, %xmm0
-; X32-NEXT: vpinsrd $0, %eax, %xmm1, %xmm1
-; X32-NEXT: sarl $31, %eax
-; X32-NEXT: vpinsrd $1, %eax, %xmm1, %xmm1
-; X32-NEXT: vpsllq $20, %xmm1, %xmm1
-; X32-NEXT: vpsrad $20, %xmm1, %xmm2
-; X32-NEXT: vpsrlq $20, %xmm1, %xmm1
-; X32-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
+; X32-NEXT: vpinsrd $0, {{[0-9]+}}(%esp), %xmm1, %xmm1
; X32-NEXT: vpand %xmm1, %xmm0, %xmm0
; X32-NEXT: vmovd %xmm0, %eax
; X32-NEXT: vcvtsi2ssl %eax, %xmm3, %xmm0
;
; AVX2-LABEL: trunc_shl_17_v8i16_v8i32:
; AVX2: # %bb.0:
-; AVX2-NEXT: vpslld $17, %ymm0, %ymm0
-; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
-; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
-; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: vxorps %xmm0, %xmm0, %xmm0
; AVX2-NEXT: retq
%shl = shl <8 x i32> %a, <i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17>
%conv = trunc <8 x i32> %shl to <8 x i16>
; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
; X86-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,1,3]
; X86-SSE-NEXT: pmuludq %xmm0, %xmm1
-; X86-SSE-NEXT: pmuludq %xmm0, %xmm2
-; X86-SSE-NEXT: psllq $32, %xmm2
-; X86-SSE-NEXT: paddq %xmm1, %xmm2
-; X86-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
+; X86-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3]
; X86-SSE-NEXT: movq %xmm0, (%esi,%ecx,4)
; X86-SSE-NEXT: popl %esi
; X86-SSE-NEXT: retl
; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
; X64-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,1,3]
; X64-SSE-NEXT: pmuludq %xmm0, %xmm1
-; X64-SSE-NEXT: pmuludq %xmm0, %xmm2
-; X64-SSE-NEXT: psllq $32, %xmm2
-; X64-SSE-NEXT: paddq %xmm1, %xmm2
-; X64-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
+; X64-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3]
; X64-SSE-NEXT: movq %xmm0, (%rax,%rdx,4)
; X64-SSE-NEXT: retq
;
; X86-SSE-NEXT: pxor %xmm1, %xmm1
; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; X86-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3]
-; X86-SSE-NEXT: movdqa {{.*#+}} xmm2 = <0,u,65536,u>
-; X86-SSE-NEXT: pmuludq %xmm2, %xmm1
-; X86-SSE-NEXT: movdqa %xmm2, %xmm3
-; X86-SSE-NEXT: psrlq $32, %xmm3
-; X86-SSE-NEXT: pmuludq %xmm0, %xmm3
-; X86-SSE-NEXT: paddq %xmm1, %xmm3
-; X86-SSE-NEXT: psllq $32, %xmm3
-; X86-SSE-NEXT: pmuludq %xmm2, %xmm0
-; X86-SSE-NEXT: paddq %xmm3, %xmm0
+; X86-SSE-NEXT: pmuludq {{\.LCPI.*}}, %xmm0
; X86-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; X86-SSE-NEXT: movq %xmm0, (%edx,%eax,4)
; X86-SSE-NEXT: retl
; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; X64-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3]
; X64-SSE-NEXT: movl $65536, %ecx # imm = 0x10000
-; X64-SSE-NEXT: movq %rcx, %xmm2
-; X64-SSE-NEXT: pslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4,5,6,7]
-; X64-SSE-NEXT: pmuludq %xmm2, %xmm0
-; X64-SSE-NEXT: pmuludq %xmm1, %xmm2
-; X64-SSE-NEXT: psllq $32, %xmm2
-; X64-SSE-NEXT: paddq %xmm0, %xmm2
-; X64-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
+; X64-SSE-NEXT: movq %rcx, %xmm1
+; X64-SSE-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5,6,7]
+; X64-SSE-NEXT: pmuludq %xmm0, %xmm1
+; X64-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3]
; X64-SSE-NEXT: movq %xmm0, (%rax,%rsi,4)
; X64-SSE-NEXT: retq
;
; X86-SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,1,4,5,6,7]
; X86-SSE-NEXT: psrad $16, %xmm0
; X86-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3]
-; X86-SSE-NEXT: movdqa {{.*#+}} xmm1 = <0,u,32768,u>
-; X86-SSE-NEXT: pxor %xmm2, %xmm2
-; X86-SSE-NEXT: pmuludq %xmm1, %xmm2
-; X86-SSE-NEXT: movdqa %xmm1, %xmm3
-; X86-SSE-NEXT: psrlq $32, %xmm3
-; X86-SSE-NEXT: pmuludq %xmm0, %xmm3
-; X86-SSE-NEXT: paddq %xmm2, %xmm3
-; X86-SSE-NEXT: psllq $32, %xmm3
-; X86-SSE-NEXT: pmuludq %xmm1, %xmm0
-; X86-SSE-NEXT: paddq %xmm3, %xmm0
+; X86-SSE-NEXT: pmuludq {{\.LCPI.*}}, %xmm0
; X86-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; X86-SSE-NEXT: movq %xmm0, (%edx,%eax,4)
; X86-SSE-NEXT: retl
; X64-SSE-NEXT: movl $32768, %ecx # imm = 0x8000
; X64-SSE-NEXT: movq %rcx, %xmm1
; X64-SSE-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5,6,7]
-; X64-SSE-NEXT: pmuludq %xmm1, %xmm0
-; X64-SSE-NEXT: pxor %xmm2, %xmm2
-; X64-SSE-NEXT: pmuludq %xmm1, %xmm2
-; X64-SSE-NEXT: psllq $32, %xmm2
-; X64-SSE-NEXT: paddq %xmm0, %xmm2
-; X64-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
+; X64-SSE-NEXT: pmuludq %xmm0, %xmm1
+; X64-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3]
; X64-SSE-NEXT: movq %xmm0, (%rax,%rsi,4)
; X64-SSE-NEXT: retq
;
define <4 x i32> @trunc_mul_v4i64_v4i32(<4 x i64> %a0, <4 x i64> %a1) nounwind {
; SSE-LABEL: trunc_mul_v4i64_v4i32:
; SSE: # %bb.0:
-; SSE-NEXT: movdqa %xmm1, %xmm4
-; SSE-NEXT: psrlq $32, %xmm4
-; SSE-NEXT: pmuludq %xmm3, %xmm4
-; SSE-NEXT: movdqa %xmm3, %xmm5
-; SSE-NEXT: psrlq $32, %xmm5
-; SSE-NEXT: pmuludq %xmm1, %xmm5
-; SSE-NEXT: paddq %xmm4, %xmm5
-; SSE-NEXT: psllq $32, %xmm5
; SSE-NEXT: pmuludq %xmm3, %xmm1
-; SSE-NEXT: paddq %xmm5, %xmm1
-; SSE-NEXT: movdqa %xmm0, %xmm3
-; SSE-NEXT: psrlq $32, %xmm3
-; SSE-NEXT: pmuludq %xmm2, %xmm3
-; SSE-NEXT: movdqa %xmm2, %xmm4
-; SSE-NEXT: psrlq $32, %xmm4
-; SSE-NEXT: pmuludq %xmm0, %xmm4
-; SSE-NEXT: paddq %xmm3, %xmm4
-; SSE-NEXT: psllq $32, %xmm4
; SSE-NEXT: pmuludq %xmm2, %xmm0
-; SSE-NEXT: paddq %xmm4, %xmm0
; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; SSE-NEXT: retq
;
define <16 x i8> @trunc_mul_v16i64_v16i8(<16 x i64> %a0, <16 x i64> %a1) nounwind {
; SSE-LABEL: trunc_mul_v16i64_v16i8:
; SSE: # %bb.0:
-; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm8
-; SSE-NEXT: movdqa %xmm0, %xmm9
-; SSE-NEXT: psrlq $32, %xmm9
-; SSE-NEXT: pmuludq %xmm8, %xmm9
-; SSE-NEXT: movdqa %xmm8, %xmm10
-; SSE-NEXT: psrlq $32, %xmm10
-; SSE-NEXT: pmuludq %xmm0, %xmm10
-; SSE-NEXT: paddq %xmm9, %xmm10
-; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm9
-; SSE-NEXT: psllq $32, %xmm10
-; SSE-NEXT: pmuludq %xmm8, %xmm0
-; SSE-NEXT: paddq %xmm10, %xmm0
-; SSE-NEXT: movdqa %xmm1, %xmm8
-; SSE-NEXT: psrlq $32, %xmm8
-; SSE-NEXT: pmuludq %xmm9, %xmm8
-; SSE-NEXT: movdqa %xmm9, %xmm10
-; SSE-NEXT: psrlq $32, %xmm10
-; SSE-NEXT: pmuludq %xmm1, %xmm10
-; SSE-NEXT: paddq %xmm8, %xmm10
-; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm8
-; SSE-NEXT: psllq $32, %xmm10
-; SSE-NEXT: pmuludq %xmm9, %xmm1
-; SSE-NEXT: paddq %xmm10, %xmm1
-; SSE-NEXT: movdqa %xmm2, %xmm9
-; SSE-NEXT: psrlq $32, %xmm9
-; SSE-NEXT: pmuludq %xmm8, %xmm9
-; SSE-NEXT: movdqa %xmm8, %xmm10
-; SSE-NEXT: psrlq $32, %xmm10
-; SSE-NEXT: pmuludq %xmm2, %xmm10
-; SSE-NEXT: paddq %xmm9, %xmm10
-; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm9
-; SSE-NEXT: psllq $32, %xmm10
-; SSE-NEXT: pmuludq %xmm8, %xmm2
-; SSE-NEXT: paddq %xmm10, %xmm2
-; SSE-NEXT: movdqa %xmm3, %xmm8
-; SSE-NEXT: psrlq $32, %xmm8
-; SSE-NEXT: pmuludq %xmm9, %xmm8
-; SSE-NEXT: movdqa %xmm9, %xmm10
-; SSE-NEXT: psrlq $32, %xmm10
-; SSE-NEXT: pmuludq %xmm3, %xmm10
-; SSE-NEXT: paddq %xmm8, %xmm10
-; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm8
-; SSE-NEXT: psllq $32, %xmm10
-; SSE-NEXT: pmuludq %xmm9, %xmm3
-; SSE-NEXT: paddq %xmm10, %xmm3
-; SSE-NEXT: movdqa %xmm4, %xmm9
-; SSE-NEXT: psrlq $32, %xmm9
-; SSE-NEXT: pmuludq %xmm8, %xmm9
-; SSE-NEXT: movdqa %xmm8, %xmm10
-; SSE-NEXT: psrlq $32, %xmm10
-; SSE-NEXT: pmuludq %xmm4, %xmm10
-; SSE-NEXT: paddq %xmm9, %xmm10
-; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm9
-; SSE-NEXT: psllq $32, %xmm10
-; SSE-NEXT: pmuludq %xmm8, %xmm4
-; SSE-NEXT: paddq %xmm10, %xmm4
-; SSE-NEXT: movdqa %xmm5, %xmm8
-; SSE-NEXT: psrlq $32, %xmm8
-; SSE-NEXT: pmuludq %xmm9, %xmm8
-; SSE-NEXT: movdqa %xmm9, %xmm10
-; SSE-NEXT: psrlq $32, %xmm10
-; SSE-NEXT: pmuludq %xmm5, %xmm10
-; SSE-NEXT: paddq %xmm8, %xmm10
-; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm8
-; SSE-NEXT: psllq $32, %xmm10
-; SSE-NEXT: pmuludq %xmm9, %xmm5
-; SSE-NEXT: paddq %xmm10, %xmm5
-; SSE-NEXT: movdqa %xmm6, %xmm9
-; SSE-NEXT: psrlq $32, %xmm9
-; SSE-NEXT: pmuludq %xmm8, %xmm9
-; SSE-NEXT: movdqa %xmm8, %xmm10
-; SSE-NEXT: psrlq $32, %xmm10
-; SSE-NEXT: pmuludq %xmm6, %xmm10
-; SSE-NEXT: paddq %xmm9, %xmm10
-; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm9
-; SSE-NEXT: psllq $32, %xmm10
-; SSE-NEXT: pmuludq %xmm8, %xmm6
-; SSE-NEXT: paddq %xmm10, %xmm6
-; SSE-NEXT: movdqa %xmm7, %xmm8
-; SSE-NEXT: psrlq $32, %xmm8
-; SSE-NEXT: pmuludq %xmm9, %xmm8
-; SSE-NEXT: movdqa %xmm9, %xmm10
-; SSE-NEXT: psrlq $32, %xmm10
-; SSE-NEXT: pmuludq %xmm7, %xmm10
-; SSE-NEXT: paddq %xmm8, %xmm10
-; SSE-NEXT: pmuludq %xmm9, %xmm7
-; SSE-NEXT: psllq $32, %xmm10
-; SSE-NEXT: paddq %xmm10, %xmm7
+; SSE-NEXT: pmuludq {{[0-9]+}}(%rsp), %xmm0
+; SSE-NEXT: pmuludq {{[0-9]+}}(%rsp), %xmm1
+; SSE-NEXT: pmuludq {{[0-9]+}}(%rsp), %xmm2
+; SSE-NEXT: pmuludq {{[0-9]+}}(%rsp), %xmm3
+; SSE-NEXT: pmuludq {{[0-9]+}}(%rsp), %xmm4
+; SSE-NEXT: pmuludq {{[0-9]+}}(%rsp), %xmm5
+; SSE-NEXT: pmuludq {{[0-9]+}}(%rsp), %xmm6
+; SSE-NEXT: pmuludq {{[0-9]+}}(%rsp), %xmm7
; SSE-NEXT: movdqa {{.*#+}} xmm8 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
; SSE-NEXT: pand %xmm8, %xmm7
; SSE-NEXT: pand %xmm8, %xmm6
define <4 x i32> @trunc_mul_const_v4i64_v4i32(<4 x i64> %a0) nounwind {
; SSE-LABEL: trunc_mul_const_v4i64_v4i32:
; SSE: # %bb.0:
-; SSE-NEXT: movdqa {{.*#+}} xmm2 = [2,3]
-; SSE-NEXT: movdqa %xmm1, %xmm3
-; SSE-NEXT: pmuludq %xmm2, %xmm3
-; SSE-NEXT: psrlq $32, %xmm1
-; SSE-NEXT: pmuludq %xmm2, %xmm1
-; SSE-NEXT: psllq $32, %xmm1
-; SSE-NEXT: paddq %xmm3, %xmm1
; SSE-NEXT: movl $1, %eax
; SSE-NEXT: movq %rax, %xmm2
; SSE-NEXT: pslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4,5,6,7]
-; SSE-NEXT: movdqa %xmm0, %xmm3
-; SSE-NEXT: pmuludq %xmm2, %xmm3
-; SSE-NEXT: psrlq $32, %xmm0
; SSE-NEXT: pmuludq %xmm2, %xmm0
-; SSE-NEXT: psllq $32, %xmm0
-; SSE-NEXT: paddq %xmm3, %xmm0
+; SSE-NEXT: pmuludq {{.*}}(%rip), %xmm1
; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; SSE-NEXT: retq
;
; SSE-NEXT: movl $1, %eax
; SSE-NEXT: movq %rax, %xmm8
; SSE-NEXT: pslldq {{.*#+}} xmm8 = zero,zero,zero,zero,zero,zero,zero,zero,xmm8[0,1,2,3,4,5,6,7]
-; SSE-NEXT: movdqa %xmm0, %xmm9
-; SSE-NEXT: pmuludq %xmm8, %xmm9
-; SSE-NEXT: psrlq $32, %xmm0
; SSE-NEXT: pmuludq %xmm8, %xmm0
-; SSE-NEXT: psllq $32, %xmm0
-; SSE-NEXT: paddq %xmm9, %xmm0
-; SSE-NEXT: movdqa {{.*#+}} xmm8 = [2,3]
-; SSE-NEXT: movdqa %xmm1, %xmm9
-; SSE-NEXT: pmuludq %xmm8, %xmm9
-; SSE-NEXT: psrlq $32, %xmm1
-; SSE-NEXT: pmuludq %xmm8, %xmm1
-; SSE-NEXT: psllq $32, %xmm1
-; SSE-NEXT: paddq %xmm9, %xmm1
-; SSE-NEXT: movdqa {{.*#+}} xmm8 = [4,5]
-; SSE-NEXT: movdqa %xmm2, %xmm9
-; SSE-NEXT: pmuludq %xmm8, %xmm9
-; SSE-NEXT: psrlq $32, %xmm2
-; SSE-NEXT: pmuludq %xmm8, %xmm2
-; SSE-NEXT: psllq $32, %xmm2
-; SSE-NEXT: paddq %xmm9, %xmm2
-; SSE-NEXT: movdqa {{.*#+}} xmm8 = [6,7]
-; SSE-NEXT: movdqa %xmm3, %xmm9
-; SSE-NEXT: pmuludq %xmm8, %xmm9
-; SSE-NEXT: psrlq $32, %xmm3
-; SSE-NEXT: pmuludq %xmm8, %xmm3
-; SSE-NEXT: psllq $32, %xmm3
-; SSE-NEXT: paddq %xmm9, %xmm3
-; SSE-NEXT: movdqa {{.*#+}} xmm8 = [8,9]
-; SSE-NEXT: movdqa %xmm4, %xmm9
-; SSE-NEXT: pmuludq %xmm8, %xmm9
-; SSE-NEXT: psrlq $32, %xmm4
-; SSE-NEXT: pmuludq %xmm8, %xmm4
-; SSE-NEXT: psllq $32, %xmm4
-; SSE-NEXT: paddq %xmm9, %xmm4
-; SSE-NEXT: movdqa {{.*#+}} xmm8 = [10,11]
-; SSE-NEXT: movdqa %xmm5, %xmm9
-; SSE-NEXT: pmuludq %xmm8, %xmm9
-; SSE-NEXT: psrlq $32, %xmm5
-; SSE-NEXT: pmuludq %xmm8, %xmm5
-; SSE-NEXT: psllq $32, %xmm5
-; SSE-NEXT: paddq %xmm9, %xmm5
-; SSE-NEXT: movdqa {{.*#+}} xmm8 = [12,13]
-; SSE-NEXT: movdqa %xmm6, %xmm9
-; SSE-NEXT: pmuludq %xmm8, %xmm9
-; SSE-NEXT: psrlq $32, %xmm6
-; SSE-NEXT: pmuludq %xmm8, %xmm6
-; SSE-NEXT: psllq $32, %xmm6
-; SSE-NEXT: paddq %xmm9, %xmm6
-; SSE-NEXT: movdqa {{.*#+}} xmm8 = [14,15]
-; SSE-NEXT: movdqa %xmm7, %xmm9
-; SSE-NEXT: pmuludq %xmm8, %xmm9
-; SSE-NEXT: psrlq $32, %xmm7
-; SSE-NEXT: pmuludq %xmm8, %xmm7
-; SSE-NEXT: psllq $32, %xmm7
-; SSE-NEXT: paddq %xmm9, %xmm7
+; SSE-NEXT: pmuludq {{.*}}(%rip), %xmm1
+; SSE-NEXT: pmuludq {{.*}}(%rip), %xmm2
+; SSE-NEXT: pmuludq {{.*}}(%rip), %xmm3
+; SSE-NEXT: pmuludq {{.*}}(%rip), %xmm4
+; SSE-NEXT: pmuludq {{.*}}(%rip), %xmm5
+; SSE-NEXT: pmuludq {{.*}}(%rip), %xmm6
+; SSE-NEXT: pmuludq {{.*}}(%rip), %xmm7
; SSE-NEXT: movdqa {{.*#+}} xmm8 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
; SSE-NEXT: pand %xmm8, %xmm7
; SSE-NEXT: pand %xmm8, %xmm6
; SSE: # %bb.0:
; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,1,1,3]
; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,1,3,3]
-; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm1[0,1,1,3]
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,1,1,3]
+; SSE-NEXT: pmuludq %xmm2, %xmm0
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,3,3]
-; SSE-NEXT: pmuludq %xmm1, %xmm3
-; SSE-NEXT: pxor %xmm0, %xmm0
-; SSE-NEXT: pmuludq %xmm0, %xmm1
-; SSE-NEXT: psllq $32, %xmm1
-; SSE-NEXT: paddq %xmm3, %xmm1
-; SSE-NEXT: pmuludq %xmm4, %xmm2
-; SSE-NEXT: pmuludq %xmm4, %xmm0
-; SSE-NEXT: psllq $32, %xmm0
-; SSE-NEXT: paddq %xmm2, %xmm0
+; SSE-NEXT: pmuludq %xmm3, %xmm1
; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; SSE-NEXT: paddd {{.*}}(%rip), %xmm0
; SSE-NEXT: retq
define <4 x i32> @mul_add_self_v4i64_v4i32(<4 x i32> %a0, <4 x i32> %a1) nounwind {
; SSE-LABEL: mul_add_self_v4i64_v4i32:
; SSE: # %bb.0:
-; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
-; SSE-NEXT: pxor %xmm8, %xmm8
-; SSE-NEXT: pxor %xmm3, %xmm3
-; SSE-NEXT: pcmpgtd %xmm2, %xmm3
-; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
-; SSE-NEXT: pxor %xmm7, %xmm7
-; SSE-NEXT: pcmpgtd %xmm0, %xmm7
-; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1]
-; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm1[2,3,0,1]
-; SSE-NEXT: pxor %xmm6, %xmm6
-; SSE-NEXT: pcmpgtd %xmm4, %xmm6
-; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1]
-; SSE-NEXT: pxor %xmm5, %xmm5
-; SSE-NEXT: pcmpgtd %xmm1, %xmm5
-; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1]
-; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1]
-; SSE-NEXT: pmuludq %xmm1, %xmm7
-; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm8[0],xmm5[1],xmm8[1]
-; SSE-NEXT: pmuludq %xmm0, %xmm5
-; SSE-NEXT: paddq %xmm7, %xmm5
-; SSE-NEXT: psllq $32, %xmm5
-; SSE-NEXT: pmuludq %xmm0, %xmm1
-; SSE-NEXT: paddq %xmm5, %xmm1
-; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm8[0],xmm3[1],xmm8[1]
-; SSE-NEXT: pmuludq %xmm4, %xmm3
-; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm8[0],xmm6[1],xmm8[1]
-; SSE-NEXT: pmuludq %xmm2, %xmm6
-; SSE-NEXT: paddq %xmm3, %xmm6
-; SSE-NEXT: psllq $32, %xmm6
-; SSE-NEXT: pmuludq %xmm2, %xmm4
-; SSE-NEXT: paddq %xmm6, %xmm4
-; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm4[0,2]
-; SSE-NEXT: paddd %xmm1, %xmm1
-; SSE-NEXT: movdqa %xmm1, %xmm0
+; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,1,1,3]
+; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,1,3,3]
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,1,1,3]
+; SSE-NEXT: pmuludq %xmm2, %xmm0
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,3,3]
+; SSE-NEXT: pmuludq %xmm3, %xmm1
+; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
+; SSE-NEXT: paddd %xmm0, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: mul_add_self_v4i64_v4i32:
; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,1,1,3]
; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,1,3,3]
; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm1[0,1,1,3]
+; SSE-NEXT: pmuludq %xmm2, %xmm4
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,3,3]
-; SSE-NEXT: pmuludq %xmm1, %xmm3
-; SSE-NEXT: pxor %xmm5, %xmm5
-; SSE-NEXT: pmuludq %xmm5, %xmm1
-; SSE-NEXT: psllq $32, %xmm1
-; SSE-NEXT: paddq %xmm3, %xmm1
-; SSE-NEXT: pmuludq %xmm4, %xmm2
-; SSE-NEXT: pmuludq %xmm4, %xmm5
-; SSE-NEXT: psllq $32, %xmm5
-; SSE-NEXT: paddq %xmm2, %xmm5
-; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,2],xmm1[0,2]
-; SSE-NEXT: paddd %xmm5, %xmm0
+; SSE-NEXT: pmuludq %xmm3, %xmm1
+; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,2],xmm1[0,2]
+; SSE-NEXT: paddd %xmm4, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: mul_add_multiuse_v4i64_v4i32: