define <2 x double> @uitofp_2i32_to_2f64(<4 x i32> %a) {
; SSE2-LABEL: uitofp_2i32_to_2f64:
; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [65535,0,65535,0,0,0,0,0]
-; SSE2-NEXT: pand %xmm0, %xmm1
-; SSE2-NEXT: cvtdq2pd %xmm1, %xmm1
-; SSE2-NEXT: psrld $16, %xmm0
-; SSE2-NEXT: cvtdq2pd %xmm0, %xmm0
-; SSE2-NEXT: mulpd {{.*}}(%rip), %xmm0
-; SSE2-NEXT: addpd %xmm1, %xmm0
+; SSE2-NEXT: xorpd %xmm1, %xmm1
+; SSE2-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE2-NEXT: movapd {{.*#+}} xmm1 = [4.503599627370496E+15,4.503599627370496E+15]
+; SSE2-NEXT: orpd %xmm1, %xmm0
+; SSE2-NEXT: subpd %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: uitofp_2i32_to_2f64:
; SSE41: # %bb.0:
-; SSE41-NEXT: pxor %xmm1, %xmm1
-; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4,5,6,7]
-; SSE41-NEXT: cvtdq2pd %xmm1, %xmm1
-; SSE41-NEXT: psrld $16, %xmm0
-; SSE41-NEXT: cvtdq2pd %xmm0, %xmm0
-; SSE41-NEXT: mulpd {{.*}}(%rip), %xmm0
-; SSE41-NEXT: addpd %xmm1, %xmm0
+; SSE41-NEXT: pmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
+; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [4.503599627370496E+15,4.503599627370496E+15]
+; SSE41-NEXT: por %xmm1, %xmm0
+; SSE41-NEXT: subpd %xmm1, %xmm0
; SSE41-NEXT: retq
;
; VEX-LABEL: uitofp_2i32_to_2f64:
; VEX: # %bb.0:
-; VEX-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; VEX-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4,5,6,7]
-; VEX-NEXT: vcvtdq2pd %xmm1, %xmm1
-; VEX-NEXT: vpsrld $16, %xmm0, %xmm0
-; VEX-NEXT: vcvtdq2pd %xmm0, %xmm0
-; VEX-NEXT: vmulpd {{.*}}(%rip), %xmm0, %xmm0
-; VEX-NEXT: vaddpd %xmm1, %xmm0, %xmm0
+; VEX-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
+; VEX-NEXT: vmovdqa {{.*#+}} xmm1 = [4.503599627370496E+15,4.503599627370496E+15]
+; VEX-NEXT: vpor %xmm1, %xmm0, %xmm0
+; VEX-NEXT: vsubpd %xmm1, %xmm0, %xmm0
; VEX-NEXT: retq
;
; AVX512F-LABEL: uitofp_2i32_to_2f64:
define <2 x double> @uitofp_4i32_to_2f64(<4 x i32> %a) {
; SSE2-LABEL: uitofp_4i32_to_2f64:
; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [65535,0,65535,0,0,0,0,0]
-; SSE2-NEXT: pand %xmm0, %xmm1
-; SSE2-NEXT: cvtdq2pd %xmm1, %xmm1
-; SSE2-NEXT: psrld $16, %xmm0
-; SSE2-NEXT: cvtdq2pd %xmm0, %xmm0
-; SSE2-NEXT: mulpd {{.*}}(%rip), %xmm0
-; SSE2-NEXT: addpd %xmm1, %xmm0
+; SSE2-NEXT: xorpd %xmm1, %xmm1
+; SSE2-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE2-NEXT: movapd {{.*#+}} xmm1 = [4.503599627370496E+15,4.503599627370496E+15]
+; SSE2-NEXT: orpd %xmm1, %xmm0
+; SSE2-NEXT: subpd %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: uitofp_4i32_to_2f64:
; SSE41: # %bb.0:
-; SSE41-NEXT: pxor %xmm1, %xmm1
-; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4,5,6,7]
-; SSE41-NEXT: cvtdq2pd %xmm1, %xmm1
-; SSE41-NEXT: psrld $16, %xmm0
-; SSE41-NEXT: cvtdq2pd %xmm0, %xmm0
-; SSE41-NEXT: mulpd {{.*}}(%rip), %xmm0
-; SSE41-NEXT: addpd %xmm1, %xmm0
+; SSE41-NEXT: pmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
+; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [4.503599627370496E+15,4.503599627370496E+15]
+; SSE41-NEXT: por %xmm1, %xmm0
+; SSE41-NEXT: subpd %xmm1, %xmm0
; SSE41-NEXT: retq
;
; VEX-LABEL: uitofp_4i32_to_2f64:
define <4 x double> @uitofp_4i32_to_4f64(<4 x i32> %a) {
; SSE2-LABEL: uitofp_4i32_to_4f64:
; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa %xmm0, %xmm1
-; SSE2-NEXT: psrld $16, %xmm1
-; SSE2-NEXT: cvtdq2pd %xmm1, %xmm1
-; SSE2-NEXT: movapd {{.*#+}} xmm2 = [6.5536E+4,6.5536E+4]
-; SSE2-NEXT: mulpd %xmm2, %xmm1
-; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [65535,0,65535,0,0,0,0,0]
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[2,3,0,1]
-; SSE2-NEXT: pand %xmm3, %xmm0
-; SSE2-NEXT: cvtdq2pd %xmm0, %xmm0
-; SSE2-NEXT: addpd %xmm1, %xmm0
-; SSE2-NEXT: movdqa %xmm4, %xmm1
-; SSE2-NEXT: psrld $16, %xmm1
-; SSE2-NEXT: cvtdq2pd %xmm1, %xmm5
-; SSE2-NEXT: mulpd %xmm2, %xmm5
-; SSE2-NEXT: pand %xmm3, %xmm4
-; SSE2-NEXT: cvtdq2pd %xmm4, %xmm1
-; SSE2-NEXT: addpd %xmm5, %xmm1
+; SSE2-NEXT: movapd %xmm0, %xmm1
+; SSE2-NEXT: xorpd %xmm2, %xmm2
+; SSE2-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; SSE2-NEXT: movapd {{.*#+}} xmm3 = [4.503599627370496E+15,4.503599627370496E+15]
+; SSE2-NEXT: orpd %xmm3, %xmm0
+; SSE2-NEXT: subpd %xmm3, %xmm0
+; SSE2-NEXT: unpckhps {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3]
+; SSE2-NEXT: orpd %xmm3, %xmm1
+; SSE2-NEXT: subpd %xmm3, %xmm1
; SSE2-NEXT: retq
;
; SSE41-LABEL: uitofp_4i32_to_4f64:
; SSE41: # %bb.0:
-; SSE41-NEXT: movdqa %xmm0, %xmm1
-; SSE41-NEXT: psrld $16, %xmm1
-; SSE41-NEXT: cvtdq2pd %xmm1, %xmm1
-; SSE41-NEXT: movapd {{.*#+}} xmm2 = [6.5536E+4,6.5536E+4]
-; SSE41-NEXT: mulpd %xmm2, %xmm1
-; SSE41-NEXT: pxor %xmm3, %xmm3
-; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm0[2,3,0,1]
-; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1],xmm0[2],xmm3[3],xmm0[4,5,6,7]
-; SSE41-NEXT: cvtdq2pd %xmm0, %xmm0
-; SSE41-NEXT: addpd %xmm1, %xmm0
-; SSE41-NEXT: movdqa %xmm4, %xmm1
-; SSE41-NEXT: psrld $16, %xmm1
-; SSE41-NEXT: cvtdq2pd %xmm1, %xmm5
-; SSE41-NEXT: mulpd %xmm2, %xmm5
-; SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm4[0],xmm3[1],xmm4[2],xmm3[3],xmm4[4,5,6,7]
-; SSE41-NEXT: cvtdq2pd %xmm4, %xmm1
-; SSE41-NEXT: addpd %xmm5, %xmm1
+; SSE41-NEXT: pmovzxdq {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero
+; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [4.503599627370496E+15,4.503599627370496E+15]
+; SSE41-NEXT: por %xmm3, %xmm2
+; SSE41-NEXT: subpd %xmm3, %xmm2
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; SSE41-NEXT: pmovzxdq {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero
+; SSE41-NEXT: por %xmm3, %xmm1
+; SSE41-NEXT: subpd %xmm3, %xmm1
+; SSE41-NEXT: movapd %xmm2, %xmm0
; SSE41-NEXT: retq
;
; AVX1-LABEL: uitofp_4i32_to_4f64:
define <2 x double> @uitofp_load_2i32_to_2f64(<2 x i32> *%a) {
; SSE2-LABEL: uitofp_load_2i32_to_2f64:
; SSE2: # %bb.0:
-; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
-; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [65535,0,65535,0,0,0,0,0]
-; SSE2-NEXT: pand %xmm0, %xmm1
-; SSE2-NEXT: cvtdq2pd %xmm1, %xmm1
-; SSE2-NEXT: psrld $16, %xmm0
-; SSE2-NEXT: cvtdq2pd %xmm0, %xmm0
-; SSE2-NEXT: mulpd {{.*}}(%rip), %xmm0
-; SSE2-NEXT: addpd %xmm1, %xmm0
+; SSE2-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
+; SSE2-NEXT: xorpd %xmm1, %xmm1
+; SSE2-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE2-NEXT: movapd {{.*#+}} xmm1 = [4.503599627370496E+15,4.503599627370496E+15]
+; SSE2-NEXT: orpd %xmm1, %xmm0
+; SSE2-NEXT: subpd %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: uitofp_load_2i32_to_2f64:
; SSE41: # %bb.0:
-; SSE41-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
-; SSE41-NEXT: pxor %xmm1, %xmm1
-; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4,5,6,7]
-; SSE41-NEXT: cvtdq2pd %xmm1, %xmm1
-; SSE41-NEXT: psrld $16, %xmm0
-; SSE41-NEXT: cvtdq2pd %xmm0, %xmm0
-; SSE41-NEXT: mulpd {{.*}}(%rip), %xmm0
-; SSE41-NEXT: addpd %xmm1, %xmm0
+; SSE41-NEXT: pmovzxdq {{.*#+}} xmm0 = mem[0],zero,mem[1],zero
+; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [4.503599627370496E+15,4.503599627370496E+15]
+; SSE41-NEXT: por %xmm1, %xmm0
+; SSE41-NEXT: subpd %xmm1, %xmm0
; SSE41-NEXT: retq
;
; VEX-LABEL: uitofp_load_2i32_to_2f64:
; VEX: # %bb.0:
-; VEX-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
-; VEX-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; VEX-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4,5,6,7]
-; VEX-NEXT: vcvtdq2pd %xmm1, %xmm1
-; VEX-NEXT: vpsrld $16, %xmm0, %xmm0
-; VEX-NEXT: vcvtdq2pd %xmm0, %xmm0
-; VEX-NEXT: vmulpd {{.*}}(%rip), %xmm0, %xmm0
-; VEX-NEXT: vaddpd %xmm1, %xmm0, %xmm0
+; VEX-NEXT: vpmovzxdq {{.*#+}} xmm0 = mem[0],zero,mem[1],zero
+; VEX-NEXT: vmovdqa {{.*#+}} xmm1 = [4.503599627370496E+15,4.503599627370496E+15]
+; VEX-NEXT: vpor %xmm1, %xmm0, %xmm0
+; VEX-NEXT: vsubpd %xmm1, %xmm0, %xmm0
; VEX-NEXT: retq
;
; AVX512F-LABEL: uitofp_load_2i32_to_2f64:
define <2 x double> @uitofp_load_4i32_to_2f64_2(<4 x i32>* %x) {
; SSE2-LABEL: uitofp_load_4i32_to_2f64_2:
; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa (%rdi), %xmm0
-; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [65535,0,65535,0,0,0,0,0]
-; SSE2-NEXT: pand %xmm0, %xmm1
-; SSE2-NEXT: cvtdq2pd %xmm1, %xmm1
-; SSE2-NEXT: psrld $16, %xmm0
-; SSE2-NEXT: cvtdq2pd %xmm0, %xmm0
-; SSE2-NEXT: mulpd {{.*}}(%rip), %xmm0
-; SSE2-NEXT: addpd %xmm1, %xmm0
+; SSE2-NEXT: movapd (%rdi), %xmm0
+; SSE2-NEXT: xorpd %xmm1, %xmm1
+; SSE2-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE2-NEXT: movapd {{.*#+}} xmm1 = [4.503599627370496E+15,4.503599627370496E+15]
+; SSE2-NEXT: orpd %xmm1, %xmm0
+; SSE2-NEXT: subpd %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: uitofp_load_4i32_to_2f64_2:
; SSE41: # %bb.0:
-; SSE41-NEXT: movdqa (%rdi), %xmm0
-; SSE41-NEXT: pxor %xmm1, %xmm1
-; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4,5,6,7]
-; SSE41-NEXT: cvtdq2pd %xmm1, %xmm1
-; SSE41-NEXT: psrld $16, %xmm0
-; SSE41-NEXT: cvtdq2pd %xmm0, %xmm0
-; SSE41-NEXT: mulpd {{.*}}(%rip), %xmm0
-; SSE41-NEXT: addpd %xmm1, %xmm0
+; SSE41-NEXT: pmovzxdq {{.*#+}} xmm0 = mem[0],zero,mem[1],zero
+; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [4.503599627370496E+15,4.503599627370496E+15]
+; SSE41-NEXT: por %xmm1, %xmm0
+; SSE41-NEXT: subpd %xmm1, %xmm0
; SSE41-NEXT: retq
;
; VEX-LABEL: uitofp_load_4i32_to_2f64_2:
define <2 x double> @uitofp_volatile_load_4i32_to_2f64_2(<4 x i32>* %x) {
; SSE2-LABEL: uitofp_volatile_load_4i32_to_2f64_2:
; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa (%rdi), %xmm0
-; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [65535,0,65535,0,0,0,0,0]
-; SSE2-NEXT: pand %xmm0, %xmm1
-; SSE2-NEXT: cvtdq2pd %xmm1, %xmm1
-; SSE2-NEXT: psrld $16, %xmm0
-; SSE2-NEXT: cvtdq2pd %xmm0, %xmm0
-; SSE2-NEXT: mulpd {{.*}}(%rip), %xmm0
-; SSE2-NEXT: addpd %xmm1, %xmm0
+; SSE2-NEXT: movapd (%rdi), %xmm0
+; SSE2-NEXT: xorpd %xmm1, %xmm1
+; SSE2-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE2-NEXT: movapd {{.*#+}} xmm1 = [4.503599627370496E+15,4.503599627370496E+15]
+; SSE2-NEXT: orpd %xmm1, %xmm0
+; SSE2-NEXT: subpd %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: uitofp_volatile_load_4i32_to_2f64_2:
; SSE41: # %bb.0:
-; SSE41-NEXT: movdqa (%rdi), %xmm0
-; SSE41-NEXT: pxor %xmm1, %xmm1
-; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4,5,6,7]
-; SSE41-NEXT: cvtdq2pd %xmm1, %xmm1
-; SSE41-NEXT: psrld $16, %xmm0
-; SSE41-NEXT: cvtdq2pd %xmm0, %xmm0
-; SSE41-NEXT: mulpd {{.*}}(%rip), %xmm0
-; SSE41-NEXT: addpd %xmm1, %xmm0
+; SSE41-NEXT: pmovzxdq {{.*#+}} xmm0 = mem[0],zero,mem[1],zero
+; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [4.503599627370496E+15,4.503599627370496E+15]
+; SSE41-NEXT: por %xmm1, %xmm0
+; SSE41-NEXT: subpd %xmm1, %xmm0
; SSE41-NEXT: retq
;
; VEX-LABEL: uitofp_volatile_load_4i32_to_2f64_2:
define <4 x double> @uitofp_load_4i32_to_4f64(<4 x i32> *%a) {
; SSE2-LABEL: uitofp_load_4i32_to_4f64:
; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa (%rdi), %xmm0
-; SSE2-NEXT: movdqa %xmm0, %xmm1
-; SSE2-NEXT: psrld $16, %xmm1
-; SSE2-NEXT: cvtdq2pd %xmm1, %xmm1
-; SSE2-NEXT: movapd {{.*#+}} xmm2 = [6.5536E+4,6.5536E+4]
-; SSE2-NEXT: mulpd %xmm2, %xmm1
-; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [65535,0,65535,0,0,0,0,0]
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[2,3,0,1]
-; SSE2-NEXT: pand %xmm3, %xmm0
-; SSE2-NEXT: cvtdq2pd %xmm0, %xmm0
-; SSE2-NEXT: addpd %xmm1, %xmm0
-; SSE2-NEXT: movdqa %xmm4, %xmm1
-; SSE2-NEXT: psrld $16, %xmm1
-; SSE2-NEXT: cvtdq2pd %xmm1, %xmm5
-; SSE2-NEXT: mulpd %xmm2, %xmm5
-; SSE2-NEXT: pand %xmm3, %xmm4
-; SSE2-NEXT: cvtdq2pd %xmm4, %xmm1
-; SSE2-NEXT: addpd %xmm5, %xmm1
+; SSE2-NEXT: movapd (%rdi), %xmm1
+; SSE2-NEXT: xorpd %xmm2, %xmm2
+; SSE2-NEXT: movapd %xmm1, %xmm0
+; SSE2-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; SSE2-NEXT: movapd {{.*#+}} xmm3 = [4.503599627370496E+15,4.503599627370496E+15]
+; SSE2-NEXT: orpd %xmm3, %xmm0
+; SSE2-NEXT: subpd %xmm3, %xmm0
+; SSE2-NEXT: unpckhps {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3]
+; SSE2-NEXT: orpd %xmm3, %xmm1
+; SSE2-NEXT: subpd %xmm3, %xmm1
; SSE2-NEXT: retq
;
; SSE41-LABEL: uitofp_load_4i32_to_4f64:
; SSE41: # %bb.0:
-; SSE41-NEXT: movdqa (%rdi), %xmm0
-; SSE41-NEXT: movdqa %xmm0, %xmm1
-; SSE41-NEXT: psrld $16, %xmm1
-; SSE41-NEXT: cvtdq2pd %xmm1, %xmm1
-; SSE41-NEXT: movapd {{.*#+}} xmm2 = [6.5536E+4,6.5536E+4]
-; SSE41-NEXT: mulpd %xmm2, %xmm1
-; SSE41-NEXT: pxor %xmm3, %xmm3
-; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm0[2,3,0,1]
-; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1],xmm0[2],xmm3[3],xmm0[4,5,6,7]
-; SSE41-NEXT: cvtdq2pd %xmm0, %xmm0
-; SSE41-NEXT: addpd %xmm1, %xmm0
-; SSE41-NEXT: movdqa %xmm4, %xmm1
-; SSE41-NEXT: psrld $16, %xmm1
-; SSE41-NEXT: cvtdq2pd %xmm1, %xmm5
-; SSE41-NEXT: mulpd %xmm2, %xmm5
-; SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm4[0],xmm3[1],xmm4[2],xmm3[3],xmm4[4,5,6,7]
-; SSE41-NEXT: cvtdq2pd %xmm4, %xmm1
-; SSE41-NEXT: addpd %xmm5, %xmm1
+; SSE41-NEXT: movdqa (%rdi), %xmm1
+; SSE41-NEXT: pmovzxdq {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero
+; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [4.503599627370496E+15,4.503599627370496E+15]
+; SSE41-NEXT: por %xmm2, %xmm0
+; SSE41-NEXT: subpd %xmm2, %xmm0
+; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; SSE41-NEXT: pmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
+; SSE41-NEXT: por %xmm2, %xmm1
+; SSE41-NEXT: subpd %xmm2, %xmm1
; SSE41-NEXT: retq
;
; AVX1-LABEL: uitofp_load_4i32_to_4f64: