; X86-SSE-NEXT: divl %ecx
; X86-SSE-NEXT: movd %edx, %xmm4
; X86-SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
+; X86-SSE-NEXT: movd %xmm1, %ecx
; X86-SSE-NEXT: movl %esi, %eax
; X86-SSE-NEXT: xorl %edx, %edx
-; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X86-SSE-NEXT: divl 16(%esi)
+; X86-SSE-NEXT: divl %ecx
; X86-SSE-NEXT: movd %edx, %xmm3
; X86-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,1,1]
; X86-SSE-NEXT: movd %xmm2, %eax
; X86-SSE-NEXT: movd %edx, %xmm1
; X86-SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
; X86-SSE-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm4[0]
+; X86-SSE-NEXT: movd %xmm0, %ecx
; X86-SSE-NEXT: movl %edi, %eax
; X86-SSE-NEXT: xorl %edx, %edx
-; X86-SSE-NEXT: divl (%esi)
+; X86-SSE-NEXT: divl %ecx
; X86-SSE-NEXT: movd %edx, %xmm1
; X86-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,1,1]
; X86-SSE-NEXT: movd %xmm2, %ecx
; X86-SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; X86-SSE-NEXT: movl (%esp), %eax # 4-byte Reload
; X86-SSE-NEXT: xorl %edx, %edx
-; X86-SSE-NEXT: divl 32(%esi)
+; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-SSE-NEXT: divl 32(%ecx)
; X86-SSE-NEXT: movdqa {{.*#+}} xmm0 = [8199,8199,8199,8199]
; X86-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
; X86-SSE-NEXT: pmuludq %xmm0, %xmm1
; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
-; X86-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
; X86-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
-; X86-AVX1-NEXT: vmovd %xmm2, %eax
+; X86-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
+; X86-AVX1-NEXT: vmovd %xmm1, %eax
; X86-AVX1-NEXT: xorl %edx, %edx
; X86-AVX1-NEXT: divl 32(%ecx)
; X86-AVX1-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-AVX1-NEXT: vpextrd $3, %xmm1, %eax
+; X86-AVX1-NEXT: vpextrd $3, %xmm2, %eax
+; X86-AVX1-NEXT: vmovdqa (%ecx), %xmm1
+; X86-AVX1-NEXT: vmovdqa 16(%ecx), %xmm3
+; X86-AVX1-NEXT: vpextrd $3, %xmm3, %ecx
; X86-AVX1-NEXT: xorl %edx, %edx
-; X86-AVX1-NEXT: divl 28(%ecx)
+; X86-AVX1-NEXT: divl %ecx
; X86-AVX1-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-AVX1-NEXT: vpextrd $2, %xmm1, %eax
+; X86-AVX1-NEXT: vpextrd $2, %xmm2, %eax
+; X86-AVX1-NEXT: vpextrd $2, %xmm3, %ecx
; X86-AVX1-NEXT: xorl %edx, %edx
-; X86-AVX1-NEXT: divl 24(%ecx)
+; X86-AVX1-NEXT: divl %ecx
; X86-AVX1-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-AVX1-NEXT: vpextrd $1, %xmm1, %eax
+; X86-AVX1-NEXT: vpextrd $1, %xmm2, %eax
+; X86-AVX1-NEXT: vpextrd $1, %xmm3, %ecx
; X86-AVX1-NEXT: xorl %edx, %edx
-; X86-AVX1-NEXT: divl 20(%ecx)
+; X86-AVX1-NEXT: divl %ecx
; X86-AVX1-NEXT: movl %edx, (%esp) # 4-byte Spill
-; X86-AVX1-NEXT: vmovd %xmm1, %eax
+; X86-AVX1-NEXT: vmovd %xmm2, %eax
+; X86-AVX1-NEXT: vmovd %xmm3, %ecx
; X86-AVX1-NEXT: xorl %edx, %edx
-; X86-AVX1-NEXT: divl 16(%ecx)
+; X86-AVX1-NEXT: divl %ecx
; X86-AVX1-NEXT: movl %edx, %ebp
; X86-AVX1-NEXT: vpextrd $3, %xmm0, %eax
+; X86-AVX1-NEXT: vpextrd $3, %xmm1, %ecx
; X86-AVX1-NEXT: xorl %edx, %edx
-; X86-AVX1-NEXT: divl 12(%ecx)
+; X86-AVX1-NEXT: divl %ecx
; X86-AVX1-NEXT: movl %edx, %ebx
; X86-AVX1-NEXT: vpextrd $2, %xmm0, %eax
+; X86-AVX1-NEXT: vpextrd $2, %xmm1, %esi
; X86-AVX1-NEXT: xorl %edx, %edx
-; X86-AVX1-NEXT: divl 8(%ecx)
+; X86-AVX1-NEXT: divl %esi
; X86-AVX1-NEXT: movl %edx, %esi
; X86-AVX1-NEXT: vpextrd $1, %xmm0, %eax
+; X86-AVX1-NEXT: vpextrd $1, %xmm1, %edi
; X86-AVX1-NEXT: xorl %edx, %edx
-; X86-AVX1-NEXT: divl 4(%ecx)
+; X86-AVX1-NEXT: divl %edi
; X86-AVX1-NEXT: movl %edx, %edi
; X86-AVX1-NEXT: vmovd %xmm0, %eax
+; X86-AVX1-NEXT: vmovd %xmm1, %ecx
; X86-AVX1-NEXT: xorl %edx, %edx
-; X86-AVX1-NEXT: divl (%ecx)
+; X86-AVX1-NEXT: divl %ecx
; X86-AVX1-NEXT: vmovd %edx, %xmm0
; X86-AVX1-NEXT: vpinsrd $1, %edi, %xmm0, %xmm0
; X86-AVX1-NEXT: vpinsrd $2, %esi, %xmm0, %xmm0
;
; X86-AVX2-LABEL: PR34947:
; X86-AVX2: # %bb.0:
+; X86-AVX2-NEXT: pushl %edi
; X86-AVX2-NEXT: pushl %esi
; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %esi
; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
; X86-AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
-; X86-AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
-; X86-AVX2-NEXT: vpextrd $1, %xmm2, %eax
+; X86-AVX2-NEXT: vmovdqa (%esi), %xmm2
+; X86-AVX2-NEXT: vmovdqa 16(%esi), %xmm3
+; X86-AVX2-NEXT: vpextrd $1, %xmm3, %ecx
+; X86-AVX2-NEXT: vextracti128 $1, %ymm1, %xmm4
+; X86-AVX2-NEXT: vpextrd $1, %xmm4, %eax
; X86-AVX2-NEXT: xorl %edx, %edx
-; X86-AVX2-NEXT: divl 20(%esi)
+; X86-AVX2-NEXT: divl %ecx
; X86-AVX2-NEXT: movl %edx, %ecx
-; X86-AVX2-NEXT: vmovd %xmm2, %eax
+; X86-AVX2-NEXT: vmovd %xmm3, %edi
+; X86-AVX2-NEXT: vmovd %xmm4, %eax
; X86-AVX2-NEXT: xorl %edx, %edx
-; X86-AVX2-NEXT: divl 16(%esi)
-; X86-AVX2-NEXT: vmovd %edx, %xmm3
-; X86-AVX2-NEXT: vpinsrd $1, %ecx, %xmm3, %xmm3
-; X86-AVX2-NEXT: vpextrd $2, %xmm2, %eax
+; X86-AVX2-NEXT: divl %edi
+; X86-AVX2-NEXT: vmovd %edx, %xmm5
+; X86-AVX2-NEXT: vpinsrd $1, %ecx, %xmm5, %xmm5
+; X86-AVX2-NEXT: vpextrd $2, %xmm3, %ecx
+; X86-AVX2-NEXT: vpextrd $2, %xmm4, %eax
; X86-AVX2-NEXT: xorl %edx, %edx
-; X86-AVX2-NEXT: divl 24(%esi)
-; X86-AVX2-NEXT: vpinsrd $2, %edx, %xmm3, %xmm3
-; X86-AVX2-NEXT: vpextrd $3, %xmm2, %eax
+; X86-AVX2-NEXT: divl %ecx
+; X86-AVX2-NEXT: vpinsrd $2, %edx, %xmm5, %xmm5
+; X86-AVX2-NEXT: vpextrd $3, %xmm3, %ecx
+; X86-AVX2-NEXT: vpextrd $3, %xmm4, %eax
; X86-AVX2-NEXT: xorl %edx, %edx
-; X86-AVX2-NEXT: divl 28(%esi)
-; X86-AVX2-NEXT: vpinsrd $3, %edx, %xmm3, %xmm2
+; X86-AVX2-NEXT: divl %ecx
+; X86-AVX2-NEXT: vpinsrd $3, %edx, %xmm5, %xmm3
+; X86-AVX2-NEXT: vpextrd $1, %xmm2, %ecx
; X86-AVX2-NEXT: vpextrd $1, %xmm1, %eax
; X86-AVX2-NEXT: xorl %edx, %edx
-; X86-AVX2-NEXT: divl 4(%esi)
+; X86-AVX2-NEXT: divl %ecx
; X86-AVX2-NEXT: movl %edx, %ecx
+; X86-AVX2-NEXT: vmovd %xmm2, %edi
; X86-AVX2-NEXT: vmovd %xmm1, %eax
; X86-AVX2-NEXT: xorl %edx, %edx
-; X86-AVX2-NEXT: divl (%esi)
-; X86-AVX2-NEXT: vmovd %edx, %xmm3
-; X86-AVX2-NEXT: vpinsrd $1, %ecx, %xmm3, %xmm3
+; X86-AVX2-NEXT: divl %edi
+; X86-AVX2-NEXT: vmovd %edx, %xmm4
+; X86-AVX2-NEXT: vpinsrd $1, %ecx, %xmm4, %xmm4
+; X86-AVX2-NEXT: vpextrd $2, %xmm2, %ecx
; X86-AVX2-NEXT: vpextrd $2, %xmm1, %eax
; X86-AVX2-NEXT: xorl %edx, %edx
-; X86-AVX2-NEXT: divl 8(%esi)
-; X86-AVX2-NEXT: vpinsrd $2, %edx, %xmm3, %xmm3
+; X86-AVX2-NEXT: divl %ecx
+; X86-AVX2-NEXT: vpinsrd $2, %edx, %xmm4, %xmm4
+; X86-AVX2-NEXT: vpextrd $3, %xmm2, %ecx
; X86-AVX2-NEXT: vpextrd $3, %xmm1, %eax
; X86-AVX2-NEXT: xorl %edx, %edx
-; X86-AVX2-NEXT: divl 12(%esi)
-; X86-AVX2-NEXT: vpinsrd $3, %edx, %xmm3, %xmm1
-; X86-AVX2-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
+; X86-AVX2-NEXT: divl %ecx
+; X86-AVX2-NEXT: vpinsrd $3, %edx, %xmm4, %xmm1
+; X86-AVX2-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1
; X86-AVX2-NEXT: vmovd %xmm0, %eax
; X86-AVX2-NEXT: xorl %edx, %edx
; X86-AVX2-NEXT: divl 32(%esi)
; X86-AVX2-NEXT: movl %eax, (%eax)
; X86-AVX2-NEXT: vmovdqa %ymm0, (%eax)
; X86-AVX2-NEXT: popl %esi
+; X86-AVX2-NEXT: popl %edi
; X86-AVX2-NEXT: vzeroupper
; X86-AVX2-NEXT: retl
;
; X64-SSE-NEXT: divl %edi
; X64-SSE-NEXT: movd %edx, %xmm4
; X64-SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
+; X64-SSE-NEXT: movd %xmm1, %edi
; X64-SSE-NEXT: movl %r9d, %eax
; X64-SSE-NEXT: xorl %edx, %edx
-; X64-SSE-NEXT: divl 16(%rsi)
+; X64-SSE-NEXT: divl %edi
; X64-SSE-NEXT: movd %edx, %xmm3
; X64-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,1,1]
; X64-SSE-NEXT: movd %xmm2, %eax
; X64-SSE-NEXT: movd %edx, %xmm1
; X64-SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
; X64-SSE-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm4[0]
+; X64-SSE-NEXT: movd %xmm0, %edi
; X64-SSE-NEXT: movl %r10d, %eax
; X64-SSE-NEXT: xorl %edx, %edx
-; X64-SSE-NEXT: divl (%rsi)
+; X64-SSE-NEXT: divl %edi
; X64-SSE-NEXT: movd %edx, %xmm1
; X64-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,1,1]
; X64-SSE-NEXT: movd %xmm2, %edi
; X64-AVX1-NEXT: pushq %rbp
; X64-AVX1-NEXT: pushq %rbx
; X64-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
-; X64-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
; X64-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
-; X64-AVX1-NEXT: vmovd %xmm2, %eax
+; X64-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
+; X64-AVX1-NEXT: vmovd %xmm1, %eax
; X64-AVX1-NEXT: xorl %edx, %edx
; X64-AVX1-NEXT: divl 32(%rsi)
; X64-AVX1-NEXT: movl %edx, %r8d
-; X64-AVX1-NEXT: vpextrd $3, %xmm1, %eax
+; X64-AVX1-NEXT: vpextrd $3, %xmm2, %eax
+; X64-AVX1-NEXT: vmovdqa (%rsi), %xmm1
+; X64-AVX1-NEXT: vmovdqa 16(%rsi), %xmm3
+; X64-AVX1-NEXT: vpextrd $3, %xmm3, %ecx
; X64-AVX1-NEXT: xorl %edx, %edx
-; X64-AVX1-NEXT: divl 28(%rsi)
+; X64-AVX1-NEXT: divl %ecx
; X64-AVX1-NEXT: movl %edx, %r9d
-; X64-AVX1-NEXT: vpextrd $2, %xmm1, %eax
+; X64-AVX1-NEXT: vpextrd $2, %xmm2, %eax
+; X64-AVX1-NEXT: vpextrd $2, %xmm3, %ecx
; X64-AVX1-NEXT: xorl %edx, %edx
-; X64-AVX1-NEXT: divl 24(%rsi)
+; X64-AVX1-NEXT: divl %ecx
; X64-AVX1-NEXT: movl %edx, %r10d
-; X64-AVX1-NEXT: vpextrd $1, %xmm1, %eax
+; X64-AVX1-NEXT: vpextrd $1, %xmm2, %eax
+; X64-AVX1-NEXT: vpextrd $1, %xmm3, %ecx
; X64-AVX1-NEXT: xorl %edx, %edx
-; X64-AVX1-NEXT: divl 20(%rsi)
+; X64-AVX1-NEXT: divl %ecx
; X64-AVX1-NEXT: movl %edx, %r11d
-; X64-AVX1-NEXT: vmovd %xmm1, %eax
+; X64-AVX1-NEXT: vmovd %xmm2, %eax
+; X64-AVX1-NEXT: vmovd %xmm3, %ecx
; X64-AVX1-NEXT: xorl %edx, %edx
-; X64-AVX1-NEXT: divl 16(%rsi)
-; X64-AVX1-NEXT: movl %edx, %ecx
+; X64-AVX1-NEXT: divl %ecx
+; X64-AVX1-NEXT: movl %edx, %esi
; X64-AVX1-NEXT: vpextrd $3, %xmm0, %eax
+; X64-AVX1-NEXT: vpextrd $3, %xmm1, %ecx
; X64-AVX1-NEXT: xorl %edx, %edx
-; X64-AVX1-NEXT: divl 12(%rsi)
+; X64-AVX1-NEXT: divl %ecx
; X64-AVX1-NEXT: movl %edx, %edi
; X64-AVX1-NEXT: vpextrd $2, %xmm0, %eax
+; X64-AVX1-NEXT: vpextrd $2, %xmm1, %ecx
; X64-AVX1-NEXT: xorl %edx, %edx
-; X64-AVX1-NEXT: divl 8(%rsi)
-; X64-AVX1-NEXT: movl %edx, %ebx
+; X64-AVX1-NEXT: divl %ecx
+; X64-AVX1-NEXT: movl %edx, %ecx
; X64-AVX1-NEXT: vpextrd $1, %xmm0, %eax
+; X64-AVX1-NEXT: vpextrd $1, %xmm1, %ebx
; X64-AVX1-NEXT: xorl %edx, %edx
-; X64-AVX1-NEXT: divl 4(%rsi)
-; X64-AVX1-NEXT: movl %edx, %ebp
+; X64-AVX1-NEXT: divl %ebx
+; X64-AVX1-NEXT: movl %edx, %ebx
; X64-AVX1-NEXT: vmovd %xmm0, %eax
+; X64-AVX1-NEXT: vmovd %xmm1, %ebp
; X64-AVX1-NEXT: xorl %edx, %edx
-; X64-AVX1-NEXT: divl (%rsi)
+; X64-AVX1-NEXT: divl %ebp
; X64-AVX1-NEXT: vmovd %edx, %xmm0
-; X64-AVX1-NEXT: vpinsrd $1, %ebp, %xmm0, %xmm0
-; X64-AVX1-NEXT: vpinsrd $2, %ebx, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpinsrd $1, %ebx, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpinsrd $2, %ecx, %xmm0, %xmm0
; X64-AVX1-NEXT: vpinsrd $3, %edi, %xmm0, %xmm0
; X64-AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [8199,8199,8199,8199]
; X64-AVX1-NEXT: vpmulld %xmm1, %xmm0, %xmm0
-; X64-AVX1-NEXT: vmovd %ecx, %xmm2
+; X64-AVX1-NEXT: vmovd %esi, %xmm2
; X64-AVX1-NEXT: vpinsrd $1, %r11d, %xmm2, %xmm2
; X64-AVX1-NEXT: vpinsrd $2, %r10d, %xmm2, %xmm2
; X64-AVX1-NEXT: vpinsrd $3, %r9d, %xmm2, %xmm2
; X64-AVX2: # %bb.0:
; X64-AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
; X64-AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
-; X64-AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
-; X64-AVX2-NEXT: vpextrd $1, %xmm2, %eax
+; X64-AVX2-NEXT: vmovdqa (%rsi), %xmm2
+; X64-AVX2-NEXT: vmovdqa 16(%rsi), %xmm3
+; X64-AVX2-NEXT: vpextrd $1, %xmm3, %ecx
+; X64-AVX2-NEXT: vextracti128 $1, %ymm1, %xmm4
+; X64-AVX2-NEXT: vpextrd $1, %xmm4, %eax
; X64-AVX2-NEXT: xorl %edx, %edx
-; X64-AVX2-NEXT: divl 20(%rsi)
+; X64-AVX2-NEXT: divl %ecx
; X64-AVX2-NEXT: movl %edx, %ecx
-; X64-AVX2-NEXT: vmovd %xmm2, %eax
+; X64-AVX2-NEXT: vmovd %xmm3, %edi
+; X64-AVX2-NEXT: vmovd %xmm4, %eax
; X64-AVX2-NEXT: xorl %edx, %edx
-; X64-AVX2-NEXT: divl 16(%rsi)
-; X64-AVX2-NEXT: vmovd %edx, %xmm3
-; X64-AVX2-NEXT: vpinsrd $1, %ecx, %xmm3, %xmm3
-; X64-AVX2-NEXT: vpextrd $2, %xmm2, %eax
+; X64-AVX2-NEXT: divl %edi
+; X64-AVX2-NEXT: vmovd %edx, %xmm5
+; X64-AVX2-NEXT: vpinsrd $1, %ecx, %xmm5, %xmm5
+; X64-AVX2-NEXT: vpextrd $2, %xmm3, %ecx
+; X64-AVX2-NEXT: vpextrd $2, %xmm4, %eax
; X64-AVX2-NEXT: xorl %edx, %edx
-; X64-AVX2-NEXT: divl 24(%rsi)
-; X64-AVX2-NEXT: vpinsrd $2, %edx, %xmm3, %xmm3
-; X64-AVX2-NEXT: vpextrd $3, %xmm2, %eax
+; X64-AVX2-NEXT: divl %ecx
+; X64-AVX2-NEXT: vpinsrd $2, %edx, %xmm5, %xmm5
+; X64-AVX2-NEXT: vpextrd $3, %xmm3, %ecx
+; X64-AVX2-NEXT: vpextrd $3, %xmm4, %eax
; X64-AVX2-NEXT: xorl %edx, %edx
-; X64-AVX2-NEXT: divl 28(%rsi)
-; X64-AVX2-NEXT: vpinsrd $3, %edx, %xmm3, %xmm2
+; X64-AVX2-NEXT: divl %ecx
+; X64-AVX2-NEXT: vpinsrd $3, %edx, %xmm5, %xmm3
+; X64-AVX2-NEXT: vpextrd $1, %xmm2, %ecx
; X64-AVX2-NEXT: vpextrd $1, %xmm1, %eax
; X64-AVX2-NEXT: xorl %edx, %edx
-; X64-AVX2-NEXT: divl 4(%rsi)
+; X64-AVX2-NEXT: divl %ecx
; X64-AVX2-NEXT: movl %edx, %ecx
+; X64-AVX2-NEXT: vmovd %xmm2, %edi
; X64-AVX2-NEXT: vmovd %xmm1, %eax
; X64-AVX2-NEXT: xorl %edx, %edx
-; X64-AVX2-NEXT: divl (%rsi)
-; X64-AVX2-NEXT: vmovd %edx, %xmm3
-; X64-AVX2-NEXT: vpinsrd $1, %ecx, %xmm3, %xmm3
+; X64-AVX2-NEXT: divl %edi
+; X64-AVX2-NEXT: vmovd %edx, %xmm4
+; X64-AVX2-NEXT: vpinsrd $1, %ecx, %xmm4, %xmm4
+; X64-AVX2-NEXT: vpextrd $2, %xmm2, %ecx
; X64-AVX2-NEXT: vpextrd $2, %xmm1, %eax
; X64-AVX2-NEXT: xorl %edx, %edx
-; X64-AVX2-NEXT: divl 8(%rsi)
-; X64-AVX2-NEXT: vpinsrd $2, %edx, %xmm3, %xmm3
+; X64-AVX2-NEXT: divl %ecx
+; X64-AVX2-NEXT: vpinsrd $2, %edx, %xmm4, %xmm4
+; X64-AVX2-NEXT: vpextrd $3, %xmm2, %ecx
; X64-AVX2-NEXT: vpextrd $3, %xmm1, %eax
; X64-AVX2-NEXT: xorl %edx, %edx
-; X64-AVX2-NEXT: divl 12(%rsi)
-; X64-AVX2-NEXT: vpinsrd $3, %edx, %xmm3, %xmm1
-; X64-AVX2-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
+; X64-AVX2-NEXT: divl %ecx
+; X64-AVX2-NEXT: vpinsrd $3, %edx, %xmm4, %xmm1
+; X64-AVX2-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1
; X64-AVX2-NEXT: vmovd %xmm0, %eax
; X64-AVX2-NEXT: xorl %edx, %edx
; X64-AVX2-NEXT: divl 32(%rsi)
; SSE2-LABEL: sitofp_load_2i64_to_2f64:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa (%rdi), %xmm1
-; SSE2-NEXT: cvtsi2sdq (%rdi), %xmm0
+; SSE2-NEXT: movq %xmm1, %rax
+; SSE2-NEXT: cvtsi2sd %rax, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
; SSE2-NEXT: movq %xmm1, %rax
; SSE2-NEXT: xorps %xmm1, %xmm1
;
; SSE41-LABEL: sitofp_load_2i64_to_2f64:
; SSE41: # %bb.0:
-; SSE41-NEXT: cvtsi2sdq 8(%rdi), %xmm1
-; SSE41-NEXT: cvtsi2sdq (%rdi), %xmm0
+; SSE41-NEXT: movdqa (%rdi), %xmm0
+; SSE41-NEXT: pextrq $1, %xmm0, %rax
+; SSE41-NEXT: cvtsi2sd %rax, %xmm1
+; SSE41-NEXT: movq %xmm0, %rax
+; SSE41-NEXT: xorps %xmm0, %xmm0
+; SSE41-NEXT: cvtsi2sd %rax, %xmm0
; SSE41-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE41-NEXT: retq
;
; VEX-LABEL: sitofp_load_2i64_to_2f64:
; VEX: # %bb.0:
-; VEX-NEXT: vcvtsi2sdq 8(%rdi), %xmm0, %xmm0
-; VEX-NEXT: vcvtsi2sdq (%rdi), %xmm1, %xmm1
-; VEX-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; VEX-NEXT: vmovdqa (%rdi), %xmm0
+; VEX-NEXT: vpextrq $1, %xmm0, %rax
+; VEX-NEXT: vcvtsi2sd %rax, %xmm1, %xmm1
+; VEX-NEXT: vmovq %xmm0, %rax
+; VEX-NEXT: vcvtsi2sd %rax, %xmm2, %xmm0
+; VEX-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; VEX-NEXT: retq
;
; AVX512F-LABEL: sitofp_load_2i64_to_2f64:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vcvtsi2sdq 8(%rdi), %xmm0, %xmm0
-; AVX512F-NEXT: vcvtsi2sdq (%rdi), %xmm1, %xmm1
-; AVX512F-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX512F-NEXT: vmovdqa (%rdi), %xmm0
+; AVX512F-NEXT: vpextrq $1, %xmm0, %rax
+; AVX512F-NEXT: vcvtsi2sd %rax, %xmm1, %xmm1
+; AVX512F-NEXT: vmovq %xmm0, %rax
+; AVX512F-NEXT: vcvtsi2sd %rax, %xmm2, %xmm0
+; AVX512F-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: sitofp_load_2i64_to_2f64:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vcvtsi2sdq 8(%rdi), %xmm0, %xmm0
-; AVX512VL-NEXT: vcvtsi2sdq (%rdi), %xmm1, %xmm1
-; AVX512VL-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX512VL-NEXT: vmovdqa (%rdi), %xmm0
+; AVX512VL-NEXT: vpextrq $1, %xmm0, %rax
+; AVX512VL-NEXT: vcvtsi2sd %rax, %xmm1, %xmm1
+; AVX512VL-NEXT: vmovq %xmm0, %rax
+; AVX512VL-NEXT: vcvtsi2sd %rax, %xmm2, %xmm0
+; AVX512VL-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512VL-NEXT: retq
;
; AVX512DQ-LABEL: sitofp_load_2i64_to_2f64:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa (%rdi), %xmm1
; SSE2-NEXT: movdqa 16(%rdi), %xmm2
-; SSE2-NEXT: cvtsi2sdq (%rdi), %xmm0
+; SSE2-NEXT: movq %xmm1, %rax
+; SSE2-NEXT: cvtsi2sd %rax, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
; SSE2-NEXT: movq %xmm1, %rax
; SSE2-NEXT: xorps %xmm1, %xmm1
; SSE2-NEXT: cvtsi2sd %rax, %xmm1
; SSE2-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE2-NEXT: movq %xmm2, %rax
; SSE2-NEXT: xorps %xmm1, %xmm1
-; SSE2-NEXT: cvtsi2sdq 16(%rdi), %xmm1
+; SSE2-NEXT: cvtsi2sd %rax, %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,2,3]
; SSE2-NEXT: movq %xmm2, %rax
; SSE2-NEXT: xorps %xmm2, %xmm2
;
; SSE41-LABEL: sitofp_load_4i64_to_4f64:
; SSE41: # %bb.0:
-; SSE41-NEXT: cvtsi2sdq 8(%rdi), %xmm1
-; SSE41-NEXT: cvtsi2sdq (%rdi), %xmm0
-; SSE41-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; SSE41-NEXT: cvtsi2sdq 24(%rdi), %xmm2
+; SSE41-NEXT: movdqa (%rdi), %xmm0
+; SSE41-NEXT: movdqa 16(%rdi), %xmm1
+; SSE41-NEXT: pextrq $1, %xmm0, %rax
+; SSE41-NEXT: cvtsi2sd %rax, %xmm2
+; SSE41-NEXT: movq %xmm0, %rax
+; SSE41-NEXT: xorps %xmm0, %xmm0
+; SSE41-NEXT: cvtsi2sd %rax, %xmm0
+; SSE41-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm2[0]
+; SSE41-NEXT: pextrq $1, %xmm1, %rax
+; SSE41-NEXT: xorps %xmm2, %xmm2
+; SSE41-NEXT: cvtsi2sd %rax, %xmm2
+; SSE41-NEXT: movq %xmm1, %rax
; SSE41-NEXT: xorps %xmm1, %xmm1
-; SSE41-NEXT: cvtsi2sdq 16(%rdi), %xmm1
+; SSE41-NEXT: cvtsi2sd %rax, %xmm1
; SSE41-NEXT: unpcklpd {{.*#+}} xmm1 = xmm1[0],xmm2[0]
; SSE41-NEXT: retq
;
; VEX-LABEL: sitofp_load_4i64_to_4f64:
; VEX: # %bb.0:
-; VEX-NEXT: vcvtsi2sdq 24(%rdi), %xmm0, %xmm0
-; VEX-NEXT: vcvtsi2sdq 16(%rdi), %xmm1, %xmm1
-; VEX-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm1[0],xmm0[0]
-; VEX-NEXT: vcvtsi2sdq 8(%rdi), %xmm2, %xmm1
-; VEX-NEXT: vcvtsi2sdq (%rdi), %xmm2, %xmm2
-; VEX-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm2[0],xmm1[0]
-; VEX-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; VEX-NEXT: vmovapd (%rdi), %xmm0
+; VEX-NEXT: vmovdqa 16(%rdi), %xmm1
+; VEX-NEXT: vpextrq $1, %xmm1, %rax
+; VEX-NEXT: vcvtsi2sd %rax, %xmm2, %xmm2
+; VEX-NEXT: vmovq %xmm1, %rax
+; VEX-NEXT: vcvtsi2sd %rax, %xmm3, %xmm1
+; VEX-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; VEX-NEXT: vpextrq $1, %xmm0, %rax
+; VEX-NEXT: vcvtsi2sd %rax, %xmm3, %xmm2
+; VEX-NEXT: vmovq %xmm0, %rax
+; VEX-NEXT: vcvtsi2sd %rax, %xmm3, %xmm0
+; VEX-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm2[0]
+; VEX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; VEX-NEXT: retq
;
; AVX512F-LABEL: sitofp_load_4i64_to_4f64:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vcvtsi2sdq 24(%rdi), %xmm0, %xmm0
-; AVX512F-NEXT: vcvtsi2sdq 16(%rdi), %xmm1, %xmm1
-; AVX512F-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm1[0],xmm0[0]
-; AVX512F-NEXT: vcvtsi2sdq 8(%rdi), %xmm2, %xmm1
-; AVX512F-NEXT: vcvtsi2sdq (%rdi), %xmm2, %xmm2
-; AVX512F-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm2[0],xmm1[0]
-; AVX512F-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX512F-NEXT: vmovapd (%rdi), %xmm0
+; AVX512F-NEXT: vmovdqa 16(%rdi), %xmm1
+; AVX512F-NEXT: vpextrq $1, %xmm1, %rax
+; AVX512F-NEXT: vcvtsi2sd %rax, %xmm2, %xmm2
+; AVX512F-NEXT: vmovq %xmm1, %rax
+; AVX512F-NEXT: vcvtsi2sd %rax, %xmm3, %xmm1
+; AVX512F-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; AVX512F-NEXT: vpextrq $1, %xmm0, %rax
+; AVX512F-NEXT: vcvtsi2sd %rax, %xmm3, %xmm2
+; AVX512F-NEXT: vmovq %xmm0, %rax
+; AVX512F-NEXT: vcvtsi2sd %rax, %xmm3, %xmm0
+; AVX512F-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm2[0]
+; AVX512F-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: sitofp_load_4i64_to_4f64:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vcvtsi2sdq 24(%rdi), %xmm0, %xmm0
-; AVX512VL-NEXT: vcvtsi2sdq 16(%rdi), %xmm1, %xmm1
-; AVX512VL-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm1[0],xmm0[0]
-; AVX512VL-NEXT: vcvtsi2sdq 8(%rdi), %xmm2, %xmm1
-; AVX512VL-NEXT: vcvtsi2sdq (%rdi), %xmm2, %xmm2
-; AVX512VL-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm2[0],xmm1[0]
-; AVX512VL-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX512VL-NEXT: vmovapd (%rdi), %xmm0
+; AVX512VL-NEXT: vmovdqa 16(%rdi), %xmm1
+; AVX512VL-NEXT: vpextrq $1, %xmm1, %rax
+; AVX512VL-NEXT: vcvtsi2sd %rax, %xmm2, %xmm2
+; AVX512VL-NEXT: vmovq %xmm1, %rax
+; AVX512VL-NEXT: vcvtsi2sd %rax, %xmm3, %xmm1
+; AVX512VL-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; AVX512VL-NEXT: vpextrq $1, %xmm0, %rax
+; AVX512VL-NEXT: vcvtsi2sd %rax, %xmm3, %xmm2
+; AVX512VL-NEXT: vmovq %xmm0, %rax
+; AVX512VL-NEXT: vcvtsi2sd %rax, %xmm3, %xmm0
+; AVX512VL-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm2[0]
+; AVX512VL-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX512VL-NEXT: retq
;
; AVX512DQ-LABEL: sitofp_load_4i64_to_4f64:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa (%rdi), %xmm1
; SSE2-NEXT: movdqa 16(%rdi), %xmm0
-; SSE2-NEXT: cvtsi2ssq 16(%rdi), %xmm2
+; SSE2-NEXT: movq %xmm0, %rax
+; SSE2-NEXT: cvtsi2ss %rax, %xmm2
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; SSE2-NEXT: movq %xmm0, %rax
; SSE2-NEXT: xorps %xmm0, %xmm0
; SSE2-NEXT: cvtsi2ss %rax, %xmm0
; SSE2-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
+; SSE2-NEXT: movq %xmm1, %rax
; SSE2-NEXT: xorps %xmm0, %xmm0
-; SSE2-NEXT: cvtsi2ssq (%rdi), %xmm0
+; SSE2-NEXT: cvtsi2ss %rax, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
; SSE2-NEXT: movq %xmm1, %rax
; SSE2-NEXT: xorps %xmm1, %xmm1
;
; SSE41-LABEL: sitofp_load_4i64_to_4f32:
; SSE41: # %bb.0:
-; SSE41-NEXT: cvtsi2ssq 8(%rdi), %xmm1
-; SSE41-NEXT: cvtsi2ssq (%rdi), %xmm0
-; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3]
-; SSE41-NEXT: xorps %xmm1, %xmm1
-; SSE41-NEXT: cvtsi2ssq 16(%rdi), %xmm1
-; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3]
+; SSE41-NEXT: movdqa (%rdi), %xmm0
+; SSE41-NEXT: movdqa 16(%rdi), %xmm1
+; SSE41-NEXT: pextrq $1, %xmm0, %rax
+; SSE41-NEXT: cvtsi2ss %rax, %xmm2
+; SSE41-NEXT: movq %xmm0, %rax
+; SSE41-NEXT: xorps %xmm0, %xmm0
+; SSE41-NEXT: cvtsi2ss %rax, %xmm0
+; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[2,3]
+; SSE41-NEXT: movq %xmm1, %rax
+; SSE41-NEXT: xorps %xmm2, %xmm2
+; SSE41-NEXT: cvtsi2ss %rax, %xmm2
+; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1],xmm2[0],xmm0[3]
+; SSE41-NEXT: pextrq $1, %xmm1, %rax
; SSE41-NEXT: xorps %xmm1, %xmm1
-; SSE41-NEXT: cvtsi2ssq 24(%rdi), %xmm1
+; SSE41-NEXT: cvtsi2ss %rax, %xmm1
; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
; SSE41-NEXT: retq
;
; VEX-LABEL: sitofp_load_4i64_to_4f32:
; VEX: # %bb.0:
-; VEX-NEXT: vcvtsi2ssq 8(%rdi), %xmm0, %xmm0
-; VEX-NEXT: vcvtsi2ssq (%rdi), %xmm1, %xmm1
-; VEX-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3]
-; VEX-NEXT: vcvtsi2ssq 16(%rdi), %xmm2, %xmm1
-; VEX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3]
-; VEX-NEXT: vcvtsi2ssq 24(%rdi), %xmm2, %xmm1
+; VEX-NEXT: vmovdqa (%rdi), %xmm0
+; VEX-NEXT: vmovdqa 16(%rdi), %xmm1
+; VEX-NEXT: vpextrq $1, %xmm0, %rax
+; VEX-NEXT: vcvtsi2ss %rax, %xmm2, %xmm2
+; VEX-NEXT: vmovq %xmm0, %rax
+; VEX-NEXT: vcvtsi2ss %rax, %xmm3, %xmm0
+; VEX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[2,3]
+; VEX-NEXT: vmovq %xmm1, %rax
+; VEX-NEXT: vcvtsi2ss %rax, %xmm3, %xmm2
+; VEX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm2[0],xmm0[3]
+; VEX-NEXT: vpextrq $1, %xmm1, %rax
+; VEX-NEXT: vcvtsi2ss %rax, %xmm3, %xmm1
; VEX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
; VEX-NEXT: retq
;
; AVX512F-LABEL: sitofp_load_4i64_to_4f32:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vcvtsi2ssq 8(%rdi), %xmm0, %xmm0
-; AVX512F-NEXT: vcvtsi2ssq (%rdi), %xmm1, %xmm1
-; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3]
-; AVX512F-NEXT: vcvtsi2ssq 16(%rdi), %xmm2, %xmm1
-; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3]
-; AVX512F-NEXT: vcvtsi2ssq 24(%rdi), %xmm2, %xmm1
+; AVX512F-NEXT: vmovdqa (%rdi), %xmm0
+; AVX512F-NEXT: vmovdqa 16(%rdi), %xmm1
+; AVX512F-NEXT: vpextrq $1, %xmm0, %rax
+; AVX512F-NEXT: vcvtsi2ss %rax, %xmm2, %xmm2
+; AVX512F-NEXT: vmovq %xmm0, %rax
+; AVX512F-NEXT: vcvtsi2ss %rax, %xmm3, %xmm0
+; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[2,3]
+; AVX512F-NEXT: vmovq %xmm1, %rax
+; AVX512F-NEXT: vcvtsi2ss %rax, %xmm3, %xmm2
+; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm2[0],xmm0[3]
+; AVX512F-NEXT: vpextrq $1, %xmm1, %rax
+; AVX512F-NEXT: vcvtsi2ss %rax, %xmm3, %xmm1
; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: sitofp_load_4i64_to_4f32:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vcvtsi2ssq 8(%rdi), %xmm0, %xmm0
-; AVX512VL-NEXT: vcvtsi2ssq (%rdi), %xmm1, %xmm1
-; AVX512VL-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3]
-; AVX512VL-NEXT: vcvtsi2ssq 16(%rdi), %xmm2, %xmm1
-; AVX512VL-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3]
-; AVX512VL-NEXT: vcvtsi2ssq 24(%rdi), %xmm2, %xmm1
+; AVX512VL-NEXT: vmovdqa (%rdi), %xmm0
+; AVX512VL-NEXT: vmovdqa 16(%rdi), %xmm1
+; AVX512VL-NEXT: vpextrq $1, %xmm0, %rax
+; AVX512VL-NEXT: vcvtsi2ss %rax, %xmm2, %xmm2
+; AVX512VL-NEXT: vmovq %xmm0, %rax
+; AVX512VL-NEXT: vcvtsi2ss %rax, %xmm3, %xmm0
+; AVX512VL-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[2,3]
+; AVX512VL-NEXT: vmovq %xmm1, %rax
+; AVX512VL-NEXT: vcvtsi2ss %rax, %xmm3, %xmm2
+; AVX512VL-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm2[0],xmm0[3]
+; AVX512VL-NEXT: vpextrq $1, %xmm1, %rax
+; AVX512VL-NEXT: vcvtsi2ss %rax, %xmm3, %xmm1
; AVX512VL-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
; AVX512VL-NEXT: retq
;
; SSE2-NEXT: movdqa 16(%rdi), %xmm0
; SSE2-NEXT: movdqa 32(%rdi), %xmm2
; SSE2-NEXT: movdqa 48(%rdi), %xmm3
-; SSE2-NEXT: cvtsi2ssq 16(%rdi), %xmm4
+; SSE2-NEXT: movq %xmm0, %rax
+; SSE2-NEXT: cvtsi2ss %rax, %xmm4
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; SSE2-NEXT: movq %xmm0, %rax
; SSE2-NEXT: xorps %xmm0, %xmm0
; SSE2-NEXT: cvtsi2ss %rax, %xmm0
; SSE2-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1]
+; SSE2-NEXT: movq %xmm1, %rax
; SSE2-NEXT: xorps %xmm0, %xmm0
-; SSE2-NEXT: cvtsi2ssq (%rdi), %xmm0
+; SSE2-NEXT: cvtsi2ss %rax, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
; SSE2-NEXT: movq %xmm1, %rax
; SSE2-NEXT: xorps %xmm1, %xmm1
; SSE2-NEXT: cvtsi2ss %rax, %xmm1
; SSE2-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE2-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm4[0]
+; SSE2-NEXT: movq %xmm3, %rax
; SSE2-NEXT: xorps %xmm4, %xmm4
-; SSE2-NEXT: cvtsi2ssq 48(%rdi), %xmm4
+; SSE2-NEXT: cvtsi2ss %rax, %xmm4
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[2,3,2,3]
; SSE2-NEXT: movq %xmm1, %rax
; SSE2-NEXT: xorps %xmm1, %xmm1
; SSE2-NEXT: cvtsi2ss %rax, %xmm1
; SSE2-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
+; SSE2-NEXT: movq %xmm2, %rax
; SSE2-NEXT: xorps %xmm1, %xmm1
-; SSE2-NEXT: cvtsi2ssq 32(%rdi), %xmm1
+; SSE2-NEXT: cvtsi2ss %rax, %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,2,3]
; SSE2-NEXT: movq %xmm2, %rax
; SSE2-NEXT: xorps %xmm2, %xmm2
;
; SSE41-LABEL: sitofp_load_8i64_to_8f32:
; SSE41: # %bb.0:
-; SSE41-NEXT: cvtsi2ssq 8(%rdi), %xmm1
-; SSE41-NEXT: cvtsi2ssq (%rdi), %xmm0
-; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3]
-; SSE41-NEXT: xorps %xmm1, %xmm1
-; SSE41-NEXT: cvtsi2ssq 16(%rdi), %xmm1
-; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3]
+; SSE41-NEXT: movdqa (%rdi), %xmm0
+; SSE41-NEXT: movdqa 16(%rdi), %xmm1
+; SSE41-NEXT: movdqa 32(%rdi), %xmm2
+; SSE41-NEXT: movdqa 48(%rdi), %xmm3
+; SSE41-NEXT: pextrq $1, %xmm0, %rax
+; SSE41-NEXT: cvtsi2ss %rax, %xmm4
+; SSE41-NEXT: movq %xmm0, %rax
+; SSE41-NEXT: xorps %xmm0, %xmm0
+; SSE41-NEXT: cvtsi2ss %rax, %xmm0
+; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[2,3]
+; SSE41-NEXT: movq %xmm1, %rax
+; SSE41-NEXT: xorps %xmm4, %xmm4
+; SSE41-NEXT: cvtsi2ss %rax, %xmm4
+; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1],xmm4[0],xmm0[3]
+; SSE41-NEXT: pextrq $1, %xmm1, %rax
; SSE41-NEXT: xorps %xmm1, %xmm1
-; SSE41-NEXT: cvtsi2ssq 24(%rdi), %xmm1
+; SSE41-NEXT: cvtsi2ss %rax, %xmm1
; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
-; SSE41-NEXT: cvtsi2ssq 40(%rdi), %xmm2
+; SSE41-NEXT: pextrq $1, %xmm2, %rax
+; SSE41-NEXT: xorps %xmm4, %xmm4
+; SSE41-NEXT: cvtsi2ss %rax, %xmm4
+; SSE41-NEXT: movq %xmm2, %rax
; SSE41-NEXT: xorps %xmm1, %xmm1
-; SSE41-NEXT: cvtsi2ssq 32(%rdi), %xmm1
-; SSE41-NEXT: insertps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[2,3]
+; SSE41-NEXT: cvtsi2ss %rax, %xmm1
+; SSE41-NEXT: insertps {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[2,3]
+; SSE41-NEXT: movq %xmm3, %rax
; SSE41-NEXT: xorps %xmm2, %xmm2
-; SSE41-NEXT: cvtsi2ssq 48(%rdi), %xmm2
+; SSE41-NEXT: cvtsi2ss %rax, %xmm2
; SSE41-NEXT: insertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3]
+; SSE41-NEXT: pextrq $1, %xmm3, %rax
; SSE41-NEXT: xorps %xmm2, %xmm2
-; SSE41-NEXT: cvtsi2ssq 56(%rdi), %xmm2
+; SSE41-NEXT: cvtsi2ss %rax, %xmm2
; SSE41-NEXT: insertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[0]
; SSE41-NEXT: retq
;
; VEX-LABEL: sitofp_load_8i64_to_8f32:
; VEX: # %bb.0:
-; VEX-NEXT: vcvtsi2ssq 40(%rdi), %xmm0, %xmm0
-; VEX-NEXT: vcvtsi2ssq 32(%rdi), %xmm1, %xmm1
-; VEX-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3]
-; VEX-NEXT: vcvtsi2ssq 48(%rdi), %xmm2, %xmm1
-; VEX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3]
-; VEX-NEXT: vcvtsi2ssq 56(%rdi), %xmm2, %xmm1
+; VEX-NEXT: vmovaps (%rdi), %xmm0
+; VEX-NEXT: vmovdqa 16(%rdi), %xmm1
+; VEX-NEXT: vmovdqa 32(%rdi), %xmm2
+; VEX-NEXT: vmovdqa 48(%rdi), %xmm3
+; VEX-NEXT: vpextrq $1, %xmm2, %rax
+; VEX-NEXT: vcvtsi2ss %rax, %xmm4, %xmm4
+; VEX-NEXT: vmovq %xmm2, %rax
+; VEX-NEXT: vcvtsi2ss %rax, %xmm5, %xmm2
+; VEX-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[2,3]
+; VEX-NEXT: vmovq %xmm3, %rax
+; VEX-NEXT: vcvtsi2ss %rax, %xmm5, %xmm4
+; VEX-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm4[0],xmm2[3]
+; VEX-NEXT: vpextrq $1, %xmm3, %rax
+; VEX-NEXT: vcvtsi2ss %rax, %xmm5, %xmm3
+; VEX-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[0]
+; VEX-NEXT: vpextrq $1, %xmm0, %rax
+; VEX-NEXT: vcvtsi2ss %rax, %xmm5, %xmm3
+; VEX-NEXT: vmovq %xmm0, %rax
+; VEX-NEXT: vcvtsi2ss %rax, %xmm5, %xmm0
+; VEX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[2,3]
+; VEX-NEXT: vmovq %xmm1, %rax
+; VEX-NEXT: vcvtsi2ss %rax, %xmm5, %xmm3
+; VEX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm3[0],xmm0[3]
+; VEX-NEXT: vpextrq $1, %xmm1, %rax
+; VEX-NEXT: vcvtsi2ss %rax, %xmm5, %xmm1
; VEX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
-; VEX-NEXT: vcvtsi2ssq 8(%rdi), %xmm2, %xmm1
-; VEX-NEXT: vcvtsi2ssq (%rdi), %xmm2, %xmm2
-; VEX-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3]
-; VEX-NEXT: vcvtsi2ssq 16(%rdi), %xmm3, %xmm2
-; VEX-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3]
-; VEX-NEXT: vcvtsi2ssq 24(%rdi), %xmm3, %xmm2
-; VEX-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[0]
-; VEX-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; VEX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; VEX-NEXT: retq
;
; AVX512F-LABEL: sitofp_load_8i64_to_8f32:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vcvtsi2ssq 40(%rdi), %xmm0, %xmm0
-; AVX512F-NEXT: vcvtsi2ssq 32(%rdi), %xmm1, %xmm1
-; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3]
-; AVX512F-NEXT: vcvtsi2ssq 48(%rdi), %xmm2, %xmm1
-; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3]
-; AVX512F-NEXT: vcvtsi2ssq 56(%rdi), %xmm2, %xmm1
+; AVX512F-NEXT: vmovaps (%rdi), %xmm0
+; AVX512F-NEXT: vmovdqa 16(%rdi), %xmm1
+; AVX512F-NEXT: vmovdqa 32(%rdi), %xmm2
+; AVX512F-NEXT: vmovdqa 48(%rdi), %xmm3
+; AVX512F-NEXT: vpextrq $1, %xmm2, %rax
+; AVX512F-NEXT: vcvtsi2ss %rax, %xmm4, %xmm4
+; AVX512F-NEXT: vmovq %xmm2, %rax
+; AVX512F-NEXT: vcvtsi2ss %rax, %xmm5, %xmm2
+; AVX512F-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[2,3]
+; AVX512F-NEXT: vmovq %xmm3, %rax
+; AVX512F-NEXT: vcvtsi2ss %rax, %xmm5, %xmm4
+; AVX512F-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm4[0],xmm2[3]
+; AVX512F-NEXT: vpextrq $1, %xmm3, %rax
+; AVX512F-NEXT: vcvtsi2ss %rax, %xmm5, %xmm3
+; AVX512F-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[0]
+; AVX512F-NEXT: vpextrq $1, %xmm0, %rax
+; AVX512F-NEXT: vcvtsi2ss %rax, %xmm5, %xmm3
+; AVX512F-NEXT: vmovq %xmm0, %rax
+; AVX512F-NEXT: vcvtsi2ss %rax, %xmm5, %xmm0
+; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[2,3]
+; AVX512F-NEXT: vmovq %xmm1, %rax
+; AVX512F-NEXT: vcvtsi2ss %rax, %xmm5, %xmm3
+; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm3[0],xmm0[3]
+; AVX512F-NEXT: vpextrq $1, %xmm1, %rax
+; AVX512F-NEXT: vcvtsi2ss %rax, %xmm5, %xmm1
; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
-; AVX512F-NEXT: vcvtsi2ssq 8(%rdi), %xmm2, %xmm1
-; AVX512F-NEXT: vcvtsi2ssq (%rdi), %xmm2, %xmm2
-; AVX512F-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3]
-; AVX512F-NEXT: vcvtsi2ssq 16(%rdi), %xmm3, %xmm2
-; AVX512F-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3]
-; AVX512F-NEXT: vcvtsi2ssq 24(%rdi), %xmm3, %xmm2
-; AVX512F-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[0]
-; AVX512F-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX512F-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: sitofp_load_8i64_to_8f32:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vcvtsi2ssq 40(%rdi), %xmm0, %xmm0
-; AVX512VL-NEXT: vcvtsi2ssq 32(%rdi), %xmm1, %xmm1
-; AVX512VL-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3]
-; AVX512VL-NEXT: vcvtsi2ssq 48(%rdi), %xmm2, %xmm1
-; AVX512VL-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3]
-; AVX512VL-NEXT: vcvtsi2ssq 56(%rdi), %xmm2, %xmm1
+; AVX512VL-NEXT: vmovaps (%rdi), %xmm0
+; AVX512VL-NEXT: vmovdqa 16(%rdi), %xmm1
+; AVX512VL-NEXT: vmovdqa 32(%rdi), %xmm2
+; AVX512VL-NEXT: vmovdqa 48(%rdi), %xmm3
+; AVX512VL-NEXT: vpextrq $1, %xmm2, %rax
+; AVX512VL-NEXT: vcvtsi2ss %rax, %xmm4, %xmm4
+; AVX512VL-NEXT: vmovq %xmm2, %rax
+; AVX512VL-NEXT: vcvtsi2ss %rax, %xmm5, %xmm2
+; AVX512VL-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[2,3]
+; AVX512VL-NEXT: vmovq %xmm3, %rax
+; AVX512VL-NEXT: vcvtsi2ss %rax, %xmm5, %xmm4
+; AVX512VL-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm4[0],xmm2[3]
+; AVX512VL-NEXT: vpextrq $1, %xmm3, %rax
+; AVX512VL-NEXT: vcvtsi2ss %rax, %xmm5, %xmm3
+; AVX512VL-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[0]
+; AVX512VL-NEXT: vpextrq $1, %xmm0, %rax
+; AVX512VL-NEXT: vcvtsi2ss %rax, %xmm5, %xmm3
+; AVX512VL-NEXT: vmovq %xmm0, %rax
+; AVX512VL-NEXT: vcvtsi2ss %rax, %xmm5, %xmm0
+; AVX512VL-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[2,3]
+; AVX512VL-NEXT: vmovq %xmm1, %rax
+; AVX512VL-NEXT: vcvtsi2ss %rax, %xmm5, %xmm3
+; AVX512VL-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm3[0],xmm0[3]
+; AVX512VL-NEXT: vpextrq $1, %xmm1, %rax
+; AVX512VL-NEXT: vcvtsi2ss %rax, %xmm5, %xmm1
; AVX512VL-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
-; AVX512VL-NEXT: vcvtsi2ssq 8(%rdi), %xmm2, %xmm1
-; AVX512VL-NEXT: vcvtsi2ssq (%rdi), %xmm2, %xmm2
-; AVX512VL-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3]
-; AVX512VL-NEXT: vcvtsi2ssq 16(%rdi), %xmm3, %xmm2
-; AVX512VL-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3]
-; AVX512VL-NEXT: vcvtsi2ssq 24(%rdi), %xmm3, %xmm2
-; AVX512VL-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[0]
-; AVX512VL-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX512VL-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX512VL-NEXT: retq
;
; AVX512DQ-LABEL: sitofp_load_8i64_to_8f32:
; SSE2-LABEL: uitofp_load_4i64_to_4f32:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa 16(%rdi), %xmm0
-; SSE2-NEXT: movq 16(%rdi), %rax
+; SSE2-NEXT: movq %xmm0, %rax
; SSE2-NEXT: testq %rax, %rax
; SSE2-NEXT: js .LBB83_1
; SSE2-NEXT: # %bb.2:
; SSE2-NEXT: cvtsi2ss %rax, %xmm1
; SSE2-NEXT: addss %xmm1, %xmm1
; SSE2-NEXT: .LBB83_3:
-; SSE2-NEXT: movq (%rdi), %rax
+; SSE2-NEXT: movdqa (%rdi), %xmm2
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
-; SSE2-NEXT: movq %xmm0, %rcx
-; SSE2-NEXT: testq %rcx, %rcx
+; SSE2-NEXT: movq %xmm0, %rax
+; SSE2-NEXT: testq %rax, %rax
; SSE2-NEXT: js .LBB83_4
; SSE2-NEXT: # %bb.5:
-; SSE2-NEXT: cvtsi2ss %rcx, %xmm2
+; SSE2-NEXT: cvtsi2ss %rax, %xmm3
; SSE2-NEXT: jmp .LBB83_6
; SSE2-NEXT: .LBB83_4:
-; SSE2-NEXT: movq %rcx, %rdx
-; SSE2-NEXT: shrq %rdx
-; SSE2-NEXT: andl $1, %ecx
-; SSE2-NEXT: orq %rdx, %rcx
-; SSE2-NEXT: cvtsi2ss %rcx, %xmm2
-; SSE2-NEXT: addss %xmm2, %xmm2
+; SSE2-NEXT: movq %rax, %rcx
+; SSE2-NEXT: shrq %rcx
+; SSE2-NEXT: andl $1, %eax
+; SSE2-NEXT: orq %rcx, %rax
+; SSE2-NEXT: cvtsi2ss %rax, %xmm3
+; SSE2-NEXT: addss %xmm3, %xmm3
; SSE2-NEXT: .LBB83_6:
-; SSE2-NEXT: movdqa (%rdi), %xmm3
+; SSE2-NEXT: movq %xmm2, %rax
; SSE2-NEXT: testq %rax, %rax
; SSE2-NEXT: js .LBB83_7
; SSE2-NEXT: # %bb.8:
; SSE2-NEXT: cvtsi2ss %rax, %xmm0
; SSE2-NEXT: addss %xmm0, %xmm0
; SSE2-NEXT: .LBB83_9:
-; SSE2-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[2,3,2,3]
+; SSE2-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,2,3]
; SSE2-NEXT: movq %xmm2, %rax
; SSE2-NEXT: testq %rax, %rax
; SSE2-NEXT: js .LBB83_10
;
; AVX512F-LABEL: uitofp_load_4i64_to_4f32:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vcvtusi2ssq 8(%rdi), %xmm0, %xmm0
-; AVX512F-NEXT: vcvtusi2ssq (%rdi), %xmm1, %xmm1
-; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3]
-; AVX512F-NEXT: vcvtusi2ssq 16(%rdi), %xmm2, %xmm1
-; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3]
-; AVX512F-NEXT: vcvtusi2ssq 24(%rdi), %xmm2, %xmm1
+; AVX512F-NEXT: vmovdqa (%rdi), %xmm0
+; AVX512F-NEXT: vmovdqa 16(%rdi), %xmm1
+; AVX512F-NEXT: vpextrq $1, %xmm0, %rax
+; AVX512F-NEXT: vcvtusi2ss %rax, %xmm2, %xmm2
+; AVX512F-NEXT: vmovq %xmm0, %rax
+; AVX512F-NEXT: vcvtusi2ss %rax, %xmm3, %xmm0
+; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[2,3]
+; AVX512F-NEXT: vmovq %xmm1, %rax
+; AVX512F-NEXT: vcvtusi2ss %rax, %xmm3, %xmm2
+; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm2[0],xmm0[3]
+; AVX512F-NEXT: vpextrq $1, %xmm1, %rax
+; AVX512F-NEXT: vcvtusi2ss %rax, %xmm3, %xmm1
; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: uitofp_load_4i64_to_4f32:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vcvtusi2ssq 8(%rdi), %xmm0, %xmm0
-; AVX512VL-NEXT: vcvtusi2ssq (%rdi), %xmm1, %xmm1
-; AVX512VL-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3]
-; AVX512VL-NEXT: vcvtusi2ssq 16(%rdi), %xmm2, %xmm1
-; AVX512VL-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3]
-; AVX512VL-NEXT: vcvtusi2ssq 24(%rdi), %xmm2, %xmm1
+; AVX512VL-NEXT: vmovdqa (%rdi), %xmm0
+; AVX512VL-NEXT: vmovdqa 16(%rdi), %xmm1
+; AVX512VL-NEXT: vpextrq $1, %xmm0, %rax
+; AVX512VL-NEXT: vcvtusi2ss %rax, %xmm2, %xmm2
+; AVX512VL-NEXT: vmovq %xmm0, %rax
+; AVX512VL-NEXT: vcvtusi2ss %rax, %xmm3, %xmm0
+; AVX512VL-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[2,3]
+; AVX512VL-NEXT: vmovq %xmm1, %rax
+; AVX512VL-NEXT: vcvtusi2ss %rax, %xmm3, %xmm2
+; AVX512VL-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm2[0],xmm0[3]
+; AVX512VL-NEXT: vpextrq $1, %xmm1, %rax
+; AVX512VL-NEXT: vcvtusi2ss %rax, %xmm3, %xmm1
; AVX512VL-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
; AVX512VL-NEXT: retq
;
; SSE2-LABEL: uitofp_load_8i64_to_8f32:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa 16(%rdi), %xmm0
-; SSE2-NEXT: movq 16(%rdi), %rax
+; SSE2-NEXT: movq %xmm0, %rax
; SSE2-NEXT: testq %rax, %rax
; SSE2-NEXT: js .LBB87_1
; SSE2-NEXT: # %bb.2:
; SSE2-NEXT: cvtsi2ss %rax, %xmm2
; SSE2-NEXT: addss %xmm2, %xmm2
; SSE2-NEXT: .LBB87_3:
-; SSE2-NEXT: movq (%rdi), %rax
+; SSE2-NEXT: movdqa (%rdi), %xmm3
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
-; SSE2-NEXT: movq %xmm0, %rcx
-; SSE2-NEXT: testq %rcx, %rcx
+; SSE2-NEXT: movq %xmm0, %rax
+; SSE2-NEXT: testq %rax, %rax
; SSE2-NEXT: js .LBB87_4
; SSE2-NEXT: # %bb.5:
-; SSE2-NEXT: cvtsi2ss %rcx, %xmm1
+; SSE2-NEXT: cvtsi2ss %rax, %xmm1
; SSE2-NEXT: jmp .LBB87_6
; SSE2-NEXT: .LBB87_4:
-; SSE2-NEXT: movq %rcx, %rdx
-; SSE2-NEXT: shrq %rdx
-; SSE2-NEXT: andl $1, %ecx
-; SSE2-NEXT: orq %rdx, %rcx
-; SSE2-NEXT: cvtsi2ss %rcx, %xmm1
+; SSE2-NEXT: movq %rax, %rcx
+; SSE2-NEXT: shrq %rcx
+; SSE2-NEXT: andl $1, %eax
+; SSE2-NEXT: orq %rcx, %rax
+; SSE2-NEXT: cvtsi2ss %rax, %xmm1
; SSE2-NEXT: addss %xmm1, %xmm1
; SSE2-NEXT: .LBB87_6:
-; SSE2-NEXT: movdqa (%rdi), %xmm3
+; SSE2-NEXT: movq %xmm3, %rax
; SSE2-NEXT: testq %rax, %rax
; SSE2-NEXT: js .LBB87_7
; SSE2-NEXT: # %bb.8:
; SSE2-NEXT: cvtsi2ss %rax, %xmm0
; SSE2-NEXT: addss %xmm0, %xmm0
; SSE2-NEXT: .LBB87_9:
-; SSE2-NEXT: movq 48(%rdi), %rax
+; SSE2-NEXT: movdqa 48(%rdi), %xmm6
; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,3,2,3]
-; SSE2-NEXT: movq %xmm3, %rcx
-; SSE2-NEXT: testq %rcx, %rcx
+; SSE2-NEXT: movq %xmm3, %rax
+; SSE2-NEXT: testq %rax, %rax
; SSE2-NEXT: js .LBB87_10
; SSE2-NEXT: # %bb.11:
-; SSE2-NEXT: cvtsi2ss %rcx, %xmm4
+; SSE2-NEXT: cvtsi2ss %rax, %xmm4
; SSE2-NEXT: jmp .LBB87_12
; SSE2-NEXT: .LBB87_10:
-; SSE2-NEXT: movq %rcx, %rdx
-; SSE2-NEXT: shrq %rdx
-; SSE2-NEXT: andl $1, %ecx
-; SSE2-NEXT: orq %rdx, %rcx
-; SSE2-NEXT: cvtsi2ss %rcx, %xmm4
+; SSE2-NEXT: movq %rax, %rcx
+; SSE2-NEXT: shrq %rcx
+; SSE2-NEXT: andl $1, %eax
+; SSE2-NEXT: orq %rcx, %rax
+; SSE2-NEXT: cvtsi2ss %rax, %xmm4
; SSE2-NEXT: addss %xmm4, %xmm4
; SSE2-NEXT: .LBB87_12:
-; SSE2-NEXT: movdqa 48(%rdi), %xmm5
+; SSE2-NEXT: movq %xmm6, %rax
; SSE2-NEXT: testq %rax, %rax
; SSE2-NEXT: js .LBB87_13
; SSE2-NEXT: # %bb.14:
; SSE2-NEXT: cvtsi2ss %rax, %xmm3
; SSE2-NEXT: addss %xmm3, %xmm3
; SSE2-NEXT: .LBB87_15:
-; SSE2-NEXT: movq 32(%rdi), %rax
-; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[2,3,2,3]
-; SSE2-NEXT: movq %xmm5, %rcx
-; SSE2-NEXT: testq %rcx, %rcx
+; SSE2-NEXT: movdqa 32(%rdi), %xmm5
+; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm6[2,3,2,3]
+; SSE2-NEXT: movq %xmm6, %rax
+; SSE2-NEXT: testq %rax, %rax
; SSE2-NEXT: js .LBB87_16
; SSE2-NEXT: # %bb.17:
-; SSE2-NEXT: xorps %xmm5, %xmm5
-; SSE2-NEXT: cvtsi2ss %rcx, %xmm5
+; SSE2-NEXT: xorps %xmm6, %xmm6
+; SSE2-NEXT: cvtsi2ss %rax, %xmm6
; SSE2-NEXT: jmp .LBB87_18
; SSE2-NEXT: .LBB87_16:
-; SSE2-NEXT: movq %rcx, %rdx
-; SSE2-NEXT: shrq %rdx
-; SSE2-NEXT: andl $1, %ecx
-; SSE2-NEXT: orq %rdx, %rcx
-; SSE2-NEXT: xorps %xmm5, %xmm5
-; SSE2-NEXT: cvtsi2ss %rcx, %xmm5
-; SSE2-NEXT: addss %xmm5, %xmm5
+; SSE2-NEXT: movq %rax, %rcx
+; SSE2-NEXT: shrq %rcx
+; SSE2-NEXT: andl $1, %eax
+; SSE2-NEXT: orq %rcx, %rax
+; SSE2-NEXT: xorps %xmm6, %xmm6
+; SSE2-NEXT: cvtsi2ss %rax, %xmm6
+; SSE2-NEXT: addss %xmm6, %xmm6
; SSE2-NEXT: .LBB87_18:
; SSE2-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
; SSE2-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1]
-; SSE2-NEXT: movdqa 32(%rdi), %xmm4
+; SSE2-NEXT: movq %xmm5, %rax
; SSE2-NEXT: testq %rax, %rax
; SSE2-NEXT: js .LBB87_19
; SSE2-NEXT: # %bb.20:
; SSE2-NEXT: addss %xmm1, %xmm1
; SSE2-NEXT: .LBB87_21:
; SSE2-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0]
-; SSE2-NEXT: unpcklps {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1]
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm4[2,3,2,3]
+; SSE2-NEXT: unpcklps {{.*#+}} xmm3 = xmm3[0],xmm6[0],xmm3[1],xmm6[1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm5[2,3,2,3]
; SSE2-NEXT: movq %xmm2, %rax
; SSE2-NEXT: testq %rax, %rax
; SSE2-NEXT: js .LBB87_22
;
; AVX512F-LABEL: uitofp_load_8i64_to_8f32:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vcvtusi2ssq 40(%rdi), %xmm0, %xmm0
-; AVX512F-NEXT: vcvtusi2ssq 32(%rdi), %xmm1, %xmm1
-; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3]
-; AVX512F-NEXT: vcvtusi2ssq 48(%rdi), %xmm2, %xmm1
-; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3]
-; AVX512F-NEXT: vcvtusi2ssq 56(%rdi), %xmm2, %xmm1
+; AVX512F-NEXT: vmovaps (%rdi), %xmm0
+; AVX512F-NEXT: vmovdqa 16(%rdi), %xmm1
+; AVX512F-NEXT: vmovdqa 32(%rdi), %xmm2
+; AVX512F-NEXT: vmovdqa 48(%rdi), %xmm3
+; AVX512F-NEXT: vpextrq $1, %xmm2, %rax
+; AVX512F-NEXT: vcvtusi2ss %rax, %xmm4, %xmm4
+; AVX512F-NEXT: vmovq %xmm2, %rax
+; AVX512F-NEXT: vcvtusi2ss %rax, %xmm5, %xmm2
+; AVX512F-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[2,3]
+; AVX512F-NEXT: vmovq %xmm3, %rax
+; AVX512F-NEXT: vcvtusi2ss %rax, %xmm5, %xmm4
+; AVX512F-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm4[0],xmm2[3]
+; AVX512F-NEXT: vpextrq $1, %xmm3, %rax
+; AVX512F-NEXT: vcvtusi2ss %rax, %xmm5, %xmm3
+; AVX512F-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[0]
+; AVX512F-NEXT: vpextrq $1, %xmm0, %rax
+; AVX512F-NEXT: vcvtusi2ss %rax, %xmm5, %xmm3
+; AVX512F-NEXT: vmovq %xmm0, %rax
+; AVX512F-NEXT: vcvtusi2ss %rax, %xmm5, %xmm0
+; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[2,3]
+; AVX512F-NEXT: vmovq %xmm1, %rax
+; AVX512F-NEXT: vcvtusi2ss %rax, %xmm5, %xmm3
+; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm3[0],xmm0[3]
+; AVX512F-NEXT: vpextrq $1, %xmm1, %rax
+; AVX512F-NEXT: vcvtusi2ss %rax, %xmm5, %xmm1
; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
-; AVX512F-NEXT: vcvtusi2ssq 8(%rdi), %xmm2, %xmm1
-; AVX512F-NEXT: vcvtusi2ssq (%rdi), %xmm2, %xmm2
-; AVX512F-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3]
-; AVX512F-NEXT: vcvtusi2ssq 16(%rdi), %xmm3, %xmm2
-; AVX512F-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3]
-; AVX512F-NEXT: vcvtusi2ssq 24(%rdi), %xmm3, %xmm2
-; AVX512F-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[0]
-; AVX512F-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX512F-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: uitofp_load_8i64_to_8f32:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vcvtusi2ssq 40(%rdi), %xmm0, %xmm0
-; AVX512VL-NEXT: vcvtusi2ssq 32(%rdi), %xmm1, %xmm1
-; AVX512VL-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3]
-; AVX512VL-NEXT: vcvtusi2ssq 48(%rdi), %xmm2, %xmm1
-; AVX512VL-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3]
-; AVX512VL-NEXT: vcvtusi2ssq 56(%rdi), %xmm2, %xmm1
+; AVX512VL-NEXT: vmovaps (%rdi), %xmm0
+; AVX512VL-NEXT: vmovdqa 16(%rdi), %xmm1
+; AVX512VL-NEXT: vmovdqa 32(%rdi), %xmm2
+; AVX512VL-NEXT: vmovdqa 48(%rdi), %xmm3
+; AVX512VL-NEXT: vpextrq $1, %xmm2, %rax
+; AVX512VL-NEXT: vcvtusi2ss %rax, %xmm4, %xmm4
+; AVX512VL-NEXT: vmovq %xmm2, %rax
+; AVX512VL-NEXT: vcvtusi2ss %rax, %xmm5, %xmm2
+; AVX512VL-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[2,3]
+; AVX512VL-NEXT: vmovq %xmm3, %rax
+; AVX512VL-NEXT: vcvtusi2ss %rax, %xmm5, %xmm4
+; AVX512VL-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm4[0],xmm2[3]
+; AVX512VL-NEXT: vpextrq $1, %xmm3, %rax
+; AVX512VL-NEXT: vcvtusi2ss %rax, %xmm5, %xmm3
+; AVX512VL-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[0]
+; AVX512VL-NEXT: vpextrq $1, %xmm0, %rax
+; AVX512VL-NEXT: vcvtusi2ss %rax, %xmm5, %xmm3
+; AVX512VL-NEXT: vmovq %xmm0, %rax
+; AVX512VL-NEXT: vcvtusi2ss %rax, %xmm5, %xmm0
+; AVX512VL-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[2,3]
+; AVX512VL-NEXT: vmovq %xmm1, %rax
+; AVX512VL-NEXT: vcvtusi2ss %rax, %xmm5, %xmm3
+; AVX512VL-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm3[0],xmm0[3]
+; AVX512VL-NEXT: vpextrq $1, %xmm1, %rax
+; AVX512VL-NEXT: vcvtusi2ss %rax, %xmm5, %xmm1
; AVX512VL-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
-; AVX512VL-NEXT: vcvtusi2ssq 8(%rdi), %xmm2, %xmm1
-; AVX512VL-NEXT: vcvtusi2ssq (%rdi), %xmm2, %xmm2
-; AVX512VL-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3]
-; AVX512VL-NEXT: vcvtusi2ssq 16(%rdi), %xmm3, %xmm2
-; AVX512VL-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3]
-; AVX512VL-NEXT: vcvtusi2ssq 24(%rdi), %xmm3, %xmm2
-; AVX512VL-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[0]
-; AVX512VL-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX512VL-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX512VL-NEXT: retq
;
; AVX512DQ-LABEL: uitofp_load_8i64_to_8f32: