; AVX2-NEXT: pushq %r12
; AVX2-NEXT: pushq %rbx
; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
-; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm3 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
+; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
-; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm0
-; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm4 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; AVX2-NEXT: vextracti128 $1, %ymm4, %xmm0
-; AVX2-NEXT: vpextrq $1, %xmm4, %r14
-; AVX2-NEXT: vmovq %xmm4, %r13
-; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
-; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm4
-; AVX2-NEXT: vpextrq $1, %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
-; AVX2-NEXT: vmovq %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
-; AVX2-NEXT: vmovq %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
-; AVX2-NEXT: vpextrq $1, %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
+; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm10 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm1
; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
-; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
-; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm6 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
-; AVX2-NEXT: vextracti128 $1, %ymm6, %xmm5
-; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm4 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
-; AVX2-NEXT: vextracti128 $1, %ymm4, %xmm7
-; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm2 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero
+; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm3
+; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm5 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero
+; AVX2-NEXT: vextracti128 $1, %ymm5, %xmm4
+; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm9 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
+; AVX2-NEXT: vextracti128 $1, %ymm9, %xmm7
; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm1
; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
-; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
-; AVX2-NEXT: vmovd %xmm4, %r12d
-; AVX2-NEXT: vpextrd $2, %xmm4, %r15d
-; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm4
-; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm3
-; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero
+; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX2-NEXT: vpextrq $1, %xmm2, %r15
+; AVX2-NEXT: vmovq %xmm2, %r14
+; AVX2-NEXT: vpextrq $1, %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
+; AVX2-NEXT: vmovq %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
+; AVX2-NEXT: vextracti128 $1, %ymm10, %xmm1
+; AVX2-NEXT: vpextrq $1, %xmm1, %r13
+; AVX2-NEXT: vmovq %xmm1, %r11
+; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm11 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
+; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm8 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
+; AVX2-NEXT: vextracti128 $1, %ymm8, %xmm1
+; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm6
+; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm0
+; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; AVX2-NEXT: vmovd %xmm9, %r12d
+; AVX2-NEXT: vpextrd $2, %xmm9, %r9d
+; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm0
; AVX2-NEXT: vmovd %xmm7, %ecx
; AVX2-NEXT: vpextrd $2, %xmm7, %edi
-; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm7
-; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm7 = xmm7[0],zero,xmm7[1],zero,xmm7[2],zero,xmm7[3],zero
-; AVX2-NEXT: vmovd %xmm6, %ebx
-; AVX2-NEXT: vpextrd $2, %xmm6, %esi
-; AVX2-NEXT: vextracti128 $1, %ymm7, %xmm6
-; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero
-; AVX2-NEXT: vmovd %xmm5, %edx
-; AVX2-NEXT: vpextrd $2, %xmm5, %ebp
-; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm5
-; AVX2-NEXT: vpextrd $2, %xmm6, %eax
+; AVX2-NEXT: vmovd %xmm5, %ebx
+; AVX2-NEXT: vpextrd $2, %xmm5, %esi
+; AVX2-NEXT: vmovd %xmm4, %edx
+; AVX2-NEXT: vpextrd $2, %xmm4, %ebp
+; AVX2-NEXT: vpextrd $2, %xmm1, %eax
; AVX2-NEXT: leal -1(%rbp,%rax), %eax
; AVX2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; AVX2-NEXT: vmovd %xmm6, %eax
+; AVX2-NEXT: vmovd %xmm1, %eax
; AVX2-NEXT: leal -1(%rdx,%rax), %eax
; AVX2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; AVX2-NEXT: vpextrd $2, %xmm7, %eax
-; AVX2-NEXT: leal -1(%rsi,%rax), %r11d
-; AVX2-NEXT: vmovd %xmm7, %eax
+; AVX2-NEXT: vpextrd $2, %xmm8, %eax
+; AVX2-NEXT: leal -1(%rsi,%rax), %eax
+; AVX2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; AVX2-NEXT: vmovd %xmm8, %eax
; AVX2-NEXT: leal -1(%rbx,%rax), %r10d
-; AVX2-NEXT: vpextrd $2, %xmm5, %eax
-; AVX2-NEXT: leal -1(%rdi,%rax), %r9d
-; AVX2-NEXT: vmovd %xmm5, %eax
-; AVX2-NEXT: leal -1(%rcx,%rax), %r8d
+; AVX2-NEXT: vpextrd $2, %xmm6, %eax
+; AVX2-NEXT: leal -1(%rdi,%rax), %r8d
+; AVX2-NEXT: vmovd %xmm6, %eax
+; AVX2-NEXT: leal -1(%rcx,%rax), %edi
; AVX2-NEXT: vpextrd $2, %xmm3, %eax
-; AVX2-NEXT: leal -1(%r15,%rax), %r15d
+; AVX2-NEXT: leal -1(%r9,%rax), %r9d
; AVX2-NEXT: vmovd %xmm3, %ecx
; AVX2-NEXT: leal -1(%r12,%rcx), %r12d
+; AVX2-NEXT: vpextrq $1, %xmm0, %rcx
+; AVX2-NEXT: leal -1(%r15,%rcx), %r15d
+; AVX2-NEXT: vmovq %xmm0, %rcx
+; AVX2-NEXT: leal -1(%r14,%rcx), %r14d
; AVX2-NEXT: vpextrq $1, %xmm2, %rdx
; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; AVX2-NEXT: leaq -1(%rax,%rdx), %rdx
-; AVX2-NEXT: vmovq %xmm2, %rsi
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; AVX2-NEXT: leaq -1(%rax,%rsi), %rsi
-; AVX2-NEXT: vmovq %xmm4, %rbx
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; AVX2-NEXT: leaq -1(%rax,%rbx), %rbx
-; AVX2-NEXT: vpextrq $1, %xmm4, %rbp
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; AVX2-NEXT: leaq -1(%rax,%rbp), %rbp
-; AVX2-NEXT: vmovq %xmm1, %rdi
-; AVX2-NEXT: leaq -1(%r13,%rdi), %rdi
-; AVX2-NEXT: vpextrq $1, %xmm1, %rax
-; AVX2-NEXT: leaq -1(%r14,%rax), %rax
-; AVX2-NEXT: vmovq %xmm0, %rcx
-; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm1
-; AVX2-NEXT: vmovq %xmm1, %r13
-; AVX2-NEXT: leaq -1(%rcx,%r13), %r13
-; AVX2-NEXT: vpextrq $1, %xmm0, %rcx
-; AVX2-NEXT: vpextrq $1, %xmm1, %r14
-; AVX2-NEXT: leaq -1(%rcx,%r14), %rcx
-; AVX2-NEXT: shrq %rsi
-; AVX2-NEXT: vmovd %esi, %xmm0
-; AVX2-NEXT: shrq %rdx
-; AVX2-NEXT: vpinsrb $1, %edx, %xmm0, %xmm0
-; AVX2-NEXT: shrq %rbx
-; AVX2-NEXT: vpinsrb $2, %ebx, %xmm0, %xmm0
+; AVX2-NEXT: leal -1(%rax,%rdx), %edx
+; AVX2-NEXT: vmovq %xmm2, %rax
+; AVX2-NEXT: vextracti128 $1, %ymm11, %xmm0
+; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
+; AVX2-NEXT: leal -1(%rcx,%rax), %eax
+; AVX2-NEXT: vpextrq $1, %xmm0, %rsi
+; AVX2-NEXT: leal -1(%r13,%rsi), %esi
+; AVX2-NEXT: vmovq %xmm0, %rbx
+; AVX2-NEXT: leal -1(%r11,%rbx), %ebx
+; AVX2-NEXT: vpextrq $1, %xmm10, %rcx
+; AVX2-NEXT: vpextrq $1, %xmm11, %r13
+; AVX2-NEXT: leal -1(%rcx,%r13), %ecx
+; AVX2-NEXT: vmovq %xmm10, %r13
+; AVX2-NEXT: vmovq %xmm11, %r11
+; AVX2-NEXT: leaq -1(%r13,%r11), %rbp
; AVX2-NEXT: shrq %rbp
-; AVX2-NEXT: vpinsrb $3, %ebp, %xmm0, %xmm0
-; AVX2-NEXT: shrq %rdi
-; AVX2-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
-; AVX2-NEXT: shrq %rax
-; AVX2-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
-; AVX2-NEXT: shrq %r13
-; AVX2-NEXT: vpinsrb $6, %r13d, %xmm0, %xmm0
-; AVX2-NEXT: shrq %rcx
-; AVX2-NEXT: vpinsrb $7, %ecx, %xmm0, %xmm0
+; AVX2-NEXT: vmovd %ebp, %xmm0
+; AVX2-NEXT: shrl %ecx
+; AVX2-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
+; AVX2-NEXT: shrl %ebx
+; AVX2-NEXT: vpinsrb $2, %ebx, %xmm0, %xmm0
+; AVX2-NEXT: shrl %esi
+; AVX2-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
+; AVX2-NEXT: shrl %eax
+; AVX2-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0
+; AVX2-NEXT: shrl %edx
+; AVX2-NEXT: vpinsrb $5, %edx, %xmm0, %xmm0
+; AVX2-NEXT: shrl %r14d
+; AVX2-NEXT: vpinsrb $6, %r14d, %xmm0, %xmm0
+; AVX2-NEXT: shrl %r15d
+; AVX2-NEXT: vpinsrb $7, %r15d, %xmm0, %xmm0
; AVX2-NEXT: shrl %r12d
; AVX2-NEXT: vpinsrb $8, %r12d, %xmm0, %xmm0
-; AVX2-NEXT: shrl %r15d
-; AVX2-NEXT: vpinsrb $9, %r15d, %xmm0, %xmm0
-; AVX2-NEXT: shrl %r8d
-; AVX2-NEXT: vpinsrb $10, %r8d, %xmm0, %xmm0
; AVX2-NEXT: shrl %r9d
-; AVX2-NEXT: vpinsrb $11, %r9d, %xmm0, %xmm0
+; AVX2-NEXT: vpinsrb $9, %r9d, %xmm0, %xmm0
+; AVX2-NEXT: shrl %edi
+; AVX2-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
+; AVX2-NEXT: shrl %r8d
+; AVX2-NEXT: vpinsrb $11, %r8d, %xmm0, %xmm0
; AVX2-NEXT: shrl %r10d
; AVX2-NEXT: vpinsrb $12, %r10d, %xmm0, %xmm0
-; AVX2-NEXT: shrl %r11d
-; AVX2-NEXT: vpinsrb $13, %r11d, %xmm0, %xmm0
+; AVX2-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload
+; AVX2-NEXT: shrl %eax
+; AVX2-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0
; AVX2-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload
; AVX2-NEXT: shrl %eax
; AVX2-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0
; AVX512-NEXT: pushq %r12
; AVX512-NEXT: pushq %rbx
; AVX512-NEXT: vpmovzxbw {{.*#+}} ymm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
-; AVX512-NEXT: vpmovzxbw {{.*#+}} ymm3 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
+; AVX512-NEXT: vpmovzxbw {{.*#+}} ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
; AVX512-NEXT: vpmovzxwd {{.*#+}} ymm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
-; AVX512-NEXT: vextracti128 $1, %ymm2, %xmm0
-; AVX512-NEXT: vpmovzxdq {{.*#+}} ymm4 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; AVX512-NEXT: vextracti128 $1, %ymm4, %xmm0
-; AVX512-NEXT: vpextrq $1, %xmm4, %r14
-; AVX512-NEXT: vmovq %xmm4, %r13
-; AVX512-NEXT: vpmovzxdq {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
-; AVX512-NEXT: vextracti128 $1, %ymm2, %xmm4
-; AVX512-NEXT: vpextrq $1, %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
-; AVX512-NEXT: vmovq %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
-; AVX512-NEXT: vmovq %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
-; AVX512-NEXT: vpextrq $1, %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
+; AVX512-NEXT: vpmovzxdq {{.*#+}} ymm10 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
; AVX512-NEXT: vextracti128 $1, %ymm1, %xmm1
; AVX512-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
-; AVX512-NEXT: vextracti128 $1, %ymm1, %xmm2
-; AVX512-NEXT: vpmovzxdq {{.*#+}} ymm6 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
-; AVX512-NEXT: vextracti128 $1, %ymm6, %xmm5
-; AVX512-NEXT: vpmovzxdq {{.*#+}} ymm4 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
-; AVX512-NEXT: vextracti128 $1, %ymm4, %xmm7
-; AVX512-NEXT: vpmovzxwd {{.*#+}} ymm2 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero
+; AVX512-NEXT: vextracti128 $1, %ymm1, %xmm3
+; AVX512-NEXT: vpmovzxdq {{.*#+}} ymm5 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero
+; AVX512-NEXT: vextracti128 $1, %ymm5, %xmm4
+; AVX512-NEXT: vpmovzxdq {{.*#+}} ymm9 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
+; AVX512-NEXT: vextracti128 $1, %ymm9, %xmm7
; AVX512-NEXT: vextracti128 $1, %ymm2, %xmm1
; AVX512-NEXT: vpmovzxdq {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
-; AVX512-NEXT: vpmovzxdq {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
-; AVX512-NEXT: vmovd %xmm4, %r12d
-; AVX512-NEXT: vpextrd $2, %xmm4, %r15d
-; AVX512-NEXT: vextracti128 $1, %ymm2, %xmm4
-; AVX512-NEXT: vextracti128 $1, %ymm3, %xmm3
-; AVX512-NEXT: vpmovzxwd {{.*#+}} ymm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero
+; AVX512-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX512-NEXT: vpextrq $1, %xmm2, %r15
+; AVX512-NEXT: vmovq %xmm2, %r14
+; AVX512-NEXT: vpextrq $1, %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
+; AVX512-NEXT: vmovq %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
+; AVX512-NEXT: vextracti128 $1, %ymm10, %xmm1
+; AVX512-NEXT: vpextrq $1, %xmm1, %r13
+; AVX512-NEXT: vmovq %xmm1, %r11
+; AVX512-NEXT: vpmovzxwd {{.*#+}} ymm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX512-NEXT: vpmovzxdq {{.*#+}} ymm11 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
+; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm0
+; AVX512-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX512-NEXT: vpmovzxdq {{.*#+}} ymm8 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
+; AVX512-NEXT: vextracti128 $1, %ymm8, %xmm1
+; AVX512-NEXT: vpmovzxdq {{.*#+}} ymm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; AVX512-NEXT: vextracti128 $1, %ymm3, %xmm6
+; AVX512-NEXT: vextracti128 $1, %ymm2, %xmm0
+; AVX512-NEXT: vpmovzxdq {{.*#+}} ymm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; AVX512-NEXT: vmovd %xmm9, %r12d
+; AVX512-NEXT: vpextrd $2, %xmm9, %r9d
+; AVX512-NEXT: vextracti128 $1, %ymm2, %xmm0
; AVX512-NEXT: vmovd %xmm7, %ecx
; AVX512-NEXT: vpextrd $2, %xmm7, %edi
-; AVX512-NEXT: vextracti128 $1, %ymm3, %xmm7
-; AVX512-NEXT: vpmovzxdq {{.*#+}} ymm7 = xmm7[0],zero,xmm7[1],zero,xmm7[2],zero,xmm7[3],zero
-; AVX512-NEXT: vmovd %xmm6, %ebx
-; AVX512-NEXT: vpextrd $2, %xmm6, %esi
-; AVX512-NEXT: vextracti128 $1, %ymm7, %xmm6
-; AVX512-NEXT: vpmovzxdq {{.*#+}} ymm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero
-; AVX512-NEXT: vmovd %xmm5, %edx
-; AVX512-NEXT: vpextrd $2, %xmm5, %ebp
-; AVX512-NEXT: vextracti128 $1, %ymm3, %xmm5
-; AVX512-NEXT: vpextrd $2, %xmm6, %eax
+; AVX512-NEXT: vmovd %xmm5, %ebx
+; AVX512-NEXT: vpextrd $2, %xmm5, %esi
+; AVX512-NEXT: vmovd %xmm4, %edx
+; AVX512-NEXT: vpextrd $2, %xmm4, %ebp
+; AVX512-NEXT: vpextrd $2, %xmm1, %eax
; AVX512-NEXT: leal -1(%rbp,%rax), %eax
; AVX512-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; AVX512-NEXT: vmovd %xmm6, %eax
+; AVX512-NEXT: vmovd %xmm1, %eax
; AVX512-NEXT: leal -1(%rdx,%rax), %eax
; AVX512-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; AVX512-NEXT: vpextrd $2, %xmm7, %eax
-; AVX512-NEXT: leal -1(%rsi,%rax), %r11d
-; AVX512-NEXT: vmovd %xmm7, %eax
+; AVX512-NEXT: vpextrd $2, %xmm8, %eax
+; AVX512-NEXT: leal -1(%rsi,%rax), %eax
+; AVX512-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; AVX512-NEXT: vmovd %xmm8, %eax
; AVX512-NEXT: leal -1(%rbx,%rax), %r10d
-; AVX512-NEXT: vpextrd $2, %xmm5, %eax
-; AVX512-NEXT: leal -1(%rdi,%rax), %r9d
-; AVX512-NEXT: vmovd %xmm5, %eax
-; AVX512-NEXT: leal -1(%rcx,%rax), %r8d
+; AVX512-NEXT: vpextrd $2, %xmm6, %eax
+; AVX512-NEXT: leal -1(%rdi,%rax), %r8d
+; AVX512-NEXT: vmovd %xmm6, %eax
+; AVX512-NEXT: leal -1(%rcx,%rax), %edi
; AVX512-NEXT: vpextrd $2, %xmm3, %eax
-; AVX512-NEXT: leal -1(%r15,%rax), %r15d
+; AVX512-NEXT: leal -1(%r9,%rax), %r9d
; AVX512-NEXT: vmovd %xmm3, %ecx
; AVX512-NEXT: leal -1(%r12,%rcx), %r12d
+; AVX512-NEXT: vpextrq $1, %xmm0, %rcx
+; AVX512-NEXT: leal -1(%r15,%rcx), %r15d
+; AVX512-NEXT: vmovq %xmm0, %rcx
+; AVX512-NEXT: leal -1(%r14,%rcx), %r14d
; AVX512-NEXT: vpextrq $1, %xmm2, %rdx
; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; AVX512-NEXT: leaq -1(%rax,%rdx), %rdx
-; AVX512-NEXT: vmovq %xmm2, %rsi
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; AVX512-NEXT: leaq -1(%rax,%rsi), %rsi
-; AVX512-NEXT: vmovq %xmm4, %rbx
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; AVX512-NEXT: leaq -1(%rax,%rbx), %rbx
-; AVX512-NEXT: vpextrq $1, %xmm4, %rbp
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; AVX512-NEXT: leaq -1(%rax,%rbp), %rbp
-; AVX512-NEXT: vmovq %xmm1, %rdi
-; AVX512-NEXT: leaq -1(%r13,%rdi), %rdi
-; AVX512-NEXT: vpextrq $1, %xmm1, %rax
-; AVX512-NEXT: leaq -1(%r14,%rax), %rax
-; AVX512-NEXT: vmovq %xmm0, %rcx
-; AVX512-NEXT: vextracti128 $1, %ymm1, %xmm1
-; AVX512-NEXT: vmovq %xmm1, %r13
-; AVX512-NEXT: leaq -1(%rcx,%r13), %r13
-; AVX512-NEXT: vpextrq $1, %xmm0, %rcx
-; AVX512-NEXT: vpextrq $1, %xmm1, %r14
-; AVX512-NEXT: leaq -1(%rcx,%r14), %rcx
-; AVX512-NEXT: shrq %rsi
-; AVX512-NEXT: vmovd %esi, %xmm0
-; AVX512-NEXT: shrq %rdx
-; AVX512-NEXT: vpinsrb $1, %edx, %xmm0, %xmm0
-; AVX512-NEXT: shrq %rbx
-; AVX512-NEXT: vpinsrb $2, %ebx, %xmm0, %xmm0
+; AVX512-NEXT: leal -1(%rax,%rdx), %edx
+; AVX512-NEXT: vmovq %xmm2, %rax
+; AVX512-NEXT: vextracti128 $1, %ymm11, %xmm0
+; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
+; AVX512-NEXT: leal -1(%rcx,%rax), %eax
+; AVX512-NEXT: vpextrq $1, %xmm0, %rsi
+; AVX512-NEXT: leal -1(%r13,%rsi), %esi
+; AVX512-NEXT: vmovq %xmm0, %rbx
+; AVX512-NEXT: leal -1(%r11,%rbx), %ebx
+; AVX512-NEXT: vpextrq $1, %xmm10, %rcx
+; AVX512-NEXT: vpextrq $1, %xmm11, %r13
+; AVX512-NEXT: leal -1(%rcx,%r13), %ecx
+; AVX512-NEXT: vmovq %xmm10, %r13
+; AVX512-NEXT: vmovq %xmm11, %r11
+; AVX512-NEXT: leaq -1(%r13,%r11), %rbp
; AVX512-NEXT: shrq %rbp
-; AVX512-NEXT: vpinsrb $3, %ebp, %xmm0, %xmm0
-; AVX512-NEXT: shrq %rdi
-; AVX512-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
-; AVX512-NEXT: shrq %rax
-; AVX512-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
-; AVX512-NEXT: shrq %r13
-; AVX512-NEXT: vpinsrb $6, %r13d, %xmm0, %xmm0
-; AVX512-NEXT: shrq %rcx
-; AVX512-NEXT: vpinsrb $7, %ecx, %xmm0, %xmm0
+; AVX512-NEXT: vmovd %ebp, %xmm0
+; AVX512-NEXT: shrl %ecx
+; AVX512-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
+; AVX512-NEXT: shrl %ebx
+; AVX512-NEXT: vpinsrb $2, %ebx, %xmm0, %xmm0
+; AVX512-NEXT: shrl %esi
+; AVX512-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
+; AVX512-NEXT: shrl %eax
+; AVX512-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0
+; AVX512-NEXT: shrl %edx
+; AVX512-NEXT: vpinsrb $5, %edx, %xmm0, %xmm0
+; AVX512-NEXT: shrl %r14d
+; AVX512-NEXT: vpinsrb $6, %r14d, %xmm0, %xmm0
+; AVX512-NEXT: shrl %r15d
+; AVX512-NEXT: vpinsrb $7, %r15d, %xmm0, %xmm0
; AVX512-NEXT: shrl %r12d
; AVX512-NEXT: vpinsrb $8, %r12d, %xmm0, %xmm0
-; AVX512-NEXT: shrl %r15d
-; AVX512-NEXT: vpinsrb $9, %r15d, %xmm0, %xmm0
-; AVX512-NEXT: shrl %r8d
-; AVX512-NEXT: vpinsrb $10, %r8d, %xmm0, %xmm0
; AVX512-NEXT: shrl %r9d
-; AVX512-NEXT: vpinsrb $11, %r9d, %xmm0, %xmm0
+; AVX512-NEXT: vpinsrb $9, %r9d, %xmm0, %xmm0
+; AVX512-NEXT: shrl %edi
+; AVX512-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
+; AVX512-NEXT: shrl %r8d
+; AVX512-NEXT: vpinsrb $11, %r8d, %xmm0, %xmm0
; AVX512-NEXT: shrl %r10d
; AVX512-NEXT: vpinsrb $12, %r10d, %xmm0, %xmm0
-; AVX512-NEXT: shrl %r11d
-; AVX512-NEXT: vpinsrb $13, %r11d, %xmm0, %xmm0
+; AVX512-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload
+; AVX512-NEXT: shrl %eax
+; AVX512-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0
; AVX512-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload
; AVX512-NEXT: shrl %eax
; AVX512-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0
; SSE41-NEXT: shlq $57, %rcx
; SSE41-NEXT: sarq $63, %rcx
; SSE41-NEXT: pinsrw $6, %ecx, %xmm0
-; SSE41-NEXT: shrq $7, %rax
+; SSE41-NEXT: shrl $7, %eax
; SSE41-NEXT: pinsrw $7, %eax, %xmm0
; SSE41-NEXT: retq
;
; AVX1-NEXT: shlq $57, %rcx
; AVX1-NEXT: sarq $63, %rcx
; AVX1-NEXT: vpinsrw $6, %ecx, %xmm0, %xmm0
-; AVX1-NEXT: shrq $7, %rax
+; AVX1-NEXT: shrl $7, %eax
; AVX1-NEXT: vpinsrw $7, %eax, %xmm0, %xmm0
; AVX1-NEXT: retq
;
; AVX2-NEXT: shlq $57, %rcx
; AVX2-NEXT: sarq $63, %rcx
; AVX2-NEXT: vpinsrw $6, %ecx, %xmm0, %xmm0
-; AVX2-NEXT: shrq $7, %rax
+; AVX2-NEXT: shrl $7, %eax
; AVX2-NEXT: vpinsrw $7, %eax, %xmm0, %xmm0
; AVX2-NEXT: retq
;
; SSE41-NEXT: sarq $63, %rcx
; SSE41-NEXT: pinsrb $6, %ecx, %xmm0
; SSE41-NEXT: movsbq %al, %rcx
-; SSE41-NEXT: shrq $7, %rcx
+; SSE41-NEXT: shrl $7, %ecx
; SSE41-NEXT: pinsrb $7, %ecx, %xmm0
; SSE41-NEXT: movq %rax, %rcx
; SSE41-NEXT: shlq $55, %rcx
; SSE41-NEXT: shlq $49, %rcx
; SSE41-NEXT: sarq $63, %rcx
; SSE41-NEXT: pinsrb $14, %ecx, %xmm0
-; SSE41-NEXT: shrq $15, %rax
+; SSE41-NEXT: shrl $15, %eax
; SSE41-NEXT: pinsrb $15, %eax, %xmm0
; SSE41-NEXT: retq
;
; AVX1-NEXT: sarq $63, %rcx
; AVX1-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
; AVX1-NEXT: movsbq %al, %rcx
-; AVX1-NEXT: shrq $7, %rcx
+; AVX1-NEXT: shrl $7, %ecx
; AVX1-NEXT: vpinsrb $7, %ecx, %xmm0, %xmm0
; AVX1-NEXT: movq %rax, %rcx
; AVX1-NEXT: shlq $55, %rcx
; AVX1-NEXT: shlq $49, %rcx
; AVX1-NEXT: sarq $63, %rcx
; AVX1-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
-; AVX1-NEXT: shrq $15, %rax
+; AVX1-NEXT: shrl $15, %eax
; AVX1-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
; AVX1-NEXT: retq
;
; AVX2-NEXT: sarq $63, %rcx
; AVX2-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
; AVX2-NEXT: movsbq %al, %rcx
-; AVX2-NEXT: shrq $7, %rcx
+; AVX2-NEXT: shrl $7, %ecx
; AVX2-NEXT: vpinsrb $7, %ecx, %xmm0, %xmm0
; AVX2-NEXT: movq %rax, %rcx
; AVX2-NEXT: shlq $55, %rcx
; AVX2-NEXT: shlq $49, %rcx
; AVX2-NEXT: sarq $63, %rcx
; AVX2-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
-; AVX2-NEXT: shrq $15, %rax
+; AVX2-NEXT: shrl $15, %eax
; AVX2-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
; AVX2-NEXT: retq
;
; AVX1-NEXT: movq %rax, %r11
; AVX1-NEXT: movq %rax, %r14
; AVX1-NEXT: movq %rax, %r15
-; AVX1-NEXT: movq %rax, %r9
+; AVX1-NEXT: movl %eax, %r9d
; AVX1-NEXT: movq %rax, %r12
; AVX1-NEXT: movq %rax, %r13
; AVX1-NEXT: movq %rax, %rbx
; AVX1-NEXT: shlq $49, %r15
; AVX1-NEXT: sarq $63, %r15
; AVX1-NEXT: vpinsrw $6, %r15d, %xmm0, %xmm0
-; AVX1-NEXT: shrq $15, %r9
+; AVX1-NEXT: shrl $15, %r9d
; AVX1-NEXT: vpinsrw $7, %r9d, %xmm0, %xmm0
; AVX1-NEXT: shlq $63, %r13
; AVX1-NEXT: sarq $63, %r13
; AVX1-NEXT: shlq $57, %rsi
; AVX1-NEXT: sarq $63, %rsi
; AVX1-NEXT: vpinsrw $6, %esi, %xmm1, %xmm1
-; AVX1-NEXT: shrq $7, %rbp
+; AVX1-NEXT: shrl $7, %ebp
; AVX1-NEXT: vpinsrw $7, %ebp, %xmm1, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: popq %rbx
; AVX2-NEXT: movq %rax, %r11
; AVX2-NEXT: movq %rax, %r14
; AVX2-NEXT: movq %rax, %r15
-; AVX2-NEXT: movq %rax, %r9
+; AVX2-NEXT: movl %eax, %r9d
; AVX2-NEXT: movq %rax, %r12
; AVX2-NEXT: movq %rax, %r13
; AVX2-NEXT: movq %rax, %rbx
; AVX2-NEXT: shlq $49, %r15
; AVX2-NEXT: sarq $63, %r15
; AVX2-NEXT: vpinsrw $6, %r15d, %xmm0, %xmm0
-; AVX2-NEXT: shrq $15, %r9
+; AVX2-NEXT: shrl $15, %r9d
; AVX2-NEXT: vpinsrw $7, %r9d, %xmm0, %xmm0
; AVX2-NEXT: shlq $63, %r13
; AVX2-NEXT: sarq $63, %r13
; AVX2-NEXT: shlq $57, %rsi
; AVX2-NEXT: sarq $63, %rsi
; AVX2-NEXT: vpinsrw $6, %esi, %xmm1, %xmm1
-; AVX2-NEXT: shrq $7, %rbp
+; AVX2-NEXT: shrl $7, %ebp
; AVX2-NEXT: vpinsrw $7, %ebp, %xmm1, %xmm1
; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
; AVX2-NEXT: popq %rbx
; SSE41-NEXT: sarq $63, %rcx
; SSE41-NEXT: pinsrb $6, %ecx, %xmm0
; SSE41-NEXT: movsbq %al, %rcx
-; SSE41-NEXT: shrq $7, %rcx
+; SSE41-NEXT: shrl $7, %ecx
; SSE41-NEXT: pinsrb $7, %ecx, %xmm0
; SSE41-NEXT: movq %rax, %rcx
; SSE41-NEXT: shlq $55, %rcx
; SSE41-NEXT: shlq $49, %rcx
; SSE41-NEXT: sarq $63, %rcx
; SSE41-NEXT: pinsrb $14, %ecx, %xmm0
-; SSE41-NEXT: shrq $15, %rax
+; SSE41-NEXT: shrl $15, %eax
; SSE41-NEXT: pinsrb $15, %eax, %xmm0
; SSE41-NEXT: movswq 2(%rdi), %rax
; SSE41-NEXT: movq %rax, %rcx
; SSE41-NEXT: sarq $63, %rcx
; SSE41-NEXT: pinsrb $6, %ecx, %xmm1
; SSE41-NEXT: movsbq %al, %rcx
-; SSE41-NEXT: shrq $7, %rcx
+; SSE41-NEXT: shrl $7, %ecx
; SSE41-NEXT: pinsrb $7, %ecx, %xmm1
; SSE41-NEXT: movq %rax, %rcx
; SSE41-NEXT: shlq $55, %rcx
; SSE41-NEXT: shlq $49, %rcx
; SSE41-NEXT: sarq $63, %rcx
; SSE41-NEXT: pinsrb $14, %ecx, %xmm1
-; SSE41-NEXT: shrq $15, %rax
+; SSE41-NEXT: shrl $15, %eax
; SSE41-NEXT: pinsrb $15, %eax, %xmm1
; SSE41-NEXT: retq
;
; AVX1-NEXT: shlq $57, %r10
; AVX1-NEXT: sarq $63, %r10
; AVX1-NEXT: vpinsrb $6, %r10d, %xmm1, %xmm1
-; AVX1-NEXT: shrq $7, %r11
+; AVX1-NEXT: shrl $7, %r11d
; AVX1-NEXT: vpinsrb $7, %r11d, %xmm1, %xmm1
; AVX1-NEXT: shlq $55, %r9
; AVX1-NEXT: sarq $63, %r9
; AVX1-NEXT: shlq $49, %rdx
; AVX1-NEXT: sarq $63, %rdx
; AVX1-NEXT: vpinsrb $14, %edx, %xmm1, %xmm1
-; AVX1-NEXT: shrq $15, %rax
+; AVX1-NEXT: shrl $15, %eax
; AVX1-NEXT: vpinsrb $15, %eax, %xmm1, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: popq %rbx
; AVX2-NEXT: shlq $57, %r10
; AVX2-NEXT: sarq $63, %r10
; AVX2-NEXT: vpinsrb $6, %r10d, %xmm1, %xmm1
-; AVX2-NEXT: shrq $7, %r11
+; AVX2-NEXT: shrl $7, %r11d
; AVX2-NEXT: vpinsrb $7, %r11d, %xmm1, %xmm1
; AVX2-NEXT: shlq $55, %r9
; AVX2-NEXT: sarq $63, %r9
; AVX2-NEXT: shlq $49, %rdx
; AVX2-NEXT: sarq $63, %rdx
; AVX2-NEXT: vpinsrb $14, %edx, %xmm1, %xmm1
-; AVX2-NEXT: shrq $15, %rax
+; AVX2-NEXT: shrl $15, %eax
; AVX2-NEXT: vpinsrb $15, %eax, %xmm1, %xmm1
; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
; AVX2-NEXT: popq %rbx
; SSE41-NEXT: shlq $57, %rcx
; SSE41-NEXT: sarq $63, %rcx
; SSE41-NEXT: pinsrw $6, %ecx, %xmm0
-; SSE41-NEXT: shrq $7, %rax
+; SSE41-NEXT: shrl $7, %eax
; SSE41-NEXT: pinsrw $7, %eax, %xmm0
; SSE41-NEXT: retq
;
; AVX1-NEXT: shlq $57, %rcx
; AVX1-NEXT: sarq $63, %rcx
; AVX1-NEXT: vpinsrw $6, %ecx, %xmm0, %xmm0
-; AVX1-NEXT: shrq $7, %rax
+; AVX1-NEXT: shrl $7, %eax
; AVX1-NEXT: vpinsrw $7, %eax, %xmm0, %xmm0
; AVX1-NEXT: retq
;
; AVX2-NEXT: shlq $57, %rcx
; AVX2-NEXT: sarq $63, %rcx
; AVX2-NEXT: vpinsrw $6, %ecx, %xmm0, %xmm0
-; AVX2-NEXT: shrq $7, %rax
+; AVX2-NEXT: shrl $7, %eax
; AVX2-NEXT: vpinsrw $7, %eax, %xmm0, %xmm0
; AVX2-NEXT: retq
;
; SSE41-NEXT: sarq $63, %rcx
; SSE41-NEXT: pinsrb $6, %ecx, %xmm0
; SSE41-NEXT: movsbq %al, %rcx
-; SSE41-NEXT: shrq $7, %rcx
+; SSE41-NEXT: shrl $7, %ecx
; SSE41-NEXT: pinsrb $7, %ecx, %xmm0
; SSE41-NEXT: movq %rax, %rcx
; SSE41-NEXT: shlq $55, %rcx
; SSE41-NEXT: shlq $49, %rcx
; SSE41-NEXT: sarq $63, %rcx
; SSE41-NEXT: pinsrb $14, %ecx, %xmm0
-; SSE41-NEXT: shrq $15, %rax
+; SSE41-NEXT: shrl $15, %eax
; SSE41-NEXT: pinsrb $15, %eax, %xmm0
; SSE41-NEXT: retq
;
; AVX1-NEXT: sarq $63, %rcx
; AVX1-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
; AVX1-NEXT: movsbq %al, %rcx
-; AVX1-NEXT: shrq $7, %rcx
+; AVX1-NEXT: shrl $7, %ecx
; AVX1-NEXT: vpinsrb $7, %ecx, %xmm0, %xmm0
; AVX1-NEXT: movq %rax, %rcx
; AVX1-NEXT: shlq $55, %rcx
; AVX1-NEXT: shlq $49, %rcx
; AVX1-NEXT: sarq $63, %rcx
; AVX1-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
-; AVX1-NEXT: shrq $15, %rax
+; AVX1-NEXT: shrl $15, %eax
; AVX1-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
; AVX1-NEXT: retq
;
; AVX2-NEXT: sarq $63, %rcx
; AVX2-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
; AVX2-NEXT: movsbq %al, %rcx
-; AVX2-NEXT: shrq $7, %rcx
+; AVX2-NEXT: shrl $7, %ecx
; AVX2-NEXT: vpinsrb $7, %ecx, %xmm0, %xmm0
; AVX2-NEXT: movq %rax, %rcx
; AVX2-NEXT: shlq $55, %rcx
; AVX2-NEXT: shlq $49, %rcx
; AVX2-NEXT: sarq $63, %rcx
; AVX2-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
-; AVX2-NEXT: shrq $15, %rax
+; AVX2-NEXT: shrl $15, %eax
; AVX2-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
; AVX2-NEXT: retq
;
; AVX1-NEXT: movq %rax, %r11
; AVX1-NEXT: movq %rax, %r14
; AVX1-NEXT: movq %rax, %r15
-; AVX1-NEXT: movq %rax, %r9
+; AVX1-NEXT: movl %eax, %r9d
; AVX1-NEXT: movq %rax, %r12
; AVX1-NEXT: movq %rax, %r13
; AVX1-NEXT: movq %rax, %rbx
; AVX1-NEXT: shlq $49, %r15
; AVX1-NEXT: sarq $63, %r15
; AVX1-NEXT: vpinsrw $6, %r15d, %xmm0, %xmm0
-; AVX1-NEXT: shrq $15, %r9
+; AVX1-NEXT: shrl $15, %r9d
; AVX1-NEXT: vpinsrw $7, %r9d, %xmm0, %xmm0
; AVX1-NEXT: shlq $63, %r13
; AVX1-NEXT: sarq $63, %r13
; AVX1-NEXT: shlq $57, %rsi
; AVX1-NEXT: sarq $63, %rsi
; AVX1-NEXT: vpinsrw $6, %esi, %xmm1, %xmm1
-; AVX1-NEXT: shrq $7, %rbp
+; AVX1-NEXT: shrl $7, %ebp
; AVX1-NEXT: vpinsrw $7, %ebp, %xmm1, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: popq %rbx
; AVX2-NEXT: movq %rax, %r11
; AVX2-NEXT: movq %rax, %r14
; AVX2-NEXT: movq %rax, %r15
-; AVX2-NEXT: movq %rax, %r9
+; AVX2-NEXT: movl %eax, %r9d
; AVX2-NEXT: movq %rax, %r12
; AVX2-NEXT: movq %rax, %r13
; AVX2-NEXT: movq %rax, %rbx
; AVX2-NEXT: shlq $49, %r15
; AVX2-NEXT: sarq $63, %r15
; AVX2-NEXT: vpinsrw $6, %r15d, %xmm0, %xmm0
-; AVX2-NEXT: shrq $15, %r9
+; AVX2-NEXT: shrl $15, %r9d
; AVX2-NEXT: vpinsrw $7, %r9d, %xmm0, %xmm0
; AVX2-NEXT: shlq $63, %r13
; AVX2-NEXT: sarq $63, %r13
; AVX2-NEXT: shlq $57, %rsi
; AVX2-NEXT: sarq $63, %rsi
; AVX2-NEXT: vpinsrw $6, %esi, %xmm1, %xmm1
-; AVX2-NEXT: shrq $7, %rbp
+; AVX2-NEXT: shrl $7, %ebp
; AVX2-NEXT: vpinsrw $7, %ebp, %xmm1, %xmm1
; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
; AVX2-NEXT: popq %rbx
; SSE41-NEXT: sarq $63, %rcx
; SSE41-NEXT: pinsrb $6, %ecx, %xmm0
; SSE41-NEXT: movsbq %al, %rcx
-; SSE41-NEXT: shrq $7, %rcx
+; SSE41-NEXT: shrl $7, %ecx
; SSE41-NEXT: pinsrb $7, %ecx, %xmm0
; SSE41-NEXT: movq %rax, %rcx
; SSE41-NEXT: shlq $55, %rcx
; SSE41-NEXT: shlq $49, %rcx
; SSE41-NEXT: sarq $63, %rcx
; SSE41-NEXT: pinsrb $14, %ecx, %xmm0
-; SSE41-NEXT: shrq $15, %rax
+; SSE41-NEXT: shrl $15, %eax
; SSE41-NEXT: pinsrb $15, %eax, %xmm0
; SSE41-NEXT: movswq 2(%rdi), %rax
; SSE41-NEXT: movq %rax, %rcx
; SSE41-NEXT: sarq $63, %rcx
; SSE41-NEXT: pinsrb $6, %ecx, %xmm1
; SSE41-NEXT: movsbq %al, %rcx
-; SSE41-NEXT: shrq $7, %rcx
+; SSE41-NEXT: shrl $7, %ecx
; SSE41-NEXT: pinsrb $7, %ecx, %xmm1
; SSE41-NEXT: movq %rax, %rcx
; SSE41-NEXT: shlq $55, %rcx
; SSE41-NEXT: shlq $49, %rcx
; SSE41-NEXT: sarq $63, %rcx
; SSE41-NEXT: pinsrb $14, %ecx, %xmm1
-; SSE41-NEXT: shrq $15, %rax
+; SSE41-NEXT: shrl $15, %eax
; SSE41-NEXT: pinsrb $15, %eax, %xmm1
; SSE41-NEXT: retq
;
; AVX1-NEXT: shlq $57, %r10
; AVX1-NEXT: sarq $63, %r10
; AVX1-NEXT: vpinsrb $6, %r10d, %xmm1, %xmm1
-; AVX1-NEXT: shrq $7, %r11
+; AVX1-NEXT: shrl $7, %r11d
; AVX1-NEXT: vpinsrb $7, %r11d, %xmm1, %xmm1
; AVX1-NEXT: shlq $55, %r9
; AVX1-NEXT: sarq $63, %r9
; AVX1-NEXT: shlq $49, %rdx
; AVX1-NEXT: sarq $63, %rdx
; AVX1-NEXT: vpinsrb $14, %edx, %xmm1, %xmm1
-; AVX1-NEXT: shrq $15, %rax
+; AVX1-NEXT: shrl $15, %eax
; AVX1-NEXT: vpinsrb $15, %eax, %xmm1, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: popq %rbx
; AVX2-NEXT: shlq $57, %r10
; AVX2-NEXT: sarq $63, %r10
; AVX2-NEXT: vpinsrb $6, %r10d, %xmm1, %xmm1
-; AVX2-NEXT: shrq $7, %r11
+; AVX2-NEXT: shrl $7, %r11d
; AVX2-NEXT: vpinsrb $7, %r11d, %xmm1, %xmm1
; AVX2-NEXT: shlq $55, %r9
; AVX2-NEXT: sarq $63, %r9
; AVX2-NEXT: shlq $49, %rdx
; AVX2-NEXT: sarq $63, %rdx
; AVX2-NEXT: vpinsrb $14, %edx, %xmm1, %xmm1
-; AVX2-NEXT: shrq $15, %rax
+; AVX2-NEXT: shrl $15, %eax
; AVX2-NEXT: vpinsrb $15, %eax, %xmm1, %xmm1
; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
; AVX2-NEXT: popq %rbx