; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0
; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0
; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $7, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r8d
-; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $6, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r9d
-; NoVLX-NEXT: kshiftrw $3, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
; NoVLX-NEXT: kshiftrw $4, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %esi
-; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %edi
-; NoVLX-NEXT: kshiftrw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kmovw %k0, %ecx
-; NoVLX-NEXT: kshiftrw $7, %k0, %k0
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kmovw %k0, %edx
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $0, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %edi, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $4, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $5, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r8d, %xmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0
; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0
; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $7, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r8d
-; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $6, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r9d
-; NoVLX-NEXT: kshiftrw $3, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
; NoVLX-NEXT: kshiftrw $4, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %esi
-; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %edi
-; NoVLX-NEXT: kshiftrw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kmovw %k0, %ecx
-; NoVLX-NEXT: kshiftrw $7, %k0, %k0
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kmovw %k0, %edx
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $0, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %edi, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $4, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $5, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r8d, %xmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0
; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 {%k1}
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $7, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r8d
-; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $6, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r9d
-; NoVLX-NEXT: kshiftrw $3, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
; NoVLX-NEXT: kshiftrw $4, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %esi
-; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %edi
-; NoVLX-NEXT: kshiftrw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kmovw %k0, %ecx
-; NoVLX-NEXT: kshiftrw $7, %k0, %k0
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kmovw %k0, %edx
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $0, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %edi, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $4, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $5, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r8d, %xmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0
; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 {%k1}
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $7, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r8d
-; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $6, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r9d
-; NoVLX-NEXT: kshiftrw $3, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
; NoVLX-NEXT: kshiftrw $4, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %esi
-; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %edi
-; NoVLX-NEXT: kshiftrw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kmovw %k0, %ecx
-; NoVLX-NEXT: kshiftrw $7, %k0, %k0
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kmovw %k0, %edx
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $0, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %edi, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $4, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $5, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r8d, %xmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0
; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0
; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $7, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r8d
-; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $6, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r9d
-; NoVLX-NEXT: kshiftrw $3, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
; NoVLX-NEXT: kshiftrw $4, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %esi
-; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %edi
-; NoVLX-NEXT: kshiftrw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kmovw %k0, %ecx
-; NoVLX-NEXT: kshiftrw $7, %k0, %k0
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kmovw %k0, %edx
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $0, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %edi, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $4, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $5, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r8d, %xmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0
; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0
; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $7, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r8d
-; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $6, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r9d
-; NoVLX-NEXT: kshiftrw $3, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
; NoVLX-NEXT: kshiftrw $4, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %esi
-; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %edi
-; NoVLX-NEXT: kshiftrw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kmovw %k0, %ecx
-; NoVLX-NEXT: kshiftrw $7, %k0, %k0
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kmovw %k0, %edx
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $0, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %edi, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $4, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $5, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r8d, %xmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0
; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 {%k1}
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $7, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r8d
-; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $6, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r9d
-; NoVLX-NEXT: kshiftrw $3, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
; NoVLX-NEXT: kshiftrw $4, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %esi
-; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %edi
-; NoVLX-NEXT: kshiftrw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kmovw %k0, %ecx
-; NoVLX-NEXT: kshiftrw $7, %k0, %k0
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kmovw %k0, %edx
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $0, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %edi, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $4, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $5, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r8d, %xmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0
; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 {%k1}
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $7, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r8d
-; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $6, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r9d
-; NoVLX-NEXT: kshiftrw $3, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
; NoVLX-NEXT: kshiftrw $4, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %esi
-; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %edi
-; NoVLX-NEXT: kshiftrw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kmovw %k0, %ecx
-; NoVLX-NEXT: kshiftrw $7, %k0, %k0
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kmovw %k0, %edx
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $0, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %edi, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $4, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $5, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r8d, %xmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
; NoVLX-NEXT: kmovw %edi, %k0
; NoVLX-NEXT: kshiftrw $3, %k0, %k1
-; NoVLX-NEXT: kshiftrw $2, %k0, %k2
-; NoVLX-NEXT: kshiftrw $1, %k0, %k3
-; NoVLX-NEXT: kmovw %k3, %eax
-; NoVLX-NEXT: kmovw %k0, %ecx
-; NoVLX-NEXT: vmovd %ecx, %xmm1
-; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
-; NoVLX-NEXT: kmovw %k2, %eax
-; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
-; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
-; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
-; NoVLX-NEXT: kmovw %eax, %k0
-; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
-; NoVLX-NEXT: andl $1, %eax
-; NoVLX-NEXT: kmovw %eax, %k1
-; NoVLX-NEXT: kxorw %k0, %k0, %k2
-; NoVLX-NEXT: kshiftrw $1, %k2, %k2
-; NoVLX-NEXT: kshiftlw $1, %k2, %k2
-; NoVLX-NEXT: korw %k1, %k2, %k1
-; NoVLX-NEXT: kshiftrw $1, %k1, %k2
-; NoVLX-NEXT: kxorw %k0, %k2, %k0
-; NoVLX-NEXT: kshiftlw $15, %k0, %k0
-; NoVLX-NEXT: kshiftrw $14, %k0, %k0
-; NoVLX-NEXT: kxorw %k1, %k0, %k0
; NoVLX-NEXT: kshiftrw $2, %k0, %k1
-; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
-; NoVLX-NEXT: kmovw %eax, %k2
-; NoVLX-NEXT: kxorw %k2, %k1, %k1
-; NoVLX-NEXT: kshiftlw $15, %k1, %k1
-; NoVLX-NEXT: kshiftrw $13, %k1, %k1
-; NoVLX-NEXT: kxorw %k0, %k1, %k0
-; NoVLX-NEXT: kshiftrw $3, %k0, %k1
-; NoVLX-NEXT: vpextrb $12, %xmm0, %eax
-; NoVLX-NEXT: kmovw %eax, %k2
-; NoVLX-NEXT: kxorw %k2, %k1, %k1
-; NoVLX-NEXT: kshiftlw $15, %k1, %k1
-; NoVLX-NEXT: kshiftrw $12, %k1, %k1
-; NoVLX-NEXT: kxorw %k0, %k1, %k0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: # kill: def %al killed %al killed %eax
-; NoVLX-NEXT: retq
-entry:
- %0 = bitcast <2 x i64> %__a to <4 x i32>
- %1 = bitcast <2 x i64> %__b to <4 x i32>
- %2 = icmp eq <4 x i32> %0, %1
- %3 = bitcast i8 %__u to <8 x i1>
- %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
- %4 = and <4 x i1> %2, %extract.i
- %5 = shufflevector <4 x i1> %4, <4 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
- %6 = bitcast <8 x i1> %5 to i8
- ret i8 %6
-}
-
-define zeroext i8 @test_masked_vpcmpeqd_v4i1_v8i1_mask_mem(i8 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
-; VLX-LABEL: test_masked_vpcmpeqd_v4i1_v8i1_mask_mem:
-; VLX: # %bb.0: # %entry
-; VLX-NEXT: kmovd %edi, %k1
-; VLX-NEXT: vpcmpeqd (%rsi), %xmm0, %k0 {%k1}
-; VLX-NEXT: kmovd %k0, %eax
-; VLX-NEXT: # kill: def %al killed %al killed %eax
-; VLX-NEXT: retq
-;
-; NoVLX-LABEL: test_masked_vpcmpeqd_v4i1_v8i1_mask_mem:
-; NoVLX: # %bb.0: # %entry
-; NoVLX-NEXT: vpcmpeqd (%rsi), %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %edi, %k0
-; NoVLX-NEXT: kshiftrw $3, %k0, %k1
-; NoVLX-NEXT: kshiftrw $2, %k0, %k2
-; NoVLX-NEXT: kshiftrw $1, %k0, %k3
-; NoVLX-NEXT: kmovw %k3, %eax
-; NoVLX-NEXT: kmovw %k0, %ecx
-; NoVLX-NEXT: vmovd %ecx, %xmm1
-; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
-; NoVLX-NEXT: kmovw %k2, %eax
-; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
-; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kmovw %k0, %esi
+; NoVLX-NEXT: vmovd %esi, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %edx, %xmm1, %xmm1
+; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm1, %xmm1
; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <4 x i32>
- %load = load <2 x i64>, <2 x i64>* %__b
- %1 = bitcast <2 x i64> %load to <4 x i32>
- %2 = icmp eq <4 x i32> %0, %1
- %3 = bitcast i8 %__u to <8 x i1>
- %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
- %4 = and <4 x i1> %2, %extract.i
- %5 = shufflevector <4 x i1> %4, <4 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
- %6 = bitcast <8 x i1> %5 to i8
- ret i8 %6
-}
-
-
-define zeroext i8 @test_vpcmpeqd_v4i1_v8i1_mask_mem_b(<2 x i64> %__a, i32* %__b) local_unnamed_addr {
-; VLX-LABEL: test_vpcmpeqd_v4i1_v8i1_mask_mem_b:
-; VLX: # %bb.0: # %entry
-; VLX-NEXT: vpcmpeqd (%rdi){1to4}, %xmm0, %k0
-; VLX-NEXT: kmovd %k0, %eax
-; VLX-NEXT: # kill: def %al killed %al killed %eax
-; VLX-NEXT: retq
-;
-; NoVLX-LABEL: test_vpcmpeqd_v4i1_v8i1_mask_mem_b:
-; NoVLX: # %bb.0: # %entry
-; NoVLX-NEXT: vpbroadcastd (%rdi), %xmm1
-; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
-; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
-; NoVLX-NEXT: kmovw %eax, %k0
-; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
-; NoVLX-NEXT: andl $1, %eax
-; NoVLX-NEXT: kmovw %eax, %k1
-; NoVLX-NEXT: kxorw %k0, %k0, %k2
-; NoVLX-NEXT: kshiftrw $1, %k2, %k2
-; NoVLX-NEXT: kshiftlw $1, %k2, %k2
-; NoVLX-NEXT: korw %k1, %k2, %k1
-; NoVLX-NEXT: kshiftrw $1, %k1, %k2
-; NoVLX-NEXT: kxorw %k0, %k2, %k0
-; NoVLX-NEXT: kshiftlw $15, %k0, %k0
-; NoVLX-NEXT: kshiftrw $14, %k0, %k0
-; NoVLX-NEXT: kxorw %k1, %k0, %k0
-; NoVLX-NEXT: kshiftrw $2, %k0, %k1
-; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
-; NoVLX-NEXT: kmovw %eax, %k2
-; NoVLX-NEXT: kxorw %k2, %k1, %k1
-; NoVLX-NEXT: kshiftlw $15, %k1, %k1
-; NoVLX-NEXT: kshiftrw $13, %k1, %k1
-; NoVLX-NEXT: kxorw %k0, %k1, %k0
-; NoVLX-NEXT: kshiftrw $3, %k0, %k1
-; NoVLX-NEXT: vpextrb $12, %xmm0, %eax
-; NoVLX-NEXT: kmovw %eax, %k2
-; NoVLX-NEXT: kxorw %k2, %k1, %k1
-; NoVLX-NEXT: kshiftlw $15, %k1, %k1
-; NoVLX-NEXT: kshiftrw $12, %k1, %k1
-; NoVLX-NEXT: kxorw %k0, %k1, %k0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: # kill: def %al killed %al killed %eax
-; NoVLX-NEXT: retq
-entry:
- %0 = bitcast <2 x i64> %__a to <4 x i32>
+ %1 = bitcast <2 x i64> %__b to <4 x i32>
+ %2 = icmp eq <4 x i32> %0, %1
+ %3 = bitcast i8 %__u to <8 x i1>
+ %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %4 = and <4 x i1> %2, %extract.i
+ %5 = shufflevector <4 x i1> %4, <4 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ %6 = bitcast <8 x i1> %5 to i8
+ ret i8 %6
+}
+
+define zeroext i8 @test_masked_vpcmpeqd_v4i1_v8i1_mask_mem(i8 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
+; VLX-LABEL: test_masked_vpcmpeqd_v4i1_v8i1_mask_mem:
+; VLX: # %bb.0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpeqd (%rsi), %xmm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: def %al killed %al killed %eax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpeqd_v4i1_v8i1_mask_mem:
+; NoVLX: # %bb.0: # %entry
+; NoVLX-NEXT: vpcmpeqd (%rsi), %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %edi, %k0
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kmovw %k0, %esi
+; NoVLX-NEXT: vmovd %esi, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %edx, %xmm1, %xmm1
+; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm1, %xmm1
+; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: andl $1, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: kxorw %k0, %k0, %k2
+; NoVLX-NEXT: kshiftrw $1, %k2, %k2
+; NoVLX-NEXT: kshiftlw $1, %k2, %k2
+; NoVLX-NEXT: korw %k1, %k2, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k2
+; NoVLX-NEXT: kxorw %k0, %k2, %k0
+; NoVLX-NEXT: kshiftlw $15, %k0, %k0
+; NoVLX-NEXT: kshiftrw $14, %k0, %k0
+; NoVLX-NEXT: kxorw %k1, %k0, %k0
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k2
+; NoVLX-NEXT: kxorw %k2, %k1, %k1
+; NoVLX-NEXT: kshiftlw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $13, %k1, %k1
+; NoVLX-NEXT: kxorw %k0, %k1, %k0
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
+; NoVLX-NEXT: vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k2
+; NoVLX-NEXT: kxorw %k2, %k1, %k1
+; NoVLX-NEXT: kshiftlw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $12, %k1, %k1
+; NoVLX-NEXT: kxorw %k0, %k1, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: def %al killed %al killed %eax
+; NoVLX-NEXT: retq
+entry:
+ %0 = bitcast <2 x i64> %__a to <4 x i32>
+ %load = load <2 x i64>, <2 x i64>* %__b
+ %1 = bitcast <2 x i64> %load to <4 x i32>
+ %2 = icmp eq <4 x i32> %0, %1
+ %3 = bitcast i8 %__u to <8 x i1>
+ %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %4 = and <4 x i1> %2, %extract.i
+ %5 = shufflevector <4 x i1> %4, <4 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ %6 = bitcast <8 x i1> %5 to i8
+ ret i8 %6
+}
+
+
+define zeroext i8 @test_vpcmpeqd_v4i1_v8i1_mask_mem_b(<2 x i64> %__a, i32* %__b) local_unnamed_addr {
+; VLX-LABEL: test_vpcmpeqd_v4i1_v8i1_mask_mem_b:
+; VLX: # %bb.0: # %entry
+; VLX-NEXT: vpcmpeqd (%rdi){1to4}, %xmm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: def %al killed %al killed %eax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpeqd_v4i1_v8i1_mask_mem_b:
+; NoVLX: # %bb.0: # %entry
+; NoVLX-NEXT: vpbroadcastd (%rdi), %xmm1
+; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: andl $1, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: kxorw %k0, %k0, %k2
+; NoVLX-NEXT: kshiftrw $1, %k2, %k2
+; NoVLX-NEXT: kshiftlw $1, %k2, %k2
+; NoVLX-NEXT: korw %k1, %k2, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k2
+; NoVLX-NEXT: kxorw %k0, %k2, %k0
+; NoVLX-NEXT: kshiftlw $15, %k0, %k0
+; NoVLX-NEXT: kshiftrw $14, %k0, %k0
+; NoVLX-NEXT: kxorw %k1, %k0, %k0
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k2
+; NoVLX-NEXT: kxorw %k2, %k1, %k1
+; NoVLX-NEXT: kshiftlw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $13, %k1, %k1
+; NoVLX-NEXT: kxorw %k0, %k1, %k0
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
+; NoVLX-NEXT: vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k2
+; NoVLX-NEXT: kxorw %k2, %k1, %k1
+; NoVLX-NEXT: kshiftlw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $12, %k1, %k1
+; NoVLX-NEXT: kxorw %k0, %k1, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: def %al killed %al killed %eax
+; NoVLX-NEXT: retq
+entry:
+ %0 = bitcast <2 x i64> %__a to <4 x i32>
%load = load i32, i32* %__b
%vec = insertelement <4 x i32> undef, i32 %load, i32 0
%1 = shufflevector <4 x i32> %vec, <4 x i32> undef, <4 x i32> <i32 0, i32 0, i32 0, i32 0>
; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
; NoVLX-NEXT: kmovw %edi, %k0
; NoVLX-NEXT: kshiftrw $3, %k0, %k1
-; NoVLX-NEXT: kshiftrw $2, %k0, %k2
-; NoVLX-NEXT: kshiftrw $1, %k0, %k3
-; NoVLX-NEXT: kmovw %k3, %eax
-; NoVLX-NEXT: kmovw %k0, %ecx
-; NoVLX-NEXT: vmovd %ecx, %xmm1
-; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
-; NoVLX-NEXT: kmovw %k2, %eax
-; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kmovw %k0, %esi
+; NoVLX-NEXT: vmovd %esi, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %edx, %xmm1, %xmm1
+; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm1, %xmm1
; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0
; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
; NoVLX-NEXT: kmovw %edi, %k0
; NoVLX-NEXT: kshiftrw $3, %k0, %k1
-; NoVLX-NEXT: kshiftrw $2, %k0, %k2
-; NoVLX-NEXT: kshiftrw $1, %k0, %k3
-; NoVLX-NEXT: kmovw %k3, %eax
-; NoVLX-NEXT: kmovw %k0, %ecx
-; NoVLX-NEXT: vmovd %ecx, %xmm1
-; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
-; NoVLX-NEXT: kmovw %k2, %eax
-; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kmovw %k0, %esi
+; NoVLX-NEXT: vmovd %esi, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %edx, %xmm1, %xmm1
+; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm1, %xmm1
; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
; NoVLX-NEXT: vpcmpeqd (%rsi), %xmm0, %xmm0
; NoVLX-NEXT: kmovw %edi, %k0
; NoVLX-NEXT: kshiftrw $3, %k0, %k1
-; NoVLX-NEXT: kshiftrw $2, %k0, %k2
-; NoVLX-NEXT: kshiftrw $1, %k0, %k3
-; NoVLX-NEXT: kmovw %k3, %eax
-; NoVLX-NEXT: kmovw %k0, %ecx
-; NoVLX-NEXT: vmovd %ecx, %xmm1
-; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
-; NoVLX-NEXT: kmovw %k2, %eax
-; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kmovw %k0, %esi
+; NoVLX-NEXT: vmovd %esi, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %edx, %xmm1, %xmm1
+; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm1, %xmm1
; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
; NoVLX-NEXT: kmovw %edi, %k0
; NoVLX-NEXT: kshiftrw $3, %k0, %k1
-; NoVLX-NEXT: kshiftrw $2, %k0, %k2
-; NoVLX-NEXT: kshiftrw $1, %k0, %k3
-; NoVLX-NEXT: kmovw %k3, %eax
-; NoVLX-NEXT: kmovw %k0, %ecx
-; NoVLX-NEXT: vmovd %ecx, %xmm1
-; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
-; NoVLX-NEXT: kmovw %k2, %eax
-; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kmovw %k0, %esi
+; NoVLX-NEXT: vmovd %esi, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %edx, %xmm1, %xmm1
+; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm1, %xmm1
; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0
; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
; NoVLX-NEXT: kmovw %edi, %k0
; NoVLX-NEXT: kshiftrw $3, %k0, %k1
-; NoVLX-NEXT: kshiftrw $2, %k0, %k2
-; NoVLX-NEXT: kshiftrw $1, %k0, %k3
-; NoVLX-NEXT: kmovw %k3, %eax
-; NoVLX-NEXT: kmovw %k0, %ecx
-; NoVLX-NEXT: vmovd %ecx, %xmm1
-; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
-; NoVLX-NEXT: kmovw %k2, %eax
-; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kmovw %k0, %esi
+; NoVLX-NEXT: vmovd %esi, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %edx, %xmm1, %xmm1
+; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm1, %xmm1
; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; NoVLX-NEXT: vpcmpeqd (%rsi), %xmm0, %xmm0
; NoVLX-NEXT: kmovw %edi, %k0
; NoVLX-NEXT: kshiftrw $3, %k0, %k1
-; NoVLX-NEXT: kshiftrw $2, %k0, %k2
-; NoVLX-NEXT: kshiftrw $1, %k0, %k3
-; NoVLX-NEXT: kmovw %k3, %eax
-; NoVLX-NEXT: kmovw %k0, %ecx
-; NoVLX-NEXT: vmovd %ecx, %xmm1
-; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
-; NoVLX-NEXT: kmovw %k2, %eax
-; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kmovw %k0, %esi
+; NoVLX-NEXT: vmovd %esi, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %edx, %xmm1, %xmm1
+; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm1, %xmm1
; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
; NoVLX-NEXT: kmovw %edi, %k0
; NoVLX-NEXT: kshiftrw $3, %k0, %k1
-; NoVLX-NEXT: kshiftrw $2, %k0, %k2
-; NoVLX-NEXT: kshiftrw $1, %k0, %k3
-; NoVLX-NEXT: kmovw %k3, %eax
-; NoVLX-NEXT: kmovw %k0, %ecx
-; NoVLX-NEXT: vmovd %ecx, %xmm1
-; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
-; NoVLX-NEXT: kmovw %k2, %eax
-; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kmovw %k0, %esi
+; NoVLX-NEXT: vmovd %esi, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %edx, %xmm1, %xmm1
+; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm1, %xmm1
; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
; NoVLX-NEXT: kmovw %edi, %k0
; NoVLX-NEXT: kshiftrw $3, %k0, %k1
-; NoVLX-NEXT: kshiftrw $2, %k0, %k2
-; NoVLX-NEXT: kshiftrw $1, %k0, %k3
-; NoVLX-NEXT: kmovw %k3, %eax
-; NoVLX-NEXT: kmovw %k0, %ecx
-; NoVLX-NEXT: vmovd %ecx, %xmm1
-; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
-; NoVLX-NEXT: kmovw %k2, %eax
-; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kmovw %k0, %esi
+; NoVLX-NEXT: vmovd %esi, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %edx, %xmm1, %xmm1
+; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm1, %xmm1
; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
; NoVLX-NEXT: kxorw %k0, %k0, %k0
; NoVLX-NEXT: vpcmpeqd (%rsi), %xmm0, %xmm0
; NoVLX-NEXT: kmovw %edi, %k0
; NoVLX-NEXT: kshiftrw $3, %k0, %k1
-; NoVLX-NEXT: kshiftrw $2, %k0, %k2
-; NoVLX-NEXT: kshiftrw $1, %k0, %k3
-; NoVLX-NEXT: kmovw %k3, %eax
-; NoVLX-NEXT: kmovw %k0, %ecx
-; NoVLX-NEXT: vmovd %ecx, %xmm1
-; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
-; NoVLX-NEXT: kmovw %k2, %eax
-; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kmovw %k0, %esi
+; NoVLX-NEXT: vmovd %esi, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %edx, %xmm1, %xmm1
+; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm1, %xmm1
; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
; NoVLX-NEXT: kxorw %k0, %k0, %k0
; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
; NoVLX-NEXT: kmovw %edi, %k0
; NoVLX-NEXT: kshiftrw $3, %k0, %k1
-; NoVLX-NEXT: kshiftrw $2, %k0, %k2
-; NoVLX-NEXT: kshiftrw $1, %k0, %k3
-; NoVLX-NEXT: kmovw %k3, %eax
-; NoVLX-NEXT: kmovw %k0, %ecx
-; NoVLX-NEXT: vmovd %ecx, %xmm1
-; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
-; NoVLX-NEXT: kmovw %k2, %eax
-; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kmovw %k0, %esi
+; NoVLX-NEXT: vmovd %esi, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %edx, %xmm1, %xmm1
+; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm1, %xmm1
; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0
; NoVLX-NEXT: kxorw %k0, %k0, %k0
; NoVLX-NEXT: # kill: def %ymm1 killed %ymm1 def %zmm1
; NoVLX-NEXT: # kill: def %ymm0 killed %ymm0 def %zmm0
; NoVLX-NEXT: vpcmpeqd %zmm1, %zmm0, %k0
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $7, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r8d
-; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $6, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r9d
-; NoVLX-NEXT: kshiftrw $3, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
; NoVLX-NEXT: kshiftrw $4, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %esi
-; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %edi
-; NoVLX-NEXT: kshiftrw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kmovw %k0, %ecx
-; NoVLX-NEXT: kshiftrw $7, %k0, %k0
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kmovw %k0, %edx
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $0, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %edi, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $4, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $5, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r8d, %xmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: # kill: def %ymm0 killed %ymm0 def %zmm0
; NoVLX-NEXT: vmovdqa (%rdi), %ymm1
; NoVLX-NEXT: vpcmpeqd %zmm1, %zmm0, %k0
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $7, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r8d
-; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $6, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r9d
-; NoVLX-NEXT: kshiftrw $3, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
; NoVLX-NEXT: kshiftrw $4, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %esi
-; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %edi
-; NoVLX-NEXT: kshiftrw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kmovw %k0, %ecx
-; NoVLX-NEXT: kshiftrw $7, %k0, %k0
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kmovw %k0, %edx
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $0, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %edi, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $4, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $5, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r8d, %xmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: # kill: def %ymm0 killed %ymm0 def %zmm0
; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: vpcmpeqd %zmm1, %zmm0, %k0 {%k1}
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $7, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r8d
-; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $6, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r9d
-; NoVLX-NEXT: kshiftrw $3, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
; NoVLX-NEXT: kshiftrw $4, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %esi
-; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %edi
-; NoVLX-NEXT: kshiftrw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kmovw %k0, %ecx
-; NoVLX-NEXT: kshiftrw $7, %k0, %k0
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kmovw %k0, %edx
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $0, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %edi, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $4, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $5, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r8d, %xmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: vmovdqa (%rsi), %ymm1
; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: vpcmpeqd %zmm1, %zmm0, %k0 {%k1}
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $7, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r8d
-; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $6, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r9d
-; NoVLX-NEXT: kshiftrw $3, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
; NoVLX-NEXT: kshiftrw $4, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %esi
-; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %edi
-; NoVLX-NEXT: kshiftrw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kmovw %k0, %ecx
-; NoVLX-NEXT: kshiftrw $7, %k0, %k0
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kmovw %k0, %edx
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $0, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %edi, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $4, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $5, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r8d, %xmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: # kill: def %ymm0 killed %ymm0 def %zmm0
; NoVLX-NEXT: vpbroadcastd (%rdi), %ymm1
; NoVLX-NEXT: vpcmpeqd %zmm1, %zmm0, %k0
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $7, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r8d
-; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $6, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r9d
-; NoVLX-NEXT: kshiftrw $3, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
; NoVLX-NEXT: kshiftrw $4, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %esi
-; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %edi
-; NoVLX-NEXT: kshiftrw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kmovw %k0, %ecx
-; NoVLX-NEXT: kshiftrw $7, %k0, %k0
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kmovw %k0, %edx
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $0, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %edi, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $4, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $5, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r8d, %xmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: vpbroadcastd (%rsi), %ymm1
; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: vpcmpeqd %zmm1, %zmm0, %k0 {%k1}
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $7, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r8d
-; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $6, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r9d
-; NoVLX-NEXT: kshiftrw $3, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
; NoVLX-NEXT: kshiftrw $4, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %esi
-; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %edi
-; NoVLX-NEXT: kshiftrw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kmovw %k0, %ecx
-; NoVLX-NEXT: kshiftrw $7, %k0, %k0
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kmovw %k0, %edx
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $0, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %edi, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $4, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $5, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r8d, %xmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: # kill: def %ymm1 killed %ymm1 def %zmm1
; NoVLX-NEXT: # kill: def %ymm0 killed %ymm0 def %zmm0
; NoVLX-NEXT: vpcmpeqd %zmm1, %zmm0, %k0
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $7, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r8d
-; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $6, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r9d
-; NoVLX-NEXT: kshiftrw $3, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
; NoVLX-NEXT: kshiftrw $4, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %esi
-; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %edi
-; NoVLX-NEXT: kshiftrw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kmovw %k0, %ecx
-; NoVLX-NEXT: kshiftrw $7, %k0, %k0
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kmovw %k0, %edx
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $0, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %edi, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $4, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $5, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r8d, %xmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: # kill: def %ymm0 killed %ymm0 def %zmm0
; NoVLX-NEXT: vmovdqa (%rdi), %ymm1
; NoVLX-NEXT: vpcmpeqd %zmm1, %zmm0, %k0
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $7, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r8d
-; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $6, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r9d
-; NoVLX-NEXT: kshiftrw $3, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
; NoVLX-NEXT: kshiftrw $4, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %esi
-; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %edi
-; NoVLX-NEXT: kshiftrw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kmovw %k0, %ecx
-; NoVLX-NEXT: kshiftrw $7, %k0, %k0
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kmovw %k0, %edx
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $0, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %edi, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $4, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $5, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r8d, %xmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: # kill: def %ymm0 killed %ymm0 def %zmm0
; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: vpcmpeqd %zmm1, %zmm0, %k0 {%k1}
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $7, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r8d
-; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $6, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r9d
-; NoVLX-NEXT: kshiftrw $3, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
; NoVLX-NEXT: kshiftrw $4, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %esi
-; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %edi
-; NoVLX-NEXT: kshiftrw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kmovw %k0, %ecx
-; NoVLX-NEXT: kshiftrw $7, %k0, %k0
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kmovw %k0, %edx
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $0, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %edi, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $4, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $5, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r8d, %xmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: vmovdqa (%rsi), %ymm1
; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: vpcmpeqd %zmm1, %zmm0, %k0 {%k1}
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $7, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r8d
-; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $6, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r9d
-; NoVLX-NEXT: kshiftrw $3, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
; NoVLX-NEXT: kshiftrw $4, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %esi
-; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %edi
-; NoVLX-NEXT: kshiftrw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kmovw %k0, %ecx
-; NoVLX-NEXT: kshiftrw $7, %k0, %k0
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kmovw %k0, %edx
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $0, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %edi, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $4, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $5, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r8d, %xmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: # kill: def %ymm0 killed %ymm0 def %zmm0
; NoVLX-NEXT: vpbroadcastd (%rdi), %ymm1
; NoVLX-NEXT: vpcmpeqd %zmm1, %zmm0, %k0
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $7, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r8d
-; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $6, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r9d
-; NoVLX-NEXT: kshiftrw $3, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
; NoVLX-NEXT: kshiftrw $4, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %esi
-; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %edi
-; NoVLX-NEXT: kshiftrw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kmovw %k0, %ecx
-; NoVLX-NEXT: kshiftrw $7, %k0, %k0
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kmovw %k0, %edx
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $0, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %edi, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $4, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $5, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r8d, %xmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: vpbroadcastd (%rsi), %ymm1
; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: vpcmpeqd %zmm1, %zmm0, %k0 {%k1}
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $7, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r8d
-; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $6, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r9d
-; NoVLX-NEXT: kshiftrw $3, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
; NoVLX-NEXT: kshiftrw $4, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %esi
-; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %edi
-; NoVLX-NEXT: kshiftrw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kmovw %k0, %ecx
-; NoVLX-NEXT: kshiftrw $7, %k0, %k0
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kmovw %k0, %edx
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $0, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %edi, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $4, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $5, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r8d, %xmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
; NoVLX-NEXT: kmovw %edi, %k0
; NoVLX-NEXT: kshiftrw $3, %k0, %k1
-; NoVLX-NEXT: kshiftrw $2, %k0, %k2
-; NoVLX-NEXT: kshiftrw $1, %k0, %k3
-; NoVLX-NEXT: kmovw %k3, %eax
-; NoVLX-NEXT: kmovw %k0, %ecx
-; NoVLX-NEXT: vmovd %ecx, %xmm1
-; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
-; NoVLX-NEXT: kmovw %k2, %eax
-; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kmovw %k0, %esi
+; NoVLX-NEXT: vmovd %esi, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %edx, %xmm1, %xmm1
+; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm1, %xmm1
; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
; NoVLX-NEXT: kmovw %edi, %k0
; NoVLX-NEXT: kshiftrw $3, %k0, %k1
-; NoVLX-NEXT: kshiftrw $2, %k0, %k2
-; NoVLX-NEXT: kshiftrw $1, %k0, %k3
-; NoVLX-NEXT: kmovw %k3, %eax
-; NoVLX-NEXT: kmovw %k0, %ecx
-; NoVLX-NEXT: vmovd %ecx, %xmm1
-; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
-; NoVLX-NEXT: kmovw %k2, %eax
-; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kmovw %k0, %esi
+; NoVLX-NEXT: vmovd %esi, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %edx, %xmm1, %xmm1
+; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm1, %xmm1
; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
; NoVLX-NEXT: kmovw %edi, %k0
; NoVLX-NEXT: kshiftrw $3, %k0, %k1
-; NoVLX-NEXT: kshiftrw $2, %k0, %k2
-; NoVLX-NEXT: kshiftrw $1, %k0, %k3
-; NoVLX-NEXT: kmovw %k3, %eax
-; NoVLX-NEXT: kmovw %k0, %ecx
-; NoVLX-NEXT: vmovd %ecx, %xmm1
-; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
-; NoVLX-NEXT: kmovw %k2, %eax
-; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kmovw %k0, %esi
+; NoVLX-NEXT: vmovd %esi, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %edx, %xmm1, %xmm1
+; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm1, %xmm1
; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0
; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
; NoVLX-NEXT: kmovw %edi, %k0
; NoVLX-NEXT: kshiftrw $3, %k0, %k1
-; NoVLX-NEXT: kshiftrw $2, %k0, %k2
-; NoVLX-NEXT: kshiftrw $1, %k0, %k3
-; NoVLX-NEXT: kmovw %k3, %eax
-; NoVLX-NEXT: kmovw %k0, %ecx
-; NoVLX-NEXT: vmovd %ecx, %xmm1
-; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
-; NoVLX-NEXT: kmovw %k2, %eax
-; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kmovw %k0, %esi
+; NoVLX-NEXT: vmovd %esi, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %edx, %xmm1, %xmm1
+; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm1, %xmm1
; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
; NoVLX-NEXT: kmovw %edi, %k0
; NoVLX-NEXT: kshiftrw $3, %k0, %k1
-; NoVLX-NEXT: kshiftrw $2, %k0, %k2
-; NoVLX-NEXT: kshiftrw $1, %k0, %k3
-; NoVLX-NEXT: kmovw %k3, %eax
-; NoVLX-NEXT: kmovw %k0, %ecx
-; NoVLX-NEXT: vmovd %ecx, %xmm1
-; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
-; NoVLX-NEXT: kmovw %k2, %eax
-; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kmovw %k0, %esi
+; NoVLX-NEXT: vmovd %esi, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %edx, %xmm1, %xmm1
+; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm1, %xmm1
; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
; NoVLX-NEXT: kmovw %edi, %k0
; NoVLX-NEXT: kshiftrw $3, %k0, %k1
-; NoVLX-NEXT: kshiftrw $2, %k0, %k2
-; NoVLX-NEXT: kshiftrw $1, %k0, %k3
-; NoVLX-NEXT: kmovw %k3, %eax
-; NoVLX-NEXT: kmovw %k0, %ecx
-; NoVLX-NEXT: vmovd %ecx, %xmm1
-; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
-; NoVLX-NEXT: kmovw %k2, %eax
-; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kmovw %k0, %esi
+; NoVLX-NEXT: vmovd %esi, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %edx, %xmm1, %xmm1
+; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm1, %xmm1
; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0
; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
; NoVLX-NEXT: kmovw %edi, %k0
; NoVLX-NEXT: kshiftrw $3, %k0, %k1
-; NoVLX-NEXT: kshiftrw $2, %k0, %k2
-; NoVLX-NEXT: kshiftrw $1, %k0, %k3
-; NoVLX-NEXT: kmovw %k3, %eax
-; NoVLX-NEXT: kmovw %k0, %ecx
-; NoVLX-NEXT: vmovd %ecx, %xmm1
-; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
-; NoVLX-NEXT: kmovw %k2, %eax
-; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kmovw %k0, %esi
+; NoVLX-NEXT: vmovd %esi, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %edx, %xmm1, %xmm1
+; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm1, %xmm1
; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
; NoVLX-NEXT: kmovw %edi, %k0
; NoVLX-NEXT: kshiftrw $3, %k0, %k1
-; NoVLX-NEXT: kshiftrw $2, %k0, %k2
-; NoVLX-NEXT: kshiftrw $1, %k0, %k3
-; NoVLX-NEXT: kmovw %k3, %eax
-; NoVLX-NEXT: kmovw %k0, %ecx
-; NoVLX-NEXT: vmovd %ecx, %xmm1
-; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
-; NoVLX-NEXT: kmovw %k2, %eax
-; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kmovw %k0, %esi
+; NoVLX-NEXT: vmovd %esi, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %edx, %xmm1, %xmm1
+; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm1, %xmm1
; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
; NoVLX-NEXT: kmovw %edi, %k0
; NoVLX-NEXT: kshiftrw $3, %k0, %k1
-; NoVLX-NEXT: kshiftrw $2, %k0, %k2
-; NoVLX-NEXT: kshiftrw $1, %k0, %k3
-; NoVLX-NEXT: kmovw %k3, %eax
-; NoVLX-NEXT: kmovw %k0, %ecx
-; NoVLX-NEXT: vmovd %ecx, %xmm1
-; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
-; NoVLX-NEXT: kmovw %k2, %eax
-; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kmovw %k0, %esi
+; NoVLX-NEXT: vmovd %esi, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %edx, %xmm1, %xmm1
+; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm1, %xmm1
; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
; NoVLX-NEXT: kmovw %edi, %k0
; NoVLX-NEXT: kshiftrw $3, %k0, %k1
-; NoVLX-NEXT: kshiftrw $2, %k0, %k2
-; NoVLX-NEXT: kshiftrw $1, %k0, %k3
-; NoVLX-NEXT: kmovw %k3, %eax
-; NoVLX-NEXT: kmovw %k0, %ecx
-; NoVLX-NEXT: vmovd %ecx, %xmm1
-; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
-; NoVLX-NEXT: kmovw %k2, %eax
-; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kmovw %k0, %esi
+; NoVLX-NEXT: vmovd %esi, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %edx, %xmm1, %xmm1
+; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm1, %xmm1
; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
; NoVLX-NEXT: kxorw %k0, %k0, %k0
; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
; NoVLX-NEXT: kmovw %edi, %k0
; NoVLX-NEXT: kshiftrw $3, %k0, %k1
-; NoVLX-NEXT: kshiftrw $2, %k0, %k2
-; NoVLX-NEXT: kshiftrw $1, %k0, %k3
-; NoVLX-NEXT: kmovw %k3, %eax
-; NoVLX-NEXT: kmovw %k0, %ecx
-; NoVLX-NEXT: vmovd %ecx, %xmm1
-; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
-; NoVLX-NEXT: kmovw %k2, %eax
-; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kmovw %k0, %esi
+; NoVLX-NEXT: vmovd %esi, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %edx, %xmm1, %xmm1
+; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm1, %xmm1
; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
; NoVLX-NEXT: kxorw %k0, %k0, %k0
; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
; NoVLX-NEXT: kmovw %edi, %k0
; NoVLX-NEXT: kshiftrw $3, %k0, %k1
-; NoVLX-NEXT: kshiftrw $2, %k0, %k2
-; NoVLX-NEXT: kshiftrw $1, %k0, %k3
-; NoVLX-NEXT: kmovw %k3, %eax
-; NoVLX-NEXT: kmovw %k0, %ecx
-; NoVLX-NEXT: vmovd %ecx, %xmm1
-; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
-; NoVLX-NEXT: kmovw %k2, %eax
-; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kmovw %k0, %esi
+; NoVLX-NEXT: vmovd %esi, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %edx, %xmm1, %xmm1
+; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm1, %xmm1
; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0
; NoVLX-NEXT: kxorw %k0, %k0, %k0
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
; NoVLX-NEXT: vpcmpeqq %zmm1, %zmm0, %k0
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $7, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r8d
-; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $6, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r9d
-; NoVLX-NEXT: kshiftrw $3, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
; NoVLX-NEXT: kshiftrw $4, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %esi
-; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %edi
-; NoVLX-NEXT: kshiftrw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kmovw %k0, %ecx
-; NoVLX-NEXT: kshiftrw $7, %k0, %k0
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kmovw %k0, %edx
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $0, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %edi, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $4, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $5, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r8d, %xmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
; NoVLX-NEXT: vpcmpeqq (%rdi), %zmm0, %k0
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $7, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r8d
-; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $6, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r9d
-; NoVLX-NEXT: kshiftrw $3, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
; NoVLX-NEXT: kshiftrw $4, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %esi
-; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %edi
-; NoVLX-NEXT: kshiftrw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kmovw %k0, %ecx
-; NoVLX-NEXT: kshiftrw $7, %k0, %k0
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kmovw %k0, %edx
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $0, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %edi, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $4, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $5, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r8d, %xmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: subq $32, %rsp
; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: vpcmpeqq %zmm1, %zmm0, %k0 {%k1}
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $7, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r8d
-; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $6, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r9d
-; NoVLX-NEXT: kshiftrw $3, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
; NoVLX-NEXT: kshiftrw $4, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %esi
-; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %edi
-; NoVLX-NEXT: kshiftrw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kmovw %k0, %ecx
-; NoVLX-NEXT: kshiftrw $7, %k0, %k0
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kmovw %k0, %edx
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $0, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %edi, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $4, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $5, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r8d, %xmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: subq $32, %rsp
; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: vpcmpeqq (%rsi), %zmm0, %k0 {%k1}
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $7, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r8d
-; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $6, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r9d
-; NoVLX-NEXT: kshiftrw $3, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
; NoVLX-NEXT: kshiftrw $4, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %esi
-; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %edi
-; NoVLX-NEXT: kshiftrw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kmovw %k0, %ecx
-; NoVLX-NEXT: kshiftrw $7, %k0, %k0
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kmovw %k0, %edx
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $0, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %edi, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $4, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $5, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r8d, %xmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
; NoVLX-NEXT: vpcmpeqq (%rdi){1to8}, %zmm0, %k0
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $7, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r8d
-; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $6, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r9d
-; NoVLX-NEXT: kshiftrw $3, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
; NoVLX-NEXT: kshiftrw $4, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %esi
-; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %edi
-; NoVLX-NEXT: kshiftrw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kmovw %k0, %ecx
-; NoVLX-NEXT: kshiftrw $7, %k0, %k0
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kmovw %k0, %edx
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $0, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %edi, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $4, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $5, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r8d, %xmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: subq $32, %rsp
; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: vpcmpeqq (%rsi){1to8}, %zmm0, %k0 {%k1}
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $7, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r8d
-; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $6, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r9d
-; NoVLX-NEXT: kshiftrw $3, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
; NoVLX-NEXT: kshiftrw $4, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %esi
-; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %edi
-; NoVLX-NEXT: kshiftrw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kmovw %k0, %ecx
-; NoVLX-NEXT: kshiftrw $7, %k0, %k0
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kmovw %k0, %edx
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $0, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %edi, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $4, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $5, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r8d, %xmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
; NoVLX-NEXT: vpcmpeqq %zmm1, %zmm0, %k0
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $7, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r8d
-; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $6, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r9d
-; NoVLX-NEXT: kshiftrw $3, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
; NoVLX-NEXT: kshiftrw $4, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %esi
-; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %edi
-; NoVLX-NEXT: kshiftrw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kmovw %k0, %ecx
-; NoVLX-NEXT: kshiftrw $7, %k0, %k0
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kmovw %k0, %edx
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $0, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %edi, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $4, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $5, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r8d, %xmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
; NoVLX-NEXT: vpcmpeqq (%rdi), %zmm0, %k0
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $7, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r8d
-; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $6, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r9d
-; NoVLX-NEXT: kshiftrw $3, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
; NoVLX-NEXT: kshiftrw $4, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %esi
-; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %edi
-; NoVLX-NEXT: kshiftrw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kmovw %k0, %ecx
-; NoVLX-NEXT: kshiftrw $7, %k0, %k0
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kmovw %k0, %edx
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $0, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %edi, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $4, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $5, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r8d, %xmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: subq $64, %rsp
; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: vpcmpeqq %zmm1, %zmm0, %k0 {%k1}
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $7, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r8d
-; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $6, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r9d
-; NoVLX-NEXT: kshiftrw $3, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
; NoVLX-NEXT: kshiftrw $4, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %esi
-; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %edi
-; NoVLX-NEXT: kshiftrw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kmovw %k0, %ecx
-; NoVLX-NEXT: kshiftrw $7, %k0, %k0
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kmovw %k0, %edx
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $0, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %edi, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $4, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $5, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r8d, %xmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: subq $64, %rsp
; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: vpcmpeqq (%rsi), %zmm0, %k0 {%k1}
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $7, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r8d
-; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $6, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r9d
-; NoVLX-NEXT: kshiftrw $3, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
; NoVLX-NEXT: kshiftrw $4, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %esi
-; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %edi
-; NoVLX-NEXT: kshiftrw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kmovw %k0, %ecx
-; NoVLX-NEXT: kshiftrw $7, %k0, %k0
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kmovw %k0, %edx
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $0, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %edi, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $4, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $5, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r8d, %xmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
; NoVLX-NEXT: vpcmpeqq (%rdi){1to8}, %zmm0, %k0
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $7, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r8d
-; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $6, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r9d
-; NoVLX-NEXT: kshiftrw $3, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
; NoVLX-NEXT: kshiftrw $4, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %esi
-; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %edi
-; NoVLX-NEXT: kshiftrw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kmovw %k0, %ecx
-; NoVLX-NEXT: kshiftrw $7, %k0, %k0
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kmovw %k0, %edx
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $0, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %edi, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $4, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $5, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r8d, %xmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: subq $64, %rsp
; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: vpcmpeqq (%rsi){1to8}, %zmm0, %k0 {%k1}
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $7, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r8d
-; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $6, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r9d
-; NoVLX-NEXT: kshiftrw $3, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
; NoVLX-NEXT: kshiftrw $4, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %esi
-; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %edi
-; NoVLX-NEXT: kshiftrw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kmovw %k0, %ecx
-; NoVLX-NEXT: kshiftrw $7, %k0, %k0
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kmovw %k0, %edx
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $0, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %edi, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $4, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $5, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r8d, %xmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0
; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0
; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $7, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r8d
-; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $6, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r9d
-; NoVLX-NEXT: kshiftrw $3, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
; NoVLX-NEXT: kshiftrw $4, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %esi
-; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %edi
-; NoVLX-NEXT: kshiftrw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kmovw %k0, %ecx
-; NoVLX-NEXT: kshiftrw $7, %k0, %k0
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kmovw %k0, %edx
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $0, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %edi, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $4, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $5, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r8d, %xmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0
; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0
; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $7, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r8d
-; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $6, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r9d
-; NoVLX-NEXT: kshiftrw $3, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
; NoVLX-NEXT: kshiftrw $4, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %esi
-; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %edi
-; NoVLX-NEXT: kshiftrw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kmovw %k0, %ecx
-; NoVLX-NEXT: kshiftrw $7, %k0, %k0
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kmovw %k0, %edx
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $0, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %edi, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $4, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $5, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r8d, %xmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0
; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 {%k1}
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $7, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r8d
-; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $6, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r9d
-; NoVLX-NEXT: kshiftrw $3, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
; NoVLX-NEXT: kshiftrw $4, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %esi
-; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %edi
-; NoVLX-NEXT: kshiftrw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kmovw %k0, %ecx
-; NoVLX-NEXT: kshiftrw $7, %k0, %k0
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kmovw %k0, %edx
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $0, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %edi, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $4, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $5, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r8d, %xmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0
; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 {%k1}
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $7, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r8d
-; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $6, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r9d
-; NoVLX-NEXT: kshiftrw $3, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
; NoVLX-NEXT: kshiftrw $4, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %esi
-; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %edi
-; NoVLX-NEXT: kshiftrw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kmovw %k0, %ecx
-; NoVLX-NEXT: kshiftrw $7, %k0, %k0
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kmovw %k0, %edx
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $0, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %edi, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $4, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $5, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r8d, %xmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0
; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0
; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $7, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r8d
-; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $6, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r9d
-; NoVLX-NEXT: kshiftrw $3, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
; NoVLX-NEXT: kshiftrw $4, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %esi
-; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %edi
-; NoVLX-NEXT: kshiftrw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kmovw %k0, %ecx
-; NoVLX-NEXT: kshiftrw $7, %k0, %k0
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kmovw %k0, %edx
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $0, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %edi, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $4, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $5, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r8d, %xmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0
; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0
; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $7, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r8d
-; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $6, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r9d
-; NoVLX-NEXT: kshiftrw $3, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
; NoVLX-NEXT: kshiftrw $4, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %esi
-; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %edi
-; NoVLX-NEXT: kshiftrw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kmovw %k0, %ecx
-; NoVLX-NEXT: kshiftrw $7, %k0, %k0
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kmovw %k0, %edx
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $0, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %edi, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $4, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $5, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r8d, %xmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0
; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 {%k1}
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $7, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r8d
-; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $6, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r9d
-; NoVLX-NEXT: kshiftrw $3, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
; NoVLX-NEXT: kshiftrw $4, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %esi
-; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %edi
-; NoVLX-NEXT: kshiftrw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kmovw %k0, %ecx
-; NoVLX-NEXT: kshiftrw $7, %k0, %k0
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kmovw %k0, %edx
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $0, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %edi, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $4, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $5, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r8d, %xmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0
; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 {%k1}
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $7, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r8d
-; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $6, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r9d
-; NoVLX-NEXT: kshiftrw $3, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
; NoVLX-NEXT: kshiftrw $4, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %esi
-; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %edi
-; NoVLX-NEXT: kshiftrw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kmovw %k0, %ecx
-; NoVLX-NEXT: kshiftrw $7, %k0, %k0
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kmovw %k0, %edx
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $0, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %edi, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $4, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $5, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r8d, %xmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0
; NoVLX-NEXT: kmovw %edi, %k0
; NoVLX-NEXT: kshiftrw $3, %k0, %k1
-; NoVLX-NEXT: kshiftrw $2, %k0, %k2
-; NoVLX-NEXT: kshiftrw $1, %k0, %k3
-; NoVLX-NEXT: kmovw %k3, %eax
-; NoVLX-NEXT: kmovw %k0, %ecx
-; NoVLX-NEXT: vmovd %ecx, %xmm1
-; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
-; NoVLX-NEXT: kmovw %k2, %eax
-; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kmovw %k0, %esi
+; NoVLX-NEXT: vmovd %esi, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %edx, %xmm1, %xmm1
+; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm1, %xmm1
; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
; NoVLX-NEXT: vpcmpgtd (%rsi), %xmm0, %xmm0
; NoVLX-NEXT: kmovw %edi, %k0
; NoVLX-NEXT: kshiftrw $3, %k0, %k1
-; NoVLX-NEXT: kshiftrw $2, %k0, %k2
-; NoVLX-NEXT: kshiftrw $1, %k0, %k3
-; NoVLX-NEXT: kmovw %k3, %eax
-; NoVLX-NEXT: kmovw %k0, %ecx
-; NoVLX-NEXT: vmovd %ecx, %xmm1
-; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
-; NoVLX-NEXT: kmovw %k2, %eax
-; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kmovw %k0, %esi
+; NoVLX-NEXT: vmovd %esi, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %edx, %xmm1, %xmm1
+; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm1, %xmm1
; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
; NoVLX-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0
; NoVLX-NEXT: kmovw %edi, %k0
; NoVLX-NEXT: kshiftrw $3, %k0, %k1
-; NoVLX-NEXT: kshiftrw $2, %k0, %k2
-; NoVLX-NEXT: kshiftrw $1, %k0, %k3
-; NoVLX-NEXT: kmovw %k3, %eax
-; NoVLX-NEXT: kmovw %k0, %ecx
-; NoVLX-NEXT: vmovd %ecx, %xmm1
-; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
-; NoVLX-NEXT: kmovw %k2, %eax
-; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kmovw %k0, %esi
+; NoVLX-NEXT: vmovd %esi, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %edx, %xmm1, %xmm1
+; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm1, %xmm1
; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0
; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
; NoVLX-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0
; NoVLX-NEXT: kmovw %edi, %k0
; NoVLX-NEXT: kshiftrw $3, %k0, %k1
-; NoVLX-NEXT: kshiftrw $2, %k0, %k2
-; NoVLX-NEXT: kshiftrw $1, %k0, %k3
-; NoVLX-NEXT: kmovw %k3, %eax
-; NoVLX-NEXT: kmovw %k0, %ecx
-; NoVLX-NEXT: vmovd %ecx, %xmm1
-; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
-; NoVLX-NEXT: kmovw %k2, %eax
-; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kmovw %k0, %esi
+; NoVLX-NEXT: vmovd %esi, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %edx, %xmm1, %xmm1
+; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm1, %xmm1
; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
; NoVLX-NEXT: vpcmpgtd (%rsi), %xmm0, %xmm0
; NoVLX-NEXT: kmovw %edi, %k0
; NoVLX-NEXT: kshiftrw $3, %k0, %k1
-; NoVLX-NEXT: kshiftrw $2, %k0, %k2
-; NoVLX-NEXT: kshiftrw $1, %k0, %k3
-; NoVLX-NEXT: kmovw %k3, %eax
-; NoVLX-NEXT: kmovw %k0, %ecx
-; NoVLX-NEXT: vmovd %ecx, %xmm1
-; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
-; NoVLX-NEXT: kmovw %k2, %eax
-; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kmovw %k0, %esi
+; NoVLX-NEXT: vmovd %esi, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %edx, %xmm1, %xmm1
+; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm1, %xmm1
; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
; NoVLX-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0
; NoVLX-NEXT: kmovw %edi, %k0
; NoVLX-NEXT: kshiftrw $3, %k0, %k1
-; NoVLX-NEXT: kshiftrw $2, %k0, %k2
-; NoVLX-NEXT: kshiftrw $1, %k0, %k3
-; NoVLX-NEXT: kmovw %k3, %eax
-; NoVLX-NEXT: kmovw %k0, %ecx
-; NoVLX-NEXT: vmovd %ecx, %xmm1
-; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
-; NoVLX-NEXT: kmovw %k2, %eax
-; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kmovw %k0, %esi
+; NoVLX-NEXT: vmovd %esi, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %edx, %xmm1, %xmm1
+; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm1, %xmm1
; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0
; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
; NoVLX-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0
; NoVLX-NEXT: kmovw %edi, %k0
; NoVLX-NEXT: kshiftrw $3, %k0, %k1
-; NoVLX-NEXT: kshiftrw $2, %k0, %k2
-; NoVLX-NEXT: kshiftrw $1, %k0, %k3
-; NoVLX-NEXT: kmovw %k3, %eax
-; NoVLX-NEXT: kmovw %k0, %ecx
-; NoVLX-NEXT: vmovd %ecx, %xmm1
-; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
-; NoVLX-NEXT: kmovw %k2, %eax
-; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kmovw %k0, %esi
+; NoVLX-NEXT: vmovd %esi, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %edx, %xmm1, %xmm1
+; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm1, %xmm1
; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; NoVLX-NEXT: vpcmpgtd (%rsi), %xmm0, %xmm0
; NoVLX-NEXT: kmovw %edi, %k0
; NoVLX-NEXT: kshiftrw $3, %k0, %k1
-; NoVLX-NEXT: kshiftrw $2, %k0, %k2
-; NoVLX-NEXT: kshiftrw $1, %k0, %k3
-; NoVLX-NEXT: kmovw %k3, %eax
-; NoVLX-NEXT: kmovw %k0, %ecx
-; NoVLX-NEXT: vmovd %ecx, %xmm1
-; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
-; NoVLX-NEXT: kmovw %k2, %eax
-; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kmovw %k0, %esi
+; NoVLX-NEXT: vmovd %esi, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %edx, %xmm1, %xmm1
+; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm1, %xmm1
; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; NoVLX-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0
; NoVLX-NEXT: kmovw %edi, %k0
; NoVLX-NEXT: kshiftrw $3, %k0, %k1
-; NoVLX-NEXT: kshiftrw $2, %k0, %k2
-; NoVLX-NEXT: kshiftrw $1, %k0, %k3
-; NoVLX-NEXT: kmovw %k3, %eax
-; NoVLX-NEXT: kmovw %k0, %ecx
-; NoVLX-NEXT: vmovd %ecx, %xmm1
-; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
-; NoVLX-NEXT: kmovw %k2, %eax
-; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kmovw %k0, %esi
+; NoVLX-NEXT: vmovd %esi, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %edx, %xmm1, %xmm1
+; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm1, %xmm1
; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; NoVLX-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0
; NoVLX-NEXT: kmovw %edi, %k0
; NoVLX-NEXT: kshiftrw $3, %k0, %k1
-; NoVLX-NEXT: kshiftrw $2, %k0, %k2
-; NoVLX-NEXT: kshiftrw $1, %k0, %k3
-; NoVLX-NEXT: kmovw %k3, %eax
-; NoVLX-NEXT: kmovw %k0, %ecx
-; NoVLX-NEXT: vmovd %ecx, %xmm1
-; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
-; NoVLX-NEXT: kmovw %k2, %eax
-; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kmovw %k0, %esi
+; NoVLX-NEXT: vmovd %esi, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %edx, %xmm1, %xmm1
+; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm1, %xmm1
; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
; NoVLX-NEXT: kxorw %k0, %k0, %k0
; NoVLX-NEXT: vpcmpgtd (%rsi), %xmm0, %xmm0
; NoVLX-NEXT: kmovw %edi, %k0
; NoVLX-NEXT: kshiftrw $3, %k0, %k1
-; NoVLX-NEXT: kshiftrw $2, %k0, %k2
-; NoVLX-NEXT: kshiftrw $1, %k0, %k3
-; NoVLX-NEXT: kmovw %k3, %eax
-; NoVLX-NEXT: kmovw %k0, %ecx
-; NoVLX-NEXT: vmovd %ecx, %xmm1
-; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
-; NoVLX-NEXT: kmovw %k2, %eax
-; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kmovw %k0, %esi
+; NoVLX-NEXT: vmovd %esi, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %edx, %xmm1, %xmm1
+; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm1, %xmm1
; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
; NoVLX-NEXT: kxorw %k0, %k0, %k0
; NoVLX-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0
; NoVLX-NEXT: kmovw %edi, %k0
; NoVLX-NEXT: kshiftrw $3, %k0, %k1
-; NoVLX-NEXT: kshiftrw $2, %k0, %k2
-; NoVLX-NEXT: kshiftrw $1, %k0, %k3
-; NoVLX-NEXT: kmovw %k3, %eax
-; NoVLX-NEXT: kmovw %k0, %ecx
-; NoVLX-NEXT: vmovd %ecx, %xmm1
-; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
-; NoVLX-NEXT: kmovw %k2, %eax
-; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kmovw %k0, %esi
+; NoVLX-NEXT: vmovd %esi, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %edx, %xmm1, %xmm1
+; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm1, %xmm1
; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0
; NoVLX-NEXT: kxorw %k0, %k0, %k0
; NoVLX-NEXT: # kill: def %ymm1 killed %ymm1 def %zmm1
; NoVLX-NEXT: # kill: def %ymm0 killed %ymm0 def %zmm0
; NoVLX-NEXT: vpcmpgtd %zmm1, %zmm0, %k0
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $7, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r8d
-; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $6, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r9d
-; NoVLX-NEXT: kshiftrw $3, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
; NoVLX-NEXT: kshiftrw $4, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %esi
-; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %edi
-; NoVLX-NEXT: kshiftrw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kmovw %k0, %ecx
-; NoVLX-NEXT: kshiftrw $7, %k0, %k0
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kmovw %k0, %edx
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $0, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %edi, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $4, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $5, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r8d, %xmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: # kill: def %ymm0 killed %ymm0 def %zmm0
; NoVLX-NEXT: vmovdqa (%rdi), %ymm1
; NoVLX-NEXT: vpcmpgtd %zmm1, %zmm0, %k0
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $7, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r8d
-; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $6, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r9d
-; NoVLX-NEXT: kshiftrw $3, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
; NoVLX-NEXT: kshiftrw $4, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %esi
-; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %edi
-; NoVLX-NEXT: kshiftrw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kmovw %k0, %ecx
-; NoVLX-NEXT: kshiftrw $7, %k0, %k0
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kmovw %k0, %edx
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $0, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %edi, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $4, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $5, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r8d, %xmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: # kill: def %ymm0 killed %ymm0 def %zmm0
; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: vpcmpgtd %zmm1, %zmm0, %k0 {%k1}
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $7, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r8d
-; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $6, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r9d
-; NoVLX-NEXT: kshiftrw $3, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
; NoVLX-NEXT: kshiftrw $4, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %esi
-; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %edi
-; NoVLX-NEXT: kshiftrw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kmovw %k0, %ecx
-; NoVLX-NEXT: kshiftrw $7, %k0, %k0
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kmovw %k0, %edx
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $0, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %edi, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $4, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $5, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r8d, %xmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: vmovdqa (%rsi), %ymm1
; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: vpcmpgtd %zmm1, %zmm0, %k0 {%k1}
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $7, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r8d
-; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $6, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r9d
-; NoVLX-NEXT: kshiftrw $3, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
; NoVLX-NEXT: kshiftrw $4, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %esi
-; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %edi
-; NoVLX-NEXT: kshiftrw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kmovw %k0, %ecx
-; NoVLX-NEXT: kshiftrw $7, %k0, %k0
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kmovw %k0, %edx
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $0, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %edi, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $4, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $5, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r8d, %xmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: # kill: def %ymm0 killed %ymm0 def %zmm0
; NoVLX-NEXT: vpbroadcastd (%rdi), %ymm1
; NoVLX-NEXT: vpcmpgtd %zmm1, %zmm0, %k0
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $7, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r8d
-; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $6, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r9d
-; NoVLX-NEXT: kshiftrw $3, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
; NoVLX-NEXT: kshiftrw $4, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %esi
-; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %edi
-; NoVLX-NEXT: kshiftrw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kmovw %k0, %ecx
-; NoVLX-NEXT: kshiftrw $7, %k0, %k0
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kmovw %k0, %edx
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $0, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %edi, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $4, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $5, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r8d, %xmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: vpbroadcastd (%rsi), %ymm1
; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: vpcmpgtd %zmm1, %zmm0, %k0 {%k1}
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $7, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r8d
-; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $6, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r9d
-; NoVLX-NEXT: kshiftrw $3, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
; NoVLX-NEXT: kshiftrw $4, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %esi
-; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %edi
-; NoVLX-NEXT: kshiftrw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kmovw %k0, %ecx
-; NoVLX-NEXT: kshiftrw $7, %k0, %k0
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kmovw %k0, %edx
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $0, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %edi, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $4, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $5, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r8d, %xmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: # kill: def %ymm1 killed %ymm1 def %zmm1
; NoVLX-NEXT: # kill: def %ymm0 killed %ymm0 def %zmm0
; NoVLX-NEXT: vpcmpgtd %zmm1, %zmm0, %k0
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $7, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r8d
-; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $6, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r9d
-; NoVLX-NEXT: kshiftrw $3, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
; NoVLX-NEXT: kshiftrw $4, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %esi
-; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %edi
-; NoVLX-NEXT: kshiftrw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kmovw %k0, %ecx
-; NoVLX-NEXT: kshiftrw $7, %k0, %k0
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kmovw %k0, %edx
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $0, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %edi, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $4, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $5, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r8d, %xmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: # kill: def %ymm0 killed %ymm0 def %zmm0
; NoVLX-NEXT: vmovdqa (%rdi), %ymm1
; NoVLX-NEXT: vpcmpgtd %zmm1, %zmm0, %k0
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $7, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r8d
-; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $6, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r9d
-; NoVLX-NEXT: kshiftrw $3, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
; NoVLX-NEXT: kshiftrw $4, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %esi
-; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %edi
-; NoVLX-NEXT: kshiftrw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kmovw %k0, %ecx
-; NoVLX-NEXT: kshiftrw $7, %k0, %k0
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kmovw %k0, %edx
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $0, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %edi, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $4, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $5, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r8d, %xmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: # kill: def %ymm0 killed %ymm0 def %zmm0
; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: vpcmpgtd %zmm1, %zmm0, %k0 {%k1}
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $7, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r8d
-; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $6, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r9d
-; NoVLX-NEXT: kshiftrw $3, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
; NoVLX-NEXT: kshiftrw $4, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %esi
-; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %edi
-; NoVLX-NEXT: kshiftrw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kmovw %k0, %ecx
-; NoVLX-NEXT: kshiftrw $7, %k0, %k0
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kmovw %k0, %edx
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $0, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %edi, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $4, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $5, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r8d, %xmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: vmovdqa (%rsi), %ymm1
; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: vpcmpgtd %zmm1, %zmm0, %k0 {%k1}
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $7, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r8d
-; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $6, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r9d
-; NoVLX-NEXT: kshiftrw $3, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
; NoVLX-NEXT: kshiftrw $4, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %esi
-; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %edi
-; NoVLX-NEXT: kshiftrw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kmovw %k0, %ecx
-; NoVLX-NEXT: kshiftrw $7, %k0, %k0
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kmovw %k0, %edx
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $0, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %edi, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $4, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $5, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r8d, %xmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: # kill: def %ymm0 killed %ymm0 def %zmm0
; NoVLX-NEXT: vpbroadcastd (%rdi), %ymm1
; NoVLX-NEXT: vpcmpgtd %zmm1, %zmm0, %k0
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $7, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r8d
-; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $6, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r9d
-; NoVLX-NEXT: kshiftrw $3, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
; NoVLX-NEXT: kshiftrw $4, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %esi
-; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %edi
-; NoVLX-NEXT: kshiftrw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kmovw %k0, %ecx
-; NoVLX-NEXT: kshiftrw $7, %k0, %k0
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kmovw %k0, %edx
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $0, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %edi, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $4, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $5, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r8d, %xmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: vpbroadcastd (%rsi), %ymm1
; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: vpcmpgtd %zmm1, %zmm0, %k0 {%k1}
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $7, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r8d
-; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $6, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r9d
-; NoVLX-NEXT: kshiftrw $3, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
; NoVLX-NEXT: kshiftrw $4, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %esi
-; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %edi
-; NoVLX-NEXT: kshiftrw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kmovw %k0, %ecx
-; NoVLX-NEXT: kshiftrw $7, %k0, %k0
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kmovw %k0, %edx
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $0, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %edi, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $4, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $5, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r8d, %xmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
; NoVLX-NEXT: kmovw %edi, %k0
; NoVLX-NEXT: kshiftrw $3, %k0, %k1
-; NoVLX-NEXT: kshiftrw $2, %k0, %k2
-; NoVLX-NEXT: kshiftrw $1, %k0, %k3
-; NoVLX-NEXT: kmovw %k3, %eax
-; NoVLX-NEXT: kmovw %k0, %ecx
-; NoVLX-NEXT: vmovd %ecx, %xmm1
-; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
-; NoVLX-NEXT: kmovw %k2, %eax
-; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kmovw %k0, %esi
+; NoVLX-NEXT: vmovd %esi, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %edx, %xmm1, %xmm1
+; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm1, %xmm1
; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
; NoVLX-NEXT: kmovw %edi, %k0
; NoVLX-NEXT: kshiftrw $3, %k0, %k1
-; NoVLX-NEXT: kshiftrw $2, %k0, %k2
-; NoVLX-NEXT: kshiftrw $1, %k0, %k3
-; NoVLX-NEXT: kmovw %k3, %eax
-; NoVLX-NEXT: kmovw %k0, %ecx
-; NoVLX-NEXT: vmovd %ecx, %xmm1
-; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
-; NoVLX-NEXT: kmovw %k2, %eax
-; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kmovw %k0, %esi
+; NoVLX-NEXT: vmovd %esi, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %edx, %xmm1, %xmm1
+; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm1, %xmm1
; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
; NoVLX-NEXT: kmovw %edi, %k0
; NoVLX-NEXT: kshiftrw $3, %k0, %k1
-; NoVLX-NEXT: kshiftrw $2, %k0, %k2
-; NoVLX-NEXT: kshiftrw $1, %k0, %k3
-; NoVLX-NEXT: kmovw %k3, %eax
-; NoVLX-NEXT: kmovw %k0, %ecx
-; NoVLX-NEXT: vmovd %ecx, %xmm1
-; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
-; NoVLX-NEXT: kmovw %k2, %eax
-; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kmovw %k0, %esi
+; NoVLX-NEXT: vmovd %esi, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %edx, %xmm1, %xmm1
+; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm1, %xmm1
; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0
; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
; NoVLX-NEXT: kmovw %edi, %k0
; NoVLX-NEXT: kshiftrw $3, %k0, %k1
-; NoVLX-NEXT: kshiftrw $2, %k0, %k2
-; NoVLX-NEXT: kshiftrw $1, %k0, %k3
-; NoVLX-NEXT: kmovw %k3, %eax
-; NoVLX-NEXT: kmovw %k0, %ecx
-; NoVLX-NEXT: vmovd %ecx, %xmm1
-; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
-; NoVLX-NEXT: kmovw %k2, %eax
-; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kmovw %k0, %esi
+; NoVLX-NEXT: vmovd %esi, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %edx, %xmm1, %xmm1
+; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm1, %xmm1
; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
; NoVLX-NEXT: kmovw %edi, %k0
; NoVLX-NEXT: kshiftrw $3, %k0, %k1
-; NoVLX-NEXT: kshiftrw $2, %k0, %k2
-; NoVLX-NEXT: kshiftrw $1, %k0, %k3
-; NoVLX-NEXT: kmovw %k3, %eax
-; NoVLX-NEXT: kmovw %k0, %ecx
-; NoVLX-NEXT: vmovd %ecx, %xmm1
-; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
-; NoVLX-NEXT: kmovw %k2, %eax
-; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kmovw %k0, %esi
+; NoVLX-NEXT: vmovd %esi, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %edx, %xmm1, %xmm1
+; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm1, %xmm1
; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
; NoVLX-NEXT: kmovw %edi, %k0
; NoVLX-NEXT: kshiftrw $3, %k0, %k1
-; NoVLX-NEXT: kshiftrw $2, %k0, %k2
-; NoVLX-NEXT: kshiftrw $1, %k0, %k3
-; NoVLX-NEXT: kmovw %k3, %eax
-; NoVLX-NEXT: kmovw %k0, %ecx
-; NoVLX-NEXT: vmovd %ecx, %xmm1
-; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
-; NoVLX-NEXT: kmovw %k2, %eax
-; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kmovw %k0, %esi
+; NoVLX-NEXT: vmovd %esi, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %edx, %xmm1, %xmm1
+; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm1, %xmm1
; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0
; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
; NoVLX-NEXT: kmovw %edi, %k0
; NoVLX-NEXT: kshiftrw $3, %k0, %k1
-; NoVLX-NEXT: kshiftrw $2, %k0, %k2
-; NoVLX-NEXT: kshiftrw $1, %k0, %k3
-; NoVLX-NEXT: kmovw %k3, %eax
-; NoVLX-NEXT: kmovw %k0, %ecx
-; NoVLX-NEXT: vmovd %ecx, %xmm1
-; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
-; NoVLX-NEXT: kmovw %k2, %eax
-; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kmovw %k0, %esi
+; NoVLX-NEXT: vmovd %esi, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %edx, %xmm1, %xmm1
+; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm1, %xmm1
; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
; NoVLX-NEXT: kmovw %edi, %k0
; NoVLX-NEXT: kshiftrw $3, %k0, %k1
-; NoVLX-NEXT: kshiftrw $2, %k0, %k2
-; NoVLX-NEXT: kshiftrw $1, %k0, %k3
-; NoVLX-NEXT: kmovw %k3, %eax
-; NoVLX-NEXT: kmovw %k0, %ecx
-; NoVLX-NEXT: vmovd %ecx, %xmm1
-; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
-; NoVLX-NEXT: kmovw %k2, %eax
-; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kmovw %k0, %esi
+; NoVLX-NEXT: vmovd %esi, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %edx, %xmm1, %xmm1
+; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm1, %xmm1
; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
; NoVLX-NEXT: kmovw %edi, %k0
; NoVLX-NEXT: kshiftrw $3, %k0, %k1
-; NoVLX-NEXT: kshiftrw $2, %k0, %k2
-; NoVLX-NEXT: kshiftrw $1, %k0, %k3
-; NoVLX-NEXT: kmovw %k3, %eax
-; NoVLX-NEXT: kmovw %k0, %ecx
-; NoVLX-NEXT: vmovd %ecx, %xmm1
-; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
-; NoVLX-NEXT: kmovw %k2, %eax
-; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kmovw %k0, %esi
+; NoVLX-NEXT: vmovd %esi, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %edx, %xmm1, %xmm1
+; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm1, %xmm1
; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
; NoVLX-NEXT: kmovw %edi, %k0
; NoVLX-NEXT: kshiftrw $3, %k0, %k1
-; NoVLX-NEXT: kshiftrw $2, %k0, %k2
-; NoVLX-NEXT: kshiftrw $1, %k0, %k3
-; NoVLX-NEXT: kmovw %k3, %eax
-; NoVLX-NEXT: kmovw %k0, %ecx
-; NoVLX-NEXT: vmovd %ecx, %xmm1
-; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
-; NoVLX-NEXT: kmovw %k2, %eax
-; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kmovw %k0, %esi
+; NoVLX-NEXT: vmovd %esi, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %edx, %xmm1, %xmm1
+; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm1, %xmm1
; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
; NoVLX-NEXT: kxorw %k0, %k0, %k0
; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
; NoVLX-NEXT: kmovw %edi, %k0
; NoVLX-NEXT: kshiftrw $3, %k0, %k1
-; NoVLX-NEXT: kshiftrw $2, %k0, %k2
-; NoVLX-NEXT: kshiftrw $1, %k0, %k3
-; NoVLX-NEXT: kmovw %k3, %eax
-; NoVLX-NEXT: kmovw %k0, %ecx
-; NoVLX-NEXT: vmovd %ecx, %xmm1
-; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
-; NoVLX-NEXT: kmovw %k2, %eax
-; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kmovw %k0, %esi
+; NoVLX-NEXT: vmovd %esi, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %edx, %xmm1, %xmm1
+; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm1, %xmm1
; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
; NoVLX-NEXT: kxorw %k0, %k0, %k0
; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
; NoVLX-NEXT: kmovw %edi, %k0
; NoVLX-NEXT: kshiftrw $3, %k0, %k1
-; NoVLX-NEXT: kshiftrw $2, %k0, %k2
-; NoVLX-NEXT: kshiftrw $1, %k0, %k3
-; NoVLX-NEXT: kmovw %k3, %eax
-; NoVLX-NEXT: kmovw %k0, %ecx
-; NoVLX-NEXT: vmovd %ecx, %xmm1
-; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
-; NoVLX-NEXT: kmovw %k2, %eax
-; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kmovw %k0, %esi
+; NoVLX-NEXT: vmovd %esi, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %edx, %xmm1, %xmm1
+; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm1, %xmm1
; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0
; NoVLX-NEXT: kxorw %k0, %k0, %k0
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
; NoVLX-NEXT: vpcmpgtq %zmm1, %zmm0, %k0
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $7, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r8d
-; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $6, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r9d
-; NoVLX-NEXT: kshiftrw $3, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
; NoVLX-NEXT: kshiftrw $4, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %esi
-; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %edi
-; NoVLX-NEXT: kshiftrw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kmovw %k0, %ecx
-; NoVLX-NEXT: kshiftrw $7, %k0, %k0
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kmovw %k0, %edx
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $0, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %edi, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $4, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $5, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r8d, %xmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
; NoVLX-NEXT: vpcmpgtq (%rdi), %zmm0, %k0
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $7, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r8d
-; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $6, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r9d
-; NoVLX-NEXT: kshiftrw $3, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
; NoVLX-NEXT: kshiftrw $4, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %esi
-; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %edi
-; NoVLX-NEXT: kshiftrw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kmovw %k0, %ecx
-; NoVLX-NEXT: kshiftrw $7, %k0, %k0
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kmovw %k0, %edx
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $0, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %edi, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $4, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $5, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r8d, %xmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: subq $32, %rsp
; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: vpcmpgtq %zmm1, %zmm0, %k0 {%k1}
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $7, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r8d
-; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $6, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r9d
-; NoVLX-NEXT: kshiftrw $3, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
; NoVLX-NEXT: kshiftrw $4, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %esi
-; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %edi
-; NoVLX-NEXT: kshiftrw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kmovw %k0, %ecx
-; NoVLX-NEXT: kshiftrw $7, %k0, %k0
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kmovw %k0, %edx
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $0, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %edi, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $4, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $5, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r8d, %xmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: subq $32, %rsp
; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: vpcmpgtq (%rsi), %zmm0, %k0 {%k1}
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $7, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r8d
-; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $6, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r9d
-; NoVLX-NEXT: kshiftrw $3, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
; NoVLX-NEXT: kshiftrw $4, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %esi
-; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %edi
-; NoVLX-NEXT: kshiftrw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kmovw %k0, %ecx
-; NoVLX-NEXT: kshiftrw $7, %k0, %k0
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kmovw %k0, %edx
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $0, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %edi, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $4, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $5, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r8d, %xmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
; NoVLX-NEXT: vpcmpgtq (%rdi){1to8}, %zmm0, %k0
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $7, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r8d
-; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $6, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r9d
-; NoVLX-NEXT: kshiftrw $3, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
; NoVLX-NEXT: kshiftrw $4, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %esi
-; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %edi
-; NoVLX-NEXT: kshiftrw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kmovw %k0, %ecx
-; NoVLX-NEXT: kshiftrw $7, %k0, %k0
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kmovw %k0, %edx
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $0, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %edi, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $4, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $5, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r8d, %xmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: subq $32, %rsp
; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: vpcmpgtq (%rsi){1to8}, %zmm0, %k0 {%k1}
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $7, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r8d
-; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $6, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r9d
-; NoVLX-NEXT: kshiftrw $3, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
; NoVLX-NEXT: kshiftrw $4, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %esi
-; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %edi
-; NoVLX-NEXT: kshiftrw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kmovw %k0, %ecx
-; NoVLX-NEXT: kshiftrw $7, %k0, %k0
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kmovw %k0, %edx
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $0, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %edi, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $4, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $5, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r8d, %xmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
; NoVLX-NEXT: vpcmpgtq %zmm1, %zmm0, %k0
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $7, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r8d
-; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $6, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r9d
-; NoVLX-NEXT: kshiftrw $3, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
; NoVLX-NEXT: kshiftrw $4, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %esi
-; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %edi
-; NoVLX-NEXT: kshiftrw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kmovw %k0, %ecx
-; NoVLX-NEXT: kshiftrw $7, %k0, %k0
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kmovw %k0, %edx
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $0, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %edi, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $4, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $5, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r8d, %xmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
; NoVLX-NEXT: vpcmpgtq (%rdi), %zmm0, %k0
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $7, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r8d
-; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $6, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r9d
-; NoVLX-NEXT: kshiftrw $3, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
; NoVLX-NEXT: kshiftrw $4, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %esi
-; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %edi
-; NoVLX-NEXT: kshiftrw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kmovw %k0, %ecx
-; NoVLX-NEXT: kshiftrw $7, %k0, %k0
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kmovw %k0, %edx
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $0, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %edi, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $4, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $5, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r8d, %xmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: subq $64, %rsp
; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: vpcmpgtq %zmm1, %zmm0, %k0 {%k1}
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $7, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r8d
-; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $6, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r9d
-; NoVLX-NEXT: kshiftrw $3, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
; NoVLX-NEXT: kshiftrw $4, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %esi
-; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %edi
-; NoVLX-NEXT: kshiftrw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kmovw %k0, %ecx
-; NoVLX-NEXT: kshiftrw $7, %k0, %k0
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kmovw %k0, %edx
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $0, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %edi, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $4, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $5, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r8d, %xmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: subq $64, %rsp
; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: vpcmpgtq (%rsi), %zmm0, %k0 {%k1}
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $7, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r8d
-; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $6, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r9d
-; NoVLX-NEXT: kshiftrw $3, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
; NoVLX-NEXT: kshiftrw $4, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %esi
-; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %edi
-; NoVLX-NEXT: kshiftrw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kmovw %k0, %ecx
-; NoVLX-NEXT: kshiftrw $7, %k0, %k0
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kmovw %k0, %edx
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $0, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %edi, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $4, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $5, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r8d, %xmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
; NoVLX-NEXT: vpcmpgtq (%rdi){1to8}, %zmm0, %k0
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $7, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r8d
-; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $6, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r9d
-; NoVLX-NEXT: kshiftrw $3, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
; NoVLX-NEXT: kshiftrw $4, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %esi
-; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %edi
-; NoVLX-NEXT: kshiftrw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kmovw %k0, %ecx
-; NoVLX-NEXT: kshiftrw $7, %k0, %k0
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kmovw %k0, %edx
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $0, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %edi, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $4, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $5, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r8d, %xmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: subq $64, %rsp
; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: vpcmpgtq (%rsi){1to8}, %zmm0, %k0 {%k1}
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $7, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r8d
-; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $6, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r9d
-; NoVLX-NEXT: kshiftrw $3, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
; NoVLX-NEXT: kshiftrw $4, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %esi
-; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %edi
-; NoVLX-NEXT: kshiftrw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kmovw %k0, %ecx
-; NoVLX-NEXT: kshiftrw $7, %k0, %k0
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kmovw %k0, %edx
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $0, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %edi, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $4, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $5, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r8d, %xmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0
; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0
; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $7, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r8d
-; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $6, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r9d
-; NoVLX-NEXT: kshiftrw $3, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
; NoVLX-NEXT: kshiftrw $4, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %esi
-; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %edi
-; NoVLX-NEXT: kshiftrw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kmovw %k0, %ecx
-; NoVLX-NEXT: kshiftrw $7, %k0, %k0
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kmovw %k0, %edx
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $0, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %edi, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $4, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $5, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r8d, %xmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0
; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0
; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $7, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r8d
-; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $6, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r9d
-; NoVLX-NEXT: kshiftrw $3, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
; NoVLX-NEXT: kshiftrw $4, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %esi
-; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %edi
-; NoVLX-NEXT: kshiftrw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kmovw %k0, %ecx
-; NoVLX-NEXT: kshiftrw $7, %k0, %k0
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kmovw %k0, %edx
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $0, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %edi, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $4, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $5, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r8d, %xmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0
; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 {%k1}
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $7, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r8d
-; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $6, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r9d
-; NoVLX-NEXT: kshiftrw $3, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
; NoVLX-NEXT: kshiftrw $4, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %esi
-; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %edi
-; NoVLX-NEXT: kshiftrw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kmovw %k0, %ecx
-; NoVLX-NEXT: kshiftrw $7, %k0, %k0
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kmovw %k0, %edx
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $0, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %edi, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $4, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $5, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r8d, %xmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0
; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 {%k1}
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $7, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r8d
-; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $6, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r9d
-; NoVLX-NEXT: kshiftrw $3, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
; NoVLX-NEXT: kshiftrw $4, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %esi
-; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %edi
-; NoVLX-NEXT: kshiftrw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kmovw %k0, %ecx
-; NoVLX-NEXT: kshiftrw $7, %k0, %k0
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kmovw %k0, %edx
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $0, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %edi, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $4, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $5, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r8d, %xmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0
; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0
; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $7, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r8d
-; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $6, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r9d
-; NoVLX-NEXT: kshiftrw $3, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
; NoVLX-NEXT: kshiftrw $4, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %esi
-; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %edi
-; NoVLX-NEXT: kshiftrw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kmovw %k0, %ecx
-; NoVLX-NEXT: kshiftrw $7, %k0, %k0
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kmovw %k0, %edx
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $0, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %edi, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $4, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $5, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r8d, %xmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0
; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0
; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $7, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r8d
-; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $6, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r9d
-; NoVLX-NEXT: kshiftrw $3, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
; NoVLX-NEXT: kshiftrw $4, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %esi
-; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %edi
-; NoVLX-NEXT: kshiftrw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kmovw %k0, %ecx
-; NoVLX-NEXT: kshiftrw $7, %k0, %k0
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kmovw %k0, %edx
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $0, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %edi, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $4, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $5, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r8d, %xmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0
; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 {%k1}
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $7, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r8d
-; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $6, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r9d
-; NoVLX-NEXT: kshiftrw $3, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
; NoVLX-NEXT: kshiftrw $4, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %esi
-; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %edi
-; NoVLX-NEXT: kshiftrw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kmovw %k0, %ecx
-; NoVLX-NEXT: kshiftrw $7, %k0, %k0
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kmovw %k0, %edx
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $0, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %edi, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $4, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $5, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r8d, %xmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0
; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 {%k1}
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $7, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r8d
-; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $6, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r9d
-; NoVLX-NEXT: kshiftrw $3, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
; NoVLX-NEXT: kshiftrw $4, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %esi
-; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %edi
-; NoVLX-NEXT: kshiftrw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kmovw %k0, %ecx
-; NoVLX-NEXT: kshiftrw $7, %k0, %k0
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kmovw %k0, %edx
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $0, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %edi, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $4, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $5, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r8d, %xmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0
; NoVLX-NEXT: kmovw %edi, %k0
; NoVLX-NEXT: kshiftrw $3, %k0, %k1
-; NoVLX-NEXT: kshiftrw $2, %k0, %k2
-; NoVLX-NEXT: kshiftrw $1, %k0, %k3
-; NoVLX-NEXT: kmovw %k3, %eax
-; NoVLX-NEXT: kmovw %k0, %ecx
-; NoVLX-NEXT: vmovd %ecx, %xmm1
-; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
-; NoVLX-NEXT: kmovw %k2, %eax
-; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
-; NoVLX-NEXT: vpandn %xmm1, %xmm0, %xmm0
-; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
-; NoVLX-NEXT: kmovw %eax, %k0
-; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
-; NoVLX-NEXT: andl $1, %eax
-; NoVLX-NEXT: kmovw %eax, %k1
-; NoVLX-NEXT: kxorw %k0, %k0, %k2
-; NoVLX-NEXT: kshiftrw $1, %k2, %k2
-; NoVLX-NEXT: kshiftlw $1, %k2, %k2
-; NoVLX-NEXT: korw %k1, %k2, %k1
-; NoVLX-NEXT: kshiftrw $1, %k1, %k2
-; NoVLX-NEXT: kxorw %k0, %k2, %k0
-; NoVLX-NEXT: kshiftlw $15, %k0, %k0
-; NoVLX-NEXT: kshiftrw $14, %k0, %k0
-; NoVLX-NEXT: kxorw %k1, %k0, %k0
; NoVLX-NEXT: kshiftrw $2, %k0, %k1
-; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
-; NoVLX-NEXT: kmovw %eax, %k2
-; NoVLX-NEXT: kxorw %k2, %k1, %k1
-; NoVLX-NEXT: kshiftlw $15, %k1, %k1
-; NoVLX-NEXT: kshiftrw $13, %k1, %k1
-; NoVLX-NEXT: kxorw %k0, %k1, %k0
-; NoVLX-NEXT: kshiftrw $3, %k0, %k1
-; NoVLX-NEXT: vpextrb $12, %xmm0, %eax
-; NoVLX-NEXT: kmovw %eax, %k2
-; NoVLX-NEXT: kxorw %k2, %k1, %k1
-; NoVLX-NEXT: kshiftlw $15, %k1, %k1
-; NoVLX-NEXT: kshiftrw $12, %k1, %k1
-; NoVLX-NEXT: kxorw %k0, %k1, %k0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: # kill: def %al killed %al killed %eax
-; NoVLX-NEXT: retq
-entry:
- %0 = bitcast <2 x i64> %__a to <4 x i32>
- %1 = bitcast <2 x i64> %__b to <4 x i32>
- %2 = icmp sge <4 x i32> %0, %1
- %3 = bitcast i8 %__u to <8 x i1>
- %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
- %4 = and <4 x i1> %2, %extract.i
- %5 = shufflevector <4 x i1> %4, <4 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
- %6 = bitcast <8 x i1> %5 to i8
- ret i8 %6
-}
-
-define zeroext i8 @test_masked_vpcmpsged_v4i1_v8i1_mask_mem(i8 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
-; VLX-LABEL: test_masked_vpcmpsged_v4i1_v8i1_mask_mem:
-; VLX: # %bb.0: # %entry
-; VLX-NEXT: kmovd %edi, %k1
-; VLX-NEXT: vpcmpnltd (%rsi), %xmm0, %k0 {%k1}
-; VLX-NEXT: kmovd %k0, %eax
-; VLX-NEXT: # kill: def %al killed %al killed %eax
-; VLX-NEXT: retq
-;
-; NoVLX-LABEL: test_masked_vpcmpsged_v4i1_v8i1_mask_mem:
-; NoVLX: # %bb.0: # %entry
-; NoVLX-NEXT: vmovdqa (%rsi), %xmm1
-; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0
-; NoVLX-NEXT: kmovw %edi, %k0
-; NoVLX-NEXT: kshiftrw $3, %k0, %k1
-; NoVLX-NEXT: kshiftrw $2, %k0, %k2
-; NoVLX-NEXT: kshiftrw $1, %k0, %k3
-; NoVLX-NEXT: kmovw %k3, %eax
-; NoVLX-NEXT: kmovw %k0, %ecx
-; NoVLX-NEXT: vmovd %ecx, %xmm1
-; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
-; NoVLX-NEXT: kmovw %k2, %eax
-; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
-; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kmovw %k0, %esi
+; NoVLX-NEXT: vmovd %esi, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %edx, %xmm1, %xmm1
+; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm1, %xmm1
+; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpandn %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: andl $1, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: kxorw %k0, %k0, %k2
+; NoVLX-NEXT: kshiftrw $1, %k2, %k2
+; NoVLX-NEXT: kshiftlw $1, %k2, %k2
+; NoVLX-NEXT: korw %k1, %k2, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k2
+; NoVLX-NEXT: kxorw %k0, %k2, %k0
+; NoVLX-NEXT: kshiftlw $15, %k0, %k0
+; NoVLX-NEXT: kshiftrw $14, %k0, %k0
+; NoVLX-NEXT: kxorw %k1, %k0, %k0
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k2
+; NoVLX-NEXT: kxorw %k2, %k1, %k1
+; NoVLX-NEXT: kshiftlw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $13, %k1, %k1
+; NoVLX-NEXT: kxorw %k0, %k1, %k0
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
+; NoVLX-NEXT: vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k2
+; NoVLX-NEXT: kxorw %k2, %k1, %k1
+; NoVLX-NEXT: kshiftlw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $12, %k1, %k1
+; NoVLX-NEXT: kxorw %k0, %k1, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: def %al killed %al killed %eax
+; NoVLX-NEXT: retq
+entry:
+ %0 = bitcast <2 x i64> %__a to <4 x i32>
+ %1 = bitcast <2 x i64> %__b to <4 x i32>
+ %2 = icmp sge <4 x i32> %0, %1
+ %3 = bitcast i8 %__u to <8 x i1>
+ %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %4 = and <4 x i1> %2, %extract.i
+ %5 = shufflevector <4 x i1> %4, <4 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ %6 = bitcast <8 x i1> %5 to i8
+ ret i8 %6
+}
+
+define zeroext i8 @test_masked_vpcmpsged_v4i1_v8i1_mask_mem(i8 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
+; VLX-LABEL: test_masked_vpcmpsged_v4i1_v8i1_mask_mem:
+; VLX: # %bb.0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpnltd (%rsi), %xmm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: def %al killed %al killed %eax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsged_v4i1_v8i1_mask_mem:
+; NoVLX: # %bb.0: # %entry
+; NoVLX-NEXT: vmovdqa (%rsi), %xmm1
+; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: kmovw %edi, %k0
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kmovw %k0, %esi
+; NoVLX-NEXT: vmovd %esi, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %edx, %xmm1, %xmm1
+; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm1, %xmm1
; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
; NoVLX-NEXT: vpandn %xmm1, %xmm0, %xmm0
; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0
; NoVLX-NEXT: kmovw %edi, %k0
; NoVLX-NEXT: kshiftrw $3, %k0, %k1
-; NoVLX-NEXT: kshiftrw $2, %k0, %k2
-; NoVLX-NEXT: kshiftrw $1, %k0, %k3
-; NoVLX-NEXT: kmovw %k3, %eax
-; NoVLX-NEXT: kmovw %k0, %ecx
-; NoVLX-NEXT: vmovd %ecx, %xmm1
-; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
-; NoVLX-NEXT: kmovw %k2, %eax
-; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kmovw %k0, %esi
+; NoVLX-NEXT: vmovd %esi, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %edx, %xmm1, %xmm1
+; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm1, %xmm1
; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
; NoVLX-NEXT: vpandn %xmm1, %xmm0, %xmm0
; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0
; NoVLX-NEXT: kmovw %edi, %k0
; NoVLX-NEXT: kshiftrw $3, %k0, %k1
-; NoVLX-NEXT: kshiftrw $2, %k0, %k2
-; NoVLX-NEXT: kshiftrw $1, %k0, %k3
-; NoVLX-NEXT: kmovw %k3, %eax
-; NoVLX-NEXT: kmovw %k0, %ecx
-; NoVLX-NEXT: vmovd %ecx, %xmm1
-; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
-; NoVLX-NEXT: kmovw %k2, %eax
-; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kmovw %k0, %esi
+; NoVLX-NEXT: vmovd %esi, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %edx, %xmm1, %xmm1
+; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm1, %xmm1
; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
; NoVLX-NEXT: vpandn %xmm1, %xmm0, %xmm0
; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0
; NoVLX-NEXT: kmovw %edi, %k0
; NoVLX-NEXT: kshiftrw $3, %k0, %k1
-; NoVLX-NEXT: kshiftrw $2, %k0, %k2
-; NoVLX-NEXT: kshiftrw $1, %k0, %k3
-; NoVLX-NEXT: kmovw %k3, %eax
-; NoVLX-NEXT: kmovw %k0, %ecx
-; NoVLX-NEXT: vmovd %ecx, %xmm1
-; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
-; NoVLX-NEXT: kmovw %k2, %eax
-; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kmovw %k0, %esi
+; NoVLX-NEXT: vmovd %esi, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %edx, %xmm1, %xmm1
+; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm1, %xmm1
; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
; NoVLX-NEXT: vpandn %xmm1, %xmm0, %xmm0
; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0
; NoVLX-NEXT: kmovw %edi, %k0
; NoVLX-NEXT: kshiftrw $3, %k0, %k1
-; NoVLX-NEXT: kshiftrw $2, %k0, %k2
-; NoVLX-NEXT: kshiftrw $1, %k0, %k3
-; NoVLX-NEXT: kmovw %k3, %eax
-; NoVLX-NEXT: kmovw %k0, %ecx
-; NoVLX-NEXT: vmovd %ecx, %xmm1
-; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
-; NoVLX-NEXT: kmovw %k2, %eax
-; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kmovw %k0, %esi
+; NoVLX-NEXT: vmovd %esi, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %edx, %xmm1, %xmm1
+; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm1, %xmm1
; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
; NoVLX-NEXT: vpandn %xmm1, %xmm0, %xmm0
; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0
; NoVLX-NEXT: kmovw %edi, %k0
; NoVLX-NEXT: kshiftrw $3, %k0, %k1
-; NoVLX-NEXT: kshiftrw $2, %k0, %k2
-; NoVLX-NEXT: kshiftrw $1, %k0, %k3
-; NoVLX-NEXT: kmovw %k3, %eax
-; NoVLX-NEXT: kmovw %k0, %ecx
-; NoVLX-NEXT: vmovd %ecx, %xmm1
-; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
-; NoVLX-NEXT: kmovw %k2, %eax
-; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kmovw %k0, %esi
+; NoVLX-NEXT: vmovd %esi, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %edx, %xmm1, %xmm1
+; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm1, %xmm1
; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
; NoVLX-NEXT: vpandn %xmm1, %xmm0, %xmm0
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0
; NoVLX-NEXT: kmovw %edi, %k0
; NoVLX-NEXT: kshiftrw $3, %k0, %k1
-; NoVLX-NEXT: kshiftrw $2, %k0, %k2
-; NoVLX-NEXT: kshiftrw $1, %k0, %k3
-; NoVLX-NEXT: kmovw %k3, %eax
-; NoVLX-NEXT: kmovw %k0, %ecx
-; NoVLX-NEXT: vmovd %ecx, %xmm1
-; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
-; NoVLX-NEXT: kmovw %k2, %eax
-; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kmovw %k0, %esi
+; NoVLX-NEXT: vmovd %esi, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %edx, %xmm1, %xmm1
+; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm1, %xmm1
; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
; NoVLX-NEXT: vpandn %xmm1, %xmm0, %xmm0
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0
; NoVLX-NEXT: kmovw %edi, %k0
; NoVLX-NEXT: kshiftrw $3, %k0, %k1
-; NoVLX-NEXT: kshiftrw $2, %k0, %k2
-; NoVLX-NEXT: kshiftrw $1, %k0, %k3
-; NoVLX-NEXT: kmovw %k3, %eax
-; NoVLX-NEXT: kmovw %k0, %ecx
-; NoVLX-NEXT: vmovd %ecx, %xmm1
-; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
-; NoVLX-NEXT: kmovw %k2, %eax
-; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kmovw %k0, %esi
+; NoVLX-NEXT: vmovd %esi, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %edx, %xmm1, %xmm1
+; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm1, %xmm1
; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
; NoVLX-NEXT: vpandn %xmm1, %xmm0, %xmm0
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0
; NoVLX-NEXT: kmovw %edi, %k0
; NoVLX-NEXT: kshiftrw $3, %k0, %k1
-; NoVLX-NEXT: kshiftrw $2, %k0, %k2
-; NoVLX-NEXT: kshiftrw $1, %k0, %k3
-; NoVLX-NEXT: kmovw %k3, %eax
-; NoVLX-NEXT: kmovw %k0, %ecx
-; NoVLX-NEXT: vmovd %ecx, %xmm1
-; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
-; NoVLX-NEXT: kmovw %k2, %eax
-; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kmovw %k0, %esi
+; NoVLX-NEXT: vmovd %esi, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %edx, %xmm1, %xmm1
+; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm1, %xmm1
; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
; NoVLX-NEXT: vpandn %xmm1, %xmm0, %xmm0
; NoVLX-NEXT: kxorw %k0, %k0, %k0
; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0
; NoVLX-NEXT: kmovw %edi, %k0
; NoVLX-NEXT: kshiftrw $3, %k0, %k1
-; NoVLX-NEXT: kshiftrw $2, %k0, %k2
-; NoVLX-NEXT: kshiftrw $1, %k0, %k3
-; NoVLX-NEXT: kmovw %k3, %eax
-; NoVLX-NEXT: kmovw %k0, %ecx
-; NoVLX-NEXT: vmovd %ecx, %xmm1
-; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
-; NoVLX-NEXT: kmovw %k2, %eax
-; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kmovw %k0, %esi
+; NoVLX-NEXT: vmovd %esi, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %edx, %xmm1, %xmm1
+; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm1, %xmm1
; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
; NoVLX-NEXT: vpandn %xmm1, %xmm0, %xmm0
; NoVLX-NEXT: kxorw %k0, %k0, %k0
; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0
; NoVLX-NEXT: kmovw %edi, %k0
; NoVLX-NEXT: kshiftrw $3, %k0, %k1
-; NoVLX-NEXT: kshiftrw $2, %k0, %k2
-; NoVLX-NEXT: kshiftrw $1, %k0, %k3
-; NoVLX-NEXT: kmovw %k3, %eax
-; NoVLX-NEXT: kmovw %k0, %ecx
-; NoVLX-NEXT: vmovd %ecx, %xmm1
-; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
-; NoVLX-NEXT: kmovw %k2, %eax
-; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kmovw %k0, %esi
+; NoVLX-NEXT: vmovd %esi, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %edx, %xmm1, %xmm1
+; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm1, %xmm1
; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
; NoVLX-NEXT: vpandn %xmm1, %xmm0, %xmm0
; NoVLX-NEXT: kxorw %k0, %k0, %k0
; NoVLX-NEXT: # kill: def %ymm1 killed %ymm1 def %zmm1
; NoVLX-NEXT: # kill: def %ymm0 killed %ymm0 def %zmm0
; NoVLX-NEXT: vpcmpled %zmm0, %zmm1, %k0
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $7, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r8d
-; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $6, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r9d
-; NoVLX-NEXT: kshiftrw $3, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
; NoVLX-NEXT: kshiftrw $4, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %esi
-; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %edi
-; NoVLX-NEXT: kshiftrw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kmovw %k0, %ecx
-; NoVLX-NEXT: kshiftrw $7, %k0, %k0
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kmovw %k0, %edx
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $0, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %edi, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $4, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $5, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r8d, %xmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: # kill: def %ymm0 killed %ymm0 def %zmm0
; NoVLX-NEXT: vmovdqa (%rdi), %ymm1
; NoVLX-NEXT: vpcmpled %zmm0, %zmm1, %k0
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $7, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r8d
-; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $6, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r9d
-; NoVLX-NEXT: kshiftrw $3, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
; NoVLX-NEXT: kshiftrw $4, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %esi
-; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %edi
-; NoVLX-NEXT: kshiftrw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kmovw %k0, %ecx
-; NoVLX-NEXT: kshiftrw $7, %k0, %k0
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kmovw %k0, %edx
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $0, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %edi, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $4, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $5, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r8d, %xmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: # kill: def %ymm0 killed %ymm0 def %zmm0
; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: vpcmpled %zmm0, %zmm1, %k0 {%k1}
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $7, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r8d
-; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $6, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r9d
-; NoVLX-NEXT: kshiftrw $3, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
; NoVLX-NEXT: kshiftrw $4, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %esi
-; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %edi
-; NoVLX-NEXT: kshiftrw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kmovw %k0, %ecx
-; NoVLX-NEXT: kshiftrw $7, %k0, %k0
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kmovw %k0, %edx
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $0, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %edi, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $4, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $5, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r8d, %xmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: vmovdqa (%rsi), %ymm1
; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: vpcmpled %zmm0, %zmm1, %k0 {%k1}
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $7, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r8d
-; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $6, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r9d
-; NoVLX-NEXT: kshiftrw $3, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
; NoVLX-NEXT: kshiftrw $4, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %esi
-; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %edi
-; NoVLX-NEXT: kshiftrw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kmovw %k0, %ecx
-; NoVLX-NEXT: kshiftrw $7, %k0, %k0
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kmovw %k0, %edx
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $0, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %edi, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $4, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $5, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r8d, %xmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: # kill: def %ymm0 killed %ymm0 def %zmm0
; NoVLX-NEXT: vpbroadcastd (%rdi), %ymm1
; NoVLX-NEXT: vpcmpled %zmm0, %zmm1, %k0
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $7, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r8d
-; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $6, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r9d
-; NoVLX-NEXT: kshiftrw $3, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
; NoVLX-NEXT: kshiftrw $4, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %esi
-; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %edi
-; NoVLX-NEXT: kshiftrw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kmovw %k0, %ecx
-; NoVLX-NEXT: kshiftrw $7, %k0, %k0
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kmovw %k0, %edx
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $0, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %edi, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $4, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $5, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r8d, %xmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: vpbroadcastd (%rsi), %ymm1
; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: vpcmpled %zmm0, %zmm1, %k0 {%k1}
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $7, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r8d
-; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $6, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r9d
-; NoVLX-NEXT: kshiftrw $3, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
; NoVLX-NEXT: kshiftrw $4, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %esi
-; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %edi
-; NoVLX-NEXT: kshiftrw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kmovw %k0, %ecx
-; NoVLX-NEXT: kshiftrw $7, %k0, %k0
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kmovw %k0, %edx
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $0, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %edi, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $4, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $5, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r8d, %xmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: # kill: def %ymm1 killed %ymm1 def %zmm1
; NoVLX-NEXT: # kill: def %ymm0 killed %ymm0 def %zmm0
; NoVLX-NEXT: vpcmpled %zmm0, %zmm1, %k0
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $7, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r8d
-; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $6, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r9d
-; NoVLX-NEXT: kshiftrw $3, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
; NoVLX-NEXT: kshiftrw $4, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %esi
-; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %edi
-; NoVLX-NEXT: kshiftrw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kmovw %k0, %ecx
-; NoVLX-NEXT: kshiftrw $7, %k0, %k0
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kmovw %k0, %edx
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $0, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %edi, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $4, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $5, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r8d, %xmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: # kill: def %ymm0 killed %ymm0 def %zmm0
; NoVLX-NEXT: vmovdqa (%rdi), %ymm1
; NoVLX-NEXT: vpcmpled %zmm0, %zmm1, %k0
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $7, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r8d
-; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $6, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r9d
-; NoVLX-NEXT: kshiftrw $3, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
; NoVLX-NEXT: kshiftrw $4, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %esi
-; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %edi
-; NoVLX-NEXT: kshiftrw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kmovw %k0, %ecx
-; NoVLX-NEXT: kshiftrw $7, %k0, %k0
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kmovw %k0, %edx
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $0, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %edi, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $4, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $5, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r8d, %xmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: # kill: def %ymm0 killed %ymm0 def %zmm0
; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: vpcmpled %zmm0, %zmm1, %k0 {%k1}
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $7, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r8d
-; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $6, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r9d
-; NoVLX-NEXT: kshiftrw $3, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
; NoVLX-NEXT: kshiftrw $4, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %esi
-; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %edi
-; NoVLX-NEXT: kshiftrw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kmovw %k0, %ecx
-; NoVLX-NEXT: kshiftrw $7, %k0, %k0
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kmovw %k0, %edx
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $0, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %edi, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $4, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $5, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r8d, %xmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: vmovdqa (%rsi), %ymm1
; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: vpcmpled %zmm0, %zmm1, %k0 {%k1}
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $7, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r8d
-; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $6, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r9d
-; NoVLX-NEXT: kshiftrw $3, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
; NoVLX-NEXT: kshiftrw $4, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %esi
-; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %edi
-; NoVLX-NEXT: kshiftrw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kmovw %k0, %ecx
-; NoVLX-NEXT: kshiftrw $7, %k0, %k0
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kmovw %k0, %edx
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $0, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %edi, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $4, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $5, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r8d, %xmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: # kill: def %ymm0 killed %ymm0 def %zmm0
; NoVLX-NEXT: vpbroadcastd (%rdi), %ymm1
; NoVLX-NEXT: vpcmpled %zmm0, %zmm1, %k0
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $7, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r8d
-; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $6, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r9d
-; NoVLX-NEXT: kshiftrw $3, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
; NoVLX-NEXT: kshiftrw $4, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %esi
-; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %edi
-; NoVLX-NEXT: kshiftrw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kmovw %k0, %ecx
-; NoVLX-NEXT: kshiftrw $7, %k0, %k0
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kmovw %k0, %edx
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $0, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %edi, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $4, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $5, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r8d, %xmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: vpbroadcastd (%rsi), %ymm1
; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: vpcmpled %zmm0, %zmm1, %k0 {%k1}
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $7, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r8d
-; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $6, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r9d
-; NoVLX-NEXT: kshiftrw $3, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
; NoVLX-NEXT: kshiftrw $4, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %esi
-; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %edi
-; NoVLX-NEXT: kshiftrw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kmovw %k0, %ecx
-; NoVLX-NEXT: kshiftrw $7, %k0, %k0
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kmovw %k0, %edx
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $0, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %edi, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $4, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $5, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r8d, %xmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
; NoVLX-NEXT: kmovw %edi, %k0
; NoVLX-NEXT: kshiftrw $3, %k0, %k1
-; NoVLX-NEXT: kshiftrw $2, %k0, %k2
-; NoVLX-NEXT: kshiftrw $1, %k0, %k3
-; NoVLX-NEXT: kmovw %k3, %eax
-; NoVLX-NEXT: kmovw %k0, %ecx
-; NoVLX-NEXT: vmovd %ecx, %xmm1
-; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
-; NoVLX-NEXT: kmovw %k2, %eax
-; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kmovw %k0, %esi
+; NoVLX-NEXT: vmovd %esi, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %edx, %xmm1, %xmm1
+; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm1, %xmm1
; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
; NoVLX-NEXT: kmovw %edi, %k0
; NoVLX-NEXT: kshiftrw $3, %k0, %k1
-; NoVLX-NEXT: kshiftrw $2, %k0, %k2
-; NoVLX-NEXT: kshiftrw $1, %k0, %k3
-; NoVLX-NEXT: kmovw %k3, %eax
-; NoVLX-NEXT: kmovw %k0, %ecx
-; NoVLX-NEXT: vmovd %ecx, %xmm1
-; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
-; NoVLX-NEXT: kmovw %k2, %eax
-; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kmovw %k0, %esi
+; NoVLX-NEXT: vmovd %esi, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %edx, %xmm1, %xmm1
+; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm1, %xmm1
; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
; NoVLX-NEXT: kmovw %edi, %k0
; NoVLX-NEXT: kshiftrw $3, %k0, %k1
-; NoVLX-NEXT: kshiftrw $2, %k0, %k2
-; NoVLX-NEXT: kshiftrw $1, %k0, %k3
-; NoVLX-NEXT: kmovw %k3, %eax
-; NoVLX-NEXT: kmovw %k0, %ecx
-; NoVLX-NEXT: vmovd %ecx, %xmm1
-; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
-; NoVLX-NEXT: kmovw %k2, %eax
-; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kmovw %k0, %esi
+; NoVLX-NEXT: vmovd %esi, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %edx, %xmm1, %xmm1
+; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm1, %xmm1
; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0
; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
; NoVLX-NEXT: kmovw %edi, %k0
; NoVLX-NEXT: kshiftrw $3, %k0, %k1
-; NoVLX-NEXT: kshiftrw $2, %k0, %k2
-; NoVLX-NEXT: kshiftrw $1, %k0, %k3
-; NoVLX-NEXT: kmovw %k3, %eax
-; NoVLX-NEXT: kmovw %k0, %ecx
-; NoVLX-NEXT: vmovd %ecx, %xmm1
-; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
-; NoVLX-NEXT: kmovw %k2, %eax
-; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kmovw %k0, %esi
+; NoVLX-NEXT: vmovd %esi, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %edx, %xmm1, %xmm1
+; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm1, %xmm1
; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
; NoVLX-NEXT: kmovw %edi, %k0
; NoVLX-NEXT: kshiftrw $3, %k0, %k1
-; NoVLX-NEXT: kshiftrw $2, %k0, %k2
-; NoVLX-NEXT: kshiftrw $1, %k0, %k3
-; NoVLX-NEXT: kmovw %k3, %eax
-; NoVLX-NEXT: kmovw %k0, %ecx
-; NoVLX-NEXT: vmovd %ecx, %xmm1
-; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
-; NoVLX-NEXT: kmovw %k2, %eax
-; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kmovw %k0, %esi
+; NoVLX-NEXT: vmovd %esi, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %edx, %xmm1, %xmm1
+; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm1, %xmm1
; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
; NoVLX-NEXT: kmovw %edi, %k0
; NoVLX-NEXT: kshiftrw $3, %k0, %k1
-; NoVLX-NEXT: kshiftrw $2, %k0, %k2
-; NoVLX-NEXT: kshiftrw $1, %k0, %k3
-; NoVLX-NEXT: kmovw %k3, %eax
-; NoVLX-NEXT: kmovw %k0, %ecx
-; NoVLX-NEXT: vmovd %ecx, %xmm1
-; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
-; NoVLX-NEXT: kmovw %k2, %eax
-; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kmovw %k0, %esi
+; NoVLX-NEXT: vmovd %esi, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %edx, %xmm1, %xmm1
+; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm1, %xmm1
; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0
; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
; NoVLX-NEXT: kmovw %edi, %k0
; NoVLX-NEXT: kshiftrw $3, %k0, %k1
-; NoVLX-NEXT: kshiftrw $2, %k0, %k2
-; NoVLX-NEXT: kshiftrw $1, %k0, %k3
-; NoVLX-NEXT: kmovw %k3, %eax
-; NoVLX-NEXT: kmovw %k0, %ecx
-; NoVLX-NEXT: vmovd %ecx, %xmm1
-; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
-; NoVLX-NEXT: kmovw %k2, %eax
-; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kmovw %k0, %esi
+; NoVLX-NEXT: vmovd %esi, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %edx, %xmm1, %xmm1
+; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm1, %xmm1
; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
; NoVLX-NEXT: kmovw %edi, %k0
; NoVLX-NEXT: kshiftrw $3, %k0, %k1
-; NoVLX-NEXT: kshiftrw $2, %k0, %k2
-; NoVLX-NEXT: kshiftrw $1, %k0, %k3
-; NoVLX-NEXT: kmovw %k3, %eax
-; NoVLX-NEXT: kmovw %k0, %ecx
-; NoVLX-NEXT: vmovd %ecx, %xmm1
-; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
-; NoVLX-NEXT: kmovw %k2, %eax
-; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kmovw %k0, %esi
+; NoVLX-NEXT: vmovd %esi, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %edx, %xmm1, %xmm1
+; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm1, %xmm1
; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
; NoVLX-NEXT: kmovw %edi, %k0
; NoVLX-NEXT: kshiftrw $3, %k0, %k1
-; NoVLX-NEXT: kshiftrw $2, %k0, %k2
-; NoVLX-NEXT: kshiftrw $1, %k0, %k3
-; NoVLX-NEXT: kmovw %k3, %eax
-; NoVLX-NEXT: kmovw %k0, %ecx
-; NoVLX-NEXT: vmovd %ecx, %xmm1
-; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
-; NoVLX-NEXT: kmovw %k2, %eax
-; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kmovw %k0, %esi
+; NoVLX-NEXT: vmovd %esi, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %edx, %xmm1, %xmm1
+; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm1, %xmm1
; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
; NoVLX-NEXT: kmovw %edi, %k0
; NoVLX-NEXT: kshiftrw $3, %k0, %k1
-; NoVLX-NEXT: kshiftrw $2, %k0, %k2
-; NoVLX-NEXT: kshiftrw $1, %k0, %k3
-; NoVLX-NEXT: kmovw %k3, %eax
-; NoVLX-NEXT: kmovw %k0, %ecx
-; NoVLX-NEXT: vmovd %ecx, %xmm1
-; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
-; NoVLX-NEXT: kmovw %k2, %eax
-; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kmovw %k0, %esi
+; NoVLX-NEXT: vmovd %esi, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %edx, %xmm1, %xmm1
+; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm1, %xmm1
; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
; NoVLX-NEXT: kxorw %k0, %k0, %k0
; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
; NoVLX-NEXT: kmovw %edi, %k0
; NoVLX-NEXT: kshiftrw $3, %k0, %k1
-; NoVLX-NEXT: kshiftrw $2, %k0, %k2
-; NoVLX-NEXT: kshiftrw $1, %k0, %k3
-; NoVLX-NEXT: kmovw %k3, %eax
-; NoVLX-NEXT: kmovw %k0, %ecx
-; NoVLX-NEXT: vmovd %ecx, %xmm1
-; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
-; NoVLX-NEXT: kmovw %k2, %eax
-; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kmovw %k0, %esi
+; NoVLX-NEXT: vmovd %esi, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %edx, %xmm1, %xmm1
+; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm1, %xmm1
; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
; NoVLX-NEXT: kxorw %k0, %k0, %k0
; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
; NoVLX-NEXT: kmovw %edi, %k0
; NoVLX-NEXT: kshiftrw $3, %k0, %k1
-; NoVLX-NEXT: kshiftrw $2, %k0, %k2
-; NoVLX-NEXT: kshiftrw $1, %k0, %k3
-; NoVLX-NEXT: kmovw %k3, %eax
-; NoVLX-NEXT: kmovw %k0, %ecx
-; NoVLX-NEXT: vmovd %ecx, %xmm1
-; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
-; NoVLX-NEXT: kmovw %k2, %eax
-; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kmovw %k0, %esi
+; NoVLX-NEXT: vmovd %esi, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %edx, %xmm1, %xmm1
+; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm1, %xmm1
; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0
; NoVLX-NEXT: kxorw %k0, %k0, %k0
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
; NoVLX-NEXT: vpcmpleq %zmm0, %zmm1, %k0
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $7, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r8d
-; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $6, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r9d
-; NoVLX-NEXT: kshiftrw $3, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
; NoVLX-NEXT: kshiftrw $4, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %esi
-; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %edi
-; NoVLX-NEXT: kshiftrw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kmovw %k0, %ecx
-; NoVLX-NEXT: kshiftrw $7, %k0, %k0
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kmovw %k0, %edx
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $0, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %edi, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $4, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $5, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r8d, %xmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
; NoVLX-NEXT: vpcmpnltq (%rdi), %zmm0, %k0
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $7, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r8d
-; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $6, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r9d
-; NoVLX-NEXT: kshiftrw $3, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
; NoVLX-NEXT: kshiftrw $4, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %esi
-; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %edi
-; NoVLX-NEXT: kshiftrw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kmovw %k0, %ecx
-; NoVLX-NEXT: kshiftrw $7, %k0, %k0
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kmovw %k0, %edx
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $0, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %edi, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $4, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $5, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r8d, %xmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: subq $32, %rsp
; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: vpcmpleq %zmm0, %zmm1, %k0 {%k1}
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $7, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r8d
-; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $6, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r9d
-; NoVLX-NEXT: kshiftrw $3, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
; NoVLX-NEXT: kshiftrw $4, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %esi
-; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %edi
-; NoVLX-NEXT: kshiftrw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kmovw %k0, %ecx
-; NoVLX-NEXT: kshiftrw $7, %k0, %k0
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kmovw %k0, %edx
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $0, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %edi, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $4, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $5, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r8d, %xmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: subq $32, %rsp
; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: vpcmpnltq (%rsi), %zmm0, %k0 {%k1}
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $7, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r8d
-; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $6, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r9d
-; NoVLX-NEXT: kshiftrw $3, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
; NoVLX-NEXT: kshiftrw $4, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %esi
-; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %edi
-; NoVLX-NEXT: kshiftrw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kmovw %k0, %ecx
-; NoVLX-NEXT: kshiftrw $7, %k0, %k0
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kmovw %k0, %edx
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $0, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %edi, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $4, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $5, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r8d, %xmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
; NoVLX-NEXT: vpcmpnltq (%rdi){1to8}, %zmm0, %k0
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $7, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r8d
-; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $6, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r9d
-; NoVLX-NEXT: kshiftrw $3, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
; NoVLX-NEXT: kshiftrw $4, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %esi
-; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %edi
-; NoVLX-NEXT: kshiftrw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kmovw %k0, %ecx
-; NoVLX-NEXT: kshiftrw $7, %k0, %k0
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kmovw %k0, %edx
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $0, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %edi, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $4, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $5, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r8d, %xmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: subq $32, %rsp
; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: vpcmpnltq (%rsi){1to8}, %zmm0, %k0 {%k1}
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $7, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r8d
-; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $6, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r9d
-; NoVLX-NEXT: kshiftrw $3, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
; NoVLX-NEXT: kshiftrw $4, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %esi
-; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %edi
-; NoVLX-NEXT: kshiftrw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kmovw %k0, %ecx
-; NoVLX-NEXT: kshiftrw $7, %k0, %k0
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kmovw %k0, %edx
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $0, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %edi, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $4, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $5, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r8d, %xmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
; NoVLX-NEXT: vpcmpleq %zmm0, %zmm1, %k0
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $7, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r8d
-; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $6, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r9d
-; NoVLX-NEXT: kshiftrw $3, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
; NoVLX-NEXT: kshiftrw $4, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %esi
-; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %edi
-; NoVLX-NEXT: kshiftrw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kmovw %k0, %ecx
-; NoVLX-NEXT: kshiftrw $7, %k0, %k0
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kmovw %k0, %edx
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $0, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %edi, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $4, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $5, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r8d, %xmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
; NoVLX-NEXT: vpcmpnltq (%rdi), %zmm0, %k0
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $7, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r8d
-; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $6, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r9d
-; NoVLX-NEXT: kshiftrw $3, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
; NoVLX-NEXT: kshiftrw $4, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %esi
-; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %edi
-; NoVLX-NEXT: kshiftrw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kmovw %k0, %ecx
-; NoVLX-NEXT: kshiftrw $7, %k0, %k0
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kmovw %k0, %edx
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $0, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %edi, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $4, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $5, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r8d, %xmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: subq $64, %rsp
; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: vpcmpleq %zmm0, %zmm1, %k0 {%k1}
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $7, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r8d
-; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $6, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r9d
-; NoVLX-NEXT: kshiftrw $3, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
; NoVLX-NEXT: kshiftrw $4, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %esi
-; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %edi
-; NoVLX-NEXT: kshiftrw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kmovw %k0, %ecx
-; NoVLX-NEXT: kshiftrw $7, %k0, %k0
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kmovw %k0, %edx
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $0, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %edi, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $4, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $5, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r8d, %xmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: subq $64, %rsp
; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: vpcmpnltq (%rsi), %zmm0, %k0 {%k1}
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $7, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r8d
-; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $6, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r9d
-; NoVLX-NEXT: kshiftrw $3, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
; NoVLX-NEXT: kshiftrw $4, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %esi
-; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %edi
-; NoVLX-NEXT: kshiftrw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kmovw %k0, %ecx
-; NoVLX-NEXT: kshiftrw $7, %k0, %k0
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kmovw %k0, %edx
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $0, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %edi, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $4, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $5, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r8d, %xmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
; NoVLX-NEXT: vpcmpnltq (%rdi){1to8}, %zmm0, %k0
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $7, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r8d
-; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $6, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r9d
-; NoVLX-NEXT: kshiftrw $3, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
; NoVLX-NEXT: kshiftrw $4, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %esi
-; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %edi
-; NoVLX-NEXT: kshiftrw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kmovw %k0, %ecx
-; NoVLX-NEXT: kshiftrw $7, %k0, %k0
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kmovw %k0, %edx
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $0, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %edi, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $4, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $5, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r8d, %xmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: subq $64, %rsp
; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: vpcmpnltq (%rsi){1to8}, %zmm0, %k0 {%k1}
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $7, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r8d
-; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $6, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r9d
-; NoVLX-NEXT: kshiftrw $3, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
; NoVLX-NEXT: kshiftrw $4, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %esi
-; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %edi
-; NoVLX-NEXT: kshiftrw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kmovw %k0, %ecx
-; NoVLX-NEXT: kshiftrw $7, %k0, %k0
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kmovw %k0, %edx
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $0, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %edi, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $4, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $5, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r8d, %xmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0
; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0
; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $7, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r8d
-; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $6, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r9d
-; NoVLX-NEXT: kshiftrw $3, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
; NoVLX-NEXT: kshiftrw $4, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %esi
-; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %edi
-; NoVLX-NEXT: kshiftrw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kmovw %k0, %ecx
-; NoVLX-NEXT: kshiftrw $7, %k0, %k0
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kmovw %k0, %edx
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $0, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %edi, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $4, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $5, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r8d, %xmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0
; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0
; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $7, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r8d
-; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $6, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r9d
-; NoVLX-NEXT: kshiftrw $3, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
; NoVLX-NEXT: kshiftrw $4, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %esi
-; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %edi
-; NoVLX-NEXT: kshiftrw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kmovw %k0, %ecx
-; NoVLX-NEXT: kshiftrw $7, %k0, %k0
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kmovw %k0, %edx
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $0, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %edi, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $4, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $5, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r8d, %xmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0
; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 {%k1}
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $7, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r8d
-; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $6, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r9d
-; NoVLX-NEXT: kshiftrw $3, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
; NoVLX-NEXT: kshiftrw $4, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %esi
-; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %edi
-; NoVLX-NEXT: kshiftrw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kmovw %k0, %ecx
-; NoVLX-NEXT: kshiftrw $7, %k0, %k0
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kmovw %k0, %edx
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $0, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %edi, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $4, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $5, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r8d, %xmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0
; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 {%k1}
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $7, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r8d
-; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $6, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r9d
-; NoVLX-NEXT: kshiftrw $3, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
; NoVLX-NEXT: kshiftrw $4, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %esi
-; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %edi
-; NoVLX-NEXT: kshiftrw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kmovw %k0, %ecx
-; NoVLX-NEXT: kshiftrw $7, %k0, %k0
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kmovw %k0, %edx
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $0, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %edi, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $4, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $5, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r8d, %xmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0
; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0
; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $7, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r8d
-; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $6, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r9d
-; NoVLX-NEXT: kshiftrw $3, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
; NoVLX-NEXT: kshiftrw $4, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %esi
-; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %edi
-; NoVLX-NEXT: kshiftrw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kmovw %k0, %ecx
-; NoVLX-NEXT: kshiftrw $7, %k0, %k0
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kmovw %k0, %edx
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $0, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %edi, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $4, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $5, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r8d, %xmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0
; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0
; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $7, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r8d
-; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $6, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r9d
-; NoVLX-NEXT: kshiftrw $3, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
; NoVLX-NEXT: kshiftrw $4, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %esi
-; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %edi
-; NoVLX-NEXT: kshiftrw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kmovw %k0, %ecx
-; NoVLX-NEXT: kshiftrw $7, %k0, %k0
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kmovw %k0, %edx
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $0, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %edi, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $4, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $5, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r8d, %xmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0
; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 {%k1}
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $7, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r8d
-; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $6, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r9d
-; NoVLX-NEXT: kshiftrw $3, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
; NoVLX-NEXT: kshiftrw $4, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %esi
-; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %edi
-; NoVLX-NEXT: kshiftrw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kmovw %k0, %ecx
-; NoVLX-NEXT: kshiftrw $7, %k0, %k0
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kmovw %k0, %edx
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $0, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %edi, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $4, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $5, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r8d, %xmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0
; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 {%k1}
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $7, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r8d
-; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $6, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r9d
-; NoVLX-NEXT: kshiftrw $3, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
; NoVLX-NEXT: kshiftrw $4, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %esi
-; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %edi
-; NoVLX-NEXT: kshiftrw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kmovw %k0, %ecx
-; NoVLX-NEXT: kshiftrw $7, %k0, %k0
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kmovw %k0, %edx
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $0, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %edi, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $4, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $5, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r8d, %xmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0
; NoVLX-NEXT: kmovw %edi, %k0
; NoVLX-NEXT: kshiftrw $3, %k0, %k1
-; NoVLX-NEXT: kshiftrw $2, %k0, %k2
-; NoVLX-NEXT: kshiftrw $1, %k0, %k3
-; NoVLX-NEXT: kmovw %k3, %eax
-; NoVLX-NEXT: kmovw %k0, %ecx
-; NoVLX-NEXT: vmovd %ecx, %xmm1
-; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
-; NoVLX-NEXT: kmovw %k2, %eax
-; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kmovw %k0, %esi
+; NoVLX-NEXT: vmovd %esi, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %edx, %xmm1, %xmm1
+; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm1, %xmm1
; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0
; NoVLX-NEXT: kmovw %edi, %k0
; NoVLX-NEXT: kshiftrw $3, %k0, %k1
-; NoVLX-NEXT: kshiftrw $2, %k0, %k2
-; NoVLX-NEXT: kshiftrw $1, %k0, %k3
-; NoVLX-NEXT: kmovw %k3, %eax
-; NoVLX-NEXT: kmovw %k0, %ecx
-; NoVLX-NEXT: vmovd %ecx, %xmm1
-; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
-; NoVLX-NEXT: kmovw %k2, %eax
-; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kmovw %k0, %esi
+; NoVLX-NEXT: vmovd %esi, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %edx, %xmm1, %xmm1
+; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm1, %xmm1
; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0
; NoVLX-NEXT: kmovw %edi, %k0
; NoVLX-NEXT: kshiftrw $3, %k0, %k1
-; NoVLX-NEXT: kshiftrw $2, %k0, %k2
-; NoVLX-NEXT: kshiftrw $1, %k0, %k3
-; NoVLX-NEXT: kmovw %k3, %eax
-; NoVLX-NEXT: kmovw %k0, %ecx
-; NoVLX-NEXT: vmovd %ecx, %xmm1
-; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
-; NoVLX-NEXT: kmovw %k2, %eax
-; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kmovw %k0, %esi
+; NoVLX-NEXT: vmovd %esi, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %edx, %xmm1, %xmm1
+; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm1, %xmm1
; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0
; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0
; NoVLX-NEXT: kmovw %edi, %k0
; NoVLX-NEXT: kshiftrw $3, %k0, %k1
-; NoVLX-NEXT: kshiftrw $2, %k0, %k2
-; NoVLX-NEXT: kshiftrw $1, %k0, %k3
-; NoVLX-NEXT: kmovw %k3, %eax
-; NoVLX-NEXT: kmovw %k0, %ecx
-; NoVLX-NEXT: vmovd %ecx, %xmm1
-; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
-; NoVLX-NEXT: kmovw %k2, %eax
-; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kmovw %k0, %esi
+; NoVLX-NEXT: vmovd %esi, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %edx, %xmm1, %xmm1
+; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm1, %xmm1
; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0
; NoVLX-NEXT: kmovw %edi, %k0
; NoVLX-NEXT: kshiftrw $3, %k0, %k1
-; NoVLX-NEXT: kshiftrw $2, %k0, %k2
-; NoVLX-NEXT: kshiftrw $1, %k0, %k3
-; NoVLX-NEXT: kmovw %k3, %eax
-; NoVLX-NEXT: kmovw %k0, %ecx
-; NoVLX-NEXT: vmovd %ecx, %xmm1
-; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
-; NoVLX-NEXT: kmovw %k2, %eax
-; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kmovw %k0, %esi
+; NoVLX-NEXT: vmovd %esi, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %edx, %xmm1, %xmm1
+; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm1, %xmm1
; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0
; NoVLX-NEXT: kmovw %edi, %k0
; NoVLX-NEXT: kshiftrw $3, %k0, %k1
-; NoVLX-NEXT: kshiftrw $2, %k0, %k2
-; NoVLX-NEXT: kshiftrw $1, %k0, %k3
-; NoVLX-NEXT: kmovw %k3, %eax
-; NoVLX-NEXT: kmovw %k0, %ecx
-; NoVLX-NEXT: vmovd %ecx, %xmm1
-; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
-; NoVLX-NEXT: kmovw %k2, %eax
-; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kmovw %k0, %esi
+; NoVLX-NEXT: vmovd %esi, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %edx, %xmm1, %xmm1
+; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm1, %xmm1
; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0
; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0
; NoVLX-NEXT: kmovw %edi, %k0
; NoVLX-NEXT: kshiftrw $3, %k0, %k1
-; NoVLX-NEXT: kshiftrw $2, %k0, %k2
-; NoVLX-NEXT: kshiftrw $1, %k0, %k3
-; NoVLX-NEXT: kmovw %k3, %eax
-; NoVLX-NEXT: kmovw %k0, %ecx
-; NoVLX-NEXT: vmovd %ecx, %xmm1
-; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
-; NoVLX-NEXT: kmovw %k2, %eax
-; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kmovw %k0, %esi
+; NoVLX-NEXT: vmovd %esi, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %edx, %xmm1, %xmm1
+; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm1, %xmm1
; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0
; NoVLX-NEXT: kmovw %edi, %k0
; NoVLX-NEXT: kshiftrw $3, %k0, %k1
-; NoVLX-NEXT: kshiftrw $2, %k0, %k2
-; NoVLX-NEXT: kshiftrw $1, %k0, %k3
-; NoVLX-NEXT: kmovw %k3, %eax
-; NoVLX-NEXT: kmovw %k0, %ecx
-; NoVLX-NEXT: vmovd %ecx, %xmm1
-; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
-; NoVLX-NEXT: kmovw %k2, %eax
-; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kmovw %k0, %esi
+; NoVLX-NEXT: vmovd %esi, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %edx, %xmm1, %xmm1
+; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm1, %xmm1
; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0
; NoVLX-NEXT: kmovw %edi, %k0
; NoVLX-NEXT: kshiftrw $3, %k0, %k1
-; NoVLX-NEXT: kshiftrw $2, %k0, %k2
-; NoVLX-NEXT: kshiftrw $1, %k0, %k3
-; NoVLX-NEXT: kmovw %k3, %eax
-; NoVLX-NEXT: kmovw %k0, %ecx
-; NoVLX-NEXT: vmovd %ecx, %xmm1
-; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
-; NoVLX-NEXT: kmovw %k2, %eax
-; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kmovw %k0, %esi
+; NoVLX-NEXT: vmovd %esi, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %edx, %xmm1, %xmm1
+; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm1, %xmm1
; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0
; NoVLX-NEXT: kmovw %edi, %k0
; NoVLX-NEXT: kshiftrw $3, %k0, %k1
-; NoVLX-NEXT: kshiftrw $2, %k0, %k2
-; NoVLX-NEXT: kshiftrw $1, %k0, %k3
-; NoVLX-NEXT: kmovw %k3, %eax
-; NoVLX-NEXT: kmovw %k0, %ecx
-; NoVLX-NEXT: vmovd %ecx, %xmm1
-; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
-; NoVLX-NEXT: kmovw %k2, %eax
-; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kmovw %k0, %esi
+; NoVLX-NEXT: vmovd %esi, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %edx, %xmm1, %xmm1
+; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm1, %xmm1
; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
; NoVLX-NEXT: kxorw %k0, %k0, %k0
; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0
; NoVLX-NEXT: kmovw %edi, %k0
; NoVLX-NEXT: kshiftrw $3, %k0, %k1
-; NoVLX-NEXT: kshiftrw $2, %k0, %k2
-; NoVLX-NEXT: kshiftrw $1, %k0, %k3
-; NoVLX-NEXT: kmovw %k3, %eax
-; NoVLX-NEXT: kmovw %k0, %ecx
-; NoVLX-NEXT: vmovd %ecx, %xmm1
-; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
-; NoVLX-NEXT: kmovw %k2, %eax
-; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kmovw %k0, %esi
+; NoVLX-NEXT: vmovd %esi, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %edx, %xmm1, %xmm1
+; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm1, %xmm1
; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
; NoVLX-NEXT: kxorw %k0, %k0, %k0
; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0
; NoVLX-NEXT: kmovw %edi, %k0
; NoVLX-NEXT: kshiftrw $3, %k0, %k1
-; NoVLX-NEXT: kshiftrw $2, %k0, %k2
-; NoVLX-NEXT: kshiftrw $1, %k0, %k3
-; NoVLX-NEXT: kmovw %k3, %eax
-; NoVLX-NEXT: kmovw %k0, %ecx
-; NoVLX-NEXT: vmovd %ecx, %xmm1
-; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
-; NoVLX-NEXT: kmovw %k2, %eax
-; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kmovw %k0, %esi
+; NoVLX-NEXT: vmovd %esi, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %edx, %xmm1, %xmm1
+; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm1, %xmm1
; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0
; NoVLX-NEXT: kxorw %k0, %k0, %k0
; NoVLX-NEXT: # kill: def %ymm1 killed %ymm1 def %zmm1
; NoVLX-NEXT: # kill: def %ymm0 killed %ymm0 def %zmm0
; NoVLX-NEXT: vpcmpltud %zmm1, %zmm0, %k0
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $7, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r8d
-; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $6, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r9d
-; NoVLX-NEXT: kshiftrw $3, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
; NoVLX-NEXT: kshiftrw $4, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %esi
-; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %edi
-; NoVLX-NEXT: kshiftrw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kmovw %k0, %ecx
-; NoVLX-NEXT: kshiftrw $7, %k0, %k0
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kmovw %k0, %edx
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $0, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %edi, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $4, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $5, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r8d, %xmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: # kill: def %ymm0 killed %ymm0 def %zmm0
; NoVLX-NEXT: vmovdqa (%rdi), %ymm1
; NoVLX-NEXT: vpcmpltud %zmm1, %zmm0, %k0
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $7, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r8d
-; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $6, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r9d
-; NoVLX-NEXT: kshiftrw $3, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
; NoVLX-NEXT: kshiftrw $4, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %esi
-; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %edi
-; NoVLX-NEXT: kshiftrw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kmovw %k0, %ecx
-; NoVLX-NEXT: kshiftrw $7, %k0, %k0
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kmovw %k0, %edx
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $0, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %edi, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $4, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $5, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r8d, %xmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: # kill: def %ymm0 killed %ymm0 def %zmm0
; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: vpcmpltud %zmm1, %zmm0, %k0 {%k1}
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $7, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r8d
-; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $6, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r9d
-; NoVLX-NEXT: kshiftrw $3, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
; NoVLX-NEXT: kshiftrw $4, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %esi
-; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %edi
-; NoVLX-NEXT: kshiftrw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kmovw %k0, %ecx
-; NoVLX-NEXT: kshiftrw $7, %k0, %k0
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kmovw %k0, %edx
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $0, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %edi, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $4, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $5, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r8d, %xmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: vmovdqa (%rsi), %ymm1
; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: vpcmpltud %zmm1, %zmm0, %k0 {%k1}
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $7, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r8d
-; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $6, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r9d
-; NoVLX-NEXT: kshiftrw $3, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
; NoVLX-NEXT: kshiftrw $4, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %esi
-; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %edi
-; NoVLX-NEXT: kshiftrw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kmovw %k0, %ecx
-; NoVLX-NEXT: kshiftrw $7, %k0, %k0
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kmovw %k0, %edx
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $0, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %edi, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $4, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $5, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r8d, %xmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: # kill: def %ymm0 killed %ymm0 def %zmm0
; NoVLX-NEXT: vpbroadcastd (%rdi), %ymm1
; NoVLX-NEXT: vpcmpltud %zmm1, %zmm0, %k0
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $7, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r8d
-; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $6, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r9d
-; NoVLX-NEXT: kshiftrw $3, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
; NoVLX-NEXT: kshiftrw $4, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %esi
-; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %edi
-; NoVLX-NEXT: kshiftrw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kmovw %k0, %ecx
-; NoVLX-NEXT: kshiftrw $7, %k0, %k0
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kmovw %k0, %edx
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $0, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %edi, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $4, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $5, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r8d, %xmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: vpbroadcastd (%rsi), %ymm1
; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: vpcmpltud %zmm1, %zmm0, %k0 {%k1}
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $7, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r8d
-; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $6, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r9d
-; NoVLX-NEXT: kshiftrw $3, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
; NoVLX-NEXT: kshiftrw $4, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %esi
-; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %edi
-; NoVLX-NEXT: kshiftrw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kmovw %k0, %ecx
-; NoVLX-NEXT: kshiftrw $7, %k0, %k0
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kmovw %k0, %edx
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $0, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %edi, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $4, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $5, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r8d, %xmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: # kill: def %ymm1 killed %ymm1 def %zmm1
; NoVLX-NEXT: # kill: def %ymm0 killed %ymm0 def %zmm0
; NoVLX-NEXT: vpcmpltud %zmm1, %zmm0, %k0
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $7, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r8d
-; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $6, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r9d
-; NoVLX-NEXT: kshiftrw $3, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
; NoVLX-NEXT: kshiftrw $4, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %esi
-; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %edi
-; NoVLX-NEXT: kshiftrw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kmovw %k0, %ecx
-; NoVLX-NEXT: kshiftrw $7, %k0, %k0
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kmovw %k0, %edx
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $0, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %edi, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $4, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $5, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r8d, %xmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: # kill: def %ymm0 killed %ymm0 def %zmm0
; NoVLX-NEXT: vmovdqa (%rdi), %ymm1
; NoVLX-NEXT: vpcmpltud %zmm1, %zmm0, %k0
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $7, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r8d
-; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $6, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r9d
-; NoVLX-NEXT: kshiftrw $3, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
; NoVLX-NEXT: kshiftrw $4, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %esi
-; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %edi
-; NoVLX-NEXT: kshiftrw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kmovw %k0, %ecx
-; NoVLX-NEXT: kshiftrw $7, %k0, %k0
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kmovw %k0, %edx
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $0, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %edi, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $4, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $5, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r8d, %xmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: # kill: def %ymm0 killed %ymm0 def %zmm0
; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: vpcmpltud %zmm1, %zmm0, %k0 {%k1}
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $7, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r8d
-; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $6, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r9d
-; NoVLX-NEXT: kshiftrw $3, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
; NoVLX-NEXT: kshiftrw $4, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %esi
-; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %edi
-; NoVLX-NEXT: kshiftrw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kmovw %k0, %ecx
-; NoVLX-NEXT: kshiftrw $7, %k0, %k0
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kmovw %k0, %edx
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $0, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %edi, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $4, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $5, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r8d, %xmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: vmovdqa (%rsi), %ymm1
; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: vpcmpltud %zmm1, %zmm0, %k0 {%k1}
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $7, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r8d
-; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $6, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r9d
-; NoVLX-NEXT: kshiftrw $3, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
; NoVLX-NEXT: kshiftrw $4, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %esi
-; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %edi
-; NoVLX-NEXT: kshiftrw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kmovw %k0, %ecx
-; NoVLX-NEXT: kshiftrw $7, %k0, %k0
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kmovw %k0, %edx
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $0, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %edi, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $4, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $5, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r8d, %xmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: # kill: def %ymm0 killed %ymm0 def %zmm0
; NoVLX-NEXT: vpbroadcastd (%rdi), %ymm1
; NoVLX-NEXT: vpcmpltud %zmm1, %zmm0, %k0
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $7, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r8d
-; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $6, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r9d
-; NoVLX-NEXT: kshiftrw $3, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
; NoVLX-NEXT: kshiftrw $4, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %esi
-; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %edi
-; NoVLX-NEXT: kshiftrw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kmovw %k0, %ecx
-; NoVLX-NEXT: kshiftrw $7, %k0, %k0
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kmovw %k0, %edx
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $0, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %edi, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $4, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $5, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r8d, %xmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: vpbroadcastd (%rsi), %ymm1
; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: vpcmpltud %zmm1, %zmm0, %k0 {%k1}
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $7, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r8d
-; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $6, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r9d
-; NoVLX-NEXT: kshiftrw $3, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
; NoVLX-NEXT: kshiftrw $4, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %esi
-; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %edi
-; NoVLX-NEXT: kshiftrw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kmovw %k0, %ecx
-; NoVLX-NEXT: kshiftrw $7, %k0, %k0
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kmovw %k0, %edx
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $0, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %edi, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $4, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $5, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r8d, %xmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
; NoVLX-NEXT: kmovw %edi, %k0
; NoVLX-NEXT: kshiftrw $3, %k0, %k1
-; NoVLX-NEXT: kshiftrw $2, %k0, %k2
-; NoVLX-NEXT: kshiftrw $1, %k0, %k3
-; NoVLX-NEXT: kmovw %k3, %eax
-; NoVLX-NEXT: kmovw %k0, %ecx
-; NoVLX-NEXT: vmovd %ecx, %xmm1
-; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
-; NoVLX-NEXT: kmovw %k2, %eax
-; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kmovw %k0, %esi
+; NoVLX-NEXT: vmovd %esi, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %edx, %xmm1, %xmm1
+; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm1, %xmm1
; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
; NoVLX-NEXT: kmovw %edi, %k0
; NoVLX-NEXT: kshiftrw $3, %k0, %k1
-; NoVLX-NEXT: kshiftrw $2, %k0, %k2
-; NoVLX-NEXT: kshiftrw $1, %k0, %k3
-; NoVLX-NEXT: kmovw %k3, %eax
-; NoVLX-NEXT: kmovw %k0, %ecx
-; NoVLX-NEXT: vmovd %ecx, %xmm1
-; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
-; NoVLX-NEXT: kmovw %k2, %eax
-; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kmovw %k0, %esi
+; NoVLX-NEXT: vmovd %esi, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %edx, %xmm1, %xmm1
+; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm1, %xmm1
; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
; NoVLX-NEXT: kmovw %edi, %k0
; NoVLX-NEXT: kshiftrw $3, %k0, %k1
-; NoVLX-NEXT: kshiftrw $2, %k0, %k2
-; NoVLX-NEXT: kshiftrw $1, %k0, %k3
-; NoVLX-NEXT: kmovw %k3, %eax
-; NoVLX-NEXT: kmovw %k0, %ecx
-; NoVLX-NEXT: vmovd %ecx, %xmm1
-; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
-; NoVLX-NEXT: kmovw %k2, %eax
-; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kmovw %k0, %esi
+; NoVLX-NEXT: vmovd %esi, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %edx, %xmm1, %xmm1
+; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm1, %xmm1
; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0
; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
; NoVLX-NEXT: kmovw %edi, %k0
; NoVLX-NEXT: kshiftrw $3, %k0, %k1
-; NoVLX-NEXT: kshiftrw $2, %k0, %k2
-; NoVLX-NEXT: kshiftrw $1, %k0, %k3
-; NoVLX-NEXT: kmovw %k3, %eax
-; NoVLX-NEXT: kmovw %k0, %ecx
-; NoVLX-NEXT: vmovd %ecx, %xmm1
-; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
-; NoVLX-NEXT: kmovw %k2, %eax
-; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kmovw %k0, %esi
+; NoVLX-NEXT: vmovd %esi, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %edx, %xmm1, %xmm1
+; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm1, %xmm1
; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
; NoVLX-NEXT: kmovw %edi, %k0
; NoVLX-NEXT: kshiftrw $3, %k0, %k1
-; NoVLX-NEXT: kshiftrw $2, %k0, %k2
-; NoVLX-NEXT: kshiftrw $1, %k0, %k3
-; NoVLX-NEXT: kmovw %k3, %eax
-; NoVLX-NEXT: kmovw %k0, %ecx
-; NoVLX-NEXT: vmovd %ecx, %xmm1
-; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
-; NoVLX-NEXT: kmovw %k2, %eax
-; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kmovw %k0, %esi
+; NoVLX-NEXT: vmovd %esi, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %edx, %xmm1, %xmm1
+; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm1, %xmm1
; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
; NoVLX-NEXT: kmovw %edi, %k0
; NoVLX-NEXT: kshiftrw $3, %k0, %k1
-; NoVLX-NEXT: kshiftrw $2, %k0, %k2
-; NoVLX-NEXT: kshiftrw $1, %k0, %k3
-; NoVLX-NEXT: kmovw %k3, %eax
-; NoVLX-NEXT: kmovw %k0, %ecx
-; NoVLX-NEXT: vmovd %ecx, %xmm1
-; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
-; NoVLX-NEXT: kmovw %k2, %eax
-; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kmovw %k0, %esi
+; NoVLX-NEXT: vmovd %esi, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %edx, %xmm1, %xmm1
+; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm1, %xmm1
; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0
; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
; NoVLX-NEXT: kmovw %edi, %k0
; NoVLX-NEXT: kshiftrw $3, %k0, %k1
-; NoVLX-NEXT: kshiftrw $2, %k0, %k2
-; NoVLX-NEXT: kshiftrw $1, %k0, %k3
-; NoVLX-NEXT: kmovw %k3, %eax
-; NoVLX-NEXT: kmovw %k0, %ecx
-; NoVLX-NEXT: vmovd %ecx, %xmm1
-; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
-; NoVLX-NEXT: kmovw %k2, %eax
-; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kmovw %k0, %esi
+; NoVLX-NEXT: vmovd %esi, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %edx, %xmm1, %xmm1
+; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm1, %xmm1
; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
; NoVLX-NEXT: kmovw %edi, %k0
; NoVLX-NEXT: kshiftrw $3, %k0, %k1
-; NoVLX-NEXT: kshiftrw $2, %k0, %k2
-; NoVLX-NEXT: kshiftrw $1, %k0, %k3
-; NoVLX-NEXT: kmovw %k3, %eax
-; NoVLX-NEXT: kmovw %k0, %ecx
-; NoVLX-NEXT: vmovd %ecx, %xmm1
-; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
-; NoVLX-NEXT: kmovw %k2, %eax
-; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kmovw %k0, %esi
+; NoVLX-NEXT: vmovd %esi, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %edx, %xmm1, %xmm1
+; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm1, %xmm1
; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
; NoVLX-NEXT: kmovw %edi, %k0
; NoVLX-NEXT: kshiftrw $3, %k0, %k1
-; NoVLX-NEXT: kshiftrw $2, %k0, %k2
-; NoVLX-NEXT: kshiftrw $1, %k0, %k3
-; NoVLX-NEXT: kmovw %k3, %eax
-; NoVLX-NEXT: kmovw %k0, %ecx
-; NoVLX-NEXT: vmovd %ecx, %xmm1
-; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
-; NoVLX-NEXT: kmovw %k2, %eax
-; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kmovw %k0, %esi
+; NoVLX-NEXT: vmovd %esi, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %edx, %xmm1, %xmm1
+; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm1, %xmm1
; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
; NoVLX-NEXT: kmovw %edi, %k0
; NoVLX-NEXT: kshiftrw $3, %k0, %k1
-; NoVLX-NEXT: kshiftrw $2, %k0, %k2
-; NoVLX-NEXT: kshiftrw $1, %k0, %k3
-; NoVLX-NEXT: kmovw %k3, %eax
-; NoVLX-NEXT: kmovw %k0, %ecx
-; NoVLX-NEXT: vmovd %ecx, %xmm1
-; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
-; NoVLX-NEXT: kmovw %k2, %eax
-; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kmovw %k0, %esi
+; NoVLX-NEXT: vmovd %esi, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %edx, %xmm1, %xmm1
+; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm1, %xmm1
; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
; NoVLX-NEXT: kxorw %k0, %k0, %k0
; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
; NoVLX-NEXT: kmovw %edi, %k0
; NoVLX-NEXT: kshiftrw $3, %k0, %k1
-; NoVLX-NEXT: kshiftrw $2, %k0, %k2
-; NoVLX-NEXT: kshiftrw $1, %k0, %k3
-; NoVLX-NEXT: kmovw %k3, %eax
-; NoVLX-NEXT: kmovw %k0, %ecx
-; NoVLX-NEXT: vmovd %ecx, %xmm1
-; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
-; NoVLX-NEXT: kmovw %k2, %eax
-; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kmovw %k0, %esi
+; NoVLX-NEXT: vmovd %esi, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %edx, %xmm1, %xmm1
+; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm1, %xmm1
; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
; NoVLX-NEXT: kxorw %k0, %k0, %k0
; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
; NoVLX-NEXT: kmovw %edi, %k0
; NoVLX-NEXT: kshiftrw $3, %k0, %k1
-; NoVLX-NEXT: kshiftrw $2, %k0, %k2
-; NoVLX-NEXT: kshiftrw $1, %k0, %k3
-; NoVLX-NEXT: kmovw %k3, %eax
-; NoVLX-NEXT: kmovw %k0, %ecx
-; NoVLX-NEXT: vmovd %ecx, %xmm1
-; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
-; NoVLX-NEXT: kmovw %k2, %eax
-; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kmovw %k0, %esi
+; NoVLX-NEXT: vmovd %esi, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %edx, %xmm1, %xmm1
+; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm1, %xmm1
; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0
; NoVLX-NEXT: kxorw %k0, %k0, %k0
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
; NoVLX-NEXT: vpcmpltuq %zmm1, %zmm0, %k0
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $7, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r8d
-; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $6, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r9d
-; NoVLX-NEXT: kshiftrw $3, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
; NoVLX-NEXT: kshiftrw $4, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %esi
-; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %edi
-; NoVLX-NEXT: kshiftrw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kmovw %k0, %ecx
-; NoVLX-NEXT: kshiftrw $7, %k0, %k0
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kmovw %k0, %edx
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $0, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %edi, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $4, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $5, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r8d, %xmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
; NoVLX-NEXT: vpcmpltuq (%rdi), %zmm0, %k0
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $7, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r8d
-; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $6, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r9d
-; NoVLX-NEXT: kshiftrw $3, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
; NoVLX-NEXT: kshiftrw $4, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %esi
-; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %edi
-; NoVLX-NEXT: kshiftrw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kmovw %k0, %ecx
-; NoVLX-NEXT: kshiftrw $7, %k0, %k0
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kmovw %k0, %edx
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $0, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %edi, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $4, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $5, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r8d, %xmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: subq $32, %rsp
; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: vpcmpltuq %zmm1, %zmm0, %k0 {%k1}
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $7, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r8d
-; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $6, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r9d
-; NoVLX-NEXT: kshiftrw $3, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
; NoVLX-NEXT: kshiftrw $4, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %esi
-; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %edi
-; NoVLX-NEXT: kshiftrw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kmovw %k0, %ecx
-; NoVLX-NEXT: kshiftrw $7, %k0, %k0
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kmovw %k0, %edx
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $0, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %edi, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $4, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $5, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r8d, %xmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: subq $32, %rsp
; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: vpcmpltuq (%rsi), %zmm0, %k0 {%k1}
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $7, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r8d
-; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $6, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r9d
-; NoVLX-NEXT: kshiftrw $3, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
; NoVLX-NEXT: kshiftrw $4, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %esi
-; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %edi
-; NoVLX-NEXT: kshiftrw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kmovw %k0, %ecx
-; NoVLX-NEXT: kshiftrw $7, %k0, %k0
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kmovw %k0, %edx
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $0, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %edi, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $4, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $5, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r8d, %xmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
; NoVLX-NEXT: vpcmpltuq (%rdi){1to8}, %zmm0, %k0
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $7, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r8d
-; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $6, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r9d
-; NoVLX-NEXT: kshiftrw $3, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
; NoVLX-NEXT: kshiftrw $4, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %esi
-; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %edi
-; NoVLX-NEXT: kshiftrw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kmovw %k0, %ecx
-; NoVLX-NEXT: kshiftrw $7, %k0, %k0
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kmovw %k0, %edx
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $0, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %edi, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $4, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $5, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r8d, %xmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: subq $32, %rsp
; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: vpcmpltuq (%rsi){1to8}, %zmm0, %k0 {%k1}
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $7, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r8d
-; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $6, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r9d
-; NoVLX-NEXT: kshiftrw $3, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
; NoVLX-NEXT: kshiftrw $4, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %esi
-; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %edi
-; NoVLX-NEXT: kshiftrw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kmovw %k0, %ecx
-; NoVLX-NEXT: kshiftrw $7, %k0, %k0
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kmovw %k0, %edx
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $0, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %edi, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $4, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $5, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r8d, %xmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
; NoVLX-NEXT: vpcmpltuq %zmm1, %zmm0, %k0
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $7, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r8d
-; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $6, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r9d
-; NoVLX-NEXT: kshiftrw $3, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
; NoVLX-NEXT: kshiftrw $4, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %esi
-; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %edi
-; NoVLX-NEXT: kshiftrw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kmovw %k0, %ecx
-; NoVLX-NEXT: kshiftrw $7, %k0, %k0
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kmovw %k0, %edx
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $0, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %edi, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $4, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $5, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r8d, %xmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
; NoVLX-NEXT: vpcmpltuq (%rdi), %zmm0, %k0
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $7, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r8d
-; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $6, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r9d
-; NoVLX-NEXT: kshiftrw $3, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
; NoVLX-NEXT: kshiftrw $4, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %esi
-; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %edi
-; NoVLX-NEXT: kshiftrw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kmovw %k0, %ecx
-; NoVLX-NEXT: kshiftrw $7, %k0, %k0
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kmovw %k0, %edx
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $0, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %edi, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $4, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $5, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r8d, %xmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: subq $64, %rsp
; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: vpcmpltuq %zmm1, %zmm0, %k0 {%k1}
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $7, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r8d
-; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $6, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r9d
-; NoVLX-NEXT: kshiftrw $3, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
; NoVLX-NEXT: kshiftrw $4, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %esi
-; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %edi
-; NoVLX-NEXT: kshiftrw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kmovw %k0, %ecx
-; NoVLX-NEXT: kshiftrw $7, %k0, %k0
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kmovw %k0, %edx
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $0, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %edi, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $4, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $5, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r8d, %xmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: subq $64, %rsp
; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: vpcmpltuq (%rsi), %zmm0, %k0 {%k1}
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $7, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r8d
-; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $6, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r9d
-; NoVLX-NEXT: kshiftrw $3, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
; NoVLX-NEXT: kshiftrw $4, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %esi
-; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %edi
-; NoVLX-NEXT: kshiftrw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kmovw %k0, %ecx
-; NoVLX-NEXT: kshiftrw $7, %k0, %k0
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kmovw %k0, %edx
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $0, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %edi, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $4, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $5, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r8d, %xmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
; NoVLX-NEXT: vpcmpltuq (%rdi){1to8}, %zmm0, %k0
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $7, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r8d
-; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $6, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r9d
-; NoVLX-NEXT: kshiftrw $3, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
; NoVLX-NEXT: kshiftrw $4, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %esi
-; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %edi
-; NoVLX-NEXT: kshiftrw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kmovw %k0, %ecx
-; NoVLX-NEXT: kshiftrw $7, %k0, %k0
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kmovw %k0, %edx
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $0, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %edi, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $4, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $5, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r8d, %xmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: subq $64, %rsp
; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: vpcmpltuq (%rsi){1to8}, %zmm0, %k0 {%k1}
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $7, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r8d
-; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $6, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r9d
-; NoVLX-NEXT: kshiftrw $3, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
; NoVLX-NEXT: kshiftrw $4, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %esi
-; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %edi
-; NoVLX-NEXT: kshiftrw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kmovw %k0, %ecx
-; NoVLX-NEXT: kshiftrw $7, %k0, %k0
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kmovw %k0, %edx
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $0, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %edi, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $4, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $5, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r8d, %xmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: # kill: def %ymm1 killed %ymm1 def %zmm1
; NoVLX-NEXT: # kill: def %ymm0 killed %ymm0 def %zmm0
; NoVLX-NEXT: vcmpeqps %zmm1, %zmm0, %k0
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $7, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r8d
-; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $6, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r9d
-; NoVLX-NEXT: kshiftrw $3, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
; NoVLX-NEXT: kshiftrw $4, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %esi
-; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %edi
-; NoVLX-NEXT: kshiftrw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kmovw %k0, %ecx
-; NoVLX-NEXT: kshiftrw $7, %k0, %k0
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kmovw %k0, %edx
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vxorps %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $0, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %edi, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $4, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $5, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r8d, %xmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: # kill: def %ymm0 killed %ymm0 def %zmm0
; NoVLX-NEXT: vmovaps (%rdi), %ymm1
; NoVLX-NEXT: vcmpeqps %zmm1, %zmm0, %k0
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $7, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r8d
-; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $6, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r9d
-; NoVLX-NEXT: kshiftrw $3, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
; NoVLX-NEXT: kshiftrw $4, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %esi
-; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %edi
-; NoVLX-NEXT: kshiftrw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kmovw %k0, %ecx
-; NoVLX-NEXT: kshiftrw $7, %k0, %k0
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kmovw %k0, %edx
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vxorps %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $0, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %edi, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $4, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $5, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r8d, %xmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: # kill: def %ymm0 killed %ymm0 def %zmm0
; NoVLX-NEXT: vbroadcastss (%rdi), %ymm1
; NoVLX-NEXT: vcmpeqps %zmm1, %zmm0, %k0
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $7, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r8d
-; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $6, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r9d
-; NoVLX-NEXT: kshiftrw $3, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
; NoVLX-NEXT: kshiftrw $4, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %esi
-; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %edi
-; NoVLX-NEXT: kshiftrw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kmovw %k0, %ecx
-; NoVLX-NEXT: kshiftrw $7, %k0, %k0
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kmovw %k0, %edx
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vxorps %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $0, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %edi, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $4, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $5, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r8d, %xmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: # kill: def %ymm0 killed %ymm0 def %zmm0
; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: vcmpeqps %zmm1, %zmm0, %k0 {%k1}
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $7, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r8d
-; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $6, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r9d
-; NoVLX-NEXT: kshiftrw $3, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
; NoVLX-NEXT: kshiftrw $4, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %esi
-; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %edi
-; NoVLX-NEXT: kshiftrw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kmovw %k0, %ecx
-; NoVLX-NEXT: kshiftrw $7, %k0, %k0
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kmovw %k0, %edx
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vxorps %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $0, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %edi, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $4, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $5, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r8d, %xmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: vmovaps (%rsi), %ymm1
; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: vcmpeqps %zmm1, %zmm0, %k0 {%k1}
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $7, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r8d
-; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $6, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r9d
-; NoVLX-NEXT: kshiftrw $3, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
; NoVLX-NEXT: kshiftrw $4, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %esi
-; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %edi
-; NoVLX-NEXT: kshiftrw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kmovw %k0, %ecx
-; NoVLX-NEXT: kshiftrw $7, %k0, %k0
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kmovw %k0, %edx
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vxorps %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $0, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %edi, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $4, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $5, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r8d, %xmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: vbroadcastss (%rsi), %ymm1
; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: vcmpeqps %zmm1, %zmm0, %k0 {%k1}
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $7, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r8d
-; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $6, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r9d
-; NoVLX-NEXT: kshiftrw $3, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
; NoVLX-NEXT: kshiftrw $4, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %esi
-; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %edi
-; NoVLX-NEXT: kshiftrw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kmovw %k0, %ecx
-; NoVLX-NEXT: kshiftrw $7, %k0, %k0
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kmovw %k0, %edx
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vxorps %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $0, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %edi, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $4, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $5, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r8d, %xmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: # kill: def %ymm1 killed %ymm1 def %zmm1
; NoVLX-NEXT: # kill: def %ymm0 killed %ymm0 def %zmm0
; NoVLX-NEXT: vcmpeqps %zmm1, %zmm0, %k0
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $7, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r8d
-; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $6, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r9d
-; NoVLX-NEXT: kshiftrw $3, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
; NoVLX-NEXT: kshiftrw $4, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %esi
-; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %edi
-; NoVLX-NEXT: kshiftrw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kmovw %k0, %ecx
-; NoVLX-NEXT: kshiftrw $7, %k0, %k0
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kmovw %k0, %edx
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vxorps %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $0, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %edi, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $4, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $5, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r8d, %xmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: # kill: def %ymm0 killed %ymm0 def %zmm0
; NoVLX-NEXT: vmovaps (%rdi), %ymm1
; NoVLX-NEXT: vcmpeqps %zmm1, %zmm0, %k0
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $7, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r8d
-; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $6, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r9d
-; NoVLX-NEXT: kshiftrw $3, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
; NoVLX-NEXT: kshiftrw $4, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %esi
-; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %edi
-; NoVLX-NEXT: kshiftrw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kmovw %k0, %ecx
-; NoVLX-NEXT: kshiftrw $7, %k0, %k0
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kmovw %k0, %edx
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vxorps %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $0, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %edi, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $4, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $5, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r8d, %xmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: # kill: def %ymm0 killed %ymm0 def %zmm0
; NoVLX-NEXT: vbroadcastss (%rdi), %ymm1
; NoVLX-NEXT: vcmpeqps %zmm1, %zmm0, %k0
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $7, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r8d
-; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $6, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r9d
-; NoVLX-NEXT: kshiftrw $3, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
; NoVLX-NEXT: kshiftrw $4, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %esi
-; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %edi
-; NoVLX-NEXT: kshiftrw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kmovw %k0, %ecx
-; NoVLX-NEXT: kshiftrw $7, %k0, %k0
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kmovw %k0, %edx
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vxorps %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $0, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %edi, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $4, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $5, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r8d, %xmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: # kill: def %ymm0 killed %ymm0 def %zmm0
; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: vcmpeqps %zmm1, %zmm0, %k0 {%k1}
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $7, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r8d
-; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $6, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r9d
-; NoVLX-NEXT: kshiftrw $3, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
; NoVLX-NEXT: kshiftrw $4, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %esi
-; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %edi
-; NoVLX-NEXT: kshiftrw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kmovw %k0, %ecx
-; NoVLX-NEXT: kshiftrw $7, %k0, %k0
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kmovw %k0, %edx
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vxorps %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $0, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %edi, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $4, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $5, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r8d, %xmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: vmovaps (%rsi), %ymm1
; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: vcmpeqps %zmm1, %zmm0, %k0 {%k1}
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $7, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r8d
-; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $6, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r9d
-; NoVLX-NEXT: kshiftrw $3, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
; NoVLX-NEXT: kshiftrw $4, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %esi
-; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %edi
-; NoVLX-NEXT: kshiftrw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kmovw %k0, %ecx
-; NoVLX-NEXT: kshiftrw $7, %k0, %k0
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kmovw %k0, %edx
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vxorps %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $0, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %edi, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $4, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $5, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r8d, %xmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: vbroadcastss (%rsi), %ymm1
; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: vcmpeqps %zmm1, %zmm0, %k0 {%k1}
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $7, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r8d
-; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $6, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r9d
-; NoVLX-NEXT: kshiftrw $3, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
; NoVLX-NEXT: kshiftrw $4, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %esi
-; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %edi
-; NoVLX-NEXT: kshiftrw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kmovw %k0, %ecx
-; NoVLX-NEXT: kshiftrw $7, %k0, %k0
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kmovw %k0, %edx
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vxorps %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $0, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %edi, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $4, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $5, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r8d, %xmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
; NoVLX-NEXT: vcmpeqpd %zmm1, %zmm0, %k0
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $7, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r8d
-; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $6, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r9d
-; NoVLX-NEXT: kshiftrw $3, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
; NoVLX-NEXT: kshiftrw $4, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %esi
-; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %edi
-; NoVLX-NEXT: kshiftrw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kmovw %k0, %ecx
-; NoVLX-NEXT: kshiftrw $7, %k0, %k0
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kmovw %k0, %edx
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vxorpd %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $0, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %edi, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $4, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $5, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r8d, %xmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
; NoVLX-NEXT: vcmpeqpd (%rdi), %zmm0, %k0
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $7, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r8d
-; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $6, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r9d
-; NoVLX-NEXT: kshiftrw $3, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
; NoVLX-NEXT: kshiftrw $4, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %esi
-; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %edi
-; NoVLX-NEXT: kshiftrw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kmovw %k0, %ecx
-; NoVLX-NEXT: kshiftrw $7, %k0, %k0
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kmovw %k0, %edx
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vxorpd %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $0, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %edi, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $4, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $5, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r8d, %xmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
; NoVLX-NEXT: vcmpeqpd (%rdi){1to8}, %zmm0, %k0
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $7, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r8d
-; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $6, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r9d
-; NoVLX-NEXT: kshiftrw $3, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
; NoVLX-NEXT: kshiftrw $4, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %esi
-; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %edi
-; NoVLX-NEXT: kshiftrw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kmovw %k0, %ecx
-; NoVLX-NEXT: kshiftrw $7, %k0, %k0
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kmovw %k0, %edx
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vxorpd %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $0, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %edi, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $4, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $5, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r8d, %xmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: subq $32, %rsp
; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: vcmpeqpd %zmm1, %zmm0, %k0 {%k1}
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $7, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r8d
-; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $6, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r9d
-; NoVLX-NEXT: kshiftrw $3, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
; NoVLX-NEXT: kshiftrw $4, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %esi
-; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %edi
-; NoVLX-NEXT: kshiftrw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kmovw %k0, %ecx
-; NoVLX-NEXT: kshiftrw $7, %k0, %k0
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kmovw %k0, %edx
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vxorpd %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $0, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %edi, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $4, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $5, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r8d, %xmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: subq $32, %rsp
; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: vcmpeqpd (%rsi), %zmm0, %k0 {%k1}
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $7, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r8d
-; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $6, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r9d
-; NoVLX-NEXT: kshiftrw $3, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
; NoVLX-NEXT: kshiftrw $4, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %esi
-; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %edi
-; NoVLX-NEXT: kshiftrw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kmovw %k0, %ecx
-; NoVLX-NEXT: kshiftrw $7, %k0, %k0
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kmovw %k0, %edx
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vxorpd %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $0, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %edi, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $4, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $5, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r8d, %xmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: subq $32, %rsp
; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: vcmpeqpd (%rsi){1to8}, %zmm0, %k0 {%k1}
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $7, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r8d
-; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $6, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r9d
-; NoVLX-NEXT: kshiftrw $3, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
; NoVLX-NEXT: kshiftrw $4, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %esi
-; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %edi
-; NoVLX-NEXT: kshiftrw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kmovw %k0, %ecx
-; NoVLX-NEXT: kshiftrw $7, %k0, %k0
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kmovw %k0, %edx
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vxorpd %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $0, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %edi, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $4, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $5, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r8d, %xmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
; NoVLX-NEXT: vcmpeqpd %zmm1, %zmm0, %k0
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $7, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r8d
-; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $6, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r9d
-; NoVLX-NEXT: kshiftrw $3, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
; NoVLX-NEXT: kshiftrw $4, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %esi
-; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %edi
-; NoVLX-NEXT: kshiftrw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kmovw %k0, %ecx
-; NoVLX-NEXT: kshiftrw $7, %k0, %k0
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kmovw %k0, %edx
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vxorpd %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $0, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %edi, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $4, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $5, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r8d, %xmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
; NoVLX-NEXT: vcmpeqpd (%rdi), %zmm0, %k0
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $7, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r8d
-; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $6, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r9d
-; NoVLX-NEXT: kshiftrw $3, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
; NoVLX-NEXT: kshiftrw $4, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %esi
-; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %edi
-; NoVLX-NEXT: kshiftrw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kmovw %k0, %ecx
-; NoVLX-NEXT: kshiftrw $7, %k0, %k0
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kmovw %k0, %edx
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vxorpd %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $0, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %edi, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $4, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $5, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r8d, %xmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
; NoVLX-NEXT: vcmpeqpd (%rdi){1to8}, %zmm0, %k0
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $7, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r8d
-; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $6, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r9d
-; NoVLX-NEXT: kshiftrw $3, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
; NoVLX-NEXT: kshiftrw $4, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %esi
-; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %edi
-; NoVLX-NEXT: kshiftrw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kmovw %k0, %ecx
-; NoVLX-NEXT: kshiftrw $7, %k0, %k0
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kmovw %k0, %edx
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vxorpd %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $0, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %edi, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $4, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $5, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r8d, %xmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: subq $64, %rsp
; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: vcmpeqpd %zmm1, %zmm0, %k0 {%k1}
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $7, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r8d
-; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $6, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r9d
-; NoVLX-NEXT: kshiftrw $3, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
; NoVLX-NEXT: kshiftrw $4, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %esi
-; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %edi
-; NoVLX-NEXT: kshiftrw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kmovw %k0, %ecx
-; NoVLX-NEXT: kshiftrw $7, %k0, %k0
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kmovw %k0, %edx
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vxorpd %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $0, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %edi, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $4, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $5, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r8d, %xmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: subq $64, %rsp
; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: vcmpeqpd (%rsi), %zmm0, %k0 {%k1}
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $7, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r8d
-; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $6, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r9d
-; NoVLX-NEXT: kshiftrw $3, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
; NoVLX-NEXT: kshiftrw $4, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %esi
-; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %edi
-; NoVLX-NEXT: kshiftrw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kmovw %k0, %ecx
-; NoVLX-NEXT: kshiftrw $7, %k0, %k0
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kmovw %k0, %edx
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vxorpd %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $0, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %edi, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $4, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $5, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r8d, %xmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: subq $64, %rsp
; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: vcmpeqpd (%rsi){1to8}, %zmm0, %k0 {%k1}
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $7, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r8d
-; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $6, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r9d
-; NoVLX-NEXT: kshiftrw $3, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
; NoVLX-NEXT: kshiftrw $4, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %esi
-; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %edi
-; NoVLX-NEXT: kshiftrw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kmovw %k0, %ecx
-; NoVLX-NEXT: kshiftrw $7, %k0, %k0
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kmovw %k0, %edx
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vxorpd %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $0, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %edi, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $4, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $5, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r8d, %xmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0