The previous implementation (not Custom) does not enforce zeroing of the upper bits. The assumption is that an i1 PRODUCER (truncate and extractelement) must zero all upper bits, so that i1 CONSUMER instructions (test, zext, save, etc.) can be done without additional zeroing.
Make extractelement i1 lowering Custom for all i1 vector types.
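
An example from the new test_extractelement_v4i1 test added below: for

  %t1 = icmp ugt <4 x i32> %a, %b
  %t2 = extractelement <4 x i1> %t1, i32 3
  %res = zext i1 %t2 to i8

the Custom lowering shifts the selected mask bit down to bit 0, so the upper
bits are already zero when the value reaches the zext (SKX codegen):

  vpcmpnleud %xmm1, %xmm0, %k0
  kshiftlw $12, %k0, %k0
  kshiftrw $15, %k0, %k0
  kmovw %k0, %eax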
Differential Revision: http://reviews.llvm.org/D23246
llvm-svn: 278328
setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v64i8, Custom);
setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v32i16, Custom);
setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v64i8, Custom);
+ setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v32i1, Custom);
+ setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v64i1, Custom);
setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v32i16, Custom);
setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v64i8, Custom);
setOperationAction(ISD::SELECT, MVT::v32i1, Custom);
addRegisterClass(MVT::v4i1, &X86::VK4RegClass);
addRegisterClass(MVT::v2i1, &X86::VK2RegClass);
- setOperationAction(ISD::ADD, MVT::v2i1, Expand);
- setOperationAction(ISD::ADD, MVT::v4i1, Expand);
- setOperationAction(ISD::SUB, MVT::v2i1, Expand);
- setOperationAction(ISD::SUB, MVT::v4i1, Expand);
- setOperationAction(ISD::MUL, MVT::v2i1, Expand);
- setOperationAction(ISD::MUL, MVT::v4i1, Expand);
-
- setOperationAction(ISD::TRUNCATE, MVT::v2i1, Custom);
- setOperationAction(ISD::TRUNCATE, MVT::v4i1, Custom);
- setOperationAction(ISD::SETCC, MVT::v4i1, Custom);
- setOperationAction(ISD::SETCC, MVT::v2i1, Custom);
- setOperationAction(ISD::CONCAT_VECTORS, MVT::v4i1, Custom);
+ for (auto VT : { MVT::v2i1, MVT::v4i1 }) {
+ setOperationAction(ISD::ADD, VT, Expand);
+ setOperationAction(ISD::SUB, VT, Expand);
+ setOperationAction(ISD::MUL, VT, Expand);
+ setOperationAction(ISD::VSELECT, VT, Expand);
+
+ setOperationAction(ISD::TRUNCATE, VT, Custom);
+ setOperationAction(ISD::SETCC, VT, Custom);
+ setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
+ setOperationAction(ISD::SELECT, VT, Custom);
+ setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
+ setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
+ }
+
setOperationAction(ISD::CONCAT_VECTORS, MVT::v8i1, Custom);
+ setOperationAction(ISD::CONCAT_VECTORS, MVT::v4i1, Custom);
setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v8i1, Custom);
setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v4i1, Custom);
- setOperationAction(ISD::SELECT, MVT::v4i1, Custom);
- setOperationAction(ISD::SELECT, MVT::v2i1, Custom);
- setOperationAction(ISD::BUILD_VECTOR, MVT::v4i1, Custom);
- setOperationAction(ISD::BUILD_VECTOR, MVT::v2i1, Custom);
- setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v2i1, Custom);
- setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4i1, Custom);
- setOperationAction(ISD::VSELECT, MVT::v2i1, Expand);
- setOperationAction(ISD::VSELECT, MVT::v4i1, Expand);
for (auto VT : { MVT::v4i32, MVT::v8i32 }) {
setOperationAction(ISD::AND, VT, Legal);
}
unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
- if (!Subtarget.hasDQI() && (VecVT.getVectorNumElements() <= 8)) {
+ if ((!Subtarget.hasDQI() && (VecVT.getVectorNumElements() == 8)) ||
+ (VecVT.getVectorNumElements() < 8)) {
// Use kshiftlw/rw instruction.
VecVT = MVT::v16i1;
Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, VecVT,
DAG.getUNDEF(VecVT), Vec,
DAG.getIntPtrConstant(0, dl));
}
unsigned MaxSift = VecVT.getVectorNumElements() - 1;
- Vec = DAG.getNode(X86ISD::VSHLI, dl, VecVT, Vec,
- DAG.getConstant(MaxSift - IdxVal, dl, MVT::i8));
+ if (MaxSift - IdxVal)
+ Vec = DAG.getNode(X86ISD::VSHLI, dl, VecVT, Vec,
+ DAG.getConstant(MaxSift - IdxVal, dl, MVT::i8));
Vec = DAG.getNode(X86ISD::VSRLI, dl, VecVT, Vec,
DAG.getConstant(MaxSift, dl, MVT::i8));
return DAG.getNode(X86ISD::VEXTRACT, dl, MVT::i1, Vec,
def : Pat<(store (i1 1), addr:$dst), (MOV8mi addr:$dst, (i8 1))>;
def : Pat<(store (i1 0), addr:$dst), (MOV8mi addr:$dst, (i8 0))>;
-let Predicates = [HasAVX512] in {
- def : Pat<(i1 (X86Vextract VK16:$src, (iPTR 0))),
- (COPY_TO_REGCLASS VK16:$src, VK1)>;
- def : Pat<(i1 (X86Vextract VK8:$src, (iPTR 0))),
- (COPY_TO_REGCLASS VK8:$src, VK1)>;
-}
-let Predicates = [HasBWI] in {
- def : Pat<(i1 (X86Vextract VK32:$src, (iPTR 0))),
- (COPY_TO_REGCLASS VK32:$src, VK1)>;
- def : Pat<(i1 (X86Vextract VK64:$src, (iPTR 0))),
- (COPY_TO_REGCLASS VK64:$src, VK1)>;
-}
+def : Pat<(i1 (X86Vextract VK64:$src, (iPTR 0))), (COPY_TO_REGCLASS VK64:$src, VK1)>;
+def : Pat<(i1 (X86Vextract VK32:$src, (iPTR 0))), (COPY_TO_REGCLASS VK32:$src, VK1)>;
+def : Pat<(i1 (X86Vextract VK16:$src, (iPTR 0))), (COPY_TO_REGCLASS VK16:$src, VK1)>;
+def : Pat<(i1 (X86Vextract VK8:$src, (iPTR 0))), (COPY_TO_REGCLASS VK8:$src, VK1)>;
+def : Pat<(i1 (X86Vextract VK4:$src, (iPTR 0))), (COPY_TO_REGCLASS VK4:$src, VK1)>;
+def : Pat<(i1 (X86Vextract VK2:$src, (iPTR 0))), (COPY_TO_REGCLASS VK2:$src, VK1)>;
// Mask unary operation
// - KNOT
; KNL-NEXT: kshiftlw $1, %k0, %k1
; KNL-NEXT: kshiftrw $15, %k1, %k1
; KNL-NEXT: kmovw %k1, %r14d
-; KNL-NEXT: vptestmd %zmm5, %zmm5, %k2
-; KNL-NEXT: kshiftlw $0, %k0, %k0
+; KNL-NEXT: vptestmd %zmm5, %zmm5, %k1
; KNL-NEXT: kshiftrw $15, %k0, %k0
; KNL-NEXT: vmovd %r15d, %xmm4
; KNL-NEXT: kmovw %k0, %r15d
-; KNL-NEXT: kshiftlw $14, %k2, %k0
+; KNL-NEXT: kshiftlw $14, %k1, %k0
; KNL-NEXT: kshiftrw $15, %k0, %k0
; KNL-NEXT: vpinsrb $1, %ecx, %xmm4, %xmm4
; KNL-NEXT: kmovw %k0, %ecx
-; KNL-NEXT: kshiftlw $15, %k2, %k0
+; KNL-NEXT: kshiftlw $15, %k1, %k0
; KNL-NEXT: kshiftrw $15, %k0, %k0
; KNL-NEXT: vpinsrb $2, %r12d, %xmm4, %xmm4
; KNL-NEXT: kmovw %k0, %eax
-; KNL-NEXT: kshiftlw $13, %k2, %k0
+; KNL-NEXT: kshiftlw $13, %k1, %k0
; KNL-NEXT: kshiftrw $15, %k0, %k0
; KNL-NEXT: vpinsrb $3, %edx, %xmm4, %xmm4
; KNL-NEXT: kmovw %k0, %r12d
-; KNL-NEXT: kshiftlw $12, %k2, %k0
+; KNL-NEXT: kshiftlw $12, %k1, %k0
; KNL-NEXT: kshiftrw $15, %k0, %k0
; KNL-NEXT: vpinsrb $4, %r13d, %xmm4, %xmm4
; KNL-NEXT: kmovw %k0, %edx
-; KNL-NEXT: kshiftlw $11, %k2, %k0
+; KNL-NEXT: kshiftlw $11, %k1, %k0
; KNL-NEXT: kshiftrw $15, %k0, %k0
; KNL-NEXT: vpinsrb $5, -{{[0-9]+}}(%rsp), %xmm4, %xmm4 ## 4-byte Folded Reload
; KNL-NEXT: kmovw %k0, %r13d
-; KNL-NEXT: kshiftlw $10, %k2, %k0
+; KNL-NEXT: kshiftlw $10, %k1, %k0
; KNL-NEXT: kshiftrw $15, %k0, %k0
; KNL-NEXT: vpinsrb $6, %esi, %xmm4, %xmm4
; KNL-NEXT: kmovw %k0, %esi
; KNL-NEXT: movl %esi, -{{[0-9]+}}(%rsp) ## 4-byte Spill
-; KNL-NEXT: kshiftlw $9, %k2, %k0
+; KNL-NEXT: kshiftlw $9, %k1, %k0
; KNL-NEXT: kshiftrw $15, %k0, %k0
; KNL-NEXT: vpinsrb $7, %edi, %xmm4, %xmm4
; KNL-NEXT: kmovw %k0, %esi
-; KNL-NEXT: kshiftlw $8, %k2, %k0
+; KNL-NEXT: kshiftlw $8, %k1, %k0
; KNL-NEXT: kshiftrw $15, %k0, %k0
; KNL-NEXT: vpinsrb $8, %r8d, %xmm4, %xmm4
; KNL-NEXT: kmovw %k0, %edi
-; KNL-NEXT: kshiftlw $7, %k2, %k0
+; KNL-NEXT: kshiftlw $7, %k1, %k0
; KNL-NEXT: kshiftrw $15, %k0, %k0
; KNL-NEXT: vpinsrb $9, %r9d, %xmm4, %xmm4
; KNL-NEXT: kmovw %k0, %r8d
-; KNL-NEXT: kshiftlw $6, %k2, %k0
+; KNL-NEXT: kshiftlw $6, %k1, %k0
; KNL-NEXT: kshiftrw $15, %k0, %k0
; KNL-NEXT: vpinsrb $10, %r10d, %xmm4, %xmm4
; KNL-NEXT: kmovw %k0, %r9d
-; KNL-NEXT: kshiftlw $5, %k2, %k0
+; KNL-NEXT: kshiftlw $5, %k1, %k0
; KNL-NEXT: kshiftrw $15, %k0, %k0
; KNL-NEXT: vpinsrb $11, %r11d, %xmm4, %xmm4
; KNL-NEXT: kmovw %k0, %r10d
-; KNL-NEXT: kshiftlw $4, %k2, %k0
+; KNL-NEXT: kshiftlw $4, %k1, %k0
; KNL-NEXT: kshiftrw $15, %k0, %k0
; KNL-NEXT: vpinsrb $12, %ebx, %xmm4, %xmm4
; KNL-NEXT: kmovw %k0, %ebx
-; KNL-NEXT: kshiftlw $3, %k2, %k0
+; KNL-NEXT: kshiftlw $3, %k1, %k0
; KNL-NEXT: kshiftrw $15, %k0, %k0
; KNL-NEXT: vpinsrb $13, %ebp, %xmm4, %xmm4
; KNL-NEXT: kmovw %k0, %ebp
-; KNL-NEXT: kshiftlw $2, %k2, %k0
-; KNL-NEXT: kshiftrw $15, %k0, %k0
-; KNL-NEXT: vpinsrb $14, %r14d, %xmm4, %xmm4
-; KNL-NEXT: kmovw %k0, %r11d
-; KNL-NEXT: kshiftlw $1, %k2, %k0
-; KNL-NEXT: kshiftrw $15, %k0, %k0
-; KNL-NEXT: vpinsrb $15, %r15d, %xmm4, %xmm4
-; KNL-NEXT: kmovw %k0, %r14d
-; KNL-NEXT: vptestmd %zmm6, %zmm6, %k1
-; KNL-NEXT: kshiftlw $0, %k2, %k0
-; KNL-NEXT: kshiftrw $15, %k0, %k0
-; KNL-NEXT: vmovd %eax, %xmm5
-; KNL-NEXT: kmovw %k0, %r15d
-; KNL-NEXT: kshiftlw $14, %k1, %k0
-; KNL-NEXT: kshiftrw $15, %k0, %k0
-; KNL-NEXT: vpinsrb $1, %ecx, %xmm5, %xmm5
-; KNL-NEXT: kmovw %k0, %ecx
-; KNL-NEXT: kshiftlw $15, %k1, %k0
-; KNL-NEXT: kshiftrw $15, %k0, %k0
-; KNL-NEXT: vpinsrb $2, %r12d, %xmm5, %xmm5
-; KNL-NEXT: kmovw %k0, %eax
-; KNL-NEXT: kshiftlw $13, %k1, %k0
-; KNL-NEXT: kshiftrw $15, %k0, %k0
-; KNL-NEXT: vpinsrb $3, %edx, %xmm5, %xmm5
-; KNL-NEXT: kmovw %k0, %r12d
-; KNL-NEXT: kshiftlw $12, %k1, %k0
-; KNL-NEXT: kshiftrw $15, %k0, %k0
-; KNL-NEXT: vpinsrb $4, %r13d, %xmm5, %xmm5
-; KNL-NEXT: kmovw %k0, %edx
-; KNL-NEXT: kshiftlw $11, %k1, %k0
-; KNL-NEXT: kshiftrw $15, %k0, %k0
-; KNL-NEXT: vpinsrb $5, -{{[0-9]+}}(%rsp), %xmm5, %xmm5 ## 4-byte Folded Reload
-; KNL-NEXT: kmovw %k0, %r13d
-; KNL-NEXT: kshiftlw $10, %k1, %k0
-; KNL-NEXT: kshiftrw $15, %k0, %k0
-; KNL-NEXT: vpinsrb $6, %esi, %xmm5, %xmm5
-; KNL-NEXT: kmovw %k0, %esi
-; KNL-NEXT: movl %esi, -{{[0-9]+}}(%rsp) ## 4-byte Spill
-; KNL-NEXT: kshiftlw $9, %k1, %k0
-; KNL-NEXT: kshiftrw $15, %k0, %k0
-; KNL-NEXT: vpinsrb $7, %edi, %xmm5, %xmm5
-; KNL-NEXT: kmovw %k0, %esi
-; KNL-NEXT: kshiftlw $8, %k1, %k0
-; KNL-NEXT: kshiftrw $15, %k0, %k0
-; KNL-NEXT: vpinsrb $8, %r8d, %xmm5, %xmm5
-; KNL-NEXT: kmovw %k0, %edi
-; KNL-NEXT: kshiftlw $7, %k1, %k0
-; KNL-NEXT: kshiftrw $15, %k0, %k0
-; KNL-NEXT: vpinsrb $9, %r9d, %xmm5, %xmm5
-; KNL-NEXT: kmovw %k0, %r8d
-; KNL-NEXT: kshiftlw $6, %k1, %k0
-; KNL-NEXT: kshiftrw $15, %k0, %k0
-; KNL-NEXT: vpinsrb $10, %r10d, %xmm5, %xmm5
-; KNL-NEXT: kmovw %k0, %r9d
-; KNL-NEXT: kshiftlw $5, %k1, %k0
-; KNL-NEXT: kshiftrw $15, %k0, %k0
-; KNL-NEXT: vpinsrb $11, %ebx, %xmm5, %xmm5
-; KNL-NEXT: kmovw %k0, %ebx
-; KNL-NEXT: kshiftlw $4, %k1, %k0
-; KNL-NEXT: kshiftrw $15, %k0, %k0
-; KNL-NEXT: vpinsrb $12, %ebp, %xmm5, %xmm5
-; KNL-NEXT: kmovw %k0, %ebp
-; KNL-NEXT: kshiftlw $3, %k1, %k0
-; KNL-NEXT: kshiftrw $15, %k0, %k0
-; KNL-NEXT: vpinsrb $13, %r11d, %xmm5, %xmm5
-; KNL-NEXT: kmovw %k0, %r10d
; KNL-NEXT: kshiftlw $2, %k1, %k0
; KNL-NEXT: kshiftrw $15, %k0, %k0
-; KNL-NEXT: vpinsrb $14, %r14d, %xmm5, %xmm5
+; KNL-NEXT: vpinsrb $14, %r14d, %xmm4, %xmm4
; KNL-NEXT: kmovw %k0, %r11d
; KNL-NEXT: kshiftlw $1, %k1, %k0
; KNL-NEXT: kshiftrw $15, %k0, %k0
-; KNL-NEXT: vpinsrb $15, %r15d, %xmm5, %xmm5
+; KNL-NEXT: vpinsrb $15, %r15d, %xmm4, %xmm4
; KNL-NEXT: kmovw %k0, %r14d
-; KNL-NEXT: vptestmd %zmm7, %zmm7, %k0
-; KNL-NEXT: kshiftlw $0, %k1, %k1
+; KNL-NEXT: vptestmd %zmm6, %zmm6, %k0
; KNL-NEXT: kshiftrw $15, %k1, %k1
-; KNL-NEXT: vmovd %eax, %xmm6
+; KNL-NEXT: vmovd %eax, %xmm5
; KNL-NEXT: kmovw %k1, %r15d
; KNL-NEXT: kshiftlw $14, %k0, %k1
; KNL-NEXT: kshiftrw $15, %k1, %k1
-; KNL-NEXT: vpinsrb $1, %ecx, %xmm6, %xmm6
+; KNL-NEXT: vpinsrb $1, %ecx, %xmm5, %xmm5
; KNL-NEXT: kmovw %k1, %ecx
; KNL-NEXT: kshiftlw $15, %k0, %k1
; KNL-NEXT: kshiftrw $15, %k1, %k1
-; KNL-NEXT: vpinsrb $2, %r12d, %xmm6, %xmm6
-; KNL-NEXT: kmovw %k1, %r12d
+; KNL-NEXT: vpinsrb $2, %r12d, %xmm5, %xmm5
+; KNL-NEXT: kmovw %k1, %eax
; KNL-NEXT: kshiftlw $13, %k0, %k1
; KNL-NEXT: kshiftrw $15, %k1, %k1
-; KNL-NEXT: vpinsrb $3, %edx, %xmm6, %xmm6
-; KNL-NEXT: kmovw %k1, %edx
+; KNL-NEXT: vpinsrb $3, %edx, %xmm5, %xmm5
+; KNL-NEXT: kmovw %k1, %r12d
; KNL-NEXT: kshiftlw $12, %k0, %k1
; KNL-NEXT: kshiftrw $15, %k1, %k1
-; KNL-NEXT: vpinsrb $4, %r13d, %xmm6, %xmm6
-; KNL-NEXT: kmovw %k1, %r13d
+; KNL-NEXT: vpinsrb $4, %r13d, %xmm5, %xmm5
+; KNL-NEXT: kmovw %k1, %edx
; KNL-NEXT: kshiftlw $11, %k0, %k1
; KNL-NEXT: kshiftrw $15, %k1, %k1
-; KNL-NEXT: vpinsrb $5, -{{[0-9]+}}(%rsp), %xmm6, %xmm6 ## 4-byte Folded Reload
-; KNL-NEXT: kmovw %k1, %eax
+; KNL-NEXT: vpinsrb $5, -{{[0-9]+}}(%rsp), %xmm5, %xmm5 ## 4-byte Folded Reload
+; KNL-NEXT: kmovw %k1, %r13d
; KNL-NEXT: kshiftlw $10, %k0, %k1
; KNL-NEXT: kshiftrw $15, %k1, %k1
-; KNL-NEXT: vpinsrb $6, %esi, %xmm6, %xmm6
+; KNL-NEXT: vpinsrb $6, %esi, %xmm5, %xmm5
; KNL-NEXT: kmovw %k1, %esi
+; KNL-NEXT: movl %esi, -{{[0-9]+}}(%rsp) ## 4-byte Spill
; KNL-NEXT: kshiftlw $9, %k0, %k1
; KNL-NEXT: kshiftrw $15, %k1, %k1
-; KNL-NEXT: vpinsrb $7, %edi, %xmm6, %xmm6
-; KNL-NEXT: kmovw %k1, %edi
+; KNL-NEXT: vpinsrb $7, %edi, %xmm5, %xmm5
+; KNL-NEXT: kmovw %k1, %esi
; KNL-NEXT: kshiftlw $8, %k0, %k1
; KNL-NEXT: kshiftrw $15, %k1, %k1
-; KNL-NEXT: vpinsrb $8, %r8d, %xmm6, %xmm6
-; KNL-NEXT: kmovw %k1, %r8d
+; KNL-NEXT: vpinsrb $8, %r8d, %xmm5, %xmm5
+; KNL-NEXT: kmovw %k1, %edi
; KNL-NEXT: kshiftlw $7, %k0, %k1
; KNL-NEXT: kshiftrw $15, %k1, %k1
-; KNL-NEXT: vpinsrb $9, %r9d, %xmm6, %xmm6
-; KNL-NEXT: kmovw %k1, %r9d
+; KNL-NEXT: vpinsrb $9, %r9d, %xmm5, %xmm5
+; KNL-NEXT: kmovw %k1, %r8d
; KNL-NEXT: kshiftlw $6, %k0, %k1
; KNL-NEXT: kshiftrw $15, %k1, %k1
-; KNL-NEXT: vpinsrb $10, %ebx, %xmm6, %xmm6
-; KNL-NEXT: kmovw %k1, %ebx
+; KNL-NEXT: vpinsrb $10, %r10d, %xmm5, %xmm5
+; KNL-NEXT: kmovw %k1, %r9d
; KNL-NEXT: kshiftlw $5, %k0, %k1
; KNL-NEXT: kshiftrw $15, %k1, %k1
-; KNL-NEXT: vpinsrb $11, %ebp, %xmm6, %xmm6
-; KNL-NEXT: kmovw %k1, %ebp
+; KNL-NEXT: vpinsrb $11, %ebx, %xmm5, %xmm5
+; KNL-NEXT: kmovw %k1, %ebx
; KNL-NEXT: kshiftlw $4, %k0, %k1
; KNL-NEXT: kshiftrw $15, %k1, %k1
-; KNL-NEXT: vpinsrb $12, %r10d, %xmm6, %xmm6
-; KNL-NEXT: kmovw %k1, %r10d
+; KNL-NEXT: vpinsrb $12, %ebp, %xmm5, %xmm5
+; KNL-NEXT: kmovw %k1, %ebp
; KNL-NEXT: kshiftlw $3, %k0, %k1
; KNL-NEXT: kshiftrw $15, %k1, %k1
-; KNL-NEXT: vpinsrb $13, %r11d, %xmm6, %xmm6
-; KNL-NEXT: kmovw %k1, %r11d
+; KNL-NEXT: vpinsrb $13, %r11d, %xmm5, %xmm5
+; KNL-NEXT: kmovw %k1, %r10d
; KNL-NEXT: kshiftlw $2, %k0, %k1
; KNL-NEXT: kshiftrw $15, %k1, %k1
-; KNL-NEXT: vpinsrb $14, %r14d, %xmm6, %xmm6
-; KNL-NEXT: kmovw %k1, %r14d
+; KNL-NEXT: vpinsrb $14, %r14d, %xmm5, %xmm5
+; KNL-NEXT: kmovw %k1, %r11d
; KNL-NEXT: kshiftlw $1, %k0, %k1
; KNL-NEXT: kshiftrw $15, %k1, %k1
-; KNL-NEXT: vpinsrb $15, %r15d, %xmm6, %xmm6
-; KNL-NEXT: kmovw %k1, %r15d
-; KNL-NEXT: kshiftlw $0, %k0, %k0
+; KNL-NEXT: vpinsrb $15, %r15d, %xmm5, %xmm5
+; KNL-NEXT: kmovw %k1, %r14d
+; KNL-NEXT: vptestmd %zmm7, %zmm7, %k1
+; KNL-NEXT: kshiftrw $15, %k0, %k0
+; KNL-NEXT: vmovd %eax, %xmm6
+; KNL-NEXT: kmovw %k0, %r15d
+; KNL-NEXT: kshiftlw $14, %k1, %k0
+; KNL-NEXT: kshiftrw $15, %k0, %k0
+; KNL-NEXT: vpinsrb $1, %ecx, %xmm6, %xmm6
+; KNL-NEXT: kmovw %k0, %ecx
+; KNL-NEXT: kshiftlw $15, %k1, %k0
+; KNL-NEXT: kshiftrw $15, %k0, %k0
+; KNL-NEXT: vpinsrb $2, %r12d, %xmm6, %xmm6
+; KNL-NEXT: kmovw %k0, %r12d
+; KNL-NEXT: kshiftlw $13, %k1, %k0
; KNL-NEXT: kshiftrw $15, %k0, %k0
+; KNL-NEXT: vpinsrb $3, %edx, %xmm6, %xmm6
+; KNL-NEXT: kmovw %k0, %edx
+; KNL-NEXT: kshiftlw $12, %k1, %k0
+; KNL-NEXT: kshiftrw $15, %k0, %k0
+; KNL-NEXT: vpinsrb $4, %r13d, %xmm6, %xmm6
+; KNL-NEXT: kmovw %k0, %r13d
+; KNL-NEXT: kshiftlw $11, %k1, %k0
+; KNL-NEXT: kshiftrw $15, %k0, %k0
+; KNL-NEXT: vpinsrb $5, -{{[0-9]+}}(%rsp), %xmm6, %xmm6 ## 4-byte Folded Reload
+; KNL-NEXT: kmovw %k0, %eax
+; KNL-NEXT: kshiftlw $10, %k1, %k0
+; KNL-NEXT: kshiftrw $15, %k0, %k0
+; KNL-NEXT: vpinsrb $6, %esi, %xmm6, %xmm6
+; KNL-NEXT: kmovw %k0, %esi
+; KNL-NEXT: kshiftlw $9, %k1, %k0
+; KNL-NEXT: kshiftrw $15, %k0, %k0
+; KNL-NEXT: vpinsrb $7, %edi, %xmm6, %xmm6
+; KNL-NEXT: kmovw %k0, %edi
+; KNL-NEXT: kshiftlw $8, %k1, %k0
+; KNL-NEXT: kshiftrw $15, %k0, %k0
+; KNL-NEXT: vpinsrb $8, %r8d, %xmm6, %xmm6
+; KNL-NEXT: kmovw %k0, %r8d
+; KNL-NEXT: kshiftlw $7, %k1, %k0
+; KNL-NEXT: kshiftrw $15, %k0, %k0
+; KNL-NEXT: vpinsrb $9, %r9d, %xmm6, %xmm6
+; KNL-NEXT: kmovw %k0, %r9d
+; KNL-NEXT: kshiftlw $6, %k1, %k0
+; KNL-NEXT: kshiftrw $15, %k0, %k0
+; KNL-NEXT: vpinsrb $10, %ebx, %xmm6, %xmm6
+; KNL-NEXT: kmovw %k0, %ebx
+; KNL-NEXT: kshiftlw $5, %k1, %k0
+; KNL-NEXT: kshiftrw $15, %k0, %k0
+; KNL-NEXT: vpinsrb $11, %ebp, %xmm6, %xmm6
+; KNL-NEXT: kmovw %k0, %ebp
+; KNL-NEXT: kshiftlw $4, %k1, %k0
+; KNL-NEXT: kshiftrw $15, %k0, %k0
+; KNL-NEXT: vpinsrb $12, %r10d, %xmm6, %xmm6
+; KNL-NEXT: kmovw %k0, %r10d
+; KNL-NEXT: kshiftlw $3, %k1, %k0
+; KNL-NEXT: kshiftrw $15, %k0, %k0
+; KNL-NEXT: vpinsrb $13, %r11d, %xmm6, %xmm6
+; KNL-NEXT: kmovw %k0, %r11d
+; KNL-NEXT: kshiftlw $2, %k1, %k0
+; KNL-NEXT: kshiftrw $15, %k0, %k0
+; KNL-NEXT: vpinsrb $14, %r14d, %xmm6, %xmm6
+; KNL-NEXT: kmovw %k0, %r14d
+; KNL-NEXT: kshiftlw $1, %k1, %k0
+; KNL-NEXT: kshiftrw $15, %k0, %k0
+; KNL-NEXT: vpinsrb $15, %r15d, %xmm6, %xmm6
+; KNL-NEXT: kmovw %k0, %r15d
+; KNL-NEXT: kshiftrw $15, %k1, %k0
; KNL-NEXT: vmovd %r12d, %xmm7
; KNL-NEXT: kmovw %k0, %r12d
; KNL-NEXT: vpinsrb $1, %ecx, %xmm7, %xmm7
%r = insertelement <32 x i8> %x, i8 %y, i32 20
ret <32 x i8> %r
}
+
+define zeroext i8 @test_extractelement_v2i1(<2 x i64> %a, <2 x i64> %b) {
+; KNL-LABEL: test_extractelement_v2i1:
+; KNL: ## BB#0:
+; KNL-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
+; KNL-NEXT: vpxor %xmm2, %xmm1, %xmm1
+; KNL-NEXT: vpxor %xmm2, %xmm0, %xmm0
+; KNL-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0
+; KNL-NEXT: vmovq %xmm0, %rax
+; KNL-NEXT: testb $1, %al
+; KNL-NEXT: sete %al
+; KNL-NEXT: addb $3, %al
+; KNL-NEXT: movzbl %al, %eax
+; KNL-NEXT: retq
+;
+; SKX-LABEL: test_extractelement_v2i1:
+; SKX: ## BB#0:
+; SKX-NEXT: vpcmpnleuq %xmm1, %xmm0, %k0
+; SKX-NEXT: kshiftlw $15, %k0, %k0
+; SKX-NEXT: kshiftrw $15, %k0, %k0
+; SKX-NEXT: kmovw %k0, %eax
+; SKX-NEXT: testb %al, %al
+; SKX-NEXT: sete %al
+; SKX-NEXT: addb $3, %al
+; SKX-NEXT: movzbl %al, %eax
+; SKX-NEXT: retq
+ %t1 = icmp ugt <2 x i64> %a, %b
+ %t2 = extractelement <2 x i1> %t1, i32 0
+ %res = select i1 %t2, i8 3, i8 4
+ ret i8 %res
+}
+
+define zeroext i8 @test_extractelement_v4i1(<4 x i32> %a, <4 x i32> %b) {
+; KNL-LABEL: test_extractelement_v4i1:
+; KNL: ## BB#0:
+; KNL-NEXT: vpbroadcastd {{.*}}(%rip), %xmm2
+; KNL-NEXT: vpxor %xmm2, %xmm1, %xmm1
+; KNL-NEXT: vpxor %xmm2, %xmm0, %xmm0
+; KNL-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0
+; KNL-NEXT: vpextrd $3, %xmm0, %eax
+; KNL-NEXT: andl $1, %eax
+; KNL-NEXT: retq
+;
+; SKX-LABEL: test_extractelement_v4i1:
+; SKX: ## BB#0:
+; SKX-NEXT: vpcmpnleud %xmm1, %xmm0, %k0
+; SKX-NEXT: kshiftlw $12, %k0, %k0
+; SKX-NEXT: kshiftrw $15, %k0, %k0
+; SKX-NEXT: kmovw %k0, %eax
+; SKX-NEXT: retq
+ %t1 = icmp ugt <4 x i32> %a, %b
+ %t2 = extractelement <4 x i1> %t1, i32 3
+ %res = zext i1 %t2 to i8
+ ret i8 %res
+}
+
+define zeroext i8 @test_extractelement_v32i1(<32 x i8> %a, <32 x i8> %b) {
+; KNL-LABEL: test_extractelement_v32i1:
+; KNL: ## BB#0:
+; KNL-NEXT: vmovdqa {{.*#+}} ymm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
+; KNL-NEXT: vpxor %ymm2, %ymm1, %ymm1
+; KNL-NEXT: vpxor %ymm2, %ymm0, %ymm0
+; KNL-NEXT: vpcmpgtb %ymm1, %ymm0, %ymm0
+; KNL-NEXT: vpextrb $2, %xmm0, %eax
+; KNL-NEXT: andl $1, %eax
+; KNL-NEXT: retq
+;
+; SKX-LABEL: test_extractelement_v32i1:
+; SKX: ## BB#0:
+; SKX-NEXT: vpcmpnleub %ymm1, %ymm0, %k0
+; SKX-NEXT: kshiftld $29, %k0, %k0
+; SKX-NEXT: kshiftrd $31, %k0, %k0
+; SKX-NEXT: kmovw %k0, %eax
+; SKX-NEXT: retq
+ %t1 = icmp ugt <32 x i8> %a, %b
+ %t2 = extractelement <32 x i1> %t1, i32 2
+ %res = zext i1 %t2 to i8
+ ret i8 %res
+}
+
+define zeroext i8 @test_extractelement_v64i1(<64 x i8> %a, <64 x i8> %b) {
+; KNL-LABEL: test_extractelement_v64i1:
+; KNL: ## BB#0:
+; KNL-NEXT: vmovdqa {{.*#+}} ymm0 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
+; KNL-NEXT: vpxor %ymm0, %ymm3, %ymm2
+; KNL-NEXT: vpxor %ymm0, %ymm1, %ymm0
+; KNL-NEXT: vpcmpgtb %ymm2, %ymm0, %ymm0
+; KNL-NEXT: vextracti128 $1, %ymm0, %xmm0
+; KNL-NEXT: vpextrb $15, %xmm0, %eax
+; KNL-NEXT: testb $1, %al
+; KNL-NEXT: sete %al
+; KNL-NEXT: addb $3, %al
+; KNL-NEXT: movzbl %al, %eax
+; KNL-NEXT: retq
+;
+; SKX-LABEL: test_extractelement_v64i1:
+; SKX: ## BB#0:
+; SKX-NEXT: vpcmpnleub %zmm1, %zmm0, %k0
+; SKX-NEXT: kshiftrq $63, %k0, %k0
+; SKX-NEXT: kmovw %k0, %eax
+; SKX-NEXT: testb %al, %al
+; SKX-NEXT: sete %al
+; SKX-NEXT: addb $3, %al
+; SKX-NEXT: movzbl %al, %eax
+; SKX-NEXT: retq
+ %t1 = icmp ugt <64 x i8> %a, %b
+ %t2 = extractelement <64 x i1> %t1, i32 63
+ %res = select i1 %t2, i8 3, i8 4
+ ret i8 %res
+}
; KNL-NEXT: kshiftrw $15, %k0, %k0
; KNL-NEXT: kmovw %k0, %eax
; KNL-NEXT: vpinsrb $14, %eax, %xmm3, %xmm3
-; KNL-NEXT: kshiftlw $0, %k1, %k0
-; KNL-NEXT: kshiftrw $15, %k0, %k0
+; KNL-NEXT: kshiftrw $15, %k1, %k0
; KNL-NEXT: kmovw %k0, %eax
; KNL-NEXT: vpinsrb $15, %eax, %xmm3, %xmm3
; KNL-NEXT: vcmpltps %zmm0, %zmm2, %k2
; KNL-NEXT: kshiftrw $15, %k0, %k0
; KNL-NEXT: kmovw %k0, %eax
; KNL-NEXT: vpinsrb $14, %eax, %xmm2, %xmm2
-; KNL-NEXT: kshiftlw $0, %k2, %k0
-; KNL-NEXT: kshiftrw $15, %k0, %k0
+; KNL-NEXT: kshiftrw $15, %k2, %k0
; KNL-NEXT: kmovw %k0, %eax
; KNL-NEXT: vpinsrb $15, %eax, %xmm2, %xmm2
; KNL-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
; KNL-NEXT: kshiftrw $15, %k1, %k1
; KNL-NEXT: kmovw %k1, %eax
; KNL-NEXT: vpinsrb $14, %eax, %xmm4, %xmm4
-; KNL-NEXT: kshiftlw $0, %k0, %k0
; KNL-NEXT: kshiftrw $15, %k0, %k0
; KNL-NEXT: kmovw %k0, %eax
; KNL-NEXT: vpinsrb $15, %eax, %xmm4, %xmm4
; KNL-NEXT: kshiftrw $15, %k1, %k1
; KNL-NEXT: kmovw %k1, %eax
; KNL-NEXT: vpinsrb $14, %eax, %xmm3, %xmm3
-; KNL-NEXT: kshiftlw $0, %k0, %k0
; KNL-NEXT: kshiftrw $15, %k0, %k0
; KNL-NEXT: kmovw %k0, %eax
; KNL-NEXT: vpinsrb $15, %eax, %xmm3, %xmm3
; KNL-NEXT: vmovd %r9d, %xmm3
; KNL-NEXT: kmovw %k1, %r9d
; KNL-NEXT: vptestmd %zmm2, %zmm2, %k2
-; KNL-NEXT: kshiftlw $0, %k0, %k0
; KNL-NEXT: kshiftrw $15, %k0, %k0
; KNL-NEXT: vpinsrb $1, %r8d, %xmm3, %xmm2
; KNL-NEXT: vpinsrb $2, %r10d, %xmm2, %xmm2
; KNL-NEXT: vmovd %r10d, %xmm2
; KNL-NEXT: kmovw %k0, %r10d
; KNL-NEXT: vptestmd %zmm1, %zmm1, %k1
-; KNL-NEXT: kshiftlw $0, %k2, %k0
-; KNL-NEXT: kshiftrw $15, %k0, %k0
+; KNL-NEXT: kshiftrw $15, %k2, %k0
; KNL-NEXT: vpinsrb $1, %r8d, %xmm2, %xmm1
; KNL-NEXT: vpinsrb $2, %r9d, %xmm1, %xmm1
; KNL-NEXT: vpinsrb $3, %r11d, %xmm1, %xmm1
; KNL-NEXT: vmovd %r10d, %xmm1
; KNL-NEXT: kmovw %k0, %r10d
; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0
-; KNL-NEXT: kshiftlw $0, %k1, %k1
; KNL-NEXT: kshiftrw $15, %k1, %k1
; KNL-NEXT: vpinsrb $1, %r8d, %xmm1, %xmm0
; KNL-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
; KNL-NEXT: vpinsrb $10, %ebp, %xmm0, %xmm0
; KNL-NEXT: vpinsrb $11, %ebx, %xmm0, %xmm0
; KNL-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0
-; KNL-NEXT: kshiftlw $0, %k0, %k0
; KNL-NEXT: kshiftrw $15, %k0, %k0
; KNL-NEXT: vpinsrb $13, %ecx, %xmm0, %xmm0
; KNL-NEXT: vpinsrb $14, %r9d, %xmm0, %xmm0
; KNL-NEXT: kshiftrw $15, %k1, %k1
; KNL-NEXT: kmovw %k1, %eax
; KNL-NEXT: vpinsrb $14, %eax, %xmm1, %xmm1
-; KNL-NEXT: kshiftlw $0, %k0, %k0
; KNL-NEXT: kshiftrw $15, %k0, %k0
; KNL-NEXT: kmovw %k0, %eax
; KNL-NEXT: vpinsrb $15, %eax, %xmm1, %xmm1
; KNL-NEXT: kshiftrw $15, %k1, %k1
; KNL-NEXT: kmovw %k1, %eax
; KNL-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0
-; KNL-NEXT: kshiftlw $0, %k0, %k0
; KNL-NEXT: kshiftrw $15, %k0, %k0
; KNL-NEXT: kmovw %k0, %eax
; KNL-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
; KNL-NEXT: kshiftrw $15, %k1, %k1
; KNL-NEXT: kmovw %k1, %eax
; KNL-NEXT: vpinsrb $14, %eax, %xmm1, %xmm1
-; KNL-NEXT: kshiftlw $0, %k0, %k0
; KNL-NEXT: kshiftrw $15, %k0, %k0
; KNL-NEXT: kmovw %k0, %eax
; KNL-NEXT: vpinsrb $15, %eax, %xmm1, %xmm1
; KNL-NEXT: kshiftrw $15, %k1, %k1
; KNL-NEXT: kmovw %k1, %eax
; KNL-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0
-; KNL-NEXT: kshiftlw $0, %k0, %k0
; KNL-NEXT: kshiftrw $15, %k0, %k0
; KNL-NEXT: kmovw %k0, %eax
; KNL-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
; KNL-NEXT: kshiftrw $15, %k1, %k1
; KNL-NEXT: kmovw %k1, %eax
; KNL-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0
-; KNL-NEXT: kshiftlw $0, %k0, %k0
; KNL-NEXT: kshiftrw $15, %k0, %k0
; KNL-NEXT: kmovw %k0, %eax
; KNL-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
; KNL-NEXT: kshiftrw $15, %k1, %k1
; KNL-NEXT: kmovw %k1, %eax
; KNL-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0
-; KNL-NEXT: kshiftlw $0, %k0, %k0
; KNL-NEXT: kshiftrw $15, %k0, %k0
; KNL-NEXT: kmovw %k0, %eax
; KNL-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
; SKX: # BB#0:
; SKX-NEXT: vpslld $31, %xmm2, %xmm2
; SKX-NEXT: vptestmd %xmm2, %xmm2, %k1
-; SKX-NEXT: kmovb %k1, -{{[0-9]+}}(%rsp)
+; SKX-NEXT: kshiftlw $15, %k1, %k0
+; SKX-NEXT: kshiftrw $15, %k0, %k0
; SKX-NEXT: vpmovsxdq %xmm1, %ymm1
; SKX-NEXT: vpsllq $2, %ymm1, %ymm1
; SKX-NEXT: vpaddq %ymm1, %ymm0, %ymm1
-; SKX-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SKX-NEXT: kmovw %k0, %eax
; SKX-NEXT: # implicit-def: %XMM0
; SKX-NEXT: testb %al, %al
; SKX-NEXT: je .LBB29_2
; SKX-NEXT: vmovq %xmm1, %rax
; SKX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SKX-NEXT: .LBB29_2: # %else
-; SKX-NEXT: kmovb %k1, -{{[0-9]+}}(%rsp)
-; SKX-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SKX-NEXT: kshiftlw $14, %k1, %k0
+; SKX-NEXT: kshiftrw $15, %k0, %k0
+; SKX-NEXT: kmovw %k0, %eax
; SKX-NEXT: testb %al, %al
; SKX-NEXT: je .LBB29_4
; SKX-NEXT: # BB#3: # %cond.load1
; SKX-NEXT: vpextrq $1, %xmm1, %rax
; SKX-NEXT: vpinsrd $1, (%rax), %xmm0, %xmm0
; SKX-NEXT: .LBB29_4: # %else2
-; SKX-NEXT: kmovb %k1, -{{[0-9]+}}(%rsp)
-; SKX-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SKX-NEXT: kshiftlw $13, %k1, %k0
+; SKX-NEXT: kshiftrw $15, %k0, %k0
+; SKX-NEXT: kmovw %k0, %eax
; SKX-NEXT: testb %al, %al
; SKX-NEXT: je .LBB29_6
; SKX-NEXT: # BB#5: # %cond.load4
; SKX_32-NEXT: .cfi_def_cfa_offset 16
; SKX_32-NEXT: vpslld $31, %xmm2, %xmm2
; SKX_32-NEXT: vptestmd %xmm2, %xmm2, %k1
-; SKX_32-NEXT: kmovb %k1, {{[0-9]+}}(%esp)
+; SKX_32-NEXT: kshiftlw $15, %k1, %k0
+; SKX_32-NEXT: kshiftrw $15, %k0, %k0
; SKX_32-NEXT: vpslld $2, %xmm1, %xmm1
; SKX_32-NEXT: vpaddd %xmm1, %xmm0, %xmm1
-; SKX_32-NEXT: movb {{[0-9]+}}(%esp), %al
+; SKX_32-NEXT: kmovw %k0, %eax
; SKX_32-NEXT: # implicit-def: %XMM0
; SKX_32-NEXT: testb %al, %al
; SKX_32-NEXT: je .LBB29_2
; SKX_32-NEXT: vmovd %xmm1, %eax
; SKX_32-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SKX_32-NEXT: .LBB29_2: # %else
-; SKX_32-NEXT: kmovb %k1, {{[0-9]+}}(%esp)
-; SKX_32-NEXT: movb {{[0-9]+}}(%esp), %al
+; SKX_32-NEXT: kshiftlw $14, %k1, %k0
+; SKX_32-NEXT: kshiftrw $15, %k0, %k0
+; SKX_32-NEXT: kmovw %k0, %eax
; SKX_32-NEXT: testb %al, %al
; SKX_32-NEXT: je .LBB29_4
; SKX_32-NEXT: # BB#3: # %cond.load1
; SKX_32-NEXT: vpinsrd $1, (%eax), %xmm0, %xmm0
; SKX_32-NEXT: .LBB29_4: # %else2
; SKX_32-NEXT: vmovdqa32 {{[0-9]+}}(%esp), %xmm2
-; SKX_32-NEXT: kmovb %k1, (%esp)
-; SKX_32-NEXT: movb (%esp), %al
+; SKX_32-NEXT: kshiftlw $13, %k1, %k0
+; SKX_32-NEXT: kshiftrw $15, %k0, %k0
+; SKX_32-NEXT: kmovw %k0, %eax
; SKX_32-NEXT: testb %al, %al
; SKX_32-NEXT: je .LBB29_6
; SKX_32-NEXT: # BB#5: # %cond.load4
; AVX512F-NEXT: ## BB#29: ## %cond.load40
; AVX512F-NEXT: vpinsrb $14, 14(%rdi), %xmm0, %xmm0
; AVX512F-NEXT: LBB50_30: ## %else41
-; AVX512F-NEXT: kshiftlw $0, %k1, %k0
-; AVX512F-NEXT: kshiftrw $15, %k0, %k0
+; AVX512F-NEXT: kshiftrw $15, %k1, %k0
; AVX512F-NEXT: kmovw %k0, %eax
; AVX512F-NEXT: testb %al, %al
; AVX512F-NEXT: je LBB50_32
; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm0[4,5,6,7]
; AVX512F-NEXT: LBB52_30: ## %else41
; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k1
-; AVX512F-NEXT: kshiftlw $0, %k0, %k0
; AVX512F-NEXT: kshiftrw $15, %k0, %k0
; AVX512F-NEXT: kmovw %k0, -{{[0-9]+}}(%rsp) ## 2-byte Spill
; AVX512F-NEXT: kmovw %k0, %eax
; AVX512F-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
; AVX512F-NEXT: LBB52_62: ## %else89
; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k0
-; AVX512F-NEXT: kshiftlw $0, %k1, %k1
; AVX512F-NEXT: kshiftrw $15, %k1, %k1
; AVX512F-NEXT: kmovw %k1, -{{[0-9]+}}(%rsp) ## 2-byte Spill
; AVX512F-NEXT: kmovw %k1, %eax
; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7]
; AVX512F-NEXT: LBB52_94: ## %else137
; AVX512F-NEXT: vptestmd %zmm2, %zmm2, %k1
-; AVX512F-NEXT: kshiftlw $0, %k0, %k0
; AVX512F-NEXT: kshiftrw $15, %k0, %k0
; AVX512F-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ## 2-byte Spill
; AVX512F-NEXT: kmovw %k0, %eax
; AVX512F-NEXT: vpinsrb $14, 62(%rdi), %xmm2, %xmm2
; AVX512F-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
; AVX512F-NEXT: LBB52_126: ## %else185
-; AVX512F-NEXT: kshiftlw $0, %k1, %k1
; AVX512F-NEXT: kshiftrw $15, %k1, %k1
; AVX512F-NEXT: kmovw %k1, %eax
; AVX512F-NEXT: testb %al, %al
; AVX512F-NEXT: vpinsrw $6, 28(%rdi), %xmm1, %xmm1
; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX512F-NEXT: LBB54_30: ## %else41
-; AVX512F-NEXT: kshiftlw $0, %k1, %k0
-; AVX512F-NEXT: kshiftrw $15, %k0, %k0
+; AVX512F-NEXT: kshiftrw $15, %k1, %k0
; AVX512F-NEXT: kmovw %k0, %eax
; AVX512F-NEXT: testb %al, %al
; AVX512F-NEXT: je LBB54_32
; AVX512F-NEXT: ## BB#29: ## %cond.store27
; AVX512F-NEXT: vpextrb $14, %xmm1, 14(%rdi)
; AVX512F-NEXT: LBB56_30: ## %else28
-; AVX512F-NEXT: kshiftlw $0, %k0, %k0
; AVX512F-NEXT: kshiftrw $15, %k0, %k0
; AVX512F-NEXT: kmovw %k0, %eax
; AVX512F-NEXT: testb %al, %al
; AVX512F-NEXT: vpextrb $14, %xmm4, 14(%rdi)
; AVX512F-NEXT: LBB58_30: ## %else28
; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k1
-; AVX512F-NEXT: kshiftlw $0, %k0, %k0
; AVX512F-NEXT: kshiftrw $15, %k0, %k0
; AVX512F-NEXT: kmovw %k0, %eax
; AVX512F-NEXT: testb %al, %al
; AVX512F-NEXT: vpextrb $14, %xmm1, 30(%rdi)
; AVX512F-NEXT: LBB58_62: ## %else60
; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k0
-; AVX512F-NEXT: kshiftlw $0, %k1, %k1
; AVX512F-NEXT: kshiftrw $15, %k1, %k1
; AVX512F-NEXT: kmovw %k1, %eax
; AVX512F-NEXT: testb %al, %al
; AVX512F-NEXT: vpextrb $14, %xmm5, 46(%rdi)
; AVX512F-NEXT: LBB58_94: ## %else92
; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k1
-; AVX512F-NEXT: kshiftlw $0, %k0, %k0
; AVX512F-NEXT: kshiftrw $15, %k0, %k0
; AVX512F-NEXT: kmovw %k0, %eax
; AVX512F-NEXT: testb %al, %al
; AVX512F-NEXT: vextracti128 $1, %ymm5, %xmm0
; AVX512F-NEXT: vpextrb $14, %xmm0, 62(%rdi)
; AVX512F-NEXT: LBB58_126: ## %else124
-; AVX512F-NEXT: kshiftlw $0, %k1, %k0
-; AVX512F-NEXT: kshiftrw $15, %k0, %k0
+; AVX512F-NEXT: kshiftrw $15, %k1, %k0
; AVX512F-NEXT: kmovw %k0, %eax
; AVX512F-NEXT: testb %al, %al
; AVX512F-NEXT: je LBB58_128
; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm0
; AVX512F-NEXT: vpextrw $6, %xmm0, 28(%rdi)
; AVX512F-NEXT: LBB60_30: ## %else28
-; AVX512F-NEXT: kshiftlw $0, %k0, %k0
; AVX512F-NEXT: kshiftrw $15, %k0, %k0
; AVX512F-NEXT: kmovw %k0, %eax
; AVX512F-NEXT: testb %al, %al
; AVX512-NEXT: kshiftrw $15, %k1, %k1
; AVX512-NEXT: kmovw %k1, %eax
; AVX512-NEXT: vpinsrb $14, %eax, %xmm3, %xmm3
-; AVX512-NEXT: kshiftlw $0, %k0, %k0
; AVX512-NEXT: kshiftrw $15, %k0, %k0
; AVX512-NEXT: kmovw %k0, %eax
; AVX512-NEXT: vpinsrb $15, %eax, %xmm3, %xmm3
; AVX512-NEXT: kshiftrw $15, %k1, %k1
; AVX512-NEXT: kmovw %k1, %eax
; AVX512-NEXT: vpinsrb $14, %eax, %xmm2, %xmm2
-; AVX512-NEXT: kshiftlw $0, %k0, %k0
; AVX512-NEXT: kshiftrw $15, %k0, %k0
; AVX512-NEXT: kmovw %k0, %eax
; AVX512-NEXT: vpinsrb $15, %eax, %xmm2, %xmm2
; AVX512-NEXT: kshiftrw $15, %k1, %k1
; AVX512-NEXT: kmovw %k1, %eax
; AVX512-NEXT: vpinsrb $14, %eax, %xmm1, %xmm1
-; AVX512-NEXT: kshiftlw $0, %k0, %k0
; AVX512-NEXT: kshiftrw $15, %k0, %k0
; AVX512-NEXT: kmovw %k0, %eax
; AVX512-NEXT: vpinsrb $15, %eax, %xmm1, %xmm1
; AVX512-NEXT: kshiftrw $15, %k1, %k1
; AVX512-NEXT: kmovw %k1, %eax
; AVX512-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0
-; AVX512-NEXT: kshiftlw $0, %k0, %k0
; AVX512-NEXT: kshiftrw $15, %k0, %k0
; AVX512-NEXT: kmovw %k0, %eax
; AVX512-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0