From 93e1f73b6b2dc751a27568da5797cc2ba8436385 Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Wed, 18 Sep 2019 06:06:11 +0000 Subject: [PATCH] [X86] Break non-power of 2 vXi1 vectors into scalars for argument passing with avx512. This generates worse code, but matches what is done for avx2 and prevents crashes when more arguments are passed than we have registers for. llvm-svn: 372200 --- llvm/lib/Target/X86/X86ISelLowering.cpp | 25 +- llvm/test/CodeGen/X86/avx512-calling-conv.ll | 2288 +++++++++++++++++++----- llvm/test/CodeGen/X86/masked_gather_scatter.ll | 182 +- llvm/test/CodeGen/X86/masked_store.ll | 74 +- 4 files changed, 2030 insertions(+), 539 deletions(-) diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index f40e29b..7103b70 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -1996,14 +1996,16 @@ X86TargetLowering::getPreferredVectorAction(MVT VT) const { MVT X86TargetLowering::getRegisterTypeForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const { - // Break wide vXi1 vectors into scalars to match avx2 behavior. + // v32i1 vectors should be promoted to v32i8 to match avx2. + if (VT == MVT::v32i1 && Subtarget.hasAVX512() && !Subtarget.hasBWI()) + return MVT::v32i8; + // Break wide or odd vXi1 vectors into scalars to match avx2 behavior. if (VT.isVector() && VT.getVectorElementType() == MVT::i1 && Subtarget.hasAVX512() && - ((VT.getVectorNumElements() > 32 && !Subtarget.hasBWI()) || + (!isPowerOf2_32(VT.getVectorNumElements()) || + (VT.getVectorNumElements() > 16 && !Subtarget.hasBWI()) || (VT.getVectorNumElements() > 64 && Subtarget.hasBWI()))) return MVT::i8; - if (VT == MVT::v32i1 && Subtarget.hasAVX512() && !Subtarget.hasBWI()) - return MVT::v32i8; // FIXME: Should we just make these types legal and custom split operations? if ((VT == MVT::v32i16 || VT == MVT::v64i8) && Subtarget.hasAVX512() && !Subtarget.hasBWI() && !EnableOldKNLABI) @@ -2014,14 +2016,16 @@ MVT X86TargetLowering::getRegisterTypeForCallingConv(LLVMContext &Context, unsigned X86TargetLowering::getNumRegistersForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const { - // Break wide vXi1 vectors into scalars to match avx2 behavior. + // v32i1 vectors should be promoted to v32i8 to match avx2. + if (VT == MVT::v32i1 && Subtarget.hasAVX512() && !Subtarget.hasBWI()) + return 1; + // Break wide or odd vXi1 vectors into scalars to match avx2 behavior. if (VT.isVector() && VT.getVectorElementType() == MVT::i1 && Subtarget.hasAVX512() && - ((VT.getVectorNumElements() > 32 && !Subtarget.hasBWI()) || + (!isPowerOf2_32(VT.getVectorNumElements()) || + (VT.getVectorNumElements() > 16 && !Subtarget.hasBWI()) || (VT.getVectorNumElements() > 64 && Subtarget.hasBWI()))) return VT.getVectorNumElements(); - if (VT == MVT::v32i1 && Subtarget.hasAVX512() && !Subtarget.hasBWI()) - return 1; // FIXME: Should we just make these types legal and custom split operations? if ((VT == MVT::v32i16 || VT == MVT::v64i8) && Subtarget.hasAVX512() && !Subtarget.hasBWI() && !EnableOldKNLABI) @@ -2032,10 +2036,11 @@ unsigned X86TargetLowering::getNumRegistersForCallingConv(LLVMContext &Context, unsigned X86TargetLowering::getVectorTypeBreakdownForCallingConv( LLVMContext &Context, CallingConv::ID CC, EVT VT, EVT &IntermediateVT, unsigned &NumIntermediates, MVT &RegisterVT) const { - // Break wide vXi1 vectors into scalars to match avx2 behavior. + // Break wide or odd vXi1 vectors into scalars to match avx2 behavior. if (VT.isVector() && VT.getVectorElementType() == MVT::i1 && Subtarget.hasAVX512() && - ((VT.getVectorNumElements() > 32 && !Subtarget.hasBWI()) || + (!isPowerOf2_32(VT.getVectorNumElements()) || + (VT.getVectorNumElements() > 16 && !Subtarget.hasBWI()) || (VT.getVectorNumElements() > 64 && Subtarget.hasBWI()))) { RegisterVT = MVT::i8; IntermediateVT = MVT::i1; diff --git a/llvm/test/CodeGen/X86/avx512-calling-conv.ll b/llvm/test/CodeGen/X86/avx512-calling-conv.ll index 9a31202..8901cee 100644 --- a/llvm/test/CodeGen/X86/avx512-calling-conv.ll +++ b/llvm/test/CodeGen/X86/avx512-calling-conv.ll @@ -531,187 +531,220 @@ define <17 x i1> @test16(<17 x i1> %a, <17 x i1> %b) nounwind { ; KNL-NEXT: pushq %r12 ; KNL-NEXT: pushq %rbx ; KNL-NEXT: movq %rdi, %rax -; KNL-NEXT: movl {{[0-9]+}}(%rsp), %edi -; KNL-NEXT: kmovw %edi, %k0 -; KNL-NEXT: kshiftlw $1, %k0, %k0 -; KNL-NEXT: movl {{[0-9]+}}(%rsp), %edi -; KNL-NEXT: kmovw %edi, %k1 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dl +; KNL-NEXT: kmovw %edx, %k1 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dl +; KNL-NEXT: kmovw %edx, %k2 +; KNL-NEXT: kshiftlw $15, %k0, %k0 +; KNL-NEXT: kshiftrw $14, %k0, %k0 +; KNL-NEXT: kxorw %k0, %k2, %k2 +; KNL-NEXT: kshiftrw $2, %k2, %k3 +; KNL-NEXT: kxorw %k1, %k3, %k1 ; KNL-NEXT: kshiftlw $15, %k1, %k1 -; KNL-NEXT: kshiftrw $15, %k1, %k1 -; KNL-NEXT: korw %k0, %k1, %k0 -; KNL-NEXT: kshiftlw $2, %k0, %k0 -; KNL-NEXT: movl {{[0-9]+}}(%rsp), %edi -; KNL-NEXT: kmovw %edi, %k1 -; KNL-NEXT: kshiftlw $1, %k1, %k1 -; KNL-NEXT: movl {{[0-9]+}}(%rsp), %edi -; KNL-NEXT: kmovw %edi, %k2 +; KNL-NEXT: kshiftrw $13, %k1, %k1 +; KNL-NEXT: kxorw %k1, %k2, %k1 +; KNL-NEXT: kshiftrw $3, %k1, %k2 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dl +; KNL-NEXT: kmovw %edx, %k3 +; KNL-NEXT: kxorw %k3, %k2, %k2 ; KNL-NEXT: kshiftlw $15, %k2, %k2 -; KNL-NEXT: kshiftrw $15, %k2, %k2 -; KNL-NEXT: korw %k1, %k2, %k1 -; KNL-NEXT: kshiftlw $14, %k1, %k1 -; KNL-NEXT: kshiftrw $14, %k1, %k1 -; KNL-NEXT: korw %k0, %k1, %k0 -; KNL-NEXT: kshiftlw $4, %k0, %k0 -; KNL-NEXT: movl {{[0-9]+}}(%rsp), %edi -; KNL-NEXT: kmovw %edi, %k1 -; KNL-NEXT: kshiftlw $1, %k1, %k1 -; KNL-NEXT: movl {{[0-9]+}}(%rsp), %edi -; KNL-NEXT: kmovw %edi, %k2 +; KNL-NEXT: kshiftrw $12, %k2, %k2 +; KNL-NEXT: kxorw %k2, %k1, %k1 +; KNL-NEXT: kshiftrw $4, %k1, %k2 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dl +; KNL-NEXT: kmovw %edx, %k3 +; KNL-NEXT: kxorw %k3, %k2, %k2 ; KNL-NEXT: kshiftlw $15, %k2, %k2 -; KNL-NEXT: kshiftrw $15, %k2, %k2 -; KNL-NEXT: korw %k1, %k2, %k1 -; KNL-NEXT: kshiftlw $2, %k1, %k1 -; KNL-NEXT: movl {{[0-9]+}}(%rsp), %edi -; KNL-NEXT: kmovw %edi, %k2 -; KNL-NEXT: kshiftlw $1, %k2, %k2 -; KNL-NEXT: movl {{[0-9]+}}(%rsp), %edi -; KNL-NEXT: kmovw %edi, %k3 -; KNL-NEXT: kshiftlw $15, %k3, %k3 -; KNL-NEXT: kshiftrw $15, %k3, %k3 -; KNL-NEXT: korw %k2, %k3, %k2 -; KNL-NEXT: kshiftlw $14, %k2, %k2 -; KNL-NEXT: kshiftrw $14, %k2, %k2 -; KNL-NEXT: korw %k1, %k2, %k1 -; KNL-NEXT: kshiftlw $12, %k1, %k1 -; KNL-NEXT: kshiftrw $12, %k1, %k1 -; KNL-NEXT: korw %k0, %k1, %k0 -; KNL-NEXT: movl {{[0-9]+}}(%rsp), %edi -; KNL-NEXT: kmovw %edi, %k1 -; KNL-NEXT: kshiftlw $1, %k1, %k1 -; KNL-NEXT: movl {{[0-9]+}}(%rsp), %edi -; KNL-NEXT: kmovw %edi, %k2 +; KNL-NEXT: kshiftrw $11, %k2, %k2 +; KNL-NEXT: kxorw %k2, %k1, %k1 +; KNL-NEXT: kshiftrw $5, %k1, %k2 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dl +; KNL-NEXT: kmovw %edx, %k3 +; KNL-NEXT: kxorw %k3, %k2, %k2 ; KNL-NEXT: kshiftlw $15, %k2, %k2 -; KNL-NEXT: kshiftrw $15, %k2, %k2 -; KNL-NEXT: korw %k1, %k2, %k1 -; KNL-NEXT: kshiftlw $2, %k1, %k1 -; KNL-NEXT: movl {{[0-9]+}}(%rsp), %edi -; KNL-NEXT: kmovw %edi, %k2 -; KNL-NEXT: kshiftlw $1, %k2, %k2 -; KNL-NEXT: movl {{[0-9]+}}(%rsp), %edi -; KNL-NEXT: kmovw %edi, %k3 -; KNL-NEXT: kshiftlw $15, %k3, %k3 -; KNL-NEXT: kshiftrw $15, %k3, %k3 -; KNL-NEXT: korw %k2, %k3, %k2 +; KNL-NEXT: kshiftrw $10, %k2, %k2 +; KNL-NEXT: kxorw %k2, %k1, %k1 +; KNL-NEXT: kshiftrw $6, %k1, %k2 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dl +; KNL-NEXT: kmovw %edx, %k3 +; KNL-NEXT: kxorw %k3, %k2, %k2 +; KNL-NEXT: kshiftlw $15, %k2, %k2 +; KNL-NEXT: kshiftrw $9, %k2, %k2 +; KNL-NEXT: kxorw %k2, %k1, %k1 +; KNL-NEXT: kshiftrw $7, %k1, %k2 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dl +; KNL-NEXT: kmovw %edx, %k3 +; KNL-NEXT: kxorw %k3, %k2, %k2 +; KNL-NEXT: kshiftlw $15, %k2, %k2 +; KNL-NEXT: kshiftrw $8, %k2, %k2 +; KNL-NEXT: kxorw %k2, %k1, %k1 +; KNL-NEXT: kshiftrw $8, %k1, %k2 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dl +; KNL-NEXT: kmovw %edx, %k3 +; KNL-NEXT: kxorw %k3, %k2, %k2 +; KNL-NEXT: kshiftlw $15, %k2, %k2 +; KNL-NEXT: kshiftrw $7, %k2, %k2 +; KNL-NEXT: kxorw %k2, %k1, %k1 +; KNL-NEXT: kshiftrw $9, %k1, %k2 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dl +; KNL-NEXT: kmovw %edx, %k3 +; KNL-NEXT: kxorw %k3, %k2, %k2 +; KNL-NEXT: kshiftlw $15, %k2, %k2 +; KNL-NEXT: kshiftrw $6, %k2, %k2 +; KNL-NEXT: kxorw %k2, %k1, %k1 +; KNL-NEXT: kshiftrw $10, %k1, %k2 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dl +; KNL-NEXT: kmovw %edx, %k3 +; KNL-NEXT: kxorw %k3, %k2, %k2 +; KNL-NEXT: kshiftlw $15, %k2, %k2 +; KNL-NEXT: kshiftrw $5, %k2, %k2 +; KNL-NEXT: kxorw %k2, %k1, %k1 +; KNL-NEXT: kshiftrw $11, %k1, %k2 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dl +; KNL-NEXT: kmovw %edx, %k3 +; KNL-NEXT: kxorw %k3, %k2, %k2 +; KNL-NEXT: kshiftlw $15, %k2, %k2 +; KNL-NEXT: kshiftrw $4, %k2, %k2 +; KNL-NEXT: kxorw %k2, %k1, %k1 +; KNL-NEXT: kshiftrw $12, %k1, %k2 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dl +; KNL-NEXT: kmovw %edx, %k3 +; KNL-NEXT: kxorw %k3, %k2, %k2 +; KNL-NEXT: kshiftlw $15, %k2, %k2 +; KNL-NEXT: kshiftrw $3, %k2, %k2 +; KNL-NEXT: kxorw %k2, %k1, %k1 +; KNL-NEXT: kshiftrw $13, %k1, %k2 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dl +; KNL-NEXT: kmovw %edx, %k3 +; KNL-NEXT: kxorw %k3, %k2, %k2 +; KNL-NEXT: kshiftlw $15, %k2, %k2 +; KNL-NEXT: kshiftrw $2, %k2, %k2 +; KNL-NEXT: kxorw %k2, %k1, %k1 +; KNL-NEXT: kshiftrw $14, %k1, %k2 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dl +; KNL-NEXT: kmovw %edx, %k3 +; KNL-NEXT: kxorw %k3, %k2, %k2 ; KNL-NEXT: kshiftlw $14, %k2, %k2 -; KNL-NEXT: kshiftrw $14, %k2, %k2 -; KNL-NEXT: korw %k1, %k2, %k1 -; KNL-NEXT: kshiftlw $4, %k1, %k1 -; KNL-NEXT: movl {{[0-9]+}}(%rsp), %edi -; KNL-NEXT: kmovw %edi, %k2 -; KNL-NEXT: kshiftlw $1, %k2, %k2 -; KNL-NEXT: movl {{[0-9]+}}(%rsp), %edi -; KNL-NEXT: kmovw %edi, %k3 -; KNL-NEXT: kshiftlw $15, %k3, %k3 -; KNL-NEXT: kshiftrw $15, %k3, %k3 -; KNL-NEXT: korw %k2, %k3, %k2 -; KNL-NEXT: kshiftlw $2, %k2, %k2 -; KNL-NEXT: movl {{[0-9]+}}(%rsp), %edi -; KNL-NEXT: kmovw %edi, %k3 -; KNL-NEXT: kshiftlw $1, %k3, %k3 -; KNL-NEXT: movl {{[0-9]+}}(%rsp), %edi -; KNL-NEXT: kmovw %edi, %k4 -; KNL-NEXT: kshiftlw $15, %k4, %k4 -; KNL-NEXT: kshiftrw $15, %k4, %k4 -; KNL-NEXT: korw %k3, %k4, %k3 -; KNL-NEXT: kshiftlw $14, %k3, %k3 -; KNL-NEXT: kshiftrw $14, %k3, %k3 -; KNL-NEXT: korw %k2, %k3, %k2 -; KNL-NEXT: kshiftlw $12, %k2, %k2 -; KNL-NEXT: kshiftrw $12, %k2, %k2 -; KNL-NEXT: korw %k1, %k2, %k1 -; KNL-NEXT: kunpckbw %k0, %k1, %k0 -; KNL-NEXT: movl {{[0-9]+}}(%rsp), %edi -; KNL-NEXT: kmovw %edi, %k1 +; KNL-NEXT: kxorw %k2, %k1, %k1 ; KNL-NEXT: kshiftlw $1, %k1, %k1 -; KNL-NEXT: movl {{[0-9]+}}(%rsp), %edi -; KNL-NEXT: kmovw %edi, %k2 +; KNL-NEXT: kshiftrw $1, %k1, %k1 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dl +; KNL-NEXT: kmovw %edx, %k2 +; KNL-NEXT: kshiftlw $15, %k2, %k2 +; KNL-NEXT: korw %k2, %k1, %k1 +; KNL-NEXT: kmovw %ecx, %k2 +; KNL-NEXT: kmovw %esi, %k3 +; KNL-NEXT: kxorw %k0, %k3, %k0 +; KNL-NEXT: kshiftrw $2, %k0, %k3 +; KNL-NEXT: kxorw %k2, %k3, %k2 ; KNL-NEXT: kshiftlw $15, %k2, %k2 -; KNL-NEXT: kshiftrw $15, %k2, %k2 -; KNL-NEXT: korw %k1, %k2, %k1 -; KNL-NEXT: kshiftlw $2, %k1, %k1 -; KNL-NEXT: movl {{[0-9]+}}(%rsp), %edi -; KNL-NEXT: kmovw %edi, %k2 -; KNL-NEXT: kshiftlw $1, %k2, %k2 +; KNL-NEXT: kshiftrw $13, %k2, %k2 +; KNL-NEXT: kxorw %k2, %k0, %k0 +; KNL-NEXT: kshiftrw $3, %k0, %k2 +; KNL-NEXT: kmovw %r8d, %k3 +; KNL-NEXT: kxorw %k3, %k2, %k2 +; KNL-NEXT: kshiftlw $15, %k2, %k2 +; KNL-NEXT: kshiftrw $12, %k2, %k2 +; KNL-NEXT: kxorw %k2, %k0, %k0 +; KNL-NEXT: kshiftrw $4, %k0, %k2 ; KNL-NEXT: kmovw %r9d, %k3 -; KNL-NEXT: kshiftlw $15, %k3, %k3 -; KNL-NEXT: kshiftrw $15, %k3, %k3 -; KNL-NEXT: korw %k2, %k3, %k2 -; KNL-NEXT: kshiftlw $14, %k2, %k2 -; KNL-NEXT: kshiftrw $14, %k2, %k2 -; KNL-NEXT: korw %k1, %k2, %k1 -; KNL-NEXT: kshiftlw $4, %k1, %k1 -; KNL-NEXT: kmovw %r8d, %k2 -; KNL-NEXT: kshiftlw $1, %k2, %k2 +; KNL-NEXT: kxorw %k3, %k2, %k2 +; KNL-NEXT: kshiftlw $15, %k2, %k2 +; KNL-NEXT: kshiftrw $11, %k2, %k2 +; KNL-NEXT: kxorw %k2, %k0, %k0 +; KNL-NEXT: kshiftrw $5, %k0, %k2 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %cl ; KNL-NEXT: kmovw %ecx, %k3 -; KNL-NEXT: kshiftlw $15, %k3, %k3 -; KNL-NEXT: kshiftrw $15, %k3, %k3 -; KNL-NEXT: korw %k2, %k3, %k2 -; KNL-NEXT: kshiftlw $2, %k2, %k2 -; KNL-NEXT: kmovw %edx, %k3 -; KNL-NEXT: kshiftlw $1, %k3, %k3 -; KNL-NEXT: kmovw %esi, %k4 -; KNL-NEXT: kshiftlw $15, %k4, %k4 -; KNL-NEXT: kshiftrw $15, %k4, %k4 -; KNL-NEXT: korw %k3, %k4, %k3 -; KNL-NEXT: kshiftlw $14, %k3, %k3 -; KNL-NEXT: kshiftrw $14, %k3, %k3 -; KNL-NEXT: korw %k2, %k3, %k2 -; KNL-NEXT: kshiftlw $12, %k2, %k2 -; KNL-NEXT: kshiftrw $12, %k2, %k2 -; KNL-NEXT: korw %k1, %k2, %k1 -; KNL-NEXT: movl {{[0-9]+}}(%rsp), %ecx -; KNL-NEXT: kmovw %ecx, %k2 -; KNL-NEXT: kshiftlw $1, %k2, %k2 -; KNL-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; KNL-NEXT: kxorw %k3, %k2, %k2 +; KNL-NEXT: kshiftlw $15, %k2, %k2 +; KNL-NEXT: kshiftrw $10, %k2, %k2 +; KNL-NEXT: kxorw %k2, %k0, %k0 +; KNL-NEXT: kshiftrw $6, %k0, %k2 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %cl ; KNL-NEXT: kmovw %ecx, %k3 -; KNL-NEXT: kshiftlw $15, %k3, %k3 -; KNL-NEXT: kshiftrw $15, %k3, %k3 -; KNL-NEXT: korw %k2, %k3, %k2 -; KNL-NEXT: kshiftlw $2, %k2, %k2 -; KNL-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; KNL-NEXT: kxorw %k3, %k2, %k2 +; KNL-NEXT: kshiftlw $15, %k2, %k2 +; KNL-NEXT: kshiftrw $9, %k2, %k2 +; KNL-NEXT: kxorw %k2, %k0, %k0 +; KNL-NEXT: kshiftrw $7, %k0, %k2 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %cl ; KNL-NEXT: kmovw %ecx, %k3 -; KNL-NEXT: kshiftlw $1, %k3, %k3 -; KNL-NEXT: movl {{[0-9]+}}(%rsp), %ecx -; KNL-NEXT: kmovw %ecx, %k4 -; KNL-NEXT: kshiftlw $15, %k4, %k4 -; KNL-NEXT: kshiftrw $15, %k4, %k4 -; KNL-NEXT: korw %k3, %k4, %k3 -; KNL-NEXT: kshiftlw $14, %k3, %k3 -; KNL-NEXT: kshiftrw $14, %k3, %k3 -; KNL-NEXT: korw %k2, %k3, %k2 -; KNL-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; KNL-NEXT: kxorw %k3, %k2, %k2 +; KNL-NEXT: kshiftlw $15, %k2, %k2 +; KNL-NEXT: kshiftrw $8, %k2, %k2 +; KNL-NEXT: kxorw %k2, %k0, %k0 +; KNL-NEXT: kshiftrw $8, %k0, %k2 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %cl ; KNL-NEXT: kmovw %ecx, %k3 -; KNL-NEXT: kshiftlw $1, %k3, %k3 -; KNL-NEXT: movl {{[0-9]+}}(%rsp), %ecx -; KNL-NEXT: kmovw %ecx, %k4 -; KNL-NEXT: kshiftlw $15, %k4, %k4 -; KNL-NEXT: kshiftrw $15, %k4, %k4 -; KNL-NEXT: korw %k3, %k4, %k3 -; KNL-NEXT: movl {{[0-9]+}}(%rsp), %ecx -; KNL-NEXT: kmovw %ecx, %k4 -; KNL-NEXT: kshiftlw $1, %k4, %k4 -; KNL-NEXT: movl {{[0-9]+}}(%rsp), %ecx -; KNL-NEXT: kmovw %ecx, %k5 -; KNL-NEXT: kshiftlw $15, %k5, %k5 -; KNL-NEXT: kshiftrw $15, %k5, %k5 -; KNL-NEXT: korw %k4, %k5, %k4 -; KNL-NEXT: kshiftlw $2, %k3, %k3 -; KNL-NEXT: kshiftlw $14, %k4, %k4 -; KNL-NEXT: kshiftrw $14, %k4, %k4 -; KNL-NEXT: korw %k3, %k4, %k3 -; KNL-NEXT: kshiftlw $4, %k2, %k2 -; KNL-NEXT: kshiftlw $12, %k3, %k3 -; KNL-NEXT: kshiftrw $12, %k3, %k3 -; KNL-NEXT: korw %k2, %k3, %k2 -; KNL-NEXT: kunpckbw %k1, %k2, %k1 -; KNL-NEXT: kandw %k0, %k1, %k0 -; KNL-NEXT: movl {{[0-9]+}}(%rsp), %r11d -; KNL-NEXT: kshiftrw $1, %k0, %k1 +; KNL-NEXT: kxorw %k3, %k2, %k2 +; KNL-NEXT: kshiftlw $15, %k2, %k2 +; KNL-NEXT: kshiftrw $7, %k2, %k2 +; KNL-NEXT: kxorw %k2, %k0, %k0 +; KNL-NEXT: kshiftrw $9, %k0, %k2 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %cl +; KNL-NEXT: kmovw %ecx, %k3 +; KNL-NEXT: kxorw %k3, %k2, %k2 +; KNL-NEXT: kshiftlw $15, %k2, %k2 +; KNL-NEXT: kshiftrw $6, %k2, %k2 +; KNL-NEXT: kxorw %k2, %k0, %k0 +; KNL-NEXT: kshiftrw $10, %k0, %k2 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %cl +; KNL-NEXT: kmovw %ecx, %k3 +; KNL-NEXT: kxorw %k3, %k2, %k2 +; KNL-NEXT: kshiftlw $15, %k2, %k2 +; KNL-NEXT: kshiftrw $5, %k2, %k2 +; KNL-NEXT: kxorw %k2, %k0, %k0 +; KNL-NEXT: kshiftrw $11, %k0, %k2 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %cl +; KNL-NEXT: kmovw %ecx, %k3 +; KNL-NEXT: kxorw %k3, %k2, %k2 +; KNL-NEXT: kshiftlw $15, %k2, %k2 +; KNL-NEXT: kshiftrw $4, %k2, %k2 +; KNL-NEXT: kxorw %k2, %k0, %k0 +; KNL-NEXT: kshiftrw $12, %k0, %k2 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %cl +; KNL-NEXT: kmovw %ecx, %k3 +; KNL-NEXT: kxorw %k3, %k2, %k2 +; KNL-NEXT: kshiftlw $15, %k2, %k2 +; KNL-NEXT: kshiftrw $3, %k2, %k2 +; KNL-NEXT: kxorw %k2, %k0, %k0 +; KNL-NEXT: kshiftrw $13, %k0, %k2 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %cl +; KNL-NEXT: kmovw %ecx, %k3 +; KNL-NEXT: kxorw %k3, %k2, %k2 +; KNL-NEXT: kshiftlw $15, %k2, %k2 +; KNL-NEXT: kshiftrw $2, %k2, %k2 +; KNL-NEXT: kxorw %k2, %k0, %k0 +; KNL-NEXT: kshiftrw $14, %k0, %k2 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %cl +; KNL-NEXT: kmovw %ecx, %k3 +; KNL-NEXT: kxorw %k3, %k2, %k2 +; KNL-NEXT: kshiftlw $14, %k2, %k2 +; KNL-NEXT: kxorw %k2, %k0, %k0 +; KNL-NEXT: kshiftlw $1, %k0, %k0 +; KNL-NEXT: kshiftrw $1, %k0, %k0 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %cl +; KNL-NEXT: kmovw %ecx, %k2 +; KNL-NEXT: kshiftlw $15, %k2, %k2 +; KNL-NEXT: korw %k2, %k0, %k0 +; KNL-NEXT: kandw %k1, %k0, %k0 +; KNL-NEXT: xorl %ecx, %ecx +; KNL-NEXT: cmpb $0, {{[0-9]+}}(%rsp) +; KNL-NEXT: movl $65535, %edx ## imm = 0xFFFF +; KNL-NEXT: movl $0, %esi +; KNL-NEXT: cmovnel %edx, %esi +; KNL-NEXT: kmovw %esi, %k1 +; KNL-NEXT: cmpb $0, {{[0-9]+}}(%rsp) +; KNL-NEXT: cmovnel %edx, %ecx +; KNL-NEXT: kmovw %ecx, %k2 +; KNL-NEXT: kandw %k1, %k2, %k1 ; KNL-NEXT: kmovw %k1, %r8d -; KNL-NEXT: kshiftrw $2, %k0, %k1 +; KNL-NEXT: kshiftrw $1, %k0, %k1 ; KNL-NEXT: kmovw %k1, %r9d -; KNL-NEXT: kshiftrw $3, %k0, %k1 +; KNL-NEXT: kshiftrw $2, %k0, %k1 ; KNL-NEXT: kmovw %k1, %r10d +; KNL-NEXT: kshiftrw $3, %k0, %k1 +; KNL-NEXT: kmovw %k1, %r11d ; KNL-NEXT: kshiftrw $4, %k0, %k1 ; KNL-NEXT: kmovw %k1, %r12d ; KNL-NEXT: kshiftrw $5, %k0, %k1 @@ -723,30 +756,29 @@ define <17 x i1> @test16(<17 x i1> %a, <17 x i1> %b) nounwind { ; KNL-NEXT: kshiftrw $8, %k0, %k1 ; KNL-NEXT: kmovw %k1, %ebx ; KNL-NEXT: kshiftrw $9, %k0, %k1 -; KNL-NEXT: kmovw %k1, %edi +; KNL-NEXT: kmovw %k1, %esi ; KNL-NEXT: kshiftrw $10, %k0, %k1 ; KNL-NEXT: kmovw %k1, %ebp ; KNL-NEXT: kshiftrw $11, %k0, %k1 -; KNL-NEXT: kmovw %k1, %edx +; KNL-NEXT: kmovw %k1, %ecx ; KNL-NEXT: kshiftrw $12, %k0, %k1 -; KNL-NEXT: kmovw %k1, %esi +; KNL-NEXT: kmovw %k1, %edx ; KNL-NEXT: kshiftrw $13, %k0, %k1 -; KNL-NEXT: andl {{[0-9]+}}(%rsp), %r11d -; KNL-NEXT: kmovw %k1, %ecx +; KNL-NEXT: kmovw %k1, %edi ; KNL-NEXT: kshiftrw $14, %k0, %k1 -; KNL-NEXT: andl $1, %r11d -; KNL-NEXT: movb %r11b, 2(%rax) -; KNL-NEXT: kmovw %k0, %r11d -; KNL-NEXT: andl $1, %r11d ; KNL-NEXT: andl $1, %r8d -; KNL-NEXT: leal (%r11,%r8,2), %r8d -; KNL-NEXT: kmovw %k1, %r11d -; KNL-NEXT: kshiftrw $15, %k0, %k0 +; KNL-NEXT: movb %r8b, 2(%rax) +; KNL-NEXT: kmovw %k0, %r8d +; KNL-NEXT: andl $1, %r8d ; KNL-NEXT: andl $1, %r9d -; KNL-NEXT: leal (%r8,%r9,4), %r8d -; KNL-NEXT: kmovw %k0, %r9d +; KNL-NEXT: leal (%r8,%r9,2), %r8d +; KNL-NEXT: kmovw %k1, %r9d +; KNL-NEXT: kshiftrw $15, %k0, %k0 ; KNL-NEXT: andl $1, %r10d -; KNL-NEXT: leal (%r8,%r10,8), %r8d +; KNL-NEXT: leal (%r8,%r10,4), %r8d +; KNL-NEXT: kmovw %k0, %r10d +; KNL-NEXT: andl $1, %r11d +; KNL-NEXT: leal (%r8,%r11,8), %r8d ; KNL-NEXT: andl $1, %r12d ; KNL-NEXT: shll $4, %r12d ; KNL-NEXT: orl %r8d, %r12d @@ -761,29 +793,29 @@ define <17 x i1> @test16(<17 x i1> %a, <17 x i1> %b) nounwind { ; KNL-NEXT: andl $1, %ebx ; KNL-NEXT: shll $8, %ebx ; KNL-NEXT: orl %r13d, %ebx -; KNL-NEXT: andl $1, %edi -; KNL-NEXT: shll $9, %edi -; KNL-NEXT: orl %ebx, %edi +; KNL-NEXT: andl $1, %esi +; KNL-NEXT: shll $9, %esi +; KNL-NEXT: orl %ebx, %esi ; KNL-NEXT: andl $1, %ebp ; KNL-NEXT: shll $10, %ebp -; KNL-NEXT: orl %edi, %ebp +; KNL-NEXT: orl %esi, %ebp ; KNL-NEXT: orl %r15d, %ebp -; KNL-NEXT: andl $1, %edx -; KNL-NEXT: shll $11, %edx -; KNL-NEXT: andl $1, %esi -; KNL-NEXT: shll $12, %esi -; KNL-NEXT: orl %edx, %esi ; KNL-NEXT: andl $1, %ecx -; KNL-NEXT: shll $13, %ecx -; KNL-NEXT: orl %esi, %ecx -; KNL-NEXT: andl $1, %r11d -; KNL-NEXT: shll $14, %r11d -; KNL-NEXT: orl %ecx, %r11d +; KNL-NEXT: shll $11, %ecx +; KNL-NEXT: andl $1, %edx +; KNL-NEXT: shll $12, %edx +; KNL-NEXT: orl %ecx, %edx +; KNL-NEXT: andl $1, %edi +; KNL-NEXT: shll $13, %edi +; KNL-NEXT: orl %edx, %edi ; KNL-NEXT: andl $1, %r9d -; KNL-NEXT: shll $15, %r9d -; KNL-NEXT: orl %r11d, %r9d -; KNL-NEXT: orl %ebp, %r9d -; KNL-NEXT: movw %r9w, (%rax) +; KNL-NEXT: shll $14, %r9d +; KNL-NEXT: orl %edi, %r9d +; KNL-NEXT: andl $1, %r10d +; KNL-NEXT: shll $15, %r10d +; KNL-NEXT: orl %r9d, %r10d +; KNL-NEXT: orl %ebp, %r10d +; KNL-NEXT: movw %r10w, (%rax) ; KNL-NEXT: popq %rbx ; KNL-NEXT: popq %r12 ; KNL-NEXT: popq %r13 @@ -794,7 +826,285 @@ define <17 x i1> @test16(<17 x i1> %a, <17 x i1> %b) nounwind { ; ; SKX-LABEL: test16: ; SKX: ## %bb.0: -; SKX-NEXT: vandps %ymm1, %ymm0, %ymm0 +; SKX-NEXT: pushq %rbp +; SKX-NEXT: pushq %r15 +; SKX-NEXT: pushq %r14 +; SKX-NEXT: pushq %r13 +; SKX-NEXT: pushq %r12 +; SKX-NEXT: pushq %rbx +; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k1 +; SKX-NEXT: movq %rdi, %rax +; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k2 +; SKX-NEXT: kshiftld $31, %k0, %k0 +; SKX-NEXT: kshiftrd $30, %k0, %k0 +; SKX-NEXT: kxord %k0, %k2, %k2 +; SKX-NEXT: kshiftrd $2, %k2, %k3 +; SKX-NEXT: kxord %k1, %k3, %k1 +; SKX-NEXT: kshiftld $31, %k1, %k1 +; SKX-NEXT: kshiftrd $29, %k1, %k1 +; SKX-NEXT: kxord %k1, %k2, %k1 +; SKX-NEXT: kshiftrd $3, %k1, %k2 +; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k3 +; SKX-NEXT: kxord %k3, %k2, %k2 +; SKX-NEXT: kshiftld $31, %k2, %k2 +; SKX-NEXT: kshiftrd $28, %k2, %k2 +; SKX-NEXT: kxord %k2, %k1, %k1 +; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k2 +; SKX-NEXT: kshiftrd $4, %k1, %k3 +; SKX-NEXT: kxord %k2, %k3, %k2 +; SKX-NEXT: kshiftld $31, %k2, %k2 +; SKX-NEXT: kshiftrd $27, %k2, %k2 +; SKX-NEXT: kxord %k2, %k1, %k1 +; SKX-NEXT: kshiftrd $5, %k1, %k2 +; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k3 +; SKX-NEXT: kxord %k3, %k2, %k2 +; SKX-NEXT: kshiftld $31, %k2, %k2 +; SKX-NEXT: kshiftrd $26, %k2, %k2 +; SKX-NEXT: kxord %k2, %k1, %k1 +; SKX-NEXT: kshiftrd $6, %k1, %k2 +; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k3 +; SKX-NEXT: kxord %k3, %k2, %k2 +; SKX-NEXT: kshiftld $31, %k2, %k2 +; SKX-NEXT: kshiftrd $25, %k2, %k2 +; SKX-NEXT: kxord %k2, %k1, %k1 +; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k2 +; SKX-NEXT: kshiftrd $7, %k1, %k3 +; SKX-NEXT: kxord %k2, %k3, %k2 +; SKX-NEXT: kshiftld $31, %k2, %k2 +; SKX-NEXT: kshiftrd $24, %k2, %k2 +; SKX-NEXT: kxord %k2, %k1, %k1 +; SKX-NEXT: kshiftrd $8, %k1, %k2 +; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k3 +; SKX-NEXT: kxord %k3, %k2, %k2 +; SKX-NEXT: kshiftld $31, %k2, %k2 +; SKX-NEXT: kshiftrd $23, %k2, %k2 +; SKX-NEXT: kxord %k2, %k1, %k1 +; SKX-NEXT: kshiftrd $9, %k1, %k2 +; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k3 +; SKX-NEXT: kxord %k3, %k2, %k2 +; SKX-NEXT: kshiftld $31, %k2, %k2 +; SKX-NEXT: kshiftrd $22, %k2, %k2 +; SKX-NEXT: kxord %k2, %k1, %k1 +; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k2 +; SKX-NEXT: kshiftrd $10, %k1, %k3 +; SKX-NEXT: kxord %k2, %k3, %k2 +; SKX-NEXT: kshiftld $31, %k2, %k2 +; SKX-NEXT: kshiftrd $21, %k2, %k2 +; SKX-NEXT: kxord %k2, %k1, %k1 +; SKX-NEXT: kshiftrd $11, %k1, %k2 +; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k3 +; SKX-NEXT: kxord %k3, %k2, %k2 +; SKX-NEXT: kshiftld $31, %k2, %k2 +; SKX-NEXT: kshiftrd $20, %k2, %k2 +; SKX-NEXT: kxord %k2, %k1, %k1 +; SKX-NEXT: kshiftrd $12, %k1, %k2 +; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k3 +; SKX-NEXT: kxord %k3, %k2, %k2 +; SKX-NEXT: kshiftld $31, %k2, %k2 +; SKX-NEXT: kshiftrd $19, %k2, %k2 +; SKX-NEXT: kxord %k2, %k1, %k1 +; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k2 +; SKX-NEXT: kshiftrd $13, %k1, %k3 +; SKX-NEXT: kxord %k2, %k3, %k2 +; SKX-NEXT: kshiftld $31, %k2, %k2 +; SKX-NEXT: kshiftrd $18, %k2, %k2 +; SKX-NEXT: kxord %k2, %k1, %k1 +; SKX-NEXT: kshiftrd $14, %k1, %k2 +; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k3 +; SKX-NEXT: kxord %k3, %k2, %k2 +; SKX-NEXT: kshiftld $31, %k2, %k2 +; SKX-NEXT: kshiftrd $17, %k2, %k2 +; SKX-NEXT: kxord %k2, %k1, %k1 +; SKX-NEXT: kshiftrd $15, %k1, %k2 +; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k3 +; SKX-NEXT: kxord %k3, %k2, %k2 +; SKX-NEXT: kshiftld $31, %k2, %k2 +; SKX-NEXT: kshiftrd $16, %k2, %k2 +; SKX-NEXT: kxord %k2, %k1, %k1 +; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k2 +; SKX-NEXT: kshiftrd $16, %k1, %k3 +; SKX-NEXT: kxord %k2, %k3, %k2 +; SKX-NEXT: kshiftld $31, %k2, %k2 +; SKX-NEXT: kshiftrd $15, %k2, %k2 +; SKX-NEXT: kxord %k2, %k1, %k1 +; SKX-NEXT: kmovd %ecx, %k2 +; SKX-NEXT: kmovd %esi, %k3 +; SKX-NEXT: kxord %k0, %k3, %k0 +; SKX-NEXT: kshiftrd $2, %k0, %k3 +; SKX-NEXT: kxord %k2, %k3, %k2 +; SKX-NEXT: kshiftld $31, %k2, %k2 +; SKX-NEXT: kshiftrd $29, %k2, %k2 +; SKX-NEXT: kxord %k2, %k0, %k0 +; SKX-NEXT: kshiftrd $3, %k0, %k2 +; SKX-NEXT: kmovd %r8d, %k3 +; SKX-NEXT: kxord %k3, %k2, %k2 +; SKX-NEXT: kshiftld $31, %k2, %k2 +; SKX-NEXT: kshiftrd $28, %k2, %k2 +; SKX-NEXT: kxord %k2, %k0, %k0 +; SKX-NEXT: kshiftrd $4, %k0, %k2 +; SKX-NEXT: kmovd %r9d, %k3 +; SKX-NEXT: kxord %k3, %k2, %k2 +; SKX-NEXT: kshiftld $31, %k2, %k2 +; SKX-NEXT: kshiftrd $27, %k2, %k2 +; SKX-NEXT: kxord %k2, %k0, %k0 +; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k2 +; SKX-NEXT: kshiftrd $5, %k0, %k3 +; SKX-NEXT: kxord %k2, %k3, %k2 +; SKX-NEXT: kshiftld $31, %k2, %k2 +; SKX-NEXT: kshiftrd $26, %k2, %k2 +; SKX-NEXT: kxord %k2, %k0, %k0 +; SKX-NEXT: kshiftrd $6, %k0, %k2 +; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k3 +; SKX-NEXT: kxord %k3, %k2, %k2 +; SKX-NEXT: kshiftld $31, %k2, %k2 +; SKX-NEXT: kshiftrd $25, %k2, %k2 +; SKX-NEXT: kxord %k2, %k0, %k0 +; SKX-NEXT: kshiftrd $7, %k0, %k2 +; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k3 +; SKX-NEXT: kxord %k3, %k2, %k2 +; SKX-NEXT: kshiftld $31, %k2, %k2 +; SKX-NEXT: kshiftrd $24, %k2, %k2 +; SKX-NEXT: kxord %k2, %k0, %k0 +; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k2 +; SKX-NEXT: kshiftrd $8, %k0, %k3 +; SKX-NEXT: kxord %k2, %k3, %k2 +; SKX-NEXT: kshiftld $31, %k2, %k2 +; SKX-NEXT: kshiftrd $23, %k2, %k2 +; SKX-NEXT: kxord %k2, %k0, %k0 +; SKX-NEXT: kshiftrd $9, %k0, %k2 +; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k3 +; SKX-NEXT: kxord %k3, %k2, %k2 +; SKX-NEXT: kshiftld $31, %k2, %k2 +; SKX-NEXT: kshiftrd $22, %k2, %k2 +; SKX-NEXT: kxord %k2, %k0, %k0 +; SKX-NEXT: kshiftrd $10, %k0, %k2 +; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k3 +; SKX-NEXT: kxord %k3, %k2, %k2 +; SKX-NEXT: kshiftld $31, %k2, %k2 +; SKX-NEXT: kshiftrd $21, %k2, %k2 +; SKX-NEXT: kxord %k2, %k0, %k0 +; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k2 +; SKX-NEXT: kshiftrd $11, %k0, %k3 +; SKX-NEXT: kxord %k2, %k3, %k2 +; SKX-NEXT: kshiftld $31, %k2, %k2 +; SKX-NEXT: kshiftrd $20, %k2, %k2 +; SKX-NEXT: kxord %k2, %k0, %k0 +; SKX-NEXT: kshiftrd $12, %k0, %k2 +; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k3 +; SKX-NEXT: kxord %k3, %k2, %k2 +; SKX-NEXT: kshiftld $31, %k2, %k2 +; SKX-NEXT: kshiftrd $19, %k2, %k2 +; SKX-NEXT: kxord %k2, %k0, %k0 +; SKX-NEXT: kshiftrd $13, %k0, %k2 +; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k3 +; SKX-NEXT: kxord %k3, %k2, %k2 +; SKX-NEXT: kshiftld $31, %k2, %k2 +; SKX-NEXT: kshiftrd $18, %k2, %k2 +; SKX-NEXT: kxord %k2, %k0, %k0 +; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k2 +; SKX-NEXT: kshiftrd $14, %k0, %k3 +; SKX-NEXT: kxord %k2, %k3, %k2 +; SKX-NEXT: kshiftld $31, %k2, %k2 +; SKX-NEXT: kshiftrd $17, %k2, %k2 +; SKX-NEXT: kxord %k2, %k0, %k0 +; SKX-NEXT: kshiftrd $15, %k0, %k2 +; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k3 +; SKX-NEXT: kxord %k3, %k2, %k2 +; SKX-NEXT: kshiftld $31, %k2, %k2 +; SKX-NEXT: kshiftrd $16, %k2, %k2 +; SKX-NEXT: kxord %k2, %k0, %k0 +; SKX-NEXT: kshiftrd $16, %k0, %k2 +; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k3 +; SKX-NEXT: kxord %k3, %k2, %k2 +; SKX-NEXT: kshiftld $31, %k2, %k2 +; SKX-NEXT: kshiftrd $15, %k2, %k2 +; SKX-NEXT: kxord %k2, %k0, %k0 +; SKX-NEXT: kandd %k1, %k0, %k0 +; SKX-NEXT: kshiftrd $16, %k0, %k1 +; SKX-NEXT: kmovd %k1, %r8d +; SKX-NEXT: kshiftrd $1, %k0, %k1 +; SKX-NEXT: kmovd %k1, %r9d +; SKX-NEXT: kshiftrd $2, %k0, %k1 +; SKX-NEXT: kmovd %k1, %r10d +; SKX-NEXT: kshiftrd $3, %k0, %k1 +; SKX-NEXT: kmovd %k1, %r11d +; SKX-NEXT: kshiftrd $4, %k0, %k1 +; SKX-NEXT: kmovd %k1, %r12d +; SKX-NEXT: kshiftrd $5, %k0, %k1 +; SKX-NEXT: kmovd %k1, %r15d +; SKX-NEXT: kshiftrd $6, %k0, %k1 +; SKX-NEXT: kmovd %k1, %r14d +; SKX-NEXT: kshiftrd $7, %k0, %k1 +; SKX-NEXT: kmovd %k1, %r13d +; SKX-NEXT: kshiftrd $8, %k0, %k1 +; SKX-NEXT: kmovd %k1, %ebx +; SKX-NEXT: kshiftrd $9, %k0, %k1 +; SKX-NEXT: kmovd %k1, %esi +; SKX-NEXT: kshiftrd $10, %k0, %k1 +; SKX-NEXT: kmovd %k1, %ebp +; SKX-NEXT: kshiftrd $11, %k0, %k1 +; SKX-NEXT: kmovd %k1, %ecx +; SKX-NEXT: kshiftrd $12, %k0, %k1 +; SKX-NEXT: kmovd %k1, %edx +; SKX-NEXT: kshiftrd $13, %k0, %k1 +; SKX-NEXT: kmovd %k1, %edi +; SKX-NEXT: kshiftrd $14, %k0, %k1 +; SKX-NEXT: andl $1, %r8d +; SKX-NEXT: movb %r8b, 2(%rax) +; SKX-NEXT: kmovd %k0, %r8d +; SKX-NEXT: andl $1, %r8d +; SKX-NEXT: andl $1, %r9d +; SKX-NEXT: leal (%r8,%r9,2), %r8d +; SKX-NEXT: kmovd %k1, %r9d +; SKX-NEXT: kshiftrd $15, %k0, %k0 +; SKX-NEXT: andl $1, %r10d +; SKX-NEXT: leal (%r8,%r10,4), %r8d +; SKX-NEXT: kmovd %k0, %r10d +; SKX-NEXT: andl $1, %r11d +; SKX-NEXT: leal (%r8,%r11,8), %r8d +; SKX-NEXT: andl $1, %r12d +; SKX-NEXT: shll $4, %r12d +; SKX-NEXT: orl %r8d, %r12d +; SKX-NEXT: andl $1, %r15d +; SKX-NEXT: shll $5, %r15d +; SKX-NEXT: orl %r12d, %r15d +; SKX-NEXT: andl $1, %r14d +; SKX-NEXT: shll $6, %r14d +; SKX-NEXT: andl $1, %r13d +; SKX-NEXT: shll $7, %r13d +; SKX-NEXT: orl %r14d, %r13d +; SKX-NEXT: andl $1, %ebx +; SKX-NEXT: shll $8, %ebx +; SKX-NEXT: orl %r13d, %ebx +; SKX-NEXT: andl $1, %esi +; SKX-NEXT: shll $9, %esi +; SKX-NEXT: orl %ebx, %esi +; SKX-NEXT: andl $1, %ebp +; SKX-NEXT: shll $10, %ebp +; SKX-NEXT: orl %esi, %ebp +; SKX-NEXT: orl %r15d, %ebp +; SKX-NEXT: andl $1, %ecx +; SKX-NEXT: shll $11, %ecx +; SKX-NEXT: andl $1, %edx +; SKX-NEXT: shll $12, %edx +; SKX-NEXT: orl %ecx, %edx +; SKX-NEXT: andl $1, %edi +; SKX-NEXT: shll $13, %edi +; SKX-NEXT: orl %edx, %edi +; SKX-NEXT: andl $1, %r9d +; SKX-NEXT: shll $14, %r9d +; SKX-NEXT: orl %edi, %r9d +; SKX-NEXT: andl $1, %r10d +; SKX-NEXT: shll $15, %r10d +; SKX-NEXT: orl %r9d, %r10d +; SKX-NEXT: orl %ebp, %r10d +; SKX-NEXT: movw %r10w, (%rax) +; SKX-NEXT: popq %rbx +; SKX-NEXT: popq %r12 +; SKX-NEXT: popq %r13 +; SKX-NEXT: popq %r14 +; SKX-NEXT: popq %r15 +; SKX-NEXT: popq %rbp ; SKX-NEXT: retq ; ; KNL_X32-LABEL: test16: @@ -803,264 +1113,295 @@ define <17 x i1> @test16(<17 x i1> %a, <17 x i1> %b) nounwind { ; KNL_X32-NEXT: pushl %ebx ; KNL_X32-NEXT: pushl %edi ; KNL_X32-NEXT: pushl %esi -; KNL_X32-NEXT: movl {{[0-9]+}}(%esp), %eax +; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al ; KNL_X32-NEXT: kmovw %eax, %k0 -; KNL_X32-NEXT: kshiftlw $1, %k0, %k0 -; KNL_X32-NEXT: movl {{[0-9]+}}(%esp), %eax -; KNL_X32-NEXT: kmovw %eax, %k1 -; KNL_X32-NEXT: kshiftlw $15, %k1, %k1 -; KNL_X32-NEXT: kshiftrw $15, %k1, %k1 -; KNL_X32-NEXT: korw %k0, %k1, %k0 -; KNL_X32-NEXT: kshiftlw $2, %k0, %k0 -; KNL_X32-NEXT: movl {{[0-9]+}}(%esp), %eax -; KNL_X32-NEXT: kmovw %eax, %k1 -; KNL_X32-NEXT: kshiftlw $1, %k1, %k1 -; KNL_X32-NEXT: movl {{[0-9]+}}(%esp), %eax +; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al ; KNL_X32-NEXT: kmovw %eax, %k2 -; KNL_X32-NEXT: kshiftlw $15, %k2, %k2 -; KNL_X32-NEXT: kshiftrw $15, %k2, %k2 -; KNL_X32-NEXT: korw %k1, %k2, %k1 -; KNL_X32-NEXT: kshiftlw $14, %k1, %k1 +; KNL_X32-NEXT: kshiftlw $15, %k0, %k1 ; KNL_X32-NEXT: kshiftrw $14, %k1, %k1 -; KNL_X32-NEXT: korw %k0, %k1, %k0 -; KNL_X32-NEXT: kshiftlw $4, %k0, %k0 -; KNL_X32-NEXT: movl {{[0-9]+}}(%esp), %eax -; KNL_X32-NEXT: kmovw %eax, %k1 -; KNL_X32-NEXT: kshiftlw $1, %k1, %k1 -; KNL_X32-NEXT: movl {{[0-9]+}}(%esp), %eax -; KNL_X32-NEXT: kmovw %eax, %k2 +; KNL_X32-NEXT: kxorw %k1, %k2, %k2 +; KNL_X32-NEXT: kshiftrw $2, %k2, %k3 +; KNL_X32-NEXT: kxorw %k0, %k3, %k0 +; KNL_X32-NEXT: kshiftlw $15, %k0, %k0 +; KNL_X32-NEXT: kshiftrw $13, %k0, %k0 +; KNL_X32-NEXT: kxorw %k0, %k2, %k0 +; KNL_X32-NEXT: kshiftrw $3, %k0, %k2 +; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al +; KNL_X32-NEXT: kmovw %eax, %k3 +; KNL_X32-NEXT: kxorw %k3, %k2, %k2 ; KNL_X32-NEXT: kshiftlw $15, %k2, %k2 -; KNL_X32-NEXT: kshiftrw $15, %k2, %k2 -; KNL_X32-NEXT: korw %k1, %k2, %k1 -; KNL_X32-NEXT: kshiftlw $2, %k1, %k1 -; KNL_X32-NEXT: movl {{[0-9]+}}(%esp), %eax -; KNL_X32-NEXT: kmovw %eax, %k2 -; KNL_X32-NEXT: kshiftlw $1, %k2, %k2 -; KNL_X32-NEXT: movl {{[0-9]+}}(%esp), %eax +; KNL_X32-NEXT: kshiftrw $12, %k2, %k2 +; KNL_X32-NEXT: kxorw %k2, %k0, %k0 +; KNL_X32-NEXT: kshiftrw $4, %k0, %k2 +; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al ; KNL_X32-NEXT: kmovw %eax, %k3 -; KNL_X32-NEXT: kshiftlw $15, %k3, %k3 -; KNL_X32-NEXT: kshiftrw $15, %k3, %k3 -; KNL_X32-NEXT: korw %k2, %k3, %k2 -; KNL_X32-NEXT: kshiftlw $14, %k2, %k2 -; KNL_X32-NEXT: kshiftrw $14, %k2, %k2 -; KNL_X32-NEXT: korw %k1, %k2, %k1 -; KNL_X32-NEXT: kshiftlw $12, %k1, %k1 -; KNL_X32-NEXT: kshiftrw $12, %k1, %k1 -; KNL_X32-NEXT: korw %k0, %k1, %k0 -; KNL_X32-NEXT: movl {{[0-9]+}}(%esp), %eax -; KNL_X32-NEXT: kmovw %eax, %k1 -; KNL_X32-NEXT: kshiftlw $1, %k1, %k1 -; KNL_X32-NEXT: movl {{[0-9]+}}(%esp), %eax -; KNL_X32-NEXT: kmovw %eax, %k2 +; KNL_X32-NEXT: kxorw %k3, %k2, %k2 ; KNL_X32-NEXT: kshiftlw $15, %k2, %k2 -; KNL_X32-NEXT: kshiftrw $15, %k2, %k2 -; KNL_X32-NEXT: korw %k1, %k2, %k1 -; KNL_X32-NEXT: kshiftlw $2, %k1, %k1 -; KNL_X32-NEXT: movl {{[0-9]+}}(%esp), %eax -; KNL_X32-NEXT: kmovw %eax, %k2 -; KNL_X32-NEXT: kshiftlw $1, %k2, %k2 -; KNL_X32-NEXT: movl {{[0-9]+}}(%esp), %eax +; KNL_X32-NEXT: kshiftrw $11, %k2, %k2 +; KNL_X32-NEXT: kxorw %k2, %k0, %k0 +; KNL_X32-NEXT: kshiftrw $5, %k0, %k2 +; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al ; KNL_X32-NEXT: kmovw %eax, %k3 -; KNL_X32-NEXT: kshiftlw $15, %k3, %k3 -; KNL_X32-NEXT: kshiftrw $15, %k3, %k3 -; KNL_X32-NEXT: korw %k2, %k3, %k2 -; KNL_X32-NEXT: kshiftlw $14, %k2, %k2 -; KNL_X32-NEXT: kshiftrw $14, %k2, %k2 -; KNL_X32-NEXT: korw %k1, %k2, %k1 -; KNL_X32-NEXT: kshiftlw $4, %k1, %k1 -; KNL_X32-NEXT: movl {{[0-9]+}}(%esp), %eax -; KNL_X32-NEXT: kmovw %eax, %k2 -; KNL_X32-NEXT: kshiftlw $1, %k2, %k2 -; KNL_X32-NEXT: movl {{[0-9]+}}(%esp), %eax +; KNL_X32-NEXT: kxorw %k3, %k2, %k2 +; KNL_X32-NEXT: kshiftlw $15, %k2, %k2 +; KNL_X32-NEXT: kshiftrw $10, %k2, %k2 +; KNL_X32-NEXT: kxorw %k2, %k0, %k0 +; KNL_X32-NEXT: kshiftrw $6, %k0, %k2 +; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al ; KNL_X32-NEXT: kmovw %eax, %k3 -; KNL_X32-NEXT: kshiftlw $15, %k3, %k3 -; KNL_X32-NEXT: kshiftrw $15, %k3, %k3 -; KNL_X32-NEXT: korw %k2, %k3, %k2 -; KNL_X32-NEXT: kshiftlw $2, %k2, %k2 -; KNL_X32-NEXT: movl {{[0-9]+}}(%esp), %eax +; KNL_X32-NEXT: kxorw %k3, %k2, %k2 +; KNL_X32-NEXT: kshiftlw $15, %k2, %k2 +; KNL_X32-NEXT: kshiftrw $9, %k2, %k2 +; KNL_X32-NEXT: kxorw %k2, %k0, %k0 +; KNL_X32-NEXT: kshiftrw $7, %k0, %k2 +; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al ; KNL_X32-NEXT: kmovw %eax, %k3 -; KNL_X32-NEXT: kshiftlw $1, %k3, %k3 -; KNL_X32-NEXT: movl {{[0-9]+}}(%esp), %eax -; KNL_X32-NEXT: kmovw %eax, %k4 -; KNL_X32-NEXT: kshiftlw $15, %k4, %k4 -; KNL_X32-NEXT: kshiftrw $15, %k4, %k4 -; KNL_X32-NEXT: korw %k3, %k4, %k3 -; KNL_X32-NEXT: kshiftlw $14, %k3, %k3 -; KNL_X32-NEXT: kshiftrw $14, %k3, %k3 -; KNL_X32-NEXT: korw %k2, %k3, %k2 -; KNL_X32-NEXT: kshiftlw $12, %k2, %k2 -; KNL_X32-NEXT: kshiftrw $12, %k2, %k2 -; KNL_X32-NEXT: korw %k1, %k2, %k1 -; KNL_X32-NEXT: kunpckbw %k0, %k1, %k0 -; KNL_X32-NEXT: movl {{[0-9]+}}(%esp), %eax -; KNL_X32-NEXT: kmovw %eax, %k1 -; KNL_X32-NEXT: kshiftlw $1, %k1, %k1 -; KNL_X32-NEXT: movl {{[0-9]+}}(%esp), %eax -; KNL_X32-NEXT: kmovw %eax, %k2 +; KNL_X32-NEXT: kxorw %k3, %k2, %k2 ; KNL_X32-NEXT: kshiftlw $15, %k2, %k2 -; KNL_X32-NEXT: kshiftrw $15, %k2, %k2 -; KNL_X32-NEXT: korw %k1, %k2, %k1 -; KNL_X32-NEXT: kshiftlw $2, %k1, %k1 -; KNL_X32-NEXT: movl {{[0-9]+}}(%esp), %eax -; KNL_X32-NEXT: kmovw %eax, %k2 -; KNL_X32-NEXT: kshiftlw $1, %k2, %k2 -; KNL_X32-NEXT: movl {{[0-9]+}}(%esp), %eax +; KNL_X32-NEXT: kshiftrw $8, %k2, %k2 +; KNL_X32-NEXT: kxorw %k2, %k0, %k0 +; KNL_X32-NEXT: kshiftrw $8, %k0, %k2 +; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al ; KNL_X32-NEXT: kmovw %eax, %k3 -; KNL_X32-NEXT: kshiftlw $15, %k3, %k3 -; KNL_X32-NEXT: kshiftrw $15, %k3, %k3 -; KNL_X32-NEXT: korw %k2, %k3, %k2 +; KNL_X32-NEXT: kxorw %k3, %k2, %k2 +; KNL_X32-NEXT: kshiftlw $15, %k2, %k2 +; KNL_X32-NEXT: kshiftrw $7, %k2, %k2 +; KNL_X32-NEXT: kxorw %k2, %k0, %k0 +; KNL_X32-NEXT: kshiftrw $9, %k0, %k2 +; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al +; KNL_X32-NEXT: kmovw %eax, %k3 +; KNL_X32-NEXT: kxorw %k3, %k2, %k2 +; KNL_X32-NEXT: kshiftlw $15, %k2, %k2 +; KNL_X32-NEXT: kshiftrw $6, %k2, %k2 +; KNL_X32-NEXT: kxorw %k2, %k0, %k0 +; KNL_X32-NEXT: kshiftrw $10, %k0, %k2 +; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al +; KNL_X32-NEXT: kmovw %eax, %k3 +; KNL_X32-NEXT: kxorw %k3, %k2, %k2 +; KNL_X32-NEXT: kshiftlw $15, %k2, %k2 +; KNL_X32-NEXT: kshiftrw $5, %k2, %k2 +; KNL_X32-NEXT: kxorw %k2, %k0, %k0 +; KNL_X32-NEXT: kshiftrw $11, %k0, %k2 +; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al +; KNL_X32-NEXT: kmovw %eax, %k3 +; KNL_X32-NEXT: kxorw %k3, %k2, %k2 +; KNL_X32-NEXT: kshiftlw $15, %k2, %k2 +; KNL_X32-NEXT: kshiftrw $4, %k2, %k2 +; KNL_X32-NEXT: kxorw %k2, %k0, %k0 +; KNL_X32-NEXT: kshiftrw $12, %k0, %k2 +; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al +; KNL_X32-NEXT: kmovw %eax, %k3 +; KNL_X32-NEXT: kxorw %k3, %k2, %k2 +; KNL_X32-NEXT: kshiftlw $15, %k2, %k2 +; KNL_X32-NEXT: kshiftrw $3, %k2, %k2 +; KNL_X32-NEXT: kxorw %k2, %k0, %k0 +; KNL_X32-NEXT: kshiftrw $13, %k0, %k2 +; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al +; KNL_X32-NEXT: kmovw %eax, %k3 +; KNL_X32-NEXT: kxorw %k3, %k2, %k2 +; KNL_X32-NEXT: kshiftlw $15, %k2, %k2 +; KNL_X32-NEXT: kshiftrw $2, %k2, %k2 +; KNL_X32-NEXT: kxorw %k2, %k0, %k0 +; KNL_X32-NEXT: kshiftrw $14, %k0, %k2 +; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al +; KNL_X32-NEXT: kmovw %eax, %k3 +; KNL_X32-NEXT: kxorw %k3, %k2, %k2 ; KNL_X32-NEXT: kshiftlw $14, %k2, %k2 -; KNL_X32-NEXT: kshiftrw $14, %k2, %k2 -; KNL_X32-NEXT: korw %k1, %k2, %k1 -; KNL_X32-NEXT: kshiftlw $4, %k1, %k1 -; KNL_X32-NEXT: movl {{[0-9]+}}(%esp), %eax +; KNL_X32-NEXT: kxorw %k2, %k0, %k0 +; KNL_X32-NEXT: kshiftlw $1, %k0, %k0 +; KNL_X32-NEXT: kshiftrw $1, %k0, %k0 +; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al ; KNL_X32-NEXT: kmovw %eax, %k2 -; KNL_X32-NEXT: kshiftlw $1, %k2, %k2 -; KNL_X32-NEXT: movl {{[0-9]+}}(%esp), %eax +; KNL_X32-NEXT: kshiftlw $15, %k2, %k2 +; KNL_X32-NEXT: korw %k2, %k0, %k0 +; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al +; KNL_X32-NEXT: kmovw %eax, %k2 +; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al ; KNL_X32-NEXT: kmovw %eax, %k3 -; KNL_X32-NEXT: kshiftlw $15, %k3, %k3 -; KNL_X32-NEXT: kshiftrw $15, %k3, %k3 -; KNL_X32-NEXT: korw %k2, %k3, %k2 -; KNL_X32-NEXT: kshiftlw $2, %k2, %k2 -; KNL_X32-NEXT: movl {{[0-9]+}}(%esp), %eax +; KNL_X32-NEXT: kxorw %k1, %k3, %k1 +; KNL_X32-NEXT: kshiftrw $2, %k1, %k3 +; KNL_X32-NEXT: kxorw %k2, %k3, %k2 +; KNL_X32-NEXT: kshiftlw $15, %k2, %k2 +; KNL_X32-NEXT: kshiftrw $13, %k2, %k2 +; KNL_X32-NEXT: kxorw %k2, %k1, %k1 +; KNL_X32-NEXT: kshiftrw $3, %k1, %k2 +; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al ; KNL_X32-NEXT: kmovw %eax, %k3 -; KNL_X32-NEXT: kshiftlw $1, %k3, %k3 -; KNL_X32-NEXT: movl {{[0-9]+}}(%esp), %eax -; KNL_X32-NEXT: kmovw %eax, %k4 -; KNL_X32-NEXT: kshiftlw $15, %k4, %k4 -; KNL_X32-NEXT: kshiftrw $15, %k4, %k4 -; KNL_X32-NEXT: korw %k3, %k4, %k3 -; KNL_X32-NEXT: kshiftlw $14, %k3, %k3 -; KNL_X32-NEXT: kshiftrw $14, %k3, %k3 -; KNL_X32-NEXT: korw %k2, %k3, %k2 -; KNL_X32-NEXT: kshiftlw $12, %k2, %k2 +; KNL_X32-NEXT: kxorw %k3, %k2, %k2 +; KNL_X32-NEXT: kshiftlw $15, %k2, %k2 ; KNL_X32-NEXT: kshiftrw $12, %k2, %k2 -; KNL_X32-NEXT: korw %k1, %k2, %k1 -; KNL_X32-NEXT: movl {{[0-9]+}}(%esp), %eax -; KNL_X32-NEXT: kmovw %eax, %k2 -; KNL_X32-NEXT: kshiftlw $1, %k2, %k2 -; KNL_X32-NEXT: movl {{[0-9]+}}(%esp), %eax +; KNL_X32-NEXT: kxorw %k2, %k1, %k1 +; KNL_X32-NEXT: kshiftrw $4, %k1, %k2 +; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al ; KNL_X32-NEXT: kmovw %eax, %k3 -; KNL_X32-NEXT: kshiftlw $15, %k3, %k3 -; KNL_X32-NEXT: kshiftrw $15, %k3, %k3 -; KNL_X32-NEXT: korw %k2, %k3, %k2 -; KNL_X32-NEXT: kshiftlw $2, %k2, %k2 -; KNL_X32-NEXT: movl {{[0-9]+}}(%esp), %eax +; KNL_X32-NEXT: kxorw %k3, %k2, %k2 +; KNL_X32-NEXT: kshiftlw $15, %k2, %k2 +; KNL_X32-NEXT: kshiftrw $11, %k2, %k2 +; KNL_X32-NEXT: kxorw %k2, %k1, %k1 +; KNL_X32-NEXT: kshiftrw $5, %k1, %k2 +; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al ; KNL_X32-NEXT: kmovw %eax, %k3 -; KNL_X32-NEXT: kshiftlw $1, %k3, %k3 -; KNL_X32-NEXT: movl {{[0-9]+}}(%esp), %eax -; KNL_X32-NEXT: kmovw %eax, %k4 -; KNL_X32-NEXT: kshiftlw $15, %k4, %k4 -; KNL_X32-NEXT: kshiftrw $15, %k4, %k4 -; KNL_X32-NEXT: korw %k3, %k4, %k3 -; KNL_X32-NEXT: kshiftlw $14, %k3, %k3 -; KNL_X32-NEXT: kshiftrw $14, %k3, %k3 -; KNL_X32-NEXT: korw %k2, %k3, %k2 -; KNL_X32-NEXT: movl {{[0-9]+}}(%esp), %eax +; KNL_X32-NEXT: kxorw %k3, %k2, %k2 +; KNL_X32-NEXT: kshiftlw $15, %k2, %k2 +; KNL_X32-NEXT: kshiftrw $10, %k2, %k2 +; KNL_X32-NEXT: kxorw %k2, %k1, %k1 +; KNL_X32-NEXT: kshiftrw $6, %k1, %k2 +; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al ; KNL_X32-NEXT: kmovw %eax, %k3 -; KNL_X32-NEXT: kshiftlw $1, %k3, %k3 -; KNL_X32-NEXT: movl {{[0-9]+}}(%esp), %eax -; KNL_X32-NEXT: kmovw %eax, %k4 -; KNL_X32-NEXT: kshiftlw $15, %k4, %k4 -; KNL_X32-NEXT: kshiftrw $15, %k4, %k4 -; KNL_X32-NEXT: korw %k3, %k4, %k3 -; KNL_X32-NEXT: movl {{[0-9]+}}(%esp), %eax -; KNL_X32-NEXT: kmovw %eax, %k4 -; KNL_X32-NEXT: kshiftlw $1, %k4, %k4 -; KNL_X32-NEXT: movl {{[0-9]+}}(%esp), %eax -; KNL_X32-NEXT: kmovw %eax, %k5 -; KNL_X32-NEXT: kshiftlw $15, %k5, %k5 -; KNL_X32-NEXT: kshiftrw $15, %k5, %k5 -; KNL_X32-NEXT: korw %k4, %k5, %k4 -; KNL_X32-NEXT: kshiftlw $2, %k3, %k3 -; KNL_X32-NEXT: kshiftlw $14, %k4, %k4 -; KNL_X32-NEXT: kshiftrw $14, %k4, %k4 -; KNL_X32-NEXT: korw %k3, %k4, %k3 -; KNL_X32-NEXT: kshiftlw $4, %k2, %k2 -; KNL_X32-NEXT: kshiftlw $12, %k3, %k3 -; KNL_X32-NEXT: kshiftrw $12, %k3, %k3 -; KNL_X32-NEXT: korw %k2, %k3, %k2 -; KNL_X32-NEXT: kunpckbw %k1, %k2, %k1 +; KNL_X32-NEXT: kxorw %k3, %k2, %k2 +; KNL_X32-NEXT: kshiftlw $15, %k2, %k2 +; KNL_X32-NEXT: kshiftrw $9, %k2, %k2 +; KNL_X32-NEXT: kxorw %k2, %k1, %k1 +; KNL_X32-NEXT: kshiftrw $7, %k1, %k2 +; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al +; KNL_X32-NEXT: kmovw %eax, %k3 +; KNL_X32-NEXT: kxorw %k3, %k2, %k2 +; KNL_X32-NEXT: kshiftlw $15, %k2, %k2 +; KNL_X32-NEXT: kshiftrw $8, %k2, %k2 +; KNL_X32-NEXT: kxorw %k2, %k1, %k1 +; KNL_X32-NEXT: kshiftrw $8, %k1, %k2 +; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al +; KNL_X32-NEXT: kmovw %eax, %k3 +; KNL_X32-NEXT: kxorw %k3, %k2, %k2 +; KNL_X32-NEXT: kshiftlw $15, %k2, %k2 +; KNL_X32-NEXT: kshiftrw $7, %k2, %k2 +; KNL_X32-NEXT: kxorw %k2, %k1, %k1 +; KNL_X32-NEXT: kshiftrw $9, %k1, %k2 +; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al +; KNL_X32-NEXT: kmovw %eax, %k3 +; KNL_X32-NEXT: kxorw %k3, %k2, %k2 +; KNL_X32-NEXT: kshiftlw $15, %k2, %k2 +; KNL_X32-NEXT: kshiftrw $6, %k2, %k2 +; KNL_X32-NEXT: kxorw %k2, %k1, %k1 +; KNL_X32-NEXT: kshiftrw $10, %k1, %k2 +; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al +; KNL_X32-NEXT: kmovw %eax, %k3 +; KNL_X32-NEXT: kxorw %k3, %k2, %k2 +; KNL_X32-NEXT: kshiftlw $15, %k2, %k2 +; KNL_X32-NEXT: kshiftrw $5, %k2, %k2 +; KNL_X32-NEXT: kxorw %k2, %k1, %k1 +; KNL_X32-NEXT: kshiftrw $11, %k1, %k2 +; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al +; KNL_X32-NEXT: kmovw %eax, %k3 +; KNL_X32-NEXT: kxorw %k3, %k2, %k2 +; KNL_X32-NEXT: kshiftlw $15, %k2, %k2 +; KNL_X32-NEXT: kshiftrw $4, %k2, %k2 +; KNL_X32-NEXT: kxorw %k2, %k1, %k1 +; KNL_X32-NEXT: kshiftrw $12, %k1, %k2 +; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al +; KNL_X32-NEXT: kmovw %eax, %k3 +; KNL_X32-NEXT: kxorw %k3, %k2, %k2 +; KNL_X32-NEXT: kshiftlw $15, %k2, %k2 +; KNL_X32-NEXT: kshiftrw $3, %k2, %k2 +; KNL_X32-NEXT: kxorw %k2, %k1, %k1 +; KNL_X32-NEXT: kshiftrw $13, %k1, %k2 +; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al +; KNL_X32-NEXT: kmovw %eax, %k3 +; KNL_X32-NEXT: kxorw %k3, %k2, %k2 +; KNL_X32-NEXT: kshiftlw $15, %k2, %k2 +; KNL_X32-NEXT: kshiftrw $2, %k2, %k2 +; KNL_X32-NEXT: kxorw %k2, %k1, %k1 +; KNL_X32-NEXT: kshiftrw $14, %k1, %k2 +; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al +; KNL_X32-NEXT: kmovw %eax, %k3 +; KNL_X32-NEXT: kxorw %k3, %k2, %k2 +; KNL_X32-NEXT: kshiftlw $14, %k2, %k2 +; KNL_X32-NEXT: kxorw %k2, %k1, %k1 +; KNL_X32-NEXT: kshiftlw $1, %k1, %k1 +; KNL_X32-NEXT: kshiftrw $1, %k1, %k1 +; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al +; KNL_X32-NEXT: kmovw %eax, %k2 +; KNL_X32-NEXT: kshiftlw $15, %k2, %k2 +; KNL_X32-NEXT: korw %k2, %k1, %k1 +; KNL_X32-NEXT: xorl %eax, %eax +; KNL_X32-NEXT: cmpb $0, {{[0-9]+}}(%esp) +; KNL_X32-NEXT: movl $65535, %ecx ## imm = 0xFFFF +; KNL_X32-NEXT: movl $0, %edx +; KNL_X32-NEXT: cmovnel %ecx, %edx +; KNL_X32-NEXT: cmpb $0, {{[0-9]+}}(%esp) +; KNL_X32-NEXT: cmovnel %ecx, %eax ; KNL_X32-NEXT: kandw %k0, %k1, %k0 +; KNL_X32-NEXT: kmovw %edx, %k1 +; KNL_X32-NEXT: kmovw %eax, %k2 +; KNL_X32-NEXT: kandw %k1, %k2, %k1 ; KNL_X32-NEXT: movl {{[0-9]+}}(%esp), %eax -; KNL_X32-NEXT: movl {{[0-9]+}}(%esp), %edx +; KNL_X32-NEXT: kmovw %k1, %ebx ; KNL_X32-NEXT: kshiftrw $1, %k0, %k1 -; KNL_X32-NEXT: kmovw %k1, %edi +; KNL_X32-NEXT: kmovw %k1, %esi ; KNL_X32-NEXT: kshiftrw $2, %k0, %k1 -; KNL_X32-NEXT: kmovw %k1, %ebx +; KNL_X32-NEXT: kmovw %k1, %edi ; KNL_X32-NEXT: kshiftrw $3, %k0, %k1 ; KNL_X32-NEXT: kmovw %k1, %ebp ; KNL_X32-NEXT: kshiftrw $4, %k0, %k1 -; KNL_X32-NEXT: kmovw %k1, %esi +; KNL_X32-NEXT: kmovw %k1, %edx ; KNL_X32-NEXT: kshiftrw $5, %k0, %k1 -; KNL_X32-NEXT: andl {{[0-9]+}}(%esp), %edx ; KNL_X32-NEXT: kmovw %k1, %ecx ; KNL_X32-NEXT: kshiftrw $6, %k0, %k1 -; KNL_X32-NEXT: andl $1, %edx -; KNL_X32-NEXT: movb %dl, 2(%eax) -; KNL_X32-NEXT: kmovw %k0, %edx -; KNL_X32-NEXT: andl $1, %edx -; KNL_X32-NEXT: andl $1, %edi -; KNL_X32-NEXT: leal (%edx,%edi,2), %edx -; KNL_X32-NEXT: kmovw %k1, %edi -; KNL_X32-NEXT: kshiftrw $7, %k0, %k1 ; KNL_X32-NEXT: andl $1, %ebx -; KNL_X32-NEXT: leal (%edx,%ebx,4), %edx +; KNL_X32-NEXT: movb %bl, 2(%eax) +; KNL_X32-NEXT: kmovw %k0, %ebx +; KNL_X32-NEXT: andl $1, %ebx +; KNL_X32-NEXT: andl $1, %esi +; KNL_X32-NEXT: leal (%ebx,%esi,2), %esi ; KNL_X32-NEXT: kmovw %k1, %ebx +; KNL_X32-NEXT: kshiftrw $7, %k0, %k1 +; KNL_X32-NEXT: andl $1, %edi +; KNL_X32-NEXT: leal (%esi,%edi,4), %esi +; KNL_X32-NEXT: kmovw %k1, %edi ; KNL_X32-NEXT: kshiftrw $8, %k0, %k1 ; KNL_X32-NEXT: andl $1, %ebp -; KNL_X32-NEXT: leal (%edx,%ebp,8), %edx +; KNL_X32-NEXT: leal (%esi,%ebp,8), %esi ; KNL_X32-NEXT: kmovw %k1, %ebp ; KNL_X32-NEXT: kshiftrw $9, %k0, %k1 -; KNL_X32-NEXT: andl $1, %esi -; KNL_X32-NEXT: shll $4, %esi -; KNL_X32-NEXT: orl %edx, %esi -; KNL_X32-NEXT: kmovw %k1, %edx +; KNL_X32-NEXT: andl $1, %edx +; KNL_X32-NEXT: shll $4, %edx +; KNL_X32-NEXT: orl %esi, %edx +; KNL_X32-NEXT: kmovw %k1, %esi ; KNL_X32-NEXT: kshiftrw $10, %k0, %k1 ; KNL_X32-NEXT: andl $1, %ecx ; KNL_X32-NEXT: shll $5, %ecx -; KNL_X32-NEXT: orl %esi, %ecx -; KNL_X32-NEXT: kmovw %k1, %esi +; KNL_X32-NEXT: orl %edx, %ecx +; KNL_X32-NEXT: kmovw %k1, %edx ; KNL_X32-NEXT: kshiftrw $11, %k0, %k1 -; KNL_X32-NEXT: andl $1, %edi -; KNL_X32-NEXT: shll $6, %edi ; KNL_X32-NEXT: andl $1, %ebx -; KNL_X32-NEXT: shll $7, %ebx -; KNL_X32-NEXT: orl %edi, %ebx -; KNL_X32-NEXT: kmovw %k1, %edi +; KNL_X32-NEXT: shll $6, %ebx +; KNL_X32-NEXT: andl $1, %edi +; KNL_X32-NEXT: shll $7, %edi +; KNL_X32-NEXT: orl %ebx, %edi +; KNL_X32-NEXT: kmovw %k1, %ebx ; KNL_X32-NEXT: kshiftrw $12, %k0, %k1 ; KNL_X32-NEXT: andl $1, %ebp ; KNL_X32-NEXT: shll $8, %ebp -; KNL_X32-NEXT: orl %ebx, %ebp -; KNL_X32-NEXT: kmovw %k1, %ebx +; KNL_X32-NEXT: orl %edi, %ebp +; KNL_X32-NEXT: kmovw %k1, %edi ; KNL_X32-NEXT: kshiftrw $13, %k0, %k1 -; KNL_X32-NEXT: andl $1, %edx -; KNL_X32-NEXT: shll $9, %edx -; KNL_X32-NEXT: orl %ebp, %edx +; KNL_X32-NEXT: andl $1, %esi +; KNL_X32-NEXT: shll $9, %esi +; KNL_X32-NEXT: orl %ebp, %esi ; KNL_X32-NEXT: kmovw %k1, %ebp ; KNL_X32-NEXT: kshiftrw $14, %k0, %k1 -; KNL_X32-NEXT: andl $1, %esi -; KNL_X32-NEXT: shll $10, %esi -; KNL_X32-NEXT: orl %edx, %esi -; KNL_X32-NEXT: kmovw %k1, %edx +; KNL_X32-NEXT: andl $1, %edx +; KNL_X32-NEXT: shll $10, %edx +; KNL_X32-NEXT: orl %esi, %edx +; KNL_X32-NEXT: kmovw %k1, %esi ; KNL_X32-NEXT: kshiftrw $15, %k0, %k0 -; KNL_X32-NEXT: orl %ecx, %esi +; KNL_X32-NEXT: orl %ecx, %edx ; KNL_X32-NEXT: kmovw %k0, %ecx -; KNL_X32-NEXT: andl $1, %edi -; KNL_X32-NEXT: shll $11, %edi ; KNL_X32-NEXT: andl $1, %ebx -; KNL_X32-NEXT: shll $12, %ebx -; KNL_X32-NEXT: orl %edi, %ebx +; KNL_X32-NEXT: shll $11, %ebx +; KNL_X32-NEXT: andl $1, %edi +; KNL_X32-NEXT: shll $12, %edi +; KNL_X32-NEXT: orl %ebx, %edi ; KNL_X32-NEXT: andl $1, %ebp ; KNL_X32-NEXT: shll $13, %ebp -; KNL_X32-NEXT: orl %ebx, %ebp -; KNL_X32-NEXT: andl $1, %edx -; KNL_X32-NEXT: shll $14, %edx -; KNL_X32-NEXT: orl %ebp, %edx +; KNL_X32-NEXT: orl %edi, %ebp +; KNL_X32-NEXT: andl $1, %esi +; KNL_X32-NEXT: shll $14, %esi +; KNL_X32-NEXT: orl %ebp, %esi ; KNL_X32-NEXT: andl $1, %ecx ; KNL_X32-NEXT: shll $15, %ecx -; KNL_X32-NEXT: orl %edx, %ecx ; KNL_X32-NEXT: orl %esi, %ecx +; KNL_X32-NEXT: orl %edx, %ecx ; KNL_X32-NEXT: movw %cx, (%eax) ; KNL_X32-NEXT: popl %esi ; KNL_X32-NEXT: popl %edi @@ -1070,3 +1411,1142 @@ define <17 x i1> @test16(<17 x i1> %a, <17 x i1> %b) nounwind { %c = and <17 x i1> %a, %b ret <17 x i1> %c } + +define <7 x i1> @test17(<7 x i1> %a, <7 x i1> %b, <7 x i1> %c, <7 x i1> %d, <7 x i1>%e, <7 x i1>%f, <7 x i1> %g, <7 x i1> %h, <7 x i1> %i) nounwind { +; KNL-LABEL: test17: +; KNL: ## %bb.0: +; KNL-NEXT: movq %rdi, %rax +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dl +; KNL-NEXT: kmovw %edx, %k0 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dl +; KNL-NEXT: kmovw %edx, %k2 +; KNL-NEXT: kshiftlw $15, %k0, %k1 +; KNL-NEXT: kshiftrw $14, %k1, %k1 +; KNL-NEXT: kxorw %k1, %k2, %k2 +; KNL-NEXT: kshiftrw $2, %k2, %k3 +; KNL-NEXT: kxorw %k0, %k3, %k0 +; KNL-NEXT: kshiftlw $15, %k0, %k0 +; KNL-NEXT: kshiftrw $13, %k0, %k0 +; KNL-NEXT: kxorw %k0, %k2, %k0 +; KNL-NEXT: kshiftrw $3, %k0, %k2 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dl +; KNL-NEXT: kmovw %edx, %k3 +; KNL-NEXT: kxorw %k3, %k2, %k2 +; KNL-NEXT: kshiftlw $15, %k2, %k2 +; KNL-NEXT: kshiftrw $12, %k2, %k2 +; KNL-NEXT: kxorw %k2, %k0, %k0 +; KNL-NEXT: kshiftrw $4, %k0, %k2 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dl +; KNL-NEXT: kmovw %edx, %k3 +; KNL-NEXT: kxorw %k3, %k2, %k2 +; KNL-NEXT: kshiftlw $15, %k2, %k2 +; KNL-NEXT: kshiftrw $11, %k2, %k2 +; KNL-NEXT: kxorw %k2, %k0, %k0 +; KNL-NEXT: kshiftrw $5, %k0, %k2 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dl +; KNL-NEXT: kmovw %edx, %k3 +; KNL-NEXT: kxorw %k3, %k2, %k2 +; KNL-NEXT: kshiftlw $15, %k2, %k2 +; KNL-NEXT: kshiftrw $10, %k2, %k2 +; KNL-NEXT: kxorw %k2, %k0, %k0 +; KNL-NEXT: kshiftrw $6, %k0, %k2 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dl +; KNL-NEXT: kmovw %edx, %k3 +; KNL-NEXT: kxorw %k3, %k2, %k2 +; KNL-NEXT: kshiftlw $15, %k2, %k2 +; KNL-NEXT: kshiftrw $9, %k2, %k2 +; KNL-NEXT: kxorw %k2, %k0, %k0 +; KNL-NEXT: kmovw %k0, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dl +; KNL-NEXT: kmovw %edx, %k0 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dl +; KNL-NEXT: kmovw %edx, %k2 +; KNL-NEXT: kxorw %k1, %k2, %k2 +; KNL-NEXT: kshiftrw $2, %k2, %k3 +; KNL-NEXT: kxorw %k0, %k3, %k0 +; KNL-NEXT: kshiftlw $15, %k0, %k0 +; KNL-NEXT: kshiftrw $13, %k0, %k0 +; KNL-NEXT: kxorw %k0, %k2, %k0 +; KNL-NEXT: kshiftrw $3, %k0, %k2 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dl +; KNL-NEXT: kmovw %edx, %k3 +; KNL-NEXT: kxorw %k3, %k2, %k2 +; KNL-NEXT: kshiftlw $15, %k2, %k2 +; KNL-NEXT: kshiftrw $12, %k2, %k2 +; KNL-NEXT: kxorw %k2, %k0, %k0 +; KNL-NEXT: kshiftrw $4, %k0, %k2 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dl +; KNL-NEXT: kmovw %edx, %k3 +; KNL-NEXT: kxorw %k3, %k2, %k2 +; KNL-NEXT: kshiftlw $15, %k2, %k2 +; KNL-NEXT: kshiftrw $11, %k2, %k2 +; KNL-NEXT: kxorw %k2, %k0, %k0 +; KNL-NEXT: kshiftrw $5, %k0, %k2 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dl +; KNL-NEXT: kmovw %edx, %k3 +; KNL-NEXT: kxorw %k3, %k2, %k2 +; KNL-NEXT: kshiftlw $15, %k2, %k2 +; KNL-NEXT: kshiftrw $10, %k2, %k2 +; KNL-NEXT: kxorw %k2, %k0, %k0 +; KNL-NEXT: kshiftrw $6, %k0, %k2 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dl +; KNL-NEXT: kmovw %edx, %k3 +; KNL-NEXT: kxorw %k3, %k2, %k2 +; KNL-NEXT: kshiftlw $15, %k2, %k2 +; KNL-NEXT: kshiftrw $9, %k2, %k2 +; KNL-NEXT: kxorw %k2, %k0, %k0 +; KNL-NEXT: kmovw %k0, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dl +; KNL-NEXT: kmovw %edx, %k0 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dl +; KNL-NEXT: kmovw %edx, %k3 +; KNL-NEXT: kxorw %k1, %k3, %k3 +; KNL-NEXT: kshiftrw $2, %k3, %k4 +; KNL-NEXT: kxorw %k0, %k4, %k0 +; KNL-NEXT: kshiftlw $15, %k0, %k0 +; KNL-NEXT: kshiftrw $13, %k0, %k0 +; KNL-NEXT: kxorw %k0, %k3, %k0 +; KNL-NEXT: kshiftrw $3, %k0, %k3 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dl +; KNL-NEXT: kmovw %edx, %k4 +; KNL-NEXT: kxorw %k4, %k3, %k3 +; KNL-NEXT: kshiftlw $15, %k3, %k3 +; KNL-NEXT: kshiftrw $12, %k3, %k3 +; KNL-NEXT: kxorw %k3, %k0, %k0 +; KNL-NEXT: kshiftrw $4, %k0, %k3 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dl +; KNL-NEXT: kmovw %edx, %k4 +; KNL-NEXT: kxorw %k4, %k3, %k3 +; KNL-NEXT: kshiftlw $15, %k3, %k3 +; KNL-NEXT: kshiftrw $11, %k3, %k3 +; KNL-NEXT: kxorw %k3, %k0, %k0 +; KNL-NEXT: kshiftrw $5, %k0, %k3 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dl +; KNL-NEXT: kmovw %edx, %k4 +; KNL-NEXT: kxorw %k4, %k3, %k3 +; KNL-NEXT: kshiftlw $15, %k3, %k3 +; KNL-NEXT: kshiftrw $10, %k3, %k3 +; KNL-NEXT: kxorw %k3, %k0, %k0 +; KNL-NEXT: kshiftrw $6, %k0, %k3 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dl +; KNL-NEXT: kmovw %edx, %k4 +; KNL-NEXT: kxorw %k4, %k3, %k3 +; KNL-NEXT: kshiftlw $15, %k3, %k3 +; KNL-NEXT: kshiftrw $9, %k3, %k3 +; KNL-NEXT: kxorw %k3, %k0, %k0 +; KNL-NEXT: kmovw %k0, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dl +; KNL-NEXT: kmovw %edx, %k0 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dl +; KNL-NEXT: kmovw %edx, %k4 +; KNL-NEXT: kxorw %k1, %k4, %k4 +; KNL-NEXT: kshiftrw $2, %k4, %k5 +; KNL-NEXT: kxorw %k0, %k5, %k0 +; KNL-NEXT: kshiftlw $15, %k0, %k0 +; KNL-NEXT: kshiftrw $13, %k0, %k0 +; KNL-NEXT: kxorw %k0, %k4, %k0 +; KNL-NEXT: kshiftrw $3, %k0, %k4 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dl +; KNL-NEXT: kmovw %edx, %k5 +; KNL-NEXT: kxorw %k5, %k4, %k4 +; KNL-NEXT: kshiftlw $15, %k4, %k4 +; KNL-NEXT: kshiftrw $12, %k4, %k4 +; KNL-NEXT: kxorw %k4, %k0, %k0 +; KNL-NEXT: kshiftrw $4, %k0, %k4 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dl +; KNL-NEXT: kmovw %edx, %k5 +; KNL-NEXT: kxorw %k5, %k4, %k4 +; KNL-NEXT: kshiftlw $15, %k4, %k4 +; KNL-NEXT: kshiftrw $11, %k4, %k4 +; KNL-NEXT: kxorw %k4, %k0, %k0 +; KNL-NEXT: kshiftrw $5, %k0, %k4 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dl +; KNL-NEXT: kmovw %edx, %k5 +; KNL-NEXT: kxorw %k5, %k4, %k4 +; KNL-NEXT: kshiftlw $15, %k4, %k4 +; KNL-NEXT: kshiftrw $10, %k4, %k4 +; KNL-NEXT: kxorw %k4, %k0, %k0 +; KNL-NEXT: kshiftrw $6, %k0, %k4 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dl +; KNL-NEXT: kmovw %edx, %k5 +; KNL-NEXT: kxorw %k5, %k4, %k4 +; KNL-NEXT: kshiftlw $15, %k4, %k4 +; KNL-NEXT: kshiftrw $9, %k4, %k4 +; KNL-NEXT: kxorw %k4, %k0, %k4 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dl +; KNL-NEXT: kmovw %edx, %k0 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dl +; KNL-NEXT: kmovw %edx, %k5 +; KNL-NEXT: kxorw %k1, %k5, %k5 +; KNL-NEXT: kshiftrw $2, %k5, %k6 +; KNL-NEXT: kxorw %k0, %k6, %k0 +; KNL-NEXT: kshiftlw $15, %k0, %k0 +; KNL-NEXT: kshiftrw $13, %k0, %k0 +; KNL-NEXT: kxorw %k0, %k5, %k0 +; KNL-NEXT: kshiftrw $3, %k0, %k5 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dl +; KNL-NEXT: kmovw %edx, %k6 +; KNL-NEXT: kxorw %k6, %k5, %k5 +; KNL-NEXT: kshiftlw $15, %k5, %k5 +; KNL-NEXT: kshiftrw $12, %k5, %k5 +; KNL-NEXT: kxorw %k5, %k0, %k0 +; KNL-NEXT: kshiftrw $4, %k0, %k5 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dl +; KNL-NEXT: kmovw %edx, %k6 +; KNL-NEXT: kxorw %k6, %k5, %k5 +; KNL-NEXT: kshiftlw $15, %k5, %k5 +; KNL-NEXT: kshiftrw $11, %k5, %k5 +; KNL-NEXT: kxorw %k5, %k0, %k0 +; KNL-NEXT: kshiftrw $5, %k0, %k5 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dl +; KNL-NEXT: kmovw %edx, %k6 +; KNL-NEXT: kxorw %k6, %k5, %k5 +; KNL-NEXT: kshiftlw $15, %k5, %k5 +; KNL-NEXT: kshiftrw $10, %k5, %k5 +; KNL-NEXT: kxorw %k5, %k0, %k0 +; KNL-NEXT: kshiftrw $6, %k0, %k5 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dl +; KNL-NEXT: kmovw %edx, %k6 +; KNL-NEXT: kxorw %k6, %k5, %k5 +; KNL-NEXT: kshiftlw $15, %k5, %k5 +; KNL-NEXT: kshiftrw $9, %k5, %k5 +; KNL-NEXT: kxorw %k5, %k0, %k5 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dl +; KNL-NEXT: kmovw %edx, %k0 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dl +; KNL-NEXT: kmovw %edx, %k6 +; KNL-NEXT: kxorw %k1, %k6, %k6 +; KNL-NEXT: kshiftrw $2, %k6, %k7 +; KNL-NEXT: kxorw %k0, %k7, %k0 +; KNL-NEXT: kshiftlw $15, %k0, %k0 +; KNL-NEXT: kshiftrw $13, %k0, %k0 +; KNL-NEXT: kxorw %k0, %k6, %k0 +; KNL-NEXT: kshiftrw $3, %k0, %k6 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dl +; KNL-NEXT: kmovw %edx, %k7 +; KNL-NEXT: kxorw %k7, %k6, %k6 +; KNL-NEXT: kshiftlw $15, %k6, %k6 +; KNL-NEXT: kshiftrw $12, %k6, %k6 +; KNL-NEXT: kxorw %k6, %k0, %k0 +; KNL-NEXT: kshiftrw $4, %k0, %k6 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dl +; KNL-NEXT: kmovw %edx, %k7 +; KNL-NEXT: kxorw %k7, %k6, %k6 +; KNL-NEXT: kshiftlw $15, %k6, %k6 +; KNL-NEXT: kshiftrw $11, %k6, %k6 +; KNL-NEXT: kxorw %k6, %k0, %k0 +; KNL-NEXT: kshiftrw $5, %k0, %k6 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dl +; KNL-NEXT: kmovw %edx, %k7 +; KNL-NEXT: kxorw %k7, %k6, %k6 +; KNL-NEXT: kshiftlw $15, %k6, %k6 +; KNL-NEXT: kshiftrw $10, %k6, %k6 +; KNL-NEXT: kxorw %k6, %k0, %k0 +; KNL-NEXT: kshiftrw $6, %k0, %k6 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dl +; KNL-NEXT: kmovw %edx, %k7 +; KNL-NEXT: kxorw %k7, %k6, %k6 +; KNL-NEXT: kshiftlw $15, %k6, %k6 +; KNL-NEXT: kshiftrw $9, %k6, %k6 +; KNL-NEXT: kxorw %k6, %k0, %k6 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dl +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dil +; KNL-NEXT: kmovw %edi, %k0 +; KNL-NEXT: kxorw %k1, %k0, %k0 +; KNL-NEXT: kmovw %edx, %k7 +; KNL-NEXT: kshiftrw $2, %k0, %k2 +; KNL-NEXT: kxorw %k7, %k2, %k2 +; KNL-NEXT: kshiftlw $15, %k2, %k2 +; KNL-NEXT: kshiftrw $13, %k2, %k2 +; KNL-NEXT: kxorw %k2, %k0, %k0 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dl +; KNL-NEXT: kshiftrw $3, %k0, %k2 +; KNL-NEXT: kmovw %edx, %k7 +; KNL-NEXT: kxorw %k7, %k2, %k2 +; KNL-NEXT: kshiftlw $15, %k2, %k2 +; KNL-NEXT: kshiftrw $12, %k2, %k2 +; KNL-NEXT: kxorw %k2, %k0, %k0 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dl +; KNL-NEXT: kshiftrw $4, %k0, %k2 +; KNL-NEXT: kmovw %edx, %k7 +; KNL-NEXT: kxorw %k7, %k2, %k2 +; KNL-NEXT: kshiftlw $15, %k2, %k2 +; KNL-NEXT: kshiftrw $11, %k2, %k2 +; KNL-NEXT: kxorw %k2, %k0, %k0 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dl +; KNL-NEXT: kshiftrw $5, %k0, %k2 +; KNL-NEXT: kmovw %edx, %k7 +; KNL-NEXT: kxorw %k7, %k2, %k2 +; KNL-NEXT: kshiftlw $15, %k2, %k2 +; KNL-NEXT: kshiftrw $10, %k2, %k2 +; KNL-NEXT: kxorw %k2, %k0, %k0 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dl +; KNL-NEXT: kshiftrw $6, %k0, %k2 +; KNL-NEXT: kmovw %edx, %k7 +; KNL-NEXT: kxorw %k7, %k2, %k2 +; KNL-NEXT: kshiftlw $15, %k2, %k2 +; KNL-NEXT: kshiftrw $9, %k2, %k2 +; KNL-NEXT: kxorw %k2, %k0, %k7 +; KNL-NEXT: kmovw %esi, %k0 +; KNL-NEXT: kxorw %k1, %k0, %k0 +; KNL-NEXT: kmovw %ecx, %k2 +; KNL-NEXT: kshiftrw $2, %k0, %k3 +; KNL-NEXT: kxorw %k2, %k3, %k2 +; KNL-NEXT: kshiftlw $15, %k2, %k2 +; KNL-NEXT: kshiftrw $13, %k2, %k2 +; KNL-NEXT: kxorw %k2, %k0, %k0 +; KNL-NEXT: kshiftrw $3, %k0, %k2 +; KNL-NEXT: kmovw %r8d, %k3 +; KNL-NEXT: kxorw %k3, %k2, %k2 +; KNL-NEXT: kshiftlw $15, %k2, %k2 +; KNL-NEXT: kshiftrw $12, %k2, %k2 +; KNL-NEXT: kxorw %k2, %k0, %k0 +; KNL-NEXT: kshiftrw $4, %k0, %k2 +; KNL-NEXT: kmovw %r9d, %k3 +; KNL-NEXT: kxorw %k3, %k2, %k2 +; KNL-NEXT: kshiftlw $15, %k2, %k2 +; KNL-NEXT: kshiftrw $11, %k2, %k2 +; KNL-NEXT: kxorw %k2, %k0, %k0 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %cl +; KNL-NEXT: kshiftrw $5, %k0, %k2 +; KNL-NEXT: kmovw %ecx, %k3 +; KNL-NEXT: kxorw %k3, %k2, %k2 +; KNL-NEXT: kshiftlw $15, %k2, %k2 +; KNL-NEXT: kshiftrw $10, %k2, %k2 +; KNL-NEXT: kxorw %k2, %k0, %k0 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %cl +; KNL-NEXT: kshiftrw $6, %k0, %k2 +; KNL-NEXT: kmovw %ecx, %k3 +; KNL-NEXT: kxorw %k3, %k2, %k2 +; KNL-NEXT: kshiftlw $15, %k2, %k2 +; KNL-NEXT: kshiftrw $9, %k2, %k2 +; KNL-NEXT: kxorw %k2, %k0, %k0 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %cl +; KNL-NEXT: kmovw %ecx, %k2 +; KNL-NEXT: kxorw %k1, %k2, %k1 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %cl +; KNL-NEXT: kmovw %ecx, %k2 +; KNL-NEXT: kshiftrw $2, %k1, %k3 +; KNL-NEXT: kxorw %k2, %k3, %k2 +; KNL-NEXT: kshiftlw $15, %k2, %k2 +; KNL-NEXT: kshiftrw $13, %k2, %k2 +; KNL-NEXT: kxorw %k2, %k1, %k1 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %cl +; KNL-NEXT: kshiftrw $3, %k1, %k2 +; KNL-NEXT: kmovw %ecx, %k3 +; KNL-NEXT: kxorw %k3, %k2, %k2 +; KNL-NEXT: kshiftlw $15, %k2, %k2 +; KNL-NEXT: kshiftrw $12, %k2, %k2 +; KNL-NEXT: kxorw %k2, %k1, %k1 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %cl +; KNL-NEXT: kshiftrw $4, %k1, %k2 +; KNL-NEXT: kmovw %ecx, %k3 +; KNL-NEXT: kxorw %k3, %k2, %k2 +; KNL-NEXT: kshiftlw $15, %k2, %k2 +; KNL-NEXT: kshiftrw $11, %k2, %k2 +; KNL-NEXT: kxorw %k2, %k1, %k1 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %cl +; KNL-NEXT: kshiftrw $5, %k1, %k2 +; KNL-NEXT: kmovw %ecx, %k3 +; KNL-NEXT: kxorw %k3, %k2, %k2 +; KNL-NEXT: kshiftlw $15, %k2, %k2 +; KNL-NEXT: kshiftrw $10, %k2, %k2 +; KNL-NEXT: kxorw %k2, %k1, %k1 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %cl +; KNL-NEXT: kshiftrw $6, %k1, %k2 +; KNL-NEXT: kmovw %ecx, %k3 +; KNL-NEXT: kxorw %k3, %k2, %k2 +; KNL-NEXT: kshiftlw $15, %k2, %k2 +; KNL-NEXT: kshiftrw $9, %k2, %k2 +; KNL-NEXT: kxorw %k2, %k1, %k1 +; KNL-NEXT: kandw %k1, %k0, %k0 +; KNL-NEXT: kandw %k7, %k0, %k0 +; KNL-NEXT: kandw %k6, %k0, %k0 +; KNL-NEXT: kandw %k5, %k0, %k0 +; KNL-NEXT: kandw %k4, %k0, %k0 +; KNL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 ## 2-byte Reload +; KNL-NEXT: kandw %k1, %k0, %k0 +; KNL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 ## 2-byte Reload +; KNL-NEXT: kandw %k1, %k0, %k0 +; KNL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 ## 2-byte Reload +; KNL-NEXT: kandw %k1, %k0, %k0 +; KNL-NEXT: kshiftrw $6, %k0, %k1 +; KNL-NEXT: kmovw %k1, %r8d +; KNL-NEXT: kshiftrw $5, %k0, %k1 +; KNL-NEXT: kmovw %k1, %r9d +; KNL-NEXT: kshiftrw $4, %k0, %k1 +; KNL-NEXT: kmovw %k1, %r10d +; KNL-NEXT: kshiftrw $3, %k0, %k1 +; KNL-NEXT: kmovw %k1, %edi +; KNL-NEXT: kshiftrw $2, %k0, %k1 +; KNL-NEXT: kmovw %k1, %ecx +; KNL-NEXT: kshiftrw $1, %k0, %k1 +; KNL-NEXT: kmovw %k1, %edx +; KNL-NEXT: kmovw %k0, %esi +; KNL-NEXT: andb $1, %sil +; KNL-NEXT: andb $1, %dl +; KNL-NEXT: addb %dl, %dl +; KNL-NEXT: orb %sil, %dl +; KNL-NEXT: andb $1, %cl +; KNL-NEXT: shlb $2, %cl +; KNL-NEXT: orb %dl, %cl +; KNL-NEXT: andb $1, %dil +; KNL-NEXT: shlb $3, %dil +; KNL-NEXT: orb %cl, %dil +; KNL-NEXT: andb $1, %r10b +; KNL-NEXT: shlb $4, %r10b +; KNL-NEXT: orb %dil, %r10b +; KNL-NEXT: andb $1, %r9b +; KNL-NEXT: shlb $5, %r9b +; KNL-NEXT: orb %r10b, %r9b +; KNL-NEXT: shlb $6, %r8b +; KNL-NEXT: orb %r9b, %r8b +; KNL-NEXT: andb $127, %r8b +; KNL-NEXT: movb %r8b, (%rax) +; KNL-NEXT: retq +; +; SKX-LABEL: test17: +; SKX: ## %bb.0: +; SKX-NEXT: movq %rdi, %rax +; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k1 +; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k2 +; SKX-NEXT: kshiftlb $7, %k0, %k0 +; SKX-NEXT: kshiftrb $6, %k0, %k0 +; SKX-NEXT: kxorb %k0, %k2, %k2 +; SKX-NEXT: kshiftrb $2, %k2, %k3 +; SKX-NEXT: kxorb %k1, %k3, %k1 +; SKX-NEXT: kshiftlb $7, %k1, %k1 +; SKX-NEXT: kshiftrb $5, %k1, %k1 +; SKX-NEXT: kxorb %k1, %k2, %k1 +; SKX-NEXT: kshiftrb $3, %k1, %k2 +; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k3 +; SKX-NEXT: kxorb %k3, %k2, %k2 +; SKX-NEXT: kshiftlb $7, %k2, %k2 +; SKX-NEXT: kshiftrb $4, %k2, %k2 +; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k3 +; SKX-NEXT: kxorb %k2, %k1, %k1 +; SKX-NEXT: kshiftrb $4, %k1, %k2 +; SKX-NEXT: kxorb %k3, %k2, %k2 +; SKX-NEXT: kshiftlb $7, %k2, %k2 +; SKX-NEXT: kshiftrb $3, %k2, %k2 +; SKX-NEXT: kxorb %k2, %k1, %k1 +; SKX-NEXT: kshiftrb $5, %k1, %k2 +; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k3 +; SKX-NEXT: kxorb %k3, %k2, %k2 +; SKX-NEXT: kshiftlb $7, %k2, %k2 +; SKX-NEXT: kshiftrb $2, %k2, %k2 +; SKX-NEXT: kxorb %k2, %k1, %k1 +; SKX-NEXT: kshiftrb $6, %k1, %k2 +; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k3 +; SKX-NEXT: kxorb %k3, %k2, %k2 +; SKX-NEXT: kshiftlb $7, %k2, %k2 +; SKX-NEXT: kshiftrb $1, %k2, %k2 +; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k3 +; SKX-NEXT: kxorb %k2, %k1, %k1 +; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k2 +; SKX-NEXT: kxorb %k0, %k2, %k2 +; SKX-NEXT: kshiftrb $2, %k2, %k4 +; SKX-NEXT: kxorb %k3, %k4, %k3 +; SKX-NEXT: kshiftlb $7, %k3, %k3 +; SKX-NEXT: kshiftrb $5, %k3, %k3 +; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k4 +; SKX-NEXT: kxorb %k3, %k2, %k2 +; SKX-NEXT: kshiftrb $3, %k2, %k3 +; SKX-NEXT: kxorb %k4, %k3, %k3 +; SKX-NEXT: kshiftlb $7, %k3, %k3 +; SKX-NEXT: kshiftrb $4, %k3, %k3 +; SKX-NEXT: kxorb %k3, %k2, %k2 +; SKX-NEXT: kshiftrb $4, %k2, %k3 +; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k4 +; SKX-NEXT: kxorb %k4, %k3, %k3 +; SKX-NEXT: kshiftlb $7, %k3, %k3 +; SKX-NEXT: kshiftrb $3, %k3, %k3 +; SKX-NEXT: kxorb %k3, %k2, %k2 +; SKX-NEXT: kshiftrb $5, %k2, %k3 +; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k4 +; SKX-NEXT: kxorb %k4, %k3, %k3 +; SKX-NEXT: kshiftlb $7, %k3, %k3 +; SKX-NEXT: kshiftrb $2, %k3, %k3 +; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k4 +; SKX-NEXT: kxorb %k3, %k2, %k2 +; SKX-NEXT: kshiftrb $6, %k2, %k3 +; SKX-NEXT: kxorb %k4, %k3, %k3 +; SKX-NEXT: kshiftlb $7, %k3, %k3 +; SKX-NEXT: kshiftrb $1, %k3, %k3 +; SKX-NEXT: kxorb %k3, %k2, %k2 +; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k3 +; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k4 +; SKX-NEXT: kandb %k1, %k2, %k1 +; SKX-NEXT: kxorb %k0, %k4, %k2 +; SKX-NEXT: kshiftrb $2, %k2, %k4 +; SKX-NEXT: kxorb %k3, %k4, %k3 +; SKX-NEXT: kshiftlb $7, %k3, %k3 +; SKX-NEXT: kshiftrb $5, %k3, %k3 +; SKX-NEXT: kxorb %k3, %k2, %k2 +; SKX-NEXT: kshiftrb $3, %k2, %k3 +; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k4 +; SKX-NEXT: kxorb %k4, %k3, %k3 +; SKX-NEXT: kshiftlb $7, %k3, %k3 +; SKX-NEXT: kshiftrb $4, %k3, %k3 +; SKX-NEXT: kxorb %k3, %k2, %k2 +; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k3 +; SKX-NEXT: kshiftrb $4, %k2, %k4 +; SKX-NEXT: kxorb %k3, %k4, %k3 +; SKX-NEXT: kshiftlb $7, %k3, %k3 +; SKX-NEXT: kshiftrb $3, %k3, %k3 +; SKX-NEXT: kxorb %k3, %k2, %k2 +; SKX-NEXT: kshiftrb $5, %k2, %k3 +; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k4 +; SKX-NEXT: kxorb %k4, %k3, %k3 +; SKX-NEXT: kshiftlb $7, %k3, %k3 +; SKX-NEXT: kshiftrb $2, %k3, %k3 +; SKX-NEXT: kxorb %k3, %k2, %k2 +; SKX-NEXT: kshiftrb $6, %k2, %k3 +; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k4 +; SKX-NEXT: kxorb %k4, %k3, %k3 +; SKX-NEXT: kshiftlb $7, %k3, %k3 +; SKX-NEXT: kshiftrb $1, %k3, %k3 +; SKX-NEXT: kxorb %k3, %k2, %k2 +; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k3 +; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k4 +; SKX-NEXT: kxorb %k0, %k4, %k4 +; SKX-NEXT: kshiftrb $2, %k4, %k5 +; SKX-NEXT: kxorb %k3, %k5, %k3 +; SKX-NEXT: kshiftlb $7, %k3, %k3 +; SKX-NEXT: kshiftrb $5, %k3, %k3 +; SKX-NEXT: kxorb %k3, %k4, %k3 +; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k4 +; SKX-NEXT: kshiftrb $3, %k3, %k5 +; SKX-NEXT: kxorb %k4, %k5, %k4 +; SKX-NEXT: kshiftlb $7, %k4, %k4 +; SKX-NEXT: kshiftrb $4, %k4, %k4 +; SKX-NEXT: kxorb %k4, %k3, %k3 +; SKX-NEXT: kshiftrb $4, %k3, %k4 +; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k5 +; SKX-NEXT: kxorb %k5, %k4, %k4 +; SKX-NEXT: kshiftlb $7, %k4, %k4 +; SKX-NEXT: kshiftrb $3, %k4, %k4 +; SKX-NEXT: kxorb %k4, %k3, %k3 +; SKX-NEXT: kshiftrb $5, %k3, %k4 +; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k5 +; SKX-NEXT: kxorb %k5, %k4, %k4 +; SKX-NEXT: kshiftlb $7, %k4, %k4 +; SKX-NEXT: kshiftrb $2, %k4, %k4 +; SKX-NEXT: kxorb %k4, %k3, %k3 +; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k4 +; SKX-NEXT: kshiftrb $6, %k3, %k5 +; SKX-NEXT: kxorb %k4, %k5, %k4 +; SKX-NEXT: kshiftlb $7, %k4, %k4 +; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k5 +; SKX-NEXT: kshiftrb $1, %k4, %k4 +; SKX-NEXT: kxorb %k4, %k3, %k3 +; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k4 +; SKX-NEXT: kxorb %k0, %k4, %k4 +; SKX-NEXT: kshiftrb $2, %k4, %k6 +; SKX-NEXT: kxorb %k5, %k6, %k5 +; SKX-NEXT: kshiftlb $7, %k5, %k5 +; SKX-NEXT: kshiftrb $5, %k5, %k5 +; SKX-NEXT: kxorb %k5, %k4, %k4 +; SKX-NEXT: kshiftrb $3, %k4, %k5 +; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k6 +; SKX-NEXT: kxorb %k6, %k5, %k5 +; SKX-NEXT: kshiftlb $7, %k5, %k5 +; SKX-NEXT: kshiftrb $4, %k5, %k5 +; SKX-NEXT: kxorb %k5, %k4, %k4 +; SKX-NEXT: kshiftrb $4, %k4, %k5 +; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k6 +; SKX-NEXT: kxorb %k6, %k5, %k5 +; SKX-NEXT: kshiftlb $7, %k5, %k5 +; SKX-NEXT: kshiftrb $3, %k5, %k5 +; SKX-NEXT: kxorb %k5, %k4, %k4 +; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k5 +; SKX-NEXT: kshiftrb $5, %k4, %k6 +; SKX-NEXT: kxorb %k5, %k6, %k5 +; SKX-NEXT: kshiftlb $7, %k5, %k5 +; SKX-NEXT: kshiftrb $2, %k5, %k5 +; SKX-NEXT: kxorb %k5, %k4, %k4 +; SKX-NEXT: kshiftrb $6, %k4, %k5 +; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k6 +; SKX-NEXT: kxorb %k6, %k5, %k5 +; SKX-NEXT: kshiftlb $7, %k5, %k5 +; SKX-NEXT: kshiftrb $1, %k5, %k5 +; SKX-NEXT: kxorb %k5, %k4, %k4 +; SKX-NEXT: kandb %k3, %k4, %k3 +; SKX-NEXT: kandb %k2, %k3, %k2 +; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k3 +; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k4 +; SKX-NEXT: kxorb %k0, %k4, %k4 +; SKX-NEXT: kshiftrb $2, %k4, %k5 +; SKX-NEXT: kxorb %k3, %k5, %k3 +; SKX-NEXT: kshiftlb $7, %k3, %k3 +; SKX-NEXT: kshiftrb $5, %k3, %k3 +; SKX-NEXT: kxorb %k3, %k4, %k3 +; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k4 +; SKX-NEXT: kshiftrb $3, %k3, %k5 +; SKX-NEXT: kxorb %k4, %k5, %k4 +; SKX-NEXT: kshiftlb $7, %k4, %k4 +; SKX-NEXT: kshiftrb $4, %k4, %k4 +; SKX-NEXT: kxorb %k4, %k3, %k3 +; SKX-NEXT: kshiftrb $4, %k3, %k4 +; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k5 +; SKX-NEXT: kxorb %k5, %k4, %k4 +; SKX-NEXT: kshiftlb $7, %k4, %k4 +; SKX-NEXT: kshiftrb $3, %k4, %k4 +; SKX-NEXT: kxorb %k4, %k3, %k3 +; SKX-NEXT: kshiftrb $5, %k3, %k4 +; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k5 +; SKX-NEXT: kxorb %k5, %k4, %k4 +; SKX-NEXT: kshiftlb $7, %k4, %k4 +; SKX-NEXT: kshiftrb $2, %k4, %k4 +; SKX-NEXT: kxorb %k4, %k3, %k3 +; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k4 +; SKX-NEXT: kshiftrb $6, %k3, %k5 +; SKX-NEXT: kxorb %k4, %k5, %k4 +; SKX-NEXT: kshiftlb $7, %k4, %k4 +; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k5 +; SKX-NEXT: kshiftrb $1, %k4, %k4 +; SKX-NEXT: kxorb %k4, %k3, %k3 +; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k4 +; SKX-NEXT: kxorb %k0, %k4, %k4 +; SKX-NEXT: kshiftrb $2, %k4, %k6 +; SKX-NEXT: kxorb %k5, %k6, %k5 +; SKX-NEXT: kshiftlb $7, %k5, %k5 +; SKX-NEXT: kshiftrb $5, %k5, %k5 +; SKX-NEXT: kxorb %k5, %k4, %k4 +; SKX-NEXT: kshiftrb $3, %k4, %k5 +; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k6 +; SKX-NEXT: kxorb %k6, %k5, %k5 +; SKX-NEXT: kshiftlb $7, %k5, %k5 +; SKX-NEXT: kshiftrb $4, %k5, %k5 +; SKX-NEXT: kxorb %k5, %k4, %k4 +; SKX-NEXT: kshiftrb $4, %k4, %k5 +; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k6 +; SKX-NEXT: kxorb %k6, %k5, %k5 +; SKX-NEXT: kshiftlb $7, %k5, %k5 +; SKX-NEXT: kshiftrb $3, %k5, %k5 +; SKX-NEXT: kxorb %k5, %k4, %k4 +; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k5 +; SKX-NEXT: kshiftrb $5, %k4, %k6 +; SKX-NEXT: kxorb %k5, %k6, %k5 +; SKX-NEXT: kshiftlb $7, %k5, %k5 +; SKX-NEXT: kshiftrb $2, %k5, %k5 +; SKX-NEXT: kxorb %k5, %k4, %k4 +; SKX-NEXT: kshiftrb $6, %k4, %k5 +; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k6 +; SKX-NEXT: kxorb %k6, %k5, %k5 +; SKX-NEXT: kshiftlb $7, %k5, %k5 +; SKX-NEXT: kshiftrb $1, %k5, %k5 +; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k6 +; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k7 +; SKX-NEXT: kxorb %k5, %k4, %k4 +; SKX-NEXT: kandb %k3, %k4, %k3 +; SKX-NEXT: kxorb %k0, %k7, %k4 +; SKX-NEXT: kshiftrb $2, %k4, %k5 +; SKX-NEXT: kxorb %k6, %k5, %k5 +; SKX-NEXT: kshiftlb $7, %k5, %k5 +; SKX-NEXT: kshiftrb $5, %k5, %k5 +; SKX-NEXT: kxorb %k5, %k4, %k4 +; SKX-NEXT: kshiftrb $3, %k4, %k5 +; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k6 +; SKX-NEXT: kxorb %k6, %k5, %k5 +; SKX-NEXT: kshiftlb $7, %k5, %k5 +; SKX-NEXT: kshiftrb $4, %k5, %k5 +; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k6 +; SKX-NEXT: kxorb %k5, %k4, %k4 +; SKX-NEXT: kshiftrb $4, %k4, %k5 +; SKX-NEXT: kxorb %k6, %k5, %k5 +; SKX-NEXT: kshiftlb $7, %k5, %k5 +; SKX-NEXT: kshiftrb $3, %k5, %k5 +; SKX-NEXT: kxorb %k5, %k4, %k4 +; SKX-NEXT: kshiftrb $5, %k4, %k5 +; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k6 +; SKX-NEXT: kxorb %k6, %k5, %k5 +; SKX-NEXT: kshiftlb $7, %k5, %k5 +; SKX-NEXT: kshiftrb $2, %k5, %k5 +; SKX-NEXT: kxorb %k5, %k4, %k4 +; SKX-NEXT: kshiftrb $6, %k4, %k5 +; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k6 +; SKX-NEXT: kxorb %k6, %k5, %k5 +; SKX-NEXT: kshiftlb $7, %k5, %k5 +; SKX-NEXT: kshiftrb $1, %k5, %k5 +; SKX-NEXT: kxorb %k5, %k4, %k4 +; SKX-NEXT: kmovd %ecx, %k5 +; SKX-NEXT: kmovd %esi, %k6 +; SKX-NEXT: kxorb %k0, %k6, %k0 +; SKX-NEXT: kshiftrb $2, %k0, %k6 +; SKX-NEXT: kxorb %k5, %k6, %k5 +; SKX-NEXT: kshiftlb $7, %k5, %k5 +; SKX-NEXT: kshiftrb $5, %k5, %k5 +; SKX-NEXT: kxorb %k5, %k0, %k0 +; SKX-NEXT: kshiftrb $3, %k0, %k5 +; SKX-NEXT: kmovd %r8d, %k6 +; SKX-NEXT: kxorb %k6, %k5, %k5 +; SKX-NEXT: kshiftlb $7, %k5, %k5 +; SKX-NEXT: kshiftrb $4, %k5, %k5 +; SKX-NEXT: kxorb %k5, %k0, %k0 +; SKX-NEXT: kshiftrb $4, %k0, %k5 +; SKX-NEXT: kmovd %r9d, %k6 +; SKX-NEXT: kxorb %k6, %k5, %k5 +; SKX-NEXT: kshiftlb $7, %k5, %k5 +; SKX-NEXT: kshiftrb $3, %k5, %k5 +; SKX-NEXT: kxorb %k5, %k0, %k0 +; SKX-NEXT: kshiftrb $5, %k0, %k5 +; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k6 +; SKX-NEXT: kxorb %k6, %k5, %k5 +; SKX-NEXT: kshiftlb $7, %k5, %k5 +; SKX-NEXT: kshiftrb $2, %k5, %k5 +; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k6 +; SKX-NEXT: kxorb %k5, %k0, %k0 +; SKX-NEXT: kshiftrb $6, %k0, %k5 +; SKX-NEXT: kxorb %k6, %k5, %k5 +; SKX-NEXT: kshiftlb $7, %k5, %k5 +; SKX-NEXT: kshiftrb $1, %k5, %k5 +; SKX-NEXT: kxorb %k5, %k0, %k0 +; SKX-NEXT: kandb %k4, %k0, %k0 +; SKX-NEXT: kandb %k3, %k0, %k0 +; SKX-NEXT: kandb %k2, %k0, %k0 +; SKX-NEXT: kandb %k1, %k0, %k0 +; SKX-NEXT: kshiftrb $6, %k0, %k1 +; SKX-NEXT: kmovd %k1, %r8d +; SKX-NEXT: kshiftrb $5, %k0, %k1 +; SKX-NEXT: kmovd %k1, %r9d +; SKX-NEXT: kshiftrb $4, %k0, %k1 +; SKX-NEXT: kmovd %k1, %r10d +; SKX-NEXT: kshiftrb $3, %k0, %k1 +; SKX-NEXT: kmovd %k1, %edi +; SKX-NEXT: kshiftrb $2, %k0, %k1 +; SKX-NEXT: kmovd %k1, %ecx +; SKX-NEXT: kshiftrb $1, %k0, %k1 +; SKX-NEXT: kmovd %k1, %edx +; SKX-NEXT: kmovd %k0, %esi +; SKX-NEXT: andb $1, %sil +; SKX-NEXT: andb $1, %dl +; SKX-NEXT: addb %dl, %dl +; SKX-NEXT: orb %sil, %dl +; SKX-NEXT: andb $1, %cl +; SKX-NEXT: shlb $2, %cl +; SKX-NEXT: orb %dl, %cl +; SKX-NEXT: andb $1, %dil +; SKX-NEXT: shlb $3, %dil +; SKX-NEXT: orb %cl, %dil +; SKX-NEXT: andb $1, %r10b +; SKX-NEXT: shlb $4, %r10b +; SKX-NEXT: orb %dil, %r10b +; SKX-NEXT: andb $1, %r9b +; SKX-NEXT: shlb $5, %r9b +; SKX-NEXT: orb %r10b, %r9b +; SKX-NEXT: shlb $6, %r8b +; SKX-NEXT: orb %r9b, %r8b +; SKX-NEXT: andb $127, %r8b +; SKX-NEXT: movb %r8b, (%rax) +; SKX-NEXT: retq +; +; KNL_X32-LABEL: test17: +; KNL_X32: ## %bb.0: +; KNL_X32-NEXT: pushl %ebx +; KNL_X32-NEXT: subl $8, %esp +; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al +; KNL_X32-NEXT: kmovw %eax, %k0 +; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al +; KNL_X32-NEXT: kmovw %eax, %k2 +; KNL_X32-NEXT: kshiftlw $15, %k0, %k1 +; KNL_X32-NEXT: kshiftrw $14, %k1, %k1 +; KNL_X32-NEXT: kxorw %k1, %k2, %k2 +; KNL_X32-NEXT: kshiftrw $2, %k2, %k3 +; KNL_X32-NEXT: kxorw %k0, %k3, %k0 +; KNL_X32-NEXT: kshiftlw $15, %k0, %k0 +; KNL_X32-NEXT: kshiftrw $13, %k0, %k0 +; KNL_X32-NEXT: kxorw %k0, %k2, %k0 +; KNL_X32-NEXT: kshiftrw $3, %k0, %k2 +; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al +; KNL_X32-NEXT: kmovw %eax, %k3 +; KNL_X32-NEXT: kxorw %k3, %k2, %k2 +; KNL_X32-NEXT: kshiftlw $15, %k2, %k2 +; KNL_X32-NEXT: kshiftrw $12, %k2, %k2 +; KNL_X32-NEXT: kxorw %k2, %k0, %k0 +; KNL_X32-NEXT: kshiftrw $4, %k0, %k2 +; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al +; KNL_X32-NEXT: kmovw %eax, %k3 +; KNL_X32-NEXT: kxorw %k3, %k2, %k2 +; KNL_X32-NEXT: kshiftlw $15, %k2, %k2 +; KNL_X32-NEXT: kshiftrw $11, %k2, %k2 +; KNL_X32-NEXT: kxorw %k2, %k0, %k0 +; KNL_X32-NEXT: kshiftrw $5, %k0, %k2 +; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al +; KNL_X32-NEXT: kmovw %eax, %k3 +; KNL_X32-NEXT: kxorw %k3, %k2, %k2 +; KNL_X32-NEXT: kshiftlw $15, %k2, %k2 +; KNL_X32-NEXT: kshiftrw $10, %k2, %k2 +; KNL_X32-NEXT: kxorw %k2, %k0, %k0 +; KNL_X32-NEXT: kshiftrw $6, %k0, %k2 +; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al +; KNL_X32-NEXT: kmovw %eax, %k3 +; KNL_X32-NEXT: kxorw %k3, %k2, %k2 +; KNL_X32-NEXT: kshiftlw $15, %k2, %k2 +; KNL_X32-NEXT: kshiftrw $9, %k2, %k2 +; KNL_X32-NEXT: kxorw %k2, %k0, %k0 +; KNL_X32-NEXT: kmovw %k0, {{[-0-9]+}}(%e{{[sb]}}p) ## 2-byte Spill +; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al +; KNL_X32-NEXT: kmovw %eax, %k0 +; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al +; KNL_X32-NEXT: kmovw %eax, %k2 +; KNL_X32-NEXT: kxorw %k1, %k2, %k2 +; KNL_X32-NEXT: kshiftrw $2, %k2, %k3 +; KNL_X32-NEXT: kxorw %k0, %k3, %k0 +; KNL_X32-NEXT: kshiftlw $15, %k0, %k0 +; KNL_X32-NEXT: kshiftrw $13, %k0, %k0 +; KNL_X32-NEXT: kxorw %k0, %k2, %k0 +; KNL_X32-NEXT: kshiftrw $3, %k0, %k2 +; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al +; KNL_X32-NEXT: kmovw %eax, %k3 +; KNL_X32-NEXT: kxorw %k3, %k2, %k2 +; KNL_X32-NEXT: kshiftlw $15, %k2, %k2 +; KNL_X32-NEXT: kshiftrw $12, %k2, %k2 +; KNL_X32-NEXT: kxorw %k2, %k0, %k0 +; KNL_X32-NEXT: kshiftrw $4, %k0, %k2 +; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al +; KNL_X32-NEXT: kmovw %eax, %k3 +; KNL_X32-NEXT: kxorw %k3, %k2, %k2 +; KNL_X32-NEXT: kshiftlw $15, %k2, %k2 +; KNL_X32-NEXT: kshiftrw $11, %k2, %k2 +; KNL_X32-NEXT: kxorw %k2, %k0, %k0 +; KNL_X32-NEXT: kshiftrw $5, %k0, %k2 +; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al +; KNL_X32-NEXT: kmovw %eax, %k3 +; KNL_X32-NEXT: kxorw %k3, %k2, %k2 +; KNL_X32-NEXT: kshiftlw $15, %k2, %k2 +; KNL_X32-NEXT: kshiftrw $10, %k2, %k2 +; KNL_X32-NEXT: kxorw %k2, %k0, %k0 +; KNL_X32-NEXT: kshiftrw $6, %k0, %k2 +; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al +; KNL_X32-NEXT: kmovw %eax, %k3 +; KNL_X32-NEXT: kxorw %k3, %k2, %k2 +; KNL_X32-NEXT: kshiftlw $15, %k2, %k2 +; KNL_X32-NEXT: kshiftrw $9, %k2, %k2 +; KNL_X32-NEXT: kxorw %k2, %k0, %k0 +; KNL_X32-NEXT: kmovw %k0, {{[-0-9]+}}(%e{{[sb]}}p) ## 2-byte Spill +; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al +; KNL_X32-NEXT: kmovw %eax, %k0 +; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al +; KNL_X32-NEXT: kmovw %eax, %k3 +; KNL_X32-NEXT: kxorw %k1, %k3, %k3 +; KNL_X32-NEXT: kshiftrw $2, %k3, %k4 +; KNL_X32-NEXT: kxorw %k0, %k4, %k0 +; KNL_X32-NEXT: kshiftlw $15, %k0, %k0 +; KNL_X32-NEXT: kshiftrw $13, %k0, %k0 +; KNL_X32-NEXT: kxorw %k0, %k3, %k0 +; KNL_X32-NEXT: kshiftrw $3, %k0, %k3 +; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al +; KNL_X32-NEXT: kmovw %eax, %k4 +; KNL_X32-NEXT: kxorw %k4, %k3, %k3 +; KNL_X32-NEXT: kshiftlw $15, %k3, %k3 +; KNL_X32-NEXT: kshiftrw $12, %k3, %k3 +; KNL_X32-NEXT: kxorw %k3, %k0, %k0 +; KNL_X32-NEXT: kshiftrw $4, %k0, %k3 +; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al +; KNL_X32-NEXT: kmovw %eax, %k4 +; KNL_X32-NEXT: kxorw %k4, %k3, %k3 +; KNL_X32-NEXT: kshiftlw $15, %k3, %k3 +; KNL_X32-NEXT: kshiftrw $11, %k3, %k3 +; KNL_X32-NEXT: kxorw %k3, %k0, %k0 +; KNL_X32-NEXT: kshiftrw $5, %k0, %k3 +; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al +; KNL_X32-NEXT: kmovw %eax, %k4 +; KNL_X32-NEXT: kxorw %k4, %k3, %k3 +; KNL_X32-NEXT: kshiftlw $15, %k3, %k3 +; KNL_X32-NEXT: kshiftrw $10, %k3, %k3 +; KNL_X32-NEXT: kxorw %k3, %k0, %k0 +; KNL_X32-NEXT: kshiftrw $6, %k0, %k3 +; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al +; KNL_X32-NEXT: kmovw %eax, %k4 +; KNL_X32-NEXT: kxorw %k4, %k3, %k3 +; KNL_X32-NEXT: kshiftlw $15, %k3, %k3 +; KNL_X32-NEXT: kshiftrw $9, %k3, %k3 +; KNL_X32-NEXT: kxorw %k3, %k0, %k0 +; KNL_X32-NEXT: kmovw %k0, {{[-0-9]+}}(%e{{[sb]}}p) ## 2-byte Spill +; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al +; KNL_X32-NEXT: kmovw %eax, %k0 +; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al +; KNL_X32-NEXT: kmovw %eax, %k4 +; KNL_X32-NEXT: kxorw %k1, %k4, %k4 +; KNL_X32-NEXT: kshiftrw $2, %k4, %k5 +; KNL_X32-NEXT: kxorw %k0, %k5, %k0 +; KNL_X32-NEXT: kshiftlw $15, %k0, %k0 +; KNL_X32-NEXT: kshiftrw $13, %k0, %k0 +; KNL_X32-NEXT: kxorw %k0, %k4, %k0 +; KNL_X32-NEXT: kshiftrw $3, %k0, %k4 +; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al +; KNL_X32-NEXT: kmovw %eax, %k5 +; KNL_X32-NEXT: kxorw %k5, %k4, %k4 +; KNL_X32-NEXT: kshiftlw $15, %k4, %k4 +; KNL_X32-NEXT: kshiftrw $12, %k4, %k4 +; KNL_X32-NEXT: kxorw %k4, %k0, %k0 +; KNL_X32-NEXT: kshiftrw $4, %k0, %k4 +; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al +; KNL_X32-NEXT: kmovw %eax, %k5 +; KNL_X32-NEXT: kxorw %k5, %k4, %k4 +; KNL_X32-NEXT: kshiftlw $15, %k4, %k4 +; KNL_X32-NEXT: kshiftrw $11, %k4, %k4 +; KNL_X32-NEXT: kxorw %k4, %k0, %k0 +; KNL_X32-NEXT: kshiftrw $5, %k0, %k4 +; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al +; KNL_X32-NEXT: kmovw %eax, %k5 +; KNL_X32-NEXT: kxorw %k5, %k4, %k4 +; KNL_X32-NEXT: kshiftlw $15, %k4, %k4 +; KNL_X32-NEXT: kshiftrw $10, %k4, %k4 +; KNL_X32-NEXT: kxorw %k4, %k0, %k0 +; KNL_X32-NEXT: kshiftrw $6, %k0, %k4 +; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al +; KNL_X32-NEXT: kmovw %eax, %k5 +; KNL_X32-NEXT: kxorw %k5, %k4, %k4 +; KNL_X32-NEXT: kshiftlw $15, %k4, %k4 +; KNL_X32-NEXT: kshiftrw $9, %k4, %k4 +; KNL_X32-NEXT: kxorw %k4, %k0, %k4 +; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al +; KNL_X32-NEXT: kmovw %eax, %k0 +; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al +; KNL_X32-NEXT: kmovw %eax, %k5 +; KNL_X32-NEXT: kxorw %k1, %k5, %k5 +; KNL_X32-NEXT: kshiftrw $2, %k5, %k6 +; KNL_X32-NEXT: kxorw %k0, %k6, %k0 +; KNL_X32-NEXT: kshiftlw $15, %k0, %k0 +; KNL_X32-NEXT: kshiftrw $13, %k0, %k0 +; KNL_X32-NEXT: kxorw %k0, %k5, %k0 +; KNL_X32-NEXT: kshiftrw $3, %k0, %k5 +; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al +; KNL_X32-NEXT: kmovw %eax, %k6 +; KNL_X32-NEXT: kxorw %k6, %k5, %k5 +; KNL_X32-NEXT: kshiftlw $15, %k5, %k5 +; KNL_X32-NEXT: kshiftrw $12, %k5, %k5 +; KNL_X32-NEXT: kxorw %k5, %k0, %k0 +; KNL_X32-NEXT: kshiftrw $4, %k0, %k5 +; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al +; KNL_X32-NEXT: kmovw %eax, %k6 +; KNL_X32-NEXT: kxorw %k6, %k5, %k5 +; KNL_X32-NEXT: kshiftlw $15, %k5, %k5 +; KNL_X32-NEXT: kshiftrw $11, %k5, %k5 +; KNL_X32-NEXT: kxorw %k5, %k0, %k0 +; KNL_X32-NEXT: kshiftrw $5, %k0, %k5 +; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al +; KNL_X32-NEXT: kmovw %eax, %k6 +; KNL_X32-NEXT: kxorw %k6, %k5, %k5 +; KNL_X32-NEXT: kshiftlw $15, %k5, %k5 +; KNL_X32-NEXT: kshiftrw $10, %k5, %k5 +; KNL_X32-NEXT: kxorw %k5, %k0, %k0 +; KNL_X32-NEXT: kshiftrw $6, %k0, %k5 +; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al +; KNL_X32-NEXT: kmovw %eax, %k6 +; KNL_X32-NEXT: kxorw %k6, %k5, %k5 +; KNL_X32-NEXT: kshiftlw $15, %k5, %k5 +; KNL_X32-NEXT: kshiftrw $9, %k5, %k5 +; KNL_X32-NEXT: kxorw %k5, %k0, %k5 +; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al +; KNL_X32-NEXT: kmovw %eax, %k0 +; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al +; KNL_X32-NEXT: kmovw %eax, %k6 +; KNL_X32-NEXT: kxorw %k1, %k6, %k6 +; KNL_X32-NEXT: kshiftrw $2, %k6, %k7 +; KNL_X32-NEXT: kxorw %k0, %k7, %k0 +; KNL_X32-NEXT: kshiftlw $15, %k0, %k0 +; KNL_X32-NEXT: kshiftrw $13, %k0, %k0 +; KNL_X32-NEXT: kxorw %k0, %k6, %k0 +; KNL_X32-NEXT: kshiftrw $3, %k0, %k6 +; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al +; KNL_X32-NEXT: kmovw %eax, %k7 +; KNL_X32-NEXT: kxorw %k7, %k6, %k6 +; KNL_X32-NEXT: kshiftlw $15, %k6, %k6 +; KNL_X32-NEXT: kshiftrw $12, %k6, %k6 +; KNL_X32-NEXT: kxorw %k6, %k0, %k0 +; KNL_X32-NEXT: kshiftrw $4, %k0, %k6 +; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al +; KNL_X32-NEXT: kmovw %eax, %k7 +; KNL_X32-NEXT: kxorw %k7, %k6, %k6 +; KNL_X32-NEXT: kshiftlw $15, %k6, %k6 +; KNL_X32-NEXT: kshiftrw $11, %k6, %k6 +; KNL_X32-NEXT: kxorw %k6, %k0, %k0 +; KNL_X32-NEXT: kshiftrw $5, %k0, %k6 +; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al +; KNL_X32-NEXT: kmovw %eax, %k7 +; KNL_X32-NEXT: kxorw %k7, %k6, %k6 +; KNL_X32-NEXT: kshiftlw $15, %k6, %k6 +; KNL_X32-NEXT: kshiftrw $10, %k6, %k6 +; KNL_X32-NEXT: kxorw %k6, %k0, %k0 +; KNL_X32-NEXT: kshiftrw $6, %k0, %k6 +; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al +; KNL_X32-NEXT: kmovw %eax, %k7 +; KNL_X32-NEXT: kxorw %k7, %k6, %k6 +; KNL_X32-NEXT: kshiftlw $15, %k6, %k6 +; KNL_X32-NEXT: kshiftrw $9, %k6, %k6 +; KNL_X32-NEXT: kxorw %k6, %k0, %k6 +; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al +; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %cl +; KNL_X32-NEXT: kmovw %ecx, %k0 +; KNL_X32-NEXT: kxorw %k1, %k0, %k0 +; KNL_X32-NEXT: kmovw %eax, %k7 +; KNL_X32-NEXT: kshiftrw $2, %k0, %k2 +; KNL_X32-NEXT: kxorw %k7, %k2, %k2 +; KNL_X32-NEXT: kshiftlw $15, %k2, %k2 +; KNL_X32-NEXT: kshiftrw $13, %k2, %k2 +; KNL_X32-NEXT: kxorw %k2, %k0, %k0 +; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al +; KNL_X32-NEXT: kshiftrw $3, %k0, %k2 +; KNL_X32-NEXT: kmovw %eax, %k7 +; KNL_X32-NEXT: kxorw %k7, %k2, %k2 +; KNL_X32-NEXT: kshiftlw $15, %k2, %k2 +; KNL_X32-NEXT: kshiftrw $12, %k2, %k2 +; KNL_X32-NEXT: kxorw %k2, %k0, %k0 +; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al +; KNL_X32-NEXT: kshiftrw $4, %k0, %k2 +; KNL_X32-NEXT: kmovw %eax, %k7 +; KNL_X32-NEXT: kxorw %k7, %k2, %k2 +; KNL_X32-NEXT: kshiftlw $15, %k2, %k2 +; KNL_X32-NEXT: kshiftrw $11, %k2, %k2 +; KNL_X32-NEXT: kxorw %k2, %k0, %k0 +; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al +; KNL_X32-NEXT: kshiftrw $5, %k0, %k2 +; KNL_X32-NEXT: kmovw %eax, %k7 +; KNL_X32-NEXT: kxorw %k7, %k2, %k2 +; KNL_X32-NEXT: kshiftlw $15, %k2, %k2 +; KNL_X32-NEXT: kshiftrw $10, %k2, %k2 +; KNL_X32-NEXT: kxorw %k2, %k0, %k0 +; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al +; KNL_X32-NEXT: kshiftrw $6, %k0, %k2 +; KNL_X32-NEXT: kmovw %eax, %k7 +; KNL_X32-NEXT: kxorw %k7, %k2, %k2 +; KNL_X32-NEXT: kshiftlw $15, %k2, %k2 +; KNL_X32-NEXT: kshiftrw $9, %k2, %k2 +; KNL_X32-NEXT: kxorw %k2, %k0, %k7 +; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al +; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %cl +; KNL_X32-NEXT: kmovw %ecx, %k0 +; KNL_X32-NEXT: kxorw %k1, %k0, %k0 +; KNL_X32-NEXT: kmovw %eax, %k2 +; KNL_X32-NEXT: kshiftrw $2, %k0, %k3 +; KNL_X32-NEXT: kxorw %k2, %k3, %k2 +; KNL_X32-NEXT: kshiftlw $15, %k2, %k2 +; KNL_X32-NEXT: kshiftrw $13, %k2, %k2 +; KNL_X32-NEXT: kxorw %k2, %k0, %k0 +; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al +; KNL_X32-NEXT: kshiftrw $3, %k0, %k2 +; KNL_X32-NEXT: kmovw %eax, %k3 +; KNL_X32-NEXT: kxorw %k3, %k2, %k2 +; KNL_X32-NEXT: kshiftlw $15, %k2, %k2 +; KNL_X32-NEXT: kshiftrw $12, %k2, %k2 +; KNL_X32-NEXT: kxorw %k2, %k0, %k0 +; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al +; KNL_X32-NEXT: kshiftrw $4, %k0, %k2 +; KNL_X32-NEXT: kmovw %eax, %k3 +; KNL_X32-NEXT: kxorw %k3, %k2, %k2 +; KNL_X32-NEXT: kshiftlw $15, %k2, %k2 +; KNL_X32-NEXT: kshiftrw $11, %k2, %k2 +; KNL_X32-NEXT: kxorw %k2, %k0, %k0 +; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al +; KNL_X32-NEXT: kshiftrw $5, %k0, %k2 +; KNL_X32-NEXT: kmovw %eax, %k3 +; KNL_X32-NEXT: kxorw %k3, %k2, %k2 +; KNL_X32-NEXT: kshiftlw $15, %k2, %k2 +; KNL_X32-NEXT: kshiftrw $10, %k2, %k2 +; KNL_X32-NEXT: kxorw %k2, %k0, %k0 +; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al +; KNL_X32-NEXT: kshiftrw $6, %k0, %k2 +; KNL_X32-NEXT: kmovw %eax, %k3 +; KNL_X32-NEXT: kxorw %k3, %k2, %k2 +; KNL_X32-NEXT: kshiftlw $15, %k2, %k2 +; KNL_X32-NEXT: kshiftrw $9, %k2, %k2 +; KNL_X32-NEXT: kxorw %k2, %k0, %k0 +; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al +; KNL_X32-NEXT: kmovw %eax, %k2 +; KNL_X32-NEXT: kxorw %k1, %k2, %k1 +; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al +; KNL_X32-NEXT: kmovw %eax, %k2 +; KNL_X32-NEXT: kshiftrw $2, %k1, %k3 +; KNL_X32-NEXT: kxorw %k2, %k3, %k2 +; KNL_X32-NEXT: kshiftlw $15, %k2, %k2 +; KNL_X32-NEXT: kshiftrw $13, %k2, %k2 +; KNL_X32-NEXT: kxorw %k2, %k1, %k1 +; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al +; KNL_X32-NEXT: kshiftrw $3, %k1, %k2 +; KNL_X32-NEXT: kmovw %eax, %k3 +; KNL_X32-NEXT: kxorw %k3, %k2, %k2 +; KNL_X32-NEXT: kshiftlw $15, %k2, %k2 +; KNL_X32-NEXT: kshiftrw $12, %k2, %k2 +; KNL_X32-NEXT: kxorw %k2, %k1, %k1 +; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al +; KNL_X32-NEXT: kshiftrw $4, %k1, %k2 +; KNL_X32-NEXT: kmovw %eax, %k3 +; KNL_X32-NEXT: kxorw %k3, %k2, %k2 +; KNL_X32-NEXT: kshiftlw $15, %k2, %k2 +; KNL_X32-NEXT: kshiftrw $11, %k2, %k2 +; KNL_X32-NEXT: kxorw %k2, %k1, %k1 +; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al +; KNL_X32-NEXT: kshiftrw $5, %k1, %k2 +; KNL_X32-NEXT: kmovw %eax, %k3 +; KNL_X32-NEXT: kxorw %k3, %k2, %k2 +; KNL_X32-NEXT: kshiftlw $15, %k2, %k2 +; KNL_X32-NEXT: kshiftrw $10, %k2, %k2 +; KNL_X32-NEXT: kxorw %k2, %k1, %k1 +; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al +; KNL_X32-NEXT: kshiftrw $6, %k1, %k2 +; KNL_X32-NEXT: kmovw %eax, %k3 +; KNL_X32-NEXT: kxorw %k3, %k2, %k2 +; KNL_X32-NEXT: kshiftlw $15, %k2, %k2 +; KNL_X32-NEXT: kshiftrw $9, %k2, %k2 +; KNL_X32-NEXT: kxorw %k2, %k1, %k1 +; KNL_X32-NEXT: kandw %k1, %k0, %k0 +; KNL_X32-NEXT: kandw %k7, %k0, %k0 +; KNL_X32-NEXT: kandw %k6, %k0, %k0 +; KNL_X32-NEXT: kandw %k5, %k0, %k0 +; KNL_X32-NEXT: kandw %k4, %k0, %k0 +; KNL_X32-NEXT: kmovw {{[-0-9]+}}(%e{{[sb]}}p), %k1 ## 2-byte Reload +; KNL_X32-NEXT: kandw %k1, %k0, %k0 +; KNL_X32-NEXT: kmovw {{[-0-9]+}}(%e{{[sb]}}p), %k1 ## 2-byte Reload +; KNL_X32-NEXT: kandw %k1, %k0, %k0 +; KNL_X32-NEXT: kmovw {{[-0-9]+}}(%e{{[sb]}}p), %k1 ## 2-byte Reload +; KNL_X32-NEXT: kandw %k1, %k0, %k0 +; KNL_X32-NEXT: kshiftrw $6, %k0, %k1 +; KNL_X32-NEXT: kmovw %k1, %ecx +; KNL_X32-NEXT: kshiftrw $5, %k0, %k1 +; KNL_X32-NEXT: kmovw %k1, %eax +; KNL_X32-NEXT: kshiftrw $1, %k0, %k1 +; KNL_X32-NEXT: kmovw %k1, %edx +; KNL_X32-NEXT: kshiftrw $2, %k0, %k1 +; KNL_X32-NEXT: kmovw %k0, %ebx +; KNL_X32-NEXT: andb $1, %bl +; KNL_X32-NEXT: andb $1, %dl +; KNL_X32-NEXT: addb %dl, %dl +; KNL_X32-NEXT: orb %bl, %dl +; KNL_X32-NEXT: kmovw %k1, %ebx +; KNL_X32-NEXT: kshiftrw $3, %k0, %k1 +; KNL_X32-NEXT: andb $1, %bl +; KNL_X32-NEXT: shlb $2, %bl +; KNL_X32-NEXT: orb %dl, %bl +; KNL_X32-NEXT: kmovw %k1, %edx +; KNL_X32-NEXT: kshiftrw $4, %k0, %k0 +; KNL_X32-NEXT: andb $1, %dl +; KNL_X32-NEXT: shlb $3, %dl +; KNL_X32-NEXT: orb %bl, %dl +; KNL_X32-NEXT: kmovw %k0, %ebx +; KNL_X32-NEXT: andb $1, %bl +; KNL_X32-NEXT: shlb $4, %bl +; KNL_X32-NEXT: orb %dl, %bl +; KNL_X32-NEXT: andb $1, %al +; KNL_X32-NEXT: shlb $5, %al +; KNL_X32-NEXT: orb %bl, %al +; KNL_X32-NEXT: shlb $6, %cl +; KNL_X32-NEXT: orb %al, %cl +; KNL_X32-NEXT: movl {{[0-9]+}}(%esp), %eax +; KNL_X32-NEXT: andb $127, %cl +; KNL_X32-NEXT: movb %cl, (%eax) +; KNL_X32-NEXT: addl $8, %esp +; KNL_X32-NEXT: popl %ebx +; KNL_X32-NEXT: retl $4 + %j = and <7 x i1> %a, %b + %k = and <7 x i1> %j, %c + %l = and <7 x i1> %k, %d + %m = and <7 x i1> %l, %e + %n = and <7 x i1> %m, %f + %o = and <7 x i1> %n, %g + %p = and <7 x i1> %o, %h + %q = and <7 x i1> %p, %i + ret <7 x i1> %q +} diff --git a/llvm/test/CodeGen/X86/masked_gather_scatter.ll b/llvm/test/CodeGen/X86/masked_gather_scatter.ll index a899d1e..72757df 100644 --- a/llvm/test/CodeGen/X86/masked_gather_scatter.ll +++ b/llvm/test/CodeGen/X86/masked_gather_scatter.ll @@ -1626,75 +1626,62 @@ declare <3 x i32> @llvm.masked.gather.v3i32.v3p0i32(<3 x i32*>, i32, <3 x i1>, < define <3 x i32> @test30(<3 x i32*> %base, <3 x i32> %ind, <3 x i1> %mask, <3 x i32> %src0) { ; KNL_64-LABEL: test30: ; KNL_64: # %bb.0: -; KNL_64-NEXT: vpslld $31, %xmm2, %xmm2 -; KNL_64-NEXT: vptestmd %zmm2, %zmm2, %k0 -; KNL_64-NEXT: kshiftrw $1, %k0, %k1 -; KNL_64-NEXT: kshiftrw $2, %k0, %k2 -; KNL_64-NEXT: kmovw %k0, %eax -; KNL_64-NEXT: andb $1, %al -; KNL_64-NEXT: kmovw %k1, %ecx -; KNL_64-NEXT: andb $1, %cl -; KNL_64-NEXT: addb %cl, %cl -; KNL_64-NEXT: orb %al, %cl -; KNL_64-NEXT: kmovw %k2, %eax -; KNL_64-NEXT: andb $1, %al -; KNL_64-NEXT: shlb $2, %al -; KNL_64-NEXT: orb %cl, %al +; KNL_64-NEXT: andb $1, %dil +; KNL_64-NEXT: andb $1, %sil +; KNL_64-NEXT: addb %sil, %sil +; KNL_64-NEXT: orb %dil, %sil +; KNL_64-NEXT: andb $1, %dl +; KNL_64-NEXT: shlb $2, %dl +; KNL_64-NEXT: orb %sil, %dl ; KNL_64-NEXT: vpmovsxdq %xmm1, %ymm1 ; KNL_64-NEXT: vpsllq $2, %ymm1, %ymm1 ; KNL_64-NEXT: vpaddq %ymm1, %ymm0, %ymm0 -; KNL_64-NEXT: testb $1, %al +; KNL_64-NEXT: testb $1, %dl ; KNL_64-NEXT: jne .LBB31_1 ; KNL_64-NEXT: # %bb.2: # %else -; KNL_64-NEXT: testb $2, %al +; KNL_64-NEXT: testb $2, %dl ; KNL_64-NEXT: jne .LBB31_3 ; KNL_64-NEXT: .LBB31_4: # %else2 -; KNL_64-NEXT: testb $4, %al +; KNL_64-NEXT: testb $4, %dl ; KNL_64-NEXT: jne .LBB31_5 ; KNL_64-NEXT: .LBB31_6: # %else5 -; KNL_64-NEXT: vmovdqa %xmm3, %xmm0 +; KNL_64-NEXT: vmovdqa %xmm2, %xmm0 ; KNL_64-NEXT: vzeroupper ; KNL_64-NEXT: retq ; KNL_64-NEXT: .LBB31_1: # %cond.load -; KNL_64-NEXT: vmovq %xmm0, %rcx -; KNL_64-NEXT: vpinsrd $0, (%rcx), %xmm3, %xmm3 -; KNL_64-NEXT: testb $2, %al +; KNL_64-NEXT: vmovq %xmm0, %rax +; KNL_64-NEXT: vpinsrd $0, (%rax), %xmm2, %xmm2 +; KNL_64-NEXT: testb $2, %dl ; KNL_64-NEXT: je .LBB31_4 ; KNL_64-NEXT: .LBB31_3: # %cond.load1 -; KNL_64-NEXT: vpextrq $1, %xmm0, %rcx -; KNL_64-NEXT: vpinsrd $1, (%rcx), %xmm3, %xmm3 -; KNL_64-NEXT: testb $4, %al +; KNL_64-NEXT: vpextrq $1, %xmm0, %rax +; KNL_64-NEXT: vpinsrd $1, (%rax), %xmm2, %xmm2 +; KNL_64-NEXT: testb $4, %dl ; KNL_64-NEXT: je .LBB31_6 ; KNL_64-NEXT: .LBB31_5: # %cond.load4 ; KNL_64-NEXT: vextracti128 $1, %ymm0, %xmm0 ; KNL_64-NEXT: vmovq %xmm0, %rax -; KNL_64-NEXT: vpinsrd $2, (%rax), %xmm3, %xmm3 -; KNL_64-NEXT: vmovdqa %xmm3, %xmm0 +; KNL_64-NEXT: vpinsrd $2, (%rax), %xmm2, %xmm2 +; KNL_64-NEXT: vmovdqa %xmm2, %xmm0 ; KNL_64-NEXT: vzeroupper ; KNL_64-NEXT: retq ; ; KNL_32-LABEL: test30: ; KNL_32: # %bb.0: -; KNL_32-NEXT: subl $12, %esp -; KNL_32-NEXT: .cfi_def_cfa_offset 16 -; KNL_32-NEXT: vmovdqa %xmm0, %xmm3 -; KNL_32-NEXT: vpslld $31, %xmm2, %xmm0 -; KNL_32-NEXT: vptestmd %zmm0, %zmm0, %k0 -; KNL_32-NEXT: kshiftrw $1, %k0, %k1 -; KNL_32-NEXT: kshiftrw $2, %k0, %k2 -; KNL_32-NEXT: kmovw %k0, %eax +; KNL_32-NEXT: pushl %eax +; KNL_32-NEXT: .cfi_def_cfa_offset 8 +; KNL_32-NEXT: movb {{[0-9]+}}(%esp), %al ; KNL_32-NEXT: andb $1, %al -; KNL_32-NEXT: kmovw %k1, %ecx +; KNL_32-NEXT: movb {{[0-9]+}}(%esp), %cl ; KNL_32-NEXT: andb $1, %cl ; KNL_32-NEXT: addb %cl, %cl ; KNL_32-NEXT: orb %al, %cl -; KNL_32-NEXT: kmovw %k2, %eax +; KNL_32-NEXT: movb {{[0-9]+}}(%esp), %al ; KNL_32-NEXT: andb $1, %al ; KNL_32-NEXT: shlb $2, %al ; KNL_32-NEXT: orb %cl, %al -; KNL_32-NEXT: vmovdqa {{[0-9]+}}(%esp), %xmm0 ; KNL_32-NEXT: vpslld $2, %xmm1, %xmm1 -; KNL_32-NEXT: vpaddd %xmm1, %xmm3, %xmm1 +; KNL_32-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; KNL_32-NEXT: testb $1, %al ; KNL_32-NEXT: jne .LBB31_1 ; KNL_32-NEXT: # %bb.2: # %else @@ -1702,102 +1689,86 @@ define <3 x i32> @test30(<3 x i32*> %base, <3 x i32> %ind, <3 x i1> %mask, <3 x ; KNL_32-NEXT: jne .LBB31_3 ; KNL_32-NEXT: .LBB31_4: # %else2 ; KNL_32-NEXT: testb $4, %al -; KNL_32-NEXT: jne .LBB31_5 +; KNL_32-NEXT: je .LBB31_6 +; KNL_32-NEXT: .LBB31_5: # %cond.load4 +; KNL_32-NEXT: vpextrd $2, %xmm0, %eax +; KNL_32-NEXT: vpinsrd $2, (%eax), %xmm2, %xmm2 ; KNL_32-NEXT: .LBB31_6: # %else5 -; KNL_32-NEXT: addl $12, %esp +; KNL_32-NEXT: vmovdqa %xmm2, %xmm0 +; KNL_32-NEXT: popl %eax ; KNL_32-NEXT: .cfi_def_cfa_offset 4 -; KNL_32-NEXT: vzeroupper ; KNL_32-NEXT: retl ; KNL_32-NEXT: .LBB31_1: # %cond.load -; KNL_32-NEXT: .cfi_def_cfa_offset 16 -; KNL_32-NEXT: vmovd %xmm1, %ecx -; KNL_32-NEXT: vpinsrd $0, (%ecx), %xmm0, %xmm0 +; KNL_32-NEXT: .cfi_def_cfa_offset 8 +; KNL_32-NEXT: vmovd %xmm0, %ecx +; KNL_32-NEXT: vpinsrd $0, (%ecx), %xmm2, %xmm2 ; KNL_32-NEXT: testb $2, %al ; KNL_32-NEXT: je .LBB31_4 ; KNL_32-NEXT: .LBB31_3: # %cond.load1 -; KNL_32-NEXT: vpextrd $1, %xmm1, %ecx -; KNL_32-NEXT: vpinsrd $1, (%ecx), %xmm0, %xmm0 +; KNL_32-NEXT: vpextrd $1, %xmm0, %ecx +; KNL_32-NEXT: vpinsrd $1, (%ecx), %xmm2, %xmm2 ; KNL_32-NEXT: testb $4, %al -; KNL_32-NEXT: je .LBB31_6 -; KNL_32-NEXT: .LBB31_5: # %cond.load4 -; KNL_32-NEXT: vpextrd $2, %xmm1, %eax -; KNL_32-NEXT: vpinsrd $2, (%eax), %xmm0, %xmm0 -; KNL_32-NEXT: addl $12, %esp -; KNL_32-NEXT: .cfi_def_cfa_offset 4 -; KNL_32-NEXT: vzeroupper -; KNL_32-NEXT: retl +; KNL_32-NEXT: jne .LBB31_5 +; KNL_32-NEXT: jmp .LBB31_6 ; ; SKX-LABEL: test30: ; SKX: # %bb.0: -; SKX-NEXT: vpslld $31, %xmm2, %xmm2 -; SKX-NEXT: vpmovd2m %xmm2, %k0 -; SKX-NEXT: kshiftrb $1, %k0, %k1 -; SKX-NEXT: kshiftrb $2, %k0, %k2 -; SKX-NEXT: kmovw %k0, %eax -; SKX-NEXT: andb $1, %al -; SKX-NEXT: kmovw %k1, %ecx -; SKX-NEXT: andb $1, %cl -; SKX-NEXT: addb %cl, %cl -; SKX-NEXT: orb %al, %cl -; SKX-NEXT: kmovw %k2, %eax -; SKX-NEXT: andb $1, %al -; SKX-NEXT: shlb $2, %al -; SKX-NEXT: orb %cl, %al +; SKX-NEXT: andb $1, %dil +; SKX-NEXT: andb $1, %sil +; SKX-NEXT: addb %sil, %sil +; SKX-NEXT: orb %dil, %sil +; SKX-NEXT: andb $1, %dl +; SKX-NEXT: shlb $2, %dl +; SKX-NEXT: orb %sil, %dl ; SKX-NEXT: vpmovsxdq %xmm1, %ymm1 ; SKX-NEXT: vpsllq $2, %ymm1, %ymm1 ; SKX-NEXT: vpaddq %ymm1, %ymm0, %ymm0 -; SKX-NEXT: testb $1, %al +; SKX-NEXT: testb $1, %dl ; SKX-NEXT: jne .LBB31_1 ; SKX-NEXT: # %bb.2: # %else -; SKX-NEXT: testb $2, %al +; SKX-NEXT: testb $2, %dl ; SKX-NEXT: jne .LBB31_3 ; SKX-NEXT: .LBB31_4: # %else2 -; SKX-NEXT: testb $4, %al +; SKX-NEXT: testb $4, %dl ; SKX-NEXT: jne .LBB31_5 ; SKX-NEXT: .LBB31_6: # %else5 -; SKX-NEXT: vmovdqa %xmm3, %xmm0 +; SKX-NEXT: vmovdqa %xmm2, %xmm0 ; SKX-NEXT: vzeroupper ; SKX-NEXT: retq ; SKX-NEXT: .LBB31_1: # %cond.load -; SKX-NEXT: vmovq %xmm0, %rcx -; SKX-NEXT: vpinsrd $0, (%rcx), %xmm3, %xmm3 -; SKX-NEXT: testb $2, %al +; SKX-NEXT: vmovq %xmm0, %rax +; SKX-NEXT: vpinsrd $0, (%rax), %xmm2, %xmm2 +; SKX-NEXT: testb $2, %dl ; SKX-NEXT: je .LBB31_4 ; SKX-NEXT: .LBB31_3: # %cond.load1 -; SKX-NEXT: vpextrq $1, %xmm0, %rcx -; SKX-NEXT: vpinsrd $1, (%rcx), %xmm3, %xmm3 -; SKX-NEXT: testb $4, %al +; SKX-NEXT: vpextrq $1, %xmm0, %rax +; SKX-NEXT: vpinsrd $1, (%rax), %xmm2, %xmm2 +; SKX-NEXT: testb $4, %dl ; SKX-NEXT: je .LBB31_6 ; SKX-NEXT: .LBB31_5: # %cond.load4 ; SKX-NEXT: vextracti128 $1, %ymm0, %xmm0 ; SKX-NEXT: vmovq %xmm0, %rax -; SKX-NEXT: vpinsrd $2, (%rax), %xmm3, %xmm3 -; SKX-NEXT: vmovdqa %xmm3, %xmm0 +; SKX-NEXT: vpinsrd $2, (%rax), %xmm2, %xmm2 +; SKX-NEXT: vmovdqa %xmm2, %xmm0 ; SKX-NEXT: vzeroupper ; SKX-NEXT: retq ; ; SKX_32-LABEL: test30: ; SKX_32: # %bb.0: -; SKX_32-NEXT: subl $12, %esp -; SKX_32-NEXT: .cfi_def_cfa_offset 16 -; SKX_32-NEXT: vmovdqa %xmm0, %xmm3 -; SKX_32-NEXT: vpslld $31, %xmm2, %xmm0 -; SKX_32-NEXT: vpmovd2m %xmm0, %k0 -; SKX_32-NEXT: kshiftrb $1, %k0, %k1 -; SKX_32-NEXT: kshiftrb $2, %k0, %k2 -; SKX_32-NEXT: kmovw %k0, %eax +; SKX_32-NEXT: pushl %eax +; SKX_32-NEXT: .cfi_def_cfa_offset 8 +; SKX_32-NEXT: movb {{[0-9]+}}(%esp), %al ; SKX_32-NEXT: andb $1, %al -; SKX_32-NEXT: kmovw %k1, %ecx +; SKX_32-NEXT: movb {{[0-9]+}}(%esp), %cl ; SKX_32-NEXT: andb $1, %cl ; SKX_32-NEXT: addb %cl, %cl ; SKX_32-NEXT: orb %al, %cl -; SKX_32-NEXT: kmovw %k2, %eax +; SKX_32-NEXT: movb {{[0-9]+}}(%esp), %al ; SKX_32-NEXT: andb $1, %al ; SKX_32-NEXT: shlb $2, %al ; SKX_32-NEXT: orb %cl, %al -; SKX_32-NEXT: vmovdqa {{[0-9]+}}(%esp), %xmm0 ; SKX_32-NEXT: vpslld $2, %xmm1, %xmm1 -; SKX_32-NEXT: vpaddd %xmm1, %xmm3, %xmm1 +; SKX_32-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; SKX_32-NEXT: testb $1, %al ; SKX_32-NEXT: jne .LBB31_1 ; SKX_32-NEXT: # %bb.2: # %else @@ -1805,28 +1776,27 @@ define <3 x i32> @test30(<3 x i32*> %base, <3 x i32> %ind, <3 x i1> %mask, <3 x ; SKX_32-NEXT: jne .LBB31_3 ; SKX_32-NEXT: .LBB31_4: # %else2 ; SKX_32-NEXT: testb $4, %al -; SKX_32-NEXT: jne .LBB31_5 +; SKX_32-NEXT: je .LBB31_6 +; SKX_32-NEXT: .LBB31_5: # %cond.load4 +; SKX_32-NEXT: vpextrd $2, %xmm0, %eax +; SKX_32-NEXT: vpinsrd $2, (%eax), %xmm2, %xmm2 ; SKX_32-NEXT: .LBB31_6: # %else5 -; SKX_32-NEXT: addl $12, %esp +; SKX_32-NEXT: vmovdqa %xmm2, %xmm0 +; SKX_32-NEXT: popl %eax ; SKX_32-NEXT: .cfi_def_cfa_offset 4 ; SKX_32-NEXT: retl ; SKX_32-NEXT: .LBB31_1: # %cond.load -; SKX_32-NEXT: .cfi_def_cfa_offset 16 -; SKX_32-NEXT: vmovd %xmm1, %ecx -; SKX_32-NEXT: vpinsrd $0, (%ecx), %xmm0, %xmm0 +; SKX_32-NEXT: .cfi_def_cfa_offset 8 +; SKX_32-NEXT: vmovd %xmm0, %ecx +; SKX_32-NEXT: vpinsrd $0, (%ecx), %xmm2, %xmm2 ; SKX_32-NEXT: testb $2, %al ; SKX_32-NEXT: je .LBB31_4 ; SKX_32-NEXT: .LBB31_3: # %cond.load1 -; SKX_32-NEXT: vpextrd $1, %xmm1, %ecx -; SKX_32-NEXT: vpinsrd $1, (%ecx), %xmm0, %xmm0 +; SKX_32-NEXT: vpextrd $1, %xmm0, %ecx +; SKX_32-NEXT: vpinsrd $1, (%ecx), %xmm2, %xmm2 ; SKX_32-NEXT: testb $4, %al -; SKX_32-NEXT: je .LBB31_6 -; SKX_32-NEXT: .LBB31_5: # %cond.load4 -; SKX_32-NEXT: vpextrd $2, %xmm1, %eax -; SKX_32-NEXT: vpinsrd $2, (%eax), %xmm0, %xmm0 -; SKX_32-NEXT: addl $12, %esp -; SKX_32-NEXT: .cfi_def_cfa_offset 4 -; SKX_32-NEXT: retl +; SKX_32-NEXT: jne .LBB31_5 +; SKX_32-NEXT: jmp .LBB31_6 %sext_ind = sext <3 x i32> %ind to <3 x i64> %gep.random = getelementptr i32, <3 x i32*> %base, <3 x i64> %sext_ind diff --git a/llvm/test/CodeGen/X86/masked_store.ll b/llvm/test/CodeGen/X86/masked_store.ll index 1a6be01..8c7232c 100644 --- a/llvm/test/CodeGen/X86/masked_store.ll +++ b/llvm/test/CodeGen/X86/masked_store.ll @@ -4913,12 +4913,24 @@ define void @widen_masked_store(<3 x i32> %v, <3 x i32>* %p, <3 x i1> %mask) { ; AVX512F-LABEL: widen_masked_store: ; AVX512F: ## %bb.0: ; AVX512F-NEXT: ## kill: def $xmm0 killed $xmm0 def $zmm0 -; AVX512F-NEXT: vpslld $31, %xmm1, %xmm1 -; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k1 -; AVX512F-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} -; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512F-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[3] -; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k0 +; AVX512F-NEXT: kmovw %edx, %k0 +; AVX512F-NEXT: andl $1, %esi +; AVX512F-NEXT: kmovw %esi, %k1 +; AVX512F-NEXT: kxorw %k0, %k0, %k2 +; AVX512F-NEXT: kshiftrw $1, %k2, %k2 +; AVX512F-NEXT: kshiftlw $1, %k2, %k2 +; AVX512F-NEXT: korw %k1, %k2, %k1 +; AVX512F-NEXT: kshiftrw $1, %k1, %k2 +; AVX512F-NEXT: kxorw %k0, %k2, %k0 +; AVX512F-NEXT: kshiftlw $15, %k0, %k0 +; AVX512F-NEXT: kshiftrw $14, %k0, %k0 +; AVX512F-NEXT: kxorw %k0, %k1, %k0 +; AVX512F-NEXT: kshiftrw $2, %k0, %k1 +; AVX512F-NEXT: kmovw %ecx, %k2 +; AVX512F-NEXT: kxorw %k2, %k1, %k1 +; AVX512F-NEXT: kshiftlw $15, %k1, %k1 +; AVX512F-NEXT: kshiftrw $13, %k1, %k1 +; AVX512F-NEXT: kxorw %k1, %k0, %k0 ; AVX512F-NEXT: kshiftlw $12, %k0, %k0 ; AVX512F-NEXT: kshiftrw $12, %k0, %k1 ; AVX512F-NEXT: vmovdqu32 %zmm0, (%rdi) {%k1} @@ -4927,24 +4939,48 @@ define void @widen_masked_store(<3 x i32> %v, <3 x i32>* %p, <3 x i1> %mask) { ; ; AVX512VLDQ-LABEL: widen_masked_store: ; AVX512VLDQ: ## %bb.0: -; AVX512VLDQ-NEXT: vpslld $31, %xmm1, %xmm1 -; AVX512VLDQ-NEXT: vpmovd2m %xmm1, %k0 -; AVX512VLDQ-NEXT: vpmovm2d %k0, %xmm1 -; AVX512VLDQ-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512VLDQ-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[3] -; AVX512VLDQ-NEXT: vpmovd2m %xmm1, %k1 +; AVX512VLDQ-NEXT: kmovw %edx, %k0 +; AVX512VLDQ-NEXT: kmovw %esi, %k1 +; AVX512VLDQ-NEXT: kshiftlb $7, %k1, %k1 +; AVX512VLDQ-NEXT: kshiftrb $7, %k1, %k1 +; AVX512VLDQ-NEXT: kxorw %k0, %k0, %k2 +; AVX512VLDQ-NEXT: kshiftrb $1, %k2, %k2 +; AVX512VLDQ-NEXT: kshiftlb $1, %k2, %k2 +; AVX512VLDQ-NEXT: korb %k1, %k2, %k1 +; AVX512VLDQ-NEXT: kshiftrb $1, %k1, %k2 +; AVX512VLDQ-NEXT: kxorb %k0, %k2, %k0 +; AVX512VLDQ-NEXT: kshiftlb $7, %k0, %k0 +; AVX512VLDQ-NEXT: kshiftrb $6, %k0, %k0 +; AVX512VLDQ-NEXT: kxorb %k0, %k1, %k0 +; AVX512VLDQ-NEXT: kshiftrb $2, %k0, %k1 +; AVX512VLDQ-NEXT: kmovw %ecx, %k2 +; AVX512VLDQ-NEXT: kxorb %k2, %k1, %k1 +; AVX512VLDQ-NEXT: kshiftlb $7, %k1, %k1 +; AVX512VLDQ-NEXT: kshiftrb $5, %k1, %k1 +; AVX512VLDQ-NEXT: kxorw %k1, %k0, %k1 ; AVX512VLDQ-NEXT: vmovdqa32 %xmm0, (%rdi) {%k1} ; AVX512VLDQ-NEXT: retq ; ; AVX512VLBW-LABEL: widen_masked_store: ; AVX512VLBW: ## %bb.0: -; AVX512VLBW-NEXT: vpslld $31, %xmm1, %xmm1 -; AVX512VLBW-NEXT: vptestmd %xmm1, %xmm1, %k1 -; AVX512VLBW-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 -; AVX512VLBW-NEXT: vmovdqa32 %xmm1, %xmm1 {%k1} {z} -; AVX512VLBW-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512VLBW-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[3] -; AVX512VLBW-NEXT: vptestmd %xmm1, %xmm1, %k1 +; AVX512VLBW-NEXT: kmovd %edx, %k0 +; AVX512VLBW-NEXT: andl $1, %esi +; AVX512VLBW-NEXT: kmovw %esi, %k1 +; AVX512VLBW-NEXT: kxorw %k0, %k0, %k2 +; AVX512VLBW-NEXT: kshiftrw $1, %k2, %k2 +; AVX512VLBW-NEXT: kshiftlw $1, %k2, %k2 +; AVX512VLBW-NEXT: korw %k1, %k2, %k1 +; AVX512VLBW-NEXT: kshiftrw $1, %k1, %k2 +; AVX512VLBW-NEXT: kxorw %k0, %k2, %k0 +; AVX512VLBW-NEXT: kshiftlw $15, %k0, %k0 +; AVX512VLBW-NEXT: kshiftrw $14, %k0, %k0 +; AVX512VLBW-NEXT: kxorw %k0, %k1, %k0 +; AVX512VLBW-NEXT: kshiftrw $2, %k0, %k1 +; AVX512VLBW-NEXT: kmovd %ecx, %k2 +; AVX512VLBW-NEXT: kxorw %k2, %k1, %k1 +; AVX512VLBW-NEXT: kshiftlw $15, %k1, %k1 +; AVX512VLBW-NEXT: kshiftrw $13, %k1, %k1 +; AVX512VLBW-NEXT: kxorw %k1, %k0, %k1 ; AVX512VLBW-NEXT: vmovdqa32 %xmm0, (%rdi) {%k1} ; AVX512VLBW-NEXT: retq call void @llvm.masked.store.v3i32.p0v3i32(<3 x i32> %v, <3 x i32>* %p, i32 16, <3 x i1> %mask) -- 2.7.4