From: Craig Topper Date: Wed, 2 Oct 2019 17:47:09 +0000 (+0000) Subject: [X86] Rewrite the vXi1 subvector insertion code to not rely on the value of bits... X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=74c7d6be2843b2040f5b8ca712ee715475369196;p=platform%2Fupstream%2Fllvm.git [X86] Rewrite the vXi1 subvector insertion code to not rely on the value of bits that might be undef The previous code tried to do a trick where we would extract the subvector from the location we were inserting into. Then xor that with the new value. Take the xored value and clear out the bits above the subvector size. Then shift that xored subvector to the insert location. And finally xor that with the original vector. Since the old subvector was used in both xors, this would leave just the new subvector at the inserted location. Since the surrounding bits had been zeroed, no other bits of the original vector would be modified. Unfortunately, if the old subvector came from undef we might aggressively propagate the undef. Then we end up with the XORs not cancelling because they aren't using the same value for the two uses of the old subvector. @bkramer gave me a case that demonstrated this, but we haven't reduced it enough to make it easily readable to see what's happening. This patch uses a safer, but more costly approach. It isolates the bits above the insertion point and the bits below the insertion point and ORs those together, leaving 0 for the insertion location. It then widens the subvector with 0s in the upper bits and shifts it into position with 0s in the lower bits. Then we do another OR. 
Differential Revision: https://reviews.llvm.org/D68311 llvm-svn: 373495 --- diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index 8c837df..466a33c 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -5769,23 +5769,35 @@ static SDValue insert1BitVector(SDValue Op, SelectionDAG &DAG, // Widen the vector if needed. Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT, Undef, Vec, ZeroIdx); - // Move the current value of the bit to be replace to the lsbs. - Op = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Vec, - DAG.getTargetConstant(IdxVal, dl, MVT::i8)); - // Xor with the new bit. - Op = DAG.getNode(ISD::XOR, dl, WideOpVT, Op, SubVec); - // Shift to MSB, filling bottom bits with 0. + + // Clear the upper bits of the subvector and move it to its insert position. unsigned ShiftLeft = NumElems - SubVecNumElems; - Op = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, Op, - DAG.getTargetConstant(ShiftLeft, dl, MVT::i8)); - // Shift to the final position, filling upper bits with 0. + SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec, + DAG.getTargetConstant(ShiftLeft, dl, MVT::i8)); unsigned ShiftRight = NumElems - SubVecNumElems - IdxVal; - Op = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Op, - DAG.getTargetConstant(ShiftRight, dl, MVT::i8)); - // Xor with original vector leaving the new value. - Op = DAG.getNode(ISD::XOR, dl, WideOpVT, Vec, Op); + SubVec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, SubVec, + DAG.getTargetConstant(ShiftRight, dl, MVT::i8)); + + // Isolate the bits below the insertion point. + unsigned LowShift = NumElems - IdxVal; + SDValue Low = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, Vec, + DAG.getTargetConstant(LowShift, dl, MVT::i8)); + Low = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Low, + DAG.getTargetConstant(LowShift, dl, MVT::i8)); + + // Isolate the bits after the last inserted bit. 
+ unsigned HighShift = IdxVal + SubVecNumElems; + SDValue High = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Vec, + DAG.getTargetConstant(HighShift, dl, MVT::i8)); + High = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, High, + DAG.getTargetConstant(HighShift, dl, MVT::i8)); + + // Now OR all 3 pieces together. + Vec = DAG.getNode(ISD::OR, dl, WideOpVT, Low, High); + SubVec = DAG.getNode(ISD::OR, dl, WideOpVT, SubVec, Vec); + // Reduce to original width if needed. - return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx); + return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, SubVec, ZeroIdx); } static SDValue concatSubVectors(SDValue V1, SDValue V2, SelectionDAG &DAG, diff --git a/llvm/test/CodeGen/X86/avx512-calling-conv.ll b/llvm/test/CodeGen/X86/avx512-calling-conv.ll index 5fb114b3..b13c27e 100644 --- a/llvm/test/CodeGen/X86/avx512-calling-conv.ll +++ b/llvm/test/CodeGen/X86/avx512-calling-conv.ll @@ -531,211 +531,256 @@ define <17 x i1> @test16(<17 x i1> %a, <17 x i1> %b) nounwind { ; KNL-NEXT: pushq %r12 ; KNL-NEXT: pushq %rbx ; KNL-NEXT: movq %rdi, %rax -; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dl -; KNL-NEXT: kmovw %edx, %k1 -; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dl -; KNL-NEXT: kmovw %edx, %k2 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dil +; KNL-NEXT: kmovw %edi, %k0 ; KNL-NEXT: kshiftlw $15, %k0, %k0 -; KNL-NEXT: kshiftrw $14, %k0, %k0 -; KNL-NEXT: kxorw %k0, %k2, %k2 -; KNL-NEXT: kshiftrw $2, %k2, %k3 -; KNL-NEXT: kxorw %k1, %k3, %k1 -; KNL-NEXT: kshiftlw $15, %k1, %k1 -; KNL-NEXT: kshiftrw $13, %k1, %k1 -; KNL-NEXT: kxorw %k1, %k2, %k1 -; KNL-NEXT: kshiftrw $3, %k1, %k2 -; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dl -; KNL-NEXT: kmovw %edx, %k3 -; KNL-NEXT: kxorw %k3, %k2, %k2 -; KNL-NEXT: kshiftlw $15, %k2, %k2 -; KNL-NEXT: kshiftrw $12, %k2, %k2 -; KNL-NEXT: kxorw %k2, %k1, %k1 -; KNL-NEXT: kshiftrw $4, %k1, %k2 -; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dl -; KNL-NEXT: kmovw %edx, %k3 -; KNL-NEXT: kxorw %k3, %k2, %k2 -; KNL-NEXT: kshiftlw $15, %k2, %k2 -; KNL-NEXT: 
kshiftrw $11, %k2, %k2 -; KNL-NEXT: kxorw %k2, %k1, %k1 -; KNL-NEXT: kshiftrw $5, %k1, %k2 -; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dl -; KNL-NEXT: kmovw %edx, %k3 -; KNL-NEXT: kxorw %k3, %k2, %k2 -; KNL-NEXT: kshiftlw $15, %k2, %k2 -; KNL-NEXT: kshiftrw $10, %k2, %k2 -; KNL-NEXT: kxorw %k2, %k1, %k1 -; KNL-NEXT: kshiftrw $6, %k1, %k2 -; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dl -; KNL-NEXT: kmovw %edx, %k3 -; KNL-NEXT: kxorw %k3, %k2, %k2 -; KNL-NEXT: kshiftlw $15, %k2, %k2 -; KNL-NEXT: kshiftrw $9, %k2, %k2 -; KNL-NEXT: kxorw %k2, %k1, %k1 -; KNL-NEXT: kshiftrw $7, %k1, %k2 -; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dl -; KNL-NEXT: kmovw %edx, %k3 -; KNL-NEXT: kxorw %k3, %k2, %k2 -; KNL-NEXT: kshiftlw $15, %k2, %k2 -; KNL-NEXT: kshiftrw $8, %k2, %k2 -; KNL-NEXT: kxorw %k2, %k1, %k1 -; KNL-NEXT: kshiftrw $8, %k1, %k2 -; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dl -; KNL-NEXT: kmovw %edx, %k3 -; KNL-NEXT: kxorw %k3, %k2, %k2 -; KNL-NEXT: kshiftlw $15, %k2, %k2 -; KNL-NEXT: kshiftrw $7, %k2, %k2 -; KNL-NEXT: kxorw %k2, %k1, %k1 -; KNL-NEXT: kshiftrw $9, %k1, %k2 -; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dl -; KNL-NEXT: kmovw %edx, %k3 -; KNL-NEXT: kxorw %k3, %k2, %k2 -; KNL-NEXT: kshiftlw $15, %k2, %k2 -; KNL-NEXT: kshiftrw $6, %k2, %k2 -; KNL-NEXT: kxorw %k2, %k1, %k1 -; KNL-NEXT: kshiftrw $10, %k1, %k2 -; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dl -; KNL-NEXT: kmovw %edx, %k3 -; KNL-NEXT: kxorw %k3, %k2, %k2 -; KNL-NEXT: kshiftlw $15, %k2, %k2 -; KNL-NEXT: kshiftrw $5, %k2, %k2 -; KNL-NEXT: kxorw %k2, %k1, %k1 -; KNL-NEXT: kshiftrw $11, %k1, %k2 -; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dl -; KNL-NEXT: kmovw %edx, %k3 -; KNL-NEXT: kxorw %k3, %k2, %k2 -; KNL-NEXT: kshiftlw $15, %k2, %k2 -; KNL-NEXT: kshiftrw $4, %k2, %k2 -; KNL-NEXT: kxorw %k2, %k1, %k1 -; KNL-NEXT: kshiftrw $12, %k1, %k2 -; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dl -; KNL-NEXT: kmovw %edx, %k3 -; KNL-NEXT: kxorw %k3, %k2, %k2 -; KNL-NEXT: kshiftlw $15, %k2, %k2 -; KNL-NEXT: kshiftrw $3, %k2, %k2 -; KNL-NEXT: kxorw %k2, %k1, %k1 -; 
KNL-NEXT: kshiftrw $13, %k1, %k2 -; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dl -; KNL-NEXT: kmovw %edx, %k3 -; KNL-NEXT: kxorw %k3, %k2, %k2 -; KNL-NEXT: kshiftlw $15, %k2, %k2 -; KNL-NEXT: kshiftrw $2, %k2, %k2 -; KNL-NEXT: kxorw %k2, %k1, %k1 -; KNL-NEXT: kshiftrw $14, %k1, %k2 -; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dl -; KNL-NEXT: kmovw %edx, %k3 -; KNL-NEXT: kxorw %k3, %k2, %k2 -; KNL-NEXT: kshiftlw $14, %k2, %k2 -; KNL-NEXT: kxorw %k2, %k1, %k1 +; KNL-NEXT: kshiftrw $15, %k0, %k0 +; KNL-NEXT: kshiftlw $2, %k0, %k2 +; KNL-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dil +; KNL-NEXT: kmovw %edi, %k1 ; KNL-NEXT: kshiftlw $1, %k1, %k1 -; KNL-NEXT: kshiftrw $1, %k1, %k1 -; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dl -; KNL-NEXT: kmovw %edx, %k2 -; KNL-NEXT: kshiftlw $15, %k2, %k2 -; KNL-NEXT: korw %k2, %k1, %k1 -; KNL-NEXT: kmovw %ecx, %k2 -; KNL-NEXT: kmovw %esi, %k3 -; KNL-NEXT: kxorw %k0, %k3, %k0 -; KNL-NEXT: kshiftrw $2, %k0, %k3 -; KNL-NEXT: kxorw %k2, %k3, %k2 -; KNL-NEXT: kshiftlw $15, %k2, %k2 -; KNL-NEXT: kshiftrw $13, %k2, %k2 -; KNL-NEXT: kxorw %k2, %k0, %k0 +; KNL-NEXT: korw %k1, %k2, %k1 +; KNL-NEXT: korw %k1, %k0, %k0 +; KNL-NEXT: kshiftlw $14, %k0, %k0 +; KNL-NEXT: kshiftrw $14, %k0, %k0 +; KNL-NEXT: kshiftlw $3, %k0, %k2 +; KNL-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dil +; KNL-NEXT: kmovw %edi, %k1 +; KNL-NEXT: kshiftlw $2, %k1, %k1 +; KNL-NEXT: korw %k1, %k2, %k1 +; KNL-NEXT: korw %k1, %k0, %k0 +; KNL-NEXT: kshiftlw $13, %k0, %k0 +; KNL-NEXT: kshiftrw $13, %k0, %k0 +; KNL-NEXT: kshiftlw $4, %k0, %k2 +; KNL-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dil +; KNL-NEXT: kmovw %edi, %k1 +; KNL-NEXT: kshiftlw $3, %k1, %k1 +; KNL-NEXT: korw %k1, %k2, %k1 +; KNL-NEXT: korw %k1, %k0, %k0 +; KNL-NEXT: kshiftlw $12, %k0, %k0 +; KNL-NEXT: kshiftrw $12, %k0, %k0 +; KNL-NEXT: kshiftlw $5, %k0, %k2 +; KNL-NEXT: kmovw %k2, 
{{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dil +; KNL-NEXT: kmovw %edi, %k1 +; KNL-NEXT: kshiftlw $4, %k1, %k1 +; KNL-NEXT: korw %k1, %k2, %k1 +; KNL-NEXT: korw %k1, %k0, %k0 +; KNL-NEXT: kshiftlw $11, %k0, %k0 +; KNL-NEXT: kshiftrw $11, %k0, %k0 +; KNL-NEXT: kshiftlw $6, %k0, %k2 +; KNL-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dil +; KNL-NEXT: kmovw %edi, %k1 +; KNL-NEXT: kshiftlw $5, %k1, %k1 +; KNL-NEXT: korw %k1, %k2, %k1 +; KNL-NEXT: korw %k1, %k0, %k0 +; KNL-NEXT: kshiftlw $10, %k0, %k0 +; KNL-NEXT: kshiftrw $10, %k0, %k0 +; KNL-NEXT: kshiftlw $7, %k0, %k2 +; KNL-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dil +; KNL-NEXT: kmovw %edi, %k1 +; KNL-NEXT: kshiftlw $6, %k1, %k1 +; KNL-NEXT: korw %k1, %k2, %k1 +; KNL-NEXT: korw %k1, %k0, %k0 +; KNL-NEXT: kshiftlw $9, %k0, %k0 +; KNL-NEXT: kshiftrw $9, %k0, %k0 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dil +; KNL-NEXT: kmovw %edi, %k1 +; KNL-NEXT: kshiftlw $7, %k1, %k1 +; KNL-NEXT: kshiftlw $8, %k0, %k2 +; KNL-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill +; KNL-NEXT: korw %k1, %k2, %k1 +; KNL-NEXT: korw %k1, %k0, %k0 +; KNL-NEXT: kshiftlw $8, %k0, %k0 +; KNL-NEXT: kshiftrw $8, %k0, %k0 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dil +; KNL-NEXT: kmovw %edi, %k1 +; KNL-NEXT: kshiftlw $8, %k1, %k1 +; KNL-NEXT: kshiftlw $9, %k0, %k2 +; KNL-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill +; KNL-NEXT: korw %k1, %k2, %k1 +; KNL-NEXT: korw %k1, %k0, %k0 +; KNL-NEXT: kshiftlw $7, %k0, %k0 +; KNL-NEXT: kshiftrw $7, %k0, %k0 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dil +; KNL-NEXT: kmovw %edi, %k1 +; KNL-NEXT: kshiftlw $9, %k1, %k1 +; KNL-NEXT: kshiftlw $10, %k0, %k2 +; KNL-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill +; KNL-NEXT: korw %k1, %k2, %k1 +; KNL-NEXT: korw %k1, %k0, %k0 +; KNL-NEXT: kshiftlw $6, %k0, %k0 +; KNL-NEXT: kshiftrw $6, %k0, %k0 +; KNL-NEXT: movb 
{{[0-9]+}}(%rsp), %dil +; KNL-NEXT: kmovw %edi, %k1 +; KNL-NEXT: kshiftlw $10, %k1, %k1 +; KNL-NEXT: kshiftlw $11, %k0, %k6 +; KNL-NEXT: korw %k1, %k6, %k1 +; KNL-NEXT: korw %k1, %k0, %k0 +; KNL-NEXT: kshiftlw $5, %k0, %k0 +; KNL-NEXT: kshiftrw $5, %k0, %k0 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dil +; KNL-NEXT: kmovw %edi, %k1 +; KNL-NEXT: kshiftlw $11, %k1, %k1 +; KNL-NEXT: kshiftlw $12, %k0, %k5 +; KNL-NEXT: korw %k1, %k5, %k1 +; KNL-NEXT: korw %k1, %k0, %k0 +; KNL-NEXT: kshiftlw $4, %k0, %k0 +; KNL-NEXT: kshiftrw $4, %k0, %k0 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dil +; KNL-NEXT: kmovw %edi, %k1 +; KNL-NEXT: kshiftlw $12, %k1, %k1 +; KNL-NEXT: kshiftlw $13, %k0, %k4 +; KNL-NEXT: korw %k1, %k4, %k1 +; KNL-NEXT: korw %k1, %k0, %k0 +; KNL-NEXT: kshiftlw $3, %k0, %k0 ; KNL-NEXT: kshiftrw $3, %k0, %k2 -; KNL-NEXT: kmovw %r8d, %k3 -; KNL-NEXT: kxorw %k3, %k2, %k2 -; KNL-NEXT: kshiftlw $15, %k2, %k2 -; KNL-NEXT: kshiftrw $12, %k2, %k2 -; KNL-NEXT: kxorw %k2, %k0, %k0 -; KNL-NEXT: kshiftrw $4, %k0, %k2 -; KNL-NEXT: kmovw %r9d, %k3 -; KNL-NEXT: kxorw %k3, %k2, %k2 -; KNL-NEXT: kshiftlw $15, %k2, %k2 -; KNL-NEXT: kshiftrw $11, %k2, %k2 -; KNL-NEXT: kxorw %k2, %k0, %k0 -; KNL-NEXT: kshiftrw $5, %k0, %k2 -; KNL-NEXT: movb {{[0-9]+}}(%rsp), %cl -; KNL-NEXT: kmovw %ecx, %k3 -; KNL-NEXT: kxorw %k3, %k2, %k2 -; KNL-NEXT: kshiftlw $15, %k2, %k2 -; KNL-NEXT: kshiftrw $10, %k2, %k2 -; KNL-NEXT: kxorw %k2, %k0, %k0 -; KNL-NEXT: kshiftrw $6, %k0, %k2 -; KNL-NEXT: movb {{[0-9]+}}(%rsp), %cl -; KNL-NEXT: kmovw %ecx, %k3 -; KNL-NEXT: kxorw %k3, %k2, %k2 -; KNL-NEXT: kshiftlw $15, %k2, %k2 -; KNL-NEXT: kshiftrw $9, %k2, %k2 -; KNL-NEXT: kxorw %k2, %k0, %k0 -; KNL-NEXT: kshiftrw $7, %k0, %k2 -; KNL-NEXT: movb {{[0-9]+}}(%rsp), %cl -; KNL-NEXT: kmovw %ecx, %k3 -; KNL-NEXT: kxorw %k3, %k2, %k2 -; KNL-NEXT: kshiftlw $15, %k2, %k2 -; KNL-NEXT: kshiftrw $8, %k2, %k2 -; KNL-NEXT: kxorw %k2, %k0, %k0 -; KNL-NEXT: kshiftrw $8, %k0, %k2 -; KNL-NEXT: movb {{[0-9]+}}(%rsp), %cl -; KNL-NEXT: kmovw 
%ecx, %k3 -; KNL-NEXT: kxorw %k3, %k2, %k2 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dil +; KNL-NEXT: kmovw %edi, %k1 +; KNL-NEXT: kshiftlw $13, %k1, %k0 +; KNL-NEXT: kshiftlw $14, %k0, %k3 +; KNL-NEXT: korw %k0, %k3, %k0 +; KNL-NEXT: korw %k0, %k2, %k0 +; KNL-NEXT: kshiftlw $2, %k0, %k0 +; KNL-NEXT: kshiftrw $2, %k0, %k2 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dil +; KNL-NEXT: kmovw %edi, %k0 +; KNL-NEXT: kshiftlw $14, %k0, %k0 +; KNL-NEXT: kshiftlw $15, %k0, %k1 +; KNL-NEXT: korw %k0, %k1, %k0 +; KNL-NEXT: korw %k0, %k2, %k0 +; KNL-NEXT: kshiftlw $1, %k0, %k0 +; KNL-NEXT: kshiftrw $1, %k0, %k0 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dil +; KNL-NEXT: kmovw %edi, %k2 ; KNL-NEXT: kshiftlw $15, %k2, %k2 -; KNL-NEXT: kshiftrw $7, %k2, %k2 -; KNL-NEXT: kxorw %k2, %k0, %k0 -; KNL-NEXT: kshiftrw $9, %k0, %k2 -; KNL-NEXT: movb {{[0-9]+}}(%rsp), %cl -; KNL-NEXT: kmovw %ecx, %k3 -; KNL-NEXT: kxorw %k3, %k2, %k2 +; KNL-NEXT: korw %k2, %k0, %k0 +; KNL-NEXT: kmovw %k0, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill +; KNL-NEXT: kmovw %edx, %k0 +; KNL-NEXT: kshiftlw $1, %k0, %k0 +; KNL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 ## 2-byte Reload +; KNL-NEXT: korw %k0, %k2, %k0 +; KNL-NEXT: kmovw %esi, %k2 ; KNL-NEXT: kshiftlw $15, %k2, %k2 -; KNL-NEXT: kshiftrw $6, %k2, %k2 -; KNL-NEXT: kxorw %k2, %k0, %k0 -; KNL-NEXT: kshiftrw $10, %k0, %k2 +; KNL-NEXT: kshiftrw $15, %k2, %k2 +; KNL-NEXT: korw %k0, %k2, %k0 +; KNL-NEXT: kmovw %ecx, %k2 +; KNL-NEXT: kshiftlw $2, %k2, %k2 +; KNL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 ## 2-byte Reload +; KNL-NEXT: korw %k2, %k7, %k2 +; KNL-NEXT: kshiftlw $14, %k0, %k0 +; KNL-NEXT: kshiftrw $14, %k0, %k0 +; KNL-NEXT: korw %k2, %k0, %k0 +; KNL-NEXT: kmovw %r8d, %k2 +; KNL-NEXT: kshiftlw $3, %k2, %k2 +; KNL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 ## 2-byte Reload +; KNL-NEXT: korw %k2, %k7, %k2 +; KNL-NEXT: kshiftlw $13, %k0, %k0 +; KNL-NEXT: kshiftrw $13, %k0, %k0 +; KNL-NEXT: korw %k2, %k0, %k0 +; KNL-NEXT: kmovw %r9d, %k2 +; KNL-NEXT: kshiftlw $4, %k2, %k2 +; 
KNL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 ## 2-byte Reload +; KNL-NEXT: korw %k2, %k7, %k2 +; KNL-NEXT: kshiftlw $12, %k0, %k0 +; KNL-NEXT: kshiftrw $12, %k0, %k0 +; KNL-NEXT: korw %k2, %k0, %k0 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %cl -; KNL-NEXT: kmovw %ecx, %k3 -; KNL-NEXT: kxorw %k3, %k2, %k2 -; KNL-NEXT: kshiftlw $15, %k2, %k2 -; KNL-NEXT: kshiftrw $5, %k2, %k2 -; KNL-NEXT: kxorw %k2, %k0, %k0 -; KNL-NEXT: kshiftrw $11, %k0, %k2 +; KNL-NEXT: kmovw %ecx, %k2 +; KNL-NEXT: kshiftlw $5, %k2, %k2 +; KNL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 ## 2-byte Reload +; KNL-NEXT: korw %k2, %k7, %k2 +; KNL-NEXT: kshiftlw $11, %k0, %k0 +; KNL-NEXT: kshiftrw $11, %k0, %k0 +; KNL-NEXT: korw %k2, %k0, %k0 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %cl -; KNL-NEXT: kmovw %ecx, %k3 -; KNL-NEXT: kxorw %k3, %k2, %k2 -; KNL-NEXT: kshiftlw $15, %k2, %k2 -; KNL-NEXT: kshiftrw $4, %k2, %k2 -; KNL-NEXT: kxorw %k2, %k0, %k0 -; KNL-NEXT: kshiftrw $12, %k0, %k2 +; KNL-NEXT: kmovw %ecx, %k2 +; KNL-NEXT: kshiftlw $6, %k2, %k2 +; KNL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 ## 2-byte Reload +; KNL-NEXT: korw %k2, %k7, %k2 +; KNL-NEXT: kshiftlw $10, %k0, %k0 +; KNL-NEXT: kshiftrw $10, %k0, %k0 +; KNL-NEXT: korw %k2, %k0, %k0 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %cl -; KNL-NEXT: kmovw %ecx, %k3 -; KNL-NEXT: kxorw %k3, %k2, %k2 -; KNL-NEXT: kshiftlw $15, %k2, %k2 -; KNL-NEXT: kshiftrw $3, %k2, %k2 -; KNL-NEXT: kxorw %k2, %k0, %k0 -; KNL-NEXT: kshiftrw $13, %k0, %k2 +; KNL-NEXT: kmovw %ecx, %k2 +; KNL-NEXT: kshiftlw $7, %k2, %k2 +; KNL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 ## 2-byte Reload +; KNL-NEXT: korw %k2, %k7, %k2 +; KNL-NEXT: kshiftlw $9, %k0, %k0 +; KNL-NEXT: kshiftrw $9, %k0, %k0 +; KNL-NEXT: korw %k2, %k0, %k0 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %cl -; KNL-NEXT: kmovw %ecx, %k3 -; KNL-NEXT: kxorw %k3, %k2, %k2 -; KNL-NEXT: kshiftlw $15, %k2, %k2 -; KNL-NEXT: kshiftrw $2, %k2, %k2 -; KNL-NEXT: kxorw %k2, %k0, %k0 -; KNL-NEXT: kshiftrw $14, %k0, %k2 +; KNL-NEXT: kmovw %ecx, %k2 +; 
KNL-NEXT: kshiftlw $8, %k2, %k2 +; KNL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 ## 2-byte Reload +; KNL-NEXT: korw %k2, %k7, %k2 +; KNL-NEXT: kshiftlw $8, %k0, %k0 +; KNL-NEXT: kshiftrw $8, %k0, %k0 +; KNL-NEXT: korw %k2, %k0, %k0 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %cl -; KNL-NEXT: kmovw %ecx, %k3 -; KNL-NEXT: kxorw %k3, %k2, %k2 -; KNL-NEXT: kshiftlw $14, %k2, %k2 -; KNL-NEXT: kxorw %k2, %k0, %k0 -; KNL-NEXT: kshiftlw $1, %k0, %k0 -; KNL-NEXT: kshiftrw $1, %k0, %k0 +; KNL-NEXT: kmovw %ecx, %k2 +; KNL-NEXT: kshiftlw $9, %k2, %k2 +; KNL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 ## 2-byte Reload +; KNL-NEXT: korw %k2, %k7, %k2 +; KNL-NEXT: kshiftlw $7, %k0, %k0 +; KNL-NEXT: kshiftrw $7, %k0, %k0 +; KNL-NEXT: korw %k2, %k0, %k0 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %cl ; KNL-NEXT: kmovw %ecx, %k2 -; KNL-NEXT: kshiftlw $15, %k2, %k2 +; KNL-NEXT: kshiftlw $10, %k2, %k2 +; KNL-NEXT: korw %k2, %k6, %k2 +; KNL-NEXT: kshiftlw $6, %k0, %k0 +; KNL-NEXT: kshiftrw $6, %k0, %k0 ; KNL-NEXT: korw %k2, %k0, %k0 -; KNL-NEXT: kandw %k1, %k0, %k0 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %cl +; KNL-NEXT: kmovw %ecx, %k2 +; KNL-NEXT: kshiftlw $11, %k2, %k2 +; KNL-NEXT: korw %k2, %k5, %k2 ; KNL-NEXT: xorl %ecx, %ecx ; KNL-NEXT: testb $1, {{[0-9]+}}(%rsp) ; KNL-NEXT: movl $65535, %edx ## imm = 0xFFFF ; KNL-NEXT: movl $0, %esi ; KNL-NEXT: cmovnel %edx, %esi -; KNL-NEXT: kmovw %esi, %k1 +; KNL-NEXT: kshiftlw $5, %k0, %k0 +; KNL-NEXT: kshiftrw $5, %k0, %k0 +; KNL-NEXT: korw %k2, %k0, %k0 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dil +; KNL-NEXT: kmovw %edi, %k2 +; KNL-NEXT: kshiftlw $12, %k2, %k2 +; KNL-NEXT: korw %k2, %k4, %k2 ; KNL-NEXT: testb $1, {{[0-9]+}}(%rsp) +; KNL-NEXT: kshiftlw $4, %k0, %k0 +; KNL-NEXT: kshiftrw $4, %k0, %k0 +; KNL-NEXT: korw %k2, %k0, %k0 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dil +; KNL-NEXT: kmovw %edi, %k2 +; KNL-NEXT: kshiftlw $13, %k2, %k2 +; KNL-NEXT: korw %k2, %k3, %k2 ; KNL-NEXT: cmovnel %edx, %ecx +; KNL-NEXT: kshiftlw $3, %k0, %k0 +; KNL-NEXT: kshiftrw $3, %k0, %k0 +; 
KNL-NEXT: korw %k2, %k0, %k0 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dl +; KNL-NEXT: kmovw %edx, %k2 +; KNL-NEXT: kshiftlw $14, %k2, %k2 +; KNL-NEXT: korw %k2, %k1, %k1 +; KNL-NEXT: kshiftlw $2, %k0, %k0 +; KNL-NEXT: kshiftrw $2, %k0, %k0 +; KNL-NEXT: korw %k1, %k0, %k0 +; KNL-NEXT: kshiftlw $1, %k0, %k0 +; KNL-NEXT: kshiftrw $1, %k0, %k0 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dl +; KNL-NEXT: kmovw %edx, %k1 +; KNL-NEXT: kshiftlw $15, %k1, %k1 +; KNL-NEXT: korw %k1, %k0, %k0 +; KNL-NEXT: kmovw %esi, %k1 +; KNL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 ## 2-byte Reload +; KNL-NEXT: kandw %k2, %k0, %k0 ; KNL-NEXT: kmovw %ecx, %k2 ; KNL-NEXT: kandw %k1, %k2, %k1 ; KNL-NEXT: kmovw %k1, %r8d @@ -832,193 +877,294 @@ define <17 x i1> @test16(<17 x i1> %a, <17 x i1> %b) nounwind { ; SKX-NEXT: pushq %r13 ; SKX-NEXT: pushq %r12 ; SKX-NEXT: pushq %rbx -; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k1 +; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k0 ; SKX-NEXT: movq %rdi, %rax -; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k2 ; SKX-NEXT: kshiftld $31, %k0, %k0 -; SKX-NEXT: kshiftrd $30, %k0, %k0 -; SKX-NEXT: kxord %k0, %k2, %k2 -; SKX-NEXT: kshiftrd $2, %k2, %k3 -; SKX-NEXT: kxord %k1, %k3, %k1 -; SKX-NEXT: kshiftld $31, %k1, %k1 -; SKX-NEXT: kshiftrd $29, %k1, %k1 -; SKX-NEXT: kxord %k1, %k2, %k1 +; SKX-NEXT: kshiftrd $31, %k0, %k1 +; SKX-NEXT: kshiftld $2, %k0, %k0 +; SKX-NEXT: kord %k0, %k1, %k1 +; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k2 +; SKX-NEXT: kshiftld $31, %k2, %k2 +; SKX-NEXT: kshiftrd $30, %k2, %k2 +; SKX-NEXT: kord %k1, %k2, %k1 ; SKX-NEXT: kshiftrd $3, %k1, %k2 +; SKX-NEXT: kshiftld $3, %k2, %k2 +; SKX-NEXT: kshiftld $30, %k1, %k1 ; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k3 -; SKX-NEXT: kxord %k3, %k2, %k2 -; SKX-NEXT: kshiftld $31, %k2, %k2 +; SKX-NEXT: kshiftrd $30, %k1, %k1 +; SKX-NEXT: kord %k2, %k1, %k1 +; SKX-NEXT: kshiftld $31, %k3, %k2 +; SKX-NEXT: kshiftrd $29, %k2, %k2 +; SKX-NEXT: kord %k1, %k2, %k1 +; SKX-NEXT: kshiftrd $4, %k1, %k2 +; SKX-NEXT: kshiftld $4, %k2, %k2 +; SKX-NEXT: 
kshiftld $29, %k1, %k1 +; SKX-NEXT: kshiftrd $29, %k1, %k1 +; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k3 +; SKX-NEXT: kord %k2, %k1, %k1 +; SKX-NEXT: kshiftld $31, %k3, %k2 ; SKX-NEXT: kshiftrd $28, %k2, %k2 -; SKX-NEXT: kxord %k2, %k1, %k1 +; SKX-NEXT: kord %k1, %k2, %k1 +; SKX-NEXT: kshiftrd $5, %k1, %k2 +; SKX-NEXT: kshiftld $5, %k2, %k2 +; SKX-NEXT: kshiftld $28, %k1, %k1 +; SKX-NEXT: kshiftrd $28, %k1, %k1 +; SKX-NEXT: kord %k2, %k1, %k1 ; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k2 -; SKX-NEXT: kshiftrd $4, %k1, %k3 -; SKX-NEXT: kxord %k2, %k3, %k2 ; SKX-NEXT: kshiftld $31, %k2, %k2 ; SKX-NEXT: kshiftrd $27, %k2, %k2 -; SKX-NEXT: kxord %k2, %k1, %k1 -; SKX-NEXT: kshiftrd $5, %k1, %k2 -; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k3 -; SKX-NEXT: kxord %k3, %k2, %k2 +; SKX-NEXT: kord %k1, %k2, %k1 +; SKX-NEXT: kshiftrd $6, %k1, %k2 +; SKX-NEXT: kshiftld $6, %k2, %k2 +; SKX-NEXT: kshiftld $27, %k1, %k1 +; SKX-NEXT: kshiftrd $27, %k1, %k1 +; SKX-NEXT: kord %k2, %k1, %k1 +; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k2 ; SKX-NEXT: kshiftld $31, %k2, %k2 ; SKX-NEXT: kshiftrd $26, %k2, %k2 -; SKX-NEXT: kxord %k2, %k1, %k1 -; SKX-NEXT: kshiftrd $6, %k1, %k2 -; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k3 -; SKX-NEXT: kxord %k3, %k2, %k2 +; SKX-NEXT: kord %k1, %k2, %k1 +; SKX-NEXT: kshiftrd $7, %k1, %k2 +; SKX-NEXT: kshiftld $7, %k2, %k2 +; SKX-NEXT: kshiftld $26, %k1, %k1 +; SKX-NEXT: kshiftrd $26, %k1, %k1 +; SKX-NEXT: kord %k2, %k1, %k1 +; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k2 ; SKX-NEXT: kshiftld $31, %k2, %k2 ; SKX-NEXT: kshiftrd $25, %k2, %k2 -; SKX-NEXT: kxord %k2, %k1, %k1 +; SKX-NEXT: kord %k1, %k2, %k1 +; SKX-NEXT: kshiftrd $8, %k1, %k2 +; SKX-NEXT: kshiftld $8, %k2, %k2 +; SKX-NEXT: kshiftld $25, %k1, %k1 +; SKX-NEXT: kshiftrd $25, %k1, %k1 +; SKX-NEXT: kord %k2, %k1, %k1 ; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k2 -; SKX-NEXT: kshiftrd $7, %k1, %k3 -; SKX-NEXT: kxord %k2, %k3, %k2 ; SKX-NEXT: kshiftld $31, %k2, %k2 ; SKX-NEXT: kshiftrd $24, %k2, %k2 -; SKX-NEXT: kxord %k2, %k1, %k1 -; 
SKX-NEXT: kshiftrd $8, %k1, %k2 +; SKX-NEXT: kord %k1, %k2, %k1 +; SKX-NEXT: kshiftrd $9, %k1, %k2 +; SKX-NEXT: kshiftld $9, %k2, %k2 +; SKX-NEXT: kshiftld $24, %k1, %k1 ; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k3 -; SKX-NEXT: kxord %k3, %k2, %k2 -; SKX-NEXT: kshiftld $31, %k2, %k2 +; SKX-NEXT: kshiftrd $24, %k1, %k1 +; SKX-NEXT: kord %k2, %k1, %k1 +; SKX-NEXT: kshiftld $31, %k3, %k2 ; SKX-NEXT: kshiftrd $23, %k2, %k2 -; SKX-NEXT: kxord %k2, %k1, %k1 -; SKX-NEXT: kshiftrd $9, %k1, %k2 +; SKX-NEXT: kord %k1, %k2, %k1 +; SKX-NEXT: kshiftrd $10, %k1, %k2 +; SKX-NEXT: kshiftld $10, %k2, %k2 +; SKX-NEXT: kshiftld $23, %k1, %k1 +; SKX-NEXT: kshiftrd $23, %k1, %k1 ; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k3 -; SKX-NEXT: kxord %k3, %k2, %k2 -; SKX-NEXT: kshiftld $31, %k2, %k2 +; SKX-NEXT: kord %k2, %k1, %k1 +; SKX-NEXT: kshiftld $31, %k3, %k2 ; SKX-NEXT: kshiftrd $22, %k2, %k2 -; SKX-NEXT: kxord %k2, %k1, %k1 +; SKX-NEXT: kord %k1, %k2, %k1 +; SKX-NEXT: kshiftrd $11, %k1, %k2 +; SKX-NEXT: kshiftld $11, %k2, %k2 +; SKX-NEXT: kshiftld $22, %k1, %k1 +; SKX-NEXT: kshiftrd $22, %k1, %k1 +; SKX-NEXT: kord %k2, %k1, %k1 ; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k2 -; SKX-NEXT: kshiftrd $10, %k1, %k3 -; SKX-NEXT: kxord %k2, %k3, %k2 ; SKX-NEXT: kshiftld $31, %k2, %k2 ; SKX-NEXT: kshiftrd $21, %k2, %k2 -; SKX-NEXT: kxord %k2, %k1, %k1 -; SKX-NEXT: kshiftrd $11, %k1, %k2 -; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k3 -; SKX-NEXT: kxord %k3, %k2, %k2 +; SKX-NEXT: kord %k1, %k2, %k1 +; SKX-NEXT: kshiftrd $12, %k1, %k2 +; SKX-NEXT: kshiftld $12, %k2, %k2 +; SKX-NEXT: kshiftld $21, %k1, %k1 +; SKX-NEXT: kshiftrd $21, %k1, %k1 +; SKX-NEXT: kord %k2, %k1, %k1 +; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k2 ; SKX-NEXT: kshiftld $31, %k2, %k2 ; SKX-NEXT: kshiftrd $20, %k2, %k2 -; SKX-NEXT: kxord %k2, %k1, %k1 -; SKX-NEXT: kshiftrd $12, %k1, %k2 -; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k3 -; SKX-NEXT: kxord %k3, %k2, %k2 +; SKX-NEXT: kord %k1, %k2, %k1 +; SKX-NEXT: kshiftrd $13, %k1, %k2 +; SKX-NEXT: kshiftld $13, %k2, 
%k2 +; SKX-NEXT: kshiftld $20, %k1, %k1 +; SKX-NEXT: kshiftrd $20, %k1, %k1 +; SKX-NEXT: kord %k2, %k1, %k1 +; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k2 ; SKX-NEXT: kshiftld $31, %k2, %k2 ; SKX-NEXT: kshiftrd $19, %k2, %k2 -; SKX-NEXT: kxord %k2, %k1, %k1 +; SKX-NEXT: kord %k1, %k2, %k1 +; SKX-NEXT: kshiftrd $14, %k1, %k2 +; SKX-NEXT: kshiftld $14, %k2, %k2 +; SKX-NEXT: kshiftld $19, %k1, %k1 +; SKX-NEXT: kshiftrd $19, %k1, %k1 +; SKX-NEXT: kord %k2, %k1, %k1 ; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k2 -; SKX-NEXT: kshiftrd $13, %k1, %k3 -; SKX-NEXT: kxord %k2, %k3, %k2 ; SKX-NEXT: kshiftld $31, %k2, %k2 ; SKX-NEXT: kshiftrd $18, %k2, %k2 -; SKX-NEXT: kxord %k2, %k1, %k1 -; SKX-NEXT: kshiftrd $14, %k1, %k2 +; SKX-NEXT: kord %k1, %k2, %k1 +; SKX-NEXT: kshiftrd $15, %k1, %k2 +; SKX-NEXT: kshiftld $15, %k2, %k2 +; SKX-NEXT: kshiftld $18, %k1, %k1 ; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k3 -; SKX-NEXT: kxord %k3, %k2, %k2 -; SKX-NEXT: kshiftld $31, %k2, %k2 +; SKX-NEXT: kshiftrd $18, %k1, %k1 +; SKX-NEXT: kord %k2, %k1, %k1 +; SKX-NEXT: kshiftld $31, %k3, %k2 ; SKX-NEXT: kshiftrd $17, %k2, %k2 -; SKX-NEXT: kxord %k2, %k1, %k1 -; SKX-NEXT: kshiftrd $15, %k1, %k2 +; SKX-NEXT: kord %k1, %k2, %k1 +; SKX-NEXT: kshiftrd $16, %k1, %k2 +; SKX-NEXT: kshiftld $16, %k2, %k2 +; SKX-NEXT: kshiftld $17, %k1, %k1 +; SKX-NEXT: kshiftrd $17, %k1, %k1 ; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k3 -; SKX-NEXT: kxord %k3, %k2, %k2 -; SKX-NEXT: kshiftld $31, %k2, %k2 +; SKX-NEXT: kord %k2, %k1, %k1 +; SKX-NEXT: kshiftld $31, %k3, %k2 ; SKX-NEXT: kshiftrd $16, %k2, %k2 -; SKX-NEXT: kxord %k2, %k1, %k1 +; SKX-NEXT: kord %k1, %k2, %k1 +; SKX-NEXT: kshiftrd $17, %k1, %k2 +; SKX-NEXT: kshiftld $17, %k2, %k2 +; SKX-NEXT: kshiftld $16, %k1, %k1 +; SKX-NEXT: kshiftrd $16, %k1, %k1 +; SKX-NEXT: kord %k2, %k1, %k1 ; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k2 -; SKX-NEXT: kshiftrd $16, %k1, %k3 -; SKX-NEXT: kxord %k2, %k3, %k2 ; SKX-NEXT: kshiftld $31, %k2, %k2 ; SKX-NEXT: kshiftrd $15, %k2, %k2 -; SKX-NEXT: kxord %k2, 
%k1, %k1 +; SKX-NEXT: kord %k1, %k2, %k1 +; SKX-NEXT: kmovd %esi, %k2 +; SKX-NEXT: kshiftld $31, %k2, %k2 +; SKX-NEXT: kshiftrd $31, %k2, %k2 +; SKX-NEXT: kord %k0, %k2, %k0 +; SKX-NEXT: kmovd %edx, %k2 +; SKX-NEXT: kshiftld $31, %k2, %k2 +; SKX-NEXT: kshiftrd $30, %k2, %k2 +; SKX-NEXT: kord %k0, %k2, %k0 +; SKX-NEXT: kshiftrd $3, %k0, %k2 +; SKX-NEXT: kshiftld $3, %k2, %k2 +; SKX-NEXT: kshiftld $30, %k0, %k0 +; SKX-NEXT: kshiftrd $30, %k0, %k0 +; SKX-NEXT: kord %k2, %k0, %k0 ; SKX-NEXT: kmovd %ecx, %k2 -; SKX-NEXT: kmovd %esi, %k3 -; SKX-NEXT: kxord %k0, %k3, %k0 -; SKX-NEXT: kshiftrd $2, %k0, %k3 -; SKX-NEXT: kxord %k2, %k3, %k2 ; SKX-NEXT: kshiftld $31, %k2, %k2 ; SKX-NEXT: kshiftrd $29, %k2, %k2 -; SKX-NEXT: kxord %k2, %k0, %k0 -; SKX-NEXT: kshiftrd $3, %k0, %k2 -; SKX-NEXT: kmovd %r8d, %k3 -; SKX-NEXT: kxord %k3, %k2, %k2 +; SKX-NEXT: kord %k0, %k2, %k0 +; SKX-NEXT: kshiftrd $4, %k0, %k2 +; SKX-NEXT: kshiftld $4, %k2, %k2 +; SKX-NEXT: kshiftld $29, %k0, %k0 +; SKX-NEXT: kshiftrd $29, %k0, %k0 +; SKX-NEXT: kord %k2, %k0, %k0 +; SKX-NEXT: kmovd %r8d, %k2 ; SKX-NEXT: kshiftld $31, %k2, %k2 ; SKX-NEXT: kshiftrd $28, %k2, %k2 -; SKX-NEXT: kxord %k2, %k0, %k0 -; SKX-NEXT: kshiftrd $4, %k0, %k2 -; SKX-NEXT: kmovd %r9d, %k3 -; SKX-NEXT: kxord %k3, %k2, %k2 +; SKX-NEXT: kord %k0, %k2, %k0 +; SKX-NEXT: kshiftrd $5, %k0, %k2 +; SKX-NEXT: kshiftld $5, %k2, %k2 +; SKX-NEXT: kshiftld $28, %k0, %k0 +; SKX-NEXT: kshiftrd $28, %k0, %k0 +; SKX-NEXT: kord %k2, %k0, %k0 +; SKX-NEXT: kmovd %r9d, %k2 ; SKX-NEXT: kshiftld $31, %k2, %k2 ; SKX-NEXT: kshiftrd $27, %k2, %k2 -; SKX-NEXT: kxord %k2, %k0, %k0 +; SKX-NEXT: kord %k0, %k2, %k0 +; SKX-NEXT: kshiftrd $6, %k0, %k2 +; SKX-NEXT: kshiftld $6, %k2, %k2 +; SKX-NEXT: kshiftld $27, %k0, %k0 +; SKX-NEXT: kshiftrd $27, %k0, %k0 +; SKX-NEXT: kord %k2, %k0, %k0 ; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k2 -; SKX-NEXT: kshiftrd $5, %k0, %k3 -; SKX-NEXT: kxord %k2, %k3, %k2 ; SKX-NEXT: kshiftld $31, %k2, %k2 ; SKX-NEXT: kshiftrd $26, %k2, %k2 -; 
SKX-NEXT: kxord %k2, %k0, %k0 -; SKX-NEXT: kshiftrd $6, %k0, %k2 -; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k3 -; SKX-NEXT: kxord %k3, %k2, %k2 -; SKX-NEXT: kshiftld $31, %k2, %k2 -; SKX-NEXT: kshiftrd $25, %k2, %k2 -; SKX-NEXT: kxord %k2, %k0, %k0 +; SKX-NEXT: kord %k0, %k2, %k0 ; SKX-NEXT: kshiftrd $7, %k0, %k2 -; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k3 -; SKX-NEXT: kxord %k3, %k2, %k2 +; SKX-NEXT: kshiftld $7, %k2, %k2 +; SKX-NEXT: kshiftld $26, %k0, %k0 +; SKX-NEXT: kshiftrd $26, %k0, %k0 +; SKX-NEXT: kord %k2, %k0, %k0 +; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k2 ; SKX-NEXT: kshiftld $31, %k2, %k2 -; SKX-NEXT: kshiftrd $24, %k2, %k2 -; SKX-NEXT: kxord %k2, %k0, %k0 +; SKX-NEXT: kshiftrd $25, %k2, %k2 +; SKX-NEXT: kord %k0, %k2, %k0 +; SKX-NEXT: kshiftrd $8, %k0, %k2 +; SKX-NEXT: kshiftld $8, %k2, %k2 +; SKX-NEXT: kshiftld $25, %k0, %k0 +; SKX-NEXT: kshiftrd $25, %k0, %k0 +; SKX-NEXT: kord %k2, %k0, %k0 ; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k2 -; SKX-NEXT: kshiftrd $8, %k0, %k3 -; SKX-NEXT: kxord %k2, %k3, %k2 ; SKX-NEXT: kshiftld $31, %k2, %k2 -; SKX-NEXT: kshiftrd $23, %k2, %k2 -; SKX-NEXT: kxord %k2, %k0, %k0 +; SKX-NEXT: kshiftrd $24, %k2, %k2 +; SKX-NEXT: kord %k0, %k2, %k0 ; SKX-NEXT: kshiftrd $9, %k0, %k2 +; SKX-NEXT: kshiftld $9, %k2, %k2 +; SKX-NEXT: kshiftld $24, %k0, %k0 ; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k3 -; SKX-NEXT: kxord %k3, %k2, %k2 -; SKX-NEXT: kshiftld $31, %k2, %k2 -; SKX-NEXT: kshiftrd $22, %k2, %k2 -; SKX-NEXT: kxord %k2, %k0, %k0 +; SKX-NEXT: kshiftrd $24, %k0, %k0 +; SKX-NEXT: kord %k2, %k0, %k0 +; SKX-NEXT: kshiftld $31, %k3, %k2 +; SKX-NEXT: kshiftrd $23, %k2, %k2 +; SKX-NEXT: kord %k0, %k2, %k0 ; SKX-NEXT: kshiftrd $10, %k0, %k2 +; SKX-NEXT: kshiftld $10, %k2, %k2 +; SKX-NEXT: kshiftld $23, %k0, %k0 +; SKX-NEXT: kshiftrd $23, %k0, %k0 ; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k3 -; SKX-NEXT: kxord %k3, %k2, %k2 +; SKX-NEXT: kord %k2, %k0, %k0 +; SKX-NEXT: kshiftld $31, %k3, %k2 +; SKX-NEXT: kshiftrd $22, %k2, %k2 +; SKX-NEXT: kord %k0, %k2, %k0 +; 
SKX-NEXT: kshiftrd $11, %k0, %k2 +; SKX-NEXT: kshiftld $11, %k2, %k2 +; SKX-NEXT: kshiftld $22, %k0, %k0 +; SKX-NEXT: kshiftrd $22, %k0, %k0 +; SKX-NEXT: kord %k2, %k0, %k0 +; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k2 ; SKX-NEXT: kshiftld $31, %k2, %k2 ; SKX-NEXT: kshiftrd $21, %k2, %k2 -; SKX-NEXT: kxord %k2, %k0, %k0 +; SKX-NEXT: kord %k0, %k2, %k0 +; SKX-NEXT: kshiftrd $12, %k0, %k2 +; SKX-NEXT: kshiftld $12, %k2, %k2 +; SKX-NEXT: kshiftld $21, %k0, %k0 +; SKX-NEXT: kshiftrd $21, %k0, %k0 +; SKX-NEXT: kord %k2, %k0, %k0 ; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k2 -; SKX-NEXT: kshiftrd $11, %k0, %k3 -; SKX-NEXT: kxord %k2, %k3, %k2 ; SKX-NEXT: kshiftld $31, %k2, %k2 ; SKX-NEXT: kshiftrd $20, %k2, %k2 -; SKX-NEXT: kxord %k2, %k0, %k0 -; SKX-NEXT: kshiftrd $12, %k0, %k2 -; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k3 -; SKX-NEXT: kxord %k3, %k2, %k2 -; SKX-NEXT: kshiftld $31, %k2, %k2 -; SKX-NEXT: kshiftrd $19, %k2, %k2 -; SKX-NEXT: kxord %k2, %k0, %k0 +; SKX-NEXT: kord %k0, %k2, %k0 ; SKX-NEXT: kshiftrd $13, %k0, %k2 -; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k3 -; SKX-NEXT: kxord %k3, %k2, %k2 +; SKX-NEXT: kshiftld $13, %k2, %k2 +; SKX-NEXT: kshiftld $20, %k0, %k0 +; SKX-NEXT: kshiftrd $20, %k0, %k0 +; SKX-NEXT: kord %k2, %k0, %k0 +; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k2 ; SKX-NEXT: kshiftld $31, %k2, %k2 -; SKX-NEXT: kshiftrd $18, %k2, %k2 -; SKX-NEXT: kxord %k2, %k0, %k0 +; SKX-NEXT: kshiftrd $19, %k2, %k2 +; SKX-NEXT: kord %k0, %k2, %k0 +; SKX-NEXT: kshiftrd $14, %k0, %k2 +; SKX-NEXT: kshiftld $14, %k2, %k2 +; SKX-NEXT: kshiftld $19, %k0, %k0 +; SKX-NEXT: kshiftrd $19, %k0, %k0 +; SKX-NEXT: kord %k2, %k0, %k0 ; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k2 -; SKX-NEXT: kshiftrd $14, %k0, %k3 -; SKX-NEXT: kxord %k2, %k3, %k2 ; SKX-NEXT: kshiftld $31, %k2, %k2 -; SKX-NEXT: kshiftrd $17, %k2, %k2 -; SKX-NEXT: kxord %k2, %k0, %k0 +; SKX-NEXT: kshiftrd $18, %k2, %k2 +; SKX-NEXT: kord %k0, %k2, %k0 ; SKX-NEXT: kshiftrd $15, %k0, %k2 +; SKX-NEXT: kshiftld $15, %k2, %k2 +; SKX-NEXT: kshiftld 
$18, %k0, %k0 ; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k3 -; SKX-NEXT: kxord %k3, %k2, %k2 -; SKX-NEXT: kshiftld $31, %k2, %k2 -; SKX-NEXT: kshiftrd $16, %k2, %k2 -; SKX-NEXT: kxord %k2, %k0, %k0 +; SKX-NEXT: kshiftrd $18, %k0, %k0 +; SKX-NEXT: kord %k2, %k0, %k0 +; SKX-NEXT: kshiftld $31, %k3, %k2 +; SKX-NEXT: kshiftrd $17, %k2, %k2 +; SKX-NEXT: kord %k0, %k2, %k0 ; SKX-NEXT: kshiftrd $16, %k0, %k2 +; SKX-NEXT: kshiftld $16, %k2, %k2 +; SKX-NEXT: kshiftld $17, %k0, %k0 +; SKX-NEXT: kshiftrd $17, %k0, %k0 ; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k3 -; SKX-NEXT: kxord %k3, %k2, %k2 +; SKX-NEXT: kord %k2, %k0, %k0 +; SKX-NEXT: kshiftld $31, %k3, %k2 +; SKX-NEXT: kshiftrd $16, %k2, %k2 +; SKX-NEXT: kord %k0, %k2, %k0 +; SKX-NEXT: kshiftrd $17, %k0, %k2 +; SKX-NEXT: kshiftld $17, %k2, %k2 +; SKX-NEXT: kshiftld $16, %k0, %k0 +; SKX-NEXT: kshiftrd $16, %k0, %k0 +; SKX-NEXT: kord %k2, %k0, %k0 +; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k2 ; SKX-NEXT: kshiftld $31, %k2, %k2 ; SKX-NEXT: kshiftrd $15, %k2, %k2 -; SKX-NEXT: kxord %k2, %k0, %k0 +; SKX-NEXT: kord %k0, %k2, %k0 ; SKX-NEXT: kandd %k1, %k0, %k0 ; SKX-NEXT: kshiftrd $16, %k0, %k1 ; SKX-NEXT: kmovd %k1, %r8d @@ -1113,215 +1259,262 @@ define <17 x i1> @test16(<17 x i1> %a, <17 x i1> %b) nounwind { ; KNL_X32-NEXT: pushl %ebx ; KNL_X32-NEXT: pushl %edi ; KNL_X32-NEXT: pushl %esi +; KNL_X32-NEXT: subl $20, %esp ; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al ; KNL_X32-NEXT: kmovw %eax, %k0 -; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al -; KNL_X32-NEXT: kmovw %eax, %k2 -; KNL_X32-NEXT: kshiftlw $15, %k0, %k1 -; KNL_X32-NEXT: kshiftrw $14, %k1, %k1 -; KNL_X32-NEXT: kxorw %k1, %k2, %k2 -; KNL_X32-NEXT: kshiftrw $2, %k2, %k3 -; KNL_X32-NEXT: kxorw %k0, %k3, %k0 ; KNL_X32-NEXT: kshiftlw $15, %k0, %k0 +; KNL_X32-NEXT: kshiftrw $15, %k0, %k0 +; KNL_X32-NEXT: kshiftlw $2, %k0, %k2 +; KNL_X32-NEXT: kmovw %k2, {{[-0-9]+}}(%e{{[sb]}}p) ## 2-byte Spill +; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al +; KNL_X32-NEXT: kmovw %eax, %k1 +; KNL_X32-NEXT: 
kshiftlw $1, %k1, %k1 +; KNL_X32-NEXT: korw %k1, %k2, %k1 +; KNL_X32-NEXT: korw %k1, %k0, %k0 +; KNL_X32-NEXT: kshiftlw $14, %k0, %k0 +; KNL_X32-NEXT: kshiftrw $14, %k0, %k0 +; KNL_X32-NEXT: kshiftlw $3, %k0, %k2 +; KNL_X32-NEXT: kmovw %k2, {{[-0-9]+}}(%e{{[sb]}}p) ## 2-byte Spill +; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al +; KNL_X32-NEXT: kmovw %eax, %k1 +; KNL_X32-NEXT: kshiftlw $2, %k1, %k1 +; KNL_X32-NEXT: korw %k1, %k2, %k1 +; KNL_X32-NEXT: korw %k1, %k0, %k0 +; KNL_X32-NEXT: kshiftlw $13, %k0, %k0 ; KNL_X32-NEXT: kshiftrw $13, %k0, %k0 -; KNL_X32-NEXT: kxorw %k0, %k2, %k0 -; KNL_X32-NEXT: kshiftrw $3, %k0, %k2 +; KNL_X32-NEXT: kshiftlw $4, %k0, %k2 +; KNL_X32-NEXT: kmovw %k2, {{[-0-9]+}}(%e{{[sb]}}p) ## 2-byte Spill +; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al +; KNL_X32-NEXT: kmovw %eax, %k1 +; KNL_X32-NEXT: kshiftlw $3, %k1, %k1 +; KNL_X32-NEXT: korw %k1, %k2, %k1 +; KNL_X32-NEXT: korw %k1, %k0, %k0 +; KNL_X32-NEXT: kshiftlw $12, %k0, %k0 +; KNL_X32-NEXT: kshiftrw $12, %k0, %k0 +; KNL_X32-NEXT: kshiftlw $5, %k0, %k2 +; KNL_X32-NEXT: kmovw %k2, {{[-0-9]+}}(%e{{[sb]}}p) ## 2-byte Spill +; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al +; KNL_X32-NEXT: kmovw %eax, %k1 +; KNL_X32-NEXT: kshiftlw $4, %k1, %k1 +; KNL_X32-NEXT: korw %k1, %k2, %k1 +; KNL_X32-NEXT: korw %k1, %k0, %k0 +; KNL_X32-NEXT: kshiftlw $11, %k0, %k0 +; KNL_X32-NEXT: kshiftrw $11, %k0, %k0 +; KNL_X32-NEXT: kshiftlw $6, %k0, %k2 +; KNL_X32-NEXT: kmovw %k2, {{[-0-9]+}}(%e{{[sb]}}p) ## 2-byte Spill +; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al +; KNL_X32-NEXT: kmovw %eax, %k1 +; KNL_X32-NEXT: kshiftlw $5, %k1, %k1 +; KNL_X32-NEXT: korw %k1, %k2, %k1 +; KNL_X32-NEXT: korw %k1, %k0, %k0 +; KNL_X32-NEXT: kshiftlw $10, %k0, %k0 +; KNL_X32-NEXT: kshiftrw $10, %k0, %k0 +; KNL_X32-NEXT: kshiftlw $7, %k0, %k2 +; KNL_X32-NEXT: kmovw %k2, {{[-0-9]+}}(%e{{[sb]}}p) ## 2-byte Spill +; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al +; KNL_X32-NEXT: kmovw %eax, %k1 +; KNL_X32-NEXT: kshiftlw $6, %k1, %k1 +; KNL_X32-NEXT: korw 
%k1, %k2, %k1 +; KNL_X32-NEXT: korw %k1, %k0, %k0 +; KNL_X32-NEXT: kshiftlw $9, %k0, %k0 +; KNL_X32-NEXT: kshiftrw $9, %k0, %k0 +; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al +; KNL_X32-NEXT: kmovw %eax, %k1 +; KNL_X32-NEXT: kshiftlw $7, %k1, %k1 +; KNL_X32-NEXT: kshiftlw $8, %k0, %k2 +; KNL_X32-NEXT: kmovw %k2, {{[-0-9]+}}(%e{{[sb]}}p) ## 2-byte Spill +; KNL_X32-NEXT: korw %k1, %k2, %k1 +; KNL_X32-NEXT: korw %k1, %k0, %k0 +; KNL_X32-NEXT: kshiftlw $8, %k0, %k0 +; KNL_X32-NEXT: kshiftrw $8, %k0, %k0 +; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al +; KNL_X32-NEXT: kmovw %eax, %k1 +; KNL_X32-NEXT: kshiftlw $8, %k1, %k1 +; KNL_X32-NEXT: kshiftlw $9, %k0, %k2 +; KNL_X32-NEXT: kmovw %k2, {{[-0-9]+}}(%e{{[sb]}}p) ## 2-byte Spill +; KNL_X32-NEXT: korw %k1, %k2, %k1 +; KNL_X32-NEXT: korw %k1, %k0, %k0 +; KNL_X32-NEXT: kshiftlw $7, %k0, %k0 +; KNL_X32-NEXT: kshiftrw $7, %k0, %k0 +; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al +; KNL_X32-NEXT: kmovw %eax, %k1 +; KNL_X32-NEXT: kshiftlw $9, %k1, %k1 +; KNL_X32-NEXT: kshiftlw $10, %k0, %k2 +; KNL_X32-NEXT: kmovw %k2, {{[-0-9]+}}(%e{{[sb]}}p) ## 2-byte Spill +; KNL_X32-NEXT: korw %k1, %k2, %k1 +; KNL_X32-NEXT: korw %k1, %k0, %k0 +; KNL_X32-NEXT: kshiftlw $6, %k0, %k0 +; KNL_X32-NEXT: kshiftrw $6, %k0, %k0 +; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al +; KNL_X32-NEXT: kmovw %eax, %k1 +; KNL_X32-NEXT: kshiftlw $10, %k1, %k1 +; KNL_X32-NEXT: kshiftlw $11, %k0, %k6 +; KNL_X32-NEXT: korw %k1, %k6, %k1 +; KNL_X32-NEXT: korw %k1, %k0, %k0 +; KNL_X32-NEXT: kshiftlw $5, %k0, %k0 +; KNL_X32-NEXT: kshiftrw $5, %k0, %k0 +; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al +; KNL_X32-NEXT: kmovw %eax, %k1 +; KNL_X32-NEXT: kshiftlw $11, %k1, %k1 +; KNL_X32-NEXT: kshiftlw $12, %k0, %k5 +; KNL_X32-NEXT: korw %k1, %k5, %k1 +; KNL_X32-NEXT: korw %k1, %k0, %k0 +; KNL_X32-NEXT: kshiftlw $4, %k0, %k0 +; KNL_X32-NEXT: kshiftrw $4, %k0, %k0 ; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al -; KNL_X32-NEXT: kmovw %eax, %k3 -; KNL_X32-NEXT: kxorw %k3, %k2, %k2 -; KNL_X32-NEXT: 
kshiftlw $15, %k2, %k2 -; KNL_X32-NEXT: kshiftrw $12, %k2, %k2 -; KNL_X32-NEXT: kxorw %k2, %k0, %k0 -; KNL_X32-NEXT: kshiftrw $4, %k0, %k2 +; KNL_X32-NEXT: kmovw %eax, %k1 +; KNL_X32-NEXT: kshiftlw $12, %k1, %k1 +; KNL_X32-NEXT: kshiftlw $13, %k0, %k4 +; KNL_X32-NEXT: korw %k1, %k4, %k1 +; KNL_X32-NEXT: korw %k1, %k0, %k0 +; KNL_X32-NEXT: kshiftlw $3, %k0, %k0 +; KNL_X32-NEXT: kshiftrw $3, %k0, %k2 ; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al -; KNL_X32-NEXT: kmovw %eax, %k3 -; KNL_X32-NEXT: kxorw %k3, %k2, %k2 -; KNL_X32-NEXT: kshiftlw $15, %k2, %k2 -; KNL_X32-NEXT: kshiftrw $11, %k2, %k2 -; KNL_X32-NEXT: kxorw %k2, %k0, %k0 -; KNL_X32-NEXT: kshiftrw $5, %k0, %k2 +; KNL_X32-NEXT: kmovw %eax, %k1 +; KNL_X32-NEXT: kshiftlw $13, %k1, %k0 +; KNL_X32-NEXT: kshiftlw $14, %k0, %k3 +; KNL_X32-NEXT: korw %k0, %k3, %k0 +; KNL_X32-NEXT: korw %k0, %k2, %k0 +; KNL_X32-NEXT: kshiftlw $2, %k0, %k0 +; KNL_X32-NEXT: kshiftrw $2, %k0, %k2 ; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al -; KNL_X32-NEXT: kmovw %eax, %k3 -; KNL_X32-NEXT: kxorw %k3, %k2, %k2 -; KNL_X32-NEXT: kshiftlw $15, %k2, %k2 -; KNL_X32-NEXT: kshiftrw $10, %k2, %k2 -; KNL_X32-NEXT: kxorw %k2, %k0, %k0 -; KNL_X32-NEXT: kshiftrw $6, %k0, %k2 +; KNL_X32-NEXT: kmovw %eax, %k0 +; KNL_X32-NEXT: kshiftlw $14, %k0, %k0 +; KNL_X32-NEXT: kshiftlw $15, %k0, %k1 +; KNL_X32-NEXT: korw %k0, %k1, %k0 +; KNL_X32-NEXT: korw %k0, %k2, %k0 +; KNL_X32-NEXT: kshiftlw $1, %k0, %k0 +; KNL_X32-NEXT: kshiftrw $1, %k0, %k0 ; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al -; KNL_X32-NEXT: kmovw %eax, %k3 -; KNL_X32-NEXT: kxorw %k3, %k2, %k2 +; KNL_X32-NEXT: kmovw %eax, %k2 ; KNL_X32-NEXT: kshiftlw $15, %k2, %k2 -; KNL_X32-NEXT: kshiftrw $9, %k2, %k2 -; KNL_X32-NEXT: kxorw %k2, %k0, %k0 -; KNL_X32-NEXT: kshiftrw $7, %k0, %k2 +; KNL_X32-NEXT: korw %k2, %k0, %k0 +; KNL_X32-NEXT: kmovw %k0, (%esp) ## 2-byte Spill ; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al -; KNL_X32-NEXT: kmovw %eax, %k3 -; KNL_X32-NEXT: kxorw %k3, %k2, %k2 -; KNL_X32-NEXT: kshiftlw $15, 
%k2, %k2 -; KNL_X32-NEXT: kshiftrw $8, %k2, %k2 -; KNL_X32-NEXT: kxorw %k2, %k0, %k0 -; KNL_X32-NEXT: kshiftrw $8, %k0, %k2 +; KNL_X32-NEXT: kmovw %eax, %k0 +; KNL_X32-NEXT: kshiftlw $1, %k0, %k0 +; KNL_X32-NEXT: kmovw {{[-0-9]+}}(%e{{[sb]}}p), %k2 ## 2-byte Reload +; KNL_X32-NEXT: korw %k0, %k2, %k0 ; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al -; KNL_X32-NEXT: kmovw %eax, %k3 -; KNL_X32-NEXT: kxorw %k3, %k2, %k2 +; KNL_X32-NEXT: kmovw %eax, %k2 ; KNL_X32-NEXT: kshiftlw $15, %k2, %k2 -; KNL_X32-NEXT: kshiftrw $7, %k2, %k2 -; KNL_X32-NEXT: kxorw %k2, %k0, %k0 -; KNL_X32-NEXT: kshiftrw $9, %k0, %k2 +; KNL_X32-NEXT: kshiftrw $15, %k2, %k2 +; KNL_X32-NEXT: korw %k0, %k2, %k0 ; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al -; KNL_X32-NEXT: kmovw %eax, %k3 -; KNL_X32-NEXT: kxorw %k3, %k2, %k2 -; KNL_X32-NEXT: kshiftlw $15, %k2, %k2 -; KNL_X32-NEXT: kshiftrw $6, %k2, %k2 -; KNL_X32-NEXT: kxorw %k2, %k0, %k0 -; KNL_X32-NEXT: kshiftrw $10, %k0, %k2 -; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al -; KNL_X32-NEXT: kmovw %eax, %k3 -; KNL_X32-NEXT: kxorw %k3, %k2, %k2 -; KNL_X32-NEXT: kshiftlw $15, %k2, %k2 -; KNL_X32-NEXT: kshiftrw $5, %k2, %k2 -; KNL_X32-NEXT: kxorw %k2, %k0, %k0 -; KNL_X32-NEXT: kshiftrw $11, %k0, %k2 -; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al -; KNL_X32-NEXT: kmovw %eax, %k3 -; KNL_X32-NEXT: kxorw %k3, %k2, %k2 -; KNL_X32-NEXT: kshiftlw $15, %k2, %k2 -; KNL_X32-NEXT: kshiftrw $4, %k2, %k2 -; KNL_X32-NEXT: kxorw %k2, %k0, %k0 -; KNL_X32-NEXT: kshiftrw $12, %k0, %k2 -; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al -; KNL_X32-NEXT: kmovw %eax, %k3 -; KNL_X32-NEXT: kxorw %k3, %k2, %k2 -; KNL_X32-NEXT: kshiftlw $15, %k2, %k2 -; KNL_X32-NEXT: kshiftrw $3, %k2, %k2 -; KNL_X32-NEXT: kxorw %k2, %k0, %k0 -; KNL_X32-NEXT: kshiftrw $13, %k0, %k2 -; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al -; KNL_X32-NEXT: kmovw %eax, %k3 -; KNL_X32-NEXT: kxorw %k3, %k2, %k2 -; KNL_X32-NEXT: kshiftlw $15, %k2, %k2 -; KNL_X32-NEXT: kshiftrw $2, %k2, %k2 -; KNL_X32-NEXT: kxorw %k2, %k0, %k0 -; 
KNL_X32-NEXT: kshiftrw $14, %k0, %k2 -; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al -; KNL_X32-NEXT: kmovw %eax, %k3 -; KNL_X32-NEXT: kxorw %k3, %k2, %k2 -; KNL_X32-NEXT: kshiftlw $14, %k2, %k2 -; KNL_X32-NEXT: kxorw %k2, %k0, %k0 -; KNL_X32-NEXT: kshiftlw $1, %k0, %k0 -; KNL_X32-NEXT: kshiftrw $1, %k0, %k0 +; KNL_X32-NEXT: kmovw %eax, %k2 +; KNL_X32-NEXT: kshiftlw $2, %k2, %k2 +; KNL_X32-NEXT: kmovw {{[-0-9]+}}(%e{{[sb]}}p), %k7 ## 2-byte Reload +; KNL_X32-NEXT: korw %k2, %k7, %k2 +; KNL_X32-NEXT: kshiftlw $14, %k0, %k0 +; KNL_X32-NEXT: kshiftrw $14, %k0, %k0 +; KNL_X32-NEXT: korw %k2, %k0, %k0 ; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al ; KNL_X32-NEXT: kmovw %eax, %k2 -; KNL_X32-NEXT: kshiftlw $15, %k2, %k2 +; KNL_X32-NEXT: kshiftlw $3, %k2, %k2 +; KNL_X32-NEXT: kmovw {{[-0-9]+}}(%e{{[sb]}}p), %k7 ## 2-byte Reload +; KNL_X32-NEXT: korw %k2, %k7, %k2 +; KNL_X32-NEXT: kshiftlw $13, %k0, %k0 +; KNL_X32-NEXT: kshiftrw $13, %k0, %k0 ; KNL_X32-NEXT: korw %k2, %k0, %k0 ; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al ; KNL_X32-NEXT: kmovw %eax, %k2 +; KNL_X32-NEXT: kshiftlw $4, %k2, %k2 +; KNL_X32-NEXT: kmovw {{[-0-9]+}}(%e{{[sb]}}p), %k7 ## 2-byte Reload +; KNL_X32-NEXT: korw %k2, %k7, %k2 +; KNL_X32-NEXT: kshiftlw $12, %k0, %k0 +; KNL_X32-NEXT: kshiftrw $12, %k0, %k0 +; KNL_X32-NEXT: korw %k2, %k0, %k0 ; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al -; KNL_X32-NEXT: kmovw %eax, %k3 -; KNL_X32-NEXT: kxorw %k1, %k3, %k1 -; KNL_X32-NEXT: kshiftrw $2, %k1, %k3 -; KNL_X32-NEXT: kxorw %k2, %k3, %k2 -; KNL_X32-NEXT: kshiftlw $15, %k2, %k2 -; KNL_X32-NEXT: kshiftrw $13, %k2, %k2 -; KNL_X32-NEXT: kxorw %k2, %k1, %k1 -; KNL_X32-NEXT: kshiftrw $3, %k1, %k2 -; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al -; KNL_X32-NEXT: kmovw %eax, %k3 -; KNL_X32-NEXT: kxorw %k3, %k2, %k2 -; KNL_X32-NEXT: kshiftlw $15, %k2, %k2 -; KNL_X32-NEXT: kshiftrw $12, %k2, %k2 -; KNL_X32-NEXT: kxorw %k2, %k1, %k1 -; KNL_X32-NEXT: kshiftrw $4, %k1, %k2 -; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al -; KNL_X32-NEXT: kmovw %eax, 
%k3 -; KNL_X32-NEXT: kxorw %k3, %k2, %k2 -; KNL_X32-NEXT: kshiftlw $15, %k2, %k2 -; KNL_X32-NEXT: kshiftrw $11, %k2, %k2 -; KNL_X32-NEXT: kxorw %k2, %k1, %k1 -; KNL_X32-NEXT: kshiftrw $5, %k1, %k2 -; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al -; KNL_X32-NEXT: kmovw %eax, %k3 -; KNL_X32-NEXT: kxorw %k3, %k2, %k2 -; KNL_X32-NEXT: kshiftlw $15, %k2, %k2 -; KNL_X32-NEXT: kshiftrw $10, %k2, %k2 -; KNL_X32-NEXT: kxorw %k2, %k1, %k1 -; KNL_X32-NEXT: kshiftrw $6, %k1, %k2 -; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al -; KNL_X32-NEXT: kmovw %eax, %k3 -; KNL_X32-NEXT: kxorw %k3, %k2, %k2 -; KNL_X32-NEXT: kshiftlw $15, %k2, %k2 -; KNL_X32-NEXT: kshiftrw $9, %k2, %k2 -; KNL_X32-NEXT: kxorw %k2, %k1, %k1 -; KNL_X32-NEXT: kshiftrw $7, %k1, %k2 +; KNL_X32-NEXT: kmovw %eax, %k2 +; KNL_X32-NEXT: kshiftlw $5, %k2, %k2 +; KNL_X32-NEXT: kmovw {{[-0-9]+}}(%e{{[sb]}}p), %k7 ## 2-byte Reload +; KNL_X32-NEXT: korw %k2, %k7, %k2 +; KNL_X32-NEXT: kshiftlw $11, %k0, %k0 +; KNL_X32-NEXT: kshiftrw $11, %k0, %k0 +; KNL_X32-NEXT: korw %k2, %k0, %k0 ; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al -; KNL_X32-NEXT: kmovw %eax, %k3 -; KNL_X32-NEXT: kxorw %k3, %k2, %k2 -; KNL_X32-NEXT: kshiftlw $15, %k2, %k2 -; KNL_X32-NEXT: kshiftrw $8, %k2, %k2 -; KNL_X32-NEXT: kxorw %k2, %k1, %k1 -; KNL_X32-NEXT: kshiftrw $8, %k1, %k2 +; KNL_X32-NEXT: kmovw %eax, %k2 +; KNL_X32-NEXT: kshiftlw $6, %k2, %k2 +; KNL_X32-NEXT: kmovw {{[-0-9]+}}(%e{{[sb]}}p), %k7 ## 2-byte Reload +; KNL_X32-NEXT: korw %k2, %k7, %k2 +; KNL_X32-NEXT: kshiftlw $10, %k0, %k0 +; KNL_X32-NEXT: kshiftrw $10, %k0, %k0 +; KNL_X32-NEXT: korw %k2, %k0, %k0 ; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al -; KNL_X32-NEXT: kmovw %eax, %k3 -; KNL_X32-NEXT: kxorw %k3, %k2, %k2 -; KNL_X32-NEXT: kshiftlw $15, %k2, %k2 -; KNL_X32-NEXT: kshiftrw $7, %k2, %k2 -; KNL_X32-NEXT: kxorw %k2, %k1, %k1 -; KNL_X32-NEXT: kshiftrw $9, %k1, %k2 +; KNL_X32-NEXT: kmovw %eax, %k2 +; KNL_X32-NEXT: kshiftlw $7, %k2, %k2 +; KNL_X32-NEXT: kmovw {{[-0-9]+}}(%e{{[sb]}}p), %k7 ## 2-byte 
Reload +; KNL_X32-NEXT: korw %k2, %k7, %k2 +; KNL_X32-NEXT: kshiftlw $9, %k0, %k0 +; KNL_X32-NEXT: kshiftrw $9, %k0, %k0 +; KNL_X32-NEXT: korw %k2, %k0, %k0 ; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al -; KNL_X32-NEXT: kmovw %eax, %k3 -; KNL_X32-NEXT: kxorw %k3, %k2, %k2 -; KNL_X32-NEXT: kshiftlw $15, %k2, %k2 -; KNL_X32-NEXT: kshiftrw $6, %k2, %k2 -; KNL_X32-NEXT: kxorw %k2, %k1, %k1 -; KNL_X32-NEXT: kshiftrw $10, %k1, %k2 +; KNL_X32-NEXT: kmovw %eax, %k2 +; KNL_X32-NEXT: kshiftlw $8, %k2, %k2 +; KNL_X32-NEXT: kmovw {{[-0-9]+}}(%e{{[sb]}}p), %k7 ## 2-byte Reload +; KNL_X32-NEXT: korw %k2, %k7, %k2 +; KNL_X32-NEXT: kshiftlw $8, %k0, %k0 +; KNL_X32-NEXT: kshiftrw $8, %k0, %k0 +; KNL_X32-NEXT: korw %k2, %k0, %k0 ; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al -; KNL_X32-NEXT: kmovw %eax, %k3 -; KNL_X32-NEXT: kxorw %k3, %k2, %k2 -; KNL_X32-NEXT: kshiftlw $15, %k2, %k2 -; KNL_X32-NEXT: kshiftrw $5, %k2, %k2 -; KNL_X32-NEXT: kxorw %k2, %k1, %k1 -; KNL_X32-NEXT: kshiftrw $11, %k1, %k2 +; KNL_X32-NEXT: kmovw %eax, %k2 +; KNL_X32-NEXT: kshiftlw $9, %k2, %k2 +; KNL_X32-NEXT: kmovw {{[-0-9]+}}(%e{{[sb]}}p), %k7 ## 2-byte Reload +; KNL_X32-NEXT: korw %k2, %k7, %k2 +; KNL_X32-NEXT: kshiftlw $7, %k0, %k0 +; KNL_X32-NEXT: kshiftrw $7, %k0, %k0 +; KNL_X32-NEXT: korw %k2, %k0, %k0 ; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al -; KNL_X32-NEXT: kmovw %eax, %k3 -; KNL_X32-NEXT: kxorw %k3, %k2, %k2 -; KNL_X32-NEXT: kshiftlw $15, %k2, %k2 -; KNL_X32-NEXT: kshiftrw $4, %k2, %k2 -; KNL_X32-NEXT: kxorw %k2, %k1, %k1 -; KNL_X32-NEXT: kshiftrw $12, %k1, %k2 +; KNL_X32-NEXT: kmovw %eax, %k2 +; KNL_X32-NEXT: kshiftlw $10, %k2, %k2 +; KNL_X32-NEXT: korw %k2, %k6, %k2 +; KNL_X32-NEXT: kshiftlw $6, %k0, %k0 +; KNL_X32-NEXT: kshiftrw $6, %k0, %k0 +; KNL_X32-NEXT: korw %k2, %k0, %k0 ; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al -; KNL_X32-NEXT: kmovw %eax, %k3 -; KNL_X32-NEXT: kxorw %k3, %k2, %k2 -; KNL_X32-NEXT: kshiftlw $15, %k2, %k2 -; KNL_X32-NEXT: kshiftrw $3, %k2, %k2 -; KNL_X32-NEXT: kxorw %k2, %k1, %k1 
-; KNL_X32-NEXT: kshiftrw $13, %k1, %k2 +; KNL_X32-NEXT: kmovw %eax, %k2 +; KNL_X32-NEXT: kshiftlw $11, %k2, %k2 +; KNL_X32-NEXT: korw %k2, %k5, %k2 +; KNL_X32-NEXT: kshiftlw $5, %k0, %k0 +; KNL_X32-NEXT: kshiftrw $5, %k0, %k0 +; KNL_X32-NEXT: korw %k2, %k0, %k0 ; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al -; KNL_X32-NEXT: kmovw %eax, %k3 -; KNL_X32-NEXT: kxorw %k3, %k2, %k2 -; KNL_X32-NEXT: kshiftlw $15, %k2, %k2 -; KNL_X32-NEXT: kshiftrw $2, %k2, %k2 -; KNL_X32-NEXT: kxorw %k2, %k1, %k1 -; KNL_X32-NEXT: kshiftrw $14, %k1, %k2 +; KNL_X32-NEXT: kmovw %eax, %k2 +; KNL_X32-NEXT: kshiftlw $12, %k2, %k2 +; KNL_X32-NEXT: korw %k2, %k4, %k2 +; KNL_X32-NEXT: kshiftlw $4, %k0, %k0 +; KNL_X32-NEXT: kshiftrw $4, %k0, %k0 +; KNL_X32-NEXT: korw %k2, %k0, %k0 ; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al -; KNL_X32-NEXT: kmovw %eax, %k3 -; KNL_X32-NEXT: kxorw %k3, %k2, %k2 -; KNL_X32-NEXT: kshiftlw $14, %k2, %k2 -; KNL_X32-NEXT: kxorw %k2, %k1, %k1 -; KNL_X32-NEXT: kshiftlw $1, %k1, %k1 -; KNL_X32-NEXT: kshiftrw $1, %k1, %k1 +; KNL_X32-NEXT: kmovw %eax, %k2 +; KNL_X32-NEXT: kshiftlw $13, %k2, %k2 +; KNL_X32-NEXT: korw %k2, %k3, %k2 +; KNL_X32-NEXT: kshiftlw $3, %k0, %k0 +; KNL_X32-NEXT: kshiftrw $3, %k0, %k0 +; KNL_X32-NEXT: korw %k2, %k0, %k0 ; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al ; KNL_X32-NEXT: kmovw %eax, %k2 -; KNL_X32-NEXT: kshiftlw $15, %k2, %k2 +; KNL_X32-NEXT: kshiftlw $14, %k2, %k2 ; KNL_X32-NEXT: korw %k2, %k1, %k1 ; KNL_X32-NEXT: xorl %eax, %eax ; KNL_X32-NEXT: testb $1, {{[0-9]+}}(%esp) ; KNL_X32-NEXT: movl $65535, %ecx ## imm = 0xFFFF ; KNL_X32-NEXT: movl $0, %edx ; KNL_X32-NEXT: cmovnel %ecx, %edx +; KNL_X32-NEXT: kshiftlw $2, %k0, %k0 +; KNL_X32-NEXT: kshiftrw $2, %k0, %k0 +; KNL_X32-NEXT: korw %k1, %k0, %k0 +; KNL_X32-NEXT: kshiftlw $1, %k0, %k0 +; KNL_X32-NEXT: kshiftrw $1, %k0, %k0 +; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %bl +; KNL_X32-NEXT: kmovw %ebx, %k1 +; KNL_X32-NEXT: kshiftlw $15, %k1, %k1 +; KNL_X32-NEXT: korw %k1, %k0, %k0 +; KNL_X32-NEXT: kmovw %edx, 
%k1 ; KNL_X32-NEXT: testb $1, {{[0-9]+}}(%esp) ; KNL_X32-NEXT: cmovnel %ecx, %eax -; KNL_X32-NEXT: kandw %k0, %k1, %k0 -; KNL_X32-NEXT: kmovw %edx, %k1 +; KNL_X32-NEXT: kmovw (%esp), %k2 ## 2-byte Reload +; KNL_X32-NEXT: kandw %k2, %k0, %k0 ; KNL_X32-NEXT: kmovw %eax, %k2 ; KNL_X32-NEXT: kandw %k1, %k2, %k1 ; KNL_X32-NEXT: movl {{[0-9]+}}(%esp), %eax @@ -1403,6 +1596,7 @@ define <17 x i1> @test16(<17 x i1> %a, <17 x i1> %b) nounwind { ; KNL_X32-NEXT: orl %esi, %ecx ; KNL_X32-NEXT: orl %edx, %ecx ; KNL_X32-NEXT: movw %cx, (%eax) +; KNL_X32-NEXT: addl $20, %esp ; KNL_X32-NEXT: popl %esi ; KNL_X32-NEXT: popl %edi ; KNL_X32-NEXT: popl %ebx @@ -1416,356 +1610,550 @@ define <7 x i1> @test17(<7 x i1> %a, <7 x i1> %b, <7 x i1> %c, <7 x i1> %d, <7 x ; KNL-LABEL: test17: ; KNL: ## %bb.0: ; KNL-NEXT: movq %rdi, %rax -; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dl -; KNL-NEXT: kmovw %edx, %k0 -; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dl -; KNL-NEXT: kmovw %edx, %k2 -; KNL-NEXT: kshiftlw $15, %k0, %k1 -; KNL-NEXT: kshiftrw $14, %k1, %k1 -; KNL-NEXT: kxorw %k1, %k2, %k2 -; KNL-NEXT: kshiftrw $2, %k2, %k3 -; KNL-NEXT: kxorw %k0, %k3, %k0 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dil +; KNL-NEXT: kmovw %edi, %k0 ; KNL-NEXT: kshiftlw $15, %k0, %k0 -; KNL-NEXT: kshiftrw $13, %k0, %k0 -; KNL-NEXT: kxorw %k0, %k2, %k0 +; KNL-NEXT: kshiftrw $15, %k0, %k0 +; KNL-NEXT: kshiftlw $2, %k0, %k1 +; KNL-NEXT: korw %k1, %k0, %k0 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dil +; KNL-NEXT: kmovw %edi, %k2 +; KNL-NEXT: kshiftlw $15, %k2, %k2 +; KNL-NEXT: kshiftrw $14, %k2, %k2 +; KNL-NEXT: korw %k0, %k2, %k0 ; KNL-NEXT: kshiftrw $3, %k0, %k2 -; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dl -; KNL-NEXT: kmovw %edx, %k3 -; KNL-NEXT: kxorw %k3, %k2, %k2 +; KNL-NEXT: kshiftlw $3, %k2, %k2 +; KNL-NEXT: kshiftlw $14, %k0, %k0 +; KNL-NEXT: kshiftrw $14, %k0, %k0 +; KNL-NEXT: korw %k2, %k0, %k0 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dil +; KNL-NEXT: kmovw %edi, %k2 ; KNL-NEXT: kshiftlw $15, %k2, %k2 -; KNL-NEXT: kshiftrw $12, %k2, %k2 -; 
KNL-NEXT: kxorw %k2, %k0, %k0 +; KNL-NEXT: kshiftrw $13, %k2, %k2 +; KNL-NEXT: korw %k0, %k2, %k0 ; KNL-NEXT: kshiftrw $4, %k0, %k2 -; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dl -; KNL-NEXT: kmovw %edx, %k3 -; KNL-NEXT: kxorw %k3, %k2, %k2 +; KNL-NEXT: kshiftlw $4, %k2, %k2 +; KNL-NEXT: kshiftlw $13, %k0, %k0 +; KNL-NEXT: kshiftrw $13, %k0, %k0 +; KNL-NEXT: korw %k2, %k0, %k0 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dil +; KNL-NEXT: kmovw %edi, %k2 ; KNL-NEXT: kshiftlw $15, %k2, %k2 -; KNL-NEXT: kshiftrw $11, %k2, %k2 -; KNL-NEXT: kxorw %k2, %k0, %k0 +; KNL-NEXT: kshiftrw $12, %k2, %k2 +; KNL-NEXT: korw %k0, %k2, %k0 ; KNL-NEXT: kshiftrw $5, %k0, %k2 -; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dl -; KNL-NEXT: kmovw %edx, %k3 -; KNL-NEXT: kxorw %k3, %k2, %k2 +; KNL-NEXT: kshiftlw $5, %k2, %k2 +; KNL-NEXT: kshiftlw $12, %k0, %k0 +; KNL-NEXT: kshiftrw $12, %k0, %k0 +; KNL-NEXT: korw %k2, %k0, %k0 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dil +; KNL-NEXT: kmovw %edi, %k2 ; KNL-NEXT: kshiftlw $15, %k2, %k2 -; KNL-NEXT: kshiftrw $10, %k2, %k2 -; KNL-NEXT: kxorw %k2, %k0, %k0 +; KNL-NEXT: kshiftrw $11, %k2, %k2 +; KNL-NEXT: korw %k0, %k2, %k0 ; KNL-NEXT: kshiftrw $6, %k0, %k2 -; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dl -; KNL-NEXT: kmovw %edx, %k3 -; KNL-NEXT: kxorw %k3, %k2, %k2 +; KNL-NEXT: kshiftlw $6, %k2, %k2 +; KNL-NEXT: kshiftlw $11, %k0, %k0 +; KNL-NEXT: kshiftrw $11, %k0, %k0 +; KNL-NEXT: korw %k2, %k0, %k0 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dil +; KNL-NEXT: kmovw %edi, %k2 +; KNL-NEXT: kshiftlw $15, %k2, %k2 +; KNL-NEXT: kshiftrw $10, %k2, %k2 +; KNL-NEXT: korw %k0, %k2, %k0 +; KNL-NEXT: kshiftrw $7, %k0, %k2 +; KNL-NEXT: kshiftlw $7, %k2, %k2 +; KNL-NEXT: kshiftlw $10, %k0, %k0 +; KNL-NEXT: kshiftrw $10, %k0, %k0 +; KNL-NEXT: korw %k2, %k0, %k0 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dil +; KNL-NEXT: kmovw %edi, %k2 ; KNL-NEXT: kshiftlw $15, %k2, %k2 ; KNL-NEXT: kshiftrw $9, %k2, %k2 -; KNL-NEXT: kxorw %k2, %k0, %k0 +; KNL-NEXT: korw %k0, %k2, %k0 ; KNL-NEXT: kmovw %k0, 
{{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill -; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dl -; KNL-NEXT: kmovw %edx, %k0 -; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dl -; KNL-NEXT: kmovw %edx, %k2 -; KNL-NEXT: kxorw %k1, %k2, %k2 -; KNL-NEXT: kshiftrw $2, %k2, %k3 -; KNL-NEXT: kxorw %k0, %k3, %k0 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dil +; KNL-NEXT: kmovw %edi, %k0 ; KNL-NEXT: kshiftlw $15, %k0, %k0 -; KNL-NEXT: kshiftrw $13, %k0, %k0 -; KNL-NEXT: kxorw %k0, %k2, %k0 +; KNL-NEXT: kshiftrw $15, %k0, %k0 +; KNL-NEXT: korw %k1, %k0, %k0 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dil +; KNL-NEXT: kmovw %edi, %k2 +; KNL-NEXT: kshiftlw $15, %k2, %k2 +; KNL-NEXT: kshiftrw $14, %k2, %k2 +; KNL-NEXT: korw %k0, %k2, %k0 ; KNL-NEXT: kshiftrw $3, %k0, %k2 -; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dl -; KNL-NEXT: kmovw %edx, %k3 -; KNL-NEXT: kxorw %k3, %k2, %k2 +; KNL-NEXT: kshiftlw $3, %k2, %k2 +; KNL-NEXT: kshiftlw $14, %k0, %k0 +; KNL-NEXT: kshiftrw $14, %k0, %k0 +; KNL-NEXT: korw %k2, %k0, %k0 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dil +; KNL-NEXT: kmovw %edi, %k2 ; KNL-NEXT: kshiftlw $15, %k2, %k2 -; KNL-NEXT: kshiftrw $12, %k2, %k2 -; KNL-NEXT: kxorw %k2, %k0, %k0 +; KNL-NEXT: kshiftrw $13, %k2, %k2 +; KNL-NEXT: korw %k0, %k2, %k0 ; KNL-NEXT: kshiftrw $4, %k0, %k2 -; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dl -; KNL-NEXT: kmovw %edx, %k3 -; KNL-NEXT: kxorw %k3, %k2, %k2 +; KNL-NEXT: kshiftlw $4, %k2, %k2 +; KNL-NEXT: kshiftlw $13, %k0, %k0 +; KNL-NEXT: kshiftrw $13, %k0, %k0 +; KNL-NEXT: korw %k2, %k0, %k0 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dil +; KNL-NEXT: kmovw %edi, %k2 ; KNL-NEXT: kshiftlw $15, %k2, %k2 -; KNL-NEXT: kshiftrw $11, %k2, %k2 -; KNL-NEXT: kxorw %k2, %k0, %k0 +; KNL-NEXT: kshiftrw $12, %k2, %k2 +; KNL-NEXT: korw %k0, %k2, %k0 ; KNL-NEXT: kshiftrw $5, %k0, %k2 -; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dl -; KNL-NEXT: kmovw %edx, %k3 -; KNL-NEXT: kxorw %k3, %k2, %k2 +; KNL-NEXT: kshiftlw $5, %k2, %k2 +; KNL-NEXT: kshiftlw $12, %k0, %k0 +; KNL-NEXT: kshiftrw $12, %k0, %k0 +; KNL-NEXT: korw %k2, 
%k0, %k0 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dil +; KNL-NEXT: kmovw %edi, %k2 ; KNL-NEXT: kshiftlw $15, %k2, %k2 -; KNL-NEXT: kshiftrw $10, %k2, %k2 -; KNL-NEXT: kxorw %k2, %k0, %k0 +; KNL-NEXT: kshiftrw $11, %k2, %k2 +; KNL-NEXT: korw %k0, %k2, %k0 ; KNL-NEXT: kshiftrw $6, %k0, %k2 -; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dl -; KNL-NEXT: kmovw %edx, %k3 -; KNL-NEXT: kxorw %k3, %k2, %k2 +; KNL-NEXT: kshiftlw $6, %k2, %k2 +; KNL-NEXT: kshiftlw $11, %k0, %k0 +; KNL-NEXT: kshiftrw $11, %k0, %k0 +; KNL-NEXT: korw %k2, %k0, %k0 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dil +; KNL-NEXT: kmovw %edi, %k2 +; KNL-NEXT: kshiftlw $15, %k2, %k2 +; KNL-NEXT: kshiftrw $10, %k2, %k2 +; KNL-NEXT: korw %k0, %k2, %k0 +; KNL-NEXT: kshiftrw $7, %k0, %k2 +; KNL-NEXT: kshiftlw $7, %k2, %k2 +; KNL-NEXT: kshiftlw $10, %k0, %k0 +; KNL-NEXT: kshiftrw $10, %k0, %k0 +; KNL-NEXT: korw %k2, %k0, %k0 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dil +; KNL-NEXT: kmovw %edi, %k2 ; KNL-NEXT: kshiftlw $15, %k2, %k2 ; KNL-NEXT: kshiftrw $9, %k2, %k2 -; KNL-NEXT: kxorw %k2, %k0, %k0 +; KNL-NEXT: korw %k0, %k2, %k0 ; KNL-NEXT: kmovw %k0, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill -; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dl -; KNL-NEXT: kmovw %edx, %k0 -; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dl -; KNL-NEXT: kmovw %edx, %k3 -; KNL-NEXT: kxorw %k1, %k3, %k3 -; KNL-NEXT: kshiftrw $2, %k3, %k4 -; KNL-NEXT: kxorw %k0, %k4, %k0 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dil +; KNL-NEXT: kmovw %edi, %k0 ; KNL-NEXT: kshiftlw $15, %k0, %k0 -; KNL-NEXT: kshiftrw $13, %k0, %k0 -; KNL-NEXT: kxorw %k0, %k3, %k0 +; KNL-NEXT: kshiftrw $15, %k0, %k0 +; KNL-NEXT: korw %k1, %k0, %k0 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dil +; KNL-NEXT: kmovw %edi, %k3 +; KNL-NEXT: kshiftlw $15, %k3, %k3 +; KNL-NEXT: kshiftrw $14, %k3, %k3 +; KNL-NEXT: korw %k0, %k3, %k0 ; KNL-NEXT: kshiftrw $3, %k0, %k3 -; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dl -; KNL-NEXT: kmovw %edx, %k4 -; KNL-NEXT: kxorw %k4, %k3, %k3 +; KNL-NEXT: kshiftlw $3, %k3, %k3 +; KNL-NEXT: kshiftlw $14, %k0, 
%k0 +; KNL-NEXT: kshiftrw $14, %k0, %k0 +; KNL-NEXT: korw %k3, %k0, %k0 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dil +; KNL-NEXT: kmovw %edi, %k3 ; KNL-NEXT: kshiftlw $15, %k3, %k3 -; KNL-NEXT: kshiftrw $12, %k3, %k3 -; KNL-NEXT: kxorw %k3, %k0, %k0 +; KNL-NEXT: kshiftrw $13, %k3, %k3 +; KNL-NEXT: korw %k0, %k3, %k0 ; KNL-NEXT: kshiftrw $4, %k0, %k3 -; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dl -; KNL-NEXT: kmovw %edx, %k4 -; KNL-NEXT: kxorw %k4, %k3, %k3 +; KNL-NEXT: kshiftlw $4, %k3, %k3 +; KNL-NEXT: kshiftlw $13, %k0, %k0 +; KNL-NEXT: kshiftrw $13, %k0, %k0 +; KNL-NEXT: korw %k3, %k0, %k0 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dil +; KNL-NEXT: kmovw %edi, %k3 ; KNL-NEXT: kshiftlw $15, %k3, %k3 -; KNL-NEXT: kshiftrw $11, %k3, %k3 -; KNL-NEXT: kxorw %k3, %k0, %k0 +; KNL-NEXT: kshiftrw $12, %k3, %k3 +; KNL-NEXT: korw %k0, %k3, %k0 ; KNL-NEXT: kshiftrw $5, %k0, %k3 -; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dl -; KNL-NEXT: kmovw %edx, %k4 -; KNL-NEXT: kxorw %k4, %k3, %k3 +; KNL-NEXT: kshiftlw $5, %k3, %k3 +; KNL-NEXT: kshiftlw $12, %k0, %k0 +; KNL-NEXT: kshiftrw $12, %k0, %k0 +; KNL-NEXT: korw %k3, %k0, %k0 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dil +; KNL-NEXT: kmovw %edi, %k3 ; KNL-NEXT: kshiftlw $15, %k3, %k3 -; KNL-NEXT: kshiftrw $10, %k3, %k3 -; KNL-NEXT: kxorw %k3, %k0, %k0 +; KNL-NEXT: kshiftrw $11, %k3, %k3 +; KNL-NEXT: korw %k0, %k3, %k0 ; KNL-NEXT: kshiftrw $6, %k0, %k3 -; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dl -; KNL-NEXT: kmovw %edx, %k4 -; KNL-NEXT: kxorw %k4, %k3, %k3 +; KNL-NEXT: kshiftlw $6, %k3, %k3 +; KNL-NEXT: kshiftlw $11, %k0, %k0 +; KNL-NEXT: kshiftrw $11, %k0, %k0 +; KNL-NEXT: korw %k3, %k0, %k0 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dil +; KNL-NEXT: kmovw %edi, %k3 +; KNL-NEXT: kshiftlw $15, %k3, %k3 +; KNL-NEXT: kshiftrw $10, %k3, %k3 +; KNL-NEXT: korw %k0, %k3, %k0 +; KNL-NEXT: kshiftrw $7, %k0, %k3 +; KNL-NEXT: kshiftlw $7, %k3, %k3 +; KNL-NEXT: kshiftlw $10, %k0, %k0 +; KNL-NEXT: kshiftrw $10, %k0, %k0 +; KNL-NEXT: korw %k3, %k0, %k0 +; KNL-NEXT: movb 
{{[0-9]+}}(%rsp), %dil +; KNL-NEXT: kmovw %edi, %k3 ; KNL-NEXT: kshiftlw $15, %k3, %k3 ; KNL-NEXT: kshiftrw $9, %k3, %k3 -; KNL-NEXT: kxorw %k3, %k0, %k0 -; KNL-NEXT: kmovw %k0, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill -; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dl -; KNL-NEXT: kmovw %edx, %k0 -; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dl -; KNL-NEXT: kmovw %edx, %k4 -; KNL-NEXT: kxorw %k1, %k4, %k4 -; KNL-NEXT: kshiftrw $2, %k4, %k5 -; KNL-NEXT: kxorw %k0, %k5, %k0 +; KNL-NEXT: korw %k0, %k3, %k3 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dil +; KNL-NEXT: kmovw %edi, %k0 ; KNL-NEXT: kshiftlw $15, %k0, %k0 -; KNL-NEXT: kshiftrw $13, %k0, %k0 -; KNL-NEXT: kxorw %k0, %k4, %k0 +; KNL-NEXT: kshiftrw $15, %k0, %k0 +; KNL-NEXT: korw %k1, %k0, %k0 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dil +; KNL-NEXT: kmovw %edi, %k4 +; KNL-NEXT: kshiftlw $15, %k4, %k4 +; KNL-NEXT: kshiftrw $14, %k4, %k4 +; KNL-NEXT: korw %k0, %k4, %k0 ; KNL-NEXT: kshiftrw $3, %k0, %k4 -; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dl -; KNL-NEXT: kmovw %edx, %k5 -; KNL-NEXT: kxorw %k5, %k4, %k4 +; KNL-NEXT: kshiftlw $3, %k4, %k4 +; KNL-NEXT: kshiftlw $14, %k0, %k0 +; KNL-NEXT: kshiftrw $14, %k0, %k0 +; KNL-NEXT: korw %k4, %k0, %k0 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dil +; KNL-NEXT: kmovw %edi, %k4 ; KNL-NEXT: kshiftlw $15, %k4, %k4 -; KNL-NEXT: kshiftrw $12, %k4, %k4 -; KNL-NEXT: kxorw %k4, %k0, %k0 +; KNL-NEXT: kshiftrw $13, %k4, %k4 +; KNL-NEXT: korw %k0, %k4, %k0 ; KNL-NEXT: kshiftrw $4, %k0, %k4 -; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dl -; KNL-NEXT: kmovw %edx, %k5 -; KNL-NEXT: kxorw %k5, %k4, %k4 +; KNL-NEXT: kshiftlw $4, %k4, %k4 +; KNL-NEXT: kshiftlw $13, %k0, %k0 +; KNL-NEXT: kshiftrw $13, %k0, %k0 +; KNL-NEXT: korw %k4, %k0, %k0 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dil +; KNL-NEXT: kmovw %edi, %k4 ; KNL-NEXT: kshiftlw $15, %k4, %k4 -; KNL-NEXT: kshiftrw $11, %k4, %k4 -; KNL-NEXT: kxorw %k4, %k0, %k0 +; KNL-NEXT: kshiftrw $12, %k4, %k4 +; KNL-NEXT: korw %k0, %k4, %k0 ; KNL-NEXT: kshiftrw $5, %k0, %k4 -; KNL-NEXT: movb 
{{[0-9]+}}(%rsp), %dl -; KNL-NEXT: kmovw %edx, %k5 -; KNL-NEXT: kxorw %k5, %k4, %k4 +; KNL-NEXT: kshiftlw $5, %k4, %k4 +; KNL-NEXT: kshiftlw $12, %k0, %k0 +; KNL-NEXT: kshiftrw $12, %k0, %k0 +; KNL-NEXT: korw %k4, %k0, %k0 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dil +; KNL-NEXT: kmovw %edi, %k4 ; KNL-NEXT: kshiftlw $15, %k4, %k4 -; KNL-NEXT: kshiftrw $10, %k4, %k4 -; KNL-NEXT: kxorw %k4, %k0, %k0 +; KNL-NEXT: kshiftrw $11, %k4, %k4 +; KNL-NEXT: korw %k0, %k4, %k0 ; KNL-NEXT: kshiftrw $6, %k0, %k4 -; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dl -; KNL-NEXT: kmovw %edx, %k5 -; KNL-NEXT: kxorw %k5, %k4, %k4 +; KNL-NEXT: kshiftlw $6, %k4, %k4 +; KNL-NEXT: kshiftlw $11, %k0, %k0 +; KNL-NEXT: kshiftrw $11, %k0, %k0 +; KNL-NEXT: korw %k4, %k0, %k0 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dil +; KNL-NEXT: kmovw %edi, %k4 +; KNL-NEXT: kshiftlw $15, %k4, %k4 +; KNL-NEXT: kshiftrw $10, %k4, %k4 +; KNL-NEXT: korw %k0, %k4, %k0 +; KNL-NEXT: kshiftrw $7, %k0, %k4 +; KNL-NEXT: kshiftlw $7, %k4, %k4 +; KNL-NEXT: kshiftlw $10, %k0, %k0 +; KNL-NEXT: kshiftrw $10, %k0, %k0 +; KNL-NEXT: korw %k4, %k0, %k0 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dil +; KNL-NEXT: kmovw %edi, %k4 ; KNL-NEXT: kshiftlw $15, %k4, %k4 ; KNL-NEXT: kshiftrw $9, %k4, %k4 -; KNL-NEXT: kxorw %k4, %k0, %k4 -; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dl -; KNL-NEXT: kmovw %edx, %k0 -; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dl -; KNL-NEXT: kmovw %edx, %k5 -; KNL-NEXT: kxorw %k1, %k5, %k5 -; KNL-NEXT: kshiftrw $2, %k5, %k6 -; KNL-NEXT: kxorw %k0, %k6, %k0 +; KNL-NEXT: korw %k0, %k4, %k4 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dil +; KNL-NEXT: kmovw %edi, %k0 ; KNL-NEXT: kshiftlw $15, %k0, %k0 -; KNL-NEXT: kshiftrw $13, %k0, %k0 -; KNL-NEXT: kxorw %k0, %k5, %k0 +; KNL-NEXT: kshiftrw $15, %k0, %k0 +; KNL-NEXT: korw %k1, %k0, %k0 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dil +; KNL-NEXT: kmovw %edi, %k5 +; KNL-NEXT: kshiftlw $15, %k5, %k5 +; KNL-NEXT: kshiftrw $14, %k5, %k5 +; KNL-NEXT: korw %k0, %k5, %k0 ; KNL-NEXT: kshiftrw $3, %k0, %k5 -; KNL-NEXT: movb 
{{[0-9]+}}(%rsp), %dl -; KNL-NEXT: kmovw %edx, %k6 -; KNL-NEXT: kxorw %k6, %k5, %k5 +; KNL-NEXT: kshiftlw $3, %k5, %k5 +; KNL-NEXT: kshiftlw $14, %k0, %k0 +; KNL-NEXT: kshiftrw $14, %k0, %k0 +; KNL-NEXT: korw %k5, %k0, %k0 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dil +; KNL-NEXT: kmovw %edi, %k5 ; KNL-NEXT: kshiftlw $15, %k5, %k5 -; KNL-NEXT: kshiftrw $12, %k5, %k5 -; KNL-NEXT: kxorw %k5, %k0, %k0 +; KNL-NEXT: kshiftrw $13, %k5, %k5 +; KNL-NEXT: korw %k0, %k5, %k0 ; KNL-NEXT: kshiftrw $4, %k0, %k5 -; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dl -; KNL-NEXT: kmovw %edx, %k6 -; KNL-NEXT: kxorw %k6, %k5, %k5 +; KNL-NEXT: kshiftlw $4, %k5, %k5 +; KNL-NEXT: kshiftlw $13, %k0, %k0 +; KNL-NEXT: kshiftrw $13, %k0, %k0 +; KNL-NEXT: korw %k5, %k0, %k0 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dil +; KNL-NEXT: kmovw %edi, %k5 ; KNL-NEXT: kshiftlw $15, %k5, %k5 -; KNL-NEXT: kshiftrw $11, %k5, %k5 -; KNL-NEXT: kxorw %k5, %k0, %k0 +; KNL-NEXT: kshiftrw $12, %k5, %k5 +; KNL-NEXT: korw %k0, %k5, %k0 ; KNL-NEXT: kshiftrw $5, %k0, %k5 -; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dl -; KNL-NEXT: kmovw %edx, %k6 -; KNL-NEXT: kxorw %k6, %k5, %k5 +; KNL-NEXT: kshiftlw $5, %k5, %k5 +; KNL-NEXT: kshiftlw $12, %k0, %k0 +; KNL-NEXT: kshiftrw $12, %k0, %k0 +; KNL-NEXT: korw %k5, %k0, %k0 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dil +; KNL-NEXT: kmovw %edi, %k5 ; KNL-NEXT: kshiftlw $15, %k5, %k5 -; KNL-NEXT: kshiftrw $10, %k5, %k5 -; KNL-NEXT: kxorw %k5, %k0, %k0 +; KNL-NEXT: kshiftrw $11, %k5, %k5 +; KNL-NEXT: korw %k0, %k5, %k0 ; KNL-NEXT: kshiftrw $6, %k0, %k5 -; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dl -; KNL-NEXT: kmovw %edx, %k6 -; KNL-NEXT: kxorw %k6, %k5, %k5 +; KNL-NEXT: kshiftlw $6, %k5, %k5 +; KNL-NEXT: kshiftlw $11, %k0, %k0 +; KNL-NEXT: kshiftrw $11, %k0, %k0 +; KNL-NEXT: korw %k5, %k0, %k0 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dil +; KNL-NEXT: kmovw %edi, %k5 +; KNL-NEXT: kshiftlw $15, %k5, %k5 +; KNL-NEXT: kshiftrw $10, %k5, %k5 +; KNL-NEXT: korw %k0, %k5, %k0 +; KNL-NEXT: kshiftrw $7, %k0, %k5 +; KNL-NEXT: 
kshiftlw $7, %k5, %k5 +; KNL-NEXT: kshiftlw $10, %k0, %k0 +; KNL-NEXT: kshiftrw $10, %k0, %k0 +; KNL-NEXT: korw %k5, %k0, %k0 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dil +; KNL-NEXT: kmovw %edi, %k5 ; KNL-NEXT: kshiftlw $15, %k5, %k5 ; KNL-NEXT: kshiftrw $9, %k5, %k5 -; KNL-NEXT: kxorw %k5, %k0, %k5 -; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dl -; KNL-NEXT: kmovw %edx, %k0 -; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dl -; KNL-NEXT: kmovw %edx, %k6 -; KNL-NEXT: kxorw %k1, %k6, %k6 -; KNL-NEXT: kshiftrw $2, %k6, %k7 -; KNL-NEXT: kxorw %k0, %k7, %k0 +; KNL-NEXT: korw %k0, %k5, %k5 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dil +; KNL-NEXT: kmovw %edi, %k0 ; KNL-NEXT: kshiftlw $15, %k0, %k0 -; KNL-NEXT: kshiftrw $13, %k0, %k0 -; KNL-NEXT: kxorw %k0, %k6, %k0 +; KNL-NEXT: kshiftrw $15, %k0, %k0 +; KNL-NEXT: korw %k1, %k0, %k0 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dil +; KNL-NEXT: kmovw %edi, %k6 +; KNL-NEXT: kshiftlw $15, %k6, %k6 +; KNL-NEXT: kshiftrw $14, %k6, %k6 +; KNL-NEXT: korw %k0, %k6, %k0 ; KNL-NEXT: kshiftrw $3, %k0, %k6 -; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dl -; KNL-NEXT: kmovw %edx, %k7 -; KNL-NEXT: kxorw %k7, %k6, %k6 +; KNL-NEXT: kshiftlw $3, %k6, %k6 +; KNL-NEXT: kshiftlw $14, %k0, %k0 +; KNL-NEXT: kshiftrw $14, %k0, %k0 +; KNL-NEXT: korw %k6, %k0, %k0 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dil +; KNL-NEXT: kmovw %edi, %k6 ; KNL-NEXT: kshiftlw $15, %k6, %k6 -; KNL-NEXT: kshiftrw $12, %k6, %k6 -; KNL-NEXT: kxorw %k6, %k0, %k0 +; KNL-NEXT: kshiftrw $13, %k6, %k6 +; KNL-NEXT: korw %k0, %k6, %k0 ; KNL-NEXT: kshiftrw $4, %k0, %k6 -; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dl -; KNL-NEXT: kmovw %edx, %k7 -; KNL-NEXT: kxorw %k7, %k6, %k6 +; KNL-NEXT: kshiftlw $4, %k6, %k6 +; KNL-NEXT: kshiftlw $13, %k0, %k0 +; KNL-NEXT: kshiftrw $13, %k0, %k0 +; KNL-NEXT: korw %k6, %k0, %k0 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dil +; KNL-NEXT: kmovw %edi, %k6 ; KNL-NEXT: kshiftlw $15, %k6, %k6 -; KNL-NEXT: kshiftrw $11, %k6, %k6 -; KNL-NEXT: kxorw %k6, %k0, %k0 +; KNL-NEXT: kshiftrw $12, %k6, %k6 +; KNL-NEXT: 
korw %k0, %k6, %k0 ; KNL-NEXT: kshiftrw $5, %k0, %k6 -; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dl -; KNL-NEXT: kmovw %edx, %k7 -; KNL-NEXT: kxorw %k7, %k6, %k6 +; KNL-NEXT: kshiftlw $5, %k6, %k6 +; KNL-NEXT: kshiftlw $12, %k0, %k0 +; KNL-NEXT: kshiftrw $12, %k0, %k0 +; KNL-NEXT: korw %k6, %k0, %k0 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dil +; KNL-NEXT: kmovw %edi, %k6 ; KNL-NEXT: kshiftlw $15, %k6, %k6 -; KNL-NEXT: kshiftrw $10, %k6, %k6 -; KNL-NEXT: kxorw %k6, %k0, %k0 +; KNL-NEXT: kshiftrw $11, %k6, %k6 +; KNL-NEXT: korw %k0, %k6, %k0 ; KNL-NEXT: kshiftrw $6, %k0, %k6 -; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dl -; KNL-NEXT: kmovw %edx, %k7 -; KNL-NEXT: kxorw %k7, %k6, %k6 +; KNL-NEXT: kshiftlw $6, %k6, %k6 +; KNL-NEXT: kshiftlw $11, %k0, %k0 +; KNL-NEXT: kshiftrw $11, %k0, %k0 +; KNL-NEXT: korw %k6, %k0, %k0 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dil +; KNL-NEXT: kmovw %edi, %k6 +; KNL-NEXT: kshiftlw $15, %k6, %k6 +; KNL-NEXT: kshiftrw $10, %k6, %k6 +; KNL-NEXT: korw %k0, %k6, %k0 +; KNL-NEXT: kshiftrw $7, %k0, %k6 +; KNL-NEXT: kshiftlw $7, %k6, %k6 +; KNL-NEXT: kshiftlw $10, %k0, %k0 +; KNL-NEXT: kshiftrw $10, %k0, %k0 +; KNL-NEXT: korw %k6, %k0, %k0 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dil +; KNL-NEXT: kmovw %edi, %k6 ; KNL-NEXT: kshiftlw $15, %k6, %k6 ; KNL-NEXT: kshiftrw $9, %k6, %k6 -; KNL-NEXT: kxorw %k6, %k0, %k6 -; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dl +; KNL-NEXT: korw %k0, %k6, %k6 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dil ; KNL-NEXT: kmovw %edi, %k0 -; KNL-NEXT: kxorw %k1, %k0, %k0 -; KNL-NEXT: kmovw %edx, %k7 -; KNL-NEXT: kshiftrw $2, %k0, %k2 -; KNL-NEXT: kxorw %k7, %k2, %k2 +; KNL-NEXT: kshiftlw $15, %k0, %k0 +; KNL-NEXT: kshiftrw $15, %k0, %k0 +; KNL-NEXT: korw %k1, %k0, %k0 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dil +; KNL-NEXT: kmovw %edi, %k7 +; KNL-NEXT: kshiftlw $15, %k7, %k7 +; KNL-NEXT: kshiftrw $14, %k7, %k7 +; KNL-NEXT: korw %k0, %k7, %k0 +; KNL-NEXT: kshiftrw $3, %k0, %k7 +; KNL-NEXT: kshiftlw $3, %k7, %k7 +; KNL-NEXT: kshiftlw $14, %k0, %k0 +; KNL-NEXT: 
kshiftrw $14, %k0, %k0 +; KNL-NEXT: korw %k7, %k0, %k0 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dil +; KNL-NEXT: kmovw %edi, %k7 +; KNL-NEXT: kshiftlw $15, %k7, %k7 +; KNL-NEXT: kshiftrw $13, %k7, %k7 +; KNL-NEXT: korw %k0, %k7, %k0 +; KNL-NEXT: kshiftrw $4, %k0, %k7 +; KNL-NEXT: kshiftlw $4, %k7, %k7 +; KNL-NEXT: kshiftlw $13, %k0, %k0 +; KNL-NEXT: kshiftrw $13, %k0, %k0 +; KNL-NEXT: korw %k7, %k0, %k0 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dil +; KNL-NEXT: kmovw %edi, %k7 +; KNL-NEXT: kshiftlw $15, %k7, %k7 +; KNL-NEXT: kshiftrw $12, %k7, %k7 +; KNL-NEXT: korw %k0, %k7, %k0 +; KNL-NEXT: kshiftrw $5, %k0, %k7 +; KNL-NEXT: kshiftlw $5, %k7, %k7 +; KNL-NEXT: kshiftlw $12, %k0, %k0 +; KNL-NEXT: kshiftrw $12, %k0, %k0 +; KNL-NEXT: korw %k7, %k0, %k0 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dil +; KNL-NEXT: kmovw %edi, %k7 +; KNL-NEXT: kshiftlw $15, %k7, %k7 +; KNL-NEXT: kshiftrw $11, %k7, %k7 +; KNL-NEXT: korw %k0, %k7, %k0 +; KNL-NEXT: kshiftrw $6, %k0, %k7 +; KNL-NEXT: kshiftlw $6, %k7, %k7 +; KNL-NEXT: kshiftlw $11, %k0, %k0 +; KNL-NEXT: kshiftrw $11, %k0, %k0 +; KNL-NEXT: korw %k7, %k0, %k0 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dil +; KNL-NEXT: kmovw %edi, %k7 +; KNL-NEXT: kshiftlw $15, %k7, %k7 +; KNL-NEXT: kshiftrw $10, %k7, %k7 +; KNL-NEXT: korw %k0, %k7, %k0 +; KNL-NEXT: kshiftrw $7, %k0, %k7 +; KNL-NEXT: kshiftlw $7, %k7, %k7 +; KNL-NEXT: kshiftlw $10, %k0, %k0 +; KNL-NEXT: kshiftrw $10, %k0, %k0 +; KNL-NEXT: korw %k7, %k0, %k0 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dil +; KNL-NEXT: kmovw %edi, %k7 +; KNL-NEXT: kshiftlw $15, %k7, %k7 +; KNL-NEXT: kshiftrw $9, %k7, %k7 +; KNL-NEXT: korw %k0, %k7, %k7 +; KNL-NEXT: kmovw %esi, %k0 +; KNL-NEXT: kshiftlw $15, %k0, %k0 +; KNL-NEXT: kshiftrw $15, %k0, %k0 +; KNL-NEXT: korw %k1, %k0, %k0 +; KNL-NEXT: kmovw %edx, %k2 ; KNL-NEXT: kshiftlw $15, %k2, %k2 -; KNL-NEXT: kshiftrw $13, %k2, %k2 -; KNL-NEXT: kxorw %k2, %k0, %k0 -; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dl +; KNL-NEXT: kshiftrw $14, %k2, %k2 +; KNL-NEXT: korw %k0, %k2, %k0 ; 
KNL-NEXT: kshiftrw $3, %k0, %k2 -; KNL-NEXT: kmovw %edx, %k7 -; KNL-NEXT: kxorw %k7, %k2, %k2 -; KNL-NEXT: kshiftlw $15, %k2, %k2 -; KNL-NEXT: kshiftrw $12, %k2, %k2 -; KNL-NEXT: kxorw %k2, %k0, %k0 -; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dl -; KNL-NEXT: kshiftrw $4, %k0, %k2 -; KNL-NEXT: kmovw %edx, %k7 -; KNL-NEXT: kxorw %k7, %k2, %k2 -; KNL-NEXT: kshiftlw $15, %k2, %k2 -; KNL-NEXT: kshiftrw $11, %k2, %k2 -; KNL-NEXT: kxorw %k2, %k0, %k0 -; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dl -; KNL-NEXT: kshiftrw $5, %k0, %k2 -; KNL-NEXT: kmovw %edx, %k7 -; KNL-NEXT: kxorw %k7, %k2, %k2 -; KNL-NEXT: kshiftlw $15, %k2, %k2 -; KNL-NEXT: kshiftrw $10, %k2, %k2 -; KNL-NEXT: kxorw %k2, %k0, %k0 -; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dl -; KNL-NEXT: kshiftrw $6, %k0, %k2 -; KNL-NEXT: kmovw %edx, %k7 -; KNL-NEXT: kxorw %k7, %k2, %k2 -; KNL-NEXT: kshiftlw $15, %k2, %k2 -; KNL-NEXT: kshiftrw $9, %k2, %k2 -; KNL-NEXT: kxorw %k2, %k0, %k7 -; KNL-NEXT: kmovw %esi, %k0 -; KNL-NEXT: kxorw %k1, %k0, %k0 +; KNL-NEXT: kshiftlw $3, %k2, %k2 +; KNL-NEXT: kshiftlw $14, %k0, %k0 +; KNL-NEXT: kshiftrw $14, %k0, %k0 +; KNL-NEXT: korw %k2, %k0, %k0 ; KNL-NEXT: kmovw %ecx, %k2 -; KNL-NEXT: kshiftrw $2, %k0, %k3 -; KNL-NEXT: kxorw %k2, %k3, %k2 ; KNL-NEXT: kshiftlw $15, %k2, %k2 ; KNL-NEXT: kshiftrw $13, %k2, %k2 -; KNL-NEXT: kxorw %k2, %k0, %k0 -; KNL-NEXT: kshiftrw $3, %k0, %k2 -; KNL-NEXT: kmovw %r8d, %k3 -; KNL-NEXT: kxorw %k3, %k2, %k2 +; KNL-NEXT: korw %k0, %k2, %k0 +; KNL-NEXT: kshiftrw $4, %k0, %k2 +; KNL-NEXT: kshiftlw $4, %k2, %k2 +; KNL-NEXT: kshiftlw $13, %k0, %k0 +; KNL-NEXT: kshiftrw $13, %k0, %k0 +; KNL-NEXT: korw %k2, %k0, %k0 +; KNL-NEXT: kmovw %r8d, %k2 ; KNL-NEXT: kshiftlw $15, %k2, %k2 ; KNL-NEXT: kshiftrw $12, %k2, %k2 -; KNL-NEXT: kxorw %k2, %k0, %k0 -; KNL-NEXT: kshiftrw $4, %k0, %k2 -; KNL-NEXT: kmovw %r9d, %k3 -; KNL-NEXT: kxorw %k3, %k2, %k2 +; KNL-NEXT: korw %k0, %k2, %k0 +; KNL-NEXT: kshiftrw $5, %k0, %k2 +; KNL-NEXT: kshiftlw $5, %k2, %k2 +; KNL-NEXT: kshiftlw $12, %k0, %k0 +; 
KNL-NEXT: kshiftrw $12, %k0, %k0 +; KNL-NEXT: korw %k2, %k0, %k0 +; KNL-NEXT: kmovw %r9d, %k2 ; KNL-NEXT: kshiftlw $15, %k2, %k2 ; KNL-NEXT: kshiftrw $11, %k2, %k2 -; KNL-NEXT: kxorw %k2, %k0, %k0 +; KNL-NEXT: korw %k0, %k2, %k0 +; KNL-NEXT: kshiftrw $6, %k0, %k2 +; KNL-NEXT: kshiftlw $6, %k2, %k2 +; KNL-NEXT: kshiftlw $11, %k0, %k0 +; KNL-NEXT: kshiftrw $11, %k0, %k0 +; KNL-NEXT: korw %k2, %k0, %k0 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %cl -; KNL-NEXT: kshiftrw $5, %k0, %k2 -; KNL-NEXT: kmovw %ecx, %k3 -; KNL-NEXT: kxorw %k3, %k2, %k2 +; KNL-NEXT: kmovw %ecx, %k2 ; KNL-NEXT: kshiftlw $15, %k2, %k2 ; KNL-NEXT: kshiftrw $10, %k2, %k2 -; KNL-NEXT: kxorw %k2, %k0, %k0 +; KNL-NEXT: korw %k0, %k2, %k0 +; KNL-NEXT: kshiftrw $7, %k0, %k2 +; KNL-NEXT: kshiftlw $7, %k2, %k2 +; KNL-NEXT: kshiftlw $10, %k0, %k0 +; KNL-NEXT: kshiftrw $10, %k0, %k0 +; KNL-NEXT: korw %k2, %k0, %k0 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %cl -; KNL-NEXT: kshiftrw $6, %k0, %k2 -; KNL-NEXT: kmovw %ecx, %k3 -; KNL-NEXT: kxorw %k3, %k2, %k2 +; KNL-NEXT: kmovw %ecx, %k2 ; KNL-NEXT: kshiftlw $15, %k2, %k2 ; KNL-NEXT: kshiftrw $9, %k2, %k2 -; KNL-NEXT: kxorw %k2, %k0, %k0 +; KNL-NEXT: korw %k0, %k2, %k0 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %cl +; KNL-NEXT: kmovw %ecx, %k2 +; KNL-NEXT: kshiftlw $15, %k2, %k2 +; KNL-NEXT: kshiftrw $15, %k2, %k2 +; KNL-NEXT: korw %k1, %k2, %k1 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %cl ; KNL-NEXT: kmovw %ecx, %k2 -; KNL-NEXT: kxorw %k1, %k2, %k1 +; KNL-NEXT: kshiftlw $15, %k2, %k2 +; KNL-NEXT: kshiftrw $14, %k2, %k2 +; KNL-NEXT: korw %k1, %k2, %k1 +; KNL-NEXT: kshiftrw $3, %k1, %k2 +; KNL-NEXT: kshiftlw $3, %k2, %k2 +; KNL-NEXT: kshiftlw $14, %k1, %k1 +; KNL-NEXT: kshiftrw $14, %k1, %k1 +; KNL-NEXT: korw %k2, %k1, %k1 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %cl ; KNL-NEXT: kmovw %ecx, %k2 -; KNL-NEXT: kshiftrw $2, %k1, %k3 -; KNL-NEXT: kxorw %k2, %k3, %k2 ; KNL-NEXT: kshiftlw $15, %k2, %k2 ; KNL-NEXT: kshiftrw $13, %k2, %k2 -; KNL-NEXT: kxorw %k2, %k1, %k1 +; KNL-NEXT: korw %k1, %k2, %k1 +; 
KNL-NEXT: kshiftrw $4, %k1, %k2 +; KNL-NEXT: kshiftlw $4, %k2, %k2 +; KNL-NEXT: kshiftlw $13, %k1, %k1 +; KNL-NEXT: kshiftrw $13, %k1, %k1 +; KNL-NEXT: korw %k2, %k1, %k1 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %cl -; KNL-NEXT: kshiftrw $3, %k1, %k2 -; KNL-NEXT: kmovw %ecx, %k3 -; KNL-NEXT: kxorw %k3, %k2, %k2 +; KNL-NEXT: kmovw %ecx, %k2 ; KNL-NEXT: kshiftlw $15, %k2, %k2 ; KNL-NEXT: kshiftrw $12, %k2, %k2 -; KNL-NEXT: kxorw %k2, %k1, %k1 +; KNL-NEXT: korw %k1, %k2, %k1 +; KNL-NEXT: kshiftrw $5, %k1, %k2 +; KNL-NEXT: kshiftlw $5, %k2, %k2 +; KNL-NEXT: kshiftlw $12, %k1, %k1 +; KNL-NEXT: kshiftrw $12, %k1, %k1 +; KNL-NEXT: korw %k2, %k1, %k1 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %cl -; KNL-NEXT: kshiftrw $4, %k1, %k2 -; KNL-NEXT: kmovw %ecx, %k3 -; KNL-NEXT: kxorw %k3, %k2, %k2 +; KNL-NEXT: kmovw %ecx, %k2 ; KNL-NEXT: kshiftlw $15, %k2, %k2 ; KNL-NEXT: kshiftrw $11, %k2, %k2 -; KNL-NEXT: kxorw %k2, %k1, %k1 +; KNL-NEXT: korw %k1, %k2, %k1 +; KNL-NEXT: kshiftrw $6, %k1, %k2 +; KNL-NEXT: kshiftlw $6, %k2, %k2 +; KNL-NEXT: kshiftlw $11, %k1, %k1 +; KNL-NEXT: kshiftrw $11, %k1, %k1 +; KNL-NEXT: korw %k2, %k1, %k1 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %cl -; KNL-NEXT: kshiftrw $5, %k1, %k2 -; KNL-NEXT: kmovw %ecx, %k3 -; KNL-NEXT: kxorw %k3, %k2, %k2 +; KNL-NEXT: kmovw %ecx, %k2 ; KNL-NEXT: kshiftlw $15, %k2, %k2 ; KNL-NEXT: kshiftrw $10, %k2, %k2 -; KNL-NEXT: kxorw %k2, %k1, %k1 +; KNL-NEXT: korw %k1, %k2, %k1 +; KNL-NEXT: kshiftrw $7, %k1, %k2 +; KNL-NEXT: kshiftlw $7, %k2, %k2 +; KNL-NEXT: kshiftlw $10, %k1, %k1 +; KNL-NEXT: kshiftrw $10, %k1, %k1 +; KNL-NEXT: korw %k2, %k1, %k1 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %cl -; KNL-NEXT: kshiftrw $6, %k1, %k2 -; KNL-NEXT: kmovw %ecx, %k3 -; KNL-NEXT: kxorw %k3, %k2, %k2 +; KNL-NEXT: kmovw %ecx, %k2 ; KNL-NEXT: kshiftlw $15, %k2, %k2 ; KNL-NEXT: kshiftrw $9, %k2, %k2 -; KNL-NEXT: kxorw %k2, %k1, %k1 +; KNL-NEXT: korw %k1, %k2, %k1 ; KNL-NEXT: kandw %k1, %k0, %k0 ; KNL-NEXT: kandw %k7, %k0, %k0 ; KNL-NEXT: kandw %k6, %k0, %k0 ; KNL-NEXT: 
kandw %k5, %k0, %k0 ; KNL-NEXT: kandw %k4, %k0, %k0 -; KNL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 ## 2-byte Reload -; KNL-NEXT: kandw %k1, %k0, %k0 +; KNL-NEXT: kandw %k3, %k0, %k0 ; KNL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 ## 2-byte Reload ; KNL-NEXT: kandw %k1, %k0, %k0 ; KNL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 ## 2-byte Reload @@ -1808,300 +2196,488 @@ define <7 x i1> @test17(<7 x i1> %a, <7 x i1> %b, <7 x i1> %c, <7 x i1> %d, <7 x ; SKX-LABEL: test17: ; SKX: ## %bb.0: ; SKX-NEXT: movq %rdi, %rax -; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k1 -; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k2 +; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k0 ; SKX-NEXT: kshiftlb $7, %k0, %k0 -; SKX-NEXT: kshiftrb $6, %k0, %k0 -; SKX-NEXT: kxorb %k0, %k2, %k2 -; SKX-NEXT: kshiftrb $2, %k2, %k3 -; SKX-NEXT: kxorb %k1, %k3, %k1 -; SKX-NEXT: kshiftlb $7, %k1, %k1 -; SKX-NEXT: kshiftrb $5, %k1, %k1 -; SKX-NEXT: kxorb %k1, %k2, %k1 +; SKX-NEXT: kshiftrb $7, %k0, %k1 +; SKX-NEXT: kshiftlb $2, %k0, %k0 +; SKX-NEXT: korb %k0, %k1, %k1 +; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k2 +; SKX-NEXT: kshiftlb $7, %k2, %k2 +; SKX-NEXT: kshiftrb $6, %k2, %k2 +; SKX-NEXT: korb %k1, %k2, %k1 ; SKX-NEXT: kshiftrb $3, %k1, %k2 -; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k3 -; SKX-NEXT: kxorb %k3, %k2, %k2 +; SKX-NEXT: kshiftlb $3, %k2, %k2 +; SKX-NEXT: kshiftlb $6, %k1, %k1 +; SKX-NEXT: kshiftrb $6, %k1, %k1 +; SKX-NEXT: korb %k2, %k1, %k1 +; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k2 ; SKX-NEXT: kshiftlb $7, %k2, %k2 -; SKX-NEXT: kshiftrb $4, %k2, %k2 -; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k3 -; SKX-NEXT: kxorb %k2, %k1, %k1 +; SKX-NEXT: kshiftrb $5, %k2, %k2 +; SKX-NEXT: korb %k1, %k2, %k1 ; SKX-NEXT: kshiftrb $4, %k1, %k2 -; SKX-NEXT: kxorb %k3, %k2, %k2 +; SKX-NEXT: kshiftlb $4, %k2, %k2 +; SKX-NEXT: kshiftlb $5, %k1, %k1 +; SKX-NEXT: kshiftrb $5, %k1, %k1 +; SKX-NEXT: korb %k2, %k1, %k1 +; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k2 ; SKX-NEXT: kshiftlb $7, %k2, %k2 -; SKX-NEXT: kshiftrb $3, %k2, %k2 -; SKX-NEXT: kxorb %k2, %k1, 
%k1 +; SKX-NEXT: kshiftrb $4, %k2, %k2 +; SKX-NEXT: korb %k1, %k2, %k1 ; SKX-NEXT: kshiftrb $5, %k1, %k2 +; SKX-NEXT: kshiftlb $5, %k2, %k2 +; SKX-NEXT: kshiftlb $4, %k1, %k1 ; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k3 -; SKX-NEXT: kxorb %k3, %k2, %k2 -; SKX-NEXT: kshiftlb $7, %k2, %k2 -; SKX-NEXT: kshiftrb $2, %k2, %k2 -; SKX-NEXT: kxorb %k2, %k1, %k1 +; SKX-NEXT: kshiftrb $4, %k1, %k1 +; SKX-NEXT: korb %k2, %k1, %k1 +; SKX-NEXT: kshiftlb $7, %k3, %k2 +; SKX-NEXT: kshiftrb $3, %k2, %k2 +; SKX-NEXT: korb %k1, %k2, %k1 ; SKX-NEXT: kshiftrb $6, %k1, %k2 +; SKX-NEXT: kshiftlb $6, %k2, %k2 +; SKX-NEXT: kshiftlb $3, %k1, %k1 +; SKX-NEXT: kshiftrb $3, %k1, %k1 ; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k3 -; SKX-NEXT: kxorb %k3, %k2, %k2 +; SKX-NEXT: korb %k2, %k1, %k1 +; SKX-NEXT: kshiftlb $7, %k3, %k2 +; SKX-NEXT: kshiftrb $2, %k2, %k2 +; SKX-NEXT: korb %k1, %k2, %k1 +; SKX-NEXT: kshiftrb $7, %k1, %k2 +; SKX-NEXT: kshiftlb $7, %k2, %k2 +; SKX-NEXT: kshiftlb $2, %k1, %k1 +; SKX-NEXT: kshiftrb $2, %k1, %k1 +; SKX-NEXT: korb %k2, %k1, %k1 +; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k2 ; SKX-NEXT: kshiftlb $7, %k2, %k2 ; SKX-NEXT: kshiftrb $1, %k2, %k2 -; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k3 -; SKX-NEXT: kxorb %k2, %k1, %k1 +; SKX-NEXT: korb %k1, %k2, %k1 ; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k2 -; SKX-NEXT: kxorb %k0, %k2, %k2 -; SKX-NEXT: kshiftrb $2, %k2, %k4 -; SKX-NEXT: kxorb %k3, %k4, %k3 +; SKX-NEXT: kshiftlb $7, %k2, %k2 +; SKX-NEXT: kshiftrb $7, %k2, %k2 +; SKX-NEXT: korb %k0, %k2, %k2 +; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k3 ; SKX-NEXT: kshiftlb $7, %k3, %k3 -; SKX-NEXT: kshiftrb $5, %k3, %k3 -; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k4 -; SKX-NEXT: kxorb %k3, %k2, %k2 +; SKX-NEXT: kshiftrb $6, %k3, %k3 +; SKX-NEXT: korb %k2, %k3, %k2 ; SKX-NEXT: kshiftrb $3, %k2, %k3 -; SKX-NEXT: kxorb %k4, %k3, %k3 +; SKX-NEXT: kshiftlb $3, %k3, %k3 +; SKX-NEXT: kshiftlb $6, %k2, %k2 +; SKX-NEXT: kshiftrb $6, %k2, %k2 +; SKX-NEXT: korb %k3, %k2, %k2 +; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k3 ; 
SKX-NEXT: kshiftlb $7, %k3, %k3 -; SKX-NEXT: kshiftrb $4, %k3, %k3 -; SKX-NEXT: kxorb %k3, %k2, %k2 +; SKX-NEXT: kshiftrb $5, %k3, %k3 +; SKX-NEXT: korb %k2, %k3, %k2 ; SKX-NEXT: kshiftrb $4, %k2, %k3 -; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k4 -; SKX-NEXT: kxorb %k4, %k3, %k3 +; SKX-NEXT: kshiftlb $4, %k3, %k3 +; SKX-NEXT: kshiftlb $5, %k2, %k2 +; SKX-NEXT: kshiftrb $5, %k2, %k2 +; SKX-NEXT: korb %k3, %k2, %k2 +; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k3 ; SKX-NEXT: kshiftlb $7, %k3, %k3 -; SKX-NEXT: kshiftrb $3, %k3, %k3 -; SKX-NEXT: kxorb %k3, %k2, %k2 +; SKX-NEXT: kshiftrb $4, %k3, %k3 +; SKX-NEXT: korb %k2, %k3, %k2 ; SKX-NEXT: kshiftrb $5, %k2, %k3 -; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k4 -; SKX-NEXT: kxorb %k4, %k3, %k3 +; SKX-NEXT: kshiftlb $5, %k3, %k3 +; SKX-NEXT: kshiftlb $4, %k2, %k2 +; SKX-NEXT: kshiftrb $4, %k2, %k2 +; SKX-NEXT: korb %k3, %k2, %k2 +; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k3 ; SKX-NEXT: kshiftlb $7, %k3, %k3 -; SKX-NEXT: kshiftrb $2, %k3, %k3 -; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k4 -; SKX-NEXT: kxorb %k3, %k2, %k2 +; SKX-NEXT: kshiftrb $3, %k3, %k3 +; SKX-NEXT: korb %k2, %k3, %k2 ; SKX-NEXT: kshiftrb $6, %k2, %k3 -; SKX-NEXT: kxorb %k4, %k3, %k3 +; SKX-NEXT: kshiftlb $6, %k3, %k3 +; SKX-NEXT: kshiftlb $3, %k2, %k2 +; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k4 +; SKX-NEXT: kshiftrb $3, %k2, %k2 +; SKX-NEXT: korb %k3, %k2, %k2 +; SKX-NEXT: kshiftlb $7, %k4, %k3 +; SKX-NEXT: kshiftrb $2, %k3, %k3 +; SKX-NEXT: korb %k2, %k3, %k2 +; SKX-NEXT: kshiftrb $7, %k2, %k3 ; SKX-NEXT: kshiftlb $7, %k3, %k3 +; SKX-NEXT: kshiftlb $2, %k2, %k2 +; SKX-NEXT: kshiftrb $2, %k2, %k2 +; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k4 +; SKX-NEXT: korb %k3, %k2, %k2 +; SKX-NEXT: kshiftlb $7, %k4, %k3 ; SKX-NEXT: kshiftrb $1, %k3, %k3 -; SKX-NEXT: kxorb %k3, %k2, %k2 -; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k3 ; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k4 +; SKX-NEXT: korb %k2, %k3, %k2 ; SKX-NEXT: kandb %k1, %k2, %k1 -; SKX-NEXT: kxorb %k0, %k4, %k2 -; SKX-NEXT: kshiftrb $2, %k2, %k4 -; 
SKX-NEXT: kxorb %k3, %k4, %k3 +; SKX-NEXT: kshiftlb $7, %k4, %k2 +; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k3 +; SKX-NEXT: kshiftrb $7, %k2, %k2 +; SKX-NEXT: korb %k0, %k2, %k2 ; SKX-NEXT: kshiftlb $7, %k3, %k3 -; SKX-NEXT: kshiftrb $5, %k3, %k3 -; SKX-NEXT: kxorb %k3, %k2, %k2 +; SKX-NEXT: kshiftrb $6, %k3, %k3 +; SKX-NEXT: korb %k2, %k3, %k2 ; SKX-NEXT: kshiftrb $3, %k2, %k3 +; SKX-NEXT: kshiftlb $3, %k3, %k3 +; SKX-NEXT: kshiftlb $6, %k2, %k2 +; SKX-NEXT: kshiftrb $6, %k2, %k2 ; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k4 -; SKX-NEXT: kxorb %k4, %k3, %k3 +; SKX-NEXT: korb %k3, %k2, %k2 +; SKX-NEXT: kshiftlb $7, %k4, %k3 +; SKX-NEXT: kshiftrb $5, %k3, %k3 +; SKX-NEXT: korb %k2, %k3, %k2 +; SKX-NEXT: kshiftrb $4, %k2, %k3 +; SKX-NEXT: kshiftlb $4, %k3, %k3 +; SKX-NEXT: kshiftlb $5, %k2, %k2 +; SKX-NEXT: kshiftrb $5, %k2, %k2 +; SKX-NEXT: korb %k3, %k2, %k2 +; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k3 ; SKX-NEXT: kshiftlb $7, %k3, %k3 ; SKX-NEXT: kshiftrb $4, %k3, %k3 -; SKX-NEXT: kxorb %k3, %k2, %k2 +; SKX-NEXT: korb %k2, %k3, %k2 +; SKX-NEXT: kshiftrb $5, %k2, %k3 +; SKX-NEXT: kshiftlb $5, %k3, %k3 +; SKX-NEXT: kshiftlb $4, %k2, %k2 +; SKX-NEXT: kshiftrb $4, %k2, %k2 +; SKX-NEXT: korb %k3, %k2, %k2 ; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k3 -; SKX-NEXT: kshiftrb $4, %k2, %k4 -; SKX-NEXT: kxorb %k3, %k4, %k3 ; SKX-NEXT: kshiftlb $7, %k3, %k3 ; SKX-NEXT: kshiftrb $3, %k3, %k3 -; SKX-NEXT: kxorb %k3, %k2, %k2 -; SKX-NEXT: kshiftrb $5, %k2, %k3 -; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k4 -; SKX-NEXT: kxorb %k4, %k3, %k3 +; SKX-NEXT: korb %k2, %k3, %k2 +; SKX-NEXT: kshiftrb $6, %k2, %k3 +; SKX-NEXT: kshiftlb $6, %k3, %k3 +; SKX-NEXT: kshiftlb $3, %k2, %k2 +; SKX-NEXT: kshiftrb $3, %k2, %k2 +; SKX-NEXT: korb %k3, %k2, %k2 +; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k3 ; SKX-NEXT: kshiftlb $7, %k3, %k3 ; SKX-NEXT: kshiftrb $2, %k3, %k3 -; SKX-NEXT: kxorb %k3, %k2, %k2 -; SKX-NEXT: kshiftrb $6, %k2, %k3 -; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k4 -; SKX-NEXT: kxorb %k4, %k3, %k3 +; SKX-NEXT: korb 
%k2, %k3, %k2 +; SKX-NEXT: kshiftrb $7, %k2, %k3 +; SKX-NEXT: kshiftlb $7, %k3, %k3 +; SKX-NEXT: kshiftlb $2, %k2, %k2 +; SKX-NEXT: kshiftrb $2, %k2, %k2 +; SKX-NEXT: korb %k3, %k2, %k2 +; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k3 ; SKX-NEXT: kshiftlb $7, %k3, %k3 ; SKX-NEXT: kshiftrb $1, %k3, %k3 -; SKX-NEXT: kxorb %k3, %k2, %k2 +; SKX-NEXT: korb %k2, %k3, %k2 ; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k3 -; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k4 -; SKX-NEXT: kxorb %k0, %k4, %k4 -; SKX-NEXT: kshiftrb $2, %k4, %k5 -; SKX-NEXT: kxorb %k3, %k5, %k3 ; SKX-NEXT: kshiftlb $7, %k3, %k3 -; SKX-NEXT: kshiftrb $5, %k3, %k3 -; SKX-NEXT: kxorb %k3, %k4, %k3 +; SKX-NEXT: kshiftrb $7, %k3, %k3 +; SKX-NEXT: korb %k0, %k3, %k3 ; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k4 -; SKX-NEXT: kshiftrb $3, %k3, %k5 -; SKX-NEXT: kxorb %k4, %k5, %k4 ; SKX-NEXT: kshiftlb $7, %k4, %k4 -; SKX-NEXT: kshiftrb $4, %k4, %k4 -; SKX-NEXT: kxorb %k4, %k3, %k3 +; SKX-NEXT: kshiftrb $6, %k4, %k4 +; SKX-NEXT: korb %k3, %k4, %k3 +; SKX-NEXT: kshiftrb $3, %k3, %k4 +; SKX-NEXT: kshiftlb $3, %k4, %k4 +; SKX-NEXT: kshiftlb $6, %k3, %k3 +; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k5 +; SKX-NEXT: kshiftrb $6, %k3, %k3 +; SKX-NEXT: korb %k4, %k3, %k3 +; SKX-NEXT: kshiftlb $7, %k5, %k4 +; SKX-NEXT: kshiftrb $5, %k4, %k4 +; SKX-NEXT: korb %k3, %k4, %k3 ; SKX-NEXT: kshiftrb $4, %k3, %k4 +; SKX-NEXT: kshiftlb $4, %k4, %k4 +; SKX-NEXT: kshiftlb $5, %k3, %k3 +; SKX-NEXT: kshiftrb $5, %k3, %k3 ; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k5 -; SKX-NEXT: kxorb %k5, %k4, %k4 +; SKX-NEXT: korb %k4, %k3, %k3 +; SKX-NEXT: kshiftlb $7, %k5, %k4 +; SKX-NEXT: kshiftrb $4, %k4, %k4 +; SKX-NEXT: korb %k3, %k4, %k3 +; SKX-NEXT: kshiftrb $5, %k3, %k4 +; SKX-NEXT: kshiftlb $5, %k4, %k4 +; SKX-NEXT: kshiftlb $4, %k3, %k3 +; SKX-NEXT: kshiftrb $4, %k3, %k3 +; SKX-NEXT: korb %k4, %k3, %k3 +; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k4 ; SKX-NEXT: kshiftlb $7, %k4, %k4 ; SKX-NEXT: kshiftrb $3, %k4, %k4 -; SKX-NEXT: kxorb %k4, %k3, %k3 -; SKX-NEXT: kshiftrb $5, %k3, %k4 
-; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k5 -; SKX-NEXT: kxorb %k5, %k4, %k4 +; SKX-NEXT: korb %k3, %k4, %k3 +; SKX-NEXT: kshiftrb $6, %k3, %k4 +; SKX-NEXT: kshiftlb $6, %k4, %k4 +; SKX-NEXT: kshiftlb $3, %k3, %k3 +; SKX-NEXT: kshiftrb $3, %k3, %k3 +; SKX-NEXT: korb %k4, %k3, %k3 +; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k4 ; SKX-NEXT: kshiftlb $7, %k4, %k4 ; SKX-NEXT: kshiftrb $2, %k4, %k4 -; SKX-NEXT: kxorb %k4, %k3, %k3 +; SKX-NEXT: korb %k3, %k4, %k3 +; SKX-NEXT: kshiftrb $7, %k3, %k4 +; SKX-NEXT: kshiftlb $7, %k4, %k4 +; SKX-NEXT: kshiftlb $2, %k3, %k3 +; SKX-NEXT: kshiftrb $2, %k3, %k3 +; SKX-NEXT: korb %k4, %k3, %k3 ; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k4 -; SKX-NEXT: kshiftrb $6, %k3, %k5 -; SKX-NEXT: kxorb %k4, %k5, %k4 ; SKX-NEXT: kshiftlb $7, %k4, %k4 -; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k5 ; SKX-NEXT: kshiftrb $1, %k4, %k4 -; SKX-NEXT: kxorb %k4, %k3, %k3 +; SKX-NEXT: korb %k3, %k4, %k3 ; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k4 -; SKX-NEXT: kxorb %k0, %k4, %k4 -; SKX-NEXT: kshiftrb $2, %k4, %k6 -; SKX-NEXT: kxorb %k5, %k6, %k5 +; SKX-NEXT: kshiftlb $7, %k4, %k4 +; SKX-NEXT: kshiftrb $7, %k4, %k4 +; SKX-NEXT: korb %k0, %k4, %k4 +; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k5 ; SKX-NEXT: kshiftlb $7, %k5, %k5 -; SKX-NEXT: kshiftrb $5, %k5, %k5 -; SKX-NEXT: kxorb %k5, %k4, %k4 +; SKX-NEXT: kshiftrb $6, %k5, %k5 +; SKX-NEXT: korb %k4, %k5, %k4 ; SKX-NEXT: kshiftrb $3, %k4, %k5 -; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k6 -; SKX-NEXT: kxorb %k6, %k5, %k5 +; SKX-NEXT: kshiftlb $3, %k5, %k5 +; SKX-NEXT: kshiftlb $6, %k4, %k4 +; SKX-NEXT: kshiftrb $6, %k4, %k4 +; SKX-NEXT: korb %k5, %k4, %k4 +; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k5 ; SKX-NEXT: kshiftlb $7, %k5, %k5 -; SKX-NEXT: kshiftrb $4, %k5, %k5 -; SKX-NEXT: kxorb %k5, %k4, %k4 +; SKX-NEXT: kshiftrb $5, %k5, %k5 +; SKX-NEXT: korb %k4, %k5, %k4 ; SKX-NEXT: kshiftrb $4, %k4, %k5 +; SKX-NEXT: kshiftlb $4, %k5, %k5 +; SKX-NEXT: kshiftlb $5, %k4, %k4 ; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k6 -; SKX-NEXT: kxorb %k6, %k5, %k5 -; 
SKX-NEXT: kshiftlb $7, %k5, %k5 +; SKX-NEXT: kshiftrb $5, %k4, %k4 +; SKX-NEXT: korb %k5, %k4, %k4 +; SKX-NEXT: kshiftlb $7, %k6, %k5 +; SKX-NEXT: kshiftrb $4, %k5, %k5 +; SKX-NEXT: korb %k4, %k5, %k4 +; SKX-NEXT: kshiftrb $5, %k4, %k5 +; SKX-NEXT: kshiftlb $5, %k5, %k5 +; SKX-NEXT: kshiftlb $4, %k4, %k4 +; SKX-NEXT: kshiftrb $4, %k4, %k4 +; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k6 +; SKX-NEXT: korb %k5, %k4, %k4 +; SKX-NEXT: kshiftlb $7, %k6, %k5 ; SKX-NEXT: kshiftrb $3, %k5, %k5 -; SKX-NEXT: kxorb %k5, %k4, %k4 +; SKX-NEXT: korb %k4, %k5, %k4 +; SKX-NEXT: kshiftrb $6, %k4, %k5 +; SKX-NEXT: kshiftlb $6, %k5, %k5 +; SKX-NEXT: kshiftlb $3, %k4, %k4 +; SKX-NEXT: kshiftrb $3, %k4, %k4 +; SKX-NEXT: korb %k5, %k4, %k4 ; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k5 -; SKX-NEXT: kshiftrb $5, %k4, %k6 -; SKX-NEXT: kxorb %k5, %k6, %k5 ; SKX-NEXT: kshiftlb $7, %k5, %k5 ; SKX-NEXT: kshiftrb $2, %k5, %k5 -; SKX-NEXT: kxorb %k5, %k4, %k4 -; SKX-NEXT: kshiftrb $6, %k4, %k5 -; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k6 -; SKX-NEXT: kxorb %k6, %k5, %k5 +; SKX-NEXT: korb %k4, %k5, %k4 +; SKX-NEXT: kshiftrb $7, %k4, %k5 +; SKX-NEXT: kshiftlb $7, %k5, %k5 +; SKX-NEXT: kshiftlb $2, %k4, %k4 +; SKX-NEXT: kshiftrb $2, %k4, %k4 +; SKX-NEXT: korb %k5, %k4, %k4 +; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k5 ; SKX-NEXT: kshiftlb $7, %k5, %k5 ; SKX-NEXT: kshiftrb $1, %k5, %k5 -; SKX-NEXT: kxorb %k5, %k4, %k4 +; SKX-NEXT: korb %k4, %k5, %k4 ; SKX-NEXT: kandb %k3, %k4, %k3 +; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k4 ; SKX-NEXT: kandb %k2, %k3, %k2 -; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k3 +; SKX-NEXT: kshiftlb $7, %k4, %k3 +; SKX-NEXT: kshiftrb $7, %k3, %k3 ; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k4 -; SKX-NEXT: kxorb %k0, %k4, %k4 -; SKX-NEXT: kshiftrb $2, %k4, %k5 -; SKX-NEXT: kxorb %k3, %k5, %k3 -; SKX-NEXT: kshiftlb $7, %k3, %k3 +; SKX-NEXT: korb %k0, %k3, %k3 +; SKX-NEXT: kshiftlb $7, %k4, %k4 +; SKX-NEXT: kshiftrb $6, %k4, %k4 +; SKX-NEXT: korb %k3, %k4, %k3 +; SKX-NEXT: kshiftrb $3, %k3, %k4 +; SKX-NEXT: kshiftlb 
$3, %k4, %k4 +; SKX-NEXT: kshiftlb $6, %k3, %k3 +; SKX-NEXT: kshiftrb $6, %k3, %k3 +; SKX-NEXT: korb %k4, %k3, %k3 +; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k4 +; SKX-NEXT: kshiftlb $7, %k4, %k4 +; SKX-NEXT: kshiftrb $5, %k4, %k4 +; SKX-NEXT: korb %k3, %k4, %k3 +; SKX-NEXT: kshiftrb $4, %k3, %k4 +; SKX-NEXT: kshiftlb $4, %k4, %k4 +; SKX-NEXT: kshiftlb $5, %k3, %k3 ; SKX-NEXT: kshiftrb $5, %k3, %k3 -; SKX-NEXT: kxorb %k3, %k4, %k3 +; SKX-NEXT: korb %k4, %k3, %k3 ; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k4 -; SKX-NEXT: kshiftrb $3, %k3, %k5 -; SKX-NEXT: kxorb %k4, %k5, %k4 ; SKX-NEXT: kshiftlb $7, %k4, %k4 ; SKX-NEXT: kshiftrb $4, %k4, %k4 -; SKX-NEXT: kxorb %k4, %k3, %k3 -; SKX-NEXT: kshiftrb $4, %k3, %k4 -; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k5 -; SKX-NEXT: kxorb %k5, %k4, %k4 +; SKX-NEXT: korb %k3, %k4, %k3 +; SKX-NEXT: kshiftrb $5, %k3, %k4 +; SKX-NEXT: kshiftlb $5, %k4, %k4 +; SKX-NEXT: kshiftlb $4, %k3, %k3 +; SKX-NEXT: kshiftrb $4, %k3, %k3 +; SKX-NEXT: korb %k4, %k3, %k3 +; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k4 ; SKX-NEXT: kshiftlb $7, %k4, %k4 ; SKX-NEXT: kshiftrb $3, %k4, %k4 -; SKX-NEXT: kxorb %k4, %k3, %k3 -; SKX-NEXT: kshiftrb $5, %k3, %k4 -; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k5 -; SKX-NEXT: kxorb %k5, %k4, %k4 +; SKX-NEXT: korb %k3, %k4, %k3 +; SKX-NEXT: kshiftrb $6, %k3, %k4 +; SKX-NEXT: kshiftlb $6, %k4, %k4 +; SKX-NEXT: kshiftlb $3, %k3, %k3 +; SKX-NEXT: kshiftrb $3, %k3, %k3 +; SKX-NEXT: korb %k4, %k3, %k3 +; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k4 ; SKX-NEXT: kshiftlb $7, %k4, %k4 ; SKX-NEXT: kshiftrb $2, %k4, %k4 -; SKX-NEXT: kxorb %k4, %k3, %k3 -; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k4 -; SKX-NEXT: kshiftrb $6, %k3, %k5 -; SKX-NEXT: kxorb %k4, %k5, %k4 +; SKX-NEXT: korb %k3, %k4, %k3 +; SKX-NEXT: kshiftrb $7, %k3, %k4 ; SKX-NEXT: kshiftlb $7, %k4, %k4 +; SKX-NEXT: kshiftlb $2, %k3, %k3 +; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k5 +; SKX-NEXT: kshiftrb $2, %k3, %k3 +; SKX-NEXT: korb %k4, %k3, %k3 +; SKX-NEXT: kshiftlb $7, %k5, %k4 ; SKX-NEXT: kmovb 
{{[0-9]+}}(%rsp), %k5 ; SKX-NEXT: kshiftrb $1, %k4, %k4 -; SKX-NEXT: kxorb %k4, %k3, %k3 -; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k4 -; SKX-NEXT: kxorb %k0, %k4, %k4 -; SKX-NEXT: kshiftrb $2, %k4, %k6 -; SKX-NEXT: kxorb %k5, %k6, %k5 +; SKX-NEXT: korb %k3, %k4, %k3 +; SKX-NEXT: kshiftlb $7, %k5, %k4 +; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k5 +; SKX-NEXT: kshiftrb $7, %k4, %k4 +; SKX-NEXT: korb %k0, %k4, %k4 ; SKX-NEXT: kshiftlb $7, %k5, %k5 -; SKX-NEXT: kshiftrb $5, %k5, %k5 -; SKX-NEXT: kxorb %k5, %k4, %k4 +; SKX-NEXT: kshiftrb $6, %k5, %k5 +; SKX-NEXT: korb %k4, %k5, %k4 ; SKX-NEXT: kshiftrb $3, %k4, %k5 +; SKX-NEXT: kshiftlb $3, %k5, %k5 +; SKX-NEXT: kshiftlb $6, %k4, %k4 +; SKX-NEXT: kshiftrb $6, %k4, %k4 ; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k6 -; SKX-NEXT: kxorb %k6, %k5, %k5 +; SKX-NEXT: korb %k5, %k4, %k4 +; SKX-NEXT: kshiftlb $7, %k6, %k5 +; SKX-NEXT: kshiftrb $5, %k5, %k5 +; SKX-NEXT: korb %k4, %k5, %k4 +; SKX-NEXT: kshiftrb $4, %k4, %k5 +; SKX-NEXT: kshiftlb $4, %k5, %k5 +; SKX-NEXT: kshiftlb $5, %k4, %k4 +; SKX-NEXT: kshiftrb $5, %k4, %k4 +; SKX-NEXT: korb %k5, %k4, %k4 +; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k5 ; SKX-NEXT: kshiftlb $7, %k5, %k5 ; SKX-NEXT: kshiftrb $4, %k5, %k5 -; SKX-NEXT: kxorb %k5, %k4, %k4 -; SKX-NEXT: kshiftrb $4, %k4, %k5 -; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k6 -; SKX-NEXT: kxorb %k6, %k5, %k5 +; SKX-NEXT: korb %k4, %k5, %k4 +; SKX-NEXT: kshiftrb $5, %k4, %k5 +; SKX-NEXT: kshiftlb $5, %k5, %k5 +; SKX-NEXT: kshiftlb $4, %k4, %k4 +; SKX-NEXT: kshiftrb $4, %k4, %k4 +; SKX-NEXT: korb %k5, %k4, %k4 +; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k5 ; SKX-NEXT: kshiftlb $7, %k5, %k5 ; SKX-NEXT: kshiftrb $3, %k5, %k5 -; SKX-NEXT: kxorb %k5, %k4, %k4 +; SKX-NEXT: korb %k4, %k5, %k4 +; SKX-NEXT: kshiftrb $6, %k4, %k5 +; SKX-NEXT: kshiftlb $6, %k5, %k5 +; SKX-NEXT: kshiftlb $3, %k4, %k4 +; SKX-NEXT: kshiftrb $3, %k4, %k4 +; SKX-NEXT: korb %k5, %k4, %k4 ; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k5 -; SKX-NEXT: kshiftrb $5, %k4, %k6 -; SKX-NEXT: kxorb %k5, %k6, 
%k5 ; SKX-NEXT: kshiftlb $7, %k5, %k5 ; SKX-NEXT: kshiftrb $2, %k5, %k5 -; SKX-NEXT: kxorb %k5, %k4, %k4 -; SKX-NEXT: kshiftrb $6, %k4, %k5 -; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k6 -; SKX-NEXT: kxorb %k6, %k5, %k5 +; SKX-NEXT: korb %k4, %k5, %k4 +; SKX-NEXT: kshiftrb $7, %k4, %k5 +; SKX-NEXT: kshiftlb $7, %k5, %k5 +; SKX-NEXT: kshiftlb $2, %k4, %k4 +; SKX-NEXT: kshiftrb $2, %k4, %k4 +; SKX-NEXT: korb %k5, %k4, %k4 +; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k5 ; SKX-NEXT: kshiftlb $7, %k5, %k5 ; SKX-NEXT: kshiftrb $1, %k5, %k5 -; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k6 -; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k7 -; SKX-NEXT: kxorb %k5, %k4, %k4 +; SKX-NEXT: korb %k4, %k5, %k4 ; SKX-NEXT: kandb %k3, %k4, %k3 -; SKX-NEXT: kxorb %k0, %k7, %k4 -; SKX-NEXT: kshiftrb $2, %k4, %k5 -; SKX-NEXT: kxorb %k6, %k5, %k5 +; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k4 +; SKX-NEXT: kshiftlb $7, %k4, %k4 +; SKX-NEXT: kshiftrb $7, %k4, %k4 +; SKX-NEXT: korb %k0, %k4, %k4 +; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k5 ; SKX-NEXT: kshiftlb $7, %k5, %k5 -; SKX-NEXT: kshiftrb $5, %k5, %k5 -; SKX-NEXT: kxorb %k5, %k4, %k4 +; SKX-NEXT: kshiftrb $6, %k5, %k5 +; SKX-NEXT: korb %k4, %k5, %k4 ; SKX-NEXT: kshiftrb $3, %k4, %k5 -; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k6 -; SKX-NEXT: kxorb %k6, %k5, %k5 +; SKX-NEXT: kshiftlb $3, %k5, %k5 +; SKX-NEXT: kshiftlb $6, %k4, %k4 +; SKX-NEXT: kshiftrb $6, %k4, %k4 +; SKX-NEXT: korb %k5, %k4, %k4 +; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k5 ; SKX-NEXT: kshiftlb $7, %k5, %k5 -; SKX-NEXT: kshiftrb $4, %k5, %k5 -; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k6 -; SKX-NEXT: kxorb %k5, %k4, %k4 +; SKX-NEXT: kshiftrb $5, %k5, %k5 +; SKX-NEXT: korb %k4, %k5, %k4 ; SKX-NEXT: kshiftrb $4, %k4, %k5 -; SKX-NEXT: kxorb %k6, %k5, %k5 -; SKX-NEXT: kshiftlb $7, %k5, %k5 -; SKX-NEXT: kshiftrb $3, %k5, %k5 -; SKX-NEXT: kxorb %k5, %k4, %k4 +; SKX-NEXT: kshiftlb $4, %k5, %k5 +; SKX-NEXT: kshiftlb $5, %k4, %k4 +; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k6 +; SKX-NEXT: kshiftrb $5, %k4, %k4 +; SKX-NEXT: korb %k5, %k4, 
%k4 +; SKX-NEXT: kshiftlb $7, %k6, %k5 +; SKX-NEXT: kshiftrb $4, %k5, %k5 +; SKX-NEXT: korb %k4, %k5, %k4 ; SKX-NEXT: kshiftrb $5, %k4, %k5 +; SKX-NEXT: kshiftlb $5, %k5, %k5 +; SKX-NEXT: kshiftlb $4, %k4, %k4 +; SKX-NEXT: kshiftrb $4, %k4, %k4 ; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k6 -; SKX-NEXT: kxorb %k6, %k5, %k5 +; SKX-NEXT: korb %k5, %k4, %k4 +; SKX-NEXT: kshiftlb $7, %k6, %k5 +; SKX-NEXT: kshiftrb $3, %k5, %k5 +; SKX-NEXT: korb %k4, %k5, %k4 +; SKX-NEXT: kshiftrb $6, %k4, %k5 +; SKX-NEXT: kshiftlb $6, %k5, %k5 +; SKX-NEXT: kshiftlb $3, %k4, %k4 +; SKX-NEXT: kshiftrb $3, %k4, %k4 +; SKX-NEXT: korb %k5, %k4, %k4 +; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k5 ; SKX-NEXT: kshiftlb $7, %k5, %k5 ; SKX-NEXT: kshiftrb $2, %k5, %k5 -; SKX-NEXT: kxorb %k5, %k4, %k4 -; SKX-NEXT: kshiftrb $6, %k4, %k5 -; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k6 -; SKX-NEXT: kxorb %k6, %k5, %k5 +; SKX-NEXT: korb %k4, %k5, %k4 +; SKX-NEXT: kshiftrb $7, %k4, %k5 +; SKX-NEXT: kshiftlb $7, %k5, %k5 +; SKX-NEXT: kshiftlb $2, %k4, %k4 +; SKX-NEXT: kshiftrb $2, %k4, %k4 +; SKX-NEXT: korb %k5, %k4, %k4 +; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k5 ; SKX-NEXT: kshiftlb $7, %k5, %k5 ; SKX-NEXT: kshiftrb $1, %k5, %k5 -; SKX-NEXT: kxorb %k5, %k4, %k4 +; SKX-NEXT: korb %k4, %k5, %k4 +; SKX-NEXT: kmovd %esi, %k5 +; SKX-NEXT: kshiftlb $7, %k5, %k5 +; SKX-NEXT: kshiftrb $7, %k5, %k5 +; SKX-NEXT: korb %k0, %k5, %k0 +; SKX-NEXT: kmovd %edx, %k5 +; SKX-NEXT: kshiftlb $7, %k5, %k5 +; SKX-NEXT: kshiftrb $6, %k5, %k5 +; SKX-NEXT: korb %k0, %k5, %k0 +; SKX-NEXT: kshiftrb $3, %k0, %k5 +; SKX-NEXT: kshiftlb $3, %k5, %k5 +; SKX-NEXT: kshiftlb $6, %k0, %k0 +; SKX-NEXT: kshiftrb $6, %k0, %k0 +; SKX-NEXT: korb %k5, %k0, %k0 ; SKX-NEXT: kmovd %ecx, %k5 -; SKX-NEXT: kmovd %esi, %k6 -; SKX-NEXT: kxorb %k0, %k6, %k0 -; SKX-NEXT: kshiftrb $2, %k0, %k6 -; SKX-NEXT: kxorb %k5, %k6, %k5 ; SKX-NEXT: kshiftlb $7, %k5, %k5 ; SKX-NEXT: kshiftrb $5, %k5, %k5 -; SKX-NEXT: kxorb %k5, %k0, %k0 -; SKX-NEXT: kshiftrb $3, %k0, %k5 -; SKX-NEXT: kmovd 
%r8d, %k6 -; SKX-NEXT: kxorb %k6, %k5, %k5 +; SKX-NEXT: korb %k0, %k5, %k0 +; SKX-NEXT: kshiftrb $4, %k0, %k5 +; SKX-NEXT: kshiftlb $4, %k5, %k5 +; SKX-NEXT: kshiftlb $5, %k0, %k0 +; SKX-NEXT: kshiftrb $5, %k0, %k0 +; SKX-NEXT: korb %k5, %k0, %k0 +; SKX-NEXT: kmovd %r8d, %k5 ; SKX-NEXT: kshiftlb $7, %k5, %k5 ; SKX-NEXT: kshiftrb $4, %k5, %k5 -; SKX-NEXT: kxorb %k5, %k0, %k0 -; SKX-NEXT: kshiftrb $4, %k0, %k5 -; SKX-NEXT: kmovd %r9d, %k6 -; SKX-NEXT: kxorb %k6, %k5, %k5 +; SKX-NEXT: korb %k0, %k5, %k0 +; SKX-NEXT: kshiftrb $5, %k0, %k5 +; SKX-NEXT: kshiftlb $5, %k5, %k5 +; SKX-NEXT: kshiftlb $4, %k0, %k0 +; SKX-NEXT: kshiftrb $4, %k0, %k0 +; SKX-NEXT: korb %k5, %k0, %k0 +; SKX-NEXT: kmovd %r9d, %k5 ; SKX-NEXT: kshiftlb $7, %k5, %k5 ; SKX-NEXT: kshiftrb $3, %k5, %k5 -; SKX-NEXT: kxorb %k5, %k0, %k0 -; SKX-NEXT: kshiftrb $5, %k0, %k5 -; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k6 -; SKX-NEXT: kxorb %k6, %k5, %k5 +; SKX-NEXT: korb %k0, %k5, %k0 +; SKX-NEXT: kshiftrb $6, %k0, %k5 +; SKX-NEXT: kshiftlb $6, %k5, %k5 +; SKX-NEXT: kshiftlb $3, %k0, %k0 +; SKX-NEXT: kshiftrb $3, %k0, %k0 +; SKX-NEXT: korb %k5, %k0, %k0 +; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k5 ; SKX-NEXT: kshiftlb $7, %k5, %k5 ; SKX-NEXT: kshiftrb $2, %k5, %k5 -; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k6 -; SKX-NEXT: kxorb %k5, %k0, %k0 -; SKX-NEXT: kshiftrb $6, %k0, %k5 -; SKX-NEXT: kxorb %k6, %k5, %k5 +; SKX-NEXT: korb %k0, %k5, %k0 +; SKX-NEXT: kshiftrb $7, %k0, %k5 +; SKX-NEXT: kshiftlb $7, %k5, %k5 +; SKX-NEXT: kshiftlb $2, %k0, %k0 +; SKX-NEXT: kshiftrb $2, %k0, %k0 +; SKX-NEXT: korb %k5, %k0, %k0 +; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k5 ; SKX-NEXT: kshiftlb $7, %k5, %k5 ; SKX-NEXT: kshiftrb $1, %k5, %k5 -; SKX-NEXT: kxorb %k5, %k0, %k0 +; SKX-NEXT: korb %k0, %k5, %k0 ; SKX-NEXT: kandb %k4, %k0, %k0 ; SKX-NEXT: kandb %k3, %k0, %k0 ; SKX-NEXT: kandb %k2, %k0, %k0 @@ -2144,362 +2720,557 @@ define <7 x i1> @test17(<7 x i1> %a, <7 x i1> %b, <7 x i1> %c, <7 x i1> %d, <7 x ; KNL_X32-LABEL: test17: ; KNL_X32: ## %bb.0: 
; KNL_X32-NEXT: pushl %ebx -; KNL_X32-NEXT: subl $8, %esp +; KNL_X32-NEXT: pushl %eax ; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al ; KNL_X32-NEXT: kmovw %eax, %k0 +; KNL_X32-NEXT: kshiftlw $15, %k0, %k0 +; KNL_X32-NEXT: kshiftrw $15, %k0, %k0 +; KNL_X32-NEXT: kshiftlw $2, %k0, %k1 +; KNL_X32-NEXT: korw %k1, %k0, %k0 ; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al ; KNL_X32-NEXT: kmovw %eax, %k2 -; KNL_X32-NEXT: kshiftlw $15, %k0, %k1 -; KNL_X32-NEXT: kshiftrw $14, %k1, %k1 -; KNL_X32-NEXT: kxorw %k1, %k2, %k2 -; KNL_X32-NEXT: kshiftrw $2, %k2, %k3 -; KNL_X32-NEXT: kxorw %k0, %k3, %k0 -; KNL_X32-NEXT: kshiftlw $15, %k0, %k0 -; KNL_X32-NEXT: kshiftrw $13, %k0, %k0 -; KNL_X32-NEXT: kxorw %k0, %k2, %k0 +; KNL_X32-NEXT: kshiftlw $15, %k2, %k2 +; KNL_X32-NEXT: kshiftrw $14, %k2, %k2 +; KNL_X32-NEXT: korw %k0, %k2, %k0 ; KNL_X32-NEXT: kshiftrw $3, %k0, %k2 +; KNL_X32-NEXT: kshiftlw $3, %k2, %k2 +; KNL_X32-NEXT: kshiftlw $14, %k0, %k0 +; KNL_X32-NEXT: kshiftrw $14, %k0, %k0 +; KNL_X32-NEXT: korw %k2, %k0, %k0 ; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al -; KNL_X32-NEXT: kmovw %eax, %k3 -; KNL_X32-NEXT: kxorw %k3, %k2, %k2 +; KNL_X32-NEXT: kmovw %eax, %k2 ; KNL_X32-NEXT: kshiftlw $15, %k2, %k2 -; KNL_X32-NEXT: kshiftrw $12, %k2, %k2 -; KNL_X32-NEXT: kxorw %k2, %k0, %k0 +; KNL_X32-NEXT: kshiftrw $13, %k2, %k2 +; KNL_X32-NEXT: korw %k0, %k2, %k0 ; KNL_X32-NEXT: kshiftrw $4, %k0, %k2 +; KNL_X32-NEXT: kshiftlw $4, %k2, %k2 +; KNL_X32-NEXT: kshiftlw $13, %k0, %k0 +; KNL_X32-NEXT: kshiftrw $13, %k0, %k0 +; KNL_X32-NEXT: korw %k2, %k0, %k0 ; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al -; KNL_X32-NEXT: kmovw %eax, %k3 -; KNL_X32-NEXT: kxorw %k3, %k2, %k2 +; KNL_X32-NEXT: kmovw %eax, %k2 ; KNL_X32-NEXT: kshiftlw $15, %k2, %k2 -; KNL_X32-NEXT: kshiftrw $11, %k2, %k2 -; KNL_X32-NEXT: kxorw %k2, %k0, %k0 +; KNL_X32-NEXT: kshiftrw $12, %k2, %k2 +; KNL_X32-NEXT: korw %k0, %k2, %k0 ; KNL_X32-NEXT: kshiftrw $5, %k0, %k2 +; KNL_X32-NEXT: kshiftlw $5, %k2, %k2 +; KNL_X32-NEXT: kshiftlw $12, %k0, %k0 +; 
KNL_X32-NEXT: kshiftrw $12, %k0, %k0 +; KNL_X32-NEXT: korw %k2, %k0, %k0 ; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al -; KNL_X32-NEXT: kmovw %eax, %k3 -; KNL_X32-NEXT: kxorw %k3, %k2, %k2 +; KNL_X32-NEXT: kmovw %eax, %k2 ; KNL_X32-NEXT: kshiftlw $15, %k2, %k2 -; KNL_X32-NEXT: kshiftrw $10, %k2, %k2 -; KNL_X32-NEXT: kxorw %k2, %k0, %k0 +; KNL_X32-NEXT: kshiftrw $11, %k2, %k2 +; KNL_X32-NEXT: korw %k0, %k2, %k0 ; KNL_X32-NEXT: kshiftrw $6, %k0, %k2 +; KNL_X32-NEXT: kshiftlw $6, %k2, %k2 +; KNL_X32-NEXT: kshiftlw $11, %k0, %k0 +; KNL_X32-NEXT: kshiftrw $11, %k0, %k0 +; KNL_X32-NEXT: korw %k2, %k0, %k0 ; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al -; KNL_X32-NEXT: kmovw %eax, %k3 -; KNL_X32-NEXT: kxorw %k3, %k2, %k2 +; KNL_X32-NEXT: kmovw %eax, %k2 +; KNL_X32-NEXT: kshiftlw $15, %k2, %k2 +; KNL_X32-NEXT: kshiftrw $10, %k2, %k2 +; KNL_X32-NEXT: korw %k0, %k2, %k0 +; KNL_X32-NEXT: kshiftrw $7, %k0, %k2 +; KNL_X32-NEXT: kshiftlw $7, %k2, %k2 +; KNL_X32-NEXT: kshiftlw $10, %k0, %k0 +; KNL_X32-NEXT: kshiftrw $10, %k0, %k0 +; KNL_X32-NEXT: korw %k2, %k0, %k0 +; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al +; KNL_X32-NEXT: kmovw %eax, %k2 ; KNL_X32-NEXT: kshiftlw $15, %k2, %k2 ; KNL_X32-NEXT: kshiftrw $9, %k2, %k2 -; KNL_X32-NEXT: kxorw %k2, %k0, %k0 +; KNL_X32-NEXT: korw %k0, %k2, %k0 ; KNL_X32-NEXT: kmovw %k0, {{[-0-9]+}}(%e{{[sb]}}p) ## 2-byte Spill ; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al ; KNL_X32-NEXT: kmovw %eax, %k0 +; KNL_X32-NEXT: kshiftlw $15, %k0, %k0 +; KNL_X32-NEXT: kshiftrw $15, %k0, %k0 +; KNL_X32-NEXT: korw %k1, %k0, %k0 ; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al ; KNL_X32-NEXT: kmovw %eax, %k2 -; KNL_X32-NEXT: kxorw %k1, %k2, %k2 -; KNL_X32-NEXT: kshiftrw $2, %k2, %k3 -; KNL_X32-NEXT: kxorw %k0, %k3, %k0 -; KNL_X32-NEXT: kshiftlw $15, %k0, %k0 -; KNL_X32-NEXT: kshiftrw $13, %k0, %k0 -; KNL_X32-NEXT: kxorw %k0, %k2, %k0 +; KNL_X32-NEXT: kshiftlw $15, %k2, %k2 +; KNL_X32-NEXT: kshiftrw $14, %k2, %k2 +; KNL_X32-NEXT: korw %k0, %k2, %k0 ; KNL_X32-NEXT: kshiftrw $3, 
%k0, %k2 +; KNL_X32-NEXT: kshiftlw $3, %k2, %k2 +; KNL_X32-NEXT: kshiftlw $14, %k0, %k0 +; KNL_X32-NEXT: kshiftrw $14, %k0, %k0 +; KNL_X32-NEXT: korw %k2, %k0, %k0 ; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al -; KNL_X32-NEXT: kmovw %eax, %k3 -; KNL_X32-NEXT: kxorw %k3, %k2, %k2 +; KNL_X32-NEXT: kmovw %eax, %k2 ; KNL_X32-NEXT: kshiftlw $15, %k2, %k2 -; KNL_X32-NEXT: kshiftrw $12, %k2, %k2 -; KNL_X32-NEXT: kxorw %k2, %k0, %k0 +; KNL_X32-NEXT: kshiftrw $13, %k2, %k2 +; KNL_X32-NEXT: korw %k0, %k2, %k0 ; KNL_X32-NEXT: kshiftrw $4, %k0, %k2 +; KNL_X32-NEXT: kshiftlw $4, %k2, %k2 +; KNL_X32-NEXT: kshiftlw $13, %k0, %k0 +; KNL_X32-NEXT: kshiftrw $13, %k0, %k0 +; KNL_X32-NEXT: korw %k2, %k0, %k0 ; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al -; KNL_X32-NEXT: kmovw %eax, %k3 -; KNL_X32-NEXT: kxorw %k3, %k2, %k2 +; KNL_X32-NEXT: kmovw %eax, %k2 ; KNL_X32-NEXT: kshiftlw $15, %k2, %k2 -; KNL_X32-NEXT: kshiftrw $11, %k2, %k2 -; KNL_X32-NEXT: kxorw %k2, %k0, %k0 +; KNL_X32-NEXT: kshiftrw $12, %k2, %k2 +; KNL_X32-NEXT: korw %k0, %k2, %k0 ; KNL_X32-NEXT: kshiftrw $5, %k0, %k2 +; KNL_X32-NEXT: kshiftlw $5, %k2, %k2 +; KNL_X32-NEXT: kshiftlw $12, %k0, %k0 +; KNL_X32-NEXT: kshiftrw $12, %k0, %k0 +; KNL_X32-NEXT: korw %k2, %k0, %k0 ; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al -; KNL_X32-NEXT: kmovw %eax, %k3 -; KNL_X32-NEXT: kxorw %k3, %k2, %k2 +; KNL_X32-NEXT: kmovw %eax, %k2 ; KNL_X32-NEXT: kshiftlw $15, %k2, %k2 -; KNL_X32-NEXT: kshiftrw $10, %k2, %k2 -; KNL_X32-NEXT: kxorw %k2, %k0, %k0 +; KNL_X32-NEXT: kshiftrw $11, %k2, %k2 +; KNL_X32-NEXT: korw %k0, %k2, %k0 ; KNL_X32-NEXT: kshiftrw $6, %k0, %k2 +; KNL_X32-NEXT: kshiftlw $6, %k2, %k2 +; KNL_X32-NEXT: kshiftlw $11, %k0, %k0 +; KNL_X32-NEXT: kshiftrw $11, %k0, %k0 +; KNL_X32-NEXT: korw %k2, %k0, %k0 ; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al -; KNL_X32-NEXT: kmovw %eax, %k3 -; KNL_X32-NEXT: kxorw %k3, %k2, %k2 +; KNL_X32-NEXT: kmovw %eax, %k2 +; KNL_X32-NEXT: kshiftlw $15, %k2, %k2 +; KNL_X32-NEXT: kshiftrw $10, %k2, %k2 +; KNL_X32-NEXT: 
korw %k0, %k2, %k0 +; KNL_X32-NEXT: kshiftrw $7, %k0, %k2 +; KNL_X32-NEXT: kshiftlw $7, %k2, %k2 +; KNL_X32-NEXT: kshiftlw $10, %k0, %k0 +; KNL_X32-NEXT: kshiftrw $10, %k0, %k0 +; KNL_X32-NEXT: korw %k2, %k0, %k0 +; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al +; KNL_X32-NEXT: kmovw %eax, %k2 ; KNL_X32-NEXT: kshiftlw $15, %k2, %k2 ; KNL_X32-NEXT: kshiftrw $9, %k2, %k2 -; KNL_X32-NEXT: kxorw %k2, %k0, %k0 -; KNL_X32-NEXT: kmovw %k0, {{[-0-9]+}}(%e{{[sb]}}p) ## 2-byte Spill +; KNL_X32-NEXT: korw %k0, %k2, %k0 +; KNL_X32-NEXT: kmovw %k0, (%esp) ## 2-byte Spill ; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al ; KNL_X32-NEXT: kmovw %eax, %k0 +; KNL_X32-NEXT: kshiftlw $15, %k0, %k0 +; KNL_X32-NEXT: kshiftrw $15, %k0, %k0 +; KNL_X32-NEXT: korw %k1, %k0, %k0 ; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al ; KNL_X32-NEXT: kmovw %eax, %k3 -; KNL_X32-NEXT: kxorw %k1, %k3, %k3 -; KNL_X32-NEXT: kshiftrw $2, %k3, %k4 -; KNL_X32-NEXT: kxorw %k0, %k4, %k0 -; KNL_X32-NEXT: kshiftlw $15, %k0, %k0 -; KNL_X32-NEXT: kshiftrw $13, %k0, %k0 -; KNL_X32-NEXT: kxorw %k0, %k3, %k0 +; KNL_X32-NEXT: kshiftlw $15, %k3, %k3 +; KNL_X32-NEXT: kshiftrw $14, %k3, %k3 +; KNL_X32-NEXT: korw %k0, %k3, %k0 ; KNL_X32-NEXT: kshiftrw $3, %k0, %k3 +; KNL_X32-NEXT: kshiftlw $3, %k3, %k3 +; KNL_X32-NEXT: kshiftlw $14, %k0, %k0 +; KNL_X32-NEXT: kshiftrw $14, %k0, %k0 +; KNL_X32-NEXT: korw %k3, %k0, %k0 ; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al -; KNL_X32-NEXT: kmovw %eax, %k4 -; KNL_X32-NEXT: kxorw %k4, %k3, %k3 +; KNL_X32-NEXT: kmovw %eax, %k3 ; KNL_X32-NEXT: kshiftlw $15, %k3, %k3 -; KNL_X32-NEXT: kshiftrw $12, %k3, %k3 -; KNL_X32-NEXT: kxorw %k3, %k0, %k0 +; KNL_X32-NEXT: kshiftrw $13, %k3, %k3 +; KNL_X32-NEXT: korw %k0, %k3, %k0 ; KNL_X32-NEXT: kshiftrw $4, %k0, %k3 +; KNL_X32-NEXT: kshiftlw $4, %k3, %k3 +; KNL_X32-NEXT: kshiftlw $13, %k0, %k0 +; KNL_X32-NEXT: kshiftrw $13, %k0, %k0 +; KNL_X32-NEXT: korw %k3, %k0, %k0 ; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al -; KNL_X32-NEXT: kmovw %eax, %k4 -; KNL_X32-NEXT: kxorw %k4, 
%k3, %k3 +; KNL_X32-NEXT: kmovw %eax, %k3 ; KNL_X32-NEXT: kshiftlw $15, %k3, %k3 -; KNL_X32-NEXT: kshiftrw $11, %k3, %k3 -; KNL_X32-NEXT: kxorw %k3, %k0, %k0 +; KNL_X32-NEXT: kshiftrw $12, %k3, %k3 +; KNL_X32-NEXT: korw %k0, %k3, %k0 ; KNL_X32-NEXT: kshiftrw $5, %k0, %k3 +; KNL_X32-NEXT: kshiftlw $5, %k3, %k3 +; KNL_X32-NEXT: kshiftlw $12, %k0, %k0 +; KNL_X32-NEXT: kshiftrw $12, %k0, %k0 +; KNL_X32-NEXT: korw %k3, %k0, %k0 ; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al -; KNL_X32-NEXT: kmovw %eax, %k4 -; KNL_X32-NEXT: kxorw %k4, %k3, %k3 +; KNL_X32-NEXT: kmovw %eax, %k3 ; KNL_X32-NEXT: kshiftlw $15, %k3, %k3 -; KNL_X32-NEXT: kshiftrw $10, %k3, %k3 -; KNL_X32-NEXT: kxorw %k3, %k0, %k0 +; KNL_X32-NEXT: kshiftrw $11, %k3, %k3 +; KNL_X32-NEXT: korw %k0, %k3, %k0 ; KNL_X32-NEXT: kshiftrw $6, %k0, %k3 +; KNL_X32-NEXT: kshiftlw $6, %k3, %k3 +; KNL_X32-NEXT: kshiftlw $11, %k0, %k0 +; KNL_X32-NEXT: kshiftrw $11, %k0, %k0 +; KNL_X32-NEXT: korw %k3, %k0, %k0 ; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al -; KNL_X32-NEXT: kmovw %eax, %k4 -; KNL_X32-NEXT: kxorw %k4, %k3, %k3 +; KNL_X32-NEXT: kmovw %eax, %k3 +; KNL_X32-NEXT: kshiftlw $15, %k3, %k3 +; KNL_X32-NEXT: kshiftrw $10, %k3, %k3 +; KNL_X32-NEXT: korw %k0, %k3, %k0 +; KNL_X32-NEXT: kshiftrw $7, %k0, %k3 +; KNL_X32-NEXT: kshiftlw $7, %k3, %k3 +; KNL_X32-NEXT: kshiftlw $10, %k0, %k0 +; KNL_X32-NEXT: kshiftrw $10, %k0, %k0 +; KNL_X32-NEXT: korw %k3, %k0, %k0 +; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al +; KNL_X32-NEXT: kmovw %eax, %k3 ; KNL_X32-NEXT: kshiftlw $15, %k3, %k3 ; KNL_X32-NEXT: kshiftrw $9, %k3, %k3 -; KNL_X32-NEXT: kxorw %k3, %k0, %k0 -; KNL_X32-NEXT: kmovw %k0, {{[-0-9]+}}(%e{{[sb]}}p) ## 2-byte Spill +; KNL_X32-NEXT: korw %k0, %k3, %k3 ; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al ; KNL_X32-NEXT: kmovw %eax, %k0 +; KNL_X32-NEXT: kshiftlw $15, %k0, %k0 +; KNL_X32-NEXT: kshiftrw $15, %k0, %k0 +; KNL_X32-NEXT: korw %k1, %k0, %k0 ; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al ; KNL_X32-NEXT: kmovw %eax, %k4 -; KNL_X32-NEXT: kxorw 
%k1, %k4, %k4 -; KNL_X32-NEXT: kshiftrw $2, %k4, %k5 -; KNL_X32-NEXT: kxorw %k0, %k5, %k0 -; KNL_X32-NEXT: kshiftlw $15, %k0, %k0 -; KNL_X32-NEXT: kshiftrw $13, %k0, %k0 -; KNL_X32-NEXT: kxorw %k0, %k4, %k0 +; KNL_X32-NEXT: kshiftlw $15, %k4, %k4 +; KNL_X32-NEXT: kshiftrw $14, %k4, %k4 +; KNL_X32-NEXT: korw %k0, %k4, %k0 ; KNL_X32-NEXT: kshiftrw $3, %k0, %k4 +; KNL_X32-NEXT: kshiftlw $3, %k4, %k4 +; KNL_X32-NEXT: kshiftlw $14, %k0, %k0 +; KNL_X32-NEXT: kshiftrw $14, %k0, %k0 +; KNL_X32-NEXT: korw %k4, %k0, %k0 ; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al -; KNL_X32-NEXT: kmovw %eax, %k5 -; KNL_X32-NEXT: kxorw %k5, %k4, %k4 +; KNL_X32-NEXT: kmovw %eax, %k4 ; KNL_X32-NEXT: kshiftlw $15, %k4, %k4 -; KNL_X32-NEXT: kshiftrw $12, %k4, %k4 -; KNL_X32-NEXT: kxorw %k4, %k0, %k0 +; KNL_X32-NEXT: kshiftrw $13, %k4, %k4 +; KNL_X32-NEXT: korw %k0, %k4, %k0 ; KNL_X32-NEXT: kshiftrw $4, %k0, %k4 +; KNL_X32-NEXT: kshiftlw $4, %k4, %k4 +; KNL_X32-NEXT: kshiftlw $13, %k0, %k0 +; KNL_X32-NEXT: kshiftrw $13, %k0, %k0 +; KNL_X32-NEXT: korw %k4, %k0, %k0 ; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al -; KNL_X32-NEXT: kmovw %eax, %k5 -; KNL_X32-NEXT: kxorw %k5, %k4, %k4 +; KNL_X32-NEXT: kmovw %eax, %k4 ; KNL_X32-NEXT: kshiftlw $15, %k4, %k4 -; KNL_X32-NEXT: kshiftrw $11, %k4, %k4 -; KNL_X32-NEXT: kxorw %k4, %k0, %k0 +; KNL_X32-NEXT: kshiftrw $12, %k4, %k4 +; KNL_X32-NEXT: korw %k0, %k4, %k0 ; KNL_X32-NEXT: kshiftrw $5, %k0, %k4 +; KNL_X32-NEXT: kshiftlw $5, %k4, %k4 +; KNL_X32-NEXT: kshiftlw $12, %k0, %k0 +; KNL_X32-NEXT: kshiftrw $12, %k0, %k0 +; KNL_X32-NEXT: korw %k4, %k0, %k0 ; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al -; KNL_X32-NEXT: kmovw %eax, %k5 -; KNL_X32-NEXT: kxorw %k5, %k4, %k4 +; KNL_X32-NEXT: kmovw %eax, %k4 ; KNL_X32-NEXT: kshiftlw $15, %k4, %k4 -; KNL_X32-NEXT: kshiftrw $10, %k4, %k4 -; KNL_X32-NEXT: kxorw %k4, %k0, %k0 +; KNL_X32-NEXT: kshiftrw $11, %k4, %k4 +; KNL_X32-NEXT: korw %k0, %k4, %k0 ; KNL_X32-NEXT: kshiftrw $6, %k0, %k4 +; KNL_X32-NEXT: kshiftlw $6, %k4, %k4 +; 
KNL_X32-NEXT: kshiftlw $11, %k0, %k0 +; KNL_X32-NEXT: kshiftrw $11, %k0, %k0 +; KNL_X32-NEXT: korw %k4, %k0, %k0 ; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al -; KNL_X32-NEXT: kmovw %eax, %k5 -; KNL_X32-NEXT: kxorw %k5, %k4, %k4 +; KNL_X32-NEXT: kmovw %eax, %k4 +; KNL_X32-NEXT: kshiftlw $15, %k4, %k4 +; KNL_X32-NEXT: kshiftrw $10, %k4, %k4 +; KNL_X32-NEXT: korw %k0, %k4, %k0 +; KNL_X32-NEXT: kshiftrw $7, %k0, %k4 +; KNL_X32-NEXT: kshiftlw $7, %k4, %k4 +; KNL_X32-NEXT: kshiftlw $10, %k0, %k0 +; KNL_X32-NEXT: kshiftrw $10, %k0, %k0 +; KNL_X32-NEXT: korw %k4, %k0, %k0 +; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al +; KNL_X32-NEXT: kmovw %eax, %k4 ; KNL_X32-NEXT: kshiftlw $15, %k4, %k4 ; KNL_X32-NEXT: kshiftrw $9, %k4, %k4 -; KNL_X32-NEXT: kxorw %k4, %k0, %k4 +; KNL_X32-NEXT: korw %k0, %k4, %k4 ; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al ; KNL_X32-NEXT: kmovw %eax, %k0 +; KNL_X32-NEXT: kshiftlw $15, %k0, %k0 +; KNL_X32-NEXT: kshiftrw $15, %k0, %k0 +; KNL_X32-NEXT: korw %k1, %k0, %k0 ; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al ; KNL_X32-NEXT: kmovw %eax, %k5 -; KNL_X32-NEXT: kxorw %k1, %k5, %k5 -; KNL_X32-NEXT: kshiftrw $2, %k5, %k6 -; KNL_X32-NEXT: kxorw %k0, %k6, %k0 -; KNL_X32-NEXT: kshiftlw $15, %k0, %k0 -; KNL_X32-NEXT: kshiftrw $13, %k0, %k0 -; KNL_X32-NEXT: kxorw %k0, %k5, %k0 +; KNL_X32-NEXT: kshiftlw $15, %k5, %k5 +; KNL_X32-NEXT: kshiftrw $14, %k5, %k5 +; KNL_X32-NEXT: korw %k0, %k5, %k0 ; KNL_X32-NEXT: kshiftrw $3, %k0, %k5 +; KNL_X32-NEXT: kshiftlw $3, %k5, %k5 +; KNL_X32-NEXT: kshiftlw $14, %k0, %k0 +; KNL_X32-NEXT: kshiftrw $14, %k0, %k0 +; KNL_X32-NEXT: korw %k5, %k0, %k0 ; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al -; KNL_X32-NEXT: kmovw %eax, %k6 -; KNL_X32-NEXT: kxorw %k6, %k5, %k5 +; KNL_X32-NEXT: kmovw %eax, %k5 ; KNL_X32-NEXT: kshiftlw $15, %k5, %k5 -; KNL_X32-NEXT: kshiftrw $12, %k5, %k5 -; KNL_X32-NEXT: kxorw %k5, %k0, %k0 +; KNL_X32-NEXT: kshiftrw $13, %k5, %k5 +; KNL_X32-NEXT: korw %k0, %k5, %k0 ; KNL_X32-NEXT: kshiftrw $4, %k0, %k5 +; KNL_X32-NEXT: 
kshiftlw $4, %k5, %k5 +; KNL_X32-NEXT: kshiftlw $13, %k0, %k0 +; KNL_X32-NEXT: kshiftrw $13, %k0, %k0 +; KNL_X32-NEXT: korw %k5, %k0, %k0 ; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al -; KNL_X32-NEXT: kmovw %eax, %k6 -; KNL_X32-NEXT: kxorw %k6, %k5, %k5 +; KNL_X32-NEXT: kmovw %eax, %k5 ; KNL_X32-NEXT: kshiftlw $15, %k5, %k5 -; KNL_X32-NEXT: kshiftrw $11, %k5, %k5 -; KNL_X32-NEXT: kxorw %k5, %k0, %k0 +; KNL_X32-NEXT: kshiftrw $12, %k5, %k5 +; KNL_X32-NEXT: korw %k0, %k5, %k0 ; KNL_X32-NEXT: kshiftrw $5, %k0, %k5 +; KNL_X32-NEXT: kshiftlw $5, %k5, %k5 +; KNL_X32-NEXT: kshiftlw $12, %k0, %k0 +; KNL_X32-NEXT: kshiftrw $12, %k0, %k0 +; KNL_X32-NEXT: korw %k5, %k0, %k0 ; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al -; KNL_X32-NEXT: kmovw %eax, %k6 -; KNL_X32-NEXT: kxorw %k6, %k5, %k5 +; KNL_X32-NEXT: kmovw %eax, %k5 ; KNL_X32-NEXT: kshiftlw $15, %k5, %k5 -; KNL_X32-NEXT: kshiftrw $10, %k5, %k5 -; KNL_X32-NEXT: kxorw %k5, %k0, %k0 +; KNL_X32-NEXT: kshiftrw $11, %k5, %k5 +; KNL_X32-NEXT: korw %k0, %k5, %k0 ; KNL_X32-NEXT: kshiftrw $6, %k0, %k5 +; KNL_X32-NEXT: kshiftlw $6, %k5, %k5 +; KNL_X32-NEXT: kshiftlw $11, %k0, %k0 +; KNL_X32-NEXT: kshiftrw $11, %k0, %k0 +; KNL_X32-NEXT: korw %k5, %k0, %k0 ; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al -; KNL_X32-NEXT: kmovw %eax, %k6 -; KNL_X32-NEXT: kxorw %k6, %k5, %k5 +; KNL_X32-NEXT: kmovw %eax, %k5 +; KNL_X32-NEXT: kshiftlw $15, %k5, %k5 +; KNL_X32-NEXT: kshiftrw $10, %k5, %k5 +; KNL_X32-NEXT: korw %k0, %k5, %k0 +; KNL_X32-NEXT: kshiftrw $7, %k0, %k5 +; KNL_X32-NEXT: kshiftlw $7, %k5, %k5 +; KNL_X32-NEXT: kshiftlw $10, %k0, %k0 +; KNL_X32-NEXT: kshiftrw $10, %k0, %k0 +; KNL_X32-NEXT: korw %k5, %k0, %k0 +; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al +; KNL_X32-NEXT: kmovw %eax, %k5 ; KNL_X32-NEXT: kshiftlw $15, %k5, %k5 ; KNL_X32-NEXT: kshiftrw $9, %k5, %k5 -; KNL_X32-NEXT: kxorw %k5, %k0, %k5 +; KNL_X32-NEXT: korw %k0, %k5, %k5 ; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al ; KNL_X32-NEXT: kmovw %eax, %k0 +; KNL_X32-NEXT: kshiftlw $15, %k0, %k0 
+; KNL_X32-NEXT: kshiftrw $15, %k0, %k0 +; KNL_X32-NEXT: korw %k1, %k0, %k0 ; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al ; KNL_X32-NEXT: kmovw %eax, %k6 -; KNL_X32-NEXT: kxorw %k1, %k6, %k6 -; KNL_X32-NEXT: kshiftrw $2, %k6, %k7 -; KNL_X32-NEXT: kxorw %k0, %k7, %k0 -; KNL_X32-NEXT: kshiftlw $15, %k0, %k0 -; KNL_X32-NEXT: kshiftrw $13, %k0, %k0 -; KNL_X32-NEXT: kxorw %k0, %k6, %k0 +; KNL_X32-NEXT: kshiftlw $15, %k6, %k6 +; KNL_X32-NEXT: kshiftrw $14, %k6, %k6 +; KNL_X32-NEXT: korw %k0, %k6, %k0 ; KNL_X32-NEXT: kshiftrw $3, %k0, %k6 +; KNL_X32-NEXT: kshiftlw $3, %k6, %k6 +; KNL_X32-NEXT: kshiftlw $14, %k0, %k0 +; KNL_X32-NEXT: kshiftrw $14, %k0, %k0 +; KNL_X32-NEXT: korw %k6, %k0, %k0 ; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al -; KNL_X32-NEXT: kmovw %eax, %k7 -; KNL_X32-NEXT: kxorw %k7, %k6, %k6 +; KNL_X32-NEXT: kmovw %eax, %k6 ; KNL_X32-NEXT: kshiftlw $15, %k6, %k6 -; KNL_X32-NEXT: kshiftrw $12, %k6, %k6 -; KNL_X32-NEXT: kxorw %k6, %k0, %k0 +; KNL_X32-NEXT: kshiftrw $13, %k6, %k6 +; KNL_X32-NEXT: korw %k0, %k6, %k0 ; KNL_X32-NEXT: kshiftrw $4, %k0, %k6 +; KNL_X32-NEXT: kshiftlw $4, %k6, %k6 +; KNL_X32-NEXT: kshiftlw $13, %k0, %k0 +; KNL_X32-NEXT: kshiftrw $13, %k0, %k0 +; KNL_X32-NEXT: korw %k6, %k0, %k0 ; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al -; KNL_X32-NEXT: kmovw %eax, %k7 -; KNL_X32-NEXT: kxorw %k7, %k6, %k6 +; KNL_X32-NEXT: kmovw %eax, %k6 ; KNL_X32-NEXT: kshiftlw $15, %k6, %k6 -; KNL_X32-NEXT: kshiftrw $11, %k6, %k6 -; KNL_X32-NEXT: kxorw %k6, %k0, %k0 +; KNL_X32-NEXT: kshiftrw $12, %k6, %k6 +; KNL_X32-NEXT: korw %k0, %k6, %k0 ; KNL_X32-NEXT: kshiftrw $5, %k0, %k6 +; KNL_X32-NEXT: kshiftlw $5, %k6, %k6 +; KNL_X32-NEXT: kshiftlw $12, %k0, %k0 +; KNL_X32-NEXT: kshiftrw $12, %k0, %k0 +; KNL_X32-NEXT: korw %k6, %k0, %k0 ; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al -; KNL_X32-NEXT: kmovw %eax, %k7 -; KNL_X32-NEXT: kxorw %k7, %k6, %k6 +; KNL_X32-NEXT: kmovw %eax, %k6 ; KNL_X32-NEXT: kshiftlw $15, %k6, %k6 -; KNL_X32-NEXT: kshiftrw $10, %k6, %k6 -; KNL_X32-NEXT: kxorw 
%k6, %k0, %k0 +; KNL_X32-NEXT: kshiftrw $11, %k6, %k6 +; KNL_X32-NEXT: korw %k0, %k6, %k0 ; KNL_X32-NEXT: kshiftrw $6, %k0, %k6 +; KNL_X32-NEXT: kshiftlw $6, %k6, %k6 +; KNL_X32-NEXT: kshiftlw $11, %k0, %k0 +; KNL_X32-NEXT: kshiftrw $11, %k0, %k0 +; KNL_X32-NEXT: korw %k6, %k0, %k0 ; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al -; KNL_X32-NEXT: kmovw %eax, %k7 -; KNL_X32-NEXT: kxorw %k7, %k6, %k6 +; KNL_X32-NEXT: kmovw %eax, %k6 +; KNL_X32-NEXT: kshiftlw $15, %k6, %k6 +; KNL_X32-NEXT: kshiftrw $10, %k6, %k6 +; KNL_X32-NEXT: korw %k0, %k6, %k0 +; KNL_X32-NEXT: kshiftrw $7, %k0, %k6 +; KNL_X32-NEXT: kshiftlw $7, %k6, %k6 +; KNL_X32-NEXT: kshiftlw $10, %k0, %k0 +; KNL_X32-NEXT: kshiftrw $10, %k0, %k0 +; KNL_X32-NEXT: korw %k6, %k0, %k0 +; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al +; KNL_X32-NEXT: kmovw %eax, %k6 ; KNL_X32-NEXT: kshiftlw $15, %k6, %k6 ; KNL_X32-NEXT: kshiftrw $9, %k6, %k6 -; KNL_X32-NEXT: kxorw %k6, %k0, %k6 +; KNL_X32-NEXT: korw %k0, %k6, %k6 +; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al +; KNL_X32-NEXT: kmovw %eax, %k0 +; KNL_X32-NEXT: kshiftlw $15, %k0, %k0 +; KNL_X32-NEXT: kshiftrw $15, %k0, %k0 +; KNL_X32-NEXT: korw %k1, %k0, %k0 ; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al -; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %cl -; KNL_X32-NEXT: kmovw %ecx, %k0 -; KNL_X32-NEXT: kxorw %k1, %k0, %k0 ; KNL_X32-NEXT: kmovw %eax, %k7 -; KNL_X32-NEXT: kshiftrw $2, %k0, %k2 -; KNL_X32-NEXT: kxorw %k7, %k2, %k2 -; KNL_X32-NEXT: kshiftlw $15, %k2, %k2 -; KNL_X32-NEXT: kshiftrw $13, %k2, %k2 -; KNL_X32-NEXT: kxorw %k2, %k0, %k0 +; KNL_X32-NEXT: kshiftlw $15, %k7, %k7 +; KNL_X32-NEXT: kshiftrw $14, %k7, %k7 +; KNL_X32-NEXT: korw %k0, %k7, %k0 +; KNL_X32-NEXT: kshiftrw $3, %k0, %k7 +; KNL_X32-NEXT: kshiftlw $3, %k7, %k7 +; KNL_X32-NEXT: kshiftlw $14, %k0, %k0 +; KNL_X32-NEXT: kshiftrw $14, %k0, %k0 +; KNL_X32-NEXT: korw %k7, %k0, %k0 ; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al -; KNL_X32-NEXT: kshiftrw $3, %k0, %k2 ; KNL_X32-NEXT: kmovw %eax, %k7 -; KNL_X32-NEXT: kxorw %k7, %k2, 
%k2 -; KNL_X32-NEXT: kshiftlw $15, %k2, %k2 -; KNL_X32-NEXT: kshiftrw $12, %k2, %k2 -; KNL_X32-NEXT: kxorw %k2, %k0, %k0 +; KNL_X32-NEXT: kshiftlw $15, %k7, %k7 +; KNL_X32-NEXT: kshiftrw $13, %k7, %k7 +; KNL_X32-NEXT: korw %k0, %k7, %k0 +; KNL_X32-NEXT: kshiftrw $4, %k0, %k7 +; KNL_X32-NEXT: kshiftlw $4, %k7, %k7 +; KNL_X32-NEXT: kshiftlw $13, %k0, %k0 +; KNL_X32-NEXT: kshiftrw $13, %k0, %k0 +; KNL_X32-NEXT: korw %k7, %k0, %k0 ; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al -; KNL_X32-NEXT: kshiftrw $4, %k0, %k2 ; KNL_X32-NEXT: kmovw %eax, %k7 -; KNL_X32-NEXT: kxorw %k7, %k2, %k2 -; KNL_X32-NEXT: kshiftlw $15, %k2, %k2 -; KNL_X32-NEXT: kshiftrw $11, %k2, %k2 -; KNL_X32-NEXT: kxorw %k2, %k0, %k0 +; KNL_X32-NEXT: kshiftlw $15, %k7, %k7 +; KNL_X32-NEXT: kshiftrw $12, %k7, %k7 +; KNL_X32-NEXT: korw %k0, %k7, %k0 +; KNL_X32-NEXT: kshiftrw $5, %k0, %k7 +; KNL_X32-NEXT: kshiftlw $5, %k7, %k7 +; KNL_X32-NEXT: kshiftlw $12, %k0, %k0 +; KNL_X32-NEXT: kshiftrw $12, %k0, %k0 +; KNL_X32-NEXT: korw %k7, %k0, %k0 ; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al -; KNL_X32-NEXT: kshiftrw $5, %k0, %k2 ; KNL_X32-NEXT: kmovw %eax, %k7 -; KNL_X32-NEXT: kxorw %k7, %k2, %k2 -; KNL_X32-NEXT: kshiftlw $15, %k2, %k2 -; KNL_X32-NEXT: kshiftrw $10, %k2, %k2 -; KNL_X32-NEXT: kxorw %k2, %k0, %k0 +; KNL_X32-NEXT: kshiftlw $15, %k7, %k7 +; KNL_X32-NEXT: kshiftrw $11, %k7, %k7 +; KNL_X32-NEXT: korw %k0, %k7, %k0 +; KNL_X32-NEXT: kshiftrw $6, %k0, %k7 +; KNL_X32-NEXT: kshiftlw $6, %k7, %k7 +; KNL_X32-NEXT: kshiftlw $11, %k0, %k0 +; KNL_X32-NEXT: kshiftrw $11, %k0, %k0 +; KNL_X32-NEXT: korw %k7, %k0, %k0 +; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al +; KNL_X32-NEXT: kmovw %eax, %k7 +; KNL_X32-NEXT: kshiftlw $15, %k7, %k7 +; KNL_X32-NEXT: kshiftrw $10, %k7, %k7 +; KNL_X32-NEXT: korw %k0, %k7, %k0 +; KNL_X32-NEXT: kshiftrw $7, %k0, %k7 +; KNL_X32-NEXT: kshiftlw $7, %k7, %k7 +; KNL_X32-NEXT: kshiftlw $10, %k0, %k0 +; KNL_X32-NEXT: kshiftrw $10, %k0, %k0 +; KNL_X32-NEXT: korw %k7, %k0, %k0 ; KNL_X32-NEXT: movb 
{{[0-9]+}}(%esp), %al -; KNL_X32-NEXT: kshiftrw $6, %k0, %k2 ; KNL_X32-NEXT: kmovw %eax, %k7 -; KNL_X32-NEXT: kxorw %k7, %k2, %k2 +; KNL_X32-NEXT: kshiftlw $15, %k7, %k7 +; KNL_X32-NEXT: kshiftrw $9, %k7, %k7 +; KNL_X32-NEXT: korw %k0, %k7, %k7 +; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al +; KNL_X32-NEXT: kmovw %eax, %k0 +; KNL_X32-NEXT: kshiftlw $15, %k0, %k0 +; KNL_X32-NEXT: kshiftrw $15, %k0, %k0 +; KNL_X32-NEXT: korw %k1, %k0, %k0 +; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al +; KNL_X32-NEXT: kmovw %eax, %k2 ; KNL_X32-NEXT: kshiftlw $15, %k2, %k2 -; KNL_X32-NEXT: kshiftrw $9, %k2, %k2 -; KNL_X32-NEXT: kxorw %k2, %k0, %k7 +; KNL_X32-NEXT: kshiftrw $14, %k2, %k2 +; KNL_X32-NEXT: korw %k0, %k2, %k0 +; KNL_X32-NEXT: kshiftrw $3, %k0, %k2 +; KNL_X32-NEXT: kshiftlw $3, %k2, %k2 +; KNL_X32-NEXT: kshiftlw $14, %k0, %k0 +; KNL_X32-NEXT: kshiftrw $14, %k0, %k0 +; KNL_X32-NEXT: korw %k2, %k0, %k0 ; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al -; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %cl -; KNL_X32-NEXT: kmovw %ecx, %k0 -; KNL_X32-NEXT: kxorw %k1, %k0, %k0 ; KNL_X32-NEXT: kmovw %eax, %k2 -; KNL_X32-NEXT: kshiftrw $2, %k0, %k3 -; KNL_X32-NEXT: kxorw %k2, %k3, %k2 ; KNL_X32-NEXT: kshiftlw $15, %k2, %k2 ; KNL_X32-NEXT: kshiftrw $13, %k2, %k2 -; KNL_X32-NEXT: kxorw %k2, %k0, %k0 +; KNL_X32-NEXT: korw %k0, %k2, %k0 +; KNL_X32-NEXT: kshiftrw $4, %k0, %k2 +; KNL_X32-NEXT: kshiftlw $4, %k2, %k2 +; KNL_X32-NEXT: kshiftlw $13, %k0, %k0 +; KNL_X32-NEXT: kshiftrw $13, %k0, %k0 +; KNL_X32-NEXT: korw %k2, %k0, %k0 ; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al -; KNL_X32-NEXT: kshiftrw $3, %k0, %k2 -; KNL_X32-NEXT: kmovw %eax, %k3 -; KNL_X32-NEXT: kxorw %k3, %k2, %k2 +; KNL_X32-NEXT: kmovw %eax, %k2 ; KNL_X32-NEXT: kshiftlw $15, %k2, %k2 ; KNL_X32-NEXT: kshiftrw $12, %k2, %k2 -; KNL_X32-NEXT: kxorw %k2, %k0, %k0 +; KNL_X32-NEXT: korw %k0, %k2, %k0 +; KNL_X32-NEXT: kshiftrw $5, %k0, %k2 +; KNL_X32-NEXT: kshiftlw $5, %k2, %k2 +; KNL_X32-NEXT: kshiftlw $12, %k0, %k0 +; KNL_X32-NEXT: kshiftrw $12, %k0, 
%k0 +; KNL_X32-NEXT: korw %k2, %k0, %k0 ; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al -; KNL_X32-NEXT: kshiftrw $4, %k0, %k2 -; KNL_X32-NEXT: kmovw %eax, %k3 -; KNL_X32-NEXT: kxorw %k3, %k2, %k2 +; KNL_X32-NEXT: kmovw %eax, %k2 ; KNL_X32-NEXT: kshiftlw $15, %k2, %k2 ; KNL_X32-NEXT: kshiftrw $11, %k2, %k2 -; KNL_X32-NEXT: kxorw %k2, %k0, %k0 +; KNL_X32-NEXT: korw %k0, %k2, %k0 +; KNL_X32-NEXT: kshiftrw $6, %k0, %k2 +; KNL_X32-NEXT: kshiftlw $6, %k2, %k2 +; KNL_X32-NEXT: kshiftlw $11, %k0, %k0 +; KNL_X32-NEXT: kshiftrw $11, %k0, %k0 +; KNL_X32-NEXT: korw %k2, %k0, %k0 ; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al -; KNL_X32-NEXT: kshiftrw $5, %k0, %k2 -; KNL_X32-NEXT: kmovw %eax, %k3 -; KNL_X32-NEXT: kxorw %k3, %k2, %k2 +; KNL_X32-NEXT: kmovw %eax, %k2 ; KNL_X32-NEXT: kshiftlw $15, %k2, %k2 ; KNL_X32-NEXT: kshiftrw $10, %k2, %k2 -; KNL_X32-NEXT: kxorw %k2, %k0, %k0 +; KNL_X32-NEXT: korw %k0, %k2, %k0 +; KNL_X32-NEXT: kshiftrw $7, %k0, %k2 +; KNL_X32-NEXT: kshiftlw $7, %k2, %k2 +; KNL_X32-NEXT: kshiftlw $10, %k0, %k0 +; KNL_X32-NEXT: kshiftrw $10, %k0, %k0 +; KNL_X32-NEXT: korw %k2, %k0, %k0 ; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al -; KNL_X32-NEXT: kshiftrw $6, %k0, %k2 -; KNL_X32-NEXT: kmovw %eax, %k3 -; KNL_X32-NEXT: kxorw %k3, %k2, %k2 +; KNL_X32-NEXT: kmovw %eax, %k2 ; KNL_X32-NEXT: kshiftlw $15, %k2, %k2 ; KNL_X32-NEXT: kshiftrw $9, %k2, %k2 -; KNL_X32-NEXT: kxorw %k2, %k0, %k0 +; KNL_X32-NEXT: korw %k0, %k2, %k0 +; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al +; KNL_X32-NEXT: kmovw %eax, %k2 +; KNL_X32-NEXT: kshiftlw $15, %k2, %k2 +; KNL_X32-NEXT: kshiftrw $15, %k2, %k2 +; KNL_X32-NEXT: korw %k1, %k2, %k1 ; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al ; KNL_X32-NEXT: kmovw %eax, %k2 -; KNL_X32-NEXT: kxorw %k1, %k2, %k1 +; KNL_X32-NEXT: kshiftlw $15, %k2, %k2 +; KNL_X32-NEXT: kshiftrw $14, %k2, %k2 +; KNL_X32-NEXT: korw %k1, %k2, %k1 +; KNL_X32-NEXT: kshiftrw $3, %k1, %k2 +; KNL_X32-NEXT: kshiftlw $3, %k2, %k2 +; KNL_X32-NEXT: kshiftlw $14, %k1, %k1 +; KNL_X32-NEXT: kshiftrw 
$14, %k1, %k1 +; KNL_X32-NEXT: korw %k2, %k1, %k1 ; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al ; KNL_X32-NEXT: kmovw %eax, %k2 -; KNL_X32-NEXT: kshiftrw $2, %k1, %k3 -; KNL_X32-NEXT: kxorw %k2, %k3, %k2 ; KNL_X32-NEXT: kshiftlw $15, %k2, %k2 ; KNL_X32-NEXT: kshiftrw $13, %k2, %k2 -; KNL_X32-NEXT: kxorw %k2, %k1, %k1 +; KNL_X32-NEXT: korw %k1, %k2, %k1 +; KNL_X32-NEXT: kshiftrw $4, %k1, %k2 +; KNL_X32-NEXT: kshiftlw $4, %k2, %k2 +; KNL_X32-NEXT: kshiftlw $13, %k1, %k1 +; KNL_X32-NEXT: kshiftrw $13, %k1, %k1 +; KNL_X32-NEXT: korw %k2, %k1, %k1 ; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al -; KNL_X32-NEXT: kshiftrw $3, %k1, %k2 -; KNL_X32-NEXT: kmovw %eax, %k3 -; KNL_X32-NEXT: kxorw %k3, %k2, %k2 +; KNL_X32-NEXT: kmovw %eax, %k2 ; KNL_X32-NEXT: kshiftlw $15, %k2, %k2 ; KNL_X32-NEXT: kshiftrw $12, %k2, %k2 -; KNL_X32-NEXT: kxorw %k2, %k1, %k1 +; KNL_X32-NEXT: korw %k1, %k2, %k1 +; KNL_X32-NEXT: kshiftrw $5, %k1, %k2 +; KNL_X32-NEXT: kshiftlw $5, %k2, %k2 +; KNL_X32-NEXT: kshiftlw $12, %k1, %k1 +; KNL_X32-NEXT: kshiftrw $12, %k1, %k1 +; KNL_X32-NEXT: korw %k2, %k1, %k1 ; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al -; KNL_X32-NEXT: kshiftrw $4, %k1, %k2 -; KNL_X32-NEXT: kmovw %eax, %k3 -; KNL_X32-NEXT: kxorw %k3, %k2, %k2 +; KNL_X32-NEXT: kmovw %eax, %k2 ; KNL_X32-NEXT: kshiftlw $15, %k2, %k2 ; KNL_X32-NEXT: kshiftrw $11, %k2, %k2 -; KNL_X32-NEXT: kxorw %k2, %k1, %k1 +; KNL_X32-NEXT: korw %k1, %k2, %k1 +; KNL_X32-NEXT: kshiftrw $6, %k1, %k2 +; KNL_X32-NEXT: kshiftlw $6, %k2, %k2 +; KNL_X32-NEXT: kshiftlw $11, %k1, %k1 +; KNL_X32-NEXT: kshiftrw $11, %k1, %k1 +; KNL_X32-NEXT: korw %k2, %k1, %k1 ; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al -; KNL_X32-NEXT: kshiftrw $5, %k1, %k2 -; KNL_X32-NEXT: kmovw %eax, %k3 -; KNL_X32-NEXT: kxorw %k3, %k2, %k2 +; KNL_X32-NEXT: kmovw %eax, %k2 ; KNL_X32-NEXT: kshiftlw $15, %k2, %k2 ; KNL_X32-NEXT: kshiftrw $10, %k2, %k2 -; KNL_X32-NEXT: kxorw %k2, %k1, %k1 +; KNL_X32-NEXT: korw %k1, %k2, %k1 +; KNL_X32-NEXT: kshiftrw $7, %k1, %k2 +; KNL_X32-NEXT: 
kshiftlw $7, %k2, %k2 +; KNL_X32-NEXT: kshiftlw $10, %k1, %k1 +; KNL_X32-NEXT: kshiftrw $10, %k1, %k1 +; KNL_X32-NEXT: korw %k2, %k1, %k1 ; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al -; KNL_X32-NEXT: kshiftrw $6, %k1, %k2 -; KNL_X32-NEXT: kmovw %eax, %k3 -; KNL_X32-NEXT: kxorw %k3, %k2, %k2 +; KNL_X32-NEXT: kmovw %eax, %k2 ; KNL_X32-NEXT: kshiftlw $15, %k2, %k2 ; KNL_X32-NEXT: kshiftrw $9, %k2, %k2 -; KNL_X32-NEXT: kxorw %k2, %k1, %k1 +; KNL_X32-NEXT: korw %k1, %k2, %k1 ; KNL_X32-NEXT: kandw %k1, %k0, %k0 ; KNL_X32-NEXT: kandw %k7, %k0, %k0 ; KNL_X32-NEXT: kandw %k6, %k0, %k0 ; KNL_X32-NEXT: kandw %k5, %k0, %k0 ; KNL_X32-NEXT: kandw %k4, %k0, %k0 -; KNL_X32-NEXT: kmovw {{[-0-9]+}}(%e{{[sb]}}p), %k1 ## 2-byte Reload -; KNL_X32-NEXT: kandw %k1, %k0, %k0 -; KNL_X32-NEXT: kmovw {{[-0-9]+}}(%e{{[sb]}}p), %k1 ## 2-byte Reload +; KNL_X32-NEXT: kandw %k3, %k0, %k0 +; KNL_X32-NEXT: kmovw (%esp), %k1 ## 2-byte Reload ; KNL_X32-NEXT: kandw %k1, %k0, %k0 ; KNL_X32-NEXT: kmovw {{[-0-9]+}}(%e{{[sb]}}p), %k1 ## 2-byte Reload ; KNL_X32-NEXT: kandw %k1, %k0, %k0 @@ -2537,7 +3308,7 @@ define <7 x i1> @test17(<7 x i1> %a, <7 x i1> %b, <7 x i1> %c, <7 x i1> %d, <7 x ; KNL_X32-NEXT: movl {{[0-9]+}}(%esp), %eax ; KNL_X32-NEXT: andb $127, %cl ; KNL_X32-NEXT: movb %cl, (%eax) -; KNL_X32-NEXT: addl $8, %esp +; KNL_X32-NEXT: addl $4, %esp ; KNL_X32-NEXT: popl %ebx ; KNL_X32-NEXT: retl $4 %j = and <7 x i1> %a, %b diff --git a/llvm/test/CodeGen/X86/avx512-ext.ll b/llvm/test/CodeGen/X86/avx512-ext.ll index 20af819..fcb07a5 100644 --- a/llvm/test/CodeGen/X86/avx512-ext.ll +++ b/llvm/test/CodeGen/X86/avx512-ext.ll @@ -1886,410 +1886,495 @@ define void @extload_v8i64(<8 x i8>* %a, <8 x i64>* %res) { define <64 x i16> @test21(<64 x i16> %x , <64 x i1> %mask) nounwind readnone { ; KNL-LABEL: test21: ; KNL: # %bb.0: -; KNL-NEXT: kmovw %edx, %k1 -; KNL-NEXT: kmovw %edi, %k2 +; KNL-NEXT: kmovw %edi, %k0 ; KNL-NEXT: kshiftlw $15, %k0, %k0 +; KNL-NEXT: kshiftrw $15, %k0, %k0 +; KNL-NEXT: kshiftlw $2, %k0, 
%k2 +; KNL-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; KNL-NEXT: kmovw %esi, %k1 +; KNL-NEXT: kshiftlw $1, %k1, %k1 +; KNL-NEXT: korw %k1, %k2, %k1 +; KNL-NEXT: korw %k1, %k0, %k0 +; KNL-NEXT: kshiftlw $14, %k0, %k0 ; KNL-NEXT: kshiftrw $14, %k0, %k0 -; KNL-NEXT: kxorw %k0, %k2, %k2 -; KNL-NEXT: kshiftrw $2, %k2, %k3 -; KNL-NEXT: kxorw %k1, %k3, %k1 -; KNL-NEXT: kshiftlw $15, %k1, %k1 -; KNL-NEXT: kshiftrw $13, %k1, %k1 -; KNL-NEXT: kxorw %k1, %k2, %k1 -; KNL-NEXT: kshiftrw $3, %k1, %k2 -; KNL-NEXT: kmovw %ecx, %k3 -; KNL-NEXT: kxorw %k3, %k2, %k2 -; KNL-NEXT: kshiftlw $15, %k2, %k2 -; KNL-NEXT: kshiftrw $12, %k2, %k2 -; KNL-NEXT: kxorw %k2, %k1, %k1 -; KNL-NEXT: kshiftrw $4, %k1, %k2 -; KNL-NEXT: kmovw %r8d, %k3 -; KNL-NEXT: kxorw %k3, %k2, %k2 -; KNL-NEXT: kshiftlw $15, %k2, %k2 -; KNL-NEXT: kshiftrw $11, %k2, %k2 -; KNL-NEXT: kxorw %k2, %k1, %k1 -; KNL-NEXT: kshiftrw $5, %k1, %k2 -; KNL-NEXT: kmovw %r9d, %k3 -; KNL-NEXT: kxorw %k3, %k2, %k2 -; KNL-NEXT: kshiftlw $15, %k2, %k2 -; KNL-NEXT: kshiftrw $10, %k2, %k2 -; KNL-NEXT: kxorw %k2, %k1, %k1 -; KNL-NEXT: kshiftrw $6, %k1, %k2 +; KNL-NEXT: kshiftlw $3, %k0, %k3 +; KNL-NEXT: kmovw %edx, %k1 +; KNL-NEXT: kshiftlw $2, %k1, %k1 +; KNL-NEXT: korw %k1, %k3, %k1 +; KNL-NEXT: korw %k1, %k0, %k0 +; KNL-NEXT: kshiftlw $13, %k0, %k0 +; KNL-NEXT: kshiftrw $13, %k0, %k0 +; KNL-NEXT: kshiftlw $4, %k0, %k4 +; KNL-NEXT: kmovw %ecx, %k1 +; KNL-NEXT: kshiftlw $3, %k1, %k1 +; KNL-NEXT: korw %k1, %k4, %k1 +; KNL-NEXT: korw %k1, %k0, %k0 +; KNL-NEXT: kshiftlw $12, %k0, %k0 +; KNL-NEXT: kshiftrw $12, %k0, %k0 +; KNL-NEXT: kshiftlw $5, %k0, %k5 +; KNL-NEXT: kmovw %r8d, %k1 +; KNL-NEXT: kshiftlw $4, %k1, %k1 +; KNL-NEXT: korw %k1, %k5, %k1 +; KNL-NEXT: korw %k1, %k0, %k0 +; KNL-NEXT: kshiftlw $11, %k0, %k0 +; KNL-NEXT: kshiftrw $11, %k0, %k0 +; KNL-NEXT: kshiftlw $6, %k0, %k6 +; KNL-NEXT: kmovw %r9d, %k1 +; KNL-NEXT: kshiftlw $5, %k1, %k1 +; KNL-NEXT: korw %k1, %k6, %k1 +; KNL-NEXT: korw %k1, %k0, %k0 +; KNL-NEXT: 
kshiftlw $10, %k0, %k0 +; KNL-NEXT: kshiftrw $10, %k0, %k0 +; KNL-NEXT: kshiftlw $7, %k0, %k7 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k3 -; KNL-NEXT: kxorw %k3, %k2, %k2 -; KNL-NEXT: kshiftlw $15, %k2, %k2 -; KNL-NEXT: kshiftrw $9, %k2, %k2 -; KNL-NEXT: kxorw %k2, %k1, %k1 -; KNL-NEXT: kshiftrw $7, %k1, %k2 +; KNL-NEXT: kmovw %eax, %k1 +; KNL-NEXT: kshiftlw $6, %k1, %k1 +; KNL-NEXT: korw %k1, %k7, %k1 +; KNL-NEXT: korw %k1, %k0, %k0 +; KNL-NEXT: kshiftlw $9, %k0, %k0 +; KNL-NEXT: kshiftrw $9, %k0, %k0 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k3 -; KNL-NEXT: kxorw %k3, %k2, %k2 -; KNL-NEXT: kshiftlw $15, %k2, %k2 -; KNL-NEXT: kshiftrw $8, %k2, %k2 -; KNL-NEXT: kxorw %k2, %k1, %k1 -; KNL-NEXT: kshiftrw $8, %k1, %k2 +; KNL-NEXT: kmovw %eax, %k1 +; KNL-NEXT: kshiftlw $7, %k1, %k1 +; KNL-NEXT: kshiftlw $8, %k0, %k2 +; KNL-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; KNL-NEXT: korw %k1, %k2, %k1 +; KNL-NEXT: korw %k1, %k0, %k0 +; KNL-NEXT: kshiftlw $8, %k0, %k0 +; KNL-NEXT: kshiftrw $8, %k0, %k0 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k3 -; KNL-NEXT: kxorw %k3, %k2, %k2 -; KNL-NEXT: kshiftlw $15, %k2, %k2 -; KNL-NEXT: kshiftrw $7, %k2, %k2 -; KNL-NEXT: kxorw %k2, %k1, %k1 -; KNL-NEXT: kshiftrw $9, %k1, %k2 +; KNL-NEXT: kmovw %eax, %k1 +; KNL-NEXT: kshiftlw $8, %k1, %k1 +; KNL-NEXT: kshiftlw $9, %k0, %k2 +; KNL-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; KNL-NEXT: korw %k1, %k2, %k1 +; KNL-NEXT: korw %k1, %k0, %k0 +; KNL-NEXT: kshiftlw $7, %k0, %k0 +; KNL-NEXT: kshiftrw $7, %k0, %k0 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k3 -; KNL-NEXT: kxorw %k3, %k2, %k2 -; KNL-NEXT: kshiftlw $15, %k2, %k2 -; KNL-NEXT: kshiftrw $6, %k2, %k2 -; KNL-NEXT: kxorw %k2, %k1, %k1 -; KNL-NEXT: kshiftrw $10, %k1, %k2 +; KNL-NEXT: kmovw %eax, %k1 +; KNL-NEXT: kshiftlw $9, %k1, %k1 +; KNL-NEXT: kshiftlw $10, %k0, %k2 +; KNL-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 
2-byte Spill +; KNL-NEXT: korw %k1, %k2, %k1 +; KNL-NEXT: korw %k1, %k0, %k0 +; KNL-NEXT: kshiftlw $6, %k0, %k0 +; KNL-NEXT: kshiftrw $6, %k0, %k0 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k3 -; KNL-NEXT: kxorw %k3, %k2, %k2 -; KNL-NEXT: kshiftlw $15, %k2, %k2 -; KNL-NEXT: kshiftrw $5, %k2, %k2 -; KNL-NEXT: kxorw %k2, %k1, %k1 -; KNL-NEXT: kshiftrw $11, %k1, %k2 +; KNL-NEXT: kmovw %eax, %k1 +; KNL-NEXT: kshiftlw $10, %k1, %k1 +; KNL-NEXT: kshiftlw $11, %k0, %k2 +; KNL-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; KNL-NEXT: korw %k1, %k2, %k1 +; KNL-NEXT: korw %k1, %k0, %k0 +; KNL-NEXT: kshiftlw $5, %k0, %k0 +; KNL-NEXT: kshiftrw $5, %k0, %k0 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k3 -; KNL-NEXT: kxorw %k3, %k2, %k2 -; KNL-NEXT: kshiftlw $15, %k2, %k2 -; KNL-NEXT: kshiftrw $4, %k2, %k2 -; KNL-NEXT: kxorw %k2, %k1, %k1 -; KNL-NEXT: kshiftrw $12, %k1, %k2 +; KNL-NEXT: kmovw %eax, %k1 +; KNL-NEXT: kshiftlw $11, %k1, %k1 +; KNL-NEXT: kshiftlw $12, %k0, %k2 +; KNL-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; KNL-NEXT: korw %k1, %k2, %k1 +; KNL-NEXT: korw %k1, %k0, %k0 +; KNL-NEXT: kshiftlw $4, %k0, %k0 +; KNL-NEXT: kshiftrw $4, %k0, %k0 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k3 -; KNL-NEXT: kxorw %k3, %k2, %k2 -; KNL-NEXT: kshiftlw $15, %k2, %k2 -; KNL-NEXT: kshiftrw $3, %k2, %k2 -; KNL-NEXT: kxorw %k2, %k1, %k1 -; KNL-NEXT: kshiftrw $13, %k1, %k2 +; KNL-NEXT: kmovw %eax, %k1 +; KNL-NEXT: kshiftlw $12, %k1, %k1 +; KNL-NEXT: kshiftlw $13, %k0, %k2 +; KNL-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; KNL-NEXT: korw %k1, %k2, %k1 +; KNL-NEXT: korw %k1, %k0, %k0 +; KNL-NEXT: kshiftlw $3, %k0, %k0 +; KNL-NEXT: kshiftrw $3, %k0, %k0 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k3 -; KNL-NEXT: kxorw %k3, %k2, %k2 -; KNL-NEXT: kshiftlw $15, %k2, %k2 -; KNL-NEXT: kshiftrw $2, %k2, %k2 -; KNL-NEXT: kxorw %k2, %k1, %k1 -; KNL-NEXT: kshiftrw $14, 
%k1, %k2 +; KNL-NEXT: kmovw %eax, %k1 +; KNL-NEXT: kshiftlw $13, %k1, %k1 +; KNL-NEXT: kshiftlw $14, %k0, %k2 +; KNL-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; KNL-NEXT: korw %k1, %k2, %k1 +; KNL-NEXT: korw %k1, %k0, %k0 +; KNL-NEXT: kshiftlw $2, %k0, %k0 +; KNL-NEXT: kshiftrw $2, %k0, %k1 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k3 -; KNL-NEXT: kxorw %k3, %k2, %k2 -; KNL-NEXT: kshiftlw $14, %k2, %k2 -; KNL-NEXT: kxorw %k2, %k1, %k1 +; KNL-NEXT: kmovw %eax, %k0 +; KNL-NEXT: kshiftlw $14, %k0, %k0 +; KNL-NEXT: kshiftlw $15, %k0, %k2 +; KNL-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; KNL-NEXT: korw %k0, %k2, %k0 +; KNL-NEXT: korw %k0, %k1, %k0 +; KNL-NEXT: kshiftlw $1, %k0, %k0 +; KNL-NEXT: kshiftrw $1, %k0, %k0 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al +; KNL-NEXT: kmovw %eax, %k1 +; KNL-NEXT: kshiftlw $15, %k1, %k1 +; KNL-NEXT: korw %k1, %k0, %k1 +; KNL-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al +; KNL-NEXT: kmovw %eax, %k0 +; KNL-NEXT: kshiftlw $15, %k0, %k0 +; KNL-NEXT: kshiftrw $15, %k0, %k0 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al +; KNL-NEXT: kmovw %eax, %k1 ; KNL-NEXT: kshiftlw $1, %k1, %k1 -; KNL-NEXT: kshiftrw $1, %k1, %k1 +; KNL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; KNL-NEXT: korw %k1, %k2, %k1 +; KNL-NEXT: korw %k1, %k0, %k0 +; KNL-NEXT: kshiftlw $14, %k0, %k0 +; KNL-NEXT: kshiftrw $14, %k0, %k0 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k2 -; KNL-NEXT: kshiftlw $15, %k2, %k2 -; KNL-NEXT: korw %k2, %k1, %k1 +; KNL-NEXT: kmovw %eax, %k1 +; KNL-NEXT: kshiftlw $2, %k1, %k1 +; KNL-NEXT: kmovw %k3, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; KNL-NEXT: korw %k1, %k3, %k1 +; KNL-NEXT: korw %k1, %k0, %k0 +; KNL-NEXT: kshiftlw $13, %k0, %k0 +; KNL-NEXT: kshiftrw $13, %k0, %k0 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k2 +; KNL-NEXT: kmovw %eax, %k1 +; KNL-NEXT: kshiftlw $3, %k1, %k1 +; 
KNL-NEXT: kmovw %k4, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; KNL-NEXT: korw %k1, %k4, %k1 +; KNL-NEXT: korw %k1, %k0, %k0 +; KNL-NEXT: kshiftlw $12, %k0, %k0 +; KNL-NEXT: kshiftrw $12, %k0, %k0 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k3 -; KNL-NEXT: kxorw %k0, %k3, %k3 -; KNL-NEXT: kshiftrw $2, %k3, %k4 -; KNL-NEXT: kxorw %k2, %k4, %k2 -; KNL-NEXT: kshiftlw $15, %k2, %k2 -; KNL-NEXT: kshiftrw $13, %k2, %k2 -; KNL-NEXT: kxorw %k2, %k3, %k2 -; KNL-NEXT: kshiftrw $3, %k2, %k3 +; KNL-NEXT: kmovw %eax, %k1 +; KNL-NEXT: kshiftlw $4, %k1, %k1 +; KNL-NEXT: kmovw %k5, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; KNL-NEXT: korw %k1, %k5, %k1 +; KNL-NEXT: korw %k1, %k0, %k0 +; KNL-NEXT: kshiftlw $11, %k0, %k0 +; KNL-NEXT: kshiftrw $11, %k0, %k0 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k4 -; KNL-NEXT: kxorw %k4, %k3, %k3 -; KNL-NEXT: kshiftlw $15, %k3, %k3 -; KNL-NEXT: kshiftrw $12, %k3, %k3 -; KNL-NEXT: kxorw %k3, %k2, %k2 -; KNL-NEXT: kshiftrw $4, %k2, %k3 +; KNL-NEXT: kmovw %eax, %k1 +; KNL-NEXT: kshiftlw $5, %k1, %k1 +; KNL-NEXT: korw %k1, %k6, %k1 +; KNL-NEXT: korw %k1, %k0, %k0 +; KNL-NEXT: kshiftlw $10, %k0, %k0 +; KNL-NEXT: kshiftrw $10, %k0, %k0 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k4 -; KNL-NEXT: kxorw %k4, %k3, %k3 -; KNL-NEXT: kshiftlw $15, %k3, %k3 -; KNL-NEXT: kshiftrw $11, %k3, %k3 -; KNL-NEXT: kxorw %k3, %k2, %k2 -; KNL-NEXT: kshiftrw $5, %k2, %k3 +; KNL-NEXT: kmovw %eax, %k1 +; KNL-NEXT: kshiftlw $6, %k1, %k1 +; KNL-NEXT: kmovw %k7, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; KNL-NEXT: korw %k1, %k7, %k1 +; KNL-NEXT: korw %k1, %k0, %k0 +; KNL-NEXT: kshiftlw $9, %k0, %k0 +; KNL-NEXT: kshiftrw $9, %k0, %k0 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k4 -; KNL-NEXT: kxorw %k4, %k3, %k3 -; KNL-NEXT: kshiftlw $15, %k3, %k3 -; KNL-NEXT: kshiftrw $10, %k3, %k3 -; KNL-NEXT: kxorw %k3, %k2, %k2 -; KNL-NEXT: kshiftrw $6, %k2, %k3 +; KNL-NEXT: kmovw %eax, %k1 +; KNL-NEXT: 
kshiftlw $7, %k1, %k1 +; KNL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; KNL-NEXT: korw %k1, %k2, %k1 +; KNL-NEXT: korw %k1, %k0, %k0 +; KNL-NEXT: kshiftlw $8, %k0, %k0 +; KNL-NEXT: kshiftrw $8, %k0, %k0 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k4 -; KNL-NEXT: kxorw %k4, %k3, %k3 -; KNL-NEXT: kshiftlw $15, %k3, %k3 -; KNL-NEXT: kshiftrw $9, %k3, %k3 -; KNL-NEXT: kxorw %k3, %k2, %k2 -; KNL-NEXT: kshiftrw $7, %k2, %k3 +; KNL-NEXT: kmovw %eax, %k1 +; KNL-NEXT: kshiftlw $8, %k1, %k1 +; KNL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; KNL-NEXT: korw %k1, %k2, %k1 +; KNL-NEXT: korw %k1, %k0, %k0 +; KNL-NEXT: kshiftlw $7, %k0, %k0 +; KNL-NEXT: kshiftrw $7, %k0, %k0 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k4 -; KNL-NEXT: kxorw %k4, %k3, %k3 -; KNL-NEXT: kshiftlw $15, %k3, %k3 -; KNL-NEXT: kshiftrw $8, %k3, %k3 -; KNL-NEXT: kxorw %k3, %k2, %k2 -; KNL-NEXT: kshiftrw $8, %k2, %k3 +; KNL-NEXT: kmovw %eax, %k1 +; KNL-NEXT: kshiftlw $9, %k1, %k1 +; KNL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; KNL-NEXT: korw %k1, %k2, %k1 +; KNL-NEXT: korw %k1, %k0, %k0 +; KNL-NEXT: kshiftlw $6, %k0, %k0 +; KNL-NEXT: kshiftrw $6, %k0, %k0 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k4 -; KNL-NEXT: kxorw %k4, %k3, %k3 -; KNL-NEXT: kshiftlw $15, %k3, %k3 -; KNL-NEXT: kshiftrw $7, %k3, %k3 -; KNL-NEXT: kxorw %k3, %k2, %k2 -; KNL-NEXT: kshiftrw $9, %k2, %k3 +; KNL-NEXT: kmovw %eax, %k1 +; KNL-NEXT: kshiftlw $10, %k1, %k1 +; KNL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; KNL-NEXT: korw %k1, %k2, %k1 +; KNL-NEXT: korw %k1, %k0, %k0 +; KNL-NEXT: kshiftlw $5, %k0, %k0 +; KNL-NEXT: kshiftrw $5, %k0, %k0 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k4 -; KNL-NEXT: kxorw %k4, %k3, %k3 -; KNL-NEXT: kshiftlw $15, %k3, %k3 -; KNL-NEXT: kshiftrw $6, %k3, %k3 -; KNL-NEXT: kxorw %k3, %k2, %k2 -; KNL-NEXT: kshiftrw $10, %k2, %k3 +; KNL-NEXT: kmovw %eax, 
%k1 +; KNL-NEXT: kshiftlw $11, %k1, %k1 +; KNL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; KNL-NEXT: korw %k1, %k2, %k1 +; KNL-NEXT: korw %k1, %k0, %k0 +; KNL-NEXT: kshiftlw $4, %k0, %k0 +; KNL-NEXT: kshiftrw $4, %k0, %k0 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k4 -; KNL-NEXT: kxorw %k4, %k3, %k3 -; KNL-NEXT: kshiftlw $15, %k3, %k3 -; KNL-NEXT: kshiftrw $5, %k3, %k3 -; KNL-NEXT: kxorw %k3, %k2, %k2 -; KNL-NEXT: kshiftrw $11, %k2, %k3 +; KNL-NEXT: kmovw %eax, %k1 +; KNL-NEXT: kshiftlw $12, %k1, %k1 +; KNL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; KNL-NEXT: korw %k1, %k2, %k1 +; KNL-NEXT: korw %k1, %k0, %k0 +; KNL-NEXT: kshiftlw $3, %k0, %k0 +; KNL-NEXT: kshiftrw $3, %k0, %k0 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k4 -; KNL-NEXT: kxorw %k4, %k3, %k3 -; KNL-NEXT: kshiftlw $15, %k3, %k3 -; KNL-NEXT: kshiftrw $4, %k3, %k3 -; KNL-NEXT: kxorw %k3, %k2, %k2 -; KNL-NEXT: kshiftrw $12, %k2, %k3 +; KNL-NEXT: kmovw %eax, %k1 +; KNL-NEXT: kshiftlw $13, %k1, %k1 +; KNL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; KNL-NEXT: korw %k1, %k2, %k1 +; KNL-NEXT: korw %k1, %k0, %k0 +; KNL-NEXT: kshiftlw $2, %k0, %k0 +; KNL-NEXT: kshiftrw $2, %k0, %k0 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k4 -; KNL-NEXT: kxorw %k4, %k3, %k3 -; KNL-NEXT: kshiftlw $15, %k3, %k3 -; KNL-NEXT: kshiftrw $3, %k3, %k3 -; KNL-NEXT: kxorw %k3, %k2, %k2 -; KNL-NEXT: kshiftrw $13, %k2, %k3 +; KNL-NEXT: kmovw %eax, %k1 +; KNL-NEXT: kshiftlw $14, %k1, %k1 +; KNL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; KNL-NEXT: korw %k1, %k2, %k1 +; KNL-NEXT: korw %k1, %k0, %k0 +; KNL-NEXT: kshiftlw $1, %k0, %k0 +; KNL-NEXT: kshiftrw $1, %k0, %k0 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k4 -; KNL-NEXT: kxorw %k4, %k3, %k3 -; KNL-NEXT: kshiftlw $15, %k3, %k3 -; KNL-NEXT: kshiftrw $2, %k3, %k3 -; KNL-NEXT: kxorw %k3, %k2, %k2 -; KNL-NEXT: kshiftrw $14, %k2, %k3 +; 
KNL-NEXT: kmovw %eax, %k1 +; KNL-NEXT: kshiftlw $15, %k1, %k1 +; KNL-NEXT: korw %k1, %k0, %k1 +; KNL-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k4 -; KNL-NEXT: kxorw %k4, %k3, %k3 -; KNL-NEXT: kshiftlw $14, %k3, %k3 -; KNL-NEXT: kxorw %k3, %k2, %k2 -; KNL-NEXT: kshiftlw $1, %k2, %k2 -; KNL-NEXT: kshiftrw $1, %k2, %k2 +; KNL-NEXT: kmovw %eax, %k0 +; KNL-NEXT: kshiftlw $15, %k0, %k0 +; KNL-NEXT: kshiftrw $15, %k0, %k0 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k3 -; KNL-NEXT: kshiftlw $15, %k3, %k3 -; KNL-NEXT: korw %k3, %k2, %k2 +; KNL-NEXT: kmovw %eax, %k1 +; KNL-NEXT: kshiftlw $1, %k1, %k1 +; KNL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; KNL-NEXT: korw %k1, %k2, %k1 +; KNL-NEXT: korw %k1, %k0, %k0 +; KNL-NEXT: kshiftlw $14, %k0, %k0 +; KNL-NEXT: kshiftrw $14, %k0, %k0 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k3 +; KNL-NEXT: kmovw %eax, %k1 +; KNL-NEXT: kshiftlw $2, %k1, %k1 +; KNL-NEXT: korw %k1, %k3, %k1 +; KNL-NEXT: korw %k1, %k0, %k0 +; KNL-NEXT: kshiftlw $13, %k0, %k0 +; KNL-NEXT: kshiftrw $13, %k0, %k0 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k4 -; KNL-NEXT: kxorw %k0, %k4, %k4 -; KNL-NEXT: kshiftrw $2, %k4, %k5 -; KNL-NEXT: kxorw %k3, %k5, %k3 -; KNL-NEXT: kshiftlw $15, %k3, %k3 -; KNL-NEXT: kshiftrw $13, %k3, %k3 -; KNL-NEXT: kxorw %k3, %k4, %k3 -; KNL-NEXT: kshiftrw $3, %k3, %k4 +; KNL-NEXT: kmovw %eax, %k1 +; KNL-NEXT: kshiftlw $3, %k1, %k1 +; KNL-NEXT: korw %k1, %k4, %k1 +; KNL-NEXT: korw %k1, %k0, %k0 +; KNL-NEXT: kshiftlw $12, %k0, %k0 +; KNL-NEXT: kshiftrw $12, %k0, %k0 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k5 -; KNL-NEXT: kxorw %k5, %k4, %k4 -; KNL-NEXT: kshiftlw $15, %k4, %k4 -; KNL-NEXT: kshiftrw $12, %k4, %k4 -; KNL-NEXT: kxorw %k4, %k3, %k3 -; KNL-NEXT: kshiftrw $4, %k3, %k4 +; KNL-NEXT: kmovw %eax, %k1 +; KNL-NEXT: kshiftlw $4, %k1, %k1 +; KNL-NEXT: korw 
%k1, %k5, %k1 +; KNL-NEXT: korw %k1, %k0, %k0 +; KNL-NEXT: kshiftlw $11, %k0, %k0 +; KNL-NEXT: kshiftrw $11, %k0, %k0 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k5 -; KNL-NEXT: kxorw %k5, %k4, %k4 -; KNL-NEXT: kshiftlw $15, %k4, %k4 -; KNL-NEXT: kshiftrw $11, %k4, %k4 -; KNL-NEXT: kxorw %k4, %k3, %k3 -; KNL-NEXT: kshiftrw $5, %k3, %k4 +; KNL-NEXT: kmovw %eax, %k1 +; KNL-NEXT: kshiftlw $5, %k1, %k1 +; KNL-NEXT: korw %k1, %k6, %k1 +; KNL-NEXT: korw %k1, %k0, %k0 +; KNL-NEXT: kshiftlw $10, %k0, %k0 +; KNL-NEXT: kshiftrw $10, %k0, %k0 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k5 -; KNL-NEXT: kxorw %k5, %k4, %k4 -; KNL-NEXT: kshiftlw $15, %k4, %k4 -; KNL-NEXT: kshiftrw $10, %k4, %k4 -; KNL-NEXT: kxorw %k4, %k3, %k3 -; KNL-NEXT: kshiftrw $6, %k3, %k4 +; KNL-NEXT: kmovw %eax, %k1 +; KNL-NEXT: kshiftlw $6, %k1, %k1 +; KNL-NEXT: korw %k1, %k7, %k1 +; KNL-NEXT: korw %k1, %k0, %k0 +; KNL-NEXT: kshiftlw $9, %k0, %k0 +; KNL-NEXT: kshiftrw $9, %k0, %k0 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k5 -; KNL-NEXT: kxorw %k5, %k4, %k4 -; KNL-NEXT: kshiftlw $15, %k4, %k4 -; KNL-NEXT: kshiftrw $9, %k4, %k4 -; KNL-NEXT: kxorw %k4, %k3, %k3 -; KNL-NEXT: kshiftrw $7, %k3, %k4 +; KNL-NEXT: kmovw %eax, %k1 +; KNL-NEXT: kshiftlw $7, %k1, %k1 +; KNL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; KNL-NEXT: korw %k1, %k2, %k1 +; KNL-NEXT: korw %k1, %k0, %k0 +; KNL-NEXT: kshiftlw $8, %k0, %k0 +; KNL-NEXT: kshiftrw $8, %k0, %k0 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k5 -; KNL-NEXT: kxorw %k5, %k4, %k4 -; KNL-NEXT: kshiftlw $15, %k4, %k4 -; KNL-NEXT: kshiftrw $8, %k4, %k4 -; KNL-NEXT: kxorw %k4, %k3, %k3 -; KNL-NEXT: kshiftrw $8, %k3, %k4 +; KNL-NEXT: kmovw %eax, %k1 +; KNL-NEXT: kshiftlw $8, %k1, %k1 +; KNL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload +; KNL-NEXT: korw %k1, %k3, %k1 +; KNL-NEXT: korw %k1, %k0, %k0 +; KNL-NEXT: kshiftlw $7, %k0, %k0 +; KNL-NEXT: kshiftrw $7, %k0, 
%k0 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k5 -; KNL-NEXT: kxorw %k5, %k4, %k4 -; KNL-NEXT: kshiftlw $15, %k4, %k4 -; KNL-NEXT: kshiftrw $7, %k4, %k4 -; KNL-NEXT: kxorw %k4, %k3, %k3 -; KNL-NEXT: kshiftrw $9, %k3, %k4 +; KNL-NEXT: kmovw %eax, %k1 +; KNL-NEXT: kshiftlw $9, %k1, %k1 +; KNL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload +; KNL-NEXT: korw %k1, %k4, %k1 +; KNL-NEXT: korw %k1, %k0, %k0 +; KNL-NEXT: kshiftlw $6, %k0, %k0 +; KNL-NEXT: kshiftrw $6, %k0, %k0 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k5 -; KNL-NEXT: kxorw %k5, %k4, %k4 -; KNL-NEXT: kshiftlw $15, %k4, %k4 -; KNL-NEXT: kshiftrw $6, %k4, %k4 -; KNL-NEXT: kxorw %k4, %k3, %k3 -; KNL-NEXT: kshiftrw $10, %k3, %k4 +; KNL-NEXT: kmovw %eax, %k1 +; KNL-NEXT: kshiftlw $10, %k1, %k1 +; KNL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload +; KNL-NEXT: korw %k1, %k5, %k1 +; KNL-NEXT: korw %k1, %k0, %k0 +; KNL-NEXT: kshiftlw $5, %k0, %k0 +; KNL-NEXT: kshiftrw $5, %k0, %k0 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k5 -; KNL-NEXT: kxorw %k5, %k4, %k4 -; KNL-NEXT: kshiftlw $15, %k4, %k4 -; KNL-NEXT: kshiftrw $5, %k4, %k4 -; KNL-NEXT: kxorw %k4, %k3, %k3 -; KNL-NEXT: kshiftrw $11, %k3, %k4 +; KNL-NEXT: kmovw %eax, %k1 +; KNL-NEXT: kshiftlw $11, %k1, %k1 +; KNL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload +; KNL-NEXT: korw %k1, %k7, %k1 +; KNL-NEXT: korw %k1, %k0, %k0 +; KNL-NEXT: kshiftlw $4, %k0, %k0 +; KNL-NEXT: kshiftrw $4, %k0, %k0 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k5 -; KNL-NEXT: kxorw %k5, %k4, %k4 -; KNL-NEXT: kshiftlw $15, %k4, %k4 -; KNL-NEXT: kshiftrw $4, %k4, %k4 -; KNL-NEXT: kxorw %k4, %k3, %k3 -; KNL-NEXT: kshiftrw $12, %k3, %k4 +; KNL-NEXT: kmovw %eax, %k1 +; KNL-NEXT: kshiftlw $12, %k1, %k1 +; KNL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload +; KNL-NEXT: korw %k1, %k7, %k1 +; KNL-NEXT: korw %k1, %k0, %k0 +; KNL-NEXT: kshiftlw $3, %k0, %k0 +; KNL-NEXT: 
kshiftrw $3, %k0, %k0 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k5 -; KNL-NEXT: kxorw %k5, %k4, %k4 -; KNL-NEXT: kshiftlw $15, %k4, %k4 -; KNL-NEXT: kshiftrw $3, %k4, %k4 -; KNL-NEXT: kxorw %k4, %k3, %k3 -; KNL-NEXT: kshiftrw $13, %k3, %k4 +; KNL-NEXT: kmovw %eax, %k1 +; KNL-NEXT: kshiftlw $13, %k1, %k1 +; KNL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload +; KNL-NEXT: korw %k1, %k7, %k1 +; KNL-NEXT: korw %k1, %k0, %k0 +; KNL-NEXT: kshiftlw $2, %k0, %k0 +; KNL-NEXT: kshiftrw $2, %k0, %k0 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k5 -; KNL-NEXT: kxorw %k5, %k4, %k4 -; KNL-NEXT: kshiftlw $15, %k4, %k4 -; KNL-NEXT: kshiftrw $2, %k4, %k4 -; KNL-NEXT: kxorw %k4, %k3, %k3 -; KNL-NEXT: kshiftrw $14, %k3, %k4 +; KNL-NEXT: kmovw %eax, %k1 +; KNL-NEXT: kshiftlw $14, %k1, %k1 +; KNL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload +; KNL-NEXT: korw %k1, %k7, %k1 +; KNL-NEXT: korw %k1, %k0, %k0 +; KNL-NEXT: kshiftlw $1, %k0, %k0 +; KNL-NEXT: kshiftrw $1, %k0, %k0 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k5 -; KNL-NEXT: kxorw %k5, %k4, %k4 -; KNL-NEXT: kshiftlw $14, %k4, %k4 -; KNL-NEXT: kxorw %k4, %k3, %k3 -; KNL-NEXT: kshiftlw $1, %k3, %k3 -; KNL-NEXT: kshiftrw $1, %k3, %k3 +; KNL-NEXT: kmovw %eax, %k1 +; KNL-NEXT: kshiftlw $15, %k1, %k1 +; KNL-NEXT: korw %k1, %k0, %k1 +; KNL-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k4 -; KNL-NEXT: kshiftlw $15, %k4, %k4 -; KNL-NEXT: korw %k4, %k3, %k3 +; KNL-NEXT: kmovw %eax, %k0 +; KNL-NEXT: kshiftlw $1, %k0, %k0 +; KNL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; KNL-NEXT: korw %k0, %k1, %k0 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k4 +; KNL-NEXT: kmovw %eax, %k7 +; KNL-NEXT: kshiftlw $15, %k7, %k7 +; KNL-NEXT: kshiftrw $15, %k7, %k7 +; KNL-NEXT: korw %k0, %k7, %k0 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k5 
-; KNL-NEXT: kxorw %k0, %k5, %k0 -; KNL-NEXT: kshiftrw $2, %k0, %k5 -; KNL-NEXT: kxorw %k4, %k5, %k4 -; KNL-NEXT: kshiftlw $15, %k4, %k4 -; KNL-NEXT: kshiftrw $13, %k4, %k4 -; KNL-NEXT: kxorw %k4, %k0, %k0 -; KNL-NEXT: kshiftrw $3, %k0, %k4 +; KNL-NEXT: kmovw %eax, %k7 +; KNL-NEXT: kshiftlw $2, %k7, %k7 +; KNL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; KNL-NEXT: korw %k7, %k1, %k7 +; KNL-NEXT: kshiftlw $14, %k0, %k0 +; KNL-NEXT: kshiftrw $14, %k0, %k0 +; KNL-NEXT: korw %k7, %k0, %k0 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k5 -; KNL-NEXT: kxorw %k5, %k4, %k4 -; KNL-NEXT: kshiftlw $15, %k4, %k4 -; KNL-NEXT: kshiftrw $12, %k4, %k4 -; KNL-NEXT: kxorw %k4, %k0, %k0 -; KNL-NEXT: kshiftrw $4, %k0, %k4 +; KNL-NEXT: kmovw %eax, %k7 +; KNL-NEXT: kshiftlw $3, %k7, %k7 +; KNL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; KNL-NEXT: korw %k7, %k1, %k7 +; KNL-NEXT: kshiftlw $13, %k0, %k0 +; KNL-NEXT: kshiftrw $13, %k0, %k0 +; KNL-NEXT: korw %k7, %k0, %k0 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k5 -; KNL-NEXT: kxorw %k5, %k4, %k4 -; KNL-NEXT: kshiftlw $15, %k4, %k4 -; KNL-NEXT: kshiftrw $11, %k4, %k4 -; KNL-NEXT: kxorw %k4, %k0, %k0 -; KNL-NEXT: kshiftrw $5, %k0, %k4 +; KNL-NEXT: kmovw %eax, %k7 +; KNL-NEXT: kshiftlw $4, %k7, %k7 +; KNL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; KNL-NEXT: korw %k7, %k1, %k7 +; KNL-NEXT: kshiftlw $12, %k0, %k0 +; KNL-NEXT: kshiftrw $12, %k0, %k0 +; KNL-NEXT: korw %k7, %k0, %k0 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k5 -; KNL-NEXT: kxorw %k5, %k4, %k4 -; KNL-NEXT: kshiftlw $15, %k4, %k4 -; KNL-NEXT: kshiftrw $10, %k4, %k4 -; KNL-NEXT: kxorw %k4, %k0, %k0 -; KNL-NEXT: kshiftrw $6, %k0, %k4 +; KNL-NEXT: kmovw %eax, %k7 +; KNL-NEXT: kshiftlw $5, %k7, %k7 +; KNL-NEXT: korw %k7, %k6, %k7 +; KNL-NEXT: kshiftlw $11, %k0, %k0 +; KNL-NEXT: kshiftrw $11, %k0, %k0 +; KNL-NEXT: korw %k7, %k0, %k0 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al 
-; KNL-NEXT: kmovw %eax, %k5 -; KNL-NEXT: kxorw %k5, %k4, %k4 -; KNL-NEXT: kshiftlw $15, %k4, %k4 -; KNL-NEXT: kshiftrw $9, %k4, %k4 -; KNL-NEXT: kxorw %k4, %k0, %k0 -; KNL-NEXT: kshiftrw $7, %k0, %k4 +; KNL-NEXT: kmovw %eax, %k7 +; KNL-NEXT: kshiftlw $6, %k7, %k7 +; KNL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; KNL-NEXT: korw %k7, %k1, %k7 +; KNL-NEXT: kshiftlw $10, %k0, %k0 +; KNL-NEXT: kshiftrw $10, %k0, %k0 +; KNL-NEXT: korw %k7, %k0, %k0 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k5 -; KNL-NEXT: kxorw %k5, %k4, %k4 -; KNL-NEXT: kshiftlw $15, %k4, %k4 -; KNL-NEXT: kshiftrw $8, %k4, %k4 -; KNL-NEXT: kxorw %k4, %k0, %k0 -; KNL-NEXT: kshiftrw $8, %k0, %k4 +; KNL-NEXT: kmovw %eax, %k7 +; KNL-NEXT: kshiftlw $7, %k7, %k7 +; KNL-NEXT: korw %k7, %k2, %k7 +; KNL-NEXT: kshiftlw $9, %k0, %k0 +; KNL-NEXT: kshiftrw $9, %k0, %k0 +; KNL-NEXT: korw %k7, %k0, %k0 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k5 -; KNL-NEXT: kxorw %k5, %k4, %k4 -; KNL-NEXT: kshiftlw $15, %k4, %k4 -; KNL-NEXT: kshiftrw $7, %k4, %k4 -; KNL-NEXT: kxorw %k4, %k0, %k0 -; KNL-NEXT: kshiftrw $9, %k0, %k4 +; KNL-NEXT: kmovw %eax, %k7 +; KNL-NEXT: kshiftlw $8, %k7, %k7 +; KNL-NEXT: korw %k7, %k3, %k7 +; KNL-NEXT: kshiftlw $8, %k0, %k0 +; KNL-NEXT: kshiftrw $8, %k0, %k0 +; KNL-NEXT: korw %k7, %k0, %k0 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k5 -; KNL-NEXT: kxorw %k5, %k4, %k4 -; KNL-NEXT: kshiftlw $15, %k4, %k4 -; KNL-NEXT: kshiftrw $6, %k4, %k4 -; KNL-NEXT: kxorw %k4, %k0, %k0 -; KNL-NEXT: kshiftrw $10, %k0, %k4 +; KNL-NEXT: kmovw %eax, %k7 +; KNL-NEXT: kshiftlw $9, %k7, %k7 +; KNL-NEXT: korw %k7, %k4, %k7 +; KNL-NEXT: kshiftlw $7, %k0, %k0 +; KNL-NEXT: kshiftrw $7, %k0, %k0 +; KNL-NEXT: korw %k7, %k0, %k0 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k5 -; KNL-NEXT: kxorw %k5, %k4, %k4 -; KNL-NEXT: kshiftlw $15, %k4, %k4 -; KNL-NEXT: kshiftrw $5, %k4, %k4 -; KNL-NEXT: kxorw %k4, %k0, %k0 -; KNL-NEXT: 
kshiftrw $11, %k0, %k4 +; KNL-NEXT: kmovw %eax, %k7 +; KNL-NEXT: kshiftlw $10, %k7, %k7 +; KNL-NEXT: korw %k7, %k5, %k6 +; KNL-NEXT: kshiftlw $6, %k0, %k0 +; KNL-NEXT: kshiftrw $6, %k0, %k0 +; KNL-NEXT: korw %k6, %k0, %k0 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k5 -; KNL-NEXT: kxorw %k5, %k4, %k4 -; KNL-NEXT: kshiftlw $15, %k4, %k4 -; KNL-NEXT: kshiftrw $4, %k4, %k4 -; KNL-NEXT: kxorw %k4, %k0, %k0 -; KNL-NEXT: kshiftrw $12, %k0, %k4 +; KNL-NEXT: kmovw %eax, %k6 +; KNL-NEXT: kshiftlw $11, %k6, %k6 +; KNL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; KNL-NEXT: korw %k6, %k2, %k5 +; KNL-NEXT: kshiftlw $5, %k0, %k0 +; KNL-NEXT: kshiftrw $5, %k0, %k0 +; KNL-NEXT: korw %k5, %k0, %k0 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al ; KNL-NEXT: kmovw %eax, %k5 -; KNL-NEXT: kxorw %k5, %k4, %k4 -; KNL-NEXT: kshiftlw $15, %k4, %k4 -; KNL-NEXT: kshiftrw $3, %k4, %k4 -; KNL-NEXT: kxorw %k4, %k0, %k0 -; KNL-NEXT: kshiftrw $13, %k0, %k4 +; KNL-NEXT: kshiftlw $12, %k5, %k5 +; KNL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; KNL-NEXT: korw %k5, %k2, %k4 +; KNL-NEXT: kshiftlw $4, %k0, %k0 +; KNL-NEXT: kshiftrw $4, %k0, %k0 +; KNL-NEXT: korw %k4, %k0, %k0 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k5 -; KNL-NEXT: kxorw %k5, %k4, %k4 -; KNL-NEXT: kshiftlw $15, %k4, %k4 -; KNL-NEXT: kshiftrw $2, %k4, %k4 -; KNL-NEXT: kxorw %k4, %k0, %k0 -; KNL-NEXT: kshiftrw $14, %k0, %k4 +; KNL-NEXT: kmovw %eax, %k4 +; KNL-NEXT: kshiftlw $13, %k4, %k4 +; KNL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; KNL-NEXT: korw %k4, %k2, %k3 +; KNL-NEXT: kshiftlw $3, %k0, %k0 +; KNL-NEXT: kshiftrw $3, %k0, %k0 +; KNL-NEXT: korw %k3, %k0, %k0 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k5 -; KNL-NEXT: kxorw %k5, %k4, %k4 -; KNL-NEXT: kshiftlw $14, %k4, %k4 -; KNL-NEXT: kxorw %k4, %k0, %k0 +; KNL-NEXT: kmovw %eax, %k3 +; KNL-NEXT: kshiftlw $14, %k3, %k3 +; KNL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte 
Reload +; KNL-NEXT: korw %k3, %k2, %k2 +; KNL-NEXT: kshiftlw $2, %k0, %k0 +; KNL-NEXT: kshiftrw $2, %k0, %k0 +; KNL-NEXT: korw %k2, %k0, %k0 ; KNL-NEXT: kshiftlw $1, %k0, %k0 ; KNL-NEXT: kshiftrw $1, %k0, %k0 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k4 -; KNL-NEXT: kshiftlw $15, %k4, %k4 -; KNL-NEXT: korw %k4, %k0, %k4 -; KNL-NEXT: vpternlogd $255, %zmm4, %zmm4, %zmm4 {%k4} {z} +; KNL-NEXT: kmovw %eax, %k2 +; KNL-NEXT: kshiftlw $15, %k2, %k2 +; KNL-NEXT: korw %k2, %k0, %k2 +; KNL-NEXT: vpternlogd $255, %zmm4, %zmm4, %zmm4 {%k2} {z} +; KNL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; KNL-NEXT: vpternlogd $255, %zmm5, %zmm5, %zmm5 {%k1} {z} +; KNL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; KNL-NEXT: vpternlogd $255, %zmm6, %zmm6, %zmm6 {%k1} {z} +; KNL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; KNL-NEXT: vpternlogd $255, %zmm7, %zmm7, %zmm7 {%k1} {z} ; KNL-NEXT: vpmovdw %zmm4, %ymm4 ; KNL-NEXT: vpand %ymm1, %ymm4, %ymm1 -; KNL-NEXT: vpternlogd $255, %zmm4, %zmm4, %zmm4 {%k3} {z} -; KNL-NEXT: vpmovdw %zmm4, %ymm4 +; KNL-NEXT: vpmovdw %zmm5, %ymm4 ; KNL-NEXT: vpand %ymm2, %ymm4, %ymm2 -; KNL-NEXT: vpternlogd $255, %zmm4, %zmm4, %zmm4 {%k2} {z} -; KNL-NEXT: vpmovdw %zmm4, %ymm4 +; KNL-NEXT: vpmovdw %zmm6, %ymm4 ; KNL-NEXT: vpand %ymm3, %ymm4, %ymm3 -; KNL-NEXT: vpternlogd $255, %zmm4, %zmm4, %zmm4 {%k1} {z} -; KNL-NEXT: vpmovdw %zmm4, %ymm4 +; KNL-NEXT: vpmovdw %zmm7, %ymm4 ; KNL-NEXT: vpand %ymm0, %ymm4, %ymm0 ; KNL-NEXT: retq ; @@ -2304,410 +2389,495 @@ define <64 x i16> @test21(<64 x i16> %x , <64 x i1> %mask) nounwind readnone { ; ; AVX512DQNOBW-LABEL: test21: ; AVX512DQNOBW: # %bb.0: -; AVX512DQNOBW-NEXT: kmovw %edx, %k0 -; AVX512DQNOBW-NEXT: kmovw %edi, %k2 -; AVX512DQNOBW-NEXT: kshiftlw $15, %k0, %k1 -; AVX512DQNOBW-NEXT: kshiftrw $14, %k1, %k1 -; AVX512DQNOBW-NEXT: kxorw %k1, %k2, %k2 -; AVX512DQNOBW-NEXT: kshiftrw $2, %k2, %k3 -; AVX512DQNOBW-NEXT: kxorw %k0, %k3, %k0 +; 
AVX512DQNOBW-NEXT: kmovw %edi, %k0 ; AVX512DQNOBW-NEXT: kshiftlw $15, %k0, %k0 +; AVX512DQNOBW-NEXT: kshiftrw $15, %k0, %k0 +; AVX512DQNOBW-NEXT: kshiftlw $2, %k0, %k2 +; AVX512DQNOBW-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; AVX512DQNOBW-NEXT: kmovw %esi, %k1 +; AVX512DQNOBW-NEXT: kshiftlw $1, %k1, %k1 +; AVX512DQNOBW-NEXT: korw %k1, %k2, %k1 +; AVX512DQNOBW-NEXT: korw %k1, %k0, %k0 +; AVX512DQNOBW-NEXT: kshiftlw $14, %k0, %k0 +; AVX512DQNOBW-NEXT: kshiftrw $14, %k0, %k0 +; AVX512DQNOBW-NEXT: kshiftlw $3, %k0, %k3 +; AVX512DQNOBW-NEXT: kmovw %edx, %k1 +; AVX512DQNOBW-NEXT: kshiftlw $2, %k1, %k1 +; AVX512DQNOBW-NEXT: korw %k1, %k3, %k1 +; AVX512DQNOBW-NEXT: korw %k1, %k0, %k0 +; AVX512DQNOBW-NEXT: kshiftlw $13, %k0, %k0 ; AVX512DQNOBW-NEXT: kshiftrw $13, %k0, %k0 -; AVX512DQNOBW-NEXT: kxorw %k0, %k2, %k0 -; AVX512DQNOBW-NEXT: kshiftrw $3, %k0, %k2 -; AVX512DQNOBW-NEXT: kmovw %ecx, %k3 -; AVX512DQNOBW-NEXT: kxorw %k3, %k2, %k2 -; AVX512DQNOBW-NEXT: kshiftlw $15, %k2, %k2 -; AVX512DQNOBW-NEXT: kshiftrw $12, %k2, %k2 -; AVX512DQNOBW-NEXT: kxorw %k2, %k0, %k0 -; AVX512DQNOBW-NEXT: kshiftrw $4, %k0, %k2 -; AVX512DQNOBW-NEXT: kmovw %r8d, %k3 -; AVX512DQNOBW-NEXT: kxorw %k3, %k2, %k2 -; AVX512DQNOBW-NEXT: kshiftlw $15, %k2, %k2 -; AVX512DQNOBW-NEXT: kshiftrw $11, %k2, %k2 -; AVX512DQNOBW-NEXT: kxorw %k2, %k0, %k0 -; AVX512DQNOBW-NEXT: kshiftrw $5, %k0, %k2 -; AVX512DQNOBW-NEXT: kmovw %r9d, %k3 -; AVX512DQNOBW-NEXT: kxorw %k3, %k2, %k2 -; AVX512DQNOBW-NEXT: kshiftlw $15, %k2, %k2 -; AVX512DQNOBW-NEXT: kshiftrw $10, %k2, %k2 -; AVX512DQNOBW-NEXT: kxorw %k2, %k0, %k0 -; AVX512DQNOBW-NEXT: kshiftrw $6, %k0, %k2 +; AVX512DQNOBW-NEXT: kshiftlw $4, %k0, %k4 +; AVX512DQNOBW-NEXT: kmovw %ecx, %k1 +; AVX512DQNOBW-NEXT: kshiftlw $3, %k1, %k1 +; AVX512DQNOBW-NEXT: korw %k1, %k4, %k1 +; AVX512DQNOBW-NEXT: korw %k1, %k0, %k0 +; AVX512DQNOBW-NEXT: kshiftlw $12, %k0, %k0 +; AVX512DQNOBW-NEXT: kshiftrw $12, %k0, %k0 +; AVX512DQNOBW-NEXT: kshiftlw $5, %k0, %k5 +; 
AVX512DQNOBW-NEXT: kmovw %r8d, %k1 +; AVX512DQNOBW-NEXT: kshiftlw $4, %k1, %k1 +; AVX512DQNOBW-NEXT: korw %k1, %k5, %k1 +; AVX512DQNOBW-NEXT: korw %k1, %k0, %k0 +; AVX512DQNOBW-NEXT: kshiftlw $11, %k0, %k0 +; AVX512DQNOBW-NEXT: kshiftrw $11, %k0, %k0 +; AVX512DQNOBW-NEXT: kshiftlw $6, %k0, %k6 +; AVX512DQNOBW-NEXT: kmovw %r9d, %k1 +; AVX512DQNOBW-NEXT: kshiftlw $5, %k1, %k1 +; AVX512DQNOBW-NEXT: korw %k1, %k6, %k1 +; AVX512DQNOBW-NEXT: korw %k1, %k0, %k0 +; AVX512DQNOBW-NEXT: kshiftlw $10, %k0, %k0 +; AVX512DQNOBW-NEXT: kshiftrw $10, %k0, %k0 +; AVX512DQNOBW-NEXT: kshiftlw $7, %k0, %k7 ; AVX512DQNOBW-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQNOBW-NEXT: kmovw %eax, %k3 -; AVX512DQNOBW-NEXT: kxorw %k3, %k2, %k2 -; AVX512DQNOBW-NEXT: kshiftlw $15, %k2, %k2 -; AVX512DQNOBW-NEXT: kshiftrw $9, %k2, %k2 -; AVX512DQNOBW-NEXT: kxorw %k2, %k0, %k0 -; AVX512DQNOBW-NEXT: kshiftrw $7, %k0, %k2 +; AVX512DQNOBW-NEXT: kmovw %eax, %k1 +; AVX512DQNOBW-NEXT: kshiftlw $6, %k1, %k1 +; AVX512DQNOBW-NEXT: korw %k1, %k7, %k1 +; AVX512DQNOBW-NEXT: korw %k1, %k0, %k0 +; AVX512DQNOBW-NEXT: kshiftlw $9, %k0, %k0 +; AVX512DQNOBW-NEXT: kshiftrw $9, %k0, %k0 ; AVX512DQNOBW-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQNOBW-NEXT: kmovw %eax, %k3 -; AVX512DQNOBW-NEXT: kxorw %k3, %k2, %k2 -; AVX512DQNOBW-NEXT: kshiftlw $15, %k2, %k2 -; AVX512DQNOBW-NEXT: kshiftrw $8, %k2, %k2 -; AVX512DQNOBW-NEXT: kxorw %k2, %k0, %k0 -; AVX512DQNOBW-NEXT: kshiftrw $8, %k0, %k2 +; AVX512DQNOBW-NEXT: kmovw %eax, %k1 +; AVX512DQNOBW-NEXT: kshiftlw $7, %k1, %k1 +; AVX512DQNOBW-NEXT: kshiftlw $8, %k0, %k2 +; AVX512DQNOBW-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; AVX512DQNOBW-NEXT: korw %k1, %k2, %k1 +; AVX512DQNOBW-NEXT: korw %k1, %k0, %k0 +; AVX512DQNOBW-NEXT: kshiftlw $8, %k0, %k0 +; AVX512DQNOBW-NEXT: kshiftrw $8, %k0, %k0 ; AVX512DQNOBW-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQNOBW-NEXT: kmovw %eax, %k3 -; AVX512DQNOBW-NEXT: kxorw %k3, %k2, %k2 -; AVX512DQNOBW-NEXT: kshiftlw $15, %k2, %k2 -; 
AVX512DQNOBW-NEXT: kshiftrw $7, %k2, %k2 -; AVX512DQNOBW-NEXT: kxorw %k2, %k0, %k0 -; AVX512DQNOBW-NEXT: kshiftrw $9, %k0, %k2 +; AVX512DQNOBW-NEXT: kmovw %eax, %k1 +; AVX512DQNOBW-NEXT: kshiftlw $8, %k1, %k1 +; AVX512DQNOBW-NEXT: kshiftlw $9, %k0, %k2 +; AVX512DQNOBW-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; AVX512DQNOBW-NEXT: korw %k1, %k2, %k1 +; AVX512DQNOBW-NEXT: korw %k1, %k0, %k0 +; AVX512DQNOBW-NEXT: kshiftlw $7, %k0, %k0 +; AVX512DQNOBW-NEXT: kshiftrw $7, %k0, %k0 ; AVX512DQNOBW-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQNOBW-NEXT: kmovw %eax, %k3 -; AVX512DQNOBW-NEXT: kxorw %k3, %k2, %k2 -; AVX512DQNOBW-NEXT: kshiftlw $15, %k2, %k2 -; AVX512DQNOBW-NEXT: kshiftrw $6, %k2, %k2 -; AVX512DQNOBW-NEXT: kxorw %k2, %k0, %k0 -; AVX512DQNOBW-NEXT: kshiftrw $10, %k0, %k2 +; AVX512DQNOBW-NEXT: kmovw %eax, %k1 +; AVX512DQNOBW-NEXT: kshiftlw $9, %k1, %k1 +; AVX512DQNOBW-NEXT: kshiftlw $10, %k0, %k2 +; AVX512DQNOBW-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; AVX512DQNOBW-NEXT: korw %k1, %k2, %k1 +; AVX512DQNOBW-NEXT: korw %k1, %k0, %k0 +; AVX512DQNOBW-NEXT: kshiftlw $6, %k0, %k0 +; AVX512DQNOBW-NEXT: kshiftrw $6, %k0, %k0 ; AVX512DQNOBW-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQNOBW-NEXT: kmovw %eax, %k3 -; AVX512DQNOBW-NEXT: kxorw %k3, %k2, %k2 -; AVX512DQNOBW-NEXT: kshiftlw $15, %k2, %k2 -; AVX512DQNOBW-NEXT: kshiftrw $5, %k2, %k2 -; AVX512DQNOBW-NEXT: kxorw %k2, %k0, %k0 -; AVX512DQNOBW-NEXT: kshiftrw $11, %k0, %k2 +; AVX512DQNOBW-NEXT: kmovw %eax, %k1 +; AVX512DQNOBW-NEXT: kshiftlw $10, %k1, %k1 +; AVX512DQNOBW-NEXT: kshiftlw $11, %k0, %k2 +; AVX512DQNOBW-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; AVX512DQNOBW-NEXT: korw %k1, %k2, %k1 +; AVX512DQNOBW-NEXT: korw %k1, %k0, %k0 +; AVX512DQNOBW-NEXT: kshiftlw $5, %k0, %k0 +; AVX512DQNOBW-NEXT: kshiftrw $5, %k0, %k0 ; AVX512DQNOBW-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQNOBW-NEXT: kmovw %eax, %k3 -; AVX512DQNOBW-NEXT: kxorw %k3, %k2, %k2 -; AVX512DQNOBW-NEXT: 
kshiftlw $15, %k2, %k2 -; AVX512DQNOBW-NEXT: kshiftrw $4, %k2, %k2 -; AVX512DQNOBW-NEXT: kxorw %k2, %k0, %k0 -; AVX512DQNOBW-NEXT: kshiftrw $12, %k0, %k2 +; AVX512DQNOBW-NEXT: kmovw %eax, %k1 +; AVX512DQNOBW-NEXT: kshiftlw $11, %k1, %k1 +; AVX512DQNOBW-NEXT: kshiftlw $12, %k0, %k2 +; AVX512DQNOBW-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; AVX512DQNOBW-NEXT: korw %k1, %k2, %k1 +; AVX512DQNOBW-NEXT: korw %k1, %k0, %k0 +; AVX512DQNOBW-NEXT: kshiftlw $4, %k0, %k0 +; AVX512DQNOBW-NEXT: kshiftrw $4, %k0, %k0 ; AVX512DQNOBW-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQNOBW-NEXT: kmovw %eax, %k3 -; AVX512DQNOBW-NEXT: kxorw %k3, %k2, %k2 -; AVX512DQNOBW-NEXT: kshiftlw $15, %k2, %k2 -; AVX512DQNOBW-NEXT: kshiftrw $3, %k2, %k2 -; AVX512DQNOBW-NEXT: kxorw %k2, %k0, %k0 -; AVX512DQNOBW-NEXT: kshiftrw $13, %k0, %k2 +; AVX512DQNOBW-NEXT: kmovw %eax, %k1 +; AVX512DQNOBW-NEXT: kshiftlw $12, %k1, %k1 +; AVX512DQNOBW-NEXT: kshiftlw $13, %k0, %k2 +; AVX512DQNOBW-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; AVX512DQNOBW-NEXT: korw %k1, %k2, %k1 +; AVX512DQNOBW-NEXT: korw %k1, %k0, %k0 +; AVX512DQNOBW-NEXT: kshiftlw $3, %k0, %k0 +; AVX512DQNOBW-NEXT: kshiftrw $3, %k0, %k0 ; AVX512DQNOBW-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQNOBW-NEXT: kmovw %eax, %k3 -; AVX512DQNOBW-NEXT: kxorw %k3, %k2, %k2 -; AVX512DQNOBW-NEXT: kshiftlw $15, %k2, %k2 -; AVX512DQNOBW-NEXT: kshiftrw $2, %k2, %k2 -; AVX512DQNOBW-NEXT: kxorw %k2, %k0, %k0 -; AVX512DQNOBW-NEXT: kshiftrw $14, %k0, %k2 +; AVX512DQNOBW-NEXT: kmovw %eax, %k1 +; AVX512DQNOBW-NEXT: kshiftlw $13, %k1, %k1 +; AVX512DQNOBW-NEXT: kshiftlw $14, %k0, %k2 +; AVX512DQNOBW-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; AVX512DQNOBW-NEXT: korw %k1, %k2, %k1 +; AVX512DQNOBW-NEXT: korw %k1, %k0, %k0 +; AVX512DQNOBW-NEXT: kshiftlw $2, %k0, %k0 +; AVX512DQNOBW-NEXT: kshiftrw $2, %k0, %k2 ; AVX512DQNOBW-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQNOBW-NEXT: kmovw %eax, %k3 -; AVX512DQNOBW-NEXT: kxorw %k3, %k2, 
%k2 -; AVX512DQNOBW-NEXT: kshiftlw $14, %k2, %k2 -; AVX512DQNOBW-NEXT: kxorw %k2, %k0, %k0 +; AVX512DQNOBW-NEXT: kmovw %eax, %k1 +; AVX512DQNOBW-NEXT: kshiftlw $14, %k1, %k0 +; AVX512DQNOBW-NEXT: kshiftlw $15, %k0, %k1 +; AVX512DQNOBW-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; AVX512DQNOBW-NEXT: korw %k0, %k1, %k0 +; AVX512DQNOBW-NEXT: korw %k0, %k2, %k0 ; AVX512DQNOBW-NEXT: kshiftlw $1, %k0, %k0 ; AVX512DQNOBW-NEXT: kshiftrw $1, %k0, %k0 ; AVX512DQNOBW-NEXT: movb {{[0-9]+}}(%rsp), %al ; AVX512DQNOBW-NEXT: kmovw %eax, %k2 ; AVX512DQNOBW-NEXT: kshiftlw $15, %k2, %k2 ; AVX512DQNOBW-NEXT: korw %k2, %k0, %k0 +; AVX512DQNOBW-NEXT: kmovw %k0, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; AVX512DQNOBW-NEXT: movb {{[0-9]+}}(%rsp), %al +; AVX512DQNOBW-NEXT: kmovw %eax, %k0 +; AVX512DQNOBW-NEXT: kshiftlw $15, %k0, %k0 +; AVX512DQNOBW-NEXT: kshiftrw $15, %k0, %k0 ; AVX512DQNOBW-NEXT: movb {{[0-9]+}}(%rsp), %al ; AVX512DQNOBW-NEXT: kmovw %eax, %k2 +; AVX512DQNOBW-NEXT: kshiftlw $1, %k2, %k2 +; AVX512DQNOBW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512DQNOBW-NEXT: korw %k2, %k1, %k2 +; AVX512DQNOBW-NEXT: korw %k2, %k0, %k0 +; AVX512DQNOBW-NEXT: kshiftlw $14, %k0, %k0 +; AVX512DQNOBW-NEXT: kshiftrw $14, %k0, %k0 ; AVX512DQNOBW-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQNOBW-NEXT: kmovw %eax, %k3 -; AVX512DQNOBW-NEXT: kxorw %k1, %k3, %k3 -; AVX512DQNOBW-NEXT: kshiftrw $2, %k3, %k4 -; AVX512DQNOBW-NEXT: kxorw %k2, %k4, %k2 -; AVX512DQNOBW-NEXT: kshiftlw $15, %k2, %k2 -; AVX512DQNOBW-NEXT: kshiftrw $13, %k2, %k2 -; AVX512DQNOBW-NEXT: kxorw %k2, %k3, %k2 -; AVX512DQNOBW-NEXT: kshiftrw $3, %k2, %k3 +; AVX512DQNOBW-NEXT: kmovw %eax, %k2 +; AVX512DQNOBW-NEXT: kshiftlw $2, %k2, %k2 +; AVX512DQNOBW-NEXT: kmovw %k3, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; AVX512DQNOBW-NEXT: korw %k2, %k3, %k2 +; AVX512DQNOBW-NEXT: korw %k2, %k0, %k0 +; AVX512DQNOBW-NEXT: kshiftlw $13, %k0, %k0 +; AVX512DQNOBW-NEXT: kshiftrw $13, %k0, %k0 ; AVX512DQNOBW-NEXT: movb 
{{[0-9]+}}(%rsp), %al -; AVX512DQNOBW-NEXT: kmovw %eax, %k4 -; AVX512DQNOBW-NEXT: kxorw %k4, %k3, %k3 -; AVX512DQNOBW-NEXT: kshiftlw $15, %k3, %k3 -; AVX512DQNOBW-NEXT: kshiftrw $12, %k3, %k3 -; AVX512DQNOBW-NEXT: kxorw %k3, %k2, %k2 -; AVX512DQNOBW-NEXT: kshiftrw $4, %k2, %k3 +; AVX512DQNOBW-NEXT: kmovw %eax, %k2 +; AVX512DQNOBW-NEXT: kshiftlw $3, %k2, %k2 +; AVX512DQNOBW-NEXT: kmovw %k4, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; AVX512DQNOBW-NEXT: korw %k2, %k4, %k2 +; AVX512DQNOBW-NEXT: korw %k2, %k0, %k0 +; AVX512DQNOBW-NEXT: kshiftlw $12, %k0, %k0 +; AVX512DQNOBW-NEXT: kshiftrw $12, %k0, %k0 ; AVX512DQNOBW-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQNOBW-NEXT: kmovw %eax, %k4 -; AVX512DQNOBW-NEXT: kxorw %k4, %k3, %k3 -; AVX512DQNOBW-NEXT: kshiftlw $15, %k3, %k3 -; AVX512DQNOBW-NEXT: kshiftrw $11, %k3, %k3 -; AVX512DQNOBW-NEXT: kxorw %k3, %k2, %k2 -; AVX512DQNOBW-NEXT: kshiftrw $5, %k2, %k3 +; AVX512DQNOBW-NEXT: kmovw %eax, %k2 +; AVX512DQNOBW-NEXT: kshiftlw $4, %k2, %k2 +; AVX512DQNOBW-NEXT: kmovw %k5, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; AVX512DQNOBW-NEXT: korw %k2, %k5, %k2 +; AVX512DQNOBW-NEXT: korw %k2, %k0, %k0 +; AVX512DQNOBW-NEXT: kshiftlw $11, %k0, %k0 +; AVX512DQNOBW-NEXT: kshiftrw $11, %k0, %k0 ; AVX512DQNOBW-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQNOBW-NEXT: kmovw %eax, %k4 -; AVX512DQNOBW-NEXT: kxorw %k4, %k3, %k3 -; AVX512DQNOBW-NEXT: kshiftlw $15, %k3, %k3 -; AVX512DQNOBW-NEXT: kshiftrw $10, %k3, %k3 -; AVX512DQNOBW-NEXT: kxorw %k3, %k2, %k2 -; AVX512DQNOBW-NEXT: kshiftrw $6, %k2, %k3 +; AVX512DQNOBW-NEXT: kmovw %eax, %k2 +; AVX512DQNOBW-NEXT: kshiftlw $5, %k2, %k2 +; AVX512DQNOBW-NEXT: korw %k2, %k6, %k2 +; AVX512DQNOBW-NEXT: korw %k2, %k0, %k0 +; AVX512DQNOBW-NEXT: kshiftlw $10, %k0, %k0 +; AVX512DQNOBW-NEXT: kshiftrw $10, %k0, %k0 ; AVX512DQNOBW-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQNOBW-NEXT: kmovw %eax, %k4 -; AVX512DQNOBW-NEXT: kxorw %k4, %k3, %k3 -; AVX512DQNOBW-NEXT: kshiftlw $15, %k3, %k3 -; AVX512DQNOBW-NEXT: kshiftrw 
$9, %k3, %k3 -; AVX512DQNOBW-NEXT: kxorw %k3, %k2, %k2 -; AVX512DQNOBW-NEXT: kshiftrw $7, %k2, %k3 +; AVX512DQNOBW-NEXT: kmovw %eax, %k2 +; AVX512DQNOBW-NEXT: kshiftlw $6, %k2, %k2 +; AVX512DQNOBW-NEXT: kmovw %k7, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; AVX512DQNOBW-NEXT: korw %k2, %k7, %k2 +; AVX512DQNOBW-NEXT: korw %k2, %k0, %k0 +; AVX512DQNOBW-NEXT: kshiftlw $9, %k0, %k0 +; AVX512DQNOBW-NEXT: kshiftrw $9, %k0, %k0 ; AVX512DQNOBW-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQNOBW-NEXT: kmovw %eax, %k4 -; AVX512DQNOBW-NEXT: kxorw %k4, %k3, %k3 -; AVX512DQNOBW-NEXT: kshiftlw $15, %k3, %k3 -; AVX512DQNOBW-NEXT: kshiftrw $8, %k3, %k3 -; AVX512DQNOBW-NEXT: kxorw %k3, %k2, %k2 -; AVX512DQNOBW-NEXT: kshiftrw $8, %k2, %k3 +; AVX512DQNOBW-NEXT: kmovw %eax, %k2 +; AVX512DQNOBW-NEXT: kshiftlw $7, %k2, %k2 +; AVX512DQNOBW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512DQNOBW-NEXT: korw %k2, %k1, %k2 +; AVX512DQNOBW-NEXT: korw %k2, %k0, %k0 +; AVX512DQNOBW-NEXT: kshiftlw $8, %k0, %k0 +; AVX512DQNOBW-NEXT: kshiftrw $8, %k0, %k0 ; AVX512DQNOBW-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQNOBW-NEXT: kmovw %eax, %k4 -; AVX512DQNOBW-NEXT: kxorw %k4, %k3, %k3 -; AVX512DQNOBW-NEXT: kshiftlw $15, %k3, %k3 -; AVX512DQNOBW-NEXT: kshiftrw $7, %k3, %k3 -; AVX512DQNOBW-NEXT: kxorw %k3, %k2, %k2 -; AVX512DQNOBW-NEXT: kshiftrw $9, %k2, %k3 +; AVX512DQNOBW-NEXT: kmovw %eax, %k2 +; AVX512DQNOBW-NEXT: kshiftlw $8, %k2, %k2 +; AVX512DQNOBW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512DQNOBW-NEXT: korw %k2, %k1, %k2 +; AVX512DQNOBW-NEXT: korw %k2, %k0, %k0 +; AVX512DQNOBW-NEXT: kshiftlw $7, %k0, %k0 +; AVX512DQNOBW-NEXT: kshiftrw $7, %k0, %k0 ; AVX512DQNOBW-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQNOBW-NEXT: kmovw %eax, %k4 -; AVX512DQNOBW-NEXT: kxorw %k4, %k3, %k3 -; AVX512DQNOBW-NEXT: kshiftlw $15, %k3, %k3 -; AVX512DQNOBW-NEXT: kshiftrw $6, %k3, %k3 -; AVX512DQNOBW-NEXT: kxorw %k3, %k2, %k2 -; AVX512DQNOBW-NEXT: kshiftrw $10, %k2, %k3 +; 
AVX512DQNOBW-NEXT: kmovw %eax, %k2 +; AVX512DQNOBW-NEXT: kshiftlw $9, %k2, %k2 +; AVX512DQNOBW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512DQNOBW-NEXT: korw %k2, %k1, %k2 +; AVX512DQNOBW-NEXT: korw %k2, %k0, %k0 +; AVX512DQNOBW-NEXT: kshiftlw $6, %k0, %k0 +; AVX512DQNOBW-NEXT: kshiftrw $6, %k0, %k0 ; AVX512DQNOBW-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQNOBW-NEXT: kmovw %eax, %k4 -; AVX512DQNOBW-NEXT: kxorw %k4, %k3, %k3 -; AVX512DQNOBW-NEXT: kshiftlw $15, %k3, %k3 -; AVX512DQNOBW-NEXT: kshiftrw $5, %k3, %k3 -; AVX512DQNOBW-NEXT: kxorw %k3, %k2, %k2 -; AVX512DQNOBW-NEXT: kshiftrw $11, %k2, %k3 +; AVX512DQNOBW-NEXT: kmovw %eax, %k2 +; AVX512DQNOBW-NEXT: kshiftlw $10, %k2, %k2 +; AVX512DQNOBW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512DQNOBW-NEXT: korw %k2, %k1, %k2 +; AVX512DQNOBW-NEXT: korw %k2, %k0, %k0 +; AVX512DQNOBW-NEXT: kshiftlw $5, %k0, %k0 +; AVX512DQNOBW-NEXT: kshiftrw $5, %k0, %k0 ; AVX512DQNOBW-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQNOBW-NEXT: kmovw %eax, %k4 -; AVX512DQNOBW-NEXT: kxorw %k4, %k3, %k3 -; AVX512DQNOBW-NEXT: kshiftlw $15, %k3, %k3 -; AVX512DQNOBW-NEXT: kshiftrw $4, %k3, %k3 -; AVX512DQNOBW-NEXT: kxorw %k3, %k2, %k2 -; AVX512DQNOBW-NEXT: kshiftrw $12, %k2, %k3 +; AVX512DQNOBW-NEXT: kmovw %eax, %k2 +; AVX512DQNOBW-NEXT: kshiftlw $11, %k2, %k2 +; AVX512DQNOBW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512DQNOBW-NEXT: korw %k2, %k1, %k2 +; AVX512DQNOBW-NEXT: korw %k2, %k0, %k0 +; AVX512DQNOBW-NEXT: kshiftlw $4, %k0, %k0 +; AVX512DQNOBW-NEXT: kshiftrw $4, %k0, %k0 ; AVX512DQNOBW-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQNOBW-NEXT: kmovw %eax, %k4 -; AVX512DQNOBW-NEXT: kxorw %k4, %k3, %k3 -; AVX512DQNOBW-NEXT: kshiftlw $15, %k3, %k3 -; AVX512DQNOBW-NEXT: kshiftrw $3, %k3, %k3 -; AVX512DQNOBW-NEXT: kxorw %k3, %k2, %k2 -; AVX512DQNOBW-NEXT: kshiftrw $13, %k2, %k3 +; AVX512DQNOBW-NEXT: kmovw %eax, %k2 +; AVX512DQNOBW-NEXT: kshiftlw $12, %k2, %k2 +; AVX512DQNOBW-NEXT: kmovw 
{{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512DQNOBW-NEXT: korw %k2, %k1, %k2 +; AVX512DQNOBW-NEXT: korw %k2, %k0, %k0 +; AVX512DQNOBW-NEXT: kshiftlw $3, %k0, %k0 +; AVX512DQNOBW-NEXT: kshiftrw $3, %k0, %k0 ; AVX512DQNOBW-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQNOBW-NEXT: kmovw %eax, %k4 -; AVX512DQNOBW-NEXT: kxorw %k4, %k3, %k3 -; AVX512DQNOBW-NEXT: kshiftlw $15, %k3, %k3 -; AVX512DQNOBW-NEXT: kshiftrw $2, %k3, %k3 -; AVX512DQNOBW-NEXT: kxorw %k3, %k2, %k2 -; AVX512DQNOBW-NEXT: kshiftrw $14, %k2, %k3 +; AVX512DQNOBW-NEXT: kmovw %eax, %k2 +; AVX512DQNOBW-NEXT: kshiftlw $13, %k2, %k2 +; AVX512DQNOBW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512DQNOBW-NEXT: korw %k2, %k1, %k2 +; AVX512DQNOBW-NEXT: korw %k2, %k0, %k0 +; AVX512DQNOBW-NEXT: kshiftlw $2, %k0, %k0 +; AVX512DQNOBW-NEXT: kshiftrw $2, %k0, %k0 ; AVX512DQNOBW-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQNOBW-NEXT: kmovw %eax, %k4 -; AVX512DQNOBW-NEXT: kxorw %k4, %k3, %k3 -; AVX512DQNOBW-NEXT: kshiftlw $14, %k3, %k3 -; AVX512DQNOBW-NEXT: kxorw %k3, %k2, %k2 -; AVX512DQNOBW-NEXT: kshiftlw $1, %k2, %k2 -; AVX512DQNOBW-NEXT: kshiftrw $1, %k2, %k2 +; AVX512DQNOBW-NEXT: kmovw %eax, %k2 +; AVX512DQNOBW-NEXT: kshiftlw $14, %k2, %k2 +; AVX512DQNOBW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512DQNOBW-NEXT: korw %k2, %k1, %k2 +; AVX512DQNOBW-NEXT: korw %k2, %k0, %k0 +; AVX512DQNOBW-NEXT: kshiftlw $1, %k0, %k0 +; AVX512DQNOBW-NEXT: kshiftrw $1, %k0, %k0 ; AVX512DQNOBW-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQNOBW-NEXT: kmovw %eax, %k3 -; AVX512DQNOBW-NEXT: kshiftlw $15, %k3, %k3 -; AVX512DQNOBW-NEXT: korw %k3, %k2, %k2 +; AVX512DQNOBW-NEXT: kmovw %eax, %k2 +; AVX512DQNOBW-NEXT: kshiftlw $15, %k2, %k2 +; AVX512DQNOBW-NEXT: korw %k2, %k0, %k0 +; AVX512DQNOBW-NEXT: kmovw %k0, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill ; AVX512DQNOBW-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQNOBW-NEXT: kmovw %eax, %k3 +; AVX512DQNOBW-NEXT: kmovw %eax, %k0 +; AVX512DQNOBW-NEXT: 
kshiftlw $15, %k0, %k0 +; AVX512DQNOBW-NEXT: kshiftrw $15, %k0, %k0 ; AVX512DQNOBW-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQNOBW-NEXT: kmovw %eax, %k4 -; AVX512DQNOBW-NEXT: kxorw %k1, %k4, %k4 -; AVX512DQNOBW-NEXT: kshiftrw $2, %k4, %k5 -; AVX512DQNOBW-NEXT: kxorw %k3, %k5, %k3 -; AVX512DQNOBW-NEXT: kshiftlw $15, %k3, %k3 -; AVX512DQNOBW-NEXT: kshiftrw $13, %k3, %k3 -; AVX512DQNOBW-NEXT: kxorw %k3, %k4, %k3 -; AVX512DQNOBW-NEXT: kshiftrw $3, %k3, %k4 +; AVX512DQNOBW-NEXT: kmovw %eax, %k2 +; AVX512DQNOBW-NEXT: kshiftlw $1, %k2, %k2 +; AVX512DQNOBW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512DQNOBW-NEXT: korw %k2, %k1, %k2 +; AVX512DQNOBW-NEXT: korw %k2, %k0, %k0 +; AVX512DQNOBW-NEXT: kshiftlw $14, %k0, %k0 +; AVX512DQNOBW-NEXT: kshiftrw $14, %k0, %k0 ; AVX512DQNOBW-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQNOBW-NEXT: kmovw %eax, %k5 -; AVX512DQNOBW-NEXT: kxorw %k5, %k4, %k4 -; AVX512DQNOBW-NEXT: kshiftlw $15, %k4, %k4 -; AVX512DQNOBW-NEXT: kshiftrw $12, %k4, %k4 -; AVX512DQNOBW-NEXT: kxorw %k4, %k3, %k3 -; AVX512DQNOBW-NEXT: kshiftrw $4, %k3, %k4 +; AVX512DQNOBW-NEXT: kmovw %eax, %k2 +; AVX512DQNOBW-NEXT: kshiftlw $2, %k2, %k2 +; AVX512DQNOBW-NEXT: korw %k2, %k3, %k2 +; AVX512DQNOBW-NEXT: korw %k2, %k0, %k0 +; AVX512DQNOBW-NEXT: kshiftlw $13, %k0, %k0 +; AVX512DQNOBW-NEXT: kshiftrw $13, %k0, %k0 ; AVX512DQNOBW-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQNOBW-NEXT: kmovw %eax, %k5 -; AVX512DQNOBW-NEXT: kxorw %k5, %k4, %k4 -; AVX512DQNOBW-NEXT: kshiftlw $15, %k4, %k4 -; AVX512DQNOBW-NEXT: kshiftrw $11, %k4, %k4 -; AVX512DQNOBW-NEXT: kxorw %k4, %k3, %k3 -; AVX512DQNOBW-NEXT: kshiftrw $5, %k3, %k4 +; AVX512DQNOBW-NEXT: kmovw %eax, %k2 +; AVX512DQNOBW-NEXT: kshiftlw $3, %k2, %k2 +; AVX512DQNOBW-NEXT: korw %k2, %k4, %k2 +; AVX512DQNOBW-NEXT: korw %k2, %k0, %k0 +; AVX512DQNOBW-NEXT: kshiftlw $12, %k0, %k0 +; AVX512DQNOBW-NEXT: kshiftrw $12, %k0, %k0 ; AVX512DQNOBW-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQNOBW-NEXT: kmovw %eax, %k5 -; 
AVX512DQNOBW-NEXT: kxorw %k5, %k4, %k4 -; AVX512DQNOBW-NEXT: kshiftlw $15, %k4, %k4 -; AVX512DQNOBW-NEXT: kshiftrw $10, %k4, %k4 -; AVX512DQNOBW-NEXT: kxorw %k4, %k3, %k3 -; AVX512DQNOBW-NEXT: kshiftrw $6, %k3, %k4 +; AVX512DQNOBW-NEXT: kmovw %eax, %k2 +; AVX512DQNOBW-NEXT: kshiftlw $4, %k2, %k2 +; AVX512DQNOBW-NEXT: korw %k2, %k5, %k2 +; AVX512DQNOBW-NEXT: korw %k2, %k0, %k0 +; AVX512DQNOBW-NEXT: kshiftlw $11, %k0, %k0 +; AVX512DQNOBW-NEXT: kshiftrw $11, %k0, %k0 ; AVX512DQNOBW-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQNOBW-NEXT: kmovw %eax, %k5 -; AVX512DQNOBW-NEXT: kxorw %k5, %k4, %k4 -; AVX512DQNOBW-NEXT: kshiftlw $15, %k4, %k4 -; AVX512DQNOBW-NEXT: kshiftrw $9, %k4, %k4 -; AVX512DQNOBW-NEXT: kxorw %k4, %k3, %k3 -; AVX512DQNOBW-NEXT: kshiftrw $7, %k3, %k4 +; AVX512DQNOBW-NEXT: kmovw %eax, %k2 +; AVX512DQNOBW-NEXT: kshiftlw $5, %k2, %k2 +; AVX512DQNOBW-NEXT: korw %k2, %k6, %k2 +; AVX512DQNOBW-NEXT: korw %k2, %k0, %k0 +; AVX512DQNOBW-NEXT: kshiftlw $10, %k0, %k0 +; AVX512DQNOBW-NEXT: kshiftrw $10, %k0, %k0 ; AVX512DQNOBW-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQNOBW-NEXT: kmovw %eax, %k5 -; AVX512DQNOBW-NEXT: kxorw %k5, %k4, %k4 -; AVX512DQNOBW-NEXT: kshiftlw $15, %k4, %k4 -; AVX512DQNOBW-NEXT: kshiftrw $8, %k4, %k4 -; AVX512DQNOBW-NEXT: kxorw %k4, %k3, %k3 -; AVX512DQNOBW-NEXT: kshiftrw $8, %k3, %k4 +; AVX512DQNOBW-NEXT: kmovw %eax, %k2 +; AVX512DQNOBW-NEXT: kshiftlw $6, %k2, %k2 +; AVX512DQNOBW-NEXT: korw %k2, %k7, %k2 +; AVX512DQNOBW-NEXT: korw %k2, %k0, %k0 +; AVX512DQNOBW-NEXT: kshiftlw $9, %k0, %k0 +; AVX512DQNOBW-NEXT: kshiftrw $9, %k0, %k0 ; AVX512DQNOBW-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQNOBW-NEXT: kmovw %eax, %k5 -; AVX512DQNOBW-NEXT: kxorw %k5, %k4, %k4 -; AVX512DQNOBW-NEXT: kshiftlw $15, %k4, %k4 -; AVX512DQNOBW-NEXT: kshiftrw $7, %k4, %k4 -; AVX512DQNOBW-NEXT: kxorw %k4, %k3, %k3 -; AVX512DQNOBW-NEXT: kshiftrw $9, %k3, %k4 +; AVX512DQNOBW-NEXT: kmovw %eax, %k2 +; AVX512DQNOBW-NEXT: kshiftlw $7, %k2, %k2 +; AVX512DQNOBW-NEXT: kmovw 
{{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512DQNOBW-NEXT: korw %k2, %k1, %k2 +; AVX512DQNOBW-NEXT: korw %k2, %k0, %k0 +; AVX512DQNOBW-NEXT: kshiftlw $8, %k0, %k0 +; AVX512DQNOBW-NEXT: kshiftrw $8, %k0, %k0 ; AVX512DQNOBW-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQNOBW-NEXT: kmovw %eax, %k5 -; AVX512DQNOBW-NEXT: kxorw %k5, %k4, %k4 -; AVX512DQNOBW-NEXT: kshiftlw $15, %k4, %k4 -; AVX512DQNOBW-NEXT: kshiftrw $6, %k4, %k4 -; AVX512DQNOBW-NEXT: kxorw %k4, %k3, %k3 -; AVX512DQNOBW-NEXT: kshiftrw $10, %k3, %k4 +; AVX512DQNOBW-NEXT: kmovw %eax, %k2 +; AVX512DQNOBW-NEXT: kshiftlw $8, %k2, %k2 +; AVX512DQNOBW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload +; AVX512DQNOBW-NEXT: korw %k2, %k3, %k2 +; AVX512DQNOBW-NEXT: korw %k2, %k0, %k0 +; AVX512DQNOBW-NEXT: kshiftlw $7, %k0, %k0 +; AVX512DQNOBW-NEXT: kshiftrw $7, %k0, %k0 ; AVX512DQNOBW-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQNOBW-NEXT: kmovw %eax, %k5 -; AVX512DQNOBW-NEXT: kxorw %k5, %k4, %k4 -; AVX512DQNOBW-NEXT: kshiftlw $15, %k4, %k4 -; AVX512DQNOBW-NEXT: kshiftrw $5, %k4, %k4 -; AVX512DQNOBW-NEXT: kxorw %k4, %k3, %k3 -; AVX512DQNOBW-NEXT: kshiftrw $11, %k3, %k4 +; AVX512DQNOBW-NEXT: kmovw %eax, %k2 +; AVX512DQNOBW-NEXT: kshiftlw $9, %k2, %k2 +; AVX512DQNOBW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload +; AVX512DQNOBW-NEXT: korw %k2, %k4, %k2 +; AVX512DQNOBW-NEXT: korw %k2, %k0, %k0 +; AVX512DQNOBW-NEXT: kshiftlw $6, %k0, %k0 +; AVX512DQNOBW-NEXT: kshiftrw $6, %k0, %k0 ; AVX512DQNOBW-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQNOBW-NEXT: kmovw %eax, %k5 -; AVX512DQNOBW-NEXT: kxorw %k5, %k4, %k4 -; AVX512DQNOBW-NEXT: kshiftlw $15, %k4, %k4 -; AVX512DQNOBW-NEXT: kshiftrw $4, %k4, %k4 -; AVX512DQNOBW-NEXT: kxorw %k4, %k3, %k3 -; AVX512DQNOBW-NEXT: kshiftrw $12, %k3, %k4 +; AVX512DQNOBW-NEXT: kmovw %eax, %k2 +; AVX512DQNOBW-NEXT: kshiftlw $10, %k2, %k2 +; AVX512DQNOBW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload +; AVX512DQNOBW-NEXT: korw %k2, %k5, %k2 +; AVX512DQNOBW-NEXT: 
korw %k2, %k0, %k0 +; AVX512DQNOBW-NEXT: kshiftlw $5, %k0, %k0 +; AVX512DQNOBW-NEXT: kshiftrw $5, %k0, %k0 ; AVX512DQNOBW-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQNOBW-NEXT: kmovw %eax, %k5 -; AVX512DQNOBW-NEXT: kxorw %k5, %k4, %k4 -; AVX512DQNOBW-NEXT: kshiftlw $15, %k4, %k4 -; AVX512DQNOBW-NEXT: kshiftrw $3, %k4, %k4 -; AVX512DQNOBW-NEXT: kxorw %k4, %k3, %k3 -; AVX512DQNOBW-NEXT: kshiftrw $13, %k3, %k4 +; AVX512DQNOBW-NEXT: kmovw %eax, %k2 +; AVX512DQNOBW-NEXT: kshiftlw $11, %k2, %k2 +; AVX512DQNOBW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload +; AVX512DQNOBW-NEXT: korw %k2, %k7, %k2 +; AVX512DQNOBW-NEXT: korw %k2, %k0, %k0 +; AVX512DQNOBW-NEXT: kshiftlw $4, %k0, %k0 +; AVX512DQNOBW-NEXT: kshiftrw $4, %k0, %k0 ; AVX512DQNOBW-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQNOBW-NEXT: kmovw %eax, %k5 -; AVX512DQNOBW-NEXT: kxorw %k5, %k4, %k4 -; AVX512DQNOBW-NEXT: kshiftlw $15, %k4, %k4 -; AVX512DQNOBW-NEXT: kshiftrw $2, %k4, %k4 -; AVX512DQNOBW-NEXT: kxorw %k4, %k3, %k3 -; AVX512DQNOBW-NEXT: kshiftrw $14, %k3, %k4 +; AVX512DQNOBW-NEXT: kmovw %eax, %k2 +; AVX512DQNOBW-NEXT: kshiftlw $12, %k2, %k2 +; AVX512DQNOBW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload +; AVX512DQNOBW-NEXT: korw %k2, %k7, %k2 +; AVX512DQNOBW-NEXT: korw %k2, %k0, %k0 +; AVX512DQNOBW-NEXT: kshiftlw $3, %k0, %k0 +; AVX512DQNOBW-NEXT: kshiftrw $3, %k0, %k0 ; AVX512DQNOBW-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQNOBW-NEXT: kmovw %eax, %k5 -; AVX512DQNOBW-NEXT: kxorw %k5, %k4, %k4 -; AVX512DQNOBW-NEXT: kshiftlw $14, %k4, %k4 -; AVX512DQNOBW-NEXT: kxorw %k4, %k3, %k3 -; AVX512DQNOBW-NEXT: kshiftlw $1, %k3, %k3 -; AVX512DQNOBW-NEXT: kshiftrw $1, %k3, %k3 +; AVX512DQNOBW-NEXT: kmovw %eax, %k2 +; AVX512DQNOBW-NEXT: kshiftlw $13, %k2, %k2 +; AVX512DQNOBW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload +; AVX512DQNOBW-NEXT: korw %k2, %k7, %k2 +; AVX512DQNOBW-NEXT: korw %k2, %k0, %k0 +; AVX512DQNOBW-NEXT: kshiftlw $2, %k0, %k0 +; AVX512DQNOBW-NEXT: kshiftrw $2, %k0, %k0 
; AVX512DQNOBW-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQNOBW-NEXT: kmovw %eax, %k4 -; AVX512DQNOBW-NEXT: kshiftlw $15, %k4, %k4 -; AVX512DQNOBW-NEXT: korw %k4, %k3, %k3 +; AVX512DQNOBW-NEXT: kmovw %eax, %k2 +; AVX512DQNOBW-NEXT: kshiftlw $14, %k2, %k2 +; AVX512DQNOBW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload +; AVX512DQNOBW-NEXT: korw %k2, %k7, %k2 +; AVX512DQNOBW-NEXT: korw %k2, %k0, %k0 +; AVX512DQNOBW-NEXT: kshiftlw $1, %k0, %k0 +; AVX512DQNOBW-NEXT: kshiftrw $1, %k0, %k0 ; AVX512DQNOBW-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQNOBW-NEXT: kmovw %eax, %k4 +; AVX512DQNOBW-NEXT: kmovw %eax, %k2 +; AVX512DQNOBW-NEXT: kshiftlw $15, %k2, %k2 +; AVX512DQNOBW-NEXT: korw %k2, %k0, %k0 +; AVX512DQNOBW-NEXT: kmovw %k0, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill ; AVX512DQNOBW-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQNOBW-NEXT: kmovw %eax, %k5 -; AVX512DQNOBW-NEXT: kxorw %k1, %k5, %k1 -; AVX512DQNOBW-NEXT: kshiftrw $2, %k1, %k5 -; AVX512DQNOBW-NEXT: kxorw %k4, %k5, %k4 -; AVX512DQNOBW-NEXT: kshiftlw $15, %k4, %k4 -; AVX512DQNOBW-NEXT: kshiftrw $13, %k4, %k4 -; AVX512DQNOBW-NEXT: kxorw %k4, %k1, %k1 -; AVX512DQNOBW-NEXT: kshiftrw $3, %k1, %k4 +; AVX512DQNOBW-NEXT: kmovw %eax, %k2 +; AVX512DQNOBW-NEXT: kshiftlw $1, %k2, %k2 +; AVX512DQNOBW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload +; AVX512DQNOBW-NEXT: korw %k2, %k0, %k2 ; AVX512DQNOBW-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQNOBW-NEXT: kmovw %eax, %k5 -; AVX512DQNOBW-NEXT: kxorw %k5, %k4, %k4 -; AVX512DQNOBW-NEXT: kshiftlw $15, %k4, %k4 -; AVX512DQNOBW-NEXT: kshiftrw $12, %k4, %k4 -; AVX512DQNOBW-NEXT: kxorw %k4, %k1, %k1 -; AVX512DQNOBW-NEXT: kshiftrw $4, %k1, %k4 +; AVX512DQNOBW-NEXT: kmovw %eax, %k7 +; AVX512DQNOBW-NEXT: kshiftlw $15, %k7, %k7 +; AVX512DQNOBW-NEXT: kshiftrw $15, %k7, %k7 +; AVX512DQNOBW-NEXT: korw %k2, %k7, %k2 ; AVX512DQNOBW-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQNOBW-NEXT: kmovw %eax, %k5 -; AVX512DQNOBW-NEXT: kxorw %k5, %k4, %k4 -; AVX512DQNOBW-NEXT: 
kshiftlw $15, %k4, %k4 -; AVX512DQNOBW-NEXT: kshiftrw $11, %k4, %k4 -; AVX512DQNOBW-NEXT: kxorw %k4, %k1, %k1 -; AVX512DQNOBW-NEXT: kshiftrw $5, %k1, %k4 +; AVX512DQNOBW-NEXT: kmovw %eax, %k7 +; AVX512DQNOBW-NEXT: kshiftlw $2, %k7, %k7 +; AVX512DQNOBW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload +; AVX512DQNOBW-NEXT: korw %k7, %k0, %k7 +; AVX512DQNOBW-NEXT: kshiftlw $14, %k2, %k2 +; AVX512DQNOBW-NEXT: kshiftrw $14, %k2, %k2 +; AVX512DQNOBW-NEXT: korw %k7, %k2, %k2 ; AVX512DQNOBW-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQNOBW-NEXT: kmovw %eax, %k5 -; AVX512DQNOBW-NEXT: kxorw %k5, %k4, %k4 -; AVX512DQNOBW-NEXT: kshiftlw $15, %k4, %k4 -; AVX512DQNOBW-NEXT: kshiftrw $10, %k4, %k4 -; AVX512DQNOBW-NEXT: kxorw %k4, %k1, %k1 -; AVX512DQNOBW-NEXT: kshiftrw $6, %k1, %k4 +; AVX512DQNOBW-NEXT: kmovw %eax, %k7 +; AVX512DQNOBW-NEXT: kshiftlw $3, %k7, %k7 +; AVX512DQNOBW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload +; AVX512DQNOBW-NEXT: korw %k7, %k0, %k7 +; AVX512DQNOBW-NEXT: kshiftlw $13, %k2, %k2 +; AVX512DQNOBW-NEXT: kshiftrw $13, %k2, %k2 +; AVX512DQNOBW-NEXT: korw %k7, %k2, %k2 ; AVX512DQNOBW-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQNOBW-NEXT: kmovw %eax, %k5 -; AVX512DQNOBW-NEXT: kxorw %k5, %k4, %k4 -; AVX512DQNOBW-NEXT: kshiftlw $15, %k4, %k4 -; AVX512DQNOBW-NEXT: kshiftrw $9, %k4, %k4 -; AVX512DQNOBW-NEXT: kxorw %k4, %k1, %k1 -; AVX512DQNOBW-NEXT: kshiftrw $7, %k1, %k4 +; AVX512DQNOBW-NEXT: kmovw %eax, %k7 +; AVX512DQNOBW-NEXT: kshiftlw $4, %k7, %k7 +; AVX512DQNOBW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload +; AVX512DQNOBW-NEXT: korw %k7, %k0, %k7 +; AVX512DQNOBW-NEXT: kshiftlw $12, %k2, %k2 +; AVX512DQNOBW-NEXT: kshiftrw $12, %k2, %k2 +; AVX512DQNOBW-NEXT: korw %k7, %k2, %k2 ; AVX512DQNOBW-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQNOBW-NEXT: kmovw %eax, %k5 -; AVX512DQNOBW-NEXT: kxorw %k5, %k4, %k4 -; AVX512DQNOBW-NEXT: kshiftlw $15, %k4, %k4 -; AVX512DQNOBW-NEXT: kshiftrw $8, %k4, %k4 -; AVX512DQNOBW-NEXT: kxorw %k4, %k1, 
%k1 -; AVX512DQNOBW-NEXT: kshiftrw $8, %k1, %k4 +; AVX512DQNOBW-NEXT: kmovw %eax, %k7 +; AVX512DQNOBW-NEXT: kshiftlw $5, %k7, %k7 +; AVX512DQNOBW-NEXT: korw %k7, %k6, %k7 +; AVX512DQNOBW-NEXT: kshiftlw $11, %k2, %k2 +; AVX512DQNOBW-NEXT: kshiftrw $11, %k2, %k2 +; AVX512DQNOBW-NEXT: korw %k7, %k2, %k2 ; AVX512DQNOBW-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQNOBW-NEXT: kmovw %eax, %k5 -; AVX512DQNOBW-NEXT: kxorw %k5, %k4, %k4 -; AVX512DQNOBW-NEXT: kshiftlw $15, %k4, %k4 -; AVX512DQNOBW-NEXT: kshiftrw $7, %k4, %k4 -; AVX512DQNOBW-NEXT: kxorw %k4, %k1, %k1 -; AVX512DQNOBW-NEXT: kshiftrw $9, %k1, %k4 +; AVX512DQNOBW-NEXT: kmovw %eax, %k7 +; AVX512DQNOBW-NEXT: kshiftlw $6, %k7, %k7 +; AVX512DQNOBW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload +; AVX512DQNOBW-NEXT: korw %k7, %k0, %k7 +; AVX512DQNOBW-NEXT: kshiftlw $10, %k2, %k2 +; AVX512DQNOBW-NEXT: kshiftrw $10, %k2, %k2 +; AVX512DQNOBW-NEXT: korw %k7, %k2, %k2 ; AVX512DQNOBW-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQNOBW-NEXT: kmovw %eax, %k5 -; AVX512DQNOBW-NEXT: kxorw %k5, %k4, %k4 -; AVX512DQNOBW-NEXT: kshiftlw $15, %k4, %k4 -; AVX512DQNOBW-NEXT: kshiftrw $6, %k4, %k4 -; AVX512DQNOBW-NEXT: kxorw %k4, %k1, %k1 -; AVX512DQNOBW-NEXT: kshiftrw $10, %k1, %k4 +; AVX512DQNOBW-NEXT: kmovw %eax, %k7 +; AVX512DQNOBW-NEXT: kshiftlw $7, %k7, %k7 +; AVX512DQNOBW-NEXT: korw %k7, %k1, %k7 +; AVX512DQNOBW-NEXT: kshiftlw $9, %k2, %k2 +; AVX512DQNOBW-NEXT: kshiftrw $9, %k2, %k2 +; AVX512DQNOBW-NEXT: korw %k7, %k2, %k2 ; AVX512DQNOBW-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQNOBW-NEXT: kmovw %eax, %k5 -; AVX512DQNOBW-NEXT: kxorw %k5, %k4, %k4 -; AVX512DQNOBW-NEXT: kshiftlw $15, %k4, %k4 -; AVX512DQNOBW-NEXT: kshiftrw $5, %k4, %k4 -; AVX512DQNOBW-NEXT: kxorw %k4, %k1, %k1 -; AVX512DQNOBW-NEXT: kshiftrw $11, %k1, %k4 +; AVX512DQNOBW-NEXT: kmovw %eax, %k7 +; AVX512DQNOBW-NEXT: kshiftlw $8, %k7, %k7 +; AVX512DQNOBW-NEXT: korw %k7, %k3, %k7 +; AVX512DQNOBW-NEXT: kshiftlw $8, %k2, %k2 +; AVX512DQNOBW-NEXT: kshiftrw $8, %k2, 
%k2 +; AVX512DQNOBW-NEXT: korw %k7, %k2, %k2 ; AVX512DQNOBW-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQNOBW-NEXT: kmovw %eax, %k5 -; AVX512DQNOBW-NEXT: kxorw %k5, %k4, %k4 -; AVX512DQNOBW-NEXT: kshiftlw $15, %k4, %k4 -; AVX512DQNOBW-NEXT: kshiftrw $4, %k4, %k4 -; AVX512DQNOBW-NEXT: kxorw %k4, %k1, %k1 -; AVX512DQNOBW-NEXT: kshiftrw $12, %k1, %k4 +; AVX512DQNOBW-NEXT: kmovw %eax, %k7 +; AVX512DQNOBW-NEXT: kshiftlw $9, %k7, %k7 +; AVX512DQNOBW-NEXT: korw %k7, %k4, %k7 +; AVX512DQNOBW-NEXT: kshiftlw $7, %k2, %k2 +; AVX512DQNOBW-NEXT: kshiftrw $7, %k2, %k2 +; AVX512DQNOBW-NEXT: korw %k7, %k2, %k2 ; AVX512DQNOBW-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQNOBW-NEXT: kmovw %eax, %k5 -; AVX512DQNOBW-NEXT: kxorw %k5, %k4, %k4 -; AVX512DQNOBW-NEXT: kshiftlw $15, %k4, %k4 -; AVX512DQNOBW-NEXT: kshiftrw $3, %k4, %k4 -; AVX512DQNOBW-NEXT: kxorw %k4, %k1, %k1 -; AVX512DQNOBW-NEXT: kshiftrw $13, %k1, %k4 +; AVX512DQNOBW-NEXT: kmovw %eax, %k7 +; AVX512DQNOBW-NEXT: kshiftlw $10, %k7, %k7 +; AVX512DQNOBW-NEXT: korw %k7, %k5, %k6 +; AVX512DQNOBW-NEXT: kshiftlw $6, %k2, %k2 +; AVX512DQNOBW-NEXT: kshiftrw $6, %k2, %k2 +; AVX512DQNOBW-NEXT: korw %k6, %k2, %k2 ; AVX512DQNOBW-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQNOBW-NEXT: kmovw %eax, %k5 -; AVX512DQNOBW-NEXT: kxorw %k5, %k4, %k4 -; AVX512DQNOBW-NEXT: kshiftlw $15, %k4, %k4 -; AVX512DQNOBW-NEXT: kshiftrw $2, %k4, %k4 -; AVX512DQNOBW-NEXT: kxorw %k4, %k1, %k1 -; AVX512DQNOBW-NEXT: kshiftrw $14, %k1, %k4 +; AVX512DQNOBW-NEXT: kmovw %eax, %k6 +; AVX512DQNOBW-NEXT: kshiftlw $11, %k6, %k6 +; AVX512DQNOBW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512DQNOBW-NEXT: korw %k6, %k1, %k5 +; AVX512DQNOBW-NEXT: kshiftlw $5, %k2, %k2 +; AVX512DQNOBW-NEXT: kshiftrw $5, %k2, %k2 +; AVX512DQNOBW-NEXT: korw %k5, %k2, %k2 ; AVX512DQNOBW-NEXT: movb {{[0-9]+}}(%rsp), %al ; AVX512DQNOBW-NEXT: kmovw %eax, %k5 -; AVX512DQNOBW-NEXT: kxorw %k5, %k4, %k4 -; AVX512DQNOBW-NEXT: kshiftlw $14, %k4, %k4 -; AVX512DQNOBW-NEXT: kxorw %k4, %k1, 
%k1 +; AVX512DQNOBW-NEXT: kshiftlw $12, %k5, %k5 +; AVX512DQNOBW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512DQNOBW-NEXT: korw %k5, %k1, %k4 +; AVX512DQNOBW-NEXT: kshiftlw $4, %k2, %k2 +; AVX512DQNOBW-NEXT: kshiftrw $4, %k2, %k2 +; AVX512DQNOBW-NEXT: korw %k4, %k2, %k2 +; AVX512DQNOBW-NEXT: movb {{[0-9]+}}(%rsp), %al +; AVX512DQNOBW-NEXT: kmovw %eax, %k4 +; AVX512DQNOBW-NEXT: kshiftlw $13, %k4, %k4 +; AVX512DQNOBW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512DQNOBW-NEXT: korw %k4, %k1, %k3 +; AVX512DQNOBW-NEXT: kshiftlw $3, %k2, %k2 +; AVX512DQNOBW-NEXT: kshiftrw $3, %k2, %k2 +; AVX512DQNOBW-NEXT: korw %k3, %k2, %k2 +; AVX512DQNOBW-NEXT: movb {{[0-9]+}}(%rsp), %al +; AVX512DQNOBW-NEXT: kmovw %eax, %k3 +; AVX512DQNOBW-NEXT: kshiftlw $14, %k3, %k3 +; AVX512DQNOBW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512DQNOBW-NEXT: korw %k3, %k1, %k1 +; AVX512DQNOBW-NEXT: kshiftlw $2, %k2, %k2 +; AVX512DQNOBW-NEXT: kshiftrw $2, %k2, %k2 +; AVX512DQNOBW-NEXT: korw %k1, %k2, %k1 ; AVX512DQNOBW-NEXT: kshiftlw $1, %k1, %k1 ; AVX512DQNOBW-NEXT: kshiftrw $1, %k1, %k1 ; AVX512DQNOBW-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQNOBW-NEXT: kmovw %eax, %k4 -; AVX512DQNOBW-NEXT: kshiftlw $15, %k4, %k4 -; AVX512DQNOBW-NEXT: korw %k4, %k1, %k1 +; AVX512DQNOBW-NEXT: kmovw %eax, %k2 +; AVX512DQNOBW-NEXT: kshiftlw $15, %k2, %k2 +; AVX512DQNOBW-NEXT: korw %k2, %k1, %k1 ; AVX512DQNOBW-NEXT: vpmovm2d %k1, %zmm4 +; AVX512DQNOBW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload +; AVX512DQNOBW-NEXT: vpmovm2d %k0, %zmm5 +; AVX512DQNOBW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload +; AVX512DQNOBW-NEXT: vpmovm2d %k0, %zmm6 +; AVX512DQNOBW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload +; AVX512DQNOBW-NEXT: vpmovm2d %k0, %zmm7 ; AVX512DQNOBW-NEXT: vpmovdw %zmm4, %ymm4 ; AVX512DQNOBW-NEXT: vpand %ymm1, %ymm4, %ymm1 -; AVX512DQNOBW-NEXT: vpmovm2d %k3, %zmm4 -; AVX512DQNOBW-NEXT: vpmovdw %zmm4, %ymm4 +; 
AVX512DQNOBW-NEXT: vpmovdw %zmm5, %ymm4 ; AVX512DQNOBW-NEXT: vpand %ymm2, %ymm4, %ymm2 -; AVX512DQNOBW-NEXT: vpmovm2d %k2, %zmm4 -; AVX512DQNOBW-NEXT: vpmovdw %zmm4, %ymm4 +; AVX512DQNOBW-NEXT: vpmovdw %zmm6, %ymm4 ; AVX512DQNOBW-NEXT: vpand %ymm3, %ymm4, %ymm3 -; AVX512DQNOBW-NEXT: vpmovm2d %k0, %zmm4 -; AVX512DQNOBW-NEXT: vpmovdw %zmm4, %ymm4 +; AVX512DQNOBW-NEXT: vpmovdw %zmm7, %ymm4 ; AVX512DQNOBW-NEXT: vpand %ymm0, %ymm4, %ymm0 ; AVX512DQNOBW-NEXT: retq %ret = select <64 x i1> %mask, <64 x i16> %x, <64 x i16> zeroinitializer diff --git a/llvm/test/CodeGen/X86/avx512-insert-extract.ll b/llvm/test/CodeGen/X86/avx512-insert-extract.ll index d372202..6e36bd1 100644 --- a/llvm/test/CodeGen/X86/avx512-insert-extract.ll +++ b/llvm/test/CodeGen/X86/avx512-insert-extract.ll @@ -302,12 +302,15 @@ define i16 @test16(i1 *%addr, i16 %a) { ; KNL: ## %bb.0: ; KNL-NEXT: movb (%rdi), %al ; KNL-NEXT: kmovw %esi, %k0 -; KNL-NEXT: kmovw %eax, %k1 -; KNL-NEXT: kshiftrw $10, %k0, %k2 -; KNL-NEXT: kxorw %k1, %k2, %k1 -; KNL-NEXT: kshiftlw $15, %k1, %k1 -; KNL-NEXT: kshiftrw $5, %k1, %k1 -; KNL-NEXT: kxorw %k1, %k0, %k0 +; KNL-NEXT: kshiftrw $11, %k0, %k1 +; KNL-NEXT: kshiftlw $11, %k1, %k1 +; KNL-NEXT: kshiftlw $6, %k0, %k0 +; KNL-NEXT: kshiftrw $6, %k0, %k0 +; KNL-NEXT: kmovw %eax, %k2 +; KNL-NEXT: kshiftlw $15, %k2, %k2 +; KNL-NEXT: kshiftrw $5, %k2, %k2 +; KNL-NEXT: korw %k2, %k1, %k1 +; KNL-NEXT: korw %k1, %k0, %k0 ; KNL-NEXT: kmovw %k0, %eax ; KNL-NEXT: ## kill: def $ax killed $ax killed $eax ; KNL-NEXT: retq @@ -316,11 +319,14 @@ define i16 @test16(i1 *%addr, i16 %a) { ; SKX: ## %bb.0: ; SKX-NEXT: kmovb (%rdi), %k0 ; SKX-NEXT: kmovd %esi, %k1 -; SKX-NEXT: kshiftrw $10, %k1, %k2 -; SKX-NEXT: kxorw %k0, %k2, %k0 +; SKX-NEXT: kshiftrw $11, %k1, %k2 +; SKX-NEXT: kshiftlw $11, %k2, %k2 +; SKX-NEXT: kshiftlw $6, %k1, %k1 +; SKX-NEXT: kshiftrw $6, %k1, %k1 ; SKX-NEXT: kshiftlw $15, %k0, %k0 ; SKX-NEXT: kshiftrw $5, %k0, %k0 -; SKX-NEXT: kxorw %k0, %k1, %k0 +; SKX-NEXT: korw %k0, %k2, 
%k0 +; SKX-NEXT: korw %k0, %k1, %k0 ; SKX-NEXT: kmovd %k0, %eax ; SKX-NEXT: ## kill: def $ax killed $ax killed $eax ; SKX-NEXT: retq @@ -336,12 +342,15 @@ define i8 @test17(i1 *%addr, i8 %a) { ; KNL: ## %bb.0: ; KNL-NEXT: movb (%rdi), %al ; KNL-NEXT: kmovw %esi, %k0 +; KNL-NEXT: kshiftrw $5, %k0, %k1 +; KNL-NEXT: kshiftlw $5, %k1, %k1 +; KNL-NEXT: kshiftlw $12, %k0, %k0 +; KNL-NEXT: kshiftrw $12, %k0, %k0 +; KNL-NEXT: korw %k1, %k0, %k0 ; KNL-NEXT: kmovw %eax, %k1 -; KNL-NEXT: kshiftrw $4, %k0, %k2 -; KNL-NEXT: kxorw %k1, %k2, %k1 ; KNL-NEXT: kshiftlw $15, %k1, %k1 ; KNL-NEXT: kshiftrw $11, %k1, %k1 -; KNL-NEXT: kxorw %k1, %k0, %k0 +; KNL-NEXT: korw %k0, %k1, %k0 ; KNL-NEXT: kmovw %k0, %eax ; KNL-NEXT: ## kill: def $al killed $al killed $eax ; KNL-NEXT: retq @@ -350,11 +359,14 @@ define i8 @test17(i1 *%addr, i8 %a) { ; SKX: ## %bb.0: ; SKX-NEXT: kmovb (%rdi), %k0 ; SKX-NEXT: kmovd %esi, %k1 -; SKX-NEXT: kshiftrb $4, %k1, %k2 -; SKX-NEXT: kxorb %k0, %k2, %k0 +; SKX-NEXT: kshiftrb $5, %k1, %k2 +; SKX-NEXT: kshiftlb $5, %k2, %k2 +; SKX-NEXT: kshiftlb $4, %k1, %k1 +; SKX-NEXT: kshiftrb $4, %k1, %k1 ; SKX-NEXT: kshiftlb $7, %k0, %k0 ; SKX-NEXT: kshiftrb $3, %k0, %k0 -; SKX-NEXT: kxorb %k0, %k1, %k0 +; SKX-NEXT: korb %k0, %k2, %k0 +; SKX-NEXT: korb %k0, %k1, %k0 ; SKX-NEXT: kmovd %k0, %eax ; SKX-NEXT: ## kill: def $al killed $al killed $eax ; SKX-NEXT: retq @@ -790,12 +802,15 @@ define i32 @test_insertelement_v32i1(i32 %a, i32 %b, <32 x i32> %x , <32 x i32> ; KNL-NEXT: kmovw %k0, %ecx ; KNL-NEXT: shll $16, %ecx ; KNL-NEXT: vpcmpltud %zmm2, %zmm0, %k0 -; KNL-NEXT: kshiftrw $4, %k0, %k1 +; KNL-NEXT: kshiftrw $5, %k0, %k1 +; KNL-NEXT: kshiftlw $5, %k1, %k1 +; KNL-NEXT: kshiftlw $12, %k0, %k0 +; KNL-NEXT: kshiftrw $12, %k0, %k0 ; KNL-NEXT: kmovw %eax, %k2 -; KNL-NEXT: kxorw %k2, %k1, %k1 -; KNL-NEXT: kshiftlw $15, %k1, %k1 -; KNL-NEXT: kshiftrw $11, %k1, %k1 -; KNL-NEXT: kxorw %k1, %k0, %k0 +; KNL-NEXT: kshiftlw $15, %k2, %k2 +; KNL-NEXT: kshiftrw $11, %k2, %k2 +; KNL-NEXT: 
korw %k2, %k1, %k1 +; KNL-NEXT: korw %k1, %k0, %k0 ; KNL-NEXT: kmovw %k0, %eax ; KNL-NEXT: orl %ecx, %eax ; KNL-NEXT: vzeroupper @@ -808,12 +823,15 @@ define i32 @test_insertelement_v32i1(i32 %a, i32 %b, <32 x i32> %x , <32 x i32> ; SKX-NEXT: vpcmpltud %zmm2, %zmm0, %k0 ; SKX-NEXT: vpcmpltud %zmm3, %zmm1, %k1 ; SKX-NEXT: kunpckwd %k0, %k1, %k0 -; SKX-NEXT: kshiftrd $4, %k0, %k1 +; SKX-NEXT: kshiftrd $5, %k0, %k1 +; SKX-NEXT: kshiftld $5, %k1, %k1 +; SKX-NEXT: kshiftld $28, %k0, %k0 +; SKX-NEXT: kshiftrd $28, %k0, %k0 ; SKX-NEXT: kmovd %eax, %k2 -; SKX-NEXT: kxord %k2, %k1, %k1 -; SKX-NEXT: kshiftld $31, %k1, %k1 -; SKX-NEXT: kshiftrd $27, %k1, %k1 -; SKX-NEXT: kxord %k1, %k0, %k0 +; SKX-NEXT: kshiftld $31, %k2, %k2 +; SKX-NEXT: kshiftrd $27, %k2, %k2 +; SKX-NEXT: kord %k2, %k1, %k1 +; SKX-NEXT: kord %k1, %k0, %k0 ; SKX-NEXT: kmovd %k0, %eax ; SKX-NEXT: vzeroupper ; SKX-NEXT: retq @@ -832,12 +850,15 @@ define i8 @test_iinsertelement_v4i1(i32 %a, i32 %b, <4 x i32> %x , <4 x i32> %y) ; KNL-NEXT: cmpl %esi, %edi ; KNL-NEXT: setb %al ; KNL-NEXT: vpcmpltud %zmm1, %zmm0, %k0 -; KNL-NEXT: kshiftrw $2, %k0, %k1 -; KNL-NEXT: kmovw %eax, %k2 -; KNL-NEXT: kxorw %k2, %k1, %k1 +; KNL-NEXT: kshiftrw $3, %k0, %k1 +; KNL-NEXT: kshiftlw $3, %k1, %k1 +; KNL-NEXT: kshiftlw $14, %k0, %k0 +; KNL-NEXT: kshiftrw $14, %k0, %k0 +; KNL-NEXT: korw %k1, %k0, %k0 +; KNL-NEXT: kmovw %eax, %k1 ; KNL-NEXT: kshiftlw $15, %k1, %k1 ; KNL-NEXT: kshiftrw $13, %k1, %k1 -; KNL-NEXT: kxorw %k1, %k0, %k0 +; KNL-NEXT: korw %k0, %k1, %k0 ; KNL-NEXT: kmovw %k0, %eax ; KNL-NEXT: ## kill: def $al killed $al killed $eax ; KNL-NEXT: vzeroupper @@ -848,12 +869,15 @@ define i8 @test_iinsertelement_v4i1(i32 %a, i32 %b, <4 x i32> %x , <4 x i32> %y) ; SKX-NEXT: cmpl %esi, %edi ; SKX-NEXT: setb %al ; SKX-NEXT: vpcmpltud %xmm1, %xmm0, %k0 -; SKX-NEXT: kshiftrb $2, %k0, %k1 -; SKX-NEXT: kmovd %eax, %k2 -; SKX-NEXT: kxorb %k2, %k1, %k1 +; SKX-NEXT: kshiftrb $3, %k0, %k1 +; SKX-NEXT: kshiftlb $3, %k1, %k1 +; SKX-NEXT: 
kshiftlb $6, %k0, %k0 +; SKX-NEXT: kshiftrb $6, %k0, %k0 +; SKX-NEXT: korw %k1, %k0, %k0 +; SKX-NEXT: kmovd %eax, %k1 ; SKX-NEXT: kshiftlb $7, %k1, %k1 ; SKX-NEXT: kshiftrb $5, %k1, %k1 -; SKX-NEXT: kxorw %k1, %k0, %k0 +; SKX-NEXT: korw %k0, %k1, %k0 ; SKX-NEXT: kmovd %k0, %eax ; SKX-NEXT: ## kill: def $al killed $al killed $eax ; SKX-NEXT: retq diff --git a/llvm/test/CodeGen/X86/avx512-mask-op.ll b/llvm/test/CodeGen/X86/avx512-mask-op.ll index ea9742a..19f9a2a 100644 --- a/llvm/test/CodeGen/X86/avx512-mask-op.ll +++ b/llvm/test/CodeGen/X86/avx512-mask-op.ll @@ -1069,12 +1069,16 @@ define <64 x i8> @test16(i64 %x) { ; KNL-NEXT: kmovw %ecx, %k1 ; KNL-NEXT: kmovw %eax, %k2 ; KNL-NEXT: kmovw %edi, %k3 -; KNL-NEXT: kshiftrw $5, %k0, %k4 -; KNL-NEXT: kxnorw %k0, %k0, %k5 -; KNL-NEXT: kxorw %k5, %k4, %k4 -; KNL-NEXT: kshiftlw $15, %k4, %k4 -; KNL-NEXT: kshiftrw $10, %k4, %k4 -; KNL-NEXT: kxorw %k4, %k0, %k4 +; KNL-NEXT: kshiftrw $6, %k0, %k4 +; KNL-NEXT: kshiftlw $6, %k4, %k4 +; KNL-NEXT: kshiftlw $11, %k0, %k0 +; KNL-NEXT: kshiftrw $11, %k0, %k0 +; KNL-NEXT: movb $1, %al +; KNL-NEXT: kmovw %eax, %k5 +; KNL-NEXT: kshiftlw $15, %k5, %k5 +; KNL-NEXT: kshiftrw $10, %k5, %k5 +; KNL-NEXT: korw %k5, %k4, %k4 +; KNL-NEXT: korw %k4, %k0, %k4 ; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k3} {z} ; KNL-NEXT: vpmovdb %zmm0, %xmm0 ; KNL-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k2} {z} @@ -1091,24 +1095,32 @@ define <64 x i8> @test16(i64 %x) { ; SKX-LABEL: test16: ; SKX: ## %bb.0: ; SKX-NEXT: kmovq %rdi, %k0 -; SKX-NEXT: kxnorw %k0, %k0, %k1 -; SKX-NEXT: kshiftrq $5, %k0, %k2 -; SKX-NEXT: kxorq %k1, %k2, %k1 -; SKX-NEXT: kshiftlq $63, %k1, %k1 -; SKX-NEXT: kshiftrq $58, %k1, %k1 -; SKX-NEXT: kxorq %k1, %k0, %k0 +; SKX-NEXT: kshiftrq $6, %k0, %k1 +; SKX-NEXT: kshiftlq $6, %k1, %k1 +; SKX-NEXT: kshiftlq $59, %k0, %k0 +; SKX-NEXT: kshiftrq $59, %k0, %k0 +; SKX-NEXT: movb $1, %al +; SKX-NEXT: kmovd %eax, %k2 +; SKX-NEXT: kshiftlq $63, %k2, %k2 +; SKX-NEXT: kshiftrq $58, %k2, %k2 
+; SKX-NEXT: korq %k2, %k1, %k1 +; SKX-NEXT: korq %k1, %k0, %k0 ; SKX-NEXT: vpmovm2b %k0, %zmm0 ; SKX-NEXT: retq ; ; AVX512BW-LABEL: test16: ; AVX512BW: ## %bb.0: ; AVX512BW-NEXT: kmovq %rdi, %k0 -; AVX512BW-NEXT: kxnorw %k0, %k0, %k1 -; AVX512BW-NEXT: kshiftrq $5, %k0, %k2 -; AVX512BW-NEXT: kxorq %k1, %k2, %k1 -; AVX512BW-NEXT: kshiftlq $63, %k1, %k1 -; AVX512BW-NEXT: kshiftrq $58, %k1, %k1 -; AVX512BW-NEXT: kxorq %k1, %k0, %k0 +; AVX512BW-NEXT: kshiftrq $6, %k0, %k1 +; AVX512BW-NEXT: kshiftlq $6, %k1, %k1 +; AVX512BW-NEXT: kshiftlq $59, %k0, %k0 +; AVX512BW-NEXT: kshiftrq $59, %k0, %k0 +; AVX512BW-NEXT: movb $1, %al +; AVX512BW-NEXT: kmovd %eax, %k2 +; AVX512BW-NEXT: kshiftlq $63, %k2, %k2 +; AVX512BW-NEXT: kshiftrq $58, %k2, %k2 +; AVX512BW-NEXT: korq %k2, %k1, %k1 +; AVX512BW-NEXT: korq %k1, %k0, %k0 ; AVX512BW-NEXT: vpmovm2b %k0, %zmm0 ; AVX512BW-NEXT: retq ; @@ -1116,27 +1128,31 @@ define <64 x i8> @test16(i64 %x) { ; AVX512DQ: ## %bb.0: ; AVX512DQ-NEXT: movq %rdi, %rax ; AVX512DQ-NEXT: movl %edi, %ecx -; AVX512DQ-NEXT: kmovw %edi, %k0 +; AVX512DQ-NEXT: kmovw %edi, %k1 ; AVX512DQ-NEXT: shrq $32, %rdi ; AVX512DQ-NEXT: shrq $48, %rax ; AVX512DQ-NEXT: shrl $16, %ecx -; AVX512DQ-NEXT: kmovw %ecx, %k1 +; AVX512DQ-NEXT: kmovw %ecx, %k0 ; AVX512DQ-NEXT: kmovw %eax, %k2 ; AVX512DQ-NEXT: kmovw %edi, %k3 -; AVX512DQ-NEXT: kshiftrw $5, %k0, %k4 -; AVX512DQ-NEXT: kxnorw %k0, %k0, %k5 -; AVX512DQ-NEXT: kxorw %k5, %k4, %k4 -; AVX512DQ-NEXT: kshiftlw $15, %k4, %k4 -; AVX512DQ-NEXT: kshiftrw $10, %k4, %k4 -; AVX512DQ-NEXT: kxorw %k4, %k0, %k0 +; AVX512DQ-NEXT: kshiftrw $6, %k1, %k4 +; AVX512DQ-NEXT: kshiftlw $6, %k4, %k4 +; AVX512DQ-NEXT: kshiftlw $11, %k1, %k1 +; AVX512DQ-NEXT: kshiftrw $11, %k1, %k1 +; AVX512DQ-NEXT: movb $1, %al +; AVX512DQ-NEXT: kmovw %eax, %k5 +; AVX512DQ-NEXT: kshiftlw $15, %k5, %k5 +; AVX512DQ-NEXT: kshiftrw $10, %k5, %k5 +; AVX512DQ-NEXT: korw %k5, %k4, %k4 +; AVX512DQ-NEXT: korw %k4, %k1, %k1 ; AVX512DQ-NEXT: vpmovm2d %k3, %zmm0 ; AVX512DQ-NEXT: 
vpmovdb %zmm0, %xmm0 ; AVX512DQ-NEXT: vpmovm2d %k2, %zmm1 ; AVX512DQ-NEXT: vpmovdb %zmm1, %xmm1 ; AVX512DQ-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; AVX512DQ-NEXT: vpmovm2d %k0, %zmm1 +; AVX512DQ-NEXT: vpmovm2d %k1, %zmm1 ; AVX512DQ-NEXT: vpmovdb %zmm1, %xmm1 -; AVX512DQ-NEXT: vpmovm2d %k1, %zmm2 +; AVX512DQ-NEXT: vpmovm2d %k0, %zmm2 ; AVX512DQ-NEXT: vpmovdb %zmm2, %xmm2 ; AVX512DQ-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 @@ -1145,12 +1161,16 @@ define <64 x i8> @test16(i64 %x) { ; X86-LABEL: test16: ; X86: ## %bb.0: ; X86-NEXT: kmovq {{[0-9]+}}(%esp), %k0 -; X86-NEXT: kshiftrq $5, %k0, %k1 -; X86-NEXT: kxnorw %k0, %k0, %k2 -; X86-NEXT: kxorq %k2, %k1, %k1 -; X86-NEXT: kshiftlq $63, %k1, %k1 -; X86-NEXT: kshiftrq $58, %k1, %k1 -; X86-NEXT: kxorq %k1, %k0, %k0 +; X86-NEXT: kshiftrq $6, %k0, %k1 +; X86-NEXT: kshiftlq $6, %k1, %k1 +; X86-NEXT: kshiftlq $59, %k0, %k0 +; X86-NEXT: kshiftrq $59, %k0, %k0 +; X86-NEXT: movb $1, %al +; X86-NEXT: kmovd %eax, %k2 +; X86-NEXT: kshiftlq $63, %k2, %k2 +; X86-NEXT: kshiftrq $58, %k2, %k2 +; X86-NEXT: korq %k2, %k1, %k1 +; X86-NEXT: korq %k1, %k0, %k0 ; X86-NEXT: vpmovm2b %k0, %zmm0 ; X86-NEXT: retl %a = bitcast i64 %x to <64 x i1> @@ -1174,12 +1194,15 @@ define <64 x i8> @test17(i64 %x, i32 %y, i32 %z) { ; KNL-NEXT: kmovw %edi, %k3 ; KNL-NEXT: cmpl %edx, %esi ; KNL-NEXT: setg %al -; KNL-NEXT: kshiftrw $5, %k0, %k4 +; KNL-NEXT: kshiftrw $6, %k0, %k4 +; KNL-NEXT: kshiftlw $6, %k4, %k4 +; KNL-NEXT: kshiftlw $11, %k0, %k0 +; KNL-NEXT: kshiftrw $11, %k0, %k0 ; KNL-NEXT: kmovw %eax, %k5 -; KNL-NEXT: kxorw %k5, %k4, %k4 -; KNL-NEXT: kshiftlw $15, %k4, %k4 -; KNL-NEXT: kshiftrw $10, %k4, %k4 -; KNL-NEXT: kxorw %k4, %k0, %k4 +; KNL-NEXT: kshiftlw $15, %k5, %k5 +; KNL-NEXT: kshiftrw $10, %k5, %k5 +; KNL-NEXT: korw %k5, %k4, %k4 +; KNL-NEXT: korw %k4, %k0, %k4 ; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k3} {z} ; KNL-NEXT: vpmovdb %zmm0, %xmm0 ; KNL-NEXT: vpternlogd $255, 
%zmm1, %zmm1, %zmm1 {%k2} {z} @@ -1198,12 +1221,15 @@ define <64 x i8> @test17(i64 %x, i32 %y, i32 %z) { ; SKX-NEXT: kmovq %rdi, %k0 ; SKX-NEXT: cmpl %edx, %esi ; SKX-NEXT: setg %al -; SKX-NEXT: kmovd %eax, %k1 -; SKX-NEXT: kshiftrq $5, %k0, %k2 -; SKX-NEXT: kxorq %k1, %k2, %k1 -; SKX-NEXT: kshiftlq $63, %k1, %k1 -; SKX-NEXT: kshiftrq $58, %k1, %k1 -; SKX-NEXT: kxorq %k1, %k0, %k0 +; SKX-NEXT: kshiftrq $6, %k0, %k1 +; SKX-NEXT: kshiftlq $6, %k1, %k1 +; SKX-NEXT: kshiftlq $59, %k0, %k0 +; SKX-NEXT: kshiftrq $59, %k0, %k0 +; SKX-NEXT: kmovd %eax, %k2 +; SKX-NEXT: kshiftlq $63, %k2, %k2 +; SKX-NEXT: kshiftrq $58, %k2, %k2 +; SKX-NEXT: korq %k2, %k1, %k1 +; SKX-NEXT: korq %k1, %k0, %k0 ; SKX-NEXT: vpmovm2b %k0, %zmm0 ; SKX-NEXT: retq ; @@ -1212,12 +1238,15 @@ define <64 x i8> @test17(i64 %x, i32 %y, i32 %z) { ; AVX512BW-NEXT: kmovq %rdi, %k0 ; AVX512BW-NEXT: cmpl %edx, %esi ; AVX512BW-NEXT: setg %al -; AVX512BW-NEXT: kmovd %eax, %k1 -; AVX512BW-NEXT: kshiftrq $5, %k0, %k2 -; AVX512BW-NEXT: kxorq %k1, %k2, %k1 -; AVX512BW-NEXT: kshiftlq $63, %k1, %k1 -; AVX512BW-NEXT: kshiftrq $58, %k1, %k1 -; AVX512BW-NEXT: kxorq %k1, %k0, %k0 +; AVX512BW-NEXT: kshiftrq $6, %k0, %k1 +; AVX512BW-NEXT: kshiftlq $6, %k1, %k1 +; AVX512BW-NEXT: kshiftlq $59, %k0, %k0 +; AVX512BW-NEXT: kshiftrq $59, %k0, %k0 +; AVX512BW-NEXT: kmovd %eax, %k2 +; AVX512BW-NEXT: kshiftlq $63, %k2, %k2 +; AVX512BW-NEXT: kshiftrq $58, %k2, %k2 +; AVX512BW-NEXT: korq %k2, %k1, %k1 +; AVX512BW-NEXT: korq %k1, %k0, %k0 ; AVX512BW-NEXT: vpmovm2b %k0, %zmm0 ; AVX512BW-NEXT: retq ; @@ -1225,29 +1254,32 @@ define <64 x i8> @test17(i64 %x, i32 %y, i32 %z) { ; AVX512DQ: ## %bb.0: ; AVX512DQ-NEXT: movq %rdi, %rax ; AVX512DQ-NEXT: movl %edi, %ecx -; AVX512DQ-NEXT: kmovw %edi, %k0 +; AVX512DQ-NEXT: kmovw %edi, %k1 ; AVX512DQ-NEXT: shrq $32, %rdi ; AVX512DQ-NEXT: shrq $48, %rax ; AVX512DQ-NEXT: shrl $16, %ecx -; AVX512DQ-NEXT: kmovw %ecx, %k1 +; AVX512DQ-NEXT: kmovw %ecx, %k0 ; AVX512DQ-NEXT: kmovw %eax, %k2 ; AVX512DQ-NEXT: 
kmovw %edi, %k3 ; AVX512DQ-NEXT: cmpl %edx, %esi ; AVX512DQ-NEXT: setg %al -; AVX512DQ-NEXT: kshiftrw $5, %k0, %k4 +; AVX512DQ-NEXT: kshiftrw $6, %k1, %k4 +; AVX512DQ-NEXT: kshiftlw $6, %k4, %k4 +; AVX512DQ-NEXT: kshiftlw $11, %k1, %k1 +; AVX512DQ-NEXT: kshiftrw $11, %k1, %k1 ; AVX512DQ-NEXT: kmovw %eax, %k5 -; AVX512DQ-NEXT: kxorw %k5, %k4, %k4 -; AVX512DQ-NEXT: kshiftlw $15, %k4, %k4 -; AVX512DQ-NEXT: kshiftrw $10, %k4, %k4 -; AVX512DQ-NEXT: kxorw %k4, %k0, %k0 +; AVX512DQ-NEXT: kshiftlw $15, %k5, %k5 +; AVX512DQ-NEXT: kshiftrw $10, %k5, %k5 +; AVX512DQ-NEXT: korw %k5, %k4, %k4 +; AVX512DQ-NEXT: korw %k4, %k1, %k1 ; AVX512DQ-NEXT: vpmovm2d %k3, %zmm0 ; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0 ; AVX512DQ-NEXT: vpmovm2d %k2, %zmm1 ; AVX512DQ-NEXT: vpmovdb %zmm1, %xmm1 ; AVX512DQ-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; AVX512DQ-NEXT: vpmovm2d %k0, %zmm1 +; AVX512DQ-NEXT: vpmovm2d %k1, %zmm1 ; AVX512DQ-NEXT: vpmovdb %zmm1, %xmm1 -; AVX512DQ-NEXT: vpmovm2d %k1, %zmm2 +; AVX512DQ-NEXT: vpmovm2d %k0, %zmm2 ; AVX512DQ-NEXT: vpmovdb %zmm2, %xmm2 ; AVX512DQ-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 @@ -1259,12 +1291,15 @@ define <64 x i8> @test17(i64 %x, i32 %y, i32 %z) { ; X86-NEXT: kmovq {{[0-9]+}}(%esp), %k0 ; X86-NEXT: cmpl {{[0-9]+}}(%esp), %eax ; X86-NEXT: setg %al -; X86-NEXT: kmovd %eax, %k1 -; X86-NEXT: kshiftrq $5, %k0, %k2 -; X86-NEXT: kxorq %k1, %k2, %k1 -; X86-NEXT: kshiftlq $63, %k1, %k1 -; X86-NEXT: kshiftrq $58, %k1, %k1 -; X86-NEXT: kxorq %k1, %k0, %k0 +; X86-NEXT: kshiftrq $6, %k0, %k1 +; X86-NEXT: kshiftlq $6, %k1, %k1 +; X86-NEXT: kshiftlq $59, %k0, %k0 +; X86-NEXT: kshiftrq $59, %k0, %k0 +; X86-NEXT: kmovd %eax, %k2 +; X86-NEXT: kshiftlq $63, %k2, %k2 +; X86-NEXT: kshiftrq $58, %k2, %k2 +; X86-NEXT: korq %k2, %k1, %k1 +; X86-NEXT: korq %k1, %k0, %k0 ; X86-NEXT: vpmovm2b %k0, %zmm0 ; X86-NEXT: retl %a = bitcast i64 %x to <64 x i1> @@ -1281,10 +1316,12 @@ define <8 x i1> @test18(i8 %a, i16 %y) { ; 
KNL-NEXT: kmovw %esi, %k1 ; KNL-NEXT: kshiftrw $8, %k1, %k2 ; KNL-NEXT: kshiftrw $9, %k1, %k1 -; KNL-NEXT: kshiftrw $6, %k0, %k3 -; KNL-NEXT: kxorw %k1, %k3, %k1 +; KNL-NEXT: kshiftlw $10, %k0, %k0 +; KNL-NEXT: kshiftrw $10, %k0, %k0 +; KNL-NEXT: kshiftlw $7, %k0, %k3 ; KNL-NEXT: kshiftlw $6, %k1, %k1 -; KNL-NEXT: kxorw %k1, %k0, %k0 +; KNL-NEXT: korw %k1, %k3, %k1 +; KNL-NEXT: korw %k1, %k0, %k0 ; KNL-NEXT: kshiftlw $9, %k0, %k0 ; KNL-NEXT: kshiftrw $9, %k0, %k0 ; KNL-NEXT: kshiftlw $7, %k2, %k1 @@ -1301,10 +1338,12 @@ define <8 x i1> @test18(i8 %a, i16 %y) { ; SKX-NEXT: kmovd %esi, %k1 ; SKX-NEXT: kshiftrw $8, %k1, %k2 ; SKX-NEXT: kshiftrw $9, %k1, %k1 -; SKX-NEXT: kshiftrb $6, %k0, %k3 -; SKX-NEXT: kxorb %k1, %k3, %k1 +; SKX-NEXT: kshiftlb $2, %k0, %k0 +; SKX-NEXT: kshiftrb $2, %k0, %k0 +; SKX-NEXT: kshiftlb $7, %k0, %k3 ; SKX-NEXT: kshiftlb $6, %k1, %k1 -; SKX-NEXT: kxorb %k1, %k0, %k0 +; SKX-NEXT: korb %k1, %k3, %k1 +; SKX-NEXT: korb %k1, %k0, %k0 ; SKX-NEXT: kshiftlb $1, %k0, %k0 ; SKX-NEXT: kshiftrb $1, %k0, %k0 ; SKX-NEXT: kshiftlb $7, %k2, %k1 @@ -1318,10 +1357,12 @@ define <8 x i1> @test18(i8 %a, i16 %y) { ; AVX512BW-NEXT: kmovd %esi, %k1 ; AVX512BW-NEXT: kshiftrw $8, %k1, %k2 ; AVX512BW-NEXT: kshiftrw $9, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $6, %k0, %k3 -; AVX512BW-NEXT: kxorw %k1, %k3, %k1 +; AVX512BW-NEXT: kshiftlw $10, %k0, %k0 +; AVX512BW-NEXT: kshiftrw $10, %k0, %k0 +; AVX512BW-NEXT: kshiftlw $7, %k0, %k3 ; AVX512BW-NEXT: kshiftlw $6, %k1, %k1 -; AVX512BW-NEXT: kxorw %k1, %k0, %k0 +; AVX512BW-NEXT: korw %k1, %k3, %k1 +; AVX512BW-NEXT: korw %k1, %k0, %k0 ; AVX512BW-NEXT: kshiftlw $9, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $9, %k0, %k0 ; AVX512BW-NEXT: kshiftlw $7, %k2, %k1 @@ -1337,10 +1378,12 @@ define <8 x i1> @test18(i8 %a, i16 %y) { ; AVX512DQ-NEXT: kmovw %esi, %k1 ; AVX512DQ-NEXT: kshiftrw $8, %k1, %k2 ; AVX512DQ-NEXT: kshiftrw $9, %k1, %k1 -; AVX512DQ-NEXT: kshiftrb $6, %k0, %k3 -; AVX512DQ-NEXT: kxorb %k1, %k3, %k1 +; AVX512DQ-NEXT: kshiftlb $2, 
%k0, %k0 +; AVX512DQ-NEXT: kshiftrb $2, %k0, %k0 +; AVX512DQ-NEXT: kshiftlb $7, %k0, %k3 ; AVX512DQ-NEXT: kshiftlb $6, %k1, %k1 -; AVX512DQ-NEXT: kxorb %k1, %k0, %k0 +; AVX512DQ-NEXT: korb %k1, %k3, %k1 +; AVX512DQ-NEXT: korb %k1, %k0, %k0 ; AVX512DQ-NEXT: kshiftlb $1, %k0, %k0 ; AVX512DQ-NEXT: kshiftrb $1, %k0, %k0 ; AVX512DQ-NEXT: kshiftlb $7, %k2, %k1 @@ -1357,10 +1400,12 @@ define <8 x i1> @test18(i8 %a, i16 %y) { ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 ; X86-NEXT: kshiftrw $8, %k1, %k2 ; X86-NEXT: kshiftrw $9, %k1, %k1 -; X86-NEXT: kshiftrb $6, %k0, %k3 -; X86-NEXT: kxorb %k1, %k3, %k1 +; X86-NEXT: kshiftlb $7, %k0, %k3 +; X86-NEXT: kshiftlb $2, %k0, %k0 +; X86-NEXT: kshiftrb $2, %k0, %k0 ; X86-NEXT: kshiftlb $6, %k1, %k1 -; X86-NEXT: kxorb %k1, %k0, %k0 +; X86-NEXT: korb %k1, %k3, %k1 +; X86-NEXT: korb %k1, %k0, %k0 ; X86-NEXT: kshiftlb $1, %k0, %k0 ; X86-NEXT: kshiftrb $1, %k0, %k0 ; X86-NEXT: kshiftlb $7, %k2, %k1 @@ -2748,403 +2793,488 @@ define void @store_64i1(<64 x i1>* %a, <64 x i1> %v) { ; ; KNL-LABEL: store_64i1: ; KNL: ## %bb.0: -; KNL-NEXT: kmovw %ecx, %k0 -; KNL-NEXT: kmovw %esi, %k2 -; KNL-NEXT: kshiftlw $15, %k0, %k1 -; KNL-NEXT: kshiftrw $14, %k1, %k1 -; KNL-NEXT: kxorw %k1, %k2, %k2 -; KNL-NEXT: kshiftrw $2, %k2, %k3 -; KNL-NEXT: kxorw %k0, %k3, %k0 +; KNL-NEXT: kmovw %esi, %k0 ; KNL-NEXT: kshiftlw $15, %k0, %k0 +; KNL-NEXT: kshiftrw $15, %k0, %k0 +; KNL-NEXT: kshiftlw $2, %k0, %k2 +; KNL-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill +; KNL-NEXT: kmovw %edx, %k1 +; KNL-NEXT: kshiftlw $1, %k1, %k1 +; KNL-NEXT: korw %k1, %k2, %k1 +; KNL-NEXT: korw %k1, %k0, %k0 +; KNL-NEXT: kshiftlw $14, %k0, %k0 +; KNL-NEXT: kshiftrw $14, %k0, %k0 +; KNL-NEXT: kshiftlw $3, %k0, %k3 +; KNL-NEXT: kmovw %ecx, %k1 +; KNL-NEXT: kshiftlw $2, %k1, %k1 +; KNL-NEXT: korw %k1, %k3, %k1 +; KNL-NEXT: korw %k1, %k0, %k0 +; KNL-NEXT: kshiftlw $13, %k0, %k0 ; KNL-NEXT: kshiftrw $13, %k0, %k0 -; KNL-NEXT: kxorw %k0, %k2, %k0 -; KNL-NEXT: kshiftrw $3, %k0, %k2 -; 
KNL-NEXT: kmovw %r8d, %k3 -; KNL-NEXT: kxorw %k3, %k2, %k2 -; KNL-NEXT: kshiftlw $15, %k2, %k2 -; KNL-NEXT: kshiftrw $12, %k2, %k2 -; KNL-NEXT: kxorw %k2, %k0, %k0 -; KNL-NEXT: kshiftrw $4, %k0, %k2 -; KNL-NEXT: kmovw %r9d, %k3 -; KNL-NEXT: kxorw %k3, %k2, %k2 -; KNL-NEXT: kshiftlw $15, %k2, %k2 -; KNL-NEXT: kshiftrw $11, %k2, %k2 -; KNL-NEXT: kxorw %k2, %k0, %k0 -; KNL-NEXT: kshiftrw $5, %k0, %k2 +; KNL-NEXT: kshiftlw $4, %k0, %k4 +; KNL-NEXT: kmovw %r8d, %k1 +; KNL-NEXT: kshiftlw $3, %k1, %k1 +; KNL-NEXT: korw %k1, %k4, %k1 +; KNL-NEXT: korw %k1, %k0, %k0 +; KNL-NEXT: kshiftlw $12, %k0, %k0 +; KNL-NEXT: kshiftrw $12, %k0, %k0 +; KNL-NEXT: kshiftlw $5, %k0, %k5 +; KNL-NEXT: kmovw %r9d, %k1 +; KNL-NEXT: kshiftlw $4, %k1, %k1 +; KNL-NEXT: korw %k1, %k5, %k1 +; KNL-NEXT: korw %k1, %k0, %k0 +; KNL-NEXT: kshiftlw $11, %k0, %k0 +; KNL-NEXT: kshiftrw $11, %k0, %k0 +; KNL-NEXT: kshiftlw $6, %k0, %k6 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k3 -; KNL-NEXT: kxorw %k3, %k2, %k2 -; KNL-NEXT: kshiftlw $15, %k2, %k2 -; KNL-NEXT: kshiftrw $10, %k2, %k2 -; KNL-NEXT: kxorw %k2, %k0, %k0 -; KNL-NEXT: kshiftrw $6, %k0, %k2 +; KNL-NEXT: kmovw %eax, %k1 +; KNL-NEXT: kshiftlw $5, %k1, %k1 +; KNL-NEXT: korw %k1, %k6, %k1 +; KNL-NEXT: korw %k1, %k0, %k0 +; KNL-NEXT: kshiftlw $10, %k0, %k0 +; KNL-NEXT: kshiftrw $10, %k0, %k0 +; KNL-NEXT: kshiftlw $7, %k0, %k7 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k3 -; KNL-NEXT: kxorw %k3, %k2, %k2 -; KNL-NEXT: kshiftlw $15, %k2, %k2 -; KNL-NEXT: kshiftrw $9, %k2, %k2 -; KNL-NEXT: kxorw %k2, %k0, %k0 -; KNL-NEXT: kshiftrw $7, %k0, %k2 +; KNL-NEXT: kmovw %eax, %k1 +; KNL-NEXT: kshiftlw $6, %k1, %k1 +; KNL-NEXT: korw %k1, %k7, %k1 +; KNL-NEXT: korw %k1, %k0, %k0 +; KNL-NEXT: kshiftlw $9, %k0, %k0 +; KNL-NEXT: kshiftrw $9, %k0, %k0 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k3 -; KNL-NEXT: kxorw %k3, %k2, %k2 -; KNL-NEXT: kshiftlw $15, %k2, %k2 -; KNL-NEXT: kshiftrw $8, %k2, %k2 -; 
KNL-NEXT: kxorw %k2, %k0, %k0 -; KNL-NEXT: kshiftrw $8, %k0, %k2 +; KNL-NEXT: kmovw %eax, %k1 +; KNL-NEXT: kshiftlw $7, %k1, %k1 +; KNL-NEXT: kshiftlw $8, %k0, %k2 +; KNL-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill +; KNL-NEXT: korw %k1, %k2, %k1 +; KNL-NEXT: korw %k1, %k0, %k0 +; KNL-NEXT: kshiftlw $8, %k0, %k0 +; KNL-NEXT: kshiftrw $8, %k0, %k0 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k3 -; KNL-NEXT: kxorw %k3, %k2, %k2 -; KNL-NEXT: kshiftlw $15, %k2, %k2 -; KNL-NEXT: kshiftrw $7, %k2, %k2 -; KNL-NEXT: kxorw %k2, %k0, %k0 -; KNL-NEXT: kshiftrw $9, %k0, %k2 +; KNL-NEXT: kmovw %eax, %k1 +; KNL-NEXT: kshiftlw $8, %k1, %k1 +; KNL-NEXT: kshiftlw $9, %k0, %k2 +; KNL-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill +; KNL-NEXT: korw %k1, %k2, %k1 +; KNL-NEXT: korw %k1, %k0, %k0 +; KNL-NEXT: kshiftlw $7, %k0, %k0 +; KNL-NEXT: kshiftrw $7, %k0, %k0 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k3 -; KNL-NEXT: kxorw %k3, %k2, %k2 -; KNL-NEXT: kshiftlw $15, %k2, %k2 -; KNL-NEXT: kshiftrw $6, %k2, %k2 -; KNL-NEXT: kxorw %k2, %k0, %k0 -; KNL-NEXT: kshiftrw $10, %k0, %k2 +; KNL-NEXT: kmovw %eax, %k1 +; KNL-NEXT: kshiftlw $9, %k1, %k1 +; KNL-NEXT: kshiftlw $10, %k0, %k2 +; KNL-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill +; KNL-NEXT: korw %k1, %k2, %k1 +; KNL-NEXT: korw %k1, %k0, %k0 +; KNL-NEXT: kshiftlw $6, %k0, %k0 +; KNL-NEXT: kshiftrw $6, %k0, %k0 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k3 -; KNL-NEXT: kxorw %k3, %k2, %k2 -; KNL-NEXT: kshiftlw $15, %k2, %k2 -; KNL-NEXT: kshiftrw $5, %k2, %k2 -; KNL-NEXT: kxorw %k2, %k0, %k0 -; KNL-NEXT: kshiftrw $11, %k0, %k2 +; KNL-NEXT: kmovw %eax, %k1 +; KNL-NEXT: kshiftlw $10, %k1, %k1 +; KNL-NEXT: kshiftlw $11, %k0, %k2 +; KNL-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill +; KNL-NEXT: korw %k1, %k2, %k1 +; KNL-NEXT: korw %k1, %k0, %k0 +; KNL-NEXT: kshiftlw $5, %k0, %k0 +; KNL-NEXT: kshiftrw $5, %k0, %k0 ; KNL-NEXT: movb 
{{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k3 -; KNL-NEXT: kxorw %k3, %k2, %k2 -; KNL-NEXT: kshiftlw $15, %k2, %k2 -; KNL-NEXT: kshiftrw $4, %k2, %k2 -; KNL-NEXT: kxorw %k2, %k0, %k0 -; KNL-NEXT: kshiftrw $12, %k0, %k2 +; KNL-NEXT: kmovw %eax, %k1 +; KNL-NEXT: kshiftlw $11, %k1, %k1 +; KNL-NEXT: kshiftlw $12, %k0, %k2 +; KNL-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill +; KNL-NEXT: korw %k1, %k2, %k1 +; KNL-NEXT: korw %k1, %k0, %k0 +; KNL-NEXT: kshiftlw $4, %k0, %k0 +; KNL-NEXT: kshiftrw $4, %k0, %k0 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k3 -; KNL-NEXT: kxorw %k3, %k2, %k2 -; KNL-NEXT: kshiftlw $15, %k2, %k2 -; KNL-NEXT: kshiftrw $3, %k2, %k2 -; KNL-NEXT: kxorw %k2, %k0, %k0 -; KNL-NEXT: kshiftrw $13, %k0, %k2 +; KNL-NEXT: kmovw %eax, %k1 +; KNL-NEXT: kshiftlw $12, %k1, %k1 +; KNL-NEXT: kshiftlw $13, %k0, %k2 +; KNL-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill +; KNL-NEXT: korw %k1, %k2, %k1 +; KNL-NEXT: korw %k1, %k0, %k0 +; KNL-NEXT: kshiftlw $3, %k0, %k0 +; KNL-NEXT: kshiftrw $3, %k0, %k0 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k3 -; KNL-NEXT: kxorw %k3, %k2, %k2 -; KNL-NEXT: kshiftlw $15, %k2, %k2 -; KNL-NEXT: kshiftrw $2, %k2, %k2 -; KNL-NEXT: kxorw %k2, %k0, %k0 -; KNL-NEXT: kshiftrw $14, %k0, %k2 +; KNL-NEXT: kmovw %eax, %k1 +; KNL-NEXT: kshiftlw $13, %k1, %k1 +; KNL-NEXT: kshiftlw $14, %k0, %k2 +; KNL-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill +; KNL-NEXT: korw %k1, %k2, %k1 +; KNL-NEXT: korw %k1, %k0, %k0 +; KNL-NEXT: kshiftlw $2, %k0, %k0 +; KNL-NEXT: kshiftrw $2, %k0, %k2 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k3 -; KNL-NEXT: kxorw %k3, %k2, %k2 -; KNL-NEXT: kshiftlw $14, %k2, %k2 -; KNL-NEXT: kxorw %k2, %k0, %k0 +; KNL-NEXT: kmovw %eax, %k1 +; KNL-NEXT: kshiftlw $14, %k1, %k0 +; KNL-NEXT: kshiftlw $15, %k0, %k1 +; KNL-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill +; KNL-NEXT: korw %k0, %k1, %k0 +; KNL-NEXT: korw %k0, 
%k2, %k0 ; KNL-NEXT: kshiftlw $1, %k0, %k0 ; KNL-NEXT: kshiftrw $1, %k0, %k0 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al ; KNL-NEXT: kmovw %eax, %k2 ; KNL-NEXT: kshiftlw $15, %k2, %k2 ; KNL-NEXT: korw %k2, %k0, %k0 +; KNL-NEXT: kmovw %k0, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al +; KNL-NEXT: kmovw %eax, %k0 +; KNL-NEXT: kshiftlw $15, %k0, %k0 +; KNL-NEXT: kshiftrw $15, %k0, %k0 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al ; KNL-NEXT: kmovw %eax, %k2 +; KNL-NEXT: kshiftlw $1, %k2, %k2 +; KNL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 ## 2-byte Reload +; KNL-NEXT: korw %k2, %k1, %k2 +; KNL-NEXT: korw %k2, %k0, %k0 +; KNL-NEXT: kshiftlw $14, %k0, %k0 +; KNL-NEXT: kshiftrw $14, %k0, %k0 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k3 -; KNL-NEXT: kxorw %k1, %k3, %k3 -; KNL-NEXT: kshiftrw $2, %k3, %k4 -; KNL-NEXT: kxorw %k2, %k4, %k2 -; KNL-NEXT: kshiftlw $15, %k2, %k2 -; KNL-NEXT: kshiftrw $13, %k2, %k2 -; KNL-NEXT: kxorw %k2, %k3, %k2 -; KNL-NEXT: kshiftrw $3, %k2, %k3 +; KNL-NEXT: kmovw %eax, %k2 +; KNL-NEXT: kshiftlw $2, %k2, %k2 +; KNL-NEXT: kmovw %k3, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill +; KNL-NEXT: korw %k2, %k3, %k2 +; KNL-NEXT: korw %k2, %k0, %k0 +; KNL-NEXT: kshiftlw $13, %k0, %k0 +; KNL-NEXT: kshiftrw $13, %k0, %k0 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k4 -; KNL-NEXT: kxorw %k4, %k3, %k3 -; KNL-NEXT: kshiftlw $15, %k3, %k3 -; KNL-NEXT: kshiftrw $12, %k3, %k3 -; KNL-NEXT: kxorw %k3, %k2, %k2 -; KNL-NEXT: kshiftrw $4, %k2, %k3 +; KNL-NEXT: kmovw %eax, %k2 +; KNL-NEXT: kshiftlw $3, %k2, %k2 +; KNL-NEXT: kmovw %k4, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill +; KNL-NEXT: korw %k2, %k4, %k2 +; KNL-NEXT: korw %k2, %k0, %k0 +; KNL-NEXT: kshiftlw $12, %k0, %k0 +; KNL-NEXT: kshiftrw $12, %k0, %k0 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k4 -; KNL-NEXT: kxorw %k4, %k3, %k3 -; KNL-NEXT: kshiftlw $15, %k3, %k3 -; KNL-NEXT: kshiftrw $11, %k3, %k3 -; KNL-NEXT: kxorw %k3, 
%k2, %k2 -; KNL-NEXT: kshiftrw $5, %k2, %k3 +; KNL-NEXT: kmovw %eax, %k2 +; KNL-NEXT: kshiftlw $4, %k2, %k2 +; KNL-NEXT: kmovw %k5, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill +; KNL-NEXT: korw %k2, %k5, %k2 +; KNL-NEXT: korw %k2, %k0, %k0 +; KNL-NEXT: kshiftlw $11, %k0, %k0 +; KNL-NEXT: kshiftrw $11, %k0, %k0 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k4 -; KNL-NEXT: kxorw %k4, %k3, %k3 -; KNL-NEXT: kshiftlw $15, %k3, %k3 -; KNL-NEXT: kshiftrw $10, %k3, %k3 -; KNL-NEXT: kxorw %k3, %k2, %k2 -; KNL-NEXT: kshiftrw $6, %k2, %k3 +; KNL-NEXT: kmovw %eax, %k2 +; KNL-NEXT: kshiftlw $5, %k2, %k2 +; KNL-NEXT: korw %k2, %k6, %k2 +; KNL-NEXT: korw %k2, %k0, %k0 +; KNL-NEXT: kshiftlw $10, %k0, %k0 +; KNL-NEXT: kshiftrw $10, %k0, %k0 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k4 -; KNL-NEXT: kxorw %k4, %k3, %k3 -; KNL-NEXT: kshiftlw $15, %k3, %k3 -; KNL-NEXT: kshiftrw $9, %k3, %k3 -; KNL-NEXT: kxorw %k3, %k2, %k2 -; KNL-NEXT: kshiftrw $7, %k2, %k3 +; KNL-NEXT: kmovw %eax, %k2 +; KNL-NEXT: kshiftlw $6, %k2, %k2 +; KNL-NEXT: kmovw %k7, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill +; KNL-NEXT: korw %k2, %k7, %k2 +; KNL-NEXT: korw %k2, %k0, %k0 +; KNL-NEXT: kshiftlw $9, %k0, %k0 +; KNL-NEXT: kshiftrw $9, %k0, %k0 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k4 -; KNL-NEXT: kxorw %k4, %k3, %k3 -; KNL-NEXT: kshiftlw $15, %k3, %k3 -; KNL-NEXT: kshiftrw $8, %k3, %k3 -; KNL-NEXT: kxorw %k3, %k2, %k2 -; KNL-NEXT: kshiftrw $8, %k2, %k3 +; KNL-NEXT: kmovw %eax, %k2 +; KNL-NEXT: kshiftlw $7, %k2, %k2 +; KNL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 ## 2-byte Reload +; KNL-NEXT: korw %k2, %k1, %k2 +; KNL-NEXT: korw %k2, %k0, %k0 +; KNL-NEXT: kshiftlw $8, %k0, %k0 +; KNL-NEXT: kshiftrw $8, %k0, %k0 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k4 -; KNL-NEXT: kxorw %k4, %k3, %k3 -; KNL-NEXT: kshiftlw $15, %k3, %k3 -; KNL-NEXT: kshiftrw $7, %k3, %k3 -; KNL-NEXT: kxorw %k3, %k2, %k2 -; KNL-NEXT: kshiftrw $9, %k2, %k3 +; 
KNL-NEXT: kmovw %eax, %k2 +; KNL-NEXT: kshiftlw $8, %k2, %k2 +; KNL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 ## 2-byte Reload +; KNL-NEXT: korw %k2, %k1, %k2 +; KNL-NEXT: korw %k2, %k0, %k0 +; KNL-NEXT: kshiftlw $7, %k0, %k0 +; KNL-NEXT: kshiftrw $7, %k0, %k0 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k4 -; KNL-NEXT: kxorw %k4, %k3, %k3 -; KNL-NEXT: kshiftlw $15, %k3, %k3 -; KNL-NEXT: kshiftrw $6, %k3, %k3 -; KNL-NEXT: kxorw %k3, %k2, %k2 -; KNL-NEXT: kshiftrw $10, %k2, %k3 +; KNL-NEXT: kmovw %eax, %k2 +; KNL-NEXT: kshiftlw $9, %k2, %k2 +; KNL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 ## 2-byte Reload +; KNL-NEXT: korw %k2, %k1, %k2 +; KNL-NEXT: korw %k2, %k0, %k0 +; KNL-NEXT: kshiftlw $6, %k0, %k0 +; KNL-NEXT: kshiftrw $6, %k0, %k0 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k4 -; KNL-NEXT: kxorw %k4, %k3, %k3 -; KNL-NEXT: kshiftlw $15, %k3, %k3 -; KNL-NEXT: kshiftrw $5, %k3, %k3 -; KNL-NEXT: kxorw %k3, %k2, %k2 -; KNL-NEXT: kshiftrw $11, %k2, %k3 +; KNL-NEXT: kmovw %eax, %k2 +; KNL-NEXT: kshiftlw $10, %k2, %k2 +; KNL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 ## 2-byte Reload +; KNL-NEXT: korw %k2, %k1, %k2 +; KNL-NEXT: korw %k2, %k0, %k0 +; KNL-NEXT: kshiftlw $5, %k0, %k0 +; KNL-NEXT: kshiftrw $5, %k0, %k0 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k4 -; KNL-NEXT: kxorw %k4, %k3, %k3 -; KNL-NEXT: kshiftlw $15, %k3, %k3 -; KNL-NEXT: kshiftrw $4, %k3, %k3 -; KNL-NEXT: kxorw %k3, %k2, %k2 -; KNL-NEXT: kshiftrw $12, %k2, %k3 +; KNL-NEXT: kmovw %eax, %k2 +; KNL-NEXT: kshiftlw $11, %k2, %k2 +; KNL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 ## 2-byte Reload +; KNL-NEXT: korw %k2, %k1, %k2 +; KNL-NEXT: korw %k2, %k0, %k0 +; KNL-NEXT: kshiftlw $4, %k0, %k0 +; KNL-NEXT: kshiftrw $4, %k0, %k0 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k4 -; KNL-NEXT: kxorw %k4, %k3, %k3 -; KNL-NEXT: kshiftlw $15, %k3, %k3 -; KNL-NEXT: kshiftrw $3, %k3, %k3 -; KNL-NEXT: kxorw %k3, %k2, %k2 -; KNL-NEXT: 
kshiftrw $13, %k2, %k3 +; KNL-NEXT: kmovw %eax, %k2 +; KNL-NEXT: kshiftlw $12, %k2, %k2 +; KNL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 ## 2-byte Reload +; KNL-NEXT: korw %k2, %k1, %k2 +; KNL-NEXT: korw %k2, %k0, %k0 +; KNL-NEXT: kshiftlw $3, %k0, %k0 +; KNL-NEXT: kshiftrw $3, %k0, %k0 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k4 -; KNL-NEXT: kxorw %k4, %k3, %k3 -; KNL-NEXT: kshiftlw $15, %k3, %k3 -; KNL-NEXT: kshiftrw $2, %k3, %k3 -; KNL-NEXT: kxorw %k3, %k2, %k2 -; KNL-NEXT: kshiftrw $14, %k2, %k3 +; KNL-NEXT: kmovw %eax, %k2 +; KNL-NEXT: kshiftlw $13, %k2, %k2 +; KNL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 ## 2-byte Reload +; KNL-NEXT: korw %k2, %k1, %k2 +; KNL-NEXT: korw %k2, %k0, %k0 +; KNL-NEXT: kshiftlw $2, %k0, %k0 +; KNL-NEXT: kshiftrw $2, %k0, %k0 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k4 -; KNL-NEXT: kxorw %k4, %k3, %k3 -; KNL-NEXT: kshiftlw $14, %k3, %k3 -; KNL-NEXT: kxorw %k3, %k2, %k2 -; KNL-NEXT: kshiftlw $1, %k2, %k2 -; KNL-NEXT: kshiftrw $1, %k2, %k2 +; KNL-NEXT: kmovw %eax, %k2 +; KNL-NEXT: kshiftlw $14, %k2, %k2 +; KNL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 ## 2-byte Reload +; KNL-NEXT: korw %k2, %k1, %k2 +; KNL-NEXT: korw %k2, %k0, %k0 +; KNL-NEXT: kshiftlw $1, %k0, %k0 +; KNL-NEXT: kshiftrw $1, %k0, %k0 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k3 -; KNL-NEXT: kshiftlw $15, %k3, %k3 -; KNL-NEXT: korw %k3, %k2, %k2 +; KNL-NEXT: kmovw %eax, %k2 +; KNL-NEXT: kshiftlw $15, %k2, %k2 +; KNL-NEXT: korw %k2, %k0, %k0 +; KNL-NEXT: kmovw %k0, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k3 +; KNL-NEXT: kmovw %eax, %k0 +; KNL-NEXT: kshiftlw $15, %k0, %k0 +; KNL-NEXT: kshiftrw $15, %k0, %k0 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k4 -; KNL-NEXT: kxorw %k1, %k4, %k4 -; KNL-NEXT: kshiftrw $2, %k4, %k5 -; KNL-NEXT: kxorw %k3, %k5, %k3 -; KNL-NEXT: kshiftlw $15, %k3, %k3 -; KNL-NEXT: kshiftrw $13, 
%k3, %k3 -; KNL-NEXT: kxorw %k3, %k4, %k3 -; KNL-NEXT: kshiftrw $3, %k3, %k4 +; KNL-NEXT: kmovw %eax, %k2 +; KNL-NEXT: kshiftlw $1, %k2, %k2 +; KNL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 ## 2-byte Reload +; KNL-NEXT: korw %k2, %k1, %k2 +; KNL-NEXT: korw %k2, %k0, %k0 +; KNL-NEXT: kshiftlw $14, %k0, %k0 +; KNL-NEXT: kshiftrw $14, %k0, %k0 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k5 -; KNL-NEXT: kxorw %k5, %k4, %k4 -; KNL-NEXT: kshiftlw $15, %k4, %k4 -; KNL-NEXT: kshiftrw $12, %k4, %k4 -; KNL-NEXT: kxorw %k4, %k3, %k3 -; KNL-NEXT: kshiftrw $4, %k3, %k4 +; KNL-NEXT: kmovw %eax, %k2 +; KNL-NEXT: kshiftlw $2, %k2, %k2 +; KNL-NEXT: korw %k2, %k3, %k2 +; KNL-NEXT: korw %k2, %k0, %k0 +; KNL-NEXT: kshiftlw $13, %k0, %k0 +; KNL-NEXT: kshiftrw $13, %k0, %k0 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k5 -; KNL-NEXT: kxorw %k5, %k4, %k4 -; KNL-NEXT: kshiftlw $15, %k4, %k4 -; KNL-NEXT: kshiftrw $11, %k4, %k4 -; KNL-NEXT: kxorw %k4, %k3, %k3 -; KNL-NEXT: kshiftrw $5, %k3, %k4 +; KNL-NEXT: kmovw %eax, %k2 +; KNL-NEXT: kshiftlw $3, %k2, %k2 +; KNL-NEXT: korw %k2, %k4, %k2 +; KNL-NEXT: korw %k2, %k0, %k0 +; KNL-NEXT: kshiftlw $12, %k0, %k0 +; KNL-NEXT: kshiftrw $12, %k0, %k0 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k5 -; KNL-NEXT: kxorw %k5, %k4, %k4 -; KNL-NEXT: kshiftlw $15, %k4, %k4 -; KNL-NEXT: kshiftrw $10, %k4, %k4 -; KNL-NEXT: kxorw %k4, %k3, %k3 -; KNL-NEXT: kshiftrw $6, %k3, %k4 +; KNL-NEXT: kmovw %eax, %k2 +; KNL-NEXT: kshiftlw $4, %k2, %k2 +; KNL-NEXT: korw %k2, %k5, %k2 +; KNL-NEXT: korw %k2, %k0, %k0 +; KNL-NEXT: kshiftlw $11, %k0, %k0 +; KNL-NEXT: kshiftrw $11, %k0, %k0 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k5 -; KNL-NEXT: kxorw %k5, %k4, %k4 -; KNL-NEXT: kshiftlw $15, %k4, %k4 -; KNL-NEXT: kshiftrw $9, %k4, %k4 -; KNL-NEXT: kxorw %k4, %k3, %k3 -; KNL-NEXT: kshiftrw $7, %k3, %k4 +; KNL-NEXT: kmovw %eax, %k2 +; KNL-NEXT: kshiftlw $5, %k2, %k2 +; KNL-NEXT: korw %k2, %k6, 
%k2 +; KNL-NEXT: korw %k2, %k0, %k0 +; KNL-NEXT: kshiftlw $10, %k0, %k0 +; KNL-NEXT: kshiftrw $10, %k0, %k0 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k5 -; KNL-NEXT: kxorw %k5, %k4, %k4 -; KNL-NEXT: kshiftlw $15, %k4, %k4 -; KNL-NEXT: kshiftrw $8, %k4, %k4 -; KNL-NEXT: kxorw %k4, %k3, %k3 -; KNL-NEXT: kshiftrw $8, %k3, %k4 +; KNL-NEXT: kmovw %eax, %k2 +; KNL-NEXT: kshiftlw $6, %k2, %k2 +; KNL-NEXT: korw %k2, %k7, %k2 +; KNL-NEXT: korw %k2, %k0, %k0 +; KNL-NEXT: kshiftlw $9, %k0, %k0 +; KNL-NEXT: kshiftrw $9, %k0, %k0 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k5 -; KNL-NEXT: kxorw %k5, %k4, %k4 -; KNL-NEXT: kshiftlw $15, %k4, %k4 -; KNL-NEXT: kshiftrw $7, %k4, %k4 -; KNL-NEXT: kxorw %k4, %k3, %k3 -; KNL-NEXT: kshiftrw $9, %k3, %k4 +; KNL-NEXT: kmovw %eax, %k2 +; KNL-NEXT: kshiftlw $7, %k2, %k2 +; KNL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 ## 2-byte Reload +; KNL-NEXT: korw %k2, %k1, %k2 +; KNL-NEXT: korw %k2, %k0, %k0 +; KNL-NEXT: kshiftlw $8, %k0, %k0 +; KNL-NEXT: kshiftrw $8, %k0, %k0 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k5 -; KNL-NEXT: kxorw %k5, %k4, %k4 -; KNL-NEXT: kshiftlw $15, %k4, %k4 -; KNL-NEXT: kshiftrw $6, %k4, %k4 -; KNL-NEXT: kxorw %k4, %k3, %k3 -; KNL-NEXT: kshiftrw $10, %k3, %k4 +; KNL-NEXT: kmovw %eax, %k2 +; KNL-NEXT: kshiftlw $8, %k2, %k2 +; KNL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 ## 2-byte Reload +; KNL-NEXT: korw %k2, %k3, %k2 +; KNL-NEXT: korw %k2, %k0, %k0 +; KNL-NEXT: kshiftlw $7, %k0, %k0 +; KNL-NEXT: kshiftrw $7, %k0, %k0 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k5 -; KNL-NEXT: kxorw %k5, %k4, %k4 -; KNL-NEXT: kshiftlw $15, %k4, %k4 -; KNL-NEXT: kshiftrw $5, %k4, %k4 -; KNL-NEXT: kxorw %k4, %k3, %k3 -; KNL-NEXT: kshiftrw $11, %k3, %k4 +; KNL-NEXT: kmovw %eax, %k2 +; KNL-NEXT: kshiftlw $9, %k2, %k2 +; KNL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 ## 2-byte Reload +; KNL-NEXT: korw %k2, %k4, %k2 +; KNL-NEXT: korw %k2, %k0, %k0 +; 
KNL-NEXT: kshiftlw $6, %k0, %k0 +; KNL-NEXT: kshiftrw $6, %k0, %k0 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k5 -; KNL-NEXT: kxorw %k5, %k4, %k4 -; KNL-NEXT: kshiftlw $15, %k4, %k4 -; KNL-NEXT: kshiftrw $4, %k4, %k4 -; KNL-NEXT: kxorw %k4, %k3, %k3 -; KNL-NEXT: kshiftrw $12, %k3, %k4 +; KNL-NEXT: kmovw %eax, %k2 +; KNL-NEXT: kshiftlw $10, %k2, %k2 +; KNL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 ## 2-byte Reload +; KNL-NEXT: korw %k2, %k5, %k2 +; KNL-NEXT: korw %k2, %k0, %k0 +; KNL-NEXT: kshiftlw $5, %k0, %k0 +; KNL-NEXT: kshiftrw $5, %k0, %k0 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k5 -; KNL-NEXT: kxorw %k5, %k4, %k4 -; KNL-NEXT: kshiftlw $15, %k4, %k4 -; KNL-NEXT: kshiftrw $3, %k4, %k4 -; KNL-NEXT: kxorw %k4, %k3, %k3 -; KNL-NEXT: kshiftrw $13, %k3, %k4 +; KNL-NEXT: kmovw %eax, %k2 +; KNL-NEXT: kshiftlw $11, %k2, %k2 +; KNL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 ## 2-byte Reload +; KNL-NEXT: korw %k2, %k7, %k2 +; KNL-NEXT: korw %k2, %k0, %k0 +; KNL-NEXT: kshiftlw $4, %k0, %k0 +; KNL-NEXT: kshiftrw $4, %k0, %k0 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k5 -; KNL-NEXT: kxorw %k5, %k4, %k4 -; KNL-NEXT: kshiftlw $15, %k4, %k4 -; KNL-NEXT: kshiftrw $2, %k4, %k4 -; KNL-NEXT: kxorw %k4, %k3, %k3 -; KNL-NEXT: kshiftrw $14, %k3, %k4 +; KNL-NEXT: kmovw %eax, %k2 +; KNL-NEXT: kshiftlw $12, %k2, %k2 +; KNL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 ## 2-byte Reload +; KNL-NEXT: korw %k2, %k7, %k2 +; KNL-NEXT: korw %k2, %k0, %k0 +; KNL-NEXT: kshiftlw $3, %k0, %k0 +; KNL-NEXT: kshiftrw $3, %k0, %k0 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k5 -; KNL-NEXT: kxorw %k5, %k4, %k4 -; KNL-NEXT: kshiftlw $14, %k4, %k4 -; KNL-NEXT: kxorw %k4, %k3, %k3 -; KNL-NEXT: kshiftlw $1, %k3, %k3 -; KNL-NEXT: kshiftrw $1, %k3, %k3 +; KNL-NEXT: kmovw %eax, %k2 +; KNL-NEXT: kshiftlw $13, %k2, %k2 +; KNL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 ## 2-byte Reload +; KNL-NEXT: korw %k2, %k7, %k2 +; 
KNL-NEXT: korw %k2, %k0, %k0 +; KNL-NEXT: kshiftlw $2, %k0, %k0 +; KNL-NEXT: kshiftrw $2, %k0, %k0 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k4 -; KNL-NEXT: kshiftlw $15, %k4, %k4 -; KNL-NEXT: korw %k4, %k3, %k3 +; KNL-NEXT: kmovw %eax, %k2 +; KNL-NEXT: kshiftlw $14, %k2, %k2 +; KNL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 ## 2-byte Reload +; KNL-NEXT: korw %k2, %k7, %k2 +; KNL-NEXT: korw %k2, %k0, %k0 +; KNL-NEXT: kshiftlw $1, %k0, %k0 +; KNL-NEXT: kshiftrw $1, %k0, %k0 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k4 +; KNL-NEXT: kmovw %eax, %k2 +; KNL-NEXT: kshiftlw $15, %k2, %k2 +; KNL-NEXT: korw %k2, %k0, %k0 +; KNL-NEXT: kmovw %k0, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k5 -; KNL-NEXT: kxorw %k1, %k5, %k1 -; KNL-NEXT: kshiftrw $2, %k1, %k5 -; KNL-NEXT: kxorw %k4, %k5, %k4 -; KNL-NEXT: kshiftlw $15, %k4, %k4 -; KNL-NEXT: kshiftrw $13, %k4, %k4 -; KNL-NEXT: kxorw %k4, %k1, %k1 -; KNL-NEXT: kshiftrw $3, %k1, %k4 +; KNL-NEXT: kmovw %eax, %k2 +; KNL-NEXT: kshiftlw $1, %k2, %k2 +; KNL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 ## 2-byte Reload +; KNL-NEXT: korw %k2, %k0, %k2 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k5 -; KNL-NEXT: kxorw %k5, %k4, %k4 -; KNL-NEXT: kshiftlw $15, %k4, %k4 -; KNL-NEXT: kshiftrw $12, %k4, %k4 -; KNL-NEXT: kxorw %k4, %k1, %k1 -; KNL-NEXT: kshiftrw $4, %k1, %k4 +; KNL-NEXT: kmovw %eax, %k7 +; KNL-NEXT: kshiftlw $15, %k7, %k7 +; KNL-NEXT: kshiftrw $15, %k7, %k7 +; KNL-NEXT: korw %k2, %k7, %k2 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k5 -; KNL-NEXT: kxorw %k5, %k4, %k4 -; KNL-NEXT: kshiftlw $15, %k4, %k4 -; KNL-NEXT: kshiftrw $11, %k4, %k4 -; KNL-NEXT: kxorw %k4, %k1, %k1 -; KNL-NEXT: kshiftrw $5, %k1, %k4 +; KNL-NEXT: kmovw %eax, %k7 +; KNL-NEXT: kshiftlw $2, %k7, %k7 +; KNL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 ## 2-byte Reload +; KNL-NEXT: korw %k7, %k0, %k7 +; KNL-NEXT: kshiftlw 
$14, %k2, %k2 +; KNL-NEXT: kshiftrw $14, %k2, %k2 +; KNL-NEXT: korw %k7, %k2, %k2 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k5 -; KNL-NEXT: kxorw %k5, %k4, %k4 -; KNL-NEXT: kshiftlw $15, %k4, %k4 -; KNL-NEXT: kshiftrw $10, %k4, %k4 -; KNL-NEXT: kxorw %k4, %k1, %k1 -; KNL-NEXT: kshiftrw $6, %k1, %k4 +; KNL-NEXT: kmovw %eax, %k7 +; KNL-NEXT: kshiftlw $3, %k7, %k7 +; KNL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 ## 2-byte Reload +; KNL-NEXT: korw %k7, %k0, %k7 +; KNL-NEXT: kshiftlw $13, %k2, %k2 +; KNL-NEXT: kshiftrw $13, %k2, %k2 +; KNL-NEXT: korw %k7, %k2, %k2 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k5 -; KNL-NEXT: kxorw %k5, %k4, %k4 -; KNL-NEXT: kshiftlw $15, %k4, %k4 -; KNL-NEXT: kshiftrw $9, %k4, %k4 -; KNL-NEXT: kxorw %k4, %k1, %k1 -; KNL-NEXT: kshiftrw $7, %k1, %k4 +; KNL-NEXT: kmovw %eax, %k7 +; KNL-NEXT: kshiftlw $4, %k7, %k7 +; KNL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 ## 2-byte Reload +; KNL-NEXT: korw %k7, %k0, %k7 +; KNL-NEXT: kshiftlw $12, %k2, %k2 +; KNL-NEXT: kshiftrw $12, %k2, %k2 +; KNL-NEXT: korw %k7, %k2, %k2 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k5 -; KNL-NEXT: kxorw %k5, %k4, %k4 -; KNL-NEXT: kshiftlw $15, %k4, %k4 -; KNL-NEXT: kshiftrw $8, %k4, %k4 -; KNL-NEXT: kxorw %k4, %k1, %k1 -; KNL-NEXT: kshiftrw $8, %k1, %k4 +; KNL-NEXT: kmovw %eax, %k7 +; KNL-NEXT: kshiftlw $5, %k7, %k7 +; KNL-NEXT: korw %k7, %k6, %k7 +; KNL-NEXT: kshiftlw $11, %k2, %k2 +; KNL-NEXT: kshiftrw $11, %k2, %k2 +; KNL-NEXT: korw %k7, %k2, %k2 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k5 -; KNL-NEXT: kxorw %k5, %k4, %k4 -; KNL-NEXT: kshiftlw $15, %k4, %k4 -; KNL-NEXT: kshiftrw $7, %k4, %k4 -; KNL-NEXT: kxorw %k4, %k1, %k1 -; KNL-NEXT: kshiftrw $9, %k1, %k4 +; KNL-NEXT: kmovw %eax, %k7 +; KNL-NEXT: kshiftlw $6, %k7, %k7 +; KNL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 ## 2-byte Reload +; KNL-NEXT: korw %k7, %k0, %k7 +; KNL-NEXT: kshiftlw $10, %k2, %k2 +; KNL-NEXT: kshiftrw $10, 
%k2, %k2 +; KNL-NEXT: korw %k7, %k2, %k2 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k5 -; KNL-NEXT: kxorw %k5, %k4, %k4 -; KNL-NEXT: kshiftlw $15, %k4, %k4 -; KNL-NEXT: kshiftrw $6, %k4, %k4 -; KNL-NEXT: kxorw %k4, %k1, %k1 -; KNL-NEXT: kshiftrw $10, %k1, %k4 +; KNL-NEXT: kmovw %eax, %k7 +; KNL-NEXT: kshiftlw $7, %k7, %k7 +; KNL-NEXT: korw %k7, %k1, %k7 +; KNL-NEXT: kshiftlw $9, %k2, %k2 +; KNL-NEXT: kshiftrw $9, %k2, %k2 +; KNL-NEXT: korw %k7, %k2, %k2 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k5 -; KNL-NEXT: kxorw %k5, %k4, %k4 -; KNL-NEXT: kshiftlw $15, %k4, %k4 -; KNL-NEXT: kshiftrw $5, %k4, %k4 -; KNL-NEXT: kxorw %k4, %k1, %k1 -; KNL-NEXT: kshiftrw $11, %k1, %k4 +; KNL-NEXT: kmovw %eax, %k7 +; KNL-NEXT: kshiftlw $8, %k7, %k7 +; KNL-NEXT: korw %k7, %k3, %k7 +; KNL-NEXT: kshiftlw $8, %k2, %k2 +; KNL-NEXT: kshiftrw $8, %k2, %k2 +; KNL-NEXT: korw %k7, %k2, %k2 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k5 -; KNL-NEXT: kxorw %k5, %k4, %k4 -; KNL-NEXT: kshiftlw $15, %k4, %k4 -; KNL-NEXT: kshiftrw $4, %k4, %k4 -; KNL-NEXT: kxorw %k4, %k1, %k1 -; KNL-NEXT: kshiftrw $12, %k1, %k4 +; KNL-NEXT: kmovw %eax, %k7 +; KNL-NEXT: kshiftlw $9, %k7, %k7 +; KNL-NEXT: korw %k7, %k4, %k7 +; KNL-NEXT: kshiftlw $7, %k2, %k2 +; KNL-NEXT: kshiftrw $7, %k2, %k2 +; KNL-NEXT: korw %k7, %k2, %k2 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k5 -; KNL-NEXT: kxorw %k5, %k4, %k4 -; KNL-NEXT: kshiftlw $15, %k4, %k4 -; KNL-NEXT: kshiftrw $3, %k4, %k4 -; KNL-NEXT: kxorw %k4, %k1, %k1 -; KNL-NEXT: kshiftrw $13, %k1, %k4 +; KNL-NEXT: kmovw %eax, %k7 +; KNL-NEXT: kshiftlw $10, %k7, %k7 +; KNL-NEXT: korw %k7, %k5, %k6 +; KNL-NEXT: kshiftlw $6, %k2, %k2 +; KNL-NEXT: kshiftrw $6, %k2, %k2 +; KNL-NEXT: korw %k6, %k2, %k2 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k5 -; KNL-NEXT: kxorw %k5, %k4, %k4 -; KNL-NEXT: kshiftlw $15, %k4, %k4 -; KNL-NEXT: kshiftrw $2, %k4, %k4 -; KNL-NEXT: kxorw %k4, %k1, 
%k1 -; KNL-NEXT: kshiftrw $14, %k1, %k4 +; KNL-NEXT: kmovw %eax, %k6 +; KNL-NEXT: kshiftlw $11, %k6, %k6 +; KNL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 ## 2-byte Reload +; KNL-NEXT: korw %k6, %k1, %k5 +; KNL-NEXT: kshiftlw $5, %k2, %k2 +; KNL-NEXT: kshiftrw $5, %k2, %k2 +; KNL-NEXT: korw %k5, %k2, %k2 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al ; KNL-NEXT: kmovw %eax, %k5 -; KNL-NEXT: kxorw %k5, %k4, %k4 -; KNL-NEXT: kshiftlw $14, %k4, %k4 -; KNL-NEXT: kxorw %k4, %k1, %k1 +; KNL-NEXT: kshiftlw $12, %k5, %k5 +; KNL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 ## 2-byte Reload +; KNL-NEXT: korw %k5, %k1, %k4 +; KNL-NEXT: kshiftlw $4, %k2, %k2 +; KNL-NEXT: kshiftrw $4, %k2, %k2 +; KNL-NEXT: korw %k4, %k2, %k2 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al +; KNL-NEXT: kmovw %eax, %k4 +; KNL-NEXT: kshiftlw $13, %k4, %k4 +; KNL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 ## 2-byte Reload +; KNL-NEXT: korw %k4, %k1, %k3 +; KNL-NEXT: kshiftlw $3, %k2, %k2 +; KNL-NEXT: kshiftrw $3, %k2, %k2 +; KNL-NEXT: korw %k3, %k2, %k2 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al +; KNL-NEXT: kmovw %eax, %k3 +; KNL-NEXT: kshiftlw $14, %k3, %k3 +; KNL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 ## 2-byte Reload +; KNL-NEXT: korw %k3, %k1, %k1 +; KNL-NEXT: kshiftlw $2, %k2, %k2 +; KNL-NEXT: kshiftrw $2, %k2, %k2 +; KNL-NEXT: korw %k1, %k2, %k1 ; KNL-NEXT: kshiftlw $1, %k1, %k1 ; KNL-NEXT: kshiftrw $1, %k1, %k1 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k4 -; KNL-NEXT: kshiftlw $15, %k4, %k4 -; KNL-NEXT: korw %k4, %k1, %k1 +; KNL-NEXT: kmovw %eax, %k2 +; KNL-NEXT: kshiftlw $15, %k2, %k2 +; KNL-NEXT: korw %k2, %k1, %k1 ; KNL-NEXT: kmovw %k1, 6(%rdi) -; KNL-NEXT: kmovw %k3, 4(%rdi) -; KNL-NEXT: kmovw %k2, 2(%rdi) +; KNL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 ## 2-byte Reload +; KNL-NEXT: kmovw %k0, 4(%rdi) +; KNL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 ## 2-byte Reload +; KNL-NEXT: kmovw %k0, 2(%rdi) +; KNL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 ## 2-byte Reload ; KNL-NEXT: kmovw 
%k0, (%rdi) ; KNL-NEXT: retq ; @@ -3166,403 +3296,488 @@ define void @store_64i1(<64 x i1>* %a, <64 x i1> %v) { ; ; AVX512DQ-LABEL: store_64i1: ; AVX512DQ: ## %bb.0: -; AVX512DQ-NEXT: kmovw %ecx, %k0 -; AVX512DQ-NEXT: kmovw %esi, %k2 -; AVX512DQ-NEXT: kshiftlw $15, %k0, %k1 -; AVX512DQ-NEXT: kshiftrw $14, %k1, %k1 -; AVX512DQ-NEXT: kxorw %k1, %k2, %k2 -; AVX512DQ-NEXT: kshiftrw $2, %k2, %k3 -; AVX512DQ-NEXT: kxorw %k0, %k3, %k0 +; AVX512DQ-NEXT: kmovw %esi, %k0 ; AVX512DQ-NEXT: kshiftlw $15, %k0, %k0 +; AVX512DQ-NEXT: kshiftrw $15, %k0, %k0 +; AVX512DQ-NEXT: kshiftlw $2, %k0, %k2 +; AVX512DQ-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill +; AVX512DQ-NEXT: kmovw %edx, %k1 +; AVX512DQ-NEXT: kshiftlw $1, %k1, %k1 +; AVX512DQ-NEXT: korw %k1, %k2, %k1 +; AVX512DQ-NEXT: korw %k1, %k0, %k0 +; AVX512DQ-NEXT: kshiftlw $14, %k0, %k0 +; AVX512DQ-NEXT: kshiftrw $14, %k0, %k0 +; AVX512DQ-NEXT: kshiftlw $3, %k0, %k3 +; AVX512DQ-NEXT: kmovw %ecx, %k1 +; AVX512DQ-NEXT: kshiftlw $2, %k1, %k1 +; AVX512DQ-NEXT: korw %k1, %k3, %k1 +; AVX512DQ-NEXT: korw %k1, %k0, %k0 +; AVX512DQ-NEXT: kshiftlw $13, %k0, %k0 ; AVX512DQ-NEXT: kshiftrw $13, %k0, %k0 -; AVX512DQ-NEXT: kxorw %k0, %k2, %k0 -; AVX512DQ-NEXT: kshiftrw $3, %k0, %k2 -; AVX512DQ-NEXT: kmovw %r8d, %k3 -; AVX512DQ-NEXT: kxorw %k3, %k2, %k2 -; AVX512DQ-NEXT: kshiftlw $15, %k2, %k2 -; AVX512DQ-NEXT: kshiftrw $12, %k2, %k2 -; AVX512DQ-NEXT: kxorw %k2, %k0, %k0 -; AVX512DQ-NEXT: kshiftrw $4, %k0, %k2 -; AVX512DQ-NEXT: kmovw %r9d, %k3 -; AVX512DQ-NEXT: kxorw %k3, %k2, %k2 -; AVX512DQ-NEXT: kshiftlw $15, %k2, %k2 -; AVX512DQ-NEXT: kshiftrw $11, %k2, %k2 -; AVX512DQ-NEXT: kxorw %k2, %k0, %k0 -; AVX512DQ-NEXT: kshiftrw $5, %k0, %k2 +; AVX512DQ-NEXT: kshiftlw $4, %k0, %k4 +; AVX512DQ-NEXT: kmovw %r8d, %k1 +; AVX512DQ-NEXT: kshiftlw $3, %k1, %k1 +; AVX512DQ-NEXT: korw %k1, %k4, %k1 +; AVX512DQ-NEXT: korw %k1, %k0, %k0 +; AVX512DQ-NEXT: kshiftlw $12, %k0, %k0 +; AVX512DQ-NEXT: kshiftrw $12, %k0, %k0 +; AVX512DQ-NEXT: kshiftlw $5, 
%k0, %k5 +; AVX512DQ-NEXT: kmovw %r9d, %k1 +; AVX512DQ-NEXT: kshiftlw $4, %k1, %k1 +; AVX512DQ-NEXT: korw %k1, %k5, %k1 +; AVX512DQ-NEXT: korw %k1, %k0, %k0 +; AVX512DQ-NEXT: kshiftlw $11, %k0, %k0 +; AVX512DQ-NEXT: kshiftrw $11, %k0, %k0 +; AVX512DQ-NEXT: kshiftlw $6, %k0, %k6 ; AVX512DQ-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQ-NEXT: kmovw %eax, %k3 -; AVX512DQ-NEXT: kxorw %k3, %k2, %k2 -; AVX512DQ-NEXT: kshiftlw $15, %k2, %k2 -; AVX512DQ-NEXT: kshiftrw $10, %k2, %k2 -; AVX512DQ-NEXT: kxorw %k2, %k0, %k0 -; AVX512DQ-NEXT: kshiftrw $6, %k0, %k2 +; AVX512DQ-NEXT: kmovw %eax, %k1 +; AVX512DQ-NEXT: kshiftlw $5, %k1, %k1 +; AVX512DQ-NEXT: korw %k1, %k6, %k1 +; AVX512DQ-NEXT: korw %k1, %k0, %k0 +; AVX512DQ-NEXT: kshiftlw $10, %k0, %k0 +; AVX512DQ-NEXT: kshiftrw $10, %k0, %k0 +; AVX512DQ-NEXT: kshiftlw $7, %k0, %k7 ; AVX512DQ-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQ-NEXT: kmovw %eax, %k3 -; AVX512DQ-NEXT: kxorw %k3, %k2, %k2 -; AVX512DQ-NEXT: kshiftlw $15, %k2, %k2 -; AVX512DQ-NEXT: kshiftrw $9, %k2, %k2 -; AVX512DQ-NEXT: kxorw %k2, %k0, %k0 -; AVX512DQ-NEXT: kshiftrw $7, %k0, %k2 +; AVX512DQ-NEXT: kmovw %eax, %k1 +; AVX512DQ-NEXT: kshiftlw $6, %k1, %k1 +; AVX512DQ-NEXT: korw %k1, %k7, %k1 +; AVX512DQ-NEXT: korw %k1, %k0, %k0 +; AVX512DQ-NEXT: kshiftlw $9, %k0, %k0 +; AVX512DQ-NEXT: kshiftrw $9, %k0, %k0 ; AVX512DQ-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQ-NEXT: kmovw %eax, %k3 -; AVX512DQ-NEXT: kxorw %k3, %k2, %k2 -; AVX512DQ-NEXT: kshiftlw $15, %k2, %k2 -; AVX512DQ-NEXT: kshiftrw $8, %k2, %k2 -; AVX512DQ-NEXT: kxorw %k2, %k0, %k0 -; AVX512DQ-NEXT: kshiftrw $8, %k0, %k2 +; AVX512DQ-NEXT: kmovw %eax, %k1 +; AVX512DQ-NEXT: kshiftlw $7, %k1, %k1 +; AVX512DQ-NEXT: kshiftlw $8, %k0, %k2 +; AVX512DQ-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill +; AVX512DQ-NEXT: korw %k1, %k2, %k1 +; AVX512DQ-NEXT: korw %k1, %k0, %k0 +; AVX512DQ-NEXT: kshiftlw $8, %k0, %k0 +; AVX512DQ-NEXT: kshiftrw $8, %k0, %k0 ; AVX512DQ-NEXT: movb {{[0-9]+}}(%rsp), %al -; 
AVX512DQ-NEXT: kmovw %eax, %k3 -; AVX512DQ-NEXT: kxorw %k3, %k2, %k2 -; AVX512DQ-NEXT: kshiftlw $15, %k2, %k2 -; AVX512DQ-NEXT: kshiftrw $7, %k2, %k2 -; AVX512DQ-NEXT: kxorw %k2, %k0, %k0 -; AVX512DQ-NEXT: kshiftrw $9, %k0, %k2 +; AVX512DQ-NEXT: kmovw %eax, %k1 +; AVX512DQ-NEXT: kshiftlw $8, %k1, %k1 +; AVX512DQ-NEXT: kshiftlw $9, %k0, %k2 +; AVX512DQ-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill +; AVX512DQ-NEXT: korw %k1, %k2, %k1 +; AVX512DQ-NEXT: korw %k1, %k0, %k0 +; AVX512DQ-NEXT: kshiftlw $7, %k0, %k0 +; AVX512DQ-NEXT: kshiftrw $7, %k0, %k0 ; AVX512DQ-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQ-NEXT: kmovw %eax, %k3 -; AVX512DQ-NEXT: kxorw %k3, %k2, %k2 -; AVX512DQ-NEXT: kshiftlw $15, %k2, %k2 -; AVX512DQ-NEXT: kshiftrw $6, %k2, %k2 -; AVX512DQ-NEXT: kxorw %k2, %k0, %k0 -; AVX512DQ-NEXT: kshiftrw $10, %k0, %k2 +; AVX512DQ-NEXT: kmovw %eax, %k1 +; AVX512DQ-NEXT: kshiftlw $9, %k1, %k1 +; AVX512DQ-NEXT: kshiftlw $10, %k0, %k2 +; AVX512DQ-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill +; AVX512DQ-NEXT: korw %k1, %k2, %k1 +; AVX512DQ-NEXT: korw %k1, %k0, %k0 +; AVX512DQ-NEXT: kshiftlw $6, %k0, %k0 +; AVX512DQ-NEXT: kshiftrw $6, %k0, %k0 ; AVX512DQ-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQ-NEXT: kmovw %eax, %k3 -; AVX512DQ-NEXT: kxorw %k3, %k2, %k2 -; AVX512DQ-NEXT: kshiftlw $15, %k2, %k2 -; AVX512DQ-NEXT: kshiftrw $5, %k2, %k2 -; AVX512DQ-NEXT: kxorw %k2, %k0, %k0 -; AVX512DQ-NEXT: kshiftrw $11, %k0, %k2 +; AVX512DQ-NEXT: kmovw %eax, %k1 +; AVX512DQ-NEXT: kshiftlw $10, %k1, %k1 +; AVX512DQ-NEXT: kshiftlw $11, %k0, %k2 +; AVX512DQ-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill +; AVX512DQ-NEXT: korw %k1, %k2, %k1 +; AVX512DQ-NEXT: korw %k1, %k0, %k0 +; AVX512DQ-NEXT: kshiftlw $5, %k0, %k0 +; AVX512DQ-NEXT: kshiftrw $5, %k0, %k0 ; AVX512DQ-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQ-NEXT: kmovw %eax, %k3 -; AVX512DQ-NEXT: kxorw %k3, %k2, %k2 -; AVX512DQ-NEXT: kshiftlw $15, %k2, %k2 -; AVX512DQ-NEXT: kshiftrw $4, %k2, %k2 -; 
AVX512DQ-NEXT: kxorw %k2, %k0, %k0 -; AVX512DQ-NEXT: kshiftrw $12, %k0, %k2 +; AVX512DQ-NEXT: kmovw %eax, %k1 +; AVX512DQ-NEXT: kshiftlw $11, %k1, %k1 +; AVX512DQ-NEXT: kshiftlw $12, %k0, %k2 +; AVX512DQ-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill +; AVX512DQ-NEXT: korw %k1, %k2, %k1 +; AVX512DQ-NEXT: korw %k1, %k0, %k0 +; AVX512DQ-NEXT: kshiftlw $4, %k0, %k0 +; AVX512DQ-NEXT: kshiftrw $4, %k0, %k0 ; AVX512DQ-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQ-NEXT: kmovw %eax, %k3 -; AVX512DQ-NEXT: kxorw %k3, %k2, %k2 -; AVX512DQ-NEXT: kshiftlw $15, %k2, %k2 -; AVX512DQ-NEXT: kshiftrw $3, %k2, %k2 -; AVX512DQ-NEXT: kxorw %k2, %k0, %k0 -; AVX512DQ-NEXT: kshiftrw $13, %k0, %k2 +; AVX512DQ-NEXT: kmovw %eax, %k1 +; AVX512DQ-NEXT: kshiftlw $12, %k1, %k1 +; AVX512DQ-NEXT: kshiftlw $13, %k0, %k2 +; AVX512DQ-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill +; AVX512DQ-NEXT: korw %k1, %k2, %k1 +; AVX512DQ-NEXT: korw %k1, %k0, %k0 +; AVX512DQ-NEXT: kshiftlw $3, %k0, %k0 +; AVX512DQ-NEXT: kshiftrw $3, %k0, %k0 ; AVX512DQ-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQ-NEXT: kmovw %eax, %k3 -; AVX512DQ-NEXT: kxorw %k3, %k2, %k2 -; AVX512DQ-NEXT: kshiftlw $15, %k2, %k2 -; AVX512DQ-NEXT: kshiftrw $2, %k2, %k2 -; AVX512DQ-NEXT: kxorw %k2, %k0, %k0 -; AVX512DQ-NEXT: kshiftrw $14, %k0, %k2 +; AVX512DQ-NEXT: kmovw %eax, %k1 +; AVX512DQ-NEXT: kshiftlw $13, %k1, %k1 +; AVX512DQ-NEXT: kshiftlw $14, %k0, %k2 +; AVX512DQ-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill +; AVX512DQ-NEXT: korw %k1, %k2, %k1 +; AVX512DQ-NEXT: korw %k1, %k0, %k0 +; AVX512DQ-NEXT: kshiftlw $2, %k0, %k0 +; AVX512DQ-NEXT: kshiftrw $2, %k0, %k2 ; AVX512DQ-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQ-NEXT: kmovw %eax, %k3 -; AVX512DQ-NEXT: kxorw %k3, %k2, %k2 -; AVX512DQ-NEXT: kshiftlw $14, %k2, %k2 -; AVX512DQ-NEXT: kxorw %k2, %k0, %k0 +; AVX512DQ-NEXT: kmovw %eax, %k1 +; AVX512DQ-NEXT: kshiftlw $14, %k1, %k0 +; AVX512DQ-NEXT: kshiftlw $15, %k0, %k1 +; AVX512DQ-NEXT: kmovw %k1, 
{{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill +; AVX512DQ-NEXT: korw %k0, %k1, %k0 +; AVX512DQ-NEXT: korw %k0, %k2, %k0 ; AVX512DQ-NEXT: kshiftlw $1, %k0, %k0 ; AVX512DQ-NEXT: kshiftrw $1, %k0, %k0 ; AVX512DQ-NEXT: movb {{[0-9]+}}(%rsp), %al ; AVX512DQ-NEXT: kmovw %eax, %k2 ; AVX512DQ-NEXT: kshiftlw $15, %k2, %k2 ; AVX512DQ-NEXT: korw %k2, %k0, %k0 +; AVX512DQ-NEXT: kmovw %k0, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill +; AVX512DQ-NEXT: movb {{[0-9]+}}(%rsp), %al +; AVX512DQ-NEXT: kmovw %eax, %k0 +; AVX512DQ-NEXT: kshiftlw $15, %k0, %k0 +; AVX512DQ-NEXT: kshiftrw $15, %k0, %k0 ; AVX512DQ-NEXT: movb {{[0-9]+}}(%rsp), %al ; AVX512DQ-NEXT: kmovw %eax, %k2 +; AVX512DQ-NEXT: kshiftlw $1, %k2, %k2 +; AVX512DQ-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 ## 2-byte Reload +; AVX512DQ-NEXT: korw %k2, %k1, %k2 +; AVX512DQ-NEXT: korw %k2, %k0, %k0 +; AVX512DQ-NEXT: kshiftlw $14, %k0, %k0 +; AVX512DQ-NEXT: kshiftrw $14, %k0, %k0 ; AVX512DQ-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQ-NEXT: kmovw %eax, %k3 -; AVX512DQ-NEXT: kxorw %k1, %k3, %k3 -; AVX512DQ-NEXT: kshiftrw $2, %k3, %k4 -; AVX512DQ-NEXT: kxorw %k2, %k4, %k2 -; AVX512DQ-NEXT: kshiftlw $15, %k2, %k2 -; AVX512DQ-NEXT: kshiftrw $13, %k2, %k2 -; AVX512DQ-NEXT: kxorw %k2, %k3, %k2 -; AVX512DQ-NEXT: kshiftrw $3, %k2, %k3 +; AVX512DQ-NEXT: kmovw %eax, %k2 +; AVX512DQ-NEXT: kshiftlw $2, %k2, %k2 +; AVX512DQ-NEXT: kmovw %k3, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill +; AVX512DQ-NEXT: korw %k2, %k3, %k2 +; AVX512DQ-NEXT: korw %k2, %k0, %k0 +; AVX512DQ-NEXT: kshiftlw $13, %k0, %k0 +; AVX512DQ-NEXT: kshiftrw $13, %k0, %k0 ; AVX512DQ-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQ-NEXT: kmovw %eax, %k4 -; AVX512DQ-NEXT: kxorw %k4, %k3, %k3 -; AVX512DQ-NEXT: kshiftlw $15, %k3, %k3 -; AVX512DQ-NEXT: kshiftrw $12, %k3, %k3 -; AVX512DQ-NEXT: kxorw %k3, %k2, %k2 -; AVX512DQ-NEXT: kshiftrw $4, %k2, %k3 +; AVX512DQ-NEXT: kmovw %eax, %k2 +; AVX512DQ-NEXT: kshiftlw $3, %k2, %k2 +; AVX512DQ-NEXT: kmovw %k4, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill +; 
AVX512DQ-NEXT: korw %k2, %k4, %k2 +; AVX512DQ-NEXT: korw %k2, %k0, %k0 +; AVX512DQ-NEXT: kshiftlw $12, %k0, %k0 +; AVX512DQ-NEXT: kshiftrw $12, %k0, %k0 ; AVX512DQ-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQ-NEXT: kmovw %eax, %k4 -; AVX512DQ-NEXT: kxorw %k4, %k3, %k3 -; AVX512DQ-NEXT: kshiftlw $15, %k3, %k3 -; AVX512DQ-NEXT: kshiftrw $11, %k3, %k3 -; AVX512DQ-NEXT: kxorw %k3, %k2, %k2 -; AVX512DQ-NEXT: kshiftrw $5, %k2, %k3 +; AVX512DQ-NEXT: kmovw %eax, %k2 +; AVX512DQ-NEXT: kshiftlw $4, %k2, %k2 +; AVX512DQ-NEXT: kmovw %k5, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill +; AVX512DQ-NEXT: korw %k2, %k5, %k2 +; AVX512DQ-NEXT: korw %k2, %k0, %k0 +; AVX512DQ-NEXT: kshiftlw $11, %k0, %k0 +; AVX512DQ-NEXT: kshiftrw $11, %k0, %k0 ; AVX512DQ-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQ-NEXT: kmovw %eax, %k4 -; AVX512DQ-NEXT: kxorw %k4, %k3, %k3 -; AVX512DQ-NEXT: kshiftlw $15, %k3, %k3 -; AVX512DQ-NEXT: kshiftrw $10, %k3, %k3 -; AVX512DQ-NEXT: kxorw %k3, %k2, %k2 -; AVX512DQ-NEXT: kshiftrw $6, %k2, %k3 +; AVX512DQ-NEXT: kmovw %eax, %k2 +; AVX512DQ-NEXT: kshiftlw $5, %k2, %k2 +; AVX512DQ-NEXT: korw %k2, %k6, %k2 +; AVX512DQ-NEXT: korw %k2, %k0, %k0 +; AVX512DQ-NEXT: kshiftlw $10, %k0, %k0 +; AVX512DQ-NEXT: kshiftrw $10, %k0, %k0 ; AVX512DQ-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQ-NEXT: kmovw %eax, %k4 -; AVX512DQ-NEXT: kxorw %k4, %k3, %k3 -; AVX512DQ-NEXT: kshiftlw $15, %k3, %k3 -; AVX512DQ-NEXT: kshiftrw $9, %k3, %k3 -; AVX512DQ-NEXT: kxorw %k3, %k2, %k2 -; AVX512DQ-NEXT: kshiftrw $7, %k2, %k3 +; AVX512DQ-NEXT: kmovw %eax, %k2 +; AVX512DQ-NEXT: kshiftlw $6, %k2, %k2 +; AVX512DQ-NEXT: kmovw %k7, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill +; AVX512DQ-NEXT: korw %k2, %k7, %k2 +; AVX512DQ-NEXT: korw %k2, %k0, %k0 +; AVX512DQ-NEXT: kshiftlw $9, %k0, %k0 +; AVX512DQ-NEXT: kshiftrw $9, %k0, %k0 ; AVX512DQ-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQ-NEXT: kmovw %eax, %k4 -; AVX512DQ-NEXT: kxorw %k4, %k3, %k3 -; AVX512DQ-NEXT: kshiftlw $15, %k3, %k3 -; AVX512DQ-NEXT: kshiftrw $8, 
%k3, %k3 -; AVX512DQ-NEXT: kxorw %k3, %k2, %k2 -; AVX512DQ-NEXT: kshiftrw $8, %k2, %k3 +; AVX512DQ-NEXT: kmovw %eax, %k2 +; AVX512DQ-NEXT: kshiftlw $7, %k2, %k2 +; AVX512DQ-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 ## 2-byte Reload +; AVX512DQ-NEXT: korw %k2, %k1, %k2 +; AVX512DQ-NEXT: korw %k2, %k0, %k0 +; AVX512DQ-NEXT: kshiftlw $8, %k0, %k0 +; AVX512DQ-NEXT: kshiftrw $8, %k0, %k0 ; AVX512DQ-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQ-NEXT: kmovw %eax, %k4 -; AVX512DQ-NEXT: kxorw %k4, %k3, %k3 -; AVX512DQ-NEXT: kshiftlw $15, %k3, %k3 -; AVX512DQ-NEXT: kshiftrw $7, %k3, %k3 -; AVX512DQ-NEXT: kxorw %k3, %k2, %k2 -; AVX512DQ-NEXT: kshiftrw $9, %k2, %k3 +; AVX512DQ-NEXT: kmovw %eax, %k2 +; AVX512DQ-NEXT: kshiftlw $8, %k2, %k2 +; AVX512DQ-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 ## 2-byte Reload +; AVX512DQ-NEXT: korw %k2, %k1, %k2 +; AVX512DQ-NEXT: korw %k2, %k0, %k0 +; AVX512DQ-NEXT: kshiftlw $7, %k0, %k0 +; AVX512DQ-NEXT: kshiftrw $7, %k0, %k0 ; AVX512DQ-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQ-NEXT: kmovw %eax, %k4 -; AVX512DQ-NEXT: kxorw %k4, %k3, %k3 -; AVX512DQ-NEXT: kshiftlw $15, %k3, %k3 -; AVX512DQ-NEXT: kshiftrw $6, %k3, %k3 -; AVX512DQ-NEXT: kxorw %k3, %k2, %k2 -; AVX512DQ-NEXT: kshiftrw $10, %k2, %k3 +; AVX512DQ-NEXT: kmovw %eax, %k2 +; AVX512DQ-NEXT: kshiftlw $9, %k2, %k2 +; AVX512DQ-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 ## 2-byte Reload +; AVX512DQ-NEXT: korw %k2, %k1, %k2 +; AVX512DQ-NEXT: korw %k2, %k0, %k0 +; AVX512DQ-NEXT: kshiftlw $6, %k0, %k0 +; AVX512DQ-NEXT: kshiftrw $6, %k0, %k0 ; AVX512DQ-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQ-NEXT: kmovw %eax, %k4 -; AVX512DQ-NEXT: kxorw %k4, %k3, %k3 -; AVX512DQ-NEXT: kshiftlw $15, %k3, %k3 -; AVX512DQ-NEXT: kshiftrw $5, %k3, %k3 -; AVX512DQ-NEXT: kxorw %k3, %k2, %k2 -; AVX512DQ-NEXT: kshiftrw $11, %k2, %k3 +; AVX512DQ-NEXT: kmovw %eax, %k2 +; AVX512DQ-NEXT: kshiftlw $10, %k2, %k2 +; AVX512DQ-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 ## 2-byte Reload +; AVX512DQ-NEXT: korw %k2, %k1, %k2 +; 
AVX512DQ-NEXT: korw %k2, %k0, %k0 +; AVX512DQ-NEXT: kshiftlw $5, %k0, %k0 +; AVX512DQ-NEXT: kshiftrw $5, %k0, %k0 ; AVX512DQ-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQ-NEXT: kmovw %eax, %k4 -; AVX512DQ-NEXT: kxorw %k4, %k3, %k3 -; AVX512DQ-NEXT: kshiftlw $15, %k3, %k3 -; AVX512DQ-NEXT: kshiftrw $4, %k3, %k3 -; AVX512DQ-NEXT: kxorw %k3, %k2, %k2 -; AVX512DQ-NEXT: kshiftrw $12, %k2, %k3 +; AVX512DQ-NEXT: kmovw %eax, %k2 +; AVX512DQ-NEXT: kshiftlw $11, %k2, %k2 +; AVX512DQ-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 ## 2-byte Reload +; AVX512DQ-NEXT: korw %k2, %k1, %k2 +; AVX512DQ-NEXT: korw %k2, %k0, %k0 +; AVX512DQ-NEXT: kshiftlw $4, %k0, %k0 +; AVX512DQ-NEXT: kshiftrw $4, %k0, %k0 ; AVX512DQ-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQ-NEXT: kmovw %eax, %k4 -; AVX512DQ-NEXT: kxorw %k4, %k3, %k3 -; AVX512DQ-NEXT: kshiftlw $15, %k3, %k3 -; AVX512DQ-NEXT: kshiftrw $3, %k3, %k3 -; AVX512DQ-NEXT: kxorw %k3, %k2, %k2 -; AVX512DQ-NEXT: kshiftrw $13, %k2, %k3 +; AVX512DQ-NEXT: kmovw %eax, %k2 +; AVX512DQ-NEXT: kshiftlw $12, %k2, %k2 +; AVX512DQ-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 ## 2-byte Reload +; AVX512DQ-NEXT: korw %k2, %k1, %k2 +; AVX512DQ-NEXT: korw %k2, %k0, %k0 +; AVX512DQ-NEXT: kshiftlw $3, %k0, %k0 +; AVX512DQ-NEXT: kshiftrw $3, %k0, %k0 ; AVX512DQ-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQ-NEXT: kmovw %eax, %k4 -; AVX512DQ-NEXT: kxorw %k4, %k3, %k3 -; AVX512DQ-NEXT: kshiftlw $15, %k3, %k3 -; AVX512DQ-NEXT: kshiftrw $2, %k3, %k3 -; AVX512DQ-NEXT: kxorw %k3, %k2, %k2 -; AVX512DQ-NEXT: kshiftrw $14, %k2, %k3 +; AVX512DQ-NEXT: kmovw %eax, %k2 +; AVX512DQ-NEXT: kshiftlw $13, %k2, %k2 +; AVX512DQ-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 ## 2-byte Reload +; AVX512DQ-NEXT: korw %k2, %k1, %k2 +; AVX512DQ-NEXT: korw %k2, %k0, %k0 +; AVX512DQ-NEXT: kshiftlw $2, %k0, %k0 +; AVX512DQ-NEXT: kshiftrw $2, %k0, %k0 ; AVX512DQ-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQ-NEXT: kmovw %eax, %k4 -; AVX512DQ-NEXT: kxorw %k4, %k3, %k3 -; AVX512DQ-NEXT: kshiftlw $14, %k3, %k3 
-; AVX512DQ-NEXT: kxorw %k3, %k2, %k2 -; AVX512DQ-NEXT: kshiftlw $1, %k2, %k2 -; AVX512DQ-NEXT: kshiftrw $1, %k2, %k2 +; AVX512DQ-NEXT: kmovw %eax, %k2 +; AVX512DQ-NEXT: kshiftlw $14, %k2, %k2 +; AVX512DQ-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 ## 2-byte Reload +; AVX512DQ-NEXT: korw %k2, %k1, %k2 +; AVX512DQ-NEXT: korw %k2, %k0, %k0 +; AVX512DQ-NEXT: kshiftlw $1, %k0, %k0 +; AVX512DQ-NEXT: kshiftrw $1, %k0, %k0 ; AVX512DQ-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQ-NEXT: kmovw %eax, %k3 -; AVX512DQ-NEXT: kshiftlw $15, %k3, %k3 -; AVX512DQ-NEXT: korw %k3, %k2, %k2 +; AVX512DQ-NEXT: kmovw %eax, %k2 +; AVX512DQ-NEXT: kshiftlw $15, %k2, %k2 +; AVX512DQ-NEXT: korw %k2, %k0, %k0 +; AVX512DQ-NEXT: kmovw %k0, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill ; AVX512DQ-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQ-NEXT: kmovw %eax, %k3 +; AVX512DQ-NEXT: kmovw %eax, %k0 +; AVX512DQ-NEXT: kshiftlw $15, %k0, %k0 +; AVX512DQ-NEXT: kshiftrw $15, %k0, %k0 ; AVX512DQ-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQ-NEXT: kmovw %eax, %k4 -; AVX512DQ-NEXT: kxorw %k1, %k4, %k4 -; AVX512DQ-NEXT: kshiftrw $2, %k4, %k5 -; AVX512DQ-NEXT: kxorw %k3, %k5, %k3 -; AVX512DQ-NEXT: kshiftlw $15, %k3, %k3 -; AVX512DQ-NEXT: kshiftrw $13, %k3, %k3 -; AVX512DQ-NEXT: kxorw %k3, %k4, %k3 -; AVX512DQ-NEXT: kshiftrw $3, %k3, %k4 +; AVX512DQ-NEXT: kmovw %eax, %k2 +; AVX512DQ-NEXT: kshiftlw $1, %k2, %k2 +; AVX512DQ-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 ## 2-byte Reload +; AVX512DQ-NEXT: korw %k2, %k1, %k2 +; AVX512DQ-NEXT: korw %k2, %k0, %k0 +; AVX512DQ-NEXT: kshiftlw $14, %k0, %k0 +; AVX512DQ-NEXT: kshiftrw $14, %k0, %k0 ; AVX512DQ-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQ-NEXT: kmovw %eax, %k5 -; AVX512DQ-NEXT: kxorw %k5, %k4, %k4 -; AVX512DQ-NEXT: kshiftlw $15, %k4, %k4 -; AVX512DQ-NEXT: kshiftrw $12, %k4, %k4 -; AVX512DQ-NEXT: kxorw %k4, %k3, %k3 -; AVX512DQ-NEXT: kshiftrw $4, %k3, %k4 +; AVX512DQ-NEXT: kmovw %eax, %k2 +; AVX512DQ-NEXT: kshiftlw $2, %k2, %k2 +; AVX512DQ-NEXT: korw %k2, %k3, %k2 +; 
AVX512DQ-NEXT: korw %k2, %k0, %k0 +; AVX512DQ-NEXT: kshiftlw $13, %k0, %k0 +; AVX512DQ-NEXT: kshiftrw $13, %k0, %k0 ; AVX512DQ-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQ-NEXT: kmovw %eax, %k5 -; AVX512DQ-NEXT: kxorw %k5, %k4, %k4 -; AVX512DQ-NEXT: kshiftlw $15, %k4, %k4 -; AVX512DQ-NEXT: kshiftrw $11, %k4, %k4 -; AVX512DQ-NEXT: kxorw %k4, %k3, %k3 -; AVX512DQ-NEXT: kshiftrw $5, %k3, %k4 +; AVX512DQ-NEXT: kmovw %eax, %k2 +; AVX512DQ-NEXT: kshiftlw $3, %k2, %k2 +; AVX512DQ-NEXT: korw %k2, %k4, %k2 +; AVX512DQ-NEXT: korw %k2, %k0, %k0 +; AVX512DQ-NEXT: kshiftlw $12, %k0, %k0 +; AVX512DQ-NEXT: kshiftrw $12, %k0, %k0 ; AVX512DQ-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQ-NEXT: kmovw %eax, %k5 -; AVX512DQ-NEXT: kxorw %k5, %k4, %k4 -; AVX512DQ-NEXT: kshiftlw $15, %k4, %k4 -; AVX512DQ-NEXT: kshiftrw $10, %k4, %k4 -; AVX512DQ-NEXT: kxorw %k4, %k3, %k3 -; AVX512DQ-NEXT: kshiftrw $6, %k3, %k4 +; AVX512DQ-NEXT: kmovw %eax, %k2 +; AVX512DQ-NEXT: kshiftlw $4, %k2, %k2 +; AVX512DQ-NEXT: korw %k2, %k5, %k2 +; AVX512DQ-NEXT: korw %k2, %k0, %k0 +; AVX512DQ-NEXT: kshiftlw $11, %k0, %k0 +; AVX512DQ-NEXT: kshiftrw $11, %k0, %k0 ; AVX512DQ-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQ-NEXT: kmovw %eax, %k5 -; AVX512DQ-NEXT: kxorw %k5, %k4, %k4 -; AVX512DQ-NEXT: kshiftlw $15, %k4, %k4 -; AVX512DQ-NEXT: kshiftrw $9, %k4, %k4 -; AVX512DQ-NEXT: kxorw %k4, %k3, %k3 -; AVX512DQ-NEXT: kshiftrw $7, %k3, %k4 +; AVX512DQ-NEXT: kmovw %eax, %k2 +; AVX512DQ-NEXT: kshiftlw $5, %k2, %k2 +; AVX512DQ-NEXT: korw %k2, %k6, %k2 +; AVX512DQ-NEXT: korw %k2, %k0, %k0 +; AVX512DQ-NEXT: kshiftlw $10, %k0, %k0 +; AVX512DQ-NEXT: kshiftrw $10, %k0, %k0 ; AVX512DQ-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQ-NEXT: kmovw %eax, %k5 -; AVX512DQ-NEXT: kxorw %k5, %k4, %k4 -; AVX512DQ-NEXT: kshiftlw $15, %k4, %k4 -; AVX512DQ-NEXT: kshiftrw $8, %k4, %k4 -; AVX512DQ-NEXT: kxorw %k4, %k3, %k3 -; AVX512DQ-NEXT: kshiftrw $8, %k3, %k4 +; AVX512DQ-NEXT: kmovw %eax, %k2 +; AVX512DQ-NEXT: kshiftlw $6, %k2, %k2 +; AVX512DQ-NEXT: 
korw %k2, %k7, %k2 +; AVX512DQ-NEXT: korw %k2, %k0, %k0 +; AVX512DQ-NEXT: kshiftlw $9, %k0, %k0 +; AVX512DQ-NEXT: kshiftrw $9, %k0, %k0 ; AVX512DQ-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQ-NEXT: kmovw %eax, %k5 -; AVX512DQ-NEXT: kxorw %k5, %k4, %k4 -; AVX512DQ-NEXT: kshiftlw $15, %k4, %k4 -; AVX512DQ-NEXT: kshiftrw $7, %k4, %k4 -; AVX512DQ-NEXT: kxorw %k4, %k3, %k3 -; AVX512DQ-NEXT: kshiftrw $9, %k3, %k4 +; AVX512DQ-NEXT: kmovw %eax, %k2 +; AVX512DQ-NEXT: kshiftlw $7, %k2, %k2 +; AVX512DQ-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 ## 2-byte Reload +; AVX512DQ-NEXT: korw %k2, %k1, %k2 +; AVX512DQ-NEXT: korw %k2, %k0, %k0 +; AVX512DQ-NEXT: kshiftlw $8, %k0, %k0 +; AVX512DQ-NEXT: kshiftrw $8, %k0, %k0 ; AVX512DQ-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQ-NEXT: kmovw %eax, %k5 -; AVX512DQ-NEXT: kxorw %k5, %k4, %k4 -; AVX512DQ-NEXT: kshiftlw $15, %k4, %k4 -; AVX512DQ-NEXT: kshiftrw $6, %k4, %k4 -; AVX512DQ-NEXT: kxorw %k4, %k3, %k3 -; AVX512DQ-NEXT: kshiftrw $10, %k3, %k4 +; AVX512DQ-NEXT: kmovw %eax, %k2 +; AVX512DQ-NEXT: kshiftlw $8, %k2, %k2 +; AVX512DQ-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 ## 2-byte Reload +; AVX512DQ-NEXT: korw %k2, %k3, %k2 +; AVX512DQ-NEXT: korw %k2, %k0, %k0 +; AVX512DQ-NEXT: kshiftlw $7, %k0, %k0 +; AVX512DQ-NEXT: kshiftrw $7, %k0, %k0 ; AVX512DQ-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQ-NEXT: kmovw %eax, %k5 -; AVX512DQ-NEXT: kxorw %k5, %k4, %k4 -; AVX512DQ-NEXT: kshiftlw $15, %k4, %k4 -; AVX512DQ-NEXT: kshiftrw $5, %k4, %k4 -; AVX512DQ-NEXT: kxorw %k4, %k3, %k3 -; AVX512DQ-NEXT: kshiftrw $11, %k3, %k4 +; AVX512DQ-NEXT: kmovw %eax, %k2 +; AVX512DQ-NEXT: kshiftlw $9, %k2, %k2 +; AVX512DQ-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 ## 2-byte Reload +; AVX512DQ-NEXT: korw %k2, %k4, %k2 +; AVX512DQ-NEXT: korw %k2, %k0, %k0 +; AVX512DQ-NEXT: kshiftlw $6, %k0, %k0 +; AVX512DQ-NEXT: kshiftrw $6, %k0, %k0 ; AVX512DQ-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQ-NEXT: kmovw %eax, %k5 -; AVX512DQ-NEXT: kxorw %k5, %k4, %k4 -; AVX512DQ-NEXT: 
kshiftlw $15, %k4, %k4 -; AVX512DQ-NEXT: kshiftrw $4, %k4, %k4 -; AVX512DQ-NEXT: kxorw %k4, %k3, %k3 -; AVX512DQ-NEXT: kshiftrw $12, %k3, %k4 +; AVX512DQ-NEXT: kmovw %eax, %k2 +; AVX512DQ-NEXT: kshiftlw $10, %k2, %k2 +; AVX512DQ-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 ## 2-byte Reload +; AVX512DQ-NEXT: korw %k2, %k5, %k2 +; AVX512DQ-NEXT: korw %k2, %k0, %k0 +; AVX512DQ-NEXT: kshiftlw $5, %k0, %k0 +; AVX512DQ-NEXT: kshiftrw $5, %k0, %k0 ; AVX512DQ-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQ-NEXT: kmovw %eax, %k5 -; AVX512DQ-NEXT: kxorw %k5, %k4, %k4 -; AVX512DQ-NEXT: kshiftlw $15, %k4, %k4 -; AVX512DQ-NEXT: kshiftrw $3, %k4, %k4 -; AVX512DQ-NEXT: kxorw %k4, %k3, %k3 -; AVX512DQ-NEXT: kshiftrw $13, %k3, %k4 +; AVX512DQ-NEXT: kmovw %eax, %k2 +; AVX512DQ-NEXT: kshiftlw $11, %k2, %k2 +; AVX512DQ-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 ## 2-byte Reload +; AVX512DQ-NEXT: korw %k2, %k7, %k2 +; AVX512DQ-NEXT: korw %k2, %k0, %k0 +; AVX512DQ-NEXT: kshiftlw $4, %k0, %k0 +; AVX512DQ-NEXT: kshiftrw $4, %k0, %k0 ; AVX512DQ-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQ-NEXT: kmovw %eax, %k5 -; AVX512DQ-NEXT: kxorw %k5, %k4, %k4 -; AVX512DQ-NEXT: kshiftlw $15, %k4, %k4 -; AVX512DQ-NEXT: kshiftrw $2, %k4, %k4 -; AVX512DQ-NEXT: kxorw %k4, %k3, %k3 -; AVX512DQ-NEXT: kshiftrw $14, %k3, %k4 +; AVX512DQ-NEXT: kmovw %eax, %k2 +; AVX512DQ-NEXT: kshiftlw $12, %k2, %k2 +; AVX512DQ-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 ## 2-byte Reload +; AVX512DQ-NEXT: korw %k2, %k7, %k2 +; AVX512DQ-NEXT: korw %k2, %k0, %k0 +; AVX512DQ-NEXT: kshiftlw $3, %k0, %k0 +; AVX512DQ-NEXT: kshiftrw $3, %k0, %k0 ; AVX512DQ-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQ-NEXT: kmovw %eax, %k5 -; AVX512DQ-NEXT: kxorw %k5, %k4, %k4 -; AVX512DQ-NEXT: kshiftlw $14, %k4, %k4 -; AVX512DQ-NEXT: kxorw %k4, %k3, %k3 -; AVX512DQ-NEXT: kshiftlw $1, %k3, %k3 -; AVX512DQ-NEXT: kshiftrw $1, %k3, %k3 +; AVX512DQ-NEXT: kmovw %eax, %k2 +; AVX512DQ-NEXT: kshiftlw $13, %k2, %k2 +; AVX512DQ-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 
## 2-byte Reload +; AVX512DQ-NEXT: korw %k2, %k7, %k2 +; AVX512DQ-NEXT: korw %k2, %k0, %k0 +; AVX512DQ-NEXT: kshiftlw $2, %k0, %k0 +; AVX512DQ-NEXT: kshiftrw $2, %k0, %k0 ; AVX512DQ-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQ-NEXT: kmovw %eax, %k4 -; AVX512DQ-NEXT: kshiftlw $15, %k4, %k4 -; AVX512DQ-NEXT: korw %k4, %k3, %k3 +; AVX512DQ-NEXT: kmovw %eax, %k2 +; AVX512DQ-NEXT: kshiftlw $14, %k2, %k2 +; AVX512DQ-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 ## 2-byte Reload +; AVX512DQ-NEXT: korw %k2, %k7, %k2 +; AVX512DQ-NEXT: korw %k2, %k0, %k0 +; AVX512DQ-NEXT: kshiftlw $1, %k0, %k0 +; AVX512DQ-NEXT: kshiftrw $1, %k0, %k0 ; AVX512DQ-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQ-NEXT: kmovw %eax, %k4 +; AVX512DQ-NEXT: kmovw %eax, %k2 +; AVX512DQ-NEXT: kshiftlw $15, %k2, %k2 +; AVX512DQ-NEXT: korw %k2, %k0, %k0 +; AVX512DQ-NEXT: kmovw %k0, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill ; AVX512DQ-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQ-NEXT: kmovw %eax, %k5 -; AVX512DQ-NEXT: kxorw %k1, %k5, %k1 -; AVX512DQ-NEXT: kshiftrw $2, %k1, %k5 -; AVX512DQ-NEXT: kxorw %k4, %k5, %k4 -; AVX512DQ-NEXT: kshiftlw $15, %k4, %k4 -; AVX512DQ-NEXT: kshiftrw $13, %k4, %k4 -; AVX512DQ-NEXT: kxorw %k4, %k1, %k1 -; AVX512DQ-NEXT: kshiftrw $3, %k1, %k4 +; AVX512DQ-NEXT: kmovw %eax, %k2 +; AVX512DQ-NEXT: kshiftlw $1, %k2, %k2 +; AVX512DQ-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 ## 2-byte Reload +; AVX512DQ-NEXT: korw %k2, %k0, %k2 ; AVX512DQ-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQ-NEXT: kmovw %eax, %k5 -; AVX512DQ-NEXT: kxorw %k5, %k4, %k4 -; AVX512DQ-NEXT: kshiftlw $15, %k4, %k4 -; AVX512DQ-NEXT: kshiftrw $12, %k4, %k4 -; AVX512DQ-NEXT: kxorw %k4, %k1, %k1 -; AVX512DQ-NEXT: kshiftrw $4, %k1, %k4 +; AVX512DQ-NEXT: kmovw %eax, %k7 +; AVX512DQ-NEXT: kshiftlw $15, %k7, %k7 +; AVX512DQ-NEXT: kshiftrw $15, %k7, %k7 +; AVX512DQ-NEXT: korw %k2, %k7, %k2 ; AVX512DQ-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQ-NEXT: kmovw %eax, %k5 -; AVX512DQ-NEXT: kxorw %k5, %k4, %k4 -; AVX512DQ-NEXT: kshiftlw $15, 
%k4, %k4 -; AVX512DQ-NEXT: kshiftrw $11, %k4, %k4 -; AVX512DQ-NEXT: kxorw %k4, %k1, %k1 -; AVX512DQ-NEXT: kshiftrw $5, %k1, %k4 +; AVX512DQ-NEXT: kmovw %eax, %k7 +; AVX512DQ-NEXT: kshiftlw $2, %k7, %k7 +; AVX512DQ-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 ## 2-byte Reload +; AVX512DQ-NEXT: korw %k7, %k0, %k7 +; AVX512DQ-NEXT: kshiftlw $14, %k2, %k2 +; AVX512DQ-NEXT: kshiftrw $14, %k2, %k2 +; AVX512DQ-NEXT: korw %k7, %k2, %k2 ; AVX512DQ-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQ-NEXT: kmovw %eax, %k5 -; AVX512DQ-NEXT: kxorw %k5, %k4, %k4 -; AVX512DQ-NEXT: kshiftlw $15, %k4, %k4 -; AVX512DQ-NEXT: kshiftrw $10, %k4, %k4 -; AVX512DQ-NEXT: kxorw %k4, %k1, %k1 -; AVX512DQ-NEXT: kshiftrw $6, %k1, %k4 +; AVX512DQ-NEXT: kmovw %eax, %k7 +; AVX512DQ-NEXT: kshiftlw $3, %k7, %k7 +; AVX512DQ-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 ## 2-byte Reload +; AVX512DQ-NEXT: korw %k7, %k0, %k7 +; AVX512DQ-NEXT: kshiftlw $13, %k2, %k2 +; AVX512DQ-NEXT: kshiftrw $13, %k2, %k2 +; AVX512DQ-NEXT: korw %k7, %k2, %k2 ; AVX512DQ-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQ-NEXT: kmovw %eax, %k5 -; AVX512DQ-NEXT: kxorw %k5, %k4, %k4 -; AVX512DQ-NEXT: kshiftlw $15, %k4, %k4 -; AVX512DQ-NEXT: kshiftrw $9, %k4, %k4 -; AVX512DQ-NEXT: kxorw %k4, %k1, %k1 -; AVX512DQ-NEXT: kshiftrw $7, %k1, %k4 +; AVX512DQ-NEXT: kmovw %eax, %k7 +; AVX512DQ-NEXT: kshiftlw $4, %k7, %k7 +; AVX512DQ-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 ## 2-byte Reload +; AVX512DQ-NEXT: korw %k7, %k0, %k7 +; AVX512DQ-NEXT: kshiftlw $12, %k2, %k2 +; AVX512DQ-NEXT: kshiftrw $12, %k2, %k2 +; AVX512DQ-NEXT: korw %k7, %k2, %k2 ; AVX512DQ-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQ-NEXT: kmovw %eax, %k5 -; AVX512DQ-NEXT: kxorw %k5, %k4, %k4 -; AVX512DQ-NEXT: kshiftlw $15, %k4, %k4 -; AVX512DQ-NEXT: kshiftrw $8, %k4, %k4 -; AVX512DQ-NEXT: kxorw %k4, %k1, %k1 -; AVX512DQ-NEXT: kshiftrw $8, %k1, %k4 +; AVX512DQ-NEXT: kmovw %eax, %k7 +; AVX512DQ-NEXT: kshiftlw $5, %k7, %k7 +; AVX512DQ-NEXT: korw %k7, %k6, %k7 +; AVX512DQ-NEXT: kshiftlw $11, 
%k2, %k2 +; AVX512DQ-NEXT: kshiftrw $11, %k2, %k2 +; AVX512DQ-NEXT: korw %k7, %k2, %k2 ; AVX512DQ-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQ-NEXT: kmovw %eax, %k5 -; AVX512DQ-NEXT: kxorw %k5, %k4, %k4 -; AVX512DQ-NEXT: kshiftlw $15, %k4, %k4 -; AVX512DQ-NEXT: kshiftrw $7, %k4, %k4 -; AVX512DQ-NEXT: kxorw %k4, %k1, %k1 -; AVX512DQ-NEXT: kshiftrw $9, %k1, %k4 +; AVX512DQ-NEXT: kmovw %eax, %k7 +; AVX512DQ-NEXT: kshiftlw $6, %k7, %k7 +; AVX512DQ-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 ## 2-byte Reload +; AVX512DQ-NEXT: korw %k7, %k0, %k7 +; AVX512DQ-NEXT: kshiftlw $10, %k2, %k2 +; AVX512DQ-NEXT: kshiftrw $10, %k2, %k2 +; AVX512DQ-NEXT: korw %k7, %k2, %k2 ; AVX512DQ-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQ-NEXT: kmovw %eax, %k5 -; AVX512DQ-NEXT: kxorw %k5, %k4, %k4 -; AVX512DQ-NEXT: kshiftlw $15, %k4, %k4 -; AVX512DQ-NEXT: kshiftrw $6, %k4, %k4 -; AVX512DQ-NEXT: kxorw %k4, %k1, %k1 -; AVX512DQ-NEXT: kshiftrw $10, %k1, %k4 +; AVX512DQ-NEXT: kmovw %eax, %k7 +; AVX512DQ-NEXT: kshiftlw $7, %k7, %k7 +; AVX512DQ-NEXT: korw %k7, %k1, %k7 +; AVX512DQ-NEXT: kshiftlw $9, %k2, %k2 +; AVX512DQ-NEXT: kshiftrw $9, %k2, %k2 +; AVX512DQ-NEXT: korw %k7, %k2, %k2 ; AVX512DQ-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQ-NEXT: kmovw %eax, %k5 -; AVX512DQ-NEXT: kxorw %k5, %k4, %k4 -; AVX512DQ-NEXT: kshiftlw $15, %k4, %k4 -; AVX512DQ-NEXT: kshiftrw $5, %k4, %k4 -; AVX512DQ-NEXT: kxorw %k4, %k1, %k1 -; AVX512DQ-NEXT: kshiftrw $11, %k1, %k4 +; AVX512DQ-NEXT: kmovw %eax, %k7 +; AVX512DQ-NEXT: kshiftlw $8, %k7, %k7 +; AVX512DQ-NEXT: korw %k7, %k3, %k7 +; AVX512DQ-NEXT: kshiftlw $8, %k2, %k2 +; AVX512DQ-NEXT: kshiftrw $8, %k2, %k2 +; AVX512DQ-NEXT: korw %k7, %k2, %k2 ; AVX512DQ-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQ-NEXT: kmovw %eax, %k5 -; AVX512DQ-NEXT: kxorw %k5, %k4, %k4 -; AVX512DQ-NEXT: kshiftlw $15, %k4, %k4 -; AVX512DQ-NEXT: kshiftrw $4, %k4, %k4 -; AVX512DQ-NEXT: kxorw %k4, %k1, %k1 -; AVX512DQ-NEXT: kshiftrw $12, %k1, %k4 +; AVX512DQ-NEXT: kmovw %eax, %k7 +; AVX512DQ-NEXT: 
kshiftlw $9, %k7, %k7 +; AVX512DQ-NEXT: korw %k7, %k4, %k7 +; AVX512DQ-NEXT: kshiftlw $7, %k2, %k2 +; AVX512DQ-NEXT: kshiftrw $7, %k2, %k2 +; AVX512DQ-NEXT: korw %k7, %k2, %k2 ; AVX512DQ-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQ-NEXT: kmovw %eax, %k5 -; AVX512DQ-NEXT: kxorw %k5, %k4, %k4 -; AVX512DQ-NEXT: kshiftlw $15, %k4, %k4 -; AVX512DQ-NEXT: kshiftrw $3, %k4, %k4 -; AVX512DQ-NEXT: kxorw %k4, %k1, %k1 -; AVX512DQ-NEXT: kshiftrw $13, %k1, %k4 +; AVX512DQ-NEXT: kmovw %eax, %k7 +; AVX512DQ-NEXT: kshiftlw $10, %k7, %k7 +; AVX512DQ-NEXT: korw %k7, %k5, %k6 +; AVX512DQ-NEXT: kshiftlw $6, %k2, %k2 +; AVX512DQ-NEXT: kshiftrw $6, %k2, %k2 +; AVX512DQ-NEXT: korw %k6, %k2, %k2 ; AVX512DQ-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQ-NEXT: kmovw %eax, %k5 -; AVX512DQ-NEXT: kxorw %k5, %k4, %k4 -; AVX512DQ-NEXT: kshiftlw $15, %k4, %k4 -; AVX512DQ-NEXT: kshiftrw $2, %k4, %k4 -; AVX512DQ-NEXT: kxorw %k4, %k1, %k1 -; AVX512DQ-NEXT: kshiftrw $14, %k1, %k4 +; AVX512DQ-NEXT: kmovw %eax, %k6 +; AVX512DQ-NEXT: kshiftlw $11, %k6, %k6 +; AVX512DQ-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 ## 2-byte Reload +; AVX512DQ-NEXT: korw %k6, %k1, %k5 +; AVX512DQ-NEXT: kshiftlw $5, %k2, %k2 +; AVX512DQ-NEXT: kshiftrw $5, %k2, %k2 +; AVX512DQ-NEXT: korw %k5, %k2, %k2 ; AVX512DQ-NEXT: movb {{[0-9]+}}(%rsp), %al ; AVX512DQ-NEXT: kmovw %eax, %k5 -; AVX512DQ-NEXT: kxorw %k5, %k4, %k4 -; AVX512DQ-NEXT: kshiftlw $14, %k4, %k4 -; AVX512DQ-NEXT: kxorw %k4, %k1, %k1 +; AVX512DQ-NEXT: kshiftlw $12, %k5, %k5 +; AVX512DQ-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 ## 2-byte Reload +; AVX512DQ-NEXT: korw %k5, %k1, %k4 +; AVX512DQ-NEXT: kshiftlw $4, %k2, %k2 +; AVX512DQ-NEXT: kshiftrw $4, %k2, %k2 +; AVX512DQ-NEXT: korw %k4, %k2, %k2 +; AVX512DQ-NEXT: movb {{[0-9]+}}(%rsp), %al +; AVX512DQ-NEXT: kmovw %eax, %k4 +; AVX512DQ-NEXT: kshiftlw $13, %k4, %k4 +; AVX512DQ-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 ## 2-byte Reload +; AVX512DQ-NEXT: korw %k4, %k1, %k3 +; AVX512DQ-NEXT: kshiftlw $3, %k2, %k2 +; 
AVX512DQ-NEXT: kshiftrw $3, %k2, %k2 +; AVX512DQ-NEXT: korw %k3, %k2, %k2 +; AVX512DQ-NEXT: movb {{[0-9]+}}(%rsp), %al +; AVX512DQ-NEXT: kmovw %eax, %k3 +; AVX512DQ-NEXT: kshiftlw $14, %k3, %k3 +; AVX512DQ-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 ## 2-byte Reload +; AVX512DQ-NEXT: korw %k3, %k1, %k1 +; AVX512DQ-NEXT: kshiftlw $2, %k2, %k2 +; AVX512DQ-NEXT: kshiftrw $2, %k2, %k2 +; AVX512DQ-NEXT: korw %k1, %k2, %k1 ; AVX512DQ-NEXT: kshiftlw $1, %k1, %k1 ; AVX512DQ-NEXT: kshiftrw $1, %k1, %k1 ; AVX512DQ-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQ-NEXT: kmovw %eax, %k4 -; AVX512DQ-NEXT: kshiftlw $15, %k4, %k4 -; AVX512DQ-NEXT: korw %k4, %k1, %k1 +; AVX512DQ-NEXT: kmovw %eax, %k2 +; AVX512DQ-NEXT: kshiftlw $15, %k2, %k2 +; AVX512DQ-NEXT: korw %k2, %k1, %k1 ; AVX512DQ-NEXT: kmovw %k1, 6(%rdi) -; AVX512DQ-NEXT: kmovw %k3, 4(%rdi) -; AVX512DQ-NEXT: kmovw %k2, 2(%rdi) +; AVX512DQ-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 ## 2-byte Reload +; AVX512DQ-NEXT: kmovw %k0, 4(%rdi) +; AVX512DQ-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 ## 2-byte Reload +; AVX512DQ-NEXT: kmovw %k0, 2(%rdi) +; AVX512DQ-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 ## 2-byte Reload ; AVX512DQ-NEXT: kmovw %k0, (%rdi) ; AVX512DQ-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/masked_store.ll b/llvm/test/CodeGen/X86/masked_store.ll index 8c7232c..164826d 100644 --- a/llvm/test/CodeGen/X86/masked_store.ll +++ b/llvm/test/CodeGen/X86/masked_store.ll @@ -4913,24 +4913,30 @@ define void @widen_masked_store(<3 x i32> %v, <3 x i32>* %p, <3 x i1> %mask) { ; AVX512F-LABEL: widen_masked_store: ; AVX512F: ## %bb.0: ; AVX512F-NEXT: ## kill: def $xmm0 killed $xmm0 def $zmm0 -; AVX512F-NEXT: kmovw %edx, %k0 ; AVX512F-NEXT: andl $1, %esi -; AVX512F-NEXT: kmovw %esi, %k1 -; AVX512F-NEXT: kxorw %k0, %k0, %k2 -; AVX512F-NEXT: kshiftrw $1, %k2, %k2 -; AVX512F-NEXT: kshiftlw $1, %k2, %k2 -; AVX512F-NEXT: korw %k1, %k2, %k1 -; AVX512F-NEXT: kshiftrw $1, %k1, %k2 -; AVX512F-NEXT: kxorw %k0, %k2, %k0 +; AVX512F-NEXT: kmovw 
%esi, %k0 +; AVX512F-NEXT: kxorw %k0, %k0, %k1 +; AVX512F-NEXT: kshiftrw $1, %k1, %k1 +; AVX512F-NEXT: kshiftlw $1, %k1, %k1 +; AVX512F-NEXT: korw %k0, %k1, %k0 +; AVX512F-NEXT: kshiftrw $2, %k0, %k1 +; AVX512F-NEXT: kshiftlw $2, %k1, %k1 ; AVX512F-NEXT: kshiftlw $15, %k0, %k0 +; AVX512F-NEXT: kshiftrw $15, %k0, %k0 +; AVX512F-NEXT: kmovw %edx, %k2 +; AVX512F-NEXT: kshiftlw $15, %k2, %k2 +; AVX512F-NEXT: kshiftrw $14, %k2, %k2 +; AVX512F-NEXT: korw %k2, %k1, %k1 +; AVX512F-NEXT: korw %k1, %k0, %k0 +; AVX512F-NEXT: kshiftrw $3, %k0, %k1 +; AVX512F-NEXT: kshiftlw $3, %k1, %k1 +; AVX512F-NEXT: kshiftlw $14, %k0, %k0 ; AVX512F-NEXT: kshiftrw $14, %k0, %k0 -; AVX512F-NEXT: kxorw %k0, %k1, %k0 -; AVX512F-NEXT: kshiftrw $2, %k0, %k1 -; AVX512F-NEXT: kmovw %ecx, %k2 -; AVX512F-NEXT: kxorw %k2, %k1, %k1 +; AVX512F-NEXT: korw %k1, %k0, %k0 +; AVX512F-NEXT: kmovw %ecx, %k1 ; AVX512F-NEXT: kshiftlw $15, %k1, %k1 ; AVX512F-NEXT: kshiftrw $13, %k1, %k1 -; AVX512F-NEXT: kxorw %k1, %k0, %k0 +; AVX512F-NEXT: korw %k0, %k1, %k0 ; AVX512F-NEXT: kshiftlw $12, %k0, %k0 ; AVX512F-NEXT: kshiftrw $12, %k0, %k1 ; AVX512F-NEXT: vmovdqu32 %zmm0, (%rdi) {%k1} @@ -4939,48 +4945,60 @@ define void @widen_masked_store(<3 x i32> %v, <3 x i32>* %p, <3 x i1> %mask) { ; ; AVX512VLDQ-LABEL: widen_masked_store: ; AVX512VLDQ: ## %bb.0: -; AVX512VLDQ-NEXT: kmovw %edx, %k0 -; AVX512VLDQ-NEXT: kmovw %esi, %k1 -; AVX512VLDQ-NEXT: kshiftlb $7, %k1, %k1 -; AVX512VLDQ-NEXT: kshiftrb $7, %k1, %k1 -; AVX512VLDQ-NEXT: kxorw %k0, %k0, %k2 -; AVX512VLDQ-NEXT: kshiftrb $1, %k2, %k2 -; AVX512VLDQ-NEXT: kshiftlb $1, %k2, %k2 -; AVX512VLDQ-NEXT: korb %k1, %k2, %k1 -; AVX512VLDQ-NEXT: kshiftrb $1, %k1, %k2 -; AVX512VLDQ-NEXT: kxorb %k0, %k2, %k0 +; AVX512VLDQ-NEXT: kmovw %esi, %k0 ; AVX512VLDQ-NEXT: kshiftlb $7, %k0, %k0 -; AVX512VLDQ-NEXT: kshiftrb $6, %k0, %k0 -; AVX512VLDQ-NEXT: kxorb %k0, %k1, %k0 +; AVX512VLDQ-NEXT: kshiftrb $7, %k0, %k0 +; AVX512VLDQ-NEXT: kxorw %k0, %k0, %k1 +; AVX512VLDQ-NEXT: kshiftrb $1, %k1, 
%k1 +; AVX512VLDQ-NEXT: kshiftlb $1, %k1, %k1 +; AVX512VLDQ-NEXT: korb %k0, %k1, %k0 ; AVX512VLDQ-NEXT: kshiftrb $2, %k0, %k1 -; AVX512VLDQ-NEXT: kmovw %ecx, %k2 -; AVX512VLDQ-NEXT: kxorb %k2, %k1, %k1 +; AVX512VLDQ-NEXT: kshiftlb $2, %k1, %k1 +; AVX512VLDQ-NEXT: kshiftlb $7, %k0, %k0 +; AVX512VLDQ-NEXT: kshiftrb $7, %k0, %k0 +; AVX512VLDQ-NEXT: kmovw %edx, %k2 +; AVX512VLDQ-NEXT: kshiftlb $7, %k2, %k2 +; AVX512VLDQ-NEXT: kshiftrb $6, %k2, %k2 +; AVX512VLDQ-NEXT: korb %k2, %k1, %k1 +; AVX512VLDQ-NEXT: korb %k1, %k0, %k0 +; AVX512VLDQ-NEXT: kshiftrb $3, %k0, %k1 +; AVX512VLDQ-NEXT: kshiftlb $3, %k1, %k1 +; AVX512VLDQ-NEXT: kshiftlb $6, %k0, %k0 +; AVX512VLDQ-NEXT: kshiftrb $6, %k0, %k0 +; AVX512VLDQ-NEXT: korw %k1, %k0, %k0 +; AVX512VLDQ-NEXT: kmovw %ecx, %k1 ; AVX512VLDQ-NEXT: kshiftlb $7, %k1, %k1 ; AVX512VLDQ-NEXT: kshiftrb $5, %k1, %k1 -; AVX512VLDQ-NEXT: kxorw %k1, %k0, %k1 +; AVX512VLDQ-NEXT: korw %k0, %k1, %k1 ; AVX512VLDQ-NEXT: vmovdqa32 %xmm0, (%rdi) {%k1} ; AVX512VLDQ-NEXT: retq ; ; AVX512VLBW-LABEL: widen_masked_store: ; AVX512VLBW: ## %bb.0: -; AVX512VLBW-NEXT: kmovd %edx, %k0 ; AVX512VLBW-NEXT: andl $1, %esi -; AVX512VLBW-NEXT: kmovw %esi, %k1 -; AVX512VLBW-NEXT: kxorw %k0, %k0, %k2 -; AVX512VLBW-NEXT: kshiftrw $1, %k2, %k2 -; AVX512VLBW-NEXT: kshiftlw $1, %k2, %k2 -; AVX512VLBW-NEXT: korw %k1, %k2, %k1 -; AVX512VLBW-NEXT: kshiftrw $1, %k1, %k2 -; AVX512VLBW-NEXT: kxorw %k0, %k2, %k0 +; AVX512VLBW-NEXT: kmovw %esi, %k0 +; AVX512VLBW-NEXT: kxorw %k0, %k0, %k1 +; AVX512VLBW-NEXT: kshiftrw $1, %k1, %k1 +; AVX512VLBW-NEXT: kshiftlw $1, %k1, %k1 +; AVX512VLBW-NEXT: korw %k0, %k1, %k0 +; AVX512VLBW-NEXT: kshiftrw $2, %k0, %k1 +; AVX512VLBW-NEXT: kshiftlw $2, %k1, %k1 ; AVX512VLBW-NEXT: kshiftlw $15, %k0, %k0 +; AVX512VLBW-NEXT: kshiftrw $15, %k0, %k0 +; AVX512VLBW-NEXT: kmovd %edx, %k2 +; AVX512VLBW-NEXT: kshiftlw $15, %k2, %k2 +; AVX512VLBW-NEXT: kshiftrw $14, %k2, %k2 +; AVX512VLBW-NEXT: korw %k2, %k1, %k1 +; AVX512VLBW-NEXT: korw %k1, %k0, %k0 +; 
AVX512VLBW-NEXT: kshiftrw $3, %k0, %k1 +; AVX512VLBW-NEXT: kshiftlw $3, %k1, %k1 +; AVX512VLBW-NEXT: kshiftlw $14, %k0, %k0 ; AVX512VLBW-NEXT: kshiftrw $14, %k0, %k0 -; AVX512VLBW-NEXT: kxorw %k0, %k1, %k0 -; AVX512VLBW-NEXT: kshiftrw $2, %k0, %k1 -; AVX512VLBW-NEXT: kmovd %ecx, %k2 -; AVX512VLBW-NEXT: kxorw %k2, %k1, %k1 +; AVX512VLBW-NEXT: korw %k1, %k0, %k0 +; AVX512VLBW-NEXT: kmovd %ecx, %k1 ; AVX512VLBW-NEXT: kshiftlw $15, %k1, %k1 ; AVX512VLBW-NEXT: kshiftrw $13, %k1, %k1 -; AVX512VLBW-NEXT: kxorw %k1, %k0, %k1 +; AVX512VLBW-NEXT: korw %k0, %k1, %k1 ; AVX512VLBW-NEXT: vmovdqa32 %xmm0, (%rdi) {%k1} ; AVX512VLBW-NEXT: retq call void @llvm.masked.store.v3i32.p0v3i32(<3 x i32> %v, <3 x i32>* %p, i32 16, <3 x i1> %mask) diff --git a/llvm/test/CodeGen/X86/vec_smulo.ll b/llvm/test/CodeGen/X86/vec_smulo.ll index b809e55..436e487 100644 --- a/llvm/test/CodeGen/X86/vec_smulo.ll +++ b/llvm/test/CodeGen/X86/vec_smulo.ll @@ -1730,20 +1730,26 @@ define <2 x i32> @smulo_v2i64(<2 x i64> %a0, <2 x i64> %a1, <2 x i64>* %p2) noun ; ; AVX512-LABEL: smulo_v2i64: ; AVX512: # %bb.0: -; AVX512-NEXT: vmovq %xmm1, %rax -; AVX512-NEXT: vmovq %xmm0, %rcx -; AVX512-NEXT: vpextrq $1, %xmm1, %rdx -; AVX512-NEXT: vpextrq $1, %xmm0, %rsi +; AVX512-NEXT: vpextrq $1, %xmm1, %rax +; AVX512-NEXT: vpextrq $1, %xmm0, %rcx +; AVX512-NEXT: vmovq %xmm1, %rdx +; AVX512-NEXT: vmovq %xmm0, %rsi ; AVX512-NEXT: imulq %rdx, %rsi -; AVX512-NEXT: vmovq %rsi, %xmm0 +; AVX512-NEXT: seto %dl ; AVX512-NEXT: imulq %rax, %rcx -; AVX512-NEXT: vmovq %rcx, %xmm1 +; AVX512-NEXT: vmovq %rcx, %xmm0 +; AVX512-NEXT: vmovq %rsi, %xmm1 ; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; AVX512-NEXT: seto %al ; AVX512-NEXT: kmovd %eax, %k0 -; AVX512-NEXT: kshiftlw $15, %k0, %k1 -; AVX512-NEXT: kshiftrw $14, %k1, %k1 -; AVX512-NEXT: kxorw %k1, %k0, %k1 +; AVX512-NEXT: kshiftlw $15, %k0, %k0 +; AVX512-NEXT: kshiftrw $14, %k0, %k0 +; AVX512-NEXT: kmovd %edx, %k1 +; AVX512-NEXT: kshiftlw $15, %k1, %k1 +; AVX512-NEXT: 
kshiftrw $15, %k1, %k1 +; AVX512-NEXT: kshiftlw $2, %k0, %k2 +; AVX512-NEXT: korw %k2, %k1, %k1 +; AVX512-NEXT: korw %k1, %k0, %k1 ; AVX512-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 ; AVX512-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z} ; AVX512-NEXT: vmovdqa %xmm1, (%rdi) @@ -2197,46 +2203,76 @@ define <4 x i32> @smulo_v4i1(<4 x i1> %a0, <4 x i1> %a1, <4 x i1>* %p2) nounwind ; ; AVX512-LABEL: smulo_v4i1: ; AVX512: # %bb.0: -; AVX512-NEXT: vpslld $31, %xmm0, %xmm0 -; AVX512-NEXT: vptestmd %xmm0, %xmm0, %k0 +; AVX512-NEXT: pushq %rbx +; AVX512-NEXT: vpslld $31, %xmm1, %xmm1 +; AVX512-NEXT: vptestmd %xmm1, %xmm1, %k0 ; AVX512-NEXT: kshiftrw $3, %k0, %k1 ; AVX512-NEXT: kmovd %k1, %r9d ; AVX512-NEXT: andb $1, %r9b ; AVX512-NEXT: negb %r9b -; AVX512-NEXT: vpslld $31, %xmm1, %xmm0 +; AVX512-NEXT: vpslld $31, %xmm0, %xmm0 ; AVX512-NEXT: vptestmd %xmm0, %xmm0, %k1 ; AVX512-NEXT: kshiftrw $3, %k1, %k2 ; AVX512-NEXT: kmovd %k2, %r10d ; AVX512-NEXT: andb $1, %r10b ; AVX512-NEXT: negb %r10b ; AVX512-NEXT: kshiftrw $2, %k1, %k2 -; AVX512-NEXT: kmovd %k1, %ecx -; AVX512-NEXT: andb $1, %cl -; AVX512-NEXT: negb %cl -; AVX512-NEXT: kshiftrw $2, %k0, %k1 -; AVX512-NEXT: kmovd %k0, %esi +; AVX512-NEXT: kmovd %k2, %r11d +; AVX512-NEXT: andb $1, %r11b +; AVX512-NEXT: negb %r11b +; AVX512-NEXT: kshiftrw $2, %k0, %k2 +; AVX512-NEXT: kmovd %k2, %ebx +; AVX512-NEXT: andb $1, %bl +; AVX512-NEXT: negb %bl +; AVX512-NEXT: kshiftrw $1, %k0, %k2 +; AVX512-NEXT: kmovd %k2, %esi ; AVX512-NEXT: andb $1, %sil ; AVX512-NEXT: negb %sil -; AVX512-NEXT: kmovd %k1, %eax -; AVX512-NEXT: andb $1, %al -; AVX512-NEXT: negb %al +; AVX512-NEXT: kshiftrw $1, %k1, %k2 ; AVX512-NEXT: kmovd %k2, %edx ; AVX512-NEXT: andb $1, %dl ; AVX512-NEXT: negb %dl +; AVX512-NEXT: kmovd %k1, %eax +; AVX512-NEXT: andb $1, %al +; AVX512-NEXT: negb %al +; AVX512-NEXT: kmovd %k0, %ecx +; AVX512-NEXT: andb $1, %cl +; AVX512-NEXT: negb %cl ; AVX512-NEXT: # kill: def $al killed $al killed $eax -; AVX512-NEXT: imulb %dl +; AVX512-NEXT: imulb %cl ; 
AVX512-NEXT: movl %eax, %r8d ; AVX512-NEXT: seto %al -; AVX512-NEXT: movl %r8d, %edx -; AVX512-NEXT: andb $1, %dl -; AVX512-NEXT: negb %dl -; AVX512-NEXT: cmpb %r8b, %dl -; AVX512-NEXT: setne %dl -; AVX512-NEXT: orb %al, %dl +; AVX512-NEXT: movl %r8d, %ecx +; AVX512-NEXT: andb $1, %cl +; AVX512-NEXT: negb %cl +; AVX512-NEXT: cmpb %r8b, %cl +; AVX512-NEXT: setne %cl +; AVX512-NEXT: orb %al, %cl ; AVX512-NEXT: setne %al -; AVX512-NEXT: kmovd %eax, %k1 -; AVX512-NEXT: movl %esi, %eax -; AVX512-NEXT: imulb %cl +; AVX512-NEXT: kmovd %eax, %k0 +; AVX512-NEXT: kshiftlw $15, %k0, %k0 +; AVX512-NEXT: kshiftrw $15, %k0, %k1 +; AVX512-NEXT: kshiftlw $2, %k0, %k0 +; AVX512-NEXT: movl %edx, %eax +; AVX512-NEXT: imulb %sil +; AVX512-NEXT: movl %eax, %edx +; AVX512-NEXT: seto %al +; AVX512-NEXT: movl %edx, %ecx +; AVX512-NEXT: andb $1, %cl +; AVX512-NEXT: negb %cl +; AVX512-NEXT: cmpb %dl, %cl +; AVX512-NEXT: setne %cl +; AVX512-NEXT: orb %al, %cl +; AVX512-NEXT: setne %al +; AVX512-NEXT: kmovd %eax, %k2 +; AVX512-NEXT: kshiftlw $1, %k2, %k2 +; AVX512-NEXT: korw %k2, %k0, %k2 +; AVX512-NEXT: korw %k2, %k1, %k1 +; AVX512-NEXT: kshiftlw $14, %k1, %k1 +; AVX512-NEXT: kshiftrw $14, %k1, %k1 +; AVX512-NEXT: kshiftlw $3, %k0, %k2 +; AVX512-NEXT: movl %r11d, %eax +; AVX512-NEXT: imulb %bl ; AVX512-NEXT: movl %eax, %esi ; AVX512-NEXT: seto %al ; AVX512-NEXT: movl %esi, %ecx @@ -2246,26 +2282,22 @@ define <4 x i32> @smulo_v4i1(<4 x i1> %a0, <4 x i1> %a1, <4 x i1>* %p2) nounwind ; AVX512-NEXT: setne %cl ; AVX512-NEXT: orb %al, %cl ; AVX512-NEXT: setne %al -; AVX512-NEXT: kmovd %eax, %k2 -; AVX512-NEXT: kshiftlw $15, %k0, %k0 -; AVX512-NEXT: kshiftrw $14, %k0, %k0 -; AVX512-NEXT: kxorw %k0, %k2, %k2 -; AVX512-NEXT: kshiftrw $2, %k2, %k3 -; AVX512-NEXT: kxorw %k1, %k3, %k1 -; AVX512-NEXT: kshiftlw $2, %k1, %k1 -; AVX512-NEXT: kxorw %k1, %k2, %k1 +; AVX512-NEXT: kmovd %eax, %k3 +; AVX512-NEXT: kshiftlw $2, %k3, %k3 +; AVX512-NEXT: korw %k3, %k2, %k2 +; AVX512-NEXT: korw %k2, %k1, %k1 ; 
AVX512-NEXT: kshiftlw $13, %k1, %k1 ; AVX512-NEXT: kshiftrw $13, %k1, %k1 -; AVX512-NEXT: movl %r9d, %eax -; AVX512-NEXT: imulb %r10b +; AVX512-NEXT: movl %r10d, %eax +; AVX512-NEXT: imulb %r9b ; AVX512-NEXT: # kill: def $al killed $al def $eax ; AVX512-NEXT: seto %cl -; AVX512-NEXT: movl %eax, %edx -; AVX512-NEXT: andb $1, %dl -; AVX512-NEXT: negb %dl -; AVX512-NEXT: cmpb %al, %dl -; AVX512-NEXT: setne %dl -; AVX512-NEXT: orb %cl, %dl +; AVX512-NEXT: movl %eax, %ebx +; AVX512-NEXT: andb $1, %bl +; AVX512-NEXT: negb %bl +; AVX512-NEXT: cmpb %al, %bl +; AVX512-NEXT: setne %bl +; AVX512-NEXT: orb %cl, %bl ; AVX512-NEXT: setne %cl ; AVX512-NEXT: kmovd %ecx, %k2 ; AVX512-NEXT: kshiftlw $3, %k2, %k2 @@ -2273,21 +2305,34 @@ define <4 x i32> @smulo_v4i1(<4 x i1> %a0, <4 x i1> %a1, <4 x i1>* %p2) nounwind ; AVX512-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 ; AVX512-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z} ; AVX512-NEXT: kmovd %r8d, %k1 -; AVX512-NEXT: kmovd %esi, %k2 -; AVX512-NEXT: kxorw %k0, %k2, %k0 -; AVX512-NEXT: kshiftrw $2, %k0, %k2 -; AVX512-NEXT: kxorw %k1, %k2, %k1 ; AVX512-NEXT: kshiftlw $15, %k1, %k1 -; AVX512-NEXT: kshiftrw $13, %k1, %k1 -; AVX512-NEXT: kxorw %k1, %k0, %k0 +; AVX512-NEXT: kshiftrw $15, %k1, %k1 +; AVX512-NEXT: kmovd %edx, %k2 +; AVX512-NEXT: kshiftlw $15, %k2, %k2 +; AVX512-NEXT: kshiftrw $14, %k2, %k2 +; AVX512-NEXT: korw %k2, %k0, %k0 +; AVX512-NEXT: korw %k0, %k1, %k0 ; AVX512-NEXT: kshiftrw $3, %k0, %k1 -; AVX512-NEXT: kmovd %eax, %k2 -; AVX512-NEXT: kxorw %k2, %k1, %k1 +; AVX512-NEXT: kshiftlw $3, %k1, %k1 +; AVX512-NEXT: kshiftlw $14, %k0, %k0 +; AVX512-NEXT: kshiftrw $14, %k0, %k0 +; AVX512-NEXT: kmovd %esi, %k2 +; AVX512-NEXT: kshiftlw $15, %k2, %k2 +; AVX512-NEXT: kshiftrw $13, %k2, %k2 +; AVX512-NEXT: korw %k2, %k1, %k1 +; AVX512-NEXT: korw %k1, %k0, %k0 +; AVX512-NEXT: kshiftrw $4, %k0, %k1 +; AVX512-NEXT: kshiftlw $4, %k1, %k1 +; AVX512-NEXT: kshiftlw $13, %k0, %k0 +; AVX512-NEXT: kshiftrw $13, %k0, %k0 +; AVX512-NEXT: korw %k1, %k0, %k0 +; 
AVX512-NEXT: kmovd %eax, %k1 ; AVX512-NEXT: kshiftlw $15, %k1, %k1 ; AVX512-NEXT: kshiftrw $12, %k1, %k1 -; AVX512-NEXT: kxorw %k1, %k0, %k0 +; AVX512-NEXT: korw %k0, %k1, %k0 ; AVX512-NEXT: kmovd %k0, %eax ; AVX512-NEXT: movb %al, (%rdi) +; AVX512-NEXT: popq %rbx ; AVX512-NEXT: retq %t = call {<4 x i1>, <4 x i1>} @llvm.smul.with.overflow.v4i1(<4 x i1> %a0, <4 x i1> %a1) %val = extractvalue {<4 x i1>, <4 x i1>} %t, 0 diff --git a/llvm/test/CodeGen/X86/vec_umulo.ll b/llvm/test/CodeGen/X86/vec_umulo.ll index 07899d0..c859ce7 100644 --- a/llvm/test/CodeGen/X86/vec_umulo.ll +++ b/llvm/test/CodeGen/X86/vec_umulo.ll @@ -1532,21 +1532,28 @@ define <2 x i32> @umulo_v2i64(<2 x i64> %a0, <2 x i64> %a1, <2 x i64>* %p2) noun ; ; AVX512-LABEL: umulo_v2i64: ; AVX512: # %bb.0: -; AVX512-NEXT: vmovq %xmm0, %rcx -; AVX512-NEXT: vmovq %xmm1, %rsi -; AVX512-NEXT: vpextrq $1, %xmm0, %rax -; AVX512-NEXT: vpextrq $1, %xmm1, %rdx +; AVX512-NEXT: vpextrq $1, %xmm0, %rcx +; AVX512-NEXT: vpextrq $1, %xmm1, %r8 +; AVX512-NEXT: vmovq %xmm0, %rax +; AVX512-NEXT: vmovq %xmm1, %rdx ; AVX512-NEXT: mulq %rdx -; AVX512-NEXT: vmovq %rax, %xmm0 +; AVX512-NEXT: movq %rax, %rsi +; AVX512-NEXT: seto %r9b ; AVX512-NEXT: movq %rcx, %rax -; AVX512-NEXT: mulq %rsi -; AVX512-NEXT: vmovq %rax, %xmm1 +; AVX512-NEXT: mulq %r8 +; AVX512-NEXT: vmovq %rax, %xmm0 +; AVX512-NEXT: vmovq %rsi, %xmm1 ; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; AVX512-NEXT: seto %al ; AVX512-NEXT: kmovd %eax, %k0 -; AVX512-NEXT: kshiftlw $15, %k0, %k1 -; AVX512-NEXT: kshiftrw $14, %k1, %k1 -; AVX512-NEXT: kxorw %k1, %k0, %k1 +; AVX512-NEXT: kshiftlw $15, %k0, %k0 +; AVX512-NEXT: kshiftrw $14, %k0, %k0 +; AVX512-NEXT: kmovd %r9d, %k1 +; AVX512-NEXT: kshiftlw $15, %k1, %k1 +; AVX512-NEXT: kshiftrw $15, %k1, %k1 +; AVX512-NEXT: kshiftlw $2, %k0, %k2 +; AVX512-NEXT: korw %k2, %k1, %k1 +; AVX512-NEXT: korw %k1, %k0, %k1 ; AVX512-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 ; AVX512-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z} ; AVX512-NEXT: 
vmovdqa %xmm1, (%rdi) @@ -1945,6 +1952,7 @@ define <4 x i32> @umulo_v4i1(<4 x i1> %a0, <4 x i1> %a1, <4 x i1>* %p2) nounwind ; ; AVX512-LABEL: umulo_v4i1: ; AVX512: # %bb.0: +; AVX512-NEXT: pushq %rbx ; AVX512-NEXT: vpslld $31, %xmm0, %xmm0 ; AVX512-NEXT: vptestmd %xmm0, %xmm0, %k0 ; AVX512-NEXT: kshiftrw $3, %k0, %k1 @@ -1956,40 +1964,60 @@ define <4 x i32> @umulo_v4i1(<4 x i1> %a0, <4 x i1> %a1, <4 x i1>* %p2) nounwind ; AVX512-NEXT: kmovd %k2, %r10d ; AVX512-NEXT: andb $1, %r10b ; AVX512-NEXT: kshiftrw $2, %k0, %k2 -; AVX512-NEXT: kmovd %k0, %esi +; AVX512-NEXT: kmovd %k2, %r11d +; AVX512-NEXT: andb $1, %r11b +; AVX512-NEXT: kshiftrw $2, %k1, %k2 +; AVX512-NEXT: kmovd %k2, %ebx +; AVX512-NEXT: andb $1, %bl +; AVX512-NEXT: kshiftrw $1, %k0, %k2 +; AVX512-NEXT: kmovd %k2, %edx +; AVX512-NEXT: andb $1, %dl +; AVX512-NEXT: kshiftrw $1, %k1, %k2 +; AVX512-NEXT: kmovd %k2, %esi ; AVX512-NEXT: andb $1, %sil -; AVX512-NEXT: kshiftrw $2, %k1, %k0 +; AVX512-NEXT: kmovd %k0, %eax +; AVX512-NEXT: andb $1, %al ; AVX512-NEXT: kmovd %k1, %ecx ; AVX512-NEXT: andb $1, %cl -; AVX512-NEXT: kmovd %k2, %eax -; AVX512-NEXT: andb $1, %al -; AVX512-NEXT: kmovd %k0, %edx -; AVX512-NEXT: andb $1, %dl ; AVX512-NEXT: # kill: def $al killed $al killed $eax -; AVX512-NEXT: mulb %dl +; AVX512-NEXT: mulb %cl ; AVX512-NEXT: movl %eax, %r8d ; AVX512-NEXT: seto %al ; AVX512-NEXT: testb $-2, %r8b -; AVX512-NEXT: setne %dl -; AVX512-NEXT: orb %al, %dl +; AVX512-NEXT: setne %cl +; AVX512-NEXT: orb %al, %cl ; AVX512-NEXT: setne %al -; AVX512-NEXT: kmovd %eax, %k1 -; AVX512-NEXT: movl %esi, %eax -; AVX512-NEXT: mulb %cl +; AVX512-NEXT: kmovd %eax, %k0 +; AVX512-NEXT: kshiftlw $15, %k0, %k0 +; AVX512-NEXT: kshiftrw $15, %k0, %k1 +; AVX512-NEXT: kshiftlw $2, %k0, %k0 +; AVX512-NEXT: movl %edx, %eax +; AVX512-NEXT: mulb %sil +; AVX512-NEXT: movl %eax, %edx +; AVX512-NEXT: seto %al +; AVX512-NEXT: testb $-2, %dl +; AVX512-NEXT: setne %cl +; AVX512-NEXT: orb %al, %cl +; AVX512-NEXT: setne %al +; 
AVX512-NEXT: kmovd %eax, %k2 +; AVX512-NEXT: kshiftlw $1, %k2, %k2 +; AVX512-NEXT: korw %k2, %k0, %k2 +; AVX512-NEXT: korw %k2, %k1, %k1 +; AVX512-NEXT: kshiftlw $14, %k1, %k1 +; AVX512-NEXT: kshiftrw $14, %k1, %k1 +; AVX512-NEXT: kshiftlw $3, %k0, %k2 +; AVX512-NEXT: movl %r11d, %eax +; AVX512-NEXT: mulb %bl ; AVX512-NEXT: movl %eax, %esi ; AVX512-NEXT: seto %al ; AVX512-NEXT: testb $-2, %sil ; AVX512-NEXT: setne %cl ; AVX512-NEXT: orb %al, %cl ; AVX512-NEXT: setne %al -; AVX512-NEXT: kmovd %eax, %k2 -; AVX512-NEXT: kshiftlw $15, %k0, %k0 -; AVX512-NEXT: kshiftrw $14, %k0, %k0 -; AVX512-NEXT: kxorw %k0, %k2, %k2 -; AVX512-NEXT: kshiftrw $2, %k2, %k3 -; AVX512-NEXT: kxorw %k1, %k3, %k1 -; AVX512-NEXT: kshiftlw $2, %k1, %k1 -; AVX512-NEXT: kxorw %k1, %k2, %k1 +; AVX512-NEXT: kmovd %eax, %k3 +; AVX512-NEXT: kshiftlw $2, %k3, %k3 +; AVX512-NEXT: korw %k3, %k2, %k2 +; AVX512-NEXT: korw %k2, %k1, %k1 ; AVX512-NEXT: kshiftlw $13, %k1, %k1 ; AVX512-NEXT: kshiftrw $13, %k1, %k1 ; AVX512-NEXT: movl %r9d, %eax @@ -1997,8 +2025,8 @@ define <4 x i32> @umulo_v4i1(<4 x i1> %a0, <4 x i1> %a1, <4 x i1>* %p2) nounwind ; AVX512-NEXT: # kill: def $al killed $al def $eax ; AVX512-NEXT: seto %cl ; AVX512-NEXT: testb $-2, %al -; AVX512-NEXT: setne %dl -; AVX512-NEXT: orb %cl, %dl +; AVX512-NEXT: setne %bl +; AVX512-NEXT: orb %cl, %bl ; AVX512-NEXT: setne %cl ; AVX512-NEXT: kmovd %ecx, %k2 ; AVX512-NEXT: kshiftlw $3, %k2, %k2 @@ -2006,21 +2034,34 @@ define <4 x i32> @umulo_v4i1(<4 x i1> %a0, <4 x i1> %a1, <4 x i1>* %p2) nounwind ; AVX512-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 ; AVX512-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z} ; AVX512-NEXT: kmovd %r8d, %k1 -; AVX512-NEXT: kmovd %esi, %k2 -; AVX512-NEXT: kxorw %k0, %k2, %k0 -; AVX512-NEXT: kshiftrw $2, %k0, %k2 -; AVX512-NEXT: kxorw %k1, %k2, %k1 ; AVX512-NEXT: kshiftlw $15, %k1, %k1 -; AVX512-NEXT: kshiftrw $13, %k1, %k1 -; AVX512-NEXT: kxorw %k1, %k0, %k0 +; AVX512-NEXT: kshiftrw $15, %k1, %k1 +; AVX512-NEXT: kmovd %edx, %k2 +; AVX512-NEXT: 
kshiftlw $15, %k2, %k2 +; AVX512-NEXT: kshiftrw $14, %k2, %k2 +; AVX512-NEXT: korw %k2, %k0, %k0 +; AVX512-NEXT: korw %k0, %k1, %k0 ; AVX512-NEXT: kshiftrw $3, %k0, %k1 -; AVX512-NEXT: kmovd %eax, %k2 -; AVX512-NEXT: kxorw %k2, %k1, %k1 +; AVX512-NEXT: kshiftlw $3, %k1, %k1 +; AVX512-NEXT: kshiftlw $14, %k0, %k0 +; AVX512-NEXT: kshiftrw $14, %k0, %k0 +; AVX512-NEXT: kmovd %esi, %k2 +; AVX512-NEXT: kshiftlw $15, %k2, %k2 +; AVX512-NEXT: kshiftrw $13, %k2, %k2 +; AVX512-NEXT: korw %k2, %k1, %k1 +; AVX512-NEXT: korw %k1, %k0, %k0 +; AVX512-NEXT: kshiftrw $4, %k0, %k1 +; AVX512-NEXT: kshiftlw $4, %k1, %k1 +; AVX512-NEXT: kshiftlw $13, %k0, %k0 +; AVX512-NEXT: kshiftrw $13, %k0, %k0 +; AVX512-NEXT: korw %k1, %k0, %k0 +; AVX512-NEXT: kmovd %eax, %k1 ; AVX512-NEXT: kshiftlw $15, %k1, %k1 ; AVX512-NEXT: kshiftrw $12, %k1, %k1 -; AVX512-NEXT: kxorw %k1, %k0, %k0 +; AVX512-NEXT: korw %k0, %k1, %k0 ; AVX512-NEXT: kmovd %k0, %eax ; AVX512-NEXT: movb %al, (%rdi) +; AVX512-NEXT: popq %rbx ; AVX512-NEXT: retq %t = call {<4 x i1>, <4 x i1>} @llvm.umul.with.overflow.v4i1(<4 x i1> %a0, <4 x i1> %a1) %val = extractvalue {<4 x i1>, <4 x i1>} %t, 0