From: Craig Topper Date: Tue, 17 Sep 2019 18:02:52 +0000 (+0000) Subject: [X86] Call SimplifyDemandedVectorElts on KSHIFTL/KSHIFTR nodes during DAG combine. X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=f1ba94ade0b60470e076e27ad6dd6233441b3377;p=platform%2Fupstream%2Fllvm.git [X86] Call SimplifyDemandedVectorElts on KSHIFTL/KSHIFTR nodes during DAG combine. llvm-svn: 372154 --- diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index 55024c3..4a6aeb7 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -45107,6 +45107,20 @@ static SDValue combineExtInVec(SDNode *N, SelectionDAG &DAG, return SDValue(); } +static SDValue combineKSHIFT(SDNode *N, SelectionDAG &DAG, + TargetLowering::DAGCombinerInfo &DCI) { + EVT VT = N->getValueType(0); + + APInt KnownUndef, KnownZero; + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + APInt DemandedElts = APInt::getAllOnesValue(VT.getVectorNumElements()); + if (TLI.SimplifyDemandedVectorElts(SDValue(N, 0), DemandedElts, KnownUndef, + KnownZero, DCI)) + return SDValue(N, 0); + + return SDValue(); +} + SDValue X86TargetLowering::PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const { SelectionDAG &DAG = DCI.DAG; @@ -45247,6 +45261,8 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N, case X86ISD::PCMPGT: return combineVectorCompare(N, DAG, Subtarget); case X86ISD::PMULDQ: case X86ISD::PMULUDQ: return combinePMULDQ(N, DAG, DCI, Subtarget); + case X86ISD::KSHIFTL: + case X86ISD::KSHIFTR: return combineKSHIFT(N, DAG, DCI); } return SDValue(); diff --git a/llvm/test/CodeGen/X86/avx512-ext.ll b/llvm/test/CodeGen/X86/avx512-ext.ll index 167a822..87fee83 100644 --- a/llvm/test/CodeGen/X86/avx512-ext.ll +++ b/llvm/test/CodeGen/X86/avx512-ext.ll @@ -1886,332 +1886,311 @@ define void @extload_v8i64(<8 x i8>* %a, <8 x i64>* %res) { define <64 x i16> @test21(<64 x i16> %x , <64 x i1> %mask) nounwind readnone { ; KNL-LABEL: test21: ; KNL: # %bb.0: -; KNL-NEXT: kmovw %esi, %k0 -; KNL-NEXT: kmovw %edi, %k1 -; KNL-NEXT: kshiftrw $1, %k1, %k2 -; KNL-NEXT: kxorw %k0, %k2, %k0 +; KNL-NEXT: kmovw %edx, %k1 +; KNL-NEXT: kmovw %edi, %k2 ; KNL-NEXT: kshiftlw $15, %k0, %k0 ; KNL-NEXT: kshiftrw $14, %k0, %k0 -; KNL-NEXT: kxorw %k0, %k1, %k0 -; KNL-NEXT: kshiftrw $2, %k0, %k1 -; KNL-NEXT: kmovw %edx, %k2 -; KNL-NEXT: kxorw %k2, %k1, %k1 +; KNL-NEXT: kxorw %k0, %k2, %k2 +; KNL-NEXT: kshiftrw $2, %k2, %k3 +; KNL-NEXT: kxorw %k1, %k3, %k1 ; KNL-NEXT: kshiftlw $15, %k1, %k1 ; KNL-NEXT: kshiftrw $13, %k1, %k1 -; KNL-NEXT: kxorw %k1, %k0, %k0 -; KNL-NEXT: kshiftrw $3, %k0, %k1 -; KNL-NEXT: kmovw %ecx, %k2 -; KNL-NEXT: kxorw %k2, %k1, %k1 -; KNL-NEXT: kshiftlw $15, %k1, %k1 -; KNL-NEXT: kshiftrw $12, %k1, %k1 -; KNL-NEXT: kxorw %k1, %k0, %k0 -; KNL-NEXT: kshiftrw $4, %k0, %k1 -; KNL-NEXT: kmovw %r8d, %k2 -; KNL-NEXT: kxorw %k2, %k1, %k1 -; KNL-NEXT: kshiftlw $15, %k1, %k1 -; KNL-NEXT: kshiftrw $11, %k1, %k1 -; KNL-NEXT: kxorw %k1, %k0, %k0 -; KNL-NEXT: kshiftrw $5, %k0, %k1 -; KNL-NEXT: kmovw %r9d, %k2 -; KNL-NEXT: kxorw %k2, %k1, %k1 -; KNL-NEXT: kshiftlw $15, %k1, %k1 -; KNL-NEXT: kshiftrw $10, %k1, %k1 -; KNL-NEXT: kxorw %k1, %k0, %k0 -; KNL-NEXT: kshiftrw $6, %k0, %k1 -; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k2 -; KNL-NEXT: kxorw %k2, %k1, %k1 -; KNL-NEXT: kshiftlw $15, %k1, %k1 -; KNL-NEXT: kshiftrw $9, %k1, %k1 -; KNL-NEXT: kxorw %k1, %k0, %k0 -; KNL-NEXT: kshiftrw $7, %k0, %k1 -; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k2 -; KNL-NEXT: kxorw %k2, %k1, %k1 -; KNL-NEXT: kshiftlw $15, %k1, %k1 -; KNL-NEXT: kshiftrw $8, %k1, %k1 -; KNL-NEXT: kxorw %k1, %k0, %k0 -; KNL-NEXT: kshiftrw $8, %k0, %k1 -; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k2 -; KNL-NEXT: kxorw %k2, %k1, %k1 -; KNL-NEXT: kshiftlw $15, %k1, %k1 -; KNL-NEXT: kshiftrw $7, %k1, %k1 -; KNL-NEXT: kxorw %k1, %k0, %k0 -; KNL-NEXT: kshiftrw $9, %k0, %k1 -; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k2 -; KNL-NEXT: kxorw %k2, %k1, %k1 -; KNL-NEXT: kshiftlw $15, %k1, %k1 -; KNL-NEXT: kshiftrw $6, %k1, %k1 -; KNL-NEXT: kxorw %k1, %k0, %k0 -; KNL-NEXT: kshiftrw $10, %k0, %k1 -; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k2 -; KNL-NEXT: kxorw %k2, %k1, %k1 -; KNL-NEXT: kshiftlw $15, %k1, %k1 -; KNL-NEXT: kshiftrw $5, %k1, %k1 -; KNL-NEXT: kxorw %k1, %k0, %k0 -; KNL-NEXT: kshiftrw $11, %k0, %k1 -; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k2 -; KNL-NEXT: kxorw %k2, %k1, %k1 -; KNL-NEXT: kshiftlw $15, %k1, %k1 -; KNL-NEXT: kshiftrw $4, %k1, %k1 -; KNL-NEXT: kxorw %k1, %k0, %k0 -; KNL-NEXT: kshiftrw $12, %k0, %k1 -; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k2 -; KNL-NEXT: kxorw %k2, %k1, %k1 -; KNL-NEXT: kshiftlw $15, %k1, %k1 -; KNL-NEXT: kshiftrw $3, %k1, %k1 -; KNL-NEXT: kxorw %k1, %k0, %k0 -; KNL-NEXT: kshiftrw $13, %k0, %k1 -; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k2 -; KNL-NEXT: kxorw %k2, %k1, %k1 -; KNL-NEXT: kshiftlw $15, %k1, %k1 -; KNL-NEXT: kshiftrw $2, %k1, %k1 -; KNL-NEXT: kxorw %k1, %k0, %k0 -; KNL-NEXT: kshiftrw $14, %k0, %k1 -; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k2 -; KNL-NEXT: kxorw %k2, %k1, %k1 -; KNL-NEXT: kshiftlw $15, %k1, %k1 -; KNL-NEXT: kshiftrw $1, %k1, %k1 -; KNL-NEXT: kxorw %k1, %k0, %k0 -; KNL-NEXT: kshiftlw $1, %k0, %k0 -; KNL-NEXT: kshiftrw $1, %k0, %k0 -; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k1 -; KNL-NEXT: kshiftlw $15, %k1, %k1 -; KNL-NEXT: korw %k1, %k0, %k1 -; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k0 -; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k2 -; KNL-NEXT: kshiftrw $1, %k2, %k3 -; KNL-NEXT: kxorw %k0, %k3, %k0 -; KNL-NEXT: kshiftlw $15, %k0, %k0 -; KNL-NEXT: kshiftrw $14, %k0, %k0 -; KNL-NEXT: kxorw %k0, %k2, %k0 -; KNL-NEXT: kshiftrw $2, %k0, %k2 -; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k3 -; KNL-NEXT: kxorw %k3, %k2, %k2 -; KNL-NEXT: kshiftlw $15, %k2, %k2 -; KNL-NEXT: kshiftrw $13, %k2, %k2 -; KNL-NEXT: kxorw %k2, %k0, %k0 -; KNL-NEXT: kshiftrw $3, %k0, %k2 -; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k3 +; KNL-NEXT: kxorw %k1, %k2, %k1 +; KNL-NEXT: kshiftrw $3, %k1, %k2 +; KNL-NEXT: kmovw %ecx, %k3 ; KNL-NEXT: kxorw %k3, %k2, %k2 ; KNL-NEXT: kshiftlw $15, %k2, %k2 ; KNL-NEXT: kshiftrw $12, %k2, %k2 -; KNL-NEXT: kxorw %k2, %k0, %k0 -; KNL-NEXT: kshiftrw $4, %k0, %k2 -; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k3 +; KNL-NEXT: kxorw %k2, %k1, %k1 +; KNL-NEXT: kshiftrw $4, %k1, %k2 +; KNL-NEXT: kmovw %r8d, %k3 ; KNL-NEXT: kxorw %k3, %k2, %k2 ; KNL-NEXT: kshiftlw $15, %k2, %k2 ; KNL-NEXT: kshiftrw $11, %k2, %k2 -; KNL-NEXT: kxorw %k2, %k0, %k0 -; KNL-NEXT: kshiftrw $5, %k0, %k2 -; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k3 +; KNL-NEXT: kxorw %k2, %k1, %k1 +; KNL-NEXT: kshiftrw $5, %k1, %k2 +; KNL-NEXT: kmovw %r9d, %k3 ; KNL-NEXT: kxorw %k3, %k2, %k2 ; KNL-NEXT: kshiftlw $15, %k2, %k2 ; KNL-NEXT: kshiftrw $10, %k2, %k2 -; KNL-NEXT: kxorw %k2, %k0, %k0 -; KNL-NEXT: kshiftrw $6, %k0, %k2 +; KNL-NEXT: kxorw %k2, %k1, %k1 +; KNL-NEXT: kshiftrw $6, %k1, %k2 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al ; KNL-NEXT: kmovw %eax, %k3 ; KNL-NEXT: kxorw %k3, %k2, %k2 ; KNL-NEXT: kshiftlw $15, %k2, %k2 ; KNL-NEXT: kshiftrw $9, %k2, %k2 -; KNL-NEXT: kxorw %k2, %k0, %k0 -; KNL-NEXT: kshiftrw $7, %k0, %k2 +; KNL-NEXT: kxorw %k2, %k1, %k1 +; KNL-NEXT: kshiftrw $7, %k1, %k2 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al ; KNL-NEXT: kmovw %eax, %k3 ; KNL-NEXT: kxorw %k3, %k2, %k2 ; KNL-NEXT: kshiftlw $15, %k2, %k2 ; KNL-NEXT: kshiftrw $8, %k2, %k2 -; KNL-NEXT: kxorw %k2, %k0, %k0 -; KNL-NEXT: kshiftrw $8, %k0, %k2 +; KNL-NEXT: kxorw %k2, %k1, %k1 +; KNL-NEXT: kshiftrw $8, %k1, %k2 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al ; KNL-NEXT: kmovw %eax, %k3 ; KNL-NEXT: kxorw %k3, %k2, %k2 ; KNL-NEXT: kshiftlw $15, %k2, %k2 ; KNL-NEXT: kshiftrw $7, %k2, %k2 -; KNL-NEXT: kxorw %k2, %k0, %k0 -; KNL-NEXT: kshiftrw $9, %k0, %k2 +; KNL-NEXT: kxorw %k2, %k1, %k1 +; KNL-NEXT: kshiftrw $9, %k1, %k2 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al ; KNL-NEXT: kmovw %eax, %k3 ; KNL-NEXT: kxorw %k3, %k2, %k2 ; KNL-NEXT: kshiftlw $15, %k2, %k2 ; KNL-NEXT: kshiftrw $6, %k2, %k2 -; KNL-NEXT: kxorw %k2, %k0, %k0 -; KNL-NEXT: kshiftrw $10, %k0, %k2 +; KNL-NEXT: kxorw %k2, %k1, %k1 +; KNL-NEXT: kshiftrw $10, %k1, %k2 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al ; KNL-NEXT: kmovw %eax, %k3 ; KNL-NEXT: kxorw %k3, %k2, %k2 ; KNL-NEXT: kshiftlw $15, %k2, %k2 ; KNL-NEXT: kshiftrw $5, %k2, %k2 -; KNL-NEXT: kxorw %k2, %k0, %k0 -; KNL-NEXT: kshiftrw $11, %k0, %k2 +; KNL-NEXT: kxorw %k2, %k1, %k1 +; KNL-NEXT: kshiftrw $11, %k1, %k2 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al ; KNL-NEXT: kmovw %eax, %k3 ; KNL-NEXT: kxorw %k3, %k2, %k2 ; KNL-NEXT: kshiftlw $15, %k2, %k2 ; KNL-NEXT: kshiftrw $4, %k2, %k2 -; KNL-NEXT: kxorw %k2, %k0, %k0 -; KNL-NEXT: kshiftrw $12, %k0, %k2 +; KNL-NEXT: kxorw %k2, %k1, %k1 +; KNL-NEXT: kshiftrw $12, %k1, %k2 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al ; KNL-NEXT: kmovw %eax, %k3 ; KNL-NEXT: kxorw %k3, %k2, %k2 ; KNL-NEXT: kshiftlw $15, %k2, %k2 ; KNL-NEXT: kshiftrw $3, %k2, %k2 -; KNL-NEXT: kxorw %k2, %k0, %k0 -; KNL-NEXT: kshiftrw $13, %k0, %k2 +; KNL-NEXT: kxorw %k2, %k1, %k1 +; KNL-NEXT: kshiftrw $13, %k1, %k2 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al ; KNL-NEXT: kmovw %eax, %k3 ; KNL-NEXT: kxorw %k3, %k2, %k2 ; KNL-NEXT: kshiftlw $15, %k2, %k2 ; KNL-NEXT: kshiftrw $2, %k2, %k2 -; KNL-NEXT: kxorw %k2, %k0, %k0 -; KNL-NEXT: kshiftrw $14, %k0, %k2 +; KNL-NEXT: kxorw %k2, %k1, %k1 +; KNL-NEXT: kshiftrw $14, %k1, %k2 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al ; KNL-NEXT: kmovw %eax, %k3 ; KNL-NEXT: kxorw %k3, %k2, %k2 ; KNL-NEXT: kshiftlw $15, %k2, %k2 ; KNL-NEXT: kshiftrw $1, %k2, %k2 -; KNL-NEXT: kxorw %k2, %k0, %k0 -; KNL-NEXT: kshiftlw $1, %k0, %k0 -; KNL-NEXT: kshiftrw $1, %k0, %k0 +; KNL-NEXT: kxorw %k2, %k1, %k1 +; KNL-NEXT: kshiftlw $1, %k1, %k1 +; KNL-NEXT: kshiftrw $1, %k1, %k1 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al ; KNL-NEXT: kmovw %eax, %k2 ; KNL-NEXT: kshiftlw $15, %k2, %k2 -; KNL-NEXT: korw %k2, %k0, %k2 +; KNL-NEXT: korw %k2, %k1, %k1 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k0 +; KNL-NEXT: kmovw %eax, %k2 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al ; KNL-NEXT: kmovw %eax, %k3 -; KNL-NEXT: kshiftrw $1, %k3, %k4 -; KNL-NEXT: kxorw %k0, %k4, %k0 -; KNL-NEXT: kshiftlw $15, %k0, %k0 -; KNL-NEXT: kshiftrw $14, %k0, %k0 -; KNL-NEXT: kxorw %k0, %k3, %k0 -; KNL-NEXT: kshiftrw $2, %k0, %k3 -; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k4 -; KNL-NEXT: kxorw %k4, %k3, %k3 -; KNL-NEXT: kshiftlw $15, %k3, %k3 -; KNL-NEXT: kshiftrw $13, %k3, %k3 -; KNL-NEXT: kxorw %k3, %k0, %k0 -; KNL-NEXT: kshiftrw $3, %k0, %k3 +; KNL-NEXT: kxorw %k0, %k3, %k3 +; KNL-NEXT: kshiftrw $2, %k3, %k4 +; KNL-NEXT: kxorw %k2, %k4, %k2 +; KNL-NEXT: kshiftlw $15, %k2, %k2 +; KNL-NEXT: kshiftrw $13, %k2, %k2 +; KNL-NEXT: kxorw %k2, %k3, %k2 +; KNL-NEXT: kshiftrw $3, %k2, %k3 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al ; KNL-NEXT: kmovw %eax, %k4 ; KNL-NEXT: kxorw %k4, %k3, %k3 ; KNL-NEXT: kshiftlw $15, %k3, %k3 ; KNL-NEXT: kshiftrw $12, %k3, %k3 -; KNL-NEXT: kxorw %k3, %k0, %k0 -; KNL-NEXT: kshiftrw $4, %k0, %k3 +; KNL-NEXT: kxorw %k3, %k2, %k2 +; KNL-NEXT: kshiftrw $4, %k2, %k3 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al ; KNL-NEXT: kmovw %eax, %k4 ; KNL-NEXT: kxorw %k4, %k3, %k3 ; KNL-NEXT: kshiftlw $15, %k3, %k3 ; KNL-NEXT: kshiftrw $11, %k3, %k3 -; KNL-NEXT: kxorw %k3, %k0, %k0 -; KNL-NEXT: kshiftrw $5, %k0, %k3 +; KNL-NEXT: kxorw %k3, %k2, %k2 +; KNL-NEXT: kshiftrw $5, %k2, %k3 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al ; KNL-NEXT: kmovw %eax, %k4 ; KNL-NEXT: kxorw %k4, %k3, %k3 ; KNL-NEXT: kshiftlw $15, %k3, %k3 ; KNL-NEXT: kshiftrw $10, %k3, %k3 -; KNL-NEXT: kxorw %k3, %k0, %k0 -; KNL-NEXT: kshiftrw $6, %k0, %k3 +; KNL-NEXT: kxorw %k3, %k2, %k2 +; KNL-NEXT: kshiftrw $6, %k2, %k3 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al ; KNL-NEXT: kmovw %eax, %k4 ; KNL-NEXT: kxorw %k4, %k3, %k3 ; KNL-NEXT: kshiftlw $15, %k3, %k3 ; KNL-NEXT: kshiftrw $9, %k3, %k3 -; KNL-NEXT: kxorw %k3, %k0, %k0 -; KNL-NEXT: kshiftrw $7, %k0, %k3 +; KNL-NEXT: kxorw %k3, %k2, %k2 +; KNL-NEXT: kshiftrw $7, %k2, %k3 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al ; KNL-NEXT: kmovw %eax, %k4 ; KNL-NEXT: kxorw %k4, %k3, %k3 ; KNL-NEXT: kshiftlw $15, %k3, %k3 ; KNL-NEXT: kshiftrw $8, %k3, %k3 -; KNL-NEXT: kxorw %k3, %k0, %k0 -; KNL-NEXT: kshiftrw $8, %k0, %k3 +; KNL-NEXT: kxorw %k3, %k2, %k2 +; KNL-NEXT: kshiftrw $8, %k2, %k3 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al ; KNL-NEXT: kmovw %eax, %k4 ; KNL-NEXT: kxorw %k4, %k3, %k3 ; KNL-NEXT: kshiftlw $15, %k3, %k3 ; KNL-NEXT: kshiftrw $7, %k3, %k3 -; KNL-NEXT: kxorw %k3, %k0, %k0 -; KNL-NEXT: kshiftrw $9, %k0, %k3 +; KNL-NEXT: kxorw %k3, %k2, %k2 +; KNL-NEXT: kshiftrw $9, %k2, %k3 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al ; KNL-NEXT: kmovw %eax, %k4 ; KNL-NEXT: kxorw %k4, %k3, %k3 ; KNL-NEXT: kshiftlw $15, %k3, %k3 ; KNL-NEXT: kshiftrw $6, %k3, %k3 -; KNL-NEXT: kxorw %k3, %k0, %k0 -; KNL-NEXT: kshiftrw $10, %k0, %k3 +; KNL-NEXT: kxorw %k3, %k2, %k2 +; KNL-NEXT: kshiftrw $10, %k2, %k3 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al ; KNL-NEXT: kmovw %eax, %k4 ; KNL-NEXT: kxorw %k4, %k3, %k3 ; KNL-NEXT: kshiftlw $15, %k3, %k3 ; KNL-NEXT: kshiftrw $5, %k3, %k3 -; KNL-NEXT: kxorw %k3, %k0, %k0 -; KNL-NEXT: kshiftrw $11, %k0, %k3 +; KNL-NEXT: kxorw %k3, %k2, %k2 +; KNL-NEXT: kshiftrw $11, %k2, %k3 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al ; KNL-NEXT: kmovw %eax, %k4 ; KNL-NEXT: kxorw %k4, %k3, %k3 ; KNL-NEXT: kshiftlw $15, %k3, %k3 ; KNL-NEXT: kshiftrw $4, %k3, %k3 -; KNL-NEXT: kxorw %k3, %k0, %k0 -; KNL-NEXT: kshiftrw $12, %k0, %k3 +; KNL-NEXT: kxorw %k3, %k2, %k2 +; KNL-NEXT: kshiftrw $12, %k2, %k3 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al ; KNL-NEXT: kmovw %eax, %k4 ; KNL-NEXT: kxorw %k4, %k3, %k3 ; KNL-NEXT: kshiftlw $15, %k3, %k3 ; KNL-NEXT: kshiftrw $3, %k3, %k3 -; KNL-NEXT: kxorw %k3, %k0, %k0 -; KNL-NEXT: kshiftrw $13, %k0, %k3 +; KNL-NEXT: kxorw %k3, %k2, %k2 +; KNL-NEXT: kshiftrw $13, %k2, %k3 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al ; KNL-NEXT: kmovw %eax, %k4 ; KNL-NEXT: kxorw %k4, %k3, %k3 ; KNL-NEXT: kshiftlw $15, %k3, %k3 ; KNL-NEXT: kshiftrw $2, %k3, %k3 -; KNL-NEXT: kxorw %k3, %k0, %k0 -; KNL-NEXT: kshiftrw $14, %k0, %k3 +; KNL-NEXT: kxorw %k3, %k2, %k2 +; KNL-NEXT: kshiftrw $14, %k2, %k3 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al ; KNL-NEXT: kmovw %eax, %k4 ; KNL-NEXT: kxorw %k4, %k3, %k3 ; KNL-NEXT: kshiftlw $15, %k3, %k3 ; KNL-NEXT: kshiftrw $1, %k3, %k3 -; KNL-NEXT: kxorw %k3, %k0, %k0 -; KNL-NEXT: kshiftlw $1, %k0, %k0 -; KNL-NEXT: kshiftrw $1, %k0, %k0 +; KNL-NEXT: kxorw %k3, %k2, %k2 +; KNL-NEXT: kshiftlw $1, %k2, %k2 +; KNL-NEXT: kshiftrw $1, %k2, %k2 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al ; KNL-NEXT: kmovw %eax, %k3 ; KNL-NEXT: kshiftlw $15, %k3, %k3 -; KNL-NEXT: korw %k3, %k0, %k3 +; KNL-NEXT: korw %k3, %k2, %k2 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k0 +; KNL-NEXT: kmovw %eax, %k3 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al ; KNL-NEXT: kmovw %eax, %k4 -; KNL-NEXT: kshiftrw $1, %k4, %k5 -; KNL-NEXT: kxorw %k0, %k5, %k0 -; KNL-NEXT: kshiftlw $15, %k0, %k0 -; KNL-NEXT: kshiftrw $14, %k0, %k0 -; KNL-NEXT: kxorw %k0, %k4, %k0 -; KNL-NEXT: kshiftrw $2, %k0, %k4 +; KNL-NEXT: kxorw %k0, %k4, %k4 +; KNL-NEXT: kshiftrw $2, %k4, %k5 +; KNL-NEXT: kxorw %k3, %k5, %k3 +; KNL-NEXT: kshiftlw $15, %k3, %k3 +; KNL-NEXT: kshiftrw $13, %k3, %k3 +; KNL-NEXT: kxorw %k3, %k4, %k3 +; KNL-NEXT: kshiftrw $3, %k3, %k4 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al +; KNL-NEXT: kmovw %eax, %k5 +; KNL-NEXT: kxorw %k5, %k4, %k4 +; KNL-NEXT: kshiftlw $15, %k4, %k4 +; KNL-NEXT: kshiftrw $12, %k4, %k4 +; KNL-NEXT: kxorw %k4, %k3, %k3 +; KNL-NEXT: kshiftrw $4, %k3, %k4 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al +; KNL-NEXT: kmovw %eax, %k5 +; KNL-NEXT: kxorw %k5, %k4, %k4 +; KNL-NEXT: kshiftlw $15, %k4, %k4 +; KNL-NEXT: kshiftrw $11, %k4, %k4 +; KNL-NEXT: kxorw %k4, %k3, %k3 +; KNL-NEXT: kshiftrw $5, %k3, %k4 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al ; KNL-NEXT: kmovw %eax, %k5 ; KNL-NEXT: kxorw %k5, %k4, %k4 ; KNL-NEXT: kshiftlw $15, %k4, %k4 +; KNL-NEXT: kshiftrw $10, %k4, %k4 +; KNL-NEXT: kxorw %k4, %k3, %k3 +; KNL-NEXT: kshiftrw $6, %k3, %k4 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al +; KNL-NEXT: kmovw %eax, %k5 +; KNL-NEXT: kxorw %k5, %k4, %k4 +; KNL-NEXT: kshiftlw $15, %k4, %k4 +; KNL-NEXT: kshiftrw $9, %k4, %k4 +; KNL-NEXT: kxorw %k4, %k3, %k3 +; KNL-NEXT: kshiftrw $7, %k3, %k4 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al +; KNL-NEXT: kmovw %eax, %k5 +; KNL-NEXT: kxorw %k5, %k4, %k4 +; KNL-NEXT: kshiftlw $15, %k4, %k4 +; KNL-NEXT: kshiftrw $8, %k4, %k4 +; KNL-NEXT: kxorw %k4, %k3, %k3 +; KNL-NEXT: kshiftrw $8, %k3, %k4 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al +; KNL-NEXT: kmovw %eax, %k5 +; KNL-NEXT: kxorw %k5, %k4, %k4 +; KNL-NEXT: kshiftlw $15, %k4, %k4 +; KNL-NEXT: kshiftrw $7, %k4, %k4 +; KNL-NEXT: kxorw %k4, %k3, %k3 +; KNL-NEXT: kshiftrw $9, %k3, %k4 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al +; KNL-NEXT: kmovw %eax, %k5 +; KNL-NEXT: kxorw %k5, %k4, %k4 +; KNL-NEXT: kshiftlw $15, %k4, %k4 +; KNL-NEXT: kshiftrw $6, %k4, %k4 +; KNL-NEXT: kxorw %k4, %k3, %k3 +; KNL-NEXT: kshiftrw $10, %k3, %k4 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al +; KNL-NEXT: kmovw %eax, %k5 +; KNL-NEXT: kxorw %k5, %k4, %k4 +; KNL-NEXT: kshiftlw $15, %k4, %k4 +; KNL-NEXT: kshiftrw $5, %k4, %k4 +; KNL-NEXT: kxorw %k4, %k3, %k3 +; KNL-NEXT: kshiftrw $11, %k3, %k4 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al +; KNL-NEXT: kmovw %eax, %k5 +; KNL-NEXT: kxorw %k5, %k4, %k4 +; KNL-NEXT: kshiftlw $15, %k4, %k4 +; KNL-NEXT: kshiftrw $4, %k4, %k4 +; KNL-NEXT: kxorw %k4, %k3, %k3 +; KNL-NEXT: kshiftrw $12, %k3, %k4 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al +; KNL-NEXT: kmovw %eax, %k5 +; KNL-NEXT: kxorw %k5, %k4, %k4 +; KNL-NEXT: kshiftlw $15, %k4, %k4 +; KNL-NEXT: kshiftrw $3, %k4, %k4 +; KNL-NEXT: kxorw %k4, %k3, %k3 +; KNL-NEXT: kshiftrw $13, %k3, %k4 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al +; KNL-NEXT: kmovw %eax, %k5 +; KNL-NEXT: kxorw %k5, %k4, %k4 +; KNL-NEXT: kshiftlw $15, %k4, %k4 +; KNL-NEXT: kshiftrw $2, %k4, %k4 +; KNL-NEXT: kxorw %k4, %k3, %k3 +; KNL-NEXT: kshiftrw $14, %k3, %k4 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al +; KNL-NEXT: kmovw %eax, %k5 +; KNL-NEXT: kxorw %k5, %k4, %k4 +; KNL-NEXT: kshiftlw $15, %k4, %k4 +; KNL-NEXT: kshiftrw $1, %k4, %k4 +; KNL-NEXT: kxorw %k4, %k3, %k3 +; KNL-NEXT: kshiftlw $1, %k3, %k3 +; KNL-NEXT: kshiftrw $1, %k3, %k3 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al +; KNL-NEXT: kmovw %eax, %k4 +; KNL-NEXT: kshiftlw $15, %k4, %k4 +; KNL-NEXT: korw %k4, %k3, %k3 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al +; KNL-NEXT: kmovw %eax, %k4 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al +; KNL-NEXT: kmovw %eax, %k5 +; KNL-NEXT: kxorw %k0, %k5, %k0 +; KNL-NEXT: kshiftrw $2, %k0, %k5 +; KNL-NEXT: kxorw %k4, %k5, %k4 +; KNL-NEXT: kshiftlw $15, %k4, %k4 ; KNL-NEXT: kshiftrw $13, %k4, %k4 ; KNL-NEXT: kxorw %k4, %k0, %k0 ; KNL-NEXT: kshiftrw $3, %k0, %k4 @@ -2329,228 +2308,113 @@ define <64 x i16> @test21(<64 x i16> %x , <64 x i1> %mask) nounwind readnone { ; ; AVX512DQNOBW-LABEL: test21: ; AVX512DQNOBW: # %bb.0: -; AVX512DQNOBW-NEXT: kmovw %esi, %k0 -; AVX512DQNOBW-NEXT: kmovw %edi, %k1 -; AVX512DQNOBW-NEXT: kshiftrw $1, %k1, %k2 -; AVX512DQNOBW-NEXT: kxorw %k0, %k2, %k0 -; AVX512DQNOBW-NEXT: kshiftlw $15, %k0, %k0 -; AVX512DQNOBW-NEXT: kshiftrw $14, %k0, %k0 -; AVX512DQNOBW-NEXT: kxorw %k0, %k1, %k0 -; AVX512DQNOBW-NEXT: kshiftrw $2, %k0, %k1 -; AVX512DQNOBW-NEXT: kmovw %edx, %k2 -; AVX512DQNOBW-NEXT: kxorw %k2, %k1, %k1 -; AVX512DQNOBW-NEXT: kshiftlw $15, %k1, %k1 -; AVX512DQNOBW-NEXT: kshiftrw $13, %k1, %k1 -; AVX512DQNOBW-NEXT: kxorw %k1, %k0, %k0 -; AVX512DQNOBW-NEXT: kshiftrw $3, %k0, %k1 -; AVX512DQNOBW-NEXT: kmovw %ecx, %k2 -; AVX512DQNOBW-NEXT: kxorw %k2, %k1, %k1 -; AVX512DQNOBW-NEXT: kshiftlw $15, %k1, %k1 -; AVX512DQNOBW-NEXT: kshiftrw $12, %k1, %k1 -; AVX512DQNOBW-NEXT: kxorw %k1, %k0, %k0 -; AVX512DQNOBW-NEXT: kshiftrw $4, %k0, %k1 -; AVX512DQNOBW-NEXT: kmovw %r8d, %k2 -; AVX512DQNOBW-NEXT: kxorw %k2, %k1, %k1 -; AVX512DQNOBW-NEXT: kshiftlw $15, %k1, %k1 -; AVX512DQNOBW-NEXT: kshiftrw $11, %k1, %k1 -; AVX512DQNOBW-NEXT: kxorw %k1, %k0, %k0 -; AVX512DQNOBW-NEXT: kshiftrw $5, %k0, %k1 -; AVX512DQNOBW-NEXT: kmovw %r9d, %k2 -; AVX512DQNOBW-NEXT: kxorw %k2, %k1, %k1 -; AVX512DQNOBW-NEXT: kshiftlw $15, %k1, %k1 -; AVX512DQNOBW-NEXT: kshiftrw $10, %k1, %k1 -; AVX512DQNOBW-NEXT: kxorw %k1, %k0, %k0 -; AVX512DQNOBW-NEXT: kshiftrw $6, %k0, %k1 -; AVX512DQNOBW-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQNOBW-NEXT: kmovw %eax, %k2 -; AVX512DQNOBW-NEXT: kxorw %k2, %k1, %k1 -; AVX512DQNOBW-NEXT: kshiftlw $15, %k1, %k1 -; AVX512DQNOBW-NEXT: kshiftrw $9, %k1, %k1 -; AVX512DQNOBW-NEXT: kxorw %k1, %k0, %k0 -; AVX512DQNOBW-NEXT: kshiftrw $7, %k0, %k1 -; AVX512DQNOBW-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQNOBW-NEXT: kmovw %eax, %k2 -; AVX512DQNOBW-NEXT: kxorw %k2, %k1, %k1 -; AVX512DQNOBW-NEXT: kshiftlw $15, %k1, %k1 -; AVX512DQNOBW-NEXT: kshiftrw $8, %k1, %k1 -; AVX512DQNOBW-NEXT: kxorw %k1, %k0, %k0 -; AVX512DQNOBW-NEXT: kshiftrw $8, %k0, %k1 -; AVX512DQNOBW-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQNOBW-NEXT: kmovw %eax, %k2 -; AVX512DQNOBW-NEXT: kxorw %k2, %k1, %k1 -; AVX512DQNOBW-NEXT: kshiftlw $15, %k1, %k1 -; AVX512DQNOBW-NEXT: kshiftrw $7, %k1, %k1 -; AVX512DQNOBW-NEXT: kxorw %k1, %k0, %k0 -; AVX512DQNOBW-NEXT: kshiftrw $9, %k0, %k1 -; AVX512DQNOBW-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQNOBW-NEXT: kmovw %eax, %k2 -; AVX512DQNOBW-NEXT: kxorw %k2, %k1, %k1 -; AVX512DQNOBW-NEXT: kshiftlw $15, %k1, %k1 -; AVX512DQNOBW-NEXT: kshiftrw $6, %k1, %k1 -; AVX512DQNOBW-NEXT: kxorw %k1, %k0, %k0 -; AVX512DQNOBW-NEXT: kshiftrw $10, %k0, %k1 -; AVX512DQNOBW-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQNOBW-NEXT: kmovw %eax, %k2 -; AVX512DQNOBW-NEXT: kxorw %k2, %k1, %k1 -; AVX512DQNOBW-NEXT: kshiftlw $15, %k1, %k1 -; AVX512DQNOBW-NEXT: kshiftrw $5, %k1, %k1 -; AVX512DQNOBW-NEXT: kxorw %k1, %k0, %k0 -; AVX512DQNOBW-NEXT: kshiftrw $11, %k0, %k1 -; AVX512DQNOBW-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQNOBW-NEXT: kmovw %eax, %k2 -; AVX512DQNOBW-NEXT: kxorw %k2, %k1, %k1 -; AVX512DQNOBW-NEXT: kshiftlw $15, %k1, %k1 -; AVX512DQNOBW-NEXT: kshiftrw $4, %k1, %k1 -; AVX512DQNOBW-NEXT: kxorw %k1, %k0, %k0 -; AVX512DQNOBW-NEXT: kshiftrw $12, %k0, %k1 -; AVX512DQNOBW-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQNOBW-NEXT: kmovw %eax, %k2 -; AVX512DQNOBW-NEXT: kxorw %k2, %k1, %k1 -; AVX512DQNOBW-NEXT: kshiftlw $15, %k1, %k1 -; AVX512DQNOBW-NEXT: kshiftrw $3, %k1, %k1 -; AVX512DQNOBW-NEXT: kxorw %k1, %k0, %k0 -; AVX512DQNOBW-NEXT: kshiftrw $13, %k0, %k1 -; AVX512DQNOBW-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQNOBW-NEXT: kmovw %eax, %k2 -; AVX512DQNOBW-NEXT: kxorw %k2, %k1, %k1 -; AVX512DQNOBW-NEXT: kshiftlw $15, %k1, %k1 -; AVX512DQNOBW-NEXT: kshiftrw $2, %k1, %k1 -; AVX512DQNOBW-NEXT: kxorw %k1, %k0, %k0 -; AVX512DQNOBW-NEXT: kshiftrw $14, %k0, %k1 -; AVX512DQNOBW-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQNOBW-NEXT: kmovw %eax, %k2 -; AVX512DQNOBW-NEXT: kxorw %k2, %k1, %k1 -; AVX512DQNOBW-NEXT: kshiftlw $15, %k1, %k1 -; AVX512DQNOBW-NEXT: kshiftrw $1, %k1, %k1 -; AVX512DQNOBW-NEXT: kxorw %k1, %k0, %k0 -; AVX512DQNOBW-NEXT: kshiftlw $1, %k0, %k0 -; AVX512DQNOBW-NEXT: kshiftrw $1, %k0, %k0 -; AVX512DQNOBW-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQNOBW-NEXT: kmovw %eax, %k1 -; AVX512DQNOBW-NEXT: kshiftlw $15, %k1, %k1 -; AVX512DQNOBW-NEXT: korw %k1, %k0, %k0 -; AVX512DQNOBW-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQNOBW-NEXT: kmovw %eax, %k1 -; AVX512DQNOBW-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQNOBW-NEXT: kmovw %eax, %k2 -; AVX512DQNOBW-NEXT: kshiftrw $1, %k2, %k3 -; AVX512DQNOBW-NEXT: kxorw %k1, %k3, %k1 -; AVX512DQNOBW-NEXT: kshiftlw $15, %k1, %k1 +; AVX512DQNOBW-NEXT: kmovw %edx, %k0 +; AVX512DQNOBW-NEXT: kmovw %edi, %k2 +; AVX512DQNOBW-NEXT: kshiftlw $15, %k0, %k1 ; AVX512DQNOBW-NEXT: kshiftrw $14, %k1, %k1 -; AVX512DQNOBW-NEXT: kxorw %k1, %k2, %k1 -; AVX512DQNOBW-NEXT: kshiftrw $2, %k1, %k2 -; AVX512DQNOBW-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQNOBW-NEXT: kmovw %eax, %k3 -; AVX512DQNOBW-NEXT: kxorw %k3, %k2, %k2 -; AVX512DQNOBW-NEXT: kshiftlw $15, %k2, %k2 -; AVX512DQNOBW-NEXT: kshiftrw $13, %k2, %k2 -; AVX512DQNOBW-NEXT: kxorw %k2, %k1, %k1 -; AVX512DQNOBW-NEXT: kshiftrw $3, %k1, %k2 -; AVX512DQNOBW-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQNOBW-NEXT: kmovw %eax, %k3 +; AVX512DQNOBW-NEXT: kxorw %k1, %k2, %k2 +; AVX512DQNOBW-NEXT: kshiftrw $2, %k2, %k3 +; AVX512DQNOBW-NEXT: kxorw %k0, %k3, %k0 +; AVX512DQNOBW-NEXT: kshiftlw $15, %k0, %k0 +; AVX512DQNOBW-NEXT: kshiftrw $13, %k0, %k0 +; AVX512DQNOBW-NEXT: kxorw %k0, %k2, %k0 +; AVX512DQNOBW-NEXT: kshiftrw $3, %k0, %k2 +; AVX512DQNOBW-NEXT: kmovw %ecx, %k3 ; AVX512DQNOBW-NEXT: kxorw %k3, %k2, %k2 ; AVX512DQNOBW-NEXT: kshiftlw $15, %k2, %k2 ; AVX512DQNOBW-NEXT: kshiftrw $12, %k2, %k2 -; AVX512DQNOBW-NEXT: kxorw %k2, %k1, %k1 -; AVX512DQNOBW-NEXT: kshiftrw $4, %k1, %k2 -; AVX512DQNOBW-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQNOBW-NEXT: kmovw %eax, %k3 +; AVX512DQNOBW-NEXT: kxorw %k2, %k0, %k0 +; AVX512DQNOBW-NEXT: kshiftrw $4, %k0, %k2 +; AVX512DQNOBW-NEXT: kmovw %r8d, %k3 ; AVX512DQNOBW-NEXT: kxorw %k3, %k2, %k2 ; AVX512DQNOBW-NEXT: kshiftlw $15, %k2, %k2 ; AVX512DQNOBW-NEXT: kshiftrw $11, %k2, %k2 -; AVX512DQNOBW-NEXT: kxorw %k2, %k1, %k1 -; AVX512DQNOBW-NEXT: kshiftrw $5, %k1, %k2 -; AVX512DQNOBW-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQNOBW-NEXT: kmovw %eax, %k3 +; AVX512DQNOBW-NEXT: kxorw %k2, %k0, %k0 +; AVX512DQNOBW-NEXT: kshiftrw $5, %k0, %k2 +; AVX512DQNOBW-NEXT: kmovw %r9d, %k3 ; AVX512DQNOBW-NEXT: kxorw %k3, %k2, %k2 ; AVX512DQNOBW-NEXT: kshiftlw $15, %k2, %k2 ; AVX512DQNOBW-NEXT: kshiftrw $10, %k2, %k2 -; AVX512DQNOBW-NEXT: kxorw %k2, %k1, %k1 -; AVX512DQNOBW-NEXT: kshiftrw $6, %k1, %k2 +; AVX512DQNOBW-NEXT: kxorw %k2, %k0, %k0 +; AVX512DQNOBW-NEXT: kshiftrw $6, %k0, %k2 ; AVX512DQNOBW-NEXT: movb {{[0-9]+}}(%rsp), %al ; AVX512DQNOBW-NEXT: kmovw %eax, %k3 ; AVX512DQNOBW-NEXT: kxorw %k3, %k2, %k2 ; AVX512DQNOBW-NEXT: kshiftlw $15, %k2, %k2 ; AVX512DQNOBW-NEXT: kshiftrw $9, %k2, %k2 -; AVX512DQNOBW-NEXT: kxorw %k2, %k1, %k1 -; AVX512DQNOBW-NEXT: kshiftrw $7, %k1, %k2 +; AVX512DQNOBW-NEXT: kxorw %k2, %k0, %k0 +; AVX512DQNOBW-NEXT: kshiftrw $7, %k0, %k2 ; AVX512DQNOBW-NEXT: movb {{[0-9]+}}(%rsp), %al ; AVX512DQNOBW-NEXT: kmovw %eax, %k3 ; AVX512DQNOBW-NEXT: kxorw %k3, %k2, %k2 ; AVX512DQNOBW-NEXT: kshiftlw $15, %k2, %k2 ; AVX512DQNOBW-NEXT: kshiftrw $8, %k2, %k2 -; AVX512DQNOBW-NEXT: kxorw %k2, %k1, %k1 -; AVX512DQNOBW-NEXT: kshiftrw $8, %k1, %k2 +; AVX512DQNOBW-NEXT: kxorw %k2, %k0, %k0 +; AVX512DQNOBW-NEXT: kshiftrw $8, %k0, %k2 ; AVX512DQNOBW-NEXT: movb {{[0-9]+}}(%rsp), %al ; AVX512DQNOBW-NEXT: kmovw %eax, %k3 ; AVX512DQNOBW-NEXT: kxorw %k3, %k2, %k2 ; AVX512DQNOBW-NEXT: kshiftlw $15, %k2, %k2 ; AVX512DQNOBW-NEXT: kshiftrw $7, %k2, %k2 -; AVX512DQNOBW-NEXT: kxorw %k2, %k1, %k1 -; AVX512DQNOBW-NEXT: kshiftrw $9, %k1, %k2 +; AVX512DQNOBW-NEXT: kxorw %k2, %k0, %k0 +; AVX512DQNOBW-NEXT: kshiftrw $9, %k0, %k2 ; AVX512DQNOBW-NEXT: movb {{[0-9]+}}(%rsp), %al ; AVX512DQNOBW-NEXT: kmovw %eax, %k3 ; AVX512DQNOBW-NEXT: kxorw %k3, %k2, %k2 ; AVX512DQNOBW-NEXT: kshiftlw $15, %k2, %k2 ; AVX512DQNOBW-NEXT: kshiftrw $6, %k2, %k2 -; AVX512DQNOBW-NEXT: kxorw %k2, %k1, %k1 -; AVX512DQNOBW-NEXT: kshiftrw $10, %k1, %k2 +; AVX512DQNOBW-NEXT: kxorw %k2, %k0, %k0 +; AVX512DQNOBW-NEXT: kshiftrw $10, %k0, %k2 ; AVX512DQNOBW-NEXT: movb {{[0-9]+}}(%rsp), %al ; AVX512DQNOBW-NEXT: kmovw %eax, %k3 ; AVX512DQNOBW-NEXT: kxorw %k3, %k2, %k2 ; AVX512DQNOBW-NEXT: kshiftlw $15, %k2, %k2 ; AVX512DQNOBW-NEXT: kshiftrw $5, %k2, %k2 -; AVX512DQNOBW-NEXT: kxorw %k2, %k1, %k1 -; AVX512DQNOBW-NEXT: kshiftrw $11, %k1, %k2 +; AVX512DQNOBW-NEXT: kxorw %k2, %k0, %k0 +; AVX512DQNOBW-NEXT: kshiftrw $11, %k0, %k2 ; AVX512DQNOBW-NEXT: movb {{[0-9]+}}(%rsp), %al ; AVX512DQNOBW-NEXT: kmovw %eax, %k3 ; AVX512DQNOBW-NEXT: kxorw %k3, %k2, %k2 ; AVX512DQNOBW-NEXT: kshiftlw $15, %k2, %k2 ; AVX512DQNOBW-NEXT: kshiftrw $4, %k2, %k2 -; AVX512DQNOBW-NEXT: kxorw %k2, %k1, %k1 -; AVX512DQNOBW-NEXT: kshiftrw $12, %k1, %k2 +; AVX512DQNOBW-NEXT: kxorw %k2, %k0, %k0 +; AVX512DQNOBW-NEXT: kshiftrw $12, %k0, %k2 ; AVX512DQNOBW-NEXT: movb {{[0-9]+}}(%rsp), %al ; AVX512DQNOBW-NEXT: kmovw %eax, %k3 ; AVX512DQNOBW-NEXT: kxorw %k3, %k2, %k2 ; AVX512DQNOBW-NEXT: kshiftlw $15, %k2, %k2 ; AVX512DQNOBW-NEXT: kshiftrw $3, %k2, %k2 -; AVX512DQNOBW-NEXT: kxorw %k2, %k1, %k1 -; AVX512DQNOBW-NEXT: kshiftrw $13, %k1, %k2 +; AVX512DQNOBW-NEXT: kxorw %k2, %k0, %k0 +; AVX512DQNOBW-NEXT: kshiftrw $13, %k0, %k2 ; AVX512DQNOBW-NEXT: movb {{[0-9]+}}(%rsp), %al ; AVX512DQNOBW-NEXT: kmovw %eax, %k3 ; AVX512DQNOBW-NEXT: kxorw %k3, %k2, %k2 ; AVX512DQNOBW-NEXT: kshiftlw $15, %k2, %k2 ; AVX512DQNOBW-NEXT: kshiftrw $2, %k2, %k2 -; AVX512DQNOBW-NEXT: kxorw %k2, %k1, %k1 -; AVX512DQNOBW-NEXT: kshiftrw $14, %k1, %k2 +; AVX512DQNOBW-NEXT: kxorw %k2, %k0, %k0 +; AVX512DQNOBW-NEXT: kshiftrw $14, %k0, %k2 ; AVX512DQNOBW-NEXT: movb {{[0-9]+}}(%rsp), %al ; AVX512DQNOBW-NEXT: kmovw %eax, %k3 ; AVX512DQNOBW-NEXT: kxorw %k3, %k2, %k2 ; AVX512DQNOBW-NEXT: kshiftlw $15, %k2, %k2 ; AVX512DQNOBW-NEXT: kshiftrw $1, %k2, %k2 -; AVX512DQNOBW-NEXT: kxorw %k2, %k1, %k1 -; AVX512DQNOBW-NEXT: kshiftlw $1, %k1, %k1 -; AVX512DQNOBW-NEXT: kshiftrw $1, %k1, %k1 +; AVX512DQNOBW-NEXT: kxorw %k2, %k0, %k0 +; AVX512DQNOBW-NEXT: kshiftlw $1, %k0, %k0 +; AVX512DQNOBW-NEXT: kshiftrw $1, %k0, %k0 ; AVX512DQNOBW-NEXT: movb {{[0-9]+}}(%rsp), %al ; AVX512DQNOBW-NEXT: kmovw %eax, %k2 ; AVX512DQNOBW-NEXT: kshiftlw $15, %k2, %k2 -; AVX512DQNOBW-NEXT: korw %k2, %k1, %k1 +; AVX512DQNOBW-NEXT: korw %k2, %k0, %k0 ; AVX512DQNOBW-NEXT: movb {{[0-9]+}}(%rsp), %al ; AVX512DQNOBW-NEXT: kmovw %eax, %k2 ; AVX512DQNOBW-NEXT: movb {{[0-9]+}}(%rsp), %al ; AVX512DQNOBW-NEXT: kmovw %eax, %k3 -; AVX512DQNOBW-NEXT: kshiftrw $1, %k3, %k4 +; AVX512DQNOBW-NEXT: kxorw %k1, %k3, %k3 +; AVX512DQNOBW-NEXT: kshiftrw $2, %k3, %k4 ; AVX512DQNOBW-NEXT: kxorw %k2, %k4, %k2 ; AVX512DQNOBW-NEXT: kshiftlw $15, %k2, %k2 -; AVX512DQNOBW-NEXT: kshiftrw $14, %k2, %k2 +; AVX512DQNOBW-NEXT: kshiftrw $13, %k2, %k2 ; AVX512DQNOBW-NEXT: kxorw %k2, %k3, %k2 -; AVX512DQNOBW-NEXT: kshiftrw $2, %k2, %k3 -; AVX512DQNOBW-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQNOBW-NEXT: kmovw %eax, %k4 -; AVX512DQNOBW-NEXT: kxorw %k4, %k3, %k3 -; AVX512DQNOBW-NEXT: kshiftlw $15, %k3, %k3 -; AVX512DQNOBW-NEXT: kshiftrw $13, %k3, %k3 -; AVX512DQNOBW-NEXT: kxorw %k3, %k2, %k2 ; AVX512DQNOBW-NEXT: kshiftrw $3, %k2, %k3 ; AVX512DQNOBW-NEXT: movb {{[0-9]+}}(%rsp), %al ; AVX512DQNOBW-NEXT: kmovw %eax, %k4 @@ -2645,18 +2509,12 @@ define <64 x i16> @test21(<64 x i16> %x , <64 x i1> %mask) nounwind readnone { ; AVX512DQNOBW-NEXT: kmovw %eax, %k3 ; AVX512DQNOBW-NEXT: movb {{[0-9]+}}(%rsp), %al ; AVX512DQNOBW-NEXT: kmovw %eax, %k4 -; AVX512DQNOBW-NEXT: kshiftrw $1, %k4, %k5 +; AVX512DQNOBW-NEXT: kxorw %k1, %k4, %k4 +; AVX512DQNOBW-NEXT: kshiftrw $2, %k4, %k5 ; AVX512DQNOBW-NEXT: kxorw %k3, %k5, %k3 ; AVX512DQNOBW-NEXT: kshiftlw $15, %k3, %k3 -; AVX512DQNOBW-NEXT: kshiftrw $14, %k3, %k3 +; AVX512DQNOBW-NEXT: kshiftrw $13, %k3, %k3 ; AVX512DQNOBW-NEXT: kxorw %k3, %k4, %k3 -; AVX512DQNOBW-NEXT: kshiftrw $2, %k3, %k4 -; AVX512DQNOBW-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQNOBW-NEXT: kmovw %eax, %k5 -; AVX512DQNOBW-NEXT: kxorw %k5, %k4, %k4 -; AVX512DQNOBW-NEXT: kshiftlw $15, %k4, %k4 -; AVX512DQNOBW-NEXT: kshiftrw $13, %k4, %k4 -; AVX512DQNOBW-NEXT: kxorw %k4, %k3, %k3 ; AVX512DQNOBW-NEXT: kshiftrw $3, %k3, %k4 ; AVX512DQNOBW-NEXT: movb {{[0-9]+}}(%rsp), %al ; AVX512DQNOBW-NEXT: kmovw %eax, %k5 @@ -2747,13 +2605,113 @@ define <64 x i16> @test21(<64 x i16> %x , <64 x i1> %mask) nounwind readnone { ; AVX512DQNOBW-NEXT: kmovw %eax, %k4 ; AVX512DQNOBW-NEXT: kshiftlw $15, %k4, %k4 ; AVX512DQNOBW-NEXT: korw %k4, %k3, %k3 -; AVX512DQNOBW-NEXT: vpmovm2d %k3, %zmm4 +; AVX512DQNOBW-NEXT: movb {{[0-9]+}}(%rsp), %al +; AVX512DQNOBW-NEXT: kmovw %eax, %k4 +; AVX512DQNOBW-NEXT: movb {{[0-9]+}}(%rsp), %al +; AVX512DQNOBW-NEXT: kmovw %eax, %k5 +; AVX512DQNOBW-NEXT: kxorw %k1, %k5, %k1 +; AVX512DQNOBW-NEXT: kshiftrw $2, %k1, %k5 +; AVX512DQNOBW-NEXT: kxorw %k4, %k5, %k4 +; AVX512DQNOBW-NEXT: kshiftlw $15, %k4, %k4 +; AVX512DQNOBW-NEXT: kshiftrw $13, %k4, %k4 +; AVX512DQNOBW-NEXT: kxorw %k4, %k1, %k1 +; AVX512DQNOBW-NEXT: kshiftrw $3, %k1, %k4 +; AVX512DQNOBW-NEXT: movb {{[0-9]+}}(%rsp), %al +; AVX512DQNOBW-NEXT: kmovw %eax, %k5 +; AVX512DQNOBW-NEXT: kxorw %k5, %k4, %k4 +; AVX512DQNOBW-NEXT: kshiftlw $15, %k4, %k4 +; AVX512DQNOBW-NEXT: kshiftrw $12, %k4, %k4 +; AVX512DQNOBW-NEXT: kxorw %k4, %k1, %k1 +; AVX512DQNOBW-NEXT: kshiftrw $4, %k1, %k4 +; AVX512DQNOBW-NEXT: movb {{[0-9]+}}(%rsp), %al +; AVX512DQNOBW-NEXT: kmovw %eax, %k5 +; AVX512DQNOBW-NEXT: kxorw %k5, %k4, %k4 +; AVX512DQNOBW-NEXT: kshiftlw $15, %k4, %k4 +; AVX512DQNOBW-NEXT: kshiftrw $11, %k4, %k4 +; AVX512DQNOBW-NEXT: kxorw %k4, %k1, %k1 +; AVX512DQNOBW-NEXT: kshiftrw $5, %k1, %k4 +; AVX512DQNOBW-NEXT: movb {{[0-9]+}}(%rsp), %al +; AVX512DQNOBW-NEXT: kmovw %eax, %k5 +; AVX512DQNOBW-NEXT: kxorw %k5, %k4, %k4 +; AVX512DQNOBW-NEXT: kshiftlw $15, %k4, %k4 +; AVX512DQNOBW-NEXT: kshiftrw $10, %k4, %k4 +; AVX512DQNOBW-NEXT: kxorw %k4, %k1, %k1 +; AVX512DQNOBW-NEXT: kshiftrw $6, %k1, %k4 +; AVX512DQNOBW-NEXT: movb {{[0-9]+}}(%rsp), %al +; AVX512DQNOBW-NEXT: kmovw %eax, %k5 +; AVX512DQNOBW-NEXT: kxorw %k5, %k4, %k4 +; AVX512DQNOBW-NEXT: kshiftlw $15, %k4, %k4 +; AVX512DQNOBW-NEXT: kshiftrw $9, %k4, %k4 +; AVX512DQNOBW-NEXT: kxorw %k4, %k1, %k1 +; AVX512DQNOBW-NEXT: kshiftrw $7, %k1, %k4 +; AVX512DQNOBW-NEXT: movb {{[0-9]+}}(%rsp), %al +; AVX512DQNOBW-NEXT: kmovw %eax, %k5 +; AVX512DQNOBW-NEXT: kxorw %k5, %k4, %k4 +; AVX512DQNOBW-NEXT: kshiftlw $15, %k4, %k4 +; AVX512DQNOBW-NEXT: kshiftrw $8, %k4, %k4 +; AVX512DQNOBW-NEXT: kxorw %k4, %k1, %k1 +; AVX512DQNOBW-NEXT: kshiftrw $8, %k1, %k4 +; AVX512DQNOBW-NEXT: movb {{[0-9]+}}(%rsp), %al +; AVX512DQNOBW-NEXT: kmovw %eax, %k5 +; AVX512DQNOBW-NEXT: kxorw %k5, %k4, %k4 +; AVX512DQNOBW-NEXT: kshiftlw $15, %k4, %k4 +; AVX512DQNOBW-NEXT: kshiftrw $7, %k4, %k4 +; AVX512DQNOBW-NEXT: kxorw %k4, %k1, %k1 +; AVX512DQNOBW-NEXT: kshiftrw $9, %k1, %k4 +; AVX512DQNOBW-NEXT: movb {{[0-9]+}}(%rsp), %al +; AVX512DQNOBW-NEXT: kmovw %eax, %k5 +; AVX512DQNOBW-NEXT: kxorw %k5, %k4, %k4 +; AVX512DQNOBW-NEXT: kshiftlw $15, %k4, %k4 +; AVX512DQNOBW-NEXT: kshiftrw $6, %k4, %k4 +; AVX512DQNOBW-NEXT: kxorw %k4, %k1, %k1 +; AVX512DQNOBW-NEXT: kshiftrw $10, %k1, %k4 +; AVX512DQNOBW-NEXT: movb {{[0-9]+}}(%rsp), %al +; AVX512DQNOBW-NEXT: kmovw %eax, %k5 +; AVX512DQNOBW-NEXT: kxorw %k5, %k4, %k4 +; AVX512DQNOBW-NEXT: kshiftlw $15, %k4, %k4 +; AVX512DQNOBW-NEXT: kshiftrw $5, %k4, %k4 +; AVX512DQNOBW-NEXT: kxorw %k4, %k1, %k1 +; AVX512DQNOBW-NEXT: kshiftrw $11, %k1, %k4 +; AVX512DQNOBW-NEXT: movb {{[0-9]+}}(%rsp), %al +; AVX512DQNOBW-NEXT: kmovw %eax, %k5 +; AVX512DQNOBW-NEXT: kxorw %k5, %k4, %k4 +; AVX512DQNOBW-NEXT: kshiftlw $15, %k4, %k4 +; AVX512DQNOBW-NEXT: kshiftrw $4, %k4, %k4 +; AVX512DQNOBW-NEXT: kxorw %k4, %k1, %k1 +; AVX512DQNOBW-NEXT: kshiftrw $12, %k1, %k4 +; AVX512DQNOBW-NEXT: movb {{[0-9]+}}(%rsp), %al +; AVX512DQNOBW-NEXT: kmovw %eax, %k5 +; AVX512DQNOBW-NEXT: kxorw %k5, %k4, %k4 +; AVX512DQNOBW-NEXT: kshiftlw $15, %k4, %k4 +; AVX512DQNOBW-NEXT: kshiftrw $3, %k4, %k4 +; AVX512DQNOBW-NEXT: kxorw %k4, %k1, %k1 +; AVX512DQNOBW-NEXT: kshiftrw $13, %k1, %k4 +; AVX512DQNOBW-NEXT: movb {{[0-9]+}}(%rsp), %al +; AVX512DQNOBW-NEXT: kmovw %eax, %k5 +; AVX512DQNOBW-NEXT: kxorw %k5, %k4, %k4 +; AVX512DQNOBW-NEXT: kshiftlw $15, %k4, %k4 +; AVX512DQNOBW-NEXT: kshiftrw $2, %k4, %k4 +; AVX512DQNOBW-NEXT: kxorw %k4, %k1, %k1 +; AVX512DQNOBW-NEXT: kshiftrw $14, %k1, %k4 +; AVX512DQNOBW-NEXT: movb {{[0-9]+}}(%rsp), %al +; AVX512DQNOBW-NEXT: kmovw %eax, %k5 +; AVX512DQNOBW-NEXT: kxorw %k5, %k4, %k4 +; AVX512DQNOBW-NEXT: kshiftlw $15, %k4, %k4 +; AVX512DQNOBW-NEXT: kshiftrw $1, %k4, %k4 +; AVX512DQNOBW-NEXT: kxorw %k4, %k1, %k1 +; AVX512DQNOBW-NEXT: kshiftlw $1, %k1, %k1 +; AVX512DQNOBW-NEXT: kshiftrw $1, %k1, %k1 +; AVX512DQNOBW-NEXT: movb {{[0-9]+}}(%rsp), %al +; AVX512DQNOBW-NEXT: kmovw %eax, %k4 +; AVX512DQNOBW-NEXT: kshiftlw $15, %k4, %k4 +; AVX512DQNOBW-NEXT: korw %k4, %k1, %k1 +; AVX512DQNOBW-NEXT: vpmovm2d %k1, %zmm4 ; AVX512DQNOBW-NEXT: vpmovdw %zmm4, %ymm4 ; AVX512DQNOBW-NEXT: vpand %ymm1, %ymm4, %ymm1 -; AVX512DQNOBW-NEXT: vpmovm2d %k2, %zmm4 +; AVX512DQNOBW-NEXT: vpmovm2d %k3, %zmm4 ; AVX512DQNOBW-NEXT: vpmovdw %zmm4, %ymm4 ; AVX512DQNOBW-NEXT: vpand %ymm2, %ymm4, %ymm2 -; AVX512DQNOBW-NEXT: vpmovm2d %k1, %zmm4 +; AVX512DQNOBW-NEXT: vpmovm2d %k2, %zmm4 ; AVX512DQNOBW-NEXT: vpmovdw %zmm4, %ymm4 ; AVX512DQNOBW-NEXT: vpand %ymm3, %ymm4, %ymm3 ; AVX512DQNOBW-NEXT: vpmovm2d %k0, %zmm4 diff --git a/llvm/test/CodeGen/X86/avx512-mask-op.ll b/llvm/test/CodeGen/X86/avx512-mask-op.ll index 4b34ada..3e59e6e 100644 --- a/llvm/test/CodeGen/X86/avx512-mask-op.ll +++ b/llvm/test/CodeGen/X86/avx512-mask-op.ll @@ -2753,229 +2753,114 @@ define void @store_64i1(<64 x i1>* %a, <64 x i1> %v) { ; ; KNL-LABEL: store_64i1: ; KNL: ## %bb.0: -; KNL-NEXT: kmovw %edx, %k0 -; KNL-NEXT: kmovw %esi, %k1 -; KNL-NEXT: kshiftrw $1, %k1, %k2 -; KNL-NEXT: kxorw %k0, %k2, %k0 -; KNL-NEXT: kshiftlw $15, %k0, %k0 -; KNL-NEXT: kshiftrw $14, %k0, %k0 -; KNL-NEXT: kxorw %k0, %k1, %k0 -; KNL-NEXT: kshiftrw $2, %k0, %k1 -; KNL-NEXT: kmovw %ecx, %k2 -; KNL-NEXT: kxorw %k2, %k1, %k1 -; KNL-NEXT: kshiftlw $15, %k1, %k1 -; KNL-NEXT: kshiftrw $13, %k1, %k1 -; KNL-NEXT: kxorw %k1, %k0, %k0 -; KNL-NEXT: kshiftrw $3, %k0, %k1 -; KNL-NEXT: kmovw %r8d, %k2 -; KNL-NEXT: kxorw %k2, %k1, %k1 -; KNL-NEXT: kshiftlw $15, %k1, %k1 -; KNL-NEXT: kshiftrw $12, %k1, %k1 -; KNL-NEXT: kxorw %k1, %k0, %k0 -; KNL-NEXT: kshiftrw $4, %k0, %k1 -; KNL-NEXT: kmovw %r9d, %k2 -; KNL-NEXT: kxorw %k2, %k1, %k1 -; KNL-NEXT: kshiftlw $15, %k1, %k1 -; KNL-NEXT: kshiftrw $11, %k1, %k1 -; KNL-NEXT: kxorw %k1, %k0, %k0 -; KNL-NEXT: kshiftrw $5, %k0, %k1 -; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k2 -; KNL-NEXT: kxorw %k2, %k1, %k1 -; KNL-NEXT: kshiftlw $15, %k1, %k1 -; KNL-NEXT: kshiftrw $10, %k1, %k1 -; KNL-NEXT: kxorw %k1, %k0, %k0 -; KNL-NEXT: kshiftrw $6, %k0, %k1 -; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k2 -; KNL-NEXT: kxorw %k2, %k1, %k1 -; KNL-NEXT: kshiftlw $15, %k1, %k1 -; KNL-NEXT: kshiftrw $9, %k1, %k1 -; KNL-NEXT: kxorw %k1, %k0, %k0 -; KNL-NEXT: kshiftrw $7, %k0, %k1 -; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k2 -; KNL-NEXT: kxorw %k2, %k1, %k1 -; KNL-NEXT: kshiftlw $15, %k1, %k1 -; KNL-NEXT: kshiftrw $8, %k1, %k1 -; KNL-NEXT: kxorw %k1, %k0, %k0 -; KNL-NEXT: kshiftrw $8, %k0, %k1 -; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k2 -; KNL-NEXT: kxorw %k2, %k1, %k1 -; KNL-NEXT: kshiftlw $15, %k1, %k1 -; KNL-NEXT: kshiftrw $7, %k1, %k1 -; KNL-NEXT: kxorw %k1, %k0, %k0 -; KNL-NEXT: kshiftrw $9, %k0, %k1 -; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k2 -; KNL-NEXT: kxorw %k2, %k1, %k1 -; KNL-NEXT: kshiftlw $15, %k1, %k1 -; KNL-NEXT: kshiftrw $6, %k1, %k1 -; KNL-NEXT: kxorw %k1, %k0, %k0 -; KNL-NEXT: kshiftrw $10, %k0, %k1 -; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k2 -; KNL-NEXT: kxorw %k2, %k1, %k1 -; KNL-NEXT: kshiftlw $15, %k1, %k1 -; KNL-NEXT: kshiftrw $5, %k1, %k1 -; KNL-NEXT: kxorw %k1, %k0, %k0 -; KNL-NEXT: kshiftrw $11, %k0, %k1 -; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k2 -; KNL-NEXT: kxorw %k2, %k1, %k1 -; KNL-NEXT: kshiftlw $15, %k1, %k1 -; KNL-NEXT: kshiftrw $4, %k1, %k1 -; KNL-NEXT: kxorw %k1, %k0, %k0 -; KNL-NEXT: kshiftrw $12, %k0, %k1 -; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k2 -; KNL-NEXT: kxorw %k2, %k1, %k1 -; KNL-NEXT: kshiftlw $15, %k1, %k1 -; KNL-NEXT: kshiftrw $3, %k1, %k1 -; KNL-NEXT: kxorw %k1, %k0, %k0 -; KNL-NEXT: kshiftrw $13, %k0, %k1 -; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k2 -; KNL-NEXT: kxorw %k2, %k1, %k1 -; KNL-NEXT: kshiftlw $15, %k1, %k1 -; KNL-NEXT: kshiftrw $2, %k1, %k1 -; KNL-NEXT: kxorw %k1, %k0, %k0 -; KNL-NEXT: kshiftrw $14, %k0, %k1 -; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k2 -; KNL-NEXT: kxorw %k2, %k1, %k1 -; KNL-NEXT: kshiftlw $15, %k1, %k1 -; KNL-NEXT: kshiftrw $1, %k1, %k1 -; KNL-NEXT: kxorw %k1, %k0, %k0 -; KNL-NEXT: kshiftlw $1, %k0, %k0 -; KNL-NEXT: kshiftrw $1, %k0, %k0 -; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k1 -; KNL-NEXT: kshiftlw $15, %k1, %k1 -; KNL-NEXT: korw %k1, %k0, %k0 -; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k1 -; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k2 -; KNL-NEXT: kshiftrw $1, %k2, %k3 -; KNL-NEXT: kxorw %k1, %k3, %k1 -; KNL-NEXT: kshiftlw $15, %k1, %k1 +; KNL-NEXT: kmovw %ecx, %k0 +; KNL-NEXT: kmovw %esi, %k2 +; KNL-NEXT: kshiftlw $15, %k0, %k1 ; KNL-NEXT: kshiftrw $14, %k1, %k1 -; KNL-NEXT: kxorw %k1, %k2, %k1 -; KNL-NEXT: kshiftrw $2, %k1, %k2 -; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k3 -; KNL-NEXT: kxorw %k3, %k2, %k2 -; KNL-NEXT: kshiftlw $15, %k2, %k2 -; KNL-NEXT: kshiftrw $13, %k2, %k2 -; KNL-NEXT: kxorw %k2, %k1, %k1 -; KNL-NEXT: kshiftrw $3, %k1, %k2 -; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k3 +; KNL-NEXT: kxorw %k1, %k2, %k2 +; KNL-NEXT: kshiftrw $2, %k2, %k3 +; KNL-NEXT: kxorw %k0, %k3, %k0 +; KNL-NEXT: kshiftlw $15, %k0, %k0 +; KNL-NEXT: kshiftrw $13, %k0, %k0 +; KNL-NEXT: kxorw %k0, %k2, %k0 +; KNL-NEXT: kshiftrw $3, %k0, %k2 +; KNL-NEXT: kmovw %r8d, %k3 ; KNL-NEXT: kxorw %k3, %k2, %k2 ; KNL-NEXT: kshiftlw $15, %k2, %k2 ; KNL-NEXT: kshiftrw $12, %k2, %k2 -; KNL-NEXT: kxorw %k2, %k1, %k1 -; KNL-NEXT: kshiftrw $4, %k1, %k2 -; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k3 +; KNL-NEXT: kxorw %k2, %k0, %k0 +; KNL-NEXT: kshiftrw $4, %k0, %k2 +; KNL-NEXT: kmovw %r9d, %k3 ; KNL-NEXT: kxorw %k3, %k2, %k2 ; KNL-NEXT: kshiftlw $15, %k2, %k2 ; KNL-NEXT: kshiftrw $11, %k2, %k2 -; KNL-NEXT: kxorw %k2, %k1, %k1 -; KNL-NEXT: kshiftrw $5, %k1, %k2 +; KNL-NEXT: kxorw %k2, %k0, %k0 +; KNL-NEXT: kshiftrw $5, %k0, %k2 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al ; KNL-NEXT: kmovw %eax, %k3 ; KNL-NEXT: kxorw %k3, %k2, %k2 ; KNL-NEXT: kshiftlw $15, %k2, %k2 ; KNL-NEXT: kshiftrw $10, %k2, %k2 -; KNL-NEXT: kxorw %k2, %k1, %k1 -; KNL-NEXT: kshiftrw $6, %k1, %k2 +; KNL-NEXT: kxorw %k2, %k0, %k0 +; KNL-NEXT: kshiftrw $6, %k0, %k2 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al ; KNL-NEXT: kmovw %eax, %k3 ; KNL-NEXT: kxorw %k3, %k2, %k2 ; KNL-NEXT: kshiftlw $15, %k2, %k2 ; KNL-NEXT: kshiftrw $9, %k2, %k2 -; KNL-NEXT: kxorw %k2, %k1, %k1 -; KNL-NEXT: kshiftrw $7, %k1, %k2 +; KNL-NEXT: kxorw %k2, %k0, %k0 +; KNL-NEXT: kshiftrw $7, %k0, %k2 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al ; KNL-NEXT: kmovw %eax, %k3 ; KNL-NEXT: kxorw %k3, %k2, %k2 ; KNL-NEXT: kshiftlw $15, %k2, %k2 ; KNL-NEXT: kshiftrw $8, %k2, %k2 -; KNL-NEXT: kxorw %k2, %k1, %k1 -; KNL-NEXT: kshiftrw $8, %k1, %k2 +; KNL-NEXT: kxorw %k2, %k0, %k0 +; KNL-NEXT: kshiftrw $8, %k0, %k2 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al ; KNL-NEXT: kmovw %eax, %k3 ; KNL-NEXT: kxorw %k3, %k2, %k2 ; KNL-NEXT: kshiftlw $15, %k2, %k2 ; KNL-NEXT: kshiftrw $7, %k2, %k2 -; KNL-NEXT: kxorw %k2, %k1, %k1 -; KNL-NEXT: kshiftrw $9, %k1, %k2 +; KNL-NEXT: kxorw %k2, %k0, %k0 +; KNL-NEXT: kshiftrw $9, %k0, %k2 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al ; KNL-NEXT: kmovw %eax, %k3 ; KNL-NEXT: kxorw %k3, %k2, %k2 ; KNL-NEXT: kshiftlw $15, %k2, %k2 ; KNL-NEXT: kshiftrw $6, %k2, %k2 -; KNL-NEXT: kxorw %k2, %k1, %k1 -; KNL-NEXT: kshiftrw $10, %k1, %k2 +; KNL-NEXT: kxorw %k2, %k0, %k0 +; KNL-NEXT: kshiftrw $10, %k0, %k2 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al ; KNL-NEXT: kmovw %eax, %k3 ; KNL-NEXT: kxorw %k3, %k2, %k2 ; KNL-NEXT: kshiftlw $15, %k2, %k2 ; KNL-NEXT: kshiftrw $5, %k2, %k2 -; KNL-NEXT: kxorw %k2, %k1, %k1 -; KNL-NEXT: kshiftrw $11, %k1, %k2 +; KNL-NEXT: kxorw %k2, %k0, %k0 +; KNL-NEXT: kshiftrw $11, %k0, %k2 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al ; KNL-NEXT: kmovw %eax, %k3 ; KNL-NEXT: kxorw %k3, %k2, %k2 ; KNL-NEXT: kshiftlw $15, %k2, %k2 ; KNL-NEXT: kshiftrw $4, %k2, %k2 -; KNL-NEXT: kxorw %k2, %k1, %k1 -; KNL-NEXT: kshiftrw $12, %k1, %k2 +; KNL-NEXT: kxorw %k2, %k0, %k0 +; KNL-NEXT: kshiftrw $12, %k0, %k2 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al ; KNL-NEXT: kmovw %eax, %k3 ; KNL-NEXT: kxorw %k3, %k2, %k2 ; KNL-NEXT: kshiftlw $15, %k2, %k2 ; KNL-NEXT: kshiftrw $3, %k2, %k2 -; KNL-NEXT: kxorw %k2, %k1, %k1 -; KNL-NEXT: kshiftrw $13, %k1, %k2 +; KNL-NEXT: kxorw %k2, %k0, %k0 +; KNL-NEXT: kshiftrw $13, %k0, %k2 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al ; KNL-NEXT: kmovw %eax, %k3 ; KNL-NEXT: kxorw %k3, %k2, %k2 ; KNL-NEXT: kshiftlw $15, %k2, %k2 ; KNL-NEXT: kshiftrw $2, %k2, %k2 -; KNL-NEXT: kxorw %k2, %k1, %k1 -; KNL-NEXT: kshiftrw $14, %k1, %k2 +; KNL-NEXT: kxorw %k2, %k0, %k0 +; KNL-NEXT: kshiftrw $14, %k0, %k2 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al ; KNL-NEXT: kmovw %eax, %k3 ; KNL-NEXT: kxorw %k3, %k2, %k2 ; KNL-NEXT: kshiftlw $15, %k2, %k2 ; KNL-NEXT: kshiftrw $1, %k2, %k2 -; KNL-NEXT: kxorw %k2, %k1, %k1 -; KNL-NEXT: kshiftlw $1, %k1, %k1 -; KNL-NEXT: kshiftrw $1, %k1, %k1 +; KNL-NEXT: kxorw %k2, %k0, %k0 +; KNL-NEXT: kshiftlw $1, %k0, %k0 +; KNL-NEXT: kshiftrw $1, %k0, %k0 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al ; KNL-NEXT: kmovw %eax, %k2 ; KNL-NEXT: kshiftlw $15, %k2, %k2 -; KNL-NEXT: korw %k2, %k1, %k1 +; KNL-NEXT: korw %k2, %k0, %k0 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al ; KNL-NEXT: kmovw %eax, %k2 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al ; KNL-NEXT: kmovw %eax, %k3 -; KNL-NEXT: kshiftrw $1, %k3, %k4 +; KNL-NEXT: kxorw %k1, %k3, %k3 +; KNL-NEXT: kshiftrw $2, %k3, %k4 ; KNL-NEXT: kxorw %k2, %k4, %k2 ; KNL-NEXT: kshiftlw $15, %k2, %k2 -; KNL-NEXT: kshiftrw $14, %k2, %k2 +; KNL-NEXT: kshiftrw $13, %k2, %k2 ; KNL-NEXT: kxorw %k2, %k3, %k2 -; KNL-NEXT: kshiftrw $2, %k2, %k3 -; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k4 -; KNL-NEXT: kxorw %k4, %k3, %k3 -; KNL-NEXT: kshiftlw $15, %k3, %k3 -; KNL-NEXT: kshiftrw $13, %k3, %k3 -; KNL-NEXT: kxorw %k3, %k2, %k2 ; KNL-NEXT: kshiftrw $3, %k2, %k3 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al ; KNL-NEXT: kmovw %eax, %k4 @@ -3070,18 +2955,12 @@ define void @store_64i1(<64 x i1>* %a, <64 x i1> %v) { ; KNL-NEXT: kmovw %eax, %k3 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al ; KNL-NEXT: kmovw %eax, %k4 -; KNL-NEXT: kshiftrw $1, %k4, %k5 +; KNL-NEXT: kxorw %k1, %k4, %k4 +; KNL-NEXT: kshiftrw $2, %k4, %k5 ; KNL-NEXT: kxorw %k3, %k5, %k3 ; KNL-NEXT: kshiftlw $15, %k3, %k3 -; KNL-NEXT: kshiftrw $14, %k3, %k3 +; KNL-NEXT: kshiftrw $13, %k3, %k3 ; KNL-NEXT: kxorw %k3, %k4, %k3 -; KNL-NEXT: kshiftrw $2, %k3, %k4 -; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k5 -; KNL-NEXT: kxorw %k5, %k4, %k4 -; KNL-NEXT: kshiftlw $15, %k4, %k4 -; KNL-NEXT: kshiftrw $13, %k4, %k4 -; KNL-NEXT: kxorw %k4, %k3, %k3 ; KNL-NEXT: kshiftrw $3, %k3, %k4 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al ; KNL-NEXT: kmovw %eax, %k5 @@ -3172,9 +3051,109 @@ define void @store_64i1(<64 x i1>* %a, <64 x i1> %v) { ; KNL-NEXT: kmovw %eax, %k4 ; KNL-NEXT: kshiftlw $15, %k4, %k4 ; KNL-NEXT: korw %k4, %k3, %k3 -; KNL-NEXT: kmovw %k3, 6(%rdi) -; KNL-NEXT: kmovw %k2, 4(%rdi) -; KNL-NEXT: kmovw %k1, 2(%rdi) +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al +; KNL-NEXT: kmovw %eax, %k4 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al +; KNL-NEXT: kmovw %eax, %k5 +; KNL-NEXT: kxorw %k1, %k5, %k1 +; KNL-NEXT: kshiftrw $2, %k1, %k5 +; KNL-NEXT: kxorw %k4, %k5, %k4 +; KNL-NEXT: kshiftlw $15, %k4, %k4 +; KNL-NEXT: kshiftrw $13, %k4, %k4 +; KNL-NEXT: kxorw %k4, %k1, %k1 +; KNL-NEXT: kshiftrw $3, %k1, %k4 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al +; KNL-NEXT: kmovw %eax, %k5 +; KNL-NEXT: kxorw %k5, %k4, %k4 +; KNL-NEXT: kshiftlw $15, %k4, %k4 +; KNL-NEXT: kshiftrw $12, %k4, %k4 +; KNL-NEXT: kxorw %k4, %k1, %k1 +; KNL-NEXT: kshiftrw $4, %k1, %k4 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al +; KNL-NEXT: kmovw %eax, %k5 +; KNL-NEXT: kxorw %k5, %k4, %k4 +; KNL-NEXT: kshiftlw $15, %k4, %k4 +; KNL-NEXT: kshiftrw $11, %k4, %k4 +; KNL-NEXT: kxorw %k4, %k1, %k1 +; KNL-NEXT: kshiftrw $5, %k1, %k4 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al +; KNL-NEXT: kmovw %eax, %k5 +; KNL-NEXT: kxorw %k5, %k4, %k4 +; KNL-NEXT: kshiftlw $15, %k4, %k4 +; KNL-NEXT: kshiftrw $10, %k4, %k4 +; KNL-NEXT: kxorw %k4, %k1, %k1 +; KNL-NEXT: kshiftrw $6, %k1, %k4 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al +; KNL-NEXT: kmovw %eax, %k5 +; KNL-NEXT: kxorw %k5, %k4, %k4 +; KNL-NEXT: kshiftlw $15, %k4, %k4 +; KNL-NEXT: kshiftrw $9, %k4, %k4 +; KNL-NEXT: kxorw %k4, %k1, %k1 +; KNL-NEXT: kshiftrw $7, %k1, %k4 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al +; KNL-NEXT: kmovw %eax, %k5 +; KNL-NEXT: kxorw %k5, %k4, %k4 +; KNL-NEXT: kshiftlw $15, %k4, %k4 +; KNL-NEXT: kshiftrw $8, %k4, %k4 +; KNL-NEXT: kxorw %k4, %k1, %k1 +; KNL-NEXT: kshiftrw $8, %k1, %k4 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al +; KNL-NEXT: kmovw %eax, %k5 +; KNL-NEXT: kxorw %k5, %k4, %k4 +; KNL-NEXT: kshiftlw $15, %k4, %k4 +; KNL-NEXT: kshiftrw $7, %k4, %k4 +; KNL-NEXT: kxorw %k4, %k1, %k1 +; KNL-NEXT: kshiftrw $9, %k1, %k4 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al +; KNL-NEXT: kmovw %eax, %k5 +; KNL-NEXT: kxorw %k5, %k4, %k4 +; KNL-NEXT: kshiftlw $15, %k4, %k4 +; KNL-NEXT: kshiftrw $6, %k4, %k4 +; KNL-NEXT: kxorw %k4, %k1, %k1 +; KNL-NEXT: kshiftrw $10, %k1, %k4 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al +; KNL-NEXT: kmovw %eax, %k5 +; KNL-NEXT: kxorw %k5, %k4, %k4 +; KNL-NEXT: kshiftlw $15, %k4, %k4 +; KNL-NEXT: kshiftrw $5, %k4, %k4 +; KNL-NEXT: kxorw %k4, %k1, %k1 +; KNL-NEXT: kshiftrw $11, %k1, %k4 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al +; KNL-NEXT: kmovw %eax, %k5 +; KNL-NEXT: kxorw %k5, %k4, %k4 +; KNL-NEXT: kshiftlw $15, %k4, %k4 +; KNL-NEXT: kshiftrw $4, %k4, %k4 +; KNL-NEXT: kxorw %k4, %k1, %k1 +; KNL-NEXT: kshiftrw $12, %k1, %k4 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al +; KNL-NEXT: kmovw %eax, %k5 +; KNL-NEXT: kxorw %k5, %k4, %k4 +; KNL-NEXT: kshiftlw $15, %k4, %k4 +; KNL-NEXT: kshiftrw $3, %k4, %k4 +; KNL-NEXT: kxorw %k4, %k1, %k1 +; KNL-NEXT: kshiftrw $13, %k1, %k4 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al +; KNL-NEXT: kmovw %eax, %k5 +; KNL-NEXT: kxorw %k5, %k4, %k4 +; KNL-NEXT: kshiftlw $15, %k4, %k4 +; KNL-NEXT: kshiftrw $2, %k4, %k4 +; KNL-NEXT: kxorw %k4, %k1, %k1 +; KNL-NEXT: kshiftrw $14, %k1, %k4 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al +; KNL-NEXT: kmovw %eax, %k5 +; KNL-NEXT: kxorw %k5, %k4, %k4 +; KNL-NEXT: kshiftlw $15, %k4, %k4 +; KNL-NEXT: kshiftrw $1, %k4, %k4 +; KNL-NEXT: kxorw %k4, %k1, %k1 +; KNL-NEXT: kshiftlw $1, %k1, %k1 +; KNL-NEXT: kshiftrw $1, %k1, %k1 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al +; KNL-NEXT: kmovw %eax, %k4 +; KNL-NEXT: kshiftlw $15, %k4, %k4 +; KNL-NEXT: korw %k4, %k1, %k1 +; KNL-NEXT: kmovw %k1, 6(%rdi) +; KNL-NEXT: kmovw %k3, 4(%rdi) +; KNL-NEXT: kmovw %k2, 2(%rdi) ; KNL-NEXT: kmovw %k0, (%rdi) ; KNL-NEXT: retq ; @@ -3196,229 +3175,114 @@ define void @store_64i1(<64 x i1>* %a, <64 x i1> %v) { ; ; AVX512DQ-LABEL: store_64i1: ; AVX512DQ: ## %bb.0: -; AVX512DQ-NEXT: kmovw %edx, %k0 -; AVX512DQ-NEXT: kmovw %esi, %k1 -; AVX512DQ-NEXT: kshiftrw $1, %k1, %k2 -; AVX512DQ-NEXT: kxorw %k0, %k2, %k0 -; AVX512DQ-NEXT: kshiftlw $15, %k0, %k0 -; AVX512DQ-NEXT: kshiftrw $14, %k0, %k0 -; AVX512DQ-NEXT: kxorw %k0, %k1, %k0 -; AVX512DQ-NEXT: kshiftrw $2, %k0, %k1 -; AVX512DQ-NEXT: kmovw %ecx, %k2 -; AVX512DQ-NEXT: kxorw %k2, %k1, %k1 -; AVX512DQ-NEXT: kshiftlw $15, %k1, %k1 -; AVX512DQ-NEXT: kshiftrw $13, %k1, %k1 -; AVX512DQ-NEXT: kxorw %k1, %k0, %k0 -; AVX512DQ-NEXT: kshiftrw $3, %k0, %k1 -; AVX512DQ-NEXT: kmovw %r8d, %k2 -; AVX512DQ-NEXT: kxorw %k2, %k1, %k1 -; AVX512DQ-NEXT: kshiftlw $15, %k1, %k1 -; AVX512DQ-NEXT: kshiftrw $12, %k1, %k1 -; AVX512DQ-NEXT: kxorw %k1, %k0, %k0 -; AVX512DQ-NEXT: kshiftrw $4, %k0, %k1 -; AVX512DQ-NEXT: kmovw %r9d, %k2 -; AVX512DQ-NEXT: kxorw %k2, %k1, %k1 -; AVX512DQ-NEXT: kshiftlw $15, %k1, %k1 -; AVX512DQ-NEXT: kshiftrw $11, %k1, %k1 -; AVX512DQ-NEXT: kxorw %k1, %k0, %k0 -; AVX512DQ-NEXT: kshiftrw $5, %k0, %k1 -; AVX512DQ-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQ-NEXT: kmovw %eax, %k2 -; AVX512DQ-NEXT: kxorw %k2, %k1, %k1 -; AVX512DQ-NEXT: kshiftlw $15, %k1, %k1 -; AVX512DQ-NEXT: kshiftrw $10, %k1, %k1 -; AVX512DQ-NEXT: kxorw %k1, %k0, %k0 -; AVX512DQ-NEXT: kshiftrw $6, %k0, %k1 -; AVX512DQ-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQ-NEXT: kmovw %eax, %k2 -; AVX512DQ-NEXT: kxorw %k2, %k1, %k1 -; AVX512DQ-NEXT: kshiftlw $15, %k1, %k1 -; AVX512DQ-NEXT: kshiftrw $9, %k1, %k1 -; AVX512DQ-NEXT: kxorw %k1, %k0, %k0 -; AVX512DQ-NEXT: kshiftrw $7, %k0, %k1 -; AVX512DQ-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQ-NEXT: kmovw %eax, %k2 -; AVX512DQ-NEXT: kxorw %k2, %k1, %k1 -; AVX512DQ-NEXT: kshiftlw $15, %k1, %k1 -; AVX512DQ-NEXT: kshiftrw $8, %k1, %k1 -; AVX512DQ-NEXT: kxorw %k1, %k0, %k0 -; AVX512DQ-NEXT: kshiftrw $8, %k0, %k1 -; AVX512DQ-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQ-NEXT: kmovw %eax, %k2 -; AVX512DQ-NEXT: kxorw %k2, %k1, %k1 -; AVX512DQ-NEXT: kshiftlw $15, %k1, %k1 -; AVX512DQ-NEXT: kshiftrw $7, %k1, %k1 -; AVX512DQ-NEXT: kxorw %k1, %k0, %k0 -; AVX512DQ-NEXT: kshiftrw $9, %k0, %k1 -; AVX512DQ-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQ-NEXT: kmovw %eax, %k2 -; AVX512DQ-NEXT: kxorw %k2, %k1, %k1 -; AVX512DQ-NEXT: kshiftlw $15, %k1, %k1 -; AVX512DQ-NEXT: kshiftrw $6, %k1, %k1 -; AVX512DQ-NEXT: kxorw %k1, %k0, %k0 -; AVX512DQ-NEXT: kshiftrw $10, %k0, %k1 -; AVX512DQ-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQ-NEXT: kmovw %eax, %k2 -; AVX512DQ-NEXT: kxorw %k2, %k1, %k1 -; AVX512DQ-NEXT: kshiftlw $15, %k1, %k1 -; AVX512DQ-NEXT: kshiftrw $5, %k1, %k1 -; AVX512DQ-NEXT: kxorw %k1, %k0, %k0 -; AVX512DQ-NEXT: kshiftrw $11, %k0, %k1 -; AVX512DQ-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQ-NEXT: kmovw %eax, %k2 -; AVX512DQ-NEXT: kxorw %k2, %k1, %k1 -; AVX512DQ-NEXT: kshiftlw $15, %k1, %k1 -; AVX512DQ-NEXT: kshiftrw $4, %k1, %k1 -; AVX512DQ-NEXT: kxorw %k1, %k0, %k0 -; AVX512DQ-NEXT: kshiftrw $12, %k0, %k1 -; AVX512DQ-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQ-NEXT: kmovw %eax, %k2 -; AVX512DQ-NEXT: kxorw %k2, %k1, %k1 -; AVX512DQ-NEXT: kshiftlw $15, %k1, %k1 -; AVX512DQ-NEXT: kshiftrw $3, %k1, %k1 -; AVX512DQ-NEXT: kxorw %k1, %k0, %k0 -; AVX512DQ-NEXT: kshiftrw $13, %k0, %k1 -; AVX512DQ-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQ-NEXT: kmovw %eax, %k2 -; AVX512DQ-NEXT: kxorw %k2, %k1, %k1 -; AVX512DQ-NEXT: kshiftlw $15, %k1, %k1 -; AVX512DQ-NEXT: kshiftrw $2, %k1, %k1 -; AVX512DQ-NEXT: kxorw %k1, %k0, %k0 -; AVX512DQ-NEXT: kshiftrw $14, %k0, %k1 -; AVX512DQ-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQ-NEXT: kmovw %eax, %k2 -; AVX512DQ-NEXT: kxorw %k2, %k1, %k1 -; AVX512DQ-NEXT: kshiftlw $15, %k1, %k1 -; AVX512DQ-NEXT: kshiftrw $1, %k1, %k1 -; AVX512DQ-NEXT: kxorw %k1, %k0, %k0 -; AVX512DQ-NEXT: kshiftlw $1, %k0, %k0 -; AVX512DQ-NEXT: kshiftrw $1, %k0, %k0 -; AVX512DQ-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQ-NEXT: kmovw %eax, %k1 -; AVX512DQ-NEXT: kshiftlw $15, %k1, %k1 -; AVX512DQ-NEXT: korw %k1, %k0, %k0 -; AVX512DQ-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQ-NEXT: kmovw %eax, %k1 -; AVX512DQ-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQ-NEXT: kmovw %eax, %k2 -; AVX512DQ-NEXT: kshiftrw $1, %k2, %k3 -; AVX512DQ-NEXT: kxorw %k1, %k3, %k1 -; AVX512DQ-NEXT: kshiftlw $15, %k1, %k1 +; AVX512DQ-NEXT: kmovw %ecx, %k0 +; AVX512DQ-NEXT: kmovw %esi, %k2 +; AVX512DQ-NEXT: kshiftlw $15, %k0, %k1 ; AVX512DQ-NEXT: kshiftrw $14, %k1, %k1 -; AVX512DQ-NEXT: kxorw %k1, %k2, %k1 -; AVX512DQ-NEXT: kshiftrw $2, %k1, %k2 -; AVX512DQ-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQ-NEXT: kmovw %eax, %k3 -; AVX512DQ-NEXT: kxorw %k3, %k2, %k2 -; AVX512DQ-NEXT: kshiftlw $15, %k2, %k2 -; AVX512DQ-NEXT: kshiftrw $13, %k2, %k2 -; AVX512DQ-NEXT: kxorw %k2, %k1, %k1 -; AVX512DQ-NEXT: kshiftrw $3, %k1, %k2 -; AVX512DQ-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQ-NEXT: kmovw %eax, %k3 +; AVX512DQ-NEXT: kxorw %k1, %k2, %k2 +; AVX512DQ-NEXT: kshiftrw $2, %k2, %k3 +; AVX512DQ-NEXT: kxorw %k0, %k3, %k0 +; AVX512DQ-NEXT: kshiftlw $15, %k0, %k0 +; AVX512DQ-NEXT: kshiftrw $13, %k0, %k0 +; AVX512DQ-NEXT: kxorw %k0, %k2, %k0 +; AVX512DQ-NEXT: kshiftrw $3, %k0, %k2 +; AVX512DQ-NEXT: kmovw %r8d, %k3 ; AVX512DQ-NEXT: kxorw %k3, %k2, %k2 ; AVX512DQ-NEXT: kshiftlw $15, %k2, %k2 ; AVX512DQ-NEXT: kshiftrw $12, %k2, %k2 -; AVX512DQ-NEXT: kxorw %k2, %k1, %k1 -; AVX512DQ-NEXT: kshiftrw $4, %k1, %k2 -; AVX512DQ-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQ-NEXT: kmovw %eax, %k3 +; AVX512DQ-NEXT: kxorw %k2, %k0, %k0 +; AVX512DQ-NEXT: kshiftrw $4, %k0, %k2 +; AVX512DQ-NEXT: kmovw %r9d, %k3 ; AVX512DQ-NEXT: kxorw %k3, %k2, %k2 ; AVX512DQ-NEXT: kshiftlw $15, %k2, %k2 ; AVX512DQ-NEXT: kshiftrw $11, %k2, %k2 -; AVX512DQ-NEXT: kxorw %k2, %k1, %k1 -; AVX512DQ-NEXT: kshiftrw $5, %k1, %k2 +; AVX512DQ-NEXT: kxorw %k2, %k0, %k0 +; AVX512DQ-NEXT: kshiftrw $5, %k0, %k2 ; AVX512DQ-NEXT: movb {{[0-9]+}}(%rsp), %al ; AVX512DQ-NEXT: kmovw %eax, %k3 ; AVX512DQ-NEXT: kxorw %k3, %k2, %k2 ; AVX512DQ-NEXT: kshiftlw $15, %k2, %k2 ; AVX512DQ-NEXT: kshiftrw $10, %k2, %k2 -; AVX512DQ-NEXT: kxorw %k2, %k1, %k1 -; AVX512DQ-NEXT: kshiftrw $6, %k1, %k2 +; AVX512DQ-NEXT: kxorw %k2, %k0, %k0 +; AVX512DQ-NEXT: kshiftrw $6, %k0, %k2 ; AVX512DQ-NEXT: movb {{[0-9]+}}(%rsp), %al ; AVX512DQ-NEXT: kmovw %eax, %k3 ; AVX512DQ-NEXT: kxorw %k3, %k2, %k2 ; AVX512DQ-NEXT: kshiftlw $15, %k2, %k2 ; AVX512DQ-NEXT: kshiftrw $9, %k2, %k2 -; AVX512DQ-NEXT: kxorw %k2, %k1, %k1 -; AVX512DQ-NEXT: kshiftrw $7, %k1, %k2 +; AVX512DQ-NEXT: kxorw %k2, %k0, %k0 +; AVX512DQ-NEXT: kshiftrw $7, %k0, %k2 ; AVX512DQ-NEXT: movb {{[0-9]+}}(%rsp), %al ; AVX512DQ-NEXT: kmovw %eax, %k3 ; AVX512DQ-NEXT: kxorw %k3, %k2, %k2 ; AVX512DQ-NEXT: kshiftlw $15, %k2, %k2 ; AVX512DQ-NEXT: kshiftrw $8, %k2, %k2 -; AVX512DQ-NEXT: kxorw %k2, %k1, %k1 -; AVX512DQ-NEXT: kshiftrw $8, %k1, %k2 +; AVX512DQ-NEXT: kxorw %k2, %k0, %k0 +; AVX512DQ-NEXT: kshiftrw $8, %k0, %k2 ; AVX512DQ-NEXT: movb {{[0-9]+}}(%rsp), %al ; AVX512DQ-NEXT: kmovw %eax, %k3 ; AVX512DQ-NEXT: kxorw %k3, %k2, %k2 ; AVX512DQ-NEXT: kshiftlw $15, %k2, %k2 ; AVX512DQ-NEXT: kshiftrw $7, %k2, %k2 -; AVX512DQ-NEXT: kxorw %k2, %k1, %k1 -; AVX512DQ-NEXT: kshiftrw $9, %k1, %k2 +; AVX512DQ-NEXT: kxorw %k2, %k0, %k0 +; AVX512DQ-NEXT: kshiftrw $9, %k0, %k2 ; AVX512DQ-NEXT: movb {{[0-9]+}}(%rsp), %al ; AVX512DQ-NEXT: kmovw %eax, %k3 ; AVX512DQ-NEXT: kxorw %k3, %k2, %k2 ; AVX512DQ-NEXT: kshiftlw $15, %k2, %k2 ; AVX512DQ-NEXT: kshiftrw $6, %k2, %k2 -; AVX512DQ-NEXT: kxorw %k2, %k1, %k1 -; AVX512DQ-NEXT: kshiftrw $10, %k1, %k2 +; AVX512DQ-NEXT: kxorw %k2, %k0, %k0 +; AVX512DQ-NEXT: kshiftrw $10, %k0, %k2 ; AVX512DQ-NEXT: movb {{[0-9]+}}(%rsp), %al ; AVX512DQ-NEXT: kmovw %eax, %k3 ; AVX512DQ-NEXT: kxorw %k3, %k2, %k2 ; AVX512DQ-NEXT: kshiftlw $15, %k2, %k2 ; AVX512DQ-NEXT: kshiftrw $5, %k2, %k2 -; AVX512DQ-NEXT: kxorw %k2, %k1, %k1 -; AVX512DQ-NEXT: kshiftrw $11, %k1, %k2 +; AVX512DQ-NEXT: kxorw %k2, %k0, %k0 +; AVX512DQ-NEXT: kshiftrw $11, %k0, %k2 ; AVX512DQ-NEXT: movb {{[0-9]+}}(%rsp), %al ; AVX512DQ-NEXT: kmovw %eax, %k3 ; AVX512DQ-NEXT: kxorw %k3, %k2, %k2 ; AVX512DQ-NEXT: kshiftlw $15, %k2, %k2 ; AVX512DQ-NEXT: kshiftrw $4, %k2, %k2 -; AVX512DQ-NEXT: kxorw %k2, %k1, %k1 -; AVX512DQ-NEXT: kshiftrw $12, %k1, %k2 +; AVX512DQ-NEXT: kxorw %k2, %k0, %k0 +; AVX512DQ-NEXT: kshiftrw $12, %k0, %k2 ; AVX512DQ-NEXT: movb {{[0-9]+}}(%rsp), %al ; AVX512DQ-NEXT: kmovw %eax, %k3 ; AVX512DQ-NEXT: kxorw %k3, %k2, %k2 ; AVX512DQ-NEXT: kshiftlw $15, %k2, %k2 ; AVX512DQ-NEXT: kshiftrw $3, %k2, %k2 -; AVX512DQ-NEXT: kxorw %k2, %k1, %k1 -; AVX512DQ-NEXT: kshiftrw $13, %k1, %k2 +; AVX512DQ-NEXT: kxorw %k2, %k0, %k0 +; AVX512DQ-NEXT: kshiftrw $13, %k0, %k2 ; AVX512DQ-NEXT: movb {{[0-9]+}}(%rsp), %al ; AVX512DQ-NEXT: kmovw %eax, %k3 ; AVX512DQ-NEXT: kxorw %k3, %k2, %k2 ; AVX512DQ-NEXT: kshiftlw $15, %k2, %k2 ; AVX512DQ-NEXT: kshiftrw $2, %k2, %k2 -; AVX512DQ-NEXT: kxorw %k2, %k1, %k1 -; AVX512DQ-NEXT: kshiftrw $14, %k1, %k2 +; AVX512DQ-NEXT: kxorw %k2, %k0, %k0 +; AVX512DQ-NEXT: kshiftrw $14, %k0, %k2 ; AVX512DQ-NEXT: movb {{[0-9]+}}(%rsp), %al ; AVX512DQ-NEXT: kmovw %eax, %k3 ; AVX512DQ-NEXT: kxorw %k3, %k2, %k2 ; AVX512DQ-NEXT: kshiftlw $15, %k2, %k2 ; AVX512DQ-NEXT: kshiftrw $1, %k2, %k2 -; AVX512DQ-NEXT: kxorw %k2, %k1, %k1 -; AVX512DQ-NEXT: kshiftlw $1, %k1, %k1 -; AVX512DQ-NEXT: kshiftrw $1, %k1, %k1 +; AVX512DQ-NEXT: kxorw %k2, %k0, %k0 +; AVX512DQ-NEXT: kshiftlw $1, %k0, %k0 +; AVX512DQ-NEXT: kshiftrw $1, %k0, %k0 ; AVX512DQ-NEXT: movb {{[0-9]+}}(%rsp), %al ; AVX512DQ-NEXT: kmovw %eax, %k2 ; AVX512DQ-NEXT: kshiftlw $15, %k2, %k2 -; AVX512DQ-NEXT: korw %k2, %k1, %k1 +; AVX512DQ-NEXT: korw %k2, %k0, %k0 ; AVX512DQ-NEXT: movb {{[0-9]+}}(%rsp), %al ; AVX512DQ-NEXT: kmovw %eax, %k2 ; AVX512DQ-NEXT: movb {{[0-9]+}}(%rsp), %al ; AVX512DQ-NEXT: kmovw %eax, %k3 -; AVX512DQ-NEXT: kshiftrw $1, %k3, %k4 +; AVX512DQ-NEXT: kxorw %k1, %k3, %k3 +; AVX512DQ-NEXT: kshiftrw $2, %k3, %k4 ; AVX512DQ-NEXT: kxorw %k2, %k4, %k2 ; AVX512DQ-NEXT: kshiftlw $15, %k2, %k2 -; AVX512DQ-NEXT: kshiftrw $14, %k2, %k2 +; AVX512DQ-NEXT: kshiftrw $13, %k2, %k2 ; AVX512DQ-NEXT: kxorw %k2, %k3, %k2 -; AVX512DQ-NEXT: kshiftrw $2, %k2, %k3 -; AVX512DQ-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQ-NEXT: kmovw %eax, %k4 -; AVX512DQ-NEXT: kxorw %k4, %k3, %k3 -; AVX512DQ-NEXT: kshiftlw $15, %k3, %k3 -; AVX512DQ-NEXT: kshiftrw $13, %k3, %k3 -; AVX512DQ-NEXT: kxorw %k3, %k2, %k2 ; AVX512DQ-NEXT: kshiftrw $3, %k2, %k3 ; AVX512DQ-NEXT: movb {{[0-9]+}}(%rsp), %al ; AVX512DQ-NEXT: kmovw %eax, %k4 @@ -3513,18 +3377,12 @@ define void @store_64i1(<64 x i1>* %a, <64 x i1> %v) { ; AVX512DQ-NEXT: kmovw %eax, %k3 ; AVX512DQ-NEXT: movb {{[0-9]+}}(%rsp), %al ; AVX512DQ-NEXT: kmovw %eax, %k4 -; AVX512DQ-NEXT: kshiftrw $1, %k4, %k5 +; AVX512DQ-NEXT: kxorw %k1, %k4, %k4 +; AVX512DQ-NEXT: kshiftrw $2, %k4, %k5 ; AVX512DQ-NEXT: kxorw %k3, %k5, %k3 ; AVX512DQ-NEXT: kshiftlw $15, %k3, %k3 -; AVX512DQ-NEXT: kshiftrw $14, %k3, %k3 +; AVX512DQ-NEXT: kshiftrw $13, %k3, %k3 ; AVX512DQ-NEXT: kxorw %k3, %k4, %k3 -; AVX512DQ-NEXT: kshiftrw $2, %k3, %k4 -; AVX512DQ-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQ-NEXT: kmovw %eax, %k5 -; AVX512DQ-NEXT: kxorw %k5, %k4, %k4 -; AVX512DQ-NEXT: kshiftlw $15, %k4, %k4 -; AVX512DQ-NEXT: kshiftrw $13, %k4, %k4 -; AVX512DQ-NEXT: kxorw %k4, %k3, %k3 ; AVX512DQ-NEXT: kshiftrw $3, %k3, %k4 ; AVX512DQ-NEXT: movb {{[0-9]+}}(%rsp), %al ; AVX512DQ-NEXT: kmovw %eax, %k5 @@ -3615,9 +3473,109 @@ define void @store_64i1(<64 x i1>* %a, <64 x i1> %v) { ; AVX512DQ-NEXT: kmovw %eax, %k4 ; AVX512DQ-NEXT: kshiftlw $15, %k4, %k4 ; AVX512DQ-NEXT: korw %k4, %k3, %k3 -; AVX512DQ-NEXT: kmovw %k3, 6(%rdi) -; AVX512DQ-NEXT: kmovw %k2, 4(%rdi) -; AVX512DQ-NEXT: kmovw %k1, 2(%rdi) +; AVX512DQ-NEXT: movb {{[0-9]+}}(%rsp), %al +; AVX512DQ-NEXT: kmovw %eax, %k4 +; AVX512DQ-NEXT: movb {{[0-9]+}}(%rsp), %al +; AVX512DQ-NEXT: kmovw %eax, %k5 +; AVX512DQ-NEXT: kxorw %k1, %k5, %k1 +; AVX512DQ-NEXT: kshiftrw $2, %k1, %k5 +; AVX512DQ-NEXT: kxorw %k4, %k5, %k4 +; AVX512DQ-NEXT: kshiftlw $15, %k4, %k4 +; AVX512DQ-NEXT: kshiftrw $13, %k4, %k4 +; AVX512DQ-NEXT: kxorw %k4, %k1, %k1 +; AVX512DQ-NEXT: kshiftrw $3, %k1, %k4 +; AVX512DQ-NEXT: movb {{[0-9]+}}(%rsp), %al +; AVX512DQ-NEXT: kmovw %eax, %k5 +; AVX512DQ-NEXT: kxorw %k5, %k4, %k4 +; AVX512DQ-NEXT: kshiftlw $15, %k4, %k4 +; AVX512DQ-NEXT: kshiftrw $12, %k4, %k4 +; AVX512DQ-NEXT: kxorw %k4, %k1, %k1 +; AVX512DQ-NEXT: kshiftrw $4, %k1, %k4 +; AVX512DQ-NEXT: movb {{[0-9]+}}(%rsp), %al +; AVX512DQ-NEXT: kmovw %eax, %k5 +; AVX512DQ-NEXT: kxorw %k5, %k4, %k4 +; AVX512DQ-NEXT: kshiftlw $15, %k4, %k4 +; AVX512DQ-NEXT: kshiftrw $11, %k4, %k4 +; AVX512DQ-NEXT: kxorw %k4, %k1, %k1 +; AVX512DQ-NEXT: kshiftrw $5, %k1, %k4 +; AVX512DQ-NEXT: movb {{[0-9]+}}(%rsp), %al +; AVX512DQ-NEXT: kmovw %eax, %k5 +; AVX512DQ-NEXT: kxorw %k5, %k4, %k4 +; AVX512DQ-NEXT: kshiftlw $15, %k4, %k4 +; AVX512DQ-NEXT: kshiftrw $10, %k4, %k4 +; AVX512DQ-NEXT: kxorw %k4, %k1, %k1 +; AVX512DQ-NEXT: kshiftrw $6, %k1, %k4 +; AVX512DQ-NEXT: movb {{[0-9]+}}(%rsp), %al +; AVX512DQ-NEXT: kmovw %eax, %k5 +; AVX512DQ-NEXT: kxorw %k5, %k4, %k4 +; AVX512DQ-NEXT: kshiftlw $15, %k4, %k4 +; AVX512DQ-NEXT: kshiftrw $9, %k4, %k4 +; AVX512DQ-NEXT: kxorw %k4, %k1, %k1 +; AVX512DQ-NEXT: kshiftrw $7, %k1, %k4 +; AVX512DQ-NEXT: movb {{[0-9]+}}(%rsp), %al +; AVX512DQ-NEXT: kmovw %eax, %k5 +; AVX512DQ-NEXT: kxorw %k5, %k4, %k4 +; AVX512DQ-NEXT: kshiftlw $15, %k4, %k4 +; AVX512DQ-NEXT: kshiftrw $8, %k4, %k4 +; AVX512DQ-NEXT: kxorw %k4, %k1, %k1 +; AVX512DQ-NEXT: kshiftrw $8, %k1, %k4 +; AVX512DQ-NEXT: movb {{[0-9]+}}(%rsp), %al +; AVX512DQ-NEXT: kmovw %eax, %k5 +; AVX512DQ-NEXT: kxorw %k5, %k4, %k4 +; AVX512DQ-NEXT: kshiftlw $15, %k4, %k4 +; AVX512DQ-NEXT: kshiftrw $7, %k4, %k4 +; AVX512DQ-NEXT: kxorw %k4, %k1, %k1 +; AVX512DQ-NEXT: kshiftrw $9, %k1, %k4 +; AVX512DQ-NEXT: movb {{[0-9]+}}(%rsp), %al +; AVX512DQ-NEXT: kmovw %eax, %k5 +; AVX512DQ-NEXT: kxorw %k5, %k4, %k4 +; AVX512DQ-NEXT: kshiftlw $15, %k4, %k4 +; AVX512DQ-NEXT: kshiftrw $6, %k4, %k4 +; AVX512DQ-NEXT: kxorw %k4, %k1, %k1 +; AVX512DQ-NEXT: kshiftrw $10, %k1, %k4 +; AVX512DQ-NEXT: movb {{[0-9]+}}(%rsp), %al +; AVX512DQ-NEXT: kmovw %eax, %k5 +; AVX512DQ-NEXT: kxorw %k5, %k4, %k4 +; AVX512DQ-NEXT: kshiftlw $15, %k4, %k4 +; AVX512DQ-NEXT: kshiftrw $5, %k4, %k4 +; AVX512DQ-NEXT: kxorw %k4, %k1, %k1 +; AVX512DQ-NEXT: kshiftrw $11, %k1, %k4 +; AVX512DQ-NEXT: movb {{[0-9]+}}(%rsp), %al +; AVX512DQ-NEXT: kmovw %eax, %k5 +; AVX512DQ-NEXT: kxorw %k5, %k4, %k4 +; AVX512DQ-NEXT: kshiftlw $15, %k4, %k4 +; AVX512DQ-NEXT: kshiftrw $4, %k4, %k4 +; AVX512DQ-NEXT: kxorw %k4, %k1, %k1 +; AVX512DQ-NEXT: kshiftrw $12, %k1, %k4 +; AVX512DQ-NEXT: movb {{[0-9]+}}(%rsp), %al +; AVX512DQ-NEXT: kmovw %eax, %k5 +; AVX512DQ-NEXT: kxorw %k5, %k4, %k4 +; AVX512DQ-NEXT: kshiftlw $15, %k4, %k4 +; AVX512DQ-NEXT: kshiftrw $3, %k4, %k4 +; AVX512DQ-NEXT: kxorw %k4, %k1, %k1 +; AVX512DQ-NEXT: kshiftrw $13, %k1, %k4 +; AVX512DQ-NEXT: movb {{[0-9]+}}(%rsp), %al +; AVX512DQ-NEXT: kmovw %eax, %k5 +; AVX512DQ-NEXT: kxorw %k5, %k4, %k4 +; AVX512DQ-NEXT: kshiftlw $15, %k4, %k4 +; AVX512DQ-NEXT: kshiftrw $2, %k4, %k4 +; AVX512DQ-NEXT: kxorw %k4, %k1, %k1 +; AVX512DQ-NEXT: kshiftrw $14, %k1, %k4 +; AVX512DQ-NEXT: movb {{[0-9]+}}(%rsp), %al +; AVX512DQ-NEXT: kmovw %eax, %k5 +; AVX512DQ-NEXT: kxorw %k5, %k4, %k4 +; AVX512DQ-NEXT: kshiftlw $15, %k4, %k4 +; AVX512DQ-NEXT: kshiftrw $1, %k4, %k4 +; AVX512DQ-NEXT: kxorw %k4, %k1, %k1 +; AVX512DQ-NEXT: kshiftlw $1, %k1, %k1 +; AVX512DQ-NEXT: kshiftrw $1, %k1, %k1 +; AVX512DQ-NEXT: movb {{[0-9]+}}(%rsp), %al +; AVX512DQ-NEXT: kmovw %eax, %k4 +; AVX512DQ-NEXT: kshiftlw $15, %k4, %k4 +; AVX512DQ-NEXT: korw %k4, %k1, %k1 +; AVX512DQ-NEXT: kmovw %k1, 6(%rdi) +; AVX512DQ-NEXT: kmovw %k3, 4(%rdi) +; AVX512DQ-NEXT: kmovw %k2, 2(%rdi) ; AVX512DQ-NEXT: kmovw %k0, (%rdi) ; AVX512DQ-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/vec_smulo.ll b/llvm/test/CodeGen/X86/vec_smulo.ll index 025934d..a554ad0 100644 --- a/llvm/test/CodeGen/X86/vec_smulo.ll +++ b/llvm/test/CodeGen/X86/vec_smulo.ll @@ -1730,24 +1730,20 @@ define <2 x i32> @smulo_v2i64(<2 x i64> %a0, <2 x i64> %a1, <2 x i64>* %p2) noun ; ; AVX512-LABEL: smulo_v2i64: ; AVX512: # %bb.0: -; AVX512-NEXT: vpextrq $1, %xmm1, %rax -; AVX512-NEXT: vpextrq $1, %xmm0, %rcx -; AVX512-NEXT: vmovq %xmm1, %rdx -; AVX512-NEXT: vmovq %xmm0, %rsi +; AVX512-NEXT: vmovq %xmm1, %rax +; AVX512-NEXT: vmovq %xmm0, %rcx +; AVX512-NEXT: vpextrq $1, %xmm1, %rdx +; AVX512-NEXT: vpextrq $1, %xmm0, %rsi ; AVX512-NEXT: imulq %rdx, %rsi -; AVX512-NEXT: seto %dl +; AVX512-NEXT: vmovq %rsi, %xmm0 ; AVX512-NEXT: imulq %rax, %rcx -; AVX512-NEXT: vmovq %rcx, %xmm0 -; AVX512-NEXT: vmovq %rsi, %xmm1 +; AVX512-NEXT: vmovq %rcx, %xmm1 ; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; AVX512-NEXT: seto %al ; AVX512-NEXT: kmovd %eax, %k0 -; AVX512-NEXT: kmovd %edx, %k1 -; AVX512-NEXT: kshiftrw $1, %k1, %k2 -; AVX512-NEXT: kxorw %k0, %k2, %k0 -; AVX512-NEXT: kshiftlw $15, %k0, %k0 -; AVX512-NEXT: kshiftrw $14, %k0, %k0 -; AVX512-NEXT: kxorw %k0, %k1, %k1 +; AVX512-NEXT: kshiftlw $15, %k0, %k1 +; AVX512-NEXT: kshiftrw $14, %k1, %k1 +; AVX512-NEXT: kxorw %k1, %k0, %k1 ; AVX512-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 ; AVX512-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z} ; AVX512-NEXT: vmovdqa %xmm1, (%rdi) @@ -2201,73 +2197,46 @@ define <4 x i32> @smulo_v4i1(<4 x i1> %a0, <4 x i1> %a1, <4 x i1>* %p2) nounwind ; ; AVX512-LABEL: smulo_v4i1: ; AVX512: # %bb.0: -; AVX512-NEXT: pushq %rbx -; AVX512-NEXT: vpslld $31, %xmm1, %xmm1 -; AVX512-NEXT: vptestmd %xmm1, %xmm1, %k0 +; AVX512-NEXT: vpslld $31, %xmm0, %xmm0 +; AVX512-NEXT: vptestmd %xmm0, %xmm0, %k0 ; AVX512-NEXT: kshiftrw $3, %k0, %k1 ; AVX512-NEXT: kmovd %k1, %r9d ; AVX512-NEXT: andb $1, %r9b ; AVX512-NEXT: negb %r9b -; AVX512-NEXT: vpslld $31, %xmm0, %xmm0 +; AVX512-NEXT: vpslld $31, %xmm1, %xmm0 ; AVX512-NEXT: vptestmd %xmm0, %xmm0, %k1 ; AVX512-NEXT: kshiftrw $3, %k1, %k2 ; AVX512-NEXT: kmovd %k2, %r10d ; AVX512-NEXT: andb $1, %r10b ; AVX512-NEXT: negb %r10b ; AVX512-NEXT: kshiftrw $2, %k1, %k2 -; AVX512-NEXT: kmovd %k2, %r11d -; AVX512-NEXT: andb $1, %r11b -; AVX512-NEXT: negb %r11b -; AVX512-NEXT: kshiftrw $2, %k0, %k2 -; AVX512-NEXT: kmovd %k2, %ebx -; AVX512-NEXT: andb $1, %bl -; AVX512-NEXT: negb %bl -; AVX512-NEXT: kshiftrw $1, %k0, %k2 -; AVX512-NEXT: kmovd %k2, %esi +; AVX512-NEXT: kmovd %k1, %ecx +; AVX512-NEXT: andb $1, %cl +; AVX512-NEXT: negb %cl +; AVX512-NEXT: kshiftrw $2, %k0, %k1 +; AVX512-NEXT: kmovd %k0, %esi ; AVX512-NEXT: andb $1, %sil ; AVX512-NEXT: negb %sil -; AVX512-NEXT: kshiftrw $1, %k1, %k2 -; AVX512-NEXT: kmovd %k2, %edx -; AVX512-NEXT: andb $1, %dl -; AVX512-NEXT: negb %dl ; AVX512-NEXT: kmovd %k1, %eax ; AVX512-NEXT: andb $1, %al ; AVX512-NEXT: negb %al -; AVX512-NEXT: kmovd %k0, %ecx -; AVX512-NEXT: andb $1, %cl -; AVX512-NEXT: negb %cl +; AVX512-NEXT: kmovd %k2, %edx +; AVX512-NEXT: andb $1, %dl +; AVX512-NEXT: negb %dl ; AVX512-NEXT: # kill: def $al killed $al killed $eax -; AVX512-NEXT: imulb %cl +; AVX512-NEXT: imulb %dl ; AVX512-NEXT: movl %eax, %r8d ; AVX512-NEXT: seto %al -; AVX512-NEXT: movl %r8d, %ecx -; AVX512-NEXT: andb $1, %cl -; AVX512-NEXT: negb %cl -; AVX512-NEXT: cmpb %r8b, %cl -; AVX512-NEXT: setne %cl -; AVX512-NEXT: orb %al, %cl -; AVX512-NEXT: setne %al -; AVX512-NEXT: kmovd %eax, %k0 -; AVX512-NEXT: kshiftrw $1, %k0, %k1 -; AVX512-NEXT: movl %edx, %eax -; AVX512-NEXT: imulb %sil -; AVX512-NEXT: movl %eax, %edx -; AVX512-NEXT: seto %al -; AVX512-NEXT: movl %edx, %ecx -; AVX512-NEXT: andb $1, %cl -; AVX512-NEXT: negb %cl -; AVX512-NEXT: cmpb %dl, %cl -; AVX512-NEXT: setne %cl -; AVX512-NEXT: orb %al, %cl +; AVX512-NEXT: movl %r8d, %edx +; AVX512-NEXT: andb $1, %dl +; AVX512-NEXT: negb %dl +; AVX512-NEXT: cmpb %r8b, %dl +; AVX512-NEXT: setne %dl +; AVX512-NEXT: orb %al, %dl ; AVX512-NEXT: setne %al -; AVX512-NEXT: kmovd %eax, %k2 -; AVX512-NEXT: kxorw %k2, %k1, %k1 -; AVX512-NEXT: kshiftlw $15, %k1, %k1 -; AVX512-NEXT: kshiftrw $14, %k1, %k1 -; AVX512-NEXT: kxorw %k1, %k0, %k0 -; AVX512-NEXT: kshiftrw $2, %k0, %k1 -; AVX512-NEXT: movl %r11d, %eax -; AVX512-NEXT: imulb %bl +; AVX512-NEXT: kmovd %eax, %k1 +; AVX512-NEXT: movl %esi, %eax +; AVX512-NEXT: imulb %cl ; AVX512-NEXT: movl %eax, %esi ; AVX512-NEXT: seto %al ; AVX512-NEXT: movl %esi, %ecx @@ -2278,38 +2247,37 @@ define <4 x i32> @smulo_v4i1(<4 x i1> %a0, <4 x i1> %a1, <4 x i1>* %p2) nounwind ; AVX512-NEXT: orb %al, %cl ; AVX512-NEXT: setne %al ; AVX512-NEXT: kmovd %eax, %k2 -; AVX512-NEXT: kxorw %k2, %k1, %k1 +; AVX512-NEXT: kshiftlw $15, %k0, %k0 +; AVX512-NEXT: kshiftrw $14, %k0, %k0 +; AVX512-NEXT: kxorw %k0, %k2, %k2 +; AVX512-NEXT: kshiftrw $2, %k2, %k3 +; AVX512-NEXT: kxorw %k1, %k3, %k1 ; AVX512-NEXT: kshiftlw $15, %k1, %k1 ; AVX512-NEXT: kshiftrw $13, %k1, %k1 -; AVX512-NEXT: kxorw %k1, %k0, %k0 -; AVX512-NEXT: kshiftlw $13, %k0, %k0 -; AVX512-NEXT: kshiftrw $13, %k0, %k0 -; AVX512-NEXT: movl %r10d, %eax -; AVX512-NEXT: imulb %r9b +; AVX512-NEXT: kxorw %k1, %k2, %k1 +; AVX512-NEXT: kshiftlw $13, %k1, %k1 +; AVX512-NEXT: kshiftrw $13, %k1, %k1 +; AVX512-NEXT: movl %r9d, %eax +; AVX512-NEXT: imulb %r10b ; AVX512-NEXT: # kill: def $al killed $al def $eax ; AVX512-NEXT: seto %cl -; AVX512-NEXT: movl %eax, %ebx -; AVX512-NEXT: andb $1, %bl -; AVX512-NEXT: negb %bl -; AVX512-NEXT: cmpb %al, %bl -; AVX512-NEXT: setne %bl -; AVX512-NEXT: orb %cl, %bl +; AVX512-NEXT: movl %eax, %edx +; AVX512-NEXT: andb $1, %dl +; AVX512-NEXT: negb %dl +; AVX512-NEXT: cmpb %al, %dl +; AVX512-NEXT: setne %dl +; AVX512-NEXT: orb %cl, %dl ; AVX512-NEXT: setne %cl -; AVX512-NEXT: kmovd %ecx, %k1 -; AVX512-NEXT: kshiftlw $3, %k1, %k1 -; AVX512-NEXT: korw %k1, %k0, %k1 +; AVX512-NEXT: kmovd %ecx, %k2 +; AVX512-NEXT: kshiftlw $3, %k2, %k2 +; AVX512-NEXT: korw %k2, %k1, %k1 ; AVX512-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 ; AVX512-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z} -; AVX512-NEXT: kmovd %r8d, %k0 -; AVX512-NEXT: kshiftrw $1, %k0, %k1 -; AVX512-NEXT: kmovd %edx, %k2 -; AVX512-NEXT: kxorw %k2, %k1, %k1 -; AVX512-NEXT: kshiftlw $15, %k1, %k1 -; AVX512-NEXT: kshiftrw $14, %k1, %k1 -; AVX512-NEXT: kxorw %k1, %k0, %k0 -; AVX512-NEXT: kshiftrw $2, %k0, %k1 +; AVX512-NEXT: kmovd %r8d, %k1 ; AVX512-NEXT: kmovd %esi, %k2 -; AVX512-NEXT: kxorw %k2, %k1, %k1 +; AVX512-NEXT: kxorw %k0, %k2, %k0 +; AVX512-NEXT: kshiftrw $2, %k0, %k2 +; AVX512-NEXT: kxorw %k1, %k2, %k1 ; AVX512-NEXT: kshiftlw $15, %k1, %k1 ; AVX512-NEXT: kshiftrw $13, %k1, %k1 ; AVX512-NEXT: kxorw %k1, %k0, %k0 @@ -2321,7 +2289,6 @@ define <4 x i32> @smulo_v4i1(<4 x i1> %a0, <4 x i1> %a1, <4 x i1>* %p2) nounwind ; AVX512-NEXT: kxorw %k1, %k0, %k0 ; AVX512-NEXT: kmovd %k0, %eax ; AVX512-NEXT: movb %al, (%rdi) -; AVX512-NEXT: popq %rbx ; AVX512-NEXT: retq %t = call {<4 x i1>, <4 x i1>} @llvm.smul.with.overflow.v4i1(<4 x i1> %a0, <4 x i1> %a1) %val = extractvalue {<4 x i1>, <4 x i1>} %t, 0 diff --git a/llvm/test/CodeGen/X86/vec_umulo.ll b/llvm/test/CodeGen/X86/vec_umulo.ll index 61c470a..dfe720f 100644 --- a/llvm/test/CodeGen/X86/vec_umulo.ll +++ b/llvm/test/CodeGen/X86/vec_umulo.ll @@ -1532,26 +1532,21 @@ define <2 x i32> @umulo_v2i64(<2 x i64> %a0, <2 x i64> %a1, <2 x i64>* %p2) noun ; ; AVX512-LABEL: umulo_v2i64: ; AVX512: # %bb.0: -; AVX512-NEXT: vpextrq $1, %xmm0, %rcx -; AVX512-NEXT: vpextrq $1, %xmm1, %r8 -; AVX512-NEXT: vmovq %xmm0, %rax -; AVX512-NEXT: vmovq %xmm1, %rdx +; AVX512-NEXT: vmovq %xmm0, %rcx +; AVX512-NEXT: vmovq %xmm1, %rsi +; AVX512-NEXT: vpextrq $1, %xmm0, %rax +; AVX512-NEXT: vpextrq $1, %xmm1, %rdx ; AVX512-NEXT: mulq %rdx -; AVX512-NEXT: movq %rax, %rsi -; AVX512-NEXT: seto %r9b -; AVX512-NEXT: movq %rcx, %rax -; AVX512-NEXT: mulq %r8 ; AVX512-NEXT: vmovq %rax, %xmm0 -; AVX512-NEXT: vmovq %rsi, %xmm1 +; AVX512-NEXT: movq %rcx, %rax +; AVX512-NEXT: mulq %rsi +; AVX512-NEXT: vmovq %rax, %xmm1 ; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; AVX512-NEXT: seto %al ; AVX512-NEXT: kmovd %eax, %k0 -; AVX512-NEXT: kmovd %r9d, %k1 -; AVX512-NEXT: kshiftrw $1, %k1, %k2 -; AVX512-NEXT: kxorw %k0, %k2, %k0 -; AVX512-NEXT: kshiftlw $15, %k0, %k0 -; AVX512-NEXT: kshiftrw $14, %k0, %k0 -; AVX512-NEXT: kxorw %k0, %k1, %k1 +; AVX512-NEXT: kshiftlw $15, %k0, %k1 +; AVX512-NEXT: kshiftrw $14, %k1, %k1 +; AVX512-NEXT: kxorw %k1, %k0, %k1 ; AVX512-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 ; AVX512-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z} ; AVX512-NEXT: vmovdqa %xmm1, (%rdi) @@ -1950,7 +1945,6 @@ define <4 x i32> @umulo_v4i1(<4 x i1> %a0, <4 x i1> %a1, <4 x i1>* %p2) nounwind ; ; AVX512-LABEL: umulo_v4i1: ; AVX512: # %bb.0: -; AVX512-NEXT: pushq %rbx ; AVX512-NEXT: vpslld $31, %xmm0, %xmm0 ; AVX512-NEXT: vptestmd %xmm0, %xmm0, %k0 ; AVX512-NEXT: kshiftrw $3, %k0, %k1 @@ -1962,47 +1956,26 @@ define <4 x i32> @umulo_v4i1(<4 x i1> %a0, <4 x i1> %a1, <4 x i1>* %p2) nounwind ; AVX512-NEXT: kmovd %k2, %r10d ; AVX512-NEXT: andb $1, %r10b ; AVX512-NEXT: kshiftrw $2, %k0, %k2 -; AVX512-NEXT: kmovd %k2, %r11d -; AVX512-NEXT: andb $1, %r11b -; AVX512-NEXT: kshiftrw $2, %k1, %k2 -; AVX512-NEXT: kmovd %k2, %ebx -; AVX512-NEXT: andb $1, %bl -; AVX512-NEXT: kshiftrw $1, %k0, %k2 -; AVX512-NEXT: kmovd %k2, %edx -; AVX512-NEXT: andb $1, %dl -; AVX512-NEXT: kshiftrw $1, %k1, %k2 -; AVX512-NEXT: kmovd %k2, %esi +; AVX512-NEXT: kmovd %k0, %esi ; AVX512-NEXT: andb $1, %sil -; AVX512-NEXT: kmovd %k0, %eax -; AVX512-NEXT: andb $1, %al +; AVX512-NEXT: kshiftrw $2, %k1, %k0 ; AVX512-NEXT: kmovd %k1, %ecx ; AVX512-NEXT: andb $1, %cl +; AVX512-NEXT: kmovd %k2, %eax +; AVX512-NEXT: andb $1, %al +; AVX512-NEXT: kmovd %k0, %edx +; AVX512-NEXT: andb $1, %dl ; AVX512-NEXT: # kill: def $al killed $al killed $eax -; AVX512-NEXT: mulb %cl +; AVX512-NEXT: mulb %dl ; AVX512-NEXT: movl %eax, %r8d ; AVX512-NEXT: seto %al ; AVX512-NEXT: testb $-2, %r8b -; AVX512-NEXT: setne %cl -; AVX512-NEXT: orb %al, %cl -; AVX512-NEXT: setne %al -; AVX512-NEXT: kmovd %eax, %k0 -; AVX512-NEXT: kshiftrw $1, %k0, %k1 -; AVX512-NEXT: movl %edx, %eax -; AVX512-NEXT: mulb %sil -; AVX512-NEXT: movl %eax, %edx -; AVX512-NEXT: seto %al -; AVX512-NEXT: testb $-2, %dl -; AVX512-NEXT: setne %cl -; AVX512-NEXT: orb %al, %cl +; AVX512-NEXT: setne %dl +; AVX512-NEXT: orb %al, %dl ; AVX512-NEXT: setne %al -; AVX512-NEXT: kmovd %eax, %k2 -; AVX512-NEXT: kxorw %k2, %k1, %k1 -; AVX512-NEXT: kshiftlw $15, %k1, %k1 -; AVX512-NEXT: kshiftrw $14, %k1, %k1 -; AVX512-NEXT: kxorw %k1, %k0, %k0 -; AVX512-NEXT: kshiftrw $2, %k0, %k1 -; AVX512-NEXT: movl %r11d, %eax -; AVX512-NEXT: mulb %bl +; AVX512-NEXT: kmovd %eax, %k1 +; AVX512-NEXT: movl %esi, %eax +; AVX512-NEXT: mulb %cl ; AVX512-NEXT: movl %eax, %esi ; AVX512-NEXT: seto %al ; AVX512-NEXT: testb $-2, %sil @@ -2010,35 +1983,34 @@ define <4 x i32> @umulo_v4i1(<4 x i1> %a0, <4 x i1> %a1, <4 x i1>* %p2) nounwind ; AVX512-NEXT: orb %al, %cl ; AVX512-NEXT: setne %al ; AVX512-NEXT: kmovd %eax, %k2 -; AVX512-NEXT: kxorw %k2, %k1, %k1 +; AVX512-NEXT: kshiftlw $15, %k0, %k0 +; AVX512-NEXT: kshiftrw $14, %k0, %k0 +; AVX512-NEXT: kxorw %k0, %k2, %k2 +; AVX512-NEXT: kshiftrw $2, %k2, %k3 +; AVX512-NEXT: kxorw %k1, %k3, %k1 ; AVX512-NEXT: kshiftlw $15, %k1, %k1 ; AVX512-NEXT: kshiftrw $13, %k1, %k1 -; AVX512-NEXT: kxorw %k1, %k0, %k0 -; AVX512-NEXT: kshiftlw $13, %k0, %k0 -; AVX512-NEXT: kshiftrw $13, %k0, %k0 +; AVX512-NEXT: kxorw %k1, %k2, %k1 +; AVX512-NEXT: kshiftlw $13, %k1, %k1 +; AVX512-NEXT: kshiftrw $13, %k1, %k1 ; AVX512-NEXT: movl %r9d, %eax ; AVX512-NEXT: mulb %r10b ; AVX512-NEXT: # kill: def $al killed $al def $eax ; AVX512-NEXT: seto %cl ; AVX512-NEXT: testb $-2, %al -; AVX512-NEXT: setne %bl -; AVX512-NEXT: orb %cl, %bl +; AVX512-NEXT: setne %dl +; AVX512-NEXT: orb %cl, %dl ; AVX512-NEXT: setne %cl -; AVX512-NEXT: kmovd %ecx, %k1 -; AVX512-NEXT: kshiftlw $3, %k1, %k1 -; AVX512-NEXT: korw %k1, %k0, %k1 +; AVX512-NEXT: kmovd %ecx, %k2 +; AVX512-NEXT: kshiftlw $3, %k2, %k2 +; AVX512-NEXT: korw %k2, %k1, %k1 ; AVX512-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 ; AVX512-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z} -; AVX512-NEXT: kmovd %r8d, %k0 -; AVX512-NEXT: kshiftrw $1, %k0, %k1 -; AVX512-NEXT: kmovd %edx, %k2 -; AVX512-NEXT: kxorw %k2, %k1, %k1 -; AVX512-NEXT: kshiftlw $15, %k1, %k1 -; AVX512-NEXT: kshiftrw $14, %k1, %k1 -; AVX512-NEXT: kxorw %k1, %k0, %k0 -; AVX512-NEXT: kshiftrw $2, %k0, %k1 +; AVX512-NEXT: kmovd %r8d, %k1 ; AVX512-NEXT: kmovd %esi, %k2 -; AVX512-NEXT: kxorw %k2, %k1, %k1 +; AVX512-NEXT: kxorw %k0, %k2, %k0 +; AVX512-NEXT: kshiftrw $2, %k0, %k2 +; AVX512-NEXT: kxorw %k1, %k2, %k1 ; AVX512-NEXT: kshiftlw $15, %k1, %k1 ; AVX512-NEXT: kshiftrw $13, %k1, %k1 ; AVX512-NEXT: kxorw %k1, %k0, %k0 @@ -2050,7 +2022,6 @@ define <4 x i32> @umulo_v4i1(<4 x i1> %a0, <4 x i1> %a1, <4 x i1>* %p2) nounwind ; AVX512-NEXT: kxorw %k1, %k0, %k0 ; AVX512-NEXT: kmovd %k0, %eax ; AVX512-NEXT: movb %al, (%rdi) -; AVX512-NEXT: popq %rbx ; AVX512-NEXT: retq %t = call {<4 x i1>, <4 x i1>} @llvm.umul.with.overflow.v4i1(<4 x i1> %a0, <4 x i1> %a1) %val = extractvalue {<4 x i1>, <4 x i1>} %t, 0