From a6ccc8d365fadcac1e5ecfe6c3ac78c3aa7f9b61 Mon Sep 17 00:00:00 2001 From: Michael Kuperstein Date: Thu, 25 Aug 2016 21:55:41 +0000 Subject: [PATCH] Revert r274613 because it breaks the test suite with AVX512 This reverts most of r274613 and its follow-ups (r276347, r277289), due to miscompiles in the test suite. The FastISel change was left in, because it apparently fixes an unrelated issue. This fixes 4 out of the 5 test failures in PR29112. llvm-svn: 279782 --- llvm/lib/Target/X86/X86ISelLowering.cpp | 49 +-- llvm/lib/Target/X86/X86InstrAVX512.td | 92 ++---- llvm/test/CodeGen/X86/avx512-cmp.ll | 2 + llvm/test/CodeGen/X86/avx512-ext.ll | 246 +++++++------- llvm/test/CodeGen/X86/avx512-insert-extract.ll | 243 ++++++++++++++ llvm/test/CodeGen/X86/avx512-intrinsics.ll | 20 +- llvm/test/CodeGen/X86/avx512-mask-op.ll | 23 +- llvm/test/CodeGen/X86/avx512dq-intrinsics.ll | 8 +- llvm/test/CodeGen/X86/fast-isel-select-cmov.ll | 3 + llvm/test/CodeGen/X86/masked_gather_scatter.ll | 426 +++++++++++++++++------- llvm/test/CodeGen/X86/masked_memop.ll | 438 ++++++++++++++++++------- llvm/test/CodeGen/X86/pr27591.ll | 19 +- llvm/test/CodeGen/X86/pr28173.ll | 21 +- llvm/test/CodeGen/X86/xaluo.ll | 2 +- 14 files changed, 1112 insertions(+), 480 deletions(-) diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index d92ebb9..e39feaf 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -15998,7 +15998,7 @@ SDValue X86TargetLowering::LowerSETCCE(SDValue Op, SelectionDAG &DAG) const { SDValue SetCC = DAG.getNode(X86ISD::SETCC, DL, MVT::i8, DAG.getConstant(CC, DL, MVT::i8), Cmp.getValue(1)); if (Op.getSimpleValueType() == MVT::i1) - return DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, SetCC); + return DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, SetCC); return SetCC; } @@ -16028,18 +16028,14 @@ static bool isX86LogicalCmp(SDValue Op) { return false; } -/// Returns the "condition" node, that may be wrapped with "truncate". -/// Like this: (i1 (trunc (i8 X86ISD::SETCC))). -static SDValue getCondAfterTruncWithZeroHighBitsInput(SDValue V, SelectionDAG &DAG) { +static bool isTruncWithZeroHighBitsInput(SDValue V, SelectionDAG &DAG) { if (V.getOpcode() != ISD::TRUNCATE) - return V; + return false; SDValue VOp0 = V.getOperand(0); unsigned InBits = VOp0.getValueSizeInBits(); unsigned Bits = V.getValueSizeInBits(); - if (DAG.MaskedValueIsZero(VOp0, APInt::getHighBitsSet(InBits,InBits-Bits))) - return V.getOperand(0); - return V; + return DAG.MaskedValueIsZero(VOp0, APInt::getHighBitsSet(InBits,InBits-Bits)); } SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const { @@ -16262,7 +16258,8 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const { if (addTest) { // Look past the truncate if the high bits are known zero. - Cond = getCondAfterTruncWithZeroHighBitsInput(Cond, DAG); + if (isTruncWithZeroHighBitsInput(Cond, DAG)) + Cond = Cond.getOperand(0); // We know the result of AND is compared against zero. Try to match // it to BT. @@ -17098,7 +17095,8 @@ SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const { if (addTest) { // Look pass the truncate if the high bits are known zero. - Cond = getCondAfterTruncWithZeroHighBitsInput(Cond, DAG); + if (isTruncWithZeroHighBitsInput(Cond, DAG)) + Cond = Cond.getOperand(0); // We know the result is compared against zero. Try to match it to BT. 
if (Cond.hasOneUse()) { @@ -18349,7 +18347,7 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget &Subtarget SDValue RHS = DAG.getBitcast(MVT::v16i1, Op.getOperand(2)); SDValue CC = DAG.getConstant(X86CC, dl, MVT::i8); SDValue Test = DAG.getNode(X86ISD::KORTEST, dl, MVT::i32, LHS, RHS); - SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, CC, Test); + SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i1, CC, Test); return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC); } @@ -20891,12 +20889,10 @@ static SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG) { SDValue Sum = DAG.getNode(X86ISD::UMUL, DL, VTs, LHS, RHS); SDValue SetCC = - DAG.getNode(X86ISD::SETCC, DL, MVT::i8, + DAG.getNode(X86ISD::SETCC, DL, N->getValueType(1), DAG.getConstant(X86::COND_O, DL, MVT::i32), SDValue(Sum.getNode(), 2)); - if (N->getValueType(1) == MVT::i1) - SetCC = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, SetCC); return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Sum, SetCC); } } @@ -20906,12 +20902,15 @@ static SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG) { SDValue Sum = DAG.getNode(BaseOp, DL, VTs, LHS, RHS); SDValue SetCC = - DAG.getNode(X86ISD::SETCC, DL, MVT::i8, + DAG.getNode(X86ISD::SETCC, DL, N->getValueType(1), DAG.getConstant(Cond, DL, MVT::i32), SDValue(Sum.getNode(), 1)); - if (N->getValueType(1) == MVT::i1) + if (N->getValueType(1) == MVT::i1) { + SetCC = DAG.getNode(ISD::AssertZext, DL, MVT::i8, SetCC, + DAG.getValueType(MVT::i1)); SetCC = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, SetCC); + } return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Sum, SetCC); } @@ -30968,12 +30967,18 @@ static SDValue combineGatherScatter(SDNode *N, SelectionDAG &DAG) { // as "sbb reg,reg", since it can be extended without zext and produces // an all-ones bit which is more useful than 0/1 in some cases. static SDValue MaterializeSETB(const SDLoc &DL, SDValue EFLAGS, - SelectionDAG &DAG) { - return DAG.getNode(ISD::AND, DL, MVT::i8, + SelectionDAG &DAG, MVT VT) { + if (VT == MVT::i8) + return DAG.getNode(ISD::AND, DL, VT, + DAG.getNode(X86ISD::SETCC_CARRY, DL, MVT::i8, + DAG.getConstant(X86::COND_B, DL, MVT::i8), + EFLAGS), + DAG.getConstant(1, DL, VT)); + assert (VT == MVT::i1 && "Unexpected type for SECCC node"); + return DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, DAG.getNode(X86ISD::SETCC_CARRY, DL, MVT::i8, DAG.getConstant(X86::COND_B, DL, MVT::i8), - EFLAGS), - DAG.getConstant(1, DL, MVT::i8)); + EFLAGS)); } // Optimize RES = X86ISD::SETCC CONDCODE, EFLAG_INPUT @@ -30998,7 +31003,7 @@ static SDValue combineX86SetCC(SDNode *N, SelectionDAG &DAG, EFLAGS.getNode()->getVTList(), EFLAGS.getOperand(1), EFLAGS.getOperand(0)); SDValue NewEFLAGS = SDValue(NewSub.getNode(), EFLAGS.getResNo()); - return MaterializeSETB(DL, NewEFLAGS, DAG); + return MaterializeSETB(DL, NewEFLAGS, DAG, N->getSimpleValueType(0)); } } @@ -31006,7 +31011,7 @@ static SDValue combineX86SetCC(SDNode *N, SelectionDAG &DAG, // a zext and produces an all-ones bit which is more useful than 0/1 in some // cases. if (CC == X86::COND_B) - return MaterializeSETB(DL, EFLAGS, DAG); + return MaterializeSETB(DL, EFLAGS, DAG, N->getSimpleValueType(0)); // Try to simplify the EFLAGS and condition code operands. 
if (SDValue Flags = combineSetCCEFLAGS(EFLAGS, CC, DAG)) { diff --git a/llvm/lib/Target/X86/X86InstrAVX512.td b/llvm/lib/Target/X86/X86InstrAVX512.td index 3b5d259..324f27c 100644 --- a/llvm/lib/Target/X86/X86InstrAVX512.td +++ b/llvm/lib/Target/X86/X86InstrAVX512.td @@ -2170,91 +2170,51 @@ let Predicates = [HasBWI] in { (KMOVQkm addr:$src)>; } -def assertzext_i1 : PatFrag<(ops node:$src), (assertzext node:$src), [{ - return cast(N->getOperand(1))->getVT() == MVT::i1; -}]>; - -def trunc_setcc : PatFrag<(ops node:$src), (trunc node:$src), [{ - return (N->getOperand(0)->getOpcode() == X86ISD::SETCC); -}]>; - -def trunc_mask_1 : PatFrag<(ops node:$src), (trunc node:$src), [{ - return (N->getOperand(0)->getOpcode() == ISD::AND && - isa(N->getOperand(0)->getOperand(1)) && - N->getOperand(0)->getConstantOperandVal(1) == 1); -}]>; - - let Predicates = [HasAVX512] in { def : Pat<(i1 (trunc (i64 GR64:$src))), - (COPY_TO_REGCLASS (i16 (EXTRACT_SUBREG (AND64ri8 $src, (i64 1)), - sub_16bit)), VK1)>; - - def : Pat<(i1 (trunc (i64 (assertzext_i1 GR64:$src)))), - (COPY_TO_REGCLASS (i16 (EXTRACT_SUBREG $src, sub_16bit)), VK1)>; - - def : Pat<(i1 (trunc_mask_1 GR64:$src)), - (COPY_TO_REGCLASS (i16 (EXTRACT_SUBREG $src, sub_16bit)), VK1)>; + (COPY_TO_REGCLASS (KMOVWkr (AND32ri8 (EXTRACT_SUBREG $src, sub_32bit), + (i32 1))), VK1)>; def : Pat<(i1 (trunc (i32 GR32:$src))), - (COPY_TO_REGCLASS (i16 (EXTRACT_SUBREG (AND32ri8 $src, (i32 1)), - sub_16bit)), VK1)>; - - def : Pat<(i1 (trunc (i32 (assertzext_i1 GR32:$src)))), - (COPY_TO_REGCLASS (i16 (EXTRACT_SUBREG $src, sub_16bit)), VK1)>; - - def : Pat<(i1 (trunc_mask_1 GR32:$src)), - (COPY_TO_REGCLASS (i16 (EXTRACT_SUBREG $src, sub_16bit)), VK1)>; + (COPY_TO_REGCLASS (KMOVWkr (AND32ri8 $src, (i32 1))), VK1)>; def : Pat<(i1 (trunc (i8 GR8:$src))), - (COPY_TO_REGCLASS (i16 (SUBREG_TO_REG (i64 0), (AND8ri $src, (i8 1)), - sub_8bit)), VK1)>; - - def : Pat<(i1 (trunc (i8 (assertzext_i1 GR8:$src)))), - (COPY_TO_REGCLASS (i16 (SUBREG_TO_REG (i64 0), $src, sub_8bit)), VK1)>; - - def : Pat<(i1 (trunc_setcc GR8:$src)), - (COPY_TO_REGCLASS (i16 (SUBREG_TO_REG (i64 0), $src, sub_8bit)), VK1)>; - - def : Pat<(i1 (trunc_mask_1 GR8:$src)), - (COPY_TO_REGCLASS (i16 (SUBREG_TO_REG (i64 0), $src, sub_8bit)), VK1)>; - + (COPY_TO_REGCLASS + (KMOVWkr (AND32ri8 (SUBREG_TO_REG (i32 0), GR8:$src, sub_8bit), (i32 1))), + VK1)>; def : Pat<(i1 (trunc (i16 GR16:$src))), - (COPY_TO_REGCLASS (AND16ri GR16:$src, (i16 1)), VK1)>; - - def : Pat<(i1 (trunc (i16 (assertzext_i1 GR16:$src)))), - (COPY_TO_REGCLASS $src, VK1)>; - - def : Pat<(i1 (trunc_mask_1 GR16:$src)), - (COPY_TO_REGCLASS $src, VK1)>; + (COPY_TO_REGCLASS + (KMOVWkr (AND32ri8 (SUBREG_TO_REG (i32 0), $src, sub_16bit), (i32 1))), + VK1)>; def : Pat<(i32 (zext VK1:$src)), - (i32 (SUBREG_TO_REG (i64 0), (i16 (COPY_TO_REGCLASS $src, GR16)), - sub_16bit))>; - + (AND32ri8 (KMOVWrk (COPY_TO_REGCLASS VK1:$src, VK16)), (i32 1))>; def : Pat<(i32 (anyext VK1:$src)), - (i32 (SUBREG_TO_REG (i64 0), (i16 (COPY_TO_REGCLASS $src, GR16)), - sub_16bit))>; + (KMOVWrk (COPY_TO_REGCLASS VK1:$src, VK16))>; def : Pat<(i8 (zext VK1:$src)), - (i8 (EXTRACT_SUBREG (i16 (COPY_TO_REGCLASS VK1:$src, GR16)), sub_8bit))>; - + (EXTRACT_SUBREG + (AND32ri8 (KMOVWrk + (COPY_TO_REGCLASS VK1:$src, VK16)), (i32 1)), sub_8bit)>; def : Pat<(i8 (anyext VK1:$src)), - (i8 (EXTRACT_SUBREG (i16 (COPY_TO_REGCLASS $src, GR16)), sub_8bit))>; + (EXTRACT_SUBREG + (KMOVWrk (COPY_TO_REGCLASS VK1:$src, VK16)), sub_8bit)>; def : Pat<(i64 (zext VK1:$src)), - (i64 (SUBREG_TO_REG (i64 0), 
(i16 (COPY_TO_REGCLASS $src, GR16)), - sub_16bit))>; - + (AND64ri8 (SUBREG_TO_REG (i64 0), + (KMOVWrk (COPY_TO_REGCLASS VK1:$src, VK16)), sub_32bit), (i64 1))>; def : Pat<(i64 (anyext VK1:$src)), - (i64 (SUBREG_TO_REG (i64 0), (i16 (COPY_TO_REGCLASS $src, GR16)), - sub_16bit))>; + (SUBREG_TO_REG (i64 0), + (KMOVWrk (COPY_TO_REGCLASS VK1:$src, VK16)), sub_32bit)>; def : Pat<(i16 (zext VK1:$src)), - (COPY_TO_REGCLASS $src, GR16)>; - + (EXTRACT_SUBREG + (AND32ri8 (KMOVWrk (COPY_TO_REGCLASS VK1:$src, VK16)), (i32 1)), + sub_16bit)>; def : Pat<(i16 (anyext VK1:$src)), - (i16 (COPY_TO_REGCLASS $src, GR16))>; + (EXTRACT_SUBREG + (KMOVWrk (COPY_TO_REGCLASS VK1:$src, VK16)), + sub_16bit)>; } def : Pat<(v16i1 (scalar_to_vector VK1:$src)), (COPY_TO_REGCLASS VK1:$src, VK16)>; diff --git a/llvm/test/CodeGen/X86/avx512-cmp.ll b/llvm/test/CodeGen/X86/avx512-cmp.ll index fceb9c1..78df51b 100644 --- a/llvm/test/CodeGen/X86/avx512-cmp.ll +++ b/llvm/test/CodeGen/X86/avx512-cmp.ll @@ -163,10 +163,12 @@ define i32 @test10(i64 %b, i64 %c, i1 %d) { ; ALL-NEXT: kmovw %edx, %k0 ; ALL-NEXT: cmpq %rsi, %rdi ; ALL-NEXT: sete %al +; ALL-NEXT: andl $1, %eax ; ALL-NEXT: kmovw %eax, %k1 ; ALL-NEXT: korw %k1, %k0, %k1 ; ALL-NEXT: kxorw %k1, %k0, %k0 ; ALL-NEXT: kmovw %k0, %eax +; ALL-NEXT: andl $1, %eax ; ALL-NEXT: testb %al, %al ; ALL-NEXT: je LBB8_1 ; ALL-NEXT: ## BB#2: ## %if.end.i diff --git a/llvm/test/CodeGen/X86/avx512-ext.ll b/llvm/test/CodeGen/X86/avx512-ext.ll index 9ba5391..e45dda1 100644 --- a/llvm/test/CodeGen/X86/avx512-ext.ll +++ b/llvm/test/CodeGen/X86/avx512-ext.ll @@ -1525,260 +1525,260 @@ define <64 x i16> @test21(<64 x i16> %x , <64 x i1> %mask) nounwind readnone { ; KNL-NEXT: vptestmd %zmm4, %zmm4, %k0 ; KNL-NEXT: kshiftlw $14, %k0, %k1 ; KNL-NEXT: kshiftrw $15, %k1, %k1 -; KNL-NEXT: kmovw %k1, %ecx +; KNL-NEXT: kmovw %k1, %eax ; KNL-NEXT: kshiftlw $15, %k0, %k1 ; KNL-NEXT: kshiftrw $15, %k1, %k1 -; KNL-NEXT: kmovw %k1, %r15d +; KNL-NEXT: kmovw %k1, %r13d ; KNL-NEXT: kshiftlw $13, %k0, %k1 ; KNL-NEXT: kshiftrw $15, %k1, %k1 -; KNL-NEXT: kmovw %k1, %r12d +; KNL-NEXT: kmovw %k1, %ecx ; KNL-NEXT: kshiftlw $12, %k0, %k1 ; KNL-NEXT: kshiftrw $15, %k1, %k1 -; KNL-NEXT: kmovw %k1, %edx +; KNL-NEXT: kmovw %k1, %r11d ; KNL-NEXT: kshiftlw $11, %k0, %k1 ; KNL-NEXT: kshiftrw $15, %k1, %k1 -; KNL-NEXT: kmovw %k1, %r13d +; KNL-NEXT: kmovw %k1, %r8d ; KNL-NEXT: kshiftlw $10, %k0, %k1 ; KNL-NEXT: kshiftrw $15, %k1, %k1 -; KNL-NEXT: kmovw %k1, %eax -; KNL-NEXT: movl %eax, -{{[0-9]+}}(%rsp) ## 4-byte Spill +; KNL-NEXT: kmovw %k1, %edi ; KNL-NEXT: kshiftlw $9, %k0, %k1 ; KNL-NEXT: kshiftrw $15, %k1, %k1 -; KNL-NEXT: kmovw %k1, %esi +; KNL-NEXT: kmovw %k1, %edx ; KNL-NEXT: kshiftlw $8, %k0, %k1 ; KNL-NEXT: kshiftrw $15, %k1, %k1 -; KNL-NEXT: kmovw %k1, %edi +; KNL-NEXT: kmovw %k1, %esi ; KNL-NEXT: kshiftlw $7, %k0, %k1 ; KNL-NEXT: kshiftrw $15, %k1, %k1 -; KNL-NEXT: kmovw %k1, %r8d +; KNL-NEXT: kmovw %k1, %ebx ; KNL-NEXT: kshiftlw $6, %k0, %k1 ; KNL-NEXT: kshiftrw $15, %k1, %k1 -; KNL-NEXT: kmovw %k1, %r9d +; KNL-NEXT: kmovw %k1, %ebp ; KNL-NEXT: kshiftlw $5, %k0, %k1 ; KNL-NEXT: kshiftrw $15, %k1, %k1 -; KNL-NEXT: kmovw %k1, %r10d +; KNL-NEXT: kmovw %k1, %r14d ; KNL-NEXT: kshiftlw $4, %k0, %k1 ; KNL-NEXT: kshiftrw $15, %k1, %k1 -; KNL-NEXT: kmovw %k1, %r11d +; KNL-NEXT: kmovw %k1, %r15d ; KNL-NEXT: kshiftlw $3, %k0, %k1 ; KNL-NEXT: kshiftrw $15, %k1, %k1 -; KNL-NEXT: kmovw %k1, %ebx +; KNL-NEXT: kmovw %k1, %r12d ; KNL-NEXT: kshiftlw $2, %k0, %k1 ; KNL-NEXT: kshiftrw $15, %k1, %k1 -; KNL-NEXT: kmovw %k1, %ebp +; 
KNL-NEXT: kmovw %k1, %r10d ; KNL-NEXT: kshiftlw $1, %k0, %k1 ; KNL-NEXT: kshiftrw $15, %k1, %k1 -; KNL-NEXT: kmovw %k1, %r14d +; KNL-NEXT: kmovw %k1, %r9d ; KNL-NEXT: vptestmd %zmm5, %zmm5, %k1 ; KNL-NEXT: kshiftrw $15, %k0, %k0 -; KNL-NEXT: vmovd %r15d, %xmm4 -; KNL-NEXT: kmovw %k0, %r15d +; KNL-NEXT: vmovd %r13d, %xmm4 +; KNL-NEXT: kmovw %k0, %r13d ; KNL-NEXT: kshiftlw $14, %k1, %k0 ; KNL-NEXT: kshiftrw $15, %k0, %k0 -; KNL-NEXT: vpinsrb $1, %ecx, %xmm4, %xmm4 -; KNL-NEXT: kmovw %k0, %ecx +; KNL-NEXT: vpinsrb $1, %eax, %xmm4, %xmm4 +; KNL-NEXT: kmovw %k0, %eax ; KNL-NEXT: kshiftlw $15, %k1, %k0 ; KNL-NEXT: kshiftrw $15, %k0, %k0 -; KNL-NEXT: vpinsrb $2, %r12d, %xmm4, %xmm4 -; KNL-NEXT: kmovw %k0, %eax +; KNL-NEXT: vpinsrb $2, %ecx, %xmm4, %xmm4 +; KNL-NEXT: kmovw %k0, %ecx ; KNL-NEXT: kshiftlw $13, %k1, %k0 ; KNL-NEXT: kshiftrw $15, %k0, %k0 -; KNL-NEXT: vpinsrb $3, %edx, %xmm4, %xmm4 -; KNL-NEXT: kmovw %k0, %r12d +; KNL-NEXT: vpinsrb $3, %r11d, %xmm4, %xmm4 +; KNL-NEXT: kmovw %k0, %r11d ; KNL-NEXT: kshiftlw $12, %k1, %k0 ; KNL-NEXT: kshiftrw $15, %k0, %k0 -; KNL-NEXT: vpinsrb $4, %r13d, %xmm4, %xmm4 -; KNL-NEXT: kmovw %k0, %edx +; KNL-NEXT: vpinsrb $4, %r8d, %xmm4, %xmm4 +; KNL-NEXT: kmovw %k0, %r8d ; KNL-NEXT: kshiftlw $11, %k1, %k0 ; KNL-NEXT: kshiftrw $15, %k0, %k0 -; KNL-NEXT: vpinsrb $5, -{{[0-9]+}}(%rsp), %xmm4, %xmm4 ## 4-byte Folded Reload -; KNL-NEXT: kmovw %k0, %r13d +; KNL-NEXT: vpinsrb $5, %edi, %xmm4, %xmm4 +; KNL-NEXT: kmovw %k0, %edi +; KNL-NEXT: movl %edi, -{{[0-9]+}}(%rsp) ## 4-byte Spill ; KNL-NEXT: kshiftlw $10, %k1, %k0 ; KNL-NEXT: kshiftrw $15, %k0, %k0 -; KNL-NEXT: vpinsrb $6, %esi, %xmm4, %xmm4 -; KNL-NEXT: kmovw %k0, %esi -; KNL-NEXT: movl %esi, -{{[0-9]+}}(%rsp) ## 4-byte Spill +; KNL-NEXT: vpinsrb $6, %edx, %xmm4, %xmm4 +; KNL-NEXT: kmovw %k0, %edx ; KNL-NEXT: kshiftlw $9, %k1, %k0 ; KNL-NEXT: kshiftrw $15, %k0, %k0 -; KNL-NEXT: vpinsrb $7, %edi, %xmm4, %xmm4 +; KNL-NEXT: vpinsrb $7, %esi, %xmm4, %xmm4 ; KNL-NEXT: kmovw %k0, %esi ; KNL-NEXT: kshiftlw $8, %k1, %k0 ; KNL-NEXT: kshiftrw $15, %k0, %k0 -; KNL-NEXT: vpinsrb $8, %r8d, %xmm4, %xmm4 -; KNL-NEXT: kmovw %k0, %edi +; KNL-NEXT: vpinsrb $8, %ebx, %xmm4, %xmm4 +; KNL-NEXT: kmovw %k0, %ebx ; KNL-NEXT: kshiftlw $7, %k1, %k0 ; KNL-NEXT: kshiftrw $15, %k0, %k0 -; KNL-NEXT: vpinsrb $9, %r9d, %xmm4, %xmm4 -; KNL-NEXT: kmovw %k0, %r8d +; KNL-NEXT: vpinsrb $9, %ebp, %xmm4, %xmm4 +; KNL-NEXT: kmovw %k0, %ebp ; KNL-NEXT: kshiftlw $6, %k1, %k0 ; KNL-NEXT: kshiftrw $15, %k0, %k0 -; KNL-NEXT: vpinsrb $10, %r10d, %xmm4, %xmm4 -; KNL-NEXT: kmovw %k0, %r9d +; KNL-NEXT: vpinsrb $10, %r14d, %xmm4, %xmm4 +; KNL-NEXT: kmovw %k0, %r14d ; KNL-NEXT: kshiftlw $5, %k1, %k0 ; KNL-NEXT: kshiftrw $15, %k0, %k0 -; KNL-NEXT: vpinsrb $11, %r11d, %xmm4, %xmm4 -; KNL-NEXT: kmovw %k0, %r10d +; KNL-NEXT: vpinsrb $11, %r15d, %xmm4, %xmm4 +; KNL-NEXT: kmovw %k0, %r15d ; KNL-NEXT: kshiftlw $4, %k1, %k0 ; KNL-NEXT: kshiftrw $15, %k0, %k0 -; KNL-NEXT: vpinsrb $12, %ebx, %xmm4, %xmm4 -; KNL-NEXT: kmovw %k0, %ebx +; KNL-NEXT: vpinsrb $12, %r12d, %xmm4, %xmm4 +; KNL-NEXT: kmovw %k0, %edi ; KNL-NEXT: kshiftlw $3, %k1, %k0 ; KNL-NEXT: kshiftrw $15, %k0, %k0 -; KNL-NEXT: vpinsrb $13, %ebp, %xmm4, %xmm4 -; KNL-NEXT: kmovw %k0, %ebp +; KNL-NEXT: vpinsrb $13, %r10d, %xmm4, %xmm4 +; KNL-NEXT: kmovw %k0, %r10d ; KNL-NEXT: kshiftlw $2, %k1, %k0 ; KNL-NEXT: kshiftrw $15, %k0, %k0 -; KNL-NEXT: vpinsrb $14, %r14d, %xmm4, %xmm4 -; KNL-NEXT: kmovw %k0, %r11d +; KNL-NEXT: vpinsrb $14, %r9d, %xmm4, %xmm4 +; KNL-NEXT: kmovw %k0, %r9d ; KNL-NEXT: kshiftlw $1, %k1, %k0 
; KNL-NEXT: kshiftrw $15, %k0, %k0 -; KNL-NEXT: vpinsrb $15, %r15d, %xmm4, %xmm4 -; KNL-NEXT: kmovw %k0, %r14d +; KNL-NEXT: vpinsrb $15, %r13d, %xmm4, %xmm4 +; KNL-NEXT: kmovw %k0, %r12d ; KNL-NEXT: vptestmd %zmm6, %zmm6, %k0 ; KNL-NEXT: kshiftrw $15, %k1, %k1 -; KNL-NEXT: vmovd %eax, %xmm5 -; KNL-NEXT: kmovw %k1, %r15d +; KNL-NEXT: vmovd %ecx, %xmm5 +; KNL-NEXT: kmovw %k1, %r13d ; KNL-NEXT: kshiftlw $14, %k0, %k1 ; KNL-NEXT: kshiftrw $15, %k1, %k1 -; KNL-NEXT: vpinsrb $1, %ecx, %xmm5, %xmm5 -; KNL-NEXT: kmovw %k1, %ecx +; KNL-NEXT: vpinsrb $1, %eax, %xmm5, %xmm5 +; KNL-NEXT: kmovw %k1, %eax +; KNL-NEXT: movl %eax, -{{[0-9]+}}(%rsp) ## 4-byte Spill ; KNL-NEXT: kshiftlw $15, %k0, %k1 ; KNL-NEXT: kshiftrw $15, %k1, %k1 -; KNL-NEXT: vpinsrb $2, %r12d, %xmm5, %xmm5 +; KNL-NEXT: vpinsrb $2, %r11d, %xmm5, %xmm5 ; KNL-NEXT: kmovw %k1, %eax ; KNL-NEXT: kshiftlw $13, %k0, %k1 ; KNL-NEXT: kshiftrw $15, %k1, %k1 -; KNL-NEXT: vpinsrb $3, %edx, %xmm5, %xmm5 -; KNL-NEXT: kmovw %k1, %r12d +; KNL-NEXT: vpinsrb $3, %r8d, %xmm5, %xmm5 +; KNL-NEXT: kmovw %k1, %ecx +; KNL-NEXT: movl %ecx, -{{[0-9]+}}(%rsp) ## 4-byte Spill ; KNL-NEXT: kshiftlw $12, %k0, %k1 ; KNL-NEXT: kshiftrw $15, %k1, %k1 -; KNL-NEXT: vpinsrb $4, %r13d, %xmm5, %xmm5 -; KNL-NEXT: kmovw %k1, %edx +; KNL-NEXT: vpinsrb $4, -{{[0-9]+}}(%rsp), %xmm5, %xmm5 ## 4-byte Folded Reload +; KNL-NEXT: kmovw %k1, %ecx ; KNL-NEXT: kshiftlw $11, %k0, %k1 ; KNL-NEXT: kshiftrw $15, %k1, %k1 -; KNL-NEXT: vpinsrb $5, -{{[0-9]+}}(%rsp), %xmm5, %xmm5 ## 4-byte Folded Reload -; KNL-NEXT: kmovw %k1, %r13d +; KNL-NEXT: vpinsrb $5, %edx, %xmm5, %xmm5 +; KNL-NEXT: kmovw %k1, %r8d ; KNL-NEXT: kshiftlw $10, %k0, %k1 ; KNL-NEXT: kshiftrw $15, %k1, %k1 ; KNL-NEXT: vpinsrb $6, %esi, %xmm5, %xmm5 -; KNL-NEXT: kmovw %k1, %esi -; KNL-NEXT: movl %esi, -{{[0-9]+}}(%rsp) ## 4-byte Spill +; KNL-NEXT: kmovw %k1, %edx ; KNL-NEXT: kshiftlw $9, %k0, %k1 ; KNL-NEXT: kshiftrw $15, %k1, %k1 -; KNL-NEXT: vpinsrb $7, %edi, %xmm5, %xmm5 +; KNL-NEXT: vpinsrb $7, %ebx, %xmm5, %xmm5 ; KNL-NEXT: kmovw %k1, %esi ; KNL-NEXT: kshiftlw $8, %k0, %k1 ; KNL-NEXT: kshiftrw $15, %k1, %k1 -; KNL-NEXT: vpinsrb $8, %r8d, %xmm5, %xmm5 -; KNL-NEXT: kmovw %k1, %edi +; KNL-NEXT: vpinsrb $8, %ebp, %xmm5, %xmm5 +; KNL-NEXT: kmovw %k1, %ebp ; KNL-NEXT: kshiftlw $7, %k0, %k1 ; KNL-NEXT: kshiftrw $15, %k1, %k1 -; KNL-NEXT: vpinsrb $9, %r9d, %xmm5, %xmm5 -; KNL-NEXT: kmovw %k1, %r8d +; KNL-NEXT: vpinsrb $9, %r14d, %xmm5, %xmm5 +; KNL-NEXT: kmovw %k1, %ebx ; KNL-NEXT: kshiftlw $6, %k0, %k1 ; KNL-NEXT: kshiftrw $15, %k1, %k1 -; KNL-NEXT: vpinsrb $10, %r10d, %xmm5, %xmm5 -; KNL-NEXT: kmovw %k1, %r9d +; KNL-NEXT: vpinsrb $10, %r15d, %xmm5, %xmm5 +; KNL-NEXT: kmovw %k1, %r11d ; KNL-NEXT: kshiftlw $5, %k0, %k1 ; KNL-NEXT: kshiftrw $15, %k1, %k1 -; KNL-NEXT: vpinsrb $11, %ebx, %xmm5, %xmm5 -; KNL-NEXT: kmovw %k1, %ebx +; KNL-NEXT: vpinsrb $11, %edi, %xmm5, %xmm5 +; KNL-NEXT: kmovw %k1, %edi ; KNL-NEXT: kshiftlw $4, %k0, %k1 ; KNL-NEXT: kshiftrw $15, %k1, %k1 -; KNL-NEXT: vpinsrb $12, %ebp, %xmm5, %xmm5 -; KNL-NEXT: kmovw %k1, %ebp +; KNL-NEXT: vpinsrb $12, %r10d, %xmm5, %xmm5 +; KNL-NEXT: kmovw %k1, %r10d ; KNL-NEXT: kshiftlw $3, %k0, %k1 ; KNL-NEXT: kshiftrw $15, %k1, %k1 -; KNL-NEXT: vpinsrb $13, %r11d, %xmm5, %xmm5 -; KNL-NEXT: kmovw %k1, %r10d +; KNL-NEXT: vpinsrb $13, %r9d, %xmm5, %xmm5 +; KNL-NEXT: kmovw %k1, %r9d ; KNL-NEXT: kshiftlw $2, %k0, %k1 ; KNL-NEXT: kshiftrw $15, %k1, %k1 -; KNL-NEXT: vpinsrb $14, %r14d, %xmm5, %xmm5 -; KNL-NEXT: kmovw %k1, %r11d +; KNL-NEXT: vpinsrb $14, %r12d, %xmm5, %xmm5 +; KNL-NEXT: 
kmovw %k1, %r14d ; KNL-NEXT: kshiftlw $1, %k0, %k1 ; KNL-NEXT: kshiftrw $15, %k1, %k1 -; KNL-NEXT: vpinsrb $15, %r15d, %xmm5, %xmm5 -; KNL-NEXT: kmovw %k1, %r14d +; KNL-NEXT: vpinsrb $15, %r13d, %xmm5, %xmm5 +; KNL-NEXT: kmovw %k1, %r15d ; KNL-NEXT: vptestmd %zmm7, %zmm7, %k1 ; KNL-NEXT: kshiftrw $15, %k0, %k0 ; KNL-NEXT: vmovd %eax, %xmm6 -; KNL-NEXT: kmovw %k0, %r15d +; KNL-NEXT: kmovw %k0, %r12d ; KNL-NEXT: kshiftlw $14, %k1, %k0 ; KNL-NEXT: kshiftrw $15, %k0, %k0 -; KNL-NEXT: vpinsrb $1, %ecx, %xmm6, %xmm6 -; KNL-NEXT: kmovw %k0, %ecx +; KNL-NEXT: vpinsrb $1, -{{[0-9]+}}(%rsp), %xmm6, %xmm6 ## 4-byte Folded Reload +; KNL-NEXT: kmovw %k0, %r13d ; KNL-NEXT: kshiftlw $15, %k1, %k0 ; KNL-NEXT: kshiftrw $15, %k0, %k0 -; KNL-NEXT: vpinsrb $2, %r12d, %xmm6, %xmm6 -; KNL-NEXT: kmovw %k0, %r12d +; KNL-NEXT: vpinsrb $2, -{{[0-9]+}}(%rsp), %xmm6, %xmm6 ## 4-byte Folded Reload +; KNL-NEXT: kmovw %k0, %eax ; KNL-NEXT: kshiftlw $13, %k1, %k0 ; KNL-NEXT: kshiftrw $15, %k0, %k0 -; KNL-NEXT: vpinsrb $3, %edx, %xmm6, %xmm6 -; KNL-NEXT: kmovw %k0, %edx +; KNL-NEXT: vpinsrb $3, %ecx, %xmm6, %xmm6 +; KNL-NEXT: kmovw %k0, %ecx ; KNL-NEXT: kshiftlw $12, %k1, %k0 ; KNL-NEXT: kshiftrw $15, %k0, %k0 -; KNL-NEXT: vpinsrb $4, %r13d, %xmm6, %xmm6 -; KNL-NEXT: kmovw %k0, %r13d +; KNL-NEXT: vpinsrb $4, %r8d, %xmm6, %xmm6 +; KNL-NEXT: kmovw %k0, %r8d ; KNL-NEXT: kshiftlw $11, %k1, %k0 ; KNL-NEXT: kshiftrw $15, %k0, %k0 -; KNL-NEXT: vpinsrb $5, -{{[0-9]+}}(%rsp), %xmm6, %xmm6 ## 4-byte Folded Reload -; KNL-NEXT: kmovw %k0, %eax +; KNL-NEXT: vpinsrb $5, %edx, %xmm6, %xmm6 +; KNL-NEXT: kmovw %k0, %edx ; KNL-NEXT: kshiftlw $10, %k1, %k0 ; KNL-NEXT: kshiftrw $15, %k0, %k0 ; KNL-NEXT: vpinsrb $6, %esi, %xmm6, %xmm6 ; KNL-NEXT: kmovw %k0, %esi ; KNL-NEXT: kshiftlw $9, %k1, %k0 ; KNL-NEXT: kshiftrw $15, %k0, %k0 -; KNL-NEXT: vpinsrb $7, %edi, %xmm6, %xmm6 -; KNL-NEXT: kmovw %k0, %edi +; KNL-NEXT: vpinsrb $7, %ebp, %xmm6, %xmm6 +; KNL-NEXT: kmovw %k0, %ebp ; KNL-NEXT: kshiftlw $8, %k1, %k0 ; KNL-NEXT: kshiftrw $15, %k0, %k0 -; KNL-NEXT: vpinsrb $8, %r8d, %xmm6, %xmm6 -; KNL-NEXT: kmovw %k0, %r8d +; KNL-NEXT: vpinsrb $8, %ebx, %xmm6, %xmm6 +; KNL-NEXT: kmovw %k0, %ebx ; KNL-NEXT: kshiftlw $7, %k1, %k0 ; KNL-NEXT: kshiftrw $15, %k0, %k0 -; KNL-NEXT: vpinsrb $9, %r9d, %xmm6, %xmm6 -; KNL-NEXT: kmovw %k0, %r9d +; KNL-NEXT: vpinsrb $9, %r11d, %xmm6, %xmm6 +; KNL-NEXT: kmovw %k0, %r11d ; KNL-NEXT: kshiftlw $6, %k1, %k0 ; KNL-NEXT: kshiftrw $15, %k0, %k0 -; KNL-NEXT: vpinsrb $10, %ebx, %xmm6, %xmm6 -; KNL-NEXT: kmovw %k0, %ebx +; KNL-NEXT: vpinsrb $10, %edi, %xmm6, %xmm6 +; KNL-NEXT: kmovw %k0, %edi ; KNL-NEXT: kshiftlw $5, %k1, %k0 ; KNL-NEXT: kshiftrw $15, %k0, %k0 -; KNL-NEXT: vpinsrb $11, %ebp, %xmm6, %xmm6 -; KNL-NEXT: kmovw %k0, %ebp +; KNL-NEXT: vpinsrb $11, %r10d, %xmm6, %xmm6 +; KNL-NEXT: kmovw %k0, %r10d ; KNL-NEXT: kshiftlw $4, %k1, %k0 ; KNL-NEXT: kshiftrw $15, %k0, %k0 -; KNL-NEXT: vpinsrb $12, %r10d, %xmm6, %xmm6 -; KNL-NEXT: kmovw %k0, %r10d +; KNL-NEXT: vpinsrb $12, %r9d, %xmm6, %xmm6 +; KNL-NEXT: kmovw %k0, %r9d ; KNL-NEXT: kshiftlw $3, %k1, %k0 ; KNL-NEXT: kshiftrw $15, %k0, %k0 -; KNL-NEXT: vpinsrb $13, %r11d, %xmm6, %xmm6 -; KNL-NEXT: kmovw %k0, %r11d +; KNL-NEXT: vpinsrb $13, %r14d, %xmm6, %xmm6 +; KNL-NEXT: kmovw %k0, %r14d ; KNL-NEXT: kshiftlw $2, %k1, %k0 ; KNL-NEXT: kshiftrw $15, %k0, %k0 -; KNL-NEXT: vpinsrb $14, %r14d, %xmm6, %xmm6 -; KNL-NEXT: kmovw %k0, %r14d +; KNL-NEXT: vpinsrb $14, %r15d, %xmm6, %xmm6 +; KNL-NEXT: kmovw %k0, %r15d ; KNL-NEXT: kshiftlw $1, %k1, %k0 ; KNL-NEXT: kshiftrw $15, %k0, %k0 -; 
KNL-NEXT: vpinsrb $15, %r15d, %xmm6, %xmm6 -; KNL-NEXT: kmovw %k0, %r15d -; KNL-NEXT: kshiftrw $15, %k1, %k0 -; KNL-NEXT: vmovd %r12d, %xmm7 +; KNL-NEXT: vpinsrb $15, %r12d, %xmm6, %xmm6 ; KNL-NEXT: kmovw %k0, %r12d -; KNL-NEXT: vpinsrb $1, %ecx, %xmm7, %xmm7 -; KNL-NEXT: vpinsrb $2, %edx, %xmm7, %xmm7 -; KNL-NEXT: vpinsrb $3, %r13d, %xmm7, %xmm7 -; KNL-NEXT: vpinsrb $4, %eax, %xmm7, %xmm7 +; KNL-NEXT: kshiftrw $15, %k1, %k0 +; KNL-NEXT: vmovd %eax, %xmm7 +; KNL-NEXT: kmovw %k0, %eax +; KNL-NEXT: vpinsrb $1, %r13d, %xmm7, %xmm7 +; KNL-NEXT: vpinsrb $2, %ecx, %xmm7, %xmm7 +; KNL-NEXT: vpinsrb $3, %r8d, %xmm7, %xmm7 +; KNL-NEXT: vpinsrb $4, %edx, %xmm7, %xmm7 ; KNL-NEXT: vpinsrb $5, %esi, %xmm7, %xmm7 -; KNL-NEXT: vpinsrb $6, %edi, %xmm7, %xmm7 -; KNL-NEXT: vpinsrb $7, %r8d, %xmm7, %xmm7 -; KNL-NEXT: vpinsrb $8, %r9d, %xmm7, %xmm7 -; KNL-NEXT: vpinsrb $9, %ebx, %xmm7, %xmm7 -; KNL-NEXT: vpinsrb $10, %ebp, %xmm7, %xmm7 -; KNL-NEXT: vpinsrb $11, %r10d, %xmm7, %xmm7 -; KNL-NEXT: vpinsrb $12, %r11d, %xmm7, %xmm7 -; KNL-NEXT: vpinsrb $13, %r14d, %xmm7, %xmm7 +; KNL-NEXT: vpinsrb $6, %ebp, %xmm7, %xmm7 +; KNL-NEXT: vpinsrb $7, %ebx, %xmm7, %xmm7 +; KNL-NEXT: vpinsrb $8, %r11d, %xmm7, %xmm7 +; KNL-NEXT: vpinsrb $9, %edi, %xmm7, %xmm7 +; KNL-NEXT: vpinsrb $10, %r10d, %xmm7, %xmm7 +; KNL-NEXT: vpinsrb $11, %r9d, %xmm7, %xmm7 +; KNL-NEXT: vpinsrb $12, %r14d, %xmm7, %xmm7 +; KNL-NEXT: vpinsrb $13, %r15d, %xmm7, %xmm7 ; KNL-NEXT: vpmovzxbw {{.*#+}} ymm4 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero,xmm4[8],zero,xmm4[9],zero,xmm4[10],zero,xmm4[11],zero,xmm4[12],zero,xmm4[13],zero,xmm4[14],zero,xmm4[15],zero ; KNL-NEXT: vpsllw $15, %ymm4, %ymm4 ; KNL-NEXT: vpsraw $15, %ymm4, %ymm4 @@ -1791,8 +1791,8 @@ define <64 x i16> @test21(<64 x i16> %x , <64 x i1> %mask) nounwind readnone { ; KNL-NEXT: vpsllw $15, %ymm4, %ymm4 ; KNL-NEXT: vpsraw $15, %ymm4, %ymm4 ; KNL-NEXT: vpand %ymm2, %ymm4, %ymm2 -; KNL-NEXT: vpinsrb $14, %r15d, %xmm7, %xmm4 -; KNL-NEXT: vpinsrb $15, %r12d, %xmm4, %xmm4 +; KNL-NEXT: vpinsrb $14, %r12d, %xmm7, %xmm4 +; KNL-NEXT: vpinsrb $15, %eax, %xmm4, %xmm4 ; KNL-NEXT: vpmovzxbw {{.*#+}} ymm4 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero,xmm4[8],zero,xmm4[9],zero,xmm4[10],zero,xmm4[11],zero,xmm4[12],zero,xmm4[13],zero,xmm4[14],zero,xmm4[15],zero ; KNL-NEXT: vpsllw $15, %ymm4, %ymm4 ; KNL-NEXT: vpsraw $15, %ymm4, %ymm4 diff --git a/llvm/test/CodeGen/X86/avx512-insert-extract.ll b/llvm/test/CodeGen/X86/avx512-insert-extract.ll index d6174bc..437c7f5 100644 --- a/llvm/test/CodeGen/X86/avx512-insert-extract.ll +++ b/llvm/test/CodeGen/X86/avx512-insert-extract.ll @@ -201,6 +201,7 @@ define <16 x i32> @test11(<16 x i32>%a, <16 x i32>%b) { ; KNL-NEXT: kshiftlw $11, %k0, %k0 ; KNL-NEXT: kshiftrw $15, %k0, %k0 ; KNL-NEXT: kmovw %k0, %eax +; KNL-NEXT: andl $1, %eax ; KNL-NEXT: testb %al, %al ; KNL-NEXT: je LBB10_2 ; KNL-NEXT: ## BB#1: ## %A @@ -216,6 +217,7 @@ define <16 x i32> @test11(<16 x i32>%a, <16 x i32>%b) { ; SKX-NEXT: kshiftlw $11, %k0, %k0 ; SKX-NEXT: kshiftrw $15, %k0, %k0 ; SKX-NEXT: kmovw %k0, %eax +; SKX-NEXT: andl $1, %eax ; SKX-NEXT: testb %al, %al ; SKX-NEXT: je LBB10_2 ; SKX-NEXT: ## BB#1: ## %A @@ -243,6 +245,7 @@ define i64 @test12(<16 x i64>%a, <16 x i64>%b, i64 %a1, i64 %b1) { ; KNL-NEXT: kshiftlw $15, %k0, %k0 ; KNL-NEXT: kshiftrw $15, %k0, %k0 ; KNL-NEXT: kmovw %k0, %eax +; KNL-NEXT: andl $1, %eax ; KNL-NEXT: testb %al, %al ; KNL-NEXT: cmoveq %rsi, %rdi ; 
KNL-NEXT: movq %rdi, %rax @@ -256,6 +259,7 @@ define i64 @test12(<16 x i64>%a, <16 x i64>%b, i64 %a1, i64 %b1) { ; SKX-NEXT: kshiftlw $15, %k0, %k0 ; SKX-NEXT: kshiftrw $15, %k0, %k0 ; SKX-NEXT: kmovw %k0, %eax +; SKX-NEXT: andl $1, %eax ; SKX-NEXT: testb %al, %al ; SKX-NEXT: cmoveq %rsi, %rdi ; SKX-NEXT: movq %rdi, %rax @@ -271,6 +275,7 @@ define i16 @test13(i32 %a, i32 %b) { ; KNL: ## BB#0: ; KNL-NEXT: cmpl %esi, %edi ; KNL-NEXT: setb %al +; KNL-NEXT: andl $1, %eax ; KNL-NEXT: kmovw %eax, %k0 ; KNL-NEXT: movw $-4, %ax ; KNL-NEXT: kmovw %eax, %k1 @@ -284,6 +289,7 @@ define i16 @test13(i32 %a, i32 %b) { ; SKX: ## BB#0: ; SKX-NEXT: cmpl %esi, %edi ; SKX-NEXT: setb %al +; SKX-NEXT: andl $1, %eax ; SKX-NEXT: kmovw %eax, %k0 ; SKX-NEXT: movw $-4, %ax ; SKX-NEXT: kmovw %eax, %k1 @@ -305,6 +311,7 @@ define i64 @test14(<8 x i64>%a, <8 x i64>%b, i64 %a1, i64 %b1) { ; KNL-NEXT: kshiftlw $11, %k0, %k0 ; KNL-NEXT: kshiftrw $15, %k0, %k0 ; KNL-NEXT: kmovw %k0, %eax +; KNL-NEXT: andl $1, %eax ; KNL-NEXT: testb %al, %al ; KNL-NEXT: cmoveq %rsi, %rdi ; KNL-NEXT: movq %rdi, %rax @@ -316,6 +323,7 @@ define i64 @test14(<8 x i64>%a, <8 x i64>%b, i64 %a1, i64 %b1) { ; SKX-NEXT: kshiftlb $3, %k0, %k0 ; SKX-NEXT: kshiftrb $7, %k0, %k0 ; SKX-NEXT: kmovw %k0, %eax +; SKX-NEXT: andl $1, %eax ; SKX-NEXT: testb %al, %al ; SKX-NEXT: cmoveq %rsi, %rdi ; SKX-NEXT: movq %rdi, %rax @@ -1033,10 +1041,175 @@ define <32 x i8> @test_insert_128_v32i8(<32 x i8> %x, i8 %y) { } define i32 @test_insertelement_v32i1(i32 %a, i32 %b, <32 x i32> %x , <32 x i32> %y) { +; KNL-LABEL: test_insertelement_v32i1: +; KNL: ## BB#0: +; KNL-NEXT: pushq %rbp +; KNL-NEXT: Ltmp0: +; KNL-NEXT: .cfi_def_cfa_offset 16 +; KNL-NEXT: Ltmp1: +; KNL-NEXT: .cfi_offset %rbp, -16 +; KNL-NEXT: movq %rsp, %rbp +; KNL-NEXT: Ltmp2: +; KNL-NEXT: .cfi_def_cfa_register %rbp +; KNL-NEXT: andq $-32, %rsp +; KNL-NEXT: subq $32, %rsp +; KNL-NEXT: cmpl %esi, %edi +; KNL-NEXT: vpcmpltud %zmm3, %zmm1, %k0 +; KNL-NEXT: kshiftlw $14, %k0, %k1 +; KNL-NEXT: kshiftrw $15, %k1, %k1 +; KNL-NEXT: kmovw %k1, %eax +; KNL-NEXT: kshiftlw $15, %k0, %k1 +; KNL-NEXT: kshiftrw $15, %k1, %k1 +; KNL-NEXT: kmovw %k1, %ecx +; KNL-NEXT: vmovd %ecx, %xmm1 +; KNL-NEXT: vpinsrb $1, %eax, %xmm1, %xmm1 +; KNL-NEXT: kshiftlw $13, %k0, %k1 +; KNL-NEXT: kshiftrw $15, %k1, %k1 +; KNL-NEXT: kmovw %k1, %eax +; KNL-NEXT: vpinsrb $2, %eax, %xmm1, %xmm1 +; KNL-NEXT: kshiftlw $12, %k0, %k1 +; KNL-NEXT: kshiftrw $15, %k1, %k1 +; KNL-NEXT: kmovw %k1, %eax +; KNL-NEXT: vpinsrb $3, %eax, %xmm1, %xmm1 +; KNL-NEXT: kshiftlw $11, %k0, %k1 +; KNL-NEXT: kshiftrw $15, %k1, %k1 +; KNL-NEXT: kmovw %k1, %eax +; KNL-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 +; KNL-NEXT: kshiftlw $10, %k0, %k1 +; KNL-NEXT: kshiftrw $15, %k1, %k1 +; KNL-NEXT: kmovw %k1, %eax +; KNL-NEXT: vpinsrb $5, %eax, %xmm1, %xmm1 +; KNL-NEXT: kshiftlw $9, %k0, %k1 +; KNL-NEXT: kshiftrw $15, %k1, %k1 +; KNL-NEXT: kmovw %k1, %eax +; KNL-NEXT: vpinsrb $6, %eax, %xmm1, %xmm1 +; KNL-NEXT: kshiftlw $8, %k0, %k1 +; KNL-NEXT: kshiftrw $15, %k1, %k1 +; KNL-NEXT: kmovw %k1, %eax +; KNL-NEXT: vpinsrb $7, %eax, %xmm1, %xmm1 +; KNL-NEXT: kshiftlw $7, %k0, %k1 +; KNL-NEXT: kshiftrw $15, %k1, %k1 +; KNL-NEXT: kmovw %k1, %eax +; KNL-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; KNL-NEXT: kshiftlw $6, %k0, %k1 +; KNL-NEXT: kshiftrw $15, %k1, %k1 +; KNL-NEXT: kmovw %k1, %eax +; KNL-NEXT: vpinsrb $9, %eax, %xmm1, %xmm1 +; KNL-NEXT: kshiftlw $5, %k0, %k1 +; KNL-NEXT: kshiftrw $15, %k1, %k1 +; KNL-NEXT: kmovw %k1, %eax +; KNL-NEXT: vpinsrb $10, %eax, %xmm1, %xmm1 +; KNL-NEXT: 
kshiftlw $4, %k0, %k1 +; KNL-NEXT: kshiftrw $15, %k1, %k1 +; KNL-NEXT: kmovw %k1, %eax +; KNL-NEXT: vpinsrb $11, %eax, %xmm1, %xmm1 +; KNL-NEXT: kshiftlw $3, %k0, %k1 +; KNL-NEXT: kshiftrw $15, %k1, %k1 +; KNL-NEXT: kmovw %k1, %eax +; KNL-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 +; KNL-NEXT: kshiftlw $2, %k0, %k1 +; KNL-NEXT: kshiftrw $15, %k1, %k1 +; KNL-NEXT: kmovw %k1, %eax +; KNL-NEXT: vpinsrb $13, %eax, %xmm1, %xmm1 +; KNL-NEXT: kshiftlw $1, %k0, %k1 +; KNL-NEXT: kshiftrw $15, %k1, %k1 +; KNL-NEXT: kmovw %k1, %eax +; KNL-NEXT: vpinsrb $14, %eax, %xmm1, %xmm1 +; KNL-NEXT: kshiftrw $15, %k0, %k0 +; KNL-NEXT: kmovw %k0, %eax +; KNL-NEXT: vpinsrb $15, %eax, %xmm1, %xmm1 +; KNL-NEXT: vpcmpltud %zmm2, %zmm0, %k0 +; KNL-NEXT: kshiftlw $14, %k0, %k1 +; KNL-NEXT: kshiftrw $15, %k1, %k1 +; KNL-NEXT: kmovw %k1, %eax +; KNL-NEXT: kshiftlw $15, %k0, %k1 +; KNL-NEXT: kshiftrw $15, %k1, %k1 +; KNL-NEXT: kmovw %k1, %ecx +; KNL-NEXT: vmovd %ecx, %xmm0 +; KNL-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 +; KNL-NEXT: kshiftlw $13, %k0, %k1 +; KNL-NEXT: kshiftrw $15, %k1, %k1 +; KNL-NEXT: kmovw %k1, %eax +; KNL-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 +; KNL-NEXT: kshiftlw $12, %k0, %k1 +; KNL-NEXT: kshiftrw $15, %k1, %k1 +; KNL-NEXT: kmovw %k1, %eax +; KNL-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0 +; KNL-NEXT: kshiftlw $11, %k0, %k1 +; KNL-NEXT: kshiftrw $15, %k1, %k1 +; KNL-NEXT: kmovw %k1, %eax +; KNL-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 +; KNL-NEXT: kshiftlw $10, %k0, %k1 +; KNL-NEXT: kshiftrw $15, %k1, %k1 +; KNL-NEXT: kmovw %k1, %eax +; KNL-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; KNL-NEXT: kshiftlw $9, %k0, %k1 +; KNL-NEXT: kshiftrw $15, %k1, %k1 +; KNL-NEXT: kmovw %k1, %eax +; KNL-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 +; KNL-NEXT: kshiftlw $8, %k0, %k1 +; KNL-NEXT: kshiftrw $15, %k1, %k1 +; KNL-NEXT: kmovw %k1, %eax +; KNL-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; KNL-NEXT: kshiftlw $7, %k0, %k1 +; KNL-NEXT: kshiftrw $15, %k1, %k1 +; KNL-NEXT: kmovw %k1, %eax +; KNL-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0 +; KNL-NEXT: kshiftlw $6, %k0, %k1 +; KNL-NEXT: kshiftrw $15, %k1, %k1 +; KNL-NEXT: kmovw %k1, %eax +; KNL-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0 +; KNL-NEXT: kshiftlw $5, %k0, %k1 +; KNL-NEXT: kshiftrw $15, %k1, %k1 +; KNL-NEXT: kmovw %k1, %eax +; KNL-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0 +; KNL-NEXT: kshiftlw $4, %k0, %k1 +; KNL-NEXT: kshiftrw $15, %k1, %k1 +; KNL-NEXT: kmovw %k1, %eax +; KNL-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; KNL-NEXT: kshiftlw $3, %k0, %k1 +; KNL-NEXT: kshiftrw $15, %k1, %k1 +; KNL-NEXT: kmovw %k1, %eax +; KNL-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0 +; KNL-NEXT: kshiftlw $2, %k0, %k1 +; KNL-NEXT: kshiftrw $15, %k1, %k1 +; KNL-NEXT: kmovw %k1, %eax +; KNL-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0 +; KNL-NEXT: kshiftlw $1, %k0, %k1 +; KNL-NEXT: kshiftrw $15, %k1, %k1 +; KNL-NEXT: kmovw %k1, %eax +; KNL-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0 +; KNL-NEXT: kshiftrw $15, %k0, %k0 +; KNL-NEXT: kmovw %k0, %eax +; KNL-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; KNL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; KNL-NEXT: vpsllw $7, %ymm0, %ymm0 +; KNL-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0 +; KNL-NEXT: vpxor %ymm1, %ymm1, %ymm1 +; KNL-NEXT: vpcmpgtb %ymm0, %ymm1, %ymm0 +; KNL-NEXT: sbbl %eax, %eax +; KNL-NEXT: andl $1, %eax +; KNL-NEXT: vpinsrb $4, %eax, %xmm0, %xmm1 +; KNL-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; KNL-NEXT: vextracti128 $1, %ymm0, %xmm1 +; KNL-NEXT: vpmovsxbd %xmm1, %zmm1 +; KNL-NEXT: vpslld $31, %zmm1, %zmm1 +; KNL-NEXT: vptestmd %zmm1, %zmm1, %k0 +; KNL-NEXT: kmovw %k0, 
{{[0-9]+}}(%rsp) +; KNL-NEXT: vpmovsxbd %xmm0, %zmm0 +; KNL-NEXT: vpslld $31, %zmm0, %zmm0 +; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0 +; KNL-NEXT: kmovw %k0, (%rsp) +; KNL-NEXT: movl (%rsp), %eax +; KNL-NEXT: movq %rbp, %rsp +; KNL-NEXT: popq %rbp +; KNL-NEXT: retq +; ; SKX-LABEL: test_insertelement_v32i1: ; SKX: ## BB#0: ; SKX-NEXT: cmpl %esi, %edi ; SKX-NEXT: setb %al +; SKX-NEXT: andl $1, %eax ; SKX-NEXT: kmovw %eax, %k0 ; SKX-NEXT: vpcmpltud %zmm2, %zmm0, %k1 ; SKX-NEXT: vpcmpltud %zmm3, %zmm1, %k2 @@ -1056,10 +1229,52 @@ define i32 @test_insertelement_v32i1(i32 %a, i32 %b, <32 x i32> %x , <32 x i32> } define i8 @test_iinsertelement_v4i1(i32 %a, i32 %b, <4 x i32> %x , <4 x i32> %y) { +; KNL-LABEL: test_iinsertelement_v4i1: +; KNL: ## BB#0: +; KNL-NEXT: cmpl %esi, %edi +; KNL-NEXT: setb %al +; KNL-NEXT: andl $1, %eax +; KNL-NEXT: kmovw %eax, %k1 +; KNL-NEXT: vpbroadcastd {{.*}}(%rip), %xmm2 +; KNL-NEXT: vpxor %xmm2, %xmm0, %xmm0 +; KNL-NEXT: vpxor %xmm2, %xmm1, %xmm1 +; KNL-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0 +; KNL-NEXT: vpextrd $1, %xmm0, %eax +; KNL-NEXT: andl $1, %eax +; KNL-NEXT: kmovw %eax, %k2 +; KNL-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 +; KNL-NEXT: vmovdqa64 %zmm1, %zmm2 {%k2} {z} +; KNL-NEXT: vmovd %xmm0, %eax +; KNL-NEXT: andl $1, %eax +; KNL-NEXT: kmovw %eax, %k2 +; KNL-NEXT: vmovdqa64 %zmm1, %zmm3 {%k2} {z} +; KNL-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,8,2,3,4,5,6,7] +; KNL-NEXT: vpermt2q %zmm2, %zmm4, %zmm3 +; KNL-NEXT: vpsllq $63, %zmm3, %zmm2 +; KNL-NEXT: vptestmq %zmm2, %zmm2, %k2 +; KNL-NEXT: vmovdqa64 %zmm1, %zmm2 {%k2} {z} +; KNL-NEXT: vmovdqa64 %zmm1, %zmm3 {%k1} {z} +; KNL-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,8,3,4,5,6,7] +; KNL-NEXT: vpermt2q %zmm3, %zmm4, %zmm2 +; KNL-NEXT: vpsllq $63, %zmm2, %zmm2 +; KNL-NEXT: vptestmq %zmm2, %zmm2, %k1 +; KNL-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} {z} +; KNL-NEXT: vpextrd $3, %xmm0, %eax +; KNL-NEXT: andl $1, %eax +; KNL-NEXT: kmovw %eax, %k1 +; KNL-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} {z} +; KNL-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,1,2,8,4,5,6,7] +; KNL-NEXT: vpermt2q %zmm0, %zmm1, %zmm2 +; KNL-NEXT: vpsllq $63, %zmm2, %zmm0 +; KNL-NEXT: vptestmq %zmm0, %zmm0, %k0 +; KNL-NEXT: kmovw %k0, %eax +; KNL-NEXT: retq +; ; SKX-LABEL: test_iinsertelement_v4i1: ; SKX: ## BB#0: ; SKX-NEXT: cmpl %esi, %edi ; SKX-NEXT: setb %al +; SKX-NEXT: andl $1, %eax ; SKX-NEXT: kmovw %eax, %k0 ; SKX-NEXT: vpcmpltud %xmm1, %xmm0, %k1 ; SKX-NEXT: vpmovm2d %k1, %xmm0 @@ -1078,10 +1293,34 @@ define i8 @test_iinsertelement_v4i1(i32 %a, i32 %b, <4 x i32> %x , <4 x i32> %y) } define i8 @test_iinsertelement_v2i1(i32 %a, i32 %b, <2 x i64> %x , <2 x i64> %y) { +; KNL-LABEL: test_iinsertelement_v2i1: +; KNL: ## BB#0: +; KNL-NEXT: cmpl %esi, %edi +; KNL-NEXT: setb %al +; KNL-NEXT: andl $1, %eax +; KNL-NEXT: kmovw %eax, %k1 +; KNL-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] +; KNL-NEXT: vpxor %xmm2, %xmm0, %xmm0 +; KNL-NEXT: vpxor %xmm2, %xmm1, %xmm1 +; KNL-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 +; KNL-NEXT: vmovq %xmm0, %rax +; KNL-NEXT: andl $1, %eax +; KNL-NEXT: kmovw %eax, %k2 +; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 +; KNL-NEXT: vmovdqa64 %zmm0, %zmm1 {%k2} {z} +; KNL-NEXT: vmovdqa64 %zmm0, %zmm0 {%k1} {z} +; KNL-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,8,2,3,4,5,6,7] +; KNL-NEXT: vpermt2q %zmm0, %zmm2, %zmm1 +; KNL-NEXT: vpsllq $63, %zmm1, %zmm0 +; KNL-NEXT: vptestmq %zmm0, %zmm0, %k0 +; KNL-NEXT: kmovw %k0, %eax +; KNL-NEXT: retq +; ; SKX-LABEL: test_iinsertelement_v2i1: ; SKX: ## BB#0: ; SKX-NEXT: cmpl %esi, %edi ; SKX-NEXT: setb %al +; 
SKX-NEXT: andl $1, %eax ; SKX-NEXT: kmovw %eax, %k0 ; SKX-NEXT: vpcmpltuq %xmm1, %xmm0, %k1 ; SKX-NEXT: kshiftlw $1, %k1, %k1 @@ -1118,6 +1357,7 @@ define zeroext i8 @test_extractelement_v2i1(<2 x i64> %a, <2 x i64> %b) { ; SKX-NEXT: kshiftlw $15, %k0, %k0 ; SKX-NEXT: kshiftrw $15, %k0, %k0 ; SKX-NEXT: kmovw %k0, %eax +; SKX-NEXT: andl $1, %eax ; SKX-NEXT: testb %al, %al ; SKX-NEXT: sete %al ; SKX-NEXT: addb $3, %al @@ -1146,6 +1386,7 @@ define zeroext i8 @test_extractelement_v4i1(<4 x i32> %a, <4 x i32> %b) { ; SKX-NEXT: kshiftlw $12, %k0, %k0 ; SKX-NEXT: kshiftrw $15, %k0, %k0 ; SKX-NEXT: kmovw %k0, %eax +; SKX-NEXT: andl $1, %eax ; SKX-NEXT: retq %t1 = icmp ugt <4 x i32> %a, %b %t2 = extractelement <4 x i1> %t1, i32 3 @@ -1170,6 +1411,7 @@ define zeroext i8 @test_extractelement_v32i1(<32 x i8> %a, <32 x i8> %b) { ; SKX-NEXT: kshiftld $29, %k0, %k0 ; SKX-NEXT: kshiftrd $31, %k0, %k0 ; SKX-NEXT: kmovw %k0, %eax +; SKX-NEXT: andl $1, %eax ; SKX-NEXT: retq %t1 = icmp ugt <32 x i8> %a, %b %t2 = extractelement <32 x i1> %t1, i32 2 @@ -1197,6 +1439,7 @@ define zeroext i8 @test_extractelement_v64i1(<64 x i8> %a, <64 x i8> %b) { ; SKX-NEXT: vpcmpnleub %zmm1, %zmm0, %k0 ; SKX-NEXT: kshiftrq $63, %k0, %k0 ; SKX-NEXT: kmovw %k0, %eax +; SKX-NEXT: andl $1, %eax ; SKX-NEXT: testb %al, %al ; SKX-NEXT: sete %al ; SKX-NEXT: addb $3, %al diff --git a/llvm/test/CodeGen/X86/avx512-intrinsics.ll b/llvm/test/CodeGen/X86/avx512-intrinsics.ll index 4d89c4f..fe9086c 100644 --- a/llvm/test/CodeGen/X86/avx512-intrinsics.ll +++ b/llvm/test/CodeGen/X86/avx512-intrinsics.ll @@ -7,9 +7,11 @@ define i32 @test_kortestz(i16 %a0, i16 %a1) { ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %esi, %k0 ; CHECK-NEXT: kmovw %edi, %k1 -; CHECK-NEXT: xorl %eax, %eax ; CHECK-NEXT: kortestw %k0, %k1 ; CHECK-NEXT: sete %al +; CHECK-NEXT: kmovw %eax, %k0 +; CHECK-NEXT: kmovw %k0, %eax +; CHECK-NEXT: andl $1, %eax ; CHECK-NEXT: retq %res = call i32 @llvm.x86.avx512.kortestz.w(i16 %a0, i16 %a1) ret i32 %res @@ -4765,7 +4767,8 @@ define i8@test_int_x86_avx512_mask_cmp_sd(<2 x double> %x0, <2 x double> %x1, i8 ; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vcmpnltsd {sae}, %xmm1, %xmm0, %k0 {%k1} ; CHECK-NEXT: kmovw %k0, %eax -; CHECK-NEXT: ## kill: %AL %AL %AX +; CHECK-NEXT: andl $1, %eax +; CHECK-NEXT: ## kill: %AL %AL %EAX ; CHECK-NEXT: retq %res4 = call i8 @llvm.x86.avx512.mask.cmp.sd(<2 x double> %x0, <2 x double> %x1, i32 5, i8 %x3, i32 8) @@ -4786,7 +4789,8 @@ define i8@test_int_x86_avx512_mask_cmp_sd_all(<2 x double> %x0, <2 x double> %x1 ; CHECK-NEXT: kandw %k2, %k1, %k1 ; CHECK-NEXT: korw %k1, %k0, %k0 ; CHECK-NEXT: kmovw %k0, %eax -; CHECK-NEXT: ## kill: %AL %AL %AX +; CHECK-NEXT: andl $1, %eax +; CHECK-NEXT: ## kill: %AL %AL %EAX ; CHECK-NEXT: retq %res1 = call i8 @llvm.x86.avx512.mask.cmp.sd(<2 x double> %x0, <2 x double> %x1, i32 2, i8 -1, i32 4) @@ -4809,7 +4813,8 @@ define i8@test_int_x86_avx512_mask_cmp_ss(<4 x float> %x0, <4 x float> %x1, i8 % ; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vcmpunordss %xmm1, %xmm0, %k0 {%k1} ; CHECK-NEXT: kmovw %k0, %eax -; CHECK-NEXT: ## kill: %AL %AL %AX +; CHECK-NEXT: andl $1, %eax +; CHECK-NEXT: ## kill: %AL %AL %EAX ; CHECK-NEXT: retq %res2 = call i8 @llvm.x86.avx512.mask.cmp.ss(<4 x float> %x0, <4 x float> %x1, i32 3, i8 %x3, i32 4) @@ -4827,11 +4832,12 @@ define i8@test_int_x86_avx512_mask_cmp_ss_all(<4 x float> %x0, <4 x float> %x1, ; CHECK-NEXT: vcmpneqss %xmm1, %xmm0, %k2 {%k1} ; CHECK-NEXT: kmovw %k2, %ecx ; CHECK-NEXT: vcmpnltss {sae}, %xmm1, %xmm0, %k1 {%k1} -; CHECK-NEXT: kmovw %k1, %eax 
-; CHECK-NEXT: kmovw %k0, %edx +; CHECK-NEXT: kmovw %k1, %edx +; CHECK-NEXT: andl $1, %edx +; CHECK-NEXT: kmovw %k0, %eax ; CHECK-NEXT: andb %cl, %al ; CHECK-NEXT: andb %dl, %al -; CHECK-NEXT: ## kill: %AL %AL %AX +; CHECK-NEXT: ## kill: %AL %AL %EAX ; CHECK-NEXT: retq %res1 = call i8 @llvm.x86.avx512.mask.cmp.ss(<4 x float> %x0, <4 x float> %x1, i32 2, i8 -1, i32 4) %res2 = call i8 @llvm.x86.avx512.mask.cmp.ss(<4 x float> %x0, <4 x float> %x1, i32 3, i8 -1, i32 8) diff --git a/llvm/test/CodeGen/X86/avx512-mask-op.ll b/llvm/test/CodeGen/X86/avx512-mask-op.ll index a9641e7..6bf9931 100644 --- a/llvm/test/CodeGen/X86/avx512-mask-op.ll +++ b/llvm/test/CodeGen/X86/avx512-mask-op.ll @@ -173,6 +173,7 @@ define i32 @zext_test1(<16 x i32> %a, <16 x i32> %b) { ; CHECK-NEXT: kshiftlw $10, %k0, %k0 ; CHECK-NEXT: kshiftrw $15, %k0, %k0 ; CHECK-NEXT: kmovw %k0, %eax +; CHECK-NEXT: andl $1, %eax ; CHECK-NEXT: retq %cmp_res = icmp ugt <16 x i32> %a, %b %cmp_res.i1 = extractelement <16 x i1> %cmp_res, i32 5 @@ -187,6 +188,8 @@ define i16 @zext_test2(<16 x i32> %a, <16 x i32> %b) { ; CHECK-NEXT: kshiftlw $10, %k0, %k0 ; CHECK-NEXT: kshiftrw $15, %k0, %k0 ; CHECK-NEXT: kmovw %k0, %eax +; CHECK-NEXT: andl $1, %eax +; CHECK-NEXT: ## kill: %AX %AX %EAX ; CHECK-NEXT: retq %cmp_res = icmp ugt <16 x i32> %a, %b %cmp_res.i1 = extractelement <16 x i1> %cmp_res, i32 5 @@ -201,7 +204,8 @@ define i8 @zext_test3(<16 x i32> %a, <16 x i32> %b) { ; CHECK-NEXT: kshiftlw $10, %k0, %k0 ; CHECK-NEXT: kshiftrw $15, %k0, %k0 ; CHECK-NEXT: kmovw %k0, %eax -; CHECK-NEXT: ## kill: %AL %AL %AX +; CHECK-NEXT: andl $1, %eax +; CHECK-NEXT: ## kill: %AL %AL %EAX ; CHECK-NEXT: retq %cmp_res = icmp ugt <16 x i32> %a, %b %cmp_res.i1 = extractelement <16 x i1> %cmp_res, i32 5 @@ -606,6 +610,7 @@ define <64 x i8> @test17(i64 %x, i32 %y, i32 %z) { ; SKX-NEXT: kmovq %rdi, %k0 ; SKX-NEXT: cmpl %edx, %esi ; SKX-NEXT: setg %al +; SKX-NEXT: andl $1, %eax ; SKX-NEXT: kmovw %eax, %k1 ; SKX-NEXT: vpmovm2b %k1, %zmm0 ; SKX-NEXT: vpsllq $40, %xmm0, %xmm0 @@ -1760,7 +1765,7 @@ define void @store_64i1(<64 x i1>* %a, <64 x i1> %v) { ; KNL-NEXT: kmovw %k0, %r13d ; KNL-NEXT: kshiftlw $7, %k2, %k0 ; KNL-NEXT: kshiftrw $15, %k0, %k0 -; KNL-NEXT: kmovw %k0, %edx +; KNL-NEXT: kmovw %k0, %ecx ; KNL-NEXT: kshiftlw $6, %k2, %k0 ; KNL-NEXT: kshiftrw $15, %k0, %k0 ; KNL-NEXT: kmovw %k0, %esi @@ -1775,7 +1780,7 @@ define void @store_64i1(<64 x i1>* %a, <64 x i1> %v) { ; KNL-NEXT: kmovw %k0, %eax ; KNL-NEXT: kshiftlw $2, %k2, %k0 ; KNL-NEXT: kshiftrw $15, %k0, %k0 -; KNL-NEXT: kmovw %k0, %ecx +; KNL-NEXT: kmovw %k0, %edx ; KNL-NEXT: kshiftlw $1, %k2, %k0 ; KNL-NEXT: kshiftrw $15, %k0, %k0 ; KNL-NEXT: vmovd %r10d, %xmm2 @@ -1789,12 +1794,12 @@ define void @store_64i1(<64 x i1>* %a, <64 x i1> %v) { ; KNL-NEXT: vpinsrb $5, %r15d, %xmm1, %xmm1 ; KNL-NEXT: vpinsrb $6, %r12d, %xmm1, %xmm1 ; KNL-NEXT: vpinsrb $7, %r13d, %xmm1, %xmm1 -; KNL-NEXT: vpinsrb $8, %edx, %xmm1, %xmm1 +; KNL-NEXT: vpinsrb $8, %ecx, %xmm1, %xmm1 ; KNL-NEXT: vpinsrb $9, %esi, %xmm1, %xmm1 ; KNL-NEXT: vpinsrb $10, %ebp, %xmm1, %xmm1 ; KNL-NEXT: vpinsrb $11, %ebx, %xmm1, %xmm1 ; KNL-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 -; KNL-NEXT: vpinsrb $13, %ecx, %xmm1, %xmm1 +; KNL-NEXT: vpinsrb $13, %edx, %xmm1, %xmm1 ; KNL-NEXT: vpinsrb $14, %r10d, %xmm1, %xmm1 ; KNL-NEXT: kmovw %k0, %eax ; KNL-NEXT: vpinsrb $15, %eax, %xmm1, %xmm1 @@ -1828,7 +1833,7 @@ define void @store_64i1(<64 x i1>* %a, <64 x i1> %v) { ; KNL-NEXT: kmovw %k0, %r13d ; KNL-NEXT: kshiftlw $7, %k1, %k0 ; KNL-NEXT: kshiftrw $15, %k0, %k0 -; 
KNL-NEXT: kmovw %k0, %edx +; KNL-NEXT: kmovw %k0, %ecx ; KNL-NEXT: kshiftlw $6, %k1, %k0 ; KNL-NEXT: kshiftrw $15, %k0, %k0 ; KNL-NEXT: kmovw %k0, %esi @@ -1843,7 +1848,7 @@ define void @store_64i1(<64 x i1>* %a, <64 x i1> %v) { ; KNL-NEXT: kmovw %k0, %eax ; KNL-NEXT: kshiftlw $2, %k1, %k0 ; KNL-NEXT: kshiftrw $15, %k0, %k0 -; KNL-NEXT: kmovw %k0, %ecx +; KNL-NEXT: kmovw %k0, %edx ; KNL-NEXT: kshiftlw $1, %k1, %k0 ; KNL-NEXT: kshiftrw $15, %k0, %k0 ; KNL-NEXT: vmovd %r10d, %xmm1 @@ -1857,12 +1862,12 @@ define void @store_64i1(<64 x i1>* %a, <64 x i1> %v) { ; KNL-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 ; KNL-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 ; KNL-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 -; KNL-NEXT: vpinsrb $8, %edx, %xmm0, %xmm0 +; KNL-NEXT: vpinsrb $8, %ecx, %xmm0, %xmm0 ; KNL-NEXT: vpinsrb $9, %esi, %xmm0, %xmm0 ; KNL-NEXT: vpinsrb $10, %ebp, %xmm0, %xmm0 ; KNL-NEXT: vpinsrb $11, %ebx, %xmm0, %xmm0 ; KNL-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0 -; KNL-NEXT: vpinsrb $13, %ecx, %xmm0, %xmm0 +; KNL-NEXT: vpinsrb $13, %edx, %xmm0, %xmm0 ; KNL-NEXT: vpinsrb $14, %r10d, %xmm0, %xmm0 ; KNL-NEXT: kmovw %k1, %eax ; KNL-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 diff --git a/llvm/test/CodeGen/X86/avx512dq-intrinsics.ll b/llvm/test/CodeGen/X86/avx512dq-intrinsics.ll index f836009..e684ad9 100644 --- a/llvm/test/CodeGen/X86/avx512dq-intrinsics.ll +++ b/llvm/test/CodeGen/X86/avx512dq-intrinsics.ll @@ -491,10 +491,12 @@ define i8 @test_int_x86_avx512_mask_fpclass_sd(<2 x double> %x0, i8 %x1) { ; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vfpclasssd $2, %xmm0, %k0 {%k1} ; CHECK-NEXT: kmovw %k0, %ecx +; CHECK-NEXT: andl $1, %ecx ; CHECK-NEXT: vfpclasssd $4, %xmm0, %k0 ; CHECK-NEXT: kmovw %k0, %eax +; CHECK-NEXT: andl $1, %eax ; CHECK-NEXT: addb %cl, %al -; CHECK-NEXT: ## kill: %AL %AL %AX +; CHECK-NEXT: ## kill: %AL %AL %EAX ; CHECK-NEXT: retq %res = call i8 @llvm.x86.avx512.mask.fpclass.sd(<2 x double> %x0, i32 2, i8 %x1) %res1 = call i8 @llvm.x86.avx512.mask.fpclass.sd(<2 x double> %x0, i32 4, i8 -1) @@ -511,10 +513,12 @@ define i8 @test_int_x86_avx512_mask_fpclass_ss(<4 x float> %x0, i8 %x1) { ; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vfpclassss $4, %xmm0, %k0 {%k1} ; CHECK-NEXT: kmovw %k0, %ecx +; CHECK-NEXT: andl $1, %ecx ; CHECK-NEXT: vfpclassss $4, %xmm0, %k0 ; CHECK-NEXT: kmovw %k0, %eax +; CHECK-NEXT: andl $1, %eax ; CHECK-NEXT: addb %cl, %al -; CHECK-NEXT: ## kill: %AL %AL %AX +; CHECK-NEXT: ## kill: %AL %AL %EAX ; CHECK-NEXT: retq %res = call i8 @llvm.x86.avx512.mask.fpclass.ss(<4 x float> %x0, i32 4, i8 %x1) %res1 = call i8 @llvm.x86.avx512.mask.fpclass.ss(<4 x float> %x0, i32 4, i8 -1) diff --git a/llvm/test/CodeGen/X86/fast-isel-select-cmov.ll b/llvm/test/CodeGen/X86/fast-isel-select-cmov.ll index 290bcaa..a6de1e0 100644 --- a/llvm/test/CodeGen/X86/fast-isel-select-cmov.ll +++ b/llvm/test/CodeGen/X86/fast-isel-select-cmov.ll @@ -15,6 +15,7 @@ define zeroext i16 @select_cmov_i16(i1 zeroext %cond, i16 zeroext %a, i16 zeroex ; ; AVX512-LABEL: select_cmov_i16: ; AVX512: ## BB#0: +; AVX512-NEXT: andl $1, %edi ; AVX512-NEXT: kmovw %edi, %k0 ; AVX512-NEXT: kortestw %k0, %k0 ; AVX512-NEXT: cmovew %dx, %si @@ -46,6 +47,7 @@ define i32 @select_cmov_i32(i1 zeroext %cond, i32 %a, i32 %b) { ; ; AVX512-LABEL: select_cmov_i32: ; AVX512: ## BB#0: +; AVX512-NEXT: andl $1, %edi ; AVX512-NEXT: kmovw %edi, %k0 ; AVX512-NEXT: kortestw %k0, %k0 ; AVX512-NEXT: cmovel %edx, %esi @@ -77,6 +79,7 @@ define i64 @select_cmov_i64(i1 zeroext %cond, i64 %a, i64 %b) { ; ; AVX512-LABEL: select_cmov_i64: ; AVX512: ## BB#0: +; 
AVX512-NEXT: andl $1, %edi ; AVX512-NEXT: kmovw %edi, %k0 ; AVX512-NEXT: kortestw %k0, %k0 ; AVX512-NEXT: cmoveq %rdx, %rsi diff --git a/llvm/test/CodeGen/X86/masked_gather_scatter.ll b/llvm/test/CodeGen/X86/masked_gather_scatter.ll index 85cfc62..61dc7c7 100644 --- a/llvm/test/CodeGen/X86/masked_gather_scatter.ll +++ b/llvm/test/CodeGen/X86/masked_gather_scatter.ll @@ -1,3 +1,4 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512f < %s | FileCheck %s --check-prefix=KNL_64 ; RUN: llc -mtriple=i386-unknown-linux-gnu -mattr=+avx512f < %s | FileCheck %s --check-prefix=KNL_32 ; RUN: llc -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512vl -mattr=+avx512dq < %s | FileCheck %s --check-prefix=SKX @@ -38,6 +39,14 @@ define <16 x float> @test1(float* %base, <16 x i32> %ind) { ; SKX-NEXT: vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1} ; SKX-NEXT: vmovaps %zmm1, %zmm0 ; SKX-NEXT: retq +; +; SKX_32-LABEL: test1: +; SKX_32: # BB#0: +; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax +; SKX_32-NEXT: kxnorw %k0, %k0, %k1 +; SKX_32-NEXT: vgatherdps (%eax,%zmm0,4), %zmm1 {%k1} +; SKX_32-NEXT: vmovaps %zmm1, %zmm0 +; SKX_32-NEXT: retl %broadcast.splatinsert = insertelement <16 x float*> undef, float* %base, i32 0 %broadcast.splat = shufflevector <16 x float*> %broadcast.splatinsert, <16 x float*> undef, <16 x i32> zeroinitializer @@ -87,6 +96,14 @@ define <16 x float> @test2(float* %base, <16 x i32> %ind, i16 %mask) { ; SKX-NEXT: vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1} ; SKX-NEXT: vmovaps %zmm1, %zmm0 ; SKX-NEXT: retq +; +; SKX_32-LABEL: test2: +; SKX_32: # BB#0: +; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax +; SKX_32-NEXT: kmovw {{[0-9]+}}(%esp), %k1 +; SKX_32-NEXT: vgatherdps (%eax,%zmm0,4), %zmm1 {%k1} +; SKX_32-NEXT: vmovaps %zmm1, %zmm0 +; SKX_32-NEXT: retl %broadcast.splatinsert = insertelement <16 x float*> undef, float* %base, i32 0 %broadcast.splat = shufflevector <16 x float*> %broadcast.splatinsert, <16 x float*> undef, <16 x i32> zeroinitializer @@ -120,6 +137,14 @@ define <16 x i32> @test3(i32* %base, <16 x i32> %ind, i16 %mask) { ; SKX-NEXT: vpgatherdd (%rdi,%zmm0,4), %zmm1 {%k1} ; SKX-NEXT: vmovdqa64 %zmm1, %zmm0 ; SKX-NEXT: retq +; +; SKX_32-LABEL: test3: +; SKX_32: # BB#0: +; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax +; SKX_32-NEXT: kmovw {{[0-9]+}}(%esp), %k1 +; SKX_32-NEXT: vpgatherdd (%eax,%zmm0,4), %zmm1 {%k1} +; SKX_32-NEXT: vmovdqa64 %zmm1, %zmm0 +; SKX_32-NEXT: retl %broadcast.splatinsert = insertelement <16 x i32*> undef, i32* %base, i32 0 %broadcast.splat = shufflevector <16 x i32*> %broadcast.splatinsert, <16 x i32*> undef, <16 x i32> zeroinitializer @@ -163,6 +188,17 @@ define <16 x i32> @test4(i32* %base, <16 x i32> %ind, i16 %mask) { ; SKX-NEXT: vpgatherdd (%rdi,%zmm0,4), %zmm2 {%k1} ; SKX-NEXT: vpaddd %zmm2, %zmm1, %zmm0 ; SKX-NEXT: retq +; +; SKX_32-LABEL: test4: +; SKX_32: # BB#0: +; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax +; SKX_32-NEXT: kmovw {{[0-9]+}}(%esp), %k1 +; SKX_32-NEXT: kmovw %k1, %k2 +; SKX_32-NEXT: vpgatherdd (%eax,%zmm0,4), %zmm1 {%k2} +; SKX_32-NEXT: vmovdqa64 %zmm1, %zmm2 +; SKX_32-NEXT: vpgatherdd (%eax,%zmm0,4), %zmm2 {%k1} +; SKX_32-NEXT: vpaddd %zmm2, %zmm1, %zmm0 +; SKX_32-NEXT: retl %broadcast.splatinsert = insertelement <16 x i32*> undef, i32* %base, i32 0 %broadcast.splat = shufflevector <16 x i32*> %broadcast.splatinsert, <16 x i32*> undef, <16 x i32> zeroinitializer @@ -215,6 +251,15 @@ define void @test5(i32* %base, <16 x i32> %ind, i16 %mask, <16 x i32>%val) { ; SKX-NEXT: 
vpscatterdd %zmm1, (%rdi,%zmm0,4) {%k2} ; SKX-NEXT: vpscatterdd %zmm1, (%rdi,%zmm0,4) {%k1} ; SKX-NEXT: retq +; +; SKX_32-LABEL: test5: +; SKX_32: # BB#0: +; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax +; SKX_32-NEXT: kmovw {{[0-9]+}}(%esp), %k1 +; SKX_32-NEXT: kmovw %k1, %k2 +; SKX_32-NEXT: vpscatterdd %zmm1, (%eax,%zmm0,4) {%k2} +; SKX_32-NEXT: vpscatterdd %zmm1, (%eax,%zmm0,4) {%k1} +; SKX_32-NEXT: retl %broadcast.splatinsert = insertelement <16 x i32*> undef, i32* %base, i32 0 %broadcast.splat = shufflevector <16 x i32*> %broadcast.splatinsert, <16 x i32*> undef, <16 x i32> zeroinitializer @@ -267,6 +312,15 @@ define <8 x i32> @test6(<8 x i32>%a1, <8 x i32*> %ptr) { ; SKX-NEXT: vpscatterqd %ymm0, (,%zmm1) {%k1} ; SKX-NEXT: vmovdqa64 %ymm2, %ymm0 ; SKX-NEXT: retq +; +; SKX_32-LABEL: test6: +; SKX_32: # BB#0: +; SKX_32-NEXT: kxnorw %k0, %k0, %k1 +; SKX_32-NEXT: kxnorw %k0, %k0, %k2 +; SKX_32-NEXT: vpgatherdd (,%ymm1), %ymm2 {%k2} +; SKX_32-NEXT: vpscatterdd %ymm0, (,%ymm1) {%k1} +; SKX_32-NEXT: vmovdqa64 %ymm2, %ymm0 +; SKX_32-NEXT: retl %a = call <8 x i32> @llvm.masked.gather.v8i32(<8 x i32*> %ptr, i32 4, <8 x i1> , <8 x i32> undef) @@ -309,6 +363,17 @@ define <8 x i32> @test7(i32* %base, <8 x i32> %ind, i8 %mask) { ; SKX-NEXT: vpgatherdd (%rdi,%ymm0,4), %ymm2 {%k1} ; SKX-NEXT: vpaddd %ymm2, %ymm1, %ymm0 ; SKX-NEXT: retq +; +; SKX_32-LABEL: test7: +; SKX_32: # BB#0: +; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax +; SKX_32-NEXT: kmovb {{[0-9]+}}(%esp), %k1 +; SKX_32-NEXT: kmovw %k1, %k2 +; SKX_32-NEXT: vpgatherdd (%eax,%ymm0,4), %ymm1 {%k2} +; SKX_32-NEXT: vmovdqa64 %ymm1, %ymm2 +; SKX_32-NEXT: vpgatherdd (%eax,%ymm0,4), %ymm2 {%k1} +; SKX_32-NEXT: vpaddd %ymm2, %ymm1, %ymm0 +; SKX_32-NEXT: retl %broadcast.splatinsert = insertelement <8 x i32*> undef, i32* %base, i32 0 %broadcast.splat = shufflevector <8 x i32*> %broadcast.splatinsert, <8 x i32*> undef, <8 x i32> zeroinitializer @@ -415,13 +480,13 @@ define <8 x i32> @test9(%struct.ST* %base, <8 x i64> %ind1, <8 x i32>%ind5) { ; KNL_32-LABEL: test9: ; KNL_32: # BB#0: # %entry ; KNL_32-NEXT: vpbroadcastd {{[0-9]+}}(%esp), %ymm2 -; KNL_32-NEXT: vpbroadcastd .LCPI8_0, %ymm3 +; KNL_32-NEXT: vpbroadcastd {{\.LCPI.*}}, %ymm3 ; KNL_32-NEXT: vpmulld %ymm3, %ymm1, %ymm1 ; KNL_32-NEXT: vpmovqd %zmm0, %ymm0 -; KNL_32-NEXT: vpbroadcastd .LCPI8_1, %ymm3 +; KNL_32-NEXT: vpbroadcastd {{\.LCPI.*}}, %ymm3 ; KNL_32-NEXT: vpmulld %ymm3, %ymm0, %ymm0 ; KNL_32-NEXT: vpaddd %ymm1, %ymm0, %ymm0 -; KNL_32-NEXT: vpbroadcastd .LCPI8_2, %ymm1 +; KNL_32-NEXT: vpbroadcastd {{\.LCPI.*}}, %ymm1 ; KNL_32-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ; KNL_32-NEXT: vpaddd %ymm0, %ymm2, %ymm0 ; KNL_32-NEXT: vpmovsxdq %ymm0, %zmm1 @@ -441,6 +506,18 @@ define <8 x i32> @test9(%struct.ST* %base, <8 x i64> %ind1, <8 x i32>%ind5) { ; SKX-NEXT: kxnorw %k0, %k0, %k1 ; SKX-NEXT: vpgatherqd (,%zmm1), %ymm0 {%k1} ; SKX-NEXT: retq +; +; SKX_32-LABEL: test9: +; SKX_32: # BB#0: # %entry +; SKX_32-NEXT: vpmulld {{\.LCPI.*}}{1to8}, %ymm1, %ymm1 +; SKX_32-NEXT: vpmovqd %zmm0, %ymm0 +; SKX_32-NEXT: vpmulld {{\.LCPI.*}}{1to8}, %ymm0, %ymm0 +; SKX_32-NEXT: vpaddd {{[0-9]+}}(%esp){1to8}, %ymm0, %ymm0 +; SKX_32-NEXT: vpaddd %ymm1, %ymm0, %ymm0 +; SKX_32-NEXT: vpaddd {{\.LCPI.*}}{1to8}, %ymm0, %ymm1 +; SKX_32-NEXT: kxnorw %k0, %k0, %k1 +; SKX_32-NEXT: vpgatherdd (,%ymm1), %ymm0 {%k1} +; SKX_32-NEXT: retl entry: %broadcast.splatinsert = insertelement <8 x %struct.ST*> undef, %struct.ST* %base, i32 0 %broadcast.splat = shufflevector <8 x %struct.ST*> %broadcast.splatinsert, <8 x %struct.ST*> undef, <8 x i32> 
zeroinitializer @@ -477,13 +554,13 @@ define <8 x i32> @test10(%struct.ST* %base, <8 x i64> %i1, <8 x i32>%ind5) { ; KNL_32-LABEL: test10: ; KNL_32: # BB#0: # %entry ; KNL_32-NEXT: vpbroadcastd {{[0-9]+}}(%esp), %ymm2 -; KNL_32-NEXT: vpbroadcastd .LCPI9_0, %ymm3 +; KNL_32-NEXT: vpbroadcastd {{\.LCPI.*}}, %ymm3 ; KNL_32-NEXT: vpmulld %ymm3, %ymm1, %ymm1 ; KNL_32-NEXT: vpmovqd %zmm0, %ymm0 -; KNL_32-NEXT: vpbroadcastd .LCPI9_1, %ymm3 +; KNL_32-NEXT: vpbroadcastd {{\.LCPI.*}}, %ymm3 ; KNL_32-NEXT: vpmulld %ymm3, %ymm0, %ymm0 ; KNL_32-NEXT: vpaddd %ymm1, %ymm0, %ymm0 -; KNL_32-NEXT: vpbroadcastd .LCPI9_2, %ymm1 +; KNL_32-NEXT: vpbroadcastd {{\.LCPI.*}}, %ymm1 ; KNL_32-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ; KNL_32-NEXT: vpaddd %ymm0, %ymm2, %ymm0 ; KNL_32-NEXT: vpmovsxdq %ymm0, %zmm1 @@ -503,6 +580,18 @@ define <8 x i32> @test10(%struct.ST* %base, <8 x i64> %i1, <8 x i32>%ind5) { ; SKX-NEXT: kxnorw %k0, %k0, %k1 ; SKX-NEXT: vpgatherqd (,%zmm1), %ymm0 {%k1} ; SKX-NEXT: retq +; +; SKX_32-LABEL: test10: +; SKX_32: # BB#0: # %entry +; SKX_32-NEXT: vpmulld {{\.LCPI.*}}{1to8}, %ymm1, %ymm1 +; SKX_32-NEXT: vpmovqd %zmm0, %ymm0 +; SKX_32-NEXT: vpmulld {{\.LCPI.*}}{1to8}, %ymm0, %ymm0 +; SKX_32-NEXT: vpaddd {{[0-9]+}}(%esp){1to8}, %ymm0, %ymm0 +; SKX_32-NEXT: vpaddd %ymm1, %ymm0, %ymm0 +; SKX_32-NEXT: vpaddd {{\.LCPI.*}}{1to8}, %ymm0, %ymm1 +; SKX_32-NEXT: kxnorw %k0, %k0, %k1 +; SKX_32-NEXT: vpgatherdd (,%ymm1), %ymm0 {%k1} +; SKX_32-NEXT: retl entry: %broadcast.splatinsert = insertelement <8 x %struct.ST*> undef, %struct.ST* %base, i32 0 %broadcast.splat = shufflevector <8 x %struct.ST*> %broadcast.splatinsert, <8 x %struct.ST*> undef, <8 x i32> zeroinitializer @@ -535,6 +624,14 @@ define <16 x float> @test11(float* %base, i32 %ind) { ; SKX-NEXT: kxnorw %k0, %k0, %k1 ; SKX-NEXT: vgatherdps (%rdi,%zmm1,4), %zmm0 {%k1} ; SKX-NEXT: retq +; +; SKX_32-LABEL: test11: +; SKX_32: # BB#0: +; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax +; SKX_32-NEXT: vpbroadcastd {{[0-9]+}}(%esp), %zmm1 +; SKX_32-NEXT: kxnorw %k0, %k0, %k1 +; SKX_32-NEXT: vgatherdps (%eax,%zmm1,4), %zmm0 {%k1} +; SKX_32-NEXT: retl %broadcast.splatinsert = insertelement <16 x float*> undef, float* %base, i32 0 %broadcast.splat = shufflevector <16 x float*> %broadcast.splatinsert, <16 x float*> undef, <16 x i32> zeroinitializer @@ -568,6 +665,14 @@ define <16 x float> @test12(float* %base, <16 x i32> %ind) { ; SKX-NEXT: vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1} ; SKX-NEXT: vmovaps %zmm1, %zmm0 ; SKX-NEXT: retq +; +; SKX_32-LABEL: test12: +; SKX_32: # BB#0: +; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax +; SKX_32-NEXT: kxnorw %k0, %k0, %k1 +; SKX_32-NEXT: vgatherdps (%eax,%zmm0,4), %zmm1 {%k1} +; SKX_32-NEXT: vmovaps %zmm1, %zmm0 +; SKX_32-NEXT: retl %sext_ind = sext <16 x i32> %ind to <16 x i64> %gep.random = getelementptr float, float *%base, <16 x i64> %sext_ind @@ -596,6 +701,13 @@ define <16 x float> @test13(float* %base, <16 x i32> %ind) { ; SKX-NEXT: vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1} ; SKX-NEXT: vmovaps %zmm1, %zmm0 ; SKX-NEXT: retq +; +; SKX_32-LABEL: test13: +; SKX_32: # BB#0: +; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax +; SKX_32-NEXT: vgatherdps (%eax,%zmm0,4), %zmm1 {%k1} +; SKX_32-NEXT: vmovaps %zmm1, %zmm0 +; SKX_32-NEXT: retl %sext_ind = sext <16 x i32> %ind to <16 x i64> %gep.random = getelementptr float, float *%base, <16 x i64> %sext_ind @@ -675,25 +787,29 @@ define <4 x float> @test15(float* %base, <4 x i32> %ind, <4 x i1> %mask) { ; ; KNL_64-LABEL: test15: ; KNL_64: # BB#0: -; KNL_64: vpxor %ymm2, %ymm2, %ymm2 +; KNL_64-NEXT: # kill: 
%XMM1 %XMM1 %YMM1 +; KNL_64-NEXT: # kill: %XMM0 %XMM0 %YMM0 +; KNL_64-NEXT: vpxor %ymm2, %ymm2, %ymm2 ; KNL_64-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] ; KNL_64-NEXT: vpmovsxdq %ymm0, %zmm2 ; KNL_64-NEXT: vpslld $31, %ymm1, %ymm0 ; KNL_64-NEXT: vptestmd %zmm0, %zmm0, %k1 ; KNL_64-NEXT: vgatherqps (%rdi,%zmm2,4), %ymm0 {%k1} -; KNL_64-NEXT: # kill +; KNL_64-NEXT: # kill: %XMM0 %XMM0 %YMM0 ; KNL_64-NEXT: retq ; ; KNL_32-LABEL: test15: ; KNL_32: # BB#0: -; KNL_32: vpxor %ymm2, %ymm2, %ymm2 +; KNL_32-NEXT: # kill: %XMM1 %XMM1 %YMM1 +; KNL_32-NEXT: # kill: %XMM0 %XMM0 %YMM0 +; KNL_32-NEXT: vpxor %ymm2, %ymm2, %ymm2 ; KNL_32-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] ; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax ; KNL_32-NEXT: vpmovsxdq %ymm0, %zmm2 ; KNL_32-NEXT: vpslld $31, %ymm1, %ymm0 ; KNL_32-NEXT: vptestmd %zmm0, %zmm0, %k1 ; KNL_32-NEXT: vgatherqps (%eax,%zmm2,4), %ymm0 {%k1} -; KNL_32-NEXT: # kill +; KNL_32-NEXT: # kill: %XMM0 %XMM0 %YMM0 ; KNL_32-NEXT: retl ; ; SKX-LABEL: test15: @@ -724,7 +840,9 @@ define <4 x double> @test16(double* %base, <4 x i32> %ind, <4 x i1> %mask, <4 x ; ; KNL_64-LABEL: test16: ; KNL_64: # BB#0: -; KNL_64: vpslld $31, %xmm1, %xmm1 +; KNL_64-NEXT: # kill: %YMM2 %YMM2 %ZMM2 +; KNL_64-NEXT: # kill: %XMM0 %XMM0 %YMM0 +; KNL_64-NEXT: vpslld $31, %xmm1, %xmm1 ; KNL_64-NEXT: vpsrad $31, %xmm1, %xmm1 ; KNL_64-NEXT: vpmovsxdq %xmm1, %ymm1 ; KNL_64-NEXT: vpxord %zmm3, %zmm3, %zmm3 @@ -738,14 +856,16 @@ define <4 x double> @test16(double* %base, <4 x i32> %ind, <4 x i1> %mask, <4 x ; ; KNL_32-LABEL: test16: ; KNL_32: # BB#0: -; KNL_32: vpslld $31, %xmm1, %xmm1 +; KNL_32-NEXT: # kill: %YMM2 %YMM2 %ZMM2 +; KNL_32-NEXT: # kill: %XMM0 %XMM0 %YMM0 +; KNL_32-NEXT: vpslld $31, %xmm1, %xmm1 ; KNL_32-NEXT: vpsrad $31, %xmm1, %xmm1 ; KNL_32-NEXT: vpmovsxdq %xmm1, %ymm1 ; KNL_32-NEXT: vpxord %zmm3, %zmm3, %zmm3 ; KNL_32-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 ; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax ; KNL_32-NEXT: vpmovsxdq %ymm0, %zmm0 -; KNL_32-NEXT: vpsllvq .LCPI15_0, %zmm1, %zmm1 +; KNL_32-NEXT: vpsllvq {{\.LCPI.*}}, %zmm1, %zmm1 ; KNL_32-NEXT: vptestmq %zmm1, %zmm1, %k1 ; KNL_32-NEXT: vgatherqpd (%eax,%zmm0,8), %zmm2 {%k1} ; KNL_32-NEXT: vmovapd %zmm2, %zmm0 @@ -778,7 +898,9 @@ define <2 x double> @test17(double* %base, <2 x i32> %ind, <2 x i1> %mask, <2 x ; ; KNL_64-LABEL: test17: ; KNL_64: # BB#0: -; KNL_64: vpxord %zmm3, %zmm3, %zmm3 +; KNL_64-NEXT: # kill: %XMM2 %XMM2 %ZMM2 +; KNL_64-NEXT: # kill: %XMM0 %XMM0 %ZMM0 +; KNL_64-NEXT: vpxord %zmm3, %zmm3, %zmm3 ; KNL_64-NEXT: vinserti32x4 $0, %xmm1, %zmm3, %zmm1 ; KNL_64-NEXT: vpsllq $63, %zmm1, %zmm1 ; KNL_64-NEXT: vptestmq %zmm1, %zmm1, %k1 @@ -788,10 +910,12 @@ define <2 x double> @test17(double* %base, <2 x i32> %ind, <2 x i1> %mask, <2 x ; ; KNL_32-LABEL: test17: ; KNL_32: # BB#0: -; KNL_32: vpxord %zmm3, %zmm3, %zmm3 +; KNL_32-NEXT: # kill: %XMM2 %XMM2 %ZMM2 +; KNL_32-NEXT: # kill: %XMM0 %XMM0 %ZMM0 +; KNL_32-NEXT: vpxord %zmm3, %zmm3, %zmm3 ; KNL_32-NEXT: vinserti32x4 $0, %xmm1, %zmm3, %zmm1 ; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax -; KNL_32-NEXT: vpsllvq .LCPI16_0, %zmm1, %zmm1 +; KNL_32-NEXT: vpsllvq {{\.LCPI.*}}, %zmm1, %zmm1 ; KNL_32-NEXT: vptestmq %zmm1, %zmm1, %k1 ; KNL_32-NEXT: vgatherqpd (%eax,%zmm0,8), %zmm2 {%k1} ; KNL_32-NEXT: vmovapd %zmm2, %zmm0 @@ -830,7 +954,10 @@ define void @test18(<4 x i32>%a1, <4 x i32*> %ptr, <4 x i1>%mask) { ; ; KNL_64-LABEL: test18: ; KNL_64: # BB#0: -; KNL_64: vpxor %ymm3, %ymm3, %ymm3 +; KNL_64-NEXT: # kill: %XMM2 %XMM2 %YMM2 +; KNL_64-NEXT: # 
kill: %YMM1 %YMM1 %ZMM1 +; KNL_64-NEXT: # kill: %XMM0 %XMM0 %YMM0 +; KNL_64-NEXT: vpxor %ymm3, %ymm3, %ymm3 ; KNL_64-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] ; KNL_64-NEXT: vpslld $31, %ymm2, %ymm2 ; KNL_64-NEXT: vptestmd %zmm2, %zmm2, %k1 @@ -839,7 +966,10 @@ define void @test18(<4 x i32>%a1, <4 x i32*> %ptr, <4 x i1>%mask) { ; ; KNL_32-LABEL: test18: ; KNL_32: # BB#0: -; KNL_32: vpxor %ymm3, %ymm3, %ymm3 +; KNL_32-NEXT: # kill: %XMM2 %XMM2 %YMM2 +; KNL_32-NEXT: # kill: %XMM1 %XMM1 %YMM1 +; KNL_32-NEXT: # kill: %XMM0 %XMM0 %YMM0 +; KNL_32-NEXT: vpxor %ymm3, %ymm3, %ymm3 ; KNL_32-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] ; KNL_32-NEXT: vpmovsxdq %ymm1, %zmm1 ; KNL_32-NEXT: vpslld $31, %ymm2, %ymm2 @@ -868,7 +998,9 @@ define void @test19(<4 x double>%a1, double* %ptr, <4 x i1>%mask, <4 x i64> %ind ; ; KNL_64-LABEL: test19: ; KNL_64: # BB#0: -; KNL_64: vpslld $31, %xmm1, %xmm1 +; KNL_64-NEXT: # kill: %YMM2 %YMM2 %ZMM2 +; KNL_64-NEXT: # kill: %YMM0 %YMM0 %ZMM0 +; KNL_64-NEXT: vpslld $31, %xmm1, %xmm1 ; KNL_64-NEXT: vpsrad $31, %xmm1, %xmm1 ; KNL_64-NEXT: vpmovsxdq %xmm1, %ymm1 ; KNL_64-NEXT: vpxord %zmm3, %zmm3, %zmm3 @@ -880,13 +1012,15 @@ define void @test19(<4 x double>%a1, double* %ptr, <4 x i1>%mask, <4 x i64> %ind ; ; KNL_32-LABEL: test19: ; KNL_32: # BB#0: -; KNL_32: vpslld $31, %xmm1, %xmm1 +; KNL_32-NEXT: # kill: %YMM2 %YMM2 %ZMM2 +; KNL_32-NEXT: # kill: %YMM0 %YMM0 %ZMM0 +; KNL_32-NEXT: vpslld $31, %xmm1, %xmm1 ; KNL_32-NEXT: vpsrad $31, %xmm1, %xmm1 ; KNL_32-NEXT: vpmovsxdq %xmm1, %ymm1 ; KNL_32-NEXT: vpxord %zmm3, %zmm3, %zmm3 ; KNL_32-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 ; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax -; KNL_32-NEXT: vpsllvq .LCPI18_0, %zmm1, %zmm1 +; KNL_32-NEXT: vpsllvq {{\.LCPI.*}}, %zmm1, %zmm1 ; KNL_32-NEXT: vptestmq %zmm1, %zmm1, %k1 ; KNL_32-NEXT: vscatterqpd %zmm0, (%eax,%zmm2,8) {%k1} ; KNL_32-NEXT: retl @@ -915,7 +1049,9 @@ define void @test20(<2 x float>%a1, <2 x float*> %ptr, <2 x i1> %mask) { ; ; KNL_64-LABEL: test20: ; KNL_64: # BB#0: -; KNL_64: vpshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] +; KNL_64-NEXT: # kill: %XMM1 %XMM1 %ZMM1 +; KNL_64-NEXT: # kill: %XMM0 %XMM0 %YMM0 +; KNL_64-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] ; KNL_64-NEXT: vmovq {{.*#+}} xmm2 = xmm2[0],zero ; KNL_64-NEXT: vpxor %ymm3, %ymm3, %ymm3 ; KNL_64-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] @@ -926,7 +1062,8 @@ define void @test20(<2 x float>%a1, <2 x float*> %ptr, <2 x i1> %mask) { ; ; KNL_32-LABEL: test20: ; KNL_32: # BB#0: -; KNL_32: vpshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] +; KNL_32-NEXT: # kill: %XMM0 %XMM0 %YMM0 +; KNL_32-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] ; KNL_32-NEXT: vmovq {{.*#+}} xmm2 = xmm2[0],zero ; KNL_32-NEXT: vpxor %ymm3, %ymm3, %ymm3 ; KNL_32-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] @@ -939,7 +1076,8 @@ define void @test20(<2 x float>%a1, <2 x float*> %ptr, <2 x i1> %mask) { ; ; SKX-LABEL: test20: ; SKX: # BB#0: -; SKX: vpsllq $63, %xmm2, %xmm2 +; SKX-NEXT: # kill: %XMM1 %XMM1 %YMM1 +; SKX-NEXT: vpsllq $63, %xmm2, %xmm2 ; SKX-NEXT: vptestmq %xmm2, %xmm2, %k0 ; SKX-NEXT: kshiftlb $6, %k0, %k0 ; SKX-NEXT: kshiftrb $6, %k0, %k1 @@ -964,7 +1102,8 @@ define void @test21(<2 x i32>%a1, <2 x i32*> %ptr, <2 x i1>%mask) { ; ; KNL_64-LABEL: test21: ; KNL_64: # BB#0: -; KNL_64: vpxord %zmm3, %zmm3, %zmm3 +; KNL_64-NEXT: # kill: %XMM1 %XMM1 %ZMM1 +; KNL_64-NEXT: vpxord %zmm3, %zmm3, %zmm3 ; KNL_64-NEXT: vinserti32x4 $0, %xmm2, %zmm3, %zmm2 ; KNL_64-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; 
KNL_64-NEXT: vpsllq $63, %zmm2, %zmm2 @@ -974,17 +1113,19 @@ define void @test21(<2 x i32>%a1, <2 x i32*> %ptr, <2 x i1>%mask) { ; ; KNL_32-LABEL: test21: ; KNL_32: # BB#0: -; KNL_32: vpxord %zmm3, %zmm3, %zmm3 +; KNL_32-NEXT: # kill: %XMM1 %XMM1 %ZMM1 +; KNL_32-NEXT: vpxord %zmm3, %zmm3, %zmm3 ; KNL_32-NEXT: vinserti32x4 $0, %xmm2, %zmm3, %zmm2 ; KNL_32-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; KNL_32-NEXT: vpsllvq .LCPI20_0, %zmm2, %zmm2 +; KNL_32-NEXT: vpsllvq {{\.LCPI.*}}, %zmm2, %zmm2 ; KNL_32-NEXT: vptestmq %zmm2, %zmm2, %k1 ; KNL_32-NEXT: vpscatterqd %ymm0, (,%zmm1) {%k1} ; KNL_32-NEXT: retl ; ; SKX-LABEL: test21: ; SKX: # BB#0: -; SKX: vpsllq $63, %xmm2, %xmm2 +; SKX-NEXT: # kill: %XMM1 %XMM1 %YMM1 +; SKX-NEXT: vpsllq $63, %xmm2, %xmm2 ; SKX-NEXT: vptestmq %xmm2, %xmm2, %k0 ; SKX-NEXT: kshiftlb $6, %k0, %k0 ; SKX-NEXT: kshiftrb $6, %k0, %k1 @@ -994,7 +1135,8 @@ define void @test21(<2 x i32>%a1, <2 x i32*> %ptr, <2 x i1>%mask) { ; ; SKX_32-LABEL: test21: ; SKX_32: # BB#0: -; SKX_32: vpsllq $63, %xmm2, %xmm2 +; SKX_32-NEXT: # kill: %XMM1 %XMM1 %YMM1 +; SKX_32-NEXT: vpsllq $63, %xmm2, %xmm2 ; SKX_32-NEXT: vptestmq %xmm2, %xmm2, %k0 ; SKX_32-NEXT: kshiftlb $6, %k0, %k0 ; SKX_32-NEXT: kshiftrb $6, %k0, %k1 @@ -1013,7 +1155,8 @@ define <2 x float> @test22(float* %base, <2 x i32> %ind, <2 x i1> %mask, <2 x fl ; ; KNL_64-LABEL: test22: ; KNL_64: # BB#0: -; KNL_64: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; KNL_64-NEXT: # kill: %XMM2 %XMM2 %YMM2 +; KNL_64-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] ; KNL_64-NEXT: vmovq {{.*#+}} xmm1 = xmm1[0],zero ; KNL_64-NEXT: vpxor %ymm3, %ymm3, %ymm3 ; KNL_64-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7] @@ -1027,7 +1170,8 @@ define <2 x float> @test22(float* %base, <2 x i32> %ind, <2 x i1> %mask, <2 x fl ; ; KNL_32-LABEL: test22: ; KNL_32: # BB#0: -; KNL_32: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; KNL_32-NEXT: # kill: %XMM2 %XMM2 %YMM2 +; KNL_32-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] ; KNL_32-NEXT: vmovq {{.*#+}} xmm1 = xmm1[0],zero ; KNL_32-NEXT: vpxor %ymm3, %ymm3, %ymm3 ; KNL_32-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7] @@ -1075,7 +1219,9 @@ define <2 x i32> @test23(i32* %base, <2 x i32> %ind, <2 x i1> %mask, <2 x i32> % ; ; KNL_64-LABEL: test23: ; KNL_64: # BB#0: -; KNL_64: vpxord %zmm3, %zmm3, %zmm3 +; KNL_64-NEXT: # kill: %XMM2 %XMM2 %ZMM2 +; KNL_64-NEXT: # kill: %XMM0 %XMM0 %ZMM0 +; KNL_64-NEXT: vpxord %zmm3, %zmm3, %zmm3 ; KNL_64-NEXT: vinserti32x4 $0, %xmm1, %zmm3, %zmm1 ; KNL_64-NEXT: vpsllq $63, %zmm1, %zmm1 ; KNL_64-NEXT: vptestmq %zmm1, %zmm1, %k1 @@ -1085,10 +1231,12 @@ define <2 x i32> @test23(i32* %base, <2 x i32> %ind, <2 x i1> %mask, <2 x i32> % ; ; KNL_32-LABEL: test23: ; KNL_32: # BB#0: -; KNL_32: vpxord %zmm3, %zmm3, %zmm3 +; KNL_32-NEXT: # kill: %XMM2 %XMM2 %ZMM2 +; KNL_32-NEXT: # kill: %XMM0 %XMM0 %ZMM0 +; KNL_32-NEXT: vpxord %zmm3, %zmm3, %zmm3 ; KNL_32-NEXT: vinserti32x4 $0, %xmm1, %zmm3, %zmm1 ; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax -; KNL_32-NEXT: vpsllvq .LCPI22_0, %zmm1, %zmm1 +; KNL_32-NEXT: vpsllvq {{\.LCPI.*}}, %zmm1, %zmm1 ; KNL_32-NEXT: vptestmq %zmm1, %zmm1, %k1 ; KNL_32-NEXT: vpgatherqq (%eax,%zmm0,8), %zmm2 {%k1} ; KNL_32-NEXT: vmovdqa64 %zmm2, %zmm0 @@ -1119,7 +1267,8 @@ define <2 x i32> @test23(i32* %base, <2 x i32> %ind, <2 x i1> %mask, <2 x i32> % define <2 x i32> @test24(i32* %base, <2 x i32> %ind) { ; KNL_64-LABEL: test24: ; KNL_64: # BB#0: -; KNL_64: movb $3, %al +; KNL_64-NEXT: # kill: %XMM0 %XMM0 %ZMM0 +; KNL_64-NEXT: movb $3, %al ; KNL_64-NEXT: kmovw %eax, %k1 ; 
KNL_64-NEXT: vpgatherqq (%rdi,%zmm0,8), %zmm1 {%k1} ; KNL_64-NEXT: vmovdqa64 %zmm1, %zmm0 @@ -1127,10 +1276,11 @@ define <2 x i32> @test24(i32* %base, <2 x i32> %ind) { ; ; KNL_32-LABEL: test24: ; KNL_32: # BB#0: -; KNL_32: movl {{[0-9]+}}(%esp), %eax +; KNL_32-NEXT: # kill: %XMM0 %XMM0 %ZMM0 +; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax ; KNL_32-NEXT: vpxord %zmm1, %zmm1, %zmm1 -; KNL_32-NEXT: vinserti32x4 $0, .LCPI23_0, %zmm1, %zmm1 -; KNL_32-NEXT: vpsllvq .LCPI23_1, %zmm1, %zmm1 +; KNL_32-NEXT: vinserti32x4 $0, {{\.LCPI.*}}, %zmm1, %zmm1 +; KNL_32-NEXT: vpsllvq {{\.LCPI.*}}, %zmm1, %zmm1 ; KNL_32-NEXT: vptestmq %zmm1, %zmm1, %k1 ; KNL_32-NEXT: vpgatherqq (%eax,%zmm0,8), %zmm1 {%k1} ; KNL_32-NEXT: vmovdqa64 %zmm1, %zmm0 @@ -1160,7 +1310,9 @@ define <2 x i64> @test25(i64* %base, <2 x i32> %ind, <2 x i1> %mask, <2 x i64> % ; ; KNL_64-LABEL: test25: ; KNL_64: # BB#0: -; KNL_64: vpxord %zmm3, %zmm3, %zmm3 +; KNL_64-NEXT: # kill: %XMM2 %XMM2 %ZMM2 +; KNL_64-NEXT: # kill: %XMM0 %XMM0 %ZMM0 +; KNL_64-NEXT: vpxord %zmm3, %zmm3, %zmm3 ; KNL_64-NEXT: vinserti32x4 $0, %xmm1, %zmm3, %zmm1 ; KNL_64-NEXT: vpsllq $63, %zmm1, %zmm1 ; KNL_64-NEXT: vptestmq %zmm1, %zmm1, %k1 @@ -1170,10 +1322,12 @@ define <2 x i64> @test25(i64* %base, <2 x i32> %ind, <2 x i1> %mask, <2 x i64> % ; ; KNL_32-LABEL: test25: ; KNL_32: # BB#0: -; KNL_32: vpxord %zmm3, %zmm3, %zmm3 +; KNL_32-NEXT: # kill: %XMM2 %XMM2 %ZMM2 +; KNL_32-NEXT: # kill: %XMM0 %XMM0 %ZMM0 +; KNL_32-NEXT: vpxord %zmm3, %zmm3, %zmm3 ; KNL_32-NEXT: vinserti32x4 $0, %xmm1, %zmm3, %zmm1 ; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax -; KNL_32-NEXT: vpsllvq .LCPI24_0, %zmm1, %zmm1 +; KNL_32-NEXT: vpsllvq {{\.LCPI.*}}, %zmm1, %zmm1 ; KNL_32-NEXT: vptestmq %zmm1, %zmm1, %k1 ; KNL_32-NEXT: vpgatherqq (%eax,%zmm0,8), %zmm2 {%k1} ; KNL_32-NEXT: vmovdqa64 %zmm2, %zmm0 @@ -1205,7 +1359,9 @@ define <2 x i64> @test26(i64* %base, <2 x i32> %ind, <2 x i64> %src0) { ; ; KNL_64-LABEL: test26: ; KNL_64: # BB#0: -; KNL_64: movb $3, %al +; KNL_64-NEXT: # kill: %XMM1 %XMM1 %ZMM1 +; KNL_64-NEXT: # kill: %XMM0 %XMM0 %ZMM0 +; KNL_64-NEXT: movb $3, %al ; KNL_64-NEXT: kmovw %eax, %k1 ; KNL_64-NEXT: vpgatherqq (%rdi,%zmm0,8), %zmm1 {%k1} ; KNL_64-NEXT: vmovdqa64 %zmm1, %zmm0 @@ -1213,10 +1369,12 @@ define <2 x i64> @test26(i64* %base, <2 x i32> %ind, <2 x i64> %src0) { ; ; KNL_32-LABEL: test26: ; KNL_32: # BB#0: -; KNL_32: movl {{[0-9]+}}(%esp), %eax +; KNL_32-NEXT: # kill: %XMM1 %XMM1 %ZMM1 +; KNL_32-NEXT: # kill: %XMM0 %XMM0 %ZMM0 +; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax ; KNL_32-NEXT: vpxord %zmm2, %zmm2, %zmm2 -; KNL_32-NEXT: vinserti32x4 $0, .LCPI25_0, %zmm2, %zmm2 -; KNL_32-NEXT: vpsllvq .LCPI25_1, %zmm2, %zmm2 +; KNL_32-NEXT: vinserti32x4 $0, {{\.LCPI.*}}, %zmm2, %zmm2 +; KNL_32-NEXT: vpsllvq {{\.LCPI.*}}, %zmm2, %zmm2 ; KNL_32-NEXT: vptestmq %zmm2, %zmm2, %k1 ; KNL_32-NEXT: vpgatherqq (%eax,%zmm0,8), %zmm1 {%k1} ; KNL_32-NEXT: vmovdqa64 %zmm1, %zmm0 @@ -1252,7 +1410,7 @@ define <2 x float> @test27(float* %base, <2 x i32> %ind) { ; KNL_64-NEXT: movb $3, %al ; KNL_64-NEXT: kmovw %eax, %k1 ; KNL_64-NEXT: vgatherqps (%rdi,%zmm1,4), %ymm0 {%k1} -; KNL_64-NEXT: # kill +; KNL_64-NEXT: # kill: %XMM0 %XMM0 %YMM0 ; KNL_64-NEXT: retq ; ; KNL_32-LABEL: test27: @@ -1263,7 +1421,7 @@ define <2 x float> @test27(float* %base, <2 x i32> %ind) { ; KNL_32-NEXT: movb $3, %cl ; KNL_32-NEXT: kmovw %ecx, %k1 ; KNL_32-NEXT: vgatherqps (%eax,%zmm1,4), %ymm0 {%k1} -; KNL_32-NEXT: # kill +; KNL_32-NEXT: # kill: %XMM0 %XMM0 %YMM0 ; KNL_32-NEXT: retl ; ; SKX-LABEL: test27: @@ -1273,6 +1431,15 @@ define <2 
x float> @test27(float* %base, <2 x i32> %ind) { ; SKX-NEXT: kmovb %eax, %k1 ; SKX-NEXT: vgatherdps (%rdi,%xmm1,4), %xmm0 {%k1} ; SKX-NEXT: retq +; +; SKX_32-LABEL: test27: +; SKX_32: # BB#0: +; SKX_32-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[0,2,2,3] +; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax +; SKX_32-NEXT: movb $3, %cl +; SKX_32-NEXT: kmovb %ecx, %k1 +; SKX_32-NEXT: vgatherdps (%eax,%xmm1,4), %xmm0 {%k1} +; SKX_32-NEXT: retl %sext_ind = sext <2 x i32> %ind to <2 x i64> %gep.random = getelementptr float, float* %base, <2 x i64> %sext_ind %res = call <2 x float> @llvm.masked.gather.v2f32(<2 x float*> %gep.random, i32 4, <2 x i1> , <2 x float> undef) @@ -1285,7 +1452,8 @@ define void @test28(<2 x i32>%a1, <2 x i32*> %ptr) { ; ; KNL_64-LABEL: test28: ; KNL_64: # BB#0: -; KNL_64: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; KNL_64-NEXT: # kill: %XMM1 %XMM1 %ZMM1 +; KNL_64-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; KNL_64-NEXT: movb $3, %al ; KNL_64-NEXT: kmovw %eax, %k1 ; KNL_64-NEXT: vpscatterqd %ymm0, (,%zmm1) {%k1} @@ -1293,17 +1461,19 @@ define void @test28(<2 x i32>%a1, <2 x i32*> %ptr) { ; ; KNL_32-LABEL: test28: ; KNL_32: # BB#0: -; KNL_32: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; KNL_32-NEXT: # kill: %XMM1 %XMM1 %ZMM1 +; KNL_32-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; KNL_32-NEXT: vpxord %zmm2, %zmm2, %zmm2 -; KNL_32-NEXT: vinserti32x4 $0, .LCPI27_0, %zmm2, %zmm2 -; KNL_32-NEXT: vpsllvq .LCPI27_1, %zmm2, %zmm2 +; KNL_32-NEXT: vinserti32x4 $0, {{\.LCPI.*}}, %zmm2, %zmm2 +; KNL_32-NEXT: vpsllvq {{\.LCPI.*}}, %zmm2, %zmm2 ; KNL_32-NEXT: vptestmq %zmm2, %zmm2, %k1 ; KNL_32-NEXT: vpscatterqd %ymm0, (,%zmm1) {%k1} ; KNL_32-NEXT: retl ; ; SKX-LABEL: test28: ; SKX: # BB#0: -; SKX: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; SKX-NEXT: # kill: %XMM1 %XMM1 %YMM1 +; SKX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; SKX-NEXT: movb $3, %al ; SKX-NEXT: kmovb %eax, %k1 ; SKX-NEXT: vpscatterqd %xmm0, (,%ymm1) {%k1} @@ -1311,7 +1481,8 @@ define void @test28(<2 x i32>%a1, <2 x i32*> %ptr) { ; ; SKX_32-LABEL: test28: ; SKX_32: # BB#0: -; SKX_32: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; SKX_32-NEXT: # kill: %XMM1 %XMM1 %YMM1 +; SKX_32-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; SKX_32-NEXT: movb $3, %al ; SKX_32-NEXT: kmovb %eax, %k1 ; SKX_32-NEXT: vpscatterqd %xmm0, (,%ymm1) {%k1} @@ -1353,6 +1524,15 @@ define <16 x float> @test29(float* %base, <16 x i32> %ind) { ; SKX-NEXT: vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1} ; SKX-NEXT: vmovaps %zmm1, %zmm0 ; SKX-NEXT: retq +; +; SKX_32-LABEL: test29: +; SKX_32: # BB#0: +; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax +; SKX_32-NEXT: movw $44, %cx +; SKX_32-NEXT: kmovw %ecx, %k1 +; SKX_32-NEXT: vgatherdps (%eax,%zmm0,4), %zmm1 {%k1} +; SKX_32-NEXT: vmovaps %zmm1, %zmm0 +; SKX_32-NEXT: retl %broadcast.splatinsert = insertelement <16 x float*> undef, float* %base, i32 0 %broadcast.splat = shufflevector <16 x float*> %broadcast.splatinsert, <16 x float*> undef, <16 x i32> zeroinitializer @@ -1370,9 +1550,12 @@ define <3 x i32> @test30(<3 x i32*> %base, <3 x i32> %ind, <3 x i1> %mask, <3 x ; KNL_64-LABEL: test30: ; KNL_64: # BB#0: ; KNL_64-NEXT: andl $1, %edx +; KNL_64-NEXT: kmovw %edx, %k1 ; KNL_64-NEXT: andl $1, %esi +; KNL_64-NEXT: kmovw %esi, %k2 ; KNL_64-NEXT: movl %edi, %eax ; KNL_64-NEXT: andl $1, %eax +; KNL_64-NEXT: kmovw %eax, %k0 ; KNL_64-NEXT: vpmovsxdq %xmm1, %ymm1 ; KNL_64-NEXT: vpsllq $2, %ymm1, %ymm1 ; KNL_64-NEXT: vpaddq %ymm1, %ymm0, %ymm1 @@ -1380,76 +1563,81 @@ define <3 x i32> @test30(<3 x i32*> %base, <3 x i32> %ind, <3 x i1> %mask, <3 x ; KNL_64-NEXT: 
testb $1, %dil ; KNL_64-NEXT: je .LBB29_2 ; KNL_64-NEXT: # BB#1: # %cond.load -; KNL_64-NEXT: vmovq %xmm1, %rcx +; KNL_64-NEXT: vmovq %xmm1, %rax ; KNL_64-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero ; KNL_64-NEXT: .LBB29_2: # %else -; KNL_64-NEXT: testb %sil, %sil +; KNL_64-NEXT: kmovw %k2, %eax +; KNL_64-NEXT: movl %eax, %ecx +; KNL_64-NEXT: andl $1, %ecx +; KNL_64-NEXT: testb %cl, %cl ; KNL_64-NEXT: je .LBB29_4 ; KNL_64-NEXT: # BB#3: # %cond.load1 ; KNL_64-NEXT: vpextrq $1, %xmm1, %rcx ; KNL_64-NEXT: vpinsrd $1, (%rcx), %xmm0, %xmm0 ; KNL_64-NEXT: .LBB29_4: # %else2 +; KNL_64-NEXT: kmovw %k1, %ecx +; KNL_64-NEXT: movl %ecx, %edx +; KNL_64-NEXT: andl $1, %edx ; KNL_64-NEXT: testb %dl, %dl ; KNL_64-NEXT: je .LBB29_6 ; KNL_64-NEXT: # BB#5: # %cond.load4 ; KNL_64-NEXT: vextracti128 $1, %ymm1, %xmm1 -; KNL_64-NEXT: vmovq %xmm1, %rcx -; KNL_64-NEXT: vpinsrd $2, (%rcx), %xmm0, %xmm0 +; KNL_64-NEXT: vmovq %xmm1, %rdx +; KNL_64-NEXT: vpinsrd $2, (%rdx), %xmm0, %xmm0 ; KNL_64-NEXT: .LBB29_6: # %else5 -; KNL_64-NEXT: vmovd %eax, %xmm1 -; KNL_64-NEXT: vpinsrd $1, %esi, %xmm1, %xmm1 -; KNL_64-NEXT: vpinsrd $2, %edx, %xmm1, %xmm1 +; KNL_64-NEXT: kmovw %k0, %edx +; KNL_64-NEXT: vmovd %edx, %xmm1 +; KNL_64-NEXT: vpinsrd $1, %eax, %xmm1, %xmm1 +; KNL_64-NEXT: vpinsrd $2, %ecx, %xmm1, %xmm1 ; KNL_64-NEXT: vpslld $31, %xmm1, %xmm1 ; KNL_64-NEXT: vblendvps %xmm1, %xmm0, %xmm2, %xmm0 ; KNL_64-NEXT: retq ; ; KNL_32-LABEL: test30: ; KNL_32: # BB#0: -; KNL_32-NEXT: pushl %ebx -; KNL_32-NEXT: .Ltmp0: -; KNL_32-NEXT: .cfi_def_cfa_offset 8 -; KNL_32-NEXT: pushl %esi -; KNL_32-NEXT: .Ltmp1: -; KNL_32-NEXT: .cfi_def_cfa_offset 12 -; KNL_32-NEXT: .Ltmp2: -; KNL_32-NEXT: .cfi_offset %esi, -12 -; KNL_32-NEXT: .Ltmp3: -; KNL_32-NEXT: .cfi_offset %ebx, -8 ; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax ; KNL_32-NEXT: andl $1, %eax -; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %ecx +; KNL_32-NEXT: kmovw %eax, %k1 +; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax +; KNL_32-NEXT: andl $1, %eax +; KNL_32-NEXT: kmovw %eax, %k2 +; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax +; KNL_32-NEXT: movl %eax, %ecx ; KNL_32-NEXT: andl $1, %ecx -; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %ebx -; KNL_32-NEXT: movl %ebx, %edx -; KNL_32-NEXT: andl $1, %edx +; KNL_32-NEXT: kmovw %ecx, %k0 ; KNL_32-NEXT: vpslld $2, %xmm1, %xmm1 ; KNL_32-NEXT: vpaddd %xmm1, %xmm0, %xmm1 ; KNL_32-NEXT: # implicit-def: %XMM0 -; KNL_32-NEXT: testb $1, %bl +; KNL_32-NEXT: testb $1, %al ; KNL_32-NEXT: je .LBB29_2 ; KNL_32-NEXT: # BB#1: # %cond.load -; KNL_32-NEXT: vmovd %xmm1, %esi +; KNL_32-NEXT: vmovd %xmm1, %eax ; KNL_32-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero ; KNL_32-NEXT: .LBB29_2: # %else +; KNL_32-NEXT: kmovw %k2, %eax +; KNL_32-NEXT: movl %eax, %ecx +; KNL_32-NEXT: andl $1, %ecx ; KNL_32-NEXT: testb %cl, %cl ; KNL_32-NEXT: je .LBB29_4 ; KNL_32-NEXT: # BB#3: # %cond.load1 -; KNL_32-NEXT: vpextrd $1, %xmm1, %esi -; KNL_32-NEXT: vpinsrd $1, (%esi), %xmm0, %xmm0 +; KNL_32-NEXT: vpextrd $1, %xmm1, %ecx +; KNL_32-NEXT: vpinsrd $1, (%ecx), %xmm0, %xmm0 ; KNL_32-NEXT: .LBB29_4: # %else2 -; KNL_32-NEXT: testb %al, %al +; KNL_32-NEXT: kmovw %k1, %ecx +; KNL_32-NEXT: movl %ecx, %edx +; KNL_32-NEXT: andl $1, %edx +; KNL_32-NEXT: testb %dl, %dl ; KNL_32-NEXT: je .LBB29_6 ; KNL_32-NEXT: # BB#5: # %cond.load4 -; KNL_32-NEXT: vpextrd $2, %xmm1, %esi -; KNL_32-NEXT: vpinsrd $2, (%esi), %xmm0, %xmm0 +; KNL_32-NEXT: vpextrd $2, %xmm1, %edx +; KNL_32-NEXT: vpinsrd $2, (%edx), %xmm0, %xmm0 ; KNL_32-NEXT: .LBB29_6: # %else5 +; KNL_32-NEXT: kmovw %k0, %edx ; KNL_32-NEXT: vmovd %edx, %xmm1 
-; KNL_32-NEXT: vpinsrd $1, %ecx, %xmm1, %xmm1 -; KNL_32-NEXT: vpinsrd $2, %eax, %xmm1, %xmm1 +; KNL_32-NEXT: vpinsrd $1, %eax, %xmm1, %xmm1 +; KNL_32-NEXT: vpinsrd $2, %ecx, %xmm1, %xmm1 ; KNL_32-NEXT: vpslld $31, %xmm1, %xmm1 ; KNL_32-NEXT: vblendvps %xmm1, %xmm0, %xmm2, %xmm0 -; KNL_32-NEXT: popl %esi -; KNL_32-NEXT: popl %ebx ; KNL_32-NEXT: retl ; ; SKX-LABEL: test30: @@ -1460,35 +1648,38 @@ define <3 x i32> @test30(<3 x i32*> %base, <3 x i32> %ind, <3 x i1> %mask, <3 x ; SKX-NEXT: kshiftrw $15, %k0, %k0 ; SKX-NEXT: vpmovsxdq %xmm1, %ymm1 ; SKX-NEXT: vpsllq $2, %ymm1, %ymm1 -; SKX-NEXT: vpaddq %ymm1, %ymm0, %ymm1 +; SKX-NEXT: vpaddq %ymm1, %ymm0, %ymm0 ; SKX-NEXT: kmovw %k0, %eax -; SKX-NEXT: # implicit-def: %XMM0 +; SKX-NEXT: andl $1, %eax +; SKX-NEXT: # implicit-def: %XMM1 ; SKX-NEXT: testb %al, %al ; SKX-NEXT: je .LBB29_2 ; SKX-NEXT: # BB#1: # %cond.load -; SKX-NEXT: vmovq %xmm1, %rax -; SKX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; SKX-NEXT: vmovq %xmm0, %rax +; SKX-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero ; SKX-NEXT: .LBB29_2: # %else ; SKX-NEXT: kshiftlw $14, %k1, %k0 ; SKX-NEXT: kshiftrw $15, %k0, %k0 ; SKX-NEXT: kmovw %k0, %eax +; SKX-NEXT: andl $1, %eax ; SKX-NEXT: testb %al, %al ; SKX-NEXT: je .LBB29_4 ; SKX-NEXT: # BB#3: # %cond.load1 -; SKX-NEXT: vpextrq $1, %xmm1, %rax -; SKX-NEXT: vpinsrd $1, (%rax), %xmm0, %xmm0 +; SKX-NEXT: vpextrq $1, %xmm0, %rax +; SKX-NEXT: vpinsrd $1, (%rax), %xmm1, %xmm1 ; SKX-NEXT: .LBB29_4: # %else2 ; SKX-NEXT: kshiftlw $13, %k1, %k0 ; SKX-NEXT: kshiftrw $15, %k0, %k0 ; SKX-NEXT: kmovw %k0, %eax +; SKX-NEXT: andl $1, %eax ; SKX-NEXT: testb %al, %al ; SKX-NEXT: je .LBB29_6 ; SKX-NEXT: # BB#5: # %cond.load4 -; SKX-NEXT: vextracti64x2 $1, %ymm1, %xmm1 -; SKX-NEXT: vmovq %xmm1, %rax -; SKX-NEXT: vpinsrd $2, (%rax), %xmm0, %xmm0 +; SKX-NEXT: vextracti64x2 $1, %ymm0, %xmm0 +; SKX-NEXT: vmovq %xmm0, %rax +; SKX-NEXT: vpinsrd $2, (%rax), %xmm1, %xmm1 ; SKX-NEXT: .LBB29_6: # %else5 -; SKX-NEXT: vpblendmd %xmm0, %xmm3, %xmm0 {%k1} +; SKX-NEXT: vpblendmd %xmm1, %xmm3, %xmm0 {%k1} ; SKX-NEXT: retq ; ; SKX_32-LABEL: test30: @@ -1501,35 +1692,38 @@ define <3 x i32> @test30(<3 x i32*> %base, <3 x i32> %ind, <3 x i1> %mask, <3 x ; SKX_32-NEXT: kshiftlw $15, %k1, %k0 ; SKX_32-NEXT: kshiftrw $15, %k0, %k0 ; SKX_32-NEXT: vpslld $2, %xmm1, %xmm1 -; SKX_32-NEXT: vpaddd %xmm1, %xmm0, %xmm1 +; SKX_32-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; SKX_32-NEXT: kmovw %k0, %eax -; SKX_32-NEXT: # implicit-def: %XMM0 +; SKX_32-NEXT: andl $1, %eax +; SKX_32-NEXT: # implicit-def: %XMM1 ; SKX_32-NEXT: testb %al, %al ; SKX_32-NEXT: je .LBB29_2 ; SKX_32-NEXT: # BB#1: # %cond.load -; SKX_32-NEXT: vmovd %xmm1, %eax -; SKX_32-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; SKX_32-NEXT: vmovd %xmm0, %eax +; SKX_32-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero ; SKX_32-NEXT: .LBB29_2: # %else ; SKX_32-NEXT: kshiftlw $14, %k1, %k0 ; SKX_32-NEXT: kshiftrw $15, %k0, %k0 ; SKX_32-NEXT: kmovw %k0, %eax +; SKX_32-NEXT: andl $1, %eax ; SKX_32-NEXT: testb %al, %al ; SKX_32-NEXT: je .LBB29_4 ; SKX_32-NEXT: # BB#3: # %cond.load1 -; SKX_32-NEXT: vpextrd $1, %xmm1, %eax -; SKX_32-NEXT: vpinsrd $1, (%eax), %xmm0, %xmm0 +; SKX_32-NEXT: vpextrd $1, %xmm0, %eax +; SKX_32-NEXT: vpinsrd $1, (%eax), %xmm1, %xmm1 ; SKX_32-NEXT: .LBB29_4: # %else2 ; SKX_32-NEXT: vmovdqa32 {{[0-9]+}}(%esp), %xmm2 ; SKX_32-NEXT: kshiftlw $13, %k1, %k0 ; SKX_32-NEXT: kshiftrw $15, %k0, %k0 ; SKX_32-NEXT: kmovw %k0, %eax +; SKX_32-NEXT: andl $1, %eax ; SKX_32-NEXT: testb %al, %al ; SKX_32-NEXT: je .LBB29_6 ; 
SKX_32-NEXT: # BB#5: # %cond.load4 -; SKX_32-NEXT: vpextrd $2, %xmm1, %eax -; SKX_32-NEXT: vpinsrd $2, (%eax), %xmm0, %xmm0 +; SKX_32-NEXT: vpextrd $2, %xmm0, %eax +; SKX_32-NEXT: vpinsrd $2, (%eax), %xmm1, %xmm1 ; SKX_32-NEXT: .LBB29_6: # %else5 -; SKX_32-NEXT: vpblendmd %xmm0, %xmm2, %xmm0 {%k1} +; SKX_32-NEXT: vpblendmd %xmm1, %xmm2, %xmm0 {%k1} ; SKX_32-NEXT: addl $12, %esp ; SKX_32-NEXT: retl @@ -1646,12 +1840,12 @@ define <16 x i64> @test_gather_16i64(<16 x i64*> %ptrs, <16 x i1> %mask, <16 x i ; KNL_32-LABEL: test_gather_16i64: ; KNL_32: # BB#0: ; KNL_32-NEXT: pushl %ebp -; KNL_32-NEXT: .Ltmp4: +; KNL_32-NEXT: .Ltmp0: ; KNL_32-NEXT: .cfi_def_cfa_offset 8 -; KNL_32-NEXT: .Ltmp5: +; KNL_32-NEXT: .Ltmp1: ; KNL_32-NEXT: .cfi_offset %ebp, -8 ; KNL_32-NEXT: movl %esp, %ebp -; KNL_32-NEXT: .Ltmp6: +; KNL_32-NEXT: .Ltmp2: ; KNL_32-NEXT: .cfi_def_cfa_register %ebp ; KNL_32-NEXT: andl $-64, %esp ; KNL_32-NEXT: subl $64, %esp @@ -1769,12 +1963,12 @@ define <16 x double> @test_gather_16f64(<16 x double*> %ptrs, <16 x i1> %mask, < ; KNL_32-LABEL: test_gather_16f64: ; KNL_32: # BB#0: ; KNL_32-NEXT: pushl %ebp -; KNL_32-NEXT: .Ltmp7: +; KNL_32-NEXT: .Ltmp3: ; KNL_32-NEXT: .cfi_def_cfa_offset 8 -; KNL_32-NEXT: .Ltmp8: +; KNL_32-NEXT: .Ltmp4: ; KNL_32-NEXT: .cfi_offset %ebp, -8 ; KNL_32-NEXT: movl %esp, %ebp -; KNL_32-NEXT: .Ltmp9: +; KNL_32-NEXT: .Ltmp5: ; KNL_32-NEXT: .cfi_def_cfa_register %ebp ; KNL_32-NEXT: andl $-64, %esp ; KNL_32-NEXT: subl $64, %esp @@ -1886,12 +2080,12 @@ define void @test_scatter_16i64(<16 x i64*> %ptrs, <16 x i1> %mask, <16 x i64> % ; KNL_32-LABEL: test_scatter_16i64: ; KNL_32: # BB#0: ; KNL_32-NEXT: pushl %ebp -; KNL_32-NEXT: .Ltmp10: +; KNL_32-NEXT: .Ltmp6: ; KNL_32-NEXT: .cfi_def_cfa_offset 8 -; KNL_32-NEXT: .Ltmp11: +; KNL_32-NEXT: .Ltmp7: ; KNL_32-NEXT: .cfi_offset %ebp, -8 ; KNL_32-NEXT: movl %esp, %ebp -; KNL_32-NEXT: .Ltmp12: +; KNL_32-NEXT: .Ltmp8: ; KNL_32-NEXT: .cfi_def_cfa_register %ebp ; KNL_32-NEXT: andl $-64, %esp ; KNL_32-NEXT: subl $64, %esp @@ -2000,12 +2194,12 @@ define void @test_scatter_16f64(<16 x double*> %ptrs, <16 x i1> %mask, <16 x dou ; KNL_32-LABEL: test_scatter_16f64: ; KNL_32: # BB#0: ; KNL_32-NEXT: pushl %ebp -; KNL_32-NEXT: .Ltmp13: +; KNL_32-NEXT: .Ltmp9: ; KNL_32-NEXT: .cfi_def_cfa_offset 8 -; KNL_32-NEXT: .Ltmp14: +; KNL_32-NEXT: .Ltmp10: ; KNL_32-NEXT: .cfi_offset %ebp, -8 ; KNL_32-NEXT: movl %esp, %ebp -; KNL_32-NEXT: .Ltmp15: +; KNL_32-NEXT: .Ltmp11: ; KNL_32-NEXT: .cfi_def_cfa_register %ebp ; KNL_32-NEXT: andl $-64, %esp ; KNL_32-NEXT: subl $64, %esp diff --git a/llvm/test/CodeGen/X86/masked_memop.ll b/llvm/test/CodeGen/X86/masked_memop.ll index f9118fe..f2f91ee 100644 --- a/llvm/test/CodeGen/X86/masked_memop.ll +++ b/llvm/test/CodeGen/X86/masked_memop.ll @@ -2346,6 +2346,7 @@ define <16 x i8> @test_mask_load_16xi8(<16 x i1> %mask, <16 x i8>* %addr, <16 x ; AVX512F-NEXT: kshiftlw $15, %k1, %k0 ; AVX512F-NEXT: kshiftrw $15, %k0, %k0 ; AVX512F-NEXT: kmovw %k0, %eax +; AVX512F-NEXT: andl $1, %eax ; AVX512F-NEXT: ## implicit-def: %XMM0 ; AVX512F-NEXT: testb %al, %al ; AVX512F-NEXT: je LBB50_2 @@ -2356,6 +2357,7 @@ define <16 x i8> @test_mask_load_16xi8(<16 x i1> %mask, <16 x i8>* %addr, <16 x ; AVX512F-NEXT: kshiftlw $14, %k1, %k0 ; AVX512F-NEXT: kshiftrw $15, %k0, %k0 ; AVX512F-NEXT: kmovw %k0, %eax +; AVX512F-NEXT: andl $1, %eax ; AVX512F-NEXT: testb %al, %al ; AVX512F-NEXT: je LBB50_4 ; AVX512F-NEXT: ## BB#3: ## %cond.load1 @@ -2364,6 +2366,7 @@ define <16 x i8> @test_mask_load_16xi8(<16 x i1> %mask, <16 x i8>* %addr, <16 x ; 
AVX512F-NEXT: kshiftlw $13, %k1, %k0 ; AVX512F-NEXT: kshiftrw $15, %k0, %k0 ; AVX512F-NEXT: kmovw %k0, %eax +; AVX512F-NEXT: andl $1, %eax ; AVX512F-NEXT: testb %al, %al ; AVX512F-NEXT: je LBB50_6 ; AVX512F-NEXT: ## BB#5: ## %cond.load4 @@ -2372,6 +2375,7 @@ define <16 x i8> @test_mask_load_16xi8(<16 x i1> %mask, <16 x i8>* %addr, <16 x ; AVX512F-NEXT: kshiftlw $12, %k1, %k0 ; AVX512F-NEXT: kshiftrw $15, %k0, %k0 ; AVX512F-NEXT: kmovw %k0, %eax +; AVX512F-NEXT: andl $1, %eax ; AVX512F-NEXT: testb %al, %al ; AVX512F-NEXT: je LBB50_8 ; AVX512F-NEXT: ## BB#7: ## %cond.load7 @@ -2380,6 +2384,7 @@ define <16 x i8> @test_mask_load_16xi8(<16 x i1> %mask, <16 x i8>* %addr, <16 x ; AVX512F-NEXT: kshiftlw $11, %k1, %k0 ; AVX512F-NEXT: kshiftrw $15, %k0, %k0 ; AVX512F-NEXT: kmovw %k0, %eax +; AVX512F-NEXT: andl $1, %eax ; AVX512F-NEXT: testb %al, %al ; AVX512F-NEXT: je LBB50_10 ; AVX512F-NEXT: ## BB#9: ## %cond.load10 @@ -2388,6 +2393,7 @@ define <16 x i8> @test_mask_load_16xi8(<16 x i1> %mask, <16 x i8>* %addr, <16 x ; AVX512F-NEXT: kshiftlw $10, %k1, %k0 ; AVX512F-NEXT: kshiftrw $15, %k0, %k0 ; AVX512F-NEXT: kmovw %k0, %eax +; AVX512F-NEXT: andl $1, %eax ; AVX512F-NEXT: testb %al, %al ; AVX512F-NEXT: je LBB50_12 ; AVX512F-NEXT: ## BB#11: ## %cond.load13 @@ -2396,6 +2402,7 @@ define <16 x i8> @test_mask_load_16xi8(<16 x i1> %mask, <16 x i8>* %addr, <16 x ; AVX512F-NEXT: kshiftlw $9, %k1, %k0 ; AVX512F-NEXT: kshiftrw $15, %k0, %k0 ; AVX512F-NEXT: kmovw %k0, %eax +; AVX512F-NEXT: andl $1, %eax ; AVX512F-NEXT: testb %al, %al ; AVX512F-NEXT: je LBB50_14 ; AVX512F-NEXT: ## BB#13: ## %cond.load16 @@ -2404,6 +2411,7 @@ define <16 x i8> @test_mask_load_16xi8(<16 x i1> %mask, <16 x i8>* %addr, <16 x ; AVX512F-NEXT: kshiftlw $8, %k1, %k0 ; AVX512F-NEXT: kshiftrw $15, %k0, %k0 ; AVX512F-NEXT: kmovw %k0, %eax +; AVX512F-NEXT: andl $1, %eax ; AVX512F-NEXT: testb %al, %al ; AVX512F-NEXT: je LBB50_16 ; AVX512F-NEXT: ## BB#15: ## %cond.load19 @@ -2412,6 +2420,7 @@ define <16 x i8> @test_mask_load_16xi8(<16 x i1> %mask, <16 x i8>* %addr, <16 x ; AVX512F-NEXT: kshiftlw $7, %k1, %k0 ; AVX512F-NEXT: kshiftrw $15, %k0, %k0 ; AVX512F-NEXT: kmovw %k0, %eax +; AVX512F-NEXT: andl $1, %eax ; AVX512F-NEXT: testb %al, %al ; AVX512F-NEXT: je LBB50_18 ; AVX512F-NEXT: ## BB#17: ## %cond.load22 @@ -2420,6 +2429,7 @@ define <16 x i8> @test_mask_load_16xi8(<16 x i1> %mask, <16 x i8>* %addr, <16 x ; AVX512F-NEXT: kshiftlw $6, %k1, %k0 ; AVX512F-NEXT: kshiftrw $15, %k0, %k0 ; AVX512F-NEXT: kmovw %k0, %eax +; AVX512F-NEXT: andl $1, %eax ; AVX512F-NEXT: testb %al, %al ; AVX512F-NEXT: je LBB50_20 ; AVX512F-NEXT: ## BB#19: ## %cond.load25 @@ -2428,6 +2438,7 @@ define <16 x i8> @test_mask_load_16xi8(<16 x i1> %mask, <16 x i8>* %addr, <16 x ; AVX512F-NEXT: kshiftlw $5, %k1, %k0 ; AVX512F-NEXT: kshiftrw $15, %k0, %k0 ; AVX512F-NEXT: kmovw %k0, %eax +; AVX512F-NEXT: andl $1, %eax ; AVX512F-NEXT: testb %al, %al ; AVX512F-NEXT: je LBB50_22 ; AVX512F-NEXT: ## BB#21: ## %cond.load28 @@ -2436,6 +2447,7 @@ define <16 x i8> @test_mask_load_16xi8(<16 x i1> %mask, <16 x i8>* %addr, <16 x ; AVX512F-NEXT: kshiftlw $4, %k1, %k0 ; AVX512F-NEXT: kshiftrw $15, %k0, %k0 ; AVX512F-NEXT: kmovw %k0, %eax +; AVX512F-NEXT: andl $1, %eax ; AVX512F-NEXT: testb %al, %al ; AVX512F-NEXT: je LBB50_24 ; AVX512F-NEXT: ## BB#23: ## %cond.load31 @@ -2444,6 +2456,7 @@ define <16 x i8> @test_mask_load_16xi8(<16 x i1> %mask, <16 x i8>* %addr, <16 x ; AVX512F-NEXT: kshiftlw $3, %k1, %k0 ; AVX512F-NEXT: kshiftrw $15, %k0, %k0 ; AVX512F-NEXT: kmovw %k0, %eax +; AVX512F-NEXT: 
andl $1, %eax ; AVX512F-NEXT: testb %al, %al ; AVX512F-NEXT: je LBB50_26 ; AVX512F-NEXT: ## BB#25: ## %cond.load34 @@ -2452,6 +2465,7 @@ define <16 x i8> @test_mask_load_16xi8(<16 x i1> %mask, <16 x i8>* %addr, <16 x ; AVX512F-NEXT: kshiftlw $2, %k1, %k0 ; AVX512F-NEXT: kshiftrw $15, %k0, %k0 ; AVX512F-NEXT: kmovw %k0, %eax +; AVX512F-NEXT: andl $1, %eax ; AVX512F-NEXT: testb %al, %al ; AVX512F-NEXT: je LBB50_28 ; AVX512F-NEXT: ## BB#27: ## %cond.load37 @@ -2460,6 +2474,7 @@ define <16 x i8> @test_mask_load_16xi8(<16 x i1> %mask, <16 x i8>* %addr, <16 x ; AVX512F-NEXT: kshiftlw $1, %k1, %k0 ; AVX512F-NEXT: kshiftrw $15, %k0, %k0 ; AVX512F-NEXT: kmovw %k0, %eax +; AVX512F-NEXT: andl $1, %eax ; AVX512F-NEXT: testb %al, %al ; AVX512F-NEXT: je LBB50_30 ; AVX512F-NEXT: ## BB#29: ## %cond.load40 @@ -2467,6 +2482,7 @@ define <16 x i8> @test_mask_load_16xi8(<16 x i1> %mask, <16 x i8>* %addr, <16 x ; AVX512F-NEXT: LBB50_30: ## %else41 ; AVX512F-NEXT: kshiftrw $15, %k1, %k0 ; AVX512F-NEXT: kmovw %k0, %eax +; AVX512F-NEXT: andl $1, %eax ; AVX512F-NEXT: testb %al, %al ; AVX512F-NEXT: je LBB50_32 ; AVX512F-NEXT: ## BB#31: ## %cond.load43 @@ -4613,6 +4629,7 @@ define <64 x i8> @test_mask_load_64xi8(<64 x i1> %mask, <64 x i8>* %addr, <64 x ; AVX512F-NEXT: kshiftrw $15, %k1, %k1 ; AVX512F-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ## 2-byte Spill ; AVX512F-NEXT: kmovw %k1, %eax +; AVX512F-NEXT: andl $1, %eax ; AVX512F-NEXT: testb %al, %al ; AVX512F-NEXT: je LBB52_2 ; AVX512F-NEXT: ## BB#1: ## %cond.load @@ -4623,6 +4640,7 @@ define <64 x i8> @test_mask_load_64xi8(<64 x i1> %mask, <64 x i8>* %addr, <64 x ; AVX512F-NEXT: kshiftrw $15, %k1, %k1 ; AVX512F-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ## 2-byte Spill ; AVX512F-NEXT: kmovw %k1, %eax +; AVX512F-NEXT: andl $1, %eax ; AVX512F-NEXT: testb %al, %al ; AVX512F-NEXT: je LBB52_4 ; AVX512F-NEXT: ## BB#3: ## %cond.load1 @@ -4633,6 +4651,7 @@ define <64 x i8> @test_mask_load_64xi8(<64 x i1> %mask, <64 x i8>* %addr, <64 x ; AVX512F-NEXT: kshiftrw $15, %k1, %k1 ; AVX512F-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ## 2-byte Spill ; AVX512F-NEXT: kmovw %k1, %eax +; AVX512F-NEXT: andl $1, %eax ; AVX512F-NEXT: testb %al, %al ; AVX512F-NEXT: je LBB52_6 ; AVX512F-NEXT: ## BB#5: ## %cond.load4 @@ -4643,6 +4662,7 @@ define <64 x i8> @test_mask_load_64xi8(<64 x i1> %mask, <64 x i8>* %addr, <64 x ; AVX512F-NEXT: kshiftrw $15, %k1, %k1 ; AVX512F-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ## 2-byte Spill ; AVX512F-NEXT: kmovw %k1, %eax +; AVX512F-NEXT: andl $1, %eax ; AVX512F-NEXT: testb %al, %al ; AVX512F-NEXT: je LBB52_8 ; AVX512F-NEXT: ## BB#7: ## %cond.load7 @@ -4653,6 +4673,7 @@ define <64 x i8> @test_mask_load_64xi8(<64 x i1> %mask, <64 x i8>* %addr, <64 x ; AVX512F-NEXT: kshiftrw $15, %k1, %k1 ; AVX512F-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ## 2-byte Spill ; AVX512F-NEXT: kmovw %k1, %eax +; AVX512F-NEXT: andl $1, %eax ; AVX512F-NEXT: testb %al, %al ; AVX512F-NEXT: je LBB52_10 ; AVX512F-NEXT: ## BB#9: ## %cond.load10 @@ -4663,6 +4684,7 @@ define <64 x i8> @test_mask_load_64xi8(<64 x i1> %mask, <64 x i8>* %addr, <64 x ; AVX512F-NEXT: kshiftrw $15, %k1, %k1 ; AVX512F-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ## 2-byte Spill ; AVX512F-NEXT: kmovw %k1, %eax +; AVX512F-NEXT: andl $1, %eax ; AVX512F-NEXT: testb %al, %al ; AVX512F-NEXT: je LBB52_12 ; AVX512F-NEXT: ## BB#11: ## %cond.load13 @@ -4673,6 +4695,7 @@ define <64 x i8> @test_mask_load_64xi8(<64 x i1> %mask, <64 x i8>* %addr, <64 x ; AVX512F-NEXT: kshiftrw $15, %k1, %k1 ; AVX512F-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ## 2-byte Spill ; AVX512F-NEXT: kmovw %k1, %eax +; 
AVX512F-NEXT: andl $1, %eax ; AVX512F-NEXT: testb %al, %al ; AVX512F-NEXT: je LBB52_14 ; AVX512F-NEXT: ## BB#13: ## %cond.load16 @@ -4683,6 +4706,7 @@ define <64 x i8> @test_mask_load_64xi8(<64 x i1> %mask, <64 x i8>* %addr, <64 x ; AVX512F-NEXT: kshiftrw $15, %k1, %k1 ; AVX512F-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ## 2-byte Spill ; AVX512F-NEXT: kmovw %k1, %eax +; AVX512F-NEXT: andl $1, %eax ; AVX512F-NEXT: testb %al, %al ; AVX512F-NEXT: je LBB52_16 ; AVX512F-NEXT: ## BB#15: ## %cond.load19 @@ -4693,6 +4717,7 @@ define <64 x i8> @test_mask_load_64xi8(<64 x i1> %mask, <64 x i8>* %addr, <64 x ; AVX512F-NEXT: kshiftrw $15, %k1, %k1 ; AVX512F-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ## 2-byte Spill ; AVX512F-NEXT: kmovw %k1, %eax +; AVX512F-NEXT: andl $1, %eax ; AVX512F-NEXT: testb %al, %al ; AVX512F-NEXT: je LBB52_18 ; AVX512F-NEXT: ## BB#17: ## %cond.load22 @@ -4703,6 +4728,7 @@ define <64 x i8> @test_mask_load_64xi8(<64 x i1> %mask, <64 x i8>* %addr, <64 x ; AVX512F-NEXT: kshiftrw $15, %k1, %k1 ; AVX512F-NEXT: kmovw %k1, (%rsp) ## 2-byte Spill ; AVX512F-NEXT: kmovw %k1, %eax +; AVX512F-NEXT: andl $1, %eax ; AVX512F-NEXT: testb %al, %al ; AVX512F-NEXT: je LBB52_20 ; AVX512F-NEXT: ## BB#19: ## %cond.load25 @@ -4713,6 +4739,7 @@ define <64 x i8> @test_mask_load_64xi8(<64 x i1> %mask, <64 x i8>* %addr, <64 x ; AVX512F-NEXT: kshiftrw $15, %k1, %k1 ; AVX512F-NEXT: kmovw %k1, -{{[0-9]+}}(%rsp) ## 2-byte Spill ; AVX512F-NEXT: kmovw %k1, %eax +; AVX512F-NEXT: andl $1, %eax ; AVX512F-NEXT: testb %al, %al ; AVX512F-NEXT: je LBB52_22 ; AVX512F-NEXT: ## BB#21: ## %cond.load28 @@ -4723,6 +4750,7 @@ define <64 x i8> @test_mask_load_64xi8(<64 x i1> %mask, <64 x i8>* %addr, <64 x ; AVX512F-NEXT: kshiftrw $15, %k1, %k1 ; AVX512F-NEXT: kmovw %k1, -{{[0-9]+}}(%rsp) ## 2-byte Spill ; AVX512F-NEXT: kmovw %k1, %eax +; AVX512F-NEXT: andl $1, %eax ; AVX512F-NEXT: testb %al, %al ; AVX512F-NEXT: je LBB52_24 ; AVX512F-NEXT: ## BB#23: ## %cond.load31 @@ -4733,6 +4761,7 @@ define <64 x i8> @test_mask_load_64xi8(<64 x i1> %mask, <64 x i8>* %addr, <64 x ; AVX512F-NEXT: kshiftrw $15, %k1, %k1 ; AVX512F-NEXT: kmovw %k1, -{{[0-9]+}}(%rsp) ## 2-byte Spill ; AVX512F-NEXT: kmovw %k1, %eax +; AVX512F-NEXT: andl $1, %eax ; AVX512F-NEXT: testb %al, %al ; AVX512F-NEXT: je LBB52_26 ; AVX512F-NEXT: ## BB#25: ## %cond.load34 @@ -4744,6 +4773,7 @@ define <64 x i8> @test_mask_load_64xi8(<64 x i1> %mask, <64 x i8>* %addr, <64 x ; AVX512F-NEXT: kshiftrw $15, %k1, %k1 ; AVX512F-NEXT: kmovw %k1, -{{[0-9]+}}(%rsp) ## 2-byte Spill ; AVX512F-NEXT: kmovw %k1, %eax +; AVX512F-NEXT: andl $1, %eax ; AVX512F-NEXT: testb %al, %al ; AVX512F-NEXT: je LBB52_28 ; AVX512F-NEXT: ## BB#27: ## %cond.load37 @@ -4755,6 +4785,7 @@ define <64 x i8> @test_mask_load_64xi8(<64 x i1> %mask, <64 x i8>* %addr, <64 x ; AVX512F-NEXT: kshiftrw $15, %k1, %k1 ; AVX512F-NEXT: kmovw %k1, -{{[0-9]+}}(%rsp) ## 2-byte Spill ; AVX512F-NEXT: kmovw %k1, %eax +; AVX512F-NEXT: andl $1, %eax ; AVX512F-NEXT: testb %al, %al ; AVX512F-NEXT: je LBB52_30 ; AVX512F-NEXT: ## BB#29: ## %cond.load40 @@ -4765,6 +4796,7 @@ define <64 x i8> @test_mask_load_64xi8(<64 x i1> %mask, <64 x i8>* %addr, <64 x ; AVX512F-NEXT: kshiftrw $15, %k0, %k0 ; AVX512F-NEXT: kmovw %k0, -{{[0-9]+}}(%rsp) ## 2-byte Spill ; AVX512F-NEXT: kmovw %k0, %eax +; AVX512F-NEXT: andl $1, %eax ; AVX512F-NEXT: testb %al, %al ; AVX512F-NEXT: je LBB52_32 ; AVX512F-NEXT: ## BB#31: ## %cond.load43 @@ -4775,6 +4807,7 @@ define <64 x i8> @test_mask_load_64xi8(<64 x i1> %mask, <64 x i8>* %addr, <64 x ; AVX512F-NEXT: kshiftrw $15, %k0, %k0 ; 
AVX512F-NEXT: kmovw %k0, -{{[0-9]+}}(%rsp) ## 2-byte Spill ; AVX512F-NEXT: kmovw %k0, %eax +; AVX512F-NEXT: andl $1, %eax ; AVX512F-NEXT: testb %al, %al ; AVX512F-NEXT: je LBB52_34 ; AVX512F-NEXT: ## BB#33: ## %cond.load46 @@ -4786,6 +4819,7 @@ define <64 x i8> @test_mask_load_64xi8(<64 x i1> %mask, <64 x i8>* %addr, <64 x ; AVX512F-NEXT: kshiftrw $15, %k0, %k0 ; AVX512F-NEXT: kmovw %k0, -{{[0-9]+}}(%rsp) ## 2-byte Spill ; AVX512F-NEXT: kmovw %k0, %eax +; AVX512F-NEXT: andl $1, %eax ; AVX512F-NEXT: testb %al, %al ; AVX512F-NEXT: je LBB52_36 ; AVX512F-NEXT: ## BB#35: ## %cond.load49 @@ -4797,6 +4831,7 @@ define <64 x i8> @test_mask_load_64xi8(<64 x i1> %mask, <64 x i8>* %addr, <64 x ; AVX512F-NEXT: kshiftrw $15, %k0, %k0 ; AVX512F-NEXT: kmovw %k0, -{{[0-9]+}}(%rsp) ## 2-byte Spill ; AVX512F-NEXT: kmovw %k0, %eax +; AVX512F-NEXT: andl $1, %eax ; AVX512F-NEXT: testb %al, %al ; AVX512F-NEXT: je LBB52_38 ; AVX512F-NEXT: ## BB#37: ## %cond.load52 @@ -4808,6 +4843,7 @@ define <64 x i8> @test_mask_load_64xi8(<64 x i1> %mask, <64 x i8>* %addr, <64 x ; AVX512F-NEXT: kshiftrw $15, %k0, %k0 ; AVX512F-NEXT: kmovw %k0, -{{[0-9]+}}(%rsp) ## 2-byte Spill ; AVX512F-NEXT: kmovw %k0, %eax +; AVX512F-NEXT: andl $1, %eax ; AVX512F-NEXT: testb %al, %al ; AVX512F-NEXT: je LBB52_40 ; AVX512F-NEXT: ## BB#39: ## %cond.load55 @@ -4819,6 +4855,7 @@ define <64 x i8> @test_mask_load_64xi8(<64 x i1> %mask, <64 x i8>* %addr, <64 x ; AVX512F-NEXT: kshiftrw $15, %k0, %k0 ; AVX512F-NEXT: kmovw %k0, -{{[0-9]+}}(%rsp) ## 2-byte Spill ; AVX512F-NEXT: kmovw %k0, %eax +; AVX512F-NEXT: andl $1, %eax ; AVX512F-NEXT: testb %al, %al ; AVX512F-NEXT: je LBB52_42 ; AVX512F-NEXT: ## BB#41: ## %cond.load58 @@ -4830,6 +4867,7 @@ define <64 x i8> @test_mask_load_64xi8(<64 x i1> %mask, <64 x i8>* %addr, <64 x ; AVX512F-NEXT: kshiftrw $15, %k0, %k0 ; AVX512F-NEXT: kmovw %k0, -{{[0-9]+}}(%rsp) ## 2-byte Spill ; AVX512F-NEXT: kmovw %k0, %eax +; AVX512F-NEXT: andl $1, %eax ; AVX512F-NEXT: testb %al, %al ; AVX512F-NEXT: je LBB52_44 ; AVX512F-NEXT: ## BB#43: ## %cond.load61 @@ -4841,6 +4879,7 @@ define <64 x i8> @test_mask_load_64xi8(<64 x i1> %mask, <64 x i8>* %addr, <64 x ; AVX512F-NEXT: kshiftrw $15, %k0, %k0 ; AVX512F-NEXT: kmovw %k0, -{{[0-9]+}}(%rsp) ## 2-byte Spill ; AVX512F-NEXT: kmovw %k0, %eax +; AVX512F-NEXT: andl $1, %eax ; AVX512F-NEXT: testb %al, %al ; AVX512F-NEXT: je LBB52_46 ; AVX512F-NEXT: ## BB#45: ## %cond.load64 @@ -4852,6 +4891,7 @@ define <64 x i8> @test_mask_load_64xi8(<64 x i1> %mask, <64 x i8>* %addr, <64 x ; AVX512F-NEXT: kshiftrw $15, %k0, %k0 ; AVX512F-NEXT: kmovw %k0, -{{[0-9]+}}(%rsp) ## 2-byte Spill ; AVX512F-NEXT: kmovw %k0, %eax +; AVX512F-NEXT: andl $1, %eax ; AVX512F-NEXT: testb %al, %al ; AVX512F-NEXT: je LBB52_48 ; AVX512F-NEXT: ## BB#47: ## %cond.load67 @@ -4863,6 +4903,7 @@ define <64 x i8> @test_mask_load_64xi8(<64 x i1> %mask, <64 x i8>* %addr, <64 x ; AVX512F-NEXT: kshiftrw $15, %k0, %k0 ; AVX512F-NEXT: kmovw %k0, -{{[0-9]+}}(%rsp) ## 2-byte Spill ; AVX512F-NEXT: kmovw %k0, %eax +; AVX512F-NEXT: andl $1, %eax ; AVX512F-NEXT: testb %al, %al ; AVX512F-NEXT: je LBB52_50 ; AVX512F-NEXT: ## BB#49: ## %cond.load70 @@ -4874,6 +4915,7 @@ define <64 x i8> @test_mask_load_64xi8(<64 x i1> %mask, <64 x i8>* %addr, <64 x ; AVX512F-NEXT: kshiftrw $15, %k0, %k0 ; AVX512F-NEXT: kmovw %k0, -{{[0-9]+}}(%rsp) ## 2-byte Spill ; AVX512F-NEXT: kmovw %k0, %eax +; AVX512F-NEXT: andl $1, %eax ; AVX512F-NEXT: testb %al, %al ; AVX512F-NEXT: je LBB52_52 ; AVX512F-NEXT: ## BB#51: ## %cond.load73 @@ -4885,6 +4927,7 @@ define <64 x 
i8> @test_mask_load_64xi8(<64 x i1> %mask, <64 x i8>* %addr, <64 x ; AVX512F-NEXT: kshiftrw $15, %k0, %k0 ; AVX512F-NEXT: kmovw %k0, -{{[0-9]+}}(%rsp) ## 2-byte Spill ; AVX512F-NEXT: kmovw %k0, %eax +; AVX512F-NEXT: andl $1, %eax ; AVX512F-NEXT: testb %al, %al ; AVX512F-NEXT: je LBB52_54 ; AVX512F-NEXT: ## BB#53: ## %cond.load76 @@ -4896,6 +4939,7 @@ define <64 x i8> @test_mask_load_64xi8(<64 x i1> %mask, <64 x i8>* %addr, <64 x ; AVX512F-NEXT: kshiftrw $15, %k0, %k0 ; AVX512F-NEXT: kmovw %k0, -{{[0-9]+}}(%rsp) ## 2-byte Spill ; AVX512F-NEXT: kmovw %k0, %eax +; AVX512F-NEXT: andl $1, %eax ; AVX512F-NEXT: testb %al, %al ; AVX512F-NEXT: je LBB52_56 ; AVX512F-NEXT: ## BB#55: ## %cond.load79 @@ -4907,6 +4951,7 @@ define <64 x i8> @test_mask_load_64xi8(<64 x i1> %mask, <64 x i8>* %addr, <64 x ; AVX512F-NEXT: kshiftrw $15, %k0, %k0 ; AVX512F-NEXT: kmovw %k0, -{{[0-9]+}}(%rsp) ## 2-byte Spill ; AVX512F-NEXT: kmovw %k0, %eax +; AVX512F-NEXT: andl $1, %eax ; AVX512F-NEXT: testb %al, %al ; AVX512F-NEXT: je LBB52_58 ; AVX512F-NEXT: ## BB#57: ## %cond.load82 @@ -4919,6 +4964,7 @@ define <64 x i8> @test_mask_load_64xi8(<64 x i1> %mask, <64 x i8>* %addr, <64 x ; AVX512F-NEXT: kshiftrw $15, %k0, %k0 ; AVX512F-NEXT: kmovw %k0, -{{[0-9]+}}(%rsp) ## 2-byte Spill ; AVX512F-NEXT: kmovw %k0, %eax +; AVX512F-NEXT: andl $1, %eax ; AVX512F-NEXT: testb %al, %al ; AVX512F-NEXT: je LBB52_60 ; AVX512F-NEXT: ## BB#59: ## %cond.load85 @@ -4931,6 +4977,7 @@ define <64 x i8> @test_mask_load_64xi8(<64 x i1> %mask, <64 x i8>* %addr, <64 x ; AVX512F-NEXT: kshiftrw $15, %k0, %k0 ; AVX512F-NEXT: kmovw %k0, -{{[0-9]+}}(%rsp) ## 2-byte Spill ; AVX512F-NEXT: kmovw %k0, %eax +; AVX512F-NEXT: andl $1, %eax ; AVX512F-NEXT: testb %al, %al ; AVX512F-NEXT: je LBB52_62 ; AVX512F-NEXT: ## BB#61: ## %cond.load88 @@ -4942,6 +4989,7 @@ define <64 x i8> @test_mask_load_64xi8(<64 x i1> %mask, <64 x i8>* %addr, <64 x ; AVX512F-NEXT: kshiftrw $15, %k1, %k1 ; AVX512F-NEXT: kmovw %k1, -{{[0-9]+}}(%rsp) ## 2-byte Spill ; AVX512F-NEXT: kmovw %k1, %eax +; AVX512F-NEXT: andl $1, %eax ; AVX512F-NEXT: testb %al, %al ; AVX512F-NEXT: je LBB52_64 ; AVX512F-NEXT: ## BB#63: ## %cond.load91 @@ -4953,6 +5001,7 @@ define <64 x i8> @test_mask_load_64xi8(<64 x i1> %mask, <64 x i8>* %addr, <64 x ; AVX512F-NEXT: kshiftrw $15, %k1, %k1 ; AVX512F-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ## 2-byte Spill ; AVX512F-NEXT: kmovw %k1, %eax +; AVX512F-NEXT: andl $1, %eax ; AVX512F-NEXT: testb %al, %al ; AVX512F-NEXT: je LBB52_66 ; AVX512F-NEXT: ## BB#65: ## %cond.load94 @@ -4963,6 +5012,7 @@ define <64 x i8> @test_mask_load_64xi8(<64 x i1> %mask, <64 x i8>* %addr, <64 x ; AVX512F-NEXT: kshiftrw $15, %k1, %k1 ; AVX512F-NEXT: kmovw %k1, -{{[0-9]+}}(%rsp) ## 2-byte Spill ; AVX512F-NEXT: kmovw %k1, %eax +; AVX512F-NEXT: andl $1, %eax ; AVX512F-NEXT: testb %al, %al ; AVX512F-NEXT: je LBB52_68 ; AVX512F-NEXT: ## BB#67: ## %cond.load97 @@ -4973,6 +5023,7 @@ define <64 x i8> @test_mask_load_64xi8(<64 x i1> %mask, <64 x i8>* %addr, <64 x ; AVX512F-NEXT: kshiftrw $15, %k1, %k1 ; AVX512F-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ## 2-byte Spill ; AVX512F-NEXT: kmovw %k1, %eax +; AVX512F-NEXT: andl $1, %eax ; AVX512F-NEXT: testb %al, %al ; AVX512F-NEXT: je LBB52_70 ; AVX512F-NEXT: ## BB#69: ## %cond.load100 @@ -4983,6 +5034,7 @@ define <64 x i8> @test_mask_load_64xi8(<64 x i1> %mask, <64 x i8>* %addr, <64 x ; AVX512F-NEXT: kshiftrw $15, %k1, %k1 ; AVX512F-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ## 2-byte Spill ; AVX512F-NEXT: kmovw %k1, %eax +; AVX512F-NEXT: andl $1, %eax ; AVX512F-NEXT: testb %al, %al ; 
AVX512F-NEXT: je LBB52_72 ; AVX512F-NEXT: ## BB#71: ## %cond.load103 @@ -4993,6 +5045,7 @@ define <64 x i8> @test_mask_load_64xi8(<64 x i1> %mask, <64 x i8>* %addr, <64 x ; AVX512F-NEXT: kshiftrw $15, %k1, %k1 ; AVX512F-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ## 2-byte Spill ; AVX512F-NEXT: kmovw %k1, %eax +; AVX512F-NEXT: andl $1, %eax ; AVX512F-NEXT: testb %al, %al ; AVX512F-NEXT: je LBB52_74 ; AVX512F-NEXT: ## BB#73: ## %cond.load106 @@ -5003,6 +5056,7 @@ define <64 x i8> @test_mask_load_64xi8(<64 x i1> %mask, <64 x i8>* %addr, <64 x ; AVX512F-NEXT: kshiftrw $15, %k1, %k1 ; AVX512F-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ## 2-byte Spill ; AVX512F-NEXT: kmovw %k1, %eax +; AVX512F-NEXT: andl $1, %eax ; AVX512F-NEXT: testb %al, %al ; AVX512F-NEXT: je LBB52_76 ; AVX512F-NEXT: ## BB#75: ## %cond.load109 @@ -5013,6 +5067,7 @@ define <64 x i8> @test_mask_load_64xi8(<64 x i1> %mask, <64 x i8>* %addr, <64 x ; AVX512F-NEXT: kshiftrw $15, %k1, %k1 ; AVX512F-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ## 2-byte Spill ; AVX512F-NEXT: kmovw %k1, %eax +; AVX512F-NEXT: andl $1, %eax ; AVX512F-NEXT: testb %al, %al ; AVX512F-NEXT: je LBB52_78 ; AVX512F-NEXT: ## BB#77: ## %cond.load112 @@ -5023,6 +5078,7 @@ define <64 x i8> @test_mask_load_64xi8(<64 x i1> %mask, <64 x i8>* %addr, <64 x ; AVX512F-NEXT: kshiftrw $15, %k1, %k1 ; AVX512F-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ## 2-byte Spill ; AVX512F-NEXT: kmovw %k1, %eax +; AVX512F-NEXT: andl $1, %eax ; AVX512F-NEXT: testb %al, %al ; AVX512F-NEXT: je LBB52_80 ; AVX512F-NEXT: ## BB#79: ## %cond.load115 @@ -5033,6 +5089,7 @@ define <64 x i8> @test_mask_load_64xi8(<64 x i1> %mask, <64 x i8>* %addr, <64 x ; AVX512F-NEXT: kshiftrw $15, %k1, %k1 ; AVX512F-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ## 2-byte Spill ; AVX512F-NEXT: kmovw %k1, %eax +; AVX512F-NEXT: andl $1, %eax ; AVX512F-NEXT: testb %al, %al ; AVX512F-NEXT: je LBB52_82 ; AVX512F-NEXT: ## BB#81: ## %cond.load118 @@ -5043,6 +5100,7 @@ define <64 x i8> @test_mask_load_64xi8(<64 x i1> %mask, <64 x i8>* %addr, <64 x ; AVX512F-NEXT: kshiftrw $15, %k1, %k1 ; AVX512F-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ## 2-byte Spill ; AVX512F-NEXT: kmovw %k1, %eax +; AVX512F-NEXT: andl $1, %eax ; AVX512F-NEXT: testb %al, %al ; AVX512F-NEXT: je LBB52_84 ; AVX512F-NEXT: ## BB#83: ## %cond.load121 @@ -5053,6 +5111,7 @@ define <64 x i8> @test_mask_load_64xi8(<64 x i1> %mask, <64 x i8>* %addr, <64 x ; AVX512F-NEXT: kshiftrw $15, %k1, %k1 ; AVX512F-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ## 2-byte Spill ; AVX512F-NEXT: kmovw %k1, %eax +; AVX512F-NEXT: andl $1, %eax ; AVX512F-NEXT: testb %al, %al ; AVX512F-NEXT: je LBB52_86 ; AVX512F-NEXT: ## BB#85: ## %cond.load124 @@ -5063,6 +5122,7 @@ define <64 x i8> @test_mask_load_64xi8(<64 x i1> %mask, <64 x i8>* %addr, <64 x ; AVX512F-NEXT: kshiftrw $15, %k1, %k1 ; AVX512F-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ## 2-byte Spill ; AVX512F-NEXT: kmovw %k1, %eax +; AVX512F-NEXT: andl $1, %eax ; AVX512F-NEXT: testb %al, %al ; AVX512F-NEXT: je LBB52_88 ; AVX512F-NEXT: ## BB#87: ## %cond.load127 @@ -5073,6 +5133,7 @@ define <64 x i8> @test_mask_load_64xi8(<64 x i1> %mask, <64 x i8>* %addr, <64 x ; AVX512F-NEXT: kshiftrw $15, %k1, %k1 ; AVX512F-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ## 2-byte Spill ; AVX512F-NEXT: kmovw %k1, %eax +; AVX512F-NEXT: andl $1, %eax ; AVX512F-NEXT: testb %al, %al ; AVX512F-NEXT: je LBB52_90 ; AVX512F-NEXT: ## BB#89: ## %cond.load130 @@ -5084,6 +5145,7 @@ define <64 x i8> @test_mask_load_64xi8(<64 x i1> %mask, <64 x i8>* %addr, <64 x ; AVX512F-NEXT: kshiftrw $15, %k1, %k1 ; AVX512F-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ## 
2-byte Spill ; AVX512F-NEXT: kmovw %k1, %eax +; AVX512F-NEXT: andl $1, %eax ; AVX512F-NEXT: testb %al, %al ; AVX512F-NEXT: je LBB52_92 ; AVX512F-NEXT: ## BB#91: ## %cond.load133 @@ -5095,26 +5157,29 @@ define <64 x i8> @test_mask_load_64xi8(<64 x i1> %mask, <64 x i8>* %addr, <64 x ; AVX512F-NEXT: kshiftrw $15, %k1, %k1 ; AVX512F-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ## 2-byte Spill ; AVX512F-NEXT: kmovw %k1, %eax +; AVX512F-NEXT: andl $1, %eax ; AVX512F-NEXT: testb %al, %al ; AVX512F-NEXT: je LBB52_94 ; AVX512F-NEXT: ## BB#93: ## %cond.load136 ; AVX512F-NEXT: vpinsrb $14, 46(%rdi), %xmm1, %xmm3 ; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] ; AVX512F-NEXT: LBB52_94: ## %else137 -; AVX512F-NEXT: vptestmd %zmm2, %zmm2, %k1 +; AVX512F-NEXT: vptestmd %zmm2, %zmm2, %k5 ; AVX512F-NEXT: kshiftrw $15, %k0, %k0 ; AVX512F-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ## 2-byte Spill ; AVX512F-NEXT: kmovw %k0, %eax +; AVX512F-NEXT: andl $1, %eax ; AVX512F-NEXT: testb %al, %al ; AVX512F-NEXT: je LBB52_96 ; AVX512F-NEXT: ## BB#95: ## %cond.load139 ; AVX512F-NEXT: vpinsrb $15, 47(%rdi), %xmm1, %xmm2 ; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] ; AVX512F-NEXT: LBB52_96: ## %else140 -; AVX512F-NEXT: kshiftlw $15, %k1, %k0 +; AVX512F-NEXT: kshiftlw $15, %k5, %k0 ; AVX512F-NEXT: kshiftrw $15, %k0, %k0 ; AVX512F-NEXT: kmovw %k0, -{{[0-9]+}}(%rsp) ## 2-byte Spill ; AVX512F-NEXT: kmovw %k0, %eax +; AVX512F-NEXT: andl $1, %eax ; AVX512F-NEXT: testb %al, %al ; AVX512F-NEXT: je LBB52_98 ; AVX512F-NEXT: ## BB#97: ## %cond.load142 @@ -5122,10 +5187,11 @@ define <64 x i8> @test_mask_load_64xi8(<64 x i1> %mask, <64 x i8>* %addr, <64 x ; AVX512F-NEXT: vpinsrb $0, 48(%rdi), %xmm2, %xmm2 ; AVX512F-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 ; AVX512F-NEXT: LBB52_98: ## %else143 -; AVX512F-NEXT: kshiftlw $14, %k1, %k0 +; AVX512F-NEXT: kshiftlw $14, %k5, %k0 ; AVX512F-NEXT: kshiftrw $15, %k0, %k0 ; AVX512F-NEXT: kmovw %k0, -{{[0-9]+}}(%rsp) ## 2-byte Spill ; AVX512F-NEXT: kmovw %k0, %eax +; AVX512F-NEXT: andl $1, %eax ; AVX512F-NEXT: testb %al, %al ; AVX512F-NEXT: je LBB52_100 ; AVX512F-NEXT: ## BB#99: ## %cond.load145 @@ -5133,10 +5199,11 @@ define <64 x i8> @test_mask_load_64xi8(<64 x i1> %mask, <64 x i8>* %addr, <64 x ; AVX512F-NEXT: vpinsrb $1, 49(%rdi), %xmm2, %xmm2 ; AVX512F-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 ; AVX512F-NEXT: LBB52_100: ## %else146 -; AVX512F-NEXT: kshiftlw $13, %k1, %k0 +; AVX512F-NEXT: kshiftlw $13, %k5, %k0 ; AVX512F-NEXT: kshiftrw $15, %k0, %k0 ; AVX512F-NEXT: kmovw %k0, -{{[0-9]+}}(%rsp) ## 2-byte Spill ; AVX512F-NEXT: kmovw %k0, %eax +; AVX512F-NEXT: andl $1, %eax ; AVX512F-NEXT: testb %al, %al ; AVX512F-NEXT: je LBB52_102 ; AVX512F-NEXT: ## BB#101: ## %cond.load148 @@ -5144,10 +5211,11 @@ define <64 x i8> @test_mask_load_64xi8(<64 x i1> %mask, <64 x i8>* %addr, <64 x ; AVX512F-NEXT: vpinsrb $2, 50(%rdi), %xmm2, %xmm2 ; AVX512F-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 ; AVX512F-NEXT: LBB52_102: ## %else149 -; AVX512F-NEXT: kshiftlw $12, %k1, %k0 +; AVX512F-NEXT: kshiftlw $12, %k5, %k0 ; AVX512F-NEXT: kshiftrw $15, %k0, %k0 ; AVX512F-NEXT: kmovw %k0, -{{[0-9]+}}(%rsp) ## 2-byte Spill ; AVX512F-NEXT: kmovw %k0, %eax +; AVX512F-NEXT: andl $1, %eax ; AVX512F-NEXT: testb %al, %al ; AVX512F-NEXT: je LBB52_104 ; AVX512F-NEXT: ## BB#103: ## %cond.load151 @@ -5155,10 +5223,11 @@ define <64 x i8> @test_mask_load_64xi8(<64 x i1> %mask, <64 x i8>* %addr, <64 x ; AVX512F-NEXT: vpinsrb $3, 51(%rdi), %xmm2, %xmm2 ; AVX512F-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 ; 
AVX512F-NEXT: LBB52_104: ## %else152 -; AVX512F-NEXT: kshiftlw $11, %k1, %k0 +; AVX512F-NEXT: kshiftlw $11, %k5, %k0 ; AVX512F-NEXT: kshiftrw $15, %k0, %k0 ; AVX512F-NEXT: kmovw %k0, -{{[0-9]+}}(%rsp) ## 2-byte Spill ; AVX512F-NEXT: kmovw %k0, %eax +; AVX512F-NEXT: andl $1, %eax ; AVX512F-NEXT: testb %al, %al ; AVX512F-NEXT: je LBB52_106 ; AVX512F-NEXT: ## BB#105: ## %cond.load154 @@ -5166,10 +5235,11 @@ define <64 x i8> @test_mask_load_64xi8(<64 x i1> %mask, <64 x i8>* %addr, <64 x ; AVX512F-NEXT: vpinsrb $4, 52(%rdi), %xmm2, %xmm2 ; AVX512F-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 ; AVX512F-NEXT: LBB52_106: ## %else155 -; AVX512F-NEXT: kshiftlw $10, %k1, %k0 +; AVX512F-NEXT: kshiftlw $10, %k5, %k0 ; AVX512F-NEXT: kshiftrw $15, %k0, %k0 ; AVX512F-NEXT: kmovw %k0, -{{[0-9]+}}(%rsp) ## 2-byte Spill ; AVX512F-NEXT: kmovw %k0, %eax +; AVX512F-NEXT: andl $1, %eax ; AVX512F-NEXT: testb %al, %al ; AVX512F-NEXT: je LBB52_108 ; AVX512F-NEXT: ## BB#107: ## %cond.load157 @@ -5177,10 +5247,11 @@ define <64 x i8> @test_mask_load_64xi8(<64 x i1> %mask, <64 x i8>* %addr, <64 x ; AVX512F-NEXT: vpinsrb $5, 53(%rdi), %xmm2, %xmm2 ; AVX512F-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 ; AVX512F-NEXT: LBB52_108: ## %else158 -; AVX512F-NEXT: kshiftlw $9, %k1, %k0 +; AVX512F-NEXT: kshiftlw $9, %k5, %k0 ; AVX512F-NEXT: kshiftrw $15, %k0, %k0 ; AVX512F-NEXT: kmovw %k0, -{{[0-9]+}}(%rsp) ## 2-byte Spill ; AVX512F-NEXT: kmovw %k0, %eax +; AVX512F-NEXT: andl $1, %eax ; AVX512F-NEXT: testb %al, %al ; AVX512F-NEXT: je LBB52_110 ; AVX512F-NEXT: ## BB#109: ## %cond.load160 @@ -5188,10 +5259,11 @@ define <64 x i8> @test_mask_load_64xi8(<64 x i1> %mask, <64 x i8>* %addr, <64 x ; AVX512F-NEXT: vpinsrb $6, 54(%rdi), %xmm2, %xmm2 ; AVX512F-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 ; AVX512F-NEXT: LBB52_110: ## %else161 -; AVX512F-NEXT: kshiftlw $8, %k1, %k0 +; AVX512F-NEXT: kshiftlw $8, %k5, %k0 ; AVX512F-NEXT: kshiftrw $15, %k0, %k0 ; AVX512F-NEXT: kmovw %k0, -{{[0-9]+}}(%rsp) ## 2-byte Spill ; AVX512F-NEXT: kmovw %k0, %eax +; AVX512F-NEXT: andl $1, %eax ; AVX512F-NEXT: testb %al, %al ; AVX512F-NEXT: je LBB52_112 ; AVX512F-NEXT: ## BB#111: ## %cond.load163 @@ -5199,10 +5271,11 @@ define <64 x i8> @test_mask_load_64xi8(<64 x i1> %mask, <64 x i8>* %addr, <64 x ; AVX512F-NEXT: vpinsrb $7, 55(%rdi), %xmm2, %xmm2 ; AVX512F-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 ; AVX512F-NEXT: LBB52_112: ## %else164 -; AVX512F-NEXT: kshiftlw $7, %k1, %k0 +; AVX512F-NEXT: kshiftlw $7, %k5, %k0 ; AVX512F-NEXT: kshiftrw $15, %k0, %k0 ; AVX512F-NEXT: kmovw %k0, -{{[0-9]+}}(%rsp) ## 2-byte Spill ; AVX512F-NEXT: kmovw %k0, %eax +; AVX512F-NEXT: andl $1, %eax ; AVX512F-NEXT: testb %al, %al ; AVX512F-NEXT: je LBB52_114 ; AVX512F-NEXT: ## BB#113: ## %cond.load166 @@ -5210,9 +5283,10 @@ define <64 x i8> @test_mask_load_64xi8(<64 x i1> %mask, <64 x i8>* %addr, <64 x ; AVX512F-NEXT: vpinsrb $8, 56(%rdi), %xmm2, %xmm2 ; AVX512F-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 ; AVX512F-NEXT: LBB52_114: ## %else167 -; AVX512F-NEXT: kshiftlw $6, %k1, %k2 -; AVX512F-NEXT: kshiftrw $15, %k2, %k2 +; AVX512F-NEXT: kshiftlw $6, %k5, %k0 +; AVX512F-NEXT: kshiftrw $15, %k0, %k2 ; AVX512F-NEXT: kmovw %k2, %eax +; AVX512F-NEXT: andl $1, %eax ; AVX512F-NEXT: testb %al, %al ; AVX512F-NEXT: je LBB52_116 ; AVX512F-NEXT: ## BB#115: ## %cond.load169 @@ -5220,9 +5294,10 @@ define <64 x i8> @test_mask_load_64xi8(<64 x i1> %mask, <64 x i8>* %addr, <64 x ; AVX512F-NEXT: vpinsrb $9, 57(%rdi), %xmm2, %xmm2 ; AVX512F-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 ; AVX512F-NEXT: LBB52_116: 
## %else170 -; AVX512F-NEXT: kshiftlw $5, %k1, %k3 -; AVX512F-NEXT: kshiftrw $15, %k3, %k3 +; AVX512F-NEXT: kshiftlw $5, %k5, %k0 +; AVX512F-NEXT: kshiftrw $15, %k0, %k3 ; AVX512F-NEXT: kmovw %k3, %eax +; AVX512F-NEXT: andl $1, %eax ; AVX512F-NEXT: testb %al, %al ; AVX512F-NEXT: je LBB52_118 ; AVX512F-NEXT: ## BB#117: ## %cond.load172 @@ -5230,9 +5305,10 @@ define <64 x i8> @test_mask_load_64xi8(<64 x i1> %mask, <64 x i8>* %addr, <64 x ; AVX512F-NEXT: vpinsrb $10, 58(%rdi), %xmm2, %xmm2 ; AVX512F-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 ; AVX512F-NEXT: LBB52_118: ## %else173 -; AVX512F-NEXT: kshiftlw $4, %k1, %k4 -; AVX512F-NEXT: kshiftrw $15, %k4, %k4 +; AVX512F-NEXT: kshiftlw $4, %k5, %k0 +; AVX512F-NEXT: kshiftrw $15, %k0, %k4 ; AVX512F-NEXT: kmovw %k4, %eax +; AVX512F-NEXT: andl $1, %eax ; AVX512F-NEXT: testb %al, %al ; AVX512F-NEXT: je LBB52_120 ; AVX512F-NEXT: ## BB#119: ## %cond.load175 @@ -5240,9 +5316,10 @@ define <64 x i8> @test_mask_load_64xi8(<64 x i1> %mask, <64 x i8>* %addr, <64 x ; AVX512F-NEXT: vpinsrb $11, 59(%rdi), %xmm2, %xmm2 ; AVX512F-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 ; AVX512F-NEXT: LBB52_120: ## %else176 -; AVX512F-NEXT: kshiftlw $3, %k1, %k5 -; AVX512F-NEXT: kshiftrw $15, %k5, %k5 -; AVX512F-NEXT: kmovw %k5, %eax +; AVX512F-NEXT: kshiftlw $3, %k5, %k0 +; AVX512F-NEXT: kshiftrw $15, %k0, %k6 +; AVX512F-NEXT: kmovw %k6, %eax +; AVX512F-NEXT: andl $1, %eax ; AVX512F-NEXT: testb %al, %al ; AVX512F-NEXT: je LBB52_122 ; AVX512F-NEXT: ## BB#121: ## %cond.load178 @@ -5250,9 +5327,10 @@ define <64 x i8> @test_mask_load_64xi8(<64 x i1> %mask, <64 x i8>* %addr, <64 x ; AVX512F-NEXT: vpinsrb $12, 60(%rdi), %xmm2, %xmm2 ; AVX512F-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 ; AVX512F-NEXT: LBB52_122: ## %else179 -; AVX512F-NEXT: kshiftlw $2, %k1, %k6 -; AVX512F-NEXT: kshiftrw $15, %k6, %k6 -; AVX512F-NEXT: kmovw %k6, %eax +; AVX512F-NEXT: kshiftlw $2, %k5, %k0 +; AVX512F-NEXT: kshiftrw $15, %k0, %k7 +; AVX512F-NEXT: kmovw %k7, %eax +; AVX512F-NEXT: andl $1, %eax ; AVX512F-NEXT: testb %al, %al ; AVX512F-NEXT: je LBB52_124 ; AVX512F-NEXT: ## BB#123: ## %cond.load181 @@ -5260,9 +5338,10 @@ define <64 x i8> @test_mask_load_64xi8(<64 x i1> %mask, <64 x i8>* %addr, <64 x ; AVX512F-NEXT: vpinsrb $13, 61(%rdi), %xmm2, %xmm2 ; AVX512F-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 ; AVX512F-NEXT: LBB52_124: ## %else182 -; AVX512F-NEXT: kshiftlw $1, %k1, %k7 -; AVX512F-NEXT: kshiftrw $15, %k7, %k7 -; AVX512F-NEXT: kmovw %k7, %eax +; AVX512F-NEXT: kshiftlw $1, %k5, %k0 +; AVX512F-NEXT: kshiftrw $15, %k0, %k0 +; AVX512F-NEXT: kmovw %k0, %eax +; AVX512F-NEXT: andl $1, %eax ; AVX512F-NEXT: testb %al, %al ; AVX512F-NEXT: je LBB52_126 ; AVX512F-NEXT: ## BB#125: ## %cond.load184 @@ -5270,8 +5349,9 @@ define <64 x i8> @test_mask_load_64xi8(<64 x i1> %mask, <64 x i8>* %addr, <64 x ; AVX512F-NEXT: vpinsrb $14, 62(%rdi), %xmm2, %xmm2 ; AVX512F-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 ; AVX512F-NEXT: LBB52_126: ## %else185 -; AVX512F-NEXT: kshiftrw $15, %k1, %k1 -; AVX512F-NEXT: kmovw %k1, %eax +; AVX512F-NEXT: kshiftrw $15, %k5, %k5 +; AVX512F-NEXT: kmovw %k5, %eax +; AVX512F-NEXT: andl $1, %eax ; AVX512F-NEXT: testb %al, %al ; AVX512F-NEXT: je LBB52_128 ; AVX512F-NEXT: ## BB#127: ## %cond.load187 @@ -5279,137 +5359,137 @@ define <64 x i8> @test_mask_load_64xi8(<64 x i1> %mask, <64 x i8>* %addr, <64 x ; AVX512F-NEXT: vpinsrb $15, 63(%rdi), %xmm2, %xmm2 ; AVX512F-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 ; AVX512F-NEXT: LBB52_128: ## %else188 -; AVX512F-NEXT: kmovw -{{[0-9]+}}(%rsp), %k0 ## 2-byte 
Reload -; AVX512F-NEXT: kmovw %k0, %eax +; AVX512F-NEXT: kmovw -{{[0-9]+}}(%rsp), %k1 ## 2-byte Reload +; AVX512F-NEXT: kmovw %k1, %eax ; AVX512F-NEXT: movl %eax, -{{[0-9]+}}(%rsp) ## 4-byte Spill -; AVX512F-NEXT: kmovw -{{[0-9]+}}(%rsp), %k0 ## 2-byte Reload -; AVX512F-NEXT: kmovw %k0, %eax +; AVX512F-NEXT: kmovw -{{[0-9]+}}(%rsp), %k1 ## 2-byte Reload +; AVX512F-NEXT: kmovw %k1, %eax ; AVX512F-NEXT: movl %eax, -{{[0-9]+}}(%rsp) ## 4-byte Spill -; AVX512F-NEXT: kmovw -{{[0-9]+}}(%rsp), %k0 ## 2-byte Reload -; AVX512F-NEXT: kmovw %k0, %eax +; AVX512F-NEXT: kmovw -{{[0-9]+}}(%rsp), %k1 ## 2-byte Reload +; AVX512F-NEXT: kmovw %k1, %eax ; AVX512F-NEXT: movl %eax, -{{[0-9]+}}(%rsp) ## 4-byte Spill -; AVX512F-NEXT: kmovw -{{[0-9]+}}(%rsp), %k0 ## 2-byte Reload -; AVX512F-NEXT: kmovw %k0, %eax +; AVX512F-NEXT: kmovw -{{[0-9]+}}(%rsp), %k1 ## 2-byte Reload +; AVX512F-NEXT: kmovw %k1, %eax ; AVX512F-NEXT: movl %eax, -{{[0-9]+}}(%rsp) ## 4-byte Spill -; AVX512F-NEXT: kmovw -{{[0-9]+}}(%rsp), %k0 ## 2-byte Reload -; AVX512F-NEXT: kmovw %k0, %eax +; AVX512F-NEXT: kmovw -{{[0-9]+}}(%rsp), %k1 ## 2-byte Reload +; AVX512F-NEXT: kmovw %k1, %eax ; AVX512F-NEXT: movl %eax, -{{[0-9]+}}(%rsp) ## 4-byte Spill -; AVX512F-NEXT: kmovw -{{[0-9]+}}(%rsp), %k0 ## 2-byte Reload -; AVX512F-NEXT: kmovw %k0, %eax +; AVX512F-NEXT: kmovw -{{[0-9]+}}(%rsp), %k1 ## 2-byte Reload +; AVX512F-NEXT: kmovw %k1, %eax ; AVX512F-NEXT: movl %eax, -{{[0-9]+}}(%rsp) ## 4-byte Spill -; AVX512F-NEXT: kmovw -{{[0-9]+}}(%rsp), %k0 ## 2-byte Reload -; AVX512F-NEXT: kmovw %k0, %eax +; AVX512F-NEXT: kmovw -{{[0-9]+}}(%rsp), %k1 ## 2-byte Reload +; AVX512F-NEXT: kmovw %k1, %eax ; AVX512F-NEXT: movl %eax, -{{[0-9]+}}(%rsp) ## 4-byte Spill -; AVX512F-NEXT: kmovw -{{[0-9]+}}(%rsp), %k0 ## 2-byte Reload -; AVX512F-NEXT: kmovw %k0, %eax +; AVX512F-NEXT: kmovw -{{[0-9]+}}(%rsp), %k1 ## 2-byte Reload +; AVX512F-NEXT: kmovw %k1, %eax ; AVX512F-NEXT: movl %eax, -{{[0-9]+}}(%rsp) ## 4-byte Spill -; AVX512F-NEXT: kmovw -{{[0-9]+}}(%rsp), %k0 ## 2-byte Reload -; AVX512F-NEXT: kmovw %k0, %eax +; AVX512F-NEXT: kmovw -{{[0-9]+}}(%rsp), %k1 ## 2-byte Reload +; AVX512F-NEXT: kmovw %k1, %eax ; AVX512F-NEXT: movl %eax, -{{[0-9]+}}(%rsp) ## 4-byte Spill -; AVX512F-NEXT: kmovw -{{[0-9]+}}(%rsp), %k0 ## 2-byte Reload -; AVX512F-NEXT: kmovw %k0, %eax +; AVX512F-NEXT: kmovw -{{[0-9]+}}(%rsp), %k1 ## 2-byte Reload +; AVX512F-NEXT: kmovw %k1, %eax ; AVX512F-NEXT: movl %eax, -{{[0-9]+}}(%rsp) ## 4-byte Spill -; AVX512F-NEXT: kmovw -{{[0-9]+}}(%rsp), %k0 ## 2-byte Reload -; AVX512F-NEXT: kmovw %k0, %eax +; AVX512F-NEXT: kmovw -{{[0-9]+}}(%rsp), %k1 ## 2-byte Reload +; AVX512F-NEXT: kmovw %k1, %eax ; AVX512F-NEXT: movl %eax, -{{[0-9]+}}(%rsp) ## 4-byte Spill -; AVX512F-NEXT: kmovw -{{[0-9]+}}(%rsp), %k0 ## 2-byte Reload -; AVX512F-NEXT: kmovw %k0, %eax +; AVX512F-NEXT: kmovw -{{[0-9]+}}(%rsp), %k1 ## 2-byte Reload +; AVX512F-NEXT: kmovw %k1, %eax ; AVX512F-NEXT: movl %eax, -{{[0-9]+}}(%rsp) ## 4-byte Spill -; AVX512F-NEXT: kmovw -{{[0-9]+}}(%rsp), %k0 ## 2-byte Reload -; AVX512F-NEXT: kmovw %k0, %eax +; AVX512F-NEXT: kmovw -{{[0-9]+}}(%rsp), %k1 ## 2-byte Reload +; AVX512F-NEXT: kmovw %k1, %eax ; AVX512F-NEXT: movl %eax, -{{[0-9]+}}(%rsp) ## 4-byte Spill -; AVX512F-NEXT: kmovw -{{[0-9]+}}(%rsp), %k0 ## 2-byte Reload -; AVX512F-NEXT: kmovw %k0, %eax +; AVX512F-NEXT: kmovw -{{[0-9]+}}(%rsp), %k1 ## 2-byte Reload +; AVX512F-NEXT: kmovw %k1, %eax ; AVX512F-NEXT: movl %eax, -{{[0-9]+}}(%rsp) ## 4-byte Spill -; AVX512F-NEXT: kmovw -{{[0-9]+}}(%rsp), %k0 ## 2-byte Reload -; 
AVX512F-NEXT: kmovw %k0, %eax +; AVX512F-NEXT: kmovw -{{[0-9]+}}(%rsp), %k1 ## 2-byte Reload +; AVX512F-NEXT: kmovw %k1, %eax ; AVX512F-NEXT: movl %eax, -{{[0-9]+}}(%rsp) ## 4-byte Spill -; AVX512F-NEXT: kmovw -{{[0-9]+}}(%rsp), %k0 ## 2-byte Reload -; AVX512F-NEXT: kmovw %k0, %eax +; AVX512F-NEXT: kmovw -{{[0-9]+}}(%rsp), %k1 ## 2-byte Reload +; AVX512F-NEXT: kmovw %k1, %eax ; AVX512F-NEXT: movl %eax, -{{[0-9]+}}(%rsp) ## 4-byte Spill -; AVX512F-NEXT: kmovw {{[0-9]+}}(%rsp), %k0 ## 2-byte Reload -; AVX512F-NEXT: kmovw %k0, %eax +; AVX512F-NEXT: kmovw {{[0-9]+}}(%rsp), %k1 ## 2-byte Reload +; AVX512F-NEXT: kmovw %k1, %eax ; AVX512F-NEXT: movl %eax, {{[0-9]+}}(%rsp) ## 4-byte Spill -; AVX512F-NEXT: kmovw {{[0-9]+}}(%rsp), %k0 ## 2-byte Reload -; AVX512F-NEXT: kmovw %k0, %eax +; AVX512F-NEXT: kmovw {{[0-9]+}}(%rsp), %k1 ## 2-byte Reload +; AVX512F-NEXT: kmovw %k1, %eax ; AVX512F-NEXT: movl %eax, {{[0-9]+}}(%rsp) ## 4-byte Spill -; AVX512F-NEXT: kmovw {{[0-9]+}}(%rsp), %k0 ## 2-byte Reload -; AVX512F-NEXT: kmovw %k0, %eax +; AVX512F-NEXT: kmovw {{[0-9]+}}(%rsp), %k1 ## 2-byte Reload +; AVX512F-NEXT: kmovw %k1, %eax ; AVX512F-NEXT: movl %eax, {{[0-9]+}}(%rsp) ## 4-byte Spill -; AVX512F-NEXT: kmovw {{[0-9]+}}(%rsp), %k0 ## 2-byte Reload -; AVX512F-NEXT: kmovw %k0, %eax +; AVX512F-NEXT: kmovw {{[0-9]+}}(%rsp), %k1 ## 2-byte Reload +; AVX512F-NEXT: kmovw %k1, %eax ; AVX512F-NEXT: movl %eax, {{[0-9]+}}(%rsp) ## 4-byte Spill -; AVX512F-NEXT: kmovw {{[0-9]+}}(%rsp), %k0 ## 2-byte Reload -; AVX512F-NEXT: kmovw %k0, %eax +; AVX512F-NEXT: kmovw {{[0-9]+}}(%rsp), %k1 ## 2-byte Reload +; AVX512F-NEXT: kmovw %k1, %eax ; AVX512F-NEXT: movl %eax, {{[0-9]+}}(%rsp) ## 4-byte Spill -; AVX512F-NEXT: kmovw {{[0-9]+}}(%rsp), %k0 ## 2-byte Reload -; AVX512F-NEXT: kmovw %k0, %eax +; AVX512F-NEXT: kmovw {{[0-9]+}}(%rsp), %k1 ## 2-byte Reload +; AVX512F-NEXT: kmovw %k1, %eax ; AVX512F-NEXT: movl %eax, {{[0-9]+}}(%rsp) ## 4-byte Spill -; AVX512F-NEXT: kmovw {{[0-9]+}}(%rsp), %k0 ## 2-byte Reload -; AVX512F-NEXT: kmovw %k0, %eax +; AVX512F-NEXT: kmovw {{[0-9]+}}(%rsp), %k1 ## 2-byte Reload +; AVX512F-NEXT: kmovw %k1, %eax ; AVX512F-NEXT: movl %eax, {{[0-9]+}}(%rsp) ## 4-byte Spill -; AVX512F-NEXT: kmovw {{[0-9]+}}(%rsp), %k0 ## 2-byte Reload -; AVX512F-NEXT: kmovw %k0, %eax +; AVX512F-NEXT: kmovw {{[0-9]+}}(%rsp), %k1 ## 2-byte Reload +; AVX512F-NEXT: kmovw %k1, %eax ; AVX512F-NEXT: movl %eax, {{[0-9]+}}(%rsp) ## 4-byte Spill -; AVX512F-NEXT: kmovw {{[0-9]+}}(%rsp), %k0 ## 2-byte Reload -; AVX512F-NEXT: kmovw %k0, %eax +; AVX512F-NEXT: kmovw {{[0-9]+}}(%rsp), %k1 ## 2-byte Reload +; AVX512F-NEXT: kmovw %k1, %eax ; AVX512F-NEXT: movl %eax, {{[0-9]+}}(%rsp) ## 4-byte Spill -; AVX512F-NEXT: kmovw (%rsp), %k0 ## 2-byte Reload -; AVX512F-NEXT: kmovw %k0, %eax +; AVX512F-NEXT: kmovw (%rsp), %k1 ## 2-byte Reload +; AVX512F-NEXT: kmovw %k1, %eax ; AVX512F-NEXT: movl %eax, (%rsp) ## 4-byte Spill -; AVX512F-NEXT: kmovw -{{[0-9]+}}(%rsp), %k0 ## 2-byte Reload -; AVX512F-NEXT: kmovw %k0, %eax +; AVX512F-NEXT: kmovw -{{[0-9]+}}(%rsp), %k1 ## 2-byte Reload +; AVX512F-NEXT: kmovw %k1, %eax ; AVX512F-NEXT: movl %eax, -{{[0-9]+}}(%rsp) ## 4-byte Spill -; AVX512F-NEXT: kmovw -{{[0-9]+}}(%rsp), %k0 ## 2-byte Reload -; AVX512F-NEXT: kmovw %k0, %eax +; AVX512F-NEXT: kmovw -{{[0-9]+}}(%rsp), %k1 ## 2-byte Reload +; AVX512F-NEXT: kmovw %k1, %eax ; AVX512F-NEXT: movl %eax, -{{[0-9]+}}(%rsp) ## 4-byte Spill -; AVX512F-NEXT: kmovw -{{[0-9]+}}(%rsp), %k0 ## 2-byte Reload -; AVX512F-NEXT: kmovw %k0, %eax +; AVX512F-NEXT: kmovw -{{[0-9]+}}(%rsp), 
%k1 ## 2-byte Reload +; AVX512F-NEXT: kmovw %k1, %eax ; AVX512F-NEXT: movl %eax, -{{[0-9]+}}(%rsp) ## 4-byte Spill -; AVX512F-NEXT: kmovw -{{[0-9]+}}(%rsp), %k0 ## 2-byte Reload -; AVX512F-NEXT: kmovw %k0, %eax +; AVX512F-NEXT: kmovw -{{[0-9]+}}(%rsp), %k1 ## 2-byte Reload +; AVX512F-NEXT: kmovw %k1, %eax ; AVX512F-NEXT: movl %eax, -{{[0-9]+}}(%rsp) ## 4-byte Spill -; AVX512F-NEXT: kmovw -{{[0-9]+}}(%rsp), %k0 ## 2-byte Reload -; AVX512F-NEXT: kmovw %k0, %eax +; AVX512F-NEXT: kmovw -{{[0-9]+}}(%rsp), %k1 ## 2-byte Reload +; AVX512F-NEXT: kmovw %k1, %eax ; AVX512F-NEXT: movl %eax, -{{[0-9]+}}(%rsp) ## 4-byte Spill -; AVX512F-NEXT: kmovw -{{[0-9]+}}(%rsp), %k0 ## 2-byte Reload -; AVX512F-NEXT: kmovw %k0, %eax +; AVX512F-NEXT: kmovw -{{[0-9]+}}(%rsp), %k1 ## 2-byte Reload +; AVX512F-NEXT: kmovw %k1, %eax ; AVX512F-NEXT: movl %eax, -{{[0-9]+}}(%rsp) ## 4-byte Spill -; AVX512F-NEXT: kmovw -{{[0-9]+}}(%rsp), %k0 ## 2-byte Reload -; AVX512F-NEXT: kmovw %k0, %eax +; AVX512F-NEXT: kmovw -{{[0-9]+}}(%rsp), %k1 ## 2-byte Reload +; AVX512F-NEXT: kmovw %k1, %eax ; AVX512F-NEXT: movl %eax, -{{[0-9]+}}(%rsp) ## 4-byte Spill -; AVX512F-NEXT: kmovw -{{[0-9]+}}(%rsp), %k0 ## 2-byte Reload -; AVX512F-NEXT: kmovw %k0, %eax +; AVX512F-NEXT: kmovw -{{[0-9]+}}(%rsp), %k1 ## 2-byte Reload +; AVX512F-NEXT: kmovw %k1, %eax ; AVX512F-NEXT: movl %eax, -{{[0-9]+}}(%rsp) ## 4-byte Spill -; AVX512F-NEXT: kmovw -{{[0-9]+}}(%rsp), %k0 ## 2-byte Reload -; AVX512F-NEXT: kmovw %k0, %eax +; AVX512F-NEXT: kmovw -{{[0-9]+}}(%rsp), %k1 ## 2-byte Reload +; AVX512F-NEXT: kmovw %k1, %eax ; AVX512F-NEXT: movl %eax, -{{[0-9]+}}(%rsp) ## 4-byte Spill -; AVX512F-NEXT: kmovw -{{[0-9]+}}(%rsp), %k0 ## 2-byte Reload -; AVX512F-NEXT: kmovw %k0, %eax +; AVX512F-NEXT: kmovw -{{[0-9]+}}(%rsp), %k1 ## 2-byte Reload +; AVX512F-NEXT: kmovw %k1, %eax ; AVX512F-NEXT: movl %eax, -{{[0-9]+}}(%rsp) ## 4-byte Spill -; AVX512F-NEXT: kmovw -{{[0-9]+}}(%rsp), %k0 ## 2-byte Reload -; AVX512F-NEXT: kmovw %k0, %eax +; AVX512F-NEXT: kmovw -{{[0-9]+}}(%rsp), %k1 ## 2-byte Reload +; AVX512F-NEXT: kmovw %k1, %eax ; AVX512F-NEXT: movl %eax, -{{[0-9]+}}(%rsp) ## 4-byte Spill -; AVX512F-NEXT: kmovw -{{[0-9]+}}(%rsp), %k0 ## 2-byte Reload -; AVX512F-NEXT: kmovw %k0, %eax +; AVX512F-NEXT: kmovw -{{[0-9]+}}(%rsp), %k1 ## 2-byte Reload +; AVX512F-NEXT: kmovw %k1, %eax ; AVX512F-NEXT: movl %eax, -{{[0-9]+}}(%rsp) ## 4-byte Spill -; AVX512F-NEXT: kmovw -{{[0-9]+}}(%rsp), %k0 ## 2-byte Reload -; AVX512F-NEXT: kmovw %k0, %eax +; AVX512F-NEXT: kmovw -{{[0-9]+}}(%rsp), %k1 ## 2-byte Reload +; AVX512F-NEXT: kmovw %k1, %eax ; AVX512F-NEXT: movl %eax, -{{[0-9]+}}(%rsp) ## 4-byte Spill -; AVX512F-NEXT: kmovw -{{[0-9]+}}(%rsp), %k0 ## 2-byte Reload -; AVX512F-NEXT: kmovw %k0, %eax +; AVX512F-NEXT: kmovw -{{[0-9]+}}(%rsp), %k1 ## 2-byte Reload +; AVX512F-NEXT: kmovw %k1, %eax ; AVX512F-NEXT: movl %eax, -{{[0-9]+}}(%rsp) ## 4-byte Spill -; AVX512F-NEXT: kmovw -{{[0-9]+}}(%rsp), %k0 ## 2-byte Reload -; AVX512F-NEXT: kmovw %k0, %eax +; AVX512F-NEXT: kmovw -{{[0-9]+}}(%rsp), %k1 ## 2-byte Reload +; AVX512F-NEXT: kmovw %k1, %eax ; AVX512F-NEXT: movl %eax, -{{[0-9]+}}(%rsp) ## 4-byte Spill ; AVX512F-NEXT: kmovw %k2, %eax ; AVX512F-NEXT: movl %eax, {{[0-9]+}}(%rsp) ## 4-byte Spill ; AVX512F-NEXT: kmovw %k3, %r12d ; AVX512F-NEXT: kmovw %k4, %r15d -; AVX512F-NEXT: kmovw %k5, %r14d -; AVX512F-NEXT: kmovw %k6, %ebx -; AVX512F-NEXT: kmovw %k7, %r11d -; AVX512F-NEXT: kmovw %k1, %r10d +; AVX512F-NEXT: kmovw %k6, %r14d +; AVX512F-NEXT: kmovw %k7, %ebx +; AVX512F-NEXT: kmovw %k0, %r11d +; 
AVX512F-NEXT: kmovw %k5, %r10d ; AVX512F-NEXT: kmovw -{{[0-9]+}}(%rsp), %k0 ## 2-byte Reload ; AVX512F-NEXT: kmovw %k0, %eax ; AVX512F-NEXT: movl %eax, -{{[0-9]+}}(%rsp) ## 4-byte Spill @@ -5482,7 +5562,7 @@ define <64 x i8> @test_mask_load_64xi8(<64 x i1> %mask, <64 x i8>* %addr, <64 x ; AVX512F-NEXT: kmovw %k0, %r15d ; AVX512F-NEXT: vpinsrb $12, %r14d, %xmm6, %xmm6 ; AVX512F-NEXT: kmovw {{[0-9]+}}(%rsp), %k0 ## 2-byte Reload -; AVX512F-NEXT: kmovw %k0, %r14d +; AVX512F-NEXT: kmovw %k0, %ebp ; AVX512F-NEXT: vpinsrb $13, %ebx, %xmm6, %xmm6 ; AVX512F-NEXT: kmovw {{[0-9]+}}(%rsp), %k0 ## 2-byte Reload ; AVX512F-NEXT: kmovw %k0, %ebx @@ -5510,7 +5590,7 @@ define <64 x i8> @test_mask_load_64xi8(<64 x i1> %mask, <64 x i8>* %addr, <64 x ; AVX512F-NEXT: vpinsrb $8, %r13d, %xmm2, %xmm2 ; AVX512F-NEXT: vpinsrb $9, %r12d, %xmm2, %xmm2 ; AVX512F-NEXT: vpinsrb $10, %r15d, %xmm2, %xmm2 -; AVX512F-NEXT: vpinsrb $11, %r14d, %xmm2, %xmm2 +; AVX512F-NEXT: vpinsrb $11, %ebp, %xmm2, %xmm2 ; AVX512F-NEXT: vpinsrb $12, %ebx, %xmm2, %xmm2 ; AVX512F-NEXT: vpinsrb $13, %r11d, %xmm2, %xmm2 ; AVX512F-NEXT: vpinsrb $14, %r10d, %xmm2, %xmm2 @@ -5609,6 +5689,7 @@ define <8 x i16> @test_mask_load_8xi16(<8 x i1> %mask, <8 x i16>* %addr, <8 x i1 ; AVX512F-NEXT: kshiftlw $15, %k1, %k0 ; AVX512F-NEXT: kshiftrw $15, %k0, %k0 ; AVX512F-NEXT: kmovw %k0, %eax +; AVX512F-NEXT: andl $1, %eax ; AVX512F-NEXT: ## implicit-def: %XMM0 ; AVX512F-NEXT: testb %al, %al ; AVX512F-NEXT: je LBB53_2 @@ -5619,6 +5700,7 @@ define <8 x i16> @test_mask_load_8xi16(<8 x i1> %mask, <8 x i16>* %addr, <8 x i1 ; AVX512F-NEXT: kshiftlw $14, %k1, %k0 ; AVX512F-NEXT: kshiftrw $15, %k0, %k0 ; AVX512F-NEXT: kmovw %k0, %eax +; AVX512F-NEXT: andl $1, %eax ; AVX512F-NEXT: testb %al, %al ; AVX512F-NEXT: je LBB53_4 ; AVX512F-NEXT: ## BB#3: ## %cond.load1 @@ -5627,6 +5709,7 @@ define <8 x i16> @test_mask_load_8xi16(<8 x i1> %mask, <8 x i16>* %addr, <8 x i1 ; AVX512F-NEXT: kshiftlw $13, %k1, %k0 ; AVX512F-NEXT: kshiftrw $15, %k0, %k0 ; AVX512F-NEXT: kmovw %k0, %eax +; AVX512F-NEXT: andl $1, %eax ; AVX512F-NEXT: testb %al, %al ; AVX512F-NEXT: je LBB53_6 ; AVX512F-NEXT: ## BB#5: ## %cond.load4 @@ -5635,6 +5718,7 @@ define <8 x i16> @test_mask_load_8xi16(<8 x i1> %mask, <8 x i16>* %addr, <8 x i1 ; AVX512F-NEXT: kshiftlw $12, %k1, %k0 ; AVX512F-NEXT: kshiftrw $15, %k0, %k0 ; AVX512F-NEXT: kmovw %k0, %eax +; AVX512F-NEXT: andl $1, %eax ; AVX512F-NEXT: testb %al, %al ; AVX512F-NEXT: je LBB53_8 ; AVX512F-NEXT: ## BB#7: ## %cond.load7 @@ -5643,6 +5727,7 @@ define <8 x i16> @test_mask_load_8xi16(<8 x i1> %mask, <8 x i16>* %addr, <8 x i1 ; AVX512F-NEXT: kshiftlw $11, %k1, %k0 ; AVX512F-NEXT: kshiftrw $15, %k0, %k0 ; AVX512F-NEXT: kmovw %k0, %eax +; AVX512F-NEXT: andl $1, %eax ; AVX512F-NEXT: testb %al, %al ; AVX512F-NEXT: je LBB53_10 ; AVX512F-NEXT: ## BB#9: ## %cond.load10 @@ -5651,6 +5736,7 @@ define <8 x i16> @test_mask_load_8xi16(<8 x i1> %mask, <8 x i16>* %addr, <8 x i1 ; AVX512F-NEXT: kshiftlw $10, %k1, %k0 ; AVX512F-NEXT: kshiftrw $15, %k0, %k0 ; AVX512F-NEXT: kmovw %k0, %eax +; AVX512F-NEXT: andl $1, %eax ; AVX512F-NEXT: testb %al, %al ; AVX512F-NEXT: je LBB53_12 ; AVX512F-NEXT: ## BB#11: ## %cond.load13 @@ -5659,6 +5745,7 @@ define <8 x i16> @test_mask_load_8xi16(<8 x i1> %mask, <8 x i16>* %addr, <8 x i1 ; AVX512F-NEXT: kshiftlw $9, %k1, %k0 ; AVX512F-NEXT: kshiftrw $15, %k0, %k0 ; AVX512F-NEXT: kmovw %k0, %eax +; AVX512F-NEXT: andl $1, %eax ; AVX512F-NEXT: testb %al, %al ; AVX512F-NEXT: je LBB53_14 ; AVX512F-NEXT: ## BB#13: ## %cond.load16 @@ -5667,6 +5754,7 @@ 
define <8 x i16> @test_mask_load_8xi16(<8 x i1> %mask, <8 x i16>* %addr, <8 x i1 ; AVX512F-NEXT: kshiftlw $8, %k1, %k0 ; AVX512F-NEXT: kshiftrw $15, %k0, %k0 ; AVX512F-NEXT: kmovw %k0, %eax +; AVX512F-NEXT: andl $1, %eax ; AVX512F-NEXT: testb %al, %al ; AVX512F-NEXT: je LBB53_16 ; AVX512F-NEXT: ## BB#15: ## %cond.load19 @@ -5963,6 +6051,7 @@ define <16 x i16> @test_mask_load_16xi16(<16 x i1> %mask, <16 x i16>* %addr, <16 ; AVX512F-NEXT: kshiftlw $15, %k1, %k0 ; AVX512F-NEXT: kshiftrw $15, %k0, %k0 ; AVX512F-NEXT: kmovw %k0, %eax +; AVX512F-NEXT: andl $1, %eax ; AVX512F-NEXT: ## implicit-def: %YMM0 ; AVX512F-NEXT: testb %al, %al ; AVX512F-NEXT: je LBB54_2 @@ -5973,6 +6062,7 @@ define <16 x i16> @test_mask_load_16xi16(<16 x i1> %mask, <16 x i16>* %addr, <16 ; AVX512F-NEXT: kshiftlw $14, %k1, %k0 ; AVX512F-NEXT: kshiftrw $15, %k0, %k0 ; AVX512F-NEXT: kmovw %k0, %eax +; AVX512F-NEXT: andl $1, %eax ; AVX512F-NEXT: testb %al, %al ; AVX512F-NEXT: je LBB54_4 ; AVX512F-NEXT: ## BB#3: ## %cond.load1 @@ -5982,6 +6072,7 @@ define <16 x i16> @test_mask_load_16xi16(<16 x i1> %mask, <16 x i16>* %addr, <16 ; AVX512F-NEXT: kshiftlw $13, %k1, %k0 ; AVX512F-NEXT: kshiftrw $15, %k0, %k0 ; AVX512F-NEXT: kmovw %k0, %eax +; AVX512F-NEXT: andl $1, %eax ; AVX512F-NEXT: testb %al, %al ; AVX512F-NEXT: je LBB54_6 ; AVX512F-NEXT: ## BB#5: ## %cond.load4 @@ -5991,6 +6082,7 @@ define <16 x i16> @test_mask_load_16xi16(<16 x i1> %mask, <16 x i16>* %addr, <16 ; AVX512F-NEXT: kshiftlw $12, %k1, %k0 ; AVX512F-NEXT: kshiftrw $15, %k0, %k0 ; AVX512F-NEXT: kmovw %k0, %eax +; AVX512F-NEXT: andl $1, %eax ; AVX512F-NEXT: testb %al, %al ; AVX512F-NEXT: je LBB54_8 ; AVX512F-NEXT: ## BB#7: ## %cond.load7 @@ -6000,6 +6092,7 @@ define <16 x i16> @test_mask_load_16xi16(<16 x i1> %mask, <16 x i16>* %addr, <16 ; AVX512F-NEXT: kshiftlw $11, %k1, %k0 ; AVX512F-NEXT: kshiftrw $15, %k0, %k0 ; AVX512F-NEXT: kmovw %k0, %eax +; AVX512F-NEXT: andl $1, %eax ; AVX512F-NEXT: testb %al, %al ; AVX512F-NEXT: je LBB54_10 ; AVX512F-NEXT: ## BB#9: ## %cond.load10 @@ -6009,6 +6102,7 @@ define <16 x i16> @test_mask_load_16xi16(<16 x i1> %mask, <16 x i16>* %addr, <16 ; AVX512F-NEXT: kshiftlw $10, %k1, %k0 ; AVX512F-NEXT: kshiftrw $15, %k0, %k0 ; AVX512F-NEXT: kmovw %k0, %eax +; AVX512F-NEXT: andl $1, %eax ; AVX512F-NEXT: testb %al, %al ; AVX512F-NEXT: je LBB54_12 ; AVX512F-NEXT: ## BB#11: ## %cond.load13 @@ -6018,6 +6112,7 @@ define <16 x i16> @test_mask_load_16xi16(<16 x i1> %mask, <16 x i16>* %addr, <16 ; AVX512F-NEXT: kshiftlw $9, %k1, %k0 ; AVX512F-NEXT: kshiftrw $15, %k0, %k0 ; AVX512F-NEXT: kmovw %k0, %eax +; AVX512F-NEXT: andl $1, %eax ; AVX512F-NEXT: testb %al, %al ; AVX512F-NEXT: je LBB54_14 ; AVX512F-NEXT: ## BB#13: ## %cond.load16 @@ -6027,6 +6122,7 @@ define <16 x i16> @test_mask_load_16xi16(<16 x i1> %mask, <16 x i16>* %addr, <16 ; AVX512F-NEXT: kshiftlw $8, %k1, %k0 ; AVX512F-NEXT: kshiftrw $15, %k0, %k0 ; AVX512F-NEXT: kmovw %k0, %eax +; AVX512F-NEXT: andl $1, %eax ; AVX512F-NEXT: testb %al, %al ; AVX512F-NEXT: je LBB54_16 ; AVX512F-NEXT: ## BB#15: ## %cond.load19 @@ -6036,6 +6132,7 @@ define <16 x i16> @test_mask_load_16xi16(<16 x i1> %mask, <16 x i16>* %addr, <16 ; AVX512F-NEXT: kshiftlw $7, %k1, %k0 ; AVX512F-NEXT: kshiftrw $15, %k0, %k0 ; AVX512F-NEXT: kmovw %k0, %eax +; AVX512F-NEXT: andl $1, %eax ; AVX512F-NEXT: testb %al, %al ; AVX512F-NEXT: je LBB54_18 ; AVX512F-NEXT: ## BB#17: ## %cond.load22 @@ -6046,6 +6143,7 @@ define <16 x i16> @test_mask_load_16xi16(<16 x i1> %mask, <16 x i16>* %addr, <16 ; AVX512F-NEXT: kshiftlw $6, %k1, %k0 ; 
AVX512F-NEXT: kshiftrw $15, %k0, %k0 ; AVX512F-NEXT: kmovw %k0, %eax +; AVX512F-NEXT: andl $1, %eax ; AVX512F-NEXT: testb %al, %al ; AVX512F-NEXT: je LBB54_20 ; AVX512F-NEXT: ## BB#19: ## %cond.load25 @@ -6056,6 +6154,7 @@ define <16 x i16> @test_mask_load_16xi16(<16 x i1> %mask, <16 x i16>* %addr, <16 ; AVX512F-NEXT: kshiftlw $5, %k1, %k0 ; AVX512F-NEXT: kshiftrw $15, %k0, %k0 ; AVX512F-NEXT: kmovw %k0, %eax +; AVX512F-NEXT: andl $1, %eax ; AVX512F-NEXT: testb %al, %al ; AVX512F-NEXT: je LBB54_22 ; AVX512F-NEXT: ## BB#21: ## %cond.load28 @@ -6066,6 +6165,7 @@ define <16 x i16> @test_mask_load_16xi16(<16 x i1> %mask, <16 x i16>* %addr, <16 ; AVX512F-NEXT: kshiftlw $4, %k1, %k0 ; AVX512F-NEXT: kshiftrw $15, %k0, %k0 ; AVX512F-NEXT: kmovw %k0, %eax +; AVX512F-NEXT: andl $1, %eax ; AVX512F-NEXT: testb %al, %al ; AVX512F-NEXT: je LBB54_24 ; AVX512F-NEXT: ## BB#23: ## %cond.load31 @@ -6076,6 +6176,7 @@ define <16 x i16> @test_mask_load_16xi16(<16 x i1> %mask, <16 x i16>* %addr, <16 ; AVX512F-NEXT: kshiftlw $3, %k1, %k0 ; AVX512F-NEXT: kshiftrw $15, %k0, %k0 ; AVX512F-NEXT: kmovw %k0, %eax +; AVX512F-NEXT: andl $1, %eax ; AVX512F-NEXT: testb %al, %al ; AVX512F-NEXT: je LBB54_26 ; AVX512F-NEXT: ## BB#25: ## %cond.load34 @@ -6086,6 +6187,7 @@ define <16 x i16> @test_mask_load_16xi16(<16 x i1> %mask, <16 x i16>* %addr, <16 ; AVX512F-NEXT: kshiftlw $2, %k1, %k0 ; AVX512F-NEXT: kshiftrw $15, %k0, %k0 ; AVX512F-NEXT: kmovw %k0, %eax +; AVX512F-NEXT: andl $1, %eax ; AVX512F-NEXT: testb %al, %al ; AVX512F-NEXT: je LBB54_28 ; AVX512F-NEXT: ## BB#27: ## %cond.load37 @@ -6096,6 +6198,7 @@ define <16 x i16> @test_mask_load_16xi16(<16 x i1> %mask, <16 x i16>* %addr, <16 ; AVX512F-NEXT: kshiftlw $1, %k1, %k0 ; AVX512F-NEXT: kshiftrw $15, %k0, %k0 ; AVX512F-NEXT: kmovw %k0, %eax +; AVX512F-NEXT: andl $1, %eax ; AVX512F-NEXT: testb %al, %al ; AVX512F-NEXT: je LBB54_30 ; AVX512F-NEXT: ## BB#29: ## %cond.load40 @@ -6105,6 +6208,7 @@ define <16 x i16> @test_mask_load_16xi16(<16 x i1> %mask, <16 x i16>* %addr, <16 ; AVX512F-NEXT: LBB54_30: ## %else41 ; AVX512F-NEXT: kshiftrw $15, %k1, %k0 ; AVX512F-NEXT: kmovw %k0, %eax +; AVX512F-NEXT: andl $1, %eax ; AVX512F-NEXT: testb %al, %al ; AVX512F-NEXT: je LBB54_32 ; AVX512F-NEXT: ## BB#31: ## %cond.load43 @@ -7022,6 +7126,7 @@ define void @test_mask_store_16xi8(<16 x i1> %mask, <16 x i8>* %addr, <16 x i8> ; AVX512F-NEXT: kshiftlw $15, %k0, %k1 ; AVX512F-NEXT: kshiftrw $15, %k1, %k1 ; AVX512F-NEXT: kmovw %k1, %eax +; AVX512F-NEXT: andl $1, %eax ; AVX512F-NEXT: testb %al, %al ; AVX512F-NEXT: je LBB56_2 ; AVX512F-NEXT: ## BB#1: ## %cond.store @@ -7030,6 +7135,7 @@ define void @test_mask_store_16xi8(<16 x i1> %mask, <16 x i8>* %addr, <16 x i8> ; AVX512F-NEXT: kshiftlw $14, %k0, %k1 ; AVX512F-NEXT: kshiftrw $15, %k1, %k1 ; AVX512F-NEXT: kmovw %k1, %eax +; AVX512F-NEXT: andl $1, %eax ; AVX512F-NEXT: testb %al, %al ; AVX512F-NEXT: je LBB56_4 ; AVX512F-NEXT: ## BB#3: ## %cond.store1 @@ -7038,6 +7144,7 @@ define void @test_mask_store_16xi8(<16 x i1> %mask, <16 x i8>* %addr, <16 x i8> ; AVX512F-NEXT: kshiftlw $13, %k0, %k1 ; AVX512F-NEXT: kshiftrw $15, %k1, %k1 ; AVX512F-NEXT: kmovw %k1, %eax +; AVX512F-NEXT: andl $1, %eax ; AVX512F-NEXT: testb %al, %al ; AVX512F-NEXT: je LBB56_6 ; AVX512F-NEXT: ## BB#5: ## %cond.store3 @@ -7046,6 +7153,7 @@ define void @test_mask_store_16xi8(<16 x i1> %mask, <16 x i8>* %addr, <16 x i8> ; AVX512F-NEXT: kshiftlw $12, %k0, %k1 ; AVX512F-NEXT: kshiftrw $15, %k1, %k1 ; AVX512F-NEXT: kmovw %k1, %eax +; AVX512F-NEXT: andl $1, %eax ; AVX512F-NEXT: testb 
%al, %al ; AVX512F-NEXT: je LBB56_8 ; AVX512F-NEXT: ## BB#7: ## %cond.store5 @@ -7054,6 +7162,7 @@ define void @test_mask_store_16xi8(<16 x i1> %mask, <16 x i8>* %addr, <16 x i8> ; AVX512F-NEXT: kshiftlw $11, %k0, %k1 ; AVX512F-NEXT: kshiftrw $15, %k1, %k1 ; AVX512F-NEXT: kmovw %k1, %eax +; AVX512F-NEXT: andl $1, %eax ; AVX512F-NEXT: testb %al, %al ; AVX512F-NEXT: je LBB56_10 ; AVX512F-NEXT: ## BB#9: ## %cond.store7 @@ -7062,6 +7171,7 @@ define void @test_mask_store_16xi8(<16 x i1> %mask, <16 x i8>* %addr, <16 x i8> ; AVX512F-NEXT: kshiftlw $10, %k0, %k1 ; AVX512F-NEXT: kshiftrw $15, %k1, %k1 ; AVX512F-NEXT: kmovw %k1, %eax +; AVX512F-NEXT: andl $1, %eax ; AVX512F-NEXT: testb %al, %al ; AVX512F-NEXT: je LBB56_12 ; AVX512F-NEXT: ## BB#11: ## %cond.store9 @@ -7070,6 +7180,7 @@ define void @test_mask_store_16xi8(<16 x i1> %mask, <16 x i8>* %addr, <16 x i8> ; AVX512F-NEXT: kshiftlw $9, %k0, %k1 ; AVX512F-NEXT: kshiftrw $15, %k1, %k1 ; AVX512F-NEXT: kmovw %k1, %eax +; AVX512F-NEXT: andl $1, %eax ; AVX512F-NEXT: testb %al, %al ; AVX512F-NEXT: je LBB56_14 ; AVX512F-NEXT: ## BB#13: ## %cond.store11 @@ -7078,6 +7189,7 @@ define void @test_mask_store_16xi8(<16 x i1> %mask, <16 x i8>* %addr, <16 x i8> ; AVX512F-NEXT: kshiftlw $8, %k0, %k1 ; AVX512F-NEXT: kshiftrw $15, %k1, %k1 ; AVX512F-NEXT: kmovw %k1, %eax +; AVX512F-NEXT: andl $1, %eax ; AVX512F-NEXT: testb %al, %al ; AVX512F-NEXT: je LBB56_16 ; AVX512F-NEXT: ## BB#15: ## %cond.store13 @@ -7086,6 +7198,7 @@ define void @test_mask_store_16xi8(<16 x i1> %mask, <16 x i8>* %addr, <16 x i8> ; AVX512F-NEXT: kshiftlw $7, %k0, %k1 ; AVX512F-NEXT: kshiftrw $15, %k1, %k1 ; AVX512F-NEXT: kmovw %k1, %eax +; AVX512F-NEXT: andl $1, %eax ; AVX512F-NEXT: testb %al, %al ; AVX512F-NEXT: je LBB56_18 ; AVX512F-NEXT: ## BB#17: ## %cond.store15 @@ -7094,6 +7207,7 @@ define void @test_mask_store_16xi8(<16 x i1> %mask, <16 x i8>* %addr, <16 x i8> ; AVX512F-NEXT: kshiftlw $6, %k0, %k1 ; AVX512F-NEXT: kshiftrw $15, %k1, %k1 ; AVX512F-NEXT: kmovw %k1, %eax +; AVX512F-NEXT: andl $1, %eax ; AVX512F-NEXT: testb %al, %al ; AVX512F-NEXT: je LBB56_20 ; AVX512F-NEXT: ## BB#19: ## %cond.store17 @@ -7102,6 +7216,7 @@ define void @test_mask_store_16xi8(<16 x i1> %mask, <16 x i8>* %addr, <16 x i8> ; AVX512F-NEXT: kshiftlw $5, %k0, %k1 ; AVX512F-NEXT: kshiftrw $15, %k1, %k1 ; AVX512F-NEXT: kmovw %k1, %eax +; AVX512F-NEXT: andl $1, %eax ; AVX512F-NEXT: testb %al, %al ; AVX512F-NEXT: je LBB56_22 ; AVX512F-NEXT: ## BB#21: ## %cond.store19 @@ -7110,6 +7225,7 @@ define void @test_mask_store_16xi8(<16 x i1> %mask, <16 x i8>* %addr, <16 x i8> ; AVX512F-NEXT: kshiftlw $4, %k0, %k1 ; AVX512F-NEXT: kshiftrw $15, %k1, %k1 ; AVX512F-NEXT: kmovw %k1, %eax +; AVX512F-NEXT: andl $1, %eax ; AVX512F-NEXT: testb %al, %al ; AVX512F-NEXT: je LBB56_24 ; AVX512F-NEXT: ## BB#23: ## %cond.store21 @@ -7118,6 +7234,7 @@ define void @test_mask_store_16xi8(<16 x i1> %mask, <16 x i8>* %addr, <16 x i8> ; AVX512F-NEXT: kshiftlw $3, %k0, %k1 ; AVX512F-NEXT: kshiftrw $15, %k1, %k1 ; AVX512F-NEXT: kmovw %k1, %eax +; AVX512F-NEXT: andl $1, %eax ; AVX512F-NEXT: testb %al, %al ; AVX512F-NEXT: je LBB56_26 ; AVX512F-NEXT: ## BB#25: ## %cond.store23 @@ -7126,6 +7243,7 @@ define void @test_mask_store_16xi8(<16 x i1> %mask, <16 x i8>* %addr, <16 x i8> ; AVX512F-NEXT: kshiftlw $2, %k0, %k1 ; AVX512F-NEXT: kshiftrw $15, %k1, %k1 ; AVX512F-NEXT: kmovw %k1, %eax +; AVX512F-NEXT: andl $1, %eax ; AVX512F-NEXT: testb %al, %al ; AVX512F-NEXT: je LBB56_28 ; AVX512F-NEXT: ## BB#27: ## %cond.store25 @@ -7134,6 +7252,7 @@ define void 
@test_mask_store_16xi8(<16 x i1> %mask, <16 x i8>* %addr, <16 x i8> ; AVX512F-NEXT: kshiftlw $1, %k0, %k1 ; AVX512F-NEXT: kshiftrw $15, %k1, %k1 ; AVX512F-NEXT: kmovw %k1, %eax +; AVX512F-NEXT: andl $1, %eax ; AVX512F-NEXT: testb %al, %al ; AVX512F-NEXT: je LBB56_30 ; AVX512F-NEXT: ## BB#29: ## %cond.store27 @@ -7141,6 +7260,7 @@ define void @test_mask_store_16xi8(<16 x i1> %mask, <16 x i8>* %addr, <16 x i8> ; AVX512F-NEXT: LBB56_30: ## %else28 ; AVX512F-NEXT: kshiftrw $15, %k0, %k0 ; AVX512F-NEXT: kmovw %k0, %eax +; AVX512F-NEXT: andl $1, %eax ; AVX512F-NEXT: testb %al, %al ; AVX512F-NEXT: je LBB56_32 ; AVX512F-NEXT: ## BB#31: ## %cond.store29 @@ -8653,6 +8773,7 @@ define void @test_mask_store_64xi8(<64 x i1> %mask, <64 x i8>* %addr, <64 x i8> ; AVX512F-NEXT: kshiftlw $15, %k0, %k1 ; AVX512F-NEXT: kshiftrw $15, %k1, %k1 ; AVX512F-NEXT: kmovw %k1, %eax +; AVX512F-NEXT: andl $1, %eax ; AVX512F-NEXT: testb %al, %al ; AVX512F-NEXT: je LBB58_2 ; AVX512F-NEXT: ## BB#1: ## %cond.store @@ -8661,6 +8782,7 @@ define void @test_mask_store_64xi8(<64 x i1> %mask, <64 x i8>* %addr, <64 x i8> ; AVX512F-NEXT: kshiftlw $14, %k0, %k1 ; AVX512F-NEXT: kshiftrw $15, %k1, %k1 ; AVX512F-NEXT: kmovw %k1, %eax +; AVX512F-NEXT: andl $1, %eax ; AVX512F-NEXT: testb %al, %al ; AVX512F-NEXT: je LBB58_4 ; AVX512F-NEXT: ## BB#3: ## %cond.store1 @@ -8669,6 +8791,7 @@ define void @test_mask_store_64xi8(<64 x i1> %mask, <64 x i8>* %addr, <64 x i8> ; AVX512F-NEXT: kshiftlw $13, %k0, %k1 ; AVX512F-NEXT: kshiftrw $15, %k1, %k1 ; AVX512F-NEXT: kmovw %k1, %eax +; AVX512F-NEXT: andl $1, %eax ; AVX512F-NEXT: testb %al, %al ; AVX512F-NEXT: je LBB58_6 ; AVX512F-NEXT: ## BB#5: ## %cond.store3 @@ -8677,6 +8800,7 @@ define void @test_mask_store_64xi8(<64 x i1> %mask, <64 x i8>* %addr, <64 x i8> ; AVX512F-NEXT: kshiftlw $12, %k0, %k1 ; AVX512F-NEXT: kshiftrw $15, %k1, %k1 ; AVX512F-NEXT: kmovw %k1, %eax +; AVX512F-NEXT: andl $1, %eax ; AVX512F-NEXT: testb %al, %al ; AVX512F-NEXT: je LBB58_8 ; AVX512F-NEXT: ## BB#7: ## %cond.store5 @@ -8685,6 +8809,7 @@ define void @test_mask_store_64xi8(<64 x i1> %mask, <64 x i8>* %addr, <64 x i8> ; AVX512F-NEXT: kshiftlw $11, %k0, %k1 ; AVX512F-NEXT: kshiftrw $15, %k1, %k1 ; AVX512F-NEXT: kmovw %k1, %eax +; AVX512F-NEXT: andl $1, %eax ; AVX512F-NEXT: testb %al, %al ; AVX512F-NEXT: je LBB58_10 ; AVX512F-NEXT: ## BB#9: ## %cond.store7 @@ -8693,6 +8818,7 @@ define void @test_mask_store_64xi8(<64 x i1> %mask, <64 x i8>* %addr, <64 x i8> ; AVX512F-NEXT: kshiftlw $10, %k0, %k1 ; AVX512F-NEXT: kshiftrw $15, %k1, %k1 ; AVX512F-NEXT: kmovw %k1, %eax +; AVX512F-NEXT: andl $1, %eax ; AVX512F-NEXT: testb %al, %al ; AVX512F-NEXT: je LBB58_12 ; AVX512F-NEXT: ## BB#11: ## %cond.store9 @@ -8701,6 +8827,7 @@ define void @test_mask_store_64xi8(<64 x i1> %mask, <64 x i8>* %addr, <64 x i8> ; AVX512F-NEXT: kshiftlw $9, %k0, %k1 ; AVX512F-NEXT: kshiftrw $15, %k1, %k1 ; AVX512F-NEXT: kmovw %k1, %eax +; AVX512F-NEXT: andl $1, %eax ; AVX512F-NEXT: testb %al, %al ; AVX512F-NEXT: je LBB58_14 ; AVX512F-NEXT: ## BB#13: ## %cond.store11 @@ -8709,6 +8836,7 @@ define void @test_mask_store_64xi8(<64 x i1> %mask, <64 x i8>* %addr, <64 x i8> ; AVX512F-NEXT: kshiftlw $8, %k0, %k1 ; AVX512F-NEXT: kshiftrw $15, %k1, %k1 ; AVX512F-NEXT: kmovw %k1, %eax +; AVX512F-NEXT: andl $1, %eax ; AVX512F-NEXT: testb %al, %al ; AVX512F-NEXT: je LBB58_16 ; AVX512F-NEXT: ## BB#15: ## %cond.store13 @@ -8717,6 +8845,7 @@ define void @test_mask_store_64xi8(<64 x i1> %mask, <64 x i8>* %addr, <64 x i8> ; AVX512F-NEXT: kshiftlw $7, %k0, %k1 ; AVX512F-NEXT: 
kshiftrw $15, %k1, %k1 ; AVX512F-NEXT: kmovw %k1, %eax +; AVX512F-NEXT: andl $1, %eax ; AVX512F-NEXT: testb %al, %al ; AVX512F-NEXT: je LBB58_18 ; AVX512F-NEXT: ## BB#17: ## %cond.store15 @@ -8725,6 +8854,7 @@ define void @test_mask_store_64xi8(<64 x i1> %mask, <64 x i8>* %addr, <64 x i8> ; AVX512F-NEXT: kshiftlw $6, %k0, %k1 ; AVX512F-NEXT: kshiftrw $15, %k1, %k1 ; AVX512F-NEXT: kmovw %k1, %eax +; AVX512F-NEXT: andl $1, %eax ; AVX512F-NEXT: testb %al, %al ; AVX512F-NEXT: je LBB58_20 ; AVX512F-NEXT: ## BB#19: ## %cond.store17 @@ -8733,6 +8863,7 @@ define void @test_mask_store_64xi8(<64 x i1> %mask, <64 x i8>* %addr, <64 x i8> ; AVX512F-NEXT: kshiftlw $5, %k0, %k1 ; AVX512F-NEXT: kshiftrw $15, %k1, %k1 ; AVX512F-NEXT: kmovw %k1, %eax +; AVX512F-NEXT: andl $1, %eax ; AVX512F-NEXT: testb %al, %al ; AVX512F-NEXT: je LBB58_22 ; AVX512F-NEXT: ## BB#21: ## %cond.store19 @@ -8741,6 +8872,7 @@ define void @test_mask_store_64xi8(<64 x i1> %mask, <64 x i8>* %addr, <64 x i8> ; AVX512F-NEXT: kshiftlw $4, %k0, %k1 ; AVX512F-NEXT: kshiftrw $15, %k1, %k1 ; AVX512F-NEXT: kmovw %k1, %eax +; AVX512F-NEXT: andl $1, %eax ; AVX512F-NEXT: testb %al, %al ; AVX512F-NEXT: je LBB58_24 ; AVX512F-NEXT: ## BB#23: ## %cond.store21 @@ -8749,6 +8881,7 @@ define void @test_mask_store_64xi8(<64 x i1> %mask, <64 x i8>* %addr, <64 x i8> ; AVX512F-NEXT: kshiftlw $3, %k0, %k1 ; AVX512F-NEXT: kshiftrw $15, %k1, %k1 ; AVX512F-NEXT: kmovw %k1, %eax +; AVX512F-NEXT: andl $1, %eax ; AVX512F-NEXT: testb %al, %al ; AVX512F-NEXT: je LBB58_26 ; AVX512F-NEXT: ## BB#25: ## %cond.store23 @@ -8758,6 +8891,7 @@ define void @test_mask_store_64xi8(<64 x i1> %mask, <64 x i8>* %addr, <64 x i8> ; AVX512F-NEXT: kshiftlw $2, %k0, %k1 ; AVX512F-NEXT: kshiftrw $15, %k1, %k1 ; AVX512F-NEXT: kmovw %k1, %eax +; AVX512F-NEXT: andl $1, %eax ; AVX512F-NEXT: testb %al, %al ; AVX512F-NEXT: je LBB58_28 ; AVX512F-NEXT: ## BB#27: ## %cond.store25 @@ -8767,6 +8901,7 @@ define void @test_mask_store_64xi8(<64 x i1> %mask, <64 x i8>* %addr, <64 x i8> ; AVX512F-NEXT: kshiftlw $1, %k0, %k1 ; AVX512F-NEXT: kshiftrw $15, %k1, %k1 ; AVX512F-NEXT: kmovw %k1, %eax +; AVX512F-NEXT: andl $1, %eax ; AVX512F-NEXT: testb %al, %al ; AVX512F-NEXT: je LBB58_30 ; AVX512F-NEXT: ## BB#29: ## %cond.store27 @@ -8775,6 +8910,7 @@ define void @test_mask_store_64xi8(<64 x i1> %mask, <64 x i8>* %addr, <64 x i8> ; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k1 ; AVX512F-NEXT: kshiftrw $15, %k0, %k0 ; AVX512F-NEXT: kmovw %k0, %eax +; AVX512F-NEXT: andl $1, %eax ; AVX512F-NEXT: testb %al, %al ; AVX512F-NEXT: je LBB58_32 ; AVX512F-NEXT: ## BB#31: ## %cond.store29 @@ -8783,6 +8919,7 @@ define void @test_mask_store_64xi8(<64 x i1> %mask, <64 x i8>* %addr, <64 x i8> ; AVX512F-NEXT: kshiftlw $15, %k1, %k0 ; AVX512F-NEXT: kshiftrw $15, %k0, %k0 ; AVX512F-NEXT: kmovw %k0, %eax +; AVX512F-NEXT: andl $1, %eax ; AVX512F-NEXT: testb %al, %al ; AVX512F-NEXT: je LBB58_34 ; AVX512F-NEXT: ## BB#33: ## %cond.store31 @@ -8792,6 +8929,7 @@ define void @test_mask_store_64xi8(<64 x i1> %mask, <64 x i8>* %addr, <64 x i8> ; AVX512F-NEXT: kshiftlw $14, %k1, %k0 ; AVX512F-NEXT: kshiftrw $15, %k0, %k0 ; AVX512F-NEXT: kmovw %k0, %eax +; AVX512F-NEXT: andl $1, %eax ; AVX512F-NEXT: testb %al, %al ; AVX512F-NEXT: je LBB58_36 ; AVX512F-NEXT: ## BB#35: ## %cond.store33 @@ -8801,6 +8939,7 @@ define void @test_mask_store_64xi8(<64 x i1> %mask, <64 x i8>* %addr, <64 x i8> ; AVX512F-NEXT: kshiftlw $13, %k1, %k0 ; AVX512F-NEXT: kshiftrw $15, %k0, %k0 ; AVX512F-NEXT: kmovw %k0, %eax +; AVX512F-NEXT: andl $1, %eax ; AVX512F-NEXT: 
testb %al, %al ; AVX512F-NEXT: je LBB58_38 ; AVX512F-NEXT: ## BB#37: ## %cond.store35 @@ -8810,6 +8949,7 @@ define void @test_mask_store_64xi8(<64 x i1> %mask, <64 x i8>* %addr, <64 x i8> ; AVX512F-NEXT: kshiftlw $12, %k1, %k0 ; AVX512F-NEXT: kshiftrw $15, %k0, %k0 ; AVX512F-NEXT: kmovw %k0, %eax +; AVX512F-NEXT: andl $1, %eax ; AVX512F-NEXT: testb %al, %al ; AVX512F-NEXT: je LBB58_40 ; AVX512F-NEXT: ## BB#39: ## %cond.store37 @@ -8819,6 +8959,7 @@ define void @test_mask_store_64xi8(<64 x i1> %mask, <64 x i8>* %addr, <64 x i8> ; AVX512F-NEXT: kshiftlw $11, %k1, %k0 ; AVX512F-NEXT: kshiftrw $15, %k0, %k0 ; AVX512F-NEXT: kmovw %k0, %eax +; AVX512F-NEXT: andl $1, %eax ; AVX512F-NEXT: testb %al, %al ; AVX512F-NEXT: je LBB58_42 ; AVX512F-NEXT: ## BB#41: ## %cond.store39 @@ -8828,6 +8969,7 @@ define void @test_mask_store_64xi8(<64 x i1> %mask, <64 x i8>* %addr, <64 x i8> ; AVX512F-NEXT: kshiftlw $10, %k1, %k0 ; AVX512F-NEXT: kshiftrw $15, %k0, %k0 ; AVX512F-NEXT: kmovw %k0, %eax +; AVX512F-NEXT: andl $1, %eax ; AVX512F-NEXT: testb %al, %al ; AVX512F-NEXT: je LBB58_44 ; AVX512F-NEXT: ## BB#43: ## %cond.store41 @@ -8837,6 +8979,7 @@ define void @test_mask_store_64xi8(<64 x i1> %mask, <64 x i8>* %addr, <64 x i8> ; AVX512F-NEXT: kshiftlw $9, %k1, %k0 ; AVX512F-NEXT: kshiftrw $15, %k0, %k0 ; AVX512F-NEXT: kmovw %k0, %eax +; AVX512F-NEXT: andl $1, %eax ; AVX512F-NEXT: testb %al, %al ; AVX512F-NEXT: je LBB58_46 ; AVX512F-NEXT: ## BB#45: ## %cond.store43 @@ -8846,6 +8989,7 @@ define void @test_mask_store_64xi8(<64 x i1> %mask, <64 x i8>* %addr, <64 x i8> ; AVX512F-NEXT: kshiftlw $8, %k1, %k0 ; AVX512F-NEXT: kshiftrw $15, %k0, %k0 ; AVX512F-NEXT: kmovw %k0, %eax +; AVX512F-NEXT: andl $1, %eax ; AVX512F-NEXT: testb %al, %al ; AVX512F-NEXT: je LBB58_48 ; AVX512F-NEXT: ## BB#47: ## %cond.store45 @@ -8855,6 +8999,7 @@ define void @test_mask_store_64xi8(<64 x i1> %mask, <64 x i8>* %addr, <64 x i8> ; AVX512F-NEXT: kshiftlw $7, %k1, %k0 ; AVX512F-NEXT: kshiftrw $15, %k0, %k0 ; AVX512F-NEXT: kmovw %k0, %eax +; AVX512F-NEXT: andl $1, %eax ; AVX512F-NEXT: testb %al, %al ; AVX512F-NEXT: je LBB58_50 ; AVX512F-NEXT: ## BB#49: ## %cond.store47 @@ -8864,6 +9009,7 @@ define void @test_mask_store_64xi8(<64 x i1> %mask, <64 x i8>* %addr, <64 x i8> ; AVX512F-NEXT: kshiftlw $6, %k1, %k0 ; AVX512F-NEXT: kshiftrw $15, %k0, %k0 ; AVX512F-NEXT: kmovw %k0, %eax +; AVX512F-NEXT: andl $1, %eax ; AVX512F-NEXT: testb %al, %al ; AVX512F-NEXT: je LBB58_52 ; AVX512F-NEXT: ## BB#51: ## %cond.store49 @@ -8873,6 +9019,7 @@ define void @test_mask_store_64xi8(<64 x i1> %mask, <64 x i8>* %addr, <64 x i8> ; AVX512F-NEXT: kshiftlw $5, %k1, %k0 ; AVX512F-NEXT: kshiftrw $15, %k0, %k0 ; AVX512F-NEXT: kmovw %k0, %eax +; AVX512F-NEXT: andl $1, %eax ; AVX512F-NEXT: testb %al, %al ; AVX512F-NEXT: je LBB58_54 ; AVX512F-NEXT: ## BB#53: ## %cond.store51 @@ -8882,6 +9029,7 @@ define void @test_mask_store_64xi8(<64 x i1> %mask, <64 x i8>* %addr, <64 x i8> ; AVX512F-NEXT: kshiftlw $4, %k1, %k0 ; AVX512F-NEXT: kshiftrw $15, %k0, %k0 ; AVX512F-NEXT: kmovw %k0, %eax +; AVX512F-NEXT: andl $1, %eax ; AVX512F-NEXT: testb %al, %al ; AVX512F-NEXT: je LBB58_56 ; AVX512F-NEXT: ## BB#55: ## %cond.store53 @@ -8891,6 +9039,7 @@ define void @test_mask_store_64xi8(<64 x i1> %mask, <64 x i8>* %addr, <64 x i8> ; AVX512F-NEXT: kshiftlw $3, %k1, %k0 ; AVX512F-NEXT: kshiftrw $15, %k0, %k0 ; AVX512F-NEXT: kmovw %k0, %eax +; AVX512F-NEXT: andl $1, %eax ; AVX512F-NEXT: testb %al, %al ; AVX512F-NEXT: je LBB58_58 ; AVX512F-NEXT: ## BB#57: ## %cond.store55 @@ -8901,6 +9050,7 @@ 
define void @test_mask_store_64xi8(<64 x i1> %mask, <64 x i8>* %addr, <64 x i8> ; AVX512F-NEXT: kshiftlw $2, %k1, %k0 ; AVX512F-NEXT: kshiftrw $15, %k0, %k0 ; AVX512F-NEXT: kmovw %k0, %eax +; AVX512F-NEXT: andl $1, %eax ; AVX512F-NEXT: testb %al, %al ; AVX512F-NEXT: je LBB58_60 ; AVX512F-NEXT: ## BB#59: ## %cond.store57 @@ -8911,6 +9061,7 @@ define void @test_mask_store_64xi8(<64 x i1> %mask, <64 x i8>* %addr, <64 x i8> ; AVX512F-NEXT: kshiftlw $1, %k1, %k0 ; AVX512F-NEXT: kshiftrw $15, %k0, %k0 ; AVX512F-NEXT: kmovw %k0, %eax +; AVX512F-NEXT: andl $1, %eax ; AVX512F-NEXT: testb %al, %al ; AVX512F-NEXT: je LBB58_62 ; AVX512F-NEXT: ## BB#61: ## %cond.store59 @@ -8920,6 +9071,7 @@ define void @test_mask_store_64xi8(<64 x i1> %mask, <64 x i8>* %addr, <64 x i8> ; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k0 ; AVX512F-NEXT: kshiftrw $15, %k1, %k1 ; AVX512F-NEXT: kmovw %k1, %eax +; AVX512F-NEXT: andl $1, %eax ; AVX512F-NEXT: testb %al, %al ; AVX512F-NEXT: je LBB58_64 ; AVX512F-NEXT: ## BB#63: ## %cond.store61 @@ -8929,6 +9081,7 @@ define void @test_mask_store_64xi8(<64 x i1> %mask, <64 x i8>* %addr, <64 x i8> ; AVX512F-NEXT: kshiftlw $15, %k0, %k1 ; AVX512F-NEXT: kshiftrw $15, %k1, %k1 ; AVX512F-NEXT: kmovw %k1, %eax +; AVX512F-NEXT: andl $1, %eax ; AVX512F-NEXT: testb %al, %al ; AVX512F-NEXT: je LBB58_66 ; AVX512F-NEXT: ## BB#65: ## %cond.store63 @@ -8937,6 +9090,7 @@ define void @test_mask_store_64xi8(<64 x i1> %mask, <64 x i8>* %addr, <64 x i8> ; AVX512F-NEXT: kshiftlw $14, %k0, %k1 ; AVX512F-NEXT: kshiftrw $15, %k1, %k1 ; AVX512F-NEXT: kmovw %k1, %eax +; AVX512F-NEXT: andl $1, %eax ; AVX512F-NEXT: testb %al, %al ; AVX512F-NEXT: je LBB58_68 ; AVX512F-NEXT: ## BB#67: ## %cond.store65 @@ -8945,6 +9099,7 @@ define void @test_mask_store_64xi8(<64 x i1> %mask, <64 x i8>* %addr, <64 x i8> ; AVX512F-NEXT: kshiftlw $13, %k0, %k1 ; AVX512F-NEXT: kshiftrw $15, %k1, %k1 ; AVX512F-NEXT: kmovw %k1, %eax +; AVX512F-NEXT: andl $1, %eax ; AVX512F-NEXT: testb %al, %al ; AVX512F-NEXT: je LBB58_70 ; AVX512F-NEXT: ## BB#69: ## %cond.store67 @@ -8953,6 +9108,7 @@ define void @test_mask_store_64xi8(<64 x i1> %mask, <64 x i8>* %addr, <64 x i8> ; AVX512F-NEXT: kshiftlw $12, %k0, %k1 ; AVX512F-NEXT: kshiftrw $15, %k1, %k1 ; AVX512F-NEXT: kmovw %k1, %eax +; AVX512F-NEXT: andl $1, %eax ; AVX512F-NEXT: testb %al, %al ; AVX512F-NEXT: je LBB58_72 ; AVX512F-NEXT: ## BB#71: ## %cond.store69 @@ -8961,6 +9117,7 @@ define void @test_mask_store_64xi8(<64 x i1> %mask, <64 x i8>* %addr, <64 x i8> ; AVX512F-NEXT: kshiftlw $11, %k0, %k1 ; AVX512F-NEXT: kshiftrw $15, %k1, %k1 ; AVX512F-NEXT: kmovw %k1, %eax +; AVX512F-NEXT: andl $1, %eax ; AVX512F-NEXT: testb %al, %al ; AVX512F-NEXT: je LBB58_74 ; AVX512F-NEXT: ## BB#73: ## %cond.store71 @@ -8969,6 +9126,7 @@ define void @test_mask_store_64xi8(<64 x i1> %mask, <64 x i8>* %addr, <64 x i8> ; AVX512F-NEXT: kshiftlw $10, %k0, %k1 ; AVX512F-NEXT: kshiftrw $15, %k1, %k1 ; AVX512F-NEXT: kmovw %k1, %eax +; AVX512F-NEXT: andl $1, %eax ; AVX512F-NEXT: testb %al, %al ; AVX512F-NEXT: je LBB58_76 ; AVX512F-NEXT: ## BB#75: ## %cond.store73 @@ -8977,6 +9135,7 @@ define void @test_mask_store_64xi8(<64 x i1> %mask, <64 x i8>* %addr, <64 x i8> ; AVX512F-NEXT: kshiftlw $9, %k0, %k1 ; AVX512F-NEXT: kshiftrw $15, %k1, %k1 ; AVX512F-NEXT: kmovw %k1, %eax +; AVX512F-NEXT: andl $1, %eax ; AVX512F-NEXT: testb %al, %al ; AVX512F-NEXT: je LBB58_78 ; AVX512F-NEXT: ## BB#77: ## %cond.store75 @@ -8985,6 +9144,7 @@ define void @test_mask_store_64xi8(<64 x i1> %mask, <64 x i8>* %addr, <64 x i8> ; AVX512F-NEXT: 
kshiftlw $8, %k0, %k1 ; AVX512F-NEXT: kshiftrw $15, %k1, %k1 ; AVX512F-NEXT: kmovw %k1, %eax +; AVX512F-NEXT: andl $1, %eax ; AVX512F-NEXT: testb %al, %al ; AVX512F-NEXT: je LBB58_80 ; AVX512F-NEXT: ## BB#79: ## %cond.store77 @@ -8993,6 +9153,7 @@ define void @test_mask_store_64xi8(<64 x i1> %mask, <64 x i8>* %addr, <64 x i8> ; AVX512F-NEXT: kshiftlw $7, %k0, %k1 ; AVX512F-NEXT: kshiftrw $15, %k1, %k1 ; AVX512F-NEXT: kmovw %k1, %eax +; AVX512F-NEXT: andl $1, %eax ; AVX512F-NEXT: testb %al, %al ; AVX512F-NEXT: je LBB58_82 ; AVX512F-NEXT: ## BB#81: ## %cond.store79 @@ -9001,6 +9162,7 @@ define void @test_mask_store_64xi8(<64 x i1> %mask, <64 x i8>* %addr, <64 x i8> ; AVX512F-NEXT: kshiftlw $6, %k0, %k1 ; AVX512F-NEXT: kshiftrw $15, %k1, %k1 ; AVX512F-NEXT: kmovw %k1, %eax +; AVX512F-NEXT: andl $1, %eax ; AVX512F-NEXT: testb %al, %al ; AVX512F-NEXT: je LBB58_84 ; AVX512F-NEXT: ## BB#83: ## %cond.store81 @@ -9009,6 +9171,7 @@ define void @test_mask_store_64xi8(<64 x i1> %mask, <64 x i8>* %addr, <64 x i8> ; AVX512F-NEXT: kshiftlw $5, %k0, %k1 ; AVX512F-NEXT: kshiftrw $15, %k1, %k1 ; AVX512F-NEXT: kmovw %k1, %eax +; AVX512F-NEXT: andl $1, %eax ; AVX512F-NEXT: testb %al, %al ; AVX512F-NEXT: je LBB58_86 ; AVX512F-NEXT: ## BB#85: ## %cond.store83 @@ -9017,6 +9180,7 @@ define void @test_mask_store_64xi8(<64 x i1> %mask, <64 x i8>* %addr, <64 x i8> ; AVX512F-NEXT: kshiftlw $4, %k0, %k1 ; AVX512F-NEXT: kshiftrw $15, %k1, %k1 ; AVX512F-NEXT: kmovw %k1, %eax +; AVX512F-NEXT: andl $1, %eax ; AVX512F-NEXT: testb %al, %al ; AVX512F-NEXT: je LBB58_88 ; AVX512F-NEXT: ## BB#87: ## %cond.store85 @@ -9025,6 +9189,7 @@ define void @test_mask_store_64xi8(<64 x i1> %mask, <64 x i8>* %addr, <64 x i8> ; AVX512F-NEXT: kshiftlw $3, %k0, %k1 ; AVX512F-NEXT: kshiftrw $15, %k1, %k1 ; AVX512F-NEXT: kmovw %k1, %eax +; AVX512F-NEXT: andl $1, %eax ; AVX512F-NEXT: testb %al, %al ; AVX512F-NEXT: je LBB58_90 ; AVX512F-NEXT: ## BB#89: ## %cond.store87 @@ -9034,6 +9199,7 @@ define void @test_mask_store_64xi8(<64 x i1> %mask, <64 x i8>* %addr, <64 x i8> ; AVX512F-NEXT: kshiftlw $2, %k0, %k1 ; AVX512F-NEXT: kshiftrw $15, %k1, %k1 ; AVX512F-NEXT: kmovw %k1, %eax +; AVX512F-NEXT: andl $1, %eax ; AVX512F-NEXT: testb %al, %al ; AVX512F-NEXT: je LBB58_92 ; AVX512F-NEXT: ## BB#91: ## %cond.store89 @@ -9043,6 +9209,7 @@ define void @test_mask_store_64xi8(<64 x i1> %mask, <64 x i8>* %addr, <64 x i8> ; AVX512F-NEXT: kshiftlw $1, %k0, %k1 ; AVX512F-NEXT: kshiftrw $15, %k1, %k1 ; AVX512F-NEXT: kmovw %k1, %eax +; AVX512F-NEXT: andl $1, %eax ; AVX512F-NEXT: testb %al, %al ; AVX512F-NEXT: je LBB58_94 ; AVX512F-NEXT: ## BB#93: ## %cond.store91 @@ -9051,6 +9218,7 @@ define void @test_mask_store_64xi8(<64 x i1> %mask, <64 x i8>* %addr, <64 x i8> ; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k1 ; AVX512F-NEXT: kshiftrw $15, %k0, %k0 ; AVX512F-NEXT: kmovw %k0, %eax +; AVX512F-NEXT: andl $1, %eax ; AVX512F-NEXT: testb %al, %al ; AVX512F-NEXT: je LBB58_96 ; AVX512F-NEXT: ## BB#95: ## %cond.store93 @@ -9059,6 +9227,7 @@ define void @test_mask_store_64xi8(<64 x i1> %mask, <64 x i8>* %addr, <64 x i8> ; AVX512F-NEXT: kshiftlw $15, %k1, %k0 ; AVX512F-NEXT: kshiftrw $15, %k0, %k0 ; AVX512F-NEXT: kmovw %k0, %eax +; AVX512F-NEXT: andl $1, %eax ; AVX512F-NEXT: testb %al, %al ; AVX512F-NEXT: je LBB58_98 ; AVX512F-NEXT: ## BB#97: ## %cond.store95 @@ -9068,6 +9237,7 @@ define void @test_mask_store_64xi8(<64 x i1> %mask, <64 x i8>* %addr, <64 x i8> ; AVX512F-NEXT: kshiftlw $14, %k1, %k0 ; AVX512F-NEXT: kshiftrw $15, %k0, %k0 ; AVX512F-NEXT: kmovw %k0, %eax +; 
AVX512F-NEXT: andl $1, %eax ; AVX512F-NEXT: testb %al, %al ; AVX512F-NEXT: je LBB58_100 ; AVX512F-NEXT: ## BB#99: ## %cond.store97 @@ -9077,6 +9247,7 @@ define void @test_mask_store_64xi8(<64 x i1> %mask, <64 x i8>* %addr, <64 x i8> ; AVX512F-NEXT: kshiftlw $13, %k1, %k0 ; AVX512F-NEXT: kshiftrw $15, %k0, %k0 ; AVX512F-NEXT: kmovw %k0, %eax +; AVX512F-NEXT: andl $1, %eax ; AVX512F-NEXT: testb %al, %al ; AVX512F-NEXT: je LBB58_102 ; AVX512F-NEXT: ## BB#101: ## %cond.store99 @@ -9086,6 +9257,7 @@ define void @test_mask_store_64xi8(<64 x i1> %mask, <64 x i8>* %addr, <64 x i8> ; AVX512F-NEXT: kshiftlw $12, %k1, %k0 ; AVX512F-NEXT: kshiftrw $15, %k0, %k0 ; AVX512F-NEXT: kmovw %k0, %eax +; AVX512F-NEXT: andl $1, %eax ; AVX512F-NEXT: testb %al, %al ; AVX512F-NEXT: je LBB58_104 ; AVX512F-NEXT: ## BB#103: ## %cond.store101 @@ -9095,6 +9267,7 @@ define void @test_mask_store_64xi8(<64 x i1> %mask, <64 x i8>* %addr, <64 x i8> ; AVX512F-NEXT: kshiftlw $11, %k1, %k0 ; AVX512F-NEXT: kshiftrw $15, %k0, %k0 ; AVX512F-NEXT: kmovw %k0, %eax +; AVX512F-NEXT: andl $1, %eax ; AVX512F-NEXT: testb %al, %al ; AVX512F-NEXT: je LBB58_106 ; AVX512F-NEXT: ## BB#105: ## %cond.store103 @@ -9104,6 +9277,7 @@ define void @test_mask_store_64xi8(<64 x i1> %mask, <64 x i8>* %addr, <64 x i8> ; AVX512F-NEXT: kshiftlw $10, %k1, %k0 ; AVX512F-NEXT: kshiftrw $15, %k0, %k0 ; AVX512F-NEXT: kmovw %k0, %eax +; AVX512F-NEXT: andl $1, %eax ; AVX512F-NEXT: testb %al, %al ; AVX512F-NEXT: je LBB58_108 ; AVX512F-NEXT: ## BB#107: ## %cond.store105 @@ -9113,6 +9287,7 @@ define void @test_mask_store_64xi8(<64 x i1> %mask, <64 x i8>* %addr, <64 x i8> ; AVX512F-NEXT: kshiftlw $9, %k1, %k0 ; AVX512F-NEXT: kshiftrw $15, %k0, %k0 ; AVX512F-NEXT: kmovw %k0, %eax +; AVX512F-NEXT: andl $1, %eax ; AVX512F-NEXT: testb %al, %al ; AVX512F-NEXT: je LBB58_110 ; AVX512F-NEXT: ## BB#109: ## %cond.store107 @@ -9122,6 +9297,7 @@ define void @test_mask_store_64xi8(<64 x i1> %mask, <64 x i8>* %addr, <64 x i8> ; AVX512F-NEXT: kshiftlw $8, %k1, %k0 ; AVX512F-NEXT: kshiftrw $15, %k0, %k0 ; AVX512F-NEXT: kmovw %k0, %eax +; AVX512F-NEXT: andl $1, %eax ; AVX512F-NEXT: testb %al, %al ; AVX512F-NEXT: je LBB58_112 ; AVX512F-NEXT: ## BB#111: ## %cond.store109 @@ -9131,6 +9307,7 @@ define void @test_mask_store_64xi8(<64 x i1> %mask, <64 x i8>* %addr, <64 x i8> ; AVX512F-NEXT: kshiftlw $7, %k1, %k0 ; AVX512F-NEXT: kshiftrw $15, %k0, %k0 ; AVX512F-NEXT: kmovw %k0, %eax +; AVX512F-NEXT: andl $1, %eax ; AVX512F-NEXT: testb %al, %al ; AVX512F-NEXT: je LBB58_114 ; AVX512F-NEXT: ## BB#113: ## %cond.store111 @@ -9140,6 +9317,7 @@ define void @test_mask_store_64xi8(<64 x i1> %mask, <64 x i8>* %addr, <64 x i8> ; AVX512F-NEXT: kshiftlw $6, %k1, %k0 ; AVX512F-NEXT: kshiftrw $15, %k0, %k0 ; AVX512F-NEXT: kmovw %k0, %eax +; AVX512F-NEXT: andl $1, %eax ; AVX512F-NEXT: testb %al, %al ; AVX512F-NEXT: je LBB58_116 ; AVX512F-NEXT: ## BB#115: ## %cond.store113 @@ -9149,6 +9327,7 @@ define void @test_mask_store_64xi8(<64 x i1> %mask, <64 x i8>* %addr, <64 x i8> ; AVX512F-NEXT: kshiftlw $5, %k1, %k0 ; AVX512F-NEXT: kshiftrw $15, %k0, %k0 ; AVX512F-NEXT: kmovw %k0, %eax +; AVX512F-NEXT: andl $1, %eax ; AVX512F-NEXT: testb %al, %al ; AVX512F-NEXT: je LBB58_118 ; AVX512F-NEXT: ## BB#117: ## %cond.store115 @@ -9158,6 +9337,7 @@ define void @test_mask_store_64xi8(<64 x i1> %mask, <64 x i8>* %addr, <64 x i8> ; AVX512F-NEXT: kshiftlw $4, %k1, %k0 ; AVX512F-NEXT: kshiftrw $15, %k0, %k0 ; AVX512F-NEXT: kmovw %k0, %eax +; AVX512F-NEXT: andl $1, %eax ; AVX512F-NEXT: testb %al, %al ; AVX512F-NEXT: je 
LBB58_120 ; AVX512F-NEXT: ## BB#119: ## %cond.store117 @@ -9167,6 +9347,7 @@ define void @test_mask_store_64xi8(<64 x i1> %mask, <64 x i8>* %addr, <64 x i8> ; AVX512F-NEXT: kshiftlw $3, %k1, %k0 ; AVX512F-NEXT: kshiftrw $15, %k0, %k0 ; AVX512F-NEXT: kmovw %k0, %eax +; AVX512F-NEXT: andl $1, %eax ; AVX512F-NEXT: testb %al, %al ; AVX512F-NEXT: je LBB58_122 ; AVX512F-NEXT: ## BB#121: ## %cond.store119 @@ -9176,6 +9357,7 @@ define void @test_mask_store_64xi8(<64 x i1> %mask, <64 x i8>* %addr, <64 x i8> ; AVX512F-NEXT: kshiftlw $2, %k1, %k0 ; AVX512F-NEXT: kshiftrw $15, %k0, %k0 ; AVX512F-NEXT: kmovw %k0, %eax +; AVX512F-NEXT: andl $1, %eax ; AVX512F-NEXT: testb %al, %al ; AVX512F-NEXT: je LBB58_124 ; AVX512F-NEXT: ## BB#123: ## %cond.store121 @@ -9185,6 +9367,7 @@ define void @test_mask_store_64xi8(<64 x i1> %mask, <64 x i8>* %addr, <64 x i8> ; AVX512F-NEXT: kshiftlw $1, %k1, %k0 ; AVX512F-NEXT: kshiftrw $15, %k0, %k0 ; AVX512F-NEXT: kmovw %k0, %eax +; AVX512F-NEXT: andl $1, %eax ; AVX512F-NEXT: testb %al, %al ; AVX512F-NEXT: je LBB58_126 ; AVX512F-NEXT: ## BB#125: ## %cond.store123 @@ -9193,6 +9376,7 @@ define void @test_mask_store_64xi8(<64 x i1> %mask, <64 x i8>* %addr, <64 x i8> ; AVX512F-NEXT: LBB58_126: ## %else124 ; AVX512F-NEXT: kshiftrw $15, %k1, %k0 ; AVX512F-NEXT: kmovw %k0, %eax +; AVX512F-NEXT: andl $1, %eax ; AVX512F-NEXT: testb %al, %al ; AVX512F-NEXT: je LBB58_128 ; AVX512F-NEXT: ## BB#127: ## %cond.store125 @@ -9273,6 +9457,7 @@ define void @test_mask_store_8xi16(<8 x i1> %mask, <8 x i16>* %addr, <8 x i16> % ; AVX512F-NEXT: kshiftlw $15, %k0, %k1 ; AVX512F-NEXT: kshiftrw $15, %k1, %k1 ; AVX512F-NEXT: kmovw %k1, %eax +; AVX512F-NEXT: andl $1, %eax ; AVX512F-NEXT: testb %al, %al ; AVX512F-NEXT: je LBB59_2 ; AVX512F-NEXT: ## BB#1: ## %cond.store @@ -9281,6 +9466,7 @@ define void @test_mask_store_8xi16(<8 x i1> %mask, <8 x i16>* %addr, <8 x i16> % ; AVX512F-NEXT: kshiftlw $14, %k0, %k1 ; AVX512F-NEXT: kshiftrw $15, %k1, %k1 ; AVX512F-NEXT: kmovw %k1, %eax +; AVX512F-NEXT: andl $1, %eax ; AVX512F-NEXT: testb %al, %al ; AVX512F-NEXT: je LBB59_4 ; AVX512F-NEXT: ## BB#3: ## %cond.store1 @@ -9289,6 +9475,7 @@ define void @test_mask_store_8xi16(<8 x i1> %mask, <8 x i16>* %addr, <8 x i16> % ; AVX512F-NEXT: kshiftlw $13, %k0, %k1 ; AVX512F-NEXT: kshiftrw $15, %k1, %k1 ; AVX512F-NEXT: kmovw %k1, %eax +; AVX512F-NEXT: andl $1, %eax ; AVX512F-NEXT: testb %al, %al ; AVX512F-NEXT: je LBB59_6 ; AVX512F-NEXT: ## BB#5: ## %cond.store3 @@ -9297,6 +9484,7 @@ define void @test_mask_store_8xi16(<8 x i1> %mask, <8 x i16>* %addr, <8 x i16> % ; AVX512F-NEXT: kshiftlw $12, %k0, %k1 ; AVX512F-NEXT: kshiftrw $15, %k1, %k1 ; AVX512F-NEXT: kmovw %k1, %eax +; AVX512F-NEXT: andl $1, %eax ; AVX512F-NEXT: testb %al, %al ; AVX512F-NEXT: je LBB59_8 ; AVX512F-NEXT: ## BB#7: ## %cond.store5 @@ -9305,6 +9493,7 @@ define void @test_mask_store_8xi16(<8 x i1> %mask, <8 x i16>* %addr, <8 x i16> % ; AVX512F-NEXT: kshiftlw $11, %k0, %k1 ; AVX512F-NEXT: kshiftrw $15, %k1, %k1 ; AVX512F-NEXT: kmovw %k1, %eax +; AVX512F-NEXT: andl $1, %eax ; AVX512F-NEXT: testb %al, %al ; AVX512F-NEXT: je LBB59_10 ; AVX512F-NEXT: ## BB#9: ## %cond.store7 @@ -9313,6 +9502,7 @@ define void @test_mask_store_8xi16(<8 x i1> %mask, <8 x i16>* %addr, <8 x i16> % ; AVX512F-NEXT: kshiftlw $10, %k0, %k1 ; AVX512F-NEXT: kshiftrw $15, %k1, %k1 ; AVX512F-NEXT: kmovw %k1, %eax +; AVX512F-NEXT: andl $1, %eax ; AVX512F-NEXT: testb %al, %al ; AVX512F-NEXT: je LBB59_12 ; AVX512F-NEXT: ## BB#11: ## %cond.store9 @@ -9321,6 +9511,7 @@ define void 
@test_mask_store_8xi16(<8 x i1> %mask, <8 x i16>* %addr, <8 x i16> % ; AVX512F-NEXT: kshiftlw $9, %k0, %k1 ; AVX512F-NEXT: kshiftrw $15, %k1, %k1 ; AVX512F-NEXT: kmovw %k1, %eax +; AVX512F-NEXT: andl $1, %eax ; AVX512F-NEXT: testb %al, %al ; AVX512F-NEXT: je LBB59_14 ; AVX512F-NEXT: ## BB#13: ## %cond.store11 @@ -9329,6 +9520,7 @@ define void @test_mask_store_8xi16(<8 x i1> %mask, <8 x i16>* %addr, <8 x i16> % ; AVX512F-NEXT: kshiftlw $8, %k0, %k0 ; AVX512F-NEXT: kshiftrw $15, %k0, %k0 ; AVX512F-NEXT: kmovw %k0, %eax +; AVX512F-NEXT: andl $1, %eax ; AVX512F-NEXT: testb %al, %al ; AVX512F-NEXT: je LBB59_16 ; AVX512F-NEXT: ## BB#15: ## %cond.store13 @@ -9574,6 +9766,7 @@ define void @test_mask_store_16xi16(<16 x i1> %mask, <16 x i16>* %addr, <16 x i1 ; AVX512F-NEXT: kshiftlw $15, %k0, %k1 ; AVX512F-NEXT: kshiftrw $15, %k1, %k1 ; AVX512F-NEXT: kmovw %k1, %eax +; AVX512F-NEXT: andl $1, %eax ; AVX512F-NEXT: testb %al, %al ; AVX512F-NEXT: je LBB60_2 ; AVX512F-NEXT: ## BB#1: ## %cond.store @@ -9582,6 +9775,7 @@ define void @test_mask_store_16xi16(<16 x i1> %mask, <16 x i16>* %addr, <16 x i1 ; AVX512F-NEXT: kshiftlw $14, %k0, %k1 ; AVX512F-NEXT: kshiftrw $15, %k1, %k1 ; AVX512F-NEXT: kmovw %k1, %eax +; AVX512F-NEXT: andl $1, %eax ; AVX512F-NEXT: testb %al, %al ; AVX512F-NEXT: je LBB60_4 ; AVX512F-NEXT: ## BB#3: ## %cond.store1 @@ -9590,6 +9784,7 @@ define void @test_mask_store_16xi16(<16 x i1> %mask, <16 x i16>* %addr, <16 x i1 ; AVX512F-NEXT: kshiftlw $13, %k0, %k1 ; AVX512F-NEXT: kshiftrw $15, %k1, %k1 ; AVX512F-NEXT: kmovw %k1, %eax +; AVX512F-NEXT: andl $1, %eax ; AVX512F-NEXT: testb %al, %al ; AVX512F-NEXT: je LBB60_6 ; AVX512F-NEXT: ## BB#5: ## %cond.store3 @@ -9598,6 +9793,7 @@ define void @test_mask_store_16xi16(<16 x i1> %mask, <16 x i16>* %addr, <16 x i1 ; AVX512F-NEXT: kshiftlw $12, %k0, %k1 ; AVX512F-NEXT: kshiftrw $15, %k1, %k1 ; AVX512F-NEXT: kmovw %k1, %eax +; AVX512F-NEXT: andl $1, %eax ; AVX512F-NEXT: testb %al, %al ; AVX512F-NEXT: je LBB60_8 ; AVX512F-NEXT: ## BB#7: ## %cond.store5 @@ -9606,6 +9802,7 @@ define void @test_mask_store_16xi16(<16 x i1> %mask, <16 x i16>* %addr, <16 x i1 ; AVX512F-NEXT: kshiftlw $11, %k0, %k1 ; AVX512F-NEXT: kshiftrw $15, %k1, %k1 ; AVX512F-NEXT: kmovw %k1, %eax +; AVX512F-NEXT: andl $1, %eax ; AVX512F-NEXT: testb %al, %al ; AVX512F-NEXT: je LBB60_10 ; AVX512F-NEXT: ## BB#9: ## %cond.store7 @@ -9614,6 +9811,7 @@ define void @test_mask_store_16xi16(<16 x i1> %mask, <16 x i16>* %addr, <16 x i1 ; AVX512F-NEXT: kshiftlw $10, %k0, %k1 ; AVX512F-NEXT: kshiftrw $15, %k1, %k1 ; AVX512F-NEXT: kmovw %k1, %eax +; AVX512F-NEXT: andl $1, %eax ; AVX512F-NEXT: testb %al, %al ; AVX512F-NEXT: je LBB60_12 ; AVX512F-NEXT: ## BB#11: ## %cond.store9 @@ -9622,6 +9820,7 @@ define void @test_mask_store_16xi16(<16 x i1> %mask, <16 x i16>* %addr, <16 x i1 ; AVX512F-NEXT: kshiftlw $9, %k0, %k1 ; AVX512F-NEXT: kshiftrw $15, %k1, %k1 ; AVX512F-NEXT: kmovw %k1, %eax +; AVX512F-NEXT: andl $1, %eax ; AVX512F-NEXT: testb %al, %al ; AVX512F-NEXT: je LBB60_14 ; AVX512F-NEXT: ## BB#13: ## %cond.store11 @@ -9630,6 +9829,7 @@ define void @test_mask_store_16xi16(<16 x i1> %mask, <16 x i16>* %addr, <16 x i1 ; AVX512F-NEXT: kshiftlw $8, %k0, %k1 ; AVX512F-NEXT: kshiftrw $15, %k1, %k1 ; AVX512F-NEXT: kmovw %k1, %eax +; AVX512F-NEXT: andl $1, %eax ; AVX512F-NEXT: testb %al, %al ; AVX512F-NEXT: je LBB60_16 ; AVX512F-NEXT: ## BB#15: ## %cond.store13 @@ -9638,6 +9838,7 @@ define void @test_mask_store_16xi16(<16 x i1> %mask, <16 x i16>* %addr, <16 x i1 ; AVX512F-NEXT: kshiftlw $7, %k0, %k1 ; 
AVX512F-NEXT: kshiftrw $15, %k1, %k1 ; AVX512F-NEXT: kmovw %k1, %eax +; AVX512F-NEXT: andl $1, %eax ; AVX512F-NEXT: testb %al, %al ; AVX512F-NEXT: je LBB60_18 ; AVX512F-NEXT: ## BB#17: ## %cond.store15 @@ -9647,6 +9848,7 @@ define void @test_mask_store_16xi16(<16 x i1> %mask, <16 x i16>* %addr, <16 x i1 ; AVX512F-NEXT: kshiftlw $6, %k0, %k1 ; AVX512F-NEXT: kshiftrw $15, %k1, %k1 ; AVX512F-NEXT: kmovw %k1, %eax +; AVX512F-NEXT: andl $1, %eax ; AVX512F-NEXT: testb %al, %al ; AVX512F-NEXT: je LBB60_20 ; AVX512F-NEXT: ## BB#19: ## %cond.store17 @@ -9656,6 +9858,7 @@ define void @test_mask_store_16xi16(<16 x i1> %mask, <16 x i16>* %addr, <16 x i1 ; AVX512F-NEXT: kshiftlw $5, %k0, %k1 ; AVX512F-NEXT: kshiftrw $15, %k1, %k1 ; AVX512F-NEXT: kmovw %k1, %eax +; AVX512F-NEXT: andl $1, %eax ; AVX512F-NEXT: testb %al, %al ; AVX512F-NEXT: je LBB60_22 ; AVX512F-NEXT: ## BB#21: ## %cond.store19 @@ -9665,6 +9868,7 @@ define void @test_mask_store_16xi16(<16 x i1> %mask, <16 x i16>* %addr, <16 x i1 ; AVX512F-NEXT: kshiftlw $4, %k0, %k1 ; AVX512F-NEXT: kshiftrw $15, %k1, %k1 ; AVX512F-NEXT: kmovw %k1, %eax +; AVX512F-NEXT: andl $1, %eax ; AVX512F-NEXT: testb %al, %al ; AVX512F-NEXT: je LBB60_24 ; AVX512F-NEXT: ## BB#23: ## %cond.store21 @@ -9674,6 +9878,7 @@ define void @test_mask_store_16xi16(<16 x i1> %mask, <16 x i16>* %addr, <16 x i1 ; AVX512F-NEXT: kshiftlw $3, %k0, %k1 ; AVX512F-NEXT: kshiftrw $15, %k1, %k1 ; AVX512F-NEXT: kmovw %k1, %eax +; AVX512F-NEXT: andl $1, %eax ; AVX512F-NEXT: testb %al, %al ; AVX512F-NEXT: je LBB60_26 ; AVX512F-NEXT: ## BB#25: ## %cond.store23 @@ -9683,6 +9888,7 @@ define void @test_mask_store_16xi16(<16 x i1> %mask, <16 x i16>* %addr, <16 x i1 ; AVX512F-NEXT: kshiftlw $2, %k0, %k1 ; AVX512F-NEXT: kshiftrw $15, %k1, %k1 ; AVX512F-NEXT: kmovw %k1, %eax +; AVX512F-NEXT: andl $1, %eax ; AVX512F-NEXT: testb %al, %al ; AVX512F-NEXT: je LBB60_28 ; AVX512F-NEXT: ## BB#27: ## %cond.store25 @@ -9692,6 +9898,7 @@ define void @test_mask_store_16xi16(<16 x i1> %mask, <16 x i16>* %addr, <16 x i1 ; AVX512F-NEXT: kshiftlw $1, %k0, %k1 ; AVX512F-NEXT: kshiftrw $15, %k1, %k1 ; AVX512F-NEXT: kmovw %k1, %eax +; AVX512F-NEXT: andl $1, %eax ; AVX512F-NEXT: testb %al, %al ; AVX512F-NEXT: je LBB60_30 ; AVX512F-NEXT: ## BB#29: ## %cond.store27 @@ -9700,6 +9907,7 @@ define void @test_mask_store_16xi16(<16 x i1> %mask, <16 x i16>* %addr, <16 x i1 ; AVX512F-NEXT: LBB60_30: ## %else28 ; AVX512F-NEXT: kshiftrw $15, %k0, %k0 ; AVX512F-NEXT: kmovw %k0, %eax +; AVX512F-NEXT: andl $1, %eax ; AVX512F-NEXT: testb %al, %al ; AVX512F-NEXT: je LBB60_32 ; AVX512F-NEXT: ## BB#31: ## %cond.store29 diff --git a/llvm/test/CodeGen/X86/pr27591.ll b/llvm/test/CodeGen/X86/pr27591.ll index 11f5de4..53a23b3 100644 --- a/llvm/test/CodeGen/X86/pr27591.ll +++ b/llvm/test/CodeGen/X86/pr27591.ll @@ -8,8 +8,9 @@ define void @test1(i32 %x) #0 { ; CHECK-NEXT: pushq %rax ; CHECK-NEXT: testl %edi, %edi ; CHECK-NEXT: setne %al -; CHECK-NEXT: movb %al, %cl -; CHECK-NEXT: kmovw %ecx, %k0 +; CHECK-NEXT: movb %al, %dil +; CHECK-NEXT: andl $1, %edi +; CHECK-NEXT: kmovw %edi, %k0 ; CHECK-NEXT: kmovb %k0, %eax ; CHECK-NEXT: andb $1, %al ; CHECK-NEXT: movzbl %al, %edi @@ -28,14 +29,16 @@ define void @test2(i32 %x) #0 { ; CHECK-NEXT: pushq %rax ; CHECK-NEXT: testl %edi, %edi ; CHECK-NEXT: setne %al -; CHECK-NEXT: movb %al, %cl -; CHECK-NEXT: kmovw %ecx, %k0 -; CHECK-NEXT: kmovw %k0, %ecx -; CHECK-NEXT: movb %cl, %al +; CHECK-NEXT: movb %al, %dil +; CHECK-NEXT: andl $1, %edi +; CHECK-NEXT: kmovw %edi, %k0 +; CHECK-NEXT: kmovw %k0, %edi +; 
CHECK-NEXT: andl $1, %edi
+; CHECK-NEXT: movb %dil, %al
 ; CHECK-NEXT: xorl %edi, %edi
 ; CHECK-NEXT: testb %al, %al
-; CHECK-NEXT: movl $-1, %edx
-; CHECK-NEXT: cmovnel %edx, %edi
+; CHECK-NEXT: movl $-1, %ecx
+; CHECK-NEXT: cmovnel %ecx, %edi
 ; CHECK-NEXT: callq callee2
 ; CHECK-NEXT: popq %rax
 ; CHECK-NEXT: retq
diff --git a/llvm/test/CodeGen/X86/pr28173.ll b/llvm/test/CodeGen/X86/pr28173.ll
index 31ea4ff..81c10bb 100644
--- a/llvm/test/CodeGen/X86/pr28173.ll
+++ b/llvm/test/CodeGen/X86/pr28173.ll
@@ -5,13 +5,12 @@ target triple = "x86_64-unknown-linux-gnu"
 ; Note that the kmovs should really *not* appear in the output, this is an
 ; artifact of the current poor lowering. This is tracked by PR28175.
+; CHECK-LABEL: @foo64
+; CHECK: kmov
+; CHECK: kmov
+; CHECK: orq $-2, %rax
+; CHECK: ret
 define i64 @foo64(i1 zeroext %i, i32 %j) #0 {
-; CHECK-LABEL: foo64:
-; CHECK: # BB#0:
-; CHECK-NEXT: # kill
-; CHECK-NEXT: orq $-2, %rdi
-; CHECK-NEXT: movq %rdi, %rax
-; CHECK-NEXT: retq
 br label %bb
 bb:
@@ -23,12 +22,12 @@ end:
 ret i64 %v
 }
+; CHECK-LABEL: @foo16
+; CHECK: kmov
+; CHECK: kmov
+; CHECK: orl $65534, %eax
+; CHECK: retq
 define i16 @foo16(i1 zeroext %i, i32 %j) #0 {
-; CHECK-LABEL: foo16:
-; CHECK: # BB#0:
-; CHECK-NEXT: orl $65534, %edi # imm = 0xFFFE
-; CHECK-NEXT: movl %edi, %eax
-; CHECK-NEXT: retq
 br label %bb
 bb:
diff --git a/llvm/test/CodeGen/X86/xaluo.ll b/llvm/test/CodeGen/X86/xaluo.ll
index eb0fd86..75743b0 100644
--- a/llvm/test/CodeGen/X86/xaluo.ll
+++ b/llvm/test/CodeGen/X86/xaluo.ll
@@ -738,10 +738,10 @@ define i1 @bug27873(i64 %c1, i1 %c2) {
 ; KNL-LABEL: bug27873:
 ; KNL: ## BB#0:
 ; KNL-NEXT: andl $1, %esi
-; KNL-NEXT: kmovw %esi, %k0
 ; KNL-NEXT: movl $160, %ecx
 ; KNL-NEXT: movq %rdi, %rax
 ; KNL-NEXT: mulq %rcx
+; KNL-NEXT: kmovw %esi, %k0
 ; KNL-NEXT: seto %al
 ; KNL-NEXT: kmovw %eax, %k1
 ; KNL-NEXT: korw %k1, %k0, %k0
-- 
2.7.4