From a4124e455e641db1e18d4221d2dacb31953fd13b Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Thu, 12 Nov 2020 20:18:50 -0800 Subject: [PATCH] [X86] When storing v1i1/v2i1/v4i1 to memory, make sure we store zeros in the rest of the byte We can't store garbage in the unused bits. It's possible that something like a zextload from i1/i2/i4 is created to read the memory. Those zextloads would be legalized assuming the extra bits are 0. I'm not sure that the code in lowerStore is executed for the v1i1/v2i1/v4i1 case. It looks like the DAG combine in combineStore may have converted them to v8i1 first. And I think we're missing some cases to avoid going to the stack in the first place. But I don't have time to investigate those things at the moment, so I wanted to focus on the correctness issue. Should fix PR48147. Reviewed By: RKSimon Differential Revision: https://reviews.llvm.org/D91294 --- llvm/lib/Target/X86/X86ISelLowering.cpp | 19 +++-- llvm/lib/Target/X86/X86InstrAVX512.td | 3 - .../X86/avx512-extract-subvector-load-store.ll | 52 +++++++++++++ .../test/CodeGen/X86/avx512-load-trunc-store-i1.ll | 63 ++++++++++++---- llvm/test/CodeGen/X86/avx512-mask-op.ll | 72 ++++++++++++++++++ llvm/test/CodeGen/X86/avx512-select.ll | 16 ++++ .../X86/stack-folding-avx512vp2intersect.ll | 30 +++++--- llvm/test/CodeGen/X86/vec_saddo.ll | 2 + llvm/test/CodeGen/X86/vec_smulo.ll | 87 +++++++++++----------- llvm/test/CodeGen/X86/vec_ssubo.ll | 2 + llvm/test/CodeGen/X86/vec_uaddo.ll | 4 +- llvm/test/CodeGen/X86/vec_umulo.ll | 65 ++++++++-------- llvm/test/CodeGen/X86/vec_usubo.ll | 4 +- 13 files changed, 313 insertions(+), 106 deletions(-) diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index 5b0e9fa..f1956d7 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -23870,17 +23870,22 @@ static SDValue LowerStore(SDValue Op, const X86Subtarget &Subtarget, // Without AVX512DQ, we need to use a scalar type for v2i1/v4i1/v8i1 stores. if (StoredVal.getValueType().isVector() && StoredVal.getValueType().getVectorElementType() == MVT::i1) { - assert(StoredVal.getValueType().getVectorNumElements() <= 8 && - "Unexpected VT"); + unsigned NumElts = StoredVal.getValueType().getVectorNumElements(); + assert(NumElts <= 8 && "Unexpected VT"); assert(!St->isTruncatingStore() && "Expected non-truncating store"); assert(Subtarget.hasAVX512() && !Subtarget.hasDQI() && "Expected AVX512F without AVX512DQI"); + // We must pad with zeros to ensure we store zeroes to any unused bits. StoredVal = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v16i1, DAG.getUNDEF(MVT::v16i1), StoredVal, DAG.getIntPtrConstant(0, dl)); StoredVal = DAG.getBitcast(MVT::i16, StoredVal); StoredVal = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, StoredVal); + // Make sure we store zeros in the extra bits. + if (NumElts < 8) + StoredVal = DAG.getZeroExtendInReg(StoredVal, dl, + MVT::getIntegerVT(NumElts)); return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(), St->getPointerInfo(), St->getOriginalAlign(), @@ -44971,17 +44976,21 @@ static SDValue combineStore(SDNode *N, SelectionDAG &DAG, if (VT == MVT::v1i1 && VT == StVT && Subtarget.hasAVX512() && StoredVal.getOpcode() == ISD::SCALAR_TO_VECTOR && StoredVal.getOperand(0).getValueType() == MVT::i8) { - return DAG.getStore(St->getChain(), dl, StoredVal.getOperand(0), + SDValue Val = StoredVal.getOperand(0); + // We must store zeros to the unused bits.
+ Val = DAG.getZeroExtendInReg(Val, dl, MVT::i1); + return DAG.getStore(St->getChain(), dl, Val, St->getBasePtr(), St->getPointerInfo(), St->getOriginalAlign(), St->getMemOperand()->getFlags()); } // Widen v2i1/v4i1 stores to v8i1. - if ((VT == MVT::v2i1 || VT == MVT::v4i1) && VT == StVT && + if ((VT == MVT::v1i1 || VT == MVT::v2i1 || VT == MVT::v4i1) && VT == StVT && Subtarget.hasAVX512()) { unsigned NumConcats = 8 / VT.getVectorNumElements(); - SmallVector Ops(NumConcats, DAG.getUNDEF(VT)); + // We must store zeros to the unused bits. + SmallVector Ops(NumConcats, DAG.getConstant(0, dl, VT)); Ops[0] = StoredVal; StoredVal = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i1, Ops); return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(), diff --git a/llvm/lib/Target/X86/X86InstrAVX512.td b/llvm/lib/Target/X86/X86InstrAVX512.td index 3d8fbdc..339fd00 100644 --- a/llvm/lib/Target/X86/X86InstrAVX512.td +++ b/llvm/lib/Target/X86/X86InstrAVX512.td @@ -2921,9 +2921,6 @@ def : Pat<(i64 (bitconvert (v64i1 VK64:$src))), // Load/store kreg let Predicates = [HasDQI] in { - def : Pat<(store VK1:$src, addr:$dst), - (KMOVBmk addr:$dst, (COPY_TO_REGCLASS VK1:$src, VK8))>; - def : Pat<(v1i1 (load addr:$src)), (COPY_TO_REGCLASS (KMOVBkm addr:$src), VK1)>; def : Pat<(v2i1 (load addr:$src)), diff --git a/llvm/test/CodeGen/X86/avx512-extract-subvector-load-store.ll b/llvm/test/CodeGen/X86/avx512-extract-subvector-load-store.ll index 7b7ddf7..deed569 100644 --- a/llvm/test/CodeGen/X86/avx512-extract-subvector-load-store.ll +++ b/llvm/test/CodeGen/X86/avx512-extract-subvector-load-store.ll @@ -593,6 +593,8 @@ define void @load_v2i1_broadcast_1_v1i1_store(<2 x i1>* %a0,<1 x i1>* %a1) { ; AVX512: # %bb.0: ; AVX512-NEXT: kmovb (%rdi), %k0 ; AVX512-NEXT: kshiftrb $1, %k0, %k0 +; AVX512-NEXT: kshiftlb $7, %k0, %k0 +; AVX512-NEXT: kshiftrb $7, %k0, %k0 ; AVX512-NEXT: kmovb %k0, (%rsi) ; AVX512-NEXT: retq ; @@ -600,6 +602,8 @@ define void @load_v2i1_broadcast_1_v1i1_store(<2 x i1>* %a0,<1 x i1>* %a1) { ; AVX512NOTDQ: # %bb.0: ; AVX512NOTDQ-NEXT: kmovw (%rdi), %k0 ; AVX512NOTDQ-NEXT: kshiftrw $1, %k0, %k0 +; AVX512NOTDQ-NEXT: kshiftlw $15, %k0, %k0 +; AVX512NOTDQ-NEXT: kshiftrw $15, %k0, %k0 ; AVX512NOTDQ-NEXT: kmovd %k0, %eax ; AVX512NOTDQ-NEXT: movb %al, (%rsi) ; AVX512NOTDQ-NEXT: retq @@ -619,6 +623,8 @@ define void @load_v3i1_broadcast_1_v1i1_store(<3 x i1>* %a0,<1 x i1>* %a1) { ; AVX512-NEXT: cmovel %ecx, %eax ; AVX512-NEXT: kmovd %eax, %k0 ; AVX512-NEXT: kshiftrb $1, %k0, %k0 +; AVX512-NEXT: kshiftlb $7, %k0, %k0 +; AVX512-NEXT: kshiftrb $7, %k0, %k0 ; AVX512-NEXT: kmovb %k0, (%rsi) ; AVX512-NEXT: retq ; @@ -632,6 +638,8 @@ define void @load_v3i1_broadcast_1_v1i1_store(<3 x i1>* %a0,<1 x i1>* %a1) { ; AVX512NOTDQ-NEXT: cmovel %ecx, %eax ; AVX512NOTDQ-NEXT: kmovd %eax, %k0 ; AVX512NOTDQ-NEXT: kshiftrw $1, %k0, %k0 +; AVX512NOTDQ-NEXT: kshiftlw $15, %k0, %k0 +; AVX512NOTDQ-NEXT: kshiftrw $15, %k0, %k0 ; AVX512NOTDQ-NEXT: kmovd %k0, %eax ; AVX512NOTDQ-NEXT: movb %al, (%rsi) ; AVX512NOTDQ-NEXT: retq @@ -649,6 +657,8 @@ define void @load_v3i1_broadcast_2_v1i1_store(<3 x i1>* %a0,<1 x i1>* %a1) { ; AVX512-NEXT: cmovel %eax, %ecx ; AVX512-NEXT: kmovd %ecx, %k0 ; AVX512-NEXT: kshiftrb $2, %k0, %k0 +; AVX512-NEXT: kshiftlb $7, %k0, %k0 +; AVX512-NEXT: kshiftrb $7, %k0, %k0 ; AVX512-NEXT: kmovb %k0, (%rsi) ; AVX512-NEXT: retq ; @@ -660,6 +670,8 @@ define void @load_v3i1_broadcast_2_v1i1_store(<3 x i1>* %a0,<1 x i1>* %a1) { ; AVX512NOTDQ-NEXT: cmovel %eax, %ecx ; AVX512NOTDQ-NEXT: kmovd %ecx, %k0 ; 
AVX512NOTDQ-NEXT: kshiftrw $2, %k0, %k0 +; AVX512NOTDQ-NEXT: kshiftlw $15, %k0, %k0 +; AVX512NOTDQ-NEXT: kshiftrw $15, %k0, %k0 ; AVX512NOTDQ-NEXT: kmovd %k0, %eax ; AVX512NOTDQ-NEXT: movb %al, (%rsi) ; AVX512NOTDQ-NEXT: retq @@ -673,6 +685,8 @@ define void @load_v4i1_broadcast_2_v1i1_store(<4 x i1>* %a0,<1 x i1>* %a1) { ; AVX512: # %bb.0: ; AVX512-NEXT: kmovb (%rdi), %k0 ; AVX512-NEXT: kshiftrb $2, %k0, %k0 +; AVX512-NEXT: kshiftlb $7, %k0, %k0 +; AVX512-NEXT: kshiftrb $7, %k0, %k0 ; AVX512-NEXT: kmovb %k0, (%rsi) ; AVX512-NEXT: retq ; @@ -680,6 +694,8 @@ define void @load_v4i1_broadcast_2_v1i1_store(<4 x i1>* %a0,<1 x i1>* %a1) { ; AVX512NOTDQ: # %bb.0: ; AVX512NOTDQ-NEXT: kmovw (%rdi), %k0 ; AVX512NOTDQ-NEXT: kshiftrw $2, %k0, %k0 +; AVX512NOTDQ-NEXT: kshiftlw $15, %k0, %k0 +; AVX512NOTDQ-NEXT: kshiftrw $15, %k0, %k0 ; AVX512NOTDQ-NEXT: kmovd %k0, %eax ; AVX512NOTDQ-NEXT: movb %al, (%rsi) ; AVX512NOTDQ-NEXT: retq @@ -693,6 +709,8 @@ define void @load_v4i1_broadcast_3_v1i1_store(<4 x i1>* %a0,<1 x i1>* %a1) { ; AVX512: # %bb.0: ; AVX512-NEXT: kmovb (%rdi), %k0 ; AVX512-NEXT: kshiftrb $3, %k0, %k0 +; AVX512-NEXT: kshiftlb $7, %k0, %k0 +; AVX512-NEXT: kshiftrb $7, %k0, %k0 ; AVX512-NEXT: kmovb %k0, (%rsi) ; AVX512-NEXT: retq ; @@ -700,6 +718,8 @@ define void @load_v4i1_broadcast_3_v1i1_store(<4 x i1>* %a0,<1 x i1>* %a1) { ; AVX512NOTDQ: # %bb.0: ; AVX512NOTDQ-NEXT: kmovw (%rdi), %k0 ; AVX512NOTDQ-NEXT: kshiftrw $3, %k0, %k0 +; AVX512NOTDQ-NEXT: kshiftlw $15, %k0, %k0 +; AVX512NOTDQ-NEXT: kshiftrw $15, %k0, %k0 ; AVX512NOTDQ-NEXT: kmovd %k0, %eax ; AVX512NOTDQ-NEXT: movb %al, (%rsi) ; AVX512NOTDQ-NEXT: retq @@ -713,6 +733,8 @@ define void @load_v8i1_broadcast_4_v1i1_store(<8 x i1>* %a0,<1 x i1>* %a1) { ; AVX512: # %bb.0: ; AVX512-NEXT: kmovb (%rdi), %k0 ; AVX512-NEXT: kshiftrb $4, %k0, %k0 +; AVX512-NEXT: kshiftlb $7, %k0, %k0 +; AVX512-NEXT: kshiftrb $7, %k0, %k0 ; AVX512-NEXT: kmovb %k0, (%rsi) ; AVX512-NEXT: retq ; @@ -720,6 +742,8 @@ define void @load_v8i1_broadcast_4_v1i1_store(<8 x i1>* %a0,<1 x i1>* %a1) { ; AVX512NOTDQ: # %bb.0: ; AVX512NOTDQ-NEXT: kmovw (%rdi), %k0 ; AVX512NOTDQ-NEXT: kshiftrw $4, %k0, %k0 +; AVX512NOTDQ-NEXT: kshiftlw $15, %k0, %k0 +; AVX512NOTDQ-NEXT: kshiftrw $15, %k0, %k0 ; AVX512NOTDQ-NEXT: kmovd %k0, %eax ; AVX512NOTDQ-NEXT: movb %al, (%rsi) ; AVX512NOTDQ-NEXT: retq @@ -760,6 +784,8 @@ define void @load_v8i1_broadcast_7_v1i1_store(<8 x i1>* %a0,<1 x i1>* %a1) { ; AVX512: # %bb.0: ; AVX512-NEXT: kmovb (%rdi), %k0 ; AVX512-NEXT: kshiftrb $7, %k0, %k0 +; AVX512-NEXT: kshiftlb $7, %k0, %k0 +; AVX512-NEXT: kshiftrb $7, %k0, %k0 ; AVX512-NEXT: kmovb %k0, (%rsi) ; AVX512-NEXT: retq ; @@ -767,6 +793,8 @@ define void @load_v8i1_broadcast_7_v1i1_store(<8 x i1>* %a0,<1 x i1>* %a1) { ; AVX512NOTDQ: # %bb.0: ; AVX512NOTDQ-NEXT: kmovw (%rdi), %k0 ; AVX512NOTDQ-NEXT: kshiftrw $7, %k0, %k0 +; AVX512NOTDQ-NEXT: kshiftlw $15, %k0, %k0 +; AVX512NOTDQ-NEXT: kshiftrw $15, %k0, %k0 ; AVX512NOTDQ-NEXT: kmovd %k0, %eax ; AVX512NOTDQ-NEXT: movb %al, (%rsi) ; AVX512NOTDQ-NEXT: retq @@ -807,6 +835,8 @@ define void @load_v16i1_broadcast_8_v1i1_store(<16 x i1>* %a0,<1 x i1>* %a1) { ; AVX512: # %bb.0: ; AVX512-NEXT: kmovw (%rdi), %k0 ; AVX512-NEXT: kshiftrw $8, %k0, %k0 +; AVX512-NEXT: kshiftlb $7, %k0, %k0 +; AVX512-NEXT: kshiftrb $7, %k0, %k0 ; AVX512-NEXT: kmovb %k0, (%rsi) ; AVX512-NEXT: retq ; @@ -814,6 +844,8 @@ define void @load_v16i1_broadcast_8_v1i1_store(<16 x i1>* %a0,<1 x i1>* %a1) { ; AVX512NOTDQ: # %bb.0: ; AVX512NOTDQ-NEXT: kmovw (%rdi), %k0 ; AVX512NOTDQ-NEXT: kshiftrw $8, 
%k0, %k0 +; AVX512NOTDQ-NEXT: kshiftlw $15, %k0, %k0 +; AVX512NOTDQ-NEXT: kshiftrw $15, %k0, %k0 ; AVX512NOTDQ-NEXT: kmovd %k0, %eax ; AVX512NOTDQ-NEXT: movb %al, (%rsi) ; AVX512NOTDQ-NEXT: retq @@ -881,6 +913,8 @@ define void @load_v16i1_broadcast_15_v1i1_store(<16 x i1>* %a0,<1 x i1>* %a1) { ; AVX512: # %bb.0: ; AVX512-NEXT: kmovw (%rdi), %k0 ; AVX512-NEXT: kshiftrw $15, %k0, %k0 +; AVX512-NEXT: kshiftlb $7, %k0, %k0 +; AVX512-NEXT: kshiftrb $7, %k0, %k0 ; AVX512-NEXT: kmovb %k0, (%rsi) ; AVX512-NEXT: retq ; @@ -888,6 +922,8 @@ define void @load_v16i1_broadcast_15_v1i1_store(<16 x i1>* %a0,<1 x i1>* %a1) { ; AVX512NOTDQ: # %bb.0: ; AVX512NOTDQ-NEXT: kmovw (%rdi), %k0 ; AVX512NOTDQ-NEXT: kshiftrw $15, %k0, %k0 +; AVX512NOTDQ-NEXT: kshiftlw $15, %k0, %k0 +; AVX512NOTDQ-NEXT: kshiftrw $15, %k0, %k0 ; AVX512NOTDQ-NEXT: kmovd %k0, %eax ; AVX512NOTDQ-NEXT: movb %al, (%rsi) ; AVX512NOTDQ-NEXT: retq @@ -955,6 +991,8 @@ define void @load_v32i1_broadcast_16_v1i1_store(<32 x i1>* %a0,<1 x i1>* %a1) { ; AVX512: # %bb.0: ; AVX512-NEXT: kmovd (%rdi), %k0 ; AVX512-NEXT: kshiftrd $16, %k0, %k0 +; AVX512-NEXT: kshiftlb $7, %k0, %k0 +; AVX512-NEXT: kshiftrb $7, %k0, %k0 ; AVX512-NEXT: kmovb %k0, (%rsi) ; AVX512-NEXT: retq ; @@ -962,6 +1000,8 @@ define void @load_v32i1_broadcast_16_v1i1_store(<32 x i1>* %a0,<1 x i1>* %a1) { ; AVX512NOTDQ: # %bb.0: ; AVX512NOTDQ-NEXT: kmovd (%rdi), %k0 ; AVX512NOTDQ-NEXT: kshiftrd $16, %k0, %k0 +; AVX512NOTDQ-NEXT: kshiftlw $15, %k0, %k0 +; AVX512NOTDQ-NEXT: kshiftrw $15, %k0, %k0 ; AVX512NOTDQ-NEXT: kmovd %k0, %eax ; AVX512NOTDQ-NEXT: movb %al, (%rsi) ; AVX512NOTDQ-NEXT: retq @@ -1056,6 +1096,8 @@ define void @load_v32i1_broadcast_31_v1i1_store(<32 x i1>* %a0,<1 x i1>* %a1) { ; AVX512: # %bb.0: ; AVX512-NEXT: kmovd (%rdi), %k0 ; AVX512-NEXT: kshiftrd $31, %k0, %k0 +; AVX512-NEXT: kshiftlb $7, %k0, %k0 +; AVX512-NEXT: kshiftrb $7, %k0, %k0 ; AVX512-NEXT: kmovb %k0, (%rsi) ; AVX512-NEXT: retq ; @@ -1063,6 +1105,8 @@ define void @load_v32i1_broadcast_31_v1i1_store(<32 x i1>* %a0,<1 x i1>* %a1) { ; AVX512NOTDQ: # %bb.0: ; AVX512NOTDQ-NEXT: kmovd (%rdi), %k0 ; AVX512NOTDQ-NEXT: kshiftrd $31, %k0, %k0 +; AVX512NOTDQ-NEXT: kshiftlw $15, %k0, %k0 +; AVX512NOTDQ-NEXT: kshiftrw $15, %k0, %k0 ; AVX512NOTDQ-NEXT: kmovd %k0, %eax ; AVX512NOTDQ-NEXT: movb %al, (%rsi) ; AVX512NOTDQ-NEXT: retq @@ -1160,6 +1204,8 @@ define void @load_v64i1_broadcast_32_v1i1_store(<64 x i1>* %a0,<1 x i1>* %a1) { ; AVX512: # %bb.0: ; AVX512-NEXT: kmovq (%rdi), %k0 ; AVX512-NEXT: kshiftrq $32, %k0, %k0 +; AVX512-NEXT: kshiftlb $7, %k0, %k0 +; AVX512-NEXT: kshiftrb $7, %k0, %k0 ; AVX512-NEXT: kmovb %k0, (%rsi) ; AVX512-NEXT: retq ; @@ -1167,6 +1213,8 @@ define void @load_v64i1_broadcast_32_v1i1_store(<64 x i1>* %a0,<1 x i1>* %a1) { ; AVX512NOTDQ: # %bb.0: ; AVX512NOTDQ-NEXT: kmovq (%rdi), %k0 ; AVX512NOTDQ-NEXT: kshiftrq $32, %k0, %k0 +; AVX512NOTDQ-NEXT: kshiftlw $15, %k0, %k0 +; AVX512NOTDQ-NEXT: kshiftrw $15, %k0, %k0 ; AVX512NOTDQ-NEXT: kmovd %k0, %eax ; AVX512NOTDQ-NEXT: movb %al, (%rsi) ; AVX512NOTDQ-NEXT: retq @@ -1286,6 +1334,8 @@ define void @load_v64i1_broadcast_63_v1i1_store(<64 x i1>* %a0,<1 x i1>* %a1) { ; AVX512: # %bb.0: ; AVX512-NEXT: kmovq (%rdi), %k0 ; AVX512-NEXT: kshiftrq $63, %k0, %k0 +; AVX512-NEXT: kshiftlb $7, %k0, %k0 +; AVX512-NEXT: kshiftrb $7, %k0, %k0 ; AVX512-NEXT: kmovb %k0, (%rsi) ; AVX512-NEXT: retq ; @@ -1293,6 +1343,8 @@ define void @load_v64i1_broadcast_63_v1i1_store(<64 x i1>* %a0,<1 x i1>* %a1) { ; AVX512NOTDQ: # %bb.0: ; AVX512NOTDQ-NEXT: kmovq (%rdi), %k0 ; 
AVX512NOTDQ-NEXT: kshiftrq $63, %k0, %k0 +; AVX512NOTDQ-NEXT: kshiftlw $15, %k0, %k0 +; AVX512NOTDQ-NEXT: kshiftrw $15, %k0, %k0 ; AVX512NOTDQ-NEXT: kmovd %k0, %eax ; AVX512NOTDQ-NEXT: movb %al, (%rsi) ; AVX512NOTDQ-NEXT: retq diff --git a/llvm/test/CodeGen/X86/avx512-load-trunc-store-i1.ll b/llvm/test/CodeGen/X86/avx512-load-trunc-store-i1.ll index c3bcebe..95e2166 100644 --- a/llvm/test/CodeGen/X86/avx512-load-trunc-store-i1.ll +++ b/llvm/test/CodeGen/X86/avx512-load-trunc-store-i1.ll @@ -5,13 +5,18 @@ define void @load_v1i2_trunc_v1i1_store(<1 x i2>* %a0,<1 x i1>* %a1) { ; AVX512-ALL-LABEL: load_v1i2_trunc_v1i1_store: ; AVX512-ALL: # %bb.0: -; AVX512-ALL-NEXT: movb (%rdi), %al -; AVX512-ALL-NEXT: movb %al, (%rsi) +; AVX512-ALL-NEXT: kmovb (%rdi), %k0 +; AVX512-ALL-NEXT: kshiftlb $7, %k0, %k0 +; AVX512-ALL-NEXT: kshiftrb $7, %k0, %k0 +; AVX512-ALL-NEXT: kmovb %k0, (%rsi) ; AVX512-ALL-NEXT: retq ; ; AVX512-ONLY-LABEL: load_v1i2_trunc_v1i1_store: ; AVX512-ONLY: # %bb.0: ; AVX512-ONLY-NEXT: movb (%rdi), %al +; AVX512-ONLY-NEXT: andl $1, %eax +; AVX512-ONLY-NEXT: kmovw %eax, %k0 +; AVX512-ONLY-NEXT: kmovw %k0, %eax ; AVX512-ONLY-NEXT: movb %al, (%rsi) ; AVX512-ONLY-NEXT: retq %d0 = load <1 x i2>, <1 x i2>* %a0 @@ -22,13 +27,18 @@ define void @load_v1i2_trunc_v1i1_store(<1 x i2>* %a0,<1 x i1>* %a1) { define void @load_v1i3_trunc_v1i1_store(<1 x i3>* %a0,<1 x i1>* %a1) { ; AVX512-ALL-LABEL: load_v1i3_trunc_v1i1_store: ; AVX512-ALL: # %bb.0: -; AVX512-ALL-NEXT: movb (%rdi), %al -; AVX512-ALL-NEXT: movb %al, (%rsi) +; AVX512-ALL-NEXT: kmovb (%rdi), %k0 +; AVX512-ALL-NEXT: kshiftlb $7, %k0, %k0 +; AVX512-ALL-NEXT: kshiftrb $7, %k0, %k0 +; AVX512-ALL-NEXT: kmovb %k0, (%rsi) ; AVX512-ALL-NEXT: retq ; ; AVX512-ONLY-LABEL: load_v1i3_trunc_v1i1_store: ; AVX512-ONLY: # %bb.0: ; AVX512-ONLY-NEXT: movb (%rdi), %al +; AVX512-ONLY-NEXT: andl $1, %eax +; AVX512-ONLY-NEXT: kmovw %eax, %k0 +; AVX512-ONLY-NEXT: kmovw %k0, %eax ; AVX512-ONLY-NEXT: movb %al, (%rsi) ; AVX512-ONLY-NEXT: retq %d0 = load <1 x i3>, <1 x i3>* %a0 @@ -39,13 +49,18 @@ define void @load_v1i3_trunc_v1i1_store(<1 x i3>* %a0,<1 x i1>* %a1) { define void @load_v1i4_trunc_v1i1_store(<1 x i4>* %a0,<1 x i1>* %a1) { ; AVX512-ALL-LABEL: load_v1i4_trunc_v1i1_store: ; AVX512-ALL: # %bb.0: -; AVX512-ALL-NEXT: movb (%rdi), %al -; AVX512-ALL-NEXT: movb %al, (%rsi) +; AVX512-ALL-NEXT: kmovb (%rdi), %k0 +; AVX512-ALL-NEXT: kshiftlb $7, %k0, %k0 +; AVX512-ALL-NEXT: kshiftrb $7, %k0, %k0 +; AVX512-ALL-NEXT: kmovb %k0, (%rsi) ; AVX512-ALL-NEXT: retq ; ; AVX512-ONLY-LABEL: load_v1i4_trunc_v1i1_store: ; AVX512-ONLY: # %bb.0: ; AVX512-ONLY-NEXT: movb (%rdi), %al +; AVX512-ONLY-NEXT: andl $1, %eax +; AVX512-ONLY-NEXT: kmovw %eax, %k0 +; AVX512-ONLY-NEXT: kmovw %k0, %eax ; AVX512-ONLY-NEXT: movb %al, (%rsi) ; AVX512-ONLY-NEXT: retq %d0 = load <1 x i4>, <1 x i4>* %a0 @@ -56,13 +71,18 @@ define void @load_v1i4_trunc_v1i1_store(<1 x i4>* %a0,<1 x i1>* %a1) { define void @load_v1i8_trunc_v1i1_store(<1 x i8>* %a0,<1 x i1>* %a1) { ; AVX512-ALL-LABEL: load_v1i8_trunc_v1i1_store: ; AVX512-ALL: # %bb.0: -; AVX512-ALL-NEXT: movb (%rdi), %al -; AVX512-ALL-NEXT: movb %al, (%rsi) +; AVX512-ALL-NEXT: kmovb (%rdi), %k0 +; AVX512-ALL-NEXT: kshiftlb $7, %k0, %k0 +; AVX512-ALL-NEXT: kshiftrb $7, %k0, %k0 +; AVX512-ALL-NEXT: kmovb %k0, (%rsi) ; AVX512-ALL-NEXT: retq ; ; AVX512-ONLY-LABEL: load_v1i8_trunc_v1i1_store: ; AVX512-ONLY: # %bb.0: ; AVX512-ONLY-NEXT: movb (%rdi), %al +; AVX512-ONLY-NEXT: andl $1, %eax +; AVX512-ONLY-NEXT: kmovw %eax, %k0 +; AVX512-ONLY-NEXT: kmovw %k0, 
%eax ; AVX512-ONLY-NEXT: movb %al, (%rsi) ; AVX512-ONLY-NEXT: retq %d0 = load <1 x i8>, <1 x i8>* %a0 @@ -73,13 +93,18 @@ define void @load_v1i8_trunc_v1i1_store(<1 x i8>* %a0,<1 x i1>* %a1) { define void @load_v1i16_trunc_v1i1_store(<1 x i16>* %a0,<1 x i1>* %a1) { ; AVX512-ALL-LABEL: load_v1i16_trunc_v1i1_store: ; AVX512-ALL: # %bb.0: -; AVX512-ALL-NEXT: movb (%rdi), %al -; AVX512-ALL-NEXT: movb %al, (%rsi) +; AVX512-ALL-NEXT: kmovb (%rdi), %k0 +; AVX512-ALL-NEXT: kshiftlb $7, %k0, %k0 +; AVX512-ALL-NEXT: kshiftrb $7, %k0, %k0 +; AVX512-ALL-NEXT: kmovb %k0, (%rsi) ; AVX512-ALL-NEXT: retq ; ; AVX512-ONLY-LABEL: load_v1i16_trunc_v1i1_store: ; AVX512-ONLY: # %bb.0: ; AVX512-ONLY-NEXT: movb (%rdi), %al +; AVX512-ONLY-NEXT: andl $1, %eax +; AVX512-ONLY-NEXT: kmovw %eax, %k0 +; AVX512-ONLY-NEXT: kmovw %k0, %eax ; AVX512-ONLY-NEXT: movb %al, (%rsi) ; AVX512-ONLY-NEXT: retq %d0 = load <1 x i16>, <1 x i16>* %a0 @@ -90,13 +115,18 @@ define void @load_v1i16_trunc_v1i1_store(<1 x i16>* %a0,<1 x i1>* %a1) { define void @load_v1i32_trunc_v1i1_store(<1 x i32>* %a0,<1 x i1>* %a1) { ; AVX512-ALL-LABEL: load_v1i32_trunc_v1i1_store: ; AVX512-ALL: # %bb.0: -; AVX512-ALL-NEXT: movb (%rdi), %al -; AVX512-ALL-NEXT: movb %al, (%rsi) +; AVX512-ALL-NEXT: kmovb (%rdi), %k0 +; AVX512-ALL-NEXT: kshiftlb $7, %k0, %k0 +; AVX512-ALL-NEXT: kshiftrb $7, %k0, %k0 +; AVX512-ALL-NEXT: kmovb %k0, (%rsi) ; AVX512-ALL-NEXT: retq ; ; AVX512-ONLY-LABEL: load_v1i32_trunc_v1i1_store: ; AVX512-ONLY: # %bb.0: ; AVX512-ONLY-NEXT: movb (%rdi), %al +; AVX512-ONLY-NEXT: andl $1, %eax +; AVX512-ONLY-NEXT: kmovw %eax, %k0 +; AVX512-ONLY-NEXT: kmovw %k0, %eax ; AVX512-ONLY-NEXT: movb %al, (%rsi) ; AVX512-ONLY-NEXT: retq %d0 = load <1 x i32>, <1 x i32>* %a0 @@ -107,13 +137,18 @@ define void @load_v1i32_trunc_v1i1_store(<1 x i32>* %a0,<1 x i1>* %a1) { define void @load_v1i64_trunc_v1i1_store(<1 x i64>* %a0,<1 x i1>* %a1) { ; AVX512-ALL-LABEL: load_v1i64_trunc_v1i1_store: ; AVX512-ALL: # %bb.0: -; AVX512-ALL-NEXT: movb (%rdi), %al -; AVX512-ALL-NEXT: movb %al, (%rsi) +; AVX512-ALL-NEXT: kmovb (%rdi), %k0 +; AVX512-ALL-NEXT: kshiftlb $7, %k0, %k0 +; AVX512-ALL-NEXT: kshiftrb $7, %k0, %k0 +; AVX512-ALL-NEXT: kmovb %k0, (%rsi) ; AVX512-ALL-NEXT: retq ; ; AVX512-ONLY-LABEL: load_v1i64_trunc_v1i1_store: ; AVX512-ONLY: # %bb.0: ; AVX512-ONLY-NEXT: movb (%rdi), %al +; AVX512-ONLY-NEXT: andl $1, %eax +; AVX512-ONLY-NEXT: kmovw %eax, %k0 +; AVX512-ONLY-NEXT: kmovw %k0, %eax ; AVX512-ONLY-NEXT: movb %al, (%rsi) ; AVX512-ONLY-NEXT: retq %d0 = load <1 x i64>, <1 x i64>* %a0 diff --git a/llvm/test/CodeGen/X86/avx512-mask-op.ll b/llvm/test/CodeGen/X86/avx512-mask-op.ll index 8fa7fcc..5df6842 100644 --- a/llvm/test/CodeGen/X86/avx512-mask-op.ll +++ b/llvm/test/CodeGen/X86/avx512-mask-op.ll @@ -1455,6 +1455,8 @@ define void @test22(<4 x i1> %a, <4 x i1>* %addr) { ; KNL: ## %bb.0: ; KNL-NEXT: vpslld $31, %xmm0, %xmm0 ; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0 +; KNL-NEXT: kshiftlw $12, %k0, %k0 +; KNL-NEXT: kshiftrw $12, %k0, %k0 ; KNL-NEXT: kmovw %k0, %eax ; KNL-NEXT: movb %al, (%rdi) ; KNL-NEXT: vzeroupper @@ -1471,6 +1473,8 @@ define void @test22(<4 x i1> %a, <4 x i1>* %addr) { ; AVX512BW: ## %bb.0: ; AVX512BW-NEXT: vpslld $31, %xmm0, %xmm0 ; AVX512BW-NEXT: vptestmd %zmm0, %zmm0, %k0 +; AVX512BW-NEXT: kshiftlw $12, %k0, %k0 +; AVX512BW-NEXT: kshiftrw $12, %k0, %k0 ; AVX512BW-NEXT: kmovd %k0, %eax ; AVX512BW-NEXT: movb %al, (%rdi) ; AVX512BW-NEXT: vzeroupper @@ -1480,6 +1484,8 @@ define void @test22(<4 x i1> %a, <4 x i1>* %addr) { ; AVX512DQ: ## %bb.0: ; 
AVX512DQ-NEXT: vpslld $31, %xmm0, %xmm0 ; AVX512DQ-NEXT: vpmovd2m %zmm0, %k0 +; AVX512DQ-NEXT: kshiftlb $4, %k0, %k0 +; AVX512DQ-NEXT: kshiftrb $4, %k0, %k0 ; AVX512DQ-NEXT: kmovb %k0, (%rdi) ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq @@ -1500,6 +1506,8 @@ define void @test23(<2 x i1> %a, <2 x i1>* %addr) { ; KNL: ## %bb.0: ; KNL-NEXT: vpsllq $63, %xmm0, %xmm0 ; KNL-NEXT: vptestmq %zmm0, %zmm0, %k0 +; KNL-NEXT: kshiftlw $14, %k0, %k0 +; KNL-NEXT: kshiftrw $14, %k0, %k0 ; KNL-NEXT: kmovw %k0, %eax ; KNL-NEXT: movb %al, (%rdi) ; KNL-NEXT: vzeroupper @@ -1516,6 +1524,8 @@ define void @test23(<2 x i1> %a, <2 x i1>* %addr) { ; AVX512BW: ## %bb.0: ; AVX512BW-NEXT: vpsllq $63, %xmm0, %xmm0 ; AVX512BW-NEXT: vptestmq %zmm0, %zmm0, %k0 +; AVX512BW-NEXT: kshiftlw $14, %k0, %k0 +; AVX512BW-NEXT: kshiftrw $14, %k0, %k0 ; AVX512BW-NEXT: kmovd %k0, %eax ; AVX512BW-NEXT: movb %al, (%rdi) ; AVX512BW-NEXT: vzeroupper @@ -1525,6 +1535,8 @@ define void @test23(<2 x i1> %a, <2 x i1>* %addr) { ; AVX512DQ: ## %bb.0: ; AVX512DQ-NEXT: vpsllq $63, %xmm0, %xmm0 ; AVX512DQ-NEXT: vpmovq2m %zmm0, %k0 +; AVX512DQ-NEXT: kshiftlb $6, %k0, %k0 +; AVX512DQ-NEXT: kshiftrb $6, %k0, %k0 ; AVX512DQ-NEXT: kmovb %k0, (%rdi) ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq @@ -1545,6 +1557,8 @@ define void @store_v1i1(<1 x i1> %c , <1 x i1>* %ptr) { ; KNL: ## %bb.0: ; KNL-NEXT: kmovw %edi, %k0 ; KNL-NEXT: knotw %k0, %k0 +; KNL-NEXT: kshiftlw $15, %k0, %k0 +; KNL-NEXT: kshiftrw $15, %k0, %k0 ; KNL-NEXT: kmovw %k0, %eax ; KNL-NEXT: movb %al, (%rsi) ; KNL-NEXT: retq @@ -1553,6 +1567,8 @@ define void @store_v1i1(<1 x i1> %c , <1 x i1>* %ptr) { ; SKX: ## %bb.0: ; SKX-NEXT: kmovd %edi, %k0 ; SKX-NEXT: knotw %k0, %k0 +; SKX-NEXT: kshiftlb $7, %k0, %k0 +; SKX-NEXT: kshiftrb $7, %k0, %k0 ; SKX-NEXT: kmovb %k0, (%rsi) ; SKX-NEXT: retq ; @@ -1560,6 +1576,8 @@ define void @store_v1i1(<1 x i1> %c , <1 x i1>* %ptr) { ; AVX512BW: ## %bb.0: ; AVX512BW-NEXT: kmovd %edi, %k0 ; AVX512BW-NEXT: knotw %k0, %k0 +; AVX512BW-NEXT: kshiftlw $15, %k0, %k0 +; AVX512BW-NEXT: kshiftrw $15, %k0, %k0 ; AVX512BW-NEXT: kmovd %k0, %eax ; AVX512BW-NEXT: movb %al, (%rsi) ; AVX512BW-NEXT: retq @@ -1568,6 +1586,8 @@ define void @store_v1i1(<1 x i1> %c , <1 x i1>* %ptr) { ; AVX512DQ: ## %bb.0: ; AVX512DQ-NEXT: kmovw %edi, %k0 ; AVX512DQ-NEXT: knotw %k0, %k0 +; AVX512DQ-NEXT: kshiftlb $7, %k0, %k0 +; AVX512DQ-NEXT: kshiftrb $7, %k0, %k0 ; AVX512DQ-NEXT: kmovb %k0, (%rsi) ; AVX512DQ-NEXT: retq ; @@ -1576,6 +1596,8 @@ define void @store_v1i1(<1 x i1> %c , <1 x i1>* %ptr) { ; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k0 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: knotw %k0, %k0 +; X86-NEXT: kshiftlb $7, %k0, %k0 +; X86-NEXT: kshiftrb $7, %k0, %k0 ; X86-NEXT: kmovb %k0, (%eax) ; X86-NEXT: retl %x = xor <1 x i1> %c, @@ -1588,6 +1610,8 @@ define void @store_v2i1(<2 x i1> %c , <2 x i1>* %ptr) { ; KNL: ## %bb.0: ; KNL-NEXT: vpsllq $63, %xmm0, %xmm0 ; KNL-NEXT: vptestnmq %zmm0, %zmm0, %k0 +; KNL-NEXT: kshiftlw $14, %k0, %k0 +; KNL-NEXT: kshiftrw $14, %k0, %k0 ; KNL-NEXT: kmovw %k0, %eax ; KNL-NEXT: movb %al, (%rdi) ; KNL-NEXT: vzeroupper @@ -1598,6 +1622,8 @@ define void @store_v2i1(<2 x i1> %c , <2 x i1>* %ptr) { ; SKX-NEXT: vpsllq $63, %xmm0, %xmm0 ; SKX-NEXT: vpmovq2m %xmm0, %k0 ; SKX-NEXT: knotw %k0, %k0 +; SKX-NEXT: kshiftlb $6, %k0, %k0 +; SKX-NEXT: kshiftrb $6, %k0, %k0 ; SKX-NEXT: kmovb %k0, (%rdi) ; SKX-NEXT: retq ; @@ -1605,6 +1631,8 @@ define void @store_v2i1(<2 x i1> %c , <2 x i1>* %ptr) { ; AVX512BW: ## %bb.0: ; AVX512BW-NEXT: vpsllq $63, %xmm0, %xmm0 ; 
AVX512BW-NEXT: vptestnmq %zmm0, %zmm0, %k0 +; AVX512BW-NEXT: kshiftlw $14, %k0, %k0 +; AVX512BW-NEXT: kshiftrw $14, %k0, %k0 ; AVX512BW-NEXT: kmovd %k0, %eax ; AVX512BW-NEXT: movb %al, (%rdi) ; AVX512BW-NEXT: vzeroupper @@ -1615,6 +1643,8 @@ define void @store_v2i1(<2 x i1> %c , <2 x i1>* %ptr) { ; AVX512DQ-NEXT: vpsllq $63, %xmm0, %xmm0 ; AVX512DQ-NEXT: vpmovq2m %zmm0, %k0 ; AVX512DQ-NEXT: knotw %k0, %k0 +; AVX512DQ-NEXT: kshiftlb $6, %k0, %k0 +; AVX512DQ-NEXT: kshiftrb $6, %k0, %k0 ; AVX512DQ-NEXT: kmovb %k0, (%rdi) ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq @@ -1625,6 +1655,8 @@ define void @store_v2i1(<2 x i1> %c , <2 x i1>* %ptr) { ; X86-NEXT: vpmovq2m %xmm0, %k0 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: knotw %k0, %k0 +; X86-NEXT: kshiftlb $6, %k0, %k0 +; X86-NEXT: kshiftrb $6, %k0, %k0 ; X86-NEXT: kmovb %k0, (%eax) ; X86-NEXT: retl %x = xor <2 x i1> %c, @@ -1637,6 +1669,8 @@ define void @store_v4i1(<4 x i1> %c , <4 x i1>* %ptr) { ; KNL: ## %bb.0: ; KNL-NEXT: vpslld $31, %xmm0, %xmm0 ; KNL-NEXT: vptestnmd %zmm0, %zmm0, %k0 +; KNL-NEXT: kshiftlw $12, %k0, %k0 +; KNL-NEXT: kshiftrw $12, %k0, %k0 ; KNL-NEXT: kmovw %k0, %eax ; KNL-NEXT: movb %al, (%rdi) ; KNL-NEXT: vzeroupper @@ -1647,6 +1681,8 @@ define void @store_v4i1(<4 x i1> %c , <4 x i1>* %ptr) { ; SKX-NEXT: vpslld $31, %xmm0, %xmm0 ; SKX-NEXT: vpmovd2m %xmm0, %k0 ; SKX-NEXT: knotw %k0, %k0 +; SKX-NEXT: kshiftlb $4, %k0, %k0 +; SKX-NEXT: kshiftrb $4, %k0, %k0 ; SKX-NEXT: kmovb %k0, (%rdi) ; SKX-NEXT: retq ; @@ -1654,6 +1690,8 @@ define void @store_v4i1(<4 x i1> %c , <4 x i1>* %ptr) { ; AVX512BW: ## %bb.0: ; AVX512BW-NEXT: vpslld $31, %xmm0, %xmm0 ; AVX512BW-NEXT: vptestnmd %zmm0, %zmm0, %k0 +; AVX512BW-NEXT: kshiftlw $12, %k0, %k0 +; AVX512BW-NEXT: kshiftrw $12, %k0, %k0 ; AVX512BW-NEXT: kmovd %k0, %eax ; AVX512BW-NEXT: movb %al, (%rdi) ; AVX512BW-NEXT: vzeroupper @@ -1664,6 +1702,8 @@ define void @store_v4i1(<4 x i1> %c , <4 x i1>* %ptr) { ; AVX512DQ-NEXT: vpslld $31, %xmm0, %xmm0 ; AVX512DQ-NEXT: vpmovd2m %zmm0, %k0 ; AVX512DQ-NEXT: knotw %k0, %k0 +; AVX512DQ-NEXT: kshiftlb $4, %k0, %k0 +; AVX512DQ-NEXT: kshiftrb $4, %k0, %k0 ; AVX512DQ-NEXT: kmovb %k0, (%rdi) ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq @@ -1674,6 +1714,8 @@ define void @store_v4i1(<4 x i1> %c , <4 x i1>* %ptr) { ; X86-NEXT: vpmovd2m %xmm0, %k0 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: knotw %k0, %k0 +; X86-NEXT: kshiftlb $4, %k0, %k0 +; X86-NEXT: kshiftrb $4, %k0, %k0 ; X86-NEXT: kmovb %k0, (%eax) ; X86-NEXT: retl %x = xor <4 x i1> %c, @@ -5206,6 +5248,8 @@ define i1 @test_v1i1_add(i1 %x, i1 %y) { ; KNL-NEXT: kmovw %edi, %k0 ; KNL-NEXT: kmovw %esi, %k1 ; KNL-NEXT: kxorw %k1, %k0, %k0 +; KNL-NEXT: kshiftlw $15, %k0, %k0 +; KNL-NEXT: kshiftrw $15, %k0, %k0 ; KNL-NEXT: kmovw %k0, %eax ; KNL-NEXT: movb %al, -{{[0-9]+}}(%rsp) ; KNL-NEXT: movb -{{[0-9]+}}(%rsp), %al @@ -5220,6 +5264,8 @@ define i1 @test_v1i1_add(i1 %x, i1 %y) { ; SKX-NEXT: kmovb -{{[0-9]+}}(%rsp), %k0 ; SKX-NEXT: kmovb -{{[0-9]+}}(%rsp), %k1 ; SKX-NEXT: kxorw %k1, %k0, %k0 +; SKX-NEXT: kshiftlb $7, %k0, %k0 +; SKX-NEXT: kshiftrb $7, %k0, %k0 ; SKX-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp) ; SKX-NEXT: movb -{{[0-9]+}}(%rsp), %al ; SKX-NEXT: retq @@ -5229,6 +5275,8 @@ define i1 @test_v1i1_add(i1 %x, i1 %y) { ; AVX512BW-NEXT: kmovd %edi, %k0 ; AVX512BW-NEXT: kmovd %esi, %k1 ; AVX512BW-NEXT: kxorw %k1, %k0, %k0 +; AVX512BW-NEXT: kshiftlw $15, %k0, %k0 +; AVX512BW-NEXT: kshiftrw $15, %k0, %k0 ; AVX512BW-NEXT: kmovd %k0, %eax ; AVX512BW-NEXT: movb %al, -{{[0-9]+}}(%rsp) ; 
AVX512BW-NEXT: movb -{{[0-9]+}}(%rsp), %al @@ -5243,6 +5291,8 @@ define i1 @test_v1i1_add(i1 %x, i1 %y) { ; AVX512DQ-NEXT: kmovb -{{[0-9]+}}(%rsp), %k0 ; AVX512DQ-NEXT: kmovb -{{[0-9]+}}(%rsp), %k1 ; AVX512DQ-NEXT: kxorw %k1, %k0, %k0 +; AVX512DQ-NEXT: kshiftlb $7, %k0, %k0 +; AVX512DQ-NEXT: kshiftrb $7, %k0, %k0 ; AVX512DQ-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp) ; AVX512DQ-NEXT: movb -{{[0-9]+}}(%rsp), %al ; AVX512DQ-NEXT: retq @@ -5260,6 +5310,8 @@ define i1 @test_v1i1_add(i1 %x, i1 %y) { ; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k0 ; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 ; X86-NEXT: kxorw %k1, %k0, %k0 +; X86-NEXT: kshiftlb $7, %k0, %k0 +; X86-NEXT: kshiftrb $7, %k0, %k0 ; X86-NEXT: kmovb %k0, {{[0-9]+}}(%esp) ; X86-NEXT: movb {{[0-9]+}}(%esp), %al ; X86-NEXT: popl %ecx @@ -5277,6 +5329,8 @@ define i1 @test_v1i1_sub(i1 %x, i1 %y) { ; KNL-NEXT: kmovw %edi, %k0 ; KNL-NEXT: kmovw %esi, %k1 ; KNL-NEXT: kxorw %k1, %k0, %k0 +; KNL-NEXT: kshiftlw $15, %k0, %k0 +; KNL-NEXT: kshiftrw $15, %k0, %k0 ; KNL-NEXT: kmovw %k0, %eax ; KNL-NEXT: movb %al, -{{[0-9]+}}(%rsp) ; KNL-NEXT: movb -{{[0-9]+}}(%rsp), %al @@ -5291,6 +5345,8 @@ define i1 @test_v1i1_sub(i1 %x, i1 %y) { ; SKX-NEXT: kmovb -{{[0-9]+}}(%rsp), %k0 ; SKX-NEXT: kmovb -{{[0-9]+}}(%rsp), %k1 ; SKX-NEXT: kxorw %k1, %k0, %k0 +; SKX-NEXT: kshiftlb $7, %k0, %k0 +; SKX-NEXT: kshiftrb $7, %k0, %k0 ; SKX-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp) ; SKX-NEXT: movb -{{[0-9]+}}(%rsp), %al ; SKX-NEXT: retq @@ -5300,6 +5356,8 @@ define i1 @test_v1i1_sub(i1 %x, i1 %y) { ; AVX512BW-NEXT: kmovd %edi, %k0 ; AVX512BW-NEXT: kmovd %esi, %k1 ; AVX512BW-NEXT: kxorw %k1, %k0, %k0 +; AVX512BW-NEXT: kshiftlw $15, %k0, %k0 +; AVX512BW-NEXT: kshiftrw $15, %k0, %k0 ; AVX512BW-NEXT: kmovd %k0, %eax ; AVX512BW-NEXT: movb %al, -{{[0-9]+}}(%rsp) ; AVX512BW-NEXT: movb -{{[0-9]+}}(%rsp), %al @@ -5314,6 +5372,8 @@ define i1 @test_v1i1_sub(i1 %x, i1 %y) { ; AVX512DQ-NEXT: kmovb -{{[0-9]+}}(%rsp), %k0 ; AVX512DQ-NEXT: kmovb -{{[0-9]+}}(%rsp), %k1 ; AVX512DQ-NEXT: kxorw %k1, %k0, %k0 +; AVX512DQ-NEXT: kshiftlb $7, %k0, %k0 +; AVX512DQ-NEXT: kshiftrb $7, %k0, %k0 ; AVX512DQ-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp) ; AVX512DQ-NEXT: movb -{{[0-9]+}}(%rsp), %al ; AVX512DQ-NEXT: retq @@ -5331,6 +5391,8 @@ define i1 @test_v1i1_sub(i1 %x, i1 %y) { ; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k0 ; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 ; X86-NEXT: kxorw %k1, %k0, %k0 +; X86-NEXT: kshiftlb $7, %k0, %k0 +; X86-NEXT: kshiftrb $7, %k0, %k0 ; X86-NEXT: kmovb %k0, {{[0-9]+}}(%esp) ; X86-NEXT: movb {{[0-9]+}}(%esp), %al ; X86-NEXT: popl %ecx @@ -5348,6 +5410,8 @@ define i1 @test_v1i1_mul(i1 %x, i1 %y) { ; KNL-NEXT: kmovw %edi, %k0 ; KNL-NEXT: kmovw %esi, %k1 ; KNL-NEXT: kandw %k1, %k0, %k0 +; KNL-NEXT: kshiftlw $15, %k0, %k0 +; KNL-NEXT: kshiftrw $15, %k0, %k0 ; KNL-NEXT: kmovw %k0, %eax ; KNL-NEXT: movb %al, -{{[0-9]+}}(%rsp) ; KNL-NEXT: movb -{{[0-9]+}}(%rsp), %al @@ -5362,6 +5426,8 @@ define i1 @test_v1i1_mul(i1 %x, i1 %y) { ; SKX-NEXT: kmovb -{{[0-9]+}}(%rsp), %k0 ; SKX-NEXT: kmovb -{{[0-9]+}}(%rsp), %k1 ; SKX-NEXT: kandw %k1, %k0, %k0 +; SKX-NEXT: kshiftlb $7, %k0, %k0 +; SKX-NEXT: kshiftrb $7, %k0, %k0 ; SKX-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp) ; SKX-NEXT: movb -{{[0-9]+}}(%rsp), %al ; SKX-NEXT: retq @@ -5371,6 +5437,8 @@ define i1 @test_v1i1_mul(i1 %x, i1 %y) { ; AVX512BW-NEXT: kmovd %edi, %k0 ; AVX512BW-NEXT: kmovd %esi, %k1 ; AVX512BW-NEXT: kandw %k1, %k0, %k0 +; AVX512BW-NEXT: kshiftlw $15, %k0, %k0 +; AVX512BW-NEXT: kshiftrw $15, %k0, %k0 ; AVX512BW-NEXT: kmovd %k0, %eax ; AVX512BW-NEXT: movb %al, -{{[0-9]+}}(%rsp) 
; AVX512BW-NEXT: movb -{{[0-9]+}}(%rsp), %al @@ -5385,6 +5453,8 @@ define i1 @test_v1i1_mul(i1 %x, i1 %y) { ; AVX512DQ-NEXT: kmovb -{{[0-9]+}}(%rsp), %k0 ; AVX512DQ-NEXT: kmovb -{{[0-9]+}}(%rsp), %k1 ; AVX512DQ-NEXT: kandw %k1, %k0, %k0 +; AVX512DQ-NEXT: kshiftlb $7, %k0, %k0 +; AVX512DQ-NEXT: kshiftrb $7, %k0, %k0 ; AVX512DQ-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp) ; AVX512DQ-NEXT: movb -{{[0-9]+}}(%rsp), %al ; AVX512DQ-NEXT: retq @@ -5402,6 +5472,8 @@ define i1 @test_v1i1_mul(i1 %x, i1 %y) { ; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k0 ; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 ; X86-NEXT: kandw %k1, %k0, %k0 +; X86-NEXT: kshiftlb $7, %k0, %k0 +; X86-NEXT: kshiftrb $7, %k0, %k0 ; X86-NEXT: kmovb %k0, {{[0-9]+}}(%esp) ; X86-NEXT: movb {{[0-9]+}}(%esp), %al ; X86-NEXT: popl %ecx diff --git a/llvm/test/CodeGen/X86/avx512-select.ll b/llvm/test/CodeGen/X86/avx512-select.ll index a60f6ee..8eadc36 100644 --- a/llvm/test/CodeGen/X86/avx512-select.ll +++ b/llvm/test/CodeGen/X86/avx512-select.ll @@ -552,6 +552,8 @@ define void @vselect_v1i1(<1 x i1>* %w, <1 x i1>* %x, <1 x i1>* %y) nounwind { ; X86-AVX512F-NEXT: kandnw %k1, %k2, %k1 ; X86-AVX512F-NEXT: kandw %k2, %k0, %k0 ; X86-AVX512F-NEXT: korw %k1, %k0, %k0 +; X86-AVX512F-NEXT: kshiftlw $15, %k0, %k0 +; X86-AVX512F-NEXT: kshiftrw $15, %k0, %k0 ; X86-AVX512F-NEXT: kmovw %k0, %eax ; X86-AVX512F-NEXT: movb %al, (%edx) ; X86-AVX512F-NEXT: popl %esi @@ -568,6 +570,8 @@ define void @vselect_v1i1(<1 x i1>* %w, <1 x i1>* %x, <1 x i1>* %y) nounwind { ; X64-AVX512F-NEXT: kandnw %k1, %k2, %k1 ; X64-AVX512F-NEXT: kandw %k2, %k0, %k0 ; X64-AVX512F-NEXT: korw %k1, %k0, %k0 +; X64-AVX512F-NEXT: kshiftlw $15, %k0, %k0 +; X64-AVX512F-NEXT: kshiftrw $15, %k0, %k0 ; X64-AVX512F-NEXT: kmovw %k0, %eax ; X64-AVX512F-NEXT: movb %al, (%rsi) ; X64-AVX512F-NEXT: retq @@ -587,6 +591,8 @@ define void @vselect_v1i1(<1 x i1>* %w, <1 x i1>* %x, <1 x i1>* %y) nounwind { ; X86-AVX512BW-NEXT: kandnw %k1, %k2, %k1 ; X86-AVX512BW-NEXT: kandw %k2, %k0, %k0 ; X86-AVX512BW-NEXT: korw %k1, %k0, %k0 +; X86-AVX512BW-NEXT: kshiftlw $15, %k0, %k0 +; X86-AVX512BW-NEXT: kshiftrw $15, %k0, %k0 ; X86-AVX512BW-NEXT: kmovd %k0, %eax ; X86-AVX512BW-NEXT: movb %al, (%edx) ; X86-AVX512BW-NEXT: popl %esi @@ -603,6 +609,8 @@ define void @vselect_v1i1(<1 x i1>* %w, <1 x i1>* %x, <1 x i1>* %y) nounwind { ; X64-AVX512BW-NEXT: kandnw %k1, %k2, %k1 ; X64-AVX512BW-NEXT: kandw %k2, %k0, %k0 ; X64-AVX512BW-NEXT: korw %k1, %k0, %k0 +; X64-AVX512BW-NEXT: kshiftlw $15, %k0, %k0 +; X64-AVX512BW-NEXT: kshiftrw $15, %k0, %k0 ; X64-AVX512BW-NEXT: kmovd %k0, %eax ; X64-AVX512BW-NEXT: movb %al, (%rsi) ; X64-AVX512BW-NEXT: retq @@ -634,6 +642,8 @@ define void @select_v1i1(<1 x i1>* %w, <1 x i1>* %x, <1 x i1>* %y, i1 %z) nounwi ; X86-AVX512F-NEXT: movzbl (%eax), %ecx ; X86-AVX512F-NEXT: kmovw %ecx, %k0 ; X86-AVX512F-NEXT: .LBB18_3: +; X86-AVX512F-NEXT: kshiftlw $15, %k0, %k0 +; X86-AVX512F-NEXT: kshiftrw $15, %k0, %k0 ; X86-AVX512F-NEXT: kmovw %k0, %ecx ; X86-AVX512F-NEXT: movb %cl, (%eax) ; X86-AVX512F-NEXT: retl @@ -653,6 +663,8 @@ define void @select_v1i1(<1 x i1>* %w, <1 x i1>* %x, <1 x i1>* %y, i1 %z) nounwi ; X64-AVX512F-NEXT: movzbl (%rsi), %eax ; X64-AVX512F-NEXT: kmovw %eax, %k0 ; X64-AVX512F-NEXT: .LBB18_3: +; X64-AVX512F-NEXT: kshiftlw $15, %k0, %k0 +; X64-AVX512F-NEXT: kshiftrw $15, %k0, %k0 ; X64-AVX512F-NEXT: kmovw %k0, %eax ; X64-AVX512F-NEXT: movb %al, (%rsi) ; X64-AVX512F-NEXT: retq @@ -675,6 +687,8 @@ define void @select_v1i1(<1 x i1>* %w, <1 x i1>* %x, <1 x i1>* %y, i1 %z) nounwi ; X86-AVX512BW-NEXT: movzbl 
(%eax), %ecx ; X86-AVX512BW-NEXT: kmovd %ecx, %k0 ; X86-AVX512BW-NEXT: .LBB18_3: +; X86-AVX512BW-NEXT: kshiftlw $15, %k0, %k0 +; X86-AVX512BW-NEXT: kshiftrw $15, %k0, %k0 ; X86-AVX512BW-NEXT: kmovd %k0, %ecx ; X86-AVX512BW-NEXT: movb %cl, (%eax) ; X86-AVX512BW-NEXT: retl @@ -694,6 +708,8 @@ define void @select_v1i1(<1 x i1>* %w, <1 x i1>* %x, <1 x i1>* %y, i1 %z) nounwi ; X64-AVX512BW-NEXT: movzbl (%rsi), %eax ; X64-AVX512BW-NEXT: kmovd %eax, %k0 ; X64-AVX512BW-NEXT: .LBB18_3: +; X64-AVX512BW-NEXT: kshiftlw $15, %k0, %k0 +; X64-AVX512BW-NEXT: kshiftrw $15, %k0, %k0 ; X64-AVX512BW-NEXT: kmovd %k0, %eax ; X64-AVX512BW-NEXT: movb %al, (%rsi) ; X64-AVX512BW-NEXT: retq diff --git a/llvm/test/CodeGen/X86/stack-folding-avx512vp2intersect.ll b/llvm/test/CodeGen/X86/stack-folding-avx512vp2intersect.ll index 839dd1c..f921d10 100644 --- a/llvm/test/CodeGen/X86/stack-folding-avx512vp2intersect.ll +++ b/llvm/test/CodeGen/X86/stack-folding-avx512vp2intersect.ll @@ -89,9 +89,13 @@ define void @stack_fold_vp2intersectq_256(<4 x i64>* %a, <4 x i64> %b, <4 x i1>* ; CHECK-NEXT: #NO_APP ; CHECK-NEXT: vmovaps (%rdi), %ymm0 ; CHECK-NEXT: vp2intersectq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %k0 # 32-byte Folded Reload -; CHECK-NEXT: kmovw %k1, %eax -; CHECK-NEXT: kmovw %k0, %ecx -; CHECK-NEXT: movb %cl, (%rsi) +; CHECK-NEXT: kshiftlw $12, %k0, %k2 +; CHECK-NEXT: kshiftrw $12, %k2, %k2 +; CHECK-NEXT: kmovw %k2, %eax +; CHECK-NEXT: movb %al, (%rsi) +; CHECK-NEXT: kshiftlw $12, %k1, %k0 +; CHECK-NEXT: kshiftrw $12, %k0, %k0 +; CHECK-NEXT: kmovw %k0, %eax ; CHECK-NEXT: movb %al, (%rdx) ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq @@ -115,9 +119,13 @@ define void @stack_fold_vp2intersectd_128(<4 x i32>* %a, <4 x i32> %b, <4 x i1>* ; CHECK-NEXT: #NO_APP ; CHECK-NEXT: vmovaps (%rdi), %xmm0 ; CHECK-NEXT: vp2intersectd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %k0 # 16-byte Folded Reload -; CHECK-NEXT: kmovw %k1, %eax -; CHECK-NEXT: kmovw %k0, %ecx -; CHECK-NEXT: movb %cl, (%rsi) +; CHECK-NEXT: kshiftlw $12, %k0, %k2 +; CHECK-NEXT: kshiftrw $12, %k2, %k2 +; CHECK-NEXT: kmovw %k2, %eax +; CHECK-NEXT: movb %al, (%rsi) +; CHECK-NEXT: kshiftlw $12, %k1, %k0 +; CHECK-NEXT: kshiftrw $12, %k0, %k0 +; CHECK-NEXT: kmovw %k0, %eax ; CHECK-NEXT: movb %al, (%rdx) ; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() @@ -140,9 +148,13 @@ define void @stack_fold_vp2intersectq_128(<2 x i64>* %a, <2 x i64> %b, <2 x i1>* ; CHECK-NEXT: #NO_APP ; CHECK-NEXT: vmovaps (%rdi), %xmm0 ; CHECK-NEXT: vp2intersectq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %k0 # 16-byte Folded Reload -; CHECK-NEXT: kmovw %k1, %eax -; CHECK-NEXT: kmovw %k0, %ecx -; CHECK-NEXT: movb %cl, (%rsi) +; CHECK-NEXT: kshiftlw $14, %k0, %k2 +; CHECK-NEXT: kshiftrw $14, %k2, %k2 +; CHECK-NEXT: kmovw %k2, %eax +; CHECK-NEXT: movb %al, (%rsi) +; CHECK-NEXT: kshiftlw $14, %k1, %k0 +; CHECK-NEXT: kshiftrw $14, %k0, %k0 +; CHECK-NEXT: kmovw %k0, %eax ; CHECK-NEXT: movb %al, (%rdx) ; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", 
"=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() diff --git a/llvm/test/CodeGen/X86/vec_saddo.ll b/llvm/test/CodeGen/X86/vec_saddo.ll index 8197a35..8a2ed6c 100644 --- a/llvm/test/CodeGen/X86/vec_saddo.ll +++ b/llvm/test/CodeGen/X86/vec_saddo.ll @@ -1131,6 +1131,8 @@ define <4 x i32> @saddo_v4i1(<4 x i1> %a0, <4 x i1> %a1, <4 x i1>* %p2) nounwind ; AVX512-NEXT: kxorw %k2, %k0, %k1 ; AVX512-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 ; AVX512-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z} +; AVX512-NEXT: kshiftlw $12, %k0, %k0 +; AVX512-NEXT: kshiftrw $12, %k0, %k0 ; AVX512-NEXT: kmovd %k0, %eax ; AVX512-NEXT: movb %al, (%rdi) ; AVX512-NEXT: retq diff --git a/llvm/test/CodeGen/X86/vec_smulo.ll b/llvm/test/CodeGen/X86/vec_smulo.ll index 1b5aef6..746f9e5 100644 --- a/llvm/test/CodeGen/X86/vec_smulo.ll +++ b/llvm/test/CodeGen/X86/vec_smulo.ll @@ -3559,64 +3559,65 @@ define <4 x i32> @smulo_v4i1(<4 x i1> %a0, <4 x i1> %a1, <4 x i1>* %p2) nounwind ; ; AVX512-LABEL: smulo_v4i1: ; AVX512: # %bb.0: +; AVX512-NEXT: pushq %rbp ; AVX512-NEXT: pushq %rbx ; AVX512-NEXT: vpslld $31, %xmm1, %xmm1 ; AVX512-NEXT: vptestmd %xmm1, %xmm1, %k0 ; AVX512-NEXT: kshiftrw $3, %k0, %k1 -; AVX512-NEXT: kmovd %k1, %r9d -; AVX512-NEXT: andb $1, %r9b -; AVX512-NEXT: negb %r9b +; AVX512-NEXT: kmovd %k1, %r10d +; AVX512-NEXT: andb $1, %r10b +; AVX512-NEXT: negb %r10b ; AVX512-NEXT: vpslld $31, %xmm0, %xmm0 ; AVX512-NEXT: vptestmd %xmm0, %xmm0, %k1 ; AVX512-NEXT: kshiftrw $3, %k1, %k2 -; AVX512-NEXT: kmovd %k2, %r10d -; AVX512-NEXT: andb $1, %r10b -; AVX512-NEXT: negb %r10b +; AVX512-NEXT: kmovd %k2, %r9d +; AVX512-NEXT: andb $1, %r9b +; AVX512-NEXT: negb %r9b ; AVX512-NEXT: kshiftrw $2, %k1, %k2 -; AVX512-NEXT: kmovd %k2, %r11d -; AVX512-NEXT: andb $1, %r11b -; AVX512-NEXT: negb %r11b +; AVX512-NEXT: kmovd %k2, %ebp +; AVX512-NEXT: andb $1, %bpl +; AVX512-NEXT: negb %bpl ; AVX512-NEXT: kshiftrw $2, %k0, %k2 -; AVX512-NEXT: kmovd %k2, %ebx -; AVX512-NEXT: andb $1, %bl -; AVX512-NEXT: negb %bl +; AVX512-NEXT: kmovd %k2, %edx +; AVX512-NEXT: andb $1, %dl +; AVX512-NEXT: negb %dl ; AVX512-NEXT: kshiftrw $1, %k0, %k2 ; AVX512-NEXT: kmovd %k2, %esi ; AVX512-NEXT: andb $1, %sil ; AVX512-NEXT: negb %sil ; AVX512-NEXT: kshiftrw $1, %k1, %k2 -; AVX512-NEXT: kmovd %k2, %edx -; AVX512-NEXT: andb $1, %dl -; AVX512-NEXT: negb %dl +; AVX512-NEXT: kmovd %k2, %ecx +; AVX512-NEXT: andb $1, %cl +; AVX512-NEXT: negb %cl ; AVX512-NEXT: kmovd %k1, %eax ; AVX512-NEXT: andb $1, %al ; AVX512-NEXT: negb %al -; AVX512-NEXT: kmovd %k0, %ecx -; AVX512-NEXT: andb $1, %cl -; AVX512-NEXT: negb %cl +; AVX512-NEXT: kmovd %k0, %ebx +; AVX512-NEXT: andb $1, %bl +; AVX512-NEXT: negb %bl ; AVX512-NEXT: # kill: def $al killed $al killed $eax -; AVX512-NEXT: imulb %cl -; AVX512-NEXT: movl %eax, %r8d +; AVX512-NEXT: imulb %bl +; AVX512-NEXT: movl %eax, %r11d ; AVX512-NEXT: seto %al -; AVX512-NEXT: movl %r8d, %ecx -; AVX512-NEXT: andb $1, %cl -; AVX512-NEXT: negb %cl -; AVX512-NEXT: cmpb %r8b, %cl -; AVX512-NEXT: setne %cl -; AVX512-NEXT: orb %al, %cl +; AVX512-NEXT: movl %r11d, %ebx +; AVX512-NEXT: andb $1, %bl +; AVX512-NEXT: negb %bl +; AVX512-NEXT: cmpb %r11b, %bl +; AVX512-NEXT: setne %bl +; AVX512-NEXT: orb %al, %bl ; AVX512-NEXT: setne %al ; AVX512-NEXT: kmovd %eax, %k1 ; AVX512-NEXT: movw $-3, %ax ; AVX512-NEXT: kmovd %eax, %k0 ; 
AVX512-NEXT: kandw %k0, %k1, %k1 -; AVX512-NEXT: movl %edx, %eax +; AVX512-NEXT: movl %ecx, %eax ; AVX512-NEXT: imulb %sil -; AVX512-NEXT: movl %eax, %edx +; AVX512-NEXT: movl %eax, %r8d ; AVX512-NEXT: seto %al -; AVX512-NEXT: movl %edx, %ecx +; AVX512-NEXT: movl %r8d, %ecx ; AVX512-NEXT: andb $1, %cl ; AVX512-NEXT: negb %cl -; AVX512-NEXT: cmpb %dl, %cl +; AVX512-NEXT: cmpb %r8b, %cl ; AVX512-NEXT: setne %cl ; AVX512-NEXT: orb %al, %cl ; AVX512-NEXT: setne %al @@ -3627,8 +3628,8 @@ define <4 x i32> @smulo_v4i1(<4 x i1> %a0, <4 x i1> %a1, <4 x i1>* %p2) nounwind ; AVX512-NEXT: movw $-5, %ax ; AVX512-NEXT: kmovd %eax, %k1 ; AVX512-NEXT: kandw %k1, %k2, %k2 -; AVX512-NEXT: movl %r11d, %eax -; AVX512-NEXT: imulb %bl +; AVX512-NEXT: movl %ebp, %eax +; AVX512-NEXT: imulb %dl ; AVX512-NEXT: movl %eax, %esi ; AVX512-NEXT: seto %al ; AVX512-NEXT: movl %esi, %ecx @@ -3643,25 +3644,26 @@ define <4 x i32> @smulo_v4i1(<4 x i1> %a0, <4 x i1> %a1, <4 x i1>* %p2) nounwind ; AVX512-NEXT: korw %k3, %k2, %k2 ; AVX512-NEXT: kshiftlw $13, %k2, %k2 ; AVX512-NEXT: kshiftrw $13, %k2, %k2 -; AVX512-NEXT: movl %r10d, %eax -; AVX512-NEXT: imulb %r9b +; AVX512-NEXT: movl %r9d, %eax +; AVX512-NEXT: imulb %r10b ; AVX512-NEXT: # kill: def $al killed $al def $eax ; AVX512-NEXT: seto %cl -; AVX512-NEXT: movl %eax, %ebx -; AVX512-NEXT: andb $1, %bl -; AVX512-NEXT: negb %bl -; AVX512-NEXT: cmpb %al, %bl -; AVX512-NEXT: setne %bl -; AVX512-NEXT: orb %cl, %bl +; AVX512-NEXT: movl %eax, %edx +; AVX512-NEXT: andb $1, %dl +; AVX512-NEXT: negb %dl +; AVX512-NEXT: cmpb %al, %dl +; AVX512-NEXT: setne %dl +; AVX512-NEXT: orb %cl, %dl ; AVX512-NEXT: setne %cl ; AVX512-NEXT: kmovd %ecx, %k3 ; AVX512-NEXT: kshiftlw $3, %k3, %k3 ; AVX512-NEXT: korw %k3, %k2, %k2 ; AVX512-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 ; AVX512-NEXT: vmovdqa32 %xmm0, %xmm0 {%k2} {z} -; AVX512-NEXT: kmovd %r8d, %k2 +; AVX512-NEXT: andl $1, %r11d +; AVX512-NEXT: kmovw %r11d, %k2 ; AVX512-NEXT: kandw %k0, %k2, %k0 -; AVX512-NEXT: kmovd %edx, %k2 +; AVX512-NEXT: kmovd %r8d, %k2 ; AVX512-NEXT: kshiftlw $15, %k2, %k2 ; AVX512-NEXT: kshiftrw $14, %k2, %k2 ; AVX512-NEXT: korw %k2, %k0, %k0 @@ -3680,6 +3682,7 @@ define <4 x i32> @smulo_v4i1(<4 x i1> %a0, <4 x i1> %a1, <4 x i1>* %p2) nounwind ; AVX512-NEXT: kmovd %k0, %eax ; AVX512-NEXT: movb %al, (%rdi) ; AVX512-NEXT: popq %rbx +; AVX512-NEXT: popq %rbp ; AVX512-NEXT: retq %t = call {<4 x i1>, <4 x i1>} @llvm.smul.with.overflow.v4i1(<4 x i1> %a0, <4 x i1> %a1) %val = extractvalue {<4 x i1>, <4 x i1>} %t, 0 diff --git a/llvm/test/CodeGen/X86/vec_ssubo.ll b/llvm/test/CodeGen/X86/vec_ssubo.ll index d4d4585..f9dbacc 100644 --- a/llvm/test/CodeGen/X86/vec_ssubo.ll +++ b/llvm/test/CodeGen/X86/vec_ssubo.ll @@ -1140,6 +1140,8 @@ define <4 x i32> @ssubo_v4i1(<4 x i1> %a0, <4 x i1> %a1, <4 x i1>* %p2) nounwind ; AVX512-NEXT: kxorw %k2, %k0, %k1 ; AVX512-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 ; AVX512-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z} +; AVX512-NEXT: kshiftlw $12, %k0, %k0 +; AVX512-NEXT: kshiftrw $12, %k0, %k0 ; AVX512-NEXT: kmovd %k0, %eax ; AVX512-NEXT: movb %al, (%rdi) ; AVX512-NEXT: retq diff --git a/llvm/test/CodeGen/X86/vec_uaddo.ll b/llvm/test/CodeGen/X86/vec_uaddo.ll index c34653b..5954f11 100644 --- a/llvm/test/CodeGen/X86/vec_uaddo.ll +++ b/llvm/test/CodeGen/X86/vec_uaddo.ll @@ -1180,7 +1180,9 @@ define <4 x i32> @uaddo_v4i1(<4 x i1> %a0, <4 x i1> %a1, <4 x i1>* %p2) nounwind ; AVX512-NEXT: kandnw %k0, %k1, %k2 ; AVX512-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 ; AVX512-NEXT: vmovdqa32 %xmm0, %xmm0 {%k2} {z} -; AVX512-NEXT: kmovd 
%k1, %eax +; AVX512-NEXT: kshiftlw $12, %k1, %k0 +; AVX512-NEXT: kshiftrw $12, %k0, %k0 +; AVX512-NEXT: kmovd %k0, %eax ; AVX512-NEXT: movb %al, (%rdi) ; AVX512-NEXT: retq %t = call {<4 x i1>, <4 x i1>} @llvm.uadd.with.overflow.v4i1(<4 x i1> %a0, <4 x i1> %a1) diff --git a/llvm/test/CodeGen/X86/vec_umulo.ll b/llvm/test/CodeGen/X86/vec_umulo.ll index cc25fd5..4766fe9 100644 --- a/llvm/test/CodeGen/X86/vec_umulo.ll +++ b/llvm/test/CodeGen/X86/vec_umulo.ll @@ -3188,52 +3188,53 @@ define <4 x i32> @umulo_v4i1(<4 x i1> %a0, <4 x i1> %a1, <4 x i1>* %p2) nounwind ; ; AVX512-LABEL: umulo_v4i1: ; AVX512: # %bb.0: +; AVX512-NEXT: pushq %rbp ; AVX512-NEXT: pushq %rbx ; AVX512-NEXT: vpslld $31, %xmm0, %xmm0 ; AVX512-NEXT: vptestmd %xmm0, %xmm0, %k0 ; AVX512-NEXT: kshiftrw $3, %k0, %k1 -; AVX512-NEXT: kmovd %k1, %r9d -; AVX512-NEXT: andb $1, %r9b +; AVX512-NEXT: kmovd %k1, %r8d +; AVX512-NEXT: andb $1, %r8b ; AVX512-NEXT: vpslld $31, %xmm1, %xmm0 ; AVX512-NEXT: vptestmd %xmm0, %xmm0, %k1 ; AVX512-NEXT: kshiftrw $3, %k1, %k2 +; AVX512-NEXT: kmovd %k2, %r9d +; AVX512-NEXT: andb $1, %r9b +; AVX512-NEXT: kshiftrw $2, %k0, %k2 ; AVX512-NEXT: kmovd %k2, %r10d ; AVX512-NEXT: andb $1, %r10b -; AVX512-NEXT: kshiftrw $2, %k0, %k2 +; AVX512-NEXT: kshiftrw $2, %k1, %k2 ; AVX512-NEXT: kmovd %k2, %r11d ; AVX512-NEXT: andb $1, %r11b -; AVX512-NEXT: kshiftrw $2, %k1, %k2 -; AVX512-NEXT: kmovd %k2, %ebx -; AVX512-NEXT: andb $1, %bl ; AVX512-NEXT: kshiftrw $1, %k0, %k2 -; AVX512-NEXT: kmovd %k2, %edx -; AVX512-NEXT: andb $1, %dl +; AVX512-NEXT: kmovd %k2, %ecx +; AVX512-NEXT: andb $1, %cl ; AVX512-NEXT: kshiftrw $1, %k1, %k2 ; AVX512-NEXT: kmovd %k2, %esi ; AVX512-NEXT: andb $1, %sil ; AVX512-NEXT: kmovd %k0, %eax ; AVX512-NEXT: andb $1, %al -; AVX512-NEXT: kmovd %k1, %ecx -; AVX512-NEXT: andb $1, %cl +; AVX512-NEXT: kmovd %k1, %edx +; AVX512-NEXT: andb $1, %dl ; AVX512-NEXT: # kill: def $al killed $al killed $eax -; AVX512-NEXT: mulb %cl -; AVX512-NEXT: movl %eax, %r8d +; AVX512-NEXT: mulb %dl +; AVX512-NEXT: movl %eax, %edx ; AVX512-NEXT: seto %al -; AVX512-NEXT: testb $-2, %r8b -; AVX512-NEXT: setne %cl -; AVX512-NEXT: orb %al, %cl +; AVX512-NEXT: testb $-2, %dl +; AVX512-NEXT: setne %bl +; AVX512-NEXT: orb %al, %bl ; AVX512-NEXT: setne %al ; AVX512-NEXT: kmovd %eax, %k1 ; AVX512-NEXT: movw $-3, %ax ; AVX512-NEXT: kmovd %eax, %k0 ; AVX512-NEXT: kandw %k0, %k1, %k1 -; AVX512-NEXT: movl %edx, %eax +; AVX512-NEXT: movl %ecx, %eax ; AVX512-NEXT: mulb %sil -; AVX512-NEXT: movl %eax, %edx +; AVX512-NEXT: movl %eax, %ebp ; AVX512-NEXT: seto %al -; AVX512-NEXT: testb $-2, %dl -; AVX512-NEXT: setne %cl -; AVX512-NEXT: orb %al, %cl +; AVX512-NEXT: testb $-2, %bpl +; AVX512-NEXT: setne %bl +; AVX512-NEXT: orb %al, %bl ; AVX512-NEXT: setne %al ; AVX512-NEXT: kmovd %eax, %k2 ; AVX512-NEXT: kshiftlw $15, %k2, %k2 @@ -3242,35 +3243,36 @@ define <4 x i32> @umulo_v4i1(<4 x i1> %a0, <4 x i1> %a1, <4 x i1>* %p2) nounwind ; AVX512-NEXT: movw $-5, %ax ; AVX512-NEXT: kmovd %eax, %k1 ; AVX512-NEXT: kandw %k1, %k2, %k2 -; AVX512-NEXT: movl %r11d, %eax -; AVX512-NEXT: mulb %bl +; AVX512-NEXT: movl %r10d, %eax +; AVX512-NEXT: mulb %r11b ; AVX512-NEXT: movl %eax, %esi ; AVX512-NEXT: seto %al ; AVX512-NEXT: testb $-2, %sil -; AVX512-NEXT: setne %cl -; AVX512-NEXT: orb %al, %cl +; AVX512-NEXT: setne %bl +; AVX512-NEXT: orb %al, %bl ; AVX512-NEXT: setne %al ; AVX512-NEXT: kmovd %eax, %k3 ; AVX512-NEXT: kshiftlw $2, %k3, %k3 ; AVX512-NEXT: korw %k3, %k2, %k2 ; AVX512-NEXT: kshiftlw $13, %k2, %k2 ; AVX512-NEXT: kshiftrw $13, %k2, %k2 -; 
AVX512-NEXT: movl %r9d, %eax -; AVX512-NEXT: mulb %r10b +; AVX512-NEXT: movl %r8d, %eax +; AVX512-NEXT: mulb %r9b ; AVX512-NEXT: # kill: def $al killed $al def $eax -; AVX512-NEXT: seto %cl +; AVX512-NEXT: seto %bl ; AVX512-NEXT: testb $-2, %al -; AVX512-NEXT: setne %bl -; AVX512-NEXT: orb %cl, %bl +; AVX512-NEXT: setne %cl +; AVX512-NEXT: orb %bl, %cl ; AVX512-NEXT: setne %cl ; AVX512-NEXT: kmovd %ecx, %k3 ; AVX512-NEXT: kshiftlw $3, %k3, %k3 ; AVX512-NEXT: korw %k3, %k2, %k2 ; AVX512-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 ; AVX512-NEXT: vmovdqa32 %xmm0, %xmm0 {%k2} {z} -; AVX512-NEXT: kmovd %r8d, %k2 +; AVX512-NEXT: andl $1, %edx +; AVX512-NEXT: kmovw %edx, %k2 ; AVX512-NEXT: kandw %k0, %k2, %k0 -; AVX512-NEXT: kmovd %edx, %k2 +; AVX512-NEXT: kmovd %ebp, %k2 ; AVX512-NEXT: kshiftlw $15, %k2, %k2 ; AVX512-NEXT: kshiftrw $14, %k2, %k2 ; AVX512-NEXT: korw %k2, %k0, %k0 @@ -3289,6 +3291,7 @@ define <4 x i32> @umulo_v4i1(<4 x i1> %a0, <4 x i1> %a1, <4 x i1>* %p2) nounwind ; AVX512-NEXT: kmovd %k0, %eax ; AVX512-NEXT: movb %al, (%rdi) ; AVX512-NEXT: popq %rbx +; AVX512-NEXT: popq %rbp ; AVX512-NEXT: retq %t = call {<4 x i1>, <4 x i1>} @llvm.umul.with.overflow.v4i1(<4 x i1> %a0, <4 x i1> %a1) %val = extractvalue {<4 x i1>, <4 x i1>} %t, 0 diff --git a/llvm/test/CodeGen/X86/vec_usubo.ll b/llvm/test/CodeGen/X86/vec_usubo.ll index 76c3e5a..afb0f6c 100644 --- a/llvm/test/CodeGen/X86/vec_usubo.ll +++ b/llvm/test/CodeGen/X86/vec_usubo.ll @@ -1227,7 +1227,9 @@ define <4 x i32> @usubo_v4i1(<4 x i1> %a0, <4 x i1> %a1, <4 x i1>* %p2) nounwind ; AVX512-NEXT: vptestnmd %xmm0, %xmm0, %k2 {%k1} ; AVX512-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 ; AVX512-NEXT: vmovdqa32 %xmm0, %xmm0 {%k2} {z} -; AVX512-NEXT: kmovd %k1, %eax +; AVX512-NEXT: kshiftlw $12, %k1, %k0 +; AVX512-NEXT: kshiftrw $12, %k0, %k0 +; AVX512-NEXT: kmovd %k0, %eax ; AVX512-NEXT: movb %al, (%rdi) ; AVX512-NEXT: retq %t = call {<4 x i1>, <4 x i1>} @llvm.usub.with.overflow.v4i1(<4 x i1> %a0, <4 x i1> %a1) -- 2.7.4
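As an illustration of the invariant the patch enforces (this sketch is not part of the patch; the helper name and test values are hypothetical): when an N-bit mask with N < 8 is stored into a byte, the unused high bits must be zero, because a later zextload from i1/i2/i4 reads the whole byte and is legalized assuming those bits are 0. A minimal standalone C++ model of the masking that DAG.getZeroExtendInReg applies to the stored value:

    #include <cassert>
    #include <cstdint>

    // Hypothetical model of the fixed lowering: before the byte store,
    // keep only the low NumElts mask bits (zero-extend-in-reg from iN to i8).
    static uint8_t storeMaskByte(uint8_t KReg, unsigned NumElts) {
      assert(NumElts < 8 && "only v1i1/v2i1/v4i1 need the padding");
      return KReg & ((1u << NumElts) - 1);
    }

    int main() {
      // A k-register holding a <2 x i1> value may carry garbage above bit 1.
      uint8_t KReg = 0xAD; // low two bits (0b01) are the real v2i1 payload
      uint8_t Byte = storeMaskByte(KReg, 2);
      // A zextload from i2 reads the whole byte and assumes bits 2..7 are 0.
      assert(Byte == 0x01);
      return 0;
    }

Without the masking, the byte written to memory could be 0xAD rather than 0x01, and the zero-extended reload would disagree with the two mask bits that were actually stored, which is the garbage-bits failure mode described in the commit message.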