From f61727d8808530dc60394b99c01d4b0124a05c42 Mon Sep 17 00:00:00 2001 From: Elena Demikhovsky Date: Wed, 20 May 2015 14:32:03 +0000 Subject: [PATCH] AVX-512: fixed algorithm of building vectors of i1 elements fixed extract-insert i1 element, load i1, zextload i1 should be with "and $1, %reg" to prevent loading garbage. added a bunch of new tests. llvm-svn: 237793 --- llvm/lib/Target/X86/X86ISelLowering.cpp | 182 +++++++++++++++++-------- llvm/lib/Target/X86/X86InstrAVX512.td | 8 +- llvm/lib/Target/X86/X86InstrCompiler.td | 9 +- llvm/test/CodeGen/X86/avx512-insert-extract.ll | 33 +++-- llvm/test/CodeGen/X86/avx512-mask-op.ll | 113 ++++++++++++++- llvm/test/CodeGen/X86/avx512-select.ll | 21 ++- llvm/test/CodeGen/X86/avx512-trunc-ext.ll | 7 +- 7 files changed, 282 insertions(+), 91 deletions(-) diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index e83e62a..964a599 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -1471,6 +1471,10 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v64i1, Custom); setOperationAction(ISD::SELECT, MVT::v32i1, Custom); setOperationAction(ISD::SELECT, MVT::v64i1, Custom); + setOperationAction(ISD::SIGN_EXTEND, MVT::v32i8, Custom); + setOperationAction(ISD::SIGN_EXTEND, MVT::v64i8, Custom); + setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v32i1, Custom); + setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v64i1, Custom); for (int i = MVT::v32i8; i != MVT::v8i64; ++i) { const MVT VT = (MVT::SimpleValueType)i; @@ -1500,6 +1504,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v4i1, Custom); setOperationAction(ISD::SELECT, MVT::v4i1, Custom); setOperationAction(ISD::SELECT, MVT::v2i1, Custom); + setOperationAction(ISD::BUILD_VECTOR, MVT::v4i1, Custom); + setOperationAction(ISD::BUILD_VECTOR, MVT::v2i1, Custom); 
setOperationAction(ISD::AND, MVT::v8i32, Legal); setOperationAction(ISD::OR, MVT::v8i32, Legal); @@ -5188,12 +5194,27 @@ static SDValue buildFromShuffleMostly(SDValue Op, SelectionDAG &DAG) { return NV; } +static SDValue ConvertI1VectorToInterger(SDValue Op, SelectionDAG &DAG) { + assert(ISD::isBuildVectorOfConstantSDNodes(Op.getNode()) && + Op.getScalarValueSizeInBits() == 1 && + "Can not convert non-constant vector"); + uint64_t Immediate = 0; + for (unsigned idx = 0, e = Op.getNumOperands(); idx < e; ++idx) { + SDValue In = Op.getOperand(idx); + if (In.getOpcode() != ISD::UNDEF) + Immediate |= cast(In)->getZExtValue() << idx; + } + SDLoc dl(Op); + MVT VT = + MVT::getIntegerVT(std::max((int)Op.getValueType().getSizeInBits(), 8)); + return DAG.getConstant(Immediate, dl, VT); +} // Lower BUILD_VECTOR operation for v8i1 and v16i1 types. SDValue X86TargetLowering::LowerBUILD_VECTORvXi1(SDValue Op, SelectionDAG &DAG) const { MVT VT = Op.getSimpleValueType(); - assert((VT.getVectorElementType() == MVT::i1) && (VT.getSizeInBits() <= 16) && + assert((VT.getVectorElementType() == MVT::i1) && "Unexpected type in LowerBUILD_VECTORvXi1!"); SDLoc dl(Op); @@ -5209,62 +5230,69 @@ X86TargetLowering::LowerBUILD_VECTORvXi1(SDValue Op, SelectionDAG &DAG) const { return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Ops); } - bool AllContants = true; + if (ISD::isBuildVectorOfConstantSDNodes(Op.getNode())) { + SDValue Imm = ConvertI1VectorToInterger(Op, DAG); + if (Imm.getValueSizeInBits() == VT.getSizeInBits()) + return DAG.getNode(ISD::BITCAST, dl, VT, Imm); + SDValue ExtVec = DAG.getNode(ISD::BITCAST, dl, MVT::v8i1, Imm); + return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, ExtVec, + DAG.getIntPtrConstant(0, dl)); + } + + // Vector has one or more non-const elements uint64_t Immediate = 0; - int NonConstIdx = -1; + SmallVector NonConstIdx; bool IsSplat = true; - unsigned NumNonConsts = 0; - unsigned NumConsts = 0; + bool HasConstElts = false; + int SplatIdx = -1; for (unsigned idx = 0, e 
= Op.getNumOperands(); idx < e; ++idx) { SDValue In = Op.getOperand(idx); if (In.getOpcode() == ISD::UNDEF) continue; - if (!isa(In)) { - AllContants = false; - NonConstIdx = idx; - NumNonConsts++; - } else { - NumConsts++; - if (cast(In)->getZExtValue()) - Immediate |= (1ULL << idx); + if (!isa(In)) + NonConstIdx.push_back(idx); + else { + Immediate |= cast(In)->getZExtValue() << idx; + HasConstElts = true; } - if (In != Op.getOperand(0)) + if (SplatIdx == -1) + SplatIdx = idx; + else if (In != Op.getOperand(SplatIdx)) IsSplat = false; } - if (AllContants) { - SDValue FullMask = DAG.getNode(ISD::BITCAST, dl, MVT::v16i1, - DAG.getConstant(Immediate, dl, MVT::i16)); - return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, FullMask, - DAG.getIntPtrConstant(0, dl)); + // for splat use " (select i1 splat_elt, all-ones, all-zeroes)" + if (IsSplat) + return DAG.getNode(ISD::SELECT, dl, VT, Op.getOperand(SplatIdx), + DAG.getConstant(1, dl, VT), + DAG.getConstant(0, dl, VT)); + + // insert elements one by one + SDValue DstVec; + SDValue Imm; + if (Immediate) { + MVT ImmVT = MVT::getIntegerVT(std::max((int)VT.getSizeInBits(), 8)); + Imm = DAG.getConstant(Immediate, dl, ImmVT); + } + else if (HasConstElts) + Imm = DAG.getConstant(0, dl, VT); + else + Imm = DAG.getUNDEF(VT); + if (Imm.getValueSizeInBits() == VT.getSizeInBits()) + DstVec = DAG.getNode(ISD::BITCAST, dl, VT, Imm); + else { + SDValue ExtVec = DAG.getNode(ISD::BITCAST, dl, MVT::v8i1, Imm); + DstVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, ExtVec, + DAG.getIntPtrConstant(0, dl)); } - if (NumNonConsts == 1 && NonConstIdx != 0) { - SDValue DstVec; - if (NumConsts) { - SDValue VecAsImm = DAG.getConstant(Immediate, dl, - MVT::getIntegerVT(VT.getSizeInBits())); - DstVec = DAG.getNode(ISD::BITCAST, dl, VT, VecAsImm); - } - else - DstVec = DAG.getUNDEF(VT); - return DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, DstVec, - Op.getOperand(NonConstIdx), - DAG.getIntPtrConstant(NonConstIdx, dl)); - } - if (!IsSplat && 
(NonConstIdx != 0)) - llvm_unreachable("Unsupported BUILD_VECTOR operation"); - MVT SelectVT = (VT == MVT::v16i1)? MVT::i16 : MVT::i8; - SDValue Select; - if (IsSplat) - Select = DAG.getNode(ISD::SELECT, dl, SelectVT, Op.getOperand(0), - DAG.getConstant(-1, dl, SelectVT), - DAG.getConstant(0, dl, SelectVT)); - else - Select = DAG.getNode(ISD::SELECT, dl, SelectVT, Op.getOperand(0), - DAG.getConstant((Immediate | 1), dl, SelectVT), - DAG.getConstant(Immediate, dl, SelectVT)); - return DAG.getNode(ISD::BITCAST, dl, VT, Select); + for (unsigned i = 0; i < NonConstIdx.size(); ++i) { + unsigned InsertIdx = NonConstIdx[i]; + DstVec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, DstVec, + Op.getOperand(InsertIdx), + DAG.getIntPtrConstant(InsertIdx, dl)); + } + return DstVec; } /// \brief Return true if \p N implements a horizontal binop and return the @@ -10670,15 +10698,11 @@ X86TargetLowering::InsertBitToMaskVector(SDValue Op, SelectionDAG &DAG) const { unsigned IdxVal = cast(Idx)->getZExtValue(); SDValue EltInVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, Elt); + if (IdxVal) + EltInVec = DAG.getNode(X86ISD::VSHLI, dl, VecVT, EltInVec, + DAG.getConstant(IdxVal, dl, MVT::i8)); if (Vec.getOpcode() == ISD::UNDEF) - return DAG.getNode(X86ISD::VSHLI, dl, VecVT, EltInVec, - DAG.getConstant(IdxVal, dl, MVT::i8)); - const TargetRegisterClass* rc = getRegClassFor(VecVT); - unsigned MaxSift = rc->getSize()*8 - 1; - EltInVec = DAG.getNode(X86ISD::VSHLI, dl, VecVT, EltInVec, - DAG.getConstant(MaxSift, dl, MVT::i8)); - EltInVec = DAG.getNode(X86ISD::VSRLI, dl, VecVT, EltInVec, - DAG.getConstant(MaxSift - IdxVal, dl, MVT::i8)); + return EltInVec; return DAG.getNode(ISD::OR, dl, VecVT, Vec, EltInVec); } @@ -13623,6 +13647,29 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const { } } + if (VT.isVector() && VT.getScalarType() == MVT::i1) { + SDValue Op1Scalar; + if (ISD::isBuildVectorOfConstantSDNodes(Op1.getNode())) + Op1Scalar = 
ConvertI1VectorToInterger(Op1, DAG); + else if (Op1.getOpcode() == ISD::BITCAST && Op1.getOperand(0)) + Op1Scalar = Op1.getOperand(0); + SDValue Op2Scalar; + if (ISD::isBuildVectorOfConstantSDNodes(Op2.getNode())) + Op2Scalar = ConvertI1VectorToInterger(Op2, DAG); + else if (Op2.getOpcode() == ISD::BITCAST && Op2.getOperand(0)) + Op2Scalar = Op2.getOperand(0); + if (Op1Scalar.getNode() && Op2Scalar.getNode()) { + SDValue newSelect = DAG.getNode(ISD::SELECT, DL, + Op1Scalar.getValueType(), + Cond, Op1Scalar, Op2Scalar); + if (newSelect.getValueSizeInBits() == VT.getSizeInBits()) + return DAG.getNode(ISD::BITCAST, DL, VT, newSelect); + SDValue ExtVec = DAG.getNode(ISD::BITCAST, DL, MVT::v8i1, newSelect); + return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, ExtVec, + DAG.getIntPtrConstant(0, DL)); + } + } + if (VT == MVT::v4i1 || VT == MVT::v2i1) { SDValue zeroConst = DAG.getIntPtrConstant(0, DL); Op1 = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, MVT::v8i1, @@ -20728,7 +20775,8 @@ static SDValue XFormVExtractWithShuffleIntoLoad(SDNode *N, SelectionDAG &DAG, if (!InVec.hasOneUse()) return SDValue(); EVT BCVT = InVec.getOperand(0).getValueType(); - if (BCVT.getVectorNumElements() != OriginalVT.getVectorNumElements()) + if (!BCVT.isVector() || + BCVT.getVectorNumElements() != OriginalVT.getVectorNumElements()) return SDValue(); InVec = InVec.getOperand(0); } @@ -20833,7 +20881,7 @@ static SDValue PerformEXTRACT_VECTOR_ELTCombine(SDNode *N, SelectionDAG &DAG, return NewOp; SDValue InputVector = N->getOperand(0); - + SDLoc dl(InputVector); // Detect mmx to i32 conversion through a v2i32 elt extract. 
if (InputVector.getOpcode() == ISD::BITCAST && InputVector.hasOneUse() && N->getValueType(0) == MVT::i32 && @@ -20858,6 +20906,18 @@ static SDValue PerformEXTRACT_VECTOR_ELTCombine(SDNode *N, SelectionDAG &DAG, MMXSrcOp.getOperand(0)); } + EVT VT = N->getValueType(0); + + if (VT == MVT::i1 && dyn_cast(N->getOperand(1)) && + InputVector.getOpcode() == ISD::BITCAST && + dyn_cast(InputVector.getOperand(0))) { + uint64_t ExtractedElt = + cast(N->getOperand(1))->getZExtValue(); + uint64_t InputValue = + cast(InputVector.getOperand(0))->getZExtValue(); + uint64_t Res = (InputValue >> ExtractedElt) & 1; + return DAG.getConstant(Res, dl, MVT::i1); + } // Only operate on vectors of 4 elements, where the alternative shuffling // gets to be more expensive. if (InputVector.getValueType() != MVT::v4i32) @@ -20903,7 +20963,6 @@ static SDValue PerformEXTRACT_VECTOR_ELTCombine(SDNode *N, SelectionDAG &DAG, // otherwise bounce the vector off the cache. const TargetLowering &TLI = DAG.getTargetLoweringInfo(); SDValue Vals[4]; - SDLoc dl(InputVector); if (TLI.isOperationLegal(ISD::SRA, MVT::i64)) { SDValue Cst = DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, InputVector); @@ -23606,6 +23665,7 @@ static SDValue PerformSExtCombine(SDNode *N, SelectionDAG &DAG, const X86Subtarget *Subtarget) { SDValue N0 = N->getOperand(0); EVT VT = N->getValueType(0); + SDLoc dl(N); // (i8,i32 sext (sdivrem (i8 x, i8 y)) -> // (i8,i32 (sdivrem_sext_hreg (i8 x, i8 y) @@ -23613,7 +23673,6 @@ static SDValue PerformSExtCombine(SDNode *N, SelectionDAG &DAG, // from AH (which we otherwise need to do contortions to access). 
if (N0.getOpcode() == ISD::SDIVREM && N0.getResNo() == 1 && N0.getValueType() == MVT::i8 && VT == MVT::i32) { - SDLoc dl(N); SDVTList NodeTys = DAG.getVTList(MVT::i8, VT); SDValue R = DAG.getNode(X86ISD::SDIVREM8_SEXT_HREG, dl, NodeTys, N0.getOperand(0), N0.getOperand(1)); @@ -23621,8 +23680,15 @@ static SDValue PerformSExtCombine(SDNode *N, SelectionDAG &DAG, return R.getValue(1); } - if (!DCI.isBeforeLegalizeOps()) + if (!DCI.isBeforeLegalizeOps()) { + if (N0.getValueType() == MVT::i1) { + SDValue Zero = DAG.getConstant(0, dl, VT); + SDValue AllOnes = + DAG.getConstant(APInt::getAllOnesValue(VT.getSizeInBits()), dl, VT); + return DAG.getNode(ISD::SELECT, dl, VT, N0, AllOnes, Zero); + } return SDValue(); + } if (!Subtarget->hasFp256()) return SDValue(); diff --git a/llvm/lib/Target/X86/X86InstrAVX512.td b/llvm/lib/Target/X86/X86InstrAVX512.td index 1ce25ae..39f45fc 100644 --- a/llvm/lib/Target/X86/X86InstrAVX512.td +++ b/llvm/lib/Target/X86/X86InstrAVX512.td @@ -1855,7 +1855,9 @@ let Predicates = [HasAVX512] in { def : Pat<(store (i16 (bitconvert (v16i1 VK16:$src))), addr:$dst), (KMOVWmk addr:$dst, VK16:$src)>; def : Pat<(i1 (load addr:$src)), - (COPY_TO_REGCLASS (KMOVWkm addr:$src), VK1)>; + (COPY_TO_REGCLASS (AND16ri (i16 (SUBREG_TO_REG (i32 0), + (MOV8rm addr:$src), sub_8bit)), + (i16 1)), VK1)>; def : Pat<(v16i1 (bitconvert (i16 (load addr:$src)))), (KMOVWkm addr:$src)>; } @@ -1920,13 +1922,13 @@ let Predicates = [HasAVX512, NoDQI] in { // GR from/to 8-bit mask without native support def : Pat<(v8i1 (bitconvert (i8 GR8:$src))), (COPY_TO_REGCLASS - (KMOVWkr (SUBREG_TO_REG (i32 0), GR8:$src, sub_8bit)), - VK8)>; + (KMOVWkr (MOVZX32rr8 GR8 :$src)), VK8)>; def : Pat<(i8 (bitconvert (v8i1 VK8:$src))), (EXTRACT_SUBREG (KMOVWrk (COPY_TO_REGCLASS VK8:$src, VK16)), sub_8bit)>; } + let Predicates = [HasAVX512] in { def : Pat<(i1 (X86Vextract VK16:$src, (iPTR 0))), (COPY_TO_REGCLASS VK16:$src, VK1)>; diff --git a/llvm/lib/Target/X86/X86InstrCompiler.td 
b/llvm/lib/Target/X86/X86InstrCompiler.td index 24d5d22..912a0fb 100644 --- a/llvm/lib/Target/X86/X86InstrCompiler.td +++ b/llvm/lib/Target/X86/X86InstrCompiler.td @@ -1064,11 +1064,12 @@ defm : CMOVmr; defm : CMOVmr; // zextload bool -> zextload byte -def : Pat<(zextloadi8i1 addr:$src), (MOV8rm addr:$src)>; -def : Pat<(zextloadi16i1 addr:$src), (MOVZX16rm8 addr:$src)>; -def : Pat<(zextloadi32i1 addr:$src), (MOVZX32rm8 addr:$src)>; +def : Pat<(zextloadi8i1 addr:$src), (AND8ri (MOV8rm addr:$src), (i8 1))>; +def : Pat<(zextloadi16i1 addr:$src), (AND16ri (MOVZX16rm8 addr:$src), (i16 1))>; +def : Pat<(zextloadi32i1 addr:$src), (AND32ri (MOVZX32rm8 addr:$src), (i32 1))>; def : Pat<(zextloadi64i1 addr:$src), - (SUBREG_TO_REG (i64 0), (MOVZX32rm8 addr:$src), sub_32bit)>; + (SUBREG_TO_REG (i64 0), + (AND32ri (MOVZX32rm8 addr:$src), (i32 1)), sub_32bit)>; // extload bool -> extload byte // When extloading from 16-bit and smaller memory locations into 64-bit diff --git a/llvm/test/CodeGen/X86/avx512-insert-extract.ll b/llvm/test/CodeGen/X86/avx512-insert-extract.ll index 6498b20..6f985f0 100644 --- a/llvm/test/CodeGen/X86/avx512-insert-extract.ll +++ b/llvm/test/CodeGen/X86/avx512-insert-extract.ll @@ -137,10 +137,12 @@ define i64 @test12(<16 x i64>%a, <16 x i64>%b, i64 %a1, i64 %b1) { } ;CHECK-LABEL: test13 -;CHECK: cmpl -;CHECK: sbbl -;CHECK: orl $65532 -;CHECK: ret +;CHECK: cmpl %esi, %edi +;CHECK: setb %al +;CHECK: andl $1, %eax +;CHECK: kmovw %eax, %k0 +;CHECK: movw $-4 +;CHECK: korw define i16 @test13(i32 %a, i32 %b) { %cmp_res = icmp ult i32 %a, %b %maskv = insertelement <16 x i1> , i1 %cmp_res, i32 0 @@ -167,19 +169,22 @@ define i64 @test14(<8 x i64>%a, <8 x i64>%b, i64 %a1, i64 %b1) { } ;CHECK-LABEL: test15 -;CHECK: kshiftlw -;CHECK: kmovw -;CHECK: ret +;CHECK: movb (%rdi), %al +;CHECK: andb $1, %al +;CHECK: movw $-1, %ax +;CHECK: cmovew define i16 @test15(i1 *%addr) { - %x = load i1 , i1 * %addr, align 128 + %x = load i1 , i1 * %addr, align 1 %x1 = insertelement 
<16 x i1> undef, i1 %x, i32 10 %x2 = bitcast <16 x i1>%x1 to i16 ret i16 %x2 } ;CHECK-LABEL: test16 -;CHECK: kshiftlw -;CHECK: kshiftrw +;CHECK: movb (%rdi), %al +;CHECK: andw $1, %ax +;CHECK: kmovw +;CHECK: kshiftlw $10 ;CHECK: korw ;CHECK: ret define i16 @test16(i1 *%addr, i16 %a) { @@ -191,11 +196,11 @@ define i16 @test16(i1 *%addr, i16 %a) { } ;CHECK-LABEL: test17 -;KNL: kshiftlw -;KNL: kshiftrw +;KNL: movb (%rdi), %al +;KNL: andw $1, %ax +;KNL: kshiftlw $4 ;KNL: korw -;SKX: kshiftlb -;SKX: kshiftrb +;SKX: kshiftlb $4 ;SKX: korb ;CHECK: ret define i8 @test17(i1 *%addr, i8 %a) { diff --git a/llvm/test/CodeGen/X86/avx512-mask-op.ll b/llvm/test/CodeGen/X86/avx512-mask-op.ll index 6e12289..eb4f7ec 100644 --- a/llvm/test/CodeGen/X86/avx512-mask-op.ll +++ b/llvm/test/CodeGen/X86/avx512-mask-op.ll @@ -191,7 +191,7 @@ false: ; SKX-LABEL: test7 ; SKX: vpmovw2m -; SKX: kmovw %eax, %k1 +; SKX: kmovb %eax, %k1 ; SKX: korb define void @test7(<8 x i1> %mask) { @@ -282,3 +282,114 @@ define <4 x i1> @test11(<4 x i1>%a, <4 x i1>%b, i32 %a1, i32 %b1) { ret <4 x i1>%c } +; KNL-LABEL: test12 +; KNL: movl %edi, %eax +define i32 @test12(i32 %x, i32 %y) { + %a = bitcast i16 21845 to <16 x i1> + %b = extractelement <16 x i1> %a, i32 0 + %c = select i1 %b, i32 %x, i32 %y + ret i32 %c +} + +; KNL-LABEL: test13 +; KNL: movl %esi, %eax +define i32 @test13(i32 %x, i32 %y) { + %a = bitcast i16 21845 to <16 x i1> + %b = extractelement <16 x i1> %a, i32 3 + %c = select i1 %b, i32 %x, i32 %y + ret i32 %c +} + +; SKX-LABEL: test14 +; SKX: movb $11, %al +; SKX: kmovb %eax, %k0 +; SKX: vpmovm2d %k0, %xmm0 + +define <4 x i1> @test14() { + %a = bitcast i16 21845 to <16 x i1> + %b = extractelement <16 x i1> %a, i32 2 + %c = insertelement <4 x i1> , i1 %b, i32 1 + ret <4 x i1> %c +} + +; KNL-LABEL: test15 +; KNL: cmovgw +define <16 x i1> @test15(i32 %x, i32 %y) { + %a = bitcast i16 21845 to <16 x i1> + %b = bitcast i16 1 to <16 x i1> + %mask = icmp sgt i32 %x, %y + %c = select i1 %mask, <16 x i1> %a, 
<16 x i1> %b + ret <16 x i1> %c +} + +; SKX-LABEL: test16 +; SKX: kxnorw %k1, %k1, %k1 +; SKX: kshiftrw $15, %k1, %k1 +; SKX: kshiftlq $5, %k1, %k1 +; SKX: korq %k1, %k0, %k0 +; SKX: vpmovm2b %k0, %zmm0 +define <64 x i8> @test16(i64 %x) { + %a = bitcast i64 %x to <64 x i1> + %b = insertelement <64 x i1>%a, i1 true, i32 5 + %c = sext <64 x i1>%b to <64 x i8> + ret <64 x i8>%c +} + +; SKX-LABEL: test17 +; SKX: setg %al +; SKX: andl $1, %eax +; SKX: kmovw %eax, %k1 +; SKX: kshiftlq $5, %k1, %k1 +; SKX: korq %k1, %k0, %k0 +; SKX: vpmovm2b %k0, %zmm0 +define <64 x i8> @test17(i64 %x, i32 %y, i32 %z) { + %a = bitcast i64 %x to <64 x i1> + %b = icmp sgt i32 %y, %z + %c = insertelement <64 x i1>%a, i1 %b, i32 5 + %d = sext <64 x i1>%c to <64 x i8> + ret <64 x i8>%d +} + +; KNL-LABEL: test18 +define <8 x i1> @test18(i8 %a, i16 %y) { + %b = bitcast i8 %a to <8 x i1> + %b1 = bitcast i16 %y to <16 x i1> + %el1 = extractelement <16 x i1>%b1, i32 8 + %el2 = extractelement <16 x i1>%b1, i32 9 + %c = insertelement <8 x i1>%b, i1 %el1, i32 7 + %d = insertelement <8 x i1>%c, i1 %el2, i32 6 + ret <8 x i1>%d +} + +; KNL-LABEL: test19 +; KNL: movzbl %dil, %eax +; KNL: kmovw %eax, %k0 +; KNL: kshiftlw $13, %k0, %k0 +; KNL: kshiftrw $15, %k0, %k0 +; KNL: kmovw %k0, %eax +; KNL: andl $1, %eax +; KNL: testb %al, %al + +define <8 x i1> @test19(i8 %a) { + %b = bitcast i8 %a to <8 x i1> + %c = shufflevector < 8 x i1>%b, <8 x i1>undef, <8 x i32> + ret <8 x i1> %c +} + +; KNL-LABEL: test20 +; KNL: movzbl %dil, %eax +; KNL: kmovw %eax, %k0 +; KNL: kshiftlw $13, %k0, %k1 +; KNL: kshiftrw $15, %k1, %k1 +; KNL: kshiftlw $12, %k0, %k0 +; KNL: kshiftrw $15, %k0, %k0 +; KNL: kshiftlw $4, %k0, %k0 +; KNL: kshiftlw $1, %k1, %k2 +; KNL: korw %k0, %k2, %k0 +; KNL: kshiftlw $6, %k1, %k1 +; KNL: korw %k1, %k0, %k1 +define <8 x i1> @test20(i8 %a, i16 %y) { + %b = bitcast i8 %a to <8 x i1> + %c = shufflevector < 8 x i1>%b, <8 x i1>undef, <8 x i32> + ret <8 x i1> %c +} diff --git 
a/llvm/test/CodeGen/X86/avx512-select.ll b/llvm/test/CodeGen/X86/avx512-select.ll index 0dbf286..b92e6f6 100644 --- a/llvm/test/CodeGen/X86/avx512-select.ll +++ b/llvm/test/CodeGen/X86/avx512-select.ll @@ -50,8 +50,10 @@ define <16 x double> @select04(<16 x double> %a, <16 x double> %b) { } ; CHECK-LABEL: select05 -; CHECK: kmovw %esi, %k0 -; CHECK-NEXT: kmovw %edi, %k1 +; CHECK: movzbl %sil, %eax +; CHECK: kmovw %eax, %k0 +; CHECK: movzbl %dil, %eax +; CHECK: kmovw %eax, %k1 ; CHECK-NEXT: korw %k1, %k0, %k0 ; CHECK-NEXT: kmovw %k0, %eax define i8 @select05(i8 %a.0, i8 %m) { @@ -63,8 +65,10 @@ define i8 @select05(i8 %a.0, i8 %m) { } ; CHECK-LABEL: select06 -; CHECK: kmovw %esi, %k0 -; CHECK-NEXT: kmovw %edi, %k1 +; CHECK: movzbl %sil, %eax +; CHECK: kmovw %eax, %k0 +; CHECK: movzbl %dil, %eax +; CHECK: kmovw %eax, %k1 ; CHECK-NEXT: kandw %k1, %k0, %k0 ; CHECK-NEXT: kmovw %k0, %eax define i8 @select06(i8 %a.0, i8 %m) { @@ -76,9 +80,12 @@ define i8 @select06(i8 %a.0, i8 %m) { } ; CHECK-LABEL: select07 -; CHECK-DAG: kmovw %edx, %k0 -; CHECK-DAG: kmovw %edi, %k1 -; CHECK-DAG: kmovw %esi, %k2 +; CHECK-DAG: movzbl %dl, %eax +; CHECK-DAG: kmovw %eax, %k0 +; CHECK-DAG: movzbl %dil, %eax +; CHECK-DAG: kmovw %eax, %k1 +; CHECK-DAG: movzbl %sil, %eax +; CHECK-DAG: kmovw %eax, %k2 ; CHECK: kandw %k0, %k1, %k1 ; CHECK-NEXT: knotw %k0, %k0 ; CHECK-NEXT: kandw %k0, %k2, %k0 diff --git a/llvm/test/CodeGen/X86/avx512-trunc-ext.ll b/llvm/test/CodeGen/X86/avx512-trunc-ext.ll index 560d968..bb36a13 100644 --- a/llvm/test/CodeGen/X86/avx512-trunc-ext.ll +++ b/llvm/test/CodeGen/X86/avx512-trunc-ext.ll @@ -156,10 +156,9 @@ define <16 x i16> @trunc_v16i32_to_v16i16(<16 x i32> %x) { } ; CHECK-LABEL: trunc_i32_to_i1 -; CHECK: testb -; CHECK: setne -; CKECK: orl -; CHECK: ret +; CHECK: movw $-4, %ax +; CHECK: kmovw %eax, %k1 +; CHECK: korw define i16 @trunc_i32_to_i1(i32 %a) { %a_i = trunc i32 %a to i1 %maskv = insertelement <16 x i1> , i1 %a_i, i32 0 -- 2.7.4