From: David Green
Date: Wed, 24 Jul 2019 11:51:36 +0000 (+0000)
Subject: [ARM] MVE predicate register support
X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=c7e55d4f5213d27456c8a2f74ee5284181067d73;p=platform%2Fupstream%2Fllvm.git

[ARM] MVE predicate register support

This adds support code for building and shuffling i1 predicate registers. It
generally uses two basic principles: either converting the predicate into a
scalar (through a PREDICATE_CAST) and doing scalar operations on it there, or
converting the register to a full vector register and back.

Some of the code here is not particularly efficient, but it will hopefully
cover most cases of moving i1 vectors around and can be improved in subsequent
patches.

Some code by David Sherwood.

Differential Revision: https://reviews.llvm.org/D65052

llvm-svn: 366890
---

diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp
index 323e900..9b9ae0b 100644
--- a/llvm/lib/Target/ARM/ARMISelLowering.cpp
+++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp
@@ -340,6 +340,13 @@ void ARMTargetLowering::addMVEVectorTypes(bool HasMVEFP) {
   const MVT pTypes[] = {MVT::v16i1, MVT::v8i1, MVT::v4i1};
   for (auto VT : pTypes) {
     addRegisterClass(VT, &ARM::VCCRRegClass);
+    setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
+    setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
+    setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);
+    setOperationAction(ISD::CONCAT_VECTORS, VT, Custom);
+    setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
+    setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
+    setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Expand);
   }
 }
@@ -1505,6 +1512,7 @@ const char *ARMTargetLowering::getTargetNodeName(unsigned Opcode) const {
   case ARMISD::WIN__CHKSTK: return "ARMISD::WIN__CHKSTK";
   case ARMISD::WIN__DBZCHK: return "ARMISD::WIN__DBZCHK";
+  case ARMISD::PREDICATE_CAST: return "ARMISD::PREDICATE_CAST";
   case ARMISD::VCEQ: return "ARMISD::VCEQ";
   case ARMISD::VCEQZ: return "ARMISD::VCEQZ";
   case ARMISD::VCNE: return "ARMISD::VCNE";
@@ -6714,6 +6722,54 @@ static SDValue IsSingleInstrConstant(SDValue N, SelectionDAG &DAG,
   return SDValue();
 }
 
+static SDValue LowerBUILD_VECTOR_i1(SDValue Op, SelectionDAG &DAG,
+                                    const ARMSubtarget *ST) {
+  SDLoc dl(Op);
+  EVT VT = Op.getValueType();
+
+  assert(ST->hasMVEIntegerOps() && "LowerBUILD_VECTOR_i1 called without MVE!");
+
+  unsigned NumElts = VT.getVectorNumElements();
+  unsigned BoolMask;
+  unsigned BitsPerBool;
+  if (NumElts == 4) {
+    BitsPerBool = 4;
+    BoolMask = 0xf;
+  } else if (NumElts == 8) {
+    BitsPerBool = 2;
+    BoolMask = 0x3;
+  } else if (NumElts == 16) {
+    BitsPerBool = 1;
+    BoolMask = 0x1;
+  } else
+    return SDValue();
+
+  // First create base with bits set where known
+  unsigned Bits32 = 0;
+  for (unsigned i = 0; i < NumElts; ++i) {
+    SDValue V = Op.getOperand(i);
+    if (!isa<ConstantSDNode>(V) && !V.isUndef())
+      continue;
+    bool BitSet = V.isUndef() ? false : cast<ConstantSDNode>(V)->getZExtValue();
+    if (BitSet)
+      Bits32 |= BoolMask << (i * BitsPerBool);
+  }
+
+  // Add in unknown nodes
+  // FIXME: Handle splats of the same value better.
+  SDValue Base = DAG.getNode(ARMISD::PREDICATE_CAST, dl, VT,
+                             DAG.getConstant(Bits32, dl, MVT::i32));
+  for (unsigned i = 0; i < NumElts; ++i) {
+    SDValue V = Op.getOperand(i);
+    if (isa<ConstantSDNode>(V) || V.isUndef())
+      continue;
+    Base = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Base, V,
+                       DAG.getConstant(i, dl, MVT::i32));
+  }
+
+  return Base;
+}
+
 // If this is a case we can't handle, return null and let the default
 // expansion code take care of it.
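As a rough standalone illustration of the constant path in LowerBUILD_VECTOR_i1
above (a sketch only, not part of the patch; the helper name and the use of a
plain std::vector are invented for exposition), the packing of known lanes into
the 16-bit predicate mask amounts to:

#include <cstdint>
#include <vector>

// Illustration only: pack a constant i1 vector into the VPR.P0 bit layout,
// where each v4i1 lane owns 4 bits, each v8i1 lane 2 bits and each v16i1
// lane 1 bit of the 16-bit mask.
static uint32_t packConstantPredicate(const std::vector<bool> &Lanes) {
  unsigned NumElts = Lanes.size();           // 4, 8 or 16
  unsigned BitsPerBool = 16 / NumElts;       // 4, 2 or 1 bits per lane
  uint32_t BoolMask = (1u << BitsPerBool) - 1;
  uint32_t Bits32 = 0;
  for (unsigned I = 0; I < NumElts; ++I)
    if (Lanes[I])                            // set the whole group for true lanes
      Bits32 |= BoolMask << (I * BitsPerBool);
  return Bits32;                             // e.g. v4i1 {1,0,0,1} -> 0xf00f
}

Replicating each true lane across its whole bit group is what lets the same
16-bit value be reinterpreted as v4i1, v8i1 or v16i1, which is presumably why
the predicate_cast patterns later in the patch can be plain register copies.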
SDValue ARMTargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG, @@ -6722,6 +6778,9 @@ SDValue ARMTargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG, SDLoc dl(Op); EVT VT = Op.getValueType(); + if (ST->hasMVEIntegerOps() && VT.getScalarSizeInBits() == 1) + return LowerBUILD_VECTOR_i1(Op, DAG, ST); + APInt SplatBits, SplatUndef; unsigned SplatBitSize; bool HasAnyUndefs; @@ -7327,6 +7386,93 @@ static SDValue LowerReverse_VECTOR_SHUFFLEv16i8_v8i16(SDValue Op, DAG.getConstant(ExtractNum, DL, MVT::i32)); } +static EVT getVectorTyFromPredicateVector(EVT VT) { + switch (VT.getSimpleVT().SimpleTy) { + case MVT::v4i1: + return MVT::v4i32; + case MVT::v8i1: + return MVT::v8i16; + case MVT::v16i1: + return MVT::v16i8; + default: + llvm_unreachable("Unexpected vector predicate type"); + } +} + +static SDValue PromoteMVEPredVector(SDLoc dl, SDValue Pred, EVT VT, + SelectionDAG &DAG) { + // Converting from boolean predicates to integers involves creating a vector + // of all ones or all zeroes and selecting the lanes based upon the real + // predicate. + SDValue AllOnes = + DAG.getTargetConstant(ARM_AM::createVMOVModImm(0xe, 0xff), dl, MVT::i32); + AllOnes = DAG.getNode(ARMISD::VMOVIMM, dl, MVT::v16i8, AllOnes); + + SDValue AllZeroes = + DAG.getTargetConstant(ARM_AM::createVMOVModImm(0xe, 0x0), dl, MVT::i32); + AllZeroes = DAG.getNode(ARMISD::VMOVIMM, dl, MVT::v16i8, AllZeroes); + + // Get full vector type from predicate type + EVT NewVT = getVectorTyFromPredicateVector(VT); + + SDValue RecastV1; + // If the real predicate is an v8i1 or v4i1 (not v16i1) then we need to recast + // this to a v16i1. This cannot be done with an ordinary bitcast because the + // sizes are not the same. We have to use a MVE specific PREDICATE_CAST node, + // since we know in hardware the sizes are really the same. + if (VT != MVT::v16i1) + RecastV1 = DAG.getNode(ARMISD::PREDICATE_CAST, dl, MVT::v16i1, Pred); + else + RecastV1 = Pred; + + // Select either all ones or zeroes depending upon the real predicate bits. + SDValue PredAsVector = + DAG.getNode(ISD::VSELECT, dl, MVT::v16i8, RecastV1, AllOnes, AllZeroes); + + // Recast our new predicate-as-integer v16i8 vector into something + // appropriate for the shuffle, i.e. v4i32 for a real v4i1 predicate. + return DAG.getNode(ISD::BITCAST, dl, NewVT, PredAsVector); +} + +static SDValue LowerVECTOR_SHUFFLE_i1(SDValue Op, SelectionDAG &DAG, + const ARMSubtarget *ST) { + EVT VT = Op.getValueType(); + ShuffleVectorSDNode *SVN = cast(Op.getNode()); + ArrayRef ShuffleMask = SVN->getMask(); + + assert(ST->hasMVEIntegerOps() && + "No support for vector shuffle of boolean predicates"); + + SDValue V1 = Op.getOperand(0); + SDLoc dl(Op); + if (isReverseMask(ShuffleMask, VT)) { + SDValue cast = DAG.getNode(ARMISD::PREDICATE_CAST, dl, MVT::i32, V1); + SDValue rbit = DAG.getNode(ISD::BITREVERSE, dl, MVT::i32, cast); + SDValue srl = DAG.getNode(ISD::SRL, dl, MVT::i32, rbit, + DAG.getConstant(16, dl, MVT::i32)); + return DAG.getNode(ARMISD::PREDICATE_CAST, dl, VT, srl); + } + + // Until we can come up with optimised cases for every single vector + // shuffle in existence we have chosen the least painful strategy. This is + // to essentially promote the boolean predicate to a 8-bit integer, where + // each predicate represents a byte. Then we fall back on a normal integer + // vector shuffle and convert the result back into a predicate vector. In + // many cases the generated code might be even better than scalar code + // operating on bits. 
Just imagine trying to shuffle 8 arbitrary 2-bit + // fields in a register into 8 other arbitrary 2-bit fields! + SDValue PredAsVector = PromoteMVEPredVector(dl, V1, VT, DAG); + EVT NewVT = PredAsVector.getValueType(); + + // Do the shuffle! + SDValue Shuffled = DAG.getVectorShuffle(NewVT, dl, PredAsVector, + DAG.getUNDEF(NewVT), ShuffleMask); + + // Now return the result of comparing the shuffled vector with zero, + // which will generate a real predicate, i.e. v4i1, v8i1 or v16i1. + return DAG.getNode(ARMISD::VCNEZ, dl, VT, Shuffled); +} + static SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG, const ARMSubtarget *ST) { SDValue V1 = Op.getOperand(0); @@ -7334,6 +7480,10 @@ static SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG, SDLoc dl(Op); EVT VT = Op.getValueType(); ShuffleVectorSDNode *SVN = cast(Op.getNode()); + unsigned EltSize = VT.getScalarSizeInBits(); + + if (ST->hasMVEIntegerOps() && EltSize == 1) + return LowerVECTOR_SHUFFLE_i1(Op, DAG, ST); // Convert shuffles that are directly supported on NEON to target-specific // DAG nodes, instead of keeping them as shuffles and matching them again @@ -7343,7 +7493,6 @@ static SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG, // of the same time so that they get CSEd properly. ArrayRef ShuffleMask = SVN->getMask(); - unsigned EltSize = VT.getScalarSizeInBits(); if (EltSize <= 32) { if (SVN->isSplat()) { int Lane = SVN->getSplatIndex(); @@ -7513,8 +7662,29 @@ static SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG, return SDValue(); } -SDValue ARMTargetLowering:: -LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const { +static SDValue LowerINSERT_VECTOR_ELT_i1(SDValue Op, SelectionDAG &DAG, + const ARMSubtarget *ST) { + EVT VecVT = Op.getOperand(0).getValueType(); + SDLoc dl(Op); + + assert(ST->hasMVEIntegerOps() && + "LowerINSERT_VECTOR_ELT_i1 called without MVE!"); + + SDValue Conv = + DAG.getNode(ARMISD::PREDICATE_CAST, dl, MVT::i32, Op->getOperand(0)); + unsigned Lane = cast(Op.getOperand(2))->getZExtValue(); + unsigned LaneWidth = + getVectorTyFromPredicateVector(VecVT).getScalarSizeInBits() / 8; + unsigned Mask = ((1 << LaneWidth) - 1) << Lane * LaneWidth; + SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, MVT::i32, + Op.getOperand(1), DAG.getValueType(MVT::i1)); + SDValue BFI = DAG.getNode(ARMISD::BFI, dl, MVT::i32, Conv, Ext, + DAG.getConstant(~Mask, dl, MVT::i32)); + return DAG.getNode(ARMISD::PREDICATE_CAST, dl, Op.getValueType(), BFI); +} + +SDValue ARMTargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op, + SelectionDAG &DAG) const { // INSERT_VECTOR_ELT is legal only for immediate indexes. 
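  // Worked example (illustration only, not from the patch): for the i1 helper
  // above, inserting an i1 false into lane 3 of a v8i1 operates on the
  // predicate as an i32 mask in which each v8i1 lane owns LaneWidth = 2 bits.
  // The new element is sign-extended in-register to all-zeros or all-ones and
  // BFI'd into bits [7:6], so a mask of 0x00ff (lanes 0-3 true) becomes 0x003f
  // before being PREDICATE_CAST back to v8i1.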
SDValue Lane = Op.getOperand(2); if (!isa(Lane)) @@ -7522,6 +7692,11 @@ LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const { SDValue Elt = Op.getOperand(1); EVT EltVT = Elt.getValueType(); + + if (Subtarget->hasMVEIntegerOps() && + Op.getValueType().getScalarSizeInBits() == 1) + return LowerINSERT_VECTOR_ELT_i1(Op, DAG, Subtarget); + if (getTypeAction(*DAG.getContext(), EltVT) == TargetLowering::TypePromoteFloat) { // INSERT_VECTOR_ELT doesn't want f16 operands promoting to f32, @@ -7550,13 +7725,37 @@ LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const { return Op; } -static SDValue LowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) { +static SDValue LowerEXTRACT_VECTOR_ELT_i1(SDValue Op, SelectionDAG &DAG, + const ARMSubtarget *ST) { + EVT VecVT = Op.getOperand(0).getValueType(); + SDLoc dl(Op); + + assert(ST->hasMVEIntegerOps() && + "LowerINSERT_VECTOR_ELT_i1 called without MVE!"); + + SDValue Conv = + DAG.getNode(ARMISD::PREDICATE_CAST, dl, MVT::i32, Op->getOperand(0)); + unsigned Lane = cast(Op.getOperand(1))->getZExtValue(); + unsigned LaneWidth = + getVectorTyFromPredicateVector(VecVT).getScalarSizeInBits() / 8; + SDValue Shift = DAG.getNode(ISD::SRL, dl, MVT::i32, Conv, + DAG.getConstant(Lane * LaneWidth, dl, MVT::i32)); + return Shift; +} + +static SDValue LowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG, + const ARMSubtarget *ST) { // EXTRACT_VECTOR_ELT is legal only for immediate indexes. SDValue Lane = Op.getOperand(1); if (!isa(Lane)) return SDValue(); SDValue Vec = Op.getOperand(0); + EVT VT = Vec.getValueType(); + + if (ST->hasMVEIntegerOps() && VT.getScalarSizeInBits() == 1) + return LowerEXTRACT_VECTOR_ELT_i1(Op, DAG, ST); + if (Op.getValueType() == MVT::i32 && Vec.getScalarValueSizeInBits() < 32) { SDLoc dl(Op); return DAG.getNode(ARMISD::VGETLANEu, dl, MVT::i32, Vec, Lane); @@ -7565,7 +7764,63 @@ static SDValue LowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) { return Op; } -static SDValue LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) { +static SDValue LowerCONCAT_VECTORS_i1(SDValue Op, SelectionDAG &DAG, + const ARMSubtarget *ST) { + SDValue V1 = Op.getOperand(0); + SDValue V2 = Op.getOperand(1); + SDLoc dl(Op); + EVT VT = Op.getValueType(); + EVT Op1VT = V1.getValueType(); + EVT Op2VT = V2.getValueType(); + unsigned NumElts = VT.getVectorNumElements(); + + assert(Op1VT == Op2VT && "Operand types don't match!"); + assert(VT.getScalarSizeInBits() == 1 && + "Unexpected custom CONCAT_VECTORS lowering"); + assert(ST->hasMVEIntegerOps() && + "CONCAT_VECTORS lowering only supported for MVE"); + + SDValue NewV1 = PromoteMVEPredVector(dl, V1, Op1VT, DAG); + SDValue NewV2 = PromoteMVEPredVector(dl, V2, Op2VT, DAG); + + // We now have Op1 + Op2 promoted to vectors of integers, where v8i1 gets + // promoted to v8i16, etc. + + MVT ElType = getVectorTyFromPredicateVector(VT).getScalarType().getSimpleVT(); + + // Extract the vector elements from Op1 and Op2 one by one and truncate them + // to be the right size for the destination. For example, if Op1 is v4i1 then + // the promoted vector is v4i32. The result of concatentation gives a v8i1, + // which when promoted is v8i16. That means each i32 element from Op1 needs + // truncating to i16 and inserting in the result. 
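  // Worked example (illustration only, not from the patch): concatenating the
  // v4i1 predicates <1,0,1,1> and <0,0,1,0> promotes each to a v4i32 whose
  // lanes are 0 or all-ones, copies those eight elements into a v8i16 (each
  // i32 implicitly truncating to i16), and the final compare against zero
  // rebuilds the v8i1 predicate <1,0,1,1,0,0,1,0>.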
+ EVT ConcatVT = MVT::getVectorVT(ElType, NumElts); + SDValue ConVec = DAG.getNode(ISD::UNDEF, dl, ConcatVT); + auto ExractInto = [&DAG, &dl](SDValue NewV, SDValue ConVec, unsigned &j) { + EVT NewVT = NewV.getValueType(); + EVT ConcatVT = ConVec.getValueType(); + for (unsigned i = 0, e = NewVT.getVectorNumElements(); i < e; i++, j++) { + SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, NewV, + DAG.getIntPtrConstant(i, dl)); + ConVec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, ConcatVT, ConVec, Elt, + DAG.getConstant(j, dl, MVT::i32)); + } + return ConVec; + }; + unsigned j = 0; + ConVec = ExractInto(NewV1, ConVec, j); + ConVec = ExractInto(NewV2, ConVec, j); + + // Now return the result of comparing the subvector with zero, + // which will generate a real predicate, i.e. v4i1, v8i1 or v16i1. + return DAG.getNode(ARMISD::VCNEZ, dl, VT, ConVec); +} + +static SDValue LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG, + const ARMSubtarget *ST) { + EVT VT = Op->getValueType(0); + if (ST->hasMVEIntegerOps() && VT.getScalarSizeInBits() == 1) + return LowerCONCAT_VECTORS_i1(Op, DAG, ST); + // The only time a CONCAT_VECTORS operation can have legal types is when // two 64-bit vectors are concatenated to a 128-bit vector. assert(Op.getValueType().is128BitVector() && Op.getNumOperands() == 2 && @@ -7585,6 +7840,42 @@ static SDValue LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) { return DAG.getNode(ISD::BITCAST, dl, Op.getValueType(), Val); } +static SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, SelectionDAG &DAG, + const ARMSubtarget *ST) { + SDValue V1 = Op.getOperand(0); + SDValue V2 = Op.getOperand(1); + SDLoc dl(Op); + EVT VT = Op.getValueType(); + EVT Op1VT = V1.getValueType(); + unsigned NumElts = VT.getVectorNumElements(); + unsigned Index = cast(V2)->getZExtValue(); + + assert(VT.getScalarSizeInBits() == 1 && + "Unexpected custom EXTRACT_SUBVECTOR lowering"); + assert(ST->hasMVEIntegerOps() && + "EXTRACT_SUBVECTOR lowering only supported for MVE"); + + SDValue NewV1 = PromoteMVEPredVector(dl, V1, Op1VT, DAG); + + // We now have Op1 promoted to a vector of integers, where v8i1 gets + // promoted to v8i16, etc. + + MVT ElType = getVectorTyFromPredicateVector(VT).getScalarType().getSimpleVT(); + + EVT SubVT = MVT::getVectorVT(ElType, NumElts); + SDValue SubVec = DAG.getNode(ISD::UNDEF, dl, SubVT); + for (unsigned i = Index, j = 0; i < (Index + NumElts); i++, j++) { + SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, NewV1, + DAG.getIntPtrConstant(i, dl)); + SubVec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, SubVT, SubVec, Elt, + DAG.getConstant(j, dl, MVT::i32)); + } + + // Now return the result of comparing the subvector with zero, + // which will generate a real predicate, i.e. v4i1, v8i1 or v16i1. + return DAG.getNode(ARMISD::VCNEZ, dl, VT, SubVec); +} + /// isExtendedBUILD_VECTOR - Check if N is a constant BUILD_VECTOR where each /// element has been zero/sign-extended, depending on the isSigned parameter, /// from an integer type half its size. 
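The element accesses above all go through the same scalar view of the
predicate. A minimal model of what LowerEXTRACT_VECTOR_ELT_i1 computes after
the PREDICATE_CAST to i32 (illustration only; the helper name is invented here
and is not part of the patch):

#include <cstdint>

// Read lane `Lane` out of an i1 vector with `NumElts` elements, given the
// predicate viewed as a 16-bit mask: shift the lane's bit group down (the
// lowering emits an SRL by Lane * LaneWidth) and test a bit of it.
static bool extractPredicateLane(uint32_t Mask, unsigned NumElts, unsigned Lane) {
  unsigned LaneWidth = 16 / NumElts;         // 4, 2 or 1 bits per lane
  return (Mask >> (Lane * LaneWidth)) & 1u;
}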
@@ -7942,7 +8233,8 @@ static SDValue LowerSDIV_v4i16(SDValue N0, SDValue N1, const SDLoc &dl, return N0; } -static SDValue LowerSDIV(SDValue Op, SelectionDAG &DAG) { +static SDValue LowerSDIV(SDValue Op, SelectionDAG &DAG, + const ARMSubtarget *ST) { EVT VT = Op.getValueType(); assert((VT == MVT::v4i16 || VT == MVT::v8i8) && "unexpected type for custom-lowering ISD::SDIV"); @@ -7969,7 +8261,7 @@ static SDValue LowerSDIV(SDValue Op, SelectionDAG &DAG) { N2 = LowerSDIV_v4i8(N2, N3, dl, DAG); // v4i16 N0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i16, N0, N2); - N0 = LowerCONCAT_VECTORS(N0, DAG); + N0 = LowerCONCAT_VECTORS(N0, DAG, ST); N0 = DAG.getNode(ISD::TRUNCATE, dl, MVT::v8i8, N0); return N0; @@ -7977,7 +8269,8 @@ static SDValue LowerSDIV(SDValue Op, SelectionDAG &DAG) { return LowerSDIV_v4i16(N0, N1, dl, DAG); } -static SDValue LowerUDIV(SDValue Op, SelectionDAG &DAG) { +static SDValue LowerUDIV(SDValue Op, SelectionDAG &DAG, + const ARMSubtarget *ST) { // TODO: Should this propagate fast-math-flags? EVT VT = Op.getValueType(); assert((VT == MVT::v4i16 || VT == MVT::v8i8) && @@ -8005,7 +8298,7 @@ static SDValue LowerUDIV(SDValue Op, SelectionDAG &DAG) { N2 = LowerSDIV_v4i16(N2, N3, dl, DAG); // v4i16 N0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i16, N0, N2); - N0 = LowerCONCAT_VECTORS(N0, DAG); + N0 = LowerCONCAT_VECTORS(N0, DAG, ST); N0 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v8i8, DAG.getConstant(Intrinsic::arm_neon_vqmovnsu, dl, @@ -8476,19 +8769,20 @@ SDValue ARMTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { case ISD::ConstantFP: return LowerConstantFP(Op, DAG, Subtarget); case ISD::BUILD_VECTOR: return LowerBUILD_VECTOR(Op, DAG, Subtarget); case ISD::VECTOR_SHUFFLE: return LowerVECTOR_SHUFFLE(Op, DAG, Subtarget); + case ISD::EXTRACT_SUBVECTOR: return LowerEXTRACT_SUBVECTOR(Op, DAG, Subtarget); case ISD::INSERT_VECTOR_ELT: return LowerINSERT_VECTOR_ELT(Op, DAG); - case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG); - case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, DAG); + case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG, Subtarget); + case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, DAG, Subtarget); case ISD::FLT_ROUNDS_: return LowerFLT_ROUNDS_(Op, DAG); case ISD::MUL: return LowerMUL(Op, DAG); case ISD::SDIV: if (Subtarget->isTargetWindows() && !Op.getValueType().isVector()) return LowerDIV_Windows(Op, DAG, /* Signed */ true); - return LowerSDIV(Op, DAG); + return LowerSDIV(Op, DAG, Subtarget); case ISD::UDIV: if (Subtarget->isTargetWindows() && !Op.getValueType().isVector()) return LowerDIV_Windows(Op, DAG, /* Signed */ false); - return LowerUDIV(Op, DAG); + return LowerUDIV(Op, DAG, Subtarget); case ISD::ADDCARRY: case ISD::SUBCARRY: return LowerADDSUBCARRY(Op, DAG); case ISD::SADDO: diff --git a/llvm/lib/Target/ARM/ARMISelLowering.h b/llvm/lib/Target/ARM/ARMISelLowering.h index 983387c..0015f41 100644 --- a/llvm/lib/Target/ARM/ARMISelLowering.h +++ b/llvm/lib/Target/ARM/ARMISelLowering.h @@ -129,6 +129,8 @@ class VectorType; LOOP_DEC, // Really a part of LE, performs the sub LE, // Low-overhead loops, Loop End + PREDICATE_CAST, // Predicate cast for MVE i1 types + VCEQ, // Vector compare equal. VCEQZ, // Vector compare equal to zero. 
VCNE, // Vector compare not equal (MVE) diff --git a/llvm/lib/Target/ARM/ARMInstrMVE.td b/llvm/lib/Target/ARM/ARMInstrMVE.td index e9af0ed..8675cdb 100644 --- a/llvm/lib/Target/ARM/ARMInstrMVE.td +++ b/llvm/lib/Target/ARM/ARMInstrMVE.td @@ -3016,6 +3016,26 @@ let Predicates = [HasMVEInt] in { defm MVE_VCGEU : unpred_vcmp_r; } +// Occasionally we need to cast between a i32 and a boolean vector, for +// example when moving between rGPR and VPR.P0 as part of predicate vector +// shuffles. We also sometimes need to cast between different predicate +// vector types (v4i1<>v8i1, etc.) also as part of lowering vector shuffles. + +def predicate_cast : SDNode<"ARMISD::PREDICATE_CAST", SDTUnaryOp>; + +let Predicates = [HasMVEInt] in { + foreach VT = [ v4i1, v8i1, v16i1 ] in { + def : Pat<(i32 (predicate_cast (VT VCCR:$src))), + (i32 (COPY_TO_REGCLASS (VT VCCR:$src), VCCR))>; + def : Pat<(VT (predicate_cast (i32 VCCR:$src))), + (VT (COPY_TO_REGCLASS (i32 VCCR:$src), VCCR))>; + + foreach VT2 = [ v4i1, v8i1, v16i1 ] in + def : Pat<(VT (predicate_cast (VT2 VCCR:$src))), + (VT (COPY_TO_REGCLASS (VT2 VCCR:$src), VCCR))>; + } +} + // end of MVE compares // start of MVE_qDest_qSrc @@ -4410,6 +4430,37 @@ let Predicates = [HasMVEInt] in { (v8i16 (MVE_VPSEL MQPR:$v1, MQPR:$v2, 0, VCCR:$pred))>; def : Pat<(v4i32 (vselect (v4i1 VCCR:$pred), (v4i32 MQPR:$v1), (v4i32 MQPR:$v2))), (v4i32 (MVE_VPSEL MQPR:$v1, MQPR:$v2, 0, VCCR:$pred))>; + + def : Pat<(v16i8 (vselect (v16i8 MQPR:$pred), (v16i8 MQPR:$v1), (v16i8 MQPR:$v2))), + (v16i8 (MVE_VPSEL MQPR:$v1, MQPR:$v2, 0, + (MVE_VCMPi8 (v16i8 MQPR:$pred), (MVE_VMOVimmi8 0), 1)))>; + def : Pat<(v8i16 (vselect (v8i16 MQPR:$pred), (v8i16 MQPR:$v1), (v8i16 MQPR:$v2))), + (v8i16 (MVE_VPSEL MQPR:$v1, MQPR:$v2, 0, + (MVE_VCMPi16 (v8i16 MQPR:$pred), (MVE_VMOVimmi16 0), 1)))>; + def : Pat<(v4i32 (vselect (v4i32 MQPR:$pred), (v4i32 MQPR:$v1), (v4i32 MQPR:$v2))), + (v4i32 (MVE_VPSEL MQPR:$v1, MQPR:$v2, 0, + (MVE_VCMPi32 (v4i32 MQPR:$pred), (MVE_VMOVimmi32 0), 1)))>; + + def : Pat<(v16i8 (zext (v16i1 VCCR:$pred))), + (v16i8 (MVE_VPSEL (MVE_VMOVimmi8 1), (MVE_VMOVimmi8 0), 0, VCCR:$pred))>; + def : Pat<(v8i16 (zext (v8i1 VCCR:$pred))), + (v8i16 (MVE_VPSEL (MVE_VMOVimmi16 1), (MVE_VMOVimmi16 0), 0, VCCR:$pred))>; + def : Pat<(v4i32 (zext (v4i1 VCCR:$pred))), + (v4i32 (MVE_VPSEL (MVE_VMOVimmi32 1), (MVE_VMOVimmi32 0), 0, VCCR:$pred))>; + + def : Pat<(v16i8 (sext (v16i1 VCCR:$pred))), + (v16i8 (MVE_VPSEL (MVE_VMOVimmi8 255), (MVE_VMOVimmi8 0), 0, VCCR:$pred))>; + def : Pat<(v8i16 (sext (v8i1 VCCR:$pred))), + (v8i16 (MVE_VPSEL (MVE_VMOVimmi8 255), (MVE_VMOVimmi16 0), 0, VCCR:$pred))>; + def : Pat<(v4i32 (sext (v4i1 VCCR:$pred))), + (v4i32 (MVE_VPSEL (MVE_VMOVimmi8 255), (MVE_VMOVimmi32 0), 0, VCCR:$pred))>; + + def : Pat<(v16i8 (anyext (v16i1 VCCR:$pred))), + (v16i8 (MVE_VPSEL (MVE_VMOVimmi8 1), (MVE_VMOVimmi8 0), 0, VCCR:$pred))>; + def : Pat<(v8i16 (anyext (v8i1 VCCR:$pred))), + (v8i16 (MVE_VPSEL (MVE_VMOVimmi16 1), (MVE_VMOVimmi16 0), 0, VCCR:$pred))>; + def : Pat<(v4i32 (anyext (v4i1 VCCR:$pred))), + (v4i32 (MVE_VPSEL (MVE_VMOVimmi32 1), (MVE_VMOVimmi32 0), 0, VCCR:$pred))>; } def MVE_VPNOT : MVE_p<(outs), (ins), NoItinerary, diff --git a/llvm/test/CodeGen/Thumb2/mve-pred-build-const.ll b/llvm/test/CodeGen/Thumb2/mve-pred-build-const.ll new file mode 100644 index 0000000..959b162 --- /dev/null +++ b/llvm/test/CodeGen/Thumb2/mve-pred-build-const.ll @@ -0,0 +1,196 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc 
-mtriple=thumbv8.1m.main-arm-none-eabi -mattr=+mve -verify-machineinstrs %s -o - | FileCheck %s + + +define arm_aapcs_vfpcc <4 x i32> @build_true_v4i1(<4 x i32> %a, <4 x i32> %b) { +; CHECK-LABEL: build_true_v4i1: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: bx lr +entry: + %s = select <4 x i1> , <4 x i32> %a, <4 x i32> %b + ret <4 x i32> %s +} + +define arm_aapcs_vfpcc <4 x i32> @build_false_v4i1(<4 x i32> %a, <4 x i32> %b) { +; CHECK-LABEL: build_false_v4i1: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmov q0, q1 +; CHECK-NEXT: bx lr +entry: + %s = select <4 x i1> , <4 x i32> %a, <4 x i32> %b + ret <4 x i32> %s +} + +define arm_aapcs_vfpcc <4 x i32> @build_upper_v4i1(<4 x i32> %a, <4 x i32> %b) { +; CHECK-LABEL: build_upper_v4i1: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: mov.w r0, #65280 +; CHECK-NEXT: vmsr p0, r0 +; CHECK-NEXT: vpsel q0, q0, q1 +; CHECK-NEXT: bx lr +entry: + %s = select <4 x i1> , <4 x i32> %a, <4 x i32> %b + ret <4 x i32> %s +} + +define arm_aapcs_vfpcc <4 x i32> @build_lower_v4i1(<4 x i32> %a, <4 x i32> %b) { +; CHECK-LABEL: build_lower_v4i1: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: movs r0, #255 +; CHECK-NEXT: vmsr p0, r0 +; CHECK-NEXT: vpsel q0, q0, q1 +; CHECK-NEXT: bx lr +entry: + %s = select <4 x i1> , <4 x i32> %a, <4 x i32> %b + ret <4 x i32> %s +} + + +define arm_aapcs_vfpcc <8 x i16> @build_true_v8i1(<8 x i16> %a, <8 x i16> %b) { +; CHECK-LABEL: build_true_v8i1: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: bx lr +entry: + %s = select <8 x i1> , <8 x i16> %a, <8 x i16> %b + ret <8 x i16> %s +} + +define arm_aapcs_vfpcc <8 x i16> @build_false_v8i1(<8 x i16> %a, <8 x i16> %b) { +; CHECK-LABEL: build_false_v8i1: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmov q0, q1 +; CHECK-NEXT: bx lr +entry: + %s = select <8 x i1> , <8 x i16> %a, <8 x i16> %b + ret <8 x i16> %s +} + +define arm_aapcs_vfpcc <8 x i16> @build_upper_v8i1(<8 x i16> %a, <8 x i16> %b) { +; CHECK-LABEL: build_upper_v8i1: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: mov.w r0, #65280 +; CHECK-NEXT: vmsr p0, r0 +; CHECK-NEXT: vpsel q0, q0, q1 +; CHECK-NEXT: bx lr +entry: + %s = select <8 x i1> , <8 x i16> %a, <8 x i16> %b + ret <8 x i16> %s +} + +define arm_aapcs_vfpcc <8 x i16> @build_lower_v8i1(<8 x i16> %a, <8 x i16> %b) { +; CHECK-LABEL: build_lower_v8i1: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: movs r0, #255 +; CHECK-NEXT: vmsr p0, r0 +; CHECK-NEXT: vpsel q0, q0, q1 +; CHECK-NEXT: bx lr +entry: + %s = select <8 x i1> , <8 x i16> %a, <8 x i16> %b + ret <8 x i16> %s +} + + +define arm_aapcs_vfpcc <16 x i8> @build_true_v16i1(<16 x i8> %a, <16 x i8> %b) { +; CHECK-LABEL: build_true_v16i1: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: bx lr +entry: + %s = select <16 x i1> , <16 x i8> %a, <16 x i8> %b + ret <16 x i8> %s +} + +define arm_aapcs_vfpcc <16 x i8> @build_false_v16i1(<16 x i8> %a, <16 x i8> %b) { +; CHECK-LABEL: build_false_v16i1: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmov q0, q1 +; CHECK-NEXT: bx lr +entry: + %s = select <16 x i1> , <16 x i8> %a, <16 x i8> %b + ret <16 x i8> %s +} + +define arm_aapcs_vfpcc <16 x i8> @build_upper_v16i1(<16 x i8> %a, <16 x i8> %b) { +; CHECK-LABEL: build_upper_v16i1: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: mov.w r0, #65280 +; CHECK-NEXT: vmsr p0, r0 +; CHECK-NEXT: vpsel q0, q0, q1 +; CHECK-NEXT: bx lr +entry: + %s = select <16 x i1> , <16 x i8> %a, <16 x i8> %b + ret <16 x i8> %s +} + +define arm_aapcs_vfpcc <16 x i8> @build_lower_v16i1(<16 x i8> %a, <16 x i8> %b) { +; CHECK-LABEL: build_lower_v16i1: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: movs r0, #255 +; 
CHECK-NEXT: vmsr p0, r0 +; CHECK-NEXT: vpsel q0, q0, q1 +; CHECK-NEXT: bx lr +entry: + %s = select <16 x i1> , <16 x i8> %a, <16 x i8> %b + ret <16 x i8> %s +} + + +define arm_aapcs_vfpcc <2 x i64> @build_true_v2i1(<2 x i64> %a, <2 x i64> %b) { +; CHECK-LABEL: build_true_v2i1: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: bx lr +entry: + %s = select <2 x i1> , <2 x i64> %a, <2 x i64> %b + ret <2 x i64> %s +} + +define arm_aapcs_vfpcc <2 x i64> @build_false_v2i1(<2 x i64> %a, <2 x i64> %b) { +; CHECK-LABEL: build_false_v2i1: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmov q0, q1 +; CHECK-NEXT: bx lr +entry: + %s = select <2 x i1> , <2 x i64> %a, <2 x i64> %b + ret <2 x i64> %s +} + +define arm_aapcs_vfpcc <2 x i64> @build_upper_v2i1(<2 x i64> %a, <2 x i64> %b) { +; CHECK-LABEL: build_upper_v2i1: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: adr r0, .LCPI14_0 +; CHECK-NEXT: vldrw.u32 q2, [r0] +; CHECK-NEXT: vbic q1, q1, q2 +; CHECK-NEXT: vand q0, q0, q2 +; CHECK-NEXT: vorr q0, q0, q1 +; CHECK-NEXT: bx lr +; CHECK-NEXT: .p2align 4 +; CHECK-NEXT: @ %bb.1: +; CHECK-NEXT: .LCPI14_0: +; CHECK-NEXT: .long 0 @ 0x0 +; CHECK-NEXT: .long 0 @ 0x0 +; CHECK-NEXT: .long 4294967295 @ 0xffffffff +; CHECK-NEXT: .long 4294967295 @ 0xffffffff +entry: + %s = select <2 x i1> , <2 x i64> %a, <2 x i64> %b + ret <2 x i64> %s +} + +define arm_aapcs_vfpcc <2 x i64> @build_lower_v2i1(<2 x i64> %a, <2 x i64> %b) { +; CHECK-LABEL: build_lower_v2i1: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: adr r0, .LCPI15_0 +; CHECK-NEXT: vldrw.u32 q2, [r0] +; CHECK-NEXT: vbic q1, q1, q2 +; CHECK-NEXT: vand q0, q0, q2 +; CHECK-NEXT: vorr q0, q0, q1 +; CHECK-NEXT: bx lr +; CHECK-NEXT: .p2align 4 +; CHECK-NEXT: @ %bb.1: +; CHECK-NEXT: .LCPI15_0: +; CHECK-NEXT: .long 4294967295 @ 0xffffffff +; CHECK-NEXT: .long 4294967295 @ 0xffffffff +; CHECK-NEXT: .long 0 @ 0x0 +; CHECK-NEXT: .long 0 @ 0x0 +entry: + %s = select <2 x i1> , <2 x i64> %a, <2 x i64> %b + ret <2 x i64> %s +} diff --git a/llvm/test/CodeGen/Thumb2/mve-pred-build-var.ll b/llvm/test/CodeGen/Thumb2/mve-pred-build-var.ll new file mode 100644 index 0000000..b8b518c --- /dev/null +++ b/llvm/test/CodeGen/Thumb2/mve-pred-build-var.ll @@ -0,0 +1,289 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=thumbv8.1m.main-arm-none-eabi -mattr=+mve -verify-machineinstrs %s -o - | FileCheck %s + + +define arm_aapcs_vfpcc <4 x i32> @build_var0_v4i1(i32 %s, i32 %t, <4 x i32> %a, <4 x i32> %b) { +; CHECK-LABEL: build_var0_v4i1: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: cmp r0, r1 +; CHECK-NEXT: mov.w r0, #0 +; CHECK-NEXT: it lo +; CHECK-NEXT: movlo r0, #1 +; CHECK-NEXT: movs r2, #0 +; CHECK-NEXT: rsbs r0, r0, #0 +; CHECK-NEXT: bfi r2, r0, #0, #4 +; CHECK-NEXT: vmsr p0, r2 +; CHECK-NEXT: vpsel q0, q0, q1 +; CHECK-NEXT: bx lr +entry: + %c = icmp ult i32 %s, %t + %vc = insertelement <4 x i1> zeroinitializer, i1 %c, i64 0 + %r = select <4 x i1> %vc, <4 x i32> %a, <4 x i32> %b + ret <4 x i32> %r +} + +define arm_aapcs_vfpcc <4 x i32> @build_var3_v4i1(i32 %s, i32 %t, <4 x i32> %a, <4 x i32> %b) { +; CHECK-LABEL: build_var3_v4i1: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: cmp r0, r1 +; CHECK-NEXT: mov.w r0, #0 +; CHECK-NEXT: it lo +; CHECK-NEXT: movlo r0, #1 +; CHECK-NEXT: movs r2, #0 +; CHECK-NEXT: rsbs r0, r0, #0 +; CHECK-NEXT: bfi r2, r0, #12, #4 +; CHECK-NEXT: vmsr p0, r2 +; CHECK-NEXT: vpsel q0, q0, q1 +; CHECK-NEXT: bx lr +entry: + %c = icmp ult i32 %s, %t + %vc = insertelement <4 x i1> zeroinitializer, i1 %c, i64 3 + %r = select <4 x i1> %vc, <4 x 
i32> %a, <4 x i32> %b + ret <4 x i32> %r +} + +define arm_aapcs_vfpcc <4 x i32> @build_varN_v4i1(i32 %s, i32 %t, <4 x i32> %a, <4 x i32> %b) { +; CHECK-LABEL: build_varN_v4i1: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: cmp r0, r1 +; CHECK-NEXT: mov.w r0, #0 +; CHECK-NEXT: it lo +; CHECK-NEXT: movlo r0, #1 +; CHECK-NEXT: movs r2, #0 +; CHECK-NEXT: rsbs r0, r0, #0 +; CHECK-NEXT: bfi r2, r0, #0, #4 +; CHECK-NEXT: bfi r2, r0, #4, #4 +; CHECK-NEXT: bfi r2, r0, #8, #4 +; CHECK-NEXT: bfi r2, r0, #12, #4 +; CHECK-NEXT: vmsr p0, r2 +; CHECK-NEXT: vpsel q0, q0, q1 +; CHECK-NEXT: bx lr +entry: + %c = icmp ult i32 %s, %t + %vc1 = insertelement <4 x i1> undef, i1 %c, i64 0 + %vc4 = shufflevector <4 x i1> %vc1, <4 x i1> undef, <4 x i32> zeroinitializer + %r = select <4 x i1> %vc4, <4 x i32> %a, <4 x i32> %b + ret <4 x i32> %r +} + + +define arm_aapcs_vfpcc <8 x i16> @build_var0_v8i1(i32 %s, i32 %t, <8 x i16> %a, <8 x i16> %b) { +; CHECK-LABEL: build_var0_v8i1: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: cmp r0, r1 +; CHECK-NEXT: mov.w r0, #0 +; CHECK-NEXT: it lo +; CHECK-NEXT: movlo r0, #1 +; CHECK-NEXT: movs r2, #0 +; CHECK-NEXT: rsbs r0, r0, #0 +; CHECK-NEXT: bfi r2, r0, #0, #2 +; CHECK-NEXT: vmsr p0, r2 +; CHECK-NEXT: vpsel q0, q0, q1 +; CHECK-NEXT: bx lr +entry: + %c = icmp ult i32 %s, %t + %vc = insertelement <8 x i1> zeroinitializer, i1 %c, i64 0 + %r = select <8 x i1> %vc, <8 x i16> %a, <8 x i16> %b + ret <8 x i16> %r +} + +define arm_aapcs_vfpcc <8 x i16> @build_var3_v8i1(i32 %s, i32 %t, <8 x i16> %a, <8 x i16> %b) { +; CHECK-LABEL: build_var3_v8i1: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: cmp r0, r1 +; CHECK-NEXT: mov.w r0, #0 +; CHECK-NEXT: it lo +; CHECK-NEXT: movlo r0, #1 +; CHECK-NEXT: movs r2, #0 +; CHECK-NEXT: rsbs r0, r0, #0 +; CHECK-NEXT: bfi r2, r0, #6, #2 +; CHECK-NEXT: vmsr p0, r2 +; CHECK-NEXT: vpsel q0, q0, q1 +; CHECK-NEXT: bx lr +entry: + %c = icmp ult i32 %s, %t + %vc = insertelement <8 x i1> zeroinitializer, i1 %c, i64 3 + %r = select <8 x i1> %vc, <8 x i16> %a, <8 x i16> %b + ret <8 x i16> %r +} + +define arm_aapcs_vfpcc <8 x i16> @build_varN_v8i1(i32 %s, i32 %t, <8 x i16> %a, <8 x i16> %b) { +; CHECK-LABEL: build_varN_v8i1: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: cmp r0, r1 +; CHECK-NEXT: mov.w r0, #0 +; CHECK-NEXT: it lo +; CHECK-NEXT: movlo r0, #1 +; CHECK-NEXT: movs r2, #0 +; CHECK-NEXT: rsbs r0, r0, #0 +; CHECK-NEXT: bfi r2, r0, #0, #2 +; CHECK-NEXT: bfi r2, r0, #2, #2 +; CHECK-NEXT: bfi r2, r0, #4, #2 +; CHECK-NEXT: bfi r2, r0, #6, #2 +; CHECK-NEXT: bfi r2, r0, #8, #2 +; CHECK-NEXT: bfi r2, r0, #10, #2 +; CHECK-NEXT: bfi r2, r0, #12, #2 +; CHECK-NEXT: bfi r2, r0, #14, #2 +; CHECK-NEXT: vmsr p0, r2 +; CHECK-NEXT: vpsel q0, q0, q1 +; CHECK-NEXT: bx lr +entry: + %c = icmp ult i32 %s, %t + %vc1 = insertelement <8 x i1> undef, i1 %c, i64 0 + %vc4 = shufflevector <8 x i1> %vc1, <8 x i1> undef, <8 x i32> zeroinitializer + %r = select <8 x i1> %vc4, <8 x i16> %a, <8 x i16> %b + ret <8 x i16> %r +} + + +define arm_aapcs_vfpcc <16 x i8> @build_var0_v16i1(i32 %s, i32 %t, <16 x i8> %a, <16 x i8> %b) { +; CHECK-LABEL: build_var0_v16i1: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: cmp r0, r1 +; CHECK-NEXT: mov.w r0, #0 +; CHECK-NEXT: it lo +; CHECK-NEXT: movlo r0, #1 +; CHECK-NEXT: movs r2, #0 +; CHECK-NEXT: rsbs r0, r0, #0 +; CHECK-NEXT: bfi r2, r0, #0, #1 +; CHECK-NEXT: vmsr p0, r2 +; CHECK-NEXT: vpsel q0, q0, q1 +; CHECK-NEXT: bx lr +entry: + %c = icmp ult i32 %s, %t + %vc = insertelement <16 x i1> zeroinitializer, i1 %c, i64 0 + %r = select <16 x i1> %vc, <16 x i8> %a, <16 x i8> 
%b + ret <16 x i8> %r +} + +define arm_aapcs_vfpcc <16 x i8> @build_var3_v16i1(i32 %s, i32 %t, <16 x i8> %a, <16 x i8> %b) { +; CHECK-LABEL: build_var3_v16i1: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: cmp r0, r1 +; CHECK-NEXT: mov.w r0, #0 +; CHECK-NEXT: it lo +; CHECK-NEXT: movlo r0, #1 +; CHECK-NEXT: movs r2, #0 +; CHECK-NEXT: rsbs r0, r0, #0 +; CHECK-NEXT: bfi r2, r0, #3, #1 +; CHECK-NEXT: vmsr p0, r2 +; CHECK-NEXT: vpsel q0, q0, q1 +; CHECK-NEXT: bx lr +entry: + %c = icmp ult i32 %s, %t + %vc = insertelement <16 x i1> zeroinitializer, i1 %c, i64 3 + %r = select <16 x i1> %vc, <16 x i8> %a, <16 x i8> %b + ret <16 x i8> %r +} + +define arm_aapcs_vfpcc <16 x i8> @build_varN_v16i1(i32 %s, i32 %t, <16 x i8> %a, <16 x i8> %b) { +; CHECK-LABEL: build_varN_v16i1: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: cmp r0, r1 +; CHECK-NEXT: mov.w r0, #0 +; CHECK-NEXT: it lo +; CHECK-NEXT: movlo r0, #1 +; CHECK-NEXT: movs r2, #0 +; CHECK-NEXT: rsbs r0, r0, #0 +; CHECK-NEXT: bfi r2, r0, #0, #1 +; CHECK-NEXT: bfi r2, r0, #1, #1 +; CHECK-NEXT: bfi r2, r0, #2, #1 +; CHECK-NEXT: bfi r2, r0, #3, #1 +; CHECK-NEXT: bfi r2, r0, #4, #1 +; CHECK-NEXT: bfi r2, r0, #5, #1 +; CHECK-NEXT: bfi r2, r0, #6, #1 +; CHECK-NEXT: bfi r2, r0, #7, #1 +; CHECK-NEXT: bfi r2, r0, #8, #1 +; CHECK-NEXT: bfi r2, r0, #9, #1 +; CHECK-NEXT: bfi r2, r0, #10, #1 +; CHECK-NEXT: bfi r2, r0, #11, #1 +; CHECK-NEXT: bfi r2, r0, #12, #1 +; CHECK-NEXT: bfi r2, r0, #13, #1 +; CHECK-NEXT: bfi r2, r0, #14, #1 +; CHECK-NEXT: bfi r2, r0, #15, #1 +; CHECK-NEXT: vmsr p0, r2 +; CHECK-NEXT: vpsel q0, q0, q1 +; CHECK-NEXT: bx lr +entry: + %c = icmp ult i32 %s, %t + %vc1 = insertelement <16 x i1> undef, i1 %c, i64 0 + %vc4 = shufflevector <16 x i1> %vc1, <16 x i1> undef, <16 x i32> zeroinitializer + %r = select <16 x i1> %vc4, <16 x i8> %a, <16 x i8> %b + ret <16 x i8> %r +} + + +define arm_aapcs_vfpcc <2 x i64> @build_var0_v2i1(i32 %s, i32 %t, <2 x i64> %a, <2 x i64> %b) { +; CHECK-LABEL: build_var0_v2i1: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: movs r2, #0 +; CHECK-NEXT: cmp r0, r1 +; CHECK-NEXT: it lo +; CHECK-NEXT: movlo r2, #1 +; CHECK-NEXT: rsbs r0, r2, #0 +; CHECK-NEXT: vmov s8, r0 +; CHECK-NEXT: vldr s10, .LCPI9_0 +; CHECK-NEXT: vmov.f32 s9, s8 +; CHECK-NEXT: vmov.f32 s11, s10 +; CHECK-NEXT: vbic q1, q1, q2 +; CHECK-NEXT: vand q0, q0, q2 +; CHECK-NEXT: vorr q0, q0, q1 +; CHECK-NEXT: bx lr +; CHECK-NEXT: .p2align 2 +; CHECK-NEXT: @ %bb.1: +; CHECK-NEXT: .LCPI9_0: +; CHECK-NEXT: .long 0 @ float 0 +entry: + %c = icmp ult i32 %s, %t + %vc = insertelement <2 x i1> zeroinitializer, i1 %c, i64 0 + %r = select <2 x i1> %vc, <2 x i64> %a, <2 x i64> %b + ret <2 x i64> %r +} + +define arm_aapcs_vfpcc <2 x i64> @build_var1_v2i1(i32 %s, i32 %t, <2 x i64> %a, <2 x i64> %b) { +; CHECK-LABEL: build_var1_v2i1: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: movs r2, #0 +; CHECK-NEXT: cmp r0, r1 +; CHECK-NEXT: it lo +; CHECK-NEXT: movlo r2, #1 +; CHECK-NEXT: rsbs r0, r2, #0 +; CHECK-NEXT: vmov s10, r0 +; CHECK-NEXT: vldr s8, .LCPI10_0 +; CHECK-NEXT: vmov.f32 s9, s8 +; CHECK-NEXT: vmov.f32 s11, s10 +; CHECK-NEXT: vbic q1, q1, q2 +; CHECK-NEXT: vand q0, q0, q2 +; CHECK-NEXT: vorr q0, q0, q1 +; CHECK-NEXT: bx lr +; CHECK-NEXT: .p2align 2 +; CHECK-NEXT: @ %bb.1: +; CHECK-NEXT: .LCPI10_0: +; CHECK-NEXT: .long 0 @ float 0 +entry: + %c = icmp ult i32 %s, %t + %vc = insertelement <2 x i1> zeroinitializer, i1 %c, i64 1 + %r = select <2 x i1> %vc, <2 x i64> %a, <2 x i64> %b + ret <2 x i64> %r +} + +define arm_aapcs_vfpcc <2 x i64> @build_varN_v2i1(i32 %s, i32 %t, <2 x i64> %a, 
<2 x i64> %b) { +; CHECK-LABEL: build_varN_v2i1: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: movs r2, #0 +; CHECK-NEXT: cmp r0, r1 +; CHECK-NEXT: it lo +; CHECK-NEXT: movlo r2, #1 +; CHECK-NEXT: rsbs r0, r2, #0 +; CHECK-NEXT: vdup.32 q2, r0 +; CHECK-NEXT: vbic q1, q1, q2 +; CHECK-NEXT: vand q0, q0, q2 +; CHECK-NEXT: vorr q0, q0, q1 +; CHECK-NEXT: bx lr +entry: + %c = icmp ult i32 %s, %t + %vc1 = insertelement <2 x i1> undef, i1 %c, i64 0 + %vc4 = shufflevector <2 x i1> %vc1, <2 x i1> undef, <2 x i32> zeroinitializer + %r = select <2 x i1> %vc4, <2 x i64> %a, <2 x i64> %b + ret <2 x i64> %r +} diff --git a/llvm/test/CodeGen/Thumb2/mve-pred-ext.ll b/llvm/test/CodeGen/Thumb2/mve-pred-ext.ll new file mode 100644 index 0000000..e7cc467 --- /dev/null +++ b/llvm/test/CodeGen/Thumb2/mve-pred-ext.ll @@ -0,0 +1,164 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=thumbv8.1m.main-arm-none-eabi -mattr=+mve -verify-machineinstrs %s -o - | FileCheck %s + +define arm_aapcs_vfpcc <4 x i32> @sext_v4i1_v4i32(<4 x i32> %src) { +; CHECK-LABEL: sext_v4i1_v4i32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vcmp.s32 gt, q0, zr +; CHECK-NEXT: vmov.i32 q0, #0x0 +; CHECK-NEXT: vmov.i8 q1, #0xff +; CHECK-NEXT: vpsel q0, q1, q0 +; CHECK-NEXT: bx lr +entry: + %c = icmp sgt <4 x i32> %src, zeroinitializer + %0 = sext <4 x i1> %c to <4 x i32> + ret <4 x i32> %0 +} + +define arm_aapcs_vfpcc <8 x i16> @sext_v8i1_v8i16(<8 x i16> %src) { +; CHECK-LABEL: sext_v8i1_v8i16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vcmp.s16 gt, q0, zr +; CHECK-NEXT: vmov.i16 q0, #0x0 +; CHECK-NEXT: vmov.i8 q1, #0xff +; CHECK-NEXT: vpsel q0, q1, q0 +; CHECK-NEXT: bx lr +entry: + %c = icmp sgt <8 x i16> %src, zeroinitializer + %0 = sext <8 x i1> %c to <8 x i16> + ret <8 x i16> %0 +} + +define arm_aapcs_vfpcc <16 x i8> @sext_v16i1_v16i8(<16 x i8> %src) { +; CHECK-LABEL: sext_v16i1_v16i8: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vcmp.s8 gt, q0, zr +; CHECK-NEXT: vmov.i8 q0, #0x0 +; CHECK-NEXT: vmov.i8 q1, #0xff +; CHECK-NEXT: vpsel q0, q1, q0 +; CHECK-NEXT: bx lr +entry: + %c = icmp sgt <16 x i8> %src, zeroinitializer + %0 = sext <16 x i1> %c to <16 x i8> + ret <16 x i8> %0 +} + +define arm_aapcs_vfpcc <2 x i64> @sext_v2i1_v2i64(<2 x i64> %src) { +; CHECK-LABEL: sext_v2i1_v2i64: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmov r1, s0 +; CHECK-NEXT: movs r2, #0 +; CHECK-NEXT: vmov r0, s1 +; CHECK-NEXT: rsbs r1, r1, #0 +; CHECK-NEXT: sbcs.w r0, r2, r0 +; CHECK-NEXT: vmov r1, s2 +; CHECK-NEXT: mov.w r0, #0 +; CHECK-NEXT: it lt +; CHECK-NEXT: movlt r0, #1 +; CHECK-NEXT: cmp r0, #0 +; CHECK-NEXT: it ne +; CHECK-NEXT: movne.w r0, #-1 +; CHECK-NEXT: vmov.32 q1[0], r0 +; CHECK-NEXT: vmov.32 q1[1], r0 +; CHECK-NEXT: vmov r0, s3 +; CHECK-NEXT: rsbs r1, r1, #0 +; CHECK-NEXT: sbcs.w r0, r2, r0 +; CHECK-NEXT: it lt +; CHECK-NEXT: movlt r2, #1 +; CHECK-NEXT: cmp r2, #0 +; CHECK-NEXT: it ne +; CHECK-NEXT: movne.w r2, #-1 +; CHECK-NEXT: vmov.32 q1[2], r2 +; CHECK-NEXT: vmov.32 q1[3], r2 +; CHECK-NEXT: vmov q0, q1 +; CHECK-NEXT: bx lr +entry: + %c = icmp sgt <2 x i64> %src, zeroinitializer + %0 = sext <2 x i1> %c to <2 x i64> + ret <2 x i64> %0 +} + + +define arm_aapcs_vfpcc <4 x i32> @zext_v4i1_v4i32(<4 x i32> %src) { +; CHECK-LABEL: zext_v4i1_v4i32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vcmp.s32 gt, q0, zr +; CHECK-NEXT: vmov.i32 q0, #0x0 +; CHECK-NEXT: vmov.i32 q1, #0x1 +; CHECK-NEXT: vpsel q0, q1, q0 +; CHECK-NEXT: bx lr +entry: + %c = icmp sgt <4 x i32> %src, zeroinitializer + %0 = zext <4 x i1> %c to 
<4 x i32> + ret <4 x i32> %0 +} + +define arm_aapcs_vfpcc <8 x i16> @zext_v8i1_v8i16(<8 x i16> %src) { +; CHECK-LABEL: zext_v8i1_v8i16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vcmp.s16 gt, q0, zr +; CHECK-NEXT: vmov.i16 q0, #0x0 +; CHECK-NEXT: vmov.i16 q1, #0x1 +; CHECK-NEXT: vpsel q0, q1, q0 +; CHECK-NEXT: bx lr +entry: + %c = icmp sgt <8 x i16> %src, zeroinitializer + %0 = zext <8 x i1> %c to <8 x i16> + ret <8 x i16> %0 +} + +define arm_aapcs_vfpcc <16 x i8> @zext_v16i1_v16i8(<16 x i8> %src) { +; CHECK-LABEL: zext_v16i1_v16i8: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vcmp.s8 gt, q0, zr +; CHECK-NEXT: vmov.i8 q0, #0x0 +; CHECK-NEXT: vmov.i8 q1, #0x1 +; CHECK-NEXT: vpsel q0, q1, q0 +; CHECK-NEXT: bx lr +entry: + %c = icmp sgt <16 x i8> %src, zeroinitializer + %0 = zext <16 x i1> %c to <16 x i8> + ret <16 x i8> %0 +} + +define arm_aapcs_vfpcc <2 x i64> @zext_v2i1_v2i64(<2 x i64> %src) { +; CHECK-LABEL: zext_v2i1_v2i64: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmov r1, s0 +; CHECK-NEXT: movs r2, #0 +; CHECK-NEXT: vmov r0, s1 +; CHECK-NEXT: rsbs r1, r1, #0 +; CHECK-NEXT: sbcs.w r0, r2, r0 +; CHECK-NEXT: vmov r1, s2 +; CHECK-NEXT: mov.w r0, #0 +; CHECK-NEXT: it lt +; CHECK-NEXT: movlt r0, #1 +; CHECK-NEXT: cmp r0, #0 +; CHECK-NEXT: it ne +; CHECK-NEXT: movne.w r0, #-1 +; CHECK-NEXT: vmov.32 q1[0], r0 +; CHECK-NEXT: vmov r0, s3 +; CHECK-NEXT: rsbs r1, r1, #0 +; CHECK-NEXT: sbcs.w r0, r2, r0 +; CHECK-NEXT: it lt +; CHECK-NEXT: movlt r2, #1 +; CHECK-NEXT: adr r0, .LCPI7_0 +; CHECK-NEXT: cmp r2, #0 +; CHECK-NEXT: vldrw.u32 q0, [r0] +; CHECK-NEXT: it ne +; CHECK-NEXT: movne.w r2, #-1 +; CHECK-NEXT: vmov.32 q1[2], r2 +; CHECK-NEXT: vand q0, q1, q0 +; CHECK-NEXT: bx lr +; CHECK-NEXT: .p2align 4 +; CHECK-NEXT: @ %bb.1: +; CHECK-NEXT: .LCPI7_0: +; CHECK-NEXT: .long 1 @ 0x1 +; CHECK-NEXT: .long 0 @ 0x0 +; CHECK-NEXT: .long 1 @ 0x1 +; CHECK-NEXT: .long 0 @ 0x0 +entry: + %c = icmp sgt <2 x i64> %src, zeroinitializer + %0 = zext <2 x i1> %c to <2 x i64> + ret <2 x i64> %0 +} diff --git a/llvm/test/CodeGen/Thumb2/mve-pred-shuffle.ll b/llvm/test/CodeGen/Thumb2/mve-pred-shuffle.ll new file mode 100644 index 0000000..895c91e --- /dev/null +++ b/llvm/test/CodeGen/Thumb2/mve-pred-shuffle.ll @@ -0,0 +1,564 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=thumbv8.1m.main-arm-none-eabi -mattr=+mve -verify-machineinstrs %s -o - | FileCheck %s + +define <4 x i32> @shuffle1_v4i32(<4 x i32> %src, <4 x i32> %a, <4 x i32> %b) { +; CHECK-LABEL: shuffle1_v4i32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmov d1, r2, r3 +; CHECK-NEXT: vmov d0, r0, r1 +; CHECK-NEXT: vcmp.i32 eq, q0, zr +; CHECK-NEXT: vmrs r0, p0 +; CHECK-NEXT: rbit r0, r0 +; CHECK-NEXT: lsrs r0, r0, #16 +; CHECK-NEXT: vmsr p0, r0 +; CHECK-NEXT: add r0, sp, #16 +; CHECK-NEXT: vldrw.u32 q0, [r0] +; CHECK-NEXT: mov r0, sp +; CHECK-NEXT: vldrw.u32 q1, [r0] +; CHECK-NEXT: vpsel q0, q1, q0 +; CHECK-NEXT: vmov r0, r1, d0 +; CHECK-NEXT: vmov r2, r3, d1 +; CHECK-NEXT: bx lr +entry: + %c = icmp eq <4 x i32> %src, zeroinitializer + %sh = shufflevector <4 x i1> %c, <4 x i1> undef, <4 x i32> + %s = select <4 x i1> %sh, <4 x i32> %a, <4 x i32> %b + ret <4 x i32> %s +} + +define <8 x i16> @shuffle1_v8i16(<8 x i16> %src, <8 x i16> %a, <8 x i16> %b) { +; CHECK-LABEL: shuffle1_v8i16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmov d1, r2, r3 +; CHECK-NEXT: vmov d0, r0, r1 +; CHECK-NEXT: vcmp.i16 eq, q0, zr +; CHECK-NEXT: vmrs r0, p0 +; CHECK-NEXT: rbit r0, r0 +; CHECK-NEXT: lsrs r0, r0, #16 +; CHECK-NEXT: vmsr p0, 
r0 +; CHECK-NEXT: add r0, sp, #16 +; CHECK-NEXT: vldrw.u32 q0, [r0] +; CHECK-NEXT: mov r0, sp +; CHECK-NEXT: vldrw.u32 q1, [r0] +; CHECK-NEXT: vpsel q0, q1, q0 +; CHECK-NEXT: vmov r0, r1, d0 +; CHECK-NEXT: vmov r2, r3, d1 +; CHECK-NEXT: bx lr +entry: + %c = icmp eq <8 x i16> %src, zeroinitializer + %sh = shufflevector <8 x i1> %c, <8 x i1> undef, <8 x i32> + %s = select <8 x i1> %sh, <8 x i16> %a, <8 x i16> %b + ret <8 x i16> %s +} + +define <16 x i8> @shuffle1_v16i8(<16 x i8> %src, <16 x i8> %a, <16 x i8> %b) { +; CHECK-LABEL: shuffle1_v16i8: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmov d1, r2, r3 +; CHECK-NEXT: vmov d0, r0, r1 +; CHECK-NEXT: vcmp.i8 eq, q0, zr +; CHECK-NEXT: vmrs r0, p0 +; CHECK-NEXT: rbit r0, r0 +; CHECK-NEXT: lsrs r0, r0, #16 +; CHECK-NEXT: vmsr p0, r0 +; CHECK-NEXT: add r0, sp, #16 +; CHECK-NEXT: vldrw.u32 q0, [r0] +; CHECK-NEXT: mov r0, sp +; CHECK-NEXT: vldrw.u32 q1, [r0] +; CHECK-NEXT: vpsel q0, q1, q0 +; CHECK-NEXT: vmov r0, r1, d0 +; CHECK-NEXT: vmov r2, r3, d1 +; CHECK-NEXT: bx lr +entry: + %c = icmp eq <16 x i8> %src, zeroinitializer + %sh = shufflevector <16 x i1> %c, <16 x i1> undef, <16 x i32> + %s = select <16 x i1> %sh, <16 x i8> %a, <16 x i8> %b + ret <16 x i8> %s +} + +define <4 x i32> @shuffle2_v4i32(<4 x i32> %src, <4 x i32> %a, <4 x i32> %b) { +; CHECK-LABEL: shuffle2_v4i32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmov d1, r2, r3 +; CHECK-NEXT: vmov d0, r0, r1 +; CHECK-NEXT: add r0, sp, #16 +; CHECK-NEXT: vcmp.i32 eq, q0, zr +; CHECK-NEXT: vldrw.u32 q0, [r0] +; CHECK-NEXT: mov r0, sp +; CHECK-NEXT: vldrw.u32 q1, [r0] +; CHECK-NEXT: vpsel q0, q1, q0 +; CHECK-NEXT: vmov r0, r1, d0 +; CHECK-NEXT: vmov r2, r3, d1 +; CHECK-NEXT: bx lr +entry: + %c = icmp eq <4 x i32> %src, zeroinitializer + %sh = shufflevector <4 x i1> %c, <4 x i1> undef, <4 x i32> + %s = select <4 x i1> %sh, <4 x i32> %a, <4 x i32> %b + ret <4 x i32> %s +} + +define <8 x i16> @shuffle2_v8i16(<8 x i16> %src, <8 x i16> %a, <8 x i16> %b) { +; CHECK-LABEL: shuffle2_v8i16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmov d1, r2, r3 +; CHECK-NEXT: vmov d0, r0, r1 +; CHECK-NEXT: add r0, sp, #16 +; CHECK-NEXT: vcmp.i16 eq, q0, zr +; CHECK-NEXT: vldrw.u32 q0, [r0] +; CHECK-NEXT: mov r0, sp +; CHECK-NEXT: vldrw.u32 q1, [r0] +; CHECK-NEXT: vpsel q0, q1, q0 +; CHECK-NEXT: vmov r0, r1, d0 +; CHECK-NEXT: vmov r2, r3, d1 +; CHECK-NEXT: bx lr +entry: + %c = icmp eq <8 x i16> %src, zeroinitializer + %sh = shufflevector <8 x i1> %c, <8 x i1> undef, <8 x i32> + %s = select <8 x i1> %sh, <8 x i16> %a, <8 x i16> %b + ret <8 x i16> %s +} + +define <16 x i8> @shuffle2_v16i8(<16 x i8> %src, <16 x i8> %a, <16 x i8> %b) { +; CHECK-LABEL: shuffle2_v16i8: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmov d1, r2, r3 +; CHECK-NEXT: vmov d0, r0, r1 +; CHECK-NEXT: add r0, sp, #16 +; CHECK-NEXT: vcmp.i8 eq, q0, zr +; CHECK-NEXT: vldrw.u32 q0, [r0] +; CHECK-NEXT: mov r0, sp +; CHECK-NEXT: vldrw.u32 q1, [r0] +; CHECK-NEXT: vpsel q0, q1, q0 +; CHECK-NEXT: vmov r0, r1, d0 +; CHECK-NEXT: vmov r2, r3, d1 +; CHECK-NEXT: bx lr +entry: + %c = icmp eq <16 x i8> %src, zeroinitializer + %sh = shufflevector <16 x i1> %c, <16 x i1> undef, <16 x i32> + %s = select <16 x i1> %sh, <16 x i8> %a, <16 x i8> %b + ret <16 x i8> %s +} + +define <4 x i32> @shuffle3_v4i32(<4 x i32> %src, <4 x i32> %a, <4 x i32> %b) { +; CHECK-LABEL: shuffle3_v4i32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmov d1, r2, r3 +; CHECK-NEXT: vmov.i8 q1, #0xff +; CHECK-NEXT: vmov d0, r0, r1 +; CHECK-NEXT: vcmp.i32 eq, q0, zr +; CHECK-NEXT: vmov.i8 q0, #0x0 +; 
CHECK-NEXT: vpsel q0, q1, q0 +; CHECK-NEXT: vmov.32 r0, q0[0] +; CHECK-NEXT: vdup.32 q0, r0 +; CHECK-NEXT: add r0, sp, #16 +; CHECK-NEXT: vcmp.i32 ne, q0, zr +; CHECK-NEXT: vldrw.u32 q0, [r0] +; CHECK-NEXT: mov r0, sp +; CHECK-NEXT: vldrw.u32 q1, [r0] +; CHECK-NEXT: vpsel q0, q1, q0 +; CHECK-NEXT: vmov r0, r1, d0 +; CHECK-NEXT: vmov r2, r3, d1 +; CHECK-NEXT: bx lr +entry: + %c = icmp eq <4 x i32> %src, zeroinitializer + %sh = shufflevector <4 x i1> %c, <4 x i1> undef, <4 x i32> + %s = select <4 x i1> %sh, <4 x i32> %a, <4 x i32> %b + ret <4 x i32> %s +} + +define <8 x i16> @shuffle3_v8i16(<8 x i16> %src, <8 x i16> %a, <8 x i16> %b) { +; CHECK-LABEL: shuffle3_v8i16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmov d1, r2, r3 +; CHECK-NEXT: vmov.i8 q1, #0xff +; CHECK-NEXT: vmov d0, r0, r1 +; CHECK-NEXT: vcmp.i16 eq, q0, zr +; CHECK-NEXT: vmov.i8 q0, #0x0 +; CHECK-NEXT: vpsel q0, q1, q0 +; CHECK-NEXT: vmov.u16 r0, q0[0] +; CHECK-NEXT: vdup.16 q0, r0 +; CHECK-NEXT: add r0, sp, #16 +; CHECK-NEXT: vcmp.i16 ne, q0, zr +; CHECK-NEXT: vldrw.u32 q0, [r0] +; CHECK-NEXT: mov r0, sp +; CHECK-NEXT: vldrw.u32 q1, [r0] +; CHECK-NEXT: vpsel q0, q1, q0 +; CHECK-NEXT: vmov r0, r1, d0 +; CHECK-NEXT: vmov r2, r3, d1 +; CHECK-NEXT: bx lr +entry: + %c = icmp eq <8 x i16> %src, zeroinitializer + %sh = shufflevector <8 x i1> %c, <8 x i1> undef, <8 x i32> + %s = select <8 x i1> %sh, <8 x i16> %a, <8 x i16> %b + ret <8 x i16> %s +} + +define <16 x i8> @shuffle3_v16i8(<16 x i8> %src, <16 x i8> %a, <16 x i8> %b) { +; CHECK-LABEL: shuffle3_v16i8: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmov d1, r2, r3 +; CHECK-NEXT: vmov.i8 q1, #0xff +; CHECK-NEXT: vmov d0, r0, r1 +; CHECK-NEXT: vcmp.i8 eq, q0, zr +; CHECK-NEXT: vmov.i8 q0, #0x0 +; CHECK-NEXT: vpsel q0, q1, q0 +; CHECK-NEXT: vmov.u8 r0, q0[0] +; CHECK-NEXT: vdup.8 q0, r0 +; CHECK-NEXT: add r0, sp, #16 +; CHECK-NEXT: vcmp.i8 ne, q0, zr +; CHECK-NEXT: vldrw.u32 q0, [r0] +; CHECK-NEXT: mov r0, sp +; CHECK-NEXT: vldrw.u32 q1, [r0] +; CHECK-NEXT: vpsel q0, q1, q0 +; CHECK-NEXT: vmov r0, r1, d0 +; CHECK-NEXT: vmov r2, r3, d1 +; CHECK-NEXT: bx lr +entry: + %c = icmp eq <16 x i8> %src, zeroinitializer + %sh = shufflevector <16 x i1> %c, <16 x i1> undef, <16 x i32> + %s = select <16 x i1> %sh, <16 x i8> %a, <16 x i8> %b + ret <16 x i8> %s +} + +define <4 x i32> @shuffle4_v4i32(<4 x i32> %src, <4 x i32> %a, <4 x i32> %b) { +; CHECK-LABEL: shuffle4_v4i32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmov d1, r2, r3 +; CHECK-NEXT: vmov.i8 q1, #0xff +; CHECK-NEXT: vmov d0, r0, r1 +; CHECK-NEXT: add r0, sp, #16 +; CHECK-NEXT: vcmp.i32 eq, q0, zr +; CHECK-NEXT: vmov.i8 q0, #0x0 +; CHECK-NEXT: vpsel q0, q1, q0 +; CHECK-NEXT: vmov.f32 s4, s0 +; CHECK-NEXT: vmov.f32 s5, s0 +; CHECK-NEXT: vmov.f32 s6, s0 +; CHECK-NEXT: vmov.f32 s7, s1 +; CHECK-NEXT: vldrw.u32 q0, [r0] +; CHECK-NEXT: mov r0, sp +; CHECK-NEXT: vcmp.i32 ne, q1, zr +; CHECK-NEXT: vldrw.u32 q1, [r0] +; CHECK-NEXT: vpsel q0, q1, q0 +; CHECK-NEXT: vmov r0, r1, d0 +; CHECK-NEXT: vmov r2, r3, d1 +; CHECK-NEXT: bx lr +entry: + %c = icmp eq <4 x i32> %src, zeroinitializer + %sh = shufflevector <4 x i1> %c, <4 x i1> undef, <4 x i32> + %s = select <4 x i1> %sh, <4 x i32> %a, <4 x i32> %b + ret <4 x i32> %s +} + +define <8 x i16> @shuffle4_v8i16(<8 x i16> %src, <8 x i16> %a, <8 x i16> %b) { +; CHECK-LABEL: shuffle4_v8i16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmov d1, r2, r3 +; CHECK-NEXT: vmov.i8 q1, #0xff +; CHECK-NEXT: vmov d0, r0, r1 +; CHECK-NEXT: vcmp.i16 eq, q0, zr +; CHECK-NEXT: vmov.i8 q0, #0x0 +; CHECK-NEXT: vpsel q0, 
q1, q0 +; CHECK-NEXT: vmov.u16 r0, q0[0] +; CHECK-NEXT: vdup.16 q1, r0 +; CHECK-NEXT: vmov.u16 r0, q0[1] +; CHECK-NEXT: vmov.16 q1[7], r0 +; CHECK-NEXT: add r0, sp, #16 +; CHECK-NEXT: vldrw.u32 q0, [r0] +; CHECK-NEXT: mov r0, sp +; CHECK-NEXT: vcmp.i16 ne, q1, zr +; CHECK-NEXT: vldrw.u32 q1, [r0] +; CHECK-NEXT: vpsel q0, q1, q0 +; CHECK-NEXT: vmov r0, r1, d0 +; CHECK-NEXT: vmov r2, r3, d1 +; CHECK-NEXT: bx lr +entry: + %c = icmp eq <8 x i16> %src, zeroinitializer + %sh = shufflevector <8 x i1> %c, <8 x i1> undef, <8 x i32> + %s = select <8 x i1> %sh, <8 x i16> %a, <8 x i16> %b + ret <8 x i16> %s +} + +define <16 x i8> @shuffle4_v16i8(<16 x i8> %src, <16 x i8> %a, <16 x i8> %b) { +; CHECK-LABEL: shuffle4_v16i8: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmov d1, r2, r3 +; CHECK-NEXT: vmov.i8 q1, #0xff +; CHECK-NEXT: vmov d0, r0, r1 +; CHECK-NEXT: vcmp.i8 eq, q0, zr +; CHECK-NEXT: vmov.i8 q0, #0x0 +; CHECK-NEXT: vpsel q0, q1, q0 +; CHECK-NEXT: vmov.u8 r0, q0[0] +; CHECK-NEXT: vdup.8 q1, r0 +; CHECK-NEXT: vmov.u8 r0, q0[1] +; CHECK-NEXT: vmov.8 q1[15], r0 +; CHECK-NEXT: add r0, sp, #16 +; CHECK-NEXT: vldrw.u32 q0, [r0] +; CHECK-NEXT: mov r0, sp +; CHECK-NEXT: vcmp.i8 ne, q1, zr +; CHECK-NEXT: vldrw.u32 q1, [r0] +; CHECK-NEXT: vpsel q0, q1, q0 +; CHECK-NEXT: vmov r0, r1, d0 +; CHECK-NEXT: vmov r2, r3, d1 +; CHECK-NEXT: bx lr +entry: + %c = icmp eq <16 x i8> %src, zeroinitializer + %sh = shufflevector <16 x i1> %c, <16 x i1> undef, <16 x i32> + %s = select <16 x i1> %sh, <16 x i8> %a, <16 x i8> %b + ret <16 x i8> %s +} + +define <4 x i32> @shuffle5_b_v4i32(<8 x i16> %src, <4 x i32> %a, <4 x i32> %b) { +; CHECK-LABEL: shuffle5_b_v4i32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmov d1, r2, r3 +; CHECK-NEXT: vmov.i8 q1, #0xff +; CHECK-NEXT: vmov d0, r0, r1 +; CHECK-NEXT: vcmp.i16 eq, q0, zr +; CHECK-NEXT: vmov.i8 q0, #0x0 +; CHECK-NEXT: vpsel q0, q1, q0 +; CHECK-NEXT: vmov.u16 r0, q0[0] +; CHECK-NEXT: vmov.32 q1[0], r0 +; CHECK-NEXT: vmov.u16 r0, q0[1] +; CHECK-NEXT: vmov.32 q1[1], r0 +; CHECK-NEXT: vmov.u16 r0, q0[2] +; CHECK-NEXT: vmov.32 q1[2], r0 +; CHECK-NEXT: vmov.u16 r0, q0[3] +; CHECK-NEXT: vmov.32 q1[3], r0 +; CHECK-NEXT: add r0, sp, #16 +; CHECK-NEXT: vldrw.u32 q0, [r0] +; CHECK-NEXT: mov r0, sp +; CHECK-NEXT: vcmp.i32 ne, q1, zr +; CHECK-NEXT: vldrw.u32 q1, [r0] +; CHECK-NEXT: vpsel q0, q1, q0 +; CHECK-NEXT: vmov r0, r1, d0 +; CHECK-NEXT: vmov r2, r3, d1 +; CHECK-NEXT: bx lr +entry: + %c = icmp eq <8 x i16> %src, zeroinitializer + %sh = shufflevector <8 x i1> %c, <8 x i1> undef, <4 x i32> + %s = select <4 x i1> %sh, <4 x i32> %a, <4 x i32> %b + ret <4 x i32> %s +} + +define <4 x i32> @shuffle5_t_v4i32(<8 x i16> %src, <4 x i32> %a, <4 x i32> %b) { +; CHECK-LABEL: shuffle5_t_v4i32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmov d1, r2, r3 +; CHECK-NEXT: vmov.i8 q1, #0xff +; CHECK-NEXT: vmov d0, r0, r1 +; CHECK-NEXT: vcmp.i16 eq, q0, zr +; CHECK-NEXT: vmov.i8 q0, #0x0 +; CHECK-NEXT: vpsel q0, q1, q0 +; CHECK-NEXT: vmov.u16 r0, q0[4] +; CHECK-NEXT: vmov.32 q1[0], r0 +; CHECK-NEXT: vmov.u16 r0, q0[5] +; CHECK-NEXT: vmov.32 q1[1], r0 +; CHECK-NEXT: vmov.u16 r0, q0[6] +; CHECK-NEXT: vmov.32 q1[2], r0 +; CHECK-NEXT: vmov.u16 r0, q0[7] +; CHECK-NEXT: vmov.32 q1[3], r0 +; CHECK-NEXT: add r0, sp, #16 +; CHECK-NEXT: vldrw.u32 q0, [r0] +; CHECK-NEXT: mov r0, sp +; CHECK-NEXT: vcmp.i32 ne, q1, zr +; CHECK-NEXT: vldrw.u32 q1, [r0] +; CHECK-NEXT: vpsel q0, q1, q0 +; CHECK-NEXT: vmov r0, r1, d0 +; CHECK-NEXT: vmov r2, r3, d1 +; CHECK-NEXT: bx lr +entry: + %c = icmp eq <8 x i16> %src, zeroinitializer 
+ %sh = shufflevector <8 x i1> %c, <8 x i1> undef, <4 x i32> + %s = select <4 x i1> %sh, <4 x i32> %a, <4 x i32> %b + ret <4 x i32> %s +} + +define <8 x i16> @shuffle5_b_v8i16(<16 x i8> %src, <8 x i16> %a, <8 x i16> %b) { +; CHECK-LABEL: shuffle5_b_v8i16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmov d1, r2, r3 +; CHECK-NEXT: vmov.i8 q1, #0xff +; CHECK-NEXT: vmov d0, r0, r1 +; CHECK-NEXT: vcmp.i8 eq, q0, zr +; CHECK-NEXT: vmov.i8 q0, #0x0 +; CHECK-NEXT: vpsel q1, q1, q0 +; CHECK-NEXT: vmov.u8 r0, q1[0] +; CHECK-NEXT: vmov.16 q0[0], r0 +; CHECK-NEXT: vmov.u8 r0, q1[1] +; CHECK-NEXT: vmov.16 q0[1], r0 +; CHECK-NEXT: vmov.u8 r0, q1[2] +; CHECK-NEXT: vmov.16 q0[2], r0 +; CHECK-NEXT: vmov.u8 r0, q1[3] +; CHECK-NEXT: vmov.16 q0[3], r0 +; CHECK-NEXT: vmov.u8 r0, q1[4] +; CHECK-NEXT: vmov.16 q0[4], r0 +; CHECK-NEXT: vmov.u8 r0, q1[5] +; CHECK-NEXT: vmov.16 q0[5], r0 +; CHECK-NEXT: vmov.u8 r0, q1[6] +; CHECK-NEXT: vmov.16 q0[6], r0 +; CHECK-NEXT: vmov.u8 r0, q1[7] +; CHECK-NEXT: vmov.16 q0[7], r0 +; CHECK-NEXT: add r0, sp, #16 +; CHECK-NEXT: vcmp.i16 ne, q0, zr +; CHECK-NEXT: vldrw.u32 q0, [r0] +; CHECK-NEXT: mov r0, sp +; CHECK-NEXT: vldrw.u32 q1, [r0] +; CHECK-NEXT: vpsel q0, q1, q0 +; CHECK-NEXT: vmov r0, r1, d0 +; CHECK-NEXT: vmov r2, r3, d1 +; CHECK-NEXT: bx lr +entry: + %c = icmp eq <16 x i8> %src, zeroinitializer + %sh = shufflevector <16 x i1> %c, <16 x i1> undef, <8 x i32> + %s = select <8 x i1> %sh, <8 x i16> %a, <8 x i16> %b + ret <8 x i16> %s +} + +define <8 x i16> @shuffle5_t_v8i16(<16 x i8> %src, <8 x i16> %a, <8 x i16> %b) { +; CHECK-LABEL: shuffle5_t_v8i16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmov d1, r2, r3 +; CHECK-NEXT: vmov.i8 q1, #0xff +; CHECK-NEXT: vmov d0, r0, r1 +; CHECK-NEXT: vcmp.i8 eq, q0, zr +; CHECK-NEXT: vmov.i8 q0, #0x0 +; CHECK-NEXT: vpsel q1, q1, q0 +; CHECK-NEXT: vmov.u8 r0, q1[8] +; CHECK-NEXT: vmov.16 q0[0], r0 +; CHECK-NEXT: vmov.u8 r0, q1[9] +; CHECK-NEXT: vmov.16 q0[1], r0 +; CHECK-NEXT: vmov.u8 r0, q1[10] +; CHECK-NEXT: vmov.16 q0[2], r0 +; CHECK-NEXT: vmov.u8 r0, q1[11] +; CHECK-NEXT: vmov.16 q0[3], r0 +; CHECK-NEXT: vmov.u8 r0, q1[12] +; CHECK-NEXT: vmov.16 q0[4], r0 +; CHECK-NEXT: vmov.u8 r0, q1[13] +; CHECK-NEXT: vmov.16 q0[5], r0 +; CHECK-NEXT: vmov.u8 r0, q1[14] +; CHECK-NEXT: vmov.16 q0[6], r0 +; CHECK-NEXT: vmov.u8 r0, q1[15] +; CHECK-NEXT: vmov.16 q0[7], r0 +; CHECK-NEXT: add r0, sp, #16 +; CHECK-NEXT: vcmp.i16 ne, q0, zr +; CHECK-NEXT: vldrw.u32 q0, [r0] +; CHECK-NEXT: mov r0, sp +; CHECK-NEXT: vldrw.u32 q1, [r0] +; CHECK-NEXT: vpsel q0, q1, q0 +; CHECK-NEXT: vmov r0, r1, d0 +; CHECK-NEXT: vmov r2, r3, d1 +; CHECK-NEXT: bx lr +entry: + %c = icmp eq <16 x i8> %src, zeroinitializer + %sh = shufflevector <16 x i1> %c, <16 x i1> undef, <8 x i32> + %s = select <8 x i1> %sh, <8 x i16> %a, <8 x i16> %b + ret <8 x i16> %s +} + +define <8 x i16> @shuffle6_v4i32(<4 x i32> %src1, <4 x i32> %src2, <8 x i16> %a, <8 x i16> %b) { +; CHECK-LABEL: shuffle6_v4i32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmov d1, r2, r3 +; CHECK-NEXT: vmov.i8 q1, #0x0 +; CHECK-NEXT: vmov d0, r0, r1 +; CHECK-NEXT: vmov.i8 q2, #0xff +; CHECK-NEXT: vcmp.i32 eq, q0, zr +; CHECK-NEXT: vpsel q3, q2, q1 +; CHECK-NEXT: vmov r0, s12 +; CHECK-NEXT: vmov.16 q0[0], r0 +; CHECK-NEXT: vmov r0, s13 +; CHECK-NEXT: vmov.16 q0[1], r0 +; CHECK-NEXT: vmov r0, s14 +; CHECK-NEXT: vmov.16 q0[2], r0 +; CHECK-NEXT: vmov r0, s15 +; CHECK-NEXT: vmov.16 q0[3], r0 +; CHECK-NEXT: mov r0, sp +; CHECK-NEXT: vldrw.u32 q3, [r0] +; CHECK-NEXT: vcmp.i32 eq, q3, zr +; CHECK-NEXT: vpsel q1, q2, q1 +; 
+; CHECK-NEXT: vmov.16 q0[4], r0
+; CHECK-NEXT: vmov r0, s5
+; CHECK-NEXT: vmov.16 q0[5], r0
+; CHECK-NEXT: vmov r0, s6
+; CHECK-NEXT: vmov.16 q0[6], r0
+; CHECK-NEXT: vmov r0, s7
+; CHECK-NEXT: vmov.16 q0[7], r0
+; CHECK-NEXT: add r0, sp, #32
+; CHECK-NEXT: vcmp.i16 ne, q0, zr
+; CHECK-NEXT: vldrw.u32 q0, [r0]
+; CHECK-NEXT: add r0, sp, #16
+; CHECK-NEXT: vldrw.u32 q1, [r0]
+; CHECK-NEXT: vpsel q0, q1, q0
+; CHECK-NEXT: vmov r0, r1, d0
+; CHECK-NEXT: vmov r2, r3, d1
+; CHECK-NEXT: bx lr
+entry:
+  %c1 = icmp eq <4 x i32> %src1, zeroinitializer
+  %c2 = icmp eq <4 x i32> %src2, zeroinitializer
+  %sh = shufflevector <4 x i1> %c1, <4 x i1> %c2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %s = select <8 x i1> %sh, <8 x i16> %a, <8 x i16> %b
+  ret <8 x i16> %s
+}
+
+define <16 x i8> @shuffle6_v8i16(<8 x i16> %src1, <8 x i16> %src2, <16 x i8> %a, <16 x i8> %b) {
+; CHECK-LABEL: shuffle6_v8i16:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vmov d1, r2, r3
+; CHECK-NEXT: vmov.i8 q1, #0x0
+; CHECK-NEXT: vmov d0, r0, r1
+; CHECK-NEXT: vmov.i8 q2, #0xff
+; CHECK-NEXT: vcmp.i16 eq, q0, zr
+; CHECK-NEXT: vpsel q3, q2, q1
+; CHECK-NEXT: vmov.u16 r0, q3[0]
+; CHECK-NEXT: vmov.8 q0[0], r0
+; CHECK-NEXT: vmov.u16 r0, q3[1]
+; CHECK-NEXT: vmov.8 q0[1], r0
+; CHECK-NEXT: vmov.u16 r0, q3[2]
+; CHECK-NEXT: vmov.8 q0[2], r0
+; CHECK-NEXT: vmov.u16 r0, q3[3]
+; CHECK-NEXT: vmov.8 q0[3], r0
+; CHECK-NEXT: vmov.u16 r0, q3[4]
+; CHECK-NEXT: vmov.8 q0[4], r0
+; CHECK-NEXT: vmov.u16 r0, q3[5]
+; CHECK-NEXT: vmov.8 q0[5], r0
+; CHECK-NEXT: vmov.u16 r0, q3[6]
+; CHECK-NEXT: vmov.8 q0[6], r0
+; CHECK-NEXT: vmov.u16 r0, q3[7]
+; CHECK-NEXT: vmov.8 q0[7], r0
+; CHECK-NEXT: mov r0, sp
+; CHECK-NEXT: vldrw.u32 q3, [r0]
+; CHECK-NEXT: vcmp.i16 eq, q3, zr
+; CHECK-NEXT: vpsel q1, q2, q1
+; CHECK-NEXT: vmov.u16 r0, q1[0]
+; CHECK-NEXT: vmov.8 q0[8], r0
+; CHECK-NEXT: vmov.u16 r0, q1[1]
+; CHECK-NEXT: vmov.8 q0[9], r0
+; CHECK-NEXT: vmov.u16 r0, q1[2]
+; CHECK-NEXT: vmov.8 q0[10], r0
+; CHECK-NEXT: vmov.u16 r0, q1[3]
+; CHECK-NEXT: vmov.8 q0[11], r0
+; CHECK-NEXT: vmov.u16 r0, q1[4]
+; CHECK-NEXT: vmov.8 q0[12], r0
+; CHECK-NEXT: vmov.u16 r0, q1[5]
+; CHECK-NEXT: vmov.8 q0[13], r0
+; CHECK-NEXT: vmov.u16 r0, q1[6]
+; CHECK-NEXT: vmov.8 q0[14], r0
+; CHECK-NEXT: vmov.u16 r0, q1[7]
+; CHECK-NEXT: vmov.8 q0[15], r0
+; CHECK-NEXT: add r0, sp, #32
+; CHECK-NEXT: vcmp.i8 ne, q0, zr
+; CHECK-NEXT: vldrw.u32 q0, [r0]
+; CHECK-NEXT: add r0, sp, #16
+; CHECK-NEXT: vldrw.u32 q1, [r0]
+; CHECK-NEXT: vpsel q0, q1, q0
+; CHECK-NEXT: vmov r0, r1, d0
+; CHECK-NEXT: vmov r2, r3, d1
+; CHECK-NEXT: bx lr
+entry:
+  %c1 = icmp eq <8 x i16> %src1, zeroinitializer
+  %c2 = icmp eq <8 x i16> %src2, zeroinitializer
+  %sh = shufflevector <8 x i1> %c1, <8 x i1> %c2, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %s = select <16 x i1> %sh, <16 x i8> %a, <16 x i8> %b
+  ret <16 x i8> %s
+}
diff --git a/llvm/test/CodeGen/Thumb2/mve-pred-spill.ll b/llvm/test/CodeGen/Thumb2/mve-pred-spill.ll
new file mode 100644
index 0000000..d9c7f38
--- /dev/null
+++ b/llvm/test/CodeGen/Thumb2/mve-pred-spill.ll
@@ -0,0 +1,81 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=thumbv8.1m.main-arm-none-eabi -mattr=+mve -verify-machineinstrs %s -o - | FileCheck %s
+
+declare arm_aapcs_vfpcc <4 x i32> @ext_i32()
+declare arm_aapcs_vfpcc <8 x i16> @ext_i16()
+declare arm_aapcs_vfpcc <16 x i8> @ext_i8()
+
+define arm_aapcs_vfpcc <4 x i32> @shuffle1_v4i32(<4 x i32> %src, <4 x i32> %a) {
+; CHECK-LABEL: shuffle1_v4i32:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: .save {r7, lr}
+; CHECK-NEXT: push {r7, lr}
+; CHECK-NEXT: .vsave {d8, d9}
+; CHECK-NEXT: vpush {d8, d9}
+; CHECK-NEXT: .pad #8
+; CHECK-NEXT: sub sp, #8
+; CHECK-NEXT: vcmp.i32 eq, q0, zr
+; CHECK-NEXT: vmov q4, q1
+; CHECK-NEXT: vstr p0, [sp, #4] @ 4-byte Spill
+; CHECK-NEXT: bl ext_i32
+; CHECK-NEXT: vldr p0, [sp, #4] @ 4-byte Reload
+; CHECK-NEXT: vpsel q0, q4, q0
+; CHECK-NEXT: add sp, #8
+; CHECK-NEXT: vpop {d8, d9}
+; CHECK-NEXT: pop {r7, pc}
+entry:
+  %c = icmp eq <4 x i32> %src, zeroinitializer
+  %ext = call arm_aapcs_vfpcc <4 x i32> @ext_i32()
+  %s = select <4 x i1> %c, <4 x i32> %a, <4 x i32> %ext
+  ret <4 x i32> %s
+}
+
+define arm_aapcs_vfpcc <8 x i16> @shuffle1_v8i16(<8 x i16> %src, <8 x i16> %a) {
+; CHECK-LABEL: shuffle1_v8i16:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: .save {r7, lr}
+; CHECK-NEXT: push {r7, lr}
+; CHECK-NEXT: .vsave {d8, d9}
+; CHECK-NEXT: vpush {d8, d9}
+; CHECK-NEXT: .pad #8
+; CHECK-NEXT: sub sp, #8
+; CHECK-NEXT: vcmp.i16 eq, q0, zr
+; CHECK-NEXT: vmov q4, q1
+; CHECK-NEXT: vstr p0, [sp, #4] @ 4-byte Spill
+; CHECK-NEXT: bl ext_i16
+; CHECK-NEXT: vldr p0, [sp, #4] @ 4-byte Reload
+; CHECK-NEXT: vpsel q0, q4, q0
+; CHECK-NEXT: add sp, #8
+; CHECK-NEXT: vpop {d8, d9}
+; CHECK-NEXT: pop {r7, pc}
+entry:
+  %c = icmp eq <8 x i16> %src, zeroinitializer
+  %ext = call arm_aapcs_vfpcc <8 x i16> @ext_i16()
+  %s = select <8 x i1> %c, <8 x i16> %a, <8 x i16> %ext
+  ret <8 x i16> %s
+}
+
+define arm_aapcs_vfpcc <16 x i8> @shuffle1_v16i8(<16 x i8> %src, <16 x i8> %a) {
+; CHECK-LABEL: shuffle1_v16i8:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: .save {r7, lr}
+; CHECK-NEXT: push {r7, lr}
+; CHECK-NEXT: .vsave {d8, d9}
+; CHECK-NEXT: vpush {d8, d9}
+; CHECK-NEXT: .pad #8
+; CHECK-NEXT: sub sp, #8
+; CHECK-NEXT: vcmp.i8 eq, q0, zr
+; CHECK-NEXT: vmov q4, q1
+; CHECK-NEXT: vstr p0, [sp, #4] @ 4-byte Spill
+; CHECK-NEXT: bl ext_i8
+; CHECK-NEXT: vldr p0, [sp, #4] @ 4-byte Reload
+; CHECK-NEXT: vpsel q0, q4, q0
+; CHECK-NEXT: add sp, #8
+; CHECK-NEXT: vpop {d8, d9}
+; CHECK-NEXT: pop {r7, pc}
+entry:
+  %c = icmp eq <16 x i8> %src, zeroinitializer
+  %ext = call arm_aapcs_vfpcc <16 x i8> @ext_i8()
+  %s = select <16 x i1> %c, <16 x i8> %a, <16 x i8> %ext
+  ret <16 x i8> %s
+}
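The three spill tests above each keep a single compare result live across a call, which is what forces the vstr p0 / vldr p0 spill and reload around the bl. A rough sketch of a further case that could be written in the same style is shown below; it is not part of this patch, the function name is hypothetical, and it simply reuses the @ext_i32 declaration from the test file above. A predicate consumed by two selects after the call would be expected to take the same spill path.

define arm_aapcs_vfpcc <4 x i32> @spill_reuse_v4i32(<4 x i32> %src, <4 x i32> %a) {
entry:
  ; Compare before the call so the i1 predicate is live across it.
  %c = icmp eq <4 x i32> %src, zeroinitializer
  %ext = call arm_aapcs_vfpcc <4 x i32> @ext_i32()
  ; Use the reloaded predicate twice.
  %s0 = select <4 x i1> %c, <4 x i32> %a, <4 x i32> %ext
  %s1 = select <4 x i1> %c, <4 x i32> %ext, <4 x i32> %a
  %s = add <4 x i32> %s0, %s1
  ret <4 x i32> %s
}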
diff --git a/llvm/test/CodeGen/Thumb2/mve-vcmp.ll b/llvm/test/CodeGen/Thumb2/mve-vcmp.ll
index 79535ea..a704b4e 100644
--- a/llvm/test/CodeGen/Thumb2/mve-vcmp.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-vcmp.ll
@@ -447,3 +447,107 @@ entry:
   %s = select <2 x i1> %c, <2 x i32> %a, <2 x i32> %b
   ret <2 x i32> %s
 }
+
+define arm_aapcs_vfpcc <2 x i32> @vcmp_multi_v2i32(<2 x i64> %a, <2 x i32> %b, <2 x i32> %c) {
+; CHECK-LABEL: vcmp_multi_v2i32:
+; CHECK: @ %bb.0:
+; CHECK-NEXT: .save {r7, lr}
+; CHECK-NEXT: push {r7, lr}
+; CHECK-NEXT: .vsave {d8, d9, d10, d11}
+; CHECK-NEXT: vpush {d8, d9, d10, d11}
+; CHECK-NEXT: vmov r0, s1
+; CHECK-NEXT: movs r3, #0
+; CHECK-NEXT: vmov r1, s0
+; CHECK-NEXT: vmov r2, s8
+; CHECK-NEXT: vmov lr, s10
+; CHECK-NEXT: orrs r0, r1
+; CHECK-NEXT: vmov r1, s2
+; CHECK-NEXT: clz r0, r0
+; CHECK-NEXT: lsrs r0, r0, #5
+; CHECK-NEXT: it ne
+; CHECK-NEXT: movne.w r0, #-1
+; CHECK-NEXT: vmov.32 q3[0], r0
+; CHECK-NEXT: vmov.32 q3[1], r0
+; CHECK-NEXT: vmov r0, s3
+; CHECK-NEXT: orrs r0, r1
+; CHECK-NEXT: clz r0, r0
+; CHECK-NEXT: lsrs r0, r0, #5
+; CHECK-NEXT: it ne
+; CHECK-NEXT: movne.w r0, #-1
+; CHECK-NEXT: vmov.32 q3[2], r0
+; CHECK-NEXT: vmov.32 q3[3], r0
+; CHECK-NEXT: vbic q0, q2, q3
+; CHECK-NEXT: vmov r0, s0
+; CHECK-NEXT: subs r1, r0, r2
+; CHECK-NEXT: asr.w r12, r0, #31
+; CHECK-NEXT: sbcs.w r1, r12, r2, asr #31
+; CHECK-NEXT: mov.w r1, #0
+; CHECK-NEXT: it lt
+; CHECK-NEXT: movlt r1, #1
+; CHECK-NEXT: cmp r1, #0
+; CHECK-NEXT: it ne
+; CHECK-NEXT: movne.w r1, #-1
+; CHECK-NEXT: vmov.32 q3[0], r1
+; CHECK-NEXT: vmov.32 q3[1], r1
+; CHECK-NEXT: vmov r1, s2
+; CHECK-NEXT: subs.w r2, r1, lr
+; CHECK-NEXT: asr.w r12, r1, #31
+; CHECK-NEXT: sbcs.w r2, r12, lr, asr #31
+; CHECK-NEXT: it lt
+; CHECK-NEXT: movlt r3, #1
+; CHECK-NEXT: cmp r3, #0
+; CHECK-NEXT: it ne
+; CHECK-NEXT: movne.w r3, #-1
+; CHECK-NEXT: cmp r0, #0
+; CHECK-NEXT: it ne
+; CHECK-NEXT: movne r0, #1
+; CHECK-NEXT: cmp r0, #0
+; CHECK-NEXT: it ne
+; CHECK-NEXT: movne.w r0, #-1
+; CHECK-NEXT: vmov.32 q4[0], r0
+; CHECK-NEXT: vmov.32 q4[1], r0
+; CHECK-NEXT: vmov r0, s4
+; CHECK-NEXT: cmp r1, #0
+; CHECK-NEXT: it ne
+; CHECK-NEXT: movne r1, #1
+; CHECK-NEXT: cmp r1, #0
+; CHECK-NEXT: it ne
+; CHECK-NEXT: movne.w r1, #-1
+; CHECK-NEXT: vmov.32 q4[2], r1
+; CHECK-NEXT: vmov.32 q3[2], r3
+; CHECK-NEXT: vmov.32 q4[3], r1
+; CHECK-NEXT: vmov.32 q3[3], r3
+; CHECK-NEXT: cmp r0, #0
+; CHECK-NEXT: it ne
+; CHECK-NEXT: movne r0, #1
+; CHECK-NEXT: cmp r0, #0
+; CHECK-NEXT: it ne
+; CHECK-NEXT: movne.w r0, #-1
+; CHECK-NEXT: vmov.32 q5[0], r0
+; CHECK-NEXT: vmov.32 q5[1], r0
+; CHECK-NEXT: vmov r0, s6
+; CHECK-NEXT: cmp r0, #0
+; CHECK-NEXT: it ne
+; CHECK-NEXT: movne r0, #1
+; CHECK-NEXT: cmp r0, #0
+; CHECK-NEXT: it ne
+; CHECK-NEXT: movne.w r0, #-1
+; CHECK-NEXT: vmov.32 q5[2], r0
+; CHECK-NEXT: vmov.32 q5[3], r0
+; CHECK-NEXT: vand q1, q5, q4
+; CHECK-NEXT: vand q1, q3, q1
+; CHECK-NEXT: vbic q0, q0, q1
+; CHECK-NEXT: vand q1, q2, q1
+; CHECK-NEXT: vorr q0, q1, q0
+; CHECK-NEXT: vpop {d8, d9, d10, d11}
+; CHECK-NEXT: pop {r7, pc}
+  %a4 = icmp eq <2 x i64> %a, zeroinitializer
+  %a5 = select <2 x i1> %a4, <2 x i32> zeroinitializer, <2 x i32> %c
+  %a6 = icmp ne <2 x i32> %b, zeroinitializer
+  %a7 = icmp slt <2 x i32> %a5, %c
+  %a8 = icmp ne <2 x i32> %a5, zeroinitializer
+  %a9 = and <2 x i1> %a6, %a8
+  %a10 = and <2 x i1> %a7, %a9
+  %a11 = select <2 x i1> %a10, <2 x i32> %c, <2 x i32> %a5
+  ret <2 x i32> %a11
+}
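All of the checks in these files are autogenerated with utils/update_llc_test_checks.py, as the NOTE lines state. To inspect the lowering of a single predicate shuffle by hand, a minimal standalone input in the same icmp / shufflevector / select style as the tests above can be fed to the llc invocation from the RUN lines (llc -mtriple=thumbv8.1m.main-arm-none-eabi -mattr=+mve -verify-machineinstrs). A small sketch, with a hypothetical function name, that swaps the two halves of a v4i1 predicate:

define arm_aapcs_vfpcc <4 x i32> @swap_halves_v4i32(<4 x i32> %src, <4 x i32> %a, <4 x i32> %b) {
entry:
  ; Build a v4i1 predicate, permute its lanes, and select on the result.
  %c = icmp eq <4 x i32> %src, zeroinitializer
  %sh = shufflevector <4 x i1> %c, <4 x i1> undef, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
  %s = select <4 x i1> %sh, <4 x i32> %a, <4 x i32> %b
  ret <4 x i32> %s
}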