From 94a71361d6ada8a0e25817fe8ebe443092677e6c Mon Sep 17 00:00:00 2001 From: Krzysztof Parzyszek Date: Fri, 2 Sep 2022 12:04:49 -0700 Subject: [PATCH] [Hexagon] Implement [SU]INT_TO_FP and FP_TO_[SU]INT for HVX --- llvm/lib/Target/Hexagon/HexagonISelLowering.cpp | 28 +- llvm/lib/Target/Hexagon/HexagonISelLowering.h | 20 +- llvm/lib/Target/Hexagon/HexagonISelLoweringHVX.cpp | 682 ++++- llvm/lib/Target/Hexagon/HexagonPatterns.td | 28 +- llvm/lib/Target/Hexagon/HexagonPatternsHVX.td | 25 + llvm/test/CodeGen/Hexagon/autohvx/fp-to-int.ll | 2100 +++++++++++++++ llvm/test/CodeGen/Hexagon/autohvx/int-to-fp.ll | 2744 ++++++++++++++++++++ llvm/test/CodeGen/Hexagon/vector-sint-to-fp.ll | 8 +- 8 files changed, 5567 insertions(+), 68 deletions(-) create mode 100644 llvm/test/CodeGen/Hexagon/autohvx/fp-to-int.ll create mode 100644 llvm/test/CodeGen/Hexagon/autohvx/int-to-fp.ll diff --git a/llvm/lib/Target/Hexagon/HexagonISelLowering.cpp b/llvm/lib/Target/Hexagon/HexagonISelLowering.cpp index f3e1239..331e1cb 100644 --- a/llvm/lib/Target/Hexagon/HexagonISelLowering.cpp +++ b/llvm/lib/Target/Hexagon/HexagonISelLowering.cpp @@ -1899,6 +1899,8 @@ const char* HexagonTargetLowering::getTargetNodeName(unsigned Opcode) const { case HexagonISD::VASL: return "HexagonISD::VASL"; case HexagonISD::VASR: return "HexagonISD::VASR"; case HexagonISD::VLSR: return "HexagonISD::VLSR"; + case HexagonISD::SSAT: return "HexagonISD::SSAT"; + case HexagonISD::USAT: return "HexagonISD::USAT"; case HexagonISD::VEXTRACTW: return "HexagonISD::VEXTRACTW"; case HexagonISD::VINSERTW0: return "HexagonISD::VINSERTW0"; case HexagonISD::VROR: return "HexagonISD::VROR"; @@ -3290,13 +3292,25 @@ HexagonTargetLowering::LowerOperationWrapper(SDNode *N, return; } - // We are only custom-lowering stores to verify the alignment of the - // address if it is a compile-time constant. Since a store can be modified - // during type-legalization (the value being stored may need legalization), - // return empty Results here to indicate that we don't really make any - // changes in the custom lowering. - if (N->getOpcode() != ISD::STORE) - return TargetLowering::LowerOperationWrapper(N, Results, DAG); + SDValue Op(N, 0); + unsigned Opc = N->getOpcode(); + + switch (Opc) { + case HexagonISD::SSAT: + case HexagonISD::USAT: + Results.push_back(opJoin(SplitVectorOp(Op, DAG), SDLoc(Op), DAG)); + break; + case ISD::STORE: + // We are only custom-lowering stores to verify the alignment of the + // address if it is a compile-time constant. Since a store can be + // modified during type-legalization (the value being stored may need + // legalization), return empty Results here to indicate that we don't + // really make any changes in the custom lowering. + return; + default: + TargetLowering::LowerOperationWrapper(N, Results, DAG); + break; + } } void diff --git a/llvm/lib/Target/Hexagon/HexagonISelLowering.h b/llvm/lib/Target/Hexagon/HexagonISelLowering.h index 7e776dc..59b6a40 100644 --- a/llvm/lib/Target/Hexagon/HexagonISelLowering.h +++ b/llvm/lib/Target/Hexagon/HexagonISelLowering.h @@ -57,6 +57,8 @@ enum NodeType : unsigned { VASR, VLSR, + SSAT, // Signed saturate. + USAT, // Unsigned saturate. 
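  // SSAT/USAT narrow an integer value to the type given by their ValueType
  // operand, clamping out-of-range values instead of truncating them. A
  // minimal scalar sketch of the intended semantics (illustrative only; the
  // helper names and the i8 bounds are just an example):
  //
  //   int32_t ssat_to_i8(int32_t v) { return v < -128 ? -128 : v > 127 ? 127 : v; }
  //   int32_t usat_to_i8(int32_t v) { return v < 0 ? 0 : v > 255 ? 255 : v; }
  //
  // This clamping narrow is what lets an FP-to-i8 conversion be expressed as
  // FP-to-i32 followed by a saturating truncate further down in the patch.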
TSTBIT, INSERT, EXTRACTU, @@ -405,6 +407,9 @@ private: TypePair typeSplit(MVT Ty) const; MVT typeExtElem(MVT VecTy, unsigned Factor) const; MVT typeTruncElem(MVT VecTy, unsigned Factor) const; + TypePair typeExtendToWider(MVT Ty0, MVT Ty1) const; + TypePair typeWidenToWider(MVT Ty0, MVT Ty1) const; + MVT typeLegalize(MVT Ty, SelectionDAG &DAG) const; SDValue opJoin(const VectorPair &Ops, const SDLoc &dl, SelectionDAG &DAG) const; @@ -453,6 +458,12 @@ private: bool ZeroExt, SelectionDAG &DAG) const; SDValue compressHvxPred(SDValue VecQ, const SDLoc &dl, MVT ResTy, SelectionDAG &DAG) const; + SDValue resizeToWidth(SDValue VecV, MVT ResTy, bool Signed, const SDLoc &dl, + SelectionDAG &DAG) const; + VectorPair emitHvxAddWithOverflow(SDValue A, SDValue B, const SDLoc &dl, + bool Signed, SelectionDAG &DAG) const; + VectorPair emitHvxShiftRightRnd(SDValue Val, unsigned Amt, bool Signed, + SelectionDAG &DAG) const; SDValue LowerHvxBuildVector(SDValue Op, SelectionDAG &DAG) const; SDValue LowerHvxSplatVector(SDValue Op, SelectionDAG &DAG) const; @@ -474,7 +485,10 @@ private: SDValue LowerHvxIntrinsic(SDValue Op, SelectionDAG &DAG) const; SDValue LowerHvxMaskedOp(SDValue Op, SelectionDAG &DAG) const; SDValue LowerHvxFpExtend(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerHvxConvertFpInt(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerHvxFpToInt(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerHvxIntToFp(SDValue Op, SelectionDAG &DAG) const; + SDValue ExpandHvxFpToInt(SDValue Op, SelectionDAG &DAG) const; + SDValue ExpandHvxIntToFp(SDValue Op, SelectionDAG &DAG) const; VectorPair SplitVectorOp(SDValue Op, SelectionDAG &DAG) const; @@ -484,11 +498,15 @@ private: SDValue WidenHvxSetCC(SDValue Op, SelectionDAG &DAG) const; SDValue WidenHvxExtend(SDValue Op, SelectionDAG &DAG) const; SDValue WidenHvxTruncate(SDValue Op, SelectionDAG &DAG) const; + SDValue WidenHvxFpIntConv(SDValue Op, SelectionDAG &DAG) const; + SDValue ExpandHvxResizeIntoSteps(SDValue Op, SelectionDAG &DAG) const; + SDValue EqualizeFpIntConversion(SDValue Op, SelectionDAG &DAG) const; std::pair findRepresentativeClass(const TargetRegisterInfo *TRI, MVT VT) const override; + bool shouldSplitToHvx(MVT Ty, SelectionDAG &DAG) const; bool shouldWidenToHvx(MVT Ty, SelectionDAG &DAG) const; bool isHvxOperation(SDNode *N, SelectionDAG &DAG) const; SDValue LowerHvxOperation(SDValue Op, SelectionDAG &DAG) const; diff --git a/llvm/lib/Target/Hexagon/HexagonISelLoweringHVX.cpp b/llvm/lib/Target/Hexagon/HexagonISelLoweringHVX.cpp index 1729bf0..10dc9f6 100755 --- a/llvm/lib/Target/Hexagon/HexagonISelLoweringHVX.cpp +++ b/llvm/lib/Target/Hexagon/HexagonISelLoweringHVX.cpp @@ -24,6 +24,22 @@ static const MVT LegalW64[] = { MVT::v128i8, MVT::v64i16, MVT::v32i32 }; static const MVT LegalV128[] = { MVT::v128i8, MVT::v64i16, MVT::v32i32 }; static const MVT LegalW128[] = { MVT::v256i8, MVT::v128i16, MVT::v64i32 }; +static std::tuple getIEEEProperties(MVT Ty) { + // For a float scalar type, return (exp-bits, exp-bias, fraction-bits) + MVT ElemTy = Ty.getScalarType(); + switch (ElemTy.SimpleTy) { + case MVT::f16: + return std::make_tuple(5, 15, 10); + case MVT::f32: + return std::make_tuple(8, 127, 23); + case MVT::f64: + return std::make_tuple(11, 1023, 52); + default: + break; + } + llvm_unreachable(("Unexpected type: " + EVT(ElemTy).getEVTString()).c_str()); +} + void HexagonTargetLowering::initializeHVXLowering() { if (Subtarget.useHVX64BOps()) { @@ -214,12 +230,8 @@ HexagonTargetLowering::initializeHVXLowering() { 
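// A worked example of the (exp-bits, exp-bias, fraction-bits) triples that
// getIEEEProperties() above returns, using the IEEE-754 f32 layout {8, 127, 23}
// (the concrete constants below are illustrative only):
//
//   1.0f  = 0x3F800000 = sign 0 | exp 0x7F (biased 127, i.e. 2^0) | frac 0
//   -2.5f = 0xC0200000 = sign 1 | exp 0x80 (biased 128, i.e. 2^1) | frac 0x200000
//
// The FP-to-int and int-to-FP expansions later in this file slice values into
// exactly these three fields.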
setPromoteTo(ISD::VECTOR_SHUFFLE, T, ByteV); } - if (Subtarget.useHVXQFloatOps()) { - setOperationAction(ISD::SINT_TO_FP, T, Expand); - setOperationAction(ISD::UINT_TO_FP, T, Expand); - setOperationAction(ISD::FP_TO_SINT, T, Expand); - setOperationAction(ISD::FP_TO_UINT, T, Expand); - } else if (Subtarget.useHVXIEEEFPOps()) { + if (Subtarget.useHVXFloatingPoint()) { + // Same action for both QFloat and IEEE. setOperationAction(ISD::SINT_TO_FP, T, Custom); setOperationAction(ISD::UINT_TO_FP, T, Custom); setOperationAction(ISD::FP_TO_SINT, T, Custom); @@ -289,10 +301,13 @@ HexagonTargetLowering::initializeHVXLowering() { setOperationAction(ISD::UMAX, T, Custom); } - setOperationAction(ISD::SINT_TO_FP, T, Custom); - setOperationAction(ISD::UINT_TO_FP, T, Custom); - setOperationAction(ISD::FP_TO_SINT, T, Custom); - setOperationAction(ISD::FP_TO_UINT, T, Custom); + if (Subtarget.useHVXFloatingPoint()) { + // Same action for both QFloat and IEEE. + setOperationAction(ISD::SINT_TO_FP, T, Custom); + setOperationAction(ISD::UINT_TO_FP, T, Custom); + setOperationAction(ISD::FP_TO_SINT, T, Custom); + setOperationAction(ISD::FP_TO_UINT, T, Custom); + } } setCondCodeAction(ISD::SETNE, MVT::v64f16, Expand); @@ -380,6 +395,12 @@ HexagonTargetLowering::initializeHVXLowering() { setOperationAction(ISD::ANY_EXTEND, VecTy, Custom); setOperationAction(ISD::SIGN_EXTEND, VecTy, Custom); setOperationAction(ISD::ZERO_EXTEND, VecTy, Custom); + if (Subtarget.useHVXFloatingPoint()) { + setOperationAction(ISD::FP_TO_SINT, VecTy, Custom); + setOperationAction(ISD::FP_TO_UINT, VecTy, Custom); + setOperationAction(ISD::SINT_TO_FP, VecTy, Custom); + setOperationAction(ISD::UINT_TO_FP, VecTy, Custom); + } MVT BoolTy = MVT::getVectorVT(MVT::i1, N); if (!isTypeLegal(BoolTy)) @@ -419,10 +440,13 @@ HexagonTargetLowering::getPreferredHvxVectorAction(MVT VecTy) const { // any scientific way. if (llvm::is_contained(Tys, ElemTy)) { unsigned VecWidth = VecTy.getSizeInBits(); + unsigned HwWidth = 8*HwLen; + if (VecWidth > 2*HwWidth) + return TargetLoweringBase::TypeSplitVector; + bool HaveThreshold = HvxWidenThreshold.getNumOccurrences() > 0; if (HaveThreshold && 8*HvxWidenThreshold <= VecWidth) return TargetLoweringBase::TypeWidenVector; - unsigned HwWidth = 8*HwLen; if (VecWidth >= HwWidth/2 && VecWidth < HwWidth) return TargetLoweringBase::TypeWidenVector; } @@ -1405,6 +1429,34 @@ HexagonTargetLowering::compressHvxPred(SDValue VecQ, const SDLoc &dl, } SDValue +HexagonTargetLowering::resizeToWidth(SDValue VecV, MVT ResTy, bool Signed, + const SDLoc &dl, SelectionDAG &DAG) const { + // Take a vector and resize the element type to match the given type. + MVT InpTy = ty(VecV); + if (InpTy == ResTy) + return VecV; + + unsigned InpWidth = InpTy.getSizeInBits(); + unsigned ResWidth = ResTy.getSizeInBits(); + + if (InpTy.isFloatingPoint()) { + return InpWidth < ResWidth ? DAG.getNode(ISD::FP_EXTEND, dl, ResTy, VecV) + : DAG.getNode(ISD::FP_ROUND, dl, ResTy, VecV, + getZero(dl, MVT::i32, DAG)); + } + + assert(InpTy.isInteger()); + + if (InpWidth < ResWidth) { + unsigned ExtOpc = Signed ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND; + return DAG.getNode(ExtOpc, dl, ResTy, VecV); + } else { + unsigned NarOpc = Signed ? 
HexagonISD::SSAT : HexagonISD::USAT; + return DAG.getNode(NarOpc, dl, ResTy, VecV, DAG.getValueType(ResTy)); + } +} + +SDValue HexagonTargetLowering::LowerHvxBuildVector(SDValue Op, SelectionDAG &DAG) const { const SDLoc &dl(Op); @@ -1488,7 +1540,7 @@ HexagonTargetLowering::LowerHvxConcatVectors(SDValue Op, SelectionDAG &DAG) SDValue V = Elems[i]; MVT Ty = ty(V); if (!isTypeLegal(Ty)) { - EVT NTy = getTypeToTransformTo(*DAG.getContext(), Ty); + MVT NTy = typeLegalize(Ty, DAG); if (V.getOpcode() == ISD::EXTRACT_VECTOR_ELT) { Elems[i] = DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, NTy, DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, NTy, @@ -1934,7 +1986,7 @@ HexagonTargetLowering::LowerHvxShift(SDValue Op, SelectionDAG &DAG) const { SDValue HexagonTargetLowering::LowerHvxIntrinsic(SDValue Op, SelectionDAG &DAG) const { - const SDLoc &dl(Op); + const SDLoc &dl(Op); MVT ResTy = ty(Op); unsigned IntNo = cast(Op.getOperand(0))->getZExtValue(); @@ -2026,7 +2078,8 @@ HexagonTargetLowering::LowerHvxMaskedOp(SDValue Op, SelectionDAG &DAG) const { SDValue HexagonTargetLowering::LowerHvxFpExtend(SDValue Op, SelectionDAG &DAG) const { - // This conversion only applies to QFloat. + // This conversion only applies to QFloat. IEEE extension from f16 to f32 + // is legal (done via a pattern). assert(Subtarget.useHVXQFloatOps()); assert(Op->getOpcode() == ISD::FP_EXTEND); @@ -2060,42 +2113,429 @@ SDValue HexagonTargetLowering::LowerHvxFpExtend(SDValue Op, } SDValue -HexagonTargetLowering::LowerHvxConvertFpInt(SDValue Op, SelectionDAG &DAG) - const { - // This conversion only applies to IEEE. - assert(Subtarget.useHVXIEEEFPOps()); +HexagonTargetLowering::LowerHvxFpToInt(SDValue Op, SelectionDAG &DAG) const { + unsigned Opc = Op.getOpcode(); + // Catch invalid conversion ops (just in case). + assert(Opc == ISD::FP_TO_SINT || Opc == ISD::FP_TO_UINT); + MVT ResTy = ty(Op); + MVT FpTy = ty(Op.getOperand(0)).getVectorElementType(); + MVT IntTy = ResTy.getVectorElementType(); + + if (Subtarget.useHVXIEEEFPOps()) { + // There are only conversions from f16. + if (FpTy == MVT::f16) { + // Other int types aren't legal in HVX, so we shouldn't see them here. + assert(IntTy == MVT::i8 || IntTy == MVT::i16 || IntTy == MVT::i32); + // Conversions to i8 and i16 are legal. + if (IntTy == MVT::i8 || IntTy == MVT::i16) + return Op; + } + } + + if (IntTy.getSizeInBits() != FpTy.getSizeInBits()) + return EqualizeFpIntConversion(Op, DAG); + + return ExpandHvxFpToInt(Op, DAG); +} + +SDValue +HexagonTargetLowering::LowerHvxIntToFp(SDValue Op, SelectionDAG &DAG) const { unsigned Opc = Op.getOpcode(); // Catch invalid conversion ops (just in case). + assert(Opc == ISD::SINT_TO_FP || Opc == ISD::UINT_TO_FP); + + MVT ResTy = ty(Op); + MVT IntTy = ty(Op.getOperand(0)).getVectorElementType(); + MVT FpTy = ResTy.getVectorElementType(); + + if (Subtarget.useHVXIEEEFPOps()) { + // There are only conversions to f16. + if (FpTy == MVT::f16) { + // Other int types aren't legal in HVX, so we shouldn't see them here. + assert(IntTy == MVT::i8 || IntTy == MVT::i16 || IntTy == MVT::i32); + // i8, i16 -> f16 is legal. + if (IntTy == MVT::i8 || IntTy == MVT::i16) + return Op; + } + } + + if (IntTy.getSizeInBits() != FpTy.getSizeInBits()) + return EqualizeFpIntConversion(Op, DAG); + + return ExpandHvxIntToFp(Op, DAG); +} + +HexagonTargetLowering::TypePair +HexagonTargetLowering::typeExtendToWider(MVT Ty0, MVT Ty1) const { + // Compare the widths of elements of the two types, and extend the narrower + // type to match the with of the wider type. 
For vector types, apply this + // to the element type. + assert(Ty0.isVector() == Ty1.isVector()); + + MVT ElemTy0 = Ty0.getScalarType(); + MVT ElemTy1 = Ty1.getScalarType(); + + unsigned Width0 = ElemTy0.getSizeInBits(); + unsigned Width1 = ElemTy1.getSizeInBits(); + unsigned MaxWidth = std::max(Width0, Width1); + + auto getScalarWithWidth = [](MVT ScalarTy, unsigned Width) { + if (ScalarTy.isInteger()) + return MVT::getIntegerVT(Width); + assert(ScalarTy.isFloatingPoint()); + return MVT::getFloatingPointVT(Width); + }; + + MVT WideETy0 = getScalarWithWidth(ElemTy0, MaxWidth); + MVT WideETy1 = getScalarWithWidth(ElemTy1, MaxWidth); + + if (!Ty0.isVector()) { + // Both types are scalars. + return {WideETy0, WideETy1}; + } + + // Vector types. + unsigned NumElem = Ty0.getVectorNumElements(); + assert(NumElem == Ty1.getVectorNumElements()); + + return {MVT::getVectorVT(WideETy0, NumElem), + MVT::getVectorVT(WideETy1, NumElem)}; +} + +HexagonTargetLowering::TypePair +HexagonTargetLowering::typeWidenToWider(MVT Ty0, MVT Ty1) const { + // Compare the numbers of elements of two vector types, and widen the + // narrower one to match the number of elements in the wider one. + assert(Ty0.isVector() && Ty1.isVector()); + + unsigned Len0 = Ty0.getVectorNumElements(); + unsigned Len1 = Ty1.getVectorNumElements(); + if (Len0 == Len1) + return {Ty0, Ty1}; + + unsigned MaxLen = std::max(Len0, Len1); + return {MVT::getVectorVT(Ty0.getVectorElementType(), MaxLen), + MVT::getVectorVT(Ty1.getVectorElementType(), MaxLen)}; +} + +MVT +HexagonTargetLowering::typeLegalize(MVT Ty, SelectionDAG &DAG) const { + EVT LegalTy = getTypeToTransformTo(*DAG.getContext(), Ty); + assert(LegalTy.isSimple()); + return LegalTy.getSimpleVT(); +} + +HexagonTargetLowering::VectorPair +HexagonTargetLowering::emitHvxAddWithOverflow(SDValue A, SDValue B, + const SDLoc &dl, bool Signed, SelectionDAG &DAG) const { + // Compute A+B, return {A+B, O}, where O = vector predicate indicating + // whether an overflow has occured. + MVT ResTy = ty(A); + assert(ResTy == ty(B)); + MVT PredTy = MVT::getVectorVT(MVT::i1, ResTy.getVectorNumElements()); + + if (!Signed) { + // V62+ has V6_vaddcarry, but it requires input predicate, so it doesn't + // save any instructions. + SDValue Add = DAG.getNode(ISD::ADD, dl, ResTy, {A, B}); + SDValue Ovf = DAG.getSetCC(dl, PredTy, Add, A, ISD::SETULT); + return {Add, Ovf}; + } + + // Signed overflow has happened, if: + // (A, B have the same sign) and (A+B has a different sign from either) + // i.e. (~A xor B) & ((A+B) xor B), then check the sign bit + SDValue Add = DAG.getNode(ISD::ADD, dl, ResTy, {A, B}); + SDValue NotA = + DAG.getNode(ISD::XOR, dl, ResTy, {A, DAG.getConstant(-1, dl, ResTy)}); + SDValue Xor0 = DAG.getNode(ISD::XOR, dl, ResTy, {NotA, B}); + SDValue Xor1 = DAG.getNode(ISD::XOR, dl, ResTy, {Add, B}); + SDValue And = DAG.getNode(ISD::AND, dl, ResTy, {Xor0, Xor1}); + SDValue MSB = + DAG.getSetCC(dl, PredTy, And, getZero(dl, ResTy, DAG), ISD::SETLT); + return {Add, MSB}; +} + +HexagonTargetLowering::VectorPair +HexagonTargetLowering::emitHvxShiftRightRnd(SDValue Val, unsigned Amt, + bool Signed, SelectionDAG &DAG) const { + // Shift Val right by Amt bits, round the result to the nearest integer, + // tie-break by rounding halves to even integer. + + const SDLoc &dl(Val); + MVT ValTy = ty(Val); + + // This should also work for signed integers. 
+ // + // uint tmp0 = inp + ((1 << (Amt-1)) - 1); + // bool ovf = (inp > tmp0); + // uint rup = inp & (1 << (Amt+1)); + // + // uint tmp1 = inp >> (Amt-1); // tmp1 == tmp2 iff + // uint tmp2 = tmp0 >> (Amt-1); // the Amt-1 lower bits were all 0 + // uint tmp3 = tmp2 + rup; + // uint frac = (tmp1 != tmp2) ? tmp2 >> 1 : tmp3 >> 1; + unsigned ElemWidth = ValTy.getVectorElementType().getSizeInBits(); + MVT ElemTy = MVT::getIntegerVT(ElemWidth); + MVT IntTy = tyVector(ValTy, ElemTy); + MVT PredTy = MVT::getVectorVT(MVT::i1, IntTy.getVectorNumElements()); + unsigned ShRight = Signed ? ISD::SRA : ISD::SRL; + + SDValue Inp = DAG.getBitcast(IntTy, Val); + SDValue LowBits = DAG.getConstant((1u << (Amt - 1)) - 1, dl, IntTy); + + SDValue AmtP1 = DAG.getConstant(1u << Amt, dl, IntTy); + SDValue And = DAG.getNode(ISD::AND, dl, IntTy, {Inp, AmtP1}); + SDValue Zero = getZero(dl, IntTy, DAG); + SDValue Bit = DAG.getSetCC(dl, PredTy, And, Zero, ISD::SETNE); + SDValue Rup = DAG.getZExtOrTrunc(Bit, dl, IntTy); + auto [Tmp0, Ovf] = emitHvxAddWithOverflow(Inp, LowBits, dl, Signed, DAG); + + SDValue AmtM1 = DAG.getConstant(Amt - 1, dl, IntTy); + SDValue Tmp1 = DAG.getNode(ShRight, dl, IntTy, Inp, AmtM1); + SDValue Tmp2 = DAG.getNode(ShRight, dl, IntTy, Tmp0, AmtM1); + SDValue Tmp3 = DAG.getNode(ISD::ADD, dl, IntTy, Tmp2, Rup); + + SDValue Eq = DAG.getSetCC(dl, PredTy, Tmp1, Tmp2, ISD::SETEQ); + SDValue One = DAG.getConstant(1, dl, IntTy); + SDValue Tmp4 = DAG.getNode(ShRight, dl, IntTy, {Tmp2, One}); + SDValue Tmp5 = DAG.getNode(ShRight, dl, IntTy, {Tmp3, One}); + SDValue Mux = DAG.getNode(ISD::VSELECT, dl, IntTy, {Eq, Tmp5, Tmp4}); + return {Mux, Ovf}; +} + +SDValue +HexagonTargetLowering::EqualizeFpIntConversion(SDValue Op, SelectionDAG &DAG) + const { + // Rewrite conversion between integer and floating-point in such a way that + // the integer type is extended/narrowed to match the bitwidth of the + // floating-point type, combined with additional integer-integer extensions + // or narrowings to match the original input/result types. + // E.g. f32 -> i8 ==> f32 -> i32 -> i8 + // + // The input/result types are not required to be legal, but if they are + // legal, this function should not introduce illegal types. + + unsigned Opc = Op.getOpcode(); assert(Opc == ISD::FP_TO_SINT || Opc == ISD::FP_TO_UINT || Opc == ISD::SINT_TO_FP || Opc == ISD::UINT_TO_FP); + + SDValue Inp = Op.getOperand(0); + MVT InpTy = ty(Inp); MVT ResTy = ty(Op); - if (Opc == ISD::FP_TO_SINT || Opc == ISD::FP_TO_UINT) { - MVT FpTy = ty(Op.getOperand(0)).getVectorElementType(); - // There are only conversions of f16. - if (FpTy != MVT::f16) - return SDValue(); - - MVT IntTy = ResTy.getVectorElementType(); - // Other int types aren't legal in HVX, so we shouldn't see them here. - assert(IntTy == MVT::i8 || IntTy == MVT::i16 || IntTy == MVT::i32); - // Conversions to i8 and i16 are legal. 
- if (IntTy == MVT::i8 || IntTy == MVT::i16) - return Op; + if (InpTy == ResTy) + return Op; + + const SDLoc &dl(Op); + bool Signed = Opc == ISD::FP_TO_SINT || Opc == ISD::SINT_TO_FP; + + auto [WInpTy, WResTy] = typeExtendToWider(InpTy, ResTy); + SDValue WInp = resizeToWidth(Inp, WInpTy, Signed, dl, DAG); + SDValue Conv = DAG.getNode(Opc, dl, WResTy, WInp); + SDValue Res = resizeToWidth(Conv, ResTy, Signed, dl, DAG); + return Res; +} + +SDValue +HexagonTargetLowering::ExpandHvxFpToInt(SDValue Op, SelectionDAG &DAG) const { + unsigned Opc = Op.getOpcode(); + assert(Opc == ISD::FP_TO_SINT || Opc == ISD::FP_TO_UINT); + + const SDLoc &dl(Op); + SDValue Op0 = Op.getOperand(0); + MVT InpTy = ty(Op0); + MVT ResTy = ty(Op); + assert(InpTy.changeTypeToInteger() == ResTy); + + // int32_t conv_f32_to_i32(uint32_t inp) { + // // s | exp8 | frac23 + // + // int neg = (int32_t)inp < 0; + // + // // "expm1" is the actual exponent minus 1: instead of "bias", subtract + // // "bias+1". When the encoded exp is "all-1" (i.e. inf/nan), this will + // // produce a large positive "expm1", which will result in max u/int. + // // In all IEEE formats, bias is the largest positive number that can be + // // represented in bias-width bits (i.e. 011..1). + // int32_t expm1 = (inp << 1) - 0x80000000; + // expm1 >>= 24; + // + // // Always insert the "implicit 1". Subnormal numbers will become 0 + // // regardless. + // uint32_t frac = (inp << 8) | 0x80000000; + // + // // "frac" is the fraction part represented as Q1.31. If it was + // // interpreted as uint32_t, it would be the fraction part multiplied + // // by 2^31. + // + // // Calculate the amount of right shift, since shifting further to the + // // left would lose significant bits. Limit it to 32, because we want + // // shifts by 32+ to produce 0, whereas V6_vlsrwv treats the shift + // // amount as a 6-bit signed value (so 33 is same as -31, i.e. shift + // // left by 31). "rsh" can be negative. + // int32_t rsh = min(31 - (expm1 + 1), 32); + // + // frac >>= rsh; // rsh == 32 will produce 0 + // + // // Everything up to this point is the same for conversion to signed + // // unsigned integer. + // + // if (neg) // Only for signed int + // frac = -frac; // + // if (rsh <= 0 && neg) // bound = neg ? 0x80000000 : 0x7fffffff + // frac = 0x80000000; // frac = rsh <= 0 ? bound : frac + // if (rsh <= 0 && !neg) // + // frac = 0x7fffffff; // + // + // if (neg) // Only for unsigned int + // frac = 0; // + // if (rsh < 0 && !neg) // frac = rsh < 0 ? 0x7fffffff : frac; + // frac = 0x7fffffff; // frac = neg ? 
0 : frac; + // + // return frac; + // } + + MVT PredTy = MVT::getVectorVT(MVT::i1, ResTy.getVectorElementCount()); + + // Zero = V6_vd0(); + // Neg = V6_vgtw(Zero, Inp); + // One = V6_lvsplatw(1); + // M80 = V6_lvsplatw(0x80000000); + // Exp00 = V6_vaslwv(Inp, One); + // Exp01 = V6_vsubw(Exp00, M80); + // ExpM1 = V6_vasrw(Exp01, 24); + // Frc00 = V6_vaslw(Inp, 8); + // Frc01 = V6_vor(Frc00, M80); + // Rsh00 = V6_vsubw(V6_lvsplatw(30), ExpM1); + // Rsh01 = V6_vminw(Rsh00, V6_lvsplatw(32)); + // Frc02 = V6_vlsrwv(Frc01, Rsh01); + + // if signed int: + // Bnd = V6_vmux(Neg, M80, V6_lvsplatw(0x7fffffff)) + // Pos = V6_vgtw(Rsh01, Zero); + // Frc13 = V6_vsubw(Zero, Frc02); + // Frc14 = V6_vmux(Neg, Frc13, Frc02); + // Int = V6_vmux(Pos, Frc14, Bnd); + // + // if unsigned int: + // Rsn = V6_vgtw(Zero, Rsh01) + // Frc23 = V6_vmux(Rsn, V6_lvsplatw(0x7fffffff), Frc02) + // Int = V6_vmux(Neg, Zero, Frc23) + + auto [ExpWidth, ExpBias, FracWidth] = getIEEEProperties(InpTy); + unsigned ElemWidth = 1 + ExpWidth + FracWidth; + assert(1u << (ExpWidth - 1) == 1 + ExpBias); + + SDValue Inp = DAG.getBitcast(ResTy, Op0); + SDValue Zero = getZero(dl, ResTy, DAG); + SDValue Neg = DAG.getSetCC(dl, PredTy, Inp, Zero, ISD::SETLT); + SDValue M80 = DAG.getConstant(1u << (ElemWidth - 1), dl, ResTy); + SDValue M7F = DAG.getConstant((1u << (ElemWidth - 1)) - 1, dl, ResTy); + SDValue One = DAG.getConstant(1, dl, ResTy); + SDValue Exp00 = DAG.getNode(ISD::SHL, dl, ResTy, {Inp, One}); + SDValue Exp01 = DAG.getNode(ISD::SUB, dl, ResTy, {Exp00, M80}); + SDValue MNE = DAG.getConstant(ElemWidth - ExpWidth, dl, ResTy); + SDValue ExpM1 = DAG.getNode(ISD::SRA, dl, ResTy, {Exp01, MNE}); + + SDValue ExpW = DAG.getConstant(ExpWidth, dl, ResTy); + SDValue Frc00 = DAG.getNode(ISD::SHL, dl, ResTy, {Inp, ExpW}); + SDValue Frc01 = DAG.getNode(ISD::OR, dl, ResTy, {Frc00, M80}); + + SDValue MN2 = DAG.getConstant(ElemWidth - 2, dl, ResTy); + SDValue Rsh00 = DAG.getNode(ISD::SUB, dl, ResTy, {MN2, ExpM1}); + SDValue MW = DAG.getConstant(ElemWidth, dl, ResTy); + SDValue Rsh01 = DAG.getNode(ISD::SMIN, dl, ResTy, {Rsh00, MW}); + SDValue Frc02 = DAG.getNode(ISD::SRL, dl, ResTy, {Frc01, Rsh01}); + + SDValue Int; + + if (Opc == ISD::FP_TO_SINT) { + SDValue Bnd = DAG.getNode(ISD::VSELECT, dl, ResTy, {Neg, M80, M7F}); + SDValue Pos = DAG.getSetCC(dl, PredTy, Rsh01, Zero, ISD::SETGT); + SDValue Frc13 = DAG.getNode(ISD::SUB, dl, ResTy, {Zero, Frc02}); + SDValue Frc14 = DAG.getNode(ISD::VSELECT, dl, ResTy, {Neg, Frc13, Frc02}); + Int = DAG.getNode(ISD::VSELECT, dl, ResTy, {Pos, Frc14, Bnd}); } else { - // Converting int -> fp. - if (ResTy.getVectorElementType() != MVT::f16) - return SDValue(); - MVT IntTy = ty(Op.getOperand(0)).getVectorElementType(); - // Other int types aren't legal in HVX, so we shouldn't see them here. - assert(IntTy == MVT::i8 || IntTy == MVT::i16 || IntTy == MVT::i32); - // i8, i16 -> f16 is legal. 
- if (IntTy == MVT::i8 || IntTy == MVT::i16) - return Op; + assert(Opc == ISD::FP_TO_UINT); + SDValue Rsn = DAG.getSetCC(dl, PredTy, Rsh01, Zero, ISD::SETLT); + SDValue Frc23 = DAG.getNode(ISD::VSELECT, dl, ResTy, Rsn, M7F, Frc02); + Int = DAG.getNode(ISD::VSELECT, dl, ResTy, Neg, Zero, Frc23); } - return SDValue(); + return Int; +} + +SDValue +HexagonTargetLowering::ExpandHvxIntToFp(SDValue Op, SelectionDAG &DAG) const { + unsigned Opc = Op.getOpcode(); + assert(Opc == ISD::SINT_TO_FP || Opc == ISD::UINT_TO_FP); + + const SDLoc &dl(Op); + SDValue Op0 = Op.getOperand(0); + MVT InpTy = ty(Op0); + MVT ResTy = ty(Op); + assert(ResTy.changeTypeToInteger() == InpTy); + + // uint32_t vnoc1_rnd(int32_t w) { + // int32_t iszero = w == 0; + // int32_t isneg = w < 0; + // uint32_t u = __builtin_HEXAGON_A2_abs(w); + // + // uint32_t norm_left = __builtin_HEXAGON_S2_cl0(u) + 1; + // uint32_t frac0 = (uint64_t)u << norm_left; + // + // // Rounding: + // uint32_t frac1 = frac0 + ((1 << 8) - 1); + // uint32_t renorm = (frac0 > frac1); + // uint32_t rup = (int)(frac0 << 22) < 0; + // + // uint32_t frac2 = frac0 >> 8; + // uint32_t frac3 = frac1 >> 8; + // uint32_t frac = (frac2 != frac3) ? frac3 >> 1 : (frac3 + rup) >> 1; + // + // int32_t exp = 32 - norm_left + renorm + 127; + // exp <<= 23; + // + // uint32_t sign = 0x80000000 * isneg; + // uint32_t f = sign | exp | frac; + // return iszero ? 0 : f; + // } + + MVT PredTy = MVT::getVectorVT(MVT::i1, InpTy.getVectorElementCount()); + bool Signed = Opc == ISD::SINT_TO_FP; + + auto [ExpWidth, ExpBias, FracWidth] = getIEEEProperties(ResTy); + unsigned ElemWidth = 1 + ExpWidth + FracWidth; + + SDValue Zero = getZero(dl, InpTy, DAG); + SDValue One = DAG.getConstant(1, dl, InpTy); + SDValue IsZero = DAG.getSetCC(dl, PredTy, Op0, Zero, ISD::SETEQ); + SDValue Abs = Signed ? DAG.getNode(ISD::ABS, dl, InpTy, Op0) : Op0; + SDValue Clz = DAG.getNode(ISD::CTLZ, dl, InpTy, Abs); + SDValue NLeft = DAG.getNode(ISD::ADD, dl, InpTy, {Clz, One}); + SDValue Frac0 = DAG.getNode(ISD::SHL, dl, InpTy, {Abs, NLeft}); + + auto [Frac, Ovf] = emitHvxShiftRightRnd(Frac0, ExpWidth + 1, false, DAG); + if (Signed) { + SDValue IsNeg = DAG.getSetCC(dl, PredTy, Op0, Zero, ISD::SETLT); + SDValue M80 = DAG.getConstant(1 << (ElemWidth - 1), dl, InpTy); + SDValue Sign = DAG.getNode(ISD::VSELECT, dl, InpTy, {IsNeg, M80, Zero}); + Frac = DAG.getNode(ISD::OR, dl, InpTy, {Sign, Frac}); + } + + SDValue Rnrm = DAG.getZExtOrTrunc(Ovf, dl, InpTy); + SDValue Exp0 = DAG.getConstant(ElemWidth + ExpBias, dl, InpTy); + SDValue Exp1 = DAG.getNode(ISD::ADD, dl, InpTy, {Rnrm, Exp0}); + SDValue Exp2 = DAG.getNode(ISD::SUB, dl, InpTy, {Exp1, NLeft}); + SDValue Exp3 = DAG.getNode(ISD::SHL, dl, InpTy, + {Exp2, DAG.getConstant(FracWidth, dl, InpTy)}); + SDValue Flt0 = DAG.getNode(ISD::OR, dl, InpTy, {Frac, Exp3}); + SDValue Flt1 = DAG.getNode(ISD::VSELECT, dl, InpTy, {IsZero, Zero, Flt0}); + SDValue Flt = DAG.getBitcast(ResTy, Flt1); + + return Flt; } HexagonTargetLowering::VectorPair @@ -2116,6 +2556,8 @@ HexagonTargetLowering::SplitVectorOp(SDValue Op, SelectionDAG &DAG) const { // Special case for type operand. 
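// Two worked scalar examples for the ExpandHvxFpToInt / ExpandHvxIntToFp
// routines above, following their conv_f32_to_i32 and vnoc1_rnd pseudo-code
// (the concrete inputs are illustrative only):
//
//   f32 -> i32 for inp = 5.0f = 0x40A00000:
//     expm1 = ((inp << 1) - 0x80000000) >> 24 = 0x01400000 >> 24 = 1
//     frac  = (inp << 8) | 0x80000000 = 0xA0000000   (Q1.31 encoding of 1.25)
//     rsh   = min(31 - (expm1 + 1), 32) = 29
//     frac >> rsh = 5; the input is positive and rsh > 0, so the result is 5.
//
//   i32 -> f32 for w = 7:
//     norm_left = cl0(7) + 1 = 30, frac0 = 7 << 30 = 0xC0000000 (leading 1 shifted out)
//     frac0 >> 9 = 0x00600000, with no rounding adjustment (the low bits are zero)
//     exp = 32 - norm_left + renorm + 127 = 129, exp << 23 = 0x40800000
//     result = 0x40800000 | 0x00600000 = 0x40E00000 = 7.0f.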
switch (Op.getOpcode()) { case ISD::SIGN_EXTEND_INREG: + case HexagonISD::SSAT: + case HexagonISD::USAT: if (const auto *N = dyn_cast<VTSDNode>(A.getNode())) std::tie(Lo, Hi) = SplitVTNode(N); break; @@ -2298,7 +2740,7 @@ HexagonTargetLowering::WidenHvxSetCC(SDValue Op, SelectionDAG &DAG) const { SDValue SetCC = DAG.getNode(ISD::SETCC, dl, ResTy, {WideOp0, WideOp1, Op.getOperand(2)}); - EVT RetTy = getTypeToTransformTo(*DAG.getContext(), ty(Op)); + EVT RetTy = typeLegalize(ty(Op), DAG); return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, RetTy, {SetCC, getZero(dl, MVT::i32, DAG)}); } @@ -2472,9 +2914,9 @@ HexagonTargetLowering::LowerHvxOperation(SDValue Op, SelectionDAG &DAG) const { case ISD::LOAD: return SDValue(); case ISD::FP_EXTEND: return LowerHvxFpExtend(Op, DAG); case ISD::FP_TO_SINT: - case ISD::FP_TO_UINT: + case ISD::FP_TO_UINT: return LowerHvxFpToInt(Op, DAG); case ISD::SINT_TO_FP: - case ISD::UINT_TO_FP: return LowerHvxConvertFpInt(Op, DAG); + case ISD::UINT_TO_FP: return LowerHvxIntToFp(Op, DAG); } #ifndef NDEBUG Op.dumpr(&DAG); @@ -2482,29 +2924,96 @@ llvm_unreachable("Unhandled HVX operation"); } +SDValue +HexagonTargetLowering::ExpandHvxResizeIntoSteps(SDValue Op, SelectionDAG &DAG) + const { + // Rewrite the extension/truncation/saturation op into steps where each + // step changes the type widths by a factor of 2. + // E.g. i8 -> i16 remains unchanged, but i8 -> i32 ==> i8 -> i16 -> i32. + // + // Some of the vector types in Op may not be legal. + + bool NeedVT = false; + unsigned Opc = Op.getOpcode(); + switch (Opc) { + case HexagonISD::SSAT: + case HexagonISD::USAT: + NeedVT = true; + [[fallthrough]]; + case ISD::ANY_EXTEND: + case ISD::ZERO_EXTEND: + case ISD::SIGN_EXTEND: + case ISD::TRUNCATE: + break; + default: +#ifndef NDEBUG + Op.dump(&DAG); +#endif + llvm_unreachable("Unexpected operation"); + } + + SDValue Inp = Op.getOperand(0); + MVT InpTy = ty(Inp); + MVT ResTy = ty(Op); + + unsigned InpWidth = InpTy.getVectorElementType().getSizeInBits(); + unsigned ResWidth = ResTy.getVectorElementType().getSizeInBits(); + assert(InpWidth != ResWidth); + + if (InpWidth == 2 * ResWidth || ResWidth == 2 * InpWidth) + return Op; + + const SDLoc &dl(Op); + unsigned NumElems = InpTy.getVectorNumElements(); + assert(NumElems == ResTy.getVectorNumElements()); + + auto repeatOp = [&](unsigned NewWidth, SDValue Arg) { + MVT Ty = MVT::getVectorVT(MVT::getIntegerVT(NewWidth), NumElems); + SmallVector<SDValue, 2> Args = {Arg}; + if (NeedVT) + Args.push_back(DAG.getValueType(Ty)); + return DAG.getNode(Opc, dl, Ty, Args); + }; + + SDValue S = Inp; + if (InpWidth < ResWidth) { + assert(ResWidth % InpWidth == 0 && isPowerOf2_32(ResWidth / InpWidth)); + while (InpWidth * 2 <= ResWidth) + S = repeatOp(InpWidth *= 2, S); + } else { + // InpWidth > ResWidth + assert(InpWidth % ResWidth == 0 && isPowerOf2_32(InpWidth / ResWidth)); + while (InpWidth / 2 >= ResWidth) + S = repeatOp(InpWidth /= 2, S); + } + return S; +} + void HexagonTargetLowering::LowerHvxOperationWrapper(SDNode *N, SmallVectorImpl<SDValue> &Results, SelectionDAG &DAG) const { unsigned Opc = N->getOpcode(); SDValue Op(N, 0); + SDValue Inp0; // Optional first argument.
+ if (N->getNumOperands() > 0) + Inp0 = Op.getOperand(0); switch (Opc) { case ISD::ANY_EXTEND: case ISD::SIGN_EXTEND: case ISD::ZERO_EXTEND: - if (shouldWidenToHvx(ty(Op.getOperand(0)), DAG)) { + if (shouldWidenToHvx(ty(Inp0), DAG)) { if (SDValue T = WidenHvxExtend(Op, DAG)) Results.push_back(T); } break; case ISD::SETCC: - if (shouldWidenToHvx(ty(Op.getOperand(0)), DAG)) { + if (shouldWidenToHvx(ty(Inp0), DAG)) { if (SDValue T = WidenHvxSetCC(Op, DAG)) Results.push_back(T); } break; case ISD::TRUNCATE: - if (shouldWidenToHvx(ty(Op.getOperand(0)), DAG)) { + if (shouldWidenToHvx(ty(Inp0), DAG)) { if (SDValue T = WidenHvxTruncate(Op, DAG)) Results.push_back(T); } @@ -2530,6 +3039,29 @@ HexagonTargetLowering::LowerHvxOperationWrapper(SDNode *N, Results.push_back(S); } break; + case ISD::SINT_TO_FP: + case ISD::UINT_TO_FP: + case ISD::FP_TO_SINT: + case ISD::FP_TO_UINT: + if (ty(Op).getSizeInBits() != ty(Inp0).getSizeInBits()) { + SDValue T = EqualizeFpIntConversion(Op, DAG); + Results.push_back(T); + } + break; + case HexagonISD::SSAT: + case HexagonISD::USAT: + if (SDValue T = ExpandHvxResizeIntoSteps(Op, DAG); T != Op) { + Results.push_back(T); + } else if (shouldWidenToHvx(ty(Op), DAG)) { + SDValue W = appendUndef(Inp0, typeJoin({ty(Inp0), ty(Inp0)}), DAG); + MVT WideTy = typeJoin({ty(Op), ty(Op)}); + SDValue T = + DAG.getNode(Opc, SDLoc(Op), WideTy, W, DAG.getValueType(WideTy)); + Results.push_back(T); + } else if (shouldSplitToHvx(ty(Inp0), DAG)) { + Results.push_back(opJoin(SplitVectorOp(Op, DAG), SDLoc(Op), DAG)); + } + break; default: break; } @@ -2540,6 +3072,10 @@ HexagonTargetLowering::ReplaceHvxNodeResults(SDNode *N, SmallVectorImpl &Results, SelectionDAG &DAG) const { unsigned Opc = N->getOpcode(); SDValue Op(N, 0); + SDValue Inp0; // Optional first argument. + if (N->getNumOperands() > 0) + Inp0 = Op.getOperand(0); + switch (Opc) { case ISD::ANY_EXTEND: case ISD::SIGN_EXTEND: @@ -2571,12 +3107,43 @@ HexagonTargetLowering::ReplaceHvxNodeResults(SDNode *N, break; } case ISD::BITCAST: - if (isHvxBoolTy(ty(N->getOperand(0)))) { - SDValue Op(N, 0); + if (isHvxBoolTy(ty(Inp0))) { SDValue C = LowerHvxBitcast(Op, DAG); Results.push_back(C); } break; + case ISD::FP_TO_SINT: + case ISD::FP_TO_UINT: + if (ty(Op).getSizeInBits() != ty(Inp0).getSizeInBits()) { + SDValue T = EqualizeFpIntConversion(Op, DAG); + Results.push_back(T); + } + break; + case HexagonISD::SSAT: + case HexagonISD::USAT: + if (shouldWidenToHvx(ty(Op), DAG)) { + MVT InpTy = ty(Inp0); + MVT WResTy = typeLegalize(ty(Op), DAG); + if (Subtarget.isHVXVectorType(InpTy, true)) { + // If the input is legal it won't be auto-legalized, so we + // need to pad it explicitly. + MVT WInpTy = typeWidenToWider(InpTy, WResTy).first; + Inp0 = appendUndef(Inp0, WInpTy, DAG); + } + SDValue S = DAG.getNode(Opc, SDLoc(Op), WResTy, Inp0, + DAG.getValueType(WResTy)); + SDValue T = ExpandHvxResizeIntoSteps(S, DAG); + Results.push_back(T); + } else { + // Check if we need to split (for example when scalarizing). 
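      // That is: if the result type legalizes to something that is not an
      // HVX vector (e.g. it gets scalarized), lower the saturating op by
      // splitting it in half and rejoining the halves with opJoin below.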
+ MVT LResTy = typeLegalize(ty(Op), DAG); + if (!Subtarget.isHVXVectorType(LResTy, true)) { + Results.push_back(opJoin(SplitVectorOp(Op, DAG), SDLoc(Op), DAG)); + } else { + llvm_unreachable(""); + } + } + break; default: break; } @@ -2638,13 +3205,22 @@ HexagonTargetLowering::PerformHvxDAGCombine(SDNode *N, DAGCombinerInfo &DCI) } bool +HexagonTargetLowering::shouldSplitToHvx(MVT Ty, SelectionDAG &DAG) const { + if (Subtarget.isHVXVectorType(Ty, true)) + return false; + auto Action = getPreferredHvxVectorAction(Ty); + if (Action == TargetLoweringBase::TypeSplitVector) + return Subtarget.isHVXVectorType(typeLegalize(Ty, DAG), true); + return false; +} + +bool HexagonTargetLowering::shouldWidenToHvx(MVT Ty, SelectionDAG &DAG) const { + if (Subtarget.isHVXVectorType(Ty, true)) + return false; auto Action = getPreferredHvxVectorAction(Ty); - if (Action == TargetLoweringBase::TypeWidenVector) { - EVT WideTy = getTypeToTransformTo(*DAG.getContext(), Ty); - assert(WideTy.isSimple()); - return Subtarget.isHVXVectorType(WideTy.getSimpleVT(), true); - } + if (Action == TargetLoweringBase::TypeWidenVector) + return Subtarget.isHVXVectorType(typeLegalize(Ty, DAG), true); return false; } diff --git a/llvm/lib/Target/Hexagon/HexagonPatterns.td b/llvm/lib/Target/Hexagon/HexagonPatterns.td index 80fbf33..cbb437c 100644 --- a/llvm/lib/Target/Hexagon/HexagonPatterns.td +++ b/llvm/lib/Target/Hexagon/HexagonPatterns.td @@ -10,7 +10,7 @@ // (0) Definitions // (1) Immediates // (2) Type casts -// (3) Extend/truncate +// (3) Extend/truncate/saturate // (4) Logical // (5) Compare // (6) Select @@ -98,6 +98,11 @@ def HexagonPFALSE: SDNode<"HexagonISD::PFALSE", SDTVecLeaf>; def HexagonVALIGN: SDNode<"HexagonISD::VALIGN", SDTVecVecIntOp>; def HexagonVALIGNADDR: SDNode<"HexagonISD::VALIGNADDR", SDTIntUnaryOp>; +def SDTSaturate: + SDTypeProfile<1, 2, [SDTCisInt<0>, SDTCisInt<1>, SDTCisVT<2, OtherVT>]>; +def HexagonSSAT: SDNode<"HexagonISD::SSAT", SDTSaturate>; +def HexagonUSAT: SDNode<"HexagonISD::USAT", SDTSaturate>; + def ptrue: PatFrag<(ops), (HexagonPTRUE)>; def pfalse: PatFrag<(ops), (HexagonPFALSE)>; def pnot: PatFrag<(ops node:$Pu), (xor node:$Pu, ptrue)>; @@ -106,6 +111,9 @@ def valign: PatFrag<(ops node:$Vt, node:$Vs, node:$Ru), (HexagonVALIGN node:$Vt, node:$Vs, node:$Ru)>; def valignaddr: PatFrag<(ops node:$Addr), (HexagonVALIGNADDR node:$Addr)>; +def ssat: PatFrag<(ops node:$V, node:$Ty), (HexagonSSAT node:$V, node:$Ty)>; +def usat: PatFrag<(ops node:$V, node:$Ty), (HexagonUSAT node:$V, node:$Ty)>; + // Pattern fragments to extract the low and high subregisters from a // 64-bit value. def LoReg: OutPatFrag<(ops node:$Rs), (EXTRACT_SUBREG (i64 $Rs), isub_lo)>; @@ -477,7 +485,7 @@ defm: NopCast_pat; defm: NopCast_pat; -// --(3) Extend/truncate ------------------------------------------------- +// --(3) Extend/truncate/saturate ---------------------------------------- // def: Pat<(sext_inreg I32:$Rs, i8), (A2_sxtb I32:$Rs)>; @@ -553,6 +561,22 @@ def: Pat<(v4i8 (trunc V4I16:$Rs)), def: Pat<(v2i16 (trunc V2I32:$Rs)), (A2_combine_ll (HiReg $Rs), (LoReg $Rs))>; +// Saturation: +// Note: saturation assumes the same signed-ness for the input and the +// output. 
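// A few concrete values for the i8 case (illustrative only):
//   ssat(i32  300, i8) = 127      ssat(i32 -300, i8) = -128
//   usat(i32  300, i8) = 255      usat(i32   -5, i8) = 0
// Out-of-range results clamp to the bounds of the target type instead of
// wrapping, which is exactly what the A2_sat*/S2_vsat* instructions below do.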
+def: Pat<(i32 (ssat I32:$Rs, i8)), (A2_satb I32:$Rs)>; +def: Pat<(i32 (ssat I32:$Rs, i16)), (A2_sath I32:$Rs)>; +def: Pat<(i32 (ssat I64:$Rs, i32)), (A2_sat I64:$Rs)>; +def: Pat<(i32 (usat I32:$Rs, i8)), (A2_satub I32:$Rs)>; +def: Pat<(i32 (usat I32:$Rs, i16)), (A2_satuh I32:$Rs)>; +def: Pat<(i32 (usat I64:$Rs, i32)), + (C2_mux (C2_cmpeqi (HiReg $Rs), (i32 0)), (LoReg $Rs), (i32 -1))>; + +def: Pat<(v4i8 (ssat V4I16:$Rs, v4i8)), (S2_vsathb V4I16:$Rs)>; +def: Pat<(v2i16 (ssat V2I32:$Rs, v2i16)), (S2_vsatwh V2I32:$Rs)>; +def: Pat<(v4i8 (usat V4I16:$Rs, v4i8)), (S2_vsathub V4I16:$Rs)>; +def: Pat<(v2i16 (usat V2I32:$Rs, v2i16)), (S2_vsatwuh V2I32:$Rs)>; + // --(4) Logical --------------------------------------------------------- // diff --git a/llvm/lib/Target/Hexagon/HexagonPatternsHVX.td b/llvm/lib/Target/Hexagon/HexagonPatternsHVX.td index 119330e..e961612 100644 --- a/llvm/lib/Target/Hexagon/HexagonPatternsHVX.td +++ b/llvm/lib/Target/Hexagon/HexagonPatternsHVX.td @@ -904,3 +904,28 @@ let Predicates = [UseHVXV62], AddedComplexity = 20 in { def: Pat<(VecI16 (abs HVI16:$Vs)), (V6_vabsh HvxVR:$Vs)>; def: Pat<(VecI32 (abs HVI32:$Vs)), (V6_vabsw HvxVR:$Vs)>; +// If a node takes an MVT type as a parameter, the argument must be +// a name of a member of MVT. +multiclass Saturates { + def: Pat<(VecI8 (ssat HWI16:$Vss, HvxTy_i8)), + (V6_vpackhb_sat (HiVec $Vss), (LoVec $Vss))>; + def: Pat<(VecI8 (ssat (concat_vectors HWI32:$Vss, HWI32:$Vtt), HvxTy_i8)), + (V6_vpackhb_sat (V6_vpackwh_sat (HiVec $Vtt), (LoVec $Vtt)), + (V6_vpackwh_sat (HiVec $Vss), (LoVec $Vss)))>; + def: Pat<(VecI16 (ssat HWI32:$Vss, HvxTy_i16)), + (V6_vpackwh_sat (HiVec $Vss), (LoVec $Vss))>; + + def: Pat<(VecI8 (usat HWI16:$Vss, HvxTy_i8)), + (V6_vpackhub_sat (HiVec $Vss), (LoVec $Vss))>; + def: Pat<(VecI8 (usat (concat_vectors HWI32:$Vss, HWI32:$Vtt), HvxTy_i8)), + (V6_vpackhub_sat (V6_vpackwuh_sat (HiVec $Vtt), (LoVec $Vtt)), + (V6_vpackwuh_sat (HiVec $Vss), (LoVec $Vss)))>; + def: Pat<(VecI16 (usat HWI32:$Vss, HvxTy_i16)), + (V6_vpackwuh_sat (HiVec $Vss), (LoVec $Vss))>; +} +let Predicates = [UseHVX64B] in { + defm: Saturates; +} +let Predicates = [UseHVX128B] in { + defm: Saturates; +} diff --git a/llvm/test/CodeGen/Hexagon/autohvx/fp-to-int.ll b/llvm/test/CodeGen/Hexagon/autohvx/fp-to-int.ll new file mode 100644 index 0000000..9ea5d11 --- /dev/null +++ b/llvm/test/CodeGen/Hexagon/autohvx/fp-to-int.ll @@ -0,0 +1,2100 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -march=hexagon -hexagon-hvx-widen=32 < %s | FileCheck %s + +target datalayout = "e-m:e-p:32:32:32-a:0-n16:32-i64:64:64-i32:32:32-i16:16:16-i1:8:8-f32:32:32-f64:64:64-v32:32:32-v64:64:64-v512:512:512-v1024:1024:1024-v2048:2048:2048" +target triple = "hexagon" + +; f16 -> s8 +; No widening +define void @f16s8_0(ptr %a0, ptr %a1) #0 { +; CHECK-LABEL: f16s8_0: +; CHECK: .cfi_startproc +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: { +; CHECK-NEXT: r3:2 = combine(##32768,#1) +; CHECK-NEXT: r4 = #14 +; CHECK-NEXT: v0 = vmem(r0+#0) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v2.h = vsplat(r3) +; CHECK-NEXT: r6 = #5 +; CHECK-NEXT: v3.h = vasl(v0.h,r2) +; CHECK-NEXT: v1 = vmem(r0+#1) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v4.h = vsplat(r4) +; CHECK-NEXT: v8.h = vasl(v1.h,r2) +; CHECK-NEXT: v3.h = vsub(v3.h,v2.h) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: r5:4 = combine(#11,##32767) +; CHECK-NEXT: v7 = vxor(v7,v7) +; CHECK-NEXT: v8.h = vsub(v8.h,v2.h) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: r3 = #16 +; 
CHECK-NEXT: v5.h = vasl(v0.h,r6) +; CHECK-NEXT: q1 = vcmp.gt(v7.h,v1.h) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v6.h = vsplat(r3) +; CHECK-NEXT: v28.h = vasr(v3.h,r5) +; CHECK-NEXT: v5 = vor(v5,v2) +; CHECK-NEXT: q0 = vcmp.gt(v7.h,v0.h) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v9.h = vsplat(r4) +; CHECK-NEXT: v8.h = vasr(v8.h,r5) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v27.h = vasl(v1.h,r6) +; CHECK-NEXT: v1.h = vsub(v4.h,v28.h) +; CHECK-NEXT: v4.h = vsub(v4.h,v8.h) +; CHECK-NEXT: v29 = vmux(q0,v2,v9) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v1.h = vmin(v1.h,v6.h) +; CHECK-NEXT: v0 = vor(v27,v2) +; CHECK-NEXT: v4.h = vmin(v4.h,v6.h) +; CHECK-NEXT: v2 = vmux(q1,v2,v9) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: q2 = vcmp.gt(v1.h,v7.h) +; CHECK-NEXT: q3 = vcmp.gt(v4.h,v7.h) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v5.h = vlsr(v5.h,v1.h) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v0.h = vlsr(v0.h,v4.h) +; CHECK-NEXT: v30.h = vsub(v7.h,v5.h) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v31.h = vsub(v7.h,v0.h) +; CHECK-NEXT: v5 = vmux(q0,v30,v5) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v0 = vmux(q1,v31,v0) +; CHECK-NEXT: v1 = vmux(q2,v5,v29) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v0 = vmux(q3,v0,v2) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v0.b = vpack(v0.h,v1.h):sat +; CHECK-NEXT: jumpr r31 +; CHECK-NEXT: vmem(r1+#0) = v0.new +; CHECK-NEXT: } + %v0 = load <128 x half>, ptr %a0, align 128 + %v1 = fptosi <128 x half> %v0 to <128 x i8> + store <128 x i8> %v1, ptr %a1, align 128 + ret void +} + +; Widen result +define void @f16s8_1(ptr %a0, ptr %a1) #0 { +; CHECK-LABEL: f16s8_1: +; CHECK: .cfi_startproc +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: { +; CHECK-NEXT: r7 = ##32768 +; CHECK-NEXT: r3:2 = combine(#5,#1) +; CHECK-NEXT: v0 = vmem(r0+#0) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v2.h = vsplat(r7) +; CHECK-NEXT: v3.h = vasl(v0.h,r2) +; CHECK-NEXT: r6 = #14 +; CHECK-NEXT: r5 = #11 +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v4.h = vsplat(r6) +; CHECK-NEXT: r4 = #16 +; CHECK-NEXT: v6.h = vasl(v0.h,r3) +; CHECK-NEXT: v3.h = vsub(v3.h,v2.h) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v5.h = vsplat(r4) +; CHECK-NEXT: r3 = #32767 +; CHECK-NEXT: v29 = vor(v6,v2) +; CHECK-NEXT: v1 = vxor(v1,v1) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v30.h = vsplat(r3) +; CHECK-NEXT: r2 = #64 +; CHECK-NEXT: v3.h = vasr(v3.h,r5) +; CHECK-NEXT: q0 = vcmp.gt(v1.h,v0.h) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: q3 = vsetq(r2) +; CHECK-NEXT: v3.h = vsub(v4.h,v3.h) +; CHECK-NEXT: v2 = vmux(q0,v2,v30) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v3.h = vmin(v3.h,v5.h) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: q2 = vcmp.gt(v3.h,v1.h) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v4.h = vlsr(v29.h,v3.h) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v31.h = vsub(v1.h,v4.h) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v0 = vmux(q0,v31,v4) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v0 = vmux(q2,v0,v2) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v0.b = vpack(v0.h,v0.h):sat +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: jumpr r31 +; CHECK-NEXT: if (q3) vmem(r1+#0) = v0 +; CHECK-NEXT: } + %v0 = load <64 x half>, ptr %a0, align 128 + %v1 = fptosi <64 x half> %v0 to <64 x i8> + store <64 x i8> %v1, ptr %a1, align 128 + ret void +} + +; f16 -> s16 +; No widening +define void @f16s16_0(ptr %a0, ptr %a1) #0 { +; CHECK-LABEL: f16s16_0: +; CHECK: .cfi_startproc +; CHECK-NEXT: // %bb.0: +; 
CHECK-NEXT: { +; CHECK-NEXT: r7 = ##32768 +; CHECK-NEXT: r3:2 = combine(#5,#1) +; CHECK-NEXT: v0 = vmem(r0+#0) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v2.h = vsplat(r7) +; CHECK-NEXT: v3.h = vasl(v0.h,r2) +; CHECK-NEXT: r6 = #14 +; CHECK-NEXT: r5 = #11 +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v4.h = vsplat(r6) +; CHECK-NEXT: r4 = #16 +; CHECK-NEXT: v6.h = vasl(v0.h,r3) +; CHECK-NEXT: v3.h = vsub(v3.h,v2.h) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v5.h = vsplat(r4) +; CHECK-NEXT: r2 = #32767 +; CHECK-NEXT: v29 = vor(v6,v2) +; CHECK-NEXT: v1 = vxor(v1,v1) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v30.h = vsplat(r2) +; CHECK-NEXT: v3.h = vasr(v3.h,r5) +; CHECK-NEXT: q0 = vcmp.gt(v1.h,v0.h) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v3.h = vsub(v4.h,v3.h) +; CHECK-NEXT: v2 = vmux(q0,v2,v30) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v3.h = vmin(v3.h,v5.h) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: q3 = vcmp.gt(v3.h,v1.h) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v4.h = vlsr(v29.h,v3.h) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v31.h = vsub(v1.h,v4.h) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v0 = vmux(q0,v31,v4) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v0 = vmux(q3,v0,v2) +; CHECK-NEXT: jumpr r31 +; CHECK-NEXT: vmem(r1+#0) = v0.new +; CHECK-NEXT: } + %v0 = load <64 x half>, ptr %a0, align 128 + %v1 = fptosi <64 x half> %v0 to <64 x i16> + store <64 x i16> %v1, ptr %a1, align 128 + ret void +} + +; Widen input and result +define void @f16s16_1(ptr %a0, ptr %a1) #0 { +; CHECK-LABEL: f16s16_1: +; CHECK: .cfi_startproc +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: { +; CHECK-NEXT: r7 = ##32768 +; CHECK-NEXT: r3:2 = combine(#5,#1) +; CHECK-NEXT: v0 = vmem(r0+#0) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v2.h = vsplat(r7) +; CHECK-NEXT: v3.h = vasl(v0.h,r2) +; CHECK-NEXT: r6 = #14 +; CHECK-NEXT: r5 = #11 +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v4.h = vsplat(r6) +; CHECK-NEXT: r4 = #16 +; CHECK-NEXT: v6.h = vasl(v0.h,r3) +; CHECK-NEXT: v3.h = vsub(v3.h,v2.h) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v5.h = vsplat(r4) +; CHECK-NEXT: r3 = #32767 +; CHECK-NEXT: v29 = vor(v6,v2) +; CHECK-NEXT: v1 = vxor(v1,v1) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v30.h = vsplat(r3) +; CHECK-NEXT: r2 = #64 +; CHECK-NEXT: v3.h = vasr(v3.h,r5) +; CHECK-NEXT: q0 = vcmp.gt(v1.h,v0.h) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: q3 = vsetq(r2) +; CHECK-NEXT: v3.h = vsub(v4.h,v3.h) +; CHECK-NEXT: v2 = vmux(q0,v2,v30) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v3.h = vmin(v3.h,v5.h) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: q1 = vcmp.gt(v3.h,v1.h) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v4.h = vlsr(v29.h,v3.h) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v31.h = vsub(v1.h,v4.h) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v0 = vmux(q0,v31,v4) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v0 = vmux(q1,v0,v2) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: jumpr r31 +; CHECK-NEXT: if (q3) vmem(r1+#0) = v0 +; CHECK-NEXT: } + %v0 = load <32 x half>, ptr %a0, align 128 + %v1 = fptosi <32 x half> %v0 to <32 x i16> + store <32 x i16> %v1, ptr %a1, align 128 + ret void +} + +; f16 -> s32 +; No widening +define void @f16s32_0(ptr %a0, ptr %a1) #0 { +; CHECK-LABEL: f16s32_0: +; CHECK: .cfi_startproc +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: { +; CHECK-NEXT: r2 = #15360 +; CHECK-NEXT: r7 = #-4 +; CHECK-NEXT: v1 = vmem(r0+#0) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v0.h = vsplat(r2) +; 
CHECK-NEXT: r6 = ##-2147483648 +; CHECK-NEXT: r2 = #1 +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v2 = vsplat(r6) +; CHECK-NEXT: r4 = #32 +; CHECK-NEXT: r5 = #8 +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v7 = vsplat(r4) +; CHECK-NEXT: v24 = vxor(v24,v24) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v1:0.qf32 = vmpy(v1.hf,v0.hf) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v0.sf = v0.qf32 +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v1.sf = v1.qf32 +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: r7:6 = combine(#30,#24) +; CHECK-NEXT: v1:0 = vshuff(v1,v0,r7) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v6 = vsplat(r7) +; CHECK-NEXT: q0 = vcmp.gt(v24.w,v1.w) +; CHECK-NEXT: q1 = vcmp.gt(v24.w,v0.w) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v3.w = vasl(v0.w,r2) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: r2 = ##2147483647 +; CHECK-NEXT: v4.w = vasl(v1.w,r2) +; CHECK-NEXT: v3.w = vsub(v3.w,v2.w) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v25 = vsplat(r2) +; CHECK-NEXT: v5.w = vasl(v0.w,r5) +; CHECK-NEXT: v4.w = vsub(v4.w,v2.w) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v3.w = vasr(v3.w,r6) +; CHECK-NEXT: v5 = vor(v5,v2) +; CHECK-NEXT: v28 = vmux(q0,v2,v25) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v4.w = vasr(v4.w,r6) +; CHECK-NEXT: v3.w = vsub(v6.w,v3.w) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v8.w = vasl(v1.w,r5) +; CHECK-NEXT: v4.w = vsub(v6.w,v4.w) +; CHECK-NEXT: v3.w = vmin(v3.w,v7.w) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v8 = vor(v8,v2) +; CHECK-NEXT: v4.w = vmin(v4.w,v7.w) +; CHECK-NEXT: v2 = vmux(q1,v2,v25) +; CHECK-NEXT: q3 = vcmp.gt(v3.w,v24.w) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v26.w = vlsr(v5.w,v3.w) +; CHECK-NEXT: q2 = vcmp.gt(v4.w,v24.w) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v27.w = vlsr(v8.w,v4.w) +; CHECK-NEXT: v29.w = vsub(v24.w,v26.w) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v9.w = vsub(v24.w,v27.w) +; CHECK-NEXT: v1 = vmux(q1,v29,v26) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v30 = vmux(q0,v9,v27) +; CHECK-NEXT: v31 = vmux(q3,v1,v2) +; CHECK-NEXT: vmem(r1+#0) = v31.new +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v0 = vmux(q2,v30,v28) +; CHECK-NEXT: jumpr r31 +; CHECK-NEXT: vmem(r1+#1) = v0.new +; CHECK-NEXT: } + %v0 = load <64 x half>, ptr %a0, align 128 + %v1 = fptosi <64 x half> %v0 to <64 x i32> + store <64 x i32> %v1, ptr %a1, align 128 + ret void +} + +; Widen input +define void @f16s32_1(ptr %a0, ptr %a1) #0 { +; CHECK-LABEL: f16s32_1: +; CHECK: .cfi_startproc +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: { +; CHECK-NEXT: r4 = #15360 +; CHECK-NEXT: r7 = #-4 +; CHECK-NEXT: v0 = vmem(r0+#0) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v1.h = vsplat(r4) +; CHECK-NEXT: r2 = ##-2147483648 +; CHECK-NEXT: r3 = #1 +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v3 = vsplat(r2) +; CHECK-NEXT: r5:4 = combine(#8,#30) +; CHECK-NEXT: r6 = #24 +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v4 = vsplat(r4) +; CHECK-NEXT: r2 = ##2147483647 +; CHECK-NEXT: r4 = #32 +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v1:0.qf32 = vmpy(v0.hf,v1.hf) +; CHECK-NEXT: v2 = vxor(v2,v2) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v5 = vsplat(r4) +; CHECK-NEXT: v30 = vsplat(r2) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v0.sf = v0.qf32 +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v1.sf = v1.qf32 +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v1:0 = vshuff(v1,v0,r7) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: q0 = vcmp.gt(v2.w,v0.w) +; CHECK-NEXT: } +; 
CHECK-NEXT: { +; CHECK-NEXT: v1.w = vasl(v0.w,r3) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v6.w = vasl(v0.w,r5) +; CHECK-NEXT: v1.w = vsub(v1.w,v3.w) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v29 = vor(v6,v3) +; CHECK-NEXT: v3 = vmux(q0,v3,v30) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v1.w = vasr(v1.w,r6) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v1.w = vsub(v4.w,v1.w) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v1.w = vmin(v1.w,v5.w) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: q3 = vcmp.gt(v1.w,v2.w) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v4.w = vlsr(v29.w,v1.w) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v31.w = vsub(v2.w,v4.w) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v0 = vmux(q0,v31,v4) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v0 = vmux(q3,v0,v3) +; CHECK-NEXT: jumpr r31 +; CHECK-NEXT: vmem(r1+#0) = v0.new +; CHECK-NEXT: } + %v0 = load <32 x half>, ptr %a0, align 128 + %v1 = fptosi <32 x half> %v0 to <32 x i32> + store <32 x i32> %v1, ptr %a1, align 128 + ret void +} + +; f32 -> s8 +; No widening +define void @f32s8_0(ptr %a0, ptr %a1) #0 { +; CHECK-LABEL: f32s8_0: +; CHECK: .cfi_startproc +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: { +; CHECK-NEXT: r3:2 = combine(#1,#8) +; CHECK-NEXT: r4 = ##-2147483648 +; CHECK-NEXT: v6 = vmem(r0+#1) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v0 = vsplat(r4) +; CHECK-NEXT: r7 = #30 +; CHECK-NEXT: r6 = #24 +; CHECK-NEXT: v4 = vmem(r0+#0) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v10 = vsplat(r7) +; CHECK-NEXT: r5 = #32 +; CHECK-NEXT: v9.w = vasl(v6.w,r3) +; CHECK-NEXT: v1 = vmem(r0+#3) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v8.w = vasl(v4.w,r3) +; CHECK-NEXT: v14 = vxor(v14,v14) +; CHECK-NEXT: v9.w = vsub(v9.w,v0.w) +; CHECK-NEXT: v2 = vmem(r0+#2) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v13 = vsplat(r5) +; CHECK-NEXT: v11.w = vasl(v2.w,r3) +; CHECK-NEXT: v8.w = vsub(v8.w,v0.w) +; CHECK-NEXT: q1 = vcmp.gt(v14.w,v6.w) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v12.w = vasl(v1.w,r3) +; CHECK-NEXT: q0 = vcmp.gt(v14.w,v4.w) +; CHECK-NEXT: v11.w = vsub(v11.w,v0.w) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: r3 = ##2147483647 +; CHECK-NEXT: r7 = #64 +; CHECK-NEXT: v9.w = vasr(v9.w,r6) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v20 = vsplat(r3) +; CHECK-NEXT: v7.w = vasl(v6.w,r2) +; CHECK-NEXT: v21.w = vsub(v12.w,v0.w) +; CHECK-NEXT: v9.w = vsub(v10.w,v9.w) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v8.w = vasr(v8.w,r6) +; CHECK-NEXT: v27 = vmux(q1,v0,v20) +; CHECK-NEXT: v25 = vmux(q0,v0,v20) +; CHECK-NEXT: v9.w = vmin(v9.w,v13.w) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v5.w = vasl(v4.w,r2) +; CHECK-NEXT: v7 = vor(v7,v0) +; CHECK-NEXT: v8.w = vsub(v10.w,v8.w) +; CHECK-NEXT: q3 = vcmp.gt(v9.w,v14.w) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v11.w = vasr(v11.w,r6) +; CHECK-NEXT: v8.w = vmin(v8.w,v13.w) +; CHECK-NEXT: v5 = vor(v5,v0) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v6.w = vasr(v21.w,r6) +; CHECK-NEXT: v11.w = vsub(v10.w,v11.w) +; CHECK-NEXT: q2 = vcmp.gt(v8.w,v14.w) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v3.w = vasl(v1.w,r2) +; CHECK-NEXT: v6.w = vsub(v10.w,v6.w) +; CHECK-NEXT: v23.w = vmin(v11.w,v13.w) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v22.w = vasl(v2.w,r2) +; CHECK-NEXT: v3 = vor(v3,v0) +; CHECK-NEXT: v6.w = vmin(v6.w,v13.w) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v7.w = vlsr(v7.w,v9.w) +; CHECK-NEXT: v12 = vor(v22,v0) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v5.w = 
vlsr(v5.w,v8.w) +; CHECK-NEXT: v26.w = vsub(v14.w,v7.w) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v28.w = vlsr(v12.w,v23.w) +; CHECK-NEXT: v24.w = vsub(v14.w,v5.w) +; CHECK-NEXT: v7 = vmux(q1,v26,v7) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v3.w = vlsr(v3.w,v6.w) +; CHECK-NEXT: v5 = vmux(q0,v24,v5) +; CHECK-NEXT: q0 = vcmp.gt(v14.w,v2.w) +; CHECK-NEXT: v29.w = vsub(v14.w,v28.w) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v2 = vmux(q3,v7,v27) +; CHECK-NEXT: q3 = vcmp.gt(v14.w,v1.w) +; CHECK-NEXT: v31.w = vsub(v14.w,v3.w) +; CHECK-NEXT: v5 = vmux(q2,v5,v25) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v1 = vmux(q0,v0,v20) +; CHECK-NEXT: v30 = vmux(q0,v29,v28) +; CHECK-NEXT: q2 = vcmp.gt(v23.w,v14.w) +; CHECK-NEXT: v3 = vmux(q3,v31,v3) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v2.h = vpack(v2.w,v5.w):sat +; CHECK-NEXT: v0 = vmux(q3,v0,v20) +; CHECK-NEXT: q3 = vcmp.gt(v6.w,v14.w) +; CHECK-NEXT: v1 = vmux(q2,v30,v1) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v0 = vmux(q3,v3,v0) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v3.h = vpack(v1.w,v0.w):sat +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v0.h = vpack(v0.w,v1.w):sat +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v1.b = vpack(v3.h,v2.h):sat +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v0.b = vpack(v3.h,v0.h):sat +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v1:0 = vshuff(v0,v1,r7) +; CHECK-NEXT: jumpr r31 +; CHECK-NEXT: vmem(r1+#0) = v0.new +; CHECK-NEXT: } + %v0 = load <128 x float>, ptr %a0, align 128 + %v1 = fptosi <128 x float> %v0 to <128 x i8> + store <128 x i8> %v1, ptr %a1, align 128 + ret void +} + +; Widen result #1 +define void @f32s8_1(ptr %a0, ptr %a1) #0 { +; CHECK-LABEL: f32s8_1: +; CHECK: .cfi_startproc +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: { +; CHECK-NEXT: r3:2 = combine(##-2147483648,#8) +; CHECK-NEXT: r4 = #1 +; CHECK-NEXT: v1 = vmem(r0+#1) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v3 = vsplat(r3) +; CHECK-NEXT: r5 = #30 +; CHECK-NEXT: v4.w = vasl(v0.w,r4) +; CHECK-NEXT: v0.cur = vmem(r0+#0) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v5.w = vasl(v1.w,r4) +; CHECK-NEXT: v4.w = vsub(v4.w,v3.w) +; CHECK-NEXT: r6 = #24 +; CHECK-NEXT: r4 = #32 +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v7 = vsplat(r5) +; CHECK-NEXT: v8 = vsplat(r4) +; CHECK-NEXT: v2.w = vasl(v1.w,r2) +; CHECK-NEXT: v5.w = vsub(v5.w,v3.w) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v4.w = vasr(v4.w,r6) +; CHECK-NEXT: v27 = vxor(v27,v27) +; CHECK-NEXT: v2 = vor(v2,v3) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: r3 = ##2147483647 +; CHECK-NEXT: v5.w = vasr(v5.w,r6) +; CHECK-NEXT: q0 = vcmp.gt(v27.w,v0.w) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v28 = vsplat(r3) +; CHECK-NEXT: v6.w = vasl(v0.w,r2) +; CHECK-NEXT: v4.w = vsub(v7.w,v4.w) +; CHECK-NEXT: q2 = vcmp.gt(v27.w,v1.w) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v5.w = vsub(v7.w,v5.w) +; CHECK-NEXT: v4.w = vmin(v4.w,v8.w) +; CHECK-NEXT: v31 = vmux(q0,v3,v28) +; CHECK-NEXT: v6 = vor(v6,v3) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v5.w = vmin(v5.w,v8.w) +; CHECK-NEXT: q1 = vcmp.gt(v4.w,v27.w) +; CHECK-NEXT: v0 = vmux(q2,v3,v28) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: r2 = #64 +; CHECK-NEXT: v6.w = vlsr(v6.w,v4.w) +; CHECK-NEXT: q3 = vcmp.gt(v5.w,v27.w) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v2.w = vlsr(v2.w,v5.w) +; CHECK-NEXT: v29.w = vsub(v27.w,v6.w) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v30.w = vsub(v27.w,v2.w) +; CHECK-NEXT: v1 = vmux(q0,v29,v6) +; CHECK-NEXT: } +; 
CHECK-NEXT: { +; CHECK-NEXT: v2 = vmux(q2,v30,v2) +; CHECK-NEXT: v1 = vmux(q1,v1,v31) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: q3 = vsetq(r2) +; CHECK-NEXT: v0 = vmux(q3,v2,v0) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v2.h = vpack(v1.w,v0.w):sat +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v0.h = vpack(v0.w,v1.w):sat +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v0.b = vpack(v2.h,v0.h):sat +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: jumpr r31 +; CHECK-NEXT: if (q3) vmem(r1+#0) = v0 +; CHECK-NEXT: } + %v0 = load <64 x float>, ptr %a0, align 128 + %v1 = fptosi <64 x float> %v0 to <64 x i8> + store <64 x i8> %v1, ptr %a1, align 128 + ret void +} + +; Widen result #2 +define void @f32s8_2(ptr %a0, ptr %a1) #0 { +; CHECK-LABEL: f32s8_2: +; CHECK: .cfi_startproc +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: { +; CHECK-NEXT: r7 = ##-2147483648 +; CHECK-NEXT: r3:2 = combine(#30,#1) +; CHECK-NEXT: v0 = vmem(r0+#0) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v2 = vsplat(r7) +; CHECK-NEXT: r5:4 = combine(#8,#24) +; CHECK-NEXT: r6 = #32 +; CHECK-NEXT: v3.w = vasl(v0.w,r2) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v4 = vsplat(r3) +; CHECK-NEXT: v5 = vsplat(r6) +; CHECK-NEXT: v6.w = vasl(v0.w,r5) +; CHECK-NEXT: v3.w = vsub(v3.w,v2.w) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v1 = vxor(v1,v1) +; CHECK-NEXT: v29 = vor(v6,v2) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: q3 = vsetq(r6) +; CHECK-NEXT: v3.w = vasr(v3.w,r4) +; CHECK-NEXT: q0 = vcmp.gt(v1.w,v0.w) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: r4 = ##2147483647 +; CHECK-NEXT: v3.w = vsub(v4.w,v3.w) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v30 = vsplat(r4) +; CHECK-NEXT: v3.w = vmin(v3.w,v5.w) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v2 = vmux(q0,v2,v30) +; CHECK-NEXT: q2 = vcmp.gt(v3.w,v1.w) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v4.w = vlsr(v29.w,v3.w) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v31.w = vsub(v1.w,v4.w) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v0 = vmux(q0,v31,v4) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v0 = vmux(q2,v0,v2) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v1.h = vpack(v1.w,v0.w):sat +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v0.h = vpack(v0.w,v0.w):sat +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v0.b = vpack(v1.h,v0.h):sat +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: jumpr r31 +; CHECK-NEXT: if (q3) vmem(r1+#0) = v0 +; CHECK-NEXT: } + %v0 = load <32 x float>, ptr %a0, align 128 + %v1 = fptosi <32 x float> %v0 to <32 x i8> + store <32 x i8> %v1, ptr %a1, align 128 + ret void +} + +; f32 -> s16 +; No widening +define void @f32s16_0(ptr %a0, ptr %a1) #0 { +; CHECK-LABEL: f32s16_0: +; CHECK: .cfi_startproc +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: { +; CHECK-NEXT: r3:2 = combine(##-2147483648,#1) +; CHECK-NEXT: r4 = #30 +; CHECK-NEXT: v0 = vmem(r0+#0) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v2 = vsplat(r3) +; CHECK-NEXT: r6 = #8 +; CHECK-NEXT: v3.w = vasl(v0.w,r2) +; CHECK-NEXT: v1 = vmem(r0+#1) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v4 = vsplat(r4) +; CHECK-NEXT: v8.w = vasl(v1.w,r2) +; CHECK-NEXT: v3.w = vsub(v3.w,v2.w) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: r5:4 = combine(#24,##2147483647) +; CHECK-NEXT: v7 = vxor(v7,v7) +; CHECK-NEXT: v8.w = vsub(v8.w,v2.w) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: r3 = #32 +; CHECK-NEXT: v5.w = vasl(v0.w,r6) +; CHECK-NEXT: q1 = vcmp.gt(v7.w,v1.w) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v6 = vsplat(r3) +; CHECK-NEXT: v28.w = 
vasr(v3.w,r5) +; CHECK-NEXT: v5 = vor(v5,v2) +; CHECK-NEXT: q0 = vcmp.gt(v7.w,v0.w) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v9 = vsplat(r4) +; CHECK-NEXT: v8.w = vasr(v8.w,r5) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v27.w = vasl(v1.w,r6) +; CHECK-NEXT: v1.w = vsub(v4.w,v28.w) +; CHECK-NEXT: v4.w = vsub(v4.w,v8.w) +; CHECK-NEXT: v29 = vmux(q0,v2,v9) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v1.w = vmin(v1.w,v6.w) +; CHECK-NEXT: v0 = vor(v27,v2) +; CHECK-NEXT: v4.w = vmin(v4.w,v6.w) +; CHECK-NEXT: v2 = vmux(q1,v2,v9) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: q2 = vcmp.gt(v1.w,v7.w) +; CHECK-NEXT: q3 = vcmp.gt(v4.w,v7.w) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v5.w = vlsr(v5.w,v1.w) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v0.w = vlsr(v0.w,v4.w) +; CHECK-NEXT: v30.w = vsub(v7.w,v5.w) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v31.w = vsub(v7.w,v0.w) +; CHECK-NEXT: v5 = vmux(q0,v30,v5) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v0 = vmux(q1,v31,v0) +; CHECK-NEXT: v1 = vmux(q2,v5,v29) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v0 = vmux(q3,v0,v2) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v0.h = vpack(v0.w,v1.w):sat +; CHECK-NEXT: jumpr r31 +; CHECK-NEXT: vmem(r1+#0) = v0.new +; CHECK-NEXT: } + %v0 = load <64 x float>, ptr %a0, align 128 + %v1 = fptosi <64 x float> %v0 to <64 x i16> + store <64 x i16> %v1, ptr %a1, align 128 + ret void +} + +; Widen result +define void @f32s16_1(ptr %a0, ptr %a1) #0 { +; CHECK-LABEL: f32s16_1: +; CHECK: .cfi_startproc +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: { +; CHECK-NEXT: r7 = ##-2147483648 +; CHECK-NEXT: r3:2 = combine(#8,#1) +; CHECK-NEXT: v0 = vmem(r0+#0) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v2 = vsplat(r7) +; CHECK-NEXT: v3.w = vasl(v0.w,r2) +; CHECK-NEXT: r6 = #30 +; CHECK-NEXT: r5 = #24 +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v4 = vsplat(r6) +; CHECK-NEXT: r4 = #32 +; CHECK-NEXT: v6.w = vasl(v0.w,r3) +; CHECK-NEXT: v3.w = vsub(v3.w,v2.w) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v5 = vsplat(r4) +; CHECK-NEXT: v29 = vor(v6,v2) +; CHECK-NEXT: v1 = vxor(v1,v1) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: r3 = ##2147483647 +; CHECK-NEXT: r2 = #64 +; CHECK-NEXT: v3.w = vasr(v3.w,r5) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v30 = vsplat(r3) +; CHECK-NEXT: q3 = vsetq(r2) +; CHECK-NEXT: q0 = vcmp.gt(v1.w,v0.w) +; CHECK-NEXT: v3.w = vsub(v4.w,v3.w) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v2 = vmux(q0,v2,v30) +; CHECK-NEXT: v3.w = vmin(v3.w,v5.w) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: q2 = vcmp.gt(v3.w,v1.w) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v4.w = vlsr(v29.w,v3.w) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v31.w = vsub(v1.w,v4.w) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v0 = vmux(q0,v31,v4) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v0 = vmux(q2,v0,v2) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v0.h = vpack(v0.w,v0.w):sat +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: jumpr r31 +; CHECK-NEXT: if (q3) vmem(r1+#0) = v0 +; CHECK-NEXT: } + %v0 = load <32 x float>, ptr %a0, align 128 + %v1 = fptosi <32 x float> %v0 to <32 x i16> + store <32 x i16> %v1, ptr %a1, align 128 + ret void +} + +; f32 -> s32 +; No widening +define void @f32s32_0(ptr %a0, ptr %a1) #0 { +; CHECK-LABEL: f32s32_0: +; CHECK: .cfi_startproc +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: { +; CHECK-NEXT: r7 = ##-2147483648 +; CHECK-NEXT: r3:2 = combine(#8,#1) +; CHECK-NEXT: v0 = vmem(r0+#0) +; CHECK-NEXT: } +; CHECK-NEXT: { +; 
CHECK-NEXT: v2 = vsplat(r7) +; CHECK-NEXT: v3.w = vasl(v0.w,r2) +; CHECK-NEXT: r6 = #30 +; CHECK-NEXT: r5 = #24 +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v4 = vsplat(r6) +; CHECK-NEXT: r4 = #32 +; CHECK-NEXT: v6.w = vasl(v0.w,r3) +; CHECK-NEXT: v3.w = vsub(v3.w,v2.w) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v5 = vsplat(r4) +; CHECK-NEXT: v29 = vor(v6,v2) +; CHECK-NEXT: v1 = vxor(v1,v1) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: r2 = ##2147483647 +; CHECK-NEXT: v3.w = vasr(v3.w,r5) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v30 = vsplat(r2) +; CHECK-NEXT: q0 = vcmp.gt(v1.w,v0.w) +; CHECK-NEXT: v3.w = vsub(v4.w,v3.w) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v2 = vmux(q0,v2,v30) +; CHECK-NEXT: v3.w = vmin(v3.w,v5.w) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: q3 = vcmp.gt(v3.w,v1.w) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v4.w = vlsr(v29.w,v3.w) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v31.w = vsub(v1.w,v4.w) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v0 = vmux(q0,v31,v4) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v0 = vmux(q3,v0,v2) +; CHECK-NEXT: jumpr r31 +; CHECK-NEXT: vmem(r1+#0) = v0.new +; CHECK-NEXT: } + %v0 = load <32 x float>, ptr %a0, align 128 + %v1 = fptosi <32 x float> %v0 to <32 x i32> + store <32 x i32> %v1, ptr %a1, align 128 + ret void +} + +; Widen input and result +define void @f32s32_1(ptr %a0, ptr %a1) #0 { +; CHECK-LABEL: f32s32_1: +; CHECK: .cfi_startproc +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: { +; CHECK-NEXT: r7 = ##-2147483648 +; CHECK-NEXT: r3:2 = combine(#8,#1) +; CHECK-NEXT: v0 = vmem(r0+#0) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v2 = vsplat(r7) +; CHECK-NEXT: v3.w = vasl(v0.w,r2) +; CHECK-NEXT: r6 = #30 +; CHECK-NEXT: r5 = #24 +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v4 = vsplat(r6) +; CHECK-NEXT: r4 = #32 +; CHECK-NEXT: v6.w = vasl(v0.w,r3) +; CHECK-NEXT: v3.w = vsub(v3.w,v2.w) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v5 = vsplat(r4) +; CHECK-NEXT: v29 = vor(v6,v2) +; CHECK-NEXT: v1 = vxor(v1,v1) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: r3 = ##2147483647 +; CHECK-NEXT: r2 = #64 +; CHECK-NEXT: v3.w = vasr(v3.w,r5) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v30 = vsplat(r3) +; CHECK-NEXT: q3 = vsetq(r2) +; CHECK-NEXT: q0 = vcmp.gt(v1.w,v0.w) +; CHECK-NEXT: v3.w = vsub(v4.w,v3.w) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v2 = vmux(q0,v2,v30) +; CHECK-NEXT: v3.w = vmin(v3.w,v5.w) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: q1 = vcmp.gt(v3.w,v1.w) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v4.w = vlsr(v29.w,v3.w) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v31.w = vsub(v1.w,v4.w) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v0 = vmux(q0,v31,v4) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v0 = vmux(q1,v0,v2) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: jumpr r31 +; CHECK-NEXT: if (q3) vmem(r1+#0) = v0 +; CHECK-NEXT: } + %v0 = load <16 x float>, ptr %a0, align 128 + %v1 = fptosi <16 x float> %v0 to <16 x i32> + store <16 x i32> %v1, ptr %a1, align 128 + ret void +} + + +; f16 -> u8 +; No widening +define void @f16u8_0(ptr %a0, ptr %a1) #0 { +; CHECK-LABEL: f16u8_0: +; CHECK: .cfi_startproc +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: { +; CHECK-NEXT: r3:2 = combine(##32768,#1) +; CHECK-NEXT: r4 = #14 +; CHECK-NEXT: v0 = vmem(r0+#0) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v2.h = vsplat(r3) +; CHECK-NEXT: r7:6 = combine(#11,#16) +; CHECK-NEXT: v3.h = vasl(v0.h,r2) +; CHECK-NEXT: v1 = vmem(r0+#1) +; CHECK-NEXT: } +; 
CHECK-NEXT: { +; CHECK-NEXT: v6.h = vsplat(r4) +; CHECK-NEXT: r5 = #5 +; CHECK-NEXT: v4.h = vasl(v1.h,r2) +; CHECK-NEXT: v3.h = vsub(v3.h,v2.h) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v7.h = vsplat(r6) +; CHECK-NEXT: v5.h = vasl(v0.h,r5) +; CHECK-NEXT: v4.h = vsub(v4.h,v2.h) +; CHECK-NEXT: v28 = vxor(v28,v28) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: r2 = #32767 +; CHECK-NEXT: v3.h = vasr(v3.h,r7) +; CHECK-NEXT: v5 = vor(v5,v2) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v29.h = vsplat(r2) +; CHECK-NEXT: v4.h = vasr(v4.h,r7) +; CHECK-NEXT: q2 = vcmp.gt(v28.h,v0.h) +; CHECK-NEXT: v3.h = vsub(v6.h,v3.h) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v8.h = vasl(v1.h,r5) +; CHECK-NEXT: q3 = vcmp.gt(v28.h,v1.h) +; CHECK-NEXT: v4.h = vsub(v6.h,v4.h) +; CHECK-NEXT: v3.h = vmin(v3.h,v7.h) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v4.h = vmin(v4.h,v7.h) +; CHECK-NEXT: v2 = vor(v8,v2) +; CHECK-NEXT: q0 = vcmp.gt(v28.h,v3.h) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v5.h = vlsr(v5.h,v3.h) +; CHECK-NEXT: q1 = vcmp.gt(v28.h,v4.h) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v2.h = vlsr(v2.h,v4.h) +; CHECK-NEXT: v30 = vmux(q0,v29,v5) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v31 = vmux(q1,v29,v2) +; CHECK-NEXT: v0 = vmux(q2,v28,v30) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v1 = vmux(q3,v28,v31) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v0.ub = vpack(v1.h,v0.h):sat +; CHECK-NEXT: jumpr r31 +; CHECK-NEXT: vmem(r1+#0) = v0.new +; CHECK-NEXT: } + %v0 = load <128 x half>, ptr %a0, align 128 + %v1 = fptoui <128 x half> %v0 to <128 x i8> + store <128 x i8> %v1, ptr %a1, align 128 + ret void +} + +; Widen result +define void @f16u8_1(ptr %a0, ptr %a1) #0 { +; CHECK-LABEL: f16u8_1: +; CHECK: .cfi_startproc +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: { +; CHECK-NEXT: r7 = ##32768 +; CHECK-NEXT: r3:2 = combine(#5,#1) +; CHECK-NEXT: v0 = vmem(r0+#0) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v2.h = vsplat(r7) +; CHECK-NEXT: v3.h = vasl(v0.h,r2) +; CHECK-NEXT: r6 = #14 +; CHECK-NEXT: r5 = #11 +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v4.h = vsplat(r6) +; CHECK-NEXT: r4 = #16 +; CHECK-NEXT: v6.h = vasl(v0.h,r3) +; CHECK-NEXT: v3.h = vsub(v3.h,v2.h) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v5.h = vsplat(r4) +; CHECK-NEXT: r3 = #32767 +; CHECK-NEXT: v2 = vor(v6,v2) +; CHECK-NEXT: v1 = vxor(v1,v1) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v30.h = vsplat(r3) +; CHECK-NEXT: r2 = #64 +; CHECK-NEXT: v3.h = vasr(v3.h,r5) +; CHECK-NEXT: q1 = vcmp.gt(v1.h,v0.h) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: q3 = vsetq(r2) +; CHECK-NEXT: v3.h = vsub(v4.h,v3.h) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v3.h = vmin(v3.h,v5.h) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: q0 = vcmp.gt(v1.h,v3.h) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v2.h = vlsr(v2.h,v3.h) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v31 = vmux(q0,v30,v2) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v0 = vmux(q1,v1,v31) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v0.ub = vpack(v0.h,v0.h):sat +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: jumpr r31 +; CHECK-NEXT: if (q3) vmem(r1+#0) = v0 +; CHECK-NEXT: } + %v0 = load <64 x half>, ptr %a0, align 128 + %v1 = fptoui <64 x half> %v0 to <64 x i8> + store <64 x i8> %v1, ptr %a1, align 128 + ret void +} + +; f16 -> u16 +; No widening +define void @f16u16_0(ptr %a0, ptr %a1) #0 { +; CHECK-LABEL: f16u16_0: +; CHECK: .cfi_startproc +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: { +; CHECK-NEXT: r7 = ##32768 
+; CHECK-NEXT: r3:2 = combine(#5,#1) +; CHECK-NEXT: v0 = vmem(r0+#0) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v2.h = vsplat(r7) +; CHECK-NEXT: v3.h = vasl(v0.h,r2) +; CHECK-NEXT: r6 = #14 +; CHECK-NEXT: r5 = #11 +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v4.h = vsplat(r6) +; CHECK-NEXT: r4 = #16 +; CHECK-NEXT: v6.h = vasl(v0.h,r3) +; CHECK-NEXT: v3.h = vsub(v3.h,v2.h) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v5.h = vsplat(r4) +; CHECK-NEXT: r2 = #32767 +; CHECK-NEXT: v2 = vor(v6,v2) +; CHECK-NEXT: v1 = vxor(v1,v1) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v30.h = vsplat(r2) +; CHECK-NEXT: v3.h = vasr(v3.h,r5) +; CHECK-NEXT: q1 = vcmp.gt(v1.h,v0.h) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v3.h = vsub(v4.h,v3.h) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v3.h = vmin(v3.h,v5.h) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: q0 = vcmp.gt(v1.h,v3.h) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v2.h = vlsr(v2.h,v3.h) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v31 = vmux(q0,v30,v2) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v0 = vmux(q1,v1,v31) +; CHECK-NEXT: jumpr r31 +; CHECK-NEXT: vmem(r1+#0) = v0.new +; CHECK-NEXT: } + %v0 = load <64 x half>, ptr %a0, align 128 + %v1 = fptoui <64 x half> %v0 to <64 x i16> + store <64 x i16> %v1, ptr %a1, align 128 + ret void +} + +; Widen input and result +define void @f16u16_1(ptr %a0, ptr %a1) #0 { +; CHECK-LABEL: f16u16_1: +; CHECK: .cfi_startproc +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: { +; CHECK-NEXT: r7 = ##32768 +; CHECK-NEXT: r3:2 = combine(#5,#1) +; CHECK-NEXT: v0 = vmem(r0+#0) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v2.h = vsplat(r7) +; CHECK-NEXT: v3.h = vasl(v0.h,r2) +; CHECK-NEXT: r6 = #14 +; CHECK-NEXT: r5 = #11 +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v4.h = vsplat(r6) +; CHECK-NEXT: r4 = #16 +; CHECK-NEXT: v6.h = vasl(v0.h,r3) +; CHECK-NEXT: v3.h = vsub(v3.h,v2.h) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v5.h = vsplat(r4) +; CHECK-NEXT: r3 = #32767 +; CHECK-NEXT: v2 = vor(v6,v2) +; CHECK-NEXT: v1 = vxor(v1,v1) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v30.h = vsplat(r3) +; CHECK-NEXT: r2 = #64 +; CHECK-NEXT: v3.h = vasr(v3.h,r5) +; CHECK-NEXT: q1 = vcmp.gt(v1.h,v0.h) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: q3 = vsetq(r2) +; CHECK-NEXT: v3.h = vsub(v4.h,v3.h) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v3.h = vmin(v3.h,v5.h) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: q0 = vcmp.gt(v1.h,v3.h) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v2.h = vlsr(v2.h,v3.h) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v31 = vmux(q0,v30,v2) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v0 = vmux(q1,v1,v31) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: jumpr r31 +; CHECK-NEXT: if (q3) vmem(r1+#0) = v0 +; CHECK-NEXT: } + %v0 = load <32 x half>, ptr %a0, align 128 + %v1 = fptoui <32 x half> %v0 to <32 x i16> + store <32 x i16> %v1, ptr %a1, align 128 + ret void +} + +; f16 -> u32 +; No widening +define void @f16u32_0(ptr %a0, ptr %a1) #0 { +; CHECK-LABEL: f16u32_0: +; CHECK: .cfi_startproc +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: { +; CHECK-NEXT: r2 = #15360 +; CHECK-NEXT: r7 = #-4 +; CHECK-NEXT: v0 = vmem(r0+#0) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v1.h = vsplat(r2) +; CHECK-NEXT: r4 = ##-2147483648 +; CHECK-NEXT: r3:2 = combine(#30,#1) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v2 = vsplat(r4) +; CHECK-NEXT: r4 = #32 +; CHECK-NEXT: r6 = #24 +; CHECK-NEXT: r0 = #8 +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v6 = 
vsplat(r3) +; CHECK-NEXT: v26 = vxor(v26,v26) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v1:0.qf32 = vmpy(v0.hf,v1.hf) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v7 = vsplat(r4) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v0.sf = v0.qf32 +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v1.sf = v1.qf32 +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v1:0 = vshuff(v1,v0,r7) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: q1 = vcmp.gt(v26.w,v1.w) +; CHECK-NEXT: q3 = vcmp.gt(v26.w,v0.w) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v3.w = vasl(v1.w,r2) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: r2 = ##2147483647 +; CHECK-NEXT: v4.w = vasl(v0.w,r2) +; CHECK-NEXT: v3.w = vsub(v3.w,v2.w) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v27 = vsplat(r2) +; CHECK-NEXT: v5.w = vasl(v1.w,r0) +; CHECK-NEXT: v4.w = vsub(v4.w,v2.w) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v3.w = vasr(v3.w,r6) +; CHECK-NEXT: v5 = vor(v5,v2) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v4.w = vasr(v4.w,r6) +; CHECK-NEXT: v3.w = vsub(v6.w,v3.w) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v8.w = vasl(v0.w,r0) +; CHECK-NEXT: v4.w = vsub(v6.w,v4.w) +; CHECK-NEXT: v3.w = vmin(v3.w,v7.w) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v4.w = vmin(v4.w,v7.w) +; CHECK-NEXT: v2 = vor(v8,v2) +; CHECK-NEXT: q0 = vcmp.gt(v26.w,v3.w) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v3.w = vlsr(v5.w,v3.w) +; CHECK-NEXT: q2 = vcmp.gt(v26.w,v4.w) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v28.w = vlsr(v2.w,v4.w) +; CHECK-NEXT: v29 = vmux(q0,v27,v3) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v30 = vmux(q2,v27,v28) +; CHECK-NEXT: v31 = vmux(q1,v26,v29) +; CHECK-NEXT: vmem(r1+#1) = v31.new +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v0 = vmux(q3,v26,v30) +; CHECK-NEXT: jumpr r31 +; CHECK-NEXT: vmem(r1+#0) = v0.new +; CHECK-NEXT: } + %v0 = load <64 x half>, ptr %a0, align 128 + %v1 = fptoui <64 x half> %v0 to <64 x i32> + store <64 x i32> %v1, ptr %a1, align 128 + ret void +} + +; Widen input +define void @f16u32_1(ptr %a0, ptr %a1) #0 { +; CHECK-LABEL: f16u32_1: +; CHECK: .cfi_startproc +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: { +; CHECK-NEXT: r4 = #15360 +; CHECK-NEXT: r7 = #-4 +; CHECK-NEXT: v0 = vmem(r0+#0) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v1.h = vsplat(r4) +; CHECK-NEXT: r2 = ##-2147483648 +; CHECK-NEXT: r3 = #1 +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v3 = vsplat(r2) +; CHECK-NEXT: r5:4 = combine(#8,#30) +; CHECK-NEXT: r6 = #24 +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v4 = vsplat(r4) +; CHECK-NEXT: r2 = ##2147483647 +; CHECK-NEXT: r4 = #32 +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v1:0.qf32 = vmpy(v0.hf,v1.hf) +; CHECK-NEXT: v2 = vxor(v2,v2) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v5 = vsplat(r4) +; CHECK-NEXT: v30 = vsplat(r2) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v0.sf = v0.qf32 +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v1.sf = v1.qf32 +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v1:0 = vshuff(v1,v0,r7) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: q1 = vcmp.gt(v2.w,v0.w) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v1.w = vasl(v0.w,r3) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v6.w = vasl(v0.w,r5) +; CHECK-NEXT: v1.w = vsub(v1.w,v3.w) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v3 = vor(v6,v3) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v1.w = vasr(v1.w,r6) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v1.w = vsub(v4.w,v1.w) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: 
v1.w = vmin(v1.w,v5.w) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: q0 = vcmp.gt(v2.w,v1.w) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v1.w = vlsr(v3.w,v1.w) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v31 = vmux(q0,v30,v1) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v0 = vmux(q1,v2,v31) +; CHECK-NEXT: jumpr r31 +; CHECK-NEXT: vmem(r1+#0) = v0.new +; CHECK-NEXT: } + %v0 = load <32 x half>, ptr %a0, align 128 + %v1 = fptoui <32 x half> %v0 to <32 x i32> + store <32 x i32> %v1, ptr %a1, align 128 + ret void +} + +; f32 -> u8 +; No widening +define void @f32u8_0(ptr %a0, ptr %a1) #0 { +; CHECK-LABEL: f32u8_0: +; CHECK: .cfi_startproc +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: { +; CHECK-NEXT: r3:2 = combine(##-2147483648,#8) +; CHECK-NEXT: r4 = #1 +; CHECK-NEXT: v5 = vmem(r0+#0) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v4 = vsplat(r3) +; CHECK-NEXT: r5 = #30 +; CHECK-NEXT: r6 = #24 +; CHECK-NEXT: v2 = vmem(r0+#1) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v14 = vsplat(r5) +; CHECK-NEXT: v8.w = vasl(v5.w,r4) +; CHECK-NEXT: v13 = vxor(v13,v13) +; CHECK-NEXT: v0 = vmem(r0+#2) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: r7 = #64 +; CHECK-NEXT: v9.w = vasl(v2.w,r4) +; CHECK-NEXT: v8.w = vsub(v8.w,v4.w) +; CHECK-NEXT: v1 = vmem(r0+#3) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v11.w = vasl(v0.w,r4) +; CHECK-NEXT: q0 = vcmp.gt(v13.w,v5.w) +; CHECK-NEXT: v9.w = vsub(v9.w,v4.w) +; CHECK-NEXT: q3 = vcmp.gt(v13.w,v2.w) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: r4 = #32 +; CHECK-NEXT: v12.w = vasl(v1.w,r4) +; CHECK-NEXT: v11.w = vsub(v11.w,v4.w) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v24 = vsplat(r4) +; CHECK-NEXT: v8.w = vasr(v8.w,r6) +; CHECK-NEXT: v12.w = vsub(v12.w,v4.w) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v9.w = vasr(v9.w,r6) +; CHECK-NEXT: v8.w = vsub(v14.w,v8.w) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v6.w = vasl(v5.w,r2) +; CHECK-NEXT: v9.w = vsub(v14.w,v9.w) +; CHECK-NEXT: v8.w = vmin(v8.w,v24.w) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v7.w = vasl(v2.w,r2) +; CHECK-NEXT: v6 = vor(v6,v4) +; CHECK-NEXT: v9.w = vmin(v9.w,v24.w) +; CHECK-NEXT: q1 = vcmp.gt(v13.w,v8.w) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v23.w = vasr(v11.w,r6) +; CHECK-NEXT: v7 = vor(v7,v4) +; CHECK-NEXT: q2 = vcmp.gt(v13.w,v9.w) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v12.w = vasr(v12.w,r6) +; CHECK-NEXT: v5.w = vsub(v14.w,v23.w) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v3.w = vasl(v1.w,r2) +; CHECK-NEXT: v25.w = vsub(v14.w,v12.w) +; CHECK-NEXT: v5.w = vmin(v5.w,v24.w) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: r2 = ##2147483647 +; CHECK-NEXT: v10.w = vasl(v0.w,r2) +; CHECK-NEXT: v3 = vor(v3,v4) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v26 = vsplat(r2) +; CHECK-NEXT: v6.w = vlsr(v6.w,v8.w) +; CHECK-NEXT: v10 = vor(v10,v4) +; CHECK-NEXT: v4.w = vmin(v25.w,v24.w) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v7.w = vlsr(v7.w,v9.w) +; CHECK-NEXT: v6 = vmux(q1,v26,v6) +; CHECK-NEXT: q1 = vcmp.gt(v13.w,v5.w) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v27.w = vlsr(v10.w,v5.w) +; CHECK-NEXT: v7 = vmux(q2,v26,v7) +; CHECK-NEXT: q2 = vcmp.gt(v13.w,v4.w) +; CHECK-NEXT: v28 = vmux(q0,v13,v6) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v3.w = vlsr(v3.w,v4.w) +; CHECK-NEXT: v29 = vmux(q3,v13,v7) +; CHECK-NEXT: v2 = vmux(q1,v26,v27) +; CHECK-NEXT: q1 = vcmp.gt(v13.w,v0.w) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: q3 = vcmp.gt(v13.w,v1.w) +; CHECK-NEXT: v0 = vmux(q2,v26,v3) +; CHECK-NEXT: v1 = 
vmux(q1,v13,v2) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v30.uh = vpack(v29.w,v28.w):sat +; CHECK-NEXT: v0 = vmux(q3,v13,v0) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v31.uh = vpack(v1.w,v0.w):sat +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v0.uh = vpack(v0.w,v1.w):sat +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v1.ub = vpack(v31.h,v30.h):sat +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v0.ub = vpack(v31.h,v0.h):sat +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v1:0 = vshuff(v0,v1,r7) +; CHECK-NEXT: jumpr r31 +; CHECK-NEXT: vmem(r1+#0) = v0.new +; CHECK-NEXT: } + %v0 = load <128 x float>, ptr %a0, align 128 + %v1 = fptoui <128 x float> %v0 to <128 x i8> + store <128 x i8> %v1, ptr %a1, align 128 + ret void +} + +; Widen result #1 +define void @f32u8_1(ptr %a0, ptr %a1) #0 { +; CHECK-LABEL: f32u8_1: +; CHECK: .cfi_startproc +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: { +; CHECK-NEXT: r3:2 = combine(##-2147483648,#1) +; CHECK-NEXT: r4 = #30 +; CHECK-NEXT: v0 = vmem(r0+#0) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v2 = vsplat(r3) +; CHECK-NEXT: r7:6 = combine(#24,#32) +; CHECK-NEXT: v3.w = vasl(v0.w,r2) +; CHECK-NEXT: v1 = vmem(r0+#1) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v6 = vsplat(r4) +; CHECK-NEXT: r5 = #8 +; CHECK-NEXT: v4.w = vasl(v1.w,r2) +; CHECK-NEXT: v3.w = vsub(v3.w,v2.w) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v7 = vsplat(r6) +; CHECK-NEXT: v5.w = vasl(v0.w,r5) +; CHECK-NEXT: v4.w = vsub(v4.w,v2.w) +; CHECK-NEXT: v27 = vxor(v27,v27) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: r3 = ##2147483647 +; CHECK-NEXT: v3.w = vasr(v3.w,r7) +; CHECK-NEXT: v5 = vor(v5,v2) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v28 = vsplat(r3) +; CHECK-NEXT: v4.w = vasr(v4.w,r7) +; CHECK-NEXT: q2 = vcmp.gt(v27.w,v0.w) +; CHECK-NEXT: v3.w = vsub(v6.w,v3.w) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: r2 = #64 +; CHECK-NEXT: v8.w = vasl(v1.w,r5) +; CHECK-NEXT: q3 = vcmp.gt(v27.w,v1.w) +; CHECK-NEXT: v4.w = vsub(v6.w,v4.w) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v3.w = vmin(v3.w,v7.w) +; CHECK-NEXT: v4.w = vmin(v4.w,v7.w) +; CHECK-NEXT: v2 = vor(v8,v2) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: q0 = vcmp.gt(v27.w,v3.w) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v5.w = vlsr(v5.w,v3.w) +; CHECK-NEXT: q1 = vcmp.gt(v27.w,v4.w) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v2.w = vlsr(v2.w,v4.w) +; CHECK-NEXT: v29 = vmux(q0,v28,v5) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v30 = vmux(q1,v28,v2) +; CHECK-NEXT: v0 = vmux(q2,v27,v29) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: q3 = vsetq(r2) +; CHECK-NEXT: v1 = vmux(q3,v27,v30) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v31.uh = vpack(v1.w,v0.w):sat +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v0.uh = vpack(v1.w,v0.w):sat +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v0.ub = vpack(v31.h,v0.h):sat +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: jumpr r31 +; CHECK-NEXT: if (q3) vmem(r1+#0) = v0 +; CHECK-NEXT: } + %v0 = load <64 x float>, ptr %a0, align 128 + %v1 = fptoui <64 x float> %v0 to <64 x i8> + store <64 x i8> %v1, ptr %a1, align 128 + ret void +} + +; Widen result #2 +define void @f32u8_2(ptr %a0, ptr %a1) #0 { +; CHECK-LABEL: f32u8_2: +; CHECK: .cfi_startproc +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: { +; CHECK-NEXT: r7 = ##-2147483648 +; CHECK-NEXT: r3:2 = combine(#30,#1) +; CHECK-NEXT: v0 = vmem(r0+#0) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v2 = vsplat(r7) +; CHECK-NEXT: r5:4 = combine(#8,#24) +; CHECK-NEXT: r6 = #32 +; CHECK-NEXT: 
v3.w = vasl(v0.w,r2) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v4 = vsplat(r3) +; CHECK-NEXT: v5 = vsplat(r6) +; CHECK-NEXT: v6.w = vasl(v0.w,r5) +; CHECK-NEXT: v3.w = vsub(v3.w,v2.w) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v1 = vxor(v1,v1) +; CHECK-NEXT: v2 = vor(v6,v2) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: q3 = vsetq(r6) +; CHECK-NEXT: v3.w = vasr(v3.w,r4) +; CHECK-NEXT: q1 = vcmp.gt(v1.w,v0.w) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: r4 = ##2147483647 +; CHECK-NEXT: v3.w = vsub(v4.w,v3.w) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v30 = vsplat(r4) +; CHECK-NEXT: v3.w = vmin(v3.w,v5.w) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: q0 = vcmp.gt(v1.w,v3.w) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v2.w = vlsr(v2.w,v3.w) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v31 = vmux(q0,v30,v2) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v0 = vmux(q1,v1,v31) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v1.uh = vpack(v1.w,v0.w):sat +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v0.uh = vpack(v0.w,v0.w):sat +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v0.ub = vpack(v1.h,v0.h):sat +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: jumpr r31 +; CHECK-NEXT: if (q3) vmem(r1+#0) = v0 +; CHECK-NEXT: } + %v0 = load <32 x float>, ptr %a0, align 128 + %v1 = fptoui <32 x float> %v0 to <32 x i8> + store <32 x i8> %v1, ptr %a1, align 128 + ret void +} + +; f32 -> u16 +; No widening +define void @f32u16_0(ptr %a0, ptr %a1) #0 { +; CHECK-LABEL: f32u16_0: +; CHECK: .cfi_startproc +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: { +; CHECK-NEXT: r3:2 = combine(##-2147483648,#1) +; CHECK-NEXT: r4 = #30 +; CHECK-NEXT: v0 = vmem(r0+#0) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v2 = vsplat(r3) +; CHECK-NEXT: r7:6 = combine(#24,#32) +; CHECK-NEXT: v3.w = vasl(v0.w,r2) +; CHECK-NEXT: v1 = vmem(r0+#1) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v6 = vsplat(r4) +; CHECK-NEXT: r5 = #8 +; CHECK-NEXT: v4.w = vasl(v1.w,r2) +; CHECK-NEXT: v3.w = vsub(v3.w,v2.w) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v7 = vsplat(r6) +; CHECK-NEXT: v5.w = vasl(v0.w,r5) +; CHECK-NEXT: v4.w = vsub(v4.w,v2.w) +; CHECK-NEXT: v28 = vxor(v28,v28) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: r2 = ##2147483647 +; CHECK-NEXT: v3.w = vasr(v3.w,r7) +; CHECK-NEXT: v5 = vor(v5,v2) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v29 = vsplat(r2) +; CHECK-NEXT: v4.w = vasr(v4.w,r7) +; CHECK-NEXT: q2 = vcmp.gt(v28.w,v0.w) +; CHECK-NEXT: v3.w = vsub(v6.w,v3.w) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v8.w = vasl(v1.w,r5) +; CHECK-NEXT: q3 = vcmp.gt(v28.w,v1.w) +; CHECK-NEXT: v4.w = vsub(v6.w,v4.w) +; CHECK-NEXT: v3.w = vmin(v3.w,v7.w) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v4.w = vmin(v4.w,v7.w) +; CHECK-NEXT: v2 = vor(v8,v2) +; CHECK-NEXT: q0 = vcmp.gt(v28.w,v3.w) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v5.w = vlsr(v5.w,v3.w) +; CHECK-NEXT: q1 = vcmp.gt(v28.w,v4.w) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v2.w = vlsr(v2.w,v4.w) +; CHECK-NEXT: v30 = vmux(q0,v29,v5) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v31 = vmux(q1,v29,v2) +; CHECK-NEXT: v0 = vmux(q2,v28,v30) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v1 = vmux(q3,v28,v31) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v0.uh = vpack(v1.w,v0.w):sat +; CHECK-NEXT: jumpr r31 +; CHECK-NEXT: vmem(r1+#0) = v0.new +; CHECK-NEXT: } + %v0 = load <64 x float>, ptr %a0, align 128 + %v1 = fptoui <64 x float> %v0 to <64 x i16> + store <64 x i16> %v1, ptr %a1, align 128 + ret void +} + +; 
Widen result +define void @f32u16_1(ptr %a0, ptr %a1) #0 { +; CHECK-LABEL: f32u16_1: +; CHECK: .cfi_startproc +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: { +; CHECK-NEXT: r7 = ##-2147483648 +; CHECK-NEXT: r3:2 = combine(#8,#1) +; CHECK-NEXT: v0 = vmem(r0+#0) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v2 = vsplat(r7) +; CHECK-NEXT: v3.w = vasl(v0.w,r2) +; CHECK-NEXT: r6 = #30 +; CHECK-NEXT: r5 = #24 +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v4 = vsplat(r6) +; CHECK-NEXT: r4 = #32 +; CHECK-NEXT: v6.w = vasl(v0.w,r3) +; CHECK-NEXT: v3.w = vsub(v3.w,v2.w) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v5 = vsplat(r4) +; CHECK-NEXT: v2 = vor(v6,v2) +; CHECK-NEXT: v1 = vxor(v1,v1) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: r3 = ##2147483647 +; CHECK-NEXT: r2 = #64 +; CHECK-NEXT: v3.w = vasr(v3.w,r5) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v30 = vsplat(r3) +; CHECK-NEXT: q3 = vsetq(r2) +; CHECK-NEXT: q1 = vcmp.gt(v1.w,v0.w) +; CHECK-NEXT: v3.w = vsub(v4.w,v3.w) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v3.w = vmin(v3.w,v5.w) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: q0 = vcmp.gt(v1.w,v3.w) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v2.w = vlsr(v2.w,v3.w) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v31 = vmux(q0,v30,v2) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v0 = vmux(q1,v1,v31) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v0.uh = vpack(v0.w,v0.w):sat +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: jumpr r31 +; CHECK-NEXT: if (q3) vmem(r1+#0) = v0 +; CHECK-NEXT: } + %v0 = load <32 x float>, ptr %a0, align 128 + %v1 = fptoui <32 x float> %v0 to <32 x i16> + store <32 x i16> %v1, ptr %a1, align 128 + ret void +} + +; f32 -> u32 +; No widening +define void @f32u32_0(ptr %a0, ptr %a1) #0 { +; CHECK-LABEL: f32u32_0: +; CHECK: .cfi_startproc +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: { +; CHECK-NEXT: r7 = ##-2147483648 +; CHECK-NEXT: r3:2 = combine(#8,#1) +; CHECK-NEXT: v0 = vmem(r0+#0) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v2 = vsplat(r7) +; CHECK-NEXT: v3.w = vasl(v0.w,r2) +; CHECK-NEXT: r6 = #30 +; CHECK-NEXT: r5 = #24 +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v4 = vsplat(r6) +; CHECK-NEXT: r4 = #32 +; CHECK-NEXT: v6.w = vasl(v0.w,r3) +; CHECK-NEXT: v3.w = vsub(v3.w,v2.w) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v5 = vsplat(r4) +; CHECK-NEXT: v2 = vor(v6,v2) +; CHECK-NEXT: v1 = vxor(v1,v1) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: r2 = ##2147483647 +; CHECK-NEXT: v3.w = vasr(v3.w,r5) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v30 = vsplat(r2) +; CHECK-NEXT: q1 = vcmp.gt(v1.w,v0.w) +; CHECK-NEXT: v3.w = vsub(v4.w,v3.w) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v3.w = vmin(v3.w,v5.w) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: q0 = vcmp.gt(v1.w,v3.w) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v2.w = vlsr(v2.w,v3.w) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v31 = vmux(q0,v30,v2) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v0 = vmux(q1,v1,v31) +; CHECK-NEXT: jumpr r31 +; CHECK-NEXT: vmem(r1+#0) = v0.new +; CHECK-NEXT: } + %v0 = load <32 x float>, ptr %a0, align 128 + %v1 = fptoui <32 x float> %v0 to <32 x i32> + store <32 x i32> %v1, ptr %a1, align 128 + ret void +} + +; Widen input and result +define void @f32u32_1(ptr %a0, ptr %a1) #0 { +; CHECK-LABEL: f32u32_1: +; CHECK: .cfi_startproc +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: { +; CHECK-NEXT: r7 = ##-2147483648 +; CHECK-NEXT: r3:2 = combine(#8,#1) +; CHECK-NEXT: v0 = vmem(r0+#0) +; CHECK-NEXT: } +; CHECK-NEXT: { +; 
CHECK-NEXT: v2 = vsplat(r7) +; CHECK-NEXT: v3.w = vasl(v0.w,r2) +; CHECK-NEXT: r6 = #30 +; CHECK-NEXT: r5 = #24 +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v4 = vsplat(r6) +; CHECK-NEXT: r4 = #32 +; CHECK-NEXT: v6.w = vasl(v0.w,r3) +; CHECK-NEXT: v3.w = vsub(v3.w,v2.w) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v5 = vsplat(r4) +; CHECK-NEXT: v2 = vor(v6,v2) +; CHECK-NEXT: v1 = vxor(v1,v1) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: r3 = ##2147483647 +; CHECK-NEXT: r2 = #64 +; CHECK-NEXT: v3.w = vasr(v3.w,r5) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v30 = vsplat(r3) +; CHECK-NEXT: q3 = vsetq(r2) +; CHECK-NEXT: q1 = vcmp.gt(v1.w,v0.w) +; CHECK-NEXT: v3.w = vsub(v4.w,v3.w) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v3.w = vmin(v3.w,v5.w) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: q0 = vcmp.gt(v1.w,v3.w) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v2.w = vlsr(v2.w,v3.w) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v31 = vmux(q0,v30,v2) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v0 = vmux(q1,v1,v31) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: jumpr r31 +; CHECK-NEXT: if (q3) vmem(r1+#0) = v0 +; CHECK-NEXT: } + %v0 = load <16 x float>, ptr %a0, align 128 + %v1 = fptoui <16 x float> %v0 to <16 x i32> + store <16 x i32> %v1, ptr %a1, align 128 + ret void +} + + +attributes #0 = { "target-features"="+v68,+hvxv68,+hvx-length128b,+hvx-qfloat" } diff --git a/llvm/test/CodeGen/Hexagon/autohvx/int-to-fp.ll b/llvm/test/CodeGen/Hexagon/autohvx/int-to-fp.ll new file mode 100644 index 0000000..260bee82 --- /dev/null +++ b/llvm/test/CodeGen/Hexagon/autohvx/int-to-fp.ll @@ -0,0 +1,2744 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -march=hexagon -hexagon-hvx-widen=32 < %s | FileCheck %s + +target datalayout = "e-m:e-p:32:32:32-a:0-n16:32-i64:64:64-i32:32:32-i16:16:16-i1:8:8-f32:32:32-f64:64:64-v32:32:32-v64:64:64-v512:512:512-v1024:1024:1024-v2048:2048:2048" +target triple = "hexagon" + +; s8 -> f16 +; No widening +define void @s8f16_0(ptr %a0, ptr %a1) #0 { +; CHECK-LABEL: s8f16_0: +; CHECK: .cfi_startproc +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: { +; CHECK-NEXT: r2 = ##.LCPI0_0 +; CHECK-NEXT: v1:0.h = vunpack(v2.b) +; CHECK-NEXT: v2.cur = vmem(r0+#0) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: r7 = #1 +; CHECK-NEXT: v4.h = vabs(v0.h) +; CHECK-NEXT: v1 = vmem(r2+#0) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v5.h = vsplat(r7) +; CHECK-NEXT: r5:4 = combine(#31,#5) +; CHECK-NEXT: v1 = vdelta(v2,v1) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v21.h = vsplat(r5) +; CHECK-NEXT: r6 = #64 +; CHECK-NEXT: v6.uh = vcl0(v4.uh) +; CHECK-NEXT: v10 = vxor(v10,v10) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v7.h = vsplat(r6) +; CHECK-NEXT: r5 = ##32768 +; CHECK-NEXT: v3:2.h = vunpack(v1.b) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v28.h = vsplat(r5) +; CHECK-NEXT: v20.h = vadd(v6.h,v5.h) +; CHECK-NEXT: v3.h = vabs(v2.h) +; CHECK-NEXT: q1 = vcmp.gt(v10.h,v2.h) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v4.h = vasl(v4.h,v20.h) +; CHECK-NEXT: v29 = vmux(q1,v28,v10) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v8.uh = vcl0(v3.uh) +; CHECK-NEXT: v9.h = vadd(v4.h,v21.h) +; CHECK-NEXT: v11 = vand(v4,v7) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v12.uh = vlsr(v4.uh,r4) +; CHECK-NEXT: v8.h = vadd(v8.h,v5.h) +; CHECK-NEXT: q2 = vcmp.gt(v4.uh,v9.uh) +; CHECK-NEXT: q0 = vcmp.eq(v11.h,v10.h) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v22.uh = vlsr(v9.uh,r4) +; CHECK-NEXT: v25 = 
vmux(q2,v5,v10) +; CHECK-NEXT: v13 = vmux(q0,v10,v5) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v3.h = vasl(v3.h,v8.h) +; CHECK-NEXT: v13.h = vadd(v22.h,v13.h) +; CHECK-NEXT: q0 = vcmp.eq(v12.h,v22.h) +; CHECK-NEXT: v12.h = vadd(v25.h,v21.h) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v27.uh = vlsr(v22.uh,r7) +; CHECK-NEXT: v23.h = vadd(v3.h,v21.h) +; CHECK-NEXT: v7 = vand(v3,v7) +; CHECK-NEXT: v1.h = vsub(v12.h,v20.h) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v24.uh = vlsr(v3.uh,r4) +; CHECK-NEXT: q2 = vcmp.eq(v7.h,v10.h) +; CHECK-NEXT: q3 = vcmp.gt(v3.uh,v23.uh) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v4.uh = vlsr(v23.uh,r4) +; CHECK-NEXT: v7 = vmux(q2,v10,v5) +; CHECK-NEXT: v5 = vmux(q3,v5,v10) +; CHECK-NEXT: q3 = vcmp.gt(v10.h,v0.h) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v26.uh = vlsr(v13.uh,r7) +; CHECK-NEXT: v7.h = vadd(v4.h,v7.h) +; CHECK-NEXT: v5.h = vadd(v5.h,v21.h) +; CHECK-NEXT: q2 = vcmp.eq(v24.h,v4.h) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: r4 = #10 +; CHECK-NEXT: v4.uh = vlsr(v4.uh,r7) +; CHECK-NEXT: v5.h = vsub(v5.h,v8.h) +; CHECK-NEXT: v30 = vmux(q3,v28,v10) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v7.uh = vlsr(v7.uh,r7) +; CHECK-NEXT: v3 = vmux(q0,v26,v27) +; CHECK-NEXT: q3 = vcmp.eq(v2.h,v10.h) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v5.h = vasl(v5.h,r4) +; CHECK-NEXT: v4 = vmux(q2,v7,v4) +; CHECK-NEXT: v3 = vor(v30,v3) +; CHECK-NEXT: q2 = vcmp.eq(v0.h,v10.h) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v1.h = vasl(v1.h,r4) +; CHECK-NEXT: v4 = vor(v29,v4) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v1 = vor(v3,v1) +; CHECK-NEXT: v31 = vor(v4,v5) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v1 = vmux(q2,v10,v1) +; CHECK-NEXT: v0 = vmux(q3,v10,v31) +; CHECK-NEXT: vmem(r1+#0) = v1.new +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: jumpr r31 +; CHECK-NEXT: vmem(r1+#1) = v0 +; CHECK-NEXT: } + %v0 = load <128 x i8>, ptr %a0, align 128 + %v1 = sitofp <128 x i8> %v0 to <128 x half> + store <128 x half> %v1, ptr %a1, align 128 + ret void +} + +; Widen input +define void @s8f16_1(ptr %a0, ptr %a1) #0 { +; CHECK-LABEL: s8f16_1: +; CHECK: .cfi_startproc +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: { +; CHECK-NEXT: v0 = vmem(r0+#0) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: r6 = #1 +; CHECK-NEXT: r3:2 = combine(#64,#31) +; CHECK-NEXT: v1:0.h = vunpack(v0.b) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v3.h = vsplat(r6) +; CHECK-NEXT: v4.h = vsplat(r2) +; CHECK-NEXT: v2.h = vabs(v0.h) +; CHECK-NEXT: v1 = vxor(v1,v1) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v6.h = vsplat(r3) +; CHECK-NEXT: r5:4 = combine(##32768,#5) +; CHECK-NEXT: r2 = #10 +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v8.h = vsplat(r5) +; CHECK-NEXT: v5.uh = vcl0(v2.uh) +; CHECK-NEXT: q3 = vcmp.eq(v0.h,v1.h) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v5.h = vadd(v5.h,v3.h) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v2.h = vasl(v2.h,v5.h) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v7.h = vadd(v2.h,v4.h) +; CHECK-NEXT: v6 = vand(v2,v6) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v2.uh = vlsr(v2.uh,r4) +; CHECK-NEXT: q0 = vcmp.eq(v6.h,v1.h) +; CHECK-NEXT: q1 = vcmp.gt(v2.uh,v7.uh) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v25.uh = vlsr(v7.uh,r4) +; CHECK-NEXT: v26 = vmux(q0,v1,v3) +; CHECK-NEXT: v3 = vmux(q1,v3,v1) +; CHECK-NEXT: q1 = vcmp.gt(v1.h,v0.h) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v7.h = vadd(v25.h,v26.h) +; CHECK-NEXT: v3.h = vadd(v3.h,v4.h) +; CHECK-NEXT: q2 = 
vcmp.eq(v2.h,v25.h) +; CHECK-NEXT: v30 = vmux(q1,v8,v1) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v27.uh = vlsr(v25.uh,r6) +; CHECK-NEXT: v28.h = vsub(v3.h,v5.h) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v29.uh = vlsr(v7.uh,r6) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v2.h = vasl(v28.h,r2) +; CHECK-NEXT: v3 = vmux(q2,v29,v27) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v3 = vor(v30,v3) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v31 = vor(v3,v2) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v0 = vmux(q3,v1,v31) +; CHECK-NEXT: jumpr r31 +; CHECK-NEXT: vmem(r1+#0) = v0.new +; CHECK-NEXT: } + %v0 = load <64 x i8>, ptr %a0, align 128 + %v1 = sitofp <64 x i8> %v0 to <64 x half> + store <64 x half> %v1, ptr %a1, align 128 + ret void +} + + +; s8 -> f32 +; No widening +define void @s8f32_0(ptr %a0, ptr %a1) #0 { +; CHECK-LABEL: s8f32_0: +; CHECK: .cfi_startproc +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: { +; CHECK-NEXT: r3:2 = combine(##.LCPI2_0,#8) +; CHECK-NEXT: v3:2.h = vunpack(v1.b) +; CHECK-NEXT: v1.cur = vmem(r0+#0) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: r0 = #1 +; CHECK-NEXT: r7 = #512 +; CHECK-NEXT: r4 = #255 +; CHECK-NEXT: v3 = vmem(r3+#0) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v1 = vsplat(r0) +; CHECK-NEXT: v13 = vsplat(r7) +; CHECK-NEXT: v4 = vdelta(v1,v3) +; CHECK-NEXT: v0 = vxor(v0,v0) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v10 = vsplat(r4) +; CHECK-NEXT: r6 = ##-2147483648 +; CHECK-NEXT: v3:2.w = vunpack(v2.h) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v15 = vsplat(r6) +; CHECK-NEXT: r5 = #159 +; CHECK-NEXT: v5:4.h = vunpack(v4.b) +; CHECK-NEXT: v6.w = vabs(v3.w) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v17 = vsplat(r5) +; CHECK-NEXT: r4 = #23 +; CHECK-NEXT: v8.w = vabs(v2.w) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v5:4.w = vunpack(v4.h) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v9.uw = vcl0(v6.uw) +; CHECK-NEXT: v7.w = vabs(v4.w) +; CHECK-NEXT: v11.w = vabs(v5.w) +; CHECK-NEXT: q0 = vcmp.gt(v0.w,v4.w) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v14.uw = vcl0(v8.uw) +; CHECK-NEXT: v9.w = vadd(v9.w,v1.w) +; CHECK-NEXT: v18 = vmux(q0,v15,v0) +; CHECK-NEXT: q1 = vcmp.gt(v0.w,v5.w) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v12.uw = vcl0(v7.uw) +; CHECK-NEXT: v14.w = vadd(v14.w,v1.w) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v16.uw = vcl0(v11.uw) +; CHECK-NEXT: v12.w = vadd(v12.w,v1.w) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v6.w = vasl(v6.w,v9.w) +; CHECK-NEXT: v16.w = vadd(v16.w,v1.w) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v7.w = vasl(v7.w,v12.w) +; CHECK-NEXT: v19 = vand(v6,v13) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v11.w = vasl(v11.w,v16.w) +; CHECK-NEXT: v21 = vand(v7,v13) +; CHECK-NEXT: v31.w = vadd(v7.w,v10.w) +; CHECK-NEXT: q0 = vcmp.eq(v19.w,v0.w) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v8.w = vasl(v8.w,v14.w) +; CHECK-NEXT: v22.w = vadd(v11.w,v10.w) +; CHECK-NEXT: q3 = vcmp.eq(v21.w,v0.w) +; CHECK-NEXT: v24 = vand(v11,v13) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v23.uw = vlsr(v31.uw,r2) +; CHECK-NEXT: v29 = vmux(q3,v0,v1) +; CHECK-NEXT: q3 = vcmp.eq(v24.w,v0.w) +; CHECK-NEXT: q2 = vcmp.gt(v7.uw,v31.uw) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v28.uw = vlsr(v11.uw,r2) +; CHECK-NEXT: v27 = vmux(q3,v0,v1) +; CHECK-NEXT: v19.w = vadd(v23.w,v29.w) +; CHECK-NEXT: v31 = vmux(q2,v1,v0) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v30.uw = vlsr(v22.uw,r2) +; CHECK-NEXT: v13 = vand(v8,v13) +; CHECK-NEXT: v26 = 
vmux(q0,v0,v1) +; CHECK-NEXT: v12.w = vsub(v31.w,v12.w) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v20.uw = vlsr(v7.uw,r2) +; CHECK-NEXT: q3 = vcmp.eq(v28.w,v30.w) +; CHECK-NEXT: v28.w = vadd(v30.w,v27.w) +; CHECK-NEXT: v31 = vmux(q1,v15,v0) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v29.uw = vlsr(v30.uw,r0) +; CHECK-NEXT: v30.w = vadd(v6.w,v10.w) +; CHECK-NEXT: q2 = vcmp.eq(v20.w,v23.w) +; CHECK-NEXT: v10.w = vadd(v8.w,v10.w) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v7.uw = vlsr(v28.uw,r0) +; CHECK-NEXT: q0 = vcmp.gt(v8.uw,v10.uw) +; CHECK-NEXT: v12.w = vadd(v12.w,v17.w) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v23.uw = vlsr(v23.uw,r0) +; CHECK-NEXT: v7 = vmux(q3,v7,v29) +; CHECK-NEXT: q3 = vcmp.eq(v13.w,v0.w) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v19.uw = vlsr(v19.uw,r0) +; CHECK-NEXT: v29 = vmux(q3,v0,v1) +; CHECK-NEXT: v7 = vor(v31,v7) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v25.uw = vlsr(v30.uw,r2) +; CHECK-NEXT: v19 = vmux(q2,v19,v23) +; CHECK-NEXT: q2 = vcmp.gt(v11.uw,v22.uw) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v10.uw = vlsr(v10.uw,r2) +; CHECK-NEXT: v27 = vmux(q2,v1,v0) +; CHECK-NEXT: q2 = vcmp.gt(v6.uw,v30.uw) +; CHECK-NEXT: v28.w = vadd(v25.w,v26.w) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v6.uw = vlsr(v6.uw,r2) +; CHECK-NEXT: v31 = vmux(q2,v1,v0) +; CHECK-NEXT: v1 = vmux(q0,v1,v0) +; CHECK-NEXT: v30.w = vadd(v10.w,v29.w) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v24.uw = vlsr(v8.uw,r2) +; CHECK-NEXT: v1.w = vsub(v1.w,v14.w) +; CHECK-NEXT: q3 = vcmp.eq(v6.w,v25.w) +; CHECK-NEXT: v21.w = vsub(v31.w,v9.w) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v8.uw = vlsr(v28.uw,r0) +; CHECK-NEXT: v6.w = vadd(v21.w,v17.w) +; CHECK-NEXT: v1.w = vadd(v1.w,v17.w) +; CHECK-NEXT: q0 = vcmp.eq(v24.w,v10.w) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v22.uw = vlsr(v25.uw,r0) +; CHECK-NEXT: v13.w = vsub(v27.w,v16.w) +; CHECK-NEXT: q2 = vcmp.gt(v0.w,v3.w) +; CHECK-NEXT: v18 = vor(v18,v19) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v23.uw = vlsr(v30.uw,r0) +; CHECK-NEXT: v8 = vmux(q3,v8,v22) +; CHECK-NEXT: q3 = vcmp.gt(v0.w,v2.w) +; CHECK-NEXT: v26 = vmux(q2,v15,v0) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v24.uw = vlsr(v10.uw,r0) +; CHECK-NEXT: v25.w = vadd(v13.w,v17.w) +; CHECK-NEXT: v27 = vmux(q3,v15,v0) +; CHECK-NEXT: v8 = vor(v26,v8) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v6.w = vasl(v6.w,r4) +; CHECK-NEXT: v9 = vmux(q0,v23,v24) +; CHECK-NEXT: q2 = vcmp.eq(v3.w,v0.w) +; CHECK-NEXT: q3 = vcmp.eq(v2.w,v0.w) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v1.w = vasl(v1.w,r4) +; CHECK-NEXT: v9 = vor(v27,v9) +; CHECK-NEXT: v6 = vor(v8,v6) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v12.w = vasl(v12.w,r4) +; CHECK-NEXT: v1 = vor(v9,v1) +; CHECK-NEXT: v29 = vmux(q2,v0,v6) +; CHECK-NEXT: vmem(r1+#1) = v29.new +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v28.w = vasl(v25.w,r4) +; CHECK-NEXT: v1 = vmux(q3,v0,v1) +; CHECK-NEXT: q2 = vcmp.eq(v5.w,v0.w) +; CHECK-NEXT: vmem(r1+#0) = v1.new +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v30 = vor(v7,v28) +; CHECK-NEXT: v31 = vor(v18,v12) +; CHECK-NEXT: q3 = vcmp.eq(v4.w,v0.w) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v2 = vmux(q2,v0,v30) +; CHECK-NEXT: v0 = vmux(q3,v0,v31) +; CHECK-NEXT: vmem(r1+#3) = v2.new +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: jumpr r31 +; CHECK-NEXT: vmem(r1+#2) = v0 +; CHECK-NEXT: } + %v0 = load <128 x i8>, ptr %a0, align 128 + %v1 = sitofp <128 x i8> %v0 to <128 x float> + store <128 x float> %v1, ptr 
%a1, align 128 + ret void +} + +; Widen input #1 +define void @s8f32_1(ptr %a0, ptr %a1) #0 { +; CHECK-LABEL: s8f32_1: +; CHECK: .cfi_startproc +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: { +; CHECK-NEXT: r0 = #1 +; CHECK-NEXT: v3:2.h = vunpack(v0.b) +; CHECK-NEXT: v0.cur = vmem(r0+#0) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v1 = vsplat(r0) +; CHECK-NEXT: r3:2 = combine(##255,#8) +; CHECK-NEXT: r6 = #512 +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v7 = vsplat(r3) +; CHECK-NEXT: v3:2.w = vunpack(v2.h) +; CHECK-NEXT: v22 = vxor(v22,v22) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v10 = vsplat(r6) +; CHECK-NEXT: r7 = ##-2147483648 +; CHECK-NEXT: r5 = #159 +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v9 = vsplat(r7) +; CHECK-NEXT: v4.w = vabs(v2.w) +; CHECK-NEXT: v5.w = vabs(v3.w) +; CHECK-NEXT: q0 = vcmp.gt(v22.w,v2.w) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v12 = vsplat(r5) +; CHECK-NEXT: r4 = #23 +; CHECK-NEXT: v11 = vmux(q0,v9,v22) +; CHECK-NEXT: q0 = vcmp.gt(v22.w,v3.w) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v6.uw = vcl0(v4.uw) +; CHECK-NEXT: v30 = vmux(q0,v9,v22) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v8.uw = vcl0(v5.uw) +; CHECK-NEXT: v6.w = vadd(v6.w,v1.w) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v8.w = vadd(v8.w,v1.w) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v4.w = vasl(v4.w,v6.w) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v5.w = vasl(v5.w,v8.w) +; CHECK-NEXT: v13 = vand(v4,v10) +; CHECK-NEXT: v14.w = vadd(v4.w,v7.w) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v10 = vand(v5,v10) +; CHECK-NEXT: v7.w = vadd(v5.w,v7.w) +; CHECK-NEXT: q2 = vcmp.gt(v4.uw,v14.uw) +; CHECK-NEXT: q1 = vcmp.eq(v13.w,v22.w) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v14.uw = vlsr(v14.uw,r2) +; CHECK-NEXT: q3 = vcmp.eq(v10.w,v22.w) +; CHECK-NEXT: v25 = vmux(q2,v1,v22) +; CHECK-NEXT: q2 = vcmp.gt(v5.uw,v7.uw) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v7.uw = vlsr(v7.uw,r2) +; CHECK-NEXT: v26 = vmux(q1,v22,v1) +; CHECK-NEXT: v27 = vmux(q3,v22,v1) +; CHECK-NEXT: v1 = vmux(q2,v1,v22) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v24.uw = vlsr(v5.uw,r2) +; CHECK-NEXT: v5.w = vadd(v14.w,v26.w) +; CHECK-NEXT: v29.w = vadd(v7.w,v27.w) +; CHECK-NEXT: v6.w = vsub(v25.w,v6.w) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v23.uw = vlsr(v4.uw,r2) +; CHECK-NEXT: v1.w = vsub(v1.w,v8.w) +; CHECK-NEXT: v6.w = vadd(v6.w,v12.w) +; CHECK-NEXT: q3 = vcmp.eq(v24.w,v7.w) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v28.uw = vlsr(v14.uw,r0) +; CHECK-NEXT: v1.w = vadd(v1.w,v12.w) +; CHECK-NEXT: q1 = vcmp.eq(v23.w,v14.w) +; CHECK-NEXT: q2 = vcmp.eq(v3.w,v22.w) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v5.uw = vlsr(v5.uw,r0) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v7.uw = vlsr(v7.uw,r0) +; CHECK-NEXT: v5 = vmux(q1,v5,v28) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v4.uw = vlsr(v29.uw,r0) +; CHECK-NEXT: v5 = vor(v11,v5) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v6.w = vasl(v6.w,r4) +; CHECK-NEXT: v4 = vmux(q3,v4,v7) +; CHECK-NEXT: q3 = vcmp.eq(v2.w,v22.w) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v1.w = vasl(v1.w,r4) +; CHECK-NEXT: v4 = vor(v30,v4) +; CHECK-NEXT: v31 = vor(v5,v6) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v1 = vor(v4,v1) +; CHECK-NEXT: v0 = vmux(q3,v22,v31) +; CHECK-NEXT: vmem(r1+#0) = v0.new +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v1 = vmux(q2,v22,v1) +; CHECK-NEXT: jumpr r31 +; CHECK-NEXT: vmem(r1+#1) = v1.new +; CHECK-NEXT: } + %v0 = load <64 x i8>, ptr %a0, align 128 + %v1 
= sitofp <64 x i8> %v0 to <64 x float> + store <64 x float> %v1, ptr %a1, align 128 + ret void +} + +; Widen input #2 +define void @s8f32_2(ptr %a0, ptr %a1) #0 { +; CHECK-LABEL: s8f32_2: +; CHECK: .cfi_startproc +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: { +; CHECK-NEXT: v0 = vmem(r0+#0) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: r0 = #1 +; CHECK-NEXT: r3 = #512 +; CHECK-NEXT: v1:0.h = vunpack(v0.b) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v2 = vsplat(r0) +; CHECK-NEXT: v4 = vsplat(r3) +; CHECK-NEXT: r2 = #255 +; CHECK-NEXT: v3 = vxor(v3,v3) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: r7:6 = combine(##-2147483648,#8) +; CHECK-NEXT: r4 = #159 +; CHECK-NEXT: v1:0.w = vunpack(v0.h) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v1 = vsplat(r2) +; CHECK-NEXT: v8 = vsplat(r4) +; CHECK-NEXT: v5.w = vabs(v0.w) +; CHECK-NEXT: q2 = vcmp.gt(v3.w,v0.w) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v7 = vsplat(r7) +; CHECK-NEXT: r2 = #23 +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v6.uw = vcl0(v5.uw) +; CHECK-NEXT: v30 = vmux(q2,v7,v3) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v6.w = vadd(v6.w,v2.w) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v5.w = vasl(v5.w,v6.w) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v1.w = vadd(v5.w,v1.w) +; CHECK-NEXT: v4 = vand(v5,v4) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v5.uw = vlsr(v5.uw,r6) +; CHECK-NEXT: q0 = vcmp.eq(v4.w,v3.w) +; CHECK-NEXT: q1 = vcmp.gt(v5.uw,v1.uw) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v1.uw = vlsr(v1.uw,r6) +; CHECK-NEXT: v4 = vmux(q0,v3,v2) +; CHECK-NEXT: v2 = vmux(q1,v2,v3) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v4.w = vadd(v1.w,v4.w) +; CHECK-NEXT: v2.w = vsub(v2.w,v6.w) +; CHECK-NEXT: q3 = vcmp.eq(v5.w,v1.w) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v28.uw = vlsr(v1.uw,r0) +; CHECK-NEXT: v2.w = vadd(v2.w,v8.w) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v29.uw = vlsr(v4.uw,r0) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v2.w = vasl(v2.w,r2) +; CHECK-NEXT: v1 = vmux(q3,v29,v28) +; CHECK-NEXT: q3 = vcmp.eq(v0.w,v3.w) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v1 = vor(v30,v1) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v31 = vor(v1,v2) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v0 = vmux(q3,v3,v31) +; CHECK-NEXT: jumpr r31 +; CHECK-NEXT: vmem(r1+#0) = v0.new +; CHECK-NEXT: } + %v0 = load <32 x i8>, ptr %a0, align 128 + %v1 = sitofp <32 x i8> %v0 to <32 x float> + store <32 x float> %v1, ptr %a1, align 128 + ret void +} + + +; s16 -> f16 +; No widening +define void @s16f16_0(ptr %a0, ptr %a1) #0 { +; CHECK-LABEL: s16f16_0: +; CHECK: .cfi_startproc +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: { +; CHECK-NEXT: r6 = #1 +; CHECK-NEXT: r3:2 = combine(#64,#31) +; CHECK-NEXT: v1.h = vabs(v0.h) +; CHECK-NEXT: v0.cur = vmem(r0+#0) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v3.h = vsplat(r6) +; CHECK-NEXT: v5.h = vsplat(r2) +; CHECK-NEXT: v2 = vxor(v2,v2) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v6.h = vsplat(r3) +; CHECK-NEXT: r5:4 = combine(##32768,#5) +; CHECK-NEXT: v4.uh = vcl0(v1.uh) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v8.h = vsplat(r5) +; CHECK-NEXT: r2 = #10 +; CHECK-NEXT: v4.h = vadd(v4.h,v3.h) +; CHECK-NEXT: q3 = vcmp.eq(v0.h,v2.h) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v1.h = vasl(v1.h,v4.h) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v7.h = vadd(v1.h,v5.h) +; CHECK-NEXT: v6 = vand(v1,v6) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v1.uh = vlsr(v1.uh,r4) +; CHECK-NEXT: q0 = vcmp.eq(v6.h,v2.h) +; 
CHECK-NEXT: q1 = vcmp.gt(v1.uh,v7.uh) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v25.uh = vlsr(v7.uh,r4) +; CHECK-NEXT: v26 = vmux(q0,v2,v3) +; CHECK-NEXT: v3 = vmux(q1,v3,v2) +; CHECK-NEXT: q1 = vcmp.gt(v2.h,v0.h) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v7.h = vadd(v25.h,v26.h) +; CHECK-NEXT: v3.h = vadd(v3.h,v5.h) +; CHECK-NEXT: q2 = vcmp.eq(v1.h,v25.h) +; CHECK-NEXT: v30 = vmux(q1,v8,v2) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v27.uh = vlsr(v25.uh,r6) +; CHECK-NEXT: v28.h = vsub(v3.h,v4.h) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v29.uh = vlsr(v7.uh,r6) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v1.h = vasl(v28.h,r2) +; CHECK-NEXT: v3 = vmux(q2,v29,v27) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v3 = vor(v30,v3) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v31 = vor(v3,v1) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v0 = vmux(q3,v2,v31) +; CHECK-NEXT: jumpr r31 +; CHECK-NEXT: vmem(r1+#0) = v0.new +; CHECK-NEXT: } + %v0 = load <64 x i16>, ptr %a0, align 128 + %v1 = sitofp <64 x i16> %v0 to <64 x half> + store <64 x half> %v1, ptr %a1, align 128 + ret void +} + +; Widen input and result +define void @s16f16_1(ptr %a0, ptr %a1) #0 { +; CHECK-LABEL: s16f16_1: +; CHECK: .cfi_startproc +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: { +; CHECK-NEXT: r7 = #1 +; CHECK-NEXT: r3:2 = combine(#31,#64) +; CHECK-NEXT: v1.h = vabs(v0.h) +; CHECK-NEXT: v0.cur = vmem(r0+#0) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v2.h = vsplat(r7) +; CHECK-NEXT: v5.h = vsplat(r3) +; CHECK-NEXT: r6 = #5 +; CHECK-NEXT: v3 = vxor(v3,v3) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v6.h = vsplat(r2) +; CHECK-NEXT: r4 = ##32768 +; CHECK-NEXT: v4.uh = vcl0(v1.uh) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v8.h = vsplat(r4) +; CHECK-NEXT: r3 = #10 +; CHECK-NEXT: q2 = vcmp.gt(v3.h,v0.h) +; CHECK-NEXT: v4.h = vadd(v4.h,v2.h) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v30 = vmux(q2,v8,v3) +; CHECK-NEXT: q2 = vcmp.eq(v0.h,v3.h) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v1.h = vasl(v1.h,v4.h) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v7.h = vadd(v1.h,v5.h) +; CHECK-NEXT: v6 = vand(v1,v6) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v1.uh = vlsr(v1.uh,r6) +; CHECK-NEXT: q1 = vcmp.eq(v6.h,v3.h) +; CHECK-NEXT: q0 = vcmp.gt(v1.uh,v7.uh) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v25.uh = vlsr(v7.uh,r6) +; CHECK-NEXT: v26 = vmux(q1,v3,v2) +; CHECK-NEXT: v2 = vmux(q0,v2,v3) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v7.h = vadd(v25.h,v26.h) +; CHECK-NEXT: v2.h = vadd(v2.h,v5.h) +; CHECK-NEXT: q3 = vcmp.eq(v1.h,v25.h) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v27.uh = vlsr(v25.uh,r7) +; CHECK-NEXT: v28.h = vsub(v2.h,v4.h) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v29.uh = vlsr(v7.uh,r7) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v1.h = vasl(v28.h,r3) +; CHECK-NEXT: q3 = vsetq(r2) +; CHECK-NEXT: v2 = vmux(q3,v29,v27) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v2 = vor(v30,v2) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v31 = vor(v2,v1) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v0 = vmux(q2,v3,v31) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: jumpr r31 +; CHECK-NEXT: if (q3) vmem(r1+#0) = v0 +; CHECK-NEXT: } + %v0 = load <32 x i16>, ptr %a0, align 128 + %v1 = sitofp <32 x i16> %v0 to <32 x half> + store <32 x half> %v1, ptr %a1, align 128 + ret void +} + + +; s16 -> f32 +; No widening +define void @s16f32_0(ptr %a0, ptr %a1) #0 { +; CHECK-LABEL: s16f32_0: +; CHECK: .cfi_startproc +; CHECK-NEXT: // 
%bb.0: +; CHECK-NEXT: { +; CHECK-NEXT: r2 = ##.LCPI7_0 +; CHECK-NEXT: v1:0.w = vunpack(v2.h) +; CHECK-NEXT: v2.cur = vmem(r0+#0) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: r0 = #1 +; CHECK-NEXT: v4.w = vabs(v0.w) +; CHECK-NEXT: v1 = vmem(r2+#0) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v5 = vsplat(r0) +; CHECK-NEXT: r5:4 = combine(##255,#8) +; CHECK-NEXT: v1 = vdelta(v2,v1) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v20 = vsplat(r5) +; CHECK-NEXT: r7 = #512 +; CHECK-NEXT: v6.uw = vcl0(v4.uw) +; CHECK-NEXT: v10 = vxor(v10,v10) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v7 = vsplat(r7) +; CHECK-NEXT: r6 = #159 +; CHECK-NEXT: r5 = ##-2147483648 +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v23 = vsplat(r6) +; CHECK-NEXT: v3:2.w = vunpack(v1.h) +; CHECK-NEXT: v19.w = vadd(v6.w,v5.w) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v28 = vsplat(r5) +; CHECK-NEXT: v3.w = vabs(v2.w) +; CHECK-NEXT: q0 = vcmp.gt(v10.w,v2.w) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v4.w = vasl(v4.w,v19.w) +; CHECK-NEXT: v29 = vmux(q0,v28,v10) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v8.uw = vcl0(v3.uw) +; CHECK-NEXT: v9.w = vadd(v4.w,v20.w) +; CHECK-NEXT: v11 = vand(v4,v7) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v12.uw = vlsr(v4.uw,r4) +; CHECK-NEXT: v8.w = vadd(v8.w,v5.w) +; CHECK-NEXT: q2 = vcmp.gt(v4.uw,v9.uw) +; CHECK-NEXT: q1 = vcmp.eq(v11.w,v10.w) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v21.uw = vlsr(v9.uw,r4) +; CHECK-NEXT: v9 = vmux(q2,v5,v10) +; CHECK-NEXT: v22 = vmux(q1,v10,v5) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v3.w = vasl(v3.w,v8.w) +; CHECK-NEXT: v4.w = vadd(v21.w,v22.w) +; CHECK-NEXT: v1.w = vsub(v9.w,v19.w) +; CHECK-NEXT: q1 = vcmp.eq(v12.w,v21.w) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v27.uw = vlsr(v21.uw,r0) +; CHECK-NEXT: v6.w = vadd(v3.w,v20.w) +; CHECK-NEXT: v7 = vand(v3,v7) +; CHECK-NEXT: v1.w = vadd(v1.w,v23.w) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v24.uw = vlsr(v3.uw,r4) +; CHECK-NEXT: q2 = vcmp.eq(v7.w,v10.w) +; CHECK-NEXT: q3 = vcmp.gt(v3.uw,v6.uw) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v25.uw = vlsr(v6.uw,r4) +; CHECK-NEXT: v26 = vmux(q2,v10,v5) +; CHECK-NEXT: v5 = vmux(q3,v5,v10) +; CHECK-NEXT: q3 = vcmp.gt(v10.w,v0.w) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v4.uw = vlsr(v4.uw,r0) +; CHECK-NEXT: v6.w = vadd(v25.w,v26.w) +; CHECK-NEXT: v5.w = vsub(v5.w,v8.w) +; CHECK-NEXT: q2 = vcmp.eq(v24.w,v25.w) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: r4 = #23 +; CHECK-NEXT: v3.uw = vlsr(v25.uw,r0) +; CHECK-NEXT: v5.w = vadd(v5.w,v23.w) +; CHECK-NEXT: v30 = vmux(q3,v28,v10) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v6.uw = vlsr(v6.uw,r0) +; CHECK-NEXT: v4 = vmux(q1,v4,v27) +; CHECK-NEXT: q3 = vcmp.eq(v2.w,v10.w) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v5.w = vasl(v5.w,r4) +; CHECK-NEXT: v3 = vmux(q2,v6,v3) +; CHECK-NEXT: v4 = vor(v30,v4) +; CHECK-NEXT: q2 = vcmp.eq(v0.w,v10.w) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v1.w = vasl(v1.w,r4) +; CHECK-NEXT: v3 = vor(v29,v3) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v1 = vor(v4,v1) +; CHECK-NEXT: v31 = vor(v3,v5) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v1 = vmux(q2,v10,v1) +; CHECK-NEXT: v0 = vmux(q3,v10,v31) +; CHECK-NEXT: vmem(r1+#0) = v1.new +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: jumpr r31 +; CHECK-NEXT: vmem(r1+#1) = v0 +; CHECK-NEXT: } + %v0 = load <64 x i16>, ptr %a0, align 128 + %v1 = sitofp <64 x i16> %v0 to <64 x float> + store <64 x float> %v1, ptr %a1, align 128 + ret void +} + 
+; Widen input +define void @s16f32_1(ptr %a0, ptr %a1) #0 { +; CHECK-LABEL: s16f32_1: +; CHECK: .cfi_startproc +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: { +; CHECK-NEXT: v0 = vmem(r0+#0) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: r0 = #1 +; CHECK-NEXT: r2 = #255 +; CHECK-NEXT: v1:0.w = vunpack(v0.h) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v3 = vsplat(r0) +; CHECK-NEXT: v4 = vsplat(r2) +; CHECK-NEXT: r3 = #512 +; CHECK-NEXT: v2.w = vabs(v0.w) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v6 = vsplat(r3) +; CHECK-NEXT: r7:6 = combine(##-2147483648,#8) +; CHECK-NEXT: v1 = vxor(v1,v1) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: r4 = #159 +; CHECK-NEXT: v5.uw = vcl0(v2.uw) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v7 = vsplat(r4) +; CHECK-NEXT: v29 = vsplat(r7) +; CHECK-NEXT: q2 = vcmp.gt(v1.w,v0.w) +; CHECK-NEXT: v5.w = vadd(v5.w,v3.w) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: r2 = #23 +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v2.w = vasl(v2.w,v5.w) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v4.w = vadd(v2.w,v4.w) +; CHECK-NEXT: v6 = vand(v2,v6) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v2.uw = vlsr(v2.uw,r6) +; CHECK-NEXT: q0 = vcmp.eq(v6.w,v1.w) +; CHECK-NEXT: q1 = vcmp.gt(v2.uw,v4.uw) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v4.uw = vlsr(v4.uw,r6) +; CHECK-NEXT: v6 = vmux(q0,v1,v3) +; CHECK-NEXT: v3 = vmux(q1,v3,v1) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v6.w = vadd(v4.w,v6.w) +; CHECK-NEXT: v27.w = vsub(v3.w,v5.w) +; CHECK-NEXT: q3 = vcmp.eq(v2.w,v4.w) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v28.uw = vlsr(v4.uw,r0) +; CHECK-NEXT: v2.w = vadd(v27.w,v7.w) +; CHECK-NEXT: v4 = vmux(q2,v29,v1) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v30.uw = vlsr(v6.uw,r0) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v2.w = vasl(v2.w,r2) +; CHECK-NEXT: v3 = vmux(q3,v30,v28) +; CHECK-NEXT: q3 = vcmp.eq(v0.w,v1.w) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v3 = vor(v4,v3) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v31 = vor(v3,v2) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v0 = vmux(q3,v1,v31) +; CHECK-NEXT: jumpr r31 +; CHECK-NEXT: vmem(r1+#0) = v0.new +; CHECK-NEXT: } + %v0 = load <32 x i16>, ptr %a0, align 128 + %v1 = sitofp <32 x i16> %v0 to <32 x float> + store <32 x float> %v1, ptr %a1, align 128 + ret void +} + + +; s32 -> f16 +; No widening +define void @s32f16_0(ptr %a0, ptr %a1) #0 { +; CHECK-LABEL: s32f16_0: +; CHECK: .cfi_startproc +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: { +; CHECK-NEXT: r3:2 = combine(#8,#1) +; CHECK-NEXT: r6 = #255 +; CHECK-NEXT: v2.w = vabs(v1.w) +; CHECK-NEXT: v1.cur = vmem(r0+#0) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v4 = vsplat(r2) +; CHECK-NEXT: r4 = #512 +; CHECK-NEXT: v3.w = vabs(v0.w) +; CHECK-NEXT: v0.cur = vmem(r0+#1) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v9 = vsplat(r4) +; CHECK-NEXT: v8 = vsplat(r6) +; CHECK-NEXT: v5.uw = vcl0(v2.uw) +; CHECK-NEXT: v7 = vxor(v7,v7) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: r4 = #159 +; CHECK-NEXT: v6.uw = vcl0(v3.uw) +; CHECK-NEXT: v5.w = vadd(v5.w,v4.w) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v28 = vsplat(r4) +; CHECK-NEXT: r5 = ##-2147483648 +; CHECK-NEXT: v6.w = vadd(v6.w,v4.w) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v13 = vsplat(r5) +; CHECK-NEXT: v2.w = vasl(v2.w,v5.w) +; CHECK-NEXT: q0 = vcmp.gt(v7.w,v1.w) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v3.w = vasl(v3.w,v6.w) +; CHECK-NEXT: v27 = vmux(q0,v13,v7) +; CHECK-NEXT: v10.w = vadd(v2.w,v8.w) +; CHECK-NEXT: v11 = 
vand(v2,v9) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v9 = vand(v3,v9) +; CHECK-NEXT: q1 = vcmp.eq(v11.w,v7.w) +; CHECK-NEXT: v8.w = vadd(v3.w,v8.w) +; CHECK-NEXT: q2 = vcmp.gt(v2.uw,v10.uw) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v12.uw = vlsr(v2.uw,r3) +; CHECK-NEXT: q3 = vcmp.eq(v9.w,v7.w) +; CHECK-NEXT: v23 = vmux(q1,v7,v4) +; CHECK-NEXT: q1 = vcmp.gt(v3.uw,v8.uw) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v2.uw = vlsr(v10.uw,r3) +; CHECK-NEXT: v25 = vmux(q3,v7,v4) +; CHECK-NEXT: v24 = vmux(q2,v4,v7) +; CHECK-NEXT: v4 = vmux(q1,v4,v7) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v8.uw = vlsr(v8.uw,r3) +; CHECK-NEXT: v9.w = vadd(v2.w,v23.w) +; CHECK-NEXT: v5.w = vsub(v24.w,v5.w) +; CHECK-NEXT: v4.w = vsub(v4.w,v6.w) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v3.uw = vlsr(v3.uw,r3) +; CHECK-NEXT: v26.w = vadd(v8.w,v25.w) +; CHECK-NEXT: q3 = vcmp.eq(v12.w,v2.w) +; CHECK-NEXT: v5.w = vadd(v5.w,v28.w) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: r3 = #23 +; CHECK-NEXT: v2.uw = vlsr(v2.uw,r2) +; CHECK-NEXT: q2 = vcmp.eq(v3.w,v8.w) +; CHECK-NEXT: v4.w = vadd(v4.w,v28.w) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v9.uw = vlsr(v9.uw,r2) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v29.uw = vlsr(v26.uw,r2) +; CHECK-NEXT: v2 = vmux(q3,v9,v2) +; CHECK-NEXT: q3 = vcmp.gt(v7.w,v0.w) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v3.uw = vlsr(v8.uw,r2) +; CHECK-NEXT: v30 = vmux(q3,v13,v7) +; CHECK-NEXT: v2 = vor(v27,v2) +; CHECK-NEXT: q3 = vcmp.eq(v0.w,v7.w) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v5.w = vasl(v5.w,r3) +; CHECK-NEXT: v3 = vmux(q2,v29,v3) +; CHECK-NEXT: q2 = vcmp.eq(v1.w,v7.w) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v3.w = vasl(v4.w,r3) +; CHECK-NEXT: v31 = vor(v30,v3) +; CHECK-NEXT: v2 = vor(v2,v5) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v1 = vor(v31,v3) +; CHECK-NEXT: v2 = vmux(q2,v7,v2) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v0 = vmux(q3,v7,v1) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v2.qf32 = vadd(v2.sf,v7.sf) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v3.qf32 = vadd(v0.sf,v7.sf) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v0.hf = v3:2.qf32 +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v0.h = vdeal(v0.h) +; CHECK-NEXT: jumpr r31 +; CHECK-NEXT: vmem(r1+#0) = v0.new +; CHECK-NEXT: } + %v0 = load <64 x i32>, ptr %a0, align 128 + %v1 = sitofp <64 x i32> %v0 to <64 x half> + store <64 x half> %v1, ptr %a1, align 128 + ret void +} + +; Widen result +define void @s32f16_1(ptr %a0, ptr %a1) #0 { +; CHECK-LABEL: s32f16_1: +; CHECK: .cfi_startproc +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: { +; CHECK-NEXT: r6 = #1 +; CHECK-NEXT: v1.w = vabs(v0.w) +; CHECK-NEXT: v0.cur = vmem(r0+#0) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v2 = vsplat(r6) +; CHECK-NEXT: r3:2 = combine(##255,#8) +; CHECK-NEXT: r4 = #512 +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v5 = vsplat(r3) +; CHECK-NEXT: v6 = vsplat(r4) +; CHECK-NEXT: v4.uw = vcl0(v1.uw) +; CHECK-NEXT: v3 = vxor(v3,v3) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: r5 = #159 +; CHECK-NEXT: r4 = ##-2147483648 +; CHECK-NEXT: v4.w = vadd(v4.w,v2.w) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v28 = vsplat(r5) +; CHECK-NEXT: v29 = vsplat(r4) +; CHECK-NEXT: q3 = vcmp.gt(v3.w,v0.w) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: r3 = #23 +; CHECK-NEXT: v1.w = vasl(v1.w,v4.w) +; CHECK-NEXT: v31 = vmux(q3,v29,v3) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v5.w = vadd(v1.w,v5.w) +; CHECK-NEXT: v6 = vand(v1,v6) +; CHECK-NEXT: } +; 
CHECK-NEXT: { +; CHECK-NEXT: v7.uw = vlsr(v1.uw,r2) +; CHECK-NEXT: q0 = vcmp.eq(v6.w,v3.w) +; CHECK-NEXT: q1 = vcmp.gt(v1.uw,v5.uw) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: r2 = #64 +; CHECK-NEXT: v1.uw = vlsr(v5.uw,r2) +; CHECK-NEXT: v27 = vmux(q0,v3,v2) +; CHECK-NEXT: v2 = vmux(q1,v2,v3) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: q3 = vsetq(r2) +; CHECK-NEXT: v5.w = vadd(v1.w,v27.w) +; CHECK-NEXT: v2.w = vsub(v2.w,v4.w) +; CHECK-NEXT: q2 = vcmp.eq(v7.w,v1.w) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v1.uw = vlsr(v1.uw,r6) +; CHECK-NEXT: v2.w = vadd(v2.w,v28.w) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v30.uw = vlsr(v5.uw,r6) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v2.w = vasl(v2.w,r3) +; CHECK-NEXT: v1 = vmux(q2,v30,v1) +; CHECK-NEXT: q2 = vcmp.eq(v0.w,v3.w) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v1 = vor(v31,v1) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v1.qf32 = vadd(v3.sf,v3.sf) +; CHECK-NEXT: v0 = vor(v1,v2) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v0 = vmux(q2,v3,v0) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v0.qf32 = vadd(v0.sf,v3.sf) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v0.hf = v1:0.qf32 +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v0.h = vdeal(v0.h) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: jumpr r31 +; CHECK-NEXT: if (q3) vmem(r1+#0) = v0 +; CHECK-NEXT: } + %v0 = load <32 x i32>, ptr %a0, align 128 + %v1 = sitofp <32 x i32> %v0 to <32 x half> + store <32 x half> %v1, ptr %a1, align 128 + ret void +} + +; s32 -> f32 +; No widening +define void @s32f32_0(ptr %a0, ptr %a1) #0 { +; CHECK-LABEL: s32f32_0: +; CHECK: .cfi_startproc +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: { +; CHECK-NEXT: r0 = #1 +; CHECK-NEXT: r2 = #255 +; CHECK-NEXT: v1.w = vabs(v0.w) +; CHECK-NEXT: v0.cur = vmem(r0+#0) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v3 = vsplat(r0) +; CHECK-NEXT: v5 = vsplat(r2) +; CHECK-NEXT: r3 = #512 +; CHECK-NEXT: v2 = vxor(v2,v2) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v6 = vsplat(r3) +; CHECK-NEXT: r7:6 = combine(##-2147483648,#8) +; CHECK-NEXT: v4.uw = vcl0(v1.uw) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: r4 = #159 +; CHECK-NEXT: v4.w = vadd(v4.w,v3.w) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v7 = vsplat(r4) +; CHECK-NEXT: v29 = vsplat(r7) +; CHECK-NEXT: r2 = #23 +; CHECK-NEXT: q2 = vcmp.gt(v2.w,v0.w) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v1.w = vasl(v1.w,v4.w) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v5.w = vadd(v1.w,v5.w) +; CHECK-NEXT: v6 = vand(v1,v6) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v1.uw = vlsr(v1.uw,r6) +; CHECK-NEXT: q0 = vcmp.eq(v6.w,v2.w) +; CHECK-NEXT: q1 = vcmp.gt(v1.uw,v5.uw) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v5.uw = vlsr(v5.uw,r6) +; CHECK-NEXT: v6 = vmux(q0,v2,v3) +; CHECK-NEXT: v3 = vmux(q1,v3,v2) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v6.w = vadd(v5.w,v6.w) +; CHECK-NEXT: v27.w = vsub(v3.w,v4.w) +; CHECK-NEXT: q3 = vcmp.eq(v1.w,v5.w) +; CHECK-NEXT: v4 = vmux(q2,v29,v2) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v28.uw = vlsr(v5.uw,r0) +; CHECK-NEXT: v1.w = vadd(v27.w,v7.w) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v30.uw = vlsr(v6.uw,r0) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v1.w = vasl(v1.w,r2) +; CHECK-NEXT: v3 = vmux(q3,v30,v28) +; CHECK-NEXT: q3 = vcmp.eq(v0.w,v2.w) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v3 = vor(v4,v3) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v31 = vor(v3,v1) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v0 = 
vmux(q3,v2,v31) +; CHECK-NEXT: jumpr r31 +; CHECK-NEXT: vmem(r1+#0) = v0.new +; CHECK-NEXT: } + %v0 = load <32 x i32>, ptr %a0, align 128 + %v1 = sitofp <32 x i32> %v0 to <32 x float> + store <32 x float> %v1, ptr %a1, align 128 + ret void +} + +; Widen input and result +define void @s32f32_1(ptr %a0, ptr %a1) #0 { +; CHECK-LABEL: s32f32_1: +; CHECK: .cfi_startproc +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: { +; CHECK-NEXT: r0 = #1 +; CHECK-NEXT: r2 = #255 +; CHECK-NEXT: v1.w = vabs(v0.w) +; CHECK-NEXT: v0.cur = vmem(r0+#0) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v2 = vsplat(r0) +; CHECK-NEXT: v5 = vsplat(r2) +; CHECK-NEXT: r3 = #512 +; CHECK-NEXT: v3 = vxor(v3,v3) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v6 = vsplat(r3) +; CHECK-NEXT: r7:6 = combine(##-2147483648,#8) +; CHECK-NEXT: v4.uw = vcl0(v1.uw) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: r4 = #159 +; CHECK-NEXT: v4.w = vadd(v4.w,v2.w) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v7 = vsplat(r4) +; CHECK-NEXT: v29 = vsplat(r7) +; CHECK-NEXT: r3 = #23 +; CHECK-NEXT: q3 = vcmp.gt(v3.w,v0.w) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: r2 = #64 +; CHECK-NEXT: v1.w = vasl(v1.w,v4.w) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v5.w = vadd(v1.w,v5.w) +; CHECK-NEXT: v6 = vand(v1,v6) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v1.uw = vlsr(v1.uw,r6) +; CHECK-NEXT: q0 = vcmp.eq(v6.w,v3.w) +; CHECK-NEXT: q1 = vcmp.gt(v1.uw,v5.uw) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v5.uw = vlsr(v5.uw,r6) +; CHECK-NEXT: v6 = vmux(q0,v3,v2) +; CHECK-NEXT: v2 = vmux(q1,v2,v3) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v6.w = vadd(v5.w,v6.w) +; CHECK-NEXT: v27.w = vsub(v2.w,v4.w) +; CHECK-NEXT: q2 = vcmp.eq(v1.w,v5.w) +; CHECK-NEXT: v4 = vmux(q3,v29,v3) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v28.uw = vlsr(v5.uw,r0) +; CHECK-NEXT: q3 = vsetq(r2) +; CHECK-NEXT: v1.w = vadd(v27.w,v7.w) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v30.uw = vlsr(v6.uw,r0) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v1.w = vasl(v1.w,r3) +; CHECK-NEXT: v2 = vmux(q2,v30,v28) +; CHECK-NEXT: q2 = vcmp.eq(v0.w,v3.w) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v2 = vor(v4,v2) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v31 = vor(v2,v1) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v0 = vmux(q2,v3,v31) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: jumpr r31 +; CHECK-NEXT: if (q3) vmem(r1+#0) = v0 +; CHECK-NEXT: } + %v0 = load <16 x i32>, ptr %a0, align 128 + %v1 = sitofp <16 x i32> %v0 to <16 x float> + store <16 x float> %v1, ptr %a1, align 128 + ret void +} + + +; u8 -> f16 +; No widening +define void @u8f16_0(ptr %a0, ptr %a1) #0 { +; CHECK-LABEL: u8f16_0: +; CHECK: .cfi_startproc +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: { +; CHECK-NEXT: r2 = ##.LCPI13_0 +; CHECK-NEXT: v1:0.uh = vunpack(v2.ub) +; CHECK-NEXT: v2.cur = vmem(r0+#0) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: r0 = #1 +; CHECK-NEXT: v1 = vmem(r2+#0) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v3.h = vsplat(r0) +; CHECK-NEXT: r7:6 = combine(#31,#5) +; CHECK-NEXT: r4 = #64 +; CHECK-NEXT: v1 = vdelta(v2,v1) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v7.h = vsplat(r4) +; CHECK-NEXT: v6.h = vsplat(r7) +; CHECK-NEXT: v4.uh = vcl0(v0.uh) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: r4 = #10 +; CHECK-NEXT: v19:18.uh = vunpack(v1.ub) +; CHECK-NEXT: v17.h = vadd(v4.h,v3.h) +; CHECK-NEXT: v8 = vxor(v8,v8) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v19.h = vasl(v0.h,v17.h) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: 
v5.uh = vcl0(v18.uh) +; CHECK-NEXT: v9.h = vadd(v19.h,v6.h) +; CHECK-NEXT: v10 = vand(v19,v7) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v11.uh = vlsr(v19.uh,r6) +; CHECK-NEXT: v5.h = vadd(v5.h,v3.h) +; CHECK-NEXT: q0 = vcmp.eq(v10.h,v8.h) +; CHECK-NEXT: q1 = vcmp.gt(v19.uh,v9.uh) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v21.uh = vlsr(v9.uh,r6) +; CHECK-NEXT: v13 = vmux(q1,v3,v8) +; CHECK-NEXT: v22 = vmux(q0,v8,v3) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v20.h = vasl(v18.h,v5.h) +; CHECK-NEXT: v9.h = vadd(v21.h,v22.h) +; CHECK-NEXT: v13.h = vadd(v13.h,v6.h) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v25.uh = vlsr(v21.uh,r0) +; CHECK-NEXT: v12.h = vadd(v20.h,v6.h) +; CHECK-NEXT: v7 = vand(v20,v7) +; CHECK-NEXT: v2.h = vsub(v13.h,v17.h) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v14.uh = vlsr(v20.uh,r6) +; CHECK-NEXT: q3 = vcmp.eq(v7.h,v8.h) +; CHECK-NEXT: q2 = vcmp.gt(v20.uh,v12.uh) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v15.uh = vlsr(v12.uh,r6) +; CHECK-NEXT: v24 = vmux(q3,v8,v3) +; CHECK-NEXT: v3 = vmux(q2,v3,v8) +; CHECK-NEXT: q3 = vcmp.eq(v11.h,v21.h) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v23.uh = vlsr(v9.uh,r0) +; CHECK-NEXT: v3.h = vadd(v3.h,v6.h) +; CHECK-NEXT: v26.h = vadd(v15.h,v24.h) +; CHECK-NEXT: q2 = vcmp.eq(v14.h,v15.h) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v27.uh = vlsr(v15.uh,r0) +; CHECK-NEXT: v3.h = vsub(v3.h,v5.h) +; CHECK-NEXT: v29 = vmux(q3,v23,v25) +; CHECK-NEXT: q3 = vcmp.eq(v18.h,v8.h) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v28.uh = vlsr(v26.uh,r0) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v2.h = vasl(v2.h,r4) +; CHECK-NEXT: v1 = vmux(q2,v28,v27) +; CHECK-NEXT: q2 = vcmp.eq(v0.h,v8.h) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v3.h = vasl(v3.h,r4) +; CHECK-NEXT: v2 = vor(v29,v2) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v30 = vor(v1,v3) +; CHECK-NEXT: v31 = vmux(q2,v8,v2) +; CHECK-NEXT: vmem(r1+#0) = v31.new +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v0 = vmux(q3,v8,v30) +; CHECK-NEXT: jumpr r31 +; CHECK-NEXT: vmem(r1+#1) = v0.new +; CHECK-NEXT: } + %v0 = load <128 x i8>, ptr %a0, align 128 + %v1 = uitofp <128 x i8> %v0 to <128 x half> + store <128 x half> %v1, ptr %a1, align 128 + ret void +} + +; Widen input +define void @u8f16_1(ptr %a0, ptr %a1) #0 { +; CHECK-LABEL: u8f16_1: +; CHECK: .cfi_startproc +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: { +; CHECK-NEXT: v0 = vmem(r0+#0) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: r6 = #1 +; CHECK-NEXT: r3:2 = combine(#64,#31) +; CHECK-NEXT: v1:0.uh = vunpack(v0.ub) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v1.h = vsplat(r6) +; CHECK-NEXT: v4.h = vsplat(r2) +; CHECK-NEXT: r5 = #5 +; CHECK-NEXT: v2 = vxor(v2,v2) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v5.h = vsplat(r3) +; CHECK-NEXT: r4 = #10 +; CHECK-NEXT: v3.uh = vcl0(v0.uh) +; CHECK-NEXT: q3 = vcmp.eq(v0.h,v2.h) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v3.h = vadd(v3.h,v1.h) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v6.h = vasl(v0.h,v3.h) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v7.h = vadd(v6.h,v4.h) +; CHECK-NEXT: v5 = vand(v6,v5) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v6.uh = vlsr(v6.uh,r5) +; CHECK-NEXT: q0 = vcmp.gt(v6.uh,v7.uh) +; CHECK-NEXT: q1 = vcmp.eq(v5.h,v2.h) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v26.uh = vlsr(v7.uh,r5) +; CHECK-NEXT: v27 = vmux(q1,v2,v1) +; CHECK-NEXT: v1 = vmux(q0,v1,v2) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v1.h = vadd(v1.h,v4.h) +; CHECK-NEXT: v28.h = 
vadd(v26.h,v27.h) +; CHECK-NEXT: q2 = vcmp.eq(v6.h,v26.h) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v29.uh = vlsr(v26.uh,r6) +; CHECK-NEXT: v1.h = vsub(v1.h,v3.h) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v30.uh = vlsr(v28.uh,r6) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v1.h = vasl(v1.h,r4) +; CHECK-NEXT: v3 = vmux(q2,v30,v29) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v31 = vor(v3,v1) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v0 = vmux(q3,v2,v31) +; CHECK-NEXT: jumpr r31 +; CHECK-NEXT: vmem(r1+#0) = v0.new +; CHECK-NEXT: } + %v0 = load <64 x i8>, ptr %a0, align 128 + %v1 = uitofp <64 x i8> %v0 to <64 x half> + store <64 x half> %v1, ptr %a1, align 128 + ret void +} + + +; u8 -> f32 +; No widening +define void @u8f32_0(ptr %a0, ptr %a1) #0 { +; CHECK-LABEL: u8f32_0: +; CHECK: .cfi_startproc +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: { +; CHECK-NEXT: r3:2 = combine(##.LCPI15_0,#8) +; CHECK-NEXT: v3:2.uh = vunpack(v1.ub) +; CHECK-NEXT: v1.cur = vmem(r0+#0) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: r0 = #1 +; CHECK-NEXT: r6 = #512 +; CHECK-NEXT: r7 = #255 +; CHECK-NEXT: v3 = vmem(r3+#0) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v1 = vsplat(r0) +; CHECK-NEXT: v16 = vsplat(r6) +; CHECK-NEXT: v3 = vdelta(v1,v3) +; CHECK-NEXT: v0 = vxor(v0,v0) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v10 = vsplat(r7) +; CHECK-NEXT: r5 = #159 +; CHECK-NEXT: v5:4.uw = vunpack(v2.uh) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v19 = vsplat(r5) +; CHECK-NEXT: r4 = #23 +; CHECK-NEXT: v31:30.uh = vunpack(v3.ub) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v6.uw = vcl0(v4.uw) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v3:2.uw = vunpack(v30.uh) +; CHECK-NEXT: v6.w = vadd(v6.w,v1.w) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v7.uw = vcl0(v5.uw) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v11.uw = vcl0(v2.uw) +; CHECK-NEXT: v7.w = vadd(v7.w,v1.w) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v12.uw = vcl0(v3.uw) +; CHECK-NEXT: v11.w = vadd(v11.w,v1.w) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v8.w = vasl(v4.w,v6.w) +; CHECK-NEXT: v12.w = vadd(v12.w,v1.w) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v9.w = vasl(v5.w,v7.w) +; CHECK-NEXT: v20 = vand(v8,v16) +; CHECK-NEXT: v17.w = vadd(v8.w,v10.w) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v18.w = vasl(v2.w,v11.w) +; CHECK-NEXT: v22 = vand(v9,v16) +; CHECK-NEXT: q1 = vcmp.eq(v20.w,v0.w) +; CHECK-NEXT: v13.w = vadd(v9.w,v10.w) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v21.w = vasl(v3.w,v12.w) +; CHECK-NEXT: v28.w = vadd(v18.w,v10.w) +; CHECK-NEXT: q2 = vcmp.eq(v22.w,v0.w) +; CHECK-NEXT: v25 = vand(v18,v16) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v29 = vmux(q1,v0,v1) +; CHECK-NEXT: v24 = vmux(q2,v0,v1) +; CHECK-NEXT: v16 = vand(v21,v16) +; CHECK-NEXT: q1 = vcmp.eq(v25.w,v0.w) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v26.uw = vlsr(v28.uw,r2) +; CHECK-NEXT: v10.w = vadd(v21.w,v10.w) +; CHECK-NEXT: q2 = vcmp.gt(v18.uw,v28.uw) +; CHECK-NEXT: q3 = vcmp.eq(v16.w,v0.w) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v20.uw = vlsr(v18.uw,r2) +; CHECK-NEXT: q0 = vcmp.gt(v9.uw,v13.uw) +; CHECK-NEXT: v18 = vmux(q2,v1,v0) +; CHECK-NEXT: v30 = vmux(q1,v0,v1) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v15.uw = vlsr(v13.uw,r2) +; CHECK-NEXT: q2 = vcmp.gt(v8.uw,v17.uw) +; CHECK-NEXT: v13.w = vadd(v26.w,v30.w) +; CHECK-NEXT: v27 = vmux(q3,v0,v1) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v23.uw = vlsr(v17.uw,r2) +; CHECK-NEXT: v30 = vmux(q0,v1,v0) +; 
CHECK-NEXT: q3 = vcmp.gt(v21.uw,v10.uw) +; CHECK-NEXT: v11.w = vsub(v18.w,v11.w) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v25.uw = vlsr(v10.uw,r2) +; CHECK-NEXT: v7.w = vsub(v30.w,v7.w) +; CHECK-NEXT: v22.w = vadd(v23.w,v29.w) +; CHECK-NEXT: v29.w = vadd(v15.w,v24.w) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v16.uw = vlsr(v21.uw,r2) +; CHECK-NEXT: v21 = vmux(q2,v1,v0) +; CHECK-NEXT: v31.w = vadd(v25.w,v27.w) +; CHECK-NEXT: v1 = vmux(q3,v1,v0) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v14.uw = vlsr(v8.uw,r2) +; CHECK-NEXT: v6.w = vsub(v21.w,v6.w) +; CHECK-NEXT: v7.w = vadd(v7.w,v19.w) +; CHECK-NEXT: v1.w = vsub(v1.w,v12.w) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v9.uw = vlsr(v9.uw,r2) +; CHECK-NEXT: v6.w = vadd(v6.w,v19.w) +; CHECK-NEXT: v11.w = vadd(v11.w,v19.w) +; CHECK-NEXT: v1.w = vadd(v1.w,v19.w) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v18.uw = vlsr(v31.uw,r0) +; CHECK-NEXT: q1 = vcmp.eq(v20.w,v26.w) +; CHECK-NEXT: q0 = vcmp.eq(v16.w,v25.w) +; CHECK-NEXT: q2 = vcmp.eq(v14.w,v23.w) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v27.uw = vlsr(v25.uw,r0) +; CHECK-NEXT: q3 = vcmp.eq(v9.w,v15.w) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v20.uw = vlsr(v22.uw,r0) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v31.uw = vlsr(v23.uw,r0) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v8.uw = vlsr(v29.uw,r0) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v24.uw = vlsr(v15.uw,r0) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v28.uw = vlsr(v26.uw,r0) +; CHECK-NEXT: v26 = vmux(q0,v18,v27) +; CHECK-NEXT: v8 = vmux(q3,v8,v24) +; CHECK-NEXT: v27 = vmux(q2,v20,v31) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v7.w = vasl(v7.w,r4) +; CHECK-NEXT: q2 = vcmp.eq(v5.w,v0.w) +; CHECK-NEXT: q3 = vcmp.eq(v4.w,v0.w) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v13.uw = vlsr(v13.uw,r0) +; CHECK-NEXT: v7 = vor(v8,v7) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v6.w = vasl(v6.w,r4) +; CHECK-NEXT: v25 = vmux(q1,v13,v28) +; CHECK-NEXT: v29 = vmux(q2,v0,v7) +; CHECK-NEXT: vmem(r1+#1) = v29.new +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v1.w = vasl(v1.w,r4) +; CHECK-NEXT: v28 = vor(v27,v6) +; CHECK-NEXT: q2 = vcmp.eq(v3.w,v0.w) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v11.w = vasl(v11.w,r4) +; CHECK-NEXT: v1 = vor(v26,v1) +; CHECK-NEXT: v30 = vmux(q3,v0,v28) +; CHECK-NEXT: vmem(r1+#0) = v30.new +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v31 = vor(v25,v11) +; CHECK-NEXT: q3 = vcmp.eq(v2.w,v0.w) +; CHECK-NEXT: v1 = vmux(q2,v0,v1) +; CHECK-NEXT: vmem(r1+#3) = v1.new +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v0 = vmux(q3,v0,v31) +; CHECK-NEXT: jumpr r31 +; CHECK-NEXT: vmem(r1+#2) = v0.new +; CHECK-NEXT: } + %v0 = load <128 x i8>, ptr %a0, align 128 + %v1 = uitofp <128 x i8> %v0 to <128 x float> + store <128 x float> %v1, ptr %a1, align 128 + ret void +} + +; Widen input #1 +define void @u8f32_1(ptr %a0, ptr %a1) #0 { +; CHECK-LABEL: u8f32_1: +; CHECK: .cfi_startproc +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: { +; CHECK-NEXT: v0 = vmem(r0+#0) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: r7 = #1 +; CHECK-NEXT: r6 = #512 +; CHECK-NEXT: v1:0.uh = vunpack(v0.ub) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v2 = vsplat(r7) +; CHECK-NEXT: v8 = vsplat(r6) +; CHECK-NEXT: r3:2 = combine(##255,#8) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v6 = vsplat(r3) +; CHECK-NEXT: r5 = #159 +; CHECK-NEXT: v1:0.uw = vunpack(v0.uh) +; CHECK-NEXT: v3 = vxor(v3,v3) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v13 = vsplat(r5) +; 
CHECK-NEXT: r4 = #23 +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v4.uw = vcl0(v0.uw) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v5.uw = vcl0(v1.uw) +; CHECK-NEXT: v4.w = vadd(v4.w,v2.w) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v5.w = vadd(v5.w,v2.w) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v7.w = vasl(v0.w,v4.w) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v9.w = vasl(v1.w,v5.w) +; CHECK-NEXT: v11 = vand(v7,v8) +; CHECK-NEXT: v10.w = vadd(v7.w,v6.w) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v6.w = vadd(v9.w,v6.w) +; CHECK-NEXT: q0 = vcmp.eq(v11.w,v3.w) +; CHECK-NEXT: v8 = vand(v9,v8) +; CHECK-NEXT: q1 = vcmp.gt(v7.uw,v10.uw) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v19.uw = vlsr(v10.uw,r2) +; CHECK-NEXT: v21 = vmux(q0,v3,v2) +; CHECK-NEXT: q3 = vcmp.eq(v8.w,v3.w) +; CHECK-NEXT: q0 = vcmp.gt(v9.uw,v6.uw) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v20.uw = vlsr(v6.uw,r2) +; CHECK-NEXT: v22 = vmux(q1,v2,v3) +; CHECK-NEXT: v24 = vmux(q3,v3,v2) +; CHECK-NEXT: v2 = vmux(q0,v2,v3) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v4.w = vsub(v22.w,v4.w) +; CHECK-NEXT: v2.w = vsub(v2.w,v5.w) +; CHECK-NEXT: v10.w = vadd(v19.w,v21.w) +; CHECK-NEXT: v25.w = vadd(v20.w,v24.w) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v12.uw = vlsr(v7.uw,r2) +; CHECK-NEXT: v4.w = vadd(v4.w,v13.w) +; CHECK-NEXT: v2.w = vadd(v2.w,v13.w) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v23.uw = vlsr(v9.uw,r2) +; CHECK-NEXT: q2 = vcmp.eq(v12.w,v19.w) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v11.uw = vlsr(v19.uw,r7) +; CHECK-NEXT: q3 = vcmp.eq(v23.w,v20.w) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v27.uw = vlsr(v10.uw,r7) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v26.uw = vlsr(v20.uw,r7) +; CHECK-NEXT: v5 = vmux(q2,v27,v11) +; CHECK-NEXT: q2 = vcmp.eq(v1.w,v3.w) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v6.uw = vlsr(v25.uw,r7) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v4.w = vasl(v4.w,r4) +; CHECK-NEXT: v6 = vmux(q3,v6,v26) +; CHECK-NEXT: q3 = vcmp.eq(v0.w,v3.w) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v2.w = vasl(v2.w,r4) +; CHECK-NEXT: v29 = vor(v5,v4) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v28 = vor(v6,v2) +; CHECK-NEXT: v31 = vmux(q3,v3,v29) +; CHECK-NEXT: vmem(r1+#0) = v31.new +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v30 = vmux(q2,v3,v28) +; CHECK-NEXT: jumpr r31 +; CHECK-NEXT: vmem(r1+#1) = v30.new +; CHECK-NEXT: } + %v0 = load <64 x i8>, ptr %a0, align 128 + %v1 = uitofp <64 x i8> %v0 to <64 x float> + store <64 x float> %v1, ptr %a1, align 128 + ret void +} + +; Widen input #2 +define void @u8f32_2(ptr %a0, ptr %a1) #0 { +; CHECK-LABEL: u8f32_2: +; CHECK: .cfi_startproc +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: { +; CHECK-NEXT: v0 = vmem(r0+#0) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: r6 = #1 +; CHECK-NEXT: r2 = #255 +; CHECK-NEXT: v1:0.uh = vunpack(v0.ub) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v1 = vsplat(r6) +; CHECK-NEXT: v29 = vsplat(r2) +; CHECK-NEXT: r3 = #512 +; CHECK-NEXT: v2 = vxor(v2,v2) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v3 = vsplat(r3) +; CHECK-NEXT: r5:4 = combine(##159,#8) +; CHECK-NEXT: v5:4.uw = vunpack(v0.uh) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v7 = vsplat(r5) +; CHECK-NEXT: q3 = vcmp.eq(v4.w,v2.w) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v5.uw = vcl0(v4.uw) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v5.w = vadd(v5.w,v1.w) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v6.w = vasl(v4.w,v5.w) +; CHECK-NEXT: } +; CHECK-NEXT: 
{ +; CHECK-NEXT: v0.w = vadd(v6.w,v29.w) +; CHECK-NEXT: v3 = vand(v6,v3) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v6.uw = vlsr(v6.uw,r4) +; CHECK-NEXT: q0 = vcmp.gt(v6.uw,v0.uw) +; CHECK-NEXT: q1 = vcmp.eq(v3.w,v2.w) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: r4 = #23 +; CHECK-NEXT: v0.uw = vlsr(v0.uw,r4) +; CHECK-NEXT: v3 = vmux(q1,v2,v1) +; CHECK-NEXT: v1 = vmux(q0,v1,v2) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v1.w = vsub(v1.w,v5.w) +; CHECK-NEXT: v3.w = vadd(v0.w,v3.w) +; CHECK-NEXT: q2 = vcmp.eq(v6.w,v0.w) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v30.uw = vlsr(v0.uw,r6) +; CHECK-NEXT: v1.w = vadd(v1.w,v7.w) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v31.uw = vlsr(v3.uw,r6) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v1.w = vasl(v1.w,r4) +; CHECK-NEXT: v0 = vmux(q2,v31,v30) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v0 = vor(v0,v1) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v0 = vmux(q3,v2,v0) +; CHECK-NEXT: jumpr r31 +; CHECK-NEXT: vmem(r1+#0) = v0.new +; CHECK-NEXT: } + %v0 = load <32 x i8>, ptr %a0, align 128 + %v1 = uitofp <32 x i8> %v0 to <32 x float> + store <32 x float> %v1, ptr %a1, align 128 + ret void +} + + +; u16 -> f16 +; No widening +define void @u16f16_0(ptr %a0, ptr %a1) #0 { +; CHECK-LABEL: u16f16_0: +; CHECK: .cfi_startproc +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: { +; CHECK-NEXT: r3:2 = combine(#64,#1) +; CHECK-NEXT: r5 = #31 +; CHECK-NEXT: v1.uh = vcl0(v0.uh) +; CHECK-NEXT: v0.cur = vmem(r0+#0) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v2.h = vsplat(r2) +; CHECK-NEXT: v5.h = vsplat(r3) +; CHECK-NEXT: r4 = #5 +; CHECK-NEXT: v3 = vxor(v3,v3) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v4.h = vsplat(r5) +; CHECK-NEXT: r3 = #10 +; CHECK-NEXT: v1.h = vadd(v1.h,v2.h) +; CHECK-NEXT: q3 = vcmp.eq(v0.h,v3.h) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v6.h = vasl(v0.h,v1.h) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v7.h = vadd(v6.h,v4.h) +; CHECK-NEXT: v5 = vand(v6,v5) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v6.uh = vlsr(v6.uh,r4) +; CHECK-NEXT: q0 = vcmp.eq(v5.h,v3.h) +; CHECK-NEXT: q1 = vcmp.gt(v6.uh,v7.uh) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v26.uh = vlsr(v7.uh,r4) +; CHECK-NEXT: v27 = vmux(q0,v3,v2) +; CHECK-NEXT: v2 = vmux(q1,v2,v3) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v2.h = vadd(v2.h,v4.h) +; CHECK-NEXT: v28.h = vadd(v26.h,v27.h) +; CHECK-NEXT: q2 = vcmp.eq(v6.h,v26.h) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v29.uh = vlsr(v26.uh,r2) +; CHECK-NEXT: v1.h = vsub(v2.h,v1.h) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v30.uh = vlsr(v28.uh,r2) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v1.h = vasl(v1.h,r3) +; CHECK-NEXT: v2 = vmux(q2,v30,v29) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v31 = vor(v2,v1) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v0 = vmux(q3,v3,v31) +; CHECK-NEXT: jumpr r31 +; CHECK-NEXT: vmem(r1+#0) = v0.new +; CHECK-NEXT: } + %v0 = load <64 x i16>, ptr %a0, align 128 + %v1 = uitofp <64 x i16> %v0 to <64 x half> + store <64 x half> %v1, ptr %a1, align 128 + ret void +} + +; Widen input and result +define void @u16f16_1(ptr %a0, ptr %a1) #0 { +; CHECK-LABEL: u16f16_1: +; CHECK: .cfi_startproc +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: { +; CHECK-NEXT: r3:2 = combine(#31,#1) +; CHECK-NEXT: r6 = #64 +; CHECK-NEXT: v1.uh = vcl0(v0.uh) +; CHECK-NEXT: v0.cur = vmem(r0+#0) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v2.h = vsplat(r2) +; CHECK-NEXT: v4.h = vsplat(r3) +; CHECK-NEXT: r5 = #5 +; CHECK-NEXT: v3 = vxor(v3,v3) +; 
CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v5.h = vsplat(r6) +; CHECK-NEXT: r4 = #10 +; CHECK-NEXT: v1.h = vadd(v1.h,v2.h) +; CHECK-NEXT: q2 = vcmp.eq(v0.h,v3.h) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: q3 = vsetq(r6) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v6.h = vasl(v0.h,v1.h) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v7.h = vadd(v6.h,v4.h) +; CHECK-NEXT: v5 = vand(v6,v5) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v6.uh = vlsr(v6.uh,r5) +; CHECK-NEXT: q1 = vcmp.eq(v5.h,v3.h) +; CHECK-NEXT: q0 = vcmp.gt(v6.uh,v7.uh) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v7.uh = vlsr(v7.uh,r5) +; CHECK-NEXT: v5 = vmux(q1,v3,v2) +; CHECK-NEXT: v2 = vmux(q0,v2,v3) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v2.h = vadd(v2.h,v4.h) +; CHECK-NEXT: v28.h = vadd(v7.h,v5.h) +; CHECK-NEXT: q1 = vcmp.eq(v6.h,v7.h) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v29.uh = vlsr(v7.uh,r2) +; CHECK-NEXT: v1.h = vsub(v2.h,v1.h) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v30.uh = vlsr(v28.uh,r2) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v1.h = vasl(v1.h,r4) +; CHECK-NEXT: v2 = vmux(q1,v30,v29) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v31 = vor(v2,v1) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v0 = vmux(q2,v3,v31) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: jumpr r31 +; CHECK-NEXT: if (q3) vmem(r1+#0) = v0 +; CHECK-NEXT: } + %v0 = load <32 x i16>, ptr %a0, align 128 + %v1 = uitofp <32 x i16> %v0 to <32 x half> + store <32 x half> %v1, ptr %a1, align 128 + ret void +} + + +; u16 -> f32 +; No widening +define void @u16f32_0(ptr %a0, ptr %a1) #0 { +; CHECK-LABEL: u16f32_0: +; CHECK: .cfi_startproc +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: { +; CHECK-NEXT: r2 = ##.LCPI20_0 +; CHECK-NEXT: v1:0.uw = vunpack(v2.uh) +; CHECK-NEXT: v2.cur = vmem(r0+#0) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: r0 = #1 +; CHECK-NEXT: v1 = vmem(r2+#0) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v3 = vsplat(r0) +; CHECK-NEXT: r7:6 = combine(##255,#8) +; CHECK-NEXT: r4 = #512 +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v7 = vsplat(r4) +; CHECK-NEXT: v6 = vsplat(r7) +; CHECK-NEXT: v1 = vdelta(v2,v1) +; CHECK-NEXT: v4.uw = vcl0(v0.uw) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: r5 = #159 +; CHECK-NEXT: r4 = #23 +; CHECK-NEXT: v17.w = vadd(v4.w,v3.w) +; CHECK-NEXT: v8 = vxor(v8,v8) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v14 = vsplat(r5) +; CHECK-NEXT: v19:18.uw = vunpack(v1.uh) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v19.w = vasl(v0.w,v17.w) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v5.uw = vcl0(v18.uw) +; CHECK-NEXT: v9.w = vadd(v19.w,v6.w) +; CHECK-NEXT: v10 = vand(v19,v7) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v11.uw = vlsr(v19.uw,r6) +; CHECK-NEXT: v5.w = vadd(v5.w,v3.w) +; CHECK-NEXT: q0 = vcmp.eq(v10.w,v8.w) +; CHECK-NEXT: q1 = vcmp.gt(v19.uw,v9.uw) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v21.uw = vlsr(v9.uw,r6) +; CHECK-NEXT: v22 = vmux(q0,v8,v3) +; CHECK-NEXT: v12 = vmux(q1,v3,v8) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v20.w = vasl(v18.w,v5.w) +; CHECK-NEXT: v2.w = vsub(v12.w,v17.w) +; CHECK-NEXT: v9.w = vadd(v21.w,v22.w) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v25.uw = vlsr(v21.uw,r0) +; CHECK-NEXT: v6.w = vadd(v20.w,v6.w) +; CHECK-NEXT: v7 = vand(v20,v7) +; CHECK-NEXT: v2.w = vadd(v2.w,v14.w) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v13.uw = vlsr(v20.uw,r6) +; CHECK-NEXT: q3 = vcmp.eq(v7.w,v8.w) +; CHECK-NEXT: q2 = vcmp.gt(v20.uw,v6.uw) +; CHECK-NEXT: } +; CHECK-NEXT: { +; 
CHECK-NEXT: v23.uw = vlsr(v6.uw,r6) +; CHECK-NEXT: v7 = vmux(q3,v8,v3) +; CHECK-NEXT: v3 = vmux(q2,v3,v8) +; CHECK-NEXT: q3 = vcmp.eq(v11.w,v21.w) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v24.uw = vlsr(v9.uw,r0) +; CHECK-NEXT: v3.w = vsub(v3.w,v5.w) +; CHECK-NEXT: v26.w = vadd(v23.w,v7.w) +; CHECK-NEXT: q2 = vcmp.eq(v13.w,v23.w) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v27.uw = vlsr(v23.uw,r0) +; CHECK-NEXT: v3.w = vadd(v3.w,v14.w) +; CHECK-NEXT: v29 = vmux(q3,v24,v25) +; CHECK-NEXT: q3 = vcmp.eq(v18.w,v8.w) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v28.uw = vlsr(v26.uw,r0) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v2.w = vasl(v2.w,r4) +; CHECK-NEXT: v1 = vmux(q2,v28,v27) +; CHECK-NEXT: q2 = vcmp.eq(v0.w,v8.w) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v3.w = vasl(v3.w,r4) +; CHECK-NEXT: v2 = vor(v29,v2) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v30 = vor(v1,v3) +; CHECK-NEXT: v31 = vmux(q2,v8,v2) +; CHECK-NEXT: vmem(r1+#0) = v31.new +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v0 = vmux(q3,v8,v30) +; CHECK-NEXT: jumpr r31 +; CHECK-NEXT: vmem(r1+#1) = v0.new +; CHECK-NEXT: } + %v0 = load <64 x i16>, ptr %a0, align 128 + %v1 = uitofp <64 x i16> %v0 to <64 x float> + store <64 x float> %v1, ptr %a1, align 128 + ret void +} + +; Widen input +define void @u16f32_1(ptr %a0, ptr %a1) #0 { +; CHECK-LABEL: u16f32_1: +; CHECK: .cfi_startproc +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: { +; CHECK-NEXT: v0 = vmem(r0+#0) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: r6 = #1 +; CHECK-NEXT: r2 = #255 +; CHECK-NEXT: v1:0.uw = vunpack(v0.uh) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v1 = vsplat(r6) +; CHECK-NEXT: v4 = vsplat(r2) +; CHECK-NEXT: r3 = #512 +; CHECK-NEXT: v2 = vxor(v2,v2) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v5 = vsplat(r3) +; CHECK-NEXT: r5:4 = combine(##159,#8) +; CHECK-NEXT: v3.uw = vcl0(v0.uw) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v7 = vsplat(r5) +; CHECK-NEXT: q3 = vcmp.eq(v0.w,v2.w) +; CHECK-NEXT: v3.w = vadd(v3.w,v1.w) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v6.w = vasl(v0.w,v3.w) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v4.w = vadd(v6.w,v4.w) +; CHECK-NEXT: v5 = vand(v6,v5) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v6.uw = vlsr(v6.uw,r4) +; CHECK-NEXT: q0 = vcmp.gt(v6.uw,v4.uw) +; CHECK-NEXT: q1 = vcmp.eq(v5.w,v2.w) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: r4 = #23 +; CHECK-NEXT: v4.uw = vlsr(v4.uw,r4) +; CHECK-NEXT: v5 = vmux(q1,v2,v1) +; CHECK-NEXT: v1 = vmux(q0,v1,v2) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v1.w = vsub(v1.w,v3.w) +; CHECK-NEXT: v29.w = vadd(v4.w,v5.w) +; CHECK-NEXT: q2 = vcmp.eq(v6.w,v4.w) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v30.uw = vlsr(v4.uw,r6) +; CHECK-NEXT: v1.w = vadd(v1.w,v7.w) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v3.uw = vlsr(v29.uw,r6) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v1.w = vasl(v1.w,r4) +; CHECK-NEXT: v3 = vmux(q2,v3,v30) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v31 = vor(v3,v1) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v0 = vmux(q3,v2,v31) +; CHECK-NEXT: jumpr r31 +; CHECK-NEXT: vmem(r1+#0) = v0.new +; CHECK-NEXT: } + %v0 = load <32 x i16>, ptr %a0, align 128 + %v1 = uitofp <32 x i16> %v0 to <32 x float> + store <32 x float> %v1, ptr %a1, align 128 + ret void +} + + +; u32 -> f16 +; No widening +define void @u32f16_0(ptr %a0, ptr %a1) #0 { +; CHECK-LABEL: u32f16_0: +; CHECK: .cfi_startproc +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: { +; CHECK-NEXT: r3:2 = combine(#8,#1) +; CHECK-NEXT: 
r6 = #255 +; CHECK-NEXT: v1.uw = vcl0(v0.uw) +; CHECK-NEXT: v0.cur = vmem(r0+#0) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v4 = vsplat(r2) +; CHECK-NEXT: r4 = #512 +; CHECK-NEXT: v3.uw = vcl0(v2.uw) +; CHECK-NEXT: v2.cur = vmem(r0+#1) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v7 = vsplat(r4) +; CHECK-NEXT: v6 = vsplat(r6) +; CHECK-NEXT: v1.w = vadd(v1.w,v4.w) +; CHECK-NEXT: v3.w = vadd(v3.w,v4.w) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: r4 = #159 +; CHECK-NEXT: v9 = vxor(v9,v9) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v10 = vsplat(r4) +; CHECK-NEXT: v5.w = vasl(v0.w,v1.w) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v8.w = vasl(v2.w,v3.w) +; CHECK-NEXT: v11.w = vadd(v5.w,v6.w) +; CHECK-NEXT: v13 = vand(v5,v7) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v6.w = vadd(v8.w,v6.w) +; CHECK-NEXT: v7 = vand(v8,v7) +; CHECK-NEXT: q0 = vcmp.gt(v5.uw,v11.uw) +; CHECK-NEXT: q1 = vcmp.eq(v13.w,v9.w) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v27.uw = vlsr(v11.uw,r3) +; CHECK-NEXT: q3 = vcmp.gt(v8.uw,v6.uw) +; CHECK-NEXT: q2 = vcmp.eq(v7.w,v9.w) +; CHECK-NEXT: v29 = vmux(q0,v4,v9) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v6.uw = vlsr(v6.uw,r3) +; CHECK-NEXT: v28 = vmux(q1,v9,v4) +; CHECK-NEXT: v30 = vmux(q3,v4,v9) +; CHECK-NEXT: v4 = vmux(q2,v9,v4) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v1.w = vsub(v29.w,v1.w) +; CHECK-NEXT: v7.w = vadd(v27.w,v28.w) +; CHECK-NEXT: v3.w = vsub(v30.w,v3.w) +; CHECK-NEXT: v4.w = vadd(v6.w,v4.w) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v12.uw = vlsr(v5.uw,r3) +; CHECK-NEXT: v1.w = vadd(v1.w,v10.w) +; CHECK-NEXT: v3.w = vadd(v3.w,v10.w) +; CHECK-NEXT: q2 = vcmp.eq(v0.w,v9.w) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: r3 = #23 +; CHECK-NEXT: v14.uw = vlsr(v8.uw,r3) +; CHECK-NEXT: q3 = vcmp.eq(v12.w,v27.w) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v5.uw = vlsr(v27.uw,r2) +; CHECK-NEXT: q1 = vcmp.eq(v14.w,v6.w) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v7.uw = vlsr(v7.uw,r2) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v4.uw = vlsr(v4.uw,r2) +; CHECK-NEXT: v5 = vmux(q3,v7,v5) +; CHECK-NEXT: q3 = vcmp.eq(v2.w,v9.w) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v6.uw = vlsr(v6.uw,r2) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v1.w = vasl(v1.w,r3) +; CHECK-NEXT: v31 = vmux(q1,v4,v6) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v3.w = vasl(v3.w,r3) +; CHECK-NEXT: v1 = vor(v5,v1) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v0 = vor(v31,v3) +; CHECK-NEXT: v1 = vmux(q2,v9,v1) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v0 = vmux(q3,v9,v0) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v2.qf32 = vadd(v1.sf,v9.sf) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v3.qf32 = vadd(v0.sf,v9.sf) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v0.hf = v3:2.qf32 +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v0.h = vdeal(v0.h) +; CHECK-NEXT: jumpr r31 +; CHECK-NEXT: vmem(r1+#0) = v0.new +; CHECK-NEXT: } + %v0 = load <64 x i32>, ptr %a0, align 128 + %v1 = uitofp <64 x i32> %v0 to <64 x half> + store <64 x half> %v1, ptr %a1, align 128 + ret void +} + +; Widen result +define void @u32f16_1(ptr %a0, ptr %a1) #0 { +; CHECK-LABEL: u32f16_1: +; CHECK: .cfi_startproc +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: { +; CHECK-NEXT: r3:2 = combine(##512,#1) +; CHECK-NEXT: v1.uw = vcl0(v0.uw) +; CHECK-NEXT: v0.cur = vmem(r0+#0) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v3 = vsplat(r2) +; CHECK-NEXT: v5 = vsplat(r3) +; CHECK-NEXT: r6 = #255 +; CHECK-NEXT: v2 = vxor(v2,v2) +; 
CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v4 = vsplat(r6) +; CHECK-NEXT: r5 = #8 +; CHECK-NEXT: r4 = #159 +; CHECK-NEXT: v1.w = vadd(v1.w,v3.w) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v7 = vsplat(r4) +; CHECK-NEXT: r3 = #23 +; CHECK-NEXT: q2 = vcmp.eq(v0.w,v2.w) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v6.w = vasl(v0.w,v1.w) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v4.w = vadd(v6.w,v4.w) +; CHECK-NEXT: v5 = vand(v6,v5) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v6.uw = vlsr(v6.uw,r5) +; CHECK-NEXT: q0 = vcmp.eq(v5.w,v2.w) +; CHECK-NEXT: q1 = vcmp.gt(v6.uw,v4.uw) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v4.uw = vlsr(v4.uw,r5) +; CHECK-NEXT: v5 = vmux(q0,v2,v3) +; CHECK-NEXT: v3 = vmux(q1,v3,v2) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v1.w = vsub(v3.w,v1.w) +; CHECK-NEXT: v30.w = vadd(v4.w,v5.w) +; CHECK-NEXT: q1 = vcmp.eq(v6.w,v4.w) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v31.uw = vlsr(v4.uw,r2) +; CHECK-NEXT: v1.w = vadd(v1.w,v7.w) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: r2 = #64 +; CHECK-NEXT: v3.uw = vlsr(v30.uw,r2) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v1.w = vasl(v1.w,r3) +; CHECK-NEXT: q3 = vsetq(r2) +; CHECK-NEXT: v3 = vmux(q1,v3,v31) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v1.qf32 = vadd(v2.sf,v2.sf) +; CHECK-NEXT: v0 = vor(v3,v1) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v0 = vmux(q2,v2,v0) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v0.qf32 = vadd(v0.sf,v2.sf) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v0.hf = v1:0.qf32 +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v0.h = vdeal(v0.h) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: jumpr r31 +; CHECK-NEXT: if (q3) vmem(r1+#0) = v0 +; CHECK-NEXT: } + %v0 = load <32 x i32>, ptr %a0, align 128 + %v1 = uitofp <32 x i32> %v0 to <32 x half> + store <32 x half> %v1, ptr %a1, align 128 + ret void +} + +; u32 -> f32 +; No widening +define void @u32f32_0(ptr %a0, ptr %a1) #0 { +; CHECK-LABEL: u32f32_0: +; CHECK: .cfi_startproc +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: { +; CHECK-NEXT: r3:2 = combine(##512,#1) +; CHECK-NEXT: v1.uw = vcl0(v0.uw) +; CHECK-NEXT: v0.cur = vmem(r0+#0) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v2 = vsplat(r2) +; CHECK-NEXT: v5 = vsplat(r3) +; CHECK-NEXT: r6 = #255 +; CHECK-NEXT: v3 = vxor(v3,v3) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v4 = vsplat(r6) +; CHECK-NEXT: r5 = #8 +; CHECK-NEXT: r4 = #159 +; CHECK-NEXT: v1.w = vadd(v1.w,v2.w) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v7 = vsplat(r4) +; CHECK-NEXT: r3 = #23 +; CHECK-NEXT: q3 = vcmp.eq(v0.w,v3.w) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v6.w = vasl(v0.w,v1.w) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v4.w = vadd(v6.w,v4.w) +; CHECK-NEXT: v5 = vand(v6,v5) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v6.uw = vlsr(v6.uw,r5) +; CHECK-NEXT: q0 = vcmp.eq(v5.w,v3.w) +; CHECK-NEXT: q1 = vcmp.gt(v6.uw,v4.uw) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v4.uw = vlsr(v4.uw,r5) +; CHECK-NEXT: v5 = vmux(q0,v3,v2) +; CHECK-NEXT: v2 = vmux(q1,v2,v3) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v1.w = vsub(v2.w,v1.w) +; CHECK-NEXT: v29.w = vadd(v4.w,v5.w) +; CHECK-NEXT: q2 = vcmp.eq(v6.w,v4.w) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v30.uw = vlsr(v4.uw,r2) +; CHECK-NEXT: v1.w = vadd(v1.w,v7.w) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v2.uw = vlsr(v29.uw,r2) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v1.w = vasl(v1.w,r3) +; CHECK-NEXT: v2 = vmux(q2,v2,v30) +; CHECK-NEXT: } +; CHECK-NEXT: { +; 
CHECK-NEXT: v31 = vor(v2,v1) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v0 = vmux(q3,v3,v31) +; CHECK-NEXT: jumpr r31 +; CHECK-NEXT: vmem(r1+#0) = v0.new +; CHECK-NEXT: } + %v0 = load <32 x i32>, ptr %a0, align 128 + %v1 = uitofp <32 x i32> %v0 to <32 x float> + store <32 x float> %v1, ptr %a1, align 128 + ret void +} + +; Widen input and result +define void @u32f32_1(ptr %a0, ptr %a1) #0 { +; CHECK-LABEL: u32f32_1: +; CHECK: .cfi_startproc +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: { +; CHECK-NEXT: r3:2 = combine(##512,#1) +; CHECK-NEXT: v1.uw = vcl0(v0.uw) +; CHECK-NEXT: v0.cur = vmem(r0+#0) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v2 = vsplat(r2) +; CHECK-NEXT: v5 = vsplat(r3) +; CHECK-NEXT: r6 = #255 +; CHECK-NEXT: v3 = vxor(v3,v3) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v4 = vsplat(r6) +; CHECK-NEXT: r5 = #8 +; CHECK-NEXT: r4 = #159 +; CHECK-NEXT: v1.w = vadd(v1.w,v2.w) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v7 = vsplat(r4) +; CHECK-NEXT: r3 = #23 +; CHECK-NEXT: q2 = vcmp.eq(v0.w,v3.w) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v6.w = vasl(v0.w,v1.w) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v4.w = vadd(v6.w,v4.w) +; CHECK-NEXT: v5 = vand(v6,v5) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v6.uw = vlsr(v6.uw,r5) +; CHECK-NEXT: q0 = vcmp.eq(v5.w,v3.w) +; CHECK-NEXT: q1 = vcmp.gt(v6.uw,v4.uw) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v4.uw = vlsr(v4.uw,r5) +; CHECK-NEXT: v5 = vmux(q0,v3,v2) +; CHECK-NEXT: v2 = vmux(q1,v2,v3) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v1.w = vsub(v2.w,v1.w) +; CHECK-NEXT: v29.w = vadd(v4.w,v5.w) +; CHECK-NEXT: q1 = vcmp.eq(v6.w,v4.w) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v30.uw = vlsr(v4.uw,r2) +; CHECK-NEXT: v1.w = vadd(v1.w,v7.w) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: r2 = #64 +; CHECK-NEXT: v2.uw = vlsr(v29.uw,r2) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v1.w = vasl(v1.w,r3) +; CHECK-NEXT: q3 = vsetq(r2) +; CHECK-NEXT: v2 = vmux(q1,v2,v30) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v31 = vor(v2,v1) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v0 = vmux(q2,v3,v31) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: jumpr r31 +; CHECK-NEXT: if (q3) vmem(r1+#0) = v0 +; CHECK-NEXT: } + %v0 = load <16 x i32>, ptr %a0, align 128 + %v1 = uitofp <16 x i32> %v0 to <16 x float> + store <16 x float> %v1, ptr %a1, align 128 + ret void +} + + +attributes #0 = { "target-features"="+v68,+hvxv68,+hvx-length128b,+hvx-qfloat" } + diff --git a/llvm/test/CodeGen/Hexagon/vector-sint-to-fp.ll b/llvm/test/CodeGen/Hexagon/vector-sint-to-fp.ll index 726ee8f..699d621 100644 --- a/llvm/test/CodeGen/Hexagon/vector-sint-to-fp.ll +++ b/llvm/test/CodeGen/Hexagon/vector-sint-to-fp.ll @@ -1,12 +1,10 @@ ; RUN: llc -march=hexagon < %s | FileCheck %s -; Test that code is generated for the vector sint_to_fp node. The compiler -; asserts with a cannot select message if the node is not expanded. When -; expanded, the generated code is very inefficient, so iwe need to find a more -; efficient code sequence to generate. +; Test that code is generated for the vector sint_to_fp node. -; CHECK: convert_w2sf +; The floor builtin is still scalarized. ; CHECK: call floorf +; CHECK: vmem target triple = "hexagon" -- 2.7.4