From 0daf9b8e41327b1511b2bbc272184ff4fdb8de79 Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Tue, 11 Feb 2020 20:24:51 -0800 Subject: [PATCH] [X86][LegalizeTypes] Add SoftPromoteHalf support for STRICT_FP_EXTEND and STRICT_FP_ROUND This adds a strict version of FP16_TO_FP and FP_TO_FP16 and uses them to implement soft promotion for the half type. This is enough to provide basic support for __fp16 with strictfp. Add the necessary X86 support to use VCVTPS2PH/VCVTPH2PS when F16C is enabled. --- llvm/include/llvm/CodeGen/ISDOpcodes.h | 1 + llvm/include/llvm/CodeGen/SelectionDAGNodes.h | 2 + llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp | 40 ++++++++++ .../CodeGen/SelectionDAG/LegalizeFloatTypes.cpp | 23 +++++- .../CodeGen/SelectionDAG/SelectionDAGDumper.cpp | 2 + llvm/lib/Target/X86/X86ISelLowering.cpp | 91 +++++++++++++++++----- llvm/lib/Target/X86/X86ISelLowering.h | 3 + llvm/lib/Target/X86/X86InstrAVX512.td | 23 +++--- llvm/lib/Target/X86/X86InstrFragmentsSIMD.td | 32 +++++--- llvm/lib/Target/X86/X86InstrSSE.td | 16 ++-- 10 files changed, 180 insertions(+), 53 deletions(-) diff --git a/llvm/include/llvm/CodeGen/ISDOpcodes.h b/llvm/include/llvm/CodeGen/ISDOpcodes.h index b5a434b..afeaf5e 100644 --- a/llvm/include/llvm/CodeGen/ISDOpcodes.h +++ b/llvm/include/llvm/CodeGen/ISDOpcodes.h @@ -633,6 +633,7 @@ namespace ISD { /// form a semi-softened interface for dealing with f16 (as an i16), which /// is often a storage-only type but has native conversions. FP16_TO_FP, FP_TO_FP16, + STRICT_FP16_TO_FP, STRICT_FP_TO_FP16, /// Perform various unary floating-point operations inspired by libm. 
For /// FPOWI, the result is undefined if if the integer operand doesn't fit diff --git a/llvm/include/llvm/CodeGen/SelectionDAGNodes.h b/llvm/include/llvm/CodeGen/SelectionDAGNodes.h index 28a4653..4ef7b17 100644 --- a/llvm/include/llvm/CodeGen/SelectionDAGNodes.h +++ b/llvm/include/llvm/CodeGen/SelectionDAGNodes.h @@ -701,6 +701,8 @@ public: switch (NodeType) { default: return false; + case ISD::STRICT_FP16_TO_FP: + case ISD::STRICT_FP_TO_FP16: #define DAG_INSTRUCTION(NAME, NARG, ROUND_MODE, INTRINSIC, DAGN) \ case ISD::STRICT_##DAGN: #include "llvm/IR/ConstrainedOps.def" diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp index d03f765..6fd048a 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp @@ -1009,6 +1009,7 @@ void SelectionDAGLegalize::LegalizeOp(SDNode *Node) { Action = TLI.getOperationAction(Node->getOpcode(), Node->getOperand(0).getValueType()); break; + case ISD::STRICT_FP_TO_FP16: case ISD::STRICT_SINT_TO_FP: case ISD::STRICT_UINT_TO_FP: case ISD::STRICT_LRINT: @@ -3272,6 +3273,21 @@ bool SelectionDAGLegalize::ExpandNode(SDNode *Node) { DAG.getNode(ISD::FP_EXTEND, dl, Node->getValueType(0), Res)); } break; + case ISD::STRICT_FP16_TO_FP: + if (Node->getValueType(0) != MVT::f32) { + // We can extend to types bigger than f32 in two steps without changing + // the result. Since "f16 -> f32" is much more commonly available, give + // CodeGen the option of emitting that before resorting to a libcall. 
+ SDValue Res = + DAG.getNode(ISD::STRICT_FP16_TO_FP, dl, {MVT::f32, MVT::Other}, + {Node->getOperand(0), Node->getOperand(1)}); + Res = DAG.getNode(ISD::STRICT_FP_EXTEND, dl, + {Node->getValueType(0), MVT::Other}, + {Res.getValue(1), Res}); + Results.push_back(Res); + Results.push_back(Res.getValue(1)); + } + break; case ISD::FP_TO_FP16: LLVM_DEBUG(dbgs() << "Legalizing FP_TO_FP16\n"); if (!TLI.useSoftFloat() && TM.Options.UnsafeFPMath) { @@ -4234,6 +4250,17 @@ void SelectionDAGLegalize::ConvertNodeToLibcall(SDNode *Node) { Results.push_back(ExpandLibCall(RTLIB::FPEXT_F16_F32, Node, false)); } break; + case ISD::STRICT_FP16_TO_FP: { + if (Node->getValueType(0) == MVT::f32) { + TargetLowering::MakeLibCallOptions CallOptions; + std::pair Tmp = TLI.makeLibCall( + DAG, RTLIB::FPEXT_F16_F32, MVT::f32, Node->getOperand(1), CallOptions, + SDLoc(Node), Node->getOperand(0)); + Results.push_back(Tmp.first); + Results.push_back(Tmp.second); + } + break; + } case ISD::FP_TO_FP16: { RTLIB::Libcall LC = RTLIB::getFPROUND(Node->getOperand(0).getValueType(), MVT::f16); @@ -4241,6 +4268,19 @@ void SelectionDAGLegalize::ConvertNodeToLibcall(SDNode *Node) { Results.push_back(ExpandLibCall(LC, Node, false)); break; } + case ISD::STRICT_FP_TO_FP16: { + RTLIB::Libcall LC = + RTLIB::getFPROUND(Node->getOperand(1).getValueType(), MVT::f16); + assert(LC != RTLIB::UNKNOWN_LIBCALL && + "Unable to expand strict_fp_to_fp16"); + TargetLowering::MakeLibCallOptions CallOptions; + std::pair Tmp = + TLI.makeLibCall(DAG, LC, Node->getValueType(0), Node->getOperand(1), + CallOptions, SDLoc(Node), Node->getOperand(0)); + Results.push_back(Tmp.first); + Results.push_back(Tmp.second); + break; + } case ISD::FSUB: case ISD::STRICT_FSUB: ExpandFPLibCall(Node, RTLIB::SUB_F32, RTLIB::SUB_F64, diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp index 12fef02..428dc83 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp +++ 
b/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp @@ -2440,6 +2440,7 @@ void DAGTypeLegalizer::SoftPromoteHalfResult(SDNode *N, unsigned ResNo) { case ISD::EXTRACT_VECTOR_ELT: R = SoftPromoteHalfRes_EXTRACT_VECTOR_ELT(N); break; case ISD::FCOPYSIGN: R = SoftPromoteHalfRes_FCOPYSIGN(N); break; + case ISD::STRICT_FP_ROUND: case ISD::FP_ROUND: R = SoftPromoteHalfRes_FP_ROUND(N); break; // Unary FP Operations @@ -2592,6 +2593,14 @@ SDValue DAGTypeLegalizer::SoftPromoteHalfRes_FPOWI(SDNode *N) { } SDValue DAGTypeLegalizer::SoftPromoteHalfRes_FP_ROUND(SDNode *N) { + if (N->isStrictFPOpcode()) { + SDValue Res = + DAG.getNode(ISD::STRICT_FP_TO_FP16, SDLoc(N), {MVT::i16, MVT::Other}, + {N->getOperand(0), N->getOperand(1)}); + ReplaceValueWith(SDValue(N, 1), Res.getValue(1)); + return Res; + } + return DAG.getNode(ISD::FP_TO_FP16, SDLoc(N), MVT::i16, N->getOperand(0)); } @@ -2701,6 +2710,7 @@ bool DAGTypeLegalizer::SoftPromoteHalfOperand(SDNode *N, unsigned OpNo) { case ISD::FCOPYSIGN: Res = SoftPromoteHalfOp_FCOPYSIGN(N, OpNo); break; case ISD::FP_TO_SINT: case ISD::FP_TO_UINT: Res = SoftPromoteHalfOp_FP_TO_XINT(N); break; + case ISD::STRICT_FP_EXTEND: case ISD::FP_EXTEND: Res = SoftPromoteHalfOp_FP_EXTEND(N); break; case ISD::SELECT_CC: Res = SoftPromoteHalfOp_SELECT_CC(N, OpNo); break; case ISD::SETCC: Res = SoftPromoteHalfOp_SETCC(N); break; @@ -2741,7 +2751,18 @@ SDValue DAGTypeLegalizer::SoftPromoteHalfOp_FCOPYSIGN(SDNode *N, } SDValue DAGTypeLegalizer::SoftPromoteHalfOp_FP_EXTEND(SDNode *N) { - SDValue Op = GetSoftPromotedHalf(N->getOperand(0)); + bool IsStrict = N->isStrictFPOpcode(); + SDValue Op = GetSoftPromotedHalf(N->getOperand(IsStrict ? 
1 : 0)); + + if (IsStrict) { + SDValue Res = + DAG.getNode(ISD::STRICT_FP16_TO_FP, SDLoc(N), + {N->getValueType(0), MVT::Other}, {N->getOperand(0), Op}); + ReplaceValueWith(SDValue(N, 1), Res.getValue(1)); + ReplaceValueWith(SDValue(N, 0), Res); + return SDValue(); + } + return DAG.getNode(ISD::FP16_TO_FP, SDLoc(N), N->getValueType(0), Op); } diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp index 8736ee2..f57852c 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp @@ -342,7 +342,9 @@ std::string SDNode::getOperationName(const SelectionDAG *G) const { case ISD::BITCAST: return "bitcast"; case ISD::ADDRSPACECAST: return "addrspacecast"; case ISD::FP16_TO_FP: return "fp16_to_fp"; + case ISD::STRICT_FP16_TO_FP: return "strict_fp16_to_fp"; case ISD::FP_TO_FP16: return "fp_to_fp16"; + case ISD::STRICT_FP_TO_FP16: return "strict_fp_to_fp16"; case ISD::LROUND: return "lround"; case ISD::STRICT_LROUND: return "strict_lround"; case ISD::LLROUND: return "llround"; diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index 4321bb7..8cd177a 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -374,20 +374,30 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, // If we don't have F16C support, then lower half float conversions // into library calls. 
if (!Subtarget.useSoftFloat() && Subtarget.hasF16C()) { - setOperationAction(ISD::FP16_TO_FP, MVT::f32, Custom); - setOperationAction(ISD::FP_TO_FP16, MVT::f32, Custom); + setOperationAction(ISD::FP16_TO_FP, MVT::f32, Custom); + setOperationAction(ISD::STRICT_FP16_TO_FP, MVT::f32, Custom); + setOperationAction(ISD::FP_TO_FP16, MVT::f32, Custom); + setOperationAction(ISD::STRICT_FP_TO_FP16, MVT::f32, Custom); } else { - setOperationAction(ISD::FP16_TO_FP, MVT::f32, Expand); - setOperationAction(ISD::FP_TO_FP16, MVT::f32, Expand); + setOperationAction(ISD::FP16_TO_FP, MVT::f32, Expand); + setOperationAction(ISD::STRICT_FP16_TO_FP, MVT::f32, Expand); + setOperationAction(ISD::FP_TO_FP16, MVT::f32, Expand); + setOperationAction(ISD::STRICT_FP_TO_FP16, MVT::f32, Expand); } // There's never any support for operations beyond MVT::f32. setOperationAction(ISD::FP16_TO_FP, MVT::f64, Expand); setOperationAction(ISD::FP16_TO_FP, MVT::f80, Expand); setOperationAction(ISD::FP16_TO_FP, MVT::f128, Expand); + setOperationAction(ISD::STRICT_FP16_TO_FP, MVT::f64, Expand); + setOperationAction(ISD::STRICT_FP16_TO_FP, MVT::f80, Expand); + setOperationAction(ISD::STRICT_FP16_TO_FP, MVT::f128, Expand); setOperationAction(ISD::FP_TO_FP16, MVT::f64, Expand); setOperationAction(ISD::FP_TO_FP16, MVT::f80, Expand); setOperationAction(ISD::FP_TO_FP16, MVT::f128, Expand); + setOperationAction(ISD::STRICT_FP_TO_FP16, MVT::f64, Expand); + setOperationAction(ISD::STRICT_FP_TO_FP16, MVT::f80, Expand); + setOperationAction(ISD::STRICT_FP_TO_FP16, MVT::f128, Expand); setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::f16, Expand); setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f16, Expand); @@ -20553,29 +20563,64 @@ SDValue X86TargetLowering::LowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const { } static SDValue LowerFP16_TO_FP(SDValue Op, SelectionDAG &DAG) { - assert(Op.getOperand(0).getValueType() == MVT::i16 && - Op.getValueType() == MVT::f32 && "Unexpected VT!"); + bool IsStrict = 
Op->isStrictFPOpcode(); + SDValue Src = Op.getOperand(IsStrict ? 1 : 0); + assert(Src.getValueType() == MVT::i16 && Op.getValueType() == MVT::f32 && + "Unexpected VT!"); SDLoc dl(Op); SDValue Res = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, - DAG.getConstant(0, dl, MVT::v8i16), - Op.getOperand(0), DAG.getIntPtrConstant(0, dl)); - Res = DAG.getNode(X86ISD::CVTPH2PS, dl, MVT::v4f32, Res); - return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f32, Res, - DAG.getIntPtrConstant(0, dl)); + DAG.getConstant(0, dl, MVT::v8i16), Src, + DAG.getIntPtrConstant(0, dl)); + + SDValue Chain; + if (IsStrict) { + Res = DAG.getNode(X86ISD::STRICT_CVTPH2PS, dl, {MVT::v4f32, MVT::Other}, + {Op.getOperand(0), Res}); + Chain = Res.getValue(1); + } else { + Res = DAG.getNode(X86ISD::CVTPH2PS, dl, MVT::v4f32, Res); + } + + Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f32, Res, + DAG.getIntPtrConstant(0, dl)); + + if (IsStrict) + return DAG.getMergeValues({Res, Chain}, dl); + + return Res; } static SDValue LowerFP_TO_FP16(SDValue Op, SelectionDAG &DAG) { - assert(Op.getOperand(0).getValueType() == MVT::f32 && - Op.getValueType() == MVT::i16 && "Unexpected VT!"); + bool IsStrict = Op->isStrictFPOpcode(); + SDValue Src = Op.getOperand(IsStrict ? 
1 : 0); + assert(Src.getValueType() == MVT::f32 && Op.getValueType() == MVT::i16 && + "Unexpected VT!"); SDLoc dl(Op); - SDValue Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, - Op.getOperand(0)); - Res = DAG.getNode(X86ISD::CVTPS2PH, dl, MVT::v8i16, Res, - DAG.getTargetConstant(4, dl, MVT::i32)); - return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, Res, - DAG.getIntPtrConstant(0, dl)); + SDValue Res, Chain; + if (IsStrict) { + Res = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v4f32, + DAG.getConstantFP(0, dl, MVT::v4f32), Src, + DAG.getIntPtrConstant(0, dl)); + Res = DAG.getNode( + X86ISD::STRICT_CVTPS2PH, dl, {MVT::v8i16, MVT::Other}, + {Op.getOperand(0), Res, DAG.getTargetConstant(4, dl, MVT::i32)}); + Chain = Res.getValue(1); + } else { + // FIXME: Should we use zeros for upper elements for non-strict? + Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, Src); + Res = DAG.getNode(X86ISD::CVTPS2PH, dl, MVT::v8i16, Res, + DAG.getTargetConstant(4, dl, MVT::i32)); + } + + Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, Res, + DAG.getIntPtrConstant(0, dl)); + + if (IsStrict) + return DAG.getMergeValues({Res, Chain}, dl); + + return Res; } /// Depending on uarch and/or optimizing for size, we might prefer to use a @@ -28821,8 +28866,10 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { case ISD::STRICT_FP_EXTEND: return LowerFP_EXTEND(Op, DAG); case ISD::FP_ROUND: case ISD::STRICT_FP_ROUND: return LowerFP_ROUND(Op, DAG); - case ISD::FP16_TO_FP: return LowerFP16_TO_FP(Op, DAG); - case ISD::FP_TO_FP16: return LowerFP_TO_FP16(Op, DAG); + case ISD::FP16_TO_FP: + case ISD::STRICT_FP16_TO_FP: return LowerFP16_TO_FP(Op, DAG); + case ISD::FP_TO_FP16: + case ISD::STRICT_FP_TO_FP16: return LowerFP_TO_FP16(Op, DAG); case ISD::LOAD: return LowerLoad(Op, Subtarget, DAG); case ISD::STORE: return LowerStore(Op, Subtarget, DAG); case ISD::FADD: @@ -30162,8 +30209,10 @@ const char 
*X86TargetLowering::getTargetNodeName(unsigned Opcode) const { NODE_NAME_CASE(SCALAR_UINT_TO_FP) NODE_NAME_CASE(SCALAR_UINT_TO_FP_RND) NODE_NAME_CASE(CVTPS2PH) + NODE_NAME_CASE(STRICT_CVTPS2PH) NODE_NAME_CASE(MCVTPS2PH) NODE_NAME_CASE(CVTPH2PS) + NODE_NAME_CASE(STRICT_CVTPH2PS) NODE_NAME_CASE(CVTPH2PS_SAE) NODE_NAME_CASE(CVTP2SI) NODE_NAME_CASE(CVTP2UI) diff --git a/llvm/lib/Target/X86/X86ISelLowering.h b/llvm/lib/Target/X86/X86ISelLowering.h index cf88ec8..8c085c7 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.h +++ b/llvm/lib/Target/X86/X86ISelLowering.h @@ -627,6 +627,9 @@ namespace llvm { // Strict FMA nodes. STRICT_FNMADD, STRICT_FMSUB, STRICT_FNMSUB, + // Conversions between float and half-float. + STRICT_CVTPS2PH, STRICT_CVTPH2PS, + // Compare and swap. LCMPXCHG_DAG = ISD::FIRST_TARGET_MEMORY_OPCODE, LCMPXCHG8_DAG, diff --git a/llvm/lib/Target/X86/X86InstrAVX512.td b/llvm/lib/Target/X86/X86InstrAVX512.td index 9ce6957..f07f568 100644 --- a/llvm/lib/Target/X86/X86InstrAVX512.td +++ b/llvm/lib/Target/X86/X86InstrAVX512.td @@ -8568,14 +8568,15 @@ let Uses = [MXCSR], mayRaiseFPException = 1 in multiclass avx512_cvtph2ps { - defm rr : AVX512_maskable<0x13, MRMSrcReg, _dest ,(outs _dest.RC:$dst), + defm rr : AVX512_maskable_split<0x13, MRMSrcReg, _dest ,(outs _dest.RC:$dst), (ins _src.RC:$src), "vcvtph2ps", "$src", "$src", + (X86any_cvtph2ps (_src.VT _src.RC:$src)), (X86cvtph2ps (_src.VT _src.RC:$src))>, T8PD, Sched<[sched]>; - defm rm : AVX512_maskable<0x13, MRMSrcMem, _dest, (outs _dest.RC:$dst), + defm rm : AVX512_maskable_split<0x13, MRMSrcMem, _dest, (outs _dest.RC:$dst), (ins x86memop:$src), "vcvtph2ps", "$src", "$src", - (X86cvtph2ps (_src.VT - (ld_frag addr:$src)))>, + (X86any_cvtph2ps (_src.VT (ld_frag addr:$src))), + (X86cvtph2ps (_src.VT (ld_frag addr:$src)))>, T8PD, Sched<[sched.Folded]>; } @@ -8604,9 +8605,9 @@ let Predicates = [HasVLX] in { EVEX_CD8<32, CD8VH>; // Pattern match vcvtph2ps of a scalar i64 load. 
- def : Pat<(v4f32 (X86cvtph2ps (bc_v8i16 (v2i64 (X86vzload64 addr:$src))))), + def : Pat<(v4f32 (X86any_cvtph2ps (bc_v8i16 (v2i64 (X86vzload64 addr:$src))))), (VCVTPH2PSZ128rm addr:$src)>; - def : Pat<(v4f32 (X86cvtph2ps (v8i16 (bitconvert + def : Pat<(v4f32 (X86any_cvtph2ps (v8i16 (bitconvert (v2i64 (scalar_to_vector (loadi64 addr:$src))))))), (VCVTPH2PSZ128rm addr:$src)>; } @@ -8618,7 +8619,7 @@ let ExeDomain = GenericDomain, Uses = [MXCSR], mayRaiseFPException = 1 in { (ins _src.RC:$src1, i32u8imm:$src2), "vcvtps2ph\t{$src2, $src1, $dst|$dst, $src1, $src2}", [(set _dest.RC:$dst, - (X86cvtps2ph (_src.VT _src.RC:$src1), (i32 timm:$src2)))]>, + (X86any_cvtps2ph (_src.VT _src.RC:$src1), (i32 timm:$src2)))]>, Sched<[RR]>; let Constraints = "$src0 = $dst" in def rrk : AVX512AIi8<0x1D, MRMDestReg, (outs _dest.RC:$dst), @@ -8673,16 +8674,16 @@ let Predicates = [HasAVX512] in { } def : Pat<(store (f64 (extractelt - (bc_v2f64 (v8i16 (X86cvtps2ph VR128X:$src1, timm:$src2))), + (bc_v2f64 (v8i16 (X86any_cvtps2ph VR128X:$src1, timm:$src2))), (iPTR 0))), addr:$dst), (VCVTPS2PHZ128mr addr:$dst, VR128X:$src1, timm:$src2)>; def : Pat<(store (i64 (extractelt - (bc_v2i64 (v8i16 (X86cvtps2ph VR128X:$src1, timm:$src2))), + (bc_v2i64 (v8i16 (X86any_cvtps2ph VR128X:$src1, timm:$src2))), (iPTR 0))), addr:$dst), (VCVTPS2PHZ128mr addr:$dst, VR128X:$src1, timm:$src2)>; - def : Pat<(store (v8i16 (X86cvtps2ph VR256X:$src1, timm:$src2)), addr:$dst), + def : Pat<(store (v8i16 (X86any_cvtps2ph VR256X:$src1, timm:$src2)), addr:$dst), (VCVTPS2PHZ256mr addr:$dst, VR256X:$src1, timm:$src2)>; - def : Pat<(store (v16i16 (X86cvtps2ph VR512:$src1, timm:$src2)), addr:$dst), + def : Pat<(store (v16i16 (X86any_cvtps2ph VR512:$src1, timm:$src2)), addr:$dst), (VCVTPS2PHZmr addr:$dst, VR512:$src1, timm:$src2)>; } diff --git a/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td b/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td index 5c5ddae..3fc63e8 100644 --- a/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td +++ 
b/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td @@ -721,19 +721,27 @@ def X86mcvtp2UInt : SDNode<"X86ISD::MCVTP2UI", SDTMFloatToInt>; def X86mcvttp2si : SDNode<"X86ISD::MCVTTP2SI", SDTMFloatToInt>; def X86mcvttp2ui : SDNode<"X86ISD::MCVTTP2UI", SDTMFloatToInt>; +def SDTcvtph2ps : SDTypeProfile<1, 1, [SDTCVecEltisVT<0, f32>, + SDTCVecEltisVT<1, i16>]>; +def X86cvtph2ps : SDNode<"X86ISD::CVTPH2PS", SDTcvtph2ps>; +def X86strict_cvtph2ps : SDNode<"X86ISD::STRICT_CVTPH2PS", SDTcvtph2ps, + [SDNPHasChain]>; +def X86any_cvtph2ps : PatFrags<(ops node:$src), + [(X86strict_cvtph2ps node:$src), + (X86cvtph2ps node:$src)]>; + +def X86cvtph2psSAE : SDNode<"X86ISD::CVTPH2PS_SAE", SDTcvtph2ps>; + +def SDTcvtps2ph : SDTypeProfile<1, 2, [SDTCVecEltisVT<0, i16>, + SDTCVecEltisVT<1, f32>, + SDTCisVT<2, i32>]>; +def X86cvtps2ph : SDNode<"X86ISD::CVTPS2PH", SDTcvtps2ph>; +def X86strict_cvtps2ph : SDNode<"X86ISD::STRICT_CVTPS2PH", SDTcvtps2ph, + [SDNPHasChain]>; +def X86any_cvtps2ph : PatFrags<(ops node:$src1, node:$src2), + [(X86strict_cvtps2ph node:$src1, node:$src2), + (X86cvtps2ph node:$src1, node:$src2)]>; -def X86cvtph2ps : SDNode<"X86ISD::CVTPH2PS", - SDTypeProfile<1, 1, [SDTCVecEltisVT<0, f32>, - SDTCVecEltisVT<1, i16>]> >; - -def X86cvtph2psSAE : SDNode<"X86ISD::CVTPH2PS_SAE", - SDTypeProfile<1, 1, [SDTCVecEltisVT<0, f32>, - SDTCVecEltisVT<1, i16>]> >; - -def X86cvtps2ph : SDNode<"X86ISD::CVTPS2PH", - SDTypeProfile<1, 2, [SDTCVecEltisVT<0, i16>, - SDTCVecEltisVT<1, f32>, - SDTCisVT<2, i32>]> >; def X86mcvtps2ph : SDNode<"X86ISD::MCVTPS2PH", SDTypeProfile<1, 4, [SDTCVecEltisVT<0, i16>, SDTCVecEltisVT<1, f32>, diff --git a/llvm/lib/Target/X86/X86InstrSSE.td b/llvm/lib/Target/X86/X86InstrSSE.td index b519e4a..54cb78d 100644 --- a/llvm/lib/Target/X86/X86InstrSSE.td +++ b/llvm/lib/Target/X86/X86InstrSSE.td @@ -7337,12 +7337,12 @@ multiclass f16c_ph2ps { def rr : I<0x13, MRMSrcReg, (outs RC:$dst), (ins VR128:$src), "vcvtph2ps\t{$src, $dst|$dst, $src}", - [(set RC:$dst, (X86cvtph2ps 
VR128:$src))]>, + [(set RC:$dst, (X86any_cvtph2ps VR128:$src))]>, T8PD, VEX, Sched<[sched]>; let hasSideEffects = 0, mayLoad = 1 in def rm : I<0x13, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src), "vcvtph2ps\t{$src, $dst|$dst, $src}", - [(set RC:$dst, (X86cvtph2ps (loadv8i16 addr:$src)))]>, + [(set RC:$dst, (X86any_cvtph2ps (loadv8i16 addr:$src)))]>, T8PD, VEX, Sched<[sched.Folded]>; } @@ -7351,7 +7351,7 @@ multiclass f16c_ps2ph, + [(set VR128:$dst, (X86any_cvtps2ph RC:$src1, timm:$src2))]>, TAPD, VEX, Sched<[RR]>; let hasSideEffects = 0, mayStore = 1 in def mr : Ii8<0x1D, MRMDestMem, (outs), @@ -7369,21 +7369,21 @@ let Predicates = [HasF16C, NoVLX] in { WriteCvtPS2PHYSt>, VEX_L, SIMD_EXC; // Pattern match vcvtph2ps of a scalar i64 load. - def : Pat<(v4f32 (X86cvtph2ps (bc_v8i16 (v2i64 (X86vzload64 addr:$src))))), + def : Pat<(v4f32 (X86any_cvtph2ps (bc_v8i16 (v2i64 (X86vzload64 addr:$src))))), (VCVTPH2PSrm addr:$src)>; - def : Pat<(v4f32 (X86cvtph2ps (bc_v8i16 + def : Pat<(v4f32 (X86any_cvtph2ps (bc_v8i16 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))), (VCVTPH2PSrm addr:$src)>; def : Pat<(store (f64 (extractelt - (bc_v2f64 (v8i16 (X86cvtps2ph VR128:$src1, timm:$src2))), + (bc_v2f64 (v8i16 (X86any_cvtps2ph VR128:$src1, timm:$src2))), (iPTR 0))), addr:$dst), (VCVTPS2PHmr addr:$dst, VR128:$src1, timm:$src2)>; def : Pat<(store (i64 (extractelt - (bc_v2i64 (v8i16 (X86cvtps2ph VR128:$src1, timm:$src2))), + (bc_v2i64 (v8i16 (X86any_cvtps2ph VR128:$src1, timm:$src2))), (iPTR 0))), addr:$dst), (VCVTPS2PHmr addr:$dst, VR128:$src1, timm:$src2)>; - def : Pat<(store (v8i16 (X86cvtps2ph VR256:$src1, timm:$src2)), addr:$dst), + def : Pat<(store (v8i16 (X86any_cvtps2ph VR256:$src1, timm:$src2)), addr:$dst), (VCVTPS2PHYmr addr:$dst, VR256:$src1, timm:$src2)>; } -- 2.7.4