From 18b57da49170332b0d141b1622b8621aeac87a39 Mon Sep 17 00:00:00 2001
From: Craig Topper
Date: Sat, 10 Dec 2016 19:35:39 +0000
Subject: [PATCH] [AVX-512] Add support for lowering (v2i64 (fp_to_sint (v2f32))) to vcvttps2uqq when AVX512DQ and AVX512VL are available.

llvm-svn: 289335
---
 llvm/lib/Target/X86/X86ISelLowering.cpp | 28 ++++++++--
 llvm/lib/Target/X86/X86ISelLowering.h   |  3 +-
 llvm/test/CodeGen/X86/vec_fp_to_int.ll  | 96 ++++++++++++++++++++++++++-------
 3 files changed, 103 insertions(+), 24 deletions(-)

diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index b690554..ac3e44d 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -1265,6 +1265,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
     if (Subtarget.hasVLX()) {
       // Fast v2f32 SINT_TO_FP( v2i32 ) custom conversion.
       setOperationAction(ISD::SINT_TO_FP, MVT::v2f32, Custom);
+      setOperationAction(ISD::FP_TO_SINT, MVT::v2f32, Custom);
+      setOperationAction(ISD::FP_TO_UINT, MVT::v2f32, Custom);
     }
   }
   if (Subtarget.hasVLX()) {
@@ -15233,11 +15235,28 @@ SDValue X86TargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const {
 }
 
 SDValue X86TargetLowering::LowerFP_TO_INT(SDValue Op,
+                                          const X86Subtarget &Subtarget,
                                           SelectionDAG &DAG) const {
-  assert(!Op.getSimpleValueType().isVector());
-
   bool IsSigned = Op.getOpcode() == ISD::FP_TO_SINT;
+  MVT VT = Op.getSimpleValueType();
+
+  if (VT.isVector()) {
+    assert(Subtarget.hasDQI() && Subtarget.hasVLX() && "Requires AVX512DQVL!");
+    SDValue Src = Op.getOperand(0);
+    SDLoc dl(Op);
+    if (VT == MVT::v2i64 && Src.getSimpleValueType() == MVT::v2f32) {
+      return DAG.getNode(IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI,
+                         dl, VT,
+                         DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, Src,
+                                     DAG.getUNDEF(MVT::v2f32)));
+    }
+
+    return SDValue();
+  }
+
+  assert(!VT.isVector());
+
   std::pair<SDValue, SDValue> Vals =
       FP_TO_INTHelper(Op, DAG, IsSigned, /*IsReplace=*/ false);
   SDValue FIST = Vals.first, StackSlot = Vals.second;
@@ -15247,8 +15266,7 @@ SDValue X86TargetLowering::LowerFP_TO_INT(SDValue Op,
 
   if (StackSlot.getNode())
     // Load the result.
-    return DAG.getLoad(Op.getValueType(), SDLoc(Op), FIST, StackSlot,
-                       MachinePointerInfo());
+    return DAG.getLoad(VT, SDLoc(Op), FIST, StackSlot, MachinePointerInfo());
 
   // The node is the result.
   return FIST;
@@ -22780,7 +22798,7 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
   case ISD::SIGN_EXTEND_VECTOR_INREG:
                                 return LowerEXTEND_VECTOR_INREG(Op, Subtarget, DAG);
   case ISD::FP_TO_SINT:
-  case ISD::FP_TO_UINT:         return LowerFP_TO_INT(Op, DAG);
+  case ISD::FP_TO_UINT:         return LowerFP_TO_INT(Op, Subtarget, DAG);
   case ISD::FP_EXTEND:          return LowerFP_EXTEND(Op, DAG);
   case ISD::LOAD:               return LowerExtendedLoad(Op, Subtarget, DAG);
   case ISD::FABS:
diff --git a/llvm/lib/Target/X86/X86ISelLowering.h b/llvm/lib/Target/X86/X86ISelLowering.h
index 962d940..7ba9832 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.h
+++ b/llvm/lib/Target/X86/X86ISelLowering.h
@@ -1137,7 +1137,8 @@ namespace llvm {
     SDValue LowerUINT_TO_FP_i32(SDValue Op, SelectionDAG &DAG) const;
     SDValue lowerUINT_TO_FP_vec(SDValue Op, SelectionDAG &DAG) const;
     SDValue LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const;
-    SDValue LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerFP_TO_INT(SDValue Op, const X86Subtarget &Subtarget,
+                           SelectionDAG &DAG) const;
     SDValue LowerToBT(SDValue And, ISD::CondCode CC, const SDLoc &dl,
                       SelectionDAG &DAG) const;
     SDValue LowerSETCC(SDValue Op, SelectionDAG &DAG) const;
diff --git a/llvm/test/CodeGen/X86/vec_fp_to_int.ll b/llvm/test/CodeGen/X86/vec_fp_to_int.ll
index 4ae95ba..cf7ca6b 100644
--- a/llvm/test/CodeGen/X86/vec_fp_to_int.ll
+++ b/llvm/test/CodeGen/X86/vec_fp_to_int.ll
@@ -886,15 +886,50 @@ define <2 x i64> @fptosi_2f32_to_2i64(<4 x float> %a) {
 ; SSE-NEXT:    movdqa %xmm1, %xmm0
 ; SSE-NEXT:    retq
 ;
-; AVX-LABEL: fptosi_2f32_to_2i64:
-; AVX:       # BB#0:
-; AVX-NEXT:    vcvttss2si %xmm0, %rax
-; AVX-NEXT:    vmovq %rax, %xmm1
-; AVX-NEXT:    vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; AVX-NEXT:    vcvttss2si %xmm0, %rax
-; AVX-NEXT:    vmovq %rax, %xmm0
-; AVX-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
-; AVX-NEXT:    retq
+; VEX-LABEL: fptosi_2f32_to_2i64:
+; VEX:       # BB#0:
+; VEX-NEXT:    vcvttss2si %xmm0, %rax
+; VEX-NEXT:    vmovq %rax, %xmm1
+; VEX-NEXT:    vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; VEX-NEXT:    vcvttss2si %xmm0, %rax
+; VEX-NEXT:    vmovq %rax, %xmm0
+; VEX-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; VEX-NEXT:    retq
+;
+; AVX512F-LABEL: fptosi_2f32_to_2i64:
+; AVX512F:       # BB#0:
+; AVX512F-NEXT:    vcvttss2si %xmm0, %rax
+; AVX512F-NEXT:    vmovq %rax, %xmm1
+; AVX512F-NEXT:    vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; AVX512F-NEXT:    vcvttss2si %xmm0, %rax
+; AVX512F-NEXT:    vmovq %rax, %xmm0
+; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX512F-NEXT:    retq
+;
+; AVX512VL-LABEL: fptosi_2f32_to_2i64:
+; AVX512VL:       # BB#0:
+; AVX512VL-NEXT:    vcvttss2si %xmm0, %rax
+; AVX512VL-NEXT:    vmovq %rax, %xmm1
+; AVX512VL-NEXT:    vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; AVX512VL-NEXT:    vcvttss2si %xmm0, %rax
+; AVX512VL-NEXT:    vmovq %rax, %xmm0
+; AVX512VL-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX512VL-NEXT:    retq
+;
+; AVX512DQ-LABEL: fptosi_2f32_to_2i64:
+; AVX512DQ:       # BB#0:
+; AVX512DQ-NEXT:    vcvttss2si %xmm0, %rax
+; AVX512DQ-NEXT:    vmovq %rax, %xmm1
+; AVX512DQ-NEXT:    vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; AVX512DQ-NEXT:    vcvttss2si %xmm0, %rax
+; AVX512DQ-NEXT:    vmovq %rax, %xmm0
+; AVX512DQ-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX512DQ-NEXT:    retq
+;
+; AVX512VLDQ-LABEL: fptosi_2f32_to_2i64:
+; AVX512VLDQ:       # BB#0:
+; AVX512VLDQ-NEXT:    vcvttps2qq %xmm0, %xmm0
+; AVX512VLDQ-NEXT:    retq
   %shuf = shufflevector <4 x float> %a, <4 x float> undef, <2 x i32> <i32 0, i32 1>
   %cvt = fptosi <2 x float> %shuf to <2 x i64>
   ret <2 x i64> %cvt
@@ -1384,15 +1419,40 @@ define <2 x i64> @fptoui_2f32_to_2i64(<4 x float> %a) {
 ; VEX-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm0[0]
 ; VEX-NEXT:    retq
 ;
-; AVX512-LABEL: fptoui_2f32_to_2i64:
-; AVX512:       # BB#0:
-; AVX512-NEXT:    vcvttss2usi %xmm0, %rax
-; AVX512-NEXT:    vmovq %rax, %xmm1
-; AVX512-NEXT:    vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; AVX512-NEXT:    vcvttss2usi %xmm0, %rax
-; AVX512-NEXT:    vmovq %rax, %xmm0
-; AVX512-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
-; AVX512-NEXT:    retq
+; AVX512F-LABEL: fptoui_2f32_to_2i64:
+; AVX512F:       # BB#0:
+; AVX512F-NEXT:    vcvttss2usi %xmm0, %rax
+; AVX512F-NEXT:    vmovq %rax, %xmm1
+; AVX512F-NEXT:    vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; AVX512F-NEXT:    vcvttss2usi %xmm0, %rax
+; AVX512F-NEXT:    vmovq %rax, %xmm0
+; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX512F-NEXT:    retq
+;
+; AVX512VL-LABEL: fptoui_2f32_to_2i64:
+; AVX512VL:       # BB#0:
+; AVX512VL-NEXT:    vcvttss2usi %xmm0, %rax
+; AVX512VL-NEXT:    vmovq %rax, %xmm1
+; AVX512VL-NEXT:    vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; AVX512VL-NEXT:    vcvttss2usi %xmm0, %rax
+; AVX512VL-NEXT:    vmovq %rax, %xmm0
+; AVX512VL-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX512VL-NEXT:    retq
+;
+; AVX512DQ-LABEL: fptoui_2f32_to_2i64:
+; AVX512DQ:       # BB#0:
+; AVX512DQ-NEXT:    vcvttss2usi %xmm0, %rax
+; AVX512DQ-NEXT:    vmovq %rax, %xmm1
+; AVX512DQ-NEXT:    vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; AVX512DQ-NEXT:    vcvttss2usi %xmm0, %rax
+; AVX512DQ-NEXT:    vmovq %rax, %xmm0
+; AVX512DQ-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX512DQ-NEXT:    retq
+;
+; AVX512VLDQ-LABEL: fptoui_2f32_to_2i64:
+; AVX512VLDQ:       # BB#0:
+; AVX512VLDQ-NEXT:    vcvttps2uqq %xmm0, %xmm0
+; AVX512VLDQ-NEXT:    retq
   %shuf = shufflevector <4 x float> %a, <4 x float> undef, <2 x i32> <i32 0, i32 1>
   %cvt = fptoui <2 x float> %shuf to <2 x i64>
   ret <2 x i64> %cvt
-- 
2.7.4
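
For reference, a minimal reproducer sketch for the new lowering (not taken
from the patch; the file and function names below are made up, and the
expected output is inferred from the AVX512VLDQ check lines above):

  ; repro.ll: with AVX512DQ and AVX512VL enabled, the fptoui of the low two
  ; float elements should now select a single vcvttps2uqq instead of two
  ; scalar vcvttss2usi conversions.
  define <2 x i64> @low_2f32_to_2u64(<4 x float> %a) {
    %lo = shufflevector <4 x float> %a, <4 x float> undef, <2 x i32> <i32 0, i32 1>
    %cvt = fptoui <2 x float> %lo to <2 x i64>
    ret <2 x i64> %cvt
  }

Running this through, e.g., "llc -mtriple=x86_64-unknown-unknown
-mattr=+avx512dq,+avx512vl < repro.ll" exercises the X86ISD::CVTTP2UI path
added to LowerFP_TO_INT; dropping either +avx512dq or +avx512vl should fall
back to the scalar two-conversion sequence checked under AVX512F above.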