From d04db4825a4d5f7fe298811a9a1644ae87748c6c Mon Sep 17 00:00:00 2001
From: Sanjay Patel
Date: Sun, 12 Apr 2020 10:26:43 -0400
Subject: [PATCH] [x86] use vector instructions to lower FP->int->FP casts

As discussed in PR36617:
https://bugs.llvm.org/show_bug.cgi?id=36617#c13
...we can avoid the likely slow round-trip from XMM to GPR to XMM by
using the vector versions of the convert instructions.

Based on experimental results from recent Intel/AMD chips, we don't need
to worry about triggering denorm stalls while operating on garbage data
in the high lanes with convert instructions, so this is expected to
always perform as well as or better than the scalar instruction
equivalent.

FP exceptions are also not a concern because strict code should not be
using the regular SDAG opcodes.

Differential Revision: https://reviews.llvm.org/D77895
---
 llvm/lib/Target/X86/X86ISelLowering.cpp | 42 +++++++++++++++++++++++++++++++++
 llvm/test/CodeGen/X86/ftrunc.ll         | 14 +++++------
 2 files changed, 48 insertions(+), 8 deletions(-)

diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index a474679..623c1df 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -19141,6 +19141,45 @@ static SDValue vectorizeExtractedCast(SDValue Cast, SelectionDAG &DAG,
                      DAG.getIntPtrConstant(0, DL));
 }
 
+/// Given a scalar cast to FP with a cast to integer operand (almost an ftrunc),
+/// try to vectorize the cast ops. This will avoid an expensive round-trip
+/// between XMM and GPR.
+static SDValue lowerFPToIntToFP(SDValue CastToFP, SelectionDAG &DAG,
+                                const X86Subtarget &Subtarget) {
+  // TODO: Allow FP_TO_UINT.
+  SDValue CastToInt = CastToFP.getOperand(0);
+  MVT VT = CastToFP.getSimpleValueType();
+  if (CastToInt.getOpcode() != ISD::FP_TO_SINT || VT.isVector())
+    return SDValue();
+
+  MVT IntVT = CastToInt.getSimpleValueType();
+  SDValue X = CastToInt.getOperand(0);
+  // TODO: Allow size-changing from source to dest (double -> i32 -> float)
+  if (X.getSimpleValueType() != VT ||
+      VT.getSizeInBits() != IntVT.getSizeInBits())
+    return SDValue();
+
+  // See if we have a 128-bit vector cast op for this type of cast.
+  unsigned NumEltsInXMM = 128 / VT.getScalarSizeInBits();
+  MVT Vec128VT = MVT::getVectorVT(VT, NumEltsInXMM);
+  MVT Int128VT = MVT::getVectorVT(IntVT, NumEltsInXMM);
+  if (!useVectorCast(CastToFP.getOpcode(), Int128VT, Vec128VT, Subtarget))
+    return SDValue();
+
+  // sint_to_fp (fp_to_sint X) --> extelt (sint_to_fp (fp_to_sint (s2v X))), 0
+  //
+  // We are not defining the high elements (for example, zero them) because
+  // that could nullify any performance advantage that we hoped to gain from
+  // this vector op hack. We do not expect any adverse effects (like denorm
+  // penalties) with cast ops.
+  SDLoc DL(CastToFP);
+  SDValue ZeroIdx = DAG.getIntPtrConstant(0, DL);
+  SDValue VecX = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, Vec128VT, X);
+  SDValue VCastToInt = DAG.getNode(ISD::FP_TO_SINT, DL, Int128VT, VecX);
+  SDValue VCastToFP = DAG.getNode(ISD::SINT_TO_FP, DL, Vec128VT, VCastToInt);
+  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, VCastToFP, ZeroIdx);
+}
+
 static SDValue lowerINT_TO_FP_vXi64(SDValue Op, SelectionDAG &DAG,
                                     const X86Subtarget &Subtarget) {
   SDLoc DL(Op);
@@ -19243,6 +19282,9 @@ SDValue X86TargetLowering::LowerSINT_TO_FP(SDValue Op,
   if (SDValue Extract = vectorizeExtractedCast(Op, DAG, Subtarget))
     return Extract;
 
+  if (SDValue R = lowerFPToIntToFP(Op, DAG, Subtarget))
+    return R;
+
   if (SrcVT.isVector()) {
     if (SrcVT == MVT::v2i32 && VT == MVT::v2f64) {
       // Note: Since v2f64 is a legal type. We don't need to zero extend the
diff --git a/llvm/test/CodeGen/X86/ftrunc.ll b/llvm/test/CodeGen/X86/ftrunc.ll
index 7c58f34..10e9b9c 100644
--- a/llvm/test/CodeGen/X86/ftrunc.ll
+++ b/llvm/test/CodeGen/X86/ftrunc.ll
@@ -223,15 +223,14 @@ define <4 x double> @trunc_unsigned_v4f64(<4 x double> %x) #0 {
 define float @trunc_signed_f32_no_fast_math(float %x) {
 ; SSE-LABEL: trunc_signed_f32_no_fast_math:
 ; SSE:       # %bb.0:
-; SSE-NEXT:    cvttss2si %xmm0, %eax
-; SSE-NEXT:    xorps %xmm0, %xmm0
-; SSE-NEXT:    cvtsi2ss %eax, %xmm0
+; SSE-NEXT:    cvttps2dq %xmm0, %xmm0
+; SSE-NEXT:    cvtdq2ps %xmm0, %xmm0
 ; SSE-NEXT:    retq
 ;
 ; AVX1-LABEL: trunc_signed_f32_no_fast_math:
 ; AVX1:       # %bb.0:
-; AVX1-NEXT:    vcvttss2si %xmm0, %eax
-; AVX1-NEXT:    vcvtsi2ss %eax, %xmm1, %xmm0
+; AVX1-NEXT:    vcvttps2dq %xmm0, %xmm0
+; AVX1-NEXT:    vcvtdq2ps %xmm0, %xmm0
 ; AVX1-NEXT:    retq
   %i = fptosi float %x to i32
   %r = sitofp i32 %i to float
@@ -241,9 +240,8 @@ define float @trunc_signed_f32_no_fast_math(float %x) {
 define float @trunc_signed_f32(float %x) #0 {
 ; SSE2-LABEL: trunc_signed_f32:
 ; SSE2:       # %bb.0:
-; SSE2-NEXT:    cvttss2si %xmm0, %eax
-; SSE2-NEXT:    xorps %xmm0, %xmm0
-; SSE2-NEXT:    cvtsi2ss %eax, %xmm0
+; SSE2-NEXT:    cvttps2dq %xmm0, %xmm0
+; SSE2-NEXT:    cvtdq2ps %xmm0, %xmm0
 ; SSE2-NEXT:    retq
 ;
 ; SSE41-LABEL: trunc_signed_f32:
-- 
2.7.4
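
For reference, a minimal source-level reproducer for the pattern handled by the new
lowering might look like the sketch below. It is only an illustration, not part of
the patch: it assumes a C/C++ front end (such as Clang) that emits the same
fptosi/sitofp pair exercised in ftrunc.ll, the function name simply mirrors the
existing trunc_signed_f32 test, and the before/after SSE sequences are the ones
shown in the updated check lines above.

    // Illustrative sketch (not part of the patch): truncate toward zero by
    // round-tripping a float through a signed int. This compiles to
    // 'fptosi' followed by 'sitofp', which the patch now lowers with packed
    // converts on the whole XMM register instead of a scalar
    // XMM -> GPR -> XMM round-trip.
    //
    // Before (SSE):  cvttss2si %xmm0, %eax
    //                xorps     %xmm0, %xmm0
    //                cvtsi2ss  %eax, %xmm0
    // After  (SSE):  cvttps2dq %xmm0, %xmm0
    //                cvtdq2ps  %xmm0, %xmm0
    float trunc_signed_f32(float x) {
      return (float)(int)x;
    }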