From b4e7893ba8db8ceead3ae1c113598e5a107deaa2 Mon Sep 17 00:00:00 2001
From: Sanjay Patel
Date: Thu, 10 May 2018 15:40:49 +0000
Subject: [PATCH] [x86] fix fmaxnum/fminnum with nnan

With nnan, there's no need for the masked merge / blend sequence (that
probably costs much more than the min/max instruction).

Somewhere between clang 5.0 and 6.0, we started producing these intrinsics
for fmax()/fmin() in C source instead of libcalls or fcmp/select. The
backend wasn't prepared for that, so we regressed perf in those cases.

Note: it's possible that other targets have similar problems as seen here.

Noticed while investigating PR37403 and related bugs:
https://bugs.llvm.org/show_bug.cgi?id=37403

The IR FMF propagation cases still don't work. There's a proposal that
might fix those cases in D46563.

llvm-svn: 331992
---
 llvm/lib/Target/X86/X86ISelLowering.cpp | 22 +++++++++++++---------
 llvm/test/CodeGen/X86/fmaxnum.ll        | 30 +++++++-----------------------
 llvm/test/CodeGen/X86/fminnum.ll        | 30 +++++++-----------------------
 3 files changed, 27 insertions(+), 55 deletions(-)

diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 9825208..1c72ab7 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -36364,8 +36364,6 @@ static SDValue combineFMinNumFMaxNum(SDNode *N, SelectionDAG &DAG,
   if (Subtarget.useSoftFloat())
     return SDValue();
 
-  // TODO: Check for global or instruction-level "nnan". In that case, we
-  //       should be able to lower to FMAX/FMIN alone.
   // TODO: If an operand is already known to be a NaN or not a NaN, this
   //       should be an optional swap and FMAX/FMIN.
 
@@ -36375,14 +36373,21 @@
         (Subtarget.hasAVX() && (VT == MVT::v8f32 || VT == MVT::v4f64))))
     return SDValue();
 
-  // This takes at least 3 instructions, so favor a library call when operating
-  // on a scalar and minimizing code size.
-  if (!VT.isVector() && DAG.getMachineFunction().getFunction().optForMinSize())
-    return SDValue();
-
   SDValue Op0 = N->getOperand(0);
   SDValue Op1 = N->getOperand(1);
   SDLoc DL(N);
+  auto MinMaxOp = N->getOpcode() == ISD::FMAXNUM ? X86ISD::FMAX : X86ISD::FMIN;
+
+  // If we don't have to respect NaN inputs, this is a direct translation to x86
+  // min/max instructions.
+  if (DAG.getTarget().Options.NoNaNsFPMath || N->getFlags().hasNoNaNs())
+    return DAG.getNode(MinMaxOp, DL, VT, Op0, Op1, N->getFlags());
+
+  // If we have to respect NaN inputs, this takes at least 3 instructions.
+  // Favor a library call when operating on a scalar and minimizing code size.
+  if (!VT.isVector() && DAG.getMachineFunction().getFunction().optForMinSize())
+    return SDValue();
+
   EVT SetCCType = DAG.getTargetLoweringInfo().getSetCCResultType(
       DAG.getDataLayout(), *DAG.getContext(), VT);
 
@@ -36405,9 +36410,8 @@
   // use those instructions for fmaxnum by selecting away a NaN input.
 
   // If either operand is NaN, the 2nd source operand (Op0) is passed through.
-  auto MinMaxOp = N->getOpcode() == ISD::FMAXNUM ? X86ISD::FMAX : X86ISD::FMIN;
   SDValue MinOrMax = DAG.getNode(MinMaxOp, DL, VT, Op1, Op0);
-  SDValue IsOp0Nan = DAG.getSetCC(DL, SetCCType , Op0, Op0, ISD::SETUO);
+  SDValue IsOp0Nan = DAG.getSetCC(DL, SetCCType, Op0, Op0, ISD::SETUO);
 
   // If Op0 is a NaN, select Op1. Otherwise, select the max. If both operands
   // are NaN, the NaN value of Op1 is the result.
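For context, a minimal IR sketch of the case the new code handles (not taken
from this patch; the function name is made up, and the attribute spelling is
assumed to be the usual "no-nans-fp-math" string attribute that maps to
TargetOptions.NoNaNsFPMath). With that attribute, the combine above should
lower the maxnum call straight to a single (v)maxss with no cmpunord/blend.
An instruction-level nnan flag would take the same path via
N->getFlags().hasNoNaNs(), but as the commit message notes, that FMF does not
propagate to the node yet.

; Hypothetical example, expected to compile to a lone (v)maxss:
define float @example_fmax_nnan(float %a, float %b) #0 {
  %r = tail call float @llvm.maxnum.f32(float %a, float %b)
  ret float %r
}

declare float @llvm.maxnum.f32(float, float)

attributes #0 = { "no-nans-fp-math"="true" }
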
diff --git a/llvm/test/CodeGen/X86/fmaxnum.ll b/llvm/test/CodeGen/X86/fmaxnum.ll
index c6c9fa7..1187609 100644
--- a/llvm/test/CodeGen/X86/fmaxnum.ll
+++ b/llvm/test/CodeGen/X86/fmaxnum.ll
@@ -285,7 +285,7 @@ define <8 x double> @test_intrinsic_fmax_v8f64(<8 x double> %x, <8 x double> %y)
   ret <8 x double> %z
 }
 
-; FIXME: The IR-level FMF should propagate to the node.
+; FIXME: The IR-level FMF should propagate to the node. With nnan, there's no need to blend.
 
 define double @maxnum_intrinsic_nnan_fmf_f64(double %a, double %b) {
 ; SSE-LABEL: maxnum_intrinsic_nnan_fmf_f64:
@@ -333,49 +333,33 @@ define <4 x float> @maxnum_intrinsic_nnan_fmf_f432(<4 x float> %a, <4 x float> %
   ret <4 x float> %r
 }
 
-; FIXME: Current (but legacy someday): a function-level attribute should also enable the fold.
+; Current (but legacy someday): a function-level attribute should also enable the fold.
 
 define float @maxnum_intrinsic_nnan_attr_f32(float %a, float %b) #0 {
 ; SSE-LABEL: maxnum_intrinsic_nnan_attr_f32:
 ; SSE:       # %bb.0:
-; SSE-NEXT:    movaps %xmm0, %xmm2
-; SSE-NEXT:    cmpunordss %xmm0, %xmm2
-; SSE-NEXT:    movaps %xmm2, %xmm3
-; SSE-NEXT:    andps %xmm1, %xmm3
-; SSE-NEXT:    maxss %xmm0, %xmm1
-; SSE-NEXT:    andnps %xmm1, %xmm2
-; SSE-NEXT:    orps %xmm3, %xmm2
-; SSE-NEXT:    movaps %xmm2, %xmm0
+; SSE-NEXT:    maxss %xmm1, %xmm0
 ; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: maxnum_intrinsic_nnan_attr_f32:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    vmaxss %xmm0, %xmm1, %xmm2
-; AVX-NEXT:    vcmpunordss %xmm0, %xmm0, %xmm0
-; AVX-NEXT:    vblendvps %xmm0, %xmm1, %xmm2, %xmm0
+; AVX-NEXT:    vmaxss %xmm1, %xmm0, %xmm0
 ; AVX-NEXT:    retq
   %r = tail call float @llvm.maxnum.f32(float %a, float %b)
   ret float %r
 }
 
-; FIXME: Make sure vectors work too.
+; Make sure vectors work too.
 
 define <2 x double> @maxnum_intrinsic_nnan_attr_f64(<2 x double> %a, <2 x double> %b) #0 {
 ; SSE-LABEL: maxnum_intrinsic_nnan_attr_f64:
 ; SSE:       # %bb.0:
-; SSE-NEXT:    movapd %xmm1, %xmm2
-; SSE-NEXT:    maxpd %xmm0, %xmm2
-; SSE-NEXT:    cmpunordpd %xmm0, %xmm0
-; SSE-NEXT:    andpd %xmm0, %xmm1
-; SSE-NEXT:    andnpd %xmm2, %xmm0
-; SSE-NEXT:    orpd %xmm1, %xmm0
+; SSE-NEXT:    maxpd %xmm1, %xmm0
 ; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: maxnum_intrinsic_nnan_attr_f64:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    vmaxpd %xmm0, %xmm1, %xmm2
-; AVX-NEXT:    vcmpunordpd %xmm0, %xmm0, %xmm0
-; AVX-NEXT:    vblendvpd %xmm0, %xmm1, %xmm2, %xmm0
+; AVX-NEXT:    vmaxpd %xmm1, %xmm0, %xmm0
 ; AVX-NEXT:    retq
   %r = tail call <2 x double> @llvm.maxnum.v2f64(<2 x double> %a, <2 x double> %b)
   ret <2 x double> %r
diff --git a/llvm/test/CodeGen/X86/fminnum.ll b/llvm/test/CodeGen/X86/fminnum.ll
index f520503..a81e35c 100644
--- a/llvm/test/CodeGen/X86/fminnum.ll
+++ b/llvm/test/CodeGen/X86/fminnum.ll
@@ -277,7 +277,7 @@ define <8 x double> @test_intrinsic_fmin_v8f64(<8 x double> %x, <8 x double> %y)
   ret <8 x double> %z
 }
 
-; FIXME: The IR-level FMF should propagate to the node.
+; FIXME: The IR-level FMF should propagate to the node. With nnan, there's no need to blend.
 
 define float @minnum_intrinsic_nnan_fmf_f32(float %a, float %b) {
 ; SSE-LABEL: minnum_intrinsic_nnan_fmf_f32:
@@ -325,49 +325,33 @@ define <2 x double> @minnum_intrinsic_nnan_fmf_v2f64(<2 x double> %a, <2 x doubl
   ret <2 x double> %r
 }
 
-; FIXME: Current (but legacy someday): a function-level attribute should also enable the fold.
+; Current (but legacy someday): a function-level attribute should also enable the fold.
 
 define double @minnum_intrinsic_nnan_attr_f64(double %a, double %b) #0 {
 ; SSE-LABEL: minnum_intrinsic_nnan_attr_f64:
 ; SSE:       # %bb.0:
-; SSE-NEXT:    movapd %xmm0, %xmm2
-; SSE-NEXT:    cmpunordsd %xmm0, %xmm2
-; SSE-NEXT:    movapd %xmm2, %xmm3
-; SSE-NEXT:    andpd %xmm1, %xmm3
-; SSE-NEXT:    minsd %xmm0, %xmm1
-; SSE-NEXT:    andnpd %xmm1, %xmm2
-; SSE-NEXT:    orpd %xmm3, %xmm2
-; SSE-NEXT:    movapd %xmm2, %xmm0
+; SSE-NEXT:    minsd %xmm1, %xmm0
 ; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: minnum_intrinsic_nnan_attr_f64:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    vminsd %xmm0, %xmm1, %xmm2
-; AVX-NEXT:    vcmpunordsd %xmm0, %xmm0, %xmm0
-; AVX-NEXT:    vblendvpd %xmm0, %xmm1, %xmm2, %xmm0
+; AVX-NEXT:    vminsd %xmm1, %xmm0, %xmm0
 ; AVX-NEXT:    retq
   %r = tail call double @llvm.minnum.f64(double %a, double %b)
   ret double %r
 }
 
-; FIXME: Make sure vectors work too.
+; Make sure vectors work too.
 
 define <4 x float> @minnum_intrinsic_nnan_attr_v4f32(<4 x float> %a, <4 x float> %b) #0 {
 ; SSE-LABEL: minnum_intrinsic_nnan_attr_v4f32:
 ; SSE:       # %bb.0:
-; SSE-NEXT:    movaps %xmm1, %xmm2
-; SSE-NEXT:    minps %xmm0, %xmm2
-; SSE-NEXT:    cmpunordps %xmm0, %xmm0
-; SSE-NEXT:    andps %xmm0, %xmm1
-; SSE-NEXT:    andnps %xmm2, %xmm0
-; SSE-NEXT:    orps %xmm1, %xmm0
+; SSE-NEXT:    minps %xmm1, %xmm0
 ; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: minnum_intrinsic_nnan_attr_v4f32:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    vminps %xmm0, %xmm1, %xmm2
-; AVX-NEXT:    vcmpunordps %xmm0, %xmm0, %xmm0
-; AVX-NEXT:    vblendvps %xmm0, %xmm1, %xmm2, %xmm0
+; AVX-NEXT:    vminps %xmm1, %xmm0, %xmm0
 ; AVX-NEXT:    retq
   %r = tail call <4 x float> @llvm.minnum.v4f32(<4 x float> %a, <4 x float> %b)
   ret <4 x float> %r
-- 
2.7.4
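
A closing sketch, extrapolated from the type check in combineFMinNumFMaxNum
rather than from the hunks above: with AVX, v8f32 and v4f64 are also accepted,
so a 256-bit case with the same hypothetical no-NaNs attribute should
presumably collapse to a single ymm vminps as well.

; Hypothetical 256-bit example (assumes an AVX target and the same assumed
; "no-nans-fp-math" attribute group as the earlier sketch); expected to lower
; to a lone vminps ymm instruction rather than the cmpunord/blend sequence.
define <8 x float> @example_fmin_nnan_v8f32(<8 x float> %a, <8 x float> %b) #0 {
  %r = tail call <8 x float> @llvm.minnum.v8f32(<8 x float> %a, <8 x float> %b)
  ret <8 x float> %r
}

declare <8 x float> @llvm.minnum.v8f32(<8 x float>, <8 x float>)

attributes #0 = { "no-nans-fp-math"="true" }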