if (Subtarget.useSoftFloat())
return SDValue();
- // TODO: Check for global or instruction-level "nnan". In that case, we
- // should be able to lower to FMAX/FMIN alone.
// TODO: If an operand is already known to be a NaN or not a NaN, this
// should be an optional swap and FMAX/FMIN.
EVT VT = N->getValueType(0);
if (!((Subtarget.hasSSE1() && (VT == MVT::f32 || VT == MVT::v4f32)) ||
      (Subtarget.hasSSE2() && (VT == MVT::f64 || VT == MVT::v2f64)) ||
      (Subtarget.hasAVX() && (VT == MVT::v8f32 || VT == MVT::v4f64))))
return SDValue();
- // This takes at least 3 instructions, so favor a library call when operating
- // on a scalar and minimizing code size.
- if (!VT.isVector() && DAG.getMachineFunction().getFunction().optForMinSize())
- return SDValue();
-
SDValue Op0 = N->getOperand(0);
SDValue Op1 = N->getOperand(1);
SDLoc DL(N);
+ auto MinMaxOp = N->getOpcode() == ISD::FMAXNUM ? X86ISD::FMAX : X86ISD::FMIN;
+
+ // If we don't have to respect NaN inputs, this is a direct translation to x86
+ // min/max instructions.
+ if (DAG.getTarget().Options.NoNaNsFPMath || N->getFlags().hasNoNaNs())
+ return DAG.getNode(MinMaxOp, DL, VT, Op0, Op1, N->getFlags());
+
+ // If we have to respect NaN inputs, this takes at least 3 instructions.
+ // Favor a library call when operating on a scalar and minimizing code size.
+ if (!VT.isVector() && DAG.getMachineFunction().getFunction().optForMinSize())
+ return SDValue();
+
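
A scalar model of why the fast path above is sound, purely illustrative and not part of the patch: once NaNs are excluded, raw x86 max semantics coincide with llvm.maxnum.* (a +0.0/-0.0 tie may return either operand, which the intrinsic permits):

    // Sketch (assumption: illustration only, not from the patch). MAXSS
    // returns its first operand when the ordered '>' compare is true and its
    // second operand otherwise; with no NaN inputs, that is exactly fmaxnum.
    float fmaxnumNoNans(float Op0, float Op1) {
      return Op0 > Op1 ? Op0 : Op1;
    }
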
EVT SetCCType = DAG.getTargetLoweringInfo().getSetCCResultType(
DAG.getDataLayout(), *DAG.getContext(), VT);
// The x86 FP min/max instructions don't implement fmaxnum/fminnum semantics
// on their own, but we can still use those instructions for fmaxnum by
// selecting away a NaN input: if either operand is NaN, the 2nd source
// operand (Op0) is passed through.
- auto MinMaxOp = N->getOpcode() == ISD::FMAXNUM ? X86ISD::FMAX : X86ISD::FMIN;
SDValue MinOrMax = DAG.getNode(MinMaxOp, DL, VT, Op1, Op0);
- SDValue IsOp0Nan = DAG.getSetCC(DL, SetCCType , Op0, Op0, ISD::SETUO);
+ SDValue IsOp0Nan = DAG.getSetCC(DL, SetCCType, Op0, Op0, ISD::SETUO);
// If Op0 is a NaN, select Op1. Otherwise, select the max. If both operands
// are NaN, the NaN value of Op1 is the result.
return DAG.getSelect(DL, VT, IsOp0Nan, Op1, MinOrMax);
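
For reference, the same kind of scalar model for the NaN-aware path built above (again an illustration, not part of the patch); it mirrors the commuted max, the unordered self-compare, and the final blend that the test checks below look for:

    // Sketch (assumption: illustration only) of the 3-instruction sequence:
    float fmaxnumWithNans(float Op0, float Op1) {
      // FMAX(Op1, Op0): x86 max passes through its 2nd source operand when
      // either input is a NaN, so any NaN input yields Op0 here.
      float MinOrMax = Op1 > Op0 ? Op1 : Op0;
      // cmpunord: a NaN is the only value unordered with itself.
      bool IsOp0Nan = Op0 != Op0;
      // blendv: a NaN Op0 selects Op1 (so NaN/NaN returns Op1's NaN);
      // otherwise keep the max.
      return IsOp0Nan ? Op1 : MinOrMax;
    }
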
ret <8 x double> %z
}
-; FIXME: The IR-level FMF should propagate to the node.
+; FIXME: The IR-level FMF should propagate to the node. With nnan, there's no need to blend.
define double @maxnum_intrinsic_nnan_fmf_f64(double %a, double %b) {
; SSE-LABEL: maxnum_intrinsic_nnan_fmf_f64:
ret <4 x float> %r
}
-; FIXME: Current (but legacy someday): a function-level attribute should also enable the fold.
+; Current (but legacy someday): a function-level attribute should also enable the fold.
define float @maxnum_intrinsic_nnan_attr_f32(float %a, float %b) #0 {
; SSE-LABEL: maxnum_intrinsic_nnan_attr_f32:
; SSE: # %bb.0:
-; SSE-NEXT: movaps %xmm0, %xmm2
-; SSE-NEXT: cmpunordss %xmm0, %xmm2
-; SSE-NEXT: movaps %xmm2, %xmm3
-; SSE-NEXT: andps %xmm1, %xmm3
-; SSE-NEXT: maxss %xmm0, %xmm1
-; SSE-NEXT: andnps %xmm1, %xmm2
-; SSE-NEXT: orps %xmm3, %xmm2
-; SSE-NEXT: movaps %xmm2, %xmm0
+; SSE-NEXT: maxss %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: maxnum_intrinsic_nnan_attr_f32:
; AVX: # %bb.0:
-; AVX-NEXT: vmaxss %xmm0, %xmm1, %xmm2
-; AVX-NEXT: vcmpunordss %xmm0, %xmm0, %xmm0
-; AVX-NEXT: vblendvps %xmm0, %xmm1, %xmm2, %xmm0
+; AVX-NEXT: vmaxss %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
%r = tail call float @llvm.maxnum.f32(float %a, float %b)
ret float %r
}
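; Note (assumption, since the attribute group is elided from this excerpt):
; #0 presumably carries "no-nans-fp-math"="true", which TargetMachine turns
; into the TargetOptions.NoNaNsFPMath flag that the combine checks above.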
-; FIXME: Make sure vectors work too.
+; Make sure vectors work too.
define <2 x double> @maxnum_intrinsic_nnan_attr_f64(<2 x double> %a, <2 x double> %b) #0 {
; SSE-LABEL: maxnum_intrinsic_nnan_attr_f64:
; SSE: # %bb.0:
-; SSE-NEXT: movapd %xmm1, %xmm2
-; SSE-NEXT: maxpd %xmm0, %xmm2
-; SSE-NEXT: cmpunordpd %xmm0, %xmm0
-; SSE-NEXT: andpd %xmm0, %xmm1
-; SSE-NEXT: andnpd %xmm2, %xmm0
-; SSE-NEXT: orpd %xmm1, %xmm0
+; SSE-NEXT: maxpd %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: maxnum_intrinsic_nnan_attr_f64:
; AVX: # %bb.0:
-; AVX-NEXT: vmaxpd %xmm0, %xmm1, %xmm2
-; AVX-NEXT: vcmpunordpd %xmm0, %xmm0, %xmm0
-; AVX-NEXT: vblendvpd %xmm0, %xmm1, %xmm2, %xmm0
+; AVX-NEXT: vmaxpd %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
%r = tail call <2 x double> @llvm.maxnum.v2f64(<2 x double> %a, <2 x double> %b)
ret <2 x double> %r
}
ret <8 x double> %z
}
-; FIXME: The IR-level FMF should propagate to the node.
+; FIXME: The IR-level FMF should propagate to the node. With nnan, there's no need to blend.
define float @minnum_intrinsic_nnan_fmf_f32(float %a, float %b) {
; SSE-LABEL: minnum_intrinsic_nnan_fmf_f32:
ret <2 x double> %r
}
-; FIXME: Current (but legacy someday): a function-level attribute should also enable the fold.
+; Current (but legacy someday): a function-level attribute should also enable the fold.
define double @minnum_intrinsic_nnan_attr_f64(double %a, double %b) #0 {
; SSE-LABEL: minnum_intrinsic_nnan_attr_f64:
; SSE: # %bb.0:
-; SSE-NEXT: movapd %xmm0, %xmm2
-; SSE-NEXT: cmpunordsd %xmm0, %xmm2
-; SSE-NEXT: movapd %xmm2, %xmm3
-; SSE-NEXT: andpd %xmm1, %xmm3
-; SSE-NEXT: minsd %xmm0, %xmm1
-; SSE-NEXT: andnpd %xmm1, %xmm2
-; SSE-NEXT: orpd %xmm3, %xmm2
-; SSE-NEXT: movapd %xmm2, %xmm0
+; SSE-NEXT: minsd %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: minnum_intrinsic_nnan_attr_f64:
; AVX: # %bb.0:
-; AVX-NEXT: vminsd %xmm0, %xmm1, %xmm2
-; AVX-NEXT: vcmpunordsd %xmm0, %xmm0, %xmm0
-; AVX-NEXT: vblendvpd %xmm0, %xmm1, %xmm2, %xmm0
+; AVX-NEXT: vminsd %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
%r = tail call double @llvm.minnum.f64(double %a, double %b)
ret double %r
}
-; FIXME: Make sure vectors work too.
+; Make sure vectors work too.
define <4 x float> @minnum_intrinsic_nnan_attr_v4f32(<4 x float> %a, <4 x float> %b) #0 {
; SSE-LABEL: minnum_intrinsic_nnan_attr_v4f32:
; SSE: # %bb.0:
-; SSE-NEXT: movaps %xmm1, %xmm2
-; SSE-NEXT: minps %xmm0, %xmm2
-; SSE-NEXT: cmpunordps %xmm0, %xmm0
-; SSE-NEXT: andps %xmm0, %xmm1
-; SSE-NEXT: andnps %xmm2, %xmm0
-; SSE-NEXT: orps %xmm1, %xmm0
+; SSE-NEXT: minps %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: minnum_intrinsic_nnan_attr_v4f32:
; AVX: # %bb.0:
-; AVX-NEXT: vminps %xmm0, %xmm1, %xmm2
-; AVX-NEXT: vcmpunordps %xmm0, %xmm0, %xmm0
-; AVX-NEXT: vblendvps %xmm0, %xmm1, %xmm2, %xmm0
+; AVX-NEXT: vminps %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
%r = tail call <4 x float> @llvm.minnum.v4f32(<4 x float> %a, <4 x float> %b)
ret <4 x float> %r