From b4e7893ba8db8ceead3ae1c113598e5a107deaa2 Mon Sep 17 00:00:00 2001
From: Sanjay Patel
Date: Thu, 10 May 2018 15:40:49 +0000
Subject: [PATCH] [x86] fix fmaxnum/fminnum with nnan

With nnan, there's no need for the masked merge / blend sequence (that
probably costs much more than the min/max instruction).

Somewhere between clang 5.0 and 6.0, we started producing these intrinsics
for fmax()/fmin() in C source instead of libcalls or fcmp/select. The
backend wasn't prepared for that, so we regressed perf in those cases.

Note: it's possible that other targets have similar problems as seen here.

Noticed while investigating PR37403 and related bugs:
https://bugs.llvm.org/show_bug.cgi?id=37403

The IR FMF propagation cases still don't work. There's a proposal that
might fix those cases in D46563.

llvm-svn: 331992
---
 llvm/lib/Target/X86/X86ISelLowering.cpp | 22 +++++++++++++---------
 llvm/test/CodeGen/X86/fmaxnum.ll        | 30 +++++++-----------------------
 llvm/test/CodeGen/X86/fminnum.ll        | 30 +++++++-----------------------
 3 files changed, 27 insertions(+), 55 deletions(-)

diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 9825208..1c72ab7 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -36364,8 +36364,6 @@ static SDValue combineFMinNumFMaxNum(SDNode *N, SelectionDAG &DAG,
   if (Subtarget.useSoftFloat())
     return SDValue();
 
-  // TODO: Check for global or instruction-level "nnan". In that case, we
-  //       should be able to lower to FMAX/FMIN alone.
   // TODO: If an operand is already known to be a NaN or not a NaN, this
   //       should be an optional swap and FMAX/FMIN.
 
@@ -36375,14 +36373,21 @@
         (Subtarget.hasAVX() && (VT == MVT::v8f32 || VT == MVT::v4f64))))
     return SDValue();
 
-  // This takes at least 3 instructions, so favor a library call when operating
-  // on a scalar and minimizing code size.
-  if (!VT.isVector() && DAG.getMachineFunction().getFunction().optForMinSize())
-    return SDValue();
-
   SDValue Op0 = N->getOperand(0);
   SDValue Op1 = N->getOperand(1);
   SDLoc DL(N);
+  auto MinMaxOp = N->getOpcode() == ISD::FMAXNUM ? X86ISD::FMAX : X86ISD::FMIN;
+
+  // If we don't have to respect NaN inputs, this is a direct translation to x86
+  // min/max instructions.
+  if (DAG.getTarget().Options.NoNaNsFPMath || N->getFlags().hasNoNaNs())
+    return DAG.getNode(MinMaxOp, DL, VT, Op0, Op1, N->getFlags());
+
+  // If we have to respect NaN inputs, this takes at least 3 instructions.
+  // Favor a library call when operating on a scalar and minimizing code size.
+  if (!VT.isVector() && DAG.getMachineFunction().getFunction().optForMinSize())
+    return SDValue();
+
   EVT SetCCType = DAG.getTargetLoweringInfo().getSetCCResultType(
       DAG.getDataLayout(), *DAG.getContext(), VT);
 
@@ -36405,9 +36410,8 @@
   // use those instructions for fmaxnum by selecting away a NaN input.
 
   // If either operand is NaN, the 2nd source operand (Op0) is passed through.
-  auto MinMaxOp = N->getOpcode() == ISD::FMAXNUM ? X86ISD::FMAX : X86ISD::FMIN;
   SDValue MinOrMax = DAG.getNode(MinMaxOp, DL, VT, Op1, Op0);
-  SDValue IsOp0Nan = DAG.getSetCC(DL, SetCCType , Op0, Op0, ISD::SETUO);
+  SDValue IsOp0Nan = DAG.getSetCC(DL, SetCCType, Op0, Op0, ISD::SETUO);
 
   // If Op0 is a NaN, select Op1. Otherwise, select the max. If both operands
   // are NaN, the NaN value of Op1 is the result.
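For context, a minimal IR sketch of the case the new code handles (not taken
from this patch; the function name is made up, and the attribute spelling is
assumed to be the usual "no-nans-fp-math" string attribute that maps to
TargetOptions.NoNaNsFPMath). With that attribute, the combine above should
lower the maxnum call straight to a single (v)maxss with no cmpunord/blend.
An instruction-level nnan flag would take the same path via
N->getFlags().hasNoNaNs(), but as the commit message notes, that FMF does not
propagate to the node yet.

; Hypothetical example, expected to compile to a lone (v)maxss:
define float @example_fmax_nnan(float %a, float %b) #0 {
  %r = tail call float @llvm.maxnum.f32(float %a, float %b)
  ret float %r
}

declare float @llvm.maxnum.f32(float, float)

attributes #0 = { "no-nans-fp-math"="true" }
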
diff --git a/llvm/test/CodeGen/X86/fmaxnum.ll b/llvm/test/CodeGen/X86/fmaxnum.ll
index c6c9fa7..1187609 100644
--- a/llvm/test/CodeGen/X86/fmaxnum.ll
+++ b/llvm/test/CodeGen/X86/fmaxnum.ll
@@ -285,7 +285,7 @@ define <8 x double> @test_intrinsic_fmax_v8f64(<8 x double> %x, <8 x double> %y)
   ret <8 x double> %z
 }
 
-; FIXME: The IR-level FMF should propagate to the node.
+; FIXME: The IR-level FMF should propagate to the node. With nnan, there's no need to blend.
 
 define double @maxnum_intrinsic_nnan_fmf_f64(double %a, double %b) {
 ; SSE-LABEL: maxnum_intrinsic_nnan_fmf_f64:
@@ -333,49 +333,33 @@ define <4 x float> @maxnum_intrinsic_nnan_fmf_f432(<4 x float> %a, <4 x float> %
   ret <4 x float> %r
 }
 
-; FIXME: Current (but legacy someday): a function-level attribute should also enable the fold.
+; Current (but legacy someday): a function-level attribute should also enable the fold.
 
 define float @maxnum_intrinsic_nnan_attr_f32(float %a, float %b) #0 {
 ; SSE-LABEL: maxnum_intrinsic_nnan_attr_f32:
 ; SSE:       # %bb.0:
-; SSE-NEXT:    movaps %xmm0, %xmm2
-; SSE-NEXT:    cmpunordss %xmm0, %xmm2
-; SSE-NEXT:    movaps %xmm2, %xmm3
-; SSE-NEXT:    andps %xmm1, %xmm3
-; SSE-NEXT:    maxss %xmm0, %xmm1
-; SSE-NEXT:    andnps %xmm1, %xmm2
-; SSE-NEXT:    orps %xmm3, %xmm2
-; SSE-NEXT:    movaps %xmm2, %xmm0
+; SSE-NEXT:    maxss %xmm1, %xmm0
 ; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: maxnum_intrinsic_nnan_attr_f32:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    vmaxss %xmm0, %xmm1, %xmm2
-; AVX-NEXT:    vcmpunordss %xmm0, %xmm0, %xmm0
-; AVX-NEXT:    vblendvps %xmm0, %xmm1, %xmm2, %xmm0
+; AVX-NEXT:    vmaxss %xmm1, %xmm0, %xmm0
 ; AVX-NEXT:    retq
   %r = tail call float @llvm.maxnum.f32(float %a, float %b)
   ret float %r
 }
 
-; FIXME: Make sure vectors work too.
+; Make sure vectors work too.
 
 define <2 x double> @maxnum_intrinsic_nnan_attr_f64(<2 x double> %a, <2 x double> %b) #0 {
 ; SSE-LABEL: maxnum_intrinsic_nnan_attr_f64:
 ; SSE:       # %bb.0:
-; SSE-NEXT:    movapd %xmm1, %xmm2
-; SSE-NEXT:    maxpd %xmm0, %xmm2
-; SSE-NEXT:    cmpunordpd %xmm0, %xmm0
-; SSE-NEXT:    andpd %xmm0, %xmm1
-; SSE-NEXT:    andnpd %xmm2, %xmm0
-; SSE-NEXT:    orpd %xmm1, %xmm0
+; SSE-NEXT:    maxpd %xmm1, %xmm0
 ; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: maxnum_intrinsic_nnan_attr_f64:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    vmaxpd %xmm0, %xmm1, %xmm2
-; AVX-NEXT:    vcmpunordpd %xmm0, %xmm0, %xmm0
-; AVX-NEXT:    vblendvpd %xmm0, %xmm1, %xmm2, %xmm0
+; AVX-NEXT:    vmaxpd %xmm1, %xmm0, %xmm0
 ; AVX-NEXT:    retq
   %r = tail call <2 x double> @llvm.maxnum.v2f64(<2 x double> %a, <2 x double> %b)
   ret <2 x double> %r
diff --git a/llvm/test/CodeGen/X86/fminnum.ll b/llvm/test/CodeGen/X86/fminnum.ll
index f520503..a81e35c 100644
--- a/llvm/test/CodeGen/X86/fminnum.ll
+++ b/llvm/test/CodeGen/X86/fminnum.ll
@@ -277,7 +277,7 @@ define <8 x double> @test_intrinsic_fmin_v8f64(<8 x double> %x, <8 x double> %y)
   ret <8 x double> %z
 }
 
-; FIXME: The IR-level FMF should propagate to the node.
+; FIXME: The IR-level FMF should propagate to the node. With nnan, there's no need to blend.
 
 define float @minnum_intrinsic_nnan_fmf_f32(float %a, float %b) {
 ; SSE-LABEL: minnum_intrinsic_nnan_fmf_f32:
@@ -325,49 +325,33 @@ define <2 x double> @minnum_intrinsic_nnan_fmf_v2f64(<2 x double> %a, <2 x doubl
   ret <2 x double> %r
 }
 
-; FIXME: Current (but legacy someday): a function-level attribute should also enable the fold.
+; Current (but legacy someday): a function-level attribute should also enable the fold.
 
 define double @minnum_intrinsic_nnan_attr_f64(double %a, double %b) #0 {
 ; SSE-LABEL: minnum_intrinsic_nnan_attr_f64:
 ; SSE:       # %bb.0:
-; SSE-NEXT:    movapd %xmm0, %xmm2
-; SSE-NEXT:    cmpunordsd %xmm0, %xmm2
-; SSE-NEXT:    movapd %xmm2, %xmm3
-; SSE-NEXT:    andpd %xmm1, %xmm3
-; SSE-NEXT:    minsd %xmm0, %xmm1
-; SSE-NEXT:    andnpd %xmm1, %xmm2
-; SSE-NEXT:    orpd %xmm3, %xmm2
-; SSE-NEXT:    movapd %xmm2, %xmm0
+; SSE-NEXT:    minsd %xmm1, %xmm0
 ; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: minnum_intrinsic_nnan_attr_f64:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    vminsd %xmm0, %xmm1, %xmm2
-; AVX-NEXT:    vcmpunordsd %xmm0, %xmm0, %xmm0
-; AVX-NEXT:    vblendvpd %xmm0, %xmm1, %xmm2, %xmm0
+; AVX-NEXT:    vminsd %xmm1, %xmm0, %xmm0
 ; AVX-NEXT:    retq
   %r = tail call double @llvm.minnum.f64(double %a, double %b)
   ret double %r
 }
 
-; FIXME: Make sure vectors work too.
+; Make sure vectors work too.
 
 define <4 x float> @minnum_intrinsic_nnan_attr_v4f32(<4 x float> %a, <4 x float> %b) #0 {
 ; SSE-LABEL: minnum_intrinsic_nnan_attr_v4f32:
 ; SSE:       # %bb.0:
-; SSE-NEXT:    movaps %xmm1, %xmm2
-; SSE-NEXT:    minps %xmm0, %xmm2
-; SSE-NEXT:    cmpunordps %xmm0, %xmm0
-; SSE-NEXT:    andps %xmm0, %xmm1
-; SSE-NEXT:    andnps %xmm2, %xmm0
-; SSE-NEXT:    orps %xmm1, %xmm0
+; SSE-NEXT:    minps %xmm1, %xmm0
 ; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: minnum_intrinsic_nnan_attr_v4f32:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    vminps %xmm0, %xmm1, %xmm2
-; AVX-NEXT:    vcmpunordps %xmm0, %xmm0, %xmm0
-; AVX-NEXT:    vblendvps %xmm0, %xmm1, %xmm2, %xmm0
+; AVX-NEXT:    vminps %xmm1, %xmm0, %xmm0
 ; AVX-NEXT:    retq
   %r = tail call <4 x float> @llvm.minnum.v4f32(<4 x float> %a, <4 x float> %b)
   ret <4 x float> %r
-- 
2.7.4
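
A closing sketch, extrapolated from the type check in combineFMinNumFMaxNum
rather than from the hunks above: with AVX, v8f32 and v4f64 are also accepted,
so a 256-bit case with the same hypothetical no-NaNs attribute should
presumably collapse to a single ymm vminps as well.

; Hypothetical 256-bit example (assumes an AVX target and the same assumed
; "no-nans-fp-math" attribute group as the earlier sketch); expected to lower
; to a lone vminps ymm instruction rather than the cmpunord/blend sequence.
define <8 x float> @example_fmin_nnan_v8f32(<8 x float> %a, <8 x float> %b) #0 {
  %r = tail call <8 x float> @llvm.minnum.v8f32(<8 x float> %a, <8 x float> %b)
  ret <8 x float> %r
}

declare <8 x float> @llvm.minnum.v8f32(<8 x float>, <8 x float>)

attributes #0 = { "no-nans-fp-math"="true" }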