if (Subtarget.useSoftFloat())
return SDValue();
- // TODO: Check for global or instruction-level "nnan". In that case, we
- // should be able to lower to FMAX/FMIN alone.
// TODO: If an operand is already known to be a NaN or not a NaN, this
// should be an optional swap and FMAX/FMIN.
EVT VT = N->getValueType(0);
if (!((Subtarget.hasSSE1() && (VT == MVT::f32 || VT == MVT::v4f32)) ||
      (Subtarget.hasSSE2() && (VT == MVT::f64 || VT == MVT::v2f64)) ||
      (Subtarget.hasAVX() && (VT == MVT::v8f32 || VT == MVT::v4f64))))
return SDValue();
- // This takes at least 3 instructions, so favor a library call when operating
- // on a scalar and minimizing code size.
- if (!VT.isVector() && DAG.getMachineFunction().getFunction().optForMinSize())
- return SDValue();
-
SDValue Op0 = N->getOperand(0);
SDValue Op1 = N->getOperand(1);
SDLoc DL(N);
+ auto MinMaxOp = N->getOpcode() == ISD::FMAXNUM ? X86ISD::FMAX : X86ISD::FMIN;
+
+ // If we don't have to respect NaN inputs, this is a direct translation to x86
+ // min/max instructions.
+ if (DAG.getTarget().Options.NoNaNsFPMath || N->getFlags().hasNoNaNs())
+ return DAG.getNode(MinMaxOp, DL, VT, Op0, Op1, N->getFlags());
+
+ // If we have to respect NaN inputs, this takes at least 3 instructions.
+ // Favor a library call when operating on a scalar and minimizing code size.
+ if (!VT.isVector() && DAG.getMachineFunction().getFunction().optForMinSize())
+ return SDValue();
+
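
A scalar model of why the fast path above is sound, purely illustrative and not part of the patch: once NaNs are excluded, raw x86 max semantics coincide with llvm.maxnum.* (a +0.0/-0.0 tie may return either operand, which the intrinsic permits):

    // Sketch (assumption: illustration only, not from the patch). MAXSS
    // returns its first operand when the ordered '>' compare is true and its
    // second operand otherwise; with no NaN inputs, that is exactly fmaxnum.
    float fmaxnumNoNans(float Op0, float Op1) {
      return Op0 > Op1 ? Op0 : Op1;
    }
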
EVT SetCCType = DAG.getTargetLoweringInfo().getSetCCResultType(
DAG.getDataLayout(), *DAG.getContext(), VT);
// The x86 FP min/max instructions don't implement fmaxnum/fminnum semantics
// on their own, but we can still use those instructions for fmaxnum by
// selecting away a NaN input: if either operand is NaN, the 2nd source
// operand (Op0) is passed through.
- auto MinMaxOp = N->getOpcode() == ISD::FMAXNUM ? X86ISD::FMAX : X86ISD::FMIN;
SDValue MinOrMax = DAG.getNode(MinMaxOp, DL, VT, Op1, Op0);
- SDValue IsOp0Nan = DAG.getSetCC(DL, SetCCType , Op0, Op0, ISD::SETUO);
+ SDValue IsOp0Nan = DAG.getSetCC(DL, SetCCType, Op0, Op0, ISD::SETUO);
// If Op0 is a NaN, select Op1. Otherwise, select the max. If both operands
// are NaN, the NaN value of Op1 is the result.
return DAG.getSelect(DL, VT, IsOp0Nan, Op1, MinOrMax);
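
For reference, the same kind of scalar model for the NaN-aware path built above (again an illustration, not part of the patch); it mirrors the commuted max, the unordered self-compare, and the final blend that the test checks below look for:

    // Sketch (assumption: illustration only) of the 3-instruction sequence:
    float fmaxnumWithNans(float Op0, float Op1) {
      // FMAX(Op1, Op0): x86 max passes through its 2nd source operand when
      // either input is a NaN, so any NaN input yields Op0 here.
      float MinOrMax = Op1 > Op0 ? Op1 : Op0;
      // cmpunord: a NaN is the only value unordered with itself.
      bool IsOp0Nan = Op0 != Op0;
      // blendv: a NaN Op0 selects Op1 (so NaN/NaN returns Op1's NaN);
      // otherwise keep the max.
      return IsOp0Nan ? Op1 : MinOrMax;
    }
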
ret <8 x double> %z
}
-; FIXME: The IR-level FMF should propagate to the node.
+; FIXME: The IR-level FMF should propagate to the node. With nnan, there's no need to blend.
define double @maxnum_intrinsic_nnan_fmf_f64(double %a, double %b) {
; SSE-LABEL: maxnum_intrinsic_nnan_fmf_f64:
ret <4 x float> %r
}
-; FIXME: Current (but legacy someday): a function-level attribute should also enable the fold.
+; Current (but legacy someday): a function-level attribute should also enable the fold.
define float @maxnum_intrinsic_nnan_attr_f32(float %a, float %b) #0 {
; SSE-LABEL: maxnum_intrinsic_nnan_attr_f32:
; SSE: # %bb.0:
-; SSE-NEXT: movaps %xmm0, %xmm2
-; SSE-NEXT: cmpunordss %xmm0, %xmm2
-; SSE-NEXT: movaps %xmm2, %xmm3
-; SSE-NEXT: andps %xmm1, %xmm3
-; SSE-NEXT: maxss %xmm0, %xmm1
-; SSE-NEXT: andnps %xmm1, %xmm2
-; SSE-NEXT: orps %xmm3, %xmm2
-; SSE-NEXT: movaps %xmm2, %xmm0
+; SSE-NEXT: maxss %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: maxnum_intrinsic_nnan_attr_f32:
; AVX: # %bb.0:
-; AVX-NEXT: vmaxss %xmm0, %xmm1, %xmm2
-; AVX-NEXT: vcmpunordss %xmm0, %xmm0, %xmm0
-; AVX-NEXT: vblendvps %xmm0, %xmm1, %xmm2, %xmm0
+; AVX-NEXT: vmaxss %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
%r = tail call float @llvm.maxnum.f32(float %a, float %b)
ret float %r
}
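; Note (assumption, since the attribute group is elided from this excerpt):
; #0 presumably carries "no-nans-fp-math"="true", which TargetMachine turns
; into the TargetOptions.NoNaNsFPMath flag that the combine checks above.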
-; FIXME: Make sure vectors work too.
+; Make sure vectors work too.
define <2 x double> @maxnum_intrinsic_nnan_attr_f64(<2 x double> %a, <2 x double> %b) #0 {
; SSE-LABEL: maxnum_intrinsic_nnan_attr_f64:
; SSE: # %bb.0:
-; SSE-NEXT: movapd %xmm1, %xmm2
-; SSE-NEXT: maxpd %xmm0, %xmm2
-; SSE-NEXT: cmpunordpd %xmm0, %xmm0
-; SSE-NEXT: andpd %xmm0, %xmm1
-; SSE-NEXT: andnpd %xmm2, %xmm0
-; SSE-NEXT: orpd %xmm1, %xmm0
+; SSE-NEXT: maxpd %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: maxnum_intrinsic_nnan_attr_f64:
; AVX: # %bb.0:
-; AVX-NEXT: vmaxpd %xmm0, %xmm1, %xmm2
-; AVX-NEXT: vcmpunordpd %xmm0, %xmm0, %xmm0
-; AVX-NEXT: vblendvpd %xmm0, %xmm1, %xmm2, %xmm0
+; AVX-NEXT: vmaxpd %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
%r = tail call <2 x double> @llvm.maxnum.v2f64(<2 x double> %a, <2 x double> %b)
ret <2 x double> %r
}
ret <8 x double> %z
}
-; FIXME: The IR-level FMF should propagate to the node.
+; FIXME: The IR-level FMF should propagate to the node. With nnan, there's no need to blend.
define float @minnum_intrinsic_nnan_fmf_f32(float %a, float %b) {
; SSE-LABEL: minnum_intrinsic_nnan_fmf_f32:
ret <2 x double> %r
}
-; FIXME: Current (but legacy someday): a function-level attribute should also enable the fold.
+; Current (but legacy someday): a function-level attribute should also enable the fold.
define double @minnum_intrinsic_nnan_attr_f64(double %a, double %b) #0 {
; SSE-LABEL: minnum_intrinsic_nnan_attr_f64:
; SSE: # %bb.0:
-; SSE-NEXT: movapd %xmm0, %xmm2
-; SSE-NEXT: cmpunordsd %xmm0, %xmm2
-; SSE-NEXT: movapd %xmm2, %xmm3
-; SSE-NEXT: andpd %xmm1, %xmm3
-; SSE-NEXT: minsd %xmm0, %xmm1
-; SSE-NEXT: andnpd %xmm1, %xmm2
-; SSE-NEXT: orpd %xmm3, %xmm2
-; SSE-NEXT: movapd %xmm2, %xmm0
+; SSE-NEXT: minsd %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: minnum_intrinsic_nnan_attr_f64:
; AVX: # %bb.0:
-; AVX-NEXT: vminsd %xmm0, %xmm1, %xmm2
-; AVX-NEXT: vcmpunordsd %xmm0, %xmm0, %xmm0
-; AVX-NEXT: vblendvpd %xmm0, %xmm1, %xmm2, %xmm0
+; AVX-NEXT: vminsd %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
%r = tail call double @llvm.minnum.f64(double %a, double %b)
ret double %r
}
-; FIXME: Make sure vectors work too.
+; Make sure vectors work too.
define <4 x float> @minnum_intrinsic_nnan_attr_v4f32(<4 x float> %a, <4 x float> %b) #0 {
; SSE-LABEL: minnum_intrinsic_nnan_attr_v4f32:
; SSE: # %bb.0:
-; SSE-NEXT: movaps %xmm1, %xmm2
-; SSE-NEXT: minps %xmm0, %xmm2
-; SSE-NEXT: cmpunordps %xmm0, %xmm0
-; SSE-NEXT: andps %xmm0, %xmm1
-; SSE-NEXT: andnps %xmm2, %xmm0
-; SSE-NEXT: orps %xmm1, %xmm0
+; SSE-NEXT: minps %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: minnum_intrinsic_nnan_attr_v4f32:
; AVX: # %bb.0:
-; AVX-NEXT: vminps %xmm0, %xmm1, %xmm2
-; AVX-NEXT: vcmpunordps %xmm0, %xmm0, %xmm0
-; AVX-NEXT: vblendvps %xmm0, %xmm1, %xmm2, %xmm0
+; AVX-NEXT: vminps %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
%r = tail call <4 x float> @llvm.minnum.v4f32(<4 x float> %a, <4 x float> %b)
ret <4 x float> %r