From c28e09c8d1ca3a82467e71af89f1ac7b71458fca Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Sun, 16 Jul 2023 14:51:17 -0400 Subject: [PATCH] AMDGPU: Preserve flags in fdiv_fast lowering We were dropping the flags and thus blocking contraction into potential fadd users. GlobalISel was already preserving the flags here. https://reviews.llvm.org/D155443 --- llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 14 +++++++------- llvm/test/CodeGen/AMDGPU/fdiv.ll | 6 ++---- 2 files changed, 9 insertions(+), 11 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index fe20b83..d9ac5b1 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -9271,11 +9271,12 @@ SDValue SITargetLowering::LowerFDIV16(SDValue Op, SelectionDAG &DAG) const { // Faster 2.5 ULP division that does not support denormals. SDValue SITargetLowering::lowerFDIV_FAST(SDValue Op, SelectionDAG &DAG) const { + SDNodeFlags Flags = Op->getFlags(); SDLoc SL(Op); SDValue LHS = Op.getOperand(1); SDValue RHS = Op.getOperand(2); - SDValue r1 = DAG.getNode(ISD::FABS, SL, MVT::f32, RHS); + SDValue r1 = DAG.getNode(ISD::FABS, SL, MVT::f32, RHS, Flags); const APFloat K0Val(0x1p+96f); const SDValue K0 = DAG.getConstantFP(K0Val, SL, MVT::f32); @@ -9290,17 +9291,16 @@ SDValue SITargetLowering::lowerFDIV_FAST(SDValue Op, SelectionDAG &DAG) const { SDValue r2 = DAG.getSetCC(SL, SetCCVT, r1, K0, ISD::SETOGT); - SDValue r3 = DAG.getNode(ISD::SELECT, SL, MVT::f32, r2, K1, One); + SDValue r3 = DAG.getNode(ISD::SELECT, SL, MVT::f32, r2, K1, One, Flags); - // TODO: Should this propagate fast-math-flags? - r1 = DAG.getNode(ISD::FMUL, SL, MVT::f32, RHS, r3); + r1 = DAG.getNode(ISD::FMUL, SL, MVT::f32, RHS, r3, Flags); // rcp does not support denormals. 
- SDValue r0 = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32, r1); + SDValue r0 = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32, r1, Flags); - SDValue Mul = DAG.getNode(ISD::FMUL, SL, MVT::f32, LHS, r0); + SDValue Mul = DAG.getNode(ISD::FMUL, SL, MVT::f32, LHS, r0, Flags); - return DAG.getNode(ISD::FMUL, SL, MVT::f32, r3, Mul); + return DAG.getNode(ISD::FMUL, SL, MVT::f32, r3, Mul, Flags); } // Returns immediate value for setting the F32 denorm mode when using the diff --git a/llvm/test/CodeGen/AMDGPU/fdiv.ll b/llvm/test/CodeGen/AMDGPU/fdiv.ll index 0a126fc..d6a2808 100644 --- a/llvm/test/CodeGen/AMDGPU/fdiv.ll +++ b/llvm/test/CodeGen/AMDGPU/fdiv.ll @@ -3828,8 +3828,7 @@ define float @v_fdiv_f32_daz_25ulp_contractable_user(float %x, float %y, float % ; GFX11-NEXT: v_rcp_f32_e32 v1, v1 ; GFX11-NEXT: s_waitcnt_depctr 0xfff ; GFX11-NEXT: v_mul_f32_e32 v0, v0, v1 -; GFX11-NEXT: v_mul_f32_e32 v0, v3, v0 -; GFX11-NEXT: v_add_f32_e32 v0, v0, v2 +; GFX11-NEXT: v_fma_f32 v0, v3, v0, v2 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; EG-LABEL: v_fdiv_f32_daz_25ulp_contractable_user: @@ -5087,8 +5086,7 @@ define float @v_fdiv_f32_daz_25ulp__nnan_ninf_contractable_user(float %x, float ; GFX11-NEXT: v_rcp_f32_e32 v1, v1 ; GFX11-NEXT: s_waitcnt_depctr 0xfff ; GFX11-NEXT: v_mul_f32_e32 v0, v0, v1 -; GFX11-NEXT: v_mul_f32_e32 v0, v3, v0 -; GFX11-NEXT: v_add_f32_e32 v0, v0, v2 +; GFX11-NEXT: v_fma_f32 v0, v3, v0, v2 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; EG-LABEL: v_fdiv_f32_daz_25ulp__nnan_ninf_contractable_user: -- 2.7.4