From 8ec8ad868d9b970245e827b14306fbd11d11a9b2 Mon Sep 17 00:00:00 2001 From: Jay Foad Date: Fri, 24 Jul 2020 11:41:57 +0100 Subject: [PATCH] [AMDGPU] Use fma for lowering frem This gives shorter f64 code and perhaps better accuracy. Differential Revision: https://reviews.llvm.org/D84516 --- llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp | 10 ++-- llvm/test/CodeGen/AMDGPU/frem.ll | 84 ++++++++++++--------------- 2 files changed, 41 insertions(+), 53 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp index 1f5d83d..a697df55 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp @@ -2079,7 +2079,7 @@ SDValue AMDGPUTargetLowering::LowerSDIVREM(SDValue Op, return DAG.getMergeValues(Res, DL); } -// (frem x, y) -> (fsub x, (fmul (ftrunc (fdiv x, y)), y)) +// (frem x, y) -> (fma (fneg (ftrunc (fdiv x, y))), y, x) SDValue AMDGPUTargetLowering::LowerFREM(SDValue Op, SelectionDAG &DAG) const { SDLoc SL(Op); EVT VT = Op.getValueType(); @@ -2089,10 +2089,10 @@ SDValue AMDGPUTargetLowering::LowerFREM(SDValue Op, SelectionDAG &DAG) const { // TODO: Should this propagate fast-math-flags? SDValue Div = DAG.getNode(ISD::FDIV, SL, VT, X, Y); - SDValue Floor = DAG.getNode(ISD::FTRUNC, SL, VT, Div); - SDValue Mul = DAG.getNode(ISD::FMUL, SL, VT, Floor, Y); - - return DAG.getNode(ISD::FSUB, SL, VT, X, Mul); + SDValue Trunc = DAG.getNode(ISD::FTRUNC, SL, VT, Div); + SDValue Neg = DAG.getNode(ISD::FNEG, SL, VT, Trunc); + // TODO: For f32 use FMAD instead if !hasFastFMA32? + return DAG.getNode(ISD::FMA, SL, VT, Neg, Y, X); } SDValue AMDGPUTargetLowering::LowerFCEIL(SDValue Op, SelectionDAG &DAG) const { diff --git a/llvm/test/CodeGen/AMDGPU/frem.ll b/llvm/test/CodeGen/AMDGPU/frem.ll index aef979f..0414384 100644 --- a/llvm/test/CodeGen/AMDGPU/frem.ll +++ b/llvm/test/CodeGen/AMDGPU/frem.ll @@ -36,7 +36,7 @@ define amdgpu_kernel void @frem_f32(float addrspace(1)* %out, float addrspace(1) ; SI-NEXT: v_div_fmas_f32 v2, v2, v4, v5 ; SI-NEXT: v_div_fixup_f32 v2, v2, v1, v0 ; SI-NEXT: v_trunc_f32_e32 v2, v2 -; SI-NEXT: v_mad_f32 v0, -v2, v1, v0 +; SI-NEXT: v_fma_f32 v0, -v2, v1, v0 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; @@ -72,7 +72,7 @@ define amdgpu_kernel void @frem_f32(float addrspace(1)* %out, float addrspace(1) ; CI-NEXT: v_div_fmas_f32 v2, v2, v4, v5 ; CI-NEXT: v_div_fixup_f32 v2, v2, v1, v0 ; CI-NEXT: v_trunc_f32_e32 v2, v2 -; CI-NEXT: v_mad_f32 v0, -v2, v1, v0 +; CI-NEXT: v_fma_f32 v0, -v2, v1, v0 ; CI-NEXT: buffer_store_dword v0, off, s[8:11], 0 ; CI-NEXT: s_endpgm ; @@ -106,7 +106,7 @@ define amdgpu_kernel void @frem_f32(float addrspace(1)* %out, float addrspace(1) ; VI-NEXT: v_div_fmas_f32 v3, v3, v6, v7 ; VI-NEXT: v_div_fixup_f32 v3, v3, v2, v4 ; VI-NEXT: v_trunc_f32_e32 v3, v3 -; VI-NEXT: v_mad_f32 v2, -v3, v2, v4 +; VI-NEXT: v_fma_f32 v2, -v3, v2, v4 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm float addrspace(1)* %in2) #0 { @@ -140,7 +140,7 @@ define amdgpu_kernel void @unsafe_frem_f32(float addrspace(1)* %out, float addrs ; SI-NEXT: v_rcp_f32_e32 v2, v1 ; SI-NEXT: v_mul_f32_e32 v2, v0, v2 ; SI-NEXT: v_trunc_f32_e32 v2, v2 -; SI-NEXT: v_mad_f32 v0, -v2, v1, v0 +; SI-NEXT: v_fma_f32 v0, -v2, v1, v0 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; @@ -165,7 +165,7 @@ define amdgpu_kernel void @unsafe_frem_f32(float addrspace(1)* %out, float addrs ; CI-NEXT: v_rcp_f32_e32 v2, v1 ; CI-NEXT: v_mul_f32_e32 v2, v0, v2 ; CI-NEXT: v_trunc_f32_e32 v2, v2 -; CI-NEXT: v_mad_f32 v0, -v2, v1, v0 +; CI-NEXT: v_fma_f32 v0, -v2, v1, v0 ; CI-NEXT: buffer_store_dword v0, off, s[8:11], 0 ; CI-NEXT: s_endpgm ; @@ -188,7 +188,7 @@ define amdgpu_kernel void @unsafe_frem_f32(float addrspace(1)* %out, float addrs ; VI-NEXT: v_rcp_f32_e32 v3, v2 ; VI-NEXT: v_mul_f32_e32 v3, v4, v3 ; VI-NEXT: v_trunc_f32_e32 v3, v3 -; VI-NEXT: v_mad_f32 v2, -v3, v2, v4 +; VI-NEXT: v_fma_f32 v2, -v3, v2, v4 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm float addrspace(1)* %in2) #1 { @@ -251,8 +251,7 @@ define amdgpu_kernel void @frem_f64(double addrspace(1)* %out, double addrspace( ; SI-NEXT: v_cndmask_b32_e64 v5, v7, v5, s[0:1] ; SI-NEXT: v_cndmask_b32_e64 v6, v6, 0, vcc ; SI-NEXT: v_cndmask_b32_e64 v4, v6, v4, s[0:1] -; SI-NEXT: v_mul_f64 v[2:3], v[4:5], v[2:3] -; SI-NEXT: v_add_f64 v[0:1], v[0:1], -v[2:3] +; SI-NEXT: v_fma_f64 v[0:1], -v[4:5], v[2:3], v[0:1] ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; SI-NEXT: s_endpgm ; @@ -287,8 +286,7 @@ define amdgpu_kernel void @frem_f64(double addrspace(1)* %out, double addrspace( ; CI-NEXT: v_div_fmas_f64 v[4:5], v[4:5], v[6:7], v[10:11] ; CI-NEXT: v_div_fixup_f64 v[4:5], v[4:5], v[2:3], v[0:1] ; CI-NEXT: v_trunc_f64_e32 v[4:5], v[4:5] -; CI-NEXT: v_mul_f64 v[2:3], v[4:5], v[2:3] -; CI-NEXT: v_add_f64 v[0:1], v[0:1], -v[2:3] +; CI-NEXT: v_fma_f64 v[0:1], -v[4:5], v[2:3], v[0:1] ; CI-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 ; CI-NEXT: s_endpgm ; @@ -319,8 +317,7 @@ define amdgpu_kernel void @frem_f64(double addrspace(1)* %out, double addrspace( ; VI-NEXT: v_div_fmas_f64 v[6:7], v[6:7], v[8:9], v[12:13] ; VI-NEXT: v_div_fixup_f64 v[6:7], v[6:7], v[4:5], v[2:3] ; VI-NEXT: v_trunc_f64_e32 v[6:7], v[6:7] -; VI-NEXT: v_mul_f64 v[4:5], v[6:7], v[4:5] -; VI-NEXT: v_add_f64 v[2:3], v[2:3], -v[4:5] +; VI-NEXT: v_fma_f64 v[2:3], -v[6:7], v[4:5], v[2:3] ; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; VI-NEXT: s_endpgm double addrspace(1)* %in2) #0 { @@ -368,8 +365,7 @@ define amdgpu_kernel void @unsafe_frem_f64(double addrspace(1)* %out, double add ; SI-NEXT: v_cndmask_b32_e64 v5, v7, v5, s[0:1] ; SI-NEXT: v_cndmask_b32_e64 v6, v6, 0, vcc ; SI-NEXT: v_cndmask_b32_e64 v4, v6, v4, s[0:1] -; SI-NEXT: v_mul_f64 v[2:3], v[4:5], v[2:3] -; SI-NEXT: v_add_f64 v[0:1], v[0:1], -v[2:3] +; SI-NEXT: v_fma_f64 v[0:1], -v[4:5], v[2:3], v[0:1] ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[12:15], 0 ; SI-NEXT: s_endpgm ; @@ -394,8 +390,7 @@ define amdgpu_kernel void @unsafe_frem_f64(double addrspace(1)* %out, double add ; CI-NEXT: v_rcp_f64_e32 v[4:5], v[2:3] ; CI-NEXT: v_mul_f64 v[4:5], v[0:1], v[4:5] ; CI-NEXT: v_trunc_f64_e32 v[4:5], v[4:5] -; CI-NEXT: v_mul_f64 v[2:3], v[4:5], v[2:3] -; CI-NEXT: v_add_f64 v[0:1], v[0:1], -v[2:3] +; CI-NEXT: v_fma_f64 v[0:1], -v[4:5], v[2:3], v[0:1] ; CI-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 ; CI-NEXT: s_endpgm ; @@ -416,8 +411,7 @@ define amdgpu_kernel void @unsafe_frem_f64(double addrspace(1)* %out, double add ; VI-NEXT: v_rcp_f64_e32 v[6:7], v[4:5] ; VI-NEXT: v_mul_f64 v[6:7], v[2:3], v[6:7] ; VI-NEXT: v_trunc_f64_e32 v[6:7], v[6:7] -; VI-NEXT: v_mul_f64 v[4:5], v[6:7], v[4:5] -; VI-NEXT: v_add_f64 v[2:3], v[2:3], -v[4:5] +; VI-NEXT: v_fma_f64 v[2:3], -v[6:7], v[4:5], v[2:3] ; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; VI-NEXT: s_endpgm double addrspace(1)* %in2) #1 { @@ -463,7 +457,7 @@ define amdgpu_kernel void @frem_v2f32(<2 x float> addrspace(1)* %out, <2 x float ; SI-NEXT: v_div_fmas_f32 v4, v4, v6, v7 ; SI-NEXT: v_div_fixup_f32 v4, v4, v3, v1 ; SI-NEXT: v_trunc_f32_e32 v4, v4 -; SI-NEXT: v_mad_f32 v1, -v4, v3, v1 +; SI-NEXT: v_fma_f32 v1, -v4, v3, v1 ; SI-NEXT: v_div_scale_f32 v3, vcc, v0, v2, v0 ; SI-NEXT: v_div_scale_f32 v4, s[4:5], v2, v2, v0 ; SI-NEXT: v_rcp_f32_e32 v5, v4 @@ -478,7 +472,7 @@ define amdgpu_kernel void @frem_v2f32(<2 x float> addrspace(1)* %out, <2 x float ; SI-NEXT: v_div_fmas_f32 v3, v3, v5, v6 ; SI-NEXT: v_div_fixup_f32 v3, v3, v2, v0 ; SI-NEXT: v_trunc_f32_e32 v3, v3 -; SI-NEXT: v_mad_f32 v0, -v3, v2, v0 +; SI-NEXT: v_fma_f32 v0, -v3, v2, v0 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; @@ -516,7 +510,7 @@ define amdgpu_kernel void @frem_v2f32(<2 x float> addrspace(1)* %out, <2 x float ; CI-NEXT: v_div_fmas_f32 v4, v4, v6, v7 ; CI-NEXT: v_div_fixup_f32 v4, v4, v3, v1 ; CI-NEXT: v_trunc_f32_e32 v4, v4 -; CI-NEXT: v_mad_f32 v1, -v4, v3, v1 +; CI-NEXT: v_fma_f32 v1, -v4, v3, v1 ; CI-NEXT: v_div_scale_f32 v4, s[4:5], v2, v2, v0 ; CI-NEXT: v_div_scale_f32 v3, vcc, v0, v2, v0 ; CI-NEXT: v_rcp_f32_e32 v5, v4 @@ -531,7 +525,7 @@ define amdgpu_kernel void @frem_v2f32(<2 x float> addrspace(1)* %out, <2 x float ; CI-NEXT: v_div_fmas_f32 v3, v3, v5, v6 ; CI-NEXT: v_div_fixup_f32 v3, v3, v2, v0 ; CI-NEXT: v_trunc_f32_e32 v3, v3 -; CI-NEXT: v_mad_f32 v0, -v3, v2, v0 +; CI-NEXT: v_fma_f32 v0, -v3, v2, v0 ; CI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; CI-NEXT: s_endpgm ; @@ -567,7 +561,7 @@ define amdgpu_kernel void @frem_v2f32(<2 x float> addrspace(1)* %out, <2 x float ; VI-NEXT: v_div_fmas_f32 v6, v6, v8, v9 ; VI-NEXT: v_div_fixup_f32 v6, v6, v5, v3 ; VI-NEXT: v_trunc_f32_e32 v6, v6 -; VI-NEXT: v_mad_f32 v3, -v6, v5, v3 +; VI-NEXT: v_fma_f32 v3, -v6, v5, v3 ; VI-NEXT: v_div_scale_f32 v6, s[0:1], v4, v4, v2 ; VI-NEXT: v_div_scale_f32 v5, vcc, v2, v4, v2 ; VI-NEXT: v_rcp_f32_e32 v7, v6 @@ -582,7 +576,7 @@ define amdgpu_kernel void @frem_v2f32(<2 x float> addrspace(1)* %out, <2 x float ; VI-NEXT: v_div_fmas_f32 v5, v5, v7, v8 ; VI-NEXT: v_div_fixup_f32 v5, v5, v4, v2 ; VI-NEXT: v_trunc_f32_e32 v5, v5 -; VI-NEXT: v_mad_f32 v2, -v5, v4, v2 +; VI-NEXT: v_fma_f32 v2, -v5, v4, v2 ; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; VI-NEXT: s_endpgm <2 x float> addrspace(1)* %in2) #0 { @@ -629,7 +623,7 @@ define amdgpu_kernel void @frem_v4f32(<4 x float> addrspace(1)* %out, <4 x float ; SI-NEXT: v_div_fmas_f32 v8, v8, v10, v11 ; SI-NEXT: v_div_fixup_f32 v8, v8, v7, v3 ; SI-NEXT: v_trunc_f32_e32 v8, v8 -; SI-NEXT: v_mad_f32 v3, -v8, v7, v3 +; SI-NEXT: v_fma_f32 v3, -v8, v7, v3 ; SI-NEXT: v_div_scale_f32 v7, vcc, v2, v6, v2 ; SI-NEXT: v_div_scale_f32 v8, s[4:5], v6, v6, v2 ; SI-NEXT: v_rcp_f32_e32 v9, v8 @@ -644,7 +638,7 @@ define amdgpu_kernel void @frem_v4f32(<4 x float> addrspace(1)* %out, <4 x float ; SI-NEXT: v_div_fmas_f32 v7, v7, v9, v10 ; SI-NEXT: v_div_fixup_f32 v7, v7, v6, v2 ; SI-NEXT: v_trunc_f32_e32 v7, v7 -; SI-NEXT: v_mad_f32 v2, -v7, v6, v2 +; SI-NEXT: v_fma_f32 v2, -v7, v6, v2 ; SI-NEXT: v_div_scale_f32 v6, vcc, v1, v5, v1 ; SI-NEXT: v_div_scale_f32 v7, s[4:5], v5, v5, v1 ; SI-NEXT: v_rcp_f32_e32 v8, v7 @@ -659,7 +653,7 @@ define amdgpu_kernel void @frem_v4f32(<4 x float> addrspace(1)* %out, <4 x float ; SI-NEXT: v_div_fmas_f32 v6, v6, v8, v9 ; SI-NEXT: v_div_fixup_f32 v6, v6, v5, v1 ; SI-NEXT: v_trunc_f32_e32 v6, v6 -; SI-NEXT: v_mad_f32 v1, -v6, v5, v1 +; SI-NEXT: v_fma_f32 v1, -v6, v5, v1 ; SI-NEXT: v_div_scale_f32 v5, vcc, v0, v4, v0 ; SI-NEXT: v_div_scale_f32 v6, s[4:5], v4, v4, v0 ; SI-NEXT: v_rcp_f32_e32 v7, v6 @@ -674,7 +668,7 @@ define amdgpu_kernel void @frem_v4f32(<4 x float> addrspace(1)* %out, <4 x float ; SI-NEXT: v_div_fmas_f32 v5, v5, v7, v8 ; SI-NEXT: v_div_fixup_f32 v5, v5, v4, v0 ; SI-NEXT: v_trunc_f32_e32 v5, v5 -; SI-NEXT: v_mad_f32 v0, -v5, v4, v0 +; SI-NEXT: v_fma_f32 v0, -v5, v4, v0 ; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; @@ -712,7 +706,7 @@ define amdgpu_kernel void @frem_v4f32(<4 x float> addrspace(1)* %out, <4 x float ; CI-NEXT: v_div_fmas_f32 v8, v8, v10, v11 ; CI-NEXT: v_div_fixup_f32 v8, v8, v7, v3 ; CI-NEXT: v_trunc_f32_e32 v8, v8 -; CI-NEXT: v_mad_f32 v3, -v8, v7, v3 +; CI-NEXT: v_fma_f32 v3, -v8, v7, v3 ; CI-NEXT: v_div_scale_f32 v8, s[4:5], v6, v6, v2 ; CI-NEXT: v_div_scale_f32 v7, vcc, v2, v6, v2 ; CI-NEXT: v_rcp_f32_e32 v9, v8 @@ -727,7 +721,7 @@ define amdgpu_kernel void @frem_v4f32(<4 x float> addrspace(1)* %out, <4 x float ; CI-NEXT: v_div_fmas_f32 v7, v7, v9, v10 ; CI-NEXT: v_div_fixup_f32 v7, v7, v6, v2 ; CI-NEXT: v_trunc_f32_e32 v7, v7 -; CI-NEXT: v_mad_f32 v2, -v7, v6, v2 +; CI-NEXT: v_fma_f32 v2, -v7, v6, v2 ; CI-NEXT: v_div_scale_f32 v7, s[4:5], v5, v5, v1 ; CI-NEXT: v_div_scale_f32 v6, vcc, v1, v5, v1 ; CI-NEXT: v_rcp_f32_e32 v8, v7 @@ -742,7 +736,7 @@ define amdgpu_kernel void @frem_v4f32(<4 x float> addrspace(1)* %out, <4 x float ; CI-NEXT: v_div_fmas_f32 v6, v6, v8, v9 ; CI-NEXT: v_div_fixup_f32 v6, v6, v5, v1 ; CI-NEXT: v_trunc_f32_e32 v6, v6 -; CI-NEXT: v_mad_f32 v1, -v6, v5, v1 +; CI-NEXT: v_fma_f32 v1, -v6, v5, v1 ; CI-NEXT: v_div_scale_f32 v6, s[4:5], v4, v4, v0 ; CI-NEXT: v_div_scale_f32 v5, vcc, v0, v4, v0 ; CI-NEXT: v_rcp_f32_e32 v7, v6 @@ -757,7 +751,7 @@ define amdgpu_kernel void @frem_v4f32(<4 x float> addrspace(1)* %out, <4 x float ; CI-NEXT: v_div_fmas_f32 v5, v5, v7, v8 ; CI-NEXT: v_div_fixup_f32 v5, v5, v4, v0 ; CI-NEXT: v_trunc_f32_e32 v5, v5 -; CI-NEXT: v_mad_f32 v0, -v5, v4, v0 +; CI-NEXT: v_fma_f32 v0, -v5, v4, v0 ; CI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; CI-NEXT: s_endpgm ; @@ -793,7 +787,7 @@ define amdgpu_kernel void @frem_v4f32(<4 x float> addrspace(1)* %out, <4 x float ; VI-NEXT: v_div_fmas_f32 v10, v10, v12, v13 ; VI-NEXT: v_div_fixup_f32 v10, v10, v7, v3 ; VI-NEXT: v_trunc_f32_e32 v10, v10 -; VI-NEXT: v_mad_f32 v3, -v10, v7, v3 +; VI-NEXT: v_fma_f32 v3, -v10, v7, v3 ; VI-NEXT: v_div_scale_f32 v10, s[0:1], v6, v6, v2 ; VI-NEXT: v_div_scale_f32 v7, vcc, v2, v6, v2 ; VI-NEXT: v_rcp_f32_e32 v11, v10 @@ -808,7 +802,7 @@ define amdgpu_kernel void @frem_v4f32(<4 x float> addrspace(1)* %out, <4 x float ; VI-NEXT: v_div_fmas_f32 v7, v7, v11, v12 ; VI-NEXT: v_div_fixup_f32 v7, v7, v6, v2 ; VI-NEXT: v_trunc_f32_e32 v7, v7 -; VI-NEXT: v_mad_f32 v2, -v7, v6, v2 +; VI-NEXT: v_fma_f32 v2, -v7, v6, v2 ; VI-NEXT: v_div_scale_f32 v7, s[0:1], v5, v5, v1 ; VI-NEXT: v_div_scale_f32 v6, vcc, v1, v5, v1 ; VI-NEXT: v_rcp_f32_e32 v10, v7 @@ -823,7 +817,7 @@ define amdgpu_kernel void @frem_v4f32(<4 x float> addrspace(1)* %out, <4 x float ; VI-NEXT: v_div_fmas_f32 v6, v6, v10, v11 ; VI-NEXT: v_div_fixup_f32 v6, v6, v5, v1 ; VI-NEXT: v_trunc_f32_e32 v6, v6 -; VI-NEXT: v_mad_f32 v1, -v6, v5, v1 +; VI-NEXT: v_fma_f32 v1, -v6, v5, v1 ; VI-NEXT: v_div_scale_f32 v6, s[0:1], v4, v4, v0 ; VI-NEXT: v_div_scale_f32 v5, vcc, v0, v4, v0 ; VI-NEXT: v_rcp_f32_e32 v7, v6 @@ -838,7 +832,7 @@ define amdgpu_kernel void @frem_v4f32(<4 x float> addrspace(1)* %out, <4 x float ; VI-NEXT: v_div_fmas_f32 v5, v5, v7, v10 ; VI-NEXT: v_div_fixup_f32 v5, v5, v4, v0 ; VI-NEXT: v_trunc_f32_e32 v5, v5 -; VI-NEXT: v_mad_f32 v0, -v5, v4, v0 +; VI-NEXT: v_fma_f32 v0, -v5, v4, v0 ; VI-NEXT: flat_store_dwordx4 v[8:9], v[0:3] ; VI-NEXT: s_endpgm <4 x float> addrspace(1)* %in2) #0 { @@ -902,8 +896,7 @@ define amdgpu_kernel void @frem_v2f64(<2 x double> addrspace(1)* %out, <2 x doub ; SI-NEXT: v_cndmask_b32_e64 v9, v11, v9, s[0:1] ; SI-NEXT: v_cndmask_b32_e64 v10, v10, 0, vcc ; SI-NEXT: v_cndmask_b32_e64 v8, v10, v8, s[0:1] -; SI-NEXT: v_mul_f64 v[6:7], v[8:9], v[6:7] -; SI-NEXT: v_add_f64 v[2:3], v[2:3], -v[6:7] +; SI-NEXT: v_fma_f64 v[2:3], -v[8:9], v[6:7], v[2:3] ; SI-NEXT: v_div_scale_f64 v[6:7], s[0:1], v[4:5], v[4:5], v[0:1] ; SI-NEXT: v_rcp_f64_e32 v[8:9], v[6:7] ; SI-NEXT: v_fma_f64 v[10:11], -v[6:7], v[8:9], 1.0 @@ -934,8 +927,7 @@ define amdgpu_kernel void @frem_v2f64(<2 x double> addrspace(1)* %out, <2 x doub ; SI-NEXT: v_cndmask_b32_e64 v7, v9, v7, s[0:1] ; SI-NEXT: v_cndmask_b32_e64 v8, v8, 0, vcc ; SI-NEXT: v_cndmask_b32_e64 v6, v8, v6, s[0:1] -; SI-NEXT: v_mul_f64 v[4:5], v[6:7], v[4:5] -; SI-NEXT: v_add_f64 v[0:1], v[0:1], -v[4:5] +; SI-NEXT: v_fma_f64 v[0:1], -v[6:7], v[4:5], v[0:1] ; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; SI-NEXT: s_endpgm ; @@ -970,8 +962,7 @@ define amdgpu_kernel void @frem_v2f64(<2 x double> addrspace(1)* %out, <2 x doub ; CI-NEXT: v_div_fmas_f64 v[8:9], v[8:9], v[10:11], v[14:15] ; CI-NEXT: v_div_fixup_f64 v[8:9], v[8:9], v[6:7], v[2:3] ; CI-NEXT: v_trunc_f64_e32 v[8:9], v[8:9] -; CI-NEXT: v_mul_f64 v[6:7], v[8:9], v[6:7] -; CI-NEXT: v_add_f64 v[2:3], v[2:3], -v[6:7] +; CI-NEXT: v_fma_f64 v[2:3], -v[8:9], v[6:7], v[2:3] ; CI-NEXT: v_div_scale_f64 v[6:7], s[4:5], v[4:5], v[4:5], v[0:1] ; CI-NEXT: v_rcp_f64_e32 v[8:9], v[6:7] ; CI-NEXT: v_fma_f64 v[10:11], -v[6:7], v[8:9], 1.0 @@ -985,8 +976,7 @@ define amdgpu_kernel void @frem_v2f64(<2 x double> addrspace(1)* %out, <2 x doub ; CI-NEXT: v_div_fmas_f64 v[6:7], v[6:7], v[8:9], v[12:13] ; CI-NEXT: v_div_fixup_f64 v[6:7], v[6:7], v[4:5], v[0:1] ; CI-NEXT: v_trunc_f64_e32 v[6:7], v[6:7] -; CI-NEXT: v_mul_f64 v[4:5], v[6:7], v[4:5] -; CI-NEXT: v_add_f64 v[0:1], v[0:1], -v[4:5] +; CI-NEXT: v_fma_f64 v[0:1], -v[6:7], v[4:5], v[0:1] ; CI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; CI-NEXT: s_endpgm ; @@ -1019,8 +1009,7 @@ define amdgpu_kernel void @frem_v2f64(<2 x double> addrspace(1)* %out, <2 x doub ; VI-NEXT: v_div_fmas_f64 v[10:11], v[10:11], v[12:13], v[16:17] ; VI-NEXT: v_div_fixup_f64 v[10:11], v[10:11], v[6:7], v[2:3] ; VI-NEXT: v_trunc_f64_e32 v[10:11], v[10:11] -; VI-NEXT: v_mul_f64 v[6:7], v[10:11], v[6:7] -; VI-NEXT: v_add_f64 v[2:3], v[2:3], -v[6:7] +; VI-NEXT: v_fma_f64 v[2:3], -v[10:11], v[6:7], v[2:3] ; VI-NEXT: v_div_scale_f64 v[6:7], s[0:1], v[4:5], v[4:5], v[0:1] ; VI-NEXT: v_rcp_f64_e32 v[10:11], v[6:7] ; VI-NEXT: v_fma_f64 v[12:13], -v[6:7], v[10:11], 1.0 @@ -1034,8 +1023,7 @@ define amdgpu_kernel void @frem_v2f64(<2 x double> addrspace(1)* %out, <2 x doub ; VI-NEXT: v_div_fmas_f64 v[6:7], v[6:7], v[10:11], v[14:15] ; VI-NEXT: v_div_fixup_f64 v[6:7], v[6:7], v[4:5], v[0:1] ; VI-NEXT: v_trunc_f64_e32 v[6:7], v[6:7] -; VI-NEXT: v_mul_f64 v[4:5], v[6:7], v[4:5] -; VI-NEXT: v_add_f64 v[0:1], v[0:1], -v[4:5] +; VI-NEXT: v_fma_f64 v[0:1], -v[6:7], v[4:5], v[0:1] ; VI-NEXT: flat_store_dwordx4 v[8:9], v[0:3] ; VI-NEXT: s_endpgm <2 x double> addrspace(1)* %in2) #0 { -- 2.7.4