From 86dcb592069f2d18a183fa1daa611029ae80ef4c Mon Sep 17 00:00:00 2001 From: Jay Foad Date: Mon, 20 Sep 2021 14:20:28 +0100 Subject: [PATCH] [AMDGPU] Prefer v_fmac over v_fma only when no source modifiers are used v_fmac with source modifiers forces VOP3 encoding, but it is strictly better to use the VOP3-only v_fma instead, because $dst and $src2 are not tied so it gives the register allocator more freedom and avoids a copy in some cases. This is the same strategy we already use for v_mad vs v_mac and v_fma_legacy vs v_fmac_legacy. Differential Revision: https://reviews.llvm.org/D110070 --- llvm/lib/Target/AMDGPU/SIInstructions.td | 26 +-- llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f32.ll | 190 ++++++++++----------- llvm/test/CodeGen/AMDGPU/GlobalISel/fma.ll | 10 +- .../AMDGPU/GlobalISel/inst-select-fma.s32.mir | 8 +- llvm/test/CodeGen/AMDGPU/dagcombine-fma-fmad.ll | 8 +- llvm/test/CodeGen/AMDGPU/fdiv.ll | 8 +- llvm/test/CodeGen/AMDGPU/fma.f64.ll | 24 +-- ...fmad-formation-fmul-distribute-denormal-mode.ll | 14 +- llvm/test/CodeGen/AMDGPU/fmuladd.f16.ll | 7 +- llvm/test/CodeGen/AMDGPU/frem.ll | 83 +++++---- llvm/test/CodeGen/AMDGPU/mad-mix.ll | 2 +- llvm/test/CodeGen/AMDGPU/strict_fma.f16.ll | 6 +- llvm/test/CodeGen/AMDGPU/strict_fma.f32.ll | 8 +- llvm/test/CodeGen/AMDGPU/udiv.ll | 2 +- 14 files changed, 194 insertions(+), 202 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td index 10f0813..4c484b6 100644 --- a/llvm/lib/Target/AMDGPU/SIInstructions.td +++ b/llvm/lib/Target/AMDGPU/SIInstructions.td @@ -2305,31 +2305,37 @@ let SubtargetPredicate = NotHasMinMaxDenormModes in { let OtherPredicates = [HasDLInsts] in { +// Don't allow source modifiers. If there are any source modifiers then it's +// better to select fma instead of fmac. def : GCNPat < - (fma (f32 (VOP3Mods f32:$src0, i32:$src0_modifiers)), - (f32 (VOP3Mods f32:$src1, i32:$src1_modifiers)), + (fma (f32 (VOP3NoMods f32:$src0)), + (f32 (VOP3NoMods f32:$src1)), (f32 (VOP3NoMods f32:$src2))), - (V_FMAC_F32_e64 $src0_modifiers, $src0, $src1_modifiers, $src1, + (V_FMAC_F32_e64 SRCMODS.NONE, $src0, SRCMODS.NONE, $src1, SRCMODS.NONE, $src2) >; } // End OtherPredicates = [HasDLInsts] let SubtargetPredicate = isGFX10Plus in +// Don't allow source modifiers. If there are any source modifiers then it's +// better to select fma instead of fmac. def : GCNPat < - (fma (f16 (VOP3Mods f32:$src0, i32:$src0_modifiers)), - (f16 (VOP3Mods f32:$src1, i32:$src1_modifiers)), + (fma (f16 (VOP3NoMods f32:$src0)), + (f16 (VOP3NoMods f32:$src1)), (f16 (VOP3NoMods f32:$src2))), - (V_FMAC_F16_e64 $src0_modifiers, $src0, $src1_modifiers, $src1, + (V_FMAC_F16_e64 SRCMODS.NONE, $src0, SRCMODS.NONE, $src1, SRCMODS.NONE, $src2) >; let SubtargetPredicate = isGFX90APlus in +// Don't allow source modifiers. If there are any source modifiers then it's +// better to select fma instead of fmac. 
def : GCNPat < - (fma (f64 (VOP3Mods0 f64:$src0, i32:$src0_modifiers, i1:$clamp, i32:$omod)), - (f64 (VOP3Mods f64:$src1, i32:$src1_modifiers)), + (fma (f64 (VOP3NoMods f64:$src0)), + (f64 (VOP3NoMods f64:$src1)), (f64 (VOP3NoMods f64:$src2))), - (V_FMAC_F64_e64 $src0_modifiers, $src0, $src1_modifiers, $src1, - SRCMODS.NONE, $src2, $clamp, $omod) + (V_FMAC_F64_e64 SRCMODS.NONE, $src0, SRCMODS.NONE, $src1, + SRCMODS.NONE, $src2) >; // COPY is workaround tablegen bug from multiple outputs diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f32.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f32.ll index 110d9c6..7cd475b 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f32.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f32.ll @@ -90,10 +90,10 @@ define float @v_fdiv_f32(float %a, float %b) { ; GFX10-IEEE-NEXT: v_fma_f32 v4, -v2, v3, 1.0 ; GFX10-IEEE-NEXT: v_fmac_f32_e32 v3, v4, v3 ; GFX10-IEEE-NEXT: v_mul_f32_e32 v4, v5, v3 -; GFX10-IEEE-NEXT: v_fma_f32 v6, v4, -v2, v5 +; GFX10-IEEE-NEXT: v_fma_f32 v6, -v2, v4, v5 ; GFX10-IEEE-NEXT: v_fmac_f32_e32 v4, v6, v3 -; GFX10-IEEE-NEXT: v_fmac_f32_e64 v5, -v2, v4 -; GFX10-IEEE-NEXT: v_div_fmas_f32 v2, v5, v3, v4 +; GFX10-IEEE-NEXT: v_fma_f32 v2, -v2, v4, v5 +; GFX10-IEEE-NEXT: v_div_fmas_f32 v2, v2, v3, v4 ; GFX10-IEEE-NEXT: v_div_fixup_f32 v0, v2, v1, v0 ; GFX10-IEEE-NEXT: s_setpc_b64 s[30:31] ; @@ -108,11 +108,11 @@ define float @v_fdiv_f32(float %a, float %b) { ; GFX10-FLUSH-NEXT: v_fma_f32 v5, -v2, v3, 1.0 ; GFX10-FLUSH-NEXT: v_fmac_f32_e32 v3, v5, v3 ; GFX10-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v3 -; GFX10-FLUSH-NEXT: v_fma_f32 v6, v5, -v2, v4 +; GFX10-FLUSH-NEXT: v_fma_f32 v6, -v2, v5, v4 ; GFX10-FLUSH-NEXT: v_fmac_f32_e32 v5, v6, v3 -; GFX10-FLUSH-NEXT: v_fmac_f32_e64 v4, -v2, v5 +; GFX10-FLUSH-NEXT: v_fma_f32 v2, -v2, v5, v4 ; GFX10-FLUSH-NEXT: s_denorm_mode 0 -; GFX10-FLUSH-NEXT: v_div_fmas_f32 v2, v4, v3, v5 +; GFX10-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v5 ; GFX10-FLUSH-NEXT: v_div_fixup_f32 v0, v2, v1, v0 ; GFX10-FLUSH-NEXT: s_setpc_b64 s[30:31] %fdiv = fdiv float %a, %b @@ -194,10 +194,10 @@ define float @v_fdiv_f32_ulp25(float %a, float %b) { ; GFX10-IEEE-NEXT: v_fma_f32 v4, -v2, v3, 1.0 ; GFX10-IEEE-NEXT: v_fmac_f32_e32 v3, v4, v3 ; GFX10-IEEE-NEXT: v_mul_f32_e32 v4, v5, v3 -; GFX10-IEEE-NEXT: v_fma_f32 v6, v4, -v2, v5 +; GFX10-IEEE-NEXT: v_fma_f32 v6, -v2, v4, v5 ; GFX10-IEEE-NEXT: v_fmac_f32_e32 v4, v6, v3 -; GFX10-IEEE-NEXT: v_fmac_f32_e64 v5, -v2, v4 -; GFX10-IEEE-NEXT: v_div_fmas_f32 v2, v5, v3, v4 +; GFX10-IEEE-NEXT: v_fma_f32 v2, -v2, v4, v5 +; GFX10-IEEE-NEXT: v_div_fmas_f32 v2, v2, v3, v4 ; GFX10-IEEE-NEXT: v_div_fixup_f32 v0, v2, v1, v0 ; GFX10-IEEE-NEXT: s_setpc_b64 s[30:31] ; @@ -295,10 +295,10 @@ define float @v_rcp_f32(float %x) { ; GFX10-IEEE-NEXT: v_fma_f32 v3, -v1, v2, 1.0 ; GFX10-IEEE-NEXT: v_fmac_f32_e32 v2, v3, v2 ; GFX10-IEEE-NEXT: v_mul_f32_e32 v3, v4, v2 -; GFX10-IEEE-NEXT: v_fma_f32 v5, v3, -v1, v4 +; GFX10-IEEE-NEXT: v_fma_f32 v5, -v1, v3, v4 ; GFX10-IEEE-NEXT: v_fmac_f32_e32 v3, v5, v2 -; GFX10-IEEE-NEXT: v_fmac_f32_e64 v4, -v1, v3 -; GFX10-IEEE-NEXT: v_div_fmas_f32 v1, v4, v2, v3 +; GFX10-IEEE-NEXT: v_fma_f32 v1, -v1, v3, v4 +; GFX10-IEEE-NEXT: v_div_fmas_f32 v1, v1, v2, v3 ; GFX10-IEEE-NEXT: v_div_fixup_f32 v0, v1, v0, 1.0 ; GFX10-IEEE-NEXT: s_setpc_b64 s[30:31] ; @@ -313,11 +313,11 @@ define float @v_rcp_f32(float %x) { ; GFX10-FLUSH-NEXT: v_fma_f32 v4, -v1, v2, 1.0 ; GFX10-FLUSH-NEXT: v_fmac_f32_e32 v2, v4, v2 ; GFX10-FLUSH-NEXT: v_mul_f32_e32 v4, v3, v2 -; GFX10-FLUSH-NEXT: v_fma_f32 v5, v4, -v1, v3 
+; GFX10-FLUSH-NEXT: v_fma_f32 v5, -v1, v4, v3 ; GFX10-FLUSH-NEXT: v_fmac_f32_e32 v4, v5, v2 -; GFX10-FLUSH-NEXT: v_fmac_f32_e64 v3, -v1, v4 +; GFX10-FLUSH-NEXT: v_fma_f32 v1, -v1, v4, v3 ; GFX10-FLUSH-NEXT: s_denorm_mode 0 -; GFX10-FLUSH-NEXT: v_div_fmas_f32 v1, v3, v2, v4 +; GFX10-FLUSH-NEXT: v_div_fmas_f32 v1, v1, v2, v4 ; GFX10-FLUSH-NEXT: v_div_fixup_f32 v0, v1, v0, 1.0 ; GFX10-FLUSH-NEXT: s_setpc_b64 s[30:31] %fdiv = fdiv float 1.0, %x @@ -403,10 +403,10 @@ define float @v_rcp_f32_arcp(float %x) { ; GFX10-IEEE-NEXT: v_fma_f32 v3, -v1, v2, 1.0 ; GFX10-IEEE-NEXT: v_fmac_f32_e32 v2, v3, v2 ; GFX10-IEEE-NEXT: v_mul_f32_e32 v3, v4, v2 -; GFX10-IEEE-NEXT: v_fma_f32 v5, v3, -v1, v4 +; GFX10-IEEE-NEXT: v_fma_f32 v5, -v1, v3, v4 ; GFX10-IEEE-NEXT: v_fmac_f32_e32 v3, v5, v2 -; GFX10-IEEE-NEXT: v_fmac_f32_e64 v4, -v1, v3 -; GFX10-IEEE-NEXT: v_div_fmas_f32 v1, v4, v2, v3 +; GFX10-IEEE-NEXT: v_fma_f32 v1, -v1, v3, v4 +; GFX10-IEEE-NEXT: v_div_fmas_f32 v1, v1, v2, v3 ; GFX10-IEEE-NEXT: v_div_fixup_f32 v0, v1, v0, 1.0 ; GFX10-IEEE-NEXT: s_setpc_b64 s[30:31] ; @@ -421,11 +421,11 @@ define float @v_rcp_f32_arcp(float %x) { ; GFX10-FLUSH-NEXT: v_fma_f32 v4, -v1, v2, 1.0 ; GFX10-FLUSH-NEXT: v_fmac_f32_e32 v2, v4, v2 ; GFX10-FLUSH-NEXT: v_mul_f32_e32 v4, v3, v2 -; GFX10-FLUSH-NEXT: v_fma_f32 v5, v4, -v1, v3 +; GFX10-FLUSH-NEXT: v_fma_f32 v5, -v1, v4, v3 ; GFX10-FLUSH-NEXT: v_fmac_f32_e32 v4, v5, v2 -; GFX10-FLUSH-NEXT: v_fmac_f32_e64 v3, -v1, v4 +; GFX10-FLUSH-NEXT: v_fma_f32 v1, -v1, v4, v3 ; GFX10-FLUSH-NEXT: s_denorm_mode 0 -; GFX10-FLUSH-NEXT: v_div_fmas_f32 v1, v3, v2, v4 +; GFX10-FLUSH-NEXT: v_div_fmas_f32 v1, v1, v2, v4 ; GFX10-FLUSH-NEXT: v_div_fixup_f32 v0, v1, v0, 1.0 ; GFX10-FLUSH-NEXT: s_setpc_b64 s[30:31] %fdiv = fdiv arcp float 1.0, %x @@ -566,10 +566,10 @@ define float @v_fdiv_f32_arcp_ulp25(float %a, float %b) { ; GFX10-IEEE-NEXT: v_fma_f32 v4, -v2, v3, 1.0 ; GFX10-IEEE-NEXT: v_fmac_f32_e32 v3, v4, v3 ; GFX10-IEEE-NEXT: v_mul_f32_e32 v4, v5, v3 -; GFX10-IEEE-NEXT: v_fma_f32 v6, v4, -v2, v5 +; GFX10-IEEE-NEXT: v_fma_f32 v6, -v2, v4, v5 ; GFX10-IEEE-NEXT: v_fmac_f32_e32 v4, v6, v3 -; GFX10-IEEE-NEXT: v_fmac_f32_e64 v5, -v2, v4 -; GFX10-IEEE-NEXT: v_div_fmas_f32 v2, v5, v3, v4 +; GFX10-IEEE-NEXT: v_fma_f32 v2, -v2, v4, v5 +; GFX10-IEEE-NEXT: v_div_fmas_f32 v2, v2, v3, v4 ; GFX10-IEEE-NEXT: v_div_fixup_f32 v0, v2, v1, v0 ; GFX10-IEEE-NEXT: s_setpc_b64 s[30:31] ; @@ -722,15 +722,15 @@ define <2 x float> @v_fdiv_v2f32(<2 x float> %a, <2 x float> %b) { ; GFX10-IEEE-NEXT: v_fmac_f32_e32 v7, v9, v7 ; GFX10-IEEE-NEXT: v_mul_f32_e32 v9, v10, v6 ; GFX10-IEEE-NEXT: v_mul_f32_e32 v11, v8, v7 -; GFX10-IEEE-NEXT: v_fma_f32 v12, v9, -v4, v10 -; GFX10-IEEE-NEXT: v_fma_f32 v13, v11, -v5, v8 +; GFX10-IEEE-NEXT: v_fma_f32 v12, -v4, v9, v10 +; GFX10-IEEE-NEXT: v_fma_f32 v13, -v5, v11, v8 ; GFX10-IEEE-NEXT: v_fmac_f32_e32 v9, v12, v6 ; GFX10-IEEE-NEXT: v_fmac_f32_e32 v11, v13, v7 -; GFX10-IEEE-NEXT: v_fmac_f32_e64 v10, -v4, v9 -; GFX10-IEEE-NEXT: v_fmac_f32_e64 v8, -v5, v11 -; GFX10-IEEE-NEXT: v_div_fmas_f32 v4, v10, v6, v9 +; GFX10-IEEE-NEXT: v_fma_f32 v4, -v4, v9, v10 +; GFX10-IEEE-NEXT: v_fma_f32 v5, -v5, v11, v8 +; GFX10-IEEE-NEXT: v_div_fmas_f32 v4, v4, v6, v9 ; GFX10-IEEE-NEXT: s_mov_b32 vcc_lo, s4 -; GFX10-IEEE-NEXT: v_div_fmas_f32 v5, v8, v7, v11 +; GFX10-IEEE-NEXT: v_div_fmas_f32 v5, v5, v7, v11 ; GFX10-IEEE-NEXT: v_div_fixup_f32 v0, v4, v2, v0 ; GFX10-IEEE-NEXT: v_div_fixup_f32 v1, v5, v3, v1 ; GFX10-IEEE-NEXT: s_setpc_b64 s[30:31] @@ -746,24 +746,24 @@ define <2 x float> @v_fdiv_v2f32(<2 x float> 
%a, <2 x float> %b) { ; GFX10-FLUSH-NEXT: v_fma_f32 v7, -v4, v5, 1.0 ; GFX10-FLUSH-NEXT: v_fmac_f32_e32 v5, v7, v5 ; GFX10-FLUSH-NEXT: v_mul_f32_e32 v7, v6, v5 -; GFX10-FLUSH-NEXT: v_fma_f32 v8, v7, -v4, v6 +; GFX10-FLUSH-NEXT: v_fma_f32 v8, -v4, v7, v6 ; GFX10-FLUSH-NEXT: v_fmac_f32_e32 v7, v8, v5 -; GFX10-FLUSH-NEXT: v_fmac_f32_e64 v6, -v4, v7 +; GFX10-FLUSH-NEXT: v_fma_f32 v4, -v4, v7, v6 ; GFX10-FLUSH-NEXT: s_denorm_mode 0 -; GFX10-FLUSH-NEXT: v_div_scale_f32 v4, s4, v3, v3, v1 -; GFX10-FLUSH-NEXT: v_div_fmas_f32 v5, v6, v5, v7 -; GFX10-FLUSH-NEXT: v_rcp_f32_e32 v6, v4 -; GFX10-FLUSH-NEXT: v_div_fixup_f32 v0, v5, v2, v0 +; GFX10-FLUSH-NEXT: v_div_scale_f32 v6, s4, v3, v3, v1 +; GFX10-FLUSH-NEXT: v_div_fmas_f32 v4, v4, v5, v7 +; GFX10-FLUSH-NEXT: v_rcp_f32_e32 v5, v6 +; GFX10-FLUSH-NEXT: v_div_fixup_f32 v0, v4, v2, v0 ; GFX10-FLUSH-NEXT: v_div_scale_f32 v2, vcc_lo, v1, v3, v1 ; GFX10-FLUSH-NEXT: s_denorm_mode 3 -; GFX10-FLUSH-NEXT: v_fma_f32 v5, -v4, v6, 1.0 -; GFX10-FLUSH-NEXT: v_fmac_f32_e32 v6, v5, v6 -; GFX10-FLUSH-NEXT: v_mul_f32_e32 v5, v2, v6 -; GFX10-FLUSH-NEXT: v_fma_f32 v7, v5, -v4, v2 -; GFX10-FLUSH-NEXT: v_fmac_f32_e32 v5, v7, v6 -; GFX10-FLUSH-NEXT: v_fmac_f32_e64 v2, -v4, v5 +; GFX10-FLUSH-NEXT: v_fma_f32 v4, -v6, v5, 1.0 +; GFX10-FLUSH-NEXT: v_fmac_f32_e32 v5, v4, v5 +; GFX10-FLUSH-NEXT: v_mul_f32_e32 v4, v2, v5 +; GFX10-FLUSH-NEXT: v_fma_f32 v7, -v6, v4, v2 +; GFX10-FLUSH-NEXT: v_fmac_f32_e32 v4, v7, v5 +; GFX10-FLUSH-NEXT: v_fma_f32 v2, -v6, v4, v2 ; GFX10-FLUSH-NEXT: s_denorm_mode 0 -; GFX10-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v6, v5 +; GFX10-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v5, v4 ; GFX10-FLUSH-NEXT: v_div_fixup_f32 v1, v2, v3, v1 ; GFX10-FLUSH-NEXT: s_setpc_b64 s[30:31] %fdiv = fdiv <2 x float> %a, %b @@ -884,15 +884,15 @@ define <2 x float> @v_fdiv_v2f32_ulp25(<2 x float> %a, <2 x float> %b) { ; GFX10-IEEE-NEXT: v_fmac_f32_e32 v7, v9, v7 ; GFX10-IEEE-NEXT: v_mul_f32_e32 v9, v10, v6 ; GFX10-IEEE-NEXT: v_mul_f32_e32 v11, v8, v7 -; GFX10-IEEE-NEXT: v_fma_f32 v12, v9, -v4, v10 -; GFX10-IEEE-NEXT: v_fma_f32 v13, v11, -v5, v8 +; GFX10-IEEE-NEXT: v_fma_f32 v12, -v4, v9, v10 +; GFX10-IEEE-NEXT: v_fma_f32 v13, -v5, v11, v8 ; GFX10-IEEE-NEXT: v_fmac_f32_e32 v9, v12, v6 ; GFX10-IEEE-NEXT: v_fmac_f32_e32 v11, v13, v7 -; GFX10-IEEE-NEXT: v_fmac_f32_e64 v10, -v4, v9 -; GFX10-IEEE-NEXT: v_fmac_f32_e64 v8, -v5, v11 -; GFX10-IEEE-NEXT: v_div_fmas_f32 v4, v10, v6, v9 +; GFX10-IEEE-NEXT: v_fma_f32 v4, -v4, v9, v10 +; GFX10-IEEE-NEXT: v_fma_f32 v5, -v5, v11, v8 +; GFX10-IEEE-NEXT: v_div_fmas_f32 v4, v4, v6, v9 ; GFX10-IEEE-NEXT: s_mov_b32 vcc_lo, s4 -; GFX10-IEEE-NEXT: v_div_fmas_f32 v5, v8, v7, v11 +; GFX10-IEEE-NEXT: v_div_fmas_f32 v5, v5, v7, v11 ; GFX10-IEEE-NEXT: v_div_fixup_f32 v0, v4, v2, v0 ; GFX10-IEEE-NEXT: v_div_fixup_f32 v1, v5, v3, v1 ; GFX10-IEEE-NEXT: s_setpc_b64 s[30:31] @@ -1054,15 +1054,15 @@ define <2 x float> @v_rcp_v2f32(<2 x float> %x) { ; GFX10-IEEE-NEXT: v_fmac_f32_e32 v5, v7, v5 ; GFX10-IEEE-NEXT: v_mul_f32_e32 v7, v8, v4 ; GFX10-IEEE-NEXT: v_mul_f32_e32 v9, v6, v5 -; GFX10-IEEE-NEXT: v_fma_f32 v10, v7, -v2, v8 -; GFX10-IEEE-NEXT: v_fma_f32 v11, v9, -v3, v6 +; GFX10-IEEE-NEXT: v_fma_f32 v10, -v2, v7, v8 +; GFX10-IEEE-NEXT: v_fma_f32 v11, -v3, v9, v6 ; GFX10-IEEE-NEXT: v_fmac_f32_e32 v7, v10, v4 ; GFX10-IEEE-NEXT: v_fmac_f32_e32 v9, v11, v5 -; GFX10-IEEE-NEXT: v_fmac_f32_e64 v8, -v2, v7 -; GFX10-IEEE-NEXT: v_fmac_f32_e64 v6, -v3, v9 -; GFX10-IEEE-NEXT: v_div_fmas_f32 v2, v8, v4, v7 +; GFX10-IEEE-NEXT: v_fma_f32 v2, -v2, v7, v8 +; GFX10-IEEE-NEXT: v_fma_f32 v3, 
-v3, v9, v6 +; GFX10-IEEE-NEXT: v_div_fmas_f32 v2, v2, v4, v7 ; GFX10-IEEE-NEXT: s_mov_b32 vcc_lo, s4 -; GFX10-IEEE-NEXT: v_div_fmas_f32 v3, v6, v5, v9 +; GFX10-IEEE-NEXT: v_div_fmas_f32 v3, v3, v5, v9 ; GFX10-IEEE-NEXT: v_div_fixup_f32 v0, v2, v0, 1.0 ; GFX10-IEEE-NEXT: v_div_fixup_f32 v1, v3, v1, 1.0 ; GFX10-IEEE-NEXT: s_setpc_b64 s[30:31] @@ -1078,24 +1078,24 @@ define <2 x float> @v_rcp_v2f32(<2 x float> %x) { ; GFX10-FLUSH-NEXT: v_fma_f32 v5, -v2, v3, 1.0 ; GFX10-FLUSH-NEXT: v_fmac_f32_e32 v3, v5, v3 ; GFX10-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v3 -; GFX10-FLUSH-NEXT: v_fma_f32 v6, v5, -v2, v4 +; GFX10-FLUSH-NEXT: v_fma_f32 v6, -v2, v5, v4 ; GFX10-FLUSH-NEXT: v_fmac_f32_e32 v5, v6, v3 -; GFX10-FLUSH-NEXT: v_fmac_f32_e64 v4, -v2, v5 +; GFX10-FLUSH-NEXT: v_fma_f32 v2, -v2, v5, v4 ; GFX10-FLUSH-NEXT: s_denorm_mode 0 -; GFX10-FLUSH-NEXT: v_div_scale_f32 v2, s4, v1, v1, 1.0 -; GFX10-FLUSH-NEXT: v_div_fmas_f32 v3, v4, v3, v5 -; GFX10-FLUSH-NEXT: v_rcp_f32_e32 v4, v2 -; GFX10-FLUSH-NEXT: v_div_fixup_f32 v0, v3, v0, 1.0 -; GFX10-FLUSH-NEXT: v_div_scale_f32 v3, vcc_lo, 1.0, v1, 1.0 +; GFX10-FLUSH-NEXT: v_div_scale_f32 v4, s4, v1, v1, 1.0 +; GFX10-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v5 +; GFX10-FLUSH-NEXT: v_rcp_f32_e32 v3, v4 +; GFX10-FLUSH-NEXT: v_div_fixup_f32 v0, v2, v0, 1.0 +; GFX10-FLUSH-NEXT: v_div_scale_f32 v2, vcc_lo, 1.0, v1, 1.0 ; GFX10-FLUSH-NEXT: s_denorm_mode 3 -; GFX10-FLUSH-NEXT: v_fma_f32 v5, -v2, v4, 1.0 -; GFX10-FLUSH-NEXT: v_fmac_f32_e32 v4, v5, v4 -; GFX10-FLUSH-NEXT: v_mul_f32_e32 v5, v3, v4 -; GFX10-FLUSH-NEXT: v_fma_f32 v6, v5, -v2, v3 -; GFX10-FLUSH-NEXT: v_fmac_f32_e32 v5, v6, v4 -; GFX10-FLUSH-NEXT: v_fmac_f32_e64 v3, -v2, v5 +; GFX10-FLUSH-NEXT: v_fma_f32 v5, -v4, v3, 1.0 +; GFX10-FLUSH-NEXT: v_fmac_f32_e32 v3, v5, v3 +; GFX10-FLUSH-NEXT: v_mul_f32_e32 v5, v2, v3 +; GFX10-FLUSH-NEXT: v_fma_f32 v6, -v4, v5, v2 +; GFX10-FLUSH-NEXT: v_fmac_f32_e32 v5, v6, v3 +; GFX10-FLUSH-NEXT: v_fma_f32 v2, -v4, v5, v2 ; GFX10-FLUSH-NEXT: s_denorm_mode 0 -; GFX10-FLUSH-NEXT: v_div_fmas_f32 v2, v3, v4, v5 +; GFX10-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v5 ; GFX10-FLUSH-NEXT: v_div_fixup_f32 v1, v2, v1, 1.0 ; GFX10-FLUSH-NEXT: s_setpc_b64 s[30:31] %fdiv = fdiv <2 x float> , %x @@ -1236,15 +1236,15 @@ define <2 x float> @v_rcp_v2f32_arcp(<2 x float> %x) { ; GFX10-IEEE-NEXT: v_fmac_f32_e32 v5, v7, v5 ; GFX10-IEEE-NEXT: v_mul_f32_e32 v7, v8, v4 ; GFX10-IEEE-NEXT: v_mul_f32_e32 v9, v6, v5 -; GFX10-IEEE-NEXT: v_fma_f32 v10, v7, -v2, v8 -; GFX10-IEEE-NEXT: v_fma_f32 v11, v9, -v3, v6 +; GFX10-IEEE-NEXT: v_fma_f32 v10, -v2, v7, v8 +; GFX10-IEEE-NEXT: v_fma_f32 v11, -v3, v9, v6 ; GFX10-IEEE-NEXT: v_fmac_f32_e32 v7, v10, v4 ; GFX10-IEEE-NEXT: v_fmac_f32_e32 v9, v11, v5 -; GFX10-IEEE-NEXT: v_fmac_f32_e64 v8, -v2, v7 -; GFX10-IEEE-NEXT: v_fmac_f32_e64 v6, -v3, v9 -; GFX10-IEEE-NEXT: v_div_fmas_f32 v2, v8, v4, v7 +; GFX10-IEEE-NEXT: v_fma_f32 v2, -v2, v7, v8 +; GFX10-IEEE-NEXT: v_fma_f32 v3, -v3, v9, v6 +; GFX10-IEEE-NEXT: v_div_fmas_f32 v2, v2, v4, v7 ; GFX10-IEEE-NEXT: s_mov_b32 vcc_lo, s4 -; GFX10-IEEE-NEXT: v_div_fmas_f32 v3, v6, v5, v9 +; GFX10-IEEE-NEXT: v_div_fmas_f32 v3, v3, v5, v9 ; GFX10-IEEE-NEXT: v_div_fixup_f32 v0, v2, v0, 1.0 ; GFX10-IEEE-NEXT: v_div_fixup_f32 v1, v3, v1, 1.0 ; GFX10-IEEE-NEXT: s_setpc_b64 s[30:31] @@ -1260,24 +1260,24 @@ define <2 x float> @v_rcp_v2f32_arcp(<2 x float> %x) { ; GFX10-FLUSH-NEXT: v_fma_f32 v5, -v2, v3, 1.0 ; GFX10-FLUSH-NEXT: v_fmac_f32_e32 v3, v5, v3 ; GFX10-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v3 -; GFX10-FLUSH-NEXT: v_fma_f32 v6, v5, -v2, v4 +; 
GFX10-FLUSH-NEXT: v_fma_f32 v6, -v2, v5, v4 ; GFX10-FLUSH-NEXT: v_fmac_f32_e32 v5, v6, v3 -; GFX10-FLUSH-NEXT: v_fmac_f32_e64 v4, -v2, v5 +; GFX10-FLUSH-NEXT: v_fma_f32 v2, -v2, v5, v4 ; GFX10-FLUSH-NEXT: s_denorm_mode 0 -; GFX10-FLUSH-NEXT: v_div_scale_f32 v2, s4, v1, v1, 1.0 -; GFX10-FLUSH-NEXT: v_div_fmas_f32 v3, v4, v3, v5 -; GFX10-FLUSH-NEXT: v_rcp_f32_e32 v4, v2 -; GFX10-FLUSH-NEXT: v_div_fixup_f32 v0, v3, v0, 1.0 -; GFX10-FLUSH-NEXT: v_div_scale_f32 v3, vcc_lo, 1.0, v1, 1.0 +; GFX10-FLUSH-NEXT: v_div_scale_f32 v4, s4, v1, v1, 1.0 +; GFX10-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v5 +; GFX10-FLUSH-NEXT: v_rcp_f32_e32 v3, v4 +; GFX10-FLUSH-NEXT: v_div_fixup_f32 v0, v2, v0, 1.0 +; GFX10-FLUSH-NEXT: v_div_scale_f32 v2, vcc_lo, 1.0, v1, 1.0 ; GFX10-FLUSH-NEXT: s_denorm_mode 3 -; GFX10-FLUSH-NEXT: v_fma_f32 v5, -v2, v4, 1.0 -; GFX10-FLUSH-NEXT: v_fmac_f32_e32 v4, v5, v4 -; GFX10-FLUSH-NEXT: v_mul_f32_e32 v5, v3, v4 -; GFX10-FLUSH-NEXT: v_fma_f32 v6, v5, -v2, v3 -; GFX10-FLUSH-NEXT: v_fmac_f32_e32 v5, v6, v4 -; GFX10-FLUSH-NEXT: v_fmac_f32_e64 v3, -v2, v5 +; GFX10-FLUSH-NEXT: v_fma_f32 v5, -v4, v3, 1.0 +; GFX10-FLUSH-NEXT: v_fmac_f32_e32 v3, v5, v3 +; GFX10-FLUSH-NEXT: v_mul_f32_e32 v5, v2, v3 +; GFX10-FLUSH-NEXT: v_fma_f32 v6, -v4, v5, v2 +; GFX10-FLUSH-NEXT: v_fmac_f32_e32 v5, v6, v3 +; GFX10-FLUSH-NEXT: v_fma_f32 v2, -v4, v5, v2 ; GFX10-FLUSH-NEXT: s_denorm_mode 0 -; GFX10-FLUSH-NEXT: v_div_fmas_f32 v2, v3, v4, v5 +; GFX10-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v5 ; GFX10-FLUSH-NEXT: v_div_fixup_f32 v1, v2, v1, 1.0 ; GFX10-FLUSH-NEXT: s_setpc_b64 s[30:31] %fdiv = fdiv arcp <2 x float> , %x @@ -1475,15 +1475,15 @@ define <2 x float> @v_fdiv_v2f32_arcp_ulp25(<2 x float> %a, <2 x float> %b) { ; GFX10-IEEE-NEXT: v_fmac_f32_e32 v7, v9, v7 ; GFX10-IEEE-NEXT: v_mul_f32_e32 v9, v10, v6 ; GFX10-IEEE-NEXT: v_mul_f32_e32 v11, v8, v7 -; GFX10-IEEE-NEXT: v_fma_f32 v12, v9, -v4, v10 -; GFX10-IEEE-NEXT: v_fma_f32 v13, v11, -v5, v8 +; GFX10-IEEE-NEXT: v_fma_f32 v12, -v4, v9, v10 +; GFX10-IEEE-NEXT: v_fma_f32 v13, -v5, v11, v8 ; GFX10-IEEE-NEXT: v_fmac_f32_e32 v9, v12, v6 ; GFX10-IEEE-NEXT: v_fmac_f32_e32 v11, v13, v7 -; GFX10-IEEE-NEXT: v_fmac_f32_e64 v10, -v4, v9 -; GFX10-IEEE-NEXT: v_fmac_f32_e64 v8, -v5, v11 -; GFX10-IEEE-NEXT: v_div_fmas_f32 v4, v10, v6, v9 +; GFX10-IEEE-NEXT: v_fma_f32 v4, -v4, v9, v10 +; GFX10-IEEE-NEXT: v_fma_f32 v5, -v5, v11, v8 +; GFX10-IEEE-NEXT: v_div_fmas_f32 v4, v4, v6, v9 ; GFX10-IEEE-NEXT: s_mov_b32 vcc_lo, s4 -; GFX10-IEEE-NEXT: v_div_fmas_f32 v5, v8, v7, v11 +; GFX10-IEEE-NEXT: v_div_fmas_f32 v5, v5, v7, v11 ; GFX10-IEEE-NEXT: v_div_fixup_f32 v0, v4, v2, v0 ; GFX10-IEEE-NEXT: v_div_fixup_f32 v1, v5, v3, v1 ; GFX10-IEEE-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fma.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fma.ll index 4e76638..d5b36e0 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fma.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fma.ll @@ -498,7 +498,7 @@ define float @v_fma_f32_fabs_lhs(float %x, float %y, float %z) { ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_fma_f32 v0, v1, |v0|, v2 +; GFX10-NEXT: v_fma_f32 v0, |v0|, v1, v2 ; GFX10-NEXT: s_setpc_b64 s[30:31] %fabs.x = call float @llvm.fabs.f32(float %x) %fma = call float @llvm.fma.f32(float %fabs.x, float %y, float %z) @@ -528,7 +528,7 @@ define float @v_fma_f32_fabs_rhs(float %x, float %y, float %z) { ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: 
s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_fma_f32 v0, |v1|, v0, v2 +; GFX10-NEXT: v_fma_f32 v0, v0, |v1|, v2 ; GFX10-NEXT: s_setpc_b64 s[30:31] %fabs.y = call float @llvm.fabs.f32(float %y) %fma = call float @llvm.fma.f32(float %x, float %fabs.y, float %z) @@ -558,7 +558,7 @@ define float @v_fma_f32_fabs_lhs_rhs(float %x, float %y, float %z) { ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_fma_f32 v0, |v1|, |v0|, v2 +; GFX10-NEXT: v_fma_f32 v0, |v0|, |v1|, v2 ; GFX10-NEXT: s_setpc_b64 s[30:31] %fabs.x = call float @llvm.fabs.f32(float %x) %fabs.y = call float @llvm.fabs.f32(float %y) @@ -668,7 +668,7 @@ define float @v_fma_f32_fneg_lhs(float %x, float %y, float %z) { ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_fma_f32 v0, v1, -v0, v2 +; GFX10-NEXT: v_fma_f32 v0, -v0, v1, v2 ; GFX10-NEXT: s_setpc_b64 s[30:31] %neg.x = fneg float %x %fma = call float @llvm.fma.f32(float %neg.x, float %y, float %z) @@ -698,7 +698,7 @@ define float @v_fma_f32_fneg_rhs(float %x, float %y, float %z) { ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_fma_f32 v0, -v1, v0, v2 +; GFX10-NEXT: v_fma_f32 v0, v0, -v1, v2 ; GFX10-NEXT: s_setpc_b64 s[30:31] %neg.y = fneg float %y %fma = call float @llvm.fma.f32(float %x, float %neg.y, float %z) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fma.s32.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fma.s32.mir index 7e8b4a9..417937b 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fma.s32.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fma.s32.mir @@ -59,13 +59,13 @@ body: | ; GFX9-DL: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX9-DL-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; GFX9-DL-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GFX9-DL-NEXT: %4:vgpr_32 = nofpexcept V_FMAC_F32_e64 1, [[COPY]], 0, [[COPY1]], 0, [[COPY2]], 0, 0, implicit $mode, implicit $exec + ; GFX9-DL-NEXT: %4:vgpr_32 = nofpexcept V_FMA_F32_e64 1, [[COPY]], 0, [[COPY1]], 0, [[COPY2]], 0, 0, implicit $mode, implicit $exec ; GFX9-DL-NEXT: S_ENDPGM 0, implicit %4 ; GFX10-LABEL: name: fma_f32_fneg_src0 ; GFX10: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; GFX10-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GFX10-NEXT: %4:vgpr_32 = nofpexcept V_FMAC_F32_e64 1, [[COPY]], 0, [[COPY1]], 0, [[COPY2]], 0, 0, implicit $mode, implicit $exec + ; GFX10-NEXT: %4:vgpr_32 = nofpexcept V_FMA_F32_e64 1, [[COPY]], 0, [[COPY1]], 0, [[COPY2]], 0, 0, implicit $mode, implicit $exec ; GFX10-NEXT: S_ENDPGM 0, implicit %4 %0:vgpr(s32) = COPY $vgpr0 %1:vgpr(s32) = COPY $vgpr1 @@ -96,13 +96,13 @@ body: | ; GFX9-DL: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX9-DL-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; GFX9-DL-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GFX9-DL-NEXT: %4:vgpr_32 = nofpexcept V_FMAC_F32_e64 0, [[COPY]], 1, [[COPY1]], 0, [[COPY2]], 0, 0, implicit $mode, implicit $exec + ; GFX9-DL-NEXT: %4:vgpr_32 = nofpexcept V_FMA_F32_e64 0, [[COPY]], 1, [[COPY1]], 0, [[COPY2]], 0, 0, implicit $mode, implicit $exec ; GFX9-DL-NEXT: S_ENDPGM 0, implicit %4 ; GFX10-LABEL: name: fma_f32_fneg_src1 ; GFX10: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; GFX10-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GFX10-NEXT: %4:vgpr_32 = 
nofpexcept V_FMAC_F32_e64 0, [[COPY]], 1, [[COPY1]], 0, [[COPY2]], 0, 0, implicit $mode, implicit $exec + ; GFX10-NEXT: %4:vgpr_32 = nofpexcept V_FMA_F32_e64 0, [[COPY]], 1, [[COPY1]], 0, [[COPY2]], 0, 0, implicit $mode, implicit $exec ; GFX10-NEXT: S_ENDPGM 0, implicit %4 %0:vgpr(s32) = COPY $vgpr0 %1:vgpr(s32) = COPY $vgpr1 diff --git a/llvm/test/CodeGen/AMDGPU/dagcombine-fma-fmad.ll b/llvm/test/CodeGen/AMDGPU/dagcombine-fma-fmad.ll index 47f66c7..661b1ac 100644 --- a/llvm/test/CodeGen/AMDGPU/dagcombine-fma-fmad.ll +++ b/llvm/test/CodeGen/AMDGPU/dagcombine-fma-fmad.ll @@ -60,13 +60,13 @@ define amdgpu_ps float @_amdgpu_ps_main() #0 { ; GCN-NEXT: v_mac_f32_e32 v10, v7, v6 ; GCN-NEXT: v_mul_f32_e32 v1, v8, v6 ; GCN-NEXT: v_mul_f32_e32 v7, v6, v3 -; GCN-NEXT: v_fmac_f32_e64 v9, -v6, v3 +; GCN-NEXT: v_fma_f32 v3, -v6, v3, v9 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_f32_e32 v3, v4, v10 +; GCN-NEXT: v_add_f32_e32 v4, v4, v10 ; GCN-NEXT: v_fma_f32 v0, v2, s26, -v1 +; GCN-NEXT: v_fmac_f32_e32 v7, v3, v6 +; GCN-NEXT: v_mul_f32_e32 v3, v4, v6 ; GCN-NEXT: v_fma_f32 v4, v5, s0, 0x3ca3d70a -; GCN-NEXT: v_fmac_f32_e32 v7, v9, v6 -; GCN-NEXT: v_mul_f32_e32 v3, v3, v6 ; GCN-NEXT: v_fmac_f32_e32 v1, v0, v6 ; GCN-NEXT: v_mul_f32_e32 v0, v2, v6 ; GCN-NEXT: v_mul_f32_e32 v2, v7, v4 diff --git a/llvm/test/CodeGen/AMDGPU/fdiv.ll b/llvm/test/CodeGen/AMDGPU/fdiv.ll index e6df206..a11c864 100644 --- a/llvm/test/CodeGen/AMDGPU/fdiv.ll +++ b/llvm/test/CodeGen/AMDGPU/fdiv.ll @@ -59,9 +59,9 @@ entry: ; GFX10: v_fmac_f32_e32 [[B:v[0-9]+]], [[A]], [[NUM_RCP]] ; GFX10: v_div_scale_f32 [[DEN_SCALE:v[0-9]+]] ; GFX10: v_mul_f32_e32 [[C:v[0-9]+]], [[DEN_SCALE]], [[B]] -; GFX10: v_fma_f32 [[D:v[0-9]+]], [[C]], -[[NUM_SCALE]], [[DEN_SCALE]] +; GFX10: v_fma_f32 [[D:v[0-9]+]], -[[NUM_SCALE]], [[C]], [[DEN_SCALE]] ; GFX10: v_fmac_f32_e32 [[E:v[0-9]+]], [[D]], [[B]] -; GFX10: v_fmac_f32_e64 [[F:v[0-9]+]], -[[NUM_SCALE]], [[E]] +; GFX10: v_fma_f32 [[F:v[0-9]+]], -[[NUM_SCALE]], [[E]], [[DEN_SCALE]] ; GFX10-NOT: s_denorm_mode ; GCN: v_div_fmas_f32 [[FMAS:v[0-9]+]], [[F]], [[B]], [[E]] @@ -331,9 +331,9 @@ entry: ; GFX10: v_fmac_f32_e32 [[B:v[0-9]+]], [[A]], [[NUM_RCP]] ; GFX10: v_div_scale_f32 [[DEN_SCALE:v[0-9]+]] ; GFX10: v_mul_f32_e32 [[C:v[0-9]+]], [[DEN_SCALE]], [[B]] -; GFX10: v_fma_f32 [[D:v[0-9]+]], [[C]], -[[NUM_SCALE]], [[DEN_SCALE]] +; GFX10: v_fma_f32 [[D:v[0-9]+]], -[[NUM_SCALE]], [[C]], [[DEN_SCALE]] ; GFX10: v_fmac_f32_e32 [[E:v[0-9]+]], [[D]], [[B]] -; GFX10: v_fmac_f32_e64 [[F:v[0-9]+]], -[[NUM_SCALE]], [[E]] +; GFX10: v_fma_f32 [[F:v[0-9]+]], -[[NUM_SCALE]], [[E]], [[DEN_SCALE]] ; GFX10-NOT: s_denorm_mode ; GCN: v_div_fmas_f32 [[FMAS:v[0-9]+]], [[F]], [[B]], [[E]] diff --git a/llvm/test/CodeGen/AMDGPU/fma.f64.ll b/llvm/test/CodeGen/AMDGPU/fma.f64.ll index b98ee9c..fa7d902 100644 --- a/llvm/test/CodeGen/AMDGPU/fma.f64.ll +++ b/llvm/test/CodeGen/AMDGPU/fma.f64.ll @@ -1,6 +1,6 @@ -; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx90a -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GFX90A -check-prefix=FUNC %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefixes=FUNC,GCN,SI %s +; RUN: 
llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=FUNC,GCN,SI %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx90a -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=FUNC,GCN,GFX90A %s declare double @llvm.fma.f64(double, double, double) nounwind readnone declare <2 x double> @llvm.fma.v2f64(<2 x double>, <2 x double>, <2 x double>) nounwind readnone @@ -55,8 +55,7 @@ define amdgpu_kernel void @fma_v4f64(<4 x double> addrspace(1)* %out, <4 x doubl } ; FUNC-LABEL: {{^}}fma_f64_abs_src0: -; SI: v_fma_f64 {{v\[[0-9]+:[0-9]+\], |v\[[0-9]+:[0-9]+\]|, v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\]}} -; GFX90A: v_fmac_f64_e64 {{v\[[0-9]+:[0-9]+\], |v\[[0-9]+:[0-9]+\]|, v\[[0-9]+:[0-9]+\]}} +; GCN: v_fma_f64 {{v\[[0-9]+:[0-9]+\], |v\[[0-9]+:[0-9]+\]|, v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\]}} define amdgpu_kernel void @fma_f64_abs_src0(double addrspace(1)* %out, double addrspace(1)* %in1, double addrspace(1)* %in2, double addrspace(1)* %in3) { %r0 = load double, double addrspace(1)* %in1 @@ -69,8 +68,7 @@ define amdgpu_kernel void @fma_f64_abs_src0(double addrspace(1)* %out, double ad } ; FUNC-LABEL: {{^}}fma_f64_abs_src1: -; SI: v_fma_f64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], \|v\[[0-9]+:[0-9]+\]\|, v\[[0-9]+:[0-9]+\]}} -; GFX90A: v_fmac_f64_e64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], \|v\[[0-9]+:[0-9]+\]\|}} +; GCN: v_fma_f64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], \|v\[[0-9]+:[0-9]+\]\|, v\[[0-9]+:[0-9]+\]}} define amdgpu_kernel void @fma_f64_abs_src1(double addrspace(1)* %out, double addrspace(1)* %in1, double addrspace(1)* %in2, double addrspace(1)* %in3) { %r0 = load double, double addrspace(1)* %in1 @@ -97,8 +95,7 @@ define amdgpu_kernel void @fma_f64_abs_src2(double addrspace(1)* %out, double ad } ; FUNC-LABEL: {{^}}fma_f64_neg_src0: -; SI: v_fma_f64 {{v\[[0-9]+:[0-9]+\], -v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\]}} -; GFX90A: v_fmac_f64_e64 {{v\[[0-9]+:[0-9]+\], -v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\]}} +; GCN: v_fma_f64 {{v\[[0-9]+:[0-9]+\], -v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\]}} define amdgpu_kernel void @fma_f64_neg_src0(double addrspace(1)* %out, double addrspace(1)* %in1, double addrspace(1)* %in2, double addrspace(1)* %in3) { %r0 = load double, double addrspace(1)* %in1 @@ -111,8 +108,7 @@ define amdgpu_kernel void @fma_f64_neg_src0(double addrspace(1)* %out, double ad } ; FUNC-LABEL: {{^}}fma_f64_neg_src1: -; SI: v_fma_f64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], -v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\]}} -; GFX90A: v_fmac_f64_e64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], -v\[[0-9]+:[0-9]+\]}} +; GCN: v_fma_f64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], -v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\]}} define amdgpu_kernel void @fma_f64_neg_src1(double addrspace(1)* %out, double addrspace(1)* %in1, double addrspace(1)* %in2, double addrspace(1)* %in3) { %r0 = load double, double addrspace(1)* %in1 @@ -139,8 +135,7 @@ define amdgpu_kernel void @fma_f64_neg_src2(double addrspace(1)* %out, double ad } ; FUNC-LABEL: {{^}}fma_f64_abs_neg_src0: -; SI: v_fma_f64 {{v\[[0-9]+:[0-9]+\], -\|v\[[0-9]+:[0-9]+\]\|, v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\]}} -; GFX90A: v_fmac_f64_e64 {{v\[[0-9]+:[0-9]+\], -\|v\[[0-9]+:[0-9]+\]\|, v\[[0-9]+:[0-9]+\]}} +; GCN: v_fma_f64 {{v\[[0-9]+:[0-9]+\], -\|v\[[0-9]+:[0-9]+\]\|, v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\]}} define amdgpu_kernel void @fma_f64_abs_neg_src0(double addrspace(1)* 
%out, double addrspace(1)* %in1, double addrspace(1)* %in2, double addrspace(1)* %in3) { %r0 = load double, double addrspace(1)* %in1 @@ -154,8 +149,7 @@ define amdgpu_kernel void @fma_f64_abs_neg_src0(double addrspace(1)* %out, doubl } ; FUNC-LABEL: {{^}}fma_f64_abs_neg_src1: -; SI: v_fma_f64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], -\|v\[[0-9]+:[0-9]+\]\|, v\[[0-9]+:[0-9]+\]}} -; GFX90A: v_fmac_f64_e64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], -\|v\[[0-9]+:[0-9]+\]\|}} +; GCN: v_fma_f64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], -\|v\[[0-9]+:[0-9]+\]\|, v\[[0-9]+:[0-9]+\]}} define amdgpu_kernel void @fma_f64_abs_neg_src1(double addrspace(1)* %out, double addrspace(1)* %in1, double addrspace(1)* %in2, double addrspace(1)* %in3) { %r0 = load double, double addrspace(1)* %in1 diff --git a/llvm/test/CodeGen/AMDGPU/fmad-formation-fmul-distribute-denormal-mode.ll b/llvm/test/CodeGen/AMDGPU/fmad-formation-fmul-distribute-denormal-mode.ll index 84d0f66..00decf3 100644 --- a/llvm/test/CodeGen/AMDGPU/fmad-formation-fmul-distribute-denormal-mode.ll +++ b/llvm/test/CodeGen/AMDGPU/fmad-formation-fmul-distribute-denormal-mode.ll @@ -71,7 +71,7 @@ define float @unsafe_fmul_fsub_distribute_fast_f32(float %arg0, float %arg1) #0 ; FMAGFX10: ; %bb.0: ; FMAGFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; FMAGFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; FMAGFX10-NEXT: v_fmac_f32_e64 v0, -v1, v0 +; FMAGFX10-NEXT: v_fma_f32 v0, -v1, v0, v0 ; FMAGFX10-NEXT: s_setpc_b64 s[30:31] ; ; FMAD-LABEL: unsafe_fmul_fsub_distribute_fast_f32: @@ -84,7 +84,7 @@ define float @unsafe_fmul_fsub_distribute_fast_f32(float %arg0, float %arg1) #0 ; FMADGFX10: ; %bb.0: ; FMADGFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; FMADGFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; FMADGFX10-NEXT: v_fmac_f32_e64 v0, -v1, v0 +; FMADGFX10-NEXT: v_fma_f32 v0, -v1, v0, v0 ; FMADGFX10-NEXT: s_setpc_b64 s[30:31] %add = fsub fast float 1.0, %arg1 %tmp1 = fmul fast float %arg0, %add @@ -156,8 +156,8 @@ define <2 x float> @unsafe_fmul_fsub_distribute_fast_v2f32(<2 x float> %arg0, <2 ; FMAGFX10: ; %bb.0: ; FMAGFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; FMAGFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; FMAGFX10-NEXT: v_fmac_f32_e64 v0, -v2, v0 -; FMAGFX10-NEXT: v_fmac_f32_e64 v1, -v3, v1 +; FMAGFX10-NEXT: v_fma_f32 v0, -v2, v0, v0 +; FMAGFX10-NEXT: v_fma_f32 v1, -v3, v1, v1 ; FMAGFX10-NEXT: s_setpc_b64 s[30:31] ; ; FMAD-LABEL: unsafe_fmul_fsub_distribute_fast_v2f32: @@ -171,8 +171,8 @@ define <2 x float> @unsafe_fmul_fsub_distribute_fast_v2f32(<2 x float> %arg0, <2 ; FMADGFX10: ; %bb.0: ; FMADGFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; FMADGFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; FMADGFX10-NEXT: v_fmac_f32_e64 v0, -v2, v0 -; FMADGFX10-NEXT: v_fmac_f32_e64 v1, -v3, v1 +; FMADGFX10-NEXT: v_fma_f32 v0, -v2, v0, v0 +; FMADGFX10-NEXT: v_fma_f32 v1, -v3, v1, v1 ; FMADGFX10-NEXT: s_setpc_b64 s[30:31] %add = fsub fast <2 x float> , %arg1 %tmp1 = fmul fast <2 x float> %arg0, %add @@ -236,7 +236,7 @@ define <2 x float> @unsafe_fast_fmul_fsub_ditribute_post_legalize(float %arg0, < ; FMAGFX10: ; %bb.0: ; FMAGFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; FMAGFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; FMAGFX10-NEXT: v_fma_f32 v0, v1, -v0, v1 +; FMAGFX10-NEXT: v_fma_f32 v0, -v0, v1, v1 ; FMAGFX10-NEXT: s_setpc_b64 s[30:31] ; ; FMAD-LABEL: unsafe_fast_fmul_fsub_ditribute_post_legalize: diff --git a/llvm/test/CodeGen/AMDGPU/fmuladd.f16.ll b/llvm/test/CodeGen/AMDGPU/fmuladd.f16.ll index c18da03..4623129 100644 --- 
a/llvm/test/CodeGen/AMDGPU/fmuladd.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fmuladd.f16.ll @@ -360,8 +360,7 @@ define amdgpu_kernel void @mad_sub_f16(half addrspace(1)* noalias nocapture %out ; GCN: {{buffer|flat|global}}_load_ushort [[REGC:v[0-9]+]] ; VI-FLUSH: v_mad_f16 [[RESULT:v[0-9]+]], -[[REGA]], [[REGB]], [[REGC]] -; VI-DENORM-CONTRACT: v_fma_f16 [[RESULT:v[0-9]+]], -[[REGA]], [[REGB]], [[REGC]] -; GFX10-DENORM-CONTRACT: v_fmac_f16_e64 [[REGC]], -[[REGA]], [[REGB]] +; GCN-DENORM-CONTRACT: v_fma_f16 [[RESULT:v[0-9]+]], -[[REGA]], [[REGB]], [[REGC]] ; GCN-DENORM-STRICT: v_mul_f16_e32 [[TMP:v[0-9]+]], [[REGA]], [[REGB]] ; GCN-DENORM-STRICT: v_sub_f16_e32 [[RESULT:v[0-9]+]], [[REGC]], [[TMP]] @@ -370,9 +369,7 @@ define amdgpu_kernel void @mad_sub_f16(half addrspace(1)* noalias nocapture %out ; GFX10-FLUSH: v_mul_f16_e32 [[TMP:v[0-9]+]], [[REGA]], [[REGB]] ; GFX10-FLUSH: v_sub_f16_e32 [[RESULT:v[0-9]+]], [[REGC]], [[TMP]] -; GFX10-FLUSH: global_store_short v{{[0-9]+}}, [[RESULT]] -; GFX10-DENORM-STRICT: global_store_short v{{[0-9]+}}, [[RESULT]] -; GFX10-DENORM-CONTRACT: global_store_short v{{[0-9]+}}, [[REGC]] +; GFX10: global_store_short v{{[0-9]+}}, [[RESULT]] define amdgpu_kernel void @mad_sub_inv_f16(half addrspace(1)* noalias nocapture %out, half addrspace(1)* noalias nocapture readonly %ptr) #1 { %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0 %tid.ext = sext i32 %tid to i64 diff --git a/llvm/test/CodeGen/AMDGPU/frem.ll b/llvm/test/CodeGen/AMDGPU/frem.ll index 061fb56..942f23e 100644 --- a/llvm/test/CodeGen/AMDGPU/frem.ll +++ b/llvm/test/CodeGen/AMDGPU/frem.ll @@ -156,7 +156,7 @@ define amdgpu_kernel void @frem_f16(half addrspace(1)* %out, half addrspace(1)* ; GFX10-NEXT: v_cvt_f16_f32_e32 v3, v3 ; GFX10-NEXT: v_div_fixup_f16 v3, v3, v2, v1 ; GFX10-NEXT: v_trunc_f16_e32 v3, v3 -; GFX10-NEXT: v_fmac_f16_e64 v1, -v3, v2 +; GFX10-NEXT: v_fma_f16 v1, -v3, v2, v1 ; GFX10-NEXT: global_store_short v0, v1, s[4:5] ; GFX10-NEXT: s_endpgm half addrspace(1)* %in2) #0 { @@ -280,7 +280,7 @@ define amdgpu_kernel void @fast_frem_f16(half addrspace(1)* %out, half addrspace ; GFX10-NEXT: v_rcp_f16_e32 v3, v2 ; GFX10-NEXT: v_mul_f16_e32 v3, v1, v3 ; GFX10-NEXT: v_trunc_f16_e32 v3, v3 -; GFX10-NEXT: v_fmac_f16_e64 v1, -v3, v2 +; GFX10-NEXT: v_fma_f16 v1, -v3, v2, v1 ; GFX10-NEXT: global_store_short v0, v1, s[4:5] ; GFX10-NEXT: s_endpgm half addrspace(1)* %in2) #0 { @@ -404,7 +404,7 @@ define amdgpu_kernel void @unsafe_frem_f16(half addrspace(1)* %out, half addrspa ; GFX10-NEXT: v_rcp_f16_e32 v3, v2 ; GFX10-NEXT: v_mul_f16_e32 v3, v1, v3 ; GFX10-NEXT: v_trunc_f16_e32 v3, v3 -; GFX10-NEXT: v_fmac_f16_e64 v1, -v3, v2 +; GFX10-NEXT: v_fma_f16 v1, -v3, v2, v1 ; GFX10-NEXT: global_store_short v0, v1, s[4:5] ; GFX10-NEXT: s_endpgm half addrspace(1)* %in2) #1 { @@ -575,7 +575,7 @@ define amdgpu_kernel void @frem_f32(float addrspace(1)* %out, float addrspace(1) ; GFX10-NEXT: v_div_fmas_f32 v3, v3, v5, v6 ; GFX10-NEXT: v_div_fixup_f32 v3, v3, v2, v1 ; GFX10-NEXT: v_trunc_f32_e32 v3, v3 -; GFX10-NEXT: v_fmac_f32_e64 v1, -v3, v2 +; GFX10-NEXT: v_fma_f32 v1, -v3, v2, v1 ; GFX10-NEXT: global_store_dword v0, v1, s[4:5] ; GFX10-NEXT: s_endpgm float addrspace(1)* %in2) #0 { @@ -691,7 +691,7 @@ define amdgpu_kernel void @fast_frem_f32(float addrspace(1)* %out, float addrspa ; GFX10-NEXT: v_rcp_f32_e32 v3, v2 ; GFX10-NEXT: v_mul_f32_e32 v3, v1, v3 ; GFX10-NEXT: v_trunc_f32_e32 v3, v3 -; GFX10-NEXT: v_fmac_f32_e64 v1, -v3, v2 +; GFX10-NEXT: v_fma_f32 v1, -v3, v2, v1 ; GFX10-NEXT: global_store_dword v0, v1, s[4:5] ; 
GFX10-NEXT: s_endpgm float addrspace(1)* %in2) #0 { @@ -807,7 +807,7 @@ define amdgpu_kernel void @unsafe_frem_f32(float addrspace(1)* %out, float addrs ; GFX10-NEXT: v_rcp_f32_e32 v3, v2 ; GFX10-NEXT: v_mul_f32_e32 v3, v1, v3 ; GFX10-NEXT: v_trunc_f32_e32 v3, v3 -; GFX10-NEXT: v_fmac_f32_e64 v1, -v3, v2 +; GFX10-NEXT: v_fma_f32 v1, -v3, v2, v1 ; GFX10-NEXT: global_store_dword v0, v1, s[4:5] ; GFX10-NEXT: s_endpgm float addrspace(1)* %in2) #1 { @@ -1534,22 +1534,21 @@ define amdgpu_kernel void @frem_v2f16(<2 x half> addrspace(1)* %out, <2 x half> ; GFX10-NEXT: v_cvt_f32_f16_e32 v4, v2 ; GFX10-NEXT: v_rcp_f32_e32 v4, v4 ; GFX10-NEXT: v_mul_f32_e32 v3, v3, v4 -; GFX10-NEXT: v_mov_b32_e32 v4, v1 ; GFX10-NEXT: v_cvt_f16_f32_e32 v3, v3 ; GFX10-NEXT: v_div_fixup_f16 v3, v3, v2, v1 -; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; GFX10-NEXT: v_trunc_f16_e32 v3, v3 -; GFX10-NEXT: v_fmac_f16_e64 v4, -v3, v2 +; GFX10-NEXT: v_fma_f16 v3, -v3, v2, v1 ; GFX10-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX10-NEXT: v_cvt_f32_f16_e32 v3, v1 +; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; GFX10-NEXT: v_cvt_f32_f16_e32 v5, v2 +; GFX10-NEXT: v_cvt_f32_f16_e32 v4, v1 ; GFX10-NEXT: v_rcp_f32_e32 v5, v5 -; GFX10-NEXT: v_mul_f32_e32 v3, v3, v5 -; GFX10-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX10-NEXT: v_div_fixup_f16 v3, v3, v2, v1 -; GFX10-NEXT: v_trunc_f16_e32 v3, v3 -; GFX10-NEXT: v_fmac_f16_e64 v1, -v3, v2 -; GFX10-NEXT: v_pack_b32_f16 v1, v4, v1 +; GFX10-NEXT: v_mul_f32_e32 v4, v4, v5 +; GFX10-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX10-NEXT: v_div_fixup_f16 v4, v4, v2, v1 +; GFX10-NEXT: v_trunc_f16_e32 v4, v4 +; GFX10-NEXT: v_fma_f16 v1, -v4, v2, v1 +; GFX10-NEXT: v_pack_b32_f16 v1, v3, v1 ; GFX10-NEXT: global_store_dword v0, v1, s[4:5] ; GFX10-NEXT: s_endpgm <2 x half> addrspace(1)* %in2) #0 { @@ -1899,42 +1898,40 @@ define amdgpu_kernel void @frem_v4f16(<4 x half> addrspace(1)* %out, <4 x half> ; GFX10-NEXT: v_cvt_f32_f16_e32 v6, v3 ; GFX10-NEXT: v_rcp_f32_e32 v6, v6 ; GFX10-NEXT: v_mul_f32_e32 v5, v5, v6 -; GFX10-NEXT: v_mov_b32_e32 v6, v1 ; GFX10-NEXT: v_cvt_f16_f32_e32 v5, v5 ; GFX10-NEXT: v_div_fixup_f16 v5, v5, v3, v1 -; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; GFX10-NEXT: v_trunc_f16_e32 v5, v5 -; GFX10-NEXT: v_fmac_f16_e64 v6, -v5, v3 +; GFX10-NEXT: v_fma_f16 v5, -v5, v3, v1 ; GFX10-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX10-NEXT: v_cvt_f32_f16_e32 v5, v1 +; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; GFX10-NEXT: v_cvt_f32_f16_e32 v7, v3 +; GFX10-NEXT: v_cvt_f32_f16_e32 v6, v1 ; GFX10-NEXT: v_rcp_f32_e32 v7, v7 -; GFX10-NEXT: v_mul_f32_e32 v5, v5, v7 -; GFX10-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX10-NEXT: v_div_fixup_f16 v5, v5, v3, v1 -; GFX10-NEXT: v_trunc_f16_e32 v5, v5 -; GFX10-NEXT: v_fmac_f16_e64 v1, -v5, v3 -; GFX10-NEXT: v_cvt_f32_f16_e32 v5, v2 +; GFX10-NEXT: v_mul_f32_e32 v6, v6, v7 +; GFX10-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GFX10-NEXT: v_div_fixup_f16 v6, v6, v3, v1 +; GFX10-NEXT: v_trunc_f16_e32 v6, v6 +; GFX10-NEXT: v_fma_f16 v1, -v6, v3, v1 ; GFX10-NEXT: v_cvt_f32_f16_e32 v3, v0 -; GFX10-NEXT: v_pack_b32_f16 v1, v6, v1 +; GFX10-NEXT: v_pack_b32_f16 v1, v5, v1 +; GFX10-NEXT: v_cvt_f32_f16_e32 v5, v2 ; GFX10-NEXT: v_rcp_f32_e32 v5, v5 ; GFX10-NEXT: v_mul_f32_e32 v3, v3, v5 -; GFX10-NEXT: v_mov_b32_e32 v5, v0 ; GFX10-NEXT: v_cvt_f16_f32_e32 v3, v3 ; GFX10-NEXT: v_div_fixup_f16 v3, v3, v2, v0 -; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX10-NEXT: v_trunc_f16_e32 v3, v3 -; GFX10-NEXT: v_fmac_f16_e64 v5, -v3, v2 +; GFX10-NEXT: v_fma_f16 v3, -v3, v2, v0 ; GFX10-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; 
GFX10-NEXT: v_cvt_f32_f16_e32 v3, v0 +; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX10-NEXT: v_cvt_f32_f16_e32 v6, v2 +; GFX10-NEXT: v_cvt_f32_f16_e32 v5, v0 ; GFX10-NEXT: v_rcp_f32_e32 v6, v6 -; GFX10-NEXT: v_mul_f32_e32 v3, v3, v6 -; GFX10-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX10-NEXT: v_div_fixup_f16 v3, v3, v2, v0 -; GFX10-NEXT: v_trunc_f16_e32 v3, v3 -; GFX10-NEXT: v_fmac_f16_e64 v0, -v3, v2 -; GFX10-NEXT: v_pack_b32_f16 v0, v5, v0 +; GFX10-NEXT: v_mul_f32_e32 v5, v5, v6 +; GFX10-NEXT: v_cvt_f16_f32_e32 v5, v5 +; GFX10-NEXT: v_div_fixup_f16 v5, v5, v2, v0 +; GFX10-NEXT: v_trunc_f16_e32 v5, v5 +; GFX10-NEXT: v_fma_f16 v0, -v5, v2, v0 +; GFX10-NEXT: v_pack_b32_f16 v0, v3, v0 ; GFX10-NEXT: global_store_dwordx2 v4, v[0:1], s[4:5] ; GFX10-NEXT: s_endpgm <4 x half> addrspace(1)* %in2) #0 { @@ -2173,7 +2170,7 @@ define amdgpu_kernel void @frem_v2f32(<2 x float> addrspace(1)* %out, <2 x float ; GFX10-NEXT: v_div_fmas_f32 v5, v5, v7, v8 ; GFX10-NEXT: v_div_fixup_f32 v5, v5, v3, v1 ; GFX10-NEXT: v_trunc_f32_e32 v5, v5 -; GFX10-NEXT: v_fma_f32 v1, v3, -v5, v1 +; GFX10-NEXT: v_fma_f32 v1, -v5, v3, v1 ; GFX10-NEXT: v_div_scale_f32 v5, s0, v2, v2, v0 ; GFX10-NEXT: v_div_scale_f32 v3, vcc_lo, v0, v2, v0 ; GFX10-NEXT: v_rcp_f32_e32 v6, v5 @@ -2188,7 +2185,7 @@ define amdgpu_kernel void @frem_v2f32(<2 x float> addrspace(1)* %out, <2 x float ; GFX10-NEXT: v_div_fmas_f32 v3, v3, v6, v7 ; GFX10-NEXT: v_div_fixup_f32 v3, v3, v2, v0 ; GFX10-NEXT: v_trunc_f32_e32 v3, v3 -; GFX10-NEXT: v_fmac_f32_e64 v0, -v3, v2 +; GFX10-NEXT: v_fma_f32 v0, -v3, v2, v0 ; GFX10-NEXT: global_store_dwordx2 v4, v[0:1], s[4:5] ; GFX10-NEXT: s_endpgm <2 x float> addrspace(1)* %in2) #0 { @@ -2547,7 +2544,7 @@ define amdgpu_kernel void @frem_v4f32(<4 x float> addrspace(1)* %out, <4 x float ; GFX10-NEXT: v_div_fmas_f32 v9, v9, v11, v12 ; GFX10-NEXT: v_div_fixup_f32 v9, v9, v7, v3 ; GFX10-NEXT: v_trunc_f32_e32 v9, v9 -; GFX10-NEXT: v_fma_f32 v3, v7, -v9, v3 +; GFX10-NEXT: v_fma_f32 v3, -v9, v7, v3 ; GFX10-NEXT: v_div_scale_f32 v9, s0, v6, v6, v2 ; GFX10-NEXT: v_div_scale_f32 v7, vcc_lo, v2, v6, v2 ; GFX10-NEXT: v_rcp_f32_e32 v10, v9 @@ -2562,7 +2559,7 @@ define amdgpu_kernel void @frem_v4f32(<4 x float> addrspace(1)* %out, <4 x float ; GFX10-NEXT: v_div_fmas_f32 v7, v7, v10, v11 ; GFX10-NEXT: v_div_fixup_f32 v7, v7, v6, v2 ; GFX10-NEXT: v_trunc_f32_e32 v7, v7 -; GFX10-NEXT: v_fma_f32 v2, v6, -v7, v2 +; GFX10-NEXT: v_fma_f32 v2, -v7, v6, v2 ; GFX10-NEXT: v_div_scale_f32 v7, s0, v5, v5, v1 ; GFX10-NEXT: v_div_scale_f32 v6, vcc_lo, v1, v5, v1 ; GFX10-NEXT: v_rcp_f32_e32 v9, v7 @@ -2577,7 +2574,7 @@ define amdgpu_kernel void @frem_v4f32(<4 x float> addrspace(1)* %out, <4 x float ; GFX10-NEXT: v_div_fmas_f32 v6, v6, v9, v10 ; GFX10-NEXT: v_div_fixup_f32 v6, v6, v5, v1 ; GFX10-NEXT: v_trunc_f32_e32 v6, v6 -; GFX10-NEXT: v_fma_f32 v1, v5, -v6, v1 +; GFX10-NEXT: v_fma_f32 v1, -v6, v5, v1 ; GFX10-NEXT: v_div_scale_f32 v6, s0, v4, v4, v0 ; GFX10-NEXT: v_div_scale_f32 v5, vcc_lo, v0, v4, v0 ; GFX10-NEXT: v_rcp_f32_e32 v7, v6 @@ -2592,7 +2589,7 @@ define amdgpu_kernel void @frem_v4f32(<4 x float> addrspace(1)* %out, <4 x float ; GFX10-NEXT: v_div_fmas_f32 v5, v5, v7, v9 ; GFX10-NEXT: v_div_fixup_f32 v5, v5, v4, v0 ; GFX10-NEXT: v_trunc_f32_e32 v5, v5 -; GFX10-NEXT: v_fmac_f32_e64 v0, -v5, v4 +; GFX10-NEXT: v_fma_f32 v0, -v5, v4, v0 ; GFX10-NEXT: global_store_dwordx4 v8, v[0:3], s[4:5] ; GFX10-NEXT: s_endpgm <4 x float> addrspace(1)* %in2) #0 { diff --git a/llvm/test/CodeGen/AMDGPU/mad-mix.ll b/llvm/test/CodeGen/AMDGPU/mad-mix.ll index 
fa3df02..7240a5c 100644 --- a/llvm/test/CodeGen/AMDGPU/mad-mix.ll +++ b/llvm/test/CodeGen/AMDGPU/mad-mix.ll @@ -356,7 +356,7 @@ define float @no_mix_simple(float %src0, float %src1, float %src2) #0 { ; GCN: s_waitcnt ; CIVI-NEXT: v_mad_f32 v0, |v0|, v1, v2 ; GFX900-NEXT: v_mad_f32 v0, |v0|, v1, v2 -; GFX906-NEXT: v_fma_f32 v0, v1, |v0|, v2 +; GFX906-NEXT: v_fma_f32 v0, |v0|, v1, v2 ; GCN-NEXT: s_setpc_b64 define float @no_mix_simple_fabs(float %src0, float %src1, float %src2) #0 { %src0.fabs = call float @llvm.fabs.f32(float %src0) diff --git a/llvm/test/CodeGen/AMDGPU/strict_fma.f16.ll b/llvm/test/CodeGen/AMDGPU/strict_fma.f16.ll index 773838e..396f7b4 100644 --- a/llvm/test/CodeGen/AMDGPU/strict_fma.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/strict_fma.f16.ll @@ -176,8 +176,7 @@ define half @v_constained_fma_f16_fpexcept_strict_fneg_fneg(half %x, half %y, ha ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_fmac_f16_e64 v2, -v0, -v1 -; GFX10-NEXT: v_mov_b32_e32 v0, v2 +; GFX10-NEXT: v_fma_f16 v0, -v0, -v1, v2 ; GFX10-NEXT: s_setpc_b64 s[30:31] %neg.x = fneg half %x %neg.y = fneg half %y @@ -196,8 +195,7 @@ define half @v_constained_fma_f16_fpexcept_strict_fabs_fabs(half %x, half %y, ha ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_fmac_f16_e64 v2, |v0|, |v1| -; GFX10-NEXT: v_mov_b32_e32 v0, v2 +; GFX10-NEXT: v_fma_f16 v0, |v0|, |v1|, v2 ; GFX10-NEXT: s_setpc_b64 s[30:31] %neg.x = call half @llvm.fabs.f16(half %x) %neg.y = call half @llvm.fabs.f16(half %y) diff --git a/llvm/test/CodeGen/AMDGPU/strict_fma.f32.ll b/llvm/test/CodeGen/AMDGPU/strict_fma.f32.ll index 6d7531e..0c7d6e6 100644 --- a/llvm/test/CodeGen/AMDGPU/strict_fma.f32.ll +++ b/llvm/test/CodeGen/AMDGPU/strict_fma.f32.ll @@ -111,7 +111,7 @@ define float @v_constained_fma_f32_fpexcept_strict_fneg_fneg(float %x, float %y, ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_fma_f32 v0, -v1, -v0, v2 +; GFX10-NEXT: v_fma_f32 v0, -v0, -v1, v2 ; GFX10-NEXT: s_setpc_b64 s[30:31] %neg.x = fneg float %x %neg.y = fneg float %y @@ -130,7 +130,7 @@ define float @v_constained_fma_f32_fpexcept_strict_fabs_fabs(float %x, float %y, ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_fma_f32 v0, |v1|, |v0|, v2 +; GFX10-NEXT: v_fma_f32 v0, |v0|, |v1|, v2 ; GFX10-NEXT: s_setpc_b64 s[30:31] %neg.x = call float @llvm.fabs.f32(float %x) %neg.y = call float @llvm.fabs.f32(float %y) @@ -150,8 +150,8 @@ define <2 x float> @v_constained_fma_v2f32_fpexcept_strict_fneg_fneg(<2 x float> ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_fma_f32 v0, -v2, -v0, v4 -; GFX10-NEXT: v_fma_f32 v1, -v3, -v1, v5 +; GFX10-NEXT: v_fma_f32 v0, -v0, -v2, v4 +; GFX10-NEXT: v_fma_f32 v1, -v1, -v3, v5 ; GFX10-NEXT: s_setpc_b64 s[30:31] %neg.x = fneg <2 x float> %x %neg.y = fneg <2 x float> %y diff --git a/llvm/test/CodeGen/AMDGPU/udiv.ll b/llvm/test/CodeGen/AMDGPU/udiv.ll index 149d7b4..ca6190e 100644 --- a/llvm/test/CodeGen/AMDGPU/udiv.ll +++ b/llvm/test/CodeGen/AMDGPU/udiv.ll @@ -187,7 +187,7 @@ define amdgpu_kernel void @test_udiv_3_mulhu(i32 %p) { ; GCN-LABEL: {{^}}fdiv_test_denormals ; VI: v_mad_f32 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -; GFX1030: v_fmac_f32_e64 
v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}} +; GFX1030: v_fma_f32 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} define amdgpu_kernel void @fdiv_test_denormals(i8 addrspace(1)* nocapture readonly %arg) { bb: %tmp = load i8, i8 addrspace(1)* null, align 1 -- 2.7.4
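Illustration (not part of the patch): the motivation is easiest to see on a small IR reproducer in the style of the updated tests. The function below is hypothetical and the pre-patch selection shown in the comments is inferred from the test diffs above (compare the strict_fma.f16.ll hunk, where dropping v_fmac also drops a v_mov copy), not verified compiler output.

; Sketch of the case this patch targets: an fma whose first operand carries a
; source modifier (fneg). Try, for example:
;   llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs
;
; With a source modifier present, selecting the VOP3-only v_fma_f32 leaves the
; destination untied:
;   v_fma_f32 v0, -v0, v1, v2
; whereas v_fmac_f32_e64 ties $dst to $src2, which can force an extra copy when
; the result must end up in a different register than the addend, e.g.:
;   v_fmac_f32_e64 v2, -v0, v1
;   v_mov_b32_e32  v0, v2
define float @fma_fneg_src0(float %x, float %y, float %z) {
  %neg.x = fneg float %x
  %fma = call float @llvm.fma.f32(float %neg.x, float %y, float %z)
  ret float %fma
}

declare float @llvm.fma.f32(float, float, float)

When no source modifiers are used, the VOP3NoMods patterns above still select v_fmac, which can use the shorter VOP2 _e32 encoding, so nothing is lost in that case.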