From afa0ed33df0717e73232f72ff8acee279f285140 Mon Sep 17 00:00:00 2001 From: Jay Foad Date: Thu, 8 Sep 2022 14:36:23 +0100 Subject: [PATCH] [AMDGPU] Fix shrinking of F16 FMA on newer subtargets D125803 introduced shrinking of F16 FMA to FMAAK/FMAMK in SIShrinkInstructions (useful on GFX10+ where VOP3 instructions may have a literal operand) but failed to handle the V_FMA_F16_gfx9_e64 form of the opcode which is used on GFX9+. Differential Revision: https://reviews.llvm.org/D133489 --- llvm/lib/Target/AMDGPU/SIInstrInfo.cpp | 9 ++++++--- llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp | 5 ++++- llvm/test/CodeGen/AMDGPU/fma.f16.ll | 23 ++++++++--------------- llvm/test/CodeGen/AMDGPU/gfx10-shrink-mad-fma.mir | 8 ++++---- 4 files changed, 22 insertions(+), 23 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp index 73844ca..d699166 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -2887,12 +2887,15 @@ bool SIInstrInfo::isFoldableCopy(const MachineInstr &MI) { static constexpr unsigned ModifierOpNames[] = { AMDGPU::OpName::src0_modifiers, AMDGPU::OpName::src1_modifiers, AMDGPU::OpName::src2_modifiers, AMDGPU::OpName::clamp, - AMDGPU::OpName::omod}; + AMDGPU::OpName::omod, AMDGPU::OpName::op_sel}; void SIInstrInfo::removeModOperands(MachineInstr &MI) const { unsigned Opc = MI.getOpcode(); - for (unsigned Name : reverse(ModifierOpNames)) - MI.removeOperand(AMDGPU::getNamedOperandIdx(Opc, Name)); + for (unsigned Name : reverse(ModifierOpNames)) { + int Idx = AMDGPU::getNamedOperandIdx(Opc, Name); + if (Idx >= 0) + MI.removeOperand(Idx); + } } bool SIInstrInfo::FoldImmediate(MachineInstr &UseMI, MachineInstr &DefMI, diff --git a/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp b/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp index 05d2dd0..421799e4 100644 --- a/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp +++ b/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp @@ -382,6 +382,7 @@ void SIShrinkInstructions::shrinkMadFma(MachineInstr &MI) const { NewOpcode = AMDGPU::V_MADAK_F16; break; case AMDGPU::V_FMA_F16_e64: + case AMDGPU::V_FMA_F16_gfx9_e64: NewOpcode = AMDGPU::V_FMAAK_F16; break; } @@ -409,6 +410,7 @@ void SIShrinkInstructions::shrinkMadFma(MachineInstr &MI) const { NewOpcode = AMDGPU::V_MADMK_F16; break; case AMDGPU::V_FMA_F16_e64: + case AMDGPU::V_FMA_F16_gfx9_e64: NewOpcode = AMDGPU::V_FMAMK_F16; break; } @@ -852,7 +854,8 @@ bool SIShrinkInstructions::runOnMachineFunction(MachineFunction &MF) { if (MI.getOpcode() == AMDGPU::V_MAD_F32_e64 || MI.getOpcode() == AMDGPU::V_FMA_F32_e64 || MI.getOpcode() == AMDGPU::V_MAD_F16_e64 || - MI.getOpcode() == AMDGPU::V_FMA_F16_e64) { + MI.getOpcode() == AMDGPU::V_FMA_F16_e64 || + MI.getOpcode() == AMDGPU::V_FMA_F16_gfx9_e64) { shrinkMadFma(MI); continue; } diff --git a/llvm/test/CodeGen/AMDGPU/fma.f16.ll b/llvm/test/CodeGen/AMDGPU/fma.f16.ll index 20d39ef..d4eaf66 100644 --- a/llvm/test/CodeGen/AMDGPU/fma.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fma.f16.ll @@ -1,8 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -global-isel=0 -march=amdgcn -mcpu=gfx900 < %s | FileCheck %s -check-prefixes=GFX9,GFX9-SDAG ; RUN: llc -global-isel=1 -march=amdgcn -mcpu=gfx900 < %s | FileCheck %s -check-prefixes=GFX9,GFX9-GISEL -; RUN: llc -global-isel=0 -march=amdgcn -mcpu=gfx1010 < %s | FileCheck %s -check-prefixes=GFX10,GFX10-SDAG -; RUN: llc -global-isel=1 -march=amdgcn -mcpu=gfx1010 < %s | FileCheck %s -check-prefixes=GFX10,GFX10-GISEL +; RUN: llc -global-isel=0 -march=amdgcn -mcpu=gfx1010 < %s | FileCheck %s -check-prefixes=GFX10 +; RUN: llc -global-isel=1 -march=amdgcn -mcpu=gfx1010 < %s | FileCheck %s -check-prefixes=GFX10 declare half @llvm.fma.f16(half, half, half) declare half @llvm.maxnum.f16(half, half) @@ -58,19 +58,12 @@ define half @test_fmaak(half %x, half %y, half %z) { ; GFX9-GISEL-NEXT: v_fma_f16 v0, v0, v1, v2 ; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-SDAG-LABEL: test_fmaak: -; GFX10-SDAG: ; %bb.0: -; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-SDAG-NEXT: v_fmaak_f16 v0, v0, v1, 0x4200 -; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GFX10-GISEL-LABEL: test_fmaak: -; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-GISEL-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-GISEL-NEXT: v_fma_f16 v0, v0, v1, 0x4200 -; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31] +; GFX10-LABEL: test_fmaak: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_fmaak_f16 v0, v0, v1, 0x4200 +; GFX10-NEXT: s_setpc_b64 s[30:31] %r = call half @llvm.fma.f16(half %x, half %y, half 0xH4200) ret half %r } diff --git a/llvm/test/CodeGen/AMDGPU/gfx10-shrink-mad-fma.mir b/llvm/test/CodeGen/AMDGPU/gfx10-shrink-mad-fma.mir index 198c5cb..9ace3f1 100644 --- a/llvm/test/CodeGen/AMDGPU/gfx10-shrink-mad-fma.mir +++ b/llvm/test/CodeGen/AMDGPU/gfx10-shrink-mad-fma.mir @@ -192,7 +192,7 @@ body: | ; GFX10-NEXT: SI_RETURN implicit [[V_FMAMK_F16_]] %0:vgpr_32 = IMPLICIT_DEF %1:vgpr_32 = IMPLICIT_DEF - %2:vgpr_32 = V_FMA_F16_e64 0, 18688, 0, %0, 0, %1, 0, 0, implicit $mode, implicit $exec + %2:vgpr_32 = V_FMA_F16_gfx9_e64 0, 18688, 0, %0, 0, %1, 0, 0, implicit $mode, implicit $exec SI_RETURN implicit %2 ... @@ -207,7 +207,7 @@ body: | ; GFX10-NEXT: SI_RETURN implicit [[V_FMAMK_F16_]] %0:vgpr_32 = IMPLICIT_DEF %1:vgpr_32 = IMPLICIT_DEF - %2:vgpr_32 = V_FMA_F16_e64 0, %0, 0, 18688, 0, %1, 0, 0, implicit $mode, implicit $exec + %2:vgpr_32 = V_FMA_F16_gfx9_e64 0, %0, 0, 18688, 0, %1, 0, 0, implicit $mode, implicit $exec SI_RETURN implicit %2 ... @@ -222,7 +222,7 @@ body: | ; GFX10-NEXT: SI_RETURN implicit [[V_FMAAK_F16_]] %0:vgpr_32 = IMPLICIT_DEF %1:vgpr_32 = IMPLICIT_DEF - %2:vgpr_32 = V_FMA_F16_e64 0, %0, 0, %1, 0, 18688, 0, 0, implicit $mode, implicit $exec + %2:vgpr_32 = V_FMA_F16_gfx9_e64 0, %0, 0, %1, 0, 18688, 0, 0, implicit $mode, implicit $exec SI_RETURN implicit %2 ... @@ -237,6 +237,6 @@ body: | ; GFX10-NEXT: SI_RETURN implicit [[V_FMAAK_F16_]] %0:vgpr_32 = IMPLICIT_DEF %1:sreg_32 = IMPLICIT_DEF - %2:vgpr_32 = V_FMA_F16_e64 0, %0, 0, %1, 0, 18688, 0, 0, implicit $mode, implicit $exec + %2:vgpr_32 = V_FMA_F16_gfx9_e64 0, %0, 0, %1, 0, 18688, 0, 0, implicit $mode, implicit $exec SI_RETURN implicit %2 ... -- 2.7.4