[AMDGPU] Convert mac/fmac to mad/fma when folding output modifiers

author Jay Foad <jay.foad@amd.com>

Tue, 21 Sep 2021 12:03:23 +0000 (13:03 +0100)

committer Jay Foad <jay.foad@amd.com>

Wed, 22 Sep 2021 08:36:34 +0000 (09:36 +0100)
author Jay Foad <jay.foad@amd.com>
Tue, 21 Sep 2021 12:03:23 +0000 (13:03 +0100)
committer Jay Foad <jay.foad@amd.com>
Wed, 22 Sep 2021 08:36:34 +0000 (09:36 +0100)
diff --git a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp

index 6bf710f..6790132 100644 (file)
--- a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
@@ -1388,6 +1388,14 @@ bool SIFoldOperands::tryFoldClamp(MachineInstr &MI) {
    DefClamp->setImm(1);
    MRI->replaceRegWith(MI.getOperand(0).getReg(), Def->getOperand(0).getReg());
    MI.eraseFromParent();
+
+  // Use of output modifiers forces VOP3 encoding for a VOP2 mac/fmac
+  // instruction, so we might as well convert it to the more flexible VOP3-only
+  // mad/fma form.
+  MachineFunction::iterator MBBI = Def->getParent()->getIterator();
+  if (MachineInstr *NewMI = TII->convertToThreeAddress(MBBI, *Def, nullptr))
+    Def->eraseFromParent();
+
    return true;
  }
  
@@ -1526,6 +1534,14 @@ bool SIFoldOperands::tryFoldOMod(MachineInstr &MI) {
    DefOMod->setImm(OMod);
    MRI->replaceRegWith(MI.getOperand(0).getReg(), Def->getOperand(0).getReg());
    MI.eraseFromParent();
+
+  // Use of output modifiers forces VOP3 encoding for a VOP2 mac/fmac
+  // instruction, so we might as well convert it to the more flexible VOP3-only
+  // mad/fma form.
+  MachineFunction::iterator MBBI = Def->getParent()->getIterator();
+  if (MachineInstr *NewMI = TII->convertToThreeAddress(MBBI, *Def, nullptr))
+    Def->eraseFromParent();
+
    return true;
  }
  
diff --git a/llvm/test/CodeGen/AMDGPU/mad-mix-lo.ll b/llvm/test/CodeGen/AMDGPU/mad-mix-lo.ll

index 40cc3b2..ec9c22c 100644 (file)
--- a/llvm/test/CodeGen/AMDGPU/mad-mix-lo.ll
+++ b/llvm/test/CodeGen/AMDGPU/mad-mix-lo.ll
@@ -70,7 +70,7 @@ define half @v_mad_mixlo_f16_f16lo_f16lo_f32_clamp_post_cvt(half %src0, half %sr
  ; GFX9-NEXT: v_cvt_f16_f32_e32 v0, v0
  ; GFX9-NEXT: s_setpc_b64
  
-; CIVI: v_mac_f32_e64 v{{[0-9]}}, v{{[0-9]}}, v{{[0-9]}} clamp{{$}}
+; CIVI: v_mad_f32 v{{[0-9]}}, v{{[0-9]}}, v{{[0-9]}}, v{{[0-9]}} clamp{{$}}
  define half @v_mad_mixlo_f16_f16lo_f16lo_f32_clamp_pre_cvt(half %src0, half %src1, float %src2) #0 {
    %src0.ext = fpext half %src0 to float
    %src1.ext = fpext half %src1 to float
diff --git a/llvm/test/CodeGen/AMDGPU/mad-mix.ll b/llvm/test/CodeGen/AMDGPU/mad-mix.ll

index 7240a5c..6cf3732 100644 (file)
--- a/llvm/test/CodeGen/AMDGPU/mad-mix.ll
+++ b/llvm/test/CodeGen/AMDGPU/mad-mix.ll
@@ -328,8 +328,7 @@ define <2 x float> @v_mad_mix_v2f32_f32imminv2pi(<2 x half> %src0, <2 x half> %s
  ; GCN-LABEL: {{^}}v_mad_mix_clamp_f32_f16hi_f16hi_f16hi_elt:
  ; GFX900: v_mad_mix_f32 v0, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp ; encoding
  ; GFX906: v_fma_mix_f32 v0, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp ; encoding
-; VI: v_mac_f32_e64 v{{[0-9]}}, v{{[0-9]}}, v{{[0-9]}} clamp{{$}}
-; CI: v_mad_f32 v{{[0-9]}}, v{{[0-9]}}, v{{[0-9]}}, v{{[0-9]}} clamp{{$}}
+; CIVI: v_mad_f32 v{{[0-9]}}, v{{[0-9]}}, v{{[0-9]}}, v{{[0-9]}} clamp{{$}}
  define float @v_mad_mix_clamp_f32_f16hi_f16hi_f16hi_elt(<2 x half> %src0, <2 x half> %src1, <2 x half> %src2) #0 {
    %src0.hi = extractelement <2 x half> %src0, i32 1
    %src1.hi = extractelement <2 x half> %src1, i32 1
author	Jay Foad <jay.foad@amd.com>
	Tue, 21 Sep 2021 12:03:23 +0000 (13:03 +0100)
committer	Jay Foad <jay.foad@amd.com>
	Wed, 22 Sep 2021 08:36:34 +0000 (09:36 +0100)
llvm/lib/Target/AMDGPU/SIFoldOperands.cpp		patch | blob | history
llvm/test/CodeGen/AMDGPU/mad-mix-lo.ll		patch | blob | history
llvm/test/CodeGen/AMDGPU/mad-mix.ll		patch | blob | history