From e6605a209cc7785ae635fee1df7ff4c136d35c17 Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault@amd.com>
Date: Sun, 12 Apr 2020 09:47:03 -0400
Subject: [PATCH] DAG: Fix wrong legality check for ISD::FMAD

Since 1725f2884175ca618d29b06e35f5c6ebd618053d, this should check
isFMADLegalForFAddFSub rather than the the plain isOperationLegal.

This would assert in a subset of cases due to an oddity in how FMAD is
selected. We will allow FMA formation pre-legalize, but not FMAD even
in cases where it would be valid.

The current hook requires passing in the root fadd/fsub. However, in
this distributed case, this would be far more complicated to pass in
the relevant operand. AMDGPU doesn't get any value from the node, and
only needs the type and is the only implementor, so I'm not sure why
we have this complexity. Just rename and expand the assert to avoid
the more complicated checks spread through the distribution logic.
---
 llvm/include/llvm/CodeGen/TargetLowering.h         |  12 +-
 llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp      |   6 +-
 llvm/lib/Target/AMDGPU/SIISelLowering.cpp          |   4 +-
 llvm/lib/Target/AMDGPU/SIISelLowering.h            |   3 +-
 ...fmad-formation-fmul-distribute-denormal-mode.ll | 170 +++++++++++++++++++++
 5 files changed, 183 insertions(+), 12 deletions(-)
 create mode 100644 llvm/test/CodeGen/AMDGPU/fmad-formation-fmul-distribute-denormal-mode.ll

diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h
index df8799f..4eb8ba7 100644
--- a/llvm/include/llvm/CodeGen/TargetLowering.h
+++ b/llvm/include/llvm/CodeGen/TargetLowering.h
@@ -2642,11 +2642,13 @@ public:
     return false;
   }
 
-  /// Returns true if the FADD or FSUB node passed could legally be combined with
-  /// an fmul to form an ISD::FMAD.
-  virtual bool isFMADLegalForFAddFSub(const SelectionDAG &DAG,
-                                      const SDNode *N) const {
-    assert(N->getOpcode() == ISD::FADD || N->getOpcode() == ISD::FSUB);
+  /// Returns true if be combined with to form an ISD::FMAD. \p N may be an
+  /// ISD::FADD, ISD::FSUB, or an ISD::FMUL which will be distributed into an
+  /// fadd/fsub.
+  virtual bool isFMADLegal(const SelectionDAG &DAG, const SDNode *N) const {
+    assert((N->getOpcode() == ISD::FADD || N->getOpcode() == ISD::FSUB ||
+            N->getOpcode() == ISD::FMUL) &&
+           "unexpected node in FMAD forming combine");
     return isOperationLegal(ISD::FMAD, N->getValueType(0));
   }
 
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 60701af..c61c795 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -11728,7 +11728,7 @@ SDValue DAGCombiner::visitFADDForFMACombine(SDNode *N) {
   const TargetOptions &Options = DAG.getTarget().Options;
 
   // Floating-point multiply-add with intermediate rounding.
-  bool HasFMAD = (LegalOperations && TLI.isFMADLegalForFAddFSub(DAG, N));
+  bool HasFMAD = (LegalOperations && TLI.isFMADLegal(DAG, N));
 
   // Floating-point multiply-add without intermediate rounding.
   bool HasFMA =
@@ -11945,7 +11945,7 @@ SDValue DAGCombiner::visitFSUBForFMACombine(SDNode *N) {
 
   const TargetOptions &Options = DAG.getTarget().Options;
   // Floating-point multiply-add with intermediate rounding.
-  bool HasFMAD = (LegalOperations && TLI.isFMADLegalForFAddFSub(DAG, N));
+  bool HasFMAD = (LegalOperations && TLI.isFMADLegal(DAG, N));
 
   // Floating-point multiply-add without intermediate rounding.
   bool HasFMA =
@@ -12289,7 +12289,7 @@ SDValue DAGCombiner::visitFMULForFMADistributiveCombine(SDNode *N) {
   // Floating-point multiply-add with intermediate rounding. This can result
   // in a less precise result due to the changed rounding order.
   bool HasFMAD = Options.UnsafeFPMath &&
-                 (LegalOperations && TLI.isOperationLegal(ISD::FMAD, VT));
+                 (LegalOperations && TLI.isFMADLegal(DAG, N));
 
   // No valid opcode, do not combine.
   if (!HasFMAD && !HasFMA)
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 44306cf..196e361 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -3946,8 +3946,8 @@ bool SITargetLowering::isFMAFasterThanFMulAndFAdd(const MachineFunction &MF,
   return false;
 }
 
-bool SITargetLowering::isFMADLegalForFAddFSub(const SelectionDAG &DAG,
-                                              const SDNode *N) const {
+bool SITargetLowering::isFMADLegal(const SelectionDAG &DAG,
+                                   const SDNode *N) const {
   // TODO: Check future ftz flag
   // v_mad_f32/v_mac_f32 do not support denormals.
   EVT VT = N->getValueType(0);
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.h b/llvm/lib/Target/AMDGPU/SIISelLowering.h
index 1e02fcd..2260034 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.h
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.h
@@ -358,8 +358,7 @@ public:
   MVT getScalarShiftAmountTy(const DataLayout &, EVT) const override;
   bool isFMAFasterThanFMulAndFAdd(const MachineFunction &MF,
                                   EVT VT) const override;
-  bool isFMADLegalForFAddFSub(const SelectionDAG &DAG,
-                              const SDNode *N) const override;
+  bool isFMADLegal(const SelectionDAG &DAG, const SDNode *N) const override;
 
   SDValue splitUnaryVectorOp(SDValue Op, SelectionDAG &DAG) const;
   SDValue splitBinaryVectorOp(SDValue Op, SelectionDAG &DAG) const;
diff --git a/llvm/test/CodeGen/AMDGPU/fmad-formation-fmul-distribute-denormal-mode.ll b/llvm/test/CodeGen/AMDGPU/fmad-formation-fmul-distribute-denormal-mode.ll
new file mode 100644
index 0000000..c1e9f6e
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/fmad-formation-fmul-distribute-denormal-mode.ll
@@ -0,0 +1,170 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -march=amdgcn -mcpu=tahiti -denormal-fp-math-f32=ieee < %s | FileCheck -check-prefixes=GCN,FMA %s
+; RUN: llc -march=amdgcn -mcpu=verde -denormal-fp-math-f32=ieee < %s | FileCheck -check-prefixes=GCN,NOFUSE %s
+; RUN: llc -march=amdgcn -mcpu=fiji -denormal-fp-math-f32=ieee < %s | FileCheck -check-prefixes=GCN,NOFUSE %s
+; RUN: llc -march=amdgcn -mcpu=tonga -denormal-fp-math-f32=ieee < %s | FileCheck -check-prefixes=GCN,NOFUSE %s
+; RUN: llc -march=amdgcn -mcpu=gfx900 -denormal-fp-math-f32=ieee < %s | FileCheck -check-prefixes=GCN,FMA %s
+
+; RUN: llc -march=amdgcn -mcpu=tahiti -denormal-fp-math-f32=preserve-sign < %s | FileCheck -check-prefixes=GCN,FMAD %s
+; RUN: llc -march=amdgcn -mcpu=verde -denormal-fp-math-f32=preserve-sign < %s | FileCheck -check-prefixes=GCN,FMAD %s
+; RUN: llc -march=amdgcn -mcpu=fiji -denormal-fp-math-f32=preserve-sign < %s | FileCheck -check-prefixes=GCN,FMAD %s
+; RUN: llc -march=amdgcn -mcpu=tonga -denormal-fp-math-f32=preserve-sign < %s | FileCheck -check-prefixes=GCN,FMAD %s
+; RUN: llc -march=amdgcn -mcpu=gfx900 -denormal-fp-math-f32=preserve-sign < %s | FileCheck -check-prefixes=GCN,FMAD %s
+
+; Check for incorrect fmad formation when distributing
+
+define float @unsafe_fmul_fadd_distribute_fast_f32(float %arg0, float %arg1) #0 {
+; FMA-LABEL: unsafe_fmul_fadd_distribute_fast_f32:
+; FMA:       ; %bb.0:
+; FMA-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; FMA-NEXT:    v_fma_f32 v0, v1, v0, v0
+; FMA-NEXT:    s_setpc_b64 s[30:31]
+;
+; NOFUSE-LABEL: unsafe_fmul_fadd_distribute_fast_f32:
+; NOFUSE:       ; %bb.0:
+; NOFUSE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; NOFUSE-NEXT:    v_add_f32_e32 v1, 1.0, v1
+; NOFUSE-NEXT:    v_mul_f32_e32 v0, v0, v1
+; NOFUSE-NEXT:    s_setpc_b64 s[30:31]
+;
+; FMAD-LABEL: unsafe_fmul_fadd_distribute_fast_f32:
+; FMAD:       ; %bb.0:
+; FMAD-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; FMAD-NEXT:    v_mac_f32_e32 v0, v1, v0
+; FMAD-NEXT:    s_setpc_b64 s[30:31]
+  %add = fadd fast float %arg1, 1.0
+  %tmp1 = fmul fast float %arg0, %add
+  ret float %tmp1
+}
+
+define float @unsafe_fmul_fsub_distribute_fast_f32(float %arg0, float %arg1) #0 {
+; FMA-LABEL: unsafe_fmul_fsub_distribute_fast_f32:
+; FMA:       ; %bb.0:
+; FMA-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; FMA-NEXT:    v_fma_f32 v0, -v1, v0, v0
+; FMA-NEXT:    s_setpc_b64 s[30:31]
+;
+; NOFUSE-LABEL: unsafe_fmul_fsub_distribute_fast_f32:
+; NOFUSE:       ; %bb.0:
+; NOFUSE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; NOFUSE-NEXT:    v_sub_f32_e32 v1, 1.0, v1
+; NOFUSE-NEXT:    v_mul_f32_e32 v0, v0, v1
+; NOFUSE-NEXT:    s_setpc_b64 s[30:31]
+;
+; FMAD-LABEL: unsafe_fmul_fsub_distribute_fast_f32:
+; FMAD:       ; %bb.0:
+; FMAD-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; FMAD-NEXT:    v_mad_f32 v0, -v1, v0, v0
+; FMAD-NEXT:    s_setpc_b64 s[30:31]
+  %add = fsub fast float 1.0, %arg1
+  %tmp1 = fmul fast float %arg0, %add
+  ret float %tmp1
+}
+
+define <2 x float> @unsafe_fmul_fadd_distribute_fast_v2f32(<2 x float> %arg0, <2 x float> %arg1) #0 {
+; FMA-LABEL: unsafe_fmul_fadd_distribute_fast_v2f32:
+; FMA:       ; %bb.0:
+; FMA-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; FMA-NEXT:    v_fma_f32 v0, v2, v0, v0
+; FMA-NEXT:    v_fma_f32 v1, v3, v1, v1
+; FMA-NEXT:    s_setpc_b64 s[30:31]
+;
+; NOFUSE-LABEL: unsafe_fmul_fadd_distribute_fast_v2f32:
+; NOFUSE:       ; %bb.0:
+; NOFUSE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; NOFUSE-NEXT:    v_add_f32_e32 v3, 1.0, v3
+; NOFUSE-NEXT:    v_add_f32_e32 v2, 1.0, v2
+; NOFUSE-NEXT:    v_mul_f32_e32 v0, v0, v2
+; NOFUSE-NEXT:    v_mul_f32_e32 v1, v1, v3
+; NOFUSE-NEXT:    s_setpc_b64 s[30:31]
+;
+; FMAD-LABEL: unsafe_fmul_fadd_distribute_fast_v2f32:
+; FMAD:       ; %bb.0:
+; FMAD-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; FMAD-NEXT:    v_mac_f32_e32 v0, v2, v0
+; FMAD-NEXT:    v_mac_f32_e32 v1, v3, v1
+; FMAD-NEXT:    s_setpc_b64 s[30:31]
+  %add = fadd fast <2 x float> %arg1, <float 1.0, float 1.0>
+  %tmp1 = fmul fast <2 x float> %arg0, %add
+  ret <2 x float> %tmp1
+}
+
+define <2 x float> @unsafe_fmul_fsub_distribute_fast_v2f32(<2 x float> %arg0, <2 x float> %arg1) #0 {
+; FMA-LABEL: unsafe_fmul_fsub_distribute_fast_v2f32:
+; FMA:       ; %bb.0:
+; FMA-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; FMA-NEXT:    v_fma_f32 v0, -v2, v0, v0
+; FMA-NEXT:    v_fma_f32 v1, -v3, v1, v1
+; FMA-NEXT:    s_setpc_b64 s[30:31]
+;
+; NOFUSE-LABEL: unsafe_fmul_fsub_distribute_fast_v2f32:
+; NOFUSE:       ; %bb.0:
+; NOFUSE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; NOFUSE-NEXT:    v_sub_f32_e32 v3, 1.0, v3
+; NOFUSE-NEXT:    v_sub_f32_e32 v2, 1.0, v2
+; NOFUSE-NEXT:    v_mul_f32_e32 v0, v0, v2
+; NOFUSE-NEXT:    v_mul_f32_e32 v1, v1, v3
+; NOFUSE-NEXT:    s_setpc_b64 s[30:31]
+;
+; FMAD-LABEL: unsafe_fmul_fsub_distribute_fast_v2f32:
+; FMAD:       ; %bb.0:
+; FMAD-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; FMAD-NEXT:    v_mad_f32 v0, -v2, v0, v0
+; FMAD-NEXT:    v_mad_f32 v1, -v3, v1, v1
+; FMAD-NEXT:    s_setpc_b64 s[30:31]
+  %add = fsub  fast <2 x float> <float 1.0, float 1.0>, %arg1
+  %tmp1 = fmul fast <2 x float> %arg0, %add
+  ret <2 x float> %tmp1
+}
+
+define <2 x float> @unsafe_fast_fmul_fadd_distribute_post_legalize_f32(float %arg0, <2 x float> %arg1) #0 {
+; FMA-LABEL: unsafe_fast_fmul_fadd_distribute_post_legalize_f32:
+; FMA:       ; %bb.0:
+; FMA-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; FMA-NEXT:    v_fma_f32 v0, v0, v1, v1
+; FMA-NEXT:    s_setpc_b64 s[30:31]
+;
+; NOFUSE-LABEL: unsafe_fast_fmul_fadd_distribute_post_legalize_f32:
+; NOFUSE:       ; %bb.0:
+; NOFUSE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; NOFUSE-NEXT:    v_add_f32_e32 v0, 1.0, v0
+; NOFUSE-NEXT:    v_mul_f32_e32 v0, v1, v0
+; NOFUSE-NEXT:    s_setpc_b64 s[30:31]
+;
+; FMAD-LABEL: unsafe_fast_fmul_fadd_distribute_post_legalize_f32:
+; FMAD:       ; %bb.0:
+; FMAD-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; FMAD-NEXT:    v_mad_f32 v0, v0, v1, v1
+; FMAD-NEXT:    s_setpc_b64 s[30:31]
+  %add = fadd fast float %arg0, 1.0
+  %splat = insertelement <2 x float> undef, float %add, i32 0
+  %tmp1 = fmul fast <2 x float> %arg1, %splat
+  ret <2 x float> %tmp1
+}
+
+define <2 x float> @unsafe_fast_fmul_fsub_ditribute_post_legalize(float %arg0, <2 x float> %arg1) #0 {
+; FMA-LABEL: unsafe_fast_fmul_fsub_ditribute_post_legalize:
+; FMA:       ; %bb.0:
+; FMA-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; FMA-NEXT:    v_fma_f32 v0, -v0, v1, v1
+; FMA-NEXT:    s_setpc_b64 s[30:31]
+;
+; NOFUSE-LABEL: unsafe_fast_fmul_fsub_ditribute_post_legalize:
+; NOFUSE:       ; %bb.0:
+; NOFUSE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; NOFUSE-NEXT:    v_sub_f32_e32 v0, 1.0, v0
+; NOFUSE-NEXT:    v_mul_f32_e32 v0, v1, v0
+; NOFUSE-NEXT:    s_setpc_b64 s[30:31]
+;
+; FMAD-LABEL: unsafe_fast_fmul_fsub_ditribute_post_legalize:
+; FMAD:       ; %bb.0:
+; FMAD-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; FMAD-NEXT:    v_mad_f32 v0, -v0, v1, v1
+; FMAD-NEXT:    s_setpc_b64 s[30:31]
+  %sub = fsub fast float 1.0, %arg0
+  %splat = insertelement <2 x float> undef, float %sub, i32 0
+  %tmp1 = fmul fast <2 x float> %arg1, %splat
+  ret <2 x float> %tmp1
+}
+
+attributes #0 = { "no-infs-fp-math"="true" "unsafe-fp-math"="true" }
-- 
2.7.4