From 91b0cad8132997060182146b2734065bc807e9fa Mon Sep 17 00:00:00 2001
From: David Green
Date: Mon, 4 Nov 2019 14:06:04 +0000
Subject: [PATCH] [ARM] Use isFMAFasterThanFMulAndFAdd for MVE

The Arm backend will usually return false for isFMAFasterThanFMulAndFAdd,
since for scalar code both the fused VFMA.f32 and the non-fused VMLA.f32 are
usually available. For MVE we do not have the non-fused version, so it makes
more sense for isFMAFasterThanFMulAndFAdd to return true, allowing us to
simplify some of the existing ISel patterns.

The testing here is that none of the existing tests failed, so we are still
selecting VFMA and VFMS. The one test that changed shows that we can now
select FMAs from fast math flags, as opposed to relying solely on the
isFMADLegalForFAddFSub option.

Differential Revision: https://reviews.llvm.org/D69115
---
 llvm/lib/Target/ARM/ARMISelLowering.cpp        | 30 ++++++++++++++++++++++
 llvm/lib/Target/ARM/ARMISelLowering.h          | 11 +-------
 llvm/lib/Target/ARM/ARMInstrMVE.td             | 24 +++--------------
 .../Thumb2/LowOverheadLoops/fast-fp-loops.ll   | 13 +++++-----
 4 files changed, 41 insertions(+), 37 deletions(-)

diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp
index 6e511e6..4464fd1 100644
--- a/llvm/lib/Target/ARM/ARMISelLowering.cpp
+++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp
@@ -14858,6 +14858,36 @@ int ARMTargetLowering::getScalingFactorCost(const DataLayout &DL,
   return -1;
 }
 
+/// isFMAFasterThanFMulAndFAdd - Return true if an FMA operation is faster
+/// than a pair of fmul and fadd instructions. fmuladd intrinsics will be
+/// expanded to FMAs when this method returns true, otherwise fmuladd is
+/// expanded to fmul + fadd.
+///
+/// ARM supports both fused and unfused multiply-add operations; we already
+/// lower a pair of fmul and fadd to the latter so it's not clear that there
+/// would be a gain or that the gain would be worthwhile enough to risk
+/// correctness bugs.
+///
+/// For MVE, we set this to true as it helps simplify the need for some
+/// patterns (and we don't have the non-fused floating point instruction).
+bool ARMTargetLowering::isFMAFasterThanFMulAndFAdd(EVT VT) const {
+  if (!Subtarget->hasMVEFloatOps())
+    return false;
+
+  if (!VT.isSimple())
+    return false;
+
+  switch (VT.getSimpleVT().SimpleTy) {
+  case MVT::v4f32:
+  case MVT::v8f16:
+    return true;
+  default:
+    break;
+  }
+
+  return false;
+}
+
 static bool isLegalT1AddressImmediate(int64_t V, EVT VT) {
   if (V < 0)
     return false;
diff --git a/llvm/lib/Target/ARM/ARMISelLowering.h b/llvm/lib/Target/ARM/ARMISelLowering.h
index 53813fa..d3caed8 100644
--- a/llvm/lib/Target/ARM/ARMISelLowering.h
+++ b/llvm/lib/Target/ARM/ARMISelLowering.h
@@ -738,16 +738,7 @@ class VectorType;
     SDValue BuildSDIVPow2(SDNode *N, const APInt &Divisor, SelectionDAG &DAG,
                           SmallVectorImpl<SDNode *> &Created) const override;
 
-    /// isFMAFasterThanFMulAndFAdd - Return true if an FMA operation is faster
-    /// than a pair of fmul and fadd instructions. fmuladd intrinsics will be
-    /// expanded to FMAs when this method returns true, otherwise fmuladd is
-    /// expanded to fmul + fadd.
-    ///
-    /// ARM supports both fused and unfused multiply-add operations; we already
-    /// lower a pair of fmul and fadd to the latter so it's not clear that there
-    /// would be a gain or that the gain would be worthwhile enough to risk
-    /// correctness bugs.
-    bool isFMAFasterThanFMulAndFAdd(EVT VT) const override { return false; }
+    bool isFMAFasterThanFMulAndFAdd(EVT VT) const override;
 
     SDValue ReconstructShuffle(SDValue Op, SelectionDAG &DAG) const;
 
diff --git a/llvm/lib/Target/ARM/ARMInstrMVE.td b/llvm/lib/Target/ARM/ARMInstrMVE.td
index e43d643..040b6f6 100644
--- a/llvm/lib/Target/ARM/ARMInstrMVE.td
+++ b/llvm/lib/Target/ARM/ARMInstrMVE.td
@@ -2808,31 +2808,15 @@ def MVE_VFMSf32 : MVE_VADDSUBFMA_fp<"vfms", "f32", 0b0, 0b1, 0b0, 0b1,
 def MVE_VFMSf16 : MVE_VADDSUBFMA_fp<"vfms", "f16", 0b1, 0b1, 0b0, 0b1,
                                     (ins MQPR:$Qd_src), vpred_n, "$Qd = $Qd_src">;
 
-let Predicates = [HasMVEFloat, UseFusedMAC] in {
-  def : Pat<(v8f16 (fadd (v8f16 MQPR:$src1),
-                         (fmul (v8f16 MQPR:$src2),
-                               (v8f16 MQPR:$src3)))),
-            (v8f16 (MVE_VFMAf16 $src1, $src2, $src3))>;
-  def : Pat<(v4f32 (fadd (v4f32 MQPR:$src1),
-                         (fmul (v4f32 MQPR:$src2),
-                               (v4f32 MQPR:$src3)))),
-            (v4f32 (MVE_VFMAf32 $src1, $src2, $src3))>;
-
-  def : Pat<(v8f16 (fsub (v8f16 MQPR:$src1),
-                         (fmul (v8f16 MQPR:$src2),
-                               (v8f16 MQPR:$src3)))),
-            (v8f16 (MVE_VFMSf16 $src1, $src2, $src3))>;
-  def : Pat<(v4f32 (fsub (v4f32 MQPR:$src1),
-                         (fmul (v4f32 MQPR:$src2),
-                               (v4f32 MQPR:$src3)))),
-            (v4f32 (MVE_VFMSf32 $src1, $src2, $src3))>;
-}
-
 let Predicates = [HasMVEFloat] in {
   def : Pat<(v8f16 (fma (v8f16 MQPR:$src1), (v8f16 MQPR:$src2), (v8f16 MQPR:$src3))),
             (v8f16 (MVE_VFMAf16 $src3, $src1, $src2))>;
   def : Pat<(v4f32 (fma (v4f32 MQPR:$src1), (v4f32 MQPR:$src2), (v4f32 MQPR:$src3))),
             (v4f32 (MVE_VFMAf32 $src3, $src1, $src2))>;
+  def : Pat<(v8f16 (fma (fneg (v8f16 MQPR:$src1)), (v8f16 MQPR:$src2), (v8f16 MQPR:$src3))),
+            (v8f16 (MVE_VFMSf16 $src3, $src1, $src2))>;
+  def : Pat<(v4f32 (fma (fneg (v4f32 MQPR:$src1)), (v4f32 MQPR:$src2), (v4f32 MQPR:$src3))),
+            (v4f32 (MVE_VFMSf32 $src3, $src1, $src2))>;
 }
 
 multiclass MVE_VADDSUB_fp_m

[...]

 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    vmov q0, q1
 ; CHECK-NEXT:    vctp.32 r2
 ; CHECK-NEXT:    vpstt
-; CHECK-NEXT:    vldrwt.u32 q1, [r0]
-; CHECK-NEXT:    vldrwt.u32 q2, [r1]
+; CHECK-NEXT:    vldrwt.u32 q2, [r0]
+; CHECK-NEXT:    vldrwt.u32 q3, [r1]
 ; CHECK-NEXT:    mov r3, r2
-; CHECK-NEXT:    vmul.f32 q1, q2, q1
 ; CHECK-NEXT:    adds r0, #16
 ; CHECK-NEXT:    adds r1, #16
 ; CHECK-NEXT:    subs r2, #4
-; CHECK-NEXT:    vadd.f32 q1, q1, q0
+; CHECK-NEXT:    vmov q1, q0
+; CHECK-NEXT:    vfma.f32 q0, q3, q2
 ; CHECK-NEXT:    le lr, .LBB1_2
 ; CHECK-NEXT:  @ %bb.3: @ %middle.block
 ; CHECK-NEXT:    vctp.32 r3
-; CHECK-NEXT:    vpsel q0, q1, q0
+; CHECK-NEXT:    vpsel q0, q0, q1
 ; CHECK-NEXT:    vmov.f32 s4, s2
 ; CHECK-NEXT:    vmov.f32 s5, s3
 ; CHECK-NEXT:    vadd.f32 q0, q0, q1
-- 
2.7.4
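
As a rough illustration of what the change enables (this snippet is not part of the patch or its test file; the function name, triple and llc options below are only plausible examples), a contractable fmul followed by fadd on a v4f32 should now be combined into an fma node and selected as a single VFMA.f32 for MVE, much like the vmul.f32/vadd.f32 pair that becomes vfma.f32 in the fast-fp-loops.ll check lines above:

; Illustrative IR only, e.g. compiled with:
;   llc -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve.fp
define arm_aapcs_vfpcc <4 x float> @fmul_fadd(<4 x float> %a, <4 x float> %b,
                                              <4 x float> %acc) {
entry:
  ; 'fast' implies 'contract', so once isFMAFasterThanFMulAndFAdd returns true
  ; for v4f32 the DAG combiner is free to fuse these two nodes into an fma.
  %mul = fmul fast <4 x float> %a, %b
  %add = fadd fast <4 x float> %mul, %acc
  ret <4 x float> %add
}

The same hook is also what causes fmuladd intrinsics on these MVE vector types to be expanded to FMAs rather than to fmul + fadd, as described in the new comment in ARMISelLowering.cpp.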