From: Simon Tatham
Date: Thu, 12 Mar 2020 09:57:48 +0000 (+0000)
Subject: [ARM,MVE] Add intrinsics and isel for MVE fused multiply-add.
X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=3f8e714e2f9f2dc3367d2f3fc569abfaf28f314c;p=platform%2Fupstream%2Fllvm.git

[ARM,MVE] Add intrinsics and isel for MVE fused multiply-add.

Summary: This adds the ACLE intrinsic family for the VFMA and VFMS
instructions, which perform fused multiply-add on vectors of floats.

I've represented the unpredicated versions in IR using the
cross-platform `@llvm.fma` IR intrinsic. We already had isel rules to
convert one of those into a vector VFMA in the simplest possible way;
but we didn't have rules to detect a negated argument and turn it into
VFMS, or rules to detect a splat argument and turn it into one of the
two vector/scalar forms of the instruction. Now we have all of those.

The predicated form uses a target-specific intrinsic as usual, but
I've stuck to just one, for a predicated FMA. The subtraction and
splat versions are code-generated by passing an fneg or a splat as one
of its operands, the same way as the unpredicated version.

In arm_mve_defs.td, I've had to introduce a tiny extra piece of
infrastructure: a record `id` for use in codegen dags which implements
the identity function. (Just because you can't declare a Tablegen
value of type dag which is //only// a `$varname`: you have to wrap it
in something. Now I can write `(id $varname)` to get the same effect.)

Reviewers: dmgreen, MarkMurrayARM, miyuki, ostannard

Reviewed By: dmgreen

Subscribers: kristof.beyls, hiraditya, danielkiss, cfe-commits, llvm-commits

Tags: #clang, #llvm

Differential Revision: https://reviews.llvm.org/D75998
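(For illustration only, not part of the patch: a minimal example of the new
unpredicated intrinsics in use. Built for MVE with something like
`clang --target=arm-none-eabi -march=armv8.1-m.main+mve.fp -mfloat-abi=hard -O2`,
each call below should now select a single instruction: the `vfmsq` because
clang emits `@llvm.fma` with an fneg on the second multiply operand, and the
`vfmaq_n` because isel now matches the splatted scalar operand, as the tests
in this patch check. The function name `fused` is just for the example.)

  #include <arm_mve.h>

  float32x4_t fused(float32x4_t acc, float32x4_t x, float32x4_t y,
                    float32_t s) {
    /* illustration only: each statement should become one MVE instruction */
    acc = vfmaq_f32(acc, x, y);   /* acc + x*y        -> VFMA.F32 q,q,q */
    acc = vfmsq_f32(acc, x, y);   /* acc - x*y        -> VFMS.F32 q,q,q */
    acc = vfmaq_n_f32(acc, x, s); /* acc + x*splat(s) -> VFMA.F32 q,q,r */
    return acc;
  }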
---

diff --git a/clang/include/clang/Basic/arm_mve.td b/clang/include/clang/Basic/arm_mve.td
index d9a2035..d2203d6 100644
--- a/clang/include/clang/Basic/arm_mve.td
+++ b/clang/include/clang/Basic/arm_mve.td
@@ -162,6 +162,46 @@ let pnt = PNT_NType in {
   }
 }
 
+multiclass FMA<bit add> {
+  // FMS instructions are defined in the ArmARM as if they negate the
+  // second multiply input.
+  defvar m2_cg = !if(add, (id $m2), (fneg $m2));
+
+  defvar unpred_cg = (IRIntBase<"fma", [Vector]> $m1, m2_cg, $addend);
+  defvar pred_cg = (IRInt<"fma_predicated", [Vector, Predicate]>
+                        $m1, m2_cg, $addend, $pred);
+
+  def q: Intrinsic<Vector, (args Vector:$addend, Vector:$m1, Vector:$m2),
+     unpred_cg>;
+
+  def q_m: Intrinsic<Vector, (args Vector:$addend, Vector:$m1, Vector:$m2,
+                                   Predicate:$pred),
+     pred_cg>;
+
+  // Only FMA has the vector/scalar variants, not FMS
+  if add then let pnt = PNT_NType in {
+
+    def q_n: Intrinsic<Vector, (args Vector:$addend, Vector:$m1,
+                                     unpromoted<Scalar>:$m2_s),
+        (seq (splat $m2_s):$m2, unpred_cg)>;
+    def sq_n: Intrinsic<Vector, (args Vector:$m1, Vector:$m2,
+                                      unpromoted<Scalar>:$addend_s),
+        (seq (splat $addend_s):$addend, unpred_cg)>;
+    def q_m_n: Intrinsic<Vector, (args Vector:$addend, Vector:$m1,
+                                       unpromoted<Scalar>:$m2_s,
+                                       Predicate:$pred),
+        (seq (splat $m2_s):$m2, pred_cg)>;
+    def sq_m_n: Intrinsic<Vector, (args Vector:$m1, Vector:$m2,
+                                        unpromoted<Scalar>:$addend_s,
+                                        Predicate:$pred),
+        (seq (splat $addend_s):$addend, pred_cg)>;
+  }
+}
+
+let params = T.Float in {
+  defm vfma: FMA<1>;
+  defm vfms: FMA<0>;
+}
+
 let params = !listconcat(T.Int16, T.Int32) in {
 let pnt = PNT_None in {
 def vmvnq_n: Intrinsic<
diff --git a/clang/include/clang/Basic/arm_mve_defs.td b/clang/include/clang/Basic/arm_mve_defs.td
--- a/clang/include/clang/Basic/arm_mve_defs.td
+++ b/clang/include/clang/Basic/arm_mve_defs.td
@@ ... @@
 }
 def zip: CGHelperFn<"VectorZip">;
+// Trivial 'codegen' function that just returns its argument. Useful
+// for wrapping up a variable name like $foo into a thing you can pass
+// around as type 'dag'.
+def id: IRBuilderBase {
+  // All the other cases of IRBuilderBase use 'prefix' to specify a function
+  // call, including the open parenthesis. MveEmitter puts the closing paren on
+  // the end. So if we _just_ specify an open paren with no function name
+  // before it, then the generated C++ code will simply wrap the input value in
+  // parentheses, returning it unchanged.
+  let prefix = "(";
+}
+
 // Helper for making boolean flags in IR
 def i1: IRBuilderBase {
   let prefix = "llvm::ConstantInt::get(Builder.getInt1Ty(), ";
diff --git a/clang/test/CodeGen/arm-mve-intrinsics/ternary.c b/clang/test/CodeGen/arm-mve-intrinsics/ternary.c
new file mode 100644
index 0000000..ab1cb14
--- /dev/null
+++ b/clang/test/CodeGen/arm-mve-intrinsics/ternary.c
@@ -0,0 +1,261 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
+// RUN: %clang_cc1 -triple thumbv8.1m.main-arm-none-eabi -target-feature +mve.fp -mfloat-abi hard -fallow-half-arguments-and-returns -O0 -disable-O0-optnone -S -emit-llvm -o - %s | opt -S -sroa | FileCheck %s
+// RUN: %clang_cc1 -triple thumbv8.1m.main-arm-none-eabi -target-feature +mve.fp -mfloat-abi hard -fallow-half-arguments-and-returns -O0 -disable-O0-optnone -DPOLYMORPHIC -S -emit-llvm -o - %s | opt -S -sroa | FileCheck %s
+
+#include <arm_mve.h>
+
+// CHECK-LABEL: @test_vfmaq_f16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = call <8 x half> @llvm.fma.v8f16(<8 x half> [[B:%.*]], <8 x half> [[C:%.*]], <8 x half> [[A:%.*]])
+// CHECK-NEXT: ret <8 x half> [[TMP0]]
+//
+float16x8_t test_vfmaq_f16(float16x8_t a, float16x8_t b, float16x8_t c) {
+#ifdef POLYMORPHIC
+  return vfmaq(a, b, c);
+#else /* POLYMORPHIC */
+  return vfmaq_f16(a, b, c);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vfmaq_f32(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[B:%.*]], <4 x float> [[C:%.*]], <4 x float> [[A:%.*]])
+// CHECK-NEXT: ret <4 x float> [[TMP0]]
+//
+float32x4_t test_vfmaq_f32(float32x4_t a, float32x4_t b, float32x4_t c) {
+#ifdef POLYMORPHIC
+  return vfmaq(a, b, c);
+#else /* POLYMORPHIC */
+  return vfmaq_f32(a, b, c);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vfmaq_n_f16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast float [[C_COERCE:%.*]] to i32
+// CHECK-NEXT: [[TMP_0_EXTRACT_TRUNC:%.*]] = trunc i32 [[TMP0]] to i16
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i16 [[TMP_0_EXTRACT_TRUNC]] to half
+// CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <8 x half> undef, half [[TMP1]], i32 0
+// CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <8 x half> [[DOTSPLATINSERT]], <8 x half> undef, <8 x i32> zeroinitializer
+// CHECK-NEXT: [[TMP2:%.*]] = call <8 x half> @llvm.fma.v8f16(<8 x half> [[B:%.*]], <8 x half> [[DOTSPLAT]], <8 x half> [[A:%.*]])
+// CHECK-NEXT: ret <8 x half> [[TMP2]]
+//
+float16x8_t test_vfmaq_n_f16(float16x8_t a, float16x8_t b, float16_t c) {
+#ifdef POLYMORPHIC
+  return vfmaq(a, b, c);
+#else /* POLYMORPHIC */
+  return vfmaq_n_f16(a, b, c);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vfmaq_n_f32(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <4 x float> undef, float [[C:%.*]], i32 0
+// CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <4 x float> [[DOTSPLATINSERT]], <4 x float> undef, <4 x i32> zeroinitializer
+// CHECK-NEXT: [[TMP0:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[B:%.*]], <4 x float> [[DOTSPLAT]], <4 x float> [[A:%.*]])
+// CHECK-NEXT: ret <4 x float> [[TMP0]]
+//
+float32x4_t test_vfmaq_n_f32(float32x4_t a, float32x4_t b, float32_t c) {
+#ifdef POLYMORPHIC
+  return vfmaq(a, b, c);
+#else /* POLYMORPHIC */
+  return vfmaq_n_f32(a, b, c);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vfmasq_n_f16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast float [[C_COERCE:%.*]] to i32
+// CHECK-NEXT: [[TMP_0_EXTRACT_TRUNC:%.*]] = trunc i32 [[TMP0]] to i16
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i16 [[TMP_0_EXTRACT_TRUNC]] to half
+// CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <8 x half> undef, half [[TMP1]], i32 0
+// CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <8 x half> [[DOTSPLATINSERT]], <8 x half> undef, <8 x i32> zeroinitializer
+// CHECK-NEXT: [[TMP2:%.*]] = call <8 x half> @llvm.fma.v8f16(<8 x half> [[A:%.*]], <8 x half> [[B:%.*]], <8 x half> [[DOTSPLAT]])
+// CHECK-NEXT: ret <8 x half> [[TMP2]]
+//
+float16x8_t test_vfmasq_n_f16(float16x8_t a, float16x8_t b, float16_t c) {
+#ifdef POLYMORPHIC
+  return vfmasq(a, b, c);
+#else /* POLYMORPHIC */
+  return vfmasq_n_f16(a, b, c);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vfmasq_n_f32(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <4 x float> undef, float [[C:%.*]], i32 0
+// CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <4 x float> [[DOTSPLATINSERT]], <4 x float> undef, <4 x i32> zeroinitializer
+// CHECK-NEXT: [[TMP0:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[A:%.*]], <4 x float> [[B:%.*]], <4 x float> [[DOTSPLAT]])
+// CHECK-NEXT: ret <4 x float> [[TMP0]]
+//
+float32x4_t test_vfmasq_n_f32(float32x4_t a, float32x4_t b, float32_t c) {
+#ifdef POLYMORPHIC
+  return vfmasq(a, b, c);
+#else /* POLYMORPHIC */
+  return vfmasq_n_f32(a, b, c);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vfmsq_f16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = fneg <8 x half> [[C:%.*]]
+// CHECK-NEXT: [[TMP1:%.*]] = call <8 x half> @llvm.fma.v8f16(<8 x half> [[B:%.*]], <8 x half> [[TMP0]], <8 x half> [[A:%.*]])
+// CHECK-NEXT: ret <8 x half> [[TMP1]]
+//
+float16x8_t test_vfmsq_f16(float16x8_t a, float16x8_t b, float16x8_t c) {
+#ifdef POLYMORPHIC
+  return vfmsq(a, b, c);
+#else /* POLYMORPHIC */
+  return vfmsq_f16(a, b, c);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vfmsq_f32(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = fneg <4 x float> [[C:%.*]]
+// CHECK-NEXT: [[TMP1:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[B:%.*]], <4 x float> [[TMP0]], <4 x float> [[A:%.*]])
+// CHECK-NEXT: ret <4 x float> [[TMP1]]
+//
+float32x4_t test_vfmsq_f32(float32x4_t a, float32x4_t b, float32x4_t c) {
+#ifdef POLYMORPHIC
+  return vfmsq(a, b, c);
+#else /* POLYMORPHIC */
+  return vfmsq_f32(a, b, c);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vfmaq_m_f16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT: [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]])
+// CHECK-NEXT: [[TMP2:%.*]] = call <8 x half> @llvm.arm.mve.fma.predicated.v8f16.v8i1(<8 x half> [[B:%.*]], <8 x half> [[C:%.*]], <8 x half> [[A:%.*]], <8 x i1> [[TMP1]])
+// CHECK-NEXT: ret <8 x half> [[TMP2]]
+//
+float16x8_t test_vfmaq_m_f16(float16x8_t a, float16x8_t b, float16x8_t c, mve_pred16_t p) {
+#ifdef POLYMORPHIC
+  return vfmaq_m(a, b, c, p);
+#else /* POLYMORPHIC */
+  return vfmaq_m_f16(a, b, c, p);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vfmaq_m_f32(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]])
+// CHECK-NEXT: [[TMP2:%.*]] = call <4 x float> @llvm.arm.mve.fma.predicated.v4f32.v4i1(<4 x float> [[B:%.*]], <4 x float> [[C:%.*]], <4 x float> [[A:%.*]], <4 x i1> [[TMP1]])
+// CHECK-NEXT: ret <4 x float> [[TMP2]]
+//
+float32x4_t test_vfmaq_m_f32(float32x4_t a, float32x4_t b, float32x4_t c, mve_pred16_t p) {
+#ifdef POLYMORPHIC
+  return vfmaq_m(a, b, c, p);
+#else /* POLYMORPHIC */
+  return vfmaq_m_f32(a, b, c, p);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vfmaq_m_n_f16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast float [[C_COERCE:%.*]] to i32
+// CHECK-NEXT: [[TMP_0_EXTRACT_TRUNC:%.*]] = trunc i32 [[TMP0]] to i16
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i16 [[TMP_0_EXTRACT_TRUNC]] to half
+// CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <8 x half> undef, half [[TMP1]], i32 0
+// CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <8 x half> [[DOTSPLATINSERT]], <8 x half> undef, <8 x i32> zeroinitializer
+// CHECK-NEXT: [[TMP2:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT: [[TMP3:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP2]])
+// CHECK-NEXT: [[TMP4:%.*]] = call <8 x half> @llvm.arm.mve.fma.predicated.v8f16.v8i1(<8 x half> [[B:%.*]], <8 x half> [[DOTSPLAT]], <8 x half> [[A:%.*]], <8 x i1> [[TMP3]])
+// CHECK-NEXT: ret <8 x half> [[TMP4]]
+//
+float16x8_t test_vfmaq_m_n_f16(float16x8_t a, float16x8_t b, float16_t c, mve_pred16_t p) {
+#ifdef POLYMORPHIC
+  return vfmaq_m(a, b, c, p);
+#else /* POLYMORPHIC */
+  return vfmaq_m_n_f16(a, b, c, p);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vfmaq_m_n_f32(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <4 x float> undef, float [[C:%.*]], i32 0
+// CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <4 x float> [[DOTSPLATINSERT]], <4 x float> undef, <4 x i32> zeroinitializer
+// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]])
+// CHECK-NEXT: [[TMP2:%.*]] = call <4 x float> @llvm.arm.mve.fma.predicated.v4f32.v4i1(<4 x float> [[B:%.*]], <4 x float> [[DOTSPLAT]], <4 x float> [[A:%.*]], <4 x i1> [[TMP1]])
+// CHECK-NEXT: ret <4 x float> [[TMP2]]
+//
+float32x4_t test_vfmaq_m_n_f32(float32x4_t a, float32x4_t b, float32_t c, mve_pred16_t p) {
+#ifdef POLYMORPHIC
+  return vfmaq_m(a, b, c, p);
+#else /* POLYMORPHIC */
+  return vfmaq_m_n_f32(a, b, c, p);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vfmasq_m_n_f16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast float [[C_COERCE:%.*]] to i32
+// CHECK-NEXT: [[TMP_0_EXTRACT_TRUNC:%.*]] = trunc i32 [[TMP0]] to i16
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i16 [[TMP_0_EXTRACT_TRUNC]] to half
+// CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <8 x half> undef, half [[TMP1]], i32 0
+// CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <8 x half> [[DOTSPLATINSERT]], <8 x half> undef, <8 x i32> zeroinitializer
+// CHECK-NEXT: [[TMP2:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT: [[TMP3:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP2]])
+// CHECK-NEXT: [[TMP4:%.*]] = call <8 x half> @llvm.arm.mve.fma.predicated.v8f16.v8i1(<8 x half> [[A:%.*]], <8 x half> [[B:%.*]], <8 x half> [[DOTSPLAT]], <8 x i1> [[TMP3]])
+// CHECK-NEXT: ret <8 x half> [[TMP4]]
+//
+float16x8_t test_vfmasq_m_n_f16(float16x8_t a, float16x8_t b, float16_t c, mve_pred16_t p) {
+#ifdef POLYMORPHIC
+  return vfmasq_m(a, b, c, p);
+#else /* POLYMORPHIC */
+  return vfmasq_m_n_f16(a, b, c, p);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vfmasq_m_n_f32(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <4 x float> undef, float [[C:%.*]], i32 0
+// CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <4 x float> [[DOTSPLATINSERT]], <4 x float> undef, <4 x i32> zeroinitializer
+// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]])
+// CHECK-NEXT: [[TMP2:%.*]] = call <4 x float> @llvm.arm.mve.fma.predicated.v4f32.v4i1(<4 x float> [[A:%.*]], <4 x float> [[B:%.*]], <4 x float> [[DOTSPLAT]], <4 x i1> [[TMP1]])
+// CHECK-NEXT: ret <4 x float> [[TMP2]]
+//
+float32x4_t test_vfmasq_m_n_f32(float32x4_t a, float32x4_t b, float32_t c, mve_pred16_t p) {
+#ifdef POLYMORPHIC
+  return vfmasq_m(a, b, c, p);
+#else /* POLYMORPHIC */
+  return vfmasq_m_n_f32(a, b, c, p);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vfmsq_m_f16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = fneg <8 x half> [[C:%.*]]
+// CHECK-NEXT: [[TMP1:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT: [[TMP2:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = call <8 x half> @llvm.arm.mve.fma.predicated.v8f16.v8i1(<8 x half> [[B:%.*]], <8 x half> [[TMP0]], <8 x half> [[A:%.*]], <8 x i1> [[TMP2]])
+// CHECK-NEXT: ret <8 x half> [[TMP3]]
+//
+float16x8_t test_vfmsq_m_f16(float16x8_t a, float16x8_t b, float16x8_t c, mve_pred16_t p) {
+#ifdef POLYMORPHIC
+  return vfmsq_m(a, b, c, p);
+#else /* POLYMORPHIC */
+  return vfmsq_m_f16(a, b, c, p);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vfmsq_m_f32(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = fneg <4 x float> [[C:%.*]]
+// CHECK-NEXT: [[TMP1:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT: [[TMP2:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = call <4 x float> @llvm.arm.mve.fma.predicated.v4f32.v4i1(<4 x float> [[B:%.*]], <4 x float> [[TMP0]], <4 x float> [[A:%.*]], <4 x i1> [[TMP2]])
+// CHECK-NEXT: ret <4 x float> [[TMP3]]
+//
+float32x4_t test_vfmsq_m_f32(float32x4_t a, float32x4_t b, float32x4_t c, mve_pred16_t p) {
+#ifdef POLYMORPHIC
+  return vfmsq_m(a, b, c, p);
+#else /* POLYMORPHIC */
+  return vfmsq_m_f32(a, b, c, p);
+#endif /* POLYMORPHIC */
+}
diff --git a/llvm/include/llvm/IR/IntrinsicsARM.td b/llvm/include/llvm/IR/IntrinsicsARM.td
index 4968689..b35e1b7 100644
--- a/llvm/include/llvm/IR/IntrinsicsARM.td
+++ b/llvm/include/llvm/IR/IntrinsicsARM.td
@@ -1243,6 +1243,10 @@ def int_arm_mve_vqmovn_predicated: Intrinsic<[llvm_anyvector_ty],
    llvm_i32_ty /* unsigned output */, llvm_i32_ty /* unsigned input */,
    llvm_i32_ty /* top half */, llvm_anyvector_ty /* pred */], [IntrNoMem]>;
 
+def int_arm_mve_fma_predicated: Intrinsic<[llvm_anyvector_ty],
+   [LLVMMatchType<0> /* mult op #1 */, LLVMMatchType<0> /* mult op #2 */,
+    LLVMMatchType<0> /* addend */, llvm_anyvector_ty /* pred */], [IntrNoMem]>;
+
 // CDE (Custom Datapath Extension)
 
 def int_arm_cde_cx1: Intrinsic<
diff --git a/llvm/lib/Target/ARM/ARMInstrMVE.td b/llvm/lib/Target/ARM/ARMInstrMVE.td
index 7600daa..20856cc 100644
--- a/llvm/lib/Target/ARM/ARMInstrMVE.td
+++ b/llvm/lib/Target/ARM/ARMInstrMVE.td
@@ -3420,27 +3420,37 @@ class MVE_VADDSUBFMA_fp<string iname, string suffix, bit size, bit bit_4,
-def MVE_VFMAf32 : MVE_VADDSUBFMA_fp<"vfma", "f32", 0b0, 0b1, 0b0, 0b0,
-                                    (ins MQPR:$Qd_src), vpred_n, "$Qd = $Qd_src">;
-def MVE_VFMAf16 : MVE_VADDSUBFMA_fp<"vfma", "f16", 0b1, 0b1, 0b0, 0b0,
-                                    (ins MQPR:$Qd_src), vpred_n, "$Qd = $Qd_src">;
-
-def MVE_VFMSf32 : MVE_VADDSUBFMA_fp<"vfms", "f32", 0b0, 0b1, 0b0, 0b1,
-                                    (ins MQPR:$Qd_src), vpred_n, "$Qd = $Qd_src">;
-def MVE_VFMSf16 : MVE_VADDSUBFMA_fp<"vfms", "f16", 0b1, 0b1, 0b0, 0b1,
-                                    (ins MQPR:$Qd_src), vpred_n, "$Qd = $Qd_src">;
+multiclass MVE_VFMA_fp_multi<string iname, bit fms, MVEVectorVTInfo VTI> {
+  def "" : MVE_VADDSUBFMA_fp<iname, VTI.Suffix, VTI.Size{0}, 0b1, 0b0, fms,
+                             (ins MQPR:$Qd_src), vpred_n, "$Qd = $Qd_src">;
+  defvar Inst = !cast<Instruction>(NAME);
+  defvar pred_int = int_arm_mve_fma_predicated;
+  defvar m1 = (VTI.Vec MQPR:$m1);
+  defvar m2 = (VTI.Vec MQPR:$m2);
+  defvar add = (VTI.Vec MQPR:$add);
+  defvar pred = (VTI.Pred VCCR:$pred);
 
-let Predicates = [HasMVEFloat] in {
-  def : Pat<(v8f16 (fma (v8f16 MQPR:$src1), (v8f16 MQPR:$src2), (v8f16 MQPR:$src3))),
-            (v8f16 (MVE_VFMAf16 $src3, $src1, $src2))>;
-  def : Pat<(v4f32 (fma (v4f32 MQPR:$src1), (v4f32 MQPR:$src2), (v4f32 MQPR:$src3))),
-            (v4f32 (MVE_VFMAf32 $src3, $src1, $src2))>;
-  def : Pat<(v8f16 (fma (fneg (v8f16 MQPR:$src1)), (v8f16 MQPR:$src2), (v8f16 MQPR:$src3))),
-            (v8f16 (MVE_VFMSf16 $src3, $src1, $src2))>;
-  def : Pat<(v4f32 (fma (fneg (v4f32 MQPR:$src1)), (v4f32 MQPR:$src2), (v4f32 MQPR:$src3))),
-            (v4f32 (MVE_VFMSf32 $src3, $src1, $src2))>;
+  let Predicates = [HasMVEFloat] in {
+    if fms then {
+      def : Pat<(VTI.Vec (fma (fneg m1), m2, add)), (Inst $add, $m1, $m2)>;
+      def : Pat<(VTI.Vec (fma m1, (fneg m2), add)), (Inst $add, $m1, $m2)>;
+      def : Pat<(VTI.Vec (pred_int (fneg m1), m2, add, pred)),
+                (Inst $add, $m1, $m2, ARMVCCThen, $pred)>;
+      def : Pat<(VTI.Vec (pred_int m1, (fneg m2), add, pred)),
+                (Inst $add, $m1, $m2, ARMVCCThen, $pred)>;
+    } else {
+      def : Pat<(VTI.Vec (fma m1, m2, add)), (Inst $add, $m1, $m2)>;
+      def : Pat<(VTI.Vec (pred_int m1, m2, add, pred)),
+                (Inst $add, $m1, $m2, ARMVCCThen, $pred)>;
+    }
+  }
 }
 
+defm MVE_VFMAf32 : MVE_VFMA_fp_multi<"vfma", 0, MVE_v4f32>;
+defm MVE_VFMAf16 : MVE_VFMA_fp_multi<"vfma", 0, MVE_v8f16>;
+defm MVE_VFMSf32 : MVE_VFMA_fp_multi<"vfms", 1, MVE_v4f32>;
+defm MVE_VFMSf16 : MVE_VFMA_fp_multi<"vfms", 1, MVE_v8f16>;
+
 multiclass MVE_VADDSUB_fp_m<string iname, bit bit_21, MVEVectorVTInfo VTI,
                             SDNode unpred_op, Intrinsic pred_int> {
   def "" : MVE_VADDSUBFMA_fp<iname, VTI.Suffix, VTI.Size{0}, 0, 1, bit_21> {
@@ -5184,11 +5194,39 @@ let Predicates = [HasMVEInt] in {
             (v16i8 (MVE_VMLAS_qr_u8 $src1, $src2, $x))>;
 }
 
+multiclass MVE_VFMA_qr_multi<string iname, MVEVectorVTInfo VTI,
+                             bit scalar_addend> {
+  def "": MVE_VFMAMLA_qr<iname, VTI.Suffix, VTI.Size{0}, 0b11, scalar_addend>;
+  defvar Inst = !cast<Instruction>(NAME);
+  defvar pred_int = int_arm_mve_fma_predicated;
+  defvar v1 = (VTI.Vec MQPR:$v1);
+  defvar v2 = (VTI.Vec MQPR:$v2);
+  defvar s = !if(VTI.Size{0}, (f16 HPR:$s), (f32 SPR:$s));
+  defvar vs = (VTI.Vec (ARMvdup s));
+  defvar is = (i32 (COPY_TO_REGCLASS s, rGPR));
+  defvar pred = (VTI.Pred VCCR:$pred);
+
+  let Predicates = [HasMVEFloat] in {
+    if scalar_addend then {
+      def : Pat<(VTI.Vec (fma v1, v2, vs)), (VTI.Vec (Inst v1, v2, is))>;
+      def : Pat<(VTI.Vec (pred_int v1, v2, vs, pred)),
+                (VTI.Vec (Inst v1, v2, is, ARMVCCThen, pred))>;
+    } else {
+      def : Pat<(VTI.Vec (fma v1, vs, v2)), (VTI.Vec (Inst v2, v1, is))>;
+      def : Pat<(VTI.Vec (fma vs, v1, v2)), (VTI.Vec (Inst v2, v1, is))>;
+      def : Pat<(VTI.Vec (pred_int v1, vs, v2, pred)),
+                (VTI.Vec (Inst v2, v1, is, ARMVCCThen, pred))>;
+      def : Pat<(VTI.Vec (pred_int vs, v1, v2, pred)),
+                (VTI.Vec (Inst v2, v1, is, ARMVCCThen, pred))>;
+    }
+  }
+}
+
 let Predicates = [HasMVEFloat] in {
-  def MVE_VFMA_qr_f16 : MVE_VFMAMLA_qr<"vfma", "f16", 0b1, 0b11, 0b0>;
-  def MVE_VFMA_qr_f32 : MVE_VFMAMLA_qr<"vfma", "f32", 0b0, 0b11, 0b0>;
-  def MVE_VFMA_qr_Sf16 : MVE_VFMAMLA_qr<"vfmas", "f16", 0b1, 0b11, 0b1>;
-  def MVE_VFMA_qr_Sf32 : MVE_VFMAMLA_qr<"vfmas", "f32", 0b0, 0b11, 0b1>;
+  defm MVE_VFMA_qr_f16 : MVE_VFMA_qr_multi<"vfma", MVE_v8f16, 0>;
+  defm MVE_VFMA_qr_f32 : MVE_VFMA_qr_multi<"vfma", MVE_v4f32, 0>;
+  defm MVE_VFMA_qr_Sf16 : MVE_VFMA_qr_multi<"vfmas", MVE_v8f16, 1>;
+  defm MVE_VFMA_qr_Sf32 : MVE_VFMA_qr_multi<"vfmas", MVE_v4f32, 1>;
 }
 
 class MVE_VQDMLAH_qr<string iname, string suffix, bits<2> size,
diff --git a/llvm/test/CodeGen/Thumb2/mve-fmas.ll b/llvm/test/CodeGen/Thumb2/mve-fmas.ll
index 76a0054..a65b663 100644
--- a/llvm/test/CodeGen/Thumb2/mve-fmas.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-fmas.ll
@@ -208,8 +208,7 @@ define arm_aapcs_vfpcc <8 x half> @vfmar16(<8 x half> %src1, <8 x half> %src2, f
 ; CHECK-MVE-VMLA: @ %bb.0: @ %entry
 ; CHECK-MVE-VMLA-NEXT: vcvtb.f16.f32 s8, s8
 ; CHECK-MVE-VMLA-NEXT: vmov r0, s8
-; CHECK-MVE-VMLA-NEXT: vdup.16 q2, r0
-; CHECK-MVE-VMLA-NEXT: vfma.f16 q0, q1, q2
+; CHECK-MVE-VMLA-NEXT: vfma.f16 q0, q1, r0
 ; CHECK-MVE-VMLA-NEXT: bx lr
 ;
 ; CHECK-MVE-LABEL: vfmar16:
@@ -275,9 +274,7 @@ define arm_aapcs_vfpcc <8 x half> @vfma16(<8 x half> %src1, <8 x half> %src2, fl
 ; CHECK-MVE-VMLA: @ %bb.0: @ %entry
 ; CHECK-MVE-VMLA-NEXT: vcvtb.f16.f32 s8, s8
 ; CHECK-MVE-VMLA-NEXT: vmov r0, s8
-; CHECK-MVE-VMLA-NEXT: vdup.16 q2, r0
-; CHECK-MVE-VMLA-NEXT: vfma.f16 q2, q0, q1
-; CHECK-MVE-VMLA-NEXT: vmov q0, q2
+; CHECK-MVE-VMLA-NEXT: vfmas.f16 q0, q1, r0
 ; CHECK-MVE-VMLA-NEXT: bx lr
 ;
 ; CHECK-MVE-LABEL: vfma16:
@@ -419,8 +416,7 @@ define arm_aapcs_vfpcc <4 x float> @vfmar32(<4 x float> %src1, <4 x float> %src2
 ; CHECK-MVE-VMLA-LABEL: vfmar32:
 ; CHECK-MVE-VMLA: @ %bb.0: @ %entry
 ; CHECK-MVE-VMLA-NEXT: vmov r0, s8
-; CHECK-MVE-VMLA-NEXT: vdup.32 q2, r0
-; CHECK-MVE-VMLA-NEXT: vfma.f32 q0, q1, q2
+; CHECK-MVE-VMLA-NEXT: vfma.f32 q0, q1, r0
 ; CHECK-MVE-VMLA-NEXT: bx lr
 ;
 ; CHECK-MVE-LABEL: vfmar32:
@@ -449,9 +445,7 @@ define arm_aapcs_vfpcc <4 x float> @vfmas32(<4 x float> %src1, <4 x float> %src2
 ; CHECK-MVE-VMLA-LABEL: vfmas32:
 ; CHECK-MVE-VMLA: @ %bb.0: @ %entry
 ; CHECK-MVE-VMLA-NEXT: vmov r0, s8
-; CHECK-MVE-VMLA-NEXT: vdup.32 q2, r0
-; CHECK-MVE-VMLA-NEXT: vfma.f32 q2, q0, q1
-; CHECK-MVE-VMLA-NEXT: vmov q0, q2
+; CHECK-MVE-VMLA-NEXT: vfmas.f32 q0, q1, r0
 ; CHECK-MVE-VMLA-NEXT: bx lr
 ;
 ; CHECK-MVE-LABEL: vfmas32:
diff --git a/llvm/test/CodeGen/Thumb2/mve-intrinsics/ternary.ll b/llvm/test/CodeGen/Thumb2/mve-intrinsics/ternary.ll
new file mode 100644
index 0000000..fcdb29f
--- /dev/null
+++ b/llvm/test/CodeGen/Thumb2/mve-intrinsics/ternary.ll
@@ -0,0 +1,242 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=thumbv8.1m.main -mattr=+mve.fp -verify-machineinstrs -o - %s | FileCheck %s
+
+define arm_aapcs_vfpcc <8 x half> @test_vfmaq_f16(<8 x half> %a, <8 x half> %b, <8 x half> %c) {
+; CHECK-LABEL: test_vfmaq_f16:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vfma.f16 q0, q1, q2
+; CHECK-NEXT: bx lr
+entry:
+  %0 = tail call <8 x half> @llvm.fma.v8f16(<8 x half> %b, <8 x half> %c, <8 x half> %a)
+  ret <8 x half> %0
+}
+
+define arm_aapcs_vfpcc <4 x float> @test_vfmaq_f32(<4 x float> %a, <4 x float> %b, <4 x float> %c) {
+; CHECK-LABEL: test_vfmaq_f32:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vfma.f32 q0, q1, q2
+; CHECK-NEXT: bx lr
+entry:
+  %0 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %b, <4 x float> %c, <4 x float> %a)
+  ret <4 x float> %0
+}
+
+define arm_aapcs_vfpcc <8 x half> @test_vfmaq_n_f16(<8 x half> %a, <8 x half> %b, float %c.coerce) {
+; CHECK-LABEL: test_vfmaq_n_f16:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vmov r0, s8
+; CHECK-NEXT: vfma.f16 q0, q1, r0
+; CHECK-NEXT: bx lr
+entry:
+  %0 = bitcast float %c.coerce to i32
+  %tmp.0.extract.trunc = trunc i32 %0 to i16
+  %1 = bitcast i16 %tmp.0.extract.trunc to half
+  %.splatinsert = insertelement <8 x half> undef, half %1, i32 0
+  %.splat = shufflevector <8 x half> %.splatinsert, <8 x half> undef, <8 x i32> zeroinitializer
+  %2 = tail call <8 x half> @llvm.fma.v8f16(<8 x half> %b, <8 x half> %.splat, <8 x half> %a)
+  ret <8 x half> %2
+}
+
+define arm_aapcs_vfpcc <4 x float> @test_vfmaq_n_f32(<4 x float> %a, <4 x float> %b, float %c) {
+; CHECK-LABEL: test_vfmaq_n_f32:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vmov r0, s8
+; CHECK-NEXT: vfma.f32 q0, q1, r0
+; CHECK-NEXT: bx lr
+entry:
+  %.splatinsert = insertelement <4 x float> undef, float %c, i32 0
+  %.splat = shufflevector <4 x float> %.splatinsert, <4 x float> undef, <4 x i32> zeroinitializer
+  %0 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %b, <4 x float> %.splat, <4 x float> %a)
+  ret <4 x float> %0
+}
+
+define arm_aapcs_vfpcc <8 x half> @test_vfmasq_n_f16(<8 x half> %a, <8 x half> %b, float %c.coerce) {
+; CHECK-LABEL: test_vfmasq_n_f16:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vmov r0, s8
+; CHECK-NEXT: vfmas.f16 q0, q1, r0
+; CHECK-NEXT: bx lr
+entry:
+  %0 = bitcast float %c.coerce to i32
+  %tmp.0.extract.trunc = trunc i32 %0 to i16
+  %1 = bitcast i16 %tmp.0.extract.trunc to half
+  %.splatinsert = insertelement <8 x half> undef, half %1, i32 0
+  %.splat = shufflevector <8 x half> %.splatinsert, <8 x half> undef, <8 x i32> zeroinitializer
+  %2 = tail call <8 x half> @llvm.fma.v8f16(<8 x half> %a, <8 x half> %b, <8 x half> %.splat)
+  ret <8 x half> %2
+}
+
+define arm_aapcs_vfpcc <4 x float> @test_vfmasq_n_f32(<4 x float> %a, <4 x float> %b, float %c) {
+; CHECK-LABEL: test_vfmasq_n_f32:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vmov r0, s8
+; CHECK-NEXT: vfmas.f32 q0, q1, r0
+; CHECK-NEXT: bx lr
+entry:
+  %.splatinsert = insertelement <4 x float> undef, float %c, i32 0
+  %.splat = shufflevector <4 x float> %.splatinsert, <4 x float> undef, <4 x i32> zeroinitializer
+  %0 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %a, <4 x float> %b, <4 x float> %.splat)
+  ret <4 x float> %0
+}
+
+define arm_aapcs_vfpcc <8 x half> @test_vfmsq_f16(<8 x half> %a, <8 x half> %b, <8 x half> %c) {
+; CHECK-LABEL: test_vfmsq_f16:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vfms.f16 q0, q1, q2
+; CHECK-NEXT: bx lr
+entry:
+  %0 = fneg <8 x half> %c
+  %1 = tail call <8 x half> @llvm.fma.v8f16(<8 x half> %b, <8 x half> %0, <8 x half> %a)
+  ret <8 x half> %1
+}
+
+define arm_aapcs_vfpcc <4 x float> @test_vfmsq_f32(<4 x float> %a, <4 x float> %b, <4 x float> %c) {
+; CHECK-LABEL: test_vfmsq_f32:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vfms.f32 q0, q1, q2
+; CHECK-NEXT: bx lr
+entry:
+  %0 = fneg <4 x float> %c
+  %1 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %b, <4 x float> %0, <4 x float> %a)
+  ret <4 x float> %1
+}
+
+define arm_aapcs_vfpcc <8 x half> @test_vfmaq_m_f16(<8 x half> %a, <8 x half> %b, <8 x half> %c, i16 zeroext %p) {
+; CHECK-LABEL: test_vfmaq_m_f16:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vmsr p0, r0
+; CHECK-NEXT: vpst
+; CHECK-NEXT: vfmat.f16 q0, q1, q2
+; CHECK-NEXT: bx lr
+entry:
+  %0 = zext i16 %p to i32
+  %1 = tail call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0)
+  %2 = tail call <8 x half> @llvm.arm.mve.fma.predicated.v8f16.v8i1(<8 x half> %b, <8 x half> %c, <8 x half> %a, <8 x i1> %1)
+  ret <8 x half> %2
+}
+
+define arm_aapcs_vfpcc <4 x float> @test_vfmaq_m_f32(<4 x float> %a, <4 x float> %b, <4 x float> %c, i16 zeroext %p) {
+; CHECK-LABEL: test_vfmaq_m_f32:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vmsr p0, r0
+; CHECK-NEXT: vpst
+; CHECK-NEXT: vfmat.f32 q0, q1, q2
+; CHECK-NEXT: bx lr
+entry:
+  %0 = zext i16 %p to i32
+  %1 = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
+  %2 = tail call <4 x float> @llvm.arm.mve.fma.predicated.v4f32.v4i1(<4 x float> %b, <4 x float> %c, <4 x float> %a, <4 x i1> %1)
+  ret <4 x float> %2
+}
+
+define arm_aapcs_vfpcc <8 x half> @test_vfmaq_m_n_f16(<8 x half> %a, <8 x half> %b, float %c.coerce, i16 zeroext %p) {
+; CHECK-LABEL: test_vfmaq_m_n_f16:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vmov r1, s8
+; CHECK-NEXT: vmsr p0, r0
+; CHECK-NEXT: vpst
+; CHECK-NEXT: vfmat.f16 q0, q1, r1
+; CHECK-NEXT: bx lr
+entry:
+  %0 = bitcast float %c.coerce to i32
+  %tmp.0.extract.trunc = trunc i32 %0 to i16
+  %1 = bitcast i16 %tmp.0.extract.trunc to half
+  %.splatinsert = insertelement <8 x half> undef, half %1, i32 0
+  %.splat = shufflevector <8 x half> %.splatinsert, <8 x half> undef, <8 x i32> zeroinitializer
+  %2 = zext i16 %p to i32
+  %3 = tail call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %2)
+  %4 = tail call <8 x half> @llvm.arm.mve.fma.predicated.v8f16.v8i1(<8 x half> %b, <8 x half> %.splat, <8 x half> %a, <8 x i1> %3)
+  ret <8 x half> %4
+}
+
+define arm_aapcs_vfpcc <4 x float> @test_vfmaq_m_n_f32(<4 x float> %a, <4 x float> %b, float %c, i16 zeroext %p) {
+; CHECK-LABEL: test_vfmaq_m_n_f32:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vmsr p0, r0
+; CHECK-NEXT: vmov r0, s8
+; CHECK-NEXT: vpst
+; CHECK-NEXT: vfmat.f32 q0, q1, r0
+; CHECK-NEXT: bx lr
+entry:
+  %.splatinsert = insertelement <4 x float> undef, float %c, i32 0
+  %.splat = shufflevector <4 x float> %.splatinsert, <4 x float> undef, <4 x i32> zeroinitializer
+  %0 = zext i16 %p to i32
+  %1 = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
+  %2 = tail call <4 x float> @llvm.arm.mve.fma.predicated.v4f32.v4i1(<4 x float> %b, <4 x float> %.splat, <4 x float> %a, <4 x i1> %1)
+  ret <4 x float> %2
+}
+
+define arm_aapcs_vfpcc <8 x half> @test_vfmasq_m_n_f16(<8 x half> %a, <8 x half> %b, float %c.coerce, i16 zeroext %p) {
+; CHECK-LABEL: test_vfmasq_m_n_f16:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vmov r1, s8
+; CHECK-NEXT: vmsr p0, r0
+; CHECK-NEXT: vpst
+; CHECK-NEXT: vfmast.f16 q0, q1, r1
+; CHECK-NEXT: bx lr
+entry:
+  %0 = bitcast float %c.coerce to i32
+  %tmp.0.extract.trunc = trunc i32 %0 to i16
+  %1 = bitcast i16 %tmp.0.extract.trunc to half
+  %.splatinsert = insertelement <8 x half> undef, half %1, i32 0
+  %.splat = shufflevector <8 x half> %.splatinsert, <8 x half> undef, <8 x i32> zeroinitializer
+  %2 = zext i16 %p to i32
+  %3 = tail call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %2)
+  %4 = tail call <8 x half> @llvm.arm.mve.fma.predicated.v8f16.v8i1(<8 x half> %a, <8 x half> %b, <8 x half> %.splat, <8 x i1> %3)
+  ret <8 x half> %4
+}
+
+define arm_aapcs_vfpcc <4 x float> @test_vfmasq_m_n_f32(<4 x float> %a, <4 x float> %b, float %c, i16 zeroext %p) {
+; CHECK-LABEL: test_vfmasq_m_n_f32:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vmsr p0, r0
+; CHECK-NEXT: vmov r0, s8
+; CHECK-NEXT: vpst
+; CHECK-NEXT: vfmast.f32 q0, q1, r0
+; CHECK-NEXT: bx lr
+entry:
+  %.splatinsert = insertelement <4 x float> undef, float %c, i32 0
+  %.splat = shufflevector <4 x float> %.splatinsert, <4 x float> undef, <4 x i32> zeroinitializer
+  %0 = zext i16 %p to i32
+  %1 = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
+  %2 = tail call <4 x float> @llvm.arm.mve.fma.predicated.v4f32.v4i1(<4 x float> %a, <4 x float> %b, <4 x float> %.splat, <4 x i1> %1)
+  ret <4 x float> %2
+}
+
+define arm_aapcs_vfpcc <8 x half> @test_vfmsq_m_f16(<8 x half> %a, <8 x half> %b, <8 x half> %c, i16 zeroext %p) {
+; CHECK-LABEL: test_vfmsq_m_f16:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vmsr p0, r0
+; CHECK-NEXT: vpst
+; CHECK-NEXT: vfmst.f16 q0, q1, q2
+; CHECK-NEXT: bx lr
+entry:
+  %0 = fneg <8 x half> %c
+  %1 = zext i16 %p to i32
+  %2 = tail call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %1)
+  %3 = tail call <8 x half> @llvm.arm.mve.fma.predicated.v8f16.v8i1(<8 x half> %b, <8 x half> %0, <8 x half> %a, <8 x i1> %2)
+  ret <8 x half> %3
+}
+
+define arm_aapcs_vfpcc <4 x float> @test_vfmsq_m_f32(<4 x float> %a, <4 x float> %b, <4 x float> %c, i16 zeroext %p) {
+; CHECK-LABEL: test_vfmsq_m_f32:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vmsr p0, r0
+; CHECK-NEXT: vpst
+; CHECK-NEXT: vfmst.f32 q0, q1, q2
+; CHECK-NEXT: bx lr
+entry:
+  %0 = fneg <4 x float> %c
+  %1 = zext i16 %p to i32
+  %2 = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %1)
+  %3 = tail call <4 x float> @llvm.arm.mve.fma.predicated.v4f32.v4i1(<4 x float> %b, <4 x float> %0, <4 x float> %a, <4 x i1> %2)
+  ret <4 x float> %3
+}
+
+declare <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32)
+declare <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32)
+
+declare <8 x half> @llvm.fma.v8f16(<8 x half>, <8 x half>, <8 x half>)
+declare <4 x float> @llvm.fma.v4f32(<4 x float>, <4 x float>, <4 x float>)
+declare <8 x half> @llvm.arm.mve.fma.predicated.v8f16.v8i1(<8 x half>, <8 x half>, <8 x half>, <8 x i1>)
+declare <4 x float> @llvm.arm.mve.fma.predicated.v4f32.v4i1(<4 x float>, <4 x float>, <4 x float>, <4 x i1>)
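
(Likewise for illustration only, not part of the patch: a sketch of the
predicated intrinsics. Each call becomes one call to the new
`@llvm.arm.mve.fma.predicated` intrinsic in IR, and a VPT-predicated
VFMA/VFMS after isel, as the tests above check; in lanes where the predicate
is false, the result keeps the value of the accumulator argument. The
function name `fused_pred` is just for the example.)

  #include <arm_mve.h>

  float32x4_t fused_pred(float32x4_t acc, float32x4_t x, float32x4_t y,
                         float32_t s, mve_pred16_t p) {
    /* illustration only: predicated fused multiply-add/subtract */
    acc = vfmaq_m_f32(acc, x, y, p);   /* VPST; VFMAT.F32 q0, q1, q2 */
    acc = vfmsq_m_f32(acc, x, y, p);   /* VPST; VFMST.F32 q0, q1, q2 */
    acc = vfmaq_m_n_f32(acc, x, s, p); /* VPST; VFMAT.F32 q0, q1, r0 */
    return acc;
  }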