From 489f62e8011f54fc94d9c3a4cc2d1d66d3b5bc49 Mon Sep 17 00:00:00 2001 From: Mikhail Maltsev Date: Mon, 17 Feb 2020 17:47:05 +0000 Subject: [PATCH] [ARM,MVE] Add vector-scalar intrinsics Summary: This patch adds vector-scalar variants to the following families of MVE intrinsics: * vaddq * vsubq * vmulq * vqaddq * vqsubq * vhaddq * vhsubq * vqdmulhq * vqrdmulhq The vector-scalar variants perform a splat operation on the scalar operand and then perform the same operations as their vector-vector counterparts. Code generation is done accordingly (using LLVM IR 'insert' and 'shuffle' operations which are later converted into an ARMvdup SDNode). Reviewers: simon_tatham, dmgreen, MarkMurrayARM, ostannard Reviewed By: dmgreen Subscribers: kristof.beyls, hiraditya, cfe-commits, llvm-commits Tags: #clang, #llvm Differential Revision: https://reviews.llvm.org/D74620 --- clang/include/clang/Basic/arm_mve.td | 68 ++++++ clang/include/clang/Basic/arm_mve_defs.td | 23 ++ clang/test/CodeGen/arm-mve-intrinsics/vaddq.c | 114 +++++++++- clang/test/CodeGen/arm-mve-intrinsics/vhaddq.c | 156 +++++++++++++ clang/test/CodeGen/arm-mve-intrinsics/vhsubq.c | 156 +++++++++++++ clang/test/CodeGen/arm-mve-intrinsics/vmulq.c | 218 +++++++++++++++++- clang/test/CodeGen/arm-mve-intrinsics/vqaddq.c | 102 +++++++++ clang/test/CodeGen/arm-mve-intrinsics/vqdmulhq.c | 102 +++++++++ clang/test/CodeGen/arm-mve-intrinsics/vqrdmulhq.c | 102 +++++++++ clang/test/CodeGen/arm-mve-intrinsics/vqsubq.c | 102 +++++++++ clang/test/CodeGen/arm-mve-intrinsics/vsubq.c | 114 +++++++++- llvm/lib/Target/ARM/ARMInstrMVE.td | 249 +++++++++++++++------ llvm/test/CodeGen/Thumb2/mve-intrinsics/vaddq.ll | 96 ++++++++ llvm/test/CodeGen/Thumb2/mve-intrinsics/vhaddq.ll | 131 +++++++++++ llvm/test/CodeGen/Thumb2/mve-intrinsics/vhsubq.ll | 132 +++++++++++ llvm/test/CodeGen/Thumb2/mve-intrinsics/vmulq.ll | 181 +++++++++++++++ llvm/test/CodeGen/Thumb2/mve-intrinsics/vqaddq.ll | 84 +++++++ .../test/CodeGen/Thumb2/mve-intrinsics/vqdmulhq.ll | 84 +++++++ .../CodeGen/Thumb2/mve-intrinsics/vqrdmulhq.ll | 84 +++++++ llvm/test/CodeGen/Thumb2/mve-intrinsics/vqsubq.ll | 84 +++++++ llvm/test/CodeGen/Thumb2/mve-intrinsics/vsubq.ll | 96 ++++++++ 21 files changed, 2399 insertions(+), 79 deletions(-) diff --git a/clang/include/clang/Basic/arm_mve.td b/clang/include/clang/Basic/arm_mve.td index 4edef93..5cd88b0 100644 --- a/clang/include/clang/Basic/arm_mve.td +++ b/clang/include/clang/Basic/arm_mve.td @@ -43,6 +43,12 @@ def vqaddq: Intrinsic $a, $b)>; def vqsubq: Intrinsic $a, $b)>; +let pnt = PNT_NType in { + def vqaddq_n: Intrinsic:$b), + (IRIntBase<"sadd_sat", [Vector]> $a, (splat $b))>; + def vqsubq_n: Intrinsic:$b), + (IRIntBase<"ssub_sat", [Vector]> $a, (splat $b))>; +} } let params = T.Unsigned in { def vqaddq_u: Intrinsic $a, $b)>, NameOverride<"vqsubq">; +let pnt = PNT_NType in { + def vqaddq_u_n: Intrinsic:$b), + (IRIntBase<"uadd_sat", [Vector]> $a, (splat $b))>, + NameOverride<"vqaddq_n">; + def vqsubq_u_n: Intrinsic:$b), + (IRIntBase<"usub_sat", [Vector]> $a, (splat $b))>, + NameOverride<"vqsubq_n">; +} } // Some intrinsics below are implemented not as IR fragments, but as @@ -85,12 +99,32 @@ def vmullbq_int: Intrinsic $a, $b, (unsignedflag Scalar), 1)>; +let pnt = PNT_NType in { + def vaddq_n: Intrinsic:$b), + (add $a, (splat $b))>; + def vsubq_n: Intrinsic:$b), + (sub $a, (splat $b))>; + def vmulq_n: Intrinsic:$b), + (mul $a, (splat $b))>; + def vhaddq_n: Intrinsic:$b), + (IRInt<"vhadd", [Vector]> $a, (splat $b), + (unsignedflag Scalar))>; + def vhsubq_n: 
Intrinsic:$b), + (IRInt<"vhsub", [Vector]> $a, (splat $b), + (unsignedflag Scalar))>; +} } let params = T.Signed in { def vqdmulhq: Intrinsic $a, $b)>; def vqrdmulhq: Intrinsic $a, $b)>; +let pnt = PNT_NType in { + def vqdmulhq_n: Intrinsic:$b), + (IRInt<"vqdmulh", [Vector]> $a, (splat $b))>; + def vqrdmulhq_n: Intrinsic:$b), + (IRInt<"vqrdmulh", [Vector]> $a, (splat $b))>; +} } let params = T.Poly, overrideKindLetter = "p" in { @@ -114,6 +148,18 @@ def vsubqf: Intrinsic, NameOverride<"vsubq">; def vmulqf: Intrinsic, NameOverride<"vmulq">; + +let pnt = PNT_NType in { + def vaddqf_n: Intrinsic:$b), + (fadd $a, (splat $b))>, + NameOverride<"vaddq_n">; + def vsubqf_n: Intrinsic:$b), + (fsub $a, (splat $b))>, + NameOverride<"vsubq_n">; + def vmulqf_n: Intrinsic:$b), + (fmul $a, (splat $b))>, + NameOverride<"vmulq_n">; +} } let params = !listconcat(T.Int16, T.Int32) in { @@ -217,6 +263,16 @@ multiclass VectorVectorArithmetic; } +multiclass VectorScalarArithmetic { + defm "" : IntrinsicMXNameOverride< + Vector, (args Vector:$a, unpromoted:$b, Predicate:$pred), + !con((IRInt $a, (splat $b)), + extraArgs, (? $pred, $inactive)), basename, wantXVariant, "_n", + PNT_NType, PNT_NType>; +} + multiclass VectorVectorArithmeticBitcast { defm "" : IntrinsicMX; defm vornq : VectorVectorArithmeticBitcast<"orn_predicated">; defm vorrq : VectorVectorArithmeticBitcast<"orr_predicated">; + + defm : VectorScalarArithmetic<"add_predicated", "vaddq">; + defm : VectorScalarArithmetic<"sub_predicated", "vsubq">; + defm : VectorScalarArithmetic<"mul_predicated", "vmulq">; } multiclass DblVectorVectorArithmetic { @@ -260,6 +320,11 @@ let params = T.Int in { defm vhsubq : VectorVectorArithmetic<"hsub_predicated", (? (unsignedflag Scalar))>; defm vmullbq_int : DblVectorVectorArithmetic<"mull_int_predicated", (? (unsignedflag Scalar), (u32 0))>; defm vmulltq_int : DblVectorVectorArithmetic<"mull_int_predicated", (? (unsignedflag Scalar), (u32 1))>; + + defm : VectorScalarArithmetic<"qadd_predicated", "vqaddq", (? (unsignedflag Scalar)), 0>; + defm : VectorScalarArithmetic<"hadd_predicated", "vhaddq", (? (unsignedflag Scalar))>; + defm : VectorScalarArithmetic<"qsub_predicated", "vqsubq", (? (unsignedflag Scalar)), 0>; + defm : VectorScalarArithmetic<"hsub_predicated", "vhsubq", (? (unsignedflag Scalar))>; } let params = T.Signed in { defm vqdmulhq : VectorVectorArithmetic<"qdmulh_predicated", (?), 0>; @@ -268,6 +333,9 @@ let params = T.Signed in { (IRInt<"vmina_predicated", [UVector,Predicate]> $a, $b, $pred)>; def vmaxaq_m: Intrinsic $a, $b, $pred)>; + + defm : VectorScalarArithmetic<"qdmulh_predicated", "vqdmulhq", (?), 0>; + defm : VectorScalarArithmetic<"qrdmulh_predicated", "vqrdmulhq", (?), 0>; } let params = T.Poly, overrideKindLetter = "p" in { diff --git a/clang/include/clang/Basic/arm_mve_defs.td b/clang/include/clang/Basic/arm_mve_defs.td index 2a46297..d4e8215 100644 --- a/clang/include/clang/Basic/arm_mve_defs.td +++ b/clang/include/clang/Basic/arm_mve_defs.td @@ -495,6 +495,29 @@ multiclass IntrinsicMX { + def "_m" # nameSuffix: + Intrinsic, + NameOverride { + let pnt = pnt_m; + } + + foreach unusedVar = !if(!eq(wantXVariant, 1), [1], []) in { + def "_x" # nameSuffix: + Intrinsic, + NameOverride { + let pnt = pnt_x; + } + } +} + + // ----------------------------------------------------------------------------- // Convenience lists of parameter types. 
'T' is just a container record, so you // can define a typical intrinsic with 'let Params = T.Usual', or similar, diff --git a/clang/test/CodeGen/arm-mve-intrinsics/vaddq.c b/clang/test/CodeGen/arm-mve-intrinsics/vaddq.c index 1810c7e..1904c6e 100644 --- a/clang/test/CodeGen/arm-mve-intrinsics/vaddq.c +++ b/clang/test/CodeGen/arm-mve-intrinsics/vaddq.c @@ -1,6 +1,6 @@ // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py -// RUN: %clang_cc1 -triple thumbv8.1m.main-arm-none-eabi -target-feature +mve.fp -mfloat-abi hard -fallow-half-arguments-and-returns -O0 -disable-O0-optnone -S -emit-llvm -o - %s | opt -S -mem2reg | FileCheck %s -// RUN: %clang_cc1 -triple thumbv8.1m.main-arm-none-eabi -target-feature +mve.fp -mfloat-abi hard -fallow-half-arguments-and-returns -O0 -disable-O0-optnone -DPOLYMORPHIC -S -emit-llvm -o - %s | opt -S -mem2reg | FileCheck %s +// RUN: %clang_cc1 -triple thumbv8.1m.main-arm-none-eabi -target-feature +mve.fp -mfloat-abi hard -fallow-half-arguments-and-returns -O0 -disable-O0-optnone -S -emit-llvm -o - %s | opt -S -sroa | FileCheck %s +// RUN: %clang_cc1 -triple thumbv8.1m.main-arm-none-eabi -target-feature +mve.fp -mfloat-abi hard -fallow-half-arguments-and-returns -O0 -disable-O0-optnone -DPOLYMORPHIC -S -emit-llvm -o - %s | opt -S -sroa | FileCheck %s #include @@ -95,3 +95,113 @@ float16x8_t test_vaddq_x_f16(float16x8_t a, float16x8_t b, mve_pred16_t p) return vaddq_x_f16(a, b, p); #endif /* POLYMORPHIC */ } + +// CHECK-LABEL: @test_vaddq_n_u32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <4 x i32> undef, i32 [[B:%.*]], i32 0 +// CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <4 x i32> [[DOTSPLATINSERT]], <4 x i32> undef, <4 x i32> zeroinitializer +// CHECK-NEXT: [[TMP0:%.*]] = add <4 x i32> [[A:%.*]], [[DOTSPLAT]] +// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// +uint32x4_t test_vaddq_n_u32(uint32x4_t a, uint32_t b) +{ +#ifdef POLYMORPHIC + return vaddq(a, b); +#else /* POLYMORPHIC */ + return vaddq_n_u32(a, b); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vaddq_n_f16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast float [[B_COERCE:%.*]] to i32 +// CHECK-NEXT: [[TMP_0_EXTRACT_TRUNC:%.*]] = trunc i32 [[TMP0]] to i16 +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i16 [[TMP_0_EXTRACT_TRUNC]] to half +// CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <8 x half> undef, half [[TMP1]], i32 0 +// CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <8 x half> [[DOTSPLATINSERT]], <8 x half> undef, <8 x i32> zeroinitializer +// CHECK-NEXT: [[TMP2:%.*]] = fadd <8 x half> [[A:%.*]], [[DOTSPLAT]] +// CHECK-NEXT: ret <8 x half> [[TMP2]] +// +float16x8_t test_vaddq_n_f16(float16x8_t a, float16_t b) +{ +#ifdef POLYMORPHIC + return vaddq(a, b); +#else /* POLYMORPHIC */ + return vaddq_n_f16(a, b); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vaddq_m_n_s8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <16 x i8> undef, i8 [[B:%.*]], i32 0 +// CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <16 x i8> [[DOTSPLATINSERT]], <16 x i8> undef, <16 x i32> zeroinitializer +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = call <16 x i8> @llvm.arm.mve.add.predicated.v16i8.v16i1(<16 x i8> [[A:%.*]], <16 x i8> [[DOTSPLAT]], <16 x i1> [[TMP1]], <16 x i8> [[INACTIVE:%.*]]) +// CHECK-NEXT: ret <16 x i8> [[TMP2]] +// +int8x16_t test_vaddq_m_n_s8(int8x16_t inactive, int8x16_t 
a, int8_t b, mve_pred16_t p) +{ +#ifdef POLYMORPHIC + return vaddq_m(inactive, a, b, p); +#else /* POLYMORPHIC */ + return vaddq_m_n_s8(inactive, a, b, p); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vaddq_m_n_f32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <4 x float> undef, float [[B:%.*]], i32 0 +// CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <4 x float> [[DOTSPLATINSERT]], <4 x float> undef, <4 x i32> zeroinitializer +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = call <4 x float> @llvm.arm.mve.add.predicated.v4f32.v4i1(<4 x float> [[A:%.*]], <4 x float> [[DOTSPLAT]], <4 x i1> [[TMP1]], <4 x float> [[INACTIVE:%.*]]) +// CHECK-NEXT: ret <4 x float> [[TMP2]] +// +float32x4_t test_vaddq_m_n_f32(float32x4_t inactive, float32x4_t a, float32_t b, mve_pred16_t p) +{ +#ifdef POLYMORPHIC + return vaddq_m(inactive, a, b, p); +#else /* POLYMORPHIC */ + return vaddq_m_n_f32(inactive, a, b, p); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vaddq_x_n_u16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <8 x i16> undef, i16 [[B:%.*]], i32 0 +// CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <8 x i16> [[DOTSPLATINSERT]], <8 x i16> undef, <8 x i32> zeroinitializer +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = call <8 x i16> @llvm.arm.mve.add.predicated.v8i16.v8i1(<8 x i16> [[A:%.*]], <8 x i16> [[DOTSPLAT]], <8 x i1> [[TMP1]], <8 x i16> undef) +// CHECK-NEXT: ret <8 x i16> [[TMP2]] +// +uint16x8_t test_vaddq_x_n_u16(uint16x8_t a, uint16_t b, mve_pred16_t p) +{ +#ifdef POLYMORPHIC + return vaddq_x(a, b, p); +#else /* POLYMORPHIC */ + return vaddq_x_n_u16(a, b, p); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vaddq_x_n_f16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast float [[B_COERCE:%.*]] to i32 +// CHECK-NEXT: [[TMP_0_EXTRACT_TRUNC:%.*]] = trunc i32 [[TMP0]] to i16 +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i16 [[TMP_0_EXTRACT_TRUNC]] to half +// CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <8 x half> undef, half [[TMP1]], i32 0 +// CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <8 x half> [[DOTSPLATINSERT]], <8 x half> undef, <8 x i32> zeroinitializer +// CHECK-NEXT: [[TMP2:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP3:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP2]]) +// CHECK-NEXT: [[TMP4:%.*]] = call <8 x half> @llvm.arm.mve.add.predicated.v8f16.v8i1(<8 x half> [[A:%.*]], <8 x half> [[DOTSPLAT]], <8 x i1> [[TMP3]], <8 x half> undef) +// CHECK-NEXT: ret <8 x half> [[TMP4]] +// +float16x8_t test_vaddq_x_n_f16(float16x8_t a, float16_t b, mve_pred16_t p) +{ +#ifdef POLYMORPHIC + return vaddq_x(a, b, p); +#else /* POLYMORPHIC */ + return vaddq_x_n_f16(a, b, p); +#endif /* POLYMORPHIC */ +} diff --git a/clang/test/CodeGen/arm-mve-intrinsics/vhaddq.c b/clang/test/CodeGen/arm-mve-intrinsics/vhaddq.c index 1d97ea8..cd61bc7 100644 --- a/clang/test/CodeGen/arm-mve-intrinsics/vhaddq.c +++ b/clang/test/CodeGen/arm-mve-intrinsics/vhaddq.c @@ -141,3 +141,159 @@ uint32x4_t test_vhaddq_x_u32(uint32x4_t a, uint32x4_t b, mve_pred16_t p) return vhaddq_x_u32(a, b, p); #endif /* POLYMORPHIC */ } + +// CHECK-LABEL: @test_vhaddq_n_u8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <16 x i8> undef, i8 
[[B:%.*]], i32 0 +// CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <16 x i8> [[DOTSPLATINSERT]], <16 x i8> undef, <16 x i32> zeroinitializer +// CHECK-NEXT: [[TMP0:%.*]] = call <16 x i8> @llvm.arm.mve.vhadd.v16i8(<16 x i8> [[A:%.*]], <16 x i8> [[DOTSPLAT]], i32 1) +// CHECK-NEXT: ret <16 x i8> [[TMP0]] +// +uint8x16_t test_vhaddq_n_u8(uint8x16_t a, uint8_t b) +{ +#ifdef POLYMORPHIC + return vhaddq(a, b); +#else /* POLYMORPHIC */ + return vhaddq_n_u8(a, b); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vhaddq_n_s16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <8 x i16> undef, i16 [[B:%.*]], i32 0 +// CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <8 x i16> [[DOTSPLATINSERT]], <8 x i16> undef, <8 x i32> zeroinitializer +// CHECK-NEXT: [[TMP0:%.*]] = call <8 x i16> @llvm.arm.mve.vhadd.v8i16(<8 x i16> [[A:%.*]], <8 x i16> [[DOTSPLAT]], i32 0) +// CHECK-NEXT: ret <8 x i16> [[TMP0]] +// +int16x8_t test_vhaddq_n_s16(int16x8_t a, int16_t b) +{ +#ifdef POLYMORPHIC + return vhaddq(a, b); +#else /* POLYMORPHIC */ + return vhaddq_n_s16(a, b); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vhaddq_n_u32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <4 x i32> undef, i32 [[B:%.*]], i32 0 +// CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <4 x i32> [[DOTSPLATINSERT]], <4 x i32> undef, <4 x i32> zeroinitializer +// CHECK-NEXT: [[TMP0:%.*]] = call <4 x i32> @llvm.arm.mve.vhadd.v4i32(<4 x i32> [[A:%.*]], <4 x i32> [[DOTSPLAT]], i32 1) +// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// +uint32x4_t test_vhaddq_n_u32(uint32x4_t a, uint32_t b) +{ +#ifdef POLYMORPHIC + return vhaddq(a, b); +#else /* POLYMORPHIC */ + return vhaddq_n_u32(a, b); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vhaddq_m_n_s8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <16 x i8> undef, i8 [[B:%.*]], i32 0 +// CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <16 x i8> [[DOTSPLATINSERT]], <16 x i8> undef, <16 x i32> zeroinitializer +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = call <16 x i8> @llvm.arm.mve.hadd.predicated.v16i8.v16i1(<16 x i8> [[A:%.*]], <16 x i8> [[DOTSPLAT]], i32 0, <16 x i1> [[TMP1]], <16 x i8> [[INACTIVE:%.*]]) +// CHECK-NEXT: ret <16 x i8> [[TMP2]] +// +int8x16_t test_vhaddq_m_n_s8(int8x16_t inactive, int8x16_t a, int8_t b, mve_pred16_t p) +{ +#ifdef POLYMORPHIC + return vhaddq_m(inactive, a, b, p); +#else /* POLYMORPHIC */ + return vhaddq_m_n_s8(inactive, a, b, p); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vhaddq_m_n_u16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <8 x i16> undef, i16 [[B:%.*]], i32 0 +// CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <8 x i16> [[DOTSPLATINSERT]], <8 x i16> undef, <8 x i32> zeroinitializer +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = call <8 x i16> @llvm.arm.mve.hadd.predicated.v8i16.v8i1(<8 x i16> [[A:%.*]], <8 x i16> [[DOTSPLAT]], i32 1, <8 x i1> [[TMP1]], <8 x i16> [[INACTIVE:%.*]]) +// CHECK-NEXT: ret <8 x i16> [[TMP2]] +// +uint16x8_t test_vhaddq_m_n_u16(uint16x8_t inactive, uint16x8_t a, uint16_t b, mve_pred16_t p) +{ +#ifdef POLYMORPHIC + return vhaddq_m(inactive, a, b, p); +#else /* POLYMORPHIC */ + return vhaddq_m_n_u16(inactive, a, b, p); +#endif /* POLYMORPHIC */ +} + 
+// CHECK-LABEL: @test_vhaddq_m_n_s32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <4 x i32> undef, i32 [[B:%.*]], i32 0 +// CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <4 x i32> [[DOTSPLATINSERT]], <4 x i32> undef, <4 x i32> zeroinitializer +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = call <4 x i32> @llvm.arm.mve.hadd.predicated.v4i32.v4i1(<4 x i32> [[A:%.*]], <4 x i32> [[DOTSPLAT]], i32 0, <4 x i1> [[TMP1]], <4 x i32> [[INACTIVE:%.*]]) +// CHECK-NEXT: ret <4 x i32> [[TMP2]] +// +int32x4_t test_vhaddq_m_n_s32(int32x4_t inactive, int32x4_t a, int32_t b, mve_pred16_t p) +{ +#ifdef POLYMORPHIC + return vhaddq_m(inactive, a, b, p); +#else /* POLYMORPHIC */ + return vhaddq_m_n_s32(inactive, a, b, p); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vhaddq_x_n_u8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <16 x i8> undef, i8 [[B:%.*]], i32 0 +// CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <16 x i8> [[DOTSPLATINSERT]], <16 x i8> undef, <16 x i32> zeroinitializer +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = call <16 x i8> @llvm.arm.mve.hadd.predicated.v16i8.v16i1(<16 x i8> [[A:%.*]], <16 x i8> [[DOTSPLAT]], i32 1, <16 x i1> [[TMP1]], <16 x i8> undef) +// CHECK-NEXT: ret <16 x i8> [[TMP2]] +// +uint8x16_t test_vhaddq_x_n_u8(uint8x16_t a, uint8_t b, mve_pred16_t p) +{ +#ifdef POLYMORPHIC + return vhaddq_x(a, b, p); +#else /* POLYMORPHIC */ + return vhaddq_x_n_u8(a, b, p); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vhaddq_x_n_s16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <8 x i16> undef, i16 [[B:%.*]], i32 0 +// CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <8 x i16> [[DOTSPLATINSERT]], <8 x i16> undef, <8 x i32> zeroinitializer +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = call <8 x i16> @llvm.arm.mve.hadd.predicated.v8i16.v8i1(<8 x i16> [[A:%.*]], <8 x i16> [[DOTSPLAT]], i32 0, <8 x i1> [[TMP1]], <8 x i16> undef) +// CHECK-NEXT: ret <8 x i16> [[TMP2]] +// +int16x8_t test_vhaddq_x_n_s16(int16x8_t a, int16_t b, mve_pred16_t p) +{ +#ifdef POLYMORPHIC + return vhaddq_x(a, b, p); +#else /* POLYMORPHIC */ + return vhaddq_x_n_s16(a, b, p); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vhaddq_x_n_u32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <4 x i32> undef, i32 [[B:%.*]], i32 0 +// CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <4 x i32> [[DOTSPLATINSERT]], <4 x i32> undef, <4 x i32> zeroinitializer +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = call <4 x i32> @llvm.arm.mve.hadd.predicated.v4i32.v4i1(<4 x i32> [[A:%.*]], <4 x i32> [[DOTSPLAT]], i32 1, <4 x i1> [[TMP1]], <4 x i32> undef) +// CHECK-NEXT: ret <4 x i32> [[TMP2]] +// +uint32x4_t test_vhaddq_x_n_u32(uint32x4_t a, uint32_t b, mve_pred16_t p) +{ +#ifdef POLYMORPHIC + return vhaddq_x(a, b, p); +#else /* POLYMORPHIC */ + return vhaddq_x_n_u32(a, b, p); +#endif /* POLYMORPHIC */ +} diff --git a/clang/test/CodeGen/arm-mve-intrinsics/vhsubq.c 
b/clang/test/CodeGen/arm-mve-intrinsics/vhsubq.c index 633fd32..5299369 100644 --- a/clang/test/CodeGen/arm-mve-intrinsics/vhsubq.c +++ b/clang/test/CodeGen/arm-mve-intrinsics/vhsubq.c @@ -93,3 +93,159 @@ int32x4_t test_vhsubq_m_s32(int32x4_t inactive, int32x4_t a, int32x4_t b, mve_pr return vhsubq_m_s32(inactive, a, b, p); #endif /* POLYMORPHIC */ } + +// CHECK-LABEL: @test_vhsubq_n_u8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <16 x i8> undef, i8 [[B:%.*]], i32 0 +// CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <16 x i8> [[DOTSPLATINSERT]], <16 x i8> undef, <16 x i32> zeroinitializer +// CHECK-NEXT: [[TMP0:%.*]] = call <16 x i8> @llvm.arm.mve.vhsub.v16i8(<16 x i8> [[A:%.*]], <16 x i8> [[DOTSPLAT]], i32 1) +// CHECK-NEXT: ret <16 x i8> [[TMP0]] +// +uint8x16_t test_vhsubq_n_u8(uint8x16_t a, uint8_t b) +{ +#ifdef POLYMORPHIC + return vhsubq(a, b); +#else /* POLYMORPHIC */ + return vhsubq_n_u8(a, b); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vhsubq_n_s16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <8 x i16> undef, i16 [[B:%.*]], i32 0 +// CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <8 x i16> [[DOTSPLATINSERT]], <8 x i16> undef, <8 x i32> zeroinitializer +// CHECK-NEXT: [[TMP0:%.*]] = call <8 x i16> @llvm.arm.mve.vhsub.v8i16(<8 x i16> [[A:%.*]], <8 x i16> [[DOTSPLAT]], i32 0) +// CHECK-NEXT: ret <8 x i16> [[TMP0]] +// +int16x8_t test_vhsubq_n_s16(int16x8_t a, int16_t b) +{ +#ifdef POLYMORPHIC + return vhsubq(a, b); +#else /* POLYMORPHIC */ + return vhsubq_n_s16(a, b); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vhsubq_n_u32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <4 x i32> undef, i32 [[B:%.*]], i32 0 +// CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <4 x i32> [[DOTSPLATINSERT]], <4 x i32> undef, <4 x i32> zeroinitializer +// CHECK-NEXT: [[TMP0:%.*]] = call <4 x i32> @llvm.arm.mve.vhsub.v4i32(<4 x i32> [[A:%.*]], <4 x i32> [[DOTSPLAT]], i32 1) +// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// +uint32x4_t test_vhsubq_n_u32(uint32x4_t a, uint32_t b) +{ +#ifdef POLYMORPHIC + return vhsubq(a, b); +#else /* POLYMORPHIC */ + return vhsubq_n_u32(a, b); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vhsubq_m_n_s8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <16 x i8> undef, i8 [[B:%.*]], i32 0 +// CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <16 x i8> [[DOTSPLATINSERT]], <16 x i8> undef, <16 x i32> zeroinitializer +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = call <16 x i8> @llvm.arm.mve.hsub.predicated.v16i8.v16i1(<16 x i8> [[A:%.*]], <16 x i8> [[DOTSPLAT]], i32 0, <16 x i1> [[TMP1]], <16 x i8> [[INACTIVE:%.*]]) +// CHECK-NEXT: ret <16 x i8> [[TMP2]] +// +int8x16_t test_vhsubq_m_n_s8(int8x16_t inactive, int8x16_t a, int8_t b, mve_pred16_t p) +{ +#ifdef POLYMORPHIC + return vhsubq_m(inactive, a, b, p); +#else /* POLYMORPHIC */ + return vhsubq_m_n_s8(inactive, a, b, p); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vhsubq_m_n_u16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <8 x i16> undef, i16 [[B:%.*]], i32 0 +// CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <8 x i16> [[DOTSPLATINSERT]], <8 x i16> undef, <8 x i32> zeroinitializer +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 
[[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = call <8 x i16> @llvm.arm.mve.hsub.predicated.v8i16.v8i1(<8 x i16> [[A:%.*]], <8 x i16> [[DOTSPLAT]], i32 1, <8 x i1> [[TMP1]], <8 x i16> [[INACTIVE:%.*]]) +// CHECK-NEXT: ret <8 x i16> [[TMP2]] +// +uint16x8_t test_vhsubq_m_n_u16(uint16x8_t inactive, uint16x8_t a, uint16_t b, mve_pred16_t p) +{ +#ifdef POLYMORPHIC + return vhsubq_m(inactive, a, b, p); +#else /* POLYMORPHIC */ + return vhsubq_m_n_u16(inactive, a, b, p); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vhsubq_m_n_s32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <4 x i32> undef, i32 [[B:%.*]], i32 0 +// CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <4 x i32> [[DOTSPLATINSERT]], <4 x i32> undef, <4 x i32> zeroinitializer +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = call <4 x i32> @llvm.arm.mve.hsub.predicated.v4i32.v4i1(<4 x i32> [[A:%.*]], <4 x i32> [[DOTSPLAT]], i32 0, <4 x i1> [[TMP1]], <4 x i32> [[INACTIVE:%.*]]) +// CHECK-NEXT: ret <4 x i32> [[TMP2]] +// +int32x4_t test_vhsubq_m_n_s32(int32x4_t inactive, int32x4_t a, int32_t b, mve_pred16_t p) +{ +#ifdef POLYMORPHIC + return vhsubq_m(inactive, a, b, p); +#else /* POLYMORPHIC */ + return vhsubq_m_n_s32(inactive, a, b, p); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vhsubq_x_n_u8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <16 x i8> undef, i8 [[B:%.*]], i32 0 +// CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <16 x i8> [[DOTSPLATINSERT]], <16 x i8> undef, <16 x i32> zeroinitializer +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = call <16 x i8> @llvm.arm.mve.hsub.predicated.v16i8.v16i1(<16 x i8> [[A:%.*]], <16 x i8> [[DOTSPLAT]], i32 1, <16 x i1> [[TMP1]], <16 x i8> undef) +// CHECK-NEXT: ret <16 x i8> [[TMP2]] +// +uint8x16_t test_vhsubq_x_n_u8(uint8x16_t a, uint8_t b, mve_pred16_t p) +{ +#ifdef POLYMORPHIC + return vhsubq_x(a, b, p); +#else /* POLYMORPHIC */ + return vhsubq_x_n_u8(a, b, p); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vhsubq_x_n_s16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <8 x i16> undef, i16 [[B:%.*]], i32 0 +// CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <8 x i16> [[DOTSPLATINSERT]], <8 x i16> undef, <8 x i32> zeroinitializer +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = call <8 x i16> @llvm.arm.mve.hsub.predicated.v8i16.v8i1(<8 x i16> [[A:%.*]], <8 x i16> [[DOTSPLAT]], i32 0, <8 x i1> [[TMP1]], <8 x i16> undef) +// CHECK-NEXT: ret <8 x i16> [[TMP2]] +// +int16x8_t test_vhsubq_x_n_s16(int16x8_t a, int16_t b, mve_pred16_t p) +{ +#ifdef POLYMORPHIC + return vhsubq_x(a, b, p); +#else /* POLYMORPHIC */ + return vhsubq_x_n_s16(a, b, p); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vhsubq_x_n_u32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <4 x i32> undef, i32 [[B:%.*]], i32 0 +// CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <4 x i32> [[DOTSPLATINSERT]], <4 x i32> undef, <4 x i32> zeroinitializer +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = 
call <4 x i32> @llvm.arm.mve.hsub.predicated.v4i32.v4i1(<4 x i32> [[A:%.*]], <4 x i32> [[DOTSPLAT]], i32 1, <4 x i1> [[TMP1]], <4 x i32> undef) +// CHECK-NEXT: ret <4 x i32> [[TMP2]] +// +uint32x4_t test_vhsubq_x_n_u32(uint32x4_t a, uint32_t b, mve_pred16_t p) +{ +#ifdef POLYMORPHIC + return vhsubq_x(a, b, p); +#else /* POLYMORPHIC */ + return vhsubq_x_n_u32(a, b, p); +#endif /* POLYMORPHIC */ +} diff --git a/clang/test/CodeGen/arm-mve-intrinsics/vmulq.c b/clang/test/CodeGen/arm-mve-intrinsics/vmulq.c index 536bc73..3619dab 100644 --- a/clang/test/CodeGen/arm-mve-intrinsics/vmulq.c +++ b/clang/test/CodeGen/arm-mve-intrinsics/vmulq.c @@ -1,6 +1,6 @@ // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py -// RUN: %clang_cc1 -triple thumbv8.1m.main-arm-none-eabi -target-feature +mve.fp -mfloat-abi hard -fallow-half-arguments-and-returns -O0 -disable-O0-optnone -S -emit-llvm -o - %s | opt -S -mem2reg | FileCheck %s -// RUN: %clang_cc1 -triple thumbv8.1m.main-arm-none-eabi -target-feature +mve.fp -mfloat-abi hard -fallow-half-arguments-and-returns -O0 -disable-O0-optnone -DPOLYMORPHIC -S -emit-llvm -o - %s | opt -S -mem2reg | FileCheck %s +// RUN: %clang_cc1 -triple thumbv8.1m.main-arm-none-eabi -target-feature +mve.fp -mfloat-abi hard -fallow-half-arguments-and-returns -O0 -disable-O0-optnone -S -emit-llvm -o - %s | opt -S -sroa | FileCheck %s +// RUN: %clang_cc1 -triple thumbv8.1m.main-arm-none-eabi -target-feature +mve.fp -mfloat-abi hard -fallow-half-arguments-and-returns -O0 -disable-O0-optnone -DPOLYMORPHIC -S -emit-llvm -o - %s | opt -S -sroa | FileCheck %s #include @@ -172,14 +172,14 @@ uint32x4_t test_vmulq_x_u32(uint32x4_t a, uint32x4_t b, mve_pred16_t p) #endif /* POLYMORPHIC */ } -// CHECK-LABEL: @test_vmulq_m_f32( +// CHECK-LABEL: @test_vmulq_x_f32( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 // CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]]) // CHECK-NEXT: [[TMP2:%.*]] = call <4 x float> @llvm.arm.mve.mul.predicated.v4f32.v4i1(<4 x float> [[A:%.*]], <4 x float> [[B:%.*]], <4 x i1> [[TMP1]], <4 x float> undef) // CHECK-NEXT: ret <4 x float> [[TMP2]] // -float32x4_t test_vmulq_m_f32(float32x4_t a, float32x4_t b, mve_pred16_t p) +float32x4_t test_vmulq_x_f32(float32x4_t a, float32x4_t b, mve_pred16_t p) { #ifdef POLYMORPHIC return vmulq_x(a, b, p); @@ -187,3 +187,213 @@ float32x4_t test_vmulq_m_f32(float32x4_t a, float32x4_t b, mve_pred16_t p) return vmulq_x_f32(a, b, p); #endif /* POLYMORPHIC */ } + +// CHECK-LABEL: @test_vmulq_n_u8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <16 x i8> undef, i8 [[B:%.*]], i32 0 +// CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <16 x i8> [[DOTSPLATINSERT]], <16 x i8> undef, <16 x i32> zeroinitializer +// CHECK-NEXT: [[TMP0:%.*]] = mul <16 x i8> [[A:%.*]], [[DOTSPLAT]] +// CHECK-NEXT: ret <16 x i8> [[TMP0]] +// +uint8x16_t test_vmulq_n_u8(uint8x16_t a, uint8_t b) +{ +#ifdef POLYMORPHIC + return vmulq(a, b); +#else /* POLYMORPHIC */ + return vmulq_n_u8(a, b); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vmulq_n_s16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <8 x i16> undef, i16 [[B:%.*]], i32 0 +// CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <8 x i16> [[DOTSPLATINSERT]], <8 x i16> undef, <8 x i32> zeroinitializer +// CHECK-NEXT: [[TMP0:%.*]] = mul <8 x i16> [[A:%.*]], [[DOTSPLAT]] +// CHECK-NEXT: ret <8 x i16> [[TMP0]] +// +int16x8_t test_vmulq_n_s16(int16x8_t a, int16_t b) +{ 
+#ifdef POLYMORPHIC + return vmulq(a, b); +#else /* POLYMORPHIC */ + return vmulq_n_s16(a, b); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vmulq_n_u32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <4 x i32> undef, i32 [[B:%.*]], i32 0 +// CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <4 x i32> [[DOTSPLATINSERT]], <4 x i32> undef, <4 x i32> zeroinitializer +// CHECK-NEXT: [[TMP0:%.*]] = mul <4 x i32> [[A:%.*]], [[DOTSPLAT]] +// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// +uint32x4_t test_vmulq_n_u32(uint32x4_t a, uint32_t b) +{ +#ifdef POLYMORPHIC + return vmulq(a, b); +#else /* POLYMORPHIC */ + return vmulq_n_u32(a, b); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vmulq_n_f32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <4 x float> undef, float [[B:%.*]], i32 0 +// CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <4 x float> [[DOTSPLATINSERT]], <4 x float> undef, <4 x i32> zeroinitializer +// CHECK-NEXT: [[TMP0:%.*]] = fmul <4 x float> [[A:%.*]], [[DOTSPLAT]] +// CHECK-NEXT: ret <4 x float> [[TMP0]] +// +float32x4_t test_vmulq_n_f32(float32x4_t a, float32_t b) +{ +#ifdef POLYMORPHIC + return vmulq(a, b); +#else /* POLYMORPHIC */ + return vmulq_n_f32(a, b); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vmulq_m_n_s8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <16 x i8> undef, i8 [[B:%.*]], i32 0 +// CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <16 x i8> [[DOTSPLATINSERT]], <16 x i8> undef, <16 x i32> zeroinitializer +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = call <16 x i8> @llvm.arm.mve.mul.predicated.v16i8.v16i1(<16 x i8> [[A:%.*]], <16 x i8> [[DOTSPLAT]], <16 x i1> [[TMP1]], <16 x i8> [[INACTIVE:%.*]]) +// CHECK-NEXT: ret <16 x i8> [[TMP2]] +// +int8x16_t test_vmulq_m_n_s8(int8x16_t inactive, int8x16_t a, int8_t b, mve_pred16_t p) +{ +#ifdef POLYMORPHIC + return vmulq_m(inactive, a, b, p); +#else /* POLYMORPHIC */ + return vmulq_m_n_s8(inactive, a, b, p); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vmulq_m_n_u16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <8 x i16> undef, i16 [[B:%.*]], i32 0 +// CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <8 x i16> [[DOTSPLATINSERT]], <8 x i16> undef, <8 x i32> zeroinitializer +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = call <8 x i16> @llvm.arm.mve.mul.predicated.v8i16.v8i1(<8 x i16> [[A:%.*]], <8 x i16> [[DOTSPLAT]], <8 x i1> [[TMP1]], <8 x i16> [[INACTIVE:%.*]]) +// CHECK-NEXT: ret <8 x i16> [[TMP2]] +// +uint16x8_t test_vmulq_m_n_u16(uint16x8_t inactive, uint16x8_t a, uint16_t b, mve_pred16_t p) +{ +#ifdef POLYMORPHIC + return vmulq_m(inactive, a, b, p); +#else /* POLYMORPHIC */ + return vmulq_m_n_u16(inactive, a, b, p); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vmulq_m_n_s32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <4 x i32> undef, i32 [[B:%.*]], i32 0 +// CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <4 x i32> [[DOTSPLATINSERT]], <4 x i32> undef, <4 x i32> zeroinitializer +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = call <4 x i32> 
@llvm.arm.mve.mul.predicated.v4i32.v4i1(<4 x i32> [[A:%.*]], <4 x i32> [[DOTSPLAT]], <4 x i1> [[TMP1]], <4 x i32> [[INACTIVE:%.*]]) +// CHECK-NEXT: ret <4 x i32> [[TMP2]] +// +int32x4_t test_vmulq_m_n_s32(int32x4_t inactive, int32x4_t a, int32_t b, mve_pred16_t p) +{ +#ifdef POLYMORPHIC + return vmulq_m(inactive, a, b, p); +#else /* POLYMORPHIC */ + return vmulq_m_n_s32(inactive, a, b, p); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vmulq_m_n_f16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast float [[B_COERCE:%.*]] to i32 +// CHECK-NEXT: [[TMP_0_EXTRACT_TRUNC:%.*]] = trunc i32 [[TMP0]] to i16 +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i16 [[TMP_0_EXTRACT_TRUNC]] to half +// CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <8 x half> undef, half [[TMP1]], i32 0 +// CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <8 x half> [[DOTSPLATINSERT]], <8 x half> undef, <8 x i32> zeroinitializer +// CHECK-NEXT: [[TMP2:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP3:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP2]]) +// CHECK-NEXT: [[TMP4:%.*]] = call <8 x half> @llvm.arm.mve.mul.predicated.v8f16.v8i1(<8 x half> [[A:%.*]], <8 x half> [[DOTSPLAT]], <8 x i1> [[TMP3]], <8 x half> [[INACTIVE:%.*]]) +// CHECK-NEXT: ret <8 x half> [[TMP4]] +// +float16x8_t test_vmulq_m_n_f16(float16x8_t inactive, float16x8_t a, float16_t b, mve_pred16_t p) +{ +#ifdef POLYMORPHIC + return vmulq_m(inactive, a, b, p); +#else /* POLYMORPHIC */ + return vmulq_m_n_f16(inactive, a, b, p); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vmulq_x_n_u8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <16 x i8> undef, i8 [[B:%.*]], i32 0 +// CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <16 x i8> [[DOTSPLATINSERT]], <16 x i8> undef, <16 x i32> zeroinitializer +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = call <16 x i8> @llvm.arm.mve.mul.predicated.v16i8.v16i1(<16 x i8> [[A:%.*]], <16 x i8> [[DOTSPLAT]], <16 x i1> [[TMP1]], <16 x i8> undef) +// CHECK-NEXT: ret <16 x i8> [[TMP2]] +// +uint8x16_t test_vmulq_x_n_u8(uint8x16_t a, uint8_t b, mve_pred16_t p) +{ +#ifdef POLYMORPHIC + return vmulq_x(a, b, p); +#else /* POLYMORPHIC */ + return vmulq_x_n_u8(a, b, p); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vmulq_x_n_s16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <8 x i16> undef, i16 [[B:%.*]], i32 0 +// CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <8 x i16> [[DOTSPLATINSERT]], <8 x i16> undef, <8 x i32> zeroinitializer +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = call <8 x i16> @llvm.arm.mve.mul.predicated.v8i16.v8i1(<8 x i16> [[A:%.*]], <8 x i16> [[DOTSPLAT]], <8 x i1> [[TMP1]], <8 x i16> undef) +// CHECK-NEXT: ret <8 x i16> [[TMP2]] +// +int16x8_t test_vmulq_x_n_s16(int16x8_t a, int16_t b, mve_pred16_t p) +{ +#ifdef POLYMORPHIC + return vmulq_x(a, b, p); +#else /* POLYMORPHIC */ + return vmulq_x_n_s16(a, b, p); +#endif /* POLYMORPHIC */ +} +// CHECK-LABEL: @test_vmulq_x_n_u32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <4 x i32> undef, i32 [[B:%.*]], i32 0 +// CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <4 x i32> [[DOTSPLATINSERT]], <4 x i32> undef, <4 x i32> zeroinitializer +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to 
i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = call <4 x i32> @llvm.arm.mve.mul.predicated.v4i32.v4i1(<4 x i32> [[A:%.*]], <4 x i32> [[DOTSPLAT]], <4 x i1> [[TMP1]], <4 x i32> undef) +// CHECK-NEXT: ret <4 x i32> [[TMP2]] +// +uint32x4_t test_vmulq_x_n_u32(uint32x4_t a, uint32_t b, mve_pred16_t p) +{ +#ifdef POLYMORPHIC + return vmulq_x(a, b, p); +#else /* POLYMORPHIC */ + return vmulq_x_n_u32(a, b, p); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vmulq_x_n_f32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <4 x float> undef, float [[B:%.*]], i32 0 +// CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <4 x float> [[DOTSPLATINSERT]], <4 x float> undef, <4 x i32> zeroinitializer +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = call <4 x float> @llvm.arm.mve.mul.predicated.v4f32.v4i1(<4 x float> [[A:%.*]], <4 x float> [[DOTSPLAT]], <4 x i1> [[TMP1]], <4 x float> undef) +// CHECK-NEXT: ret <4 x float> [[TMP2]] +// +float32x4_t test_vmulq_x_n_f32(float32x4_t a, float32_t b, mve_pred16_t p) +{ +#ifdef POLYMORPHIC + return vmulq_x(a, b, p); +#else /* POLYMORPHIC */ + return vmulq_x_n_f32(a, b, p); +#endif /* POLYMORPHIC */ +} diff --git a/clang/test/CodeGen/arm-mve-intrinsics/vqaddq.c b/clang/test/CodeGen/arm-mve-intrinsics/vqaddq.c index 8361ab3..8ed34dd 100644 --- a/clang/test/CodeGen/arm-mve-intrinsics/vqaddq.c +++ b/clang/test/CodeGen/arm-mve-intrinsics/vqaddq.c @@ -93,3 +93,105 @@ int32x4_t test_vqaddq_m_s32(int32x4_t inactive, int32x4_t a, int32x4_t b, mve_pr return vqaddq_m_s32(inactive, a, b, p); #endif /* POLYMORPHIC */ } + +// CHECK-LABEL: @test_vqaddq_n_u8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <16 x i8> undef, i8 [[B:%.*]], i32 0 +// CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <16 x i8> [[DOTSPLATINSERT]], <16 x i8> undef, <16 x i32> zeroinitializer +// CHECK-NEXT: [[TMP0:%.*]] = call <16 x i8> @llvm.uadd.sat.v16i8(<16 x i8> [[A:%.*]], <16 x i8> [[DOTSPLAT]]) +// CHECK-NEXT: ret <16 x i8> [[TMP0]] +// +uint8x16_t test_vqaddq_n_u8(uint8x16_t a, uint8_t b) +{ +#ifdef POLYMORPHIC + return vqaddq(a, b); +#else /* POLYMORPHIC */ + return vqaddq_n_u8(a, b); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vqaddq_n_s16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <8 x i16> undef, i16 [[B:%.*]], i32 0 +// CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <8 x i16> [[DOTSPLATINSERT]], <8 x i16> undef, <8 x i32> zeroinitializer +// CHECK-NEXT: [[TMP0:%.*]] = call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> [[A:%.*]], <8 x i16> [[DOTSPLAT]]) +// CHECK-NEXT: ret <8 x i16> [[TMP0]] +// +int16x8_t test_vqaddq_n_s16(int16x8_t a, int16_t b) +{ +#ifdef POLYMORPHIC + return vqaddq(a, b); +#else /* POLYMORPHIC */ + return vqaddq_n_s16(a, b); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vqaddq_n_u32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <4 x i32> undef, i32 [[B:%.*]], i32 0 +// CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <4 x i32> [[DOTSPLATINSERT]], <4 x i32> undef, <4 x i32> zeroinitializer +// CHECK-NEXT: [[TMP0:%.*]] = call <4 x i32> @llvm.uadd.sat.v4i32(<4 x i32> [[A:%.*]], <4 x i32> [[DOTSPLAT]]) +// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// +uint32x4_t test_vqaddq_n_u32(uint32x4_t a, uint32_t b) +{ +#ifdef POLYMORPHIC + return vqaddq(a, b); +#else 
/* POLYMORPHIC */ + return vqaddq_n_u32(a, b); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vqaddq_m_n_s8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <16 x i8> undef, i8 [[B:%.*]], i32 0 +// CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <16 x i8> [[DOTSPLATINSERT]], <16 x i8> undef, <16 x i32> zeroinitializer +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = call <16 x i8> @llvm.arm.mve.qadd.predicated.v16i8.v16i1(<16 x i8> [[A:%.*]], <16 x i8> [[DOTSPLAT]], i32 0, <16 x i1> [[TMP1]], <16 x i8> [[INACTIVE:%.*]]) +// CHECK-NEXT: ret <16 x i8> [[TMP2]] +// +int8x16_t test_vqaddq_m_n_s8(int8x16_t inactive, int8x16_t a, int8_t b, mve_pred16_t p) +{ +#ifdef POLYMORPHIC + return vqaddq_m(inactive, a, b, p); +#else /* POLYMORPHIC */ + return vqaddq_m_n_s8(inactive, a, b, p); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vqaddq_m_n_u16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <8 x i16> undef, i16 [[B:%.*]], i32 0 +// CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <8 x i16> [[DOTSPLATINSERT]], <8 x i16> undef, <8 x i32> zeroinitializer +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = call <8 x i16> @llvm.arm.mve.qadd.predicated.v8i16.v8i1(<8 x i16> [[A:%.*]], <8 x i16> [[DOTSPLAT]], i32 1, <8 x i1> [[TMP1]], <8 x i16> [[INACTIVE:%.*]]) +// CHECK-NEXT: ret <8 x i16> [[TMP2]] +// +uint16x8_t test_vqaddq_m_n_u16(uint16x8_t inactive, uint16x8_t a, uint16_t b, mve_pred16_t p) +{ +#ifdef POLYMORPHIC + return vqaddq_m(inactive, a, b, p); +#else /* POLYMORPHIC */ + return vqaddq_m_n_u16(inactive, a, b, p); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vqaddq_m_n_s32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <4 x i32> undef, i32 [[B:%.*]], i32 0 +// CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <4 x i32> [[DOTSPLATINSERT]], <4 x i32> undef, <4 x i32> zeroinitializer +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = call <4 x i32> @llvm.arm.mve.qadd.predicated.v4i32.v4i1(<4 x i32> [[A:%.*]], <4 x i32> [[DOTSPLAT]], i32 0, <4 x i1> [[TMP1]], <4 x i32> [[INACTIVE:%.*]]) +// CHECK-NEXT: ret <4 x i32> [[TMP2]] +// +int32x4_t test_vqaddq_m_n_s32(int32x4_t inactive, int32x4_t a, int32_t b, mve_pred16_t p) +{ +#ifdef POLYMORPHIC + return vqaddq_m(inactive, a, b, p); +#else /* POLYMORPHIC */ + return vqaddq_m_n_s32(inactive, a, b, p); +#endif /* POLYMORPHIC */ +} diff --git a/clang/test/CodeGen/arm-mve-intrinsics/vqdmulhq.c b/clang/test/CodeGen/arm-mve-intrinsics/vqdmulhq.c index eb7e0a0..bab911f 100644 --- a/clang/test/CodeGen/arm-mve-intrinsics/vqdmulhq.c +++ b/clang/test/CodeGen/arm-mve-intrinsics/vqdmulhq.c @@ -93,3 +93,105 @@ int32x4_t test_vqdmulhq_m_s32(int32x4_t inactive, int32x4_t a, int32x4_t b, mve_ return vqdmulhq_m_s32(inactive, a, b, p); #endif /* POLYMORPHIC */ } + +// CHECK-LABEL: @test_vqdmulhq_n_s8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <16 x i8> undef, i8 [[B:%.*]], i32 0 +// CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <16 x i8> [[DOTSPLATINSERT]], <16 x i8> undef, <16 x i32> zeroinitializer +// CHECK-NEXT: [[TMP0:%.*]] = call <16 x i8> 
@llvm.arm.mve.vqdmulh.v16i8(<16 x i8> [[A:%.*]], <16 x i8> [[DOTSPLAT]]) +// CHECK-NEXT: ret <16 x i8> [[TMP0]] +// +int8x16_t test_vqdmulhq_n_s8(int8x16_t a, int8_t b) +{ +#ifdef POLYMORPHIC + return vqdmulhq(a, b); +#else /* POLYMORPHIC */ + return vqdmulhq_n_s8(a, b); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vqdmulhq_n_s16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <8 x i16> undef, i16 [[B:%.*]], i32 0 +// CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <8 x i16> [[DOTSPLATINSERT]], <8 x i16> undef, <8 x i32> zeroinitializer +// CHECK-NEXT: [[TMP0:%.*]] = call <8 x i16> @llvm.arm.mve.vqdmulh.v8i16(<8 x i16> [[A:%.*]], <8 x i16> [[DOTSPLAT]]) +// CHECK-NEXT: ret <8 x i16> [[TMP0]] +// +int16x8_t test_vqdmulhq_n_s16(int16x8_t a, int16_t b) +{ +#ifdef POLYMORPHIC + return vqdmulhq(a, b); +#else /* POLYMORPHIC */ + return vqdmulhq_n_s16(a, b); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vqdmulhq_n_s32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <4 x i32> undef, i32 [[B:%.*]], i32 0 +// CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <4 x i32> [[DOTSPLATINSERT]], <4 x i32> undef, <4 x i32> zeroinitializer +// CHECK-NEXT: [[TMP0:%.*]] = call <4 x i32> @llvm.arm.mve.vqdmulh.v4i32(<4 x i32> [[A:%.*]], <4 x i32> [[DOTSPLAT]]) +// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// +int32x4_t test_vqdmulhq_n_s32(int32x4_t a, int32_t b) +{ +#ifdef POLYMORPHIC + return vqdmulhq(a, b); +#else /* POLYMORPHIC */ + return vqdmulhq_n_s32(a, b); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vqdmulhq_m_n_s8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <16 x i8> undef, i8 [[B:%.*]], i32 0 +// CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <16 x i8> [[DOTSPLATINSERT]], <16 x i8> undef, <16 x i32> zeroinitializer +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = call <16 x i8> @llvm.arm.mve.qdmulh.predicated.v16i8.v16i1(<16 x i8> [[A:%.*]], <16 x i8> [[DOTSPLAT]], <16 x i1> [[TMP1]], <16 x i8> [[INACTIVE:%.*]]) +// CHECK-NEXT: ret <16 x i8> [[TMP2]] +// +int8x16_t test_vqdmulhq_m_n_s8(int8x16_t inactive, int8x16_t a, int8_t b, mve_pred16_t p) +{ +#ifdef POLYMORPHIC + return vqdmulhq_m(inactive, a, b, p); +#else /* POLYMORPHIC */ + return vqdmulhq_m_n_s8(inactive, a, b, p); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vqdmulhq_m_n_s16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <8 x i16> undef, i16 [[B:%.*]], i32 0 +// CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <8 x i16> [[DOTSPLATINSERT]], <8 x i16> undef, <8 x i32> zeroinitializer +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = call <8 x i16> @llvm.arm.mve.qdmulh.predicated.v8i16.v8i1(<8 x i16> [[A:%.*]], <8 x i16> [[DOTSPLAT]], <8 x i1> [[TMP1]], <8 x i16> [[INACTIVE:%.*]]) +// CHECK-NEXT: ret <8 x i16> [[TMP2]] +// +int16x8_t test_vqdmulhq_m_n_s16(int16x8_t inactive, int16x8_t a, int16_t b, mve_pred16_t p) +{ +#ifdef POLYMORPHIC + return vqdmulhq_m(inactive, a, b, p); +#else /* POLYMORPHIC */ + return vqdmulhq_m_n_s16(inactive, a, b, p); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vqdmulhq_m_n_s32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <4 x i32> undef, i32 [[B:%.*]], i32 0 +// CHECK-NEXT: 
[[DOTSPLAT:%.*]] = shufflevector <4 x i32> [[DOTSPLATINSERT]], <4 x i32> undef, <4 x i32> zeroinitializer +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = call <4 x i32> @llvm.arm.mve.qdmulh.predicated.v4i32.v4i1(<4 x i32> [[A:%.*]], <4 x i32> [[DOTSPLAT]], <4 x i1> [[TMP1]], <4 x i32> [[INACTIVE:%.*]]) +// CHECK-NEXT: ret <4 x i32> [[TMP2]] +// +int32x4_t test_vqdmulhq_m_n_s32(int32x4_t inactive, int32x4_t a, int32_t b, mve_pred16_t p) +{ +#ifdef POLYMORPHIC + return vqdmulhq_m(inactive, a, b, p); +#else /* POLYMORPHIC */ + return vqdmulhq_m_n_s32(inactive, a, b, p); +#endif /* POLYMORPHIC */ +} diff --git a/clang/test/CodeGen/arm-mve-intrinsics/vqrdmulhq.c b/clang/test/CodeGen/arm-mve-intrinsics/vqrdmulhq.c index 27b7efd3..919074e 100644 --- a/clang/test/CodeGen/arm-mve-intrinsics/vqrdmulhq.c +++ b/clang/test/CodeGen/arm-mve-intrinsics/vqrdmulhq.c @@ -93,3 +93,105 @@ int32x4_t test_vqrdmulhq_m_s32(int32x4_t inactive, int32x4_t a, int32x4_t b, mve return vqrdmulhq_m_s32(inactive, a, b, p); #endif /* POLYMORPHIC */ } + +// CHECK-LABEL: @test_vqrdmulhq_n_s8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <16 x i8> undef, i8 [[B:%.*]], i32 0 +// CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <16 x i8> [[DOTSPLATINSERT]], <16 x i8> undef, <16 x i32> zeroinitializer +// CHECK-NEXT: [[TMP0:%.*]] = call <16 x i8> @llvm.arm.mve.vqrdmulh.v16i8(<16 x i8> [[A:%.*]], <16 x i8> [[DOTSPLAT]]) +// CHECK-NEXT: ret <16 x i8> [[TMP0]] +// +int8x16_t test_vqrdmulhq_n_s8(int8x16_t a, int8_t b) +{ +#ifdef POLYMORPHIC + return vqrdmulhq(a, b); +#else /* POLYMORPHIC */ + return vqrdmulhq_n_s8(a, b); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vqrdmulhq_n_s16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <8 x i16> undef, i16 [[B:%.*]], i32 0 +// CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <8 x i16> [[DOTSPLATINSERT]], <8 x i16> undef, <8 x i32> zeroinitializer +// CHECK-NEXT: [[TMP0:%.*]] = call <8 x i16> @llvm.arm.mve.vqrdmulh.v8i16(<8 x i16> [[A:%.*]], <8 x i16> [[DOTSPLAT]]) +// CHECK-NEXT: ret <8 x i16> [[TMP0]] +// +int16x8_t test_vqrdmulhq_n_s16(int16x8_t a, int16_t b) +{ +#ifdef POLYMORPHIC + return vqrdmulhq(a, b); +#else /* POLYMORPHIC */ + return vqrdmulhq_n_s16(a, b); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vqrdmulhq_n_s32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <4 x i32> undef, i32 [[B:%.*]], i32 0 +// CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <4 x i32> [[DOTSPLATINSERT]], <4 x i32> undef, <4 x i32> zeroinitializer +// CHECK-NEXT: [[TMP0:%.*]] = call <4 x i32> @llvm.arm.mve.vqrdmulh.v4i32(<4 x i32> [[A:%.*]], <4 x i32> [[DOTSPLAT]]) +// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// +int32x4_t test_vqrdmulhq_n_s32(int32x4_t a, int32_t b) +{ +#ifdef POLYMORPHIC + return vqrdmulhq(a, b); +#else /* POLYMORPHIC */ + return vqrdmulhq_n_s32(a, b); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vqrdmulhq_m_n_s8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <16 x i8> undef, i8 [[B:%.*]], i32 0 +// CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <16 x i8> [[DOTSPLATINSERT]], <16 x i8> undef, <16 x i32> zeroinitializer +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = call <16 x i8> 
@llvm.arm.mve.qrdmulh.predicated.v16i8.v16i1(<16 x i8> [[A:%.*]], <16 x i8> [[DOTSPLAT]], <16 x i1> [[TMP1]], <16 x i8> [[INACTIVE:%.*]]) +// CHECK-NEXT: ret <16 x i8> [[TMP2]] +// +int8x16_t test_vqrdmulhq_m_n_s8(int8x16_t inactive, int8x16_t a, int8_t b, mve_pred16_t p) +{ +#ifdef POLYMORPHIC + return vqrdmulhq_m(inactive, a, b, p); +#else /* POLYMORPHIC */ + return vqrdmulhq_m_n_s8(inactive, a, b, p); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vqrdmulhq_m_n_s16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <8 x i16> undef, i16 [[B:%.*]], i32 0 +// CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <8 x i16> [[DOTSPLATINSERT]], <8 x i16> undef, <8 x i32> zeroinitializer +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = call <8 x i16> @llvm.arm.mve.qrdmulh.predicated.v8i16.v8i1(<8 x i16> [[A:%.*]], <8 x i16> [[DOTSPLAT]], <8 x i1> [[TMP1]], <8 x i16> [[INACTIVE:%.*]]) +// CHECK-NEXT: ret <8 x i16> [[TMP2]] +// +int16x8_t test_vqrdmulhq_m_n_s16(int16x8_t inactive, int16x8_t a, int16_t b, mve_pred16_t p) +{ +#ifdef POLYMORPHIC + return vqrdmulhq_m(inactive, a, b, p); +#else /* POLYMORPHIC */ + return vqrdmulhq_m_n_s16(inactive, a, b, p); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vqrdmulhq_m_n_s32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <4 x i32> undef, i32 [[B:%.*]], i32 0 +// CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <4 x i32> [[DOTSPLATINSERT]], <4 x i32> undef, <4 x i32> zeroinitializer +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = call <4 x i32> @llvm.arm.mve.qrdmulh.predicated.v4i32.v4i1(<4 x i32> [[A:%.*]], <4 x i32> [[DOTSPLAT]], <4 x i1> [[TMP1]], <4 x i32> [[INACTIVE:%.*]]) +// CHECK-NEXT: ret <4 x i32> [[TMP2]] +// +int32x4_t test_vqrdmulhq_m_n_s32(int32x4_t inactive, int32x4_t a, int32_t b, mve_pred16_t p) +{ +#ifdef POLYMORPHIC + return vqrdmulhq_m(inactive, a, b, p); +#else /* POLYMORPHIC */ + return vqrdmulhq_m_n_s32(inactive, a, b, p); +#endif /* POLYMORPHIC */ +} diff --git a/clang/test/CodeGen/arm-mve-intrinsics/vqsubq.c b/clang/test/CodeGen/arm-mve-intrinsics/vqsubq.c index c8924bf..28ced90 100644 --- a/clang/test/CodeGen/arm-mve-intrinsics/vqsubq.c +++ b/clang/test/CodeGen/arm-mve-intrinsics/vqsubq.c @@ -93,3 +93,105 @@ int32x4_t test_vqsubq_m_s32(int32x4_t inactive, int32x4_t a, int32x4_t b, mve_pr return vqsubq_m_s32(inactive, a, b, p); #endif /* POLYMORPHIC */ } + +// CHECK-LABEL: @test_vqsubq_n_u8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <16 x i8> undef, i8 [[B:%.*]], i32 0 +// CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <16 x i8> [[DOTSPLATINSERT]], <16 x i8> undef, <16 x i32> zeroinitializer +// CHECK-NEXT: [[TMP0:%.*]] = call <16 x i8> @llvm.usub.sat.v16i8(<16 x i8> [[A:%.*]], <16 x i8> [[DOTSPLAT]]) +// CHECK-NEXT: ret <16 x i8> [[TMP0]] +// +uint8x16_t test_vqsubq_n_u8(uint8x16_t a, uint8_t b) +{ +#ifdef POLYMORPHIC + return vqsubq(a, b); +#else /* POLYMORPHIC */ + return vqsubq_n_u8(a, b); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vqsubq_n_s16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <8 x i16> undef, i16 [[B:%.*]], i32 0 +// CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <8 x i16> [[DOTSPLATINSERT]], <8 x i16> undef, <8 x i32> 
zeroinitializer +// CHECK-NEXT: [[TMP0:%.*]] = call <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16> [[A:%.*]], <8 x i16> [[DOTSPLAT]]) +// CHECK-NEXT: ret <8 x i16> [[TMP0]] +// +int16x8_t test_vqsubq_n_s16(int16x8_t a, int16_t b) +{ +#ifdef POLYMORPHIC + return vqsubq(a, b); +#else /* POLYMORPHIC */ + return vqsubq_n_s16(a, b); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vqsubq_n_u32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <4 x i32> undef, i32 [[B:%.*]], i32 0 +// CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <4 x i32> [[DOTSPLATINSERT]], <4 x i32> undef, <4 x i32> zeroinitializer +// CHECK-NEXT: [[TMP0:%.*]] = call <4 x i32> @llvm.usub.sat.v4i32(<4 x i32> [[A:%.*]], <4 x i32> [[DOTSPLAT]]) +// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// +uint32x4_t test_vqsubq_n_u32(uint32x4_t a, uint32_t b) +{ +#ifdef POLYMORPHIC + return vqsubq(a, b); +#else /* POLYMORPHIC */ + return vqsubq_n_u32(a, b); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vqsubq_m_n_s8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <16 x i8> undef, i8 [[B:%.*]], i32 0 +// CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <16 x i8> [[DOTSPLATINSERT]], <16 x i8> undef, <16 x i32> zeroinitializer +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = call <16 x i8> @llvm.arm.mve.qsub.predicated.v16i8.v16i1(<16 x i8> [[A:%.*]], <16 x i8> [[DOTSPLAT]], i32 0, <16 x i1> [[TMP1]], <16 x i8> [[INACTIVE:%.*]]) +// CHECK-NEXT: ret <16 x i8> [[TMP2]] +// +int8x16_t test_vqsubq_m_n_s8(int8x16_t inactive, int8x16_t a, int8_t b, mve_pred16_t p) +{ +#ifdef POLYMORPHIC + return vqsubq_m(inactive, a, b, p); +#else /* POLYMORPHIC */ + return vqsubq_m_n_s8(inactive, a, b, p); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vqsubq_m_n_u16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <8 x i16> undef, i16 [[B:%.*]], i32 0 +// CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <8 x i16> [[DOTSPLATINSERT]], <8 x i16> undef, <8 x i32> zeroinitializer +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = call <8 x i16> @llvm.arm.mve.qsub.predicated.v8i16.v8i1(<8 x i16> [[A:%.*]], <8 x i16> [[DOTSPLAT]], i32 1, <8 x i1> [[TMP1]], <8 x i16> [[INACTIVE:%.*]]) +// CHECK-NEXT: ret <8 x i16> [[TMP2]] +// +uint16x8_t test_vqsubq_m_n_u16(uint16x8_t inactive, uint16x8_t a, uint16_t b, mve_pred16_t p) +{ +#ifdef POLYMORPHIC + return vqsubq_m(inactive, a, b, p); +#else /* POLYMORPHIC */ + return vqsubq_m_n_u16(inactive, a, b, p); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vqsubq_m_n_s32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <4 x i32> undef, i32 [[B:%.*]], i32 0 +// CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <4 x i32> [[DOTSPLATINSERT]], <4 x i32> undef, <4 x i32> zeroinitializer +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = call <4 x i32> @llvm.arm.mve.qsub.predicated.v4i32.v4i1(<4 x i32> [[A:%.*]], <4 x i32> [[DOTSPLAT]], i32 0, <4 x i1> [[TMP1]], <4 x i32> [[INACTIVE:%.*]]) +// CHECK-NEXT: ret <4 x i32> [[TMP2]] +// +int32x4_t test_vqsubq_m_n_s32(int32x4_t inactive, int32x4_t a, int32_t b, mve_pred16_t p) +{ +#ifdef POLYMORPHIC + return 
vqsubq_m(inactive, a, b, p); +#else /* POLYMORPHIC */ + return vqsubq_m_n_s32(inactive, a, b, p); +#endif /* POLYMORPHIC */ +} diff --git a/clang/test/CodeGen/arm-mve-intrinsics/vsubq.c b/clang/test/CodeGen/arm-mve-intrinsics/vsubq.c index dd11aa8..7231ae9 100644 --- a/clang/test/CodeGen/arm-mve-intrinsics/vsubq.c +++ b/clang/test/CodeGen/arm-mve-intrinsics/vsubq.c @@ -1,6 +1,6 @@ // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py -// RUN: %clang_cc1 -triple thumbv8.1m.main-arm-none-eabi -target-feature +mve.fp -mfloat-abi hard -fallow-half-arguments-and-returns -O0 -disable-O0-optnone -S -emit-llvm -o - %s | opt -S -mem2reg | FileCheck %s -// RUN: %clang_cc1 -triple thumbv8.1m.main-arm-none-eabi -target-feature +mve.fp -mfloat-abi hard -fallow-half-arguments-and-returns -O0 -disable-O0-optnone -DPOLYMORPHIC -S -emit-llvm -o - %s | opt -S -mem2reg | FileCheck %s +// RUN: %clang_cc1 -triple thumbv8.1m.main-arm-none-eabi -target-feature +mve.fp -mfloat-abi hard -fallow-half-arguments-and-returns -O0 -disable-O0-optnone -S -emit-llvm -o - %s | opt -S -sroa | FileCheck %s +// RUN: %clang_cc1 -triple thumbv8.1m.main-arm-none-eabi -target-feature +mve.fp -mfloat-abi hard -fallow-half-arguments-and-returns -O0 -disable-O0-optnone -DPOLYMORPHIC -S -emit-llvm -o - %s | opt -S -sroa | FileCheck %s #include @@ -95,3 +95,113 @@ float16x8_t test_vsubq_x_f16(float16x8_t a, float16x8_t b, mve_pred16_t p) return vsubq_x_f16(a, b, p); #endif /* POLYMORPHIC */ } + +// CHECK-LABEL: @test_vsubq_n_u32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <4 x i32> undef, i32 [[B:%.*]], i32 0 +// CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <4 x i32> [[DOTSPLATINSERT]], <4 x i32> undef, <4 x i32> zeroinitializer +// CHECK-NEXT: [[TMP0:%.*]] = sub <4 x i32> [[A:%.*]], [[DOTSPLAT]] +// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// +uint32x4_t test_vsubq_n_u32(uint32x4_t a, uint32_t b) +{ +#ifdef POLYMORPHIC + return vsubq(a, b); +#else /* POLYMORPHIC */ + return vsubq_n_u32(a, b); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vsubq_n_f16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast float [[B_COERCE:%.*]] to i32 +// CHECK-NEXT: [[TMP_0_EXTRACT_TRUNC:%.*]] = trunc i32 [[TMP0]] to i16 +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i16 [[TMP_0_EXTRACT_TRUNC]] to half +// CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <8 x half> undef, half [[TMP1]], i32 0 +// CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <8 x half> [[DOTSPLATINSERT]], <8 x half> undef, <8 x i32> zeroinitializer +// CHECK-NEXT: [[TMP2:%.*]] = fsub <8 x half> [[A:%.*]], [[DOTSPLAT]] +// CHECK-NEXT: ret <8 x half> [[TMP2]] +// +float16x8_t test_vsubq_n_f16(float16x8_t a, float16_t b) +{ +#ifdef POLYMORPHIC + return vsubq(a, b); +#else /* POLYMORPHIC */ + return vsubq_n_f16(a, b); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vsubq_m_n_s8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <16 x i8> undef, i8 [[B:%.*]], i32 0 +// CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <16 x i8> [[DOTSPLATINSERT]], <16 x i8> undef, <16 x i32> zeroinitializer +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = call <16 x i8> @llvm.arm.mve.sub.predicated.v16i8.v16i1(<16 x i8> [[A:%.*]], <16 x i8> [[DOTSPLAT]], <16 x i1> [[TMP1]], <16 x i8> [[INACTIVE:%.*]]) +// CHECK-NEXT: ret <16 x i8> [[TMP2]] +// +int8x16_t test_vsubq_m_n_s8(int8x16_t 
inactive, int8x16_t a, int8_t b, mve_pred16_t p) +{ +#ifdef POLYMORPHIC + return vsubq_m(inactive, a, b, p); +#else /* POLYMORPHIC */ + return vsubq_m_n_s8(inactive, a, b, p); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vsubq_m_n_f32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <4 x float> undef, float [[B:%.*]], i32 0 +// CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <4 x float> [[DOTSPLATINSERT]], <4 x float> undef, <4 x i32> zeroinitializer +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = call <4 x float> @llvm.arm.mve.sub.predicated.v4f32.v4i1(<4 x float> [[A:%.*]], <4 x float> [[DOTSPLAT]], <4 x i1> [[TMP1]], <4 x float> [[INACTIVE:%.*]]) +// CHECK-NEXT: ret <4 x float> [[TMP2]] +// +float32x4_t test_vsubq_m_n_f32(float32x4_t inactive, float32x4_t a, float32_t b, mve_pred16_t p) +{ +#ifdef POLYMORPHIC + return vsubq_m(inactive, a, b, p); +#else /* POLYMORPHIC */ + return vsubq_m_n_f32(inactive, a, b, p); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vsubq_x_n_u16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <8 x i16> undef, i16 [[B:%.*]], i32 0 +// CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <8 x i16> [[DOTSPLATINSERT]], <8 x i16> undef, <8 x i32> zeroinitializer +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = call <8 x i16> @llvm.arm.mve.sub.predicated.v8i16.v8i1(<8 x i16> [[A:%.*]], <8 x i16> [[DOTSPLAT]], <8 x i1> [[TMP1]], <8 x i16> undef) +// CHECK-NEXT: ret <8 x i16> [[TMP2]] +// +uint16x8_t test_vsubq_x_n_u16(uint16x8_t a, uint16_t b, mve_pred16_t p) +{ +#ifdef POLYMORPHIC + return vsubq_x(a, b, p); +#else /* POLYMORPHIC */ + return vsubq_x_n_u16(a, b, p); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vsubq_x_n_f16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast float [[B_COERCE:%.*]] to i32 +// CHECK-NEXT: [[TMP_0_EXTRACT_TRUNC:%.*]] = trunc i32 [[TMP0]] to i16 +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i16 [[TMP_0_EXTRACT_TRUNC]] to half +// CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <8 x half> undef, half [[TMP1]], i32 0 +// CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <8 x half> [[DOTSPLATINSERT]], <8 x half> undef, <8 x i32> zeroinitializer +// CHECK-NEXT: [[TMP2:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP3:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP2]]) +// CHECK-NEXT: [[TMP4:%.*]] = call <8 x half> @llvm.arm.mve.sub.predicated.v8f16.v8i1(<8 x half> [[A:%.*]], <8 x half> [[DOTSPLAT]], <8 x i1> [[TMP3]], <8 x half> undef) +// CHECK-NEXT: ret <8 x half> [[TMP4]] +// +float16x8_t test_vsubq_x_n_f16(float16x8_t a, float16_t b, mve_pred16_t p) +{ +#ifdef POLYMORPHIC + return vsubq_x(a, b, p); +#else /* POLYMORPHIC */ + return vsubq_x_n_f16(a, b, p); +#endif /* POLYMORPHIC */ +} diff --git a/llvm/lib/Target/ARM/ARMInstrMVE.td b/llvm/lib/Target/ARM/ARMInstrMVE.td index 8a03c50..5a2bb9c 100644 --- a/llvm/lib/Target/ARM/ARMInstrMVE.td +++ b/llvm/lib/Target/ARM/ARMInstrMVE.td @@ -4480,10 +4480,65 @@ class MVE_qDest_single_rSrc pattern=[]> let Inst{3-0} = Rm{3-0}; } +// Patterns for vector-scalar instructions with integer operands +multiclass MVE_vec_scalar_int_pat_m { + defvar UnpredSign = !if(unpred_has_sign, (? (i32 VTI.Unsigned)), (?)); + defvar PredSign = !if(pred_has_sign, (? 
(i32 VTI.Unsigned)), (?));
+
+ let Predicates = [HasMVEInt] in {
+ // Unpredicated version
+ def : Pat<(VTI.Vec !con((unpred_op (VTI.Vec MQPR:$Qm),
+ (VTI.Vec (ARMvdup rGPR:$val))),
+ UnpredSign)),
+ (VTI.Vec (inst (VTI.Vec MQPR:$Qm), (i32 rGPR:$val)))>;
+ // Predicated version
+ def : Pat<(VTI.Vec !con((pred_op (VTI.Vec MQPR:$Qm),
+ (VTI.Vec (ARMvdup rGPR:$val))),
+ PredSign,
+ (pred_op (VTI.Pred VCCR:$mask),
+ (VTI.Vec MQPR:$inactive)))),
+ (VTI.Vec (inst (VTI.Vec MQPR:$Qm), (i32 rGPR:$val),
+ ARMVCCThen, (VTI.Pred VCCR:$mask),
+ (VTI.Vec MQPR:$inactive)))>;
+ }
+}
+
+// Patterns for vector-scalar instructions with FP operands
+multiclass MVE_vec_scalar_fp_pat_m {
+ let Predicates = [HasMVEFloat] in {
+ // Unpredicated F16
+ def : Pat<(v8f16 (unpred_op (v8f16 MQPR:$Qm), (v8f16 (ARMvdup HPR:$val)))),
+ (v8f16 (instr_f16 (v8f16 MQPR:$Qm),
+ (i32 (COPY_TO_REGCLASS (f16 HPR:$val), rGPR))))>;
+ // Unpredicated F32
+ def : Pat<(v4f32 (unpred_op (v4f32 MQPR:$Qm), (v4f32 (ARMvdup SPR:$val)))),
+ (v4f32 (instr_f32 (v4f32 MQPR:$Qm),
+ (i32 (COPY_TO_REGCLASS (f32 SPR:$val), rGPR))))>;
+ // Predicated F16
+ def : Pat<(v8f16 (pred_int (v8f16 MQPR:$Qm), (v8f16 (ARMvdup HPR:$val)),
+ (v8i1 VCCR:$mask), (v8f16 MQPR:$inactive))),
+ (v8f16 (instr_f16 (v8f16 MQPR:$Qm),
+ (i32 (COPY_TO_REGCLASS (f16 HPR:$val), rGPR)),
+ ARMVCCThen, (v8i1 VCCR:$mask),
+ (v8f16 MQPR:$inactive)))>;
+ // Predicated F32
+ def : Pat<(v4f32 (pred_int (v4f32 MQPR:$Qm), (v4f32 (ARMvdup SPR:$val)),
+ (v4i1 VCCR:$mask), (v4f32 MQPR:$inactive))),
+ (v4f32 (instr_f32 (v4f32 MQPR:$Qm),
+ (i32 (COPY_TO_REGCLASS (f32 SPR:$val), rGPR)),
+ ARMVCCThen, (v4i1 VCCR:$mask),
+ (v4f32 MQPR:$inactive)))>;
+ }
+}
+
 class MVE_VADDSUB_qr size,
- bit bit_5, bit bit_12, bit bit_16,
- bit bit_28, list pattern=[]>
- : MVE_qDest_rSrc {
+ bit bit_5, bit bit_12, bit bit_16, bit bit_28>
+ : MVE_qDest_rSrc {
 let Inst{28} = bit_28;
 let Inst{21-20} = size;
@@ -4494,42 +4549,60 @@ class MVE_VADDSUB_qr size,
 let validForTailPredication = 1;
 }
-multiclass MVE_VADDSUB_qr_sizes pattern=[]> {
- def "8" : MVE_VADDSUB_qr;
- def "16" : MVE_VADDSUB_qr;
- def "32" : MVE_VADDSUB_qr;
-}
-
-defm MVE_VADD_qr_i : MVE_VADDSUB_qr_sizes<"vadd", "i", 0b0, 0b0, 0b1, 0b0>;
-defm MVE_VQADD_qr_s : MVE_VADDSUB_qr_sizes<"vqadd", "s", 0b1, 0b0, 0b0, 0b0>;
-defm MVE_VQADD_qr_u : MVE_VADDSUB_qr_sizes<"vqadd", "u", 0b1, 0b0, 0b0, 0b1>;
-
-defm MVE_VSUB_qr_i : MVE_VADDSUB_qr_sizes<"vsub", "i", 0b0, 0b1, 0b1, 0b0>;
-defm MVE_VQSUB_qr_s : MVE_VADDSUB_qr_sizes<"vqsub", "s", 0b1, 0b1, 0b0, 0b0>;
-defm MVE_VQSUB_qr_u : MVE_VADDSUB_qr_sizes<"vqsub", "u", 0b1, 0b1, 0b0, 0b1>;
-
-let Predicates = [HasMVEInt] in {
- def : Pat<(v16i8 (add (v16i8 MQPR:$val1), (v16i8 (ARMvdup GPR:$val2)))),
- (v16i8 (MVE_VADD_qr_i8 (v16i8 MQPR:$val1), (i32 GPR:$val2)))>;
- def : Pat<(v8i16 (add (v8i16 MQPR:$val1), (v8i16 (ARMvdup GPR:$val2)))),
- (v8i16 (MVE_VADD_qr_i16 (v8i16 MQPR:$val1), (i32 GPR:$val2)))>;
- def : Pat<(v4i32 (add (v4i32 MQPR:$val1), (v4i32 (ARMvdup GPR:$val2)))),
- (v4i32 (MVE_VADD_qr_i32 (v4i32 MQPR:$val1), (i32 GPR:$val2)))>;
-}
-
-let Predicates = [HasMVEInt] in {
- def : Pat<(v16i8 (sub (v16i8 MQPR:$val1), (v16i8 (ARMvdup GPR:$val2)))),
- (v16i8 (MVE_VSUB_qr_i8 (v16i8 MQPR:$val1), (i32 GPR:$val2)))>;
- def : Pat<(v8i16 (sub (v8i16 MQPR:$val1), (v8i16 (ARMvdup GPR:$val2)))),
- (v8i16 (MVE_VSUB_qr_i16 (v8i16 MQPR:$val1), (i32 GPR:$val2)))>;
- def : Pat<(v4i32 (sub (v4i32 MQPR:$val1), (v4i32 (ARMvdup GPR:$val2)))),
- (v4i32 (MVE_VSUB_qr_i32 (v4i32 MQPR:$val1), (i32 GPR:$val2)))>;
-}
+// Vector-scalar
add/sub +multiclass MVE_VADDSUB_qr_m { + def "" : MVE_VADDSUB_qr; + defm : MVE_vec_scalar_int_pat_m(NAME), VTI, + unpred_op, pred_int>; +} + +multiclass MVE_VADD_qr_m + : MVE_VADDSUB_qr_m<"vadd", VTI, 0b0, add, int_arm_mve_add_predicated>; + +multiclass MVE_VSUB_qr_m + : MVE_VADDSUB_qr_m<"vsub", VTI, 0b1, sub, int_arm_mve_sub_predicated>; + +defm MVE_VADD_qr_i8 : MVE_VADD_qr_m; +defm MVE_VADD_qr_i16 : MVE_VADD_qr_m; +defm MVE_VADD_qr_i32 : MVE_VADD_qr_m; + +defm MVE_VSUB_qr_i8 : MVE_VSUB_qr_m; +defm MVE_VSUB_qr_i16 : MVE_VSUB_qr_m; +defm MVE_VSUB_qr_i32 : MVE_VSUB_qr_m; + +// Vector-scalar saturating add/sub +multiclass MVE_VQADDSUB_qr_m { + def "" : MVE_VADDSUB_qr; + defvar unpred_op = !if(VTI.Unsigned, unpred_op_u, unpred_op_s); + defm : MVE_vec_scalar_int_pat_m(NAME), VTI, + unpred_op, pred_int, 0, 1>; +} + +multiclass MVE_VQADD_qr_m + : MVE_VQADDSUB_qr_m<"vqadd", VTI, 0b0, saddsat, uaddsat, + int_arm_mve_qadd_predicated>; + +multiclass MVE_VQSUB_qr_m + : MVE_VQADDSUB_qr_m<"vqsub", VTI, 0b1, ssubsat, usubsat, + int_arm_mve_qsub_predicated>; + +defm MVE_VQADD_qr_s8 : MVE_VQADD_qr_m; +defm MVE_VQADD_qr_s16 : MVE_VQADD_qr_m; +defm MVE_VQADD_qr_s32 : MVE_VQADD_qr_m; +defm MVE_VQADD_qr_u8 : MVE_VQADD_qr_m; +defm MVE_VQADD_qr_u16 : MVE_VQADD_qr_m; +defm MVE_VQADD_qr_u32 : MVE_VQADD_qr_m; + +defm MVE_VQSUB_qr_s8 : MVE_VQSUB_qr_m; +defm MVE_VQSUB_qr_s16 : MVE_VQSUB_qr_m; +defm MVE_VQSUB_qr_s32 : MVE_VQSUB_qr_m; +defm MVE_VQSUB_qr_u8 : MVE_VQSUB_qr_m; +defm MVE_VQSUB_qr_u16 : MVE_VQSUB_qr_m; +defm MVE_VQSUB_qr_u32 : MVE_VQSUB_qr_m; class MVE_VQDMULL_qr pattern=[]> @@ -4566,19 +4639,34 @@ class MVE_VxADDSUB_qr; -def MVE_VHADD_qr_s16 : MVE_VxADDSUB_qr<"vhadd", "s16", 0b0, 0b01, 0b0>; -def MVE_VHADD_qr_s32 : MVE_VxADDSUB_qr<"vhadd", "s32", 0b0, 0b10, 0b0>; -def MVE_VHADD_qr_u8 : MVE_VxADDSUB_qr<"vhadd", "u8", 0b1, 0b00, 0b0>; -def MVE_VHADD_qr_u16 : MVE_VxADDSUB_qr<"vhadd", "u16", 0b1, 0b01, 0b0>; -def MVE_VHADD_qr_u32 : MVE_VxADDSUB_qr<"vhadd", "u32", 0b1, 0b10, 0b0>; +multiclass MVE_VHADDSUB_qr_m { + def "" : MVE_VxADDSUB_qr; + defm : MVE_vec_scalar_int_pat_m(NAME), + VTI, unpred_int, pred_int, 1, 1>; +} -def MVE_VHSUB_qr_s8 : MVE_VxADDSUB_qr<"vhsub", "s8", 0b0, 0b00, 0b1>; -def MVE_VHSUB_qr_s16 : MVE_VxADDSUB_qr<"vhsub", "s16", 0b0, 0b01, 0b1>; -def MVE_VHSUB_qr_s32 : MVE_VxADDSUB_qr<"vhsub", "s32", 0b0, 0b10, 0b1>; -def MVE_VHSUB_qr_u8 : MVE_VxADDSUB_qr<"vhsub", "u8", 0b1, 0b00, 0b1>; -def MVE_VHSUB_qr_u16 : MVE_VxADDSUB_qr<"vhsub", "u16", 0b1, 0b01, 0b1>; -def MVE_VHSUB_qr_u32 : MVE_VxADDSUB_qr<"vhsub", "u32", 0b1, 0b10, 0b1>; +multiclass MVE_VHADD_qr_m : + MVE_VHADDSUB_qr_m<"vhadd", VTI, 0b0, int_arm_mve_vhadd, + int_arm_mve_hadd_predicated>; + +multiclass MVE_VHSUB_qr_m : + MVE_VHADDSUB_qr_m<"vhsub", VTI, 0b1, int_arm_mve_vhsub, + int_arm_mve_hsub_predicated>; + +defm MVE_VHADD_qr_s8 : MVE_VHADD_qr_m; +defm MVE_VHADD_qr_s16 : MVE_VHADD_qr_m; +defm MVE_VHADD_qr_s32 : MVE_VHADD_qr_m; +defm MVE_VHADD_qr_u8 : MVE_VHADD_qr_m; +defm MVE_VHADD_qr_u16 : MVE_VHADD_qr_m; +defm MVE_VHADD_qr_u32 : MVE_VHADD_qr_m; + +defm MVE_VHSUB_qr_s8 : MVE_VHSUB_qr_m; +defm MVE_VHSUB_qr_s16 : MVE_VHSUB_qr_m; +defm MVE_VHSUB_qr_s32 : MVE_VHSUB_qr_m; +defm MVE_VHSUB_qr_u8 : MVE_VHSUB_qr_m; +defm MVE_VHSUB_qr_u16 : MVE_VHSUB_qr_m; +defm MVE_VHSUB_qr_u32 : MVE_VHSUB_qr_m; let Predicates = [HasMVEFloat] in { def MVE_VADD_qr_f32 : MVE_VxADDSUB_qr<"vadd", "f32", 0b0, 0b11, 0b0>; @@ -4588,6 +4676,11 @@ let Predicates = [HasMVEFloat] in { def MVE_VSUB_qr_f16 : MVE_VxADDSUB_qr<"vsub", "f16", 0b1, 0b11, 0b1>; } +defm : 
MVE_vec_scalar_fp_pat_m; +defm : MVE_vec_scalar_fp_pat_m; + class MVE_VxSHL_qr size, bit bit_7, bit bit_17, list pattern=[]> : MVE_qDest_single_rSrc { @@ -4678,9 +4771,8 @@ let Predicates = [HasMVEInt] in { (v8i16 ( MVE_VBRSR16 (v8i16 MQPR:$val1), (t2MOVi (i32 16)) ))>; } -class MVE_VMUL_qr_int size, list pattern=[]> - : MVE_qDest_rSrc { +class MVE_VMUL_qr_int size> + : MVE_qDest_rSrc { let Inst{28} = 0b0; let Inst{21-20} = size; @@ -4691,19 +4783,16 @@ class MVE_VMUL_qr_int; -def MVE_VMUL_qr_i16 : MVE_VMUL_qr_int<"vmul", "i16", 0b01>; -def MVE_VMUL_qr_i32 : MVE_VMUL_qr_int<"vmul", "i32", 0b10>; - -let Predicates = [HasMVEInt] in { - def : Pat<(v16i8 (mul (v16i8 MQPR:$val1), (v16i8 (ARMvdup GPR:$val2)))), - (v16i8 (MVE_VMUL_qr_i8 (v16i8 MQPR:$val1), (i32 GPR:$val2)))>; - def : Pat<(v8i16 (mul (v8i16 MQPR:$val1), (v8i16 (ARMvdup GPR:$val2)))), - (v8i16 (MVE_VMUL_qr_i16 (v8i16 MQPR:$val1), (i32 GPR:$val2)))>; - def : Pat<(v4i32 (mul (v4i32 MQPR:$val1), (v4i32 (ARMvdup GPR:$val2)))), - (v4i32 (MVE_VMUL_qr_i32 (v4i32 MQPR:$val1), (i32 GPR:$val2)))>; +multiclass MVE_VMUL_qr_int_m { + def "" : MVE_VMUL_qr_int<"vmul", VTI.Suffix, VTI.Size>; + defm : MVE_vec_scalar_int_pat_m(NAME), VTI, + mul, int_arm_mve_mul_predicated>; } +defm MVE_VMUL_qr_i8 : MVE_VMUL_qr_int_m; +defm MVE_VMUL_qr_i16 : MVE_VMUL_qr_int_m; +defm MVE_VMUL_qr_i32 : MVE_VMUL_qr_int_m; + class MVE_VxxMUL_qr bits_21_20, list pattern=[]> : MVE_qDest_rSrc { @@ -4716,19 +4805,37 @@ class MVE_VxxMUL_qr; -def MVE_VQDMULH_qr_s16 : MVE_VxxMUL_qr<"vqdmulh", "s16", 0b0, 0b01>; -def MVE_VQDMULH_qr_s32 : MVE_VxxMUL_qr<"vqdmulh", "s32", 0b0, 0b10>; +multiclass MVE_VxxMUL_qr_m { + def "" : MVE_VxxMUL_qr; + defm : MVE_vec_scalar_int_pat_m(NAME), VTI, + int_unpred, int_pred>; +} -def MVE_VQRDMULH_qr_s8 : MVE_VxxMUL_qr<"vqrdmulh", "s8", 0b1, 0b00>; -def MVE_VQRDMULH_qr_s16 : MVE_VxxMUL_qr<"vqrdmulh", "s16", 0b1, 0b01>; -def MVE_VQRDMULH_qr_s32 : MVE_VxxMUL_qr<"vqrdmulh", "s32", 0b1, 0b10>; +multiclass MVE_VQDMULH_qr_m : + MVE_VxxMUL_qr_m<"vqdmulh", VTI, 0b0, + int_arm_mve_vqdmulh, int_arm_mve_qdmulh_predicated>; + +multiclass MVE_VQRDMULH_qr_m : + MVE_VxxMUL_qr_m<"vqrdmulh", VTI, 0b1, + int_arm_mve_vqrdmulh, int_arm_mve_qrdmulh_predicated>; + +defm MVE_VQDMULH_qr_s8 : MVE_VQDMULH_qr_m; +defm MVE_VQDMULH_qr_s16 : MVE_VQDMULH_qr_m; +defm MVE_VQDMULH_qr_s32 : MVE_VQDMULH_qr_m; + +defm MVE_VQRDMULH_qr_s8 : MVE_VQRDMULH_qr_m; +defm MVE_VQRDMULH_qr_s16 : MVE_VQRDMULH_qr_m; +defm MVE_VQRDMULH_qr_s32 : MVE_VQRDMULH_qr_m; let Predicates = [HasMVEFloat], validForTailPredication = 1 in { def MVE_VMUL_qr_f16 : MVE_VxxMUL_qr<"vmul", "f16", 0b1, 0b11>; def MVE_VMUL_qr_f32 : MVE_VxxMUL_qr<"vmul", "f32", 0b0, 0b11>; } +defm : MVE_vec_scalar_fp_pat_m; + class MVE_VFMAMLA_qr bits_21_20, bit S, list pattern=[]> diff --git a/llvm/test/CodeGen/Thumb2/mve-intrinsics/vaddq.ll b/llvm/test/CodeGen/Thumb2/mve-intrinsics/vaddq.ll index b7becf6..03e64cc 100644 --- a/llvm/test/CodeGen/Thumb2/mve-intrinsics/vaddq.ll +++ b/llvm/test/CodeGen/Thumb2/mve-intrinsics/vaddq.ll @@ -91,3 +91,99 @@ entry: declare <8 x half> @llvm.arm.mve.add.predicated.v8f16.v8i1(<8 x half>, <8 x half>, <8 x i1>, <8 x half>) #2 +define arm_aapcs_vfpcc <4 x i32> @test_vaddq_n_u32(<4 x i32> %a, i32 %b) { +; CHECK-LABEL: test_vaddq_n_u32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vadd.i32 q0, q0, r0 +; CHECK-NEXT: bx lr +entry: + %.splatinsert = insertelement <4 x i32> undef, i32 %b, i32 0 + %.splat = shufflevector <4 x i32> %.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer + %0 = add <4 x i32> 
%.splat, %a + ret <4 x i32> %0 +} + +define arm_aapcs_vfpcc <8 x half> @test_vaddq_n_f16(<8 x half> %a, float %b.coerce) { +; CHECK-LABEL: test_vaddq_n_f16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmov r0, s4 +; CHECK-NEXT: vadd.f16 q0, q0, r0 +; CHECK-NEXT: bx lr +entry: + %0 = bitcast float %b.coerce to i32 + %tmp.0.extract.trunc = trunc i32 %0 to i16 + %1 = bitcast i16 %tmp.0.extract.trunc to half + %.splatinsert = insertelement <8 x half> undef, half %1, i32 0 + %.splat = shufflevector <8 x half> %.splatinsert, <8 x half> undef, <8 x i32> zeroinitializer + %2 = fadd <8 x half> %.splat, %a + ret <8 x half> %2 +} + +define arm_aapcs_vfpcc <16 x i8> @test_vaddq_m_n_s8(<16 x i8> %inactive, <16 x i8> %a, i8 signext %b, i16 zeroext %p) { +; CHECK-LABEL: test_vaddq_m_n_s8: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r1 +; CHECK-NEXT: vpst +; CHECK-NEXT: vaddt.i8 q0, q1, r0 +; CHECK-NEXT: bx lr +entry: + %.splatinsert = insertelement <16 x i8> undef, i8 %b, i32 0 + %.splat = shufflevector <16 x i8> %.splatinsert, <16 x i8> undef, <16 x i32> zeroinitializer + %0 = zext i16 %p to i32 + %1 = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 %0) + %2 = call <16 x i8> @llvm.arm.mve.add.predicated.v16i8.v16i1(<16 x i8> %a, <16 x i8> %.splat, <16 x i1> %1, <16 x i8> %inactive) + ret <16 x i8> %2 +} + +define arm_aapcs_vfpcc <4 x float> @test_vaddq_m_n_f32(<4 x float> %inactive, <4 x float> %a, float %b, i16 zeroext %p) { +; CHECK-LABEL: test_vaddq_m_n_f32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r0 +; CHECK-NEXT: vmov r0, s8 +; CHECK-NEXT: vpst +; CHECK-NEXT: vaddt.f32 q0, q1, r0 +; CHECK-NEXT: bx lr +entry: + %.splatinsert = insertelement <4 x float> undef, float %b, i32 0 + %.splat = shufflevector <4 x float> %.splatinsert, <4 x float> undef, <4 x i32> zeroinitializer + %0 = zext i16 %p to i32 + %1 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0) + %2 = call <4 x float> @llvm.arm.mve.add.predicated.v4f32.v4i1(<4 x float> %a, <4 x float> %.splat, <4 x i1> %1, <4 x float> %inactive) + ret <4 x float> %2 +} + +define arm_aapcs_vfpcc <8 x i16> @test_vaddq_x_n_u16(<8 x i16> %a, i16 zeroext %b, i16 zeroext %p) { +; CHECK-LABEL: test_vaddq_x_n_u16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r1 +; CHECK-NEXT: vpst +; CHECK-NEXT: vaddt.i16 q0, q0, r0 +; CHECK-NEXT: bx lr +entry: + %.splatinsert = insertelement <8 x i16> undef, i16 %b, i32 0 + %.splat = shufflevector <8 x i16> %.splatinsert, <8 x i16> undef, <8 x i32> zeroinitializer + %0 = zext i16 %p to i32 + %1 = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0) + %2 = call <8 x i16> @llvm.arm.mve.add.predicated.v8i16.v8i1(<8 x i16> %a, <8 x i16> %.splat, <8 x i1> %1, <8 x i16> undef) + ret <8 x i16> %2 +} + +define arm_aapcs_vfpcc <8 x half> @test_vaddq_x_n_f16(<8 x half> %a, float %b.coerce, i16 zeroext %p) { +; CHECK-LABEL: test_vaddq_x_n_f16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmov r1, s4 +; CHECK-NEXT: vmsr p0, r0 +; CHECK-NEXT: vpst +; CHECK-NEXT: vaddt.f16 q0, q0, r1 +; CHECK-NEXT: bx lr +entry: + %0 = bitcast float %b.coerce to i32 + %tmp.0.extract.trunc = trunc i32 %0 to i16 + %1 = bitcast i16 %tmp.0.extract.trunc to half + %.splatinsert = insertelement <8 x half> undef, half %1, i32 0 + %.splat = shufflevector <8 x half> %.splatinsert, <8 x half> undef, <8 x i32> zeroinitializer + %2 = zext i16 %p to i32 + %3 = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %2) + %4 = call <8 x half> @llvm.arm.mve.add.predicated.v8f16.v8i1(<8 x half> %a, <8 x half> %.splat, <8 x i1> %3, <8 x half> undef) + ret <8 x half> 
%4 +} diff --git a/llvm/test/CodeGen/Thumb2/mve-intrinsics/vhaddq.ll b/llvm/test/CodeGen/Thumb2/mve-intrinsics/vhaddq.ll index 9319dd1..0a3935e 100644 --- a/llvm/test/CodeGen/Thumb2/mve-intrinsics/vhaddq.ll +++ b/llvm/test/CodeGen/Thumb2/mve-intrinsics/vhaddq.ll @@ -133,3 +133,134 @@ entry: ret <4 x i32> %2 } +define arm_aapcs_vfpcc <16 x i8> @test_vhaddq_n_u8(<16 x i8> %a, i8 zeroext %b) { +; CHECK-LABEL: test_vhaddq_n_u8: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vhadd.u8 q0, q0, r0 +; CHECK-NEXT: bx lr +entry: + %.splatinsert = insertelement <16 x i8> undef, i8 %b, i32 0 + %.splat = shufflevector <16 x i8> %.splatinsert, <16 x i8> undef, <16 x i32> zeroinitializer + %0 = call <16 x i8> @llvm.arm.mve.vhadd.v16i8(<16 x i8> %a, <16 x i8> %.splat, i32 1) + ret <16 x i8> %0 +} + +define arm_aapcs_vfpcc <8 x i16> @test_vhaddq_n_s16(<8 x i16> %a, i16 signext %b) { +; CHECK-LABEL: test_vhaddq_n_s16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vhadd.s16 q0, q0, r0 +; CHECK-NEXT: bx lr +entry: + %.splatinsert = insertelement <8 x i16> undef, i16 %b, i32 0 + %.splat = shufflevector <8 x i16> %.splatinsert, <8 x i16> undef, <8 x i32> zeroinitializer + %0 = call <8 x i16> @llvm.arm.mve.vhadd.v8i16(<8 x i16> %a, <8 x i16> %.splat, i32 0) + ret <8 x i16> %0 +} + +define arm_aapcs_vfpcc <4 x i32> @test_vhaddq_n_u32(<4 x i32> %a, i32 %b) { +; CHECK-LABEL: test_vhaddq_n_u32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vhadd.u32 q0, q0, r0 +; CHECK-NEXT: bx lr +entry: + %.splatinsert = insertelement <4 x i32> undef, i32 %b, i32 0 + %.splat = shufflevector <4 x i32> %.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer + %0 = call <4 x i32> @llvm.arm.mve.vhadd.v4i32(<4 x i32> %a, <4 x i32> %.splat, i32 1) + ret <4 x i32> %0 +} + +define arm_aapcs_vfpcc <16 x i8> @test_vhaddq_m_n_s8(<16 x i8> %inactive, <16 x i8> %a, i8 signext %b, i16 zeroext %p) { +; CHECK-LABEL: test_vhaddq_m_n_s8: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r1 +; CHECK-NEXT: vpst +; CHECK-NEXT: vhaddt.s8 q0, q1, r0 +; CHECK-NEXT: bx lr +entry: + %.splatinsert = insertelement <16 x i8> undef, i8 %b, i32 0 + %.splat = shufflevector <16 x i8> %.splatinsert, <16 x i8> undef, <16 x i32> zeroinitializer + %0 = zext i16 %p to i32 + %1 = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 %0) + %2 = call <16 x i8> @llvm.arm.mve.hadd.predicated.v16i8.v16i1(<16 x i8> %a, <16 x i8> %.splat, i32 0, <16 x i1> %1, <16 x i8> %inactive) + ret <16 x i8> %2 +} + +define arm_aapcs_vfpcc <8 x i16> @test_vhaddq_m_n_u16(<8 x i16> %inactive, <8 x i16> %a, i16 zeroext %b, i16 zeroext %p) { +; CHECK-LABEL: test_vhaddq_m_n_u16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r1 +; CHECK-NEXT: vpst +; CHECK-NEXT: vhaddt.u16 q0, q1, r0 +; CHECK-NEXT: bx lr +entry: + %.splatinsert = insertelement <8 x i16> undef, i16 %b, i32 0 + %.splat = shufflevector <8 x i16> %.splatinsert, <8 x i16> undef, <8 x i32> zeroinitializer + %0 = zext i16 %p to i32 + %1 = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0) + %2 = call <8 x i16> @llvm.arm.mve.hadd.predicated.v8i16.v8i1(<8 x i16> %a, <8 x i16> %.splat, i32 1, <8 x i1> %1, <8 x i16> %inactive) + ret <8 x i16> %2 +} + +define arm_aapcs_vfpcc <4 x i32> @test_vhaddq_m_n_s32(<4 x i32> %inactive, <4 x i32> %a, i32 %b, i16 zeroext %p) { +; CHECK-LABEL: test_vhaddq_m_n_s32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r1 +; CHECK-NEXT: vpst +; CHECK-NEXT: vhaddt.s32 q0, q1, r0 +; CHECK-NEXT: bx lr +entry: + %.splatinsert = insertelement <4 x i32> undef, i32 %b, i32 0 + %.splat = shufflevector <4 x i32> 
%.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer + %0 = zext i16 %p to i32 + %1 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0) + %2 = call <4 x i32> @llvm.arm.mve.hadd.predicated.v4i32.v4i1(<4 x i32> %a, <4 x i32> %.splat, i32 0, <4 x i1> %1, <4 x i32> %inactive) + ret <4 x i32> %2 +} + +define arm_aapcs_vfpcc <16 x i8> @test_vhaddq_x_n_u8(<16 x i8> %a, i8 zeroext %b, i16 zeroext %p) { +; CHECK-LABEL: test_vhaddq_x_n_u8: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r1 +; CHECK-NEXT: vpst +; CHECK-NEXT: vhaddt.u8 q0, q0, r0 +; CHECK-NEXT: bx lr +entry: + %.splatinsert = insertelement <16 x i8> undef, i8 %b, i32 0 + %.splat = shufflevector <16 x i8> %.splatinsert, <16 x i8> undef, <16 x i32> zeroinitializer + %0 = zext i16 %p to i32 + %1 = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 %0) + %2 = call <16 x i8> @llvm.arm.mve.hadd.predicated.v16i8.v16i1(<16 x i8> %a, <16 x i8> %.splat, i32 1, <16 x i1> %1, <16 x i8> undef) + ret <16 x i8> %2 +} + +define arm_aapcs_vfpcc <8 x i16> @test_vhaddq_x_n_s16(<8 x i16> %a, i16 signext %b, i16 zeroext %p) { +; CHECK-LABEL: test_vhaddq_x_n_s16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r1 +; CHECK-NEXT: vpst +; CHECK-NEXT: vhaddt.s16 q0, q0, r0 +; CHECK-NEXT: bx lr +entry: + %.splatinsert = insertelement <8 x i16> undef, i16 %b, i32 0 + %.splat = shufflevector <8 x i16> %.splatinsert, <8 x i16> undef, <8 x i32> zeroinitializer + %0 = zext i16 %p to i32 + %1 = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0) + %2 = call <8 x i16> @llvm.arm.mve.hadd.predicated.v8i16.v8i1(<8 x i16> %a, <8 x i16> %.splat, i32 0, <8 x i1> %1, <8 x i16> undef) + ret <8 x i16> %2 +} + +define arm_aapcs_vfpcc <4 x i32> @test_vhaddq_x_n_u32(<4 x i32> %a, i32 %b, i16 zeroext %p) { +; CHECK-LABEL: test_vhaddq_x_n_u32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r1 +; CHECK-NEXT: vpst +; CHECK-NEXT: vhaddt.u32 q0, q0, r0 +; CHECK-NEXT: bx lr +entry: + %.splatinsert = insertelement <4 x i32> undef, i32 %b, i32 0 + %.splat = shufflevector <4 x i32> %.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer + %0 = zext i16 %p to i32 + %1 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0) + %2 = call <4 x i32> @llvm.arm.mve.hadd.predicated.v4i32.v4i1(<4 x i32> %a, <4 x i32> %.splat, i32 1, <4 x i1> %1, <4 x i32> undef) + ret <4 x i32> %2 +} diff --git a/llvm/test/CodeGen/Thumb2/mve-intrinsics/vhsubq.ll b/llvm/test/CodeGen/Thumb2/mve-intrinsics/vhsubq.ll index 5bc8a22..ff474cd 100644 --- a/llvm/test/CodeGen/Thumb2/mve-intrinsics/vhsubq.ll +++ b/llvm/test/CodeGen/Thumb2/mve-intrinsics/vhsubq.ll @@ -90,3 +90,135 @@ entry: declare <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32) #1 declare <4 x i32> @llvm.arm.mve.hsub.predicated.v4i32.v4i1(<4 x i32>, <4 x i32>, i32, <4 x i1>, <4 x i32>) #1 + +define arm_aapcs_vfpcc <16 x i8> @test_vhsubq_n_u8(<16 x i8> %a, i8 zeroext %b) { +; CHECK-LABEL: test_vhsubq_n_u8: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vhsub.u8 q0, q0, r0 +; CHECK-NEXT: bx lr +entry: + %.splatinsert = insertelement <16 x i8> undef, i8 %b, i32 0 + %.splat = shufflevector <16 x i8> %.splatinsert, <16 x i8> undef, <16 x i32> zeroinitializer + %0 = call <16 x i8> @llvm.arm.mve.vhsub.v16i8(<16 x i8> %a, <16 x i8> %.splat, i32 1) + ret <16 x i8> %0 +} + +define arm_aapcs_vfpcc <8 x i16> @test_vhsubq_n_s16(<8 x i16> %a, i16 signext %b) { +; CHECK-LABEL: test_vhsubq_n_s16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vhsub.s16 q0, q0, r0 +; CHECK-NEXT: bx lr +entry: + %.splatinsert = insertelement <8 x i16> undef, i16 %b, i32 0 + %.splat = shufflevector 
<8 x i16> %.splatinsert, <8 x i16> undef, <8 x i32> zeroinitializer + %0 = call <8 x i16> @llvm.arm.mve.vhsub.v8i16(<8 x i16> %a, <8 x i16> %.splat, i32 0) + ret <8 x i16> %0 +} + +define arm_aapcs_vfpcc <4 x i32> @test_vhsubq_n_u32(<4 x i32> %a, i32 %b) { +; CHECK-LABEL: test_vhsubq_n_u32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vhsub.u32 q0, q0, r0 +; CHECK-NEXT: bx lr +entry: + %.splatinsert = insertelement <4 x i32> undef, i32 %b, i32 0 + %.splat = shufflevector <4 x i32> %.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer + %0 = call <4 x i32> @llvm.arm.mve.vhsub.v4i32(<4 x i32> %a, <4 x i32> %.splat, i32 1) + ret <4 x i32> %0 +} + +define arm_aapcs_vfpcc <16 x i8> @test_vhsubq_m_n_s8(<16 x i8> %inactive, <16 x i8> %a, i8 signext %b, i16 zeroext %p) { +; CHECK-LABEL: test_vhsubq_m_n_s8: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r1 +; CHECK-NEXT: vpst +; CHECK-NEXT: vhsubt.s8 q0, q1, r0 +; CHECK-NEXT: bx lr +entry: + %.splatinsert = insertelement <16 x i8> undef, i8 %b, i32 0 + %.splat = shufflevector <16 x i8> %.splatinsert, <16 x i8> undef, <16 x i32> zeroinitializer + %0 = zext i16 %p to i32 + %1 = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 %0) + %2 = call <16 x i8> @llvm.arm.mve.hsub.predicated.v16i8.v16i1(<16 x i8> %a, <16 x i8> %.splat, i32 0, <16 x i1> %1, <16 x i8> %inactive) + ret <16 x i8> %2 +} + +define arm_aapcs_vfpcc <8 x i16> @test_vhsubq_m_n_u16(<8 x i16> %inactive, <8 x i16> %a, i16 zeroext %b, i16 zeroext %p) { +; CHECK-LABEL: test_vhsubq_m_n_u16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r1 +; CHECK-NEXT: vpst +; CHECK-NEXT: vhsubt.u16 q0, q1, r0 +; CHECK-NEXT: bx lr +entry: + %.splatinsert = insertelement <8 x i16> undef, i16 %b, i32 0 + %.splat = shufflevector <8 x i16> %.splatinsert, <8 x i16> undef, <8 x i32> zeroinitializer + %0 = zext i16 %p to i32 + %1 = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0) + %2 = call <8 x i16> @llvm.arm.mve.hsub.predicated.v8i16.v8i1(<8 x i16> %a, <8 x i16> %.splat, i32 1, <8 x i1> %1, <8 x i16> %inactive) + ret <8 x i16> %2 +} + +define arm_aapcs_vfpcc <4 x i32> @test_vhsubq_m_n_s32(<4 x i32> %inactive, <4 x i32> %a, i32 %b, i16 zeroext %p) { +; CHECK-LABEL: test_vhsubq_m_n_s32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r1 +; CHECK-NEXT: vpst +; CHECK-NEXT: vhsubt.s32 q0, q1, r0 +; CHECK-NEXT: bx lr +entry: + %.splatinsert = insertelement <4 x i32> undef, i32 %b, i32 0 + %.splat = shufflevector <4 x i32> %.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer + %0 = zext i16 %p to i32 + %1 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0) + %2 = call <4 x i32> @llvm.arm.mve.hsub.predicated.v4i32.v4i1(<4 x i32> %a, <4 x i32> %.splat, i32 0, <4 x i1> %1, <4 x i32> %inactive) + ret <4 x i32> %2 +} + +define arm_aapcs_vfpcc <16 x i8> @test_vhsubq_x_n_u8(<16 x i8> %a, i8 zeroext %b, i16 zeroext %p) { +; CHECK-LABEL: test_vhsubq_x_n_u8: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r1 +; CHECK-NEXT: vpst +; CHECK-NEXT: vhsubt.u8 q0, q0, r0 +; CHECK-NEXT: bx lr +entry: + %.splatinsert = insertelement <16 x i8> undef, i8 %b, i32 0 + %.splat = shufflevector <16 x i8> %.splatinsert, <16 x i8> undef, <16 x i32> zeroinitializer + %0 = zext i16 %p to i32 + %1 = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 %0) + %2 = call <16 x i8> @llvm.arm.mve.hsub.predicated.v16i8.v16i1(<16 x i8> %a, <16 x i8> %.splat, i32 1, <16 x i1> %1, <16 x i8> undef) + ret <16 x i8> %2 +} + +define arm_aapcs_vfpcc <8 x i16> @test_vhsubq_x_n_s16(<8 x i16> %a, i16 signext %b, i16 zeroext %p) { +; CHECK-LABEL: 
test_vhsubq_x_n_s16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r1 +; CHECK-NEXT: vpst +; CHECK-NEXT: vhsubt.s16 q0, q0, r0 +; CHECK-NEXT: bx lr +entry: + %.splatinsert = insertelement <8 x i16> undef, i16 %b, i32 0 + %.splat = shufflevector <8 x i16> %.splatinsert, <8 x i16> undef, <8 x i32> zeroinitializer + %0 = zext i16 %p to i32 + %1 = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0) + %2 = call <8 x i16> @llvm.arm.mve.hsub.predicated.v8i16.v8i1(<8 x i16> %a, <8 x i16> %.splat, i32 0, <8 x i1> %1, <8 x i16> undef) + ret <8 x i16> %2 +} + +define arm_aapcs_vfpcc <4 x i32> @test_vhsubq_x_n_u32(<4 x i32> %a, i32 %b, i16 zeroext %p) { +; CHECK-LABEL: test_vhsubq_x_n_u32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r1 +; CHECK-NEXT: vpst +; CHECK-NEXT: vhsubt.u32 q0, q0, r0 +; CHECK-NEXT: bx lr +entry: + %.splatinsert = insertelement <4 x i32> undef, i32 %b, i32 0 + %.splat = shufflevector <4 x i32> %.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer + %0 = zext i16 %p to i32 + %1 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0) + %2 = call <4 x i32> @llvm.arm.mve.hsub.predicated.v4i32.v4i1(<4 x i32> %a, <4 x i32> %.splat, i32 1, <4 x i1> %1, <4 x i32> undef) + ret <4 x i32> %2 +} diff --git a/llvm/test/CodeGen/Thumb2/mve-intrinsics/vmulq.ll b/llvm/test/CodeGen/Thumb2/mve-intrinsics/vmulq.ll index 19ed08a..a342647 100644 --- a/llvm/test/CodeGen/Thumb2/mve-intrinsics/vmulq.ll +++ b/llvm/test/CodeGen/Thumb2/mve-intrinsics/vmulq.ll @@ -169,3 +169,184 @@ entry: declare <4 x float> @llvm.arm.mve.mul.predicated.v4f32.v4i1(<4 x float>, <4 x float>, <4 x i1>, <4 x float>) #2 +define arm_aapcs_vfpcc <16 x i8> @test_vmulq_n_u8(<16 x i8> %a, i8 zeroext %b) { +; CHECK-LABEL: test_vmulq_n_u8: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmul.i8 q0, q0, r0 +; CHECK-NEXT: bx lr +entry: + %.splatinsert = insertelement <16 x i8> undef, i8 %b, i32 0 + %.splat = shufflevector <16 x i8> %.splatinsert, <16 x i8> undef, <16 x i32> zeroinitializer + %0 = mul <16 x i8> %.splat, %a + ret <16 x i8> %0 +} + +define arm_aapcs_vfpcc <8 x i16> @test_vmulq_n_s16(<8 x i16> %a, i16 signext %b) { +; CHECK-LABEL: test_vmulq_n_s16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmul.i16 q0, q0, r0 +; CHECK-NEXT: bx lr +entry: + %.splatinsert = insertelement <8 x i16> undef, i16 %b, i32 0 + %.splat = shufflevector <8 x i16> %.splatinsert, <8 x i16> undef, <8 x i32> zeroinitializer + %0 = mul <8 x i16> %.splat, %a + ret <8 x i16> %0 +} + +define arm_aapcs_vfpcc <4 x i32> @test_vmulq_n_u32(<4 x i32> %a, i32 %b) { +; CHECK-LABEL: test_vmulq_n_u32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmul.i32 q0, q0, r0 +; CHECK-NEXT: bx lr +entry: + %.splatinsert = insertelement <4 x i32> undef, i32 %b, i32 0 + %.splat = shufflevector <4 x i32> %.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer + %0 = mul <4 x i32> %.splat, %a + ret <4 x i32> %0 +} + +define arm_aapcs_vfpcc <4 x float> @test_vmulq_n_f32(<4 x float> %a, float %b) { +; CHECK-LABEL: test_vmulq_n_f32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmov r0, s4 +; CHECK-NEXT: vmul.f32 q0, q0, r0 +; CHECK-NEXT: bx lr +entry: + %.splatinsert = insertelement <4 x float> undef, float %b, i32 0 + %.splat = shufflevector <4 x float> %.splatinsert, <4 x float> undef, <4 x i32> zeroinitializer + %0 = fmul <4 x float> %.splat, %a + ret <4 x float> %0 +} + +define arm_aapcs_vfpcc <16 x i8> @test_vmulq_m_n_s8(<16 x i8> %inactive, <16 x i8> %a, i8 signext %b, i16 zeroext %p) { +; CHECK-LABEL: test_vmulq_m_n_s8: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r1 
+; CHECK-NEXT: vpst +; CHECK-NEXT: vmult.i8 q0, q1, r0 +; CHECK-NEXT: bx lr +entry: + %.splatinsert = insertelement <16 x i8> undef, i8 %b, i32 0 + %.splat = shufflevector <16 x i8> %.splatinsert, <16 x i8> undef, <16 x i32> zeroinitializer + %0 = zext i16 %p to i32 + %1 = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 %0) + %2 = call <16 x i8> @llvm.arm.mve.mul.predicated.v16i8.v16i1(<16 x i8> %a, <16 x i8> %.splat, <16 x i1> %1, <16 x i8> %inactive) + ret <16 x i8> %2 +} + +define arm_aapcs_vfpcc <8 x i16> @test_vmulq_m_n_u16(<8 x i16> %inactive, <8 x i16> %a, i16 zeroext %b, i16 zeroext %p) { +; CHECK-LABEL: test_vmulq_m_n_u16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r1 +; CHECK-NEXT: vpst +; CHECK-NEXT: vmult.i16 q0, q1, r0 +; CHECK-NEXT: bx lr +entry: + %.splatinsert = insertelement <8 x i16> undef, i16 %b, i32 0 + %.splat = shufflevector <8 x i16> %.splatinsert, <8 x i16> undef, <8 x i32> zeroinitializer + %0 = zext i16 %p to i32 + %1 = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0) + %2 = call <8 x i16> @llvm.arm.mve.mul.predicated.v8i16.v8i1(<8 x i16> %a, <8 x i16> %.splat, <8 x i1> %1, <8 x i16> %inactive) + ret <8 x i16> %2 +} + +define arm_aapcs_vfpcc <4 x i32> @test_vmulq_m_n_s32(<4 x i32> %inactive, <4 x i32> %a, i32 %b, i16 zeroext %p) { +; CHECK-LABEL: test_vmulq_m_n_s32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r1 +; CHECK-NEXT: vpst +; CHECK-NEXT: vmult.i32 q0, q1, r0 +; CHECK-NEXT: bx lr +entry: + %.splatinsert = insertelement <4 x i32> undef, i32 %b, i32 0 + %.splat = shufflevector <4 x i32> %.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer + %0 = zext i16 %p to i32 + %1 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0) + %2 = call <4 x i32> @llvm.arm.mve.mul.predicated.v4i32.v4i1(<4 x i32> %a, <4 x i32> %.splat, <4 x i1> %1, <4 x i32> %inactive) + ret <4 x i32> %2 +} + +define arm_aapcs_vfpcc <8 x half> @test_vmulq_m_n_f16(<8 x half> %inactive, <8 x half> %a, float %b.coerce, i16 zeroext %p) { +; CHECK-LABEL: test_vmulq_m_n_f16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmov r1, s8 +; CHECK-NEXT: vmsr p0, r0 +; CHECK-NEXT: vpst +; CHECK-NEXT: vmult.f16 q0, q1, r1 +; CHECK-NEXT: bx lr +entry: + %0 = bitcast float %b.coerce to i32 + %tmp.0.extract.trunc = trunc i32 %0 to i16 + %1 = bitcast i16 %tmp.0.extract.trunc to half + %.splatinsert = insertelement <8 x half> undef, half %1, i32 0 + %.splat = shufflevector <8 x half> %.splatinsert, <8 x half> undef, <8 x i32> zeroinitializer + %2 = zext i16 %p to i32 + %3 = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %2) + %4 = call <8 x half> @llvm.arm.mve.mul.predicated.v8f16.v8i1(<8 x half> %a, <8 x half> %.splat, <8 x i1> %3, <8 x half> %inactive) + ret <8 x half> %4 +} + +define arm_aapcs_vfpcc <16 x i8> @test_vmulq_x_n_u8(<16 x i8> %a, i8 zeroext %b, i16 zeroext %p) { +; CHECK-LABEL: test_vmulq_x_n_u8: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r1 +; CHECK-NEXT: vpst +; CHECK-NEXT: vmult.i8 q0, q0, r0 +; CHECK-NEXT: bx lr +entry: + %.splatinsert = insertelement <16 x i8> undef, i8 %b, i32 0 + %.splat = shufflevector <16 x i8> %.splatinsert, <16 x i8> undef, <16 x i32> zeroinitializer + %0 = zext i16 %p to i32 + %1 = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 %0) + %2 = call <16 x i8> @llvm.arm.mve.mul.predicated.v16i8.v16i1(<16 x i8> %a, <16 x i8> %.splat, <16 x i1> %1, <16 x i8> undef) + ret <16 x i8> %2 +} + +define arm_aapcs_vfpcc <8 x i16> @test_vmulq_x_n_s16(<8 x i16> %a, i16 signext %b, i16 zeroext %p) { +; CHECK-LABEL: test_vmulq_x_n_s16: +; CHECK: @ %bb.0: @ %entry 
+; CHECK-NEXT: vmsr p0, r1 +; CHECK-NEXT: vpst +; CHECK-NEXT: vmult.i16 q0, q0, r0 +; CHECK-NEXT: bx lr +entry: + %.splatinsert = insertelement <8 x i16> undef, i16 %b, i32 0 + %.splat = shufflevector <8 x i16> %.splatinsert, <8 x i16> undef, <8 x i32> zeroinitializer + %0 = zext i16 %p to i32 + %1 = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0) + %2 = call <8 x i16> @llvm.arm.mve.mul.predicated.v8i16.v8i1(<8 x i16> %a, <8 x i16> %.splat, <8 x i1> %1, <8 x i16> undef) + ret <8 x i16> %2 +} + +define arm_aapcs_vfpcc <4 x i32> @test_vmulq_x_n_u32(<4 x i32> %a, i32 %b, i16 zeroext %p) { +; CHECK-LABEL: test_vmulq_x_n_u32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r1 +; CHECK-NEXT: vpst +; CHECK-NEXT: vmult.i32 q0, q0, r0 +; CHECK-NEXT: bx lr +entry: + %.splatinsert = insertelement <4 x i32> undef, i32 %b, i32 0 + %.splat = shufflevector <4 x i32> %.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer + %0 = zext i16 %p to i32 + %1 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0) + %2 = call <4 x i32> @llvm.arm.mve.mul.predicated.v4i32.v4i1(<4 x i32> %a, <4 x i32> %.splat, <4 x i1> %1, <4 x i32> undef) + ret <4 x i32> %2 +} + +define arm_aapcs_vfpcc <4 x float> @test_vmulq_x_n_f32(<4 x float> %a, float %b, i16 zeroext %p) { +; CHECK-LABEL: test_vmulq_x_n_f32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r0 +; CHECK-NEXT: vmov r0, s4 +; CHECK-NEXT: vpst +; CHECK-NEXT: vmult.f32 q0, q0, r0 +; CHECK-NEXT: bx lr +entry: + %.splatinsert = insertelement <4 x float> undef, float %b, i32 0 + %.splat = shufflevector <4 x float> %.splatinsert, <4 x float> undef, <4 x i32> zeroinitializer + %0 = zext i16 %p to i32 + %1 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0) + %2 = call <4 x float> @llvm.arm.mve.mul.predicated.v4f32.v4i1(<4 x float> %a, <4 x float> %.splat, <4 x i1> %1, <4 x float> undef) + ret <4 x float> %2 +} diff --git a/llvm/test/CodeGen/Thumb2/mve-intrinsics/vqaddq.ll b/llvm/test/CodeGen/Thumb2/mve-intrinsics/vqaddq.ll index b05fded..8d30218 100644 --- a/llvm/test/CodeGen/Thumb2/mve-intrinsics/vqaddq.ll +++ b/llvm/test/CodeGen/Thumb2/mve-intrinsics/vqaddq.ll @@ -90,3 +90,87 @@ entry: declare <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32) #2 declare <4 x i32> @llvm.arm.mve.qadd.predicated.v4i32.v4i1(<4 x i32>, <4 x i32>, i32, <4 x i1>, <4 x i32>) #2 + +define arm_aapcs_vfpcc <16 x i8> @test_vqaddq_n_u8(<16 x i8> %a, i8 zeroext %b) { +; CHECK-LABEL: test_vqaddq_n_u8: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vqadd.u8 q0, q0, r0 +; CHECK-NEXT: bx lr +entry: + %.splatinsert = insertelement <16 x i8> undef, i8 %b, i32 0 + %.splat = shufflevector <16 x i8> %.splatinsert, <16 x i8> undef, <16 x i32> zeroinitializer + %0 = call <16 x i8> @llvm.uadd.sat.v16i8(<16 x i8> %a, <16 x i8> %.splat) + ret <16 x i8> %0 +} + +define arm_aapcs_vfpcc <8 x i16> @test_vqaddq_n_s16(<8 x i16> %a, i16 signext %b) { +; CHECK-LABEL: test_vqaddq_n_s16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vqadd.s16 q0, q0, r0 +; CHECK-NEXT: bx lr +entry: + %.splatinsert = insertelement <8 x i16> undef, i16 %b, i32 0 + %.splat = shufflevector <8 x i16> %.splatinsert, <8 x i16> undef, <8 x i32> zeroinitializer + %0 = call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> %a, <8 x i16> %.splat) + ret <8 x i16> %0 +} + +define arm_aapcs_vfpcc <4 x i32> @test_vqaddq_n_u32(<4 x i32> %a, i32 %b) { +; CHECK-LABEL: test_vqaddq_n_u32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vqadd.u32 q0, q0, r0 +; CHECK-NEXT: bx lr +entry: + %.splatinsert = insertelement <4 x i32> undef, i32 %b, i32 0 + %.splat = shufflevector <4 x 
i32> %.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer + %0 = call <4 x i32> @llvm.uadd.sat.v4i32(<4 x i32> %a, <4 x i32> %.splat) + ret <4 x i32> %0 +} + +define arm_aapcs_vfpcc <16 x i8> @test_vqaddq_m_n_s8(<16 x i8> %inactive, <16 x i8> %a, i8 signext %b, i16 zeroext %p) { +; CHECK-LABEL: test_vqaddq_m_n_s8: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r1 +; CHECK-NEXT: vpst +; CHECK-NEXT: vqaddt.s8 q0, q1, r0 +; CHECK-NEXT: bx lr +entry: + %.splatinsert = insertelement <16 x i8> undef, i8 %b, i32 0 + %.splat = shufflevector <16 x i8> %.splatinsert, <16 x i8> undef, <16 x i32> zeroinitializer + %0 = zext i16 %p to i32 + %1 = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 %0) + %2 = call <16 x i8> @llvm.arm.mve.qadd.predicated.v16i8.v16i1(<16 x i8> %a, <16 x i8> %.splat, i32 0, <16 x i1> %1, <16 x i8> %inactive) + ret <16 x i8> %2 +} + +define arm_aapcs_vfpcc <8 x i16> @test_vqaddq_m_n_u16(<8 x i16> %inactive, <8 x i16> %a, i16 zeroext %b, i16 zeroext %p) { +; CHECK-LABEL: test_vqaddq_m_n_u16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r1 +; CHECK-NEXT: vpst +; CHECK-NEXT: vqaddt.u16 q0, q1, r0 +; CHECK-NEXT: bx lr +entry: + %.splatinsert = insertelement <8 x i16> undef, i16 %b, i32 0 + %.splat = shufflevector <8 x i16> %.splatinsert, <8 x i16> undef, <8 x i32> zeroinitializer + %0 = zext i16 %p to i32 + %1 = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0) + %2 = call <8 x i16> @llvm.arm.mve.qadd.predicated.v8i16.v8i1(<8 x i16> %a, <8 x i16> %.splat, i32 1, <8 x i1> %1, <8 x i16> %inactive) + ret <8 x i16> %2 +} + +define arm_aapcs_vfpcc <4 x i32> @test_vqaddq_m_n_s32(<4 x i32> %inactive, <4 x i32> %a, i32 %b, i16 zeroext %p) { +; CHECK-LABEL: test_vqaddq_m_n_s32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r1 +; CHECK-NEXT: vpst +; CHECK-NEXT: vqaddt.s32 q0, q1, r0 +; CHECK-NEXT: bx lr +entry: + %.splatinsert = insertelement <4 x i32> undef, i32 %b, i32 0 + %.splat = shufflevector <4 x i32> %.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer + %0 = zext i16 %p to i32 + %1 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0) + %2 = call <4 x i32> @llvm.arm.mve.qadd.predicated.v4i32.v4i1(<4 x i32> %a, <4 x i32> %.splat, i32 0, <4 x i1> %1, <4 x i32> %inactive) + ret <4 x i32> %2 +} diff --git a/llvm/test/CodeGen/Thumb2/mve-intrinsics/vqdmulhq.ll b/llvm/test/CodeGen/Thumb2/mve-intrinsics/vqdmulhq.ll index 58e54a9..58cfeb3 100644 --- a/llvm/test/CodeGen/Thumb2/mve-intrinsics/vqdmulhq.ll +++ b/llvm/test/CodeGen/Thumb2/mve-intrinsics/vqdmulhq.ll @@ -90,3 +90,87 @@ entry: declare <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32) #1 declare <4 x i32> @llvm.arm.mve.qdmulh.predicated.v4i32.v4i1(<4 x i32>, <4 x i32>, <4 x i1>, <4 x i32>) #1 + +define arm_aapcs_vfpcc <16 x i8> @test_vqdmulhq_n_s8(<16 x i8> %a, i8 signext %b) { +; CHECK-LABEL: test_vqdmulhq_n_s8: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vqdmulh.s8 q0, q0, r0 +; CHECK-NEXT: bx lr +entry: + %.splatinsert = insertelement <16 x i8> undef, i8 %b, i32 0 + %.splat = shufflevector <16 x i8> %.splatinsert, <16 x i8> undef, <16 x i32> zeroinitializer + %0 = call <16 x i8> @llvm.arm.mve.vqdmulh.v16i8(<16 x i8> %a, <16 x i8> %.splat) + ret <16 x i8> %0 +} + +define arm_aapcs_vfpcc <8 x i16> @test_vqdmulhq_n_s16(<8 x i16> %a, i16 signext %b) { +; CHECK-LABEL: test_vqdmulhq_n_s16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vqdmulh.s16 q0, q0, r0 +; CHECK-NEXT: bx lr +entry: + %.splatinsert = insertelement <8 x i16> undef, i16 %b, i32 0 + %.splat = shufflevector <8 x i16> %.splatinsert, <8 x i16> undef, <8 x i32> 
zeroinitializer + %0 = call <8 x i16> @llvm.arm.mve.vqdmulh.v8i16(<8 x i16> %a, <8 x i16> %.splat) + ret <8 x i16> %0 +} + +define arm_aapcs_vfpcc <4 x i32> @test_vqdmulhq_n_s32(<4 x i32> %a, i32 %b) { +; CHECK-LABEL: test_vqdmulhq_n_s32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vqdmulh.s32 q0, q0, r0 +; CHECK-NEXT: bx lr +entry: + %.splatinsert = insertelement <4 x i32> undef, i32 %b, i32 0 + %.splat = shufflevector <4 x i32> %.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer + %0 = call <4 x i32> @llvm.arm.mve.vqdmulh.v4i32(<4 x i32> %a, <4 x i32> %.splat) + ret <4 x i32> %0 +} + +define arm_aapcs_vfpcc <16 x i8> @test_vqdmulhq_m_n_s8(<16 x i8> %inactive, <16 x i8> %a, i8 signext %b, i16 zeroext %p) { +; CHECK-LABEL: test_vqdmulhq_m_n_s8: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r1 +; CHECK-NEXT: vpst +; CHECK-NEXT: vqdmulht.s8 q0, q1, r0 +; CHECK-NEXT: bx lr +entry: + %.splatinsert = insertelement <16 x i8> undef, i8 %b, i32 0 + %.splat = shufflevector <16 x i8> %.splatinsert, <16 x i8> undef, <16 x i32> zeroinitializer + %0 = zext i16 %p to i32 + %1 = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 %0) + %2 = call <16 x i8> @llvm.arm.mve.qdmulh.predicated.v16i8.v16i1(<16 x i8> %a, <16 x i8> %.splat, <16 x i1> %1, <16 x i8> %inactive) + ret <16 x i8> %2 +} + +define arm_aapcs_vfpcc <8 x i16> @test_vqdmulhq_m_n_s16(<8 x i16> %inactive, <8 x i16> %a, i16 signext %b, i16 zeroext %p) { +; CHECK-LABEL: test_vqdmulhq_m_n_s16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r1 +; CHECK-NEXT: vpst +; CHECK-NEXT: vqdmulht.s16 q0, q1, r0 +; CHECK-NEXT: bx lr +entry: + %.splatinsert = insertelement <8 x i16> undef, i16 %b, i32 0 + %.splat = shufflevector <8 x i16> %.splatinsert, <8 x i16> undef, <8 x i32> zeroinitializer + %0 = zext i16 %p to i32 + %1 = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0) + %2 = call <8 x i16> @llvm.arm.mve.qdmulh.predicated.v8i16.v8i1(<8 x i16> %a, <8 x i16> %.splat, <8 x i1> %1, <8 x i16> %inactive) + ret <8 x i16> %2 +} + +define arm_aapcs_vfpcc <4 x i32> @test_vqdmulhq_m_n_s32(<4 x i32> %inactive, <4 x i32> %a, i32 %b, i16 zeroext %p) { +; CHECK-LABEL: test_vqdmulhq_m_n_s32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r1 +; CHECK-NEXT: vpst +; CHECK-NEXT: vqdmulht.s32 q0, q1, r0 +; CHECK-NEXT: bx lr +entry: + %.splatinsert = insertelement <4 x i32> undef, i32 %b, i32 0 + %.splat = shufflevector <4 x i32> %.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer + %0 = zext i16 %p to i32 + %1 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0) + %2 = call <4 x i32> @llvm.arm.mve.qdmulh.predicated.v4i32.v4i1(<4 x i32> %a, <4 x i32> %.splat, <4 x i1> %1, <4 x i32> %inactive) + ret <4 x i32> %2 +} diff --git a/llvm/test/CodeGen/Thumb2/mve-intrinsics/vqrdmulhq.ll b/llvm/test/CodeGen/Thumb2/mve-intrinsics/vqrdmulhq.ll index 806fb7b..8c34745 100644 --- a/llvm/test/CodeGen/Thumb2/mve-intrinsics/vqrdmulhq.ll +++ b/llvm/test/CodeGen/Thumb2/mve-intrinsics/vqrdmulhq.ll @@ -90,3 +90,87 @@ entry: declare <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32) #1 declare <4 x i32> @llvm.arm.mve.qrdmulh.predicated.v4i32.v4i1(<4 x i32>, <4 x i32>, <4 x i1>, <4 x i32>) #1 + +define arm_aapcs_vfpcc <16 x i8> @test_vqrdmulhq_n_s8(<16 x i8> %a, i8 signext %b) { +; CHECK-LABEL: test_vqrdmulhq_n_s8: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vqrdmulh.s8 q0, q0, r0 +; CHECK-NEXT: bx lr +entry: + %.splatinsert = insertelement <16 x i8> undef, i8 %b, i32 0 + %.splat = shufflevector <16 x i8> %.splatinsert, <16 x i8> undef, <16 x i32> zeroinitializer + %0 = call <16 x i8> 
@llvm.arm.mve.vqrdmulh.v16i8(<16 x i8> %a, <16 x i8> %.splat) + ret <16 x i8> %0 +} + +define arm_aapcs_vfpcc <8 x i16> @test_vqrdmulhq_n_s16(<8 x i16> %a, i16 signext %b) { +; CHECK-LABEL: test_vqrdmulhq_n_s16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vqrdmulh.s16 q0, q0, r0 +; CHECK-NEXT: bx lr +entry: + %.splatinsert = insertelement <8 x i16> undef, i16 %b, i32 0 + %.splat = shufflevector <8 x i16> %.splatinsert, <8 x i16> undef, <8 x i32> zeroinitializer + %0 = call <8 x i16> @llvm.arm.mve.vqrdmulh.v8i16(<8 x i16> %a, <8 x i16> %.splat) + ret <8 x i16> %0 +} + +define arm_aapcs_vfpcc <4 x i32> @test_vqrdmulhq_n_s32(<4 x i32> %a, i32 %b) { +; CHECK-LABEL: test_vqrdmulhq_n_s32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vqrdmulh.s32 q0, q0, r0 +; CHECK-NEXT: bx lr +entry: + %.splatinsert = insertelement <4 x i32> undef, i32 %b, i32 0 + %.splat = shufflevector <4 x i32> %.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer + %0 = call <4 x i32> @llvm.arm.mve.vqrdmulh.v4i32(<4 x i32> %a, <4 x i32> %.splat) + ret <4 x i32> %0 +} + +define arm_aapcs_vfpcc <16 x i8> @test_vqrdmulhq_m_n_s8(<16 x i8> %inactive, <16 x i8> %a, i8 signext %b, i16 zeroext %p) { +; CHECK-LABEL: test_vqrdmulhq_m_n_s8: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r1 +; CHECK-NEXT: vpst +; CHECK-NEXT: vqrdmulht.s8 q0, q1, r0 +; CHECK-NEXT: bx lr +entry: + %.splatinsert = insertelement <16 x i8> undef, i8 %b, i32 0 + %.splat = shufflevector <16 x i8> %.splatinsert, <16 x i8> undef, <16 x i32> zeroinitializer + %0 = zext i16 %p to i32 + %1 = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 %0) + %2 = call <16 x i8> @llvm.arm.mve.qrdmulh.predicated.v16i8.v16i1(<16 x i8> %a, <16 x i8> %.splat, <16 x i1> %1, <16 x i8> %inactive) + ret <16 x i8> %2 +} + +define arm_aapcs_vfpcc <8 x i16> @test_vqrdmulhq_m_n_s16(<8 x i16> %inactive, <8 x i16> %a, i16 signext %b, i16 zeroext %p) { +; CHECK-LABEL: test_vqrdmulhq_m_n_s16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r1 +; CHECK-NEXT: vpst +; CHECK-NEXT: vqrdmulht.s16 q0, q1, r0 +; CHECK-NEXT: bx lr +entry: + %.splatinsert = insertelement <8 x i16> undef, i16 %b, i32 0 + %.splat = shufflevector <8 x i16> %.splatinsert, <8 x i16> undef, <8 x i32> zeroinitializer + %0 = zext i16 %p to i32 + %1 = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0) + %2 = call <8 x i16> @llvm.arm.mve.qrdmulh.predicated.v8i16.v8i1(<8 x i16> %a, <8 x i16> %.splat, <8 x i1> %1, <8 x i16> %inactive) + ret <8 x i16> %2 +} + +define arm_aapcs_vfpcc <4 x i32> @test_vqrdmulhq_m_n_s32(<4 x i32> %inactive, <4 x i32> %a, i32 %b, i16 zeroext %p) { +; CHECK-LABEL: test_vqrdmulhq_m_n_s32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r1 +; CHECK-NEXT: vpst +; CHECK-NEXT: vqrdmulht.s32 q0, q1, r0 +; CHECK-NEXT: bx lr +entry: + %.splatinsert = insertelement <4 x i32> undef, i32 %b, i32 0 + %.splat = shufflevector <4 x i32> %.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer + %0 = zext i16 %p to i32 + %1 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0) + %2 = call <4 x i32> @llvm.arm.mve.qrdmulh.predicated.v4i32.v4i1(<4 x i32> %a, <4 x i32> %.splat, <4 x i1> %1, <4 x i32> %inactive) + ret <4 x i32> %2 +} diff --git a/llvm/test/CodeGen/Thumb2/mve-intrinsics/vqsubq.ll b/llvm/test/CodeGen/Thumb2/mve-intrinsics/vqsubq.ll index 37f251c..9ceab4c 100644 --- a/llvm/test/CodeGen/Thumb2/mve-intrinsics/vqsubq.ll +++ b/llvm/test/CodeGen/Thumb2/mve-intrinsics/vqsubq.ll @@ -90,3 +90,87 @@ entry: declare <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32) #2 declare <4 x i32> 
@llvm.arm.mve.qsub.predicated.v4i32.v4i1(<4 x i32>, <4 x i32>, i32, <4 x i1>, <4 x i32>) #2 + +define arm_aapcs_vfpcc <16 x i8> @test_vqsubq_n_u8(<16 x i8> %a, i8 zeroext %b) { +; CHECK-LABEL: test_vqsubq_n_u8: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vqsub.u8 q0, q0, r0 +; CHECK-NEXT: bx lr +entry: + %.splatinsert = insertelement <16 x i8> undef, i8 %b, i32 0 + %.splat = shufflevector <16 x i8> %.splatinsert, <16 x i8> undef, <16 x i32> zeroinitializer + %0 = call <16 x i8> @llvm.usub.sat.v16i8(<16 x i8> %a, <16 x i8> %.splat) + ret <16 x i8> %0 +} + +define arm_aapcs_vfpcc <8 x i16> @test_vqsubq_n_s16(<8 x i16> %a, i16 signext %b) { +; CHECK-LABEL: test_vqsubq_n_s16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vqsub.s16 q0, q0, r0 +; CHECK-NEXT: bx lr +entry: + %.splatinsert = insertelement <8 x i16> undef, i16 %b, i32 0 + %.splat = shufflevector <8 x i16> %.splatinsert, <8 x i16> undef, <8 x i32> zeroinitializer + %0 = call <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16> %a, <8 x i16> %.splat) + ret <8 x i16> %0 +} + +define arm_aapcs_vfpcc <4 x i32> @test_vqsubq_n_u32(<4 x i32> %a, i32 %b) { +; CHECK-LABEL: test_vqsubq_n_u32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vqsub.u32 q0, q0, r0 +; CHECK-NEXT: bx lr +entry: + %.splatinsert = insertelement <4 x i32> undef, i32 %b, i32 0 + %.splat = shufflevector <4 x i32> %.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer + %0 = call <4 x i32> @llvm.usub.sat.v4i32(<4 x i32> %a, <4 x i32> %.splat) + ret <4 x i32> %0 +} + +define arm_aapcs_vfpcc <16 x i8> @test_vqsubq_m_n_s8(<16 x i8> %inactive, <16 x i8> %a, i8 signext %b, i16 zeroext %p) { +; CHECK-LABEL: test_vqsubq_m_n_s8: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r1 +; CHECK-NEXT: vpst +; CHECK-NEXT: vqsubt.s8 q0, q1, r0 +; CHECK-NEXT: bx lr +entry: + %.splatinsert = insertelement <16 x i8> undef, i8 %b, i32 0 + %.splat = shufflevector <16 x i8> %.splatinsert, <16 x i8> undef, <16 x i32> zeroinitializer + %0 = zext i16 %p to i32 + %1 = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 %0) + %2 = call <16 x i8> @llvm.arm.mve.qsub.predicated.v16i8.v16i1(<16 x i8> %a, <16 x i8> %.splat, i32 0, <16 x i1> %1, <16 x i8> %inactive) + ret <16 x i8> %2 +} + +define arm_aapcs_vfpcc <8 x i16> @test_vqsubq_m_n_u16(<8 x i16> %inactive, <8 x i16> %a, i16 zeroext %b, i16 zeroext %p) { +; CHECK-LABEL: test_vqsubq_m_n_u16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r1 +; CHECK-NEXT: vpst +; CHECK-NEXT: vqsubt.u16 q0, q1, r0 +; CHECK-NEXT: bx lr +entry: + %.splatinsert = insertelement <8 x i16> undef, i16 %b, i32 0 + %.splat = shufflevector <8 x i16> %.splatinsert, <8 x i16> undef, <8 x i32> zeroinitializer + %0 = zext i16 %p to i32 + %1 = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0) + %2 = call <8 x i16> @llvm.arm.mve.qsub.predicated.v8i16.v8i1(<8 x i16> %a, <8 x i16> %.splat, i32 1, <8 x i1> %1, <8 x i16> %inactive) + ret <8 x i16> %2 +} + +define arm_aapcs_vfpcc <4 x i32> @test_vqsubq_m_n_s32(<4 x i32> %inactive, <4 x i32> %a, i32 %b, i16 zeroext %p) { +; CHECK-LABEL: test_vqsubq_m_n_s32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r1 +; CHECK-NEXT: vpst +; CHECK-NEXT: vqsubt.s32 q0, q1, r0 +; CHECK-NEXT: bx lr +entry: + %.splatinsert = insertelement <4 x i32> undef, i32 %b, i32 0 + %.splat = shufflevector <4 x i32> %.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer + %0 = zext i16 %p to i32 + %1 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0) + %2 = call <4 x i32> @llvm.arm.mve.qsub.predicated.v4i32.v4i1(<4 x i32> %a, <4 x i32> %.splat, i32 0, <4 x i1> %1, <4 x 
i32> %inactive) + ret <4 x i32> %2 +} diff --git a/llvm/test/CodeGen/Thumb2/mve-intrinsics/vsubq.ll b/llvm/test/CodeGen/Thumb2/mve-intrinsics/vsubq.ll index 2959e61..243ff40 100644 --- a/llvm/test/CodeGen/Thumb2/mve-intrinsics/vsubq.ll +++ b/llvm/test/CodeGen/Thumb2/mve-intrinsics/vsubq.ll @@ -91,3 +91,99 @@ entry: declare <8 x half> @llvm.arm.mve.sub.predicated.v8f16.v8i1(<8 x half>, <8 x half>, <8 x i1>, <8 x half>) #2 +define arm_aapcs_vfpcc <4 x i32> @test_vsubq_n_u32(<4 x i32> %a, i32 %b) { +; CHECK-LABEL: test_vsubq_n_u32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vsub.i32 q0, q0, r0 +; CHECK-NEXT: bx lr +entry: + %.splatinsert = insertelement <4 x i32> undef, i32 %b, i32 0 + %.splat = shufflevector <4 x i32> %.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer + %0 = sub <4 x i32> %a, %.splat + ret <4 x i32> %0 +} + +define arm_aapcs_vfpcc <8 x half> @test_vsubq_n_f16(<8 x half> %a, float %b.coerce) { +; CHECK-LABEL: test_vsubq_n_f16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmov r0, s4 +; CHECK-NEXT: vsub.f16 q0, q0, r0 +; CHECK-NEXT: bx lr +entry: + %0 = bitcast float %b.coerce to i32 + %tmp.0.extract.trunc = trunc i32 %0 to i16 + %1 = bitcast i16 %tmp.0.extract.trunc to half + %.splatinsert = insertelement <8 x half> undef, half %1, i32 0 + %.splat = shufflevector <8 x half> %.splatinsert, <8 x half> undef, <8 x i32> zeroinitializer + %2 = fsub <8 x half> %a, %.splat + ret <8 x half> %2 +} + +define arm_aapcs_vfpcc <16 x i8> @test_vsubq_m_n_s8(<16 x i8> %inactive, <16 x i8> %a, i8 signext %b, i16 zeroext %p) { +; CHECK-LABEL: test_vsubq_m_n_s8: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r1 +; CHECK-NEXT: vpst +; CHECK-NEXT: vsubt.i8 q0, q1, r0 +; CHECK-NEXT: bx lr +entry: + %.splatinsert = insertelement <16 x i8> undef, i8 %b, i32 0 + %.splat = shufflevector <16 x i8> %.splatinsert, <16 x i8> undef, <16 x i32> zeroinitializer + %0 = zext i16 %p to i32 + %1 = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 %0) + %2 = call <16 x i8> @llvm.arm.mve.sub.predicated.v16i8.v16i1(<16 x i8> %a, <16 x i8> %.splat, <16 x i1> %1, <16 x i8> %inactive) + ret <16 x i8> %2 +} + +define arm_aapcs_vfpcc <4 x float> @test_vsubq_m_n_f32(<4 x float> %inactive, <4 x float> %a, float %b, i16 zeroext %p) { +; CHECK-LABEL: test_vsubq_m_n_f32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r0 +; CHECK-NEXT: vmov r0, s8 +; CHECK-NEXT: vpst +; CHECK-NEXT: vsubt.f32 q0, q1, r0 +; CHECK-NEXT: bx lr +entry: + %.splatinsert = insertelement <4 x float> undef, float %b, i32 0 + %.splat = shufflevector <4 x float> %.splatinsert, <4 x float> undef, <4 x i32> zeroinitializer + %0 = zext i16 %p to i32 + %1 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0) + %2 = call <4 x float> @llvm.arm.mve.sub.predicated.v4f32.v4i1(<4 x float> %a, <4 x float> %.splat, <4 x i1> %1, <4 x float> %inactive) + ret <4 x float> %2 +} + +define arm_aapcs_vfpcc <8 x i16> @test_vsubq_x_n_u16(<8 x i16> %a, i16 zeroext %b, i16 zeroext %p) { +; CHECK-LABEL: test_vsubq_x_n_u16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r1 +; CHECK-NEXT: vpst +; CHECK-NEXT: vsubt.i16 q0, q0, r0 +; CHECK-NEXT: bx lr +entry: + %.splatinsert = insertelement <8 x i16> undef, i16 %b, i32 0 + %.splat = shufflevector <8 x i16> %.splatinsert, <8 x i16> undef, <8 x i32> zeroinitializer + %0 = zext i16 %p to i32 + %1 = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0) + %2 = call <8 x i16> @llvm.arm.mve.sub.predicated.v8i16.v8i1(<8 x i16> %a, <8 x i16> %.splat, <8 x i1> %1, <8 x i16> undef) + ret <8 x i16> %2 +} + +define arm_aapcs_vfpcc 
<8 x half> @test_vsubq_x_n_f16(<8 x half> %a, float %b.coerce, i16 zeroext %p) { +; CHECK-LABEL: test_vsubq_x_n_f16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmov r1, s4 +; CHECK-NEXT: vmsr p0, r0 +; CHECK-NEXT: vpst +; CHECK-NEXT: vsubt.f16 q0, q0, r1 +; CHECK-NEXT: bx lr +entry: + %0 = bitcast float %b.coerce to i32 + %tmp.0.extract.trunc = trunc i32 %0 to i16 + %1 = bitcast i16 %tmp.0.extract.trunc to half + %.splatinsert = insertelement <8 x half> undef, half %1, i32 0 + %.splat = shufflevector <8 x half> %.splatinsert, <8 x half> undef, <8 x i32> zeroinitializer + %2 = zext i16 %p to i32 + %3 = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %2) + %4 = call <8 x half> @llvm.arm.mve.sub.predicated.v8f16.v8i1(<8 x half> %a, <8 x half> %.splat, <8 x i1> %3, <8 x half> undef) + ret <8 x half> %4 +} -- 2.7.4
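For reference, below is a minimal C sketch of how the vector-scalar forms added by this patch are called from user code. It relies only on intrinsics exercised by the tests above (vaddq_n_u32, vqaddq_m_n_s8, and the polymorphic vaddq spelling); the function names are illustrative and the snippet itself is not part of the patch.

// Illustrative sketch, not part of the patch: exercising the new
// vector-scalar (splat) intrinsics. Assumes a compiler with MVE support
// and <arm_mve.h>, built like the tests above (thumbv8.1m.main, +mve.fp,
// hard float ABI).
#include <arm_mve.h>

// Named form: splat the scalar b and add it to every lane of a.
uint32x4_t add_scalar_named(uint32x4_t a, uint32_t b)
{
    return vaddq_n_u32(a, b);
}

// Polymorphic form: because b is a scalar, vaddq resolves to the _n variant.
uint32x4_t add_scalar_poly(uint32x4_t a, uint32_t b)
{
    return vaddq(a, b);
}

// Predicated form: lanes whose predicate bit is clear take their value
// from 'inactive' instead of the saturating add result.
int8x16_t qadd_scalar_masked(int8x16_t inactive, int8x16_t a, int8_t b,
                             mve_pred16_t p)
{
    return vqaddq_m_n_s8(inactive, a, b, p);
}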