[ARM,MVE] Add intrinsics for contiguous load/stores.

author Simon Tatham <simon.tatham@arm.com>

Mon, 11 Nov 2019 17:02:39 +0000 (17:02 +0000)

committer Simon Tatham <simon.tatham@arm.com>

Wed, 13 Nov 2019 12:47:00 +0000 (12:47 +0000)
author Simon Tatham <simon.tatham@arm.com>
Mon, 11 Nov 2019 17:02:39 +0000 (17:02 +0000)
committer Simon Tatham <simon.tatham@arm.com>
Wed, 13 Nov 2019 12:47:00 +0000 (12:47 +0000)
diff --git a/clang/include/clang/Basic/arm_mve.td b/clang/include/clang/Basic/arm_mve.td

index 6e0e8ce..d2f877d 100644 (file)
--- a/clang/include/clang/Basic/arm_mve.td
+++ b/clang/include/clang/Basic/arm_mve.td
@@ -72,6 +72,124 @@ def vcvt#half#q_m_f16: Intrinsic<
  
  } // loop over half = "b", "t"
  
+multiclass contiguous_load<string mnemonic, PrimitiveType memtype,
+                           list<Type> same_size, list<Type> wider> {
+  // Intrinsics named with explicit memory and element sizes that match:
+  // vldrbq_?8, vldrhq_?16, vldrwq_?32.
+  let params = same_size, pnt = PNT_None in {
+    def: Intrinsic<Vector, (args CPtr<CopyKind<same_size[0], Scalar>>:$addr),
+                   (load (address (CPtr<Vector> $addr), !srl(memtype.size,3)))>,
+         NameOverride<mnemonic>;
+    def: Intrinsic<Vector, (args CPtr<CopyKind<same_size[0], Scalar>>:$addr,
+                                 Predicate:$pred),
+                   (IRIntBase<"masked_load", [Vector, CPtr<Vector>]>
+                        (CPtr<Vector> $addr), !srl(memtype.size,3),
+                        $pred, (zeroinit Vector))>,
+         NameOverride<mnemonic # "_z">;
+  }
+
+  // Synonyms for the above, with the generic name vld1q that just means
+  // 'memory and element sizes match', and allows convenient polymorphism with
+  // the memory and element types covariant.
+  let params = same_size in {
+    def: Intrinsic<Vector, (args CPtr<CopyKind<same_size[0], Scalar>>:$addr),
+                   (load (address (CPtr<Vector> $addr), !srl(memtype.size,3)))>,
+         NameOverride<"vld1q">;
+    def: Intrinsic<Vector, (args CPtr<CopyKind<same_size[0], Scalar>>:$addr,
+                                 Predicate:$pred),
+                   (IRIntBase<"masked_load", [Vector, CPtr<Vector>]>
+                        (CPtr<Vector> $addr), !srl(memtype.size,3),
+                        $pred, (zeroinit Vector))>,
+         NameOverride<"vld1q_z">;
+  }
+
+  // Intrinsics with the memory size narrower than the vector element, so that
+  // they load less than 128 bits of memory and sign/zero extend each loaded
+  // value into a wider vector lane.
+  let params = wider, pnt = PNT_None in {
+    def: Intrinsic<Vector, (args CPtr<CopyKind<same_size[0], Scalar>>:$addr),
+                   (extend (load (address (CPtr<NarrowedVecOf<memtype,Vector>>
+                                           $addr), !srl(memtype.size,3))),
+                           Vector, (unsignedflag Scalar))>,
+         NameOverride<mnemonic>;
+    def: Intrinsic<Vector, (args CPtr<CopyKind<same_size[0], Scalar>>:$addr,
+                                 Predicate:$pred),
+                   (extend (IRIntBase<"masked_load",
+                                      [NarrowedVecOf<memtype,Vector>,
+                                      CPtr<NarrowedVecOf<memtype,Vector>>]>
+                                (CPtr<NarrowedVecOf<memtype,Vector>> $addr),
+                                !srl(memtype.size,3), $pred,
+                                (zeroinit NarrowedVecOf<memtype,Vector>)),
+                           Vector, (unsignedflag Scalar))>,
+         NameOverride<mnemonic # "_z">;
+  }
+}
+
+defm: contiguous_load<"vldrbq", u8, T.All8, !listconcat(T.Int16, T.Int32)>;
+defm: contiguous_load<"vldrhq", u16, T.All16, T.Int32>;
+defm: contiguous_load<"vldrwq", u32, T.All32, []>;
+
+multiclass contiguous_store<string mnemonic, PrimitiveType memtype,
+                           list<Type> same_size, list<Type> wider> {
+  // Intrinsics named with explicit memory and element sizes that match:
+  // vstrbq_?8, vstrhq_?16, vstrwq_?32.
+  let params = same_size in {
+    def: Intrinsic<Void, (args Ptr<CopyKind<same_size[0], Scalar>>:$addr,
+                               Vector:$value),
+                   (store $value,
+                          (address (Ptr<Vector> $addr), !srl(memtype.size,3)))>,
+         NameOverride<mnemonic>;
+    def: Intrinsic<Void, (args Ptr<CopyKind<same_size[0], Scalar>>:$addr,
+                               Vector:$value, Predicate:$pred),
+                   (IRIntBase<"masked_store", [Vector, Ptr<Vector>]>
+                        $value, (Ptr<Vector> $addr),
+                        !srl(memtype.size,3), $pred)>,
+         NameOverride<mnemonic # "_p">;
+  }
+
+  // Synonyms for the above, with the generic name vst1q that just means
+  // 'memory and element sizes match', and allows convenient polymorphism with
+  // the memory and element types covariant.
+  let params = same_size in {
+    def: Intrinsic<Void, (args Ptr<CopyKind<same_size[0], Scalar>>:$addr,
+                               Vector:$value),
+                   (store $value,
+                          (address (Ptr<Vector> $addr), !srl(memtype.size,3)))>,
+         NameOverride<"vst1q">;
+    def: Intrinsic<Void, (args Ptr<CopyKind<same_size[0], Scalar>>:$addr,
+                               Vector:$value, Predicate:$pred),
+                   (IRIntBase<"masked_store", [Vector, Ptr<Vector>]>
+                        $value, (Ptr<Vector> $addr),
+                        !srl(memtype.size,3), $pred)>,
+         NameOverride<"vst1q_p">;
+  }
+
+  // Intrinsics with the memory size narrower than the vector element, so that
+  // they store less than 128 bits of memory, truncating each vector lane into
+  // a narrower value to store.
+  let params = wider in {
+    def: Intrinsic<Void, (args Ptr<CopyKind<same_size[0], Scalar>>:$addr,
+                               Vector:$value),
+                   (store (trunc $value, NarrowedVecOf<memtype,Vector>),
+                          (address (Ptr<NarrowedVecOf<memtype,Vector>> $addr),
+                                   !srl(memtype.size,3)))>,
+         NameOverride<mnemonic>;
+    def: Intrinsic<Void, (args Ptr<CopyKind<same_size[0], Scalar>>:$addr,
+                               Vector:$value, Predicate:$pred),
+                   (IRIntBase<"masked_store",
+                              [NarrowedVecOf<memtype,Vector>,
+                               Ptr<NarrowedVecOf<memtype,Vector>>]>
+                        (trunc $value, NarrowedVecOf<memtype,Vector>),
+                        (Ptr<NarrowedVecOf<memtype,Vector>> $addr),
+                        !srl(memtype.size,3), $pred)>,
+         NameOverride<mnemonic # "_p">;
+  }
+}
+
+defm: contiguous_store<"vstrbq", u8, T.All8, !listconcat(T.Int16, T.Int32)>;
+defm: contiguous_store<"vstrhq", u16, T.All16, T.Int32>;
+defm: contiguous_store<"vstrwq", u32, T.All32, []>;
+
  multiclass gather_base<list<Type> types, int size> {
    let params = types, pnt = PNT_None in {
      def _gather_base: Intrinsic<
diff --git a/clang/include/clang/Basic/arm_mve_defs.td b/clang/include/clang/Basic/arm_mve_defs.td

index 3d9333f..da6928f 100644 (file)
--- a/clang/include/clang/Basic/arm_mve_defs.td
+++ b/clang/include/clang/Basic/arm_mve_defs.td
@@ -28,13 +28,31 @@ def args;
  
  // -----------------------------------------------------------------------------
  // Family of nodes for use in the codegen dag for an intrinsic, corresponding
-// roughly to operations in LLVM IR. More precisely, they correspond to calls
-// to methods of llvm::IRBuilder.
-class IRBuilder<string func_> {
-  string func = func_;          // the method name
+// to function calls that return LLVM IR nodes.
+class IRBuilderBase {
+  // The prefix of the function call, including an open parenthesis.
+  string prefix;
+
+  // Any parameters that have types that have to be treated specially by the
+  // Tablegen back end. Generally these will be types other than llvm::Value *,
+  // although not all other types need special treatment (e.g. llvm::Type *).
    list<int> address_params = []; // indices of parameters with type Address
    list<int> int_constant_params = []; // indices of plain integer parameters
  }
+class IRBuilder<string func> : IRBuilderBase {
+  // The usual case: a method called on the code gen function's instance of
+  // llvm::IRBuilder.
+  let prefix = "Builder." # func # "(";
+}
+class IRFunction<string func> : IRBuilderBase {
+  // Some other function that doesn't use the IRBuilder at all.
+  let prefix = func # "(";
+}
+class CGHelperFn<string func> : IRBuilderBase {
+  // A helper function defined in CGBuiltin.cpp, which takes the IRBuilder as
+  // an argument.
+  let prefix = func # "(Builder, ";
+}
  def add: IRBuilder<"CreateAdd">;
  def or: IRBuilder<"CreateOr">;
  def and: IRBuilder<"CreateAnd">;
@@ -46,12 +64,19 @@ def fsub: IRBuilder<"CreateFSub">;
  def load: IRBuilder<"CreateLoad"> { let address_params = [0]; }
  def store: IRBuilder<"CreateStore"> { let address_params = [1]; }
  def xval: IRBuilder<"CreateExtractValue"> { let int_constant_params = [1]; }
+def trunc: IRBuilder<"CreateTrunc">;
+def extend: CGHelperFn<"SignOrZeroExtend"> { let int_constant_params = [2]; }
+def zeroinit: IRFunction<"llvm::Constant::getNullValue">;
+
+// A node that makes an Address out of a pointer-typed Value, by
+// providing an alignment as the second argument.
+def address;
  
  // Another node class you can use in the codegen dag. This one corresponds to
  // an IR intrinsic function, which has to be specialized to a particular list
  // of types.
-class IRInt<string name_, list<Type> params_ = [], bit appendKind_ = 0> {
-  string intname = name_;       // base name of the intrinsic, minus "arm_mve_"
+class IRIntBase<string name_, list<Type> params_ = [], bit appendKind_ = 0> {
+  string intname = name_;       // base name of the intrinsic
    list<Type> params = params_;  // list of parameter types
  
    // If this flag is set, then the IR intrinsic name will get a suffix _s, _u
@@ -67,6 +92,11 @@ class IRInt<string name_, list<Type> params_ = [], bit appendKind_ = 0> {
    bit appendKind = appendKind_;
  }
  
+// Mostly we'll be using @llvm.arm.mve.* intrinsics, so here's a trivial
+// subclass that puts on that prefix.
+class IRInt<string name, list<Type> params = [], bit appendKind = 0>
+      : IRIntBase<"arm_mve_" # name, params, appendKind>;
+
  // The 'seq' node in a codegen dag specifies a set of IR operations to be
  // performed in order. It has the special ability to define extra variable
  // names, on top of the ones that refer to the intrinsic's parameters. For
@@ -151,6 +181,14 @@ def sint: PrimitiveType<"s", 32> { let nameOverride = "int"; }
  // is.
  class VecOf<Type t>: ComplexType<(CTO_Vec t)>;
  
+// NarrowedVecOf<t,v> expects t to be a scalar type, and v to be a vector
+// type. It returns a vector type whose element type is t, and whose lane
+// count is the same as the lane count of v. (Used as an intermediate value
+// type in the IR representation of a widening load: you load a vector of
+// small things out of memory, and then zext/sext them into a full 128-bit
+// output vector.)
+class NarrowedVecOf<Type t, Type v>: ComplexType<(CTO_Vec t, v)>;
+
  // PredOf expects t to be a scalar, and expands to a predicate vector which
  // (logically speaking) has the same number of lanes as VecOf<t> would.
  class PredOf<Type t>: ComplexType<(CTO_Pred t)>;
diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp

index 0a16636..516add2 100644 (file)
--- a/clang/lib/CodeGen/CGBuiltin.cpp
+++ b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -6788,6 +6788,13 @@ Value *CodeGenFunction::EmitARMBuiltinExpr(unsigned BuiltinID,
    }
  }
  
+static llvm::Value *SignOrZeroExtend(CGBuilderTy &Builder, llvm::Value *V,
+                                     llvm::Type *T, bool Unsigned) {
+  // Helper function called by Tablegen-constructed ARM MVE builtin codegen,
+  // which finds it convenient to specify signed/unsigned as a boolean flag.
+  return Unsigned ? Builder.CreateZExt(V, T) : Builder.CreateSExt(V, T);
+}
+
  Value *CodeGenFunction::EmitARMMVEBuiltinExpr(unsigned BuiltinID,
                                                const CallExpr *E,
                                                ReturnValueSlot ReturnValue,
diff --git a/clang/test/CodeGen/arm-mve-intrinsics/load-store.c b/clang/test/CodeGen/arm-mve-intrinsics/load-store.c

new file mode 100644 (file)

index 0000000..5cbf6a3
--- /dev/null
+++ b/clang/test/CodeGen/arm-mve-intrinsics/load-store.c
@@ -0,0 +1,1325 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
+// RUN: %clang_cc1 -triple thumbv8.1m.main-arm-none-eabi -target-feature +mve.fp -mfloat-abi hard -O0 -disable-O0-optnone -S -emit-llvm -o - %s | opt -S -mem2reg | FileCheck %s
+// RUN: %clang_cc1 -triple thumbv8.1m.main-arm-none-eabi -target-feature +mve.fp -mfloat-abi hard -O0 -disable-O0-optnone -DPOLYMORPHIC -S -emit-llvm -o - %s | opt -S -mem2reg | FileCheck %s
+
+#include <arm_mve.h>
+
+// CHECK-LABEL: @test_vld1q_f16(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast half* [[BASE:%.*]] to <8 x half>*
+// CHECK-NEXT:    [[TMP1:%.*]] = load <8 x half>, <8 x half>* [[TMP0]], align 2
+// CHECK-NEXT:    ret <8 x half> [[TMP1]]
+//
+float16x8_t test_vld1q_f16(const float16_t *base)
+{
+#ifdef POLYMORPHIC
+    return vld1q(base);
+#else /* POLYMORPHIC */
+    return vld1q_f16(base);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vld1q_f32(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast float* [[BASE:%.*]] to <4 x float>*
+// CHECK-NEXT:    [[TMP1:%.*]] = load <4 x float>, <4 x float>* [[TMP0]], align 4
+// CHECK-NEXT:    ret <4 x float> [[TMP1]]
+//
+float32x4_t test_vld1q_f32(const float32_t *base)
+{
+#ifdef POLYMORPHIC
+    return vld1q(base);
+#else /* POLYMORPHIC */
+    return vld1q_f32(base);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vld1q_s8(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast i8* [[BASE:%.*]] to <16 x i8>*
+// CHECK-NEXT:    [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* [[TMP0]], align 1
+// CHECK-NEXT:    ret <16 x i8> [[TMP1]]
+//
+int8x16_t test_vld1q_s8(const int8_t *base)
+{
+#ifdef POLYMORPHIC
+    return vld1q(base);
+#else /* POLYMORPHIC */
+    return vld1q_s8(base);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vld1q_s16(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast i16* [[BASE:%.*]] to <8 x i16>*
+// CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i16>, <8 x i16>* [[TMP0]], align 2
+// CHECK-NEXT:    ret <8 x i16> [[TMP1]]
+//
+int16x8_t test_vld1q_s16(const int16_t *base)
+{
+#ifdef POLYMORPHIC
+    return vld1q(base);
+#else /* POLYMORPHIC */
+    return vld1q_s16(base);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vld1q_s32(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast i32* [[BASE:%.*]] to <4 x i32>*
+// CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4
+// CHECK-NEXT:    ret <4 x i32> [[TMP1]]
+//
+int32x4_t test_vld1q_s32(const int32_t *base)
+{
+#ifdef POLYMORPHIC
+    return vld1q(base);
+#else /* POLYMORPHIC */
+    return vld1q_s32(base);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vld1q_u8(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast i8* [[BASE:%.*]] to <16 x i8>*
+// CHECK-NEXT:    [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* [[TMP0]], align 1
+// CHECK-NEXT:    ret <16 x i8> [[TMP1]]
+//
+uint8x16_t test_vld1q_u8(const uint8_t *base)
+{
+#ifdef POLYMORPHIC
+    return vld1q(base);
+#else /* POLYMORPHIC */
+    return vld1q_u8(base);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vld1q_u16(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast i16* [[BASE:%.*]] to <8 x i16>*
+// CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i16>, <8 x i16>* [[TMP0]], align 2
+// CHECK-NEXT:    ret <8 x i16> [[TMP1]]
+//
+uint16x8_t test_vld1q_u16(const uint16_t *base)
+{
+#ifdef POLYMORPHIC
+    return vld1q(base);
+#else /* POLYMORPHIC */
+    return vld1q_u16(base);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vld1q_u32(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast i32* [[BASE:%.*]] to <4 x i32>*
+// CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4
+// CHECK-NEXT:    ret <4 x i32> [[TMP1]]
+//
+uint32x4_t test_vld1q_u32(const uint32_t *base)
+{
+#ifdef POLYMORPHIC
+    return vld1q(base);
+#else /* POLYMORPHIC */
+    return vld1q_u32(base);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vld1q_z_f16(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast half* [[BASE:%.*]] to <8 x half>*
+// CHECK-NEXT:    [[TMP1:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT:    [[TMP2:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP1]])
+// CHECK-NEXT:    [[TMP3:%.*]] = call <8 x half> @llvm.masked.load.v8f16.p0v8f16(<8 x half>* [[TMP0]], i32 2, <8 x i1> [[TMP2]], <8 x half> zeroinitializer)
+// CHECK-NEXT:    ret <8 x half> [[TMP3]]
+//
+float16x8_t test_vld1q_z_f16(const float16_t *base, mve_pred16_t p)
+{
+#ifdef POLYMORPHIC
+    return vld1q_z(base, p);
+#else /* POLYMORPHIC */
+    return vld1q_z_f16(base, p);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vld1q_z_f32(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast float* [[BASE:%.*]] to <4 x float>*
+// CHECK-NEXT:    [[TMP1:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT:    [[TMP2:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP1]])
+// CHECK-NEXT:    [[TMP3:%.*]] = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* [[TMP0]], i32 4, <4 x i1> [[TMP2]], <4 x float> zeroinitializer)
+// CHECK-NEXT:    ret <4 x float> [[TMP3]]
+//
+float32x4_t test_vld1q_z_f32(const float32_t *base, mve_pred16_t p)
+{
+#ifdef POLYMORPHIC
+    return vld1q_z(base, p);
+#else /* POLYMORPHIC */
+    return vld1q_z_f32(base, p);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vld1q_z_s8(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast i8* [[BASE:%.*]] to <16 x i8>*
+// CHECK-NEXT:    [[TMP1:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT:    [[TMP2:%.*]] = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 [[TMP1]])
+// CHECK-NEXT:    [[TMP3:%.*]] = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* [[TMP0]], i32 1, <16 x i1> [[TMP2]], <16 x i8> zeroinitializer)
+// CHECK-NEXT:    ret <16 x i8> [[TMP3]]
+//
+int8x16_t test_vld1q_z_s8(const int8_t *base, mve_pred16_t p)
+{
+#ifdef POLYMORPHIC
+    return vld1q_z(base, p);
+#else /* POLYMORPHIC */
+    return vld1q_z_s8(base, p);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vld1q_z_s16(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast i16* [[BASE:%.*]] to <8 x i16>*
+// CHECK-NEXT:    [[TMP1:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT:    [[TMP2:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP1]])
+// CHECK-NEXT:    [[TMP3:%.*]] = call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* [[TMP0]], i32 2, <8 x i1> [[TMP2]], <8 x i16> zeroinitializer)
+// CHECK-NEXT:    ret <8 x i16> [[TMP3]]
+//
+int16x8_t test_vld1q_z_s16(const int16_t *base, mve_pred16_t p)
+{
+#ifdef POLYMORPHIC
+    return vld1q_z(base, p);
+#else /* POLYMORPHIC */
+    return vld1q_z_s16(base, p);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vld1q_z_s32(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast i32* [[BASE:%.*]] to <4 x i32>*
+// CHECK-NEXT:    [[TMP1:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT:    [[TMP2:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP1]])
+// CHECK-NEXT:    [[TMP3:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP0]], i32 4, <4 x i1> [[TMP2]], <4 x i32> zeroinitializer)
+// CHECK-NEXT:    ret <4 x i32> [[TMP3]]
+//
+int32x4_t test_vld1q_z_s32(const int32_t *base, mve_pred16_t p)
+{
+#ifdef POLYMORPHIC
+    return vld1q_z(base, p);
+#else /* POLYMORPHIC */
+    return vld1q_z_s32(base, p);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vld1q_z_u8(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast i8* [[BASE:%.*]] to <16 x i8>*
+// CHECK-NEXT:    [[TMP1:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT:    [[TMP2:%.*]] = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 [[TMP1]])
+// CHECK-NEXT:    [[TMP3:%.*]] = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* [[TMP0]], i32 1, <16 x i1> [[TMP2]], <16 x i8> zeroinitializer)
+// CHECK-NEXT:    ret <16 x i8> [[TMP3]]
+//
+uint8x16_t test_vld1q_z_u8(const uint8_t *base, mve_pred16_t p)
+{
+#ifdef POLYMORPHIC
+    return vld1q_z(base, p);
+#else /* POLYMORPHIC */
+    return vld1q_z_u8(base, p);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vld1q_z_u16(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast i16* [[BASE:%.*]] to <8 x i16>*
+// CHECK-NEXT:    [[TMP1:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT:    [[TMP2:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP1]])
+// CHECK-NEXT:    [[TMP3:%.*]] = call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* [[TMP0]], i32 2, <8 x i1> [[TMP2]], <8 x i16> zeroinitializer)
+// CHECK-NEXT:    ret <8 x i16> [[TMP3]]
+//
+uint16x8_t test_vld1q_z_u16(const uint16_t *base, mve_pred16_t p)
+{
+#ifdef POLYMORPHIC
+    return vld1q_z(base, p);
+#else /* POLYMORPHIC */
+    return vld1q_z_u16(base, p);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vld1q_z_u32(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast i32* [[BASE:%.*]] to <4 x i32>*
+// CHECK-NEXT:    [[TMP1:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT:    [[TMP2:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP1]])
+// CHECK-NEXT:    [[TMP3:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP0]], i32 4, <4 x i1> [[TMP2]], <4 x i32> zeroinitializer)
+// CHECK-NEXT:    ret <4 x i32> [[TMP3]]
+//
+uint32x4_t test_vld1q_z_u32(const uint32_t *base, mve_pred16_t p)
+{
+#ifdef POLYMORPHIC
+    return vld1q_z(base, p);
+#else /* POLYMORPHIC */
+    return vld1q_z_u32(base, p);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vldrbq_s8(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast i8* [[BASE:%.*]] to <16 x i8>*
+// CHECK-NEXT:    [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* [[TMP0]], align 1
+// CHECK-NEXT:    ret <16 x i8> [[TMP1]]
+//
+int8x16_t test_vldrbq_s8(const int8_t *base)
+{
+    return vldrbq_s8(base);
+}
+
+// CHECK-LABEL: @test_vldrbq_s16(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast i8* [[BASE:%.*]] to <8 x i8>*
+// CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i8>, <8 x i8>* [[TMP0]], align 1
+// CHECK-NEXT:    [[TMP2:%.*]] = sext <8 x i8> [[TMP1]] to <8 x i16>
+// CHECK-NEXT:    ret <8 x i16> [[TMP2]]
+//
+int16x8_t test_vldrbq_s16(const int8_t *base)
+{
+    return vldrbq_s16(base);
+}
+
+// CHECK-LABEL: @test_vldrbq_s32(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast i8* [[BASE:%.*]] to <4 x i8>*
+// CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i8>, <4 x i8>* [[TMP0]], align 1
+// CHECK-NEXT:    [[TMP2:%.*]] = sext <4 x i8> [[TMP1]] to <4 x i32>
+// CHECK-NEXT:    ret <4 x i32> [[TMP2]]
+//
+int32x4_t test_vldrbq_s32(const int8_t *base)
+{
+    return vldrbq_s32(base);
+}
+
+// CHECK-LABEL: @test_vldrbq_u8(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast i8* [[BASE:%.*]] to <16 x i8>*
+// CHECK-NEXT:    [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* [[TMP0]], align 1
+// CHECK-NEXT:    ret <16 x i8> [[TMP1]]
+//
+uint8x16_t test_vldrbq_u8(const uint8_t *base)
+{
+    return vldrbq_u8(base);
+}
+
+// CHECK-LABEL: @test_vldrbq_u16(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast i8* [[BASE:%.*]] to <8 x i8>*
+// CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i8>, <8 x i8>* [[TMP0]], align 1
+// CHECK-NEXT:    [[TMP2:%.*]] = zext <8 x i8> [[TMP1]] to <8 x i16>
+// CHECK-NEXT:    ret <8 x i16> [[TMP2]]
+//
+uint16x8_t test_vldrbq_u16(const uint8_t *base)
+{
+    return vldrbq_u16(base);
+}
+
+// CHECK-LABEL: @test_vldrbq_u32(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast i8* [[BASE:%.*]] to <4 x i8>*
+// CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i8>, <4 x i8>* [[TMP0]], align 1
+// CHECK-NEXT:    [[TMP2:%.*]] = zext <4 x i8> [[TMP1]] to <4 x i32>
+// CHECK-NEXT:    ret <4 x i32> [[TMP2]]
+//
+uint32x4_t test_vldrbq_u32(const uint8_t *base)
+{
+    return vldrbq_u32(base);
+}
+
+// CHECK-LABEL: @test_vldrbq_z_s8(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast i8* [[BASE:%.*]] to <16 x i8>*
+// CHECK-NEXT:    [[TMP1:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT:    [[TMP2:%.*]] = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 [[TMP1]])
+// CHECK-NEXT:    [[TMP3:%.*]] = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* [[TMP0]], i32 1, <16 x i1> [[TMP2]], <16 x i8> zeroinitializer)
+// CHECK-NEXT:    ret <16 x i8> [[TMP3]]
+//
+int8x16_t test_vldrbq_z_s8(const int8_t *base, mve_pred16_t p)
+{
+    return vldrbq_z_s8(base, p);
+}
+
+// CHECK-LABEL: @test_vldrbq_z_s16(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast i8* [[BASE:%.*]] to <8 x i8>*
+// CHECK-NEXT:    [[TMP1:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT:    [[TMP2:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP1]])
+// CHECK-NEXT:    [[TMP3:%.*]] = call <8 x i8> @llvm.masked.load.v8i8.p0v8i8(<8 x i8>* [[TMP0]], i32 1, <8 x i1> [[TMP2]], <8 x i8> zeroinitializer)
+// CHECK-NEXT:    [[TMP4:%.*]] = sext <8 x i8> [[TMP3]] to <8 x i16>
+// CHECK-NEXT:    ret <8 x i16> [[TMP4]]
+//
+int16x8_t test_vldrbq_z_s16(const int8_t *base, mve_pred16_t p)
+{
+    return vldrbq_z_s16(base, p);
+}
+
+// CHECK-LABEL: @test_vldrbq_z_s32(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast i8* [[BASE:%.*]] to <4 x i8>*
+// CHECK-NEXT:    [[TMP1:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT:    [[TMP2:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP1]])
+// CHECK-NEXT:    [[TMP3:%.*]] = call <4 x i8> @llvm.masked.load.v4i8.p0v4i8(<4 x i8>* [[TMP0]], i32 1, <4 x i1> [[TMP2]], <4 x i8> zeroinitializer)
+// CHECK-NEXT:    [[TMP4:%.*]] = sext <4 x i8> [[TMP3]] to <4 x i32>
+// CHECK-NEXT:    ret <4 x i32> [[TMP4]]
+//
+int32x4_t test_vldrbq_z_s32(const int8_t *base, mve_pred16_t p)
+{
+    return vldrbq_z_s32(base, p);
+}
+
+// CHECK-LABEL: @test_vldrbq_z_u8(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast i8* [[BASE:%.*]] to <16 x i8>*
+// CHECK-NEXT:    [[TMP1:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT:    [[TMP2:%.*]] = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 [[TMP1]])
+// CHECK-NEXT:    [[TMP3:%.*]] = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* [[TMP0]], i32 1, <16 x i1> [[TMP2]], <16 x i8> zeroinitializer)
+// CHECK-NEXT:    ret <16 x i8> [[TMP3]]
+//
+uint8x16_t test_vldrbq_z_u8(const uint8_t *base, mve_pred16_t p)
+{
+    return vldrbq_z_u8(base, p);
+}
+
+// CHECK-LABEL: @test_vldrbq_z_u16(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast i8* [[BASE:%.*]] to <8 x i8>*
+// CHECK-NEXT:    [[TMP1:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT:    [[TMP2:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP1]])
+// CHECK-NEXT:    [[TMP3:%.*]] = call <8 x i8> @llvm.masked.load.v8i8.p0v8i8(<8 x i8>* [[TMP0]], i32 1, <8 x i1> [[TMP2]], <8 x i8> zeroinitializer)
+// CHECK-NEXT:    [[TMP4:%.*]] = zext <8 x i8> [[TMP3]] to <8 x i16>
+// CHECK-NEXT:    ret <8 x i16> [[TMP4]]
+//
+uint16x8_t test_vldrbq_z_u16(const uint8_t *base, mve_pred16_t p)
+{
+    return vldrbq_z_u16(base, p);
+}
+
+// CHECK-LABEL: @test_vldrbq_z_u32(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast i8* [[BASE:%.*]] to <4 x i8>*
+// CHECK-NEXT:    [[TMP1:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT:    [[TMP2:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP1]])
+// CHECK-NEXT:    [[TMP3:%.*]] = call <4 x i8> @llvm.masked.load.v4i8.p0v4i8(<4 x i8>* [[TMP0]], i32 1, <4 x i1> [[TMP2]], <4 x i8> zeroinitializer)
+// CHECK-NEXT:    [[TMP4:%.*]] = zext <4 x i8> [[TMP3]] to <4 x i32>
+// CHECK-NEXT:    ret <4 x i32> [[TMP4]]
+//
+uint32x4_t test_vldrbq_z_u32(const uint8_t *base, mve_pred16_t p)
+{
+    return vldrbq_z_u32(base, p);
+}
+
+// CHECK-LABEL: @test_vldrhq_f16(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast half* [[BASE:%.*]] to <8 x half>*
+// CHECK-NEXT:    [[TMP1:%.*]] = load <8 x half>, <8 x half>* [[TMP0]], align 2
+// CHECK-NEXT:    ret <8 x half> [[TMP1]]
+//
+float16x8_t test_vldrhq_f16(const float16_t *base)
+{
+    return vldrhq_f16(base);
+}
+
+// CHECK-LABEL: @test_vldrhq_s16(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast i16* [[BASE:%.*]] to <8 x i16>*
+// CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i16>, <8 x i16>* [[TMP0]], align 2
+// CHECK-NEXT:    ret <8 x i16> [[TMP1]]
+//
+int16x8_t test_vldrhq_s16(const int16_t *base)
+{
+    return vldrhq_s16(base);
+}
+
+// CHECK-LABEL: @test_vldrhq_s32(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast i16* [[BASE:%.*]] to <4 x i16>*
+// CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i16>, <4 x i16>* [[TMP0]], align 2
+// CHECK-NEXT:    [[TMP2:%.*]] = sext <4 x i16> [[TMP1]] to <4 x i32>
+// CHECK-NEXT:    ret <4 x i32> [[TMP2]]
+//
+int32x4_t test_vldrhq_s32(const int16_t *base)
+{
+    return vldrhq_s32(base);
+}
+
+// CHECK-LABEL: @test_vldrhq_u16(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast i16* [[BASE:%.*]] to <8 x i16>*
+// CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i16>, <8 x i16>* [[TMP0]], align 2
+// CHECK-NEXT:    ret <8 x i16> [[TMP1]]
+//
+uint16x8_t test_vldrhq_u16(const uint16_t *base)
+{
+    return vldrhq_u16(base);
+}
+
+// CHECK-LABEL: @test_vldrhq_u32(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast i16* [[BASE:%.*]] to <4 x i16>*
+// CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i16>, <4 x i16>* [[TMP0]], align 2
+// CHECK-NEXT:    [[TMP2:%.*]] = zext <4 x i16> [[TMP1]] to <4 x i32>
+// CHECK-NEXT:    ret <4 x i32> [[TMP2]]
+//
+uint32x4_t test_vldrhq_u32(const uint16_t *base)
+{
+    return vldrhq_u32(base);
+}
+
+// CHECK-LABEL: @test_vldrhq_z_f16(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast half* [[BASE:%.*]] to <8 x half>*
+// CHECK-NEXT:    [[TMP1:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT:    [[TMP2:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP1]])
+// CHECK-NEXT:    [[TMP3:%.*]] = call <8 x half> @llvm.masked.load.v8f16.p0v8f16(<8 x half>* [[TMP0]], i32 2, <8 x i1> [[TMP2]], <8 x half> zeroinitializer)
+// CHECK-NEXT:    ret <8 x half> [[TMP3]]
+//
+float16x8_t test_vldrhq_z_f16(const float16_t *base, mve_pred16_t p)
+{
+    return vldrhq_z_f16(base, p);
+}
+
+// CHECK-LABEL: @test_vldrhq_z_s16(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast i16* [[BASE:%.*]] to <8 x i16>*
+// CHECK-NEXT:    [[TMP1:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT:    [[TMP2:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP1]])
+// CHECK-NEXT:    [[TMP3:%.*]] = call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* [[TMP0]], i32 2, <8 x i1> [[TMP2]], <8 x i16> zeroinitializer)
+// CHECK-NEXT:    ret <8 x i16> [[TMP3]]
+//
+int16x8_t test_vldrhq_z_s16(const int16_t *base, mve_pred16_t p)
+{
+    return vldrhq_z_s16(base, p);
+}
+
+// CHECK-LABEL: @test_vldrhq_z_s32(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast i16* [[BASE:%.*]] to <4 x i16>*
+// CHECK-NEXT:    [[TMP1:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT:    [[TMP2:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP1]])
+// CHECK-NEXT:    [[TMP3:%.*]] = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* [[TMP0]], i32 2, <4 x i1> [[TMP2]], <4 x i16> zeroinitializer)
+// CHECK-NEXT:    [[TMP4:%.*]] = sext <4 x i16> [[TMP3]] to <4 x i32>
+// CHECK-NEXT:    ret <4 x i32> [[TMP4]]
+//
+int32x4_t test_vldrhq_z_s32(const int16_t *base, mve_pred16_t p)
+{
+    return vldrhq_z_s32(base, p);
+}
+
+// CHECK-LABEL: @test_vldrhq_z_u16(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast i16* [[BASE:%.*]] to <8 x i16>*
+// CHECK-NEXT:    [[TMP1:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT:    [[TMP2:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP1]])
+// CHECK-NEXT:    [[TMP3:%.*]] = call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* [[TMP0]], i32 2, <8 x i1> [[TMP2]], <8 x i16> zeroinitializer)
+// CHECK-NEXT:    ret <8 x i16> [[TMP3]]
+//
+uint16x8_t test_vldrhq_z_u16(const uint16_t *base, mve_pred16_t p)
+{
+    return vldrhq_z_u16(base, p);
+}
+
+// CHECK-LABEL: @test_vldrhq_z_u32(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast i16* [[BASE:%.*]] to <4 x i16>*
+// CHECK-NEXT:    [[TMP1:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT:    [[TMP2:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP1]])
+// CHECK-NEXT:    [[TMP3:%.*]] = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* [[TMP0]], i32 2, <4 x i1> [[TMP2]], <4 x i16> zeroinitializer)
+// CHECK-NEXT:    [[TMP4:%.*]] = zext <4 x i16> [[TMP3]] to <4 x i32>
+// CHECK-NEXT:    ret <4 x i32> [[TMP4]]
+//
+uint32x4_t test_vldrhq_z_u32(const uint16_t *base, mve_pred16_t p)
+{
+    return vldrhq_z_u32(base, p);
+}
+
+// CHECK-LABEL: @test_vldrwq_f32(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast float* [[BASE:%.*]] to <4 x float>*
+// CHECK-NEXT:    [[TMP1:%.*]] = load <4 x float>, <4 x float>* [[TMP0]], align 4
+// CHECK-NEXT:    ret <4 x float> [[TMP1]]
+//
+float32x4_t test_vldrwq_f32(const float32_t *base)
+{
+    return vldrwq_f32(base);
+}
+
+// CHECK-LABEL: @test_vldrwq_s32(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast i32* [[BASE:%.*]] to <4 x i32>*
+// CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4
+// CHECK-NEXT:    ret <4 x i32> [[TMP1]]
+//
+int32x4_t test_vldrwq_s32(const int32_t *base)
+{
+    return vldrwq_s32(base);
+}
+
+// CHECK-LABEL: @test_vldrwq_u32(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast i32* [[BASE:%.*]] to <4 x i32>*
+// CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4
+// CHECK-NEXT:    ret <4 x i32> [[TMP1]]
+//
+uint32x4_t test_vldrwq_u32(const uint32_t *base)
+{
+    return vldrwq_u32(base);
+}
+
+// CHECK-LABEL: @test_vldrwq_z_f32(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast float* [[BASE:%.*]] to <4 x float>*
+// CHECK-NEXT:    [[TMP1:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT:    [[TMP2:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP1]])
+// CHECK-NEXT:    [[TMP3:%.*]] = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* [[TMP0]], i32 4, <4 x i1> [[TMP2]], <4 x float> zeroinitializer)
+// CHECK-NEXT:    ret <4 x float> [[TMP3]]
+//
+float32x4_t test_vldrwq_z_f32(const float32_t *base, mve_pred16_t p)
+{
+    return vldrwq_z_f32(base, p);
+}
+
+// CHECK-LABEL: @test_vldrwq_z_s32(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast i32* [[BASE:%.*]] to <4 x i32>*
+// CHECK-NEXT:    [[TMP1:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT:    [[TMP2:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP1]])
+// CHECK-NEXT:    [[TMP3:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP0]], i32 4, <4 x i1> [[TMP2]], <4 x i32> zeroinitializer)
+// CHECK-NEXT:    ret <4 x i32> [[TMP3]]
+//
+int32x4_t test_vldrwq_z_s32(const int32_t *base, mve_pred16_t p)
+{
+    return vldrwq_z_s32(base, p);
+}
+
+// CHECK-LABEL: @test_vldrwq_z_u32(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast i32* [[BASE:%.*]] to <4 x i32>*
+// CHECK-NEXT:    [[TMP1:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT:    [[TMP2:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP1]])
+// CHECK-NEXT:    [[TMP3:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP0]], i32 4, <4 x i1> [[TMP2]], <4 x i32> zeroinitializer)
+// CHECK-NEXT:    ret <4 x i32> [[TMP3]]
+//
+uint32x4_t test_vldrwq_z_u32(const uint32_t *base, mve_pred16_t p)
+{
+    return vldrwq_z_u32(base, p);
+}
+
+// CHECK-LABEL: @test_vst1q_f16(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast half* [[BASE:%.*]] to <8 x half>*
+// CHECK-NEXT:    store <8 x half> [[VALUE:%.*]], <8 x half>* [[TMP0]], align 2
+// CHECK-NEXT:    ret void
+//
+void test_vst1q_f16(float16_t *base, float16x8_t value)
+{
+#ifdef POLYMORPHIC
+    vst1q(base, value);
+#else /* POLYMORPHIC */
+    vst1q_f16(base, value);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vst1q_f32(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast float* [[BASE:%.*]] to <4 x float>*
+// CHECK-NEXT:    store <4 x float> [[VALUE:%.*]], <4 x float>* [[TMP0]], align 4
+// CHECK-NEXT:    ret void
+//
+void test_vst1q_f32(float32_t *base, float32x4_t value)
+{
+#ifdef POLYMORPHIC
+    vst1q(base, value);
+#else /* POLYMORPHIC */
+    vst1q_f32(base, value);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vst1q_s8(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast i8* [[BASE:%.*]] to <16 x i8>*
+// CHECK-NEXT:    store <16 x i8> [[VALUE:%.*]], <16 x i8>* [[TMP0]], align 1
+// CHECK-NEXT:    ret void
+//
+void test_vst1q_s8(int8_t *base, int8x16_t value)
+{
+#ifdef POLYMORPHIC
+    vst1q(base, value);
+#else /* POLYMORPHIC */
+    vst1q_s8(base, value);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vst1q_s16(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast i16* [[BASE:%.*]] to <8 x i16>*
+// CHECK-NEXT:    store <8 x i16> [[VALUE:%.*]], <8 x i16>* [[TMP0]], align 2
+// CHECK-NEXT:    ret void
+//
+void test_vst1q_s16(int16_t *base, int16x8_t value)
+{
+#ifdef POLYMORPHIC
+    vst1q(base, value);
+#else /* POLYMORPHIC */
+    vst1q_s16(base, value);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vst1q_s32(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast i32* [[BASE:%.*]] to <4 x i32>*
+// CHECK-NEXT:    store <4 x i32> [[VALUE:%.*]], <4 x i32>* [[TMP0]], align 4
+// CHECK-NEXT:    ret void
+//
+void test_vst1q_s32(int32_t *base, int32x4_t value)
+{
+#ifdef POLYMORPHIC
+    vst1q(base, value);
+#else /* POLYMORPHIC */
+    vst1q_s32(base, value);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vst1q_u8(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast i8* [[BASE:%.*]] to <16 x i8>*
+// CHECK-NEXT:    store <16 x i8> [[VALUE:%.*]], <16 x i8>* [[TMP0]], align 1
+// CHECK-NEXT:    ret void
+//
+void test_vst1q_u8(uint8_t *base, uint8x16_t value)
+{
+#ifdef POLYMORPHIC
+    vst1q(base, value);
+#else /* POLYMORPHIC */
+    vst1q_u8(base, value);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vst1q_u16(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast i16* [[BASE:%.*]] to <8 x i16>*
+// CHECK-NEXT:    store <8 x i16> [[VALUE:%.*]], <8 x i16>* [[TMP0]], align 2
+// CHECK-NEXT:    ret void
+//
+void test_vst1q_u16(uint16_t *base, uint16x8_t value)
+{
+#ifdef POLYMORPHIC
+    vst1q(base, value);
+#else /* POLYMORPHIC */
+    vst1q_u16(base, value);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vst1q_u32(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast i32* [[BASE:%.*]] to <4 x i32>*
+// CHECK-NEXT:    store <4 x i32> [[VALUE:%.*]], <4 x i32>* [[TMP0]], align 4
+// CHECK-NEXT:    ret void
+//
+void test_vst1q_u32(uint32_t *base, uint32x4_t value)
+{
+#ifdef POLYMORPHIC
+    vst1q(base, value);
+#else /* POLYMORPHIC */
+    vst1q_u32(base, value);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vst1q_p_f16(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast half* [[BASE:%.*]] to <8 x half>*
+// CHECK-NEXT:    [[TMP1:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT:    [[TMP2:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP1]])
+// CHECK-NEXT:    call void @llvm.masked.store.v8f16.p0v8f16(<8 x half> [[VALUE:%.*]], <8 x half>* [[TMP0]], i32 2, <8 x i1> [[TMP2]])
+// CHECK-NEXT:    ret void
+//
+void test_vst1q_p_f16(float16_t *base, float16x8_t value, mve_pred16_t p)
+{
+#ifdef POLYMORPHIC
+    vst1q_p(base, value, p);
+#else /* POLYMORPHIC */
+    vst1q_p_f16(base, value, p);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vst1q_p_f32(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast float* [[BASE:%.*]] to <4 x float>*
+// CHECK-NEXT:    [[TMP1:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT:    [[TMP2:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP1]])
+// CHECK-NEXT:    call void @llvm.masked.store.v4f32.p0v4f32(<4 x float> [[VALUE:%.*]], <4 x float>* [[TMP0]], i32 4, <4 x i1> [[TMP2]])
+// CHECK-NEXT:    ret void
+//
+void test_vst1q_p_f32(float32_t *base, float32x4_t value, mve_pred16_t p)
+{
+#ifdef POLYMORPHIC
+    vst1q_p(base, value, p);
+#else /* POLYMORPHIC */
+    vst1q_p_f32(base, value, p);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vst1q_p_s8(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast i8* [[BASE:%.*]] to <16 x i8>*
+// CHECK-NEXT:    [[TMP1:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT:    [[TMP2:%.*]] = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 [[TMP1]])
+// CHECK-NEXT:    call void @llvm.masked.store.v16i8.p0v16i8(<16 x i8> [[VALUE:%.*]], <16 x i8>* [[TMP0]], i32 1, <16 x i1> [[TMP2]])
+// CHECK-NEXT:    ret void
+//
+void test_vst1q_p_s8(int8_t *base, int8x16_t value, mve_pred16_t p)
+{
+#ifdef POLYMORPHIC
+    vst1q_p(base, value, p);
+#else /* POLYMORPHIC */
+    vst1q_p_s8(base, value, p);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vst1q_p_s16(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast i16* [[BASE:%.*]] to <8 x i16>*
+// CHECK-NEXT:    [[TMP1:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT:    [[TMP2:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP1]])
+// CHECK-NEXT:    call void @llvm.masked.store.v8i16.p0v8i16(<8 x i16> [[VALUE:%.*]], <8 x i16>* [[TMP0]], i32 2, <8 x i1> [[TMP2]])
+// CHECK-NEXT:    ret void
+//
+void test_vst1q_p_s16(int16_t *base, int16x8_t value, mve_pred16_t p)
+{
+#ifdef POLYMORPHIC
+    vst1q_p(base, value, p);
+#else /* POLYMORPHIC */
+    vst1q_p_s16(base, value, p);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vst1q_p_s32(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast i32* [[BASE:%.*]] to <4 x i32>*
+// CHECK-NEXT:    [[TMP1:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT:    [[TMP2:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP1]])
+// CHECK-NEXT:    call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> [[VALUE:%.*]], <4 x i32>* [[TMP0]], i32 4, <4 x i1> [[TMP2]])
+// CHECK-NEXT:    ret void
+//
+void test_vst1q_p_s32(int32_t *base, int32x4_t value, mve_pred16_t p)
+{
+#ifdef POLYMORPHIC
+    vst1q_p(base, value, p);
+#else /* POLYMORPHIC */
+    vst1q_p_s32(base, value, p);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vst1q_p_u8(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast i8* [[BASE:%.*]] to <16 x i8>*
+// CHECK-NEXT:    [[TMP1:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT:    [[TMP2:%.*]] = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 [[TMP1]])
+// CHECK-NEXT:    call void @llvm.masked.store.v16i8.p0v16i8(<16 x i8> [[VALUE:%.*]], <16 x i8>* [[TMP0]], i32 1, <16 x i1> [[TMP2]])
+// CHECK-NEXT:    ret void
+//
+void test_vst1q_p_u8(uint8_t *base, uint8x16_t value, mve_pred16_t p)
+{
+#ifdef POLYMORPHIC
+    vst1q_p(base, value, p);
+#else /* POLYMORPHIC */
+    vst1q_p_u8(base, value, p);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vst1q_p_u16(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast i16* [[BASE:%.*]] to <8 x i16>*
+// CHECK-NEXT:    [[TMP1:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT:    [[TMP2:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP1]])
+// CHECK-NEXT:    call void @llvm.masked.store.v8i16.p0v8i16(<8 x i16> [[VALUE:%.*]], <8 x i16>* [[TMP0]], i32 2, <8 x i1> [[TMP2]])
+// CHECK-NEXT:    ret void
+//
+void test_vst1q_p_u16(uint16_t *base, uint16x8_t value, mve_pred16_t p)
+{
+#ifdef POLYMORPHIC
+    vst1q_p(base, value, p);
+#else /* POLYMORPHIC */
+    vst1q_p_u16(base, value, p);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vst1q_p_u32(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast i32* [[BASE:%.*]] to <4 x i32>*
+// CHECK-NEXT:    [[TMP1:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT:    [[TMP2:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP1]])
+// CHECK-NEXT:    call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> [[VALUE:%.*]], <4 x i32>* [[TMP0]], i32 4, <4 x i1> [[TMP2]])
+// CHECK-NEXT:    ret void
+//
+void test_vst1q_p_u32(uint32_t *base, uint32x4_t value, mve_pred16_t p)
+{
+#ifdef POLYMORPHIC
+    vst1q_p(base, value, p);
+#else /* POLYMORPHIC */
+    vst1q_p_u32(base, value, p);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vstrbq_s8(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast i8* [[BASE:%.*]] to <16 x i8>*
+// CHECK-NEXT:    store <16 x i8> [[VALUE:%.*]], <16 x i8>* [[TMP0]], align 1
+// CHECK-NEXT:    ret void
+//
+void test_vstrbq_s8(int8_t *base, int8x16_t value)
+{
+#ifdef POLYMORPHIC
+    vstrbq(base, value);
+#else /* POLYMORPHIC */
+    vstrbq_s8(base, value);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vstrbq_s16(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = trunc <8 x i16> [[VALUE:%.*]] to <8 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast i8* [[BASE:%.*]] to <8 x i8>*
+// CHECK-NEXT:    store <8 x i8> [[TMP0]], <8 x i8>* [[TMP1]], align 1
+// CHECK-NEXT:    ret void
+//
+void test_vstrbq_s16(int8_t *base, int16x8_t value)
+{
+#ifdef POLYMORPHIC
+    vstrbq(base, value);
+#else /* POLYMORPHIC */
+    vstrbq_s16(base, value);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vstrbq_s32(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = trunc <4 x i32> [[VALUE:%.*]] to <4 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast i8* [[BASE:%.*]] to <4 x i8>*
+// CHECK-NEXT:    store <4 x i8> [[TMP0]], <4 x i8>* [[TMP1]], align 1
+// CHECK-NEXT:    ret void
+//
+void test_vstrbq_s32(int8_t *base, int32x4_t value)
+{
+#ifdef POLYMORPHIC
+    vstrbq(base, value);
+#else /* POLYMORPHIC */
+    vstrbq_s32(base, value);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vstrbq_u8(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast i8* [[BASE:%.*]] to <16 x i8>*
+// CHECK-NEXT:    store <16 x i8> [[VALUE:%.*]], <16 x i8>* [[TMP0]], align 1
+// CHECK-NEXT:    ret void
+//
+void test_vstrbq_u8(uint8_t *base, uint8x16_t value)
+{
+#ifdef POLYMORPHIC
+    vstrbq(base, value);
+#else /* POLYMORPHIC */
+    vstrbq_u8(base, value);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vstrbq_u16(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = trunc <8 x i16> [[VALUE:%.*]] to <8 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast i8* [[BASE:%.*]] to <8 x i8>*
+// CHECK-NEXT:    store <8 x i8> [[TMP0]], <8 x i8>* [[TMP1]], align 1
+// CHECK-NEXT:    ret void
+//
+void test_vstrbq_u16(uint8_t *base, uint16x8_t value)
+{
+#ifdef POLYMORPHIC
+    vstrbq(base, value);
+#else /* POLYMORPHIC */
+    vstrbq_u16(base, value);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vstrbq_u32(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = trunc <4 x i32> [[VALUE:%.*]] to <4 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast i8* [[BASE:%.*]] to <4 x i8>*
+// CHECK-NEXT:    store <4 x i8> [[TMP0]], <4 x i8>* [[TMP1]], align 1
+// CHECK-NEXT:    ret void
+//
+void test_vstrbq_u32(uint8_t *base, uint32x4_t value)
+{
+#ifdef POLYMORPHIC
+    vstrbq(base, value);
+#else /* POLYMORPHIC */
+    vstrbq_u32(base, value);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vstrbq_p_s8(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast i8* [[BASE:%.*]] to <16 x i8>*
+// CHECK-NEXT:    [[TMP1:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT:    [[TMP2:%.*]] = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 [[TMP1]])
+// CHECK-NEXT:    call void @llvm.masked.store.v16i8.p0v16i8(<16 x i8> [[VALUE:%.*]], <16 x i8>* [[TMP0]], i32 1, <16 x i1> [[TMP2]])
+// CHECK-NEXT:    ret void
+//
+void test_vstrbq_p_s8(int8_t *base, int8x16_t value, mve_pred16_t p)
+{
+#ifdef POLYMORPHIC
+    vstrbq_p(base, value, p);
+#else /* POLYMORPHIC */
+    vstrbq_p_s8(base, value, p);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vstrbq_p_s16(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = trunc <8 x i16> [[VALUE:%.*]] to <8 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast i8* [[BASE:%.*]] to <8 x i8>*
+// CHECK-NEXT:    [[TMP2:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT:    [[TMP3:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP2]])
+// CHECK-NEXT:    call void @llvm.masked.store.v8i8.p0v8i8(<8 x i8> [[TMP0]], <8 x i8>* [[TMP1]], i32 1, <8 x i1> [[TMP3]])
+// CHECK-NEXT:    ret void
+//
+void test_vstrbq_p_s16(int8_t *base, int16x8_t value, mve_pred16_t p)
+{
+#ifdef POLYMORPHIC
+    vstrbq_p(base, value, p);
+#else /* POLYMORPHIC */
+    vstrbq_p_s16(base, value, p);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vstrbq_p_s32(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = trunc <4 x i32> [[VALUE:%.*]] to <4 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast i8* [[BASE:%.*]] to <4 x i8>*
+// CHECK-NEXT:    [[TMP2:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT:    [[TMP3:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP2]])
+// CHECK-NEXT:    call void @llvm.masked.store.v4i8.p0v4i8(<4 x i8> [[TMP0]], <4 x i8>* [[TMP1]], i32 1, <4 x i1> [[TMP3]])
+// CHECK-NEXT:    ret void
+//
+void test_vstrbq_p_s32(int8_t *base, int32x4_t value, mve_pred16_t p)
+{
+#ifdef POLYMORPHIC
+    vstrbq_p(base, value, p);
+#else /* POLYMORPHIC */
+    vstrbq_p_s32(base, value, p);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vstrbq_p_u8(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast i8* [[BASE:%.*]] to <16 x i8>*
+// CHECK-NEXT:    [[TMP1:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT:    [[TMP2:%.*]] = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 [[TMP1]])
+// CHECK-NEXT:    call void @llvm.masked.store.v16i8.p0v16i8(<16 x i8> [[VALUE:%.*]], <16 x i8>* [[TMP0]], i32 1, <16 x i1> [[TMP2]])
+// CHECK-NEXT:    ret void
+//
+void test_vstrbq_p_u8(uint8_t *base, uint8x16_t value, mve_pred16_t p)
+{
+#ifdef POLYMORPHIC
+    vstrbq_p(base, value, p);
+#else /* POLYMORPHIC */
+    vstrbq_p_u8(base, value, p);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vstrbq_p_u16(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = trunc <8 x i16> [[VALUE:%.*]] to <8 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast i8* [[BASE:%.*]] to <8 x i8>*
+// CHECK-NEXT:    [[TMP2:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT:    [[TMP3:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP2]])
+// CHECK-NEXT:    call void @llvm.masked.store.v8i8.p0v8i8(<8 x i8> [[TMP0]], <8 x i8>* [[TMP1]], i32 1, <8 x i1> [[TMP3]])
+// CHECK-NEXT:    ret void
+//
+void test_vstrbq_p_u16(uint8_t *base, uint16x8_t value, mve_pred16_t p)
+{
+#ifdef POLYMORPHIC
+    vstrbq_p(base, value, p);
+#else /* POLYMORPHIC */
+    vstrbq_p_u16(base, value, p);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vstrbq_p_u32(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = trunc <4 x i32> [[VALUE:%.*]] to <4 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast i8* [[BASE:%.*]] to <4 x i8>*
+// CHECK-NEXT:    [[TMP2:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT:    [[TMP3:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP2]])
+// CHECK-NEXT:    call void @llvm.masked.store.v4i8.p0v4i8(<4 x i8> [[TMP0]], <4 x i8>* [[TMP1]], i32 1, <4 x i1> [[TMP3]])
+// CHECK-NEXT:    ret void
+//
+void test_vstrbq_p_u32(uint8_t *base, uint32x4_t value, mve_pred16_t p)
+{
+#ifdef POLYMORPHIC
+    vstrbq_p(base, value, p);
+#else /* POLYMORPHIC */
+    vstrbq_p_u32(base, value, p);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vstrhq_f16(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast half* [[BASE:%.*]] to <8 x half>*
+// CHECK-NEXT:    store <8 x half> [[VALUE:%.*]], <8 x half>* [[TMP0]], align 2
+// CHECK-NEXT:    ret void
+//
+void test_vstrhq_f16(float16_t *base, float16x8_t value)
+{
+#ifdef POLYMORPHIC
+    vstrhq(base, value);
+#else /* POLYMORPHIC */
+    vstrhq_f16(base, value);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vstrhq_s16(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast i16* [[BASE:%.*]] to <8 x i16>*
+// CHECK-NEXT:    store <8 x i16> [[VALUE:%.*]], <8 x i16>* [[TMP0]], align 2
+// CHECK-NEXT:    ret void
+//
+void test_vstrhq_s16(int16_t *base, int16x8_t value)
+{
+#ifdef POLYMORPHIC
+    vstrhq(base, value);
+#else /* POLYMORPHIC */
+    vstrhq_s16(base, value);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vstrhq_s32(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = trunc <4 x i32> [[VALUE:%.*]] to <4 x i16>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast i16* [[BASE:%.*]] to <4 x i16>*
+// CHECK-NEXT:    store <4 x i16> [[TMP0]], <4 x i16>* [[TMP1]], align 2
+// CHECK-NEXT:    ret void
+//
+void test_vstrhq_s32(int16_t *base, int32x4_t value)
+{
+#ifdef POLYMORPHIC
+    vstrhq(base, value);
+#else /* POLYMORPHIC */
+    vstrhq_s32(base, value);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vstrhq_u16(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast i16* [[BASE:%.*]] to <8 x i16>*
+// CHECK-NEXT:    store <8 x i16> [[VALUE:%.*]], <8 x i16>* [[TMP0]], align 2
+// CHECK-NEXT:    ret void
+//
+void test_vstrhq_u16(uint16_t *base, uint16x8_t value)
+{
+#ifdef POLYMORPHIC
+    vstrhq(base, value);
+#else /* POLYMORPHIC */
+    vstrhq_u16(base, value);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vstrhq_u32(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = trunc <4 x i32> [[VALUE:%.*]] to <4 x i16>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast i16* [[BASE:%.*]] to <4 x i16>*
+// CHECK-NEXT:    store <4 x i16> [[TMP0]], <4 x i16>* [[TMP1]], align 2
+// CHECK-NEXT:    ret void
+//
+void test_vstrhq_u32(uint16_t *base, uint32x4_t value)
+{
+#ifdef POLYMORPHIC
+    vstrhq(base, value);
+#else /* POLYMORPHIC */
+    vstrhq_u32(base, value);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vstrhq_p_f16(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast half* [[BASE:%.*]] to <8 x half>*
+// CHECK-NEXT:    [[TMP1:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT:    [[TMP2:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP1]])
+// CHECK-NEXT:    call void @llvm.masked.store.v8f16.p0v8f16(<8 x half> [[VALUE:%.*]], <8 x half>* [[TMP0]], i32 2, <8 x i1> [[TMP2]])
+// CHECK-NEXT:    ret void
+//
+void test_vstrhq_p_f16(float16_t *base, float16x8_t value, mve_pred16_t p)
+{
+#ifdef POLYMORPHIC
+    vstrhq_p(base, value, p);
+#else /* POLYMORPHIC */
+    vstrhq_p_f16(base, value, p);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vstrhq_p_s16(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast i16* [[BASE:%.*]] to <8 x i16>*
+// CHECK-NEXT:    [[TMP1:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT:    [[TMP2:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP1]])
+// CHECK-NEXT:    call void @llvm.masked.store.v8i16.p0v8i16(<8 x i16> [[VALUE:%.*]], <8 x i16>* [[TMP0]], i32 2, <8 x i1> [[TMP2]])
+// CHECK-NEXT:    ret void
+//
+void test_vstrhq_p_s16(int16_t *base, int16x8_t value, mve_pred16_t p)
+{
+#ifdef POLYMORPHIC
+    vstrhq_p(base, value, p);
+#else /* POLYMORPHIC */
+    vstrhq_p_s16(base, value, p);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vstrhq_p_s32(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = trunc <4 x i32> [[VALUE:%.*]] to <4 x i16>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast i16* [[BASE:%.*]] to <4 x i16>*
+// CHECK-NEXT:    [[TMP2:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT:    [[TMP3:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP2]])
+// CHECK-NEXT:    call void @llvm.masked.store.v4i16.p0v4i16(<4 x i16> [[TMP0]], <4 x i16>* [[TMP1]], i32 2, <4 x i1> [[TMP3]])
+// CHECK-NEXT:    ret void
+//
+void test_vstrhq_p_s32(int16_t *base, int32x4_t value, mve_pred16_t p)
+{
+#ifdef POLYMORPHIC
+    vstrhq_p(base, value, p);
+#else /* POLYMORPHIC */
+    vstrhq_p_s32(base, value, p);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vstrhq_p_u16(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast i16* [[BASE:%.*]] to <8 x i16>*
+// CHECK-NEXT:    [[TMP1:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT:    [[TMP2:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP1]])
+// CHECK-NEXT:    call void @llvm.masked.store.v8i16.p0v8i16(<8 x i16> [[VALUE:%.*]], <8 x i16>* [[TMP0]], i32 2, <8 x i1> [[TMP2]])
+// CHECK-NEXT:    ret void
+//
+void test_vstrhq_p_u16(uint16_t *base, uint16x8_t value, mve_pred16_t p)
+{
+#ifdef POLYMORPHIC
+    vstrhq_p(base, value, p);
+#else /* POLYMORPHIC */
+    vstrhq_p_u16(base, value, p);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vstrhq_p_u32(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = trunc <4 x i32> [[VALUE:%.*]] to <4 x i16>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast i16* [[BASE:%.*]] to <4 x i16>*
+// CHECK-NEXT:    [[TMP2:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT:    [[TMP3:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP2]])
+// CHECK-NEXT:    call void @llvm.masked.store.v4i16.p0v4i16(<4 x i16> [[TMP0]], <4 x i16>* [[TMP1]], i32 2, <4 x i1> [[TMP3]])
+// CHECK-NEXT:    ret void
+//
+void test_vstrhq_p_u32(uint16_t *base, uint32x4_t value, mve_pred16_t p)
+{
+#ifdef POLYMORPHIC
+    vstrhq_p(base, value, p);
+#else /* POLYMORPHIC */
+    vstrhq_p_u32(base, value, p);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vstrwq_f32(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast float* [[BASE:%.*]] to <4 x float>*
+// CHECK-NEXT:    store <4 x float> [[VALUE:%.*]], <4 x float>* [[TMP0]], align 4
+// CHECK-NEXT:    ret void
+//
+void test_vstrwq_f32(float32_t *base, float32x4_t value)
+{
+#ifdef POLYMORPHIC
+    vstrwq(base, value);
+#else /* POLYMORPHIC */
+    vstrwq_f32(base, value);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vstrwq_s32(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast i32* [[BASE:%.*]] to <4 x i32>*
+// CHECK-NEXT:    store <4 x i32> [[VALUE:%.*]], <4 x i32>* [[TMP0]], align 4
+// CHECK-NEXT:    ret void
+//
+void test_vstrwq_s32(int32_t *base, int32x4_t value)
+{
+#ifdef POLYMORPHIC
+    vstrwq(base, value);
+#else /* POLYMORPHIC */
+    vstrwq_s32(base, value);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vstrwq_u32(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast i32* [[BASE:%.*]] to <4 x i32>*
+// CHECK-NEXT:    store <4 x i32> [[VALUE:%.*]], <4 x i32>* [[TMP0]], align 4
+// CHECK-NEXT:    ret void
+//
+void test_vstrwq_u32(uint32_t *base, uint32x4_t value)
+{
+#ifdef POLYMORPHIC
+    vstrwq(base, value);
+#else /* POLYMORPHIC */
+    vstrwq_u32(base, value);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vstrwq_p_f32(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast float* [[BASE:%.*]] to <4 x float>*
+// CHECK-NEXT:    [[TMP1:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT:    [[TMP2:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP1]])
+// CHECK-NEXT:    call void @llvm.masked.store.v4f32.p0v4f32(<4 x float> [[VALUE:%.*]], <4 x float>* [[TMP0]], i32 4, <4 x i1> [[TMP2]])
+// CHECK-NEXT:    ret void
+//
+void test_vstrwq_p_f32(float32_t *base, float32x4_t value, mve_pred16_t p)
+{
+#ifdef POLYMORPHIC
+    vstrwq_p(base, value, p);
+#else /* POLYMORPHIC */
+    vstrwq_p_f32(base, value, p);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vstrwq_p_s32(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast i32* [[BASE:%.*]] to <4 x i32>*
+// CHECK-NEXT:    [[TMP1:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT:    [[TMP2:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP1]])
+// CHECK-NEXT:    call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> [[VALUE:%.*]], <4 x i32>* [[TMP0]], i32 4, <4 x i1> [[TMP2]])
+// CHECK-NEXT:    ret void
+//
+void test_vstrwq_p_s32(int32_t *base, int32x4_t value, mve_pred16_t p)
+{
+#ifdef POLYMORPHIC
+    vstrwq_p(base, value, p);
+#else /* POLYMORPHIC */
+    vstrwq_p_s32(base, value, p);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vstrwq_p_u32(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast i32* [[BASE:%.*]] to <4 x i32>*
+// CHECK-NEXT:    [[TMP1:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT:    [[TMP2:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP1]])
+// CHECK-NEXT:    call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> [[VALUE:%.*]], <4 x i32>* [[TMP0]], i32 4, <4 x i1> [[TMP2]])
+// CHECK-NEXT:    ret void
+//
+void test_vstrwq_p_u32(uint32_t *base, uint32x4_t value, mve_pred16_t p)
+{
+#ifdef POLYMORPHIC
+    vstrwq_p(base, value, p);
+#else /* POLYMORPHIC */
+    vstrwq_p_u32(base, value, p);
+#endif /* POLYMORPHIC */
+}
diff --git a/clang/utils/TableGen/MveEmitter.cpp b/clang/utils/TableGen/MveEmitter.cpp

index aa3b475..2941cb6 100644 (file)
--- a/clang/utils/TableGen/MveEmitter.cpp
+++ b/clang/utils/TableGen/MveEmitter.cpp
@@ -283,12 +283,9 @@ class VectorType : public CRegularNamedType {
    unsigned Lanes;
  
  public:
-  VectorType(const ScalarType *Element)
-      : CRegularNamedType(TypeKind::Vector), Element(Element) {
-    // MVE has a fixed 128-bit vector size
-    Lanes = 128 / Element->sizeInBits();
-  }
-  unsigned sizeInBits() const override { return 128; }
+  VectorType(const ScalarType *Element, unsigned Lanes)
+      : CRegularNamedType(TypeKind::Vector), Element(Element), Lanes(Lanes) {}
+  unsigned sizeInBits() const override { return Lanes * Element->sizeInBits(); }
    unsigned lanes() const { return Lanes; }
    bool requiresFloat() const override { return Element->requiresFloat(); }
    std::string cNameBase() const override {
@@ -609,24 +606,41 @@ public:
    }
  };
  
+// Result subclass representing a cast between different pointer types.
+class PointerCastResult : public Result {
+public:
+  const PointerType *PtrType;
+  Ptr V;
+  PointerCastResult(const PointerType *PtrType, Ptr V)
+      : PtrType(PtrType), V(V) {}
+  void genCode(raw_ostream &OS,
+               CodeGenParamAllocator &ParamAlloc) const override {
+    OS << "Builder.CreatePointerCast(" << V->asValue() << ", "
+       << ParamAlloc.allocParam("llvm::Type *", PtrType->llvmName()) << ")";
+  }
+  void morePrerequisites(std::vector<Ptr> &output) const override {
+    output.push_back(V);
+  }
+};
+
  // Result subclass representing a call to an IRBuilder method. Each IRBuilder
  // method we want to use will have a Tablegen record giving the method name and
  // describing any important details of how to call it, such as whether a
  // particular argument should be an integer constant instead of an llvm::Value.
  class IRBuilderResult : public Result {
  public:
-  StringRef BuilderMethod;
+  StringRef CallPrefix;
    std::vector<Ptr> Args;
    std::set<unsigned> AddressArgs;
    std::set<unsigned> IntConstantArgs;
-  IRBuilderResult(StringRef BuilderMethod, std::vector<Ptr> Args,
+  IRBuilderResult(StringRef CallPrefix, std::vector<Ptr> Args,
                    std::set<unsigned> AddressArgs,
                    std::set<unsigned> IntConstantArgs)
-      : BuilderMethod(BuilderMethod), Args(Args), AddressArgs(AddressArgs),
+    : CallPrefix(CallPrefix), Args(Args), AddressArgs(AddressArgs),
          IntConstantArgs(IntConstantArgs) {}
    void genCode(raw_ostream &OS,
                 CodeGenParamAllocator &ParamAlloc) const override {
-    OS << "Builder." << BuilderMethod << "(";
+    OS << CallPrefix;
      const char *Sep = "";
      for (unsigned i = 0, e = Args.size(); i < e; ++i) {
        Ptr Arg = Args[i];
@@ -652,6 +666,25 @@ public:
    }
  };
  
+// Result subclass representing making an Address out of a Value.
+class AddressResult : public Result {
+public:
+  Ptr Arg;
+  unsigned Align;
+  AddressResult(Ptr Arg, unsigned Align) : Arg(Arg), Align(Align) {}
+  void genCode(raw_ostream &OS,
+               CodeGenParamAllocator &ParamAlloc) const override {
+    OS << "Address(" << Arg->varname() << ", CharUnits::fromQuantity("
+       << Align << "))";
+  }
+  std::string typeName() const override {
+    return "Address";
+  }
+  void morePrerequisites(std::vector<Ptr> &output) const override {
+    output.push_back(Arg);
+  }
+};
+
  // Result subclass representing a call to an IR intrinsic, which we first have
  // to look up using an Intrinsic::ID constant and an array of types.
  class IRIntrinsicResult : public Result {
@@ -665,7 +698,7 @@ public:
    void genCode(raw_ostream &OS,
                 CodeGenParamAllocator &ParamAlloc) const override {
      std::string IntNo = ParamAlloc.allocParam(
-        "Intrinsic::ID", "Intrinsic::arm_mve_" + IntrinsicID);
+        "Intrinsic::ID", "Intrinsic::" + IntrinsicID);
      OS << "Builder.CreateCall(CGM.getIntrinsic(" << IntNo;
      if (!ParamTypes.empty()) {
        OS << ", llvm::SmallVector<llvm::Type *, " << ParamTypes.size() << "> {";
@@ -689,6 +722,20 @@ public:
    }
  };
  
+// Result subclass that specifies a type, for use in IRBuilder operations such
+// as CreateBitCast that take a type argument.
+class TypeResult : public Result {
+public:
+  const Type *T;
+  TypeResult(const Type *T) : T(T) {}
+  void genCode(raw_ostream &OS, CodeGenParamAllocator &) const override {
+    OS << T->llvmName();
+  }
+  std::string typeName() const override {
+    return "llvm::Type *";
+  }
+};
+
  // -----------------------------------------------------------------------------
  // Class that describes a single ACLE intrinsic.
  //
@@ -852,7 +899,8 @@ class MveEmitter {
    // MveEmitter holds a collection of all the types we've instantiated.
    VoidType Void;
    std::map<std::string, std::unique_ptr<ScalarType>> ScalarTypes;
-  std::map<std::pair<ScalarTypeKind, unsigned>, std::unique_ptr<VectorType>>
+  std::map<std::tuple<ScalarTypeKind, unsigned, unsigned>,
+           std::unique_ptr<VectorType>>
        VectorTypes;
    std::map<std::pair<std::string, unsigned>, std::unique_ptr<MultiVectorType>>
        MultiVectorTypes;
@@ -872,12 +920,16 @@ public:
    const ScalarType *getScalarType(Record *R) {
      return getScalarType(R->getName());
    }
-  const VectorType *getVectorType(const ScalarType *ST) {
-    std::pair<ScalarTypeKind, unsigned> key(ST->kind(), ST->sizeInBits());
+  const VectorType *getVectorType(const ScalarType *ST, unsigned Lanes) {
+    std::tuple<ScalarTypeKind, unsigned, unsigned> key(ST->kind(),
+                                                       ST->sizeInBits(), Lanes);
      if (VectorTypes.find(key) == VectorTypes.end())
-      VectorTypes[key] = std::make_unique<VectorType>(ST);
+      VectorTypes[key] = std::make_unique<VectorType>(ST, Lanes);
      return VectorTypes[key].get();
    }
+  const VectorType *getVectorType(const ScalarType *ST) {
+    return getVectorType(ST, 128 / ST->sizeInBits());
+  }
    const MultiVectorType *getMultiVectorType(unsigned Registers,
                                              const VectorType *VT) {
      std::pair<std::string, unsigned> key(VT->cNameBase(), Registers);
@@ -969,7 +1021,13 @@ const Type *MveEmitter::getType(DagInit *D, const Type *Param) {
  
    if (Op->getName() == "CTO_Vec") {
      const Type *Element = getType(D->getArg(0), Param);
-    return getVectorType(cast<ScalarType>(Element));
+    if (D->getNumArgs() == 1) {
+      return getVectorType(cast<ScalarType>(Element));
+    } else {
+      const Type *ExistingVector = getType(D->getArg(1), Param);
+      return getVectorType(cast<ScalarType>(Element),
+                           cast<VectorType>(ExistingVector)->lanes());
+    }
    }
  
    if (Op->getName() == "CTO_Pred") {
@@ -1035,8 +1093,21 @@ Result::Ptr MveEmitter::getCodeForDag(DagInit *D, const Result::Scope &Scope,
          else
            return std::make_shared<IntCastResult>(ST, Arg);
        }
+    } else if (const auto *PT = dyn_cast<PointerType>(CastType)) {
+      return std::make_shared<PointerCastResult>(PT, Arg);
      }
      PrintFatalError("Unsupported type cast");
+  } else if (Op->getName() == "address") {
+    if (D->getNumArgs() != 2)
+      PrintFatalError("'address' should have two arguments");
+    Result::Ptr Arg = getCodeForDagArg(D, 0, Scope, Param);
+    unsigned Alignment;
+    if (auto *II = dyn_cast<IntInit>(D->getArg(1))) {
+      Alignment = II->getValue();
+    } else {
+      PrintFatalError("'address' alignment argument should be an integer");
+    }
+    return std::make_shared<AddressResult>(Arg, Alignment);
    } else if (Op->getName() == "unsignedflag") {
      if (D->getNumArgs() != 1)
        PrintFatalError("unsignedflag should have exactly one argument");
@@ -1053,7 +1124,7 @@ Result::Ptr MveEmitter::getCodeForDag(DagInit *D, const Result::Scope &Scope,
      std::vector<Result::Ptr> Args;
      for (unsigned i = 0, e = D->getNumArgs(); i < e; ++i)
        Args.push_back(getCodeForDagArg(D, i, Scope, Param));
-    if (Op->isSubClassOf("IRBuilder")) {
+    if (Op->isSubClassOf("IRBuilderBase")) {
        std::set<unsigned> AddressArgs;
        for (unsigned i : Op->getValueAsListOfInts("address_params"))
          AddressArgs.insert(i);
@@ -1061,8 +1132,8 @@ Result::Ptr MveEmitter::getCodeForDag(DagInit *D, const Result::Scope &Scope,
        for (unsigned i : Op->getValueAsListOfInts("int_constant_params"))
          IntConstantArgs.insert(i);
        return std::make_shared<IRBuilderResult>(
-          Op->getValueAsString("func"), Args, AddressArgs, IntConstantArgs);
-    } else if (Op->isSubClassOf("IRInt")) {
+          Op->getValueAsString("prefix"), Args, AddressArgs, IntConstantArgs);
+    } else if (Op->isSubClassOf("IRIntBase")) {
        std::vector<const Type *> ParamTypes;
        for (Record *RParam : Op->getValueAsListOfDefs("params"))
          ParamTypes.push_back(getType(RParam, Param));
@@ -1099,6 +1170,14 @@ Result::Ptr MveEmitter::getCodeForDagArg(DagInit *D, unsigned ArgNum,
    if (auto *DI = dyn_cast<DagInit>(Arg))
      return getCodeForDag(DI, Scope, Param);
  
+  if (auto *DI = dyn_cast<DefInit>(Arg)) {
+    Record *Rec = DI->getDef();
+    if (Rec->isSubClassOf("Type")) {
+      const Type *T = getType(Rec, Param);
+      return std::make_shared<TypeResult>(T);
+    }
+  }
+
    PrintFatalError("bad dag argument type for code generation");
  }
  
@@ -1111,8 +1190,9 @@ Result::Ptr MveEmitter::getCodeForArg(unsigned ArgNum, const Type *ArgType) {
        V = std::make_shared<IntCastResult>(getScalarType("u32"), V);
    } else if (const auto *PT = dyn_cast<PredicateType>(ArgType)) {
      V = std::make_shared<IntCastResult>(getScalarType("u32"), V);
-    V = std::make_shared<IRIntrinsicResult>(
-        "pred_i2v", std::vector<const Type *>{PT}, std::vector<Result::Ptr>{V});
+    V = std::make_shared<IRIntrinsicResult>("arm_mve_pred_i2v",
+                                            std::vector<const Type *>{PT},
+                                            std::vector<Result::Ptr>{V});
    }
  
    return V;
diff --git a/llvm/test/CodeGen/Thumb2/mve-intrinsics/load-store.ll b/llvm/test/CodeGen/Thumb2/mve-intrinsics/load-store.ll

new file mode 100644 (file)

index 0000000..9780b08
--- /dev/null
+++ b/llvm/test/CodeGen/Thumb2/mve-intrinsics/load-store.ll
@@ -0,0 +1,1208 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=thumbv8.1m.main -mattr=+mve.fp -verify-machineinstrs -enable-arm-maskedldst -o - %s | FileCheck %s
+
+define arm_aapcs_vfpcc <8 x half> @test_vld1q_f16(half* %base) {
+; CHECK-LABEL: test_vld1q_f16:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vldrh.u16 q0, [r0]
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = bitcast half* %base to <8 x half>*
+  %1 = load <8 x half>, <8 x half>* %0, align 2
+  ret <8 x half> %1
+}
+
+define arm_aapcs_vfpcc <4 x float> @test_vld1q_f32(float* %base) {
+; CHECK-LABEL: test_vld1q_f32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vldrw.u32 q0, [r0]
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = bitcast float* %base to <4 x float>*
+  %1 = load <4 x float>, <4 x float>* %0, align 4
+  ret <4 x float> %1
+}
+
+define arm_aapcs_vfpcc <16 x i8> @test_vld1q_s8(i8* %base) {
+; CHECK-LABEL: test_vld1q_s8:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vldrb.u8 q0, [r0]
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = bitcast i8* %base to <16 x i8>*
+  %1 = load <16 x i8>, <16 x i8>* %0, align 1
+  ret <16 x i8> %1
+}
+
+define arm_aapcs_vfpcc <8 x i16> @test_vld1q_s16(i16* %base) {
+; CHECK-LABEL: test_vld1q_s16:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vldrh.u16 q0, [r0]
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = bitcast i16* %base to <8 x i16>*
+  %1 = load <8 x i16>, <8 x i16>* %0, align 2
+  ret <8 x i16> %1
+}
+
+define arm_aapcs_vfpcc <4 x i32> @test_vld1q_s32(i32* %base) {
+; CHECK-LABEL: test_vld1q_s32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vldrw.u32 q0, [r0]
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = bitcast i32* %base to <4 x i32>*
+  %1 = load <4 x i32>, <4 x i32>* %0, align 4
+  ret <4 x i32> %1
+}
+
+define arm_aapcs_vfpcc <16 x i8> @test_vld1q_u8(i8* %base) {
+; CHECK-LABEL: test_vld1q_u8:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vldrb.u8 q0, [r0]
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = bitcast i8* %base to <16 x i8>*
+  %1 = load <16 x i8>, <16 x i8>* %0, align 1
+  ret <16 x i8> %1
+}
+
+define arm_aapcs_vfpcc <8 x i16> @test_vld1q_u16(i16* %base) {
+; CHECK-LABEL: test_vld1q_u16:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vldrh.u16 q0, [r0]
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = bitcast i16* %base to <8 x i16>*
+  %1 = load <8 x i16>, <8 x i16>* %0, align 2
+  ret <8 x i16> %1
+}
+
+define arm_aapcs_vfpcc <4 x i32> @test_vld1q_u32(i32* %base) {
+; CHECK-LABEL: test_vld1q_u32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vldrw.u32 q0, [r0]
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = bitcast i32* %base to <4 x i32>*
+  %1 = load <4 x i32>, <4 x i32>* %0, align 4
+  ret <4 x i32> %1
+}
+
+define arm_aapcs_vfpcc <8 x half> @test_vld1q_z_f16(half* %base, i16 zeroext %p) {
+; CHECK-LABEL: test_vld1q_z_f16:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmsr p0, r1
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vldrht.u16 q0, [r0]
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = bitcast half* %base to <8 x half>*
+  %1 = zext i16 %p to i32
+  %2 = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %1)
+  %3 = call <8 x half> @llvm.masked.load.v8f16.p0v8f16(<8 x half>* %0, i32 2, <8 x i1> %2, <8 x half> zeroinitializer)
+  ret <8 x half> %3
+}
+
+declare <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32)
+
+declare <8 x half> @llvm.masked.load.v8f16.p0v8f16(<8 x half>*, i32 immarg, <8 x i1>, <8 x half>)
+
+define arm_aapcs_vfpcc <4 x float> @test_vld1q_z_f32(float* %base, i16 zeroext %p) {
+; CHECK-LABEL: test_vld1q_z_f32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmsr p0, r1
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vldrwt.u32 q0, [r0]
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = bitcast float* %base to <4 x float>*
+  %1 = zext i16 %p to i32
+  %2 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %1)
+  %3 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %0, i32 4, <4 x i1> %2, <4 x float> zeroinitializer)
+  ret <4 x float> %3
+}
+
+declare <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32)
+
+declare <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>*, i32 immarg, <4 x i1>, <4 x float>)
+
+define arm_aapcs_vfpcc <16 x i8> @test_vld1q_z_s8(i8* %base, i16 zeroext %p) {
+; CHECK-LABEL: test_vld1q_z_s8:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmsr p0, r1
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vldrbt.u8 q0, [r0]
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = bitcast i8* %base to <16 x i8>*
+  %1 = zext i16 %p to i32
+  %2 = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 %1)
+  %3 = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* %0, i32 1, <16 x i1> %2, <16 x i8> zeroinitializer)
+  ret <16 x i8> %3
+}
+
+declare <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32)
+
+declare <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>*, i32 immarg, <16 x i1>, <16 x i8>)
+
+define arm_aapcs_vfpcc <8 x i16> @test_vld1q_z_s16(i16* %base, i16 zeroext %p) {
+; CHECK-LABEL: test_vld1q_z_s16:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmsr p0, r1
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vldrht.u16 q0, [r0]
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = bitcast i16* %base to <8 x i16>*
+  %1 = zext i16 %p to i32
+  %2 = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %1)
+  %3 = call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* %0, i32 2, <8 x i1> %2, <8 x i16> zeroinitializer)
+  ret <8 x i16> %3
+}
+
+declare <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>*, i32 immarg, <8 x i1>, <8 x i16>)
+
+define arm_aapcs_vfpcc <4 x i32> @test_vld1q_z_s32(i32* %base, i16 zeroext %p) {
+; CHECK-LABEL: test_vld1q_z_s32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmsr p0, r1
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vldrwt.u32 q0, [r0]
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = bitcast i32* %base to <4 x i32>*
+  %1 = zext i16 %p to i32
+  %2 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %1)
+  %3 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %0, i32 4, <4 x i1> %2, <4 x i32> zeroinitializer)
+  ret <4 x i32> %3
+}
+
+declare <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>*, i32 immarg, <4 x i1>, <4 x i32>)
+
+define arm_aapcs_vfpcc <16 x i8> @test_vld1q_z_u8(i8* %base, i16 zeroext %p) {
+; CHECK-LABEL: test_vld1q_z_u8:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmsr p0, r1
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vldrbt.u8 q0, [r0]
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = bitcast i8* %base to <16 x i8>*
+  %1 = zext i16 %p to i32
+  %2 = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 %1)
+  %3 = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* %0, i32 1, <16 x i1> %2, <16 x i8> zeroinitializer)
+  ret <16 x i8> %3
+}
+
+define arm_aapcs_vfpcc <8 x i16> @test_vld1q_z_u16(i16* %base, i16 zeroext %p) {
+; CHECK-LABEL: test_vld1q_z_u16:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmsr p0, r1
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vldrht.u16 q0, [r0]
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = bitcast i16* %base to <8 x i16>*
+  %1 = zext i16 %p to i32
+  %2 = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %1)
+  %3 = call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* %0, i32 2, <8 x i1> %2, <8 x i16> zeroinitializer)
+  ret <8 x i16> %3
+}
+
+define arm_aapcs_vfpcc <4 x i32> @test_vld1q_z_u32(i32* %base, i16 zeroext %p) {
+; CHECK-LABEL: test_vld1q_z_u32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmsr p0, r1
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vldrwt.u32 q0, [r0]
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = bitcast i32* %base to <4 x i32>*
+  %1 = zext i16 %p to i32
+  %2 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %1)
+  %3 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %0, i32 4, <4 x i1> %2, <4 x i32> zeroinitializer)
+  ret <4 x i32> %3
+}
+
+define arm_aapcs_vfpcc <16 x i8> @test_vldrbq_s8(i8* %base) {
+; CHECK-LABEL: test_vldrbq_s8:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vldrb.u8 q0, [r0]
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = bitcast i8* %base to <16 x i8>*
+  %1 = load <16 x i8>, <16 x i8>* %0, align 1
+  ret <16 x i8> %1
+}
+
+define arm_aapcs_vfpcc <8 x i16> @test_vldrbq_s16(i8* %base) {
+; CHECK-LABEL: test_vldrbq_s16:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vldrb.s16 q0, [r0]
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = bitcast i8* %base to <8 x i8>*
+  %1 = load <8 x i8>, <8 x i8>* %0, align 1
+  %2 = sext <8 x i8> %1 to <8 x i16>
+  ret <8 x i16> %2
+}
+
+define arm_aapcs_vfpcc <4 x i32> @test_vldrbq_s32(i8* %base) {
+; CHECK-LABEL: test_vldrbq_s32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vldrb.s32 q0, [r0]
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = bitcast i8* %base to <4 x i8>*
+  %1 = load <4 x i8>, <4 x i8>* %0, align 1
+  %2 = sext <4 x i8> %1 to <4 x i32>
+  ret <4 x i32> %2
+}
+
+define arm_aapcs_vfpcc <16 x i8> @test_vldrbq_u8(i8* %base) {
+; CHECK-LABEL: test_vldrbq_u8:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vldrb.u8 q0, [r0]
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = bitcast i8* %base to <16 x i8>*
+  %1 = load <16 x i8>, <16 x i8>* %0, align 1
+  ret <16 x i8> %1
+}
+
+define arm_aapcs_vfpcc <8 x i16> @test_vldrbq_u16(i8* %base) {
+; CHECK-LABEL: test_vldrbq_u16:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vldrb.u16 q0, [r0]
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = bitcast i8* %base to <8 x i8>*
+  %1 = load <8 x i8>, <8 x i8>* %0, align 1
+  %2 = zext <8 x i8> %1 to <8 x i16>
+  ret <8 x i16> %2
+}
+
+define arm_aapcs_vfpcc <4 x i32> @test_vldrbq_u32(i8* %base) {
+; CHECK-LABEL: test_vldrbq_u32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vldrb.u32 q0, [r0]
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = bitcast i8* %base to <4 x i8>*
+  %1 = load <4 x i8>, <4 x i8>* %0, align 1
+  %2 = zext <4 x i8> %1 to <4 x i32>
+  ret <4 x i32> %2
+}
+
+define arm_aapcs_vfpcc <16 x i8> @test_vldrbq_z_s8(i8* %base, i16 zeroext %p) {
+; CHECK-LABEL: test_vldrbq_z_s8:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmsr p0, r1
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vldrbt.u8 q0, [r0]
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = bitcast i8* %base to <16 x i8>*
+  %1 = zext i16 %p to i32
+  %2 = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 %1)
+  %3 = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* %0, i32 1, <16 x i1> %2, <16 x i8> zeroinitializer)
+  ret <16 x i8> %3
+}
+
+define arm_aapcs_vfpcc <8 x i16> @test_vldrbq_z_s16(i8* %base, i16 zeroext %p) {
+; CHECK-LABEL: test_vldrbq_z_s16:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmsr p0, r1
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vldrbt.s16 q0, [r0]
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = bitcast i8* %base to <8 x i8>*
+  %1 = zext i16 %p to i32
+  %2 = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %1)
+  %3 = call <8 x i8> @llvm.masked.load.v8i8.p0v8i8(<8 x i8>* %0, i32 1, <8 x i1> %2, <8 x i8> zeroinitializer)
+  %4 = sext <8 x i8> %3 to <8 x i16>
+  ret <8 x i16> %4
+}
+
+declare <8 x i8> @llvm.masked.load.v8i8.p0v8i8(<8 x i8>*, i32 immarg, <8 x i1>, <8 x i8>)
+
+define arm_aapcs_vfpcc <4 x i32> @test_vldrbq_z_s32(i8* %base, i16 zeroext %p) {
+; CHECK-LABEL: test_vldrbq_z_s32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmsr p0, r1
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vldrbt.s32 q0, [r0]
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = bitcast i8* %base to <4 x i8>*
+  %1 = zext i16 %p to i32
+  %2 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %1)
+  %3 = call <4 x i8> @llvm.masked.load.v4i8.p0v4i8(<4 x i8>* %0, i32 1, <4 x i1> %2, <4 x i8> zeroinitializer)
+  %4 = sext <4 x i8> %3 to <4 x i32>
+  ret <4 x i32> %4
+}
+
+declare <4 x i8> @llvm.masked.load.v4i8.p0v4i8(<4 x i8>*, i32 immarg, <4 x i1>, <4 x i8>)
+
+define arm_aapcs_vfpcc <16 x i8> @test_vldrbq_z_u8(i8* %base, i16 zeroext %p) {
+; CHECK-LABEL: test_vldrbq_z_u8:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmsr p0, r1
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vldrbt.u8 q0, [r0]
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = bitcast i8* %base to <16 x i8>*
+  %1 = zext i16 %p to i32
+  %2 = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 %1)
+  %3 = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* %0, i32 1, <16 x i1> %2, <16 x i8> zeroinitializer)
+  ret <16 x i8> %3
+}
+
+define arm_aapcs_vfpcc <8 x i16> @test_vldrbq_z_u16(i8* %base, i16 zeroext %p) {
+; CHECK-LABEL: test_vldrbq_z_u16:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmsr p0, r1
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vldrbt.u16 q0, [r0]
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = bitcast i8* %base to <8 x i8>*
+  %1 = zext i16 %p to i32
+  %2 = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %1)
+  %3 = call <8 x i8> @llvm.masked.load.v8i8.p0v8i8(<8 x i8>* %0, i32 1, <8 x i1> %2, <8 x i8> zeroinitializer)
+  %4 = zext <8 x i8> %3 to <8 x i16>
+  ret <8 x i16> %4
+}
+
+define arm_aapcs_vfpcc <4 x i32> @test_vldrbq_z_u32(i8* %base, i16 zeroext %p) {
+; CHECK-LABEL: test_vldrbq_z_u32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmsr p0, r1
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vldrbt.u32 q0, [r0]
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = bitcast i8* %base to <4 x i8>*
+  %1 = zext i16 %p to i32
+  %2 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %1)
+  %3 = call <4 x i8> @llvm.masked.load.v4i8.p0v4i8(<4 x i8>* %0, i32 1, <4 x i1> %2, <4 x i8> zeroinitializer)
+  %4 = zext <4 x i8> %3 to <4 x i32>
+  ret <4 x i32> %4
+}
+
+define arm_aapcs_vfpcc <8 x half> @test_vldrhq_f16(half* %base) {
+; CHECK-LABEL: test_vldrhq_f16:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vldrh.u16 q0, [r0]
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = bitcast half* %base to <8 x half>*
+  %1 = load <8 x half>, <8 x half>* %0, align 2
+  ret <8 x half> %1
+}
+
+define arm_aapcs_vfpcc <8 x i16> @test_vldrhq_s16(i16* %base) {
+; CHECK-LABEL: test_vldrhq_s16:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vldrh.u16 q0, [r0]
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = bitcast i16* %base to <8 x i16>*
+  %1 = load <8 x i16>, <8 x i16>* %0, align 2
+  ret <8 x i16> %1
+}
+
+define arm_aapcs_vfpcc <4 x i32> @test_vldrhq_s32(i16* %base) {
+; CHECK-LABEL: test_vldrhq_s32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vldrh.s32 q0, [r0]
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = bitcast i16* %base to <4 x i16>*
+  %1 = load <4 x i16>, <4 x i16>* %0, align 2
+  %2 = sext <4 x i16> %1 to <4 x i32>
+  ret <4 x i32> %2
+}
+
+define arm_aapcs_vfpcc <8 x i16> @test_vldrhq_u16(i16* %base) {
+; CHECK-LABEL: test_vldrhq_u16:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vldrh.u16 q0, [r0]
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = bitcast i16* %base to <8 x i16>*
+  %1 = load <8 x i16>, <8 x i16>* %0, align 2
+  ret <8 x i16> %1
+}
+
+define arm_aapcs_vfpcc <4 x i32> @test_vldrhq_u32(i16* %base) {
+; CHECK-LABEL: test_vldrhq_u32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vldrh.u32 q0, [r0]
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = bitcast i16* %base to <4 x i16>*
+  %1 = load <4 x i16>, <4 x i16>* %0, align 2
+  %2 = zext <4 x i16> %1 to <4 x i32>
+  ret <4 x i32> %2
+}
+
+define arm_aapcs_vfpcc <8 x half> @test_vldrhq_z_f16(half* %base, i16 zeroext %p) {
+; CHECK-LABEL: test_vldrhq_z_f16:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmsr p0, r1
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vldrht.u16 q0, [r0]
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = bitcast half* %base to <8 x half>*
+  %1 = zext i16 %p to i32
+  %2 = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %1)
+  %3 = call <8 x half> @llvm.masked.load.v8f16.p0v8f16(<8 x half>* %0, i32 2, <8 x i1> %2, <8 x half> zeroinitializer)
+  ret <8 x half> %3
+}
+
+define arm_aapcs_vfpcc <8 x i16> @test_vldrhq_z_s16(i16* %base, i16 zeroext %p) {
+; CHECK-LABEL: test_vldrhq_z_s16:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmsr p0, r1
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vldrht.u16 q0, [r0]
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = bitcast i16* %base to <8 x i16>*
+  %1 = zext i16 %p to i32
+  %2 = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %1)
+  %3 = call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* %0, i32 2, <8 x i1> %2, <8 x i16> zeroinitializer)
+  ret <8 x i16> %3
+}
+
+define arm_aapcs_vfpcc <4 x i32> @test_vldrhq_z_s32(i16* %base, i16 zeroext %p) {
+; CHECK-LABEL: test_vldrhq_z_s32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmsr p0, r1
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vldrht.s32 q0, [r0]
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = bitcast i16* %base to <4 x i16>*
+  %1 = zext i16 %p to i32
+  %2 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %1)
+  %3 = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* %0, i32 2, <4 x i1> %2, <4 x i16> zeroinitializer)
+  %4 = sext <4 x i16> %3 to <4 x i32>
+  ret <4 x i32> %4
+}
+
+declare <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>*, i32 immarg, <4 x i1>, <4 x i16>)
+
+define arm_aapcs_vfpcc <8 x i16> @test_vldrhq_z_u16(i16* %base, i16 zeroext %p) {
+; CHECK-LABEL: test_vldrhq_z_u16:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmsr p0, r1
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vldrht.u16 q0, [r0]
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = bitcast i16* %base to <8 x i16>*
+  %1 = zext i16 %p to i32
+  %2 = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %1)
+  %3 = call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* %0, i32 2, <8 x i1> %2, <8 x i16> zeroinitializer)
+  ret <8 x i16> %3
+}
+
+define arm_aapcs_vfpcc <4 x i32> @test_vldrhq_z_u32(i16* %base, i16 zeroext %p) {
+; CHECK-LABEL: test_vldrhq_z_u32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmsr p0, r1
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vldrht.u32 q0, [r0]
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = bitcast i16* %base to <4 x i16>*
+  %1 = zext i16 %p to i32
+  %2 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %1)
+  %3 = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* %0, i32 2, <4 x i1> %2, <4 x i16> zeroinitializer)
+  %4 = zext <4 x i16> %3 to <4 x i32>
+  ret <4 x i32> %4
+}
+
+define arm_aapcs_vfpcc <4 x float> @test_vldrwq_f32(float* %base) {
+; CHECK-LABEL: test_vldrwq_f32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vldrw.u32 q0, [r0]
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = bitcast float* %base to <4 x float>*
+  %1 = load <4 x float>, <4 x float>* %0, align 4
+  ret <4 x float> %1
+}
+
+define arm_aapcs_vfpcc <4 x i32> @test_vldrwq_s32(i32* %base) {
+; CHECK-LABEL: test_vldrwq_s32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vldrw.u32 q0, [r0]
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = bitcast i32* %base to <4 x i32>*
+  %1 = load <4 x i32>, <4 x i32>* %0, align 4
+  ret <4 x i32> %1
+}
+
+define arm_aapcs_vfpcc <4 x i32> @test_vldrwq_u32(i32* %base) {
+; CHECK-LABEL: test_vldrwq_u32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vldrw.u32 q0, [r0]
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = bitcast i32* %base to <4 x i32>*
+  %1 = load <4 x i32>, <4 x i32>* %0, align 4
+  ret <4 x i32> %1
+}
+
+define arm_aapcs_vfpcc <4 x float> @test_vldrwq_z_f32(float* %base, i16 zeroext %p) {
+; CHECK-LABEL: test_vldrwq_z_f32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmsr p0, r1
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vldrwt.u32 q0, [r0]
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = bitcast float* %base to <4 x float>*
+  %1 = zext i16 %p to i32
+  %2 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %1)
+  %3 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %0, i32 4, <4 x i1> %2, <4 x float> zeroinitializer)
+  ret <4 x float> %3
+}
+
+define arm_aapcs_vfpcc <4 x i32> @test_vldrwq_z_s32(i32* %base, i16 zeroext %p) {
+; CHECK-LABEL: test_vldrwq_z_s32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmsr p0, r1
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vldrwt.u32 q0, [r0]
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = bitcast i32* %base to <4 x i32>*
+  %1 = zext i16 %p to i32
+  %2 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %1)
+  %3 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %0, i32 4, <4 x i1> %2, <4 x i32> zeroinitializer)
+  ret <4 x i32> %3
+}
+
+define arm_aapcs_vfpcc <4 x i32> @test_vldrwq_z_u32(i32* %base, i16 zeroext %p) {
+; CHECK-LABEL: test_vldrwq_z_u32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmsr p0, r1
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vldrwt.u32 q0, [r0]
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = bitcast i32* %base to <4 x i32>*
+  %1 = zext i16 %p to i32
+  %2 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %1)
+  %3 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %0, i32 4, <4 x i1> %2, <4 x i32> zeroinitializer)
+  ret <4 x i32> %3
+}
+
+define arm_aapcs_vfpcc void @test_vst1q_f16(half* %base, <8 x half> %value) {
+; CHECK-LABEL: test_vst1q_f16:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vstrh.16 q0, [r0]
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = bitcast half* %base to <8 x half>*
+  store <8 x half> %value, <8 x half>* %0, align 2
+  ret void
+}
+
+define arm_aapcs_vfpcc void @test_vst1q_f32(float* %base, <4 x float> %value) {
+; CHECK-LABEL: test_vst1q_f32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vstrw.32 q0, [r0]
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = bitcast float* %base to <4 x float>*
+  store <4 x float> %value, <4 x float>* %0, align 4
+  ret void
+}
+
+define arm_aapcs_vfpcc void @test_vst1q_s8(i8* %base, <16 x i8> %value) {
+; CHECK-LABEL: test_vst1q_s8:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vstrb.8 q0, [r0]
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = bitcast i8* %base to <16 x i8>*
+  store <16 x i8> %value, <16 x i8>* %0, align 1
+  ret void
+}
+
+define arm_aapcs_vfpcc void @test_vst1q_s16(i16* %base, <8 x i16> %value) {
+; CHECK-LABEL: test_vst1q_s16:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vstrh.16 q0, [r0]
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = bitcast i16* %base to <8 x i16>*
+  store <8 x i16> %value, <8 x i16>* %0, align 2
+  ret void
+}
+
+define arm_aapcs_vfpcc void @test_vst1q_s32(i32* %base, <4 x i32> %value) {
+; CHECK-LABEL: test_vst1q_s32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vstrw.32 q0, [r0]
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = bitcast i32* %base to <4 x i32>*
+  store <4 x i32> %value, <4 x i32>* %0, align 4
+  ret void
+}
+
+define arm_aapcs_vfpcc void @test_vst1q_u8(i8* %base, <16 x i8> %value) {
+; CHECK-LABEL: test_vst1q_u8:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vstrb.8 q0, [r0]
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = bitcast i8* %base to <16 x i8>*
+  store <16 x i8> %value, <16 x i8>* %0, align 1
+  ret void
+}
+
+define arm_aapcs_vfpcc void @test_vst1q_u16(i16* %base, <8 x i16> %value) {
+; CHECK-LABEL: test_vst1q_u16:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vstrh.16 q0, [r0]
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = bitcast i16* %base to <8 x i16>*
+  store <8 x i16> %value, <8 x i16>* %0, align 2
+  ret void
+}
+
+define arm_aapcs_vfpcc void @test_vst1q_u32(i32* %base, <4 x i32> %value) {
+; CHECK-LABEL: test_vst1q_u32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vstrw.32 q0, [r0]
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = bitcast i32* %base to <4 x i32>*
+  store <4 x i32> %value, <4 x i32>* %0, align 4
+  ret void
+}
+
+define arm_aapcs_vfpcc void @test_vst1q_p_f16(half* %base, <8 x half> %value, i16 zeroext %p) {
+; CHECK-LABEL: test_vst1q_p_f16:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmsr p0, r1
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vstrht.16 q0, [r0]
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = bitcast half* %base to <8 x half>*
+  %1 = zext i16 %p to i32
+  %2 = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %1)
+  call void @llvm.masked.store.v8f16.p0v8f16(<8 x half> %value, <8 x half>* %0, i32 2, <8 x i1> %2)
+  ret void
+}
+
+declare void @llvm.masked.store.v8f16.p0v8f16(<8 x half>, <8 x half>*, i32 immarg, <8 x i1>)
+
+define arm_aapcs_vfpcc void @test_vst1q_p_f32(float* %base, <4 x float> %value, i16 zeroext %p) {
+; CHECK-LABEL: test_vst1q_p_f32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmsr p0, r1
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vstrwt.32 q0, [r0]
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = bitcast float* %base to <4 x float>*
+  %1 = zext i16 %p to i32
+  %2 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %1)
+  call void @llvm.masked.store.v4f32.p0v4f32(<4 x float> %value, <4 x float>* %0, i32 4, <4 x i1> %2)
+  ret void
+}
+
+declare void @llvm.masked.store.v4f32.p0v4f32(<4 x float>, <4 x float>*, i32 immarg, <4 x i1>)
+
+define arm_aapcs_vfpcc void @test_vst1q_p_s8(i8* %base, <16 x i8> %value, i16 zeroext %p) {
+; CHECK-LABEL: test_vst1q_p_s8:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmsr p0, r1
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vstrbt.8 q0, [r0]
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = bitcast i8* %base to <16 x i8>*
+  %1 = zext i16 %p to i32
+  %2 = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 %1)
+  call void @llvm.masked.store.v16i8.p0v16i8(<16 x i8> %value, <16 x i8>* %0, i32 1, <16 x i1> %2)
+  ret void
+}
+
+declare void @llvm.masked.store.v16i8.p0v16i8(<16 x i8>, <16 x i8>*, i32 immarg, <16 x i1>)
+
+define arm_aapcs_vfpcc void @test_vst1q_p_s16(i16* %base, <8 x i16> %value, i16 zeroext %p) {
+; CHECK-LABEL: test_vst1q_p_s16:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmsr p0, r1
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vstrht.16 q0, [r0]
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = bitcast i16* %base to <8 x i16>*
+  %1 = zext i16 %p to i32
+  %2 = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %1)
+  call void @llvm.masked.store.v8i16.p0v8i16(<8 x i16> %value, <8 x i16>* %0, i32 2, <8 x i1> %2)
+  ret void
+}
+
+declare void @llvm.masked.store.v8i16.p0v8i16(<8 x i16>, <8 x i16>*, i32 immarg, <8 x i1>)
+
+define arm_aapcs_vfpcc void @test_vst1q_p_s32(i32* %base, <4 x i32> %value, i16 zeroext %p) {
+; CHECK-LABEL: test_vst1q_p_s32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmsr p0, r1
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vstrwt.32 q0, [r0]
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = bitcast i32* %base to <4 x i32>*
+  %1 = zext i16 %p to i32
+  %2 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %1)
+  call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %value, <4 x i32>* %0, i32 4, <4 x i1> %2)
+  ret void
+}
+
+declare void @llvm.masked.store.v4i32.p0v4i32(<4 x i32>, <4 x i32>*, i32 immarg, <4 x i1>)
+
+define arm_aapcs_vfpcc void @test_vst1q_p_u8(i8* %base, <16 x i8> %value, i16 zeroext %p) {
+; CHECK-LABEL: test_vst1q_p_u8:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmsr p0, r1
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vstrbt.8 q0, [r0]
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = bitcast i8* %base to <16 x i8>*
+  %1 = zext i16 %p to i32
+  %2 = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 %1)
+  call void @llvm.masked.store.v16i8.p0v16i8(<16 x i8> %value, <16 x i8>* %0, i32 1, <16 x i1> %2)
+  ret void
+}
+
+define arm_aapcs_vfpcc void @test_vst1q_p_u16(i16* %base, <8 x i16> %value, i16 zeroext %p) {
+; CHECK-LABEL: test_vst1q_p_u16:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmsr p0, r1
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vstrht.16 q0, [r0]
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = bitcast i16* %base to <8 x i16>*
+  %1 = zext i16 %p to i32
+  %2 = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %1)
+  call void @llvm.masked.store.v8i16.p0v8i16(<8 x i16> %value, <8 x i16>* %0, i32 2, <8 x i1> %2)
+  ret void
+}
+
+define arm_aapcs_vfpcc void @test_vst1q_p_u32(i32* %base, <4 x i32> %value, i16 zeroext %p) {
+; CHECK-LABEL: test_vst1q_p_u32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmsr p0, r1
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vstrwt.32 q0, [r0]
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = bitcast i32* %base to <4 x i32>*
+  %1 = zext i16 %p to i32
+  %2 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %1)
+  call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %value, <4 x i32>* %0, i32 4, <4 x i1> %2)
+  ret void
+}
+
+define arm_aapcs_vfpcc void @test_vstrbq_s8(i8* %base, <16 x i8> %value) {
+; CHECK-LABEL: test_vstrbq_s8:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vstrb.8 q0, [r0]
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = bitcast i8* %base to <16 x i8>*
+  store <16 x i8> %value, <16 x i8>* %0, align 1
+  ret void
+}
+
+define arm_aapcs_vfpcc void @test_vstrbq_s16(i8* %base, <8 x i16> %value) {
+; CHECK-LABEL: test_vstrbq_s16:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vstrb.16 q0, [r0]
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = trunc <8 x i16> %value to <8 x i8>
+  %1 = bitcast i8* %base to <8 x i8>*
+  store <8 x i8> %0, <8 x i8>* %1, align 1
+  ret void
+}
+
+define arm_aapcs_vfpcc void @test_vstrbq_s32(i8* %base, <4 x i32> %value) {
+; CHECK-LABEL: test_vstrbq_s32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vstrb.32 q0, [r0]
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = trunc <4 x i32> %value to <4 x i8>
+  %1 = bitcast i8* %base to <4 x i8>*
+  store <4 x i8> %0, <4 x i8>* %1, align 1
+  ret void
+}
+
+define arm_aapcs_vfpcc void @test_vstrbq_u8(i8* %base, <16 x i8> %value) {
+; CHECK-LABEL: test_vstrbq_u8:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vstrb.8 q0, [r0]
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = bitcast i8* %base to <16 x i8>*
+  store <16 x i8> %value, <16 x i8>* %0, align 1
+  ret void
+}
+
+define arm_aapcs_vfpcc void @test_vstrbq_u16(i8* %base, <8 x i16> %value) {
+; CHECK-LABEL: test_vstrbq_u16:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vstrb.16 q0, [r0]
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = trunc <8 x i16> %value to <8 x i8>
+  %1 = bitcast i8* %base to <8 x i8>*
+  store <8 x i8> %0, <8 x i8>* %1, align 1
+  ret void
+}
+
+define arm_aapcs_vfpcc void @test_vstrbq_u32(i8* %base, <4 x i32> %value) {
+; CHECK-LABEL: test_vstrbq_u32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vstrb.32 q0, [r0]
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = trunc <4 x i32> %value to <4 x i8>
+  %1 = bitcast i8* %base to <4 x i8>*
+  store <4 x i8> %0, <4 x i8>* %1, align 1
+  ret void
+}
+
+define arm_aapcs_vfpcc void @test_vstrbq_p_s8(i8* %base, <16 x i8> %value, i16 zeroext %p) {
+; CHECK-LABEL: test_vstrbq_p_s8:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmsr p0, r1
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vstrbt.8 q0, [r0]
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = bitcast i8* %base to <16 x i8>*
+  %1 = zext i16 %p to i32
+  %2 = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 %1)
+  call void @llvm.masked.store.v16i8.p0v16i8(<16 x i8> %value, <16 x i8>* %0, i32 1, <16 x i1> %2)
+  ret void
+}
+
+define arm_aapcs_vfpcc void @test_vstrbq_p_s16(i8* %base, <8 x i16> %value, i16 zeroext %p) {
+; CHECK-LABEL: test_vstrbq_p_s16:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmsr p0, r1
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vstrbt.16 q0, [r0]
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = trunc <8 x i16> %value to <8 x i8>
+  %1 = bitcast i8* %base to <8 x i8>*
+  %2 = zext i16 %p to i32
+  %3 = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %2)
+  call void @llvm.masked.store.v8i8.p0v8i8(<8 x i8> %0, <8 x i8>* %1, i32 1, <8 x i1> %3)
+  ret void
+}
+
+declare void @llvm.masked.store.v8i8.p0v8i8(<8 x i8>, <8 x i8>*, i32 immarg, <8 x i1>)
+
+define arm_aapcs_vfpcc void @test_vstrbq_p_s32(i8* %base, <4 x i32> %value, i16 zeroext %p) {
+; CHECK-LABEL: test_vstrbq_p_s32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmsr p0, r1
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vstrbt.32 q0, [r0]
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = trunc <4 x i32> %value to <4 x i8>
+  %1 = bitcast i8* %base to <4 x i8>*
+  %2 = zext i16 %p to i32
+  %3 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %2)
+  call void @llvm.masked.store.v4i8.p0v4i8(<4 x i8> %0, <4 x i8>* %1, i32 1, <4 x i1> %3)
+  ret void
+}
+
+declare void @llvm.masked.store.v4i8.p0v4i8(<4 x i8>, <4 x i8>*, i32 immarg, <4 x i1>)
+
+define arm_aapcs_vfpcc void @test_vstrbq_p_u8(i8* %base, <16 x i8> %value, i16 zeroext %p) {
+; CHECK-LABEL: test_vstrbq_p_u8:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmsr p0, r1
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vstrbt.8 q0, [r0]
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = bitcast i8* %base to <16 x i8>*
+  %1 = zext i16 %p to i32
+  %2 = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 %1)
+  call void @llvm.masked.store.v16i8.p0v16i8(<16 x i8> %value, <16 x i8>* %0, i32 1, <16 x i1> %2)
+  ret void
+}
+
+define arm_aapcs_vfpcc void @test_vstrbq_p_u16(i8* %base, <8 x i16> %value, i16 zeroext %p) {
+; CHECK-LABEL: test_vstrbq_p_u16:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmsr p0, r1
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vstrbt.16 q0, [r0]
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = trunc <8 x i16> %value to <8 x i8>
+  %1 = bitcast i8* %base to <8 x i8>*
+  %2 = zext i16 %p to i32
+  %3 = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %2)
+  call void @llvm.masked.store.v8i8.p0v8i8(<8 x i8> %0, <8 x i8>* %1, i32 1, <8 x i1> %3)
+  ret void
+}
+
+define arm_aapcs_vfpcc void @test_vstrbq_p_u32(i8* %base, <4 x i32> %value, i16 zeroext %p) {
+; CHECK-LABEL: test_vstrbq_p_u32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmsr p0, r1
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vstrbt.32 q0, [r0]
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = trunc <4 x i32> %value to <4 x i8>
+  %1 = bitcast i8* %base to <4 x i8>*
+  %2 = zext i16 %p to i32
+  %3 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %2)
+  call void @llvm.masked.store.v4i8.p0v4i8(<4 x i8> %0, <4 x i8>* %1, i32 1, <4 x i1> %3)
+  ret void
+}
+
+define arm_aapcs_vfpcc void @test_vstrhq_f16(half* %base, <8 x half> %value) {
+; CHECK-LABEL: test_vstrhq_f16:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vstrh.16 q0, [r0]
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = bitcast half* %base to <8 x half>*
+  store <8 x half> %value, <8 x half>* %0, align 2
+  ret void
+}
+
+define arm_aapcs_vfpcc void @test_vstrhq_s16(i16* %base, <8 x i16> %value) {
+; CHECK-LABEL: test_vstrhq_s16:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vstrh.16 q0, [r0]
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = bitcast i16* %base to <8 x i16>*
+  store <8 x i16> %value, <8 x i16>* %0, align 2
+  ret void
+}
+
+define arm_aapcs_vfpcc void @test_vstrhq_s32(i16* %base, <4 x i32> %value) {
+; CHECK-LABEL: test_vstrhq_s32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vstrh.32 q0, [r0]
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = trunc <4 x i32> %value to <4 x i16>
+  %1 = bitcast i16* %base to <4 x i16>*
+  store <4 x i16> %0, <4 x i16>* %1, align 2
+  ret void
+}
+
+define arm_aapcs_vfpcc void @test_vstrhq_u16(i16* %base, <8 x i16> %value) {
+; CHECK-LABEL: test_vstrhq_u16:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vstrh.16 q0, [r0]
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = bitcast i16* %base to <8 x i16>*
+  store <8 x i16> %value, <8 x i16>* %0, align 2
+  ret void
+}
+
+define arm_aapcs_vfpcc void @test_vstrhq_u32(i16* %base, <4 x i32> %value) {
+; CHECK-LABEL: test_vstrhq_u32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vstrh.32 q0, [r0]
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = trunc <4 x i32> %value to <4 x i16>
+  %1 = bitcast i16* %base to <4 x i16>*
+  store <4 x i16> %0, <4 x i16>* %1, align 2
+  ret void
+}
+
+define arm_aapcs_vfpcc void @test_vstrhq_p_f16(half* %base, <8 x half> %value, i16 zeroext %p) {
+; CHECK-LABEL: test_vstrhq_p_f16:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmsr p0, r1
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vstrht.16 q0, [r0]
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = bitcast half* %base to <8 x half>*
+  %1 = zext i16 %p to i32
+  %2 = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %1)
+  call void @llvm.masked.store.v8f16.p0v8f16(<8 x half> %value, <8 x half>* %0, i32 2, <8 x i1> %2)
+  ret void
+}
+
+define arm_aapcs_vfpcc void @test_vstrhq_p_s16(i16* %base, <8 x i16> %value, i16 zeroext %p) {
+; CHECK-LABEL: test_vstrhq_p_s16:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmsr p0, r1
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vstrht.16 q0, [r0]
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = bitcast i16* %base to <8 x i16>*
+  %1 = zext i16 %p to i32
+  %2 = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %1)
+  call void @llvm.masked.store.v8i16.p0v8i16(<8 x i16> %value, <8 x i16>* %0, i32 2, <8 x i1> %2)
+  ret void
+}
+
+define arm_aapcs_vfpcc void @test_vstrhq_p_s32(i16* %base, <4 x i32> %value, i16 zeroext %p) {
+; CHECK-LABEL: test_vstrhq_p_s32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmsr p0, r1
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vstrht.32 q0, [r0]
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = trunc <4 x i32> %value to <4 x i16>
+  %1 = bitcast i16* %base to <4 x i16>*
+  %2 = zext i16 %p to i32
+  %3 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %2)
+  call void @llvm.masked.store.v4i16.p0v4i16(<4 x i16> %0, <4 x i16>* %1, i32 2, <4 x i1> %3)
+  ret void
+}
+
+declare void @llvm.masked.store.v4i16.p0v4i16(<4 x i16>, <4 x i16>*, i32 immarg, <4 x i1>)
+
+define arm_aapcs_vfpcc void @test_vstrhq_p_u16(i16* %base, <8 x i16> %value, i16 zeroext %p) {
+; CHECK-LABEL: test_vstrhq_p_u16:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmsr p0, r1
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vstrht.16 q0, [r0]
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = bitcast i16* %base to <8 x i16>*
+  %1 = zext i16 %p to i32
+  %2 = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %1)
+  call void @llvm.masked.store.v8i16.p0v8i16(<8 x i16> %value, <8 x i16>* %0, i32 2, <8 x i1> %2)
+  ret void
+}
+
+define arm_aapcs_vfpcc void @test_vstrhq_p_u32(i16* %base, <4 x i32> %value, i16 zeroext %p) {
+; CHECK-LABEL: test_vstrhq_p_u32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmsr p0, r1
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vstrht.32 q0, [r0]
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = trunc <4 x i32> %value to <4 x i16>
+  %1 = bitcast i16* %base to <4 x i16>*
+  %2 = zext i16 %p to i32
+  %3 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %2)
+  call void @llvm.masked.store.v4i16.p0v4i16(<4 x i16> %0, <4 x i16>* %1, i32 2, <4 x i1> %3)
+  ret void
+}
+
+define arm_aapcs_vfpcc void @test_vstrwq_f32(float* %base, <4 x float> %value) {
+; CHECK-LABEL: test_vstrwq_f32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vstrw.32 q0, [r0]
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = bitcast float* %base to <4 x float>*
+  store <4 x float> %value, <4 x float>* %0, align 4
+  ret void
+}
+
+define arm_aapcs_vfpcc void @test_vstrwq_s32(i32* %base, <4 x i32> %value) {
+; CHECK-LABEL: test_vstrwq_s32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vstrw.32 q0, [r0]
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = bitcast i32* %base to <4 x i32>*
+  store <4 x i32> %value, <4 x i32>* %0, align 4
+  ret void
+}
+
+define arm_aapcs_vfpcc void @test_vstrwq_u32(i32* %base, <4 x i32> %value) {
+; CHECK-LABEL: test_vstrwq_u32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vstrw.32 q0, [r0]
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = bitcast i32* %base to <4 x i32>*
+  store <4 x i32> %value, <4 x i32>* %0, align 4
+  ret void
+}
+
+define arm_aapcs_vfpcc void @test_vstrwq_p_f32(float* %base, <4 x float> %value, i16 zeroext %p) {
+; CHECK-LABEL: test_vstrwq_p_f32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmsr p0, r1
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vstrwt.32 q0, [r0]
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = bitcast float* %base to <4 x float>*
+  %1 = zext i16 %p to i32
+  %2 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %1)
+  call void @llvm.masked.store.v4f32.p0v4f32(<4 x float> %value, <4 x float>* %0, i32 4, <4 x i1> %2)
+  ret void
+}
+
+define arm_aapcs_vfpcc void @test_vstrwq_p_s32(i32* %base, <4 x i32> %value, i16 zeroext %p) {
+; CHECK-LABEL: test_vstrwq_p_s32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmsr p0, r1
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vstrwt.32 q0, [r0]
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = bitcast i32* %base to <4 x i32>*
+  %1 = zext i16 %p to i32
+  %2 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %1)
+  call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %value, <4 x i32>* %0, i32 4, <4 x i1> %2)
+  ret void
+}
+
+define arm_aapcs_vfpcc void @test_vstrwq_p_u32(i32* %base, <4 x i32> %value, i16 zeroext %p) {
+; CHECK-LABEL: test_vstrwq_p_u32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmsr p0, r1
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vstrwt.32 q0, [r0]
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = bitcast i32* %base to <4 x i32>*
+  %1 = zext i16 %p to i32
+  %2 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %1)
+  call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %value, <4 x i32>* %0, i32 4, <4 x i1> %2)
+  ret void
+}
author	Simon Tatham <simon.tatham@arm.com>
	Mon, 11 Nov 2019 17:02:39 +0000 (17:02 +0000)
committer	Simon Tatham <simon.tatham@arm.com>
	Wed, 13 Nov 2019 12:47:00 +0000 (12:47 +0000)
clang/include/clang/Basic/arm_mve.td		patch \| blob \| history
clang/include/clang/Basic/arm_mve_defs.td		patch \| blob \| history
clang/lib/CodeGen/CGBuiltin.cpp		patch \| blob \| history
clang/test/CodeGen/arm-mve-intrinsics/load-store.c	[new file with mode: 0644]	patch \| blob
clang/utils/TableGen/MveEmitter.cpp		patch \| blob \| history
llvm/test/CodeGen/Thumb2/mve-intrinsics/load-store.ll	[new file with mode: 0644]	patch \| blob