From a0e3ceea6ce909067717bb703f4aaf84d88a3bbb Mon Sep 17 00:00:00 2001
From: Sander de Smalen <sander.desmalen@arm.com>
Date: Wed, 10 Jun 2020 09:15:13 +0100
Subject: [PATCH] [AArch64][SVE] Change pointer type of struct load/store
 intrinsics.

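Instead of taking a pointer to the whole vector, e.g.
`<vscale x 16 x i8>*`, the SVE struct load/store intrinsics
(ld2/ld3/ld4 and st2/st3/st4) now take a pointer to the element type,
e.g. `i8*`. This is more in line with the other load/store intrinsics
for SVE.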

Reviewers: fpetrogalli, c-rhodes, rengolin, efriedma

Reviewed By: efriedma

Tags: #llvm

Differential Revision: https://reviews.llvm.org/D81458
---
 llvm/include/llvm/IR/IntrinsicsAArch64.td          |   8 +-
 llvm/test/CodeGen/AArch64/sve-intrinsics-loads.ll  | 151 +++++++--------
 .../sve-intrinsics-stN-reg-imm-addr-mode.ll        | 208 ++++++++++-----------
 .../sve-intrinsics-stN-reg-reg-addr-mode.ll        | 109 +++++------
 llvm/test/CodeGen/AArch64/sve-intrinsics-stores.ll | 130 ++++++-------
 5 files changed, 282 insertions(+), 324 deletions(-)
diff --git a/llvm/include/llvm/IR/IntrinsicsAArch64.td b/llvm/include/llvm/IR/IntrinsicsAArch64.td
index d6755a2..7feded9 100644
--- a/llvm/include/llvm/IR/IntrinsicsAArch64.td
+++ b/llvm/include/llvm/IR/IntrinsicsAArch64.td
@@ -815,7 +815,7 @@ let TargetPrefix = "aarch64" in {  // All intrinsics start with "llvm.aarch64.".
                 [IntrReadMem, IntrArgMemOnly, ImmArg<ArgIndex<1>>]>;
 
   class AdvSIMD_ManyVec_PredLoad_Intrinsic
-    : Intrinsic<[llvm_anyvector_ty], [llvm_anyvector_ty, llvm_anyptr_ty],
+    : Intrinsic<[llvm_anyvector_ty], [llvm_anyvector_ty, LLVMPointerToElt<0>],
                 [IntrReadMem, IntrArgMemOnly]>;
 
   class AdvSIMD_1Vec_PredLoad_Intrinsic
@@ -834,20 +834,20 @@ let TargetPrefix = "aarch64" in {  // All intrinsics start with "llvm.aarch64.".
   class AdvSIMD_2Vec_PredStore_Intrinsic
       : Intrinsic<[],
                   [llvm_anyvector_ty, LLVMMatchType<0>,
-                   LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, LLVMPointerTo<0>],
+                   LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, LLVMPointerToElt<0>],
                   [IntrArgMemOnly, NoCapture<ArgIndex<3>>]>;
 
   class AdvSIMD_3Vec_PredStore_Intrinsic
       : Intrinsic<[],
                   [llvm_anyvector_ty, LLVMMatchType<0>, LLVMMatchType<0>,
-                   LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, LLVMPointerTo<0>],
+                   LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, LLVMPointerToElt<0>],
                   [IntrArgMemOnly, NoCapture<ArgIndex<4>>]>;
 
   class AdvSIMD_4Vec_PredStore_Intrinsic
       : Intrinsic<[],
                   [llvm_anyvector_ty, LLVMMatchType<0>, LLVMMatchType<0>,
                    LLVMMatchType<0>,
-                   LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, LLVMPointerTo<0>],
+                   LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, LLVMPointerToElt<0>],
                   [IntrArgMemOnly, NoCapture<ArgIndex<5>>]>;
 
   class AdvSIMD_SVE_Index_Intrinsic
diff --git a/llvm/test/CodeGen/AArch64/sve-intrinsics-loads.ll b/llvm/test/CodeGen/AArch64/sve-intrinsics-loads.ll
index 1244782..838e93d 100644
--- a/llvm/test/CodeGen/AArch64/sve-intrinsics-loads.ll
+++ b/llvm/test/CodeGen/AArch64/sve-intrinsics-loads.ll
@@ -256,12 +256,11 @@ define <vscale x 2 x double> @ldnt1d_f64(<vscale x 2 x i1> %pred, double* %addr)
 ; LD2B
 ;
 
-define <vscale x 32 x i8> @ld2b_i8(<vscale x 16 x i1> %pred, <vscale x 16 x i8>* %addr) {
+define <vscale x 32 x i8> @ld2b_i8(<vscale x 16 x i1> %pred, i8* %addr) {
 ; CHECK-LABEL: ld2b_i8:
 ; CHECK: ld2b { z0.b, z1.b }, p0/z, [x0]
 ; CHECK-NEXT: ret
-  %res = call <vscale x 32 x i8> @llvm.aarch64.sve.ld2.nxv32i8.nxv16i1.p0nxv16i8(<vscale x 16 x i1> %pred,
-                                                                                 <vscale x 16 x i8>* %addr)
+  %res = call <vscale x 32 x i8> @llvm.aarch64.sve.ld2.nxv32i8.nxv16i1.p0i8(<vscale x 16 x i1> %pred, i8* %addr)
   ret <vscale x 32 x i8> %res
 }
 
@@ -269,21 +268,19 @@ define <vscale x 32 x i8> @ld2b_i8(<vscale x 16 x i1> %pred, <vscale x 16 x i8>*
 ; LD2H
 ;
 
-define <vscale x 16 x i16> @ld2h_i16(<vscale x 8 x i1> %pred, <vscale x 8 x i16>* %addr) {
+define <vscale x 16 x i16> @ld2h_i16(<vscale x 8 x i1> %pred, i16* %addr) {
 ; CHECK-LABEL: ld2h_i16:
 ; CHECK: ld2h { z0.h, z1.h }, p0/z, [x0]
 ; CHECK-NEXT: ret
-  %res = call <vscale x 16 x i16> @llvm.aarch64.sve.ld2.nxv16i16.nxv8i1.p0nxv8i16(<vscale x 8 x i1> %pred,
-                                                                                  <vscale x 8 x i16>* %addr)
+  %res = call <vscale x 16 x i16> @llvm.aarch64.sve.ld2.nxv16i16.nxv8i1.p0i16(<vscale x 8 x i1> %pred, i16* %addr)
   ret <vscale x 16 x i16> %res
 }
 
-define <vscale x 16 x half> @ld2h_f16(<vscale x 8 x i1> %pred, <vscale x 8 x half>* %addr) {
+define <vscale x 16 x half> @ld2h_f16(<vscale x 8 x i1> %pred, half* %addr) {
 ; CHECK-LABEL: ld2h_f16:
 ; CHECK: ld2h { z0.h, z1.h }, p0/z, [x0]
 ; CHECK-NEXT: ret
-  %res = call <vscale x 16 x half> @llvm.aarch64.sve.ld2.nxv16f16.nxv8i1.p0nxv8f16(<vscale x 8 x i1> %pred,
-                                                                                   <vscale x 8 x half>* %addr)
+  %res = call <vscale x 16 x half> @llvm.aarch64.sve.ld2.nxv16f16.nxv8i1.p0f16(<vscale x 8 x i1> %pred, half* %addr)
   ret <vscale x 16 x half> %res
 }
 
@@ -291,21 +288,19 @@ define <vscale x 16 x half> @ld2h_f16(<vscale x 8 x i1> %pred, <vscale x 8 x hal
 ; LD2W
 ;
 
-define <vscale x 8 x i32> @ld2w_i32(<vscale x 4 x i1> %pred, <vscale x 4 x i32>* %addr) {
+define <vscale x 8 x i32> @ld2w_i32(<vscale x 4 x i1> %pred, i32* %addr) {
 ; CHECK-LABEL: ld2w_i32:
 ; CHECK: ld2w { z0.s, z1.s }, p0/z, [x0]
 ; CHECK-NEXT: ret
-  %res = call <vscale x 8 x i32> @llvm.aarch64.sve.ld2.nxv8i32.nxv4i1.p0nxv4i32(<vscale x 4 x i1> %pred,
-                                                                                <vscale x 4 x i32>* %addr)
+  %res = call <vscale x 8 x i32> @llvm.aarch64.sve.ld2.nxv8i32.nxv4i1.p0i32(<vscale x 4 x i1> %pred, i32* %addr)
   ret <vscale x 8 x i32> %res
 }
 
-define <vscale x 8 x float> @ld2w_f32(<vscale x 4 x i1> %pred, <vscale x 4 x float>* %addr) {
+define <vscale x 8 x float> @ld2w_f32(<vscale x 4 x i1> %pred, float* %addr) {
 ; CHECK-LABEL: ld2w_f32:
 ; CHECK: ld2w { z0.s, z1.s }, p0/z, [x0]
 ; CHECK-NEXT: ret
-  %res = call <vscale x 8 x float> @llvm.aarch64.sve.ld2.nxv8f32.nxv4i1.p0nxv4f32(<vscale x 4 x i1> %pred,
-                                                                                  <vscale x 4 x float>* %addr)
+  %res = call <vscale x 8 x float> @llvm.aarch64.sve.ld2.nxv8f32.nxv4i1.p0f32(<vscale x 4 x i1> %pred, float* %addr)
   ret <vscale x 8 x float> %res
 }
 
@@ -313,21 +308,19 @@ define <vscale x 8 x float> @ld2w_f32(<vscale x 4 x i1> %pred, <vscale x 4 x flo
 ; LD2D
 ;
 
-define <vscale x 4 x i64> @ld2d_i64(<vscale x 2 x i1> %pred, <vscale x 2 x i64>* %addr) {
+define <vscale x 4 x i64> @ld2d_i64(<vscale x 2 x i1> %pred, i64* %addr) {
 ; CHECK-LABEL: ld2d_i64:
 ; CHECK: ld2d { z0.d, z1.d }, p0/z, [x0]
 ; CHECK-NEXT: ret
-  %res = call <vscale x 4 x i64> @llvm.aarch64.sve.ld2.nxv4i64.nxv2i1.p0nxv2i64(<vscale x 2 x i1> %pred,
-                                                                                <vscale x 2 x i64>* %addr)
+  %res = call <vscale x 4 x i64> @llvm.aarch64.sve.ld2.nxv4i64.nxv2i1.p0i64(<vscale x 2 x i1> %pred, i64* %addr)
   ret <vscale x 4 x i64> %res
 }
 
-define <vscale x 4 x double> @ld2d_f64(<vscale x 2 x i1> %pred, <vscale x 2 x double>* %addr) {
+define <vscale x 4 x double> @ld2d_f64(<vscale x 2 x i1> %pred, double* %addr) {
 ; CHECK-LABEL: ld2d_f64:
 ; CHECK: ld2d { z0.d, z1.d }, p0/z, [x0]
 ; CHECK-NEXT: ret
-  %res = call <vscale x 4 x double> @llvm.aarch64.sve.ld2.nxv4f64.nxv2i1.p0nxv2f64(<vscale x 2 x i1> %pred,
-                                                                                   <vscale x 2 x double>* %addr)
+  %res = call <vscale x 4 x double> @llvm.aarch64.sve.ld2.nxv4f64.nxv2i1.p0f64(<vscale x 2 x i1> %pred, double* %addr)
   ret <vscale x 4 x double> %res
 }
 
@@ -335,12 +328,11 @@ define <vscale x 4 x double> @ld2d_f64(<vscale x 2 x i1> %pred, <vscale x 2 x do
 ; LD3B
 ;
 
-define <vscale x 48 x i8> @ld3b_i8(<vscale x 16 x i1> %pred, <vscale x 16 x i8>* %addr) {
+define <vscale x 48 x i8> @ld3b_i8(<vscale x 16 x i1> %pred, i8* %addr) {
 ; CHECK-LABEL: ld3b_i8:
 ; CHECK: ld3b { z0.b, z1.b, z2.b }, p0/z, [x0]
 ; CHECK-NEXT: ret
-  %res = call <vscale x 48 x i8> @llvm.aarch64.sve.ld3.nxv48i8.nxv16i1.p0nxv16i8(<vscale x 16 x i1> %pred,
-                                                                                 <vscale x 16 x i8>* %addr)
+  %res = call <vscale x 48 x i8> @llvm.aarch64.sve.ld3.nxv48i8.nxv16i1.p0i8(<vscale x 16 x i1> %pred, i8* %addr)
   ret <vscale x 48 x i8> %res
 }
 
@@ -348,21 +340,19 @@ define <vscale x 48 x i8> @ld3b_i8(<vscale x 16 x i1> %pred, <vscale x 16 x i8>*
 ; LD3H
 ;
 
-define <vscale x 24 x i16> @ld3h_i16(<vscale x 8 x i1> %pred, <vscale x 8 x i16>* %addr) {
+define <vscale x 24 x i16> @ld3h_i16(<vscale x 8 x i1> %pred, i16* %addr) {
 ; CHECK-LABEL: ld3h_i16:
 ; CHECK: ld3h { z0.h, z1.h, z2.h }, p0/z, [x0]
 ; CHECK-NEXT: ret
-  %res = call <vscale x 24 x i16> @llvm.aarch64.sve.ld3.nxv24i16.nxv8i1.p0nxv8i16(<vscale x 8 x i1> %pred,
-                                                                                  <vscale x 8 x i16>* %addr)
+  %res = call <vscale x 24 x i16> @llvm.aarch64.sve.ld3.nxv24i16.nxv8i1.p0i16(<vscale x 8 x i1> %pred, i16* %addr)
   ret <vscale x 24 x i16> %res
 }
 
-define <vscale x 24 x half> @ld3h_f16(<vscale x 8 x i1> %pred, <vscale x 8 x half>* %addr) {
+define <vscale x 24 x half> @ld3h_f16(<vscale x 8 x i1> %pred, half* %addr) {
 ; CHECK-LABEL: ld3h_f16:
 ; CHECK: ld3h { z0.h, z1.h, z2.h }, p0/z, [x0]
 ; CHECK-NEXT: ret
-  %res = call <vscale x 24 x half> @llvm.aarch64.sve.ld3.nxv24f16.nxv8i1.p0nxv8f16(<vscale x 8 x i1> %pred,
-                                                                                   <vscale x 8 x half>* %addr)
+  %res = call <vscale x 24 x half> @llvm.aarch64.sve.ld3.nxv24f16.nxv8i1.p0f16(<vscale x 8 x i1> %pred, half* %addr)
   ret <vscale x 24 x half> %res
 }
 
@@ -370,21 +360,19 @@ define <vscale x 24 x half> @ld3h_f16(<vscale x 8 x i1> %pred, <vscale x 8 x hal
 ; LD3W
 ;
 
-define <vscale x 12 x i32> @ld3w_i32(<vscale x 4 x i1> %pred, <vscale x 4 x i32>* %addr) {
+define <vscale x 12 x i32> @ld3w_i32(<vscale x 4 x i1> %pred, i32* %addr) {
 ; CHECK-LABEL: ld3w_i32:
 ; CHECK: ld3w { z0.s, z1.s, z2.s }, p0/z, [x0]
 ; CHECK-NEXT: ret
-  %res = call <vscale x 12 x i32> @llvm.aarch64.sve.ld3.nxv12i32.nxv4i1.p0nxv4i32(<vscale x 4 x i1> %pred,
-                                                                                  <vscale x 4 x i32>* %addr)
+  %res = call <vscale x 12 x i32> @llvm.aarch64.sve.ld3.nxv12i32.nxv4i1.p0i32(<vscale x 4 x i1> %pred, i32* %addr)
   ret <vscale x 12 x i32> %res
 }
 
-define <vscale x 12 x float> @ld3w_f32(<vscale x 4 x i1> %pred, <vscale x 4 x float>* %addr) {
+define <vscale x 12 x float> @ld3w_f32(<vscale x 4 x i1> %pred, float* %addr) {
 ; CHECK-LABEL: ld3w_f32:
 ; CHECK: ld3w { z0.s, z1.s, z2.s }, p0/z, [x0]
 ; CHECK-NEXT: ret
-  %res = call <vscale x 12 x float> @llvm.aarch64.sve.ld3.nxv12f32.nxv4i1.p0nxv4f32(<vscale x 4 x i1> %pred,
-                                                                                    <vscale x 4 x float>* %addr)
+  %res = call <vscale x 12 x float> @llvm.aarch64.sve.ld3.nxv12f32.nxv4i1.p0f32(<vscale x 4 x i1> %pred, float* %addr)
   ret <vscale x 12 x float> %res
 }
 
@@ -392,21 +380,19 @@ define <vscale x 12 x float> @ld3w_f32(<vscale x 4 x i1> %pred, <vscale x 4 x fl
 ; LD3D
 ;
 
-define <vscale x 6 x i64> @ld3d_i64(<vscale x 2 x i1> %pred, <vscale x 2 x i64>* %addr) {
+define <vscale x 6 x i64> @ld3d_i64(<vscale x 2 x i1> %pred, i64* %addr) {
 ; CHECK-LABEL: ld3d_i64:
 ; CHECK: ld3d { z0.d, z1.d, z2.d }, p0/z, [x0]
 ; CHECK-NEXT: ret
-  %res = call <vscale x 6 x i64> @llvm.aarch64.sve.ld3.nxv6i64.nxv2i1.p0nxv2i64(<vscale x 2 x i1> %pred,
-                                                                                <vscale x 2 x i64>* %addr)
+  %res = call <vscale x 6 x i64> @llvm.aarch64.sve.ld3.nxv6i64.nxv2i1.p0i64(<vscale x 2 x i1> %pred, i64* %addr)
   ret <vscale x 6 x i64> %res
 }
 
-define <vscale x 6 x double> @ld3d_f64(<vscale x 2 x i1> %pred, <vscale x 2 x double>* %addr) {
+define <vscale x 6 x double> @ld3d_f64(<vscale x 2 x i1> %pred, double* %addr) {
 ; CHECK-LABEL: ld3d_f64:
 ; CHECK: ld3d { z0.d, z1.d, z2.d }, p0/z, [x0]
 ; CHECK-NEXT: ret
-  %res = call <vscale x 6 x double> @llvm.aarch64.sve.ld3.nxv6f64.nxv2i1.p0nxv2f64(<vscale x 2 x i1> %pred,
-                                                                                   <vscale x 2 x double>* %addr)
+  %res = call <vscale x 6 x double> @llvm.aarch64.sve.ld3.nxv6f64.nxv2i1.p0f64(<vscale x 2 x i1> %pred, double* %addr)
   ret <vscale x 6 x double> %res
 }
 
@@ -414,12 +400,11 @@ define <vscale x 6 x double> @ld3d_f64(<vscale x 2 x i1> %pred, <vscale x 2 x do
 ; LD4B
 ;
 
-define <vscale x 64 x i8> @ld4b_i8(<vscale x 16 x i1> %pred, <vscale x 16 x i8>* %addr) {
+define <vscale x 64 x i8> @ld4b_i8(<vscale x 16 x i1> %pred, i8* %addr) {
 ; CHECK-LABEL: ld4b_i8:
 ; CHECK: ld4b { z0.b, z1.b, z2.b, z3.b }, p0/z, [x0]
 ; CHECK-NEXT: ret
-  %res = call <vscale x 64 x i8> @llvm.aarch64.sve.ld4.nxv64i8.nxv16i1.p0nxv16i8(<vscale x 16 x i1> %pred,
-                                                                                 <vscale x 16 x i8>* %addr)
+  %res = call <vscale x 64 x i8> @llvm.aarch64.sve.ld4.nxv64i8.nxv16i1.p0i8(<vscale x 16 x i1> %pred, i8* %addr)
   ret <vscale x 64 x i8> %res
 }
 
@@ -427,21 +412,19 @@ define <vscale x 64 x i8> @ld4b_i8(<vscale x 16 x i1> %pred, <vscale x 16 x i8>*
 ; LD4H
 ;
 
-define <vscale x 32 x i16> @ld4h_i16(<vscale x 8 x i1> %pred, <vscale x 8 x i16>* %addr) {
+define <vscale x 32 x i16> @ld4h_i16(<vscale x 8 x i1> %pred, i16* %addr) {
 ; CHECK-LABEL: ld4h_i16:
 ; CHECK: ld4h { z0.h, z1.h, z2.h, z3.h }, p0/z, [x0]
 ; CHECK-NEXT: ret
-  %res = call <vscale x 32 x i16> @llvm.aarch64.sve.ld4.nxv32i16.nxv8i1.p0nxv8i16(<vscale x 8 x i1> %pred,
-                                                                                  <vscale x 8 x i16>* %addr)
+  %res = call <vscale x 32 x i16> @llvm.aarch64.sve.ld4.nxv32i16.nxv8i1.p0i16(<vscale x 8 x i1> %pred, i16* %addr)
   ret <vscale x 32 x i16> %res
 }
 
-define <vscale x 32 x half> @ld4h_f16(<vscale x 8 x i1> %pred, <vscale x 8 x half>* %addr) {
+define <vscale x 32 x half> @ld4h_f16(<vscale x 8 x i1> %pred, half* %addr) {
 ; CHECK-LABEL: ld4h_f16:
 ; CHECK: ld4h { z0.h, z1.h, z2.h, z3.h }, p0/z, [x0]
 ; CHECK-NEXT: ret
-  %res = call <vscale x 32 x half> @llvm.aarch64.sve.ld4.nxv32f16.nxv8i1.p0nxv8f16(<vscale x 8 x i1> %pred,
-                                                                                   <vscale x 8 x half>* %addr)
+  %res = call <vscale x 32 x half> @llvm.aarch64.sve.ld4.nxv32f16.nxv8i1.p0f16(<vscale x 8 x i1> %pred, half* %addr)
   ret <vscale x 32 x half> %res
 }
 
@@ -449,21 +432,19 @@ define <vscale x 32 x half> @ld4h_f16(<vscale x 8 x i1> %pred, <vscale x 8 x hal
 ; LD4W
 ;
 
-define <vscale x 16 x i32> @ld4w_i32(<vscale x 4 x i1> %pred, <vscale x 4 x i32>* %addr) {
+define <vscale x 16 x i32> @ld4w_i32(<vscale x 4 x i1> %pred, i32* %addr) {
 ; CHECK-LABEL: ld4w_i32:
 ; CHECK: ld4w { z0.s, z1.s, z2.s, z3.s }, p0/z, [x0]
 ; CHECK-NEXT: ret
-  %res = call <vscale x 16 x i32> @llvm.aarch64.sve.ld4.nxv16i32.nxv4i1.p0nxv4i32(<vscale x 4 x i1> %pred,
-                                                                                  <vscale x 4 x i32>* %addr)
+  %res = call <vscale x 16 x i32> @llvm.aarch64.sve.ld4.nxv16i32.nxv4i1.p0i32(<vscale x 4 x i1> %pred, i32* %addr)
   ret <vscale x 16 x i32> %res
 }
 
-define <vscale x 16 x float> @ld4w_f32(<vscale x 4 x i1> %pred, <vscale x 4 x float>* %addr) {
+define <vscale x 16 x float> @ld4w_f32(<vscale x 4 x i1> %pred, float* %addr) {
 ; CHECK-LABEL: ld4w_f32:
 ; CHECK: ld4w { z0.s, z1.s, z2.s, z3.s }, p0/z, [x0]
 ; CHECK-NEXT: ret
-  %res = call <vscale x 16 x float> @llvm.aarch64.sve.ld4.nxv16f32.nxv4i1.p0nxv4f32(<vscale x 4 x i1> %pred,
-                                                                                    <vscale x 4 x float>* %addr)
+  %res = call <vscale x 16 x float> @llvm.aarch64.sve.ld4.nxv16f32.nxv4i1.p0f32(<vscale x 4 x i1> %pred, float* %addr)
   ret <vscale x 16 x float> %res
 }
 
@@ -471,21 +452,19 @@ define <vscale x 16 x float> @ld4w_f32(<vscale x 4 x i1> %pred, <vscale x 4 x fl
 ; LD4D
 ;
 
-define <vscale x 8 x i64> @ld4d_i64(<vscale x 2 x i1> %pred, <vscale x 2 x i64>* %addr) {
+define <vscale x 8 x i64> @ld4d_i64(<vscale x 2 x i1> %pred, i64* %addr) {
 ; CHECK-LABEL: ld4d_i64:
 ; CHECK: ld4d { z0.d, z1.d, z2.d, z3.d }, p0/z, [x0]
 ; CHECK-NEXT: ret
-  %res = call <vscale x 8 x i64> @llvm.aarch64.sve.ld4.nxv8i64.nxv2i1.p0nxv2i64(<vscale x 2 x i1> %pred,
-                                                                                <vscale x 2 x i64>* %addr)
+  %res = call <vscale x 8 x i64> @llvm.aarch64.sve.ld4.nxv8i64.nxv2i1.p0i64(<vscale x 2 x i1> %pred, i64* %addr)
   ret <vscale x 8 x i64> %res
 }
 
-define <vscale x 8 x double> @ld4d_f64(<vscale x 2 x i1> %pred, <vscale x 2 x double>* %addr) {
+define <vscale x 8 x double> @ld4d_f64(<vscale x 2 x i1> %pred, double* %addr) {
 ; CHECK-LABEL: ld4d_f64:
 ; CHECK: ld4d { z0.d, z1.d, z2.d, z3.d }, p0/z, [x0]
 ; CHECK-NEXT: ret
-  %res = call <vscale x 8 x double> @llvm.aarch64.sve.ld4.nxv8f64.nxv2i1.p0nxv2f64(<vscale x 2 x i1> %pred,
-                                                                                   <vscale x 2 x double>* %addr)
+  %res = call <vscale x 8 x double> @llvm.aarch64.sve.ld4.nxv8f64.nxv2i1.p0f64(<vscale x 2 x i1> %pred, double* %addr)
   ret <vscale x 8 x double> %res
 }
 
@@ -506,26 +485,26 @@ declare <vscale x 8 x half> @llvm.aarch64.sve.ldnt1.nxv8f16(<vscale x 8 x i1>, h
 declare <vscale x 4 x float> @llvm.aarch64.sve.ldnt1.nxv4f32(<vscale x 4 x i1>, float*)
 declare <vscale x 2 x double> @llvm.aarch64.sve.ldnt1.nxv2f64(<vscale x 2 x i1>, double*)
 
-declare <vscale x 32 x i8> @llvm.aarch64.sve.ld2.nxv32i8.nxv16i1.p0nxv16i8(<vscale x 16 x i1>, <vscale x 16 x i8>*)
-declare <vscale x 16 x i16> @llvm.aarch64.sve.ld2.nxv16i16.nxv8i1.p0nxv8i16(<vscale x 8 x i1>, <vscale x 8 x i16>*)
-declare <vscale x 8 x i32> @llvm.aarch64.sve.ld2.nxv8i32.nxv4i1.p0nxv4i32(<vscale x 4 x i1>, <vscale x 4 x i32>*)
-declare <vscale x 4 x i64> @llvm.aarch64.sve.ld2.nxv4i64.nxv2i1.p0nxv2i64(<vscale x 2 x i1>, <vscale x 2 x i64>*)
-declare <vscale x 16 x half> @llvm.aarch64.sve.ld2.nxv16f16.nxv8i1.p0nxv8f16(<vscale x 8 x i1>, <vscale x 8 x half>*)
-declare <vscale x 8 x float> @llvm.aarch64.sve.ld2.nxv8f32.nxv4i1.p0nxv4f32(<vscale x 4 x i1>, <vscale x 4 x float>*)
-declare <vscale x 4 x double> @llvm.aarch64.sve.ld2.nxv4f64.nxv2i1.p0nxv2f64(<vscale x 2 x i1>, <vscale x 2 x double>*)
-
-declare <vscale x 48 x i8> @llvm.aarch64.sve.ld3.nxv48i8.nxv16i1.p0nxv16i8(<vscale x 16 x i1>, <vscale x 16 x i8>*)
-declare <vscale x 24 x i16> @llvm.aarch64.sve.ld3.nxv24i16.nxv8i1.p0nxv8i16(<vscale x 8 x i1>, <vscale x 8 x i16>*)
-declare <vscale x 12 x i32> @llvm.aarch64.sve.ld3.nxv12i32.nxv4i1.p0nxv4i32(<vscale x 4 x i1>, <vscale x 4 x i32>*)
-declare <vscale x 6 x i64> @llvm.aarch64.sve.ld3.nxv6i64.nxv2i1.p0nxv2i64(<vscale x 2 x i1>, <vscale x 2 x i64>*)
-declare <vscale x 24 x half> @llvm.aarch64.sve.ld3.nxv24f16.nxv8i1.p0nxv8f16(<vscale x 8 x i1>, <vscale x 8 x half>*)
-declare <vscale x 12 x float> @llvm.aarch64.sve.ld3.nxv12f32.nxv4i1.p0nxv4f32(<vscale x 4 x i1>, <vscale x 4 x float>*)
-declare <vscale x 6 x double> @llvm.aarch64.sve.ld3.nxv6f64.nxv2i1.p0nxv2f64(<vscale x 2 x i1>, <vscale x 2 x double>*)
-
-declare <vscale x 64 x i8> @llvm.aarch64.sve.ld4.nxv64i8.nxv16i1.p0nxv16i8(<vscale x 16 x i1>, <vscale x 16 x i8>*)
-declare <vscale x 32 x i16> @llvm.aarch64.sve.ld4.nxv32i16.nxv8i1.p0nxv8i16(<vscale x 8 x i1>, <vscale x 8 x i16>*)
-declare <vscale x 16 x i32> @llvm.aarch64.sve.ld4.nxv16i32.nxv4i1.p0nxv4i32(<vscale x 4 x i1>, <vscale x 4 x i32>*)
-declare <vscale x 8 x i64> @llvm.aarch64.sve.ld4.nxv8i64.nxv2i1.p0nxv2i64(<vscale x 2 x i1>, <vscale x 2 x i64>*)
-declare <vscale x 32 x half> @llvm.aarch64.sve.ld4.nxv32f16.nxv8i1.p0nxv8f16(<vscale x 8 x i1>, <vscale x 8 x half>*)
-declare <vscale x 16 x float> @llvm.aarch64.sve.ld4.nxv16f32.nxv4i1.p0nxv4f32(<vscale x 4 x i1>, <vscale x 4 x float>*)
-declare <vscale x 8 x double> @llvm.aarch64.sve.ld4.nxv8f64.nxv2i1.p0nxv2f64(<vscale x 2 x i1>, <vscale x 2 x double>*)
+declare <vscale x 32 x i8> @llvm.aarch64.sve.ld2.nxv32i8.nxv16i1.p0i8(<vscale x 16 x i1>, i8*)
+declare <vscale x 16 x i16> @llvm.aarch64.sve.ld2.nxv16i16.nxv8i1.p0i16(<vscale x 8 x i1>, i16*)
+declare <vscale x 8 x i32> @llvm.aarch64.sve.ld2.nxv8i32.nxv4i1.p0i32(<vscale x 4 x i1>, i32*)
+declare <vscale x 4 x i64> @llvm.aarch64.sve.ld2.nxv4i64.nxv2i1.p0i64(<vscale x 2 x i1>, i64*)
+declare <vscale x 16 x half> @llvm.aarch64.sve.ld2.nxv16f16.nxv8i1.p0f16(<vscale x 8 x i1>, half*)
+declare <vscale x 8 x float> @llvm.aarch64.sve.ld2.nxv8f32.nxv4i1.p0f32(<vscale x 4 x i1>, float*)
+declare <vscale x 4 x double> @llvm.aarch64.sve.ld2.nxv4f64.nxv2i1.p0f64(<vscale x 2 x i1>, double*)
+
+declare <vscale x 48 x i8> @llvm.aarch64.sve.ld3.nxv48i8.nxv16i1.p0i8(<vscale x 16 x i1>, i8*)
+declare <vscale x 24 x i16> @llvm.aarch64.sve.ld3.nxv24i16.nxv8i1.p0i16(<vscale x 8 x i1>, i16*)
+declare <vscale x 12 x i32> @llvm.aarch64.sve.ld3.nxv12i32.nxv4i1.p0i32(<vscale x 4 x i1>, i32*)
+declare <vscale x 6 x i64> @llvm.aarch64.sve.ld3.nxv6i64.nxv2i1.p0i64(<vscale x 2 x i1>, i64*)
+declare <vscale x 24 x half> @llvm.aarch64.sve.ld3.nxv24f16.nxv8i1.p0f16(<vscale x 8 x i1>, half*)
+declare <vscale x 12 x float> @llvm.aarch64.sve.ld3.nxv12f32.nxv4i1.p0f32(<vscale x 4 x i1>, float*)
+declare <vscale x 6 x double> @llvm.aarch64.sve.ld3.nxv6f64.nxv2i1.p0f64(<vscale x 2 x i1>, double*)
+
+declare <vscale x 64 x i8> @llvm.aarch64.sve.ld4.nxv64i8.nxv16i1.p0i8(<vscale x 16 x i1>, i8*)
+declare <vscale x 32 x i16> @llvm.aarch64.sve.ld4.nxv32i16.nxv8i1.p0i16(<vscale x 8 x i1>, i16*)
+declare <vscale x 16 x i32> @llvm.aarch64.sve.ld4.nxv16i32.nxv4i1.p0i32(<vscale x 4 x i1>, i32*)
+declare <vscale x 8 x i64> @llvm.aarch64.sve.ld4.nxv8i64.nxv2i1.p0i64(<vscale x 2 x i1>, i64*)
+declare <vscale x 32 x half> @llvm.aarch64.sve.ld4.nxv32f16.nxv8i1.p0f16(<vscale x 8 x i1>, half*)
+declare <vscale x 16 x float> @llvm.aarch64.sve.ld4.nxv16f32.nxv4i1.p0f32(<vscale x 4 x i1>, float*)
+declare <vscale x 8 x double> @llvm.aarch64.sve.ld4.nxv8f64.nxv2i1.p0f64(<vscale x 2 x i1>, double*)
diff --git a/llvm/test/CodeGen/AArch64/sve-intrinsics-stN-reg-imm-addr-mode.ll b/llvm/test/CodeGen/AArch64/sve-intrinsics-stN-reg-imm-addr-mode.ll
index 8ef27dc..a5e278c 100644
--- a/llvm/test/CodeGen/AArch64/sve-intrinsics-stN-reg-imm-addr-mode.ll
+++ b/llvm/test/CodeGen/AArch64/sve-intrinsics-stN-reg-imm-addr-mode.ll
@@ -14,11 +14,11 @@ define void @st2b_i8_valid_imm(<vscale x 16 x i8> %v0, <vscale x 16 x i8> %v1, <
 ; CHECK-LABEL: st2b_i8_valid_imm:
 ; CHECK: st2b { z0.b, z1.b }, p0, [x0, #2, mul vl]
 ; CHECK-NEXT: ret
-  %base = getelementptr <vscale x 16 x i8>, <vscale x 16 x i8>* %addr, i64 2
+  %base = getelementptr <vscale x 16 x i8>, <vscale x 16 x i8>* %addr, i64 2, i64 0
   call void @llvm.aarch64.sve.st2.nxv16i8(<vscale x 16 x i8> %v0,
                                           <vscale x 16 x i8> %v1,
                                           <vscale x 16 x i1> %pred,
-                                          <vscale x 16 x i8>* %base)
+                                          i8* %base)
   ret void
 }
 
@@ -27,11 +27,11 @@ define void @st2b_i8_invalid_imm_not_multiple_of_2(<vscale x 16 x i8> %v0, <vsca
 ; CHECK: rdvl x[[N:[0-9]+]], #3
 ; CHECK-NEXT: st2b { z0.b, z1.b }, p0, [x0, x[[N]]]
 ; CHECK-NEXT: ret
-  %base = getelementptr <vscale x 16 x i8>, <vscale x 16 x i8>* %addr, i64 3
+  %base = getelementptr <vscale x 16 x i8>, <vscale x 16 x i8>* %addr, i64 3, i64 0
   call void @llvm.aarch64.sve.st2.nxv16i8(<vscale x 16 x i8> %v0,
                                           <vscale x 16 x i8> %v1,
                                           <vscale x 16 x i1> %pred,
-                                          <vscale x 16 x i8>* %base)
+                                          i8* %base)
   ret void
 }
 
@@ -40,11 +40,11 @@ define void @st2b_i8_invalid_imm_out_of_lower_bound(<vscale x 16 x i8> %v0, <vsc
 ; CHECK: rdvl x[[N:[0-9]+]], #-18
 ; CHECK-NEXT: st2b { z0.b, z1.b }, p0, [x0, x[[N]]]
 ; CHECK-NEXT: ret
-  %base = getelementptr <vscale x 16 x i8>, <vscale x 16 x i8>* %addr, i64 -18
+  %base = getelementptr <vscale x 16 x i8>, <vscale x 16 x i8>* %addr, i64 -18, i64 0
   call void @llvm.aarch64.sve.st2.nxv16i8(<vscale x 16 x i8> %v0,
                                           <vscale x 16 x i8> %v1,
                                           <vscale x 16 x i1> %pred,
-                                          <vscale x 16 x i8>* %base)
+                                          i8* %base)
   ret void
 }
 
@@ -53,11 +53,11 @@ define void @st2b_i8_invalid_imm_out_of_upper_bound(<vscale x 16 x i8> %v0, <vsc
 ; CHECK: rdvl x[[N:[0-9]+]], #16
 ; CHECK-NEXT: st2b { z0.b, z1.b }, p0, [x0, x[[N]]]
 ; CHECK-NEXT: ret
-  %base = getelementptr <vscale x 16 x i8>, <vscale x 16 x i8>* %addr, i64 16
+  %base = getelementptr <vscale x 16 x i8>, <vscale x 16 x i8>* %addr, i64 16, i64 0
   call void @llvm.aarch64.sve.st2.nxv16i8(<vscale x 16 x i8> %v0,
                                           <vscale x 16 x i8> %v1,
                                           <vscale x 16 x i1> %pred,
-                                          <vscale x 16 x i8>* %base)
+                                          i8* %base)
   ret void
 }
 
@@ -65,11 +65,11 @@ define void @st2b_i8_valid_imm_lower_bound(<vscale x 16 x i8> %v0, <vscale x 16
 ; CHECK-LABEL: st2b_i8_valid_imm_lower_bound:
 ; CHECK: st2b { z0.b, z1.b }, p0, [x0, #-16, mul vl]
 ; CHECK-NEXT: ret
-  %base = getelementptr <vscale x 16 x i8>, <vscale x 16 x i8>* %addr, i64 -16
+  %base = getelementptr <vscale x 16 x i8>, <vscale x 16 x i8>* %addr, i64 -16, i64 0
   call void @llvm.aarch64.sve.st2.nxv16i8(<vscale x 16 x i8> %v0,
                                           <vscale x 16 x i8> %v1,
                                           <vscale x 16 x i1> %pred,
-                                          <vscale x 16 x i8>* %base)
+                                          i8* %base)
   ret void
 }
 
@@ -77,11 +77,11 @@ define void @st2b_i8_valid_imm_upper_bound(<vscale x 16 x i8> %v0, <vscale x 16
 ; CHECK-LABEL: st2b_i8_valid_imm_upper_bound:
 ; CHECK: st2b { z0.b, z1.b }, p0, [x0, #14, mul vl]
 ; CHECK-NEXT: ret
-  %base = getelementptr <vscale x 16 x i8>, <vscale x 16 x i8>* %addr, i64 14
+  %base = getelementptr <vscale x 16 x i8>, <vscale x 16 x i8>* %addr, i64 14, i64 0
   call void @llvm.aarch64.sve.st2.nxv16i8(<vscale x 16 x i8> %v0,
                                           <vscale x 16 x i8> %v1,
                                           <vscale x 16 x i1> %pred,
-                                          <vscale x 16 x i8>* %base)
+                                          i8* %base)
   ret void
 }
 
@@ -93,11 +93,11 @@ define void @st2h_i16(<vscale x 8 x i16> %v0, <vscale x 8 x i16> %v1, <vscale x
 ; CHECK-LABEL: st2h_i16:
 ; CHECK: st2h { z0.h, z1.h }, p0, [x0, #2, mul vl]
 ; CHECK-NEXT: ret
-  %base = getelementptr <vscale x 8 x i16>, <vscale x 8 x i16>* %addr, i64 2
+  %base = getelementptr <vscale x 8 x i16>, <vscale x 8 x i16>* %addr, i64 2, i64 0
   call void @llvm.aarch64.sve.st2.nxv8i16(<vscale x 8 x i16> %v0,
                                           <vscale x 8 x i16> %v1,
                                           <vscale x 8 x i1> %pred,
-                                          <vscale x 8 x i16>* %base)
+                                          i16* %base)
   ret void
 }
 
@@ -105,11 +105,11 @@ define void @st2h_f16(<vscale x 8 x half> %v0, <vscale x 8 x half> %v1, <vscale
 ; CHECK-LABEL: st2h_f16:
 ; CHECK: st2h { z0.h, z1.h }, p0, [x0, #2, mul vl]
 ; CHECK-NEXT: ret
-  %base = getelementptr <vscale x 8 x half>, <vscale x 8 x half>* %addr, i64 2
+  %base = getelementptr <vscale x 8 x half>, <vscale x 8 x half>* %addr, i64 2, i64 0
   call void @llvm.aarch64.sve.st2.nxv8f16(<vscale x 8 x half> %v0,
                                           <vscale x 8 x half> %v1,
                                           <vscale x 8 x i1> %pred,
-                                          <vscale x 8 x half>* %base)
+                                          half* %base)
   ret void
 }
 
@@ -121,11 +121,11 @@ define void @st2w_i32(<vscale x 4 x i32> %v0, <vscale x 4 x i32> %v1, <vscale x
 ; CHECK-LABEL: st2w_i32:
 ; CHECK: st2w { z0.s, z1.s }, p0, [x0, #4, mul vl]
 ; CHECK-NEXT: ret
-  %base = getelementptr <vscale x 4 x i32>, <vscale x 4 x i32>* %addr, i64 4
+  %base = getelementptr <vscale x 4 x i32>, <vscale x 4 x i32>* %addr, i64 4, i64 0
   call void @llvm.aarch64.sve.st2.nxv4i32(<vscale x 4 x i32> %v0,
                                           <vscale x 4 x i32> %v1,
                                           <vscale x 4 x i1> %pred,
-                                          <vscale x 4 x i32>* %base)
+                                          i32* %base)
   ret void
 }
 
@@ -133,11 +133,11 @@ define void @st2w_f32(<vscale x 4 x float> %v0, <vscale x 4 x float> %v1, <vscal
 ; CHECK-LABEL: st2w_f32:
 ; CHECK: st2w { z0.s, z1.s }, p0, [x0, #6, mul vl]
 ; CHECK-NEXT: ret
-  %base = getelementptr <vscale x 4 x float>, <vscale x 4 x float>* %addr, i64 6
+  %base = getelementptr <vscale x 4 x float>, <vscale x 4 x float>* %addr, i64 6, i64 0
   call void @llvm.aarch64.sve.st2.nxv4f32(<vscale x 4 x float> %v0,
                                           <vscale x 4 x float> %v1,
                                           <vscale x 4 x i1> %pred,
-                                          <vscale x 4 x float>* %base)
+                                          float* %base)
   ret void
 }
 
@@ -149,11 +149,11 @@ define void @st2d_i64(<vscale x 2 x i64> %v0, <vscale x 2 x i64> %v1, <vscale x
 ; CHECK-LABEL: st2d_i64:
 ; CHECK: st2d { z0.d, z1.d }, p0, [x0, #8, mul vl]
 ; CHECK-NEXT: ret
-  %base = getelementptr <vscale x 2 x i64>, <vscale x 2 x i64>* %addr, i64 8
+  %base = getelementptr <vscale x 2 x i64>, <vscale x 2 x i64>* %addr, i64 8, i64 0
   call void @llvm.aarch64.sve.st2.nxv2i64(<vscale x 2 x i64> %v0,
                                           <vscale x 2 x i64> %v1,
                                           <vscale x 2 x i1> %pred,
-                                          <vscale x 2 x i64>* %base)
+                                          i64* %base)
   ret void
 }
 
@@ -161,11 +161,11 @@ define void @st2d_f64(<vscale x 2 x double> %v0, <vscale x 2 x double> %v1, <vsc
 ; CHECK-LABEL: st2d_f64:
 ; CHECK: st2d { z0.d, z1.d }, p0, [x0, #10, mul vl]
 ; CHECK-NEXT: ret
-  %base = getelementptr <vscale x 2 x double>, <vscale x 2 x double>* %addr, i64 10
+  %base = getelementptr <vscale x 2 x double>, <vscale x 2 x double>* %addr, i64 10, i64 0
   call void @llvm.aarch64.sve.st2.nxv2f64(<vscale x 2 x double> %v0,
                                           <vscale x 2 x double> %v1,
                                           <vscale x 2 x i1> %pred,
-                                          <vscale x 2 x double>* %base)
+                                          double* %base)
   ret void
 }
 
@@ -177,12 +177,12 @@ define void @st3b_i8_valid_imm(<vscale x 16 x i8> %v0, <vscale x 16 x i8> %v1, <
 ; CHECK-LABEL: st3b_i8_valid_imm:
 ; CHECK: st3b { z0.b, z1.b, z2.b }, p0, [x0, #3, mul vl]
 ; CHECK-NEXT: ret
-  %base = getelementptr <vscale x 16 x i8>, <vscale x 16 x i8>* %addr, i64 3
+  %base = getelementptr <vscale x 16 x i8>, <vscale x 16 x i8>* %addr, i64 3, i64 0
   call void @llvm.aarch64.sve.st3.nxv16i8(<vscale x 16 x i8> %v0,
                                           <vscale x 16 x i8> %v1,
                                           <vscale x 16 x i8> %v2,
                                           <vscale x 16 x i1> %pred,
-                                          <vscale x 16 x i8>* %base)
+                                          i8* %base)
   ret void
 }
 
@@ -191,12 +191,12 @@ define void @st3b_i8_invalid_imm_not_multiple_of_3_01(<vscale x 16 x i8> %v0, <v
 ; CHECK: rdvl x[[N:[0-9]+]], #4
 ; CHECK-NEXT: st3b { z0.b, z1.b, z2.b }, p0, [x0, x[[N]]]
 ; CHECK-NEXT: ret
-  %base = getelementptr <vscale x 16 x i8>, <vscale x 16 x i8>* %addr, i64 4
+  %base = getelementptr <vscale x 16 x i8>, <vscale x 16 x i8>* %addr, i64 4, i64 0
   call void @llvm.aarch64.sve.st3.nxv16i8(<vscale x 16 x i8> %v0,
                                           <vscale x 16 x i8> %v1,
                                           <vscale x 16 x i8> %v2,
                                           <vscale x 16 x i1> %pred,
-                                          <vscale x 16 x i8>* %base)
+                                          i8* %base)
   ret void
 }
 
@@ -205,12 +205,12 @@ define void @st3b_i8_invalid_imm_not_multiple_of_3_02(<vscale x 16 x i8> %v0, <v
 ; CHECK: rdvl x[[N:[0-9]+]], #5
 ; CHECK-NEXT: st3b { z0.b, z1.b, z2.b }, p0, [x0, x[[N]]]
 ; CHECK-NEXT: ret
-  %base = getelementptr <vscale x 16 x i8>, <vscale x 16 x i8>* %addr, i64 5
+  %base = getelementptr <vscale x 16 x i8>, <vscale x 16 x i8>* %addr, i64 5, i64 0
   call void @llvm.aarch64.sve.st3.nxv16i8(<vscale x 16 x i8> %v0,
                                           <vscale x 16 x i8> %v1,
                                           <vscale x 16 x i8> %v2,
                                           <vscale x 16 x i1> %pred,
-                                          <vscale x 16 x i8>* %base)
+                                          i8* %base)
   ret void
 }
 
@@ -219,12 +219,12 @@ define void @st3b_i8_invalid_imm_out_of_lower_bound(<vscale x 16 x i8> %v0, <vsc
 ; CHECK: rdvl x[[N:[0-9]+]], #-27
 ; CHECK-NEXT: st3b { z0.b, z1.b, z2.b }, p0, [x0, x[[N]]]
 ; CHECK-NEXT: ret
-  %base = getelementptr <vscale x 16 x i8>, <vscale x 16 x i8>* %addr, i64 -27
+  %base = getelementptr <vscale x 16 x i8>, <vscale x 16 x i8>* %addr, i64 -27, i64 0
   call void @llvm.aarch64.sve.st3.nxv16i8(<vscale x 16 x i8> %v0,
                                           <vscale x 16 x i8> %v1,
                                           <vscale x 16 x i8> %v2,
                                           <vscale x 16 x i1> %pred,
-                                          <vscale x 16 x i8>* %base)
+                                          i8* %base)
   ret void
 }
 
@@ -233,12 +233,12 @@ define void @st3b_i8_invalid_imm_out_of_upper_bound(<vscale x 16 x i8> %v0, <vsc
 ; CHECK: rdvl x[[N:[0-9]+]], #24
 ; CHECK-NEXT: st3b { z0.b, z1.b, z2.b }, p0, [x0, x[[N]]]
 ; CHECK-NEXT: ret
-  %base = getelementptr <vscale x 16 x i8>, <vscale x 16 x i8>* %addr, i64 24
+  %base = getelementptr <vscale x 16 x i8>, <vscale x 16 x i8>* %addr, i64 24, i64 0
   call void @llvm.aarch64.sve.st3.nxv16i8(<vscale x 16 x i8> %v0,
                                           <vscale x 16 x i8> %v1,
                                           <vscale x 16 x i8> %v2,
                                           <vscale x 16 x i1> %pred,
-                                          <vscale x 16 x i8>* %base)
+                                          i8* %base)
   ret void
 }
 
@@ -246,12 +246,12 @@ define void @st3b_i8_valid_imm_lower_bound(<vscale x 16 x i8> %v0, <vscale x 16
 ; CHECK-LABEL: st3b_i8_valid_imm_lower_bound:
 ; CHECK: st3b { z0.b, z1.b, z2.b }, p0, [x0, #-24, mul vl]
 ; CHECK-NEXT: ret
-  %base = getelementptr <vscale x 16 x i8>, <vscale x 16 x i8>* %addr, i64 -24
+  %base = getelementptr <vscale x 16 x i8>, <vscale x 16 x i8>* %addr, i64 -24, i64 0
   call void @llvm.aarch64.sve.st3.nxv16i8(<vscale x 16 x i8> %v0,
                                           <vscale x 16 x i8> %v1,
                                           <vscale x 16 x i8> %v2,
                                           <vscale x 16 x i1> %pred,
-                                          <vscale x 16 x i8>* %base)
+                                          i8* %base)
   ret void
 }
 
@@ -259,12 +259,12 @@ define void @st3b_i8_valid_imm_upper_bound(<vscale x 16 x i8> %v0, <vscale x 16
 ; CHECK-LABEL: st3b_i8_valid_imm_upper_bound:
 ; CHECK: st3b { z0.b, z1.b, z2.b }, p0, [x0, #21, mul vl]
 ; CHECK-NEXT: ret
-  %base = getelementptr <vscale x 16 x i8>, <vscale x 16 x i8>* %addr, i64 21
+  %base = getelementptr <vscale x 16 x i8>, <vscale x 16 x i8>* %addr, i64 21, i64 0
   call void @llvm.aarch64.sve.st3.nxv16i8(<vscale x 16 x i8> %v0,
                                           <vscale x 16 x i8> %v1,
                                           <vscale x 16 x i8> %v2,
                                           <vscale x 16 x i1> %pred,
-                                          <vscale x 16 x i8>* %base)
+                                          i8* %base)
   ret void
 }
 
@@ -276,12 +276,12 @@ define void @st3h_i16(<vscale x 8 x i16> %v0, <vscale x 8 x i16> %v1, <vscale x
 ; CHECK-LABEL: st3h_i16:
 ; CHECK: st3h { z0.h, z1.h, z2.h }, p0, [x0, #6, mul vl]
 ; CHECK-NEXT: ret
-  %base = getelementptr <vscale x 8 x i16>, <vscale x 8 x i16>* %addr, i64 6
+  %base = getelementptr <vscale x 8 x i16>, <vscale x 8 x i16>* %addr, i64 6, i64 0
   call void @llvm.aarch64.sve.st3.nxv8i16(<vscale x 8 x i16> %v0,
                                           <vscale x 8 x i16> %v1,
                                           <vscale x 8 x i16> %v2,
                                           <vscale x 8 x i1> %pred,
-                                          <vscale x 8 x i16>* %base)
+                                          i16* %base)
   ret void
 }
 
@@ -289,12 +289,12 @@ define void @st3h_f16(<vscale x 8 x half> %v0, <vscale x 8 x half> %v1, <vscale
 ; CHECK-LABEL: st3h_f16:
 ; CHECK: st3h { z0.h, z1.h, z2.h }, p0, [x0, #9, mul vl]
 ; CHECK-NEXT: ret
-  %base = getelementptr <vscale x 8 x half>, <vscale x 8 x half>* %addr, i64 9
+  %base = getelementptr <vscale x 8 x half>, <vscale x 8 x half>* %addr, i64 9, i64 0
   call void @llvm.aarch64.sve.st3.nxv8f16(<vscale x 8 x half> %v0,
                                           <vscale x 8 x half> %v1,
                                           <vscale x 8 x half> %v2,
                                           <vscale x 8 x i1> %pred,
-                                          <vscale x 8 x half>* %base)
+                                          half* %base)
   ret void
 }
 
@@ -306,12 +306,12 @@ define void @st3w_i32(<vscale x 4 x i32> %v0, <vscale x 4 x i32> %v1, <vscale x
 ; CHECK-LABEL: st3w_i32:
 ; CHECK: st3w { z0.s, z1.s, z2.s }, p0, [x0, #12, mul vl]
 ; CHECK-NEXT: ret
-  %base = getelementptr <vscale x 4 x i32>, <vscale x 4 x i32>* %addr, i64 12
+  %base = getelementptr <vscale x 4 x i32>, <vscale x 4 x i32>* %addr, i64 12, i64 0
   call void @llvm.aarch64.sve.st3.nxv4i32(<vscale x 4 x i32> %v0,
                                           <vscale x 4 x i32> %v1,
                                           <vscale x 4 x i32> %v2,
                                           <vscale x 4 x i1> %pred,
-                                          <vscale x 4 x i32>* %base)
+                                          i32* %base)
   ret void
 }
 
@@ -319,12 +319,12 @@ define void @st3w_f32(<vscale x 4 x float> %v0, <vscale x 4 x float> %v1, <vscal
 ; CHECK-LABEL: st3w_f32:
 ; CHECK: st3w { z0.s, z1.s, z2.s }, p0, [x0, #15, mul vl]
 ; CHECK-NEXT: ret
-  %base = getelementptr <vscale x 4 x float>, <vscale x 4 x float>* %addr, i64 15
+  %base = getelementptr <vscale x 4 x float>, <vscale x 4 x float>* %addr, i64 15, i64 0
   call void @llvm.aarch64.sve.st3.nxv4f32(<vscale x 4 x float> %v0,
                                           <vscale x 4 x float> %v1,
                                           <vscale x 4 x float> %v2,
                                           <vscale x 4 x i1> %pred,
-                                          <vscale x 4 x float>* %base)
+                                          float* %base)
   ret void
 }
 
@@ -336,12 +336,12 @@ define void @st3d_i64(<vscale x 2 x i64> %v0, <vscale x 2 x i64> %v1, <vscale x
 ; CHECK-LABEL: st3d_i64:
 ; CHECK: st3d { z0.d, z1.d, z2.d }, p0, [x0, #18, mul vl]
 ; CHECK-NEXT: ret
-  %base = getelementptr <vscale x 2 x i64>, <vscale x 2 x i64>* %addr, i64 18
+  %base = getelementptr <vscale x 2 x i64>, <vscale x 2 x i64>* %addr, i64 18, i64 0
   call void @llvm.aarch64.sve.st3.nxv2i64(<vscale x 2 x i64> %v0,
                                           <vscale x 2 x i64> %v1,
                                           <vscale x 2 x i64> %v2,
                                           <vscale x 2 x i1> %pred,
-                                          <vscale x 2 x i64>* %base)
+                                          i64* %base)
   ret void
 }
 
@@ -349,12 +349,12 @@ define void @st3d_f64(<vscale x 2 x double> %v0, <vscale x 2 x double> %v1, <vsc
 ; CHECK-LABEL: st3d_f64:
 ; CHECK: st3d { z0.d, z1.d, z2.d }, p0, [x0, #-3, mul vl]
 ; CHECK-NEXT: ret
-  %base = getelementptr <vscale x 2 x double>, <vscale x 2 x double>* %addr, i64 -3
+  %base = getelementptr <vscale x 2 x double>, <vscale x 2 x double>* %addr, i64 -3, i64 0
   call void @llvm.aarch64.sve.st3.nxv2f64(<vscale x 2 x double> %v0,
                                           <vscale x 2 x double> %v1,
                                           <vscale x 2 x double> %v2,
                                           <vscale x 2 x i1> %pred,
-                                          <vscale x 2 x double>* %base)
+                                          double* %base)
   ret void
 }
 
@@ -366,13 +366,13 @@ define void @st4b_i8_valid_imm(<vscale x 16 x i8> %v0, <vscale x 16 x i8> %v1, <
 ; CHECK-LABEL: st4b_i8_valid_imm:
 ; CHECK: st4b { z0.b, z1.b, z2.b, z3.b }, p0, [x0, #4, mul vl]
 ; CHECK-NEXT: ret
-  %base = getelementptr <vscale x 16 x i8>, <vscale x 16 x i8>* %addr, i64 4
+  %base = getelementptr <vscale x 16 x i8>, <vscale x 16 x i8>* %addr, i64 4, i64 0
   call void @llvm.aarch64.sve.st4.nxv16i8(<vscale x 16 x i8> %v0,
                                           <vscale x 16 x i8> %v1,
                                           <vscale x 16 x i8> %v2,
                                           <vscale x 16 x i8> %v3,
                                           <vscale x 16 x i1> %pred,
-                                          <vscale x 16 x i8>* %base)
+                                          i8* %base)
   ret void
 }
 
@@ -381,13 +381,13 @@ define void @st4b_i8_invalid_imm_not_multiple_of_4_01(<vscale x 16 x i8> %v0, <v
 ; CHECK: rdvl x[[N:[0-9]+]], #5
 ; CHECK-NEXT: st4b { z0.b, z1.b, z2.b, z3.b }, p0, [x0, x[[N]]]
 ; CHECK-NEXT: ret
-  %base = getelementptr <vscale x 16 x i8>, <vscale x 16 x i8>* %addr, i64 5
+  %base = getelementptr <vscale x 16 x i8>, <vscale x 16 x i8>* %addr, i64 5, i64 0
   call void @llvm.aarch64.sve.st4.nxv16i8(<vscale x 16 x i8> %v0,
                                           <vscale x 16 x i8> %v1,
                                           <vscale x 16 x i8> %v2,
                                           <vscale x 16 x i8> %v3,
                                           <vscale x 16 x i1> %pred,
-                                          <vscale x 16 x i8>* %base)
+                                          i8* %base)
   ret void
 }
 
@@ -396,13 +396,13 @@ define void @st4b_i8_invalid_imm_not_multiple_of_4_02(<vscale x 16 x i8> %v0, <v
 ; CHECK: rdvl x[[N:[0-9]+]], #6
 ; CHECK-NEXT: st4b { z0.b, z1.b, z2.b, z3.b }, p0, [x0, x[[N]]]
 ; CHECK-NEXT: ret
-  %base = getelementptr <vscale x 16 x i8>, <vscale x 16 x i8>* %addr, i64 6
+  %base = getelementptr <vscale x 16 x i8>, <vscale x 16 x i8>* %addr, i64 6, i64 0
   call void @llvm.aarch64.sve.st4.nxv16i8(<vscale x 16 x i8> %v0,
                                           <vscale x 16 x i8> %v1,
                                           <vscale x 16 x i8> %v2,
                                           <vscale x 16 x i8> %v3,
                                           <vscale x 16 x i1> %pred,
-                                          <vscale x 16 x i8>* %base)
+                                          i8* %base)
   ret void
 }
 
@@ -411,13 +411,13 @@ define void @st4b_i8_invalid_imm_not_multiple_of_4_03(<vscale x 16 x i8> %v0, <v
 ; CHECK: rdvl x[[N:[0-9]+]], #7
 ; CHECK-NEXT: st4b { z0.b, z1.b, z2.b, z3.b }, p0, [x0, x[[N]]]
 ; CHECK-NEXT: ret
-  %base = getelementptr <vscale x 16 x i8>, <vscale x 16 x i8>* %addr, i64 7
+  %base = getelementptr <vscale x 16 x i8>, <vscale x 16 x i8>* %addr, i64 7, i64 0
   call void @llvm.aarch64.sve.st4.nxv16i8(<vscale x 16 x i8> %v0,
                                           <vscale x 16 x i8> %v1,
                                           <vscale x 16 x i8> %v2,
                                           <vscale x 16 x i8> %v3,
                                           <vscale x 16 x i1> %pred,
-                                          <vscale x 16 x i8>* %base)
+                                          i8* %base)
   ret void
 }
 
@@ -433,13 +433,13 @@ define void @st4b_i8_invalid_imm_out_of_lower_bound(<vscale x 16 x i8> %v0, <vsc
 ; CHECK-DAG:  mul  x[[OFFSET:[0-9]+]], x[[P]], x[[M]]
 ; CHECK-NEXT: st4b { z0.b, z1.b, z2.b, z3.b }, p0, [x0, x[[OFFSET]]]
 ; CHECK-NEXT: ret
-  %base = getelementptr <vscale x 16 x i8>, <vscale x 16 x i8>* %addr, i64 -36
+  %base = getelementptr <vscale x 16 x i8>, <vscale x 16 x i8>* %addr, i64 -36, i64 0
   call void @llvm.aarch64.sve.st4.nxv16i8(<vscale x 16 x i8> %v0,
                                           <vscale x 16 x i8> %v1,
                                           <vscale x 16 x i8> %v2,
                                           <vscale x 16 x i8> %v3,
                                           <vscale x 16 x i1> %pred,
-                                          <vscale x 16 x i8>* %base)
+                                          i8* %base)
   ret void
 }
 
@@ -455,13 +455,13 @@ define void @st4b_i8_invalid_imm_out_of_upper_bound(<vscale x 16 x i8> %v0, <vsc
 ; CHECK-DAG:  mul  x[[OFFSET:[0-9]+]], x[[P]], x[[M]]
 ; CHECK-NEXT: st4b { z0.b, z1.b, z2.b, z3.b }, p0, [x0, x[[OFFSET]]]
 ; CHECK-NEXT: ret
-  %base = getelementptr <vscale x 16 x i8>, <vscale x 16 x i8>* %addr, i64 32
+  %base = getelementptr <vscale x 16 x i8>, <vscale x 16 x i8>* %addr, i64 32, i64 0
   call void @llvm.aarch64.sve.st4.nxv16i8(<vscale x 16 x i8> %v0,
                                           <vscale x 16 x i8> %v1,
                                           <vscale x 16 x i8> %v2,
                                           <vscale x 16 x i8> %v3,
                                           <vscale x 16 x i1> %pred,
-                                          <vscale x 16 x i8>* %base)
+                                          i8* %base)
   ret void
 }
 
@@ -469,13 +469,13 @@ define void @st4b_i8_valid_imm_lower_bound(<vscale x 16 x i8> %v0, <vscale x 16
 ; CHECK-LABEL: st4b_i8_valid_imm_lower_bound:
 ; CHECK: st4b { z0.b, z1.b, z2.b, z3.b }, p0, [x0, #-32, mul vl]
 ; CHECK-NEXT: ret
-  %base = getelementptr <vscale x 16 x i8>, <vscale x 16 x i8>* %addr, i64 -32
+  %base = getelementptr <vscale x 16 x i8>, <vscale x 16 x i8>* %addr, i64 -32, i64 0
   call void @llvm.aarch64.sve.st4.nxv16i8(<vscale x 16 x i8> %v0,
                                           <vscale x 16 x i8> %v1,
                                           <vscale x 16 x i8> %v2,
                                           <vscale x 16 x i8> %v3,
                                           <vscale x 16 x i1> %pred,
-                                          <vscale x 16 x i8>* %base)
+                                          i8* %base)
   ret void
 }
 
@@ -483,13 +483,13 @@ define void @st4b_i8_valid_imm_upper_bound(<vscale x 16 x i8> %v0, <vscale x 16
 ; CHECK-LABEL: st4b_i8_valid_imm_upper_bound:
 ; CHECK: st4b { z0.b, z1.b, z2.b, z3.b }, p0, [x0, #28, mul vl]
 ; CHECK-NEXT: ret
-  %base = getelementptr <vscale x 16 x i8>, <vscale x 16 x i8>* %addr, i64 28
+  %base = getelementptr <vscale x 16 x i8>, <vscale x 16 x i8>* %addr, i64 28, i64 0
   call void @llvm.aarch64.sve.st4.nxv16i8(<vscale x 16 x i8> %v0,
                                           <vscale x 16 x i8> %v1,
                                           <vscale x 16 x i8> %v2,
                                           <vscale x 16 x i8> %v3,
                                           <vscale x 16 x i1> %pred,
-                                          <vscale x 16 x i8>* %base)
+                                          i8* %base)
   ret void
 }
 
@@ -501,13 +501,13 @@ define void @st4h_i16(<vscale x 8 x i16> %v0, <vscale x 8 x i16> %v1, <vscale x
 ; CHECK-LABEL: st4h_i16:
 ; CHECK: st4h { z0.h, z1.h, z2.h, z3.h }, p0, [x0, #8, mul vl]
 ; CHECK-NEXT: ret
-  %base = getelementptr <vscale x 8 x i16>, <vscale x 8 x i16>* %addr, i64 8
+  %base = getelementptr <vscale x 8 x i16>, <vscale x 8 x i16>* %addr, i64 8, i64 0
   call void @llvm.aarch64.sve.st4.nxv8i16(<vscale x 8 x i16> %v0,
                                           <vscale x 8 x i16> %v1,
                                           <vscale x 8 x i16> %v2,
                                           <vscale x 8 x i16> %v3,
                                           <vscale x 8 x i1> %pred,
-                                          <vscale x 8 x i16>* %base)
+                                          i16* %base)
   ret void
 }
 
@@ -515,13 +515,13 @@ define void @st4h_f16(<vscale x 8 x half> %v0, <vscale x 8 x half> %v1, <vscale
 ; CHECK-LABEL: st4h_f16:
 ; CHECK: st4h { z0.h, z1.h, z2.h, z3.h }, p0, [x0, #12, mul vl]
 ; CHECK-NEXT: ret
-  %base = getelementptr <vscale x 8 x half>, <vscale x 8 x half>* %addr, i64 12
+  %base = getelementptr <vscale x 8 x half>, <vscale x 8 x half>* %addr, i64 12, i64 0
   call void @llvm.aarch64.sve.st4.nxv8f16(<vscale x 8 x half> %v0,
                                           <vscale x 8 x half> %v1,
                                           <vscale x 8 x half> %v2,
                                           <vscale x 8 x half> %v3,
                                           <vscale x 8 x i1> %pred,
-                                          <vscale x 8 x half>* %base)
+                                          half* %base)
   ret void
 }
 
@@ -533,13 +533,13 @@ define void @st4w_i32(<vscale x 4 x i32> %v0, <vscale x 4 x i32> %v1, <vscale x
 ; CHECK-LABEL: st4w_i32:
 ; CHECK: st4w { z0.s, z1.s, z2.s, z3.s }, p0, [x0, #16, mul vl]
 ; CHECK-NEXT: ret
-  %base = getelementptr <vscale x 4 x i32>, <vscale x 4 x i32>* %addr, i64 16
+  %base = getelementptr <vscale x 4 x i32>, <vscale x 4 x i32>* %addr, i64 16, i64 0
   call void @llvm.aarch64.sve.st4.nxv4i32(<vscale x 4 x i32> %v0,
                                           <vscale x 4 x i32> %v1,
                                           <vscale x 4 x i32> %v2,
                                           <vscale x 4 x i32> %v3,
                                           <vscale x 4 x i1> %pred,
-                                          <vscale x 4 x i32>* %base)
+                                          i32* %base)
   ret void
 }
 
@@ -547,13 +547,13 @@ define void @st4w_f32(<vscale x 4 x float> %v0, <vscale x 4 x float> %v1, <vscal
 ; CHECK-LABEL: st4w_f32:
 ; CHECK: st4w { z0.s, z1.s, z2.s, z3.s }, p0, [x0, #20, mul vl]
 ; CHECK-NEXT: ret
-  %base = getelementptr <vscale x 4 x float>, <vscale x 4 x float>* %addr, i64 20
+  %base = getelementptr <vscale x 4 x float>, <vscale x 4 x float>* %addr, i64 20, i64 0
   call void @llvm.aarch64.sve.st4.nxv4f32(<vscale x 4 x float> %v0,
                                           <vscale x 4 x float> %v1,
                                           <vscale x 4 x float> %v2,
                                           <vscale x 4 x float> %v3,
                                           <vscale x 4 x i1> %pred,
-                                          <vscale x 4 x float>* %base)
+                                          float* %base)
   ret void
 }
 
@@ -565,13 +565,13 @@ define void @st4d_i64(<vscale x 2 x i64> %v0, <vscale x 2 x i64> %v1, <vscale x
 ; CHECK-LABEL: st4d_i64:
 ; CHECK: st4d { z0.d, z1.d, z2.d, z3.d }, p0, [x0, #24, mul vl]
 ; CHECK-NEXT: ret
-  %base = getelementptr <vscale x 2 x i64>, <vscale x 2 x i64>* %addr, i64 24
+  %base = getelementptr <vscale x 2 x i64>, <vscale x 2 x i64>* %addr, i64 24, i64 0
   call void @llvm.aarch64.sve.st4.nxv2i64(<vscale x 2 x i64> %v0,
                                           <vscale x 2 x i64> %v1,
                                           <vscale x 2 x i64> %v2,
                                           <vscale x 2 x i64> %v3,
                                           <vscale x 2 x i1> %pred,
-                                          <vscale x 2 x i64>* %base)
+                                          i64* %base)
   ret void
 }
 
@@ -579,36 +579,36 @@ define void @st4d_f64(<vscale x 2 x double> %v0, <vscale x 2 x double> %v1, <vsc
 ; CHECK-LABEL: st4d_f64:
 ; CHECK: st4d { z0.d, z1.d, z2.d, z3.d }, p0, [x0, #28, mul vl]
 ; CHECK-NEXT: ret
-  %base = getelementptr <vscale x 2 x double>, <vscale x 2 x double>* %addr, i64 28
+  %base = getelementptr <vscale x 2 x double>, <vscale x 2 x double>* %addr, i64 28, i64 0
   call void @llvm.aarch64.sve.st4.nxv2f64(<vscale x 2 x double> %v0,
                                           <vscale x 2 x double> %v1,
                                           <vscale x 2 x double> %v2,
                                           <vscale x 2 x double> %v3,
                                           <vscale x 2 x i1> %pred,
-                                          <vscale x 2 x double>* %base)
-  ret void
-}
-
-declare void @llvm.aarch64.sve.st2.nxv16i8(<vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i1>, <vscale x 16 x i8>*)
-declare void @llvm.aarch64.sve.st2.nxv8i16(<vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i1>, <vscale x 8 x i16>*)
-declare void @llvm.aarch64.sve.st2.nxv4i32(<vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i1>, <vscale x 4 x i32>*)
-declare void @llvm.aarch64.sve.st2.nxv2i64(<vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i1>, <vscale x 2 x i64>*)
-declare void @llvm.aarch64.sve.st2.nxv8f16(<vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x i1>, <vscale x 8 x half>*)
-declare void @llvm.aarch64.sve.st2.nxv4f32(<vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x i1>, <vscale x 4 x float>*)
-declare void @llvm.aarch64.sve.st2.nxv2f64(<vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x i1>, <vscale x 2 x double>*)
-
-declare void @llvm.aarch64.sve.st3.nxv16i8(<vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i1>, <vscale x 16 x i8>*)
-declare void @llvm.aarch64.sve.st3.nxv8i16(<vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i1>, <vscale x 8 x i16>*)
-declare void @llvm.aarch64.sve.st3.nxv4i32(<vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i1>, <vscale x 4 x i32>*)
-declare void @llvm.aarch64.sve.st3.nxv2i64(<vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i1>, <vscale x 2 x i64>*)
-declare void @llvm.aarch64.sve.st3.nxv8f16(<vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x i1>, <vscale x 8 x half>*)
-declare void @llvm.aarch64.sve.st3.nxv4f32(<vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x i1>, <vscale x 4 x float>*)
-declare void @llvm.aarch64.sve.st3.nxv2f64(<vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x i1>, <vscale x 2 x double>*)
-
-declare void @llvm.aarch64.sve.st4.nxv16i8(<vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i1>, <vscale x 16 x i8>*)
-declare void @llvm.aarch64.sve.st4.nxv8i16(<vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i1>, <vscale x 8 x i16>*)
-declare void @llvm.aarch64.sve.st4.nxv4i32(<vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i1>, <vscale x 4 x i32>*)
-declare void @llvm.aarch64.sve.st4.nxv2i64(<vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i1>, <vscale x 2 x i64>*)
-declare void @llvm.aarch64.sve.st4.nxv8f16(<vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x i1>, <vscale x 8 x half>*)
-declare void @llvm.aarch64.sve.st4.nxv4f32(<vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x i1>, <vscale x 4 x float>*)
-declare void @llvm.aarch64.sve.st4.nxv2f64(<vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x i1>, <vscale x 2 x double>*)
+                                          double* %base)
+  ret void
+}
+
+declare void @llvm.aarch64.sve.st2.nxv16i8(<vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i1>, i8*)
+declare void @llvm.aarch64.sve.st2.nxv8i16(<vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i1>, i16*)
+declare void @llvm.aarch64.sve.st2.nxv4i32(<vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i1>, i32*)
+declare void @llvm.aarch64.sve.st2.nxv2i64(<vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i1>, i64*)
+declare void @llvm.aarch64.sve.st2.nxv8f16(<vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x i1>, half*)
+declare void @llvm.aarch64.sve.st2.nxv4f32(<vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x i1>, float*)
+declare void @llvm.aarch64.sve.st2.nxv2f64(<vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x i1>, double*)
+
+declare void @llvm.aarch64.sve.st3.nxv16i8(<vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i1>, i8*)
+declare void @llvm.aarch64.sve.st3.nxv8i16(<vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i1>, i16*)
+declare void @llvm.aarch64.sve.st3.nxv4i32(<vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i1>, i32*)
+declare void @llvm.aarch64.sve.st3.nxv2i64(<vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i1>, i64*)
+declare void @llvm.aarch64.sve.st3.nxv8f16(<vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x i1>, half*)
+declare void @llvm.aarch64.sve.st3.nxv4f32(<vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x i1>, float*)
+declare void @llvm.aarch64.sve.st3.nxv2f64(<vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x i1>, double*)
+
+declare void @llvm.aarch64.sve.st4.nxv16i8(<vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i1>, i8*)
+declare void @llvm.aarch64.sve.st4.nxv8i16(<vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i1>, i16*)
+declare void @llvm.aarch64.sve.st4.nxv4i32(<vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i1>, i32*)
+declare void @llvm.aarch64.sve.st4.nxv2i64(<vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i1>, i64*)
+declare void @llvm.aarch64.sve.st4.nxv8f16(<vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x i1>, half*)
+declare void @llvm.aarch64.sve.st4.nxv4f32(<vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x i1>, float*)
+declare void @llvm.aarch64.sve.st4.nxv2f64(<vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x i1>, double*)
diff --git a/llvm/test/CodeGen/AArch64/sve-intrinsics-stN-reg-reg-addr-mode.ll b/llvm/test/CodeGen/AArch64/sve-intrinsics-stN-reg-reg-addr-mode.ll
index 4945fdc..5956290 100644
--- a/llvm/test/CodeGen/AArch64/sve-intrinsics-stN-reg-reg-addr-mode.ll
+++ b/llvm/test/CodeGen/AArch64/sve-intrinsics-stN-reg-reg-addr-mode.ll
@@ -9,11 +9,10 @@ define void @st2b_i8(<vscale x 16 x i8> %v0, <vscale x 16 x i8> %v1, <vscale x 1
 ; CHECK: st2b { z0.b, z1.b }, p0, [x0, x1]
 ; CHECK-NEXT: ret
   %1 = getelementptr i8, i8* %addr, i64 %offset
-  %base = bitcast i8* %1 to <vscale x 16 x i8>*
   call void @llvm.aarch64.sve.st2.nxv16i8(<vscale x 16 x i8> %v0,
                                           <vscale x 16 x i8> %v1,
                                           <vscale x 16 x i1> %pred,
-                                          <vscale x 16 x i8>* %base)
+                                          i8* %1)
   ret void
 }
 
@@ -26,11 +25,10 @@ define void @st2h_i16(<vscale x 8 x i16> %v0, <vscale x 8 x i16> %v1, <vscale x
 ; CHECK: st2h { z0.h, z1.h }, p0, [x0, x1, lsl #1]
 ; CHECK-NEXT: ret
   %1 = getelementptr i16, i16* %addr, i64 %offset
-  %base = bitcast i16* %1 to <vscale x 8 x i16>*
   call void @llvm.aarch64.sve.st2.nxv8i16(<vscale x 8 x i16> %v0,
                                           <vscale x 8 x i16> %v1,
                                           <vscale x 8 x i1> %pred,
-                                          <vscale x 8 x i16>* %base)
+                                          i16* %1)
   ret void
 }
 
@@ -39,11 +37,10 @@ define void @st2h_f16(<vscale x 8 x half> %v0, <vscale x 8 x half> %v1, <vscale
 ; CHECK: st2h { z0.h, z1.h }, p0, [x0, x1, lsl #1]
 ; CHECK-NEXT: ret
   %1 = getelementptr half, half* %addr, i64 %offset
-  %base = bitcast half* %1 to <vscale x 8 x half>*
   call void @llvm.aarch64.sve.st2.nxv8f16(<vscale x 8 x half> %v0,
                                           <vscale x 8 x half> %v1,
                                           <vscale x 8 x i1> %pred,
-                                          <vscale x 8 x half>* %base)
+                                          half* %1)
   ret void
 }
 
@@ -56,11 +53,10 @@ define void @st2w_i32(<vscale x 4 x i32> %v0, <vscale x 4 x i32> %v1, <vscale x
 ; CHECK: st2w { z0.s, z1.s }, p0, [x0, x1, lsl #2]
 ; CHECK-NEXT: ret
   %1 = getelementptr i32, i32* %addr, i64 %offset
-  %base = bitcast i32* %1 to <vscale x 4 x i32>*
   call void @llvm.aarch64.sve.st2.nxv4i32(<vscale x 4 x i32> %v0,
                                           <vscale x 4 x i32> %v1,
                                           <vscale x 4 x i1> %pred,
-                                          <vscale x 4 x i32>* %base)
+                                          i32* %1)
   ret void
 }
 
@@ -69,11 +65,10 @@ define void @st2w_f32(<vscale x 4 x float> %v0, <vscale x 4 x float> %v1, <vscal
 ; CHECK: st2w { z0.s, z1.s }, p0, [x0, x1, lsl #2]
 ; CHECK-NEXT: ret
   %1 = getelementptr float, float* %addr, i64 %offset
-  %base = bitcast float* %1 to <vscale x 4 x float>*
   call void @llvm.aarch64.sve.st2.nxv4f32(<vscale x 4 x float> %v0,
                                           <vscale x 4 x float> %v1,
                                           <vscale x 4 x i1> %pred,
-                                          <vscale x 4 x float>* %base)
+                                          float* %1)
   ret void
 }
 
@@ -86,11 +81,10 @@ define void @st2d_i64(<vscale x 2 x i64> %v0, <vscale x 2 x i64> %v1, <vscale x
 ; CHECK: st2d { z0.d, z1.d }, p0, [x0, x1, lsl #3]
 ; CHECK-NEXT: ret
   %1 = getelementptr i64, i64* %addr, i64 %offset
-  %base = bitcast i64* %1 to <vscale x 2 x i64>*
   call void @llvm.aarch64.sve.st2.nxv2i64(<vscale x 2 x i64> %v0,
                                           <vscale x 2 x i64> %v1,
                                           <vscale x 2 x i1> %pred,
-                                          <vscale x 2 x i64>* %base)
+                                          i64* %1)
   ret void
 }
 
@@ -99,11 +93,10 @@ define void @st2d_f64(<vscale x 2 x double> %v0, <vscale x 2 x double> %v1, <vsc
 ; CHECK: st2d { z0.d, z1.d }, p0, [x0, x1, lsl #3]
 ; CHECK-NEXT: ret
   %1 = getelementptr double, double* %addr, i64 %offset
-  %base = bitcast double* %1 to <vscale x 2 x double>*
   call void @llvm.aarch64.sve.st2.nxv2f64(<vscale x 2 x double> %v0,
                                           <vscale x 2 x double> %v1,
                                           <vscale x 2 x i1> %pred,
-                                          <vscale x 2 x double>* %base)
+                                          double* %1)
   ret void
 }
 
@@ -116,12 +109,11 @@ define void @st3b_i8(<vscale x 16 x i8> %v0, <vscale x 16 x i8> %v1, <vscale x 1
 ; CHECK: st3b { z0.b, z1.b, z2.b }, p0, [x0, x1]
 ; CHECK-NEXT: ret
   %1 = getelementptr i8, i8* %addr, i64 %offset
-  %base = bitcast i8* %1 to <vscale x 16 x i8>*
   call void @llvm.aarch64.sve.st3.nxv16i8(<vscale x 16 x i8> %v0,
                                           <vscale x 16 x i8> %v1,
                                           <vscale x 16 x i8> %v2,
                                           <vscale x 16 x i1> %pred,
-                                          <vscale x 16 x i8>* %base)
+                                          i8* %1)
   ret void
 }
 
@@ -134,12 +126,11 @@ define void @st3h_i16(<vscale x 8 x i16> %v0, <vscale x 8 x i16> %v1, <vscale x
 ; CHECK: st3h { z0.h, z1.h, z2.h }, p0, [x0, x1, lsl #1]
 ; CHECK-NEXT: ret
   %1 = getelementptr i16, i16* %addr, i64 %offset
-  %base = bitcast i16* %1 to <vscale x 8 x i16>*
   call void @llvm.aarch64.sve.st3.nxv8i16(<vscale x 8 x i16> %v0,
                                           <vscale x 8 x i16> %v1,
                                           <vscale x 8 x i16> %v2,
                                           <vscale x 8 x i1> %pred,
-                                          <vscale x 8 x i16>* %base)
+                                          i16* %1)
   ret void
 }
 
@@ -148,12 +139,11 @@ define void @st3h_f16(<vscale x 8 x half> %v0, <vscale x 8 x half> %v1, <vscale
 ; CHECK: st3h { z0.h, z1.h, z2.h }, p0, [x0, x1, lsl #1]
 ; CHECK-NEXT: ret
   %1 = getelementptr half, half* %addr, i64 %offset
-  %base = bitcast half* %1 to <vscale x 8 x half>*
   call void @llvm.aarch64.sve.st3.nxv8f16(<vscale x 8 x half> %v0,
                                           <vscale x 8 x half> %v1,
                                           <vscale x 8 x half> %v2,
                                           <vscale x 8 x i1> %pred,
-                                          <vscale x 8 x half>* %base)
+                                          half* %1)
   ret void
 }
 
@@ -166,12 +156,11 @@ define void @st3w_i32(<vscale x 4 x i32> %v0, <vscale x 4 x i32> %v1, <vscale x
 ; CHECK: st3w { z0.s, z1.s, z2.s }, p0, [x0, x1, lsl #2]
 ; CHECK-NEXT: ret
   %1 = getelementptr i32, i32* %addr, i64 %offset
-  %base = bitcast i32* %1 to <vscale x 4 x i32>*
   call void @llvm.aarch64.sve.st3.nxv4i32(<vscale x 4 x i32> %v0,
                                           <vscale x 4 x i32> %v1,
                                           <vscale x 4 x i32> %v2,
                                           <vscale x 4 x i1> %pred,
-                                          <vscale x 4 x i32>* %base)
+                                          i32* %1)
   ret void
 }
 
@@ -180,12 +169,11 @@ define void @st3w_f32(<vscale x 4 x float> %v0, <vscale x 4 x float> %v1, <vscal
 ; CHECK: st3w { z0.s, z1.s, z2.s }, p0, [x0, x1, lsl #2]
 ; CHECK-NEXT: ret
   %1 = getelementptr float, float* %addr, i64 %offset
-  %base = bitcast float* %1 to <vscale x 4 x float>*
   call void @llvm.aarch64.sve.st3.nxv4f32(<vscale x 4 x float> %v0,
                                           <vscale x 4 x float> %v1,
                                           <vscale x 4 x float> %v2,
                                           <vscale x 4 x i1> %pred,
-                                          <vscale x 4 x float>* %base)
+                                          float* %1)
   ret void
 }
 
@@ -198,12 +186,11 @@ define void @st3d_i64(<vscale x 2 x i64> %v0, <vscale x 2 x i64> %v1, <vscale x
 ; CHECK: st3d { z0.d, z1.d, z2.d }, p0, [x0, x1, lsl #3]
 ; CHECK-NEXT: ret
   %1 = getelementptr i64, i64* %addr, i64 %offset
-  %base = bitcast i64* %1 to <vscale x 2 x i64>*
   call void @llvm.aarch64.sve.st3.nxv2i64(<vscale x 2 x i64> %v0,
                                           <vscale x 2 x i64> %v1,
                                           <vscale x 2 x i64> %v2,
                                           <vscale x 2 x i1> %pred,
-                                          <vscale x 2 x i64>* %base)
+                                          i64* %1)
   ret void
 }
 
@@ -212,12 +199,11 @@ define void @st3d_f64(<vscale x 2 x double> %v0, <vscale x 2 x double> %v1, <vsc
 ; CHECK: st3d { z0.d, z1.d, z2.d }, p0, [x0, x1, lsl #3]
 ; CHECK-NEXT: ret
   %1 = getelementptr double, double* %addr, i64 %offset
-  %base = bitcast double* %1 to <vscale x 2 x double>*
   call void @llvm.aarch64.sve.st3.nxv2f64(<vscale x 2 x double> %v0,
                                           <vscale x 2 x double> %v1,
                                           <vscale x 2 x double> %v2,
                                           <vscale x 2 x i1> %pred,
-                                          <vscale x 2 x double>* %base)
+                                          double* %1)
   ret void
 }
 
@@ -230,13 +216,12 @@ define void @st4b_i8(<vscale x 16 x i8> %v0, <vscale x 16 x i8> %v1, <vscale x 1
 ; CHECK: st4b { z0.b, z1.b, z2.b, z3.b }, p0, [x0, x1]
 ; CHECK-NEXT: ret
   %1 = getelementptr i8, i8* %addr, i64 %offset
-  %base = bitcast i8* %1 to <vscale x 16 x i8>*
   call void @llvm.aarch64.sve.st4.nxv16i8(<vscale x 16 x i8> %v0,
                                           <vscale x 16 x i8> %v1,
                                           <vscale x 16 x i8> %v2,
                                           <vscale x 16 x i8> %v3,
                                           <vscale x 16 x i1> %pred,
-                                          <vscale x 16 x i8>* %base)
+                                          i8* %1)
   ret void
 }
 
@@ -249,13 +234,12 @@ define void @st4h_i16(<vscale x 8 x i16> %v0, <vscale x 8 x i16> %v1, <vscale x
 ; CHECK: st4h { z0.h, z1.h, z2.h, z3.h }, p0, [x0, x1, lsl #1]
 ; CHECK-NEXT: ret
   %1 = getelementptr i16, i16* %addr, i64 %offset
-  %base = bitcast i16* %1 to <vscale x 8 x i16>*
   call void @llvm.aarch64.sve.st4.nxv8i16(<vscale x 8 x i16> %v0,
                                           <vscale x 8 x i16> %v1,
                                           <vscale x 8 x i16> %v2,
                                           <vscale x 8 x i16> %v3,
                                           <vscale x 8 x i1> %pred,
-                                          <vscale x 8 x i16>* %base)
+                                          i16* %1)
   ret void
 }
 
@@ -264,13 +248,12 @@ define void @st4h_f16(<vscale x 8 x half> %v0, <vscale x 8 x half> %v1, <vscale
 ; CHECK: st4h { z0.h, z1.h, z2.h, z3.h }, p0, [x0, x1, lsl #1]
 ; CHECK-NEXT: ret
   %1 = getelementptr half, half* %addr, i64 %offset
-  %base = bitcast half* %1 to <vscale x 8 x half>*
   call void @llvm.aarch64.sve.st4.nxv8f16(<vscale x 8 x half> %v0,
                                           <vscale x 8 x half> %v1,
                                           <vscale x 8 x half> %v2,
                                           <vscale x 8 x half> %v3,
                                           <vscale x 8 x i1> %pred,
-                                          <vscale x 8 x half>* %base)
+                                          half* %1)
   ret void
 }
 
@@ -283,13 +266,12 @@ define void @st4w_i32(<vscale x 4 x i32> %v0, <vscale x 4 x i32> %v1, <vscale x
 ; CHECK: st4w { z0.s, z1.s, z2.s, z3.s }, p0, [x0, x1, lsl #2]
 ; CHECK-NEXT: ret
   %1 = getelementptr i32, i32* %addr, i64 %offset
-  %base = bitcast i32* %1 to <vscale x 4 x i32>*
   call void @llvm.aarch64.sve.st4.nxv4i32(<vscale x 4 x i32> %v0,
                                           <vscale x 4 x i32> %v1,
                                           <vscale x 4 x i32> %v2,
                                           <vscale x 4 x i32> %v3,
                                           <vscale x 4 x i1> %pred,
-                                          <vscale x 4 x i32>* %base)
+                                          i32* %1)
   ret void
 }
 
@@ -298,13 +280,12 @@ define void @st4w_f32(<vscale x 4 x float> %v0, <vscale x 4 x float> %v1, <vscal
 ; CHECK: st4w { z0.s, z1.s, z2.s, z3.s }, p0, [x0, x1, lsl #2]
 ; CHECK-NEXT: ret
   %1 = getelementptr float, float* %addr, i64 %offset
-  %base = bitcast float* %1 to <vscale x 4 x float>*
   call void @llvm.aarch64.sve.st4.nxv4f32(<vscale x 4 x float> %v0,
                                           <vscale x 4 x float> %v1,
                                           <vscale x 4 x float> %v2,
                                           <vscale x 4 x float> %v3,
                                           <vscale x 4 x i1> %pred,
-                                          <vscale x 4 x float>* %base)
+                                          float* %1)
   ret void
 }
 
@@ -317,13 +298,12 @@ define void @st4d_i64(<vscale x 2 x i64> %v0, <vscale x 2 x i64> %v1, <vscale x
 ; CHECK: st4d { z0.d, z1.d, z2.d, z3.d }, p0, [x0, x1, lsl #3]
 ; CHECK-NEXT: ret
   %1 = getelementptr i64, i64* %addr, i64 %offset
-  %base = bitcast i64* %1 to <vscale x 2 x i64>*
   call void @llvm.aarch64.sve.st4.nxv2i64(<vscale x 2 x i64> %v0,
                                           <vscale x 2 x i64> %v1,
                                           <vscale x 2 x i64> %v2,
                                           <vscale x 2 x i64> %v3,
                                           <vscale x 2 x i1> %pred,
-                                          <vscale x 2 x i64>* %base)
+                                          i64* %1)
   ret void
 }
 
@@ -332,36 +312,35 @@ define void @st4d_f64(<vscale x 2 x double> %v0, <vscale x 2 x double> %v1, <vsc
 ; CHECK: st4d { z0.d, z1.d, z2.d, z3.d }, p0, [x0, x1, lsl #3]
 ; CHECK-NEXT: ret
   %1 = getelementptr double, double* %addr, i64 %offset
-  %base = bitcast double* %1 to <vscale x 2 x double>*
   call void @llvm.aarch64.sve.st4.nxv2f64(<vscale x 2 x double> %v0,
                                           <vscale x 2 x double> %v1,
                                           <vscale x 2 x double> %v2,
                                           <vscale x 2 x double> %v3,
                                           <vscale x 2 x i1> %pred,
-                                          <vscale x 2 x double>* %base)
+                                          double* %1)
   ret void
 }
 
-declare void @llvm.aarch64.sve.st2.nxv16i8(<vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i1>, <vscale x 16 x i8>*)
-declare void @llvm.aarch64.sve.st2.nxv8i16(<vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i1>, <vscale x 8 x i16>*)
-declare void @llvm.aarch64.sve.st2.nxv4i32(<vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i1>, <vscale x 4 x i32>*)
-declare void @llvm.aarch64.sve.st2.nxv2i64(<vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i1>, <vscale x 2 x i64>*)
-declare void @llvm.aarch64.sve.st2.nxv8f16(<vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x i1>, <vscale x 8 x half>*)
-declare void @llvm.aarch64.sve.st2.nxv4f32(<vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x i1>, <vscale x 4 x float>*)
-declare void @llvm.aarch64.sve.st2.nxv2f64(<vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x i1>, <vscale x 2 x double>*)
-
-declare void @llvm.aarch64.sve.st3.nxv16i8(<vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i1>, <vscale x 16 x i8>*)
-declare void @llvm.aarch64.sve.st3.nxv8i16(<vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i1>, <vscale x 8 x i16>*)
-declare void @llvm.aarch64.sve.st3.nxv4i32(<vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i1>, <vscale x 4 x i32>*)
-declare void @llvm.aarch64.sve.st3.nxv2i64(<vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i1>, <vscale x 2 x i64>*)
-declare void @llvm.aarch64.sve.st3.nxv8f16(<vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x i1>, <vscale x 8 x half>*)
-declare void @llvm.aarch64.sve.st3.nxv4f32(<vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x i1>, <vscale x 4 x float>*)
-declare void @llvm.aarch64.sve.st3.nxv2f64(<vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x i1>, <vscale x 2 x double>*)
-
-declare void @llvm.aarch64.sve.st4.nxv16i8(<vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i1>, <vscale x 16 x i8>*)
-declare void @llvm.aarch64.sve.st4.nxv8i16(<vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i1>, <vscale x 8 x i16>*)
-declare void @llvm.aarch64.sve.st4.nxv4i32(<vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i1>, <vscale x 4 x i32>*)
-declare void @llvm.aarch64.sve.st4.nxv2i64(<vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i1>, <vscale x 2 x i64>*)
-declare void @llvm.aarch64.sve.st4.nxv8f16(<vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x i1>, <vscale x 8 x half>*)
-declare void @llvm.aarch64.sve.st4.nxv4f32(<vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x i1>, <vscale x 4 x float>*)
-declare void @llvm.aarch64.sve.st4.nxv2f64(<vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x i1>, <vscale x 2 x double>*)
+declare void @llvm.aarch64.sve.st2.nxv16i8(<vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i1>, i8*)
+declare void @llvm.aarch64.sve.st2.nxv8i16(<vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i1>, i16*)
+declare void @llvm.aarch64.sve.st2.nxv4i32(<vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i1>, i32*)
+declare void @llvm.aarch64.sve.st2.nxv2i64(<vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i1>, i64*)
+declare void @llvm.aarch64.sve.st2.nxv8f16(<vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x i1>, half*)
+declare void @llvm.aarch64.sve.st2.nxv4f32(<vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x i1>, float*)
+declare void @llvm.aarch64.sve.st2.nxv2f64(<vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x i1>, double*)
+
+declare void @llvm.aarch64.sve.st3.nxv16i8(<vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i1>, i8*)
+declare void @llvm.aarch64.sve.st3.nxv8i16(<vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i1>, i16*)
+declare void @llvm.aarch64.sve.st3.nxv4i32(<vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i1>, i32*)
+declare void @llvm.aarch64.sve.st3.nxv2i64(<vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i1>, i64*)
+declare void @llvm.aarch64.sve.st3.nxv8f16(<vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x i1>, half*)
+declare void @llvm.aarch64.sve.st3.nxv4f32(<vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x i1>, float*)
+declare void @llvm.aarch64.sve.st3.nxv2f64(<vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x i1>, double*)
+
+declare void @llvm.aarch64.sve.st4.nxv16i8(<vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i1>, i8*)
+declare void @llvm.aarch64.sve.st4.nxv8i16(<vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i1>, i16*)
+declare void @llvm.aarch64.sve.st4.nxv4i32(<vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i1>, i32*)
+declare void @llvm.aarch64.sve.st4.nxv2i64(<vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i1>, i64*)
+declare void @llvm.aarch64.sve.st4.nxv8f16(<vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x i1>, half*)
+declare void @llvm.aarch64.sve.st4.nxv4f32(<vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x i1>, float*)
+declare void @llvm.aarch64.sve.st4.nxv2f64(<vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x i1>, double*)
diff --git a/llvm/test/CodeGen/AArch64/sve-intrinsics-stores.ll b/llvm/test/CodeGen/AArch64/sve-intrinsics-stores.ll
index ac2b9a3..6416376 100644
--- a/llvm/test/CodeGen/AArch64/sve-intrinsics-stores.ll
+++ b/llvm/test/CodeGen/AArch64/sve-intrinsics-stores.ll
@@ -4,14 +4,14 @@
 ; ST2B
 ;
 
-define void @st2b_i8(<vscale x 16 x i8> %v0, <vscale x 16 x i8> %v1, <vscale x 16 x i1> %pred, <vscale x 16 x i8>* %addr) {
+define void @st2b_i8(<vscale x 16 x i8> %v0, <vscale x 16 x i8> %v1, <vscale x 16 x i1> %pred, i8* %addr) {
 ; CHECK-LABEL: st2b_i8:
 ; CHECK: st2b { z0.b, z1.b }, p0, [x0]
 ; CHECK-NEXT: ret
   call void @llvm.aarch64.sve.st2.nxv16i8(<vscale x 16 x i8> %v0,
                                           <vscale x 16 x i8> %v1,
                                           <vscale x 16 x i1> %pred,
-                                          <vscale x 16 x i8>* %addr)
+                                          i8* %addr)
   ret void
 }
 
@@ -19,25 +19,25 @@ define void @st2b_i8(<vscale x 16 x i8> %v0, <vscale x 16 x i8> %v1, <vscale x 1
 ; ST2H
 ;
 
-define void @st2h_i16(<vscale x 8 x i16> %v0, <vscale x 8 x i16> %v1, <vscale x 8 x i1> %pred, <vscale x 8 x i16>* %addr) {
+define void @st2h_i16(<vscale x 8 x i16> %v0, <vscale x 8 x i16> %v1, <vscale x 8 x i1> %pred, i16* %addr) {
 ; CHECK-LABEL: st2h_i16:
 ; CHECK: st2h { z0.h, z1.h }, p0, [x0]
 ; CHECK-NEXT: ret
   call void @llvm.aarch64.sve.st2.nxv8i16(<vscale x 8 x i16> %v0,
                                           <vscale x 8 x i16> %v1,
                                           <vscale x 8 x i1> %pred,
-                                          <vscale x 8 x i16>* %addr)
+                                          i16* %addr)
   ret void
 }
 
-define void @st2h_f16(<vscale x 8 x half> %v0, <vscale x 8 x half> %v1, <vscale x 8 x i1> %pred, <vscale x 8 x half>* %addr) {
+define void @st2h_f16(<vscale x 8 x half> %v0, <vscale x 8 x half> %v1, <vscale x 8 x i1> %pred, half* %addr) {
 ; CHECK-LABEL: st2h_f16:
 ; CHECK: st2h { z0.h, z1.h }, p0, [x0]
 ; CHECK-NEXT: ret
   call void @llvm.aarch64.sve.st2.nxv8f16(<vscale x 8 x half> %v0,
                                           <vscale x 8 x half> %v1,
                                           <vscale x 8 x i1> %pred,
-                                          <vscale x 8 x half>* %addr)
+                                          half* %addr)
   ret void
 }
 
@@ -45,25 +45,25 @@ define void @st2h_f16(<vscale x 8 x half> %v0, <vscale x 8 x half> %v1, <vscale
 ; ST2W
 ;
 
-define void @st2w_i32(<vscale x 4 x i32> %v0, <vscale x 4 x i32> %v1, <vscale x 4 x i1> %pred, <vscale x 4 x i32>* %addr) {
+define void @st2w_i32(<vscale x 4 x i32> %v0, <vscale x 4 x i32> %v1, <vscale x 4 x i1> %pred, i32* %addr) {
 ; CHECK-LABEL: st2w_i32:
 ; CHECK: st2w { z0.s, z1.s }, p0, [x0]
 ; CHECK-NEXT: ret
   call void @llvm.aarch64.sve.st2.nxv4i32(<vscale x 4 x i32> %v0,
                                           <vscale x 4 x i32> %v1,
                                           <vscale x 4 x i1> %pred,
-                                          <vscale x 4 x i32>* %addr)
+                                          i32* %addr)
   ret void
 }
 
-define void @st2w_f32(<vscale x 4 x float> %v0, <vscale x 4 x float> %v1, <vscale x 4 x i1> %pred, <vscale x 4 x float>* %addr) {
+define void @st2w_f32(<vscale x 4 x float> %v0, <vscale x 4 x float> %v1, <vscale x 4 x i1> %pred, float* %addr) {
 ; CHECK-LABEL: st2w_f32:
 ; CHECK: st2w { z0.s, z1.s }, p0, [x0]
 ; CHECK-NEXT: ret
   call void @llvm.aarch64.sve.st2.nxv4f32(<vscale x 4 x float> %v0,
                                           <vscale x 4 x float> %v1,
                                           <vscale x 4 x i1> %pred,
-                                          <vscale x 4 x float>* %addr)
+                                          float* %addr)
   ret void
 }
 
@@ -71,25 +71,25 @@ define void @st2w_f32(<vscale x 4 x float> %v0, <vscale x 4 x float> %v1, <vscal
 ; ST2D
 ;
 
-define void @st2d_i64(<vscale x 2 x i64> %v0, <vscale x 2 x i64> %v1, <vscale x 2 x i1> %pred, <vscale x 2 x i64>* %addr) {
+define void @st2d_i64(<vscale x 2 x i64> %v0, <vscale x 2 x i64> %v1, <vscale x 2 x i1> %pred, i64* %addr) {
 ; CHECK-LABEL: st2d_i64:
 ; CHECK: st2d { z0.d, z1.d }, p0, [x0]
 ; CHECK-NEXT: ret
   call void @llvm.aarch64.sve.st2.nxv2i64(<vscale x 2 x i64> %v0,
                                           <vscale x 2 x i64> %v1,
                                           <vscale x 2 x i1> %pred,
-                                          <vscale x 2 x i64>* %addr)
+                                          i64* %addr)
   ret void
 }
 
-define void @st2d_f64(<vscale x 2 x double> %v0, <vscale x 2 x double> %v1, <vscale x 2 x i1> %pred, <vscale x 2 x double>* %addr) {
+define void @st2d_f64(<vscale x 2 x double> %v0, <vscale x 2 x double> %v1, <vscale x 2 x i1> %pred, double* %addr) {
 ; CHECK-LABEL: st2d_f64:
 ; CHECK: st2d { z0.d, z1.d }, p0, [x0]
 ; CHECK-NEXT: ret
   call void @llvm.aarch64.sve.st2.nxv2f64(<vscale x 2 x double> %v0,
                                           <vscale x 2 x double> %v1,
                                           <vscale x 2 x i1> %pred,
-                                          <vscale x 2 x double>* %addr)
+                                          double* %addr)
   ret void
 }
 
@@ -97,7 +97,7 @@ define void @st2d_f64(<vscale x 2 x double> %v0, <vscale x 2 x double> %v1, <vsc
 ; ST3B
 ;
 
-define void @st3b_i8(<vscale x 16 x i8> %v0, <vscale x 16 x i8> %v1, <vscale x 16 x i8> %v2, <vscale x 16 x i1> %pred, <vscale x 16 x i8>* %addr) {
+define void @st3b_i8(<vscale x 16 x i8> %v0, <vscale x 16 x i8> %v1, <vscale x 16 x i8> %v2, <vscale x 16 x i1> %pred, i8* %addr) {
 ; CHECK-LABEL: st3b_i8:
 ; CHECK: st3b { z0.b, z1.b, z2.b }, p0, [x0]
 ; CHECK-NEXT: ret
@@ -105,7 +105,7 @@ define void @st3b_i8(<vscale x 16 x i8> %v0, <vscale x 16 x i8> %v1, <vscale x 1
                                           <vscale x 16 x i8> %v1,
                                           <vscale x 16 x i8> %v2,
                                           <vscale x 16 x i1> %pred,
-                                          <vscale x 16 x i8>* %addr)
+                                          i8* %addr)
   ret void
 }
 
@@ -113,7 +113,7 @@ define void @st3b_i8(<vscale x 16 x i8> %v0, <vscale x 16 x i8> %v1, <vscale x 1
 ; ST3H
 ;
 
-define void @st3h_i16(<vscale x 8 x i16> %v0, <vscale x 8 x i16> %v1, <vscale x 8 x i16> %v2, <vscale x 8 x i1> %pred, <vscale x 8 x i16>* %addr) {
+define void @st3h_i16(<vscale x 8 x i16> %v0, <vscale x 8 x i16> %v1, <vscale x 8 x i16> %v2, <vscale x 8 x i1> %pred, i16* %addr) {
 ; CHECK-LABEL: st3h_i16:
 ; CHECK: st3h { z0.h, z1.h, z2.h }, p0, [x0]
 ; CHECK-NEXT: ret
@@ -121,11 +121,11 @@ define void @st3h_i16(<vscale x 8 x i16> %v0, <vscale x 8 x i16> %v1, <vscale x
                                           <vscale x 8 x i16> %v1,
                                           <vscale x 8 x i16> %v2,
                                           <vscale x 8 x i1> %pred,
-                                          <vscale x 8 x i16>* %addr)
+                                          i16* %addr)
   ret void
 }
 
-define void @st3h_f16(<vscale x 8 x half> %v0, <vscale x 8 x half> %v1, <vscale x 8 x half> %v2, <vscale x 8 x i1> %pred, <vscale x 8 x half>* %addr) {
+define void @st3h_f16(<vscale x 8 x half> %v0, <vscale x 8 x half> %v1, <vscale x 8 x half> %v2, <vscale x 8 x i1> %pred, half* %addr) {
 ; CHECK-LABEL: st3h_f16:
 ; CHECK: st3h { z0.h, z1.h, z2.h }, p0, [x0]
 ; CHECK-NEXT: ret
@@ -133,7 +133,7 @@ define void @st3h_f16(<vscale x 8 x half> %v0, <vscale x 8 x half> %v1, <vscale
                                           <vscale x 8 x half> %v1,
                                           <vscale x 8 x half> %v2,
                                           <vscale x 8 x i1> %pred,
-                                          <vscale x 8 x half>* %addr)
+                                          half* %addr)
   ret void
 }
 
@@ -141,7 +141,7 @@ define void @st3h_f16(<vscale x 8 x half> %v0, <vscale x 8 x half> %v1, <vscale
 ; ST3W
 ;
 
-define void @st3w_i32(<vscale x 4 x i32> %v0, <vscale x 4 x i32> %v1, <vscale x 4 x i32> %v2, <vscale x 4 x i1> %pred, <vscale x 4 x i32>* %addr) {
+define void @st3w_i32(<vscale x 4 x i32> %v0, <vscale x 4 x i32> %v1, <vscale x 4 x i32> %v2, <vscale x 4 x i1> %pred, i32* %addr) {
 ; CHECK-LABEL: st3w_i32:
 ; CHECK: st3w { z0.s, z1.s, z2.s }, p0, [x0]
 ; CHECK-NEXT: ret
@@ -149,11 +149,11 @@ define void @st3w_i32(<vscale x 4 x i32> %v0, <vscale x 4 x i32> %v1, <vscale x
                                           <vscale x 4 x i32> %v1,
                                           <vscale x 4 x i32> %v2,
                                           <vscale x 4 x i1> %pred,
-                                          <vscale x 4 x i32>* %addr)
+                                          i32* %addr)
   ret void
 }
 
-define void @st3w_f32(<vscale x 4 x float> %v0, <vscale x 4 x float> %v1, <vscale x 4 x float> %v2, <vscale x 4 x i1> %pred, <vscale x 4 x float>* %addr) {
+define void @st3w_f32(<vscale x 4 x float> %v0, <vscale x 4 x float> %v1, <vscale x 4 x float> %v2, <vscale x 4 x i1> %pred, float* %addr) {
 ; CHECK-LABEL: st3w_f32:
 ; CHECK: st3w { z0.s, z1.s, z2.s }, p0, [x0]
 ; CHECK-NEXT: ret
@@ -161,7 +161,7 @@ define void @st3w_f32(<vscale x 4 x float> %v0, <vscale x 4 x float> %v1, <vscal
                                           <vscale x 4 x float> %v1,
                                           <vscale x 4 x float> %v2,
                                           <vscale x 4 x i1> %pred,
-                                          <vscale x 4 x float>* %addr)
+                                          float* %addr)
   ret void
 }
 
@@ -169,7 +169,7 @@ define void @st3w_f32(<vscale x 4 x float> %v0, <vscale x 4 x float> %v1, <vscal
 ; ST3D
 ;
 
-define void @st3d_i64(<vscale x 2 x i64> %v0, <vscale x 2 x i64> %v1, <vscale x 2 x i64> %v2, <vscale x 2 x i1> %pred, <vscale x 2 x i64>* %addr) {
+define void @st3d_i64(<vscale x 2 x i64> %v0, <vscale x 2 x i64> %v1, <vscale x 2 x i64> %v2, <vscale x 2 x i1> %pred, i64* %addr) {
 ; CHECK-LABEL: st3d_i64:
 ; CHECK: st3d { z0.d, z1.d, z2.d }, p0, [x0]
 ; CHECK-NEXT: ret
@@ -177,11 +177,11 @@ define void @st3d_i64(<vscale x 2 x i64> %v0, <vscale x 2 x i64> %v1, <vscale x
                                           <vscale x 2 x i64> %v1,
                                           <vscale x 2 x i64> %v2,
                                           <vscale x 2 x i1> %pred,
-                                          <vscale x 2 x i64>* %addr)
+                                          i64* %addr)
   ret void
 }
 
-define void @st3d_f64(<vscale x 2 x double> %v0, <vscale x 2 x double> %v1, <vscale x 2 x double> %v2, <vscale x 2 x i1> %pred, <vscale x 2 x double>* %addr) {
+define void @st3d_f64(<vscale x 2 x double> %v0, <vscale x 2 x double> %v1, <vscale x 2 x double> %v2, <vscale x 2 x i1> %pred, double* %addr) {
 ; CHECK-LABEL: st3d_f64:
 ; CHECK: st3d { z0.d, z1.d, z2.d }, p0, [x0]
 ; CHECK-NEXT: ret
@@ -189,7 +189,7 @@ define void @st3d_f64(<vscale x 2 x double> %v0, <vscale x 2 x double> %v1, <vsc
                                           <vscale x 2 x double> %v1,
                                           <vscale x 2 x double> %v2,
                                           <vscale x 2 x i1> %pred,
-                                          <vscale x 2 x double>* %addr)
+                                          double* %addr)
   ret void
 }
 
@@ -197,7 +197,7 @@ define void @st3d_f64(<vscale x 2 x double> %v0, <vscale x 2 x double> %v1, <vsc
 ; ST4B
 ;
 
-define void @st4b_i8(<vscale x 16 x i8> %v0, <vscale x 16 x i8> %v1, <vscale x 16 x i8> %v2, <vscale x 16 x i8> %v3, <vscale x 16 x i1> %pred, <vscale x 16 x i8>* %addr) {
+define void @st4b_i8(<vscale x 16 x i8> %v0, <vscale x 16 x i8> %v1, <vscale x 16 x i8> %v2, <vscale x 16 x i8> %v3, <vscale x 16 x i1> %pred, i8* %addr) {
 ; CHECK-LABEL: st4b_i8:
 ; CHECK: st4b { z0.b, z1.b, z2.b, z3.b }, p0, [x0]
 ; CHECK-NEXT: ret
@@ -206,7 +206,7 @@ define void @st4b_i8(<vscale x 16 x i8> %v0, <vscale x 16 x i8> %v1, <vscale x 1
                                           <vscale x 16 x i8> %v2,
                                           <vscale x 16 x i8> %v3,
                                           <vscale x 16 x i1> %pred,
-                                          <vscale x 16 x i8>* %addr)
+                                          i8* %addr)
   ret void
 }
 
@@ -214,7 +214,7 @@ define void @st4b_i8(<vscale x 16 x i8> %v0, <vscale x 16 x i8> %v1, <vscale x 1
 ; ST4H
 ;
 
-define void @st4h_i16(<vscale x 8 x i16> %v0, <vscale x 8 x i16> %v1, <vscale x 8 x i16> %v2, <vscale x 8 x i16> %v3, <vscale x 8 x i1> %pred, <vscale x 8 x i16>* %addr) {
+define void @st4h_i16(<vscale x 8 x i16> %v0, <vscale x 8 x i16> %v1, <vscale x 8 x i16> %v2, <vscale x 8 x i16> %v3, <vscale x 8 x i1> %pred, i16* %addr) {
 ; CHECK-LABEL: st4h_i16:
 ; CHECK: st4h { z0.h, z1.h, z2.h, z3.h }, p0, [x0]
 ; CHECK-NEXT: ret
@@ -223,11 +223,11 @@ define void @st4h_i16(<vscale x 8 x i16> %v0, <vscale x 8 x i16> %v1, <vscale x
                                           <vscale x 8 x i16> %v2,
                                           <vscale x 8 x i16> %v3,
                                           <vscale x 8 x i1> %pred,
-                                          <vscale x 8 x i16>* %addr)
+                                          i16* %addr)
   ret void
 }
 
-define void @st4h_f16(<vscale x 8 x half> %v0, <vscale x 8 x half> %v1, <vscale x 8 x half> %v2, <vscale x 8 x half> %v3, <vscale x 8 x i1> %pred, <vscale x 8 x half>* %addr) {
+define void @st4h_f16(<vscale x 8 x half> %v0, <vscale x 8 x half> %v1, <vscale x 8 x half> %v2, <vscale x 8 x half> %v3, <vscale x 8 x i1> %pred, half* %addr) {
 ; CHECK-LABEL: st4h_f16:
 ; CHECK: st4h { z0.h, z1.h, z2.h, z3.h }, p0, [x0]
 ; CHECK-NEXT: ret
@@ -236,7 +236,7 @@ define void @st4h_f16(<vscale x 8 x half> %v0, <vscale x 8 x half> %v1, <vscale
                                           <vscale x 8 x half> %v2,
                                           <vscale x 8 x half> %v3,
                                           <vscale x 8 x i1> %pred,
-                                          <vscale x 8 x half>* %addr)
+                                          half* %addr)
   ret void
 }
 
@@ -244,7 +244,7 @@ define void @st4h_f16(<vscale x 8 x half> %v0, <vscale x 8 x half> %v1, <vscale
 ; ST4W
 ;
 
-define void @st4w_i32(<vscale x 4 x i32> %v0, <vscale x 4 x i32> %v1, <vscale x 4 x i32> %v2, <vscale x 4 x i32> %v3, <vscale x 4 x i1> %pred, <vscale x 4 x i32>* %addr) {
+define void @st4w_i32(<vscale x 4 x i32> %v0, <vscale x 4 x i32> %v1, <vscale x 4 x i32> %v2, <vscale x 4 x i32> %v3, <vscale x 4 x i1> %pred, i32* %addr) {
 ; CHECK-LABEL: st4w_i32:
 ; CHECK: st4w { z0.s, z1.s, z2.s, z3.s }, p0, [x0]
 ; CHECK-NEXT: ret
@@ -253,11 +253,11 @@ define void @st4w_i32(<vscale x 4 x i32> %v0, <vscale x 4 x i32> %v1, <vscale x
                                           <vscale x 4 x i32> %v2,
                                           <vscale x 4 x i32> %v3,
                                           <vscale x 4 x i1> %pred,
-                                          <vscale x 4 x i32>* %addr)
+                                          i32* %addr)
   ret void
 }
 
-define void @st4w_f32(<vscale x 4 x float> %v0, <vscale x 4 x float> %v1, <vscale x 4 x float> %v2, <vscale x 4 x float> %v3, <vscale x 4 x i1> %pred, <vscale x 4 x float>* %addr) {
+define void @st4w_f32(<vscale x 4 x float> %v0, <vscale x 4 x float> %v1, <vscale x 4 x float> %v2, <vscale x 4 x float> %v3, <vscale x 4 x i1> %pred, float* %addr) {
 ; CHECK-LABEL: st4w_f32:
 ; CHECK: st4w { z0.s, z1.s, z2.s, z3.s }, p0, [x0]
 ; CHECK-NEXT: ret
@@ -266,7 +266,7 @@ define void @st4w_f32(<vscale x 4 x float> %v0, <vscale x 4 x float> %v1, <vscal
                                           <vscale x 4 x float> %v2,
                                           <vscale x 4 x float> %v3,
                                           <vscale x 4 x i1> %pred,
-                                          <vscale x 4 x float>* %addr)
+                                          float* %addr)
   ret void
 }
 
@@ -274,7 +274,7 @@ define void @st4w_f32(<vscale x 4 x float> %v0, <vscale x 4 x float> %v1, <vscal
 ; ST4D
 ;
 
-define void @st4d_i64(<vscale x 2 x i64> %v0, <vscale x 2 x i64> %v1, <vscale x 2 x i64> %v2, <vscale x 2 x i64> %v3, <vscale x 2 x i1> %pred, <vscale x 2 x i64>* %addr) {
+define void @st4d_i64(<vscale x 2 x i64> %v0, <vscale x 2 x i64> %v1, <vscale x 2 x i64> %v2, <vscale x 2 x i64> %v3, <vscale x 2 x i1> %pred, i64* %addr) {
 ; CHECK-LABEL: st4d_i64:
 ; CHECK: st4d { z0.d, z1.d, z2.d, z3.d }, p0, [x0]
 ; CHECK-NEXT: ret
@@ -283,11 +283,11 @@ define void @st4d_i64(<vscale x 2 x i64> %v0, <vscale x 2 x i64> %v1, <vscale x
                                           <vscale x 2 x i64> %v2,
                                           <vscale x 2 x i64> %v3,
                                           <vscale x 2 x i1> %pred,
-                                          <vscale x 2 x i64>* %addr)
+                                          i64* %addr)
   ret void
 }
 
-define void @st4d_f64(<vscale x 2 x double> %v0, <vscale x 2 x double> %v1, <vscale x 2 x double> %v2, <vscale x 2 x double> %v3, <vscale x 2 x i1> %pred, <vscale x 2 x double>* %addr) {
+define void @st4d_f64(<vscale x 2 x double> %v0, <vscale x 2 x double> %v1, <vscale x 2 x double> %v2, <vscale x 2 x double> %v3, <vscale x 2 x i1> %pred, double* %addr) {
 ; CHECK-LABEL: st4d_f64:
 ; CHECK: st4d { z0.d, z1.d, z2.d, z3.d }, p0, [x0]
 ; CHECK-NEXT: ret
@@ -296,7 +296,7 @@ define void @st4d_f64(<vscale x 2 x double> %v0, <vscale x 2 x double> %v1, <vsc
                                           <vscale x 2 x double> %v2,
                                           <vscale x 2 x double> %v3,
                                           <vscale x 2 x i1> %pred,
-                                          <vscale x 2 x double>* %addr)
+                                          double* %addr)
   ret void
 }
 
@@ -387,29 +387,29 @@ define void @stnt1d_f64(<vscale x 2 x double> %data, <vscale x 2 x i1> %pred, do
 }
 
 
-declare void @llvm.aarch64.sve.st2.nxv16i8(<vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i1>, <vscale x 16 x i8>*)
-declare void @llvm.aarch64.sve.st2.nxv8i16(<vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i1>, <vscale x 8 x i16>*)
-declare void @llvm.aarch64.sve.st2.nxv4i32(<vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i1>, <vscale x 4 x i32>*)
-declare void @llvm.aarch64.sve.st2.nxv2i64(<vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i1>, <vscale x 2 x i64>*)
-declare void @llvm.aarch64.sve.st2.nxv8f16(<vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x i1>, <vscale x 8 x half>*)
-declare void @llvm.aarch64.sve.st2.nxv4f32(<vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x i1>, <vscale x 4 x float>*)
-declare void @llvm.aarch64.sve.st2.nxv2f64(<vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x i1>, <vscale x 2 x double>*)
-
-declare void @llvm.aarch64.sve.st3.nxv16i8(<vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i1>, <vscale x 16 x i8>*)
-declare void @llvm.aarch64.sve.st3.nxv8i16(<vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i1>, <vscale x 8 x i16>*)
-declare void @llvm.aarch64.sve.st3.nxv4i32(<vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i1>, <vscale x 4 x i32>*)
-declare void @llvm.aarch64.sve.st3.nxv2i64(<vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i1>, <vscale x 2 x i64>*)
-declare void @llvm.aarch64.sve.st3.nxv8f16(<vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x i1>, <vscale x 8 x half>*)
-declare void @llvm.aarch64.sve.st3.nxv4f32(<vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x i1>, <vscale x 4 x float>*)
-declare void @llvm.aarch64.sve.st3.nxv2f64(<vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x i1>, <vscale x 2 x double>*)
-
-declare void @llvm.aarch64.sve.st4.nxv16i8(<vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i1>, <vscale x 16 x i8>*)
-declare void @llvm.aarch64.sve.st4.nxv8i16(<vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i1>, <vscale x 8 x i16>*)
-declare void @llvm.aarch64.sve.st4.nxv4i32(<vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i1>, <vscale x 4 x i32>*)
-declare void @llvm.aarch64.sve.st4.nxv2i64(<vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i1>, <vscale x 2 x i64>*)
-declare void @llvm.aarch64.sve.st4.nxv8f16(<vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x i1>, <vscale x 8 x half>*)
-declare void @llvm.aarch64.sve.st4.nxv4f32(<vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x i1>, <vscale x 4 x float>*)
-declare void @llvm.aarch64.sve.st4.nxv2f64(<vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x i1>, <vscale x 2 x double>*)
+declare void @llvm.aarch64.sve.st2.nxv16i8(<vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i1>, i8*)
+declare void @llvm.aarch64.sve.st2.nxv8i16(<vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i1>, i16*)
+declare void @llvm.aarch64.sve.st2.nxv4i32(<vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i1>, i32*)
+declare void @llvm.aarch64.sve.st2.nxv2i64(<vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i1>, i64*)
+declare void @llvm.aarch64.sve.st2.nxv8f16(<vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x i1>, half*)
+declare void @llvm.aarch64.sve.st2.nxv4f32(<vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x i1>, float*)
+declare void @llvm.aarch64.sve.st2.nxv2f64(<vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x i1>, double*)
+
+declare void @llvm.aarch64.sve.st3.nxv16i8(<vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i1>, i8*)
+declare void @llvm.aarch64.sve.st3.nxv8i16(<vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i1>, i16*)
+declare void @llvm.aarch64.sve.st3.nxv4i32(<vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i1>, i32*)
+declare void @llvm.aarch64.sve.st3.nxv2i64(<vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i1>, i64*)
+declare void @llvm.aarch64.sve.st3.nxv8f16(<vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x i1>, half*)
+declare void @llvm.aarch64.sve.st3.nxv4f32(<vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x i1>, float*)
+declare void @llvm.aarch64.sve.st3.nxv2f64(<vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x i1>, double*)
+
+declare void @llvm.aarch64.sve.st4.nxv16i8(<vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i1>, i8*)
+declare void @llvm.aarch64.sve.st4.nxv8i16(<vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i1>, i16*)
+declare void @llvm.aarch64.sve.st4.nxv4i32(<vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i1>, i32*)
+declare void @llvm.aarch64.sve.st4.nxv2i64(<vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i1>, i64*)
+declare void @llvm.aarch64.sve.st4.nxv8f16(<vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x i1>, half*)
+declare void @llvm.aarch64.sve.st4.nxv4f32(<vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x i1>, float*)
+declare void @llvm.aarch64.sve.st4.nxv2f64(<vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x i1>, double*)
 
 declare void @llvm.aarch64.sve.stnt1.nxv16i8(<vscale x 16 x i8>, <vscale x 16 x i1>, i8*)
 declare void @llvm.aarch64.sve.stnt1.nxv8i16(<vscale x 8 x i16>, <vscale x 8 x i1>, i16*)
-- 
2.7.4