SVE_VECTOR_TYPE("__SVFloat32_t", "__SVFloat32_t", SveFloat32, SveFloat32Ty, 4, 32, true, true, false)
SVE_VECTOR_TYPE("__SVFloat64_t", "__SVFloat64_t", SveFloat64, SveFloat64Ty, 2, 64, true, true, false)
-SVE_VECTOR_TYPE("__SVBFloat16_t", "__SVBFloat16_t", SveBFloat16, SveBFloat16Ty, 8, 16, false, false, true)
+SVE_VECTOR_TYPE("__SVBFloat16_t", "__SVBFloat16_t", SveBFloat16, SveBFloat16Ty, 8, 16, true, false, true)
//
// x2
//
SVE_VECTOR_TYPE("__clang_svfloat32x2_t", "svfloat32x2_t", SveFloat32x2, SveFloat32x2Ty, 8, 32, true, true, false)
SVE_VECTOR_TYPE("__clang_svfloat64x2_t", "svfloat64x2_t", SveFloat64x2, SveFloat64x2Ty, 4, 64, true, true, false)
+SVE_VECTOR_TYPE("__clang_svbfloat16x2_t", "svbfloat16x2_t", SveBFloat16x2, SveBFloat16x2Ty, 16, 16, true, false, true)
//
// x3
//
SVE_VECTOR_TYPE("__clang_svfloat32x3_t", "svfloat32x3_t", SveFloat32x3, SveFloat32x3Ty, 12, 32, true, true, false)
SVE_VECTOR_TYPE("__clang_svfloat64x3_t", "svfloat64x3_t", SveFloat64x3, SveFloat64x3Ty, 6, 64, true, true, false)
+SVE_VECTOR_TYPE("__clang_svbfloat16x3_t", "svbfloat16x3_t", SveBFloat16x3, SveBFloat16x3Ty, 24, 16, true, false, true)
//
// x4
//
SVE_VECTOR_TYPE("__clang_svfloat32x4_t", "svfloat32x4_t", SveFloat32x4, SveFloat32x4Ty, 16, 32, true, true, false)
SVE_VECTOR_TYPE("__clang_svfloat64x4_t", "svfloat64x4_t", SveFloat64x4, SveFloat64x4Ty, 8, 64, true, true, false)
+SVE_VECTOR_TYPE("__clang_svbfloat16x4_t", "svbfloat16x4_t", SveBFloat16x4, SveBFloat16x4Ty, 32, 16, true, false, true)
+
SVE_PREDICATE_TYPE("__SVBool_t", "__SVBool_t", SveBool, SveBoolTy, 16)
#undef SVE_VECTOR_TYPE
// Load one quadword and replicate (scalar base)
def SVLD1RQ : SInst<"svld1rq[_{2}]", "dPc", "csilUcUsUiUlhfd", MergeNone, "aarch64_sve_ld1rq">;
+multiclass StructLoad<string name, string proto, string i> {
+ def : SInst<name, proto, "csilUcUsUiUlhfd", MergeNone, i, [IsStructLoad]>;
+ let ArchGuard = "defined(__ARM_FEATURE_SVE_BF16)" in {
+    def : SInst<name, proto, "b", MergeNone, i, [IsStructLoad]>;
+ }
+}
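+// For example, "defm SVLD2 : StructLoad<...>" below expands to the original
+// unguarded definition for "csilUcUsUiUlhfd" plus an identical "b" (bfloat)
+// definition guarded by __ARM_FEATURE_SVE_BF16.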
+
// Load N-element structure into N vectors (scalar base)
-def SVLD2 : SInst<"svld2[_{2}]", "2Pc", "csilUcUsUiUlhfd", MergeNone, "aarch64_sve_ld2", [IsStructLoad]>;
-def SVLD3 : SInst<"svld3[_{2}]", "3Pc", "csilUcUsUiUlhfd", MergeNone, "aarch64_sve_ld3", [IsStructLoad]>;
-def SVLD4 : SInst<"svld4[_{2}]", "4Pc", "csilUcUsUiUlhfd", MergeNone, "aarch64_sve_ld4", [IsStructLoad]>;
+defm SVLD2 : StructLoad<"svld2[_{2}]", "2Pc", "aarch64_sve_ld2">;
+defm SVLD3 : StructLoad<"svld3[_{2}]", "3Pc", "aarch64_sve_ld3">;
+defm SVLD4 : StructLoad<"svld4[_{2}]", "4Pc", "aarch64_sve_ld4">;
// Load N-element structure into N vectors (scalar base, VL displacement)
-def SVLD2_VNUM : SInst<"svld2_vnum[_{2}]", "2Pcl", "csilUcUsUiUlhfd", MergeNone, "aarch64_sve_ld2", [IsStructLoad]>;
-def SVLD3_VNUM : SInst<"svld3_vnum[_{2}]", "3Pcl", "csilUcUsUiUlhfd", MergeNone, "aarch64_sve_ld3", [IsStructLoad]>;
-def SVLD4_VNUM : SInst<"svld4_vnum[_{2}]", "4Pcl", "csilUcUsUiUlhfd", MergeNone, "aarch64_sve_ld4", [IsStructLoad]>;
+defm SVLD2_VNUM : StructLoad<"svld2_vnum[_{2}]", "2Pcl", "aarch64_sve_ld2">;
+defm SVLD3_VNUM : StructLoad<"svld3_vnum[_{2}]", "3Pcl", "aarch64_sve_ld3">;
+defm SVLD4_VNUM : StructLoad<"svld4_vnum[_{2}]", "4Pcl", "aarch64_sve_ld4">;
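+// In the prototype strings above, "2"/"3"/"4" denote a tuple of that many
+// vectors, "P" the governing predicate, "c" a const pointer to the element
+// type, and "l" the int64_t vnum operand.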
// Load one octoword and replicate (scalar base)
let ArchGuard = "defined(__ARM_FEATURE_SVE_MATMUL_FP64)" in {
def SVST1H_SCATTER_INDEX_S : MInst<"svst1h_scatter[_{2}base]_index[_{d}]", "vPuld", "ilUiUl", [IsScatterStore], MemEltTyInt16, "aarch64_sve_st1_scatter_scalar_offset">;
def SVST1W_SCATTER_INDEX_S : MInst<"svst1w_scatter[_{2}base]_index[_{d}]", "vPuld", "lUl", [IsScatterStore], MemEltTyInt32, "aarch64_sve_st1_scatter_scalar_offset">;
+multiclass StructStore<string name, string proto, string i> {
+ def : SInst<name, proto, "csilUcUsUiUlhfd", MergeNone, i, [IsStructStore]>;
+ let ArchGuard = "defined(__ARM_FEATURE_SVE_BF16)" in {
+    def : SInst<name, proto, "b", MergeNone, i, [IsStructStore]>;
+ }
+}
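+
+// As with StructLoad above, each StructStore instantiation expands to the
+// original definition for "csilUcUsUiUlhfd" plus a "b" (bfloat) definition
+// guarded by __ARM_FEATURE_SVE_BF16.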
// Store N vectors into N-element structure (scalar base)
-def SVST2 : SInst<"svst2[_{d}]", "vPp2", "csilUcUsUiUlhfd", MergeNone, "aarch64_sve_st2", [IsStructStore]>;
-def SVST3 : SInst<"svst3[_{d}]", "vPp3", "csilUcUsUiUlhfd", MergeNone, "aarch64_sve_st3", [IsStructStore]>;
-def SVST4 : SInst<"svst4[_{d}]", "vPp4", "csilUcUsUiUlhfd", MergeNone, "aarch64_sve_st4", [IsStructStore]>;
+defm SVST2 : StructStore<"svst2[_{d}]", "vPp2", "aarch64_sve_st2">;
+defm SVST3 : StructStore<"svst3[_{d}]", "vPp3", "aarch64_sve_st3">;
+defm SVST4 : StructStore<"svst4[_{d}]", "vPp4", "aarch64_sve_st4">;
// Store N vectors into N-element structure (scalar base, VL displacement)
-def SVST2_VNUM : SInst<"svst2_vnum[_{d}]", "vPpl2", "csilUcUsUiUlhfd", MergeNone, "aarch64_sve_st2", [IsStructStore]>;
-def SVST3_VNUM : SInst<"svst3_vnum[_{d}]", "vPpl3", "csilUcUsUiUlhfd", MergeNone, "aarch64_sve_st3", [IsStructStore]>;
-def SVST4_VNUM : SInst<"svst4_vnum[_{d}]", "vPpl4", "csilUcUsUiUlhfd", MergeNone, "aarch64_sve_st4", [IsStructStore]>;
+defm SVST2_VNUM : StructStore<"svst2_vnum[_{d}]", "vPpl2", "aarch64_sve_st2">;
+defm SVST3_VNUM : StructStore<"svst3_vnum[_{d}]", "vPpl3", "aarch64_sve_st3">;
+defm SVST4_VNUM : StructStore<"svst4_vnum[_{d}]", "vPpl4", "aarch64_sve_st4">;
// Store one vector, with no truncation, non-temporal (scalar base)
def SVSTNT1 : MInst<"svstnt1[_{d}]", "vPpd", "csilUcUsUiUlhfd", [IsStore], MemEltTyDefault, "aarch64_sve_stnt1">;
return GET_SVE_FP_VEC(DoubleTy, false, 8);
case BuiltinType::SveBFloat16:
return GET_SVE_FP_VEC(BFloat16Ty, false, 8);
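+  // The bfloat16 tuple types below scale the element count by the number of
+  // vectors in the tuple: 8 elements per vector, so x2 = 16, x3 = 24 and
+  // x4 = 32.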
+ case BuiltinType::SveBFloat16x2:
+ return GET_SVE_FP_VEC(BFloat16Ty, false, 16);
+ case BuiltinType::SveBFloat16x3:
+ return GET_SVE_FP_VEC(BFloat16Ty, false, 24);
+ case BuiltinType::SveBFloat16x4:
+ return GET_SVE_FP_VEC(BFloat16Ty, false, 32);
#undef GET_SVE_FP_VEC
case BuiltinType::Dependent:
#define BUILTIN_TYPE(Id, SingletonId)
--- /dev/null
+// RUN: %clang_cc1 -D__ARM_FEATURE_SVE -D__ARM_FEATURE_SVE_BF16 -D__ARM_FEATURE_BF16_SCALAR_ARITHMETIC -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -fallow-half-arguments-and-returns -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -D__ARM_FEATURE_SVE -D__ARM_FEATURE_SVE_BF16 -D__ARM_FEATURE_BF16_SCALAR_ARITHMETIC -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -fallow-half-arguments-and-returns -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+
+#include <arm_sve.h>
+
+#ifdef SVE_OVERLOADED_FORMS
+// A simple used,unused... macro, long enough to represent any SVE builtin.
+#define SVE_ACLE_FUNC(A1,A2_UNUSED,A3,A4_UNUSED) A1##A3
+#else
+#define SVE_ACLE_FUNC(A1,A2,A3,A4) A1##A2##A3##A4
+#endif
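+// For example, SVE_ACLE_FUNC(svld2,_bf16,,) pastes to svld2_bf16 by default,
+// or to the overloaded name svld2 when SVE_OVERLOADED_FORMS is defined.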
+
+svbfloat16x2_t test_svld2_bf16(svbool_t pg, const bfloat16_t *base)
+{
+ // CHECK-LABEL: test_svld2_bf16
+ // CHECK: %[[PG:.*]] = call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> %pg)
+ // CHECK: %[[LOAD:.*]] = call <vscale x 16 x bfloat> @llvm.aarch64.sve.ld2.nxv16bf16.nxv8i1(<vscale x 8 x i1> %[[PG]], bfloat* %base)
+ // CHECK-NEXT: ret <vscale x 16 x bfloat> %[[LOAD]]
+ return SVE_ACLE_FUNC(svld2,_bf16,,)(pg, base);
+}
+
+svbfloat16x2_t test_svld2_vnum_bf16(svbool_t pg, const bfloat16_t *base, int64_t vnum)
+{
+ // CHECK-LABEL: test_svld2_vnum_bf16
+ // CHECK-DAG: %[[PG:.*]] = call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> %pg)
+ // CHECK-DAG: %[[BASE:.*]] = bitcast bfloat* %base to <vscale x 8 x bfloat>*
+ // CHECK-DAG: %[[GEP:.*]] = getelementptr <vscale x 8 x bfloat>, <vscale x 8 x bfloat>* %[[BASE]], i64 %vnum, i64 0
+ // CHECK: %[[LOAD:.*]] = call <vscale x 16 x bfloat> @llvm.aarch64.sve.ld2.nxv16bf16.nxv8i1(<vscale x 8 x i1> %[[PG]], bfloat* %[[GEP]])
+ // CHECK-NEXT: ret <vscale x 16 x bfloat> %[[LOAD]]
+ return SVE_ACLE_FUNC(svld2_vnum,_bf16,,)(pg, base, vnum);
+}
--- /dev/null
+// RUN: %clang_cc1 -D__ARM_FEATURE_SVE -D__ARM_FEATURE_SVE_BF16 -D__ARM_FEATURE_BF16_SCALAR_ARITHMETIC -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -fallow-half-arguments-and-returns -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -D__ARM_FEATURE_SVE -D__ARM_FEATURE_SVE_BF16 -D__ARM_FEATURE_BF16_SCALAR_ARITHMETIC -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -fallow-half-arguments-and-returns -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+
+#include <arm_sve.h>
+
+#ifdef SVE_OVERLOADED_FORMS
+// A simple used,unused... macro, long enough to represent any SVE builtin.
+#define SVE_ACLE_FUNC(A1,A2_UNUSED,A3,A4_UNUSED) A1##A3
+#else
+#define SVE_ACLE_FUNC(A1,A2,A3,A4) A1##A2##A3##A4
+#endif
+
+svbfloat16x3_t test_svld3_bf16(svbool_t pg, const bfloat16_t *base)
+{
+ // CHECK-LABEL: test_svld3_bf16
+ // CHECK: %[[PG:.*]] = call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> %pg)
+ // CHECK: %[[LOAD:.*]] = call <vscale x 24 x bfloat> @llvm.aarch64.sve.ld3.nxv24bf16.nxv8i1(<vscale x 8 x i1> %[[PG]], bfloat* %base)
+ // CHECK-NEXT: ret <vscale x 24 x bfloat> %[[LOAD]]
+ return SVE_ACLE_FUNC(svld3,_bf16,,)(pg, base);
+}
+
+svbfloat16x3_t test_svld3_vnum_bf16(svbool_t pg, const bfloat16_t *base, int64_t vnum)
+{
+ // CHECK-LABEL: test_svld3_vnum_bf16
+ // CHECK-DAG: %[[PG:.*]] = call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> %pg)
+ // CHECK-DAG: %[[BASE:.*]] = bitcast bfloat* %base to <vscale x 8 x bfloat>*
+ // CHECK-DAG: %[[GEP:.*]] = getelementptr <vscale x 8 x bfloat>, <vscale x 8 x bfloat>* %[[BASE]], i64 %vnum, i64 0
+ // CHECK: %[[LOAD:.*]] = call <vscale x 24 x bfloat> @llvm.aarch64.sve.ld3.nxv24bf16.nxv8i1(<vscale x 8 x i1> %[[PG]], bfloat* %[[GEP]])
+ // CHECK-NEXT: ret <vscale x 24 x bfloat> %[[LOAD]]
+ return SVE_ACLE_FUNC(svld3_vnum,_bf16,,)(pg, base, vnum);
+}
--- /dev/null
+// RUN: %clang_cc1 -D__ARM_FEATURE_SVE -D__ARM_FEATURE_SVE_BF16 -D__ARM_FEATURE_BF16_SCALAR_ARITHMETIC -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -fallow-half-arguments-and-returns -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -D__ARM_FEATURE_SVE -D__ARM_FEATURE_SVE_BF16 -D__ARM_FEATURE_BF16_SCALAR_ARITHMETIC -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -fallow-half-arguments-and-returns -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+
+#include <arm_sve.h>
+
+#ifdef SVE_OVERLOADED_FORMS
+// A simple used,unused... macro, long enough to represent any SVE builtin.
+#define SVE_ACLE_FUNC(A1,A2_UNUSED,A3,A4_UNUSED) A1##A3
+#else
+#define SVE_ACLE_FUNC(A1,A2,A3,A4) A1##A2##A3##A4
+#endif
+
+svbfloat16x4_t test_svld4_bf16(svbool_t pg, const bfloat16_t *base)
+{
+ // CHECK-LABEL: test_svld4_bf16
+ // CHECK: %[[PG:.*]] = call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> %pg)
+ // CHECK: %[[LOAD:.*]] = call <vscale x 32 x bfloat> @llvm.aarch64.sve.ld4.nxv32bf16.nxv8i1(<vscale x 8 x i1> %[[PG]], bfloat* %base)
+ // CHECK-NEXT: ret <vscale x 32 x bfloat> %[[LOAD]]
+ return SVE_ACLE_FUNC(svld4,_bf16,,)(pg, base);
+}
+
+svbfloat16x4_t test_svld4_vnum_bf16(svbool_t pg, const bfloat16_t *base, int64_t vnum)
+{
+ // CHECK-LABEL: test_svld4_vnum_bf16
+ // CHECK-DAG: %[[PG:.*]] = call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> %pg)
+ // CHECK-DAG: %[[BASE:.*]] = bitcast bfloat* %base to <vscale x 8 x bfloat>*
+ // CHECK-DAG: %[[GEP:.*]] = getelementptr <vscale x 8 x bfloat>, <vscale x 8 x bfloat>* %[[BASE]], i64 %vnum, i64 0
+ // CHECK: %[[LOAD:.*]] = call <vscale x 32 x bfloat> @llvm.aarch64.sve.ld4.nxv32bf16.nxv8i1(<vscale x 8 x i1> %[[PG]], bfloat* %[[GEP]])
+ // CHECK-NEXT: ret <vscale x 32 x bfloat> %[[LOAD]]
+ return SVE_ACLE_FUNC(svld4_vnum,_bf16,,)(pg, base, vnum);
+}
--- /dev/null
+// RUN: %clang_cc1 -D__ARM_FEATURE_SVE -D__ARM_FEATURE_SVE_BF16 -D__ARM_FEATURE_BF16_SCALAR_ARITHMETIC -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -fallow-half-arguments-and-returns -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -D__ARM_FEATURE_SVE -D__ARM_FEATURE_SVE_BF16 -D__ARM_FEATURE_BF16_SCALAR_ARITHMETIC -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -fallow-half-arguments-and-returns -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+
+#include <arm_sve.h>
+
+#ifdef SVE_OVERLOADED_FORMS
+// A simple used,unused... macro, long enough to represent any SVE builtin.
+#define SVE_ACLE_FUNC(A1,A2_UNUSED,A3,A4_UNUSED) A1##A3
+#else
+#define SVE_ACLE_FUNC(A1,A2,A3,A4) A1##A2##A3##A4
+#endif
+
+void test_svst2_bf16(svbool_t pg, bfloat16_t *base, svbfloat16x2_t data)
+{
+ // CHECK-LABEL: test_svst2_bf16
+ // CHECK-DAG: %[[V0:.*]] = call <vscale x 8 x bfloat> @llvm.aarch64.sve.tuple.get.nxv8bf16.nxv16bf16(<vscale x 16 x bfloat> %data, i32 0)
+ // CHECK-DAG: %[[V1:.*]] = call <vscale x 8 x bfloat> @llvm.aarch64.sve.tuple.get.nxv8bf16.nxv16bf16(<vscale x 16 x bfloat> %data, i32 1)
+ // CHECK-DAG: %[[PG:.*]] = call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> %pg)
+ // CHECK: call void @llvm.aarch64.sve.st2.nxv8bf16(<vscale x 8 x bfloat> %[[V0]], <vscale x 8 x bfloat> %[[V1]], <vscale x 8 x i1> %[[PG]], bfloat* %base)
+ // CHECK-NEXT: ret
+ return SVE_ACLE_FUNC(svst2,_bf16,,)(pg, base, data);
+}
+
+void test_svst2_vnum_bf16(svbool_t pg, bfloat16_t *base, int64_t vnum, svbfloat16x2_t data)
+{
+ // CHECK-LABEL: test_svst2_vnum_bf16
+ // CHECK-DAG: %[[BITCAST:.*]] = bitcast bfloat* %base to <vscale x 8 x bfloat>*
+ // CHECK-DAG: %[[GEP:.*]] = getelementptr <vscale x 8 x bfloat>, <vscale x 8 x bfloat>* %[[BITCAST]], i64 %vnum, i64 0
+ // CHECK-DAG: %[[V0:.*]] = call <vscale x 8 x bfloat> @llvm.aarch64.sve.tuple.get.nxv8bf16.nxv16bf16(<vscale x 16 x bfloat> %data, i32 0)
+ // CHECK-DAG: %[[V1:.*]] = call <vscale x 8 x bfloat> @llvm.aarch64.sve.tuple.get.nxv8bf16.nxv16bf16(<vscale x 16 x bfloat> %data, i32 1)
+ // CHECK-DAG: %[[PG:.*]] = call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> %pg)
+ // CHECK: call void @llvm.aarch64.sve.st2.nxv8bf16(<vscale x 8 x bfloat> %[[V0]], <vscale x 8 x bfloat> %[[V1]], <vscale x 8 x i1> %[[PG]], bfloat* %[[GEP]])
+ // CHECK-NEXT: ret
+ return SVE_ACLE_FUNC(svst2_vnum,_bf16,,)(pg, base, vnum, data);
+}
--- /dev/null
+// RUN: %clang_cc1 -D__ARM_FEATURE_SVE -D__ARM_FEATURE_SVE_BF16 -D__ARM_FEATURE_BF16_SCALAR_ARITHMETIC -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -fallow-half-arguments-and-returns -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -D__ARM_FEATURE_SVE -D__ARM_FEATURE_SVE_BF16 -D__ARM_FEATURE_BF16_SCALAR_ARITHMETIC -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -fallow-half-arguments-and-returns -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+
+#include <arm_sve.h>
+
+#ifdef SVE_OVERLOADED_FORMS
+// A simple used,unused... macro, long enough to represent any SVE builtin.
+#define SVE_ACLE_FUNC(A1,A2_UNUSED,A3,A4_UNUSED) A1##A3
+#else
+#define SVE_ACLE_FUNC(A1,A2,A3,A4) A1##A2##A3##A4
+#endif
+
+void test_svst3_bf16(svbool_t pg, bfloat16_t *base, svbfloat16x3_t data)
+{
+ // CHECK-LABEL: test_svst3_bf16
+ // CHECK-DAG: %[[V0:.*]] = call <vscale x 8 x bfloat> @llvm.aarch64.sve.tuple.get.nxv8bf16.nxv24bf16(<vscale x 24 x bfloat> %data, i32 0)
+ // CHECK-DAG: %[[V1:.*]] = call <vscale x 8 x bfloat> @llvm.aarch64.sve.tuple.get.nxv8bf16.nxv24bf16(<vscale x 24 x bfloat> %data, i32 1)
+ // CHECK-DAG: %[[V2:.*]] = call <vscale x 8 x bfloat> @llvm.aarch64.sve.tuple.get.nxv8bf16.nxv24bf16(<vscale x 24 x bfloat> %data, i32 2)
+ // CHECK-DAG: %[[PG:.*]] = call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> %pg)
+ // CHECK: call void @llvm.aarch64.sve.st3.nxv8bf16(<vscale x 8 x bfloat> %[[V0]], <vscale x 8 x bfloat> %[[V1]], <vscale x 8 x bfloat> %[[V2]], <vscale x 8 x i1> %[[PG]], bfloat* %base)
+ // CHECK-NEXT: ret
+ return SVE_ACLE_FUNC(svst3,_bf16,,)(pg, base, data);
+}
+
+void test_svst3_vnum_bf16(svbool_t pg, bfloat16_t *base, int64_t vnum, svbfloat16x3_t data)
+{
+ // CHECK-LABEL: test_svst3_vnum_bf16
+ // CHECK-DAG: %[[BITCAST:.*]] = bitcast bfloat* %base to <vscale x 8 x bfloat>*
+ // CHECK-DAG: %[[GEP:.*]] = getelementptr <vscale x 8 x bfloat>, <vscale x 8 x bfloat>* %[[BITCAST]], i64 %vnum, i64 0
+ // CHECK-DAG: %[[V0:.*]] = call <vscale x 8 x bfloat> @llvm.aarch64.sve.tuple.get.nxv8bf16.nxv24bf16(<vscale x 24 x bfloat> %data, i32 0)
+ // CHECK-DAG: %[[V1:.*]] = call <vscale x 8 x bfloat> @llvm.aarch64.sve.tuple.get.nxv8bf16.nxv24bf16(<vscale x 24 x bfloat> %data, i32 1)
+ // CHECK-DAG: %[[V2:.*]] = call <vscale x 8 x bfloat> @llvm.aarch64.sve.tuple.get.nxv8bf16.nxv24bf16(<vscale x 24 x bfloat> %data, i32 2)
+ // CHECK-DAG: %[[PG:.*]] = call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> %pg)
+ // CHECK: call void @llvm.aarch64.sve.st3.nxv8bf16(<vscale x 8 x bfloat> %[[V0]], <vscale x 8 x bfloat> %[[V1]], <vscale x 8 x bfloat> %[[V2]], <vscale x 8 x i1> %[[PG]], bfloat* %[[GEP]])
+ // CHECK-NEXT: ret
+ return SVE_ACLE_FUNC(svst3_vnum,_bf16,,)(pg, base, vnum, data);
+}
--- /dev/null
+// RUN: %clang_cc1 -D__ARM_FEATURE_SVE -D__ARM_FEATURE_SVE_BF16 -D__ARM_FEATURE_BF16_SCALAR_ARITHMETIC -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -fallow-half-arguments-and-returns -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -D__ARM_FEATURE_SVE -D__ARM_FEATURE_SVE_BF16 -D__ARM_FEATURE_BF16_SCALAR_ARITHMETIC -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -fallow-half-arguments-and-returns -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+
+#include <arm_sve.h>
+
+#ifdef SVE_OVERLOADED_FORMS
+// A simple used,unused... macro, long enough to represent any SVE builtin.
+#define SVE_ACLE_FUNC(A1,A2_UNUSED,A3,A4_UNUSED) A1##A3
+#else
+#define SVE_ACLE_FUNC(A1,A2,A3,A4) A1##A2##A3##A4
+#endif
+
+void test_svst4_bf16(svbool_t pg, bfloat16_t *base, svbfloat16x4_t data)
+{
+ // CHECK-LABEL: test_svst4_bf16
+ // CHECK-DAG: %[[PG:.*]] = call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> %pg)
+ // CHECK-DAG: %[[V0:.*]] = call <vscale x 8 x bfloat> @llvm.aarch64.sve.tuple.get.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> %data, i32 0)
+ // CHECK-DAG: %[[V1:.*]] = call <vscale x 8 x bfloat> @llvm.aarch64.sve.tuple.get.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> %data, i32 1)
+ // CHECK-DAG: %[[V2:.*]] = call <vscale x 8 x bfloat> @llvm.aarch64.sve.tuple.get.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> %data, i32 2)
+ // CHECK-DAG: %[[V3:.*]] = call <vscale x 8 x bfloat> @llvm.aarch64.sve.tuple.get.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> %data, i32 3)
+ // CHECK: call void @llvm.aarch64.sve.st4.nxv8bf16(<vscale x 8 x bfloat> %[[V0]], <vscale x 8 x bfloat> %[[V1]], <vscale x 8 x bfloat> %[[V2]], <vscale x 8 x bfloat> %[[V3]], <vscale x 8 x i1> %[[PG]], bfloat* %base)
+ // CHECK-NEXT: ret
+ return SVE_ACLE_FUNC(svst4,_bf16,,)(pg, base, data);
+}
+
+void test_svst4_vnum_bf16(svbool_t pg, bfloat16_t *base, int64_t vnum, svbfloat16x4_t data)
+{
+ // CHECK-LABEL: test_svst4_vnum_bf16
+ // CHECK-DAG: %[[BITCAST:.*]] = bitcast bfloat* %base to <vscale x 8 x bfloat>*
+ // CHECK-DAG: %[[GEP:.*]] = getelementptr <vscale x 8 x bfloat>, <vscale x 8 x bfloat>* %[[BITCAST]], i64 %vnum, i64 0
+ // CHECK-DAG: %[[V0:.*]] = call <vscale x 8 x bfloat> @llvm.aarch64.sve.tuple.get.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> %data, i32 0)
+ // CHECK-DAG: %[[V1:.*]] = call <vscale x 8 x bfloat> @llvm.aarch64.sve.tuple.get.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> %data, i32 1)
+ // CHECK-DAG: %[[V2:.*]] = call <vscale x 8 x bfloat> @llvm.aarch64.sve.tuple.get.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> %data, i32 2)
+ // CHECK-DAG: %[[V3:.*]] = call <vscale x 8 x bfloat> @llvm.aarch64.sve.tuple.get.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> %data, i32 3)
+ // CHECK-DAG: %[[PG:.*]] = call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> %pg)
+ // CHECK: call void @llvm.aarch64.sve.st4.nxv8bf16(<vscale x 8 x bfloat> %[[V0]], <vscale x 8 x bfloat> %[[V1]], <vscale x 8 x bfloat> %[[V2]], <vscale x 8 x bfloat> %[[V3]], <vscale x 8 x i1> %[[PG]], bfloat* %[[GEP]])
+ // CHECK-NEXT: ret
+ return SVE_ACLE_FUNC(svst4_vnum,_bf16,,)(pg, base, vnum, data);
+}
case 'b':
Signed = false;
Float = false;
+ BFloat = false;
ElementBitwidth /= 4;
break;
case 'o':
case '@':
Signed = false;
Float = false;
+ BFloat = false;
ElementBitwidth /= 4;
NumVectors = 0;
break;
case 'K':
Signed = true;
Float = false;
+ BFloat = false;
Bitwidth = ElementBitwidth;
NumVectors = 0;
break;
case 'L':
Signed = false;
Float = false;
+ BFloat = false;
Bitwidth = ElementBitwidth;
NumVectors = 0;
break;
Predicate = false;
Signed = false;
Float = false;
+ BFloat = false;
break;
case 'x':
Predicate = false;
Signed = true;
Float = false;
+ BFloat = false;
break;
case 'i':
Predicate = false;
Float = false;
+ BFloat = false;
ElementBitwidth = Bitwidth = 64;
NumVectors = 0;
Signed = false;
case 'I':
Predicate = false;
Float = false;
+ BFloat = false;
ElementBitwidth = Bitwidth = 32;
NumVectors = 0;
Signed = true;
case 'J':
Predicate = false;
Float = false;
+ BFloat = false;
ElementBitwidth = Bitwidth = 32;
NumVectors = 0;
Signed = true;
Predicate = false;
Signed = true;
Float = false;
+ BFloat = false;
ElementBitwidth = Bitwidth = 32;
NumVectors = 0;
break;
Predicate = false;
Signed = true;
Float = false;
+ BFloat = false;
ElementBitwidth = Bitwidth = 64;
NumVectors = 0;
break;
Predicate = false;
Signed = false;
Float = false;
+ BFloat = false;
ElementBitwidth = Bitwidth = 32;
NumVectors = 0;
break;
case 'g':
Signed = false;
Float = false;
+ BFloat = false;
ElementBitwidth = 64;
break;
case 't':
Signed = true;
Float = false;
+ BFloat = false;
ElementBitwidth = 32;
break;
case 'z':
Signed = false;
Float = false;
+ BFloat = false;
ElementBitwidth = 32;
break;
case 'O':
OS << "typedef __clang_svfloat64x4_t svfloat64x4_t;\n";
OS << "typedef __SVBool_t svbool_t;\n\n";
+ OS << "#ifdef __ARM_FEATURE_SVE_BF16\n";
+ OS << "typedef __clang_svbfloat16x2_t svbfloat16x2_t;\n";
+ OS << "typedef __clang_svbfloat16x3_t svbfloat16x3_t;\n";
+ OS << "typedef __clang_svbfloat16x4_t svbfloat16x4_t;\n";
+ OS << "#endif\n";
+
OS << "typedef enum\n";
OS << "{\n";
OS << " SV_POW2 = 0,\n";
case clang::BuiltinType::SveUint64x4:
case clang::BuiltinType::SveFloat16:
case clang::BuiltinType::SveBFloat16:
+ case clang::BuiltinType::SveBFloat16x2:
+ case clang::BuiltinType::SveBFloat16x3:
+ case clang::BuiltinType::SveBFloat16x4:
case clang::BuiltinType::SveFloat16x2:
case clang::BuiltinType::SveFloat16x3:
case clang::BuiltinType::SveFloat16x4:
SelectPredicatedStore</*Scale=*/0>(Node, 2, AArch64::ST2B,
AArch64::ST2B_IMM);
return;
- } else if (VT == MVT::nxv8i16 || VT == MVT::nxv8f16) {
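+ // nxv8bf16 is only legal when the subtarget implements the BF16 extension,
+ // hence the hasBF16() guard on the bfloat case here and in the matching
+ // ST3/ST4 and LD2/LD3/LD4 cases below.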
+ } else if (VT == MVT::nxv8i16 || VT == MVT::nxv8f16 ||
+ (VT == MVT::nxv8bf16 && Subtarget->hasBF16())) {
SelectPredicatedStore</*Scale=*/1>(Node, 2, AArch64::ST2H,
AArch64::ST2H_IMM);
return;
SelectPredicatedStore</*Scale=*/0>(Node, 3, AArch64::ST3B,
AArch64::ST3B_IMM);
return;
- } else if (VT == MVT::nxv8i16 || VT == MVT::nxv8f16) {
+ } else if (VT == MVT::nxv8i16 || VT == MVT::nxv8f16 ||
+ (VT == MVT::nxv8bf16 && Subtarget->hasBF16())) {
SelectPredicatedStore</*Scale=*/1>(Node, 3, AArch64::ST3H,
AArch64::ST3H_IMM);
return;
SelectPredicatedStore</*Scale=*/0>(Node, 4, AArch64::ST4B,
AArch64::ST4B_IMM);
return;
- } else if (VT == MVT::nxv8i16 || VT == MVT::nxv8f16) {
+ } else if (VT == MVT::nxv8i16 || VT == MVT::nxv8f16 ||
+ (VT == MVT::nxv8bf16 && Subtarget->hasBF16())) {
SelectPredicatedStore</*Scale=*/1>(Node, 4, AArch64::ST4H,
AArch64::ST4H_IMM);
return;
if (VT == MVT::nxv16i8) {
SelectPredicatedLoad(Node, 2, AArch64::LD2B_IMM);
return;
- } else if (VT == MVT::nxv8i16 || VT == MVT::nxv8f16) {
+ } else if (VT == MVT::nxv8i16 || VT == MVT::nxv8f16 ||
+ (VT == MVT::nxv8bf16 && Subtarget->hasBF16())) {
SelectPredicatedLoad(Node, 2, AArch64::LD2H_IMM);
return;
} else if (VT == MVT::nxv4i32 || VT == MVT::nxv4f32) {
if (VT == MVT::nxv16i8) {
SelectPredicatedLoad(Node, 3, AArch64::LD3B_IMM);
return;
- } else if (VT == MVT::nxv8i16 || VT == MVT::nxv8f16) {
+ } else if (VT == MVT::nxv8i16 || VT == MVT::nxv8f16 ||
+ (VT == MVT::nxv8bf16 && Subtarget->hasBF16())) {
SelectPredicatedLoad(Node, 3, AArch64::LD3H_IMM);
return;
} else if (VT == MVT::nxv4i32 || VT == MVT::nxv4f32) {
if (VT == MVT::nxv16i8) {
SelectPredicatedLoad(Node, 4, AArch64::LD4B_IMM);
return;
- } else if (VT == MVT::nxv8i16 || VT == MVT::nxv8f16) {
+ } else if (VT == MVT::nxv8i16 || VT == MVT::nxv8f16 ||
+ (VT == MVT::nxv8bf16 && Subtarget->hasBF16())) {
SelectPredicatedLoad(Node, 4, AArch64::LD4H_IMM);
return;
} else if (VT == MVT::nxv4i32 || VT == MVT::nxv4f32) {
-; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve -asm-verbose=0 < %s | FileCheck %s
+; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve,+bf16 -asm-verbose=0 < %s | FileCheck %s
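+; The bf16 cases below require the +bf16 target feature.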
;
; LD1RQB
ret <vscale x 16 x half> %res
}
+define <vscale x 16 x bfloat> @ld2h_bf16(<vscale x 8 x i1> %pred, bfloat* %addr) {
+; CHECK-LABEL: ld2h_bf16:
+; CHECK: ld2h { z0.h, z1.h }, p0/z, [x0]
+; CHECK-NEXT: ret
+ %res = call <vscale x 16 x bfloat> @llvm.aarch64.sve.ld2.nxv16bf16.nxv8i1.p0bf16(<vscale x 8 x i1> %pred, bfloat* %addr)
+ ret <vscale x 16 x bfloat> %res
+}
+
;
; LD2W
;
ret <vscale x 24 x half> %res
}
+define <vscale x 24 x bfloat> @ld3h_bf16(<vscale x 8 x i1> %pred, bfloat* %addr) {
+; CHECK-LABEL: ld3h_bf16:
+; CHECK: ld3h { z0.h, z1.h, z2.h }, p0/z, [x0]
+; CHECK-NEXT: ret
+ %res = call <vscale x 24 x bfloat> @llvm.aarch64.sve.ld3.nxv24bf16.nxv8i1.p0bf16(<vscale x 8 x i1> %pred, bfloat* %addr)
+ ret <vscale x 24 x bfloat> %res
+}
+
;
; LD3W
;
ret <vscale x 32 x half> %res
}
+define <vscale x 32 x bfloat> @ld4h_bf16(<vscale x 8 x i1> %pred, bfloat* %addr) {
+; CHECK-LABEL: ld4h_bf16:
+; CHECK: ld4h { z0.h, z1.h, z2.h, z3.h }, p0/z, [x0]
+; CHECK-NEXT: ret
+ %res = call <vscale x 32 x bfloat> @llvm.aarch64.sve.ld4.nxv32bf16.nxv8i1.p0bf16(<vscale x 8 x i1> %pred, bfloat* %addr)
+ ret <vscale x 32 x bfloat> %res
+}
+
;
; LD4W
;
declare <vscale x 8 x i32> @llvm.aarch64.sve.ld2.nxv8i32.nxv4i1.p0i32(<vscale x 4 x i1>, i32*)
declare <vscale x 4 x i64> @llvm.aarch64.sve.ld2.nxv4i64.nxv2i1.p0i64(<vscale x 2 x i1>, i64*)
declare <vscale x 16 x half> @llvm.aarch64.sve.ld2.nxv16f16.nxv8i1.p0f16(<vscale x 8 x i1>, half*)
+declare <vscale x 16 x bfloat> @llvm.aarch64.sve.ld2.nxv16bf16.nxv8i1.p0bf16(<vscale x 8 x i1>, bfloat*)
declare <vscale x 8 x float> @llvm.aarch64.sve.ld2.nxv8f32.nxv4i1.p0f32(<vscale x 4 x i1>, float*)
declare <vscale x 4 x double> @llvm.aarch64.sve.ld2.nxv4f64.nxv2i1.p0f64(<vscale x 2 x i1>, double*)
declare <vscale x 12 x i32> @llvm.aarch64.sve.ld3.nxv12i32.nxv4i1.p0i32(<vscale x 4 x i1>, i32*)
declare <vscale x 6 x i64> @llvm.aarch64.sve.ld3.nxv6i64.nxv2i1.p0i64(<vscale x 2 x i1>, i64*)
declare <vscale x 24 x half> @llvm.aarch64.sve.ld3.nxv24f16.nxv8i1.p0f16(<vscale x 8 x i1>, half*)
+declare <vscale x 24 x bfloat> @llvm.aarch64.sve.ld3.nxv24bf16.nxv8i1.p0bf16(<vscale x 8 x i1>, bfloat*)
declare <vscale x 12 x float> @llvm.aarch64.sve.ld3.nxv12f32.nxv4i1.p0f32(<vscale x 4 x i1>, float*)
declare <vscale x 6 x double> @llvm.aarch64.sve.ld3.nxv6f64.nxv2i1.p0f64(<vscale x 2 x i1>, double*)
declare <vscale x 16 x i32> @llvm.aarch64.sve.ld4.nxv16i32.nxv4i1.p0i32(<vscale x 4 x i1>, i32*)
declare <vscale x 8 x i64> @llvm.aarch64.sve.ld4.nxv8i64.nxv2i1.p0i64(<vscale x 2 x i1>, i64*)
declare <vscale x 32 x half> @llvm.aarch64.sve.ld4.nxv32f16.nxv8i1.p0f16(<vscale x 8 x i1>, half*)
+declare <vscale x 32 x bfloat> @llvm.aarch64.sve.ld4.nxv32bf16.nxv8i1.p0bf16(<vscale x 8 x i1>, bfloat*)
declare <vscale x 16 x float> @llvm.aarch64.sve.ld4.nxv16f32.nxv4i1.p0f32(<vscale x 4 x i1>, float*)
declare <vscale x 8 x double> @llvm.aarch64.sve.ld4.nxv8f64.nxv2i1.p0f64(<vscale x 2 x i1>, double*)
-; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve < %s 2>%t | FileCheck %s
+; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve,+bf16 < %s 2>%t | FileCheck %s
; RUN: FileCheck --check-prefix=WARN --allow-empty %s <%t
; WARN-NOT: warning
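+; The bf16 cases below require the +bf16 target feature.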
ret void
}
+define void @st2h_bf16(<vscale x 8 x bfloat> %v0, <vscale x 8 x bfloat> %v1, <vscale x 8 x i1> %pred, bfloat* %addr) {
+; CHECK-LABEL: st2h_bf16:
+; CHECK: st2h { z0.h, z1.h }, p0, [x0]
+; CHECK-NEXT: ret
+ call void @llvm.aarch64.sve.st2.nxv8bf16(<vscale x 8 x bfloat> %v0,
+ <vscale x 8 x bfloat> %v1,
+ <vscale x 8 x i1> %pred,
+ bfloat* %addr)
+ ret void
+}
+
;
; ST2W
;
ret void
}
+define void @st3h_bf16(<vscale x 8 x bfloat> %v0, <vscale x 8 x bfloat> %v1, <vscale x 8 x bfloat> %v2, <vscale x 8 x i1> %pred, bfloat* %addr) {
+; CHECK-LABEL: st3h_bf16:
+; CHECK: st3h { z0.h, z1.h, z2.h }, p0, [x0]
+; CHECK-NEXT: ret
+ call void @llvm.aarch64.sve.st3.nxv8bf16(<vscale x 8 x bfloat> %v0,
+ <vscale x 8 x bfloat> %v1,
+ <vscale x 8 x bfloat> %v2,
+ <vscale x 8 x i1> %pred,
+ bfloat* %addr)
+ ret void
+}
+
;
; ST3W
;
ret void
}
+define void @st4h_bf16(<vscale x 8 x bfloat> %v0, <vscale x 8 x bfloat> %v1, <vscale x 8 x bfloat> %v2, <vscale x 8 x bfloat> %v3, <vscale x 8 x i1> %pred, bfloat* %addr) {
+; CHECK-LABEL: st4h_bf16:
+; CHECK: st4h { z0.h, z1.h, z2.h, z3.h }, p0, [x0]
+; CHECK-NEXT: ret
+ call void @llvm.aarch64.sve.st4.nxv8bf16(<vscale x 8 x bfloat> %v0,
+ <vscale x 8 x bfloat> %v1,
+ <vscale x 8 x bfloat> %v2,
+ <vscale x 8 x bfloat> %v3,
+ <vscale x 8 x i1> %pred,
+ bfloat* %addr)
+ ret void
+}
+
;
; ST4W
;
declare void @llvm.aarch64.sve.st2.nxv4i32(<vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i1>, i32*)
declare void @llvm.aarch64.sve.st2.nxv2i64(<vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i1>, i64*)
declare void @llvm.aarch64.sve.st2.nxv8f16(<vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x i1>, half*)
+declare void @llvm.aarch64.sve.st2.nxv8bf16(<vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x i1>, bfloat*)
declare void @llvm.aarch64.sve.st2.nxv4f32(<vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x i1>, float*)
declare void @llvm.aarch64.sve.st2.nxv2f64(<vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x i1>, double*)
declare void @llvm.aarch64.sve.st3.nxv4i32(<vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i1>, i32*)
declare void @llvm.aarch64.sve.st3.nxv2i64(<vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i1>, i64*)
declare void @llvm.aarch64.sve.st3.nxv8f16(<vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x i1>, half*)
+declare void @llvm.aarch64.sve.st3.nxv8bf16(<vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x i1>, bfloat*)
declare void @llvm.aarch64.sve.st3.nxv4f32(<vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x i1>, float*)
declare void @llvm.aarch64.sve.st3.nxv2f64(<vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x i1>, double*)
declare void @llvm.aarch64.sve.st4.nxv4i32(<vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i1>, i32*)
declare void @llvm.aarch64.sve.st4.nxv2i64(<vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i1>, i64*)
declare void @llvm.aarch64.sve.st4.nxv8f16(<vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x i1>, half*)
+declare void @llvm.aarch64.sve.st4.nxv8bf16(<vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x i1>, bfloat*)
declare void @llvm.aarch64.sve.st4.nxv4f32(<vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x i1>, float*)
declare void @llvm.aarch64.sve.st4.nxv2f64(<vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x i1>, double*)