def SVLD1SW : MInst<"svld1sw_{d}", "dPU", "lUl", [IsLoad], MemEltTyInt32, "aarch64_sve_ld1">;
def SVLD1UW : MInst<"svld1uw_{d}", "dPY", "lUl", [IsLoad, IsZExtReturn], MemEltTyInt32, "aarch64_sve_ld1">;
+let ArchGuard = "defined(__ARM_FEATURE_SVE_BF16)" in {
+ def SVLD1_BF : MInst<"svld1[_{2}]", "dPc", "b", [IsLoad], MemEltTyDefault, "aarch64_sve_ld1">;
+ def SVLD1_VNUM_BF : MInst<"svld1_vnum[_{2}]", "dPcl", "b", [IsLoad], MemEltTyDefault, "aarch64_sve_ld1">;
+}
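+// With this guard the bfloat16 forms, e.g. svld1_bf16(pg, base) and
+// svld1_vnum_bf16(pg, base, vnum) (or the overloaded svld1/svld1_vnum), are
+// only declared when __ARM_FEATURE_SVE_BF16 is defined; the C tests below
+// exercise both spellings.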
+
// Load one vector (scalar base, VL displacement)
def SVLD1_VNUM : MInst<"svld1_vnum[_{2}]", "dPcl", "csilUcUsUiUlhfd", [IsLoad], MemEltTyDefault, "aarch64_sve_ld1">;
def SVLD1SB_VNUM : MInst<"svld1sb_vnum_{d}", "dPSl", "silUsUiUl", [IsLoad], MemEltTyInt8, "aarch64_sve_ld1">;
def SVLDFF1SW_VNUM : MInst<"svldff1sw_vnum_{d}", "dPUl", "lUl", [IsLoad], MemEltTyInt32, "aarch64_sve_ldff1">;
def SVLDFF1UW_VNUM : MInst<"svldff1uw_vnum_{d}", "dPYl", "lUl", [IsLoad, IsZExtReturn], MemEltTyInt32, "aarch64_sve_ldff1">;
+let ArchGuard = "defined(__ARM_FEATURE_SVE_BF16)" in {
+ def SVLDFF1_BF : MInst<"svldff1[_{2}]", "dPc", "b", [IsLoad], MemEltTyDefault, "aarch64_sve_ldff1">;
+ def SVLDFF1_VNUM_BF : MInst<"svldff1_vnum[_{2}]", "dPcl", "b", [IsLoad], MemEltTyDefault, "aarch64_sve_ldff1">;
+}
+
// First-faulting load one vector (vector base)
def SVLDFF1_GATHER_BASES_U : MInst<"svldff1_gather[_{2}base]_{d}", "dPu", "ilUiUlfd", [IsGatherLoad], MemEltTyDefault, "aarch64_sve_ldff1_gather_scalar_offset">;
def SVLDFF1SB_GATHER_BASES_U : MInst<"svldff1sb_gather[_{2}base]_{d}", "dPu", "ilUiUl", [IsGatherLoad], MemEltTyInt8, "aarch64_sve_ldff1_gather_scalar_offset">;
def SVLDNF1SW_VNUM : MInst<"svldnf1sw_vnum_{d}", "dPUl", "lUl", [IsLoad], MemEltTyInt32, "aarch64_sve_ldnf1">;
def SVLDNF1UW_VNUM : MInst<"svldnf1uw_vnum_{d}", "dPYl", "lUl", [IsLoad, IsZExtReturn], MemEltTyInt32, "aarch64_sve_ldnf1">;
+let ArchGuard = "defined(__ARM_FEATURE_SVE_BF16)" in {
+ def SVLDNF1_BF : MInst<"svldnf1[_{2}]", "dPc", "b", [IsLoad], MemEltTyDefault, "aarch64_sve_ldnf1">;
+ def SVLDNF1_VNUM_BF : MInst<"svldnf1_vnum[_{2}]", "dPcl", "b", [IsLoad], MemEltTyDefault, "aarch64_sve_ldnf1">;
+}
+
// Load one vector, unextended load, non-temporal (scalar base)
def SVLDNT1 : MInst<"svldnt1[_{2}]", "dPc", "csilUcUsUiUlhfd", [IsLoad], MemEltTyDefault, "aarch64_sve_ldnt1">;
// Load one vector, unextended load, non-temporal (scalar base, VL displacement)
def SVLDNT1_VNUM : MInst<"svldnt1_vnum[_{2}]", "dPcl", "csilUcUsUiUlhfd", [IsLoad], MemEltTyDefault, "aarch64_sve_ldnt1">;
+let ArchGuard = "defined(__ARM_FEATURE_SVE_BF16)" in {
+ def SVLDNT1_BF : MInst<"svldnt1[_{2}]", "dPc", "b", [IsLoad], MemEltTyDefault, "aarch64_sve_ldnt1">;
+ def SVLDNT1_VNUM_BF : MInst<"svldnt1_vnum[_{2}]", "dPcl", "b", [IsLoad], MemEltTyDefault, "aarch64_sve_ldnt1">;
+}
+
// Load one quadword and replicate (scalar base)
def SVLD1RQ : SInst<"svld1rq[_{2}]", "dPc", "csilUcUsUiUlhfd", MergeNone, "aarch64_sve_ld1rq">;
+let ArchGuard = "defined(__ARM_FEATURE_SVE_BF16)" in {
+ def SVLD1RQ_BF : SInst<"svld1rq[_{2}]", "dPc", "b", MergeNone, "aarch64_sve_ld1rq">;
+}
+
multiclass StructLoad<string name, string proto, string i> {
def : SInst<name, proto, "csilUcUsUiUlhfd", MergeNone, i, [IsStructLoad]>;
let ArchGuard = "defined(__ARM_FEATURE_SVE_BF16)" in {
--- /dev/null
+// RUN: %clang_cc1 -D__ARM_FEATURE_SVE -D__ARM_FEATURE_BF16_SCALAR_ARITHMETIC -D__ARM_FEATURE_SVE_BF16 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -fallow-half-arguments-and-returns -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -D__ARM_FEATURE_SVE -D__ARM_FEATURE_BF16_SCALAR_ARITHMETIC -D__ARM_FEATURE_SVE_BF16 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -fallow-half-arguments-and-returns -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -D__ARM_FEATURE_SVE -D__ARM_FEATURE_BF16_SCALAR_ARITHMETIC -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -fallow-half-arguments-and-returns -fsyntax-only -verify -verify-ignore-unexpected=error -verify-ignore-unexpected=note %s
+
+#include <arm_sve.h>
+
+#ifdef SVE_OVERLOADED_FORMS
+// A simple used,unused... macro, long enough to represent any SVE builtin.
+#define SVE_ACLE_FUNC(A1,A2_UNUSED,A3,A4_UNUSED) A1##A3
+#else
+#define SVE_ACLE_FUNC(A1,A2,A3,A4) A1##A2##A3##A4
+#endif
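+
+// For example, SVE_ACLE_FUNC(svld1,_bf16,,) pastes A1##A3 and becomes the
+// overloaded call 'svld1' when SVE_OVERLOADED_FORMS is defined; otherwise it
+// pastes A1##A2##A3##A4 and becomes the fully suffixed 'svld1_bf16'.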
+
+svbfloat16_t test_svld1_bf16(svbool_t pg, const bfloat16_t *base)
+{
+ // CHECK-LABEL: test_svld1_bf16
+ // CHECK: %[[PG:.*]] = call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> %pg)
+ // CHECK: %[[LOAD:.*]] = call <vscale x 8 x bfloat> @llvm.aarch64.sve.ld1.nxv8bf16(<vscale x 8 x i1> %[[PG]], bfloat* %base)
+ // CHECK: ret <vscale x 8 x bfloat> %[[LOAD]]
+ // expected-warning@+1 {{implicit declaration of function 'svld1_bf16'}}
+ return SVE_ACLE_FUNC(svld1,_bf16,,)(pg, base);
+}
+
+svbfloat16_t test_svld1_vnum_bf16(svbool_t pg, const bfloat16_t *base, int64_t vnum)
+{
+ // CHECK-LABEL: test_svld1_vnum_bf16
+ // CHECK-DAG: %[[PG:.*]] = call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> %pg)
+ // CHECK-DAG: %[[BITCAST:.*]] = bitcast bfloat* %base to <vscale x 8 x bfloat>*
+ // CHECK-DAG: %[[GEP:.*]] = getelementptr <vscale x 8 x bfloat>, <vscale x 8 x bfloat>* %[[BITCAST]], i64 %vnum, i64 0
+ // CHECK: %[[LOAD:.*]] = call <vscale x 8 x bfloat> @llvm.aarch64.sve.ld1.nxv8bf16(<vscale x 8 x i1> %[[PG]], bfloat* %[[GEP]])
+ // CHECK: ret <vscale x 8 x bfloat> %[[LOAD]]
+ // expected-warning@+1 {{implicit declaration of function 'svld1_vnum_bf16'}}
+ return SVE_ACLE_FUNC(svld1_vnum,_bf16,,)(pg, base, vnum);
+}
--- /dev/null
+// RUN: %clang_cc1 -D__ARM_FEATURE_SVE -D__ARM_FEATURE_BF16_SCALAR_ARITHMETIC -D__ARM_FEATURE_SVE_BF16 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -fallow-half-arguments-and-returns -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -D__ARM_FEATURE_SVE -D__ARM_FEATURE_BF16_SCALAR_ARITHMETIC -D__ARM_FEATURE_SVE_BF16 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -fallow-half-arguments-and-returns -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -D__ARM_FEATURE_SVE -D__ARM_FEATURE_BF16_SCALAR_ARITHMETIC -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -fallow-half-arguments-and-returns -fsyntax-only -verify -verify-ignore-unexpected=error -verify-ignore-unexpected=note %s
+
+#include <arm_sve.h>
+
+#ifdef SVE_OVERLOADED_FORMS
+// A simple used,unused... macro, long enough to represent any SVE builtin.
+#define SVE_ACLE_FUNC(A1,A2_UNUSED,A3,A4_UNUSED) A1##A3
+#else
+#define SVE_ACLE_FUNC(A1,A2,A3,A4) A1##A2##A3##A4
+#endif
+
+svbfloat16_t test_svld1rq_bf16(svbool_t pg, const bfloat16_t *base)
+{
+ // CHECK-LABEL: test_svld1rq_bf16
+ // CHECK: %[[PG:.*]] = call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> %pg)
+ // CHECK: %[[INTRINSIC:.*]] = call <vscale x 8 x bfloat> @llvm.aarch64.sve.ld1rq.nxv8bf16(<vscale x 8 x i1> %[[PG]], bfloat* %base)
+ // CHECK: ret <vscale x 8 x bfloat> %[[INTRINSIC]]
+ // expected-warning@+1 {{implicit declaration of function 'svld1rq_bf16'}}
+ return SVE_ACLE_FUNC(svld1rq,_bf16,,)(pg, base);
+}
--- /dev/null
+// RUN: %clang_cc1 -D__ARM_FEATURE_SVE -D__ARM_FEATURE_BF16_SCALAR_ARITHMETIC -D__ARM_FEATURE_SVE_BF16 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -fallow-half-arguments-and-returns -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -D__ARM_FEATURE_SVE -D__ARM_FEATURE_BF16_SCALAR_ARITHMETIC -D__ARM_FEATURE_SVE_BF16 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -fallow-half-arguments-and-returns -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -D__ARM_FEATURE_SVE -D__ARM_FEATURE_BF16_SCALAR_ARITHMETIC -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -fallow-half-arguments-and-returns -fsyntax-only -verify -verify-ignore-unexpected=error -verify-ignore-unexpected=note %s
+
+#include <arm_sve.h>
+
+#ifdef SVE_OVERLOADED_FORMS
+// A simple used,unused... macro, long enough to represent any SVE builtin.
+#define SVE_ACLE_FUNC(A1,A2_UNUSED,A3,A4_UNUSED) A1##A3
+#else
+#define SVE_ACLE_FUNC(A1,A2,A3,A4) A1##A2##A3##A4
+#endif
+
+svbfloat16_t test_svldff1_bf16(svbool_t pg, const bfloat16_t *base)
+{
+ // CHECK-LABEL: test_svldff1_bf16
+ // CHECK: %[[PG:.*]] = call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> %pg)
+ // CHECK: %[[LOAD:.*]] = call <vscale x 8 x bfloat> @llvm.aarch64.sve.ldff1.nxv8bf16(<vscale x 8 x i1> %[[PG]], bfloat* %base)
+ // CHECK: ret <vscale x 8 x bfloat> %[[LOAD]]
+ // expected-warning@+1 {{implicit declaration of function 'svldff1_bf16'}}
+ return SVE_ACLE_FUNC(svldff1,_bf16,,)(pg, base);
+}
+
+svbfloat16_t test_svldff1_vnum_bf16(svbool_t pg, const bfloat16_t *base, int64_t vnum)
+{
+ // CHECK-LABEL: test_svldff1_vnum_bf16
+ // CHECK-DAG: %[[BITCAST:.*]] = bitcast bfloat* %base to <vscale x 8 x bfloat>*
+ // CHECK-DAG: %[[GEP:.*]] = getelementptr <vscale x 8 x bfloat>, <vscale x 8 x bfloat>* %[[BITCAST]], i64 %vnum, i64 0
+ // CHECK-DAG: %[[PG:.*]] = call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> %pg)
+ // CHECK: %[[LOAD:.*]] = call <vscale x 8 x bfloat> @llvm.aarch64.sve.ldff1.nxv8bf16(<vscale x 8 x i1> %[[PG]], bfloat* %[[GEP]])
+ // CHECK: ret <vscale x 8 x bfloat> %[[LOAD]]
+ // expected-warning@+1 {{implicit declaration of function 'svldff1_vnum_bf16'}}
+ return SVE_ACLE_FUNC(svldff1_vnum,_bf16,,)(pg, base, vnum);
+}
--- /dev/null
+// RUN: %clang_cc1 -D__ARM_FEATURE_SVE -D__ARM_FEATURE_BF16_SCALAR_ARITHMETIC -D__ARM_FEATURE_SVE_BF16 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -fallow-half-arguments-and-returns -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -D__ARM_FEATURE_SVE -D__ARM_FEATURE_BF16_SCALAR_ARITHMETIC -D__ARM_FEATURE_SVE_BF16 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -fallow-half-arguments-and-returns -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -D__ARM_FEATURE_SVE -D__ARM_FEATURE_BF16_SCALAR_ARITHMETIC -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -fallow-half-arguments-and-returns -fsyntax-only -verify -verify-ignore-unexpected=error -verify-ignore-unexpected=note %s
+
+#include <arm_sve.h>
+
+#ifdef SVE_OVERLOADED_FORMS
+// A simple used,unused... macro, long enough to represent any SVE builtin.
+#define SVE_ACLE_FUNC(A1,A2_UNUSED,A3,A4_UNUSED) A1##A3
+#else
+#define SVE_ACLE_FUNC(A1,A2,A3,A4) A1##A2##A3##A4
+#endif
+
+svbfloat16_t test_svldnf1_bf16(svbool_t pg, const bfloat16_t *base)
+{
+ // CHECK-LABEL: test_svldnf1_bf16
+ // CHECK: %[[PG:.*]] = call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> %pg)
+ // CHECK: %[[LOAD:.*]] = call <vscale x 8 x bfloat> @llvm.aarch64.sve.ldnf1.nxv8bf16(<vscale x 8 x i1> %[[PG]], bfloat* %base)
+ // CHECK: ret <vscale x 8 x bfloat> %[[LOAD]]
+ // expected-warning@+1 {{implicit declaration of function 'svldnf1_bf16'}}
+ return SVE_ACLE_FUNC(svldnf1,_bf16,,)(pg, base);
+}
+
+svbfloat16_t test_svldnf1_vnum_bf16(svbool_t pg, const bfloat16_t *base, int64_t vnum)
+{
+ // CHECK-LABEL: test_svldnf1_vnum_bf16
+ // CHECK-DAG: %[[BITCAST:.*]] = bitcast bfloat* %base to <vscale x 8 x bfloat>*
+ // CHECK-DAG: %[[GEP:.*]] = getelementptr <vscale x 8 x bfloat>, <vscale x 8 x bfloat>* %[[BITCAST]], i64 %vnum, i64 0
+ // CHECK-DAG: %[[PG:.*]] = call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> %pg)
+ // CHECK: %[[LOAD:.*]] = call <vscale x 8 x bfloat> @llvm.aarch64.sve.ldnf1.nxv8bf16(<vscale x 8 x i1> %[[PG]], bfloat* %[[GEP]])
+ // CHECK: ret <vscale x 8 x bfloat> %[[LOAD]]
+ // expected-warning@+1 {{implicit declaration of function 'svldnf1_vnum_bf16'}}
+ return SVE_ACLE_FUNC(svldnf1_vnum,_bf16,,)(pg, base, vnum);
+}
--- /dev/null
+// RUN: %clang_cc1 -D__ARM_FEATURE_SVE -D__ARM_FEATURE_BF16_SCALAR_ARITHMETIC -D__ARM_FEATURE_SVE_BF16 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -fallow-half-arguments-and-returns -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -D__ARM_FEATURE_SVE -D__ARM_FEATURE_BF16_SCALAR_ARITHMETIC -D__ARM_FEATURE_SVE_BF16 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -fallow-half-arguments-and-returns -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -D__ARM_FEATURE_SVE -D__ARM_FEATURE_BF16_SCALAR_ARITHMETIC -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -fallow-half-arguments-and-returns -fsyntax-only -verify -verify-ignore-unexpected=error -verify-ignore-unexpected=note %s
+
+#include <arm_sve.h>
+
+#ifdef SVE_OVERLOADED_FORMS
+// A simple used,unused... macro, long enough to represent any SVE builtin.
+#define SVE_ACLE_FUNC(A1,A2_UNUSED,A3,A4_UNUSED) A1##A3
+#else
+#define SVE_ACLE_FUNC(A1,A2,A3,A4) A1##A2##A3##A4
+#endif
+
+svbfloat16_t test_svldnt1_bf16(svbool_t pg, const bfloat16_t *base)
+{
+ // CHECK-LABEL: test_svldnt1_bf16
+ // CHECK-DAG: %[[PG:.*]] = call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> %pg)
+ // CHECK: %[[LOAD:.*]] = call <vscale x 8 x bfloat> @llvm.aarch64.sve.ldnt1.nxv8bf16(<vscale x 8 x i1> %[[PG]], bfloat* %base)
+ // CHECK: ret <vscale x 8 x bfloat> %[[LOAD]]
+ // expected-warning@+1 {{implicit declaration of function 'svldnt1_bf16'}}
+ return SVE_ACLE_FUNC(svldnt1,_bf16,,)(pg, base);
+}
+
+svbfloat16_t test_svldnt1_vnum_bf16(svbool_t pg, const bfloat16_t *base, int64_t vnum)
+{
+ // CHECK-LABEL: test_svldnt1_vnum_bf16
+ // CHECK-DAG: %[[PG:.*]] = call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> %pg)
+ // CHECK-DAG: %[[BITCAST:.*]] = bitcast bfloat* %base to <vscale x 8 x bfloat>*
+ // CHECK-DAG: %[[GEP:.*]] = getelementptr <vscale x 8 x bfloat>, <vscale x 8 x bfloat>* %[[BITCAST]], i64 %vnum, i64 0
+ // CHECK: %[[LOAD:.*]] = call <vscale x 8 x bfloat> @llvm.aarch64.sve.ldnt1.nxv8bf16(<vscale x 8 x i1> %[[PG]], bfloat* %[[GEP]])
+ // CHECK: ret <vscale x 8 x bfloat> %[[LOAD]]
+ // expected-warning@+1 {{implicit declaration of function 'svldnt1_vnum_bf16'}}
+ return SVE_ACLE_FUNC(svldnt1_vnum,_bf16,,)(pg, base, vnum);
+}
defm : pred_load<nxv4f32, nxv4i1, nonext_masked_load, LD1W, LD1W_IMM, am_sve_regreg_lsl2>;
// 8-element contiguous loads
- defm : pred_load<nxv8i16, nxv8i1, zext_masked_load_i8, LD1B_H, LD1B_H_IMM, am_sve_regreg_lsl0>;
- defm : pred_load<nxv8i16, nxv8i1, asext_masked_load_i8, LD1SB_H, LD1SB_H_IMM, am_sve_regreg_lsl0>;
- defm : pred_load<nxv8i16, nxv8i1, nonext_masked_load, LD1H, LD1H_IMM, am_sve_regreg_lsl1>;
- defm : pred_load<nxv8f16, nxv8i1, nonext_masked_load, LD1H, LD1H_IMM, am_sve_regreg_lsl1>;
+ defm : pred_load<nxv8i16, nxv8i1, zext_masked_load_i8, LD1B_H, LD1B_H_IMM, am_sve_regreg_lsl0>;
+ defm : pred_load<nxv8i16, nxv8i1, asext_masked_load_i8, LD1SB_H, LD1SB_H_IMM, am_sve_regreg_lsl0>;
+ defm : pred_load<nxv8i16, nxv8i1, nonext_masked_load, LD1H, LD1H_IMM, am_sve_regreg_lsl1>;
+ defm : pred_load<nxv8f16, nxv8i1, nonext_masked_load, LD1H, LD1H_IMM, am_sve_regreg_lsl1>;
+ defm : pred_load<nxv8bf16, nxv8i1, nonext_masked_load, LD1H, LD1H_IMM, am_sve_regreg_lsl1>;
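+ // bf16 has 16-bit elements, so its masked loads reuse the LD1H/LD1H_IMM
+ // selection already used for f16 and i16 above.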
// 16-element contiguous loads
defm : pred_load<nxv16i8, nxv16i1, nonext_masked_load, LD1B, LD1B_IMM, am_sve_regreg_lsl0>;
defm : ld1<LD1W, LD1W_IMM, nxv4f32, AArch64ld1, nxv4i1, nxv4f32, am_sve_regreg_lsl2>;
// 8-element contiguous loads
- defm : ld1<LD1B_H, LD1B_H_IMM, nxv8i16, AArch64ld1, nxv8i1, nxv8i8, am_sve_regreg_lsl0>;
- defm : ld1<LD1SB_H, LD1SB_H_IMM, nxv8i16, AArch64ld1s, nxv8i1, nxv8i8, am_sve_regreg_lsl0>;
- defm : ld1<LD1H, LD1H_IMM, nxv8i16, AArch64ld1, nxv8i1, nxv8i16, am_sve_regreg_lsl1>;
- defm : ld1<LD1H, LD1H_IMM, nxv8f16, AArch64ld1, nxv8i1, nxv8f16, am_sve_regreg_lsl1>;
+ defm : ld1<LD1B_H, LD1B_H_IMM, nxv8i16, AArch64ld1, nxv8i1, nxv8i8, am_sve_regreg_lsl0>;
+ defm : ld1<LD1SB_H, LD1SB_H_IMM, nxv8i16, AArch64ld1s, nxv8i1, nxv8i8, am_sve_regreg_lsl0>;
+ defm : ld1<LD1H, LD1H_IMM, nxv8i16, AArch64ld1, nxv8i1, nxv8i16, am_sve_regreg_lsl1>;
+ defm : ld1<LD1H, LD1H_IMM, nxv8f16, AArch64ld1, nxv8i1, nxv8f16, am_sve_regreg_lsl1>;
+ defm : ld1<LD1H, LD1H_IMM, nxv8bf16, AArch64ld1, nxv8i1, nxv8bf16, am_sve_regreg_lsl1>;
// 16-element contiguous loads
defm : ld1<LD1B, LD1B_IMM, nxv16i8, AArch64ld1, nxv16i1, nxv16i8, am_sve_regreg_lsl0>;
}
// 2-element contiguous non-faulting loads
- defm : ldnf1<LDNF1B_D_IMM, nxv2i64, AArch64ldnf1, nxv2i1, nxv2i8>;
- defm : ldnf1<LDNF1SB_D_IMM, nxv2i64, AArch64ldnf1s, nxv2i1, nxv2i8>;
- defm : ldnf1<LDNF1H_D_IMM, nxv2i64, AArch64ldnf1, nxv2i1, nxv2i16>;
- defm : ldnf1<LDNF1SH_D_IMM, nxv2i64, AArch64ldnf1s, nxv2i1, nxv2i16>;
- defm : ldnf1<LDNF1W_D_IMM, nxv2i64, AArch64ldnf1, nxv2i1, nxv2i32>;
- defm : ldnf1<LDNF1SW_D_IMM, nxv2i64, AArch64ldnf1s, nxv2i1, nxv2i32>;
- defm : ldnf1<LDNF1D_IMM, nxv2i64, AArch64ldnf1, nxv2i1, nxv2i64>;
- defm : ldnf1<LDNF1D_IMM, nxv2f64, AArch64ldnf1, nxv2i1, nxv2f64>;
+ defm : ldnf1<LDNF1B_D_IMM, nxv2i64, AArch64ldnf1, nxv2i1, nxv2i8>;
+ defm : ldnf1<LDNF1SB_D_IMM, nxv2i64, AArch64ldnf1s, nxv2i1, nxv2i8>;
+ defm : ldnf1<LDNF1H_D_IMM, nxv2i64, AArch64ldnf1, nxv2i1, nxv2i16>;
+ defm : ldnf1<LDNF1SH_D_IMM, nxv2i64, AArch64ldnf1s, nxv2i1, nxv2i16>;
+ defm : ldnf1<LDNF1W_D_IMM, nxv2i64, AArch64ldnf1, nxv2i1, nxv2i32>;
+ defm : ldnf1<LDNF1SW_D_IMM, nxv2i64, AArch64ldnf1s, nxv2i1, nxv2i32>;
+ defm : ldnf1<LDNF1D_IMM, nxv2i64, AArch64ldnf1, nxv2i1, nxv2i64>;
+ defm : ldnf1<LDNF1D_IMM, nxv2f64, AArch64ldnf1, nxv2i1, nxv2f64>;
// 4-element contiguous non-faulting loads
- defm : ldnf1<LDNF1B_S_IMM, nxv4i32, AArch64ldnf1, nxv4i1, nxv4i8>;
- defm : ldnf1<LDNF1SB_S_IMM, nxv4i32, AArch64ldnf1s, nxv4i1, nxv4i8>;
- defm : ldnf1<LDNF1H_S_IMM, nxv4i32, AArch64ldnf1, nxv4i1, nxv4i16>;
- defm : ldnf1<LDNF1SH_S_IMM, nxv4i32, AArch64ldnf1s, nxv4i1, nxv4i16>;
- defm : ldnf1<LDNF1W_IMM, nxv4i32, AArch64ldnf1, nxv4i1, nxv4i32>;
- defm : ldnf1<LDNF1W_IMM, nxv4f32, AArch64ldnf1, nxv4i1, nxv4f32>;
+ defm : ldnf1<LDNF1B_S_IMM, nxv4i32, AArch64ldnf1, nxv4i1, nxv4i8>;
+ defm : ldnf1<LDNF1SB_S_IMM, nxv4i32, AArch64ldnf1s, nxv4i1, nxv4i8>;
+ defm : ldnf1<LDNF1H_S_IMM, nxv4i32, AArch64ldnf1, nxv4i1, nxv4i16>;
+ defm : ldnf1<LDNF1SH_S_IMM, nxv4i32, AArch64ldnf1s, nxv4i1, nxv4i16>;
+ defm : ldnf1<LDNF1W_IMM, nxv4i32, AArch64ldnf1, nxv4i1, nxv4i32>;
+ defm : ldnf1<LDNF1W_IMM, nxv4f32, AArch64ldnf1, nxv4i1, nxv4f32>;
// 8-element contiguous non-faulting loads
- defm : ldnf1<LDNF1B_H_IMM, nxv8i16, AArch64ldnf1, nxv8i1, nxv8i8>;
- defm : ldnf1<LDNF1SB_H_IMM, nxv8i16, AArch64ldnf1s, nxv8i1, nxv8i8>;
- defm : ldnf1<LDNF1H_IMM, nxv8i16, AArch64ldnf1, nxv8i1, nxv8i16>;
- defm : ldnf1<LDNF1H_IMM, nxv8f16, AArch64ldnf1, nxv8i1, nxv8f16>;
+ defm : ldnf1<LDNF1B_H_IMM, nxv8i16, AArch64ldnf1, nxv8i1, nxv8i8>;
+ defm : ldnf1<LDNF1SB_H_IMM, nxv8i16, AArch64ldnf1s, nxv8i1, nxv8i8>;
+ defm : ldnf1<LDNF1H_IMM, nxv8i16, AArch64ldnf1, nxv8i1, nxv8i16>;
+ defm : ldnf1<LDNF1H_IMM, nxv8f16, AArch64ldnf1, nxv8i1, nxv8f16>;
+ defm : ldnf1<LDNF1H_IMM, nxv8bf16, AArch64ldnf1, nxv8i1, nxv8bf16>;
// 16-element contiguous non-faulting loads
- defm : ldnf1<LDNF1B_IMM, nxv16i8, AArch64ldnf1, nxv16i1, nxv16i8>;
+ defm : ldnf1<LDNF1B_IMM, nxv16i8, AArch64ldnf1, nxv16i1, nxv16i8>;
multiclass ldff1<Instruction I, ValueType Ty, SDPatternOperator Load, ValueType PredTy, ValueType MemVT, ComplexPattern AddrCP> {
// reg + reg
}
// 2-element contiguous first faulting loads
- defm : ldff1<LDFF1B_D, nxv2i64, AArch64ldff1, nxv2i1, nxv2i8, am_sve_regreg_lsl0>;
- defm : ldff1<LDFF1SB_D, nxv2i64, AArch64ldff1s, nxv2i1, nxv2i8, am_sve_regreg_lsl0>;
- defm : ldff1<LDFF1H_D, nxv2i64, AArch64ldff1, nxv2i1, nxv2i16, am_sve_regreg_lsl1>;
- defm : ldff1<LDFF1SH_D, nxv2i64, AArch64ldff1s, nxv2i1, nxv2i16, am_sve_regreg_lsl1>;
- defm : ldff1<LDFF1W_D, nxv2i64, AArch64ldff1, nxv2i1, nxv2i32, am_sve_regreg_lsl2>;
- defm : ldff1<LDFF1SW_D, nxv2i64, AArch64ldff1s, nxv2i1, nxv2i32, am_sve_regreg_lsl2>;
- defm : ldff1<LDFF1D, nxv2i64, AArch64ldff1, nxv2i1, nxv2i64, am_sve_regreg_lsl3>;
- defm : ldff1<LDFF1W_D, nxv2f32, AArch64ldff1, nxv2i1, nxv2f32, am_sve_regreg_lsl2>;
- defm : ldff1<LDFF1D, nxv2f64, AArch64ldff1, nxv2i1, nxv2f64, am_sve_regreg_lsl3>;
+ defm : ldff1<LDFF1B_D, nxv2i64, AArch64ldff1, nxv2i1, nxv2i8, am_sve_regreg_lsl0>;
+ defm : ldff1<LDFF1SB_D, nxv2i64, AArch64ldff1s, nxv2i1, nxv2i8, am_sve_regreg_lsl0>;
+ defm : ldff1<LDFF1H_D, nxv2i64, AArch64ldff1, nxv2i1, nxv2i16, am_sve_regreg_lsl1>;
+ defm : ldff1<LDFF1SH_D, nxv2i64, AArch64ldff1s, nxv2i1, nxv2i16, am_sve_regreg_lsl1>;
+ defm : ldff1<LDFF1W_D, nxv2i64, AArch64ldff1, nxv2i1, nxv2i32, am_sve_regreg_lsl2>;
+ defm : ldff1<LDFF1SW_D, nxv2i64, AArch64ldff1s, nxv2i1, nxv2i32, am_sve_regreg_lsl2>;
+ defm : ldff1<LDFF1D, nxv2i64, AArch64ldff1, nxv2i1, nxv2i64, am_sve_regreg_lsl3>;
+ defm : ldff1<LDFF1W_D, nxv2f32, AArch64ldff1, nxv2i1, nxv2f32, am_sve_regreg_lsl2>;
+ defm : ldff1<LDFF1D, nxv2f64, AArch64ldff1, nxv2i1, nxv2f64, am_sve_regreg_lsl3>;
// 4-element contiguous first faulting loads
- defm : ldff1<LDFF1B_S, nxv4i32, AArch64ldff1, nxv4i1, nxv4i8, am_sve_regreg_lsl0>;
- defm : ldff1<LDFF1SB_S, nxv4i32, AArch64ldff1s, nxv4i1, nxv4i8, am_sve_regreg_lsl0>;
- defm : ldff1<LDFF1H_S, nxv4i32, AArch64ldff1, nxv4i1, nxv4i16, am_sve_regreg_lsl1>;
- defm : ldff1<LDFF1SH_S, nxv4i32, AArch64ldff1s, nxv4i1, nxv4i16, am_sve_regreg_lsl1>;
- defm : ldff1<LDFF1W, nxv4i32, AArch64ldff1, nxv4i1, nxv4i32, am_sve_regreg_lsl2>;
- defm : ldff1<LDFF1W, nxv4f32, AArch64ldff1, nxv4i1, nxv4f32, am_sve_regreg_lsl2>;
+ defm : ldff1<LDFF1B_S, nxv4i32, AArch64ldff1, nxv4i1, nxv4i8, am_sve_regreg_lsl0>;
+ defm : ldff1<LDFF1SB_S, nxv4i32, AArch64ldff1s, nxv4i1, nxv4i8, am_sve_regreg_lsl0>;
+ defm : ldff1<LDFF1H_S, nxv4i32, AArch64ldff1, nxv4i1, nxv4i16, am_sve_regreg_lsl1>;
+ defm : ldff1<LDFF1SH_S, nxv4i32, AArch64ldff1s, nxv4i1, nxv4i16, am_sve_regreg_lsl1>;
+ defm : ldff1<LDFF1W, nxv4i32, AArch64ldff1, nxv4i1, nxv4i32, am_sve_regreg_lsl2>;
+ defm : ldff1<LDFF1W, nxv4f32, AArch64ldff1, nxv4i1, nxv4f32, am_sve_regreg_lsl2>;
// 8-element contiguous first faulting loads
- defm : ldff1<LDFF1B_H, nxv8i16, AArch64ldff1, nxv8i1, nxv8i8, am_sve_regreg_lsl0>;
- defm : ldff1<LDFF1SB_H, nxv8i16, AArch64ldff1s, nxv8i1, nxv8i8, am_sve_regreg_lsl0>;
- defm : ldff1<LDFF1H, nxv8i16, AArch64ldff1, nxv8i1, nxv8i16, am_sve_regreg_lsl1>;
- defm : ldff1<LDFF1H, nxv8f16, AArch64ldff1, nxv8i1, nxv8f16, am_sve_regreg_lsl1>;
+ defm : ldff1<LDFF1B_H, nxv8i16, AArch64ldff1, nxv8i1, nxv8i8, am_sve_regreg_lsl0>;
+ defm : ldff1<LDFF1SB_H, nxv8i16, AArch64ldff1s, nxv8i1, nxv8i8, am_sve_regreg_lsl0>;
+ defm : ldff1<LDFF1H, nxv8i16, AArch64ldff1, nxv8i1, nxv8i16, am_sve_regreg_lsl1>;
+ defm : ldff1<LDFF1H, nxv8f16, AArch64ldff1, nxv8i1, nxv8f16, am_sve_regreg_lsl1>;
+ defm : ldff1<LDFF1H, nxv8bf16, AArch64ldff1, nxv8i1, nxv8bf16, am_sve_regreg_lsl1>;
// 16-element contiguous first faulting loads
defm : ldff1<LDFF1B, nxv16i8, AArch64ldff1, nxv16i1, nxv16i8, am_sve_regreg_lsl0>;
return false;
Type *Ty = cast<VectorType>(DataType)->getElementType();
- if (Ty->isHalfTy() || Ty->isFloatTy() || Ty->isDoubleTy())
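+ // Accept bfloat alongside the other scalar floating-point element types.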
+ if (Ty->isBFloatTy() || Ty->isHalfTy() ||
+ Ty->isFloatTy() || Ty->isDoubleTy())
return true;
if (Ty->isIntegerTy(8) || Ty->isIntegerTy(16) ||
ret <vscale x 8 x half> %load
}
+define <vscale x 8 x bfloat> @ld1h_bf16_inbound(<vscale x 8 x i1> %pg, bfloat* %a) {
+; CHECK-LABEL: ld1h_bf16_inbound:
+; CHECK: ld1h { z0.h }, p0/z, [x0, #1, mul vl]
+; CHECK-NEXT: ret
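+ ; A displacement of one whole <vscale x 8 x bfloat> selects the reg+imm "mul vl" form.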
+ %base_scalable = bitcast bfloat* %a to <vscale x 8 x bfloat>*
+ %base = getelementptr <vscale x 8 x bfloat>, <vscale x 8 x bfloat>* %base_scalable, i64 1
+ %base_scalar = bitcast <vscale x 8 x bfloat>* %base to bfloat*
+ %load = call <vscale x 8 x bfloat> @llvm.aarch64.sve.ld1.nxv8bf16(<vscale x 8 x i1> %pg, bfloat* %base_scalar)
+ ret <vscale x 8 x bfloat> %load
+}
+
;
; LD1W
;
declare <vscale x 8 x i8> @llvm.aarch64.sve.ld1.nxv8i8(<vscale x 8 x i1>, i8*)
declare <vscale x 8 x i16> @llvm.aarch64.sve.ld1.nxv8i16(<vscale x 8 x i1>, i16*)
declare <vscale x 8 x half> @llvm.aarch64.sve.ld1.nxv8f16(<vscale x 8 x i1>, half*)
+declare <vscale x 8 x bfloat> @llvm.aarch64.sve.ld1.nxv8bf16(<vscale x 8 x i1>, bfloat*)
declare <vscale x 4 x i8> @llvm.aarch64.sve.ld1.nxv4i8(<vscale x 4 x i1>, i8*)
declare <vscale x 4 x i16> @llvm.aarch64.sve.ld1.nxv4i16(<vscale x 4 x i1>, i16*)
ret <vscale x 8 x half> %load
}
+define <vscale x 8 x bfloat> @ld1h_bf16(<vscale x 8 x i1> %pg, bfloat* %a, i64 %index) {
+; CHECK-LABEL: ld1h_bf16:
+; CHECK: ld1h { z0.h }, p0/z, [x0, x1, lsl #1]
+; CHECK-NEXT: ret
+ %base = getelementptr bfloat, bfloat* %a, i64 %index
+ %load = call <vscale x 8 x bfloat> @llvm.aarch64.sve.ld1.nxv8bf16(<vscale x 8 x i1> %pg, bfloat* %base)
+ ret <vscale x 8 x bfloat> %load
+}
+
define <vscale x 4 x i32> @ld1h_s(<vscale x 4 x i1> %pred, i16* %a, i64 %index) {
; CHECK-LABEL: ld1h_s:
; CHECK: ld1h { z0.s }, p0/z, [x0, x1, lsl #1]
declare <vscale x 8 x i8> @llvm.aarch64.sve.ld1.nxv8i8(<vscale x 8 x i1>, i8*)
declare <vscale x 8 x i16> @llvm.aarch64.sve.ld1.nxv8i16(<vscale x 8 x i1>, i16*)
declare <vscale x 8 x half> @llvm.aarch64.sve.ld1.nxv8f16(<vscale x 8 x i1>, half*)
+declare <vscale x 8 x bfloat> @llvm.aarch64.sve.ld1.nxv8bf16(<vscale x 8 x i1>, bfloat*)
declare <vscale x 4 x i8> @llvm.aarch64.sve.ld1.nxv4i8(<vscale x 4 x i1>, i8*)
declare <vscale x 4 x i16> @llvm.aarch64.sve.ld1.nxv4i16(<vscale x 4 x i1>, i16*)
ret <vscale x 8 x half> %res
}
+define <vscale x 8 x bfloat> @ld1h_bf16(<vscale x 8 x i1> %pred, bfloat* %addr) {
+; CHECK-LABEL: ld1h_bf16:
+; CHECK: ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT: ret
+ %res = call <vscale x 8 x bfloat> @llvm.aarch64.sve.ld1.nxv8bf16(<vscale x 8 x i1> %pred, bfloat* %addr)
+ ret <vscale x 8 x bfloat> %res
+}
+
define <vscale x 4 x i32> @ld1h_s(<vscale x 4 x i1> %pred, i16* %addr) {
; CHECK-LABEL: ld1h_s:
; CHECK: ld1h { z0.s }, p0/z, [x0]
declare <vscale x 8 x i8> @llvm.aarch64.sve.ld1.nxv8i8(<vscale x 8 x i1>, i8*)
declare <vscale x 8 x i16> @llvm.aarch64.sve.ld1.nxv8i16(<vscale x 8 x i1>, i16*)
declare <vscale x 8 x half> @llvm.aarch64.sve.ld1.nxv8f16(<vscale x 8 x i1>, half*)
+declare <vscale x 8 x bfloat> @llvm.aarch64.sve.ld1.nxv8bf16(<vscale x 8 x i1>, bfloat*)
declare <vscale x 4 x i8> @llvm.aarch64.sve.ld1.nxv4i8(<vscale x 4 x i1>, i8*)
declare <vscale x 4 x i16> @llvm.aarch64.sve.ld1.nxv4i16(<vscale x 4 x i1>, i16*)
ret <vscale x 8 x half> %load
}
+define <vscale x 8 x bfloat> @ldff1h_bf16(<vscale x 8 x i1> %pg, bfloat* %a) {
+; CHECK-LABEL: ldff1h_bf16:
+; CHECK: ldff1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT: ret
+ %load = call <vscale x 8 x bfloat> @llvm.aarch64.sve.ldff1.nxv8bf16(<vscale x 8 x i1> %pg, bfloat* %a)
+ ret <vscale x 8 x bfloat> %load
+}
+
define <vscale x 8 x half> @ldff1h_f16_reg(<vscale x 8 x i1> %pg, half* %a, i64 %offset) {
; CHECK-LABEL: ldff1h_f16_reg:
; CHECK: ldff1h { z0.h }, p0/z, [x0, x1, lsl #1]
ret <vscale x 8 x half> %load
}
+define <vscale x 8 x bfloat> @ldff1h_bf16_reg(<vscale x 8 x i1> %pg, bfloat* %a, i64 %offset) {
+; CHECK-LABEL: ldff1h_bf16_reg:
+; CHECK: ldff1h { z0.h }, p0/z, [x0, x1, lsl #1]
+; CHECK-NEXT: ret
+ %base = getelementptr bfloat, bfloat* %a, i64 %offset
+ %load = call <vscale x 8 x bfloat> @llvm.aarch64.sve.ldff1.nxv8bf16(<vscale x 8 x i1> %pg, bfloat* %base)
+ ret <vscale x 8 x bfloat> %load
+}
+
;
; LDFF1SH
;
declare <vscale x 8 x i8> @llvm.aarch64.sve.ldff1.nxv8i8(<vscale x 8 x i1>, i8*)
declare <vscale x 8 x i16> @llvm.aarch64.sve.ldff1.nxv8i16(<vscale x 8 x i1>, i16*)
declare <vscale x 8 x half> @llvm.aarch64.sve.ldff1.nxv8f16(<vscale x 8 x i1>, half*)
+declare <vscale x 8 x bfloat> @llvm.aarch64.sve.ldff1.nxv8bf16(<vscale x 8 x i1>, bfloat*)
declare <vscale x 4 x i8> @llvm.aarch64.sve.ldff1.nxv4i8(<vscale x 4 x i1>, i8*)
declare <vscale x 4 x i16> @llvm.aarch64.sve.ldff1.nxv4i16(<vscale x 4 x i1>, i16*)
ret <vscale x 8 x half> %load
}
+define <vscale x 8 x bfloat> @ldnf1h_bf16(<vscale x 8 x i1> %pg, bfloat* %a) {
+; CHECK-LABEL: ldnf1h_bf16:
+; CHECK: ldnf1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT: ret
+ %load = call <vscale x 8 x bfloat> @llvm.aarch64.sve.ldnf1.nxv8bf16(<vscale x 8 x i1> %pg, bfloat* %a)
+ ret <vscale x 8 x bfloat> %load
+}
+
define <vscale x 8 x half> @ldnf1h_f16_inbound(<vscale x 8 x i1> %pg, half* %a) {
; CHECK-LABEL: ldnf1h_f16_inbound:
; CHECK: ldnf1h { z0.h }, p0/z, [x0, #1, mul vl]
ret <vscale x 8 x half> %load
}
+define <vscale x 8 x bfloat> @ldnf1h_bf16_inbound(<vscale x 8 x i1> %pg, bfloat* %a) {
+; CHECK-LABEL: ldnf1h_bf16_inbound:
+; CHECK: ldnf1h { z0.h }, p0/z, [x0, #1, mul vl]
+; CHECK-NEXT: ret
+ %base_scalable = bitcast bfloat* %a to <vscale x 8 x bfloat>*
+ %base = getelementptr <vscale x 8 x bfloat>, <vscale x 8 x bfloat>* %base_scalable, i64 1
+ %base_scalar = bitcast <vscale x 8 x bfloat>* %base to bfloat*
+ %load = call <vscale x 8 x bfloat> @llvm.aarch64.sve.ldnf1.nxv8bf16(<vscale x 8 x i1> %pg, bfloat* %base_scalar)
+ ret <vscale x 8 x bfloat> %load
+}
+
define <vscale x 4 x i32> @ldnf1b_s(<vscale x 4 x i1> %pg, i8* %a) {
; CHECK-LABEL: ldnf1b_s:
; CHECK: ldnf1b { z0.s }, p0/z, [x0]
declare <vscale x 8 x i8> @llvm.aarch64.sve.ldnf1.nxv8i8(<vscale x 8 x i1>, i8*)
declare <vscale x 8 x i16> @llvm.aarch64.sve.ldnf1.nxv8i16(<vscale x 8 x i1>, i16*)
declare <vscale x 8 x half> @llvm.aarch64.sve.ldnf1.nxv8f16(<vscale x 8 x i1>, half*)
+declare <vscale x 8 x bfloat> @llvm.aarch64.sve.ldnf1.nxv8bf16(<vscale x 8 x i1>, bfloat*)
declare <vscale x 4 x i8> @llvm.aarch64.sve.ldnf1.nxv4i8(<vscale x 4 x i1>, i8*)
declare <vscale x 4 x i16> @llvm.aarch64.sve.ldnf1.nxv4i16(<vscale x 4 x i1>, i16*)
ret <vscale x 8 x half> %res
}
+define <vscale x 8 x bfloat> @ld1rqh_bf16(<vscale x 8 x i1> %pred, bfloat* %addr) {
+; CHECK-LABEL: ld1rqh_bf16:
+; CHECK: ld1rqh { z0.h }, p0/z, [x0]
+; CHECK-NEXT: ret
+ %res = call <vscale x 8 x bfloat> @llvm.aarch64.sve.ld1rq.nxv8bf16(<vscale x 8 x i1> %pred, bfloat* %addr)
+ ret <vscale x 8 x bfloat> %res
+}
+
+define <vscale x 8 x bfloat> @ld1rqh_bf16_imm(<vscale x 8 x i1> %pred, bfloat* %addr) {
+; CHECK-LABEL: ld1rqh_bf16_imm:
+; CHECK: ld1rqh { z0.h }, p0/z, [x0, #-16]
+; CHECK-NEXT: ret
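+ ; A GEP of -8 bfloat elements is a byte offset of -16, matching the #-16 immediate above.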
+ %ptr = getelementptr inbounds bfloat, bfloat* %addr, i16 -8
+ %res = call <vscale x 8 x bfloat> @llvm.aarch64.sve.ld1rq.nxv8bf16(<vscale x 8 x i1> %pred, bfloat* %ptr)
+ ret <vscale x 8 x bfloat> %res
+}
+
;
; LD1RQW
;
ret <vscale x 8 x half> %res
}
+define <vscale x 8 x bfloat> @ldnt1h_bf16(<vscale x 8 x i1> %pred, bfloat* %addr) {
+; CHECK-LABEL: ldnt1h_bf16:
+; CHECK: ldnt1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT: ret
+ %res = call <vscale x 8 x bfloat> @llvm.aarch64.sve.ldnt1.nxv8bf16(<vscale x 8 x i1> %pred, bfloat* %addr)
+ ret <vscale x 8 x bfloat> %res
+}
+
;
; LDNT1W
;
declare <vscale x 4 x i32> @llvm.aarch64.sve.ld1rq.nxv4i32(<vscale x 4 x i1>, i32*)
declare <vscale x 2 x i64> @llvm.aarch64.sve.ld1rq.nxv2i64(<vscale x 2 x i1>, i64*)
declare <vscale x 8 x half> @llvm.aarch64.sve.ld1rq.nxv8f16(<vscale x 8 x i1>, half*)
+declare <vscale x 8 x bfloat> @llvm.aarch64.sve.ld1rq.nxv8bf16(<vscale x 8 x i1>, bfloat*)
declare <vscale x 4 x float> @llvm.aarch64.sve.ld1rq.nxv4f32(<vscale x 4 x i1>, float*)
declare <vscale x 2 x double> @llvm.aarch64.sve.ld1rq.nxv2f64(<vscale x 2 x i1>, double*)
declare <vscale x 4 x i32> @llvm.aarch64.sve.ldnt1.nxv4i32(<vscale x 4 x i1>, i32*)
declare <vscale x 2 x i64> @llvm.aarch64.sve.ldnt1.nxv2i64(<vscale x 2 x i1>, i64*)
declare <vscale x 8 x half> @llvm.aarch64.sve.ldnt1.nxv8f16(<vscale x 8 x i1>, half*)
+declare <vscale x 8 x bfloat> @llvm.aarch64.sve.ldnt1.nxv8bf16(<vscale x 8 x i1>, bfloat*)
declare <vscale x 4 x float> @llvm.aarch64.sve.ldnt1.nxv4f32(<vscale x 4 x i1>, float*)
declare <vscale x 2 x double> @llvm.aarch64.sve.ldnt1.nxv2f64(<vscale x 2 x i1>, double*)
ret <vscale x 8 x half> %load
}
+define <vscale x 8 x bfloat> @masked_load_nxv8bf16(<vscale x 8 x bfloat> *%a, <vscale x 8 x i1> %mask) nounwind {
+; CHECK-LABEL: masked_load_nxv8bf16:
+; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT: ret
+ %load = call <vscale x 8 x bfloat> @llvm.masked.load.nxv8bf16(<vscale x 8 x bfloat> *%a, i32 2, <vscale x 8 x i1> %mask, <vscale x 8 x bfloat> undef)
+ ret <vscale x 8 x bfloat> %load
+}
+
;
; Masked Stores
;
declare <vscale x 4 x float> @llvm.masked.load.nxv4f32(<vscale x 4 x float>*, i32, <vscale x 4 x i1>, <vscale x 4 x float>)
declare <vscale x 4 x half> @llvm.masked.load.nxv4f16(<vscale x 4 x half>*, i32, <vscale x 4 x i1>, <vscale x 4 x half>)
declare <vscale x 8 x half> @llvm.masked.load.nxv8f16(<vscale x 8 x half>*, i32, <vscale x 8 x i1>, <vscale x 8 x half>)
+declare <vscale x 8 x bfloat> @llvm.masked.load.nxv8bf16(<vscale x 8 x bfloat>*, i32, <vscale x 8 x i1>, <vscale x 8 x bfloat>)
declare void @llvm.masked.store.nxv2i64(<vscale x 2 x i64>, <vscale x 2 x i64>*, i32, <vscale x 2 x i1>)
declare void @llvm.masked.store.nxv4i32(<vscale x 4 x i32>, <vscale x 4 x i32>*, i32, <vscale x 4 x i1>)