From 17a13507d776059bd7022ea4ddf7d5b9fab5294b Mon Sep 17 00:00:00 2001
From: Mihail Ionescu
Date: Thu, 27 Feb 2020 16:00:48 +0000
Subject: [PATCH] [GCC][PATCH][ARM] Add vreinterpret, vdup, vget and vset
 bfloat16 intrinsics

This patch adds support for the bf16 vector create, get, set, duplicate
and reinterpret intrinsics.

ACLE documents are at https://developer.arm.com/docs/101028/latest
ISA documents are at https://developer.arm.com/docs/ddi0596/latest

gcc/ChangeLog:

2020-02-27  Mihail Ionescu

	* config/arm/arm_neon.h (__ARM_NUM_LANES, __arm_lane, __arm_laneq):
	Move to the beginning of the file.
	(vcreate_bf16, vcombine_bf16): New.
	(vdup_n_bf16, vdupq_n_bf16): New.
	(vdup_lane_bf16, vdup_laneq_bf16): New.
	(vdupq_lane_bf16, vdupq_laneq_bf16): New.
	(vduph_lane_bf16, vduph_laneq_bf16): New.
	(vset_lane_bf16, vsetq_lane_bf16): New.
	(vget_lane_bf16, vgetq_lane_bf16): New.
	(vget_high_bf16, vget_low_bf16): New.
	(vreinterpret_bf16_u8, vreinterpretq_bf16_u8): New.
	(vreinterpret_bf16_u16, vreinterpretq_bf16_u16): New.
	(vreinterpret_bf16_u32, vreinterpretq_bf16_u32): New.
	(vreinterpret_bf16_u64, vreinterpretq_bf16_u64): New.
	(vreinterpret_bf16_s8, vreinterpretq_bf16_s8): New.
	(vreinterpret_bf16_s16, vreinterpretq_bf16_s16): New.
	(vreinterpret_bf16_s32, vreinterpretq_bf16_s32): New.
	(vreinterpret_bf16_s64, vreinterpretq_bf16_s64): New.
	(vreinterpret_bf16_p8, vreinterpretq_bf16_p8): New.
	(vreinterpret_bf16_p16, vreinterpretq_bf16_p16): New.
	(vreinterpret_bf16_p64, vreinterpretq_bf16_p64): New.
	(vreinterpret_bf16_f16, vreinterpretq_bf16_f16): New.
	(vreinterpret_bf16_f32, vreinterpretq_bf16_f32): New.
	(vreinterpretq_bf16_p128): New.
	(vreinterpret_s8_bf16, vreinterpretq_s8_bf16): New.
	(vreinterpret_s16_bf16, vreinterpretq_s16_bf16): New.
	(vreinterpret_s32_bf16, vreinterpretq_s32_bf16): New.
	(vreinterpret_s64_bf16, vreinterpretq_s64_bf16): New.
	(vreinterpret_u8_bf16, vreinterpretq_u8_bf16): New.
	(vreinterpret_u16_bf16, vreinterpretq_u16_bf16): New.
	(vreinterpret_u32_bf16, vreinterpretq_u32_bf16): New.
	(vreinterpret_u64_bf16, vreinterpretq_u64_bf16): New.
	(vreinterpret_p8_bf16, vreinterpretq_p8_bf16): New.
	(vreinterpret_p16_bf16, vreinterpretq_p16_bf16): New.
	(vreinterpret_p64_bf16, vreinterpretq_p64_bf16): New.
	(vreinterpret_f16_bf16, vreinterpretq_f16_bf16): New.
	(vreinterpret_f32_bf16, vreinterpretq_f32_bf16): New.
	(vreinterpretq_p128_bf16): New.
	* config/arm/arm_neon_builtins.def (vdup_n, vdup_lane): Add V4BF
	and V8BF variants.
	(vcombine): Add V4BF variant.
	(vget_high, vget_low): Add V8BF variant.
	* config/arm/iterators.md (VDX): Add V4BF.
	(V_elem): Likewise.
	(V_elem_l): Likewise.
	(VD_LANE): Likewise.
	(VQX): Add V8BF.
	(V_DOUBLE): Likewise.
	(VDQX): Add V4BF and V8BF.
	(V_two_elem, V_three_elem, V_four_elem): Likewise.
	(V_reg): Likewise.
	(V_HALF): Likewise.
	(V_double_vector_mode): Likewise.
	(V_cmp_result): Likewise.
	(V_uf_sclr): Likewise.
	(V_sz_elem): Likewise.
	(Is_d_reg): Likewise.
	(V_mode_nunits): Likewise.
	* config/arm/neon.md (neon_vdup_nv4bf, neon_vdup_nv8bf): New.
	(neon_vdup_lane<mode>_internal, neon_vdup_lane<mode>): Enable for
	BFloat16.

gcc/testsuite/ChangeLog:

2020-02-27  Mihail Ionescu

	* gcc.target/arm/bf16_dup.c: New test.
	* gcc.target/arm/bf16_reinterpret.c: Likewise.
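As a usage illustration (not part of the patch): a minimal sketch of the
new API surface, assuming a compiler with this patch applied and a target
supporting -march=armv8.2-a+bf16; the function name is made up.

    #include <arm_neon.h>

    /* Broadcast x to all four lanes, overwrite lane 2 with y, read that
       lane back, and view the vector's bits as four unsigned shorts.  */
    uint16x4_t
    example_bf16 (bfloat16_t x, bfloat16_t y)
    {
      bfloat16x4_t v = vdup_n_bf16 (x);          /* { x, x, x, x } */
      v = vset_lane_bf16 (y, v, 2);              /* { x, x, y, x } */
      bfloat16_t lane = vget_lane_bf16 (v, 2);   /* == y */
      (void) lane;
      return vreinterpret_u16_bf16 (v);          /* same bits, u16 view */
    }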
---
 gcc/ChangeLog                                   |  57 +++
 gcc/config/arm/arm_neon.h                       | 532 +++++++++++++++++++++++-
 gcc/config/arm/arm_neon_builtins.def            |  10 +-
 gcc/config/arm/iterators.md                     |  25 +-
 gcc/config/arm/neon.md                          |  26 +-
 gcc/testsuite/ChangeLog                         |   5 +
 gcc/testsuite/gcc.target/arm/bf16_dup.c         |  96 +++++
 gcc/testsuite/gcc.target/arm/bf16_reinterpret.c | 435 +++++++++++++++++++
 8 files changed, 1158 insertions(+), 28 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/arm/bf16_dup.c
 create mode 100644 gcc/testsuite/gcc.target/arm/bf16_reinterpret.c

diff --git a/gcc/ChangeLog b/gcc/ChangeLog
index 997bed1..6a780a1 100644
--- a/gcc/ChangeLog
+++ b/gcc/ChangeLog
@@ -1,3 +1,60 @@
+2020-02-27  Mihail Ionescu
+
+	* config/arm/arm_neon.h (__ARM_NUM_LANES, __arm_lane, __arm_laneq):
+	Move to the beginning of the file.
+	(vcreate_bf16, vcombine_bf16): New.
+	(vdup_n_bf16, vdupq_n_bf16): New.
+	(vdup_lane_bf16, vdup_laneq_bf16): New.
+	(vdupq_lane_bf16, vdupq_laneq_bf16): New.
+	(vduph_lane_bf16, vduph_laneq_bf16): New.
+	(vset_lane_bf16, vsetq_lane_bf16): New.
+	(vget_lane_bf16, vgetq_lane_bf16): New.
+	(vget_high_bf16, vget_low_bf16): New.
+	(vreinterpret_bf16_u8, vreinterpretq_bf16_u8): New.
+	(vreinterpret_bf16_u16, vreinterpretq_bf16_u16): New.
+	(vreinterpret_bf16_u32, vreinterpretq_bf16_u32): New.
+	(vreinterpret_bf16_u64, vreinterpretq_bf16_u64): New.
+	(vreinterpret_bf16_s8, vreinterpretq_bf16_s8): New.
+	(vreinterpret_bf16_s16, vreinterpretq_bf16_s16): New.
+	(vreinterpret_bf16_s32, vreinterpretq_bf16_s32): New.
+	(vreinterpret_bf16_s64, vreinterpretq_bf16_s64): New.
+	(vreinterpret_bf16_p8, vreinterpretq_bf16_p8): New.
+	(vreinterpret_bf16_p16, vreinterpretq_bf16_p16): New.
+	(vreinterpret_bf16_p64, vreinterpretq_bf16_p64): New.
+	(vreinterpret_bf16_f16, vreinterpretq_bf16_f16): New.
+	(vreinterpret_bf16_f32, vreinterpretq_bf16_f32): New.
+	(vreinterpretq_bf16_p128): New.
+	(vreinterpret_s8_bf16, vreinterpretq_s8_bf16): New.
+	(vreinterpret_s16_bf16, vreinterpretq_s16_bf16): New.
+	(vreinterpret_s32_bf16, vreinterpretq_s32_bf16): New.
+	(vreinterpret_s64_bf16, vreinterpretq_s64_bf16): New.
+	(vreinterpret_u8_bf16, vreinterpretq_u8_bf16): New.
+	(vreinterpret_u16_bf16, vreinterpretq_u16_bf16): New.
+	(vreinterpret_u32_bf16, vreinterpretq_u32_bf16): New.
+	(vreinterpret_u64_bf16, vreinterpretq_u64_bf16): New.
+	(vreinterpret_p8_bf16, vreinterpretq_p8_bf16): New.
+	(vreinterpret_p16_bf16, vreinterpretq_p16_bf16): New.
+	(vreinterpret_p64_bf16, vreinterpretq_p64_bf16): New.
+	(vreinterpret_f16_bf16, vreinterpretq_f16_bf16): New.
+	(vreinterpret_f32_bf16, vreinterpretq_f32_bf16): New.
+	(vreinterpretq_p128_bf16): New.
+	* config/arm/arm_neon_builtins.def (vdup_n, vdup_lane): Add V4BF
+	and V8BF variants.
+	(vcombine): Add V4BF variant.
+	(vget_high, vget_low): Add V8BF variant.
+	* config/arm/iterators.md (VDX): Add V4BF.
+	(V_elem): Likewise.
+	(V_elem_l): Likewise.
+	(VD_LANE): Likewise.
+	(VQX): Add V8BF.
+	(V_DOUBLE): Likewise.
+	(VDQX): Add V4BF and V8BF.
+	(V_two_elem, V_three_elem, V_four_elem): Likewise.
+	(V_reg): Likewise.
+	(V_HALF): Likewise.
+	(V_double_vector_mode): Likewise.
+	(V_cmp_result): Likewise.
+	(V_uf_sclr): Likewise.
+	(V_sz_elem): Likewise.
+	(Is_d_reg): Likewise.
+	(V_mode_nunits): Likewise.
+	* config/arm/neon.md (neon_vdup_nv4bf, neon_vdup_nv8bf): New.
+	(neon_vdup_lane<mode>_internal, neon_vdup_lane<mode>): Enable for
+	BFloat16.
+
 2020-02-27  Andrew Stubbs
 
 	* config/gcn/gcn-valu.md (VEC_SUBDWORD_MODE): New mode iterator.
diff --git a/gcc/config/arm/arm_neon.h b/gcc/config/arm/arm_neon.h
index d2ebee4..81c407f 100644
--- a/gcc/config/arm/arm_neon.h
+++ b/gcc/config/arm/arm_neon.h
@@ -42,6 +42,18 @@ extern "C" {
 
 #include <stdint.h>
 #include <arm_fp16.h>
 
+/* For big-endian, GCC's vector indices are reversed within each 64
+   bits compared to the architectural lane indices used by Neon
+   intrinsics.  */
+#ifdef __ARM_BIG_ENDIAN
+#define __ARM_NUM_LANES(__v) (sizeof (__v) / sizeof (__v[0]))
+#define __arm_lane(__vec, __idx) (__idx ^ (__ARM_NUM_LANES(__vec) - 1))
+#define __arm_laneq(__vec, __idx) (__idx ^ (__ARM_NUM_LANES(__vec)/2 - 1))
+#else
+#define __arm_lane(__vec, __idx) __idx
+#define __arm_laneq(__vec, __idx) __idx
+#endif
+
 typedef __simd64_int8_t int8x8_t;
 typedef __simd64_int16_t int16x4_t;
 typedef __simd64_int32_t int32x2_t;
@@ -6144,18 +6156,6 @@ vget_lane_s32 (int32x2_t __a, const int __b)
    were marked always-inline so there were no call sites, the
    declaration would nonetheless raise an error.  Hence, we must use
    a macro instead.  */
 
-  /* For big-endian, GCC's vector indices are reversed within each 64
-     bits compared to the architectural lane indices used by Neon
-     intrinsics.  */
-#ifdef __ARM_BIG_ENDIAN
-#define __ARM_NUM_LANES(__v) (sizeof (__v) / sizeof (__v[0]))
-#define __arm_lane(__vec, __idx) (__idx ^ (__ARM_NUM_LANES(__vec) - 1))
-#define __arm_laneq(__vec, __idx) (__idx ^ (__ARM_NUM_LANES(__vec)/2 - 1))
-#else
-#define __arm_lane(__vec, __idx) __idx
-#define __arm_laneq(__vec, __idx) __idx
-#endif
-
 #define vget_lane_f16(__v, __idx)		\
   __extension__					\
   ({						\
@@ -14476,6 +14476,15 @@ vreinterpret_p16_u32 (uint32x2_t __a)
 #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
 __extension__ extern __inline float16x4_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vreinterpret_f16_bf16 (bfloat16x4_t __a)
+{
+  return (float16x4_t) __a;
+}
+#endif
+
+#if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
+__extension__ extern __inline float16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vreinterpret_f16_p8 (poly8x8_t __a)
 {
   return (float16x4_t) __a;
@@ -15688,6 +15697,15 @@ vreinterpretq_f16_p16 (poly16x8_t __a)
 #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
 __extension__ extern __inline float16x8_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vreinterpretq_f16_bf16 (bfloat16x8_t __a)
+{
+  return (float16x8_t) __a;
+}
+#endif
+
+#if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
+__extension__ extern __inline float16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vreinterpretq_f16_f32 (float32x4_t __a)
 {
   return (float16x8_t) __a;
@@ -18823,6 +18841,496 @@ vusmmlaq_s32 (int32x4_t __r, uint8x16_t __a, int8x16_t __b)
 
 #pragma GCC push_options
 #pragma GCC target ("arch=armv8.2-a+bf16")
+__extension__ extern __inline bfloat16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vcreate_bf16 (uint64_t __a)
+{
+  return (bfloat16x4_t) __a;
+}
+
+__extension__ extern __inline bfloat16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vdup_n_bf16 (bfloat16_t __a)
+{
+  return __builtin_neon_vdup_nv4bf (__a);
+}
+
+__extension__ extern __inline bfloat16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vdupq_n_bf16 (bfloat16_t __a)
+{
+  return __builtin_neon_vdup_nv8bf (__a);
+}
+
+__extension__ extern __inline bfloat16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vdup_lane_bf16 (bfloat16x4_t __a, const int __b)
+{
+  return __builtin_neon_vdup_lanev4bf (__a, __b);
+}
+
+__extension__ extern __inline bfloat16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vdupq_lane_bf16 (bfloat16x4_t __a, const int __b)
+{
+  return __builtin_neon_vdup_lanev8bf (__a, __b);
+}
+
+#define vset_lane_bf16(__e, __v, __idx)			\
+  __extension__						\
+  ({							\
+    bfloat16_t __elem = (__e);				\
+    bfloat16x4_t __vec = (__v);				\
+    __builtin_arm_lane_check (4, __idx);		\
+    __vec[__arm_lane(__vec, __idx)] = __elem;		\
+    __vec;						\
+  })
+
+#define vsetq_lane_bf16(__e, __v, __idx)		\
+  __extension__						\
+  ({							\
+    bfloat16_t __elem = (__e);				\
+    bfloat16x8_t __vec = (__v);				\
+    __builtin_arm_lane_check (8, __idx);		\
+    __vec[__arm_laneq(__vec, __idx)] = __elem;		\
+    __vec;						\
+  })
+
+#define vget_lane_bf16(__v, __idx)			\
+  __extension__						\
+  ({							\
+    bfloat16x4_t __vec = (__v);				\
+    __builtin_arm_lane_check (4, __idx);		\
+    bfloat16_t __res = __vec[__arm_lane(__vec, __idx)];	\
+    __res;						\
+  })
+
+#define vgetq_lane_bf16(__v, __idx)			\
+  __extension__						\
+  ({							\
+    bfloat16x8_t __vec = (__v);				\
+    __builtin_arm_lane_check (8, __idx);		\
+    bfloat16_t __res = __vec[__arm_laneq(__vec, __idx)]; \
+    __res;						\
+  })
+
+__extension__ extern __inline bfloat16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vdup_laneq_bf16 (bfloat16x8_t __a, const int __b)
+{
+  return vdup_n_bf16 (vgetq_lane_bf16 (__a, __b));
+}
+
+__extension__ extern __inline bfloat16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vdupq_laneq_bf16 (bfloat16x8_t __a, const int __b)
+{
+  return vdupq_n_bf16 (vgetq_lane_bf16 (__a, __b));
+}
+
+__extension__ extern __inline bfloat16_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vduph_lane_bf16 (bfloat16x4_t __a, const int __b)
+{
+  return vget_lane_bf16 (__a, __b);
+}
+
+__extension__ extern __inline bfloat16_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vduph_laneq_bf16 (bfloat16x8_t __a, const int __b)
+{
+  return vgetq_lane_bf16 (__a, __b);
+}
+
+__extension__ extern __inline bfloat16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vget_high_bf16 (bfloat16x8_t __a)
+{
+  return __builtin_neon_vget_highv8bf (__a);
+}
+
+__extension__ extern __inline bfloat16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vget_low_bf16 (bfloat16x8_t __a)
+{
+  return __builtin_neon_vget_lowv8bf (__a);
+}
+
+__extension__ extern __inline bfloat16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vcombine_bf16 (bfloat16x4_t __a, bfloat16x4_t __b)
+{
+  return __builtin_neon_vcombinev4bf (__a, __b);
+}
+
+__extension__ extern __inline bfloat16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vreinterpret_bf16_u8 (uint8x8_t __a)
+{
+  return (bfloat16x4_t)__a;
+}
+
+__extension__ extern __inline bfloat16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vreinterpret_bf16_u16 (uint16x4_t __a)
+{
+  return (bfloat16x4_t)__a;
+}
+
+__extension__ extern __inline bfloat16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vreinterpret_bf16_u32 (uint32x2_t __a)
+{
+  return (bfloat16x4_t)__a;
+}
+
+__extension__ extern __inline bfloat16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vreinterpret_bf16_u64 (uint64x1_t __a)
+{
+  return (bfloat16x4_t)__a;
+}
+
+__extension__ extern __inline bfloat16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vreinterpret_bf16_s8 (int8x8_t __a)
+{
+  return (bfloat16x4_t)__a;
+}
+
+__extension__ extern __inline bfloat16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vreinterpret_bf16_s16 (int16x4_t __a)
+{
+  return (bfloat16x4_t)__a;
+}
+
+__extension__ extern __inline bfloat16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vreinterpret_bf16_s32 (int32x2_t __a)
+{
+  return (bfloat16x4_t)__a;
+}
+
+__extension__ extern __inline bfloat16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vreinterpret_bf16_s64 (int64x1_t __a)
+{
+  return (bfloat16x4_t)__a;
+}
+
+__extension__ extern __inline bfloat16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vreinterpret_bf16_p8 (poly8x8_t __a)
+{
+  return (bfloat16x4_t)__a;
+}
+
+__extension__ extern __inline bfloat16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vreinterpret_bf16_p16 (poly16x4_t __a)
+{
+  return (bfloat16x4_t)__a;
+}
+
+__extension__ extern __inline bfloat16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vreinterpret_bf16_p64 (poly64x1_t __a)
+{
+  return (bfloat16x4_t)__a;
+}
+
+#if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
+__extension__ extern __inline bfloat16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vreinterpret_bf16_f16 (float16x4_t __a)
+{
+  return (bfloat16x4_t)__a;
+}
+#endif
+
+__extension__ extern __inline bfloat16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vreinterpret_bf16_f32 (float32x2_t __a)
+{
+  return (bfloat16x4_t)__a;
+}
+
+__extension__ extern __inline bfloat16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vreinterpretq_bf16_u8 (uint8x16_t __a)
+{
+  return (bfloat16x8_t)__a;
+}
+
+__extension__ extern __inline bfloat16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vreinterpretq_bf16_u16 (uint16x8_t __a)
+{
+  return (bfloat16x8_t)__a;
+}
+
+__extension__ extern __inline bfloat16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vreinterpretq_bf16_u32 (uint32x4_t __a)
+{
+  return (bfloat16x8_t)__a;
+}
+
+__extension__ extern __inline bfloat16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vreinterpretq_bf16_u64 (uint64x2_t __a)
+{
+  return (bfloat16x8_t)__a;
+}
+
+__extension__ extern __inline bfloat16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vreinterpretq_bf16_s8 (int8x16_t __a)
+{
+  return (bfloat16x8_t)__a;
+}
+
+__extension__ extern __inline bfloat16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vreinterpretq_bf16_s16 (int16x8_t __a)
+{
+  return (bfloat16x8_t)__a;
+}
+
+__extension__ extern __inline bfloat16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vreinterpretq_bf16_s32 (int32x4_t __a)
+{
+  return (bfloat16x8_t)__a;
+}
+
+__extension__ extern __inline bfloat16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vreinterpretq_bf16_s64 (int64x2_t __a)
+{
+  return (bfloat16x8_t)__a;
+}
+
+__extension__ extern __inline bfloat16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vreinterpretq_bf16_p8 (poly8x16_t __a)
+{
+  return (bfloat16x8_t)__a;
+}
+
+__extension__ extern __inline bfloat16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vreinterpretq_bf16_p16 (poly16x8_t __a)
+{
+  return (bfloat16x8_t)__a;
+}
+
+__extension__ extern __inline bfloat16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vreinterpretq_bf16_p64 (poly64x2_t __a)
+{
+  return (bfloat16x8_t)__a;
+}
+
+__extension__ extern __inline bfloat16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vreinterpretq_bf16_p128 (poly128_t __a)
+{
+  return (bfloat16x8_t)__a;
+}
+
+#if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
+__extension__ extern __inline bfloat16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vreinterpretq_bf16_f16 (float16x8_t __a)
+{
+  return (bfloat16x8_t)__a;
+}
+#endif
+
+__extension__ extern __inline bfloat16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vreinterpretq_bf16_f32 (float32x4_t __a)
+{
+  return (bfloat16x8_t)__a;
+}
+
+__extension__ extern __inline int8x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vreinterpret_s8_bf16 (bfloat16x4_t __a)
+{
+  return (int8x8_t)__a;
+}
+
+__extension__ extern __inline int16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vreinterpret_s16_bf16 (bfloat16x4_t __a)
+{
+  return (int16x4_t)__a;
+}
+
+__extension__ extern __inline int32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vreinterpret_s32_bf16 (bfloat16x4_t __a)
+{
+  return (int32x2_t)__a;
+}
+
+__extension__ extern __inline int64x1_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vreinterpret_s64_bf16 (bfloat16x4_t __a)
+{
+  return (int64x1_t)__a;
+}
+
+__extension__ extern __inline uint8x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vreinterpret_u8_bf16 (bfloat16x4_t __a)
+{
+  return (uint8x8_t)__a;
+}
+
+__extension__ extern __inline uint16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vreinterpret_u16_bf16 (bfloat16x4_t __a)
+{
+  return (uint16x4_t)__a;
+}
+
+__extension__ extern __inline uint32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vreinterpret_u32_bf16 (bfloat16x4_t __a)
+{
+  return (uint32x2_t)__a;
+}
+
+__extension__ extern __inline uint64x1_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vreinterpret_u64_bf16 (bfloat16x4_t __a)
+{
+  return (uint64x1_t)__a;
+}
+
+__extension__ extern __inline float32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vreinterpret_f32_bf16 (bfloat16x4_t __a)
+{
+  return (float32x2_t)__a;
+}
+
+__extension__ extern __inline poly8x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vreinterpret_p8_bf16 (bfloat16x4_t __a)
+{
+  return (poly8x8_t)__a;
+}
+
+__extension__ extern __inline poly16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vreinterpret_p16_bf16 (bfloat16x4_t __a)
+{
+  return (poly16x4_t)__a;
+}
+
+__extension__ extern __inline poly64x1_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vreinterpret_p64_bf16 (bfloat16x4_t __a)
+{
+  return (poly64x1_t)__a;
+}
+
+__extension__ extern __inline int8x16_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vreinterpretq_s8_bf16 (bfloat16x8_t __a)
+{
+  return (int8x16_t)__a;
+}
+
+__extension__ extern __inline int16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vreinterpretq_s16_bf16 (bfloat16x8_t __a)
+{
+  return (int16x8_t)__a;
+}
+
+__extension__ extern __inline int32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vreinterpretq_s32_bf16 (bfloat16x8_t __a)
+{
+  return (int32x4_t)__a;
+}
+
+__extension__ extern __inline int64x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vreinterpretq_s64_bf16 (bfloat16x8_t __a)
+{
+  return (int64x2_t)__a;
+}
+
+__extension__ extern __inline uint8x16_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vreinterpretq_u8_bf16 (bfloat16x8_t __a)
+{
+  return (uint8x16_t)__a;
+}
+
+__extension__ extern __inline uint16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vreinterpretq_u16_bf16 (bfloat16x8_t __a)
+{
+  return (uint16x8_t)__a;
+}
+
+__extension__ extern __inline uint32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vreinterpretq_u32_bf16 (bfloat16x8_t __a)
+{
+  return (uint32x4_t)__a;
+}
+
+__extension__ extern __inline uint64x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vreinterpretq_u64_bf16 (bfloat16x8_t __a)
+{
+  return (uint64x2_t)__a;
+}
+
+__extension__ extern __inline float32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vreinterpretq_f32_bf16 (bfloat16x8_t __a)
+{
+  return (float32x4_t)__a;
+}
+
+__extension__ extern __inline poly8x16_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vreinterpretq_p8_bf16 (bfloat16x8_t __a)
+{
+  return (poly8x16_t)__a;
+}
+
+__extension__ extern __inline poly16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vreinterpretq_p16_bf16 (bfloat16x8_t __a)
+{
+  return (poly16x8_t)__a;
+}
+
+__extension__ extern __inline poly64x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vreinterpretq_p64_bf16 (bfloat16x8_t __a)
+{
+  return (poly64x2_t)__a;
+}
+
+__extension__ extern __inline poly128_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vreinterpretq_p128_bf16 (bfloat16x8_t __a)
+{
+  return (poly128_t)__a;
+}
+
 __extension__ extern __inline float32x2_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vbfdot_f32 (float32x2_t __r, bfloat16x4_t __a, bfloat16x4_t __b)
diff --git a/gcc/config/arm/arm_neon_builtins.def b/gcc/config/arm/arm_neon_builtins.def
index 4a6f4cf..4b4d1c8 100644
--- a/gcc/config/arm/arm_neon_builtins.def
+++ b/gcc/config/arm/arm_neon_builtins.def
@@ -221,13 +221,13 @@ VAR10 (SETLANE, vset_lane,
 VAR5 (UNOP, vcreate, v8qi, v4hi, v2si, v2sf, di)
 VAR10 (UNOP, vdup_n,
	 v8qi, v4hi, v2si, v2sf, di, v16qi, v8hi, v4si, v4sf, v2di)
-VAR2 (UNOP, vdup_n, v8hf, v4hf)
+VAR4 (UNOP, vdup_n, v8hf, v4hf, v8bf, v4bf)
 VAR10 (GETLANE, vdup_lane,
	 v8qi, v4hi, v2si, v2sf, di, v16qi, v8hi, v4si, v4sf, v2di)
-VAR2 (GETLANE, vdup_lane, v8hf, v4hf)
-VAR6 (COMBINE, vcombine, v8qi, v4hi, v4hf, v2si, v2sf, di)
-VAR6 (UNOP, vget_high, v16qi, v8hi, v8hf, v4si, v4sf, v2di)
-VAR6 (UNOP, vget_low, v16qi, v8hi, v8hf, v4si, v4sf, v2di)
+VAR4 (GETLANE, vdup_lane, v8hf, v4hf, v8bf, v4bf)
+VAR7 (COMBINE, vcombine, v8qi, v4hi, v4hf, v2si, v2sf, di, v4bf)
+VAR7 (UNOP, vget_high, v16qi, v8hi, v8hf, v8bf, v4si, v4sf, v2di)
+VAR7 (UNOP, vget_low, v16qi, v8hi, v8hf, v8bf, v4si, v4sf, v2di)
 VAR3 (UNOP, vmovn, v8hi, v4si, v2di)
 VAR3 (UNOP, vqmovns, v8hi, v4si, v2di)
 VAR3 (UNOP, vqmovnu, v8hi, v4si, v2di)
diff --git a/gcc/config/arm/iterators.md b/gcc/config/arm/iterators.md
index b435a05..ab30c37 100644
--- a/gcc/config/arm/iterators.md
+++ b/gcc/config/arm/iterators.md
@@ -82,14 +82,14 @@
 (define_mode_iterator VD_RE [V8QI V4HI V2SI V2SF DI])
 
 ;; Double-width vector modes plus 64-bit elements.
-(define_mode_iterator VDX [V8QI V4HI V4HF V2SI V2SF DI])
+(define_mode_iterator VDX [V8QI V4HI V4HF V4BF V2SI V2SF DI])
 
 ;; Double-width vector modes plus 64-bit elements,
 ;; with V4BFmode added, suitable for moves.
 (define_mode_iterator VDXMOV [V8QI V4HI V4HF V4BF V2SI V2SF DI])
 
 ;; Double-width vector modes, with V4HF - for vldN_lane and vstN_lane.
-(define_mode_iterator VD_LANE [V8QI V4HI V4HF V2SI V2SF])
+(define_mode_iterator VD_LANE [V8QI V4HI V4HF V4BF V2SI V2SF])
 
 ;; Double-width vector modes without floating-point elements.
 (define_mode_iterator VDI [V8QI V4HI V2SI])
@@ -104,7 +104,7 @@
 (define_mode_iterator VQ_HS [V8HI V8HF V4SI V4SF])
 
 ;; Quad-width vector modes plus 64-bit elements.
-(define_mode_iterator VQX [V16QI V8HI V8HF V4SI V4SF V2DI])
+(define_mode_iterator VQX [V16QI V8HI V8HF V8BF V4SI V4SF V2DI])
 
 ;; Quad-width vector modes without floating-point elements.
 (define_mode_iterator VQI [V16QI V8HI V4SI])
@@ -153,7 +153,7 @@
 
 ;; Vector modes, including 64-bit integer elements.
 (define_mode_iterator VDQX [V8QI V16QI V4HI V8HI V2SI V4SI
-			    V4HF V8HF V2SF V4SF DI V2DI])
+			    V4HF V8HF V4BF V8BF V2SF V4SF DI V2DI])
 
 ;; Vector modes including 64-bit integer elements, but no floats.
 (define_mode_iterator VDQIX [V8QI V16QI V4HI V8HI V2SI V4SI DI V2DI])
@@ -522,6 +522,7 @@
 (define_mode_attr V_elem [(V8QI "QI") (V16QI "QI")
			  (V4HI "HI") (V8HI "HI")
			  (V4HF "HF") (V8HF "HF")
+			  (V4BF "BF") (V8BF "BF")
			  (V2SI "SI") (V4SI "SI")
			  (V2SF "SF") (V4SF "SF")
			  (DI "DI")   (V2DI "DI")])
@@ -530,6 +531,7 @@
 (define_mode_attr V_elem_l [(V8QI "qi") (V16QI "qi")
			    (V4HI "hi") (V8HI "hi")
			    (V4HF "hf") (V8HF "hf")
+			    (V4BF "bf") (V8BF "bf")
			    (V2SI "si") (V4SI "si")
			    (V2SF "sf") (V4SF "sf")
			    (DI "di") (V2DI "di")])
@@ -547,6 +549,7 @@
 (define_mode_attr V_two_elem [(V8QI "HI") (V16QI "HI")
			      (V4HI "SI") (V8HI "SI")
			      (V4HF "SF") (V8HF "SF")
+			      (V4BF "BF") (V8BF "BF")
			      (V2SI "V2SI") (V4SI "V2SI")
			      (V2SF "V2SF") (V4SF "V2SF")
			      (DI "V2DI") (V2DI "V2DI")])
@@ -567,6 +570,7 @@
 (define_mode_attr V_three_elem [(V8QI "BLK") (V16QI "BLK")
				(V4HI "BLK") (V8HI "BLK")
				(V4HF "BLK") (V8HF "BLK")
+				(V4BF "BLK") (V8BF "BLK")
				(V2SI "BLK") (V4SI "BLK")
				(V2SF "BLK") (V4SF "BLK")
				(DI "EI")    (V2DI "EI")])
@@ -575,6 +579,7 @@
 (define_mode_attr V_four_elem [(V8QI "SI")   (V16QI "SI")
			       (V4HI "V4HI") (V8HI "V4HI")
			       (V4HF "V4HF") (V8HF "V4HF")
+			       (V4BF "V4BF") (V8BF "V4BF")
			       (V2SI "V4SI") (V4SI "V4SI")
			       (V2SF "V4SF") (V4SF "V4SF")
			       (DI "OI")     (V2DI "OI")])
@@ -583,6 +588,7 @@
 (define_mode_attr V_reg [(V8QI "P") (V16QI "q")
			 (V4HI "P") (V8HI  "q")
			 (V4HF "P") (V8HF  "q")
+			 (V4BF "P") (V8BF  "q")
			 (V2SI "P") (V4SI  "q")
			 (V2SF "P") (V4SF  "q")
			 (DI   "P") (V2DI  "q")
@@ -613,7 +619,8 @@
 (define_mode_attr V_HALF [(V16QI "V8QI") (V8HI "V4HI")
			  (V8HF "V4HF") (V4SI "V2SI")
			  (V4SF "V2SF") (V2DF "DF")
-			  (V2DI "DI") (V4HF "HF")])
+			  (V2DI "DI") (V4HF "HF")
+			  (V4BF "BF") (V8BF "V4BF")])
 
 ;; Same, but lower-case.
 (define_mode_attr V_half [(V16QI "v8qi") (V8HI "v4hi")
@@ -624,7 +631,7 @@
 (define_mode_attr V_DOUBLE [(V8QI "V16QI") (V4HI "V8HI")
			    (V2SI "V4SI") (V4HF "V8HF")
			    (V2SF "V4SF") (DF "V2DF")
-			    (DI "V2DI")])
+			    (DI "V2DI") (V4BF "V8BF")])
 
 ;; Same, but lower-case.
 (define_mode_attr V_double [(V8QI "v16qi") (V4HI "v8hi")
@@ -643,6 +650,7 @@
					   (V4SI "V2SI") (V4SF "V2SF")
					   (V8QI "V8QI") (V4HI "V4HI")
					   (V2SI "V2SI") (V2SF "V2SF")
+					   (V8BF "V4BF") (V4BF "V4BF")
					   (V8HF "V4HF") (V4HF "V4HF")])
 
 ;; Mode of result of comparison operations (and bit-select operand 1).
@@ -650,6 +658,7 @@
			       (V4HI "V4HI") (V8HI "V8HI")
			       (V2SI "V2SI") (V4SI "V4SI")
			       (V4HF "V4HI") (V8HF "V8HI")
+			       (V4BF "V4HI") (V8BF "V8HI")
			       (V2SF "V2SI") (V4SF "V4SI")
			       (DI "DI")     (V2DI "V2DI")])
@@ -691,6 +700,7 @@
			     (V4HI "u16") (V8HI "u16")
			     (V2SI "32") (V4SI "32")
			     (V4HF "u16") (V8HF "u16")
+			     (V4BF "u16") (V8BF "u16")
			     (V2SF "32") (V4SF "32")])
 
 (define_mode_attr V_sz_elem [(V8QI "8")  (V16QI "8")
@@ -698,6 +708,7 @@
			     (V2SI "32") (V4SI "32")
			     (DI "64") (V2DI "64")
			     (V4HF "16") (V8HF "16")
+			     (V4BF "16") (V8BF "16")
			     (V2SF "32") (V4SF "32")])
 
 (define_mode_attr V_elem_ch [(V8QI "b") (V16QI "b")
@@ -768,10 +779,12 @@
			    (V2SI "true") (V4SI "false")
			    (V2SF "true") (V4SF "false")
			    (DI "true") (V2DI "false")
+			    (V4BF "true") (V8BF "false")
			    (V4HF "true") (V8HF "false")])
 
 (define_mode_attr V_mode_nunits [(V8QI "8") (V16QI "16")
				 (V4HF "4") (V8HF "8")
+				 (V4BF "4") (V8BF "8")
				 (V4HI "4") (V8HI "8")
				 (V2SI "2") (V4SI "4")
				 (V2SF "2") (V4SF "4")
diff --git a/gcc/config/arm/neon.md b/gcc/config/arm/neon.md
index 80e94de..fae8213 100644
--- a/gcc/config/arm/neon.md
+++ b/gcc/config/arm/neon.md
@@ -3737,6 +3737,22 @@ if (BYTES_BIG_ENDIAN)
   [(set_attr "type" "neon_from_gp_q")]
 )
 
+(define_insn "neon_vdup_nv4bf"
+  [(set (match_operand:V4BF 0 "s_register_operand" "=w")
+        (vec_duplicate:V4BF (match_operand:BF 1 "s_register_operand" "r")))]
+  "TARGET_NEON"
+  "vdup.16\t%P0, %1"
+  [(set_attr "type" "neon_from_gp")]
+)
+
+(define_insn "neon_vdup_nv8bf"
+  [(set (match_operand:V8BF 0 "s_register_operand" "=w")
+        (vec_duplicate:V8BF (match_operand:BF 1 "s_register_operand" "r")))]
+  "TARGET_NEON"
+  "vdup.16\t%q0, %1"
+  [(set_attr "type" "neon_from_gp_q")]
+)
+
 (define_insn "neon_vdup_n<mode>"
   [(set (match_operand:V32 0 "s_register_operand" "=w,w")
	(vec_duplicate:V32 (match_operand:<V_elem> 1 "s_register_operand" "r,t")))]
@@ -3791,12 +3807,12 @@ if (BYTES_BIG_ENDIAN)
 )
 
 (define_insn "neon_vdup_lane<mode>_internal"
-  [(set (match_operand:VH 0 "s_register_operand" "=w")
-	(vec_duplicate:VH
+  [(set (match_operand:VHFBF 0 "s_register_operand" "=w")
+	(vec_duplicate:VHFBF
	 (vec_select:<V_elem>
	  (match_operand:<V_double_vector_mode> 1 "s_register_operand" "w")
	  (parallel [(match_operand:SI 2 "immediate_operand" "i")]))))]
-  "TARGET_NEON && TARGET_FP16"
+  "TARGET_NEON && (TARGET_FP16 || TARGET_BF16_SIMD)"
 {
   if (BYTES_BIG_ENDIAN)
     {
@@ -3832,10 +3848,10 @@ if (BYTES_BIG_ENDIAN)
 })
 
 (define_expand "neon_vdup_lane<mode>"
-  [(match_operand:VH 0 "s_register_operand")
+  [(match_operand:VHFBF 0 "s_register_operand")
   (match_operand:<V_double_vector_mode> 1 "s_register_operand")
   (match_operand:SI 2 "immediate_operand")]
-  "TARGET_NEON && TARGET_FP16"
+  "TARGET_NEON && (TARGET_FP16 || TARGET_BF16_SIMD)"
 {
   if (BYTES_BIG_ENDIAN)
     {
diff --git a/gcc/testsuite/ChangeLog b/gcc/testsuite/ChangeLog
index b2a82be..d9b1c3c 100644
--- a/gcc/testsuite/ChangeLog
+++ b/gcc/testsuite/ChangeLog
@@ -1,3 +1,8 @@
+2020-02-27  Mihail Ionescu
+
+	* gcc.target/arm/bf16_dup.c: New test.
+	* gcc.target/arm/bf16_reinterpret.c: Likewise.
+
 2020-02-27  Will Schmidt
 
 	* lib/target_supports.exp (check_effective_target_has_arch_pwr5): New.
diff --git a/gcc/testsuite/gcc.target/arm/bf16_dup.c b/gcc/testsuite/gcc.target/arm/bf16_dup.c
new file mode 100644
index 0000000..94be99a
--- /dev/null
+++ b/gcc/testsuite/gcc.target/arm/bf16_dup.c
@@ -0,0 +1,96 @@
+/* { dg-do assemble { target { arm*-*-* } } } */
+/* { dg-require-effective-target arm_v8_2a_bf16_neon_ok } */
+/* { dg-add-options arm_v8_2a_bf16_neon } */
+/* { dg-additional-options "-save-temps -march=armv8.2-a+bf16+fp16 -mfloat-abi=softfp" } */
+
+#include "arm_neon.h"
+
+float32x2_t
+test_vbfdot_vcreate (float32x2_t r, uint64_t a, uint64_t b)
+{
+  bfloat16x4_t _a = vcreate_bf16(a);
+  bfloat16x4_t _b = vcreate_bf16(b);
+
+  return vbfdot_f32 (r, _a, _b);
+}
+/* { dg-final { scan-assembler {vdot.bf16\td[0-9]+, d[0-9]+, d[0-9]+} } } */
+
+bfloat16x8_t test_vcombine_bf16 (bfloat16x4_t a, bfloat16x4_t b)
+{
+  return vcombine_bf16 (a, b);
+}
+
+bfloat16x4_t test_vget_high_bf16 (bfloat16x8_t a)
+{
+  return vget_high_bf16 (a);
+}
+
+bfloat16x4_t test_vget_low_bf16 (bfloat16x8_t a)
+{
+  return vget_low_bf16 (a);
+}
+
+bfloat16_t test_vget_lane_bf16 (bfloat16x4_t a)
+{
+  return vget_lane_bf16 (a, 1);
+}
+
+bfloat16_t test_vgetq_lane_bf16 (bfloat16x8_t a)
+{
+  return vgetq_lane_bf16 (a, 7);
+}
+
+bfloat16x4_t test_vset_lane_bf16 (bfloat16_t a, bfloat16x4_t b)
+{
+  return vset_lane_bf16 (a, b, 1);
+}
+
+bfloat16x8_t test_vsetq_lane_bf16 (bfloat16_t a, bfloat16x8_t b)
+{
+  return vsetq_lane_bf16 (a, b, 7);
+}
+
+bfloat16x4_t vdup_test (bfloat16_t a)
+{
+  return vdup_n_bf16 (a);
+}
+/* { dg-final { scan-assembler {vdup\.16\td[0-9]+, r[0-9]+} } } */
+
+bfloat16x8_t vdupq_test (bfloat16_t a)
+{
+  return vdupq_n_bf16 (a);
+}
+/* { dg-final { scan-assembler {vdup\.16\tq[0-9]+, r[0-9]+} } } */
+
+
+bfloat16x4_t test_vdup_lane_bf16 (bfloat16x4_t a)
+{
+  return vdup_lane_bf16 (a, 1);
+}
+/* { dg-final { scan-assembler-times {vdup\.16\td[0-9]+, d[0-9]+\[1\]} 1 } } */
+
+bfloat16x8_t test_vdupq_lane_bf16 (bfloat16x4_t a)
+{
+  return vdupq_lane_bf16 (a, 1);
+}
+/* { dg-final { scan-assembler-times {vdup\.16\tq[0-9]+, d[0-9]+\[1\]} 1 } } */
+
+bfloat16x4_t test_vdup_laneq_bf16 (bfloat16x8_t a)
+{
+  return vdup_laneq_bf16 (a, 3);
+}
+
+bfloat16x8_t test_vdupq_laneq_bf16 (bfloat16x8_t a)
+{
+  return vdupq_laneq_bf16 (a, 3);
+}
+
+bfloat16_t test_vduph_lane_bf16 (bfloat16x4_t a)
+{
+  return vduph_lane_bf16 (a, 1);
+}
+
+bfloat16_t test_vduph_laneq_bf16 (bfloat16x8_t a)
+{
+  return vduph_laneq_bf16 (a, 7);
+}
diff --git a/gcc/testsuite/gcc.target/arm/bf16_reinterpret.c b/gcc/testsuite/gcc.target/arm/bf16_reinterpret.c
new file mode 100644
index 0000000..e7d30a9
--- /dev/null
+++ b/gcc/testsuite/gcc.target/arm/bf16_reinterpret.c
@@ -0,0 +1,435 @@
+/* { dg-do assemble { target { arm*-*-* } } } */
+/* { dg-require-effective-target arm_v8_2a_bf16_neon_ok } */
+/* { dg-add-options arm_v8_2a_bf16_neon } */
+/* { dg-additional-options "-save-temps -march=armv8.2-a+fp16+bf16 -mfloat-abi=hard -mfpu=crypto-neon-fp-armv8" } */
+
+#include <arm_neon.h>
+
+float32x2_t
+test_vbfdot_f32_s8 (float32x2_t r, int8x8_t a, int8x8_t b)
+{
+  bfloat16x4_t _a = vreinterpret_bf16_s8(a);
+  bfloat16x4_t _b = vreinterpret_bf16_s8(b);
+
+  return vbfdot_f32 (r, _a, _b);
+}
+
+float32x2_t
+test_vbfdot_f32_s16 (float32x2_t r, int16x4_t a, int16x4_t b)
+{
+  bfloat16x4_t _a = vreinterpret_bf16_s16(a);
+  bfloat16x4_t _b = vreinterpret_bf16_s16(b);
+
+  return vbfdot_f32 (r, _a, _b);
+}
+
+float32x2_t
+test_vbfdot_f32_s32 (float32x2_t r, int32x2_t a, int32x2_t b)
+{
+  bfloat16x4_t _a = vreinterpret_bf16_s32(a);
+  bfloat16x4_t _b = vreinterpret_bf16_s32(b);
+
+  return vbfdot_f32 (r, _a, _b);
+}
+
+float32x2_t
+test_vbfdot_f32_s64 (float32x2_t r, int64x1_t a, int64x1_t b)
+{
+  bfloat16x4_t _a = vreinterpret_bf16_s64(a);
+  bfloat16x4_t _b = vreinterpret_bf16_s64(b);
+
+  return vbfdot_f32 (r, _a, _b);
+}
+
+float32x2_t
+test_vbfdot_f32_u8 (float32x2_t r, uint8x8_t a, uint8x8_t b)
+{
+  bfloat16x4_t _a = vreinterpret_bf16_u8(a);
+  bfloat16x4_t _b = vreinterpret_bf16_u8(b);
+
+  return vbfdot_f32 (r, _a, _b);
+}
+
+float32x2_t
+test_vbfdot_f32_u16 (float32x2_t r, uint16x4_t a, uint16x4_t b)
+{
+  bfloat16x4_t _a = vreinterpret_bf16_u16(a);
+  bfloat16x4_t _b = vreinterpret_bf16_u16(b);
+
+  return vbfdot_f32 (r, _a, _b);
+}
+
+float32x2_t
+test_vbfdot_f32_u32 (float32x2_t r, uint32x2_t a, uint32x2_t b)
+{
+  bfloat16x4_t _a = vreinterpret_bf16_u32(a);
+  bfloat16x4_t _b = vreinterpret_bf16_u32(b);
+
+  return vbfdot_f32 (r, _a, _b);
+}
+
+float32x2_t
+test_vbfdot_f32_u64 (float32x2_t r, uint64x1_t a, uint64x1_t b)
+{
+  bfloat16x4_t _a = vreinterpret_bf16_u64(a);
+  bfloat16x4_t _b = vreinterpret_bf16_u64(b);
+
+  return vbfdot_f32 (r, _a, _b);
+}
+
+float32x2_t
+test_vbfdot_f32_p8 (float32x2_t r, poly8x8_t a, poly8x8_t b)
+{
+  bfloat16x4_t _a = vreinterpret_bf16_p8(a);
+  bfloat16x4_t _b = vreinterpret_bf16_p8(b);
+
+  return vbfdot_f32 (r, _a, _b);
+}
+
+float32x2_t
+test_vbfdot_f32_p16 (float32x2_t r, poly16x4_t a, poly16x4_t b)
+{
+  bfloat16x4_t _a = vreinterpret_bf16_p16(a);
+  bfloat16x4_t _b = vreinterpret_bf16_p16(b);
+
+  return vbfdot_f32 (r, _a, _b);
+}
+
+float32x2_t
+test_vbfdot_f32_p64 (float32x2_t r, poly64x1_t a, poly64x1_t b)
+{
+  bfloat16x4_t _a = vreinterpret_bf16_p64(a);
+  bfloat16x4_t _b = vreinterpret_bf16_p64(b);
+
+  return vbfdot_f32 (r, _a, _b);
+}
+
+float32x2_t
+test_vbfdot_f32_f16 (float32x2_t r, float16x4_t a, float16x4_t b)
+{
+  bfloat16x4_t _a = vreinterpret_bf16_f16(a);
+  bfloat16x4_t _b = vreinterpret_bf16_f16(b);
+
+  return vbfdot_f32 (r, _a, _b);
+}
+
+float32x2_t
+test_vbfdot_f32_f32 (float32x2_t r, float32x2_t a, float32x2_t b)
+{
+  bfloat16x4_t _a = vreinterpret_bf16_f32(a);
+  bfloat16x4_t _b = vreinterpret_bf16_f32(b);
+
+  return vbfdot_f32 (r, _a, _b);
+}
+
+float32x4_t
+test_vbfdotq_f32_s8 (float32x4_t r, int8x16_t a, int8x16_t b)
+{
+  bfloat16x8_t _a = vreinterpretq_bf16_s8(a);
+  bfloat16x8_t _b = vreinterpretq_bf16_s8(b);
+
+  return vbfdotq_f32 (r, _a, _b);
+}
+
+float32x4_t
+test_vbfdotq_f32_s16 (float32x4_t r, int16x8_t a, int16x8_t b)
+{
+  bfloat16x8_t _a = vreinterpretq_bf16_s16(a);
+  bfloat16x8_t _b = vreinterpretq_bf16_s16(b);
+
+  return vbfdotq_f32 (r, _a, _b);
+}
+
+float32x4_t
+test_vbfdotq_f32_s32 (float32x4_t r, int32x4_t a, int32x4_t b)
+{
+  bfloat16x8_t _a = vreinterpretq_bf16_s32(a);
+  bfloat16x8_t _b = vreinterpretq_bf16_s32(b);
+
+  return vbfdotq_f32 (r, _a, _b);
+}
+
+float32x4_t
+test_vbfdotq_f32_s64 (float32x4_t r, int64x2_t a, int64x2_t b)
+{
+  bfloat16x8_t _a = vreinterpretq_bf16_s64(a);
+  bfloat16x8_t _b = vreinterpretq_bf16_s64(b);
+
+  return vbfdotq_f32 (r, _a, _b);
+}
+
+float32x4_t
+test_vbfdotq_f32_u8 (float32x4_t r, uint8x16_t a, uint8x16_t b)
+{
+  bfloat16x8_t _a = vreinterpretq_bf16_u8(a);
+  bfloat16x8_t _b = vreinterpretq_bf16_u8(b);
+
+  return vbfdotq_f32 (r, _a, _b);
+}
+
+float32x4_t
+test_vbfdotq_f32_u16 (float32x4_t r, uint16x8_t a, uint16x8_t b)
+{
+  bfloat16x8_t _a = vreinterpretq_bf16_u16(a);
+  bfloat16x8_t _b = vreinterpretq_bf16_u16(b);
+
+  return vbfdotq_f32 (r, _a, _b);
+}
+
+float32x4_t
+test_vbfdotq_f32_u32 (float32x4_t r, uint32x4_t a, uint32x4_t b)
+{
+  bfloat16x8_t _a = vreinterpretq_bf16_u32(a);
+  bfloat16x8_t _b = vreinterpretq_bf16_u32(b);
+
+  return vbfdotq_f32 (r, _a, _b);
+}
+
+float32x4_t
+test_vbfdotq_f32_u64 (float32x4_t r, uint64x2_t a, uint64x2_t b)
+{
+  bfloat16x8_t _a = vreinterpretq_bf16_u64(a);
+  bfloat16x8_t _b = vreinterpretq_bf16_u64(b);
+
+  return vbfdotq_f32 (r, _a, _b);
+}
+
+float32x4_t
+test_vbfdotq_f32_p8 (float32x4_t r, poly8x16_t a, poly8x16_t b)
+{
+  bfloat16x8_t _a = vreinterpretq_bf16_p8(a);
+  bfloat16x8_t _b = vreinterpretq_bf16_p8(b);
+
+  return vbfdotq_f32 (r, _a, _b);
+}
+
+float32x4_t
+test_vbfdotq_f32_p16 (float32x4_t r, poly16x8_t a, poly16x8_t b)
+{
+  bfloat16x8_t _a = vreinterpretq_bf16_p16(a);
+  bfloat16x8_t _b = vreinterpretq_bf16_p16(b);
+
+  return vbfdotq_f32 (r, _a, _b);
+}
+
+float32x4_t
+test_vbfdotq_f32_p64 (float32x4_t r, poly64x2_t a, poly64x2_t b)
+{
+  bfloat16x8_t _a = vreinterpretq_bf16_p64(a);
+  bfloat16x8_t _b = vreinterpretq_bf16_p64(b);
+
+  return vbfdotq_f32 (r, _a, _b);
+}
+
+float32x4_t
+test_vbfdotq_f32_p128 (float32x4_t r, poly128_t a, poly128_t b)
+{
+  bfloat16x8_t _a = vreinterpretq_bf16_p128(a);
+  bfloat16x8_t _b = vreinterpretq_bf16_p128(b);
+
+  return vbfdotq_f32 (r, _a, _b);
+}
+
+float32x4_t
+test_vbfdotq_f32_f16 (float32x4_t r, float16x8_t a, float16x8_t b)
+{
+  bfloat16x8_t _a = vreinterpretq_bf16_f16(a);
+  bfloat16x8_t _b = vreinterpretq_bf16_f16(b);
+
+  return vbfdotq_f32 (r, _a, _b);
+}
+
+float32x4_t
+test_vbfdotq_f32_f32 (float32x4_t r, float32x4_t a, float32x4_t b)
+{
+  bfloat16x8_t _a = vreinterpretq_bf16_f32(a);
+  bfloat16x8_t _b = vreinterpretq_bf16_f32(b);
+
+  return vbfdotq_f32 (r, _a, _b);
+}
+
+/* { dg-final { scan-assembler-times {\tvdot.bf16\td[0-9]+, d[0-9]+, d[0-9]+\n} 13 } } */
+/* { dg-final { scan-assembler-times {\tvdot.bf16\tq[0-9]+, q[0-9]+, q[0-9]+\n} 14 } } */
+
+int8x8_t test_vreinterpret_s8_bf16 (bfloat16x4_t a, int8x8_t b)
+{
+  int8x8_t _a = vreinterpret_s8_bf16 (a);
+  return vadd_s8 (_a, b);
+}
+
+int16x4_t test_vreinterpret_s16_bf16 (bfloat16x4_t a, int16x4_t b)
+{
+  int16x4_t _a = vreinterpret_s16_bf16 (a);
+  return vadd_s16 (_a, b);
+}
+
+int32x2_t test_vreinterpret_s32_bf16 (bfloat16x4_t a, int32x2_t b)
+{
+  int32x2_t _a = vreinterpret_s32_bf16 (a);
+  return vadd_s32 (_a, b);
+}
+
+int64x1_t test_vreinterpret_s64_bf16 (bfloat16x4_t a, int64x1_t b)
+{
+  int64x1_t _a = vreinterpret_s64_bf16 (a);
+  return vrshl_s64 (_a, b);
+}
+
+uint8x8_t test_vreinterpret_u8_bf16 (bfloat16x4_t a, uint8x8_t b)
+{
+  uint8x8_t _a = vreinterpret_u8_bf16 (a);
+  return vadd_u8 (_a, b);
+}
+
+uint16x4_t test_vreinterpret_u16_bf16 (bfloat16x4_t a, uint16x4_t b)
+{
+  uint16x4_t _a = vreinterpret_u16_bf16 (a);
+  return vadd_u16 (_a, b);
+}
+
+uint32x2_t test_vreinterpret_u32_bf16 (bfloat16x4_t a, uint32x2_t b)
+{
+  uint32x2_t _a = vreinterpret_u32_bf16 (a);
+  return vadd_u32 (_a, b);
+}
+
+uint64x1_t test_vreinterpret_u64_bf16 (bfloat16x4_t a, int64x1_t b)
+{
+  uint64x1_t _a = vreinterpret_u64_bf16 (a);
+  return vrshl_u64 (_a, b);
+}
+
+poly8x8x2_t test_vreinterpret_p8_bf16 (bfloat16x4_t a, poly8x8_t b)
+{
+  poly8x8_t _a = vreinterpret_p8_bf16 (a);
+  return vzip_p8 (_a, b);
+}
+
+poly16x4x2_t test_vreinterpret_p16_bf16 (bfloat16x4_t a, poly16x4_t b)
+{
+  poly16x4_t _a = vreinterpret_p16_bf16 (a);
+  return vzip_p16 (_a, b);
+}
+
+poly64x1_t test_vreinterpret_p64_bf16 (bfloat16x4_t a, poly64x1_t b)
+{
+  poly64x1_t _a = vreinterpret_p64_bf16 (a);
+  return vsli_n_p64 (_a, b, 3);
+}
+
+float32x2_t test_vreinterpret_f32_bf16 (bfloat16x4_t a, float32x2_t b)
+{
+  float32x2_t _a = vreinterpret_f32_bf16 (a);
+  return vsub_f32 (_a, b);
+}
+
+int8x16_t test_vreinterpretq_s8_bf16 (bfloat16x8_t a, int8x16_t b)
+{
+  int8x16_t _a = vreinterpretq_s8_bf16 (a);
+  return vaddq_s8 (_a, b);
+}
+
+int16x8_t test_vreinterpretq_s16_bf16 (bfloat16x8_t a, int16x8_t b)
+{
+  int16x8_t _a = vreinterpretq_s16_bf16 (a);
+  return vaddq_s16 (_a, b);
+}
+
+int32x4_t test_vreinterpretq_s32_bf16 (bfloat16x8_t a, int32x4_t b)
+{
+  int32x4_t _a = vreinterpretq_s32_bf16 (a);
+  return vaddq_s32 (_a, b);
+}
+
+int64x2_t test_vreinterpretq_s64_bf16 (bfloat16x8_t a, int64x2_t b)
+{
+  int64x2_t _a = vreinterpretq_s64_bf16 (a);
+  return vaddq_s64 (_a, b);
+}
+
+uint8x16_t test_vreinterpretq_u8_bf16 (bfloat16x8_t a, uint8x16_t b)
+{
+  uint8x16_t _a = vreinterpretq_u8_bf16 (a);
+  return vaddq_u8 (_a, b);
+}
+
+uint16x8_t test_vreinterpretq_u16_bf16 (bfloat16x8_t a, uint16x8_t b)
+{
+  uint16x8_t _a = vreinterpretq_u16_bf16 (a);
+  return vaddq_u16 (_a, b);
+}
+
+uint32x4_t test_vreinterpretq_u32_bf16 (bfloat16x8_t a, uint32x4_t b)
+{
+  uint32x4_t _a = vreinterpretq_u32_bf16 (a);
+  return vaddq_u32 (_a, b);
+}
+
+uint64x2_t test_vreinterpretq_u64_bf16 (bfloat16x8_t a, uint64x2_t b)
+{
+  uint64x2_t _a = vreinterpretq_u64_bf16 (a);
+  return vaddq_u64 (_a, b);
+}
+
+poly8x16x2_t test_vreinterpretq_p8_bf16 (bfloat16x8_t a, poly8x16_t b)
+{
+  poly8x16_t _a = vreinterpretq_p8_bf16 (a);
+  return vzipq_p8 (_a, b);
+}
+
+poly16x8x2_t test_vreinterpretq_p16_bf16 (bfloat16x8_t a, poly16x8_t b)
+{
+  poly16x8_t _a = vreinterpretq_p16_bf16 (a);
+  return vzipq_p16 (_a, b);
+}
+
+poly64x2_t test_vreinterpretq_p64_bf16 (bfloat16x8_t a, poly64x2_t b)
+{
+  poly64x2_t _a = vreinterpretq_p64_bf16 (a);
+  return vsliq_n_p64 (_a, b, 3);
+}
+
+poly128_t test_vreinterpretq_p128_bf16 (bfloat16x8_t a, poly16x8_t b)
+{
+  poly128_t _a = vreinterpretq_p128_bf16 (a);
+  return _a;
+}
+
+float32x4_t test_vreinterpretq_f32_bf16 (bfloat16x8_t a, float32x4_t b)
+{
+  float32x4_t _a = vreinterpretq_f32_bf16 (a);
+  return vsubq_f32 (_a, b);
+}
+
+float16x4_t test_vreinterpret_f16_bf16 (bfloat16x4_t a)
+{
+  return vreinterpret_f16_bf16 (a);
+}
+
+float16x8_t test_vreinterpretq_f16_bf16 (bfloat16x8_t a)
+{
+  return vreinterpretq_f16_bf16 (a);
+}
+
+/* { dg-final { scan-assembler-times {\tvadd.i8\td[0-9]+, d[0-9]+, d[0-9]+\n} 2 } } */
+/* { dg-final { scan-assembler-times {\tvadd.i16\td[0-9]+, d[0-9]+, d[0-9]+\n} 2 } } */
+/* { dg-final { scan-assembler-times {\tvadd.i32\td[0-9]+, d[0-9]+, d[0-9]+\n} 2 } } */
+
+/* { dg-final { scan-assembler-times {\tvadd.i8\tq[0-9]+, q[0-9]+, q[0-9]+\n} 2 } } */
+/* { dg-final { scan-assembler-times {\tvadd.i16\tq[0-9]+, q[0-9]+, q[0-9]+\n} 2 } } */
+/* { dg-final { scan-assembler-times {\tvadd.i32\tq[0-9]+, q[0-9]+, q[0-9]+\n} 2 } } */
+/* { dg-final { scan-assembler-times {\tvadd.i64\tq[0-9]+, q[0-9]+, q[0-9]+\n} 2 } } */
+
+/* { dg-final { scan-assembler {\tvsub.f32\td[0-9]+, d[0-9]+, d[0-9]+\n} } } */
+/* { dg-final { scan-assembler {\tvsub.f32\tq[0-9]+, q[0-9]+, q[0-9]+\n} } } */
+
+/* { dg-final { scan-assembler {\tvzip.8\td[0-9]+, d[0-9]+\n} } } */
+/* { dg-final { scan-assembler {\tvzip.16\td[0-9]+, d[0-9]+\n} } } */
+/* { dg-final { scan-assembler {\tvzip.8\tq[0-9]+, q[0-9]+\n} } } */
+/* { dg-final { scan-assembler {\tvzip.16\tq[0-9]+, q[0-9]+\n} } } */
+
+/* { dg-final { scan-assembler {\tvrshl.s64\td[0-9]+, d[0-9]+, d[0-9]+\n} } } */
+/* { dg-final { scan-assembler {\tvrshl.u64\td[0-9]+, d[0-9]+, d[0-9]+\n} } } */
+
+/* { dg-final { scan-assembler {\tvsli.64\td[0-9]+, d[0-9]+, #3\n} } } */
+/* { dg-final { scan-assembler {\tvsli.64\tq[0-9]+, q[0-9]+, #3\n} } } */
-- 
2.7.4
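
(A note for readers, not part of the patch: the lane remapping done by
__arm_lane and __arm_laneq in the vget/vset macros above can be
sanity-checked with a host-side model.  A minimal sketch, assuming the
same XOR definitions as the patch's big-endian case; NUM_LANES and the
macro names here are made up for illustration.)

    #include <stdio.h>

    /* Model of the big-endian case: GCC's element index is the
       architectural lane index with the low bits flipped, so lanes are
       reversed within each 64-bit half of the register.  */
    #define NUM_LANES 8                                  /* elements per vector */
    #define ARM_LANE(idx)  ((idx) ^ (NUM_LANES - 1))     /* 64-bit vector */
    #define ARM_LANEQ(idx) ((idx) ^ (NUM_LANES / 2 - 1)) /* 128-bit vector */

    int
    main (void)
    {
      /* For bfloat16x8_t (eight lanes in a q register), architectural
         lanes 0..3 map to GCC indices 3..0 and 4..7 map to 7..4.  */
      for (int i = 0; i < NUM_LANES; i++)
        printf ("arch lane %d -> GCC index %d\n", i, ARM_LANEQ (i));
      return 0;
    }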