From 8ea6c1b89a20ef7c675535ba1994355361dac977 Mon Sep 17 00:00:00 2001 From: Mihail Ionescu Date: Tue, 18 Feb 2020 14:23:09 +0000 Subject: [PATCH] aarch64: Add bfloat16 vdup and vreinterpret ACLE intrinsics This patch adds support for the bf16 duplicate and reinterpret intrinsics. ACLE documents are at https://developer.arm.com/docs/101028/latest ISA documents are at https://developer.arm.com/docs/ddi0596/latest 2020-02-25 Mihail Ionescu gcc/ * config/aarch64/iterators.md (VDQF_F16) Add V4BF and V8BF. (VALL_F16): Likewise. (VALLDI_F16): Likewise. (Vtype): Likewise. (Vetype): Likewise. (vswap_width_name): Likewise. (VSWAP_WIDTH): Likewise. (Vel): Likewise. (VEL): Likewise. (q): Likewise. * config/aarch64/arm_neon.h (vset_lane_bf16, vsetq_lane_bf16): New. (vget_lane_bf16, vgetq_lane_bf16): New. (vcreate_bf16): New. (vdup_n_bf16, vdupq_n_bf16): New. (vdup_lane_bf16, vdup_laneq_bf16): New. (vdupq_lane_bf16, vdupq_laneq_bf16): New. (vduph_lane_bf16, vduph_laneq_bf16): New. (vreinterpret_bf16_u8, vreinterpretq_bf16_u8): New. (vreinterpret_bf16_u16, vreinterpretq_bf16_u16): New. (vreinterpret_bf16_u32, vreinterpretq_bf16_u32): New. (vreinterpret_bf16_u64, vreinterpretq_bf16_u64): New. (vreinterpret_bf16_s8, vreinterpretq_bf16_s8): New. (vreinterpret_bf16_s16, vreinterpretq_bf16_s16): New. (vreinterpret_bf16_s32, vreinterpretq_bf16_s32): New. (vreinterpret_bf16_s64, vreinterpretq_bf16_s64): New. (vreinterpret_bf16_p8, vreinterpretq_bf16_p8): New. (vreinterpret_bf16_p16, vreinterpretq_bf16_p16): New. (vreinterpret_bf16_p64, vreinterpretq_bf16_p64): New (vreinterpret_bf16_f16, vreinterpretq_bf16_f16): New (vreinterpret_bf16_f32, vreinterpretq_bf16_f32): New. (vreinterpret_bf16_f64, vreinterpretq_bf16_f64): New. (vreinterpretq_bf16_p128): New. (vreinterpret_s8_bf16, vreinterpretq_s8_bf16): New. (vreinterpret_s16_bf16, vreinterpretq_s16_bf16): New. (vreinterpret_s32_bf16, vreinterpretq_s32_bf16): New. (vreinterpret_s64_bf16, vreinterpretq_s64_bf16): New. (vreinterpret_u8_bf16, vreinterpretq_u8_bf16): New. (vreinterpret_u16_bf16, vreinterpretq_u16_bf16): New. (vreinterpret_u32_bf16, vreinterpretq_u32_bf16): New. (vreinterpret_u64_bf16, vreinterpretq_u64_bf16): New. (vreinterpret_p8_bf16, vreinterpretq_p8_bf16): New. (vreinterpret_p16_bf16, vreinterpretq_p16_bf16): New. (vreinterpret_p64_bf16, vreinterpretq_p64_bf16): New. (vreinterpret_f32_bf16, vreinterpretq_f32_bf16): New. (vreinterpret_f64_bf16,vreinterpretq_f64_bf16): New. (vreinterpret_f16_bf16,vreinterpretq_f16_bf16): New. (vreinterpretq_p128_bf16): New. gcc/testsuite/ * gcc.target/aarch64/advsimd-intrinsics/bf16_dup.c: New test. * gcc.target/aarch64/advsimd-intrinsics/bf16_reinterpret.c: New test. --- gcc/ChangeLog | 50 ++ gcc/config/aarch64/arm_neon.h | 501 +++++++++++++++++++++ gcc/config/aarch64/iterators.md | 14 +- gcc/testsuite/ChangeLog | 5 + .../aarch64/advsimd-intrinsics/bf16_dup.c | 85 ++++ .../aarch64/advsimd-intrinsics/bf16_reinterpret.c | 466 +++++++++++++++++++ 6 files changed, 1118 insertions(+), 3 deletions(-) create mode 100644 gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/bf16_dup.c create mode 100644 gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/bf16_reinterpret.c diff --git a/gcc/ChangeLog b/gcc/ChangeLog index bbb4a65..334a16e 100644 --- a/gcc/ChangeLog +++ b/gcc/ChangeLog @@ -1,3 +1,53 @@ +2020-02-25 Mihail Ionescu + + * config/aarch64/iterators.md (VDQF_F16) Add V4BF and V8BF. + (VALL_F16): Likewise. + (VALLDI_F16): Likewise. + (Vtype): Likewise. + (Vetype): Likewise. + (vswap_width_name): Likewise. + (VSWAP_WIDTH): Likewise. + (Vel): Likewise. + (VEL): Likewise. + (q): Likewise. + * config/aarch64/arm_neon.h (vset_lane_bf16, vsetq_lane_bf16): New. + (vget_lane_bf16, vgetq_lane_bf16): New. + (vcreate_bf16): New. + (vdup_n_bf16, vdupq_n_bf16): New. + (vdup_lane_bf16, vdup_laneq_bf16): New. + (vdupq_lane_bf16, vdupq_laneq_bf16): New. + (vduph_lane_bf16, vduph_laneq_bf16): New. + (vreinterpret_bf16_u8, vreinterpretq_bf16_u8): New. + (vreinterpret_bf16_u16, vreinterpretq_bf16_u16): New. + (vreinterpret_bf16_u32, vreinterpretq_bf16_u32): New. + (vreinterpret_bf16_u64, vreinterpretq_bf16_u64): New. + (vreinterpret_bf16_s8, vreinterpretq_bf16_s8): New. + (vreinterpret_bf16_s16, vreinterpretq_bf16_s16): New. + (vreinterpret_bf16_s32, vreinterpretq_bf16_s32): New. + (vreinterpret_bf16_s64, vreinterpretq_bf16_s64): New. + (vreinterpret_bf16_p8, vreinterpretq_bf16_p8): New. + (vreinterpret_bf16_p16, vreinterpretq_bf16_p16): New. + (vreinterpret_bf16_p64, vreinterpretq_bf16_p64): New + (vreinterpret_bf16_f16, vreinterpretq_bf16_f16): New + (vreinterpret_bf16_f32, vreinterpretq_bf16_f32): New. + (vreinterpret_bf16_f64, vreinterpretq_bf16_f64): New. + (vreinterpretq_bf16_p128): New. + (vreinterpret_s8_bf16, vreinterpretq_s8_bf16): New. + (vreinterpret_s16_bf16, vreinterpretq_s16_bf16): New. + (vreinterpret_s32_bf16, vreinterpretq_s32_bf16): New. + (vreinterpret_s64_bf16, vreinterpretq_s64_bf16): New. + (vreinterpret_u8_bf16, vreinterpretq_u8_bf16): New. + (vreinterpret_u16_bf16, vreinterpretq_u16_bf16): New. + (vreinterpret_u32_bf16, vreinterpretq_u32_bf16): New. + (vreinterpret_u64_bf16, vreinterpretq_u64_bf16): New. + (vreinterpret_p8_bf16, vreinterpretq_p8_bf16): New. + (vreinterpret_p16_bf16, vreinterpretq_p16_bf16): New. + (vreinterpret_p64_bf16, vreinterpretq_p64_bf16): New. + (vreinterpret_f32_bf16, vreinterpretq_f32_bf16): New. + (vreinterpret_f64_bf16,vreinterpretq_f64_bf16): New. + (vreinterpret_f16_bf16,vreinterpretq_f16_bf16): New. + (vreinterpretq_p128_bf16): New. + 2020-02-25 Dennis Zhang * config/arm/arm_neon.h (vbfdot_f32, vbfdotq_f32): New diff --git a/gcc/config/aarch64/arm_neon.h b/gcc/config/aarch64/arm_neon.h index 6a2220a..a4f2dd2 100644 --- a/gcc/config/aarch64/arm_neon.h +++ b/gcc/config/aarch64/arm_neon.h @@ -34554,6 +34554,507 @@ vrnd64xq_f64 (float64x2_t __a) #pragma GCC push_options #pragma GCC target ("arch=armv8.2-a+bf16") +__extension__ extern __inline bfloat16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vset_lane_bf16 (bfloat16_t __elem, bfloat16x4_t __vec, const int __index) +{ + return __aarch64_vset_lane_any (__elem, __vec, __index); +} + +__extension__ extern __inline bfloat16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vsetq_lane_bf16 (bfloat16_t __elem, bfloat16x8_t __vec, const int __index) +{ + return __aarch64_vset_lane_any (__elem, __vec, __index); +} + +__extension__ extern __inline bfloat16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vget_lane_bf16 (bfloat16x4_t __a, const int __b) +{ + return __aarch64_vget_lane_any (__a, __b); +} + +__extension__ extern __inline bfloat16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vgetq_lane_bf16 (bfloat16x8_t __a, const int __b) +{ + return __aarch64_vget_lane_any (__a, __b); +} + +__extension__ extern __inline bfloat16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vcreate_bf16 (uint64_t __a) +{ + return (bfloat16x4_t) __a; +} + +/* vdup */ + +__extension__ extern __inline bfloat16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vdup_n_bf16 (bfloat16_t __a) +{ + return (bfloat16x4_t) {__a, __a, __a, __a}; +} + +__extension__ extern __inline bfloat16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vdupq_n_bf16 (bfloat16_t __a) +{ + return (bfloat16x8_t) {__a, __a, __a, __a, __a, __a, __a, __a}; +} + +__extension__ extern __inline bfloat16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vdup_lane_bf16 (bfloat16x4_t __a, const int __b) +{ + return vdup_n_bf16 (__aarch64_vget_lane_any (__a, __b)); +} + +__extension__ extern __inline bfloat16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vdup_laneq_bf16 (bfloat16x8_t __a, const int __b) +{ + return vdup_n_bf16 (__aarch64_vget_lane_any (__a, __b)); +} + +__extension__ extern __inline bfloat16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vdupq_lane_bf16 (bfloat16x4_t __a, const int __b) +{ + return vdupq_n_bf16 (__aarch64_vget_lane_any (__a, __b)); +} + +__extension__ extern __inline bfloat16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vdupq_laneq_bf16 (bfloat16x8_t __a, const int __b) +{ + return vdupq_n_bf16 (__aarch64_vget_lane_any (__a, __b)); +} + +__extension__ extern __inline bfloat16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vduph_lane_bf16 (bfloat16x4_t __a, const int __b) +{ + return __aarch64_vget_lane_any (__a, __b); +} + +__extension__ extern __inline bfloat16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vduph_laneq_bf16 (bfloat16x8_t __a, const int __b) +{ + return __aarch64_vget_lane_any (__a, __b); +} + +/* vreinterpret */ + +__extension__ extern __inline bfloat16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vreinterpret_bf16_u8 (uint8x8_t __a) +{ + return (bfloat16x4_t)__a; +} + +__extension__ extern __inline bfloat16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vreinterpret_bf16_u16 (uint16x4_t __a) +{ + return (bfloat16x4_t)__a; +} + +__extension__ extern __inline bfloat16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vreinterpret_bf16_u32 (uint32x2_t __a) +{ + return (bfloat16x4_t)__a; +} + +__extension__ extern __inline bfloat16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vreinterpret_bf16_u64 (uint64x1_t __a) +{ + return (bfloat16x4_t)__a; +} + +__extension__ extern __inline bfloat16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vreinterpret_bf16_s8 (int8x8_t __a) +{ + return (bfloat16x4_t)__a; +} + +__extension__ extern __inline bfloat16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vreinterpret_bf16_s16 (int16x4_t __a) +{ + return (bfloat16x4_t)__a; +} + +__extension__ extern __inline bfloat16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vreinterpret_bf16_s32 (int32x2_t __a) +{ + return (bfloat16x4_t)__a; +} + +__extension__ extern __inline bfloat16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vreinterpret_bf16_s64 (int64x1_t __a) +{ + return (bfloat16x4_t)__a; +} + +__extension__ extern __inline bfloat16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vreinterpret_bf16_p8 (poly8x8_t __a) +{ + return (bfloat16x4_t)__a; +} + +__extension__ extern __inline bfloat16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vreinterpret_bf16_p16 (poly16x4_t __a) +{ + return (bfloat16x4_t)__a; +} + +__extension__ extern __inline bfloat16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vreinterpret_bf16_p64 (poly64x1_t __a) +{ + return (bfloat16x4_t)__a; +} + +__extension__ extern __inline bfloat16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vreinterpret_bf16_f16 (float16x4_t __a) +{ + return (bfloat16x4_t)__a; +} + +__extension__ extern __inline bfloat16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vreinterpret_bf16_f32 (float32x2_t __a) +{ + return (bfloat16x4_t)__a; +} + +__extension__ extern __inline bfloat16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vreinterpret_bf16_f64 (float64x1_t __a) +{ + return (bfloat16x4_t)__a; +} + +__extension__ extern __inline bfloat16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vreinterpretq_bf16_u8 (uint8x16_t __a) +{ + return (bfloat16x8_t)__a; +} + +__extension__ extern __inline bfloat16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vreinterpretq_bf16_u16 (uint16x8_t __a) +{ + return (bfloat16x8_t)__a; +} + +__extension__ extern __inline bfloat16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vreinterpretq_bf16_u32 (uint32x4_t __a) +{ + return (bfloat16x8_t)__a; +} + +__extension__ extern __inline bfloat16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vreinterpretq_bf16_u64 (uint64x2_t __a) +{ + return (bfloat16x8_t)__a; +} + +__extension__ extern __inline bfloat16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vreinterpretq_bf16_s8 (int8x16_t __a) +{ + return (bfloat16x8_t)__a; +} + +__extension__ extern __inline bfloat16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vreinterpretq_bf16_s16 (int16x8_t __a) +{ + return (bfloat16x8_t)__a; +} + +__extension__ extern __inline bfloat16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vreinterpretq_bf16_s32 (int32x4_t __a) +{ + return (bfloat16x8_t)__a; +} + +__extension__ extern __inline bfloat16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vreinterpretq_bf16_s64 (int64x2_t __a) +{ + return (bfloat16x8_t)__a; +} + +__extension__ extern __inline bfloat16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vreinterpretq_bf16_p8 (poly8x16_t __a) +{ + return (bfloat16x8_t)__a; +} + +__extension__ extern __inline bfloat16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vreinterpretq_bf16_p16 (poly16x8_t __a) +{ + return (bfloat16x8_t)__a; +} + +__extension__ extern __inline bfloat16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vreinterpretq_bf16_p64 (poly64x2_t __a) +{ + return (bfloat16x8_t)__a; +} + +__extension__ extern __inline bfloat16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vreinterpretq_bf16_p128 (poly128_t __a) +{ + return (bfloat16x8_t)__a; +} + +__extension__ extern __inline bfloat16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vreinterpretq_bf16_f16 (float16x8_t __a) +{ + return (bfloat16x8_t)__a; +} + +__extension__ extern __inline bfloat16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vreinterpretq_bf16_f32 (float32x4_t __a) +{ + return (bfloat16x8_t)__a; +} + +__extension__ extern __inline bfloat16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vreinterpretq_bf16_f64 (float64x2_t __a) +{ + return (bfloat16x8_t)__a; +} + +__extension__ extern __inline int8x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vreinterpret_s8_bf16 (bfloat16x4_t __a) +{ + return (int8x8_t)__a; +} + +__extension__ extern __inline int16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vreinterpret_s16_bf16 (bfloat16x4_t __a) +{ + return (int16x4_t)__a; +} + +__extension__ extern __inline int32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vreinterpret_s32_bf16 (bfloat16x4_t __a) +{ + return (int32x2_t)__a; +} + +__extension__ extern __inline int64x1_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vreinterpret_s64_bf16 (bfloat16x4_t __a) +{ + return (int64x1_t)__a; +} + +__extension__ extern __inline uint8x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vreinterpret_u8_bf16 (bfloat16x4_t __a) +{ + return (uint8x8_t)__a; +} + +__extension__ extern __inline uint16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vreinterpret_u16_bf16 (bfloat16x4_t __a) +{ + return (uint16x4_t)__a; +} + +__extension__ extern __inline uint32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vreinterpret_u32_bf16 (bfloat16x4_t __a) +{ + return (uint32x2_t)__a; +} + +__extension__ extern __inline uint64x1_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vreinterpret_u64_bf16 (bfloat16x4_t __a) +{ + return (uint64x1_t)__a; +} + +__extension__ extern __inline float16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vreinterpret_f16_bf16 (bfloat16x4_t __a) +{ + return (float16x4_t)__a; +} + +__extension__ extern __inline float32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vreinterpret_f32_bf16 (bfloat16x4_t __a) +{ + return (float32x2_t)__a; +} + +__extension__ extern __inline float64x1_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vreinterpret_f64_bf16 (bfloat16x4_t __a) +{ + return (float64x1_t)__a; +} + +__extension__ extern __inline poly8x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vreinterpret_p8_bf16 (bfloat16x4_t __a) +{ + return (poly8x8_t)__a; +} + +__extension__ extern __inline poly16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vreinterpret_p16_bf16 (bfloat16x4_t __a) +{ + return (poly16x4_t)__a; +} + +__extension__ extern __inline poly64x1_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vreinterpret_p64_bf16 (bfloat16x4_t __a) +{ + return (poly64x1_t)__a; +} + +__extension__ extern __inline int8x16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vreinterpretq_s8_bf16 (bfloat16x8_t __a) +{ + return (int8x16_t)__a; +} + +__extension__ extern __inline int16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vreinterpretq_s16_bf16 (bfloat16x8_t __a) +{ + return (int16x8_t)__a; +} + +__extension__ extern __inline int32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vreinterpretq_s32_bf16 (bfloat16x8_t __a) +{ + return (int32x4_t)__a; +} + +__extension__ extern __inline int64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vreinterpretq_s64_bf16 (bfloat16x8_t __a) +{ + return (int64x2_t)__a; +} + +__extension__ extern __inline uint8x16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vreinterpretq_u8_bf16 (bfloat16x8_t __a) +{ + return (uint8x16_t)__a; +} + +__extension__ extern __inline uint16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vreinterpretq_u16_bf16 (bfloat16x8_t __a) +{ + return (uint16x8_t)__a; +} + +__extension__ extern __inline uint32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vreinterpretq_u32_bf16 (bfloat16x8_t __a) +{ + return (uint32x4_t)__a; +} + +__extension__ extern __inline uint64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vreinterpretq_u64_bf16 (bfloat16x8_t __a) +{ + return (uint64x2_t)__a; +} + +__extension__ extern __inline float16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vreinterpretq_f16_bf16 (bfloat16x8_t __a) +{ + return (float16x8_t)__a; +} + +__extension__ extern __inline float32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vreinterpretq_f32_bf16 (bfloat16x8_t __a) +{ + return (float32x4_t)__a; +} + +__extension__ extern __inline float64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vreinterpretq_f64_bf16 (bfloat16x8_t __a) +{ + return (float64x2_t)__a; +} + +__extension__ extern __inline poly8x16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vreinterpretq_p8_bf16 (bfloat16x8_t __a) +{ + return (poly8x16_t)__a; +} + +__extension__ extern __inline poly16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vreinterpretq_p16_bf16 (bfloat16x8_t __a) +{ + return (poly16x8_t)__a; +} + +__extension__ extern __inline poly64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vreinterpretq_p64_bf16 (bfloat16x8_t __a) +{ + return (poly64x2_t)__a; +} + +__extension__ extern __inline poly128_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vreinterpretq_p128_bf16 (bfloat16x8_t __a) +{ + return (poly128_t)__a; +} + __extension__ extern __inline float32x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vbfdot_f32 (float32x2_t __r, bfloat16x4_t __a, bfloat16x4_t __b) diff --git a/gcc/config/aarch64/iterators.md b/gcc/config/aarch64/iterators.md index b106957..571a5fa 100644 --- a/gcc/config/aarch64/iterators.md +++ b/gcc/config/aarch64/iterators.md @@ -139,7 +139,8 @@ (define_mode_iterator PTR [(SI "ptr_mode == SImode") (DI "ptr_mode == DImode")]) ;; Advanced SIMD Float modes suitable for moving, loading and storing. -(define_mode_iterator VDQF_F16 [V4HF V8HF V2SF V4SF V2DF]) +(define_mode_iterator VDQF_F16 [V4HF V8HF V2SF V4SF V2DF + V4BF V8BF]) ;; Advanced SIMD Float modes. (define_mode_iterator VDQF [V2SF V4SF V2DF]) @@ -180,7 +181,7 @@ ;; All Advanced SIMD modes suitable for moving, loading, and storing. (define_mode_iterator VALL_F16 [V8QI V16QI V4HI V8HI V2SI V4SI V2DI - V4HF V8HF V2SF V4SF V2DF]) + V4HF V8HF V4BF V8BF V2SF V4SF V2DF]) ;; All Advanced SIMD modes suitable for moving, loading, and storing, ;; including special Bfloat vector types. @@ -196,7 +197,7 @@ ;; All Advanced SIMD modes and DI. (define_mode_iterator VALLDI_F16 [V8QI V16QI V4HI V8HI V2SI V4SI V2DI - V4HF V8HF V2SF V4SF V2DF DI]) + V4HF V8HF V4BF V8BF V2SF V4SF V2DF DI]) ;; All Advanced SIMD modes, plus DI and DF. (define_mode_iterator VALLDIF [V8QI V16QI V4HI V8HI V2SI V4SI @@ -972,6 +973,7 @@ (define_mode_attr Vtype [(V8QI "8b") (V16QI "16b") (V4HI "4h") (V8HI "8h") + (V4BF "4h") (V8BF "8h") (V2SI "2s") (V4SI "4s") (DI "1d") (DF "1d") (V2DI "2d") (V2SF "2s") @@ -1015,6 +1017,7 @@ (VNx4SF "s") (VNx2SF "s") (VNx2DI "d") (VNx2DF "d") + (BF "h") (V4BF "h") (V8BF "h") (HF "h") (SF "s") (DF "d") (QI "b") (HI "h") @@ -1083,6 +1086,7 @@ (DF "DF") (V2DF "DF") (SI "SI") (HI "HI") (QI "QI") + (V4BF "BF") (V8BF "BF") (VNx16QI "QI") (VNx8QI "QI") (VNx4QI "QI") (VNx2QI "QI") (VNx8HI "HI") (VNx4HI "HI") (VNx2HI "HI") (VNx8HF "HF") (VNx4HF "HF") (VNx2HF "HF") @@ -1102,6 +1106,7 @@ (V2DF "df") (DF "df") (SI "si") (HI "hi") (QI "qi") + (V4BF "bf") (V8BF "bf") (VNx16QI "qi") (VNx8QI "qi") (VNx4QI "qi") (VNx2QI "qi") (VNx8HI "hi") (VNx4HI "hi") (VNx2HI "hi") (VNx8HF "hf") (VNx4HF "hf") (VNx2HF "hf") @@ -1422,6 +1427,7 @@ (define_mode_attr VSWAP_WIDTH [(V8QI "V16QI") (V16QI "V8QI") (V4HI "V8HI") (V8HI "V4HI") + (V8BF "V4BF") (V4BF "V8BF") (V2SI "V4SI") (V4SI "V2SI") (DI "V2DI") (V2DI "DI") (V2SF "V4SF") (V4SF "V2SF") @@ -1434,6 +1440,7 @@ (DI "to_128") (V2DI "to_64") (V4HF "to_128") (V8HF "to_64") (V2SF "to_128") (V4SF "to_64") + (V4BF "to_128") (V8BF "to_64") (DF "to_128") (V2DF "to_64")]) ;; For certain vector-by-element multiplication instructions we must @@ -1467,6 +1474,7 @@ ;; Defined to '_q' for 128-bit types. (define_mode_attr q [(V8QI "") (V16QI "_q") (V4HI "") (V8HI "_q") + (V4BF "") (V8BF "_q") (V2SI "") (V4SI "_q") (DI "") (V2DI "_q") (V4HF "") (V8HF "_q") diff --git a/gcc/testsuite/ChangeLog b/gcc/testsuite/ChangeLog index 7b24b54..c942486 100644 --- a/gcc/testsuite/ChangeLog +++ b/gcc/testsuite/ChangeLog @@ -1,3 +1,8 @@ +2020-02-25 Mihail Ionescu + + * gcc.target/aarch64/advsimd-intrinsics/bf16_dup.c: New test. + * gcc.target/aarch64/advsimd-intrinsics/bf16_reinterpret.c: New test. + 2020-02-25 Dennis Zhang * gcc.target/arm/simd/bf16_dot_1.c: New test. diff --git a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/bf16_dup.c b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/bf16_dup.c new file mode 100644 index 0000000..c42c7ac --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/bf16_dup.c @@ -0,0 +1,85 @@ +/* { dg-do assemble { target { aarch64*-*-* } } } */ +/* { dg-require-effective-target arm_v8_2a_bf16_neon_ok } */ +/* { dg-options "-O2" } */ +/* { dg-add-options arm_v8_2a_bf16_neon } */ +/* { dg-additional-options "-save-temps" } */ + +#include + +float32x2_t test_vcreate (float32x2_t r, uint64_t a, uint64_t b) +{ + bfloat16x4_t _a = vcreate_bf16(a); + bfloat16x4_t _b = vcreate_bf16(b); + + return vbfdot_f32 (r, _a, _b); +} +/* { dg-final { scan-assembler {bfdot\tv[0-9]+.2s, v[0-9]+.4h, v[0-9]+.4h} } } */ + +bfloat16x4_t test_vset_lane_bf16 (bfloat16_t a, bfloat16x4_t b) +{ + return vset_lane_bf16 (a, b, 3); +} + +bfloat16x8_t test_vsetq_lane_bf16 (bfloat16_t a, bfloat16x8_t b) +{ + return vsetq_lane_bf16 (a, b, 7); +} +/* { dg-final { scan-assembler-times "ins\\t" 2 } } */ + +bfloat16x4_t vdup_test (bfloat16_t a) +{ + return vdup_n_bf16 (a); +} +/* { dg-final { scan-assembler "dup\\tv\[0-9\]+\.4h, v\[0-9\]+.h\\\[0\\\]" } } */ + +bfloat16x8_t vdupq_test (bfloat16_t a) +{ + return vdupq_n_bf16 (a); +} + +bfloat16x8_t test_vdupq_lane_bf16 (bfloat16x4_t a) +{ + return vdupq_lane_bf16 (a, 1); +} +/* { dg-final { scan-assembler-times "dup\\tv\[0-9\]+\.8h, v\[0-9\]+.h\\\[0\\\]" 2 } } */ + +bfloat16_t test_vget_lane_bf16 (bfloat16x4_t a) +{ + return vget_lane_bf16 (a, 1); +} +/* { dg-final { scan-assembler-times "dup\\th\[0-9\]+, v\[0-9\]+\.h\\\[1\\\]" 2 } } */ + +bfloat16x4_t test_vdup_lane_bf16 (bfloat16x4_t a) +{ + return vdup_lane_bf16 (a, 1); +} +/* { dg-final { scan-assembler "dup\\tv\[0-9\]+\.4h, v\[0-9\]+\.h\\\[1\\\]" } } */ + +bfloat16x4_t test_vdup_laneq_bf16 (bfloat16x8_t a) +{ + return vdup_laneq_bf16 (a, 7); +} +/* { dg-final { scan-assembler "dup\\tv\[0-9\]+\.8h, v\[0-9\]+\.h\\\[7\\\]" } } */ + +bfloat16x8_t test_vdupq_laneq_bf16 (bfloat16x8_t a) +{ + return vdupq_laneq_bf16 (a, 5); +} +/* { dg-final { scan-assembler "dup\\tv\[0-9\]+\.8h, v\[0-9\]+\.h\\\[5\\\]" } } */ + +bfloat16_t test_vduph_lane_bf16 (bfloat16x4_t a) +{ + return vduph_lane_bf16 (a, 3); +} +/* { dg-final { scan-assembler "dup\\th\[0-9\]+, v\[0-9\]+\.h\\\[3\\\]" } } */ + +bfloat16_t test_vgetq_lane_bf16 (bfloat16x8_t a) +{ + return vgetq_lane_bf16 (a, 7); +} + +bfloat16_t test_vduph_laneq_bf16 (bfloat16x8_t a) +{ + return vduph_laneq_bf16 (a, 7); +} +/* { dg-final { scan-assembler-times "dup\\th\[0-9\]+, v\[0-9\]+\.h\\\[7\\\]" 2 } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/bf16_reinterpret.c b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/bf16_reinterpret.c new file mode 100644 index 0000000..f5adf40 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/bf16_reinterpret.c @@ -0,0 +1,466 @@ +/* { dg-do assemble { target { aarch64*-*-* } } } */ +/* { dg-require-effective-target arm_v8_2a_bf16_neon_ok } */ +/* { dg-add-options arm_v8_2a_bf16_neon } */ +/* { dg-additional-options "-save-temps" } */ + +#include + +float32x2_t +test_vbfdot_f32_s8 (float32x2_t r, int8x8_t a, int8x8_t b) +{ + bfloat16x4_t _a = vreinterpret_bf16_s8(a); + bfloat16x4_t _b = vreinterpret_bf16_s8(b); + + return vbfdot_f32 (r, _a, _b); +} + +float32x2_t +test_vbfdot_f32_s16 (float32x2_t r, int16x4_t a, int16x4_t b) +{ + bfloat16x4_t _a = vreinterpret_bf16_s16(a); + bfloat16x4_t _b = vreinterpret_bf16_s16(b); + + return vbfdot_f32 (r, _a, _b); +} + +float32x2_t +test_vbfdot_f32_s32 (float32x2_t r, int32x2_t a, int32x2_t b) +{ + bfloat16x4_t _a = vreinterpret_bf16_s32(a); + bfloat16x4_t _b = vreinterpret_bf16_s32(b); + + return vbfdot_f32 (r, _a, _b); +} + +float32x2_t +test_vbfdot_f32_s64 (float32x2_t r, int64x1_t a, int64x1_t b) +{ + bfloat16x4_t _a = vreinterpret_bf16_s64(a); + bfloat16x4_t _b = vreinterpret_bf16_s64(b); + + return vbfdot_f32 (r, _a, _b); +} + +float32x2_t +test_vbfdot_f32_u8 (float32x2_t r, uint8x8_t a, uint8x8_t b) +{ + bfloat16x4_t _a = vreinterpret_bf16_u8(a); + bfloat16x4_t _b = vreinterpret_bf16_u8(b); + + return vbfdot_f32 (r, _a, _b); +} + +float32x2_t +test_vbfdot_f32_u16 (float32x2_t r, uint16x4_t a, uint16x4_t b) +{ + bfloat16x4_t _a = vreinterpret_bf16_u16(a); + bfloat16x4_t _b = vreinterpret_bf16_u16(b); + + return vbfdot_f32 (r, _a, _b); +} + +float32x2_t +test_vbfdot_f32_u32 (float32x2_t r, uint32x2_t a, uint32x2_t b) +{ + bfloat16x4_t _a = vreinterpret_bf16_u32(a); + bfloat16x4_t _b = vreinterpret_bf16_u32(b); + + return vbfdot_f32 (r, _a, _b); +} + +float32x2_t +test_vbfdot_f32_u64 (float32x2_t r, uint64x1_t a, uint64x1_t b) +{ + bfloat16x4_t _a = vreinterpret_bf16_u64(a); + bfloat16x4_t _b = vreinterpret_bf16_u64(b); + + return vbfdot_f32 (r, _a, _b); +} + +float32x2_t +test_vbfdot_f32_p8 (float32x2_t r, poly8x8_t a, poly8x8_t b) +{ + bfloat16x4_t _a = vreinterpret_bf16_p8(a); + bfloat16x4_t _b = vreinterpret_bf16_p8(b); + + return vbfdot_f32 (r, _a, _b); +} + +float32x2_t +test_vbfdot_f32_p16 (float32x2_t r, poly16x4_t a, poly16x4_t b) +{ + bfloat16x4_t _a = vreinterpret_bf16_p16(a); + bfloat16x4_t _b = vreinterpret_bf16_p16(b); + + return vbfdot_f32 (r, _a, _b); +} + +float32x2_t +test_vbfdot_f32_p64 (float32x2_t r, poly64x1_t a, poly64x1_t b) +{ + bfloat16x4_t _a = vreinterpret_bf16_p64(a); + bfloat16x4_t _b = vreinterpret_bf16_p64(b); + + return vbfdot_f32 (r, _a, _b); +} + +float32x2_t +test_vbfdot_f32_f16 (float32x2_t r, float16x4_t a, float16x4_t b) +{ + bfloat16x4_t _a = vreinterpret_bf16_f16(a); + bfloat16x4_t _b = vreinterpret_bf16_f16(b); + + return vbfdot_f32 (r, _a, _b); +} + +float32x2_t +test_vbfdot_f32_f32 (float32x2_t r, float32x2_t a, float32x2_t b) +{ + bfloat16x4_t _a = vreinterpret_bf16_f32(a); + bfloat16x4_t _b = vreinterpret_bf16_f32(b); + + return vbfdot_f32 (r, _a, _b); +} + +float32x2_t +test_vbfdot_f32_f64 (float32x2_t r, float64x1_t a, float64x1_t b) +{ + bfloat16x4_t _a = vreinterpret_bf16_f64(a); + bfloat16x4_t _b = vreinterpret_bf16_f64(b); + + return vbfdot_f32 (r, _a, _b); +} + +float32x4_t +test_vbfdotq_f32_s8 (float32x4_t r, int8x16_t a, int8x16_t b) +{ + bfloat16x8_t _a = vreinterpretq_bf16_s8(a); + bfloat16x8_t _b = vreinterpretq_bf16_s8(b); + + return vbfdotq_f32 (r, _a, _b); +} + +float32x4_t +test_vbfdotq_f32_s16 (float32x4_t r, int16x8_t a, int16x8_t b) +{ + bfloat16x8_t _a = vreinterpretq_bf16_s16(a); + bfloat16x8_t _b = vreinterpretq_bf16_s16(b); + + return vbfdotq_f32 (r, _a, _b); +} + +float32x4_t +test_vbfdotq_f32_s32 (float32x4_t r, int32x4_t a, int32x4_t b) +{ + bfloat16x8_t _a = vreinterpretq_bf16_s32(a); + bfloat16x8_t _b = vreinterpretq_bf16_s32(b); + + return vbfdotq_f32 (r, _a, _b); +} + +float32x4_t +test_vbfdotq_f32_s64 (float32x4_t r, int64x2_t a, int64x2_t b) +{ + bfloat16x8_t _a = vreinterpretq_bf16_s64(a); + bfloat16x8_t _b = vreinterpretq_bf16_s64(b); + + return vbfdotq_f32 (r, _a, _b); +} + +float32x4_t +test_vbfdotq_f32_u8 (float32x4_t r, uint8x16_t a, uint8x16_t b) +{ + bfloat16x8_t _a = vreinterpretq_bf16_u8(a); + bfloat16x8_t _b = vreinterpretq_bf16_u8(b); + + return vbfdotq_f32 (r, _a, _b); +} + +float32x4_t +test_vbfdotq_f32_u16 (float32x4_t r, uint16x8_t a, uint16x8_t b) +{ + bfloat16x8_t _a = vreinterpretq_bf16_u16(a); + bfloat16x8_t _b = vreinterpretq_bf16_u16(b); + + return vbfdotq_f32 (r, _a, _b); +} + +float32x4_t +test_vbfdotq_f32_u32 (float32x4_t r, uint32x4_t a, uint32x4_t b) +{ + bfloat16x8_t _a = vreinterpretq_bf16_u32(a); + bfloat16x8_t _b = vreinterpretq_bf16_u32(b); + + return vbfdotq_f32 (r, _a, _b); +} + +float32x4_t +test_vbfdotq_f32_u64 (float32x4_t r, uint64x2_t a, uint64x2_t b) +{ + bfloat16x8_t _a = vreinterpretq_bf16_u64(a); + bfloat16x8_t _b = vreinterpretq_bf16_u64(b); + + return vbfdotq_f32 (r, _a, _b); +} + +float32x4_t +test_vbfdotq_f32_p8 (float32x4_t r, poly8x16_t a, poly8x16_t b) +{ + bfloat16x8_t _a = vreinterpretq_bf16_p8(a); + bfloat16x8_t _b = vreinterpretq_bf16_p8(b); + + return vbfdotq_f32 (r, _a, _b); +} + +float32x4_t +test_vbfdotq_f32_p16 (float32x4_t r, poly16x8_t a, poly16x8_t b) +{ + bfloat16x8_t _a = vreinterpretq_bf16_p16(a); + bfloat16x8_t _b = vreinterpretq_bf16_p16(b); + + return vbfdotq_f32 (r, _a, _b); +} + +float32x4_t +test_vbfdotq_f32_p64 (float32x4_t r, poly64x2_t a, poly64x2_t b) +{ + bfloat16x8_t _a = vreinterpretq_bf16_p64(a); + bfloat16x8_t _b = vreinterpretq_bf16_p64(b); + + return vbfdotq_f32 (r, _a, _b); +} + +float32x4_t +test_vbfdotq_f32_p128 (float32x4_t r, poly128_t a, poly128_t b) +{ + bfloat16x8_t _a = vreinterpretq_bf16_p128(a); + bfloat16x8_t _b = vreinterpretq_bf16_p128(b); + + return vbfdotq_f32 (r, _a, _b); +} + +float32x4_t +test_vbfdotq_f32_f16 (float32x4_t r, float16x8_t a, float16x8_t b) +{ + bfloat16x8_t _a = vreinterpretq_bf16_f16(a); + bfloat16x8_t _b = vreinterpretq_bf16_f16(b); + + return vbfdotq_f32 (r, _a, _b); +} + +float32x4_t +test_vbfdotq_f32_f32 (float32x4_t r, float32x4_t a, float32x4_t b) +{ + bfloat16x8_t _a = vreinterpretq_bf16_f32(a); + bfloat16x8_t _b = vreinterpretq_bf16_f32(b); + + return vbfdotq_f32 (r, _a, _b); +} + +float32x4_t +test_vbfdotq_f32_f64 (float32x4_t r, float64x2_t a, float64x2_t b) +{ + bfloat16x8_t _a = vreinterpretq_bf16_f64(a); + bfloat16x8_t _b = vreinterpretq_bf16_f64(b); + + return vbfdotq_f32 (r, _a, _b); +} + +/* { dg-final { scan-assembler-times {bfdot\tv[0-9]+.2s, v[0-9]+.4h, v[0-9]+.4h} 14 } } */ +/* { dg-final { scan-assembler-times {bfdot\tv[0-9]+.4s, v[0-9]+.8h, v[0-9]+.8h} 15 } } */ + +int8x8_t test_vreinterpret_s8_bf16 (bfloat16x4_t a, int8x8_t b) +{ + int8x8_t _a = vreinterpret_s8_bf16 (a); + return vadd_s8 (_a, b); +} + +int16x4_t test_vreinterpret_s16_bf16 (bfloat16x4_t a, int16x4_t b) +{ + int16x4_t _a = vreinterpret_s16_bf16 (a); + return vadd_s16 (_a, b); +} + +int32x2_t test_vreinterpret_s32_bf16 (bfloat16x4_t a, int32x2_t b) +{ + int32x2_t _a = vreinterpret_s32_bf16 (a); + return vadd_s32 (_a, b); +} + +int64x1_t test_vreinterpret_s64_bf16 (bfloat16x4_t a, int64x1_t b) +{ + int64x1_t _a = vreinterpret_s64_bf16 (a); + return vrshl_s64 (_a, b); +} + +uint8x8_t test_vreinterpret_u8_bf16 (bfloat16x4_t a, uint8x8_t b) +{ + uint8x8_t _a = vreinterpret_u8_bf16 (a); + return vadd_u8 (_a, b); +} + +uint16x4_t test_vreinterpret_u16_bf16 (bfloat16x4_t a, uint16x4_t b) +{ + uint16x4_t _a = vreinterpret_u16_bf16 (a); + return vadd_u16 (_a, b); +} + +uint32x2_t test_vreinterpret_u32_bf16 (bfloat16x4_t a, uint32x2_t b) +{ + uint32x2_t _a = vreinterpret_u32_bf16 (a); + return vadd_u32 (_a, b); +} + +uint64x1_t test_vreinterpret_u64_bf16 (bfloat16x4_t a, int64x1_t b) +{ + uint64x1_t _a = vreinterpret_u64_bf16 (a); + return vrshl_u64 (_a, b); +} + +poly8x8_t test_vreinterpret_p8_bf16 (bfloat16x4_t a, poly8x8_t b) +{ + poly8x8_t _a = vreinterpret_p8_bf16 (a); + return vzip1_p8 (_a, b); +} + +poly16x4_t test_vreinterpret_p16_bf16 (bfloat16x4_t a, poly16x4_t b) +{ + poly16x4_t _a = vreinterpret_p16_bf16 (a); + return vzip1_p16 (_a, b); +} + +poly64x1_t test_vreinterpret_p64_bf16 (bfloat16x4_t a, poly64x1_t b) +{ + poly64x1_t _a = vreinterpret_p64_bf16 (a); + return vsli_n_p64 (_a, b, 3); +} + +float32x2_t test_vreinterpret_f32_bf16 (bfloat16x4_t a, float32x2_t b) +{ + float32x2_t _a = vreinterpret_f32_bf16 (a); + return vsub_f32 (_a, b); +} + +float64x1_t test_vreinterpret_f64_bf16 (bfloat16x4_t a, float64x1_t b) +{ + float64x1_t _a = vreinterpret_f64_bf16 (a); + return vsub_f64 (_a, b); +} + +int8x16_t test_vreinterpretq_s8_bf16 (bfloat16x8_t a, int8x16_t b) +{ + int8x16_t _a = vreinterpretq_s8_bf16 (a); + return vaddq_s8 (_a, b); +} + +int16x8_t test_vreinterpretq_s16_bf16 (bfloat16x8_t a, int16x8_t b) +{ + int16x8_t _a = vreinterpretq_s16_bf16 (a); + return vaddq_s16 (_a, b); +} + +int32x4_t test_vreinterpretq_s32_bf16 (bfloat16x8_t a, int32x4_t b) +{ + int32x4_t _a = vreinterpretq_s32_bf16 (a); + return vaddq_s32 (_a, b); +} + +int64x2_t test_vreinterpretq_s64_bf16 (bfloat16x8_t a, int64x2_t b) +{ + int64x2_t _a = vreinterpretq_s64_bf16 (a); + return vaddq_s64 (_a, b); +} + +uint8x16_t test_vreinterpretq_u8_bf16 (bfloat16x8_t a, uint8x16_t b) +{ + uint8x16_t _a = vreinterpretq_u8_bf16 (a); + return vaddq_u8 (_a, b); +} + +uint16x8_t test_vreinterpretq_u16_bf16 (bfloat16x8_t a, uint16x8_t b) +{ + uint16x8_t _a = vreinterpretq_u16_bf16 (a); + return vaddq_u16 (_a, b); +} + +uint32x4_t test_vreinterpretq_u32_bf16 (bfloat16x8_t a, uint32x4_t b) +{ + uint32x4_t _a = vreinterpretq_u32_bf16 (a); + return vaddq_u32 (_a, b); +} + +uint64x2_t test_vreinterpretq_u64_bf16 (bfloat16x8_t a, uint64x2_t b) +{ + uint64x2_t _a = vreinterpretq_u64_bf16 (a); + return vaddq_u64 (_a, b); +} + +poly8x16_t test_vreinterpretq_p8_bf16 (bfloat16x8_t a, poly8x16_t b) +{ + poly8x16_t _a = vreinterpretq_p8_bf16 (a); + return vzip1q_p8 (_a, b); +} + +poly16x8_t test_vreinterpretq_p16_bf16 (bfloat16x8_t a, poly16x8_t b) +{ + poly16x8_t _a = vreinterpretq_p16_bf16 (a); + return vzip1q_p16 (_a, b); +} + +poly64x2_t test_vreinterpretq_p64_bf16 (bfloat16x8_t a, poly64x2_t b) +{ + poly64x2_t _a = vreinterpretq_p64_bf16 (a); + return vsliq_n_p64 (_a, b, 3); +} + +poly128_t test_vreinterpretq_p128_bf16 (bfloat16x8_t a, poly16x8_t b) +{ + poly128_t _a = vreinterpretq_p128_bf16 (a); + return _a; +} + +float32x4_t test_vreinterpretq_f32_bf16 (bfloat16x8_t a, float32x4_t b) +{ + float32x4_t _a = vreinterpretq_f32_bf16 (a); + return vsubq_f32 (_a, b); +} + +float64x2_t test_vreinterpretq_f64_bf16 (bfloat16x8_t a, float64x2_t b) +{ + float64x2_t _a = vreinterpretq_f64_bf16 (a); + return vsubq_f64 (_a, b); +} + +float16x4_t test_vreinterpret_f16_bf16 (bfloat16x4_t a) +{ + return vreinterpret_f16_bf16 (a); +} + +float16x8_t test_vreinterpretq_f16_bf16 (bfloat16x8_t a) +{ + return vreinterpretq_f16_bf16 (a); +} + +/* { dg-final { scan-assembler-times {add\tv[0-9]+.2s, v[0-9]+.2s, v[0-9]+.2s} 2 } } */ +/* { dg-final { scan-assembler-times {add\tv[0-9]+.4h, v[0-9]+.4h, v[0-9]+.4h} 2 } } */ +/* { dg-final { scan-assembler-times {add\tv[0-9]+.8b, v[0-9]+.8b, v[0-9]+.8b} 2 } } */ + +/* { dg-final { scan-assembler-times {add\tv[0-9]+.4s, v[0-9]+.4s, v[0-9]+.4s} 2 } } */ +/* { dg-final { scan-assembler-times {add\tv[0-9]+.8h, v[0-9]+.8h, v[0-9]+.8h} 2 } } */ +/* { dg-final { scan-assembler-times {add\tv[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b} 2 } } */ + +/* { dg-final { scan-assembler {fsub\tv[0-9]+.2s, v[0-9]+.2s, v[0-9]+.2s} } } */ +/* { dg-final { scan-assembler {fsub\tv[0-9]+.4s, v[0-9]+.4s, v[0-9]+.4s} } } */ +/* { dg-final { scan-assembler {fsub\tv[0-9]+.2d, v[0-9]+.2d, v[0-9]+.2d} } } */ +/* { dg-final { scan-assembler {fsub\td[0-9]+, d[0-9]+, d[0-9]+} } } */ + +/* { dg-final { scan-assembler {zip1\tv[0-9]+.8b, v[0-9]+.8b, v[0-9]+.8b} } } */ +/* { dg-final { scan-assembler {zip1\tv[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b} } } */ +/* { dg-final { scan-assembler {zip1\tv[0-9]+.4h, v[0-9]+.4h, v[0-9]+.4h} } } */ +/* { dg-final { scan-assembler {zip1\tv[0-9]+.8h, v[0-9]+.8h, v[0-9]+.8h} } } */ + +/* { dg-final { scan-assembler {sli\tv[0-9]+.2d, v[0-9]+.2d, 3} } } */ +/* { dg-final { scan-assembler {sli\td[0-9]+, d[0-9]+, 3} } } */ + +/* { dg-final { scan-assembler {urshl\td[0-9]+, d[0-9]+, d[0-9]+} } } */ +/* { dg-final { scan-assembler {srshl\td[0-9]+, d[0-9]+, d[0-9]+} } } */ -- 2.7.4