From ff229375721d1763a18ec76403aa1215b2932fb3 Mon Sep 17 00:00:00 2001 From: Delia Burduv Date: Fri, 6 Mar 2020 10:32:20 +0000 Subject: [PATCH] ACLE intrinsics: BFloat16 store (vst{q}_bf16) intrinsics for AArch32 2020-03-06 Delia Burduv * config/arm/arm_neon.h (bfloat16x4x2_t): New typedef. (bfloat16x8x2_t): New typedef. (bfloat16x4x3_t): New typedef. (bfloat16x8x3_t): New typedef. (bfloat16x4x4_t): New typedef. (bfloat16x8x4_t): New typedef. (vst2_bf16): New. (vst2q_bf16): New. (vst3_bf16): New. (vst3q_bf16): New. (vst4_bf16): New. (vst4q_bf16): New. * config/arm/arm-builtins.c (v2bf_UP): Define. (VAR13): New. (arm_init_simd_builtin_types): Init Bfloat16x2_t eltype. * config/arm/arm-modes.def (V2BF): New mode. * config/arm/arm-simd-builtin-types.def (Bfloat16x2_t): New entry. * config/arm/arm_neon_builtins.def (vst2): Changed to VAR13 and added v4bf, v8bf (vst3): Changed to VAR13 and added v4bf, v8bf (vst4): Changed to VAR13 and added v4bf, v8bf * config/arm/iterators.md (VDXBF): New iterator. (VQ2BF): New iterator. *config/arm/neon.md (neon_vst2): Used new iterators. (neon_vst2): Used new iterators. (neon_vst3): Used new iterators. (neon_vst3): Used new iterators. (neon_vst3qa): Used new iterators. (neon_vst3qb): Used new iterators. (neon_vst4): Used new iterators. (neon_vst4): Used new iterators. (neon_vst4qa): Used new iterators. (neon_vst4qb): Used new iterators. * gcc.target/arm/simd/bf16_vstn_1.c: New test. --- gcc/ChangeLog | 37 +++++++++++ gcc/config/arm/arm-builtins.c | 5 ++ gcc/config/arm/arm-modes.def | 1 + gcc/config/arm/arm-simd-builtin-types.def | 1 + gcc/config/arm/arm_neon.h | 78 +++++++++++++++++++++++ gcc/config/arm/arm_neon_builtins.def | 12 ++-- gcc/config/arm/iterators.md | 6 ++ gcc/config/arm/neon.md | 20 +++--- gcc/testsuite/ChangeLog | 4 ++ gcc/testsuite/gcc.target/arm/simd/bf16_vstn_1.c | 84 +++++++++++++++++++++++++ 10 files changed, 232 insertions(+), 16 deletions(-) create mode 100644 gcc/testsuite/gcc.target/arm/simd/bf16_vstn_1.c diff --git a/gcc/ChangeLog b/gcc/ChangeLog index a5b04ab..6d2a35c 100644 --- a/gcc/ChangeLog +++ b/gcc/ChangeLog @@ -1,5 +1,42 @@ 2020-03-06 Delia Burduv + * config/arm/arm_neon.h (bfloat16x4x2_t): New typedef. + (bfloat16x8x2_t): New typedef. + (bfloat16x4x3_t): New typedef. + (bfloat16x8x3_t): New typedef. + (bfloat16x4x4_t): New typedef. + (bfloat16x8x4_t): New typedef. + (vst2_bf16): New. + (vst2q_bf16): New. + (vst3_bf16): New. + (vst3q_bf16): New. + (vst4_bf16): New. + (vst4q_bf16): New. + * config/arm/arm-builtins.c (v2bf_UP): Define. + (VAR13): New. + (arm_init_simd_builtin_types): Init Bfloat16x2_t eltype. + * config/arm/arm-modes.def (V2BF): New mode. + * config/arm/arm-simd-builtin-types.def + (Bfloat16x2_t): New entry. + * config/arm/arm_neon_builtins.def + (vst2): Changed to VAR13 and added v4bf, v8bf + (vst3): Changed to VAR13 and added v4bf, v8bf + (vst4): Changed to VAR13 and added v4bf, v8bf + * config/arm/iterators.md (VDXBF): New iterator. + (VQ2BF): New iterator. + *config/arm/neon.md (neon_vst2): Used new iterators. + (neon_vst2): Used new iterators. + (neon_vst3): Used new iterators. + (neon_vst3): Used new iterators. + (neon_vst3qa): Used new iterators. + (neon_vst3qb): Used new iterators. + (neon_vst4): Used new iterators. + (neon_vst4): Used new iterators. + (neon_vst4qa): Used new iterators. + (neon_vst4qb): Used new iterators. + +2020-03-06 Delia Burduv + * config/aarch64/aarch64-simd-builtins.def (bfcvtn): New built-in function. (bfcvtn_q): New built-in function. diff --git a/gcc/config/arm/arm-builtins.c b/gcc/config/arm/arm-builtins.c index 4d31405..e0561c5 100644 --- a/gcc/config/arm/arm-builtins.c +++ b/gcc/config/arm/arm-builtins.c @@ -342,6 +342,7 @@ arm_set_sat_qualifiers[SIMD_MAX_BUILTIN_ARGS] #define v4bf_UP E_V4BFmode #define v2si_UP E_V2SImode #define v2sf_UP E_V2SFmode +#define v2bf_UP E_V2BFmode #define di_UP E_DImode #define v16qi_UP E_V16QImode #define v8hi_UP E_V8HImode @@ -405,6 +406,9 @@ typedef struct { #define VAR12(T, N, A, B, C, D, E, F, G, H, I, J, K, L) \ VAR11 (T, N, A, B, C, D, E, F, G, H, I, J, K) \ VAR1 (T, N, L) +#define VAR13(T, N, A, B, C, D, E, F, G, H, I, J, K, L, M) \ + VAR12 (T, N, A, B, C, D, E, F, G, H, I, J, K, L) \ + VAR1 (T, N, M) /* The builtin data can be found in arm_neon_builtins.def, arm_vfp_builtins.def and arm_acle_builtins.def. The entries in arm_neon_builtins.def require @@ -1037,6 +1041,7 @@ arm_init_simd_builtin_types (void) arm_simd_types[Float32x4_t].eltype = float_type_node; /* Init Bfloat vector types with underlying __bf16 scalar type. */ + arm_simd_types[Bfloat16x2_t].eltype = arm_bf16_type_node; arm_simd_types[Bfloat16x4_t].eltype = arm_bf16_type_node; arm_simd_types[Bfloat16x8_t].eltype = arm_bf16_type_node; diff --git a/gcc/config/arm/arm-modes.def b/gcc/config/arm/arm-modes.def index ea92ef3..6e48223 100644 --- a/gcc/config/arm/arm-modes.def +++ b/gcc/config/arm/arm-modes.def @@ -80,6 +80,7 @@ VECTOR_MODE (FLOAT, HF, 2); /* V2HF */ FLOAT_MODE (BF, 2, 0); ADJUST_FLOAT_FORMAT (BF, &arm_bfloat_half_format); +VECTOR_MODE (FLOAT, BF, 2); /* V2BF. */ VECTOR_MODE (FLOAT, BF, 4); /* V4BF. */ VECTOR_MODE (FLOAT, BF, 8); /* V8BF. */ diff --git a/gcc/config/arm/arm-simd-builtin-types.def b/gcc/config/arm/arm-simd-builtin-types.def index ea3c9f9..e35bb76 100644 --- a/gcc/config/arm/arm-simd-builtin-types.def +++ b/gcc/config/arm/arm-simd-builtin-types.def @@ -48,5 +48,6 @@ ENTRY (Float16x8_t, V8HF, none, 128, float16, 19) ENTRY (Float32x4_t, V4SF, none, 128, float32, 19) + ENTRY (Bfloat16x2_t, V2BF, none, 32, bfloat16, 20) ENTRY (Bfloat16x4_t, V4BF, none, 64, bfloat16, 20) ENTRY (Bfloat16x8_t, V8BF, none, 128, bfloat16, 20) diff --git a/gcc/config/arm/arm_neon.h b/gcc/config/arm/arm_neon.h index 1974967..4ab79d5 100644 --- a/gcc/config/arm/arm_neon.h +++ b/gcc/config/arm/arm_neon.h @@ -19382,6 +19382,36 @@ vbfdotq_lane_f32 (float32x4_t __r, bfloat16x8_t __a, bfloat16x4_t __b, #pragma GCC push_options #pragma GCC target ("arch=armv8.2-a+bf16") +typedef struct bfloat16x4x2_t +{ + bfloat16x4_t val[2]; +} bfloat16x4x2_t; + +typedef struct bfloat16x8x2_t +{ + bfloat16x8_t val[2]; +} bfloat16x8x2_t; + +typedef struct bfloat16x4x3_t +{ + bfloat16x4_t val[3]; +} bfloat16x4x3_t; + +typedef struct bfloat16x8x3_t +{ + bfloat16x8_t val[3]; +} bfloat16x8x3_t; + +typedef struct bfloat16x4x4_t +{ + bfloat16x4_t val[4]; +} bfloat16x4x4_t; + +typedef struct bfloat16x8x4_t +{ + bfloat16x8_t val[4]; +} bfloat16x8x4_t; + __extension__ extern __inline float32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vcvt_f32_bf16 (bfloat16x4_t __a) @@ -19479,6 +19509,54 @@ vbfmlaltq_laneq_f32 (float32x4_t __r, bfloat16x8_t __a, bfloat16x8_t __b, return __builtin_neon_vfmat_laneqv8bf (__r, __a, __b, __index); } +__extension__ extern __inline void +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vst2_bf16 (bfloat16_t * __ptr, bfloat16x4x2_t __val) +{ + union { bfloat16x4x2_t __i; __builtin_neon_ti __o; } __bu = { __val }; + return __builtin_neon_vst2v4bf (__ptr, __bu.__o); +} + +__extension__ extern __inline void +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vst2q_bf16 (bfloat16_t * __ptr, bfloat16x8x2_t __val) +{ + union { bfloat16x8x2_t __i; __builtin_neon_oi __o; } __bu = { __val }; + return __builtin_neon_vst2v8bf (__ptr, __bu.__o); +} + +__extension__ extern __inline void +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vst3_bf16 (bfloat16_t * __ptr, bfloat16x4x3_t __val) +{ + union { bfloat16x4x3_t __i; __builtin_neon_ei __o; } __bu = { __val }; + return __builtin_neon_vst3v4bf (__ptr, __bu.__o); +} + +__extension__ extern __inline void +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vst3q_bf16 (bfloat16_t * __ptr, bfloat16x8x3_t __val) +{ + union { bfloat16x8x3_t __i; __builtin_neon_ci __o; } __bu = { __val }; + return __builtin_neon_vst3v8bf (__ptr, __bu.__o); +} + +__extension__ extern __inline void +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vst4_bf16 (bfloat16_t * __ptr, bfloat16x4x4_t __val) +{ + union { bfloat16x4x4_t __i; __builtin_neon_oi __o; } __bu = { __val }; + return __builtin_neon_vst4v4bf (__ptr, __bu.__o); +} + +__extension__ extern __inline void +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vst4q_bf16 (bfloat16_t * __ptr, bfloat16x8x4_t __val) +{ + union { bfloat16x8x4_t __i; __builtin_neon_xi __o; } __bu = { __val }; + return __builtin_neon_vst4v8bf (__ptr, __bu.__o); +} + #pragma GCC pop_options #ifdef __cplusplus diff --git a/gcc/config/arm/arm_neon_builtins.def b/gcc/config/arm/arm_neon_builtins.def index 38c8bb0..b73b3e5 100644 --- a/gcc/config/arm/arm_neon_builtins.def +++ b/gcc/config/arm/arm_neon_builtins.def @@ -325,8 +325,8 @@ VAR11 (LOAD1, vld2, VAR9 (LOAD1LANE, vld2_lane, v8qi, v4hi, v4hf, v2si, v2sf, v8hi, v8hf, v4si, v4sf) VAR6 (LOAD1, vld2_dup, v8qi, v4hi, v4hf, v2si, v2sf, di) -VAR11 (STORE1, vst2, - v8qi, v4hi, v4hf, v2si, v2sf, di, v16qi, v8hi, v8hf, v4si, v4sf) +VAR13 (STORE1, vst2, + v8qi, v4hi, v4hf, v4bf, v2si, v2sf, di, v16qi, v8hi, v8hf, v8bf, v4si, v4sf) VAR9 (STORE1LANE, vst2_lane, v8qi, v4hi, v4hf, v2si, v2sf, v8hi, v8hf, v4si, v4sf) VAR11 (LOAD1, vld3, @@ -334,8 +334,8 @@ VAR11 (LOAD1, vld3, VAR9 (LOAD1LANE, vld3_lane, v8qi, v4hi, v4hf, v2si, v2sf, v8hi, v8hf, v4si, v4sf) VAR6 (LOAD1, vld3_dup, v8qi, v4hi, v4hf, v2si, v2sf, di) -VAR11 (STORE1, vst3, - v8qi, v4hi, v4hf, v2si, v2sf, di, v16qi, v8hi, v8hf, v4si, v4sf) +VAR13 (STORE1, vst3, + v8qi, v4hi, v4hf, v4bf, v2si, v2sf, di, v16qi, v8hi, v8hf, v8bf, v4si, v4sf) VAR9 (STORE1LANE, vst3_lane, v8qi, v4hi, v4hf, v2si, v2sf, v8hi, v8hf, v4si, v4sf) VAR11 (LOAD1, vld4, @@ -343,8 +343,8 @@ VAR11 (LOAD1, vld4, VAR9 (LOAD1LANE, vld4_lane, v8qi, v4hi, v4hf, v2si, v2sf, v8hi, v8hf, v4si, v4sf) VAR6 (LOAD1, vld4_dup, v8qi, v4hi, v4hf, v2si, v2sf, di) -VAR11 (STORE1, vst4, - v8qi, v4hi, v4hf, v2si, v2sf, di, v16qi, v8hi, v8hf, v4si, v4sf) +VAR13 (STORE1, vst4, + v8qi, v4hi, v4hf, v4bf, v2si, v2sf, di, v16qi, v8hi, v8hf, v8bf, v4si, v4sf) VAR9 (STORE1LANE, vst4_lane, v8qi, v4hi, v4hf, v2si, v2sf, v8hi, v8hf, v4si, v4sf) VAR2 (TERNOP, sdot, v8qi, v16qi) diff --git a/gcc/config/arm/iterators.md b/gcc/config/arm/iterators.md index 8314001..8ff3c15 100644 --- a/gcc/config/arm/iterators.md +++ b/gcc/config/arm/iterators.md @@ -84,6 +84,9 @@ ;; Double-width vector modes plus 64-bit elements. (define_mode_iterator VDX [V8QI V4HI V4HF V4BF V2SI V2SF DI]) +;; Double-width vector modes plus 64-bit elements, including V4BF. +(define_mode_iterator VDXBF [V8QI V4HI V4HF (V4BF "TARGET_BF16_SIMD") V2SI V2SF DI]) + ;; Double-width vector modes plus 64-bit elements, ;; with V4BFmode added, suitable for moves. (define_mode_iterator VDXMOV [V8QI V4HI V4HF V4BF V2SI V2SF DI]) @@ -100,6 +103,9 @@ ;; Quad-width vector modes, including V8HF. (define_mode_iterator VQ2 [V16QI V8HI V8HF V4SI V4SF]) +;; Quad-width vector modes, including V8BF. +(define_mode_iterator VQ2BF [V16QI V8HI V8HF (V8BF "TARGET_BF16_SIMD") V4SI V4SF]) + ;; Quad-width vector modes with 16- or 32-bit elements (define_mode_iterator VQ_HS [V8HI V8HF V4SI V4SF]) diff --git a/gcc/config/arm/neon.md b/gcc/config/arm/neon.md index 75cc31a..250d578 100644 --- a/gcc/config/arm/neon.md +++ b/gcc/config/arm/neon.md @@ -5541,7 +5541,7 @@ if (BYTES_BIG_ENDIAN) (define_insn "neon_vst2" [(set (match_operand:TI 0 "neon_struct_operand" "=Um") (unspec:TI [(match_operand:TI 1 "s_register_operand" "w") - (unspec:VDX [(const_int 0)] UNSPEC_VSTRUCTDUMMY)] + (unspec:VDXBF [(const_int 0)] UNSPEC_VSTRUCTDUMMY)] UNSPEC_VST2))] "TARGET_NEON" { @@ -5566,7 +5566,7 @@ if (BYTES_BIG_ENDIAN) (define_insn "neon_vst2" [(set (match_operand:OI 0 "neon_struct_operand" "=Um") (unspec:OI [(match_operand:OI 1 "s_register_operand" "w") - (unspec:VQ2 [(const_int 0)] UNSPEC_VSTRUCTDUMMY)] + (unspec:VQ2BF [(const_int 0)] UNSPEC_VSTRUCTDUMMY)] UNSPEC_VST2))] "TARGET_NEON" "vst2.\t%h1, %A0" @@ -5810,7 +5810,7 @@ if (BYTES_BIG_ENDIAN) (define_insn "neon_vst3" [(set (match_operand:EI 0 "neon_struct_operand" "=Um") (unspec:EI [(match_operand:EI 1 "s_register_operand" "w") - (unspec:VDX [(const_int 0)] UNSPEC_VSTRUCTDUMMY)] + (unspec:VDXBF [(const_int 0)] UNSPEC_VSTRUCTDUMMY)] UNSPEC_VST3))] "TARGET_NEON" { @@ -5837,7 +5837,7 @@ if (BYTES_BIG_ENDIAN) (define_expand "neon_vst3" [(match_operand:CI 0 "neon_struct_operand") (match_operand:CI 1 "s_register_operand") - (unspec:VQ2 [(const_int 0)] UNSPEC_VSTRUCTDUMMY)] + (unspec:VQ2BF [(const_int 0)] UNSPEC_VSTRUCTDUMMY)] "TARGET_NEON" { rtx mem; @@ -5852,7 +5852,7 @@ if (BYTES_BIG_ENDIAN) (define_insn "neon_vst3qa" [(set (match_operand:EI 0 "neon_struct_operand" "=Um") (unspec:EI [(match_operand:CI 1 "s_register_operand" "w") - (unspec:VQ2 [(const_int 0)] UNSPEC_VSTRUCTDUMMY)] + (unspec:VQ2BF [(const_int 0)] UNSPEC_VSTRUCTDUMMY)] UNSPEC_VST3A))] "TARGET_NEON" { @@ -5871,7 +5871,7 @@ if (BYTES_BIG_ENDIAN) (define_insn "neon_vst3qb" [(set (match_operand:EI 0 "neon_struct_operand" "=Um") (unspec:EI [(match_operand:CI 1 "s_register_operand" "w") - (unspec:VQ2 [(const_int 0)] UNSPEC_VSTRUCTDUMMY)] + (unspec:VQ2BF [(const_int 0)] UNSPEC_VSTRUCTDUMMY)] UNSPEC_VST3B))] "TARGET_NEON" { @@ -6135,7 +6135,7 @@ if (BYTES_BIG_ENDIAN) (define_insn "neon_vst4" [(set (match_operand:OI 0 "neon_struct_operand" "=Um") (unspec:OI [(match_operand:OI 1 "s_register_operand" "w") - (unspec:VDX [(const_int 0)] UNSPEC_VSTRUCTDUMMY)] + (unspec:VDXBF [(const_int 0)] UNSPEC_VSTRUCTDUMMY)] UNSPEC_VST4))] "TARGET_NEON" { @@ -6163,7 +6163,7 @@ if (BYTES_BIG_ENDIAN) (define_expand "neon_vst4" [(match_operand:XI 0 "neon_struct_operand") (match_operand:XI 1 "s_register_operand") - (unspec:VQ2 [(const_int 0)] UNSPEC_VSTRUCTDUMMY)] + (unspec:VQ2BF [(const_int 0)] UNSPEC_VSTRUCTDUMMY)] "TARGET_NEON" { rtx mem; @@ -6178,7 +6178,7 @@ if (BYTES_BIG_ENDIAN) (define_insn "neon_vst4qa" [(set (match_operand:OI 0 "neon_struct_operand" "=Um") (unspec:OI [(match_operand:XI 1 "s_register_operand" "w") - (unspec:VQ2 [(const_int 0)] UNSPEC_VSTRUCTDUMMY)] + (unspec:VQ2BF [(const_int 0)] UNSPEC_VSTRUCTDUMMY)] UNSPEC_VST4A))] "TARGET_NEON" { @@ -6198,7 +6198,7 @@ if (BYTES_BIG_ENDIAN) (define_insn "neon_vst4qb" [(set (match_operand:OI 0 "neon_struct_operand" "=Um") (unspec:OI [(match_operand:XI 1 "s_register_operand" "w") - (unspec:VQ2 [(const_int 0)] UNSPEC_VSTRUCTDUMMY)] + (unspec:VQ2BF [(const_int 0)] UNSPEC_VSTRUCTDUMMY)] UNSPEC_VST4B))] "TARGET_NEON" { diff --git a/gcc/testsuite/ChangeLog b/gcc/testsuite/ChangeLog index 6c9206a..b7bbb47 100644 --- a/gcc/testsuite/ChangeLog +++ b/gcc/testsuite/ChangeLog @@ -1,3 +1,7 @@ +2020-03-06 Delia Burduv + + * gcc.target/arm/simd/bf16_vstn_1.c: New test. + 2020-03-06 Kito Cheng * gcc.target/riscv/pr93304.c: Update expected output and comment. diff --git a/gcc/testsuite/gcc.target/arm/simd/bf16_vstn_1.c b/gcc/testsuite/gcc.target/arm/simd/bf16_vstn_1.c new file mode 100644 index 0000000..2657b6f --- /dev/null +++ b/gcc/testsuite/gcc.target/arm/simd/bf16_vstn_1.c @@ -0,0 +1,84 @@ +/* { dg-do assemble } */ +/* { dg-require-effective-target arm_v8_2a_bf16_neon_ok } */ +/* { dg-add-options arm_v8_2a_bf16_neon } */ +/* { dg-additional-options "-save-temps -O2 -mfloat-abi=hard" } */ +/* { dg-final { check-function-bodies "**" "" } } */ + +#include "arm_neon.h" + +/* +**test_vst2_bf16: +** ... +** vst2.16 {d0-d1}, \[r0\] +** bx lr +*/ +void +test_vst2_bf16 (bfloat16_t *ptr, bfloat16x4x2_t val) +{ + vst2_bf16 (ptr, val); +} + +/* +**test_vst2q_bf16: +** ... +** vst2.16 {d0-d3}, \[r0\] +** bx lr +*/ +void +test_vst2q_bf16 (bfloat16_t *ptr, bfloat16x8x2_t val) +{ + vst2q_bf16 (ptr, val); +} + +/* +**test_vst3_bf16: +** ... +** vst3.16 {d0-d2}, \[r0\] +** bx lr +*/ +void +test_vst3_bf16 (bfloat16_t *ptr, bfloat16x4x3_t val) +{ + vst3_bf16 (ptr, val); +} + +/* +**test_vst3q_bf16: +** ... +** vst3.16 {d17, d19, d21}, \[r0\] +** bx lr +*/ +void +test_vst3q_bf16 (bfloat16_t *ptr, bfloat16x8x3_t val) +{ + vst3q_bf16 (ptr, val); +} + +/* +**test_vst4_bf16: +** ... +** vst4.16 {d0-d3}, \[r0\] +** bx lr +*/ +void +test_vst4_bf16 (bfloat16_t *ptr, bfloat16x4x4_t val) +{ + vst4_bf16 (ptr, val); +} + +/* +**test_vst4q_bf16: +** ... +** vst4.16 {d1, d3, d5, d7}, \[r0\] +** bx lr +*/ +void +test_vst4q_bf16 (bfloat16_t *ptr, bfloat16x8x4_t val) +{ + vst4q_bf16 (ptr, val); +} + +int main() +{ + return 0; +} -- 2.7.4