From f275d73a57f1e5a07fbd4978f4b4457a5eaa1e39 Mon Sep 17 00:00:00 2001 From: Stam Markianos-Wright Date: Thu, 16 Jan 2020 14:47:30 +0000 Subject: [PATCH] [GCC][PATCH][AArch64]Add ACLE intrinsics for bfdot for ARMv8.6 Extension 2020-01-16 Stam Markianos-Wright * config/aarch64/aarch64-simd-builtins.def (aarch64_bfdot, aarch64_bfdot_lane, aarch64_bfdot_laneq): New. * config/aarch64/aarch64-simd.md (aarch64_bfdot, aarch64_bfdot_lane, aarch64_bfdot_laneq): New. * config/aarch64/arm_bf16.h (vbfdot_f32, vbfdotq_f32, vbfdot_lane_f32, vbfdotq_lane_f32, vbfdot_laneq_f32, vbfdotq_laneq_f32): New. * config/aarch64/iterators.md (UNSPEC_BFDOT, Vbfdottype, VBFMLA_W, VBF): New. (isquadop): Add V4BF, V8BF. 2020-01-16 Stam Markianos-Wright * gcc.target/aarch64/advsimd-intrinsics/bfdot-1.c: New. * gcc.target/aarch64/advsimd-intrinsics/bfdot-2.c: New. * gcc.target/aarch64/advsimd-intrinsics/bfdot-3.c: New. --- gcc/ChangeLog | 13 ++++ gcc/config/aarch64/aarch64-simd-builtins.def | 5 ++ gcc/config/aarch64/aarch64-simd.md | 32 ++++++++ gcc/config/aarch64/arm_neon.h | 51 ++++++++++++ gcc/config/aarch64/iterators.md | 12 ++- gcc/testsuite/ChangeLog | 14 +++- .../aarch64/advsimd-intrinsics/bfdot-1.c | 91 ++++++++++++++++++++++ .../aarch64/advsimd-intrinsics/bfdot-2.c | 91 ++++++++++++++++++++++ .../aarch64/advsimd-intrinsics/bfdot-3.c | 28 +++++++ 9 files changed, 332 insertions(+), 5 deletions(-) create mode 100755 gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/bfdot-1.c create mode 100755 gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/bfdot-2.c create mode 100755 gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/bfdot-3.c diff --git a/gcc/ChangeLog b/gcc/ChangeLog index 49dcecb..d11b8d3 100644 --- a/gcc/ChangeLog +++ b/gcc/ChangeLog @@ -1,5 +1,18 @@ 2020-01-16 Stam Markianos-Wright + * config/aarch64/aarch64-simd-builtins.def (aarch64_bfdot, + aarch64_bfdot_lane, aarch64_bfdot_laneq): New. + * config/aarch64/aarch64-simd.md (aarch64_bfdot, aarch64_bfdot_lane, + aarch64_bfdot_laneq): New. + * config/aarch64/arm_bf16.h (vbfdot_f32, vbfdotq_f32, + vbfdot_lane_f32, vbfdotq_lane_f32, vbfdot_laneq_f32, + vbfdotq_laneq_f32): New. + * config/aarch64/iterators.md (UNSPEC_BFDOT, Vbfdottype, + VBFMLA_W, VBF): New. + (isquadop): Add V4BF, V8BF. + +2020-01-16 Stam Markianos-Wright + * config/aarch64/aarch64-builtins.c: (enum aarch64_type_qualifiers): New qualifier_lane_quadtup_index, TYPES_TERNOP_SSUS, TYPES_QUADOPSSUS_LANE_QUADTUP, TYPES_QUADOPSSSU_LANE_QUADTUP. diff --git a/gcc/config/aarch64/aarch64-simd-builtins.def b/gcc/config/aarch64/aarch64-simd-builtins.def index 4744dd1..a118f4f 100644 --- a/gcc/config/aarch64/aarch64-simd-builtins.def +++ b/gcc/config/aarch64/aarch64-simd-builtins.def @@ -687,3 +687,8 @@ BUILTIN_VSFDF (UNOP, frint32x, 0) BUILTIN_VSFDF (UNOP, frint64z, 0) BUILTIN_VSFDF (UNOP, frint64x, 0) + + /* Implemented by aarch64_bfdot{_lane}{q}. */ + VAR2 (TERNOP, bfdot, 0, v2sf, v4sf) + VAR2 (QUADOP_LANE_PAIR, bfdot_lane, 0, v2sf, v4sf) + VAR2 (QUADOP_LANE_PAIR, bfdot_laneq, 0, v2sf, v4sf) diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md index 9e56e8c..97f46f9 100644 --- a/gcc/config/aarch64/aarch64-simd.md +++ b/gcc/config/aarch64/aarch64-simd.md @@ -7059,3 +7059,35 @@ "xtn\t%0., %1." [(set_attr "type" "neon_shift_imm_narrow_q")] ) + +(define_insn "aarch64_bfdot" + [(set (match_operand:VDQSF 0 "register_operand" "=w") + (plus:VDQSF + (unspec:VDQSF + [(match_operand: 2 "register_operand" "w") + (match_operand: 3 "register_operand" "w")] + UNSPEC_BFDOT) + (match_operand:VDQSF 1 "register_operand" "0")))] + "TARGET_BF16_SIMD" + "bfdot\t%0., %2., %3." + [(set_attr "type" "neon_dot")] +) + +(define_insn "aarch64_bfdot_lane" + [(set (match_operand:VDQSF 0 "register_operand" "=w") + (plus:VDQSF + (unspec:VDQSF + [(match_operand: 2 "register_operand" "w") + (match_operand:VBF 3 "register_operand" "w") + (match_operand:SI 4 "const_int_operand" "n")] + UNSPEC_BFDOT) + (match_operand:VDQSF 1 "register_operand" "0")))] + "TARGET_BF16_SIMD" +{ + int nunits = GET_MODE_NUNITS (mode).to_constant (); + int lane = INTVAL (operands[4]); + operands[4] = gen_int_mode (ENDIAN_LANE_N (nunits / 2, lane), SImode); + return "bfdot\t%0., %2., %3.2h[%4]"; +} + [(set_attr "type" "neon_dot")] +) diff --git a/gcc/config/aarch64/arm_neon.h b/gcc/config/aarch64/arm_neon.h index c962140..7f05c3f 100644 --- a/gcc/config/aarch64/arm_neon.h +++ b/gcc/config/aarch64/arm_neon.h @@ -34611,6 +34611,57 @@ vrnd64xq_f64 (float64x2_t __a) #include "arm_bf16.h" +#pragma GCC push_options +#pragma GCC target ("arch=armv8.2-a+bf16") + +__extension__ extern __inline float32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vbfdot_f32 (float32x2_t __r, bfloat16x4_t __a, bfloat16x4_t __b) +{ + return __builtin_aarch64_bfdotv2sf (__r, __a, __b); +} + +__extension__ extern __inline float32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vbfdotq_f32 (float32x4_t __r, bfloat16x8_t __a, bfloat16x8_t __b) +{ + return __builtin_aarch64_bfdotv4sf (__r, __a, __b); +} + +__extension__ extern __inline float32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vbfdot_lane_f32 (float32x2_t __r, bfloat16x4_t __a, bfloat16x4_t __b, + const int __index) +{ + return __builtin_aarch64_bfdot_lanev2sf (__r, __a, __b, __index); +} + +__extension__ extern __inline float32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vbfdotq_lane_f32 (float32x4_t __r, bfloat16x8_t __a, bfloat16x4_t __b, + const int __index) +{ + return __builtin_aarch64_bfdot_lanev4sf (__r, __a, __b, __index); +} + +__extension__ extern __inline float32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vbfdot_laneq_f32 (float32x2_t __r, bfloat16x4_t __a, bfloat16x8_t __b, + const int __index) +{ + return __builtin_aarch64_bfdot_laneqv2sf (__r, __a, __b, __index); +} + +__extension__ extern __inline float32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vbfdotq_laneq_f32 (float32x4_t __r, bfloat16x8_t __a, bfloat16x8_t __b, + const int __index) +{ + return __builtin_aarch64_bfdot_laneqv4sf (__r, __a, __b, __index); +} + +#pragma GCC pop_options + /* AdvSIMD 8-bit Integer Matrix Multiply (I8MM) intrinsics. */ #pragma GCC push_options diff --git a/gcc/config/aarch64/iterators.md b/gcc/config/aarch64/iterators.md index 83720d9..661c3e7 100644 --- a/gcc/config/aarch64/iterators.md +++ b/gcc/config/aarch64/iterators.md @@ -122,6 +122,9 @@ ;; Quad vector with only 2 element modes. (define_mode_iterator VQ_2E [V2DI V2DF]) +;; BFmode vector modes. +(define_mode_iterator VBF [V4BF V8BF]) + ;; This mode iterator allows :P to be used for patterns that operate on ;; addresses in different modes. In LP64, only DI will match, while in ;; ILP32, either can match. @@ -801,6 +804,7 @@ UNSPEC_USUBWT ; Used in aarch64-sve2.md. UNSPEC_USDOT ; Used in aarch64-simd.md. UNSPEC_SUDOT ; Used in aarch64-simd.md. + UNSPEC_BFDOT ; Used in aarch64-simd.md. ]) ;; ------------------------------------------------------------------ @@ -1451,6 +1455,9 @@ ;; Register suffix for DOTPROD input types from the return type. (define_mode_attr Vdottype [(V2SI "8b") (V4SI "16b")]) +;; Register suffix for BFDOT input types from the return type. +(define_mode_attr Vbfdottype [(V2SF "4h") (V4SF "8h")]) + ;; Sum of lengths of instructions needed to move vector registers of a mode. (define_mode_attr insn_count [(OI "8") (CI "12") (XI "16")]) @@ -1461,11 +1468,14 @@ ;; Width of 2nd and 3rd arguments to fp16 vector multiply add/sub (define_mode_attr VFMLA_W [(V2SF "V4HF") (V4SF "V8HF")]) +;; Width of 2nd and 3rd arguments to bf16 vector multiply add/sub +(define_mode_attr VBFMLA_W [(V2SF "V4BF") (V4SF "V8BF")]) + (define_mode_attr VFMLA_SEL_W [(V2SF "V2HF") (V4SF "V4HF")]) (define_mode_attr f16quad [(V2SF "") (V4SF "q")]) -(define_mode_attr isquadop [(V8QI "") (V16QI "q")]) +(define_mode_attr isquadop [(V8QI "") (V16QI "q") (V4BF "") (V8BF "q")]) (define_code_attr f16mac [(plus "a") (minus "s")]) diff --git a/gcc/testsuite/ChangeLog b/gcc/testsuite/ChangeLog index 8b01aa0..e5963d2 100644 --- a/gcc/testsuite/ChangeLog +++ b/gcc/testsuite/ChangeLog @@ -1,9 +1,15 @@ 2020-01-16 Stam Markianos-Wright - * gcc.target/aarch64/advsimd-intrinsics/vdot-compile-3-1.c: New test. - * gcc.target/aarch64/advsimd-intrinsics/vdot-compile-3-2.c: New test. - * gcc.target/aarch64/advsimd-intrinsics/vdot-compile-3-3.c: New test. - * gcc.target/aarch64/advsimd-intrinsics/vdot-compile-3-4.c: New test. + * gcc.target/aarch64/advsimd-intrinsics/bfdot-1.c: New. + * gcc.target/aarch64/advsimd-intrinsics/bfdot-2.c: New. + * gcc.target/aarch64/advsimd-intrinsics/bfdot-3.c: New. + +2020-01-16 Stam Markianos-Wright + + * gcc.target/aarch64/advsimd-intrinsics/vdot-3-1.c: New test. + * gcc.target/aarch64/advsimd-intrinsics/vdot-3-2.c: New test. + * gcc.target/aarch64/advsimd-intrinsics/vdot-3-3.c: New test. + * gcc.target/aarch64/advsimd-intrinsics/vdot-3-4.c: New test. 2020-01-16 Andre Vieira diff --git a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/bfdot-1.c b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/bfdot-1.c new file mode 100755 index 0000000..ad51507 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/bfdot-1.c @@ -0,0 +1,91 @@ +/* { dg-do assemble { target { aarch64*-*-* } } } */ +/* { dg-require-effective-target arm_v8_2a_bf16_neon_ok } */ +/* { dg-add-options arm_v8_2a_bf16_neon } */ +/* { dg-additional-options "-save-temps" } */ +/* { dg-final { check-function-bodies "**" "" {-O[^0]} } } */ +/* { dg-skip-if "" { *-*-* } { "-fno-fat-lto-objects" } } */ + +#include + +/* +**ufoo: +** bfdot v0.2s, (v1.4h, v2.4h|v2.4h, v1.4h) +** ret +*/ +float32x2_t ufoo(float32x2_t r, bfloat16x4_t x, bfloat16x4_t y) +{ + return vbfdot_f32 (r, x, y); +} + +/* +**ufooq: +** bfdot v0.4s, (v1.8h, v2.8h|v2.8h, v1.8h) +** ret +*/ +float32x4_t ufooq(float32x4_t r, bfloat16x8_t x, bfloat16x8_t y) +{ + return vbfdotq_f32 (r, x, y); +} + +/* +**ufoo_lane: +** bfdot v0.2s, v1.4h, v2.2h\[0\] +** ret +*/ +float32x2_t ufoo_lane(float32x2_t r, bfloat16x4_t x, bfloat16x4_t y) +{ + return vbfdot_lane_f32 (r, x, y, 0); +} + +/* +**ufooq_laneq: +** bfdot v0.4s, v1.8h, v2.2h\[2\] +** ret +*/ +float32x4_t ufooq_laneq(float32x4_t r, bfloat16x8_t x, bfloat16x8_t y) +{ + return vbfdotq_laneq_f32 (r, x, y, 2); +} + +/* +**ufoo_laneq: +** bfdot v0.2s, v1.4h, v2.2h\[3\] +** ret +*/ +float32x2_t ufoo_laneq(float32x2_t r, bfloat16x4_t x, bfloat16x8_t y) +{ + return vbfdot_laneq_f32 (r, x, y, 3); +} + +/* +**ufooq_lane: +** bfdot v0.4s, v1.8h, v2.2h\[1\] +** ret +*/ +float32x4_t ufooq_lane(float32x4_t r, bfloat16x8_t x, bfloat16x4_t y) +{ + return vbfdotq_lane_f32 (r, x, y, 1); +} + +/* +**ufoo_untied: +** mov v0.8b, v1.8b +** bfdot v0.2s, (v2.4h, v3.4h|v3.4h, v2.4h) +** ret +*/ +float32x2_t ufoo_untied(float32x4_t unused, float32x2_t r, bfloat16x4_t x, bfloat16x4_t y) +{ + return vbfdot_f32 (r, x, y); +} + +/* +**ufooq_lane_untied: +** mov v0.16b, v1.16b +** bfdot v0.4s, v2.8h, v3.2h\[1\] +** ret +*/ +float32x4_t ufooq_lane_untied(float32x4_t unused, float32x4_t r, bfloat16x8_t x, bfloat16x4_t y) +{ + return vbfdotq_lane_f32 (r, x, y, 1); +} + diff --git a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/bfdot-2.c b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/bfdot-2.c new file mode 100755 index 0000000..58bdee5 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/bfdot-2.c @@ -0,0 +1,91 @@ +/* { dg-do assemble { target { aarch64*-*-* } } } */ +/* { dg-require-effective-target arm_v8_2a_bf16_neon_ok } */ +/* { dg-add-options arm_v8_2a_bf16_neon } */ +/* { dg-additional-options "-mbig-endian --save-temps" } */ +/* { dg-final { check-function-bodies "**" "" {-O[^0]} } } */ +/* { dg-skip-if "" { *-*-* } { "-fno-fat-lto-objects" } } */ + +#include + +/* +**ufoo: +** bfdot v0.2s, (v1.4h, v2.4h|v2.4h, v1.4h) +** ret +*/ +float32x2_t ufoo(float32x2_t r, bfloat16x4_t x, bfloat16x4_t y) +{ + return vbfdot_f32 (r, x, y); +} + +/* +**ufooq: +** bfdot v0.4s, (v1.8h, v2.8h|v2.8h, v1.8h) +** ret +*/ +float32x4_t ufooq(float32x4_t r, bfloat16x8_t x, bfloat16x8_t y) +{ + return vbfdotq_f32 (r, x, y); +} + +/* +**ufoo_lane: +** bfdot v0.2s, v1.4h, v2.2h\[0\] +** ret +*/ +float32x2_t ufoo_lane(float32x2_t r, bfloat16x4_t x, bfloat16x4_t y) +{ + return vbfdot_lane_f32 (r, x, y, 0); +} + +/* +**ufooq_laneq: +** bfdot v0.4s, v1.8h, v2.2h\[2\] +** ret +*/ +float32x4_t ufooq_laneq(float32x4_t r, bfloat16x8_t x, bfloat16x8_t y) +{ + return vbfdotq_laneq_f32 (r, x, y, 2); +} + +/* +**ufoo_laneq: +** bfdot v0.2s, v1.4h, v2.2h\[3\] +** ret +*/ +float32x2_t ufoo_laneq(float32x2_t r, bfloat16x4_t x, bfloat16x8_t y) +{ + return vbfdot_laneq_f32 (r, x, y, 3); +} + +/* +**ufooq_lane: +** bfdot v0.4s, v1.8h, v2.2h\[1\] +** ret +*/ +float32x4_t ufooq_lane(float32x4_t r, bfloat16x8_t x, bfloat16x4_t y) +{ + return vbfdotq_lane_f32 (r, x, y, 1); +} + +/* +**ufoo_untied: +** mov v0.8b, v1.8b +** bfdot v0.2s, (v2.4h, v3.4h|v3.4h, v2.4h) +** ret +*/ +float32x2_t ufoo_untied(float32x4_t unused, float32x2_t r, bfloat16x4_t x, bfloat16x4_t y) +{ + return vbfdot_f32 (r, x, y); +} + +/* +**ufooq_lane_untied: +** mov v0.16b, v1.16b +** bfdot v0.4s, v2.8h, v3.2h\[1\] +** ret +*/ +float32x4_t ufooq_lane_untied(float32x4_t unused, float32x4_t r, bfloat16x8_t x, bfloat16x4_t y) +{ + return vbfdotq_lane_f32 (r, x, y, 1); +} + diff --git a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/bfdot-3.c b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/bfdot-3.c new file mode 100755 index 0000000..6071262 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/bfdot-3.c @@ -0,0 +1,28 @@ +/* { dg-do assemble { target { aarch64*-*-* } } } */ +/* { dg-skip-if "" { *-*-* } { "-fno-fat-lto-objects" } } */ +/* { dg-require-effective-target arm_v8_2a_bf16_neon_ok } */ +/* { dg-add-options arm_v8_2a_bf16_neon } */ +/* { dg-additional-options "--save-temps" } */ + +#include + +float32x2_t ufoo_lane(float32x2_t r, bfloat16x4_t x, bfloat16x4_t y) +{ + return vbfdot_lane_f32 (r, x, y, -1); /* { dg-error {lane -1 out of range 0 - 1} "" { target *-*-* } 0 } */ +} + +float32x4_t ufooq_laneq(float32x4_t r, bfloat16x8_t x, bfloat16x8_t y) +{ + return vbfdotq_laneq_f32 (r, x, y, -1); /* { dg-error {lane -1 out of range 0 - 3} "" { target *-*-* } 0 } */ +} + +float32x2_t ufoo_laneq(float32x2_t r, bfloat16x4_t x, bfloat16x8_t y) +{ + return vbfdot_laneq_f32 (r, x, y, 4); /* { dg-error {lane 4 out of range 0 - 3} "" { target *-*-* } 0 } */ +} + +float32x4_t ufooq_lane(float32x4_t r, bfloat16x8_t x, bfloat16x4_t y) +{ + return vbfdotq_lane_f32 (r, x, y, 2); /* { dg-error {lane 2 out of range 0 - 1} "" { target *-*-* } 0 } */ +} + -- 2.7.4