From f2d131645114f14bd91a60107c941287370650ea Mon Sep 17 00:00:00 2001 From: Tamar Christina Date: Mon, 7 Feb 2022 12:55:12 +0000 Subject: [PATCH] AArch32: correct usdot-product RTL patterns. There was a bug in the ACLE specication for dot product which has now been fixed[1]. This means some intrinsics were missing and are added by this patch. Bootstrapped and regtested on arm-none-linux-gnueabihf and no issues. Ok for master? [1] https://github.com/ARM-software/acle/releases/tag/r2021Q3 gcc/ChangeLog: * config/arm/arm_neon.h (vusdotq_s32, vusdot_laneq_s32, vusdotq_laneq_s32, vsudot_laneq_s32, vsudotq_laneq_s32): New * config/arm/arm_neon_builtins.def (usdot): Add V16QI. (usdot_laneq, sudot_laneq): New. * config/arm/neon.md (neon_dot_laneq): New. (neon_dot_lane): Remote unneeded code. gcc/testsuite/ChangeLog: * gcc.target/arm/simd/vdot-2-1.c: Add new tests. * gcc.target/arm/simd/vdot-2-2.c: Likewise and fix output. --- gcc/config/arm/arm_neon.h | 39 ++++++++++++++++++ gcc/config/arm/arm_neon_builtins.def | 4 +- gcc/config/arm/neon.md | 28 ++++++++++++- gcc/testsuite/gcc.target/arm/simd/vdot-2-1.c | 59 ++++++++++++++++++++++++++- gcc/testsuite/gcc.target/arm/simd/vdot-2-2.c | 60 +++++++++++++++++++++++++++- 5 files changed, 185 insertions(+), 5 deletions(-) diff --git a/gcc/config/arm/arm_neon.h b/gcc/config/arm/arm_neon.h index fdfea33..b30d04c 100644 --- a/gcc/config/arm/arm_neon.h +++ b/gcc/config/arm/arm_neon.h @@ -18930,6 +18930,13 @@ vusdot_s32 (int32x2_t __r, uint8x8_t __a, int8x8_t __b) return __builtin_neon_usdotv8qi_ssus (__r, __a, __b); } +__extension__ extern __inline int32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vusdotq_s32 (int32x4_t __r, uint8x16_t __a, int8x16_t __b) +{ + return __builtin_neon_usdotv16qi_ssus (__r, __a, __b); +} + __extension__ extern __inline int32x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vusdot_lane_s32 (int32x2_t __r, uint8x8_t __a, @@ -18962,6 +18969,38 @@ vsudotq_lane_s32 (int32x4_t __r, int8x16_t __a, return __builtin_neon_sudot_lanev16qi_sssus (__r, __a, __b, __index); } +__extension__ extern __inline int32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vusdot_laneq_s32 (int32x2_t __r, uint8x8_t __a, + int8x16_t __b, const int __index) +{ + return __builtin_neon_usdot_laneqv8qi_ssuss (__r, __a, __b, __index); +} + +__extension__ extern __inline int32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vusdotq_laneq_s32 (int32x4_t __r, uint8x16_t __a, + int8x16_t __b, const int __index) +{ + return __builtin_neon_usdot_laneqv16qi_ssuss (__r, __a, __b, __index); +} + +__extension__ extern __inline int32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vsudot_laneq_s32 (int32x2_t __r, int8x8_t __a, + uint8x16_t __b, const int __index) +{ + return __builtin_neon_sudot_laneqv8qi_sssus (__r, __a, __b, __index); +} + +__extension__ extern __inline int32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vsudotq_laneq_s32 (int32x4_t __r, int8x16_t __a, + uint8x16_t __b, const int __index) +{ + return __builtin_neon_sudot_laneqv16qi_sssus (__r, __a, __b, __index); +} + #pragma GCC pop_options #pragma GCC pop_options diff --git a/gcc/config/arm/arm_neon_builtins.def b/gcc/config/arm/arm_neon_builtins.def index c29ae3a..445b2bf 100644 --- a/gcc/config/arm/arm_neon_builtins.def +++ b/gcc/config/arm/arm_neon_builtins.def @@ -345,9 +345,11 @@ VAR2 (UMAC_LANE, udot_lane, v8qi, v16qi) VAR2 (MAC_LANE, sdot_laneq, v8qi, v16qi) VAR2 (UMAC_LANE, udot_laneq, v8qi, v16qi) -VAR1 (USTERNOP, usdot, v8qi) +VAR2 (USTERNOP, usdot, v8qi, v16qi) VAR2 (USMAC_LANE_QUADTUP, usdot_lane, v8qi, v16qi) VAR2 (SUMAC_LANE_QUADTUP, sudot_lane, v8qi, v16qi) +VAR2 (USMAC_LANE_QUADTUP, usdot_laneq, v8qi, v16qi) +VAR2 (SUMAC_LANE_QUADTUP, sudot_laneq, v8qi, v16qi) VAR4 (BINOP, vcadd90, v4hf, v2sf, v8hf, v4sf) VAR4 (BINOP, vcadd270, v4hf, v2sf, v8hf, v4sf) diff --git a/gcc/config/arm/neon.md b/gcc/config/arm/neon.md index 4a8987b..2b9a3de 100644 --- a/gcc/config/arm/neon.md +++ b/gcc/config/arm/neon.md @@ -2977,9 +2977,33 @@ DOTPROD_I8MM) (match_operand:VCVTI 1 "register_operand" "0")))] "TARGET_I8MM" + "vdot.\\t%0, %2, %P3[%c4]" + [(set_attr "type" "neon_dot")] +) + +;; These instructions map to the __builtins for the Dot Product +;; indexed operations in the v8.6 I8MM extension. +(define_insn "neon_dot_laneq" + [(set (match_operand:VCVTI 0 "register_operand" "=w") + (plus:VCVTI + (unspec:VCVTI [(match_operand: 2 "register_operand" "w") + (match_operand:V16QI 3 "register_operand" "t") + (match_operand:SI 4 "immediate_operand" "i")] + DOTPROD_I8MM) + (match_operand:VCVTI 1 "register_operand" "0")))] + "TARGET_I8MM" { - operands[4] = GEN_INT (INTVAL (operands[4])); - return "vdot.\\t%0, %2, %P3[%c4]"; + int lane = INTVAL (operands[4]); + if (lane > GET_MODE_NUNITS (V2SImode) - 1) + { + operands[4] = GEN_INT (lane - GET_MODE_NUNITS (V2SImode)); + return "vdot.\\t%0, %2, %f3[%c4]"; + } + else + { + operands[4] = GEN_INT (lane); + return "vdot.\\t%0, %2, %e3[%c4]"; + } } [(set_attr "type" "neon_dot")] ) diff --git a/gcc/testsuite/gcc.target/arm/simd/vdot-2-1.c b/gcc/testsuite/gcc.target/arm/simd/vdot-2-1.c index 88b80cf..35d713f 100644 --- a/gcc/testsuite/gcc.target/arm/simd/vdot-2-1.c +++ b/gcc/testsuite/gcc.target/arm/simd/vdot-2-1.c @@ -2,7 +2,7 @@ /* { dg-require-effective-target arm_hard_ok } */ /* { dg-require-effective-target arm_v8_2a_i8mm_ok } */ /* { dg-add-options arm_v8_2a_i8mm } */ -/* { dg-additional-options "-O -save-temps -mfloat-abi=hard" } */ +/* { dg-additional-options "-O -save-temps -mfloat-abi=hard -mfpu=auto" } */ /* { dg-final { check-function-bodies "**" "" } } */ #include @@ -21,6 +21,17 @@ int32x2_t usfoo (int32x2_t r, uint8x8_t x, int8x8_t y) } /* +**usfooq: +** ... +** vusdot\.s8 q0, q1, q2 +** bx lr +*/ +int32x4_t usfooq (int32x4_t r, uint8x16_t x, int8x16_t y) +{ + return vusdotq_s32 (r, x, y); +} + +/* **usfoo_lane: ** ... ** vusdot\.s8 d0, d1, d2\[0\] @@ -67,6 +78,52 @@ int32x4_t sfooq_lane (int32x4_t r, int8x16_t x, uint8x8_t y) } /* +**usfoo_laneq: +** ... +** vusdot\.s8 d0, d1, d3\[0\] +** bx lr +*/ +int32x2_t usfoo_laneq (int32x2_t r, uint8x8_t x, int8x16_t y) +{ + return vusdot_laneq_s32 (r, x, y, 2); +} + +/* +**usfooq_laneq: +** ... +** vusdot\.s8 q0, q1, d5\[1\] +** bx lr +*/ +int32x4_t usfooq_laneq (int32x4_t r, uint8x16_t x, int8x16_t y) +{ + return vusdotq_laneq_s32 (r, x, y, 3); +} + +/* Signed-Unsigned Dot Product instructions. */ + +/* +**sfoo_laneq: +** ... +** vsudot\.u8 d0, d1, d3\[0\] +** bx lr +*/ +int32x2_t sfoo_laneq (int32x2_t r, int8x8_t x, uint8x16_t y) +{ + return vsudot_laneq_s32 (r, x, y, 2); +} + +/* +**sfooq_laneq: +** ... +** vsudot\.u8 q0, q1, d5\[1\] +** bx lr +*/ +int32x4_t sfooq_laneq (int32x4_t r, int8x16_t x, uint8x16_t y) +{ + return vsudotq_laneq_s32 (r, x, y, 3); +} + +/* **usfoo_untied: ** ... ** vusdot\.s8 d1, d2, d3 diff --git a/gcc/testsuite/gcc.target/arm/simd/vdot-2-2.c b/gcc/testsuite/gcc.target/arm/simd/vdot-2-2.c index 1c74718..c57dd42 100644 --- a/gcc/testsuite/gcc.target/arm/simd/vdot-2-2.c +++ b/gcc/testsuite/gcc.target/arm/simd/vdot-2-2.c @@ -2,7 +2,7 @@ /* { dg-require-effective-target arm_hard_ok } */ /* { dg-require-effective-target arm_v8_2a_i8mm_ok } */ /* { dg-add-options arm_v8_2a_i8mm } */ -/* { dg-additional-options "-O -save-temps -mbig-endian -mfloat-abi=hard" } */ +/* { dg-additional-options "-O -save-temps -mfloat-abi=hard -mbig-endian -mfpu=auto" } */ /* { dg-final { check-function-bodies "**" "" } } */ #include @@ -21,6 +21,17 @@ int32x2_t usfoo (int32x2_t r, uint8x8_t x, int8x8_t y) } /* +**usfooq: +** ... +** vusdot\.s8 q0, q1, q2 +** bx lr +*/ +int32x4_t usfooq (int32x4_t r, uint8x16_t x, int8x16_t y) +{ + return vusdotq_s32 (r, x, y); +} + +/* **usfoo_lane: ** ... ** vusdot\.s8 d0, d1, d2\[0\] @@ -67,6 +78,52 @@ int32x4_t sfooq_lane (int32x4_t r, int8x16_t x, uint8x8_t y) } /* +**usfoo_laneq: +** ... +** vusdot\.s8 d0, d1, d3\[0\] +** bx lr +*/ +int32x2_t usfoo_laneq (int32x2_t r, uint8x8_t x, int8x16_t y) +{ + return vusdot_laneq_s32 (r, x, y, 2); +} + +/* +**usfooq_laneq: +** ... +** vusdot\.s8 q0, q1, d5\[1\] +** bx lr +*/ +int32x4_t usfooq_laneq (int32x4_t r, uint8x16_t x, int8x16_t y) +{ + return vusdotq_laneq_s32 (r, x, y, 3); +} + +/* Signed-Unsigned Dot Product instructions. */ + +/* +**sfoo_laneq: +** ... +** vsudot\.u8 d0, d1, d3\[0\] +** bx lr +*/ +int32x2_t sfoo_laneq (int32x2_t r, int8x8_t x, uint8x16_t y) +{ + return vsudot_laneq_s32 (r, x, y, 2); +} + +/* +**sfooq_laneq: +** ... +** vsudot\.u8 q0, q1, d5\[1\] +** bx lr +*/ +int32x4_t sfooq_laneq (int32x4_t r, int8x16_t x, uint8x16_t y) +{ + return vsudotq_laneq_s32 (r, x, y, 3); +} + +/* **usfoo_untied: ** ... ** vusdot\.s8 d1, d2, d3 @@ -89,3 +146,4 @@ int32x2_t usfoo_lane_untied (int32x2_t unused, int32x2_t r, uint8x8_t x, int8x8_ { return vusdot_lane_s32 (r, x, y, 0); } + -- 2.7.4