From eb2b36024c94bc32465777927092cdbdf2d95204 Mon Sep 17 00:00:00 2001 From: Jonathan Wright Date: Mon, 8 Feb 2021 16:50:30 +0000 Subject: [PATCH] aarch64: Use RTL builtins for vpaddq intrinsics Rewrite vpaddq Neon intrinsics to use RTL builtins rather than inline assembly code, allowing for better scheduling and optimization. gcc/ChangeLog: 2021-02-08 Jonathan Wright * config/aarch64/aarch64-simd-builtins.def: Use VDQ_I iterator for aarch64_addp builtin macro generator. * config/aarch64/aarch64-simd.md: Use VDQ_I iterator in aarch64_addp RTL pattern. * config/aarch64/arm_neon.h (vpaddq_s8): Use RTL builtin instead of inline asm. (vpaddq_s16): Likewise. (vpaddq_s32): Likewise. (vpaddq_s64): Likewise. (vpaddq_u8): Likewise. (vpaddq_u16): Likewise. (vpaddq_u32): Likewise. (vpaddq_u64): Likewise. --- gcc/config/aarch64/aarch64-simd-builtins.def | 2 +- gcc/config/aarch64/aarch64-simd.md | 8 ++-- gcc/config/aarch64/arm_neon.h | 60 ++++++---------------------- 3 files changed, 17 insertions(+), 53 deletions(-) diff --git a/gcc/config/aarch64/aarch64-simd-builtins.def b/gcc/config/aarch64/aarch64-simd-builtins.def index f79e716..92804e0 100644 --- a/gcc/config/aarch64/aarch64-simd-builtins.def +++ b/gcc/config/aarch64/aarch64-simd-builtins.def @@ -48,7 +48,7 @@ BUILTIN_VB (BINOP, pmul, 0, NONE) BUILTIN_VHSDF_HSDF (BINOP, fmulx, 0, FP) BUILTIN_VHSDF_DF (UNOP, sqrt, 2, FP) - BUILTIN_VD_BHSI (BINOP, addp, 0, NONE) + BUILTIN_VDQ_I (BINOP, addp, 0, NONE) VAR1 (UNOP, addp, 0, NONE, di) BUILTIN_VDQ_BHSI (UNOP, clrsb, 2, NONE) BUILTIN_VDQ_BHSI (UNOP, clz, 2, NONE) diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md index 5245cf0..60e11c6 100644 --- a/gcc/config/aarch64/aarch64-simd.md +++ b/gcc/config/aarch64/aarch64-simd.md @@ -6004,10 +6004,10 @@ ;; addp (define_insn "aarch64_addp" - [(set (match_operand:VD_BHSI 0 "register_operand" "=w") - (unspec:VD_BHSI - [(match_operand:VD_BHSI 1 "register_operand" "w") - (match_operand:VD_BHSI 2 "register_operand" "w")] + [(set (match_operand:VDQ_I 0 "register_operand" "=w") + (unspec:VDQ_I + [(match_operand:VDQ_I 1 "register_operand" "w") + (match_operand:VDQ_I 2 "register_operand" "w")] UNSPEC_ADDP))] "TARGET_SIMD" "addp\t%0, %1, %2" diff --git a/gcc/config/aarch64/arm_neon.h b/gcc/config/aarch64/arm_neon.h index 5fb2b3d..52f3714 100644 --- a/gcc/config/aarch64/arm_neon.h +++ b/gcc/config/aarch64/arm_neon.h @@ -8673,96 +8673,60 @@ __extension__ extern __inline int8x16_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vpaddq_s8 (int8x16_t __a, int8x16_t __b) { - int8x16_t __result; - __asm__ ("addp %0.16b,%1.16b,%2.16b" - : "=w"(__result) - : "w"(__a), "w"(__b) - : /* No clobbers */); - return __result; + return __builtin_aarch64_addpv16qi (__a, __b); } __extension__ extern __inline int16x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vpaddq_s16 (int16x8_t __a, int16x8_t __b) { - int16x8_t __result; - __asm__ ("addp %0.8h,%1.8h,%2.8h" - : "=w"(__result) - : "w"(__a), "w"(__b) - : /* No clobbers */); - return __result; + return __builtin_aarch64_addpv8hi (__a, __b); } __extension__ extern __inline int32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vpaddq_s32 (int32x4_t __a, int32x4_t __b) { - int32x4_t __result; - __asm__ ("addp %0.4s,%1.4s,%2.4s" - : "=w"(__result) - : "w"(__a), "w"(__b) - : /* No clobbers */); - return __result; + return __builtin_aarch64_addpv4si (__a, __b); } __extension__ extern __inline int64x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vpaddq_s64 (int64x2_t __a, int64x2_t __b) { - int64x2_t __result; - __asm__ ("addp %0.2d,%1.2d,%2.2d" - : "=w"(__result) - : "w"(__a), "w"(__b) - : /* No clobbers */); - return __result; + return __builtin_aarch64_addpv2di (__a, __b); } __extension__ extern __inline uint8x16_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vpaddq_u8 (uint8x16_t __a, uint8x16_t __b) { - uint8x16_t __result; - __asm__ ("addp %0.16b,%1.16b,%2.16b" - : "=w"(__result) - : "w"(__a), "w"(__b) - : /* No clobbers */); - return __result; + return (uint8x16_t) __builtin_aarch64_addpv16qi ((int8x16_t) __a, + (int8x16_t) __b); } __extension__ extern __inline uint16x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vpaddq_u16 (uint16x8_t __a, uint16x8_t __b) { - uint16x8_t __result; - __asm__ ("addp %0.8h,%1.8h,%2.8h" - : "=w"(__result) - : "w"(__a), "w"(__b) - : /* No clobbers */); - return __result; + return (uint16x8_t) __builtin_aarch64_addpv8hi ((int16x8_t) __a, + (int16x8_t) __b); } __extension__ extern __inline uint32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vpaddq_u32 (uint32x4_t __a, uint32x4_t __b) { - uint32x4_t __result; - __asm__ ("addp %0.4s,%1.4s,%2.4s" - : "=w"(__result) - : "w"(__a), "w"(__b) - : /* No clobbers */); - return __result; + return (uint32x4_t) __builtin_aarch64_addpv4si ((int32x4_t) __a, + (int32x4_t) __b); } __extension__ extern __inline uint64x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vpaddq_u64 (uint64x2_t __a, uint64x2_t __b) { - uint64x2_t __result; - __asm__ ("addp %0.2d,%1.2d,%2.2d" - : "=w"(__result) - : "w"(__a), "w"(__b) - : /* No clobbers */); - return __result; + return (uint64x2_t) __builtin_aarch64_addpv2di ((int64x2_t) __a, + (int64x2_t) __b); } __extension__ extern __inline int16x4_t -- 2.7.4