From 79db5945ade4480d7500dac6bea64f49331e0cde Mon Sep 17 00:00:00 2001
From: Kyrylo Tkachov <kyrylo.tkachov@arm.com>
Date: Thu, 7 Jan 2021 15:09:43 +0000
Subject: [PATCH] aarch64: Reimplement vabd* intrinsics using builtins

This patch reimplements the vabd* intrinsics using RTL builtins.
It's fairly straightforward with new builtins + arm_neon.h changes.

gcc/
	* config/aarch64/aarch64-simd.md (aarch64_<su>abd<mode>_3): Rename to...
	(aarch64_<su>abd<mode>): ... This.
	(<sur>sadv16qi): Adjust callsite of the above.
	* config/aarch64/aarch64-simd-builtins.def (sabd, uabd): Define
	builtins.
	* config/aarch64/arm_neon.h (vabd_s8): Reimplement using builtin.
	(vabd_s16): Likewise.
	(vabd_s32): Likewise.
	(vabd_u8): Likewise.
	(vabd_u16): Likewise.
	(vabd_u32): Likewise.
	(vabdq_s8): Likewise.
	(vabdq_s16): Likewise.
	(vabdq_s32): Likewise.
	(vabdq_u8): Likewise.
	(vabdq_u16): Likewise.
	(vabdq_u32): Likewise.
---
 gcc/config/aarch64/aarch64-simd-builtins.def |  4 ++
 gcc/config/aarch64/aarch64-simd.md           |  4 +-
 gcc/config/aarch64/arm_neon.h                | 84 ++++------------------
 3 files changed, 18 insertions(+), 74 deletions(-)

diff --git a/gcc/config/aarch64/aarch64-simd-builtins.def b/gcc/config/aarch64/aarch64-simd-builtins.def
index 3cc8e09..ca44b65 100644
--- a/gcc/config/aarch64/aarch64-simd-builtins.def
+++ b/gcc/config/aarch64/aarch64-simd-builtins.def
@@ -149,6 +149,10 @@
   BUILTIN_VDQ_BHSI (BINOP, srhadd, 0, NONE)
   BUILTIN_VDQ_BHSI (BINOP, urhadd, 0, NONE)
 
+  /* Implemented by aarch64_<su>abd<mode>.  */
+  BUILTIN_VDQ_BHSI (BINOP, sabd, 0, NONE)
+  BUILTIN_VDQ_BHSI (BINOPU, uabd, 0, NONE)
+
   /* Implemented by aarch64_<su>aba<mode>.  */
   BUILTIN_VDQ_BHSI (TERNOP, saba, 0, NONE)
   BUILTIN_VDQ_BHSI (TERNOPU, uaba, 0, NONE)
diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
index d23398e..f52cd7c 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -766,7 +766,7 @@
 ;; So (ABS:QI (minus:QI 64 -128)) == (ABS:QI (192 or -64 signed)) == 64.
 ;; Whereas SABD would return 192 (-64 signed) on the above example.
 ;; Use MINUS ([us]max (op1, op2), [us]min (op1, op2)) instead.
-(define_insn "aarch64_abd_3" +(define_insn "aarch64_abd" [(set (match_operand:VDQ_BHSI 0 "register_operand" "=w") (minus:VDQ_BHSI (USMAX:VDQ_BHSI @@ -842,7 +842,7 @@ { rtx ones = force_reg (V16QImode, CONST1_RTX (V16QImode)); rtx abd = gen_reg_rtx (V16QImode); - emit_insn (gen_aarch64_abdv16qi_3 (abd, operands[1], operands[2])); + emit_insn (gen_aarch64_abdv16qi (abd, operands[1], operands[2])); emit_insn (gen_aarch64_udotv16qi (operands[0], operands[3], abd, ones)); DONE; diff --git a/gcc/config/aarch64/arm_neon.h b/gcc/config/aarch64/arm_neon.h index 3819ed3..77f9171 100644 --- a/gcc/config/aarch64/arm_neon.h +++ b/gcc/config/aarch64/arm_neon.h @@ -6849,72 +6849,42 @@ __extension__ extern __inline int8x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vabd_s8 (int8x8_t __a, int8x8_t __b) { - int8x8_t __result; - __asm__ ("sabd %0.8b, %1.8b, %2.8b" - : "=w"(__result) - : "w"(__a), "w"(__b) - : /* No clobbers */); - return __result; + return __builtin_aarch64_sabdv8qi (__a, __b); } __extension__ extern __inline int16x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vabd_s16 (int16x4_t __a, int16x4_t __b) { - int16x4_t __result; - __asm__ ("sabd %0.4h, %1.4h, %2.4h" - : "=w"(__result) - : "w"(__a), "w"(__b) - : /* No clobbers */); - return __result; + return __builtin_aarch64_sabdv4hi (__a, __b); } __extension__ extern __inline int32x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vabd_s32 (int32x2_t __a, int32x2_t __b) { - int32x2_t __result; - __asm__ ("sabd %0.2s, %1.2s, %2.2s" - : "=w"(__result) - : "w"(__a), "w"(__b) - : /* No clobbers */); - return __result; + return __builtin_aarch64_sabdv2si (__a, __b); } __extension__ extern __inline uint8x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vabd_u8 (uint8x8_t __a, uint8x8_t __b) { - uint8x8_t __result; - __asm__ ("uabd %0.8b, %1.8b, %2.8b" - : "=w"(__result) - : "w"(__a), "w"(__b) - : /* No clobbers */); - return __result; + return __builtin_aarch64_uabdv8qi_uuu (__a, __b); } __extension__ extern __inline uint16x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vabd_u16 (uint16x4_t __a, uint16x4_t __b) { - uint16x4_t __result; - __asm__ ("uabd %0.4h, %1.4h, %2.4h" - : "=w"(__result) - : "w"(__a), "w"(__b) - : /* No clobbers */); - return __result; + return __builtin_aarch64_uabdv4hi_uuu (__a, __b); } __extension__ extern __inline uint32x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vabd_u32 (uint32x2_t __a, uint32x2_t __b) { - uint32x2_t __result; - __asm__ ("uabd %0.2s, %1.2s, %2.2s" - : "=w"(__result) - : "w"(__a), "w"(__b) - : /* No clobbers */); - return __result; + return __builtin_aarch64_uabdv2si_uuu (__a, __b); } __extension__ extern __inline int16x8_t @@ -7065,72 +7035,42 @@ __extension__ extern __inline int8x16_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vabdq_s8 (int8x16_t __a, int8x16_t __b) { - int8x16_t __result; - __asm__ ("sabd %0.16b, %1.16b, %2.16b" - : "=w"(__result) - : "w"(__a), "w"(__b) - : /* No clobbers */); - return __result; + return __builtin_aarch64_sabdv16qi (__a, __b); } __extension__ extern __inline int16x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vabdq_s16 (int16x8_t __a, int16x8_t __b) { - int16x8_t __result; - __asm__ ("sabd %0.8h, %1.8h, %2.8h" - : "=w"(__result) - : "w"(__a), "w"(__b) - : /* No clobbers */); - return __result; + return __builtin_aarch64_sabdv8hi (__a, __b); } __extension__ extern __inline 
int32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vabdq_s32 (int32x4_t __a, int32x4_t __b) { - int32x4_t __result; - __asm__ ("sabd %0.4s, %1.4s, %2.4s" - : "=w"(__result) - : "w"(__a), "w"(__b) - : /* No clobbers */); - return __result; + return __builtin_aarch64_sabdv4si (__a, __b); } __extension__ extern __inline uint8x16_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vabdq_u8 (uint8x16_t __a, uint8x16_t __b) { - uint8x16_t __result; - __asm__ ("uabd %0.16b, %1.16b, %2.16b" - : "=w"(__result) - : "w"(__a), "w"(__b) - : /* No clobbers */); - return __result; + return __builtin_aarch64_uabdv16qi_uuu (__a, __b); } __extension__ extern __inline uint16x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vabdq_u16 (uint16x8_t __a, uint16x8_t __b) { - uint16x8_t __result; - __asm__ ("uabd %0.8h, %1.8h, %2.8h" - : "=w"(__result) - : "w"(__a), "w"(__b) - : /* No clobbers */); - return __result; + return __builtin_aarch64_uabdv8hi_uuu (__a, __b); } __extension__ extern __inline uint32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vabdq_u32 (uint32x4_t __a, uint32x4_t __b) { - uint32x4_t __result; - __asm__ ("uabd %0.4s, %1.4s, %2.4s" - : "=w"(__result) - : "w"(__a), "w"(__b) - : /* No clobbers */); - return __result; + return __builtin_aarch64_uabdv4si_uuu (__a, __b); } __extension__ extern __inline int16_t -- 2.7.4
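
As a minimal usage sketch (not part of the patch; the wrapper function names
below are invented for illustration), the affected intrinsics are called
exactly as before.  Built for AArch64, each call now expands through the new
sabd/uabd builtins and should still produce a single SABD/UABD instruction:

#include <arm_neon.h>

/* Hypothetical wrapper, for illustration only: absolute difference of
   signed 8-bit lanes; expected to compile to SABD.  */
int8x8_t
abs_diff_s8 (int8x8_t a, int8x8_t b)
{
  return vabd_s8 (a, b);
}

/* Hypothetical wrapper, for illustration only: absolute difference of
   unsigned 32-bit lanes in a Q register; expected to compile to UABD.  */
uint32x4_t
abs_diff_u32q (uint32x4_t a, uint32x4_t b)
{
  return vabdq_u32 (a, b);
}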