From: Tamar Christina Date: Wed, 14 Jul 2021 14:23:23 +0000 (+0100) Subject: AArch64: Correct dot-product auto-vect optab RTL X-Git-Tag: upstream/12.2.0~6395 X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=6d1cdb27828d2ef1ae1ab0209836646a269b9610;p=platform%2Fupstream%2Fgcc.git AArch64: Correct dot-product auto-vect optab RTL The current RTL for the vectorizer patterns for dot-product are incorrect. Operand3 isn't an output parameter so we can't write to it. This fixes this issue and reduces the number of RTL. gcc/ChangeLog: * config/aarch64/aarch64-simd-builtins.def (udot, sdot): Rename to... (sdot_prod, udot_prod): ...These. * config/aarch64/aarch64-simd.md (dot_prod): Remove. (aarch64_dot): Rename to... (dot_prod): ...This. * config/aarch64/arm_neon.h (vdot_u32, vdotq_u32, vdot_s32, vdotq_s32): Update builtins. --- diff --git a/gcc/config/aarch64/aarch64-simd-builtins.def b/gcc/config/aarch64/aarch64-simd-builtins.def index 063f503..99e7348 100644 --- a/gcc/config/aarch64/aarch64-simd-builtins.def +++ b/gcc/config/aarch64/aarch64-simd-builtins.def @@ -375,8 +375,8 @@ BUILTIN_VSDQ_I_DI (BINOP_UUS, urshl, 0, NONE) /* Implemented by _prod. */ - BUILTIN_VB (TERNOP, sdot, 0, NONE) - BUILTIN_VB (TERNOPU, udot, 0, NONE) + BUILTIN_VB (TERNOP, sdot_prod, 10, NONE) + BUILTIN_VB (TERNOPU, udot_prod, 10, NONE) BUILTIN_VB (TERNOP_SSUS, usdot_prod, 10, NONE) /* Implemented by aarch64__lane{q}. */ BUILTIN_VB (QUADOP_LANE, sdot_lane, 0, NONE) diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md index 7489098..88fa5ba 100644 --- a/gcc/config/aarch64/aarch64-simd.md +++ b/gcc/config/aarch64/aarch64-simd.md @@ -587,8 +587,28 @@ DONE; }) -;; These instructions map to the __builtins for the Dot Product operations. -(define_insn "aarch64_dot" +;; These expands map to the Dot Product optab the vectorizer checks for +;; and to the intrinsics patttern. +;; The auto-vectorizer expects a dot product builtin that also does an +;; accumulation into the provided register. +;; Given the following pattern +;; +;; for (i=0; idot_prod" [(set (match_operand:VS 0 "register_operand" "=w") (plus:VS (match_operand:VS 1 "register_operand" "0") (unspec:VS [(match_operand: 2 "register_operand" "w") @@ -613,41 +633,6 @@ [(set_attr "type" "neon_dot")] ) -;; These expands map to the Dot Product optab the vectorizer checks for. -;; The auto-vectorizer expects a dot product builtin that also does an -;; accumulation into the provided register. -;; Given the following pattern -;; -;; for (i=0; idot_prod" - [(set (match_operand:VS 0 "register_operand") - (plus:VS (unspec:VS [(match_operand: 1 "register_operand") - (match_operand: 2 "register_operand")] - DOTPROD) - (match_operand:VS 3 "register_operand")))] - "TARGET_DOTPROD" -{ - emit_insn ( - gen_aarch64_dot (operands[3], operands[3], operands[1], - operands[2])); - emit_insn (gen_rtx_SET (operands[0], operands[3])); - DONE; -}) - ;; These instructions map to the __builtins for the Dot Product ;; indexed operations. (define_insn "aarch64_dot_lane" @@ -944,8 +929,7 @@ rtx ones = force_reg (V16QImode, CONST1_RTX (V16QImode)); rtx abd = gen_reg_rtx (V16QImode); emit_insn (gen_aarch64_abdv16qi (abd, operands[1], operands[2])); - emit_insn (gen_aarch64_udotv16qi (operands[0], operands[3], - abd, ones)); + emit_insn (gen_udot_prodv16qi (operands[0], operands[3], abd, ones)); DONE; } rtx reduc = gen_reg_rtx (V8HImode); diff --git a/gcc/config/aarch64/arm_neon.h b/gcc/config/aarch64/arm_neon.h index 00d76ea..597f44c 100644 --- a/gcc/config/aarch64/arm_neon.h +++ b/gcc/config/aarch64/arm_neon.h @@ -31767,28 +31767,28 @@ __extension__ extern __inline uint32x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vdot_u32 (uint32x2_t __r, uint8x8_t __a, uint8x8_t __b) { - return __builtin_aarch64_udotv8qi_uuuu (__r, __a, __b); + return __builtin_aarch64_udot_prodv8qi_uuuu (__r, __a, __b); } __extension__ extern __inline uint32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vdotq_u32 (uint32x4_t __r, uint8x16_t __a, uint8x16_t __b) { - return __builtin_aarch64_udotv16qi_uuuu (__r, __a, __b); + return __builtin_aarch64_udot_prodv16qi_uuuu (__r, __a, __b); } __extension__ extern __inline int32x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vdot_s32 (int32x2_t __r, int8x8_t __a, int8x8_t __b) { - return __builtin_aarch64_sdotv8qi (__r, __a, __b); + return __builtin_aarch64_sdot_prodv8qi (__r, __a, __b); } __extension__ extern __inline int32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vdotq_s32 (int32x4_t __r, int8x16_t __a, int8x16_t __b) { - return __builtin_aarch64_sdotv16qi (__r, __a, __b); + return __builtin_aarch64_sdot_prodv16qi (__r, __a, __b); } __extension__ extern __inline uint32x2_t