AArch64: correct dot-product RTL patterns for aarch64.
authorTamar Christina <tamar.christina@arm.com>
Mon, 26 Jul 2021 09:23:21 +0000 (10:23 +0100)
committerTamar Christina <tamar.christina@arm.com>
Mon, 26 Jul 2021 09:23:21 +0000 (10:23 +0100)
The previous fix for this problem was wrong due to a subtle difference between
where NEON expects the RMW values and where intrinsics expects them.

The insn pattern is modeled after the intrinsics and so needs an expand for
the vectorizer optab to switch the RTL.

However operand[3] is not expected to be written to so the current pattern is
bogus.

Instead I rewrite the RTL to be in canonical ordering and merge them.

gcc/ChangeLog:

* config/aarch64/aarch64-simd-builtins.def (sdot, udot): Rename to..
(sdot_prod, udot_prod): ... This.
* config/aarch64/aarch64-simd.md (aarch64_<sur>dot<vsi2qi>): Merged
into...
(<sur>dot_prod<vsi2qi>): ... this.
(aarch64_<sur>dot_lane<vsi2qi>, aarch64_<sur>dot_laneq<vsi2qi>):
Change operands order.
(<sur>sadv16qi): Use new operands order.
* config/aarch64/arm_neon.h (vdot_u32, vdotq_u32, vdot_s32,
vdotq_s32): Use new RTL ordering.

gcc/config/aarch64/aarch64-simd-builtins.def
gcc/config/aarch64/aarch64-simd.md
gcc/config/aarch64/arm_neon.h

index 3bb45a8..402453a 100644 (file)
   BUILTIN_VSDQ_I_DI (BINOP_UUS, urshl, 0, NONE)
 
   /* Implemented by <sur><dotprod>_prod<dot_mode>.  */
-  BUILTIN_VB (TERNOP, sdot0, NONE)
-  BUILTIN_VB (TERNOPU, udot0, NONE)
+  BUILTIN_VB (TERNOP, sdot_prod, 10, NONE)
+  BUILTIN_VB (TERNOPU, udot_prod, 10, NONE)
   BUILTIN_VB (TERNOP_SUSS, usdot_prod, 10, NONE)
   /* Implemented by aarch64_<sur><dotprod>_lane{q}<dot_mode>.  */
   BUILTIN_VB (QUADOP_LANE, sdot_lane, 0, NONE)
index bf667b9..13c8698 100644 (file)
   DONE;
 })
 
-;; These instructions map to the __builtins for the Dot Product operations.
-(define_insn "aarch64_<sur>dot<vsi2qi>"
-  [(set (match_operand:VS 0 "register_operand" "=w")
-       (plus:VS (match_operand:VS 1 "register_operand" "0")
-               (unspec:VS [(match_operand:<VSI2QI> 2 "register_operand" "w")
-                           (match_operand:<VSI2QI> 3 "register_operand" "w")]
-               DOTPROD)))]
-  "TARGET_DOTPROD"
-  "<sur>dot\\t%0.<Vtype>, %2.<Vdottype>, %3.<Vdottype>"
-  [(set_attr "type" "neon_dot<q>")]
-)
-
-;; These expands map to the Dot Product optab the vectorizer checks for.
+;; These expands map to the Dot Product optab the vectorizer checks for
+;; and to the intrinsics patttern.
 ;; The auto-vectorizer expects a dot product builtin that also does an
 ;; accumulation into the provided register.
 ;; Given the following pattern
 ;; ...
 ;;
 ;; and so the vectorizer provides r, in which the result has to be accumulated.
-(define_expand "<sur>dot_prod<vsi2qi>"
-  [(set (match_operand:VS 0 "register_operand")
-       (plus:VS (unspec:VS [(match_operand:<VSI2QI> 1 "register_operand")
-                           (match_operand:<VSI2QI> 2 "register_operand")]
-                DOTPROD)
-               (match_operand:VS 3 "register_operand")))]
+(define_insn "<sur>dot_prod<vsi2qi>"
+  [(set (match_operand:VS 0 "register_operand" "=w")
+       (plus:VS
+         (unspec:VS [(match_operand:<VSI2QI> 1 "register_operand" "w")
+                     (match_operand:<VSI2QI> 2 "register_operand" "w")]
+                     DOTPROD)
+         (match_operand:VS 3 "register_operand" "0")))]
   "TARGET_DOTPROD"
-{
-  emit_insn (
-    gen_aarch64_<sur>dot<vsi2qi> (operands[3], operands[3], operands[1],
-                                   operands[2]));
-  emit_insn (gen_rtx_SET (operands[0], operands[3]));
-  DONE;
-})
+  "<sur>dot\\t%0.<Vtype>, %1.<Vdottype>, %2.<Vdottype>"
+  [(set_attr "type" "neon_dot<q>")]
+)
 
 ;; These instructions map to the __builtins for the Armv8.6-a I8MM usdot
 ;; (vector) Dot Product operation and the vectorized optab.
 ;; indexed operations.
 (define_insn "aarch64_<sur>dot_lane<vsi2qi>"
   [(set (match_operand:VS 0 "register_operand" "=w")
-       (plus:VS (match_operand:VS 1 "register_operand" "0")
-               (unspec:VS [(match_operand:<VSI2QI> 2 "register_operand" "w")
-                           (match_operand:V8QI 3 "register_operand" "<h_con>")
-                           (match_operand:SI 4 "immediate_operand" "i")]
-               DOTPROD)))]
+       (plus:VS
+         (unspec:VS [(match_operand:<VSI2QI> 2 "register_operand" "w")
+                     (match_operand:V8QI 3 "register_operand" "<h_con>")
+                     (match_operand:SI 4 "immediate_operand" "i")]
+                     DOTPROD)
+         (match_operand:VS 1 "register_operand" "0")))]
   "TARGET_DOTPROD"
   {
     operands[4] = aarch64_endian_lane_rtx (V8QImode, INTVAL (operands[4]));
 
 (define_insn "aarch64_<sur>dot_laneq<vsi2qi>"
   [(set (match_operand:VS 0 "register_operand" "=w")
-       (plus:VS (match_operand:VS 1 "register_operand" "0")
-               (unspec:VS [(match_operand:<VSI2QI> 2 "register_operand" "w")
-                           (match_operand:V16QI 3 "register_operand" "<h_con>")
-                           (match_operand:SI 4 "immediate_operand" "i")]
-               DOTPROD)))]
+       (plus:VS
+         (unspec:VS [(match_operand:<VSI2QI> 2 "register_operand" "w")
+                     (match_operand:V16QI 3 "register_operand" "<h_con>")
+                     (match_operand:SI 4 "immediate_operand" "i")]
+                     DOTPROD)
+         (match_operand:VS 1 "register_operand" "0")))]
   "TARGET_DOTPROD"
   {
     operands[4] = aarch64_endian_lane_rtx (V16QImode, INTVAL (operands[4]));
        rtx ones = force_reg (V16QImode, CONST1_RTX (V16QImode));
        rtx abd = gen_reg_rtx (V16QImode);
        emit_insn (gen_aarch64_<sur>abdv16qi (abd, operands[1], operands[2]));
-       emit_insn (gen_aarch64_udotv16qi (operands[0], operands[3],
-                                         abd, ones));
+       emit_insn (gen_udot_prodv16qi (operands[0], abd, ones, operands[3]));
        DONE;
       }
     rtx reduc = gen_reg_rtx (V8HImode);
index 0f43994..313b35f 100644 (file)
@@ -31472,28 +31472,28 @@ __extension__ extern __inline uint32x2_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vdot_u32 (uint32x2_t __r, uint8x8_t __a, uint8x8_t __b)
 {
-  return __builtin_aarch64_udotv8qi_uuuu (__r, __a, __b);
+  return __builtin_aarch64_udot_prodv8qi_uuuu (__a, __b, __r);
 }
 
 __extension__ extern __inline uint32x4_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vdotq_u32 (uint32x4_t __r, uint8x16_t __a, uint8x16_t __b)
 {
-  return __builtin_aarch64_udotv16qi_uuuu (__r, __a, __b);
+  return __builtin_aarch64_udot_prodv16qi_uuuu (__a, __b, __r);
 }
 
 __extension__ extern __inline int32x2_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vdot_s32 (int32x2_t __r, int8x8_t __a, int8x8_t __b)
 {
-  return __builtin_aarch64_sdotv8qi (__r, __a, __b);
+  return __builtin_aarch64_sdot_prodv8qi (__a, __b, __r);
 }
 
 __extension__ extern __inline int32x4_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vdotq_s32 (int32x4_t __r, int8x16_t __a, int8x16_t __b)
 {
-  return __builtin_aarch64_sdotv16qi (__r, __a, __b);
+  return __builtin_aarch64_sdot_prodv16qi (__a, __b, __r);
 }
 
 __extension__ extern __inline uint32x2_t