AArch32: correct dot-product RTL patterns.

author Tamar Christina <tamar.christina@arm.com>

Mon, 7 Feb 2022 12:54:42 +0000 (12:54 +0000)

committer Tamar Christina <tamar.christina@arm.com>

Mon, 7 Feb 2022 12:56:37 +0000 (12:56 +0000)
author Tamar Christina <tamar.christina@arm.com>
Mon, 7 Feb 2022 12:54:42 +0000 (12:54 +0000)
committer Tamar Christina <tamar.christina@arm.com>
Mon, 7 Feb 2022 12:56:37 +0000 (12:56 +0000)
diff --git a/gcc/config/arm/arm_neon.h b/gcc/config/arm/arm_neon.h

index 9b6d599..fdfea33 100644 (file)
--- a/gcc/config/arm/arm_neon.h
+++ b/gcc/config/arm/arm_neon.h
@@ -18243,6 +18243,35 @@ vdotq_lane_s32 (int32x4_t __r, int8x16_t __a, int8x8_t __b, const int __index)
    return __builtin_neon_sdot_lanev16qi (__r, __a, __b, __index);
  }
  
+__extension__ extern __inline uint32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vdot_laneq_u32 (uint32x2_t __r, uint8x8_t __a, uint8x16_t __b, const int __index)
+{
+  return __builtin_neon_udot_laneqv8qi_uuuus (__r, __a, __b, __index);
+}
+
+__extension__ extern __inline uint32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vdotq_laneq_u32 (uint32x4_t __r, uint8x16_t __a, uint8x16_t __b,
+               const int __index)
+{
+  return __builtin_neon_udot_laneqv16qi_uuuus (__r, __a, __b, __index);
+}
+
+__extension__ extern __inline int32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vdot_laneq_s32 (int32x2_t __r, int8x8_t __a, int8x16_t __b, const int __index)
+{
+  return __builtin_neon_sdot_laneqv8qi (__r, __a, __b, __index);
+}
+
+__extension__ extern __inline int32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vdotq_laneq_s32 (int32x4_t __r, int8x16_t __a, int8x16_t __b, const int __index)
+{
+  return __builtin_neon_sdot_laneqv16qi (__r, __a, __b, __index);
+}
+
  #pragma GCC pop_options
  #endif
  
diff --git a/gcc/config/arm/arm_neon_builtins.def b/gcc/config/arm/arm_neon_builtins.def

index 865de65..c29ae3a 100644 (file)
--- a/gcc/config/arm/arm_neon_builtins.def
+++ b/gcc/config/arm/arm_neon_builtins.def
@@ -342,6 +342,8 @@ VAR2 (TERNOP, sdot, v8qi, v16qi)
  VAR2 (UTERNOP, udot, v8qi, v16qi)
  VAR2 (MAC_LANE, sdot_lane, v8qi, v16qi)
  VAR2 (UMAC_LANE, udot_lane, v8qi, v16qi)
+VAR2 (MAC_LANE, sdot_laneq, v8qi, v16qi)
+VAR2 (UMAC_LANE, udot_laneq, v8qi, v16qi)
  
  VAR1 (USTERNOP, usdot, v8qi)
  VAR2 (USMAC_LANE_QUADTUP, usdot_lane, v8qi, v16qi)
diff --git a/gcc/config/arm/neon.md b/gcc/config/arm/neon.md

index e06c824..4a8987b 100644 (file)
--- a/gcc/config/arm/neon.md
+++ b/gcc/config/arm/neon.md
@@ -2866,20 +2866,49 @@
  })
  
  
-;; These instructions map to the __builtins for the Dot Product operations.
-(define_insn "neon_<sup>dot<vsi2qi>"
+;; These map to the auto-vectorizer Dot Product optab.
+;; The auto-vectorizer expects a dot product builtin that also does an
+;; accumulation into the provided register.
+;; Given the following pattern
+;;
+;; for (i=0; i<len; i++) {
+;;     c = a[i] * b[i];
+;;     r += c;
+;; }
+;; return result;
+;;
+;; This can be auto-vectorized to
+;; r  = a[0]*b[0] + a[1]*b[1] + a[2]*b[2] + a[3]*b[3];
+;;
+;; given enough iterations.  However the vectorizer can keep unrolling the loop
+;; r += a[4]*b[4] + a[5]*b[5] + a[6]*b[6] + a[7]*b[7];
+;; r += a[8]*b[8] + a[9]*b[9] + a[10]*b[10] + a[11]*b[11];
+;; ...
+;;
+;; and so the vectorizer provides r, in which the result has to be accumulated.
+(define_insn "<sup>dot_prod<vsi2qi>"
    [(set (match_operand:VCVTI 0 "register_operand" "=w")
-       (plus:VCVTI (match_operand:VCVTI 1 "register_operand" "0")
-                   (unspec:VCVTI [(match_operand:<VSI2QI> 2
-                                                       "register_operand" "w")
-                                  (match_operand:<VSI2QI> 3
-                                                       "register_operand" "w")]
-               DOTPROD)))]
+       (plus:VCVTI
+         (unspec:VCVTI [(match_operand:<VSI2QI> 1 "register_operand" "w")
+                        (match_operand:<VSI2QI> 2 "register_operand" "w")]
+                        DOTPROD)
+         (match_operand:VCVTI 3 "register_operand" "0")))]
    "TARGET_DOTPROD"
-  "v<sup>dot.<opsuffix>\\t%<V_reg>0, %<V_reg>2, %<V_reg>3"
+  "v<sup>dot.<opsuffix>\\t%<V_reg>0, %<V_reg>1, %<V_reg>2"
    [(set_attr "type" "neon_dot<q>")]
  )
  
+;; These instructions map to the __builtins for the Dot Product operations
+(define_expand "neon_<sup>dot<vsi2qi>"
+  [(set (match_operand:VCVTI 0 "register_operand" "=w")
+       (plus:VCVTI
+         (unspec:VCVTI [(match_operand:<VSI2QI> 2 "register_operand")
+                        (match_operand:<VSI2QI> 3 "register_operand")]
+                        DOTPROD)
+         (match_operand:VCVTI 1 "register_operand")))]
+  "TARGET_DOTPROD"
+)
+
  ;; These instructions map to the __builtins for the Dot Product operations.
  (define_insn "neon_usdot<vsi2qi>"
    [(set (match_operand:VCVTI 0 "register_operand" "=w")
@@ -2898,17 +2927,40 @@
  ;; indexed operations.
  (define_insn "neon_<sup>dot_lane<vsi2qi>"
    [(set (match_operand:VCVTI 0 "register_operand" "=w")
-       (plus:VCVTI (match_operand:VCVTI 1 "register_operand" "0")
-                   (unspec:VCVTI [(match_operand:<VSI2QI> 2
-                                                       "register_operand" "w")
-                                  (match_operand:V8QI 3 "register_operand" "t")
-                                  (match_operand:SI 4 "immediate_operand" "i")]
-               DOTPROD)))]
+       (plus:VCVTI
+         (unspec:VCVTI [(match_operand:<VSI2QI> 2 "register_operand" "w")
+                        (match_operand:V8QI 3 "register_operand" "t")
+                        (match_operand:SI 4 "immediate_operand" "i")]
+                        DOTPROD)
+         (match_operand:VCVTI 1 "register_operand" "0")))]
+  "TARGET_DOTPROD"
+  "v<sup>dot.<opsuffix>\\t%<V_reg>0, %<V_reg>2, %P3[%c4]";
+  [(set_attr "type" "neon_dot<q>")]
+)
+
+;; These instructions map to the __builtins for the Dot Product
+;; indexed operations.
+(define_insn "neon_<sup>dot_laneq<vsi2qi>"
+  [(set (match_operand:VCVTI 0 "register_operand" "=w")
+       (plus:VCVTI
+         (unspec:VCVTI [(match_operand:<VSI2QI> 2 "register_operand" "w")
+                        (match_operand:V16QI 3 "register_operand" "t")
+                        (match_operand:SI 4 "immediate_operand" "i")]
+                        DOTPROD)
+         (match_operand:VCVTI 1 "register_operand" "0")))]
    "TARGET_DOTPROD"
    {
-    operands[4]
-      = GEN_INT (NEON_ENDIAN_LANE_N (V8QImode, INTVAL (operands[4])));
-    return "v<sup>dot.<opsuffix>\\t%<V_reg>0, %<V_reg>2, %P3[%c4]";
+    int lane = INTVAL (operands[4]);
+    if (lane > GET_MODE_NUNITS (V2SImode) - 1)
+      {
+       operands[4] = GEN_INT (lane - GET_MODE_NUNITS (V2SImode));
+       return "v<sup>dot.<opsuffix>\\t%<V_reg>0, %<V_reg>2, %f3[%c4]";
+      }
+    else
+      {
+       operands[4] = GEN_INT (lane);
+       return "v<sup>dot.<opsuffix>\\t%<V_reg>0, %<V_reg>2, %e3[%c4]";
+      }
    }
    [(set_attr "type" "neon_dot<q>")]
  )
@@ -2932,43 +2984,6 @@
    [(set_attr "type" "neon_dot<q>")]
  )
  
-;; These expands map to the Dot Product optab the vectorizer checks for.
-;; The auto-vectorizer expects a dot product builtin that also does an
-;; accumulation into the provided register.
-;; Given the following pattern
-;;
-;; for (i=0; i<len; i++) {
-;;     c = a[i] * b[i];
-;;     r += c;
-;; }
-;; return result;
-;;
-;; This can be auto-vectorized to
-;; r  = a[0]*b[0] + a[1]*b[1] + a[2]*b[2] + a[3]*b[3];
-;;
-;; given enough iterations.  However the vectorizer can keep unrolling the loop
-;; r += a[4]*b[4] + a[5]*b[5] + a[6]*b[6] + a[7]*b[7];
-;; r += a[8]*b[8] + a[9]*b[9] + a[10]*b[10] + a[11]*b[11];
-;; ...
-;;
-;; and so the vectorizer provides r, in which the result has to be accumulated.
-(define_expand "<sup>dot_prod<vsi2qi>"
-  [(set (match_operand:VCVTI 0 "register_operand")
-       (plus:VCVTI (unspec:VCVTI [(match_operand:<VSI2QI> 1
-                                                       "register_operand")
-                                  (match_operand:<VSI2QI> 2
-                                                       "register_operand")]
-                    DOTPROD)
-                   (match_operand:VCVTI 3 "register_operand")))]
-  "TARGET_DOTPROD"
-{
-  emit_insn (
-    gen_neon_<sup>dot<vsi2qi> (operands[3], operands[3], operands[1],
-                                operands[2]));
-  emit_insn (gen_rtx_SET (operands[0], operands[3]));
-  DONE;
-})
-
  ;; Auto-vectorizer pattern for usdot
  (define_expand "usdot_prod<vsi2qi>"
    [(set (match_operand:VCVTI 0 "register_operand")
diff --git a/gcc/testsuite/gcc.target/arm/simd/vdot-compile.c b/gcc/testsuite/gcc.target/arm/simd/vdot-compile.c

index b3bd3bf..d3541e8 100644 (file)
--- a/gcc/testsuite/gcc.target/arm/simd/vdot-compile.c
+++ b/gcc/testsuite/gcc.target/arm/simd/vdot-compile.c
@@ -49,8 +49,28 @@ int32x4_t sfooq_lane (int32x4_t r, int8x16_t x, int8x8_t y)
    return vdotq_lane_s32 (r, x, y, 0);
  }
  
-/* { dg-final { scan-assembler-times {v[us]dot\.[us]8\td[0-9]+, d[0-9]+, d[0-9]+} 4 } } */
+int32x2_t sfoo_laneq1 (int32x2_t r, int8x8_t x, int8x16_t y)
+{
+  return vdot_laneq_s32 (r, x, y, 0);
+}
+
+int32x4_t sfooq_lane1 (int32x4_t r, int8x16_t x, int8x16_t y)
+{
+  return vdotq_laneq_s32 (r, x, y, 0);
+}
+
+int32x2_t sfoo_laneq2 (int32x2_t r, int8x8_t x, int8x16_t y)
+{
+  return vdot_laneq_s32 (r, x, y, 2);
+}
+
+int32x4_t sfooq_lane2 (int32x4_t r, int8x16_t x, int8x16_t y)
+{
+  return vdotq_laneq_s32 (r, x, y, 2);
+}
+
+/* { dg-final { scan-assembler-times {v[us]dot\.[us]8\td[0-9]+, d[0-9]+, d[0-9]+} 6 } } */
  /* { dg-final { scan-assembler-times {v[us]dot\.[us]8\tq[0-9]+, q[0-9]+, q[0-9]+} 2 } } */
-/* { dg-final { scan-assembler-times {v[us]dot\.[us]8\td[0-9]+, d[0-9]+, d[0-9]+\[#?[0-9]\]} 2 } } */
-/* { dg-final { scan-assembler-times {v[us]dot\.[us]8\tq[0-9]+, q[0-9]+, d[0-9]+\[#?[0-9]\]} 2 } } */
+/* { dg-final { scan-assembler-times {v[us]dot\.[us]8\td[0-9]+, d[0-9]+, d[0-9]+\[#?[0-9]\]} 4 } } */
+/* { dg-final { scan-assembler-times {v[us]dot\.[us]8\tq[0-9]+, q[0-9]+, d[0-9]+\[#?[0-9]\]} 4 } } */
  
diff --git a/gcc/testsuite/gcc.target/arm/simd/vdot-exec.c b/gcc/testsuite/gcc.target/arm/simd/vdot-exec.c

index 054f470..89a196e 100644 (file)
--- a/gcc/testsuite/gcc.target/arm/simd/vdot-exec.c
+++ b/gcc/testsuite/gcc.target/arm/simd/vdot-exec.c
@@ -10,7 +10,7 @@ extern void abort();
  #if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
  # define ORDER(x, y) y
  #else
-# define ORDER(x, y) x - y
+# define ORDER(x, y) (x - y)
  #endif
  
  #define P(n1,n2) n1,n1,n1,n1,n2,n2,n2,n2
@@ -33,7 +33,20 @@ extern void abort();
         t3 f##_##rx1 = {0};                         \
         f##_##rx1 =  f (f##_##rx1, f##_##x, f##_##y, ORDER (1, 1));  \
         if (f##_##rx1[0] != n3 || f##_##rx1[1] != n4)   \
-         abort (); \
+         abort ();
+
+#define P2(n1,n2) n1,n1,n1,n1,n2,n2,n2,n2,n1,n1,n1,n1,n2,n2,n2,n2
+#define TEST_LANEQ(t1, t2, t3, f, r1, r2, n1, n2, n3, n4) \
+       ARR(f, x, t1, r1);                  \
+       ARR(f, y, t2, r2);                  \
+       t3 f##_##rx = {0};                  \
+       f##_##rx = f (f##_##rx, f##_##x, f##_##y, ORDER (3, 2));  \
+       if (f##_##rx[0] != n1 || f##_##rx[1] != n2)   \
+         abort ();                                 \
+       t3 f##_##rx1 = {0};                         \
+       f##_##rx1 =  f (f##_##rx1, f##_##x, f##_##y, ORDER (3, 3));  \
+       if (f##_##rx1[0] != n3 || f##_##rx1[1] != n4)   \
+         abort ();
  
  int
  main()
@@ -45,11 +58,16 @@ main()
    TEST (int8x16_t, int8x16_t, int32x4_t, vdotq_s32, P(1,2), P(-2,-3), -8, -24);
  
    TEST_LANE (uint8x8_t, uint8x8_t, uint32x2_t, vdot_lane_u32, P(1,2), P(2,3), 8, 16, 12, 24);
-
    TEST_LANE (int8x8_t, int8x8_t, int32x2_t, vdot_lane_s32, P(1,2), P(-2,-3), -8, -16, -12, -24);
  
    TEST_LANE (uint8x16_t, uint8x8_t, uint32x4_t, vdotq_lane_u32, P(1,2), P(2,3), 8, 16, 12, 24);
    TEST_LANE (int8x16_t, int8x8_t, int32x4_t, vdotq_lane_s32, P(1,2), P(-2,-3), -8, -16, -12, -24);
  
+  TEST_LANEQ (uint8x8_t, uint8x16_t, uint32x2_t, vdot_laneq_u32, P(1,2), P2(2,3), 8, 16, 12, 24);
+  TEST_LANEQ (int8x8_t, int8x16_t, int32x2_t, vdot_laneq_s32, P(1,2), P2(-2,-3), -8, -16, -12, -24);
+
+  TEST_LANEQ (uint8x16_t, uint8x16_t, uint32x4_t, vdotq_laneq_u32, P2(1,2), P2(2,3), 8, 16, 12, 24);
+  TEST_LANEQ (int8x16_t, int8x16_t, int32x4_t, vdotq_laneq_s32, P2(1,2), P2(-2,-3), -8, -16, -12, -24);
+
    return 0;
  }
author	Tamar Christina <tamar.christina@arm.com>
	Mon, 7 Feb 2022 12:54:42 +0000 (12:54 +0000)
committer	Tamar Christina <tamar.christina@arm.com>
	Mon, 7 Feb 2022 12:56:37 +0000 (12:56 +0000)
gcc/config/arm/arm_neon.h		patch \| blob \| history
gcc/config/arm/arm_neon_builtins.def		patch \| blob \| history
gcc/config/arm/neon.md		patch \| blob \| history
gcc/testsuite/gcc.target/arm/simd/vdot-compile.c		patch \| blob \| history
gcc/testsuite/gcc.target/arm/simd/vdot-exec.c		patch \| blob \| history