vect+aarch64: Fix ldp_stp_* regressions

author Richard Sandiford <richard.sandiford@arm.com>

Tue, 15 Feb 2022 18:09:33 +0000 (18:09 +0000)

committer Richard Sandiford <richard.sandiford@arm.com>

Tue, 15 Feb 2022 18:09:33 +0000 (18:09 +0000)
author Richard Sandiford <richard.sandiford@arm.com>
Tue, 15 Feb 2022 18:09:33 +0000 (18:09 +0000)
committer Richard Sandiford <richard.sandiford@arm.com>
Tue, 15 Feb 2022 18:09:33 +0000 (18:09 +0000)
diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc

index e3f18fb..1a460d4 100644 (file)
--- a/gcc/config/aarch64/aarch64.cc
+++ b/gcc/config/aarch64/aarch64.cc
@@ -14932,6 +14932,31 @@ private:
       - If M_VEC_FLAGS & VEC_ANY_SVE is nonzero then we're costing SVE code.  */
    unsigned int m_vec_flags = 0;
  
+  /* At the moment, we do not model LDP and STP in the vector and scalar costs.
+     This means that code such as:
+
+       a[0] = x;
+       a[1] = x;
+
+     will be costed as two scalar instructions and two vector instructions
+     (a scalar_to_vec and an unaligned_store).  For SLP, the vector form
+     wins if the costs are equal, because of the fact that the vector costs
+     include constant initializations whereas the scalar costs don't.
+     We would therefore tend to vectorize the code above, even though
+     the scalar version can use a single STP.
+
+     We should eventually fix this and model LDP and STP in the main costs;
+     see the comment in aarch64_sve_adjust_stmt_cost for some of the problems.
+     Until then, we look specifically for code that does nothing more than
+     STP-like operations.  We cost them on that basis in addition to the
+     normal latency-based costs.
+
+     If the scalar or vector code could be a sequence of STPs +
+     initialization, this variable counts the cost of the sequence,
+     with 2 units per instruction.  The variable is ~0U for other
+     kinds of code.  */
+  unsigned int m_stp_sequence_cost = 0;
+
    /* On some CPUs, SVE and Advanced SIMD provide the same theoretical vector
       throughput, such as 4x128 Advanced SIMD vs. 2x256 SVE.  In those
       situations, we try to predict whether an Advanced SIMD implementation
@@ -15724,6 +15749,104 @@ aarch64_vector_costs::count_ops (unsigned int count, vect_cost_for_stmt kind,
      }
  }
  
+/* Return true if STMT_INFO contains a memory access and if the constant
+   component of the memory address is aligned to SIZE bytes.  */
+static bool
+aarch64_aligned_constant_offset_p (stmt_vec_info stmt_info,
+                                  poly_uint64 size)
+{
+  if (!STMT_VINFO_DATA_REF (stmt_info))
+    return false;
+
+  if (auto first_stmt = DR_GROUP_FIRST_ELEMENT (stmt_info))
+    stmt_info = first_stmt;
+  tree constant_offset = DR_INIT (STMT_VINFO_DATA_REF (stmt_info));
+  /* Needed for gathers & scatters, for example.  */
+  if (!constant_offset)
+    return false;
+
+  return multiple_p (wi::to_poly_offset (constant_offset), size);
+}
+
+/* Check if a scalar or vector stmt could be part of a region of code
+   that does nothing more than store values to memory, in the scalar
+   case using STP.  Return the cost of the stmt if so, counting 2 for
+   one instruction.  Return ~0U otherwise.
+
+   The arguments are a subset of those passed to add_stmt_cost.  */
+unsigned int
+aarch64_stp_sequence_cost (unsigned int count, vect_cost_for_stmt kind,
+                          stmt_vec_info stmt_info, tree vectype)
+{
+  /* Code that stores vector constants uses a vector_load to create
+     the constant.  We don't apply the heuristic to that case for two
+     main reasons:
+
+     - At the moment, STPs are only formed via peephole2, and the
+       constant scalar moves would often come between STRs and so
+       prevent STP formation.
+
+     - The scalar code also has to load the constant somehow, and that
+       isn't costed.  */
+  switch (kind)
+    {
+    case scalar_to_vec:
+      /* Count 2 insns for a GPR->SIMD dup and 1 insn for a FPR->SIMD dup.  */
+      return (FLOAT_TYPE_P (vectype) ? 2 : 4) * count;
+
+    case vec_construct:
+      if (FLOAT_TYPE_P (vectype))
+       /* Count 1 insn for the maximum number of FP->SIMD INS
+          instructions.  */
+       return (vect_nunits_for_cost (vectype) - 1) * 2 * count;
+
+      /* Count 2 insns for a GPR->SIMD move and 2 insns for the
+        maximum number of GPR->SIMD INS instructions.  */
+      return vect_nunits_for_cost (vectype) * 4 * count;
+
+    case vector_store:
+    case unaligned_store:
+      /* Count 1 insn per vector if we can't form STP Q pairs.  */
+      if (aarch64_sve_mode_p (TYPE_MODE (vectype)))
+       return count * 2;
+      if (aarch64_tune_params.extra_tuning_flags
+         & AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS)
+       return count * 2;
+
+      if (stmt_info)
+       {
+         /* Assume we won't be able to use STP if the constant offset
+            component of the address is misaligned.  ??? This could be
+            removed if we formed STP pairs earlier, rather than relying
+            on peephole2.  */
+         auto size = GET_MODE_SIZE (TYPE_MODE (vectype));
+         if (!aarch64_aligned_constant_offset_p (stmt_info, size))
+           return count * 2;
+       }
+      return CEIL (count, 2) * 2;
+
+    case scalar_store:
+      if (stmt_info && STMT_VINFO_DATA_REF (stmt_info))
+       {
+         /* Check for a mode in which STP pairs can be formed.  */
+         auto size = GET_MODE_SIZE (TYPE_MODE (aarch64_dr_type (stmt_info)));
+         if (maybe_ne (size, 4) && maybe_ne (size, 8))
+           return ~0U;
+
+         /* Assume we won't be able to use STP if the constant offset
+            component of the address is misaligned.  ??? This could be
+            removed if we formed STP pairs earlier, rather than relying
+            on peephole2.  */
+         if (!aarch64_aligned_constant_offset_p (stmt_info, size))
+           return ~0U;
+       }
+      return count;
+
+    default:
+      return ~0U;
+    }
+}
+
  unsigned
  aarch64_vector_costs::add_stmt_cost (int count, vect_cost_for_stmt kind,
                                      stmt_vec_info stmt_info, tree vectype,
@@ -15747,6 +15870,14 @@ aarch64_vector_costs::add_stmt_cost (int count, vect_cost_for_stmt kind,
        m_analyzed_vinfo = true;
      }
  
+  /* Apply the heuristic described above m_stp_sequence_cost.  */
+  if (m_stp_sequence_cost != ~0U)
+    {
+      uint64_t cost = aarch64_stp_sequence_cost (count, kind,
+                                                stmt_info, vectype);
+      m_stp_sequence_cost = MIN (m_stp_sequence_cost + cost, ~0U);
+    }
+
    /* Try to get a more accurate cost by looking at STMT_INFO instead
       of just looking at KIND.  */
    if (stmt_info && aarch64_use_new_vector_costs_p ())
@@ -16017,6 +16148,15 @@ aarch64_vector_costs::finish_cost (const vector_costs *uncast_scalar_costs)
      m_costs[vect_body] = adjust_body_cost (loop_vinfo, scalar_costs,
                                            m_costs[vect_body]);
  
+  /* Apply the heuristic described above m_stp_sequence_cost.  Prefer
+     the scalar code in the event of a tie, since there is more chance
+     of scalar code being optimized with surrounding operations.  */
+  if (!loop_vinfo
+      && scalar_costs
+      && m_stp_sequence_cost != ~0U
+      && m_stp_sequence_cost >= scalar_costs->m_stp_sequence_cost)
+    m_costs[vect_body] = 2 * scalar_costs->total_cost ();
+
    vector_costs::finish_cost (scalar_costs);
  }
  
diff --git a/gcc/testsuite/gcc.target/aarch64/ldp_stp_14.c b/gcc/testsuite/gcc.target/aarch64/ldp_stp_14.c

new file mode 100644 (file)

index 0000000..c7b5f7d
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/ldp_stp_14.c
@@ -0,0 +1,89 @@
+/* { dg-options "-O2 -fno-tree-loop-distribute-patterns" } */
+/* { dg-final { check-function-bodies "**" "" "" { target lp64 } } } */
+
+#include "ldp_stp_14.h"
+
+/*
+** const_2_int16_t_0:
+**     str     wzr, \[x0\]
+**     ret
+*/
+CONST_FN (2, int16_t, 0);
+
+/*
+** const_4_int16_t_0:
+**     str     xzr, \[x0\]
+**     ret
+*/
+CONST_FN (4, int16_t, 0);
+
+/*
+** const_8_int16_t_0:
+**     stp     xzr, xzr, \[x0\]
+**     ret
+*/
+CONST_FN (8, int16_t, 0);
+
+/* No preference between vectorizing or not vectorizing here.  */
+CONST_FN (16, int16_t, 0);
+
+/*
+** const_32_int16_t_0:
+**     movi    v([0-9]+)\.4s, .*
+**     stp     q\1, q\1, \[x0\]
+**     stp     q\1, q\1, \[x0, #?32\]
+**     ret
+*/
+CONST_FN (32, int16_t, 0);
+
+/* No preference between vectorizing or not vectorizing here.  */
+CONST_FN (2, int16_t, 1);
+
+/*
+** const_4_int16_t_1:
+**     movi    v([0-9]+)\.4h, .*
+**     str     d\1, \[x0\]
+**     ret
+*/
+CONST_FN (4, int16_t, 1);
+
+/*
+** const_8_int16_t_1:
+**     movi    v([0-9]+)\.8h, .*
+**     str     q\1, \[x0\]
+**     ret
+*/
+CONST_FN (8, int16_t, 1);
+
+/* Fuzzy match due to PR104387.  */
+/*
+** dup_2_int16_t:
+**     ...
+**     strh    w1, \[x0, #?2\]
+**     ret
+*/
+DUP_FN (2, int16_t);
+
+/*
+** dup_4_int16_t:
+**     dup     v([0-9]+)\.4h, w1
+**     str     d\1, \[x0\]
+**     ret
+*/
+DUP_FN (4, int16_t);
+
+/*
+** dup_8_int16_t:
+**     dup     v([0-9]+)\.8h, w1
+**     str     q\1, \[x0\]
+**     ret
+*/
+DUP_FN (8, int16_t);
+
+/*
+** cons2_1_int16_t:
+**     strh    w1, \[x0\]
+**     strh    w2, \[x0, #?2\]
+**     ret
+*/
+CONS2_FN (1, int16_t);
diff --git a/gcc/testsuite/gcc.target/aarch64/ldp_stp_14.h b/gcc/testsuite/gcc.target/aarch64/ldp_stp_14.h

new file mode 100644 (file)

index 0000000..39c463f
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/ldp_stp_14.h
@@ -0,0 +1,50 @@
+#include <stdint.h>
+
+#define PRAGMA(X) _Pragma (#X)
+#define UNROLL(COUNT) PRAGMA (GCC unroll (COUNT))
+
+#define CONST_FN(COUNT, TYPE, VAL)             \
+  void                                         \
+  const_##COUNT##_##TYPE##_##VAL (TYPE *x)     \
+  {                                            \
+    UNROLL (COUNT)                             \
+    for (int i = 0; i < COUNT; ++i)            \
+      x[i] = VAL;                              \
+  }
+
+#define DUP_FN(COUNT, TYPE)                    \
+  void                                         \
+  dup_##COUNT##_##TYPE (TYPE *x, TYPE val)     \
+  {                                            \
+    UNROLL (COUNT)                             \
+    for (int i = 0; i < COUNT; ++i)            \
+      x[i] = val;                              \
+  }
+
+#define CONS2_FN(COUNT, TYPE)                                  \
+  void                                                         \
+  cons2_##COUNT##_##TYPE (TYPE *x, TYPE val0, TYPE val1)       \
+  {                                                            \
+    UNROLL (COUNT)                                             \
+    for (int i = 0; i < COUNT * 2; i += 2)                     \
+      {                                                                \
+       x[i + 0] = val0;                                        \
+       x[i + 1] = val1;                                        \
+      }                                                                \
+  }
+
+#define CONS4_FN(COUNT, TYPE)                                  \
+  void                                                         \
+  cons4_##COUNT##_##TYPE (TYPE *x, TYPE val0, TYPE val1,       \
+                         TYPE val2, TYPE val3)                 \
+  {                                                            \
+    UNROLL (COUNT)                                             \
+    for (int i = 0; i < COUNT * 4; i += 4)                     \
+      {                                                                \
+       x[i + 0] = val0;                                        \
+       x[i + 1] = val1;                                        \
+       x[i + 2] = val2;                                        \
+       x[i + 3] = val3;                                        \
+      }                                                                \
+  }
+
diff --git a/gcc/testsuite/gcc.target/aarch64/ldp_stp_15.c b/gcc/testsuite/gcc.target/aarch64/ldp_stp_15.c

new file mode 100644 (file)

index 0000000..131cd0a
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/ldp_stp_15.c
@@ -0,0 +1,137 @@
+/* { dg-options "-O2 -fno-tree-loop-distribute-patterns" } */
+/* { dg-final { check-function-bodies "**" "" "" { target lp64 } } } */
+
+#include "ldp_stp_14.h"
+
+/*
+** const_2_int32_t_0:
+**     str     xzr, \[x0\]
+**     ret
+*/
+CONST_FN (2, int32_t, 0);
+
+/*
+** const_4_int32_t_0:
+**     stp     xzr, xzr, \[x0\]
+**     ret
+*/
+CONST_FN (4, int32_t, 0);
+
+/* No preference between vectorizing or not vectorizing here.  */
+CONST_FN (8, int32_t, 0);
+
+/*
+** const_16_int32_t_0:
+**     movi    v([0-9]+)\.4s, .*
+**     stp     q\1, q\1, \[x0\]
+**     stp     q\1, q\1, \[x0, #?32\]
+**     ret
+*/
+CONST_FN (16, int32_t, 0);
+
+/* No preference between vectorizing or not vectorizing here.  */
+CONST_FN (2, int32_t, 1);
+
+/*
+** const_4_int32_t_1:
+**     movi    v([0-9]+)\.4s, .*
+**     str     q\1, \[x0\]
+**     ret
+*/
+CONST_FN (4, int32_t, 1);
+
+/*
+** const_8_int32_t_1:
+**     movi    v([0-9]+)\.4s, .*
+**     stp     q\1, q\1, \[x0\]
+**     ret
+*/
+CONST_FN (8, int32_t, 1);
+
+/*
+** dup_2_int32_t:
+**     stp     w1, w1, \[x0\]
+**     ret
+*/
+DUP_FN (2, int32_t);
+
+/*
+** dup_4_int32_t:
+**     stp     w1, w1, \[x0\]
+**     stp     w1, w1, \[x0, #?8\]
+**     ret
+*/
+DUP_FN (4, int32_t);
+
+/*
+** dup_8_int32_t:
+**     dup     v([0-9]+)\.4s, w1
+**     stp     q\1, q\1, \[x0\]
+**     ret
+*/
+DUP_FN (8, int32_t);
+
+/*
+** cons2_1_int32_t:
+**     stp     w1, w2, \[x0\]
+**     ret
+*/
+CONS2_FN (1, int32_t);
+
+/*
+** cons2_2_int32_t:
+**     stp     w1, w2, \[x0\]
+**     stp     w1, w2, \[x0, #?8\]
+**     ret
+*/
+CONS2_FN (2, int32_t);
+
+/*
+** cons2_4_int32_t:
+**     stp     w1, w2, \[x0\]
+**     stp     w1, w2, \[x0, #?8\]
+**     stp     w1, w2, \[x0, #?16\]
+**     stp     w1, w2, \[x0, #?24\]
+**     ret
+*/
+CONS2_FN (4, int32_t);
+
+/* No preference between vectorizing or not vectorizing here.  */
+CONS2_FN (8, int32_t);
+
+/*
+** cons2_16_int32_t:
+**     ...
+**     stp     q[0-9]+, .*
+**     ret
+*/
+CONS2_FN (16, int32_t);
+
+/*
+** cons4_1_int32_t:
+**     stp     w1, w2, \[x0\]
+**     stp     w3, w4, \[x0, #?8\]
+**     ret
+*/
+CONS4_FN (1, int32_t);
+
+/*
+** cons4_2_int32_t:
+**     stp     w1, w2, \[x0\]
+**     stp     w3, w4, \[x0, #?8\]
+**     stp     w1, w2, \[x0, #?16\]
+**     stp     w3, w4, \[x0, #?24\]
+**     ret
+*/
+CONS4_FN (2, int32_t);
+
+/* No preference between vectorizing or not vectorizing here.  */
+CONS4_FN (4, int32_t);
+
+/*
+** cons4_8_int32_t:
+**     ...
+**     stp     q[0-9]+, .*
+**     ret
+*/
+CONS4_FN (8, int32_t);
diff --git a/gcc/testsuite/gcc.target/aarch64/ldp_stp_16.c b/gcc/testsuite/gcc.target/aarch64/ldp_stp_16.c

new file mode 100644 (file)

index 0000000..8ab117c
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/ldp_stp_16.c
@@ -0,0 +1,133 @@
+/* { dg-options "-O2 -fno-tree-loop-distribute-patterns" } */
+/* { dg-final { check-function-bodies "**" "" "" { target lp64 } } } */
+
+#include "ldp_stp_14.h"
+
+/*
+** const_2_float_0:
+**     str     xzr, \[x0\]
+**     ret
+*/
+CONST_FN (2, float, 0);
+
+/*
+** const_4_float_0:
+**     stp     xzr, xzr, \[x0\]
+**     ret
+*/
+CONST_FN (4, float, 0);
+
+/* No preference between vectorizing or not vectorizing here.  */
+CONST_FN (8, float, 0);
+
+/*
+** const_16_float_0:
+**     movi    v([0-9]+)\.4s, .*
+**     stp     q\1, q\1, \[x0\]
+**     stp     q\1, q\1, \[x0, #?32\]
+**     ret
+*/
+CONST_FN (16, float, 0);
+
+/*
+** const_2_float_1:
+**     fmov    v([0-9]+)\.2s, .*
+**     str     d\1, \[x0\]
+**     ret
+*/
+CONST_FN (2, float, 1);
+
+/*
+** const_4_float_1:
+**     fmov    v([0-9]+)\.4s, .*
+**     str     q\1, \[x0\]
+**     ret
+*/
+CONST_FN (4, float, 1);
+
+/*
+** dup_2_float:
+**     stp     s0, s0, \[x0\]
+**     ret
+*/
+DUP_FN (2, float);
+
+/* No preference between vectorizing or not vectorizing here.  */
+DUP_FN (4, float);
+
+/*
+** dup_8_float:
+**     dup     v([0-9]+)\.4s, v0.s\[0\]
+**     stp     q\1, q\1, \[x0\]
+**     ret
+*/
+DUP_FN (8, float);
+
+/*
+** cons2_1_float:
+**     stp     s0, s1, \[x0\]
+**     ret
+*/
+CONS2_FN (1, float);
+
+/*
+** cons2_2_float:
+**     stp     s0, s1, \[x0\]
+**     stp     s0, s1, \[x0, #?8\]
+**     ret
+*/
+CONS2_FN (2, float);
+
+/*
+** cons2_4_float:      { target aarch64_little_endian }
+**     ins     v0.s\[1\], v1.s\[0\]
+**     stp     d0, d0, \[x0\]
+**     stp     d0, d0, \[x0, #?16\]
+**     ret
+*/
+/*
+** cons2_4_float:      { target aarch64_big_endian }
+**     ins     v1.s\[1\], v0.s\[0\]
+**     stp     d1, d1, \[x0\]
+**     stp     d1, d1, \[x0, #?16\]
+**     ret
+*/
+CONS2_FN (4, float);
+
+/*
+** cons2_8_float:
+**     dup     v([0-9]+)\.4s, .*
+**     ...
+**     stp     q\1, q\1, \[x0\]
+**     stp     q\1, q\1, \[x0, #?32\]
+**     ret
+*/
+CONS2_FN (8, float);
+
+/*
+** cons4_1_float:
+**     stp     s0, s1, \[x0\]
+**     stp     s2, s3, \[x0, #?8\]
+**     ret
+*/
+CONS4_FN (1, float);
+
+/*
+** cons4_2_float:
+**     stp     s0, s1, \[x0\]
+**     stp     s2, s3, \[x0, #?8\]
+**     stp     s0, s1, \[x0, #?16\]
+**     stp     s2, s3, \[x0, #?24\]
+**     ret
+*/
+CONS4_FN (2, float);
+
+/*
+** cons4_4_float:
+**     ins     v([0-9]+)\.s.*
+**     ...
+**     stp     q\1, q\1, \[x0\]
+**     stp     q\1, q\1, \[x0, #?32\]
+**     ret
+*/
+CONS4_FN (4, float);
diff --git a/gcc/testsuite/gcc.target/aarch64/ldp_stp_17.c b/gcc/testsuite/gcc.target/aarch64/ldp_stp_17.c

new file mode 100644 (file)

index 0000000..c1122fc
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/ldp_stp_17.c
@@ -0,0 +1,120 @@
+/* { dg-options "-O2 -fno-tree-loop-distribute-patterns" } */
+/* { dg-final { check-function-bodies "**" "" "" { target lp64 } } } */
+
+#include "ldp_stp_14.h"
+
+/*
+** const_2_int64_t_0:
+**     stp     xzr, xzr, \[x0\]
+**     ret
+*/
+CONST_FN (2, int64_t, 0);
+
+/* No preference between vectorizing or not vectorizing here.  */
+CONST_FN (4, int64_t, 0);
+
+/*
+** const_8_int64_t_0:
+**     movi    v([0-9]+)\.4s, .*
+**     stp     q\1, q\1, \[x0\]
+**     stp     q\1, q\1, \[x0, #?32\]
+**     ret
+*/
+CONST_FN (8, int64_t, 0);
+
+/*
+** dup_2_int64_t:
+**     stp     x1, x1, \[x0\]
+**     ret
+*/
+DUP_FN (2, int64_t);
+
+/*
+** dup_4_int64_t:
+**     stp     x1, x1, \[x0\]
+**     stp     x1, x1, \[x0, #?16\]
+**     ret
+*/
+DUP_FN (4, int64_t);
+
+/* No preference between vectorizing or not vectorizing here.  */
+DUP_FN (8, int64_t);
+
+/*
+** dup_16_int64_t:
+**     dup     v([0-9]+)\.2d, x1
+**     stp     q\1, q\1, \[x0\]
+**     stp     q\1, q\1, \[x0, #?32\]
+**     stp     q\1, q\1, \[x0, #?64\]
+**     stp     q\1, q\1, \[x0, #?96\]
+**     ret
+*/
+DUP_FN (16, int64_t);
+
+/*
+** cons2_1_int64_t:
+**     stp     x1, x2, \[x0\]
+**     ret
+*/
+CONS2_FN (1, int64_t);
+
+/*
+** cons2_2_int64_t:
+**     stp     x1, x2, \[x0\]
+**     stp     x1, x2, \[x0, #?16\]
+**     ret
+*/
+CONS2_FN (2, int64_t);
+
+/*
+** cons2_4_int64_t:
+**     stp     x1, x2, \[x0\]
+**     stp     x1, x2, \[x0, #?16\]
+**     stp     x1, x2, \[x0, #?32\]
+**     stp     x1, x2, \[x0, #?48\]
+**     ret
+*/
+CONS2_FN (4, int64_t);
+
+/* No preference between vectorizing or not vectorizing here.  */
+CONS2_FN (8, int64_t);
+
+/*
+** cons2_16_int64_t:
+**     ...
+**     stp     q[0-9]+, .*
+**     ret
+*/
+CONS2_FN (16, int64_t);
+
+/*
+** cons4_1_int64_t:
+**     stp     x1, x2, \[x0\]
+**     stp     x3, x4, \[x0, #?16\]
+**     ret
+*/
+CONS4_FN (1, int64_t);
+
+/*
+** cons4_2_int64_t:
+**     stp     x1, x2, \[x0\]
+**     stp     x3, x4, \[x0, #?16\]
+**     stp     x1, x2, \[x0, #?32\]
+**     stp     x3, x4, \[x0, #?48\]
+**     ret
+*/
+CONS4_FN (2, int64_t);
+
+/* No preference between vectorizing or not vectorizing here.  */
+CONS4_FN (4, int64_t);
+
+/* We should probably vectorize this, but currently don't.  */
+CONS4_FN (8, int64_t);
+
+/*
+** cons4_16_int64_t:
+**     ...
+**     stp     q[0-9]+, .*
+**     ret
+*/
+CONS4_FN (16, int64_t);
diff --git a/gcc/testsuite/gcc.target/aarch64/ldp_stp_18.c b/gcc/testsuite/gcc.target/aarch64/ldp_stp_18.c

new file mode 100644 (file)

index 0000000..eaa855c
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/ldp_stp_18.c
@@ -0,0 +1,123 @@
+/* { dg-options "-O2 -fno-tree-loop-distribute-patterns" } */
+/* { dg-final { check-function-bodies "**" "" "" { target lp64 } } } */
+
+#include "ldp_stp_14.h"
+
+/*
+** const_2_double_0:
+**     stp     xzr, xzr, \[x0\]
+**     ret
+*/
+CONST_FN (2, double, 0);
+
+/* No preference between vectorizing or not vectorizing here.  */
+CONST_FN (4, double, 0);
+
+/*
+** const_8_double_0:
+**     movi    v([0-9]+)\.2d, .*
+**     stp     q\1, q\1, \[x0\]
+**     stp     q\1, q\1, \[x0, #?32\]
+**     ret
+*/
+CONST_FN (8, double, 0);
+
+/*
+** dup_2_double:
+**     stp     d0, d0, \[x0\]
+**     ret
+*/
+DUP_FN (2, double);
+
+/*
+** dup_4_double:
+**     stp     d0, d0, \[x0\]
+**     stp     d0, d0, \[x0, #?16\]
+**     ret
+*/
+DUP_FN (4, double);
+
+/*
+** dup_8_double:
+**     dup     v([0-9]+)\.2d, v0\.d\[0\]
+**     stp     q\1, q\1, \[x0\]
+**     stp     q\1, q\1, \[x0, #?32\]
+**     ret
+*/
+DUP_FN (8, double);
+
+/*
+** dup_16_double:
+**     dup     v([0-9]+)\.2d, v0\.d\[0\]
+**     stp     q\1, q\1, \[x0\]
+**     stp     q\1, q\1, \[x0, #?32\]
+**     stp     q\1, q\1, \[x0, #?64\]
+**     stp     q\1, q\1, \[x0, #?96\]
+**     ret
+*/
+DUP_FN (16, double);
+
+/*
+** cons2_1_double:
+**     stp     d0, d1, \[x0\]
+**     ret
+*/
+CONS2_FN (1, double);
+
+/*
+** cons2_2_double:
+**     stp     d0, d1, \[x0\]
+**     stp     d0, d1, \[x0, #?16\]
+**     ret
+*/
+CONS2_FN (2, double);
+
+/*
+** cons2_4_double:
+**     ...
+**     stp     q[0-9]+, .*
+**     ret
+*/
+CONS2_FN (4, double);
+
+/*
+** cons2_8_double:
+**     ...
+**     stp     q[0-9]+, .*
+**     ret
+*/
+CONS2_FN (8, double);
+
+/*
+** cons4_1_double:
+**     stp     d0, d1, \[x0\]
+**     stp     d2, d3, \[x0, #?16\]
+**     ret
+*/
+CONS4_FN (1, double);
+
+/*
+** cons4_2_double:
+**     stp     d0, d1, \[x0\]
+**     stp     d2, d3, \[x0, #?16\]
+**     stp     d0, d1, \[x0, #?32\]
+**     stp     d2, d3, \[x0, #?48\]
+**     ret
+*/
+CONS4_FN (2, double);
+
+/*
+** cons4_4_double:
+**     ...
+**     stp     q[0-9]+, .*
+**     ret
+*/
+CONS4_FN (4, double);
+
+/*
+** cons4_8_double:
+**     ...
+**     stp     q[0-9]+, .*
+**     ret
+*/
+CONS4_FN (8, double);
diff --git a/gcc/testsuite/gcc.target/aarch64/ldp_stp_19.c b/gcc/testsuite/gcc.target/aarch64/ldp_stp_19.c

new file mode 100644 (file)

index 0000000..9eb4163
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/ldp_stp_19.c
@@ -0,0 +1,6 @@
+/* { dg-options "-O2 -mstrict-align" } */
+
+#include "ldp_stp_5.c"
+
+/* { dg-final { scan-assembler-times {stp\tq[0-9]+, q[0-9]} 3 { xfail *-*-* } } } */
+/* { dg-final { scan-assembler-times {str\tq[0-9]+} 1 { xfail *-*-* } } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/ldp_stp_5.c b/gcc/testsuite/gcc.target/aarch64/ldp_stp_5.c

index 9426618..56d1d3c 100644 (file)
--- a/gcc/testsuite/gcc.target/aarch64/ldp_stp_5.c
+++ b/gcc/testsuite/gcc.target/aarch64/ldp_stp_5.c
@@ -1,4 +1,4 @@
-/* { dg-options "-O2" } */
+/* { dg-options "-O2 -mstrict-align" } */
  
  double arr[4][4];
  
diff --git a/gcc/tree-vect-slp.cc b/gcc/tree-vect-slp.cc

index 273543d..c6b5a06 100644 (file)
--- a/gcc/tree-vect-slp.cc
+++ b/gcc/tree-vect-slp.cc
@@ -4533,6 +4533,37 @@ vect_slp_convert_to_external (vec_info *vinfo, slp_tree node,
    return true;
  }
  
+/* Return true if all elements of the slice are the same.  */
+bool
+vect_scalar_ops_slice::all_same_p () const
+{
+  for (unsigned int i = 1; i < length; ++i)
+    if (!operand_equal_p (op (0), op (i)))
+      return false;
+  return true;
+}
+
+hashval_t
+vect_scalar_ops_slice_hash::hash (const value_type &s)
+{
+  hashval_t hash = 0;
+  for (unsigned i = 0; i < s.length; ++i)
+    hash = iterative_hash_expr (s.op (i), hash);
+  return hash;
+}
+
+bool
+vect_scalar_ops_slice_hash::equal (const value_type &s1,
+                                  const compare_type &s2)
+{
+  if (s1.length != s2.length)
+    return false;
+  for (unsigned i = 0; i < s1.length; ++i)
+    if (!operand_equal_p (s1.op (i), s2.op (i)))
+      return false;
+  return true;
+}
+
  /* Compute the prologue cost for invariant or constant operands represented
     by NODE.  */
  
@@ -4549,45 +4580,39 @@ vect_prologue_cost_for_slp (slp_tree node,
       When all elements are the same we can use a splat.  */
    tree vectype = SLP_TREE_VECTYPE (node);
    unsigned group_size = SLP_TREE_SCALAR_OPS (node).length ();
-  unsigned num_vects_to_check;
    unsigned HOST_WIDE_INT const_nunits;
    unsigned nelt_limit;
+  auto ops = &SLP_TREE_SCALAR_OPS (node);
+  auto_vec<unsigned int> starts (SLP_TREE_NUMBER_OF_VEC_STMTS (node));
    if (TYPE_VECTOR_SUBPARTS (vectype).is_constant (&const_nunits)
        && ! multiple_p (const_nunits, group_size))
      {
-      num_vects_to_check = SLP_TREE_NUMBER_OF_VEC_STMTS (node);
        nelt_limit = const_nunits;
+      hash_set<vect_scalar_ops_slice_hash> vector_ops;
+      for (unsigned int i = 0; i < SLP_TREE_NUMBER_OF_VEC_STMTS (node); ++i)
+       if (!vector_ops.add ({ ops, i * const_nunits, const_nunits }))
+         starts.quick_push (i * const_nunits);
      }
    else
      {
        /* If either the vector has variable length or the vectors
          are composed of repeated whole groups we only need to
          cost construction once.  All vectors will be the same.  */
-      num_vects_to_check = 1;
        nelt_limit = group_size;
+      starts.quick_push (0);
      }
-  tree elt = NULL_TREE;
-  unsigned nelt = 0;
-  for (unsigned j = 0; j < num_vects_to_check * nelt_limit; ++j)
-    {
-      unsigned si = j % group_size;
-      if (nelt == 0)
-       elt = SLP_TREE_SCALAR_OPS (node)[si];
-      /* ???  We're just tracking whether all operands of a single
-        vector initializer are the same, ideally we'd check if
-        we emitted the same one already.  */
-      else if (elt != SLP_TREE_SCALAR_OPS (node)[si])
-       elt = NULL_TREE;
-      nelt++;
-      if (nelt == nelt_limit)
-       {
-         record_stmt_cost (cost_vec, 1,
-                           SLP_TREE_DEF_TYPE (node) == vect_external_def
-                           ? (elt ? scalar_to_vec : vec_construct)
-                           : vector_load,
-                           NULL, vectype, 0, vect_prologue);
-         nelt = 0;
-       }
+  /* ???  We're just tracking whether vectors in a single node are the same.
+     Ideally we'd do something more global.  */
+  for (unsigned int start : starts)
+    {
+      vect_cost_for_stmt kind;
+      if (SLP_TREE_DEF_TYPE (node) == vect_constant_def)
+       kind = vector_load;
+      else if (vect_scalar_ops_slice { ops, start, nelt_limit }.all_same_p ())
+       kind = scalar_to_vec;
+      else
+       kind = vec_construct;
+      record_stmt_cost (cost_vec, 1, kind, NULL, vectype, 0, vect_prologue);
      }
  }
  
diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h

index ec479d3..ddd0637 100644 (file)
--- a/gcc/tree-vectorizer.h
+++ b/gcc/tree-vectorizer.h
@@ -113,6 +113,41 @@ typedef hash_map<tree_operand_hash,
                  std::pair<stmt_vec_info, innermost_loop_behavior *> >
           vec_base_alignments;
  
+/* Represents elements [START, START + LENGTH) of cyclical array *OPS
+   (i.e. *OPS repeated to give at least START + LENGTH elements).  */
+struct vect_scalar_ops_slice
+{
+  tree op (unsigned int i) const;
+  bool all_same_p () const;
+
+  vec<tree> *ops;
+  unsigned int start;
+  unsigned int length;
+};
+
+/* Return element I of the slice.  */
+inline tree
+vect_scalar_ops_slice::op (unsigned int i) const
+{
+  return (*ops)[(i + start) % ops->length ()];
+}
+
+/* Hash traits for vect_scalar_ops_slice.  */
+struct vect_scalar_ops_slice_hash : typed_noop_remove<vect_scalar_ops_slice>
+{
+  typedef vect_scalar_ops_slice value_type;
+  typedef vect_scalar_ops_slice compare_type;
+
+  static const bool empty_zero_p = true;
+
+  static void mark_deleted (value_type &s) { s.length = ~0U; }
+  static void mark_empty (value_type &s) { s.length = 0; }
+  static bool is_deleted (const value_type &s) { return s.length == ~0U; }
+  static bool is_empty (const value_type &s) { return s.length == 0; }
+  static hashval_t hash (const value_type &);
+  static bool equal (const value_type &, const compare_type &);
+};
+
  /************************************************************************
    SLP
   ************************************************************************/
author	Richard Sandiford <richard.sandiford@arm.com>
	Tue, 15 Feb 2022 18:09:33 +0000 (18:09 +0000)
committer	Richard Sandiford <richard.sandiford@arm.com>
	Tue, 15 Feb 2022 18:09:33 +0000 (18:09 +0000)
gcc/config/aarch64/aarch64.cc		patch \| blob \| history
gcc/testsuite/gcc.target/aarch64/ldp_stp_14.c	[new file with mode: 0644]	patch \| blob
gcc/testsuite/gcc.target/aarch64/ldp_stp_14.h	[new file with mode: 0644]	patch \| blob
gcc/testsuite/gcc.target/aarch64/ldp_stp_15.c	[new file with mode: 0644]	patch \| blob
gcc/testsuite/gcc.target/aarch64/ldp_stp_16.c	[new file with mode: 0644]	patch \| blob
gcc/testsuite/gcc.target/aarch64/ldp_stp_17.c	[new file with mode: 0644]	patch \| blob
gcc/testsuite/gcc.target/aarch64/ldp_stp_18.c	[new file with mode: 0644]	patch \| blob
gcc/testsuite/gcc.target/aarch64/ldp_stp_19.c	[new file with mode: 0644]	patch \| blob
gcc/testsuite/gcc.target/aarch64/ldp_stp_5.c		patch \| blob \| history
gcc/tree-vect-slp.cc		patch \| blob \| history
gcc/tree-vectorizer.h		patch \| blob \| history