Not all vect_perm targets support that, and it's a bit too specific to have
its own effective-target selector, so we just test targets directly. */
-/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 4 "vect" { target { aarch64*-*-* arm*-*-* powerpc64*-*-* } } } } */
-/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 2 "vect" { target { vect_strided4 && { ! { aarch64*-*-* arm*-*-* powerpc64*-*-* } } } } } } */
+/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 4 "vect" { target powerpc64*-*-* } } } */
+/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 2 "vect" { target { vect_strided4 && { ! powerpc64*-*-* } } } } } */
/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 0 "vect" { target { ! { vect_strided4 } } } } } */
return 0;
}
-/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 2 "vect" { xfail { ! vect_hw_misalign } } } } */
+/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 0 "vect" { target vect_load_lanes } } } */
+/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 2 "vect" { target { ! vect_load_lanes } xfail { ! vect_hw_misalign } } } } */
--- /dev/null
+/* { dg-options "-O3" } */
+
+#pragma GCC target "+nosve"
+
+void
+f (int *restrict x, int *restrict y, int *restrict z, int n)
+{
+ for (int i = 0; i < n; i += 3) /* Each iteration handles one group of three consecutive elements.  */
+ {
+ x[i] = y[i] + z[i]; /* Every element of the group uses a different operation: +, - and |.  */
+ x[i + 1] = y[i + 1] - z[i + 1];
+ x[i + 2] = y[i + 2] | z[i + 2];
+ }
+}
+
+/* { dg-final { scan-assembler-times {\tld3\t} 2 } } */
+/* { dg-final { scan-assembler-times {\tst3\t} 1 } } */
--- /dev/null
+/* { dg-options "-O3" } */
+
+#include <stdint.h>
+
+#pragma GCC target "+nosve"
+
+void __attribute ((noipa))
+foo (uint64_t *__restrict x, uint64_t *__restrict y, int n)
+{
+ for (int i = 0; i < n; i += 4) /* Each iteration handles one group of four consecutive 64-bit elements.  */
+ {
+ x[i] += y[i]; /* The first two elements of the group are updated with +.  */
+ x[i + 1] += y[i + 1];
+ x[i + 2] |= y[i + 2]; /* The last two elements of the group are updated with |.  */
+ x[i + 3] |= y[i + 3];
+ }
+}
+
+/* { dg-final { scan-assembler-not {\tld4\t} } } */
+/* { dg-final { scan-assembler-not {\tst4\t} } } */
--- /dev/null
+/* { dg-options "-O3" } */
+
+#include <stdint.h>
+
+#pragma GCC target "+nosve"
+
+void __attribute ((noipa))
+foo (uint32_t *__restrict x, uint32_t *__restrict y, int n)
+{
+ for (int i = 0; i < n; i += 4) /* Each iteration handles one group of four consecutive 32-bit elements.  */
+ {
+ x[i] += y[i]; /* The first two elements of the group are updated with +.  */
+ x[i + 1] += y[i + 1];
+ x[i + 2] |= y[i + 2]; /* The last two elements of the group are updated with |.  */
+ x[i + 3] |= y[i + 3];
+ }
+}
+
+/* { dg-final { scan-assembler-times {\tld4\t} 2 } } */
+/* { dg-final { scan-assembler-times {\tst4\t} 1 } } */
--- /dev/null
+/* { dg-options "-O3" } */
+
+void
+f (int *restrict x, int *restrict y, int *restrict z, int n)
+{
+ for (int i = 0; i < n; i += 3) /* Each iteration handles one group of three consecutive elements.  */
+ {
+ x[i] = y[i] + z[i]; /* Every element of the group uses a different operation: +, - and |.  */
+ x[i + 1] = y[i + 1] - z[i + 1];
+ x[i + 2] = y[i + 2] | z[i + 2];
+ }
+}
+
+/* { dg-final { scan-assembler-times {\tld3w\t} 2 } } */
+/* { dg-final { scan-assembler-times {\tst3w\t} 1 } } */
--- /dev/null
+/* { dg-options "-O3" } */
+
+#include <stdint.h>
+
+void __attribute ((noipa))
+foo (uint64_t *__restrict x, uint64_t *__restrict y, int n)
+{
+ for (int i = 0; i < n; i += 4) /* Each iteration handles one group of four consecutive 64-bit elements.  */
+ {
+ x[i] += y[i]; /* The first two elements of the group are updated with +.  */
+ x[i + 1] += y[i + 1];
+ x[i + 2] |= y[i + 2]; /* The last two elements of the group are updated with |.  */
+ x[i + 3] |= y[i + 3];
+ }
+}
+
+/* { dg-final { scan-assembler-times {\tld4d\t} 2 } } */
+/* { dg-final { scan-assembler-times {\tst4d\t} 1 } } */
--- /dev/null
+/* { dg-options "-O3" } */
+
+#include <stdint.h>
+
+void __attribute ((noipa))
+foo (uint32_t *__restrict x, uint32_t *__restrict y, int n)
+{
+ for (int i = 0; i < n; i += 4) /* Each iteration handles one group of four consecutive 32-bit elements.  */
+ {
+ x[i] += y[i]; /* The first two elements of the group are updated with +.  */
+ x[i + 1] += y[i + 1];
+ x[i + 2] |= y[i + 2]; /* The last two elements of the group are updated with |.  */
+ x[i + 3] |= y[i + 3];
+ }
+}
+
+/* { dg-final { scan-assembler-times {\tld4w\t} 2 } } */
+/* { dg-final { scan-assembler-times {\tst4w\t} 1 } } */
return vect_match_slp_patterns_2 (ref_node, vinfo, perm_cache, visited);
}
+/* Decide whether a store group should be left whole, so that it can be
+   implemented with IFN_STORE_LANES, instead of being split in two.
+   STMT_INFO belongs to a store group of GROUP_SIZE elements and the
+   proposed split would give the first of the two new groups
+   NEW_GROUP_SIZE elements.  Return true if store-lanes is supported for
+   the whole group and looks like the better choice; return false if the
+   split should go ahead (including when no vector type is available
+   for the stored scalar type).  */
+
+static bool
+vect_slp_prefer_store_lanes_p (vec_info *vinfo, stmt_vec_info stmt_info,
+                               unsigned int group_size,
+                               unsigned int new_group_size)
+{
+  tree scalar_type = TREE_TYPE (DR_REF (STMT_VINFO_DATA_REF (stmt_info)));
+  tree vectype = get_vectype_for_scalar_type (vinfo, scalar_type);
+  if (!vectype)
+    return false;
+
+  /* Heuristic: do not stand in the way of the split when either of the
+     two resulting groups would fill whole vectors *within* a single
+     scalar loop iteration rather than across iterations.  For the
+     common group sizes of 3 and 4 the candidate splits are:
+
+       3->2+1: fine when the vector has exactly two elements
+       4->2+2: likewise
+       4->3+1: less clear-cut.  */
+  unsigned int rest_size = group_size - new_group_size;
+  bool split_fills_vector_p
+    = (multiple_p (rest_size, TYPE_VECTOR_SUBPARTS (vectype))
+       || multiple_p (new_group_size, TYPE_VECTOR_SUBPARTS (vectype)));
+
+  /* Only prefer store-lanes when the split has no such advantage and
+     the target can handle the whole group.  */
+  return (!split_fills_vector_p
+          && vect_store_lanes_supported (vectype, group_size, false));
+}
+
/* Analyze an SLP instance starting from a group of grouped stores. Call
vect_build_slp_tree to build a tree of packed stmts if possible.
Return FALSE if it's impossible to SLP any stmt in the loop. */
/* For loop vectorization split into arbitrary pieces of size > 1. */
if (is_a <loop_vec_info> (vinfo)
- && (i > 1 && i < group_size))
+ && (i > 1 && i < group_size)
+ && !vect_slp_prefer_store_lanes_p (vinfo, stmt_info, group_size, i))
{
unsigned group1_size = i;