From ab7e60cec1a6f4185b0428f3a2b3e71df0bae533 Mon Sep 17 00:00:00 2001 From: Richard Sandiford Date: Fri, 24 Aug 2018 13:05:36 +0000 Subject: [PATCH] Handle SLP permutations for variable-length vectors The SLP code currently punts for all variable-length permutes. This patch makes it handle the easy case of N->N permutes in which the number of vector lanes is a multiple of N. Every permute then uses the same mask, and that mask repeats (with a stride) every N elements. The patch uses the same path for constant-length vectors, since it should be slightly cheaper in terms of compile time. 2018-08-24 Richard Sandiford gcc/ * tree-vect-slp.c (vect_transform_slp_perm_load): Separate out the case in which the permute needs only a single element and repeats for every vector of the result. Extend that case to handle variable-length vectors. * tree-vect-stmts.c (vectorizable_load): Update accordingly. gcc/testsuite/ * gcc.target/aarch64/sve/slp_perm_1.c: New test. * gcc.target/aarch64/sve/slp_perm_2.c: Likewise. * gcc.target/aarch64/sve/slp_perm_3.c: Likewise. * gcc.target/aarch64/sve/slp_perm_4.c: Likewise. * gcc.target/aarch64/sve/slp_perm_5.c: Likewise. * gcc.target/aarch64/sve/slp_perm_6.c: Likewise. * gcc.target/aarch64/sve/slp_perm_7.c: Likewise. From-SVN: r263832 --- gcc/ChangeLog | 8 ++ gcc/testsuite/ChangeLog | 10 ++ gcc/testsuite/gcc.target/aarch64/sve/slp_perm_1.c | 22 ++++ gcc/testsuite/gcc.target/aarch64/sve/slp_perm_2.c | 22 ++++ gcc/testsuite/gcc.target/aarch64/sve/slp_perm_3.c | 22 ++++ gcc/testsuite/gcc.target/aarch64/sve/slp_perm_4.c | 22 ++++ gcc/testsuite/gcc.target/aarch64/sve/slp_perm_5.c | 32 +++++ gcc/testsuite/gcc.target/aarch64/sve/slp_perm_6.c | 22 ++++ gcc/testsuite/gcc.target/aarch64/sve/slp_perm_7.c | 22 ++++ gcc/tree-vect-slp.c | 150 +++++++++++++--------- gcc/tree-vect-stmts.c | 17 ++- 11 files changed, 281 insertions(+), 68 deletions(-) create mode 100644 gcc/testsuite/gcc.target/aarch64/sve/slp_perm_1.c create mode 100644 gcc/testsuite/gcc.target/aarch64/sve/slp_perm_2.c create mode 100644 gcc/testsuite/gcc.target/aarch64/sve/slp_perm_3.c create mode 100644 gcc/testsuite/gcc.target/aarch64/sve/slp_perm_4.c create mode 100644 gcc/testsuite/gcc.target/aarch64/sve/slp_perm_5.c create mode 100644 gcc/testsuite/gcc.target/aarch64/sve/slp_perm_6.c create mode 100644 gcc/testsuite/gcc.target/aarch64/sve/slp_perm_7.c diff --git a/gcc/ChangeLog b/gcc/ChangeLog index 72f4575..ac459b6 100644 --- a/gcc/ChangeLog +++ b/gcc/ChangeLog @@ -1,3 +1,11 @@ +2018-08-24 Richard Sandiford + + * tree-vect-slp.c (vect_transform_slp_perm_load): Separate out + the case in which the permute needs only a single element and + repeats for every vector of the result. Extend that case to + handle variable-length vectors. + * tree-vect-stmts.c (vectorizable_load): Update accordingly. + 2018-08-24 H.J. Lu PR debug/79342 diff --git a/gcc/testsuite/ChangeLog b/gcc/testsuite/ChangeLog index 813676b..db05964 100644 --- a/gcc/testsuite/ChangeLog +++ b/gcc/testsuite/ChangeLog @@ -1,3 +1,13 @@ +2018-08-24 Richard Sandiford + + * gcc.target/aarch64/sve/slp_perm_1.c: New test. + * gcc.target/aarch64/sve/slp_perm_2.c: Likewise. + * gcc.target/aarch64/sve/slp_perm_3.c: Likewise. + * gcc.target/aarch64/sve/slp_perm_4.c: Likewise. + * gcc.target/aarch64/sve/slp_perm_5.c: Likewise. + * gcc.target/aarch64/sve/slp_perm_6.c: Likewise. + * gcc.target/aarch64/sve/slp_perm_7.c: Likewise. + 2018-08-24 H.J. Lu PR debug/79342 diff --git a/gcc/testsuite/gcc.target/aarch64/sve/slp_perm_1.c b/gcc/testsuite/gcc.target/aarch64/sve/slp_perm_1.c new file mode 100644 index 0000000..0d4892e --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/slp_perm_1.c @@ -0,0 +1,22 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -ftree-vectorize" } */ + +#include + +void +f (uint8_t *restrict a, uint8_t *restrict b) +{ + for (int i = 0; i < 100; ++i) + { + a[i * 8] = b[i * 8 + 7] + 1; + a[i * 8 + 1] = b[i * 8 + 6] + 2; + a[i * 8 + 2] = b[i * 8 + 5] + 3; + a[i * 8 + 3] = b[i * 8 + 4] + 4; + a[i * 8 + 4] = b[i * 8 + 3] + 5; + a[i * 8 + 5] = b[i * 8 + 2] + 6; + a[i * 8 + 6] = b[i * 8 + 1] + 7; + a[i * 8 + 7] = b[i * 8 + 0] + 8; + } +} + +/* { dg-final { scan-assembler-times {\trevb\tz[0-9]+\.d, p[0-7]/m, z[0-9]+\.d\n} 1 } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/slp_perm_2.c b/gcc/testsuite/gcc.target/aarch64/sve/slp_perm_2.c new file mode 100644 index 0000000..86ace58 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/slp_perm_2.c @@ -0,0 +1,22 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -ftree-vectorize" } */ + +#include + +void +f (uint8_t *restrict a, uint8_t *restrict b) +{ + for (int i = 0; i < 100; ++i) + { + a[i * 8] = b[i * 8 + 3] + 1; + a[i * 8 + 1] = b[i * 8 + 2] + 2; + a[i * 8 + 2] = b[i * 8 + 1] + 3; + a[i * 8 + 3] = b[i * 8 + 0] + 4; + a[i * 8 + 4] = b[i * 8 + 7] + 5; + a[i * 8 + 5] = b[i * 8 + 6] + 6; + a[i * 8 + 6] = b[i * 8 + 5] + 7; + a[i * 8 + 7] = b[i * 8 + 4] + 8; + } +} + +/* { dg-final { scan-assembler-times {\trevb\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s\n} 1 } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/slp_perm_3.c b/gcc/testsuite/gcc.target/aarch64/sve/slp_perm_3.c new file mode 100644 index 0000000..d15215f --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/slp_perm_3.c @@ -0,0 +1,22 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -ftree-vectorize" } */ + +#include + +void +f (uint8_t *restrict a, uint8_t *restrict b) +{ + for (int i = 0; i < 100; ++i) + { + a[i * 8] = b[i * 8 + 1] + 1; + a[i * 8 + 1] = b[i * 8 + 0] + 2; + a[i * 8 + 2] = b[i * 8 + 3] + 3; + a[i * 8 + 3] = b[i * 8 + 2] + 4; + a[i * 8 + 4] = b[i * 8 + 5] + 5; + a[i * 8 + 5] = b[i * 8 + 4] + 6; + a[i * 8 + 6] = b[i * 8 + 7] + 7; + a[i * 8 + 7] = b[i * 8 + 6] + 8; + } +} + +/* { dg-final { scan-assembler-times {\trevb\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h\n} 1 } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/slp_perm_4.c b/gcc/testsuite/gcc.target/aarch64/sve/slp_perm_4.c new file mode 100644 index 0000000..dc5262a --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/slp_perm_4.c @@ -0,0 +1,22 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -ftree-vectorize" } */ + +#include + +void +f (uint8_t *restrict a, uint8_t *restrict b, uint8_t *restrict c) +{ + for (int i = 0; i < 100; ++i) + { + a[i * 8] = b[i * 8] + c[i * 8]; + a[i * 8 + 1] = b[i * 8] + c[i * 8 + 1]; + a[i * 8 + 2] = b[i * 8 + 2] + c[i * 8 + 2]; + a[i * 8 + 3] = b[i * 8 + 2] + c[i * 8 + 3]; + a[i * 8 + 4] = b[i * 8 + 4] + c[i * 8 + 4]; + a[i * 8 + 5] = b[i * 8 + 4] + c[i * 8 + 5]; + a[i * 8 + 6] = b[i * 8 + 6] + c[i * 8 + 6]; + a[i * 8 + 7] = b[i * 8 + 6] + c[i * 8 + 7]; + } +} + +/* { dg-final { scan-assembler {\ttrn1\tz[0-9]+\.b, z[0-9]+\.b, z[0-9]+\.b\n} } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/slp_perm_5.c b/gcc/testsuite/gcc.target/aarch64/sve/slp_perm_5.c new file mode 100644 index 0000000..d5a48af --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/slp_perm_5.c @@ -0,0 +1,32 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -ftree-vectorize" } */ + +#include + +void +f (uint8_t *restrict a, uint8_t *restrict b, + uint8_t *restrict c, uint8_t *restrict d) +{ + for (int i = 0; i < 100; ++i) + { + a[i * 8] = c[i * 8] + d[i * 8]; + a[i * 8 + 1] = c[i * 8] + d[i * 8 + 1]; + a[i * 8 + 2] = c[i * 8 + 2] + d[i * 8 + 2]; + a[i * 8 + 3] = c[i * 8 + 2] + d[i * 8 + 3]; + a[i * 8 + 4] = c[i * 8 + 4] + d[i * 8 + 4]; + a[i * 8 + 5] = c[i * 8 + 4] + d[i * 8 + 5]; + a[i * 8 + 6] = c[i * 8 + 6] + d[i * 8 + 6]; + a[i * 8 + 7] = c[i * 8 + 6] + d[i * 8 + 7]; + b[i * 8] = c[i * 8 + 1] + d[i * 8]; + b[i * 8 + 1] = c[i * 8 + 1] + d[i * 8 + 1]; + b[i * 8 + 2] = c[i * 8 + 3] + d[i * 8 + 2]; + b[i * 8 + 3] = c[i * 8 + 3] + d[i * 8 + 3]; + b[i * 8 + 4] = c[i * 8 + 5] + d[i * 8 + 4]; + b[i * 8 + 5] = c[i * 8 + 5] + d[i * 8 + 5]; + b[i * 8 + 6] = c[i * 8 + 7] + d[i * 8 + 6]; + b[i * 8 + 7] = c[i * 8 + 7] + d[i * 8 + 7]; + } +} + +/* { dg-final { scan-assembler {\ttrn1\tz[0-9]+\.b, z[0-9]+\.b, z[0-9]+\.b\n} } } */ +/* { dg-final { scan-assembler {\ttrn2\tz[0-9]+\.b, z[0-9]+\.b, z[0-9]+\.b\n} } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/slp_perm_6.c b/gcc/testsuite/gcc.target/aarch64/sve/slp_perm_6.c new file mode 100644 index 0000000..2882461 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/slp_perm_6.c @@ -0,0 +1,22 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -ftree-vectorize" } */ + +#include + +void +f (uint8_t *restrict a, uint8_t *restrict b) +{ + for (int i = 0; i < 100; ++i) + { + a[i * 8] = b[i * 8 + 3] + 1; + a[i * 8 + 1] = b[i * 8 + 6] + 1; + a[i * 8 + 2] = b[i * 8 + 0] + 1; + a[i * 8 + 3] = b[i * 8 + 2] + 1; + a[i * 8 + 4] = b[i * 8 + 1] + 1; + a[i * 8 + 5] = b[i * 8 + 7] + 1; + a[i * 8 + 6] = b[i * 8 + 5] + 1; + a[i * 8 + 7] = b[i * 8 + 4] + 1; + } +} + +/* { dg-final { scan-assembler-times {\ttbl\tz[0-9]+\.b, z[0-9]+\.b, z[0-9]+\.b\n} 1 } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/slp_perm_7.c b/gcc/testsuite/gcc.target/aarch64/sve/slp_perm_7.c new file mode 100644 index 0000000..da9e0a2 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/slp_perm_7.c @@ -0,0 +1,22 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -ftree-vectorize" } */ + +#include + +void +f (uint8_t *restrict a, uint8_t *restrict b) +{ + for (int i = 0; i < 100; ++i) + { + a[i * 8] = b[i * 8 + 1] + 1; + a[i * 8 + 1] = b[i * 8 + 7] + 2; + a[i * 8 + 2] = b[i * 8 + 1] + 3; + a[i * 8 + 3] = b[i * 8 + 7] + 4; + a[i * 8 + 4] = b[i * 8 + 1] + 5; + a[i * 8 + 5] = b[i * 8 + 7] + 6; + a[i * 8 + 6] = b[i * 8 + 1] + 7; + a[i * 8 + 7] = b[i * 8 + 7] + 8; + } +} + +/* { dg-final { scan-assembler {\ttbl\tz[0-9]+\.b, z[0-9]+\.b, z[0-9]+\.b\n} } } */ diff --git a/gcc/tree-vect-slp.c b/gcc/tree-vect-slp.c index 0a9ce24..0ab7bd8 100644 --- a/gcc/tree-vect-slp.c +++ b/gcc/tree-vect-slp.c @@ -3606,13 +3606,11 @@ vect_transform_slp_perm_load (slp_tree node, vec dr_chain, { stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0]; vec_info *vinfo = stmt_info->vinfo; - tree mask_element_type = NULL_TREE, mask_type; int vec_index = 0; tree vectype = STMT_VINFO_VECTYPE (stmt_info); - int group_size = SLP_INSTANCE_GROUP_SIZE (slp_node_instance); + unsigned int group_size = SLP_INSTANCE_GROUP_SIZE (slp_node_instance); unsigned int mask_element; machine_mode mode; - unsigned HOST_WIDE_INT nunits, const_vf; if (!STMT_VINFO_GROUPED_ACCESS (stmt_info)) return false; @@ -3620,22 +3618,7 @@ vect_transform_slp_perm_load (slp_tree node, vec dr_chain, stmt_info = DR_GROUP_FIRST_ELEMENT (stmt_info); mode = TYPE_MODE (vectype); - - /* At the moment, all permutations are represented using per-element - indices, so we can't cope with variable vector lengths or - vectorization factors. */ - if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant (&nunits) - || !vf.is_constant (&const_vf)) - return false; - - /* The generic VEC_PERM_EXPR code always uses an integral type of the - same size as the vector element being permuted. */ - mask_element_type = lang_hooks.types.type_for_mode - (int_mode_for_mode (TYPE_MODE (TREE_TYPE (vectype))).require (), 1); - mask_type = get_vectype_for_scalar_type (mask_element_type); - vec_perm_builder mask (nunits, nunits, 1); - mask.quick_grow (nunits); - vec_perm_indices indices; + poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype); /* Initialize the vect stmts of NODE to properly insert the generated stmts later. */ @@ -3669,14 +3652,53 @@ vect_transform_slp_perm_load (slp_tree node, vec dr_chain, bool noop_p = true; *n_perms = 0; - for (unsigned int j = 0; j < const_vf; j++) + vec_perm_builder mask; + unsigned int nelts_to_build; + unsigned int nvectors_per_build; + bool repeating_p = (group_size == DR_GROUP_SIZE (stmt_info) + && multiple_p (nunits, group_size)); + if (repeating_p) { - for (int k = 0; k < group_size; k++) + /* A single vector contains a whole number of copies of the node, so: + (a) all permutes can use the same mask; and + (b) the permutes only need a single vector input. */ + mask.new_vector (nunits, group_size, 3); + nelts_to_build = mask.encoded_nelts (); + nvectors_per_build = SLP_TREE_VEC_STMTS (node).length (); + } + else + { + /* We need to construct a separate mask for each vector statement. */ + unsigned HOST_WIDE_INT const_nunits, const_vf; + if (!nunits.is_constant (&const_nunits) + || !vf.is_constant (&const_vf)) + return false; + mask.new_vector (const_nunits, const_nunits, 1); + nelts_to_build = const_vf * group_size; + nvectors_per_build = 1; + } + + unsigned int count = mask.encoded_nelts (); + mask.quick_grow (count); + vec_perm_indices indices; + + for (unsigned int j = 0; j < nelts_to_build; j++) + { + unsigned int iter_num = j / group_size; + unsigned int stmt_num = j % group_size; + unsigned int i = (iter_num * DR_GROUP_SIZE (stmt_info) + + SLP_TREE_LOAD_PERMUTATION (node)[stmt_num]); + if (repeating_p) { - unsigned int i = (SLP_TREE_LOAD_PERMUTATION (node)[k] - + j * DR_GROUP_SIZE (stmt_info)); - vec_index = i / nunits; - mask_element = i % nunits; + first_vec_index = 0; + mask_element = i; + } + else + { + /* Enforced before the loop when !repeating_p. */ + unsigned int const_nunits = nunits.to_constant (); + vec_index = i / const_nunits; + mask_element = i % const_nunits; if (vec_index == first_vec_index || first_vec_index == -1) { @@ -3686,7 +3708,7 @@ vect_transform_slp_perm_load (slp_tree node, vec dr_chain, || second_vec_index == -1) { second_vec_index = vec_index; - mask_element += nunits; + mask_element += const_nunits; } else { @@ -3702,50 +3724,54 @@ vect_transform_slp_perm_load (slp_tree node, vec dr_chain, return false; } - gcc_assert (mask_element < 2 * nunits); - if (mask_element != index) - noop_p = false; - mask[index++] = mask_element; + gcc_assert (mask_element < 2 * const_nunits); + } + + if (mask_element != index) + noop_p = false; + mask[index++] = mask_element; - if (index == nunits && !noop_p) + if (index == count && !noop_p) + { + indices.new_vector (mask, second_vec_index == -1 ? 1 : 2, nunits); + if (!can_vec_perm_const_p (mode, indices)) { - indices.new_vector (mask, 2, nunits); - if (!can_vec_perm_const_p (mode, indices)) + if (dump_enabled_p ()) { - if (dump_enabled_p ()) + dump_printf_loc (MSG_MISSED_OPTIMIZATION, + vect_location, + "unsupported vect permute { "); + for (i = 0; i < count; ++i) { - dump_printf_loc (MSG_MISSED_OPTIMIZATION, - vect_location, - "unsupported vect permute { "); - for (i = 0; i < nunits; ++i) - { - dump_dec (MSG_MISSED_OPTIMIZATION, mask[i]); - dump_printf (MSG_MISSED_OPTIMIZATION, " "); - } - dump_printf (MSG_MISSED_OPTIMIZATION, "}\n"); + dump_dec (MSG_MISSED_OPTIMIZATION, mask[i]); + dump_printf (MSG_MISSED_OPTIMIZATION, " "); } - gcc_assert (analyze_only); - return false; + dump_printf (MSG_MISSED_OPTIMIZATION, "}\n"); } - - ++*n_perms; + gcc_assert (analyze_only); + return false; } - if (index == nunits) + ++*n_perms; + } + + if (index == count) + { + if (!analyze_only) { - if (!analyze_only) - { - tree mask_vec = NULL_TREE; + tree mask_vec = NULL_TREE; - if (! noop_p) - mask_vec = vec_perm_indices_to_tree (mask_type, indices); + if (! noop_p) + mask_vec = vect_gen_perm_mask_checked (vectype, indices); - if (second_vec_index == -1) - second_vec_index = first_vec_index; + if (second_vec_index == -1) + second_vec_index = first_vec_index; + for (unsigned int ri = 0; ri < nvectors_per_build; ++ri) + { /* Generate the permute statement if necessary. */ - tree first_vec = dr_chain[first_vec_index]; - tree second_vec = dr_chain[second_vec_index]; + tree first_vec = dr_chain[first_vec_index + ri]; + tree second_vec = dr_chain[second_vec_index + ri]; stmt_vec_info perm_stmt_info; if (! noop_p) { @@ -3771,12 +3797,12 @@ vect_transform_slp_perm_load (slp_tree node, vec dr_chain, SLP_TREE_VEC_STMTS (node)[vect_stmts_counter++] = perm_stmt_info; } - - index = 0; - first_vec_index = -1; - second_vec_index = -1; - noop_p = true; } + + index = 0; + first_vec_index = -1; + second_vec_index = -1; + noop_p = true; } } diff --git a/gcc/tree-vect-stmts.c b/gcc/tree-vect-stmts.c index 8fcb1e2..63fb1fb 100644 --- a/gcc/tree-vect-stmts.c +++ b/gcc/tree-vect-stmts.c @@ -8003,13 +8003,18 @@ vectorizable_load (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi, if (slp) { grouped_load = false; - /* For SLP permutation support we need to load the whole group, - not only the number of vector stmts the permutation result - fits in. */ - if (slp_perm) + /* If an SLP permutation is from N elements to N elements, + and if one vector holds a whole number of N, we can load + the inputs to the permutation in the same way as an + unpermuted sequence. In other cases we need to load the + whole group, not only the number of vector stmts the + permutation result fits in. */ + if (slp_perm + && (group_size != SLP_INSTANCE_GROUP_SIZE (slp_node_instance) + || !multiple_p (nunits, group_size))) { - /* We don't yet generate SLP_TREE_LOAD_PERMUTATIONs for - variable VF. */ + /* We don't yet generate such SLP_TREE_LOAD_PERMUTATIONs for + variable VF; see vect_transform_slp_perm_load. */ unsigned int const_vf = vf.to_constant (); unsigned int const_nunits = nunits.to_constant (); vec_num = CEIL (group_size * const_vf, const_nunits); -- 2.7.4