From d037d9ad323ec9eef3e50c6e2cbc1e31191daa5d Mon Sep 17 00:00:00 2001
From: Richard Sandiford
Date: Wed, 6 Apr 2022 08:56:07 +0100
Subject: [PATCH] vect: Fix mask handling for SLP gathers [PR103761]

check_load_store_for_partial_vectors predates the support for SLP
gathers and so had a hard-coded assumption that gathers/scatters
(and load/store lanes) would be non-SLP operations.  This patch
passes down the slp_node so that the routine can work out how many
vectors are needed in both the SLP and non-SLP cases.

gcc/
	PR tree-optimization/103761
	* tree-vect-stmts.cc (check_load_store_for_partial_vectors):
	Replace the ncopies parameter with an slp_node parameter.
	Calculate the number of vectors based on it and vectype.
	Rename lambda to group_memory_nvectors.
	(vectorizable_store, vectorizable_load): Update calls accordingly.

gcc/testsuite/
	PR tree-optimization/103761
	* gcc.dg/vect/pr103761.c: New test.
	* gcc.target/aarch64/sve/pr103761.c: Likewise.
---
 gcc/testsuite/gcc.dg/vect/pr103761.c            | 13 +++++++++++++
 gcc/testsuite/gcc.target/aarch64/sve/pr103761.c | 13 +++++++++++++
 gcc/tree-vect-stmts.cc                          | 37 ++++++++++++++++---------
 3 files changed, 50 insertions(+), 13 deletions(-)
 create mode 100644 gcc/testsuite/gcc.dg/vect/pr103761.c
 create mode 100644 gcc/testsuite/gcc.target/aarch64/sve/pr103761.c

diff --git a/gcc/testsuite/gcc.dg/vect/pr103761.c b/gcc/testsuite/gcc.dg/vect/pr103761.c
new file mode 100644
index 0000000..0982a63
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/pr103761.c
@@ -0,0 +1,13 @@
+/* { dg-do compile } */
+
+void f(long *restrict x, int *restrict y, short *restrict z, int *restrict a)
+{
+  for (int i = 0; i < 100; i += 4)
+    {
+      x[i] = (long) y[z[i]] + 1;
+      x[i + 1] = (long) y[z[i + 1]] + 2;
+      x[i + 2] = (long) y[z[i + 2]] + 3;
+      x[i + 3] = (long) y[z[i + 3]] + 4;
+      a[i] += 1;
+    }
+}
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pr103761.c b/gcc/testsuite/gcc.target/aarch64/sve/pr103761.c
new file mode 100644
index 0000000..001b4d4
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/pr103761.c
@@ -0,0 +1,13 @@
+/* { dg-options "-O3" } */
+
+void f(long *restrict x, int *restrict y, short *restrict z, int *restrict a)
+{
+  for (int i = 0; i < 100; i += 4)
+    {
+      x[i] = (long) y[z[i]] + 1;
+      x[i + 1] = (long) y[z[i + 1]] + 2;
+      x[i + 2] = (long) y[z[i + 2]] + 3;
+      x[i + 3] = (long) y[z[i + 3]] + 4;
+      a[i] += 1;
+    }
+}
diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc
index f6fc7e1..c0107c8 100644
--- a/gcc/tree-vect-stmts.cc
+++ b/gcc/tree-vect-stmts.cc
@@ -1690,7 +1690,8 @@ static tree permute_vec_elements (vec_info *, tree, tree, tree, stmt_vec_info,
    as well as whether the target does.
 
    VLS_TYPE says whether the statement is a load or store and VECTYPE
-   is the type of the vector being loaded or stored.  MEMORY_ACCESS_TYPE
+   is the type of the vector being loaded or stored.  SLP_NODE is the SLP
+   node that contains the statement, or null if none.  MEMORY_ACCESS_TYPE
    says how the load or store is going to be implemented and GROUP_SIZE
    is the number of load or store statements in the containing group.
    If the access is a gather load or scatter store, GS_INFO describes
@@ -1703,11 +1704,11 @@ static tree permute_vec_elements (vec_info *, tree, tree, tree, stmt_vec_info,
 
 static void
 check_load_store_for_partial_vectors (loop_vec_info loop_vinfo, tree vectype,
+                                      slp_tree slp_node,
                                       vec_load_store_type vls_type,
                                       int group_size,
                                       vect_memory_access_type memory_access_type,
-                                      unsigned int ncopies,
                                       gather_scatter_info *gs_info,
                                       tree scalar_mask)
 {
   /* Invariant loads need no special support.  */
@@ -1715,6 +1716,12 @@ check_load_store_for_partial_vectors (loop_vec_info loop_vinfo, tree vectype,
   if (memory_access_type == VMAT_INVARIANT)
     return;
 
+  unsigned int nvectors;
+  if (slp_node)
+    nvectors = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
+  else
+    nvectors = vect_get_num_copies (loop_vinfo, vectype);
+
   vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
   machine_mode vecmode = TYPE_MODE (vectype);
   bool is_load = (vls_type == VLS_LOAD);
@@ -1732,7 +1739,8 @@ check_load_store_for_partial_vectors (loop_vec_info loop_vinfo, tree vectype,
           LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
           return;
         }
-      vect_record_loop_mask (loop_vinfo, masks, ncopies, vectype, scalar_mask);
+      vect_record_loop_mask (loop_vinfo, masks, nvectors, vectype,
+                             scalar_mask);
       return;
     }
 
@@ -1754,7 +1762,8 @@ check_load_store_for_partial_vectors (loop_vec_info loop_vinfo, tree vectype,
           LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
           return;
         }
-      vect_record_loop_mask (loop_vinfo, masks, ncopies, vectype, scalar_mask);
+      vect_record_loop_mask (loop_vinfo, masks, nvectors, vectype,
+                             scalar_mask);
       return;
     }
 
@@ -1784,7 +1793,7 @@ check_load_store_for_partial_vectors (loop_vec_info loop_vinfo, tree vectype,
   /* We might load more scalars than we need for permuting SLP loads.
      We checked in get_group_load_store_type that the extra elements
      don't leak into a new vector.  */
-  auto get_valid_nvectors = [] (poly_uint64 size, poly_uint64 nunits)
+  auto group_memory_nvectors = [](poly_uint64 size, poly_uint64 nunits)
   {
     unsigned int nvectors;
     if (can_div_away_from_zero_p (size, nunits, &nvectors))
@@ -1799,7 +1808,7 @@ check_load_store_for_partial_vectors (loop_vec_info loop_vinfo, tree vectype,
   if (targetm.vectorize.get_mask_mode (vecmode).exists (&mask_mode)
       && can_vec_mask_load_store_p (vecmode, mask_mode, is_load))
     {
-      unsigned int nvectors = get_valid_nvectors (group_size * vf, nunits);
+      nvectors = group_memory_nvectors (group_size * vf, nunits);
       vect_record_loop_mask (loop_vinfo, masks, nvectors, vectype, scalar_mask);
       using_partial_vectors_p = true;
     }
@@ -1807,7 +1816,7 @@ check_load_store_for_partial_vectors (loop_vec_info loop_vinfo, tree vectype,
   machine_mode vmode;
   if (get_len_load_store_mode (vecmode, is_load).exists (&vmode))
     {
-      unsigned int nvectors = get_valid_nvectors (group_size * vf, nunits);
+      nvectors = group_memory_nvectors (group_size * vf, nunits);
       vec_loop_lens *lens = &LOOP_VINFO_LENS (loop_vinfo);
       unsigned factor = (vecmode == vmode) ? 1 : GET_MODE_UNIT_SIZE (vecmode);
       vect_record_loop_len (loop_vinfo, lens, nvectors, vectype, factor);
@@ -7571,9 +7580,10 @@ vectorizable_store (vec_info *vinfo,
 
   if (loop_vinfo
       && LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
-    check_load_store_for_partial_vectors (loop_vinfo, vectype, vls_type,
-                                          group_size, memory_access_type,
-                                          ncopies, &gs_info, mask);
+    check_load_store_for_partial_vectors (loop_vinfo, vectype, slp_node,
+                                          vls_type, group_size,
+                                          memory_access_type, &gs_info,
+                                          mask);
 
   if (slp_node
       && !vect_maybe_update_slp_op_vectype (SLP_TREE_CHILDREN (slp_node)[0],
@@ -8921,9 +8931,10 @@ vectorizable_load (vec_info *vinfo,
 
   if (loop_vinfo
      && LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
-    check_load_store_for_partial_vectors (loop_vinfo, vectype, VLS_LOAD,
-                                          group_size, memory_access_type,
-                                          ncopies, &gs_info, mask);
+    check_load_store_for_partial_vectors (loop_vinfo, vectype, slp_node,
+                                          VLS_LOAD, group_size,
+                                          memory_access_type, &gs_info,
+                                          mask);
 
   if (dump_enabled_p ()
       && memory_access_type != VMAT_ELEMENTWISE
-- 
2.7.4
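
For readers who want to poke at the counting logic outside of GCC, the
sketch below models it with plain integers in place of poly_uint64 and
the SLP tree types, so it compiles standalone with any C++ compiler.
The names mirror the patch (group_memory_nvectors,
SLP_TREE_NUMBER_OF_VEC_STMTS, vect_get_num_copies), but the signatures
and the sample numbers are illustrative assumptions, not GCC internals.

#include <cstdio>

/* Stand-in for the patch's group_memory_nvectors lambda: the number of
   NUNITS-element vectors needed to cover SIZE scalars, rounding away
   from zero (can_div_away_from_zero_p in the real code, here only for
   constant sizes).  */
static unsigned int
group_memory_nvectors (unsigned int size, unsigned int nunits)
{
  return (size + nunits - 1) / nunits;
}

/* Simplified model of the fix: an SLP node already records how many
   vector statements it needs (SLP_TREE_NUMBER_OF_VEC_STMTS), while the
   non-SLP path derives the count from the vectorization factor
   (vect_get_num_copies).  The pre-patch code always used the non-SLP
   count (ncopies).  */
static unsigned int
nvectors_for_masks (bool slp_node, unsigned int slp_nvec_stmts,
                    unsigned int ncopies)
{
  return slp_node ? slp_nvec_stmts : ncopies;
}

int
main ()
{
  /* Hypothetical shape in the spirit of pr103761.c: a group of four
     accesses, vectorization factor 2, four elements per vector.  */
  unsigned int group_size = 4, vf = 2, nunits = 4;
  std::printf ("grouped masks: %u\n",
               group_memory_nvectors (group_size * vf, nunits));
  /* An SLP gather that needs two vector statements must record two
     masks, even if the scalar statement's ncopies would say one.  */
  std::printf ("gather masks: %u\n", nvectors_for_masks (true, 2, 1));
  return 0;
}

Both calls print 2 here: the grouped access needs ceil(4*2/4) = 2
masked vectors, and an SLP gather that emits two vector statements must
likewise record two masks.  Before the patch, the gather/scatter paths
recorded ncopies masks even when the statement was part of an SLP node,
which is the mismatch the patch removes.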