From: Richard Sandiford Date: Sat, 13 Jan 2018 17:59:32 +0000 (+0000) Subject: Handle peeling for alignment with masking X-Git-Tag: upstream/12.2.0~34122 X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=535e7c114a7ad2ad7a6a0def88cf9448fcd5f029;p=platform%2Fupstream%2Fgcc.git Handle peeling for alignment with masking This patch adds support for aligning vectors by using a partial first iteration. E.g. if the start pointer is 3 elements beyond an aligned address, the first iteration will have a mask in which the first three elements are false. On SVE, the optimisation is only useful for vector-length-specific code. Vector-length-agnostic code doesn't try to align vectors since the vector length might not be a power of 2. 2018-01-13 Richard Sandiford Alan Hayward David Sherwood gcc/ * tree-vectorizer.h (_loop_vec_info::mask_skip_niters): New field. (LOOP_VINFO_MASK_SKIP_NITERS): New macro. (vect_use_loop_mask_for_alignment_p): New function. (vect_prepare_for_masked_peels, vect_gen_while_not): Declare. * tree-vect-loop-manip.c (vect_set_loop_masks_directly): Add an niters_skip argument. Make sure that the first niters_skip elements of the first iteration are inactive. (vect_set_loop_condition_masked): Handle LOOP_VINFO_MASK_SKIP_NITERS. Update call to vect_set_loop_masks_directly. (get_misalign_in_elems): New function, split out from... (vect_gen_prolog_loop_niters): ...here. (vect_update_init_of_dr): Take a code argument that specifies whether the adjustment should be added or subtracted. (vect_update_init_of_drs): Likewise. (vect_prepare_for_masked_peels): New function. (vect_do_peeling): Skip prologue peeling if we're using a mask instead. Update call to vect_update_inits_of_drs. * tree-vect-loop.c (_loop_vec_info::_loop_vec_info): Initialize mask_skip_niters. (vect_analyze_loop_2): Allow fully-masked loops with peeling for alignment. Do not include the number of peeled iterations in the minimum threshold in that case. (vectorizable_induction): Adjust the start value down by LOOP_VINFO_MASK_SKIP_NITERS iterations. (vect_transform_loop): Call vect_prepare_for_masked_peels. Take the number of skipped iterations into account when calculating the loop bounds. * tree-vect-stmts.c (vect_gen_while_not): New function. gcc/testsuite/ * gcc.target/aarch64/sve/nopeel_1.c: New test. * gcc.target/aarch64/sve/peel_ind_1.c: Likewise. * gcc.target/aarch64/sve/peel_ind_1_run.c: Likewise. * gcc.target/aarch64/sve/peel_ind_2.c: Likewise. * gcc.target/aarch64/sve/peel_ind_2_run.c: Likewise. * gcc.target/aarch64/sve/peel_ind_3.c: Likewise. * gcc.target/aarch64/sve/peel_ind_3_run.c: Likewise. * gcc.target/aarch64/sve/peel_ind_4.c: Likewise. * gcc.target/aarch64/sve/peel_ind_4_run.c: Likewise. Co-Authored-By: Alan Hayward Co-Authored-By: David Sherwood From-SVN: r256630 --- diff --git a/gcc/ChangeLog b/gcc/ChangeLog index 01fa70b..c4cbe9d 100644 --- a/gcc/ChangeLog +++ b/gcc/ChangeLog @@ -2,6 +2,39 @@ Alan Hayward David Sherwood + * tree-vectorizer.h (_loop_vec_info::mask_skip_niters): New field. + (LOOP_VINFO_MASK_SKIP_NITERS): New macro. + (vect_use_loop_mask_for_alignment_p): New function. + (vect_prepare_for_masked_peels, vect_gen_while_not): Declare. + * tree-vect-loop-manip.c (vect_set_loop_masks_directly): Add an + niters_skip argument. Make sure that the first niters_skip elements + of the first iteration are inactive. + (vect_set_loop_condition_masked): Handle LOOP_VINFO_MASK_SKIP_NITERS. + Update call to vect_set_loop_masks_directly. + (get_misalign_in_elems): New function, split out from... 
+ (vect_gen_prolog_loop_niters): ...here. + (vect_update_init_of_dr): Take a code argument that specifies whether + the adjustment should be added or subtracted. + (vect_update_init_of_drs): Likewise. + (vect_prepare_for_masked_peels): New function. + (vect_do_peeling): Skip prologue peeling if we're using a mask + instead. Update call to vect_update_inits_of_drs. + * tree-vect-loop.c (_loop_vec_info::_loop_vec_info): Initialize + mask_skip_niters. + (vect_analyze_loop_2): Allow fully-masked loops with peeling for + alignment. Do not include the number of peeled iterations in + the minimum threshold in that case. + (vectorizable_induction): Adjust the start value down by + LOOP_VINFO_MASK_SKIP_NITERS iterations. + (vect_transform_loop): Call vect_prepare_for_masked_peels. + Take the number of skipped iterations into account when calculating + the loop bounds. + * tree-vect-stmts.c (vect_gen_while_not): New function. + +2018-01-13 Richard Sandiford + Alan Hayward + David Sherwood + * doc/sourcebuild.texi (vect_fully_masked): Document. * params.def (PARAM_MIN_VECT_LOOP_BOUND): Change minimum and default value to 0. diff --git a/gcc/testsuite/ChangeLog b/gcc/testsuite/ChangeLog index 3c83b5d..55e58c2 100644 --- a/gcc/testsuite/ChangeLog +++ b/gcc/testsuite/ChangeLog @@ -2,6 +2,20 @@ Alan Hayward David Sherwood + * gcc.target/aarch64/sve/nopeel_1.c: New test. + * gcc.target/aarch64/sve/peel_ind_1.c: Likewise. + * gcc.target/aarch64/sve/peel_ind_1_run.c: Likewise. + * gcc.target/aarch64/sve/peel_ind_2.c: Likewise. + * gcc.target/aarch64/sve/peel_ind_2_run.c: Likewise. + * gcc.target/aarch64/sve/peel_ind_3.c: Likewise. + * gcc.target/aarch64/sve/peel_ind_3_run.c: Likewise. + * gcc.target/aarch64/sve/peel_ind_4.c: Likewise. + * gcc.target/aarch64/sve/peel_ind_4_run.c: Likewise. + +2018-01-13 Richard Sandiford + Alan Hayward + David Sherwood + * lib/target-supports.exp (check_effective_target_vect_fully_masked): New proc. * gcc.dg/vect/slp-3.c: Expect all loops to be vectorized if diff --git a/gcc/testsuite/gcc.target/aarch64/sve/nopeel_1.c b/gcc/testsuite/gcc.target/aarch64/sve/nopeel_1.c new file mode 100644 index 0000000..d77c32c --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/nopeel_1.c @@ -0,0 +1,39 @@ +/* { dg-options "-O2 -ftree-vectorize -msve-vector-bits=256" } */ + +#include + +#define TEST(NAME, TYPE) \ + void \ + NAME##1 (TYPE *x, int n) \ + { \ + for (int i = 0; i < n; ++i) \ + x[i] += 1; \ + } \ + TYPE NAME##_array[1024]; \ + void \ + NAME##2 (void) \ + { \ + for (int i = 1; i < 200; ++i) \ + NAME##_array[i] += 1; \ + } + +TEST (s8, int8_t) +TEST (u8, uint8_t) +TEST (s16, int16_t) +TEST (u16, uint16_t) +TEST (s32, int32_t) +TEST (u32, uint32_t) +TEST (s64, int64_t) +TEST (u64, uint64_t) +TEST (f16, _Float16) +TEST (f32, float) +TEST (f64, double) + +/* No scalar memory accesses. */ +/* { dg-final { scan-assembler-not {[wx][0-9]*, \[} } } */ +/* 2 for each NAME##1 test, one in the header and one in the main loop + and 1 for each NAME##2 test, in the main loop only. 
*/ +/* { dg-final { scan-assembler-times {\twhilelo\tp[0-7]\.b,} 6 } } */ +/* { dg-final { scan-assembler-times {\twhilelo\tp[0-7]\.h,} 9 } } */ +/* { dg-final { scan-assembler-times {\twhilelo\tp[0-7]\.s,} 9 } } */ +/* { dg-final { scan-assembler-times {\twhilelo\tp[0-7]\.d,} 9 } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/peel_ind_1.c b/gcc/testsuite/gcc.target/aarch64/sve/peel_ind_1.c new file mode 100644 index 0000000..8640264 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/peel_ind_1.c @@ -0,0 +1,27 @@ +/* { dg-do compile } */ +/* Pick an arbitrary target for which unaligned accesses are more + expensive. */ +/* { dg-options "-O3 -msve-vector-bits=256 -mtune=thunderx" } */ + +#define N 512 +#define START 1 +#define END 505 + +int x[N] __attribute__((aligned(32))); + +void __attribute__((noinline, noclone)) +foo (void) +{ + unsigned int v = 0; + for (unsigned int i = START; i < END; ++i) + { + x[i] = v; + v += 5; + } +} + +/* We should operate on aligned vectors. */ +/* { dg-final { scan-assembler {\tadrp\tx[0-9]+, x\n} } } */ +/* We should use an induction that starts at -5, with only the last + 7 elements of the first iteration being active. */ +/* { dg-final { scan-assembler {\tindex\tz[0-9]+\.s, #-5, #5\n} } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/peel_ind_1_run.c b/gcc/testsuite/gcc.target/aarch64/sve/peel_ind_1_run.c new file mode 100644 index 0000000..3fa0e46 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/peel_ind_1_run.c @@ -0,0 +1,18 @@ +/* { dg-do run { target aarch64_sve_hw } } */ +/* { dg-options "-O3 -mtune=thunderx" } */ +/* { dg-options "-O3 -mtune=thunderx -msve-vector-bits=256" { target aarch64_sve256_hw } } */ + +#include "peel_ind_1.c" + +int __attribute__ ((optimize (1))) +main (void) +{ + foo (); + for (int i = 0; i < N; ++i) + { + if (x[i] != (i < START || i >= END ? 0 : (i - START) * 5)) + __builtin_abort (); + asm volatile ("" ::: "memory"); + } + return 0; +} diff --git a/gcc/testsuite/gcc.target/aarch64/sve/peel_ind_2.c b/gcc/testsuite/gcc.target/aarch64/sve/peel_ind_2.c new file mode 100644 index 0000000..2bfc09a --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/peel_ind_2.c @@ -0,0 +1,22 @@ +/* { dg-do compile } */ +/* Pick an arbitrary target for which unaligned accesses are more + expensive. */ +/* { dg-options "-O3 -msve-vector-bits=256 -mtune=thunderx" } */ + +#define N 512 +#define START 7 +#define END 22 + +int x[N] __attribute__((aligned(32))); + +void __attribute__((noinline, noclone)) +foo (void) +{ + for (unsigned int i = START; i < END; ++i) + x[i] = i; +} + +/* We should operate on aligned vectors. */ +/* { dg-final { scan-assembler {\tadrp\tx[0-9]+, x\n} } } */ +/* We should unroll the loop three times. */ +/* { dg-final { scan-assembler-times "\tst1w\t" 3 } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/peel_ind_2_run.c b/gcc/testsuite/gcc.target/aarch64/sve/peel_ind_2_run.c new file mode 100644 index 0000000..9c5ae1b --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/peel_ind_2_run.c @@ -0,0 +1,18 @@ +/* { dg-do run { target aarch64_sve_hw } } */ +/* { dg-options "-O3 -mtune=thunderx" } */ +/* { dg-options "-O3 -mtune=thunderx -msve-vector-bits=256" { target aarch64_sve256_hw } } */ + +#include "peel_ind_2.c" + +int __attribute__ ((optimize (1))) +main (void) +{ + foo (); + for (int i = 0; i < N; ++i) + { + if (x[i] != (i < START || i >= END ? 
0 : i)) + __builtin_abort (); + asm volatile ("" ::: "memory"); + } + return 0; +} diff --git a/gcc/testsuite/gcc.target/aarch64/sve/peel_ind_3.c b/gcc/testsuite/gcc.target/aarch64/sve/peel_ind_3.c new file mode 100644 index 0000000..8364dc6 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/peel_ind_3.c @@ -0,0 +1,21 @@ +/* { dg-do compile } */ +/* Pick an arbitrary target for which unaligned accesses are more + expensive. */ +/* { dg-options "-O3 -msve-vector-bits=256 -mtune=thunderx" } */ + +#define N 32 +#define MAX_START 8 +#define COUNT 16 + +int x[MAX_START][N] __attribute__((aligned(32))); + +void __attribute__((noinline, noclone)) +foo (int start) +{ + for (int i = start; i < start + COUNT; ++i) + x[start][i] = i; +} + +/* We should operate on aligned vectors. */ +/* { dg-final { scan-assembler {\tadrp\tx[0-9]+, x\n} } } */ +/* { dg-final { scan-assembler {\tubfx\t} } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/peel_ind_3_run.c b/gcc/testsuite/gcc.target/aarch64/sve/peel_ind_3_run.c new file mode 100644 index 0000000..384a38e --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/peel_ind_3_run.c @@ -0,0 +1,21 @@ +/* { dg-do run { target aarch64_sve_hw } } */ +/* { dg-options "-O3 -mtune=thunderx" } */ +/* { dg-options "-O3 -mtune=thunderx -msve-vector-bits=256" { target aarch64_sve256_hw } } */ + +#include "peel_ind_3.c" + +int __attribute__ ((optimize (1))) +main (void) +{ + for (int start = 0; start < MAX_START; ++start) + { + foo (start); + for (int i = 0; i < N; ++i) + { + if (x[start][i] != (i < start || i >= start + COUNT ? 0 : i)) + __builtin_abort (); + asm volatile ("" ::: "memory"); + } + } + return 0; +} diff --git a/gcc/testsuite/gcc.target/aarch64/sve/peel_ind_4.c b/gcc/testsuite/gcc.target/aarch64/sve/peel_ind_4.c new file mode 100644 index 0000000..5b5d88a --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/peel_ind_4.c @@ -0,0 +1,21 @@ +/* { dg-do compile } */ +/* Pick an arbitrary target for which unaligned accesses are more + expensive. */ +/* { dg-options "-Ofast -msve-vector-bits=256 -mtune=thunderx -fno-vect-cost-model" } */ + +#define START 1 +#define END 505 + +void __attribute__((noinline, noclone)) +foo (double *x) +{ + double v = 10.0; + for (unsigned int i = START; i < END; ++i) + { + x[i] = v; + v += 5.0; + } +} + +/* We should operate on aligned vectors. 
*/ +/* { dg-final { scan-assembler {\tubfx\t} } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/peel_ind_4_run.c b/gcc/testsuite/gcc.target/aarch64/sve/peel_ind_4_run.c new file mode 100644 index 0000000..7834a62 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/peel_ind_4_run.c @@ -0,0 +1,29 @@ +/* { dg-do run { target aarch64_sve_hw } } */ +/* { dg-options "-Ofast -mtune=thunderx" } */ +/* { dg-options "-Ofast -mtune=thunderx -mtune=thunderx" { target aarch64_sve256_hw } } */ + +#include "peel_ind_4.c" + +int __attribute__ ((optimize (1))) +main (void) +{ + double x[END + 1]; + for (int i = 0; i < END + 1; ++i) + { + x[i] = i; + asm volatile ("" ::: "memory"); + } + foo (x); + for (int i = 0; i < END + 1; ++i) + { + double expected; + if (i < START || i >= END) + expected = i; + else + expected = 10 + (i - START) * 5; + if (x[i] != expected) + __builtin_abort (); + asm volatile ("" ::: "memory"); + } + return 0; +} diff --git a/gcc/tree-vect-loop-manip.c b/gcc/tree-vect-loop-manip.c index 496df38..b9bb047 100644 --- a/gcc/tree-vect-loop-manip.c +++ b/gcc/tree-vect-loop-manip.c @@ -384,6 +384,11 @@ vect_maybe_permute_loop_masks (gimple_seq *seq, rgroup_masks *dest_rgm, times and has been vectorized according to LOOP_VINFO. Each iteration of the vectorized loop handles VF iterations of the scalar loop. + If NITERS_SKIP is nonnull, the first iteration of the vectorized loop + starts with NITERS_SKIP dummy iterations of the scalar loop before + the real work starts. The mask elements for these dummy iterations + must be 0, to ensure that the extra iterations do not have an effect. + It is known that: NITERS * RGM->max_nscalars_per_iter @@ -395,7 +400,7 @@ vect_maybe_permute_loop_masks (gimple_seq *seq, rgroup_masks *dest_rgm, might overflow before hitting a value above: - NITERS * RGM->max_nscalars_per_iter + (NITERS + NITERS_SKIP) * RGM->max_nscalars_per_iter This means that we cannot guarantee that such an induction variable would ever hit a value that produces a set of all-false masks for RGM. */ @@ -405,7 +410,8 @@ vect_set_loop_masks_directly (struct loop *loop, loop_vec_info loop_vinfo, gimple_seq *preheader_seq, gimple_stmt_iterator loop_cond_gsi, rgroup_masks *rgm, tree vf, - tree niters, bool might_wrap_p) + tree niters, tree niters_skip, + bool might_wrap_p) { tree compare_type = LOOP_VINFO_MASK_COMPARE_TYPE (loop_vinfo); tree mask_type = rgm->mask_type; @@ -413,10 +419,12 @@ vect_set_loop_masks_directly (struct loop *loop, loop_vec_info loop_vinfo, poly_uint64 nscalars_per_mask = TYPE_VECTOR_SUBPARTS (mask_type); /* Calculate the maximum number of scalar values that the rgroup - handles in total and the number that it handles for each iteration - of the vector loop. */ + handles in total, the number that it handles for each iteration + of the vector loop, and the number that it should skip during the + first iteration of the vector loop. 
*/ tree nscalars_total = niters; tree nscalars_step = vf; + tree nscalars_skip = niters_skip; if (nscalars_per_iter != 1) { /* We checked before choosing to use a fully-masked loop that these @@ -426,6 +434,9 @@ vect_set_loop_masks_directly (struct loop *loop, loop_vec_info loop_vinfo, nscalars_total, factor); nscalars_step = gimple_build (preheader_seq, MULT_EXPR, compare_type, nscalars_step, factor); + if (nscalars_skip) + nscalars_skip = gimple_build (preheader_seq, MULT_EXPR, compare_type, + nscalars_skip, factor); } /* Create an induction variable that counts the number of scalars @@ -438,29 +449,66 @@ vect_set_loop_masks_directly (struct loop *loop, loop_vec_info loop_vinfo, create_iv (zero_index, nscalars_step, NULL_TREE, loop, &incr_gsi, insert_after, &index_before_incr, &index_after_incr); - tree test_index, test_limit; + tree test_index, test_limit, first_limit; gimple_stmt_iterator *test_gsi; if (might_wrap_p) { /* In principle the loop should stop iterating once the incremented - IV reaches a value greater than or equal to NSCALAR_TOTAL. - However, there's no guarantee that the IV hits a value above - this value before wrapping around. We therefore adjust the - limit down by one IV step: + IV reaches a value greater than or equal to: + + NSCALARS_TOTAL +[infinite-prec] NSCALARS_SKIP + + However, there's no guarantee that this addition doesn't overflow + the comparison type, or that the IV hits a value above it before + wrapping around. We therefore adjust the limit down by one + IV step: - NSCALARS_TOTAL -[infinite-prec] NSCALARS_STEP + (NSCALARS_TOTAL +[infinite-prec] NSCALARS_SKIP) + -[infinite-prec] NSCALARS_STEP and compare the IV against this limit _before_ incrementing it. Since the comparison type is unsigned, we actually want the subtraction to saturate at zero: - NSCALARS_TOTAL -[sat] NSCALARS_STEP. */ + (NSCALARS_TOTAL +[infinite-prec] NSCALARS_SKIP) + -[sat] NSCALARS_STEP + + And since NSCALARS_SKIP < NSCALARS_STEP, we can reassociate this as: + + NSCALARS_TOTAL -[sat] (NSCALARS_STEP - NSCALARS_SKIP) + + where the rightmost subtraction can be done directly in + COMPARE_TYPE. */ test_index = index_before_incr; + tree adjust = nscalars_step; + if (nscalars_skip) + adjust = gimple_build (preheader_seq, MINUS_EXPR, compare_type, + adjust, nscalars_skip); test_limit = gimple_build (preheader_seq, MAX_EXPR, compare_type, - nscalars_total, nscalars_step); + nscalars_total, adjust); test_limit = gimple_build (preheader_seq, MINUS_EXPR, compare_type, - test_limit, nscalars_step); + test_limit, adjust); test_gsi = &incr_gsi; + + /* Get a safe limit for the first iteration. */ + if (nscalars_skip) + { + /* The first vector iteration can handle at most NSCALARS_STEP + scalars. NSCALARS_STEP <= CONST_LIMIT, and adding + NSCALARS_SKIP to that cannot overflow. */ + tree const_limit = build_int_cst (compare_type, + LOOP_VINFO_VECT_FACTOR (loop_vinfo) + * nscalars_per_iter); + first_limit = gimple_build (preheader_seq, MIN_EXPR, compare_type, + nscalars_total, const_limit); + first_limit = gimple_build (preheader_seq, PLUS_EXPR, compare_type, + first_limit, nscalars_skip); + } + else + /* For the first iteration it doesn't matter whether the IV hits + a value above NSCALARS_TOTAL. That only matters for the latch + condition. */ + first_limit = nscalars_total; } else { @@ -468,7 +516,12 @@ vect_set_loop_masks_directly (struct loop *loop, loop_vec_info loop_vinfo, the bound before wrapping. 
*/ test_index = index_after_incr; test_limit = nscalars_total; + if (nscalars_skip) + test_limit = gimple_build (preheader_seq, PLUS_EXPR, compare_type, + test_limit, nscalars_skip); test_gsi = &loop_cond_gsi; + + first_limit = test_limit; } /* Provide a definition of each mask in the group. */ @@ -487,7 +540,7 @@ vect_set_loop_masks_directly (struct loop *loop, loop_vec_info loop_vinfo, to have a full mask. */ poly_uint64 const_limit; bool first_iteration_full - = (poly_int_tree_p (nscalars_total, &const_limit) + = (poly_int_tree_p (first_limit, &const_limit) && known_ge (const_limit, (i + 1) * nscalars_per_mask)); /* Rather than have a new IV that starts at BIAS and goes up to @@ -504,12 +557,13 @@ vect_set_loop_masks_directly (struct loop *loop, loop_vec_info loop_vinfo, bias_tree); } - /* Create the initial mask. */ + /* Create the initial mask. First include all scalars that + are within the loop limit. */ tree init_mask = NULL_TREE; if (!first_iteration_full) { tree start, end; - if (nscalars_total == test_limit) + if (first_limit == test_limit) { /* Use a natural test between zero (the initial IV value) and the loop limit. The "else" block would be valid too, @@ -520,8 +574,11 @@ vect_set_loop_masks_directly (struct loop *loop, loop_vec_info loop_vinfo, } else { + /* FIRST_LIMIT is the maximum number of scalars handled by the + first iteration of the vector loop. Test the portion + associated with this mask. */ start = bias_tree; - end = nscalars_total; + end = first_limit; } init_mask = make_temp_ssa_name (mask_type, NULL, "max_mask"); @@ -529,6 +586,22 @@ vect_set_loop_masks_directly (struct loop *loop, loop_vec_info loop_vinfo, gimple_seq_add_stmt (preheader_seq, tmp_stmt); } + /* Now AND out the bits that are within the number of skipped + scalars. */ + poly_uint64 const_skip; + if (nscalars_skip + && !(poly_int_tree_p (nscalars_skip, &const_skip) + && known_le (const_skip, bias))) + { + tree unskipped_mask = vect_gen_while_not (preheader_seq, mask_type, + bias_tree, nscalars_skip); + if (init_mask) + init_mask = gimple_build (preheader_seq, BIT_AND_EXPR, mask_type, + init_mask, unskipped_mask); + else + init_mask = unskipped_mask; + } + if (!init_mask) /* First iteration is full. */ init_mask = build_minus_one_cst (mask_type); @@ -586,6 +659,9 @@ vect_set_loop_condition_masked (struct loop *loop, loop_vec_info loop_vinfo, else niters = gimple_convert (&preheader_seq, compare_type, niters); + /* Convert skip_niters to the right type. */ + tree niters_skip = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo); + /* Now calculate the value that the induction variable must be able to hit in order to ensure that we end the loop with an all-false mask. This involves adding the maximum number of inactive trailing scalar @@ -594,6 +670,15 @@ vect_set_loop_condition_masked (struct loop *loop, loop_vec_info loop_vinfo, bool known_max_iters = max_loop_iterations (loop, &iv_limit); if (known_max_iters) { + if (niters_skip) + { + /* Add the maximum number of skipped iterations to the + maximum iteration count. */ + if (TREE_CODE (niters_skip) == INTEGER_CST) + iv_limit += wi::to_widest (niters_skip); + else + iv_limit += max_vf - 1; + } /* IV_LIMIT is the maximum number of latch iterations, which is also the maximum in-range IV value. Round this value down to the previous vector alignment boundary and then add an extra full iteration. 
*/ @@ -639,7 +724,8 @@ vect_set_loop_condition_masked (struct loop *loop, loop_vec_info loop_vinfo, test_mask = vect_set_loop_masks_directly (loop, loop_vinfo, &preheader_seq, loop_cond_gsi, rgm, vf, - niters, might_wrap_p); + niters, niters_skip, + might_wrap_p); } /* Emit all accumulated statements. */ @@ -1463,6 +1549,46 @@ vect_update_ivs_after_vectorizer (loop_vec_info loop_vinfo, } } +/* Return a gimple value containing the misalignment (measured in vector + elements) for the loop described by LOOP_VINFO, i.e. how many elements + it is away from a perfectly aligned address. Add any new statements + to SEQ. */ + +static tree +get_misalign_in_elems (gimple **seq, loop_vec_info loop_vinfo) +{ + struct data_reference *dr = LOOP_VINFO_UNALIGNED_DR (loop_vinfo); + gimple *dr_stmt = DR_STMT (dr); + stmt_vec_info stmt_info = vinfo_for_stmt (dr_stmt); + tree vectype = STMT_VINFO_VECTYPE (stmt_info); + + unsigned int target_align = DR_TARGET_ALIGNMENT (dr); + gcc_assert (target_align != 0); + + bool negative = tree_int_cst_compare (DR_STEP (dr), size_zero_node) < 0; + tree offset = (negative + ? size_int (-TYPE_VECTOR_SUBPARTS (vectype) + 1) + : size_zero_node); + tree start_addr = vect_create_addr_base_for_vector_ref (dr_stmt, seq, + offset); + tree type = unsigned_type_for (TREE_TYPE (start_addr)); + tree target_align_minus_1 = build_int_cst (type, target_align - 1); + HOST_WIDE_INT elem_size + = int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype))); + tree elem_size_log = build_int_cst (type, exact_log2 (elem_size)); + + /* Create: misalign_in_bytes = addr & (target_align - 1). */ + tree int_start_addr = fold_convert (type, start_addr); + tree misalign_in_bytes = fold_build2 (BIT_AND_EXPR, type, int_start_addr, + target_align_minus_1); + + /* Create: misalign_in_elems = misalign_in_bytes / element_size. */ + tree misalign_in_elems = fold_build2 (RSHIFT_EXPR, type, misalign_in_bytes, + elem_size_log); + + return misalign_in_elems; +} + /* Function vect_gen_prolog_loop_niters Generate the number of iterations which should be peeled as prolog for the @@ -1474,7 +1600,7 @@ vect_update_ivs_after_vectorizer (loop_vec_info loop_vinfo, If the misalignment of DR is known at compile time: addr_mis = int mis = DR_MISALIGNMENT (dr); Else, compute address misalignment in bytes: - addr_mis = addr & (vectype_align - 1) + addr_mis = addr & (target_align - 1) prolog_niters = ((VF - addr_mis/elem_size)&(VF-1))/step @@ -1521,33 +1647,17 @@ vect_gen_prolog_loop_niters (loop_vec_info loop_vinfo, } else { - bool negative = tree_int_cst_compare (DR_STEP (dr), size_zero_node) < 0; - tree offset = negative - ? size_int (-TYPE_VECTOR_SUBPARTS (vectype) + 1) : size_zero_node; - tree start_addr = vect_create_addr_base_for_vector_ref (dr_stmt, - &stmts, offset); - tree type = unsigned_type_for (TREE_TYPE (start_addr)); - tree target_align_minus_1 = build_int_cst (type, target_align - 1); + tree misalign_in_elems = get_misalign_in_elems (&stmts, loop_vinfo); + tree type = TREE_TYPE (misalign_in_elems); HOST_WIDE_INT elem_size = int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype))); - tree elem_size_log = build_int_cst (type, exact_log2 (elem_size)); HOST_WIDE_INT align_in_elems = target_align / elem_size; tree align_in_elems_minus_1 = build_int_cst (type, align_in_elems - 1); tree align_in_elems_tree = build_int_cst (type, align_in_elems); - tree misalign_in_bytes; - tree misalign_in_elems; - - /* Create: misalign_in_bytes = addr & (target_align - 1). 
*/ - misalign_in_bytes - = fold_build2 (BIT_AND_EXPR, type, fold_convert (type, start_addr), - target_align_minus_1); - - /* Create: misalign_in_elems = misalign_in_bytes / element_size. */ - misalign_in_elems - = fold_build2 (RSHIFT_EXPR, type, misalign_in_bytes, elem_size_log); /* Create: (niters_type) ((align_in_elems - misalign_in_elems) & (align_in_elems - 1)). */ + bool negative = tree_int_cst_compare (DR_STEP (dr), size_zero_node) < 0; if (negative) iters = fold_build2 (MINUS_EXPR, type, misalign_in_elems, align_in_elems_tree); @@ -1587,20 +1697,22 @@ vect_gen_prolog_loop_niters (loop_vec_info loop_vinfo, /* Function vect_update_init_of_dr - NITERS iterations were peeled from LOOP. DR represents a data reference - in LOOP. This function updates the information recorded in DR to - account for the fact that the first NITERS iterations had already been - executed. Specifically, it updates the OFFSET field of DR. */ + If CODE is PLUS, the vector loop starts NITERS iterations after the + scalar one, otherwise CODE is MINUS and the vector loop starts NITERS + iterations before the scalar one (using masking to skip inactive + elements). This function updates the information recorded in DR to + account for the difference. Specifically, it updates the OFFSET + field of DR. */ static void -vect_update_init_of_dr (struct data_reference *dr, tree niters) +vect_update_init_of_dr (struct data_reference *dr, tree niters, tree_code code) { tree offset = DR_OFFSET (dr); niters = fold_build2 (MULT_EXPR, sizetype, fold_convert (sizetype, niters), fold_convert (sizetype, DR_STEP (dr))); - offset = fold_build2 (PLUS_EXPR, sizetype, + offset = fold_build2 (code, sizetype, fold_convert (sizetype, offset), niters); DR_OFFSET (dr) = offset; } @@ -1608,14 +1720,12 @@ vect_update_init_of_dr (struct data_reference *dr, tree niters) /* Function vect_update_inits_of_drs - NITERS iterations were peeled from the loop represented by LOOP_VINFO. - This function updates the information recorded for the data references in - the loop to account for the fact that the first NITERS iterations had - already been executed. Specifically, it updates the initial_condition of - the access_function of all the data_references in the loop. */ + Apply vect_update_inits_of_dr to all accesses in LOOP_VINFO. + CODE and NITERS are as for vect_update_inits_of_dr. */ static void -vect_update_inits_of_drs (loop_vec_info loop_vinfo, tree niters) +vect_update_inits_of_drs (loop_vec_info loop_vinfo, tree niters, + tree_code code) { unsigned int i; vec datarefs = LOOP_VINFO_DATAREFS (loop_vinfo); @@ -1642,9 +1752,57 @@ vect_update_inits_of_drs (loop_vec_info loop_vinfo, tree niters) } FOR_EACH_VEC_ELT (datarefs, i, dr) - vect_update_init_of_dr (dr, niters); + vect_update_init_of_dr (dr, niters, code); } +/* For the information recorded in LOOP_VINFO prepare the loop for peeling + by masking. This involves calculating the number of iterations to + be peeled and then aligning all memory references appropriately. */ + +void +vect_prepare_for_masked_peels (loop_vec_info loop_vinfo) +{ + tree misalign_in_elems; + tree type = LOOP_VINFO_MASK_COMPARE_TYPE (loop_vinfo); + + gcc_assert (vect_use_loop_mask_for_alignment_p (loop_vinfo)); + + /* From the information recorded in LOOP_VINFO get the number of iterations + that need to be skipped via masking. 
*/ + if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) > 0) + { + poly_int64 misalign = (LOOP_VINFO_VECT_FACTOR (loop_vinfo) + - LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)); + misalign_in_elems = build_int_cst (type, misalign); + } + else + { + gimple_seq seq1 = NULL, seq2 = NULL; + misalign_in_elems = get_misalign_in_elems (&seq1, loop_vinfo); + misalign_in_elems = fold_convert (type, misalign_in_elems); + misalign_in_elems = force_gimple_operand (misalign_in_elems, + &seq2, true, NULL_TREE); + gimple_seq_add_seq (&seq1, seq2); + if (seq1) + { + edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo)); + basic_block new_bb = gsi_insert_seq_on_edge_immediate (pe, seq1); + gcc_assert (!new_bb); + } + } + + if (dump_enabled_p ()) + { + dump_printf_loc (MSG_NOTE, vect_location, + "misalignment for fully-masked loop: "); + dump_generic_expr (MSG_NOTE, TDF_SLIM, misalign_in_elems); + dump_printf (MSG_NOTE, "\n"); + } + + LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo) = misalign_in_elems; + + vect_update_inits_of_drs (loop_vinfo, misalign_in_elems, MINUS_EXPR); +} /* This function builds ni_name = number of iterations. Statements are emitted on the loop preheader edge. If NEW_VAR_P is not NULL, set @@ -2250,7 +2408,9 @@ vect_do_peeling (loop_vec_info loop_vinfo, tree niters, tree nitersm1, int bound_prolog = 0; poly_uint64 bound_scalar = 0; int estimated_vf; - int prolog_peeling = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo); + int prolog_peeling = 0; + if (!vect_use_loop_mask_for_alignment_p (loop_vinfo)) + prolog_peeling = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo); bool epilog_peeling = (LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)); @@ -2367,7 +2527,7 @@ vect_do_peeling (loop_vec_info loop_vinfo, tree niters, tree nitersm1, scale_loop_profile (prolog, prob_prolog, bound_prolog); } /* Update init address of DRs. */ - vect_update_inits_of_drs (loop_vinfo, niters_prolog); + vect_update_inits_of_drs (loop_vinfo, niters_prolog, PLUS_EXPR); /* Update niters for vector loop. */ LOOP_VINFO_NITERS (loop_vinfo) = fold_build2 (MINUS_EXPR, type, niters, niters_prolog); diff --git a/gcc/tree-vect-loop.c b/gcc/tree-vect-loop.c index 1666332..d7cc12f 100644 --- a/gcc/tree-vect-loop.c +++ b/gcc/tree-vect-loop.c @@ -1121,6 +1121,7 @@ _loop_vec_info::_loop_vec_info (struct loop *loop_in) versioning_threshold (0), vectorization_factor (0), max_vectorization_factor (0), + mask_skip_niters (NULL_TREE), mask_compare_type (NULL_TREE), unaligned_dr (NULL), peeling_for_alignment (0), @@ -2269,16 +2270,6 @@ start_over: " gaps is required.\n"); } - if (LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) - && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)) - { - LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false; - if (dump_enabled_p ()) - dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, - "can't use a fully-masked loop because peeling for" - " alignment is required.\n"); - } - /* Decide whether to use a fully-masked loop for this vectorization factor. */ LOOP_VINFO_FULLY_MASKED_P (loop_vinfo) @@ -2379,18 +2370,21 @@ start_over: increase threshold for this case if necessary. */ if (LOOP_REQUIRES_VERSIONING (loop_vinfo)) { - poly_uint64 niters_th; + poly_uint64 niters_th = 0; - /* Niters for peeled prolog loop. 
*/ - if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0) + if (!vect_use_loop_mask_for_alignment_p (loop_vinfo)) { - struct data_reference *dr = LOOP_VINFO_UNALIGNED_DR (loop_vinfo); - tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (DR_STMT (dr))); - - niters_th = TYPE_VECTOR_SUBPARTS (vectype) - 1; + /* Niters for peeled prolog loop. */ + if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0) + { + struct data_reference *dr = LOOP_VINFO_UNALIGNED_DR (loop_vinfo); + tree vectype + = STMT_VINFO_VECTYPE (vinfo_for_stmt (DR_STMT (dr))); + niters_th += TYPE_VECTOR_SUBPARTS (vectype) - 1; + } + else + niters_th += LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo); } - else - niters_th = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo); /* Niters for at least one iteration of vectorized loop. */ if (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)) @@ -7336,9 +7330,28 @@ vectorizable_induction (gimple *phi, init_expr = PHI_ARG_DEF_FROM_EDGE (phi, loop_preheader_edge (iv_loop)); - /* Convert the step to the desired type. */ + /* Convert the initial value and step to the desired type. */ stmts = NULL; + init_expr = gimple_convert (&stmts, TREE_TYPE (vectype), init_expr); step_expr = gimple_convert (&stmts, TREE_TYPE (vectype), step_expr); + + /* If we are using the loop mask to "peel" for alignment then we need + to adjust the start value here. */ + tree skip_niters = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo); + if (skip_niters != NULL_TREE) + { + if (FLOAT_TYPE_P (vectype)) + skip_niters = gimple_build (&stmts, FLOAT_EXPR, TREE_TYPE (vectype), + skip_niters); + else + skip_niters = gimple_convert (&stmts, TREE_TYPE (vectype), + skip_niters); + tree skip_step = gimple_build (&stmts, MULT_EXPR, TREE_TYPE (vectype), + skip_niters, step_expr); + init_expr = gimple_build (&stmts, MINUS_EXPR, TREE_TYPE (vectype), + init_expr, skip_step); + } + if (stmts) { new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts); @@ -8209,6 +8222,11 @@ vect_transform_loop (loop_vec_info loop_vinfo) split_edge (loop_preheader_edge (loop)); + if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo) + && vect_use_loop_mask_for_alignment_p (loop_vinfo)) + /* This will deal with any possible peeling. */ + vect_prepare_for_masked_peels (loop_vinfo); + /* FORNOW: the vectorizer supports only loops which body consist of one basic block (header + empty latch). When the vectorizer will support more involved loop forms, the order by which the BBs are @@ -8488,29 +8506,40 @@ vect_transform_loop (loop_vec_info loop_vinfo) /* +1 to convert latch counts to loop iteration counts, -min_epilogue_iters to remove iterations that cannot be performed by the vector code. */ - int bias = 1 - min_epilogue_iters; + int bias_for_lowest = 1 - min_epilogue_iters; + int bias_for_assumed = bias_for_lowest; + int alignment_npeels = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo); + if (alignment_npeels && LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)) + { + /* When the amount of peeling is known at compile time, the first + iteration will have exactly alignment_npeels active elements. + In the worst case it will have at least one. */ + int min_first_active = (alignment_npeels > 0 ? alignment_npeels : 1); + bias_for_lowest += lowest_vf - min_first_active; + bias_for_assumed += assumed_vf - min_first_active; + } /* In these calculations the "- 1" converts loop iteration counts back to latch counts. */ if (loop->any_upper_bound) loop->nb_iterations_upper_bound = (final_iter_may_be_partial - ? wi::udiv_ceil (loop->nb_iterations_upper_bound + bias, + ? 
wi::udiv_ceil (loop->nb_iterations_upper_bound + bias_for_lowest, lowest_vf) - 1 - : wi::udiv_floor (loop->nb_iterations_upper_bound + bias, + : wi::udiv_floor (loop->nb_iterations_upper_bound + bias_for_lowest, lowest_vf) - 1); if (loop->any_likely_upper_bound) loop->nb_iterations_likely_upper_bound = (final_iter_may_be_partial - ? wi::udiv_ceil (loop->nb_iterations_likely_upper_bound + bias, - lowest_vf) - 1 - : wi::udiv_floor (loop->nb_iterations_likely_upper_bound + bias, - lowest_vf) - 1); + ? wi::udiv_ceil (loop->nb_iterations_likely_upper_bound + + bias_for_lowest, lowest_vf) - 1 + : wi::udiv_floor (loop->nb_iterations_likely_upper_bound + + bias_for_lowest, lowest_vf) - 1); if (loop->any_estimate) loop->nb_iterations_estimate = (final_iter_may_be_partial - ? wi::udiv_ceil (loop->nb_iterations_estimate + bias, + ? wi::udiv_ceil (loop->nb_iterations_estimate + bias_for_assumed, assumed_vf) - 1 - : wi::udiv_floor (loop->nb_iterations_estimate + bias, + : wi::udiv_floor (loop->nb_iterations_estimate + bias_for_assumed, assumed_vf) - 1); if (dump_enabled_p ()) diff --git a/gcc/tree-vect-stmts.c b/gcc/tree-vect-stmts.c index 1381b5f..c8850d4 100644 --- a/gcc/tree-vect-stmts.c +++ b/gcc/tree-vect-stmts.c @@ -9991,3 +9991,16 @@ vect_gen_while (tree mask, tree start_index, tree end_index) gimple_call_set_lhs (call, mask); return call; } + +/* Generate a vector mask of type MASK_TYPE for which index I is false iff + J + START_INDEX < END_INDEX for all J <= I. Add the statements to SEQ. */ + +tree +vect_gen_while_not (gimple_seq *seq, tree mask_type, tree start_index, + tree end_index) +{ + tree tmp = make_ssa_name (mask_type); + gcall *call = vect_gen_while (tmp, start_index, end_index); + gimple_seq_add_stmt (seq, call); + return gimple_build (seq, BIT_NOT_EXPR, mask_type, tmp); +} diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h index 580c21e..c01bd9b 100644 --- a/gcc/tree-vectorizer.h +++ b/gcc/tree-vectorizer.h @@ -351,6 +351,12 @@ typedef struct _loop_vec_info : public vec_info { on inactive scalars. */ vec_loop_masks masks; + /* If we are using a loop mask to align memory addresses, this variable + contains the number of vector elements that we should skip in the + first iteration of the vector loop (i.e. the number of leading + elements that should be false in the first mask). */ + tree mask_skip_niters; + /* Type of the variables to use in the WHILE_ULT call for fully-masked loops. */ tree mask_compare_type; @@ -480,6 +486,7 @@ typedef struct _loop_vec_info : public vec_info { #define LOOP_VINFO_VECT_FACTOR(L) (L)->vectorization_factor #define LOOP_VINFO_MAX_VECT_FACTOR(L) (L)->max_vectorization_factor #define LOOP_VINFO_MASKS(L) (L)->masks +#define LOOP_VINFO_MASK_SKIP_NITERS(L) (L)->mask_skip_niters #define LOOP_VINFO_MASK_COMPARE_TYPE(L) (L)->mask_compare_type #define LOOP_VINFO_PTR_MASK(L) (L)->ptr_mask #define LOOP_VINFO_LOOP_NEST(L) (L)->loop_nest @@ -1230,6 +1237,17 @@ unlimited_cost_model (loop_p loop) return (flag_vect_cost_model == VECT_COST_MODEL_UNLIMITED); } +/* Return true if the loop described by LOOP_VINFO is fully-masked and + if the first iteration should use a partial mask in order to achieve + alignment. */ + +static inline bool +vect_use_loop_mask_for_alignment_p (loop_vec_info loop_vinfo) +{ + return (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo) + && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)); +} + /* Return the number of vectors of type VECTYPE that are needed to get NUNITS elements. 
NUNITS should be based on the vectorization factor, so it is always a known multiple of the number of elements in VECTYPE. */ @@ -1328,6 +1346,7 @@ extern void vect_loop_versioning (loop_vec_info, unsigned int, bool, poly_uint64); extern struct loop *vect_do_peeling (loop_vec_info, tree, tree, tree *, tree *, tree *, int, bool, bool); +extern void vect_prepare_for_masked_peels (loop_vec_info); extern source_location find_loop_location (struct loop *); extern bool vect_can_advance_ivs_p (loop_vec_info); @@ -1393,6 +1412,7 @@ extern tree vect_gen_perm_mask_any (tree, const vec_perm_indices &); extern tree vect_gen_perm_mask_checked (tree, const vec_perm_indices &); extern void optimize_mask_stores (struct loop*); extern gcall *vect_gen_while (tree, tree, tree); +extern tree vect_gen_while_not (gimple_seq *, tree, tree, tree); /* In tree-vect-data-refs.c. */ extern bool vect_can_force_dr_alignment_p (const_tree, unsigned int);
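
The notes below are worked illustrations of the scheme above; none of the code in them is part of the patch. First, a minimal standalone C model of what get_misalign_in_elems and vect_prepare_for_masked_peels arrange to compute at run time: the number of leading lanes that the first vector iteration must leave inactive. The helper name, the 32-byte target alignment and the int32_t element type are assumptions chosen to match the cover note's example of a start pointer 3 elements past an aligned address.

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

/* Model of the quantities used by the patch.  TARGET_ALIGN stands in
   for DR_TARGET_ALIGNMENT and ELEM_SIZE for TYPE_SIZE_UNIT of the
   element type; both are assumed to be powers of 2, as the real code
   requires (it uses exact_log2 and a shift instead of the division).  */
static unsigned int
misalign_in_elems (const void *start_addr, unsigned int target_align,
		   unsigned int elem_size)
{
  uintptr_t addr = (uintptr_t) start_addr;
  /* Create: misalign_in_bytes = addr & (target_align - 1).  */
  unsigned int misalign_in_bytes = addr & (target_align - 1);
  /* Create: misalign_in_elems = misalign_in_bytes / elem_size.  */
  return misalign_in_bytes / elem_size;
}

int
main (void)
{
  /* The example from the cover note: the start pointer is 3 elements
     beyond an aligned address, so the first iteration's mask must have
     its first three elements false.  */
  int32_t base[16] __attribute__ ((aligned (32)));
  unsigned int skip = misalign_in_elems (&base[3], 32, sizeof (int32_t));
  printf ("leading inactive lanes: %u\n", skip);	/* prints 3 */
  assert (skip == 3);
  return 0;
}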
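
Next, a scalar model of how vect_set_loop_masks_directly forms the mask for the first vector iteration when NITERS_SKIP is nonnull: a WHILE_ULT-style mask selects the lanes that fall within the loop limit, the complemented mask produced by the new vect_gen_while_not clears the lanes that correspond to the skipped scalars, and the two are combined with BIT_AND_EXPR.  The sketch assumes a single 8-lane mask per rgroup with a zero bias; the helper names and lane count are invented for illustration, and the real code emits gimple that the target expands to WHILELO etc.

#include <stdbool.h>
#include <stdio.h>

#define NLANES 8	/* stands in for TYPE_VECTOR_SUBPARTS (mask_type) */

/* Model of vect_gen_while: lane I is true iff START + I < END.  */
static void
while_ult (bool mask[NLANES], unsigned int start, unsigned int end)
{
  for (unsigned int i = 0; i < NLANES; ++i)
    mask[i] = start + i < end;
}

/* Model of the initial mask for the first vector iteration: lanes below
   NSCALARS_SKIP are forced to false, lanes at or above FIRST_LIMIT are
   false, and everything in between is true.  */
static void
first_iteration_mask (bool mask[NLANES], unsigned int nscalars_skip,
		      unsigned int first_limit)
{
  bool in_range[NLANES], skipped[NLANES];
  while_ult (in_range, 0, first_limit);		/* vect_gen_while */
  while_ult (skipped, 0, nscalars_skip);	/* inverted by vect_gen_while_not */
  for (unsigned int i = 0; i < NLANES; ++i)
    mask[i] = in_range[i] && !skipped[i];	/* BIT_AND_EXPR of BIT_NOT_EXPR */
}

int
main (void)
{
  bool mask[NLANES];
  /* Three leading lanes inactive, plenty of scalar iterations left.  */
  first_iteration_mask (mask, 3, NLANES);
  for (unsigned int i = 0; i < NLANES; ++i)
    printf ("%d", mask[i]);
  printf ("\n");	/* prints 00011111 */
  return 0;
}

As in the patch, the AND can be skipped entirely when the skip count is known at compile time to be no greater than the mask's bias, since those lanes are already outside the portion this mask covers.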
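
The latch-test limit in the might_wrap_p case is computed with an unsigned saturating subtraction, reassociated so that NSCALARS_SKIP never has to be added to NSCALARS_TOTAL in COMPARE_TYPE.  A small check of that reassociation, with made-up values (the real code emits MAX_EXPR/MINUS_EXPR gimple rather than calling a helper):

#include <assert.h>

/* Saturating subtraction in the unsigned comparison type, i.e. the
   "-[sat]" in the comments above: MAX_EXPR followed by MINUS_EXPR.  */
static unsigned int
sub_sat (unsigned int a, unsigned int b)
{
  return (a > b ? a : b) - b;	/* max (a, b) - b */
}

int
main (void)
{
  /* Hypothetical values: 8 scalars per step, 3 skipped, 10 in total.  */
  unsigned int total = 10, step = 8, skip = 3;
  assert (skip < step);
  /* (NSCALARS_TOTAL + NSCALARS_SKIP) -[sat] NSCALARS_STEP, computed as
     NSCALARS_TOTAL -[sat] (NSCALARS_STEP - NSCALARS_SKIP) so that the
     addition never has to be performed in COMPARE_TYPE.  */
  unsigned int limit = sub_sat (total, step - skip);
  assert (limit == (total + skip) - step);	/* both are 5 here */
  return 0;
}

The inner subtraction is safe because the skipped scalars are a strict subset of one vector iteration's worth of scalars, so NSCALARS_SKIP < NSCALARS_STEP always holds.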
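
The start-value adjustment in vectorizable_induction can be cross-checked against peel_ind_1.c: the scalar induction starts at 0 with step 5, and one leading lane of the first vector iteration is inactive, so the vector induction must start at 0 - 1 * 5 = -5, matching the test's scan for \tindex\tz[0-9]+\.s, #-5, #5.  A sketch with those numbers; the vectorization factor and skip count are assumptions based on -msve-vector-bits=256 and 32-bit elements:

#include <assert.h>
#include <stdio.h>

#define VF 8	/* 256-bit SVE vectors of 32-bit elements */

int
main (void)
{
  /* peel_ind_1.c: x[] is 32-byte aligned and the loop starts at index 1,
     so the first vector iteration has one leading inactive lane.  */
  int init = 0, step = 5, skip_niters = 1;

  /* vectorizable_induction: adjust the start value down by
     LOOP_VINFO_MASK_SKIP_NITERS iterations.  */
  int adjusted_init = init - skip_niters * step;
  assert (adjusted_init == -5);

  /* First vector of induction values; lane 0 is masked off and the
     active lanes handle x[1]..x[7].  */
  int lane[VF];
  for (int i = 0; i < VF; ++i)
    lane[i] = adjusted_init + i * step;
  for (int i = 1; i < VF; ++i)
    assert (lane[i] == (i - 1) * 5);	/* the values stored to x[i] */
  printf ("start %d, step %d\n", adjusted_init, step);
  return 0;
}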
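
Finally, the bias_for_lowest adjustment in vect_transform_loop, again using peel_ind_1.c as the example: 504 scalar iterations, a vectorization factor of 8, and at least 7 active lanes in the first iteration.  The latch count and the assumption that min_epilogue_iters is 0 for this fully-masked loop are inferred rather than taken from a dump, so treat the numbers as illustrative:

#include <assert.h>

int
main (void)
{
  int latch_count = 503;	/* 504 scalar iterations */
  int vf = 8;
  int min_first_active = 7;	/* alignment_npeels known at compile time */
  int min_epilogue_iters = 0;	/* fully-masked loop, no peeling for gaps */

  int bias_for_lowest = 1 - min_epilogue_iters + (vf - min_first_active);
  /* final_iter_may_be_partial, so round the division up, then convert
     back to a latch count.  */
  int upper_bound = (latch_count + bias_for_lowest + vf - 1) / vf - 1;

  /* Direct count: 7 scalars in the first vector iteration, then
     ceil (497 / 8) = 63 further iterations, i.e. 64 vector iterations
     and therefore 63 latch iterations.  */
  assert (upper_bound == 63);
  return 0;
}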