From 419c5f99876d9ee517f6b646dd785cdcaf5cb6fe Mon Sep 17 00:00:00 2001
From: Richard Biener
Date: Fri, 3 May 2019 10:47:21 +0000
Subject: [PATCH] tree-vect-stmts.c (get_group_load_store_type): Avoid peeling
 for gaps by loading only lower halves of vectors if possible.

2019-05-03  Richard Biener

	* tree-vect-stmts.c (get_group_load_store_type): Avoid
	peeling for gaps by loading only lower halves of vectors
	if possible.
	(vectorizable_load): Likewise.

	* gcc.dg/vect/slp-reduc-sad-2.c: New testcase.

From-SVN: r270847
---
 gcc/ChangeLog                               |  7 ++++
 gcc/testsuite/ChangeLog                     |  4 ++
 gcc/testsuite/gcc.dg/vect/slp-reduc-sad-2.c | 29 +++++++++++++++
 gcc/tree-vect-stmts.c                       | 58 ++++++++++++++++++++++++++++-
 4 files changed, 97 insertions(+), 1 deletion(-)
 create mode 100644 gcc/testsuite/gcc.dg/vect/slp-reduc-sad-2.c

diff --git a/gcc/ChangeLog b/gcc/ChangeLog
index d57cd7c..8b34efa 100644
--- a/gcc/ChangeLog
+++ b/gcc/ChangeLog
@@ -1,5 +1,12 @@
 2019-05-03  Richard Biener
 
+	* tree-vect-stmts.c (get_group_load_store_type): Avoid
+	peeling for gaps by loading only lower halves of vectors
+	if possible.
+	(vectorizable_load): Likewise.
+
+2019-05-03  Richard Biener
+
 	PR middle-end/89518
 	* match.pd: Add pattern to optimize (A / B) * B + (A % B) to A.
 
diff --git a/gcc/testsuite/ChangeLog b/gcc/testsuite/ChangeLog
index 74a33be..7ae100f 100644
--- a/gcc/testsuite/ChangeLog
+++ b/gcc/testsuite/ChangeLog
@@ -1,5 +1,9 @@
 2019-05-03  Richard Biener
 
+	* gcc.dg/vect/slp-reduc-sad-2.c: New testcase.
+
+2019-05-03  Richard Biener
+
 	PR middle-end/89518
 	* gcc.dg/pr89518.c: New testcase.
 
diff --git a/gcc/testsuite/gcc.dg/vect/slp-reduc-sad-2.c b/gcc/testsuite/gcc.dg/vect/slp-reduc-sad-2.c
new file mode 100644
index 0000000..5179fcc
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/slp-reduc-sad-2.c
@@ -0,0 +1,29 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target vect_usad_char } */
+/* With AVX256 or more we do not pull off the trick eliding the epilogue.  */
+/* { dg-additional-options "-mprefer-avx128" { target { x86_64-*-* i?86-*-* } } } */
+
+typedef unsigned char uint8_t;
+int x264_pixel_sad_8x8( uint8_t *pix1, uint8_t *pix2, int i_stride_pix2 )
+{
+    int i_sum = 0;
+    for( int y = 0; y < 8; y++ )
+    {
+        i_sum += __builtin_abs( pix1[0] - pix2[0] );
+        i_sum += __builtin_abs( pix1[1] - pix2[1] );
+        i_sum += __builtin_abs( pix1[2] - pix2[2] );
+        i_sum += __builtin_abs( pix1[3] - pix2[3] );
+        i_sum += __builtin_abs( pix1[4] - pix2[4] );
+        i_sum += __builtin_abs( pix1[5] - pix2[5] );
+        i_sum += __builtin_abs( pix1[6] - pix2[6] );
+        i_sum += __builtin_abs( pix1[7] - pix2[7] );
+        pix1 += 16;
+        pix2 += i_stride_pix2;
+    }
+    return i_sum;
+}
+
+/* { dg-final { scan-tree-dump "vect_recog_sad_pattern: detected" "vect" } } */
+/* { dg-final { scan-tree-dump "vectorizing stmts using SLP" "vect" } } */
+/* { dg-final { scan-tree-dump-not "access with gaps requires scalar epilogue loop" "vect" } } */
+/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" } } */
diff --git a/gcc/tree-vect-stmts.c b/gcc/tree-vect-stmts.c
index 41a7eb0..247d435 100644
--- a/gcc/tree-vect-stmts.c
+++ b/gcc/tree-vect-stmts.c
@@ -2258,6 +2258,29 @@ get_group_load_store_type (stmt_vec_info stmt_info, tree vectype, bool slp,
 	      && gap < (vect_known_alignment_in_bytes (first_dr_info)
 			/ vect_get_scalar_dr_size (first_dr_info)))
 	    overrun_p = false;
+
+	  /* If the gap splits the vector in half and the target
+	     can do half-vector operations avoid the epilogue peeling
+	     by simply loading half of the vector only.  Usually
+	     the construction with an upper zero half will be elided.  */
+	  dr_alignment_support alignment_support_scheme;
+	  scalar_mode elmode = SCALAR_TYPE_MODE (TREE_TYPE (vectype));
+	  machine_mode vmode;
+	  if (overrun_p
+	      && !masked_p
+	      && (((alignment_support_scheme
+		      = vect_supportable_dr_alignment (first_dr_info, false)))
+		   == dr_aligned
+		  || alignment_support_scheme == dr_unaligned_supported)
+	      && known_eq (nunits, (group_size - gap) * 2)
+	      && mode_for_vector (elmode, (group_size - gap)).exists (&vmode)
+	      && VECTOR_MODE_P (vmode)
+	      && targetm.vector_mode_supported_p (vmode)
+	      && (convert_optab_handler (vec_init_optab,
+					 TYPE_MODE (vectype), vmode)
+		  != CODE_FOR_nothing))
+	    overrun_p = false;
+
 	  if (overrun_p && !can_overrun_p)
 	    {
 	      if (dump_enabled_p ())
@@ -8516,8 +8539,24 @@ vectorizable_load (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
 		      }
 		    else
 		      {
+			tree ltype = vectype;
+			/* If there's no peeling for gaps but we have a gap
+			   with slp loads then load the lower half of the
+			   vector only.  See get_group_load_store_type for
+			   when we apply this optimization.  */
+			if (slp
+			    && loop_vinfo
+			    && !LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
+			    && DR_GROUP_GAP (first_stmt_info) != 0
+			    && known_eq (nunits,
+					 (group_size
+					  - DR_GROUP_GAP (first_stmt_info)) * 2))
+			  ltype = build_vector_type (TREE_TYPE (vectype),
+						     (group_size
+						      - DR_GROUP_GAP
+							  (first_stmt_info)));
 			data_ref
-			  = fold_build2 (MEM_REF, vectype, dataref_ptr,
+			  = fold_build2 (MEM_REF, ltype, dataref_ptr,
 					 dataref_offset
 					 ? dataref_offset
 					 : build_int_cst (ref_type, 0));
@@ -8531,6 +8570,23 @@ vectorizable_load (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
 			  TREE_TYPE (data_ref)
 			    = build_aligned_type (TREE_TYPE (data_ref),
 						  TYPE_ALIGN (elem_type));
+			if (ltype != vectype)
+			  {
+			    vect_copy_ref_info (data_ref, DR_REF (first_dr_info->dr));
+			    tree tem = make_ssa_name (ltype);
+			    new_stmt = gimple_build_assign (tem, data_ref);
+			    vect_finish_stmt_generation (stmt_info, new_stmt, gsi);
+			    data_ref = NULL;
+			    vec<constructor_elt, va_gc> *v;
+			    vec_alloc (v, 2);
+			    CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, tem);
+			    CONSTRUCTOR_APPEND_ELT (v, NULL_TREE,
+						    build_zero_cst (ltype));
+			    new_stmt
+			      = gimple_build_assign (vec_dest,
+						     build_constructor
+						       (vectype, v));
+			  }
 		      }
 		    break;
 		  }
-- 
2.7.4
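
A sketch of what the transformation buys us (illustrative only, not part of
the patch): in the testcase the load group for pix1 has 16 uint8_t elements
per iteration of which only the first 8 are used, so a full V16QI load would
read past the accessed elements and previously forced peeling for gaps,
i.e. a scalar epilogue loop.  With the patch the vectorizer instead emits a
V8QI load plus a CONSTRUCTOR whose upper half is zero.  On x86 with SSE2
that pair is expected to be elided into a single movq, roughly as below;
the function name and the standalone framing are assumptions made for the
example, not GCC internals.

    #include <emmintrin.h>

    /* Load 8 bytes into the low half of a 16-byte vector with the upper
       half zeroed -- the "construction with an upper zero half" that the
       comment in get_group_load_store_type expects to be elided, done
       here explicitly via the SSE2 movq intrinsic.  */
    static inline __m128i
    load_low_half (const unsigned char *p)
    {
      return _mm_loadl_epi64 ((const __m128i *) p);
    }

A caller would use it as e.g. `__m128i lo = load_low_half (pix1);` -- one
instruction, no read beyond the 8 live bytes, hence no epilogue loop.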