From 29afecdf1b14136640f1d88da0e10751dbc6283c Mon Sep 17 00:00:00 2001 From: Richard Biener Date: Mon, 7 Nov 2016 08:06:08 +0000 Subject: [PATCH] re PR middle-end/37150 (basic-block vectorization misses some unrolled loops) 2016-11-07 Richard Biener PR tree-optimization/37150 * tree-vectorizer.h (vect_transform_slp_perm_load): Add n_perms parameter. * tree-vect-slp.c (vect_supported_load_permutation_p): Adjust. (vect_analyze_slp_cost_1): Account for the real number of permutations emitted and for dead loads. (vect_transform_slp_perm_load): Add n_perms parameter counting the number of emitted permutations. * tree-vect-stmts.c (vectorizable_load): Adjust. From-SVN: r241893 --- gcc/ChangeLog | 12 ++++++++++++ gcc/tree-vect-slp.c | 48 ++++++++++++++++++++++++++++++++++++++++-------- gcc/tree-vect-stmts.c | 11 ++++++++--- gcc/tree-vectorizer.h | 2 +- 4 files changed, 61 insertions(+), 12 deletions(-) diff --git a/gcc/ChangeLog b/gcc/ChangeLog index 960ea67..7e5c970 100644 --- a/gcc/ChangeLog +++ b/gcc/ChangeLog @@ -1,5 +1,17 @@ 2016-11-07 Richard Biener + PR tree-optimization/37150 + * tree-vectorizer.h (vect_transform_slp_perm_load): Add n_perms + parameter. + * tree-vect-slp.c (vect_supported_load_permutation_p): Adjust. + (vect_analyze_slp_cost_1): Account for the real number of + permutations emitted and for dead loads. + (vect_transform_slp_perm_load): Add n_perms parameter counting + the number of emitted permutations. + * tree-vect-stmts.c (vectorizable_load): Adjust. + +2016-11-07 Richard Biener + PR tree-optimization/78189 * tree-vect-data-refs.c (vect_compute_data_ref_alignment): Fix alignment computation. diff --git a/gcc/tree-vect-slp.c b/gcc/tree-vect-slp.c index 62f060c..6694164 100644 --- a/gcc/tree-vect-slp.c +++ b/gcc/tree-vect-slp.c @@ -1461,8 +1461,9 @@ vect_supported_load_permutation_p (slp_instance slp_instn) { /* Verify the permutation can be generated. 
*/ vec<tree> tem; + unsigned n_perms; if (!vect_transform_slp_perm_load (node, tem, NULL, - 1, slp_instn, true)) + 1, slp_instn, true, &n_perms)) { dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, @@ -1475,11 +1476,13 @@ vect_supported_load_permutation_p (slp_instance slp_instn) } /* For loop vectorization verify we can generate the permutation. */ + unsigned n_perms; FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node) if (node->load_permutation.exists () && !vect_transform_slp_perm_load (node, vNULL, NULL, - SLP_INSTANCE_UNROLLING_FACTOR (slp_instn), slp_instn, true)) + SLP_INSTANCE_UNROLLING_FACTOR (slp_instn), slp_instn, true, + &n_perms)) return false; return true; @@ -1548,14 +1551,38 @@ vect_analyze_slp_cost_1 (slp_instance instance, slp_tree node, stmt = GROUP_FIRST_ELEMENT (stmt_info); stmt_info = vinfo_for_stmt (stmt); /* Record the cost for the permutation. */ - record_stmt_cost (body_cost_vec, ncopies_for_cost, vec_perm, + unsigned n_perms; + vect_transform_slp_perm_load (node, vNULL, NULL, + ncopies_for_cost, instance, true, + &n_perms); + record_stmt_cost (body_cost_vec, n_perms, vec_perm, stmt_info, 0, vect_body); - /* And adjust the number of loads performed. */ unsigned nunits = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info)); - ncopies_for_cost - = (GROUP_SIZE (stmt_info) - GROUP_GAP (stmt_info) - + nunits - 1) / nunits; + /* And adjust the number of loads performed. This handles + redundancies as well as loads that are later dead. 
*/ + auto_sbitmap perm (GROUP_SIZE (stmt_info)); + bitmap_clear (perm); + for (i = 0; i < SLP_TREE_LOAD_PERMUTATION (node).length (); ++i) + bitmap_set_bit (perm, SLP_TREE_LOAD_PERMUTATION (node)[i]); + ncopies_for_cost = 0; + bool load_seen = false; + for (i = 0; i < GROUP_SIZE (stmt_info); ++i) + { + if (i % nunits == 0) + { + if (load_seen) + ncopies_for_cost++; + load_seen = false; + } + if (bitmap_bit_p (perm, i)) + load_seen = true; + } + if (load_seen) + ncopies_for_cost++; + gcc_assert (ncopies_for_cost + <= (GROUP_SIZE (stmt_info) - GROUP_GAP (stmt_info) + + nunits - 1) / nunits); ncopies_for_cost *= SLP_INSTANCE_UNROLLING_FACTOR (instance); } /* Record the cost for the vector loads. */ @@ -3402,7 +3429,8 @@ vect_create_mask_and_perm (gimple *stmt, bool vect_transform_slp_perm_load (slp_tree node, vec<tree> dr_chain, gimple_stmt_iterator *gsi, int vf, - slp_instance slp_node_instance, bool analyze_only) + slp_instance slp_node_instance, bool analyze_only, + unsigned *n_perms) { gimple *stmt = SLP_TREE_SCALAR_STMTS (node)[0]; stmt_vec_info stmt_info = vinfo_for_stmt (stmt); @@ -3457,6 +3485,7 @@ vect_transform_slp_perm_load (slp_tree node, vec<tree> dr_chain, int first_vec_index = -1; int second_vec_index = -1; bool noop_p = true; + *n_perms = 0; for (int j = 0; j < unroll_factor; j++) { @@ -3513,6 +3542,9 @@ vect_transform_slp_perm_load (slp_tree node, vec<tree> dr_chain, return false; } + if (! 
noop_p) + ++*n_perms; + if (!analyze_only) { tree mask_vec = NULL_TREE; diff --git a/gcc/tree-vect-stmts.c b/gcc/tree-vect-stmts.c index 1d17156..ab01def 100644 --- a/gcc/tree-vect-stmts.c +++ b/gcc/tree-vect-stmts.c @@ -6978,8 +6978,11 @@ vectorizable_load (gimple *stmt, gimple_stmt_iterator *gsi, gimple **vec_stmt, } } if (slp_perm) - vect_transform_slp_perm_load (slp_node, dr_chain, gsi, vf, - slp_node_instance, false); + { + unsigned n_perms; + vect_transform_slp_perm_load (slp_node, dr_chain, gsi, vf, + slp_node_instance, false, &n_perms); + } return true; } @@ -7497,8 +7500,10 @@ vectorizable_load (gimple *stmt, gimple_stmt_iterator *gsi, gimple **vec_stmt, if (slp_perm) { + unsigned n_perms; if (!vect_transform_slp_perm_load (slp_node, dr_chain, gsi, vf, - slp_node_instance, false)) + slp_node_instance, false, + &n_perms)) { dr_chain.release (); return false; diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h index 3866548..2a7cdfe 100644 --- a/gcc/tree-vectorizer.h +++ b/gcc/tree-vectorizer.h @@ -1166,7 +1166,7 @@ extern int vect_get_known_peeling_cost (loop_vec_info, int, int *, extern void vect_free_slp_instance (slp_instance); extern bool vect_transform_slp_perm_load (slp_tree, vec<tree> , gimple_stmt_iterator *, int, - slp_instance, bool); + slp_instance, bool, unsigned *); extern bool vect_slp_analyze_operations (vec<slp_instance> slp_instances, void *); extern bool vect_schedule_slp (vec_info *); -- 2.7.4